diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -2,2762 +2,183512 @@ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.5438006723353767, + "epoch": 0.9887284951552304, "eval_steps": 500, - "global_step": 5500, + "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "entropy": 8.700479960441589, + "entropy": 3.3239192962646484, + "epoch": 0, + "mean_token_accuracy": 0.6472114324569702, + "num_tokens": 5243.0, + "step": 0, + "train/ce_loss": 2.4552130699157715 + }, + { + "epoch": 0, + "step": 0, + "train/sim_loss": 1.015625 + }, + { + "epoch": 0, + "step": 0, + "train/total_loss": 1.2611463069915771 + }, + { + "entropy": 3.365748882293701, + "epoch": 9.887284951552304e-05, + "mean_token_accuracy": 0.6583143472671509, + "num_tokens": 10097.0, + "step": 1, + "train/ce_loss": 0.7174420952796936 + }, + { + "epoch": 9.887284951552304e-05, + "step": 1, + "train/sim_loss": 1.0078125 + }, + { + "epoch": 9.887284951552304e-05, + "step": 1, + "train/total_loss": 1.0795567035675049 + }, + { + "entropy": 3.1243503093719482, + "epoch": 0.00019774569903104609, + "mean_token_accuracy": 0.6797671318054199, + "num_tokens": 15213.0, + "step": 2, + "train/ce_loss": 2.3286292552948 + }, + { + "epoch": 0.00019774569903104609, + "step": 2, + "train/sim_loss": 0.98828125 + }, + { + "epoch": 0.00019774569903104609, + "step": 2, + "train/total_loss": 1.221144199371338 + }, + { + "entropy": 3.2535760402679443, + "epoch": 0.00029661854854656913, + "mean_token_accuracy": 0.6405940651893616, + "num_tokens": 20700.0, + "step": 3, + "train/ce_loss": 1.0501251220703125 + }, + { + "epoch": 0.00029661854854656913, + "step": 3, + "train/sim_loss": 0.98828125 + }, + { + "epoch": 0.00029661854854656913, + "step": 3, + "train/total_loss": 1.0932937860488892 + }, + { + "entropy": 3.384281873703003, + "epoch": 0.00039549139806209217, + "mean_token_accuracy": 0.6576381325721741, + "num_tokens": 26091.0, + "step": 4, + "train/ce_loss": 1.9366257190704346 + }, + { + "epoch": 0.00039549139806209217, + "step": 4, + "train/sim_loss": 0.98828125 + }, + { + "epoch": 0.00039549139806209217, + "step": 4, + "train/total_loss": 1.1819437742233276 + }, + { + "entropy": 3.4554200172424316, + "epoch": 0.0004943642475776152, + "mean_token_accuracy": 0.727007269859314, + "num_tokens": 31430.0, + "step": 5, + "train/ce_loss": 1.1327928304672241 + }, + { + "epoch": 0.0004943642475776152, + "step": 5, + "train/sim_loss": 0.97265625 + }, + { + "epoch": 0.0004943642475776152, + "step": 5, + "train/total_loss": 1.0859355926513672 + }, + { + "entropy": 3.5804295539855957, + "epoch": 0.0005932370970931383, + "mean_token_accuracy": 0.6846153736114502, + "num_tokens": 36852.0, + "step": 6, + "train/ce_loss": 1.342855453491211 + }, + { + "epoch": 0.0005932370970931383, + "step": 6, + "train/sim_loss": 0.98828125 + }, + { + "epoch": 0.0005932370970931383, + "step": 6, + "train/total_loss": 1.122566819190979 + }, + { + "entropy": 3.361402750015259, + "epoch": 0.0006921099466086612, + "mean_token_accuracy": 0.6744966506958008, + "num_tokens": 41935.0, + "step": 7, + "train/ce_loss": 1.7447583675384521 + }, + { + "epoch": 0.0006921099466086612, + "step": 7, + "train/sim_loss": 0.9296875 + }, + { + "epoch": 0.0006921099466086612, + "step": 7, + "train/total_loss": 1.1041632890701294 + }, + { + "entropy": 3.931365966796875, + "epoch": 0.0007909827961241843, + "mean_token_accuracy": 0.6637630462646484, + "num_tokens": 46986.0, + "step": 8, + "train/ce_loss": 0.3796125054359436 + }, + { + "epoch": 0.0007909827961241843, + "step": 8, + "train/sim_loss": 0.9296875 + }, + { + "epoch": 0.0007909827961241843, + "step": 8, + "train/total_loss": 0.9676487445831299 + }, + { + "entropy": 3.9508445262908936, + "epoch": 0.0008898556456397073, + "mean_token_accuracy": 0.6216216087341309, + "num_tokens": 52049.0, + "step": 9, + "train/ce_loss": 0.3588610291481018 + }, + { + "epoch": 0.0008898556456397073, + "step": 9, + "train/sim_loss": 0.8828125 + }, + { + "epoch": 0.0008898556456397073, + "step": 9, + "train/total_loss": 0.9186986088752747 + }, + { + "entropy": 3.788201332092285, + "epoch": 0.0009887284951552303, + "mean_token_accuracy": 0.7525773048400879, + "num_tokens": 56847.0, + "step": 10, + "train/ce_loss": 1.950152039527893 + }, + { + "epoch": 0.0009887284951552303, + "step": 10, + "train/sim_loss": 0.8359375 + }, + { + "epoch": 0.0009887284951552303, + "step": 10, + "train/total_loss": 1.0309526920318604 + }, + { + "entropy": 3.670529842376709, + "epoch": 0.0010876013446707534, + "mean_token_accuracy": 0.725824773311615, + "num_tokens": 62255.0, + "step": 11, + "train/ce_loss": 0.854789674282074 + }, + { + "epoch": 0.0010876013446707534, + "step": 11, + "train/sim_loss": 0.7890625 + }, + { + "epoch": 0.0010876013446707534, + "step": 11, + "train/total_loss": 0.8745414614677429 + }, + { + "entropy": 4.051430702209473, + "epoch": 0.0011864741941862765, + "mean_token_accuracy": 0.7071713209152222, + "num_tokens": 67232.0, + "step": 12, + "train/ce_loss": 0.32699134945869446 + }, + { + "epoch": 0.0011864741941862765, + "step": 12, + "train/sim_loss": 0.7265625 + }, + { + "epoch": 0.0011864741941862765, + "step": 12, + "train/total_loss": 0.7592616081237793 + }, + { + "entropy": 4.368044853210449, + "epoch": 0.0012853470437017994, + "mean_token_accuracy": 0.7151514887809753, + "num_tokens": 71963.0, + "step": 13, + "train/ce_loss": 0.49288395047187805 + }, + { + "epoch": 0.0012853470437017994, + "step": 13, + "train/sim_loss": 0.64453125 + }, + { + "epoch": 0.0012853470437017994, + "step": 13, + "train/total_loss": 0.6938196420669556 + }, + { + "entropy": 4.399681091308594, + "epoch": 0.0013842198932173225, + "mean_token_accuracy": 0.7262210845947266, + "num_tokens": 77190.0, + "step": 14, + "train/ce_loss": 0.9703823328018188 + }, + { + "epoch": 0.0013842198932173225, + "step": 14, + "train/sim_loss": 0.62890625 + }, + { + "epoch": 0.0013842198932173225, + "step": 14, + "train/total_loss": 0.725944459438324 + }, + { + "entropy": 4.859724044799805, + "epoch": 0.0014830927427328456, + "mean_token_accuracy": 0.7016759514808655, + "num_tokens": 82566.0, + "step": 15, + "train/ce_loss": 0.9824153184890747 + }, + { + "epoch": 0.0014830927427328456, + "step": 15, + "train/sim_loss": 0.55078125 + }, + { + "epoch": 0.0014830927427328456, + "step": 15, + "train/total_loss": 0.6490227580070496 + }, + { + "entropy": 4.952378273010254, + "epoch": 0.0015819655922483687, + "mean_token_accuracy": 0.6652078628540039, + "num_tokens": 87957.0, + "step": 16, + "train/ce_loss": 1.4136582612991333 + }, + { + "epoch": 0.0015819655922483687, + "step": 16, + "train/sim_loss": 0.58984375 + }, + { + "epoch": 0.0015819655922483687, + "step": 16, + "train/total_loss": 0.7312095761299133 + }, + { + "entropy": 4.8109025955200195, + "epoch": 0.0016808384417638916, + "mean_token_accuracy": 0.7169811129570007, + "num_tokens": 93055.0, + "step": 17, + "train/ce_loss": 1.1063599586486816 + }, + { + "epoch": 0.0016808384417638916, + "step": 17, + "train/sim_loss": 0.5078125 + }, + { + "epoch": 0.0016808384417638916, + "step": 17, + "train/total_loss": 0.6184484958648682 + }, + { + "entropy": 4.44307804107666, + "epoch": 0.0017797112912794147, + "mean_token_accuracy": 0.7559681534767151, + "num_tokens": 98691.0, + "step": 18, + "train/ce_loss": 0.914014995098114 + }, + { + "epoch": 0.0017797112912794147, + "step": 18, + "train/sim_loss": 0.5078125 + }, + { + "epoch": 0.0017797112912794147, + "step": 18, + "train/total_loss": 0.5992140173912048 + }, + { + "entropy": 5.108180046081543, + "epoch": 0.0018785841407949378, + "mean_token_accuracy": 0.692307710647583, + "num_tokens": 103954.0, + "step": 19, + "train/ce_loss": 1.0503363609313965 + }, + { + "epoch": 0.0018785841407949378, + "step": 19, + "train/sim_loss": 0.43359375 + }, + { + "epoch": 0.0018785841407949378, + "step": 19, + "train/total_loss": 0.5386273860931396 + }, + { "epoch": 0.0019774569903104606, - "grad_norm": 11.612215995788574, + "grad_norm": 1.845420241355896, "learning_rate": 9.997774810858923e-06, - "loss": 0.6989, - "mean_token_accuracy": 0.7780524671077729, - "num_tokens": 95751.0, + "loss": 0.9294, "step": 20 }, { - "entropy": 8.777435779571533, - "epoch": 0.003954913980620921, - "grad_norm": 8.975788116455078, - "learning_rate": 9.992829946100975e-06, - "loss": 0.6191, - "mean_token_accuracy": 0.7971815288066864, - "num_tokens": 191702.0, - "step": 40 + "entropy": 4.6566619873046875, + "epoch": 0.0019774569903104606, + "mean_token_accuracy": 0.7223684191703796, + "num_tokens": 109226.0, + "step": 20, + "train/ce_loss": 0.6507769227027893 }, { - "entropy": 8.858469605445862, - "epoch": 0.005932370970931382, - "grad_norm": 23.675687789916992, - "learning_rate": 9.987885081343026e-06, - "loss": 0.5636, - "mean_token_accuracy": 0.8123172074556351, - "num_tokens": 287004.0, - "step": 60 + "epoch": 0.0019774569903104606, + "step": 20, + "train/sim_loss": 0.42578125 }, { - "entropy": 8.736580753326416, - "epoch": 0.007909827961241843, - "grad_norm": 15.75529956817627, - "learning_rate": 9.982940216585078e-06, - "loss": 0.5782, - "mean_token_accuracy": 0.8042884260416031, - "num_tokens": 383064.0, - "step": 80 + "epoch": 0.0019774569903104606, + "step": 20, + "train/total_loss": 0.49085894227027893 }, { - "entropy": 8.834376287460326, - "epoch": 0.009887284951552304, - "grad_norm": 9.401670455932617, - "learning_rate": 9.977995351827128e-06, - "loss": 0.5242, - "mean_token_accuracy": 0.8219753801822662, - "num_tokens": 479140.0, - "step": 100 + "entropy": 4.65466833114624, + "epoch": 0.002076329839825984, + "mean_token_accuracy": 0.7402912378311157, + "num_tokens": 114480.0, + "step": 21, + "train/ce_loss": 1.0225709676742554 }, { - "entropy": 8.757733392715455, - "epoch": 0.011864741941862765, - "grad_norm": 11.580300331115723, - "learning_rate": 9.973050487069179e-06, - "loss": 0.5687, - "mean_token_accuracy": 0.8115669339895248, - "num_tokens": 575189.0, - "step": 120 + "epoch": 0.002076329839825984, + "step": 21, + "train/sim_loss": 0.41015625 }, { - "entropy": 8.739085245132447, - "epoch": 0.013842198932173226, - "grad_norm": 8.677336692810059, - "learning_rate": 9.968105622311231e-06, - "loss": 0.5445, - "mean_token_accuracy": 0.8199418306350708, - "num_tokens": 671631.0, - "step": 140 + "epoch": 0.002076329839825984, + "step": 21, + "train/total_loss": 0.5124133229255676 }, { - "entropy": 8.825765037536621, - "epoch": 0.015819655922483685, - "grad_norm": 6.102746963500977, - "learning_rate": 9.963160757553282e-06, - "loss": 0.5514, - "mean_token_accuracy": 0.81886205971241, - "num_tokens": 767185.0, - "step": 160 + "entropy": 5.295852184295654, + "epoch": 0.002175202689341507, + "mean_token_accuracy": 0.6411564350128174, + "num_tokens": 119473.0, + "step": 22, + "train/ce_loss": 1.4177800416946411 }, { - "entropy": 8.77472472190857, - "epoch": 0.017797112912794146, - "grad_norm": 8.439604759216309, - "learning_rate": 9.958215892795334e-06, - "loss": 0.56, - "mean_token_accuracy": 0.809479159116745, - "num_tokens": 863906.0, - "step": 180 + "epoch": 0.002175202689341507, + "step": 22, + "train/sim_loss": 0.4140625 }, { - "entropy": 8.694449472427369, - "epoch": 0.019774569903104607, - "grad_norm": 5.165731906890869, - "learning_rate": 9.953271028037384e-06, - "loss": 0.5464, - "mean_token_accuracy": 0.8180436283349991, - "num_tokens": 960716.0, - "step": 200 + "epoch": 0.002175202689341507, + "step": 22, + "train/total_loss": 0.5558404922485352 }, { - "entropy": 8.891126871109009, - "epoch": 0.02175202689341507, - "grad_norm": 22.927352905273438, - "learning_rate": 9.948326163279435e-06, - "loss": 0.5319, - "mean_token_accuracy": 0.8234529435634613, - "num_tokens": 1057086.0, - "step": 220 + "entropy": 4.775343894958496, + "epoch": 0.0022740755388570297, + "mean_token_accuracy": 0.701298713684082, + "num_tokens": 124877.0, + "step": 23, + "train/ce_loss": 1.3397403955459595 }, { - "entropy": 8.95149827003479, - "epoch": 0.02372948388372553, - "grad_norm": 5.129114151000977, - "learning_rate": 9.943381298521487e-06, - "loss": 0.5308, - "mean_token_accuracy": 0.823627719283104, - "num_tokens": 1152749.0, - "step": 240 + "epoch": 0.0022740755388570297, + "step": 23, + "train/sim_loss": 0.328125 }, { - "entropy": 8.7442786693573, - "epoch": 0.02570694087403599, - "grad_norm": 6.199312210083008, - "learning_rate": 9.938436433763537e-06, - "loss": 0.5927, - "mean_token_accuracy": 0.804636737704277, - "num_tokens": 1249712.0, - "step": 260 + "epoch": 0.0022740755388570297, + "step": 23, + "train/total_loss": 0.4620990455150604 }, { - "entropy": 8.761948728561402, - "epoch": 0.02768439786434645, - "grad_norm": 10.885489463806152, - "learning_rate": 9.93349156900559e-06, - "loss": 0.6116, - "mean_token_accuracy": 0.795692366361618, - "num_tokens": 1346033.0, - "step": 280 + "entropy": 4.733779430389404, + "epoch": 0.002372948388372553, + "mean_token_accuracy": 0.689393937587738, + "num_tokens": 130113.0, + "step": 24, + "train/ce_loss": 0.7229293584823608 }, { - "entropy": 8.93413405418396, - "epoch": 0.029661854854656913, - "grad_norm": 5.984589576721191, - "learning_rate": 9.928546704247638e-06, - "loss": 0.5427, - "mean_token_accuracy": 0.8186128795146942, - "num_tokens": 1440750.0, - "step": 300 + "epoch": 0.002372948388372553, + "step": 24, + "train/sim_loss": 0.3671875 }, { - "entropy": 8.907150077819825, - "epoch": 0.03163931184496737, - "grad_norm": 10.327075958251953, - "learning_rate": 9.92360183948969e-06, - "loss": 0.5869, - "mean_token_accuracy": 0.8066493749618531, - "num_tokens": 1536369.0, - "step": 320 + "epoch": 0.002372948388372553, + "step": 24, + "train/total_loss": 0.43948042392730713 }, { - "entropy": 8.76447286605835, - "epoch": 0.03361676883527783, - "grad_norm": 8.689876556396484, - "learning_rate": 9.918656974731741e-06, - "loss": 0.6042, - "mean_token_accuracy": 0.7982973754405975, - "num_tokens": 1633585.0, - "step": 340 + "entropy": 5.126186370849609, + "epoch": 0.002471821237888076, + "mean_token_accuracy": 0.6462736129760742, + "num_tokens": 135527.0, + "step": 25, + "train/ce_loss": 1.0404521226882935 }, { - "entropy": 9.078729391098022, - "epoch": 0.03559422582558829, - "grad_norm": 5.838440895080566, - "learning_rate": 9.913712109973793e-06, - "loss": 0.5542, - "mean_token_accuracy": 0.8149173945188523, - "num_tokens": 1728634.0, - "step": 360 + "epoch": 0.002471821237888076, + "step": 25, + "train/sim_loss": 0.359375 }, { - "entropy": 8.85760531425476, - "epoch": 0.03757168281589875, - "grad_norm": 7.205398082733154, - "learning_rate": 9.908767245215844e-06, - "loss": 0.5395, - "mean_token_accuracy": 0.8209541589021683, - "num_tokens": 1825487.0, - "step": 380 + "epoch": 0.002471821237888076, + "step": 25, + "train/total_loss": 0.46342021226882935 }, { - "entropy": 8.916018342971801, - "epoch": 0.039549139806209214, - "grad_norm": 11.460290908813477, - "learning_rate": 9.903822380457894e-06, - "loss": 0.5726, - "mean_token_accuracy": 0.8081452161073684, - "num_tokens": 1920683.0, - "step": 400 + "entropy": 5.366857528686523, + "epoch": 0.002570694087403599, + "mean_token_accuracy": 0.719939112663269, + "num_tokens": 140661.0, + "step": 26, + "train/ce_loss": 1.3868837356567383 }, { - "entropy": 8.921336793899536, - "epoch": 0.041526596796519676, - "grad_norm": 7.3233418464660645, - "learning_rate": 9.898877515699947e-06, - "loss": 0.5604, - "mean_token_accuracy": 0.8093248993158341, - "num_tokens": 2016851.0, - "step": 420 + "epoch": 0.002570694087403599, + "step": 26, + "train/sim_loss": 0.29296875 + }, + { + "epoch": 0.002570694087403599, + "step": 26, + "train/total_loss": 0.4316571354866028 + }, + { + "entropy": 5.796281814575195, + "epoch": 0.002669566936919122, + "mean_token_accuracy": 0.7521514892578125, + "num_tokens": 145711.0, + "step": 27, + "train/ce_loss": 0.20895634591579437 + }, + { + "epoch": 0.002669566936919122, + "step": 27, + "train/sim_loss": 0.2890625 + }, + { + "epoch": 0.002669566936919122, + "step": 27, + "train/total_loss": 0.3099581301212311 + }, + { + "entropy": 5.276698112487793, + "epoch": 0.002768439786434645, + "mean_token_accuracy": 0.7659817337989807, + "num_tokens": 151237.0, + "step": 28, + "train/ce_loss": 0.811927855014801 + }, + { + "epoch": 0.002768439786434645, + "step": 28, + "train/sim_loss": 0.359375 + }, + { + "epoch": 0.002768439786434645, + "step": 28, + "train/total_loss": 0.4405677914619446 + }, + { + "entropy": 5.565755844116211, + "epoch": 0.0028673126359501683, + "mean_token_accuracy": 0.7230098247528076, + "num_tokens": 156640.0, + "step": 29, + "train/ce_loss": 1.0701594352722168 + }, + { + "epoch": 0.0028673126359501683, + "step": 29, + "train/sim_loss": 0.39453125 + }, + { + "epoch": 0.0028673126359501683, + "step": 29, + "train/total_loss": 0.5015472173690796 + }, + { + "entropy": 5.5919294357299805, + "epoch": 0.002966185485465691, + "mean_token_accuracy": 0.7300275564193726, + "num_tokens": 161820.0, + "step": 30, + "train/ce_loss": 1.280920147895813 + }, + { + "epoch": 0.002966185485465691, + "step": 30, + "train/sim_loss": 0.375 + }, + { + "epoch": 0.002966185485465691, + "step": 30, + "train/total_loss": 0.5030920505523682 + }, + { + "entropy": 5.7281036376953125, + "epoch": 0.003065058334981214, + "mean_token_accuracy": 0.7092568278312683, + "num_tokens": 167014.0, + "step": 31, + "train/ce_loss": 0.6602377891540527 + }, + { + "epoch": 0.003065058334981214, + "step": 31, + "train/sim_loss": 0.31640625 + }, + { + "epoch": 0.003065058334981214, + "step": 31, + "train/total_loss": 0.3824300169944763 + }, + { + "entropy": 6.002870559692383, + "epoch": 0.0031639311844967374, + "mean_token_accuracy": 0.7331671118736267, + "num_tokens": 171815.0, + "step": 32, + "train/ce_loss": 1.5884499549865723 + }, + { + "epoch": 0.0031639311844967374, + "step": 32, + "train/sim_loss": 0.33203125 + }, + { + "epoch": 0.0031639311844967374, + "step": 32, + "train/total_loss": 0.4908762574195862 + }, + { + "entropy": 5.498105049133301, + "epoch": 0.0032628040340122602, + "mean_token_accuracy": 0.7279322743415833, + "num_tokens": 177099.0, + "step": 33, + "train/ce_loss": 0.8982157707214355 + }, + { + "epoch": 0.0032628040340122602, + "step": 33, + "train/sim_loss": 0.26953125 + }, + { + "epoch": 0.0032628040340122602, + "step": 33, + "train/total_loss": 0.35935282707214355 + }, + { + "entropy": 5.875737190246582, + "epoch": 0.003361676883527783, + "mean_token_accuracy": 0.7120181322097778, + "num_tokens": 182442.0, + "step": 34, + "train/ce_loss": 0.9330689907073975 + }, + { + "epoch": 0.003361676883527783, + "step": 34, + "train/sim_loss": 0.26953125 + }, + { + "epoch": 0.003361676883527783, + "step": 34, + "train/total_loss": 0.36283814907073975 + }, + { + "entropy": 5.939522743225098, + "epoch": 0.0034605497330433064, + "mean_token_accuracy": 0.6893453001976013, + "num_tokens": 187705.0, + "step": 35, + "train/ce_loss": 1.2632049322128296 + }, + { + "epoch": 0.0034605497330433064, + "step": 35, + "train/sim_loss": 0.29296875 + }, + { + "epoch": 0.0034605497330433064, + "step": 35, + "train/total_loss": 0.419289231300354 + }, + { + "entropy": 5.7719573974609375, + "epoch": 0.0035594225825588293, + "mean_token_accuracy": 0.749492883682251, + "num_tokens": 193137.0, + "step": 36, + "train/ce_loss": 0.9662005305290222 + }, + { + "epoch": 0.0035594225825588293, + "step": 36, + "train/sim_loss": 0.29296875 + }, + { + "epoch": 0.0035594225825588293, + "step": 36, + "train/total_loss": 0.3895888030529022 + }, + { + "entropy": 6.139836311340332, + "epoch": 0.003658295432074352, + "mean_token_accuracy": 0.7349397540092468, + "num_tokens": 198212.0, + "step": 37, + "train/ce_loss": 1.0053077936172485 + }, + { + "epoch": 0.003658295432074352, + "step": 37, + "train/sim_loss": 0.2421875 + }, + { + "epoch": 0.003658295432074352, + "step": 37, + "train/total_loss": 0.3427182734012604 + }, + { + "entropy": 6.456868648529053, + "epoch": 0.0037571682815898755, + "mean_token_accuracy": 0.74055415391922, + "num_tokens": 203017.0, + "step": 38, + "train/ce_loss": 0.23852603137493134 + }, + { + "epoch": 0.0037571682815898755, + "step": 38, + "train/sim_loss": 0.3203125 + }, + { + "epoch": 0.0037571682815898755, + "step": 38, + "train/total_loss": 0.3441651165485382 + }, + { + "entropy": 6.685752868652344, + "epoch": 0.0038560411311053984, + "mean_token_accuracy": 0.6641104221343994, + "num_tokens": 208114.0, + "step": 39, + "train/ce_loss": 2.193326711654663 + }, + { + "epoch": 0.0038560411311053984, + "step": 39, + "train/sim_loss": 0.26171875 + }, + { + "epoch": 0.0038560411311053984, + "step": 39, + "train/total_loss": 0.4810514450073242 + }, + { + "epoch": 0.003954913980620921, + "grad_norm": 1.2726562023162842, + "learning_rate": 9.992829946100975e-06, + "loss": 0.4364, + "step": 40 + }, + { + "entropy": 6.089438438415527, + "epoch": 0.003954913980620921, + "mean_token_accuracy": 0.7372262477874756, + "num_tokens": 213442.0, + "step": 40, + "train/ce_loss": 0.7173500657081604 + }, + { + "epoch": 0.003954913980620921, + "step": 40, + "train/sim_loss": 0.26171875 + }, + { + "epoch": 0.003954913980620921, + "step": 40, + "train/total_loss": 0.3334537744522095 + }, + { + "entropy": 6.468903064727783, + "epoch": 0.004053786830136444, + "mean_token_accuracy": 0.6836581826210022, + "num_tokens": 218574.0, + "step": 41, + "train/ce_loss": 1.1456220149993896 + }, + { + "epoch": 0.004053786830136444, + "step": 41, + "train/sim_loss": 0.3125 + }, + { + "epoch": 0.004053786830136444, + "step": 41, + "train/total_loss": 0.4270622134208679 + }, + { + "entropy": 5.9639973640441895, + "epoch": 0.004152659679651968, + "mean_token_accuracy": 0.7407407164573669, + "num_tokens": 223990.0, + "step": 42, + "train/ce_loss": 0.8651658892631531 + }, + { + "epoch": 0.004152659679651968, + "step": 42, + "train/sim_loss": 0.296875 + }, + { + "epoch": 0.004152659679651968, + "step": 42, + "train/total_loss": 0.3833915889263153 + }, + { + "entropy": 6.586842060089111, + "epoch": 0.004251532529167491, + "mean_token_accuracy": 0.7338129281997681, + "num_tokens": 229012.0, + "step": 43, + "train/ce_loss": 1.1294350624084473 + }, + { + "epoch": 0.004251532529167491, + "step": 43, + "train/sim_loss": 0.27734375 + }, + { + "epoch": 0.004251532529167491, + "step": 43, + "train/total_loss": 0.39028725028038025 + }, + { + "entropy": 6.888969421386719, + "epoch": 0.004350405378683014, + "mean_token_accuracy": 0.7364705801010132, + "num_tokens": 233869.0, + "step": 44, + "train/ce_loss": 0.15823636949062347 + }, + { + "epoch": 0.004350405378683014, + "step": 44, + "train/sim_loss": 0.265625 + }, + { + "epoch": 0.004350405378683014, + "step": 44, + "train/total_loss": 0.281448632478714 + }, + { + "entropy": 6.940787315368652, + "epoch": 0.0044492782281985365, + "mean_token_accuracy": 0.7454175353050232, + "num_tokens": 238801.0, + "step": 45, + "train/ce_loss": 1.0938209295272827 + }, + { + "epoch": 0.0044492782281985365, + "step": 45, + "train/sim_loss": 0.19921875 + }, + { + "epoch": 0.0044492782281985365, + "step": 45, + "train/total_loss": 0.30860084295272827 + }, + { + "entropy": 7.062989234924316, + "epoch": 0.004548151077714059, + "mean_token_accuracy": 0.7319587469100952, + "num_tokens": 243831.0, + "step": 46, + "train/ce_loss": 0.1027403399348259 + }, + { + "epoch": 0.004548151077714059, + "step": 46, + "train/sim_loss": 0.21875 + }, + { + "epoch": 0.004548151077714059, + "step": 46, + "train/total_loss": 0.2290240377187729 + }, + { + "entropy": 6.859626770019531, + "epoch": 0.004647023927229583, + "mean_token_accuracy": 0.6785079836845398, + "num_tokens": 248970.0, + "step": 47, + "train/ce_loss": 1.9300769567489624 + }, + { + "epoch": 0.004647023927229583, + "step": 47, + "train/sim_loss": 0.2421875 + }, + { + "epoch": 0.004647023927229583, + "step": 47, + "train/total_loss": 0.4351952075958252 + }, + { + "entropy": 6.56555700302124, + "epoch": 0.004745896776745106, + "mean_token_accuracy": 0.7164633870124817, + "num_tokens": 253993.0, + "step": 48, + "train/ce_loss": 1.6052716970443726 + }, + { + "epoch": 0.004745896776745106, + "step": 48, + "train/sim_loss": 0.21875 + }, + { + "epoch": 0.004745896776745106, + "step": 48, + "train/total_loss": 0.37927716970443726 + }, + { + "entropy": 6.6610870361328125, + "epoch": 0.004844769626260629, + "mean_token_accuracy": 0.7629796862602234, + "num_tokens": 259345.0, + "step": 49, + "train/ce_loss": 0.9208154678344727 + }, + { + "epoch": 0.004844769626260629, + "step": 49, + "train/sim_loss": 0.23046875 + }, + { + "epoch": 0.004844769626260629, + "step": 49, + "train/total_loss": 0.32255029678344727 + }, + { + "entropy": 7.085768699645996, + "epoch": 0.004943642475776152, + "mean_token_accuracy": 0.7152317762374878, + "num_tokens": 264265.0, + "step": 50, + "train/ce_loss": 1.4471031427383423 + }, + { + "epoch": 0.004943642475776152, + "step": 50, + "train/sim_loss": 0.23828125 + }, + { + "epoch": 0.004943642475776152, + "step": 50, + "train/total_loss": 0.3829915523529053 + }, + { + "entropy": 6.328176498413086, + "epoch": 0.005042515325291675, + "mean_token_accuracy": 0.7368420958518982, + "num_tokens": 269783.0, + "step": 51, + "train/ce_loss": 0.6005712151527405 + }, + { + "epoch": 0.005042515325291675, + "step": 51, + "train/sim_loss": 0.2265625 + }, + { + "epoch": 0.005042515325291675, + "step": 51, + "train/total_loss": 0.286619633436203 + }, + { + "entropy": 6.613964080810547, + "epoch": 0.005141388174807198, + "mean_token_accuracy": 0.7525196075439453, + "num_tokens": 275151.0, + "step": 52, + "train/ce_loss": 0.982277512550354 + }, + { + "epoch": 0.005141388174807198, + "step": 52, + "train/sim_loss": 0.2578125 + }, + { + "epoch": 0.005141388174807198, + "step": 52, + "train/total_loss": 0.35604023933410645 + }, + { + "entropy": 6.406770706176758, + "epoch": 0.005240261024322721, + "mean_token_accuracy": 0.6592427492141724, + "num_tokens": 280523.0, + "step": 53, + "train/ce_loss": 1.1580888032913208 + }, + { + "epoch": 0.005240261024322721, + "step": 53, + "train/sim_loss": 0.23828125 + }, + { + "epoch": 0.005240261024322721, + "step": 53, + "train/total_loss": 0.3540901243686676 + }, + { + "entropy": 7.066647052764893, + "epoch": 0.005339133873838244, + "mean_token_accuracy": 0.7491582632064819, + "num_tokens": 285554.0, + "step": 54, + "train/ce_loss": 1.6794772148132324 + }, + { + "epoch": 0.005339133873838244, + "step": 54, + "train/sim_loss": 0.25390625 + }, + { + "epoch": 0.005339133873838244, + "step": 54, + "train/total_loss": 0.4218539595603943 + }, + { + "entropy": 6.347116470336914, + "epoch": 0.005438006723353767, + "mean_token_accuracy": 0.7419700026512146, + "num_tokens": 290978.0, + "step": 55, + "train/ce_loss": 0.4674301743507385 + }, + { + "epoch": 0.005438006723353767, + "step": 55, + "train/sim_loss": 0.24609375 + }, + { + "epoch": 0.005438006723353767, + "step": 55, + "train/total_loss": 0.2928367555141449 + }, + { + "entropy": 6.935833930969238, + "epoch": 0.00553687957286929, + "mean_token_accuracy": 0.7323232293128967, + "num_tokens": 296402.0, + "step": 56, + "train/ce_loss": 0.6243418455123901 + }, + { + "epoch": 0.00553687957286929, + "step": 56, + "train/sim_loss": 0.33203125 + }, + { + "epoch": 0.00553687957286929, + "step": 56, + "train/total_loss": 0.39446544647216797 + }, + { + "entropy": 6.755821228027344, + "epoch": 0.005635752422384813, + "mean_token_accuracy": 0.7094240784645081, + "num_tokens": 301565.0, + "step": 57, + "train/ce_loss": 1.3958739042282104 + }, + { + "epoch": 0.005635752422384813, + "step": 57, + "train/sim_loss": 0.3125 + }, + { + "epoch": 0.005635752422384813, + "step": 57, + "train/total_loss": 0.45208740234375 + }, + { + "entropy": 6.747868061065674, + "epoch": 0.005734625271900337, + "mean_token_accuracy": 0.7036224007606506, + "num_tokens": 306899.0, + "step": 58, + "train/ce_loss": 1.1025996208190918 + }, + { + "epoch": 0.005734625271900337, + "step": 58, + "train/sim_loss": 0.2265625 + }, + { + "epoch": 0.005734625271900337, + "step": 58, + "train/total_loss": 0.3368224501609802 + }, + { + "entropy": 7.568367958068848, + "epoch": 0.0058334981214158595, + "mean_token_accuracy": 0.6982248425483704, + "num_tokens": 311618.0, + "step": 59, + "train/ce_loss": 0.13859635591506958 + }, + { + "epoch": 0.0058334981214158595, + "step": 59, + "train/sim_loss": 0.2265625 + }, + { + "epoch": 0.0058334981214158595, + "step": 59, + "train/total_loss": 0.24042212963104248 + }, + { + "epoch": 0.005932370970931382, + "grad_norm": 1.4971586465835571, + "learning_rate": 9.987885081343026e-06, + "loss": 0.3521, + "step": 60 + }, + { + "entropy": 6.701387882232666, + "epoch": 0.005932370970931382, + "mean_token_accuracy": 0.6508380174636841, + "num_tokens": 316843.0, + "step": 60, + "train/ce_loss": 1.0780704021453857 + }, + { + "epoch": 0.005932370970931382, + "step": 60, + "train/sim_loss": 0.21875 + }, + { + "epoch": 0.005932370970931382, + "step": 60, + "train/total_loss": 0.3265570402145386 + }, + { + "entropy": 7.042557239532471, + "epoch": 0.006031243820446905, + "mean_token_accuracy": 0.7535853981971741, + "num_tokens": 322073.0, + "step": 61, + "train/ce_loss": 0.8363357186317444 + }, + { + "epoch": 0.006031243820446905, + "step": 61, + "train/sim_loss": 0.21875 + }, + { + "epoch": 0.006031243820446905, + "step": 61, + "train/total_loss": 0.30238357186317444 + }, + { + "entropy": 7.509478569030762, + "epoch": 0.006130116669962428, + "mean_token_accuracy": 0.7476979494094849, + "num_tokens": 327050.0, + "step": 62, + "train/ce_loss": 1.542819857597351 + }, + { + "epoch": 0.006130116669962428, + "step": 62, + "train/sim_loss": 0.21875 + }, + { + "epoch": 0.006130116669962428, + "step": 62, + "train/total_loss": 0.37303197383880615 + }, + { + "entropy": 7.262565612792969, + "epoch": 0.006228989519477951, + "mean_token_accuracy": 0.7211428284645081, + "num_tokens": 332360.0, + "step": 63, + "train/ce_loss": 0.8255243897438049 + }, + { + "epoch": 0.006228989519477951, + "step": 63, + "train/sim_loss": 0.2734375 + }, + { + "epoch": 0.006228989519477951, + "step": 63, + "train/total_loss": 0.355989933013916 + }, + { + "entropy": 7.196902751922607, + "epoch": 0.006327862368993475, + "mean_token_accuracy": 0.7398772835731506, + "num_tokens": 337688.0, + "step": 64, + "train/ce_loss": 0.8732907176017761 + }, + { + "epoch": 0.006327862368993475, + "step": 64, + "train/sim_loss": 0.20703125 + }, + { + "epoch": 0.006327862368993475, + "step": 64, + "train/total_loss": 0.29436033964157104 + }, + { + "entropy": 6.966497421264648, + "epoch": 0.006426735218508998, + "mean_token_accuracy": 0.7328431606292725, + "num_tokens": 342964.0, + "step": 65, + "train/ce_loss": 1.7824984788894653 + }, + { + "epoch": 0.006426735218508998, + "step": 65, + "train/sim_loss": 0.20703125 + }, + { + "epoch": 0.006426735218508998, + "step": 65, + "train/total_loss": 0.3852810859680176 + }, + { + "entropy": 6.835407257080078, + "epoch": 0.0065256080680245205, + "mean_token_accuracy": 0.7473170757293701, + "num_tokens": 348451.0, + "step": 66, + "train/ce_loss": 0.9521581530570984 + }, + { + "epoch": 0.0065256080680245205, + "step": 66, + "train/sim_loss": 0.21875 + }, + { + "epoch": 0.0065256080680245205, + "step": 66, + "train/total_loss": 0.3139658272266388 + }, + { + "entropy": 7.106851100921631, + "epoch": 0.006624480917540043, + "mean_token_accuracy": 0.7011128664016724, + "num_tokens": 353568.0, + "step": 67, + "train/ce_loss": 2.121598243713379 + }, + { + "epoch": 0.006624480917540043, + "step": 67, + "train/sim_loss": 0.20703125 + }, + { + "epoch": 0.006624480917540043, + "step": 67, + "train/total_loss": 0.41919106245040894 + }, + { + "entropy": 7.332256317138672, + "epoch": 0.006723353767055566, + "mean_token_accuracy": 0.7397590279579163, + "num_tokens": 358854.0, + "step": 68, + "train/ce_loss": 0.9141528010368347 + }, + { + "epoch": 0.006723353767055566, + "step": 68, + "train/sim_loss": 0.1953125 + }, + { + "epoch": 0.006723353767055566, + "step": 68, + "train/total_loss": 0.28672778606414795 + }, + { + "entropy": 7.315262317657471, + "epoch": 0.00682222661657109, + "mean_token_accuracy": 0.6781250238418579, + "num_tokens": 363938.0, + "step": 69, + "train/ce_loss": 1.1206138134002686 + }, + { + "epoch": 0.00682222661657109, + "step": 69, + "train/sim_loss": 0.265625 + }, + { + "epoch": 0.00682222661657109, + "step": 69, + "train/total_loss": 0.37768638134002686 + }, + { + "entropy": 7.905448913574219, + "epoch": 0.006921099466086613, + "mean_token_accuracy": 0.70243901014328, + "num_tokens": 368995.0, + "step": 70, + "train/ce_loss": 0.7935793399810791 + }, + { + "epoch": 0.006921099466086613, + "step": 70, + "train/sim_loss": 0.2109375 + }, + { + "epoch": 0.006921099466086613, + "step": 70, + "train/total_loss": 0.29029542207717896 + }, + { + "entropy": 7.295677185058594, + "epoch": 0.007019972315602136, + "mean_token_accuracy": 0.6715328693389893, + "num_tokens": 374234.0, + "step": 71, + "train/ce_loss": 1.6432149410247803 + }, + { + "epoch": 0.007019972315602136, + "step": 71, + "train/sim_loss": 0.32421875 + }, + { + "epoch": 0.007019972315602136, + "step": 71, + "train/total_loss": 0.4885402321815491 + }, + { + "entropy": 7.1108551025390625, + "epoch": 0.007118845165117659, + "mean_token_accuracy": 0.7644927501678467, + "num_tokens": 379526.0, + "step": 72, + "train/ce_loss": 0.6361651420593262 + }, + { + "epoch": 0.007118845165117659, + "step": 72, + "train/sim_loss": 0.19921875 + }, + { + "epoch": 0.007118845165117659, + "step": 72, + "train/total_loss": 0.2628352642059326 + }, + { + "entropy": 7.745475769042969, + "epoch": 0.0072177180146331815, + "mean_token_accuracy": 0.7246596217155457, + "num_tokens": 384624.0, + "step": 73, + "train/ce_loss": 1.0735074281692505 + }, + { + "epoch": 0.0072177180146331815, + "step": 73, + "train/sim_loss": 0.25 + }, + { + "epoch": 0.0072177180146331815, + "step": 73, + "train/total_loss": 0.35735073685646057 + }, + { + "entropy": 7.32032585144043, + "epoch": 0.007316590864148704, + "mean_token_accuracy": 0.699284017086029, + "num_tokens": 389936.0, + "step": 74, + "train/ce_loss": 1.220793604850769 + }, + { + "epoch": 0.007316590864148704, + "step": 74, + "train/sim_loss": 0.30078125 + }, + { + "epoch": 0.007316590864148704, + "step": 74, + "train/total_loss": 0.42286062240600586 + }, + { + "entropy": 7.601529121398926, + "epoch": 0.007415463713664228, + "mean_token_accuracy": 0.723192036151886, + "num_tokens": 395205.0, + "step": 75, + "train/ce_loss": 1.2222076654434204 + }, + { + "epoch": 0.007415463713664228, + "step": 75, + "train/sim_loss": 0.23046875 + }, + { + "epoch": 0.007415463713664228, + "step": 75, + "train/total_loss": 0.3526895046234131 + }, + { + "entropy": 7.278405666351318, + "epoch": 0.007514336563179751, + "mean_token_accuracy": 0.7372061014175415, + "num_tokens": 400368.0, + "step": 76, + "train/ce_loss": 0.859329104423523 + }, + { + "epoch": 0.007514336563179751, + "step": 76, + "train/sim_loss": 0.17578125 + }, + { + "epoch": 0.007514336563179751, + "step": 76, + "train/total_loss": 0.2617141604423523 + }, + { + "entropy": 7.346745491027832, + "epoch": 0.007613209412695274, + "mean_token_accuracy": 0.6926229596138, + "num_tokens": 405502.0, + "step": 77, + "train/ce_loss": 0.06304176896810532 + }, + { + "epoch": 0.007613209412695274, + "step": 77, + "train/sim_loss": 0.19140625 + }, + { + "epoch": 0.007613209412695274, + "step": 77, + "train/total_loss": 0.19771042466163635 + }, + { + "entropy": 7.988180160522461, + "epoch": 0.007712082262210797, + "mean_token_accuracy": 0.7505938410758972, + "num_tokens": 410364.0, + "step": 78, + "train/ce_loss": 0.10073232650756836 + }, + { + "epoch": 0.007712082262210797, + "step": 78, + "train/sim_loss": 0.18359375 + }, + { + "epoch": 0.007712082262210797, + "step": 78, + "train/total_loss": 0.1936669796705246 + }, + { + "entropy": 7.201304912567139, + "epoch": 0.00781095511172632, + "mean_token_accuracy": 0.724950909614563, + "num_tokens": 415885.0, + "step": 79, + "train/ce_loss": 1.0036214590072632 + }, + { + "epoch": 0.00781095511172632, + "step": 79, + "train/sim_loss": 0.20703125 + }, + { + "epoch": 0.00781095511172632, + "step": 79, + "train/total_loss": 0.3073934018611908 + }, + { + "epoch": 0.007909827961241843, + "grad_norm": 0.834606409072876, + "learning_rate": 9.982940216585078e-06, + "loss": 0.3239, + "step": 80 + }, + { + "entropy": 7.307192802429199, + "epoch": 0.007909827961241843, + "mean_token_accuracy": 0.7760290503501892, + "num_tokens": 421192.0, + "step": 80, + "train/ce_loss": 1.2137980461120605 + }, + { + "epoch": 0.007909827961241843, + "step": 80, + "train/sim_loss": 0.28125 + }, + { + "epoch": 0.007909827961241843, + "step": 80, + "train/total_loss": 0.4026297926902771 + }, + { + "entropy": 6.885316848754883, + "epoch": 0.008008700810757366, + "mean_token_accuracy": 0.7960339784622192, + "num_tokens": 426776.0, + "step": 81, + "train/ce_loss": 0.7223502397537231 + }, + { + "epoch": 0.008008700810757366, + "step": 81, + "train/sim_loss": 0.19140625 + }, + { + "epoch": 0.008008700810757366, + "step": 81, + "train/total_loss": 0.26364126801490784 + }, + { + "entropy": 7.086678504943848, + "epoch": 0.008107573660272888, + "mean_token_accuracy": 0.7789784073829651, + "num_tokens": 432456.0, + "step": 82, + "train/ce_loss": 0.7674393653869629 + }, + { + "epoch": 0.008107573660272888, + "step": 82, + "train/sim_loss": 0.17578125 + }, + { + "epoch": 0.008107573660272888, + "step": 82, + "train/total_loss": 0.2525251805782318 + }, + { + "entropy": 8.099905967712402, + "epoch": 0.008206446509788412, + "mean_token_accuracy": 0.8287937641143799, + "num_tokens": 437135.0, + "step": 83, + "train/ce_loss": 0.17125576734542847 + }, + { + "epoch": 0.008206446509788412, + "step": 83, + "train/sim_loss": 0.1796875 + }, + { + "epoch": 0.008206446509788412, + "step": 83, + "train/total_loss": 0.19681307673454285 + }, + { + "entropy": 7.160604000091553, + "epoch": 0.008305319359303936, + "mean_token_accuracy": 0.7324613332748413, + "num_tokens": 442472.0, + "step": 84, + "train/ce_loss": 0.8014112114906311 + }, + { + "epoch": 0.008305319359303936, + "step": 84, + "train/sim_loss": 0.15625 + }, + { + "epoch": 0.008305319359303936, + "step": 84, + "train/total_loss": 0.2363911271095276 + }, + { + "entropy": 7.76218318939209, + "epoch": 0.008404192208819458, + "mean_token_accuracy": 0.7582237124443054, + "num_tokens": 447559.0, + "step": 85, + "train/ce_loss": 0.9764944314956665 + }, + { + "epoch": 0.008404192208819458, + "step": 85, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.008404192208819458, + "step": 85, + "train/total_loss": 0.2343682050704956 + }, + { + "entropy": 8.122018814086914, + "epoch": 0.008503065058334982, + "mean_token_accuracy": 0.7223300933837891, + "num_tokens": 452491.0, + "step": 86, + "train/ce_loss": 1.1845427751541138 + }, + { + "epoch": 0.008503065058334982, + "step": 86, + "train/sim_loss": 0.1796875 + }, + { + "epoch": 0.008503065058334982, + "step": 86, + "train/total_loss": 0.2981417775154114 + }, + { + "entropy": 7.716374397277832, + "epoch": 0.008601937907850504, + "mean_token_accuracy": 0.7362499833106995, + "num_tokens": 457972.0, + "step": 87, + "train/ce_loss": 0.9235560894012451 + }, + { + "epoch": 0.008601937907850504, + "step": 87, + "train/sim_loss": 0.26171875 + }, + { + "epoch": 0.008601937907850504, + "step": 87, + "train/total_loss": 0.3540743589401245 + }, + { + "entropy": 7.842241287231445, + "epoch": 0.008700810757366027, + "mean_token_accuracy": 0.703797459602356, + "num_tokens": 463188.0, + "step": 88, + "train/ce_loss": 1.2208952903747559 + }, + { + "epoch": 0.008700810757366027, + "step": 88, + "train/sim_loss": 0.1953125 + }, + { + "epoch": 0.008700810757366027, + "step": 88, + "train/total_loss": 0.31740203499794006 + }, + { + "entropy": 7.5404253005981445, + "epoch": 0.008799683606881551, + "mean_token_accuracy": 0.7666666507720947, + "num_tokens": 468311.0, + "step": 89, + "train/ce_loss": 1.0468343496322632 + }, + { + "epoch": 0.008799683606881551, + "step": 89, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.008799683606881551, + "step": 89, + "train/total_loss": 0.21405842900276184 + }, + { + "entropy": 7.966272354125977, + "epoch": 0.008898556456397073, + "mean_token_accuracy": 0.6932907104492188, + "num_tokens": 473025.0, + "step": 90, + "train/ce_loss": 0.13910500705242157 + }, + { + "epoch": 0.008898556456397073, + "step": 90, + "train/sim_loss": 0.21484375 + }, + { + "epoch": 0.008898556456397073, + "step": 90, + "train/total_loss": 0.22875425219535828 + }, + { + "entropy": 7.517707824707031, + "epoch": 0.008997429305912597, + "mean_token_accuracy": 0.7390761375427246, + "num_tokens": 478290.0, + "step": 91, + "train/ce_loss": 0.6987117528915405 + }, + { + "epoch": 0.008997429305912597, + "step": 91, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.008997429305912597, + "step": 91, + "train/total_loss": 0.202683687210083 + }, + { + "entropy": 7.53427791595459, + "epoch": 0.009096302155428119, + "mean_token_accuracy": 0.6783004403114319, + "num_tokens": 483445.0, + "step": 92, + "train/ce_loss": 1.182236909866333 + }, + { + "epoch": 0.009096302155428119, + "step": 92, + "train/sim_loss": 0.234375 + }, + { + "epoch": 0.009096302155428119, + "step": 92, + "train/total_loss": 0.3525986969470978 + }, + { + "entropy": 7.789042949676514, + "epoch": 0.009195175004943643, + "mean_token_accuracy": 0.7073529362678528, + "num_tokens": 488538.0, + "step": 93, + "train/ce_loss": 0.06518832594156265 + }, + { + "epoch": 0.009195175004943643, + "step": 93, + "train/sim_loss": 0.15234375 + }, + { + "epoch": 0.009195175004943643, + "step": 93, + "train/total_loss": 0.15886257588863373 + }, + { + "entropy": 7.619516372680664, + "epoch": 0.009294047854459166, + "mean_token_accuracy": 0.7179487347602844, + "num_tokens": 493813.0, + "step": 94, + "train/ce_loss": 0.8823347687721252 + }, + { + "epoch": 0.009294047854459166, + "step": 94, + "train/sim_loss": 0.18359375 + }, + { + "epoch": 0.009294047854459166, + "step": 94, + "train/total_loss": 0.27182722091674805 + }, + { + "entropy": 7.926183700561523, + "epoch": 0.009392920703974688, + "mean_token_accuracy": 0.7148703932762146, + "num_tokens": 498978.0, + "step": 95, + "train/ce_loss": 0.8853976130485535 + }, + { + "epoch": 0.009392920703974688, + "step": 95, + "train/sim_loss": 0.171875 + }, + { + "epoch": 0.009392920703974688, + "step": 95, + "train/total_loss": 0.2604147791862488 + }, + { + "entropy": 8.150131225585938, + "epoch": 0.009491793553490212, + "mean_token_accuracy": 0.7428571581840515, + "num_tokens": 504097.0, + "step": 96, + "train/ce_loss": 0.6387143731117249 + }, + { + "epoch": 0.009491793553490212, + "step": 96, + "train/sim_loss": 0.17578125 + }, + { + "epoch": 0.009491793553490212, + "step": 96, + "train/total_loss": 0.23965269327163696 + }, + { + "entropy": 7.242536544799805, + "epoch": 0.009590666403005734, + "mean_token_accuracy": 0.752212405204773, + "num_tokens": 509554.0, + "step": 97, + "train/ce_loss": 0.8059234023094177 + }, + { + "epoch": 0.009590666403005734, + "step": 97, + "train/sim_loss": 0.2421875 + }, + { + "epoch": 0.009590666403005734, + "step": 97, + "train/total_loss": 0.3227798342704773 + }, + { + "entropy": 7.3880696296691895, + "epoch": 0.009689539252521258, + "mean_token_accuracy": 0.724952757358551, + "num_tokens": 515123.0, + "step": 98, + "train/ce_loss": 1.176647424697876 + }, + { + "epoch": 0.009689539252521258, + "step": 98, + "train/sim_loss": 0.203125 + }, + { + "epoch": 0.009689539252521258, + "step": 98, + "train/total_loss": 0.32078975439071655 + }, + { + "entropy": 8.349966049194336, + "epoch": 0.00978841210203678, + "mean_token_accuracy": 0.749588131904602, + "num_tokens": 520171.0, + "step": 99, + "train/ce_loss": 0.06600559502840042 + }, + { + "epoch": 0.00978841210203678, + "step": 99, + "train/sim_loss": 0.203125 + }, + { + "epoch": 0.00978841210203678, + "step": 99, + "train/total_loss": 0.20972555875778198 + }, + { + "epoch": 0.009887284951552304, + "grad_norm": 1.0337260961532593, + "learning_rate": 9.977995351827128e-06, + "loss": 0.2901, + "step": 100 + }, + { + "entropy": 8.475637435913086, + "epoch": 0.009887284951552304, + "mean_token_accuracy": 0.6748120188713074, + "num_tokens": 525179.0, + "step": 100, + "train/ce_loss": 0.07647012174129486 + }, + { + "epoch": 0.009887284951552304, + "step": 100, + "train/sim_loss": 0.1796875 + }, + { + "epoch": 0.009887284951552304, + "step": 100, + "train/total_loss": 0.18733450770378113 + }, + { + "entropy": 8.2037353515625, + "epoch": 0.009986157801067827, + "mean_token_accuracy": 0.7037037014961243, + "num_tokens": 530303.0, + "step": 101, + "train/ce_loss": 0.7040248513221741 + }, + { + "epoch": 0.009986157801067827, + "step": 101, + "train/sim_loss": 0.16796875 + }, + { + "epoch": 0.009986157801067827, + "step": 101, + "train/total_loss": 0.23837123811244965 + }, + { + "entropy": 8.168619155883789, + "epoch": 0.01008503065058335, + "mean_token_accuracy": 0.7303797602653503, + "num_tokens": 535507.0, + "step": 102, + "train/ce_loss": 0.7577227354049683 + }, + { + "epoch": 0.01008503065058335, + "step": 102, + "train/sim_loss": 0.18359375 + }, + { + "epoch": 0.01008503065058335, + "step": 102, + "train/total_loss": 0.2593660354614258 + }, + { + "entropy": 8.024353981018066, + "epoch": 0.010183903500098873, + "mean_token_accuracy": 0.6881720423698425, + "num_tokens": 540582.0, + "step": 103, + "train/ce_loss": 1.9404637813568115 + }, + { + "epoch": 0.010183903500098873, + "step": 103, + "train/sim_loss": 0.2265625 + }, + { + "epoch": 0.010183903500098873, + "step": 103, + "train/total_loss": 0.42060887813568115 + }, + { + "entropy": 8.253752708435059, + "epoch": 0.010282776349614395, + "mean_token_accuracy": 0.7130434513092041, + "num_tokens": 545677.0, + "step": 104, + "train/ce_loss": 0.05784047022461891 + }, + { + "epoch": 0.010282776349614395, + "step": 104, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.010282776349614395, + "step": 104, + "train/total_loss": 0.1464090496301651 + }, + { + "entropy": 7.711334705352783, + "epoch": 0.010381649199129919, + "mean_token_accuracy": 0.7691428661346436, + "num_tokens": 551034.0, + "step": 105, + "train/ce_loss": 1.0411088466644287 + }, + { + "epoch": 0.010381649199129919, + "step": 105, + "train/sim_loss": 0.18359375 + }, + { + "epoch": 0.010381649199129919, + "step": 105, + "train/total_loss": 0.2877046465873718 + }, + { + "entropy": 7.441492080688477, + "epoch": 0.010480522048645443, + "mean_token_accuracy": 0.6875, + "num_tokens": 556413.0, + "step": 106, + "train/ce_loss": 0.6729193329811096 + }, + { + "epoch": 0.010480522048645443, + "step": 106, + "train/sim_loss": 0.1640625 + }, + { + "epoch": 0.010480522048645443, + "step": 106, + "train/total_loss": 0.23135444521903992 + }, + { + "entropy": 7.938560962677002, + "epoch": 0.010579394898160965, + "mean_token_accuracy": 0.6752910614013672, + "num_tokens": 561628.0, + "step": 107, + "train/ce_loss": 1.1121141910552979 + }, + { + "epoch": 0.010579394898160965, + "step": 107, + "train/sim_loss": 0.15625 + }, + { + "epoch": 0.010579394898160965, + "step": 107, + "train/total_loss": 0.2674614191055298 + }, + { + "entropy": 7.696544170379639, + "epoch": 0.010678267747676488, + "mean_token_accuracy": 0.7175732254981995, + "num_tokens": 567069.0, + "step": 108, + "train/ce_loss": 0.9015482068061829 + }, + { + "epoch": 0.010678267747676488, + "step": 108, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.010678267747676488, + "step": 108, + "train/total_loss": 0.19952982664108276 + }, + { + "entropy": 8.239730834960938, + "epoch": 0.01077714059719201, + "mean_token_accuracy": 0.7056276798248291, + "num_tokens": 572228.0, + "step": 109, + "train/ce_loss": 0.8910127282142639 + }, + { + "epoch": 0.01077714059719201, + "step": 109, + "train/sim_loss": 0.1875 + }, + { + "epoch": 0.01077714059719201, + "step": 109, + "train/total_loss": 0.27660128474235535 + }, + { + "entropy": 8.282722473144531, + "epoch": 0.010876013446707534, + "mean_token_accuracy": 0.711757242679596, + "num_tokens": 577446.0, + "step": 110, + "train/ce_loss": 1.4429800510406494 + }, + { + "epoch": 0.010876013446707534, + "step": 110, + "train/sim_loss": 0.234375 + }, + { + "epoch": 0.010876013446707534, + "step": 110, + "train/total_loss": 0.3786730170249939 + }, + { + "entropy": 8.14548110961914, + "epoch": 0.010974886296223058, + "mean_token_accuracy": 0.73591548204422, + "num_tokens": 582743.0, + "step": 111, + "train/ce_loss": 1.082576036453247 + }, + { + "epoch": 0.010974886296223058, + "step": 111, + "train/sim_loss": 0.20703125 + }, + { + "epoch": 0.010974886296223058, + "step": 111, + "train/total_loss": 0.31528884172439575 + }, + { + "entropy": 8.513275146484375, + "epoch": 0.01107375914573858, + "mean_token_accuracy": 0.7968127727508545, + "num_tokens": 587669.0, + "step": 112, + "train/ce_loss": 0.08766748011112213 + }, + { + "epoch": 0.01107375914573858, + "step": 112, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.01107375914573858, + "step": 112, + "train/total_loss": 0.12595424056053162 + }, + { + "entropy": 8.212099075317383, + "epoch": 0.011172631995254104, + "mean_token_accuracy": 0.7019002437591553, + "num_tokens": 592930.0, + "step": 113, + "train/ce_loss": 0.9837558269500732 + }, + { + "epoch": 0.011172631995254104, + "step": 113, + "train/sim_loss": 0.171875 + }, + { + "epoch": 0.011172631995254104, + "step": 113, + "train/total_loss": 0.2702505886554718 + }, + { + "entropy": 7.418394088745117, + "epoch": 0.011271504844769626, + "mean_token_accuracy": 0.7642626762390137, + "num_tokens": 598372.0, + "step": 114, + "train/ce_loss": 0.8529636859893799 + }, + { + "epoch": 0.011271504844769626, + "step": 114, + "train/sim_loss": 0.1796875 + }, + { + "epoch": 0.011271504844769626, + "step": 114, + "train/total_loss": 0.2649838626384735 + }, + { + "entropy": 8.68088436126709, + "epoch": 0.01137037769428515, + "mean_token_accuracy": 0.6915887594223022, + "num_tokens": 603489.0, + "step": 115, + "train/ce_loss": 0.07375526428222656 + }, + { + "epoch": 0.01137037769428515, + "step": 115, + "train/sim_loss": 0.14453125 + }, + { + "epoch": 0.01137037769428515, + "step": 115, + "train/total_loss": 0.15190677344799042 + }, + { + "entropy": 8.23654556274414, + "epoch": 0.011469250543800673, + "mean_token_accuracy": 0.6918158531188965, + "num_tokens": 608730.0, + "step": 116, + "train/ce_loss": 1.41244375705719 + }, + { + "epoch": 0.011469250543800673, + "step": 116, + "train/sim_loss": 0.26953125 + }, + { + "epoch": 0.011469250543800673, + "step": 116, + "train/total_loss": 0.41077563166618347 + }, + { + "entropy": 8.532476425170898, + "epoch": 0.011568123393316195, + "mean_token_accuracy": 0.7419928908348083, + "num_tokens": 613721.0, + "step": 117, + "train/ce_loss": 0.07354797422885895 + }, + { + "epoch": 0.011568123393316195, + "step": 117, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.011568123393316195, + "step": 117, + "train/total_loss": 0.15579229593276978 + }, + { + "entropy": 8.25015640258789, + "epoch": 0.011666996242831719, + "mean_token_accuracy": 0.6930572390556335, + "num_tokens": 618995.0, + "step": 118, + "train/ce_loss": 0.7497038841247559 + }, + { + "epoch": 0.011666996242831719, + "step": 118, + "train/sim_loss": 0.16796875 + }, + { + "epoch": 0.011666996242831719, + "step": 118, + "train/total_loss": 0.24293914437294006 + }, + { + "entropy": 7.956493377685547, + "epoch": 0.011765869092347241, + "mean_token_accuracy": 0.7132115960121155, + "num_tokens": 624420.0, + "step": 119, + "train/ce_loss": 0.7325726747512817 + }, + { + "epoch": 0.011765869092347241, + "step": 119, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.011765869092347241, + "step": 119, + "train/total_loss": 0.22169476747512817 + }, + { + "epoch": 0.011864741941862765, + "grad_norm": 0.8489806056022644, + "learning_rate": 9.973050487069179e-06, + "loss": 0.2843, + "step": 120 + }, + { + "entropy": 7.759334564208984, + "epoch": 0.011864741941862765, + "mean_token_accuracy": 0.7204301357269287, + "num_tokens": 629755.0, + "step": 120, + "train/ce_loss": 0.5303604006767273 + }, + { + "epoch": 0.011864741941862765, + "step": 120, + "train/sim_loss": 0.1875 + }, + { + "epoch": 0.011864741941862765, + "step": 120, + "train/total_loss": 0.24053603410720825 + }, + { + "entropy": 8.225971221923828, + "epoch": 0.011963614791378287, + "mean_token_accuracy": 0.6958677768707275, + "num_tokens": 634814.0, + "step": 121, + "train/ce_loss": 1.8621231317520142 + }, + { + "epoch": 0.011963614791378287, + "step": 121, + "train/sim_loss": 0.36328125 + }, + { + "epoch": 0.011963614791378287, + "step": 121, + "train/total_loss": 0.5494935512542725 + }, + { + "entropy": 8.129207611083984, + "epoch": 0.01206248764089381, + "mean_token_accuracy": 0.7058823704719543, + "num_tokens": 640095.0, + "step": 122, + "train/ce_loss": 0.8336842656135559 + }, + { + "epoch": 0.01206248764089381, + "step": 122, + "train/sim_loss": 0.15234375 + }, + { + "epoch": 0.01206248764089381, + "step": 122, + "train/total_loss": 0.2357121706008911 + }, + { + "entropy": 8.380830764770508, + "epoch": 0.012161360490409334, + "mean_token_accuracy": 0.7931034564971924, + "num_tokens": 645132.0, + "step": 123, + "train/ce_loss": 1.075549840927124 + }, + { + "epoch": 0.012161360490409334, + "step": 123, + "train/sim_loss": 0.15625 + }, + { + "epoch": 0.012161360490409334, + "step": 123, + "train/total_loss": 0.26380497217178345 + }, + { + "entropy": 8.702407836914062, + "epoch": 0.012260233339924856, + "mean_token_accuracy": 0.6796460151672363, + "num_tokens": 650314.0, + "step": 124, + "train/ce_loss": 0.9473351836204529 + }, + { + "epoch": 0.012260233339924856, + "step": 124, + "train/sim_loss": 0.19921875 + }, + { + "epoch": 0.012260233339924856, + "step": 124, + "train/total_loss": 0.2939522862434387 + }, + { + "entropy": 8.119245529174805, + "epoch": 0.01235910618944038, + "mean_token_accuracy": 0.773099422454834, + "num_tokens": 655612.0, + "step": 125, + "train/ce_loss": 0.8545771837234497 + }, + { + "epoch": 0.01235910618944038, + "step": 125, + "train/sim_loss": 0.2421875 + }, + { + "epoch": 0.01235910618944038, + "step": 125, + "train/total_loss": 0.3276452124118805 + }, + { + "entropy": 8.00175666809082, + "epoch": 0.012457979038955902, + "mean_token_accuracy": 0.7394366264343262, + "num_tokens": 660934.0, + "step": 126, + "train/ce_loss": 1.1516802310943604 + }, + { + "epoch": 0.012457979038955902, + "step": 126, + "train/sim_loss": 0.234375 + }, + { + "epoch": 0.012457979038955902, + "step": 126, + "train/total_loss": 0.349543035030365 + }, + { + "entropy": 8.274015426635742, + "epoch": 0.012556851888471426, + "mean_token_accuracy": 0.7601156234741211, + "num_tokens": 666080.0, + "step": 127, + "train/ce_loss": 0.6554973125457764 + }, + { + "epoch": 0.012556851888471426, + "step": 127, + "train/sim_loss": 0.2109375 + }, + { + "epoch": 0.012556851888471426, + "step": 127, + "train/total_loss": 0.27648723125457764 + }, + { + "entropy": 8.076434135437012, + "epoch": 0.01265572473798695, + "mean_token_accuracy": 0.6938519477844238, + "num_tokens": 671277.0, + "step": 128, + "train/ce_loss": 1.0039684772491455 + }, + { + "epoch": 0.01265572473798695, + "step": 128, + "train/sim_loss": 0.22265625 + }, + { + "epoch": 0.01265572473798695, + "step": 128, + "train/total_loss": 0.3230530917644501 + }, + { + "entropy": 7.593088626861572, + "epoch": 0.012754597587502471, + "mean_token_accuracy": 0.6751986145973206, + "num_tokens": 676804.0, + "step": 129, + "train/ce_loss": 1.3502253293991089 + }, + { + "epoch": 0.012754597587502471, + "step": 129, + "train/sim_loss": 0.20703125 + }, + { + "epoch": 0.012754597587502471, + "step": 129, + "train/total_loss": 0.34205377101898193 + }, + { + "entropy": 8.646316528320312, + "epoch": 0.012853470437017995, + "mean_token_accuracy": 0.7296819686889648, + "num_tokens": 681786.0, + "step": 130, + "train/ce_loss": 1.2546935081481934 + }, + { + "epoch": 0.012853470437017995, + "step": 130, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.012853470437017995, + "step": 130, + "train/total_loss": 0.2309381067752838 + }, + { + "entropy": 7.9116291999816895, + "epoch": 0.012952343286533517, + "mean_token_accuracy": 0.7299435138702393, + "num_tokens": 687168.0, + "step": 131, + "train/ce_loss": 1.1150727272033691 + }, + { + "epoch": 0.012952343286533517, + "step": 131, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.012952343286533517, + "step": 131, + "train/total_loss": 0.20525726675987244 + }, + { + "entropy": 8.308708190917969, + "epoch": 0.013051216136049041, + "mean_token_accuracy": 0.6818181872367859, + "num_tokens": 692372.0, + "step": 132, + "train/ce_loss": 1.6832152605056763 + }, + { + "epoch": 0.013051216136049041, + "step": 132, + "train/sim_loss": 0.2578125 + }, + { + "epoch": 0.013051216136049041, + "step": 132, + "train/total_loss": 0.42613404989242554 + }, + { + "entropy": 8.511515617370605, + "epoch": 0.013150088985564565, + "mean_token_accuracy": 0.7616000175476074, + "num_tokens": 697466.0, + "step": 133, + "train/ce_loss": 1.3688126802444458 + }, + { + "epoch": 0.013150088985564565, + "step": 133, + "train/sim_loss": 0.24609375 + }, + { + "epoch": 0.013150088985564565, + "step": 133, + "train/total_loss": 0.3829750418663025 + }, + { + "entropy": 8.20486068725586, + "epoch": 0.013248961835080087, + "mean_token_accuracy": 0.7224118113517761, + "num_tokens": 702816.0, + "step": 134, + "train/ce_loss": 0.690566897392273 + }, + { + "epoch": 0.013248961835080087, + "step": 134, + "train/sim_loss": 0.15234375 + }, + { + "epoch": 0.013248961835080087, + "step": 134, + "train/total_loss": 0.2214004397392273 + }, + { + "entropy": 8.287908554077148, + "epoch": 0.01334783468459561, + "mean_token_accuracy": 0.6638772487640381, + "num_tokens": 708006.0, + "step": 135, + "train/ce_loss": 1.7614285945892334 + }, + { + "epoch": 0.01334783468459561, + "step": 135, + "train/sim_loss": 0.23046875 + }, + { + "epoch": 0.01334783468459561, + "step": 135, + "train/total_loss": 0.4066116213798523 + }, + { + "entropy": 8.253705978393555, + "epoch": 0.013446707534111133, + "mean_token_accuracy": 0.7321041226387024, + "num_tokens": 713369.0, + "step": 136, + "train/ce_loss": 1.069230079650879 + }, + { + "epoch": 0.013446707534111133, + "step": 136, + "train/sim_loss": 0.16796875 + }, + { + "epoch": 0.013446707534111133, + "step": 136, + "train/total_loss": 0.27489176392555237 + }, + { + "entropy": 8.739953994750977, + "epoch": 0.013545580383626656, + "mean_token_accuracy": 0.7504000067710876, + "num_tokens": 718456.0, + "step": 137, + "train/ce_loss": 0.06112891063094139 + }, + { + "epoch": 0.013545580383626656, + "step": 137, + "train/sim_loss": 0.14453125 + }, + { + "epoch": 0.013545580383626656, + "step": 137, + "train/total_loss": 0.15064413845539093 + }, + { + "entropy": 7.99261474609375, + "epoch": 0.01364445323314218, + "mean_token_accuracy": 0.7210884094238281, + "num_tokens": 724037.0, + "step": 138, + "train/ce_loss": 0.8059151768684387 + }, + { + "epoch": 0.01364445323314218, + "step": 138, + "train/sim_loss": 0.20703125 + }, + { + "epoch": 0.01364445323314218, + "step": 138, + "train/total_loss": 0.2876227796077728 + }, + { + "entropy": 8.674912452697754, + "epoch": 0.013743326082657702, + "mean_token_accuracy": 0.6875981092453003, + "num_tokens": 729068.0, + "step": 139, + "train/ce_loss": 0.06736356765031815 + }, + { + "epoch": 0.013743326082657702, + "step": 139, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.013743326082657702, + "step": 139, + "train/total_loss": 0.12392385303974152 + }, + { + "epoch": 0.013842198932173226, + "grad_norm": 0.956753134727478, + "learning_rate": 9.968105622311231e-06, + "loss": 0.2816, + "step": 140 + }, + { + "entropy": 8.696903228759766, + "epoch": 0.013842198932173226, + "mean_token_accuracy": 0.723796010017395, + "num_tokens": 734252.0, + "step": 140, + "train/ce_loss": 1.2009042501449585 + }, + { + "epoch": 0.013842198932173226, + "step": 140, + "train/sim_loss": 0.18359375 + }, + { + "epoch": 0.013842198932173226, + "step": 140, + "train/total_loss": 0.30368417501449585 + }, + { + "entropy": 8.498991966247559, + "epoch": 0.013941071781688748, + "mean_token_accuracy": 0.6803030371665955, + "num_tokens": 739342.0, + "step": 141, + "train/ce_loss": 1.6696449518203735 + }, + { + "epoch": 0.013941071781688748, + "step": 141, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.013941071781688748, + "step": 141, + "train/total_loss": 0.30758950114250183 + }, + { + "entropy": 8.561853408813477, + "epoch": 0.014039944631204272, + "mean_token_accuracy": 0.7418967485427856, + "num_tokens": 744599.0, + "step": 142, + "train/ce_loss": 1.0845693349838257 + }, + { + "epoch": 0.014039944631204272, + "step": 142, + "train/sim_loss": 0.21875 + }, + { + "epoch": 0.014039944631204272, + "step": 142, + "train/total_loss": 0.32720693945884705 + }, + { + "entropy": 9.196142196655273, + "epoch": 0.014138817480719794, + "mean_token_accuracy": 0.690773069858551, + "num_tokens": 749401.0, + "step": 143, + "train/ce_loss": 0.10551132261753082 + }, + { + "epoch": 0.014138817480719794, + "step": 143, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.014138817480719794, + "step": 143, + "train/total_loss": 0.10039488226175308 + }, + { + "entropy": 8.558753967285156, + "epoch": 0.014237690330235317, + "mean_token_accuracy": 0.7275362610816956, + "num_tokens": 754544.0, + "step": 144, + "train/ce_loss": 0.8957123160362244 + }, + { + "epoch": 0.014237690330235317, + "step": 144, + "train/sim_loss": 0.2109375 + }, + { + "epoch": 0.014237690330235317, + "step": 144, + "train/total_loss": 0.3005087375640869 + }, + { + "entropy": 8.242622375488281, + "epoch": 0.014336563179750841, + "mean_token_accuracy": 0.7183406352996826, + "num_tokens": 759930.0, + "step": 145, + "train/ce_loss": 0.9891186952590942 + }, + { + "epoch": 0.014336563179750841, + "step": 145, + "train/sim_loss": 0.234375 + }, + { + "epoch": 0.014336563179750841, + "step": 145, + "train/total_loss": 0.3332868814468384 + }, + { + "entropy": 8.081424713134766, + "epoch": 0.014435436029266363, + "mean_token_accuracy": 0.7152941226959229, + "num_tokens": 765262.0, + "step": 146, + "train/ce_loss": 0.8288601040840149 + }, + { + "epoch": 0.014435436029266363, + "step": 146, + "train/sim_loss": 0.1796875 + }, + { + "epoch": 0.014435436029266363, + "step": 146, + "train/total_loss": 0.2625735104084015 + }, + { + "entropy": 8.312261581420898, + "epoch": 0.014534308878781887, + "mean_token_accuracy": 0.7093153595924377, + "num_tokens": 770605.0, + "step": 147, + "train/ce_loss": 0.94538414478302 + }, + { + "epoch": 0.014534308878781887, + "step": 147, + "train/sim_loss": 0.203125 + }, + { + "epoch": 0.014534308878781887, + "step": 147, + "train/total_loss": 0.2976634204387665 + }, + { + "entropy": 8.404376029968262, + "epoch": 0.014633181728297409, + "mean_token_accuracy": 0.7289837002754211, + "num_tokens": 775906.0, + "step": 148, + "train/ce_loss": 1.1064625978469849 + }, + { + "epoch": 0.014633181728297409, + "step": 148, + "train/sim_loss": 0.21484375 + }, + { + "epoch": 0.014633181728297409, + "step": 148, + "train/total_loss": 0.32548999786376953 + }, + { + "entropy": 9.281839370727539, + "epoch": 0.014732054577812933, + "mean_token_accuracy": 0.6978922486305237, + "num_tokens": 780756.0, + "step": 149, + "train/ce_loss": 0.09931197762489319 + }, + { + "epoch": 0.014732054577812933, + "step": 149, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.014732054577812933, + "step": 149, + "train/total_loss": 0.08805619925260544 + }, + { + "entropy": 8.621355056762695, + "epoch": 0.014830927427328456, + "mean_token_accuracy": 0.7159841656684875, + "num_tokens": 785977.0, + "step": 150, + "train/ce_loss": 0.6905444860458374 + }, + { + "epoch": 0.014830927427328456, + "step": 150, + "train/sim_loss": 0.17578125 + }, + { + "epoch": 0.014830927427328456, + "step": 150, + "train/total_loss": 0.24483570456504822 + }, + { + "entropy": 8.721384048461914, + "epoch": 0.014929800276843978, + "mean_token_accuracy": 0.7140864729881287, + "num_tokens": 791153.0, + "step": 151, + "train/ce_loss": 0.8488461375236511 + }, + { + "epoch": 0.014929800276843978, + "step": 151, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.014929800276843978, + "step": 151, + "train/total_loss": 0.2176971137523651 + }, + { + "entropy": 8.786897659301758, + "epoch": 0.015028673126359502, + "mean_token_accuracy": 0.6991018056869507, + "num_tokens": 796262.0, + "step": 152, + "train/ce_loss": 1.2345761060714722 + }, + { + "epoch": 0.015028673126359502, + "step": 152, + "train/sim_loss": 0.2109375 + }, + { + "epoch": 0.015028673126359502, + "step": 152, + "train/total_loss": 0.3343951106071472 + }, + { + "entropy": 8.233671188354492, + "epoch": 0.015127545975875024, + "mean_token_accuracy": 0.7185430526733398, + "num_tokens": 801682.0, + "step": 153, + "train/ce_loss": 1.0754245519638062 + }, + { + "epoch": 0.015127545975875024, + "step": 153, + "train/sim_loss": 0.2109375 + }, + { + "epoch": 0.015127545975875024, + "step": 153, + "train/total_loss": 0.3184799551963806 + }, + { + "entropy": 8.390192985534668, + "epoch": 0.015226418825390548, + "mean_token_accuracy": 0.725400447845459, + "num_tokens": 806996.0, + "step": 154, + "train/ce_loss": 0.7750483751296997 + }, + { + "epoch": 0.015226418825390548, + "step": 154, + "train/sim_loss": 0.1875 + }, + { + "epoch": 0.015226418825390548, + "step": 154, + "train/total_loss": 0.26500484347343445 + }, + { + "entropy": 8.63515853881836, + "epoch": 0.015325291674906072, + "mean_token_accuracy": 0.7489655017852783, + "num_tokens": 812174.0, + "step": 155, + "train/ce_loss": 1.156675934791565 + }, + { + "epoch": 0.015325291674906072, + "step": 155, + "train/sim_loss": 0.2109375 + }, + { + "epoch": 0.015325291674906072, + "step": 155, + "train/total_loss": 0.32660508155822754 + }, + { + "entropy": 8.505970001220703, + "epoch": 0.015424164524421594, + "mean_token_accuracy": 0.7043847441673279, + "num_tokens": 817404.0, + "step": 156, + "train/ce_loss": 1.3409483432769775 + }, + { + "epoch": 0.015424164524421594, + "step": 156, + "train/sim_loss": 0.16796875 + }, + { + "epoch": 0.015424164524421594, + "step": 156, + "train/total_loss": 0.30206358432769775 + }, + { + "entropy": 8.562335968017578, + "epoch": 0.015523037373937117, + "mean_token_accuracy": 0.703903079032898, + "num_tokens": 822586.0, + "step": 157, + "train/ce_loss": 0.9353137612342834 + }, + { + "epoch": 0.015523037373937117, + "step": 157, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.015523037373937117, + "step": 157, + "train/total_loss": 0.21071887016296387 + }, + { + "entropy": 8.562350273132324, + "epoch": 0.01562191022345264, + "mean_token_accuracy": 0.7104825377464294, + "num_tokens": 827639.0, + "step": 158, + "train/ce_loss": 2.006349802017212 + }, + { + "epoch": 0.01562191022345264, + "step": 158, + "train/sim_loss": 0.21875 + }, + { + "epoch": 0.01562191022345264, + "step": 158, + "train/total_loss": 0.41938498616218567 + }, + { + "entropy": 8.904520034790039, + "epoch": 0.01572078307296816, + "mean_token_accuracy": 0.7043235898017883, + "num_tokens": 832831.0, + "step": 159, + "train/ce_loss": 0.9618018865585327 + }, + { + "epoch": 0.01572078307296816, + "step": 159, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.01572078307296816, + "step": 159, + "train/total_loss": 0.23680520057678223 + }, + { + "epoch": 0.015819655922483685, + "grad_norm": 1.015324592590332, + "learning_rate": 9.963160757553282e-06, + "loss": 0.2759, + "step": 160 + }, + { + "entropy": 8.706175804138184, + "epoch": 0.015819655922483685, + "mean_token_accuracy": 0.6988416910171509, + "num_tokens": 838103.0, + "step": 160, + "train/ce_loss": 0.5171559453010559 + }, + { + "epoch": 0.015819655922483685, + "step": 160, + "train/sim_loss": 0.15234375 + }, + { + "epoch": 0.015819655922483685, + "step": 160, + "train/total_loss": 0.20405934751033783 + }, + { + "entropy": 8.741148948669434, + "epoch": 0.01591852877199921, + "mean_token_accuracy": 0.7098930478096008, + "num_tokens": 843328.0, + "step": 161, + "train/ce_loss": 1.5813908576965332 + }, + { + "epoch": 0.01591852877199921, + "step": 161, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.01591852877199921, + "step": 161, + "train/total_loss": 0.23235784471035004 + }, + { + "entropy": 8.80101203918457, + "epoch": 0.016017401621514733, + "mean_token_accuracy": 0.7891246676445007, + "num_tokens": 848505.0, + "step": 162, + "train/ce_loss": 0.05155276134610176 + }, + { + "epoch": 0.016017401621514733, + "step": 162, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.016017401621514733, + "step": 162, + "train/total_loss": 0.1379677802324295 + }, + { + "entropy": 9.00037956237793, + "epoch": 0.016116274471030256, + "mean_token_accuracy": 0.708737850189209, + "num_tokens": 853575.0, + "step": 163, + "train/ce_loss": 0.06782928854227066 + }, + { + "epoch": 0.016116274471030256, + "step": 163, + "train/sim_loss": 0.14453125 + }, + { + "epoch": 0.016116274471030256, + "step": 163, + "train/total_loss": 0.15131418406963348 + }, + { + "entropy": 8.507973670959473, + "epoch": 0.016215147320545777, + "mean_token_accuracy": 0.7311370968818665, + "num_tokens": 858942.0, + "step": 164, + "train/ce_loss": 0.906366765499115 + }, + { + "epoch": 0.016215147320545777, + "step": 164, + "train/sim_loss": 0.17578125 + }, + { + "epoch": 0.016215147320545777, + "step": 164, + "train/total_loss": 0.266417920589447 + }, + { + "entropy": 8.641292572021484, + "epoch": 0.0163140201700613, + "mean_token_accuracy": 0.6557788848876953, + "num_tokens": 864259.0, + "step": 165, + "train/ce_loss": 1.7006336450576782 + }, + { + "epoch": 0.0163140201700613, + "step": 165, + "train/sim_loss": 0.2265625 + }, + { + "epoch": 0.0163140201700613, + "step": 165, + "train/total_loss": 0.3966258764266968 + }, + { + "entropy": 8.167360305786133, + "epoch": 0.016412893019576824, + "mean_token_accuracy": 0.6848739385604858, + "num_tokens": 869746.0, + "step": 166, + "train/ce_loss": 1.4542746543884277 + }, + { + "epoch": 0.016412893019576824, + "step": 166, + "train/sim_loss": 0.23828125 + }, + { + "epoch": 0.016412893019576824, + "step": 166, + "train/total_loss": 0.3837087154388428 + }, + { + "entropy": 9.23829460144043, + "epoch": 0.016511765869092348, + "mean_token_accuracy": 0.6920473575592041, + "num_tokens": 874724.0, + "step": 167, + "train/ce_loss": 0.07225502282381058 + }, + { + "epoch": 0.016511765869092348, + "step": 167, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.016511765869092348, + "step": 167, + "train/total_loss": 0.13222549855709076 + }, + { + "entropy": 8.727310180664062, + "epoch": 0.01661063871860787, + "mean_token_accuracy": 0.7001023292541504, + "num_tokens": 880467.0, + "step": 168, + "train/ce_loss": 1.0050170421600342 + }, + { + "epoch": 0.01661063871860787, + "step": 168, + "train/sim_loss": 0.16796875 + }, + { + "epoch": 0.01661063871860787, + "step": 168, + "train/total_loss": 0.2684704661369324 + }, + { + "entropy": 9.13823127746582, + "epoch": 0.016709511568123392, + "mean_token_accuracy": 0.7419354915618896, + "num_tokens": 885447.0, + "step": 169, + "train/ce_loss": 0.07551628351211548 + }, + { + "epoch": 0.016709511568123392, + "step": 169, + "train/sim_loss": 0.22265625 + }, + { + "epoch": 0.016709511568123392, + "step": 169, + "train/total_loss": 0.2302078753709793 + }, + { + "entropy": 8.848125457763672, + "epoch": 0.016808384417638916, + "mean_token_accuracy": 0.7138413786888123, + "num_tokens": 890538.0, + "step": 170, + "train/ce_loss": 1.4403102397918701 + }, + { + "epoch": 0.016808384417638916, + "step": 170, + "train/sim_loss": 0.1953125 + }, + { + "epoch": 0.016808384417638916, + "step": 170, + "train/total_loss": 0.3393435478210449 + }, + { + "entropy": 9.174699783325195, + "epoch": 0.01690725726715444, + "mean_token_accuracy": 0.7342256307601929, + "num_tokens": 895528.0, + "step": 171, + "train/ce_loss": 1.0800552368164062 + }, + { + "epoch": 0.01690725726715444, + "step": 171, + "train/sim_loss": 0.15234375 + }, + { + "epoch": 0.01690725726715444, + "step": 171, + "train/total_loss": 0.2603492736816406 + }, + { + "entropy": 8.827863693237305, + "epoch": 0.017006130116669963, + "mean_token_accuracy": 0.7016574740409851, + "num_tokens": 900722.0, + "step": 172, + "train/ce_loss": 1.1682084798812866 + }, + { + "epoch": 0.017006130116669963, + "step": 172, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.017006130116669963, + "step": 172, + "train/total_loss": 0.24572709202766418 + }, + { + "entropy": 8.44694709777832, + "epoch": 0.017105002966185487, + "mean_token_accuracy": 0.6686217188835144, + "num_tokens": 906192.0, + "step": 173, + "train/ce_loss": 1.4545676708221436 + }, + { + "epoch": 0.017105002966185487, + "step": 173, + "train/sim_loss": 0.2265625 + }, + { + "epoch": 0.017105002966185487, + "step": 173, + "train/total_loss": 0.37201929092407227 + }, + { + "entropy": 9.067190170288086, + "epoch": 0.017203875815701007, + "mean_token_accuracy": 0.7195325493812561, + "num_tokens": 911200.0, + "step": 174, + "train/ce_loss": 1.3222960233688354 + }, + { + "epoch": 0.017203875815701007, + "step": 174, + "train/sim_loss": 0.21484375 + }, + { + "epoch": 0.017203875815701007, + "step": 174, + "train/total_loss": 0.34707337617874146 + }, + { + "entropy": 8.505998611450195, + "epoch": 0.01730274866521653, + "mean_token_accuracy": 0.7523029446601868, + "num_tokens": 916678.0, + "step": 175, + "train/ce_loss": 0.8026020526885986 + }, + { + "epoch": 0.01730274866521653, + "step": 175, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.01730274866521653, + "step": 175, + "train/total_loss": 0.20135396718978882 + }, + { + "entropy": 8.75352954864502, + "epoch": 0.017401621514732055, + "mean_token_accuracy": 0.7118226885795593, + "num_tokens": 921948.0, + "step": 176, + "train/ce_loss": 1.0283185243606567 + }, + { + "epoch": 0.017401621514732055, + "step": 176, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.017401621514732055, + "step": 176, + "train/total_loss": 0.2512693405151367 + }, + { + "entropy": 8.730976104736328, + "epoch": 0.01750049436424758, + "mean_token_accuracy": 0.727385401725769, + "num_tokens": 927247.0, + "step": 177, + "train/ce_loss": 0.911525547504425 + }, + { + "epoch": 0.01750049436424758, + "step": 177, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.01750049436424758, + "step": 177, + "train/total_loss": 0.20834004878997803 + }, + { + "entropy": 8.64146614074707, + "epoch": 0.017599367213763102, + "mean_token_accuracy": 0.7541163563728333, + "num_tokens": 932641.0, + "step": 178, + "train/ce_loss": 0.9116729497909546 + }, + { + "epoch": 0.017599367213763102, + "step": 178, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.017599367213763102, + "step": 178, + "train/total_loss": 0.22397980093955994 + }, + { + "entropy": 9.030397415161133, + "epoch": 0.017698240063278622, + "mean_token_accuracy": 0.7196261882781982, + "num_tokens": 937755.0, + "step": 179, + "train/ce_loss": 0.7100560069084167 + }, + { + "epoch": 0.017698240063278622, + "step": 179, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.017698240063278622, + "step": 179, + "train/total_loss": 0.21944311261177063 + }, + { + "epoch": 0.017797112912794146, + "grad_norm": 1.0556875467300415, + "learning_rate": 9.958215892795334e-06, + "loss": 0.2623, + "step": 180 + }, + { + "entropy": 9.004786491394043, + "epoch": 0.017797112912794146, + "mean_token_accuracy": 0.670976996421814, + "num_tokens": 942893.0, + "step": 180, + "train/ce_loss": 1.569191575050354 + }, + { + "epoch": 0.017797112912794146, + "step": 180, + "train/sim_loss": 0.16796875 + }, + { + "epoch": 0.017797112912794146, + "step": 180, + "train/total_loss": 0.3248879313468933 + }, + { + "entropy": 8.789143562316895, + "epoch": 0.01789598576230967, + "mean_token_accuracy": 0.7061281204223633, + "num_tokens": 948108.0, + "step": 181, + "train/ce_loss": 0.8462908864021301 + }, + { + "epoch": 0.01789598576230967, + "step": 181, + "train/sim_loss": 0.14453125 + }, + { + "epoch": 0.01789598576230967, + "step": 181, + "train/total_loss": 0.229160338640213 + }, + { + "entropy": 8.294668197631836, + "epoch": 0.017994858611825194, + "mean_token_accuracy": 0.7203980088233948, + "num_tokens": 953603.0, + "step": 182, + "train/ce_loss": 1.0397672653198242 + }, + { + "epoch": 0.017994858611825194, + "step": 182, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.017994858611825194, + "step": 182, + "train/total_loss": 0.23678922653198242 + }, + { + "entropy": 8.335565567016602, + "epoch": 0.018093731461340717, + "mean_token_accuracy": 0.7104413509368896, + "num_tokens": 959017.0, + "step": 183, + "train/ce_loss": 1.5085524320602417 + }, + { + "epoch": 0.018093731461340717, + "step": 183, + "train/sim_loss": 0.2265625 + }, + { + "epoch": 0.018093731461340717, + "step": 183, + "train/total_loss": 0.37741774320602417 + }, + { + "entropy": 8.205455780029297, + "epoch": 0.018192604310856238, + "mean_token_accuracy": 0.7226027250289917, + "num_tokens": 964411.0, + "step": 184, + "train/ce_loss": 0.7219122648239136 + }, + { + "epoch": 0.018192604310856238, + "step": 184, + "train/sim_loss": 0.1953125 + }, + { + "epoch": 0.018192604310856238, + "step": 184, + "train/total_loss": 0.2675037384033203 + }, + { + "entropy": 9.04613971710205, + "epoch": 0.01829147716037176, + "mean_token_accuracy": 0.7063491940498352, + "num_tokens": 969465.0, + "step": 185, + "train/ce_loss": 1.5215908288955688 + }, + { + "epoch": 0.01829147716037176, + "step": 185, + "train/sim_loss": 0.17578125 + }, + { + "epoch": 0.01829147716037176, + "step": 185, + "train/total_loss": 0.32794034481048584 + }, + { + "entropy": 9.08860969543457, + "epoch": 0.018390350009887285, + "mean_token_accuracy": 0.7233333587646484, + "num_tokens": 974538.0, + "step": 186, + "train/ce_loss": 2.49151611328125 + }, + { + "epoch": 0.018390350009887285, + "step": 186, + "train/sim_loss": 0.20703125 + }, + { + "epoch": 0.018390350009887285, + "step": 186, + "train/total_loss": 0.4561828672885895 + }, + { + "entropy": 8.43522834777832, + "epoch": 0.01848922285940281, + "mean_token_accuracy": 0.692150890827179, + "num_tokens": 980012.0, + "step": 187, + "train/ce_loss": 0.5604060292243958 + }, + { + "epoch": 0.01848922285940281, + "step": 187, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.01848922285940281, + "step": 187, + "train/total_loss": 0.19275934994220734 + }, + { + "entropy": 8.536243438720703, + "epoch": 0.018588095708918333, + "mean_token_accuracy": 0.761562168598175, + "num_tokens": 985454.0, + "step": 188, + "train/ce_loss": 0.6313604712486267 + }, + { + "epoch": 0.018588095708918333, + "step": 188, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.018588095708918333, + "step": 188, + "train/total_loss": 0.1686047911643982 + }, + { + "entropy": 9.47385025024414, + "epoch": 0.018686968558433853, + "mean_token_accuracy": 0.7377398610115051, + "num_tokens": 990496.0, + "step": 189, + "train/ce_loss": 0.0897272527217865 + }, + { + "epoch": 0.018686968558433853, + "step": 189, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.018686968558433853, + "step": 189, + "train/total_loss": 0.08709772676229477 + }, + { + "entropy": 8.622787475585938, + "epoch": 0.018785841407949377, + "mean_token_accuracy": 0.7806817889213562, + "num_tokens": 995895.0, + "step": 190, + "train/ce_loss": 0.710089385509491 + }, + { + "epoch": 0.018785841407949377, + "step": 190, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.018785841407949377, + "step": 190, + "train/total_loss": 0.15694645047187805 + }, + { + "entropy": 8.932117462158203, + "epoch": 0.0188847142574649, + "mean_token_accuracy": 0.7503876090049744, + "num_tokens": 1000931.0, + "step": 191, + "train/ce_loss": 0.06429041922092438 + }, + { + "epoch": 0.0188847142574649, + "step": 191, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.0188847142574649, + "step": 191, + "train/total_loss": 0.1236165389418602 + }, + { + "entropy": 9.011377334594727, + "epoch": 0.018983587106980424, + "mean_token_accuracy": 0.7205674052238464, + "num_tokens": 1006037.0, + "step": 192, + "train/ce_loss": 1.5213985443115234 + }, + { + "epoch": 0.018983587106980424, + "step": 192, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.018983587106980424, + "step": 192, + "train/total_loss": 0.23807735741138458 + }, + { + "entropy": 8.612946510314941, + "epoch": 0.019082459956495944, + "mean_token_accuracy": 0.6990496516227722, + "num_tokens": 1011448.0, + "step": 193, + "train/ce_loss": 0.9318941831588745 + }, + { + "epoch": 0.019082459956495944, + "step": 193, + "train/sim_loss": 0.1640625 + }, + { + "epoch": 0.019082459956495944, + "step": 193, + "train/total_loss": 0.25725191831588745 + }, + { + "entropy": 8.525861740112305, + "epoch": 0.019181332806011468, + "mean_token_accuracy": 0.7281553149223328, + "num_tokens": 1016745.0, + "step": 194, + "train/ce_loss": 1.1830980777740479 + }, + { + "epoch": 0.019181332806011468, + "step": 194, + "train/sim_loss": 0.19140625 + }, + { + "epoch": 0.019181332806011468, + "step": 194, + "train/total_loss": 0.30971604585647583 + }, + { + "entropy": 9.627721786499023, + "epoch": 0.019280205655526992, + "mean_token_accuracy": 0.7314410209655762, + "num_tokens": 1021631.0, + "step": 195, + "train/ce_loss": 0.09438954293727875 + }, + { + "epoch": 0.019280205655526992, + "step": 195, + "train/sim_loss": 0.19140625 + }, + { + "epoch": 0.019280205655526992, + "step": 195, + "train/total_loss": 0.20084521174430847 + }, + { + "entropy": 8.480622291564941, + "epoch": 0.019379078505042516, + "mean_token_accuracy": 0.7431694269180298, + "num_tokens": 1027045.0, + "step": 196, + "train/ce_loss": 0.9766530990600586 + }, + { + "epoch": 0.019379078505042516, + "step": 196, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.019379078505042516, + "step": 196, + "train/total_loss": 0.17188405990600586 + }, + { + "entropy": 9.240240097045898, + "epoch": 0.01947795135455804, + "mean_token_accuracy": 0.7355371713638306, + "num_tokens": 1032065.0, + "step": 197, + "train/ce_loss": 1.199245810508728 + }, + { + "epoch": 0.01947795135455804, + "step": 197, + "train/sim_loss": 0.234375 + }, + { + "epoch": 0.01947795135455804, + "step": 197, + "train/total_loss": 0.3542995750904083 + }, + { + "entropy": 8.914320945739746, + "epoch": 0.01957682420407356, + "mean_token_accuracy": 0.7111111283302307, + "num_tokens": 1037297.0, + "step": 198, + "train/ce_loss": 1.3729205131530762 + }, + { + "epoch": 0.01957682420407356, + "step": 198, + "train/sim_loss": 0.21875 + }, + { + "epoch": 0.01957682420407356, + "step": 198, + "train/total_loss": 0.3560420572757721 + }, + { + "entropy": 8.601119041442871, + "epoch": 0.019675697053589083, + "mean_token_accuracy": 0.7517588138580322, + "num_tokens": 1042771.0, + "step": 199, + "train/ce_loss": 0.7754992246627808 + }, + { + "epoch": 0.019675697053589083, + "step": 199, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.019675697053589083, + "step": 199, + "train/total_loss": 0.21426868438720703 + }, + { + "epoch": 0.019774569903104607, + "grad_norm": 1.345518946647644, + "learning_rate": 9.953271028037384e-06, + "loss": 0.2608, + "step": 200 + }, + { + "entropy": 9.236515045166016, + "epoch": 0.019774569903104607, + "mean_token_accuracy": 0.7239263653755188, + "num_tokens": 1047876.0, + "step": 200, + "train/ce_loss": 0.7935214638710022 + }, + { + "epoch": 0.019774569903104607, + "step": 200, + "train/sim_loss": 0.24609375 + }, + { + "epoch": 0.019774569903104607, + "step": 200, + "train/total_loss": 0.32544589042663574 + }, + { + "entropy": 8.959417343139648, + "epoch": 0.01987344275262013, + "mean_token_accuracy": 0.7191011309623718, + "num_tokens": 1053155.0, + "step": 201, + "train/ce_loss": 1.5126712322235107 + }, + { + "epoch": 0.01987344275262013, + "step": 201, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.01987344275262013, + "step": 201, + "train/total_loss": 0.2723608613014221 + }, + { + "entropy": 8.881634712219238, + "epoch": 0.019972315602135655, + "mean_token_accuracy": 0.728205144405365, + "num_tokens": 1058418.0, + "step": 202, + "train/ce_loss": 0.977267861366272 + }, + { + "epoch": 0.019972315602135655, + "step": 202, + "train/sim_loss": 0.16796875 + }, + { + "epoch": 0.019972315602135655, + "step": 202, + "train/total_loss": 0.2656955420970917 + }, + { + "entropy": 8.64078140258789, + "epoch": 0.020071188451651175, + "mean_token_accuracy": 0.6912899613380432, + "num_tokens": 1063760.0, + "step": 203, + "train/ce_loss": 0.5148274302482605 + }, + { + "epoch": 0.020071188451651175, + "step": 203, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.020071188451651175, + "step": 203, + "train/total_loss": 0.12179524451494217 + }, + { + "entropy": 8.804824829101562, + "epoch": 0.0201700613011667, + "mean_token_accuracy": 0.7189542651176453, + "num_tokens": 1069021.0, + "step": 204, + "train/ce_loss": 0.5810104608535767 + }, + { + "epoch": 0.0201700613011667, + "step": 204, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.0201700613011667, + "step": 204, + "train/total_loss": 0.19481979310512543 + }, + { + "entropy": 8.664105415344238, + "epoch": 0.020268934150682223, + "mean_token_accuracy": 0.7547568678855896, + "num_tokens": 1074471.0, + "step": 205, + "train/ce_loss": 0.7599750757217407 + }, + { + "epoch": 0.020268934150682223, + "step": 205, + "train/sim_loss": 0.1640625 + }, + { + "epoch": 0.020268934150682223, + "step": 205, + "train/total_loss": 0.2400600016117096 + }, + { + "entropy": 8.950141906738281, + "epoch": 0.020367807000197746, + "mean_token_accuracy": 0.7085253596305847, + "num_tokens": 1079796.0, + "step": 206, + "train/ce_loss": 1.1631953716278076 + }, + { + "epoch": 0.020367807000197746, + "step": 206, + "train/sim_loss": 0.19140625 + }, + { + "epoch": 0.020367807000197746, + "step": 206, + "train/total_loss": 0.30772578716278076 + }, + { + "entropy": 9.293176651000977, + "epoch": 0.02046667984971327, + "mean_token_accuracy": 0.6983333230018616, + "num_tokens": 1084873.0, + "step": 207, + "train/ce_loss": 1.2679121494293213 + }, + { + "epoch": 0.02046667984971327, + "step": 207, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.02046667984971327, + "step": 207, + "train/total_loss": 0.25179123878479004 + }, + { + "entropy": 8.642351150512695, + "epoch": 0.02056555269922879, + "mean_token_accuracy": 0.6852207183837891, + "num_tokens": 1090360.0, + "step": 208, + "train/ce_loss": 1.1404317617416382 + }, + { + "epoch": 0.02056555269922879, + "step": 208, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.02056555269922879, + "step": 208, + "train/total_loss": 0.24294942617416382 + }, + { + "entropy": 9.028209686279297, + "epoch": 0.020664425548744314, + "mean_token_accuracy": 0.7037037014961243, + "num_tokens": 1095739.0, + "step": 209, + "train/ce_loss": 0.8197504281997681 + }, + { + "epoch": 0.020664425548744314, + "step": 209, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.020664425548744314, + "step": 209, + "train/total_loss": 0.2069750428199768 + }, + { + "entropy": 9.236760139465332, + "epoch": 0.020763298398259838, + "mean_token_accuracy": 0.736672043800354, + "num_tokens": 1100775.0, + "step": 210, + "train/ce_loss": 1.0761586427688599 + }, + { + "epoch": 0.020763298398259838, + "step": 210, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.020763298398259838, + "step": 210, + "train/total_loss": 0.2365221083164215 + }, + { + "entropy": 9.078277587890625, + "epoch": 0.02086217124777536, + "mean_token_accuracy": 0.7010869383811951, + "num_tokens": 1105989.0, + "step": 211, + "train/ce_loss": 1.85743248462677 + }, + { + "epoch": 0.02086217124777536, + "step": 211, + "train/sim_loss": 0.15234375 + }, + { + "epoch": 0.02086217124777536, + "step": 211, + "train/total_loss": 0.3380870223045349 + }, + { + "entropy": 8.900236129760742, + "epoch": 0.020961044097290885, + "mean_token_accuracy": 0.7445997595787048, + "num_tokens": 1111249.0, + "step": 212, + "train/ce_loss": 0.9222058653831482 + }, + { + "epoch": 0.020961044097290885, + "step": 212, + "train/sim_loss": 0.1796875 + }, + { + "epoch": 0.020961044097290885, + "step": 212, + "train/total_loss": 0.27190810441970825 + }, + { + "entropy": 9.74795913696289, + "epoch": 0.021059916946806406, + "mean_token_accuracy": 0.730512261390686, + "num_tokens": 1116279.0, + "step": 213, + "train/ce_loss": 1.8484524488449097 + }, + { + "epoch": 0.021059916946806406, + "step": 213, + "train/sim_loss": 0.19140625 + }, + { + "epoch": 0.021059916946806406, + "step": 213, + "train/total_loss": 0.3762515187263489 + }, + { + "entropy": 9.32369613647461, + "epoch": 0.02115878979632193, + "mean_token_accuracy": 0.7382352948188782, + "num_tokens": 1121425.0, + "step": 214, + "train/ce_loss": 1.1132593154907227 + }, + { + "epoch": 0.02115878979632193, + "step": 214, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.02115878979632193, + "step": 214, + "train/total_loss": 0.2480446845293045 + }, + { + "entropy": 9.323495864868164, + "epoch": 0.021257662645837453, + "mean_token_accuracy": 0.7352024912834167, + "num_tokens": 1126563.0, + "step": 215, + "train/ce_loss": 0.9907536506652832 + }, + { + "epoch": 0.021257662645837453, + "step": 215, + "train/sim_loss": 0.1875 + }, + { + "epoch": 0.021257662645837453, + "step": 215, + "train/total_loss": 0.2865753769874573 + }, + { + "entropy": 8.703104972839355, + "epoch": 0.021356535495352977, + "mean_token_accuracy": 0.7093185186386108, + "num_tokens": 1131731.0, + "step": 216, + "train/ce_loss": 1.185186743736267 + }, + { + "epoch": 0.021356535495352977, + "step": 216, + "train/sim_loss": 0.21875 + }, + { + "epoch": 0.021356535495352977, + "step": 216, + "train/total_loss": 0.3372686803340912 + }, + { + "entropy": 8.79998779296875, + "epoch": 0.0214554083448685, + "mean_token_accuracy": 0.7827004194259644, + "num_tokens": 1137169.0, + "step": 217, + "train/ce_loss": 0.562030553817749 + }, + { + "epoch": 0.0214554083448685, + "step": 217, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.0214554083448685, + "step": 217, + "train/total_loss": 0.18901555240154266 + }, + { + "entropy": 9.627821922302246, + "epoch": 0.02155428119438402, + "mean_token_accuracy": 0.7794677019119263, + "num_tokens": 1142098.0, + "step": 218, + "train/ce_loss": 1.4625898599624634 + }, + { + "epoch": 0.02155428119438402, + "step": 218, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.02155428119438402, + "step": 218, + "train/total_loss": 0.21657149493694305 + }, + { + "entropy": 9.095985412597656, + "epoch": 0.021653154043899545, + "mean_token_accuracy": 0.6560587286949158, + "num_tokens": 1147354.0, + "step": 219, + "train/ce_loss": 1.4578096866607666 + }, + { + "epoch": 0.021653154043899545, + "step": 219, + "train/sim_loss": 0.1953125 + }, + { + "epoch": 0.021653154043899545, + "step": 219, + "train/total_loss": 0.3410934805870056 + }, + { + "epoch": 0.02175202689341507, + "grad_norm": 1.3434762954711914, + "learning_rate": 9.948326163279435e-06, + "loss": 0.2473, + "step": 220 + }, + { + "entropy": 9.08565616607666, + "epoch": 0.02175202689341507, + "mean_token_accuracy": 0.7113022208213806, + "num_tokens": 1152640.0, + "step": 220, + "train/ce_loss": 0.6921712160110474 + }, + { + "epoch": 0.02175202689341507, + "step": 220, + "train/sim_loss": 0.14453125 + }, + { + "epoch": 0.02175202689341507, + "step": 220, + "train/total_loss": 0.21374836564064026 + }, + { + "entropy": 10.296051979064941, + "epoch": 0.021850899742930592, + "mean_token_accuracy": 0.7894737124443054, + "num_tokens": 1157164.0, + "step": 221, + "train/ce_loss": 2.8208775520324707 + }, + { + "epoch": 0.021850899742930592, + "step": 221, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.021850899742930592, + "step": 221, + "train/total_loss": 0.4305252730846405 + }, + { + "entropy": 9.4681396484375, + "epoch": 0.021949772592446116, + "mean_token_accuracy": 0.6847290396690369, + "num_tokens": 1162232.0, + "step": 222, + "train/ce_loss": 1.3300937414169312 + }, + { + "epoch": 0.021949772592446116, + "step": 222, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.021949772592446116, + "step": 222, + "train/total_loss": 0.2580093741416931 + }, + { + "entropy": 8.884187698364258, + "epoch": 0.022048645441961636, + "mean_token_accuracy": 0.7592137455940247, + "num_tokens": 1167498.0, + "step": 223, + "train/ce_loss": 0.6539856791496277 + }, + { + "epoch": 0.022048645441961636, + "step": 223, + "train/sim_loss": 0.15625 + }, + { + "epoch": 0.022048645441961636, + "step": 223, + "train/total_loss": 0.22164857387542725 + }, + { + "entropy": 9.357826232910156, + "epoch": 0.02214751829147716, + "mean_token_accuracy": 0.7581395506858826, + "num_tokens": 1172577.0, + "step": 224, + "train/ce_loss": 0.06450015306472778 + }, + { + "epoch": 0.02214751829147716, + "step": 224, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.02214751829147716, + "step": 224, + "train/total_loss": 0.11973126232624054 + }, + { + "entropy": 8.823542594909668, + "epoch": 0.022246391140992684, + "mean_token_accuracy": 0.7916666865348816, + "num_tokens": 1177916.0, + "step": 225, + "train/ce_loss": 0.6573609113693237 + }, + { + "epoch": 0.022246391140992684, + "step": 225, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.022246391140992684, + "step": 225, + "train/total_loss": 0.1672985851764679 + }, + { + "entropy": 9.321389198303223, + "epoch": 0.022345263990508207, + "mean_token_accuracy": 0.7525179982185364, + "num_tokens": 1183041.0, + "step": 226, + "train/ce_loss": 0.8095218539237976 + }, + { + "epoch": 0.022345263990508207, + "step": 226, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.022345263990508207, + "step": 226, + "train/total_loss": 0.19032719731330872 + }, + { + "entropy": 8.588724136352539, + "epoch": 0.02244413684002373, + "mean_token_accuracy": 0.7323809266090393, + "num_tokens": 1188541.0, + "step": 227, + "train/ce_loss": 1.3050811290740967 + }, + { + "epoch": 0.02244413684002373, + "step": 227, + "train/sim_loss": 0.15234375 + }, + { + "epoch": 0.02244413684002373, + "step": 227, + "train/total_loss": 0.2828518748283386 + }, + { + "entropy": 9.121466636657715, + "epoch": 0.02254300968953925, + "mean_token_accuracy": 0.7220843434333801, + "num_tokens": 1193847.0, + "step": 228, + "train/ce_loss": 0.7918413281440735 + }, + { + "epoch": 0.02254300968953925, + "step": 228, + "train/sim_loss": 0.17578125 + }, + { + "epoch": 0.02254300968953925, + "step": 228, + "train/total_loss": 0.2549653947353363 + }, + { + "entropy": 9.202960968017578, + "epoch": 0.022641882539054775, + "mean_token_accuracy": 0.7212918400764465, + "num_tokens": 1199108.0, + "step": 229, + "train/ce_loss": 0.5830985307693481 + }, + { + "epoch": 0.022641882539054775, + "step": 229, + "train/sim_loss": 0.17578125 + }, + { + "epoch": 0.022641882539054775, + "step": 229, + "train/total_loss": 0.23409110307693481 + }, + { + "entropy": 9.505131721496582, + "epoch": 0.0227407553885703, + "mean_token_accuracy": 0.6988266110420227, + "num_tokens": 1204451.0, + "step": 230, + "train/ce_loss": 1.8573004007339478 + }, + { + "epoch": 0.0227407553885703, + "step": 230, + "train/sim_loss": 0.21484375 + }, + { + "epoch": 0.0227407553885703, + "step": 230, + "train/total_loss": 0.4005737900733948 + }, + { + "entropy": 8.627418518066406, + "epoch": 0.022839628238085823, + "mean_token_accuracy": 0.7078787684440613, + "num_tokens": 1209740.0, + "step": 231, + "train/ce_loss": 1.0552197694778442 + }, + { + "epoch": 0.022839628238085823, + "step": 231, + "train/sim_loss": 0.19921875 + }, + { + "epoch": 0.022839628238085823, + "step": 231, + "train/total_loss": 0.3047407269477844 + }, + { + "entropy": 9.916093826293945, + "epoch": 0.022938501087601346, + "mean_token_accuracy": 0.6979695558547974, + "num_tokens": 1214531.0, + "step": 232, + "train/ce_loss": 3.3532495498657227 + }, + { + "epoch": 0.022938501087601346, + "step": 232, + "train/sim_loss": 0.16015625 + }, + { + "epoch": 0.022938501087601346, + "step": 232, + "train/total_loss": 0.4954812228679657 + }, + { + "entropy": 8.920723915100098, + "epoch": 0.023037373937116867, + "mean_token_accuracy": 0.7028985619544983, + "num_tokens": 1219879.0, + "step": 233, + "train/ce_loss": 1.1744595766067505 + }, + { + "epoch": 0.023037373937116867, + "step": 233, + "train/sim_loss": 0.1953125 + }, + { + "epoch": 0.023037373937116867, + "step": 233, + "train/total_loss": 0.3127584457397461 + }, + { + "entropy": 8.755692481994629, + "epoch": 0.02313624678663239, + "mean_token_accuracy": 0.7393509149551392, + "num_tokens": 1225367.0, + "step": 234, + "train/ce_loss": 0.7512011528015137 + }, + { + "epoch": 0.02313624678663239, + "step": 234, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.02313624678663239, + "step": 234, + "train/total_loss": 0.15324512124061584 + }, + { + "entropy": 9.171567916870117, + "epoch": 0.023235119636147914, + "mean_token_accuracy": 0.7664429545402527, + "num_tokens": 1230564.0, + "step": 235, + "train/ce_loss": 1.2915208339691162 + }, + { + "epoch": 0.023235119636147914, + "step": 235, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.023235119636147914, + "step": 235, + "train/total_loss": 0.2424333393573761 + }, + { + "entropy": 9.673821449279785, + "epoch": 0.023333992485663438, + "mean_token_accuracy": 0.7728055119514465, + "num_tokens": 1235536.0, + "step": 236, + "train/ce_loss": 1.4527373313903809 + }, + { + "epoch": 0.023333992485663438, + "step": 236, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.023333992485663438, + "step": 236, + "train/total_loss": 0.21949248015880585 + }, + { + "entropy": 9.116235733032227, + "epoch": 0.023432865335178958, + "mean_token_accuracy": 0.7522816061973572, + "num_tokens": 1240792.0, + "step": 237, + "train/ce_loss": 0.7418687343597412 + }, + { + "epoch": 0.023432865335178958, + "step": 237, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.023432865335178958, + "step": 237, + "train/total_loss": 0.19918687641620636 + }, + { + "entropy": 9.20254898071289, + "epoch": 0.023531738184694482, + "mean_token_accuracy": 0.7182235717773438, + "num_tokens": 1245972.0, + "step": 238, + "train/ce_loss": 1.3997796773910522 + }, + { + "epoch": 0.023531738184694482, + "step": 238, + "train/sim_loss": 0.15625 + }, + { + "epoch": 0.023531738184694482, + "step": 238, + "train/total_loss": 0.29622799158096313 + }, + { + "entropy": 9.339031219482422, + "epoch": 0.023630611034210006, + "mean_token_accuracy": 0.7014741897583008, + "num_tokens": 1251225.0, + "step": 239, + "train/ce_loss": 1.2503479719161987 + }, + { + "epoch": 0.023630611034210006, + "step": 239, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.023630611034210006, + "step": 239, + "train/total_loss": 0.25003480911254883 + }, + { + "epoch": 0.02372948388372553, + "grad_norm": 1.3856909275054932, + "learning_rate": 9.943381298521487e-06, + "loss": 0.2277, + "step": 240 + }, + { + "entropy": 9.820657730102539, + "epoch": 0.02372948388372553, + "mean_token_accuracy": 0.7542856931686401, + "num_tokens": 1256159.0, + "step": 240, + "train/ce_loss": 0.5057271718978882 + }, + { + "epoch": 0.02372948388372553, + "step": 240, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.02372948388372553, + "step": 240, + "train/total_loss": 0.1599477231502533 + }, + { + "entropy": 9.042003631591797, + "epoch": 0.023828356733241053, + "mean_token_accuracy": 0.6929824352264404, + "num_tokens": 1261426.0, + "step": 241, + "train/ce_loss": 0.5546204447746277 + }, + { + "epoch": 0.023828356733241053, + "step": 241, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.023828356733241053, + "step": 241, + "train/total_loss": 0.172649547457695 + }, + { + "entropy": 9.184640884399414, + "epoch": 0.023927229582756573, + "mean_token_accuracy": 0.7013630867004395, + "num_tokens": 1266638.0, + "step": 242, + "train/ce_loss": 1.0061686038970947 + }, + { + "epoch": 0.023927229582756573, + "step": 242, + "train/sim_loss": 0.24609375 + }, + { + "epoch": 0.023927229582756573, + "step": 242, + "train/total_loss": 0.3467106223106384 + }, + { + "entropy": 8.989482879638672, + "epoch": 0.024026102432272097, + "mean_token_accuracy": 0.6966426968574524, + "num_tokens": 1271922.0, + "step": 243, + "train/ce_loss": 1.0107966661453247 + }, + { + "epoch": 0.024026102432272097, + "step": 243, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.024026102432272097, + "step": 243, + "train/total_loss": 0.24951717257499695 + }, + { + "entropy": 8.77995491027832, + "epoch": 0.02412497528178762, + "mean_token_accuracy": 0.7386723160743713, + "num_tokens": 1277360.0, + "step": 244, + "train/ce_loss": 1.490092158317566 + }, + { + "epoch": 0.02412497528178762, + "step": 244, + "train/sim_loss": 0.171875 + }, + { + "epoch": 0.02412497528178762, + "step": 244, + "train/total_loss": 0.32088422775268555 + }, + { + "entropy": 9.096035957336426, + "epoch": 0.024223848131303145, + "mean_token_accuracy": 0.703529417514801, + "num_tokens": 1282714.0, + "step": 245, + "train/ce_loss": 1.1417651176452637 + }, + { + "epoch": 0.024223848131303145, + "step": 245, + "train/sim_loss": 0.16015625 + }, + { + "epoch": 0.024223848131303145, + "step": 245, + "train/total_loss": 0.27433276176452637 + }, + { + "entropy": 9.295907974243164, + "epoch": 0.02432272098081867, + "mean_token_accuracy": 0.7151702642440796, + "num_tokens": 1287806.0, + "step": 246, + "train/ce_loss": 1.053331732749939 + }, + { + "epoch": 0.02432272098081867, + "step": 246, + "train/sim_loss": 0.15234375 + }, + { + "epoch": 0.02432272098081867, + "step": 246, + "train/total_loss": 0.2576769292354584 + }, + { + "entropy": 8.644203186035156, + "epoch": 0.02442159383033419, + "mean_token_accuracy": 0.7180910110473633, + "num_tokens": 1293268.0, + "step": 247, + "train/ce_loss": 1.44148588180542 + }, + { + "epoch": 0.02442159383033419, + "step": 247, + "train/sim_loss": 0.171875 + }, + { + "epoch": 0.02442159383033419, + "step": 247, + "train/total_loss": 0.316023588180542 + }, + { + "entropy": 9.263618469238281, + "epoch": 0.024520466679849712, + "mean_token_accuracy": 0.7442159652709961, + "num_tokens": 1298474.0, + "step": 248, + "train/ce_loss": 0.9490503072738647 + }, + { + "epoch": 0.024520466679849712, + "step": 248, + "train/sim_loss": 0.1640625 + }, + { + "epoch": 0.024520466679849712, + "step": 248, + "train/total_loss": 0.2589675188064575 + }, + { + "entropy": 9.133621215820312, + "epoch": 0.024619339529365236, + "mean_token_accuracy": 0.6857825517654419, + "num_tokens": 1303710.0, + "step": 249, + "train/ce_loss": 0.9590094685554504 + }, + { + "epoch": 0.024619339529365236, + "step": 249, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.024619339529365236, + "step": 249, + "train/total_loss": 0.24433845281600952 + }, + { + "entropy": 9.832183837890625, + "epoch": 0.02471821237888076, + "mean_token_accuracy": 0.7421875, + "num_tokens": 1308639.0, + "step": 250, + "train/ce_loss": 1.9395623207092285 + }, + { + "epoch": 0.02471821237888076, + "step": 250, + "train/sim_loss": 0.14453125 + }, + { + "epoch": 0.02471821237888076, + "step": 250, + "train/total_loss": 0.33848750591278076 + }, + { + "entropy": 9.327045440673828, + "epoch": 0.024817085228396284, + "mean_token_accuracy": 0.7238605618476868, + "num_tokens": 1313843.0, + "step": 251, + "train/ce_loss": 1.5444014072418213 + }, + { + "epoch": 0.024817085228396284, + "step": 251, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.024817085228396284, + "step": 251, + "train/total_loss": 0.27553391456604004 + }, + { + "entropy": 8.991327285766602, + "epoch": 0.024915958077911804, + "mean_token_accuracy": 0.7205542922019958, + "num_tokens": 1319197.0, + "step": 252, + "train/ce_loss": 1.2354319095611572 + }, + { + "epoch": 0.024915958077911804, + "step": 252, + "train/sim_loss": 0.1640625 + }, + { + "epoch": 0.024915958077911804, + "step": 252, + "train/total_loss": 0.2876057028770447 + }, + { + "entropy": 8.753467559814453, + "epoch": 0.025014830927427328, + "mean_token_accuracy": 0.7426470518112183, + "num_tokens": 1324679.0, + "step": 253, + "train/ce_loss": 0.651845395565033 + }, + { + "epoch": 0.025014830927427328, + "step": 253, + "train/sim_loss": 0.16796875 + }, + { + "epoch": 0.025014830927427328, + "step": 253, + "train/total_loss": 0.23315328359603882 + }, + { + "entropy": 9.486014366149902, + "epoch": 0.02511370377694285, + "mean_token_accuracy": 0.6701337099075317, + "num_tokens": 1329835.0, + "step": 254, + "train/ce_loss": 1.8440042734146118 + }, + { + "epoch": 0.02511370377694285, + "step": 254, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.02511370377694285, + "step": 254, + "train/total_loss": 0.33283793926239014 + }, + { + "entropy": 9.138690948486328, + "epoch": 0.025212576626458375, + "mean_token_accuracy": 0.7478787899017334, + "num_tokens": 1335118.0, + "step": 255, + "train/ce_loss": 0.7411956787109375 + }, + { + "epoch": 0.025212576626458375, + "step": 255, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.025212576626458375, + "step": 255, + "train/total_loss": 0.20693206787109375 + }, + { + "entropy": 9.330408096313477, + "epoch": 0.0253114494759739, + "mean_token_accuracy": 0.7630208134651184, + "num_tokens": 1340380.0, + "step": 256, + "train/ce_loss": 0.6438601613044739 + }, + { + "epoch": 0.0253114494759739, + "step": 256, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.0253114494759739, + "step": 256, + "train/total_loss": 0.1854797601699829 + }, + { + "entropy": 9.453132629394531, + "epoch": 0.02541032232548942, + "mean_token_accuracy": 0.6998368501663208, + "num_tokens": 1345474.0, + "step": 257, + "train/ce_loss": 0.07052500545978546 + }, + { + "epoch": 0.02541032232548942, + "step": 257, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.02541032232548942, + "step": 257, + "train/total_loss": 0.11642750352621078 + }, + { + "entropy": 8.803450584411621, + "epoch": 0.025509195175004943, + "mean_token_accuracy": 0.697926938533783, + "num_tokens": 1350942.0, + "step": 258, + "train/ce_loss": 0.8056007027626038 + }, + { + "epoch": 0.025509195175004943, + "step": 258, + "train/sim_loss": 0.2421875 + }, + { + "epoch": 0.025509195175004943, + "step": 258, + "train/total_loss": 0.3227475881576538 + }, + { + "entropy": 8.871196746826172, + "epoch": 0.025608068024520467, + "mean_token_accuracy": 0.7041800618171692, + "num_tokens": 1356393.0, + "step": 259, + "train/ce_loss": 0.6629928350448608 + }, + { + "epoch": 0.025608068024520467, + "step": 259, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.025608068024520467, + "step": 259, + "train/total_loss": 0.21473678946495056 + }, + { + "epoch": 0.02570694087403599, + "grad_norm": 1.321649193763733, + "learning_rate": 9.938436433763537e-06, + "loss": 0.2425, + "step": 260 + }, + { + "entropy": 9.480998992919922, + "epoch": 0.02570694087403599, + "mean_token_accuracy": 0.7395994067192078, + "num_tokens": 1361529.0, + "step": 260, + "train/ce_loss": 1.1267880201339722 + }, + { + "epoch": 0.02570694087403599, + "step": 260, + "train/sim_loss": 0.14453125 + }, + { + "epoch": 0.02570694087403599, + "step": 260, + "train/total_loss": 0.25721004605293274 + }, + { + "entropy": 9.206134796142578, + "epoch": 0.025805813723551514, + "mean_token_accuracy": 0.7177985906600952, + "num_tokens": 1366857.0, + "step": 261, + "train/ce_loss": 0.786657989025116 + }, + { + "epoch": 0.025805813723551514, + "step": 261, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.025805813723551514, + "step": 261, + "train/total_loss": 0.18804079294204712 + }, + { + "entropy": 8.90103816986084, + "epoch": 0.025904686573067034, + "mean_token_accuracy": 0.6851851940155029, + "num_tokens": 1372246.0, + "step": 262, + "train/ce_loss": 1.7658196687698364 + }, + { + "epoch": 0.025904686573067034, + "step": 262, + "train/sim_loss": 0.1875 + }, + { + "epoch": 0.025904686573067034, + "step": 262, + "train/total_loss": 0.3640819787979126 + }, + { + "entropy": 9.348938941955566, + "epoch": 0.026003559422582558, + "mean_token_accuracy": 0.6954612135887146, + "num_tokens": 1377399.0, + "step": 263, + "train/ce_loss": 1.1223899126052856 + }, + { + "epoch": 0.026003559422582558, + "step": 263, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.026003559422582558, + "step": 263, + "train/total_loss": 0.2528640031814575 + }, + { + "entropy": 9.85448169708252, + "epoch": 0.026102432272098082, + "mean_token_accuracy": 0.6844106316566467, + "num_tokens": 1382339.0, + "step": 264, + "train/ce_loss": 2.0575814247131348 + }, + { + "epoch": 0.026102432272098082, + "step": 264, + "train/sim_loss": 0.21875 + }, + { + "epoch": 0.026102432272098082, + "step": 264, + "train/total_loss": 0.42450815439224243 + }, + { + "entropy": 10.01374340057373, + "epoch": 0.026201305121613606, + "mean_token_accuracy": 0.7354369163513184, + "num_tokens": 1387182.0, + "step": 265, + "train/ce_loss": 2.1004700660705566 + }, + { + "epoch": 0.026201305121613606, + "step": 265, + "train/sim_loss": 0.1640625 + }, + { + "epoch": 0.026201305121613606, + "step": 265, + "train/total_loss": 0.37410950660705566 + }, + { + "entropy": 9.003646850585938, + "epoch": 0.02630017797112913, + "mean_token_accuracy": 0.6959183812141418, + "num_tokens": 1392641.0, + "step": 266, + "train/ce_loss": 0.881372332572937 + }, + { + "epoch": 0.02630017797112913, + "step": 266, + "train/sim_loss": 0.21875 + }, + { + "epoch": 0.02630017797112913, + "step": 266, + "train/total_loss": 0.3068872392177582 + }, + { + "entropy": 9.18246841430664, + "epoch": 0.02639905082064465, + "mean_token_accuracy": 0.7476525902748108, + "num_tokens": 1397972.0, + "step": 267, + "train/ce_loss": 0.6705335378646851 + }, + { + "epoch": 0.02639905082064465, + "step": 267, + "train/sim_loss": 0.16796875 + }, + { + "epoch": 0.02639905082064465, + "step": 267, + "train/total_loss": 0.23502209782600403 + }, + { + "entropy": 8.929038047790527, + "epoch": 0.026497923670160173, + "mean_token_accuracy": 0.7204058766365051, + "num_tokens": 1403304.0, + "step": 268, + "train/ce_loss": 0.5589403510093689 + }, + { + "epoch": 0.026497923670160173, + "step": 268, + "train/sim_loss": 0.14453125 + }, + { + "epoch": 0.026497923670160173, + "step": 268, + "train/total_loss": 0.20042528212070465 + }, + { + "entropy": 9.119290351867676, + "epoch": 0.026596796519675697, + "mean_token_accuracy": 0.7673377990722656, + "num_tokens": 1408660.0, + "step": 269, + "train/ce_loss": 0.984940767288208 + }, + { + "epoch": 0.026596796519675697, + "step": 269, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.026596796519675697, + "step": 269, + "train/total_loss": 0.22740033268928528 + }, + { + "entropy": 9.511564254760742, + "epoch": 0.02669566936919122, + "mean_token_accuracy": 0.6966966986656189, + "num_tokens": 1413763.0, + "step": 270, + "train/ce_loss": 1.857006311416626 + }, + { + "epoch": 0.02669566936919122, + "step": 270, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.02669566936919122, + "step": 270, + "train/total_loss": 0.3263256549835205 + }, + { + "entropy": 8.517393112182617, + "epoch": 0.026794542218706745, + "mean_token_accuracy": 0.7054985761642456, + "num_tokens": 1419345.0, + "step": 271, + "train/ce_loss": 1.1406935453414917 + }, + { + "epoch": 0.026794542218706745, + "step": 271, + "train/sim_loss": 0.15234375 + }, + { + "epoch": 0.026794542218706745, + "step": 271, + "train/total_loss": 0.2664130926132202 + }, + { + "entropy": 8.833395004272461, + "epoch": 0.026893415068222265, + "mean_token_accuracy": 0.7211934328079224, + "num_tokens": 1424814.0, + "step": 272, + "train/ce_loss": 1.1049021482467651 + }, + { + "epoch": 0.026893415068222265, + "step": 272, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.026893415068222265, + "step": 272, + "train/total_loss": 0.25892770290374756 + }, + { + "entropy": 9.114679336547852, + "epoch": 0.02699228791773779, + "mean_token_accuracy": 0.7144607901573181, + "num_tokens": 1430064.0, + "step": 273, + "train/ce_loss": 0.4850656986236572 + }, + { + "epoch": 0.02699228791773779, + "step": 273, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.02699228791773779, + "step": 273, + "train/total_loss": 0.14225657284259796 + }, + { + "entropy": 9.55802059173584, + "epoch": 0.027091160767253313, + "mean_token_accuracy": 0.7269790172576904, + "num_tokens": 1435123.0, + "step": 274, + "train/ce_loss": 1.139512062072754 + }, + { + "epoch": 0.027091160767253313, + "step": 274, + "train/sim_loss": 0.19140625 + }, + { + "epoch": 0.027091160767253313, + "step": 274, + "train/total_loss": 0.3053574562072754 + }, + { + "entropy": 8.908613204956055, + "epoch": 0.027190033616768836, + "mean_token_accuracy": 0.6693735718727112, + "num_tokens": 1440477.0, + "step": 275, + "train/ce_loss": 1.4616297483444214 + }, + { + "epoch": 0.027190033616768836, + "step": 275, + "train/sim_loss": 0.16015625 + }, + { + "epoch": 0.027190033616768836, + "step": 275, + "train/total_loss": 0.3063192367553711 + }, + { + "entropy": 9.507146835327148, + "epoch": 0.02728890646628436, + "mean_token_accuracy": 0.7745571732521057, + "num_tokens": 1445599.0, + "step": 276, + "train/ce_loss": 0.06952886283397675 + }, + { + "epoch": 0.02728890646628436, + "step": 276, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.02728890646628436, + "step": 276, + "train/total_loss": 0.12414038926362991 + }, + { + "entropy": 10.23452377319336, + "epoch": 0.02738777931579988, + "mean_token_accuracy": 0.7180156707763672, + "num_tokens": 1450376.0, + "step": 277, + "train/ce_loss": 1.0686619281768799 + }, + { + "epoch": 0.02738777931579988, + "step": 277, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.02738777931579988, + "step": 277, + "train/total_loss": 0.20842869579792023 + }, + { + "entropy": 8.921943664550781, + "epoch": 0.027486652165315404, + "mean_token_accuracy": 0.7830578684806824, + "num_tokens": 1455813.0, + "step": 278, + "train/ce_loss": 0.6289975047111511 + }, + { + "epoch": 0.027486652165315404, + "step": 278, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.027486652165315404, + "step": 278, + "train/total_loss": 0.15664975345134735 + }, + { + "entropy": 9.548613548278809, + "epoch": 0.027585525014830928, + "mean_token_accuracy": 0.6795827150344849, + "num_tokens": 1460924.0, + "step": 279, + "train/ce_loss": 1.3225817680358887 + }, + { + "epoch": 0.027585525014830928, + "step": 279, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.027585525014830928, + "step": 279, + "train/total_loss": 0.28069567680358887 + }, + { + "epoch": 0.02768439786434645, + "grad_norm": 2.2994251251220703, + "learning_rate": 9.93349156900559e-06, + "loss": 0.2291, + "step": 280 + }, + { + "entropy": 8.961726188659668, + "epoch": 0.02768439786434645, + "mean_token_accuracy": 0.7474972009658813, + "num_tokens": 1466310.0, + "step": 280, + "train/ce_loss": 1.1064954996109009 + }, + { + "epoch": 0.02768439786434645, + "step": 280, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.02768439786434645, + "step": 280, + "train/total_loss": 0.24736830592155457 + }, + { + "entropy": 9.037384033203125, + "epoch": 0.027783270713861972, + "mean_token_accuracy": 0.7144444584846497, + "num_tokens": 1471685.0, + "step": 281, + "train/ce_loss": 0.73092120885849 + }, + { + "epoch": 0.027783270713861972, + "step": 281, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.027783270713861972, + "step": 281, + "train/total_loss": 0.19809213280677795 + }, + { + "entropy": 9.636255264282227, + "epoch": 0.027882143563377496, + "mean_token_accuracy": 0.7265238761901855, + "num_tokens": 1476760.0, + "step": 282, + "train/ce_loss": 1.2324522733688354 + }, + { + "epoch": 0.027882143563377496, + "step": 282, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.027882143563377496, + "step": 282, + "train/total_loss": 0.2521514892578125 + }, + { + "entropy": 9.781538009643555, + "epoch": 0.02798101641289302, + "mean_token_accuracy": 0.7630161643028259, + "num_tokens": 1481688.0, + "step": 283, + "train/ce_loss": 0.08090229332447052 + }, + { + "epoch": 0.02798101641289302, + "step": 283, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.02798101641289302, + "step": 283, + "train/total_loss": 0.11355897784233093 + }, + { + "entropy": 9.071699142456055, + "epoch": 0.028079889262408543, + "mean_token_accuracy": 0.7766439914703369, + "num_tokens": 1487047.0, + "step": 284, + "train/ce_loss": 0.8658326864242554 + }, + { + "epoch": 0.028079889262408543, + "step": 284, + "train/sim_loss": 0.1640625 + }, + { + "epoch": 0.028079889262408543, + "step": 284, + "train/total_loss": 0.2506457567214966 + }, + { + "entropy": 9.438531875610352, + "epoch": 0.028178762111924067, + "mean_token_accuracy": 0.7694753408432007, + "num_tokens": 1492121.0, + "step": 285, + "train/ce_loss": 1.3623826503753662 + }, + { + "epoch": 0.028178762111924067, + "step": 285, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.028178762111924067, + "step": 285, + "train/total_loss": 0.23780076205730438 + }, + { + "entropy": 9.435054779052734, + "epoch": 0.028277634961439587, + "mean_token_accuracy": 0.6922094225883484, + "num_tokens": 1497370.0, + "step": 286, + "train/ce_loss": 1.057147741317749 + }, + { + "epoch": 0.028277634961439587, + "step": 286, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.028277634961439587, + "step": 286, + "train/total_loss": 0.24243351817131042 + }, + { + "entropy": 9.316326141357422, + "epoch": 0.02837650781095511, + "mean_token_accuracy": 0.7567954063415527, + "num_tokens": 1502553.0, + "step": 287, + "train/ce_loss": 1.2863101959228516 + }, + { + "epoch": 0.02837650781095511, + "step": 287, + "train/sim_loss": 0.1875 + }, + { + "epoch": 0.02837650781095511, + "step": 287, + "train/total_loss": 0.31613102555274963 + }, + { + "entropy": 10.227319717407227, + "epoch": 0.028475380660470635, + "mean_token_accuracy": 0.64402174949646, + "num_tokens": 1507305.0, + "step": 288, + "train/ce_loss": 0.12142963707447052 + }, + { + "epoch": 0.028475380660470635, + "step": 288, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.028475380660470635, + "step": 288, + "train/total_loss": 0.09026796370744705 + }, + { + "entropy": 9.190692901611328, + "epoch": 0.02857425350998616, + "mean_token_accuracy": 0.7074910998344421, + "num_tokens": 1512615.0, + "step": 289, + "train/ce_loss": 1.271957278251648 + }, + { + "epoch": 0.02857425350998616, + "step": 289, + "train/sim_loss": 0.16796875 + }, + { + "epoch": 0.02857425350998616, + "step": 289, + "train/total_loss": 0.29516446590423584 + }, + { + "entropy": 9.307730674743652, + "epoch": 0.028673126359501682, + "mean_token_accuracy": 0.7369077205657959, + "num_tokens": 1517908.0, + "step": 290, + "train/ce_loss": 0.9792628288269043 + }, + { + "epoch": 0.028673126359501682, + "step": 290, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.028673126359501682, + "step": 290, + "train/total_loss": 0.1838637888431549 + }, + { + "entropy": 9.437780380249023, + "epoch": 0.028771999209017202, + "mean_token_accuracy": 0.7896341681480408, + "num_tokens": 1523017.0, + "step": 291, + "train/ce_loss": 0.0677185133099556 + }, + { + "epoch": 0.028771999209017202, + "step": 291, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.028771999209017202, + "step": 291, + "train/total_loss": 0.05364685133099556 + }, + { + "entropy": 9.36805534362793, + "epoch": 0.028870872058532726, + "mean_token_accuracy": 0.7188329100608826, + "num_tokens": 1528212.0, + "step": 292, + "train/ce_loss": 1.1883916854858398 + }, + { + "epoch": 0.028870872058532726, + "step": 292, + "train/sim_loss": 0.171875 + }, + { + "epoch": 0.028870872058532726, + "step": 292, + "train/total_loss": 0.29071417450904846 + }, + { + "entropy": 9.275108337402344, + "epoch": 0.02896974490804825, + "mean_token_accuracy": 0.7531328201293945, + "num_tokens": 1533509.0, + "step": 293, + "train/ce_loss": 0.7196071743965149 + }, + { + "epoch": 0.02896974490804825, + "step": 293, + "train/sim_loss": 0.16796875 + }, + { + "epoch": 0.02896974490804825, + "step": 293, + "train/total_loss": 0.2399294674396515 + }, + { + "entropy": 9.360564231872559, + "epoch": 0.029068617757563774, + "mean_token_accuracy": 0.7331606149673462, + "num_tokens": 1538701.0, + "step": 294, + "train/ce_loss": 1.553326964378357 + }, + { + "epoch": 0.029068617757563774, + "step": 294, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.029068617757563774, + "step": 294, + "train/total_loss": 0.25689518451690674 + }, + { + "entropy": 9.396177291870117, + "epoch": 0.029167490607079297, + "mean_token_accuracy": 0.7080581188201904, + "num_tokens": 1543900.0, + "step": 295, + "train/ce_loss": 1.2857218980789185 + }, + { + "epoch": 0.029167490607079297, + "step": 295, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.029167490607079297, + "step": 295, + "train/total_loss": 0.23794719576835632 + }, + { + "entropy": 9.138364791870117, + "epoch": 0.029266363456594818, + "mean_token_accuracy": 0.7395397424697876, + "num_tokens": 1549280.0, + "step": 296, + "train/ce_loss": 0.8193778991699219 + }, + { + "epoch": 0.029266363456594818, + "step": 296, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.029266363456594818, + "step": 296, + "train/total_loss": 0.1874065399169922 + }, + { + "entropy": 9.80689811706543, + "epoch": 0.02936523630611034, + "mean_token_accuracy": 0.7438016533851624, + "num_tokens": 1554295.0, + "step": 297, + "train/ce_loss": 0.5103650689125061 + }, + { + "epoch": 0.02936523630611034, + "step": 297, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.02936523630611034, + "step": 297, + "train/total_loss": 0.1330677568912506 + }, + { + "entropy": 10.591114044189453, + "epoch": 0.029464109155625865, + "mean_token_accuracy": 0.6646341681480408, + "num_tokens": 1558872.0, + "step": 298, + "train/ce_loss": 0.27430862188339233 + }, + { + "epoch": 0.029464109155625865, + "step": 298, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.029464109155625865, + "step": 298, + "train/total_loss": 0.16805586218833923 + }, + { + "entropy": 9.821271896362305, + "epoch": 0.02956298200514139, + "mean_token_accuracy": 0.7472324967384338, + "num_tokens": 1563847.0, + "step": 299, + "train/ce_loss": 1.1159749031066895 + }, + { + "epoch": 0.02956298200514139, + "step": 299, + "train/sim_loss": 0.15234375 + }, + { + "epoch": 0.02956298200514139, + "step": 299, + "train/total_loss": 0.26394122838974 + }, + { + "epoch": 0.029661854854656913, + "grad_norm": 1.9446110725402832, + "learning_rate": 9.928546704247638e-06, + "loss": 0.2196, + "step": 300 + }, + { + "entropy": 9.165465354919434, + "epoch": 0.029661854854656913, + "mean_token_accuracy": 0.7384615540504456, + "num_tokens": 1569256.0, + "step": 300, + "train/ce_loss": 0.6590936779975891 + }, + { + "epoch": 0.029661854854656913, + "step": 300, + "train/sim_loss": 0.15234375 + }, + { + "epoch": 0.029661854854656913, + "step": 300, + "train/total_loss": 0.21825312077999115 + }, + { + "entropy": 9.37040901184082, + "epoch": 0.029760727704172433, + "mean_token_accuracy": 0.6537467837333679, + "num_tokens": 1574467.0, + "step": 301, + "train/ce_loss": 0.953637421131134 + }, + { + "epoch": 0.029760727704172433, + "step": 301, + "train/sim_loss": 0.171875 + }, + { + "epoch": 0.029760727704172433, + "step": 301, + "train/total_loss": 0.2672387361526489 + }, + { + "entropy": 9.353124618530273, + "epoch": 0.029859600553687957, + "mean_token_accuracy": 0.7333333492279053, + "num_tokens": 1579772.0, + "step": 302, + "train/ce_loss": 1.0786752700805664 + }, + { + "epoch": 0.029859600553687957, + "step": 302, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.029859600553687957, + "step": 302, + "train/total_loss": 0.2484925389289856 + }, + { + "entropy": 9.137678146362305, + "epoch": 0.02995847340320348, + "mean_token_accuracy": 0.7180232405662537, + "num_tokens": 1585265.0, + "step": 303, + "train/ce_loss": 0.6878090500831604 + }, + { + "epoch": 0.02995847340320348, + "step": 303, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.02995847340320348, + "step": 303, + "train/total_loss": 0.18206214904785156 + }, + { + "entropy": 9.260383605957031, + "epoch": 0.030057346252719004, + "mean_token_accuracy": 0.6810966730117798, + "num_tokens": 1590428.0, + "step": 304, + "train/ce_loss": 0.5839409828186035 + }, + { + "epoch": 0.030057346252719004, + "step": 304, + "train/sim_loss": 0.16015625 + }, + { + "epoch": 0.030057346252719004, + "step": 304, + "train/total_loss": 0.21855035424232483 + }, + { + "entropy": 9.679158210754395, + "epoch": 0.030156219102234528, + "mean_token_accuracy": 0.6967340707778931, + "num_tokens": 1595508.0, + "step": 305, + "train/ce_loss": 1.1889206171035767 + }, + { + "epoch": 0.030156219102234528, + "step": 305, + "train/sim_loss": 0.1640625 + }, + { + "epoch": 0.030156219102234528, + "step": 305, + "train/total_loss": 0.2829545736312866 + }, + { + "entropy": 9.837874412536621, + "epoch": 0.030255091951750048, + "mean_token_accuracy": 0.7153153419494629, + "num_tokens": 1600467.0, + "step": 306, + "train/ce_loss": 1.511080026626587 + }, + { + "epoch": 0.030255091951750048, + "step": 306, + "train/sim_loss": 0.14453125 + }, + { + "epoch": 0.030255091951750048, + "step": 306, + "train/total_loss": 0.2956392765045166 + }, + { + "entropy": 8.882853507995605, + "epoch": 0.030353964801265572, + "mean_token_accuracy": 0.7215346693992615, + "num_tokens": 1605747.0, + "step": 307, + "train/ce_loss": 0.7038434743881226 + }, + { + "epoch": 0.030353964801265572, + "step": 307, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.030353964801265572, + "step": 307, + "train/total_loss": 0.13679060339927673 + }, + { + "entropy": 9.678632736206055, + "epoch": 0.030452837650781096, + "mean_token_accuracy": 0.7492163181304932, + "num_tokens": 1610875.0, + "step": 308, + "train/ce_loss": 1.0676531791687012 + }, + { + "epoch": 0.030452837650781096, + "step": 308, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.030452837650781096, + "step": 308, + "train/total_loss": 0.23957782983779907 + }, + { + "entropy": 9.990983963012695, + "epoch": 0.03055171050029662, + "mean_token_accuracy": 0.7561521530151367, + "num_tokens": 1615768.0, + "step": 309, + "train/ce_loss": 1.7758872509002686 + }, + { + "epoch": 0.03055171050029662, + "step": 309, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.03055171050029662, + "step": 309, + "train/total_loss": 0.30649498105049133 + }, + { + "entropy": 9.119295120239258, + "epoch": 0.030650583349812143, + "mean_token_accuracy": 0.7405515909194946, + "num_tokens": 1621219.0, + "step": 310, + "train/ce_loss": 1.228122591972351 + }, + { + "epoch": 0.030650583349812143, + "step": 310, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.030650583349812143, + "step": 310, + "train/total_loss": 0.23999977111816406 + }, + { + "entropy": 9.466287612915039, + "epoch": 0.030749456199327663, + "mean_token_accuracy": 0.7510431408882141, + "num_tokens": 1626430.0, + "step": 311, + "train/ce_loss": 1.2873427867889404 + }, + { + "epoch": 0.030749456199327663, + "step": 311, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.030749456199327663, + "step": 311, + "train/total_loss": 0.277171790599823 + }, + { + "entropy": 9.017744064331055, + "epoch": 0.030848329048843187, + "mean_token_accuracy": 0.7283422350883484, + "num_tokens": 1631878.0, + "step": 312, + "train/ce_loss": 0.7938199639320374 + }, + { + "epoch": 0.030848329048843187, + "step": 312, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.030848329048843187, + "step": 312, + "train/total_loss": 0.2004757523536682 + }, + { + "entropy": 10.244565963745117, + "epoch": 0.03094720189835871, + "mean_token_accuracy": 0.7329843044281006, + "num_tokens": 1636689.0, + "step": 313, + "train/ce_loss": 0.11315623670816422 + }, + { + "epoch": 0.03094720189835871, + "step": 313, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.03094720189835871, + "step": 313, + "train/total_loss": 0.13631562888622284 + }, + { + "entropy": 9.71835708618164, + "epoch": 0.031046074747874235, + "mean_token_accuracy": 0.655986487865448, + "num_tokens": 1641719.0, + "step": 314, + "train/ce_loss": 1.4221258163452148 + }, + { + "epoch": 0.031046074747874235, + "step": 314, + "train/sim_loss": 0.15234375 + }, + { + "epoch": 0.031046074747874235, + "step": 314, + "train/total_loss": 0.29455631971359253 + }, + { + "entropy": 9.370756149291992, + "epoch": 0.03114494759738976, + "mean_token_accuracy": 0.6858572959899902, + "num_tokens": 1647010.0, + "step": 315, + "train/ce_loss": 1.1413757801055908 + }, + { + "epoch": 0.03114494759738976, + "step": 315, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.03114494759738976, + "step": 315, + "train/total_loss": 0.19616883993148804 + }, + { + "entropy": 9.630697250366211, + "epoch": 0.03124382044690528, + "mean_token_accuracy": 0.7165932655334473, + "num_tokens": 1652125.0, + "step": 316, + "train/ce_loss": 1.0022145509719849 + }, + { + "epoch": 0.03124382044690528, + "step": 316, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.03124382044690528, + "step": 316, + "train/total_loss": 0.2369402050971985 + }, + { + "entropy": 9.33189868927002, + "epoch": 0.0313426932964208, + "mean_token_accuracy": 0.687915027141571, + "num_tokens": 1657349.0, + "step": 317, + "train/ce_loss": 0.7280907034873962 + }, + { + "epoch": 0.0313426932964208, + "step": 317, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.0313426932964208, + "step": 317, + "train/total_loss": 0.17046532034873962 + }, + { + "entropy": 9.760059356689453, + "epoch": 0.03144156614593632, + "mean_token_accuracy": 0.734446108341217, + "num_tokens": 1662421.0, + "step": 318, + "train/ce_loss": 1.6615796089172363 + }, + { + "epoch": 0.03144156614593632, + "step": 318, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.03144156614593632, + "step": 318, + "train/total_loss": 0.29115796089172363 + }, + { + "entropy": 9.35573673248291, + "epoch": 0.03154043899545185, + "mean_token_accuracy": 0.7798036336898804, + "num_tokens": 1667672.0, + "step": 319, + "train/ce_loss": 0.9388341903686523 + }, + { + "epoch": 0.03154043899545185, + "step": 319, + "train/sim_loss": 0.1875 + }, + { + "epoch": 0.03154043899545185, + "step": 319, + "train/total_loss": 0.2813834249973297 + }, + { + "epoch": 0.03163931184496737, + "grad_norm": 1.9336750507354736, + "learning_rate": 9.92360183948969e-06, + "loss": 0.23, + "step": 320 + }, + { + "entropy": 8.928403854370117, + "epoch": 0.03163931184496737, + "mean_token_accuracy": 0.7151514887809753, + "num_tokens": 1673111.0, + "step": 320, + "train/ce_loss": 1.0507612228393555 + }, + { + "epoch": 0.03163931184496737, + "step": 320, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.03163931184496737, + "step": 320, + "train/total_loss": 0.2222636342048645 + }, + { + "entropy": 9.394420623779297, + "epoch": 0.0317381846944829, + "mean_token_accuracy": 0.7555555701255798, + "num_tokens": 1678316.0, + "step": 321, + "train/ce_loss": 0.5773379802703857 + }, + { + "epoch": 0.0317381846944829, + "step": 321, + "train/sim_loss": 0.15625 + }, + { + "epoch": 0.0317381846944829, + "step": 321, + "train/total_loss": 0.21398380398750305 + }, + { + "entropy": 9.462547302246094, + "epoch": 0.03183705754399842, + "mean_token_accuracy": 0.7281420826911926, + "num_tokens": 1683502.0, + "step": 322, + "train/ce_loss": 0.8007969260215759 + }, + { + "epoch": 0.03183705754399842, + "step": 322, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.03183705754399842, + "step": 322, + "train/total_loss": 0.16601720452308655 + }, + { + "entropy": 9.313180923461914, + "epoch": 0.03193593039351394, + "mean_token_accuracy": 0.6699629426002502, + "num_tokens": 1688804.0, + "step": 323, + "train/ce_loss": 1.2433942556381226 + }, + { + "epoch": 0.03193593039351394, + "step": 323, + "train/sim_loss": 0.17578125 + }, + { + "epoch": 0.03193593039351394, + "step": 323, + "train/total_loss": 0.30012068152427673 + }, + { + "entropy": 8.810771942138672, + "epoch": 0.032034803243029465, + "mean_token_accuracy": 0.6816443800926208, + "num_tokens": 1694314.0, + "step": 324, + "train/ce_loss": 1.380662441253662 + }, + { + "epoch": 0.032034803243029465, + "step": 324, + "train/sim_loss": 0.1953125 + }, + { + "epoch": 0.032034803243029465, + "step": 324, + "train/total_loss": 0.33337873220443726 + }, + { + "entropy": 9.738914489746094, + "epoch": 0.032133676092544985, + "mean_token_accuracy": 0.7730061411857605, + "num_tokens": 1699329.0, + "step": 325, + "train/ce_loss": 0.8244234919548035 + }, + { + "epoch": 0.032133676092544985, + "step": 325, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.032133676092544985, + "step": 325, + "train/total_loss": 0.14884859323501587 + }, + { + "entropy": 9.07105827331543, + "epoch": 0.03223254894206051, + "mean_token_accuracy": 0.6907756924629211, + "num_tokens": 1704797.0, + "step": 326, + "train/ce_loss": 0.7959145307540894 + }, + { + "epoch": 0.03223254894206051, + "step": 326, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.03223254894206051, + "step": 326, + "train/total_loss": 0.18506020307540894 + }, + { + "entropy": 9.063455581665039, + "epoch": 0.03233142179157603, + "mean_token_accuracy": 0.6761487722396851, + "num_tokens": 1710183.0, + "step": 327, + "train/ce_loss": 1.2873786687850952 + }, + { + "epoch": 0.03233142179157603, + "step": 327, + "train/sim_loss": 0.15625 + }, + { + "epoch": 0.03233142179157603, + "step": 327, + "train/total_loss": 0.2849878668785095 + }, + { + "entropy": 9.247129440307617, + "epoch": 0.03243029464109155, + "mean_token_accuracy": 0.678329586982727, + "num_tokens": 1715575.0, + "step": 328, + "train/ce_loss": 0.8147888779640198 + }, + { + "epoch": 0.03243029464109155, + "step": 328, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.03243029464109155, + "step": 328, + "train/total_loss": 0.19476014375686646 + }, + { + "entropy": 9.43368911743164, + "epoch": 0.03252916749060708, + "mean_token_accuracy": 0.7616707682609558, + "num_tokens": 1720850.0, + "step": 329, + "train/ce_loss": 1.0006930828094482 + }, + { + "epoch": 0.03252916749060708, + "step": 329, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.03252916749060708, + "step": 329, + "train/total_loss": 0.1977255642414093 + }, + { + "entropy": 9.383726119995117, + "epoch": 0.0326280403401226, + "mean_token_accuracy": 0.7228915691375732, + "num_tokens": 1726093.0, + "step": 330, + "train/ce_loss": 0.9005025625228882 + }, + { + "epoch": 0.0326280403401226, + "step": 330, + "train/sim_loss": 0.15234375 + }, + { + "epoch": 0.0326280403401226, + "step": 330, + "train/total_loss": 0.24239400029182434 + }, + { + "entropy": 9.186288833618164, + "epoch": 0.03272691318963813, + "mean_token_accuracy": 0.6726190447807312, + "num_tokens": 1731378.0, + "step": 331, + "train/ce_loss": 0.8888669013977051 + }, + { + "epoch": 0.03272691318963813, + "step": 331, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.03272691318963813, + "step": 331, + "train/total_loss": 0.20998044312000275 + }, + { + "entropy": 9.234413146972656, + "epoch": 0.03282578603915365, + "mean_token_accuracy": 0.7278989553451538, + "num_tokens": 1736731.0, + "step": 332, + "train/ce_loss": 1.1174771785736084 + }, + { + "epoch": 0.03282578603915365, + "step": 332, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.03282578603915365, + "step": 332, + "train/total_loss": 0.25237271189689636 + }, + { + "entropy": 9.7683687210083, + "epoch": 0.03292465888866917, + "mean_token_accuracy": 0.761904776096344, + "num_tokens": 1741814.0, + "step": 333, + "train/ce_loss": 0.810090959072113 + }, + { + "epoch": 0.03292465888866917, + "step": 333, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.03292465888866917, + "step": 333, + "train/total_loss": 0.19038408994674683 + }, + { + "entropy": 9.496259689331055, + "epoch": 0.033023531738184696, + "mean_token_accuracy": 0.7402088642120361, + "num_tokens": 1747017.0, + "step": 334, + "train/ce_loss": 1.5065089464187622 + }, + { + "epoch": 0.033023531738184696, + "step": 334, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.033023531738184696, + "step": 334, + "train/total_loss": 0.26783841848373413 + }, + { + "entropy": 9.342421531677246, + "epoch": 0.033122404587700216, + "mean_token_accuracy": 0.7607142925262451, + "num_tokens": 1752372.0, + "step": 335, + "train/ce_loss": 1.0633383989334106 + }, + { + "epoch": 0.033122404587700216, + "step": 335, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.033122404587700216, + "step": 335, + "train/total_loss": 0.24695885181427002 + }, + { + "entropy": 9.478039741516113, + "epoch": 0.03322127743721574, + "mean_token_accuracy": 0.7618438005447388, + "num_tokens": 1757607.0, + "step": 336, + "train/ce_loss": 0.7672297358512878 + }, + { + "epoch": 0.03322127743721574, + "step": 336, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.03322127743721574, + "step": 336, + "train/total_loss": 0.19000422954559326 + }, + { + "entropy": 9.245959281921387, + "epoch": 0.033320150286731264, + "mean_token_accuracy": 0.7064732313156128, + "num_tokens": 1763010.0, + "step": 337, + "train/ce_loss": 1.899410367012024 + }, + { + "epoch": 0.033320150286731264, + "step": 337, + "train/sim_loss": 0.21875 + }, + { + "epoch": 0.033320150286731264, + "step": 337, + "train/total_loss": 0.40869104862213135 + }, + { + "entropy": 9.674976348876953, + "epoch": 0.033419023136246784, + "mean_token_accuracy": 0.7416918277740479, + "num_tokens": 1768106.0, + "step": 338, + "train/ce_loss": 0.6345680356025696 + }, + { + "epoch": 0.033419023136246784, + "step": 338, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.033419023136246784, + "step": 338, + "train/total_loss": 0.11814430356025696 + }, + { + "entropy": 9.843158721923828, + "epoch": 0.03351789598576231, + "mean_token_accuracy": 0.7960000038146973, + "num_tokens": 1773101.0, + "step": 339, + "train/ce_loss": 0.09235163033008575 + }, + { + "epoch": 0.03351789598576231, + "step": 339, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.03351789598576231, + "step": 339, + "train/total_loss": 0.13032890856266022 + }, + { + "epoch": 0.03361676883527783, + "grad_norm": 1.593528151512146, + "learning_rate": 9.918656974731741e-06, + "loss": 0.2225, + "step": 340 + }, + { + "entropy": 9.499095916748047, + "epoch": 0.03361676883527783, + "mean_token_accuracy": 0.7245430946350098, + "num_tokens": 1778328.0, + "step": 340, + "train/ce_loss": 1.1299872398376465 + }, + { + "epoch": 0.03361676883527783, + "step": 340, + "train/sim_loss": 0.15625 + }, + { + "epoch": 0.03361676883527783, + "step": 340, + "train/total_loss": 0.26924872398376465 + }, + { + "entropy": 9.061765670776367, + "epoch": 0.03371564168479336, + "mean_token_accuracy": 0.7465968728065491, + "num_tokens": 1783740.0, + "step": 341, + "train/ce_loss": 1.1679484844207764 + }, + { + "epoch": 0.03371564168479336, + "step": 341, + "train/sim_loss": 0.15234375 + }, + { + "epoch": 0.03371564168479336, + "step": 341, + "train/total_loss": 0.2691386044025421 + }, + { + "entropy": 9.927787780761719, + "epoch": 0.03381451453430888, + "mean_token_accuracy": 0.7411764860153198, + "num_tokens": 1788923.0, + "step": 342, + "train/ce_loss": 0.07725854218006134 + }, + { + "epoch": 0.03381451453430888, + "step": 342, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.03381451453430888, + "step": 342, + "train/total_loss": 0.08975710719823837 + }, + { + "entropy": 9.046907424926758, + "epoch": 0.0339133873838244, + "mean_token_accuracy": 0.7765957713127136, + "num_tokens": 1794222.0, + "step": 343, + "train/ce_loss": 0.5280411243438721 + }, + { + "epoch": 0.0339133873838244, + "step": 343, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.0339133873838244, + "step": 343, + "train/total_loss": 0.1934291124343872 + }, + { + "entropy": 9.82237434387207, + "epoch": 0.034012260233339926, + "mean_token_accuracy": 0.7479674816131592, + "num_tokens": 1799263.0, + "step": 344, + "train/ce_loss": 0.07228964567184448 + }, + { + "epoch": 0.034012260233339926, + "step": 344, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.034012260233339926, + "step": 344, + "train/total_loss": 0.06582271307706833 + }, + { + "entropy": 9.202108383178711, + "epoch": 0.03411113308285545, + "mean_token_accuracy": 0.803748607635498, + "num_tokens": 1804661.0, + "step": 345, + "train/ce_loss": 0.6875723004341125 + }, + { + "epoch": 0.03411113308285545, + "step": 345, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.03411113308285545, + "step": 345, + "train/total_loss": 0.13516348600387573 + }, + { + "entropy": 9.554802894592285, + "epoch": 0.034210005932370974, + "mean_token_accuracy": 0.7437407970428467, + "num_tokens": 1809827.0, + "step": 346, + "train/ce_loss": 0.7660393714904785 + }, + { + "epoch": 0.034210005932370974, + "step": 346, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.034210005932370974, + "step": 346, + "train/total_loss": 0.1976976990699768 + }, + { + "entropy": 9.579947471618652, + "epoch": 0.034308878781886494, + "mean_token_accuracy": 0.6934749484062195, + "num_tokens": 1814939.0, + "step": 347, + "train/ce_loss": 0.06933347135782242 + }, + { + "epoch": 0.034308878781886494, + "step": 347, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.034308878781886494, + "step": 347, + "train/total_loss": 0.13974584639072418 + }, + { + "entropy": 9.451644897460938, + "epoch": 0.034407751631402014, + "mean_token_accuracy": 0.6985769867897034, + "num_tokens": 1820363.0, + "step": 348, + "train/ce_loss": 1.155254602432251 + }, + { + "epoch": 0.034407751631402014, + "step": 348, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.034407751631402014, + "step": 348, + "train/total_loss": 0.22880670428276062 + }, + { + "entropy": 10.010807991027832, + "epoch": 0.03450662448091754, + "mean_token_accuracy": 0.6879310607910156, + "num_tokens": 1825338.0, + "step": 349, + "train/ce_loss": 0.0773419514298439 + }, + { + "epoch": 0.03450662448091754, + "step": 349, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.03450662448091754, + "step": 349, + "train/total_loss": 0.05851544439792633 + }, + { + "entropy": 9.17041015625, + "epoch": 0.03460549733043306, + "mean_token_accuracy": 0.7520184516906738, + "num_tokens": 1830655.0, + "step": 350, + "train/ce_loss": 0.6814654469490051 + }, + { + "epoch": 0.03460549733043306, + "step": 350, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.03460549733043306, + "step": 350, + "train/total_loss": 0.13455280661582947 + }, + { + "entropy": 10.224859237670898, + "epoch": 0.03470437017994859, + "mean_token_accuracy": 0.7102137804031372, + "num_tokens": 1835458.0, + "step": 351, + "train/ce_loss": 0.10597500205039978 + }, + { + "epoch": 0.03470437017994859, + "step": 351, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.03470437017994859, + "step": 351, + "train/total_loss": 0.05747250095009804 + }, + { + "entropy": 9.771078109741211, + "epoch": 0.03480324302946411, + "mean_token_accuracy": 0.6955752372741699, + "num_tokens": 1840454.0, + "step": 352, + "train/ce_loss": 0.07685268670320511 + }, + { + "epoch": 0.03480324302946411, + "step": 352, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.03480324302946411, + "step": 352, + "train/total_loss": 0.13268527388572693 + }, + { + "entropy": 9.915056228637695, + "epoch": 0.03490211587897963, + "mean_token_accuracy": 0.6846542954444885, + "num_tokens": 1845477.0, + "step": 353, + "train/ce_loss": 0.07793489098548889 + }, + { + "epoch": 0.03490211587897963, + "step": 353, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.03490211587897963, + "step": 353, + "train/total_loss": 0.09373098611831665 + }, + { + "entropy": 9.970849990844727, + "epoch": 0.03500098872849516, + "mean_token_accuracy": 0.7813687920570374, + "num_tokens": 1850429.0, + "step": 354, + "train/ce_loss": 0.08181675523519516 + }, + { + "epoch": 0.03500098872849516, + "step": 354, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.03500098872849516, + "step": 354, + "train/total_loss": 0.13708792626857758 + }, + { + "entropy": 9.199779510498047, + "epoch": 0.03509986157801068, + "mean_token_accuracy": 0.759100615978241, + "num_tokens": 1855814.0, + "step": 355, + "train/ce_loss": 0.9021751284599304 + }, + { + "epoch": 0.03509986157801068, + "step": 355, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.03509986157801068, + "step": 355, + "train/total_loss": 0.17224876582622528 + }, + { + "entropy": 9.291690826416016, + "epoch": 0.035198734427526204, + "mean_token_accuracy": 0.709563136100769, + "num_tokens": 1861150.0, + "step": 356, + "train/ce_loss": 0.7254281044006348 + }, + { + "epoch": 0.035198734427526204, + "step": 356, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.035198734427526204, + "step": 356, + "train/total_loss": 0.20926156640052795 + }, + { + "entropy": 9.447186470031738, + "epoch": 0.035297607277041725, + "mean_token_accuracy": 0.7272727489471436, + "num_tokens": 1866398.0, + "step": 357, + "train/ce_loss": 0.6839994788169861 + }, + { + "epoch": 0.035297607277041725, + "step": 357, + "train/sim_loss": 0.203125 + }, + { + "epoch": 0.035297607277041725, + "step": 357, + "train/total_loss": 0.27152496576309204 + }, + { + "entropy": 10.161275863647461, + "epoch": 0.035396480126557245, + "mean_token_accuracy": 0.7622222304344177, + "num_tokens": 1871278.0, + "step": 358, + "train/ce_loss": 1.1349841356277466 + }, + { + "epoch": 0.035396480126557245, + "step": 358, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.035396480126557245, + "step": 358, + "train/total_loss": 0.21506091952323914 + }, + { + "entropy": 9.809469223022461, + "epoch": 0.03549535297607277, + "mean_token_accuracy": 0.6945337653160095, + "num_tokens": 1876358.0, + "step": 359, + "train/ce_loss": 0.07336652278900146 + }, + { + "epoch": 0.03549535297607277, + "step": 359, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.03549535297607277, + "step": 359, + "train/total_loss": 0.09327415376901627 + }, + { + "epoch": 0.03559422582558829, + "grad_norm": 1.2241110801696777, + "learning_rate": 9.913712109973793e-06, + "loss": 0.2122, + "step": 360 + }, + { + "entropy": 9.10953140258789, + "epoch": 0.03559422582558829, + "mean_token_accuracy": 0.7454128265380859, + "num_tokens": 1881776.0, + "step": 360, + "train/ce_loss": 0.8973351120948792 + }, + { + "epoch": 0.03559422582558829, + "step": 360, + "train/sim_loss": 0.19140625 + }, + { + "epoch": 0.03559422582558829, + "step": 360, + "train/total_loss": 0.2811397612094879 + }, + { + "entropy": 9.590354919433594, + "epoch": 0.03569309867510382, + "mean_token_accuracy": 0.807894766330719, + "num_tokens": 1886981.0, + "step": 361, + "train/ce_loss": 0.05856641009449959 + }, + { + "epoch": 0.03569309867510382, + "step": 361, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.03569309867510382, + "step": 361, + "train/total_loss": 0.12695039808750153 + }, + { + "entropy": 9.504430770874023, + "epoch": 0.03579197152461934, + "mean_token_accuracy": 0.702570378780365, + "num_tokens": 1892273.0, + "step": 362, + "train/ce_loss": 0.9794031381607056 + }, + { + "epoch": 0.03579197152461934, + "step": 362, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.03579197152461934, + "step": 362, + "train/total_loss": 0.1995028257369995 + }, + { + "entropy": 9.122315406799316, + "epoch": 0.03589084437413486, + "mean_token_accuracy": 0.7554240822792053, + "num_tokens": 1897749.0, + "step": 363, + "train/ce_loss": 0.9510576128959656 + }, + { + "epoch": 0.03589084437413486, + "step": 363, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.03589084437413486, + "step": 363, + "train/total_loss": 0.16541826725006104 + }, + { + "entropy": 8.951866149902344, + "epoch": 0.03598971722365039, + "mean_token_accuracy": 0.7251461744308472, + "num_tokens": 1903146.0, + "step": 364, + "train/ce_loss": 0.5546991229057312 + }, + { + "epoch": 0.03598971722365039, + "step": 364, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.03598971722365039, + "step": 364, + "train/total_loss": 0.11015741527080536 + }, + { + "entropy": 9.105632781982422, + "epoch": 0.03608859007316591, + "mean_token_accuracy": 0.7198622226715088, + "num_tokens": 1908511.0, + "step": 365, + "train/ce_loss": 0.8597995042800903 + }, + { + "epoch": 0.03608859007316591, + "step": 365, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.03608859007316591, + "step": 365, + "train/total_loss": 0.18363620340824127 + }, + { + "entropy": 9.629191398620605, + "epoch": 0.036187462922681435, + "mean_token_accuracy": 0.703342616558075, + "num_tokens": 1913848.0, + "step": 366, + "train/ce_loss": 1.2104308605194092 + }, + { + "epoch": 0.036187462922681435, + "step": 366, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.036187462922681435, + "step": 366, + "train/total_loss": 0.24213683605194092 + }, + { + "entropy": 9.564308166503906, + "epoch": 0.036286335772196955, + "mean_token_accuracy": 0.7109004855155945, + "num_tokens": 1919134.0, + "step": 367, + "train/ce_loss": 1.0658434629440308 + }, + { + "epoch": 0.036286335772196955, + "step": 367, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.036286335772196955, + "step": 367, + "train/total_loss": 0.2315843403339386 + }, + { + "entropy": 9.375260353088379, + "epoch": 0.036385208621712475, + "mean_token_accuracy": 0.7547169923782349, + "num_tokens": 1924431.0, + "step": 368, + "train/ce_loss": 1.1075860261917114 + }, + { + "epoch": 0.036385208621712475, + "step": 368, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.036385208621712475, + "step": 368, + "train/total_loss": 0.22794610261917114 + }, + { + "entropy": 9.342070579528809, + "epoch": 0.036484081471228, + "mean_token_accuracy": 0.6740740537643433, + "num_tokens": 1929668.0, + "step": 369, + "train/ce_loss": 2.2320828437805176 + }, + { + "epoch": 0.036484081471228, + "step": 369, + "train/sim_loss": 0.1953125 + }, + { + "epoch": 0.036484081471228, + "step": 369, + "train/total_loss": 0.41852080821990967 + }, + { + "entropy": 9.583789825439453, + "epoch": 0.03658295432074352, + "mean_token_accuracy": 0.71378093957901, + "num_tokens": 1934657.0, + "step": 370, + "train/ce_loss": 1.2746697664260864 + }, + { + "epoch": 0.03658295432074352, + "step": 370, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.03658295432074352, + "step": 370, + "train/total_loss": 0.20949822664260864 + }, + { + "entropy": 9.580131530761719, + "epoch": 0.03668182717025905, + "mean_token_accuracy": 0.755215585231781, + "num_tokens": 1939828.0, + "step": 371, + "train/ce_loss": 0.8176199793815613 + }, + { + "epoch": 0.03668182717025905, + "step": 371, + "train/sim_loss": 0.171875 + }, + { + "epoch": 0.03668182717025905, + "step": 371, + "train/total_loss": 0.25363701581954956 + }, + { + "entropy": 9.380910873413086, + "epoch": 0.03678070001977457, + "mean_token_accuracy": 0.7086801528930664, + "num_tokens": 1945141.0, + "step": 372, + "train/ce_loss": 1.2361494302749634 + }, + { + "epoch": 0.03678070001977457, + "step": 372, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.03678070001977457, + "step": 372, + "train/total_loss": 0.20564618706703186 + }, + { + "entropy": 9.45290756225586, + "epoch": 0.03687957286929009, + "mean_token_accuracy": 0.7642679810523987, + "num_tokens": 1950405.0, + "step": 373, + "train/ce_loss": 0.949661135673523 + }, + { + "epoch": 0.03687957286929009, + "step": 373, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.03687957286929009, + "step": 373, + "train/total_loss": 0.2199661135673523 + }, + { + "entropy": 9.106277465820312, + "epoch": 0.03697844571880562, + "mean_token_accuracy": 0.7809057235717773, + "num_tokens": 1955673.0, + "step": 374, + "train/ce_loss": 0.6012702584266663 + }, + { + "epoch": 0.03697844571880562, + "step": 374, + "train/sim_loss": 0.16796875 + }, + { + "epoch": 0.03697844571880562, + "step": 374, + "train/total_loss": 0.22809576988220215 + }, + { + "entropy": 9.308792114257812, + "epoch": 0.03707731856832114, + "mean_token_accuracy": 0.6877990365028381, + "num_tokens": 1960969.0, + "step": 375, + "train/ce_loss": 0.7121626734733582 + }, + { + "epoch": 0.03707731856832114, + "step": 375, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.03707731856832114, + "step": 375, + "train/total_loss": 0.21965377032756805 + }, + { + "entropy": 9.10552978515625, + "epoch": 0.037176191417836665, + "mean_token_accuracy": 0.7052631378173828, + "num_tokens": 1966398.0, + "step": 376, + "train/ce_loss": 1.4431555271148682 + }, + { + "epoch": 0.037176191417836665, + "step": 376, + "train/sim_loss": 0.19921875 + }, + { + "epoch": 0.037176191417836665, + "step": 376, + "train/total_loss": 0.34353429079055786 + }, + { + "entropy": 9.340739250183105, + "epoch": 0.037275064267352186, + "mean_token_accuracy": 0.7555012106895447, + "num_tokens": 1971693.0, + "step": 377, + "train/ce_loss": 0.5882933735847473 + }, + { + "epoch": 0.037275064267352186, + "step": 377, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.037275064267352186, + "step": 377, + "train/total_loss": 0.13695433735847473 + }, + { + "entropy": 10.635891914367676, + "epoch": 0.037373937116867706, + "mean_token_accuracy": 0.8177340030670166, + "num_tokens": 1976284.0, + "step": 378, + "train/ce_loss": 0.2206471860408783 + }, + { + "epoch": 0.037373937116867706, + "step": 378, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.037373937116867706, + "step": 378, + "train/total_loss": 0.09628346562385559 + }, + { + "entropy": 9.537246704101562, + "epoch": 0.03747280996638323, + "mean_token_accuracy": 0.7893961668014526, + "num_tokens": 1981422.0, + "step": 379, + "train/ce_loss": 1.5890189409255981 + }, + { + "epoch": 0.03747280996638323, + "step": 379, + "train/sim_loss": 0.16015625 + }, + { + "epoch": 0.03747280996638323, + "step": 379, + "train/total_loss": 0.3190581500530243 + }, + { + "epoch": 0.03757168281589875, + "grad_norm": 1.5449098348617554, + "learning_rate": 9.908767245215844e-06, + "loss": 0.2045, + "step": 380 + }, + { + "entropy": 9.678614616394043, + "epoch": 0.03757168281589875, + "mean_token_accuracy": 0.7061538696289062, + "num_tokens": 1986517.0, + "step": 380, + "train/ce_loss": 1.311052680015564 + }, + { + "epoch": 0.03757168281589875, + "step": 380, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.03757168281589875, + "step": 380, + "train/total_loss": 0.2600115239620209 + }, + { + "entropy": 9.551870346069336, + "epoch": 0.03767055566541428, + "mean_token_accuracy": 0.7640449404716492, + "num_tokens": 1991679.0, + "step": 381, + "train/ce_loss": 0.06471993774175644 + }, + { + "epoch": 0.03767055566541428, + "step": 381, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.03767055566541428, + "step": 381, + "train/total_loss": 0.08850324153900146 + }, + { + "entropy": 8.966535568237305, + "epoch": 0.0377694285149298, + "mean_token_accuracy": 0.7140204310417175, + "num_tokens": 1997213.0, + "step": 382, + "train/ce_loss": 1.2641946077346802 + }, + { + "epoch": 0.0377694285149298, + "step": 382, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.0377694285149298, + "step": 382, + "train/total_loss": 0.23970071971416473 + }, + { + "entropy": 9.849260330200195, + "epoch": 0.03786830136444532, + "mean_token_accuracy": 0.796875, + "num_tokens": 2002255.0, + "step": 383, + "train/ce_loss": 1.4398695230484009 + }, + { + "epoch": 0.03786830136444532, + "step": 383, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.03786830136444532, + "step": 383, + "train/total_loss": 0.24554945528507233 + }, + { + "entropy": 9.840927124023438, + "epoch": 0.03796717421396085, + "mean_token_accuracy": 0.6904761791229248, + "num_tokens": 2007253.0, + "step": 384, + "train/ce_loss": 0.07275044918060303 + }, + { + "epoch": 0.03796717421396085, + "step": 384, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.03796717421396085, + "step": 384, + "train/total_loss": 0.1439937949180603 + }, + { + "entropy": 9.434771537780762, + "epoch": 0.03806604706347637, + "mean_token_accuracy": 0.7175843715667725, + "num_tokens": 2012308.0, + "step": 385, + "train/ce_loss": 1.2704976797103882 + }, + { + "epoch": 0.03806604706347637, + "step": 385, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.03806604706347637, + "step": 385, + "train/total_loss": 0.2364247739315033 + }, + { + "entropy": 9.346845626831055, + "epoch": 0.03816491991299189, + "mean_token_accuracy": 0.7320573925971985, + "num_tokens": 2017561.0, + "step": 386, + "train/ce_loss": 0.6856738924980164 + }, + { + "epoch": 0.03816491991299189, + "step": 386, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.03816491991299189, + "step": 386, + "train/total_loss": 0.1505986452102661 + }, + { + "entropy": 9.976537704467773, + "epoch": 0.038263792762507416, + "mean_token_accuracy": 0.7317939400672913, + "num_tokens": 2022530.0, + "step": 387, + "train/ce_loss": 0.07988831400871277 + }, + { + "epoch": 0.038263792762507416, + "step": 387, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.038263792762507416, + "step": 387, + "train/total_loss": 0.1017388328909874 + }, + { + "entropy": 9.263082504272461, + "epoch": 0.038362665612022936, + "mean_token_accuracy": 0.7423728704452515, + "num_tokens": 2027916.0, + "step": 388, + "train/ce_loss": 0.9694319367408752 + }, + { + "epoch": 0.038362665612022936, + "step": 388, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.038362665612022936, + "step": 388, + "train/total_loss": 0.167255699634552 + }, + { + "entropy": 9.486333847045898, + "epoch": 0.038461538461538464, + "mean_token_accuracy": 0.7420538067817688, + "num_tokens": 2033210.0, + "step": 389, + "train/ce_loss": 1.5127630233764648 + }, + { + "epoch": 0.038461538461538464, + "step": 389, + "train/sim_loss": 0.171875 + }, + { + "epoch": 0.038461538461538464, + "step": 389, + "train/total_loss": 0.32315129041671753 + }, + { + "entropy": 9.922418594360352, + "epoch": 0.038560411311053984, + "mean_token_accuracy": 0.7598039507865906, + "num_tokens": 2038255.0, + "step": 390, + "train/ce_loss": 0.6838778853416443 + }, + { + "epoch": 0.038560411311053984, + "step": 390, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.038560411311053984, + "step": 390, + "train/total_loss": 0.19729404151439667 + }, + { + "entropy": 9.55936050415039, + "epoch": 0.038659284160569504, + "mean_token_accuracy": 0.7387005686759949, + "num_tokens": 2043404.0, + "step": 391, + "train/ce_loss": 0.9636222720146179 + }, + { + "epoch": 0.038659284160569504, + "step": 391, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.038659284160569504, + "step": 391, + "train/total_loss": 0.19401848316192627 + }, + { + "entropy": 9.560068130493164, + "epoch": 0.03875815701008503, + "mean_token_accuracy": 0.712435245513916, + "num_tokens": 2048633.0, + "step": 392, + "train/ce_loss": 0.05836481228470802 + }, + { + "epoch": 0.03875815701008503, + "step": 392, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.03875815701008503, + "step": 392, + "train/total_loss": 0.10739897936582565 + }, + { + "entropy": 9.025551795959473, + "epoch": 0.03885702985960055, + "mean_token_accuracy": 0.7123420834541321, + "num_tokens": 2054126.0, + "step": 393, + "train/ce_loss": 0.6507374048233032 + }, + { + "epoch": 0.03885702985960055, + "step": 393, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.03885702985960055, + "step": 393, + "train/total_loss": 0.10804249346256256 + }, + { + "entropy": 9.996772766113281, + "epoch": 0.03895590270911608, + "mean_token_accuracy": 0.7022058963775635, + "num_tokens": 2059108.0, + "step": 394, + "train/ce_loss": 2.8516266345977783 + }, + { + "epoch": 0.03895590270911608, + "step": 394, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.03895590270911608, + "step": 394, + "train/total_loss": 0.42578765749931335 + }, + { + "entropy": 10.15201187133789, + "epoch": 0.0390547755586316, + "mean_token_accuracy": 0.7736263871192932, + "num_tokens": 2063954.0, + "step": 395, + "train/ce_loss": 1.0598214864730835 + }, + { + "epoch": 0.0390547755586316, + "step": 395, + "train/sim_loss": 0.171875 + }, + { + "epoch": 0.0390547755586316, + "step": 395, + "train/total_loss": 0.2778571546077728 + }, + { + "entropy": 9.360065460205078, + "epoch": 0.03915364840814712, + "mean_token_accuracy": 0.7875586748123169, + "num_tokens": 2069258.0, + "step": 396, + "train/ce_loss": 1.0520858764648438 + }, + { + "epoch": 0.03915364840814712, + "step": 396, + "train/sim_loss": 0.14453125 + }, + { + "epoch": 0.03915364840814712, + "step": 396, + "train/total_loss": 0.2497398406267166 + }, + { + "entropy": 9.63375473022461, + "epoch": 0.03925252125766265, + "mean_token_accuracy": 0.691717803478241, + "num_tokens": 2074378.0, + "step": 397, + "train/ce_loss": 1.5342135429382324 + }, + { + "epoch": 0.03925252125766265, + "step": 397, + "train/sim_loss": 0.17578125 + }, + { + "epoch": 0.03925252125766265, + "step": 397, + "train/total_loss": 0.3292025923728943 + }, + { + "entropy": 9.150320053100586, + "epoch": 0.03935139410717817, + "mean_token_accuracy": 0.6758767366409302, + "num_tokens": 2079768.0, + "step": 398, + "train/ce_loss": 1.444730281829834 + }, + { + "epoch": 0.03935139410717817, + "step": 398, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.03935139410717817, + "step": 398, + "train/total_loss": 0.24603553116321564 + }, + { + "entropy": 9.808318138122559, + "epoch": 0.039450266956693694, + "mean_token_accuracy": 0.6682692170143127, + "num_tokens": 2084825.0, + "step": 399, + "train/ce_loss": 1.5510262250900269 + }, + { + "epoch": 0.039450266956693694, + "step": 399, + "train/sim_loss": 0.16015625 + }, + { + "epoch": 0.039450266956693694, + "step": 399, + "train/total_loss": 0.31525886058807373 + }, + { + "epoch": 0.039549139806209214, + "grad_norm": 2.2871906757354736, + "learning_rate": 9.903822380457894e-06, + "loss": 0.212, + "step": 400 + }, + { + "entropy": 10.660465240478516, + "epoch": 0.039549139806209214, + "mean_token_accuracy": 0.7262569665908813, + "num_tokens": 2089397.0, + "step": 400, + "train/ce_loss": 0.256531298160553 + }, + { + "epoch": 0.039549139806209214, + "step": 400, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.039549139806209214, + "step": 400, + "train/total_loss": 0.11940313130617142 + }, + { + "entropy": 9.446050643920898, + "epoch": 0.039648012655724735, + "mean_token_accuracy": 0.6892856955528259, + "num_tokens": 2094670.0, + "step": 401, + "train/ce_loss": 1.4382058382034302 + }, + { + "epoch": 0.039648012655724735, + "step": 401, + "train/sim_loss": 0.1796875 + }, + { + "epoch": 0.039648012655724735, + "step": 401, + "train/total_loss": 0.323508083820343 + }, + { + "entropy": 9.751455307006836, + "epoch": 0.03974688550524026, + "mean_token_accuracy": 0.7054263353347778, + "num_tokens": 2099743.0, + "step": 402, + "train/ce_loss": 0.8057286143302917 + }, + { + "epoch": 0.03974688550524026, + "step": 402, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.03974688550524026, + "step": 402, + "train/total_loss": 0.21338537335395813 + }, + { + "entropy": 9.942941665649414, + "epoch": 0.03984575835475578, + "mean_token_accuracy": 0.7459749579429626, + "num_tokens": 2104887.0, + "step": 403, + "train/ce_loss": 1.0311578512191772 + }, + { + "epoch": 0.03984575835475578, + "step": 403, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.03984575835475578, + "step": 403, + "train/total_loss": 0.20858454704284668 + }, + { + "entropy": 10.0800199508667, + "epoch": 0.03994463120427131, + "mean_token_accuracy": 0.6900901198387146, + "num_tokens": 2109903.0, + "step": 404, + "train/ce_loss": 1.4212541580200195 + }, + { + "epoch": 0.03994463120427131, + "step": 404, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.03994463120427131, + "step": 404, + "train/total_loss": 0.24759416282176971 + }, + { + "entropy": 9.820121765136719, + "epoch": 0.04004350405378683, + "mean_token_accuracy": 0.7098150849342346, + "num_tokens": 2115001.0, + "step": 405, + "train/ce_loss": 0.9588977098464966 + }, + { + "epoch": 0.04004350405378683, + "step": 405, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.04004350405378683, + "step": 405, + "train/total_loss": 0.17401477694511414 + }, + { + "entropy": 8.908750534057617, + "epoch": 0.04014237690330235, + "mean_token_accuracy": 0.7490909099578857, + "num_tokens": 2120590.0, + "step": 406, + "train/ce_loss": 0.5333845615386963 + }, + { + "epoch": 0.04014237690330235, + "step": 406, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.04014237690330235, + "step": 406, + "train/total_loss": 0.2017759531736374 + }, + { + "entropy": 9.100845336914062, + "epoch": 0.04024124975281788, + "mean_token_accuracy": 0.7977142930030823, + "num_tokens": 2125997.0, + "step": 407, + "train/ce_loss": 0.46464645862579346 + }, + { + "epoch": 0.04024124975281788, + "step": 407, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.04024124975281788, + "step": 407, + "train/total_loss": 0.10115215182304382 + }, + { + "entropy": 9.426050186157227, + "epoch": 0.0403401226023334, + "mean_token_accuracy": 0.7438867688179016, + "num_tokens": 2131308.0, + "step": 408, + "train/ce_loss": 1.037602186203003 + }, + { + "epoch": 0.0403401226023334, + "step": 408, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.0403401226023334, + "step": 408, + "train/total_loss": 0.17407271265983582 + }, + { + "entropy": 9.679730415344238, + "epoch": 0.040438995451848925, + "mean_token_accuracy": 0.7153392434120178, + "num_tokens": 2136448.0, + "step": 409, + "train/ce_loss": 0.06517762690782547 + }, + { + "epoch": 0.040438995451848925, + "step": 409, + "train/sim_loss": 0.171875 + }, + { + "epoch": 0.040438995451848925, + "step": 409, + "train/total_loss": 0.17839276790618896 + }, + { + "entropy": 9.857072830200195, + "epoch": 0.040537868301364445, + "mean_token_accuracy": 0.7915966510772705, + "num_tokens": 2141513.0, + "step": 410, + "train/ce_loss": 1.0456303358078003 + }, + { + "epoch": 0.040537868301364445, + "step": 410, + "train/sim_loss": 0.16796875 + }, + { + "epoch": 0.040537868301364445, + "step": 410, + "train/total_loss": 0.27253177762031555 + }, + { + "entropy": 9.174946784973145, + "epoch": 0.040636741150879965, + "mean_token_accuracy": 0.7205422520637512, + "num_tokens": 2146915.0, + "step": 411, + "train/ce_loss": 0.7723899483680725 + }, + { + "epoch": 0.040636741150879965, + "step": 411, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.040636741150879965, + "step": 411, + "train/total_loss": 0.1905202567577362 + }, + { + "entropy": 9.211700439453125, + "epoch": 0.04073561400039549, + "mean_token_accuracy": 0.7519181370735168, + "num_tokens": 2152109.0, + "step": 412, + "train/ce_loss": 0.9920454621315002 + }, + { + "epoch": 0.04073561400039549, + "step": 412, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.04073561400039549, + "step": 412, + "train/total_loss": 0.23592329025268555 + }, + { + "entropy": 9.378646850585938, + "epoch": 0.04083448684991101, + "mean_token_accuracy": 0.7542856931686401, + "num_tokens": 2157459.0, + "step": 413, + "train/ce_loss": 0.9726569652557373 + }, + { + "epoch": 0.04083448684991101, + "step": 413, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.04083448684991101, + "step": 413, + "train/total_loss": 0.23007819056510925 + }, + { + "entropy": 9.208342552185059, + "epoch": 0.04093335969942654, + "mean_token_accuracy": 0.6861538290977478, + "num_tokens": 2162936.0, + "step": 414, + "train/ce_loss": 1.107186198234558 + }, + { + "epoch": 0.04093335969942654, + "step": 414, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.04093335969942654, + "step": 414, + "train/total_loss": 0.20446862280368805 + }, + { + "entropy": 9.583135604858398, + "epoch": 0.04103223254894206, + "mean_token_accuracy": 0.740641713142395, + "num_tokens": 2168070.0, + "step": 415, + "train/ce_loss": 0.6643601059913635 + }, + { + "epoch": 0.04103223254894206, + "step": 415, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.04103223254894206, + "step": 415, + "train/total_loss": 0.1758110225200653 + }, + { + "entropy": 9.213274002075195, + "epoch": 0.04113110539845758, + "mean_token_accuracy": 0.6670157313346863, + "num_tokens": 2173464.0, + "step": 416, + "train/ce_loss": 1.370169997215271 + }, + { + "epoch": 0.04113110539845758, + "step": 416, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.04113110539845758, + "step": 416, + "train/total_loss": 0.25029826164245605 + }, + { + "entropy": 9.854242324829102, + "epoch": 0.04122997824797311, + "mean_token_accuracy": 0.7210776805877686, + "num_tokens": 2178511.0, + "step": 417, + "train/ce_loss": 1.0721811056137085 + }, + { + "epoch": 0.04122997824797311, + "step": 417, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.04122997824797311, + "step": 417, + "train/total_loss": 0.15409311652183533 + }, + { + "entropy": 9.106668472290039, + "epoch": 0.04132885109748863, + "mean_token_accuracy": 0.773797333240509, + "num_tokens": 2183928.0, + "step": 418, + "train/ce_loss": 0.7181837558746338 + }, + { + "epoch": 0.04132885109748863, + "step": 418, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.04132885109748863, + "step": 418, + "train/total_loss": 0.11478712409734726 + }, + { + "entropy": 9.570853233337402, + "epoch": 0.041427723947004155, + "mean_token_accuracy": 0.7247474789619446, + "num_tokens": 2189194.0, + "step": 419, + "train/ce_loss": 1.313214898109436 + }, + { + "epoch": 0.041427723947004155, + "step": 419, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.041427723947004155, + "step": 419, + "train/total_loss": 0.2406964898109436 + }, + { + "epoch": 0.041526596796519676, + "grad_norm": 1.4583357572555542, + "learning_rate": 9.898877515699947e-06, + "loss": 0.2068, + "step": 420 + }, + { + "entropy": 9.421991348266602, + "epoch": 0.041526596796519676, + "mean_token_accuracy": 0.7029449343681335, + "num_tokens": 2194472.0, + "step": 420, + "train/ce_loss": 1.083237886428833 + }, + { + "epoch": 0.041526596796519676, + "step": 420, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.041526596796519676, + "step": 420, + "train/total_loss": 0.20207378268241882 + }, + { + "entropy": 9.213184356689453, + "epoch": 0.041625469646035196, + "mean_token_accuracy": 0.7299435138702393, + "num_tokens": 2199854.0, + "step": 421, + "train/ce_loss": 0.7965179085731506 + }, + { + "epoch": 0.041625469646035196, + "step": 421, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.041625469646035196, + "step": 421, + "train/total_loss": 0.14605805277824402 + }, + { + "entropy": 9.282001495361328, + "epoch": 0.04172434249555072, + "mean_token_accuracy": 0.731225311756134, + "num_tokens": 2205172.0, + "step": 422, + "train/ce_loss": 0.6847802996635437 + }, + { + "epoch": 0.04172434249555072, + "step": 422, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.04172434249555072, + "step": 422, + "train/total_loss": 0.2051967829465866 + }, + { + "entropy": 9.40710163116455, + "epoch": 0.04182321534506624, + "mean_token_accuracy": 0.7071688771247864, + "num_tokens": 2210504.0, + "step": 423, + "train/ce_loss": 0.9189309477806091 + }, + { + "epoch": 0.04182321534506624, + "step": 423, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.04182321534506624, + "step": 423, + "train/total_loss": 0.20126810669898987 + }, + { + "entropy": 9.208239555358887, + "epoch": 0.04192208819458177, + "mean_token_accuracy": 0.7650273442268372, + "num_tokens": 2215960.0, + "step": 424, + "train/ce_loss": 0.6772084832191467 + }, + { + "epoch": 0.04192208819458177, + "step": 424, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.04192208819458177, + "step": 424, + "train/total_loss": 0.15756461024284363 + }, + { + "entropy": 9.339761734008789, + "epoch": 0.04202096104409729, + "mean_token_accuracy": 0.7011995911598206, + "num_tokens": 2221354.0, + "step": 425, + "train/ce_loss": 1.2315471172332764 + }, + { + "epoch": 0.04202096104409729, + "step": 425, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.04202096104409729, + "step": 425, + "train/total_loss": 0.23252971470355988 + }, + { + "entropy": 10.32594108581543, + "epoch": 0.04211983389361281, + "mean_token_accuracy": 0.6701570749282837, + "num_tokens": 2226144.0, + "step": 426, + "train/ce_loss": 0.11662330478429794 + }, + { + "epoch": 0.04211983389361281, + "step": 426, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.04211983389361281, + "step": 426, + "train/total_loss": 0.06634983420372009 + }, + { + "entropy": 9.58592414855957, + "epoch": 0.04221870674312834, + "mean_token_accuracy": 0.6890410780906677, + "num_tokens": 2231272.0, + "step": 427, + "train/ce_loss": 1.2044618129730225 + }, + { + "epoch": 0.04221870674312834, + "step": 427, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.04221870674312834, + "step": 427, + "train/total_loss": 0.25325867533683777 + }, + { + "entropy": 10.165045738220215, + "epoch": 0.04231757959264386, + "mean_token_accuracy": 0.6581395268440247, + "num_tokens": 2236127.0, + "step": 428, + "train/ce_loss": 2.049638509750366 + }, + { + "epoch": 0.04231757959264386, + "step": 428, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.04231757959264386, + "step": 428, + "train/total_loss": 0.3299638628959656 + }, + { + "entropy": 9.492660522460938, + "epoch": 0.042416452442159386, + "mean_token_accuracy": 0.7185792326927185, + "num_tokens": 2241314.0, + "step": 429, + "train/ce_loss": 0.8751475214958191 + }, + { + "epoch": 0.042416452442159386, + "step": 429, + "train/sim_loss": 0.17578125 + }, + { + "epoch": 0.042416452442159386, + "step": 429, + "train/total_loss": 0.2632960081100464 + }, + { + "entropy": 9.028284072875977, + "epoch": 0.042515325291674906, + "mean_token_accuracy": 0.6537585258483887, + "num_tokens": 2246691.0, + "step": 430, + "train/ce_loss": 1.0627728700637817 + }, + { + "epoch": 0.042515325291674906, + "step": 430, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.042515325291674906, + "step": 430, + "train/total_loss": 0.19612103700637817 + }, + { + "entropy": 9.469210624694824, + "epoch": 0.042614198141190426, + "mean_token_accuracy": 0.6758104562759399, + "num_tokens": 2251923.0, + "step": 431, + "train/ce_loss": 0.8347735404968262 + }, + { + "epoch": 0.042614198141190426, + "step": 431, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.042614198141190426, + "step": 431, + "train/total_loss": 0.17722734808921814 + }, + { + "entropy": 9.71932601928711, + "epoch": 0.042713070990705954, + "mean_token_accuracy": 0.7784615159034729, + "num_tokens": 2257009.0, + "step": 432, + "train/ce_loss": 0.8408069014549255 + }, + { + "epoch": 0.042713070990705954, + "step": 432, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.042713070990705954, + "step": 432, + "train/total_loss": 0.20908069610595703 + }, + { + "entropy": 9.299928665161133, + "epoch": 0.042811943840221474, + "mean_token_accuracy": 0.7300771474838257, + "num_tokens": 2262256.0, + "step": 433, + "train/ce_loss": 1.170479655265808 + }, + { + "epoch": 0.042811943840221474, + "step": 433, + "train/sim_loss": 0.16796875 + }, + { + "epoch": 0.042811943840221474, + "step": 433, + "train/total_loss": 0.2850167155265808 + }, + { + "entropy": 9.70884895324707, + "epoch": 0.042910816689737, + "mean_token_accuracy": 0.7266775965690613, + "num_tokens": 2267328.0, + "step": 434, + "train/ce_loss": 0.07351674884557724 + }, + { + "epoch": 0.042910816689737, + "step": 434, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.042910816689737, + "step": 434, + "train/total_loss": 0.13625793159008026 + }, + { + "entropy": 9.696551322937012, + "epoch": 0.04300968953925252, + "mean_token_accuracy": 0.7911184430122375, + "num_tokens": 2272356.0, + "step": 435, + "train/ce_loss": 0.9318480491638184 + }, + { + "epoch": 0.04300968953925252, + "step": 435, + "train/sim_loss": 0.15234375 + }, + { + "epoch": 0.04300968953925252, + "step": 435, + "train/total_loss": 0.24552854895591736 + }, + { + "entropy": 9.346677780151367, + "epoch": 0.04310856238876804, + "mean_token_accuracy": 0.7118644118309021, + "num_tokens": 2277737.0, + "step": 436, + "train/ce_loss": 0.6921613216400146 + }, + { + "epoch": 0.04310856238876804, + "step": 436, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.04310856238876804, + "step": 436, + "train/total_loss": 0.18640363216400146 + }, + { + "entropy": 9.694978713989258, + "epoch": 0.04320743523828357, + "mean_token_accuracy": 0.673202633857727, + "num_tokens": 2283078.0, + "step": 437, + "train/ce_loss": 0.05739998072385788 + }, + { + "epoch": 0.04320743523828357, + "step": 437, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.04320743523828357, + "step": 437, + "train/total_loss": 0.05652124807238579 + }, + { + "entropy": 9.755623817443848, + "epoch": 0.04330630808779909, + "mean_token_accuracy": 0.6567862629890442, + "num_tokens": 2288169.0, + "step": 438, + "train/ce_loss": 3.244351625442505 + }, + { + "epoch": 0.04330630808779909, + "step": 438, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.04330630808779909, + "step": 438, + "train/total_loss": 0.44943517446517944 + }, + { + "entropy": 9.664872169494629, + "epoch": 0.043405180937314616, + "mean_token_accuracy": 0.7266187071800232, + "num_tokens": 2293346.0, + "step": 439, + "train/ce_loss": 1.3087403774261475 + }, + { + "epoch": 0.043405180937314616, + "step": 439, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.043405180937314616, + "step": 439, + "train/total_loss": 0.20899903774261475 + }, + { + "epoch": 0.04350405378683014, + "grad_norm": 2.076592445373535, + "learning_rate": 9.893932650941997e-06, + "loss": 0.2254, + "step": 440 + }, + { + "entropy": 9.562398910522461, + "epoch": 0.04350405378683014, + "mean_token_accuracy": 0.7599999904632568, + "num_tokens": 2298580.0, + "step": 440, + "train/ce_loss": 1.0011541843414307 + }, + { + "epoch": 0.04350405378683014, + "step": 440, + "train/sim_loss": 0.15234375 + }, + { + "epoch": 0.04350405378683014, + "step": 440, + "train/total_loss": 0.25245916843414307 + }, + { + "entropy": 9.547796249389648, + "epoch": 0.04360292663634566, + "mean_token_accuracy": 0.7114177942276001, + "num_tokens": 2303822.0, + "step": 441, + "train/ce_loss": 1.0273072719573975 + }, + { + "epoch": 0.04360292663634566, + "step": 441, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.04360292663634566, + "step": 441, + "train/total_loss": 0.17694947123527527 + }, + { + "entropy": 9.212635040283203, + "epoch": 0.043701799485861184, + "mean_token_accuracy": 0.6997663378715515, + "num_tokens": 2309186.0, + "step": 442, + "train/ce_loss": 1.113999605178833 + }, + { + "epoch": 0.043701799485861184, + "step": 442, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.043701799485861184, + "step": 442, + "train/total_loss": 0.24421246349811554 + }, + { + "entropy": 9.891897201538086, + "epoch": 0.043800672335376704, + "mean_token_accuracy": 0.7316293716430664, + "num_tokens": 2314244.0, + "step": 443, + "train/ce_loss": 1.3147135972976685 + }, + { + "epoch": 0.043800672335376704, + "step": 443, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.043800672335376704, + "step": 443, + "train/total_loss": 0.22912761569023132 + }, + { + "entropy": 9.439208984375, + "epoch": 0.04389954518489223, + "mean_token_accuracy": 0.6620603203773499, + "num_tokens": 2319515.0, + "step": 444, + "train/ce_loss": 0.9787640571594238 + }, + { + "epoch": 0.04389954518489223, + "step": 444, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.04389954518489223, + "step": 444, + "train/total_loss": 0.1916263997554779 + }, + { + "entropy": 9.741747856140137, + "epoch": 0.04399841803440775, + "mean_token_accuracy": 0.7788617610931396, + "num_tokens": 2324573.0, + "step": 445, + "train/ce_loss": 0.9956402778625488 + }, + { + "epoch": 0.04399841803440775, + "step": 445, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.04399841803440775, + "step": 445, + "train/total_loss": 0.22456403076648712 + }, + { + "entropy": 9.297388076782227, + "epoch": 0.04409729088392327, + "mean_token_accuracy": 0.7089151740074158, + "num_tokens": 2329978.0, + "step": 446, + "train/ce_loss": 0.643767774105072 + }, + { + "epoch": 0.04409729088392327, + "step": 446, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.04409729088392327, + "step": 446, + "train/total_loss": 0.14250177145004272 + }, + { + "entropy": 9.491058349609375, + "epoch": 0.0441961637334388, + "mean_token_accuracy": 0.7719298005104065, + "num_tokens": 2335112.0, + "step": 447, + "train/ce_loss": 1.470329999923706 + }, + { + "epoch": 0.0441961637334388, + "step": 447, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.0441961637334388, + "step": 447, + "train/total_loss": 0.2876580059528351 + }, + { + "entropy": 9.639659881591797, + "epoch": 0.04429503658295432, + "mean_token_accuracy": 0.7009345889091492, + "num_tokens": 2340175.0, + "step": 448, + "train/ce_loss": 0.07066857069730759 + }, + { + "epoch": 0.04429503658295432, + "step": 448, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.04429503658295432, + "step": 448, + "train/total_loss": 0.08909811079502106 + }, + { + "entropy": 9.312711715698242, + "epoch": 0.04439390943246985, + "mean_token_accuracy": 0.6821120977401733, + "num_tokens": 2345527.0, + "step": 449, + "train/ce_loss": 1.3275420665740967 + }, + { + "epoch": 0.04439390943246985, + "step": 449, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.04439390943246985, + "step": 449, + "train/total_loss": 0.23822295665740967 + }, + { + "entropy": 9.274275779724121, + "epoch": 0.04449278228198537, + "mean_token_accuracy": 0.7433920502662659, + "num_tokens": 2350889.0, + "step": 450, + "train/ce_loss": 0.801036536693573 + }, + { + "epoch": 0.04449278228198537, + "step": 450, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.04449278228198537, + "step": 450, + "train/total_loss": 0.16604116559028625 + }, + { + "entropy": 9.944259643554688, + "epoch": 0.04459165513150089, + "mean_token_accuracy": 0.7213114500045776, + "num_tokens": 2355951.0, + "step": 451, + "train/ce_loss": 1.1396982669830322 + }, + { + "epoch": 0.04459165513150089, + "step": 451, + "train/sim_loss": 0.15234375 + }, + { + "epoch": 0.04459165513150089, + "step": 451, + "train/total_loss": 0.2663135826587677 + }, + { + "entropy": 9.422792434692383, + "epoch": 0.044690527981016415, + "mean_token_accuracy": 0.7263888716697693, + "num_tokens": 2361143.0, + "step": 452, + "train/ce_loss": 0.7414805889129639 + }, + { + "epoch": 0.044690527981016415, + "step": 452, + "train/sim_loss": 0.16015625 + }, + { + "epoch": 0.044690527981016415, + "step": 452, + "train/total_loss": 0.2343043088912964 + }, + { + "entropy": 9.042624473571777, + "epoch": 0.044789400830531935, + "mean_token_accuracy": 0.6950430870056152, + "num_tokens": 2366528.0, + "step": 453, + "train/ce_loss": 1.3365452289581299 + }, + { + "epoch": 0.044789400830531935, + "step": 453, + "train/sim_loss": 0.1796875 + }, + { + "epoch": 0.044789400830531935, + "step": 453, + "train/total_loss": 0.31334203481674194 + }, + { + "entropy": 9.04693603515625, + "epoch": 0.04488827368004746, + "mean_token_accuracy": 0.6670415997505188, + "num_tokens": 2371927.0, + "step": 454, + "train/ce_loss": 0.7594133019447327 + }, + { + "epoch": 0.04488827368004746, + "step": 454, + "train/sim_loss": 0.15234375 + }, + { + "epoch": 0.04488827368004746, + "step": 454, + "train/total_loss": 0.2282850742340088 + }, + { + "entropy": 9.376440048217773, + "epoch": 0.04498714652956298, + "mean_token_accuracy": 0.667117714881897, + "num_tokens": 2377192.0, + "step": 455, + "train/ce_loss": 0.8834444880485535 + }, + { + "epoch": 0.04498714652956298, + "step": 455, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.04498714652956298, + "step": 455, + "train/total_loss": 0.20162570476531982 + }, + { + "entropy": 9.437824249267578, + "epoch": 0.0450860193790785, + "mean_token_accuracy": 0.7165775299072266, + "num_tokens": 2382435.0, + "step": 456, + "train/ce_loss": 0.956652820110321 + }, + { + "epoch": 0.0450860193790785, + "step": 456, + "train/sim_loss": 0.19921875 + }, + { + "epoch": 0.0450860193790785, + "step": 456, + "train/total_loss": 0.2948840260505676 + }, + { + "entropy": 9.413591384887695, + "epoch": 0.04518489222859403, + "mean_token_accuracy": 0.7445520758628845, + "num_tokens": 2387728.0, + "step": 457, + "train/ce_loss": 0.6375630497932434 + }, + { + "epoch": 0.04518489222859403, + "step": 457, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.04518489222859403, + "step": 457, + "train/total_loss": 0.1184438094496727 + }, + { + "entropy": 9.387490272521973, + "epoch": 0.04528376507810955, + "mean_token_accuracy": 0.7684478163719177, + "num_tokens": 2392969.0, + "step": 458, + "train/ce_loss": 0.643700897693634 + }, + { + "epoch": 0.04528376507810955, + "step": 458, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.04528376507810955, + "step": 458, + "train/total_loss": 0.20499509572982788 + }, + { + "entropy": 9.37730598449707, + "epoch": 0.04538263792762508, + "mean_token_accuracy": 0.7394366264343262, + "num_tokens": 2398500.0, + "step": 459, + "train/ce_loss": 0.8843984007835388 + }, + { + "epoch": 0.04538263792762508, + "step": 459, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.04538263792762508, + "step": 459, + "train/total_loss": 0.21734610199928284 + }, + { + "epoch": 0.0454815107771406, + "grad_norm": 1.35333251953125, + "learning_rate": 9.88898778618405e-06, + "loss": 0.2168, + "step": 460 + }, + { + "entropy": 9.22109317779541, + "epoch": 0.0454815107771406, + "mean_token_accuracy": 0.7326968908309937, + "num_tokens": 2403843.0, + "step": 460, + "train/ce_loss": 1.2081056833267212 + }, + { + "epoch": 0.0454815107771406, + "step": 460, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.0454815107771406, + "step": 460, + "train/total_loss": 0.22237306833267212 + }, + { + "entropy": 9.541427612304688, + "epoch": 0.04558038362665612, + "mean_token_accuracy": 0.7227214574813843, + "num_tokens": 2409094.0, + "step": 461, + "train/ce_loss": 1.2430551052093506 + }, + { + "epoch": 0.04558038362665612, + "step": 461, + "train/sim_loss": 0.1640625 + }, + { + "epoch": 0.04558038362665612, + "step": 461, + "train/total_loss": 0.28836801648139954 + }, + { + "entropy": 8.860454559326172, + "epoch": 0.045679256476171645, + "mean_token_accuracy": 0.7073863744735718, + "num_tokens": 2414702.0, + "step": 462, + "train/ce_loss": 1.036400318145752 + }, + { + "epoch": 0.045679256476171645, + "step": 462, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.045679256476171645, + "step": 462, + "train/total_loss": 0.22864003479480743 + }, + { + "entropy": 9.177661895751953, + "epoch": 0.045778129325687165, + "mean_token_accuracy": 0.7801822423934937, + "num_tokens": 2420073.0, + "step": 463, + "train/ce_loss": 1.0977107286453247 + }, + { + "epoch": 0.045778129325687165, + "step": 463, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.045778129325687165, + "step": 463, + "train/total_loss": 0.23867732286453247 + }, + { + "entropy": 9.636924743652344, + "epoch": 0.04587700217520269, + "mean_token_accuracy": 0.7410179376602173, + "num_tokens": 2425227.0, + "step": 464, + "train/ce_loss": 0.7735141515731812 + }, + { + "epoch": 0.04587700217520269, + "step": 464, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.04587700217520269, + "step": 464, + "train/total_loss": 0.1984451711177826 + }, + { + "entropy": 10.003236770629883, + "epoch": 0.04597587502471821, + "mean_token_accuracy": 0.7064220309257507, + "num_tokens": 2430139.0, + "step": 465, + "train/ce_loss": 0.0809466615319252 + }, + { + "epoch": 0.04597587502471821, + "step": 465, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.04597587502471821, + "step": 465, + "train/total_loss": 0.0666884183883667 + }, + { + "entropy": 9.566146850585938, + "epoch": 0.04607474787423373, + "mean_token_accuracy": 0.7546699643135071, + "num_tokens": 2435381.0, + "step": 466, + "train/ce_loss": 0.9219437837600708 + }, + { + "epoch": 0.04607474787423373, + "step": 466, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.04607474787423373, + "step": 466, + "train/total_loss": 0.22891312837600708 + }, + { + "entropy": 9.887555122375488, + "epoch": 0.04617362072374926, + "mean_token_accuracy": 0.801996648311615, + "num_tokens": 2440437.0, + "step": 467, + "train/ce_loss": 0.07591411471366882 + }, + { + "epoch": 0.04617362072374926, + "step": 467, + "train/sim_loss": 0.1875 + }, + { + "epoch": 0.04617362072374926, + "step": 467, + "train/total_loss": 0.19509141147136688 + }, + { + "entropy": 9.326565742492676, + "epoch": 0.04627249357326478, + "mean_token_accuracy": 0.7552836537361145, + "num_tokens": 2445733.0, + "step": 468, + "train/ce_loss": 1.1807743310928345 + }, + { + "epoch": 0.04627249357326478, + "step": 468, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.04627249357326478, + "step": 468, + "train/total_loss": 0.22745242714881897 + }, + { + "entropy": 9.14306640625, + "epoch": 0.04637136642278031, + "mean_token_accuracy": 0.7111356258392334, + "num_tokens": 2451137.0, + "step": 469, + "train/ce_loss": 1.2142341136932373 + }, + { + "epoch": 0.04637136642278031, + "step": 469, + "train/sim_loss": 0.17578125 + }, + { + "epoch": 0.04637136642278031, + "step": 469, + "train/total_loss": 0.2972046732902527 + }, + { + "entropy": 9.252565383911133, + "epoch": 0.04647023927229583, + "mean_token_accuracy": 0.6867052316665649, + "num_tokens": 2456435.0, + "step": 470, + "train/ce_loss": 1.1048601865768433 + }, + { + "epoch": 0.04647023927229583, + "step": 470, + "train/sim_loss": 0.1953125 + }, + { + "epoch": 0.04647023927229583, + "step": 470, + "train/total_loss": 0.3057985305786133 + }, + { + "entropy": 9.195484161376953, + "epoch": 0.04656911212181135, + "mean_token_accuracy": 0.7412935495376587, + "num_tokens": 2461679.0, + "step": 471, + "train/ce_loss": 0.7700126767158508 + }, + { + "epoch": 0.04656911212181135, + "step": 471, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.04656911212181135, + "step": 471, + "train/total_loss": 0.17856377363204956 + }, + { + "entropy": 9.602058410644531, + "epoch": 0.046667984971326876, + "mean_token_accuracy": 0.7310252785682678, + "num_tokens": 2467055.0, + "step": 472, + "train/ce_loss": 1.0718854665756226 + }, + { + "epoch": 0.046667984971326876, + "step": 472, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.046667984971326876, + "step": 472, + "train/total_loss": 0.22437605261802673 + }, + { + "entropy": 9.021293640136719, + "epoch": 0.046766857820842396, + "mean_token_accuracy": 0.7156549692153931, + "num_tokens": 2472511.0, + "step": 473, + "train/ce_loss": 1.4495042562484741 + }, + { + "epoch": 0.046766857820842396, + "step": 473, + "train/sim_loss": 0.14453125 + }, + { + "epoch": 0.046766857820842396, + "step": 473, + "train/total_loss": 0.2894816994667053 + }, + { + "entropy": 10.21648120880127, + "epoch": 0.046865730670357916, + "mean_token_accuracy": 0.729468584060669, + "num_tokens": 2477402.0, + "step": 474, + "train/ce_loss": 0.11186335980892181 + }, + { + "epoch": 0.046865730670357916, + "step": 474, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.046865730670357916, + "step": 474, + "train/total_loss": 0.11274883896112442 + }, + { + "entropy": 10.212539672851562, + "epoch": 0.046964603519873444, + "mean_token_accuracy": 0.6848635077476501, + "num_tokens": 2482244.0, + "step": 475, + "train/ce_loss": 2.5846259593963623 + }, + { + "epoch": 0.046964603519873444, + "step": 475, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.046964603519873444, + "step": 475, + "train/total_loss": 0.3639313578605652 + }, + { + "entropy": 9.680295944213867, + "epoch": 0.047063476369388964, + "mean_token_accuracy": 0.7712329030036926, + "num_tokens": 2487419.0, + "step": 476, + "train/ce_loss": 1.2396368980407715 + }, + { + "epoch": 0.047063476369388964, + "step": 476, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.047063476369388964, + "step": 476, + "train/total_loss": 0.24896368384361267 + }, + { + "entropy": 9.602041244506836, + "epoch": 0.04716234921890449, + "mean_token_accuracy": 0.6979591846466064, + "num_tokens": 2492585.0, + "step": 477, + "train/ce_loss": 1.0866364240646362 + }, + { + "epoch": 0.04716234921890449, + "step": 477, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.04716234921890449, + "step": 477, + "train/total_loss": 0.2414761483669281 + }, + { + "entropy": 9.360517501831055, + "epoch": 0.04726122206842001, + "mean_token_accuracy": 0.746760904788971, + "num_tokens": 2497904.0, + "step": 478, + "train/ce_loss": 0.7504639029502869 + }, + { + "epoch": 0.04726122206842001, + "step": 478, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.04726122206842001, + "step": 478, + "train/total_loss": 0.1297338902950287 + }, + { + "entropy": 9.118058204650879, + "epoch": 0.04736009491793553, + "mean_token_accuracy": 0.7364264726638794, + "num_tokens": 2503342.0, + "step": 479, + "train/ce_loss": 0.8444780707359314 + }, + { + "epoch": 0.04736009491793553, + "step": 479, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.04736009491793553, + "step": 479, + "train/total_loss": 0.20163530111312866 + }, + { + "epoch": 0.04745896776745106, + "grad_norm": 1.105547308921814, + "learning_rate": 9.8840429214261e-06, + "loss": 0.2074, + "step": 480 + }, + { + "entropy": 9.979605674743652, + "epoch": 0.04745896776745106, + "mean_token_accuracy": 0.77173912525177, + "num_tokens": 2508154.0, + "step": 480, + "train/ce_loss": 1.515071988105774 + }, + { + "epoch": 0.04745896776745106, + "step": 480, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.04745896776745106, + "step": 480, + "train/total_loss": 0.2022884488105774 + }, + { + "entropy": 9.231979370117188, + "epoch": 0.04755784061696658, + "mean_token_accuracy": 0.7763440608978271, + "num_tokens": 2513512.0, + "step": 481, + "train/ce_loss": 0.8422334790229797 + }, + { + "epoch": 0.04755784061696658, + "step": 481, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.04755784061696658, + "step": 481, + "train/total_loss": 0.13500460982322693 + }, + { + "entropy": 10.054229736328125, + "epoch": 0.047656713466482106, + "mean_token_accuracy": 0.748106062412262, + "num_tokens": 2518458.0, + "step": 482, + "train/ce_loss": 1.097081184387207 + }, + { + "epoch": 0.047656713466482106, + "step": 482, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.047656713466482106, + "step": 482, + "train/total_loss": 0.18783313035964966 + }, + { + "entropy": 9.745567321777344, + "epoch": 0.04775558631599763, + "mean_token_accuracy": 0.6627907156944275, + "num_tokens": 2523459.0, + "step": 483, + "train/ce_loss": 1.2584025859832764 + }, + { + "epoch": 0.04775558631599763, + "step": 483, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.04775558631599763, + "step": 483, + "train/total_loss": 0.18052776157855988 + }, + { + "entropy": 10.35242748260498, + "epoch": 0.04785445916551315, + "mean_token_accuracy": 0.7480719685554504, + "num_tokens": 2528257.0, + "step": 484, + "train/ce_loss": 2.1718273162841797 + }, + { + "epoch": 0.04785445916551315, + "step": 484, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.04785445916551315, + "step": 484, + "train/total_loss": 0.3109327554702759 + }, + { + "entropy": 9.667442321777344, + "epoch": 0.047953332015028674, + "mean_token_accuracy": 0.7165242433547974, + "num_tokens": 2533434.0, + "step": 485, + "train/ce_loss": 1.0756824016571045 + }, + { + "epoch": 0.047953332015028674, + "step": 485, + "train/sim_loss": 0.171875 + }, + { + "epoch": 0.047953332015028674, + "step": 485, + "train/total_loss": 0.27944323420524597 + }, + { + "entropy": 9.372124671936035, + "epoch": 0.048052204864544194, + "mean_token_accuracy": 0.744508683681488, + "num_tokens": 2538761.0, + "step": 486, + "train/ce_loss": 1.0770900249481201 + }, + { + "epoch": 0.048052204864544194, + "step": 486, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.048052204864544194, + "step": 486, + "train/total_loss": 0.23270900547504425 + }, + { + "entropy": 9.491572380065918, + "epoch": 0.04815107771405972, + "mean_token_accuracy": 0.7078787684440613, + "num_tokens": 2544017.0, + "step": 487, + "train/ce_loss": 0.5395680069923401 + }, + { + "epoch": 0.04815107771405972, + "step": 487, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.04815107771405972, + "step": 487, + "train/total_loss": 0.1633318066596985 + }, + { + "entropy": 8.881023406982422, + "epoch": 0.04824995056357524, + "mean_token_accuracy": 0.680672287940979, + "num_tokens": 2549587.0, + "step": 488, + "train/ce_loss": 1.331809163093567 + }, + { + "epoch": 0.04824995056357524, + "step": 488, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.04824995056357524, + "step": 488, + "train/total_loss": 0.2542746663093567 + }, + { + "entropy": 9.483545303344727, + "epoch": 0.04834882341309076, + "mean_token_accuracy": 0.7666666507720947, + "num_tokens": 2554754.0, + "step": 489, + "train/ce_loss": 0.055041637271642685 + }, + { + "epoch": 0.04834882341309076, + "step": 489, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.04834882341309076, + "step": 489, + "train/total_loss": 0.13050416111946106 + }, + { + "entropy": 9.063933372497559, + "epoch": 0.04844769626260629, + "mean_token_accuracy": 0.6822529435157776, + "num_tokens": 2560222.0, + "step": 490, + "train/ce_loss": 1.1069523096084595 + }, + { + "epoch": 0.04844769626260629, + "step": 490, + "train/sim_loss": 0.1640625 + }, + { + "epoch": 0.04844769626260629, + "step": 490, + "train/total_loss": 0.2747577428817749 + }, + { + "entropy": 9.450166702270508, + "epoch": 0.04854656911212181, + "mean_token_accuracy": 0.7581775784492493, + "num_tokens": 2565512.0, + "step": 491, + "train/ce_loss": 1.0669002532958984 + }, + { + "epoch": 0.04854656911212181, + "step": 491, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.04854656911212181, + "step": 491, + "train/total_loss": 0.18872126936912537 + }, + { + "entropy": 9.656312942504883, + "epoch": 0.04864544196163734, + "mean_token_accuracy": 0.6796992421150208, + "num_tokens": 2570648.0, + "step": 492, + "train/ce_loss": 1.0691183805465698 + }, + { + "epoch": 0.04864544196163734, + "step": 492, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.04864544196163734, + "step": 492, + "train/total_loss": 0.20847433805465698 + }, + { + "entropy": 9.513638496398926, + "epoch": 0.04874431481115286, + "mean_token_accuracy": 0.7193675637245178, + "num_tokens": 2575927.0, + "step": 493, + "train/ce_loss": 0.8813692331314087 + }, + { + "epoch": 0.04874431481115286, + "step": 493, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.04874431481115286, + "step": 493, + "train/total_loss": 0.2092306762933731 + }, + { + "entropy": 9.012145042419434, + "epoch": 0.04884318766066838, + "mean_token_accuracy": 0.7292340993881226, + "num_tokens": 2581273.0, + "step": 494, + "train/ce_loss": 0.643973708152771 + }, + { + "epoch": 0.04884318766066838, + "step": 494, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.04884318766066838, + "step": 494, + "train/total_loss": 0.12299112230539322 + }, + { + "entropy": 9.455644607543945, + "epoch": 0.048942060510183905, + "mean_token_accuracy": 0.6707482933998108, + "num_tokens": 2586482.0, + "step": 495, + "train/ce_loss": 1.2485941648483276 + }, + { + "epoch": 0.048942060510183905, + "step": 495, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.048942060510183905, + "step": 495, + "train/total_loss": 0.24204692244529724 + }, + { + "entropy": 9.610002517700195, + "epoch": 0.049040933359699425, + "mean_token_accuracy": 0.7510373592376709, + "num_tokens": 2591696.0, + "step": 496, + "train/ce_loss": 0.8027147650718689 + }, + { + "epoch": 0.049040933359699425, + "step": 496, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.049040933359699425, + "step": 496, + "train/total_loss": 0.13495898246765137 + }, + { + "entropy": 10.199856758117676, + "epoch": 0.04913980620921495, + "mean_token_accuracy": 0.6554054021835327, + "num_tokens": 2596533.0, + "step": 497, + "train/ce_loss": 0.09949162602424622 + }, + { + "epoch": 0.04913980620921495, + "step": 497, + "train/sim_loss": 0.171875 + }, + { + "epoch": 0.04913980620921495, + "step": 497, + "train/total_loss": 0.18182416260242462 + }, + { + "entropy": 9.396942138671875, + "epoch": 0.04923867905873047, + "mean_token_accuracy": 0.6440251469612122, + "num_tokens": 2601827.0, + "step": 498, + "train/ce_loss": 0.9401248097419739 + }, + { + "epoch": 0.04923867905873047, + "step": 498, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.04923867905873047, + "step": 498, + "train/total_loss": 0.18385623395442963 + }, + { + "entropy": 9.42705249786377, + "epoch": 0.04933755190824599, + "mean_token_accuracy": 0.6958277225494385, + "num_tokens": 2607043.0, + "step": 499, + "train/ce_loss": 1.1366710662841797 + }, + { + "epoch": 0.04933755190824599, + "step": 499, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.04933755190824599, + "step": 499, + "train/total_loss": 0.1761671006679535 + }, + { + "epoch": 0.04943642475776152, + "grad_norm": 1.5242397785186768, + "learning_rate": 9.87909805666815e-06, + "loss": 0.2099, + "step": 500 + }, + { + "entropy": 9.39438247680664, + "epoch": 0.04943642475776152, + "mean_token_accuracy": 0.6682521104812622, + "num_tokens": 2612307.0, + "step": 500, + "train/ce_loss": 1.0083014965057373 + }, + { + "epoch": 0.04943642475776152, + "step": 500, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.04943642475776152, + "step": 500, + "train/total_loss": 0.18676765263080597 + }, + { + "entropy": 9.212362289428711, + "epoch": 0.04953529760727704, + "mean_token_accuracy": 0.7669584155082703, + "num_tokens": 2617734.0, + "step": 501, + "train/ce_loss": 0.5354273915290833 + }, + { + "epoch": 0.04953529760727704, + "step": 501, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.04953529760727704, + "step": 501, + "train/total_loss": 0.15901148319244385 + }, + { + "entropy": 9.426509857177734, + "epoch": 0.04963417045679257, + "mean_token_accuracy": 0.7060890197753906, + "num_tokens": 2623043.0, + "step": 502, + "train/ce_loss": 2.2972195148468018 + }, + { + "epoch": 0.04963417045679257, + "step": 502, + "train/sim_loss": 0.3046875 + }, + { + "epoch": 0.04963417045679257, + "step": 502, + "train/total_loss": 0.5344094634056091 + }, + { + "entropy": 9.411643981933594, + "epoch": 0.04973304330630809, + "mean_token_accuracy": 0.7471839785575867, + "num_tokens": 2628337.0, + "step": 503, + "train/ce_loss": 0.5850355625152588 + }, + { + "epoch": 0.04973304330630809, + "step": 503, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.04973304330630809, + "step": 503, + "train/total_loss": 0.10537855327129364 + }, + { + "entropy": 9.113931655883789, + "epoch": 0.04983191615582361, + "mean_token_accuracy": 0.748400866985321, + "num_tokens": 2633762.0, + "step": 504, + "train/ce_loss": 0.8196927905082703 + }, + { + "epoch": 0.04983191615582361, + "step": 504, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.04983191615582361, + "step": 504, + "train/total_loss": 0.17962554097175598 + }, + { + "entropy": 9.683710098266602, + "epoch": 0.049930789005339135, + "mean_token_accuracy": 0.7558479309082031, + "num_tokens": 2638905.0, + "step": 505, + "train/ce_loss": 1.0427159070968628 + }, + { + "epoch": 0.049930789005339135, + "step": 505, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.049930789005339135, + "step": 505, + "train/total_loss": 0.21364659070968628 + }, + { + "entropy": 9.304154396057129, + "epoch": 0.050029661854854655, + "mean_token_accuracy": 0.7181817889213562, + "num_tokens": 2644217.0, + "step": 506, + "train/ce_loss": 0.917572557926178 + }, + { + "epoch": 0.050029661854854655, + "step": 506, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.050029661854854655, + "step": 506, + "train/total_loss": 0.15035101771354675 + }, + { + "entropy": 9.906225204467773, + "epoch": 0.05012853470437018, + "mean_token_accuracy": 0.754749596118927, + "num_tokens": 2649187.0, + "step": 507, + "train/ce_loss": 1.5441699028015137 + }, + { + "epoch": 0.05012853470437018, + "step": 507, + "train/sim_loss": 0.16015625 + }, + { + "epoch": 0.05012853470437018, + "step": 507, + "train/total_loss": 0.3145732283592224 + }, + { + "entropy": 10.101706504821777, + "epoch": 0.0502274075538857, + "mean_token_accuracy": 0.7184466123580933, + "num_tokens": 2653966.0, + "step": 508, + "train/ce_loss": 0.10800564289093018 + }, + { + "epoch": 0.0502274075538857, + "step": 508, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.0502274075538857, + "step": 508, + "train/total_loss": 0.0733005627989769 + }, + { + "entropy": 9.47825813293457, + "epoch": 0.05032628040340122, + "mean_token_accuracy": 0.7415143847465515, + "num_tokens": 2659143.0, + "step": 509, + "train/ce_loss": 0.05864207446575165 + }, + { + "epoch": 0.05032628040340122, + "step": 509, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.05032628040340122, + "step": 509, + "train/total_loss": 0.0800829604268074 + }, + { + "entropy": 9.604631423950195, + "epoch": 0.05042515325291675, + "mean_token_accuracy": 0.6968838572502136, + "num_tokens": 2664325.0, + "step": 510, + "train/ce_loss": 0.48100969195365906 + }, + { + "epoch": 0.05042515325291675, + "step": 510, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.05042515325291675, + "step": 510, + "train/total_loss": 0.14966347813606262 + }, + { + "entropy": 8.994229316711426, + "epoch": 0.05052402610243227, + "mean_token_accuracy": 0.6837257146835327, + "num_tokens": 2669802.0, + "step": 511, + "train/ce_loss": 1.3078932762145996 + }, + { + "epoch": 0.05052402610243227, + "step": 511, + "train/sim_loss": 0.1875 + }, + { + "epoch": 0.05052402610243227, + "step": 511, + "train/total_loss": 0.3182893395423889 + }, + { + "entropy": 9.30910587310791, + "epoch": 0.0506228989519478, + "mean_token_accuracy": 0.68727707862854, + "num_tokens": 2675131.0, + "step": 512, + "train/ce_loss": 1.0803829431533813 + }, + { + "epoch": 0.0506228989519478, + "step": 512, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.0506228989519478, + "step": 512, + "train/total_loss": 0.1900695562362671 + }, + { + "entropy": 9.34923267364502, + "epoch": 0.05072177180146332, + "mean_token_accuracy": 0.7068965435028076, + "num_tokens": 2680306.0, + "step": 513, + "train/ce_loss": 0.8655760288238525 + }, + { + "epoch": 0.05072177180146332, + "step": 513, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.05072177180146332, + "step": 513, + "train/total_loss": 0.19983884692192078 + }, + { + "entropy": 9.359048843383789, + "epoch": 0.05082064465097884, + "mean_token_accuracy": 0.7277227640151978, + "num_tokens": 2685562.0, + "step": 514, + "train/ce_loss": 0.9464130401611328 + }, + { + "epoch": 0.05082064465097884, + "step": 514, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.05082064465097884, + "step": 514, + "train/total_loss": 0.1454225480556488 + }, + { + "entropy": 9.247976303100586, + "epoch": 0.050919517500494366, + "mean_token_accuracy": 0.7032474875450134, + "num_tokens": 2690926.0, + "step": 515, + "train/ce_loss": 0.6532353162765503 + }, + { + "epoch": 0.050919517500494366, + "step": 515, + "train/sim_loss": 0.14453125 + }, + { + "epoch": 0.050919517500494366, + "step": 515, + "train/total_loss": 0.20985478162765503 + }, + { + "entropy": 9.181419372558594, + "epoch": 0.051018390350009886, + "mean_token_accuracy": 0.7311608791351318, + "num_tokens": 2696367.0, + "step": 516, + "train/ce_loss": 1.1315739154815674 + }, + { + "epoch": 0.051018390350009886, + "step": 516, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.051018390350009886, + "step": 516, + "train/total_loss": 0.22253239154815674 + }, + { + "entropy": 9.452371597290039, + "epoch": 0.05111726319952541, + "mean_token_accuracy": 0.8046242594718933, + "num_tokens": 2701822.0, + "step": 517, + "train/ce_loss": 0.04842658340930939 + }, + { + "epoch": 0.05111726319952541, + "step": 517, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.05111726319952541, + "step": 517, + "train/total_loss": 0.11421766132116318 + }, + { + "entropy": 9.96130084991455, + "epoch": 0.05121613604904093, + "mean_token_accuracy": 0.7293497323989868, + "num_tokens": 2706837.0, + "step": 518, + "train/ce_loss": 1.3989778757095337 + }, + { + "epoch": 0.05121613604904093, + "step": 518, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.05121613604904093, + "step": 518, + "train/total_loss": 0.24927279353141785 + }, + { + "entropy": 9.840981483459473, + "epoch": 0.051315008898556454, + "mean_token_accuracy": 0.7715654969215393, + "num_tokens": 2711892.0, + "step": 519, + "train/ce_loss": 0.07052932679653168 + }, + { + "epoch": 0.051315008898556454, + "step": 519, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.051315008898556454, + "step": 519, + "train/total_loss": 0.1320529282093048 + }, + { + "epoch": 0.05141388174807198, + "grad_norm": 1.1278705596923828, + "learning_rate": 9.874153191910203e-06, + "loss": 0.1989, + "step": 520 + }, + { + "entropy": 10.010367393493652, + "epoch": 0.05141388174807198, + "mean_token_accuracy": 0.695652186870575, + "num_tokens": 2716867.0, + "step": 520, + "train/ce_loss": 1.4642590284347534 + }, + { + "epoch": 0.05141388174807198, + "step": 520, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.05141388174807198, + "step": 520, + "train/total_loss": 0.19330090284347534 + }, + { + "entropy": 9.677255630493164, + "epoch": 0.0515127545975875, + "mean_token_accuracy": 0.7307132482528687, + "num_tokens": 2722025.0, + "step": 521, + "train/ce_loss": 0.8722675442695618 + }, + { + "epoch": 0.0515127545975875, + "step": 521, + "train/sim_loss": 0.16796875 + }, + { + "epoch": 0.0515127545975875, + "step": 521, + "train/total_loss": 0.2551954984664917 + }, + { + "entropy": 10.041577339172363, + "epoch": 0.05161162744710303, + "mean_token_accuracy": 0.7870370149612427, + "num_tokens": 2726902.0, + "step": 522, + "train/ce_loss": 1.583846092224121 + }, + { + "epoch": 0.05161162744710303, + "step": 522, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.05161162744710303, + "step": 522, + "train/total_loss": 0.24822835624217987 + }, + { + "entropy": 9.485618591308594, + "epoch": 0.05171050029661855, + "mean_token_accuracy": 0.6848484873771667, + "num_tokens": 2732201.0, + "step": 523, + "train/ce_loss": 1.1445075273513794 + }, + { + "epoch": 0.05171050029661855, + "step": 523, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.05171050029661855, + "step": 523, + "train/total_loss": 0.22773200273513794 + }, + { + "entropy": 9.58382797241211, + "epoch": 0.05180937314613407, + "mean_token_accuracy": 0.7578796744346619, + "num_tokens": 2737328.0, + "step": 524, + "train/ce_loss": 0.06139937788248062 + }, + { + "epoch": 0.05180937314613407, + "step": 524, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.05180937314613407, + "step": 524, + "train/total_loss": 0.05301493778824806 + }, + { + "entropy": 9.250560760498047, + "epoch": 0.051908245995649596, + "mean_token_accuracy": 0.6807563900947571, + "num_tokens": 2742615.0, + "step": 525, + "train/ce_loss": 0.8151491284370422 + }, + { + "epoch": 0.051908245995649596, + "step": 525, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.051908245995649596, + "step": 525, + "train/total_loss": 0.15182742476463318 + }, + { + "entropy": 9.5101318359375, + "epoch": 0.052007118845165116, + "mean_token_accuracy": 0.7257484793663025, + "num_tokens": 2748071.0, + "step": 526, + "train/ce_loss": 0.8393839597702026 + }, + { + "epoch": 0.052007118845165116, + "step": 526, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.052007118845165116, + "step": 526, + "train/total_loss": 0.1581571400165558 + }, + { + "entropy": 9.4252347946167, + "epoch": 0.052105991694680644, + "mean_token_accuracy": 0.743658185005188, + "num_tokens": 2753307.0, + "step": 527, + "train/ce_loss": 0.6566883325576782 + }, + { + "epoch": 0.052105991694680644, + "step": 527, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.052105991694680644, + "step": 527, + "train/total_loss": 0.16332508623600006 + }, + { + "entropy": 9.265995979309082, + "epoch": 0.052204864544196164, + "mean_token_accuracy": 0.741360068321228, + "num_tokens": 2758632.0, + "step": 528, + "train/ce_loss": 0.4899671971797943 + }, + { + "epoch": 0.052204864544196164, + "step": 528, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.052204864544196164, + "step": 528, + "train/total_loss": 0.1661842167377472 + }, + { + "entropy": 9.533434867858887, + "epoch": 0.052303737393711684, + "mean_token_accuracy": 0.6966145634651184, + "num_tokens": 2763831.0, + "step": 529, + "train/ce_loss": 0.7207930684089661 + }, + { + "epoch": 0.052303737393711684, + "step": 529, + "train/sim_loss": 0.171875 + }, + { + "epoch": 0.052303737393711684, + "step": 529, + "train/total_loss": 0.24395430088043213 + }, + { + "entropy": 8.904180526733398, + "epoch": 0.05240261024322721, + "mean_token_accuracy": 0.739051103591919, + "num_tokens": 2769420.0, + "step": 530, + "train/ce_loss": 0.6011399030685425 + }, + { + "epoch": 0.05240261024322721, + "step": 530, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.05240261024322721, + "step": 530, + "train/total_loss": 0.10698899626731873 + }, + { + "entropy": 9.722763061523438, + "epoch": 0.05250148309274273, + "mean_token_accuracy": 0.7542504072189331, + "num_tokens": 2774496.0, + "step": 531, + "train/ce_loss": 1.466348648071289 + }, + { + "epoch": 0.05250148309274273, + "step": 531, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.05250148309274273, + "step": 531, + "train/total_loss": 0.23647861182689667 + }, + { + "entropy": 9.23657512664795, + "epoch": 0.05260035594225826, + "mean_token_accuracy": 0.7497048377990723, + "num_tokens": 2779765.0, + "step": 532, + "train/ce_loss": 1.1639745235443115 + }, + { + "epoch": 0.05260035594225826, + "step": 532, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.05260035594225826, + "step": 532, + "train/total_loss": 0.2257724553346634 + }, + { + "entropy": 9.56563663482666, + "epoch": 0.05269922879177378, + "mean_token_accuracy": 0.7853403091430664, + "num_tokens": 2784999.0, + "step": 533, + "train/ce_loss": 0.7111084461212158 + }, + { + "epoch": 0.05269922879177378, + "step": 533, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.05269922879177378, + "step": 533, + "train/total_loss": 0.20001709461212158 + }, + { + "entropy": 9.463350296020508, + "epoch": 0.0527981016412893, + "mean_token_accuracy": 0.6979695558547974, + "num_tokens": 2790274.0, + "step": 534, + "train/ce_loss": 0.8471155166625977 + }, + { + "epoch": 0.0527981016412893, + "step": 534, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.0527981016412893, + "step": 534, + "train/total_loss": 0.16674280166625977 + }, + { + "entropy": 10.123266220092773, + "epoch": 0.05289697449080483, + "mean_token_accuracy": 0.7316017150878906, + "num_tokens": 2795156.0, + "step": 535, + "train/ce_loss": 1.0770008563995361 + }, + { + "epoch": 0.05289697449080483, + "step": 535, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.05289697449080483, + "step": 535, + "train/total_loss": 0.15457507967948914 + }, + { + "entropy": 10.131725311279297, + "epoch": 0.05299584734032035, + "mean_token_accuracy": 0.7266514897346497, + "num_tokens": 2800024.0, + "step": 536, + "train/ce_loss": 0.10147576034069061 + }, + { + "epoch": 0.05299584734032035, + "step": 536, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.05299584734032035, + "step": 536, + "train/total_loss": 0.1078038290143013 + }, + { + "entropy": 9.347770690917969, + "epoch": 0.053094720189835874, + "mean_token_accuracy": 0.6723237633705139, + "num_tokens": 2805230.0, + "step": 537, + "train/ce_loss": 0.8107247948646545 + }, + { + "epoch": 0.053094720189835874, + "step": 537, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.053094720189835874, + "step": 537, + "train/total_loss": 0.21388497948646545 + }, + { + "entropy": 9.418624877929688, + "epoch": 0.053193593039351394, + "mean_token_accuracy": 0.7631224989891052, + "num_tokens": 2810488.0, + "step": 538, + "train/ce_loss": 0.7745002508163452 + }, + { + "epoch": 0.053193593039351394, + "step": 538, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.053193593039351394, + "step": 538, + "train/total_loss": 0.21416878700256348 + }, + { + "entropy": 9.554218292236328, + "epoch": 0.053292465888866915, + "mean_token_accuracy": 0.719298243522644, + "num_tokens": 2815640.0, + "step": 539, + "train/ce_loss": 1.0348252058029175 + }, + { + "epoch": 0.053292465888866915, + "step": 539, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.053292465888866915, + "step": 539, + "train/total_loss": 0.20113876461982727 + }, + { + "epoch": 0.05339133873838244, + "grad_norm": 1.2608516216278076, + "learning_rate": 9.869208327152253e-06, + "loss": 0.1932, + "step": 540 + }, + { + "entropy": 9.439250946044922, + "epoch": 0.05339133873838244, + "mean_token_accuracy": 0.6946264505386353, + "num_tokens": 2820919.0, + "step": 540, + "train/ce_loss": 1.2679036855697632 + }, + { + "epoch": 0.05339133873838244, + "step": 540, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.05339133873838244, + "step": 540, + "train/total_loss": 0.1931966245174408 + }, + { + "entropy": 9.167460441589355, + "epoch": 0.05349021158789796, + "mean_token_accuracy": 0.7379454970359802, + "num_tokens": 2826540.0, + "step": 541, + "train/ce_loss": 0.576278030872345 + }, + { + "epoch": 0.05349021158789796, + "step": 541, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.05349021158789796, + "step": 541, + "train/total_loss": 0.20606529712677002 + }, + { + "entropy": 9.229609489440918, + "epoch": 0.05358908443741349, + "mean_token_accuracy": 0.805038332939148, + "num_tokens": 2831934.0, + "step": 542, + "train/ce_loss": 0.5699486136436462 + }, + { + "epoch": 0.05358908443741349, + "step": 542, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.05358908443741349, + "step": 542, + "train/total_loss": 0.11168236285448074 + }, + { + "entropy": 8.880252838134766, + "epoch": 0.05368795728692901, + "mean_token_accuracy": 0.765531063079834, + "num_tokens": 2837431.0, + "step": 543, + "train/ce_loss": 0.5533732175827026 + }, + { + "epoch": 0.05368795728692901, + "step": 543, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.05368795728692901, + "step": 543, + "train/total_loss": 0.1100248247385025 + }, + { + "entropy": 9.366069793701172, + "epoch": 0.05378683013644453, + "mean_token_accuracy": 0.6319176554679871, + "num_tokens": 2842676.0, + "step": 544, + "train/ce_loss": 1.4185377359390259 + }, + { + "epoch": 0.05378683013644453, + "step": 544, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.05378683013644453, + "step": 544, + "train/total_loss": 0.23951002955436707 + }, + { + "entropy": 10.311344146728516, + "epoch": 0.05388570298596006, + "mean_token_accuracy": 0.7603305578231812, + "num_tokens": 2847451.0, + "step": 545, + "train/ce_loss": 1.1562058925628662 + }, + { + "epoch": 0.05388570298596006, + "step": 545, + "train/sim_loss": 0.265625 + }, + { + "epoch": 0.05388570298596006, + "step": 545, + "train/total_loss": 0.38124558329582214 + }, + { + "entropy": 9.588722229003906, + "epoch": 0.05398457583547558, + "mean_token_accuracy": 0.7671428322792053, + "num_tokens": 2852698.0, + "step": 546, + "train/ce_loss": 0.7407620549201965 + }, + { + "epoch": 0.05398457583547558, + "step": 546, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.05398457583547558, + "step": 546, + "train/total_loss": 0.19126370549201965 + }, + { + "entropy": 9.780805587768555, + "epoch": 0.054083448684991105, + "mean_token_accuracy": 0.7216174006462097, + "num_tokens": 2857813.0, + "step": 547, + "train/ce_loss": 0.6773979663848877 + }, + { + "epoch": 0.054083448684991105, + "step": 547, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.054083448684991105, + "step": 547, + "train/total_loss": 0.12242729961872101 + }, + { + "entropy": 9.329841613769531, + "epoch": 0.054182321534506625, + "mean_token_accuracy": 0.722453236579895, + "num_tokens": 2863270.0, + "step": 548, + "train/ce_loss": 0.5865836143493652 + }, + { + "epoch": 0.054182321534506625, + "step": 548, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.054182321534506625, + "step": 548, + "train/total_loss": 0.15240836143493652 + }, + { + "entropy": 9.572759628295898, + "epoch": 0.054281194384022145, + "mean_token_accuracy": 0.7383966445922852, + "num_tokens": 2868525.0, + "step": 549, + "train/ce_loss": 1.0673495531082153 + }, + { + "epoch": 0.054281194384022145, + "step": 549, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.054281194384022145, + "step": 549, + "train/total_loss": 0.169234961271286 + }, + { + "entropy": 9.241981506347656, + "epoch": 0.05438006723353767, + "mean_token_accuracy": 0.6978335380554199, + "num_tokens": 2873852.0, + "step": 550, + "train/ce_loss": 0.8549608588218689 + }, + { + "epoch": 0.05438006723353767, + "step": 550, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.05438006723353767, + "step": 550, + "train/total_loss": 0.18705859780311584 + }, + { + "entropy": 9.983388900756836, + "epoch": 0.05447894008305319, + "mean_token_accuracy": 0.7796934843063354, + "num_tokens": 2878819.0, + "step": 551, + "train/ce_loss": 0.6586815118789673 + }, + { + "epoch": 0.05447894008305319, + "step": 551, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.05447894008305319, + "step": 551, + "train/total_loss": 0.20258690416812897 + }, + { + "entropy": 9.14186954498291, + "epoch": 0.05457781293256872, + "mean_token_accuracy": 0.7487623691558838, + "num_tokens": 2884128.0, + "step": 552, + "train/ce_loss": 0.9660906791687012 + }, + { + "epoch": 0.05457781293256872, + "step": 552, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.05457781293256872, + "step": 552, + "train/total_loss": 0.21770282089710236 + }, + { + "entropy": 10.010492324829102, + "epoch": 0.05467668578208424, + "mean_token_accuracy": 0.70652174949646, + "num_tokens": 2889127.0, + "step": 553, + "train/ce_loss": 0.07915318757295609 + }, + { + "epoch": 0.05467668578208424, + "step": 553, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.05467668578208424, + "step": 553, + "train/total_loss": 0.08994656801223755 + }, + { + "entropy": 9.300575256347656, + "epoch": 0.05477555863159976, + "mean_token_accuracy": 0.7167630195617676, + "num_tokens": 2894476.0, + "step": 554, + "train/ce_loss": 1.0042680501937866 + }, + { + "epoch": 0.05477555863159976, + "step": 554, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.05477555863159976, + "step": 554, + "train/total_loss": 0.1785518079996109 + }, + { + "entropy": 9.623985290527344, + "epoch": 0.05487443148111529, + "mean_token_accuracy": 0.7852256894111633, + "num_tokens": 2899852.0, + "step": 555, + "train/ce_loss": 1.194027304649353 + }, + { + "epoch": 0.05487443148111529, + "step": 555, + "train/sim_loss": 0.19921875 + }, + { + "epoch": 0.05487443148111529, + "step": 555, + "train/total_loss": 0.3186214864253998 + }, + { + "entropy": 9.840734481811523, + "epoch": 0.05497330433063081, + "mean_token_accuracy": 0.7689822316169739, + "num_tokens": 2904891.0, + "step": 556, + "train/ce_loss": 1.167907476425171 + }, + { + "epoch": 0.05497330433063081, + "step": 556, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.05497330433063081, + "step": 556, + "train/total_loss": 0.2300719916820526 + }, + { + "entropy": 9.579257011413574, + "epoch": 0.055072177180146335, + "mean_token_accuracy": 0.7628571391105652, + "num_tokens": 2910033.0, + "step": 557, + "train/ce_loss": 0.9383928775787354 + }, + { + "epoch": 0.055072177180146335, + "step": 557, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.055072177180146335, + "step": 557, + "train/total_loss": 0.17196428775787354 + }, + { + "entropy": 9.686721801757812, + "epoch": 0.055171050029661856, + "mean_token_accuracy": 0.7272727489471436, + "num_tokens": 2915156.0, + "step": 558, + "train/ce_loss": 0.8214370608329773 + }, + { + "epoch": 0.055171050029661856, + "step": 558, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.055171050029661856, + "step": 558, + "train/total_loss": 0.19933120906352997 + }, + { + "entropy": 9.445137023925781, + "epoch": 0.055269922879177376, + "mean_token_accuracy": 0.7244501709938049, + "num_tokens": 2920382.0, + "step": 559, + "train/ce_loss": 0.8776147961616516 + }, + { + "epoch": 0.055269922879177376, + "step": 559, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.055269922879177376, + "step": 559, + "train/total_loss": 0.20885524153709412 + }, + { + "epoch": 0.0553687957286929, + "grad_norm": 1.3400676250457764, + "learning_rate": 9.864263462394305e-06, + "loss": 0.2003, + "step": 560 + }, + { + "entropy": 9.43281078338623, + "epoch": 0.0553687957286929, + "mean_token_accuracy": 0.7690140604972839, + "num_tokens": 2925595.0, + "step": 560, + "train/ce_loss": 1.3493784666061401 + }, + { + "epoch": 0.0553687957286929, + "step": 560, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.0553687957286929, + "step": 560, + "train/total_loss": 0.2560316026210785 + }, + { + "entropy": 9.54813289642334, + "epoch": 0.05546766857820842, + "mean_token_accuracy": 0.7298091053962708, + "num_tokens": 2930687.0, + "step": 561, + "train/ce_loss": 1.03697669506073 + }, + { + "epoch": 0.05546766857820842, + "step": 561, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.05546766857820842, + "step": 561, + "train/total_loss": 0.21697892248630524 + }, + { + "entropy": 9.249414443969727, + "epoch": 0.055566541427723944, + "mean_token_accuracy": 0.6990394592285156, + "num_tokens": 2936080.0, + "step": 562, + "train/ce_loss": 1.2749476432800293 + }, + { + "epoch": 0.055566541427723944, + "step": 562, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.055566541427723944, + "step": 562, + "train/total_loss": 0.252494752407074 + }, + { + "entropy": 9.528446197509766, + "epoch": 0.05566541427723947, + "mean_token_accuracy": 0.7445721626281738, + "num_tokens": 2941279.0, + "step": 563, + "train/ce_loss": 0.868265688419342 + }, + { + "epoch": 0.05566541427723947, + "step": 563, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.05566541427723947, + "step": 563, + "train/total_loss": 0.16495156288146973 + }, + { + "entropy": 9.777507781982422, + "epoch": 0.05576428712675499, + "mean_token_accuracy": 0.7195325493812561, + "num_tokens": 2946373.0, + "step": 564, + "train/ce_loss": 1.328007459640503 + }, + { + "epoch": 0.05576428712675499, + "step": 564, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.05576428712675499, + "step": 564, + "train/total_loss": 0.25389450788497925 + }, + { + "entropy": 9.187501907348633, + "epoch": 0.05586315997627052, + "mean_token_accuracy": 0.6438775658607483, + "num_tokens": 2951840.0, + "step": 565, + "train/ce_loss": 1.264036774635315 + }, + { + "epoch": 0.05586315997627052, + "step": 565, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.05586315997627052, + "step": 565, + "train/total_loss": 0.24359117448329926 + }, + { + "entropy": 9.460844039916992, + "epoch": 0.05596203282578604, + "mean_token_accuracy": 0.7052767276763916, + "num_tokens": 2957077.0, + "step": 566, + "train/ce_loss": 0.8064752221107483 + }, + { + "epoch": 0.05596203282578604, + "step": 566, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.05596203282578604, + "step": 566, + "train/total_loss": 0.2017412781715393 + }, + { + "entropy": 9.599678993225098, + "epoch": 0.05606090567530156, + "mean_token_accuracy": 0.8073529601097107, + "num_tokens": 2962264.0, + "step": 567, + "train/ce_loss": 0.8236745595932007 + }, + { + "epoch": 0.05606090567530156, + "step": 567, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.05606090567530156, + "step": 567, + "train/total_loss": 0.1761174499988556 + }, + { + "entropy": 10.204925537109375, + "epoch": 0.056159778524817086, + "mean_token_accuracy": 0.6845637559890747, + "num_tokens": 2967118.0, + "step": 568, + "train/ce_loss": 0.09642869979143143 + }, + { + "epoch": 0.056159778524817086, + "step": 568, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.056159778524817086, + "step": 568, + "train/total_loss": 0.09558036923408508 + }, + { + "entropy": 9.058152198791504, + "epoch": 0.056258651374332606, + "mean_token_accuracy": 0.7917485237121582, + "num_tokens": 2972615.0, + "step": 569, + "train/ce_loss": 0.4953159689903259 + }, + { + "epoch": 0.056258651374332606, + "step": 569, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.056258651374332606, + "step": 569, + "train/total_loss": 0.12765659391880035 + }, + { + "entropy": 9.651445388793945, + "epoch": 0.056357524223848134, + "mean_token_accuracy": 0.6978021860122681, + "num_tokens": 2977773.0, + "step": 570, + "train/ce_loss": 1.5411763191223145 + }, + { + "epoch": 0.056357524223848134, + "step": 570, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.056357524223848134, + "step": 570, + "train/total_loss": 0.2713051438331604 + }, + { + "entropy": 9.766046524047852, + "epoch": 0.056456397073363654, + "mean_token_accuracy": 0.7850467562675476, + "num_tokens": 2982852.0, + "step": 571, + "train/ce_loss": 0.638634979724884 + }, + { + "epoch": 0.056456397073363654, + "step": 571, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.056456397073363654, + "step": 571, + "train/total_loss": 0.14980100095272064 + }, + { + "entropy": 9.760435104370117, + "epoch": 0.056555269922879174, + "mean_token_accuracy": 0.7678571343421936, + "num_tokens": 2988004.0, + "step": 572, + "train/ce_loss": 0.059007056057453156 + }, + { + "epoch": 0.056555269922879174, + "step": 572, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.056555269922879174, + "step": 572, + "train/total_loss": 0.08011945337057114 + }, + { + "entropy": 8.902538299560547, + "epoch": 0.0566541427723947, + "mean_token_accuracy": 0.7229129672050476, + "num_tokens": 2993614.0, + "step": 573, + "train/ce_loss": 0.6874699592590332 + }, + { + "epoch": 0.0566541427723947, + "step": 573, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.0566541427723947, + "step": 573, + "train/total_loss": 0.14687199890613556 + }, + { + "entropy": 10.377233505249023, + "epoch": 0.05675301562191022, + "mean_token_accuracy": 0.7002881765365601, + "num_tokens": 2998359.0, + "step": 574, + "train/ce_loss": 2.1008381843566895 + }, + { + "epoch": 0.05675301562191022, + "step": 574, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.05675301562191022, + "step": 574, + "train/total_loss": 0.30774009227752686 + }, + { + "entropy": 9.222179412841797, + "epoch": 0.05685188847142575, + "mean_token_accuracy": 0.6544342637062073, + "num_tokens": 3003840.0, + "step": 575, + "train/ce_loss": 0.9479613304138184 + }, + { + "epoch": 0.05685188847142575, + "step": 575, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.05685188847142575, + "step": 575, + "train/total_loss": 0.22760863602161407 + }, + { + "entropy": 9.846975326538086, + "epoch": 0.05695076132094127, + "mean_token_accuracy": 0.7049742937088013, + "num_tokens": 3008886.0, + "step": 576, + "train/ce_loss": 0.8759105205535889 + }, + { + "epoch": 0.05695076132094127, + "step": 576, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.05695076132094127, + "step": 576, + "train/total_loss": 0.1852473020553589 + }, + { + "entropy": 9.408296585083008, + "epoch": 0.05704963417045679, + "mean_token_accuracy": 0.7602040767669678, + "num_tokens": 3014182.0, + "step": 577, + "train/ce_loss": 1.0684754848480225 + }, + { + "epoch": 0.05704963417045679, + "step": 577, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.05704963417045679, + "step": 577, + "train/total_loss": 0.17325380444526672 + }, + { + "entropy": 9.4006986618042, + "epoch": 0.05714850701997232, + "mean_token_accuracy": 0.7496976852416992, + "num_tokens": 3019471.0, + "step": 578, + "train/ce_loss": 0.5649070739746094 + }, + { + "epoch": 0.05714850701997232, + "step": 578, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.05714850701997232, + "step": 578, + "train/total_loss": 0.1463344544172287 + }, + { + "entropy": 9.148704528808594, + "epoch": 0.05724737986948784, + "mean_token_accuracy": 0.6821345686912537, + "num_tokens": 3024797.0, + "step": 579, + "train/ce_loss": 1.0922415256500244 + }, + { + "epoch": 0.05724737986948784, + "step": 579, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.05724737986948784, + "step": 579, + "train/total_loss": 0.22250540554523468 + }, + { + "epoch": 0.057346252719003364, + "grad_norm": 1.7009624242782593, + "learning_rate": 9.859318597636356e-06, + "loss": 0.1971, + "step": 580 + }, + { + "entropy": 9.603752136230469, + "epoch": 0.057346252719003364, + "mean_token_accuracy": 0.7529761791229248, + "num_tokens": 3029923.0, + "step": 580, + "train/ce_loss": 0.06308668851852417 + }, + { + "epoch": 0.057346252719003364, + "step": 580, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.057346252719003364, + "step": 580, + "train/total_loss": 0.0805274173617363 + }, + { + "entropy": 9.038854598999023, + "epoch": 0.057445125568518884, + "mean_token_accuracy": 0.7690557241439819, + "num_tokens": 3035294.0, + "step": 581, + "train/ce_loss": 0.8573618531227112 + }, + { + "epoch": 0.057445125568518884, + "step": 581, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.057445125568518884, + "step": 581, + "train/total_loss": 0.18339243531227112 + }, + { + "entropy": 9.591069221496582, + "epoch": 0.057543998418034405, + "mean_token_accuracy": 0.7294429540634155, + "num_tokens": 3040500.0, + "step": 582, + "train/ce_loss": 0.6902552247047424 + }, + { + "epoch": 0.057543998418034405, + "step": 582, + "train/sim_loss": 0.1875 + }, + { + "epoch": 0.057543998418034405, + "step": 582, + "train/total_loss": 0.25652551651000977 + }, + { + "entropy": 9.707898139953613, + "epoch": 0.05764287126754993, + "mean_token_accuracy": 0.6551265120506287, + "num_tokens": 3045672.0, + "step": 583, + "train/ce_loss": 0.05402826890349388 + }, + { + "epoch": 0.05764287126754993, + "step": 583, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.05764287126754993, + "step": 583, + "train/total_loss": 0.0991528257727623 + }, + { + "entropy": 9.415910720825195, + "epoch": 0.05774174411706545, + "mean_token_accuracy": 0.7229064106941223, + "num_tokens": 3050888.0, + "step": 584, + "train/ce_loss": 0.9903222918510437 + }, + { + "epoch": 0.05774174411706545, + "step": 584, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.05774174411706545, + "step": 584, + "train/total_loss": 0.1576259732246399 + }, + { + "entropy": 9.809968948364258, + "epoch": 0.05784061696658098, + "mean_token_accuracy": 0.7357414364814758, + "num_tokens": 3055823.0, + "step": 585, + "train/ce_loss": 1.3496983051300049 + }, + { + "epoch": 0.05784061696658098, + "step": 585, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.05784061696658098, + "step": 585, + "train/total_loss": 0.2404385805130005 + }, + { + "entropy": 8.99539566040039, + "epoch": 0.0579394898160965, + "mean_token_accuracy": 0.8194444179534912, + "num_tokens": 3061280.0, + "step": 586, + "train/ce_loss": 0.6846917271614075 + }, + { + "epoch": 0.0579394898160965, + "step": 586, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.0579394898160965, + "step": 586, + "train/total_loss": 0.12315667420625687 + }, + { + "entropy": 9.648222923278809, + "epoch": 0.05803836266561202, + "mean_token_accuracy": 0.6480938196182251, + "num_tokens": 3066535.0, + "step": 587, + "train/ce_loss": 1.0551172494888306 + }, + { + "epoch": 0.05803836266561202, + "step": 587, + "train/sim_loss": 0.14453125 + }, + { + "epoch": 0.05803836266561202, + "step": 587, + "train/total_loss": 0.25004297494888306 + }, + { + "entropy": 9.561786651611328, + "epoch": 0.05813723551512755, + "mean_token_accuracy": 0.7098844647407532, + "num_tokens": 3071709.0, + "step": 588, + "train/ce_loss": 1.5621851682662964 + }, + { + "epoch": 0.05813723551512755, + "step": 588, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.05813723551512755, + "step": 588, + "train/total_loss": 0.2460622638463974 + }, + { + "entropy": 9.043785095214844, + "epoch": 0.05823610836464307, + "mean_token_accuracy": 0.7582089304924011, + "num_tokens": 3077207.0, + "step": 589, + "train/ce_loss": 0.7210156321525574 + }, + { + "epoch": 0.05823610836464307, + "step": 589, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.05823610836464307, + "step": 589, + "train/total_loss": 0.10335156321525574 + }, + { + "entropy": 9.510514259338379, + "epoch": 0.058334981214158595, + "mean_token_accuracy": 0.7247340679168701, + "num_tokens": 3082407.0, + "step": 590, + "train/ce_loss": 1.260585904121399 + }, + { + "epoch": 0.058334981214158595, + "step": 590, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.058334981214158595, + "step": 590, + "train/total_loss": 0.25496482849121094 + }, + { + "entropy": 9.443502426147461, + "epoch": 0.058433854063674115, + "mean_token_accuracy": 0.7330729365348816, + "num_tokens": 3087607.0, + "step": 591, + "train/ce_loss": 1.3981369733810425 + }, + { + "epoch": 0.058433854063674115, + "step": 591, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.058433854063674115, + "step": 591, + "train/total_loss": 0.25700122117996216 + }, + { + "entropy": 9.518030166625977, + "epoch": 0.058532726913189635, + "mean_token_accuracy": 0.7631579041481018, + "num_tokens": 3092856.0, + "step": 592, + "train/ce_loss": 1.1735337972640991 + }, + { + "epoch": 0.058532726913189635, + "step": 592, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.058532726913189635, + "step": 592, + "train/total_loss": 0.2032908797264099 + }, + { + "entropy": 9.151599884033203, + "epoch": 0.05863159976270516, + "mean_token_accuracy": 0.7040951251983643, + "num_tokens": 3098148.0, + "step": 593, + "train/ce_loss": 1.2240796089172363 + }, + { + "epoch": 0.05863159976270516, + "step": 593, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.05863159976270516, + "step": 593, + "train/total_loss": 0.2239704728126526 + }, + { + "entropy": 10.255637168884277, + "epoch": 0.05873047261222068, + "mean_token_accuracy": 0.6727688908576965, + "num_tokens": 3103189.0, + "step": 594, + "train/ce_loss": 0.10033336281776428 + }, + { + "epoch": 0.05873047261222068, + "step": 594, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.05873047261222068, + "step": 594, + "train/total_loss": 0.12722083926200867 + }, + { + "entropy": 9.687835693359375, + "epoch": 0.05882934546173621, + "mean_token_accuracy": 0.760188102722168, + "num_tokens": 3108264.0, + "step": 595, + "train/ce_loss": 0.7539442777633667 + }, + { + "epoch": 0.05882934546173621, + "step": 595, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.05882934546173621, + "step": 595, + "train/total_loss": 0.1691444218158722 + }, + { + "entropy": 9.354427337646484, + "epoch": 0.05892821831125173, + "mean_token_accuracy": 0.6974595785140991, + "num_tokens": 3113589.0, + "step": 596, + "train/ce_loss": 0.5949295163154602 + }, + { + "epoch": 0.05892821831125173, + "step": 596, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.05892821831125173, + "step": 596, + "train/total_loss": 0.14933669567108154 + }, + { + "entropy": 9.385342597961426, + "epoch": 0.05902709116076725, + "mean_token_accuracy": 0.7252747416496277, + "num_tokens": 3118832.0, + "step": 597, + "train/ce_loss": 0.048359014093875885 + }, + { + "epoch": 0.05902709116076725, + "step": 597, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.05902709116076725, + "step": 597, + "train/total_loss": 0.12983590364456177 + }, + { + "entropy": 9.755470275878906, + "epoch": 0.05912596401028278, + "mean_token_accuracy": 0.6884498596191406, + "num_tokens": 3123936.0, + "step": 598, + "train/ce_loss": 0.06409426033496857 + }, + { + "epoch": 0.05912596401028278, + "step": 598, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.05912596401028278, + "step": 598, + "train/total_loss": 0.0767219290137291 + }, + { + "entropy": 8.945388793945312, + "epoch": 0.0592248368597983, + "mean_token_accuracy": 0.726396918296814, + "num_tokens": 3129554.0, + "step": 599, + "train/ce_loss": 0.8508357405662537 + }, + { + "epoch": 0.0592248368597983, + "step": 599, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.0592248368597983, + "step": 599, + "train/total_loss": 0.19055232405662537 + }, + { + "epoch": 0.059323709709313825, + "grad_norm": 1.3134665489196777, + "learning_rate": 9.854373732878406e-06, + "loss": 0.1913, + "step": 600 + }, + { + "entropy": 9.647882461547852, + "epoch": 0.059323709709313825, + "mean_token_accuracy": 0.7123098373413086, + "num_tokens": 3134698.0, + "step": 600, + "train/ce_loss": 0.7676689624786377 + }, + { + "epoch": 0.059323709709313825, + "step": 600, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.059323709709313825, + "step": 600, + "train/total_loss": 0.15879815816879272 + }, + { + "entropy": 9.90464973449707, + "epoch": 0.059422582558829345, + "mean_token_accuracy": 0.739130437374115, + "num_tokens": 3139674.0, + "step": 601, + "train/ce_loss": 0.9781233668327332 + }, + { + "epoch": 0.059422582558829345, + "step": 601, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.059422582558829345, + "step": 601, + "train/total_loss": 0.22281233966350555 + }, + { + "entropy": 9.9501953125, + "epoch": 0.059521455408344866, + "mean_token_accuracy": 0.776442289352417, + "num_tokens": 3144516.0, + "step": 602, + "train/ce_loss": 0.10536352545022964 + }, + { + "epoch": 0.059521455408344866, + "step": 602, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.059521455408344866, + "step": 602, + "train/total_loss": 0.06913010030984879 + }, + { + "entropy": 9.114591598510742, + "epoch": 0.05962032825786039, + "mean_token_accuracy": 0.7934537529945374, + "num_tokens": 3149858.0, + "step": 603, + "train/ce_loss": 0.8752561807632446 + }, + { + "epoch": 0.05962032825786039, + "step": 603, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.05962032825786039, + "step": 603, + "train/total_loss": 0.1890881210565567 + }, + { + "entropy": 9.432883262634277, + "epoch": 0.05971920110737591, + "mean_token_accuracy": 0.7468706369400024, + "num_tokens": 3154999.0, + "step": 604, + "train/ce_loss": 1.2306396961212158 + }, + { + "epoch": 0.05971920110737591, + "step": 604, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.05971920110737591, + "step": 604, + "train/total_loss": 0.20509523153305054 + }, + { + "entropy": 9.856866836547852, + "epoch": 0.05981807395689144, + "mean_token_accuracy": 0.6504347920417786, + "num_tokens": 3160014.0, + "step": 605, + "train/ce_loss": 1.4992445707321167 + }, + { + "epoch": 0.05981807395689144, + "step": 605, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.05981807395689144, + "step": 605, + "train/total_loss": 0.27492445707321167 + }, + { + "entropy": 9.179306030273438, + "epoch": 0.05991694680640696, + "mean_token_accuracy": 0.7101293206214905, + "num_tokens": 3165444.0, + "step": 606, + "train/ce_loss": 1.657849907875061 + }, + { + "epoch": 0.05991694680640696, + "step": 606, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.05991694680640696, + "step": 606, + "train/total_loss": 0.298597514629364 + }, + { + "entropy": 9.692015647888184, + "epoch": 0.06001581965592248, + "mean_token_accuracy": 0.7776243090629578, + "num_tokens": 3170762.0, + "step": 607, + "train/ce_loss": 0.6523178815841675 + }, + { + "epoch": 0.06001581965592248, + "step": 607, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.06001581965592248, + "step": 607, + "train/total_loss": 0.1433568000793457 + }, + { + "entropy": 9.208277702331543, + "epoch": 0.06011469250543801, + "mean_token_accuracy": 0.7749999761581421, + "num_tokens": 3176133.0, + "step": 608, + "train/ce_loss": 0.6562087535858154 + }, + { + "epoch": 0.06011469250543801, + "step": 608, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.06011469250543801, + "step": 608, + "train/total_loss": 0.19452711939811707 + }, + { + "entropy": 9.781991958618164, + "epoch": 0.06021356535495353, + "mean_token_accuracy": 0.7298938035964966, + "num_tokens": 3181214.0, + "step": 609, + "train/ce_loss": 0.7615407109260559 + }, + { + "epoch": 0.06021356535495353, + "step": 609, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.06021356535495353, + "step": 609, + "train/total_loss": 0.13865408301353455 + }, + { + "entropy": 9.374441146850586, + "epoch": 0.060312438204469056, + "mean_token_accuracy": 0.7690476179122925, + "num_tokens": 3186517.0, + "step": 610, + "train/ce_loss": 0.7123885154724121 + }, + { + "epoch": 0.060312438204469056, + "step": 610, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.060312438204469056, + "step": 610, + "train/total_loss": 0.18061384558677673 + }, + { + "entropy": 9.316181182861328, + "epoch": 0.060411311053984576, + "mean_token_accuracy": 0.6707317233085632, + "num_tokens": 3191845.0, + "step": 611, + "train/ce_loss": 0.6949711441993713 + }, + { + "epoch": 0.060411311053984576, + "step": 611, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.060411311053984576, + "step": 611, + "train/total_loss": 0.17887210845947266 + }, + { + "entropy": 9.724642753601074, + "epoch": 0.060510183903500096, + "mean_token_accuracy": 0.759878396987915, + "num_tokens": 3196946.0, + "step": 612, + "train/ce_loss": 0.5086504817008972 + }, + { + "epoch": 0.060510183903500096, + "step": 612, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.060510183903500096, + "step": 612, + "train/total_loss": 0.1328963041305542 + }, + { + "entropy": 9.029947280883789, + "epoch": 0.060609056753015624, + "mean_token_accuracy": 0.7461773753166199, + "num_tokens": 3202457.0, + "step": 613, + "train/ce_loss": 0.4887118637561798 + }, + { + "epoch": 0.060609056753015624, + "step": 613, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.060609056753015624, + "step": 613, + "train/total_loss": 0.18168368935585022 + }, + { + "entropy": 9.9183931350708, + "epoch": 0.060707929602531144, + "mean_token_accuracy": 0.7015177011489868, + "num_tokens": 3207503.0, + "step": 614, + "train/ce_loss": 1.7783939838409424 + }, + { + "epoch": 0.060707929602531144, + "step": 614, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.060707929602531144, + "step": 614, + "train/total_loss": 0.28721439838409424 + }, + { + "entropy": 9.622869491577148, + "epoch": 0.06080680245204667, + "mean_token_accuracy": 0.7181409001350403, + "num_tokens": 3212625.0, + "step": 615, + "train/ce_loss": 1.2805120944976807 + }, + { + "epoch": 0.06080680245204667, + "step": 615, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.06080680245204667, + "step": 615, + "train/total_loss": 0.23351995646953583 + }, + { + "entropy": 10.772820472717285, + "epoch": 0.06090567530156219, + "mean_token_accuracy": 0.7873563170433044, + "num_tokens": 3217180.0, + "step": 616, + "train/ce_loss": 0.2533998489379883 + }, + { + "epoch": 0.06090567530156219, + "step": 616, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.06090567530156219, + "step": 616, + "train/total_loss": 0.09174623340368271 + }, + { + "entropy": 9.706140518188477, + "epoch": 0.06100454815107771, + "mean_token_accuracy": 0.7152974605560303, + "num_tokens": 3222383.0, + "step": 617, + "train/ce_loss": 0.9342068433761597 + }, + { + "epoch": 0.06100454815107771, + "step": 617, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.06100454815107771, + "step": 617, + "train/total_loss": 0.20670193433761597 + }, + { + "entropy": 9.423571586608887, + "epoch": 0.06110342100059324, + "mean_token_accuracy": 0.6854742169380188, + "num_tokens": 3227588.0, + "step": 618, + "train/ce_loss": 1.3230915069580078 + }, + { + "epoch": 0.06110342100059324, + "step": 618, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.06110342100059324, + "step": 618, + "train/total_loss": 0.24168415367603302 + }, + { + "entropy": 9.546838760375977, + "epoch": 0.06120229385010876, + "mean_token_accuracy": 0.7903226017951965, + "num_tokens": 3232774.0, + "step": 619, + "train/ce_loss": 0.7212303876876831 + }, + { + "epoch": 0.06120229385010876, + "step": 619, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.06120229385010876, + "step": 619, + "train/total_loss": 0.11899804323911667 + }, + { + "epoch": 0.061301166699624286, + "grad_norm": 1.0468522310256958, + "learning_rate": 9.849428868120457e-06, + "loss": 0.1921, + "step": 620 + }, + { + "entropy": 9.641387939453125, + "epoch": 0.061301166699624286, + "mean_token_accuracy": 0.7740525007247925, + "num_tokens": 3237908.0, + "step": 620, + "train/ce_loss": 1.0195034742355347 + }, + { + "epoch": 0.061301166699624286, + "step": 620, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.061301166699624286, + "step": 620, + "train/total_loss": 0.14882534742355347 + }, + { + "entropy": 9.968416213989258, + "epoch": 0.06140003954913981, + "mean_token_accuracy": 0.7218309640884399, + "num_tokens": 3242863.0, + "step": 621, + "train/ce_loss": 1.2666391134262085 + }, + { + "epoch": 0.06140003954913981, + "step": 621, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.06140003954913981, + "step": 621, + "train/total_loss": 0.2360389083623886 + }, + { + "entropy": 9.490911483764648, + "epoch": 0.06149891239865533, + "mean_token_accuracy": 0.6958277225494385, + "num_tokens": 3248076.0, + "step": 622, + "train/ce_loss": 1.2347162961959839 + }, + { + "epoch": 0.06149891239865533, + "step": 622, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.06149891239865533, + "step": 622, + "train/total_loss": 0.24847163259983063 + }, + { + "entropy": 9.121641159057617, + "epoch": 0.061597785248170854, + "mean_token_accuracy": 0.7781609296798706, + "num_tokens": 3253474.0, + "step": 623, + "train/ce_loss": 0.7475561499595642 + }, + { + "epoch": 0.061597785248170854, + "step": 623, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.061597785248170854, + "step": 623, + "train/total_loss": 0.17631810903549194 + }, + { + "entropy": 9.5985746383667, + "epoch": 0.061696658097686374, + "mean_token_accuracy": 0.7353723645210266, + "num_tokens": 3258670.0, + "step": 624, + "train/ce_loss": 0.7435654997825623 + }, + { + "epoch": 0.061696658097686374, + "step": 624, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.061696658097686374, + "step": 624, + "train/total_loss": 0.1134190484881401 + }, + { + "entropy": 9.410194396972656, + "epoch": 0.0617955309472019, + "mean_token_accuracy": 0.7748690843582153, + "num_tokens": 3263910.0, + "step": 625, + "train/ce_loss": 0.6336197257041931 + }, + { + "epoch": 0.0617955309472019, + "step": 625, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.0617955309472019, + "step": 625, + "train/total_loss": 0.1844557225704193 + }, + { + "entropy": 9.588516235351562, + "epoch": 0.06189440379671742, + "mean_token_accuracy": 0.7860139608383179, + "num_tokens": 3269075.0, + "step": 626, + "train/ce_loss": 0.060945674777030945 + }, + { + "epoch": 0.06189440379671742, + "step": 626, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.06189440379671742, + "step": 626, + "train/total_loss": 0.0646883174777031 + }, + { + "entropy": 10.053567886352539, + "epoch": 0.06199327664623294, + "mean_token_accuracy": 0.7411273717880249, + "num_tokens": 3273928.0, + "step": 627, + "train/ce_loss": 1.2324460744857788 + }, + { + "epoch": 0.06199327664623294, + "step": 627, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.06199327664623294, + "step": 627, + "train/total_loss": 0.22871336340904236 + }, + { + "entropy": 9.442203521728516, + "epoch": 0.06209214949574847, + "mean_token_accuracy": 0.8010075688362122, + "num_tokens": 3279180.0, + "step": 628, + "train/ce_loss": 0.6030117273330688 + }, + { + "epoch": 0.06209214949574847, + "step": 628, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.06209214949574847, + "step": 628, + "train/total_loss": 0.13842616975307465 + }, + { + "entropy": 10.123472213745117, + "epoch": 0.06219102234526399, + "mean_token_accuracy": 0.7330595254898071, + "num_tokens": 3284069.0, + "step": 629, + "train/ce_loss": 0.07958053052425385 + }, + { + "epoch": 0.06219102234526399, + "step": 629, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.06219102234526399, + "step": 629, + "train/total_loss": 0.0978018045425415 + }, + { + "entropy": 9.632625579833984, + "epoch": 0.06228989519477952, + "mean_token_accuracy": 0.7446808218955994, + "num_tokens": 3289163.0, + "step": 630, + "train/ce_loss": 0.661207377910614 + }, + { + "epoch": 0.06228989519477952, + "step": 630, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.06228989519477952, + "step": 630, + "train/total_loss": 0.12862074375152588 + }, + { + "entropy": 9.37982177734375, + "epoch": 0.06238876804429504, + "mean_token_accuracy": 0.6502857208251953, + "num_tokens": 3294500.0, + "step": 631, + "train/ce_loss": 1.2884637117385864 + }, + { + "epoch": 0.06238876804429504, + "step": 631, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.06238876804429504, + "step": 631, + "train/total_loss": 0.2655651271343231 + }, + { + "entropy": 10.025651931762695, + "epoch": 0.06248764089381056, + "mean_token_accuracy": 0.8018691539764404, + "num_tokens": 3299447.0, + "step": 632, + "train/ce_loss": 0.0742001160979271 + }, + { + "epoch": 0.06248764089381056, + "step": 632, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.06248764089381056, + "step": 632, + "train/total_loss": 0.05820126086473465 + }, + { + "entropy": 9.40864372253418, + "epoch": 0.06258651374332608, + "mean_token_accuracy": 0.7171581983566284, + "num_tokens": 3304648.0, + "step": 633, + "train/ce_loss": 0.8080118894577026 + }, + { + "epoch": 0.06258651374332608, + "step": 633, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.06258651374332608, + "step": 633, + "train/total_loss": 0.17455118894577026 + }, + { + "entropy": 9.281893730163574, + "epoch": 0.0626853865928416, + "mean_token_accuracy": 0.688524603843689, + "num_tokens": 3309914.0, + "step": 634, + "train/ce_loss": 1.0388758182525635 + }, + { + "epoch": 0.0626853865928416, + "step": 634, + "train/sim_loss": 0.14453125 + }, + { + "epoch": 0.0626853865928416, + "step": 634, + "train/total_loss": 0.24841883778572083 + }, + { + "entropy": 9.273179054260254, + "epoch": 0.06278425944235713, + "mean_token_accuracy": 0.7076923251152039, + "num_tokens": 3315246.0, + "step": 635, + "train/ce_loss": 1.4154484272003174 + }, + { + "epoch": 0.06278425944235713, + "step": 635, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.06278425944235713, + "step": 635, + "train/total_loss": 0.2665448486804962 + }, + { + "entropy": 9.427886962890625, + "epoch": 0.06288313229187265, + "mean_token_accuracy": 0.8057553768157959, + "num_tokens": 3320563.0, + "step": 636, + "train/ce_loss": 0.04886097088456154 + }, + { + "epoch": 0.06288313229187265, + "step": 636, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.06288313229187265, + "step": 636, + "train/total_loss": 0.09863609820604324 + }, + { + "entropy": 9.08419132232666, + "epoch": 0.06298200514138817, + "mean_token_accuracy": 0.7119438052177429, + "num_tokens": 3325840.0, + "step": 637, + "train/ce_loss": 0.6452086567878723 + }, + { + "epoch": 0.06298200514138817, + "step": 637, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.06298200514138817, + "step": 637, + "train/total_loss": 0.14264586567878723 + }, + { + "entropy": 9.62785816192627, + "epoch": 0.0630808779909037, + "mean_token_accuracy": 0.7223926186561584, + "num_tokens": 3330918.0, + "step": 638, + "train/ce_loss": 0.8809434175491333 + }, + { + "epoch": 0.0630808779909037, + "step": 638, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.0630808779909037, + "step": 638, + "train/total_loss": 0.15059435367584229 + }, + { + "entropy": 9.237200736999512, + "epoch": 0.06317975084041923, + "mean_token_accuracy": 0.6712734699249268, + "num_tokens": 3336571.0, + "step": 639, + "train/ce_loss": 1.0680677890777588 + }, + { + "epoch": 0.06317975084041923, + "step": 639, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.06317975084041923, + "step": 639, + "train/total_loss": 0.20055678486824036 + }, + { + "epoch": 0.06327862368993474, + "grad_norm": 1.2951865196228027, + "learning_rate": 9.844484003362509e-06, + "loss": 0.1846, + "step": 640 + }, + { + "entropy": 9.697471618652344, + "epoch": 0.06327862368993474, + "mean_token_accuracy": 0.667117714881897, + "num_tokens": 3341744.0, + "step": 640, + "train/ce_loss": 0.056046780198812485 + }, + { + "epoch": 0.06327862368993474, + "step": 640, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.06327862368993474, + "step": 640, + "train/total_loss": 0.11497967690229416 + }, + { + "entropy": 9.747730255126953, + "epoch": 0.06337749653945027, + "mean_token_accuracy": 0.7149606347084045, + "num_tokens": 3346815.0, + "step": 641, + "train/ce_loss": 0.9643349051475525 + }, + { + "epoch": 0.06337749653945027, + "step": 641, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.06337749653945027, + "step": 641, + "train/total_loss": 0.22143349051475525 + }, + { + "entropy": 9.916828155517578, + "epoch": 0.0634763693889658, + "mean_token_accuracy": 0.7209677696228027, + "num_tokens": 3351870.0, + "step": 642, + "train/ce_loss": 1.0646116733551025 + }, + { + "epoch": 0.0634763693889658, + "step": 642, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.0634763693889658, + "step": 642, + "train/total_loss": 0.20411741733551025 + }, + { + "entropy": 9.16053581237793, + "epoch": 0.06357524223848131, + "mean_token_accuracy": 0.7201645970344543, + "num_tokens": 3357323.0, + "step": 643, + "train/ce_loss": 1.1711736917495728 + }, + { + "epoch": 0.06357524223848131, + "step": 643, + "train/sim_loss": 0.18359375 + }, + { + "epoch": 0.06357524223848131, + "step": 643, + "train/total_loss": 0.30071112513542175 + }, + { + "entropy": 9.730305671691895, + "epoch": 0.06367411508799684, + "mean_token_accuracy": 0.7410179376602173, + "num_tokens": 3362499.0, + "step": 644, + "train/ce_loss": 0.6209876537322998 + }, + { + "epoch": 0.06367411508799684, + "step": 644, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.06367411508799684, + "step": 644, + "train/total_loss": 0.17147377133369446 + }, + { + "entropy": 9.807441711425781, + "epoch": 0.06377298793751236, + "mean_token_accuracy": 0.7218155264854431, + "num_tokens": 3367619.0, + "step": 645, + "train/ce_loss": 1.9182504415512085 + }, + { + "epoch": 0.06377298793751236, + "step": 645, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.06377298793751236, + "step": 645, + "train/total_loss": 0.3129187822341919 + }, + { + "entropy": 9.33283805847168, + "epoch": 0.06387186078702788, + "mean_token_accuracy": 0.7675804495811462, + "num_tokens": 3372926.0, + "step": 646, + "train/ce_loss": 0.49201691150665283 + }, + { + "epoch": 0.06387186078702788, + "step": 646, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.06387186078702788, + "step": 646, + "train/total_loss": 0.11170169711112976 + }, + { + "entropy": 9.105297088623047, + "epoch": 0.0639707336365434, + "mean_token_accuracy": 0.761529803276062, + "num_tokens": 3378291.0, + "step": 647, + "train/ce_loss": 0.666023313999176 + }, + { + "epoch": 0.0639707336365434, + "step": 647, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.0639707336365434, + "step": 647, + "train/total_loss": 0.12128983438014984 + }, + { + "entropy": 9.271563529968262, + "epoch": 0.06406960648605893, + "mean_token_accuracy": 0.7352246046066284, + "num_tokens": 3383646.0, + "step": 648, + "train/ce_loss": 0.9176062345504761 + }, + { + "epoch": 0.06406960648605893, + "step": 648, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.06406960648605893, + "step": 648, + "train/total_loss": 0.21285438537597656 + }, + { + "entropy": 9.301518440246582, + "epoch": 0.06416847933557446, + "mean_token_accuracy": 0.657549262046814, + "num_tokens": 3389018.0, + "step": 649, + "train/ce_loss": 0.6130936145782471 + }, + { + "epoch": 0.06416847933557446, + "step": 649, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.06416847933557446, + "step": 649, + "train/total_loss": 0.18630936741828918 + }, + { + "entropy": 9.155862808227539, + "epoch": 0.06426735218508997, + "mean_token_accuracy": 0.7696139216423035, + "num_tokens": 3394266.0, + "step": 650, + "train/ce_loss": 0.6263456344604492 + }, + { + "epoch": 0.06426735218508997, + "step": 650, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.06426735218508997, + "step": 650, + "train/total_loss": 0.15638455748558044 + }, + { + "entropy": 9.07483196258545, + "epoch": 0.0643662250346055, + "mean_token_accuracy": 0.6795699000358582, + "num_tokens": 3399684.0, + "step": 651, + "train/ce_loss": 0.9619243741035461 + }, + { + "epoch": 0.0643662250346055, + "step": 651, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.0643662250346055, + "step": 651, + "train/total_loss": 0.22119244933128357 + }, + { + "entropy": 10.023591995239258, + "epoch": 0.06446509788412103, + "mean_token_accuracy": 0.7412451505661011, + "num_tokens": 3404623.0, + "step": 652, + "train/ce_loss": 0.08032934367656708 + }, + { + "epoch": 0.06446509788412103, + "step": 652, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.06446509788412103, + "step": 652, + "train/total_loss": 0.10178293287754059 + }, + { + "entropy": 9.418949127197266, + "epoch": 0.06456397073363654, + "mean_token_accuracy": 0.7397820353507996, + "num_tokens": 3409860.0, + "step": 653, + "train/ce_loss": 1.1483129262924194 + }, + { + "epoch": 0.06456397073363654, + "step": 653, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.06456397073363654, + "step": 653, + "train/total_loss": 0.22420629858970642 + }, + { + "entropy": 9.003963470458984, + "epoch": 0.06466284358315207, + "mean_token_accuracy": 0.6903499364852905, + "num_tokens": 3415334.0, + "step": 654, + "train/ce_loss": 1.5239121913909912 + }, + { + "epoch": 0.06466284358315207, + "step": 654, + "train/sim_loss": 0.171875 + }, + { + "epoch": 0.06466284358315207, + "step": 654, + "train/total_loss": 0.3242662250995636 + }, + { + "entropy": 9.603049278259277, + "epoch": 0.0647617164326676, + "mean_token_accuracy": 0.7332382202148438, + "num_tokens": 3420483.0, + "step": 655, + "train/ce_loss": 0.8936455845832825 + }, + { + "epoch": 0.0647617164326676, + "step": 655, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.0647617164326676, + "step": 655, + "train/total_loss": 0.14795830845832825 + }, + { + "entropy": 9.058694839477539, + "epoch": 0.0648605892821831, + "mean_token_accuracy": 0.7370558381080627, + "num_tokens": 3425941.0, + "step": 656, + "train/ce_loss": 0.53897625207901 + }, + { + "epoch": 0.0648605892821831, + "step": 656, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.0648605892821831, + "step": 656, + "train/total_loss": 0.13592886924743652 + }, + { + "entropy": 9.526253700256348, + "epoch": 0.06495946213169863, + "mean_token_accuracy": 0.7169274687767029, + "num_tokens": 3431138.0, + "step": 657, + "train/ce_loss": 1.0278583765029907 + }, + { + "epoch": 0.06495946213169863, + "step": 657, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.06495946213169863, + "step": 657, + "train/total_loss": 0.2434108406305313 + }, + { + "entropy": 9.444064140319824, + "epoch": 0.06505833498121416, + "mean_token_accuracy": 0.7347875833511353, + "num_tokens": 3436467.0, + "step": 658, + "train/ce_loss": 1.1032007932662964 + }, + { + "epoch": 0.06505833498121416, + "step": 658, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.06505833498121416, + "step": 658, + "train/total_loss": 0.1806325912475586 + }, + { + "entropy": 9.832465171813965, + "epoch": 0.06515720783072969, + "mean_token_accuracy": 0.6955223679542542, + "num_tokens": 3441567.0, + "step": 659, + "train/ce_loss": 0.9809430241584778 + }, + { + "epoch": 0.06515720783072969, + "step": 659, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.06515720783072969, + "step": 659, + "train/total_loss": 0.22309431433677673 + }, + { + "epoch": 0.0652560806802452, + "grad_norm": 1.191369891166687, + "learning_rate": 9.83953913860456e-06, + "loss": 0.193, + "step": 660 + }, + { + "entropy": 9.245681762695312, + "epoch": 0.0652560806802452, + "mean_token_accuracy": 0.7570194602012634, + "num_tokens": 3446942.0, + "step": 660, + "train/ce_loss": 0.6208062171936035 + }, + { + "epoch": 0.0652560806802452, + "step": 660, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.0652560806802452, + "step": 660, + "train/total_loss": 0.13629937171936035 + }, + { + "entropy": 9.718344688415527, + "epoch": 0.06535495352976073, + "mean_token_accuracy": 0.6290801167488098, + "num_tokens": 3452047.0, + "step": 661, + "train/ce_loss": 2.3280043601989746 + }, + { + "epoch": 0.06535495352976073, + "step": 661, + "train/sim_loss": 0.14453125 + }, + { + "epoch": 0.06535495352976073, + "step": 661, + "train/total_loss": 0.3773316740989685 + }, + { + "entropy": 9.507204055786133, + "epoch": 0.06545382637927626, + "mean_token_accuracy": 0.7333333492279053, + "num_tokens": 3457287.0, + "step": 662, + "train/ce_loss": 0.7894484996795654 + }, + { + "epoch": 0.06545382637927626, + "step": 662, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.06545382637927626, + "step": 662, + "train/total_loss": 0.1805073618888855 + }, + { + "entropy": 9.868836402893066, + "epoch": 0.06555269922879177, + "mean_token_accuracy": 0.7058823704719543, + "num_tokens": 3462330.0, + "step": 663, + "train/ce_loss": 1.783589243888855 + }, + { + "epoch": 0.06555269922879177, + "step": 663, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.06555269922879177, + "step": 663, + "train/total_loss": 0.31898391246795654 + }, + { + "entropy": 9.411312103271484, + "epoch": 0.0656515720783073, + "mean_token_accuracy": 0.7514863014221191, + "num_tokens": 3467654.0, + "step": 664, + "train/ce_loss": 1.2834221124649048 + }, + { + "epoch": 0.0656515720783073, + "step": 664, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.0656515720783073, + "step": 664, + "train/total_loss": 0.22990471124649048 + }, + { + "entropy": 9.016120910644531, + "epoch": 0.06575044492782282, + "mean_token_accuracy": 0.7105262875556946, + "num_tokens": 3473114.0, + "step": 665, + "train/ce_loss": 1.0795176029205322 + }, + { + "epoch": 0.06575044492782282, + "step": 665, + "train/sim_loss": 0.1796875 + }, + { + "epoch": 0.06575044492782282, + "step": 665, + "train/total_loss": 0.2876392602920532 + }, + { + "entropy": 10.028396606445312, + "epoch": 0.06584931777733834, + "mean_token_accuracy": 0.8230912685394287, + "num_tokens": 3478105.0, + "step": 666, + "train/ce_loss": 1.5367385149002075 + }, + { + "epoch": 0.06584931777733834, + "step": 666, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.06584931777733834, + "step": 666, + "train/total_loss": 0.27086135745048523 + }, + { + "entropy": 9.48210620880127, + "epoch": 0.06594819062685386, + "mean_token_accuracy": 0.7305825352668762, + "num_tokens": 3483359.0, + "step": 667, + "train/ce_loss": 0.828326940536499 + }, + { + "epoch": 0.06594819062685386, + "step": 667, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.06594819062685386, + "step": 667, + "train/total_loss": 0.1453326940536499 + }, + { + "entropy": 9.934640884399414, + "epoch": 0.06604706347636939, + "mean_token_accuracy": 0.7124394178390503, + "num_tokens": 3488405.0, + "step": 668, + "train/ce_loss": 0.059258297085762024 + }, + { + "epoch": 0.06604706347636939, + "step": 668, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.06604706347636939, + "step": 668, + "train/total_loss": 0.09967582672834396 + }, + { + "entropy": 10.445979118347168, + "epoch": 0.06614593632588492, + "mean_token_accuracy": 0.7656765580177307, + "num_tokens": 3493089.0, + "step": 669, + "train/ce_loss": 0.12868449091911316 + }, + { + "epoch": 0.06614593632588492, + "step": 669, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.06614593632588492, + "step": 669, + "train/total_loss": 0.051930949091911316 + }, + { + "entropy": 9.246888160705566, + "epoch": 0.06624480917540043, + "mean_token_accuracy": 0.7436159253120422, + "num_tokens": 3498540.0, + "step": 670, + "train/ce_loss": 0.9152721166610718 + }, + { + "epoch": 0.06624480917540043, + "step": 670, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.06624480917540043, + "step": 670, + "train/total_loss": 0.16183972358703613 + }, + { + "entropy": 10.375326156616211, + "epoch": 0.06634368202491596, + "mean_token_accuracy": 0.6973684430122375, + "num_tokens": 3503331.0, + "step": 671, + "train/ce_loss": 2.0260884761810303 + }, + { + "epoch": 0.06634368202491596, + "step": 671, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.06634368202491596, + "step": 671, + "train/total_loss": 0.2729213535785675 + }, + { + "entropy": 10.073959350585938, + "epoch": 0.06644255487443149, + "mean_token_accuracy": 0.6776180863380432, + "num_tokens": 3508288.0, + "step": 672, + "train/ce_loss": 1.6511471271514893 + }, + { + "epoch": 0.06644255487443149, + "step": 672, + "train/sim_loss": 0.16796875 + }, + { + "epoch": 0.06644255487443149, + "step": 672, + "train/total_loss": 0.33308345079421997 + }, + { + "entropy": 9.359809875488281, + "epoch": 0.066541427723947, + "mean_token_accuracy": 0.7652173638343811, + "num_tokens": 3513547.0, + "step": 673, + "train/ce_loss": 0.8948929905891418 + }, + { + "epoch": 0.066541427723947, + "step": 673, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.066541427723947, + "step": 673, + "train/total_loss": 0.14808306097984314 + }, + { + "entropy": 9.301137924194336, + "epoch": 0.06664030057346253, + "mean_token_accuracy": 0.709046483039856, + "num_tokens": 3518803.0, + "step": 674, + "train/ce_loss": 1.2451282739639282 + }, + { + "epoch": 0.06664030057346253, + "step": 674, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.06664030057346253, + "step": 674, + "train/total_loss": 0.21826282143592834 + }, + { + "entropy": 9.372105598449707, + "epoch": 0.06673917342297805, + "mean_token_accuracy": 0.6800920367240906, + "num_tokens": 3524178.0, + "step": 675, + "train/ce_loss": 0.6253710389137268 + }, + { + "epoch": 0.06673917342297805, + "step": 675, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.06673917342297805, + "step": 675, + "train/total_loss": 0.17581835389137268 + }, + { + "entropy": 9.635863304138184, + "epoch": 0.06683804627249357, + "mean_token_accuracy": 0.7761394381523132, + "num_tokens": 3529381.0, + "step": 676, + "train/ce_loss": 0.8285779356956482 + }, + { + "epoch": 0.06683804627249357, + "step": 676, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.06683804627249357, + "step": 676, + "train/total_loss": 0.19613903760910034 + }, + { + "entropy": 9.848868370056152, + "epoch": 0.0669369191220091, + "mean_token_accuracy": 0.764026403427124, + "num_tokens": 3534381.0, + "step": 677, + "train/ce_loss": 1.0774803161621094 + }, + { + "epoch": 0.0669369191220091, + "step": 677, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.0669369191220091, + "step": 677, + "train/total_loss": 0.15071678161621094 + }, + { + "entropy": 9.583255767822266, + "epoch": 0.06703579197152462, + "mean_token_accuracy": 0.7356687784194946, + "num_tokens": 3539473.0, + "step": 678, + "train/ce_loss": 0.6760376691818237 + }, + { + "epoch": 0.06703579197152462, + "step": 678, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.06703579197152462, + "step": 678, + "train/total_loss": 0.12619751691818237 + }, + { + "entropy": 10.006082534790039, + "epoch": 0.06713466482104015, + "mean_token_accuracy": 0.7246891856193542, + "num_tokens": 3544441.0, + "step": 679, + "train/ce_loss": 0.05885869264602661 + }, + { + "epoch": 0.06713466482104015, + "step": 679, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.06713466482104015, + "step": 679, + "train/total_loss": 0.10354211926460266 + }, + { + "epoch": 0.06723353767055566, + "grad_norm": 1.089155673980713, + "learning_rate": 9.834594273846612e-06, + "loss": 0.1867, + "step": 680 + }, + { + "entropy": 9.445066452026367, + "epoch": 0.06723353767055566, + "mean_token_accuracy": 0.7359356880187988, + "num_tokens": 3549734.0, + "step": 680, + "train/ce_loss": 1.1421045064926147 + }, + { + "epoch": 0.06723353767055566, + "step": 680, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.06723353767055566, + "step": 680, + "train/total_loss": 0.22358545660972595 + }, + { + "entropy": 9.163797378540039, + "epoch": 0.06733241052007119, + "mean_token_accuracy": 0.6962421536445618, + "num_tokens": 3555145.0, + "step": 681, + "train/ce_loss": 0.4922032058238983 + }, + { + "epoch": 0.06733241052007119, + "step": 681, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.06733241052007119, + "step": 681, + "train/total_loss": 0.15468907356262207 + }, + { + "entropy": 9.931070327758789, + "epoch": 0.06743128336958672, + "mean_token_accuracy": 0.7162162065505981, + "num_tokens": 3560215.0, + "step": 682, + "train/ce_loss": 2.0106382369995117 + }, + { + "epoch": 0.06743128336958672, + "step": 682, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.06743128336958672, + "step": 682, + "train/total_loss": 0.3260638117790222 + }, + { + "entropy": 9.990118980407715, + "epoch": 0.06753015621910223, + "mean_token_accuracy": 0.7467700242996216, + "num_tokens": 3565072.0, + "step": 683, + "train/ce_loss": 1.3418598175048828 + }, + { + "epoch": 0.06753015621910223, + "step": 683, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.06753015621910223, + "step": 683, + "train/total_loss": 0.2591859698295593 + }, + { + "entropy": 9.619613647460938, + "epoch": 0.06762902906861776, + "mean_token_accuracy": 0.7405857443809509, + "num_tokens": 3570234.0, + "step": 684, + "train/ce_loss": 1.0212657451629639 + }, + { + "epoch": 0.06762902906861776, + "step": 684, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.06762902906861776, + "step": 684, + "train/total_loss": 0.1997828185558319 + }, + { + "entropy": 9.682656288146973, + "epoch": 0.06772790191813328, + "mean_token_accuracy": 0.7384615540504456, + "num_tokens": 3575280.0, + "step": 685, + "train/ce_loss": 1.205757737159729 + }, + { + "epoch": 0.06772790191813328, + "step": 685, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.06772790191813328, + "step": 685, + "train/total_loss": 0.19870078563690186 + }, + { + "entropy": 9.31159782409668, + "epoch": 0.0678267747676488, + "mean_token_accuracy": 0.7185016870498657, + "num_tokens": 3580607.0, + "step": 686, + "train/ce_loss": 0.6654388308525085 + }, + { + "epoch": 0.0678267747676488, + "step": 686, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.0678267747676488, + "step": 686, + "train/total_loss": 0.12513762712478638 + }, + { + "entropy": 9.789628982543945, + "epoch": 0.06792564761716433, + "mean_token_accuracy": 0.749627411365509, + "num_tokens": 3585915.0, + "step": 687, + "train/ce_loss": 1.253875494003296 + }, + { + "epoch": 0.06792564761716433, + "step": 687, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.06792564761716433, + "step": 687, + "train/total_loss": 0.2425750494003296 + }, + { + "entropy": 9.748092651367188, + "epoch": 0.06802452046667985, + "mean_token_accuracy": 0.6965517401695251, + "num_tokens": 3590926.0, + "step": 688, + "train/ce_loss": 1.903677225112915 + }, + { + "epoch": 0.06802452046667985, + "step": 688, + "train/sim_loss": 0.19921875 + }, + { + "epoch": 0.06802452046667985, + "step": 688, + "train/total_loss": 0.389586478471756 + }, + { + "entropy": 9.030678749084473, + "epoch": 0.06812339331619537, + "mean_token_accuracy": 0.779033899307251, + "num_tokens": 3596414.0, + "step": 689, + "train/ce_loss": 1.0212630033493042 + }, + { + "epoch": 0.06812339331619537, + "step": 689, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.06812339331619537, + "step": 689, + "train/total_loss": 0.22322005033493042 + }, + { + "entropy": 10.724943161010742, + "epoch": 0.0682222661657109, + "mean_token_accuracy": 0.8090452551841736, + "num_tokens": 3600992.0, + "step": 690, + "train/ce_loss": 0.20112809538841248 + }, + { + "epoch": 0.0682222661657109, + "step": 690, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.0682222661657109, + "step": 690, + "train/total_loss": 0.08261281251907349 + }, + { + "entropy": 9.349640846252441, + "epoch": 0.06832113901522642, + "mean_token_accuracy": 0.7215777039527893, + "num_tokens": 3606369.0, + "step": 691, + "train/ce_loss": 1.2464035749435425 + }, + { + "epoch": 0.06832113901522642, + "step": 691, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.06832113901522642, + "step": 691, + "train/total_loss": 0.2496403604745865 + }, + { + "entropy": 9.738096237182617, + "epoch": 0.06842001186474195, + "mean_token_accuracy": 0.736923098564148, + "num_tokens": 3611444.0, + "step": 692, + "train/ce_loss": 1.0789051055908203 + }, + { + "epoch": 0.06842001186474195, + "step": 692, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.06842001186474195, + "step": 692, + "train/total_loss": 0.1742967665195465 + }, + { + "entropy": 9.667228698730469, + "epoch": 0.06851888471425746, + "mean_token_accuracy": 0.724397599697113, + "num_tokens": 3616530.0, + "step": 693, + "train/ce_loss": 1.6246271133422852 + }, + { + "epoch": 0.06851888471425746, + "step": 693, + "train/sim_loss": 0.14453125 + }, + { + "epoch": 0.06851888471425746, + "step": 693, + "train/total_loss": 0.3069939613342285 + }, + { + "entropy": 9.364891052246094, + "epoch": 0.06861775756377299, + "mean_token_accuracy": 0.7209567427635193, + "num_tokens": 3621889.0, + "step": 694, + "train/ce_loss": 1.0871267318725586 + }, + { + "epoch": 0.06861775756377299, + "step": 694, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.06861775756377299, + "step": 694, + "train/total_loss": 0.24152517318725586 + }, + { + "entropy": 10.386150360107422, + "epoch": 0.06871663041328852, + "mean_token_accuracy": 0.7190082669258118, + "num_tokens": 3626639.0, + "step": 695, + "train/ce_loss": 0.08880459517240524 + }, + { + "epoch": 0.06871663041328852, + "step": 695, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.06871663041328852, + "step": 695, + "train/total_loss": 0.047942958772182465 + }, + { + "entropy": 9.494813919067383, + "epoch": 0.06881550326280403, + "mean_token_accuracy": 0.6580311059951782, + "num_tokens": 3631873.0, + "step": 696, + "train/ce_loss": 0.921953558921814 + }, + { + "epoch": 0.06881550326280403, + "step": 696, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.06881550326280403, + "step": 696, + "train/total_loss": 0.17032036185264587 + }, + { + "entropy": 9.156608581542969, + "epoch": 0.06891437611231956, + "mean_token_accuracy": 0.7486573457717896, + "num_tokens": 3637255.0, + "step": 697, + "train/ce_loss": 0.3901670277118683 + }, + { + "epoch": 0.06891437611231956, + "step": 697, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.06891437611231956, + "step": 697, + "train/total_loss": 0.1327667087316513 + }, + { + "entropy": 9.285764694213867, + "epoch": 0.06901324896183508, + "mean_token_accuracy": 0.6714628338813782, + "num_tokens": 3642508.0, + "step": 698, + "train/ce_loss": 1.7411725521087646 + }, + { + "epoch": 0.06901324896183508, + "step": 698, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.06901324896183508, + "step": 698, + "train/total_loss": 0.2913047671318054 + }, + { + "entropy": 9.458796501159668, + "epoch": 0.0691121218113506, + "mean_token_accuracy": 0.7443181872367859, + "num_tokens": 3647698.0, + "step": 699, + "train/ce_loss": 0.5074321627616882 + }, + { + "epoch": 0.0691121218113506, + "step": 699, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.0691121218113506, + "step": 699, + "train/total_loss": 0.1249619722366333 + }, + { + "epoch": 0.06921099466086612, + "grad_norm": 1.3604364395141602, + "learning_rate": 9.829649409088662e-06, + "loss": 0.1957, + "step": 700 + }, + { + "entropy": 9.569518089294434, + "epoch": 0.06921099466086612, + "mean_token_accuracy": 0.737051784992218, + "num_tokens": 3652914.0, + "step": 700, + "train/ce_loss": 0.03997810557484627 + }, + { + "epoch": 0.06921099466086612, + "step": 700, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.06921099466086612, + "step": 700, + "train/total_loss": 0.1016540601849556 + }, + { + "entropy": 9.300058364868164, + "epoch": 0.06930986751038165, + "mean_token_accuracy": 0.701545774936676, + "num_tokens": 3658230.0, + "step": 701, + "train/ce_loss": 1.087414264678955 + }, + { + "epoch": 0.06930986751038165, + "step": 701, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.06930986751038165, + "step": 701, + "train/total_loss": 0.24936643242835999 + }, + { + "entropy": 8.97523307800293, + "epoch": 0.06940874035989718, + "mean_token_accuracy": 0.7360979914665222, + "num_tokens": 3663780.0, + "step": 702, + "train/ce_loss": 0.4771391451358795 + }, + { + "epoch": 0.06940874035989718, + "step": 702, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.06940874035989718, + "step": 702, + "train/total_loss": 0.10630767047405243 + }, + { + "entropy": 9.283151626586914, + "epoch": 0.06950761320941269, + "mean_token_accuracy": 0.7625418305397034, + "num_tokens": 3669110.0, + "step": 703, + "train/ce_loss": 0.7032759189605713 + }, + { + "epoch": 0.06950761320941269, + "step": 703, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.06950761320941269, + "step": 703, + "train/total_loss": 0.11720259487628937 + }, + { + "entropy": 9.970033645629883, + "epoch": 0.06960648605892822, + "mean_token_accuracy": 0.6790606379508972, + "num_tokens": 3674050.0, + "step": 704, + "train/ce_loss": 1.2082698345184326 + }, + { + "epoch": 0.06960648605892822, + "step": 704, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.06960648605892822, + "step": 704, + "train/total_loss": 0.20676448941230774 + }, + { + "entropy": 10.034111022949219, + "epoch": 0.06970535890844375, + "mean_token_accuracy": 0.7245762944221497, + "num_tokens": 3678912.0, + "step": 705, + "train/ce_loss": 0.08424589782953262 + }, + { + "epoch": 0.06970535890844375, + "step": 705, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.06970535890844375, + "step": 705, + "train/total_loss": 0.09436208754777908 + }, + { + "entropy": 10.111080169677734, + "epoch": 0.06980423175795926, + "mean_token_accuracy": 0.6854838728904724, + "num_tokens": 3683843.0, + "step": 706, + "train/ce_loss": 1.994897484779358 + }, + { + "epoch": 0.06980423175795926, + "step": 706, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.06980423175795926, + "step": 706, + "train/total_loss": 0.3323022723197937 + }, + { + "entropy": 9.114567756652832, + "epoch": 0.06990310460747479, + "mean_token_accuracy": 0.7799385786056519, + "num_tokens": 3689314.0, + "step": 707, + "train/ce_loss": 0.48830723762512207 + }, + { + "epoch": 0.06990310460747479, + "step": 707, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.06990310460747479, + "step": 707, + "train/total_loss": 0.09179947525262833 + }, + { + "entropy": 8.870497703552246, + "epoch": 0.07000197745699031, + "mean_token_accuracy": 0.7571288347244263, + "num_tokens": 3694841.0, + "step": 708, + "train/ce_loss": 0.9047189950942993 + }, + { + "epoch": 0.07000197745699031, + "step": 708, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.07000197745699031, + "step": 708, + "train/total_loss": 0.12562814354896545 + }, + { + "entropy": 9.620412826538086, + "epoch": 0.07010085030650583, + "mean_token_accuracy": 0.7257575988769531, + "num_tokens": 3700157.0, + "step": 709, + "train/ce_loss": 0.427399605512619 + }, + { + "epoch": 0.07010085030650583, + "step": 709, + "train/sim_loss": 0.18359375 + }, + { + "epoch": 0.07010085030650583, + "step": 709, + "train/total_loss": 0.22633370757102966 + }, + { + "entropy": 9.372359275817871, + "epoch": 0.07019972315602135, + "mean_token_accuracy": 0.7134146094322205, + "num_tokens": 3705452.0, + "step": 710, + "train/ce_loss": 0.5273782014846802 + }, + { + "epoch": 0.07019972315602135, + "step": 710, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.07019972315602135, + "step": 710, + "train/total_loss": 0.14648781716823578 + }, + { + "entropy": 10.039669036865234, + "epoch": 0.07029859600553688, + "mean_token_accuracy": 0.7710084319114685, + "num_tokens": 3710372.0, + "step": 711, + "train/ce_loss": 1.3590974807739258 + }, + { + "epoch": 0.07029859600553688, + "step": 711, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.07029859600553688, + "step": 711, + "train/total_loss": 0.22184725105762482 + }, + { + "entropy": 9.72056770324707, + "epoch": 0.07039746885505241, + "mean_token_accuracy": 0.7262658476829529, + "num_tokens": 3715474.0, + "step": 712, + "train/ce_loss": 1.278663992881775 + }, + { + "epoch": 0.07039746885505241, + "step": 712, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.07039746885505241, + "step": 712, + "train/total_loss": 0.26849138736724854 + }, + { + "entropy": 9.267029762268066, + "epoch": 0.07049634170456792, + "mean_token_accuracy": 0.7059509754180908, + "num_tokens": 3720799.0, + "step": 713, + "train/ce_loss": 0.4272323548793793 + }, + { + "epoch": 0.07049634170456792, + "step": 713, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.07049634170456792, + "step": 713, + "train/total_loss": 0.12866073846817017 + }, + { + "entropy": 9.170409202575684, + "epoch": 0.07059521455408345, + "mean_token_accuracy": 0.7936508059501648, + "num_tokens": 3726103.0, + "step": 714, + "train/ce_loss": 0.6024886965751648 + }, + { + "epoch": 0.07059521455408345, + "step": 714, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.07059521455408345, + "step": 714, + "train/total_loss": 0.13446761667728424 + }, + { + "entropy": 9.855649948120117, + "epoch": 0.07069408740359898, + "mean_token_accuracy": 0.6976743936538696, + "num_tokens": 3731103.0, + "step": 715, + "train/ce_loss": 0.04739635810256004 + }, + { + "epoch": 0.07069408740359898, + "step": 715, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.07069408740359898, + "step": 715, + "train/total_loss": 0.059427134692668915 + }, + { + "entropy": 9.538551330566406, + "epoch": 0.07079296025311449, + "mean_token_accuracy": 0.7252010703086853, + "num_tokens": 3736312.0, + "step": 716, + "train/ce_loss": 1.1417169570922852 + }, + { + "epoch": 0.07079296025311449, + "step": 716, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.07079296025311449, + "step": 716, + "train/total_loss": 0.22745294868946075 + }, + { + "entropy": 9.267305374145508, + "epoch": 0.07089183310263002, + "mean_token_accuracy": 0.7060086131095886, + "num_tokens": 3741658.0, + "step": 717, + "train/ce_loss": 0.7700828313827515 + }, + { + "epoch": 0.07089183310263002, + "step": 717, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.07089183310263002, + "step": 717, + "train/total_loss": 0.17075827717781067 + }, + { + "entropy": 9.660655975341797, + "epoch": 0.07099070595214554, + "mean_token_accuracy": 0.664505660533905, + "num_tokens": 3746718.0, + "step": 718, + "train/ce_loss": 1.1156566143035889 + }, + { + "epoch": 0.07099070595214554, + "step": 718, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.07099070595214554, + "step": 718, + "train/total_loss": 0.20140941441059113 + }, + { + "entropy": 9.282903671264648, + "epoch": 0.07108957880166106, + "mean_token_accuracy": 0.7349665760993958, + "num_tokens": 3752108.0, + "step": 719, + "train/ce_loss": 0.8221208453178406 + }, + { + "epoch": 0.07108957880166106, + "step": 719, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.07108957880166106, + "step": 719, + "train/total_loss": 0.19158709049224854 + }, + { + "epoch": 0.07118845165117658, + "grad_norm": 1.1553668975830078, + "learning_rate": 9.824704544330713e-06, + "loss": 0.1963, + "step": 720 + }, + { + "entropy": 9.120210647583008, + "epoch": 0.07118845165117658, + "mean_token_accuracy": 0.741631805896759, + "num_tokens": 3757553.0, + "step": 720, + "train/ce_loss": 0.8454121351242065 + }, + { + "epoch": 0.07118845165117658, + "step": 720, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.07118845165117658, + "step": 720, + "train/total_loss": 0.2329787164926529 + }, + { + "entropy": 9.471561431884766, + "epoch": 0.07128732450069211, + "mean_token_accuracy": 0.7384823560714722, + "num_tokens": 3762730.0, + "step": 721, + "train/ce_loss": 1.2541835308074951 + }, + { + "epoch": 0.07128732450069211, + "step": 721, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.07128732450069211, + "step": 721, + "train/total_loss": 0.26213711500167847 + }, + { + "entropy": 9.546029090881348, + "epoch": 0.07138619735020764, + "mean_token_accuracy": 0.7725321650505066, + "num_tokens": 3767914.0, + "step": 722, + "train/ce_loss": 0.8785749077796936 + }, + { + "epoch": 0.07138619735020764, + "step": 722, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.07138619735020764, + "step": 722, + "train/total_loss": 0.13863873481750488 + }, + { + "entropy": 9.323598861694336, + "epoch": 0.07148507019972315, + "mean_token_accuracy": 0.7405345439910889, + "num_tokens": 3773233.0, + "step": 723, + "train/ce_loss": 0.9451057314872742 + }, + { + "epoch": 0.07148507019972315, + "step": 723, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.07148507019972315, + "step": 723, + "train/total_loss": 0.19607308506965637 + }, + { + "entropy": 9.958362579345703, + "epoch": 0.07158394304923868, + "mean_token_accuracy": 0.7546531558036804, + "num_tokens": 3778258.0, + "step": 724, + "train/ce_loss": 1.2022948265075684 + }, + { + "epoch": 0.07158394304923868, + "step": 724, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.07158394304923868, + "step": 724, + "train/total_loss": 0.18663573265075684 + }, + { + "entropy": 9.390419006347656, + "epoch": 0.0716828158987542, + "mean_token_accuracy": 0.7416563630104065, + "num_tokens": 3783553.0, + "step": 725, + "train/ce_loss": 0.6417506337165833 + }, + { + "epoch": 0.0716828158987542, + "step": 725, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.0716828158987542, + "step": 725, + "train/total_loss": 0.1618313193321228 + }, + { + "entropy": 9.722644805908203, + "epoch": 0.07178168874826972, + "mean_token_accuracy": 0.7279411554336548, + "num_tokens": 3788630.0, + "step": 726, + "train/ce_loss": 1.2358253002166748 + }, + { + "epoch": 0.07178168874826972, + "step": 726, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.07178168874826972, + "step": 726, + "train/total_loss": 0.23295754194259644 + }, + { + "entropy": 10.481596946716309, + "epoch": 0.07188056159778525, + "mean_token_accuracy": 0.6655948758125305, + "num_tokens": 3793333.0, + "step": 727, + "train/ce_loss": 0.035042211413383484 + }, + { + "epoch": 0.07188056159778525, + "step": 727, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.07188056159778525, + "step": 727, + "train/total_loss": 0.05428547039628029 + }, + { + "entropy": 9.238446235656738, + "epoch": 0.07197943444730077, + "mean_token_accuracy": 0.7754868268966675, + "num_tokens": 3798691.0, + "step": 728, + "train/ce_loss": 0.4300273656845093 + }, + { + "epoch": 0.07197943444730077, + "step": 728, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.07197943444730077, + "step": 728, + "train/total_loss": 0.14065898954868317 + }, + { + "entropy": 9.407970428466797, + "epoch": 0.07207830729681629, + "mean_token_accuracy": 0.7565789222717285, + "num_tokens": 3803934.0, + "step": 729, + "train/ce_loss": 0.6835651397705078 + }, + { + "epoch": 0.07207830729681629, + "step": 729, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.07207830729681629, + "step": 729, + "train/total_loss": 0.13085651397705078 + }, + { + "entropy": 9.088936805725098, + "epoch": 0.07217718014633182, + "mean_token_accuracy": 0.708737850189209, + "num_tokens": 3809399.0, + "step": 730, + "train/ce_loss": 0.6989524364471436 + }, + { + "epoch": 0.07217718014633182, + "step": 730, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.07217718014633182, + "step": 730, + "train/total_loss": 0.15192648768424988 + }, + { + "entropy": 9.598482131958008, + "epoch": 0.07227605299584734, + "mean_token_accuracy": 0.7364568114280701, + "num_tokens": 3814590.0, + "step": 731, + "train/ce_loss": 1.3085780143737793 + }, + { + "epoch": 0.07227605299584734, + "step": 731, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.07227605299584734, + "step": 731, + "train/total_loss": 0.22070156037807465 + }, + { + "entropy": 10.219598770141602, + "epoch": 0.07237492584536287, + "mean_token_accuracy": 0.75, + "num_tokens": 3819480.0, + "step": 732, + "train/ce_loss": 0.05476241931319237 + }, + { + "epoch": 0.07237492584536287, + "step": 732, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.07237492584536287, + "step": 732, + "train/total_loss": 0.07969499379396439 + }, + { + "entropy": 9.48927116394043, + "epoch": 0.07247379869487838, + "mean_token_accuracy": 0.7329114079475403, + "num_tokens": 3824729.0, + "step": 733, + "train/ce_loss": 0.7749419212341309 + }, + { + "epoch": 0.07247379869487838, + "step": 733, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.07247379869487838, + "step": 733, + "train/total_loss": 0.15952545404434204 + }, + { + "entropy": 9.768869400024414, + "epoch": 0.07257267154439391, + "mean_token_accuracy": 0.6836734414100647, + "num_tokens": 3829855.0, + "step": 734, + "train/ce_loss": 1.120697259902954 + }, + { + "epoch": 0.07257267154439391, + "step": 734, + "train/sim_loss": 0.14453125 + }, + { + "epoch": 0.07257267154439391, + "step": 734, + "train/total_loss": 0.2566009759902954 + }, + { + "entropy": 9.667350769042969, + "epoch": 0.07267154439390944, + "mean_token_accuracy": 0.7994186282157898, + "num_tokens": 3834992.0, + "step": 735, + "train/ce_loss": 0.8130565881729126 + }, + { + "epoch": 0.07267154439390944, + "step": 735, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.07267154439390944, + "step": 735, + "train/total_loss": 0.14771190285682678 + }, + { + "entropy": 9.413403511047363, + "epoch": 0.07277041724342495, + "mean_token_accuracy": 0.7240506410598755, + "num_tokens": 3840270.0, + "step": 736, + "train/ce_loss": 0.9753805994987488 + }, + { + "epoch": 0.07277041724342495, + "step": 736, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.07277041724342495, + "step": 736, + "train/total_loss": 0.2186318039894104 + }, + { + "entropy": 10.649709701538086, + "epoch": 0.07286929009294048, + "mean_token_accuracy": 0.7428571581840515, + "num_tokens": 3844835.0, + "step": 737, + "train/ce_loss": 0.0780835896730423 + }, + { + "epoch": 0.07286929009294048, + "step": 737, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.07286929009294048, + "step": 737, + "train/total_loss": 0.05858960747718811 + }, + { + "entropy": 9.241476058959961, + "epoch": 0.072968162942456, + "mean_token_accuracy": 0.7703016400337219, + "num_tokens": 3850202.0, + "step": 738, + "train/ce_loss": 0.5357682704925537 + }, + { + "epoch": 0.072968162942456, + "step": 738, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.072968162942456, + "step": 738, + "train/total_loss": 0.10045182704925537 + }, + { + "entropy": 10.102038383483887, + "epoch": 0.07306703579197152, + "mean_token_accuracy": 0.6833333373069763, + "num_tokens": 3855122.0, + "step": 739, + "train/ce_loss": 0.037747181951999664 + }, + { + "epoch": 0.07306703579197152, + "step": 739, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.07306703579197152, + "step": 739, + "train/total_loss": 0.0975247174501419 + }, + { + "epoch": 0.07316590864148705, + "grad_norm": 1.4861098527908325, + "learning_rate": 9.819759679572765e-06, + "loss": 0.1844, + "step": 740 + }, + { + "entropy": 9.190587997436523, + "epoch": 0.07316590864148705, + "mean_token_accuracy": 0.6952695250511169, + "num_tokens": 3860520.0, + "step": 740, + "train/ce_loss": 0.8424211740493774 + }, + { + "epoch": 0.07316590864148705, + "step": 740, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.07316590864148705, + "step": 740, + "train/total_loss": 0.16627337038516998 + }, + { + "entropy": 9.568659782409668, + "epoch": 0.07326478149100257, + "mean_token_accuracy": 0.7337278127670288, + "num_tokens": 3865690.0, + "step": 741, + "train/ce_loss": 0.713638961315155 + }, + { + "epoch": 0.07326478149100257, + "step": 741, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.07326478149100257, + "step": 741, + "train/total_loss": 0.1573013961315155 + }, + { + "entropy": 9.289871215820312, + "epoch": 0.0733636543405181, + "mean_token_accuracy": 0.7718120813369751, + "num_tokens": 3871187.0, + "step": 742, + "train/ce_loss": 0.6669222712516785 + }, + { + "epoch": 0.0733636543405181, + "step": 742, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.0733636543405181, + "step": 742, + "train/total_loss": 0.13700473308563232 + }, + { + "entropy": 9.741256713867188, + "epoch": 0.07346252719003361, + "mean_token_accuracy": 0.7789046764373779, + "num_tokens": 3876114.0, + "step": 743, + "train/ce_loss": 0.697907567024231 + }, + { + "epoch": 0.07346252719003361, + "step": 743, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.07346252719003361, + "step": 743, + "train/total_loss": 0.15572825074195862 + }, + { + "entropy": 9.256278038024902, + "epoch": 0.07356140003954914, + "mean_token_accuracy": 0.8060781359672546, + "num_tokens": 3881308.0, + "step": 744, + "train/ce_loss": 0.8655441403388977 + }, + { + "epoch": 0.07356140003954914, + "step": 744, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.07356140003954914, + "step": 744, + "train/total_loss": 0.1529606580734253 + }, + { + "entropy": 9.161439895629883, + "epoch": 0.07366027288906467, + "mean_token_accuracy": 0.7142857313156128, + "num_tokens": 3886614.0, + "step": 745, + "train/ce_loss": 0.6780821084976196 + }, + { + "epoch": 0.07366027288906467, + "step": 745, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.07366027288906467, + "step": 745, + "train/total_loss": 0.20452696084976196 + }, + { + "entropy": 9.79833984375, + "epoch": 0.07375914573858018, + "mean_token_accuracy": 0.6719492673873901, + "num_tokens": 3891750.0, + "step": 746, + "train/ce_loss": 0.010583448223769665 + }, + { + "epoch": 0.07375914573858018, + "step": 746, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.07375914573858018, + "step": 746, + "train/total_loss": 0.07918334752321243 + }, + { + "entropy": 9.648619651794434, + "epoch": 0.07385801858809571, + "mean_token_accuracy": 0.7537190318107605, + "num_tokens": 3896805.0, + "step": 747, + "train/ce_loss": 1.2625455856323242 + }, + { + "epoch": 0.07385801858809571, + "step": 747, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.07385801858809571, + "step": 747, + "train/total_loss": 0.23172330856323242 + }, + { + "entropy": 9.038060188293457, + "epoch": 0.07395689143761124, + "mean_token_accuracy": 0.7724867463111877, + "num_tokens": 3902211.0, + "step": 748, + "train/ce_loss": 0.7919908761978149 + }, + { + "epoch": 0.07395689143761124, + "step": 748, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.07395689143761124, + "step": 748, + "train/total_loss": 0.17685534060001373 + }, + { + "entropy": 9.768566131591797, + "epoch": 0.07405576428712675, + "mean_token_accuracy": 0.6908283829689026, + "num_tokens": 3907482.0, + "step": 749, + "train/ce_loss": 1.213486671447754 + }, + { + "epoch": 0.07405576428712675, + "step": 749, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.07405576428712675, + "step": 749, + "train/total_loss": 0.26197367906570435 + }, + { + "entropy": 9.529090881347656, + "epoch": 0.07415463713664228, + "mean_token_accuracy": 0.769784152507782, + "num_tokens": 3912423.0, + "step": 750, + "train/ce_loss": 1.1830240488052368 + }, + { + "epoch": 0.07415463713664228, + "step": 750, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.07415463713664228, + "step": 750, + "train/total_loss": 0.23548990488052368 + }, + { + "entropy": 9.040364265441895, + "epoch": 0.0742535099861578, + "mean_token_accuracy": 0.73758864402771, + "num_tokens": 3917883.0, + "step": 751, + "train/ce_loss": 0.48614466190338135 + }, + { + "epoch": 0.0742535099861578, + "step": 751, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.0742535099861578, + "step": 751, + "train/total_loss": 0.1462707221508026 + }, + { + "entropy": 9.44776725769043, + "epoch": 0.07435238283567333, + "mean_token_accuracy": 0.6906565427780151, + "num_tokens": 3923172.0, + "step": 752, + "train/ce_loss": 1.3117228746414185 + }, + { + "epoch": 0.07435238283567333, + "step": 752, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.07435238283567333, + "step": 752, + "train/total_loss": 0.2522660493850708 + }, + { + "entropy": 9.157966613769531, + "epoch": 0.07445125568518884, + "mean_token_accuracy": 0.7769230604171753, + "num_tokens": 3928570.0, + "step": 753, + "train/ce_loss": 0.8286904096603394 + }, + { + "epoch": 0.07445125568518884, + "step": 753, + "train/sim_loss": 0.15625 + }, + { + "epoch": 0.07445125568518884, + "step": 753, + "train/total_loss": 0.2391190528869629 + }, + { + "entropy": 9.417069435119629, + "epoch": 0.07455012853470437, + "mean_token_accuracy": 0.7154285907745361, + "num_tokens": 3934052.0, + "step": 754, + "train/ce_loss": 1.1700571775436401 + }, + { + "epoch": 0.07455012853470437, + "step": 754, + "train/sim_loss": 0.1796875 + }, + { + "epoch": 0.07455012853470437, + "step": 754, + "train/total_loss": 0.29669320583343506 + }, + { + "entropy": 9.932807922363281, + "epoch": 0.0746490013842199, + "mean_token_accuracy": 0.7793851494789124, + "num_tokens": 3939025.0, + "step": 755, + "train/ce_loss": 0.0001756744022713974 + }, + { + "epoch": 0.0746490013842199, + "step": 755, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.0746490013842199, + "step": 755, + "train/total_loss": 0.058611318469047546 + }, + { + "entropy": 10.180121421813965, + "epoch": 0.07474787423373541, + "mean_token_accuracy": 0.769444465637207, + "num_tokens": 3943785.0, + "step": 756, + "train/ce_loss": 2.040376901626587 + }, + { + "epoch": 0.07474787423373541, + "step": 756, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.07474787423373541, + "step": 756, + "train/total_loss": 0.30950644612312317 + }, + { + "entropy": 9.88981819152832, + "epoch": 0.07484674708325094, + "mean_token_accuracy": 0.7750439643859863, + "num_tokens": 3948807.0, + "step": 757, + "train/ce_loss": 0.9959053993225098 + }, + { + "epoch": 0.07484674708325094, + "step": 757, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.07484674708325094, + "step": 757, + "train/total_loss": 0.20115303993225098 + }, + { + "entropy": 9.706342697143555, + "epoch": 0.07494561993276647, + "mean_token_accuracy": 0.6854838728904724, + "num_tokens": 3953879.0, + "step": 758, + "train/ce_loss": 2.2091052532196045 + }, + { + "epoch": 0.07494561993276647, + "step": 758, + "train/sim_loss": 0.15625 + }, + { + "epoch": 0.07494561993276647, + "step": 758, + "train/total_loss": 0.37716054916381836 + }, + { + "entropy": 10.303378105163574, + "epoch": 0.07504449278228198, + "mean_token_accuracy": 0.7054455280303955, + "num_tokens": 3958686.0, + "step": 759, + "train/ce_loss": 0.00018329703016206622 + }, + { + "epoch": 0.07504449278228198, + "step": 759, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.07504449278228198, + "step": 759, + "train/total_loss": 0.09376832842826843 + }, + { + "epoch": 0.0751433656317975, + "grad_norm": 1.2485485076904297, + "learning_rate": 9.814814814814815e-06, + "loss": 0.1817, + "step": 760 + }, + { + "entropy": 9.339644432067871, + "epoch": 0.0751433656317975, + "mean_token_accuracy": 0.7108753323554993, + "num_tokens": 3963948.0, + "step": 760, + "train/ce_loss": 0.7239080667495728 + }, + { + "epoch": 0.0751433656317975, + "step": 760, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.0751433656317975, + "step": 760, + "train/total_loss": 0.20129705965518951 + }, + { + "entropy": 9.127784729003906, + "epoch": 0.07524223848131303, + "mean_token_accuracy": 0.7497155666351318, + "num_tokens": 3969361.0, + "step": 761, + "train/ce_loss": 0.8279957175254822 + }, + { + "epoch": 0.07524223848131303, + "step": 761, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.07524223848131303, + "step": 761, + "train/total_loss": 0.12186207622289658 + }, + { + "entropy": 10.072265625, + "epoch": 0.07534111133082856, + "mean_token_accuracy": 0.7269155383110046, + "num_tokens": 3974483.0, + "step": 762, + "train/ce_loss": 1.78815495967865 + }, + { + "epoch": 0.07534111133082856, + "step": 762, + "train/sim_loss": 0.14453125 + }, + { + "epoch": 0.07534111133082856, + "step": 762, + "train/total_loss": 0.32334673404693604 + }, + { + "entropy": 8.886787414550781, + "epoch": 0.07543998418034407, + "mean_token_accuracy": 0.7236328125, + "num_tokens": 3980013.0, + "step": 763, + "train/ce_loss": 0.7672596573829651 + }, + { + "epoch": 0.07543998418034407, + "step": 763, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.07543998418034407, + "step": 763, + "train/total_loss": 0.10406971722841263 + }, + { + "entropy": 9.612545013427734, + "epoch": 0.0755388570298596, + "mean_token_accuracy": 0.7057291865348816, + "num_tokens": 3985241.0, + "step": 764, + "train/ce_loss": 0.0003228633722756058 + }, + { + "epoch": 0.0755388570298596, + "step": 764, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.0755388570298596, + "step": 764, + "train/total_loss": 0.12503229081630707 + }, + { + "entropy": 10.012237548828125, + "epoch": 0.07563772987937513, + "mean_token_accuracy": 0.7092783451080322, + "num_tokens": 3990048.0, + "step": 765, + "train/ce_loss": 0.00010308609489584342 + }, + { + "epoch": 0.07563772987937513, + "step": 765, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.07563772987937513, + "step": 765, + "train/total_loss": 0.07813531160354614 + }, + { + "entropy": 10.73228645324707, + "epoch": 0.07573660272889064, + "mean_token_accuracy": 0.7272727489471436, + "num_tokens": 3994606.0, + "step": 766, + "train/ce_loss": 3.5277905464172363 + }, + { + "epoch": 0.07573660272889064, + "step": 766, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.07573660272889064, + "step": 766, + "train/total_loss": 0.4699665606021881 + }, + { + "entropy": 10.159137725830078, + "epoch": 0.07583547557840617, + "mean_token_accuracy": 0.7402299046516418, + "num_tokens": 3999438.0, + "step": 767, + "train/ce_loss": 0.00011631346569629386 + }, + { + "epoch": 0.07583547557840617, + "step": 767, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.07583547557840617, + "step": 767, + "train/total_loss": 0.0703241303563118 + }, + { + "entropy": 9.458081245422363, + "epoch": 0.0759343484279217, + "mean_token_accuracy": 0.7712082266807556, + "num_tokens": 4004694.0, + "step": 768, + "train/ce_loss": 0.7753697633743286 + }, + { + "epoch": 0.0759343484279217, + "step": 768, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.0759343484279217, + "step": 768, + "train/total_loss": 0.11269322782754898 + }, + { + "entropy": 9.82442855834961, + "epoch": 0.07603322127743721, + "mean_token_accuracy": 0.7715231776237488, + "num_tokens": 4009700.0, + "step": 769, + "train/ce_loss": 0.6574856042861938 + }, + { + "epoch": 0.07603322127743721, + "step": 769, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.07603322127743721, + "step": 769, + "train/total_loss": 0.14387357234954834 + }, + { + "entropy": 9.83502197265625, + "epoch": 0.07613209412695274, + "mean_token_accuracy": 0.738831639289856, + "num_tokens": 4014669.0, + "step": 770, + "train/ce_loss": 0.7562925219535828 + }, + { + "epoch": 0.07613209412695274, + "step": 770, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.07613209412695274, + "step": 770, + "train/total_loss": 0.16156676411628723 + }, + { + "entropy": 9.765600204467773, + "epoch": 0.07623096697646826, + "mean_token_accuracy": 0.737942099571228, + "num_tokens": 4019740.0, + "step": 771, + "train/ce_loss": 6.540792674059048e-05 + }, + { + "epoch": 0.07623096697646826, + "step": 771, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.07623096697646826, + "step": 771, + "train/total_loss": 0.0859440416097641 + }, + { + "entropy": 9.84894847869873, + "epoch": 0.07632983982598378, + "mean_token_accuracy": 0.743922233581543, + "num_tokens": 4024689.0, + "step": 772, + "train/ce_loss": 5.4590320360148326e-05 + }, + { + "epoch": 0.07632983982598378, + "step": 772, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.07632983982598378, + "step": 772, + "train/total_loss": 0.08203671127557755 + }, + { + "entropy": 9.404129981994629, + "epoch": 0.0764287126754993, + "mean_token_accuracy": 0.7545219659805298, + "num_tokens": 4029914.0, + "step": 773, + "train/ce_loss": 0.7593308091163635 + }, + { + "epoch": 0.0764287126754993, + "step": 773, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.0764287126754993, + "step": 773, + "train/total_loss": 0.1657768338918686 + }, + { + "entropy": 10.176400184631348, + "epoch": 0.07652758552501483, + "mean_token_accuracy": 0.6865979433059692, + "num_tokens": 4034779.0, + "step": 774, + "train/ce_loss": 7.022523641353473e-05 + }, + { + "epoch": 0.07652758552501483, + "step": 774, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.07652758552501483, + "step": 774, + "train/total_loss": 0.0507882721722126 + }, + { + "entropy": 9.250575065612793, + "epoch": 0.07662645837453036, + "mean_token_accuracy": 0.704395592212677, + "num_tokens": 4040187.0, + "step": 775, + "train/ce_loss": 0.5897475481033325 + }, + { + "epoch": 0.07662645837453036, + "step": 775, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.07662645837453036, + "step": 775, + "train/total_loss": 0.1331935077905655 + }, + { + "entropy": 9.51695728302002, + "epoch": 0.07672533122404587, + "mean_token_accuracy": 0.7522255182266235, + "num_tokens": 4045271.0, + "step": 776, + "train/ce_loss": 5.094786683912389e-05 + }, + { + "epoch": 0.07672533122404587, + "step": 776, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.07672533122404587, + "step": 776, + "train/total_loss": 0.1015675961971283 + }, + { + "entropy": 9.079865455627441, + "epoch": 0.0768242040735614, + "mean_token_accuracy": 0.7241014838218689, + "num_tokens": 4050727.0, + "step": 777, + "train/ce_loss": 1.0284991264343262 + }, + { + "epoch": 0.0768242040735614, + "step": 777, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.0768242040735614, + "step": 777, + "train/total_loss": 0.22784991562366486 + }, + { + "entropy": 9.44433307647705, + "epoch": 0.07692307692307693, + "mean_token_accuracy": 0.7258262038230896, + "num_tokens": 4056011.0, + "step": 778, + "train/ce_loss": 1.0514321327209473 + }, + { + "epoch": 0.07692307692307693, + "step": 778, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.07692307692307693, + "step": 778, + "train/total_loss": 0.1988932192325592 + }, + { + "entropy": 9.88050651550293, + "epoch": 0.07702194977259244, + "mean_token_accuracy": 0.7445651888847351, + "num_tokens": 4061016.0, + "step": 779, + "train/ce_loss": 1.3254461288452148 + }, + { + "epoch": 0.07702194977259244, + "step": 779, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.07702194977259244, + "step": 779, + "train/total_loss": 0.2380133718252182 + }, + { + "epoch": 0.07712082262210797, + "grad_norm": 1.189727544784546, + "learning_rate": 9.809869950056868e-06, + "loss": 0.1784, + "step": 780 + }, + { + "entropy": 9.282702445983887, + "epoch": 0.07712082262210797, + "mean_token_accuracy": 0.7541191577911377, + "num_tokens": 4066318.0, + "step": 780, + "train/ce_loss": 0.8596696257591248 + }, + { + "epoch": 0.07712082262210797, + "step": 780, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.07712082262210797, + "step": 780, + "train/total_loss": 0.18752947449684143 + }, + { + "entropy": 9.363134384155273, + "epoch": 0.0772196954716235, + "mean_token_accuracy": 0.7042440176010132, + "num_tokens": 4071532.0, + "step": 781, + "train/ce_loss": 1.0713800191879272 + }, + { + "epoch": 0.0772196954716235, + "step": 781, + "train/sim_loss": 0.15234375 + }, + { + "epoch": 0.0772196954716235, + "step": 781, + "train/total_loss": 0.2594817578792572 + }, + { + "entropy": 9.268877983093262, + "epoch": 0.07731856832113901, + "mean_token_accuracy": 0.7172653675079346, + "num_tokens": 4076917.0, + "step": 782, + "train/ce_loss": 0.9807270765304565 + }, + { + "epoch": 0.07731856832113901, + "step": 782, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.07731856832113901, + "step": 782, + "train/total_loss": 0.18010395765304565 + }, + { + "entropy": 9.296899795532227, + "epoch": 0.07741744117065454, + "mean_token_accuracy": 0.7200435996055603, + "num_tokens": 4082478.0, + "step": 783, + "train/ce_loss": 0.713777482509613 + }, + { + "epoch": 0.07741744117065454, + "step": 783, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.07741744117065454, + "step": 783, + "train/total_loss": 0.16903400421142578 + }, + { + "entropy": 9.328934669494629, + "epoch": 0.07751631402017006, + "mean_token_accuracy": 0.7242236137390137, + "num_tokens": 4087763.0, + "step": 784, + "train/ce_loss": 0.4932112991809845 + }, + { + "epoch": 0.07751631402017006, + "step": 784, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.07751631402017006, + "step": 784, + "train/total_loss": 0.18213362991809845 + }, + { + "entropy": 9.931402206420898, + "epoch": 0.07761518686968559, + "mean_token_accuracy": 0.810234546661377, + "num_tokens": 4092642.0, + "step": 785, + "train/ce_loss": 0.8886024951934814 + }, + { + "epoch": 0.07761518686968559, + "step": 785, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.07761518686968559, + "step": 785, + "train/total_loss": 0.12792274355888367 + }, + { + "entropy": 8.978738784790039, + "epoch": 0.0777140597192011, + "mean_token_accuracy": 0.7637795209884644, + "num_tokens": 4098166.0, + "step": 786, + "train/ce_loss": 0.644202709197998 + }, + { + "epoch": 0.0777140597192011, + "step": 786, + "train/sim_loss": 0.171875 + }, + { + "epoch": 0.0777140597192011, + "step": 786, + "train/total_loss": 0.23629528284072876 + }, + { + "entropy": 9.127129554748535, + "epoch": 0.07781293256871663, + "mean_token_accuracy": 0.7309812307357788, + "num_tokens": 4103562.0, + "step": 787, + "train/ce_loss": 0.9794098138809204 + }, + { + "epoch": 0.07781293256871663, + "step": 787, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.07781293256871663, + "step": 787, + "train/total_loss": 0.20731598138809204 + }, + { + "entropy": 8.971997261047363, + "epoch": 0.07791180541823216, + "mean_token_accuracy": 0.7138643264770508, + "num_tokens": 4109055.0, + "step": 788, + "train/ce_loss": 1.256629467010498 + }, + { + "epoch": 0.07791180541823216, + "step": 788, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.07791180541823216, + "step": 788, + "train/total_loss": 0.21160045266151428 + }, + { + "entropy": 10.002260208129883, + "epoch": 0.07801067826774767, + "mean_token_accuracy": 0.7286527752876282, + "num_tokens": 4113975.0, + "step": 789, + "train/ce_loss": 1.4703837633132935 + }, + { + "epoch": 0.07801067826774767, + "step": 789, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.07801067826774767, + "step": 789, + "train/total_loss": 0.24860088527202606 + }, + { + "entropy": 9.624162673950195, + "epoch": 0.0781095511172632, + "mean_token_accuracy": 0.6926286220550537, + "num_tokens": 4119187.0, + "step": 790, + "train/ce_loss": 1.9462153911590576 + }, + { + "epoch": 0.0781095511172632, + "step": 790, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.0781095511172632, + "step": 790, + "train/total_loss": 0.29618406295776367 + }, + { + "entropy": 9.789203643798828, + "epoch": 0.07820842396677873, + "mean_token_accuracy": 0.6942771077156067, + "num_tokens": 4124288.0, + "step": 791, + "train/ce_loss": 1.2474993467330933 + }, + { + "epoch": 0.07820842396677873, + "step": 791, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.07820842396677873, + "step": 791, + "train/total_loss": 0.21459367871284485 + }, + { + "entropy": 10.033029556274414, + "epoch": 0.07830729681629424, + "mean_token_accuracy": 0.7454175353050232, + "num_tokens": 4129206.0, + "step": 792, + "train/ce_loss": 1.3075237274169922 + }, + { + "epoch": 0.07830729681629424, + "step": 792, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.07830729681629424, + "step": 792, + "train/total_loss": 0.20887736976146698 + }, + { + "entropy": 9.54617691040039, + "epoch": 0.07840616966580977, + "mean_token_accuracy": 0.7549933195114136, + "num_tokens": 4134381.0, + "step": 793, + "train/ce_loss": 0.8156450390815735 + }, + { + "epoch": 0.07840616966580977, + "step": 793, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.07840616966580977, + "step": 793, + "train/total_loss": 0.1792207658290863 + }, + { + "entropy": 9.540731430053711, + "epoch": 0.0785050425153253, + "mean_token_accuracy": 0.7285513281822205, + "num_tokens": 4139579.0, + "step": 794, + "train/ce_loss": 0.6351312398910522 + }, + { + "epoch": 0.0785050425153253, + "step": 794, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.0785050425153253, + "step": 794, + "train/total_loss": 0.1533568799495697 + }, + { + "entropy": 9.890420913696289, + "epoch": 0.07860391536484082, + "mean_token_accuracy": 0.693493127822876, + "num_tokens": 4144627.0, + "step": 795, + "train/ce_loss": 0.7012494206428528 + }, + { + "epoch": 0.07860391536484082, + "step": 795, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.07860391536484082, + "step": 795, + "train/total_loss": 0.15996870398521423 + }, + { + "entropy": 9.151998519897461, + "epoch": 0.07870278821435633, + "mean_token_accuracy": 0.7046688199043274, + "num_tokens": 4150017.0, + "step": 796, + "train/ce_loss": 1.022414207458496 + }, + { + "epoch": 0.07870278821435633, + "step": 796, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.07870278821435633, + "step": 796, + "train/total_loss": 0.2233351767063141 + }, + { + "entropy": 10.323640823364258, + "epoch": 0.07880166106387186, + "mean_token_accuracy": 0.7662721872329712, + "num_tokens": 4154737.0, + "step": 797, + "train/ce_loss": 7.12625915184617e-05 + }, + { + "epoch": 0.07880166106387186, + "step": 797, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.07880166106387186, + "step": 797, + "train/total_loss": 0.03906962648034096 + }, + { + "entropy": 9.252408981323242, + "epoch": 0.07890053391338739, + "mean_token_accuracy": 0.7404674291610718, + "num_tokens": 4160031.0, + "step": 798, + "train/ce_loss": 0.7661677598953247 + }, + { + "epoch": 0.07890053391338739, + "step": 798, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.07890053391338739, + "step": 798, + "train/total_loss": 0.1898980289697647 + }, + { + "entropy": 9.693077087402344, + "epoch": 0.0789994067629029, + "mean_token_accuracy": 0.7832586169242859, + "num_tokens": 4165172.0, + "step": 799, + "train/ce_loss": 0.8967200517654419 + }, + { + "epoch": 0.0789994067629029, + "step": 799, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.0789994067629029, + "step": 799, + "train/total_loss": 0.1482657492160797 + }, + { + "epoch": 0.07909827961241843, + "grad_norm": 1.1841247081756592, + "learning_rate": 9.804925085298918e-06, + "loss": 0.1842, + "step": 800 + }, + { + "entropy": 9.122665405273438, + "epoch": 0.07909827961241843, + "mean_token_accuracy": 0.696703314781189, + "num_tokens": 4170555.0, + "step": 800, + "train/ce_loss": 1.649169921875 + }, + { + "epoch": 0.07909827961241843, + "step": 800, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.07909827961241843, + "step": 800, + "train/total_loss": 0.2938232421875 + }, + { + "entropy": 9.88027572631836, + "epoch": 0.07919715246193396, + "mean_token_accuracy": 0.7062146663665771, + "num_tokens": 4175529.0, + "step": 801, + "train/ce_loss": 1.0107178688049316 + }, + { + "epoch": 0.07919715246193396, + "step": 801, + "train/sim_loss": 0.15234375 + }, + { + "epoch": 0.07919715246193396, + "step": 801, + "train/total_loss": 0.2534155249595642 + }, + { + "entropy": 9.497377395629883, + "epoch": 0.07929602531144947, + "mean_token_accuracy": 0.7316129207611084, + "num_tokens": 4180700.0, + "step": 802, + "train/ce_loss": 1.0892964601516724 + }, + { + "epoch": 0.07929602531144947, + "step": 802, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.07929602531144947, + "step": 802, + "train/total_loss": 0.17924214899539948 + }, + { + "entropy": 9.278539657592773, + "epoch": 0.079394898160965, + "mean_token_accuracy": 0.7182390093803406, + "num_tokens": 4185971.0, + "step": 803, + "train/ce_loss": 0.8930141925811768 + }, + { + "epoch": 0.079394898160965, + "step": 803, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.079394898160965, + "step": 803, + "train/total_loss": 0.20258267223834991 + }, + { + "entropy": 9.887369155883789, + "epoch": 0.07949377101048052, + "mean_token_accuracy": 0.71875, + "num_tokens": 4190973.0, + "step": 804, + "train/ce_loss": 1.0374951362609863 + }, + { + "epoch": 0.07949377101048052, + "step": 804, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.07949377101048052, + "step": 804, + "train/total_loss": 0.19359326362609863 + }, + { + "entropy": 9.997756958007812, + "epoch": 0.07959264385999605, + "mean_token_accuracy": 0.7203791737556458, + "num_tokens": 4195773.0, + "step": 805, + "train/ce_loss": 4.149144297116436e-05 + }, + { + "epoch": 0.07959264385999605, + "step": 805, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.07959264385999605, + "step": 805, + "train/total_loss": 0.10547289997339249 + }, + { + "entropy": 9.551786422729492, + "epoch": 0.07969151670951156, + "mean_token_accuracy": 0.6892109513282776, + "num_tokens": 4200813.0, + "step": 806, + "train/ce_loss": 3.241149170207791e-05 + }, + { + "epoch": 0.07969151670951156, + "step": 806, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.07969151670951156, + "step": 806, + "train/total_loss": 0.11719074100255966 + }, + { + "entropy": 9.735570907592773, + "epoch": 0.07979038955902709, + "mean_token_accuracy": 0.7573872208595276, + "num_tokens": 4205860.0, + "step": 807, + "train/ce_loss": 1.1239843368530273 + }, + { + "epoch": 0.07979038955902709, + "step": 807, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.07979038955902709, + "step": 807, + "train/total_loss": 0.2413046956062317 + }, + { + "entropy": 9.355799674987793, + "epoch": 0.07988926240854262, + "mean_token_accuracy": 0.748062014579773, + "num_tokens": 4211070.0, + "step": 808, + "train/ce_loss": 1.0023411512374878 + }, + { + "epoch": 0.07988926240854262, + "step": 808, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.07988926240854262, + "step": 808, + "train/total_loss": 0.21351537108421326 + }, + { + "entropy": 9.87933349609375, + "epoch": 0.07998813525805813, + "mean_token_accuracy": 0.7006688714027405, + "num_tokens": 4216082.0, + "step": 809, + "train/ce_loss": 1.4445483684539795 + }, + { + "epoch": 0.07998813525805813, + "step": 809, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.07998813525805813, + "step": 809, + "train/total_loss": 0.24601733684539795 + }, + { + "entropy": 9.267508506774902, + "epoch": 0.08008700810757366, + "mean_token_accuracy": 0.7016759514808655, + "num_tokens": 4221494.0, + "step": 810, + "train/ce_loss": 1.6310838460922241 + }, + { + "epoch": 0.08008700810757366, + "step": 810, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.08008700810757366, + "step": 810, + "train/total_loss": 0.2881084084510803 + }, + { + "entropy": 10.011655807495117, + "epoch": 0.08018588095708919, + "mean_token_accuracy": 0.6947565674781799, + "num_tokens": 4226468.0, + "step": 811, + "train/ce_loss": 2.444669246673584 + }, + { + "epoch": 0.08018588095708919, + "step": 811, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.08018588095708919, + "step": 811, + "train/total_loss": 0.3616544306278229 + }, + { + "entropy": 9.906841278076172, + "epoch": 0.0802847538066047, + "mean_token_accuracy": 0.6685288548469543, + "num_tokens": 4231432.0, + "step": 812, + "train/ce_loss": 3.6316334444563836e-05 + }, + { + "epoch": 0.0802847538066047, + "step": 812, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.0802847538066047, + "step": 812, + "train/total_loss": 0.0703161284327507 + }, + { + "entropy": 9.732148170471191, + "epoch": 0.08038362665612023, + "mean_token_accuracy": 0.7801653146743774, + "num_tokens": 4236501.0, + "step": 813, + "train/ce_loss": 0.6189934611320496 + }, + { + "epoch": 0.08038362665612023, + "step": 813, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.08038362665612023, + "step": 813, + "train/total_loss": 0.1751805990934372 + }, + { + "entropy": 9.342995643615723, + "epoch": 0.08048249950563575, + "mean_token_accuracy": 0.7265536785125732, + "num_tokens": 4241872.0, + "step": 814, + "train/ce_loss": 1.6427977085113525 + }, + { + "epoch": 0.08048249950563575, + "step": 814, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.08048249950563575, + "step": 814, + "train/total_loss": 0.2853735089302063 + }, + { + "entropy": 9.426212310791016, + "epoch": 0.08058137235515128, + "mean_token_accuracy": 0.7403100728988647, + "num_tokens": 4247155.0, + "step": 815, + "train/ce_loss": 0.9434316754341125 + }, + { + "epoch": 0.08058137235515128, + "step": 815, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.08058137235515128, + "step": 815, + "train/total_loss": 0.1451244205236435 + }, + { + "entropy": 9.224058151245117, + "epoch": 0.0806802452046668, + "mean_token_accuracy": 0.7601279020309448, + "num_tokens": 4252553.0, + "step": 816, + "train/ce_loss": 0.40708041191101074 + }, + { + "epoch": 0.0806802452046668, + "step": 816, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.0806802452046668, + "step": 816, + "train/total_loss": 0.0953955426812172 + }, + { + "entropy": 9.849893569946289, + "epoch": 0.08077911805418232, + "mean_token_accuracy": 0.7182866334915161, + "num_tokens": 4257586.0, + "step": 817, + "train/ce_loss": 0.7182490229606628 + }, + { + "epoch": 0.08077911805418232, + "step": 817, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.08077911805418232, + "step": 817, + "train/total_loss": 0.10307490080595016 + }, + { + "entropy": 9.295900344848633, + "epoch": 0.08087799090369785, + "mean_token_accuracy": 0.7526754140853882, + "num_tokens": 4262891.0, + "step": 818, + "train/ce_loss": 0.8068259954452515 + }, + { + "epoch": 0.08087799090369785, + "step": 818, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.08087799090369785, + "step": 818, + "train/total_loss": 0.11974509805440903 + }, + { + "entropy": 9.492517471313477, + "epoch": 0.08097686375321336, + "mean_token_accuracy": 0.6806282997131348, + "num_tokens": 4268115.0, + "step": 819, + "train/ce_loss": 1.0423537492752075 + }, + { + "epoch": 0.08097686375321336, + "step": 819, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.08097686375321336, + "step": 819, + "train/total_loss": 0.19407913088798523 + }, + { + "epoch": 0.08107573660272889, + "grad_norm": 1.2709230184555054, + "learning_rate": 9.799980220540969e-06, + "loss": 0.186, + "step": 820 + }, + { + "entropy": 9.600884437561035, + "epoch": 0.08107573660272889, + "mean_token_accuracy": 0.7112135291099548, + "num_tokens": 4273215.0, + "step": 820, + "train/ce_loss": 0.8723232746124268 + }, + { + "epoch": 0.08107573660272889, + "step": 820, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.08107573660272889, + "step": 820, + "train/total_loss": 0.2161385715007782 + }, + { + "entropy": 10.040645599365234, + "epoch": 0.08117460945224442, + "mean_token_accuracy": 0.7112526297569275, + "num_tokens": 4278091.0, + "step": 821, + "train/ce_loss": 1.208526372909546 + }, + { + "epoch": 0.08117460945224442, + "step": 821, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.08117460945224442, + "step": 821, + "train/total_loss": 0.21850889921188354 + }, + { + "entropy": 8.934160232543945, + "epoch": 0.08127348230175993, + "mean_token_accuracy": 0.7260825634002686, + "num_tokens": 4283566.0, + "step": 822, + "train/ce_loss": 1.0187729597091675 + }, + { + "epoch": 0.08127348230175993, + "step": 822, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.08127348230175993, + "step": 822, + "train/total_loss": 0.22687730193138123 + }, + { + "entropy": 9.646869659423828, + "epoch": 0.08137235515127546, + "mean_token_accuracy": 0.7361563444137573, + "num_tokens": 4288637.0, + "step": 823, + "train/ce_loss": 4.1637067624833435e-05 + }, + { + "epoch": 0.08137235515127546, + "step": 823, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.08137235515127546, + "step": 823, + "train/total_loss": 0.03906666487455368 + }, + { + "entropy": 9.507827758789062, + "epoch": 0.08147122800079099, + "mean_token_accuracy": 0.7921568751335144, + "num_tokens": 4293902.0, + "step": 824, + "train/ce_loss": 0.918192446231842 + }, + { + "epoch": 0.08147122800079099, + "step": 824, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.08147122800079099, + "step": 824, + "train/total_loss": 0.18556925654411316 + }, + { + "entropy": 9.711023330688477, + "epoch": 0.08157010085030651, + "mean_token_accuracy": 0.7573099136352539, + "num_tokens": 4299053.0, + "step": 825, + "train/ce_loss": 0.00011551461648195982 + }, + { + "epoch": 0.08157010085030651, + "step": 825, + "train/sim_loss": 0.1796875 + }, + { + "epoch": 0.08157010085030651, + "step": 825, + "train/total_loss": 0.17969904839992523 + }, + { + "entropy": 9.974448204040527, + "epoch": 0.08166897369982203, + "mean_token_accuracy": 0.748062014579773, + "num_tokens": 4303992.0, + "step": 826, + "train/ce_loss": 1.3299791812896729 + }, + { + "epoch": 0.08166897369982203, + "step": 826, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.08166897369982203, + "step": 826, + "train/total_loss": 0.23846666514873505 + }, + { + "entropy": 9.31650161743164, + "epoch": 0.08176784654933755, + "mean_token_accuracy": 0.7368420958518982, + "num_tokens": 4309328.0, + "step": 827, + "train/ce_loss": 0.5922544002532959 + }, + { + "epoch": 0.08176784654933755, + "step": 827, + "train/sim_loss": 0.1640625 + }, + { + "epoch": 0.08176784654933755, + "step": 827, + "train/total_loss": 0.2232879400253296 + }, + { + "entropy": 9.702505111694336, + "epoch": 0.08186671939885308, + "mean_token_accuracy": 0.662756621837616, + "num_tokens": 4314475.0, + "step": 828, + "train/ce_loss": 3.506695065880194e-05 + }, + { + "epoch": 0.08186671939885308, + "step": 828, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.08186671939885308, + "step": 828, + "train/total_loss": 0.03906600549817085 + }, + { + "entropy": 9.546751976013184, + "epoch": 0.0819655922483686, + "mean_token_accuracy": 0.733418345451355, + "num_tokens": 4319698.0, + "step": 829, + "train/ce_loss": 1.594700813293457 + }, + { + "epoch": 0.0819655922483686, + "step": 829, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.0819655922483686, + "step": 829, + "train/total_loss": 0.2922825813293457 + }, + { + "entropy": 9.644353866577148, + "epoch": 0.08206446509788412, + "mean_token_accuracy": 0.752136766910553, + "num_tokens": 4324752.0, + "step": 830, + "train/ce_loss": 0.859276294708252 + }, + { + "epoch": 0.08206446509788412, + "step": 830, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.08206446509788412, + "step": 830, + "train/total_loss": 0.17186513543128967 + }, + { + "entropy": 9.546089172363281, + "epoch": 0.08216333794739965, + "mean_token_accuracy": 0.7421875, + "num_tokens": 4329984.0, + "step": 831, + "train/ce_loss": 0.4032405912876129 + }, + { + "epoch": 0.08216333794739965, + "step": 831, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.08216333794739965, + "step": 831, + "train/total_loss": 0.11063656210899353 + }, + { + "entropy": 10.189659118652344, + "epoch": 0.08226221079691516, + "mean_token_accuracy": 0.8125, + "num_tokens": 4334821.0, + "step": 832, + "train/ce_loss": 5.903546480112709e-05 + }, + { + "epoch": 0.08226221079691516, + "step": 832, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.08226221079691516, + "step": 832, + "train/total_loss": 0.03125590458512306 + }, + { + "entropy": 9.257970809936523, + "epoch": 0.08236108364643069, + "mean_token_accuracy": 0.7412140369415283, + "num_tokens": 4340191.0, + "step": 833, + "train/ce_loss": 1.0270909070968628 + }, + { + "epoch": 0.08236108364643069, + "step": 833, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.08236108364643069, + "step": 833, + "train/total_loss": 0.2081778347492218 + }, + { + "entropy": 9.155115127563477, + "epoch": 0.08245995649594622, + "mean_token_accuracy": 0.7144288420677185, + "num_tokens": 4345735.0, + "step": 834, + "train/ce_loss": 0.3554162085056305 + }, + { + "epoch": 0.08245995649594622, + "step": 834, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.08245995649594622, + "step": 834, + "train/total_loss": 0.1331978738307953 + }, + { + "entropy": 9.520933151245117, + "epoch": 0.08255882934546174, + "mean_token_accuracy": 0.6748299598693848, + "num_tokens": 4350961.0, + "step": 835, + "train/ce_loss": 1.509456992149353 + }, + { + "epoch": 0.08255882934546174, + "step": 835, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.08255882934546174, + "step": 835, + "train/total_loss": 0.2603207230567932 + }, + { + "entropy": 9.509539604187012, + "epoch": 0.08265770219497726, + "mean_token_accuracy": 0.7180616855621338, + "num_tokens": 4356047.0, + "step": 836, + "train/ce_loss": 1.479731559753418 + }, + { + "epoch": 0.08265770219497726, + "step": 836, + "train/sim_loss": 0.15234375 + }, + { + "epoch": 0.08265770219497726, + "step": 836, + "train/total_loss": 0.3003169298171997 + }, + { + "entropy": 9.277669906616211, + "epoch": 0.08275657504449278, + "mean_token_accuracy": 0.7243436574935913, + "num_tokens": 4361380.0, + "step": 837, + "train/ce_loss": 0.9445345997810364 + }, + { + "epoch": 0.08275657504449278, + "step": 837, + "train/sim_loss": 0.16015625 + }, + { + "epoch": 0.08275657504449278, + "step": 837, + "train/total_loss": 0.25460970401763916 + }, + { + "entropy": 9.328760147094727, + "epoch": 0.08285544789400831, + "mean_token_accuracy": 0.7488937973976135, + "num_tokens": 4366759.0, + "step": 838, + "train/ce_loss": 0.5685387253761292 + }, + { + "epoch": 0.08285544789400831, + "step": 838, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.08285544789400831, + "step": 838, + "train/total_loss": 0.15451012551784515 + }, + { + "entropy": 9.904058456420898, + "epoch": 0.08295432074352382, + "mean_token_accuracy": 0.7512953281402588, + "num_tokens": 4371773.0, + "step": 839, + "train/ce_loss": 1.4188343286514282 + }, + { + "epoch": 0.08295432074352382, + "step": 839, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.08295432074352382, + "step": 839, + "train/total_loss": 0.23172718286514282 + }, + { + "epoch": 0.08305319359303935, + "grad_norm": 1.0367178916931152, + "learning_rate": 9.79503535578302e-06, + "loss": 0.1826, + "step": 840 + }, + { + "entropy": 9.342744827270508, + "epoch": 0.08305319359303935, + "mean_token_accuracy": 0.7129629850387573, + "num_tokens": 4377104.0, + "step": 840, + "train/ce_loss": 0.6612659096717834 + }, + { + "epoch": 0.08305319359303935, + "step": 840, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.08305319359303935, + "step": 840, + "train/total_loss": 0.13643908500671387 + }, + { + "entropy": 9.328734397888184, + "epoch": 0.08315206644255488, + "mean_token_accuracy": 0.717674970626831, + "num_tokens": 4382420.0, + "step": 841, + "train/ce_loss": 1.2937597036361694 + }, + { + "epoch": 0.08315206644255488, + "step": 841, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.08315206644255488, + "step": 841, + "train/total_loss": 0.26218849420547485 + }, + { + "entropy": 9.546821594238281, + "epoch": 0.08325093929207039, + "mean_token_accuracy": 0.691428542137146, + "num_tokens": 4387593.0, + "step": 842, + "train/ce_loss": 0.7460771203041077 + }, + { + "epoch": 0.08325093929207039, + "step": 842, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.08325093929207039, + "step": 842, + "train/total_loss": 0.176170215010643 + }, + { + "entropy": 9.157147407531738, + "epoch": 0.08334981214158592, + "mean_token_accuracy": 0.7199147939682007, + "num_tokens": 4393014.0, + "step": 843, + "train/ce_loss": 0.7931463122367859 + }, + { + "epoch": 0.08334981214158592, + "step": 843, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.08334981214158592, + "step": 843, + "train/total_loss": 0.17697088420391083 + }, + { + "entropy": 9.803672790527344, + "epoch": 0.08344868499110145, + "mean_token_accuracy": 0.6762917637825012, + "num_tokens": 4398141.0, + "step": 844, + "train/ce_loss": 0.8283305764198303 + }, + { + "epoch": 0.08344868499110145, + "step": 844, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.08344868499110145, + "step": 844, + "train/total_loss": 0.18439555168151855 + }, + { + "entropy": 9.153200149536133, + "epoch": 0.08354755784061697, + "mean_token_accuracy": 0.6939426064491272, + "num_tokens": 4403592.0, + "step": 845, + "train/ce_loss": 0.9548829197883606 + }, + { + "epoch": 0.08354755784061697, + "step": 845, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.08354755784061697, + "step": 845, + "train/total_loss": 0.2283007949590683 + }, + { + "entropy": 9.16877555847168, + "epoch": 0.08364643069013249, + "mean_token_accuracy": 0.6962785124778748, + "num_tokens": 4408930.0, + "step": 846, + "train/ce_loss": 0.8261193633079529 + }, + { + "epoch": 0.08364643069013249, + "step": 846, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.08364643069013249, + "step": 846, + "train/total_loss": 0.17245569825172424 + }, + { + "entropy": 9.682516098022461, + "epoch": 0.08374530353964801, + "mean_token_accuracy": 0.7147335410118103, + "num_tokens": 4413996.0, + "step": 847, + "train/ce_loss": 1.019052505493164 + }, + { + "epoch": 0.08374530353964801, + "step": 847, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.08374530353964801, + "step": 847, + "train/total_loss": 0.17221775650978088 + }, + { + "entropy": 9.821207046508789, + "epoch": 0.08384417638916354, + "mean_token_accuracy": 0.723747968673706, + "num_tokens": 4419084.0, + "step": 848, + "train/ce_loss": 1.4414478540420532 + }, + { + "epoch": 0.08384417638916354, + "step": 848, + "train/sim_loss": 0.14453125 + }, + { + "epoch": 0.08384417638916354, + "step": 848, + "train/total_loss": 0.28867602348327637 + }, + { + "entropy": 10.034945487976074, + "epoch": 0.08394304923867905, + "mean_token_accuracy": 0.7482993006706238, + "num_tokens": 4423942.0, + "step": 849, + "train/ce_loss": 2.0451998710632324 + }, + { + "epoch": 0.08394304923867905, + "step": 849, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.08394304923867905, + "step": 849, + "train/total_loss": 0.30217623710632324 + }, + { + "entropy": 9.162328720092773, + "epoch": 0.08404192208819458, + "mean_token_accuracy": 0.7416148781776428, + "num_tokens": 4429239.0, + "step": 850, + "train/ce_loss": 0.6268206834793091 + }, + { + "epoch": 0.08404192208819458, + "step": 850, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.08404192208819458, + "step": 850, + "train/total_loss": 0.10174456983804703 + }, + { + "entropy": 9.801444053649902, + "epoch": 0.08414079493771011, + "mean_token_accuracy": 0.7170172333717346, + "num_tokens": 4434188.0, + "step": 851, + "train/ce_loss": 5.47383569937665e-05 + }, + { + "epoch": 0.08414079493771011, + "step": 851, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.08414079493771011, + "step": 851, + "train/total_loss": 0.05469297245144844 + }, + { + "entropy": 10.159244537353516, + "epoch": 0.08423966778722562, + "mean_token_accuracy": 0.7085019946098328, + "num_tokens": 4439049.0, + "step": 852, + "train/ce_loss": 3.2078307413030416e-05 + }, + { + "epoch": 0.08423966778722562, + "step": 852, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.08423966778722562, + "step": 852, + "train/total_loss": 0.06640946120023727 + }, + { + "entropy": 10.302398681640625, + "epoch": 0.08433854063674115, + "mean_token_accuracy": 0.7385057210922241, + "num_tokens": 4443823.0, + "step": 853, + "train/ce_loss": 4.880238338955678e-05 + }, + { + "epoch": 0.08433854063674115, + "step": 853, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.08433854063674115, + "step": 853, + "train/total_loss": 0.07031738013029099 + }, + { + "entropy": 9.40043830871582, + "epoch": 0.08443741348625668, + "mean_token_accuracy": 0.6945169568061829, + "num_tokens": 4449051.0, + "step": 854, + "train/ce_loss": 1.1474387645721436 + }, + { + "epoch": 0.08443741348625668, + "step": 854, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.08443741348625668, + "step": 854, + "train/total_loss": 0.2358376383781433 + }, + { + "entropy": 9.591285705566406, + "epoch": 0.0845362863357722, + "mean_token_accuracy": 0.7363494634628296, + "num_tokens": 4454147.0, + "step": 855, + "train/ce_loss": 0.8136089444160461 + }, + { + "epoch": 0.0845362863357722, + "step": 855, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.0845362863357722, + "step": 855, + "train/total_loss": 0.14386090636253357 + }, + { + "entropy": 9.808528900146484, + "epoch": 0.08463515918528772, + "mean_token_accuracy": 0.7299145460128784, + "num_tokens": 4459176.0, + "step": 856, + "train/ce_loss": 0.8623424768447876 + }, + { + "epoch": 0.08463515918528772, + "step": 856, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.08463515918528772, + "step": 856, + "train/total_loss": 0.17607799172401428 + }, + { + "entropy": 9.614444732666016, + "epoch": 0.08473403203480324, + "mean_token_accuracy": 0.7460992932319641, + "num_tokens": 4464368.0, + "step": 857, + "train/ce_loss": 0.6678289771080017 + }, + { + "epoch": 0.08473403203480324, + "step": 857, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.08473403203480324, + "step": 857, + "train/total_loss": 0.11756414920091629 + }, + { + "entropy": 9.298678398132324, + "epoch": 0.08483290488431877, + "mean_token_accuracy": 0.7164179086685181, + "num_tokens": 4469722.0, + "step": 858, + "train/ce_loss": 0.6989213228225708 + }, + { + "epoch": 0.08483290488431877, + "step": 858, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.08483290488431877, + "step": 858, + "train/total_loss": 0.11676713079214096 + }, + { + "entropy": 9.135697364807129, + "epoch": 0.08493177773383428, + "mean_token_accuracy": 0.6835051774978638, + "num_tokens": 4475159.0, + "step": 859, + "train/ce_loss": 1.6708698272705078 + }, + { + "epoch": 0.08493177773383428, + "step": 859, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.08493177773383428, + "step": 859, + "train/total_loss": 0.30771198868751526 + }, + { + "epoch": 0.08503065058334981, + "grad_norm": 1.4547375440597534, + "learning_rate": 9.790090491025071e-06, + "loss": 0.1914, + "step": 860 + }, + { + "entropy": 8.923101425170898, + "epoch": 0.08503065058334981, + "mean_token_accuracy": 0.6813910007476807, + "num_tokens": 4480776.0, + "step": 860, + "train/ce_loss": 0.9627625942230225 + }, + { + "epoch": 0.08503065058334981, + "step": 860, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.08503065058334981, + "step": 860, + "train/total_loss": 0.21346375346183777 + }, + { + "entropy": 9.888015747070312, + "epoch": 0.08512952343286534, + "mean_token_accuracy": 0.6981450319290161, + "num_tokens": 4485823.0, + "step": 861, + "train/ce_loss": 1.032806634902954 + }, + { + "epoch": 0.08512952343286534, + "step": 861, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.08512952343286534, + "step": 861, + "train/total_loss": 0.2439056634902954 + }, + { + "entropy": 9.753445625305176, + "epoch": 0.08522839628238085, + "mean_token_accuracy": 0.70597243309021, + "num_tokens": 4490898.0, + "step": 862, + "train/ce_loss": 2.412477731704712 + }, + { + "epoch": 0.08522839628238085, + "step": 862, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.08522839628238085, + "step": 862, + "train/total_loss": 0.3740602731704712 + }, + { + "entropy": 9.161497116088867, + "epoch": 0.08532726913189638, + "mean_token_accuracy": 0.7283422350883484, + "num_tokens": 4496297.0, + "step": 863, + "train/ce_loss": 0.9021885395050049 + }, + { + "epoch": 0.08532726913189638, + "step": 863, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.08532726913189638, + "step": 863, + "train/total_loss": 0.12928135693073273 + }, + { + "entropy": 9.778253555297852, + "epoch": 0.08542614198141191, + "mean_token_accuracy": 0.710616409778595, + "num_tokens": 4501313.0, + "step": 864, + "train/ce_loss": 0.8009510040283203 + }, + { + "epoch": 0.08542614198141191, + "step": 864, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.08542614198141191, + "step": 864, + "train/total_loss": 0.138688862323761 + }, + { + "entropy": 9.464872360229492, + "epoch": 0.08552501483092742, + "mean_token_accuracy": 0.7915531396865845, + "num_tokens": 4506536.0, + "step": 865, + "train/ce_loss": 0.520185649394989 + }, + { + "epoch": 0.08552501483092742, + "step": 865, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.08552501483092742, + "step": 865, + "train/total_loss": 0.08326856791973114 + }, + { + "entropy": 10.019142150878906, + "epoch": 0.08562388768044295, + "mean_token_accuracy": 0.7115043997764587, + "num_tokens": 4511517.0, + "step": 866, + "train/ce_loss": 0.0004360276216175407 + }, + { + "epoch": 0.08562388768044295, + "step": 866, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.08562388768044295, + "step": 866, + "train/total_loss": 0.0781686007976532 + }, + { + "entropy": 9.592554092407227, + "epoch": 0.08572276052995847, + "mean_token_accuracy": 0.6846985816955566, + "num_tokens": 4516633.0, + "step": 867, + "train/ce_loss": 1.4119404554367065 + }, + { + "epoch": 0.08572276052995847, + "step": 867, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.08572276052995847, + "step": 867, + "train/total_loss": 0.23103779554367065 + }, + { + "entropy": 9.140426635742188, + "epoch": 0.085821633379474, + "mean_token_accuracy": 0.7066666483879089, + "num_tokens": 4522097.0, + "step": 868, + "train/ce_loss": 0.7785761952400208 + }, + { + "epoch": 0.085821633379474, + "step": 868, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.085821633379474, + "step": 868, + "train/total_loss": 0.1325451135635376 + }, + { + "entropy": 9.77696418762207, + "epoch": 0.08592050622898952, + "mean_token_accuracy": 0.804958701133728, + "num_tokens": 4527137.0, + "step": 869, + "train/ce_loss": 0.9853973388671875 + }, + { + "epoch": 0.08592050622898952, + "step": 869, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.08592050622898952, + "step": 869, + "train/total_loss": 0.22353973984718323 + }, + { + "entropy": 9.248411178588867, + "epoch": 0.08601937907850504, + "mean_token_accuracy": 0.7294981479644775, + "num_tokens": 4532446.0, + "step": 870, + "train/ce_loss": 0.5703291296958923 + }, + { + "epoch": 0.08601937907850504, + "step": 870, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.08601937907850504, + "step": 870, + "train/total_loss": 0.09609541296958923 + }, + { + "entropy": 9.832139015197754, + "epoch": 0.08611825192802057, + "mean_token_accuracy": 0.7669421434402466, + "num_tokens": 4537515.0, + "step": 871, + "train/ce_loss": 0.8696778416633606 + }, + { + "epoch": 0.08611825192802057, + "step": 871, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.08611825192802057, + "step": 871, + "train/total_loss": 0.15728029608726501 + }, + { + "entropy": 9.310486793518066, + "epoch": 0.08621712477753608, + "mean_token_accuracy": 0.6934306621551514, + "num_tokens": 4542820.0, + "step": 872, + "train/ce_loss": 0.6153068542480469 + }, + { + "epoch": 0.08621712477753608, + "step": 872, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.08621712477753608, + "step": 872, + "train/total_loss": 0.2099681794643402 + }, + { + "entropy": 10.34494400024414, + "epoch": 0.08631599762705161, + "mean_token_accuracy": 0.8199999928474426, + "num_tokens": 4547574.0, + "step": 873, + "train/ce_loss": 1.1924973726272583 + }, + { + "epoch": 0.08631599762705161, + "step": 873, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.08631599762705161, + "step": 873, + "train/total_loss": 0.25596848130226135 + }, + { + "entropy": 9.449071884155273, + "epoch": 0.08641487047656714, + "mean_token_accuracy": 0.7564259767532349, + "num_tokens": 4552805.0, + "step": 874, + "train/ce_loss": 0.8073897957801819 + }, + { + "epoch": 0.08641487047656714, + "step": 874, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.08641487047656714, + "step": 874, + "train/total_loss": 0.19011399149894714 + }, + { + "entropy": 9.491059303283691, + "epoch": 0.08651374332608265, + "mean_token_accuracy": 0.7123655676841736, + "num_tokens": 4558024.0, + "step": 875, + "train/ce_loss": 1.0081188678741455 + }, + { + "epoch": 0.08651374332608265, + "step": 875, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.08651374332608265, + "step": 875, + "train/total_loss": 0.1867493987083435 + }, + { + "entropy": 9.761913299560547, + "epoch": 0.08661261617559818, + "mean_token_accuracy": 0.6963696479797363, + "num_tokens": 4563031.0, + "step": 876, + "train/ce_loss": 1.437301754951477 + }, + { + "epoch": 0.08661261617559818, + "step": 876, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.08661261617559818, + "step": 876, + "train/total_loss": 0.25310516357421875 + }, + { + "entropy": 9.131916046142578, + "epoch": 0.0867114890251137, + "mean_token_accuracy": 0.7195122241973877, + "num_tokens": 4568486.0, + "step": 877, + "train/ce_loss": 0.7580614686012268 + }, + { + "epoch": 0.0867114890251137, + "step": 877, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.0867114890251137, + "step": 877, + "train/total_loss": 0.1929936408996582 + }, + { + "entropy": 9.216597557067871, + "epoch": 0.08681036187462923, + "mean_token_accuracy": 0.6851248741149902, + "num_tokens": 4573853.0, + "step": 878, + "train/ce_loss": 1.3044555187225342 + }, + { + "epoch": 0.08681036187462923, + "step": 878, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.08681036187462923, + "step": 878, + "train/total_loss": 0.23591430485248566 + }, + { + "entropy": 9.29898452758789, + "epoch": 0.08690923472414475, + "mean_token_accuracy": 0.7606936693191528, + "num_tokens": 4579205.0, + "step": 879, + "train/ce_loss": 0.6638868451118469 + }, + { + "epoch": 0.08690923472414475, + "step": 879, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.08690923472414475, + "step": 879, + "train/total_loss": 0.15232619643211365 + }, + { + "epoch": 0.08700810757366027, + "grad_norm": 1.011390209197998, + "learning_rate": 9.785145626267124e-06, + "loss": 0.1816, + "step": 880 + }, + { + "entropy": 9.205062866210938, + "epoch": 0.08700810757366027, + "mean_token_accuracy": 0.7522421479225159, + "num_tokens": 4584625.0, + "step": 880, + "train/ce_loss": 0.7033084630966187 + }, + { + "epoch": 0.08700810757366027, + "step": 880, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.08700810757366027, + "step": 880, + "train/total_loss": 0.10158085078001022 + }, + { + "entropy": 9.204500198364258, + "epoch": 0.0871069804231758, + "mean_token_accuracy": 0.7185500860214233, + "num_tokens": 4590024.0, + "step": 881, + "train/ce_loss": 1.1597356796264648 + }, + { + "epoch": 0.0871069804231758, + "step": 881, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.0871069804231758, + "step": 881, + "train/total_loss": 0.190192312002182 + }, + { + "entropy": 9.53515625, + "epoch": 0.08720585327269131, + "mean_token_accuracy": 0.6879194378852844, + "num_tokens": 4595085.0, + "step": 882, + "train/ce_loss": 2.1317622661590576 + }, + { + "epoch": 0.08720585327269131, + "step": 882, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.08720585327269131, + "step": 882, + "train/total_loss": 0.29520750045776367 + }, + { + "entropy": 9.357887268066406, + "epoch": 0.08730472612220684, + "mean_token_accuracy": 0.7347418069839478, + "num_tokens": 4600452.0, + "step": 883, + "train/ce_loss": 1.2176082134246826 + }, + { + "epoch": 0.08730472612220684, + "step": 883, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.08730472612220684, + "step": 883, + "train/total_loss": 0.23894831538200378 + }, + { + "entropy": 9.738540649414062, + "epoch": 0.08740359897172237, + "mean_token_accuracy": 0.7417762875556946, + "num_tokens": 4605515.0, + "step": 884, + "train/ce_loss": 3.276742063462734e-05 + }, + { + "epoch": 0.08740359897172237, + "step": 884, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.08740359897172237, + "step": 884, + "train/total_loss": 0.042972028255462646 + }, + { + "entropy": 9.246567726135254, + "epoch": 0.08750247182123788, + "mean_token_accuracy": 0.681664764881134, + "num_tokens": 4610849.0, + "step": 885, + "train/ce_loss": 1.3851035833358765 + }, + { + "epoch": 0.08750247182123788, + "step": 885, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.08750247182123788, + "step": 885, + "train/total_loss": 0.2869478464126587 + }, + { + "entropy": 9.525009155273438, + "epoch": 0.08760134467075341, + "mean_token_accuracy": 0.7420249581336975, + "num_tokens": 4616043.0, + "step": 886, + "train/ce_loss": 0.9769114255905151 + }, + { + "epoch": 0.08760134467075341, + "step": 886, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.08760134467075341, + "step": 886, + "train/total_loss": 0.183628648519516 + }, + { + "entropy": 9.74372386932373, + "epoch": 0.08770021752026894, + "mean_token_accuracy": 0.7533556818962097, + "num_tokens": 4621077.0, + "step": 887, + "train/ce_loss": 0.6396549940109253 + }, + { + "epoch": 0.08770021752026894, + "step": 887, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.08770021752026894, + "step": 887, + "train/total_loss": 0.14990299940109253 + }, + { + "entropy": 9.419316291809082, + "epoch": 0.08779909036978446, + "mean_token_accuracy": 0.767471432685852, + "num_tokens": 4626323.0, + "step": 888, + "train/ce_loss": 0.7446501851081848 + }, + { + "epoch": 0.08779909036978446, + "step": 888, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.08779909036978446, + "step": 888, + "train/total_loss": 0.11352752149105072 + }, + { + "entropy": 9.304037094116211, + "epoch": 0.08789796321929998, + "mean_token_accuracy": 0.6585366129875183, + "num_tokens": 4631525.0, + "step": 889, + "train/ce_loss": 2.263930320739746 + }, + { + "epoch": 0.08789796321929998, + "step": 889, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.08789796321929998, + "step": 889, + "train/total_loss": 0.33576804399490356 + }, + { + "entropy": 9.804574966430664, + "epoch": 0.0879968360688155, + "mean_token_accuracy": 0.7204968929290771, + "num_tokens": 4636601.0, + "step": 890, + "train/ce_loss": 1.3510684967041016 + }, + { + "epoch": 0.0879968360688155, + "step": 890, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.0879968360688155, + "step": 890, + "train/total_loss": 0.23276309669017792 + }, + { + "entropy": 9.326436042785645, + "epoch": 0.08809570891833103, + "mean_token_accuracy": 0.7505882382392883, + "num_tokens": 4641903.0, + "step": 891, + "train/ce_loss": 0.6687228679656982 + }, + { + "epoch": 0.08809570891833103, + "step": 891, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.08809570891833103, + "step": 891, + "train/total_loss": 0.16062229871749878 + }, + { + "entropy": 9.97100830078125, + "epoch": 0.08819458176784654, + "mean_token_accuracy": 0.6701940298080444, + "num_tokens": 4646893.0, + "step": 892, + "train/ce_loss": 1.1901462078094482 + }, + { + "epoch": 0.08819458176784654, + "step": 892, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.08819458176784654, + "step": 892, + "train/total_loss": 0.23229587078094482 + }, + { + "entropy": 9.102214813232422, + "epoch": 0.08829345461736207, + "mean_token_accuracy": 0.7774358987808228, + "num_tokens": 4652394.0, + "step": 893, + "train/ce_loss": 1.2304139137268066 + }, + { + "epoch": 0.08829345461736207, + "step": 893, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.08829345461736207, + "step": 893, + "train/total_loss": 0.25194764137268066 + }, + { + "entropy": 9.257131576538086, + "epoch": 0.0883923274668776, + "mean_token_accuracy": 0.7405345439910889, + "num_tokens": 4657736.0, + "step": 894, + "train/ce_loss": 0.5056064128875732 + }, + { + "epoch": 0.0883923274668776, + "step": 894, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.0883923274668776, + "step": 894, + "train/total_loss": 0.09743563830852509 + }, + { + "entropy": 9.271703720092773, + "epoch": 0.08849120031639311, + "mean_token_accuracy": 0.7413395047187805, + "num_tokens": 4663017.0, + "step": 895, + "train/ce_loss": 0.7187294363975525 + }, + { + "epoch": 0.08849120031639311, + "step": 895, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.08849120031639311, + "step": 895, + "train/total_loss": 0.16562294960021973 + }, + { + "entropy": 9.240007400512695, + "epoch": 0.08859007316590864, + "mean_token_accuracy": 0.7541766166687012, + "num_tokens": 4668301.0, + "step": 896, + "train/ce_loss": 0.8012892007827759 + }, + { + "epoch": 0.08859007316590864, + "step": 896, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.08859007316590864, + "step": 896, + "train/total_loss": 0.18559767305850983 + }, + { + "entropy": 8.960734367370605, + "epoch": 0.08868894601542417, + "mean_token_accuracy": 0.6455331444740295, + "num_tokens": 4673841.0, + "step": 897, + "train/ce_loss": 1.7212449312210083 + }, + { + "epoch": 0.08868894601542417, + "step": 897, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.08868894601542417, + "step": 897, + "train/total_loss": 0.2697807550430298 + }, + { + "entropy": 9.661792755126953, + "epoch": 0.0887878188649397, + "mean_token_accuracy": 0.7244318127632141, + "num_tokens": 4678986.0, + "step": 898, + "train/ce_loss": 2.423352088953834e-05 + }, + { + "epoch": 0.0887878188649397, + "step": 898, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.0887878188649397, + "step": 898, + "train/total_loss": 0.0390649251639843 + }, + { + "entropy": 9.03544807434082, + "epoch": 0.08888669171445521, + "mean_token_accuracy": 0.7222787141799927, + "num_tokens": 4684460.0, + "step": 899, + "train/ce_loss": 1.1360260248184204 + }, + { + "epoch": 0.08888669171445521, + "step": 899, + "train/sim_loss": 0.14453125 + }, + { + "epoch": 0.08888669171445521, + "step": 899, + "train/total_loss": 0.2581338584423065 + }, + { + "epoch": 0.08898556456397073, + "grad_norm": 1.515589714050293, + "learning_rate": 9.780200761509172e-06, + "loss": 0.1777, + "step": 900 + }, + { + "entropy": 9.385675430297852, + "epoch": 0.08898556456397073, + "mean_token_accuracy": 0.6979310512542725, + "num_tokens": 4689651.0, + "step": 900, + "train/ce_loss": 0.9100170731544495 + }, + { + "epoch": 0.08898556456397073, + "step": 900, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.08898556456397073, + "step": 900, + "train/total_loss": 0.2394392192363739 + }, + { + "entropy": 9.832839965820312, + "epoch": 0.08908443741348626, + "mean_token_accuracy": 0.7122302055358887, + "num_tokens": 4694651.0, + "step": 901, + "train/ce_loss": 1.3100913763046265 + }, + { + "epoch": 0.08908443741348626, + "step": 901, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.08908443741348626, + "step": 901, + "train/total_loss": 0.21304039657115936 + }, + { + "entropy": 9.541091918945312, + "epoch": 0.08918331026300177, + "mean_token_accuracy": 0.7403973340988159, + "num_tokens": 4699878.0, + "step": 902, + "train/ce_loss": 0.7895435094833374 + }, + { + "epoch": 0.08918331026300177, + "step": 902, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.08918331026300177, + "step": 902, + "train/total_loss": 0.12973560392856598 + }, + { + "entropy": 9.55825424194336, + "epoch": 0.0892821831125173, + "mean_token_accuracy": 0.7576974630355835, + "num_tokens": 4705071.0, + "step": 903, + "train/ce_loss": 0.7540633082389832 + }, + { + "epoch": 0.0892821831125173, + "step": 903, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.0892821831125173, + "step": 903, + "train/total_loss": 0.11446883529424667 + }, + { + "entropy": 9.702561378479004, + "epoch": 0.08938105596203283, + "mean_token_accuracy": 0.7054908275604248, + "num_tokens": 4710161.0, + "step": 904, + "train/ce_loss": 6.917696737218648e-05 + }, + { + "epoch": 0.08938105596203283, + "step": 904, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.08938105596203283, + "step": 904, + "train/total_loss": 0.09766316413879395 + }, + { + "entropy": 9.32586669921875, + "epoch": 0.08947992881154834, + "mean_token_accuracy": 0.7013463973999023, + "num_tokens": 4715421.0, + "step": 905, + "train/ce_loss": 0.6814620494842529 + }, + { + "epoch": 0.08947992881154834, + "step": 905, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.08947992881154834, + "step": 905, + "train/total_loss": 0.17752119898796082 + }, + { + "entropy": 9.214298248291016, + "epoch": 0.08957880166106387, + "mean_token_accuracy": 0.7730569839477539, + "num_tokens": 4720850.0, + "step": 906, + "train/ce_loss": 0.6316813826560974 + }, + { + "epoch": 0.08957880166106387, + "step": 906, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.08957880166106387, + "step": 906, + "train/total_loss": 0.10223063826560974 + }, + { + "entropy": 8.960771560668945, + "epoch": 0.0896776745105794, + "mean_token_accuracy": 0.804950475692749, + "num_tokens": 4726345.0, + "step": 907, + "train/ce_loss": 0.5146143436431885 + }, + { + "epoch": 0.0896776745105794, + "step": 907, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.0896776745105794, + "step": 907, + "train/total_loss": 0.08271143585443497 + }, + { + "entropy": 9.457979202270508, + "epoch": 0.08977654736009492, + "mean_token_accuracy": 0.7007672786712646, + "num_tokens": 4731594.0, + "step": 908, + "train/ce_loss": 0.9663236737251282 + }, + { + "epoch": 0.08977654736009492, + "step": 908, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.08977654736009492, + "step": 908, + "train/total_loss": 0.16694486141204834 + }, + { + "entropy": 9.720237731933594, + "epoch": 0.08987542020961044, + "mean_token_accuracy": 0.6881405711174011, + "num_tokens": 4736726.0, + "step": 909, + "train/ce_loss": 1.5250821113586426 + }, + { + "epoch": 0.08987542020961044, + "step": 909, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.08987542020961044, + "step": 909, + "train/total_loss": 0.2696956992149353 + }, + { + "entropy": 9.777441024780273, + "epoch": 0.08997429305912596, + "mean_token_accuracy": 0.718196451663971, + "num_tokens": 4741758.0, + "step": 910, + "train/ce_loss": 1.7164819240570068 + }, + { + "epoch": 0.08997429305912596, + "step": 910, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.08997429305912596, + "step": 910, + "train/total_loss": 0.23805443942546844 + }, + { + "entropy": 10.040216445922852, + "epoch": 0.09007316590864149, + "mean_token_accuracy": 0.75, + "num_tokens": 4746633.0, + "step": 911, + "train/ce_loss": 1.6431957483291626 + }, + { + "epoch": 0.09007316590864149, + "step": 911, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.09007316590864149, + "step": 911, + "train/total_loss": 0.25806957483291626 + }, + { + "entropy": 9.284655570983887, + "epoch": 0.090172038758157, + "mean_token_accuracy": 0.7226791977882385, + "num_tokens": 4751974.0, + "step": 912, + "train/ce_loss": 0.9069715142250061 + }, + { + "epoch": 0.090172038758157, + "step": 912, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.090172038758157, + "step": 912, + "train/total_loss": 0.19225965440273285 + }, + { + "entropy": 9.134580612182617, + "epoch": 0.09027091160767253, + "mean_token_accuracy": 0.8069105744361877, + "num_tokens": 4757651.0, + "step": 913, + "train/ce_loss": 0.661210834980011 + }, + { + "epoch": 0.09027091160767253, + "step": 913, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.09027091160767253, + "step": 913, + "train/total_loss": 0.19893358647823334 + }, + { + "entropy": 9.497674942016602, + "epoch": 0.09036978445718806, + "mean_token_accuracy": 0.7619718313217163, + "num_tokens": 4762807.0, + "step": 914, + "train/ce_loss": 0.8399479389190674 + }, + { + "epoch": 0.09036978445718806, + "step": 914, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.09036978445718806, + "step": 914, + "train/total_loss": 0.1191510483622551 + }, + { + "entropy": 9.850480079650879, + "epoch": 0.09046865730670357, + "mean_token_accuracy": 0.7324414849281311, + "num_tokens": 4767810.0, + "step": 915, + "train/ce_loss": 2.1993157133692876e-05 + }, + { + "epoch": 0.09046865730670357, + "step": 915, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.09046865730670357, + "step": 915, + "train/total_loss": 0.03515844792127609 + }, + { + "entropy": 9.506845474243164, + "epoch": 0.0905675301562191, + "mean_token_accuracy": 0.7412223815917969, + "num_tokens": 4773030.0, + "step": 916, + "train/ce_loss": 0.8922023773193359 + }, + { + "epoch": 0.0905675301562191, + "step": 916, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.0905675301562191, + "step": 916, + "train/total_loss": 0.17906399071216583 + }, + { + "entropy": 9.768218040466309, + "epoch": 0.09066640300573463, + "mean_token_accuracy": 0.7072418928146362, + "num_tokens": 4778103.0, + "step": 917, + "train/ce_loss": 4.770288069266826e-05 + }, + { + "epoch": 0.09066640300573463, + "step": 917, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.09066640300573463, + "step": 917, + "train/total_loss": 0.04687977209687233 + }, + { + "entropy": 10.2958345413208, + "epoch": 0.09076527585525015, + "mean_token_accuracy": 0.7989276051521301, + "num_tokens": 4782833.0, + "step": 918, + "train/ce_loss": 1.2475789785385132 + }, + { + "epoch": 0.09076527585525015, + "step": 918, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.09076527585525015, + "step": 918, + "train/total_loss": 0.16382040083408356 + }, + { + "entropy": 9.392905235290527, + "epoch": 0.09086414870476567, + "mean_token_accuracy": 0.7146371603012085, + "num_tokens": 4788116.0, + "step": 919, + "train/ce_loss": 1.3511189222335815 + }, + { + "epoch": 0.09086414870476567, + "step": 919, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.09086414870476567, + "step": 919, + "train/total_loss": 0.20933064818382263 + }, + { + "epoch": 0.0909630215542812, + "grad_norm": 1.054306983947754, + "learning_rate": 9.775255896751225e-06, + "loss": 0.1746, + "step": 920 + }, + { + "entropy": 9.122296333312988, + "epoch": 0.0909630215542812, + "mean_token_accuracy": 0.6872385144233704, + "num_tokens": 4793545.0, + "step": 920, + "train/ce_loss": 0.6360870599746704 + }, + { + "epoch": 0.0909630215542812, + "step": 920, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.0909630215542812, + "step": 920, + "train/total_loss": 0.17688995599746704 + }, + { + "entropy": 9.493011474609375, + "epoch": 0.09106189440379672, + "mean_token_accuracy": 0.6517857313156128, + "num_tokens": 4798806.0, + "step": 921, + "train/ce_loss": 2.0138051695539616e-05 + }, + { + "epoch": 0.09106189440379672, + "step": 921, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.09106189440379672, + "step": 921, + "train/total_loss": 0.07031451165676117 + }, + { + "entropy": 9.315386772155762, + "epoch": 0.09116076725331224, + "mean_token_accuracy": 0.7726027369499207, + "num_tokens": 4804026.0, + "step": 922, + "train/ce_loss": 0.5254396200180054 + }, + { + "epoch": 0.09116076725331224, + "step": 922, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.09116076725331224, + "step": 922, + "train/total_loss": 0.09160646796226501 + }, + { + "entropy": 9.016719818115234, + "epoch": 0.09125964010282776, + "mean_token_accuracy": 0.7427983283996582, + "num_tokens": 4809561.0, + "step": 923, + "train/ce_loss": 0.8595583438873291 + }, + { + "epoch": 0.09125964010282776, + "step": 923, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.09125964010282776, + "step": 923, + "train/total_loss": 0.15626832842826843 + }, + { + "entropy": 9.173903465270996, + "epoch": 0.09135851295234329, + "mean_token_accuracy": 0.6830732226371765, + "num_tokens": 4814882.0, + "step": 924, + "train/ce_loss": 0.7632126212120056 + }, + { + "epoch": 0.09135851295234329, + "step": 924, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.09135851295234329, + "step": 924, + "train/total_loss": 0.15054002404212952 + }, + { + "entropy": 9.216700553894043, + "epoch": 0.0914573858018588, + "mean_token_accuracy": 0.7312775254249573, + "num_tokens": 4820252.0, + "step": 925, + "train/ce_loss": 0.8972272872924805 + }, + { + "epoch": 0.0914573858018588, + "step": 925, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.0914573858018588, + "step": 925, + "train/total_loss": 0.15222272276878357 + }, + { + "entropy": 9.649072647094727, + "epoch": 0.09155625865137433, + "mean_token_accuracy": 0.7263888716697693, + "num_tokens": 4825419.0, + "step": 926, + "train/ce_loss": 1.167543888092041 + }, + { + "epoch": 0.09155625865137433, + "step": 926, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.09155625865137433, + "step": 926, + "train/total_loss": 0.21441063284873962 + }, + { + "entropy": 9.430672645568848, + "epoch": 0.09165513150088986, + "mean_token_accuracy": 0.71875, + "num_tokens": 4830615.0, + "step": 927, + "train/ce_loss": 0.7259710431098938 + }, + { + "epoch": 0.09165513150088986, + "step": 927, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.09165513150088986, + "step": 927, + "train/total_loss": 0.18197211623191833 + }, + { + "entropy": 9.15649127960205, + "epoch": 0.09175400435040539, + "mean_token_accuracy": 0.6683831214904785, + "num_tokens": 4836069.0, + "step": 928, + "train/ce_loss": 0.8065360188484192 + }, + { + "epoch": 0.09175400435040539, + "step": 928, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.09175400435040539, + "step": 928, + "train/total_loss": 0.2056536078453064 + }, + { + "entropy": 9.24024772644043, + "epoch": 0.0918528771999209, + "mean_token_accuracy": 0.7321212291717529, + "num_tokens": 4841299.0, + "step": 929, + "train/ce_loss": 1.1482356786727905 + }, + { + "epoch": 0.0918528771999209, + "step": 929, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.0918528771999209, + "step": 929, + "train/total_loss": 0.208573579788208 + }, + { + "entropy": 9.268930435180664, + "epoch": 0.09195175004943643, + "mean_token_accuracy": 0.7400721907615662, + "num_tokens": 4846612.0, + "step": 930, + "train/ce_loss": 1.3190456628799438 + }, + { + "epoch": 0.09195175004943643, + "step": 930, + "train/sim_loss": 0.15625 + }, + { + "epoch": 0.09195175004943643, + "step": 930, + "train/total_loss": 0.28815457224845886 + }, + { + "entropy": 9.457271575927734, + "epoch": 0.09205062289895195, + "mean_token_accuracy": 0.7328145503997803, + "num_tokens": 4852035.0, + "step": 931, + "train/ce_loss": 0.5545473098754883 + }, + { + "epoch": 0.09205062289895195, + "step": 931, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.09205062289895195, + "step": 931, + "train/total_loss": 0.19217348098754883 + }, + { + "entropy": 9.846050262451172, + "epoch": 0.09214949574846747, + "mean_token_accuracy": 0.7230769395828247, + "num_tokens": 4857039.0, + "step": 932, + "train/ce_loss": 0.896928608417511 + }, + { + "epoch": 0.09214949574846747, + "step": 932, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.09214949574846747, + "step": 932, + "train/total_loss": 0.1990678608417511 + }, + { + "entropy": 9.475932121276855, + "epoch": 0.092248368597983, + "mean_token_accuracy": 0.6655791401863098, + "num_tokens": 4862013.0, + "step": 933, + "train/ce_loss": 0.8814436197280884 + }, + { + "epoch": 0.092248368597983, + "step": 933, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.092248368597983, + "step": 933, + "train/total_loss": 0.16626936197280884 + }, + { + "entropy": 9.894923210144043, + "epoch": 0.09234724144749852, + "mean_token_accuracy": 0.7560975551605225, + "num_tokens": 4867047.0, + "step": 934, + "train/ce_loss": 1.5262436866760254 + }, + { + "epoch": 0.09234724144749852, + "step": 934, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.09234724144749852, + "step": 934, + "train/total_loss": 0.20340561866760254 + }, + { + "entropy": 9.473404884338379, + "epoch": 0.09244611429701403, + "mean_token_accuracy": 0.7365792989730835, + "num_tokens": 4872304.0, + "step": 935, + "train/ce_loss": 0.7553489804267883 + }, + { + "epoch": 0.09244611429701403, + "step": 935, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.09244611429701403, + "step": 935, + "train/total_loss": 0.1731911599636078 + }, + { + "entropy": 8.571310043334961, + "epoch": 0.09254498714652956, + "mean_token_accuracy": 0.7900262475013733, + "num_tokens": 4877960.0, + "step": 936, + "train/ce_loss": 0.400553435087204 + }, + { + "epoch": 0.09254498714652956, + "step": 936, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.09254498714652956, + "step": 936, + "train/total_loss": 0.14161784946918488 + }, + { + "entropy": 9.622425079345703, + "epoch": 0.09264385999604509, + "mean_token_accuracy": 0.7332361340522766, + "num_tokens": 4883083.0, + "step": 937, + "train/ce_loss": 0.6467565894126892 + }, + { + "epoch": 0.09264385999604509, + "step": 937, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.09264385999604509, + "step": 937, + "train/total_loss": 0.15451940894126892 + }, + { + "entropy": 9.209803581237793, + "epoch": 0.09274273284556062, + "mean_token_accuracy": 0.7240990996360779, + "num_tokens": 4888480.0, + "step": 938, + "train/ce_loss": 0.7532393336296082 + }, + { + "epoch": 0.09274273284556062, + "step": 938, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.09274273284556062, + "step": 938, + "train/total_loss": 0.2159489393234253 + }, + { + "entropy": 9.478158950805664, + "epoch": 0.09284160569507613, + "mean_token_accuracy": 0.7897371649742126, + "num_tokens": 4893692.0, + "step": 939, + "train/ce_loss": 0.7379792928695679 + }, + { + "epoch": 0.09284160569507613, + "step": 939, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.09284160569507613, + "step": 939, + "train/total_loss": 0.12457918375730515 + }, + { + "epoch": 0.09294047854459166, + "grad_norm": 0.9931323528289795, + "learning_rate": 9.770311031993277e-06, + "loss": 0.1841, + "step": 940 + }, + { + "entropy": 9.09237289428711, + "epoch": 0.09294047854459166, + "mean_token_accuracy": 0.7732426524162292, + "num_tokens": 4899037.0, + "step": 940, + "train/ce_loss": 0.5895564556121826 + }, + { + "epoch": 0.09294047854459166, + "step": 940, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.09294047854459166, + "step": 940, + "train/total_loss": 0.16051813960075378 + }, + { + "entropy": 10.08241081237793, + "epoch": 0.09303935139410718, + "mean_token_accuracy": 0.752293586730957, + "num_tokens": 4903761.0, + "step": 941, + "train/ce_loss": 4.899001578451134e-05 + }, + { + "epoch": 0.09303935139410718, + "step": 941, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.09303935139410718, + "step": 941, + "train/total_loss": 0.07812990248203278 + }, + { + "entropy": 9.072853088378906, + "epoch": 0.0931382242436227, + "mean_token_accuracy": 0.7931416034698486, + "num_tokens": 4909129.0, + "step": 942, + "train/ce_loss": 0.58165043592453 + }, + { + "epoch": 0.0931382242436227, + "step": 942, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.0931382242436227, + "step": 942, + "train/total_loss": 0.101133793592453 + }, + { + "entropy": 9.140288352966309, + "epoch": 0.09323709709313822, + "mean_token_accuracy": 0.766978919506073, + "num_tokens": 4914501.0, + "step": 943, + "train/ce_loss": 0.5601727962493896 + }, + { + "epoch": 0.09323709709313822, + "step": 943, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.09323709709313822, + "step": 943, + "train/total_loss": 0.09117352962493896 + }, + { + "entropy": 9.401217460632324, + "epoch": 0.09333596994265375, + "mean_token_accuracy": 0.7375796437263489, + "num_tokens": 4919689.0, + "step": 944, + "train/ce_loss": 1.0126149654388428 + }, + { + "epoch": 0.09333596994265375, + "step": 944, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.09333596994265375, + "step": 944, + "train/total_loss": 0.12860524654388428 + }, + { + "entropy": 9.376395225524902, + "epoch": 0.09343484279216926, + "mean_token_accuracy": 0.7076537013053894, + "num_tokens": 4924936.0, + "step": 945, + "train/ce_loss": 0.7645605206489563 + }, + { + "epoch": 0.09343484279216926, + "step": 945, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.09343484279216926, + "step": 945, + "train/total_loss": 0.15848730504512787 + }, + { + "entropy": 9.753252983093262, + "epoch": 0.09353371564168479, + "mean_token_accuracy": 0.7732656598091125, + "num_tokens": 4930019.0, + "step": 946, + "train/ce_loss": 0.7650435566902161 + }, + { + "epoch": 0.09353371564168479, + "step": 946, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.09353371564168479, + "step": 946, + "train/total_loss": 0.13900434970855713 + }, + { + "entropy": 9.455278396606445, + "epoch": 0.09363258849120032, + "mean_token_accuracy": 0.7410423159599304, + "num_tokens": 4935086.0, + "step": 947, + "train/ce_loss": 1.4153460264205933 + }, + { + "epoch": 0.09363258849120032, + "step": 947, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.09363258849120032, + "step": 947, + "train/total_loss": 0.25090962648391724 + }, + { + "entropy": 9.894110679626465, + "epoch": 0.09373146134071583, + "mean_token_accuracy": 0.7921478152275085, + "num_tokens": 4939913.0, + "step": 948, + "train/ce_loss": 7.748230564175174e-05 + }, + { + "epoch": 0.09373146134071583, + "step": 948, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.09373146134071583, + "step": 948, + "train/total_loss": 0.0820389986038208 + }, + { + "entropy": 9.149972915649414, + "epoch": 0.09383033419023136, + "mean_token_accuracy": 0.6889804601669312, + "num_tokens": 4945318.0, + "step": 949, + "train/ce_loss": 0.6303824186325073 + }, + { + "epoch": 0.09383033419023136, + "step": 949, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.09383033419023136, + "step": 949, + "train/total_loss": 0.13725699484348297 + }, + { + "entropy": 8.84632682800293, + "epoch": 0.09392920703974689, + "mean_token_accuracy": 0.7248826026916504, + "num_tokens": 4950855.0, + "step": 950, + "train/ce_loss": 1.2966424226760864 + }, + { + "epoch": 0.09392920703974689, + "step": 950, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.09392920703974689, + "step": 950, + "train/total_loss": 0.21169549226760864 + }, + { + "entropy": 9.00080680847168, + "epoch": 0.09402807988926241, + "mean_token_accuracy": 0.7522211074829102, + "num_tokens": 4956337.0, + "step": 951, + "train/ce_loss": 0.5790229439735413 + }, + { + "epoch": 0.09402807988926241, + "step": 951, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.09402807988926241, + "step": 951, + "train/total_loss": 0.10087104141712189 + }, + { + "entropy": 9.652454376220703, + "epoch": 0.09412695273877793, + "mean_token_accuracy": 0.7160120606422424, + "num_tokens": 4961403.0, + "step": 952, + "train/ce_loss": 1.1361289024353027 + }, + { + "epoch": 0.09412695273877793, + "step": 952, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.09412695273877793, + "step": 952, + "train/total_loss": 0.20345664024353027 + }, + { + "entropy": 9.625324249267578, + "epoch": 0.09422582558829345, + "mean_token_accuracy": 0.7744565010070801, + "num_tokens": 4966589.0, + "step": 953, + "train/ce_loss": 2.6591091227601282e-05 + }, + { + "epoch": 0.09422582558829345, + "step": 953, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.09422582558829345, + "step": 953, + "train/total_loss": 0.0703151598572731 + }, + { + "entropy": 9.048276901245117, + "epoch": 0.09432469843780898, + "mean_token_accuracy": 0.6781609058380127, + "num_tokens": 4971948.0, + "step": 954, + "train/ce_loss": 1.3899551630020142 + }, + { + "epoch": 0.09432469843780898, + "step": 954, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.09432469843780898, + "step": 954, + "train/total_loss": 0.25227677822113037 + }, + { + "entropy": 9.31239128112793, + "epoch": 0.0944235712873245, + "mean_token_accuracy": 0.6825581192970276, + "num_tokens": 4977237.0, + "step": 955, + "train/ce_loss": 0.6473762392997742 + }, + { + "epoch": 0.0944235712873245, + "step": 955, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.0944235712873245, + "step": 955, + "train/total_loss": 0.14676886796951294 + }, + { + "entropy": 8.887161254882812, + "epoch": 0.09452244413684002, + "mean_token_accuracy": 0.759829044342041, + "num_tokens": 4982837.0, + "step": 956, + "train/ce_loss": 0.6316436529159546 + }, + { + "epoch": 0.09452244413684002, + "step": 956, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.09452244413684002, + "step": 956, + "train/total_loss": 0.1295706182718277 + }, + { + "entropy": 9.931779861450195, + "epoch": 0.09462131698635555, + "mean_token_accuracy": 0.7406014800071716, + "num_tokens": 4987864.0, + "step": 957, + "train/ce_loss": 1.021917462348938 + }, + { + "epoch": 0.09462131698635555, + "step": 957, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.09462131698635555, + "step": 957, + "train/total_loss": 0.1959417462348938 + }, + { + "entropy": 9.646172523498535, + "epoch": 0.09472018983587106, + "mean_token_accuracy": 0.7307132482528687, + "num_tokens": 4992995.0, + "step": 958, + "train/ce_loss": 1.0254849195480347 + }, + { + "epoch": 0.09472018983587106, + "step": 958, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.09472018983587106, + "step": 958, + "train/total_loss": 0.2197359949350357 + }, + { + "entropy": 9.367281913757324, + "epoch": 0.09481906268538659, + "mean_token_accuracy": 0.6963824033737183, + "num_tokens": 4998185.0, + "step": 959, + "train/ce_loss": 0.7240278720855713 + }, + { + "epoch": 0.09481906268538659, + "step": 959, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.09481906268538659, + "step": 959, + "train/total_loss": 0.16224654018878937 + }, + { + "epoch": 0.09491793553490212, + "grad_norm": 1.1352251768112183, + "learning_rate": 9.765366167235327e-06, + "loss": 0.1763, + "step": 960 + }, + { + "entropy": 9.286064147949219, + "epoch": 0.09491793553490212, + "mean_token_accuracy": 0.7055492401123047, + "num_tokens": 5003682.0, + "step": 960, + "train/ce_loss": 0.8170363903045654 + }, + { + "epoch": 0.09491793553490212, + "step": 960, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.09491793553490212, + "step": 960, + "train/total_loss": 0.19889113306999207 + }, + { + "entropy": 9.945854187011719, + "epoch": 0.09501680838441764, + "mean_token_accuracy": 0.7458677887916565, + "num_tokens": 5008588.0, + "step": 961, + "train/ce_loss": 1.0800068378448486 + }, + { + "epoch": 0.09501680838441764, + "step": 961, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.09501680838441764, + "step": 961, + "train/total_loss": 0.19393819570541382 + }, + { + "entropy": 9.646007537841797, + "epoch": 0.09511568123393316, + "mean_token_accuracy": 0.7693602442741394, + "num_tokens": 5013660.0, + "step": 962, + "train/ce_loss": 0.8580648303031921 + }, + { + "epoch": 0.09511568123393316, + "step": 962, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.09511568123393316, + "step": 962, + "train/total_loss": 0.1209627315402031 + }, + { + "entropy": 9.333677291870117, + "epoch": 0.09521455408344869, + "mean_token_accuracy": 0.7670251131057739, + "num_tokens": 5018978.0, + "step": 963, + "train/ce_loss": 0.6943764686584473 + }, + { + "epoch": 0.09521455408344869, + "step": 963, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.09521455408344869, + "step": 963, + "train/total_loss": 0.1710001528263092 + }, + { + "entropy": 9.317039489746094, + "epoch": 0.09531342693296421, + "mean_token_accuracy": 0.7134146094322205, + "num_tokens": 5024290.0, + "step": 964, + "train/ce_loss": 1.0419347286224365 + }, + { + "epoch": 0.09531342693296421, + "step": 964, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.09531342693296421, + "step": 964, + "train/total_loss": 0.20575597882270813 + }, + { + "entropy": 9.468748092651367, + "epoch": 0.09541229978247973, + "mean_token_accuracy": 0.7516425848007202, + "num_tokens": 5029511.0, + "step": 965, + "train/ce_loss": 0.7559531927108765 + }, + { + "epoch": 0.09541229978247973, + "step": 965, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.09541229978247973, + "step": 965, + "train/total_loss": 0.13418906927108765 + }, + { + "entropy": 9.766124725341797, + "epoch": 0.09551117263199525, + "mean_token_accuracy": 0.7006269693374634, + "num_tokens": 5034627.0, + "step": 966, + "train/ce_loss": 0.7853338122367859 + }, + { + "epoch": 0.09551117263199525, + "step": 966, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.09551117263199525, + "step": 966, + "train/total_loss": 0.1879083812236786 + }, + { + "entropy": 10.407069206237793, + "epoch": 0.09561004548151078, + "mean_token_accuracy": 0.7586206793785095, + "num_tokens": 5039340.0, + "step": 967, + "train/ce_loss": 4.891712887911126e-05 + }, + { + "epoch": 0.09561004548151078, + "step": 967, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.09561004548151078, + "step": 967, + "train/total_loss": 0.03906739130616188 + }, + { + "entropy": 10.262247085571289, + "epoch": 0.0957089183310263, + "mean_token_accuracy": 0.723557710647583, + "num_tokens": 5044132.0, + "step": 968, + "train/ce_loss": 4.142152829444967e-05 + }, + { + "epoch": 0.0957089183310263, + "step": 968, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.0957089183310263, + "step": 968, + "train/total_loss": 0.03125414252281189 + }, + { + "entropy": 9.122846603393555, + "epoch": 0.09580779118054182, + "mean_token_accuracy": 0.7097130417823792, + "num_tokens": 5049493.0, + "step": 969, + "train/ce_loss": 1.2161741256713867 + }, + { + "epoch": 0.09580779118054182, + "step": 969, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.09580779118054182, + "step": 969, + "train/total_loss": 0.1567736566066742 + }, + { + "entropy": 9.219634056091309, + "epoch": 0.09590666403005735, + "mean_token_accuracy": 0.7389830350875854, + "num_tokens": 5054859.0, + "step": 970, + "train/ce_loss": 0.8640303611755371 + }, + { + "epoch": 0.09590666403005735, + "step": 970, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.09590666403005735, + "step": 970, + "train/total_loss": 0.2035905420780182 + }, + { + "entropy": 9.576761245727539, + "epoch": 0.09600553687957288, + "mean_token_accuracy": 0.6657682061195374, + "num_tokens": 5060061.0, + "step": 971, + "train/ce_loss": 2.4594476222991943 + }, + { + "epoch": 0.09600553687957288, + "step": 971, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.09600553687957288, + "step": 971, + "train/total_loss": 0.3396947681903839 + }, + { + "entropy": 9.380084991455078, + "epoch": 0.09610440972908839, + "mean_token_accuracy": 0.7656458020210266, + "num_tokens": 5065271.0, + "step": 972, + "train/ce_loss": 0.49439525604248047 + }, + { + "epoch": 0.09610440972908839, + "step": 972, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.09610440972908839, + "step": 972, + "train/total_loss": 0.16662701964378357 + }, + { + "entropy": 9.686589241027832, + "epoch": 0.09620328257860392, + "mean_token_accuracy": 0.6975609660148621, + "num_tokens": 5070409.0, + "step": 973, + "train/ce_loss": 1.1760450601577759 + }, + { + "epoch": 0.09620328257860392, + "step": 973, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.09620328257860392, + "step": 973, + "train/total_loss": 0.24260450899600983 + }, + { + "entropy": 10.30485725402832, + "epoch": 0.09630215542811944, + "mean_token_accuracy": 0.7692307829856873, + "num_tokens": 5075148.0, + "step": 974, + "train/ce_loss": 1.261991262435913 + }, + { + "epoch": 0.09630215542811944, + "step": 974, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.09630215542811944, + "step": 974, + "train/total_loss": 0.2316678762435913 + }, + { + "entropy": 10.129904747009277, + "epoch": 0.09640102827763496, + "mean_token_accuracy": 0.7991071343421936, + "num_tokens": 5080046.0, + "step": 975, + "train/ce_loss": 3.663766983663663e-05 + }, + { + "epoch": 0.09640102827763496, + "step": 975, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.09640102827763496, + "step": 975, + "train/total_loss": 0.09765991568565369 + }, + { + "entropy": 9.284428596496582, + "epoch": 0.09649990112715048, + "mean_token_accuracy": 0.6944785118103027, + "num_tokens": 5085291.0, + "step": 976, + "train/ce_loss": 0.760468065738678 + }, + { + "epoch": 0.09649990112715048, + "step": 976, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.09649990112715048, + "step": 976, + "train/total_loss": 0.16589055955410004 + }, + { + "entropy": 9.093676567077637, + "epoch": 0.09659877397666601, + "mean_token_accuracy": 0.7232635021209717, + "num_tokens": 5090657.0, + "step": 977, + "train/ce_loss": 0.6065735220909119 + }, + { + "epoch": 0.09659877397666601, + "step": 977, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.09659877397666601, + "step": 977, + "train/total_loss": 0.10753235220909119 + }, + { + "entropy": 10.156436920166016, + "epoch": 0.09669764682618152, + "mean_token_accuracy": 0.773955762386322, + "num_tokens": 5095471.0, + "step": 978, + "train/ce_loss": 1.2206401824951172 + }, + { + "epoch": 0.09669764682618152, + "step": 978, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.09669764682618152, + "step": 978, + "train/total_loss": 0.1689390242099762 + }, + { + "entropy": 9.224005699157715, + "epoch": 0.09679651967569705, + "mean_token_accuracy": 0.7690557241439819, + "num_tokens": 5100782.0, + "step": 979, + "train/ce_loss": 0.9182709455490112 + }, + { + "epoch": 0.09679651967569705, + "step": 979, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.09679651967569705, + "step": 979, + "train/total_loss": 0.19338959455490112 + }, + { + "epoch": 0.09689539252521258, + "grad_norm": 0.9375425577163696, + "learning_rate": 9.760421302477378e-06, + "loss": 0.1738, + "step": 980 + }, + { + "entropy": 9.039779663085938, + "epoch": 0.09689539252521258, + "mean_token_accuracy": 0.6974595785140991, + "num_tokens": 5106046.0, + "step": 980, + "train/ce_loss": 0.6134800314903259 + }, + { + "epoch": 0.09689539252521258, + "step": 980, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.09689539252521258, + "step": 980, + "train/total_loss": 0.13947300612926483 + }, + { + "entropy": 10.0238676071167, + "epoch": 0.0969942653747281, + "mean_token_accuracy": 0.7307692170143127, + "num_tokens": 5110808.0, + "step": 981, + "train/ce_loss": 0.0001462309737689793 + }, + { + "epoch": 0.0969942653747281, + "step": 981, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.0969942653747281, + "step": 981, + "train/total_loss": 0.10548337548971176 + }, + { + "entropy": 9.56509780883789, + "epoch": 0.09709313822424362, + "mean_token_accuracy": 0.7315634489059448, + "num_tokens": 5115944.0, + "step": 982, + "train/ce_loss": 1.0472911596298218 + }, + { + "epoch": 0.09709313822424362, + "step": 982, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.09709313822424362, + "step": 982, + "train/total_loss": 0.18676036596298218 + }, + { + "entropy": 9.660545349121094, + "epoch": 0.09719201107375915, + "mean_token_accuracy": 0.7556818127632141, + "num_tokens": 5121091.0, + "step": 983, + "train/ce_loss": 0.528469979763031 + }, + { + "epoch": 0.09719201107375915, + "step": 983, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.09719201107375915, + "step": 983, + "train/total_loss": 0.0997219979763031 + }, + { + "entropy": 9.2199125289917, + "epoch": 0.09729088392327467, + "mean_token_accuracy": 0.6720368266105652, + "num_tokens": 5126456.0, + "step": 984, + "train/ce_loss": 1.1708029508590698 + }, + { + "epoch": 0.09729088392327467, + "step": 984, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.09729088392327467, + "step": 984, + "train/total_loss": 0.22645530104637146 + }, + { + "entropy": 9.225263595581055, + "epoch": 0.09738975677279019, + "mean_token_accuracy": 0.7661470174789429, + "num_tokens": 5131806.0, + "step": 985, + "train/ce_loss": 1.202368974685669 + }, + { + "epoch": 0.09738975677279019, + "step": 985, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.09738975677279019, + "step": 985, + "train/total_loss": 0.15539315342903137 + }, + { + "entropy": 9.521390914916992, + "epoch": 0.09748862962230571, + "mean_token_accuracy": 0.7075588703155518, + "num_tokens": 5137059.0, + "step": 986, + "train/ce_loss": 0.992639422416687 + }, + { + "epoch": 0.09748862962230571, + "step": 986, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.09748862962230571, + "step": 986, + "train/total_loss": 0.14613893628120422 + }, + { + "entropy": 8.910574913024902, + "epoch": 0.09758750247182124, + "mean_token_accuracy": 0.7683258056640625, + "num_tokens": 5142658.0, + "step": 987, + "train/ce_loss": 0.9678569436073303 + }, + { + "epoch": 0.09758750247182124, + "step": 987, + "train/sim_loss": 0.15625 + }, + { + "epoch": 0.09758750247182124, + "step": 987, + "train/total_loss": 0.25303569436073303 + }, + { + "entropy": 9.38957691192627, + "epoch": 0.09768637532133675, + "mean_token_accuracy": 0.770370364189148, + "num_tokens": 5147938.0, + "step": 988, + "train/ce_loss": 0.846873939037323 + }, + { + "epoch": 0.09768637532133675, + "step": 988, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.09768637532133675, + "step": 988, + "train/total_loss": 0.19796864688396454 + }, + { + "entropy": 9.404655456542969, + "epoch": 0.09778524817085228, + "mean_token_accuracy": 0.7073760628700256, + "num_tokens": 5153258.0, + "step": 989, + "train/ce_loss": 0.7749897241592407 + }, + { + "epoch": 0.09778524817085228, + "step": 989, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.09778524817085228, + "step": 989, + "train/total_loss": 0.19078022241592407 + }, + { + "entropy": 9.652052879333496, + "epoch": 0.09788412102036781, + "mean_token_accuracy": 0.700276255607605, + "num_tokens": 5158422.0, + "step": 990, + "train/ce_loss": 1.1462301015853882 + }, + { + "epoch": 0.09788412102036781, + "step": 990, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.09788412102036781, + "step": 990, + "train/total_loss": 0.19665426015853882 + }, + { + "entropy": 9.799245834350586, + "epoch": 0.09798299386988334, + "mean_token_accuracy": 0.7184750437736511, + "num_tokens": 5163525.0, + "step": 991, + "train/ce_loss": 0.8648521304130554 + }, + { + "epoch": 0.09798299386988334, + "step": 991, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.09798299386988334, + "step": 991, + "train/total_loss": 0.23492270708084106 + }, + { + "entropy": 9.31769847869873, + "epoch": 0.09808186671939885, + "mean_token_accuracy": 0.7049723863601685, + "num_tokens": 5168924.0, + "step": 992, + "train/ce_loss": 0.7937401533126831 + }, + { + "epoch": 0.09808186671939885, + "step": 992, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.09808186671939885, + "step": 992, + "train/total_loss": 0.1887490153312683 + }, + { + "entropy": 9.720190048217773, + "epoch": 0.09818073956891438, + "mean_token_accuracy": 0.7551724314689636, + "num_tokens": 5173940.0, + "step": 993, + "train/ce_loss": 1.0298943519592285 + }, + { + "epoch": 0.09818073956891438, + "step": 993, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.09818073956891438, + "step": 993, + "train/total_loss": 0.25142693519592285 + }, + { + "entropy": 9.876079559326172, + "epoch": 0.0982796124184299, + "mean_token_accuracy": 0.7508650422096252, + "num_tokens": 5178969.0, + "step": 994, + "train/ce_loss": 2.5448929591220804e-05 + }, + { + "epoch": 0.0982796124184299, + "step": 994, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.0982796124184299, + "step": 994, + "train/total_loss": 0.04297129437327385 + }, + { + "entropy": 10.213848114013672, + "epoch": 0.09837848526794542, + "mean_token_accuracy": 0.7462038993835449, + "num_tokens": 5183853.0, + "step": 995, + "train/ce_loss": 2.6947966034640558e-05 + }, + { + "epoch": 0.09837848526794542, + "step": 995, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.09837848526794542, + "step": 995, + "train/total_loss": 0.03125269338488579 + }, + { + "entropy": 9.504982948303223, + "epoch": 0.09847735811746094, + "mean_token_accuracy": 0.7118194103240967, + "num_tokens": 5189055.0, + "step": 996, + "train/ce_loss": 1.0164376497268677 + }, + { + "epoch": 0.09847735811746094, + "step": 996, + "train/sim_loss": 0.1796875 + }, + { + "epoch": 0.09847735811746094, + "step": 996, + "train/total_loss": 0.28133127093315125 + }, + { + "entropy": 9.319685935974121, + "epoch": 0.09857623096697647, + "mean_token_accuracy": 0.7459584474563599, + "num_tokens": 5194385.0, + "step": 997, + "train/ce_loss": 0.9244051575660706 + }, + { + "epoch": 0.09857623096697647, + "step": 997, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.09857623096697647, + "step": 997, + "train/total_loss": 0.22134676575660706 + }, + { + "entropy": 9.0696439743042, + "epoch": 0.09867510381649199, + "mean_token_accuracy": 0.7338709831237793, + "num_tokens": 5199780.0, + "step": 998, + "train/ce_loss": 0.7940992116928101 + }, + { + "epoch": 0.09867510381649199, + "step": 998, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.09867510381649199, + "step": 998, + "train/total_loss": 0.14972242712974548 + }, + { + "entropy": 9.231082916259766, + "epoch": 0.09877397666600751, + "mean_token_accuracy": 0.706695020198822, + "num_tokens": 5205221.0, + "step": 999, + "train/ce_loss": 1.249048113822937 + }, + { + "epoch": 0.09877397666600751, + "step": 999, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.09877397666600751, + "step": 999, + "train/total_loss": 0.1678735613822937 + }, + { + "epoch": 0.09887284951552304, + "grad_norm": 1.0724185705184937, + "learning_rate": 9.755476437719428e-06, + "loss": 0.1774, + "step": 1000 + }, + { + "entropy": 9.569454193115234, + "epoch": 0.09887284951552304, + "mean_token_accuracy": 0.7770618796348572, + "num_tokens": 5210499.0, + "step": 1000, + "train/ce_loss": 1.22596275806427 + }, + { + "epoch": 0.09887284951552304, + "step": 1000, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.09887284951552304, + "step": 1000, + "train/total_loss": 0.21244002878665924 + }, + { + "entropy": 9.117839813232422, + "epoch": 0.09897172236503857, + "mean_token_accuracy": 0.7530747652053833, + "num_tokens": 5215992.0, + "step": 1001, + "train/ce_loss": 0.6427945494651794 + }, + { + "epoch": 0.09897172236503857, + "step": 1001, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.09897172236503857, + "step": 1001, + "train/total_loss": 0.1541232168674469 + }, + { + "entropy": 9.556461334228516, + "epoch": 0.09907059521455408, + "mean_token_accuracy": 0.7525510191917419, + "num_tokens": 5221190.0, + "step": 1002, + "train/ce_loss": 0.8305707573890686 + }, + { + "epoch": 0.09907059521455408, + "step": 1002, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.09907059521455408, + "step": 1002, + "train/total_loss": 0.16899457573890686 + }, + { + "entropy": 9.66070556640625, + "epoch": 0.09916946806406961, + "mean_token_accuracy": 0.6742081642150879, + "num_tokens": 5226301.0, + "step": 1003, + "train/ce_loss": 1.040513515472412 + }, + { + "epoch": 0.09916946806406961, + "step": 1003, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.09916946806406961, + "step": 1003, + "train/total_loss": 0.1821763515472412 + }, + { + "entropy": 9.522982597351074, + "epoch": 0.09926834091358513, + "mean_token_accuracy": 0.7161290049552917, + "num_tokens": 5231531.0, + "step": 1004, + "train/ce_loss": 0.7382574677467346 + }, + { + "epoch": 0.09926834091358513, + "step": 1004, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.09926834091358513, + "step": 1004, + "train/total_loss": 0.11288824677467346 + }, + { + "entropy": 9.404748916625977, + "epoch": 0.09936721376310065, + "mean_token_accuracy": 0.737171471118927, + "num_tokens": 5236786.0, + "step": 1005, + "train/ce_loss": 1.1236399412155151 + }, + { + "epoch": 0.09936721376310065, + "step": 1005, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.09936721376310065, + "step": 1005, + "train/total_loss": 0.2022077441215515 + }, + { + "entropy": 10.117021560668945, + "epoch": 0.09946608661261618, + "mean_token_accuracy": 0.738095223903656, + "num_tokens": 5241623.0, + "step": 1006, + "train/ce_loss": 1.809425950050354 + }, + { + "epoch": 0.09946608661261618, + "step": 1006, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.09946608661261618, + "step": 1006, + "train/total_loss": 0.2825050950050354 + }, + { + "entropy": 9.110442161560059, + "epoch": 0.0995649594621317, + "mean_token_accuracy": 0.6974697709083557, + "num_tokens": 5247022.0, + "step": 1007, + "train/ce_loss": 1.1598174571990967 + }, + { + "epoch": 0.0995649594621317, + "step": 1007, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.0995649594621317, + "step": 1007, + "train/total_loss": 0.21754425764083862 + }, + { + "entropy": 9.71247673034668, + "epoch": 0.09966383231164722, + "mean_token_accuracy": 0.7104136943817139, + "num_tokens": 5252114.0, + "step": 1008, + "train/ce_loss": 0.6906753182411194 + }, + { + "epoch": 0.09966383231164722, + "step": 1008, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.09966383231164722, + "step": 1008, + "train/total_loss": 0.13938003778457642 + }, + { + "entropy": 9.66971492767334, + "epoch": 0.09976270516116274, + "mean_token_accuracy": 0.8030592799186707, + "num_tokens": 5257107.0, + "step": 1009, + "train/ce_loss": 0.7509585618972778 + }, + { + "epoch": 0.09976270516116274, + "step": 1009, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.09976270516116274, + "step": 1009, + "train/total_loss": 0.12978336215019226 + }, + { + "entropy": 9.871513366699219, + "epoch": 0.09986157801067827, + "mean_token_accuracy": 0.7030201554298401, + "num_tokens": 5262083.0, + "step": 1010, + "train/ce_loss": 1.4761762619018555 + }, + { + "epoch": 0.09986157801067827, + "step": 1010, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.09986157801067827, + "step": 1010, + "train/total_loss": 0.2843363881111145 + }, + { + "entropy": 9.108389854431152, + "epoch": 0.0999604508601938, + "mean_token_accuracy": 0.7257072329521179, + "num_tokens": 5267393.0, + "step": 1011, + "train/ce_loss": 1.0095230340957642 + }, + { + "epoch": 0.0999604508601938, + "step": 1011, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.0999604508601938, + "step": 1011, + "train/total_loss": 0.24938979744911194 + }, + { + "entropy": 9.610630989074707, + "epoch": 0.10005932370970931, + "mean_token_accuracy": 0.7174515128135681, + "num_tokens": 5272534.0, + "step": 1012, + "train/ce_loss": 0.6784527897834778 + }, + { + "epoch": 0.10005932370970931, + "step": 1012, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.10005932370970931, + "step": 1012, + "train/total_loss": 0.12253277748823166 + }, + { + "entropy": 8.754789352416992, + "epoch": 0.10015819655922484, + "mean_token_accuracy": 0.7428810596466064, + "num_tokens": 5278206.0, + "step": 1013, + "train/ce_loss": 0.6056615114212036 + }, + { + "epoch": 0.10015819655922484, + "step": 1013, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.10015819655922484, + "step": 1013, + "train/total_loss": 0.09572240710258484 + }, + { + "entropy": 9.353209495544434, + "epoch": 0.10025706940874037, + "mean_token_accuracy": 0.7290886640548706, + "num_tokens": 5283457.0, + "step": 1014, + "train/ce_loss": 0.7436414361000061 + }, + { + "epoch": 0.10025706940874037, + "step": 1014, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.10025706940874037, + "step": 1014, + "train/total_loss": 0.16811415553092957 + }, + { + "entropy": 9.666563987731934, + "epoch": 0.10035594225825588, + "mean_token_accuracy": 0.7506702542304993, + "num_tokens": 5288815.0, + "step": 1015, + "train/ce_loss": 0.5352444648742676 + }, + { + "epoch": 0.10035594225825588, + "step": 1015, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.10035594225825588, + "step": 1015, + "train/total_loss": 0.088680699467659 + }, + { + "entropy": 9.092962265014648, + "epoch": 0.1004548151077714, + "mean_token_accuracy": 0.6735324263572693, + "num_tokens": 5294243.0, + "step": 1016, + "train/ce_loss": 0.8530191779136658 + }, + { + "epoch": 0.1004548151077714, + "step": 1016, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.1004548151077714, + "step": 1016, + "train/total_loss": 0.15561442077159882 + }, + { + "entropy": 9.992460250854492, + "epoch": 0.10055368795728693, + "mean_token_accuracy": 0.7484536170959473, + "num_tokens": 5299175.0, + "step": 1017, + "train/ce_loss": 4.608002564054914e-05 + }, + { + "epoch": 0.10055368795728693, + "step": 1017, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.10055368795728693, + "step": 1017, + "train/total_loss": 0.0781296044588089 + }, + { + "entropy": 9.113978385925293, + "epoch": 0.10065256080680245, + "mean_token_accuracy": 0.6854166388511658, + "num_tokens": 5304671.0, + "step": 1018, + "train/ce_loss": 0.5623672604560852 + }, + { + "epoch": 0.10065256080680245, + "step": 1018, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.10065256080680245, + "step": 1018, + "train/total_loss": 0.16170547902584076 + }, + { + "entropy": 9.790916442871094, + "epoch": 0.10075143365631797, + "mean_token_accuracy": 0.7684210538864136, + "num_tokens": 5309699.0, + "step": 1019, + "train/ce_loss": 1.3232040405273438 + }, + { + "epoch": 0.10075143365631797, + "step": 1019, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.10075143365631797, + "step": 1019, + "train/total_loss": 0.2534141540527344 + }, + { + "epoch": 0.1008503065058335, + "grad_norm": 1.2360754013061523, + "learning_rate": 9.75053157296148e-06, + "loss": 0.1749, + "step": 1020 + }, + { + "entropy": 9.670308113098145, + "epoch": 0.1008503065058335, + "mean_token_accuracy": 0.6661631464958191, + "num_tokens": 5314810.0, + "step": 1020, + "train/ce_loss": 1.7671234607696533 + }, + { + "epoch": 0.1008503065058335, + "step": 1020, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.1008503065058335, + "step": 1020, + "train/total_loss": 0.2821810841560364 + }, + { + "entropy": 9.031726837158203, + "epoch": 0.10094917935534903, + "mean_token_accuracy": 0.730975329875946, + "num_tokens": 5320233.0, + "step": 1021, + "train/ce_loss": 0.9387472867965698 + }, + { + "epoch": 0.10094917935534903, + "step": 1021, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.10094917935534903, + "step": 1021, + "train/total_loss": 0.1485622227191925 + }, + { + "entropy": 9.516468048095703, + "epoch": 0.10104805220486454, + "mean_token_accuracy": 0.7382920384407043, + "num_tokens": 5325369.0, + "step": 1022, + "train/ce_loss": 0.86199951171875 + }, + { + "epoch": 0.10104805220486454, + "step": 1022, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.10104805220486454, + "step": 1022, + "train/total_loss": 0.16432495415210724 + }, + { + "entropy": 9.152571678161621, + "epoch": 0.10114692505438007, + "mean_token_accuracy": 0.7382628917694092, + "num_tokens": 5330709.0, + "step": 1023, + "train/ce_loss": 0.9139989614486694 + }, + { + "epoch": 0.10114692505438007, + "step": 1023, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.10114692505438007, + "step": 1023, + "train/total_loss": 0.1617124080657959 + }, + { + "entropy": 9.668933868408203, + "epoch": 0.1012457979038956, + "mean_token_accuracy": 0.7255216836929321, + "num_tokens": 5335785.0, + "step": 1024, + "train/ce_loss": 0.7937629818916321 + }, + { + "epoch": 0.1012457979038956, + "step": 1024, + "train/sim_loss": 0.16015625 + }, + { + "epoch": 0.1012457979038956, + "step": 1024, + "train/total_loss": 0.23953256011009216 + }, + { + "entropy": 8.975940704345703, + "epoch": 0.10134467075341111, + "mean_token_accuracy": 0.7204641103744507, + "num_tokens": 5341261.0, + "step": 1025, + "train/ce_loss": 1.1749017238616943 + }, + { + "epoch": 0.10134467075341111, + "step": 1025, + "train/sim_loss": 0.19140625 + }, + { + "epoch": 0.10134467075341111, + "step": 1025, + "train/total_loss": 0.30889642238616943 + }, + { + "entropy": 9.504355430603027, + "epoch": 0.10144354360292664, + "mean_token_accuracy": 0.7365177273750305, + "num_tokens": 5346381.0, + "step": 1026, + "train/ce_loss": 1.250915765762329 + }, + { + "epoch": 0.10144354360292664, + "step": 1026, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.10144354360292664, + "step": 1026, + "train/total_loss": 0.1914978325366974 + }, + { + "entropy": 9.745513916015625, + "epoch": 0.10154241645244216, + "mean_token_accuracy": 0.8098256587982178, + "num_tokens": 5351478.0, + "step": 1027, + "train/ce_loss": 1.9117256670142524e-05 + }, + { + "epoch": 0.10154241645244216, + "step": 1027, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.10154241645244216, + "step": 1027, + "train/total_loss": 0.07422066479921341 + }, + { + "entropy": 9.939186096191406, + "epoch": 0.10164128930195768, + "mean_token_accuracy": 0.7660818696022034, + "num_tokens": 5356403.0, + "step": 1028, + "train/ce_loss": 0.6355189681053162 + }, + { + "epoch": 0.10164128930195768, + "step": 1028, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.10164128930195768, + "step": 1028, + "train/total_loss": 0.1533956527709961 + }, + { + "entropy": 9.516606330871582, + "epoch": 0.1017401621514732, + "mean_token_accuracy": 0.7118881344795227, + "num_tokens": 5361618.0, + "step": 1029, + "train/ce_loss": 0.7298515439033508 + }, + { + "epoch": 0.1017401621514732, + "step": 1029, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.1017401621514732, + "step": 1029, + "train/total_loss": 0.14720390737056732 + }, + { + "entropy": 9.831457138061523, + "epoch": 0.10183903500098873, + "mean_token_accuracy": 0.744966447353363, + "num_tokens": 5366926.0, + "step": 1030, + "train/ce_loss": 2.8248850867385045e-05 + }, + { + "epoch": 0.10183903500098873, + "step": 1030, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.10183903500098873, + "step": 1030, + "train/total_loss": 0.10547157377004623 + }, + { + "entropy": 9.04391098022461, + "epoch": 0.10193790785050424, + "mean_token_accuracy": 0.7756202816963196, + "num_tokens": 5372332.0, + "step": 1031, + "train/ce_loss": 0.3345872759819031 + }, + { + "epoch": 0.10193790785050424, + "step": 1031, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.10193790785050424, + "step": 1031, + "train/total_loss": 0.06080247834324837 + }, + { + "entropy": 9.163641929626465, + "epoch": 0.10203678070001977, + "mean_token_accuracy": 0.7380688190460205, + "num_tokens": 5377746.0, + "step": 1032, + "train/ce_loss": 1.9390454326639883e-05 + }, + { + "epoch": 0.10203678070001977, + "step": 1032, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.10203678070001977, + "step": 1032, + "train/total_loss": 0.0625019371509552 + }, + { + "entropy": 9.652329444885254, + "epoch": 0.1021356535495353, + "mean_token_accuracy": 0.7177541851997375, + "num_tokens": 5382866.0, + "step": 1033, + "train/ce_loss": 1.420344352722168 + }, + { + "epoch": 0.1021356535495353, + "step": 1033, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.1021356535495353, + "step": 1033, + "train/total_loss": 0.21234694123268127 + }, + { + "entropy": 9.247446060180664, + "epoch": 0.10223452639905083, + "mean_token_accuracy": 0.7298919558525085, + "num_tokens": 5388181.0, + "step": 1034, + "train/ce_loss": 1.1431541442871094 + }, + { + "epoch": 0.10223452639905083, + "step": 1034, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.10223452639905083, + "step": 1034, + "train/total_loss": 0.17290917038917542 + }, + { + "entropy": 9.286208152770996, + "epoch": 0.10233339924856634, + "mean_token_accuracy": 0.723514199256897, + "num_tokens": 5393399.0, + "step": 1035, + "train/ce_loss": 0.8224537968635559 + }, + { + "epoch": 0.10233339924856634, + "step": 1035, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.10233339924856634, + "step": 1035, + "train/total_loss": 0.1447453796863556 + }, + { + "entropy": 10.020112991333008, + "epoch": 0.10243227209808187, + "mean_token_accuracy": 0.773955762386322, + "num_tokens": 5398283.0, + "step": 1036, + "train/ce_loss": 2.7731024601962417e-05 + }, + { + "epoch": 0.10243227209808187, + "step": 1036, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.10243227209808187, + "step": 1036, + "train/total_loss": 0.07812777161598206 + }, + { + "entropy": 9.74041748046875, + "epoch": 0.1025311449475974, + "mean_token_accuracy": 0.7201907634735107, + "num_tokens": 5403341.0, + "step": 1037, + "train/ce_loss": 1.832025191106368e-05 + }, + { + "epoch": 0.1025311449475974, + "step": 1037, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.1025311449475974, + "step": 1037, + "train/total_loss": 0.03515808284282684 + }, + { + "entropy": 9.702130317687988, + "epoch": 0.10263001779711291, + "mean_token_accuracy": 0.739469587802887, + "num_tokens": 5408403.0, + "step": 1038, + "train/ce_loss": 0.9791857004165649 + }, + { + "epoch": 0.10263001779711291, + "step": 1038, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.10263001779711291, + "step": 1038, + "train/total_loss": 0.1565123200416565 + }, + { + "entropy": 9.729958534240723, + "epoch": 0.10272889064662843, + "mean_token_accuracy": 0.6488189101219177, + "num_tokens": 5413443.0, + "step": 1039, + "train/ce_loss": 3.65582927770447e-05 + }, + { + "epoch": 0.10272889064662843, + "step": 1039, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.10272889064662843, + "step": 1039, + "train/total_loss": 0.07812865823507309 + }, + { + "epoch": 0.10282776349614396, + "grad_norm": 1.0242434740066528, + "learning_rate": 9.745586708203531e-06, + "loss": 0.1722, + "step": 1040 + }, + { + "entropy": 9.663080215454102, + "epoch": 0.10282776349614396, + "mean_token_accuracy": 0.6986532211303711, + "num_tokens": 5418525.0, + "step": 1040, + "train/ce_loss": 3.0713326850673184e-05 + }, + { + "epoch": 0.10282776349614396, + "step": 1040, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.10282776349614396, + "step": 1040, + "train/total_loss": 0.09765931963920593 + }, + { + "entropy": 10.008130073547363, + "epoch": 0.10292663634565948, + "mean_token_accuracy": 0.7440347075462341, + "num_tokens": 5423418.0, + "step": 1041, + "train/ce_loss": 1.2210838794708252 + }, + { + "epoch": 0.10292663634565948, + "step": 1041, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.10292663634565948, + "step": 1041, + "train/total_loss": 0.20413964986801147 + }, + { + "entropy": 9.635592460632324, + "epoch": 0.103025509195175, + "mean_token_accuracy": 0.6676342487335205, + "num_tokens": 5428561.0, + "step": 1042, + "train/ce_loss": 1.4121594429016113 + }, + { + "epoch": 0.103025509195175, + "step": 1042, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.103025509195175, + "step": 1042, + "train/total_loss": 0.2623097002506256 + }, + { + "entropy": 9.016424179077148, + "epoch": 0.10312438204469053, + "mean_token_accuracy": 0.7136015295982361, + "num_tokens": 5434047.0, + "step": 1043, + "train/ce_loss": 0.9875646233558655 + }, + { + "epoch": 0.10312438204469053, + "step": 1043, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.10312438204469053, + "step": 1043, + "train/total_loss": 0.17688146233558655 + }, + { + "entropy": 9.06117057800293, + "epoch": 0.10322325489420606, + "mean_token_accuracy": 0.7363834381103516, + "num_tokens": 5439435.0, + "step": 1044, + "train/ce_loss": 0.7383791208267212 + }, + { + "epoch": 0.10322325489420606, + "step": 1044, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.10322325489420606, + "step": 1044, + "train/total_loss": 0.17540040612220764 + }, + { + "entropy": 9.030341148376465, + "epoch": 0.10332212774372157, + "mean_token_accuracy": 0.711448609828949, + "num_tokens": 5444805.0, + "step": 1045, + "train/ce_loss": 1.4875901937484741 + }, + { + "epoch": 0.10332212774372157, + "step": 1045, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.10332212774372157, + "step": 1045, + "train/total_loss": 0.27375900745391846 + }, + { + "entropy": 9.205885887145996, + "epoch": 0.1034210005932371, + "mean_token_accuracy": 0.7298136353492737, + "num_tokens": 5450226.0, + "step": 1046, + "train/ce_loss": 0.5827401876449585 + }, + { + "epoch": 0.1034210005932371, + "step": 1046, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.1034210005932371, + "step": 1046, + "train/total_loss": 0.11686776578426361 + }, + { + "entropy": 9.355142593383789, + "epoch": 0.10351987344275262, + "mean_token_accuracy": 0.7164887189865112, + "num_tokens": 5455576.0, + "step": 1047, + "train/ce_loss": 0.7967851758003235 + }, + { + "epoch": 0.10351987344275262, + "step": 1047, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.10351987344275262, + "step": 1047, + "train/total_loss": 0.1773347705602646 + }, + { + "entropy": 8.922216415405273, + "epoch": 0.10361874629226814, + "mean_token_accuracy": 0.7060241103172302, + "num_tokens": 5460920.0, + "step": 1048, + "train/ce_loss": 0.6458163857460022 + }, + { + "epoch": 0.10361874629226814, + "step": 1048, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.10361874629226814, + "step": 1048, + "train/total_loss": 0.15833163261413574 + }, + { + "entropy": 9.688494682312012, + "epoch": 0.10371761914178367, + "mean_token_accuracy": 0.7666666507720947, + "num_tokens": 5465943.0, + "step": 1049, + "train/ce_loss": 3.978270251536742e-05 + }, + { + "epoch": 0.10371761914178367, + "step": 1049, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.10371761914178367, + "step": 1049, + "train/total_loss": 0.08203522861003876 + }, + { + "entropy": 9.07177734375, + "epoch": 0.10381649199129919, + "mean_token_accuracy": 0.7392638325691223, + "num_tokens": 5471426.0, + "step": 1050, + "train/ce_loss": 0.9018810391426086 + }, + { + "epoch": 0.10381649199129919, + "step": 1050, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.10381649199129919, + "step": 1050, + "train/total_loss": 0.19175061583518982 + }, + { + "entropy": 9.343345642089844, + "epoch": 0.1039153648408147, + "mean_token_accuracy": 0.7020023465156555, + "num_tokens": 5476722.0, + "step": 1051, + "train/ce_loss": 1.2637797594070435 + }, + { + "epoch": 0.1039153648408147, + "step": 1051, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.1039153648408147, + "step": 1051, + "train/total_loss": 0.18106548488140106 + }, + { + "entropy": 9.867258071899414, + "epoch": 0.10401423769033023, + "mean_token_accuracy": 0.7724252343177795, + "num_tokens": 5481763.0, + "step": 1052, + "train/ce_loss": 6.280098750721663e-05 + }, + { + "epoch": 0.10401423769033023, + "step": 1052, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.10401423769033023, + "step": 1052, + "train/total_loss": 0.0742250308394432 + }, + { + "entropy": 9.537542343139648, + "epoch": 0.10411311053984576, + "mean_token_accuracy": 0.7412280440330505, + "num_tokens": 5486868.0, + "step": 1053, + "train/ce_loss": 0.6791310906410217 + }, + { + "epoch": 0.10411311053984576, + "step": 1053, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.10411311053984576, + "step": 1053, + "train/total_loss": 0.09525685757398605 + }, + { + "entropy": 9.18380355834961, + "epoch": 0.10421198338936129, + "mean_token_accuracy": 0.7676130533218384, + "num_tokens": 5492286.0, + "step": 1054, + "train/ce_loss": 0.6990429162979126 + }, + { + "epoch": 0.10421198338936129, + "step": 1054, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.10421198338936129, + "step": 1054, + "train/total_loss": 0.10896679013967514 + }, + { + "entropy": 9.26451301574707, + "epoch": 0.1043108562388768, + "mean_token_accuracy": 0.7121034264564514, + "num_tokens": 5497607.0, + "step": 1055, + "train/ce_loss": 1.6029696464538574 + }, + { + "epoch": 0.1043108562388768, + "step": 1055, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.1043108562388768, + "step": 1055, + "train/total_loss": 0.2149844616651535 + }, + { + "entropy": 9.994356155395508, + "epoch": 0.10440972908839233, + "mean_token_accuracy": 0.7188118696212769, + "num_tokens": 5502539.0, + "step": 1056, + "train/ce_loss": 0.8876862525939941 + }, + { + "epoch": 0.10440972908839233, + "step": 1056, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.10440972908839233, + "step": 1056, + "train/total_loss": 0.1590811312198639 + }, + { + "entropy": 9.06218147277832, + "epoch": 0.10450860193790786, + "mean_token_accuracy": 0.7263681888580322, + "num_tokens": 5508019.0, + "step": 1057, + "train/ce_loss": 0.6542841196060181 + }, + { + "epoch": 0.10450860193790786, + "step": 1057, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.10450860193790786, + "step": 1057, + "train/total_loss": 0.20605340600013733 + }, + { + "entropy": 9.794883728027344, + "epoch": 0.10460747478742337, + "mean_token_accuracy": 0.7626811861991882, + "num_tokens": 5513039.0, + "step": 1058, + "train/ce_loss": 0.9645821452140808 + }, + { + "epoch": 0.10460747478742337, + "step": 1058, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.10460747478742337, + "step": 1058, + "train/total_loss": 0.14723947644233704 + }, + { + "entropy": 8.95814323425293, + "epoch": 0.1047063476369389, + "mean_token_accuracy": 0.7363238334655762, + "num_tokens": 5518471.0, + "step": 1059, + "train/ce_loss": 0.8257557153701782 + }, + { + "epoch": 0.1047063476369389, + "step": 1059, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.1047063476369389, + "step": 1059, + "train/total_loss": 0.20757557451725006 + }, + { + "epoch": 0.10480522048645442, + "grad_norm": 0.9177259206771851, + "learning_rate": 9.740641843445583e-06, + "loss": 0.1816, + "step": 1060 + }, + { + "entropy": 9.62043571472168, + "epoch": 0.10480522048645442, + "mean_token_accuracy": 0.8195956349372864, + "num_tokens": 5523555.0, + "step": 1060, + "train/ce_loss": 0.784421443939209 + }, + { + "epoch": 0.10480522048645442, + "step": 1060, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.10480522048645442, + "step": 1060, + "train/total_loss": 0.11750464886426926 + }, + { + "entropy": 9.526849746704102, + "epoch": 0.10490409333596994, + "mean_token_accuracy": 0.7537190318107605, + "num_tokens": 5528557.0, + "step": 1061, + "train/ce_loss": 1.3954969644546509 + }, + { + "epoch": 0.10490409333596994, + "step": 1061, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.10490409333596994, + "step": 1061, + "train/total_loss": 0.23329970240592957 + }, + { + "entropy": 9.389505386352539, + "epoch": 0.10500296618548546, + "mean_token_accuracy": 0.7473053932189941, + "num_tokens": 5533850.0, + "step": 1062, + "train/ce_loss": 0.5176913142204285 + }, + { + "epoch": 0.10500296618548546, + "step": 1062, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.10500296618548546, + "step": 1062, + "train/total_loss": 0.12598788738250732 + }, + { + "entropy": 9.29703140258789, + "epoch": 0.10510183903500099, + "mean_token_accuracy": 0.7314285635948181, + "num_tokens": 5539182.0, + "step": 1063, + "train/ce_loss": 0.7778090238571167 + }, + { + "epoch": 0.10510183903500099, + "step": 1063, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.10510183903500099, + "step": 1063, + "train/total_loss": 0.16371840238571167 + }, + { + "entropy": 9.751190185546875, + "epoch": 0.10520071188451652, + "mean_token_accuracy": 0.708053708076477, + "num_tokens": 5544175.0, + "step": 1064, + "train/ce_loss": 2.324305295944214 + }, + { + "epoch": 0.10520071188451652, + "step": 1064, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.10520071188451652, + "step": 1064, + "train/total_loss": 0.32618051767349243 + }, + { + "entropy": 10.641530990600586, + "epoch": 0.10529958473403203, + "mean_token_accuracy": 0.7652581930160522, + "num_tokens": 5548778.0, + "step": 1065, + "train/ce_loss": 0.00010860838665394112 + }, + { + "epoch": 0.10529958473403203, + "step": 1065, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.10529958473403203, + "step": 1065, + "train/total_loss": 0.07813586294651031 + }, + { + "entropy": 9.428884506225586, + "epoch": 0.10539845758354756, + "mean_token_accuracy": 0.7327935099601746, + "num_tokens": 5553960.0, + "step": 1066, + "train/ce_loss": 0.9732573628425598 + }, + { + "epoch": 0.10539845758354756, + "step": 1066, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.10539845758354756, + "step": 1066, + "train/total_loss": 0.20670074224472046 + }, + { + "entropy": 9.512097358703613, + "epoch": 0.10549733043306309, + "mean_token_accuracy": 0.7585752010345459, + "num_tokens": 5559205.0, + "step": 1067, + "train/ce_loss": 1.512286901473999 + }, + { + "epoch": 0.10549733043306309, + "step": 1067, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.10549733043306309, + "step": 1067, + "train/total_loss": 0.24497869610786438 + }, + { + "entropy": 9.436140060424805, + "epoch": 0.1055962032825786, + "mean_token_accuracy": 0.7049382925033569, + "num_tokens": 5564461.0, + "step": 1068, + "train/ce_loss": 0.7769739031791687 + }, + { + "epoch": 0.1055962032825786, + "step": 1068, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.1055962032825786, + "step": 1068, + "train/total_loss": 0.14019739627838135 + }, + { + "entropy": 9.738252639770508, + "epoch": 0.10569507613209413, + "mean_token_accuracy": 0.716586172580719, + "num_tokens": 5569521.0, + "step": 1069, + "train/ce_loss": 0.6802010536193848 + }, + { + "epoch": 0.10569507613209413, + "step": 1069, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.10569507613209413, + "step": 1069, + "train/total_loss": 0.18911385536193848 + }, + { + "entropy": 9.438884735107422, + "epoch": 0.10579394898160965, + "mean_token_accuracy": 0.6901004314422607, + "num_tokens": 5574704.0, + "step": 1070, + "train/ce_loss": 1.1930865049362183 + }, + { + "epoch": 0.10579394898160965, + "step": 1070, + "train/sim_loss": 0.171875 + }, + { + "epoch": 0.10579394898160965, + "step": 1070, + "train/total_loss": 0.2911836504936218 + }, + { + "entropy": 8.921512603759766, + "epoch": 0.10589282183112517, + "mean_token_accuracy": 0.6904761791229248, + "num_tokens": 5580112.0, + "step": 1071, + "train/ce_loss": 0.835815966129303 + }, + { + "epoch": 0.10589282183112517, + "step": 1071, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.10589282183112517, + "step": 1071, + "train/total_loss": 0.1656128466129303 + }, + { + "entropy": 10.20355224609375, + "epoch": 0.1059916946806407, + "mean_token_accuracy": 0.6859122514724731, + "num_tokens": 5584972.0, + "step": 1072, + "train/ce_loss": 3.075763743254356e-05 + }, + { + "epoch": 0.1059916946806407, + "step": 1072, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.1059916946806407, + "step": 1072, + "train/total_loss": 0.07812807708978653 + }, + { + "entropy": 9.451912879943848, + "epoch": 0.10609056753015622, + "mean_token_accuracy": 0.7582260370254517, + "num_tokens": 5590117.0, + "step": 1073, + "train/ce_loss": 0.6548001170158386 + }, + { + "epoch": 0.10609056753015622, + "step": 1073, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.10609056753015622, + "step": 1073, + "train/total_loss": 0.11626126617193222 + }, + { + "entropy": 10.318904876708984, + "epoch": 0.10618944037967175, + "mean_token_accuracy": 0.716292142868042, + "num_tokens": 5594851.0, + "step": 1074, + "train/ce_loss": 2.4563345909118652 + }, + { + "epoch": 0.10618944037967175, + "step": 1074, + "train/sim_loss": 0.171875 + }, + { + "epoch": 0.10618944037967175, + "step": 1074, + "train/total_loss": 0.41750848293304443 + }, + { + "entropy": 9.271596908569336, + "epoch": 0.10628831322918726, + "mean_token_accuracy": 0.7515225410461426, + "num_tokens": 5600196.0, + "step": 1075, + "train/ce_loss": 0.5061623454093933 + }, + { + "epoch": 0.10628831322918726, + "step": 1075, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.10628831322918726, + "step": 1075, + "train/total_loss": 0.14436623454093933 + }, + { + "entropy": 10.099292755126953, + "epoch": 0.10638718607870279, + "mean_token_accuracy": 0.7397260069847107, + "num_tokens": 5605059.0, + "step": 1076, + "train/ce_loss": 1.4282630681991577 + }, + { + "epoch": 0.10638718607870279, + "step": 1076, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.10638718607870279, + "step": 1076, + "train/total_loss": 0.21313880383968353 + }, + { + "entropy": 9.397590637207031, + "epoch": 0.10648605892821832, + "mean_token_accuracy": 0.7211055159568787, + "num_tokens": 5610301.0, + "step": 1077, + "train/ce_loss": 1.4666411876678467 + }, + { + "epoch": 0.10648605892821832, + "step": 1077, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.10648605892821832, + "step": 1077, + "train/total_loss": 0.2599453926086426 + }, + { + "entropy": 9.761058807373047, + "epoch": 0.10658493177773383, + "mean_token_accuracy": 0.8031495809555054, + "num_tokens": 5615385.0, + "step": 1078, + "train/ce_loss": 2.565521572250873e-05 + }, + { + "epoch": 0.10658493177773383, + "step": 1078, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.10658493177773383, + "step": 1078, + "train/total_loss": 0.07031506299972534 + }, + { + "entropy": 9.661194801330566, + "epoch": 0.10668380462724936, + "mean_token_accuracy": 0.7828054428100586, + "num_tokens": 5620426.0, + "step": 1079, + "train/ce_loss": 0.5483853816986084 + }, + { + "epoch": 0.10668380462724936, + "step": 1079, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.10668380462724936, + "step": 1079, + "train/total_loss": 0.11343228816986084 + }, + { + "epoch": 0.10678267747676488, + "grad_norm": 0.8281675577163696, + "learning_rate": 9.735696978687634e-06, + "loss": 0.1665, + "step": 1080 + }, + { + "entropy": 9.907204627990723, + "epoch": 0.10678267747676488, + "mean_token_accuracy": 0.8071428537368774, + "num_tokens": 5625303.0, + "step": 1080, + "train/ce_loss": 0.878250002861023 + }, + { + "epoch": 0.10678267747676488, + "step": 1080, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.10678267747676488, + "step": 1080, + "train/total_loss": 0.1425125002861023 + }, + { + "entropy": 9.570771217346191, + "epoch": 0.1068815503262804, + "mean_token_accuracy": 0.7329462766647339, + "num_tokens": 5630459.0, + "step": 1081, + "train/ce_loss": 0.875616192817688 + }, + { + "epoch": 0.1068815503262804, + "step": 1081, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.1068815503262804, + "step": 1081, + "train/total_loss": 0.18521787226200104 + }, + { + "entropy": 9.296771049499512, + "epoch": 0.10698042317579592, + "mean_token_accuracy": 0.7612107396125793, + "num_tokens": 5635751.0, + "step": 1082, + "train/ce_loss": 1.1663848161697388 + }, + { + "epoch": 0.10698042317579592, + "step": 1082, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.10698042317579592, + "step": 1082, + "train/total_loss": 0.20257598161697388 + }, + { + "entropy": 9.091501235961914, + "epoch": 0.10707929602531145, + "mean_token_accuracy": 0.7034631967544556, + "num_tokens": 5641138.0, + "step": 1083, + "train/ce_loss": 1.280279278755188 + }, + { + "epoch": 0.10707929602531145, + "step": 1083, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.10707929602531145, + "step": 1083, + "train/total_loss": 0.23349668085575104 + }, + { + "entropy": 9.119293212890625, + "epoch": 0.10717816887482698, + "mean_token_accuracy": 0.7681970596313477, + "num_tokens": 5646516.0, + "step": 1084, + "train/ce_loss": 0.7127261757850647 + }, + { + "epoch": 0.10717816887482698, + "step": 1084, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.10717816887482698, + "step": 1084, + "train/total_loss": 0.145491361618042 + }, + { + "entropy": 9.799027442932129, + "epoch": 0.10727704172434249, + "mean_token_accuracy": 0.6939102411270142, + "num_tokens": 5651551.0, + "step": 1085, + "train/ce_loss": 1.602504014968872 + }, + { + "epoch": 0.10727704172434249, + "step": 1085, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.10727704172434249, + "step": 1085, + "train/total_loss": 0.2500941753387451 + }, + { + "entropy": 9.298020362854004, + "epoch": 0.10737591457385802, + "mean_token_accuracy": 0.6997166872024536, + "num_tokens": 5656746.0, + "step": 1086, + "train/ce_loss": 1.2978819608688354 + }, + { + "epoch": 0.10737591457385802, + "step": 1086, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.10737591457385802, + "step": 1086, + "train/total_loss": 0.21963195502758026 + }, + { + "entropy": 9.827201843261719, + "epoch": 0.10747478742337355, + "mean_token_accuracy": 0.771019697189331, + "num_tokens": 5661765.0, + "step": 1087, + "train/ce_loss": 0.9341949224472046 + }, + { + "epoch": 0.10747478742337355, + "step": 1087, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.10747478742337355, + "step": 1087, + "train/total_loss": 0.14810699224472046 + }, + { + "entropy": 9.570361137390137, + "epoch": 0.10757366027288906, + "mean_token_accuracy": 0.7847533822059631, + "num_tokens": 5666913.0, + "step": 1088, + "train/ce_loss": 4.315694241086021e-05 + }, + { + "epoch": 0.10757366027288906, + "step": 1088, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.10757366027288906, + "step": 1088, + "train/total_loss": 0.08594181388616562 + }, + { + "entropy": 9.161052703857422, + "epoch": 0.10767253312240459, + "mean_token_accuracy": 0.7218044996261597, + "num_tokens": 5672177.0, + "step": 1089, + "train/ce_loss": 0.7218360304832458 + }, + { + "epoch": 0.10767253312240459, + "step": 1089, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.10767253312240459, + "step": 1089, + "train/total_loss": 0.13468360900878906 + }, + { + "entropy": 9.451432228088379, + "epoch": 0.10777140597192011, + "mean_token_accuracy": 0.6954177618026733, + "num_tokens": 5677299.0, + "step": 1090, + "train/ce_loss": 0.6777337193489075 + }, + { + "epoch": 0.10777140597192011, + "step": 1090, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.10777140597192011, + "step": 1090, + "train/total_loss": 0.13027337193489075 + }, + { + "entropy": 9.760773658752441, + "epoch": 0.10787027882143563, + "mean_token_accuracy": 0.746691882610321, + "num_tokens": 5682235.0, + "step": 1091, + "train/ce_loss": 1.132237434387207 + }, + { + "epoch": 0.10787027882143563, + "step": 1091, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.10787027882143563, + "step": 1091, + "train/total_loss": 0.19134874641895294 + }, + { + "entropy": 9.321340560913086, + "epoch": 0.10796915167095116, + "mean_token_accuracy": 0.7540760636329651, + "num_tokens": 5687477.0, + "step": 1092, + "train/ce_loss": 0.7502046823501587 + }, + { + "epoch": 0.10796915167095116, + "step": 1092, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.10796915167095116, + "step": 1092, + "train/total_loss": 0.1804892122745514 + }, + { + "entropy": 9.607926368713379, + "epoch": 0.10806802452046668, + "mean_token_accuracy": 0.6939597129821777, + "num_tokens": 5692664.0, + "step": 1093, + "train/ce_loss": 1.2573391199111938 + }, + { + "epoch": 0.10806802452046668, + "step": 1093, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.10806802452046668, + "step": 1093, + "train/total_loss": 0.24292141199111938 + }, + { + "entropy": 9.303705215454102, + "epoch": 0.10816689736998221, + "mean_token_accuracy": 0.7413554787635803, + "num_tokens": 5697887.0, + "step": 1094, + "train/ce_loss": 0.9573739171028137 + }, + { + "epoch": 0.10816689736998221, + "step": 1094, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.10816689736998221, + "step": 1094, + "train/total_loss": 0.18167489767074585 + }, + { + "entropy": 9.43138313293457, + "epoch": 0.10826577021949772, + "mean_token_accuracy": 0.714102566242218, + "num_tokens": 5703140.0, + "step": 1095, + "train/ce_loss": 0.4888365864753723 + }, + { + "epoch": 0.10826577021949772, + "step": 1095, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.10826577021949772, + "step": 1095, + "train/total_loss": 0.10747741162776947 + }, + { + "entropy": 9.34381103515625, + "epoch": 0.10836464306901325, + "mean_token_accuracy": 0.7085561752319336, + "num_tokens": 5708319.0, + "step": 1096, + "train/ce_loss": 1.3514018064597622e-05 + }, + { + "epoch": 0.10836464306901325, + "step": 1096, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.10836464306901325, + "step": 1096, + "train/total_loss": 0.10937634855508804 + }, + { + "entropy": 9.446690559387207, + "epoch": 0.10846351591852878, + "mean_token_accuracy": 0.7742817997932434, + "num_tokens": 5713462.0, + "step": 1097, + "train/ce_loss": 3.5229713830631226e-05 + }, + { + "epoch": 0.10846351591852878, + "step": 1097, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.10846351591852878, + "step": 1097, + "train/total_loss": 0.08984727412462234 + }, + { + "entropy": 9.997285842895508, + "epoch": 0.10856238876804429, + "mean_token_accuracy": 0.8060606122016907, + "num_tokens": 5718357.0, + "step": 1098, + "train/ce_loss": 2.3086209694156423e-05 + }, + { + "epoch": 0.10856238876804429, + "step": 1098, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.10856238876804429, + "step": 1098, + "train/total_loss": 0.042971059679985046 + }, + { + "entropy": 9.321104049682617, + "epoch": 0.10866126161755982, + "mean_token_accuracy": 0.6739409565925598, + "num_tokens": 5723572.0, + "step": 1099, + "train/ce_loss": 1.665620038693305e-05 + }, + { + "epoch": 0.10866126161755982, + "step": 1099, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.10866126161755982, + "step": 1099, + "train/total_loss": 0.08984541893005371 + }, + { + "epoch": 0.10876013446707535, + "grad_norm": 1.111937403678894, + "learning_rate": 9.730752113929684e-06, + "loss": 0.1699, + "step": 1100 + }, + { + "entropy": 8.92409896850586, + "epoch": 0.10876013446707535, + "mean_token_accuracy": 0.6962025165557861, + "num_tokens": 5729117.0, + "step": 1100, + "train/ce_loss": 0.6997928619384766 + }, + { + "epoch": 0.10876013446707535, + "step": 1100, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.10876013446707535, + "step": 1100, + "train/total_loss": 0.19497928023338318 + }, + { + "entropy": 9.248737335205078, + "epoch": 0.10885900731659086, + "mean_token_accuracy": 0.7200474739074707, + "num_tokens": 5734411.0, + "step": 1101, + "train/ce_loss": 1.9167568683624268 + }, + { + "epoch": 0.10885900731659086, + "step": 1101, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.10885900731659086, + "step": 1101, + "train/total_loss": 0.33230069279670715 + }, + { + "entropy": 9.988884925842285, + "epoch": 0.10895788016610639, + "mean_token_accuracy": 0.698113203048706, + "num_tokens": 5739384.0, + "step": 1102, + "train/ce_loss": 4.440023985807784e-05 + }, + { + "epoch": 0.10895788016610639, + "step": 1102, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.10895788016610639, + "step": 1102, + "train/total_loss": 0.08984819054603577 + }, + { + "entropy": 9.443476676940918, + "epoch": 0.10905675301562191, + "mean_token_accuracy": 0.7716216444969177, + "num_tokens": 5744567.0, + "step": 1103, + "train/ce_loss": 5.284923463477753e-05 + }, + { + "epoch": 0.10905675301562191, + "step": 1103, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.10905675301562191, + "step": 1103, + "train/total_loss": 0.10547403246164322 + }, + { + "entropy": 9.307315826416016, + "epoch": 0.10915562586513744, + "mean_token_accuracy": 0.7219387888908386, + "num_tokens": 5749839.0, + "step": 1104, + "train/ce_loss": 0.724831223487854 + }, + { + "epoch": 0.10915562586513744, + "step": 1104, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.10915562586513744, + "step": 1104, + "train/total_loss": 0.1388893723487854 + }, + { + "entropy": 9.767234802246094, + "epoch": 0.10925449871465295, + "mean_token_accuracy": 0.7443868517875671, + "num_tokens": 5754879.0, + "step": 1105, + "train/ce_loss": 0.9078450202941895 + }, + { + "epoch": 0.10925449871465295, + "step": 1105, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.10925449871465295, + "step": 1105, + "train/total_loss": 0.17672200500965118 + }, + { + "entropy": 9.019737243652344, + "epoch": 0.10935337156416848, + "mean_token_accuracy": 0.7746341228485107, + "num_tokens": 5760364.0, + "step": 1106, + "train/ce_loss": 0.6363298892974854 + }, + { + "epoch": 0.10935337156416848, + "step": 1106, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.10935337156416848, + "step": 1106, + "train/total_loss": 0.126132994890213 + }, + { + "entropy": 9.645170211791992, + "epoch": 0.10945224441368401, + "mean_token_accuracy": 0.7149681448936462, + "num_tokens": 5765419.0, + "step": 1107, + "train/ce_loss": 1.4607880115509033 + }, + { + "epoch": 0.10945224441368401, + "step": 1107, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.10945224441368401, + "step": 1107, + "train/total_loss": 0.25154757499694824 + }, + { + "entropy": 9.698060989379883, + "epoch": 0.10955111726319952, + "mean_token_accuracy": 0.6975036859512329, + "num_tokens": 5770503.0, + "step": 1108, + "train/ce_loss": 1.2893515825271606 + }, + { + "epoch": 0.10955111726319952, + "step": 1108, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.10955111726319952, + "step": 1108, + "train/total_loss": 0.25784140825271606 + }, + { + "entropy": 9.17084789276123, + "epoch": 0.10964999011271505, + "mean_token_accuracy": 0.743534505367279, + "num_tokens": 5775872.0, + "step": 1109, + "train/ce_loss": 0.7339786887168884 + }, + { + "epoch": 0.10964999011271505, + "step": 1109, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.10964999011271505, + "step": 1109, + "train/total_loss": 0.16714787483215332 + }, + { + "entropy": 9.010650634765625, + "epoch": 0.10974886296223058, + "mean_token_accuracy": 0.777063250541687, + "num_tokens": 5781283.0, + "step": 1110, + "train/ce_loss": 0.5257327556610107 + }, + { + "epoch": 0.10974886296223058, + "step": 1110, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.10974886296223058, + "step": 1110, + "train/total_loss": 0.12288577854633331 + }, + { + "entropy": 8.9091796875, + "epoch": 0.10984773581174609, + "mean_token_accuracy": 0.7399380803108215, + "num_tokens": 5786765.0, + "step": 1111, + "train/ce_loss": 1.0092471837997437 + }, + { + "epoch": 0.10984773581174609, + "step": 1111, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.10984773581174609, + "step": 1111, + "train/total_loss": 0.23373723030090332 + }, + { + "entropy": 9.311386108398438, + "epoch": 0.10994660866126162, + "mean_token_accuracy": 0.7463087439537048, + "num_tokens": 5792003.0, + "step": 1112, + "train/ce_loss": 1.1881719827651978 + }, + { + "epoch": 0.10994660866126162, + "step": 1112, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.10994660866126162, + "step": 1112, + "train/total_loss": 0.23600471019744873 + }, + { + "entropy": 9.483650207519531, + "epoch": 0.11004548151077714, + "mean_token_accuracy": 0.6721556782722473, + "num_tokens": 5797096.0, + "step": 1113, + "train/ce_loss": 0.7234494090080261 + }, + { + "epoch": 0.11004548151077714, + "step": 1113, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.11004548151077714, + "step": 1113, + "train/total_loss": 0.18171994388103485 + }, + { + "entropy": 9.235286712646484, + "epoch": 0.11014435436029267, + "mean_token_accuracy": 0.7658593058586121, + "num_tokens": 5802446.0, + "step": 1114, + "train/ce_loss": 0.5792244672775269 + }, + { + "epoch": 0.11014435436029267, + "step": 1114, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.11014435436029267, + "step": 1114, + "train/total_loss": 0.18292245268821716 + }, + { + "entropy": 9.276355743408203, + "epoch": 0.11024322720980818, + "mean_token_accuracy": 0.7150062918663025, + "num_tokens": 5807718.0, + "step": 1115, + "train/ce_loss": 0.9252979159355164 + }, + { + "epoch": 0.11024322720980818, + "step": 1115, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.11024322720980818, + "step": 1115, + "train/total_loss": 0.1667485535144806 + }, + { + "entropy": 9.39252758026123, + "epoch": 0.11034210005932371, + "mean_token_accuracy": 0.6945288777351379, + "num_tokens": 5812826.0, + "step": 1116, + "train/ce_loss": 1.2116296291351318 + }, + { + "epoch": 0.11034210005932371, + "step": 1116, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.11034210005932371, + "step": 1116, + "train/total_loss": 0.17194421589374542 + }, + { + "entropy": 9.744169235229492, + "epoch": 0.11044097290883924, + "mean_token_accuracy": 0.7659574747085571, + "num_tokens": 5817847.0, + "step": 1117, + "train/ce_loss": 0.00010859971371246502 + }, + { + "epoch": 0.11044097290883924, + "step": 1117, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.11044097290883924, + "step": 1117, + "train/total_loss": 0.12501086294651031 + }, + { + "entropy": 8.795273780822754, + "epoch": 0.11053984575835475, + "mean_token_accuracy": 0.7181664109230042, + "num_tokens": 5823707.0, + "step": 1118, + "train/ce_loss": 1.1598750352859497 + }, + { + "epoch": 0.11053984575835475, + "step": 1118, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.11053984575835475, + "step": 1118, + "train/total_loss": 0.20583125948905945 + }, + { + "entropy": 9.927274703979492, + "epoch": 0.11063871860787028, + "mean_token_accuracy": 0.6340996026992798, + "num_tokens": 5828644.0, + "step": 1119, + "train/ce_loss": 0.0007295234245248139 + }, + { + "epoch": 0.11063871860787028, + "step": 1119, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.11063871860787028, + "step": 1119, + "train/total_loss": 0.06647919863462448 + }, + { + "epoch": 0.1107375914573858, + "grad_norm": 1.2310987710952759, + "learning_rate": 9.725807249171736e-06, + "loss": 0.1763, + "step": 1120 + }, + { + "entropy": 9.093071937561035, + "epoch": 0.1107375914573858, + "mean_token_accuracy": 0.7035794258117676, + "num_tokens": 5834016.0, + "step": 1120, + "train/ce_loss": 0.984878420829773 + }, + { + "epoch": 0.1107375914573858, + "step": 1120, + "train/sim_loss": 0.1640625 + }, + { + "epoch": 0.1107375914573858, + "step": 1120, + "train/total_loss": 0.26255035400390625 + }, + { + "entropy": 9.372610092163086, + "epoch": 0.11083646430690132, + "mean_token_accuracy": 0.767175555229187, + "num_tokens": 5839414.0, + "step": 1121, + "train/ce_loss": 0.6642642617225647 + }, + { + "epoch": 0.11083646430690132, + "step": 1121, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.11083646430690132, + "step": 1121, + "train/total_loss": 0.16017642617225647 + }, + { + "entropy": 9.518269538879395, + "epoch": 0.11093533715641685, + "mean_token_accuracy": 0.7764227390289307, + "num_tokens": 5844801.0, + "step": 1122, + "train/ce_loss": 1.3010786771774292 + }, + { + "epoch": 0.11093533715641685, + "step": 1122, + "train/sim_loss": 0.14453125 + }, + { + "epoch": 0.11093533715641685, + "step": 1122, + "train/total_loss": 0.2746391296386719 + }, + { + "entropy": 9.283157348632812, + "epoch": 0.11103421000593237, + "mean_token_accuracy": 0.728380024433136, + "num_tokens": 5850116.0, + "step": 1123, + "train/ce_loss": 1.0420175790786743 + }, + { + "epoch": 0.11103421000593237, + "step": 1123, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.11103421000593237, + "step": 1123, + "train/total_loss": 0.1901392638683319 + }, + { + "entropy": 9.071952819824219, + "epoch": 0.11113308285544789, + "mean_token_accuracy": 0.7816537618637085, + "num_tokens": 5855276.0, + "step": 1124, + "train/ce_loss": 0.6013221144676208 + }, + { + "epoch": 0.11113308285544789, + "step": 1124, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.11113308285544789, + "step": 1124, + "train/total_loss": 0.1187259629368782 + }, + { + "entropy": 9.901144981384277, + "epoch": 0.11123195570496341, + "mean_token_accuracy": 0.7330754399299622, + "num_tokens": 5860228.0, + "step": 1125, + "train/ce_loss": 1.0996679067611694 + }, + { + "epoch": 0.11123195570496341, + "step": 1125, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.11123195570496341, + "step": 1125, + "train/total_loss": 0.16465428471565247 + }, + { + "entropy": 9.006317138671875, + "epoch": 0.11133082855447894, + "mean_token_accuracy": 0.7103825211524963, + "num_tokens": 5865653.0, + "step": 1126, + "train/ce_loss": 0.7401002049446106 + }, + { + "epoch": 0.11133082855447894, + "step": 1126, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.11133082855447894, + "step": 1126, + "train/total_loss": 0.18338501453399658 + }, + { + "entropy": 9.19222640991211, + "epoch": 0.11142970140399447, + "mean_token_accuracy": 0.6580976843833923, + "num_tokens": 5870913.0, + "step": 1127, + "train/ce_loss": 4.0744573198026046e-05 + }, + { + "epoch": 0.11142970140399447, + "step": 1127, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.11142970140399447, + "step": 1127, + "train/total_loss": 0.08594157546758652 + }, + { + "entropy": 9.887141227722168, + "epoch": 0.11152857425350998, + "mean_token_accuracy": 0.7782427072525024, + "num_tokens": 5875820.0, + "step": 1128, + "train/ce_loss": 1.148012638092041 + }, + { + "epoch": 0.11152857425350998, + "step": 1128, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.11152857425350998, + "step": 1128, + "train/total_loss": 0.22808250784873962 + }, + { + "entropy": 9.06885814666748, + "epoch": 0.11162744710302551, + "mean_token_accuracy": 0.6842105388641357, + "num_tokens": 5881218.0, + "step": 1129, + "train/ce_loss": 1.2860099077224731 + }, + { + "epoch": 0.11162744710302551, + "step": 1129, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.11162744710302551, + "step": 1129, + "train/total_loss": 0.22235099971294403 + }, + { + "entropy": 9.067543029785156, + "epoch": 0.11172631995254104, + "mean_token_accuracy": 0.7619631886482239, + "num_tokens": 5886545.0, + "step": 1130, + "train/ce_loss": 0.9816094636917114 + }, + { + "epoch": 0.11172631995254104, + "step": 1130, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.11172631995254104, + "step": 1130, + "train/total_loss": 0.13331720232963562 + }, + { + "entropy": 9.337303161621094, + "epoch": 0.11182519280205655, + "mean_token_accuracy": 0.7229336500167847, + "num_tokens": 5892028.0, + "step": 1131, + "train/ce_loss": 1.4003444910049438 + }, + { + "epoch": 0.11182519280205655, + "step": 1131, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.11182519280205655, + "step": 1131, + "train/total_loss": 0.23378445208072662 + }, + { + "entropy": 9.111893653869629, + "epoch": 0.11192406565157208, + "mean_token_accuracy": 0.7122060656547546, + "num_tokens": 5897437.0, + "step": 1132, + "train/ce_loss": 0.9091646671295166 + }, + { + "epoch": 0.11192406565157208, + "step": 1132, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.11192406565157208, + "step": 1132, + "train/total_loss": 0.1885727196931839 + }, + { + "entropy": 9.357961654663086, + "epoch": 0.1120229385010876, + "mean_token_accuracy": 0.7841823101043701, + "num_tokens": 5902671.0, + "step": 1133, + "train/ce_loss": 1.0447713136672974 + }, + { + "epoch": 0.1120229385010876, + "step": 1133, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.1120229385010876, + "step": 1133, + "train/total_loss": 0.1591646373271942 + }, + { + "entropy": 9.133411407470703, + "epoch": 0.11212181135060312, + "mean_token_accuracy": 0.7156652212142944, + "num_tokens": 5908128.0, + "step": 1134, + "train/ce_loss": 1.4204862117767334 + }, + { + "epoch": 0.11212181135060312, + "step": 1134, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.11212181135060312, + "step": 1134, + "train/total_loss": 0.24751737713813782 + }, + { + "entropy": 10.052787780761719, + "epoch": 0.11222068420011864, + "mean_token_accuracy": 0.7367256879806519, + "num_tokens": 5912934.0, + "step": 1135, + "train/ce_loss": 2.2169891963130794e-05 + }, + { + "epoch": 0.11222068420011864, + "step": 1135, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.11222068420011864, + "step": 1135, + "train/total_loss": 0.031252216547727585 + }, + { + "entropy": 8.959535598754883, + "epoch": 0.11231955704963417, + "mean_token_accuracy": 0.709452748298645, + "num_tokens": 5918441.0, + "step": 1136, + "train/ce_loss": 0.8237015604972839 + }, + { + "epoch": 0.11231955704963417, + "step": 1136, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.11231955704963417, + "step": 1136, + "train/total_loss": 0.13315141201019287 + }, + { + "entropy": 9.312917709350586, + "epoch": 0.1124184298991497, + "mean_token_accuracy": 0.7187864780426025, + "num_tokens": 5923747.0, + "step": 1137, + "train/ce_loss": 1.4374332427978516 + }, + { + "epoch": 0.1124184298991497, + "step": 1137, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.1124184298991497, + "step": 1137, + "train/total_loss": 0.23749332129955292 + }, + { + "entropy": 9.914706230163574, + "epoch": 0.11251730274866521, + "mean_token_accuracy": 0.76106196641922, + "num_tokens": 5928933.0, + "step": 1138, + "train/ce_loss": 5.610586595139466e-05 + }, + { + "epoch": 0.11251730274866521, + "step": 1138, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.11251730274866521, + "step": 1138, + "train/total_loss": 0.07031811028718948 + }, + { + "entropy": 9.102922439575195, + "epoch": 0.11261617559818074, + "mean_token_accuracy": 0.7156862616539001, + "num_tokens": 5934184.0, + "step": 1139, + "train/ce_loss": 0.8545843362808228 + }, + { + "epoch": 0.11261617559818074, + "step": 1139, + "train/sim_loss": 0.1640625 + }, + { + "epoch": 0.11261617559818074, + "step": 1139, + "train/total_loss": 0.2495209276676178 + }, + { + "epoch": 0.11271504844769627, + "grad_norm": 0.9209204912185669, + "learning_rate": 9.720862384413787e-06, + "loss": 0.178, + "step": 1140 + }, + { + "entropy": 9.652783393859863, + "epoch": 0.11271504844769627, + "mean_token_accuracy": 0.7483333349227905, + "num_tokens": 5939261.0, + "step": 1140, + "train/ce_loss": 0.00013310209033079445 + }, + { + "epoch": 0.11271504844769627, + "step": 1140, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.11271504844769627, + "step": 1140, + "train/total_loss": 0.031263310462236404 + }, + { + "entropy": 10.0499849319458, + "epoch": 0.11281392129721178, + "mean_token_accuracy": 0.7789757251739502, + "num_tokens": 5944050.0, + "step": 1141, + "train/ce_loss": 0.001149828196503222 + }, + { + "epoch": 0.11281392129721178, + "step": 1141, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.11281392129721178, + "step": 1141, + "train/total_loss": 0.09777123481035233 + }, + { + "entropy": 9.451024055480957, + "epoch": 0.11291279414672731, + "mean_token_accuracy": 0.746268630027771, + "num_tokens": 5949252.0, + "step": 1142, + "train/ce_loss": 0.7472849488258362 + }, + { + "epoch": 0.11291279414672731, + "step": 1142, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.11291279414672731, + "step": 1142, + "train/total_loss": 0.16847848892211914 + }, + { + "entropy": 9.157025337219238, + "epoch": 0.11301166699624283, + "mean_token_accuracy": 0.686956524848938, + "num_tokens": 5954442.0, + "step": 1143, + "train/ce_loss": 1.2585369348526 + }, + { + "epoch": 0.11301166699624283, + "step": 1143, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.11301166699624283, + "step": 1143, + "train/total_loss": 0.2664787173271179 + }, + { + "entropy": 8.705760955810547, + "epoch": 0.11311053984575835, + "mean_token_accuracy": 0.727192223072052, + "num_tokens": 5960065.0, + "step": 1144, + "train/ce_loss": 0.6710728406906128 + }, + { + "epoch": 0.11311053984575835, + "step": 1144, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.11311053984575835, + "step": 1144, + "train/total_loss": 0.14523229002952576 + }, + { + "entropy": 9.306068420410156, + "epoch": 0.11320941269527388, + "mean_token_accuracy": 0.7318500876426697, + "num_tokens": 5965508.0, + "step": 1145, + "train/ce_loss": 1.1127384901046753 + }, + { + "epoch": 0.11320941269527388, + "step": 1145, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.11320941269527388, + "step": 1145, + "train/total_loss": 0.212836354970932 + }, + { + "entropy": 9.491476058959961, + "epoch": 0.1133082855447894, + "mean_token_accuracy": 0.7203947305679321, + "num_tokens": 5970603.0, + "step": 1146, + "train/ce_loss": 1.5402119970531203e-05 + }, + { + "epoch": 0.1133082855447894, + "step": 1146, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.1133082855447894, + "step": 1146, + "train/total_loss": 0.08203279227018356 + }, + { + "entropy": 8.89062786102295, + "epoch": 0.11340715839430493, + "mean_token_accuracy": 0.6906552314758301, + "num_tokens": 5976012.0, + "step": 1147, + "train/ce_loss": 1.2451964616775513 + }, + { + "epoch": 0.11340715839430493, + "step": 1147, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.11340715839430493, + "step": 1147, + "train/total_loss": 0.23389464616775513 + }, + { + "entropy": 8.958955764770508, + "epoch": 0.11350603124382044, + "mean_token_accuracy": 0.746051013469696, + "num_tokens": 5981384.0, + "step": 1148, + "train/ce_loss": 0.6065599918365479 + }, + { + "epoch": 0.11350603124382044, + "step": 1148, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.11350603124382044, + "step": 1148, + "train/total_loss": 0.14659349620342255 + }, + { + "entropy": 9.490360260009766, + "epoch": 0.11360490409333597, + "mean_token_accuracy": 0.802431583404541, + "num_tokens": 5986525.0, + "step": 1149, + "train/ce_loss": 1.0267833471298218 + }, + { + "epoch": 0.11360490409333597, + "step": 1149, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.11360490409333597, + "step": 1149, + "train/total_loss": 0.1534595787525177 + }, + { + "entropy": 10.175813674926758, + "epoch": 0.1137037769428515, + "mean_token_accuracy": 0.7017543911933899, + "num_tokens": 5991269.0, + "step": 1150, + "train/ce_loss": 1.901901364326477 + }, + { + "epoch": 0.1137037769428515, + "step": 1150, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.1137037769428515, + "step": 1150, + "train/total_loss": 0.2605026364326477 + }, + { + "entropy": 8.89851188659668, + "epoch": 0.11380264979236701, + "mean_token_accuracy": 0.7866028547286987, + "num_tokens": 5996778.0, + "step": 1151, + "train/ce_loss": 0.6228799223899841 + }, + { + "epoch": 0.11380264979236701, + "step": 1151, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.11380264979236701, + "step": 1151, + "train/total_loss": 0.14041298627853394 + }, + { + "entropy": 9.506275177001953, + "epoch": 0.11390152264188254, + "mean_token_accuracy": 0.776068389415741, + "num_tokens": 6001841.0, + "step": 1152, + "train/ce_loss": 1.0933926105499268 + }, + { + "epoch": 0.11390152264188254, + "step": 1152, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.11390152264188254, + "step": 1152, + "train/total_loss": 0.22262051701545715 + }, + { + "entropy": 9.191484451293945, + "epoch": 0.11400039549139807, + "mean_token_accuracy": 0.762326180934906, + "num_tokens": 6007095.0, + "step": 1153, + "train/ce_loss": 0.8622652292251587 + }, + { + "epoch": 0.11400039549139807, + "step": 1153, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.11400039549139807, + "step": 1153, + "train/total_loss": 0.14872652292251587 + }, + { + "entropy": 9.810675621032715, + "epoch": 0.11409926834091358, + "mean_token_accuracy": 0.7382671236991882, + "num_tokens": 6012059.0, + "step": 1154, + "train/ce_loss": 0.6489583253860474 + }, + { + "epoch": 0.11409926834091358, + "step": 1154, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.11409926834091358, + "step": 1154, + "train/total_loss": 0.1352083384990692 + }, + { + "entropy": 9.498499870300293, + "epoch": 0.1141981411904291, + "mean_token_accuracy": 0.7010723948478699, + "num_tokens": 6017238.0, + "step": 1155, + "train/ce_loss": 1.6045225858688354 + }, + { + "epoch": 0.1141981411904291, + "step": 1155, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.1141981411904291, + "step": 1155, + "train/total_loss": 0.2776397466659546 + }, + { + "entropy": 9.012279510498047, + "epoch": 0.11429701403994463, + "mean_token_accuracy": 0.7267683744430542, + "num_tokens": 6022452.0, + "step": 1156, + "train/ce_loss": 1.3016753196716309 + }, + { + "epoch": 0.11429701403994463, + "step": 1156, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.11429701403994463, + "step": 1156, + "train/total_loss": 0.25126129388809204 + }, + { + "entropy": 9.472189903259277, + "epoch": 0.11439588688946016, + "mean_token_accuracy": 0.7290779948234558, + "num_tokens": 6027586.0, + "step": 1157, + "train/ce_loss": 1.8474470376968384 + }, + { + "epoch": 0.11439588688946016, + "step": 1157, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.11439588688946016, + "step": 1157, + "train/total_loss": 0.2902134656906128 + }, + { + "entropy": 9.285955429077148, + "epoch": 0.11449475973897567, + "mean_token_accuracy": 0.7551020383834839, + "num_tokens": 6032785.0, + "step": 1158, + "train/ce_loss": 0.9319847226142883 + }, + { + "epoch": 0.11449475973897567, + "step": 1158, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.11449475973897567, + "step": 1158, + "train/total_loss": 0.1635109782218933 + }, + { + "entropy": 8.998316764831543, + "epoch": 0.1145936325884912, + "mean_token_accuracy": 0.7122736573219299, + "num_tokens": 6038253.0, + "step": 1159, + "train/ce_loss": 0.7804404497146606 + }, + { + "epoch": 0.1145936325884912, + "step": 1159, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.1145936325884912, + "step": 1159, + "train/total_loss": 0.15616905689239502 + }, + { + "epoch": 0.11469250543800673, + "grad_norm": 1.0438960790634155, + "learning_rate": 9.715917519655839e-06, + "loss": 0.1692, + "step": 1160 + }, + { + "entropy": 9.792644500732422, + "epoch": 0.11469250543800673, + "mean_token_accuracy": 0.7573657035827637, + "num_tokens": 6043287.0, + "step": 1160, + "train/ce_loss": 0.9914917945861816 + }, + { + "epoch": 0.11469250543800673, + "step": 1160, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.11469250543800673, + "step": 1160, + "train/total_loss": 0.1655554324388504 + }, + { + "entropy": 9.431596755981445, + "epoch": 0.11479137828752224, + "mean_token_accuracy": 0.6828829050064087, + "num_tokens": 6048297.0, + "step": 1161, + "train/ce_loss": 1.0226948261260986 + }, + { + "epoch": 0.11479137828752224, + "step": 1161, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.11479137828752224, + "step": 1161, + "train/total_loss": 0.2155507355928421 + }, + { + "entropy": 9.102705001831055, + "epoch": 0.11489025113703777, + "mean_token_accuracy": 0.755630612373352, + "num_tokens": 6053670.0, + "step": 1162, + "train/ce_loss": 0.7495688796043396 + }, + { + "epoch": 0.11489025113703777, + "step": 1162, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.11489025113703777, + "step": 1162, + "train/total_loss": 0.19605064392089844 + }, + { + "entropy": 9.561321258544922, + "epoch": 0.1149891239865533, + "mean_token_accuracy": 0.7164804339408875, + "num_tokens": 6058813.0, + "step": 1163, + "train/ce_loss": 1.1004800398950465e-05 + }, + { + "epoch": 0.1149891239865533, + "step": 1163, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.1149891239865533, + "step": 1163, + "train/total_loss": 0.07421985268592834 + }, + { + "entropy": 9.675859451293945, + "epoch": 0.11508799683606881, + "mean_token_accuracy": 0.6913996338844299, + "num_tokens": 6063837.0, + "step": 1164, + "train/ce_loss": 2.5309915145044215e-05 + }, + { + "epoch": 0.11508799683606881, + "step": 1164, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.11508799683606881, + "step": 1164, + "train/total_loss": 0.039065029472112656 + }, + { + "entropy": 9.744415283203125, + "epoch": 0.11518686968558434, + "mean_token_accuracy": 0.7025547623634338, + "num_tokens": 6068853.0, + "step": 1165, + "train/ce_loss": 1.094001293182373 + }, + { + "epoch": 0.11518686968558434, + "step": 1165, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.11518686968558434, + "step": 1165, + "train/total_loss": 0.19533762335777283 + }, + { + "entropy": 9.051517486572266, + "epoch": 0.11528574253509986, + "mean_token_accuracy": 0.7436781525611877, + "num_tokens": 6074213.0, + "step": 1166, + "train/ce_loss": 1.2844007015228271 + }, + { + "epoch": 0.11528574253509986, + "step": 1166, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.11528574253509986, + "step": 1166, + "train/total_loss": 0.22609631717205048 + }, + { + "entropy": 9.856742858886719, + "epoch": 0.11538461538461539, + "mean_token_accuracy": 0.6978998184204102, + "num_tokens": 6079217.0, + "step": 1167, + "train/ce_loss": 1.1069480180740356 + }, + { + "epoch": 0.11538461538461539, + "step": 1167, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.11538461538461539, + "step": 1167, + "train/total_loss": 0.1653822958469391 + }, + { + "entropy": 9.239913940429688, + "epoch": 0.1154834882341309, + "mean_token_accuracy": 0.7493917346000671, + "num_tokens": 6084490.0, + "step": 1168, + "train/ce_loss": 0.7585064768791199 + }, + { + "epoch": 0.1154834882341309, + "step": 1168, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.1154834882341309, + "step": 1168, + "train/total_loss": 0.16178815066814423 + }, + { + "entropy": 9.63561725616455, + "epoch": 0.11558236108364643, + "mean_token_accuracy": 0.7750759720802307, + "num_tokens": 6089607.0, + "step": 1169, + "train/ce_loss": 1.0800330638885498 + }, + { + "epoch": 0.11558236108364643, + "step": 1169, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.11558236108364643, + "step": 1169, + "train/total_loss": 0.19784706830978394 + }, + { + "entropy": 9.188960075378418, + "epoch": 0.11568123393316196, + "mean_token_accuracy": 0.6948717832565308, + "num_tokens": 6094847.0, + "step": 1170, + "train/ce_loss": 0.8472950458526611 + }, + { + "epoch": 0.11568123393316196, + "step": 1170, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.11568123393316196, + "step": 1170, + "train/total_loss": 0.14722950756549835 + }, + { + "entropy": 9.626472473144531, + "epoch": 0.11578010678267747, + "mean_token_accuracy": 0.7612403035163879, + "num_tokens": 6099914.0, + "step": 1171, + "train/ce_loss": 1.5983554124832153 + }, + { + "epoch": 0.11578010678267747, + "step": 1171, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.11578010678267747, + "step": 1171, + "train/total_loss": 0.226241797208786 + }, + { + "entropy": 10.03702163696289, + "epoch": 0.115878979632193, + "mean_token_accuracy": 0.7382550239562988, + "num_tokens": 6104787.0, + "step": 1172, + "train/ce_loss": 1.2654229402542114 + }, + { + "epoch": 0.115878979632193, + "step": 1172, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.115878979632193, + "step": 1172, + "train/total_loss": 0.16169854998588562 + }, + { + "entropy": 9.217288970947266, + "epoch": 0.11597785248170853, + "mean_token_accuracy": 0.7721238732337952, + "num_tokens": 6110145.0, + "step": 1173, + "train/ce_loss": 0.9713622331619263 + }, + { + "epoch": 0.11597785248170853, + "step": 1173, + "train/sim_loss": 0.15234375 + }, + { + "epoch": 0.11597785248170853, + "step": 1173, + "train/total_loss": 0.2494799792766571 + }, + { + "entropy": 9.51988410949707, + "epoch": 0.11607672533122404, + "mean_token_accuracy": 0.6904109716415405, + "num_tokens": 6115359.0, + "step": 1174, + "train/ce_loss": 1.0642578601837158 + }, + { + "epoch": 0.11607672533122404, + "step": 1174, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.11607672533122404, + "step": 1174, + "train/total_loss": 0.23142579197883606 + }, + { + "entropy": 9.422069549560547, + "epoch": 0.11617559818073957, + "mean_token_accuracy": 0.6796116232872009, + "num_tokens": 6120522.0, + "step": 1175, + "train/ce_loss": 1.9794244766235352 + }, + { + "epoch": 0.11617559818073957, + "step": 1175, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.11617559818073957, + "step": 1175, + "train/total_loss": 0.31903618574142456 + }, + { + "entropy": 9.484894752502441, + "epoch": 0.1162744710302551, + "mean_token_accuracy": 0.6625683307647705, + "num_tokens": 6125659.0, + "step": 1176, + "train/ce_loss": 1.1174323844898026e-05 + }, + { + "epoch": 0.1162744710302551, + "step": 1176, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.1162744710302551, + "step": 1176, + "train/total_loss": 0.04296986758708954 + }, + { + "entropy": 8.861796379089355, + "epoch": 0.11637334387977062, + "mean_token_accuracy": 0.7552238702774048, + "num_tokens": 6131122.0, + "step": 1177, + "train/ce_loss": 1.1340677738189697 + }, + { + "epoch": 0.11637334387977062, + "step": 1177, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.11637334387977062, + "step": 1177, + "train/total_loss": 0.21887552738189697 + }, + { + "entropy": 9.71383285522461, + "epoch": 0.11647221672928613, + "mean_token_accuracy": 0.7714285850524902, + "num_tokens": 6136176.0, + "step": 1178, + "train/ce_loss": 0.6980844736099243 + }, + { + "epoch": 0.11647221672928613, + "step": 1178, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.11647221672928613, + "step": 1178, + "train/total_loss": 0.1713709533214569 + }, + { + "entropy": 9.452953338623047, + "epoch": 0.11657108957880166, + "mean_token_accuracy": 0.7136871218681335, + "num_tokens": 6141338.0, + "step": 1179, + "train/ce_loss": 0.9732190370559692 + }, + { + "epoch": 0.11657108957880166, + "step": 1179, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.11657108957880166, + "step": 1179, + "train/total_loss": 0.16372814774513245 + }, + { + "epoch": 0.11666996242831719, + "grad_norm": 1.0579614639282227, + "learning_rate": 9.71097265489789e-06, + "loss": 0.1707, + "step": 1180 + }, + { + "entropy": 9.792144775390625, + "epoch": 0.11666996242831719, + "mean_token_accuracy": 0.6960950493812561, + "num_tokens": 6146359.0, + "step": 1180, + "train/ce_loss": 1.1791037321090698 + }, + { + "epoch": 0.11666996242831719, + "step": 1180, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.11666996242831719, + "step": 1180, + "train/total_loss": 0.22728538513183594 + }, + { + "entropy": 10.021678924560547, + "epoch": 0.1167688352778327, + "mean_token_accuracy": 0.7434554696083069, + "num_tokens": 6151194.0, + "step": 1181, + "train/ce_loss": 1.1177552938461304 + }, + { + "epoch": 0.1167688352778327, + "step": 1181, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.1167688352778327, + "step": 1181, + "train/total_loss": 0.24068178236484528 + }, + { + "entropy": 9.493809700012207, + "epoch": 0.11686770812734823, + "mean_token_accuracy": 0.73051518201828, + "num_tokens": 6156397.0, + "step": 1182, + "train/ce_loss": 1.126855731010437 + }, + { + "epoch": 0.11686770812734823, + "step": 1182, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.11686770812734823, + "step": 1182, + "train/total_loss": 0.15956057608127594 + }, + { + "entropy": 9.727752685546875, + "epoch": 0.11696658097686376, + "mean_token_accuracy": 0.7101200819015503, + "num_tokens": 6161441.0, + "step": 1183, + "train/ce_loss": 1.4100347757339478 + }, + { + "epoch": 0.11696658097686376, + "step": 1183, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.11696658097686376, + "step": 1183, + "train/total_loss": 0.19569097459316254 + }, + { + "entropy": 9.780705451965332, + "epoch": 0.11706545382637927, + "mean_token_accuracy": 0.739051103591919, + "num_tokens": 6166437.0, + "step": 1184, + "train/ce_loss": 1.4163333177566528 + }, + { + "epoch": 0.11706545382637927, + "step": 1184, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.11706545382637927, + "step": 1184, + "train/total_loss": 0.2900708317756653 + }, + { + "entropy": 9.239494323730469, + "epoch": 0.1171643266758948, + "mean_token_accuracy": 0.7134831547737122, + "num_tokens": 6171772.0, + "step": 1185, + "train/ce_loss": 0.5141210556030273 + }, + { + "epoch": 0.1171643266758948, + "step": 1185, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.1171643266758948, + "step": 1185, + "train/total_loss": 0.12563085556030273 + }, + { + "entropy": 9.675355911254883, + "epoch": 0.11726319952541032, + "mean_token_accuracy": 0.7637795209884644, + "num_tokens": 6176786.0, + "step": 1186, + "train/ce_loss": 1.1922436952590942 + }, + { + "epoch": 0.11726319952541032, + "step": 1186, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.11726319952541032, + "step": 1186, + "train/total_loss": 0.18953686952590942 + }, + { + "entropy": 9.679882049560547, + "epoch": 0.11736207237492585, + "mean_token_accuracy": 0.751655638217926, + "num_tokens": 6181865.0, + "step": 1187, + "train/ce_loss": 6.324046262307093e-05 + }, + { + "epoch": 0.11736207237492585, + "step": 1187, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.11736207237492585, + "step": 1187, + "train/total_loss": 0.05078757554292679 + }, + { + "entropy": 9.549026489257812, + "epoch": 0.11746094522444137, + "mean_token_accuracy": 0.6795030832290649, + "num_tokens": 6187096.0, + "step": 1188, + "train/ce_loss": 2.21201753616333 + }, + { + "epoch": 0.11746094522444137, + "step": 1188, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.11746094522444137, + "step": 1188, + "train/total_loss": 0.3696392774581909 + }, + { + "entropy": 10.246437072753906, + "epoch": 0.11755981807395689, + "mean_token_accuracy": 0.7467948794364929, + "num_tokens": 6191811.0, + "step": 1189, + "train/ce_loss": 2.0585265159606934 + }, + { + "epoch": 0.11755981807395689, + "step": 1189, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.11755981807395689, + "step": 1189, + "train/total_loss": 0.2644464075565338 + }, + { + "entropy": 9.535202026367188, + "epoch": 0.11765869092347242, + "mean_token_accuracy": 0.7592319250106812, + "num_tokens": 6196954.0, + "step": 1190, + "train/ce_loss": 1.5350197553634644 + }, + { + "epoch": 0.11765869092347242, + "step": 1190, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.11765869092347242, + "step": 1190, + "train/total_loss": 0.2667832374572754 + }, + { + "entropy": 9.761550903320312, + "epoch": 0.11775756377298793, + "mean_token_accuracy": 0.6554770469665527, + "num_tokens": 6201988.0, + "step": 1191, + "train/ce_loss": 1.2218689918518066 + }, + { + "epoch": 0.11775756377298793, + "step": 1191, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.11775756377298793, + "step": 1191, + "train/total_loss": 0.24718689918518066 + }, + { + "entropy": 9.070679664611816, + "epoch": 0.11785643662250346, + "mean_token_accuracy": 0.7737818956375122, + "num_tokens": 6207322.0, + "step": 1192, + "train/ce_loss": 0.7580581903457642 + }, + { + "epoch": 0.11785643662250346, + "step": 1192, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.11785643662250346, + "step": 1192, + "train/total_loss": 0.17346206307411194 + }, + { + "entropy": 9.435079574584961, + "epoch": 0.11795530947201899, + "mean_token_accuracy": 0.7184873819351196, + "num_tokens": 6212482.0, + "step": 1193, + "train/ce_loss": 3.4512537240516394e-05 + }, + { + "epoch": 0.11795530947201899, + "step": 1193, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.11795530947201899, + "step": 1193, + "train/total_loss": 0.06640969961881638 + }, + { + "entropy": 9.026063919067383, + "epoch": 0.1180541823215345, + "mean_token_accuracy": 0.7972167134284973, + "num_tokens": 6217954.0, + "step": 1194, + "train/ce_loss": 0.8212894201278687 + }, + { + "epoch": 0.1180541823215345, + "step": 1194, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.1180541823215345, + "step": 1194, + "train/total_loss": 0.16806644201278687 + }, + { + "entropy": 9.829679489135742, + "epoch": 0.11815305517105003, + "mean_token_accuracy": 0.8259385824203491, + "num_tokens": 6223014.0, + "step": 1195, + "train/ce_loss": 1.441159110981971e-05 + }, + { + "epoch": 0.11815305517105003, + "step": 1195, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.11815305517105003, + "step": 1195, + "train/total_loss": 0.0859389379620552 + }, + { + "entropy": 9.668957710266113, + "epoch": 0.11825192802056556, + "mean_token_accuracy": 0.7221324443817139, + "num_tokens": 6228082.0, + "step": 1196, + "train/ce_loss": 1.3952945664641447e-05 + }, + { + "epoch": 0.11825192802056556, + "step": 1196, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.11825192802056556, + "step": 1196, + "train/total_loss": 0.027345145121216774 + }, + { + "entropy": 8.92799186706543, + "epoch": 0.11835080087008108, + "mean_token_accuracy": 0.7669094800949097, + "num_tokens": 6233533.0, + "step": 1197, + "train/ce_loss": 0.5769115090370178 + }, + { + "epoch": 0.11835080087008108, + "step": 1197, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.11835080087008108, + "step": 1197, + "train/total_loss": 0.14753490686416626 + }, + { + "entropy": 9.498449325561523, + "epoch": 0.1184496737195966, + "mean_token_accuracy": 0.73221755027771, + "num_tokens": 6238696.0, + "step": 1198, + "train/ce_loss": 1.7542729377746582 + }, + { + "epoch": 0.1184496737195966, + "step": 1198, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.1184496737195966, + "step": 1198, + "train/total_loss": 0.29261481761932373 + }, + { + "entropy": 8.95993423461914, + "epoch": 0.11854854656911212, + "mean_token_accuracy": 0.7315508127212524, + "num_tokens": 6244122.0, + "step": 1199, + "train/ce_loss": 0.6192775964736938 + }, + { + "epoch": 0.11854854656911212, + "step": 1199, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.11854854656911212, + "step": 1199, + "train/total_loss": 0.12052151560783386 + }, + { + "epoch": 0.11864741941862765, + "grad_norm": 1.1604726314544678, + "learning_rate": 9.70602779013994e-06, + "loss": 0.1688, + "step": 1200 + }, + { + "entropy": 9.959648132324219, + "epoch": 0.11864741941862765, + "mean_token_accuracy": 0.804347813129425, + "num_tokens": 6249015.0, + "step": 1200, + "train/ce_loss": 1.8018967239186168e-05 + }, + { + "epoch": 0.11864741941862765, + "step": 1200, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.11864741941862765, + "step": 1200, + "train/total_loss": 0.027345551177859306 + }, + { + "entropy": 9.165637969970703, + "epoch": 0.11874629226814316, + "mean_token_accuracy": 0.7649667263031006, + "num_tokens": 6254365.0, + "step": 1201, + "train/ce_loss": 0.9161604046821594 + }, + { + "epoch": 0.11874629226814316, + "step": 1201, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.11874629226814316, + "step": 1201, + "train/total_loss": 0.20099103450775146 + }, + { + "entropy": 9.422985076904297, + "epoch": 0.11884516511765869, + "mean_token_accuracy": 0.7157894968986511, + "num_tokens": 6259554.0, + "step": 1202, + "train/ce_loss": 0.6448574066162109 + }, + { + "epoch": 0.11884516511765869, + "step": 1202, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.11884516511765869, + "step": 1202, + "train/total_loss": 0.09964199364185333 + }, + { + "entropy": 9.062889099121094, + "epoch": 0.11894403796717422, + "mean_token_accuracy": 0.7551020383834839, + "num_tokens": 6265015.0, + "step": 1203, + "train/ce_loss": 0.49379420280456543 + }, + { + "epoch": 0.11894403796717422, + "step": 1203, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.11894403796717422, + "step": 1203, + "train/total_loss": 0.16266067326068878 + }, + { + "entropy": 9.328768730163574, + "epoch": 0.11904291081668973, + "mean_token_accuracy": 0.7580437660217285, + "num_tokens": 6270231.0, + "step": 1204, + "train/ce_loss": 0.7451881170272827 + }, + { + "epoch": 0.11904291081668973, + "step": 1204, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.11904291081668973, + "step": 1204, + "train/total_loss": 0.1409250646829605 + }, + { + "entropy": 9.521407127380371, + "epoch": 0.11914178366620526, + "mean_token_accuracy": 0.7482993006706238, + "num_tokens": 6275434.0, + "step": 1205, + "train/ce_loss": 0.8505513072013855 + }, + { + "epoch": 0.11914178366620526, + "step": 1205, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.11914178366620526, + "step": 1205, + "train/total_loss": 0.1748988926410675 + }, + { + "entropy": 9.026713371276855, + "epoch": 0.11924065651572079, + "mean_token_accuracy": 0.7139507532119751, + "num_tokens": 6280762.0, + "step": 1206, + "train/ce_loss": 0.9918038249015808 + }, + { + "epoch": 0.11924065651572079, + "step": 1206, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.11924065651572079, + "step": 1206, + "train/total_loss": 0.20074288547039032 + }, + { + "entropy": 9.690537452697754, + "epoch": 0.1193395293652363, + "mean_token_accuracy": 0.7435529828071594, + "num_tokens": 6285845.0, + "step": 1207, + "train/ce_loss": 0.7968851327896118 + }, + { + "epoch": 0.1193395293652363, + "step": 1207, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.1193395293652363, + "step": 1207, + "train/total_loss": 0.18125101923942566 + }, + { + "entropy": 9.778779029846191, + "epoch": 0.11943840221475183, + "mean_token_accuracy": 0.7252964377403259, + "num_tokens": 6290782.0, + "step": 1208, + "train/ce_loss": 1.856166958808899 + }, + { + "epoch": 0.11943840221475183, + "step": 1208, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.11943840221475183, + "step": 1208, + "train/total_loss": 0.31842920184135437 + }, + { + "entropy": 9.426246643066406, + "epoch": 0.11953727506426735, + "mean_token_accuracy": 0.7288590669631958, + "num_tokens": 6295967.0, + "step": 1209, + "train/ce_loss": 0.974338948726654 + }, + { + "epoch": 0.11953727506426735, + "step": 1209, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.11953727506426735, + "step": 1209, + "train/total_loss": 0.1482151448726654 + }, + { + "entropy": 8.880840301513672, + "epoch": 0.11963614791378288, + "mean_token_accuracy": 0.6835051774978638, + "num_tokens": 6301374.0, + "step": 1210, + "train/ce_loss": 1.1998740434646606 + }, + { + "epoch": 0.11963614791378288, + "step": 1210, + "train/sim_loss": 0.171875 + }, + { + "epoch": 0.11963614791378288, + "step": 1210, + "train/total_loss": 0.2918623983860016 + }, + { + "entropy": 9.126518249511719, + "epoch": 0.1197350207632984, + "mean_token_accuracy": 0.7554535269737244, + "num_tokens": 6306708.0, + "step": 1211, + "train/ce_loss": 0.4112582504749298 + }, + { + "epoch": 0.1197350207632984, + "step": 1211, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.1197350207632984, + "step": 1211, + "train/total_loss": 0.0958133265376091 + }, + { + "entropy": 9.21739387512207, + "epoch": 0.11983389361281392, + "mean_token_accuracy": 0.7209011316299438, + "num_tokens": 6311950.0, + "step": 1212, + "train/ce_loss": 1.0548444986343384 + }, + { + "epoch": 0.11983389361281392, + "step": 1212, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.11983389361281392, + "step": 1212, + "train/total_loss": 0.19532820582389832 + }, + { + "entropy": 9.517651557922363, + "epoch": 0.11993276646232945, + "mean_token_accuracy": 0.7482900023460388, + "num_tokens": 6317089.0, + "step": 1213, + "train/ce_loss": 0.5034375786781311 + }, + { + "epoch": 0.11993276646232945, + "step": 1213, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.11993276646232945, + "step": 1213, + "train/total_loss": 0.07768750935792923 + }, + { + "entropy": 9.730310440063477, + "epoch": 0.12003163931184496, + "mean_token_accuracy": 0.6894639730453491, + "num_tokens": 6322081.0, + "step": 1214, + "train/ce_loss": 1.6999226808547974 + }, + { + "epoch": 0.12003163931184496, + "step": 1214, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.12003163931184496, + "step": 1214, + "train/total_loss": 0.27936726808547974 + }, + { + "entropy": 9.209872245788574, + "epoch": 0.12013051216136049, + "mean_token_accuracy": 0.755750298500061, + "num_tokens": 6327402.0, + "step": 1215, + "train/ce_loss": 1.0638724565505981 + }, + { + "epoch": 0.12013051216136049, + "step": 1215, + "train/sim_loss": 0.203125 + }, + { + "epoch": 0.12013051216136049, + "step": 1215, + "train/total_loss": 0.30951225757598877 + }, + { + "entropy": 9.435253143310547, + "epoch": 0.12022938501087602, + "mean_token_accuracy": 0.6762028336524963, + "num_tokens": 6332575.0, + "step": 1216, + "train/ce_loss": 0.6653417944908142 + }, + { + "epoch": 0.12022938501087602, + "step": 1216, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.12022938501087602, + "step": 1216, + "train/total_loss": 0.17590919137001038 + }, + { + "entropy": 9.680652618408203, + "epoch": 0.12032825786039153, + "mean_token_accuracy": 0.7171052694320679, + "num_tokens": 6337618.0, + "step": 1217, + "train/ce_loss": 0.9547468423843384 + }, + { + "epoch": 0.12032825786039153, + "step": 1217, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.12032825786039153, + "step": 1217, + "train/total_loss": 0.18531844019889832 + }, + { + "entropy": 9.75971508026123, + "epoch": 0.12042713070990706, + "mean_token_accuracy": 0.7676767706871033, + "num_tokens": 6342641.0, + "step": 1218, + "train/ce_loss": 1.0770344734191895 + }, + { + "epoch": 0.12042713070990706, + "step": 1218, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.12042713070990706, + "step": 1218, + "train/total_loss": 0.17020344734191895 + }, + { + "entropy": 9.716499328613281, + "epoch": 0.12052600355942258, + "mean_token_accuracy": 0.7244367599487305, + "num_tokens": 6347644.0, + "step": 1219, + "train/ce_loss": 1.3945896625518799 + }, + { + "epoch": 0.12052600355942258, + "step": 1219, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.12052600355942258, + "step": 1219, + "train/total_loss": 0.22149021923542023 + }, + { + "epoch": 0.12062487640893811, + "grad_norm": 1.1249451637268066, + "learning_rate": 9.701082925381992e-06, + "loss": 0.1742, + "step": 1220 + }, + { + "entropy": 9.388711929321289, + "epoch": 0.12062487640893811, + "mean_token_accuracy": 0.7427440881729126, + "num_tokens": 6352873.0, + "step": 1220, + "train/ce_loss": 0.7889469265937805 + }, + { + "epoch": 0.12062487640893811, + "step": 1220, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.12062487640893811, + "step": 1220, + "train/total_loss": 0.164832204580307 + }, + { + "entropy": 9.422139167785645, + "epoch": 0.12072374925845362, + "mean_token_accuracy": 0.735897421836853, + "num_tokens": 6358135.0, + "step": 1221, + "train/ce_loss": 1.3089293241500854 + }, + { + "epoch": 0.12072374925845362, + "step": 1221, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.12072374925845362, + "step": 1221, + "train/total_loss": 0.22073668241500854 + }, + { + "entropy": 9.557997703552246, + "epoch": 0.12082262210796915, + "mean_token_accuracy": 0.7147541046142578, + "num_tokens": 6363158.0, + "step": 1222, + "train/ce_loss": 0.9907953143119812 + }, + { + "epoch": 0.12082262210796915, + "step": 1222, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.12082262210796915, + "step": 1222, + "train/total_loss": 0.16548578441143036 + }, + { + "entropy": 9.598645210266113, + "epoch": 0.12092149495748468, + "mean_token_accuracy": 0.7001394629478455, + "num_tokens": 6368329.0, + "step": 1223, + "train/ce_loss": 1.5701709985733032 + }, + { + "epoch": 0.12092149495748468, + "step": 1223, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.12092149495748468, + "step": 1223, + "train/total_loss": 0.2820171117782593 + }, + { + "entropy": 9.731266021728516, + "epoch": 0.12102036780700019, + "mean_token_accuracy": 0.7379181981086731, + "num_tokens": 6373324.0, + "step": 1224, + "train/ce_loss": 3.4697419323492795e-05 + }, + { + "epoch": 0.12102036780700019, + "step": 1224, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.12102036780700019, + "step": 1224, + "train/total_loss": 0.07422222197055817 + }, + { + "entropy": 9.88830280303955, + "epoch": 0.12111924065651572, + "mean_token_accuracy": 0.752964437007904, + "num_tokens": 6378294.0, + "step": 1225, + "train/ce_loss": 0.966044008731842 + }, + { + "epoch": 0.12111924065651572, + "step": 1225, + "train/sim_loss": 0.14453125 + }, + { + "epoch": 0.12111924065651572, + "step": 1225, + "train/total_loss": 0.24113565683364868 + }, + { + "entropy": 9.497847557067871, + "epoch": 0.12121811350603125, + "mean_token_accuracy": 0.7132768630981445, + "num_tokens": 6383478.0, + "step": 1226, + "train/ce_loss": 1.0445523262023926 + }, + { + "epoch": 0.12121811350603125, + "step": 1226, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.12121811350603125, + "step": 1226, + "train/total_loss": 0.18648648262023926 + }, + { + "entropy": 9.349191665649414, + "epoch": 0.12131698635554676, + "mean_token_accuracy": 0.7230169177055359, + "num_tokens": 6388691.0, + "step": 1227, + "train/ce_loss": 0.7283400893211365 + }, + { + "epoch": 0.12131698635554676, + "step": 1227, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.12131698635554676, + "step": 1227, + "train/total_loss": 0.13533401489257812 + }, + { + "entropy": 9.422361373901367, + "epoch": 0.12141585920506229, + "mean_token_accuracy": 0.7373096346855164, + "num_tokens": 6393930.0, + "step": 1228, + "train/ce_loss": 0.3774206340312958 + }, + { + "epoch": 0.12141585920506229, + "step": 1228, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.12141585920506229, + "step": 1228, + "train/total_loss": 0.10414831340312958 + }, + { + "entropy": 9.410026550292969, + "epoch": 0.12151473205457781, + "mean_token_accuracy": 0.7279693484306335, + "num_tokens": 6399156.0, + "step": 1229, + "train/ce_loss": 0.9712676405906677 + }, + { + "epoch": 0.12151473205457781, + "step": 1229, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.12151473205457781, + "step": 1229, + "train/total_loss": 0.147908017039299 + }, + { + "entropy": 10.178994178771973, + "epoch": 0.12161360490409334, + "mean_token_accuracy": 0.7486910820007324, + "num_tokens": 6403792.0, + "step": 1230, + "train/ce_loss": 3.394787549972534 + }, + { + "epoch": 0.12161360490409334, + "step": 1230, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.12161360490409334, + "step": 1230, + "train/total_loss": 0.4801037609577179 + }, + { + "entropy": 10.003273010253906, + "epoch": 0.12171247775360886, + "mean_token_accuracy": 0.6084656119346619, + "num_tokens": 6408747.0, + "step": 1231, + "train/ce_loss": 1.4144474334898405e-05 + }, + { + "epoch": 0.12171247775360886, + "step": 1231, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.12171247775360886, + "step": 1231, + "train/total_loss": 0.07031391561031342 + }, + { + "entropy": 9.176398277282715, + "epoch": 0.12181135060312438, + "mean_token_accuracy": 0.6727467775344849, + "num_tokens": 6414145.0, + "step": 1232, + "train/ce_loss": 0.8553107976913452 + }, + { + "epoch": 0.12181135060312438, + "step": 1232, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.12181135060312438, + "step": 1232, + "train/total_loss": 0.179281085729599 + }, + { + "entropy": 9.400400161743164, + "epoch": 0.12191022345263991, + "mean_token_accuracy": 0.739130437374115, + "num_tokens": 6419402.0, + "step": 1233, + "train/ce_loss": 1.1326872110366821 + }, + { + "epoch": 0.12191022345263991, + "step": 1233, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.12191022345263991, + "step": 1233, + "train/total_loss": 0.20701873302459717 + }, + { + "entropy": 9.089184761047363, + "epoch": 0.12200909630215542, + "mean_token_accuracy": 0.7258883118629456, + "num_tokens": 6424840.0, + "step": 1234, + "train/ce_loss": 0.7860023379325867 + }, + { + "epoch": 0.12200909630215542, + "step": 1234, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.12200909630215542, + "step": 1234, + "train/total_loss": 0.1293814778327942 + }, + { + "entropy": 9.444795608520508, + "epoch": 0.12210796915167095, + "mean_token_accuracy": 0.75, + "num_tokens": 6429992.0, + "step": 1235, + "train/ce_loss": 0.5777448415756226 + }, + { + "epoch": 0.12210796915167095, + "step": 1235, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.12210796915167095, + "step": 1235, + "train/total_loss": 0.10464948415756226 + }, + { + "entropy": 8.957151412963867, + "epoch": 0.12220684200118648, + "mean_token_accuracy": 0.7469262480735779, + "num_tokens": 6435485.0, + "step": 1236, + "train/ce_loss": 0.6073027849197388 + }, + { + "epoch": 0.12220684200118648, + "step": 1236, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.12220684200118648, + "step": 1236, + "train/total_loss": 0.17010527849197388 + }, + { + "entropy": 9.437676429748535, + "epoch": 0.12230571485070199, + "mean_token_accuracy": 0.7027778029441833, + "num_tokens": 6440692.0, + "step": 1237, + "train/ce_loss": 0.6752130389213562 + }, + { + "epoch": 0.12230571485070199, + "step": 1237, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.12230571485070199, + "step": 1237, + "train/total_loss": 0.15345880389213562 + }, + { + "entropy": 9.129096984863281, + "epoch": 0.12240458770021752, + "mean_token_accuracy": 0.6656050682067871, + "num_tokens": 6446127.0, + "step": 1238, + "train/ce_loss": 0.9888354539871216 + }, + { + "epoch": 0.12240458770021752, + "step": 1238, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.12240458770021752, + "step": 1238, + "train/total_loss": 0.16138353943824768 + }, + { + "entropy": 9.386200904846191, + "epoch": 0.12250346054973305, + "mean_token_accuracy": 0.7580437660217285, + "num_tokens": 6451381.0, + "step": 1239, + "train/ce_loss": 0.8257952332496643 + }, + { + "epoch": 0.12250346054973305, + "step": 1239, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.12250346054973305, + "step": 1239, + "train/total_loss": 0.15289202332496643 + }, + { + "epoch": 0.12260233339924857, + "grad_norm": 0.992129385471344, + "learning_rate": 9.696138060624043e-06, + "loss": 0.1771, + "step": 1240 + }, + { + "entropy": 9.329833984375, + "epoch": 0.12260233339924857, + "mean_token_accuracy": 0.7079953551292419, + "num_tokens": 6456712.0, + "step": 1240, + "train/ce_loss": 0.3363990783691406 + }, + { + "epoch": 0.12260233339924857, + "step": 1240, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.12260233339924857, + "step": 1240, + "train/total_loss": 0.08832740783691406 + }, + { + "entropy": 9.286294937133789, + "epoch": 0.12270120624876409, + "mean_token_accuracy": 0.7363834381103516, + "num_tokens": 6462330.0, + "step": 1241, + "train/ce_loss": 1.0246698366245255e-05 + }, + { + "epoch": 0.12270120624876409, + "step": 1241, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.12270120624876409, + "step": 1241, + "train/total_loss": 0.07812602818012238 + }, + { + "entropy": 9.592647552490234, + "epoch": 0.12280007909827961, + "mean_token_accuracy": 0.7633228898048401, + "num_tokens": 6467407.0, + "step": 1242, + "train/ce_loss": 1.1297849416732788 + }, + { + "epoch": 0.12280007909827961, + "step": 1242, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.12280007909827961, + "step": 1242, + "train/total_loss": 0.1754784882068634 + }, + { + "entropy": 9.25661849975586, + "epoch": 0.12289895194779514, + "mean_token_accuracy": 0.7703788876533508, + "num_tokens": 6472744.0, + "step": 1243, + "train/ce_loss": 0.5336284637451172 + }, + { + "epoch": 0.12289895194779514, + "step": 1243, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.12289895194779514, + "step": 1243, + "train/total_loss": 0.14711284637451172 + }, + { + "entropy": 9.378896713256836, + "epoch": 0.12299782479731065, + "mean_token_accuracy": 0.6879240274429321, + "num_tokens": 6477966.0, + "step": 1244, + "train/ce_loss": 1.2743122577667236 + }, + { + "epoch": 0.12299782479731065, + "step": 1244, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.12299782479731065, + "step": 1244, + "train/total_loss": 0.1860249787569046 + }, + { + "entropy": 9.449660301208496, + "epoch": 0.12309669764682618, + "mean_token_accuracy": 0.7572559118270874, + "num_tokens": 6483175.0, + "step": 1245, + "train/ce_loss": 3.446074333623983e-05 + }, + { + "epoch": 0.12309669764682618, + "step": 1245, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.12309669764682618, + "step": 1245, + "train/total_loss": 0.08594094961881638 + }, + { + "entropy": 9.363601684570312, + "epoch": 0.12319557049634171, + "mean_token_accuracy": 0.6921212077140808, + "num_tokens": 6488457.0, + "step": 1246, + "train/ce_loss": 0.9129509329795837 + }, + { + "epoch": 0.12319557049634171, + "step": 1246, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.12319557049634171, + "step": 1246, + "train/total_loss": 0.20067009329795837 + }, + { + "entropy": 9.186436653137207, + "epoch": 0.12329444334585722, + "mean_token_accuracy": 0.7436463832855225, + "num_tokens": 6493826.0, + "step": 1247, + "train/ce_loss": 0.9415988922119141 + }, + { + "epoch": 0.12329444334585722, + "step": 1247, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.12329444334585722, + "step": 1247, + "train/total_loss": 0.15665990114212036 + }, + { + "entropy": 9.832391738891602, + "epoch": 0.12339331619537275, + "mean_token_accuracy": 0.7275862097740173, + "num_tokens": 6498876.0, + "step": 1248, + "train/ce_loss": 0.860569417476654 + }, + { + "epoch": 0.12339331619537275, + "step": 1248, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.12339331619537275, + "step": 1248, + "train/total_loss": 0.15246319770812988 + }, + { + "entropy": 9.70506477355957, + "epoch": 0.12349218904488828, + "mean_token_accuracy": 0.6977152824401855, + "num_tokens": 6503856.0, + "step": 1249, + "train/ce_loss": 2.863835652533453e-05 + }, + { + "epoch": 0.12349218904488828, + "step": 1249, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.12349218904488828, + "step": 1249, + "train/total_loss": 0.06250286102294922 + }, + { + "entropy": 9.391181945800781, + "epoch": 0.1235910618944038, + "mean_token_accuracy": 0.7162346243858337, + "num_tokens": 6509076.0, + "step": 1250, + "train/ce_loss": 0.6494923233985901 + }, + { + "epoch": 0.1235910618944038, + "step": 1250, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.1235910618944038, + "step": 1250, + "train/total_loss": 0.11573048681020737 + }, + { + "entropy": 9.535764694213867, + "epoch": 0.12368993474391932, + "mean_token_accuracy": 0.754408061504364, + "num_tokens": 6514262.0, + "step": 1251, + "train/ce_loss": 1.2194236516952515 + }, + { + "epoch": 0.12368993474391932, + "step": 1251, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.12368993474391932, + "step": 1251, + "train/total_loss": 0.18053612112998962 + }, + { + "entropy": 9.383031845092773, + "epoch": 0.12378880759343484, + "mean_token_accuracy": 0.7243173122406006, + "num_tokens": 6519477.0, + "step": 1252, + "train/ce_loss": 0.6060498952865601 + }, + { + "epoch": 0.12378880759343484, + "step": 1252, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.12378880759343484, + "step": 1252, + "train/total_loss": 0.111386239528656 + }, + { + "entropy": 10.231584548950195, + "epoch": 0.12388768044295037, + "mean_token_accuracy": 0.7390300035476685, + "num_tokens": 6524383.0, + "step": 1253, + "train/ce_loss": 1.7479231357574463 + }, + { + "epoch": 0.12388768044295037, + "step": 1253, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.12388768044295037, + "step": 1253, + "train/total_loss": 0.2451048195362091 + }, + { + "entropy": 9.92870044708252, + "epoch": 0.12398655329246588, + "mean_token_accuracy": 0.7698541283607483, + "num_tokens": 6529570.0, + "step": 1254, + "train/ce_loss": 1.1913306479982566e-05 + }, + { + "epoch": 0.12398655329246588, + "step": 1254, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.12398655329246588, + "step": 1254, + "train/total_loss": 0.03125119209289551 + }, + { + "entropy": 9.180500984191895, + "epoch": 0.12408542614198141, + "mean_token_accuracy": 0.7398189902305603, + "num_tokens": 6534953.0, + "step": 1255, + "train/ce_loss": 0.7377535700798035 + }, + { + "epoch": 0.12408542614198141, + "step": 1255, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.12408542614198141, + "step": 1255, + "train/total_loss": 0.14799410104751587 + }, + { + "entropy": 9.337590217590332, + "epoch": 0.12418429899149694, + "mean_token_accuracy": 0.7755101919174194, + "num_tokens": 6540321.0, + "step": 1256, + "train/ce_loss": 0.681896448135376 + }, + { + "epoch": 0.12418429899149694, + "step": 1256, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.12418429899149694, + "step": 1256, + "train/total_loss": 0.11115839332342148 + }, + { + "entropy": 9.066247940063477, + "epoch": 0.12428317184101245, + "mean_token_accuracy": 0.723243236541748, + "num_tokens": 6545708.0, + "step": 1257, + "train/ce_loss": 1.0786329507827759 + }, + { + "epoch": 0.12428317184101245, + "step": 1257, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.12428317184101245, + "step": 1257, + "train/total_loss": 0.21723830699920654 + }, + { + "entropy": 9.692575454711914, + "epoch": 0.12438204469052798, + "mean_token_accuracy": 0.7036011219024658, + "num_tokens": 6550874.0, + "step": 1258, + "train/ce_loss": 1.9783855676651 + }, + { + "epoch": 0.12438204469052798, + "step": 1258, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.12438204469052798, + "step": 1258, + "train/total_loss": 0.29549479484558105 + }, + { + "entropy": 9.475704193115234, + "epoch": 0.1244809175400435, + "mean_token_accuracy": 0.7932098507881165, + "num_tokens": 6555977.0, + "step": 1259, + "train/ce_loss": 1.2450464963912964 + }, + { + "epoch": 0.1244809175400435, + "step": 1259, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.1244809175400435, + "step": 1259, + "train/total_loss": 0.2651296555995941 + }, + { + "epoch": 0.12457979038955903, + "grad_norm": 1.0474660396575928, + "learning_rate": 9.691193195866095e-06, + "loss": 0.1628, + "step": 1260 + }, + { + "entropy": 9.542675018310547, + "epoch": 0.12457979038955903, + "mean_token_accuracy": 0.7211155295372009, + "num_tokens": 6561183.0, + "step": 1260, + "train/ce_loss": 1.0661858320236206 + }, + { + "epoch": 0.12457979038955903, + "step": 1260, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.12457979038955903, + "step": 1260, + "train/total_loss": 0.17302483320236206 + }, + { + "entropy": 9.597604751586914, + "epoch": 0.12467866323907455, + "mean_token_accuracy": 0.795918345451355, + "num_tokens": 6566282.0, + "step": 1261, + "train/ce_loss": 0.525081217288971 + }, + { + "epoch": 0.12467866323907455, + "step": 1261, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.12467866323907455, + "step": 1261, + "train/total_loss": 0.07594562321901321 + }, + { + "entropy": 9.565014839172363, + "epoch": 0.12477753608859007, + "mean_token_accuracy": 0.7484471797943115, + "num_tokens": 6571362.0, + "step": 1262, + "train/ce_loss": 0.7680662870407104 + }, + { + "epoch": 0.12477753608859007, + "step": 1262, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.12477753608859007, + "step": 1262, + "train/total_loss": 0.13149413466453552 + }, + { + "entropy": 10.078500747680664, + "epoch": 0.1248764089381056, + "mean_token_accuracy": 0.7957746386528015, + "num_tokens": 6576215.0, + "step": 1263, + "train/ce_loss": 2.5891031327773817e-05 + }, + { + "epoch": 0.1248764089381056, + "step": 1263, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.1248764089381056, + "step": 1263, + "train/total_loss": 0.12890884280204773 + }, + { + "entropy": 9.09901237487793, + "epoch": 0.12497528178762111, + "mean_token_accuracy": 0.744027316570282, + "num_tokens": 6581545.0, + "step": 1264, + "train/ce_loss": 0.8924403786659241 + }, + { + "epoch": 0.12497528178762111, + "step": 1264, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.12497528178762111, + "step": 1264, + "train/total_loss": 0.1556502878665924 + }, + { + "entropy": 9.241326332092285, + "epoch": 0.12507415463713664, + "mean_token_accuracy": 0.7755101919174194, + "num_tokens": 6586889.0, + "step": 1265, + "train/ce_loss": 0.770359456539154 + }, + { + "epoch": 0.12507415463713664, + "step": 1265, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.12507415463713664, + "step": 1265, + "train/total_loss": 0.14734844863414764 + }, + { + "entropy": 9.286764144897461, + "epoch": 0.12517302748665216, + "mean_token_accuracy": 0.7348901033401489, + "num_tokens": 6592091.0, + "step": 1266, + "train/ce_loss": 0.9730169177055359 + }, + { + "epoch": 0.12517302748665216, + "step": 1266, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.12517302748665216, + "step": 1266, + "train/total_loss": 0.2066766917705536 + }, + { + "entropy": 9.571826934814453, + "epoch": 0.1252719003361677, + "mean_token_accuracy": 0.7157142758369446, + "num_tokens": 6597224.0, + "step": 1267, + "train/ce_loss": 1.1601194143295288 + }, + { + "epoch": 0.1252719003361677, + "step": 1267, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.1252719003361677, + "step": 1267, + "train/total_loss": 0.22538694739341736 + }, + { + "entropy": 9.823686599731445, + "epoch": 0.1253707731856832, + "mean_token_accuracy": 0.7427184581756592, + "num_tokens": 6602268.0, + "step": 1268, + "train/ce_loss": 1.0998046398162842 + }, + { + "epoch": 0.1253707731856832, + "step": 1268, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.1253707731856832, + "step": 1268, + "train/total_loss": 0.17638671398162842 + }, + { + "entropy": 9.747212409973145, + "epoch": 0.12546964603519872, + "mean_token_accuracy": 0.7157360315322876, + "num_tokens": 6607335.0, + "step": 1269, + "train/ce_loss": 1.0469977855682373 + }, + { + "epoch": 0.12546964603519872, + "step": 1269, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.12546964603519872, + "step": 1269, + "train/total_loss": 0.15548104047775269 + }, + { + "entropy": 9.508520126342773, + "epoch": 0.12556851888471426, + "mean_token_accuracy": 0.6960651278495789, + "num_tokens": 6612522.0, + "step": 1270, + "train/ce_loss": 0.7423213124275208 + }, + { + "epoch": 0.12556851888471426, + "step": 1270, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.12556851888471426, + "step": 1270, + "train/total_loss": 0.13673213124275208 + }, + { + "entropy": 10.032516479492188, + "epoch": 0.12566739173422978, + "mean_token_accuracy": 0.7343358397483826, + "num_tokens": 6617352.0, + "step": 1271, + "train/ce_loss": 1.6255320310592651 + }, + { + "epoch": 0.12566739173422978, + "step": 1271, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.12566739173422978, + "step": 1271, + "train/total_loss": 0.27583444118499756 + }, + { + "entropy": 9.865862846374512, + "epoch": 0.1257662645837453, + "mean_token_accuracy": 0.7620751261711121, + "num_tokens": 6622340.0, + "step": 1272, + "train/ce_loss": 1.6030211448669434 + }, + { + "epoch": 0.1257662645837453, + "step": 1272, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.1257662645837453, + "step": 1272, + "train/total_loss": 0.24233336746692657 + }, + { + "entropy": 9.0894775390625, + "epoch": 0.12586513743326083, + "mean_token_accuracy": 0.7485907673835754, + "num_tokens": 6627710.0, + "step": 1273, + "train/ce_loss": 1.1820942163467407 + }, + { + "epoch": 0.12586513743326083, + "step": 1273, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.12586513743326083, + "step": 1273, + "train/total_loss": 0.20024067163467407 + }, + { + "entropy": 10.18376350402832, + "epoch": 0.12596401028277635, + "mean_token_accuracy": 0.7234042286872864, + "num_tokens": 6632482.0, + "step": 1274, + "train/ce_loss": 1.2288492918014526 + }, + { + "epoch": 0.12596401028277635, + "step": 1274, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.12596401028277635, + "step": 1274, + "train/total_loss": 0.21663492918014526 + }, + { + "entropy": 10.220690727233887, + "epoch": 0.1260628831322919, + "mean_token_accuracy": 0.6997318863868713, + "num_tokens": 6637254.0, + "step": 1275, + "train/ce_loss": 2.4932305812835693 + }, + { + "epoch": 0.1260628831322919, + "step": 1275, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.1260628831322919, + "step": 1275, + "train/total_loss": 0.3899480700492859 + }, + { + "entropy": 9.369550704956055, + "epoch": 0.1261617559818074, + "mean_token_accuracy": 0.7096336483955383, + "num_tokens": 6642467.0, + "step": 1276, + "train/ce_loss": 0.7277787327766418 + }, + { + "epoch": 0.1261617559818074, + "step": 1276, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.1261617559818074, + "step": 1276, + "train/total_loss": 0.1743403673171997 + }, + { + "entropy": 9.829463958740234, + "epoch": 0.1262606288313229, + "mean_token_accuracy": 0.7222222089767456, + "num_tokens": 6647512.0, + "step": 1277, + "train/ce_loss": 1.1789212226867676 + }, + { + "epoch": 0.1262606288313229, + "step": 1277, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.1262606288313229, + "step": 1277, + "train/total_loss": 0.22336086630821228 + }, + { + "entropy": 8.898569107055664, + "epoch": 0.12635950168083845, + "mean_token_accuracy": 0.7696228623390198, + "num_tokens": 6652940.0, + "step": 1278, + "train/ce_loss": 0.7270646095275879 + }, + { + "epoch": 0.12635950168083845, + "step": 1278, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.12635950168083845, + "step": 1278, + "train/total_loss": 0.10395646095275879 + }, + { + "entropy": 9.275156021118164, + "epoch": 0.12645837453035397, + "mean_token_accuracy": 0.716167688369751, + "num_tokens": 6658269.0, + "step": 1279, + "train/ce_loss": 0.9162885546684265 + }, + { + "epoch": 0.12645837453035397, + "step": 1279, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.12645837453035397, + "step": 1279, + "train/total_loss": 0.18537884950637817 + }, + { + "epoch": 0.12655724737986948, + "grad_norm": 1.0265246629714966, + "learning_rate": 9.686248331108144e-06, + "loss": 0.1631, + "step": 1280 + }, + { + "entropy": 9.02204418182373, + "epoch": 0.12655724737986948, + "mean_token_accuracy": 0.738095223903656, + "num_tokens": 6663676.0, + "step": 1280, + "train/ce_loss": 0.5852922797203064 + }, + { + "epoch": 0.12655724737986948, + "step": 1280, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.12655724737986948, + "step": 1280, + "train/total_loss": 0.09759172797203064 + }, + { + "entropy": 9.600400924682617, + "epoch": 0.12665612022938502, + "mean_token_accuracy": 0.7263843417167664, + "num_tokens": 6668723.0, + "step": 1281, + "train/ce_loss": 1.3899685144424438 + }, + { + "epoch": 0.12665612022938502, + "step": 1281, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.12665612022938502, + "step": 1281, + "train/total_loss": 0.22102810442447662 + }, + { + "entropy": 9.386069297790527, + "epoch": 0.12675499307890054, + "mean_token_accuracy": 0.7247706651687622, + "num_tokens": 6674135.0, + "step": 1282, + "train/ce_loss": 1.2490756511688232 + }, + { + "epoch": 0.12675499307890054, + "step": 1282, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.12675499307890054, + "step": 1282, + "train/total_loss": 0.22256381809711456 + }, + { + "entropy": 9.768744468688965, + "epoch": 0.12685386592841605, + "mean_token_accuracy": 0.7996453642845154, + "num_tokens": 6679155.0, + "step": 1283, + "train/ce_loss": 1.2421215615177061e-05 + }, + { + "epoch": 0.12685386592841605, + "step": 1283, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.12685386592841605, + "step": 1283, + "train/total_loss": 0.03515749052166939 + }, + { + "entropy": 9.283811569213867, + "epoch": 0.1269527387779316, + "mean_token_accuracy": 0.691428542137146, + "num_tokens": 6684510.0, + "step": 1284, + "train/ce_loss": 0.9716193079948425 + }, + { + "epoch": 0.1269527387779316, + "step": 1284, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.1269527387779316, + "step": 1284, + "train/total_loss": 0.1909119337797165 + }, + { + "entropy": 9.552628517150879, + "epoch": 0.1270516116274471, + "mean_token_accuracy": 0.6743044257164001, + "num_tokens": 6689557.0, + "step": 1285, + "train/ce_loss": 1.5597680807113647 + }, + { + "epoch": 0.1270516116274471, + "step": 1285, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.1270516116274471, + "step": 1285, + "train/total_loss": 0.2809768319129944 + }, + { + "entropy": 8.94034194946289, + "epoch": 0.12715048447696262, + "mean_token_accuracy": 0.802480936050415, + "num_tokens": 6695068.0, + "step": 1286, + "train/ce_loss": 0.4175623655319214 + }, + { + "epoch": 0.12715048447696262, + "step": 1286, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.12715048447696262, + "step": 1286, + "train/total_loss": 0.16675624251365662 + }, + { + "entropy": 9.418731689453125, + "epoch": 0.12724935732647816, + "mean_token_accuracy": 0.7613104581832886, + "num_tokens": 6700156.0, + "step": 1287, + "train/ce_loss": 0.9082545638084412 + }, + { + "epoch": 0.12724935732647816, + "step": 1287, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.12724935732647816, + "step": 1287, + "train/total_loss": 0.16113796830177307 + }, + { + "entropy": 10.113789558410645, + "epoch": 0.12734823017599367, + "mean_token_accuracy": 0.7019704580307007, + "num_tokens": 6704954.0, + "step": 1288, + "train/ce_loss": 1.5278859791578725e-05 + }, + { + "epoch": 0.12734823017599367, + "step": 1288, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.12734823017599367, + "step": 1288, + "train/total_loss": 0.02734527736902237 + }, + { + "entropy": 9.118091583251953, + "epoch": 0.12744710302550918, + "mean_token_accuracy": 0.7314715385437012, + "num_tokens": 6710522.0, + "step": 1289, + "train/ce_loss": 0.701236367225647 + }, + { + "epoch": 0.12744710302550918, + "step": 1289, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.12744710302550918, + "step": 1289, + "train/total_loss": 0.18340489268302917 + }, + { + "entropy": 9.750611305236816, + "epoch": 0.12754597587502473, + "mean_token_accuracy": 0.7197802066802979, + "num_tokens": 6715529.0, + "step": 1290, + "train/ce_loss": 9.375158697366714e-06 + }, + { + "epoch": 0.12754597587502473, + "step": 1290, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.12754597587502473, + "step": 1290, + "train/total_loss": 0.08593843877315521 + }, + { + "entropy": 9.286053657531738, + "epoch": 0.12764484872454024, + "mean_token_accuracy": 0.7582821846008301, + "num_tokens": 6720773.0, + "step": 1291, + "train/ce_loss": 0.7109894752502441 + }, + { + "epoch": 0.12764484872454024, + "step": 1291, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.12764484872454024, + "step": 1291, + "train/total_loss": 0.1843802034854889 + }, + { + "entropy": 8.948986053466797, + "epoch": 0.12774372157405575, + "mean_token_accuracy": 0.6772777438163757, + "num_tokens": 6726136.0, + "step": 1292, + "train/ce_loss": 0.9387810230255127 + }, + { + "epoch": 0.12774372157405575, + "step": 1292, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.12774372157405575, + "step": 1292, + "train/total_loss": 0.1485656052827835 + }, + { + "entropy": 9.39590835571289, + "epoch": 0.1278425944235713, + "mean_token_accuracy": 0.7406855225563049, + "num_tokens": 6731196.0, + "step": 1293, + "train/ce_loss": 0.7263284921646118 + }, + { + "epoch": 0.1278425944235713, + "step": 1293, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.1278425944235713, + "step": 1293, + "train/total_loss": 0.16638284921646118 + }, + { + "entropy": 9.307497024536133, + "epoch": 0.1279414672730868, + "mean_token_accuracy": 0.6719160079956055, + "num_tokens": 6736402.0, + "step": 1294, + "train/ce_loss": 0.8595036268234253 + }, + { + "epoch": 0.1279414672730868, + "step": 1294, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.1279414672730868, + "step": 1294, + "train/total_loss": 0.19532537460327148 + }, + { + "entropy": 9.940542221069336, + "epoch": 0.12804034012260232, + "mean_token_accuracy": 0.7020785212516785, + "num_tokens": 6741231.0, + "step": 1295, + "train/ce_loss": 2.441803216934204 + }, + { + "epoch": 0.12804034012260232, + "step": 1295, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.12804034012260232, + "step": 1295, + "train/total_loss": 0.3301178216934204 + }, + { + "entropy": 10.023080825805664, + "epoch": 0.12813921297211786, + "mean_token_accuracy": 0.6821561455726624, + "num_tokens": 6746207.0, + "step": 1296, + "train/ce_loss": 2.109379529953003 + }, + { + "epoch": 0.12813921297211786, + "step": 1296, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.12813921297211786, + "step": 1296, + "train/total_loss": 0.3203129768371582 + }, + { + "entropy": 8.976844787597656, + "epoch": 0.12823808582163337, + "mean_token_accuracy": 0.7243243455886841, + "num_tokens": 6751446.0, + "step": 1297, + "train/ce_loss": 1.3865740299224854 + }, + { + "epoch": 0.12823808582163337, + "step": 1297, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.12823808582163337, + "step": 1297, + "train/total_loss": 0.23240740597248077 + }, + { + "entropy": 9.06201171875, + "epoch": 0.12833695867114892, + "mean_token_accuracy": 0.7755308151245117, + "num_tokens": 6757105.0, + "step": 1298, + "train/ce_loss": 0.8118742108345032 + }, + { + "epoch": 0.12833695867114892, + "step": 1298, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.12833695867114892, + "step": 1298, + "train/total_loss": 0.1202499195933342 + }, + { + "entropy": 9.676658630371094, + "epoch": 0.12843583152066443, + "mean_token_accuracy": 0.7454545497894287, + "num_tokens": 6762103.0, + "step": 1299, + "train/ce_loss": 0.00011109438491985202 + }, + { + "epoch": 0.12843583152066443, + "step": 1299, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.12843583152066443, + "step": 1299, + "train/total_loss": 0.07032360881567001 + }, + { + "epoch": 0.12853470437017994, + "grad_norm": 0.9925926327705383, + "learning_rate": 9.681303466350196e-06, + "loss": 0.1728, + "step": 1300 + }, + { + "entropy": 9.072019577026367, + "epoch": 0.12853470437017994, + "mean_token_accuracy": 0.7306163311004639, + "num_tokens": 6767619.0, + "step": 1300, + "train/ce_loss": 0.3801075518131256 + }, + { + "epoch": 0.12853470437017994, + "step": 1300, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.12853470437017994, + "step": 1300, + "train/total_loss": 0.09660451114177704 + }, + { + "entropy": 9.6489839553833, + "epoch": 0.12863357721969548, + "mean_token_accuracy": 0.7655068039894104, + "num_tokens": 6772693.0, + "step": 1301, + "train/ce_loss": 0.8618592619895935 + }, + { + "epoch": 0.12863357721969548, + "step": 1301, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.12863357721969548, + "step": 1301, + "train/total_loss": 0.11352967470884323 + }, + { + "entropy": 9.157691955566406, + "epoch": 0.128732450069211, + "mean_token_accuracy": 0.7541766166687012, + "num_tokens": 6778026.0, + "step": 1302, + "train/ce_loss": 0.543510913848877 + }, + { + "epoch": 0.128732450069211, + "step": 1302, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.128732450069211, + "step": 1302, + "train/total_loss": 0.0895073413848877 + }, + { + "entropy": 9.758062362670898, + "epoch": 0.1288313229187265, + "mean_token_accuracy": 0.7643097639083862, + "num_tokens": 6783068.0, + "step": 1303, + "train/ce_loss": 0.7941330671310425 + }, + { + "epoch": 0.1288313229187265, + "step": 1303, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.1288313229187265, + "step": 1303, + "train/total_loss": 0.1575383096933365 + }, + { + "entropy": 8.827596664428711, + "epoch": 0.12893019576824205, + "mean_token_accuracy": 0.7171814441680908, + "num_tokens": 6788595.0, + "step": 1304, + "train/ce_loss": 1.030279278755188 + }, + { + "epoch": 0.12893019576824205, + "step": 1304, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.12893019576824205, + "step": 1304, + "train/total_loss": 0.16943418979644775 + }, + { + "entropy": 9.598592758178711, + "epoch": 0.12902906861775756, + "mean_token_accuracy": 0.7517730593681335, + "num_tokens": 6793607.0, + "step": 1305, + "train/ce_loss": 0.9198727607727051 + }, + { + "epoch": 0.12902906861775756, + "step": 1305, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.12902906861775756, + "step": 1305, + "train/total_loss": 0.17792478203773499 + }, + { + "entropy": 9.47142505645752, + "epoch": 0.12912794146727308, + "mean_token_accuracy": 0.811188817024231, + "num_tokens": 6798780.0, + "step": 1306, + "train/ce_loss": 1.043941119860392e-05 + }, + { + "epoch": 0.12912794146727308, + "step": 1306, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.12912794146727308, + "step": 1306, + "train/total_loss": 0.09375104308128357 + }, + { + "entropy": 9.123851776123047, + "epoch": 0.12922681431678862, + "mean_token_accuracy": 0.7857961058616638, + "num_tokens": 6804138.0, + "step": 1307, + "train/ce_loss": 0.45554080605506897 + }, + { + "epoch": 0.12922681431678862, + "step": 1307, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.12922681431678862, + "step": 1307, + "train/total_loss": 0.07680408656597137 + }, + { + "entropy": 9.088683128356934, + "epoch": 0.12932568716630413, + "mean_token_accuracy": 0.7415599822998047, + "num_tokens": 6809477.0, + "step": 1308, + "train/ce_loss": 0.7919617891311646 + }, + { + "epoch": 0.12932568716630413, + "step": 1308, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.12932568716630413, + "step": 1308, + "train/total_loss": 0.13388368487358093 + }, + { + "entropy": 9.484408378601074, + "epoch": 0.12942456001581965, + "mean_token_accuracy": 0.7262997031211853, + "num_tokens": 6814748.0, + "step": 1309, + "train/ce_loss": 4.9226622650166973e-05 + }, + { + "epoch": 0.12942456001581965, + "step": 1309, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.12942456001581965, + "step": 1309, + "train/total_loss": 0.05469242110848427 + }, + { + "entropy": 9.560070037841797, + "epoch": 0.1295234328653352, + "mean_token_accuracy": 0.765531063079834, + "num_tokens": 6819718.0, + "step": 1310, + "train/ce_loss": 0.3943033814430237 + }, + { + "epoch": 0.1295234328653352, + "step": 1310, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.1295234328653352, + "step": 1310, + "train/total_loss": 0.13318033516407013 + }, + { + "entropy": 9.553999900817871, + "epoch": 0.1296223057148507, + "mean_token_accuracy": 0.6898638606071472, + "num_tokens": 6824837.0, + "step": 1311, + "train/ce_loss": 1.1553877592086792 + }, + { + "epoch": 0.1296223057148507, + "step": 1311, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.1296223057148507, + "step": 1311, + "train/total_loss": 0.22100752592086792 + }, + { + "entropy": 9.081045150756836, + "epoch": 0.1297211785643662, + "mean_token_accuracy": 0.7494456768035889, + "num_tokens": 6830181.0, + "step": 1312, + "train/ce_loss": 0.9298657774925232 + }, + { + "epoch": 0.1297211785643662, + "step": 1312, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.1297211785643662, + "step": 1312, + "train/total_loss": 0.1945490837097168 + }, + { + "entropy": 9.104818344116211, + "epoch": 0.12982005141388175, + "mean_token_accuracy": 0.7246752977371216, + "num_tokens": 6835437.0, + "step": 1313, + "train/ce_loss": 0.5295743346214294 + }, + { + "epoch": 0.12982005141388175, + "step": 1313, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.12982005141388175, + "step": 1313, + "train/total_loss": 0.0881136804819107 + }, + { + "entropy": 9.376556396484375, + "epoch": 0.12991892426339727, + "mean_token_accuracy": 0.7823129296302795, + "num_tokens": 6840675.0, + "step": 1314, + "train/ce_loss": 2.5385288608958945e-05 + }, + { + "epoch": 0.12991892426339727, + "step": 1314, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.12991892426339727, + "step": 1314, + "train/total_loss": 0.07031504064798355 + }, + { + "entropy": 9.420551300048828, + "epoch": 0.13001779711291278, + "mean_token_accuracy": 0.7459893226623535, + "num_tokens": 6845847.0, + "step": 1315, + "train/ce_loss": 1.6422370672225952 + }, + { + "epoch": 0.13001779711291278, + "step": 1315, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.13001779711291278, + "step": 1315, + "train/total_loss": 0.25406748056411743 + }, + { + "entropy": 9.569178581237793, + "epoch": 0.13011666996242832, + "mean_token_accuracy": 0.7777777910232544, + "num_tokens": 6850885.0, + "step": 1316, + "train/ce_loss": 0.7575596570968628 + }, + { + "epoch": 0.13011666996242832, + "step": 1316, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.13011666996242832, + "step": 1316, + "train/total_loss": 0.11872471868991852 + }, + { + "entropy": 9.686807632446289, + "epoch": 0.13021554281194384, + "mean_token_accuracy": 0.6978297233581543, + "num_tokens": 6855927.0, + "step": 1317, + "train/ce_loss": 1.4601283073425293 + }, + { + "epoch": 0.13021554281194384, + "step": 1317, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.13021554281194384, + "step": 1317, + "train/total_loss": 0.2397628277540207 + }, + { + "entropy": 9.00992488861084, + "epoch": 0.13031441566145938, + "mean_token_accuracy": 0.6796690225601196, + "num_tokens": 6861209.0, + "step": 1318, + "train/ce_loss": 0.777147650718689 + }, + { + "epoch": 0.13031441566145938, + "step": 1318, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.13031441566145938, + "step": 1318, + "train/total_loss": 0.13630852103233337 + }, + { + "entropy": 9.652292251586914, + "epoch": 0.1304132885109749, + "mean_token_accuracy": 0.7416520118713379, + "num_tokens": 6866227.0, + "step": 1319, + "train/ce_loss": 1.0801567441376392e-05 + }, + { + "epoch": 0.1304132885109749, + "step": 1319, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.1304132885109749, + "step": 1319, + "train/total_loss": 0.07421983033418655 + }, + { + "epoch": 0.1305121613604904, + "grad_norm": 1.0008471012115479, + "learning_rate": 9.676358601592247e-06, + "loss": 0.1571, + "step": 1320 + }, + { + "entropy": 9.465921401977539, + "epoch": 0.1305121613604904, + "mean_token_accuracy": 0.7111111283302307, + "num_tokens": 6871414.0, + "step": 1320, + "train/ce_loss": 0.6895711421966553 + }, + { + "epoch": 0.1305121613604904, + "step": 1320, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.1305121613604904, + "step": 1320, + "train/total_loss": 0.14317587018013 + }, + { + "entropy": 10.085680961608887, + "epoch": 0.13061103421000594, + "mean_token_accuracy": 0.8009478449821472, + "num_tokens": 6876185.0, + "step": 1321, + "train/ce_loss": 1.1681467294692993 + }, + { + "epoch": 0.13061103421000594, + "step": 1321, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.13061103421000594, + "step": 1321, + "train/total_loss": 0.17931467294692993 + }, + { + "entropy": 9.769105911254883, + "epoch": 0.13070990705952146, + "mean_token_accuracy": 0.8288770318031311, + "num_tokens": 6881137.0, + "step": 1322, + "train/ce_loss": 8.012760190467816e-06 + }, + { + "epoch": 0.13070990705952146, + "step": 1322, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.13070990705952146, + "step": 1322, + "train/total_loss": 0.06640705466270447 + }, + { + "entropy": 9.293441772460938, + "epoch": 0.13080877990903697, + "mean_token_accuracy": 0.7526316046714783, + "num_tokens": 6886334.0, + "step": 1323, + "train/ce_loss": 1.331781029701233 + }, + { + "epoch": 0.13080877990903697, + "step": 1323, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.13080877990903697, + "step": 1323, + "train/total_loss": 0.27380311489105225 + }, + { + "entropy": 9.82375717163086, + "epoch": 0.1309076527585525, + "mean_token_accuracy": 0.7560521364212036, + "num_tokens": 6891483.0, + "step": 1324, + "train/ce_loss": 1.423461675643921 + }, + { + "epoch": 0.1309076527585525, + "step": 1324, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.1309076527585525, + "step": 1324, + "train/total_loss": 0.25953367352485657 + }, + { + "entropy": 9.339868545532227, + "epoch": 0.13100652560806803, + "mean_token_accuracy": 0.6946022510528564, + "num_tokens": 6896649.0, + "step": 1325, + "train/ce_loss": 1.1555442810058594 + }, + { + "epoch": 0.13100652560806803, + "step": 1325, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.13100652560806803, + "step": 1325, + "train/total_loss": 0.17805442214012146 + }, + { + "entropy": 8.74455738067627, + "epoch": 0.13110539845758354, + "mean_token_accuracy": 0.7673649191856384, + "num_tokens": 6902037.0, + "step": 1326, + "train/ce_loss": 0.5528606176376343 + }, + { + "epoch": 0.13110539845758354, + "step": 1326, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.13110539845758354, + "step": 1326, + "train/total_loss": 0.11778606474399567 + }, + { + "entropy": 9.457708358764648, + "epoch": 0.13120427130709908, + "mean_token_accuracy": 0.6380281448364258, + "num_tokens": 6907205.0, + "step": 1327, + "train/ce_loss": 8.960471859609243e-06 + }, + { + "epoch": 0.13120427130709908, + "step": 1327, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.13120427130709908, + "step": 1327, + "train/total_loss": 0.02343839593231678 + }, + { + "entropy": 9.52875804901123, + "epoch": 0.1313031441566146, + "mean_token_accuracy": 0.639769434928894, + "num_tokens": 6912340.0, + "step": 1328, + "train/ce_loss": 5.955170308880042e-06 + }, + { + "epoch": 0.1313031441566146, + "step": 1328, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.1313031441566146, + "step": 1328, + "train/total_loss": 0.06250059604644775 + }, + { + "entropy": 9.210979461669922, + "epoch": 0.1314020170061301, + "mean_token_accuracy": 0.7533742189407349, + "num_tokens": 6917620.0, + "step": 1329, + "train/ce_loss": 0.816834568977356 + }, + { + "epoch": 0.1314020170061301, + "step": 1329, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.1314020170061301, + "step": 1329, + "train/total_loss": 0.1441834568977356 + }, + { + "entropy": 9.25547981262207, + "epoch": 0.13150088985564565, + "mean_token_accuracy": 0.7266921997070312, + "num_tokens": 6922884.0, + "step": 1330, + "train/ce_loss": 0.3945234417915344 + }, + { + "epoch": 0.13150088985564565, + "step": 1330, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.13150088985564565, + "step": 1330, + "train/total_loss": 0.10585859417915344 + }, + { + "entropy": 9.18018913269043, + "epoch": 0.13159976270516116, + "mean_token_accuracy": 0.7842857241630554, + "num_tokens": 6928075.0, + "step": 1331, + "train/ce_loss": 0.8674007058143616 + }, + { + "epoch": 0.13159976270516116, + "step": 1331, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.13159976270516116, + "step": 1331, + "train/total_loss": 0.20392757654190063 + }, + { + "entropy": 8.887187004089355, + "epoch": 0.13169863555467667, + "mean_token_accuracy": 0.6983373165130615, + "num_tokens": 6933415.0, + "step": 1332, + "train/ce_loss": 0.5412377715110779 + }, + { + "epoch": 0.13169863555467667, + "step": 1332, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.13169863555467667, + "step": 1332, + "train/total_loss": 0.17131127417087555 + }, + { + "entropy": 8.93563461303711, + "epoch": 0.13179750840419222, + "mean_token_accuracy": 0.7395301461219788, + "num_tokens": 6938869.0, + "step": 1333, + "train/ce_loss": 1.14861261844635 + }, + { + "epoch": 0.13179750840419222, + "step": 1333, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.13179750840419222, + "step": 1333, + "train/total_loss": 0.23986126482486725 + }, + { + "entropy": 8.794504165649414, + "epoch": 0.13189638125370773, + "mean_token_accuracy": 0.7954971790313721, + "num_tokens": 6944435.0, + "step": 1334, + "train/ce_loss": 0.5525534749031067 + }, + { + "epoch": 0.13189638125370773, + "step": 1334, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.13189638125370773, + "step": 1334, + "train/total_loss": 0.09431785345077515 + }, + { + "entropy": 9.051572799682617, + "epoch": 0.13199525410322324, + "mean_token_accuracy": 0.6588888764381409, + "num_tokens": 6949781.0, + "step": 1335, + "train/ce_loss": 0.9546046257019043 + }, + { + "epoch": 0.13199525410322324, + "step": 1335, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.13199525410322324, + "step": 1335, + "train/total_loss": 0.19311672449111938 + }, + { + "entropy": 9.176790237426758, + "epoch": 0.13209412695273878, + "mean_token_accuracy": 0.76953125, + "num_tokens": 6955007.0, + "step": 1336, + "train/ce_loss": 0.7356476783752441 + }, + { + "epoch": 0.13209412695273878, + "step": 1336, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.13209412695273878, + "step": 1336, + "train/total_loss": 0.20637726783752441 + }, + { + "entropy": 9.438520431518555, + "epoch": 0.1321929998022543, + "mean_token_accuracy": 0.7435897588729858, + "num_tokens": 6960066.0, + "step": 1337, + "train/ce_loss": 0.9659104943275452 + }, + { + "epoch": 0.1321929998022543, + "step": 1337, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.1321929998022543, + "step": 1337, + "train/total_loss": 0.213778555393219 + }, + { + "entropy": 9.292132377624512, + "epoch": 0.13229187265176984, + "mean_token_accuracy": 0.7315270900726318, + "num_tokens": 6965292.0, + "step": 1338, + "train/ce_loss": 0.6084064245223999 + }, + { + "epoch": 0.13229187265176984, + "step": 1338, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.13229187265176984, + "step": 1338, + "train/total_loss": 0.1584968864917755 + }, + { + "entropy": 9.065678596496582, + "epoch": 0.13239074550128535, + "mean_token_accuracy": 0.7868303656578064, + "num_tokens": 6970713.0, + "step": 1339, + "train/ce_loss": 0.6088027954101562 + }, + { + "epoch": 0.13239074550128535, + "step": 1339, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.13239074550128535, + "step": 1339, + "train/total_loss": 0.12338028103113174 + }, + { + "epoch": 0.13248961835080086, + "grad_norm": 0.9765375852584839, + "learning_rate": 9.671413736834299e-06, + "loss": 0.1662, + "step": 1340 + }, + { + "entropy": 9.935449600219727, + "epoch": 0.13248961835080086, + "mean_token_accuracy": 0.697265625, + "num_tokens": 6975626.0, + "step": 1340, + "train/ce_loss": 2.081921100616455 + }, + { + "epoch": 0.13248961835080086, + "step": 1340, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.13248961835080086, + "step": 1340, + "train/total_loss": 0.2550671100616455 + }, + { + "entropy": 10.374621391296387, + "epoch": 0.1325884912003164, + "mean_token_accuracy": 0.6761133670806885, + "num_tokens": 6980238.0, + "step": 1341, + "train/ce_loss": 4.566015243530273 + }, + { + "epoch": 0.1325884912003164, + "step": 1341, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.1325884912003164, + "step": 1341, + "train/total_loss": 0.5503515005111694 + }, + { + "entropy": 9.028810501098633, + "epoch": 0.13268736404983192, + "mean_token_accuracy": 0.70659339427948, + "num_tokens": 6985616.0, + "step": 1342, + "train/ce_loss": 1.0756028890609741 + }, + { + "epoch": 0.13268736404983192, + "step": 1342, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.13268736404983192, + "step": 1342, + "train/total_loss": 0.18568529188632965 + }, + { + "entropy": 9.860417366027832, + "epoch": 0.13278623689934743, + "mean_token_accuracy": 0.7653631567955017, + "num_tokens": 6990572.0, + "step": 1343, + "train/ce_loss": 0.8157594203948975 + }, + { + "epoch": 0.13278623689934743, + "step": 1343, + "train/sim_loss": 0.14453125 + }, + { + "epoch": 0.13278623689934743, + "step": 1343, + "train/total_loss": 0.22610719501972198 + }, + { + "entropy": 9.547462463378906, + "epoch": 0.13288510974886297, + "mean_token_accuracy": 0.7703081369400024, + "num_tokens": 6995727.0, + "step": 1344, + "train/ce_loss": 8.18125863588648e-06 + }, + { + "epoch": 0.13288510974886297, + "step": 1344, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.13288510974886297, + "step": 1344, + "train/total_loss": 0.06250081956386566 + }, + { + "entropy": 9.098556518554688, + "epoch": 0.1329839825983785, + "mean_token_accuracy": 0.7614781856536865, + "num_tokens": 7001136.0, + "step": 1345, + "train/ce_loss": 0.6963076591491699 + }, + { + "epoch": 0.1329839825983785, + "step": 1345, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.1329839825983785, + "step": 1345, + "train/total_loss": 0.09697451442480087 + }, + { + "entropy": 9.58292007446289, + "epoch": 0.133082855447894, + "mean_token_accuracy": 0.7492354512214661, + "num_tokens": 7006230.0, + "step": 1346, + "train/ce_loss": 1.217063546180725 + }, + { + "epoch": 0.133082855447894, + "step": 1346, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.133082855447894, + "step": 1346, + "train/total_loss": 0.22717511653900146 + }, + { + "entropy": 9.555110931396484, + "epoch": 0.13318172829740954, + "mean_token_accuracy": 0.714067280292511, + "num_tokens": 7011333.0, + "step": 1347, + "train/ce_loss": 0.780555009841919 + }, + { + "epoch": 0.13318172829740954, + "step": 1347, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.13318172829740954, + "step": 1347, + "train/total_loss": 0.1483680009841919 + }, + { + "entropy": 9.313787460327148, + "epoch": 0.13328060114692505, + "mean_token_accuracy": 0.7480417490005493, + "num_tokens": 7016588.0, + "step": 1348, + "train/ce_loss": 0.8471536040306091 + }, + { + "epoch": 0.13328060114692505, + "step": 1348, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.13328060114692505, + "step": 1348, + "train/total_loss": 0.1706528663635254 + }, + { + "entropy": 9.890573501586914, + "epoch": 0.13337947399644057, + "mean_token_accuracy": 0.7034631967544556, + "num_tokens": 7021478.0, + "step": 1349, + "train/ce_loss": 1.1914722919464111 + }, + { + "epoch": 0.13337947399644057, + "step": 1349, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.13337947399644057, + "step": 1349, + "train/total_loss": 0.17383474111557007 + }, + { + "entropy": 9.21375846862793, + "epoch": 0.1334783468459561, + "mean_token_accuracy": 0.7373737096786499, + "num_tokens": 7026726.0, + "step": 1350, + "train/ce_loss": 1.0719325542449951 + }, + { + "epoch": 0.1334783468459561, + "step": 1350, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.1334783468459561, + "step": 1350, + "train/total_loss": 0.228287011384964 + }, + { + "entropy": 9.115421295166016, + "epoch": 0.13357721969547162, + "mean_token_accuracy": 0.774193525314331, + "num_tokens": 7031974.0, + "step": 1351, + "train/ce_loss": 0.8924320340156555 + }, + { + "epoch": 0.13357721969547162, + "step": 1351, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.13357721969547162, + "step": 1351, + "train/total_loss": 0.18299320340156555 + }, + { + "entropy": 9.520133018493652, + "epoch": 0.13367609254498714, + "mean_token_accuracy": 0.7358490824699402, + "num_tokens": 7037118.0, + "step": 1352, + "train/ce_loss": 2.67130380962044e-05 + }, + { + "epoch": 0.13367609254498714, + "step": 1352, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.13367609254498714, + "step": 1352, + "train/total_loss": 0.0937526747584343 + }, + { + "entropy": 9.93098258972168, + "epoch": 0.13377496539450268, + "mean_token_accuracy": 0.7065637111663818, + "num_tokens": 7042062.0, + "step": 1353, + "train/ce_loss": 1.890914427349344e-05 + }, + { + "epoch": 0.13377496539450268, + "step": 1353, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.13377496539450268, + "step": 1353, + "train/total_loss": 0.12890814244747162 + }, + { + "entropy": 9.51821517944336, + "epoch": 0.1338738382440182, + "mean_token_accuracy": 0.6343558430671692, + "num_tokens": 7047333.0, + "step": 1354, + "train/ce_loss": 2.017146348953247 + }, + { + "epoch": 0.1338738382440182, + "step": 1354, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.1338738382440182, + "step": 1354, + "train/total_loss": 0.2524958848953247 + }, + { + "entropy": 9.332813262939453, + "epoch": 0.1339727110935337, + "mean_token_accuracy": 0.7398273944854736, + "num_tokens": 7052611.0, + "step": 1355, + "train/ce_loss": 0.5288713574409485 + }, + { + "epoch": 0.1339727110935337, + "step": 1355, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.1339727110935337, + "step": 1355, + "train/total_loss": 0.17398089170455933 + }, + { + "entropy": 8.66998291015625, + "epoch": 0.13407158394304924, + "mean_token_accuracy": 0.7036363482475281, + "num_tokens": 7058353.0, + "step": 1356, + "train/ce_loss": 1.4327878952026367 + }, + { + "epoch": 0.13407158394304924, + "step": 1356, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.13407158394304924, + "step": 1356, + "train/total_loss": 0.2760912775993347 + }, + { + "entropy": 9.798766136169434, + "epoch": 0.13417045679256476, + "mean_token_accuracy": 0.7202796936035156, + "num_tokens": 7063369.0, + "step": 1357, + "train/ce_loss": 1.655919913901016e-05 + }, + { + "epoch": 0.13417045679256476, + "step": 1357, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.13417045679256476, + "step": 1357, + "train/total_loss": 0.07422040402889252 + }, + { + "entropy": 9.45732307434082, + "epoch": 0.1342693296420803, + "mean_token_accuracy": 0.6950617432594299, + "num_tokens": 7068607.0, + "step": 1358, + "train/ce_loss": 0.6530421376228333 + }, + { + "epoch": 0.1342693296420803, + "step": 1358, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.1342693296420803, + "step": 1358, + "train/total_loss": 0.1668667197227478 + }, + { + "entropy": 9.124235153198242, + "epoch": 0.1343682024915958, + "mean_token_accuracy": 0.6947236061096191, + "num_tokens": 7073816.0, + "step": 1359, + "train/ce_loss": 0.9237757325172424 + }, + { + "epoch": 0.1343682024915958, + "step": 1359, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.1343682024915958, + "step": 1359, + "train/total_loss": 0.17440882325172424 + }, + { + "epoch": 0.13446707534111133, + "grad_norm": 1.1598478555679321, + "learning_rate": 9.66646887207635e-06, + "loss": 0.1733, + "step": 1360 + }, + { + "entropy": 9.222099304199219, + "epoch": 0.13446707534111133, + "mean_token_accuracy": 0.7375144958496094, + "num_tokens": 7079093.0, + "step": 1360, + "train/ce_loss": 0.4310187101364136 + }, + { + "epoch": 0.13446707534111133, + "step": 1360, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.13446707534111133, + "step": 1360, + "train/total_loss": 0.10560187697410583 + }, + { + "entropy": 9.048439979553223, + "epoch": 0.13456594819062687, + "mean_token_accuracy": 0.7167043089866638, + "num_tokens": 7084466.0, + "step": 1361, + "train/ce_loss": 1.3650734424591064 + }, + { + "epoch": 0.13456594819062687, + "step": 1361, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.13456594819062687, + "step": 1361, + "train/total_loss": 0.20291359722614288 + }, + { + "entropy": 8.655082702636719, + "epoch": 0.13466482104014238, + "mean_token_accuracy": 0.7400379776954651, + "num_tokens": 7089959.0, + "step": 1362, + "train/ce_loss": 1.21505868434906 + }, + { + "epoch": 0.13466482104014238, + "step": 1362, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.13466482104014238, + "step": 1362, + "train/total_loss": 0.22306837141513824 + }, + { + "entropy": 10.349322319030762, + "epoch": 0.1347636938896579, + "mean_token_accuracy": 0.7899686694145203, + "num_tokens": 7094666.0, + "step": 1363, + "train/ce_loss": 2.0730125470436178e-05 + }, + { + "epoch": 0.1347636938896579, + "step": 1363, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.1347636938896579, + "step": 1363, + "train/total_loss": 0.031252071261405945 + }, + { + "entropy": 9.29534912109375, + "epoch": 0.13486256673917343, + "mean_token_accuracy": 0.7682198286056519, + "num_tokens": 7099994.0, + "step": 1364, + "train/ce_loss": 0.5866829752922058 + }, + { + "epoch": 0.13486256673917343, + "step": 1364, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.13486256673917343, + "step": 1364, + "train/total_loss": 0.13679330050945282 + }, + { + "entropy": 9.404373168945312, + "epoch": 0.13496143958868895, + "mean_token_accuracy": 0.69986891746521, + "num_tokens": 7105236.0, + "step": 1365, + "train/ce_loss": 0.7443293333053589 + }, + { + "epoch": 0.13496143958868895, + "step": 1365, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.13496143958868895, + "step": 1365, + "train/total_loss": 0.19943293929100037 + }, + { + "entropy": 8.976805686950684, + "epoch": 0.13506031243820446, + "mean_token_accuracy": 0.7087682485580444, + "num_tokens": 7110755.0, + "step": 1366, + "train/ce_loss": 1.2494643926620483 + }, + { + "epoch": 0.13506031243820446, + "step": 1366, + "train/sim_loss": 0.15625 + }, + { + "epoch": 0.13506031243820446, + "step": 1366, + "train/total_loss": 0.2811964452266693 + }, + { + "entropy": 10.0078125, + "epoch": 0.13515918528772, + "mean_token_accuracy": 0.7490636706352234, + "num_tokens": 7115692.0, + "step": 1367, + "train/ce_loss": 1.1242824257351458e-05 + }, + { + "epoch": 0.13515918528772, + "step": 1367, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.13515918528772, + "step": 1367, + "train/total_loss": 0.027344875037670135 + }, + { + "entropy": 9.153022766113281, + "epoch": 0.13525805813723552, + "mean_token_accuracy": 0.7320799231529236, + "num_tokens": 7121020.0, + "step": 1368, + "train/ce_loss": 0.8959968686103821 + }, + { + "epoch": 0.13525805813723552, + "step": 1368, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.13525805813723552, + "step": 1368, + "train/total_loss": 0.17944344878196716 + }, + { + "entropy": 9.92242431640625, + "epoch": 0.13535693098675103, + "mean_token_accuracy": 0.8330308794975281, + "num_tokens": 7126028.0, + "step": 1369, + "train/ce_loss": 0.9085344672203064 + }, + { + "epoch": 0.13535693098675103, + "step": 1369, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.13535693098675103, + "step": 1369, + "train/total_loss": 0.17679095268249512 + }, + { + "entropy": 9.732063293457031, + "epoch": 0.13545580383626657, + "mean_token_accuracy": 0.7522388100624084, + "num_tokens": 7131328.0, + "step": 1370, + "train/ce_loss": 9.229583156411536e-06 + }, + { + "epoch": 0.13545580383626657, + "step": 1370, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.13545580383626657, + "step": 1370, + "train/total_loss": 0.03125092387199402 + }, + { + "entropy": 9.762304306030273, + "epoch": 0.13555467668578208, + "mean_token_accuracy": 0.6817447543144226, + "num_tokens": 7136433.0, + "step": 1371, + "train/ce_loss": 1.118974208831787 + }, + { + "epoch": 0.13555467668578208, + "step": 1371, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.13555467668578208, + "step": 1371, + "train/total_loss": 0.22517867386341095 + }, + { + "entropy": 9.191245079040527, + "epoch": 0.1356535495352976, + "mean_token_accuracy": 0.7228608131408691, + "num_tokens": 7141585.0, + "step": 1372, + "train/ce_loss": 0.38976824283599854 + }, + { + "epoch": 0.1356535495352976, + "step": 1372, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.1356535495352976, + "step": 1372, + "train/total_loss": 0.13663306832313538 + }, + { + "entropy": 9.613529205322266, + "epoch": 0.13575242238481314, + "mean_token_accuracy": 0.7711213231086731, + "num_tokens": 7146681.0, + "step": 1373, + "train/ce_loss": 1.1483960151672363 + }, + { + "epoch": 0.13575242238481314, + "step": 1373, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.13575242238481314, + "step": 1373, + "train/total_loss": 0.1812458634376526 + }, + { + "entropy": 9.747818946838379, + "epoch": 0.13585129523432865, + "mean_token_accuracy": 0.7123287916183472, + "num_tokens": 7151721.0, + "step": 1374, + "train/ce_loss": 1.276660680770874 + }, + { + "epoch": 0.13585129523432865, + "step": 1374, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.13585129523432865, + "step": 1374, + "train/total_loss": 0.20579107105731964 + }, + { + "entropy": 9.456674575805664, + "epoch": 0.13595016808384416, + "mean_token_accuracy": 0.7160193920135498, + "num_tokens": 7156995.0, + "step": 1375, + "train/ce_loss": 1.8477184772491455 + }, + { + "epoch": 0.13595016808384416, + "step": 1375, + "train/sim_loss": 0.15625 + }, + { + "epoch": 0.13595016808384416, + "step": 1375, + "train/total_loss": 0.3410218358039856 + }, + { + "entropy": 9.060297012329102, + "epoch": 0.1360490409333597, + "mean_token_accuracy": 0.7053254246711731, + "num_tokens": 7162351.0, + "step": 1376, + "train/ce_loss": 1.2942602634429932 + }, + { + "epoch": 0.1360490409333597, + "step": 1376, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.1360490409333597, + "step": 1376, + "train/total_loss": 0.2153635323047638 + }, + { + "entropy": 9.12033462524414, + "epoch": 0.13614791378287522, + "mean_token_accuracy": 0.6435294151306152, + "num_tokens": 7167679.0, + "step": 1377, + "train/ce_loss": 0.8891092538833618 + }, + { + "epoch": 0.13614791378287522, + "step": 1377, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.13614791378287522, + "step": 1377, + "train/total_loss": 0.18266093730926514 + }, + { + "entropy": 9.498316764831543, + "epoch": 0.13624678663239073, + "mean_token_accuracy": 0.739062488079071, + "num_tokens": 7172731.0, + "step": 1378, + "train/ce_loss": 1.083294137060875e-05 + }, + { + "epoch": 0.13624678663239073, + "step": 1378, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.13624678663239073, + "step": 1378, + "train/total_loss": 0.027344834059476852 + }, + { + "entropy": 9.433088302612305, + "epoch": 0.13634565948190627, + "mean_token_accuracy": 0.6906779408454895, + "num_tokens": 7177888.0, + "step": 1379, + "train/ce_loss": 1.207837462425232 + }, + { + "epoch": 0.13634565948190627, + "step": 1379, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.13634565948190627, + "step": 1379, + "train/total_loss": 0.1871899962425232 + }, + { + "epoch": 0.1364445323314218, + "grad_norm": 1.1392931938171387, + "learning_rate": 9.6615240073184e-06, + "loss": 0.1708, + "step": 1380 + }, + { + "entropy": 9.162598609924316, + "epoch": 0.1364445323314218, + "mean_token_accuracy": 0.7412031888961792, + "num_tokens": 7183249.0, + "step": 1380, + "train/ce_loss": 0.7054414749145508 + }, + { + "epoch": 0.1364445323314218, + "step": 1380, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.1364445323314218, + "step": 1380, + "train/total_loss": 0.10960664600133896 + }, + { + "entropy": 9.58977222442627, + "epoch": 0.13654340518093733, + "mean_token_accuracy": 0.7033112645149231, + "num_tokens": 7188484.0, + "step": 1381, + "train/ce_loss": 1.0873607397079468 + }, + { + "epoch": 0.13654340518093733, + "step": 1381, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.13654340518093733, + "step": 1381, + "train/total_loss": 0.1477985680103302 + }, + { + "entropy": 9.628973007202148, + "epoch": 0.13664227803045284, + "mean_token_accuracy": 0.7557142972946167, + "num_tokens": 7193633.0, + "step": 1382, + "train/ce_loss": 0.7194046378135681 + }, + { + "epoch": 0.13664227803045284, + "step": 1382, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.13664227803045284, + "step": 1382, + "train/total_loss": 0.18522171676158905 + }, + { + "entropy": 9.48279094696045, + "epoch": 0.13674115087996835, + "mean_token_accuracy": 0.6994134783744812, + "num_tokens": 7198964.0, + "step": 1383, + "train/ce_loss": 0.981130838394165 + }, + { + "epoch": 0.13674115087996835, + "step": 1383, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.13674115087996835, + "step": 1383, + "train/total_loss": 0.18405058979988098 + }, + { + "entropy": 9.090730667114258, + "epoch": 0.1368400237294839, + "mean_token_accuracy": 0.7595744729042053, + "num_tokens": 7204415.0, + "step": 1384, + "train/ce_loss": 1.2270708084106445 + }, + { + "epoch": 0.1368400237294839, + "step": 1384, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.1368400237294839, + "step": 1384, + "train/total_loss": 0.2086445838212967 + }, + { + "entropy": 9.309127807617188, + "epoch": 0.1369388965789994, + "mean_token_accuracy": 0.7635053992271423, + "num_tokens": 7209716.0, + "step": 1385, + "train/ce_loss": 0.714293360710144 + }, + { + "epoch": 0.1369388965789994, + "step": 1385, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.1369388965789994, + "step": 1385, + "train/total_loss": 0.12611684203147888 + }, + { + "entropy": 9.814483642578125, + "epoch": 0.13703776942851492, + "mean_token_accuracy": 0.7638190984725952, + "num_tokens": 7214776.0, + "step": 1386, + "train/ce_loss": 2.559537097113207e-05 + }, + { + "epoch": 0.13703776942851492, + "step": 1386, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.13703776942851492, + "step": 1386, + "train/total_loss": 0.07812756299972534 + }, + { + "entropy": 10.152091979980469, + "epoch": 0.13713664227803046, + "mean_token_accuracy": 0.7139587998390198, + "num_tokens": 7219571.0, + "step": 1387, + "train/ce_loss": 1.8109357357025146 + }, + { + "epoch": 0.13713664227803046, + "step": 1387, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.13713664227803046, + "step": 1387, + "train/total_loss": 0.27874982357025146 + }, + { + "entropy": 8.986875534057617, + "epoch": 0.13723551512754598, + "mean_token_accuracy": 0.7254273295402527, + "num_tokens": 7224988.0, + "step": 1388, + "train/ce_loss": 1.1162502765655518 + }, + { + "epoch": 0.13723551512754598, + "step": 1388, + "train/sim_loss": 0.1953125 + }, + { + "epoch": 0.13723551512754598, + "step": 1388, + "train/total_loss": 0.3069375157356262 + }, + { + "entropy": 9.793027877807617, + "epoch": 0.1373343879770615, + "mean_token_accuracy": 0.7576736807823181, + "num_tokens": 7230030.0, + "step": 1389, + "train/ce_loss": 1.2323343753814697 + }, + { + "epoch": 0.1373343879770615, + "step": 1389, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.1373343879770615, + "step": 1389, + "train/total_loss": 0.18963968753814697 + }, + { + "entropy": 9.297113418579102, + "epoch": 0.13743326082657703, + "mean_token_accuracy": 0.6991368532180786, + "num_tokens": 7235295.0, + "step": 1390, + "train/ce_loss": 1.4138586521148682 + }, + { + "epoch": 0.13743326082657703, + "step": 1390, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.13743326082657703, + "step": 1390, + "train/total_loss": 0.25857335329055786 + }, + { + "entropy": 9.281339645385742, + "epoch": 0.13753213367609254, + "mean_token_accuracy": 0.6515151262283325, + "num_tokens": 7240653.0, + "step": 1391, + "train/ce_loss": 1.2768079042434692 + }, + { + "epoch": 0.13753213367609254, + "step": 1391, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.13753213367609254, + "step": 1391, + "train/total_loss": 0.26439952850341797 + }, + { + "entropy": 9.057063102722168, + "epoch": 0.13763100652560806, + "mean_token_accuracy": 0.755646824836731, + "num_tokens": 7246146.0, + "step": 1392, + "train/ce_loss": 0.5638306140899658 + }, + { + "epoch": 0.13763100652560806, + "step": 1392, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.13763100652560806, + "step": 1392, + "train/total_loss": 0.14622680842876434 + }, + { + "entropy": 10.018474578857422, + "epoch": 0.1377298793751236, + "mean_token_accuracy": 0.6976743936538696, + "num_tokens": 7251021.0, + "step": 1393, + "train/ce_loss": 2.264758348464966 + }, + { + "epoch": 0.1377298793751236, + "step": 1393, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.1377298793751236, + "step": 1393, + "train/total_loss": 0.3436633348464966 + }, + { + "entropy": 8.718015670776367, + "epoch": 0.1378287522246391, + "mean_token_accuracy": 0.6834763884544373, + "num_tokens": 7256420.0, + "step": 1394, + "train/ce_loss": 1.1374813318252563 + }, + { + "epoch": 0.1378287522246391, + "step": 1394, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.1378287522246391, + "step": 1394, + "train/total_loss": 0.19577938318252563 + }, + { + "entropy": 10.238953590393066, + "epoch": 0.13792762507415463, + "mean_token_accuracy": 0.7974026203155518, + "num_tokens": 7261187.0, + "step": 1395, + "train/ce_loss": 3.1131625291891396e-05 + }, + { + "epoch": 0.13792762507415463, + "step": 1395, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.13792762507415463, + "step": 1395, + "train/total_loss": 0.039065614342689514 + }, + { + "entropy": 9.5997896194458, + "epoch": 0.13802649792367017, + "mean_token_accuracy": 0.7196581363677979, + "num_tokens": 7266220.0, + "step": 1396, + "train/ce_loss": 1.3521981239318848 + }, + { + "epoch": 0.13802649792367017, + "step": 1396, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.13802649792367017, + "step": 1396, + "train/total_loss": 0.22896981239318848 + }, + { + "entropy": 9.595669746398926, + "epoch": 0.13812537077318568, + "mean_token_accuracy": 0.6781250238418579, + "num_tokens": 7271470.0, + "step": 1397, + "train/ce_loss": 0.8560667037963867 + }, + { + "epoch": 0.13812537077318568, + "step": 1397, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.13812537077318568, + "step": 1397, + "train/total_loss": 0.1988879144191742 + }, + { + "entropy": 8.980318069458008, + "epoch": 0.1382242436227012, + "mean_token_accuracy": 0.7407024502754211, + "num_tokens": 7276938.0, + "step": 1398, + "train/ce_loss": 0.6260985136032104 + }, + { + "epoch": 0.1382242436227012, + "step": 1398, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.1382242436227012, + "step": 1398, + "train/total_loss": 0.16807860136032104 + }, + { + "entropy": 9.427846908569336, + "epoch": 0.13832311647221673, + "mean_token_accuracy": 0.6952381134033203, + "num_tokens": 7282047.0, + "step": 1399, + "train/ce_loss": 1.5995008945465088 + }, + { + "epoch": 0.13832311647221673, + "step": 1399, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.13832311647221673, + "step": 1399, + "train/total_loss": 0.23807509243488312 + }, + { + "epoch": 0.13842198932173225, + "grad_norm": 0.9312789440155029, + "learning_rate": 9.656579142560452e-06, + "loss": 0.1721, + "step": 1400 + }, + { + "entropy": 9.420281410217285, + "epoch": 0.13842198932173225, + "mean_token_accuracy": 0.7587600946426392, + "num_tokens": 7287273.0, + "step": 1400, + "train/ce_loss": 0.911980390548706 + }, + { + "epoch": 0.13842198932173225, + "step": 1400, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.13842198932173225, + "step": 1400, + "train/total_loss": 0.17713554203510284 + }, + { + "entropy": 10.491708755493164, + "epoch": 0.1385208621712478, + "mean_token_accuracy": 0.764102578163147, + "num_tokens": 7291863.0, + "step": 1401, + "train/ce_loss": 2.9230513973743655e-05 + }, + { + "epoch": 0.1385208621712478, + "step": 1401, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.1385208621712478, + "step": 1401, + "train/total_loss": 0.062502920627594 + }, + { + "entropy": 9.89574146270752, + "epoch": 0.1386197350207633, + "mean_token_accuracy": 0.6787072420120239, + "num_tokens": 7296824.0, + "step": 1402, + "train/ce_loss": 1.0397026538848877 + }, + { + "epoch": 0.1386197350207633, + "step": 1402, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.1386197350207633, + "step": 1402, + "train/total_loss": 0.2250640094280243 + }, + { + "entropy": 9.101736068725586, + "epoch": 0.13871860787027882, + "mean_token_accuracy": 0.6821191906929016, + "num_tokens": 7302205.0, + "step": 1403, + "train/ce_loss": 0.3729458153247833 + }, + { + "epoch": 0.13871860787027882, + "step": 1403, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.13871860787027882, + "step": 1403, + "train/total_loss": 0.08416958153247833 + }, + { + "entropy": 9.347423553466797, + "epoch": 0.13881748071979436, + "mean_token_accuracy": 0.6495097875595093, + "num_tokens": 7307695.0, + "step": 1404, + "train/ce_loss": 1.291334867477417 + }, + { + "epoch": 0.13881748071979436, + "step": 1404, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.13881748071979436, + "step": 1404, + "train/total_loss": 0.19163349270820618 + }, + { + "entropy": 9.36381721496582, + "epoch": 0.13891635356930987, + "mean_token_accuracy": 0.7665745615959167, + "num_tokens": 7312906.0, + "step": 1405, + "train/ce_loss": 0.6868124008178711 + }, + { + "epoch": 0.13891635356930987, + "step": 1405, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.13891635356930987, + "step": 1405, + "train/total_loss": 0.1468062400817871 + }, + { + "entropy": 9.406861305236816, + "epoch": 0.13901522641882538, + "mean_token_accuracy": 0.7622950673103333, + "num_tokens": 7318039.0, + "step": 1406, + "train/ce_loss": 0.9608842730522156 + }, + { + "epoch": 0.13901522641882538, + "step": 1406, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.13901522641882538, + "step": 1406, + "train/total_loss": 0.1898384392261505 + }, + { + "entropy": 9.218897819519043, + "epoch": 0.13911409926834092, + "mean_token_accuracy": 0.7496790885925293, + "num_tokens": 7323261.0, + "step": 1407, + "train/ce_loss": 0.6161945462226868 + }, + { + "epoch": 0.13911409926834092, + "step": 1407, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.13911409926834092, + "step": 1407, + "train/total_loss": 0.09286946058273315 + }, + { + "entropy": 9.789392471313477, + "epoch": 0.13921297211785644, + "mean_token_accuracy": 0.6774774789810181, + "num_tokens": 7328238.0, + "step": 1408, + "train/ce_loss": 1.5993882417678833 + }, + { + "epoch": 0.13921297211785644, + "step": 1408, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.13921297211785644, + "step": 1408, + "train/total_loss": 0.2771263122558594 + }, + { + "entropy": 9.464317321777344, + "epoch": 0.13931184496737195, + "mean_token_accuracy": 0.6744186282157898, + "num_tokens": 7333407.0, + "step": 1409, + "train/ce_loss": 7.5892635322816204e-06 + }, + { + "epoch": 0.13931184496737195, + "step": 1409, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.13931184496737195, + "step": 1409, + "train/total_loss": 0.07421950995922089 + }, + { + "entropy": 8.900279998779297, + "epoch": 0.1394107178168875, + "mean_token_accuracy": 0.7890382409095764, + "num_tokens": 7338884.0, + "step": 1410, + "train/ce_loss": 0.5504341125488281 + }, + { + "epoch": 0.1394107178168875, + "step": 1410, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.1394107178168875, + "step": 1410, + "train/total_loss": 0.08629341423511505 + }, + { + "entropy": 8.84177017211914, + "epoch": 0.139509590666403, + "mean_token_accuracy": 0.7534090876579285, + "num_tokens": 7344229.0, + "step": 1411, + "train/ce_loss": 0.7043808698654175 + }, + { + "epoch": 0.139509590666403, + "step": 1411, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.139509590666403, + "step": 1411, + "train/total_loss": 0.08996933698654175 + }, + { + "entropy": 9.614435195922852, + "epoch": 0.13960846351591852, + "mean_token_accuracy": 0.6666666865348816, + "num_tokens": 7349216.0, + "step": 1412, + "train/ce_loss": 2.0270464420318604 + }, + { + "epoch": 0.13960846351591852, + "step": 1412, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.13960846351591852, + "step": 1412, + "train/total_loss": 0.29254841804504395 + }, + { + "entropy": 9.185687065124512, + "epoch": 0.13970733636543406, + "mean_token_accuracy": 0.7352555990219116, + "num_tokens": 7354461.0, + "step": 1413, + "train/ce_loss": 0.8546119928359985 + }, + { + "epoch": 0.13970733636543406, + "step": 1413, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.13970733636543406, + "step": 1413, + "train/total_loss": 0.15967994928359985 + }, + { + "entropy": 9.345466613769531, + "epoch": 0.13980620921494957, + "mean_token_accuracy": 0.7362499833106995, + "num_tokens": 7359711.0, + "step": 1414, + "train/ce_loss": 1.5574983358383179 + }, + { + "epoch": 0.13980620921494957, + "step": 1414, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.13980620921494957, + "step": 1414, + "train/total_loss": 0.2416873425245285 + }, + { + "entropy": 8.867634773254395, + "epoch": 0.13990508206446509, + "mean_token_accuracy": 0.7376705408096313, + "num_tokens": 7365166.0, + "step": 1415, + "train/ce_loss": 0.7297521829605103 + }, + { + "epoch": 0.13990508206446509, + "step": 1415, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.13990508206446509, + "step": 1415, + "train/total_loss": 0.11203771829605103 + }, + { + "entropy": 8.939706802368164, + "epoch": 0.14000395491398063, + "mean_token_accuracy": 0.6901565790176392, + "num_tokens": 7370554.0, + "step": 1416, + "train/ce_loss": 1.0094400644302368 + }, + { + "epoch": 0.14000395491398063, + "step": 1416, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.14000395491398063, + "step": 1416, + "train/total_loss": 0.23375651240348816 + }, + { + "entropy": 9.093274116516113, + "epoch": 0.14010282776349614, + "mean_token_accuracy": 0.7566079497337341, + "num_tokens": 7375943.0, + "step": 1417, + "train/ce_loss": 0.6850955486297607 + }, + { + "epoch": 0.14010282776349614, + "step": 1417, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.14010282776349614, + "step": 1417, + "train/total_loss": 0.1036658063530922 + }, + { + "entropy": 9.350330352783203, + "epoch": 0.14020170061301165, + "mean_token_accuracy": 0.7856155037879944, + "num_tokens": 7381104.0, + "step": 1418, + "train/ce_loss": 0.8399333953857422 + }, + { + "epoch": 0.14020170061301165, + "step": 1418, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.14020170061301165, + "step": 1418, + "train/total_loss": 0.1308683454990387 + }, + { + "entropy": 9.108048439025879, + "epoch": 0.1403005734625272, + "mean_token_accuracy": 0.7621809840202332, + "num_tokens": 7386436.0, + "step": 1419, + "train/ce_loss": 0.876664936542511 + }, + { + "epoch": 0.1403005734625272, + "step": 1419, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.1403005734625272, + "step": 1419, + "train/total_loss": 0.16969774663448334 + }, + { + "epoch": 0.1403994463120427, + "grad_norm": 0.8239012360572815, + "learning_rate": 9.651634277802503e-06, + "loss": 0.1676, + "step": 1420 + }, + { + "entropy": 9.418233871459961, + "epoch": 0.1403994463120427, + "mean_token_accuracy": 0.7874125838279724, + "num_tokens": 7391582.0, + "step": 1420, + "train/ce_loss": 0.986320972442627 + }, + { + "epoch": 0.1403994463120427, + "step": 1420, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.1403994463120427, + "step": 1420, + "train/total_loss": 0.1611320972442627 + }, + { + "entropy": 9.41115951538086, + "epoch": 0.14049831916155825, + "mean_token_accuracy": 0.7489986419677734, + "num_tokens": 7396791.0, + "step": 1421, + "train/ce_loss": 1.4863344430923462 + }, + { + "epoch": 0.14049831916155825, + "step": 1421, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.14049831916155825, + "step": 1421, + "train/total_loss": 0.2345709502696991 + }, + { + "entropy": 9.614538192749023, + "epoch": 0.14059719201107376, + "mean_token_accuracy": 0.7886056900024414, + "num_tokens": 7401889.0, + "step": 1422, + "train/ce_loss": 0.8483023643493652 + }, + { + "epoch": 0.14059719201107376, + "step": 1422, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.14059719201107376, + "step": 1422, + "train/total_loss": 0.14733023941516876 + }, + { + "entropy": 9.525871276855469, + "epoch": 0.14069606486058928, + "mean_token_accuracy": 0.7485029697418213, + "num_tokens": 7406993.0, + "step": 1423, + "train/ce_loss": 1.4722119569778442 + }, + { + "epoch": 0.14069606486058928, + "step": 1423, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.14069606486058928, + "step": 1423, + "train/total_loss": 0.20972119271755219 + }, + { + "entropy": 9.20797061920166, + "epoch": 0.14079493771010482, + "mean_token_accuracy": 0.6947835683822632, + "num_tokens": 7412350.0, + "step": 1424, + "train/ce_loss": 1.1180000305175781 + }, + { + "epoch": 0.14079493771010482, + "step": 1424, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.14079493771010482, + "step": 1424, + "train/total_loss": 0.20164376497268677 + }, + { + "entropy": 9.134281158447266, + "epoch": 0.14089381055962033, + "mean_token_accuracy": 0.747586190700531, + "num_tokens": 7417620.0, + "step": 1425, + "train/ce_loss": 0.5653731226921082 + }, + { + "epoch": 0.14089381055962033, + "step": 1425, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.14089381055962033, + "step": 1425, + "train/total_loss": 0.13856856524944305 + }, + { + "entropy": 9.078322410583496, + "epoch": 0.14099268340913584, + "mean_token_accuracy": 0.7253086566925049, + "num_tokens": 7423051.0, + "step": 1426, + "train/ce_loss": 1.2762891054153442 + }, + { + "epoch": 0.14099268340913584, + "step": 1426, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.14099268340913584, + "step": 1426, + "train/total_loss": 0.20575390756130219 + }, + { + "entropy": 9.37423324584961, + "epoch": 0.14109155625865138, + "mean_token_accuracy": 0.6671575903892517, + "num_tokens": 7428196.0, + "step": 1427, + "train/ce_loss": 1.6712976694107056 + }, + { + "epoch": 0.14109155625865138, + "step": 1427, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.14109155625865138, + "step": 1427, + "train/total_loss": 0.2960360050201416 + }, + { + "entropy": 9.619972229003906, + "epoch": 0.1411904291081669, + "mean_token_accuracy": 0.7265100479125977, + "num_tokens": 7433246.0, + "step": 1428, + "train/ce_loss": 1.672378420829773 + }, + { + "epoch": 0.1411904291081669, + "step": 1428, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.1411904291081669, + "step": 1428, + "train/total_loss": 0.2609878480434418 + }, + { + "entropy": 9.88757038116455, + "epoch": 0.1412893019576824, + "mean_token_accuracy": 0.7270992398262024, + "num_tokens": 7438150.0, + "step": 1429, + "train/ce_loss": 0.7832049131393433 + }, + { + "epoch": 0.1412893019576824, + "step": 1429, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.1412893019576824, + "step": 1429, + "train/total_loss": 0.16425800323486328 + }, + { + "entropy": 9.301030158996582, + "epoch": 0.14138817480719795, + "mean_token_accuracy": 0.6925169825553894, + "num_tokens": 7443337.0, + "step": 1430, + "train/ce_loss": 0.8136307001113892 + }, + { + "epoch": 0.14138817480719795, + "step": 1430, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.14138817480719795, + "step": 1430, + "train/total_loss": 0.14386308193206787 + }, + { + "entropy": 8.811057090759277, + "epoch": 0.14148704765671347, + "mean_token_accuracy": 0.7604060769081116, + "num_tokens": 7448858.0, + "step": 1431, + "train/ce_loss": 0.9372698664665222 + }, + { + "epoch": 0.14148704765671347, + "step": 1431, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.14148704765671347, + "step": 1431, + "train/total_loss": 0.2265394926071167 + }, + { + "entropy": 9.793838500976562, + "epoch": 0.14158592050622898, + "mean_token_accuracy": 0.7733089327812195, + "num_tokens": 7453846.0, + "step": 1432, + "train/ce_loss": 1.3188660144805908 + }, + { + "epoch": 0.14158592050622898, + "step": 1432, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.14158592050622898, + "step": 1432, + "train/total_loss": 0.2568866014480591 + }, + { + "entropy": 8.781266212463379, + "epoch": 0.14168479335574452, + "mean_token_accuracy": 0.7399267554283142, + "num_tokens": 7459427.0, + "step": 1433, + "train/ce_loss": 1.0480564832687378 + }, + { + "epoch": 0.14168479335574452, + "step": 1433, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.14168479335574452, + "step": 1433, + "train/total_loss": 0.21418064832687378 + }, + { + "entropy": 9.048070907592773, + "epoch": 0.14178366620526003, + "mean_token_accuracy": 0.75, + "num_tokens": 7464698.0, + "step": 1434, + "train/ce_loss": 0.6495194435119629 + }, + { + "epoch": 0.14178366620526003, + "step": 1434, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.14178366620526003, + "step": 1434, + "train/total_loss": 0.11182694882154465 + }, + { + "entropy": 9.225584030151367, + "epoch": 0.14188253905477555, + "mean_token_accuracy": 0.7363515496253967, + "num_tokens": 7469916.0, + "step": 1435, + "train/ce_loss": 0.8719775676727295 + }, + { + "epoch": 0.14188253905477555, + "step": 1435, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.14188253905477555, + "step": 1435, + "train/total_loss": 0.16922900080680847 + }, + { + "entropy": 9.48530101776123, + "epoch": 0.1419814119042911, + "mean_token_accuracy": 0.6973294019699097, + "num_tokens": 7475084.0, + "step": 1436, + "train/ce_loss": 0.7331254482269287 + }, + { + "epoch": 0.1419814119042911, + "step": 1436, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.1419814119042911, + "step": 1436, + "train/total_loss": 0.17487505078315735 + }, + { + "entropy": 8.971145629882812, + "epoch": 0.1420802847538066, + "mean_token_accuracy": 0.7418181896209717, + "num_tokens": 7480409.0, + "step": 1437, + "train/ce_loss": 1.2895128726959229 + }, + { + "epoch": 0.1420802847538066, + "step": 1437, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.1420802847538066, + "step": 1437, + "train/total_loss": 0.222701296210289 + }, + { + "entropy": 10.205839157104492, + "epoch": 0.14217915760332211, + "mean_token_accuracy": 0.6894736886024475, + "num_tokens": 7485142.0, + "step": 1438, + "train/ce_loss": 2.168229103088379 + }, + { + "epoch": 0.14217915760332211, + "step": 1438, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.14217915760332211, + "step": 1438, + "train/total_loss": 0.29885417222976685 + }, + { + "entropy": 8.872293472290039, + "epoch": 0.14227803045283766, + "mean_token_accuracy": 0.7761341333389282, + "num_tokens": 7490696.0, + "step": 1439, + "train/ce_loss": 0.7610247731208801 + }, + { + "epoch": 0.14227803045283766, + "step": 1439, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.14227803045283766, + "step": 1439, + "train/total_loss": 0.19328998029232025 + }, + { + "epoch": 0.14237690330235317, + "grad_norm": 0.9774222373962402, + "learning_rate": 9.646689413044555e-06, + "loss": 0.1714, + "step": 1440 + }, + { + "entropy": 9.629098892211914, + "epoch": 0.14237690330235317, + "mean_token_accuracy": 0.6889579892158508, + "num_tokens": 7495780.0, + "step": 1440, + "train/ce_loss": 1.546508550643921 + }, + { + "epoch": 0.14237690330235317, + "step": 1440, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.14237690330235317, + "step": 1440, + "train/total_loss": 0.22496335208415985 + }, + { + "entropy": 8.989510536193848, + "epoch": 0.1424757761518687, + "mean_token_accuracy": 0.7749725580215454, + "num_tokens": 7501192.0, + "step": 1441, + "train/ce_loss": 0.7729292511940002 + }, + { + "epoch": 0.1424757761518687, + "step": 1441, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.1424757761518687, + "step": 1441, + "train/total_loss": 0.19057416915893555 + }, + { + "entropy": 9.388324737548828, + "epoch": 0.14257464900138422, + "mean_token_accuracy": 0.7248322367668152, + "num_tokens": 7506414.0, + "step": 1442, + "train/ce_loss": 0.4920440912246704 + }, + { + "epoch": 0.14257464900138422, + "step": 1442, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.14257464900138422, + "step": 1442, + "train/total_loss": 0.11951690912246704 + }, + { + "entropy": 9.656713485717773, + "epoch": 0.14267352185089974, + "mean_token_accuracy": 0.7351524829864502, + "num_tokens": 7511502.0, + "step": 1443, + "train/ce_loss": 1.2703362703323364 + }, + { + "epoch": 0.14267352185089974, + "step": 1443, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.14267352185089974, + "step": 1443, + "train/total_loss": 0.17390863597393036 + }, + { + "entropy": 9.42349624633789, + "epoch": 0.14277239470041528, + "mean_token_accuracy": 0.6608344316482544, + "num_tokens": 7516696.0, + "step": 1444, + "train/ce_loss": 1.5800093412399292 + }, + { + "epoch": 0.14277239470041528, + "step": 1444, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.14277239470041528, + "step": 1444, + "train/total_loss": 0.22831343114376068 + }, + { + "entropy": 9.205739974975586, + "epoch": 0.1428712675499308, + "mean_token_accuracy": 0.7598944306373596, + "num_tokens": 7521908.0, + "step": 1445, + "train/ce_loss": 0.8339932560920715 + }, + { + "epoch": 0.1428712675499308, + "step": 1445, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.1428712675499308, + "step": 1445, + "train/total_loss": 0.14199307560920715 + }, + { + "entropy": 9.594503402709961, + "epoch": 0.1429701403994463, + "mean_token_accuracy": 0.75, + "num_tokens": 7526952.0, + "step": 1446, + "train/ce_loss": 1.2667876482009888 + }, + { + "epoch": 0.1429701403994463, + "step": 1446, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.1429701403994463, + "step": 1446, + "train/total_loss": 0.21261626482009888 + }, + { + "entropy": 9.727521896362305, + "epoch": 0.14306901324896185, + "mean_token_accuracy": 0.7665056586265564, + "num_tokens": 7532050.0, + "step": 1447, + "train/ce_loss": 0.8516501784324646 + }, + { + "epoch": 0.14306901324896185, + "step": 1447, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.14306901324896185, + "step": 1447, + "train/total_loss": 0.15938377380371094 + }, + { + "entropy": 9.347183227539062, + "epoch": 0.14316788609847736, + "mean_token_accuracy": 0.7016248106956482, + "num_tokens": 7537184.0, + "step": 1448, + "train/ce_loss": 1.6525059938430786 + }, + { + "epoch": 0.14316788609847736, + "step": 1448, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.14316788609847736, + "step": 1448, + "train/total_loss": 0.23556309938430786 + }, + { + "entropy": 8.938035011291504, + "epoch": 0.14326675894799287, + "mean_token_accuracy": 0.713178277015686, + "num_tokens": 7542662.0, + "step": 1449, + "train/ce_loss": 1.2951596975326538 + }, + { + "epoch": 0.14326675894799287, + "step": 1449, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.14326675894799287, + "step": 1449, + "train/total_loss": 0.24670347571372986 + }, + { + "entropy": 9.225709915161133, + "epoch": 0.1433656317975084, + "mean_token_accuracy": 0.7651775479316711, + "num_tokens": 7547966.0, + "step": 1450, + "train/ce_loss": 0.8416441082954407 + }, + { + "epoch": 0.1433656317975084, + "step": 1450, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.1433656317975084, + "step": 1450, + "train/total_loss": 0.15057066082954407 + }, + { + "entropy": 9.266736030578613, + "epoch": 0.14346450464702393, + "mean_token_accuracy": 0.7343065738677979, + "num_tokens": 7553141.0, + "step": 1451, + "train/ce_loss": 1.0648012161254883 + }, + { + "epoch": 0.14346450464702393, + "step": 1451, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.14346450464702393, + "step": 1451, + "train/total_loss": 0.18460512161254883 + }, + { + "entropy": 9.525014877319336, + "epoch": 0.14356337749653944, + "mean_token_accuracy": 0.7324749827384949, + "num_tokens": 7558305.0, + "step": 1452, + "train/ce_loss": 1.7234762907028198 + }, + { + "epoch": 0.14356337749653944, + "step": 1452, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.14356337749653944, + "step": 1452, + "train/total_loss": 0.28953513503074646 + }, + { + "entropy": 8.922002792358398, + "epoch": 0.14366225034605498, + "mean_token_accuracy": 0.6986986994743347, + "num_tokens": 7563774.0, + "step": 1453, + "train/ce_loss": 0.5577152967453003 + }, + { + "epoch": 0.14366225034605498, + "step": 1453, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.14366225034605498, + "step": 1453, + "train/total_loss": 0.09092777967453003 + }, + { + "entropy": 9.489364624023438, + "epoch": 0.1437611231955705, + "mean_token_accuracy": 0.7285714149475098, + "num_tokens": 7568986.0, + "step": 1454, + "train/ce_loss": 0.6063663959503174 + }, + { + "epoch": 0.1437611231955705, + "step": 1454, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.1437611231955705, + "step": 1454, + "train/total_loss": 0.13094913959503174 + }, + { + "entropy": 9.168773651123047, + "epoch": 0.143859996045086, + "mean_token_accuracy": 0.7039238810539246, + "num_tokens": 7574270.0, + "step": 1455, + "train/ce_loss": 0.8265439867973328 + }, + { + "epoch": 0.143859996045086, + "step": 1455, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.143859996045086, + "step": 1455, + "train/total_loss": 0.18031065165996552 + }, + { + "entropy": 9.695110321044922, + "epoch": 0.14395886889460155, + "mean_token_accuracy": 0.7261484265327454, + "num_tokens": 7579265.0, + "step": 1456, + "train/ce_loss": 1.3975584806757979e-05 + }, + { + "epoch": 0.14395886889460155, + "step": 1456, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.14395886889460155, + "step": 1456, + "train/total_loss": 0.08593890070915222 + }, + { + "entropy": 9.658077239990234, + "epoch": 0.14405774174411706, + "mean_token_accuracy": 0.7463557124137878, + "num_tokens": 7584372.0, + "step": 1457, + "train/ce_loss": 1.2585256099700928 + }, + { + "epoch": 0.14405774174411706, + "step": 1457, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.14405774174411706, + "step": 1457, + "train/total_loss": 0.164915069937706 + }, + { + "entropy": 9.691431045532227, + "epoch": 0.14415661459363258, + "mean_token_accuracy": 0.7751798629760742, + "num_tokens": 7589333.0, + "step": 1458, + "train/ce_loss": 1.035921013681218e-05 + }, + { + "epoch": 0.14415661459363258, + "step": 1458, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.14415661459363258, + "step": 1458, + "train/total_loss": 0.06640728563070297 + }, + { + "entropy": 9.916348457336426, + "epoch": 0.14425548744314812, + "mean_token_accuracy": 0.7798742055892944, + "num_tokens": 7594257.0, + "step": 1459, + "train/ce_loss": 1.742569088935852 + }, + { + "epoch": 0.14425548744314812, + "step": 1459, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.14425548744314812, + "step": 1459, + "train/total_loss": 0.26019442081451416 + }, + { + "epoch": 0.14435436029266363, + "grad_norm": 1.05721914768219, + "learning_rate": 9.641744548286605e-06, + "loss": 0.1613, + "step": 1460 + }, + { + "entropy": 9.46556282043457, + "epoch": 0.14435436029266363, + "mean_token_accuracy": 0.7061469554901123, + "num_tokens": 7599326.0, + "step": 1460, + "train/ce_loss": 1.0082299709320068 + }, + { + "epoch": 0.14435436029266363, + "step": 1460, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.14435436029266363, + "step": 1460, + "train/total_loss": 0.17894800007343292 + }, + { + "entropy": 9.255624771118164, + "epoch": 0.14445323314217914, + "mean_token_accuracy": 0.6964064240455627, + "num_tokens": 7604626.0, + "step": 1461, + "train/ce_loss": 0.9624316692352295 + }, + { + "epoch": 0.14445323314217914, + "step": 1461, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.14445323314217914, + "step": 1461, + "train/total_loss": 0.21343067288398743 + }, + { + "entropy": 9.89693546295166, + "epoch": 0.14455210599169468, + "mean_token_accuracy": 0.7595818638801575, + "num_tokens": 7609670.0, + "step": 1462, + "train/ce_loss": 0.7699599862098694 + }, + { + "epoch": 0.14455210599169468, + "step": 1462, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.14455210599169468, + "step": 1462, + "train/total_loss": 0.13168349862098694 + }, + { + "entropy": 9.130101203918457, + "epoch": 0.1446509788412102, + "mean_token_accuracy": 0.7557003498077393, + "num_tokens": 7615070.0, + "step": 1463, + "train/ce_loss": 0.7649668455123901 + }, + { + "epoch": 0.1446509788412102, + "step": 1463, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.1446509788412102, + "step": 1463, + "train/total_loss": 0.1155591830611229 + }, + { + "entropy": 9.673083305358887, + "epoch": 0.14474985169072574, + "mean_token_accuracy": 0.7098283767700195, + "num_tokens": 7620116.0, + "step": 1464, + "train/ce_loss": 0.9854806661605835 + }, + { + "epoch": 0.14474985169072574, + "step": 1464, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.14474985169072574, + "step": 1464, + "train/total_loss": 0.2001105695962906 + }, + { + "entropy": 9.212682723999023, + "epoch": 0.14484872454024125, + "mean_token_accuracy": 0.6935840845108032, + "num_tokens": 7625416.0, + "step": 1465, + "train/ce_loss": 1.523110270500183 + }, + { + "epoch": 0.14484872454024125, + "step": 1465, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.14484872454024125, + "step": 1465, + "train/total_loss": 0.2421547770500183 + }, + { + "entropy": 10.001008033752441, + "epoch": 0.14494759738975677, + "mean_token_accuracy": 0.7122557759284973, + "num_tokens": 7630393.0, + "step": 1466, + "train/ce_loss": 1.0846751928329468 + }, + { + "epoch": 0.14494759738975677, + "step": 1466, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.14494759738975677, + "step": 1466, + "train/total_loss": 0.16315501928329468 + }, + { + "entropy": 9.443642616271973, + "epoch": 0.1450464702392723, + "mean_token_accuracy": 0.7260459065437317, + "num_tokens": 7635565.0, + "step": 1467, + "train/ce_loss": 0.9722166657447815 + }, + { + "epoch": 0.1450464702392723, + "step": 1467, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.1450464702392723, + "step": 1467, + "train/total_loss": 0.14800292253494263 + }, + { + "entropy": 9.269857406616211, + "epoch": 0.14514534308878782, + "mean_token_accuracy": 0.6705756783485413, + "num_tokens": 7640988.0, + "step": 1468, + "train/ce_loss": 1.412301778793335 + }, + { + "epoch": 0.14514534308878782, + "step": 1468, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.14514534308878782, + "step": 1468, + "train/total_loss": 0.25841766595840454 + }, + { + "entropy": 9.7373046875, + "epoch": 0.14524421593830333, + "mean_token_accuracy": 0.7007407546043396, + "num_tokens": 7646091.0, + "step": 1469, + "train/ce_loss": 9.146291631623171e-06 + }, + { + "epoch": 0.14524421593830333, + "step": 1469, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.14524421593830333, + "step": 1469, + "train/total_loss": 0.03515716642141342 + }, + { + "entropy": 9.382098197937012, + "epoch": 0.14534308878781887, + "mean_token_accuracy": 0.7916167378425598, + "num_tokens": 7651404.0, + "step": 1470, + "train/ce_loss": 1.072875738143921 + }, + { + "epoch": 0.14534308878781887, + "step": 1470, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.14534308878781887, + "step": 1470, + "train/total_loss": 0.14244383573532104 + }, + { + "entropy": 8.834354400634766, + "epoch": 0.1454419616373344, + "mean_token_accuracy": 0.6691973805427551, + "num_tokens": 7656797.0, + "step": 1471, + "train/ce_loss": 1.2199304103851318 + }, + { + "epoch": 0.1454419616373344, + "step": 1471, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.1454419616373344, + "step": 1471, + "train/total_loss": 0.2313680350780487 + }, + { + "entropy": 9.114487648010254, + "epoch": 0.1455408344868499, + "mean_token_accuracy": 0.718120813369751, + "num_tokens": 7662221.0, + "step": 1472, + "train/ce_loss": 1.2842098474502563 + }, + { + "epoch": 0.1455408344868499, + "step": 1472, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.1455408344868499, + "step": 1472, + "train/total_loss": 0.21045224368572235 + }, + { + "entropy": 10.331319808959961, + "epoch": 0.14563970733636544, + "mean_token_accuracy": 0.7259474992752075, + "num_tokens": 7666962.0, + "step": 1473, + "train/ce_loss": 1.2172787189483643 + }, + { + "epoch": 0.14563970733636544, + "step": 1473, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.14563970733636544, + "step": 1473, + "train/total_loss": 0.17641538381576538 + }, + { + "entropy": 10.037343978881836, + "epoch": 0.14573858018588096, + "mean_token_accuracy": 0.7982646226882935, + "num_tokens": 7671868.0, + "step": 1474, + "train/ce_loss": 3.5985547583550215e-05 + }, + { + "epoch": 0.14573858018588096, + "step": 1474, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.14573858018588096, + "step": 1474, + "train/total_loss": 0.07812859863042831 + }, + { + "entropy": 9.554939270019531, + "epoch": 0.14583745303539647, + "mean_token_accuracy": 0.7434402108192444, + "num_tokens": 7676999.0, + "step": 1475, + "train/ce_loss": 0.491904079914093 + }, + { + "epoch": 0.14583745303539647, + "step": 1475, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.14583745303539647, + "step": 1475, + "train/total_loss": 0.15856540203094482 + }, + { + "entropy": 9.385547637939453, + "epoch": 0.145936325884912, + "mean_token_accuracy": 0.7004889845848083, + "num_tokens": 7682266.0, + "step": 1476, + "train/ce_loss": 1.4973679780960083 + }, + { + "epoch": 0.145936325884912, + "step": 1476, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.145936325884912, + "step": 1476, + "train/total_loss": 0.27473682165145874 + }, + { + "entropy": 9.379226684570312, + "epoch": 0.14603519873442752, + "mean_token_accuracy": 0.718358039855957, + "num_tokens": 7687576.0, + "step": 1477, + "train/ce_loss": 0.6333465576171875 + }, + { + "epoch": 0.14603519873442752, + "step": 1477, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.14603519873442752, + "step": 1477, + "train/total_loss": 0.149272158741951 + }, + { + "entropy": 9.491981506347656, + "epoch": 0.14613407158394304, + "mean_token_accuracy": 0.7100591659545898, + "num_tokens": 7692734.0, + "step": 1478, + "train/ce_loss": 0.7247401475906372 + }, + { + "epoch": 0.14613407158394304, + "step": 1478, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.14613407158394304, + "step": 1478, + "train/total_loss": 0.14669276773929596 + }, + { + "entropy": 9.006340026855469, + "epoch": 0.14623294443345858, + "mean_token_accuracy": 0.6806231737136841, + "num_tokens": 7698217.0, + "step": 1479, + "train/ce_loss": 1.313188910484314 + }, + { + "epoch": 0.14623294443345858, + "step": 1479, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.14623294443345858, + "step": 1479, + "train/total_loss": 0.19772514700889587 + }, + { + "epoch": 0.1463318172829741, + "grad_norm": 1.075100064277649, + "learning_rate": 9.636799683528656e-06, + "loss": 0.1685, + "step": 1480 + }, + { + "entropy": 9.058917999267578, + "epoch": 0.1463318172829741, + "mean_token_accuracy": 0.7405857443809509, + "num_tokens": 7703678.0, + "step": 1480, + "train/ce_loss": 1.1823251247406006 + }, + { + "epoch": 0.1463318172829741, + "step": 1480, + "train/sim_loss": 0.1953125 + }, + { + "epoch": 0.1463318172829741, + "step": 1480, + "train/total_loss": 0.31354501843452454 + }, + { + "entropy": 9.945625305175781, + "epoch": 0.1464306901324896, + "mean_token_accuracy": 0.701171875, + "num_tokens": 7708647.0, + "step": 1481, + "train/ce_loss": 1.6655864715576172 + }, + { + "epoch": 0.1464306901324896, + "step": 1481, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.1464306901324896, + "step": 1481, + "train/total_loss": 0.2368711531162262 + }, + { + "entropy": 9.082256317138672, + "epoch": 0.14652956298200515, + "mean_token_accuracy": 0.7210718393325806, + "num_tokens": 7713952.0, + "step": 1482, + "train/ce_loss": 1.3255983591079712 + }, + { + "epoch": 0.14652956298200515, + "step": 1482, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.14652956298200515, + "step": 1482, + "train/total_loss": 0.20287233591079712 + }, + { + "entropy": 10.126291275024414, + "epoch": 0.14662843583152066, + "mean_token_accuracy": 0.7613065242767334, + "num_tokens": 7718771.0, + "step": 1483, + "train/ce_loss": 1.0498250722885132 + }, + { + "epoch": 0.14662843583152066, + "step": 1483, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.14662843583152066, + "step": 1483, + "train/total_loss": 0.15967001020908356 + }, + { + "entropy": 9.443145751953125, + "epoch": 0.1467273086810362, + "mean_token_accuracy": 0.7363751530647278, + "num_tokens": 7724015.0, + "step": 1484, + "train/ce_loss": 1.3845174312591553 + }, + { + "epoch": 0.1467273086810362, + "step": 1484, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.1467273086810362, + "step": 1484, + "train/total_loss": 0.2868892550468445 + }, + { + "entropy": 10.079927444458008, + "epoch": 0.1468261815305517, + "mean_token_accuracy": 0.7978723645210266, + "num_tokens": 7728918.0, + "step": 1485, + "train/ce_loss": 0.8031333088874817 + }, + { + "epoch": 0.1468261815305517, + "step": 1485, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.1468261815305517, + "step": 1485, + "train/total_loss": 0.1389070749282837 + }, + { + "entropy": 10.058080673217773, + "epoch": 0.14692505438006723, + "mean_token_accuracy": 0.7175398468971252, + "num_tokens": 7733758.0, + "step": 1486, + "train/ce_loss": 3.7512934795813635e-05 + }, + { + "epoch": 0.14692505438006723, + "step": 1486, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.14692505438006723, + "step": 1486, + "train/total_loss": 0.08594124764204025 + }, + { + "entropy": 9.744989395141602, + "epoch": 0.14702392722958277, + "mean_token_accuracy": 0.7365930676460266, + "num_tokens": 7738850.0, + "step": 1487, + "train/ce_loss": 1.0534642934799194 + }, + { + "epoch": 0.14702392722958277, + "step": 1487, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.14702392722958277, + "step": 1487, + "train/total_loss": 0.1795651912689209 + }, + { + "entropy": 9.28485107421875, + "epoch": 0.14712280007909828, + "mean_token_accuracy": 0.7431551218032837, + "num_tokens": 7744070.0, + "step": 1488, + "train/ce_loss": 0.7180099487304688 + }, + { + "epoch": 0.14712280007909828, + "step": 1488, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.14712280007909828, + "step": 1488, + "train/total_loss": 0.12258224934339523 + }, + { + "entropy": 9.276777267456055, + "epoch": 0.1472216729286138, + "mean_token_accuracy": 0.7062423229217529, + "num_tokens": 7749376.0, + "step": 1489, + "train/ce_loss": 0.9177853465080261 + }, + { + "epoch": 0.1472216729286138, + "step": 1489, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.1472216729286138, + "step": 1489, + "train/total_loss": 0.13084104657173157 + }, + { + "entropy": 9.664016723632812, + "epoch": 0.14732054577812934, + "mean_token_accuracy": 0.7136222720146179, + "num_tokens": 7754478.0, + "step": 1490, + "train/ce_loss": 1.612519383430481 + }, + { + "epoch": 0.14732054577812934, + "step": 1490, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.14732054577812934, + "step": 1490, + "train/total_loss": 0.262814462184906 + }, + { + "entropy": 9.540077209472656, + "epoch": 0.14741941862764485, + "mean_token_accuracy": 0.7288401126861572, + "num_tokens": 7759574.0, + "step": 1491, + "train/ce_loss": 1.006054401397705 + }, + { + "epoch": 0.14741941862764485, + "step": 1491, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.14741941862764485, + "step": 1491, + "train/total_loss": 0.19044919312000275 + }, + { + "entropy": 9.13851547241211, + "epoch": 0.14751829147716036, + "mean_token_accuracy": 0.7488636374473572, + "num_tokens": 7764984.0, + "step": 1492, + "train/ce_loss": 0.8154755234718323 + }, + { + "epoch": 0.14751829147716036, + "step": 1492, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.14751829147716036, + "step": 1492, + "train/total_loss": 0.1596725583076477 + }, + { + "entropy": 9.106375694274902, + "epoch": 0.1476171643266759, + "mean_token_accuracy": 0.7339848279953003, + "num_tokens": 7770380.0, + "step": 1493, + "train/ce_loss": 0.41329845786094666 + }, + { + "epoch": 0.1476171643266759, + "step": 1493, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.1476171643266759, + "step": 1493, + "train/total_loss": 0.11164234578609467 + }, + { + "entropy": 9.136665344238281, + "epoch": 0.14771603717619142, + "mean_token_accuracy": 0.7275640964508057, + "num_tokens": 7775789.0, + "step": 1494, + "train/ce_loss": 1.3060837984085083 + }, + { + "epoch": 0.14771603717619142, + "step": 1494, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.14771603717619142, + "step": 1494, + "train/total_loss": 0.23217087984085083 + }, + { + "entropy": 9.167590141296387, + "epoch": 0.14781491002570693, + "mean_token_accuracy": 0.7491926550865173, + "num_tokens": 7781215.0, + "step": 1495, + "train/ce_loss": 0.5833946466445923 + }, + { + "epoch": 0.14781491002570693, + "step": 1495, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.14781491002570693, + "step": 1495, + "train/total_loss": 0.144276961684227 + }, + { + "entropy": 9.965858459472656, + "epoch": 0.14791378287522247, + "mean_token_accuracy": 0.7415329813957214, + "num_tokens": 7786166.0, + "step": 1496, + "train/ce_loss": 1.4708634614944458 + }, + { + "epoch": 0.14791378287522247, + "step": 1496, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.14791378287522247, + "step": 1496, + "train/total_loss": 0.19786760210990906 + }, + { + "entropy": 9.608375549316406, + "epoch": 0.14801265572473798, + "mean_token_accuracy": 0.780415415763855, + "num_tokens": 7791279.0, + "step": 1497, + "train/ce_loss": 0.7494889497756958 + }, + { + "epoch": 0.14801265572473798, + "step": 1497, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.14801265572473798, + "step": 1497, + "train/total_loss": 0.11791764944791794 + }, + { + "entropy": 9.619302749633789, + "epoch": 0.1481115285742535, + "mean_token_accuracy": 0.6865671873092651, + "num_tokens": 7796438.0, + "step": 1498, + "train/ce_loss": 1.3919106721878052 + }, + { + "epoch": 0.1481115285742535, + "step": 1498, + "train/sim_loss": 0.16015625 + }, + { + "epoch": 0.1481115285742535, + "step": 1498, + "train/total_loss": 0.2993473410606384 + }, + { + "entropy": 9.500656127929688, + "epoch": 0.14821040142376904, + "mean_token_accuracy": 0.7028796076774597, + "num_tokens": 7801662.0, + "step": 1499, + "train/ce_loss": 1.4153218269348145 + }, + { + "epoch": 0.14821040142376904, + "step": 1499, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.14821040142376904, + "step": 1499, + "train/total_loss": 0.26653218269348145 + }, + { + "epoch": 0.14830927427328455, + "grad_norm": 1.0314209461212158, + "learning_rate": 9.631854818770708e-06, + "loss": 0.1656, + "step": 1500 + }, + { + "entropy": 9.492480278015137, + "epoch": 0.14830927427328455, + "mean_token_accuracy": 0.6932599544525146, + "num_tokens": 7806835.0, + "step": 1500, + "train/ce_loss": 1.1232261657714844 + }, + { + "epoch": 0.14830927427328455, + "step": 1500, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.14830927427328455, + "step": 1500, + "train/total_loss": 0.1631038784980774 + }, + { + "entropy": 9.169363021850586, + "epoch": 0.14840814712280007, + "mean_token_accuracy": 0.7156549692153931, + "num_tokens": 7812264.0, + "step": 1501, + "train/ce_loss": 0.7301574349403381 + }, + { + "epoch": 0.14840814712280007, + "step": 1501, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.14840814712280007, + "step": 1501, + "train/total_loss": 0.1628594994544983 + }, + { + "entropy": 9.594934463500977, + "epoch": 0.1485070199723156, + "mean_token_accuracy": 0.6979332566261292, + "num_tokens": 7817365.0, + "step": 1502, + "train/ce_loss": 2.1555140018463135 + }, + { + "epoch": 0.1485070199723156, + "step": 1502, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.1485070199723156, + "step": 1502, + "train/total_loss": 0.3522701561450958 + }, + { + "entropy": 9.353469848632812, + "epoch": 0.14860589282183112, + "mean_token_accuracy": 0.7362637519836426, + "num_tokens": 7822630.0, + "step": 1503, + "train/ce_loss": 0.7238584756851196 + }, + { + "epoch": 0.14860589282183112, + "step": 1503, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.14860589282183112, + "step": 1503, + "train/total_loss": 0.09582334756851196 + }, + { + "entropy": 9.216747283935547, + "epoch": 0.14870476567134666, + "mean_token_accuracy": 0.7195402383804321, + "num_tokens": 7827948.0, + "step": 1504, + "train/ce_loss": 0.7982300519943237 + }, + { + "epoch": 0.14870476567134666, + "step": 1504, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.14870476567134666, + "step": 1504, + "train/total_loss": 0.11107300966978073 + }, + { + "entropy": 9.300481796264648, + "epoch": 0.14880363852086217, + "mean_token_accuracy": 0.7751938104629517, + "num_tokens": 7833248.0, + "step": 1505, + "train/ce_loss": 1.175078272819519 + }, + { + "epoch": 0.14880363852086217, + "step": 1505, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.14880363852086217, + "step": 1505, + "train/total_loss": 0.21907033026218414 + }, + { + "entropy": 9.822269439697266, + "epoch": 0.1489025113703777, + "mean_token_accuracy": 0.7555555701255798, + "num_tokens": 7838269.0, + "step": 1506, + "train/ce_loss": 1.6059404611587524 + }, + { + "epoch": 0.1489025113703777, + "step": 1506, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.1489025113703777, + "step": 1506, + "train/total_loss": 0.26215654611587524 + }, + { + "entropy": 9.485581398010254, + "epoch": 0.14900138421989323, + "mean_token_accuracy": 0.7326057553291321, + "num_tokens": 7843453.0, + "step": 1507, + "train/ce_loss": 0.951874315738678 + }, + { + "epoch": 0.14900138421989323, + "step": 1507, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.14900138421989323, + "step": 1507, + "train/total_loss": 0.18112492561340332 + }, + { + "entropy": 9.385790824890137, + "epoch": 0.14910025706940874, + "mean_token_accuracy": 0.725824773311615, + "num_tokens": 7848740.0, + "step": 1508, + "train/ce_loss": 1.8204835653305054 + }, + { + "epoch": 0.14910025706940874, + "step": 1508, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.14910025706940874, + "step": 1508, + "train/total_loss": 0.33048588037490845 + }, + { + "entropy": 9.386913299560547, + "epoch": 0.14919912991892426, + "mean_token_accuracy": 0.7376312017440796, + "num_tokens": 7853875.0, + "step": 1509, + "train/ce_loss": 1.0938643217086792 + }, + { + "epoch": 0.14919912991892426, + "step": 1509, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.14919912991892426, + "step": 1509, + "train/total_loss": 0.19923019409179688 + }, + { + "entropy": 9.619256973266602, + "epoch": 0.1492980027684398, + "mean_token_accuracy": 0.7361878156661987, + "num_tokens": 7859076.0, + "step": 1510, + "train/ce_loss": 0.8881609439849854 + }, + { + "epoch": 0.1492980027684398, + "step": 1510, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.1492980027684398, + "step": 1510, + "train/total_loss": 0.1630348563194275 + }, + { + "entropy": 10.16584587097168, + "epoch": 0.1493968756179553, + "mean_token_accuracy": 0.8153846263885498, + "num_tokens": 7863919.0, + "step": 1511, + "train/ce_loss": 2.056551238638349e-05 + }, + { + "epoch": 0.1493968756179553, + "step": 1511, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.1493968756179553, + "step": 1511, + "train/total_loss": 0.05859580636024475 + }, + { + "entropy": 9.185079574584961, + "epoch": 0.14949574846747082, + "mean_token_accuracy": 0.7560647130012512, + "num_tokens": 7869161.0, + "step": 1512, + "train/ce_loss": 0.828264594078064 + }, + { + "epoch": 0.14949574846747082, + "step": 1512, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.14949574846747082, + "step": 1512, + "train/total_loss": 0.18829521536827087 + }, + { + "entropy": 9.522941589355469, + "epoch": 0.14959462131698636, + "mean_token_accuracy": 0.7742424011230469, + "num_tokens": 7874312.0, + "step": 1513, + "train/ce_loss": 0.9248256087303162 + }, + { + "epoch": 0.14959462131698636, + "step": 1513, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.14959462131698636, + "step": 1513, + "train/total_loss": 0.1745138168334961 + }, + { + "entropy": 9.77141284942627, + "epoch": 0.14969349416650188, + "mean_token_accuracy": 0.6744186282157898, + "num_tokens": 7879340.0, + "step": 1514, + "train/ce_loss": 1.1118297576904297 + }, + { + "epoch": 0.14969349416650188, + "step": 1514, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.14969349416650188, + "step": 1514, + "train/total_loss": 0.23618298768997192 + }, + { + "entropy": 9.230764389038086, + "epoch": 0.1497923670160174, + "mean_token_accuracy": 0.691142201423645, + "num_tokens": 7884685.0, + "step": 1515, + "train/ce_loss": 0.7974668741226196 + }, + { + "epoch": 0.1497923670160174, + "step": 1515, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.1497923670160174, + "step": 1515, + "train/total_loss": 0.13443419337272644 + }, + { + "entropy": 9.194846153259277, + "epoch": 0.14989123986553293, + "mean_token_accuracy": 0.7187893986701965, + "num_tokens": 7889955.0, + "step": 1516, + "train/ce_loss": 0.6924360990524292 + }, + { + "epoch": 0.14989123986553293, + "step": 1516, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.14989123986553293, + "step": 1516, + "train/total_loss": 0.13955610990524292 + }, + { + "entropy": 9.383991241455078, + "epoch": 0.14999011271504845, + "mean_token_accuracy": 0.768324613571167, + "num_tokens": 7895187.0, + "step": 1517, + "train/ce_loss": 0.7051076889038086 + }, + { + "epoch": 0.14999011271504845, + "step": 1517, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.14999011271504845, + "step": 1517, + "train/total_loss": 0.14472952485084534 + }, + { + "entropy": 9.848875999450684, + "epoch": 0.15008898556456396, + "mean_token_accuracy": 0.7439446449279785, + "num_tokens": 7900211.0, + "step": 1518, + "train/ce_loss": 0.7502337098121643 + }, + { + "epoch": 0.15008898556456396, + "step": 1518, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.15008898556456396, + "step": 1518, + "train/total_loss": 0.21174213290214539 + }, + { + "entropy": 9.283403396606445, + "epoch": 0.1501878584140795, + "mean_token_accuracy": 0.7236841917037964, + "num_tokens": 7905520.0, + "step": 1519, + "train/ce_loss": 0.9833032488822937 + }, + { + "epoch": 0.1501878584140795, + "step": 1519, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.1501878584140795, + "step": 1519, + "train/total_loss": 0.1803615689277649 + }, + { + "epoch": 0.150286731263595, + "grad_norm": 1.0560803413391113, + "learning_rate": 9.626909954012758e-06, + "loss": 0.1666, + "step": 1520 + }, + { + "entropy": 9.216127395629883, + "epoch": 0.150286731263595, + "mean_token_accuracy": 0.7326343655586243, + "num_tokens": 7910732.0, + "step": 1520, + "train/ce_loss": 0.8644230961799622 + }, + { + "epoch": 0.150286731263595, + "step": 1520, + "train/sim_loss": 0.15234375 + }, + { + "epoch": 0.150286731263595, + "step": 1520, + "train/total_loss": 0.23878607153892517 + }, + { + "entropy": 9.040716171264648, + "epoch": 0.15038560411311053, + "mean_token_accuracy": 0.7021716833114624, + "num_tokens": 7916187.0, + "step": 1521, + "train/ce_loss": 0.9593912363052368 + }, + { + "epoch": 0.15038560411311053, + "step": 1521, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.15038560411311053, + "step": 1521, + "train/total_loss": 0.16625162959098816 + }, + { + "entropy": 9.894933700561523, + "epoch": 0.15048447696262607, + "mean_token_accuracy": 0.7157676219940186, + "num_tokens": 7921098.0, + "step": 1522, + "train/ce_loss": 1.4292757511138916 + }, + { + "epoch": 0.15048447696262607, + "step": 1522, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.15048447696262607, + "step": 1522, + "train/total_loss": 0.2679275870323181 + }, + { + "entropy": 9.750065803527832, + "epoch": 0.15058334981214158, + "mean_token_accuracy": 0.6848030090332031, + "num_tokens": 7926100.0, + "step": 1523, + "train/ce_loss": 1.9158490896224976 + }, + { + "epoch": 0.15058334981214158, + "step": 1523, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.15058334981214158, + "step": 1523, + "train/total_loss": 0.31658491492271423 + }, + { + "entropy": 9.4520902633667, + "epoch": 0.15068222266165712, + "mean_token_accuracy": 0.7044585943222046, + "num_tokens": 7931336.0, + "step": 1524, + "train/ce_loss": 0.9469541311264038 + }, + { + "epoch": 0.15068222266165712, + "step": 1524, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.15068222266165712, + "step": 1524, + "train/total_loss": 0.17282041907310486 + }, + { + "entropy": 9.343910217285156, + "epoch": 0.15078109551117264, + "mean_token_accuracy": 0.7021546363830566, + "num_tokens": 7936514.0, + "step": 1525, + "train/ce_loss": 1.0948412418365479 + }, + { + "epoch": 0.15078109551117264, + "step": 1525, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.15078109551117264, + "step": 1525, + "train/total_loss": 0.18760913610458374 + }, + { + "entropy": 9.896944046020508, + "epoch": 0.15087996836068815, + "mean_token_accuracy": 0.7777777910232544, + "num_tokens": 7941447.0, + "step": 1526, + "train/ce_loss": 0.83757483959198 + }, + { + "epoch": 0.15087996836068815, + "step": 1526, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.15087996836068815, + "step": 1526, + "train/total_loss": 0.14235123991966248 + }, + { + "entropy": 10.245054244995117, + "epoch": 0.1509788412102037, + "mean_token_accuracy": 0.7386363744735718, + "num_tokens": 7946249.0, + "step": 1527, + "train/ce_loss": 1.0803058103192598e-05 + }, + { + "epoch": 0.1509788412102037, + "step": 1527, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.1509788412102037, + "step": 1527, + "train/total_loss": 0.039063580334186554 + }, + { + "entropy": 9.269997596740723, + "epoch": 0.1510777140597192, + "mean_token_accuracy": 0.6519208550453186, + "num_tokens": 7951606.0, + "step": 1528, + "train/ce_loss": 0.794880747795105 + }, + { + "epoch": 0.1510777140597192, + "step": 1528, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.1510777140597192, + "step": 1528, + "train/total_loss": 0.14589431881904602 + }, + { + "entropy": 9.347448348999023, + "epoch": 0.15117658690923472, + "mean_token_accuracy": 0.7196382284164429, + "num_tokens": 7956820.0, + "step": 1529, + "train/ce_loss": 0.5854206681251526 + }, + { + "epoch": 0.15117658690923472, + "step": 1529, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.15117658690923472, + "step": 1529, + "train/total_loss": 0.12494832277297974 + }, + { + "entropy": 9.150819778442383, + "epoch": 0.15127545975875026, + "mean_token_accuracy": 0.7061556577682495, + "num_tokens": 7962135.0, + "step": 1530, + "train/ce_loss": 1.0358163118362427 + }, + { + "epoch": 0.15127545975875026, + "step": 1530, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.15127545975875026, + "step": 1530, + "train/total_loss": 0.19733163714408875 + }, + { + "entropy": 9.922235488891602, + "epoch": 0.15137433260826577, + "mean_token_accuracy": 0.7366412281990051, + "num_tokens": 7967078.0, + "step": 1531, + "train/ce_loss": 1.2564538717269897 + }, + { + "epoch": 0.15137433260826577, + "step": 1531, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.15137433260826577, + "step": 1531, + "train/total_loss": 0.20377038419246674 + }, + { + "entropy": 8.938532829284668, + "epoch": 0.15147320545778128, + "mean_token_accuracy": 0.7037383317947388, + "num_tokens": 7972631.0, + "step": 1532, + "train/ce_loss": 0.841424286365509 + }, + { + "epoch": 0.15147320545778128, + "step": 1532, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.15147320545778128, + "step": 1532, + "train/total_loss": 0.17398618161678314 + }, + { + "entropy": 9.842126846313477, + "epoch": 0.15157207830729683, + "mean_token_accuracy": 0.7336522936820984, + "num_tokens": 7977858.0, + "step": 1533, + "train/ce_loss": 8.909573807613924e-06 + }, + { + "epoch": 0.15157207830729683, + "step": 1533, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.15157207830729683, + "step": 1533, + "train/total_loss": 0.03125089034438133 + }, + { + "entropy": 9.684154510498047, + "epoch": 0.15167095115681234, + "mean_token_accuracy": 0.7536231875419617, + "num_tokens": 7982805.0, + "step": 1534, + "train/ce_loss": 4.817128137801774e-05 + }, + { + "epoch": 0.15167095115681234, + "step": 1534, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.15167095115681234, + "step": 1534, + "train/total_loss": 0.05859856680035591 + }, + { + "entropy": 9.907486915588379, + "epoch": 0.15176982400632785, + "mean_token_accuracy": 0.7193675637245178, + "num_tokens": 7987813.0, + "step": 1535, + "train/ce_loss": 1.0405404282209929e-05 + }, + { + "epoch": 0.15176982400632785, + "step": 1535, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.15176982400632785, + "step": 1535, + "train/total_loss": 0.07812604308128357 + }, + { + "entropy": 9.681005477905273, + "epoch": 0.1518686968558434, + "mean_token_accuracy": 0.6660412549972534, + "num_tokens": 7992807.0, + "step": 1536, + "train/ce_loss": 3.722548353835009e-05 + }, + { + "epoch": 0.1518686968558434, + "step": 1536, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.1518686968558434, + "step": 1536, + "train/total_loss": 0.07031622529029846 + }, + { + "entropy": 9.169857025146484, + "epoch": 0.1519675697053589, + "mean_token_accuracy": 0.7594936490058899, + "num_tokens": 7998059.0, + "step": 1537, + "train/ce_loss": 0.584730863571167 + }, + { + "epoch": 0.1519675697053589, + "step": 1537, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.1519675697053589, + "step": 1537, + "train/total_loss": 0.12097308784723282 + }, + { + "entropy": 9.898794174194336, + "epoch": 0.15206644255487442, + "mean_token_accuracy": 0.7634854912757874, + "num_tokens": 8002938.0, + "step": 1538, + "train/ce_loss": 0.8556390404701233 + }, + { + "epoch": 0.15206644255487442, + "step": 1538, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.15206644255487442, + "step": 1538, + "train/total_loss": 0.13634514808654785 + }, + { + "entropy": 9.214766502380371, + "epoch": 0.15216531540438996, + "mean_token_accuracy": 0.7001166939735413, + "num_tokens": 8008306.0, + "step": 1539, + "train/ce_loss": 7.306869520107284e-06 + }, + { + "epoch": 0.15216531540438996, + "step": 1539, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.15216531540438996, + "step": 1539, + "train/total_loss": 0.0468757301568985 + }, + { + "epoch": 0.15226418825390547, + "grad_norm": 1.021117091178894, + "learning_rate": 9.62196508925481e-06, + "loss": 0.1727, + "step": 1540 + }, + { + "entropy": 9.442438125610352, + "epoch": 0.15226418825390547, + "mean_token_accuracy": 0.752077579498291, + "num_tokens": 8013523.0, + "step": 1540, + "train/ce_loss": 0.7084006667137146 + }, + { + "epoch": 0.15226418825390547, + "step": 1540, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.15226418825390547, + "step": 1540, + "train/total_loss": 0.16068381071090698 + }, + { + "entropy": 9.875503540039062, + "epoch": 0.152363061103421, + "mean_token_accuracy": 0.80694979429245, + "num_tokens": 8018507.0, + "step": 1541, + "train/ce_loss": 8.61121679918142e-06 + }, + { + "epoch": 0.152363061103421, + "step": 1541, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.152363061103421, + "step": 1541, + "train/total_loss": 0.08593836426734924 + }, + { + "entropy": 9.098029136657715, + "epoch": 0.15246193395293653, + "mean_token_accuracy": 0.6898002028465271, + "num_tokens": 8023943.0, + "step": 1542, + "train/ce_loss": 0.507659375667572 + }, + { + "epoch": 0.15246193395293653, + "step": 1542, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.15246193395293653, + "step": 1542, + "train/total_loss": 0.14842218160629272 + }, + { + "entropy": 9.00258731842041, + "epoch": 0.15256080680245204, + "mean_token_accuracy": 0.739393949508667, + "num_tokens": 8029469.0, + "step": 1543, + "train/ce_loss": 0.696696937084198 + }, + { + "epoch": 0.15256080680245204, + "step": 1543, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.15256080680245204, + "step": 1543, + "train/total_loss": 0.2102946937084198 + }, + { + "entropy": 9.308021545410156, + "epoch": 0.15265967965196756, + "mean_token_accuracy": 0.7641866207122803, + "num_tokens": 8034739.0, + "step": 1544, + "train/ce_loss": 0.6892697811126709 + }, + { + "epoch": 0.15265967965196756, + "step": 1544, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.15265967965196756, + "step": 1544, + "train/total_loss": 0.10017698258161545 + }, + { + "entropy": 9.620084762573242, + "epoch": 0.1527585525014831, + "mean_token_accuracy": 0.7102649211883545, + "num_tokens": 8039836.0, + "step": 1545, + "train/ce_loss": 1.0270184247929137e-05 + }, + { + "epoch": 0.1527585525014831, + "step": 1545, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.1527585525014831, + "step": 1545, + "train/total_loss": 0.09375102818012238 + }, + { + "entropy": 9.019035339355469, + "epoch": 0.1528574253509986, + "mean_token_accuracy": 0.688095211982727, + "num_tokens": 8045133.0, + "step": 1546, + "train/ce_loss": 0.926342248916626 + }, + { + "epoch": 0.1528574253509986, + "step": 1546, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.1528574253509986, + "step": 1546, + "train/total_loss": 0.15122798085212708 + }, + { + "entropy": 10.257822036743164, + "epoch": 0.15295629820051415, + "mean_token_accuracy": 0.7606837749481201, + "num_tokens": 8049916.0, + "step": 1547, + "train/ce_loss": 1.0292736291885376 + }, + { + "epoch": 0.15295629820051415, + "step": 1547, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.15295629820051415, + "step": 1547, + "train/total_loss": 0.18105235695838928 + }, + { + "entropy": 9.112434387207031, + "epoch": 0.15305517105002966, + "mean_token_accuracy": 0.7713310718536377, + "num_tokens": 8055255.0, + "step": 1548, + "train/ce_loss": 0.35855820775032043 + }, + { + "epoch": 0.15305517105002966, + "step": 1548, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.15305517105002966, + "step": 1548, + "train/total_loss": 0.08663707226514816 + }, + { + "entropy": 9.885303497314453, + "epoch": 0.15315404389954518, + "mean_token_accuracy": 0.7345890402793884, + "num_tokens": 8060292.0, + "step": 1549, + "train/ce_loss": 1.0865520238876343 + }, + { + "epoch": 0.15315404389954518, + "step": 1549, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.15315404389954518, + "step": 1549, + "train/total_loss": 0.20631146430969238 + }, + { + "entropy": 9.570383071899414, + "epoch": 0.15325291674906072, + "mean_token_accuracy": 0.7030625939369202, + "num_tokens": 8065463.0, + "step": 1550, + "train/ce_loss": 1.4615740776062012 + }, + { + "epoch": 0.15325291674906072, + "step": 1550, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.15325291674906072, + "step": 1550, + "train/total_loss": 0.1969386637210846 + }, + { + "entropy": 9.84042739868164, + "epoch": 0.15335178959857623, + "mean_token_accuracy": 0.7349823117256165, + "num_tokens": 8070463.0, + "step": 1551, + "train/ce_loss": 1.0207172632217407 + }, + { + "epoch": 0.15335178959857623, + "step": 1551, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.15335178959857623, + "step": 1551, + "train/total_loss": 0.15675923228263855 + }, + { + "entropy": 9.249610900878906, + "epoch": 0.15345066244809175, + "mean_token_accuracy": 0.7213114500045776, + "num_tokens": 8075756.0, + "step": 1552, + "train/ce_loss": 1.1852610111236572 + }, + { + "epoch": 0.15345066244809175, + "step": 1552, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.15345066244809175, + "step": 1552, + "train/total_loss": 0.22790110111236572 + }, + { + "entropy": 9.230417251586914, + "epoch": 0.1535495352976073, + "mean_token_accuracy": 0.6813187003135681, + "num_tokens": 8081026.0, + "step": 1553, + "train/ce_loss": 1.0361573696136475 + }, + { + "epoch": 0.1535495352976073, + "step": 1553, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.1535495352976073, + "step": 1553, + "train/total_loss": 0.15439698100090027 + }, + { + "entropy": 9.386066436767578, + "epoch": 0.1536484081471228, + "mean_token_accuracy": 0.7218710780143738, + "num_tokens": 8086245.0, + "step": 1554, + "train/ce_loss": 0.7083413600921631 + }, + { + "epoch": 0.1536484081471228, + "step": 1554, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.1536484081471228, + "step": 1554, + "train/total_loss": 0.14505288004875183 + }, + { + "entropy": 9.255256652832031, + "epoch": 0.1537472809966383, + "mean_token_accuracy": 0.7347931861877441, + "num_tokens": 8091542.0, + "step": 1555, + "train/ce_loss": 1.0050069093704224 + }, + { + "epoch": 0.1537472809966383, + "step": 1555, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.1537472809966383, + "step": 1555, + "train/total_loss": 0.2372194528579712 + }, + { + "entropy": 9.164133071899414, + "epoch": 0.15384615384615385, + "mean_token_accuracy": 0.6586695909500122, + "num_tokens": 8096957.0, + "step": 1556, + "train/ce_loss": 2.3513078689575195 + }, + { + "epoch": 0.15384615384615385, + "step": 1556, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.15384615384615385, + "step": 1556, + "train/total_loss": 0.36794328689575195 + }, + { + "entropy": 9.656140327453613, + "epoch": 0.15394502669566937, + "mean_token_accuracy": 0.7245222926139832, + "num_tokens": 8102034.0, + "step": 1557, + "train/ce_loss": 1.2057785987854004 + }, + { + "epoch": 0.15394502669566937, + "step": 1557, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.15394502669566937, + "step": 1557, + "train/total_loss": 0.198702871799469 + }, + { + "entropy": 9.500467300415039, + "epoch": 0.15404389954518488, + "mean_token_accuracy": 0.7125827670097351, + "num_tokens": 8107382.0, + "step": 1558, + "train/ce_loss": 0.7562654614448547 + }, + { + "epoch": 0.15404389954518488, + "step": 1558, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.15404389954518488, + "step": 1558, + "train/total_loss": 0.12640780210494995 + }, + { + "entropy": 8.847757339477539, + "epoch": 0.15414277239470042, + "mean_token_accuracy": 0.6982097029685974, + "num_tokens": 8112637.0, + "step": 1559, + "train/ce_loss": 1.044164776802063 + }, + { + "epoch": 0.15414277239470042, + "step": 1559, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.15414277239470042, + "step": 1559, + "train/total_loss": 0.18254148960113525 + }, + { + "epoch": 0.15424164524421594, + "grad_norm": 1.083085060119629, + "learning_rate": 9.617020224496861e-06, + "loss": 0.1719, + "step": 1560 + }, + { + "entropy": 10.149479866027832, + "epoch": 0.15424164524421594, + "mean_token_accuracy": 0.7255370020866394, + "num_tokens": 8117514.0, + "step": 1560, + "train/ce_loss": 2.61259210674325e-05 + }, + { + "epoch": 0.15424164524421594, + "step": 1560, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.15424164524421594, + "step": 1560, + "train/total_loss": 0.06250261515378952 + }, + { + "entropy": 9.694318771362305, + "epoch": 0.15434051809373145, + "mean_token_accuracy": 0.6761363744735718, + "num_tokens": 8122653.0, + "step": 1561, + "train/ce_loss": 1.5257987976074219 + }, + { + "epoch": 0.15434051809373145, + "step": 1561, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.15434051809373145, + "step": 1561, + "train/total_loss": 0.1955486387014389 + }, + { + "entropy": 9.275548934936523, + "epoch": 0.154439390943247, + "mean_token_accuracy": 0.6985645890235901, + "num_tokens": 8127997.0, + "step": 1562, + "train/ce_loss": 1.181897759437561 + }, + { + "epoch": 0.154439390943247, + "step": 1562, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.154439390943247, + "step": 1562, + "train/total_loss": 0.23147103190422058 + }, + { + "entropy": 9.00159740447998, + "epoch": 0.1545382637927625, + "mean_token_accuracy": 0.7092896103858948, + "num_tokens": 8133435.0, + "step": 1563, + "train/ce_loss": 1.2038424015045166 + }, + { + "epoch": 0.1545382637927625, + "step": 1563, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.1545382637927625, + "step": 1563, + "train/total_loss": 0.19069674611091614 + }, + { + "entropy": 9.292047500610352, + "epoch": 0.15463713664227802, + "mean_token_accuracy": 0.7600979208946228, + "num_tokens": 8138693.0, + "step": 1564, + "train/ce_loss": 1.0820578336715698 + }, + { + "epoch": 0.15463713664227802, + "step": 1564, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.15463713664227802, + "step": 1564, + "train/total_loss": 0.16289329528808594 + }, + { + "entropy": 9.516674995422363, + "epoch": 0.15473600949179356, + "mean_token_accuracy": 0.7142857313156128, + "num_tokens": 8143884.0, + "step": 1565, + "train/ce_loss": 0.7768955230712891 + }, + { + "epoch": 0.15473600949179356, + "step": 1565, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.15473600949179356, + "step": 1565, + "train/total_loss": 0.16362705826759338 + }, + { + "entropy": 9.774168968200684, + "epoch": 0.15483488234130907, + "mean_token_accuracy": 0.7204724550247192, + "num_tokens": 8148876.0, + "step": 1566, + "train/ce_loss": 1.6146434545516968 + }, + { + "epoch": 0.15483488234130907, + "step": 1566, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.15483488234130907, + "step": 1566, + "train/total_loss": 0.2552143335342407 + }, + { + "entropy": 9.159017562866211, + "epoch": 0.1549337551908246, + "mean_token_accuracy": 0.7898627519607544, + "num_tokens": 8154290.0, + "step": 1567, + "train/ce_loss": 0.40811553597450256 + }, + { + "epoch": 0.1549337551908246, + "step": 1567, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.1549337551908246, + "step": 1567, + "train/total_loss": 0.09159280359745026 + }, + { + "entropy": 9.256782531738281, + "epoch": 0.15503262804034013, + "mean_token_accuracy": 0.6602316498756409, + "num_tokens": 8159488.0, + "step": 1568, + "train/ce_loss": 1.8952783346176147 + }, + { + "epoch": 0.15503262804034013, + "step": 1568, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.15503262804034013, + "step": 1568, + "train/total_loss": 0.30671533942222595 + }, + { + "entropy": 9.015083312988281, + "epoch": 0.15513150088985564, + "mean_token_accuracy": 0.7527352571487427, + "num_tokens": 8164885.0, + "step": 1569, + "train/ce_loss": 0.5627985000610352 + }, + { + "epoch": 0.15513150088985564, + "step": 1569, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.15513150088985564, + "step": 1569, + "train/total_loss": 0.15393610298633575 + }, + { + "entropy": 9.708174705505371, + "epoch": 0.15523037373937118, + "mean_token_accuracy": 0.7035398483276367, + "num_tokens": 8170002.0, + "step": 1570, + "train/ce_loss": 8.378126949537545e-06 + }, + { + "epoch": 0.15523037373937118, + "step": 1570, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.15523037373937118, + "step": 1570, + "train/total_loss": 0.039063338190317154 + }, + { + "entropy": 10.102365493774414, + "epoch": 0.1553292465888867, + "mean_token_accuracy": 0.7489539980888367, + "num_tokens": 8174859.0, + "step": 1571, + "train/ce_loss": 2.2206978797912598 + }, + { + "epoch": 0.1553292465888867, + "step": 1571, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.1553292465888867, + "step": 1571, + "train/total_loss": 0.31191354990005493 + }, + { + "entropy": 9.020883560180664, + "epoch": 0.1554281194384022, + "mean_token_accuracy": 0.7104121446609497, + "num_tokens": 8180256.0, + "step": 1572, + "train/ce_loss": 0.5232083797454834 + }, + { + "epoch": 0.1554281194384022, + "step": 1572, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.1554281194384022, + "step": 1572, + "train/total_loss": 0.09919583797454834 + }, + { + "entropy": 9.084095001220703, + "epoch": 0.15552699228791775, + "mean_token_accuracy": 0.8171926140785217, + "num_tokens": 8185667.0, + "step": 1573, + "train/ce_loss": 0.41737455129623413 + }, + { + "epoch": 0.15552699228791775, + "step": 1573, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.15552699228791775, + "step": 1573, + "train/total_loss": 0.07298745214939117 + }, + { + "entropy": 9.414447784423828, + "epoch": 0.15562586513743326, + "mean_token_accuracy": 0.7877551317214966, + "num_tokens": 8190897.0, + "step": 1574, + "train/ce_loss": 0.43452367186546326 + }, + { + "epoch": 0.15562586513743326, + "step": 1574, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.15562586513743326, + "step": 1574, + "train/total_loss": 0.09813986718654633 + }, + { + "entropy": 9.340505599975586, + "epoch": 0.15572473798694877, + "mean_token_accuracy": 0.7110552787780762, + "num_tokens": 8196141.0, + "step": 1575, + "train/ce_loss": 1.0214818716049194 + }, + { + "epoch": 0.15572473798694877, + "step": 1575, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.15572473798694877, + "step": 1575, + "train/total_loss": 0.14121069014072418 + }, + { + "entropy": 9.347097396850586, + "epoch": 0.15582361083646432, + "mean_token_accuracy": 0.7240506410598755, + "num_tokens": 8201409.0, + "step": 1576, + "train/ce_loss": 0.595423698425293 + }, + { + "epoch": 0.15582361083646432, + "step": 1576, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.15582361083646432, + "step": 1576, + "train/total_loss": 0.09469862282276154 + }, + { + "entropy": 9.248504638671875, + "epoch": 0.15592248368597983, + "mean_token_accuracy": 0.7483588457107544, + "num_tokens": 8206760.0, + "step": 1577, + "train/ce_loss": 0.6549820303916931 + }, + { + "epoch": 0.15592248368597983, + "step": 1577, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.15592248368597983, + "step": 1577, + "train/total_loss": 0.1358107030391693 + }, + { + "entropy": 9.31238079071045, + "epoch": 0.15602135653549534, + "mean_token_accuracy": 0.6877370476722717, + "num_tokens": 8212045.0, + "step": 1578, + "train/ce_loss": 0.9893155097961426 + }, + { + "epoch": 0.15602135653549534, + "step": 1578, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.15602135653549534, + "step": 1578, + "train/total_loss": 0.20830655097961426 + }, + { + "entropy": 9.541158676147461, + "epoch": 0.15612022938501088, + "mean_token_accuracy": 0.7480106353759766, + "num_tokens": 8217402.0, + "step": 1579, + "train/ce_loss": 0.5257939100265503 + }, + { + "epoch": 0.15612022938501088, + "step": 1579, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.15612022938501088, + "step": 1579, + "train/total_loss": 0.1385168880224228 + }, + { + "epoch": 0.1562191022345264, + "grad_norm": 0.8479238748550415, + "learning_rate": 9.612075359738912e-06, + "loss": 0.1645, + "step": 1580 + }, + { + "entropy": 8.832939147949219, + "epoch": 0.1562191022345264, + "mean_token_accuracy": 0.7540687322616577, + "num_tokens": 8222972.0, + "step": 1580, + "train/ce_loss": 1.0235515832901 + }, + { + "epoch": 0.1562191022345264, + "step": 1580, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.1562191022345264, + "step": 1580, + "train/total_loss": 0.20391765236854553 + }, + { + "entropy": 9.819954872131348, + "epoch": 0.1563179750840419, + "mean_token_accuracy": 0.7840909361839294, + "num_tokens": 8228118.0, + "step": 1581, + "train/ce_loss": 7.304859536816366e-06 + }, + { + "epoch": 0.1563179750840419, + "step": 1581, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.1563179750840419, + "step": 1581, + "train/total_loss": 0.0742194801568985 + }, + { + "entropy": 9.163996696472168, + "epoch": 0.15641684793355745, + "mean_token_accuracy": 0.7970521450042725, + "num_tokens": 8233462.0, + "step": 1582, + "train/ce_loss": 1.022832020680653e-05 + }, + { + "epoch": 0.15641684793355745, + "step": 1582, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.15641684793355745, + "step": 1582, + "train/total_loss": 0.05468852445483208 + }, + { + "entropy": 9.698354721069336, + "epoch": 0.15651572078307296, + "mean_token_accuracy": 0.7314662337303162, + "num_tokens": 8238506.0, + "step": 1583, + "train/ce_loss": 1.5570802133879624e-05 + }, + { + "epoch": 0.15651572078307296, + "step": 1583, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.15651572078307296, + "step": 1583, + "train/total_loss": 0.05078280717134476 + }, + { + "entropy": 9.24047565460205, + "epoch": 0.15661459363258848, + "mean_token_accuracy": 0.7287024855613708, + "num_tokens": 8243759.0, + "step": 1584, + "train/ce_loss": 1.1748043298721313 + }, + { + "epoch": 0.15661459363258848, + "step": 1584, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.15661459363258848, + "step": 1584, + "train/total_loss": 0.22294917702674866 + }, + { + "entropy": 9.562520980834961, + "epoch": 0.15671346648210402, + "mean_token_accuracy": 0.7647058963775635, + "num_tokens": 8248901.0, + "step": 1585, + "train/ce_loss": 0.8184059858322144 + }, + { + "epoch": 0.15671346648210402, + "step": 1585, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.15671346648210402, + "step": 1585, + "train/total_loss": 0.2029343545436859 + }, + { + "entropy": 9.282878875732422, + "epoch": 0.15681233933161953, + "mean_token_accuracy": 0.7263017296791077, + "num_tokens": 8254067.0, + "step": 1586, + "train/ce_loss": 0.4955042600631714 + }, + { + "epoch": 0.15681233933161953, + "step": 1586, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.15681233933161953, + "step": 1586, + "train/total_loss": 0.13548792898654938 + }, + { + "entropy": 9.205556869506836, + "epoch": 0.15691121218113507, + "mean_token_accuracy": 0.7452085614204407, + "num_tokens": 8259383.0, + "step": 1587, + "train/ce_loss": 1.2147839069366455 + }, + { + "epoch": 0.15691121218113507, + "step": 1587, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.15691121218113507, + "step": 1587, + "train/total_loss": 0.1527283936738968 + }, + { + "entropy": 9.279157638549805, + "epoch": 0.1570100850306506, + "mean_token_accuracy": 0.7449275255203247, + "num_tokens": 8264532.0, + "step": 1588, + "train/ce_loss": 0.5279999375343323 + }, + { + "epoch": 0.1570100850306506, + "step": 1588, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.1570100850306506, + "step": 1588, + "train/total_loss": 0.1270187497138977 + }, + { + "entropy": 9.94023609161377, + "epoch": 0.1571089578801661, + "mean_token_accuracy": 0.7112970948219299, + "num_tokens": 8269447.0, + "step": 1589, + "train/ce_loss": 1.4001447198097594e-05 + }, + { + "epoch": 0.1571089578801661, + "step": 1589, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.1571089578801661, + "step": 1589, + "train/total_loss": 0.03515765070915222 + }, + { + "entropy": 9.349342346191406, + "epoch": 0.15720783072968164, + "mean_token_accuracy": 0.7328145503997803, + "num_tokens": 8274737.0, + "step": 1590, + "train/ce_loss": 1.1686177253723145 + }, + { + "epoch": 0.15720783072968164, + "step": 1590, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.15720783072968164, + "step": 1590, + "train/total_loss": 0.19498677551746368 + }, + { + "entropy": 9.330066680908203, + "epoch": 0.15730670357919715, + "mean_token_accuracy": 0.7281795740127563, + "num_tokens": 8279977.0, + "step": 1591, + "train/ce_loss": 0.6450347304344177 + }, + { + "epoch": 0.15730670357919715, + "step": 1591, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.15730670357919715, + "step": 1591, + "train/total_loss": 0.154347226023674 + }, + { + "entropy": 9.549765586853027, + "epoch": 0.15740557642871267, + "mean_token_accuracy": 0.7203166484832764, + "num_tokens": 8285156.0, + "step": 1592, + "train/ce_loss": 0.5984858274459839 + }, + { + "epoch": 0.15740557642871267, + "step": 1592, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.15740557642871267, + "step": 1592, + "train/total_loss": 0.11453608423471451 + }, + { + "entropy": 9.196601867675781, + "epoch": 0.1575044492782282, + "mean_token_accuracy": 0.8042269349098206, + "num_tokens": 8290532.0, + "step": 1593, + "train/ce_loss": 0.6578105688095093 + }, + { + "epoch": 0.1575044492782282, + "step": 1593, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.1575044492782282, + "step": 1593, + "train/total_loss": 0.17124980688095093 + }, + { + "entropy": 8.923377990722656, + "epoch": 0.15760332212774372, + "mean_token_accuracy": 0.738231897354126, + "num_tokens": 8295923.0, + "step": 1594, + "train/ce_loss": 0.9051287770271301 + }, + { + "epoch": 0.15760332212774372, + "step": 1594, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.15760332212774372, + "step": 1594, + "train/total_loss": 0.16082537174224854 + }, + { + "entropy": 10.551109313964844, + "epoch": 0.15770219497725924, + "mean_token_accuracy": 0.7696969509124756, + "num_tokens": 8300451.0, + "step": 1595, + "train/ce_loss": 2.871335527743213e-05 + }, + { + "epoch": 0.15770219497725924, + "step": 1595, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.15770219497725924, + "step": 1595, + "train/total_loss": 0.07812786847352982 + }, + { + "entropy": 9.211976051330566, + "epoch": 0.15780106782677478, + "mean_token_accuracy": 0.7874564528465271, + "num_tokens": 8305782.0, + "step": 1596, + "train/ce_loss": 0.6386144757270813 + }, + { + "epoch": 0.15780106782677478, + "step": 1596, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.15780106782677478, + "step": 1596, + "train/total_loss": 0.09120520204305649 + }, + { + "entropy": 9.565742492675781, + "epoch": 0.1578999406762903, + "mean_token_accuracy": 0.7011308670043945, + "num_tokens": 8311013.0, + "step": 1597, + "train/ce_loss": 1.6720161437988281 + }, + { + "epoch": 0.1578999406762903, + "step": 1597, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.1578999406762903, + "step": 1597, + "train/total_loss": 0.2570453882217407 + }, + { + "entropy": 8.96369743347168, + "epoch": 0.1579988135258058, + "mean_token_accuracy": 0.7186788320541382, + "num_tokens": 8316396.0, + "step": 1598, + "train/ce_loss": 1.4170739650726318 + }, + { + "epoch": 0.1579988135258058, + "step": 1598, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.1579988135258058, + "step": 1598, + "train/total_loss": 0.2237386554479599 + }, + { + "entropy": 9.431184768676758, + "epoch": 0.15809768637532134, + "mean_token_accuracy": 0.6938202381134033, + "num_tokens": 8321563.0, + "step": 1599, + "train/ce_loss": 0.6445886492729187 + }, + { + "epoch": 0.15809768637532134, + "step": 1599, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.15809768637532134, + "step": 1599, + "train/total_loss": 0.15430262684822083 + }, + { + "epoch": 0.15819655922483686, + "grad_norm": 0.9533355832099915, + "learning_rate": 9.607130494980962e-06, + "loss": 0.1562, + "step": 1600 + }, + { + "entropy": 8.762441635131836, + "epoch": 0.15819655922483686, + "mean_token_accuracy": 0.700095534324646, + "num_tokens": 8327160.0, + "step": 1600, + "train/ce_loss": 0.7110863327980042 + }, + { + "epoch": 0.15819655922483686, + "step": 1600, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.15819655922483686, + "step": 1600, + "train/total_loss": 0.1648586392402649 + }, + { + "entropy": 9.795531272888184, + "epoch": 0.15829543207435237, + "mean_token_accuracy": 0.7343234419822693, + "num_tokens": 8332145.0, + "step": 1601, + "train/ce_loss": 0.9776668548583984 + }, + { + "epoch": 0.15829543207435237, + "step": 1601, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.15829543207435237, + "step": 1601, + "train/total_loss": 0.1680791974067688 + }, + { + "entropy": 9.971131324768066, + "epoch": 0.1583943049238679, + "mean_token_accuracy": 0.7102272510528564, + "num_tokens": 8337102.0, + "step": 1602, + "train/ce_loss": 8.836418601276819e-06 + }, + { + "epoch": 0.1583943049238679, + "step": 1602, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.1583943049238679, + "step": 1602, + "train/total_loss": 0.07421963661909103 + }, + { + "entropy": 9.375772476196289, + "epoch": 0.15849317777338343, + "mean_token_accuracy": 0.7432432174682617, + "num_tokens": 8342371.0, + "step": 1603, + "train/ce_loss": 0.6301258206367493 + }, + { + "epoch": 0.15849317777338343, + "step": 1603, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.15849317777338343, + "step": 1603, + "train/total_loss": 0.09426258504390717 + }, + { + "entropy": 9.804561614990234, + "epoch": 0.15859205062289894, + "mean_token_accuracy": 0.7110389471054077, + "num_tokens": 8347419.0, + "step": 1604, + "train/ce_loss": 1.3388265371322632 + }, + { + "epoch": 0.15859205062289894, + "step": 1604, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.15859205062289894, + "step": 1604, + "train/total_loss": 0.22763265669345856 + }, + { + "entropy": 9.409191131591797, + "epoch": 0.15869092347241448, + "mean_token_accuracy": 0.8163539171218872, + "num_tokens": 8352638.0, + "step": 1605, + "train/ce_loss": 6.432980626414064e-06 + }, + { + "epoch": 0.15869092347241448, + "step": 1605, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.15869092347241448, + "step": 1605, + "train/total_loss": 0.10156314074993134 + }, + { + "entropy": 9.398286819458008, + "epoch": 0.15878979632193, + "mean_token_accuracy": 0.7496706247329712, + "num_tokens": 8357890.0, + "step": 1606, + "train/ce_loss": 0.6533154845237732 + }, + { + "epoch": 0.15878979632193, + "step": 1606, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.15878979632193, + "step": 1606, + "train/total_loss": 0.16689404845237732 + }, + { + "entropy": 9.626225471496582, + "epoch": 0.15888866917144553, + "mean_token_accuracy": 0.7177914381027222, + "num_tokens": 8362990.0, + "step": 1607, + "train/ce_loss": 0.9942994713783264 + }, + { + "epoch": 0.15888866917144553, + "step": 1607, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.15888866917144553, + "step": 1607, + "train/total_loss": 0.16583620011806488 + }, + { + "entropy": 9.111536026000977, + "epoch": 0.15898754202096105, + "mean_token_accuracy": 0.720652163028717, + "num_tokens": 8368387.0, + "step": 1608, + "train/ce_loss": 0.5740724205970764 + }, + { + "epoch": 0.15898754202096105, + "step": 1608, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.15898754202096105, + "step": 1608, + "train/total_loss": 0.08865724503993988 + }, + { + "entropy": 9.151995658874512, + "epoch": 0.15908641487047656, + "mean_token_accuracy": 0.7651006579399109, + "num_tokens": 8373779.0, + "step": 1609, + "train/ce_loss": 0.744520366191864 + }, + { + "epoch": 0.15908641487047656, + "step": 1609, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.15908641487047656, + "step": 1609, + "train/total_loss": 0.14476454257965088 + }, + { + "entropy": 9.361624717712402, + "epoch": 0.1591852877199921, + "mean_token_accuracy": 0.7044673562049866, + "num_tokens": 8379120.0, + "step": 1610, + "train/ce_loss": 0.8378784656524658 + }, + { + "epoch": 0.1591852877199921, + "step": 1610, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.1591852877199921, + "step": 1610, + "train/total_loss": 0.14628785848617554 + }, + { + "entropy": 9.501016616821289, + "epoch": 0.15928416056950762, + "mean_token_accuracy": 0.6928374767303467, + "num_tokens": 8384368.0, + "step": 1611, + "train/ce_loss": 1.7459763288497925 + }, + { + "epoch": 0.15928416056950762, + "step": 1611, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.15928416056950762, + "step": 1611, + "train/total_loss": 0.2449101358652115 + }, + { + "entropy": 9.5258207321167, + "epoch": 0.15938303341902313, + "mean_token_accuracy": 0.7316341996192932, + "num_tokens": 8389514.0, + "step": 1612, + "train/ce_loss": 0.6329296231269836 + }, + { + "epoch": 0.15938303341902313, + "step": 1612, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.15938303341902313, + "step": 1612, + "train/total_loss": 0.1414179652929306 + }, + { + "entropy": 9.500965118408203, + "epoch": 0.15948190626853867, + "mean_token_accuracy": 0.7055057883262634, + "num_tokens": 8394754.0, + "step": 1613, + "train/ce_loss": 0.66053307056427 + }, + { + "epoch": 0.15948190626853867, + "step": 1613, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.15948190626853867, + "step": 1613, + "train/total_loss": 0.12464705854654312 + }, + { + "entropy": 9.78488540649414, + "epoch": 0.15958077911805418, + "mean_token_accuracy": 0.6838235259056091, + "num_tokens": 8399725.0, + "step": 1614, + "train/ce_loss": 5.6860840231820475e-06 + }, + { + "epoch": 0.15958077911805418, + "step": 1614, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.15958077911805418, + "step": 1614, + "train/total_loss": 0.054688069969415665 + }, + { + "entropy": 9.601234436035156, + "epoch": 0.1596796519675697, + "mean_token_accuracy": 0.7455138564109802, + "num_tokens": 8404824.0, + "step": 1615, + "train/ce_loss": 1.4592608213424683 + }, + { + "epoch": 0.1596796519675697, + "step": 1615, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.1596796519675697, + "step": 1615, + "train/total_loss": 0.1928010880947113 + }, + { + "entropy": 9.11591911315918, + "epoch": 0.15977852481708524, + "mean_token_accuracy": 0.7567287683486938, + "num_tokens": 8410279.0, + "step": 1616, + "train/ce_loss": 1.1196359395980835 + }, + { + "epoch": 0.15977852481708524, + "step": 1616, + "train/sim_loss": 0.16796875 + }, + { + "epoch": 0.15977852481708524, + "step": 1616, + "train/total_loss": 0.2799323499202728 + }, + { + "entropy": 9.921876907348633, + "epoch": 0.15987739766660075, + "mean_token_accuracy": 0.7521968483924866, + "num_tokens": 8415285.0, + "step": 1617, + "train/ce_loss": 0.8378710746765137 + }, + { + "epoch": 0.15987739766660075, + "step": 1617, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.15987739766660075, + "step": 1617, + "train/total_loss": 0.14238086342811584 + }, + { + "entropy": 9.652484893798828, + "epoch": 0.15997627051611626, + "mean_token_accuracy": 0.731054961681366, + "num_tokens": 8420408.0, + "step": 1618, + "train/ce_loss": 1.6161900758743286 + }, + { + "epoch": 0.15997627051611626, + "step": 1618, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.15997627051611626, + "step": 1618, + "train/total_loss": 0.23974400758743286 + }, + { + "entropy": 10.246427536010742, + "epoch": 0.1600751433656318, + "mean_token_accuracy": 0.7930174469947815, + "num_tokens": 8425172.0, + "step": 1619, + "train/ce_loss": 1.2845847606658936 + }, + { + "epoch": 0.1600751433656318, + "step": 1619, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.1600751433656318, + "step": 1619, + "train/total_loss": 0.15580223500728607 + }, + { + "epoch": 0.16017401621514732, + "grad_norm": 0.9123067259788513, + "learning_rate": 9.602185630223014e-06, + "loss": 0.1584, + "step": 1620 + }, + { + "entropy": 9.728799819946289, + "epoch": 0.16017401621514732, + "mean_token_accuracy": 0.7475728392601013, + "num_tokens": 8430156.0, + "step": 1620, + "train/ce_loss": 1.1937299966812134 + }, + { + "epoch": 0.16017401621514732, + "step": 1620, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.16017401621514732, + "step": 1620, + "train/total_loss": 0.16234174370765686 + }, + { + "entropy": 8.924592971801758, + "epoch": 0.16027288906466283, + "mean_token_accuracy": 0.7698259353637695, + "num_tokens": 8435650.0, + "step": 1621, + "train/ce_loss": 0.4493315517902374 + }, + { + "epoch": 0.16027288906466283, + "step": 1621, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.16027288906466283, + "step": 1621, + "train/total_loss": 0.08399565517902374 + }, + { + "entropy": 9.233839988708496, + "epoch": 0.16037176191417837, + "mean_token_accuracy": 0.7586206793785095, + "num_tokens": 8440954.0, + "step": 1622, + "train/ce_loss": 0.7626397013664246 + }, + { + "epoch": 0.16037176191417837, + "step": 1622, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.16037176191417837, + "step": 1622, + "train/total_loss": 0.20517021417617798 + }, + { + "entropy": 8.83321762084961, + "epoch": 0.1604706347636939, + "mean_token_accuracy": 0.7466539144515991, + "num_tokens": 8446556.0, + "step": 1623, + "train/ce_loss": 0.7785013318061829 + }, + { + "epoch": 0.1604706347636939, + "step": 1623, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.1604706347636939, + "step": 1623, + "train/total_loss": 0.1325376331806183 + }, + { + "entropy": 9.502970695495605, + "epoch": 0.1605695076132094, + "mean_token_accuracy": 0.7137452960014343, + "num_tokens": 8451759.0, + "step": 1624, + "train/ce_loss": 5.685467840521596e-06 + }, + { + "epoch": 0.1605695076132094, + "step": 1624, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.1605695076132094, + "step": 1624, + "train/total_loss": 0.035156819969415665 + }, + { + "entropy": 9.826895713806152, + "epoch": 0.16066838046272494, + "mean_token_accuracy": 0.7473118305206299, + "num_tokens": 8456739.0, + "step": 1625, + "train/ce_loss": 1.6536694765090942 + }, + { + "epoch": 0.16066838046272494, + "step": 1625, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.16066838046272494, + "step": 1625, + "train/total_loss": 0.2513044476509094 + }, + { + "entropy": 9.346691131591797, + "epoch": 0.16076725331224045, + "mean_token_accuracy": 0.6819338202476501, + "num_tokens": 8461927.0, + "step": 1626, + "train/ce_loss": 1.2664635181427002 + }, + { + "epoch": 0.16076725331224045, + "step": 1626, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.16076725331224045, + "step": 1626, + "train/total_loss": 0.25555258989334106 + }, + { + "entropy": 9.530488014221191, + "epoch": 0.160866126161756, + "mean_token_accuracy": 0.7142857313156128, + "num_tokens": 8467011.0, + "step": 1627, + "train/ce_loss": 1.0578060150146484 + }, + { + "epoch": 0.160866126161756, + "step": 1627, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.160866126161756, + "step": 1627, + "train/total_loss": 0.17999935150146484 + }, + { + "entropy": 8.888346672058105, + "epoch": 0.1609649990112715, + "mean_token_accuracy": 0.7515375018119812, + "num_tokens": 8472313.0, + "step": 1628, + "train/ce_loss": 0.6371399164199829 + }, + { + "epoch": 0.1609649990112715, + "step": 1628, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.1609649990112715, + "step": 1628, + "train/total_loss": 0.14965149760246277 + }, + { + "entropy": 10.206025123596191, + "epoch": 0.16106387186078702, + "mean_token_accuracy": 0.7366071343421936, + "num_tokens": 8477123.0, + "step": 1629, + "train/ce_loss": 2.246410846710205 + }, + { + "epoch": 0.16106387186078702, + "step": 1629, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.16106387186078702, + "step": 1629, + "train/total_loss": 0.3105785846710205 + }, + { + "entropy": 9.567769050598145, + "epoch": 0.16116274471030256, + "mean_token_accuracy": 0.6633093357086182, + "num_tokens": 8482235.0, + "step": 1630, + "train/ce_loss": 1.9544039964675903 + }, + { + "epoch": 0.16116274471030256, + "step": 1630, + "train/sim_loss": 0.15234375 + }, + { + "epoch": 0.16116274471030256, + "step": 1630, + "train/total_loss": 0.347784161567688 + }, + { + "entropy": 8.883554458618164, + "epoch": 0.16126161755981808, + "mean_token_accuracy": 0.750507116317749, + "num_tokens": 8487778.0, + "step": 1631, + "train/ce_loss": 1.0531898736953735 + }, + { + "epoch": 0.16126161755981808, + "step": 1631, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.16126161755981808, + "step": 1631, + "train/total_loss": 0.22250649333000183 + }, + { + "entropy": 9.575586318969727, + "epoch": 0.1613604904093336, + "mean_token_accuracy": 0.7296072244644165, + "num_tokens": 8492885.0, + "step": 1632, + "train/ce_loss": 0.48304834961891174 + }, + { + "epoch": 0.1613604904093336, + "step": 1632, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.1613604904093336, + "step": 1632, + "train/total_loss": 0.13814859092235565 + }, + { + "entropy": 9.59064769744873, + "epoch": 0.16145936325884913, + "mean_token_accuracy": 0.7426981925964355, + "num_tokens": 8498052.0, + "step": 1633, + "train/ce_loss": 0.9048088192939758 + }, + { + "epoch": 0.16145936325884913, + "step": 1633, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.16145936325884913, + "step": 1633, + "train/total_loss": 0.17641839385032654 + }, + { + "entropy": 9.502883911132812, + "epoch": 0.16155823610836464, + "mean_token_accuracy": 0.8213802576065063, + "num_tokens": 8503255.0, + "step": 1634, + "train/ce_loss": 0.4299742579460144 + }, + { + "epoch": 0.16155823610836464, + "step": 1634, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.16155823610836464, + "step": 1634, + "train/total_loss": 0.07424742728471756 + }, + { + "entropy": 9.213842391967773, + "epoch": 0.16165710895788016, + "mean_token_accuracy": 0.730434775352478, + "num_tokens": 8508630.0, + "step": 1635, + "train/ce_loss": 0.8116820454597473 + }, + { + "epoch": 0.16165710895788016, + "step": 1635, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.16165710895788016, + "step": 1635, + "train/total_loss": 0.13976195454597473 + }, + { + "entropy": 8.935359954833984, + "epoch": 0.1617559818073957, + "mean_token_accuracy": 0.7946336269378662, + "num_tokens": 8514056.0, + "step": 1636, + "train/ce_loss": 0.49651533365249634 + }, + { + "epoch": 0.1617559818073957, + "step": 1636, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.1617559818073957, + "step": 1636, + "train/total_loss": 0.11605778336524963 + }, + { + "entropy": 9.745095252990723, + "epoch": 0.1618548546569112, + "mean_token_accuracy": 0.7855855822563171, + "num_tokens": 8519048.0, + "step": 1637, + "train/ce_loss": 0.637229859828949 + }, + { + "epoch": 0.1618548546569112, + "step": 1637, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.1618548546569112, + "step": 1637, + "train/total_loss": 0.11059799045324326 + }, + { + "entropy": 9.171215057373047, + "epoch": 0.16195372750642673, + "mean_token_accuracy": 0.7346465587615967, + "num_tokens": 8524339.0, + "step": 1638, + "train/ce_loss": 0.8786928057670593 + }, + { + "epoch": 0.16195372750642673, + "step": 1638, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.16195372750642673, + "step": 1638, + "train/total_loss": 0.11911927908658981 + }, + { + "entropy": 9.65593147277832, + "epoch": 0.16205260035594227, + "mean_token_accuracy": 0.7678855061531067, + "num_tokens": 8529436.0, + "step": 1639, + "train/ce_loss": 0.5777314305305481 + }, + { + "epoch": 0.16205260035594227, + "step": 1639, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.16205260035594227, + "step": 1639, + "train/total_loss": 0.10464814305305481 + }, + { + "epoch": 0.16215147320545778, + "grad_norm": 1.0075432062149048, + "learning_rate": 9.597240765465065e-06, + "loss": 0.1601, + "step": 1640 + }, + { + "entropy": 9.951290130615234, + "epoch": 0.16215147320545778, + "mean_token_accuracy": 0.7233644723892212, + "num_tokens": 8534411.0, + "step": 1640, + "train/ce_loss": 0.5973339676856995 + }, + { + "epoch": 0.16215147320545778, + "step": 1640, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.16215147320545778, + "step": 1640, + "train/total_loss": 0.10270214825868607 + }, + { + "entropy": 9.90272045135498, + "epoch": 0.1622503460549733, + "mean_token_accuracy": 0.7053942084312439, + "num_tokens": 8539330.0, + "step": 1641, + "train/ce_loss": 8.89528018888086e-06 + }, + { + "epoch": 0.1622503460549733, + "step": 1641, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.1622503460549733, + "step": 1641, + "train/total_loss": 0.023438390344381332 + }, + { + "entropy": 9.613363265991211, + "epoch": 0.16234921890448883, + "mean_token_accuracy": 0.732064425945282, + "num_tokens": 8544441.0, + "step": 1642, + "train/ce_loss": 6.30884505881113e-06 + }, + { + "epoch": 0.16234921890448883, + "step": 1642, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.16234921890448883, + "step": 1642, + "train/total_loss": 0.08984438329935074 + }, + { + "entropy": 9.551526069641113, + "epoch": 0.16244809175400435, + "mean_token_accuracy": 0.7136498689651489, + "num_tokens": 8549611.0, + "step": 1643, + "train/ce_loss": 1.3973978757858276 + }, + { + "epoch": 0.16244809175400435, + "step": 1643, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.16244809175400435, + "step": 1643, + "train/total_loss": 0.2530210614204407 + }, + { + "entropy": 9.609851837158203, + "epoch": 0.16254696460351986, + "mean_token_accuracy": 0.7132667899131775, + "num_tokens": 8554782.0, + "step": 1644, + "train/ce_loss": 0.5838186740875244 + }, + { + "epoch": 0.16254696460351986, + "step": 1644, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.16254696460351986, + "step": 1644, + "train/total_loss": 0.14041312038898468 + }, + { + "entropy": 9.33971118927002, + "epoch": 0.1626458374530354, + "mean_token_accuracy": 0.7272727489471436, + "num_tokens": 8559990.0, + "step": 1645, + "train/ce_loss": 0.6293501257896423 + }, + { + "epoch": 0.1626458374530354, + "step": 1645, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.1626458374530354, + "step": 1645, + "train/total_loss": 0.1488725244998932 + }, + { + "entropy": 9.75534439086914, + "epoch": 0.16274471030255092, + "mean_token_accuracy": 0.7266436219215393, + "num_tokens": 8564992.0, + "step": 1646, + "train/ce_loss": 6.342493634292623e-06 + }, + { + "epoch": 0.16274471030255092, + "step": 1646, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.16274471030255092, + "step": 1646, + "train/total_loss": 0.07421938329935074 + }, + { + "entropy": 9.056360244750977, + "epoch": 0.16284358315206643, + "mean_token_accuracy": 0.6840425729751587, + "num_tokens": 8570449.0, + "step": 1647, + "train/ce_loss": 0.9854462146759033 + }, + { + "epoch": 0.16284358315206643, + "step": 1647, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.16284358315206643, + "step": 1647, + "train/total_loss": 0.1962008774280548 + }, + { + "entropy": 9.603727340698242, + "epoch": 0.16294245600158197, + "mean_token_accuracy": 0.7390071153640747, + "num_tokens": 8575590.0, + "step": 1648, + "train/ce_loss": 1.2806551456451416 + }, + { + "epoch": 0.16294245600158197, + "step": 1648, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.16294245600158197, + "step": 1648, + "train/total_loss": 0.20619051158428192 + }, + { + "entropy": 9.485952377319336, + "epoch": 0.16304132885109748, + "mean_token_accuracy": 0.6822840571403503, + "num_tokens": 8580732.0, + "step": 1649, + "train/ce_loss": 1.824843406677246 + }, + { + "epoch": 0.16304132885109748, + "step": 1649, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.16304132885109748, + "step": 1649, + "train/total_loss": 0.29185932874679565 + }, + { + "entropy": 9.710960388183594, + "epoch": 0.16314020170061302, + "mean_token_accuracy": 0.6900311708450317, + "num_tokens": 8585824.0, + "step": 1650, + "train/ce_loss": 1.2377186976664234e-05 + }, + { + "epoch": 0.16314020170061302, + "step": 1650, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.16314020170061302, + "step": 1650, + "train/total_loss": 0.04687623679637909 + }, + { + "entropy": 9.404132843017578, + "epoch": 0.16323907455012854, + "mean_token_accuracy": 0.7272727489471436, + "num_tokens": 8591053.0, + "step": 1651, + "train/ce_loss": 1.1155272722244263 + }, + { + "epoch": 0.16323907455012854, + "step": 1651, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.16323907455012854, + "step": 1651, + "train/total_loss": 0.18186523020267487 + }, + { + "entropy": 9.23090648651123, + "epoch": 0.16333794739964405, + "mean_token_accuracy": 0.7109375, + "num_tokens": 8596443.0, + "step": 1652, + "train/ce_loss": 0.6732193827629089 + }, + { + "epoch": 0.16333794739964405, + "step": 1652, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.16333794739964405, + "step": 1652, + "train/total_loss": 0.11419694125652313 + }, + { + "entropy": 9.733444213867188, + "epoch": 0.1634368202491596, + "mean_token_accuracy": 0.6919431090354919, + "num_tokens": 8601525.0, + "step": 1653, + "train/ce_loss": 1.4947330951690674 + }, + { + "epoch": 0.1634368202491596, + "step": 1653, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.1634368202491596, + "step": 1653, + "train/total_loss": 0.28228580951690674 + }, + { + "entropy": 9.211160659790039, + "epoch": 0.1635356930986751, + "mean_token_accuracy": 0.70138019323349, + "num_tokens": 8606780.0, + "step": 1654, + "train/ce_loss": 0.9958420991897583 + }, + { + "epoch": 0.1635356930986751, + "step": 1654, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.1635356930986751, + "step": 1654, + "train/total_loss": 0.18161547183990479 + }, + { + "entropy": 9.393009185791016, + "epoch": 0.16363456594819062, + "mean_token_accuracy": 0.7454323768615723, + "num_tokens": 8612040.0, + "step": 1655, + "train/ce_loss": 1.3290444612503052 + }, + { + "epoch": 0.16363456594819062, + "step": 1655, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.16363456594819062, + "step": 1655, + "train/total_loss": 0.16806070506572723 + }, + { + "entropy": 10.141335487365723, + "epoch": 0.16373343879770616, + "mean_token_accuracy": 0.7228915691375732, + "num_tokens": 8616865.0, + "step": 1656, + "train/ce_loss": 2.1531684398651123 + }, + { + "epoch": 0.16373343879770616, + "step": 1656, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.16373343879770616, + "step": 1656, + "train/total_loss": 0.3012543320655823 + }, + { + "entropy": 9.644229888916016, + "epoch": 0.16383231164722167, + "mean_token_accuracy": 0.7107913494110107, + "num_tokens": 8621975.0, + "step": 1657, + "train/ce_loss": 6.4753439801279455e-06 + }, + { + "epoch": 0.16383231164722167, + "step": 1657, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.16383231164722167, + "step": 1657, + "train/total_loss": 0.07421939820051193 + }, + { + "entropy": 9.311150550842285, + "epoch": 0.1639311844967372, + "mean_token_accuracy": 0.6637279391288757, + "num_tokens": 8627271.0, + "step": 1658, + "train/ce_loss": 1.0205533504486084 + }, + { + "epoch": 0.1639311844967372, + "step": 1658, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.1639311844967372, + "step": 1658, + "train/total_loss": 0.22314909100532532 + }, + { + "entropy": 9.454523086547852, + "epoch": 0.16403005734625273, + "mean_token_accuracy": 0.720588207244873, + "num_tokens": 8632532.0, + "step": 1659, + "train/ce_loss": 0.7508016228675842 + }, + { + "epoch": 0.16403005734625273, + "step": 1659, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.16403005734625273, + "step": 1659, + "train/total_loss": 0.14539265632629395 + }, + { + "epoch": 0.16412893019576824, + "grad_norm": 0.9204435348510742, + "learning_rate": 9.592295900707115e-06, + "loss": 0.1754, + "step": 1660 + }, + { + "entropy": 9.35174560546875, + "epoch": 0.16412893019576824, + "mean_token_accuracy": 0.6658163070678711, + "num_tokens": 8637783.0, + "step": 1660, + "train/ce_loss": 1.5418193340301514 + }, + { + "epoch": 0.16412893019576824, + "step": 1660, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.16412893019576824, + "step": 1660, + "train/total_loss": 0.21668194234371185 + }, + { + "entropy": 8.99431037902832, + "epoch": 0.16422780304528375, + "mean_token_accuracy": 0.733195424079895, + "num_tokens": 8643218.0, + "step": 1661, + "train/ce_loss": 1.2750509977340698 + }, + { + "epoch": 0.16422780304528375, + "step": 1661, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.16422780304528375, + "step": 1661, + "train/total_loss": 0.2290676087141037 + }, + { + "entropy": 9.160455703735352, + "epoch": 0.1643266758947993, + "mean_token_accuracy": 0.7895902395248413, + "num_tokens": 8648639.0, + "step": 1662, + "train/ce_loss": 0.7102704644203186 + }, + { + "epoch": 0.1643266758947993, + "step": 1662, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.1643266758947993, + "step": 1662, + "train/total_loss": 0.13352704048156738 + }, + { + "entropy": 9.707138061523438, + "epoch": 0.1644255487443148, + "mean_token_accuracy": 0.7761194109916687, + "num_tokens": 8653687.0, + "step": 1663, + "train/ce_loss": 0.8248770833015442 + }, + { + "epoch": 0.1644255487443148, + "step": 1663, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.1644255487443148, + "step": 1663, + "train/total_loss": 0.11764395982027054 + }, + { + "entropy": 9.405868530273438, + "epoch": 0.16452442159383032, + "mean_token_accuracy": 0.7403726577758789, + "num_tokens": 8658946.0, + "step": 1664, + "train/ce_loss": 0.4154486656188965 + }, + { + "epoch": 0.16452442159383032, + "step": 1664, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.16452442159383032, + "step": 1664, + "train/total_loss": 0.06888861954212189 + }, + { + "entropy": 9.920665740966797, + "epoch": 0.16462329444334586, + "mean_token_accuracy": 0.7779660820960999, + "num_tokens": 8663950.0, + "step": 1665, + "train/ce_loss": 0.37819704413414 + }, + { + "epoch": 0.16462329444334586, + "step": 1665, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.16462329444334586, + "step": 1665, + "train/total_loss": 0.08860095590353012 + }, + { + "entropy": 9.274250030517578, + "epoch": 0.16472216729286138, + "mean_token_accuracy": 0.6619552373886108, + "num_tokens": 8669240.0, + "step": 1666, + "train/ce_loss": 1.6018245220184326 + }, + { + "epoch": 0.16472216729286138, + "step": 1666, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.16472216729286138, + "step": 1666, + "train/total_loss": 0.23440121114253998 + }, + { + "entropy": 10.213384628295898, + "epoch": 0.1648210401423769, + "mean_token_accuracy": 0.6941580772399902, + "num_tokens": 8673963.0, + "step": 1667, + "train/ce_loss": 0.00013563338143285364 + }, + { + "epoch": 0.1648210401423769, + "step": 1667, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.1648210401423769, + "step": 1667, + "train/total_loss": 0.0625135600566864 + }, + { + "entropy": 9.858098983764648, + "epoch": 0.16491991299189243, + "mean_token_accuracy": 0.7063903212547302, + "num_tokens": 8678967.0, + "step": 1668, + "train/ce_loss": 0.9753656387329102 + }, + { + "epoch": 0.16491991299189243, + "step": 1668, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.16491991299189243, + "step": 1668, + "train/total_loss": 0.16003656387329102 + }, + { + "entropy": 9.863697052001953, + "epoch": 0.16501878584140794, + "mean_token_accuracy": 0.7641682028770447, + "num_tokens": 8683950.0, + "step": 1669, + "train/ce_loss": 0.6916494965553284 + }, + { + "epoch": 0.16501878584140794, + "step": 1669, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.16501878584140794, + "step": 1669, + "train/total_loss": 0.1668212115764618 + }, + { + "entropy": 9.501869201660156, + "epoch": 0.16511765869092349, + "mean_token_accuracy": 0.7032679915428162, + "num_tokens": 8689133.0, + "step": 1670, + "train/ce_loss": 0.8607410788536072 + }, + { + "epoch": 0.16511765869092349, + "step": 1670, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.16511765869092349, + "step": 1670, + "train/total_loss": 0.1524803638458252 + }, + { + "entropy": 9.70029067993164, + "epoch": 0.165216531540439, + "mean_token_accuracy": 0.7043847441673279, + "num_tokens": 8694288.0, + "step": 1671, + "train/ce_loss": 4.156109298492083e-06 + }, + { + "epoch": 0.165216531540439, + "step": 1671, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.165216531540439, + "step": 1671, + "train/total_loss": 0.07421916723251343 + }, + { + "entropy": 9.59384536743164, + "epoch": 0.1653154043899545, + "mean_token_accuracy": 0.7766367197036743, + "num_tokens": 8699651.0, + "step": 1672, + "train/ce_loss": 0.8478952050209045 + }, + { + "epoch": 0.1653154043899545, + "step": 1672, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.1653154043899545, + "step": 1672, + "train/total_loss": 0.14338326454162598 + }, + { + "entropy": 9.08733081817627, + "epoch": 0.16541427723947005, + "mean_token_accuracy": 0.7172414064407349, + "num_tokens": 8704976.0, + "step": 1673, + "train/ce_loss": 1.1948295831680298 + }, + { + "epoch": 0.16541427723947005, + "step": 1673, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.16541427723947005, + "step": 1673, + "train/total_loss": 0.18979546427726746 + }, + { + "entropy": 9.397780418395996, + "epoch": 0.16551315008898557, + "mean_token_accuracy": 0.7293776869773865, + "num_tokens": 8710158.0, + "step": 1674, + "train/ce_loss": 1.657582402229309 + }, + { + "epoch": 0.16551315008898557, + "step": 1674, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.16551315008898557, + "step": 1674, + "train/total_loss": 0.27122700214385986 + }, + { + "entropy": 8.993717193603516, + "epoch": 0.16561202293850108, + "mean_token_accuracy": 0.7628865838050842, + "num_tokens": 8715600.0, + "step": 1675, + "train/ce_loss": 0.7835026979446411 + }, + { + "epoch": 0.16561202293850108, + "step": 1675, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.16561202293850108, + "step": 1675, + "train/total_loss": 0.10960026830434799 + }, + { + "entropy": 9.839988708496094, + "epoch": 0.16571089578801662, + "mean_token_accuracy": 0.7283531427383423, + "num_tokens": 8720639.0, + "step": 1676, + "train/ce_loss": 1.15177321434021 + }, + { + "epoch": 0.16571089578801662, + "step": 1676, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.16571089578801662, + "step": 1676, + "train/total_loss": 0.22455233335494995 + }, + { + "entropy": 9.535063743591309, + "epoch": 0.16580976863753213, + "mean_token_accuracy": 0.7291960716247559, + "num_tokens": 8725817.0, + "step": 1677, + "train/ce_loss": 0.6581219434738159 + }, + { + "epoch": 0.16580976863753213, + "step": 1677, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.16580976863753213, + "step": 1677, + "train/total_loss": 0.09706219285726547 + }, + { + "entropy": 9.273845672607422, + "epoch": 0.16590864148704765, + "mean_token_accuracy": 0.7103694677352905, + "num_tokens": 8731115.0, + "step": 1678, + "train/ce_loss": 0.6409934163093567 + }, + { + "epoch": 0.16590864148704765, + "step": 1678, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.16590864148704765, + "step": 1678, + "train/total_loss": 0.18128684163093567 + }, + { + "entropy": 9.664846420288086, + "epoch": 0.1660075143365632, + "mean_token_accuracy": 0.7144970297813416, + "num_tokens": 8736445.0, + "step": 1679, + "train/ce_loss": 7.968543286551721e-06 + }, + { + "epoch": 0.1660075143365632, + "step": 1679, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.1660075143365632, + "step": 1679, + "train/total_loss": 0.07812579721212387 + }, + { + "epoch": 0.1661063871860787, + "grad_norm": 1.0694078207015991, + "learning_rate": 9.587351035949168e-06, + "loss": 0.1613, + "step": 1680 + }, + { + "entropy": 9.293163299560547, + "epoch": 0.1661063871860787, + "mean_token_accuracy": 0.6978609561920166, + "num_tokens": 8741700.0, + "step": 1680, + "train/ce_loss": 0.6442439556121826 + }, + { + "epoch": 0.1661063871860787, + "step": 1680, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.1661063871860787, + "step": 1680, + "train/total_loss": 0.10348689556121826 + }, + { + "entropy": 9.476476669311523, + "epoch": 0.16620526003559422, + "mean_token_accuracy": 0.7588832378387451, + "num_tokens": 8746903.0, + "step": 1681, + "train/ce_loss": 1.2075145244598389 + }, + { + "epoch": 0.16620526003559422, + "step": 1681, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.16620526003559422, + "step": 1681, + "train/total_loss": 0.17934520542621613 + }, + { + "entropy": 9.433666229248047, + "epoch": 0.16630413288510976, + "mean_token_accuracy": 0.748308539390564, + "num_tokens": 8752118.0, + "step": 1682, + "train/ce_loss": 1.4417132139205933 + }, + { + "epoch": 0.16630413288510976, + "step": 1682, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.16630413288510976, + "step": 1682, + "train/total_loss": 0.2301088273525238 + }, + { + "entropy": 9.663443565368652, + "epoch": 0.16640300573462527, + "mean_token_accuracy": 0.7016248106956482, + "num_tokens": 8757254.0, + "step": 1683, + "train/ce_loss": 9.290965863328893e-06 + }, + { + "epoch": 0.16640300573462527, + "step": 1683, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.16640300573462527, + "step": 1683, + "train/total_loss": 0.08203218132257462 + }, + { + "entropy": 9.118719100952148, + "epoch": 0.16650187858414078, + "mean_token_accuracy": 0.7209302186965942, + "num_tokens": 8762648.0, + "step": 1684, + "train/ce_loss": 0.8776395916938782 + }, + { + "epoch": 0.16650187858414078, + "step": 1684, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.16650187858414078, + "step": 1684, + "train/total_loss": 0.1385452151298523 + }, + { + "entropy": 9.244359016418457, + "epoch": 0.16660075143365632, + "mean_token_accuracy": 0.7730496525764465, + "num_tokens": 8767929.0, + "step": 1685, + "train/ce_loss": 0.8379502892494202 + }, + { + "epoch": 0.16660075143365632, + "step": 1685, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.16660075143365632, + "step": 1685, + "train/total_loss": 0.14238879084587097 + }, + { + "entropy": 9.807613372802734, + "epoch": 0.16669962428317184, + "mean_token_accuracy": 0.7221269011497498, + "num_tokens": 8772948.0, + "step": 1686, + "train/ce_loss": 5.5832565521996e-06 + }, + { + "epoch": 0.16669962428317184, + "step": 1686, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.16669962428317184, + "step": 1686, + "train/total_loss": 0.07421930879354477 + }, + { + "entropy": 9.008600234985352, + "epoch": 0.16679849713268735, + "mean_token_accuracy": 0.7431102395057678, + "num_tokens": 8778432.0, + "step": 1687, + "train/ce_loss": 0.8022541403770447 + }, + { + "epoch": 0.16679849713268735, + "step": 1687, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.16679849713268735, + "step": 1687, + "train/total_loss": 0.11147541552782059 + }, + { + "entropy": 9.089427947998047, + "epoch": 0.1668973699822029, + "mean_token_accuracy": 0.7213114500045776, + "num_tokens": 8783724.0, + "step": 1688, + "train/ce_loss": 0.683851420879364 + }, + { + "epoch": 0.1668973699822029, + "step": 1688, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.1668973699822029, + "step": 1688, + "train/total_loss": 0.14260390400886536 + }, + { + "entropy": 9.778931617736816, + "epoch": 0.1669962428317184, + "mean_token_accuracy": 0.7264957427978516, + "num_tokens": 8788906.0, + "step": 1689, + "train/ce_loss": 1.454725742340088 + }, + { + "epoch": 0.1669962428317184, + "step": 1689, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.1669962428317184, + "step": 1689, + "train/total_loss": 0.24703507125377655 + }, + { + "entropy": 9.725312232971191, + "epoch": 0.16709511568123395, + "mean_token_accuracy": 0.7404129505157471, + "num_tokens": 8794029.0, + "step": 1690, + "train/ce_loss": 1.193360686302185 + }, + { + "epoch": 0.16709511568123395, + "step": 1690, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.16709511568123395, + "step": 1690, + "train/total_loss": 0.1896485686302185 + }, + { + "entropy": 9.521604537963867, + "epoch": 0.16719398853074946, + "mean_token_accuracy": 0.7085714340209961, + "num_tokens": 8799239.0, + "step": 1691, + "train/ce_loss": 4.235959295328939e-06 + }, + { + "epoch": 0.16719398853074946, + "step": 1691, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.16719398853074946, + "step": 1691, + "train/total_loss": 0.050781674683094025 + }, + { + "entropy": 9.517845153808594, + "epoch": 0.16729286138026497, + "mean_token_accuracy": 0.7022556662559509, + "num_tokens": 8804320.0, + "step": 1692, + "train/ce_loss": 0.7737181782722473 + }, + { + "epoch": 0.16729286138026497, + "step": 1692, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.16729286138026497, + "step": 1692, + "train/total_loss": 0.21018432080745697 + }, + { + "entropy": 9.612174034118652, + "epoch": 0.16739173422978051, + "mean_token_accuracy": 0.7832061052322388, + "num_tokens": 8809461.0, + "step": 1693, + "train/ce_loss": 4.751113465317758e-06 + }, + { + "epoch": 0.16739173422978051, + "step": 1693, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.16739173422978051, + "step": 1693, + "train/total_loss": 0.1015629768371582 + }, + { + "entropy": 9.11648178100586, + "epoch": 0.16749060707929603, + "mean_token_accuracy": 0.7461140155792236, + "num_tokens": 8814839.0, + "step": 1694, + "train/ce_loss": 0.8760917782783508 + }, + { + "epoch": 0.16749060707929603, + "step": 1694, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.16749060707929603, + "step": 1694, + "train/total_loss": 0.1227654293179512 + }, + { + "entropy": 9.678754806518555, + "epoch": 0.16758947992881154, + "mean_token_accuracy": 0.6839762330055237, + "num_tokens": 8819942.0, + "step": 1695, + "train/ce_loss": 1.5940691232681274 + }, + { + "epoch": 0.16758947992881154, + "step": 1695, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.16758947992881154, + "step": 1695, + "train/total_loss": 0.23753191530704498 + }, + { + "entropy": 9.327474594116211, + "epoch": 0.16768835277832708, + "mean_token_accuracy": 0.725261926651001, + "num_tokens": 8825243.0, + "step": 1696, + "train/ce_loss": 0.5194224119186401 + }, + { + "epoch": 0.16768835277832708, + "step": 1696, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.16768835277832708, + "step": 1696, + "train/total_loss": 0.11444224417209625 + }, + { + "entropy": 9.477210998535156, + "epoch": 0.1677872256278426, + "mean_token_accuracy": 0.761255145072937, + "num_tokens": 8830605.0, + "step": 1697, + "train/ce_loss": 0.9472794532775879 + }, + { + "epoch": 0.1677872256278426, + "step": 1697, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.1677872256278426, + "step": 1697, + "train/total_loss": 0.15722794830799103 + }, + { + "entropy": 9.329252243041992, + "epoch": 0.1678860984773581, + "mean_token_accuracy": 0.7214111685752869, + "num_tokens": 8835917.0, + "step": 1698, + "train/ce_loss": 0.852024495601654 + }, + { + "epoch": 0.1678860984773581, + "step": 1698, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.1678860984773581, + "step": 1698, + "train/total_loss": 0.17113995552062988 + }, + { + "entropy": 9.541632652282715, + "epoch": 0.16798497132687365, + "mean_token_accuracy": 0.781593382358551, + "num_tokens": 8841115.0, + "step": 1699, + "train/ce_loss": 0.9113073945045471 + }, + { + "epoch": 0.16798497132687365, + "step": 1699, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.16798497132687365, + "step": 1699, + "train/total_loss": 0.14191198348999023 + }, + { + "epoch": 0.16808384417638916, + "grad_norm": 0.8907069563865662, + "learning_rate": 9.582406171191218e-06, + "loss": 0.1606, + "step": 1700 + }, + { + "entropy": 9.1544189453125, + "epoch": 0.16808384417638916, + "mean_token_accuracy": 0.7335600852966309, + "num_tokens": 8846482.0, + "step": 1700, + "train/ce_loss": 0.7185544967651367 + }, + { + "epoch": 0.16808384417638916, + "step": 1700, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.16808384417638916, + "step": 1700, + "train/total_loss": 0.10701169818639755 + }, + { + "entropy": 9.01425552368164, + "epoch": 0.16818271702590468, + "mean_token_accuracy": 0.6836434602737427, + "num_tokens": 8851999.0, + "step": 1701, + "train/ce_loss": 0.7083297967910767 + }, + { + "epoch": 0.16818271702590468, + "step": 1701, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.16818271702590468, + "step": 1701, + "train/total_loss": 0.1255204826593399 + }, + { + "entropy": 9.019742965698242, + "epoch": 0.16828158987542022, + "mean_token_accuracy": 0.680232584476471, + "num_tokens": 8857320.0, + "step": 1702, + "train/ce_loss": 0.9772090911865234 + }, + { + "epoch": 0.16828158987542022, + "step": 1702, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.16828158987542022, + "step": 1702, + "train/total_loss": 0.1914709210395813 + }, + { + "entropy": 9.636129379272461, + "epoch": 0.16838046272493573, + "mean_token_accuracy": 0.7067238688468933, + "num_tokens": 8862376.0, + "step": 1703, + "train/ce_loss": 9.42720907914918e-06 + }, + { + "epoch": 0.16838046272493573, + "step": 1703, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.16838046272493573, + "step": 1703, + "train/total_loss": 0.06640719622373581 + }, + { + "entropy": 10.004971504211426, + "epoch": 0.16847933557445124, + "mean_token_accuracy": 0.7386831045150757, + "num_tokens": 8867303.0, + "step": 1704, + "train/ce_loss": 7.465568614861695e-06 + }, + { + "epoch": 0.16847933557445124, + "step": 1704, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.16847933557445124, + "step": 1704, + "train/total_loss": 0.06640699505805969 + }, + { + "entropy": 9.0205659866333, + "epoch": 0.16857820842396679, + "mean_token_accuracy": 0.7471042275428772, + "num_tokens": 8872843.0, + "step": 1705, + "train/ce_loss": 0.6346682906150818 + }, + { + "epoch": 0.16857820842396679, + "step": 1705, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.16857820842396679, + "step": 1705, + "train/total_loss": 0.16112308204174042 + }, + { + "entropy": 9.970184326171875, + "epoch": 0.1686770812734823, + "mean_token_accuracy": 0.7250509262084961, + "num_tokens": 8877748.0, + "step": 1706, + "train/ce_loss": 7.713006198173389e-06 + }, + { + "epoch": 0.1686770812734823, + "step": 1706, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.1686770812734823, + "step": 1706, + "train/total_loss": 0.07031327486038208 + }, + { + "entropy": 9.801458358764648, + "epoch": 0.1687759541229978, + "mean_token_accuracy": 0.7711039185523987, + "num_tokens": 8882812.0, + "step": 1707, + "train/ce_loss": 1.2045811414718628 + }, + { + "epoch": 0.1687759541229978, + "step": 1707, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.1687759541229978, + "step": 1707, + "train/total_loss": 0.23764562606811523 + }, + { + "entropy": 10.350695610046387, + "epoch": 0.16887482697251335, + "mean_token_accuracy": 0.7046783566474915, + "num_tokens": 8887543.0, + "step": 1708, + "train/ce_loss": 8.123223778966349e-06 + }, + { + "epoch": 0.16887482697251335, + "step": 1708, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.16887482697251335, + "step": 1708, + "train/total_loss": 0.07812581211328506 + }, + { + "entropy": 9.536373138427734, + "epoch": 0.16897369982202887, + "mean_token_accuracy": 0.7261345982551575, + "num_tokens": 8892672.0, + "step": 1709, + "train/ce_loss": 0.6556292772293091 + }, + { + "epoch": 0.16897369982202887, + "step": 1709, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.16897369982202887, + "step": 1709, + "train/total_loss": 0.12806293368339539 + }, + { + "entropy": 9.55009651184082, + "epoch": 0.1690725726715444, + "mean_token_accuracy": 0.7368420958518982, + "num_tokens": 8897775.0, + "step": 1710, + "train/ce_loss": 6.441311597882304e-06 + }, + { + "epoch": 0.1690725726715444, + "step": 1710, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.1690725726715444, + "step": 1710, + "train/total_loss": 0.10937564074993134 + }, + { + "entropy": 10.291550636291504, + "epoch": 0.16917144552105992, + "mean_token_accuracy": 0.7945205569267273, + "num_tokens": 8902499.0, + "step": 1711, + "train/ce_loss": 0.8966747522354126 + }, + { + "epoch": 0.16917144552105992, + "step": 1711, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.16917144552105992, + "step": 1711, + "train/total_loss": 0.14826121926307678 + }, + { + "entropy": 9.649927139282227, + "epoch": 0.16927031837057543, + "mean_token_accuracy": 0.7388632893562317, + "num_tokens": 8907600.0, + "step": 1712, + "train/ce_loss": 1.3270708322525024 + }, + { + "epoch": 0.16927031837057543, + "step": 1712, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.16927031837057543, + "step": 1712, + "train/total_loss": 0.21473833918571472 + }, + { + "entropy": 9.199341773986816, + "epoch": 0.16936919122009098, + "mean_token_accuracy": 0.7558528184890747, + "num_tokens": 8912983.0, + "step": 1713, + "train/ce_loss": 0.5694192051887512 + }, + { + "epoch": 0.16936919122009098, + "step": 1713, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.16936919122009098, + "step": 1713, + "train/total_loss": 0.1272544264793396 + }, + { + "entropy": 8.896648406982422, + "epoch": 0.1694680640696065, + "mean_token_accuracy": 0.7320930361747742, + "num_tokens": 8918509.0, + "step": 1714, + "train/ce_loss": 0.6630674004554749 + }, + { + "epoch": 0.1694680640696065, + "step": 1714, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.1694680640696065, + "step": 1714, + "train/total_loss": 0.14443174004554749 + }, + { + "entropy": 9.277908325195312, + "epoch": 0.169566936919122, + "mean_token_accuracy": 0.7257861495018005, + "num_tokens": 8923778.0, + "step": 1715, + "train/ce_loss": 0.8705785870552063 + }, + { + "epoch": 0.169566936919122, + "step": 1715, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.169566936919122, + "step": 1715, + "train/total_loss": 0.13002660870552063 + }, + { + "entropy": 9.339280128479004, + "epoch": 0.16966580976863754, + "mean_token_accuracy": 0.7461340427398682, + "num_tokens": 8928996.0, + "step": 1716, + "train/ce_loss": 0.7038314938545227 + }, + { + "epoch": 0.16966580976863754, + "step": 1716, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.16966580976863754, + "step": 1716, + "train/total_loss": 0.14069566130638123 + }, + { + "entropy": 9.661998748779297, + "epoch": 0.16976468261815306, + "mean_token_accuracy": 0.7586776614189148, + "num_tokens": 8934047.0, + "step": 1717, + "train/ce_loss": 1.1090891361236572 + }, + { + "epoch": 0.16976468261815306, + "step": 1717, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.16976468261815306, + "step": 1717, + "train/total_loss": 0.14215892553329468 + }, + { + "entropy": 9.382301330566406, + "epoch": 0.16986355546766857, + "mean_token_accuracy": 0.7525380849838257, + "num_tokens": 8939288.0, + "step": 1718, + "train/ce_loss": 1.004603385925293 + }, + { + "epoch": 0.16986355546766857, + "step": 1718, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.16986355546766857, + "step": 1718, + "train/total_loss": 0.19030410051345825 + }, + { + "entropy": 9.96207046508789, + "epoch": 0.1699624283171841, + "mean_token_accuracy": 0.7532956600189209, + "num_tokens": 8944267.0, + "step": 1719, + "train/ce_loss": 0.6833586692810059 + }, + { + "epoch": 0.1699624283171841, + "step": 1719, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.1699624283171841, + "step": 1719, + "train/total_loss": 0.1777108609676361 + }, + { + "epoch": 0.17006130116669962, + "grad_norm": 0.9704808592796326, + "learning_rate": 9.57746130643327e-06, + "loss": 0.1675, + "step": 1720 + }, + { + "entropy": 9.23289680480957, + "epoch": 0.17006130116669962, + "mean_token_accuracy": 0.7065088748931885, + "num_tokens": 8949589.0, + "step": 1720, + "train/ce_loss": 0.6869282126426697 + }, + { + "epoch": 0.17006130116669962, + "step": 1720, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.17006130116669962, + "step": 1720, + "train/total_loss": 0.13509908318519592 + }, + { + "entropy": 8.945226669311523, + "epoch": 0.17016017401621514, + "mean_token_accuracy": 0.7189542651176453, + "num_tokens": 8955139.0, + "step": 1721, + "train/ce_loss": 0.5507825016975403 + }, + { + "epoch": 0.17016017401621514, + "step": 1721, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.17016017401621514, + "step": 1721, + "train/total_loss": 0.09023450314998627 + }, + { + "entropy": 9.419705390930176, + "epoch": 0.17025904686573068, + "mean_token_accuracy": 0.7095046639442444, + "num_tokens": 8960324.0, + "step": 1722, + "train/ce_loss": 0.48278287053108215 + }, + { + "epoch": 0.17025904686573068, + "step": 1722, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.17025904686573068, + "step": 1722, + "train/total_loss": 0.11468453705310822 + }, + { + "entropy": 9.383966445922852, + "epoch": 0.1703579197152462, + "mean_token_accuracy": 0.6917989253997803, + "num_tokens": 8965505.0, + "step": 1723, + "train/ce_loss": 1.9925827980041504 + }, + { + "epoch": 0.1703579197152462, + "step": 1723, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.1703579197152462, + "step": 1723, + "train/total_loss": 0.2578520178794861 + }, + { + "entropy": 8.971576690673828, + "epoch": 0.1704567925647617, + "mean_token_accuracy": 0.7637362480163574, + "num_tokens": 8970896.0, + "step": 1724, + "train/ce_loss": 0.7792263031005859 + }, + { + "epoch": 0.1704567925647617, + "step": 1724, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.1704567925647617, + "step": 1724, + "train/total_loss": 0.15214139223098755 + }, + { + "entropy": 9.419415473937988, + "epoch": 0.17055566541427725, + "mean_token_accuracy": 0.7274073958396912, + "num_tokens": 8976002.0, + "step": 1725, + "train/ce_loss": 0.7176265716552734 + }, + { + "epoch": 0.17055566541427725, + "step": 1725, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.17055566541427725, + "step": 1725, + "train/total_loss": 0.13816890120506287 + }, + { + "entropy": 9.516657829284668, + "epoch": 0.17065453826379276, + "mean_token_accuracy": 0.714067280292511, + "num_tokens": 8981144.0, + "step": 1726, + "train/ce_loss": 1.1410041224735323e-05 + }, + { + "epoch": 0.17065453826379276, + "step": 1726, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.17065453826379276, + "step": 1726, + "train/total_loss": 0.05078238993883133 + }, + { + "entropy": 9.524466514587402, + "epoch": 0.17075341111330827, + "mean_token_accuracy": 0.7178003191947937, + "num_tokens": 8986258.0, + "step": 1727, + "train/ce_loss": 0.5584552884101868 + }, + { + "epoch": 0.17075341111330827, + "step": 1727, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.17075341111330827, + "step": 1727, + "train/total_loss": 0.13006427884101868 + }, + { + "entropy": 9.789735794067383, + "epoch": 0.17085228396282381, + "mean_token_accuracy": 0.7452667951583862, + "num_tokens": 8991289.0, + "step": 1728, + "train/ce_loss": 1.0358099643781316e-05 + }, + { + "epoch": 0.17085228396282381, + "step": 1728, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.17085228396282381, + "step": 1728, + "train/total_loss": 0.07812603563070297 + }, + { + "entropy": 9.533740997314453, + "epoch": 0.17095115681233933, + "mean_token_accuracy": 0.69532710313797, + "num_tokens": 8996289.0, + "step": 1729, + "train/ce_loss": 1.4981647729873657 + }, + { + "epoch": 0.17095115681233933, + "step": 1729, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.17095115681233933, + "step": 1729, + "train/total_loss": 0.27481648325920105 + }, + { + "entropy": 8.76301383972168, + "epoch": 0.17105002966185484, + "mean_token_accuracy": 0.8076152205467224, + "num_tokens": 9001811.0, + "step": 1730, + "train/ce_loss": 0.495195597410202 + }, + { + "epoch": 0.17105002966185484, + "step": 1730, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.17105002966185484, + "step": 1730, + "train/total_loss": 0.08467581123113632 + }, + { + "entropy": 9.538856506347656, + "epoch": 0.17114890251137038, + "mean_token_accuracy": 0.688693106174469, + "num_tokens": 9006951.0, + "step": 1731, + "train/ce_loss": 1.0818617343902588 + }, + { + "epoch": 0.17114890251137038, + "step": 1731, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.17114890251137038, + "step": 1731, + "train/total_loss": 0.18240493535995483 + }, + { + "entropy": 9.177574157714844, + "epoch": 0.1712477753608859, + "mean_token_accuracy": 0.7191489338874817, + "num_tokens": 9012359.0, + "step": 1732, + "train/ce_loss": 1.2056026458740234 + }, + { + "epoch": 0.1712477753608859, + "step": 1732, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.1712477753608859, + "step": 1732, + "train/total_loss": 0.20649775862693787 + }, + { + "entropy": 8.994552612304688, + "epoch": 0.17134664821040144, + "mean_token_accuracy": 0.7307692170143127, + "num_tokens": 9017704.0, + "step": 1733, + "train/ce_loss": 1.107088327407837 + }, + { + "epoch": 0.17134664821040144, + "step": 1733, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.17134664821040144, + "step": 1733, + "train/total_loss": 0.2122713327407837 + }, + { + "entropy": 9.316804885864258, + "epoch": 0.17144552105991695, + "mean_token_accuracy": 0.8085365891456604, + "num_tokens": 9022970.0, + "step": 1734, + "train/ce_loss": 0.4841051697731018 + }, + { + "epoch": 0.17144552105991695, + "step": 1734, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.17144552105991695, + "step": 1734, + "train/total_loss": 0.08356676995754242 + }, + { + "entropy": 9.439652442932129, + "epoch": 0.17154439390943246, + "mean_token_accuracy": 0.7350318431854248, + "num_tokens": 9028160.0, + "step": 1735, + "train/ce_loss": 1.0343725681304932 + }, + { + "epoch": 0.17154439390943246, + "step": 1735, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.17154439390943246, + "step": 1735, + "train/total_loss": 0.15812475979328156 + }, + { + "entropy": 9.1663818359375, + "epoch": 0.171643266758948, + "mean_token_accuracy": 0.7737226486206055, + "num_tokens": 9033519.0, + "step": 1736, + "train/ce_loss": 0.5285016894340515 + }, + { + "epoch": 0.171643266758948, + "step": 1736, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.171643266758948, + "step": 1736, + "train/total_loss": 0.09191267192363739 + }, + { + "entropy": 9.119280815124512, + "epoch": 0.17174213960846352, + "mean_token_accuracy": 0.7171609997749329, + "num_tokens": 9038914.0, + "step": 1737, + "train/ce_loss": 0.5667960047721863 + }, + { + "epoch": 0.17174213960846352, + "step": 1737, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.17174213960846352, + "step": 1737, + "train/total_loss": 0.1348046064376831 + }, + { + "entropy": 9.536650657653809, + "epoch": 0.17184101245797903, + "mean_token_accuracy": 0.7481805086135864, + "num_tokens": 9044056.0, + "step": 1738, + "train/ce_loss": 0.6263763308525085 + }, + { + "epoch": 0.17184101245797903, + "step": 1738, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.17184101245797903, + "step": 1738, + "train/total_loss": 0.13685637712478638 + }, + { + "entropy": 9.257691383361816, + "epoch": 0.17193988530749457, + "mean_token_accuracy": 0.6525934934616089, + "num_tokens": 9049393.0, + "step": 1739, + "train/ce_loss": 1.0220292806625366 + }, + { + "epoch": 0.17193988530749457, + "step": 1739, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.17193988530749457, + "step": 1739, + "train/total_loss": 0.19204667210578918 + }, + { + "epoch": 0.17203875815701009, + "grad_norm": 1.1733081340789795, + "learning_rate": 9.57251644167532e-06, + "loss": 0.1643, + "step": 1740 + }, + { + "entropy": 9.278755187988281, + "epoch": 0.17203875815701009, + "mean_token_accuracy": 0.7256990671157837, + "num_tokens": 9054661.0, + "step": 1740, + "train/ce_loss": 0.8057752251625061 + }, + { + "epoch": 0.17203875815701009, + "step": 1740, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.17203875815701009, + "step": 1740, + "train/total_loss": 0.1508900225162506 + }, + { + "entropy": 9.168914794921875, + "epoch": 0.1721376310065256, + "mean_token_accuracy": 0.6828729510307312, + "num_tokens": 9059997.0, + "step": 1741, + "train/ce_loss": 0.3121006190776825 + }, + { + "epoch": 0.1721376310065256, + "step": 1741, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.1721376310065256, + "step": 1741, + "train/total_loss": 0.09371006488800049 + }, + { + "entropy": 9.900165557861328, + "epoch": 0.17223650385604114, + "mean_token_accuracy": 0.7354166507720947, + "num_tokens": 9064908.0, + "step": 1742, + "train/ce_loss": 0.9869362711906433 + }, + { + "epoch": 0.17223650385604114, + "step": 1742, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.17223650385604114, + "step": 1742, + "train/total_loss": 0.1729123890399933 + }, + { + "entropy": 9.285699844360352, + "epoch": 0.17233537670555665, + "mean_token_accuracy": 0.7460890412330627, + "num_tokens": 9070163.0, + "step": 1743, + "train/ce_loss": 0.4222497344017029 + }, + { + "epoch": 0.17233537670555665, + "step": 1743, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.17233537670555665, + "step": 1743, + "train/total_loss": 0.10863122344017029 + }, + { + "entropy": 9.915960311889648, + "epoch": 0.17243424955507217, + "mean_token_accuracy": 0.7307001948356628, + "num_tokens": 9075131.0, + "step": 1744, + "train/ce_loss": 1.5877418518066406 + }, + { + "epoch": 0.17243424955507217, + "step": 1744, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.17243424955507217, + "step": 1744, + "train/total_loss": 0.23299293220043182 + }, + { + "entropy": 8.995991706848145, + "epoch": 0.1725331224045877, + "mean_token_accuracy": 0.7079152464866638, + "num_tokens": 9080499.0, + "step": 1745, + "train/ce_loss": 0.6653462052345276 + }, + { + "epoch": 0.1725331224045877, + "step": 1745, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.1725331224045877, + "step": 1745, + "train/total_loss": 0.132940873503685 + }, + { + "entropy": 9.508094787597656, + "epoch": 0.17263199525410322, + "mean_token_accuracy": 0.7595772743225098, + "num_tokens": 9085891.0, + "step": 1746, + "train/ce_loss": 0.7341980338096619 + }, + { + "epoch": 0.17263199525410322, + "step": 1746, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.17263199525410322, + "step": 1746, + "train/total_loss": 0.13982605934143066 + }, + { + "entropy": 9.453865051269531, + "epoch": 0.17273086810361873, + "mean_token_accuracy": 0.7516005039215088, + "num_tokens": 9091113.0, + "step": 1747, + "train/ce_loss": 0.8095971941947937 + }, + { + "epoch": 0.17273086810361873, + "step": 1747, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.17273086810361873, + "step": 1747, + "train/total_loss": 0.2293972223997116 + }, + { + "entropy": 9.227066040039062, + "epoch": 0.17282974095313428, + "mean_token_accuracy": 0.7286063432693481, + "num_tokens": 9096424.0, + "step": 1748, + "train/ce_loss": 0.7434126734733582 + }, + { + "epoch": 0.17282974095313428, + "step": 1748, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.17282974095313428, + "step": 1748, + "train/total_loss": 0.12512251734733582 + }, + { + "entropy": 9.189695358276367, + "epoch": 0.1729286138026498, + "mean_token_accuracy": 0.7150837779045105, + "num_tokens": 9101762.0, + "step": 1749, + "train/ce_loss": 0.9477271437644958 + }, + { + "epoch": 0.1729286138026498, + "step": 1749, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.1729286138026498, + "step": 1749, + "train/total_loss": 0.20024147629737854 + }, + { + "entropy": 8.743956565856934, + "epoch": 0.1730274866521653, + "mean_token_accuracy": 0.7459016442298889, + "num_tokens": 9107245.0, + "step": 1750, + "train/ce_loss": 0.8299065828323364 + }, + { + "epoch": 0.1730274866521653, + "step": 1750, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.1730274866521653, + "step": 1750, + "train/total_loss": 0.18455316126346588 + }, + { + "entropy": 9.632036209106445, + "epoch": 0.17312635950168084, + "mean_token_accuracy": 0.7817638516426086, + "num_tokens": 9112356.0, + "step": 1751, + "train/ce_loss": 1.2616857290267944 + }, + { + "epoch": 0.17312635950168084, + "step": 1751, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.17312635950168084, + "step": 1751, + "train/total_loss": 0.18085607886314392 + }, + { + "entropy": 9.808963775634766, + "epoch": 0.17322523235119636, + "mean_token_accuracy": 0.7113593816757202, + "num_tokens": 9117345.0, + "step": 1752, + "train/ce_loss": 1.0092039108276367 + }, + { + "epoch": 0.17322523235119636, + "step": 1752, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.17322523235119636, + "step": 1752, + "train/total_loss": 0.1790453940629959 + }, + { + "entropy": 9.167421340942383, + "epoch": 0.1733241052007119, + "mean_token_accuracy": 0.7189384698867798, + "num_tokens": 9122644.0, + "step": 1753, + "train/ce_loss": 0.7508464455604553 + }, + { + "epoch": 0.1733241052007119, + "step": 1753, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.1733241052007119, + "step": 1753, + "train/total_loss": 0.1571159064769745 + }, + { + "entropy": 9.825424194335938, + "epoch": 0.1734229780502274, + "mean_token_accuracy": 0.7134831547737122, + "num_tokens": 9127657.0, + "step": 1754, + "train/ce_loss": 0.8539113402366638 + }, + { + "epoch": 0.1734229780502274, + "step": 1754, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.1734229780502274, + "step": 1754, + "train/total_loss": 0.19867238402366638 + }, + { + "entropy": 9.104223251342773, + "epoch": 0.17352185089974292, + "mean_token_accuracy": 0.7300000190734863, + "num_tokens": 9133087.0, + "step": 1755, + "train/ce_loss": 0.3209971487522125 + }, + { + "epoch": 0.17352185089974292, + "step": 1755, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.17352185089974292, + "step": 1755, + "train/total_loss": 0.11803721636533737 + }, + { + "entropy": 9.736286163330078, + "epoch": 0.17362072374925847, + "mean_token_accuracy": 0.7467741966247559, + "num_tokens": 9138128.0, + "step": 1756, + "train/ce_loss": 1.6898686226340942e-05 + }, + { + "epoch": 0.17362072374925847, + "step": 1756, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.17362072374925847, + "step": 1756, + "train/total_loss": 0.0703141912817955 + }, + { + "entropy": 9.524857521057129, + "epoch": 0.17371959659877398, + "mean_token_accuracy": 0.7045840620994568, + "num_tokens": 9143311.0, + "step": 1757, + "train/ce_loss": 1.2180140018463135 + }, + { + "epoch": 0.17371959659877398, + "step": 1757, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.17371959659877398, + "step": 1757, + "train/total_loss": 0.20383265614509583 + }, + { + "entropy": 9.574028968811035, + "epoch": 0.1738184694482895, + "mean_token_accuracy": 0.7369862794876099, + "num_tokens": 9148469.0, + "step": 1758, + "train/ce_loss": 0.7238757610321045 + }, + { + "epoch": 0.1738184694482895, + "step": 1758, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.1738184694482895, + "step": 1758, + "train/total_loss": 0.15832507610321045 + }, + { + "entropy": 8.992545127868652, + "epoch": 0.17391734229780503, + "mean_token_accuracy": 0.7678571343421936, + "num_tokens": 9153893.0, + "step": 1759, + "train/ce_loss": 0.360592246055603 + }, + { + "epoch": 0.17391734229780503, + "step": 1759, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.17391734229780503, + "step": 1759, + "train/total_loss": 0.11027798056602478 + }, + { + "epoch": 0.17401621514732055, + "grad_norm": 0.8445634245872498, + "learning_rate": 9.567571576917371e-06, + "loss": 0.1678, + "step": 1760 + }, + { + "entropy": 9.652566909790039, + "epoch": 0.17401621514732055, + "mean_token_accuracy": 0.7388059496879578, + "num_tokens": 9158991.0, + "step": 1760, + "train/ce_loss": 1.5193932056427002 + }, + { + "epoch": 0.17401621514732055, + "step": 1760, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.17401621514732055, + "step": 1760, + "train/total_loss": 0.21834556758403778 + }, + { + "entropy": 9.348615646362305, + "epoch": 0.17411508799683606, + "mean_token_accuracy": 0.7608982920646667, + "num_tokens": 9164282.0, + "step": 1761, + "train/ce_loss": 0.7403160333633423 + }, + { + "epoch": 0.17411508799683606, + "step": 1761, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.17411508799683606, + "step": 1761, + "train/total_loss": 0.14043785631656647 + }, + { + "entropy": 9.238094329833984, + "epoch": 0.1742139608463516, + "mean_token_accuracy": 0.7394859790802002, + "num_tokens": 9169601.0, + "step": 1762, + "train/ce_loss": 0.3605496287345886 + }, + { + "epoch": 0.1742139608463516, + "step": 1762, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.1742139608463516, + "step": 1762, + "train/total_loss": 0.11417996883392334 + }, + { + "entropy": 9.508635520935059, + "epoch": 0.17431283369586711, + "mean_token_accuracy": 0.7448559403419495, + "num_tokens": 9174958.0, + "step": 1763, + "train/ce_loss": 1.4740917682647705 + }, + { + "epoch": 0.17431283369586711, + "step": 1763, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.17431283369586711, + "step": 1763, + "train/total_loss": 0.22944043576717377 + }, + { + "entropy": 10.01222038269043, + "epoch": 0.17441170654538263, + "mean_token_accuracy": 0.7267080545425415, + "num_tokens": 9179827.0, + "step": 1764, + "train/ce_loss": 1.3136488632881083e-05 + }, + { + "epoch": 0.17441170654538263, + "step": 1764, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.17441170654538263, + "step": 1764, + "train/total_loss": 0.023438813164830208 + }, + { + "entropy": 9.579922676086426, + "epoch": 0.17451057939489817, + "mean_token_accuracy": 0.7388167381286621, + "num_tokens": 9184970.0, + "step": 1765, + "train/ce_loss": 1.531071424484253 + }, + { + "epoch": 0.17451057939489817, + "step": 1765, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.17451057939489817, + "step": 1765, + "train/total_loss": 0.242950901389122 + }, + { + "entropy": 9.957202911376953, + "epoch": 0.17460945224441368, + "mean_token_accuracy": 0.853210985660553, + "num_tokens": 9189851.0, + "step": 1766, + "train/ce_loss": 1.0298511981964111 + }, + { + "epoch": 0.17460945224441368, + "step": 1766, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.17460945224441368, + "step": 1766, + "train/total_loss": 0.14986011385917664 + }, + { + "entropy": 9.486515045166016, + "epoch": 0.1747083250939292, + "mean_token_accuracy": 0.728205144405365, + "num_tokens": 9194877.0, + "step": 1767, + "train/ce_loss": 1.5970531702041626 + }, + { + "epoch": 0.1747083250939292, + "step": 1767, + "train/sim_loss": 0.2578125 + }, + { + "epoch": 0.1747083250939292, + "step": 1767, + "train/total_loss": 0.41751784086227417 + }, + { + "entropy": 10.033461570739746, + "epoch": 0.17480719794344474, + "mean_token_accuracy": 0.7226890921592712, + "num_tokens": 9199768.0, + "step": 1768, + "train/ce_loss": 6.694883632007986e-05 + }, + { + "epoch": 0.17480719794344474, + "step": 1768, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.17480719794344474, + "step": 1768, + "train/total_loss": 0.039069194346666336 + }, + { + "entropy": 9.921037673950195, + "epoch": 0.17490607079296025, + "mean_token_accuracy": 0.7867803573608398, + "num_tokens": 9204667.0, + "step": 1769, + "train/ce_loss": 1.650039792060852 + }, + { + "epoch": 0.17490607079296025, + "step": 1769, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.17490607079296025, + "step": 1769, + "train/total_loss": 0.24312898516654968 + }, + { + "entropy": 9.630839347839355, + "epoch": 0.17500494364247576, + "mean_token_accuracy": 0.7410423159599304, + "num_tokens": 9209727.0, + "step": 1770, + "train/ce_loss": 0.8447070717811584 + }, + { + "epoch": 0.17500494364247576, + "step": 1770, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.17500494364247576, + "step": 1770, + "train/total_loss": 0.1547832190990448 + }, + { + "entropy": 9.445236206054688, + "epoch": 0.1751038164919913, + "mean_token_accuracy": 0.7431694269180298, + "num_tokens": 9214955.0, + "step": 1771, + "train/ce_loss": 0.4419805109500885 + }, + { + "epoch": 0.1751038164919913, + "step": 1771, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.1751038164919913, + "step": 1771, + "train/total_loss": 0.16138555109500885 + }, + { + "entropy": 9.136075019836426, + "epoch": 0.17520268934150682, + "mean_token_accuracy": 0.6778378486633301, + "num_tokens": 9220329.0, + "step": 1772, + "train/ce_loss": 0.6905080080032349 + }, + { + "epoch": 0.17520268934150682, + "step": 1772, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.17520268934150682, + "step": 1772, + "train/total_loss": 0.12764455378055573 + }, + { + "entropy": 9.195317268371582, + "epoch": 0.17530156219102236, + "mean_token_accuracy": 0.7616707682609558, + "num_tokens": 9225623.0, + "step": 1773, + "train/ce_loss": 1.0541272163391113 + }, + { + "epoch": 0.17530156219102236, + "step": 1773, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.17530156219102236, + "step": 1773, + "train/total_loss": 0.20306897163391113 + }, + { + "entropy": 9.68917179107666, + "epoch": 0.17540043504053787, + "mean_token_accuracy": 0.7045454382896423, + "num_tokens": 9230712.0, + "step": 1774, + "train/ce_loss": 1.472992181777954 + }, + { + "epoch": 0.17540043504053787, + "step": 1774, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.17540043504053787, + "step": 1774, + "train/total_loss": 0.23714296519756317 + }, + { + "entropy": 9.076818466186523, + "epoch": 0.17549930789005339, + "mean_token_accuracy": 0.6739811897277832, + "num_tokens": 9236187.0, + "step": 1775, + "train/ce_loss": 0.6155195832252502 + }, + { + "epoch": 0.17549930789005339, + "step": 1775, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.17549930789005339, + "step": 1775, + "train/total_loss": 0.15920820832252502 + }, + { + "entropy": 9.201278686523438, + "epoch": 0.17559818073956893, + "mean_token_accuracy": 0.7463414669036865, + "num_tokens": 9241467.0, + "step": 1776, + "train/ce_loss": 0.9912561178207397 + }, + { + "epoch": 0.17559818073956893, + "step": 1776, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.17559818073956893, + "step": 1776, + "train/total_loss": 0.17334437370300293 + }, + { + "entropy": 9.124858856201172, + "epoch": 0.17569705358908444, + "mean_token_accuracy": 0.682692289352417, + "num_tokens": 9246847.0, + "step": 1777, + "train/ce_loss": 0.9765233993530273 + }, + { + "epoch": 0.17569705358908444, + "step": 1777, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.17569705358908444, + "step": 1777, + "train/total_loss": 0.1484335958957672 + }, + { + "entropy": 9.057942390441895, + "epoch": 0.17579592643859995, + "mean_token_accuracy": 0.7431289553642273, + "num_tokens": 9252263.0, + "step": 1778, + "train/ce_loss": 0.8119320869445801 + }, + { + "epoch": 0.17579592643859995, + "step": 1778, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.17579592643859995, + "step": 1778, + "train/total_loss": 0.194474458694458 + }, + { + "entropy": 9.562312126159668, + "epoch": 0.1758947992881155, + "mean_token_accuracy": 0.818320631980896, + "num_tokens": 9257360.0, + "step": 1779, + "train/ce_loss": 5.361784587876173e-06 + }, + { + "epoch": 0.1758947992881155, + "step": 1779, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.1758947992881155, + "step": 1779, + "train/total_loss": 0.03125053644180298 + }, + { + "epoch": 0.175993672137631, + "grad_norm": 0.7717829942703247, + "learning_rate": 9.562626712159424e-06, + "loss": 0.1539, + "step": 1780 + }, + { + "entropy": 9.121821403503418, + "epoch": 0.175993672137631, + "mean_token_accuracy": 0.7740740776062012, + "num_tokens": 9262637.0, + "step": 1780, + "train/ce_loss": 0.759074330329895 + }, + { + "epoch": 0.175993672137631, + "step": 1780, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.175993672137631, + "step": 1780, + "train/total_loss": 0.19700118899345398 + }, + { + "entropy": 9.320338249206543, + "epoch": 0.17609254498714652, + "mean_token_accuracy": 0.744535505771637, + "num_tokens": 9267864.0, + "step": 1781, + "train/ce_loss": 0.7638721466064453 + }, + { + "epoch": 0.17609254498714652, + "step": 1781, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.17609254498714652, + "step": 1781, + "train/total_loss": 0.1545122265815735 + }, + { + "entropy": 8.921955108642578, + "epoch": 0.17619141783666206, + "mean_token_accuracy": 0.7149321436882019, + "num_tokens": 9273168.0, + "step": 1782, + "train/ce_loss": 1.2899847030639648 + }, + { + "epoch": 0.17619141783666206, + "step": 1782, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.17619141783666206, + "step": 1782, + "train/total_loss": 0.18759222328662872 + }, + { + "entropy": 9.738253593444824, + "epoch": 0.17629029068617758, + "mean_token_accuracy": 0.7022653818130493, + "num_tokens": 9278273.0, + "step": 1783, + "train/ce_loss": 1.3247605562210083 + }, + { + "epoch": 0.17629029068617758, + "step": 1783, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.17629029068617758, + "step": 1783, + "train/total_loss": 0.2418510615825653 + }, + { + "entropy": 9.320428848266602, + "epoch": 0.1763891635356931, + "mean_token_accuracy": 0.734375, + "num_tokens": 9283410.0, + "step": 1784, + "train/ce_loss": 1.5505481958389282 + }, + { + "epoch": 0.1763891635356931, + "step": 1784, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.1763891635356931, + "step": 1784, + "train/total_loss": 0.23708607256412506 + }, + { + "entropy": 8.998424530029297, + "epoch": 0.17648803638520863, + "mean_token_accuracy": 0.7615965604782104, + "num_tokens": 9288826.0, + "step": 1785, + "train/ce_loss": 0.8054037094116211 + }, + { + "epoch": 0.17648803638520863, + "step": 1785, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.17648803638520863, + "step": 1785, + "train/total_loss": 0.18991537392139435 + }, + { + "entropy": 9.724132537841797, + "epoch": 0.17658690923472414, + "mean_token_accuracy": 0.7370242476463318, + "num_tokens": 9293868.0, + "step": 1786, + "train/ce_loss": 0.8708922863006592 + }, + { + "epoch": 0.17658690923472414, + "step": 1786, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.17658690923472414, + "step": 1786, + "train/total_loss": 0.15349549055099487 + }, + { + "entropy": 9.522594451904297, + "epoch": 0.17668578208423966, + "mean_token_accuracy": 0.7403225898742676, + "num_tokens": 9298944.0, + "step": 1787, + "train/ce_loss": 1.2386717796325684 + }, + { + "epoch": 0.17668578208423966, + "step": 1787, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.17668578208423966, + "step": 1787, + "train/total_loss": 0.2098046839237213 + }, + { + "entropy": 10.408124923706055, + "epoch": 0.1767846549337552, + "mean_token_accuracy": 0.7104377150535583, + "num_tokens": 9303661.0, + "step": 1788, + "train/ce_loss": 6.664123793598264e-05 + }, + { + "epoch": 0.1767846549337552, + "step": 1788, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.1767846549337552, + "step": 1788, + "train/total_loss": 0.04688166454434395 + }, + { + "entropy": 9.13021183013916, + "epoch": 0.1768835277832707, + "mean_token_accuracy": 0.730526328086853, + "num_tokens": 9309093.0, + "step": 1789, + "train/ce_loss": 0.8388895988464355 + }, + { + "epoch": 0.1768835277832707, + "step": 1789, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.1768835277832707, + "step": 1789, + "train/total_loss": 0.1424827128648758 + }, + { + "entropy": 9.56302261352539, + "epoch": 0.17698240063278622, + "mean_token_accuracy": 0.6944444179534912, + "num_tokens": 9314152.0, + "step": 1790, + "train/ce_loss": 1.0453897714614868 + }, + { + "epoch": 0.17698240063278622, + "step": 1790, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.17698240063278622, + "step": 1790, + "train/total_loss": 0.18266397714614868 + }, + { + "entropy": 9.247736930847168, + "epoch": 0.17708127348230177, + "mean_token_accuracy": 0.788557231426239, + "num_tokens": 9319444.0, + "step": 1791, + "train/ce_loss": 0.49960947036743164 + }, + { + "epoch": 0.17708127348230177, + "step": 1791, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.17708127348230177, + "step": 1791, + "train/total_loss": 0.1476171910762787 + }, + { + "entropy": 9.873125076293945, + "epoch": 0.17718014633181728, + "mean_token_accuracy": 0.7311608791351318, + "num_tokens": 9324346.0, + "step": 1792, + "train/ce_loss": 2.2994272708892822 + }, + { + "epoch": 0.17718014633181728, + "step": 1792, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.17718014633181728, + "step": 1792, + "train/total_loss": 0.2924427390098572 + }, + { + "entropy": 9.160506248474121, + "epoch": 0.17727901918133282, + "mean_token_accuracy": 0.7244898080825806, + "num_tokens": 9329680.0, + "step": 1793, + "train/ce_loss": 1.0658172369003296 + }, + { + "epoch": 0.17727901918133282, + "step": 1793, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.17727901918133282, + "step": 1793, + "train/total_loss": 0.19642546772956848 + }, + { + "entropy": 9.280560493469238, + "epoch": 0.17737789203084833, + "mean_token_accuracy": 0.7568270564079285, + "num_tokens": 9334912.0, + "step": 1794, + "train/ce_loss": 1.158437728881836 + }, + { + "epoch": 0.17737789203084833, + "step": 1794, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.17737789203084833, + "step": 1794, + "train/total_loss": 0.1861562728881836 + }, + { + "entropy": 10.03182601928711, + "epoch": 0.17747676488036385, + "mean_token_accuracy": 0.7526881694793701, + "num_tokens": 9339782.0, + "step": 1795, + "train/ce_loss": 2.1850147247314453 + }, + { + "epoch": 0.17747676488036385, + "step": 1795, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.17747676488036385, + "step": 1795, + "train/total_loss": 0.304438978433609 + }, + { + "entropy": 8.88985824584961, + "epoch": 0.1775756377298794, + "mean_token_accuracy": 0.7665244936943054, + "num_tokens": 9345184.0, + "step": 1796, + "train/ce_loss": 0.5084155797958374 + }, + { + "epoch": 0.1775756377298794, + "step": 1796, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.1775756377298794, + "step": 1796, + "train/total_loss": 0.1172478049993515 + }, + { + "entropy": 8.852987289428711, + "epoch": 0.1776745105793949, + "mean_token_accuracy": 0.7342026233673096, + "num_tokens": 9350669.0, + "step": 1797, + "train/ce_loss": 0.6396374106407166 + }, + { + "epoch": 0.1776745105793949, + "step": 1797, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.1776745105793949, + "step": 1797, + "train/total_loss": 0.15380749106407166 + }, + { + "entropy": 9.820240020751953, + "epoch": 0.17777338342891041, + "mean_token_accuracy": 0.765072762966156, + "num_tokens": 9355647.0, + "step": 1798, + "train/ce_loss": 1.4581230878829956 + }, + { + "epoch": 0.17777338342891041, + "step": 1798, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.17777338342891041, + "step": 1798, + "train/total_loss": 0.21612481772899628 + }, + { + "entropy": 9.143946647644043, + "epoch": 0.17787225627842596, + "mean_token_accuracy": 0.6997518539428711, + "num_tokens": 9360961.0, + "step": 1799, + "train/ce_loss": 1.3224116563796997 + }, + { + "epoch": 0.17787225627842596, + "step": 1799, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.17787225627842596, + "step": 1799, + "train/total_loss": 0.1947411745786667 + }, + { + "epoch": 0.17797112912794147, + "grad_norm": 1.0880522727966309, + "learning_rate": 9.557681847401474e-06, + "loss": 0.1681, + "step": 1800 + }, + { + "entropy": 9.875439643859863, + "epoch": 0.17797112912794147, + "mean_token_accuracy": 0.7449209690093994, + "num_tokens": 9365872.0, + "step": 1800, + "train/ce_loss": 0.9147996306419373 + }, + { + "epoch": 0.17797112912794147, + "step": 1800, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.17797112912794147, + "step": 1800, + "train/total_loss": 0.14616745710372925 + }, + { + "entropy": 9.731260299682617, + "epoch": 0.17807000197745698, + "mean_token_accuracy": 0.7075098752975464, + "num_tokens": 9370789.0, + "step": 1801, + "train/ce_loss": 1.002318024635315 + }, + { + "epoch": 0.17807000197745698, + "step": 1801, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.17807000197745698, + "step": 1801, + "train/total_loss": 0.16663804650306702 + }, + { + "entropy": 9.157102584838867, + "epoch": 0.17816887482697252, + "mean_token_accuracy": 0.7615384459495544, + "num_tokens": 9376296.0, + "step": 1802, + "train/ce_loss": 0.8391835689544678 + }, + { + "epoch": 0.17816887482697252, + "step": 1802, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.17816887482697252, + "step": 1802, + "train/total_loss": 0.15032461285591125 + }, + { + "entropy": 9.377922058105469, + "epoch": 0.17826774767648804, + "mean_token_accuracy": 0.7012278437614441, + "num_tokens": 9381493.0, + "step": 1803, + "train/ce_loss": 0.7130151987075806 + }, + { + "epoch": 0.17826774767648804, + "step": 1803, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.17826774767648804, + "step": 1803, + "train/total_loss": 0.15333276987075806 + }, + { + "entropy": 9.224993705749512, + "epoch": 0.17836662052600355, + "mean_token_accuracy": 0.7852028608322144, + "num_tokens": 9386798.0, + "step": 1804, + "train/ce_loss": 0.8284642696380615 + }, + { + "epoch": 0.17836662052600355, + "step": 1804, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.17836662052600355, + "step": 1804, + "train/total_loss": 0.11019017547369003 + }, + { + "entropy": 9.024450302124023, + "epoch": 0.1784654933755191, + "mean_token_accuracy": 0.7416413426399231, + "num_tokens": 9392253.0, + "step": 1805, + "train/ce_loss": 1.0536538362503052 + }, + { + "epoch": 0.1784654933755191, + "step": 1805, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.1784654933755191, + "step": 1805, + "train/total_loss": 0.21864664554595947 + }, + { + "entropy": 9.209526062011719, + "epoch": 0.1785643662250346, + "mean_token_accuracy": 0.7149425148963928, + "num_tokens": 9397564.0, + "step": 1806, + "train/ce_loss": 0.672696590423584 + }, + { + "epoch": 0.1785643662250346, + "step": 1806, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.1785643662250346, + "step": 1806, + "train/total_loss": 0.18836340308189392 + }, + { + "entropy": 8.976957321166992, + "epoch": 0.17866323907455012, + "mean_token_accuracy": 0.7857911586761475, + "num_tokens": 9402970.0, + "step": 1807, + "train/ce_loss": 0.602595329284668 + }, + { + "epoch": 0.17866323907455012, + "step": 1807, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.17866323907455012, + "step": 1807, + "train/total_loss": 0.12666578590869904 + }, + { + "entropy": 9.577447891235352, + "epoch": 0.17876211192406566, + "mean_token_accuracy": 0.7555555701255798, + "num_tokens": 9408069.0, + "step": 1808, + "train/ce_loss": 0.6192771792411804 + }, + { + "epoch": 0.17876211192406566, + "step": 1808, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.17876211192406566, + "step": 1808, + "train/total_loss": 0.15177147090435028 + }, + { + "entropy": 9.357033729553223, + "epoch": 0.17886098477358117, + "mean_token_accuracy": 0.7422401905059814, + "num_tokens": 9413239.0, + "step": 1809, + "train/ce_loss": 1.4079818725585938 + }, + { + "epoch": 0.17886098477358117, + "step": 1809, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.17886098477358117, + "step": 1809, + "train/total_loss": 0.2345481961965561 + }, + { + "entropy": 9.14383316040039, + "epoch": 0.17895985762309669, + "mean_token_accuracy": 0.7847533822059631, + "num_tokens": 9418600.0, + "step": 1810, + "train/ce_loss": 0.9785193204879761 + }, + { + "epoch": 0.17895985762309669, + "step": 1810, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.17895985762309669, + "step": 1810, + "train/total_loss": 0.1837894320487976 + }, + { + "entropy": 10.573735237121582, + "epoch": 0.17905873047261223, + "mean_token_accuracy": 0.6759776473045349, + "num_tokens": 9423167.0, + "step": 1811, + "train/ce_loss": 4.147632122039795 + }, + { + "epoch": 0.17905873047261223, + "step": 1811, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.17905873047261223, + "step": 1811, + "train/total_loss": 0.5007007122039795 + }, + { + "entropy": 9.858811378479004, + "epoch": 0.17915760332212774, + "mean_token_accuracy": 0.7148080468177795, + "num_tokens": 9428113.0, + "step": 1812, + "train/ce_loss": 1.2605161666870117 + }, + { + "epoch": 0.17915760332212774, + "step": 1812, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.17915760332212774, + "step": 1812, + "train/total_loss": 0.1807391196489334 + }, + { + "entropy": 8.92715835571289, + "epoch": 0.17925647617164325, + "mean_token_accuracy": 0.7004877924919128, + "num_tokens": 9433602.0, + "step": 1813, + "train/ce_loss": 1.0510623455047607 + }, + { + "epoch": 0.17925647617164325, + "step": 1813, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.17925647617164325, + "step": 1813, + "train/total_loss": 0.15588748455047607 + }, + { + "entropy": 9.373226165771484, + "epoch": 0.1793553490211588, + "mean_token_accuracy": 0.7319711446762085, + "num_tokens": 9439078.0, + "step": 1814, + "train/ce_loss": 1.2842234373092651 + }, + { + "epoch": 0.1793553490211588, + "step": 1814, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.1793553490211588, + "step": 1814, + "train/total_loss": 0.233891099691391 + }, + { + "entropy": 9.695732116699219, + "epoch": 0.1794542218706743, + "mean_token_accuracy": 0.7933884263038635, + "num_tokens": 9444124.0, + "step": 1815, + "train/ce_loss": 5.596983555733459e-06 + }, + { + "epoch": 0.1794542218706743, + "step": 1815, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.1794542218706743, + "step": 1815, + "train/total_loss": 0.03515680879354477 + }, + { + "entropy": 9.347949981689453, + "epoch": 0.17955309472018985, + "mean_token_accuracy": 0.7309136390686035, + "num_tokens": 9449371.0, + "step": 1816, + "train/ce_loss": 0.9072271585464478 + }, + { + "epoch": 0.17955309472018985, + "step": 1816, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.17955309472018985, + "step": 1816, + "train/total_loss": 0.1219727173447609 + }, + { + "entropy": 9.286724090576172, + "epoch": 0.17965196756970536, + "mean_token_accuracy": 0.7020997405052185, + "num_tokens": 9454597.0, + "step": 1817, + "train/ce_loss": 0.5547173023223877 + }, + { + "epoch": 0.17965196756970536, + "step": 1817, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.17965196756970536, + "step": 1817, + "train/total_loss": 0.10234673321247101 + }, + { + "entropy": 9.670854568481445, + "epoch": 0.17975084041922088, + "mean_token_accuracy": 0.7434312105178833, + "num_tokens": 9459663.0, + "step": 1818, + "train/ce_loss": 5.238787252892507e-06 + }, + { + "epoch": 0.17975084041922088, + "step": 1818, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.17975084041922088, + "step": 1818, + "train/total_loss": 0.07812552154064178 + }, + { + "entropy": 9.288985252380371, + "epoch": 0.17984971326873642, + "mean_token_accuracy": 0.7222222089767456, + "num_tokens": 9464980.0, + "step": 1819, + "train/ce_loss": 0.8383810520172119 + }, + { + "epoch": 0.17984971326873642, + "step": 1819, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.17984971326873642, + "step": 1819, + "train/total_loss": 0.1346193552017212 + }, + { + "epoch": 0.17994858611825193, + "grad_norm": 0.9709348082542419, + "learning_rate": 9.552736982643526e-06, + "loss": 0.1551, + "step": 1820 + }, + { + "entropy": 9.821374893188477, + "epoch": 0.17994858611825193, + "mean_token_accuracy": 0.6834782361984253, + "num_tokens": 9469973.0, + "step": 1820, + "train/ce_loss": 1.030767798423767 + }, + { + "epoch": 0.17994858611825193, + "step": 1820, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.17994858611825193, + "step": 1820, + "train/total_loss": 0.1851080358028412 + }, + { + "entropy": 9.43313217163086, + "epoch": 0.18004745896776744, + "mean_token_accuracy": 0.7410852909088135, + "num_tokens": 9475082.0, + "step": 1821, + "train/ce_loss": 0.7259458899497986 + }, + { + "epoch": 0.18004745896776744, + "step": 1821, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.18004745896776744, + "step": 1821, + "train/total_loss": 0.12728208303451538 + }, + { + "entropy": 9.883010864257812, + "epoch": 0.18014633181728298, + "mean_token_accuracy": 0.7248576879501343, + "num_tokens": 9479978.0, + "step": 1822, + "train/ce_loss": 6.774416760890745e-06 + }, + { + "epoch": 0.18014633181728298, + "step": 1822, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.18014633181728298, + "step": 1822, + "train/total_loss": 0.02734442800283432 + }, + { + "entropy": 9.508912086486816, + "epoch": 0.1802452046667985, + "mean_token_accuracy": 0.7350901365280151, + "num_tokens": 9485097.0, + "step": 1823, + "train/ce_loss": 1.1825639009475708 + }, + { + "epoch": 0.1802452046667985, + "step": 1823, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.1802452046667985, + "step": 1823, + "train/total_loss": 0.21200639009475708 + }, + { + "entropy": 9.023045539855957, + "epoch": 0.180344077516314, + "mean_token_accuracy": 0.7669801712036133, + "num_tokens": 9490520.0, + "step": 1824, + "train/ce_loss": 1.0307142734527588 + }, + { + "epoch": 0.180344077516314, + "step": 1824, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.180344077516314, + "step": 1824, + "train/total_loss": 0.1616651713848114 + }, + { + "entropy": 9.16715145111084, + "epoch": 0.18044295036582955, + "mean_token_accuracy": 0.7367149591445923, + "num_tokens": 9495871.0, + "step": 1825, + "train/ce_loss": 0.8486382961273193 + }, + { + "epoch": 0.18044295036582955, + "step": 1825, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.18044295036582955, + "step": 1825, + "train/total_loss": 0.1551763415336609 + }, + { + "entropy": 10.543107032775879, + "epoch": 0.18054182321534507, + "mean_token_accuracy": 0.7471264600753784, + "num_tokens": 9500474.0, + "step": 1826, + "train/ce_loss": 5.42702000529971e-05 + }, + { + "epoch": 0.18054182321534507, + "step": 1826, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.18054182321534507, + "step": 1826, + "train/total_loss": 0.06641167402267456 + }, + { + "entropy": 9.587188720703125, + "epoch": 0.18064069606486058, + "mean_token_accuracy": 0.7706552743911743, + "num_tokens": 9505620.0, + "step": 1827, + "train/ce_loss": 1.4216008186340332 + }, + { + "epoch": 0.18064069606486058, + "step": 1827, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.18064069606486058, + "step": 1827, + "train/total_loss": 0.2085663378238678 + }, + { + "entropy": 9.30483627319336, + "epoch": 0.18073956891437612, + "mean_token_accuracy": 0.7323943376541138, + "num_tokens": 9510941.0, + "step": 1828, + "train/ce_loss": 0.8432378172874451 + }, + { + "epoch": 0.18073956891437612, + "step": 1828, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.18073956891437612, + "step": 1828, + "train/total_loss": 0.15854254364967346 + }, + { + "entropy": 9.974531173706055, + "epoch": 0.18083844176389163, + "mean_token_accuracy": 0.6699604988098145, + "num_tokens": 9515857.0, + "step": 1829, + "train/ce_loss": 1.2670997381210327 + }, + { + "epoch": 0.18083844176389163, + "step": 1829, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.18083844176389163, + "step": 1829, + "train/total_loss": 0.2126474827528 + }, + { + "entropy": 9.296100616455078, + "epoch": 0.18093731461340715, + "mean_token_accuracy": 0.7124260067939758, + "num_tokens": 9521156.0, + "step": 1830, + "train/ce_loss": 0.601858913898468 + }, + { + "epoch": 0.18093731461340715, + "step": 1830, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.18093731461340715, + "step": 1830, + "train/total_loss": 0.08362339437007904 + }, + { + "entropy": 9.4048433303833, + "epoch": 0.1810361874629227, + "mean_token_accuracy": 0.6895705461502075, + "num_tokens": 9526351.0, + "step": 1831, + "train/ce_loss": 0.674392580986023 + }, + { + "epoch": 0.1810361874629227, + "step": 1831, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.1810361874629227, + "step": 1831, + "train/total_loss": 0.1611892580986023 + }, + { + "entropy": 9.436763763427734, + "epoch": 0.1811350603124382, + "mean_token_accuracy": 0.7588152289390564, + "num_tokens": 9531530.0, + "step": 1832, + "train/ce_loss": 0.8237096071243286 + }, + { + "epoch": 0.1811350603124382, + "step": 1832, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.1811350603124382, + "step": 1832, + "train/total_loss": 0.16049596667289734 + }, + { + "entropy": 9.5198335647583, + "epoch": 0.18123393316195371, + "mean_token_accuracy": 0.7661623358726501, + "num_tokens": 9536692.0, + "step": 1833, + "train/ce_loss": 1.0668498277664185 + }, + { + "epoch": 0.18123393316195371, + "step": 1833, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.18123393316195371, + "step": 1833, + "train/total_loss": 0.14574748277664185 + }, + { + "entropy": 8.950961112976074, + "epoch": 0.18133280601146926, + "mean_token_accuracy": 0.7164319157600403, + "num_tokens": 9542387.0, + "step": 1834, + "train/ce_loss": 1.0130492448806763 + }, + { + "epoch": 0.18133280601146926, + "step": 1834, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.18133280601146926, + "step": 1834, + "train/total_loss": 0.18333616852760315 + }, + { + "entropy": 9.354697227478027, + "epoch": 0.18143167886098477, + "mean_token_accuracy": 0.7476635575294495, + "num_tokens": 9547598.0, + "step": 1835, + "train/ce_loss": 0.7982195615768433 + }, + { + "epoch": 0.18143167886098477, + "step": 1835, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.18143167886098477, + "step": 1835, + "train/total_loss": 0.13841570913791656 + }, + { + "entropy": 9.004920959472656, + "epoch": 0.1815305517105003, + "mean_token_accuracy": 0.7362030744552612, + "num_tokens": 9552969.0, + "step": 1836, + "train/ce_loss": 0.7738270163536072 + }, + { + "epoch": 0.1815305517105003, + "step": 1836, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.1815305517105003, + "step": 1836, + "train/total_loss": 0.17113271355628967 + }, + { + "entropy": 8.88099193572998, + "epoch": 0.18162942456001582, + "mean_token_accuracy": 0.7259978652000427, + "num_tokens": 9558426.0, + "step": 1837, + "train/ce_loss": 0.3200748562812805 + }, + { + "epoch": 0.18162942456001582, + "step": 1837, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.18162942456001582, + "step": 1837, + "train/total_loss": 0.05935123562812805 + }, + { + "entropy": 9.812850952148438, + "epoch": 0.18172829740953134, + "mean_token_accuracy": 0.7011070251464844, + "num_tokens": 9563367.0, + "step": 1838, + "train/ce_loss": 0.746933102607727 + }, + { + "epoch": 0.18172829740953134, + "step": 1838, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.18172829740953134, + "step": 1838, + "train/total_loss": 0.17234957218170166 + }, + { + "entropy": 9.44894790649414, + "epoch": 0.18182717025904688, + "mean_token_accuracy": 0.7223684191703796, + "num_tokens": 9568596.0, + "step": 1839, + "train/ce_loss": 1.4718496799468994 + }, + { + "epoch": 0.18182717025904688, + "step": 1839, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.18182717025904688, + "step": 1839, + "train/total_loss": 0.27218496799468994 + }, + { + "epoch": 0.1819260431085624, + "grad_norm": 0.8998593091964722, + "learning_rate": 9.547792117885577e-06, + "loss": 0.1612, + "step": 1840 + }, + { + "entropy": 9.199764251708984, + "epoch": 0.1819260431085624, + "mean_token_accuracy": 0.7387606501579285, + "num_tokens": 9573826.0, + "step": 1840, + "train/ce_loss": 0.780877947807312 + }, + { + "epoch": 0.1819260431085624, + "step": 1840, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.1819260431085624, + "step": 1840, + "train/total_loss": 0.12886905670166016 + }, + { + "entropy": 8.80765151977539, + "epoch": 0.1820249159580779, + "mean_token_accuracy": 0.7360248565673828, + "num_tokens": 9579299.0, + "step": 1841, + "train/ce_loss": 0.8180506825447083 + }, + { + "epoch": 0.1820249159580779, + "step": 1841, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.1820249159580779, + "step": 1841, + "train/total_loss": 0.17946133017539978 + }, + { + "entropy": 9.316116333007812, + "epoch": 0.18212378880759345, + "mean_token_accuracy": 0.7462499737739563, + "num_tokens": 9584548.0, + "step": 1842, + "train/ce_loss": 0.7248634099960327 + }, + { + "epoch": 0.18212378880759345, + "step": 1842, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.18212378880759345, + "step": 1842, + "train/total_loss": 0.09983009099960327 + }, + { + "entropy": 9.40951919555664, + "epoch": 0.18222266165710896, + "mean_token_accuracy": 0.7121211886405945, + "num_tokens": 9589726.0, + "step": 1843, + "train/ce_loss": 0.7894455194473267 + }, + { + "epoch": 0.18222266165710896, + "step": 1843, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.18222266165710896, + "step": 1843, + "train/total_loss": 0.16097581386566162 + }, + { + "entropy": 9.293411254882812, + "epoch": 0.18232153450662447, + "mean_token_accuracy": 0.766749382019043, + "num_tokens": 9595011.0, + "step": 1844, + "train/ce_loss": 1.7790314814192243e-05 + }, + { + "epoch": 0.18232153450662447, + "step": 1844, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.18232153450662447, + "step": 1844, + "train/total_loss": 0.09375178068876266 + }, + { + "entropy": 9.97812271118164, + "epoch": 0.18242040735614, + "mean_token_accuracy": 0.7121211886405945, + "num_tokens": 9599837.0, + "step": 1845, + "train/ce_loss": 1.2837271690368652 + }, + { + "epoch": 0.18242040735614, + "step": 1845, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.18242040735614, + "step": 1845, + "train/total_loss": 0.23384146392345428 + }, + { + "entropy": 9.744732856750488, + "epoch": 0.18251928020565553, + "mean_token_accuracy": 0.7655986547470093, + "num_tokens": 9604874.0, + "step": 1846, + "train/ce_loss": 0.9753443598747253 + }, + { + "epoch": 0.18251928020565553, + "step": 1846, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.18251928020565553, + "step": 1846, + "train/total_loss": 0.12487819045782089 + }, + { + "entropy": 9.101228713989258, + "epoch": 0.18261815305517104, + "mean_token_accuracy": 0.7458006739616394, + "num_tokens": 9610218.0, + "step": 1847, + "train/ce_loss": 0.9057663679122925 + }, + { + "epoch": 0.18261815305517104, + "step": 1847, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.18261815305517104, + "step": 1847, + "train/total_loss": 0.2233891487121582 + }, + { + "entropy": 9.602880477905273, + "epoch": 0.18271702590468658, + "mean_token_accuracy": 0.7296848893165588, + "num_tokens": 9615236.0, + "step": 1848, + "train/ce_loss": 1.7166345119476318 + }, + { + "epoch": 0.18271702590468658, + "step": 1848, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.18271702590468658, + "step": 1848, + "train/total_loss": 0.30447596311569214 + }, + { + "entropy": 9.3284912109375, + "epoch": 0.1828158987542021, + "mean_token_accuracy": 0.7758318781852722, + "num_tokens": 9620228.0, + "step": 1849, + "train/ce_loss": 0.9070501923561096 + }, + { + "epoch": 0.1828158987542021, + "step": 1849, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.1828158987542021, + "step": 1849, + "train/total_loss": 0.1805487722158432 + }, + { + "entropy": 9.956958770751953, + "epoch": 0.1829147716037176, + "mean_token_accuracy": 0.7658079862594604, + "num_tokens": 9625117.0, + "step": 1850, + "train/ce_loss": 9.23292463994585e-06 + }, + { + "epoch": 0.1829147716037176, + "step": 1850, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.1829147716037176, + "step": 1850, + "train/total_loss": 0.05859467387199402 + }, + { + "entropy": 9.088489532470703, + "epoch": 0.18301364445323315, + "mean_token_accuracy": 0.7605321407318115, + "num_tokens": 9630474.0, + "step": 1851, + "train/ce_loss": 1.0857402086257935 + }, + { + "epoch": 0.18301364445323315, + "step": 1851, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.18301364445323315, + "step": 1851, + "train/total_loss": 0.2257615327835083 + }, + { + "entropy": 8.86593246459961, + "epoch": 0.18311251730274866, + "mean_token_accuracy": 0.6904024481773376, + "num_tokens": 9635916.0, + "step": 1852, + "train/ce_loss": 1.391951322555542 + }, + { + "epoch": 0.18311251730274866, + "step": 1852, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.18311251730274866, + "step": 1852, + "train/total_loss": 0.20950762927532196 + }, + { + "entropy": 9.429494857788086, + "epoch": 0.18321139015226418, + "mean_token_accuracy": 0.7647849321365356, + "num_tokens": 9641109.0, + "step": 1853, + "train/ce_loss": 0.6080541014671326 + }, + { + "epoch": 0.18321139015226418, + "step": 1853, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.18321139015226418, + "step": 1853, + "train/total_loss": 0.09596166014671326 + }, + { + "entropy": 9.24432373046875, + "epoch": 0.18331026300177972, + "mean_token_accuracy": 0.7325268983840942, + "num_tokens": 9646320.0, + "step": 1854, + "train/ce_loss": 0.9263020157814026 + }, + { + "epoch": 0.18331026300177972, + "step": 1854, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.18331026300177972, + "step": 1854, + "train/total_loss": 0.11216145008802414 + }, + { + "entropy": 9.437981605529785, + "epoch": 0.18340913585129523, + "mean_token_accuracy": 0.7396870851516724, + "num_tokens": 9651493.0, + "step": 1855, + "train/ce_loss": 0.9242285490036011 + }, + { + "epoch": 0.18340913585129523, + "step": 1855, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.18340913585129523, + "step": 1855, + "train/total_loss": 0.13539160788059235 + }, + { + "entropy": 9.209920883178711, + "epoch": 0.18350800870081077, + "mean_token_accuracy": 0.7011643052101135, + "num_tokens": 9656744.0, + "step": 1856, + "train/ce_loss": 1.042596459388733 + }, + { + "epoch": 0.18350800870081077, + "step": 1856, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.18350800870081077, + "step": 1856, + "train/total_loss": 0.2097283899784088 + }, + { + "entropy": 9.788610458374023, + "epoch": 0.18360688155032628, + "mean_token_accuracy": 0.7260788083076477, + "num_tokens": 9661703.0, + "step": 1857, + "train/ce_loss": 6.188375664351042e-06 + }, + { + "epoch": 0.18360688155032628, + "step": 1857, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.18360688155032628, + "step": 1857, + "train/total_loss": 0.042969368398189545 + }, + { + "entropy": 9.401823043823242, + "epoch": 0.1837057543998418, + "mean_token_accuracy": 0.6855670213699341, + "num_tokens": 9666917.0, + "step": 1858, + "train/ce_loss": 1.1027350425720215 + }, + { + "epoch": 0.1837057543998418, + "step": 1858, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.1837057543998418, + "step": 1858, + "train/total_loss": 0.17277351021766663 + }, + { + "entropy": 9.327844619750977, + "epoch": 0.18380462724935734, + "mean_token_accuracy": 0.742514967918396, + "num_tokens": 9672173.0, + "step": 1859, + "train/ce_loss": 0.74072265625 + }, + { + "epoch": 0.18380462724935734, + "step": 1859, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.18380462724935734, + "step": 1859, + "train/total_loss": 0.13657227158546448 + }, + { + "epoch": 0.18390350009887285, + "grad_norm": 0.8869383335113525, + "learning_rate": 9.542847253127627e-06, + "loss": 0.153, + "step": 1860 + }, + { + "entropy": 9.441791534423828, + "epoch": 0.18390350009887285, + "mean_token_accuracy": 0.7203728556632996, + "num_tokens": 9677377.0, + "step": 1860, + "train/ce_loss": 1.3488295078277588 + }, + { + "epoch": 0.18390350009887285, + "step": 1860, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.18390350009887285, + "step": 1860, + "train/total_loss": 0.20519545674324036 + }, + { + "entropy": 8.826675415039062, + "epoch": 0.18400237294838837, + "mean_token_accuracy": 0.7460018992424011, + "num_tokens": 9682960.0, + "step": 1861, + "train/ce_loss": 0.5895362496376038 + }, + { + "epoch": 0.18400237294838837, + "step": 1861, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.18400237294838837, + "step": 1861, + "train/total_loss": 0.09020362794399261 + }, + { + "entropy": 9.426769256591797, + "epoch": 0.1841012457979039, + "mean_token_accuracy": 0.7626886367797852, + "num_tokens": 9688133.0, + "step": 1862, + "train/ce_loss": 3.955638931074645e-06 + }, + { + "epoch": 0.1841012457979039, + "step": 1862, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.1841012457979039, + "step": 1862, + "train/total_loss": 0.08593789488077164 + }, + { + "entropy": 9.118680000305176, + "epoch": 0.18420011864741942, + "mean_token_accuracy": 0.7420091032981873, + "num_tokens": 9693421.0, + "step": 1863, + "train/ce_loss": 0.7646657228469849 + }, + { + "epoch": 0.18420011864741942, + "step": 1863, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.18420011864741942, + "step": 1863, + "train/total_loss": 0.12724782526493073 + }, + { + "entropy": 10.062543869018555, + "epoch": 0.18429899149693493, + "mean_token_accuracy": 0.6973365545272827, + "num_tokens": 9698256.0, + "step": 1864, + "train/ce_loss": 2.019545718212612e-05 + }, + { + "epoch": 0.18429899149693493, + "step": 1864, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.18429899149693493, + "step": 1864, + "train/total_loss": 0.046877019107341766 + }, + { + "entropy": 9.804080963134766, + "epoch": 0.18439786434645047, + "mean_token_accuracy": 0.6678898930549622, + "num_tokens": 9703264.0, + "step": 1865, + "train/ce_loss": 2.0530173778533936 + }, + { + "epoch": 0.18439786434645047, + "step": 1865, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.18439786434645047, + "step": 1865, + "train/total_loss": 0.33420801162719727 + }, + { + "entropy": 9.011037826538086, + "epoch": 0.184496737195966, + "mean_token_accuracy": 0.6892778873443604, + "num_tokens": 9708695.0, + "step": 1866, + "train/ce_loss": 0.7095602750778198 + }, + { + "epoch": 0.184496737195966, + "step": 1866, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.184496737195966, + "step": 1866, + "train/total_loss": 0.1568935215473175 + }, + { + "entropy": 9.28196907043457, + "epoch": 0.1845956100454815, + "mean_token_accuracy": 0.7383177280426025, + "num_tokens": 9713972.0, + "step": 1867, + "train/ce_loss": 0.9496920108795166 + }, + { + "epoch": 0.1845956100454815, + "step": 1867, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.1845956100454815, + "step": 1867, + "train/total_loss": 0.16137546300888062 + }, + { + "entropy": 9.117605209350586, + "epoch": 0.18469448289499704, + "mean_token_accuracy": 0.7533632516860962, + "num_tokens": 9719362.0, + "step": 1868, + "train/ce_loss": 0.6435233354568481 + }, + { + "epoch": 0.18469448289499704, + "step": 1868, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.18469448289499704, + "step": 1868, + "train/total_loss": 0.12685233354568481 + }, + { + "entropy": 9.366474151611328, + "epoch": 0.18479335574451256, + "mean_token_accuracy": 0.7259551882743835, + "num_tokens": 9724575.0, + "step": 1869, + "train/ce_loss": 0.67132568359375 + }, + { + "epoch": 0.18479335574451256, + "step": 1869, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.18479335574451256, + "step": 1869, + "train/total_loss": 0.11791381984949112 + }, + { + "entropy": 9.35400390625, + "epoch": 0.18489222859402807, + "mean_token_accuracy": 0.7232037782669067, + "num_tokens": 9729866.0, + "step": 1870, + "train/ce_loss": 0.6131327152252197 + }, + { + "epoch": 0.18489222859402807, + "step": 1870, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.18489222859402807, + "step": 1870, + "train/total_loss": 0.11209452152252197 + }, + { + "entropy": 10.150435447692871, + "epoch": 0.1849911014435436, + "mean_token_accuracy": 0.7472527623176575, + "num_tokens": 9734623.0, + "step": 1871, + "train/ce_loss": 8.638548024464399e-06 + }, + { + "epoch": 0.1849911014435436, + "step": 1871, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.1849911014435436, + "step": 1871, + "train/total_loss": 0.03125086426734924 + }, + { + "entropy": 9.388439178466797, + "epoch": 0.18508997429305912, + "mean_token_accuracy": 0.7086183428764343, + "num_tokens": 9739807.0, + "step": 1872, + "train/ce_loss": 0.6699540019035339 + }, + { + "epoch": 0.18508997429305912, + "step": 1872, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.18508997429305912, + "step": 1872, + "train/total_loss": 0.19590166211128235 + }, + { + "entropy": 10.134531021118164, + "epoch": 0.18518884714257464, + "mean_token_accuracy": 0.6870229244232178, + "num_tokens": 9744553.0, + "step": 1873, + "train/ce_loss": 1.5828860998153687 + }, + { + "epoch": 0.18518884714257464, + "step": 1873, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.18518884714257464, + "step": 1873, + "train/total_loss": 0.2012573629617691 + }, + { + "entropy": 9.382888793945312, + "epoch": 0.18528771999209018, + "mean_token_accuracy": 0.721784770488739, + "num_tokens": 9749768.0, + "step": 1874, + "train/ce_loss": 0.5177397727966309 + }, + { + "epoch": 0.18528771999209018, + "step": 1874, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.18528771999209018, + "step": 1874, + "train/total_loss": 0.16896148025989532 + }, + { + "entropy": 9.096595764160156, + "epoch": 0.1853865928416057, + "mean_token_accuracy": 0.7314702272415161, + "num_tokens": 9755067.0, + "step": 1875, + "train/ce_loss": 0.4712185263633728 + }, + { + "epoch": 0.1853865928416057, + "step": 1875, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.1853865928416057, + "step": 1875, + "train/total_loss": 0.07055935263633728 + }, + { + "entropy": 8.749658584594727, + "epoch": 0.18548546569112123, + "mean_token_accuracy": 0.6856856942176819, + "num_tokens": 9760586.0, + "step": 1876, + "train/ce_loss": 0.7312849760055542 + }, + { + "epoch": 0.18548546569112123, + "step": 1876, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.18548546569112123, + "step": 1876, + "train/total_loss": 0.16687849164009094 + }, + { + "entropy": 9.147841453552246, + "epoch": 0.18558433854063675, + "mean_token_accuracy": 0.7352941036224365, + "num_tokens": 9765912.0, + "step": 1877, + "train/ce_loss": 1.8808138370513916 + }, + { + "epoch": 0.18558433854063675, + "step": 1877, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.18558433854063675, + "step": 1877, + "train/total_loss": 0.28573763370513916 + }, + { + "entropy": 9.344097137451172, + "epoch": 0.18568321139015226, + "mean_token_accuracy": 0.7443609237670898, + "num_tokens": 9771137.0, + "step": 1878, + "train/ce_loss": 0.42486655712127686 + }, + { + "epoch": 0.18568321139015226, + "step": 1878, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.18568321139015226, + "step": 1878, + "train/total_loss": 0.10108040273189545 + }, + { + "entropy": 9.687973022460938, + "epoch": 0.1857820842396678, + "mean_token_accuracy": 0.7692307829856873, + "num_tokens": 9776174.0, + "step": 1879, + "train/ce_loss": 1.415340542793274 + }, + { + "epoch": 0.1857820842396678, + "step": 1879, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.1857820842396678, + "step": 1879, + "train/total_loss": 0.23528406023979187 + }, + { + "epoch": 0.1858809570891833, + "grad_norm": 1.0253803730010986, + "learning_rate": 9.53790238836968e-06, + "loss": 0.1698, + "step": 1880 + }, + { + "entropy": 9.919747352600098, + "epoch": 0.1858809570891833, + "mean_token_accuracy": 0.7460629940032959, + "num_tokens": 9781115.0, + "step": 1880, + "train/ce_loss": 2.0956978797912598 + }, + { + "epoch": 0.1858809570891833, + "step": 1880, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.1858809570891833, + "step": 1880, + "train/total_loss": 0.2798823118209839 + }, + { + "entropy": 9.35361385345459, + "epoch": 0.18597982993869883, + "mean_token_accuracy": 0.7058081030845642, + "num_tokens": 9786382.0, + "step": 1881, + "train/ce_loss": 1.2247314453125 + }, + { + "epoch": 0.18597982993869883, + "step": 1881, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.18597982993869883, + "step": 1881, + "train/total_loss": 0.22403565049171448 + }, + { + "entropy": 9.506695747375488, + "epoch": 0.18607870278821437, + "mean_token_accuracy": 0.7617135047912598, + "num_tokens": 9791569.0, + "step": 1882, + "train/ce_loss": 7.135338364605559e-06 + }, + { + "epoch": 0.18607870278821437, + "step": 1882, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.18607870278821437, + "step": 1882, + "train/total_loss": 0.027344463393092155 + }, + { + "entropy": 9.088245391845703, + "epoch": 0.18617757563772988, + "mean_token_accuracy": 0.7137891054153442, + "num_tokens": 9796935.0, + "step": 1883, + "train/ce_loss": 1.3417773246765137 + }, + { + "epoch": 0.18617757563772988, + "step": 1883, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.18617757563772988, + "step": 1883, + "train/total_loss": 0.22792772948741913 + }, + { + "entropy": 9.646706581115723, + "epoch": 0.1862764484872454, + "mean_token_accuracy": 0.7172312140464783, + "num_tokens": 9801993.0, + "step": 1884, + "train/ce_loss": 1.3433440923690796 + }, + { + "epoch": 0.1862764484872454, + "step": 1884, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.1862764484872454, + "step": 1884, + "train/total_loss": 0.19683441519737244 + }, + { + "entropy": 9.41108512878418, + "epoch": 0.18637532133676094, + "mean_token_accuracy": 0.7837116122245789, + "num_tokens": 9807193.0, + "step": 1885, + "train/ce_loss": 0.5483527779579163 + }, + { + "epoch": 0.18637532133676094, + "step": 1885, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.18637532133676094, + "step": 1885, + "train/total_loss": 0.08608527481555939 + }, + { + "entropy": 9.341239929199219, + "epoch": 0.18647419418627645, + "mean_token_accuracy": 0.7682403326034546, + "num_tokens": 9812334.0, + "step": 1886, + "train/ce_loss": 0.5725024938583374 + }, + { + "epoch": 0.18647419418627645, + "step": 1886, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.18647419418627645, + "step": 1886, + "train/total_loss": 0.1588127464056015 + }, + { + "entropy": 9.302617073059082, + "epoch": 0.18657306703579196, + "mean_token_accuracy": 0.7543640732765198, + "num_tokens": 9817596.0, + "step": 1887, + "train/ce_loss": 1.5095512866973877 + }, + { + "epoch": 0.18657306703579196, + "step": 1887, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.18657306703579196, + "step": 1887, + "train/total_loss": 0.21345512568950653 + }, + { + "entropy": 9.235432624816895, + "epoch": 0.1866719398853075, + "mean_token_accuracy": 0.6889185309410095, + "num_tokens": 9822802.0, + "step": 1888, + "train/ce_loss": 0.9855005145072937 + }, + { + "epoch": 0.1866719398853075, + "step": 1888, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.1866719398853075, + "step": 1888, + "train/total_loss": 0.17667505145072937 + }, + { + "entropy": 9.256308555603027, + "epoch": 0.18677081273482302, + "mean_token_accuracy": 0.7429577708244324, + "num_tokens": 9828116.0, + "step": 1889, + "train/ce_loss": 0.7325549721717834 + }, + { + "epoch": 0.18677081273482302, + "step": 1889, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.18677081273482302, + "step": 1889, + "train/total_loss": 0.0927867516875267 + }, + { + "entropy": 10.075715065002441, + "epoch": 0.18686968558433853, + "mean_token_accuracy": 0.7091836929321289, + "num_tokens": 9832906.0, + "step": 1890, + "train/ce_loss": 1.0335999727249146 + }, + { + "epoch": 0.18686968558433853, + "step": 1890, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.18686968558433853, + "step": 1890, + "train/total_loss": 0.17757874727249146 + }, + { + "entropy": 9.484654426574707, + "epoch": 0.18696855843385407, + "mean_token_accuracy": 0.7492307424545288, + "num_tokens": 9838044.0, + "step": 1891, + "train/ce_loss": 0.8732141852378845 + }, + { + "epoch": 0.18696855843385407, + "step": 1891, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.18696855843385407, + "step": 1891, + "train/total_loss": 0.1810714304447174 + }, + { + "entropy": 9.349531173706055, + "epoch": 0.18706743128336958, + "mean_token_accuracy": 0.75157630443573, + "num_tokens": 9843240.0, + "step": 1892, + "train/ce_loss": 0.8059046864509583 + }, + { + "epoch": 0.18706743128336958, + "step": 1892, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.18706743128336958, + "step": 1892, + "train/total_loss": 0.17434047162532806 + }, + { + "entropy": 8.950783729553223, + "epoch": 0.1871663041328851, + "mean_token_accuracy": 0.7363343834877014, + "num_tokens": 9848615.0, + "step": 1893, + "train/ce_loss": 0.7747853994369507 + }, + { + "epoch": 0.1871663041328851, + "step": 1893, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.1871663041328851, + "step": 1893, + "train/total_loss": 0.11263479292392731 + }, + { + "entropy": 9.945171356201172, + "epoch": 0.18726517698240064, + "mean_token_accuracy": 0.7347368597984314, + "num_tokens": 9853517.0, + "step": 1894, + "train/ce_loss": 2.5057694074348547e-05 + }, + { + "epoch": 0.18726517698240064, + "step": 1894, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.18726517698240064, + "step": 1894, + "train/total_loss": 0.10937750339508057 + }, + { + "entropy": 9.073461532592773, + "epoch": 0.18736404983191615, + "mean_token_accuracy": 0.7284234762191772, + "num_tokens": 9858879.0, + "step": 1895, + "train/ce_loss": 1.130031943321228 + }, + { + "epoch": 0.18736404983191615, + "step": 1895, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.18736404983191615, + "step": 1895, + "train/total_loss": 0.1794094443321228 + }, + { + "entropy": 9.198670387268066, + "epoch": 0.18746292268143167, + "mean_token_accuracy": 0.7430406808853149, + "num_tokens": 9864262.0, + "step": 1896, + "train/ce_loss": 0.4583074152469635 + }, + { + "epoch": 0.18746292268143167, + "step": 1896, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.18746292268143167, + "step": 1896, + "train/total_loss": 0.07708074152469635 + }, + { + "entropy": 9.202181816101074, + "epoch": 0.1875617955309472, + "mean_token_accuracy": 0.6662763357162476, + "num_tokens": 9869603.0, + "step": 1897, + "train/ce_loss": 1.7217258214950562 + }, + { + "epoch": 0.1875617955309472, + "step": 1897, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.1875617955309472, + "step": 1897, + "train/total_loss": 0.2698288559913635 + }, + { + "entropy": 9.915672302246094, + "epoch": 0.18766066838046272, + "mean_token_accuracy": 0.6188679337501526, + "num_tokens": 9874546.0, + "step": 1898, + "train/ce_loss": 1.6661015251884237e-05 + }, + { + "epoch": 0.18766066838046272, + "step": 1898, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.18766066838046272, + "step": 1898, + "train/total_loss": 0.06640791893005371 + }, + { + "entropy": 10.016805648803711, + "epoch": 0.18775954122997826, + "mean_token_accuracy": 0.7373737096786499, + "num_tokens": 9879365.0, + "step": 1899, + "train/ce_loss": 1.7677421569824219 + }, + { + "epoch": 0.18775954122997826, + "step": 1899, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.18775954122997826, + "step": 1899, + "train/total_loss": 0.25880545377731323 + }, + { + "epoch": 0.18785841407949377, + "grad_norm": 1.1335080862045288, + "learning_rate": 9.53295752361173e-06, + "loss": 0.1604, + "step": 1900 + }, + { + "entropy": 9.707734107971191, + "epoch": 0.18785841407949377, + "mean_token_accuracy": 0.7730711102485657, + "num_tokens": 9884433.0, + "step": 1900, + "train/ce_loss": 0.7513488531112671 + }, + { + "epoch": 0.18785841407949377, + "step": 1900, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.18785841407949377, + "step": 1900, + "train/total_loss": 0.13763488829135895 + }, + { + "entropy": 9.360363006591797, + "epoch": 0.1879572869290093, + "mean_token_accuracy": 0.6983240246772766, + "num_tokens": 9889590.0, + "step": 1901, + "train/ce_loss": 0.8884443640708923 + }, + { + "epoch": 0.1879572869290093, + "step": 1901, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.1879572869290093, + "step": 1901, + "train/total_loss": 0.1708756983280182 + }, + { + "entropy": 9.513907432556152, + "epoch": 0.18805615977852483, + "mean_token_accuracy": 0.7134052515029907, + "num_tokens": 9894660.0, + "step": 1902, + "train/ce_loss": 1.128973364830017 + }, + { + "epoch": 0.18805615977852483, + "step": 1902, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.18805615977852483, + "step": 1902, + "train/total_loss": 0.2066473364830017 + }, + { + "entropy": 9.177980422973633, + "epoch": 0.18815503262804034, + "mean_token_accuracy": 0.7287173867225647, + "num_tokens": 9899966.0, + "step": 1903, + "train/ce_loss": 0.6958595514297485 + }, + { + "epoch": 0.18815503262804034, + "step": 1903, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.18815503262804034, + "step": 1903, + "train/total_loss": 0.09692970663309097 + }, + { + "entropy": 9.603252410888672, + "epoch": 0.18825390547755586, + "mean_token_accuracy": 0.7086092829704285, + "num_tokens": 9905018.0, + "step": 1904, + "train/ce_loss": 1.0595810413360596 + }, + { + "epoch": 0.18825390547755586, + "step": 1904, + "train/sim_loss": 0.17578125 + }, + { + "epoch": 0.18825390547755586, + "step": 1904, + "train/total_loss": 0.28173935413360596 + }, + { + "entropy": 9.92378044128418, + "epoch": 0.1883527783270714, + "mean_token_accuracy": 0.8280922174453735, + "num_tokens": 9909953.0, + "step": 1905, + "train/ce_loss": 1.083510398864746 + }, + { + "epoch": 0.1883527783270714, + "step": 1905, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.1883527783270714, + "step": 1905, + "train/total_loss": 0.15522605180740356 + }, + { + "entropy": 10.098663330078125, + "epoch": 0.1884516511765869, + "mean_token_accuracy": 0.6756032109260559, + "num_tokens": 9914775.0, + "step": 1906, + "train/ce_loss": 2.1146256923675537 + }, + { + "epoch": 0.1884516511765869, + "step": 1906, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.1884516511765869, + "step": 1906, + "train/total_loss": 0.3286500573158264 + }, + { + "entropy": 9.783650398254395, + "epoch": 0.18855052402610242, + "mean_token_accuracy": 0.7022375464439392, + "num_tokens": 9919760.0, + "step": 1907, + "train/ce_loss": 0.675777792930603 + }, + { + "epoch": 0.18855052402610242, + "step": 1907, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.18855052402610242, + "step": 1907, + "train/total_loss": 0.1222652792930603 + }, + { + "entropy": 9.392547607421875, + "epoch": 0.18864939687561796, + "mean_token_accuracy": 0.7565789222717285, + "num_tokens": 9924979.0, + "step": 1908, + "train/ce_loss": 0.8832827806472778 + }, + { + "epoch": 0.18864939687561796, + "step": 1908, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.18864939687561796, + "step": 1908, + "train/total_loss": 0.1664532721042633 + }, + { + "entropy": 10.167774200439453, + "epoch": 0.18874826972513348, + "mean_token_accuracy": 0.7160493731498718, + "num_tokens": 9929806.0, + "step": 1909, + "train/ce_loss": 2.2639083862304688 + }, + { + "epoch": 0.18874826972513348, + "step": 1909, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.18874826972513348, + "step": 1909, + "train/total_loss": 0.3201408386230469 + }, + { + "entropy": 9.179858207702637, + "epoch": 0.188847142574649, + "mean_token_accuracy": 0.6723940372467041, + "num_tokens": 9935182.0, + "step": 1910, + "train/ce_loss": 0.9893306493759155 + }, + { + "epoch": 0.188847142574649, + "step": 1910, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.188847142574649, + "step": 1910, + "train/total_loss": 0.19268307089805603 + }, + { + "entropy": 9.44682788848877, + "epoch": 0.18894601542416453, + "mean_token_accuracy": 0.7652284502983093, + "num_tokens": 9940395.0, + "step": 1911, + "train/ce_loss": 0.8784785270690918 + }, + { + "epoch": 0.18894601542416453, + "step": 1911, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.18894601542416453, + "step": 1911, + "train/total_loss": 0.15034785866737366 + }, + { + "entropy": 8.869483947753906, + "epoch": 0.18904488827368005, + "mean_token_accuracy": 0.7418073415756226, + "num_tokens": 9945921.0, + "step": 1912, + "train/ce_loss": 0.8187457323074341 + }, + { + "epoch": 0.18904488827368005, + "step": 1912, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.18904488827368005, + "step": 1912, + "train/total_loss": 0.10921832174062729 + }, + { + "entropy": 9.935173034667969, + "epoch": 0.18914376112319556, + "mean_token_accuracy": 0.7488986849784851, + "num_tokens": 9950794.0, + "step": 1913, + "train/ce_loss": 2.188868284225464 + }, + { + "epoch": 0.18914376112319556, + "step": 1913, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.18914376112319556, + "step": 1913, + "train/total_loss": 0.3126368522644043 + }, + { + "entropy": 9.276559829711914, + "epoch": 0.1892426339727111, + "mean_token_accuracy": 0.7250324487686157, + "num_tokens": 9956008.0, + "step": 1914, + "train/ce_loss": 0.7460759282112122 + }, + { + "epoch": 0.1892426339727111, + "step": 1914, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.1892426339727111, + "step": 1914, + "train/total_loss": 0.15663884580135345 + }, + { + "entropy": 9.76962661743164, + "epoch": 0.1893415068222266, + "mean_token_accuracy": 0.7380560040473938, + "num_tokens": 9961048.0, + "step": 1915, + "train/ce_loss": 0.7595387101173401 + }, + { + "epoch": 0.1893415068222266, + "step": 1915, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.1893415068222266, + "step": 1915, + "train/total_loss": 0.142360121011734 + }, + { + "entropy": 9.06021785736084, + "epoch": 0.18944037967174213, + "mean_token_accuracy": 0.6996337175369263, + "num_tokens": 9966328.0, + "step": 1916, + "train/ce_loss": 0.719662606716156 + }, + { + "epoch": 0.18944037967174213, + "step": 1916, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.18944037967174213, + "step": 1916, + "train/total_loss": 0.1344662606716156 + }, + { + "entropy": 9.592153549194336, + "epoch": 0.18953925252125767, + "mean_token_accuracy": 0.7346278429031372, + "num_tokens": 9971393.0, + "step": 1917, + "train/ce_loss": 1.203270435333252 + }, + { + "epoch": 0.18953925252125767, + "step": 1917, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.18953925252125767, + "step": 1917, + "train/total_loss": 0.22188955545425415 + }, + { + "entropy": 9.755895614624023, + "epoch": 0.18963812537077318, + "mean_token_accuracy": 0.807106614112854, + "num_tokens": 9976375.0, + "step": 1918, + "train/ce_loss": 8.259370588348247e-06 + }, + { + "epoch": 0.18963812537077318, + "step": 1918, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.18963812537077318, + "step": 1918, + "train/total_loss": 0.10156332701444626 + }, + { + "entropy": 9.63601303100586, + "epoch": 0.18973699822028872, + "mean_token_accuracy": 0.7152875065803528, + "num_tokens": 9981514.0, + "step": 1919, + "train/ce_loss": 0.933495283126831 + }, + { + "epoch": 0.18973699822028872, + "step": 1919, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.18973699822028872, + "step": 1919, + "train/total_loss": 0.11678703129291534 + }, + { + "epoch": 0.18983587106980424, + "grad_norm": 0.9176076054573059, + "learning_rate": 9.528012658853782e-06, + "loss": 0.1639, + "step": 1920 + }, + { + "entropy": 9.022931098937988, + "epoch": 0.18983587106980424, + "mean_token_accuracy": 0.6468129754066467, + "num_tokens": 9986990.0, + "step": 1920, + "train/ce_loss": 0.8590229749679565 + }, + { + "epoch": 0.18983587106980424, + "step": 1920, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.18983587106980424, + "step": 1920, + "train/total_loss": 0.15621480345726013 + }, + { + "entropy": 8.768945693969727, + "epoch": 0.18993474391931975, + "mean_token_accuracy": 0.673511266708374, + "num_tokens": 9992442.0, + "step": 1921, + "train/ce_loss": 0.7011252045631409 + }, + { + "epoch": 0.18993474391931975, + "step": 1921, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.18993474391931975, + "step": 1921, + "train/total_loss": 0.21073752641677856 + }, + { + "entropy": 9.983613014221191, + "epoch": 0.1900336167688353, + "mean_token_accuracy": 0.7366336584091187, + "num_tokens": 9997384.0, + "step": 1922, + "train/ce_loss": 7.890875167504419e-06 + }, + { + "epoch": 0.1900336167688353, + "step": 1922, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.1900336167688353, + "step": 1922, + "train/total_loss": 0.027344539761543274 + }, + { + "entropy": 9.837148666381836, + "epoch": 0.1901324896183508, + "mean_token_accuracy": 0.6741154789924622, + "num_tokens": 10002300.0, + "step": 1923, + "train/ce_loss": 1.8100438117980957 + }, + { + "epoch": 0.1901324896183508, + "step": 1923, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.1901324896183508, + "step": 1923, + "train/total_loss": 0.2591294050216675 + }, + { + "entropy": 10.034281730651855, + "epoch": 0.19023136246786632, + "mean_token_accuracy": 0.7677165269851685, + "num_tokens": 10007226.0, + "step": 1924, + "train/ce_loss": 5.1621764214360155e-06 + }, + { + "epoch": 0.19023136246786632, + "step": 1924, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.19023136246786632, + "step": 1924, + "train/total_loss": 0.08203176409006119 + }, + { + "entropy": 9.544149398803711, + "epoch": 0.19033023531738186, + "mean_token_accuracy": 0.6960408687591553, + "num_tokens": 10012625.0, + "step": 1925, + "train/ce_loss": 1.2972491979599 + }, + { + "epoch": 0.19033023531738186, + "step": 1925, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.19033023531738186, + "step": 1925, + "train/total_loss": 0.21566241979599 + }, + { + "entropy": 9.560464859008789, + "epoch": 0.19042910816689737, + "mean_token_accuracy": 0.671897292137146, + "num_tokens": 10017783.0, + "step": 1926, + "train/ce_loss": 0.7921290397644043 + }, + { + "epoch": 0.19042910816689737, + "step": 1926, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.19042910816689737, + "step": 1926, + "train/total_loss": 0.15733790397644043 + }, + { + "entropy": 10.270628929138184, + "epoch": 0.19052798101641288, + "mean_token_accuracy": 0.7486486434936523, + "num_tokens": 10022587.0, + "step": 1927, + "train/ce_loss": 1.5367345809936523 + }, + { + "epoch": 0.19052798101641288, + "step": 1927, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.19052798101641288, + "step": 1927, + "train/total_loss": 0.2708609700202942 + }, + { + "entropy": 9.204045295715332, + "epoch": 0.19062685386592843, + "mean_token_accuracy": 0.7177242636680603, + "num_tokens": 10027979.0, + "step": 1928, + "train/ce_loss": 0.8198716044425964 + }, + { + "epoch": 0.19062685386592843, + "step": 1928, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.19062685386592843, + "step": 1928, + "train/total_loss": 0.1405809223651886 + }, + { + "entropy": 9.579994201660156, + "epoch": 0.19072572671544394, + "mean_token_accuracy": 0.7909215688705444, + "num_tokens": 10033171.0, + "step": 1929, + "train/ce_loss": 0.8041414618492126 + }, + { + "epoch": 0.19072572671544394, + "step": 1929, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.19072572671544394, + "step": 1929, + "train/total_loss": 0.16635164618492126 + }, + { + "entropy": 10.178987503051758, + "epoch": 0.19082459956495945, + "mean_token_accuracy": 0.717277467250824, + "num_tokens": 10037978.0, + "step": 1930, + "train/ce_loss": 1.842941164970398 + }, + { + "epoch": 0.19082459956495945, + "step": 1930, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.19082459956495945, + "step": 1930, + "train/total_loss": 0.27413785457611084 + }, + { + "entropy": 9.233572959899902, + "epoch": 0.190923472414475, + "mean_token_accuracy": 0.7350332736968994, + "num_tokens": 10043339.0, + "step": 1931, + "train/ce_loss": 0.5585495233535767 + }, + { + "epoch": 0.190923472414475, + "step": 1931, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.190923472414475, + "step": 1931, + "train/total_loss": 0.09491745382547379 + }, + { + "entropy": 9.72354793548584, + "epoch": 0.1910223452639905, + "mean_token_accuracy": 0.7014681696891785, + "num_tokens": 10048394.0, + "step": 1932, + "train/ce_loss": 1.6334415674209595 + }, + { + "epoch": 0.1910223452639905, + "step": 1932, + "train/sim_loss": 0.21875 + }, + { + "epoch": 0.1910223452639905, + "step": 1932, + "train/total_loss": 0.382094144821167 + }, + { + "entropy": 9.566108703613281, + "epoch": 0.19112121811350602, + "mean_token_accuracy": 0.7293233275413513, + "num_tokens": 10053524.0, + "step": 1933, + "train/ce_loss": 0.9320631623268127 + }, + { + "epoch": 0.19112121811350602, + "step": 1933, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.19112121811350602, + "step": 1933, + "train/total_loss": 0.13617506623268127 + }, + { + "entropy": 9.240278244018555, + "epoch": 0.19122009096302156, + "mean_token_accuracy": 0.719260036945343, + "num_tokens": 10058882.0, + "step": 1934, + "train/ce_loss": 1.0645043849945068 + }, + { + "epoch": 0.19122009096302156, + "step": 1934, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.19122009096302156, + "step": 1934, + "train/total_loss": 0.17676293849945068 + }, + { + "entropy": 9.086050033569336, + "epoch": 0.19131896381253707, + "mean_token_accuracy": 0.7275574207305908, + "num_tokens": 10064339.0, + "step": 1935, + "train/ce_loss": 1.5135074853897095 + }, + { + "epoch": 0.19131896381253707, + "step": 1935, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.19131896381253707, + "step": 1935, + "train/total_loss": 0.24510075151920319 + }, + { + "entropy": 9.469228744506836, + "epoch": 0.1914178366620526, + "mean_token_accuracy": 0.7276478409767151, + "num_tokens": 10069518.0, + "step": 1936, + "train/ce_loss": 0.5873953700065613 + }, + { + "epoch": 0.1914178366620526, + "step": 1936, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.1914178366620526, + "step": 1936, + "train/total_loss": 0.1173332929611206 + }, + { + "entropy": 9.303143501281738, + "epoch": 0.19151670951156813, + "mean_token_accuracy": 0.7047058939933777, + "num_tokens": 10074828.0, + "step": 1937, + "train/ce_loss": 1.2339478731155396 + }, + { + "epoch": 0.19151670951156813, + "step": 1937, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.19151670951156813, + "step": 1937, + "train/total_loss": 0.22495728731155396 + }, + { + "entropy": 9.396015167236328, + "epoch": 0.19161558236108364, + "mean_token_accuracy": 0.6743002533912659, + "num_tokens": 10080079.0, + "step": 1938, + "train/ce_loss": 1.1823872327804565 + }, + { + "epoch": 0.19161558236108364, + "step": 1938, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.19161558236108364, + "step": 1938, + "train/total_loss": 0.17683246731758118 + }, + { + "entropy": 9.46355152130127, + "epoch": 0.19171445521059918, + "mean_token_accuracy": 0.737062931060791, + "num_tokens": 10085240.0, + "step": 1939, + "train/ce_loss": 0.6518946290016174 + }, + { + "epoch": 0.19171445521059918, + "step": 1939, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.19171445521059918, + "step": 1939, + "train/total_loss": 0.13550196588039398 + }, + { + "epoch": 0.1918133280601147, + "grad_norm": 0.878887414932251, + "learning_rate": 9.523067794095833e-06, + "loss": 0.1717, + "step": 1940 + }, + { + "entropy": 8.839654922485352, + "epoch": 0.1918133280601147, + "mean_token_accuracy": 0.7251356244087219, + "num_tokens": 10090860.0, + "step": 1940, + "train/ce_loss": 0.914640486240387 + }, + { + "epoch": 0.1918133280601147, + "step": 1940, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.1918133280601147, + "step": 1940, + "train/total_loss": 0.18521404266357422 + }, + { + "entropy": 9.40005874633789, + "epoch": 0.1919122009096302, + "mean_token_accuracy": 0.6986469626426697, + "num_tokens": 10096084.0, + "step": 1941, + "train/ce_loss": 1.039476752281189 + }, + { + "epoch": 0.1919122009096302, + "step": 1941, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.1919122009096302, + "step": 1941, + "train/total_loss": 0.18207266926765442 + }, + { + "entropy": 9.450826644897461, + "epoch": 0.19201107375914575, + "mean_token_accuracy": 0.6740914583206177, + "num_tokens": 10101355.0, + "step": 1942, + "train/ce_loss": 0.4420863091945648 + }, + { + "epoch": 0.19201107375914575, + "step": 1942, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.19201107375914575, + "step": 1942, + "train/total_loss": 0.14186488091945648 + }, + { + "entropy": 9.06132698059082, + "epoch": 0.19210994660866126, + "mean_token_accuracy": 0.7876213788986206, + "num_tokens": 10106650.0, + "step": 1943, + "train/ce_loss": 0.6898800134658813 + }, + { + "epoch": 0.19210994660866126, + "step": 1943, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.19210994660866126, + "step": 1943, + "train/total_loss": 0.10023800283670425 + }, + { + "entropy": 9.049436569213867, + "epoch": 0.19220881945817678, + "mean_token_accuracy": 0.7545090317726135, + "num_tokens": 10112168.0, + "step": 1944, + "train/ce_loss": 0.8400173783302307 + }, + { + "epoch": 0.19220881945817678, + "step": 1944, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.19220881945817678, + "step": 1944, + "train/total_loss": 0.17384549975395203 + }, + { + "entropy": 9.39749813079834, + "epoch": 0.19230769230769232, + "mean_token_accuracy": 0.7249322533607483, + "num_tokens": 10117366.0, + "step": 1945, + "train/ce_loss": 1.7692957044346258e-05 + }, + { + "epoch": 0.19230769230769232, + "step": 1945, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.19230769230769232, + "step": 1945, + "train/total_loss": 0.09375176578760147 + }, + { + "entropy": 9.324932098388672, + "epoch": 0.19240656515720783, + "mean_token_accuracy": 0.7276536226272583, + "num_tokens": 10122549.0, + "step": 1946, + "train/ce_loss": 0.9241024255752563 + }, + { + "epoch": 0.19240656515720783, + "step": 1946, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.19240656515720783, + "step": 1946, + "train/total_loss": 0.18616023659706116 + }, + { + "entropy": 9.378195762634277, + "epoch": 0.19250543800672335, + "mean_token_accuracy": 0.7531734704971313, + "num_tokens": 10127689.0, + "step": 1947, + "train/ce_loss": 1.0089960098266602 + }, + { + "epoch": 0.19250543800672335, + "step": 1947, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.19250543800672335, + "step": 1947, + "train/total_loss": 0.1204308494925499 + }, + { + "entropy": 8.946748733520508, + "epoch": 0.1926043108562389, + "mean_token_accuracy": 0.7372187972068787, + "num_tokens": 10133186.0, + "step": 1948, + "train/ce_loss": 1.051811933517456 + }, + { + "epoch": 0.1926043108562389, + "step": 1948, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.1926043108562389, + "step": 1948, + "train/total_loss": 0.20283743739128113 + }, + { + "entropy": 9.343496322631836, + "epoch": 0.1927031837057544, + "mean_token_accuracy": 0.7685185074806213, + "num_tokens": 10138462.0, + "step": 1949, + "train/ce_loss": 0.6328471302986145 + }, + { + "epoch": 0.1927031837057544, + "step": 1949, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.1927031837057544, + "step": 1949, + "train/total_loss": 0.11797221750020981 + }, + { + "entropy": 9.369144439697266, + "epoch": 0.1928020565552699, + "mean_token_accuracy": 0.7584415674209595, + "num_tokens": 10143716.0, + "step": 1950, + "train/ce_loss": 0.37200042605400085 + }, + { + "epoch": 0.1928020565552699, + "step": 1950, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.1928020565552699, + "step": 1950, + "train/total_loss": 0.10751254856586456 + }, + { + "entropy": 9.393310546875, + "epoch": 0.19290092940478545, + "mean_token_accuracy": 0.7613940834999084, + "num_tokens": 10148974.0, + "step": 1951, + "train/ce_loss": 0.6348753571510315 + }, + { + "epoch": 0.19290092940478545, + "step": 1951, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.19290092940478545, + "step": 1951, + "train/total_loss": 0.14161252975463867 + }, + { + "entropy": 9.001161575317383, + "epoch": 0.19299980225430097, + "mean_token_accuracy": 0.7382550239562988, + "num_tokens": 10154489.0, + "step": 1952, + "train/ce_loss": 0.3051251769065857 + }, + { + "epoch": 0.19299980225430097, + "step": 1952, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.19299980225430097, + "step": 1952, + "train/total_loss": 0.10473126918077469 + }, + { + "entropy": 8.901924133300781, + "epoch": 0.19309867510381648, + "mean_token_accuracy": 0.7837837934494019, + "num_tokens": 10160000.0, + "step": 1953, + "train/ce_loss": 0.7139679193496704 + }, + { + "epoch": 0.19309867510381648, + "step": 1953, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.19309867510381648, + "step": 1953, + "train/total_loss": 0.12999054789543152 + }, + { + "entropy": 9.370849609375, + "epoch": 0.19319754795333202, + "mean_token_accuracy": 0.7202072739601135, + "num_tokens": 10165235.0, + "step": 1954, + "train/ce_loss": 0.9253755211830139 + }, + { + "epoch": 0.19319754795333202, + "step": 1954, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.19319754795333202, + "step": 1954, + "train/total_loss": 0.1980063021183014 + }, + { + "entropy": 9.580575942993164, + "epoch": 0.19329642080284754, + "mean_token_accuracy": 0.677570104598999, + "num_tokens": 10170374.0, + "step": 1955, + "train/ce_loss": 1.3786916732788086 + }, + { + "epoch": 0.19329642080284754, + "step": 1955, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.19329642080284754, + "step": 1955, + "train/total_loss": 0.19646291434764862 + }, + { + "entropy": 9.054275512695312, + "epoch": 0.19339529365236305, + "mean_token_accuracy": 0.7981651425361633, + "num_tokens": 10175700.0, + "step": 1956, + "train/ce_loss": 0.5524548292160034 + }, + { + "epoch": 0.19339529365236305, + "step": 1956, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.19339529365236305, + "step": 1956, + "train/total_loss": 0.08258923888206482 + }, + { + "entropy": 9.024927139282227, + "epoch": 0.1934941665018786, + "mean_token_accuracy": 0.7222222089767456, + "num_tokens": 10181162.0, + "step": 1957, + "train/ce_loss": 1.0508496761322021 + }, + { + "epoch": 0.1934941665018786, + "step": 1957, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.1934941665018786, + "step": 1957, + "train/total_loss": 0.15195997059345245 + }, + { + "entropy": 9.6265869140625, + "epoch": 0.1935930393513941, + "mean_token_accuracy": 0.7394034266471863, + "num_tokens": 10186273.0, + "step": 1958, + "train/ce_loss": 1.0031788349151611 + }, + { + "epoch": 0.1935930393513941, + "step": 1958, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.1935930393513941, + "step": 1958, + "train/total_loss": 0.14328664541244507 + }, + { + "entropy": 9.224421501159668, + "epoch": 0.19369191220090964, + "mean_token_accuracy": 0.6613076329231262, + "num_tokens": 10191710.0, + "step": 1959, + "train/ce_loss": 1.8621876239776611 + }, + { + "epoch": 0.19369191220090964, + "step": 1959, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.19369191220090964, + "step": 1959, + "train/total_loss": 0.2838750183582306 + }, + { + "epoch": 0.19379078505042516, + "grad_norm": 1.1886433362960815, + "learning_rate": 9.518122929337883e-06, + "loss": 0.159, + "step": 1960 + }, + { + "entropy": 9.636155128479004, + "epoch": 0.19379078505042516, + "mean_token_accuracy": 0.7268292903900146, + "num_tokens": 10196723.0, + "step": 1960, + "train/ce_loss": 1.3479249477386475 + }, + { + "epoch": 0.19379078505042516, + "step": 1960, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.19379078505042516, + "step": 1960, + "train/total_loss": 0.2715112566947937 + }, + { + "entropy": 9.221063613891602, + "epoch": 0.19388965789994067, + "mean_token_accuracy": 0.7819383144378662, + "num_tokens": 10202077.0, + "step": 1961, + "train/ce_loss": 0.7505377531051636 + }, + { + "epoch": 0.19388965789994067, + "step": 1961, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.19388965789994067, + "step": 1961, + "train/total_loss": 0.12974128127098083 + }, + { + "entropy": 9.390584945678711, + "epoch": 0.1939885307494562, + "mean_token_accuracy": 0.7436241507530212, + "num_tokens": 10207246.0, + "step": 1962, + "train/ce_loss": 0.6959307789802551 + }, + { + "epoch": 0.1939885307494562, + "step": 1962, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.1939885307494562, + "step": 1962, + "train/total_loss": 0.16334307193756104 + }, + { + "entropy": 9.111087799072266, + "epoch": 0.19408740359897173, + "mean_token_accuracy": 0.8062953948974609, + "num_tokens": 10212568.0, + "step": 1963, + "train/ce_loss": 0.6606025695800781 + }, + { + "epoch": 0.19408740359897173, + "step": 1963, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.19408740359897173, + "step": 1963, + "train/total_loss": 0.16371650993824005 + }, + { + "entropy": 9.369743347167969, + "epoch": 0.19418627644848724, + "mean_token_accuracy": 0.7235984206199646, + "num_tokens": 10217814.0, + "step": 1964, + "train/ce_loss": 0.7288702130317688 + }, + { + "epoch": 0.19418627644848724, + "step": 1964, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.19418627644848724, + "step": 1964, + "train/total_loss": 0.17835578322410583 + }, + { + "entropy": 10.039051055908203, + "epoch": 0.19428514929800278, + "mean_token_accuracy": 0.7597402334213257, + "num_tokens": 10222727.0, + "step": 1965, + "train/ce_loss": 3.78113218175713e-05 + }, + { + "epoch": 0.19428514929800278, + "step": 1965, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.19428514929800278, + "step": 1965, + "train/total_loss": 0.05859753116965294 + }, + { + "entropy": 9.540410995483398, + "epoch": 0.1943840221475183, + "mean_token_accuracy": 0.6957746744155884, + "num_tokens": 10227884.0, + "step": 1966, + "train/ce_loss": 1.4989935159683228 + }, + { + "epoch": 0.1943840221475183, + "step": 1966, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.1943840221475183, + "step": 1966, + "train/total_loss": 0.28271186351776123 + }, + { + "entropy": 9.632904052734375, + "epoch": 0.1944828949970338, + "mean_token_accuracy": 0.7837445735931396, + "num_tokens": 10233008.0, + "step": 1967, + "train/ce_loss": 0.5353147983551025 + }, + { + "epoch": 0.1944828949970338, + "step": 1967, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.1944828949970338, + "step": 1967, + "train/total_loss": 0.10431273281574249 + }, + { + "entropy": 9.63956069946289, + "epoch": 0.19458176784654935, + "mean_token_accuracy": 0.7118644118309021, + "num_tokens": 10238022.0, + "step": 1968, + "train/ce_loss": 8.986912689579185e-06 + }, + { + "epoch": 0.19458176784654935, + "step": 1968, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.19458176784654935, + "step": 1968, + "train/total_loss": 0.06250090152025223 + }, + { + "entropy": 10.186599731445312, + "epoch": 0.19468064069606486, + "mean_token_accuracy": 0.6994949579238892, + "num_tokens": 10242853.0, + "step": 1969, + "train/ce_loss": 9.860812497208826e-06 + }, + { + "epoch": 0.19468064069606486, + "step": 1969, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.19468064069606486, + "step": 1969, + "train/total_loss": 0.027344735339283943 + }, + { + "entropy": 9.200822830200195, + "epoch": 0.19477951354558037, + "mean_token_accuracy": 0.7674919366836548, + "num_tokens": 10248207.0, + "step": 1970, + "train/ce_loss": 0.49040570855140686 + }, + { + "epoch": 0.19477951354558037, + "step": 1970, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.19477951354558037, + "step": 1970, + "train/total_loss": 0.10763432085514069 + }, + { + "entropy": 9.327460289001465, + "epoch": 0.19487838639509591, + "mean_token_accuracy": 0.7048054933547974, + "num_tokens": 10253554.0, + "step": 1971, + "train/ce_loss": 1.5515893697738647 + }, + { + "epoch": 0.19487838639509591, + "step": 1971, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.19487838639509591, + "step": 1971, + "train/total_loss": 0.22937768697738647 + }, + { + "entropy": 9.314977645874023, + "epoch": 0.19497725924461143, + "mean_token_accuracy": 0.6891566514968872, + "num_tokens": 10258854.0, + "step": 1972, + "train/ce_loss": 0.5195350646972656 + }, + { + "epoch": 0.19497725924461143, + "step": 1972, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.19497725924461143, + "step": 1972, + "train/total_loss": 0.1066410094499588 + }, + { + "entropy": 9.803762435913086, + "epoch": 0.19507613209412694, + "mean_token_accuracy": 0.7456446290016174, + "num_tokens": 10263834.0, + "step": 1973, + "train/ce_loss": 1.242767333984375 + }, + { + "epoch": 0.19507613209412694, + "step": 1973, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.19507613209412694, + "step": 1973, + "train/total_loss": 0.19458922743797302 + }, + { + "entropy": 9.587112426757812, + "epoch": 0.19517500494364248, + "mean_token_accuracy": 0.7117117047309875, + "num_tokens": 10269022.0, + "step": 1974, + "train/ce_loss": 1.0832068920135498 + }, + { + "epoch": 0.19517500494364248, + "step": 1974, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.19517500494364248, + "step": 1974, + "train/total_loss": 0.1512894332408905 + }, + { + "entropy": 9.864027976989746, + "epoch": 0.195273877793158, + "mean_token_accuracy": 0.7209705114364624, + "num_tokens": 10273997.0, + "step": 1975, + "train/ce_loss": 1.0521684885025024 + }, + { + "epoch": 0.195273877793158, + "step": 1975, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.195273877793158, + "step": 1975, + "train/total_loss": 0.1716231107711792 + }, + { + "entropy": 9.501541137695312, + "epoch": 0.1953727506426735, + "mean_token_accuracy": 0.7556179761886597, + "num_tokens": 10279164.0, + "step": 1976, + "train/ce_loss": 1.044768214225769 + }, + { + "epoch": 0.1953727506426735, + "step": 1976, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.1953727506426735, + "step": 1976, + "train/total_loss": 0.19432057440280914 + }, + { + "entropy": 9.759073257446289, + "epoch": 0.19547162349218905, + "mean_token_accuracy": 0.735433042049408, + "num_tokens": 10284245.0, + "step": 1977, + "train/ce_loss": 1.3436446351988707e-05 + }, + { + "epoch": 0.19547162349218905, + "step": 1977, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.19547162349218905, + "step": 1977, + "train/total_loss": 0.046876344829797745 + }, + { + "entropy": 9.271638870239258, + "epoch": 0.19557049634170456, + "mean_token_accuracy": 0.7273743152618408, + "num_tokens": 10289591.0, + "step": 1978, + "train/ce_loss": 1.3765709400177002 + }, + { + "epoch": 0.19557049634170456, + "step": 1978, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.19557049634170456, + "step": 1978, + "train/total_loss": 0.266563355922699 + }, + { + "entropy": 10.238898277282715, + "epoch": 0.19566936919122008, + "mean_token_accuracy": 0.762273907661438, + "num_tokens": 10294361.0, + "step": 1979, + "train/ce_loss": 8.265675205620937e-06 + }, + { + "epoch": 0.19566936919122008, + "step": 1979, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.19566936919122008, + "step": 1979, + "train/total_loss": 0.02734457701444626 + }, + { + "epoch": 0.19576824204073562, + "grad_norm": 0.8895950317382812, + "learning_rate": 9.513178064579934e-06, + "loss": 0.1545, + "step": 1980 + }, + { + "entropy": 9.463583946228027, + "epoch": 0.19576824204073562, + "mean_token_accuracy": 0.707317054271698, + "num_tokens": 10299584.0, + "step": 1980, + "train/ce_loss": 0.8408167362213135 + }, + { + "epoch": 0.19576824204073562, + "step": 1980, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.19576824204073562, + "step": 1980, + "train/total_loss": 0.18564417958259583 + }, + { + "entropy": 9.447678565979004, + "epoch": 0.19586711489025113, + "mean_token_accuracy": 0.6492537260055542, + "num_tokens": 10304859.0, + "step": 1981, + "train/ce_loss": 3.376751010364387e-06 + }, + { + "epoch": 0.19586711489025113, + "step": 1981, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.19586711489025113, + "step": 1981, + "train/total_loss": 0.05859408900141716 + }, + { + "entropy": 9.249719619750977, + "epoch": 0.19596598773976667, + "mean_token_accuracy": 0.6746126413345337, + "num_tokens": 10310173.0, + "step": 1982, + "train/ce_loss": 1.762305498123169 + }, + { + "epoch": 0.19596598773976667, + "step": 1982, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.19596598773976667, + "step": 1982, + "train/total_loss": 0.2777930498123169 + }, + { + "entropy": 9.396759033203125, + "epoch": 0.19606486058928219, + "mean_token_accuracy": 0.7430232763290405, + "num_tokens": 10315525.0, + "step": 1983, + "train/ce_loss": 0.5767508745193481 + }, + { + "epoch": 0.19606486058928219, + "step": 1983, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.19606486058928219, + "step": 1983, + "train/total_loss": 0.08501884341239929 + }, + { + "entropy": 9.488676071166992, + "epoch": 0.1961637334387977, + "mean_token_accuracy": 0.8308921456336975, + "num_tokens": 10320712.0, + "step": 1984, + "train/ce_loss": 0.8398823738098145 + }, + { + "epoch": 0.1961637334387977, + "step": 1984, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.1961637334387977, + "step": 1984, + "train/total_loss": 0.1152382418513298 + }, + { + "entropy": 9.176420211791992, + "epoch": 0.19626260628831324, + "mean_token_accuracy": 0.7489361763000488, + "num_tokens": 10326162.0, + "step": 1985, + "train/ce_loss": 0.7047630548477173 + }, + { + "epoch": 0.19626260628831324, + "step": 1985, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.19626260628831324, + "step": 1985, + "train/total_loss": 0.11735130846500397 + }, + { + "entropy": 9.40720272064209, + "epoch": 0.19636147913782875, + "mean_token_accuracy": 0.7972292304039001, + "num_tokens": 10331390.0, + "step": 1986, + "train/ce_loss": 0.6272144317626953 + }, + { + "epoch": 0.19636147913782875, + "step": 1986, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.19636147913782875, + "step": 1986, + "train/total_loss": 0.11740894615650177 + }, + { + "entropy": 9.483461380004883, + "epoch": 0.19646035198734427, + "mean_token_accuracy": 0.7078085541725159, + "num_tokens": 10336778.0, + "step": 1987, + "train/ce_loss": 1.3223289251327515 + }, + { + "epoch": 0.19646035198734427, + "step": 1987, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.19646035198734427, + "step": 1987, + "train/total_loss": 0.2611391544342041 + }, + { + "entropy": 9.039703369140625, + "epoch": 0.1965592248368598, + "mean_token_accuracy": 0.8130000233650208, + "num_tokens": 10342263.0, + "step": 1988, + "train/ce_loss": 0.38791629672050476 + }, + { + "epoch": 0.1965592248368598, + "step": 1988, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.1965592248368598, + "step": 1988, + "train/total_loss": 0.062229130417108536 + }, + { + "entropy": 10.164739608764648, + "epoch": 0.19665809768637532, + "mean_token_accuracy": 0.7388059496879578, + "num_tokens": 10347104.0, + "step": 1989, + "train/ce_loss": 0.8840498328208923 + }, + { + "epoch": 0.19665809768637532, + "step": 1989, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.19665809768637532, + "step": 1989, + "train/total_loss": 0.19777998328208923 + }, + { + "entropy": 9.35018539428711, + "epoch": 0.19675697053589083, + "mean_token_accuracy": 0.7886179089546204, + "num_tokens": 10352411.0, + "step": 1990, + "train/ce_loss": 1.022213339805603 + }, + { + "epoch": 0.19675697053589083, + "step": 1990, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.19675697053589083, + "step": 1990, + "train/total_loss": 0.18034633994102478 + }, + { + "entropy": 8.978738784790039, + "epoch": 0.19685584338540638, + "mean_token_accuracy": 0.728728711605072, + "num_tokens": 10357881.0, + "step": 1991, + "train/ce_loss": 1.0197044610977173 + }, + { + "epoch": 0.19685584338540638, + "step": 1991, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.19685584338540638, + "step": 1991, + "train/total_loss": 0.17228294909000397 + }, + { + "entropy": 9.523167610168457, + "epoch": 0.1969547162349219, + "mean_token_accuracy": 0.7468531727790833, + "num_tokens": 10363079.0, + "step": 1992, + "train/ce_loss": 0.4212130010128021 + }, + { + "epoch": 0.1969547162349219, + "step": 1992, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.1969547162349219, + "step": 1992, + "train/total_loss": 0.06946505606174469 + }, + { + "entropy": 9.488813400268555, + "epoch": 0.1970535890844374, + "mean_token_accuracy": 0.756035566329956, + "num_tokens": 10368308.0, + "step": 1993, + "train/ce_loss": 0.8103978037834167 + }, + { + "epoch": 0.1970535890844374, + "step": 1993, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.1970535890844374, + "step": 1993, + "train/total_loss": 0.20213353633880615 + }, + { + "entropy": 9.04245376586914, + "epoch": 0.19715246193395294, + "mean_token_accuracy": 0.6892712712287903, + "num_tokens": 10373763.0, + "step": 1994, + "train/ce_loss": 0.8689247965812683 + }, + { + "epoch": 0.19715246193395294, + "step": 1994, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.19715246193395294, + "step": 1994, + "train/total_loss": 0.1962674856185913 + }, + { + "entropy": 9.254375457763672, + "epoch": 0.19725133478346846, + "mean_token_accuracy": 0.711442768573761, + "num_tokens": 10379021.0, + "step": 1995, + "train/ce_loss": 1.226904034614563 + }, + { + "epoch": 0.19725133478346846, + "step": 1995, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.19725133478346846, + "step": 1995, + "train/total_loss": 0.22425290942192078 + }, + { + "entropy": 9.452052116394043, + "epoch": 0.19735020763298397, + "mean_token_accuracy": 0.676701545715332, + "num_tokens": 10384240.0, + "step": 1996, + "train/ce_loss": 1.1110657453536987 + }, + { + "epoch": 0.19735020763298397, + "step": 1996, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.19735020763298397, + "step": 1996, + "train/total_loss": 0.23610657453536987 + }, + { + "entropy": 9.328840255737305, + "epoch": 0.1974490804824995, + "mean_token_accuracy": 0.7708095908164978, + "num_tokens": 10389549.0, + "step": 1997, + "train/ce_loss": 0.5044722557067871 + }, + { + "epoch": 0.1974490804824995, + "step": 1997, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.1974490804824995, + "step": 1997, + "train/total_loss": 0.11685347557067871 + }, + { + "entropy": 9.00434684753418, + "epoch": 0.19754795333201502, + "mean_token_accuracy": 0.7303252816200256, + "num_tokens": 10394939.0, + "step": 1998, + "train/ce_loss": 0.9697033166885376 + }, + { + "epoch": 0.19754795333201502, + "step": 1998, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.19754795333201502, + "step": 1998, + "train/total_loss": 0.190720334649086 + }, + { + "entropy": 9.811996459960938, + "epoch": 0.19764682618153054, + "mean_token_accuracy": 0.6995447874069214, + "num_tokens": 10399994.0, + "step": 1999, + "train/ce_loss": 1.406103253364563 + }, + { + "epoch": 0.19764682618153054, + "step": 1999, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.19764682618153054, + "step": 1999, + "train/total_loss": 0.23826657235622406 + }, + { + "epoch": 0.19774569903104608, + "grad_norm": 0.9430355429649353, + "learning_rate": 9.508233199821986e-06, + "loss": 0.1605, + "step": 2000 + }, + { + "entropy": 9.675430297851562, + "epoch": 0.19774569903104608, + "mean_token_accuracy": 0.7976366281509399, + "num_tokens": 10405114.0, + "step": 2000, + "train/ce_loss": 0.9603677988052368 + }, + { + "epoch": 0.19774569903104608, + "step": 2000, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.19774569903104608, + "step": 2000, + "train/total_loss": 0.15463054180145264 + }, + { + "entropy": 9.420642852783203, + "epoch": 0.1978445718805616, + "mean_token_accuracy": 0.7375178337097168, + "num_tokens": 10410268.0, + "step": 2001, + "train/ce_loss": 0.8256149888038635 + }, + { + "epoch": 0.1978445718805616, + "step": 2001, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.1978445718805616, + "step": 2001, + "train/total_loss": 0.14896774291992188 + }, + { + "entropy": 9.433884620666504, + "epoch": 0.19794344473007713, + "mean_token_accuracy": 0.7377892136573792, + "num_tokens": 10415502.0, + "step": 2002, + "train/ce_loss": 1.000722885131836 + }, + { + "epoch": 0.19794344473007713, + "step": 2002, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.19794344473007713, + "step": 2002, + "train/total_loss": 0.21335354447364807 + }, + { + "entropy": 9.553163528442383, + "epoch": 0.19804231757959265, + "mean_token_accuracy": 0.7405475974082947, + "num_tokens": 10420738.0, + "step": 2003, + "train/ce_loss": 1.2696847915649414 + }, + { + "epoch": 0.19804231757959265, + "step": 2003, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.19804231757959265, + "step": 2003, + "train/total_loss": 0.23634348809719086 + }, + { + "entropy": 9.052651405334473, + "epoch": 0.19814119042910816, + "mean_token_accuracy": 0.7702127695083618, + "num_tokens": 10426204.0, + "step": 2004, + "train/ce_loss": 0.8379826545715332 + }, + { + "epoch": 0.19814119042910816, + "step": 2004, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.19814119042910816, + "step": 2004, + "train/total_loss": 0.13848575949668884 + }, + { + "entropy": 9.25126838684082, + "epoch": 0.1982400632786237, + "mean_token_accuracy": 0.774193525314331, + "num_tokens": 10431436.0, + "step": 2005, + "train/ce_loss": 0.679263710975647 + }, + { + "epoch": 0.1982400632786237, + "step": 2005, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.1982400632786237, + "step": 2005, + "train/total_loss": 0.14995762705802917 + }, + { + "entropy": 9.719476699829102, + "epoch": 0.19833893612813921, + "mean_token_accuracy": 0.8030534386634827, + "num_tokens": 10436726.0, + "step": 2006, + "train/ce_loss": 0.9897971749305725 + }, + { + "epoch": 0.19833893612813921, + "step": 2006, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.19833893612813921, + "step": 2006, + "train/total_loss": 0.20054221153259277 + }, + { + "entropy": 9.661178588867188, + "epoch": 0.19843780897765473, + "mean_token_accuracy": 0.7221373915672302, + "num_tokens": 10441828.0, + "step": 2007, + "train/ce_loss": 1.348845362663269 + }, + { + "epoch": 0.19843780897765473, + "step": 2007, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.19843780897765473, + "step": 2007, + "train/total_loss": 0.2286345362663269 + }, + { + "entropy": 9.856801986694336, + "epoch": 0.19853668182717027, + "mean_token_accuracy": 0.7596774101257324, + "num_tokens": 10446866.0, + "step": 2008, + "train/ce_loss": 6.274824954743963e-06 + }, + { + "epoch": 0.19853668182717027, + "step": 2008, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.19853668182717027, + "step": 2008, + "train/total_loss": 0.02343812771141529 + }, + { + "entropy": 9.621440887451172, + "epoch": 0.19863555467668578, + "mean_token_accuracy": 0.7267355918884277, + "num_tokens": 10452005.0, + "step": 2009, + "train/ce_loss": 1.4356766939163208 + }, + { + "epoch": 0.19863555467668578, + "step": 2009, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.19863555467668578, + "step": 2009, + "train/total_loss": 0.25684893131256104 + }, + { + "entropy": 8.885970115661621, + "epoch": 0.1987344275262013, + "mean_token_accuracy": 0.7532588243484497, + "num_tokens": 10457556.0, + "step": 2010, + "train/ce_loss": 0.6419457793235779 + }, + { + "epoch": 0.1987344275262013, + "step": 2010, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.1987344275262013, + "step": 2010, + "train/total_loss": 0.09544458240270615 + }, + { + "entropy": 9.021108627319336, + "epoch": 0.19883330037571684, + "mean_token_accuracy": 0.7590000033378601, + "num_tokens": 10462993.0, + "step": 2011, + "train/ce_loss": 0.4544513523578644 + }, + { + "epoch": 0.19883330037571684, + "step": 2011, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.19883330037571684, + "step": 2011, + "train/total_loss": 0.07278888672590256 + }, + { + "entropy": 9.056339263916016, + "epoch": 0.19893217322523235, + "mean_token_accuracy": 0.7043294906616211, + "num_tokens": 10468374.0, + "step": 2012, + "train/ce_loss": 0.8080570101737976 + }, + { + "epoch": 0.19893217322523235, + "step": 2012, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.19893217322523235, + "step": 2012, + "train/total_loss": 0.127680703997612 + }, + { + "entropy": 10.461679458618164, + "epoch": 0.19903104607474786, + "mean_token_accuracy": 0.6085526347160339, + "num_tokens": 10473075.0, + "step": 2013, + "train/ce_loss": 5.331155776977539 + }, + { + "epoch": 0.19903104607474786, + "step": 2013, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.19903104607474786, + "step": 2013, + "train/total_loss": 0.626865565776825 + }, + { + "entropy": 9.206826210021973, + "epoch": 0.1991299189242634, + "mean_token_accuracy": 0.7319098711013794, + "num_tokens": 10478410.0, + "step": 2014, + "train/ce_loss": 1.3367818593978882 + }, + { + "epoch": 0.1991299189242634, + "step": 2014, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.1991299189242634, + "step": 2014, + "train/total_loss": 0.20789693295955658 + }, + { + "entropy": 9.818578720092773, + "epoch": 0.19922879177377892, + "mean_token_accuracy": 0.7318255305290222, + "num_tokens": 10483465.0, + "step": 2015, + "train/ce_loss": 1.089387387764873e-05 + }, + { + "epoch": 0.19922879177377892, + "step": 2015, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.19922879177377892, + "step": 2015, + "train/total_loss": 0.06250108778476715 + }, + { + "entropy": 10.016085624694824, + "epoch": 0.19932766462329443, + "mean_token_accuracy": 0.7523629665374756, + "num_tokens": 10488400.0, + "step": 2016, + "train/ce_loss": 6.818392648710869e-06 + }, + { + "epoch": 0.19932766462329443, + "step": 2016, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.19932766462329443, + "step": 2016, + "train/total_loss": 0.03125068172812462 + }, + { + "entropy": 8.950575828552246, + "epoch": 0.19942653747280997, + "mean_token_accuracy": 0.7698113322257996, + "num_tokens": 10493922.0, + "step": 2017, + "train/ce_loss": 0.6243674159049988 + }, + { + "epoch": 0.19942653747280997, + "step": 2017, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.19942653747280997, + "step": 2017, + "train/total_loss": 0.12103049457073212 + }, + { + "entropy": 10.060829162597656, + "epoch": 0.19952541032232549, + "mean_token_accuracy": 0.6961206793785095, + "num_tokens": 10498835.0, + "step": 2018, + "train/ce_loss": 1.1689340681186877e-05 + }, + { + "epoch": 0.19952541032232549, + "step": 2018, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.19952541032232549, + "step": 2018, + "train/total_loss": 0.04296991974115372 + }, + { + "entropy": 10.04655933380127, + "epoch": 0.199624283171841, + "mean_token_accuracy": 0.7794871926307678, + "num_tokens": 10503663.0, + "step": 2019, + "train/ce_loss": 1.3177484273910522 + }, + { + "epoch": 0.199624283171841, + "step": 2019, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.199624283171841, + "step": 2019, + "train/total_loss": 0.15521234273910522 + }, + { + "epoch": 0.19972315602135654, + "grad_norm": 0.8853054642677307, + "learning_rate": 9.503288335064036e-06, + "loss": 0.1522, + "step": 2020 + }, + { + "entropy": 9.00905990600586, + "epoch": 0.19972315602135654, + "mean_token_accuracy": 0.7656404972076416, + "num_tokens": 10509122.0, + "step": 2020, + "train/ce_loss": 0.7272489070892334 + }, + { + "epoch": 0.19972315602135654, + "step": 2020, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.19972315602135654, + "step": 2020, + "train/total_loss": 0.13522489368915558 + }, + { + "entropy": 9.366575241088867, + "epoch": 0.19982202887087205, + "mean_token_accuracy": 0.7505720853805542, + "num_tokens": 10514448.0, + "step": 2021, + "train/ce_loss": 0.6705920100212097 + }, + { + "epoch": 0.19982202887087205, + "step": 2021, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.19982202887087205, + "step": 2021, + "train/total_loss": 0.11784045398235321 + }, + { + "entropy": 8.827877044677734, + "epoch": 0.1999209017203876, + "mean_token_accuracy": 0.7924311757087708, + "num_tokens": 10519766.0, + "step": 2022, + "train/ce_loss": 0.4559319317340851 + }, + { + "epoch": 0.1999209017203876, + "step": 2022, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.1999209017203876, + "step": 2022, + "train/total_loss": 0.06903069466352463 + }, + { + "entropy": 9.280956268310547, + "epoch": 0.2000197745699031, + "mean_token_accuracy": 0.7597883343696594, + "num_tokens": 10525168.0, + "step": 2023, + "train/ce_loss": 0.746547520160675 + }, + { + "epoch": 0.2000197745699031, + "step": 2023, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.2000197745699031, + "step": 2023, + "train/total_loss": 0.14106100797653198 + }, + { + "entropy": 9.516263961791992, + "epoch": 0.20011864741941862, + "mean_token_accuracy": 0.703496515750885, + "num_tokens": 10530333.0, + "step": 2024, + "train/ce_loss": 0.9688807725906372 + }, + { + "epoch": 0.20011864741941862, + "step": 2024, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.20011864741941862, + "step": 2024, + "train/total_loss": 0.12423183023929596 + }, + { + "entropy": 9.22756576538086, + "epoch": 0.20021752026893416, + "mean_token_accuracy": 0.7782909870147705, + "num_tokens": 10535661.0, + "step": 2025, + "train/ce_loss": 0.6405593156814575 + }, + { + "epoch": 0.20021752026893416, + "step": 2025, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.20021752026893416, + "step": 2025, + "train/total_loss": 0.149993434548378 + }, + { + "entropy": 10.073745727539062, + "epoch": 0.20031639311844968, + "mean_token_accuracy": 0.7888198494911194, + "num_tokens": 10540556.0, + "step": 2026, + "train/ce_loss": 7.1674990067549516e-06 + }, + { + "epoch": 0.20031639311844968, + "step": 2026, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.20031639311844968, + "step": 2026, + "train/total_loss": 0.019531967118382454 + }, + { + "entropy": 8.897648811340332, + "epoch": 0.2004152659679652, + "mean_token_accuracy": 0.7366803288459778, + "num_tokens": 10546011.0, + "step": 2027, + "train/ce_loss": 0.645298421382904 + }, + { + "epoch": 0.2004152659679652, + "step": 2027, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.2004152659679652, + "step": 2027, + "train/total_loss": 0.16609233617782593 + }, + { + "entropy": 9.375082015991211, + "epoch": 0.20051413881748073, + "mean_token_accuracy": 0.7173637747764587, + "num_tokens": 10551280.0, + "step": 2028, + "train/ce_loss": 0.9982043504714966 + }, + { + "epoch": 0.20051413881748073, + "step": 2028, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.20051413881748073, + "step": 2028, + "train/total_loss": 0.17794543504714966 + }, + { + "entropy": 9.171791076660156, + "epoch": 0.20061301166699624, + "mean_token_accuracy": 0.6918465495109558, + "num_tokens": 10556633.0, + "step": 2029, + "train/ce_loss": 0.8046217560768127 + }, + { + "epoch": 0.20061301166699624, + "step": 2029, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.20061301166699624, + "step": 2029, + "train/total_loss": 0.18202468752861023 + }, + { + "entropy": 9.75556755065918, + "epoch": 0.20071188451651176, + "mean_token_accuracy": 0.7542213797569275, + "num_tokens": 10561578.0, + "step": 2030, + "train/ce_loss": 0.7943035364151001 + }, + { + "epoch": 0.20071188451651176, + "step": 2030, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.20071188451651176, + "step": 2030, + "train/total_loss": 0.10677410662174225 + }, + { + "entropy": 9.545900344848633, + "epoch": 0.2008107573660273, + "mean_token_accuracy": 0.7027438879013062, + "num_tokens": 10566714.0, + "step": 2031, + "train/ce_loss": 1.2213687896728516 + }, + { + "epoch": 0.2008107573660273, + "step": 2031, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.2008107573660273, + "step": 2031, + "train/total_loss": 0.2705743908882141 + }, + { + "entropy": 9.558181762695312, + "epoch": 0.2009096302155428, + "mean_token_accuracy": 0.6989409923553467, + "num_tokens": 10571794.0, + "step": 2032, + "train/ce_loss": 0.7340117692947388 + }, + { + "epoch": 0.2009096302155428, + "step": 2032, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.2009096302155428, + "step": 2032, + "train/total_loss": 0.14371368288993835 + }, + { + "entropy": 9.313700675964355, + "epoch": 0.20100850306505832, + "mean_token_accuracy": 0.7363530993461609, + "num_tokens": 10577102.0, + "step": 2033, + "train/ce_loss": 1.2940865755081177 + }, + { + "epoch": 0.20100850306505832, + "step": 2033, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.20100850306505832, + "step": 2033, + "train/total_loss": 0.21925240755081177 + }, + { + "entropy": 9.426789283752441, + "epoch": 0.20110737591457387, + "mean_token_accuracy": 0.7166666388511658, + "num_tokens": 10582423.0, + "step": 2034, + "train/ce_loss": 1.9577373266220093 + }, + { + "epoch": 0.20110737591457387, + "step": 2034, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.20110737591457387, + "step": 2034, + "train/total_loss": 0.293429970741272 + }, + { + "entropy": 9.317292213439941, + "epoch": 0.20120624876408938, + "mean_token_accuracy": 0.7283950448036194, + "num_tokens": 10587736.0, + "step": 2035, + "train/ce_loss": 0.5608477592468262 + }, + { + "epoch": 0.20120624876408938, + "step": 2035, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.20120624876408938, + "step": 2035, + "train/total_loss": 0.0951472818851471 + }, + { + "entropy": 9.143550872802734, + "epoch": 0.2013051216136049, + "mean_token_accuracy": 0.6800433993339539, + "num_tokens": 10593142.0, + "step": 2036, + "train/ce_loss": 0.742180585861206 + }, + { + "epoch": 0.2013051216136049, + "step": 2036, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.2013051216136049, + "step": 2036, + "train/total_loss": 0.12499930709600449 + }, + { + "entropy": 9.595016479492188, + "epoch": 0.20140399446312043, + "mean_token_accuracy": 0.7216216325759888, + "num_tokens": 10598345.0, + "step": 2037, + "train/ce_loss": 1.1758081912994385 + }, + { + "epoch": 0.20140399446312043, + "step": 2037, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.20140399446312043, + "step": 2037, + "train/total_loss": 0.2152370810508728 + }, + { + "entropy": 9.325058937072754, + "epoch": 0.20150286731263595, + "mean_token_accuracy": 0.7465667724609375, + "num_tokens": 10603613.0, + "step": 2038, + "train/ce_loss": 1.1411672830581665 + }, + { + "epoch": 0.20150286731263595, + "step": 2038, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.20150286731263595, + "step": 2038, + "train/total_loss": 0.20005422830581665 + }, + { + "entropy": 8.968401908874512, + "epoch": 0.20160174016215146, + "mean_token_accuracy": 0.7274436354637146, + "num_tokens": 10609180.0, + "step": 2039, + "train/ce_loss": 1.000770092010498 + }, + { + "epoch": 0.20160174016215146, + "step": 2039, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.20160174016215146, + "step": 2039, + "train/total_loss": 0.15085825324058533 + }, + { + "epoch": 0.201700613011667, + "grad_norm": 0.8397514820098877, + "learning_rate": 9.498343470306089e-06, + "loss": 0.1556, + "step": 2040 + }, + { + "entropy": 9.181343078613281, + "epoch": 0.201700613011667, + "mean_token_accuracy": 0.6988505721092224, + "num_tokens": 10614524.0, + "step": 2040, + "train/ce_loss": 0.8533067107200623 + }, + { + "epoch": 0.201700613011667, + "step": 2040, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.201700613011667, + "step": 2040, + "train/total_loss": 0.14001816511154175 + }, + { + "entropy": 9.63900375366211, + "epoch": 0.20179948586118251, + "mean_token_accuracy": 0.7133758068084717, + "num_tokens": 10619605.0, + "step": 2041, + "train/ce_loss": 1.1804910898208618 + }, + { + "epoch": 0.20179948586118251, + "step": 2041, + "train/sim_loss": 0.14453125 + }, + { + "epoch": 0.20179948586118251, + "step": 2041, + "train/total_loss": 0.26258036494255066 + }, + { + "entropy": 9.486444473266602, + "epoch": 0.20189835871069806, + "mean_token_accuracy": 0.7389610409736633, + "num_tokens": 10624824.0, + "step": 2042, + "train/ce_loss": 1.2508841753005981 + }, + { + "epoch": 0.20189835871069806, + "step": 2042, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.20189835871069806, + "step": 2042, + "train/total_loss": 0.2266509234905243 + }, + { + "entropy": 9.04334831237793, + "epoch": 0.20199723156021357, + "mean_token_accuracy": 0.7137404680252075, + "num_tokens": 10630340.0, + "step": 2043, + "train/ce_loss": 0.6191092133522034 + }, + { + "epoch": 0.20199723156021357, + "step": 2043, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.20199723156021357, + "step": 2043, + "train/total_loss": 0.12831717729568481 + }, + { + "entropy": 9.424947738647461, + "epoch": 0.20209610440972908, + "mean_token_accuracy": 0.7550744414329529, + "num_tokens": 10635549.0, + "step": 2044, + "train/ce_loss": 0.7709404230117798 + }, + { + "epoch": 0.20209610440972908, + "step": 2044, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.20209610440972908, + "step": 2044, + "train/total_loss": 0.13178154826164246 + }, + { + "entropy": 9.402710914611816, + "epoch": 0.20219497725924462, + "mean_token_accuracy": 0.6962233185768127, + "num_tokens": 10640594.0, + "step": 2045, + "train/ce_loss": 1.000903844833374 + }, + { + "epoch": 0.20219497725924462, + "step": 2045, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.20219497725924462, + "step": 2045, + "train/total_loss": 0.1430591344833374 + }, + { + "entropy": 9.356023788452148, + "epoch": 0.20229385010876014, + "mean_token_accuracy": 0.6862980723381042, + "num_tokens": 10645914.0, + "step": 2046, + "train/ce_loss": 0.7545071840286255 + }, + { + "epoch": 0.20229385010876014, + "step": 2046, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.20229385010876014, + "step": 2046, + "train/total_loss": 0.13795071840286255 + }, + { + "entropy": 9.378089904785156, + "epoch": 0.20239272295827565, + "mean_token_accuracy": 0.7333333492279053, + "num_tokens": 10651059.0, + "step": 2047, + "train/ce_loss": 0.6553515195846558 + }, + { + "epoch": 0.20239272295827565, + "step": 2047, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.20239272295827565, + "step": 2047, + "train/total_loss": 0.09287890046834946 + }, + { + "entropy": 9.854352951049805, + "epoch": 0.2024915958077912, + "mean_token_accuracy": 0.7586776614189148, + "num_tokens": 10656119.0, + "step": 2048, + "train/ce_loss": 0.636549711227417 + }, + { + "epoch": 0.2024915958077912, + "step": 2048, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.2024915958077912, + "step": 2048, + "train/total_loss": 0.11443622410297394 + }, + { + "entropy": 9.793302536010742, + "epoch": 0.2025904686573067, + "mean_token_accuracy": 0.7410714030265808, + "num_tokens": 10661123.0, + "step": 2049, + "train/ce_loss": 5.3264425332599785e-06 + }, + { + "epoch": 0.2025904686573067, + "step": 2049, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.2025904686573067, + "step": 2049, + "train/total_loss": 0.02343803271651268 + }, + { + "entropy": 9.3540678024292, + "epoch": 0.20268934150682222, + "mean_token_accuracy": 0.6831579208374023, + "num_tokens": 10666513.0, + "step": 2050, + "train/ce_loss": 0.7889255285263062 + }, + { + "epoch": 0.20268934150682222, + "step": 2050, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.20268934150682222, + "step": 2050, + "train/total_loss": 0.1335800588130951 + }, + { + "entropy": 9.04610824584961, + "epoch": 0.20278821435633776, + "mean_token_accuracy": 0.7904656529426575, + "num_tokens": 10671915.0, + "step": 2051, + "train/ce_loss": 0.40603914856910706 + }, + { + "epoch": 0.20278821435633776, + "step": 2051, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.20278821435633776, + "step": 2051, + "train/total_loss": 0.07185392081737518 + }, + { + "entropy": 9.612567901611328, + "epoch": 0.20288708720585327, + "mean_token_accuracy": 0.7117241621017456, + "num_tokens": 10677099.0, + "step": 2052, + "train/ce_loss": 1.4318925142288208 + }, + { + "epoch": 0.20288708720585327, + "step": 2052, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.20288708720585327, + "step": 2052, + "train/total_loss": 0.21740800142288208 + }, + { + "entropy": 9.450386047363281, + "epoch": 0.20298596005536879, + "mean_token_accuracy": 0.7205513715744019, + "num_tokens": 10682354.0, + "step": 2053, + "train/ce_loss": 1.453116536140442 + }, + { + "epoch": 0.20298596005536879, + "step": 2053, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.20298596005536879, + "step": 2053, + "train/total_loss": 0.2195304036140442 + }, + { + "entropy": 9.776745796203613, + "epoch": 0.20308483290488433, + "mean_token_accuracy": 0.7428571581840515, + "num_tokens": 10687412.0, + "step": 2054, + "train/ce_loss": 0.9126024842262268 + }, + { + "epoch": 0.20308483290488433, + "step": 2054, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.20308483290488433, + "step": 2054, + "train/total_loss": 0.14985400438308716 + }, + { + "entropy": 9.455513000488281, + "epoch": 0.20318370575439984, + "mean_token_accuracy": 0.7038626670837402, + "num_tokens": 10692568.0, + "step": 2055, + "train/ce_loss": 1.1099965572357178 + }, + { + "epoch": 0.20318370575439984, + "step": 2055, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.20318370575439984, + "step": 2055, + "train/total_loss": 0.21256215870380402 + }, + { + "entropy": 9.406907081604004, + "epoch": 0.20328257860391535, + "mean_token_accuracy": 0.7060849666595459, + "num_tokens": 10697869.0, + "step": 2056, + "train/ce_loss": 1.2631419897079468 + }, + { + "epoch": 0.20328257860391535, + "step": 2056, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.20328257860391535, + "step": 2056, + "train/total_loss": 0.2239704579114914 + }, + { + "entropy": 10.24700927734375, + "epoch": 0.2033814514534309, + "mean_token_accuracy": 0.7822784781455994, + "num_tokens": 10702662.0, + "step": 2057, + "train/ce_loss": 7.433172413584543e-06 + }, + { + "epoch": 0.2033814514534309, + "step": 2057, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.2033814514534309, + "step": 2057, + "train/total_loss": 0.03125074505805969 + }, + { + "entropy": 9.484310150146484, + "epoch": 0.2034803243029464, + "mean_token_accuracy": 0.7227455973625183, + "num_tokens": 10707839.0, + "step": 2058, + "train/ce_loss": 0.6100543737411499 + }, + { + "epoch": 0.2034803243029464, + "step": 2058, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.2034803243029464, + "step": 2058, + "train/total_loss": 0.11569294333457947 + }, + { + "entropy": 8.70419692993164, + "epoch": 0.20357919715246192, + "mean_token_accuracy": 0.7291280031204224, + "num_tokens": 10713432.0, + "step": 2059, + "train/ce_loss": 1.097187876701355 + }, + { + "epoch": 0.20357919715246192, + "step": 2059, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.20357919715246192, + "step": 2059, + "train/total_loss": 0.16831254959106445 + }, + { + "epoch": 0.20367807000197746, + "grad_norm": 0.9894576668739319, + "learning_rate": 9.493398605548139e-06, + "loss": 0.1646, + "step": 2060 + }, + { + "entropy": 9.25218391418457, + "epoch": 0.20367807000197746, + "mean_token_accuracy": 0.786120593547821, + "num_tokens": 10718952.0, + "step": 2060, + "train/ce_loss": 0.46557751297950745 + }, + { + "epoch": 0.20367807000197746, + "step": 2060, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.20367807000197746, + "step": 2060, + "train/total_loss": 0.06608900427818298 + }, + { + "entropy": 9.467554092407227, + "epoch": 0.20377694285149298, + "mean_token_accuracy": 0.7763819098472595, + "num_tokens": 10724180.0, + "step": 2061, + "train/ce_loss": 1.1352170076861512e-05 + }, + { + "epoch": 0.20377694285149298, + "step": 2061, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.20377694285149298, + "step": 2061, + "train/total_loss": 0.07421988248825073 + }, + { + "entropy": 8.840513229370117, + "epoch": 0.2038758157010085, + "mean_token_accuracy": 0.7084745764732361, + "num_tokens": 10729844.0, + "step": 2062, + "train/ce_loss": 0.8225208520889282 + }, + { + "epoch": 0.2038758157010085, + "step": 2062, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.2038758157010085, + "step": 2062, + "train/total_loss": 0.17600208520889282 + }, + { + "entropy": 9.026546478271484, + "epoch": 0.20397468855052403, + "mean_token_accuracy": 0.7510729432106018, + "num_tokens": 10735238.0, + "step": 2063, + "train/ce_loss": 0.48530179262161255 + }, + { + "epoch": 0.20397468855052403, + "step": 2063, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.20397468855052403, + "step": 2063, + "train/total_loss": 0.07978017628192902 + }, + { + "entropy": 9.21430778503418, + "epoch": 0.20407356140003954, + "mean_token_accuracy": 0.6976439952850342, + "num_tokens": 10740468.0, + "step": 2064, + "train/ce_loss": 1.1475073099136353 + }, + { + "epoch": 0.20407356140003954, + "step": 2064, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.20407356140003954, + "step": 2064, + "train/total_loss": 0.19287574291229248 + }, + { + "entropy": 9.391798973083496, + "epoch": 0.20417243424955508, + "mean_token_accuracy": 0.7493734359741211, + "num_tokens": 10745754.0, + "step": 2065, + "train/ce_loss": 0.6040437817573547 + }, + { + "epoch": 0.20417243424955508, + "step": 2065, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.20417243424955508, + "step": 2065, + "train/total_loss": 0.12681062519550323 + }, + { + "entropy": 9.232144355773926, + "epoch": 0.2042713070990706, + "mean_token_accuracy": 0.7660738825798035, + "num_tokens": 10750975.0, + "step": 2066, + "train/ce_loss": 1.651016116142273 + }, + { + "epoch": 0.2042713070990706, + "step": 2066, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.2042713070990706, + "step": 2066, + "train/total_loss": 0.21197661757469177 + }, + { + "entropy": 9.279935836791992, + "epoch": 0.2043701799485861, + "mean_token_accuracy": 0.6896162629127502, + "num_tokens": 10756340.0, + "step": 2067, + "train/ce_loss": 1.0655268430709839 + }, + { + "epoch": 0.2043701799485861, + "step": 2067, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.2043701799485861, + "step": 2067, + "train/total_loss": 0.25499019026756287 + }, + { + "entropy": 9.068220138549805, + "epoch": 0.20446905279810165, + "mean_token_accuracy": 0.7304643392562866, + "num_tokens": 10761649.0, + "step": 2068, + "train/ce_loss": 0.7303726077079773 + }, + { + "epoch": 0.20446905279810165, + "step": 2068, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.20446905279810165, + "step": 2068, + "train/total_loss": 0.1785060167312622 + }, + { + "entropy": 9.917533874511719, + "epoch": 0.20456792564761717, + "mean_token_accuracy": 0.7224137783050537, + "num_tokens": 10766651.0, + "step": 2069, + "train/ce_loss": 1.1582452058792114 + }, + { + "epoch": 0.20456792564761717, + "step": 2069, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.20456792564761717, + "step": 2069, + "train/total_loss": 0.20566827058792114 + }, + { + "entropy": 9.242486953735352, + "epoch": 0.20466679849713268, + "mean_token_accuracy": 0.7773631811141968, + "num_tokens": 10771977.0, + "step": 2070, + "train/ce_loss": 0.8605750799179077 + }, + { + "epoch": 0.20466679849713268, + "step": 2070, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.20466679849713268, + "step": 2070, + "train/total_loss": 0.17199501395225525 + }, + { + "entropy": 9.046150207519531, + "epoch": 0.20476567134664822, + "mean_token_accuracy": 0.7056995034217834, + "num_tokens": 10777407.0, + "step": 2071, + "train/ce_loss": 0.9511678218841553 + }, + { + "epoch": 0.20476567134664822, + "step": 2071, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.20476567134664822, + "step": 2071, + "train/total_loss": 0.21621054410934448 + }, + { + "entropy": 9.479668617248535, + "epoch": 0.20486454419616373, + "mean_token_accuracy": 0.7462068796157837, + "num_tokens": 10782603.0, + "step": 2072, + "train/ce_loss": 0.616319477558136 + }, + { + "epoch": 0.20486454419616373, + "step": 2072, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.20486454419616373, + "step": 2072, + "train/total_loss": 0.1241319477558136 + }, + { + "entropy": 9.510650634765625, + "epoch": 0.20496341704567925, + "mean_token_accuracy": 0.7013513445854187, + "num_tokens": 10787838.0, + "step": 2073, + "train/ce_loss": 0.8334776163101196 + }, + { + "epoch": 0.20496341704567925, + "step": 2073, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.20496341704567925, + "step": 2073, + "train/total_loss": 0.16537901759147644 + }, + { + "entropy": 9.949750900268555, + "epoch": 0.2050622898951948, + "mean_token_accuracy": 0.6925858855247498, + "num_tokens": 10792781.0, + "step": 2074, + "train/ce_loss": 8.121015525830444e-06 + }, + { + "epoch": 0.2050622898951948, + "step": 2074, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.2050622898951948, + "step": 2074, + "train/total_loss": 0.035157062113285065 + }, + { + "entropy": 9.321216583251953, + "epoch": 0.2051611627447103, + "mean_token_accuracy": 0.7588306665420532, + "num_tokens": 10798042.0, + "step": 2075, + "train/ce_loss": 0.9396710991859436 + }, + { + "epoch": 0.2051611627447103, + "step": 2075, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.2051611627447103, + "step": 2075, + "train/total_loss": 0.14474835991859436 + }, + { + "entropy": 9.646879196166992, + "epoch": 0.20526003559422581, + "mean_token_accuracy": 0.7349768877029419, + "num_tokens": 10803123.0, + "step": 2076, + "train/ce_loss": 0.8319485187530518 + }, + { + "epoch": 0.20526003559422581, + "step": 2076, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.20526003559422581, + "step": 2076, + "train/total_loss": 0.15350735187530518 + }, + { + "entropy": 9.569722175598145, + "epoch": 0.20535890844374136, + "mean_token_accuracy": 0.762734591960907, + "num_tokens": 10808297.0, + "step": 2077, + "train/ce_loss": 1.0843838453292847 + }, + { + "epoch": 0.20535890844374136, + "step": 2077, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.20535890844374136, + "step": 2077, + "train/total_loss": 0.1318758875131607 + }, + { + "entropy": 9.064342498779297, + "epoch": 0.20545778129325687, + "mean_token_accuracy": 0.7198581695556641, + "num_tokens": 10813689.0, + "step": 2078, + "train/ce_loss": 0.8889843225479126 + }, + { + "epoch": 0.20545778129325687, + "step": 2078, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.20545778129325687, + "step": 2078, + "train/total_loss": 0.1592109352350235 + }, + { + "entropy": 9.698354721069336, + "epoch": 0.20555665414277238, + "mean_token_accuracy": 0.724473237991333, + "num_tokens": 10818763.0, + "step": 2079, + "train/ce_loss": 1.1419720649719238 + }, + { + "epoch": 0.20555665414277238, + "step": 2079, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.20555665414277238, + "step": 2079, + "train/total_loss": 0.23138470947742462 + }, + { + "epoch": 0.20565552699228792, + "grad_norm": 1.1899652481079102, + "learning_rate": 9.48845374079019e-06, + "loss": 0.1626, + "step": 2080 + }, + { + "entropy": 9.852563858032227, + "epoch": 0.20565552699228792, + "mean_token_accuracy": 0.7165775299072266, + "num_tokens": 10823782.0, + "step": 2080, + "train/ce_loss": 1.4960167407989502 + }, + { + "epoch": 0.20565552699228792, + "step": 2080, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.20565552699228792, + "step": 2080, + "train/total_loss": 0.23553918302059174 + }, + { + "entropy": 9.01875114440918, + "epoch": 0.20575439984180344, + "mean_token_accuracy": 0.7856468558311462, + "num_tokens": 10829311.0, + "step": 2081, + "train/ce_loss": 1.119086503982544 + }, + { + "epoch": 0.20575439984180344, + "step": 2081, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.20575439984180344, + "step": 2081, + "train/total_loss": 0.20565864443778992 + }, + { + "entropy": 9.640281677246094, + "epoch": 0.20585327269131895, + "mean_token_accuracy": 0.7061403393745422, + "num_tokens": 10834455.0, + "step": 2082, + "train/ce_loss": 2.283698797225952 + }, + { + "epoch": 0.20585327269131895, + "step": 2082, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.20585327269131895, + "step": 2082, + "train/total_loss": 0.29086989164352417 + }, + { + "entropy": 9.904674530029297, + "epoch": 0.2059521455408345, + "mean_token_accuracy": 0.7439516186714172, + "num_tokens": 10839380.0, + "step": 2083, + "train/ce_loss": 7.640025614819024e-06 + }, + { + "epoch": 0.2059521455408345, + "step": 2083, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.2059521455408345, + "step": 2083, + "train/total_loss": 0.08984451740980148 + }, + { + "entropy": 9.319259643554688, + "epoch": 0.20605101839035, + "mean_token_accuracy": 0.752662718296051, + "num_tokens": 10844689.0, + "step": 2084, + "train/ce_loss": 0.8190650939941406 + }, + { + "epoch": 0.20605101839035, + "step": 2084, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.20605101839035, + "step": 2084, + "train/total_loss": 0.1444065123796463 + }, + { + "entropy": 8.912906646728516, + "epoch": 0.20614989123986555, + "mean_token_accuracy": 0.7344045639038086, + "num_tokens": 10850216.0, + "step": 2085, + "train/ce_loss": 1.335227131843567 + }, + { + "epoch": 0.20614989123986555, + "step": 2085, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.20614989123986555, + "step": 2085, + "train/total_loss": 0.22727271914482117 + }, + { + "entropy": 9.269612312316895, + "epoch": 0.20624876408938106, + "mean_token_accuracy": 0.7739726305007935, + "num_tokens": 10855563.0, + "step": 2086, + "train/ce_loss": 0.8642188906669617 + }, + { + "epoch": 0.20624876408938106, + "step": 2086, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.20624876408938106, + "step": 2086, + "train/total_loss": 0.1489218920469284 + }, + { + "entropy": 9.489455223083496, + "epoch": 0.20634763693889657, + "mean_token_accuracy": 0.7437673211097717, + "num_tokens": 10860733.0, + "step": 2087, + "train/ce_loss": 1.35664701461792 + }, + { + "epoch": 0.20634763693889657, + "step": 2087, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.20634763693889657, + "step": 2087, + "train/total_loss": 0.213789701461792 + }, + { + "entropy": 9.293228149414062, + "epoch": 0.2064465097884121, + "mean_token_accuracy": 0.6814159154891968, + "num_tokens": 10866067.0, + "step": 2088, + "train/ce_loss": 0.9681704640388489 + }, + { + "epoch": 0.2064465097884121, + "step": 2088, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.2064465097884121, + "step": 2088, + "train/total_loss": 0.1397857964038849 + }, + { + "entropy": 9.249713897705078, + "epoch": 0.20654538263792763, + "mean_token_accuracy": 0.660804033279419, + "num_tokens": 10871350.0, + "step": 2089, + "train/ce_loss": 0.7813665270805359 + }, + { + "epoch": 0.20654538263792763, + "step": 2089, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.20654538263792763, + "step": 2089, + "train/total_loss": 0.1250116527080536 + }, + { + "entropy": 10.592601776123047, + "epoch": 0.20664425548744314, + "mean_token_accuracy": 0.7488986849784851, + "num_tokens": 10876001.0, + "step": 2090, + "train/ce_loss": 3.5592474887380376e-05 + }, + { + "epoch": 0.20664425548744314, + "step": 2090, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.20664425548744314, + "step": 2090, + "train/total_loss": 0.09375356137752533 + }, + { + "entropy": 9.088579177856445, + "epoch": 0.20674312833695868, + "mean_token_accuracy": 0.8073298335075378, + "num_tokens": 10881443.0, + "step": 2091, + "train/ce_loss": 0.6509912610054016 + }, + { + "epoch": 0.20674312833695868, + "step": 2091, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.20674312833695868, + "step": 2091, + "train/total_loss": 0.12759912014007568 + }, + { + "entropy": 9.509904861450195, + "epoch": 0.2068420011864742, + "mean_token_accuracy": 0.7426356673240662, + "num_tokens": 10886506.0, + "step": 2092, + "train/ce_loss": 1.523234486579895 + }, + { + "epoch": 0.2068420011864742, + "step": 2092, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.2068420011864742, + "step": 2092, + "train/total_loss": 0.21482345461845398 + }, + { + "entropy": 9.662224769592285, + "epoch": 0.2069408740359897, + "mean_token_accuracy": 0.7342767119407654, + "num_tokens": 10891595.0, + "step": 2093, + "train/ce_loss": 0.9919571876525879 + }, + { + "epoch": 0.2069408740359897, + "step": 2093, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.2069408740359897, + "step": 2093, + "train/total_loss": 0.1773207187652588 + }, + { + "entropy": 10.016589164733887, + "epoch": 0.20703974688550525, + "mean_token_accuracy": 0.7780821919441223, + "num_tokens": 10896358.0, + "step": 2094, + "train/ce_loss": 1.0874220132827759 + }, + { + "epoch": 0.20703974688550525, + "step": 2094, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.20703974688550525, + "step": 2094, + "train/total_loss": 0.16733595728874207 + }, + { + "entropy": 10.13791275024414, + "epoch": 0.20713861973502076, + "mean_token_accuracy": 0.713178277015686, + "num_tokens": 10901188.0, + "step": 2095, + "train/ce_loss": 2.002369365072809e-05 + }, + { + "epoch": 0.20713861973502076, + "step": 2095, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.20713861973502076, + "step": 2095, + "train/total_loss": 0.06640825420618057 + }, + { + "entropy": 9.9132080078125, + "epoch": 0.20723749258453628, + "mean_token_accuracy": 0.7882599830627441, + "num_tokens": 10906056.0, + "step": 2096, + "train/ce_loss": 1.241743803024292 + }, + { + "epoch": 0.20723749258453628, + "step": 2096, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.20723749258453628, + "step": 2096, + "train/total_loss": 0.17886188626289368 + }, + { + "entropy": 9.56640625, + "epoch": 0.20733636543405182, + "mean_token_accuracy": 0.7264705896377563, + "num_tokens": 10911182.0, + "step": 2097, + "train/ce_loss": 1.0573230981826782 + }, + { + "epoch": 0.20733636543405182, + "step": 2097, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.20733636543405182, + "step": 2097, + "train/total_loss": 0.19948232173919678 + }, + { + "entropy": 9.239940643310547, + "epoch": 0.20743523828356733, + "mean_token_accuracy": 0.7880299091339111, + "num_tokens": 10916466.0, + "step": 2098, + "train/ce_loss": 0.7562162280082703 + }, + { + "epoch": 0.20743523828356733, + "step": 2098, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.20743523828356733, + "step": 2098, + "train/total_loss": 0.17327788472175598 + }, + { + "entropy": 9.75033187866211, + "epoch": 0.20753411113308284, + "mean_token_accuracy": 0.7783985137939453, + "num_tokens": 10921439.0, + "step": 2099, + "train/ce_loss": 0.9728204011917114 + }, + { + "epoch": 0.20753411113308284, + "step": 2099, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.20753411113308284, + "step": 2099, + "train/total_loss": 0.1715008020401001 + }, + { + "epoch": 0.20763298398259838, + "grad_norm": 0.9003250002861023, + "learning_rate": 9.483508876032242e-06, + "loss": 0.1486, + "step": 2100 + }, + { + "entropy": 9.183356285095215, + "epoch": 0.20763298398259838, + "mean_token_accuracy": 0.7854356169700623, + "num_tokens": 10927015.0, + "step": 2100, + "train/ce_loss": 1.995689672185108e-05 + }, + { + "epoch": 0.20763298398259838, + "step": 2100, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.20763298398259838, + "step": 2100, + "train/total_loss": 0.046876996755599976 + }, + { + "entropy": 9.04728889465332, + "epoch": 0.2077318568321139, + "mean_token_accuracy": 0.7083854675292969, + "num_tokens": 10932288.0, + "step": 2101, + "train/ce_loss": 1.0550248622894287 + }, + { + "epoch": 0.2077318568321139, + "step": 2101, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.2077318568321139, + "step": 2101, + "train/total_loss": 0.23440873622894287 + }, + { + "entropy": 9.192512512207031, + "epoch": 0.2078307296816294, + "mean_token_accuracy": 0.7589189410209656, + "num_tokens": 10937698.0, + "step": 2102, + "train/ce_loss": 0.8490728139877319 + }, + { + "epoch": 0.2078307296816294, + "step": 2102, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.2078307296816294, + "step": 2102, + "train/total_loss": 0.16303229331970215 + }, + { + "entropy": 9.685310363769531, + "epoch": 0.20792960253114495, + "mean_token_accuracy": 0.699999988079071, + "num_tokens": 10942748.0, + "step": 2103, + "train/ce_loss": 2.1654441356658936 + }, + { + "epoch": 0.20792960253114495, + "step": 2103, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.20792960253114495, + "step": 2103, + "train/total_loss": 0.29857566952705383 + }, + { + "entropy": 10.370462417602539, + "epoch": 0.20802847538066047, + "mean_token_accuracy": 0.732758641242981, + "num_tokens": 10947501.0, + "step": 2104, + "train/ce_loss": 3.596203896449879e-05 + }, + { + "epoch": 0.20802847538066047, + "step": 2104, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.20802847538066047, + "step": 2104, + "train/total_loss": 0.058597344905138016 + }, + { + "entropy": 9.564261436462402, + "epoch": 0.208127348230176, + "mean_token_accuracy": 0.7226074934005737, + "num_tokens": 10952852.0, + "step": 2105, + "train/ce_loss": 1.111342191696167 + }, + { + "epoch": 0.208127348230176, + "step": 2105, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.208127348230176, + "step": 2105, + "train/total_loss": 0.20488423109054565 + }, + { + "entropy": 9.011837005615234, + "epoch": 0.20822622107969152, + "mean_token_accuracy": 0.6643495559692383, + "num_tokens": 10958359.0, + "step": 2106, + "train/ce_loss": 1.959960699081421 + }, + { + "epoch": 0.20822622107969152, + "step": 2106, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.20822622107969152, + "step": 2106, + "train/total_loss": 0.27802732586860657 + }, + { + "entropy": 9.457014083862305, + "epoch": 0.20832509392920703, + "mean_token_accuracy": 0.7094801068305969, + "num_tokens": 10963500.0, + "step": 2107, + "train/ce_loss": 0.552707850933075 + }, + { + "epoch": 0.20832509392920703, + "step": 2107, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.20832509392920703, + "step": 2107, + "train/total_loss": 0.11386454105377197 + }, + { + "entropy": 9.38405990600586, + "epoch": 0.20842396677872257, + "mean_token_accuracy": 0.7141134142875671, + "num_tokens": 10968779.0, + "step": 2108, + "train/ce_loss": 1.2475124597549438 + }, + { + "epoch": 0.20842396677872257, + "step": 2108, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.20842396677872257, + "step": 2108, + "train/total_loss": 0.1716262400150299 + }, + { + "entropy": 9.579483032226562, + "epoch": 0.2085228396282381, + "mean_token_accuracy": 0.7328858971595764, + "num_tokens": 10973967.0, + "step": 2109, + "train/ce_loss": 1.4650330543518066 + }, + { + "epoch": 0.2085228396282381, + "step": 2109, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.2085228396282381, + "step": 2109, + "train/total_loss": 0.24025331437587738 + }, + { + "entropy": 9.960945129394531, + "epoch": 0.2086217124777536, + "mean_token_accuracy": 0.789264440536499, + "num_tokens": 10978893.0, + "step": 2110, + "train/ce_loss": 1.619686918274965e-05 + }, + { + "epoch": 0.2086217124777536, + "step": 2110, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.2086217124777536, + "step": 2110, + "train/total_loss": 0.05468912050127983 + }, + { + "entropy": 9.54365348815918, + "epoch": 0.20872058532726914, + "mean_token_accuracy": 0.723849356174469, + "num_tokens": 10984029.0, + "step": 2111, + "train/ce_loss": 0.7713753581047058 + }, + { + "epoch": 0.20872058532726914, + "step": 2111, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.20872058532726914, + "step": 2111, + "train/total_loss": 0.1591687798500061 + }, + { + "entropy": 8.940006256103516, + "epoch": 0.20881945817678466, + "mean_token_accuracy": 0.7734752893447876, + "num_tokens": 10989556.0, + "step": 2112, + "train/ce_loss": 0.7423337697982788 + }, + { + "epoch": 0.20881945817678466, + "step": 2112, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.20881945817678466, + "step": 2112, + "train/total_loss": 0.12892088294029236 + }, + { + "entropy": 9.339137077331543, + "epoch": 0.20891833102630017, + "mean_token_accuracy": 0.7015098929405212, + "num_tokens": 10994905.0, + "step": 2113, + "train/ce_loss": 0.9357536435127258 + }, + { + "epoch": 0.20891833102630017, + "step": 2113, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.20891833102630017, + "step": 2113, + "train/total_loss": 0.1912316083908081 + }, + { + "entropy": 8.97794246673584, + "epoch": 0.2090172038758157, + "mean_token_accuracy": 0.7137647867202759, + "num_tokens": 11000354.0, + "step": 2114, + "train/ce_loss": 0.583050012588501 + }, + { + "epoch": 0.2090172038758157, + "step": 2114, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.2090172038758157, + "step": 2114, + "train/total_loss": 0.11299250274896622 + }, + { + "entropy": 9.324902534484863, + "epoch": 0.20911607672533122, + "mean_token_accuracy": 0.7692307829856873, + "num_tokens": 11005621.0, + "step": 2115, + "train/ce_loss": 0.7004877924919128 + }, + { + "epoch": 0.20911607672533122, + "step": 2115, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.20911607672533122, + "step": 2115, + "train/total_loss": 0.10911127924919128 + }, + { + "entropy": 10.162012100219727, + "epoch": 0.20921494957484674, + "mean_token_accuracy": 0.7238979339599609, + "num_tokens": 11010461.0, + "step": 2116, + "train/ce_loss": 7.267015007528244e-06 + }, + { + "epoch": 0.20921494957484674, + "step": 2116, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.20921494957484674, + "step": 2116, + "train/total_loss": 0.0585944764316082 + }, + { + "entropy": 9.51352310180664, + "epoch": 0.20931382242436228, + "mean_token_accuracy": 0.6925795078277588, + "num_tokens": 11015508.0, + "step": 2117, + "train/ce_loss": 1.5570176401524805e-05 + }, + { + "epoch": 0.20931382242436228, + "step": 2117, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.20931382242436228, + "step": 2117, + "train/total_loss": 0.04687655717134476 + }, + { + "entropy": 9.183328628540039, + "epoch": 0.2094126952738778, + "mean_token_accuracy": 0.7293233275413513, + "num_tokens": 11020821.0, + "step": 2118, + "train/ce_loss": 1.0247259140014648 + }, + { + "epoch": 0.2094126952738778, + "step": 2118, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.2094126952738778, + "step": 2118, + "train/total_loss": 0.22356635332107544 + }, + { + "entropy": 9.582716941833496, + "epoch": 0.2095115681233933, + "mean_token_accuracy": 0.7318840622901917, + "num_tokens": 11025944.0, + "step": 2119, + "train/ce_loss": 1.099069595336914 + }, + { + "epoch": 0.2095115681233933, + "step": 2119, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.2095115681233933, + "step": 2119, + "train/total_loss": 0.17631322145462036 + }, + { + "epoch": 0.20961044097290885, + "grad_norm": 0.7997581362724304, + "learning_rate": 9.478564011274292e-06, + "loss": 0.1587, + "step": 2120 + }, + { + "entropy": 9.7577486038208, + "epoch": 0.20961044097290885, + "mean_token_accuracy": 0.7158878445625305, + "num_tokens": 11030972.0, + "step": 2120, + "train/ce_loss": 6.1127025219320785e-06 + }, + { + "epoch": 0.20961044097290885, + "step": 2120, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.20961044097290885, + "step": 2120, + "train/total_loss": 0.06640686094760895 + }, + { + "entropy": 9.57359504699707, + "epoch": 0.20970931382242436, + "mean_token_accuracy": 0.707446813583374, + "num_tokens": 11036159.0, + "step": 2121, + "train/ce_loss": 0.7156136631965637 + }, + { + "epoch": 0.20970931382242436, + "step": 2121, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.20970931382242436, + "step": 2121, + "train/total_loss": 0.14187386631965637 + }, + { + "entropy": 9.341856002807617, + "epoch": 0.20980818667193987, + "mean_token_accuracy": 0.7126582264900208, + "num_tokens": 11041429.0, + "step": 2122, + "train/ce_loss": 1.9096126556396484 + }, + { + "epoch": 0.20980818667193987, + "step": 2122, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.20980818667193987, + "step": 2122, + "train/total_loss": 0.3003362715244293 + }, + { + "entropy": 9.650550842285156, + "epoch": 0.2099070595214554, + "mean_token_accuracy": 0.7735294103622437, + "num_tokens": 11046545.0, + "step": 2123, + "train/ce_loss": 1.2696391344070435 + }, + { + "epoch": 0.2099070595214554, + "step": 2123, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.2099070595214554, + "step": 2123, + "train/total_loss": 0.14649516344070435 + }, + { + "entropy": 9.482165336608887, + "epoch": 0.21000593237097093, + "mean_token_accuracy": 0.7585185170173645, + "num_tokens": 11051671.0, + "step": 2124, + "train/ce_loss": 0.9867421984672546 + }, + { + "epoch": 0.21000593237097093, + "step": 2124, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.21000593237097093, + "step": 2124, + "train/total_loss": 0.1689867228269577 + }, + { + "entropy": 9.264106750488281, + "epoch": 0.21010480522048647, + "mean_token_accuracy": 0.7266591787338257, + "num_tokens": 11056995.0, + "step": 2125, + "train/ce_loss": 0.8551509976387024 + }, + { + "epoch": 0.21010480522048647, + "step": 2125, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.21010480522048647, + "step": 2125, + "train/total_loss": 0.2183276116847992 + }, + { + "entropy": 9.480263710021973, + "epoch": 0.21020367807000198, + "mean_token_accuracy": 0.6494413614273071, + "num_tokens": 11062190.0, + "step": 2126, + "train/ce_loss": 1.0613958835601807 + }, + { + "epoch": 0.21020367807000198, + "step": 2126, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.21020367807000198, + "step": 2126, + "train/total_loss": 0.18426460027694702 + }, + { + "entropy": 9.736364364624023, + "epoch": 0.2103025509195175, + "mean_token_accuracy": 0.7264705896377563, + "num_tokens": 11067286.0, + "step": 2127, + "train/ce_loss": 6.552802005899139e-06 + }, + { + "epoch": 0.2103025509195175, + "step": 2127, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.2103025509195175, + "step": 2127, + "train/total_loss": 0.10937565565109253 + }, + { + "entropy": 9.37752914428711, + "epoch": 0.21040142376903304, + "mean_token_accuracy": 0.7278401851654053, + "num_tokens": 11072598.0, + "step": 2128, + "train/ce_loss": 1.0987099409103394 + }, + { + "epoch": 0.21040142376903304, + "step": 2128, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.21040142376903304, + "step": 2128, + "train/total_loss": 0.1958085000514984 + }, + { + "entropy": 9.320964813232422, + "epoch": 0.21050029661854855, + "mean_token_accuracy": 0.6972255706787109, + "num_tokens": 11077856.0, + "step": 2129, + "train/ce_loss": 1.042314052581787 + }, + { + "epoch": 0.21050029661854855, + "step": 2129, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.21050029661854855, + "step": 2129, + "train/total_loss": 0.19016891717910767 + }, + { + "entropy": 9.59323787689209, + "epoch": 0.21059916946806406, + "mean_token_accuracy": 0.716312050819397, + "num_tokens": 11083033.0, + "step": 2130, + "train/ce_loss": 1.0699758529663086 + }, + { + "epoch": 0.21059916946806406, + "step": 2130, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.21059916946806406, + "step": 2130, + "train/total_loss": 0.21246632933616638 + }, + { + "entropy": 9.647937774658203, + "epoch": 0.2106980423175796, + "mean_token_accuracy": 0.7286356687545776, + "num_tokens": 11088142.0, + "step": 2131, + "train/ce_loss": 0.695247232913971 + }, + { + "epoch": 0.2106980423175796, + "step": 2131, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.2106980423175796, + "step": 2131, + "train/total_loss": 0.11639972776174545 + }, + { + "entropy": 9.487251281738281, + "epoch": 0.21079691516709512, + "mean_token_accuracy": 0.6836734414100647, + "num_tokens": 11093232.0, + "step": 2132, + "train/ce_loss": 1.0981719493865967 + }, + { + "epoch": 0.21079691516709512, + "step": 2132, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.21079691516709512, + "step": 2132, + "train/total_loss": 0.18012970685958862 + }, + { + "entropy": 8.924519538879395, + "epoch": 0.21089578801661063, + "mean_token_accuracy": 0.6953441500663757, + "num_tokens": 11098710.0, + "step": 2133, + "train/ce_loss": 1.0247503519058228 + }, + { + "epoch": 0.21089578801661063, + "step": 2133, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.21089578801661063, + "step": 2133, + "train/total_loss": 0.18841254711151123 + }, + { + "entropy": 9.11449146270752, + "epoch": 0.21099466086612617, + "mean_token_accuracy": 0.7630979418754578, + "num_tokens": 11104097.0, + "step": 2134, + "train/ce_loss": 0.796212375164032 + }, + { + "epoch": 0.21099466086612617, + "step": 2134, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.21099466086612617, + "step": 2134, + "train/total_loss": 0.11868374049663544 + }, + { + "entropy": 9.475465774536133, + "epoch": 0.21109353371564168, + "mean_token_accuracy": 0.7591036558151245, + "num_tokens": 11109279.0, + "step": 2135, + "train/ce_loss": 0.7177180647850037 + }, + { + "epoch": 0.21109353371564168, + "step": 2135, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.21109353371564168, + "step": 2135, + "train/total_loss": 0.1381780505180359 + }, + { + "entropy": 9.492043495178223, + "epoch": 0.2111924065651572, + "mean_token_accuracy": 0.7643678188323975, + "num_tokens": 11114417.0, + "step": 2136, + "train/ce_loss": 0.841325581073761 + }, + { + "epoch": 0.2111924065651572, + "step": 2136, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.2111924065651572, + "step": 2136, + "train/total_loss": 0.16225755214691162 + }, + { + "entropy": 9.480161666870117, + "epoch": 0.21129127941467274, + "mean_token_accuracy": 0.7516087293624878, + "num_tokens": 11119571.0, + "step": 2137, + "train/ce_loss": 0.8480008840560913 + }, + { + "epoch": 0.21129127941467274, + "step": 2137, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.21129127941467274, + "step": 2137, + "train/total_loss": 0.1512063443660736 + }, + { + "entropy": 9.458097457885742, + "epoch": 0.21139015226418825, + "mean_token_accuracy": 0.717208206653595, + "num_tokens": 11124856.0, + "step": 2138, + "train/ce_loss": 1.1783355474472046 + }, + { + "epoch": 0.21139015226418825, + "step": 2138, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.21139015226418825, + "step": 2138, + "train/total_loss": 0.18814605474472046 + }, + { + "entropy": 9.53870964050293, + "epoch": 0.21148902511370377, + "mean_token_accuracy": 0.6192959547042847, + "num_tokens": 11130074.0, + "step": 2139, + "train/ce_loss": 4.688951321440982e-06 + }, + { + "epoch": 0.21148902511370377, + "step": 2139, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.21148902511370377, + "step": 2139, + "train/total_loss": 0.0664067193865776 + }, + { + "epoch": 0.2115878979632193, + "grad_norm": 0.9693803191184998, + "learning_rate": 9.473619146516345e-06, + "loss": 0.1694, + "step": 2140 + }, + { + "entropy": 10.037150382995605, + "epoch": 0.2115878979632193, + "mean_token_accuracy": 0.6808118224143982, + "num_tokens": 11135043.0, + "step": 2140, + "train/ce_loss": 7.014597940724343e-06 + }, + { + "epoch": 0.2115878979632193, + "step": 2140, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.2115878979632193, + "step": 2140, + "train/total_loss": 0.07421945035457611 + }, + { + "entropy": 10.606910705566406, + "epoch": 0.21168677081273482, + "mean_token_accuracy": 0.7755101919174194, + "num_tokens": 11139654.0, + "step": 2141, + "train/ce_loss": 3.7913382053375244 + }, + { + "epoch": 0.21168677081273482, + "step": 2141, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.21168677081273482, + "step": 2141, + "train/total_loss": 0.44554007053375244 + }, + { + "entropy": 9.145793914794922, + "epoch": 0.21178564366225033, + "mean_token_accuracy": 0.7507820725440979, + "num_tokens": 11145031.0, + "step": 2142, + "train/ce_loss": 0.847519040107727 + }, + { + "epoch": 0.21178564366225033, + "step": 2142, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.21178564366225033, + "step": 2142, + "train/total_loss": 0.1824081540107727 + }, + { + "entropy": 9.520444869995117, + "epoch": 0.21188451651176587, + "mean_token_accuracy": 0.7269326448440552, + "num_tokens": 11150242.0, + "step": 2143, + "train/ce_loss": 1.060303807258606 + }, + { + "epoch": 0.21188451651176587, + "step": 2143, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.21188451651176587, + "step": 2143, + "train/total_loss": 0.18415537476539612 + }, + { + "entropy": 9.657861709594727, + "epoch": 0.2119833893612814, + "mean_token_accuracy": 0.7496063113212585, + "num_tokens": 11155328.0, + "step": 2144, + "train/ce_loss": 1.1257550716400146 + }, + { + "epoch": 0.2119833893612814, + "step": 2144, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.2119833893612814, + "step": 2144, + "train/total_loss": 0.178981751203537 + }, + { + "entropy": 9.078840255737305, + "epoch": 0.2120822622107969, + "mean_token_accuracy": 0.7188796401023865, + "num_tokens": 11160787.0, + "step": 2145, + "train/ce_loss": 1.0356298685073853 + }, + { + "epoch": 0.2120822622107969, + "step": 2145, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.2120822622107969, + "step": 2145, + "train/total_loss": 0.19340673089027405 + }, + { + "entropy": 9.330584526062012, + "epoch": 0.21218113506031244, + "mean_token_accuracy": 0.7790432572364807, + "num_tokens": 11166100.0, + "step": 2146, + "train/ce_loss": 0.5168320536613464 + }, + { + "epoch": 0.21218113506031244, + "step": 2146, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.21218113506031244, + "step": 2146, + "train/total_loss": 0.0790269523859024 + }, + { + "entropy": 9.300143241882324, + "epoch": 0.21228000790982796, + "mean_token_accuracy": 0.7127937078475952, + "num_tokens": 11171354.0, + "step": 2147, + "train/ce_loss": 0.5148841142654419 + }, + { + "epoch": 0.21228000790982796, + "step": 2147, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.21228000790982796, + "step": 2147, + "train/total_loss": 0.12570716440677643 + }, + { + "entropy": 9.406492233276367, + "epoch": 0.2123788807593435, + "mean_token_accuracy": 0.7316455841064453, + "num_tokens": 11176646.0, + "step": 2148, + "train/ce_loss": 4.975815500074532e-06 + }, + { + "epoch": 0.2123788807593435, + "step": 2148, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.2123788807593435, + "step": 2148, + "train/total_loss": 0.046875499188899994 + }, + { + "entropy": 9.134361267089844, + "epoch": 0.212477753608859, + "mean_token_accuracy": 0.7242562770843506, + "num_tokens": 11182054.0, + "step": 2149, + "train/ce_loss": 0.6157044768333435 + }, + { + "epoch": 0.212477753608859, + "step": 2149, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.212477753608859, + "step": 2149, + "train/total_loss": 0.1318829506635666 + }, + { + "entropy": 9.376569747924805, + "epoch": 0.21257662645837452, + "mean_token_accuracy": 0.7273809313774109, + "num_tokens": 11187347.0, + "step": 2150, + "train/ce_loss": 0.8534751534461975 + }, + { + "epoch": 0.21257662645837452, + "step": 2150, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.21257662645837452, + "step": 2150, + "train/total_loss": 0.175191268324852 + }, + { + "entropy": 8.996522903442383, + "epoch": 0.21267549930789006, + "mean_token_accuracy": 0.6850152611732483, + "num_tokens": 11192806.0, + "step": 2151, + "train/ce_loss": 1.2981199026107788 + }, + { + "epoch": 0.21267549930789006, + "step": 2151, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.21267549930789006, + "step": 2151, + "train/total_loss": 0.19621823728084564 + }, + { + "entropy": 9.780609130859375, + "epoch": 0.21277437215740558, + "mean_token_accuracy": 0.7373417615890503, + "num_tokens": 11197876.0, + "step": 2152, + "train/ce_loss": 1.066979169845581 + }, + { + "epoch": 0.21277437215740558, + "step": 2152, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.21277437215740558, + "step": 2152, + "train/total_loss": 0.1613854169845581 + }, + { + "entropy": 9.951211929321289, + "epoch": 0.2128732450069211, + "mean_token_accuracy": 0.7306337952613831, + "num_tokens": 11202843.0, + "step": 2153, + "train/ce_loss": 0.8087383508682251 + }, + { + "epoch": 0.2128732450069211, + "step": 2153, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.2128732450069211, + "step": 2153, + "train/total_loss": 0.17462384700775146 + }, + { + "entropy": 9.533098220825195, + "epoch": 0.21297211785643663, + "mean_token_accuracy": 0.7285318374633789, + "num_tokens": 11207989.0, + "step": 2154, + "train/ce_loss": 1.0698336362838745 + }, + { + "epoch": 0.21297211785643663, + "step": 2154, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.21297211785643663, + "step": 2154, + "train/total_loss": 0.18901461362838745 + }, + { + "entropy": 9.042793273925781, + "epoch": 0.21307099070595215, + "mean_token_accuracy": 0.7118847370147705, + "num_tokens": 11213272.0, + "step": 2155, + "train/ce_loss": 1.1877574920654297 + }, + { + "epoch": 0.21307099070595215, + "step": 2155, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.21307099070595215, + "step": 2155, + "train/total_loss": 0.24768200516700745 + }, + { + "entropy": 9.832735061645508, + "epoch": 0.21316986355546766, + "mean_token_accuracy": 0.69749516248703, + "num_tokens": 11218417.0, + "step": 2156, + "train/ce_loss": 1.2040752172470093 + }, + { + "epoch": 0.21316986355546766, + "step": 2156, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.21316986355546766, + "step": 2156, + "train/total_loss": 0.17509502172470093 + }, + { + "entropy": 9.607536315917969, + "epoch": 0.2132687364049832, + "mean_token_accuracy": 0.7495826482772827, + "num_tokens": 11223441.0, + "step": 2157, + "train/ce_loss": 1.0795719623565674 + }, + { + "epoch": 0.2132687364049832, + "step": 2157, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.2132687364049832, + "step": 2157, + "train/total_loss": 0.20951969921588898 + }, + { + "entropy": 9.695856094360352, + "epoch": 0.2133676092544987, + "mean_token_accuracy": 0.7353760600090027, + "num_tokens": 11228621.0, + "step": 2158, + "train/ce_loss": 0.9282440543174744 + }, + { + "epoch": 0.2133676092544987, + "step": 2158, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.2133676092544987, + "step": 2158, + "train/total_loss": 0.17094939947128296 + }, + { + "entropy": 9.801984786987305, + "epoch": 0.21346648210401423, + "mean_token_accuracy": 0.769784152507782, + "num_tokens": 11233643.0, + "step": 2159, + "train/ce_loss": 0.6551988124847412 + }, + { + "epoch": 0.21346648210401423, + "step": 2159, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.21346648210401423, + "step": 2159, + "train/total_loss": 0.13192613422870636 + }, + { + "epoch": 0.21356535495352977, + "grad_norm": 0.8683087229728699, + "learning_rate": 9.468674281758395e-06, + "loss": 0.1605, + "step": 2160 + }, + { + "entropy": 9.378292083740234, + "epoch": 0.21356535495352977, + "mean_token_accuracy": 0.6399999856948853, + "num_tokens": 11238962.0, + "step": 2160, + "train/ce_loss": 1.1224546432495117 + }, + { + "epoch": 0.21356535495352977, + "step": 2160, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.21356535495352977, + "step": 2160, + "train/total_loss": 0.15521422028541565 + }, + { + "entropy": 9.72561264038086, + "epoch": 0.21366422780304528, + "mean_token_accuracy": 0.7355072498321533, + "num_tokens": 11243918.0, + "step": 2161, + "train/ce_loss": 1.2311592102050781 + }, + { + "epoch": 0.21366422780304528, + "step": 2161, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.21366422780304528, + "step": 2161, + "train/total_loss": 0.1973346769809723 + }, + { + "entropy": 9.457310676574707, + "epoch": 0.2137631006525608, + "mean_token_accuracy": 0.7752043604850769, + "num_tokens": 11249148.0, + "step": 2162, + "train/ce_loss": 0.3882007598876953 + }, + { + "epoch": 0.2137631006525608, + "step": 2162, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.2137631006525608, + "step": 2162, + "train/total_loss": 0.11694507300853729 + }, + { + "entropy": 8.943735122680664, + "epoch": 0.21386197350207634, + "mean_token_accuracy": 0.7989473938941956, + "num_tokens": 11254627.0, + "step": 2163, + "train/ce_loss": 0.644926130771637 + }, + { + "epoch": 0.21386197350207634, + "step": 2163, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.21386197350207634, + "step": 2163, + "train/total_loss": 0.1230863630771637 + }, + { + "entropy": 8.746071815490723, + "epoch": 0.21396084635159185, + "mean_token_accuracy": 0.754162609577179, + "num_tokens": 11260360.0, + "step": 2164, + "train/ce_loss": 1.0563163757324219 + }, + { + "epoch": 0.21396084635159185, + "step": 2164, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.21396084635159185, + "step": 2164, + "train/total_loss": 0.24235039949417114 + }, + { + "entropy": 9.765542984008789, + "epoch": 0.21405971920110736, + "mean_token_accuracy": 0.7383177280426025, + "num_tokens": 11265321.0, + "step": 2165, + "train/ce_loss": 0.630692720413208 + }, + { + "epoch": 0.21405971920110736, + "step": 2165, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.21405971920110736, + "step": 2165, + "train/total_loss": 0.14119428396224976 + }, + { + "entropy": 9.095598220825195, + "epoch": 0.2141585920506229, + "mean_token_accuracy": 0.8049792647361755, + "num_tokens": 11270720.0, + "step": 2166, + "train/ce_loss": 0.8454383611679077 + }, + { + "epoch": 0.2141585920506229, + "step": 2166, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.2141585920506229, + "step": 2166, + "train/total_loss": 0.11188758909702301 + }, + { + "entropy": 9.24032974243164, + "epoch": 0.21425746490013842, + "mean_token_accuracy": 0.68727707862854, + "num_tokens": 11276037.0, + "step": 2167, + "train/ce_loss": 0.8193873167037964 + }, + { + "epoch": 0.21425746490013842, + "step": 2167, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.21425746490013842, + "step": 2167, + "train/total_loss": 0.1444387435913086 + }, + { + "entropy": 9.336334228515625, + "epoch": 0.21435633774965396, + "mean_token_accuracy": 0.690157949924469, + "num_tokens": 11281342.0, + "step": 2168, + "train/ce_loss": 1.3831806182861328 + }, + { + "epoch": 0.21435633774965396, + "step": 2168, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.21435633774965396, + "step": 2168, + "train/total_loss": 0.21644306182861328 + }, + { + "entropy": 9.339741706848145, + "epoch": 0.21445521059916947, + "mean_token_accuracy": 0.6936339735984802, + "num_tokens": 11286521.0, + "step": 2169, + "train/ce_loss": 2.5318486223113723e-05 + }, + { + "epoch": 0.21445521059916947, + "step": 2169, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.21445521059916947, + "step": 2169, + "train/total_loss": 0.054690033197402954 + }, + { + "entropy": 9.174461364746094, + "epoch": 0.21455408344868498, + "mean_token_accuracy": 0.7567886710166931, + "num_tokens": 11291855.0, + "step": 2170, + "train/ce_loss": 0.6254871487617493 + }, + { + "epoch": 0.21455408344868498, + "step": 2170, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.21455408344868498, + "step": 2170, + "train/total_loss": 0.08598621934652328 + }, + { + "entropy": 9.489059448242188, + "epoch": 0.21465295629820053, + "mean_token_accuracy": 0.8178191781044006, + "num_tokens": 11297026.0, + "step": 2171, + "train/ce_loss": 0.5916562080383301 + }, + { + "epoch": 0.21465295629820053, + "step": 2171, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.21465295629820053, + "step": 2171, + "train/total_loss": 0.12166562676429749 + }, + { + "entropy": 9.058624267578125, + "epoch": 0.21475182914771604, + "mean_token_accuracy": 0.7308510541915894, + "num_tokens": 11302421.0, + "step": 2172, + "train/ce_loss": 0.659095823764801 + }, + { + "epoch": 0.21475182914771604, + "step": 2172, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.21475182914771604, + "step": 2172, + "train/total_loss": 0.08934708684682846 + }, + { + "entropy": 9.979301452636719, + "epoch": 0.21485070199723155, + "mean_token_accuracy": 0.7422037124633789, + "num_tokens": 11307316.0, + "step": 2173, + "train/ce_loss": 1.0199291706085205 + }, + { + "epoch": 0.21485070199723155, + "step": 2173, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.21485070199723155, + "step": 2173, + "train/total_loss": 0.1488679200410843 + }, + { + "entropy": 10.074808120727539, + "epoch": 0.2149495748467471, + "mean_token_accuracy": 0.7215447425842285, + "num_tokens": 11312359.0, + "step": 2174, + "train/ce_loss": 4.587761395669077e-06 + }, + { + "epoch": 0.2149495748467471, + "step": 2174, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.2149495748467471, + "step": 2174, + "train/total_loss": 0.05859420821070671 + }, + { + "entropy": 9.64946174621582, + "epoch": 0.2150484476962626, + "mean_token_accuracy": 0.6759868264198303, + "num_tokens": 11317399.0, + "step": 2175, + "train/ce_loss": 1.8297922611236572 + }, + { + "epoch": 0.2150484476962626, + "step": 2175, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.2150484476962626, + "step": 2175, + "train/total_loss": 0.2650104761123657 + }, + { + "entropy": 10.183679580688477, + "epoch": 0.21514732054577812, + "mean_token_accuracy": 0.7074999809265137, + "num_tokens": 11322203.0, + "step": 2176, + "train/ce_loss": 6.507055786642013e-06 + }, + { + "epoch": 0.21514732054577812, + "step": 2176, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.21514732054577812, + "step": 2176, + "train/total_loss": 0.02343815006315708 + }, + { + "entropy": 9.698988914489746, + "epoch": 0.21524619339529366, + "mean_token_accuracy": 0.7300319671630859, + "num_tokens": 11327284.0, + "step": 2177, + "train/ce_loss": 5.713675363949733e-06 + }, + { + "epoch": 0.21524619339529366, + "step": 2177, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.21524619339529366, + "step": 2177, + "train/total_loss": 0.07812557369470596 + }, + { + "entropy": 9.833122253417969, + "epoch": 0.21534506624480917, + "mean_token_accuracy": 0.7629233598709106, + "num_tokens": 11332247.0, + "step": 2178, + "train/ce_loss": 3.8449916246463545e-06 + }, + { + "epoch": 0.21534506624480917, + "step": 2178, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.21534506624480917, + "step": 2178, + "train/total_loss": 0.03125038370490074 + }, + { + "entropy": 9.060279846191406, + "epoch": 0.2154439390943247, + "mean_token_accuracy": 0.7414247989654541, + "num_tokens": 11337479.0, + "step": 2179, + "train/ce_loss": 0.8337211608886719 + }, + { + "epoch": 0.2154439390943247, + "step": 2179, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.2154439390943247, + "step": 2179, + "train/total_loss": 0.2005596160888672 + }, + { + "epoch": 0.21554281194384023, + "grad_norm": 0.9379047751426697, + "learning_rate": 9.463729417000446e-06, + "loss": 0.1581, + "step": 2180 + }, + { + "entropy": 9.990635871887207, + "epoch": 0.21554281194384023, + "mean_token_accuracy": 0.7019438147544861, + "num_tokens": 11342383.0, + "step": 2180, + "train/ce_loss": 2.821929454803467 + }, + { + "epoch": 0.21554281194384023, + "step": 2180, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.21554281194384023, + "step": 2180, + "train/total_loss": 0.3407866954803467 + }, + { + "entropy": 9.707884788513184, + "epoch": 0.21564168479335574, + "mean_token_accuracy": 0.6892489194869995, + "num_tokens": 11347519.0, + "step": 2181, + "train/ce_loss": 2.8116862722527003e-06 + }, + { + "epoch": 0.21564168479335574, + "step": 2181, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.21564168479335574, + "step": 2181, + "train/total_loss": 0.08203153312206268 + }, + { + "entropy": 9.095956802368164, + "epoch": 0.21574055764287126, + "mean_token_accuracy": 0.7733773589134216, + "num_tokens": 11352908.0, + "step": 2182, + "train/ce_loss": 0.8760699033737183 + }, + { + "epoch": 0.21574055764287126, + "step": 2182, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.21574055764287126, + "step": 2182, + "train/total_loss": 0.1813569962978363 + }, + { + "entropy": 9.073506355285645, + "epoch": 0.2158394304923868, + "mean_token_accuracy": 0.778372585773468, + "num_tokens": 11358491.0, + "step": 2183, + "train/ce_loss": 0.340642511844635 + }, + { + "epoch": 0.2158394304923868, + "step": 2183, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.2158394304923868, + "step": 2183, + "train/total_loss": 0.06531424820423126 + }, + { + "entropy": 9.381427764892578, + "epoch": 0.2159383033419023, + "mean_token_accuracy": 0.7582128643989563, + "num_tokens": 11363764.0, + "step": 2184, + "train/ce_loss": 2.680990064618527e-06 + }, + { + "epoch": 0.2159383033419023, + "step": 2184, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.2159383033419023, + "step": 2184, + "train/total_loss": 0.06250026822090149 + }, + { + "entropy": 9.285751342773438, + "epoch": 0.21603717619141782, + "mean_token_accuracy": 0.6935867071151733, + "num_tokens": 11369103.0, + "step": 2185, + "train/ce_loss": 0.6994189620018005 + }, + { + "epoch": 0.21603717619141782, + "step": 2185, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.21603717619141782, + "step": 2185, + "train/total_loss": 0.175410658121109 + }, + { + "entropy": 9.424877166748047, + "epoch": 0.21613604904093336, + "mean_token_accuracy": 0.7536704540252686, + "num_tokens": 11374159.0, + "step": 2186, + "train/ce_loss": 0.9362339973449707 + }, + { + "epoch": 0.21613604904093336, + "step": 2186, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.21613604904093336, + "step": 2186, + "train/total_loss": 0.19518589973449707 + }, + { + "entropy": 9.593955993652344, + "epoch": 0.21623492189044888, + "mean_token_accuracy": 0.7074927687644958, + "num_tokens": 11379250.0, + "step": 2187, + "train/ce_loss": 2.730981577769853e-06 + }, + { + "epoch": 0.21623492189044888, + "step": 2187, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.21623492189044888, + "step": 2187, + "train/total_loss": 0.023437773808836937 + }, + { + "entropy": 9.340346336364746, + "epoch": 0.21633379473996442, + "mean_token_accuracy": 0.7113665342330933, + "num_tokens": 11384501.0, + "step": 2188, + "train/ce_loss": 1.0747685432434082 + }, + { + "epoch": 0.21633379473996442, + "step": 2188, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.21633379473996442, + "step": 2188, + "train/total_loss": 0.1856018602848053 + }, + { + "entropy": 9.467276573181152, + "epoch": 0.21643266758947993, + "mean_token_accuracy": 0.7388059496879578, + "num_tokens": 11389935.0, + "step": 2189, + "train/ce_loss": 0.872168242931366 + }, + { + "epoch": 0.21643266758947993, + "step": 2189, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.21643266758947993, + "step": 2189, + "train/total_loss": 0.1692480742931366 + }, + { + "entropy": 9.654823303222656, + "epoch": 0.21653154043899545, + "mean_token_accuracy": 0.7686567306518555, + "num_tokens": 11394899.0, + "step": 2190, + "train/ce_loss": 0.628430962562561 + }, + { + "epoch": 0.21653154043899545, + "step": 2190, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.21653154043899545, + "step": 2190, + "train/total_loss": 0.09409309923648834 + }, + { + "entropy": 10.016326904296875, + "epoch": 0.216630413288511, + "mean_token_accuracy": 0.7416666746139526, + "num_tokens": 11399780.0, + "step": 2191, + "train/ce_loss": 2.7722220420837402 + }, + { + "epoch": 0.216630413288511, + "step": 2191, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.216630413288511, + "step": 2191, + "train/total_loss": 0.386597216129303 + }, + { + "entropy": 9.48812484741211, + "epoch": 0.2167292861380265, + "mean_token_accuracy": 0.6918518543243408, + "num_tokens": 11404881.0, + "step": 2192, + "train/ce_loss": 2.0111641883850098 + }, + { + "epoch": 0.2167292861380265, + "step": 2192, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.2167292861380265, + "step": 2192, + "train/total_loss": 0.2870539426803589 + }, + { + "entropy": 8.953411102294922, + "epoch": 0.216828158987542, + "mean_token_accuracy": 0.7527749538421631, + "num_tokens": 11410374.0, + "step": 2193, + "train/ce_loss": 0.8288949728012085 + }, + { + "epoch": 0.216828158987542, + "step": 2193, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.216828158987542, + "step": 2193, + "train/total_loss": 0.14929574728012085 + }, + { + "entropy": 9.059518814086914, + "epoch": 0.21692703183705755, + "mean_token_accuracy": 0.7093712687492371, + "num_tokens": 11415688.0, + "step": 2194, + "train/ce_loss": 0.7937802076339722 + }, + { + "epoch": 0.21692703183705755, + "step": 2194, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.21692703183705755, + "step": 2194, + "train/total_loss": 0.22781552374362946 + }, + { + "entropy": 9.807968139648438, + "epoch": 0.21702590468657307, + "mean_token_accuracy": 0.7543859481811523, + "num_tokens": 11420710.0, + "step": 2195, + "train/ce_loss": 1.0085327625274658 + }, + { + "epoch": 0.21702590468657307, + "step": 2195, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.21702590468657307, + "step": 2195, + "train/total_loss": 0.15163452923297882 + }, + { + "entropy": 9.237098693847656, + "epoch": 0.21712477753608858, + "mean_token_accuracy": 0.7193675637245178, + "num_tokens": 11425935.0, + "step": 2196, + "train/ce_loss": 0.41018494963645935 + }, + { + "epoch": 0.21712477753608858, + "step": 2196, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.21712477753608858, + "step": 2196, + "train/total_loss": 0.11523725092411041 + }, + { + "entropy": 9.309840202331543, + "epoch": 0.21722365038560412, + "mean_token_accuracy": 0.6972891688346863, + "num_tokens": 11431040.0, + "step": 2197, + "train/ce_loss": 1.0857893228530884 + }, + { + "epoch": 0.21722365038560412, + "step": 2197, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.21722365038560412, + "step": 2197, + "train/total_loss": 0.17498518526554108 + }, + { + "entropy": 9.340157508850098, + "epoch": 0.21732252323511964, + "mean_token_accuracy": 0.7953668236732483, + "num_tokens": 11436233.0, + "step": 2198, + "train/ce_loss": 0.7481324672698975 + }, + { + "epoch": 0.21732252323511964, + "step": 2198, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.21732252323511964, + "step": 2198, + "train/total_loss": 0.13340699672698975 + }, + { + "entropy": 9.236837387084961, + "epoch": 0.21742139608463515, + "mean_token_accuracy": 0.7602339386940002, + "num_tokens": 11441538.0, + "step": 2199, + "train/ce_loss": 0.4316747188568115 + }, + { + "epoch": 0.21742139608463515, + "step": 2199, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.21742139608463515, + "step": 2199, + "train/total_loss": 0.12910497188568115 + }, + { + "epoch": 0.2175202689341507, + "grad_norm": 0.8440031409263611, + "learning_rate": 9.458784552242498e-06, + "loss": 0.1549, + "step": 2200 + }, + { + "entropy": 9.14334487915039, + "epoch": 0.2175202689341507, + "mean_token_accuracy": 0.7280248403549194, + "num_tokens": 11446957.0, + "step": 2200, + "train/ce_loss": 0.8459938764572144 + }, + { + "epoch": 0.2175202689341507, + "step": 2200, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.2175202689341507, + "step": 2200, + "train/total_loss": 0.15881814062595367 + }, + { + "entropy": 9.404414176940918, + "epoch": 0.2176191417836662, + "mean_token_accuracy": 0.6830188632011414, + "num_tokens": 11452210.0, + "step": 2201, + "train/ce_loss": 0.4240676462650299 + }, + { + "epoch": 0.2176191417836662, + "step": 2201, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.2176191417836662, + "step": 2201, + "train/total_loss": 0.09709426760673523 + }, + { + "entropy": 8.77522087097168, + "epoch": 0.21771801463318172, + "mean_token_accuracy": 0.751960813999176, + "num_tokens": 11457701.0, + "step": 2202, + "train/ce_loss": 0.7309415936470032 + }, + { + "epoch": 0.21771801463318172, + "step": 2202, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.21771801463318172, + "step": 2202, + "train/total_loss": 0.09653165936470032 + }, + { + "entropy": 9.528435707092285, + "epoch": 0.21781688748269726, + "mean_token_accuracy": 0.7116212248802185, + "num_tokens": 11462797.0, + "step": 2203, + "train/ce_loss": 1.2283047437667847 + }, + { + "epoch": 0.21781688748269726, + "step": 2203, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.21781688748269726, + "step": 2203, + "train/total_loss": 0.21658048033714294 + }, + { + "entropy": 9.46872329711914, + "epoch": 0.21791576033221277, + "mean_token_accuracy": 0.6633416414260864, + "num_tokens": 11468050.0, + "step": 2204, + "train/ce_loss": 0.693771481513977 + }, + { + "epoch": 0.21791576033221277, + "step": 2204, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.21791576033221277, + "step": 2204, + "train/total_loss": 0.09281464666128159 + }, + { + "entropy": 9.334493637084961, + "epoch": 0.21801463318172828, + "mean_token_accuracy": 0.6779448390007019, + "num_tokens": 11473282.0, + "step": 2205, + "train/ce_loss": 0.7677831053733826 + }, + { + "epoch": 0.21801463318172828, + "step": 2205, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.21801463318172828, + "step": 2205, + "train/total_loss": 0.1470908224582672 + }, + { + "entropy": 10.113032341003418, + "epoch": 0.21811350603124383, + "mean_token_accuracy": 0.7927711009979248, + "num_tokens": 11478115.0, + "step": 2206, + "train/ce_loss": 1.3351314919418655e-05 + }, + { + "epoch": 0.21811350603124383, + "step": 2206, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.21811350603124383, + "step": 2206, + "train/total_loss": 0.04297008365392685 + }, + { + "entropy": 8.985885620117188, + "epoch": 0.21821237888075934, + "mean_token_accuracy": 0.8191377520561218, + "num_tokens": 11483524.0, + "step": 2207, + "train/ce_loss": 0.4432702958583832 + }, + { + "epoch": 0.21821237888075934, + "step": 2207, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.21821237888075934, + "step": 2207, + "train/total_loss": 0.0677645355463028 + }, + { + "entropy": 8.988540649414062, + "epoch": 0.21831125173027488, + "mean_token_accuracy": 0.7454954981803894, + "num_tokens": 11488894.0, + "step": 2208, + "train/ce_loss": 0.8499881029129028 + }, + { + "epoch": 0.21831125173027488, + "step": 2208, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.21831125173027488, + "step": 2208, + "train/total_loss": 0.13187381625175476 + }, + { + "entropy": 9.603333473205566, + "epoch": 0.2184101245797904, + "mean_token_accuracy": 0.694868266582489, + "num_tokens": 11494043.0, + "step": 2209, + "train/ce_loss": 0.9686959981918335 + }, + { + "epoch": 0.2184101245797904, + "step": 2209, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.2184101245797904, + "step": 2209, + "train/total_loss": 0.1984321027994156 + }, + { + "entropy": 9.524908065795898, + "epoch": 0.2185089974293059, + "mean_token_accuracy": 0.699999988079071, + "num_tokens": 11499168.0, + "step": 2210, + "train/ce_loss": 0.6224838495254517 + }, + { + "epoch": 0.2185089974293059, + "step": 2210, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.2185089974293059, + "step": 2210, + "train/total_loss": 0.12474838644266129 + }, + { + "entropy": 9.155143737792969, + "epoch": 0.21860787027882145, + "mean_token_accuracy": 0.8020833134651184, + "num_tokens": 11504503.0, + "step": 2211, + "train/ce_loss": 0.6591212749481201 + }, + { + "epoch": 0.21860787027882145, + "step": 2211, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.21860787027882145, + "step": 2211, + "train/total_loss": 0.08934962749481201 + }, + { + "entropy": 9.740113258361816, + "epoch": 0.21870674312833696, + "mean_token_accuracy": 0.7446808218955994, + "num_tokens": 11509505.0, + "step": 2212, + "train/ce_loss": 1.76637864112854 + }, + { + "epoch": 0.21870674312833696, + "step": 2212, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.21870674312833696, + "step": 2212, + "train/total_loss": 0.21960662305355072 + }, + { + "entropy": 10.373140335083008, + "epoch": 0.21880561597785247, + "mean_token_accuracy": 0.7290970087051392, + "num_tokens": 11514212.0, + "step": 2213, + "train/ce_loss": 7.3785072345344815e-06 + }, + { + "epoch": 0.21880561597785247, + "step": 2213, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.21880561597785247, + "step": 2213, + "train/total_loss": 0.058594487607479095 + }, + { + "entropy": 8.925299644470215, + "epoch": 0.21890448882736802, + "mean_token_accuracy": 0.7633745074272156, + "num_tokens": 11519663.0, + "step": 2214, + "train/ce_loss": 0.9736812710762024 + }, + { + "epoch": 0.21890448882736802, + "step": 2214, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.21890448882736802, + "step": 2214, + "train/total_loss": 0.22236812114715576 + }, + { + "entropy": 9.144561767578125, + "epoch": 0.21900336167688353, + "mean_token_accuracy": 0.7680690288543701, + "num_tokens": 11525043.0, + "step": 2215, + "train/ce_loss": 0.9562681317329407 + }, + { + "epoch": 0.21900336167688353, + "step": 2215, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.21900336167688353, + "step": 2215, + "train/total_loss": 0.1620330661535263 + }, + { + "entropy": 9.364012718200684, + "epoch": 0.21910223452639904, + "mean_token_accuracy": 0.6739690899848938, + "num_tokens": 11530351.0, + "step": 2216, + "train/ce_loss": 1.0492823123931885 + }, + { + "epoch": 0.21910223452639904, + "step": 2216, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.21910223452639904, + "step": 2216, + "train/total_loss": 0.18305322527885437 + }, + { + "entropy": 9.427347183227539, + "epoch": 0.21920110737591458, + "mean_token_accuracy": 0.7275280952453613, + "num_tokens": 11535521.0, + "step": 2217, + "train/ce_loss": 4.278482265362982e-06 + }, + { + "epoch": 0.21920110737591458, + "step": 2217, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.21920110737591458, + "step": 2217, + "train/total_loss": 0.07421917468309402 + }, + { + "entropy": 9.088309288024902, + "epoch": 0.2192999802254301, + "mean_token_accuracy": 0.6828012466430664, + "num_tokens": 11540934.0, + "step": 2218, + "train/ce_loss": 1.5599801540374756 + }, + { + "epoch": 0.2192999802254301, + "step": 2218, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.2192999802254301, + "step": 2218, + "train/total_loss": 0.24193552136421204 + }, + { + "entropy": 9.686138153076172, + "epoch": 0.2193988530749456, + "mean_token_accuracy": 0.7630252242088318, + "num_tokens": 11545959.0, + "step": 2219, + "train/ce_loss": 2.031754970550537 + }, + { + "epoch": 0.2193988530749456, + "step": 2219, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.2193988530749456, + "step": 2219, + "train/total_loss": 0.28911298513412476 + }, + { + "epoch": 0.21949772592446115, + "grad_norm": 0.9843518733978271, + "learning_rate": 9.453839687484548e-06, + "loss": 0.1545, + "step": 2220 + }, + { + "entropy": 9.309232711791992, + "epoch": 0.21949772592446115, + "mean_token_accuracy": 0.7461629509925842, + "num_tokens": 11551312.0, + "step": 2220, + "train/ce_loss": 1.0132420063018799 + }, + { + "epoch": 0.21949772592446115, + "step": 2220, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.21949772592446115, + "step": 2220, + "train/total_loss": 0.163824200630188 + }, + { + "entropy": 9.845184326171875, + "epoch": 0.21959659877397666, + "mean_token_accuracy": 0.6597077250480652, + "num_tokens": 11556192.0, + "step": 2221, + "train/ce_loss": 1.2175583839416504 + }, + { + "epoch": 0.21959659877397666, + "step": 2221, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.21959659877397666, + "step": 2221, + "train/total_loss": 0.19206833839416504 + }, + { + "entropy": 9.62173080444336, + "epoch": 0.21969547162349218, + "mean_token_accuracy": 0.7169811129570007, + "num_tokens": 11561289.0, + "step": 2222, + "train/ce_loss": 1.0519672632217407 + }, + { + "epoch": 0.21969547162349218, + "step": 2222, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.21969547162349218, + "step": 2222, + "train/total_loss": 0.1872279793024063 + }, + { + "entropy": 9.193510055541992, + "epoch": 0.21979434447300772, + "mean_token_accuracy": 0.7065337896347046, + "num_tokens": 11566685.0, + "step": 2223, + "train/ce_loss": 1.1219364404678345 + }, + { + "epoch": 0.21979434447300772, + "step": 2223, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.21979434447300772, + "step": 2223, + "train/total_loss": 0.20594364404678345 + }, + { + "entropy": 9.803560256958008, + "epoch": 0.21989321732252323, + "mean_token_accuracy": 0.7218543291091919, + "num_tokens": 11571724.0, + "step": 2224, + "train/ce_loss": 1.250420331954956 + }, + { + "epoch": 0.21989321732252323, + "step": 2224, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.21989321732252323, + "step": 2224, + "train/total_loss": 0.19535453617572784 + }, + { + "entropy": 9.618412971496582, + "epoch": 0.21999209017203875, + "mean_token_accuracy": 0.7300613522529602, + "num_tokens": 11576830.0, + "step": 2225, + "train/ce_loss": 1.8763059415505268e-05 + }, + { + "epoch": 0.21999209017203875, + "step": 2225, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.21999209017203875, + "step": 2225, + "train/total_loss": 0.08984562754631042 + }, + { + "entropy": 9.272396087646484, + "epoch": 0.2200909630215543, + "mean_token_accuracy": 0.6992574334144592, + "num_tokens": 11582136.0, + "step": 2226, + "train/ce_loss": 0.5853594541549683 + }, + { + "epoch": 0.2200909630215543, + "step": 2226, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.2200909630215543, + "step": 2226, + "train/total_loss": 0.12884844839572906 + }, + { + "entropy": 9.071741104125977, + "epoch": 0.2201898358710698, + "mean_token_accuracy": 0.778969943523407, + "num_tokens": 11587575.0, + "step": 2227, + "train/ce_loss": 0.5541576147079468 + }, + { + "epoch": 0.2201898358710698, + "step": 2227, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.2201898358710698, + "step": 2227, + "train/total_loss": 0.08666576445102692 + }, + { + "entropy": 9.476241111755371, + "epoch": 0.22028870872058534, + "mean_token_accuracy": 0.70071941614151, + "num_tokens": 11592776.0, + "step": 2228, + "train/ce_loss": 0.9042754173278809 + }, + { + "epoch": 0.22028870872058534, + "step": 2228, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.22028870872058534, + "step": 2228, + "train/total_loss": 0.16074004769325256 + }, + { + "entropy": 9.842856407165527, + "epoch": 0.22038758157010085, + "mean_token_accuracy": 0.7243697643280029, + "num_tokens": 11597818.0, + "step": 2229, + "train/ce_loss": 4.030011496070074e-06 + }, + { + "epoch": 0.22038758157010085, + "step": 2229, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.22038758157010085, + "step": 2229, + "train/total_loss": 0.06250040233135223 + }, + { + "entropy": 9.958674430847168, + "epoch": 0.22048645441961637, + "mean_token_accuracy": 0.7037037014961243, + "num_tokens": 11602640.0, + "step": 2230, + "train/ce_loss": 6.45430336589925e-05 + }, + { + "epoch": 0.22048645441961637, + "step": 2230, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.22048645441961637, + "step": 2230, + "train/total_loss": 0.042975205928087234 + }, + { + "entropy": 8.876422882080078, + "epoch": 0.2205853272691319, + "mean_token_accuracy": 0.7611111402511597, + "num_tokens": 11608034.0, + "step": 2231, + "train/ce_loss": 0.5030422806739807 + }, + { + "epoch": 0.2205853272691319, + "step": 2231, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.2205853272691319, + "step": 2231, + "train/total_loss": 0.07764798402786255 + }, + { + "entropy": 8.97739028930664, + "epoch": 0.22068420011864742, + "mean_token_accuracy": 0.7794561982154846, + "num_tokens": 11613469.0, + "step": 2232, + "train/ce_loss": 0.6213583946228027 + }, + { + "epoch": 0.22068420011864742, + "step": 2232, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.22068420011864742, + "step": 2232, + "train/total_loss": 0.08947959542274475 + }, + { + "entropy": 9.215559005737305, + "epoch": 0.22078307296816294, + "mean_token_accuracy": 0.7322834730148315, + "num_tokens": 11618844.0, + "step": 2233, + "train/ce_loss": 0.7574310898780823 + }, + { + "epoch": 0.22078307296816294, + "step": 2233, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.22078307296816294, + "step": 2233, + "train/total_loss": 0.14996185898780823 + }, + { + "entropy": 9.45633316040039, + "epoch": 0.22088194581767848, + "mean_token_accuracy": 0.7794729471206665, + "num_tokens": 11624024.0, + "step": 2234, + "train/ce_loss": 0.6323754787445068 + }, + { + "epoch": 0.22088194581767848, + "step": 2234, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.22088194581767848, + "step": 2234, + "train/total_loss": 0.09058129787445068 + }, + { + "entropy": 9.645082473754883, + "epoch": 0.220980818667194, + "mean_token_accuracy": 0.7199312448501587, + "num_tokens": 11629020.0, + "step": 2235, + "train/ce_loss": 0.8218998908996582 + }, + { + "epoch": 0.220980818667194, + "step": 2235, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.220980818667194, + "step": 2235, + "train/total_loss": 0.17593999207019806 + }, + { + "entropy": 9.402721405029297, + "epoch": 0.2210796915167095, + "mean_token_accuracy": 0.6622516512870789, + "num_tokens": 11634182.0, + "step": 2236, + "train/ce_loss": 0.9104689359664917 + }, + { + "epoch": 0.2210796915167095, + "step": 2236, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.2210796915167095, + "step": 2236, + "train/total_loss": 0.18089064955711365 + }, + { + "entropy": 9.542716979980469, + "epoch": 0.22117856436622504, + "mean_token_accuracy": 0.7368420958518982, + "num_tokens": 11639348.0, + "step": 2237, + "train/ce_loss": 0.8644982576370239 + }, + { + "epoch": 0.22117856436622504, + "step": 2237, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.22117856436622504, + "step": 2237, + "train/total_loss": 0.12160607427358627 + }, + { + "entropy": 9.052597045898438, + "epoch": 0.22127743721574056, + "mean_token_accuracy": 0.7108141183853149, + "num_tokens": 11644629.0, + "step": 2238, + "train/ce_loss": 0.8265287280082703 + }, + { + "epoch": 0.22127743721574056, + "step": 2238, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.22127743721574056, + "step": 2238, + "train/total_loss": 0.14905911684036255 + }, + { + "entropy": 8.985305786132812, + "epoch": 0.22137631006525607, + "mean_token_accuracy": 0.7151966094970703, + "num_tokens": 11650015.0, + "step": 2239, + "train/ce_loss": 0.9002116918563843 + }, + { + "epoch": 0.22137631006525607, + "step": 2239, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.22137631006525607, + "step": 2239, + "train/total_loss": 0.16814616322517395 + }, + { + "epoch": 0.2214751829147716, + "grad_norm": 0.9024277925491333, + "learning_rate": 9.4488948227266e-06, + "loss": 0.1592, + "step": 2240 + }, + { + "entropy": 9.394206047058105, + "epoch": 0.2214751829147716, + "mean_token_accuracy": 0.6993548274040222, + "num_tokens": 11655258.0, + "step": 2240, + "train/ce_loss": 0.9053270816802979 + }, + { + "epoch": 0.2214751829147716, + "step": 2240, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.2214751829147716, + "step": 2240, + "train/total_loss": 0.18818897008895874 + }, + { + "entropy": 9.000853538513184, + "epoch": 0.22157405576428713, + "mean_token_accuracy": 0.7426556944847107, + "num_tokens": 11660591.0, + "step": 2241, + "train/ce_loss": 0.8556745648384094 + }, + { + "epoch": 0.22157405576428713, + "step": 2241, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.22157405576428713, + "step": 2241, + "train/total_loss": 0.18712995946407318 + }, + { + "entropy": 8.76073169708252, + "epoch": 0.22167292861380264, + "mean_token_accuracy": 0.6751313209533691, + "num_tokens": 11666227.0, + "step": 2242, + "train/ce_loss": 0.46014127135276794 + }, + { + "epoch": 0.22167292861380264, + "step": 2242, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.22167292861380264, + "step": 2242, + "train/total_loss": 0.07726413011550903 + }, + { + "entropy": 9.666487693786621, + "epoch": 0.22177180146331818, + "mean_token_accuracy": 0.7293233275413513, + "num_tokens": 11671206.0, + "step": 2243, + "train/ce_loss": 1.7687726020812988 + }, + { + "epoch": 0.22177180146331818, + "step": 2243, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.22177180146331818, + "step": 2243, + "train/total_loss": 0.24328351020812988 + }, + { + "entropy": 9.189653396606445, + "epoch": 0.2218706743128337, + "mean_token_accuracy": 0.752077579498291, + "num_tokens": 11676373.0, + "step": 2244, + "train/ce_loss": 0.7033227682113647 + }, + { + "epoch": 0.2218706743128337, + "step": 2244, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.2218706743128337, + "step": 2244, + "train/total_loss": 0.12892603874206543 + }, + { + "entropy": 9.874273300170898, + "epoch": 0.2219695471623492, + "mean_token_accuracy": 0.8305785059928894, + "num_tokens": 11681287.0, + "step": 2245, + "train/ce_loss": 0.964332640171051 + }, + { + "epoch": 0.2219695471623492, + "step": 2245, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.2219695471623492, + "step": 2245, + "train/total_loss": 0.17455826699733734 + }, + { + "entropy": 9.143362045288086, + "epoch": 0.22206842001186475, + "mean_token_accuracy": 0.7358943819999695, + "num_tokens": 11686593.0, + "step": 2246, + "train/ce_loss": 0.8456717729568481 + }, + { + "epoch": 0.22206842001186475, + "step": 2246, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.22206842001186475, + "step": 2246, + "train/total_loss": 0.13144218921661377 + }, + { + "entropy": 8.868471145629883, + "epoch": 0.22216729286138026, + "mean_token_accuracy": 0.7589852213859558, + "num_tokens": 11691989.0, + "step": 2247, + "train/ce_loss": 0.5861417651176453 + }, + { + "epoch": 0.22216729286138026, + "step": 2247, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.22216729286138026, + "step": 2247, + "train/total_loss": 0.08986417949199677 + }, + { + "entropy": 9.47134017944336, + "epoch": 0.22226616571089577, + "mean_token_accuracy": 0.7125172019004822, + "num_tokens": 11697161.0, + "step": 2248, + "train/ce_loss": 0.8098688721656799 + }, + { + "epoch": 0.22226616571089577, + "step": 2248, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.22226616571089577, + "step": 2248, + "train/total_loss": 0.170830637216568 + }, + { + "entropy": 9.266434669494629, + "epoch": 0.22236503856041132, + "mean_token_accuracy": 0.7151898741722107, + "num_tokens": 11702424.0, + "step": 2249, + "train/ce_loss": 0.3487287163734436 + }, + { + "epoch": 0.22236503856041132, + "step": 2249, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.22236503856041132, + "step": 2249, + "train/total_loss": 0.0856541246175766 + }, + { + "entropy": 8.874809265136719, + "epoch": 0.22246391140992683, + "mean_token_accuracy": 0.8147773146629333, + "num_tokens": 11707903.0, + "step": 2250, + "train/ce_loss": 1.0702500343322754 + }, + { + "epoch": 0.22246391140992683, + "step": 2250, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.22246391140992683, + "step": 2250, + "train/total_loss": 0.20077499747276306 + }, + { + "entropy": 9.210561752319336, + "epoch": 0.22256278425944237, + "mean_token_accuracy": 0.718826413154602, + "num_tokens": 11713170.0, + "step": 2251, + "train/ce_loss": 1.1450979709625244 + }, + { + "epoch": 0.22256278425944237, + "step": 2251, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.22256278425944237, + "step": 2251, + "train/total_loss": 0.14966604113578796 + }, + { + "entropy": 9.203631401062012, + "epoch": 0.22266165710895788, + "mean_token_accuracy": 0.7115384340286255, + "num_tokens": 11718495.0, + "step": 2252, + "train/ce_loss": 0.9820226430892944 + }, + { + "epoch": 0.22266165710895788, + "step": 2252, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.22266165710895788, + "step": 2252, + "train/total_loss": 0.16460850834846497 + }, + { + "entropy": 9.035469055175781, + "epoch": 0.2227605299584734, + "mean_token_accuracy": 0.6905737519264221, + "num_tokens": 11723907.0, + "step": 2253, + "train/ce_loss": 0.7407964468002319 + }, + { + "epoch": 0.2227605299584734, + "step": 2253, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.2227605299584734, + "step": 2253, + "train/total_loss": 0.15220464766025543 + }, + { + "entropy": 9.292431831359863, + "epoch": 0.22285940280798894, + "mean_token_accuracy": 0.7409090995788574, + "num_tokens": 11729054.0, + "step": 2254, + "train/ce_loss": 6.4857699726417195e-06 + }, + { + "epoch": 0.22285940280798894, + "step": 2254, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.22285940280798894, + "step": 2254, + "train/total_loss": 0.03906314820051193 + }, + { + "entropy": 9.472380638122559, + "epoch": 0.22295827565750445, + "mean_token_accuracy": 0.7387140989303589, + "num_tokens": 11734131.0, + "step": 2255, + "train/ce_loss": 1.2210959196090698 + }, + { + "epoch": 0.22295827565750445, + "step": 2255, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.22295827565750445, + "step": 2255, + "train/total_loss": 0.19242209196090698 + }, + { + "entropy": 8.898763656616211, + "epoch": 0.22305714850701996, + "mean_token_accuracy": 0.6935166716575623, + "num_tokens": 11739640.0, + "step": 2256, + "train/ce_loss": 0.6935907006263733 + }, + { + "epoch": 0.22305714850701996, + "step": 2256, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.22305714850701996, + "step": 2256, + "train/total_loss": 0.12404657155275345 + }, + { + "entropy": 9.321701049804688, + "epoch": 0.2231560213565355, + "mean_token_accuracy": 0.7052631378173828, + "num_tokens": 11744824.0, + "step": 2257, + "train/ce_loss": 0.7226592898368835 + }, + { + "epoch": 0.2231560213565355, + "step": 2257, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.2231560213565355, + "step": 2257, + "train/total_loss": 0.12695342302322388 + }, + { + "entropy": 9.250961303710938, + "epoch": 0.22325489420605102, + "mean_token_accuracy": 0.7439724206924438, + "num_tokens": 11750153.0, + "step": 2258, + "train/ce_loss": 1.0045374631881714 + }, + { + "epoch": 0.22325489420605102, + "step": 2258, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.22325489420605102, + "step": 2258, + "train/total_loss": 0.17076624929904938 + }, + { + "entropy": 9.526915550231934, + "epoch": 0.22335376705556653, + "mean_token_accuracy": 0.7809917330741882, + "num_tokens": 11755325.0, + "step": 2259, + "train/ce_loss": 1.122484564781189 + }, + { + "epoch": 0.22335376705556653, + "step": 2259, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.22335376705556653, + "step": 2259, + "train/total_loss": 0.20599845051765442 + }, + { + "epoch": 0.22345263990508207, + "grad_norm": 0.7781448364257812, + "learning_rate": 9.44394995796865e-06, + "loss": 0.1563, + "step": 2260 + }, + { + "entropy": 9.540910720825195, + "epoch": 0.22345263990508207, + "mean_token_accuracy": 0.6835616230964661, + "num_tokens": 11760471.0, + "step": 2260, + "train/ce_loss": 1.0501680374145508 + }, + { + "epoch": 0.22345263990508207, + "step": 2260, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.22345263990508207, + "step": 2260, + "train/total_loss": 0.1948605477809906 + }, + { + "entropy": 9.239645004272461, + "epoch": 0.2235515127545976, + "mean_token_accuracy": 0.725261926651001, + "num_tokens": 11765819.0, + "step": 2261, + "train/ce_loss": 1.27482271194458 + }, + { + "epoch": 0.2235515127545976, + "step": 2261, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.2235515127545976, + "step": 2261, + "train/total_loss": 0.24466978013515472 + }, + { + "entropy": 8.915700912475586, + "epoch": 0.2236503856041131, + "mean_token_accuracy": 0.7682570815086365, + "num_tokens": 11771331.0, + "step": 2262, + "train/ce_loss": 0.6882832646369934 + }, + { + "epoch": 0.2236503856041131, + "step": 2262, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.2236503856041131, + "step": 2262, + "train/total_loss": 0.10398457944393158 + }, + { + "entropy": 9.431346893310547, + "epoch": 0.22374925845362864, + "mean_token_accuracy": 0.7694235444068909, + "num_tokens": 11776578.0, + "step": 2263, + "train/ce_loss": 5.010083441447932e-06 + }, + { + "epoch": 0.22374925845362864, + "step": 2263, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.22374925845362864, + "step": 2263, + "train/total_loss": 0.031250499188899994 + }, + { + "entropy": 9.091398239135742, + "epoch": 0.22384813130314415, + "mean_token_accuracy": 0.7158351540565491, + "num_tokens": 11781960.0, + "step": 2264, + "train/ce_loss": 0.6672316193580627 + }, + { + "epoch": 0.22384813130314415, + "step": 2264, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.22384813130314415, + "step": 2264, + "train/total_loss": 0.13312941789627075 + }, + { + "entropy": 9.166069984436035, + "epoch": 0.22394700415265967, + "mean_token_accuracy": 0.7060241103172302, + "num_tokens": 11787250.0, + "step": 2265, + "train/ce_loss": 1.1839373111724854 + }, + { + "epoch": 0.22394700415265967, + "step": 2265, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.22394700415265967, + "step": 2265, + "train/total_loss": 0.16526873409748077 + }, + { + "entropy": 9.317614555358887, + "epoch": 0.2240458770021752, + "mean_token_accuracy": 0.7234042286872864, + "num_tokens": 11792515.0, + "step": 2266, + "train/ce_loss": 1.562074065208435 + }, + { + "epoch": 0.2240458770021752, + "step": 2266, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.2240458770021752, + "step": 2266, + "train/total_loss": 0.24605116248130798 + }, + { + "entropy": 9.677188873291016, + "epoch": 0.22414474985169072, + "mean_token_accuracy": 0.704049825668335, + "num_tokens": 11797539.0, + "step": 2267, + "train/ce_loss": 4.742025339510292e-06 + }, + { + "epoch": 0.22414474985169072, + "step": 2267, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.22414474985169072, + "step": 2267, + "train/total_loss": 0.050781723111867905 + }, + { + "entropy": 9.175324440002441, + "epoch": 0.22424362270120624, + "mean_token_accuracy": 0.7828004360198975, + "num_tokens": 11802872.0, + "step": 2268, + "train/ce_loss": 0.6030412912368774 + }, + { + "epoch": 0.22424362270120624, + "step": 2268, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.22424362270120624, + "step": 2268, + "train/total_loss": 0.08374163508415222 + }, + { + "entropy": 9.462089538574219, + "epoch": 0.22434249555072178, + "mean_token_accuracy": 0.744516134262085, + "num_tokens": 11808065.0, + "step": 2269, + "train/ce_loss": 0.4913155138492584 + }, + { + "epoch": 0.22434249555072178, + "step": 2269, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.22434249555072178, + "step": 2269, + "train/total_loss": 0.12725655734539032 + }, + { + "entropy": 10.407243728637695, + "epoch": 0.2244413684002373, + "mean_token_accuracy": 0.7772276997566223, + "num_tokens": 11812659.0, + "step": 2270, + "train/ce_loss": 2.864804628188722e-05 + }, + { + "epoch": 0.2244413684002373, + "step": 2270, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.2244413684002373, + "step": 2270, + "train/total_loss": 0.05469036474823952 + }, + { + "entropy": 9.114701271057129, + "epoch": 0.22454024124975283, + "mean_token_accuracy": 0.7374461889266968, + "num_tokens": 11817856.0, + "step": 2271, + "train/ce_loss": 0.9672939777374268 + }, + { + "epoch": 0.22454024124975283, + "step": 2271, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.22454024124975283, + "step": 2271, + "train/total_loss": 0.15141689777374268 + }, + { + "entropy": 9.434842109680176, + "epoch": 0.22463911409926834, + "mean_token_accuracy": 0.7405914068222046, + "num_tokens": 11823046.0, + "step": 2272, + "train/ce_loss": 0.6265137791633606 + }, + { + "epoch": 0.22463911409926834, + "step": 2272, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.22463911409926834, + "step": 2272, + "train/total_loss": 0.0782763808965683 + }, + { + "entropy": 9.087112426757812, + "epoch": 0.22473798694878386, + "mean_token_accuracy": 0.7478684782981873, + "num_tokens": 11828361.0, + "step": 2273, + "train/ce_loss": 1.1432479619979858 + }, + { + "epoch": 0.22473798694878386, + "step": 2273, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.22473798694878386, + "step": 2273, + "train/total_loss": 0.15729355812072754 + }, + { + "entropy": 9.735007286071777, + "epoch": 0.2248368597982994, + "mean_token_accuracy": 0.7039473652839661, + "num_tokens": 11833438.0, + "step": 2274, + "train/ce_loss": 0.6995331645011902 + }, + { + "epoch": 0.2248368597982994, + "step": 2274, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.2248368597982994, + "step": 2274, + "train/total_loss": 0.1324533224105835 + }, + { + "entropy": 9.384618759155273, + "epoch": 0.2249357326478149, + "mean_token_accuracy": 0.7997347712516785, + "num_tokens": 11838646.0, + "step": 2275, + "train/ce_loss": 0.5165915489196777 + }, + { + "epoch": 0.2249357326478149, + "step": 2275, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.2249357326478149, + "step": 2275, + "train/total_loss": 0.08290915191173553 + }, + { + "entropy": 8.890913963317871, + "epoch": 0.22503460549733043, + "mean_token_accuracy": 0.7223942279815674, + "num_tokens": 11844105.0, + "step": 2276, + "train/ce_loss": 1.4923746585845947 + }, + { + "epoch": 0.22503460549733043, + "step": 2276, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.22503460549733043, + "step": 2276, + "train/total_loss": 0.2742374539375305 + }, + { + "entropy": 9.439136505126953, + "epoch": 0.22513347834684597, + "mean_token_accuracy": 0.7201645970344543, + "num_tokens": 11849298.0, + "step": 2277, + "train/ce_loss": 1.2139506340026855 + }, + { + "epoch": 0.22513347834684597, + "step": 2277, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.22513347834684597, + "step": 2277, + "train/total_loss": 0.1917075663805008 + }, + { + "entropy": 9.488235473632812, + "epoch": 0.22523235119636148, + "mean_token_accuracy": 0.7834224700927734, + "num_tokens": 11854542.0, + "step": 2278, + "train/ce_loss": 1.1686978340148926 + }, + { + "epoch": 0.22523235119636148, + "step": 2278, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.22523235119636148, + "step": 2278, + "train/total_loss": 0.21452602744102478 + }, + { + "entropy": 9.137245178222656, + "epoch": 0.225331224045877, + "mean_token_accuracy": 0.7211764454841614, + "num_tokens": 11859830.0, + "step": 2279, + "train/ce_loss": 1.1765626668930054 + }, + { + "epoch": 0.225331224045877, + "step": 2279, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.225331224045877, + "step": 2279, + "train/total_loss": 0.19968751072883606 + }, + { + "epoch": 0.22543009689539253, + "grad_norm": 0.8348625898361206, + "learning_rate": 9.439005093210701e-06, + "loss": 0.1487, + "step": 2280 + }, + { + "entropy": 9.51508617401123, + "epoch": 0.22543009689539253, + "mean_token_accuracy": 0.7322946190834045, + "num_tokens": 11864989.0, + "step": 2280, + "train/ce_loss": 0.5634602308273315 + }, + { + "epoch": 0.22543009689539253, + "step": 2280, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.22543009689539253, + "step": 2280, + "train/total_loss": 0.09540852904319763 + }, + { + "entropy": 8.554391860961914, + "epoch": 0.22552896974490805, + "mean_token_accuracy": 0.748110830783844, + "num_tokens": 11870654.0, + "step": 2281, + "train/ce_loss": 0.9140045642852783 + }, + { + "epoch": 0.22552896974490805, + "step": 2281, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.22552896974490805, + "step": 2281, + "train/total_loss": 0.21640045940876007 + }, + { + "entropy": 9.242164611816406, + "epoch": 0.22562784259442356, + "mean_token_accuracy": 0.7529411911964417, + "num_tokens": 11875902.0, + "step": 2282, + "train/ce_loss": 0.6295803785324097 + }, + { + "epoch": 0.22562784259442356, + "step": 2282, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.22562784259442356, + "step": 2282, + "train/total_loss": 0.10202053934335709 + }, + { + "entropy": 9.475540161132812, + "epoch": 0.2257267154439391, + "mean_token_accuracy": 0.7105942964553833, + "num_tokens": 11881137.0, + "step": 2283, + "train/ce_loss": 0.8041722774505615 + }, + { + "epoch": 0.2257267154439391, + "step": 2283, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.2257267154439391, + "step": 2283, + "train/total_loss": 0.11166723072528839 + }, + { + "entropy": 9.342806816101074, + "epoch": 0.22582558829345462, + "mean_token_accuracy": 0.6896985173225403, + "num_tokens": 11886401.0, + "step": 2284, + "train/ce_loss": 3.392033931959304e-06 + }, + { + "epoch": 0.22582558829345462, + "step": 2284, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.22582558829345462, + "step": 2284, + "train/total_loss": 0.03125033900141716 + }, + { + "entropy": 10.024946212768555, + "epoch": 0.22592446114297013, + "mean_token_accuracy": 0.7042889595031738, + "num_tokens": 11891316.0, + "step": 2285, + "train/ce_loss": 1.474079181207344e-05 + }, + { + "epoch": 0.22592446114297013, + "step": 2285, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.22592446114297013, + "step": 2285, + "train/total_loss": 0.02734522335231304 + }, + { + "entropy": 9.042903900146484, + "epoch": 0.22602333399248567, + "mean_token_accuracy": 0.686087965965271, + "num_tokens": 11896656.0, + "step": 2286, + "train/ce_loss": 1.1429626941680908 + }, + { + "epoch": 0.22602333399248567, + "step": 2286, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.22602333399248567, + "step": 2286, + "train/total_loss": 0.17679627239704132 + }, + { + "entropy": 9.304134368896484, + "epoch": 0.22612220684200118, + "mean_token_accuracy": 0.6961326003074646, + "num_tokens": 11901859.0, + "step": 2287, + "train/ce_loss": 0.8572551012039185 + }, + { + "epoch": 0.22612220684200118, + "step": 2287, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.22612220684200118, + "step": 2287, + "train/total_loss": 0.15994426608085632 + }, + { + "entropy": 10.196354866027832, + "epoch": 0.2262210796915167, + "mean_token_accuracy": 0.6957831382751465, + "num_tokens": 11906568.0, + "step": 2288, + "train/ce_loss": 2.1421995162963867 + }, + { + "epoch": 0.2262210796915167, + "step": 2288, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.2262210796915167, + "step": 2288, + "train/total_loss": 0.28453245759010315 + }, + { + "entropy": 9.624486923217773, + "epoch": 0.22631995254103224, + "mean_token_accuracy": 0.7454819083213806, + "num_tokens": 11911684.0, + "step": 2289, + "train/ce_loss": 0.869027853012085 + }, + { + "epoch": 0.22631995254103224, + "step": 2289, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.22631995254103224, + "step": 2289, + "train/total_loss": 0.17284029722213745 + }, + { + "entropy": 9.850872039794922, + "epoch": 0.22641882539054775, + "mean_token_accuracy": 0.757785439491272, + "num_tokens": 11916672.0, + "step": 2290, + "train/ce_loss": 0.4710646867752075 + }, + { + "epoch": 0.22641882539054775, + "step": 2290, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.22641882539054775, + "step": 2290, + "train/total_loss": 0.12132522463798523 + }, + { + "entropy": 9.050373077392578, + "epoch": 0.2265176982400633, + "mean_token_accuracy": 0.7090216279029846, + "num_tokens": 11921931.0, + "step": 2291, + "train/ce_loss": 1.124954342842102 + }, + { + "epoch": 0.2265176982400633, + "step": 2291, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.2265176982400633, + "step": 2291, + "train/total_loss": 0.19062043726444244 + }, + { + "entropy": 9.606853485107422, + "epoch": 0.2266165710895788, + "mean_token_accuracy": 0.6097561120986938, + "num_tokens": 11926987.0, + "step": 2292, + "train/ce_loss": 1.8163892030715942 + }, + { + "epoch": 0.2266165710895788, + "step": 2292, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.2266165710895788, + "step": 2292, + "train/total_loss": 0.2519514262676239 + }, + { + "entropy": 9.96500015258789, + "epoch": 0.22671544393909432, + "mean_token_accuracy": 0.7556390762329102, + "num_tokens": 11932069.0, + "step": 2293, + "train/ce_loss": 0.970014750957489 + }, + { + "epoch": 0.22671544393909432, + "step": 2293, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.22671544393909432, + "step": 2293, + "train/total_loss": 0.15950147807598114 + }, + { + "entropy": 8.88833999633789, + "epoch": 0.22681431678860986, + "mean_token_accuracy": 0.7294994592666626, + "num_tokens": 11937461.0, + "step": 2294, + "train/ce_loss": 0.9902238249778748 + }, + { + "epoch": 0.22681431678860986, + "step": 2294, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.22681431678860986, + "step": 2294, + "train/total_loss": 0.19667863845825195 + }, + { + "entropy": 9.839460372924805, + "epoch": 0.22691318963812537, + "mean_token_accuracy": 0.7652329802513123, + "num_tokens": 11942415.0, + "step": 2295, + "train/ce_loss": 1.1105287075042725 + }, + { + "epoch": 0.22691318963812537, + "step": 2295, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.22691318963812537, + "step": 2295, + "train/total_loss": 0.16964662075042725 + }, + { + "entropy": 9.365549087524414, + "epoch": 0.2270120624876409, + "mean_token_accuracy": 0.7042253613471985, + "num_tokens": 11947562.0, + "step": 2296, + "train/ce_loss": 0.7497787475585938 + }, + { + "epoch": 0.2270120624876409, + "step": 2296, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.2270120624876409, + "step": 2296, + "train/total_loss": 0.12966537475585938 + }, + { + "entropy": 9.545838356018066, + "epoch": 0.22711093533715643, + "mean_token_accuracy": 0.7606461048126221, + "num_tokens": 11952663.0, + "step": 2297, + "train/ce_loss": 1.0381242036819458 + }, + { + "epoch": 0.22711093533715643, + "step": 2297, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.22711093533715643, + "step": 2297, + "train/total_loss": 0.16631242632865906 + }, + { + "entropy": 9.693061828613281, + "epoch": 0.22720980818667194, + "mean_token_accuracy": 0.7562189102172852, + "num_tokens": 11957898.0, + "step": 2298, + "train/ce_loss": 6.120082161942264e-06 + }, + { + "epoch": 0.22720980818667194, + "step": 2298, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.22720980818667194, + "step": 2298, + "train/total_loss": 0.05859436094760895 + }, + { + "entropy": 9.140596389770508, + "epoch": 0.22730868103618745, + "mean_token_accuracy": 0.6908893585205078, + "num_tokens": 11963326.0, + "step": 2299, + "train/ce_loss": 1.0507303476333618 + }, + { + "epoch": 0.22730868103618745, + "step": 2299, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.22730868103618745, + "step": 2299, + "train/total_loss": 0.14413553476333618 + }, + { + "epoch": 0.227407553885703, + "grad_norm": 1.2080398797988892, + "learning_rate": 9.434060228452752e-06, + "loss": 0.164, + "step": 2300 + }, + { + "entropy": 9.693990707397461, + "epoch": 0.227407553885703, + "mean_token_accuracy": 0.7839999794960022, + "num_tokens": 11968449.0, + "step": 2300, + "train/ce_loss": 0.5579846501350403 + }, + { + "epoch": 0.227407553885703, + "step": 2300, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.227407553885703, + "step": 2300, + "train/total_loss": 0.0792359709739685 + }, + { + "entropy": 10.13557243347168, + "epoch": 0.2275064267352185, + "mean_token_accuracy": 0.7394067645072937, + "num_tokens": 11973294.0, + "step": 2301, + "train/ce_loss": 2.46250581741333 + }, + { + "epoch": 0.2275064267352185, + "step": 2301, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.2275064267352185, + "step": 2301, + "train/total_loss": 0.35171931982040405 + }, + { + "entropy": 10.132203102111816, + "epoch": 0.22760529958473402, + "mean_token_accuracy": 0.7748344540596008, + "num_tokens": 11978011.0, + "step": 2302, + "train/ce_loss": 5.034738205722533e-05 + }, + { + "epoch": 0.22760529958473402, + "step": 2302, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.22760529958473402, + "step": 2302, + "train/total_loss": 0.07813003659248352 + }, + { + "entropy": 10.068281173706055, + "epoch": 0.22770417243424956, + "mean_token_accuracy": 0.755294144153595, + "num_tokens": 11982872.0, + "step": 2303, + "train/ce_loss": 1.7586405277252197 + }, + { + "epoch": 0.22770417243424956, + "step": 2303, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.22770417243424956, + "step": 2303, + "train/total_loss": 0.2461765557527542 + }, + { + "entropy": 9.399394989013672, + "epoch": 0.22780304528376508, + "mean_token_accuracy": 0.748633861541748, + "num_tokens": 11988082.0, + "step": 2304, + "train/ce_loss": 1.0429457426071167 + }, + { + "epoch": 0.22780304528376508, + "step": 2304, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.22780304528376508, + "step": 2304, + "train/total_loss": 0.1550758183002472 + }, + { + "entropy": 9.462631225585938, + "epoch": 0.2279019181332806, + "mean_token_accuracy": 0.7350901365280151, + "num_tokens": 11993404.0, + "step": 2305, + "train/ce_loss": 0.5820447206497192 + }, + { + "epoch": 0.2279019181332806, + "step": 2305, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.2279019181332806, + "step": 2305, + "train/total_loss": 0.12461072206497192 + }, + { + "entropy": 9.83642864227295, + "epoch": 0.22800079098279613, + "mean_token_accuracy": 0.7275922894477844, + "num_tokens": 11998424.0, + "step": 2306, + "train/ce_loss": 6.891273187648039e-06 + }, + { + "epoch": 0.22800079098279613, + "step": 2306, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.22800079098279613, + "step": 2306, + "train/total_loss": 0.054688189178705215 + }, + { + "entropy": 9.044208526611328, + "epoch": 0.22809966383231164, + "mean_token_accuracy": 0.7373737096786499, + "num_tokens": 12003871.0, + "step": 2307, + "train/ce_loss": 0.6468204259872437 + }, + { + "epoch": 0.22809966383231164, + "step": 2307, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.22809966383231164, + "step": 2307, + "train/total_loss": 0.08421329408884048 + }, + { + "entropy": 9.22705078125, + "epoch": 0.22819853668182716, + "mean_token_accuracy": 0.7378048896789551, + "num_tokens": 12009355.0, + "step": 2308, + "train/ce_loss": 0.7152095437049866 + }, + { + "epoch": 0.22819853668182716, + "step": 2308, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.22819853668182716, + "step": 2308, + "train/total_loss": 0.19652095437049866 + }, + { + "entropy": 9.639345169067383, + "epoch": 0.2282974095313427, + "mean_token_accuracy": 0.6328927874565125, + "num_tokens": 12014423.0, + "step": 2309, + "train/ce_loss": 2.6829929993255064e-06 + }, + { + "epoch": 0.2282974095313427, + "step": 2309, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.2282974095313427, + "step": 2309, + "train/total_loss": 0.02734401822090149 + }, + { + "entropy": 9.361225128173828, + "epoch": 0.2283962823808582, + "mean_token_accuracy": 0.7243173122406006, + "num_tokens": 12019641.0, + "step": 2310, + "train/ce_loss": 1.1629915237426758 + }, + { + "epoch": 0.2283962823808582, + "step": 2310, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.2283962823808582, + "step": 2310, + "train/total_loss": 0.22176790237426758 + }, + { + "entropy": 9.329537391662598, + "epoch": 0.22849515523037375, + "mean_token_accuracy": 0.7163398861885071, + "num_tokens": 12024849.0, + "step": 2311, + "train/ce_loss": 1.7918075323104858 + }, + { + "epoch": 0.22849515523037375, + "step": 2311, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.22849515523037375, + "step": 2311, + "train/total_loss": 0.25730574131011963 + }, + { + "entropy": 8.798558235168457, + "epoch": 0.22859402807988927, + "mean_token_accuracy": 0.7682177424430847, + "num_tokens": 12030490.0, + "step": 2312, + "train/ce_loss": 0.7416049838066101 + }, + { + "epoch": 0.22859402807988927, + "step": 2312, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.22859402807988927, + "step": 2312, + "train/total_loss": 0.17962925136089325 + }, + { + "entropy": 9.236748695373535, + "epoch": 0.22869290092940478, + "mean_token_accuracy": 0.7161290049552917, + "num_tokens": 12035879.0, + "step": 2313, + "train/ce_loss": 1.196192979812622 + }, + { + "epoch": 0.22869290092940478, + "step": 2313, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.22869290092940478, + "step": 2313, + "train/total_loss": 0.18993180990219116 + }, + { + "entropy": 9.573179244995117, + "epoch": 0.22879177377892032, + "mean_token_accuracy": 0.7083333134651184, + "num_tokens": 12040982.0, + "step": 2314, + "train/ce_loss": 0.769433319568634 + }, + { + "epoch": 0.22879177377892032, + "step": 2314, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.22879177377892032, + "step": 2314, + "train/total_loss": 0.17069333791732788 + }, + { + "entropy": 8.967704772949219, + "epoch": 0.22889064662843583, + "mean_token_accuracy": 0.7914980053901672, + "num_tokens": 12046498.0, + "step": 2315, + "train/ce_loss": 0.49730974435806274 + }, + { + "epoch": 0.22889064662843583, + "step": 2315, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.22889064662843583, + "step": 2315, + "train/total_loss": 0.09269972145557404 + }, + { + "entropy": 9.235469818115234, + "epoch": 0.22898951947795135, + "mean_token_accuracy": 0.7600896954536438, + "num_tokens": 12051874.0, + "step": 2316, + "train/ce_loss": 1.099819302558899 + }, + { + "epoch": 0.22898951947795135, + "step": 2316, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.22898951947795135, + "step": 2316, + "train/total_loss": 0.12951317429542542 + }, + { + "entropy": 9.946176528930664, + "epoch": 0.2290883923274669, + "mean_token_accuracy": 0.69305020570755, + "num_tokens": 12056821.0, + "step": 2317, + "train/ce_loss": 1.7881754636764526 + }, + { + "epoch": 0.2290883923274669, + "step": 2317, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.2290883923274669, + "step": 2317, + "train/total_loss": 0.2803800702095032 + }, + { + "entropy": 9.353560447692871, + "epoch": 0.2291872651769824, + "mean_token_accuracy": 0.7390244007110596, + "num_tokens": 12062127.0, + "step": 2318, + "train/ce_loss": 0.66374272108078 + }, + { + "epoch": 0.2291872651769824, + "step": 2318, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.2291872651769824, + "step": 2318, + "train/total_loss": 0.144499272108078 + }, + { + "entropy": 9.604637145996094, + "epoch": 0.22928613802649792, + "mean_token_accuracy": 0.7388888597488403, + "num_tokens": 12067442.0, + "step": 2319, + "train/ce_loss": 0.6843863725662231 + }, + { + "epoch": 0.22928613802649792, + "step": 2319, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.22928613802649792, + "step": 2319, + "train/total_loss": 0.14265739917755127 + }, + { + "epoch": 0.22938501087601346, + "grad_norm": 1.1014442443847656, + "learning_rate": 9.429115363694804e-06, + "loss": 0.1555, + "step": 2320 + }, + { + "entropy": 8.732933044433594, + "epoch": 0.22938501087601346, + "mean_token_accuracy": 0.729629635810852, + "num_tokens": 12073023.0, + "step": 2320, + "train/ce_loss": 0.61324542760849 + }, + { + "epoch": 0.22938501087601346, + "step": 2320, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.22938501087601346, + "step": 2320, + "train/total_loss": 0.10038704425096512 + }, + { + "entropy": 9.829085350036621, + "epoch": 0.22948388372552897, + "mean_token_accuracy": 0.803636372089386, + "num_tokens": 12077972.0, + "step": 2321, + "train/ce_loss": 0.9266363382339478 + }, + { + "epoch": 0.22948388372552897, + "step": 2321, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.22948388372552897, + "step": 2321, + "train/total_loss": 0.15516364574432373 + }, + { + "entropy": 9.217707633972168, + "epoch": 0.22958275657504448, + "mean_token_accuracy": 0.7194570302963257, + "num_tokens": 12083345.0, + "step": 2322, + "train/ce_loss": 0.6711381673812866 + }, + { + "epoch": 0.22958275657504448, + "step": 2322, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.22958275657504448, + "step": 2322, + "train/total_loss": 0.11789506673812866 + }, + { + "entropy": 9.400322914123535, + "epoch": 0.22968162942456002, + "mean_token_accuracy": 0.7345844507217407, + "num_tokens": 12088564.0, + "step": 2323, + "train/ce_loss": 0.5601037740707397 + }, + { + "epoch": 0.22968162942456002, + "step": 2323, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.22968162942456002, + "step": 2323, + "train/total_loss": 0.09507288038730621 + }, + { + "entropy": 9.304672241210938, + "epoch": 0.22978050227407554, + "mean_token_accuracy": 0.7310087084770203, + "num_tokens": 12093878.0, + "step": 2324, + "train/ce_loss": 0.8067693710327148 + }, + { + "epoch": 0.22978050227407554, + "step": 2324, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.22978050227407554, + "step": 2324, + "train/total_loss": 0.11583318561315536 + }, + { + "entropy": 9.614727020263672, + "epoch": 0.22987937512359105, + "mean_token_accuracy": 0.7204142212867737, + "num_tokens": 12099042.0, + "step": 2325, + "train/ce_loss": 3.208590214853757e-06 + }, + { + "epoch": 0.22987937512359105, + "step": 2325, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.22987937512359105, + "step": 2325, + "train/total_loss": 0.04687532037496567 + }, + { + "entropy": 9.11031723022461, + "epoch": 0.2299782479731066, + "mean_token_accuracy": 0.7690631747245789, + "num_tokens": 12104492.0, + "step": 2326, + "train/ce_loss": 0.7134276032447815 + }, + { + "epoch": 0.2299782479731066, + "step": 2326, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.2299782479731066, + "step": 2326, + "train/total_loss": 0.11431150883436203 + }, + { + "entropy": 9.151698112487793, + "epoch": 0.2300771208226221, + "mean_token_accuracy": 0.732300877571106, + "num_tokens": 12109856.0, + "step": 2327, + "train/ce_loss": 1.6674649715423584 + }, + { + "epoch": 0.2300771208226221, + "step": 2327, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.2300771208226221, + "step": 2327, + "train/total_loss": 0.24487149715423584 + }, + { + "entropy": 9.683753967285156, + "epoch": 0.23017599367213762, + "mean_token_accuracy": 0.7667785286903381, + "num_tokens": 12114920.0, + "step": 2328, + "train/ce_loss": 1.7838212251663208 + }, + { + "epoch": 0.23017599367213762, + "step": 2328, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.23017599367213762, + "step": 2328, + "train/total_loss": 0.29556962847709656 + }, + { + "entropy": 9.871530532836914, + "epoch": 0.23027486652165316, + "mean_token_accuracy": 0.7833333611488342, + "num_tokens": 12119981.0, + "step": 2329, + "train/ce_loss": 0.41187918186187744 + }, + { + "epoch": 0.23027486652165316, + "step": 2329, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.23027486652165316, + "step": 2329, + "train/total_loss": 0.11540666967630386 + }, + { + "entropy": 9.21509838104248, + "epoch": 0.23037373937116867, + "mean_token_accuracy": 0.7616875767707825, + "num_tokens": 12125302.0, + "step": 2330, + "train/ce_loss": 1.1086196899414062 + }, + { + "epoch": 0.23037373937116867, + "step": 2330, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.23037373937116867, + "step": 2330, + "train/total_loss": 0.16554947197437286 + }, + { + "entropy": 9.648240089416504, + "epoch": 0.2304726122206842, + "mean_token_accuracy": 0.789207398891449, + "num_tokens": 12130365.0, + "step": 2331, + "train/ce_loss": 0.7285244464874268 + }, + { + "epoch": 0.2304726122206842, + "step": 2331, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.2304726122206842, + "step": 2331, + "train/total_loss": 0.13535244762897491 + }, + { + "entropy": 9.431110382080078, + "epoch": 0.23057148507019973, + "mean_token_accuracy": 0.7144790291786194, + "num_tokens": 12135585.0, + "step": 2332, + "train/ce_loss": 1.6808106899261475 + }, + { + "epoch": 0.23057148507019973, + "step": 2332, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.23057148507019973, + "step": 2332, + "train/total_loss": 0.2774560749530792 + }, + { + "entropy": 9.513031959533691, + "epoch": 0.23067035791971524, + "mean_token_accuracy": 0.7231788039207458, + "num_tokens": 12140781.0, + "step": 2333, + "train/ce_loss": 1.0438014268875122 + }, + { + "epoch": 0.23067035791971524, + "step": 2333, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.23067035791971524, + "step": 2333, + "train/total_loss": 0.17078639566898346 + }, + { + "entropy": 9.623817443847656, + "epoch": 0.23076923076923078, + "mean_token_accuracy": 0.7717041969299316, + "num_tokens": 12145849.0, + "step": 2334, + "train/ce_loss": 0.7471662163734436 + }, + { + "epoch": 0.23076923076923078, + "step": 2334, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.23076923076923078, + "step": 2334, + "train/total_loss": 0.09424787014722824 + }, + { + "entropy": 9.290124893188477, + "epoch": 0.2308681036187463, + "mean_token_accuracy": 0.6814371347427368, + "num_tokens": 12151116.0, + "step": 2335, + "train/ce_loss": 1.317223072052002 + }, + { + "epoch": 0.2308681036187463, + "step": 2335, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.2308681036187463, + "step": 2335, + "train/total_loss": 0.2215660661458969 + }, + { + "entropy": 9.283143043518066, + "epoch": 0.2309669764682618, + "mean_token_accuracy": 0.7582159638404846, + "num_tokens": 12156438.0, + "step": 2336, + "train/ce_loss": 0.47142666578292847 + }, + { + "epoch": 0.2309669764682618, + "step": 2336, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.2309669764682618, + "step": 2336, + "train/total_loss": 0.07448641955852509 + }, + { + "entropy": 8.935561180114746, + "epoch": 0.23106584931777735, + "mean_token_accuracy": 0.7375133037567139, + "num_tokens": 12161874.0, + "step": 2337, + "train/ce_loss": 0.9972467422485352 + }, + { + "epoch": 0.23106584931777735, + "step": 2337, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.23106584931777735, + "step": 2337, + "train/total_loss": 0.185662180185318 + }, + { + "entropy": 9.580480575561523, + "epoch": 0.23116472216729286, + "mean_token_accuracy": 0.7230538725852966, + "num_tokens": 12167025.0, + "step": 2338, + "train/ce_loss": 1.1938048601150513 + }, + { + "epoch": 0.23116472216729286, + "step": 2338, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.23116472216729286, + "step": 2338, + "train/total_loss": 0.20141173899173737 + }, + { + "entropy": 9.525504112243652, + "epoch": 0.23126359501680838, + "mean_token_accuracy": 0.7517531514167786, + "num_tokens": 12172214.0, + "step": 2339, + "train/ce_loss": 0.7448764443397522 + }, + { + "epoch": 0.23126359501680838, + "step": 2339, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.23126359501680838, + "step": 2339, + "train/total_loss": 0.10573764890432358 + }, + { + "epoch": 0.23136246786632392, + "grad_norm": 0.8302991986274719, + "learning_rate": 9.424170498936855e-06, + "loss": 0.1451, + "step": 2340 + }, + { + "entropy": 8.966384887695312, + "epoch": 0.23136246786632392, + "mean_token_accuracy": 0.6930232644081116, + "num_tokens": 12177567.0, + "step": 2340, + "train/ce_loss": 0.9528385400772095 + }, + { + "epoch": 0.23136246786632392, + "step": 2340, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.23136246786632392, + "step": 2340, + "train/total_loss": 0.1851276159286499 + }, + { + "entropy": 8.769596099853516, + "epoch": 0.23146134071583943, + "mean_token_accuracy": 0.8007380366325378, + "num_tokens": 12183128.0, + "step": 2341, + "train/ce_loss": 0.5972678065299988 + }, + { + "epoch": 0.23146134071583943, + "step": 2341, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.23146134071583943, + "step": 2341, + "train/total_loss": 0.1769142746925354 + }, + { + "entropy": 8.95469856262207, + "epoch": 0.23156021356535494, + "mean_token_accuracy": 0.7057521939277649, + "num_tokens": 12188499.0, + "step": 2342, + "train/ce_loss": 0.523081362247467 + }, + { + "epoch": 0.23156021356535494, + "step": 2342, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.23156021356535494, + "step": 2342, + "train/total_loss": 0.11090189218521118 + }, + { + "entropy": 9.6317138671875, + "epoch": 0.23165908641487049, + "mean_token_accuracy": 0.7047451734542847, + "num_tokens": 12193481.0, + "step": 2343, + "train/ce_loss": 1.2870943546295166 + }, + { + "epoch": 0.23165908641487049, + "step": 2343, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.23165908641487049, + "step": 2343, + "train/total_loss": 0.19511568546295166 + }, + { + "entropy": 9.163106918334961, + "epoch": 0.231757959264386, + "mean_token_accuracy": 0.7786343693733215, + "num_tokens": 12198817.0, + "step": 2344, + "train/ce_loss": 1.093612790107727 + }, + { + "epoch": 0.231757959264386, + "step": 2344, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.231757959264386, + "step": 2344, + "train/total_loss": 0.14451754093170166 + }, + { + "entropy": 8.905069351196289, + "epoch": 0.2318568321139015, + "mean_token_accuracy": 0.7400398254394531, + "num_tokens": 12204316.0, + "step": 2345, + "train/ce_loss": 0.7283441424369812 + }, + { + "epoch": 0.2318568321139015, + "step": 2345, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.2318568321139015, + "step": 2345, + "train/total_loss": 0.14314691722393036 + }, + { + "entropy": 9.299427032470703, + "epoch": 0.23195570496341705, + "mean_token_accuracy": 0.7548138499259949, + "num_tokens": 12209574.0, + "step": 2346, + "train/ce_loss": 0.859463632106781 + }, + { + "epoch": 0.23195570496341705, + "step": 2346, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.23195570496341705, + "step": 2346, + "train/total_loss": 0.16016511619091034 + }, + { + "entropy": 9.65144157409668, + "epoch": 0.23205457781293257, + "mean_token_accuracy": 0.70606529712677, + "num_tokens": 12214672.0, + "step": 2347, + "train/ce_loss": 2.027458906173706 + }, + { + "epoch": 0.23205457781293257, + "step": 2347, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.23205457781293257, + "step": 2347, + "train/total_loss": 0.2808709144592285 + }, + { + "entropy": 9.273920059204102, + "epoch": 0.23215345066244808, + "mean_token_accuracy": 0.6890243887901306, + "num_tokens": 12219959.0, + "step": 2348, + "train/ce_loss": 0.7437017560005188 + }, + { + "epoch": 0.23215345066244808, + "step": 2348, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.23215345066244808, + "step": 2348, + "train/total_loss": 0.12515142560005188 + }, + { + "entropy": 9.235380172729492, + "epoch": 0.23225232351196362, + "mean_token_accuracy": 0.7119438052177429, + "num_tokens": 12225223.0, + "step": 2349, + "train/ce_loss": 1.7639403343200684 + }, + { + "epoch": 0.23225232351196362, + "step": 2349, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.23225232351196362, + "step": 2349, + "train/total_loss": 0.2740502953529358 + }, + { + "entropy": 9.568511962890625, + "epoch": 0.23235119636147913, + "mean_token_accuracy": 0.7660208940505981, + "num_tokens": 12230328.0, + "step": 2350, + "train/ce_loss": 0.7514338493347168 + }, + { + "epoch": 0.23235119636147913, + "step": 2350, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.23235119636147913, + "step": 2350, + "train/total_loss": 0.17279964685440063 + }, + { + "entropy": 9.128923416137695, + "epoch": 0.23245006921099465, + "mean_token_accuracy": 0.6614255905151367, + "num_tokens": 12235765.0, + "step": 2351, + "train/ce_loss": 1.1933038234710693 + }, + { + "epoch": 0.23245006921099465, + "step": 2351, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.23245006921099465, + "step": 2351, + "train/total_loss": 0.20526787638664246 + }, + { + "entropy": 9.096158981323242, + "epoch": 0.2325489420605102, + "mean_token_accuracy": 0.7404994368553162, + "num_tokens": 12241175.0, + "step": 2352, + "train/ce_loss": 1.2961671352386475 + }, + { + "epoch": 0.2325489420605102, + "step": 2352, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.2325489420605102, + "step": 2352, + "train/total_loss": 0.18821047246456146 + }, + { + "entropy": 9.5964994430542, + "epoch": 0.2326478149100257, + "mean_token_accuracy": 0.7549019455909729, + "num_tokens": 12246203.0, + "step": 2353, + "train/ce_loss": 2.9441860078804893e-06 + }, + { + "epoch": 0.2326478149100257, + "step": 2353, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.2326478149100257, + "step": 2353, + "train/total_loss": 0.02343779429793358 + }, + { + "entropy": 9.449602127075195, + "epoch": 0.23274668775954124, + "mean_token_accuracy": 0.6873920559883118, + "num_tokens": 12251233.0, + "step": 2354, + "train/ce_loss": 1.7445043325424194 + }, + { + "epoch": 0.23274668775954124, + "step": 2354, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.23274668775954124, + "step": 2354, + "train/total_loss": 0.29554420709609985 + }, + { + "entropy": 9.216875076293945, + "epoch": 0.23284556060905676, + "mean_token_accuracy": 0.7317351698875427, + "num_tokens": 12256564.0, + "step": 2355, + "train/ce_loss": 0.9889079332351685 + }, + { + "epoch": 0.23284556060905676, + "step": 2355, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.23284556060905676, + "step": 2355, + "train/total_loss": 0.17701579630374908 + }, + { + "entropy": 9.126455307006836, + "epoch": 0.23294443345857227, + "mean_token_accuracy": 0.6959620118141174, + "num_tokens": 12261836.0, + "step": 2356, + "train/ce_loss": 0.8084344863891602 + }, + { + "epoch": 0.23294443345857227, + "step": 2356, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.23294443345857227, + "step": 2356, + "train/total_loss": 0.15896844863891602 + }, + { + "entropy": 9.406908988952637, + "epoch": 0.2330433063080878, + "mean_token_accuracy": 0.7142857313156128, + "num_tokens": 12267089.0, + "step": 2357, + "train/ce_loss": 0.7463719844818115 + }, + { + "epoch": 0.2330433063080878, + "step": 2357, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.2330433063080878, + "step": 2357, + "train/total_loss": 0.14494970440864563 + }, + { + "entropy": 10.010465621948242, + "epoch": 0.23314217915760332, + "mean_token_accuracy": 0.784518837928772, + "num_tokens": 12272006.0, + "step": 2358, + "train/ce_loss": 1.2236218935868237e-05 + }, + { + "epoch": 0.23314217915760332, + "step": 2358, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.23314217915760332, + "step": 2358, + "train/total_loss": 0.0664074718952179 + }, + { + "entropy": 9.247428894042969, + "epoch": 0.23324105200711884, + "mean_token_accuracy": 0.7581453919410706, + "num_tokens": 12277301.0, + "step": 2359, + "train/ce_loss": 1.2065562009811401 + }, + { + "epoch": 0.23324105200711884, + "step": 2359, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.23324105200711884, + "step": 2359, + "train/total_loss": 0.1909681260585785 + }, + { + "epoch": 0.23333992485663438, + "grad_norm": 0.9280444383621216, + "learning_rate": 9.419225634178905e-06, + "loss": 0.1634, + "step": 2360 + }, + { + "entropy": 9.522346496582031, + "epoch": 0.23333992485663438, + "mean_token_accuracy": 0.7637444138526917, + "num_tokens": 12282397.0, + "step": 2360, + "train/ce_loss": 0.9355092644691467 + }, + { + "epoch": 0.23333992485663438, + "step": 2360, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.23333992485663438, + "step": 2360, + "train/total_loss": 0.1833946704864502 + }, + { + "entropy": 9.466015815734863, + "epoch": 0.2334387977061499, + "mean_token_accuracy": 0.7586750984191895, + "num_tokens": 12287449.0, + "step": 2361, + "train/ce_loss": 1.500874638557434 + }, + { + "epoch": 0.2334387977061499, + "step": 2361, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.2334387977061499, + "step": 2361, + "train/total_loss": 0.23993121087551117 + }, + { + "entropy": 9.620744705200195, + "epoch": 0.2335376705556654, + "mean_token_accuracy": 0.75789475440979, + "num_tokens": 12292533.0, + "step": 2362, + "train/ce_loss": 0.8770025968551636 + }, + { + "epoch": 0.2335376705556654, + "step": 2362, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.2335376705556654, + "step": 2362, + "train/total_loss": 0.1033252626657486 + }, + { + "entropy": 9.022543907165527, + "epoch": 0.23363654340518095, + "mean_token_accuracy": 0.7319062352180481, + "num_tokens": 12297991.0, + "step": 2363, + "train/ce_loss": 0.6032585501670837 + }, + { + "epoch": 0.23363654340518095, + "step": 2363, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.23363654340518095, + "step": 2363, + "train/total_loss": 0.09938836097717285 + }, + { + "entropy": 9.33456802368164, + "epoch": 0.23373541625469646, + "mean_token_accuracy": 0.7348951697349548, + "num_tokens": 12303223.0, + "step": 2364, + "train/ce_loss": 1.0315171480178833 + }, + { + "epoch": 0.23373541625469646, + "step": 2364, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.23373541625469646, + "step": 2364, + "train/total_loss": 0.16955795884132385 + }, + { + "entropy": 9.068136215209961, + "epoch": 0.23383428910421197, + "mean_token_accuracy": 0.7019438147544861, + "num_tokens": 12308573.0, + "step": 2365, + "train/ce_loss": 0.833172082901001 + }, + { + "epoch": 0.23383428910421197, + "step": 2365, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.23383428910421197, + "step": 2365, + "train/total_loss": 0.11456721276044846 + }, + { + "entropy": 9.199182510375977, + "epoch": 0.23393316195372751, + "mean_token_accuracy": 0.7469066381454468, + "num_tokens": 12313984.0, + "step": 2366, + "train/ce_loss": 1.0053783655166626 + }, + { + "epoch": 0.23393316195372751, + "step": 2366, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.23393316195372751, + "step": 2366, + "train/total_loss": 0.17866283655166626 + }, + { + "entropy": 9.322465896606445, + "epoch": 0.23403203480324303, + "mean_token_accuracy": 0.7112299203872681, + "num_tokens": 12319359.0, + "step": 2367, + "train/ce_loss": 0.5418054461479187 + }, + { + "epoch": 0.23403203480324303, + "step": 2367, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.23403203480324303, + "step": 2367, + "train/total_loss": 0.07371179759502411 + }, + { + "entropy": 9.816730499267578, + "epoch": 0.23413090765275854, + "mean_token_accuracy": 0.716549277305603, + "num_tokens": 12324391.0, + "step": 2368, + "train/ce_loss": 1.0450146198272705 + }, + { + "epoch": 0.23413090765275854, + "step": 2368, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.23413090765275854, + "step": 2368, + "train/total_loss": 0.16700145602226257 + }, + { + "entropy": 8.824502944946289, + "epoch": 0.23422978050227408, + "mean_token_accuracy": 0.75, + "num_tokens": 12329954.0, + "step": 2369, + "train/ce_loss": 0.7634261846542358 + }, + { + "epoch": 0.23422978050227408, + "step": 2369, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.23422978050227408, + "step": 2369, + "train/total_loss": 0.0997801199555397 + }, + { + "entropy": 9.87130355834961, + "epoch": 0.2343286533517896, + "mean_token_accuracy": 0.7819548845291138, + "num_tokens": 12334935.0, + "step": 2370, + "train/ce_loss": 5.111191057949327e-06 + }, + { + "epoch": 0.2343286533517896, + "step": 2370, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.2343286533517896, + "step": 2370, + "train/total_loss": 0.05078176036477089 + }, + { + "entropy": 9.217727661132812, + "epoch": 0.2344275262013051, + "mean_token_accuracy": 0.6961394548416138, + "num_tokens": 12340348.0, + "step": 2371, + "train/ce_loss": 0.7166080474853516 + }, + { + "epoch": 0.2344275262013051, + "step": 2371, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.2344275262013051, + "step": 2371, + "train/total_loss": 0.2200983166694641 + }, + { + "entropy": 9.32795238494873, + "epoch": 0.23452639905082065, + "mean_token_accuracy": 0.7483176589012146, + "num_tokens": 12345528.0, + "step": 2372, + "train/ce_loss": 0.5925230383872986 + }, + { + "epoch": 0.23452639905082065, + "step": 2372, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.23452639905082065, + "step": 2372, + "train/total_loss": 0.1881585568189621 + }, + { + "entropy": 9.745134353637695, + "epoch": 0.23462527190033616, + "mean_token_accuracy": 0.7691029906272888, + "num_tokens": 12350563.0, + "step": 2373, + "train/ce_loss": 0.8051249980926514 + }, + { + "epoch": 0.23462527190033616, + "step": 2373, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.23462527190033616, + "step": 2373, + "train/total_loss": 0.14691874384880066 + }, + { + "entropy": 9.267991065979004, + "epoch": 0.2347241447498517, + "mean_token_accuracy": 0.7484811544418335, + "num_tokens": 12355901.0, + "step": 2374, + "train/ce_loss": 1.1690332889556885 + }, + { + "epoch": 0.2347241447498517, + "step": 2374, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.2347241447498517, + "step": 2374, + "train/total_loss": 0.19502833485603333 + }, + { + "entropy": 9.697690963745117, + "epoch": 0.23482301759936722, + "mean_token_accuracy": 0.6909871101379395, + "num_tokens": 12360971.0, + "step": 2375, + "train/ce_loss": 1.5387448072433472 + }, + { + "epoch": 0.23482301759936722, + "step": 2375, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.23482301759936722, + "step": 2375, + "train/total_loss": 0.2827807366847992 + }, + { + "entropy": 9.765154838562012, + "epoch": 0.23492189044888273, + "mean_token_accuracy": 0.7376725673675537, + "num_tokens": 12365945.0, + "step": 2376, + "train/ce_loss": 4.265839379513636e-06 + }, + { + "epoch": 0.23492189044888273, + "step": 2376, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.23492189044888273, + "step": 2376, + "train/total_loss": 0.03125042840838432 + }, + { + "entropy": 9.0798978805542, + "epoch": 0.23502076329839827, + "mean_token_accuracy": 0.7807229161262512, + "num_tokens": 12371285.0, + "step": 2377, + "train/ce_loss": 0.35677269101142883 + }, + { + "epoch": 0.23502076329839827, + "step": 2377, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.23502076329839827, + "step": 2377, + "train/total_loss": 0.10598976910114288 + }, + { + "entropy": 9.423152923583984, + "epoch": 0.23511963614791379, + "mean_token_accuracy": 0.7773279547691345, + "num_tokens": 12376466.0, + "step": 2378, + "train/ce_loss": 1.1971715688705444 + }, + { + "epoch": 0.23511963614791379, + "step": 2378, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.23511963614791379, + "step": 2378, + "train/total_loss": 0.19784215092658997 + }, + { + "entropy": 9.05411148071289, + "epoch": 0.2352185089974293, + "mean_token_accuracy": 0.7235772609710693, + "num_tokens": 12381819.0, + "step": 2379, + "train/ce_loss": 0.789045512676239 + }, + { + "epoch": 0.2352185089974293, + "step": 2379, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.2352185089974293, + "step": 2379, + "train/total_loss": 0.11015455424785614 + }, + { + "epoch": 0.23531738184694484, + "grad_norm": 0.8408113121986389, + "learning_rate": 9.414280769420957e-06, + "loss": 0.1538, + "step": 2380 + }, + { + "entropy": 9.860815048217773, + "epoch": 0.23531738184694484, + "mean_token_accuracy": 0.7065972089767456, + "num_tokens": 12386833.0, + "step": 2380, + "train/ce_loss": 1.434122920036316 + }, + { + "epoch": 0.23531738184694484, + "step": 2380, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.23531738184694484, + "step": 2380, + "train/total_loss": 0.2332560420036316 + }, + { + "entropy": 9.214044570922852, + "epoch": 0.23541625469646035, + "mean_token_accuracy": 0.6609124541282654, + "num_tokens": 12392296.0, + "step": 2381, + "train/ce_loss": 0.6754460334777832 + }, + { + "epoch": 0.23541625469646035, + "step": 2381, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.23541625469646035, + "step": 2381, + "train/total_loss": 0.1886383593082428 + }, + { + "entropy": 9.326742172241211, + "epoch": 0.23551512754597587, + "mean_token_accuracy": 0.7060931921005249, + "num_tokens": 12397634.0, + "step": 2382, + "train/ce_loss": 0.9092620015144348 + }, + { + "epoch": 0.23551512754597587, + "step": 2382, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.23551512754597587, + "step": 2382, + "train/total_loss": 0.19248870015144348 + }, + { + "entropy": 9.355029106140137, + "epoch": 0.2356140003954914, + "mean_token_accuracy": 0.7112010717391968, + "num_tokens": 12402868.0, + "step": 2383, + "train/ce_loss": 1.1074140071868896 + }, + { + "epoch": 0.2356140003954914, + "step": 2383, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.2356140003954914, + "step": 2383, + "train/total_loss": 0.20839765667915344 + }, + { + "entropy": 9.30242919921875, + "epoch": 0.23571287324500692, + "mean_token_accuracy": 0.7903030514717102, + "num_tokens": 12408161.0, + "step": 2384, + "train/ce_loss": 0.5750543475151062 + }, + { + "epoch": 0.23571287324500692, + "step": 2384, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.23571287324500692, + "step": 2384, + "train/total_loss": 0.13172417879104614 + }, + { + "entropy": 8.88375186920166, + "epoch": 0.23581174609452243, + "mean_token_accuracy": 0.7622950673103333, + "num_tokens": 12413596.0, + "step": 2385, + "train/ce_loss": 1.5440810918807983 + }, + { + "epoch": 0.23581174609452243, + "step": 2385, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.23581174609452243, + "step": 2385, + "train/total_loss": 0.2559705972671509 + }, + { + "entropy": 9.507736206054688, + "epoch": 0.23591061894403798, + "mean_token_accuracy": 0.691428542137146, + "num_tokens": 12418754.0, + "step": 2386, + "train/ce_loss": 0.6624653935432434 + }, + { + "epoch": 0.23591061894403798, + "step": 2386, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.23591061894403798, + "step": 2386, + "train/total_loss": 0.15218403935432434 + }, + { + "entropy": 9.170475006103516, + "epoch": 0.2360094917935535, + "mean_token_accuracy": 0.7021276354789734, + "num_tokens": 12423895.0, + "step": 2387, + "train/ce_loss": 7.41304620532901e-06 + }, + { + "epoch": 0.2360094917935535, + "step": 2387, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.2360094917935535, + "step": 2387, + "train/total_loss": 0.054688241332769394 + }, + { + "entropy": 9.166980743408203, + "epoch": 0.236108364643069, + "mean_token_accuracy": 0.711448609828949, + "num_tokens": 12429216.0, + "step": 2388, + "train/ce_loss": 0.8316457867622375 + }, + { + "epoch": 0.236108364643069, + "step": 2388, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.236108364643069, + "step": 2388, + "train/total_loss": 0.15738332271575928 + }, + { + "entropy": 9.178657531738281, + "epoch": 0.23620723749258454, + "mean_token_accuracy": 0.7100130319595337, + "num_tokens": 12434508.0, + "step": 2389, + "train/ce_loss": 0.9058764576911926 + }, + { + "epoch": 0.23620723749258454, + "step": 2389, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.23620723749258454, + "step": 2389, + "train/total_loss": 0.15699389576911926 + }, + { + "entropy": 8.597973823547363, + "epoch": 0.23630611034210006, + "mean_token_accuracy": 0.7439758777618408, + "num_tokens": 12440061.0, + "step": 2390, + "train/ce_loss": 0.933378279209137 + }, + { + "epoch": 0.23630611034210006, + "step": 2390, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.23630611034210006, + "step": 2390, + "train/total_loss": 0.21052533388137817 + }, + { + "entropy": 9.485719680786133, + "epoch": 0.23640498319161557, + "mean_token_accuracy": 0.7674112915992737, + "num_tokens": 12445292.0, + "step": 2391, + "train/ce_loss": 0.8362389206886292 + }, + { + "epoch": 0.23640498319161557, + "step": 2391, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.23640498319161557, + "step": 2391, + "train/total_loss": 0.11487389355897903 + }, + { + "entropy": 9.044958114624023, + "epoch": 0.2365038560411311, + "mean_token_accuracy": 0.7096399664878845, + "num_tokens": 12450637.0, + "step": 2392, + "train/ce_loss": 1.012323021888733 + }, + { + "epoch": 0.2365038560411311, + "step": 2392, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.2365038560411311, + "step": 2392, + "train/total_loss": 0.23404480516910553 + }, + { + "entropy": 9.472670555114746, + "epoch": 0.23660272889064662, + "mean_token_accuracy": 0.7845934629440308, + "num_tokens": 12455778.0, + "step": 2393, + "train/ce_loss": 0.8923097848892212 + }, + { + "epoch": 0.23660272889064662, + "step": 2393, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.23660272889064662, + "step": 2393, + "train/total_loss": 0.1400122344493866 + }, + { + "entropy": 9.029642105102539, + "epoch": 0.23670160174016217, + "mean_token_accuracy": 0.7072368264198303, + "num_tokens": 12461199.0, + "step": 2394, + "train/ce_loss": 1.3686507940292358 + }, + { + "epoch": 0.23670160174016217, + "step": 2394, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.23670160174016217, + "step": 2394, + "train/total_loss": 0.20327132940292358 + }, + { + "entropy": 9.2318754196167, + "epoch": 0.23680047458967768, + "mean_token_accuracy": 0.769132673740387, + "num_tokens": 12466479.0, + "step": 2395, + "train/ce_loss": 2.7288724595564418e-06 + }, + { + "epoch": 0.23680047458967768, + "step": 2395, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.23680047458967768, + "step": 2395, + "train/total_loss": 0.05468777194619179 + }, + { + "entropy": 9.625486373901367, + "epoch": 0.2368993474391932, + "mean_token_accuracy": 0.7072368264198303, + "num_tokens": 12471521.0, + "step": 2396, + "train/ce_loss": 1.7447322607040405 + }, + { + "epoch": 0.2368993474391932, + "step": 2396, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.2368993474391932, + "step": 2396, + "train/total_loss": 0.25259822607040405 + }, + { + "entropy": 9.427966117858887, + "epoch": 0.23699822028870873, + "mean_token_accuracy": 0.7125340700149536, + "num_tokens": 12476719.0, + "step": 2397, + "train/ce_loss": 1.1014299392700195 + }, + { + "epoch": 0.23699822028870873, + "step": 2397, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.23699822028870873, + "step": 2397, + "train/total_loss": 0.1531117558479309 + }, + { + "entropy": 9.326618194580078, + "epoch": 0.23709709313822425, + "mean_token_accuracy": 0.733564019203186, + "num_tokens": 12482048.0, + "step": 2398, + "train/ce_loss": 0.9821603894233704 + }, + { + "epoch": 0.23709709313822425, + "step": 2398, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.23709709313822425, + "step": 2398, + "train/total_loss": 0.15680979192256927 + }, + { + "entropy": 9.529634475708008, + "epoch": 0.23719596598773976, + "mean_token_accuracy": 0.7684659361839294, + "num_tokens": 12487137.0, + "step": 2399, + "train/ce_loss": 0.5558364391326904 + }, + { + "epoch": 0.23719596598773976, + "step": 2399, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.23719596598773976, + "step": 2399, + "train/total_loss": 0.1298023909330368 + }, + { + "epoch": 0.2372948388372553, + "grad_norm": 0.8500503897666931, + "learning_rate": 9.409335904663008e-06, + "loss": 0.1632, + "step": 2400 + }, + { + "entropy": 9.319252014160156, + "epoch": 0.2372948388372553, + "mean_token_accuracy": 0.7546948194503784, + "num_tokens": 12492440.0, + "step": 2400, + "train/ce_loss": 0.4326237738132477 + }, + { + "epoch": 0.2372948388372553, + "step": 2400, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.2372948388372553, + "step": 2400, + "train/total_loss": 0.07451237738132477 + }, + { + "entropy": 9.431442260742188, + "epoch": 0.23739371168677081, + "mean_token_accuracy": 0.7329843044281006, + "num_tokens": 12497660.0, + "step": 2401, + "train/ce_loss": 0.8422648906707764 + }, + { + "epoch": 0.23739371168677081, + "step": 2401, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.23739371168677081, + "step": 2401, + "train/total_loss": 0.12719523906707764 + }, + { + "entropy": 8.835033416748047, + "epoch": 0.23749258453628633, + "mean_token_accuracy": 0.737500011920929, + "num_tokens": 12503149.0, + "step": 2402, + "train/ce_loss": 0.6837636828422546 + }, + { + "epoch": 0.23749258453628633, + "step": 2402, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.23749258453628633, + "step": 2402, + "train/total_loss": 0.154313862323761 + }, + { + "entropy": 9.53884220123291, + "epoch": 0.23759145738580187, + "mean_token_accuracy": 0.7151424288749695, + "num_tokens": 12508235.0, + "step": 2403, + "train/ce_loss": 1.3083539009094238 + }, + { + "epoch": 0.23759145738580187, + "step": 2403, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.23759145738580187, + "step": 2403, + "train/total_loss": 0.2167728990316391 + }, + { + "entropy": 9.01998519897461, + "epoch": 0.23769033023531738, + "mean_token_accuracy": 0.7577962875366211, + "num_tokens": 12513661.0, + "step": 2404, + "train/ce_loss": 0.9385044574737549 + }, + { + "epoch": 0.23769033023531738, + "step": 2404, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.23769033023531738, + "step": 2404, + "train/total_loss": 0.21103794872760773 + }, + { + "entropy": 9.81583309173584, + "epoch": 0.2377892030848329, + "mean_token_accuracy": 0.7698412537574768, + "num_tokens": 12518719.0, + "step": 2405, + "train/ce_loss": 0.7289450764656067 + }, + { + "epoch": 0.2377892030848329, + "step": 2405, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.2377892030848329, + "step": 2405, + "train/total_loss": 0.19008201360702515 + }, + { + "entropy": 9.46564769744873, + "epoch": 0.23788807593434844, + "mean_token_accuracy": 0.7140804529190063, + "num_tokens": 12523755.0, + "step": 2406, + "train/ce_loss": 3.154036221530987e-06 + }, + { + "epoch": 0.23788807593434844, + "step": 2406, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.23788807593434844, + "step": 2406, + "train/total_loss": 0.07031281292438507 + }, + { + "entropy": 9.094564437866211, + "epoch": 0.23798694878386395, + "mean_token_accuracy": 0.700964629650116, + "num_tokens": 12529144.0, + "step": 2407, + "train/ce_loss": 0.8377649784088135 + }, + { + "epoch": 0.23798694878386395, + "step": 2407, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.23798694878386395, + "step": 2407, + "train/total_loss": 0.11893274635076523 + }, + { + "entropy": 9.922409057617188, + "epoch": 0.23808582163337946, + "mean_token_accuracy": 0.7257019281387329, + "num_tokens": 12534083.0, + "step": 2408, + "train/ce_loss": 1.3888559341430664 + }, + { + "epoch": 0.23808582163337946, + "step": 2408, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.23808582163337946, + "step": 2408, + "train/total_loss": 0.24044810235500336 + }, + { + "entropy": 9.742548942565918, + "epoch": 0.238184694482895, + "mean_token_accuracy": 0.744425356388092, + "num_tokens": 12539062.0, + "step": 2409, + "train/ce_loss": 1.2016829252243042 + }, + { + "epoch": 0.238184694482895, + "step": 2409, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.238184694482895, + "step": 2409, + "train/total_loss": 0.1826682984828949 + }, + { + "entropy": 9.329795837402344, + "epoch": 0.23828356733241052, + "mean_token_accuracy": 0.7608142495155334, + "num_tokens": 12544271.0, + "step": 2410, + "train/ce_loss": 1.061061978340149 + }, + { + "epoch": 0.23828356733241052, + "step": 2410, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.23828356733241052, + "step": 2410, + "train/total_loss": 0.14516869187355042 + }, + { + "entropy": 9.265291213989258, + "epoch": 0.23838244018192603, + "mean_token_accuracy": 0.7192118167877197, + "num_tokens": 12549520.0, + "step": 2411, + "train/ce_loss": 1.0520355701446533 + }, + { + "epoch": 0.23838244018192603, + "step": 2411, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.23838244018192603, + "step": 2411, + "train/total_loss": 0.1833285689353943 + }, + { + "entropy": 9.62429428100586, + "epoch": 0.23848131303144157, + "mean_token_accuracy": 0.7111111283302307, + "num_tokens": 12554572.0, + "step": 2412, + "train/ce_loss": 0.5724148154258728 + }, + { + "epoch": 0.23848131303144157, + "step": 2412, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.23848131303144157, + "step": 2412, + "train/total_loss": 0.11974148452281952 + }, + { + "entropy": 10.618011474609375, + "epoch": 0.23858018588095709, + "mean_token_accuracy": 0.7802197933197021, + "num_tokens": 12559164.0, + "step": 2413, + "train/ce_loss": 1.9048376998398453e-05 + }, + { + "epoch": 0.23858018588095709, + "step": 2413, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.23858018588095709, + "step": 2413, + "train/total_loss": 0.07812690734863281 + }, + { + "entropy": 9.363680839538574, + "epoch": 0.2386790587304726, + "mean_token_accuracy": 0.7350901365280151, + "num_tokens": 12564374.0, + "step": 2414, + "train/ce_loss": 0.8084295392036438 + }, + { + "epoch": 0.2386790587304726, + "step": 2414, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.2386790587304726, + "step": 2414, + "train/total_loss": 0.19803045690059662 + }, + { + "entropy": 9.255803108215332, + "epoch": 0.23877793157998814, + "mean_token_accuracy": 0.7384230494499207, + "num_tokens": 12569635.0, + "step": 2415, + "train/ce_loss": 0.6242900490760803 + }, + { + "epoch": 0.23877793157998814, + "step": 2415, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.23877793157998814, + "step": 2415, + "train/total_loss": 0.1288352608680725 + }, + { + "entropy": 9.360810279846191, + "epoch": 0.23887680442950365, + "mean_token_accuracy": 0.7057416439056396, + "num_tokens": 12574909.0, + "step": 2416, + "train/ce_loss": 0.941565990447998 + }, + { + "epoch": 0.23887680442950365, + "step": 2416, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.23887680442950365, + "step": 2416, + "train/total_loss": 0.14103159308433533 + }, + { + "entropy": 9.784135818481445, + "epoch": 0.2389756772790192, + "mean_token_accuracy": 0.709618866443634, + "num_tokens": 12580069.0, + "step": 2417, + "train/ce_loss": 1.183141827583313 + }, + { + "epoch": 0.2389756772790192, + "step": 2417, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.2389756772790192, + "step": 2417, + "train/total_loss": 0.19643917679786682 + }, + { + "entropy": 9.049604415893555, + "epoch": 0.2390745501285347, + "mean_token_accuracy": 0.7241379022598267, + "num_tokens": 12585471.0, + "step": 2418, + "train/ce_loss": 1.1365526914596558 + }, + { + "epoch": 0.2390745501285347, + "step": 2418, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.2390745501285347, + "step": 2418, + "train/total_loss": 0.19959276914596558 + }, + { + "entropy": 9.176121711730957, + "epoch": 0.23917342297805022, + "mean_token_accuracy": 0.6867321729660034, + "num_tokens": 12590754.0, + "step": 2419, + "train/ce_loss": 0.7100818753242493 + }, + { + "epoch": 0.23917342297805022, + "step": 2419, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.23917342297805022, + "step": 2419, + "train/total_loss": 0.15694569051265717 + }, + { + "epoch": 0.23927229582756576, + "grad_norm": 1.0710947513580322, + "learning_rate": 9.40439103990506e-06, + "loss": 0.1545, + "step": 2420 + }, + { + "entropy": 9.725388526916504, + "epoch": 0.23927229582756576, + "mean_token_accuracy": 0.7470085620880127, + "num_tokens": 12595783.0, + "step": 2420, + "train/ce_loss": 1.1202698945999146 + }, + { + "epoch": 0.23927229582756576, + "step": 2420, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.23927229582756576, + "step": 2420, + "train/total_loss": 0.18233948945999146 + }, + { + "entropy": 9.089994430541992, + "epoch": 0.23937116867708128, + "mean_token_accuracy": 0.7723214030265808, + "num_tokens": 12601138.0, + "step": 2421, + "train/ce_loss": 0.7415687441825867 + }, + { + "epoch": 0.23937116867708128, + "step": 2421, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.23937116867708128, + "step": 2421, + "train/total_loss": 0.10540687292814255 + }, + { + "entropy": 9.053451538085938, + "epoch": 0.2394700415265968, + "mean_token_accuracy": 0.6727052927017212, + "num_tokens": 12606463.0, + "step": 2422, + "train/ce_loss": 0.5993922352790833 + }, + { + "epoch": 0.2394700415265968, + "step": 2422, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.2394700415265968, + "step": 2422, + "train/total_loss": 0.1458767205476761 + }, + { + "entropy": 9.50720500946045, + "epoch": 0.23956891437611233, + "mean_token_accuracy": 0.7133956551551819, + "num_tokens": 12611511.0, + "step": 2423, + "train/ce_loss": 7.332210316235432e-06 + }, + { + "epoch": 0.23956891437611233, + "step": 2423, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.23956891437611233, + "step": 2423, + "train/total_loss": 0.0625007301568985 + }, + { + "entropy": 9.788806915283203, + "epoch": 0.23966778722562784, + "mean_token_accuracy": 0.8044692873954773, + "num_tokens": 12616454.0, + "step": 2424, + "train/ce_loss": 3.934250344173051e-06 + }, + { + "epoch": 0.23966778722562784, + "step": 2424, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.23966778722562784, + "step": 2424, + "train/total_loss": 0.019531643018126488 + }, + { + "entropy": 8.997499465942383, + "epoch": 0.23976666007514336, + "mean_token_accuracy": 0.7777777910232544, + "num_tokens": 12621557.0, + "step": 2425, + "train/ce_loss": 0.48350977897644043 + }, + { + "epoch": 0.23976666007514336, + "step": 2425, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.23976666007514336, + "step": 2425, + "train/total_loss": 0.1030384749174118 + }, + { + "entropy": 9.196101188659668, + "epoch": 0.2398655329246589, + "mean_token_accuracy": 0.7543390989303589, + "num_tokens": 12626810.0, + "step": 2426, + "train/ce_loss": 1.0639623403549194 + }, + { + "epoch": 0.2398655329246589, + "step": 2426, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.2398655329246589, + "step": 2426, + "train/total_loss": 0.21967747807502747 + }, + { + "entropy": 9.574728012084961, + "epoch": 0.2399644057741744, + "mean_token_accuracy": 0.7956656217575073, + "num_tokens": 12631852.0, + "step": 2427, + "train/ce_loss": 1.0240403413772583 + }, + { + "epoch": 0.2399644057741744, + "step": 2427, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.2399644057741744, + "step": 2427, + "train/total_loss": 0.19615402817726135 + }, + { + "entropy": 9.197060585021973, + "epoch": 0.24006327862368992, + "mean_token_accuracy": 0.7612359523773193, + "num_tokens": 12636993.0, + "step": 2428, + "train/ce_loss": 8.116682693071198e-06 + }, + { + "epoch": 0.24006327862368992, + "step": 2428, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.24006327862368992, + "step": 2428, + "train/total_loss": 0.050782062113285065 + }, + { + "entropy": 9.274468421936035, + "epoch": 0.24016215147320547, + "mean_token_accuracy": 0.6811594367027283, + "num_tokens": 12642066.0, + "step": 2429, + "train/ce_loss": 1.1202927827835083 + }, + { + "epoch": 0.24016215147320547, + "step": 2429, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.24016215147320547, + "step": 2429, + "train/total_loss": 0.2135917842388153 + }, + { + "entropy": 9.780946731567383, + "epoch": 0.24026102432272098, + "mean_token_accuracy": 0.7403509020805359, + "num_tokens": 12647082.0, + "step": 2430, + "train/ce_loss": 1.2716385126113892 + }, + { + "epoch": 0.24026102432272098, + "step": 2430, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.24026102432272098, + "step": 2430, + "train/total_loss": 0.2170076072216034 + }, + { + "entropy": 9.411595344543457, + "epoch": 0.2403598971722365, + "mean_token_accuracy": 0.7183908224105835, + "num_tokens": 12652265.0, + "step": 2431, + "train/ce_loss": 0.9056243896484375 + }, + { + "epoch": 0.2403598971722365, + "step": 2431, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.2403598971722365, + "step": 2431, + "train/total_loss": 0.15696868300437927 + }, + { + "entropy": 9.484373092651367, + "epoch": 0.24045877002175203, + "mean_token_accuracy": 0.7074626684188843, + "num_tokens": 12657375.0, + "step": 2432, + "train/ce_loss": 3.2736211323936004e-06 + }, + { + "epoch": 0.24045877002175203, + "step": 2432, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.24045877002175203, + "step": 2432, + "train/total_loss": 0.06250032782554626 + }, + { + "entropy": 9.2290620803833, + "epoch": 0.24055764287126755, + "mean_token_accuracy": 0.7011628150939941, + "num_tokens": 12662704.0, + "step": 2433, + "train/ce_loss": 1.1158980131149292 + }, + { + "epoch": 0.24055764287126755, + "step": 2433, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.24055764287126755, + "step": 2433, + "train/total_loss": 0.17018355429172516 + }, + { + "entropy": 10.093072891235352, + "epoch": 0.24065651572078306, + "mean_token_accuracy": 0.7209302186965942, + "num_tokens": 12667477.0, + "step": 2434, + "train/ce_loss": 1.2968782357347663e-05 + }, + { + "epoch": 0.24065651572078306, + "step": 2434, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.24065651572078306, + "step": 2434, + "train/total_loss": 0.058595046401023865 + }, + { + "entropy": 9.002099990844727, + "epoch": 0.2407553885702986, + "mean_token_accuracy": 0.7219361662864685, + "num_tokens": 12672857.0, + "step": 2435, + "train/ce_loss": 0.6061140298843384 + }, + { + "epoch": 0.2407553885702986, + "step": 2435, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.2407553885702986, + "step": 2435, + "train/total_loss": 0.09186140447854996 + }, + { + "entropy": 9.172452926635742, + "epoch": 0.24085426141981411, + "mean_token_accuracy": 0.7418224215507507, + "num_tokens": 12678190.0, + "step": 2436, + "train/ce_loss": 0.33215388655662537 + }, + { + "epoch": 0.24085426141981411, + "step": 2436, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.24085426141981411, + "step": 2436, + "train/total_loss": 0.05665288865566254 + }, + { + "entropy": 8.969165802001953, + "epoch": 0.24095313426932966, + "mean_token_accuracy": 0.7849566340446472, + "num_tokens": 12683684.0, + "step": 2437, + "train/ce_loss": 0.6042222380638123 + }, + { + "epoch": 0.24095313426932966, + "step": 2437, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.24095313426932966, + "step": 2437, + "train/total_loss": 0.12292222678661346 + }, + { + "entropy": 9.200159072875977, + "epoch": 0.24105200711884517, + "mean_token_accuracy": 0.7333333492279053, + "num_tokens": 12689004.0, + "step": 2438, + "train/ce_loss": 0.9200250506401062 + }, + { + "epoch": 0.24105200711884517, + "step": 2438, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.24105200711884517, + "step": 2438, + "train/total_loss": 0.1466900110244751 + }, + { + "entropy": 8.7935152053833, + "epoch": 0.24115087996836068, + "mean_token_accuracy": 0.7602339386940002, + "num_tokens": 12694506.0, + "step": 2439, + "train/ce_loss": 0.32769227027893066 + }, + { + "epoch": 0.24115087996836068, + "step": 2439, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.24115087996836068, + "step": 2439, + "train/total_loss": 0.056206729263067245 + }, + { + "epoch": 0.24124975281787622, + "grad_norm": 0.9195847511291504, + "learning_rate": 9.39944617514711e-06, + "loss": 0.1548, + "step": 2440 + }, + { + "entropy": 8.913129806518555, + "epoch": 0.24124975281787622, + "mean_token_accuracy": 0.7268041372299194, + "num_tokens": 12699932.0, + "step": 2440, + "train/ce_loss": 0.6807551980018616 + }, + { + "epoch": 0.24124975281787622, + "step": 2440, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.24124975281787622, + "step": 2440, + "train/total_loss": 0.1930755227804184 + }, + { + "entropy": 9.239109992980957, + "epoch": 0.24134862566739174, + "mean_token_accuracy": 0.6910112500190735, + "num_tokens": 12705303.0, + "step": 2441, + "train/ce_loss": 0.7849908471107483 + }, + { + "epoch": 0.24134862566739174, + "step": 2441, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.24134862566739174, + "step": 2441, + "train/total_loss": 0.15662407875061035 + }, + { + "entropy": 9.928701400756836, + "epoch": 0.24144749851690725, + "mean_token_accuracy": 0.6928701996803284, + "num_tokens": 12710276.0, + "step": 2442, + "train/ce_loss": 1.6926069259643555 + }, + { + "epoch": 0.24144749851690725, + "step": 2442, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.24144749851690725, + "step": 2442, + "train/total_loss": 0.2512919306755066 + }, + { + "entropy": 9.428268432617188, + "epoch": 0.2415463713664228, + "mean_token_accuracy": 0.7929577231407166, + "num_tokens": 12715450.0, + "step": 2443, + "train/ce_loss": 7.356356945820153e-06 + }, + { + "epoch": 0.2415463713664228, + "step": 2443, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.2415463713664228, + "step": 2443, + "train/total_loss": 0.0507819838821888 + }, + { + "entropy": 9.439188957214355, + "epoch": 0.2416452442159383, + "mean_token_accuracy": 0.756369411945343, + "num_tokens": 12720552.0, + "step": 2444, + "train/ce_loss": 0.8201816082000732 + }, + { + "epoch": 0.2416452442159383, + "step": 2444, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.2416452442159383, + "step": 2444, + "train/total_loss": 0.1523306667804718 + }, + { + "entropy": 9.659980773925781, + "epoch": 0.24174411706545382, + "mean_token_accuracy": 0.7495462894439697, + "num_tokens": 12725493.0, + "step": 2445, + "train/ce_loss": 8.589844583184458e-06 + }, + { + "epoch": 0.24174411706545382, + "step": 2445, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.24174411706545382, + "step": 2445, + "train/total_loss": 0.07421960681676865 + }, + { + "entropy": 9.174163818359375, + "epoch": 0.24184298991496936, + "mean_token_accuracy": 0.7416107654571533, + "num_tokens": 12731051.0, + "step": 2446, + "train/ce_loss": 0.842617928981781 + }, + { + "epoch": 0.24184298991496936, + "step": 2446, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.24184298991496936, + "step": 2446, + "train/total_loss": 0.19363680481910706 + }, + { + "entropy": 9.033437728881836, + "epoch": 0.24194186276448487, + "mean_token_accuracy": 0.7300546169281006, + "num_tokens": 12736473.0, + "step": 2447, + "train/ce_loss": 0.6673880815505981 + }, + { + "epoch": 0.24194186276448487, + "step": 2447, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.24194186276448487, + "step": 2447, + "train/total_loss": 0.2151763141155243 + }, + { + "entropy": 9.262384414672852, + "epoch": 0.24204073561400039, + "mean_token_accuracy": 0.661478579044342, + "num_tokens": 12741709.0, + "step": 2448, + "train/ce_loss": 1.1020350456237793 + }, + { + "epoch": 0.24204073561400039, + "step": 2448, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.24204073561400039, + "step": 2448, + "train/total_loss": 0.25864100456237793 + }, + { + "entropy": 9.353168487548828, + "epoch": 0.24213960846351593, + "mean_token_accuracy": 0.743073046207428, + "num_tokens": 12746961.0, + "step": 2449, + "train/ce_loss": 0.5089231133460999 + }, + { + "epoch": 0.24213960846351593, + "step": 2449, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.24213960846351593, + "step": 2449, + "train/total_loss": 0.07823605835437775 + }, + { + "entropy": 8.918380737304688, + "epoch": 0.24223848131303144, + "mean_token_accuracy": 0.7183098793029785, + "num_tokens": 12752208.0, + "step": 2450, + "train/ce_loss": 0.6301059126853943 + }, + { + "epoch": 0.24223848131303144, + "step": 2450, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.24223848131303144, + "step": 2450, + "train/total_loss": 0.13332310318946838 + }, + { + "entropy": 9.42928695678711, + "epoch": 0.24233735416254695, + "mean_token_accuracy": 0.7073863744735718, + "num_tokens": 12757351.0, + "step": 2451, + "train/ce_loss": 1.2831615209579468 + }, + { + "epoch": 0.24233735416254695, + "step": 2451, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.24233735416254695, + "step": 2451, + "train/total_loss": 0.17909739911556244 + }, + { + "entropy": 10.1730375289917, + "epoch": 0.2424362270120625, + "mean_token_accuracy": 0.7560975551605225, + "num_tokens": 12762123.0, + "step": 2452, + "train/ce_loss": 6.707434295094572e-06 + }, + { + "epoch": 0.2424362270120625, + "step": 2452, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.2424362270120625, + "step": 2452, + "train/total_loss": 0.027344420552253723 + }, + { + "entropy": 9.21902847290039, + "epoch": 0.242535099861578, + "mean_token_accuracy": 0.7799353003501892, + "num_tokens": 12767656.0, + "step": 2453, + "train/ce_loss": 0.953403651714325 + }, + { + "epoch": 0.242535099861578, + "step": 2453, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.242535099861578, + "step": 2453, + "train/total_loss": 0.11877786368131638 + }, + { + "entropy": 9.858123779296875, + "epoch": 0.24263397271109352, + "mean_token_accuracy": 0.7278761267662048, + "num_tokens": 12772502.0, + "step": 2454, + "train/ce_loss": 2.1266531944274902 + }, + { + "epoch": 0.24263397271109352, + "step": 2454, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.24263397271109352, + "step": 2454, + "train/total_loss": 0.310321569442749 + }, + { + "entropy": 9.473504066467285, + "epoch": 0.24273284556060906, + "mean_token_accuracy": 0.7373887300491333, + "num_tokens": 12777684.0, + "step": 2455, + "train/ce_loss": 1.3440054655075073 + }, + { + "epoch": 0.24273284556060906, + "step": 2455, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.24273284556060906, + "step": 2455, + "train/total_loss": 0.24768179655075073 + }, + { + "entropy": 9.814481735229492, + "epoch": 0.24283171841012458, + "mean_token_accuracy": 0.6863710880279541, + "num_tokens": 12782738.0, + "step": 2456, + "train/ce_loss": 3.664860969365691e-06 + }, + { + "epoch": 0.24283171841012458, + "step": 2456, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.24283171841012458, + "step": 2456, + "train/total_loss": 0.0273441169410944 + }, + { + "entropy": 9.205331802368164, + "epoch": 0.24293059125964012, + "mean_token_accuracy": 0.7039626836776733, + "num_tokens": 12788094.0, + "step": 2457, + "train/ce_loss": 0.9541040658950806 + }, + { + "epoch": 0.24293059125964012, + "step": 2457, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.24293059125964012, + "step": 2457, + "train/total_loss": 0.20087915658950806 + }, + { + "entropy": 8.715977668762207, + "epoch": 0.24302946410915563, + "mean_token_accuracy": 0.7298076748847961, + "num_tokens": 12793628.0, + "step": 2458, + "train/ce_loss": 0.3996654152870178 + }, + { + "epoch": 0.24302946410915563, + "step": 2458, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.24302946410915563, + "step": 2458, + "train/total_loss": 0.06340403854846954 + }, + { + "entropy": 8.838790893554688, + "epoch": 0.24312833695867114, + "mean_token_accuracy": 0.711033284664154, + "num_tokens": 12799246.0, + "step": 2459, + "train/ce_loss": 0.5818819403648376 + }, + { + "epoch": 0.24312833695867114, + "step": 2459, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.24312833695867114, + "step": 2459, + "train/total_loss": 0.12459444999694824 + }, + { + "epoch": 0.24322720980818668, + "grad_norm": 0.944430410861969, + "learning_rate": 9.394501310389161e-06, + "loss": 0.1629, + "step": 2460 + }, + { + "entropy": 9.599706649780273, + "epoch": 0.24322720980818668, + "mean_token_accuracy": 0.7435455918312073, + "num_tokens": 12804276.0, + "step": 2460, + "train/ce_loss": 1.0893281698226929 + }, + { + "epoch": 0.24322720980818668, + "step": 2460, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.24322720980818668, + "step": 2460, + "train/total_loss": 0.14799532294273376 + }, + { + "entropy": 9.560248374938965, + "epoch": 0.2433260826577022, + "mean_token_accuracy": 0.728787899017334, + "num_tokens": 12809374.0, + "step": 2461, + "train/ce_loss": 0.5702160596847534 + }, + { + "epoch": 0.2433260826577022, + "step": 2461, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.2433260826577022, + "step": 2461, + "train/total_loss": 0.1234278529882431 + }, + { + "entropy": 9.090559959411621, + "epoch": 0.2434249555072177, + "mean_token_accuracy": 0.7119021415710449, + "num_tokens": 12814732.0, + "step": 2462, + "train/ce_loss": 0.943033754825592 + }, + { + "epoch": 0.2434249555072177, + "step": 2462, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.2434249555072177, + "step": 2462, + "train/total_loss": 0.21149086952209473 + }, + { + "entropy": 9.076742172241211, + "epoch": 0.24352382835673325, + "mean_token_accuracy": 0.7723840475082397, + "num_tokens": 12820124.0, + "step": 2463, + "train/ce_loss": 0.5626240968704224 + }, + { + "epoch": 0.24352382835673325, + "step": 2463, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.24352382835673325, + "step": 2463, + "train/total_loss": 0.15001240372657776 + }, + { + "entropy": 8.93365478515625, + "epoch": 0.24362270120624877, + "mean_token_accuracy": 0.7019027471542358, + "num_tokens": 12825515.0, + "step": 2464, + "train/ce_loss": 2.3353025913238525 + }, + { + "epoch": 0.24362270120624877, + "step": 2464, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.24362270120624877, + "step": 2464, + "train/total_loss": 0.36243653297424316 + }, + { + "entropy": 9.150115966796875, + "epoch": 0.24372157405576428, + "mean_token_accuracy": 0.7881944179534912, + "num_tokens": 12830882.0, + "step": 2465, + "train/ce_loss": 0.7780004143714905 + }, + { + "epoch": 0.24372157405576428, + "step": 2465, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.24372157405576428, + "step": 2465, + "train/total_loss": 0.17155003547668457 + }, + { + "entropy": 9.220715522766113, + "epoch": 0.24382044690527982, + "mean_token_accuracy": 0.7518518567085266, + "num_tokens": 12836315.0, + "step": 2466, + "train/ce_loss": 0.7762071490287781 + }, + { + "epoch": 0.24382044690527982, + "step": 2466, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.24382044690527982, + "step": 2466, + "train/total_loss": 0.1362144649028778 + }, + { + "entropy": 9.594511985778809, + "epoch": 0.24391931975479533, + "mean_token_accuracy": 0.7945205569267273, + "num_tokens": 12841377.0, + "step": 2467, + "train/ce_loss": 0.9444525241851807 + }, + { + "epoch": 0.24391931975479533, + "step": 2467, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.24391931975479533, + "step": 2467, + "train/total_loss": 0.14522650837898254 + }, + { + "entropy": 9.246139526367188, + "epoch": 0.24401819260431085, + "mean_token_accuracy": 0.7575757503509521, + "num_tokens": 12846724.0, + "step": 2468, + "train/ce_loss": 0.9505940079689026 + }, + { + "epoch": 0.24401819260431085, + "step": 2468, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.24401819260431085, + "step": 2468, + "train/total_loss": 0.20052814483642578 + }, + { + "entropy": 9.038537979125977, + "epoch": 0.2441170654538264, + "mean_token_accuracy": 0.7349112629890442, + "num_tokens": 12852019.0, + "step": 2469, + "train/ce_loss": 1.0029042959213257 + }, + { + "epoch": 0.2441170654538264, + "step": 2469, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.2441170654538264, + "step": 2469, + "train/total_loss": 0.1471654325723648 + }, + { + "entropy": 10.116767883300781, + "epoch": 0.2442159383033419, + "mean_token_accuracy": 0.792682945728302, + "num_tokens": 12856759.0, + "step": 2470, + "train/ce_loss": 0.8636196255683899 + }, + { + "epoch": 0.2442159383033419, + "step": 2470, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.2442159383033419, + "step": 2470, + "train/total_loss": 0.15276822447776794 + }, + { + "entropy": 9.359464645385742, + "epoch": 0.24431481115285741, + "mean_token_accuracy": 0.6978609561920166, + "num_tokens": 12861969.0, + "step": 2471, + "train/ce_loss": 0.5816138982772827 + }, + { + "epoch": 0.24431481115285741, + "step": 2471, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.24431481115285741, + "step": 2471, + "train/total_loss": 0.10503639280796051 + }, + { + "entropy": 8.996708869934082, + "epoch": 0.24441368400237296, + "mean_token_accuracy": 0.7271783947944641, + "num_tokens": 12867383.0, + "step": 2472, + "train/ce_loss": 0.7601787447929382 + }, + { + "epoch": 0.24441368400237296, + "step": 2472, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.24441368400237296, + "step": 2472, + "train/total_loss": 0.18148663640022278 + }, + { + "entropy": 9.336339950561523, + "epoch": 0.24451255685188847, + "mean_token_accuracy": 0.7127272486686707, + "num_tokens": 12872678.0, + "step": 2473, + "train/ce_loss": 0.5133230686187744 + }, + { + "epoch": 0.24451255685188847, + "step": 2473, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.24451255685188847, + "step": 2473, + "train/total_loss": 0.07867605984210968 + }, + { + "entropy": 9.151508331298828, + "epoch": 0.24461142970140398, + "mean_token_accuracy": 0.7403314709663391, + "num_tokens": 12878006.0, + "step": 2474, + "train/ce_loss": 0.4109875559806824 + }, + { + "epoch": 0.24461142970140398, + "step": 2474, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.24461142970140398, + "step": 2474, + "train/total_loss": 0.15438000857830048 + }, + { + "entropy": 9.76147174835205, + "epoch": 0.24471030255091952, + "mean_token_accuracy": 0.748633861541748, + "num_tokens": 12883008.0, + "step": 2475, + "train/ce_loss": 1.5969499349594116 + }, + { + "epoch": 0.24471030255091952, + "step": 2475, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.24471030255091952, + "step": 2475, + "train/total_loss": 0.21438249945640564 + }, + { + "entropy": 8.862588882446289, + "epoch": 0.24480917540043504, + "mean_token_accuracy": 0.7681007385253906, + "num_tokens": 12888452.0, + "step": 2476, + "train/ce_loss": 0.4870178997516632 + }, + { + "epoch": 0.24480917540043504, + "step": 2476, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.24480917540043504, + "step": 2476, + "train/total_loss": 0.10338929295539856 + }, + { + "entropy": 9.651182174682617, + "epoch": 0.24490804824995058, + "mean_token_accuracy": 0.7303754091262817, + "num_tokens": 12893440.0, + "step": 2477, + "train/ce_loss": 0.9295541644096375 + }, + { + "epoch": 0.24490804824995058, + "step": 2477, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.24490804824995058, + "step": 2477, + "train/total_loss": 0.14373666048049927 + }, + { + "entropy": 9.563039779663086, + "epoch": 0.2450069210994661, + "mean_token_accuracy": 0.6873977184295654, + "num_tokens": 12898499.0, + "step": 2478, + "train/ce_loss": 3.324387989778188e-06 + }, + { + "epoch": 0.2450069210994661, + "step": 2478, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.2450069210994661, + "step": 2478, + "train/total_loss": 0.03125033155083656 + }, + { + "entropy": 9.432746887207031, + "epoch": 0.2451057939489816, + "mean_token_accuracy": 0.6913043260574341, + "num_tokens": 12903646.0, + "step": 2479, + "train/ce_loss": 5.7135166571242735e-06 + }, + { + "epoch": 0.2451057939489816, + "step": 2479, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.2451057939489816, + "step": 2479, + "train/total_loss": 0.07031307369470596 + }, + { + "epoch": 0.24520466679849715, + "grad_norm": 0.927836000919342, + "learning_rate": 9.389556445631213e-06, + "loss": 0.1475, + "step": 2480 + }, + { + "entropy": 9.195361137390137, + "epoch": 0.24520466679849715, + "mean_token_accuracy": 0.7652068138122559, + "num_tokens": 12908980.0, + "step": 2480, + "train/ce_loss": 0.8688988089561462 + }, + { + "epoch": 0.24520466679849715, + "step": 2480, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.24520466679849715, + "step": 2480, + "train/total_loss": 0.11032738536596298 + }, + { + "entropy": 9.80865478515625, + "epoch": 0.24530353964801266, + "mean_token_accuracy": 0.7228682041168213, + "num_tokens": 12913917.0, + "step": 2481, + "train/ce_loss": 0.9114884734153748 + }, + { + "epoch": 0.24530353964801266, + "step": 2481, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.24530353964801266, + "step": 2481, + "train/total_loss": 0.15755510330200195 + }, + { + "entropy": 8.944324493408203, + "epoch": 0.24540241249752817, + "mean_token_accuracy": 0.7332035303115845, + "num_tokens": 12919367.0, + "step": 2482, + "train/ce_loss": 0.8474249243736267 + }, + { + "epoch": 0.24540241249752817, + "step": 2482, + "train/sim_loss": 0.15625 + }, + { + "epoch": 0.24540241249752817, + "step": 2482, + "train/total_loss": 0.2409924864768982 + }, + { + "entropy": 9.81283187866211, + "epoch": 0.2455012853470437, + "mean_token_accuracy": 0.717756986618042, + "num_tokens": 12924512.0, + "step": 2483, + "train/ce_loss": 1.4316576719284058 + }, + { + "epoch": 0.2455012853470437, + "step": 2483, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.2455012853470437, + "step": 2483, + "train/total_loss": 0.2681657671928406 + }, + { + "entropy": 9.55518913269043, + "epoch": 0.24560015819655923, + "mean_token_accuracy": 0.695588231086731, + "num_tokens": 12929645.0, + "step": 2484, + "train/ce_loss": 1.2374972105026245 + }, + { + "epoch": 0.24560015819655923, + "step": 2484, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.24560015819655923, + "step": 2484, + "train/total_loss": 0.1940622329711914 + }, + { + "entropy": 9.15027904510498, + "epoch": 0.24569903104607474, + "mean_token_accuracy": 0.7681818008422852, + "num_tokens": 12935026.0, + "step": 2485, + "train/ce_loss": 1.2009178400039673 + }, + { + "epoch": 0.24569903104607474, + "step": 2485, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.24569903104607474, + "step": 2485, + "train/total_loss": 0.20212304592132568 + }, + { + "entropy": 9.405292510986328, + "epoch": 0.24579790389559028, + "mean_token_accuracy": 0.7732558250427246, + "num_tokens": 12940185.0, + "step": 2486, + "train/ce_loss": 0.4908983111381531 + }, + { + "epoch": 0.24579790389559028, + "step": 2486, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.24579790389559028, + "step": 2486, + "train/total_loss": 0.09987108409404755 + }, + { + "entropy": 9.505070686340332, + "epoch": 0.2458967767451058, + "mean_token_accuracy": 0.7264000177383423, + "num_tokens": 12945220.0, + "step": 2487, + "train/ce_loss": 1.4660943746566772 + }, + { + "epoch": 0.2458967767451058, + "step": 2487, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.2458967767451058, + "step": 2487, + "train/total_loss": 0.27160942554473877 + }, + { + "entropy": 9.689166069030762, + "epoch": 0.2459956495946213, + "mean_token_accuracy": 0.785263180732727, + "num_tokens": 12950181.0, + "step": 2488, + "train/ce_loss": 1.524032473564148 + }, + { + "epoch": 0.2459956495946213, + "step": 2488, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.2459956495946213, + "step": 2488, + "train/total_loss": 0.18755950033664703 + }, + { + "entropy": 9.337489128112793, + "epoch": 0.24609452244413685, + "mean_token_accuracy": 0.7402597665786743, + "num_tokens": 12955430.0, + "step": 2489, + "train/ce_loss": 1.362806797027588 + }, + { + "epoch": 0.24609452244413685, + "step": 2489, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.24609452244413685, + "step": 2489, + "train/total_loss": 0.20659318566322327 + }, + { + "entropy": 9.787436485290527, + "epoch": 0.24619339529365236, + "mean_token_accuracy": 0.75789475440979, + "num_tokens": 12960340.0, + "step": 2490, + "train/ce_loss": 1.1043155193328857 + }, + { + "epoch": 0.24619339529365236, + "step": 2490, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.24619339529365236, + "step": 2490, + "train/total_loss": 0.18855655193328857 + }, + { + "entropy": 9.608366012573242, + "epoch": 0.24629226814316788, + "mean_token_accuracy": 0.7488986849784851, + "num_tokens": 12965472.0, + "step": 2491, + "train/ce_loss": 1.2403558492660522 + }, + { + "epoch": 0.24629226814316788, + "step": 2491, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.24629226814316788, + "step": 2491, + "train/total_loss": 0.19434809684753418 + }, + { + "entropy": 9.125654220581055, + "epoch": 0.24639114099268342, + "mean_token_accuracy": 0.7756613492965698, + "num_tokens": 12970877.0, + "step": 2492, + "train/ce_loss": 0.8300853371620178 + }, + { + "epoch": 0.24639114099268342, + "step": 2492, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.24639114099268342, + "step": 2492, + "train/total_loss": 0.1611335277557373 + }, + { + "entropy": 9.14429759979248, + "epoch": 0.24649001384219893, + "mean_token_accuracy": 0.7069351077079773, + "num_tokens": 12976191.0, + "step": 2493, + "train/ce_loss": 0.6313163638114929 + }, + { + "epoch": 0.24649001384219893, + "step": 2493, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.24649001384219893, + "step": 2493, + "train/total_loss": 0.12563163042068481 + }, + { + "entropy": 9.490312576293945, + "epoch": 0.24658888669171444, + "mean_token_accuracy": 0.7224606871604919, + "num_tokens": 12981315.0, + "step": 2494, + "train/ce_loss": 1.229867696762085 + }, + { + "epoch": 0.24658888669171444, + "step": 2494, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.24658888669171444, + "step": 2494, + "train/total_loss": 0.23236176371574402 + }, + { + "entropy": 9.757224082946777, + "epoch": 0.24668775954122998, + "mean_token_accuracy": 0.7421150207519531, + "num_tokens": 12986288.0, + "step": 2495, + "train/ce_loss": 0.757111132144928 + }, + { + "epoch": 0.24668775954122998, + "step": 2495, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.24668775954122998, + "step": 2495, + "train/total_loss": 0.10305486619472504 + }, + { + "entropy": 9.02226448059082, + "epoch": 0.2467866323907455, + "mean_token_accuracy": 0.6985596418380737, + "num_tokens": 12991727.0, + "step": 2496, + "train/ce_loss": 0.681783139705658 + }, + { + "epoch": 0.2467866323907455, + "step": 2496, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.2467866323907455, + "step": 2496, + "train/total_loss": 0.10724081844091415 + }, + { + "entropy": 9.69623851776123, + "epoch": 0.246885505240261, + "mean_token_accuracy": 0.7811993360519409, + "num_tokens": 12996742.0, + "step": 2497, + "train/ce_loss": 0.8984249234199524 + }, + { + "epoch": 0.246885505240261, + "step": 2497, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.246885505240261, + "step": 2497, + "train/total_loss": 0.14843624830245972 + }, + { + "entropy": 9.031787872314453, + "epoch": 0.24698437808977655, + "mean_token_accuracy": 0.6890459656715393, + "num_tokens": 13002119.0, + "step": 2498, + "train/ce_loss": 0.5838040709495544 + }, + { + "epoch": 0.24698437808977655, + "step": 2498, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.24698437808977655, + "step": 2498, + "train/total_loss": 0.15603666007518768 + }, + { + "entropy": 8.913003921508789, + "epoch": 0.24708325093929207, + "mean_token_accuracy": 0.7412280440330505, + "num_tokens": 13007483.0, + "step": 2499, + "train/ce_loss": 1.175858736038208 + }, + { + "epoch": 0.24708325093929207, + "step": 2499, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.24708325093929207, + "step": 2499, + "train/total_loss": 0.17227336764335632 + }, + { + "epoch": 0.2471821237888076, + "grad_norm": 0.7621631026268005, + "learning_rate": 9.384611580873264e-06, + "loss": 0.1552, + "step": 2500 + }, + { + "entropy": 8.792287826538086, + "epoch": 0.2471821237888076, + "mean_token_accuracy": 0.7477656602859497, + "num_tokens": 13012953.0, + "step": 2500, + "train/ce_loss": 1.028020977973938 + }, + { + "epoch": 0.2471821237888076, + "step": 2500, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.2471821237888076, + "step": 2500, + "train/total_loss": 0.1653020977973938 + }, + { + "entropy": 9.223542213439941, + "epoch": 0.24728099663832312, + "mean_token_accuracy": 0.7072847485542297, + "num_tokens": 13018167.0, + "step": 2501, + "train/ce_loss": 0.8749057650566101 + }, + { + "epoch": 0.24728099663832312, + "step": 2501, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.24728099663832312, + "step": 2501, + "train/total_loss": 0.12655308842658997 + }, + { + "entropy": 9.162238121032715, + "epoch": 0.24737986948783863, + "mean_token_accuracy": 0.7095178961753845, + "num_tokens": 13023442.0, + "step": 2502, + "train/ce_loss": 0.6095376014709473 + }, + { + "epoch": 0.24737986948783863, + "step": 2502, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.24737986948783863, + "step": 2502, + "train/total_loss": 0.1429850161075592 + }, + { + "entropy": 9.14134693145752, + "epoch": 0.24747874233735417, + "mean_token_accuracy": 0.7111111283302307, + "num_tokens": 13028807.0, + "step": 2503, + "train/ce_loss": 0.769873857498169 + }, + { + "epoch": 0.24747874233735417, + "step": 2503, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.24747874233735417, + "step": 2503, + "train/total_loss": 0.1238623857498169 + }, + { + "entropy": 9.429458618164062, + "epoch": 0.2475776151868697, + "mean_token_accuracy": 0.7438271641731262, + "num_tokens": 13033910.0, + "step": 2504, + "train/ce_loss": 0.831933319568634 + }, + { + "epoch": 0.2475776151868697, + "step": 2504, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.2475776151868697, + "step": 2504, + "train/total_loss": 0.1222558319568634 + }, + { + "entropy": 9.209266662597656, + "epoch": 0.2476764880363852, + "mean_token_accuracy": 0.761083722114563, + "num_tokens": 13039171.0, + "step": 2505, + "train/ce_loss": 0.9590550065040588 + }, + { + "epoch": 0.2476764880363852, + "step": 2505, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.2476764880363852, + "step": 2505, + "train/total_loss": 0.18574926257133484 + }, + { + "entropy": 10.064163208007812, + "epoch": 0.24777536088590074, + "mean_token_accuracy": 0.7623762488365173, + "num_tokens": 13043995.0, + "step": 2506, + "train/ce_loss": 6.885237326059723e-06 + }, + { + "epoch": 0.24777536088590074, + "step": 2506, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.24777536088590074, + "step": 2506, + "train/total_loss": 0.046875689178705215 + }, + { + "entropy": 9.656054496765137, + "epoch": 0.24787423373541626, + "mean_token_accuracy": 0.7292993664741516, + "num_tokens": 13049047.0, + "step": 2507, + "train/ce_loss": 0.7390437722206116 + }, + { + "epoch": 0.24787423373541626, + "step": 2507, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.24787423373541626, + "step": 2507, + "train/total_loss": 0.1207793802022934 + }, + { + "entropy": 10.271624565124512, + "epoch": 0.24797310658493177, + "mean_token_accuracy": 0.7892976403236389, + "num_tokens": 13053750.0, + "step": 2508, + "train/ce_loss": 1.015043020248413 + }, + { + "epoch": 0.24797310658493177, + "step": 2508, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.24797310658493177, + "step": 2508, + "train/total_loss": 0.16400429606437683 + }, + { + "entropy": 10.410440444946289, + "epoch": 0.2480719794344473, + "mean_token_accuracy": 0.7052238583564758, + "num_tokens": 13058442.0, + "step": 2509, + "train/ce_loss": 3.608438730239868 + }, + { + "epoch": 0.2480719794344473, + "step": 2509, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.2480719794344473, + "step": 2509, + "train/total_loss": 0.43506261706352234 + }, + { + "entropy": 9.161474227905273, + "epoch": 0.24817085228396282, + "mean_token_accuracy": 0.7288776636123657, + "num_tokens": 13063739.0, + "step": 2510, + "train/ce_loss": 1.0487580299377441 + }, + { + "epoch": 0.24817085228396282, + "step": 2510, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.24817085228396282, + "step": 2510, + "train/total_loss": 0.15565705299377441 + }, + { + "entropy": 10.024190902709961, + "epoch": 0.24826972513347834, + "mean_token_accuracy": 0.7843601703643799, + "num_tokens": 13068589.0, + "step": 2511, + "train/ce_loss": 2.9918932341388427e-05 + }, + { + "epoch": 0.24826972513347834, + "step": 2511, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.24826972513347834, + "step": 2511, + "train/total_loss": 0.06250299513339996 + }, + { + "entropy": 9.230805397033691, + "epoch": 0.24836859798299388, + "mean_token_accuracy": 0.7678160667419434, + "num_tokens": 13073950.0, + "step": 2512, + "train/ce_loss": 0.6158214807510376 + }, + { + "epoch": 0.24836859798299388, + "step": 2512, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.24836859798299388, + "step": 2512, + "train/total_loss": 0.11236339807510376 + }, + { + "entropy": 9.491394996643066, + "epoch": 0.2484674708325094, + "mean_token_accuracy": 0.7425249218940735, + "num_tokens": 13078989.0, + "step": 2513, + "train/ce_loss": 4.977840035280678e-06 + }, + { + "epoch": 0.2484674708325094, + "step": 2513, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.2484674708325094, + "step": 2513, + "train/total_loss": 0.0820317491889 + }, + { + "entropy": 9.507766723632812, + "epoch": 0.2485663436820249, + "mean_token_accuracy": 0.7412790656089783, + "num_tokens": 13084140.0, + "step": 2514, + "train/ce_loss": 4.862940841121599e-06 + }, + { + "epoch": 0.2485663436820249, + "step": 2514, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.2485663436820249, + "step": 2514, + "train/total_loss": 0.0664067342877388 + }, + { + "entropy": 9.013659477233887, + "epoch": 0.24866521653154045, + "mean_token_accuracy": 0.7486842274665833, + "num_tokens": 13089351.0, + "step": 2515, + "train/ce_loss": 0.6694341897964478 + }, + { + "epoch": 0.24866521653154045, + "step": 2515, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.24866521653154045, + "step": 2515, + "train/total_loss": 0.16069342195987701 + }, + { + "entropy": 9.053864479064941, + "epoch": 0.24876408938105596, + "mean_token_accuracy": 0.8036322593688965, + "num_tokens": 13094670.0, + "step": 2516, + "train/ce_loss": 0.5531548857688904 + }, + { + "epoch": 0.24876408938105596, + "step": 2516, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.24876408938105596, + "step": 2516, + "train/total_loss": 0.07484674453735352 + }, + { + "entropy": 9.353614807128906, + "epoch": 0.24886296223057147, + "mean_token_accuracy": 0.7054973840713501, + "num_tokens": 13099916.0, + "step": 2517, + "train/ce_loss": 4.7901185098453425e-06 + }, + { + "epoch": 0.24886296223057147, + "step": 2517, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.24886296223057147, + "step": 2517, + "train/total_loss": 0.0703129768371582 + }, + { + "entropy": 9.010061264038086, + "epoch": 0.248961835080087, + "mean_token_accuracy": 0.7446569204330444, + "num_tokens": 13105308.0, + "step": 2518, + "train/ce_loss": 0.6093876361846924 + }, + { + "epoch": 0.248961835080087, + "step": 2518, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.248961835080087, + "step": 2518, + "train/total_loss": 0.088282510638237 + }, + { + "entropy": 9.791950225830078, + "epoch": 0.24906070792960253, + "mean_token_accuracy": 0.7227926254272461, + "num_tokens": 13110241.0, + "step": 2519, + "train/ce_loss": 1.5051255226135254 + }, + { + "epoch": 0.24906070792960253, + "step": 2519, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.24906070792960253, + "step": 2519, + "train/total_loss": 0.22082506120204926 + }, + { + "epoch": 0.24915958077911807, + "grad_norm": 1.0966280698776245, + "learning_rate": 9.379666716115316e-06, + "loss": 0.1507, + "step": 2520 + }, + { + "entropy": 9.009889602661133, + "epoch": 0.24915958077911807, + "mean_token_accuracy": 0.7174638509750366, + "num_tokens": 13115607.0, + "step": 2520, + "train/ce_loss": 0.9668665528297424 + }, + { + "epoch": 0.24915958077911807, + "step": 2520, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.24915958077911807, + "step": 2520, + "train/total_loss": 0.23340541124343872 + }, + { + "entropy": 9.36453628540039, + "epoch": 0.24925845362863358, + "mean_token_accuracy": 0.7572413682937622, + "num_tokens": 13120805.0, + "step": 2521, + "train/ce_loss": 0.5397550463676453 + }, + { + "epoch": 0.24925845362863358, + "step": 2521, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.24925845362863358, + "step": 2521, + "train/total_loss": 0.12038175761699677 + }, + { + "entropy": 9.566434860229492, + "epoch": 0.2493573264781491, + "mean_token_accuracy": 0.693708598613739, + "num_tokens": 13125908.0, + "step": 2522, + "train/ce_loss": 6.7832647800969426e-06 + }, + { + "epoch": 0.2493573264781491, + "step": 2522, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.2493573264781491, + "step": 2522, + "train/total_loss": 0.06250067800283432 + }, + { + "entropy": 9.14031982421875, + "epoch": 0.24945619932766464, + "mean_token_accuracy": 0.7293233275413513, + "num_tokens": 13131177.0, + "step": 2523, + "train/ce_loss": 0.9928783774375916 + }, + { + "epoch": 0.24945619932766464, + "step": 2523, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.24945619932766464, + "step": 2523, + "train/total_loss": 0.14616283774375916 + }, + { + "entropy": 9.486601829528809, + "epoch": 0.24955507217718015, + "mean_token_accuracy": 0.72398841381073, + "num_tokens": 13136337.0, + "step": 2524, + "train/ce_loss": 1.0188546180725098 + }, + { + "epoch": 0.24955507217718015, + "step": 2524, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.24955507217718015, + "step": 2524, + "train/total_loss": 0.12922921776771545 + }, + { + "entropy": 9.250629425048828, + "epoch": 0.24965394502669566, + "mean_token_accuracy": 0.6994082927703857, + "num_tokens": 13141661.0, + "step": 2525, + "train/ce_loss": 0.5898566246032715 + }, + { + "epoch": 0.24965394502669566, + "step": 2525, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.24965394502669566, + "step": 2525, + "train/total_loss": 0.1410169154405594 + }, + { + "entropy": 8.768533706665039, + "epoch": 0.2497528178762112, + "mean_token_accuracy": 0.7286995649337769, + "num_tokens": 13147085.0, + "step": 2526, + "train/ce_loss": 1.1483701467514038 + }, + { + "epoch": 0.2497528178762112, + "step": 2526, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.2497528178762112, + "step": 2526, + "train/total_loss": 0.19296202063560486 + }, + { + "entropy": 9.395888328552246, + "epoch": 0.24985169072572672, + "mean_token_accuracy": 0.7089946866035461, + "num_tokens": 13152328.0, + "step": 2527, + "train/ce_loss": 1.4271979331970215 + }, + { + "epoch": 0.24985169072572672, + "step": 2527, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.24985169072572672, + "step": 2527, + "train/total_loss": 0.1974072903394699 + }, + { + "entropy": 9.114011764526367, + "epoch": 0.24995056357524223, + "mean_token_accuracy": 0.7521929740905762, + "num_tokens": 13157862.0, + "step": 2528, + "train/ce_loss": 0.9586581587791443 + }, + { + "epoch": 0.24995056357524223, + "step": 2528, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.24995056357524223, + "step": 2528, + "train/total_loss": 0.20914706587791443 + }, + { + "entropy": 10.094575881958008, + "epoch": 0.25004943642475774, + "mean_token_accuracy": 0.7950819730758667, + "num_tokens": 13162524.0, + "step": 2529, + "train/ce_loss": 9.78577918431256e-06 + }, + { + "epoch": 0.25004943642475774, + "step": 2529, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.25004943642475774, + "step": 2529, + "train/total_loss": 0.054688479751348495 + }, + { + "entropy": 9.44721794128418, + "epoch": 0.2501483092742733, + "mean_token_accuracy": 0.6842923760414124, + "num_tokens": 13167641.0, + "step": 2530, + "train/ce_loss": 0.9610504508018494 + }, + { + "epoch": 0.2501483092742733, + "step": 2530, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.2501483092742733, + "step": 2530, + "train/total_loss": 0.18204253911972046 + }, + { + "entropy": 8.911985397338867, + "epoch": 0.2502471821237888, + "mean_token_accuracy": 0.7421109676361084, + "num_tokens": 13173074.0, + "step": 2531, + "train/ce_loss": 1.0923680067062378 + }, + { + "epoch": 0.2502471821237888, + "step": 2531, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.2502471821237888, + "step": 2531, + "train/total_loss": 0.22251805663108826 + }, + { + "entropy": 9.064131736755371, + "epoch": 0.2503460549733043, + "mean_token_accuracy": 0.7820823192596436, + "num_tokens": 13178415.0, + "step": 2532, + "train/ce_loss": 0.7013729810714722 + }, + { + "epoch": 0.2503460549733043, + "step": 2532, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.2503460549733043, + "step": 2532, + "train/total_loss": 0.16388729214668274 + }, + { + "entropy": 9.00645637512207, + "epoch": 0.25044492782281985, + "mean_token_accuracy": 0.6718266010284424, + "num_tokens": 13183842.0, + "step": 2533, + "train/ce_loss": 0.8548325896263123 + }, + { + "epoch": 0.25044492782281985, + "step": 2533, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.25044492782281985, + "step": 2533, + "train/total_loss": 0.10501451045274734 + }, + { + "entropy": 8.99317741394043, + "epoch": 0.2505438006723354, + "mean_token_accuracy": 0.6922246217727661, + "num_tokens": 13189247.0, + "step": 2534, + "train/ce_loss": 0.7916908860206604 + }, + { + "epoch": 0.2505438006723354, + "step": 2534, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.2505438006723354, + "step": 2534, + "train/total_loss": 0.12995034456253052 + }, + { + "entropy": 9.250624656677246, + "epoch": 0.2506426735218509, + "mean_token_accuracy": 0.744413435459137, + "num_tokens": 13194438.0, + "step": 2535, + "train/ce_loss": 1.1858785152435303 + }, + { + "epoch": 0.2506426735218509, + "step": 2535, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.2506426735218509, + "step": 2535, + "train/total_loss": 0.20452535152435303 + }, + { + "entropy": 8.684514999389648, + "epoch": 0.2507415463713664, + "mean_token_accuracy": 0.7258979082107544, + "num_tokens": 13199940.0, + "step": 2536, + "train/ce_loss": 0.5790791511535645 + }, + { + "epoch": 0.2507415463713664, + "step": 2536, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.2507415463713664, + "step": 2536, + "train/total_loss": 0.10868916660547256 + }, + { + "entropy": 9.351373672485352, + "epoch": 0.25084041922088196, + "mean_token_accuracy": 0.8149210810661316, + "num_tokens": 13205120.0, + "step": 2537, + "train/ce_loss": 0.6165726184844971 + }, + { + "epoch": 0.25084041922088196, + "step": 2537, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.25084041922088196, + "step": 2537, + "train/total_loss": 0.15150101482868195 + }, + { + "entropy": 9.45691204071045, + "epoch": 0.25093929207039745, + "mean_token_accuracy": 0.7552674412727356, + "num_tokens": 13210138.0, + "step": 2538, + "train/ce_loss": 1.2463139295578003 + }, + { + "epoch": 0.25093929207039745, + "step": 2538, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.25093929207039745, + "step": 2538, + "train/total_loss": 0.18322515487670898 + }, + { + "entropy": 8.860736846923828, + "epoch": 0.251038164919913, + "mean_token_accuracy": 0.7325102686882019, + "num_tokens": 13215355.0, + "step": 2539, + "train/ce_loss": 0.6914465427398682 + }, + { + "epoch": 0.251038164919913, + "step": 2539, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.251038164919913, + "step": 2539, + "train/total_loss": 0.13945716619491577 + }, + { + "epoch": 0.25113703776942853, + "grad_norm": 1.002090334892273, + "learning_rate": 9.374721851357365e-06, + "loss": 0.1571, + "step": 2540 + }, + { + "entropy": 9.910036087036133, + "epoch": 0.25113703776942853, + "mean_token_accuracy": 0.7692307829856873, + "num_tokens": 13220275.0, + "step": 2540, + "train/ce_loss": 1.0769728422164917 + }, + { + "epoch": 0.25113703776942853, + "step": 2540, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.25113703776942853, + "step": 2540, + "train/total_loss": 0.1584785282611847 + }, + { + "entropy": 9.055376052856445, + "epoch": 0.251235910618944, + "mean_token_accuracy": 0.7429218292236328, + "num_tokens": 13225675.0, + "step": 2541, + "train/ce_loss": 0.6915692687034607 + }, + { + "epoch": 0.251235910618944, + "step": 2541, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.251235910618944, + "step": 2541, + "train/total_loss": 0.08868817985057831 + }, + { + "entropy": 9.54286003112793, + "epoch": 0.25133478346845955, + "mean_token_accuracy": 0.7547425627708435, + "num_tokens": 13231008.0, + "step": 2542, + "train/ce_loss": 6.5046788222389296e-06 + }, + { + "epoch": 0.25133478346845955, + "step": 2542, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.25133478346845955, + "step": 2542, + "train/total_loss": 0.05078190192580223 + }, + { + "entropy": 9.195383071899414, + "epoch": 0.2514336563179751, + "mean_token_accuracy": 0.701564371585846, + "num_tokens": 13236334.0, + "step": 2543, + "train/ce_loss": 1.094359040260315 + }, + { + "epoch": 0.2514336563179751, + "step": 2543, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.2514336563179751, + "step": 2543, + "train/total_loss": 0.21881091594696045 + }, + { + "entropy": 9.518705368041992, + "epoch": 0.2515325291674906, + "mean_token_accuracy": 0.7325581312179565, + "num_tokens": 13241464.0, + "step": 2544, + "train/ce_loss": 0.729640543460846 + }, + { + "epoch": 0.2515325291674906, + "step": 2544, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.2515325291674906, + "step": 2544, + "train/total_loss": 0.13546405732631683 + }, + { + "entropy": 9.59697151184082, + "epoch": 0.2516314020170061, + "mean_token_accuracy": 0.7433234453201294, + "num_tokens": 13246605.0, + "step": 2545, + "train/ce_loss": 1.2695198059082031 + }, + { + "epoch": 0.2516314020170061, + "step": 2545, + "train/sim_loss": 0.19140625 + }, + { + "epoch": 0.2516314020170061, + "step": 2545, + "train/total_loss": 0.31835824251174927 + }, + { + "entropy": 9.31287670135498, + "epoch": 0.25173027486652166, + "mean_token_accuracy": 0.6676300764083862, + "num_tokens": 13251730.0, + "step": 2546, + "train/ce_loss": 1.5146877765655518 + }, + { + "epoch": 0.25173027486652166, + "step": 2546, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.25173027486652166, + "step": 2546, + "train/total_loss": 0.24912503361701965 + }, + { + "entropy": 9.147940635681152, + "epoch": 0.25182914771603715, + "mean_token_accuracy": 0.7267230749130249, + "num_tokens": 13257016.0, + "step": 2547, + "train/ce_loss": 1.3301770687103271 + }, + { + "epoch": 0.25182914771603715, + "step": 2547, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.25182914771603715, + "step": 2547, + "train/total_loss": 0.18379895389080048 + }, + { + "entropy": 9.736891746520996, + "epoch": 0.2519280205655527, + "mean_token_accuracy": 0.7821576595306396, + "num_tokens": 13261915.0, + "step": 2548, + "train/ce_loss": 1.6027624607086182 + }, + { + "epoch": 0.2519280205655527, + "step": 2548, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.2519280205655527, + "step": 2548, + "train/total_loss": 0.25793248414993286 + }, + { + "entropy": 9.215120315551758, + "epoch": 0.25202689341506823, + "mean_token_accuracy": 0.6797829270362854, + "num_tokens": 13267133.0, + "step": 2549, + "train/ce_loss": 1.545188069343567 + }, + { + "epoch": 0.25202689341506823, + "step": 2549, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.25202689341506823, + "step": 2549, + "train/total_loss": 0.27951881289482117 + }, + { + "entropy": 9.126137733459473, + "epoch": 0.2521257662645838, + "mean_token_accuracy": 0.7073459625244141, + "num_tokens": 13272442.0, + "step": 2550, + "train/ce_loss": 0.8889264464378357 + }, + { + "epoch": 0.2521257662645838, + "step": 2550, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.2521257662645838, + "step": 2550, + "train/total_loss": 0.1709238886833191 + }, + { + "entropy": 9.034530639648438, + "epoch": 0.25222463911409926, + "mean_token_accuracy": 0.7701271176338196, + "num_tokens": 13277860.0, + "step": 2551, + "train/ce_loss": 1.3219208717346191 + }, + { + "epoch": 0.25222463911409926, + "step": 2551, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.25222463911409926, + "step": 2551, + "train/total_loss": 0.21812959015369415 + }, + { + "entropy": 9.521261215209961, + "epoch": 0.2523235119636148, + "mean_token_accuracy": 0.7627627849578857, + "num_tokens": 13282954.0, + "step": 2552, + "train/ce_loss": 0.8999001383781433 + }, + { + "epoch": 0.2523235119636148, + "step": 2552, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.2523235119636148, + "step": 2552, + "train/total_loss": 0.12124001234769821 + }, + { + "entropy": 9.218066215515137, + "epoch": 0.25242238481313034, + "mean_token_accuracy": 0.7368420958518982, + "num_tokens": 13288302.0, + "step": 2553, + "train/ce_loss": 0.7164695858955383 + }, + { + "epoch": 0.25242238481313034, + "step": 2553, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.25242238481313034, + "step": 2553, + "train/total_loss": 0.12633445858955383 + }, + { + "entropy": 10.222471237182617, + "epoch": 0.2525212576626458, + "mean_token_accuracy": 0.7066974639892578, + "num_tokens": 13293085.0, + "step": 2554, + "train/ce_loss": 1.2147566080093384 + }, + { + "epoch": 0.2525212576626458, + "step": 2554, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.2525212576626458, + "step": 2554, + "train/total_loss": 0.17225691676139832 + }, + { + "entropy": 9.82444953918457, + "epoch": 0.25262013051216137, + "mean_token_accuracy": 0.7295082211494446, + "num_tokens": 13297993.0, + "step": 2555, + "train/ce_loss": 4.79925893159816e-06 + }, + { + "epoch": 0.25262013051216137, + "step": 2555, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.25262013051216137, + "step": 2555, + "train/total_loss": 0.0312504805624485 + }, + { + "entropy": 9.691442489624023, + "epoch": 0.2527190033616769, + "mean_token_accuracy": 0.7115384340286255, + "num_tokens": 13303070.0, + "step": 2556, + "train/ce_loss": 1.222868800163269 + }, + { + "epoch": 0.2527190033616769, + "step": 2556, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.2527190033616769, + "step": 2556, + "train/total_loss": 0.2629118859767914 + }, + { + "entropy": 9.206101417541504, + "epoch": 0.2528178762111924, + "mean_token_accuracy": 0.7218863368034363, + "num_tokens": 13308338.0, + "step": 2557, + "train/ce_loss": 0.688447117805481 + }, + { + "epoch": 0.2528178762111924, + "step": 2557, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.2528178762111924, + "step": 2557, + "train/total_loss": 0.15868845582008362 + }, + { + "entropy": 9.227208137512207, + "epoch": 0.25291674906070793, + "mean_token_accuracy": 0.7322468161582947, + "num_tokens": 13313619.0, + "step": 2558, + "train/ce_loss": 0.6929495930671692 + }, + { + "epoch": 0.25291674906070793, + "step": 2558, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.25291674906070793, + "step": 2558, + "train/total_loss": 0.12007620930671692 + }, + { + "entropy": 9.633859634399414, + "epoch": 0.2530156219102235, + "mean_token_accuracy": 0.6998368501663208, + "num_tokens": 13318682.0, + "step": 2559, + "train/ce_loss": 1.085984706878662 + }, + { + "epoch": 0.2530156219102235, + "step": 2559, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.2530156219102235, + "step": 2559, + "train/total_loss": 0.1867234706878662 + }, + { + "epoch": 0.25311449475973896, + "grad_norm": 1.0176974534988403, + "learning_rate": 9.369776986599417e-06, + "loss": 0.1588, + "step": 2560 + }, + { + "entropy": 8.93287467956543, + "epoch": 0.25311449475973896, + "mean_token_accuracy": 0.7111681699752808, + "num_tokens": 13323908.0, + "step": 2560, + "train/ce_loss": 0.551044762134552 + }, + { + "epoch": 0.25311449475973896, + "step": 2560, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.25311449475973896, + "step": 2560, + "train/total_loss": 0.17229197919368744 + }, + { + "entropy": 9.74870491027832, + "epoch": 0.2532133676092545, + "mean_token_accuracy": 0.7206896543502808, + "num_tokens": 13328845.0, + "step": 2561, + "train/ce_loss": 1.1745083332061768 + }, + { + "epoch": 0.2532133676092545, + "step": 2561, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.2532133676092545, + "step": 2561, + "train/total_loss": 0.19948208332061768 + }, + { + "entropy": 9.688440322875977, + "epoch": 0.25331224045877004, + "mean_token_accuracy": 0.7081910967826843, + "num_tokens": 13333846.0, + "step": 2562, + "train/ce_loss": 1.4161006212234497 + }, + { + "epoch": 0.25331224045877004, + "step": 2562, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.25331224045877004, + "step": 2562, + "train/total_loss": 0.2041100710630417 + }, + { + "entropy": 9.229511260986328, + "epoch": 0.25341111330828553, + "mean_token_accuracy": 0.749685525894165, + "num_tokens": 13339123.0, + "step": 2563, + "train/ce_loss": 5.28743566974299e-06 + }, + { + "epoch": 0.25341111330828553, + "step": 2563, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.25341111330828553, + "step": 2563, + "train/total_loss": 0.05859427899122238 + }, + { + "entropy": 9.266935348510742, + "epoch": 0.25350998615780107, + "mean_token_accuracy": 0.7222222089767456, + "num_tokens": 13344364.0, + "step": 2564, + "train/ce_loss": 0.6972231268882751 + }, + { + "epoch": 0.25350998615780107, + "step": 2564, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.25350998615780107, + "step": 2564, + "train/total_loss": 0.11659731715917587 + }, + { + "entropy": 9.317707061767578, + "epoch": 0.2536088590073166, + "mean_token_accuracy": 0.6845549941062927, + "num_tokens": 13349568.0, + "step": 2565, + "train/ce_loss": 1.2373405694961548 + }, + { + "epoch": 0.2536088590073166, + "step": 2565, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.2536088590073166, + "step": 2565, + "train/total_loss": 0.20967155694961548 + }, + { + "entropy": 9.177165985107422, + "epoch": 0.2537077318568321, + "mean_token_accuracy": 0.6903669834136963, + "num_tokens": 13354883.0, + "step": 2566, + "train/ce_loss": 1.2569265365600586 + }, + { + "epoch": 0.2537077318568321, + "step": 2566, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.2537077318568321, + "step": 2566, + "train/total_loss": 0.23897390067577362 + }, + { + "entropy": 10.023475646972656, + "epoch": 0.25380660470634764, + "mean_token_accuracy": 0.7082067131996155, + "num_tokens": 13359654.0, + "step": 2567, + "train/ce_loss": 3.0070043067098595e-05 + }, + { + "epoch": 0.25380660470634764, + "step": 2567, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.25380660470634764, + "step": 2567, + "train/total_loss": 0.05469050630927086 + }, + { + "entropy": 9.127727508544922, + "epoch": 0.2539054775558632, + "mean_token_accuracy": 0.7402135133743286, + "num_tokens": 13364965.0, + "step": 2568, + "train/ce_loss": 1.4286116361618042 + }, + { + "epoch": 0.2539054775558632, + "step": 2568, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.2539054775558632, + "step": 2568, + "train/total_loss": 0.26786118745803833 + }, + { + "entropy": 9.038599967956543, + "epoch": 0.25400435040537866, + "mean_token_accuracy": 0.679358720779419, + "num_tokens": 13370437.0, + "step": 2569, + "train/ce_loss": 1.4212325811386108 + }, + { + "epoch": 0.25400435040537866, + "step": 2569, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.25400435040537866, + "step": 2569, + "train/total_loss": 0.271029531955719 + }, + { + "entropy": 9.771385192871094, + "epoch": 0.2541032232548942, + "mean_token_accuracy": 0.7594433426856995, + "num_tokens": 13375385.0, + "step": 2570, + "train/ce_loss": 0.8145433664321899 + }, + { + "epoch": 0.2541032232548942, + "step": 2570, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.2541032232548942, + "step": 2570, + "train/total_loss": 0.163485586643219 + }, + { + "entropy": 9.49769115447998, + "epoch": 0.25420209610440975, + "mean_token_accuracy": 0.7037037014961243, + "num_tokens": 13380585.0, + "step": 2571, + "train/ce_loss": 1.4433395862579346 + }, + { + "epoch": 0.25420209610440975, + "step": 2571, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.25420209610440975, + "step": 2571, + "train/total_loss": 0.26933395862579346 + }, + { + "entropy": 9.775506973266602, + "epoch": 0.25430096895392523, + "mean_token_accuracy": 0.7543520331382751, + "num_tokens": 13385558.0, + "step": 2572, + "train/ce_loss": 1.139591932296753 + }, + { + "epoch": 0.25430096895392523, + "step": 2572, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.25430096895392523, + "step": 2572, + "train/total_loss": 0.1998966932296753 + }, + { + "entropy": 9.066909790039062, + "epoch": 0.2543998418034408, + "mean_token_accuracy": 0.7022809386253357, + "num_tokens": 13390861.0, + "step": 2573, + "train/ce_loss": 0.6508738398551941 + }, + { + "epoch": 0.2543998418034408, + "step": 2573, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.2543998418034408, + "step": 2573, + "train/total_loss": 0.10414988547563553 + }, + { + "entropy": 9.451150894165039, + "epoch": 0.2544987146529563, + "mean_token_accuracy": 0.7231404781341553, + "num_tokens": 13396042.0, + "step": 2574, + "train/ce_loss": 3.7908125705143902e-06 + }, + { + "epoch": 0.2544987146529563, + "step": 2574, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.2544987146529563, + "step": 2574, + "train/total_loss": 0.05078162997961044 + }, + { + "entropy": 9.539295196533203, + "epoch": 0.2545975875024718, + "mean_token_accuracy": 0.7214815020561218, + "num_tokens": 13401179.0, + "step": 2575, + "train/ce_loss": 0.5825445055961609 + }, + { + "epoch": 0.2545975875024718, + "step": 2575, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.2545975875024718, + "step": 2575, + "train/total_loss": 0.09731695055961609 + }, + { + "entropy": 8.767807006835938, + "epoch": 0.25469646035198734, + "mean_token_accuracy": 0.754923403263092, + "num_tokens": 13406605.0, + "step": 2576, + "train/ce_loss": 1.4180355072021484 + }, + { + "epoch": 0.25469646035198734, + "step": 2576, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.25469646035198734, + "step": 2576, + "train/total_loss": 0.2238347977399826 + }, + { + "entropy": 9.604494094848633, + "epoch": 0.2547953332015029, + "mean_token_accuracy": 0.7901785969734192, + "num_tokens": 13411903.0, + "step": 2577, + "train/ce_loss": 0.6346091032028198 + }, + { + "epoch": 0.2547953332015029, + "step": 2577, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.2547953332015029, + "step": 2577, + "train/total_loss": 0.11033590883016586 + }, + { + "entropy": 9.478580474853516, + "epoch": 0.25489420605101837, + "mean_token_accuracy": 0.7314949035644531, + "num_tokens": 13417023.0, + "step": 2578, + "train/ce_loss": 1.3560292720794678 + }, + { + "epoch": 0.25489420605101837, + "step": 2578, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.25489420605101837, + "step": 2578, + "train/total_loss": 0.2215404361486435 + }, + { + "entropy": 9.453705787658691, + "epoch": 0.2549930789005339, + "mean_token_accuracy": 0.6845729947090149, + "num_tokens": 13422193.0, + "step": 2579, + "train/ce_loss": 1.7850563526153564 + }, + { + "epoch": 0.2549930789005339, + "step": 2579, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.2549930789005339, + "step": 2579, + "train/total_loss": 0.29959940910339355 + }, + { + "epoch": 0.25509195175004945, + "grad_norm": 1.0193370580673218, + "learning_rate": 9.364832121841468e-06, + "loss": 0.171, + "step": 2580 + }, + { + "entropy": 9.64789867401123, + "epoch": 0.25509195175004945, + "mean_token_accuracy": 0.8156934380531311, + "num_tokens": 13427205.0, + "step": 2580, + "train/ce_loss": 0.7694849371910095 + }, + { + "epoch": 0.25509195175004945, + "step": 2580, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.25509195175004945, + "step": 2580, + "train/total_loss": 0.10038599371910095 + }, + { + "entropy": 9.008233070373535, + "epoch": 0.25519082459956494, + "mean_token_accuracy": 0.764018714427948, + "num_tokens": 13432544.0, + "step": 2581, + "train/ce_loss": 0.773780345916748 + }, + { + "epoch": 0.25519082459956494, + "step": 2581, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.25519082459956494, + "step": 2581, + "train/total_loss": 0.1437842845916748 + }, + { + "entropy": 9.066333770751953, + "epoch": 0.2552896974490805, + "mean_token_accuracy": 0.6932185292243958, + "num_tokens": 13437941.0, + "step": 2582, + "train/ce_loss": 0.7666678428649902 + }, + { + "epoch": 0.2552896974490805, + "step": 2582, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.2552896974490805, + "step": 2582, + "train/total_loss": 0.15088553726673126 + }, + { + "entropy": 9.372958183288574, + "epoch": 0.255388570298596, + "mean_token_accuracy": 0.7142857313156128, + "num_tokens": 13443142.0, + "step": 2583, + "train/ce_loss": 0.8054994344711304 + }, + { + "epoch": 0.255388570298596, + "step": 2583, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.255388570298596, + "step": 2583, + "train/total_loss": 0.135237455368042 + }, + { + "entropy": 9.219785690307617, + "epoch": 0.2554874431481115, + "mean_token_accuracy": 0.6632064580917358, + "num_tokens": 13448477.0, + "step": 2584, + "train/ce_loss": 0.5881164073944092 + }, + { + "epoch": 0.2554874431481115, + "step": 2584, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.2554874431481115, + "step": 2584, + "train/total_loss": 0.12521788477897644 + }, + { + "entropy": 9.666365623474121, + "epoch": 0.25558631599762704, + "mean_token_accuracy": 0.700952410697937, + "num_tokens": 13453434.0, + "step": 2585, + "train/ce_loss": 1.365299940109253 + }, + { + "epoch": 0.25558631599762704, + "step": 2585, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.25558631599762704, + "step": 2585, + "train/total_loss": 0.22246749699115753 + }, + { + "entropy": 8.801751136779785, + "epoch": 0.2556851888471426, + "mean_token_accuracy": 0.6927710771560669, + "num_tokens": 13458880.0, + "step": 2586, + "train/ce_loss": 1.258221983909607 + }, + { + "epoch": 0.2556851888471426, + "step": 2586, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.2556851888471426, + "step": 2586, + "train/total_loss": 0.19613470137119293 + }, + { + "entropy": 9.132341384887695, + "epoch": 0.25578406169665807, + "mean_token_accuracy": 0.7421307563781738, + "num_tokens": 13464184.0, + "step": 2587, + "train/ce_loss": 0.6278568506240845 + }, + { + "epoch": 0.25578406169665807, + "step": 2587, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.25578406169665807, + "step": 2587, + "train/total_loss": 0.12528568506240845 + }, + { + "entropy": 9.190263748168945, + "epoch": 0.2558829345461736, + "mean_token_accuracy": 0.8046783804893494, + "num_tokens": 13469516.0, + "step": 2588, + "train/ce_loss": 0.763489842414856 + }, + { + "epoch": 0.2558829345461736, + "step": 2588, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.2558829345461736, + "step": 2588, + "train/total_loss": 0.09588023275136948 + }, + { + "entropy": 9.274324417114258, + "epoch": 0.25598180739568915, + "mean_token_accuracy": 0.7122641801834106, + "num_tokens": 13474770.0, + "step": 2589, + "train/ce_loss": 0.5519495010375977 + }, + { + "epoch": 0.25598180739568915, + "step": 2589, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.25598180739568915, + "step": 2589, + "train/total_loss": 0.08644495159387589 + }, + { + "entropy": 9.52096176147461, + "epoch": 0.25608068024520464, + "mean_token_accuracy": 0.7340764403343201, + "num_tokens": 13479838.0, + "step": 2590, + "train/ce_loss": 1.3775403499603271 + }, + { + "epoch": 0.25608068024520464, + "step": 2590, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.25608068024520464, + "step": 2590, + "train/total_loss": 0.23150403797626495 + }, + { + "entropy": 9.556732177734375, + "epoch": 0.2561795530947202, + "mean_token_accuracy": 0.699999988079071, + "num_tokens": 13484889.0, + "step": 2591, + "train/ce_loss": 1.050861120223999 + }, + { + "epoch": 0.2561795530947202, + "step": 2591, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.2561795530947202, + "step": 2591, + "train/total_loss": 0.16758611798286438 + }, + { + "entropy": 9.318330764770508, + "epoch": 0.2562784259442357, + "mean_token_accuracy": 0.7173051238059998, + "num_tokens": 13490131.0, + "step": 2592, + "train/ce_loss": 1.2866765260696411 + }, + { + "epoch": 0.2562784259442357, + "step": 2592, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.2562784259442357, + "step": 2592, + "train/total_loss": 0.2224176526069641 + }, + { + "entropy": 9.17322826385498, + "epoch": 0.25637729879375126, + "mean_token_accuracy": 0.7382388710975647, + "num_tokens": 13495374.0, + "step": 2593, + "train/ce_loss": 0.7236014604568481 + }, + { + "epoch": 0.25637729879375126, + "step": 2593, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.25637729879375126, + "step": 2593, + "train/total_loss": 0.11923515051603317 + }, + { + "entropy": 9.930195808410645, + "epoch": 0.25647617164326675, + "mean_token_accuracy": 0.7596566677093506, + "num_tokens": 13500283.0, + "step": 2594, + "train/ce_loss": 0.6788000464439392 + }, + { + "epoch": 0.25647617164326675, + "step": 2594, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.25647617164326675, + "step": 2594, + "train/total_loss": 0.08741125464439392 + }, + { + "entropy": 9.190201759338379, + "epoch": 0.2565750444927823, + "mean_token_accuracy": 0.7316455841064453, + "num_tokens": 13505489.0, + "step": 2595, + "train/ce_loss": 0.7839797139167786 + }, + { + "epoch": 0.2565750444927823, + "step": 2595, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.2565750444927823, + "step": 2595, + "train/total_loss": 0.1565229743719101 + }, + { + "entropy": 9.254472732543945, + "epoch": 0.25667391734229783, + "mean_token_accuracy": 0.7011494040489197, + "num_tokens": 13510986.0, + "step": 2596, + "train/ce_loss": 1.567700982093811 + }, + { + "epoch": 0.25667391734229783, + "step": 2596, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.25667391734229783, + "step": 2596, + "train/total_loss": 0.25052011013031006 + }, + { + "entropy": 9.898000717163086, + "epoch": 0.2567727901918133, + "mean_token_accuracy": 0.725450873374939, + "num_tokens": 13515906.0, + "step": 2597, + "train/ce_loss": 1.308440089225769 + }, + { + "epoch": 0.2567727901918133, + "step": 2597, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.2567727901918133, + "step": 2597, + "train/total_loss": 0.20506276190280914 + }, + { + "entropy": 8.82431697845459, + "epoch": 0.25687166304132886, + "mean_token_accuracy": 0.7403957843780518, + "num_tokens": 13521331.0, + "step": 2598, + "train/ce_loss": 0.4845927059650421 + }, + { + "epoch": 0.25687166304132886, + "step": 2598, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.25687166304132886, + "step": 2598, + "train/total_loss": 0.11095927655696869 + }, + { + "entropy": 9.454943656921387, + "epoch": 0.2569705358908444, + "mean_token_accuracy": 0.791946291923523, + "num_tokens": 13526553.0, + "step": 2599, + "train/ce_loss": 0.6428021192550659 + }, + { + "epoch": 0.2569705358908444, + "step": 2599, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.2569705358908444, + "step": 2599, + "train/total_loss": 0.08771771192550659 + }, + { + "epoch": 0.2570694087403599, + "grad_norm": 0.6037888526916504, + "learning_rate": 9.35988725708352e-06, + "loss": 0.1501, + "step": 2600 + }, + { + "entropy": 9.420867919921875, + "epoch": 0.2570694087403599, + "mean_token_accuracy": 0.751358687877655, + "num_tokens": 13531751.0, + "step": 2600, + "train/ce_loss": 0.953595757484436 + }, + { + "epoch": 0.2570694087403599, + "step": 2600, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.2570694087403599, + "step": 2600, + "train/total_loss": 0.18129707872867584 + }, + { + "entropy": 10.32927131652832, + "epoch": 0.2571682815898754, + "mean_token_accuracy": 0.7687074542045593, + "num_tokens": 13536484.0, + "step": 2601, + "train/ce_loss": 1.3974251747131348 + }, + { + "epoch": 0.2571682815898754, + "step": 2601, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.2571682815898754, + "step": 2601, + "train/total_loss": 0.22958627343177795 + }, + { + "entropy": 9.080526351928711, + "epoch": 0.25726715443939097, + "mean_token_accuracy": 0.7560693621635437, + "num_tokens": 13541822.0, + "step": 2602, + "train/ce_loss": 0.722567081451416 + }, + { + "epoch": 0.25726715443939097, + "step": 2602, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.25726715443939097, + "step": 2602, + "train/total_loss": 0.11913170665502548 + }, + { + "entropy": 9.02514934539795, + "epoch": 0.25736602728890645, + "mean_token_accuracy": 0.7454545497894287, + "num_tokens": 13547143.0, + "step": 2603, + "train/ce_loss": 0.533889889717102 + }, + { + "epoch": 0.25736602728890645, + "step": 2603, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.25736602728890645, + "step": 2603, + "train/total_loss": 0.13932648301124573 + }, + { + "entropy": 10.110696792602539, + "epoch": 0.257464900138422, + "mean_token_accuracy": 0.7905882596969604, + "num_tokens": 13551999.0, + "step": 2604, + "train/ce_loss": 1.1310733556747437 + }, + { + "epoch": 0.257464900138422, + "step": 2604, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.257464900138422, + "step": 2604, + "train/total_loss": 0.2029510885477066 + }, + { + "entropy": 9.495134353637695, + "epoch": 0.25756377298793753, + "mean_token_accuracy": 0.7222222089767456, + "num_tokens": 13556854.0, + "step": 2605, + "train/ce_loss": 2.322931686649099e-05 + }, + { + "epoch": 0.25756377298793753, + "step": 2605, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.25756377298793753, + "step": 2605, + "train/total_loss": 0.05859607458114624 + }, + { + "entropy": 9.316610336303711, + "epoch": 0.257662645837453, + "mean_token_accuracy": 0.8257668614387512, + "num_tokens": 13562154.0, + "step": 2606, + "train/ce_loss": 0.5784516334533691 + }, + { + "epoch": 0.257662645837453, + "step": 2606, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.257662645837453, + "step": 2606, + "train/total_loss": 0.13987641036510468 + }, + { + "entropy": 8.904605865478516, + "epoch": 0.25776151868696856, + "mean_token_accuracy": 0.7675233483314514, + "num_tokens": 13567489.0, + "step": 2607, + "train/ce_loss": 0.9174344539642334 + }, + { + "epoch": 0.25776151868696856, + "step": 2607, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.25776151868696856, + "step": 2607, + "train/total_loss": 0.12689968943595886 + }, + { + "entropy": 9.172128677368164, + "epoch": 0.2578603915364841, + "mean_token_accuracy": 0.738831639289856, + "num_tokens": 13572838.0, + "step": 2608, + "train/ce_loss": 0.7097308039665222 + }, + { + "epoch": 0.2578603915364841, + "step": 2608, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.2578603915364841, + "step": 2608, + "train/total_loss": 0.14128558337688446 + }, + { + "entropy": 10.204506874084473, + "epoch": 0.2579592643859996, + "mean_token_accuracy": 0.7416020631790161, + "num_tokens": 13577616.0, + "step": 2609, + "train/ce_loss": 1.2679545879364014 + }, + { + "epoch": 0.2579592643859996, + "step": 2609, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.2579592643859996, + "step": 2609, + "train/total_loss": 0.1736704558134079 + }, + { + "entropy": 9.232139587402344, + "epoch": 0.25805813723551513, + "mean_token_accuracy": 0.7124842405319214, + "num_tokens": 13582854.0, + "step": 2610, + "train/ce_loss": 1.4001924991607666 + }, + { + "epoch": 0.25805813723551513, + "step": 2610, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.25805813723551513, + "step": 2610, + "train/total_loss": 0.1868942528963089 + }, + { + "entropy": 9.506097793579102, + "epoch": 0.25815701008503067, + "mean_token_accuracy": 0.7235772609710693, + "num_tokens": 13587925.0, + "step": 2611, + "train/ce_loss": 8.264971256721765e-06 + }, + { + "epoch": 0.25815701008503067, + "step": 2611, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.25815701008503067, + "step": 2611, + "train/total_loss": 0.05468832701444626 + }, + { + "entropy": 9.140655517578125, + "epoch": 0.25825588293454615, + "mean_token_accuracy": 0.7585033774375916, + "num_tokens": 13593277.0, + "step": 2612, + "train/ce_loss": 1.0882909297943115 + }, + { + "epoch": 0.25825588293454615, + "step": 2612, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.25825588293454615, + "step": 2612, + "train/total_loss": 0.1869540959596634 + }, + { + "entropy": 9.401294708251953, + "epoch": 0.2583547557840617, + "mean_token_accuracy": 0.7689922451972961, + "num_tokens": 13598355.0, + "step": 2613, + "train/ce_loss": 0.8922169208526611 + }, + { + "epoch": 0.2583547557840617, + "step": 2613, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.2583547557840617, + "step": 2613, + "train/total_loss": 0.14781543612480164 + }, + { + "entropy": 8.947813987731934, + "epoch": 0.25845362863357724, + "mean_token_accuracy": 0.7288659811019897, + "num_tokens": 13603794.0, + "step": 2614, + "train/ce_loss": 0.91164630651474 + }, + { + "epoch": 0.25845362863357724, + "step": 2614, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.25845362863357724, + "step": 2614, + "train/total_loss": 0.11069588363170624 + }, + { + "entropy": 9.560693740844727, + "epoch": 0.2585525014830927, + "mean_token_accuracy": 0.7612208127975464, + "num_tokens": 13608757.0, + "step": 2615, + "train/ce_loss": 0.5483984351158142 + }, + { + "epoch": 0.2585525014830927, + "step": 2615, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.2585525014830927, + "step": 2615, + "train/total_loss": 0.1564023494720459 + }, + { + "entropy": 9.200815200805664, + "epoch": 0.25865137433260826, + "mean_token_accuracy": 0.7626506090164185, + "num_tokens": 13613996.0, + "step": 2616, + "train/ce_loss": 0.8643326759338379 + }, + { + "epoch": 0.25865137433260826, + "step": 2616, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.25865137433260826, + "step": 2616, + "train/total_loss": 0.1411207616329193 + }, + { + "entropy": 9.343541145324707, + "epoch": 0.2587502471821238, + "mean_token_accuracy": 0.7405247688293457, + "num_tokens": 13619308.0, + "step": 2617, + "train/ce_loss": 1.2328028678894043 + }, + { + "epoch": 0.2587502471821238, + "step": 2617, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.2587502471821238, + "step": 2617, + "train/total_loss": 0.19749903678894043 + }, + { + "entropy": 9.184523582458496, + "epoch": 0.2588491200316393, + "mean_token_accuracy": 0.7261761426925659, + "num_tokens": 13624625.0, + "step": 2618, + "train/ce_loss": 0.7512075901031494 + }, + { + "epoch": 0.2588491200316393, + "step": 2618, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.2588491200316393, + "step": 2618, + "train/total_loss": 0.11027701199054718 + }, + { + "entropy": 9.113435745239258, + "epoch": 0.25894799288115483, + "mean_token_accuracy": 0.7227723002433777, + "num_tokens": 13630022.0, + "step": 2619, + "train/ce_loss": 0.7234528064727783 + }, + { + "epoch": 0.25894799288115483, + "step": 2619, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.25894799288115483, + "step": 2619, + "train/total_loss": 0.1270327866077423 + }, + { + "epoch": 0.2590468657306704, + "grad_norm": 0.7478315234184265, + "learning_rate": 9.35494239232557e-06, + "loss": 0.1419, + "step": 2620 + }, + { + "entropy": 9.056140899658203, + "epoch": 0.2590468657306704, + "mean_token_accuracy": 0.7356979250907898, + "num_tokens": 13635382.0, + "step": 2620, + "train/ce_loss": 0.4664541184902191 + }, + { + "epoch": 0.2590468657306704, + "step": 2620, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.2590468657306704, + "step": 2620, + "train/total_loss": 0.07008291780948639 + }, + { + "entropy": 9.462387084960938, + "epoch": 0.25914573858018586, + "mean_token_accuracy": 0.816500723361969, + "num_tokens": 13640490.0, + "step": 2621, + "train/ce_loss": 2.0883476281596813e-06 + }, + { + "epoch": 0.25914573858018586, + "step": 2621, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.25914573858018586, + "step": 2621, + "train/total_loss": 0.039062708616256714 + }, + { + "entropy": 9.461859703063965, + "epoch": 0.2592446114297014, + "mean_token_accuracy": 0.7144906520843506, + "num_tokens": 13645635.0, + "step": 2622, + "train/ce_loss": 1.0317198038101196 + }, + { + "epoch": 0.2592446114297014, + "step": 2622, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.2592446114297014, + "step": 2622, + "train/total_loss": 0.21254697442054749 + }, + { + "entropy": 9.722253799438477, + "epoch": 0.25934348427921694, + "mean_token_accuracy": 0.7546584010124207, + "num_tokens": 13650708.0, + "step": 2623, + "train/ce_loss": 0.48645493388175964 + }, + { + "epoch": 0.25934348427921694, + "step": 2623, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.25934348427921694, + "step": 2623, + "train/total_loss": 0.0994267463684082 + }, + { + "entropy": 9.369498252868652, + "epoch": 0.2594423571287324, + "mean_token_accuracy": 0.8027397394180298, + "num_tokens": 13655927.0, + "step": 2624, + "train/ce_loss": 0.6022058129310608 + }, + { + "epoch": 0.2594423571287324, + "step": 2624, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.2594423571287324, + "step": 2624, + "train/total_loss": 0.13053308427333832 + }, + { + "entropy": 8.932147026062012, + "epoch": 0.25954122997824797, + "mean_token_accuracy": 0.7331118583679199, + "num_tokens": 13661313.0, + "step": 2625, + "train/ce_loss": 0.9405847191810608 + }, + { + "epoch": 0.25954122997824797, + "step": 2625, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.25954122997824797, + "step": 2625, + "train/total_loss": 0.12921473383903503 + }, + { + "entropy": 9.141525268554688, + "epoch": 0.2596401028277635, + "mean_token_accuracy": 0.7060109376907349, + "num_tokens": 13666720.0, + "step": 2626, + "train/ce_loss": 0.9899309873580933 + }, + { + "epoch": 0.2596401028277635, + "step": 2626, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.2596401028277635, + "step": 2626, + "train/total_loss": 0.18102434277534485 + }, + { + "entropy": 8.945329666137695, + "epoch": 0.259738975677279, + "mean_token_accuracy": 0.676986575126648, + "num_tokens": 13672196.0, + "step": 2627, + "train/ce_loss": 0.8837153315544128 + }, + { + "epoch": 0.259738975677279, + "step": 2627, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.259738975677279, + "step": 2627, + "train/total_loss": 0.16259029507637024 + }, + { + "entropy": 9.45634651184082, + "epoch": 0.25983784852679453, + "mean_token_accuracy": 0.7279305458068848, + "num_tokens": 13677492.0, + "step": 2628, + "train/ce_loss": 5.773954853793839e-06 + }, + { + "epoch": 0.25983784852679453, + "step": 2628, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.25983784852679453, + "step": 2628, + "train/total_loss": 0.05468807741999626 + }, + { + "entropy": 9.273452758789062, + "epoch": 0.2599367213763101, + "mean_token_accuracy": 0.7415204644203186, + "num_tokens": 13683001.0, + "step": 2629, + "train/ce_loss": 0.9866542816162109 + }, + { + "epoch": 0.2599367213763101, + "step": 2629, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.2599367213763101, + "step": 2629, + "train/total_loss": 0.20804043114185333 + }, + { + "entropy": 9.282662391662598, + "epoch": 0.26003559422582556, + "mean_token_accuracy": 0.8014616370201111, + "num_tokens": 13688283.0, + "step": 2630, + "train/ce_loss": 0.8849129676818848 + }, + { + "epoch": 0.26003559422582556, + "step": 2630, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.26003559422582556, + "step": 2630, + "train/total_loss": 0.127553790807724 + }, + { + "entropy": 9.507521629333496, + "epoch": 0.2601344670753411, + "mean_token_accuracy": 0.6752265691757202, + "num_tokens": 13693401.0, + "step": 2631, + "train/ce_loss": 1.5338138341903687 + }, + { + "epoch": 0.2601344670753411, + "step": 2631, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.2601344670753411, + "step": 2631, + "train/total_loss": 0.23150639235973358 + }, + { + "entropy": 9.034505844116211, + "epoch": 0.26023333992485664, + "mean_token_accuracy": 0.7340425252914429, + "num_tokens": 13698642.0, + "step": 2632, + "train/ce_loss": 0.5224541425704956 + }, + { + "epoch": 0.26023333992485664, + "step": 2632, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.26023333992485664, + "step": 2632, + "train/total_loss": 0.08740166574716568 + }, + { + "entropy": 9.128165245056152, + "epoch": 0.2603322127743722, + "mean_token_accuracy": 0.7700650691986084, + "num_tokens": 13704167.0, + "step": 2633, + "train/ce_loss": 0.5268516540527344 + }, + { + "epoch": 0.2603322127743722, + "step": 2633, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.2603322127743722, + "step": 2633, + "train/total_loss": 0.12299767136573792 + }, + { + "entropy": 8.632181167602539, + "epoch": 0.26043108562388767, + "mean_token_accuracy": 0.6927223801612854, + "num_tokens": 13709781.0, + "step": 2634, + "train/ce_loss": 0.9153441786766052 + }, + { + "epoch": 0.26043108562388767, + "step": 2634, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.26043108562388767, + "step": 2634, + "train/total_loss": 0.16184692084789276 + }, + { + "entropy": 9.012310028076172, + "epoch": 0.2605299584734032, + "mean_token_accuracy": 0.6920454502105713, + "num_tokens": 13715082.0, + "step": 2635, + "train/ce_loss": 0.9514340758323669 + }, + { + "epoch": 0.2605299584734032, + "step": 2635, + "train/sim_loss": 0.14453125 + }, + { + "epoch": 0.2605299584734032, + "step": 2635, + "train/total_loss": 0.2396746575832367 + }, + { + "entropy": 9.437811851501465, + "epoch": 0.26062883132291875, + "mean_token_accuracy": 0.7741456031799316, + "num_tokens": 13720240.0, + "step": 2636, + "train/ce_loss": 0.8296997547149658 + }, + { + "epoch": 0.26062883132291875, + "step": 2636, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.26062883132291875, + "step": 2636, + "train/total_loss": 0.12203247845172882 + }, + { + "entropy": 8.832395553588867, + "epoch": 0.26072770417243424, + "mean_token_accuracy": 0.7161290049552917, + "num_tokens": 13725769.0, + "step": 2637, + "train/ce_loss": 0.7955597639083862 + }, + { + "epoch": 0.26072770417243424, + "step": 2637, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.26072770417243424, + "step": 2637, + "train/total_loss": 0.11471223086118698 + }, + { + "entropy": 10.038055419921875, + "epoch": 0.2608265770219498, + "mean_token_accuracy": 0.7489270567893982, + "num_tokens": 13730658.0, + "step": 2638, + "train/ce_loss": 1.0722737312316895 + }, + { + "epoch": 0.2608265770219498, + "step": 2638, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.2608265770219498, + "step": 2638, + "train/total_loss": 0.1736336350440979 + }, + { + "entropy": 9.274852752685547, + "epoch": 0.2609254498714653, + "mean_token_accuracy": 0.7162954211235046, + "num_tokens": 13735971.0, + "step": 2639, + "train/ce_loss": 0.656727135181427 + }, + { + "epoch": 0.2609254498714653, + "step": 2639, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.2609254498714653, + "step": 2639, + "train/total_loss": 0.15161022543907166 + }, + { + "epoch": 0.2610243227209808, + "grad_norm": 1.2802402973175049, + "learning_rate": 9.34999752756762e-06, + "loss": 0.1581, + "step": 2640 + }, + { + "entropy": 9.607377052307129, + "epoch": 0.2610243227209808, + "mean_token_accuracy": 0.6711111068725586, + "num_tokens": 13741121.0, + "step": 2640, + "train/ce_loss": 0.7046462893486023 + }, + { + "epoch": 0.2610243227209808, + "step": 2640, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.2610243227209808, + "step": 2640, + "train/total_loss": 0.14468339085578918 + }, + { + "entropy": 9.388420104980469, + "epoch": 0.26112319557049635, + "mean_token_accuracy": 0.7647058963775635, + "num_tokens": 13746335.0, + "step": 2641, + "train/ce_loss": 0.9969542622566223 + }, + { + "epoch": 0.26112319557049635, + "step": 2641, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.26112319557049635, + "step": 2641, + "train/total_loss": 0.15438292920589447 + }, + { + "entropy": 9.117688179016113, + "epoch": 0.2612220684200119, + "mean_token_accuracy": 0.7053669095039368, + "num_tokens": 13751728.0, + "step": 2642, + "train/ce_loss": 1.0301885604858398 + }, + { + "epoch": 0.2612220684200119, + "step": 2642, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.2612220684200119, + "step": 2642, + "train/total_loss": 0.1459876000881195 + }, + { + "entropy": 9.035491943359375, + "epoch": 0.2613209412695274, + "mean_token_accuracy": 0.7678132653236389, + "num_tokens": 13757016.0, + "step": 2643, + "train/ce_loss": 0.6681311130523682 + }, + { + "epoch": 0.2613209412695274, + "step": 2643, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.2613209412695274, + "step": 2643, + "train/total_loss": 0.12540686130523682 + }, + { + "entropy": 8.702983856201172, + "epoch": 0.2614198141190429, + "mean_token_accuracy": 0.7816091775894165, + "num_tokens": 13762587.0, + "step": 2644, + "train/ce_loss": 0.4604906737804413 + }, + { + "epoch": 0.2614198141190429, + "step": 2644, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.2614198141190429, + "step": 2644, + "train/total_loss": 0.1124553233385086 + }, + { + "entropy": 9.319679260253906, + "epoch": 0.26151868696855846, + "mean_token_accuracy": 0.7560283541679382, + "num_tokens": 13767732.0, + "step": 2645, + "train/ce_loss": 0.8887007832527161 + }, + { + "epoch": 0.26151868696855846, + "step": 2645, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.26151868696855846, + "step": 2645, + "train/total_loss": 0.1552763283252716 + }, + { + "entropy": 10.013471603393555, + "epoch": 0.26161755981807394, + "mean_token_accuracy": 0.7711111307144165, + "num_tokens": 13772583.0, + "step": 2646, + "train/ce_loss": 1.5753374099731445 + }, + { + "epoch": 0.26161755981807394, + "step": 2646, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.26161755981807394, + "step": 2646, + "train/total_loss": 0.20050249993801117 + }, + { + "entropy": 9.408167839050293, + "epoch": 0.2617164326675895, + "mean_token_accuracy": 0.7364864945411682, + "num_tokens": 13777763.0, + "step": 2647, + "train/ce_loss": 0.8203741908073425 + }, + { + "epoch": 0.2617164326675895, + "step": 2647, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.2617164326675895, + "step": 2647, + "train/total_loss": 0.11328741908073425 + }, + { + "entropy": 8.955461502075195, + "epoch": 0.261815305517105, + "mean_token_accuracy": 0.7596899271011353, + "num_tokens": 13783089.0, + "step": 2648, + "train/ce_loss": 0.9542549848556519 + }, + { + "epoch": 0.261815305517105, + "step": 2648, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.261815305517105, + "step": 2648, + "train/total_loss": 0.21261300146579742 + }, + { + "entropy": 9.848175048828125, + "epoch": 0.2619141783666205, + "mean_token_accuracy": 0.7568710446357727, + "num_tokens": 13787967.0, + "step": 2649, + "train/ce_loss": 7.882207682996523e-06 + }, + { + "epoch": 0.2619141783666205, + "step": 2649, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.2619141783666205, + "step": 2649, + "train/total_loss": 0.042969539761543274 + }, + { + "entropy": 9.237049102783203, + "epoch": 0.26201305121613605, + "mean_token_accuracy": 0.7001153230667114, + "num_tokens": 13793348.0, + "step": 2650, + "train/ce_loss": 1.8977290391921997 + }, + { + "epoch": 0.26201305121613605, + "step": 2650, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.26201305121613605, + "step": 2650, + "train/total_loss": 0.29524165391921997 + }, + { + "entropy": 9.357877731323242, + "epoch": 0.2621119240656516, + "mean_token_accuracy": 0.7086092829704285, + "num_tokens": 13798438.0, + "step": 2651, + "train/ce_loss": 6.794214277761057e-05 + }, + { + "epoch": 0.2621119240656516, + "step": 2651, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.2621119240656516, + "step": 2651, + "train/total_loss": 0.0703192949295044 + }, + { + "entropy": 9.049224853515625, + "epoch": 0.2622107969151671, + "mean_token_accuracy": 0.702075719833374, + "num_tokens": 13803753.0, + "step": 2652, + "train/ce_loss": 1.3432912826538086 + }, + { + "epoch": 0.2622107969151671, + "step": 2652, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.2622107969151671, + "step": 2652, + "train/total_loss": 0.21636037528514862 + }, + { + "entropy": 9.077301025390625, + "epoch": 0.2623096697646826, + "mean_token_accuracy": 0.7144444584846497, + "num_tokens": 13809095.0, + "step": 2653, + "train/ce_loss": 0.6699445247650146 + }, + { + "epoch": 0.2623096697646826, + "step": 2653, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.2623096697646826, + "step": 2653, + "train/total_loss": 0.12949445843696594 + }, + { + "entropy": 9.025361061096191, + "epoch": 0.26240854261419816, + "mean_token_accuracy": 0.7596566677093506, + "num_tokens": 13814520.0, + "step": 2654, + "train/ce_loss": 0.5731825828552246 + }, + { + "epoch": 0.26240854261419816, + "step": 2654, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.26240854261419816, + "step": 2654, + "train/total_loss": 0.14716200530529022 + }, + { + "entropy": 9.267854690551758, + "epoch": 0.26250741546371364, + "mean_token_accuracy": 0.739130437374115, + "num_tokens": 13819649.0, + "step": 2655, + "train/ce_loss": 0.726793646812439 + }, + { + "epoch": 0.26250741546371364, + "step": 2655, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.26250741546371364, + "step": 2655, + "train/total_loss": 0.14299187064170837 + }, + { + "entropy": 9.054587364196777, + "epoch": 0.2626062883132292, + "mean_token_accuracy": 0.7280513644218445, + "num_tokens": 13825107.0, + "step": 2656, + "train/ce_loss": 0.9306489825248718 + }, + { + "epoch": 0.2626062883132292, + "step": 2656, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.2626062883132292, + "step": 2656, + "train/total_loss": 0.18681490421295166 + }, + { + "entropy": 9.160943031311035, + "epoch": 0.2627051611627447, + "mean_token_accuracy": 0.7162162065505981, + "num_tokens": 13830453.0, + "step": 2657, + "train/ce_loss": 1.320202112197876 + }, + { + "epoch": 0.2627051611627447, + "step": 2657, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.2627051611627447, + "step": 2657, + "train/total_loss": 0.22577022016048431 + }, + { + "entropy": 9.296772003173828, + "epoch": 0.2628040340122602, + "mean_token_accuracy": 0.7425997257232666, + "num_tokens": 13835670.0, + "step": 2658, + "train/ce_loss": 0.8563773036003113 + }, + { + "epoch": 0.2628040340122602, + "step": 2658, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.2628040340122602, + "step": 2658, + "train/total_loss": 0.14813773334026337 + }, + { + "entropy": 9.201019287109375, + "epoch": 0.26290290686177575, + "mean_token_accuracy": 0.7122128009796143, + "num_tokens": 13840980.0, + "step": 2659, + "train/ce_loss": 0.7281718254089355 + }, + { + "epoch": 0.26290290686177575, + "step": 2659, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.26290290686177575, + "step": 2659, + "train/total_loss": 0.11578593403100967 + }, + { + "epoch": 0.2630017797112913, + "grad_norm": 0.7757517099380493, + "learning_rate": 9.345052662809673e-06, + "loss": 0.1477, + "step": 2660 + }, + { + "entropy": 9.477794647216797, + "epoch": 0.2630017797112913, + "mean_token_accuracy": 0.8177965879440308, + "num_tokens": 13846147.0, + "step": 2660, + "train/ce_loss": 0.7558349967002869 + }, + { + "epoch": 0.2630017797112913, + "step": 2660, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.2630017797112913, + "step": 2660, + "train/total_loss": 0.16152100265026093 + }, + { + "entropy": 10.013696670532227, + "epoch": 0.2631006525608068, + "mean_token_accuracy": 0.681614339351654, + "num_tokens": 13851006.0, + "step": 2661, + "train/ce_loss": 1.5152400732040405 + }, + { + "epoch": 0.2631006525608068, + "step": 2661, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.2631006525608068, + "step": 2661, + "train/total_loss": 0.24136775732040405 + }, + { + "entropy": 8.90575885772705, + "epoch": 0.2631995254103223, + "mean_token_accuracy": 0.7424242496490479, + "num_tokens": 13856510.0, + "step": 2662, + "train/ce_loss": 0.9599835276603699 + }, + { + "epoch": 0.2631995254103223, + "step": 2662, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.2631995254103223, + "step": 2662, + "train/total_loss": 0.1858420968055725 + }, + { + "entropy": 9.146432876586914, + "epoch": 0.26329839825983786, + "mean_token_accuracy": 0.7649123072624207, + "num_tokens": 13861865.0, + "step": 2663, + "train/ce_loss": 0.5537664890289307 + }, + { + "epoch": 0.26329839825983786, + "step": 2663, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.26329839825983786, + "step": 2663, + "train/total_loss": 0.13350164890289307 + }, + { + "entropy": 9.466894149780273, + "epoch": 0.26339727110935335, + "mean_token_accuracy": 0.7438849210739136, + "num_tokens": 13867002.0, + "step": 2664, + "train/ce_loss": 0.908247709274292 + }, + { + "epoch": 0.26339727110935335, + "step": 2664, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.26339727110935335, + "step": 2664, + "train/total_loss": 0.12598103284835815 + }, + { + "entropy": 9.13033676147461, + "epoch": 0.2634961439588689, + "mean_token_accuracy": 0.696450412273407, + "num_tokens": 13872493.0, + "step": 2665, + "train/ce_loss": 0.4399969279766083 + }, + { + "epoch": 0.2634961439588689, + "step": 2665, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.2634961439588689, + "step": 2665, + "train/total_loss": 0.12212469428777695 + }, + { + "entropy": 8.980000495910645, + "epoch": 0.26359501680838443, + "mean_token_accuracy": 0.672251284122467, + "num_tokens": 13877922.0, + "step": 2666, + "train/ce_loss": 1.4724513292312622 + }, + { + "epoch": 0.26359501680838443, + "step": 2666, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.26359501680838443, + "step": 2666, + "train/total_loss": 0.2644326388835907 + }, + { + "entropy": 9.773246765136719, + "epoch": 0.2636938896578999, + "mean_token_accuracy": 0.7590579986572266, + "num_tokens": 13882872.0, + "step": 2667, + "train/ce_loss": 0.9083855748176575 + }, + { + "epoch": 0.2636938896578999, + "step": 2667, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.2636938896578999, + "step": 2667, + "train/total_loss": 0.14943230152130127 + }, + { + "entropy": 9.056009292602539, + "epoch": 0.26379276250741546, + "mean_token_accuracy": 0.7304551005363464, + "num_tokens": 13888200.0, + "step": 2668, + "train/ce_loss": 0.5513181090354919 + }, + { + "epoch": 0.26379276250741546, + "step": 2668, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.26379276250741546, + "step": 2668, + "train/total_loss": 0.14106930792331696 + }, + { + "entropy": 9.509899139404297, + "epoch": 0.263891635356931, + "mean_token_accuracy": 0.6895368695259094, + "num_tokens": 13893238.0, + "step": 2669, + "train/ce_loss": 1.6937878131866455 + }, + { + "epoch": 0.263891635356931, + "step": 2669, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.263891635356931, + "step": 2669, + "train/total_loss": 0.26703503727912903 + }, + { + "entropy": 8.59216022491455, + "epoch": 0.2639905082064465, + "mean_token_accuracy": 0.7532597780227661, + "num_tokens": 13898822.0, + "step": 2670, + "train/ce_loss": 0.6907157301902771 + }, + { + "epoch": 0.2639905082064465, + "step": 2670, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.2639905082064465, + "step": 2670, + "train/total_loss": 0.20969657599925995 + }, + { + "entropy": 9.557438850402832, + "epoch": 0.264089381055962, + "mean_token_accuracy": 0.8209407925605774, + "num_tokens": 13903907.0, + "step": 2671, + "train/ce_loss": 0.43957269191741943 + }, + { + "epoch": 0.264089381055962, + "step": 2671, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.264089381055962, + "step": 2671, + "train/total_loss": 0.06348852068185806 + }, + { + "entropy": 9.627422332763672, + "epoch": 0.26418825390547757, + "mean_token_accuracy": 0.7705479264259338, + "num_tokens": 13908922.0, + "step": 2672, + "train/ce_loss": 2.151191234588623 + }, + { + "epoch": 0.26418825390547757, + "step": 2672, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.26418825390547757, + "step": 2672, + "train/total_loss": 0.2932441234588623 + }, + { + "entropy": 9.203516006469727, + "epoch": 0.26428712675499305, + "mean_token_accuracy": 0.6953020095825195, + "num_tokens": 13914151.0, + "step": 2673, + "train/ce_loss": 1.3459376096725464 + }, + { + "epoch": 0.26428712675499305, + "step": 2673, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.26428712675499305, + "step": 2673, + "train/total_loss": 0.25568753480911255 + }, + { + "entropy": 9.146098136901855, + "epoch": 0.2643859996045086, + "mean_token_accuracy": 0.7183908224105835, + "num_tokens": 13919447.0, + "step": 2674, + "train/ce_loss": 0.566749632358551 + }, + { + "epoch": 0.2643859996045086, + "step": 2674, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.2643859996045086, + "step": 2674, + "train/total_loss": 0.09573746472597122 + }, + { + "entropy": 9.722884178161621, + "epoch": 0.26448487245402413, + "mean_token_accuracy": 0.713274359703064, + "num_tokens": 13924391.0, + "step": 2675, + "train/ce_loss": 1.1495646238327026 + }, + { + "epoch": 0.26448487245402413, + "step": 2675, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.26448487245402413, + "step": 2675, + "train/total_loss": 0.17745646834373474 + }, + { + "entropy": 9.050594329833984, + "epoch": 0.2645837453035397, + "mean_token_accuracy": 0.738269031047821, + "num_tokens": 13929809.0, + "step": 2676, + "train/ce_loss": 1.4384698867797852 + }, + { + "epoch": 0.2645837453035397, + "step": 2676, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.2645837453035397, + "step": 2676, + "train/total_loss": 0.2649407386779785 + }, + { + "entropy": 9.252591133117676, + "epoch": 0.26468261815305516, + "mean_token_accuracy": 0.7371967434883118, + "num_tokens": 13935041.0, + "step": 2677, + "train/ce_loss": 0.8613839149475098 + }, + { + "epoch": 0.26468261815305516, + "step": 2677, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.26468261815305516, + "step": 2677, + "train/total_loss": 0.16816964745521545 + }, + { + "entropy": 9.040063858032227, + "epoch": 0.2647814910025707, + "mean_token_accuracy": 0.7625133395195007, + "num_tokens": 13940464.0, + "step": 2678, + "train/ce_loss": 0.3832489848136902 + }, + { + "epoch": 0.2647814910025707, + "step": 2678, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.2647814910025707, + "step": 2678, + "train/total_loss": 0.11254364997148514 + }, + { + "entropy": 8.749004364013672, + "epoch": 0.26488036385208624, + "mean_token_accuracy": 0.7801911234855652, + "num_tokens": 13946111.0, + "step": 2679, + "train/ce_loss": 0.6618013381958008 + }, + { + "epoch": 0.26488036385208624, + "step": 2679, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.26488036385208624, + "step": 2679, + "train/total_loss": 0.10914888232946396 + }, + { + "epoch": 0.26497923670160173, + "grad_norm": 0.7406871914863586, + "learning_rate": 9.340107798051723e-06, + "loss": 0.1539, + "step": 2680 + }, + { + "entropy": 9.133565902709961, + "epoch": 0.26497923670160173, + "mean_token_accuracy": 0.7111111283302307, + "num_tokens": 13951535.0, + "step": 2680, + "train/ce_loss": 0.535231351852417 + }, + { + "epoch": 0.26497923670160173, + "step": 2680, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.26497923670160173, + "step": 2680, + "train/total_loss": 0.11992938816547394 + }, + { + "entropy": 9.485006332397461, + "epoch": 0.26507810955111727, + "mean_token_accuracy": 0.7763347625732422, + "num_tokens": 13956685.0, + "step": 2681, + "train/ce_loss": 3.8054508877394255e-06 + }, + { + "epoch": 0.26507810955111727, + "step": 2681, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.26507810955111727, + "step": 2681, + "train/total_loss": 0.05468787997961044 + }, + { + "entropy": 8.994706153869629, + "epoch": 0.2651769824006328, + "mean_token_accuracy": 0.7014613747596741, + "num_tokens": 13962305.0, + "step": 2682, + "train/ce_loss": 1.3398152589797974 + }, + { + "epoch": 0.2651769824006328, + "step": 2682, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.2651769824006328, + "step": 2682, + "train/total_loss": 0.22382527589797974 + }, + { + "entropy": 9.234704971313477, + "epoch": 0.2652758552501483, + "mean_token_accuracy": 0.7142857313156128, + "num_tokens": 13967559.0, + "step": 2683, + "train/ce_loss": 0.8524354100227356 + }, + { + "epoch": 0.2652758552501483, + "step": 2683, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.2652758552501483, + "step": 2683, + "train/total_loss": 0.14774355292320251 + }, + { + "entropy": 9.198078155517578, + "epoch": 0.26537472809966384, + "mean_token_accuracy": 0.666304349899292, + "num_tokens": 13972909.0, + "step": 2684, + "train/ce_loss": 0.9787866473197937 + }, + { + "epoch": 0.26537472809966384, + "step": 2684, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.26537472809966384, + "step": 2684, + "train/total_loss": 0.15256616473197937 + }, + { + "entropy": 9.263998985290527, + "epoch": 0.2654736009491794, + "mean_token_accuracy": 0.7625570893287659, + "num_tokens": 13978240.0, + "step": 2685, + "train/ce_loss": 0.8679942488670349 + }, + { + "epoch": 0.2654736009491794, + "step": 2685, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.2654736009491794, + "step": 2685, + "train/total_loss": 0.10633067786693573 + }, + { + "entropy": 9.46180534362793, + "epoch": 0.26557247379869486, + "mean_token_accuracy": 0.7883817553520203, + "num_tokens": 13983368.0, + "step": 2686, + "train/ce_loss": 0.8221791982650757 + }, + { + "epoch": 0.26557247379869486, + "step": 2686, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.26557247379869486, + "step": 2686, + "train/total_loss": 0.09784292429685593 + }, + { + "entropy": 9.256744384765625, + "epoch": 0.2656713466482104, + "mean_token_accuracy": 0.7591686844825745, + "num_tokens": 13988655.0, + "step": 2687, + "train/ce_loss": 0.514398455619812 + }, + { + "epoch": 0.2656713466482104, + "step": 2687, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.2656713466482104, + "step": 2687, + "train/total_loss": 0.10222110152244568 + }, + { + "entropy": 9.252236366271973, + "epoch": 0.26577021949772595, + "mean_token_accuracy": 0.7030674815177917, + "num_tokens": 13993934.0, + "step": 2688, + "train/ce_loss": 0.5606142282485962 + }, + { + "epoch": 0.26577021949772595, + "step": 2688, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.26577021949772595, + "step": 2688, + "train/total_loss": 0.07949892431497574 + }, + { + "entropy": 9.90015983581543, + "epoch": 0.26586909234724143, + "mean_token_accuracy": 0.7364531755447388, + "num_tokens": 13998774.0, + "step": 2689, + "train/ce_loss": 1.4620862007141113 + }, + { + "epoch": 0.26586909234724143, + "step": 2689, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.26586909234724143, + "step": 2689, + "train/total_loss": 0.20089612901210785 + }, + { + "entropy": 9.599283218383789, + "epoch": 0.265967965196757, + "mean_token_accuracy": 0.74631267786026, + "num_tokens": 14003925.0, + "step": 2690, + "train/ce_loss": 1.3352909263630863e-05 + }, + { + "epoch": 0.265967965196757, + "step": 2690, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.265967965196757, + "step": 2690, + "train/total_loss": 0.03906383365392685 + }, + { + "entropy": 9.03776741027832, + "epoch": 0.2660668380462725, + "mean_token_accuracy": 0.7853982448577881, + "num_tokens": 14009313.0, + "step": 2691, + "train/ce_loss": 0.680451512336731 + }, + { + "epoch": 0.2660668380462725, + "step": 2691, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.2660668380462725, + "step": 2691, + "train/total_loss": 0.18132640421390533 + }, + { + "entropy": 9.915488243103027, + "epoch": 0.266165710895788, + "mean_token_accuracy": 0.7182447910308838, + "num_tokens": 14014121.0, + "step": 2692, + "train/ce_loss": 1.067265272140503 + }, + { + "epoch": 0.266165710895788, + "step": 2692, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.266165710895788, + "step": 2692, + "train/total_loss": 0.1457890272140503 + }, + { + "entropy": 9.574028968811035, + "epoch": 0.26626458374530354, + "mean_token_accuracy": 0.7054597735404968, + "num_tokens": 14019227.0, + "step": 2693, + "train/ce_loss": 1.486788005422568e-05 + }, + { + "epoch": 0.26626458374530354, + "step": 2693, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.26626458374530354, + "step": 2693, + "train/total_loss": 0.046876486390829086 + }, + { + "entropy": 9.717619895935059, + "epoch": 0.2663634565948191, + "mean_token_accuracy": 0.7028571367263794, + "num_tokens": 14024358.0, + "step": 2694, + "train/ce_loss": 0.3979763388633728 + }, + { + "epoch": 0.2663634565948191, + "step": 2694, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.2663634565948191, + "step": 2694, + "train/total_loss": 0.11792263388633728 + }, + { + "entropy": 8.958935737609863, + "epoch": 0.26646232944433457, + "mean_token_accuracy": 0.7351408004760742, + "num_tokens": 14029824.0, + "step": 2695, + "train/ce_loss": 0.9816790223121643 + }, + { + "epoch": 0.26646232944433457, + "step": 2695, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.26646232944433457, + "step": 2695, + "train/total_loss": 0.16457414627075195 + }, + { + "entropy": 9.341829299926758, + "epoch": 0.2665612022938501, + "mean_token_accuracy": 0.7549933195114136, + "num_tokens": 14034991.0, + "step": 2696, + "train/ce_loss": 0.5137763023376465 + }, + { + "epoch": 0.2665612022938501, + "step": 2696, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.2665612022938501, + "step": 2696, + "train/total_loss": 0.09825263172388077 + }, + { + "entropy": 9.283841133117676, + "epoch": 0.26666007514336565, + "mean_token_accuracy": 0.7165149450302124, + "num_tokens": 14040233.0, + "step": 2697, + "train/ce_loss": 0.6727295517921448 + }, + { + "epoch": 0.26666007514336565, + "step": 2697, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.26666007514336565, + "step": 2697, + "train/total_loss": 0.09461670368909836 + }, + { + "entropy": 9.179637908935547, + "epoch": 0.26675894799288113, + "mean_token_accuracy": 0.7394285798072815, + "num_tokens": 14045575.0, + "step": 2698, + "train/ce_loss": 1.1516114473342896 + }, + { + "epoch": 0.26675894799288113, + "step": 2698, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.26675894799288113, + "step": 2698, + "train/total_loss": 0.23625490069389343 + }, + { + "entropy": 9.415515899658203, + "epoch": 0.2668578208423967, + "mean_token_accuracy": 0.7286432385444641, + "num_tokens": 14050817.0, + "step": 2699, + "train/ce_loss": 1.4189918041229248 + }, + { + "epoch": 0.2668578208423967, + "step": 2699, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.2668578208423967, + "step": 2699, + "train/total_loss": 0.18096168339252472 + }, + { + "epoch": 0.2669566936919122, + "grad_norm": 0.730399489402771, + "learning_rate": 9.335162933293776e-06, + "loss": 0.1546, + "step": 2700 + }, + { + "entropy": 9.281803131103516, + "epoch": 0.2669566936919122, + "mean_token_accuracy": 0.7296954393386841, + "num_tokens": 14056045.0, + "step": 2700, + "train/ce_loss": 0.4679917097091675 + }, + { + "epoch": 0.2669566936919122, + "step": 2700, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.2669566936919122, + "step": 2700, + "train/total_loss": 0.08976791799068451 + }, + { + "entropy": 9.70930004119873, + "epoch": 0.2670555665414277, + "mean_token_accuracy": 0.7396551966667175, + "num_tokens": 14061058.0, + "step": 2701, + "train/ce_loss": 5.598945335805183e-06 + }, + { + "epoch": 0.2670555665414277, + "step": 2701, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.2670555665414277, + "step": 2701, + "train/total_loss": 0.06250055879354477 + }, + { + "entropy": 9.451964378356934, + "epoch": 0.26715443939094324, + "mean_token_accuracy": 0.8056679964065552, + "num_tokens": 14066259.0, + "step": 2702, + "train/ce_loss": 0.5921843647956848 + }, + { + "epoch": 0.26715443939094324, + "step": 2702, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.26715443939094324, + "step": 2702, + "train/total_loss": 0.14124968647956848 + }, + { + "entropy": 9.086771011352539, + "epoch": 0.2672533122404588, + "mean_token_accuracy": 0.6866515874862671, + "num_tokens": 14071634.0, + "step": 2703, + "train/ce_loss": 0.9547269940376282 + }, + { + "epoch": 0.2672533122404588, + "step": 2703, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.2672533122404588, + "step": 2703, + "train/total_loss": 0.13844144344329834 + }, + { + "entropy": 9.125083923339844, + "epoch": 0.26735218508997427, + "mean_token_accuracy": 0.7378190159797668, + "num_tokens": 14076987.0, + "step": 2704, + "train/ce_loss": 0.784442663192749 + }, + { + "epoch": 0.26735218508997427, + "step": 2704, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.26735218508997427, + "step": 2704, + "train/total_loss": 0.19953802227973938 + }, + { + "entropy": 9.073034286499023, + "epoch": 0.2674510579394898, + "mean_token_accuracy": 0.720588207244873, + "num_tokens": 14082326.0, + "step": 2705, + "train/ce_loss": 0.6992771625518799 + }, + { + "epoch": 0.2674510579394898, + "step": 2705, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.2674510579394898, + "step": 2705, + "train/total_loss": 0.15586522221565247 + }, + { + "entropy": 9.043624877929688, + "epoch": 0.26754993078900535, + "mean_token_accuracy": 0.7110874056816101, + "num_tokens": 14087772.0, + "step": 2706, + "train/ce_loss": 1.204443097114563 + }, + { + "epoch": 0.26754993078900535, + "step": 2706, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.26754993078900535, + "step": 2706, + "train/total_loss": 0.19075681269168854 + }, + { + "entropy": 9.483551025390625, + "epoch": 0.26764880363852084, + "mean_token_accuracy": 0.6895043849945068, + "num_tokens": 14092881.0, + "step": 2707, + "train/ce_loss": 1.4192895889282227 + }, + { + "epoch": 0.26764880363852084, + "step": 2707, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.26764880363852084, + "step": 2707, + "train/total_loss": 0.2513039708137512 + }, + { + "entropy": 9.470633506774902, + "epoch": 0.2677476764880364, + "mean_token_accuracy": 0.814479649066925, + "num_tokens": 14098056.0, + "step": 2708, + "train/ce_loss": 0.9829438328742981 + }, + { + "epoch": 0.2677476764880364, + "step": 2708, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.2677476764880364, + "step": 2708, + "train/total_loss": 0.12173188477754593 + }, + { + "entropy": 9.47817611694336, + "epoch": 0.2678465493375519, + "mean_token_accuracy": 0.7224669456481934, + "num_tokens": 14103210.0, + "step": 2709, + "train/ce_loss": 1.5246411561965942 + }, + { + "epoch": 0.2678465493375519, + "step": 2709, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.2678465493375519, + "step": 2709, + "train/total_loss": 0.2423078715801239 + }, + { + "entropy": 9.424848556518555, + "epoch": 0.2679454221870674, + "mean_token_accuracy": 0.7935578227043152, + "num_tokens": 14108322.0, + "step": 2710, + "train/ce_loss": 0.4777304232120514 + }, + { + "epoch": 0.2679454221870674, + "step": 2710, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.2679454221870674, + "step": 2710, + "train/total_loss": 0.07511679828166962 + }, + { + "entropy": 9.044717788696289, + "epoch": 0.26804429503658295, + "mean_token_accuracy": 0.7251732349395752, + "num_tokens": 14113629.0, + "step": 2711, + "train/ce_loss": 0.7718260884284973 + }, + { + "epoch": 0.26804429503658295, + "step": 2711, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.26804429503658295, + "step": 2711, + "train/total_loss": 0.1474951207637787 + }, + { + "entropy": 9.391767501831055, + "epoch": 0.2681431678860985, + "mean_token_accuracy": 0.7617079615592957, + "num_tokens": 14118769.0, + "step": 2712, + "train/ce_loss": 0.5797140598297119 + }, + { + "epoch": 0.2681431678860985, + "step": 2712, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.2681431678860985, + "step": 2712, + "train/total_loss": 0.11265890300273895 + }, + { + "entropy": 8.919556617736816, + "epoch": 0.268242040735614, + "mean_token_accuracy": 0.7400932312011719, + "num_tokens": 14124076.0, + "step": 2713, + "train/ce_loss": 0.4740203619003296 + }, + { + "epoch": 0.268242040735614, + "step": 2713, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.268242040735614, + "step": 2713, + "train/total_loss": 0.0747457891702652 + }, + { + "entropy": 9.423288345336914, + "epoch": 0.2683409135851295, + "mean_token_accuracy": 0.7017310261726379, + "num_tokens": 14129215.0, + "step": 2714, + "train/ce_loss": 1.82626473903656 + }, + { + "epoch": 0.2683409135851295, + "step": 2714, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.2683409135851295, + "step": 2714, + "train/total_loss": 0.23340772092342377 + }, + { + "entropy": 8.878215789794922, + "epoch": 0.26843978643464506, + "mean_token_accuracy": 0.7340530157089233, + "num_tokens": 14134701.0, + "step": 2715, + "train/ce_loss": 0.7186618447303772 + }, + { + "epoch": 0.26843978643464506, + "step": 2715, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.26843978643464506, + "step": 2715, + "train/total_loss": 0.10702243447303772 + }, + { + "entropy": 9.063881874084473, + "epoch": 0.2685386592841606, + "mean_token_accuracy": 0.7230955362319946, + "num_tokens": 14140018.0, + "step": 2716, + "train/ce_loss": 0.4996785521507263 + }, + { + "epoch": 0.2685386592841606, + "step": 2716, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.2685386592841606, + "step": 2716, + "train/total_loss": 0.12418660521507263 + }, + { + "entropy": 8.891548156738281, + "epoch": 0.2686375321336761, + "mean_token_accuracy": 0.7863330245018005, + "num_tokens": 14145505.0, + "step": 2717, + "train/ce_loss": 0.5857919454574585 + }, + { + "epoch": 0.2686375321336761, + "step": 2717, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.2686375321336761, + "step": 2717, + "train/total_loss": 0.07420419156551361 + }, + { + "entropy": 9.50385856628418, + "epoch": 0.2687364049831916, + "mean_token_accuracy": 0.7188552021980286, + "num_tokens": 14150523.0, + "step": 2718, + "train/ce_loss": 1.3414199352264404 + }, + { + "epoch": 0.2687364049831916, + "step": 2718, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.2687364049831916, + "step": 2718, + "train/total_loss": 0.18882949650287628 + }, + { + "entropy": 9.160788536071777, + "epoch": 0.26883527783270716, + "mean_token_accuracy": 0.668865442276001, + "num_tokens": 14155753.0, + "step": 2719, + "train/ce_loss": 0.684292197227478 + }, + { + "epoch": 0.26883527783270716, + "step": 2719, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.26883527783270716, + "step": 2719, + "train/total_loss": 0.12311672419309616 + }, + { + "epoch": 0.26893415068222265, + "grad_norm": 1.0595309734344482, + "learning_rate": 9.330218068535826e-06, + "loss": 0.1516, + "step": 2720 + }, + { + "entropy": 9.514728546142578, + "epoch": 0.26893415068222265, + "mean_token_accuracy": 0.7625201940536499, + "num_tokens": 14160826.0, + "step": 2720, + "train/ce_loss": 0.7204391360282898 + }, + { + "epoch": 0.26893415068222265, + "step": 2720, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.26893415068222265, + "step": 2720, + "train/total_loss": 0.11110641807317734 + }, + { + "entropy": 8.682881355285645, + "epoch": 0.2690330235317382, + "mean_token_accuracy": 0.7777777910232544, + "num_tokens": 14166345.0, + "step": 2721, + "train/ce_loss": 0.7576871514320374 + }, + { + "epoch": 0.2690330235317382, + "step": 2721, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.2690330235317382, + "step": 2721, + "train/total_loss": 0.16561245918273926 + }, + { + "entropy": 9.194377899169922, + "epoch": 0.26913189638125373, + "mean_token_accuracy": 0.7034813761711121, + "num_tokens": 14171693.0, + "step": 2722, + "train/ce_loss": 0.3500674068927765 + }, + { + "epoch": 0.26913189638125373, + "step": 2722, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.26913189638125373, + "step": 2722, + "train/total_loss": 0.09360049664974213 + }, + { + "entropy": 9.035322189331055, + "epoch": 0.2692307692307692, + "mean_token_accuracy": 0.7163197994232178, + "num_tokens": 14177012.0, + "step": 2723, + "train/ce_loss": 1.296778678894043 + }, + { + "epoch": 0.2692307692307692, + "step": 2723, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.2692307692307692, + "step": 2723, + "train/total_loss": 0.172646626830101 + }, + { + "entropy": 9.263710021972656, + "epoch": 0.26932964208028476, + "mean_token_accuracy": 0.7361282110214233, + "num_tokens": 14182469.0, + "step": 2724, + "train/ce_loss": 0.6803064942359924 + }, + { + "epoch": 0.26932964208028476, + "step": 2724, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.26932964208028476, + "step": 2724, + "train/total_loss": 0.13053065538406372 + }, + { + "entropy": 9.153655052185059, + "epoch": 0.2694285149298003, + "mean_token_accuracy": 0.7928571701049805, + "num_tokens": 14187748.0, + "step": 2725, + "train/ce_loss": 0.6589941382408142 + }, + { + "epoch": 0.2694285149298003, + "step": 2725, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.2694285149298003, + "step": 2725, + "train/total_loss": 0.13621191680431366 + }, + { + "entropy": 9.351223945617676, + "epoch": 0.2695273877793158, + "mean_token_accuracy": 0.72265625, + "num_tokens": 14192949.0, + "step": 2726, + "train/ce_loss": 0.6545261740684509 + }, + { + "epoch": 0.2695273877793158, + "step": 2726, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.2695273877793158, + "step": 2726, + "train/total_loss": 0.13185887038707733 + }, + { + "entropy": 9.045417785644531, + "epoch": 0.2696262606288313, + "mean_token_accuracy": 0.7489270567893982, + "num_tokens": 14198387.0, + "step": 2727, + "train/ce_loss": 0.7137749791145325 + }, + { + "epoch": 0.2696262606288313, + "step": 2727, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.2696262606288313, + "step": 2727, + "train/total_loss": 0.14169000089168549 + }, + { + "entropy": 9.564960479736328, + "epoch": 0.26972513347834687, + "mean_token_accuracy": 0.7658959627151489, + "num_tokens": 14203506.0, + "step": 2728, + "train/ce_loss": 4.330248884798493e-06 + }, + { + "epoch": 0.26972513347834687, + "step": 2728, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.26972513347834687, + "step": 2728, + "train/total_loss": 0.02734418213367462 + }, + { + "entropy": 8.947754859924316, + "epoch": 0.26982400632786235, + "mean_token_accuracy": 0.7384230494499207, + "num_tokens": 14208789.0, + "step": 2729, + "train/ce_loss": 0.6324231624603271 + }, + { + "epoch": 0.26982400632786235, + "step": 2729, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.26982400632786235, + "step": 2729, + "train/total_loss": 0.14136731624603271 + }, + { + "entropy": 9.119012832641602, + "epoch": 0.2699228791773779, + "mean_token_accuracy": 0.7366703152656555, + "num_tokens": 14214085.0, + "step": 2730, + "train/ce_loss": 0.6751307845115662 + }, + { + "epoch": 0.2699228791773779, + "step": 2730, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.2699228791773779, + "step": 2730, + "train/total_loss": 0.15735682845115662 + }, + { + "entropy": 9.13355541229248, + "epoch": 0.27002175202689344, + "mean_token_accuracy": 0.7561837434768677, + "num_tokens": 14219572.0, + "step": 2731, + "train/ce_loss": 0.6862268447875977 + }, + { + "epoch": 0.27002175202689344, + "step": 2731, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.27002175202689344, + "step": 2731, + "train/total_loss": 0.09206018596887589 + }, + { + "entropy": 9.203107833862305, + "epoch": 0.2701206248764089, + "mean_token_accuracy": 0.747474730014801, + "num_tokens": 14224909.0, + "step": 2732, + "train/ce_loss": 0.658726155757904 + }, + { + "epoch": 0.2701206248764089, + "step": 2732, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.2701206248764089, + "step": 2732, + "train/total_loss": 0.14790385961532593 + }, + { + "entropy": 10.122865676879883, + "epoch": 0.27021949772592446, + "mean_token_accuracy": 0.6963788270950317, + "num_tokens": 14229721.0, + "step": 2733, + "train/ce_loss": 2.6643052101135254 + }, + { + "epoch": 0.27021949772592446, + "step": 2733, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.27021949772592446, + "step": 2733, + "train/total_loss": 0.360180526971817 + }, + { + "entropy": 8.99232006072998, + "epoch": 0.27031837057544, + "mean_token_accuracy": 0.7479674816131592, + "num_tokens": 14235104.0, + "step": 2734, + "train/ce_loss": 1.247081995010376 + }, + { + "epoch": 0.27031837057544, + "step": 2734, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.27031837057544, + "step": 2734, + "train/total_loss": 0.17158320546150208 + }, + { + "entropy": 9.292896270751953, + "epoch": 0.2704172434249555, + "mean_token_accuracy": 0.7465145587921143, + "num_tokens": 14240571.0, + "step": 2735, + "train/ce_loss": 0.6882023811340332 + }, + { + "epoch": 0.2704172434249555, + "step": 2735, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.2704172434249555, + "step": 2735, + "train/total_loss": 0.11960148811340332 + }, + { + "entropy": 9.37419319152832, + "epoch": 0.27051611627447103, + "mean_token_accuracy": 0.7206632494926453, + "num_tokens": 14245799.0, + "step": 2736, + "train/ce_loss": 0.7072527408599854 + }, + { + "epoch": 0.27051611627447103, + "step": 2736, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.27051611627447103, + "step": 2736, + "train/total_loss": 0.12150652706623077 + }, + { + "entropy": 8.763299942016602, + "epoch": 0.27061498912398657, + "mean_token_accuracy": 0.7679324746131897, + "num_tokens": 14251248.0, + "step": 2737, + "train/ce_loss": 0.7712081074714661 + }, + { + "epoch": 0.27061498912398657, + "step": 2737, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.27061498912398657, + "step": 2737, + "train/total_loss": 0.1474333107471466 + }, + { + "entropy": 9.102038383483887, + "epoch": 0.27071386197350206, + "mean_token_accuracy": 0.7104895114898682, + "num_tokens": 14256446.0, + "step": 2738, + "train/ce_loss": 1.6209111213684082 + }, + { + "epoch": 0.27071386197350206, + "step": 2738, + "train/sim_loss": 0.265625 + }, + { + "epoch": 0.27071386197350206, + "step": 2738, + "train/total_loss": 0.42771613597869873 + }, + { + "entropy": 9.282876968383789, + "epoch": 0.2708127348230176, + "mean_token_accuracy": 0.755667507648468, + "num_tokens": 14261693.0, + "step": 2739, + "train/ce_loss": 0.3576745390892029 + }, + { + "epoch": 0.2708127348230176, + "step": 2739, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.2708127348230176, + "step": 2739, + "train/total_loss": 0.09436120092868805 + }, + { + "epoch": 0.27091160767253314, + "grad_norm": 0.8129703998565674, + "learning_rate": 9.325273203777877e-06, + "loss": 0.1538, + "step": 2740 + }, + { + "entropy": 9.425825119018555, + "epoch": 0.27091160767253314, + "mean_token_accuracy": 0.6977058053016663, + "num_tokens": 14266945.0, + "step": 2740, + "train/ce_loss": 0.7067881226539612 + }, + { + "epoch": 0.27091160767253314, + "step": 2740, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.27091160767253314, + "step": 2740, + "train/total_loss": 0.17224131524562836 + }, + { + "entropy": 9.145968437194824, + "epoch": 0.2710104805220486, + "mean_token_accuracy": 0.765196681022644, + "num_tokens": 14272207.0, + "step": 2741, + "train/ce_loss": 0.7855210304260254 + }, + { + "epoch": 0.2710104805220486, + "step": 2741, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.2710104805220486, + "step": 2741, + "train/total_loss": 0.14495834708213806 + }, + { + "entropy": 9.4079008102417, + "epoch": 0.27110935337156417, + "mean_token_accuracy": 0.8165760636329651, + "num_tokens": 14277352.0, + "step": 2742, + "train/ce_loss": 0.7447280883789062 + }, + { + "epoch": 0.27110935337156417, + "step": 2742, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.27110935337156417, + "step": 2742, + "train/total_loss": 0.0979103073477745 + }, + { + "entropy": 8.978082656860352, + "epoch": 0.2712082262210797, + "mean_token_accuracy": 0.69852215051651, + "num_tokens": 14282870.0, + "step": 2743, + "train/ce_loss": 0.867680549621582 + }, + { + "epoch": 0.2712082262210797, + "step": 2743, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.2712082262210797, + "step": 2743, + "train/total_loss": 0.18833056092262268 + }, + { + "entropy": 9.525739669799805, + "epoch": 0.2713070990705952, + "mean_token_accuracy": 0.7130177617073059, + "num_tokens": 14288003.0, + "step": 2744, + "train/ce_loss": 1.0470430850982666 + }, + { + "epoch": 0.2713070990705952, + "step": 2744, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.2713070990705952, + "step": 2744, + "train/total_loss": 0.20236057043075562 + }, + { + "entropy": 9.163778305053711, + "epoch": 0.27140597192011073, + "mean_token_accuracy": 0.7372986078262329, + "num_tokens": 14293289.0, + "step": 2745, + "train/ce_loss": 0.6977776288986206 + }, + { + "epoch": 0.27140597192011073, + "step": 2745, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.27140597192011073, + "step": 2745, + "train/total_loss": 0.18696525692939758 + }, + { + "entropy": 9.077678680419922, + "epoch": 0.2715048447696263, + "mean_token_accuracy": 0.7273838520050049, + "num_tokens": 14298592.0, + "step": 2746, + "train/ce_loss": 6.585466962860664e-06 + }, + { + "epoch": 0.2715048447696263, + "step": 2746, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.2715048447696263, + "step": 2746, + "train/total_loss": 0.04296940937638283 + }, + { + "entropy": 9.930625915527344, + "epoch": 0.27160371761914176, + "mean_token_accuracy": 0.7298049926757812, + "num_tokens": 14303340.0, + "step": 2747, + "train/ce_loss": 2.05059552192688 + }, + { + "epoch": 0.27160371761914176, + "step": 2747, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.27160371761914176, + "step": 2747, + "train/total_loss": 0.25193455815315247 + }, + { + "entropy": 9.546127319335938, + "epoch": 0.2717025904686573, + "mean_token_accuracy": 0.7295690774917603, + "num_tokens": 14308489.0, + "step": 2748, + "train/ce_loss": 1.3548938035964966 + }, + { + "epoch": 0.2717025904686573, + "step": 2748, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.2717025904686573, + "step": 2748, + "train/total_loss": 0.21361438930034637 + }, + { + "entropy": 8.843116760253906, + "epoch": 0.27180146331817284, + "mean_token_accuracy": 0.7509881258010864, + "num_tokens": 14313917.0, + "step": 2749, + "train/ce_loss": 1.0115149021148682 + }, + { + "epoch": 0.27180146331817284, + "step": 2749, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.27180146331817284, + "step": 2749, + "train/total_loss": 0.2144327461719513 + }, + { + "entropy": 9.490730285644531, + "epoch": 0.27190033616768833, + "mean_token_accuracy": 0.766853928565979, + "num_tokens": 14319046.0, + "step": 2750, + "train/ce_loss": 3.2003395062929485e-06 + }, + { + "epoch": 0.27190033616768833, + "step": 2750, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.27190033616768833, + "step": 2750, + "train/total_loss": 0.04687532037496567 + }, + { + "entropy": 8.729164123535156, + "epoch": 0.27199920901720387, + "mean_token_accuracy": 0.7644135355949402, + "num_tokens": 14324571.0, + "step": 2751, + "train/ce_loss": 0.6920303702354431 + }, + { + "epoch": 0.27199920901720387, + "step": 2751, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.27199920901720387, + "step": 2751, + "train/total_loss": 0.14732804894447327 + }, + { + "entropy": 9.034978866577148, + "epoch": 0.2720980818667194, + "mean_token_accuracy": 0.7425414323806763, + "num_tokens": 14329945.0, + "step": 2752, + "train/ce_loss": 0.8318552374839783 + }, + { + "epoch": 0.2720980818667194, + "step": 2752, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.2720980818667194, + "step": 2752, + "train/total_loss": 0.14177927374839783 + }, + { + "entropy": 9.5916109085083, + "epoch": 0.2721969547162349, + "mean_token_accuracy": 0.7996768951416016, + "num_tokens": 14335009.0, + "step": 2753, + "train/ce_loss": 4.757183887704741e-06 + }, + { + "epoch": 0.2721969547162349, + "step": 2753, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.2721969547162349, + "step": 2753, + "train/total_loss": 0.023437974974513054 + }, + { + "entropy": 9.023578643798828, + "epoch": 0.27229582756575044, + "mean_token_accuracy": 0.6979637742042542, + "num_tokens": 14340352.0, + "step": 2754, + "train/ce_loss": 0.4947526454925537 + }, + { + "epoch": 0.27229582756575044, + "step": 2754, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.27229582756575044, + "step": 2754, + "train/total_loss": 0.1315065175294876 + }, + { + "entropy": 9.618267059326172, + "epoch": 0.272394700415266, + "mean_token_accuracy": 0.7423780560493469, + "num_tokens": 14345391.0, + "step": 2755, + "train/ce_loss": 8.185864317056257e-06 + }, + { + "epoch": 0.272394700415266, + "step": 2755, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.272394700415266, + "step": 2755, + "train/total_loss": 0.07031331956386566 + }, + { + "entropy": 8.738838195800781, + "epoch": 0.27249357326478146, + "mean_token_accuracy": 0.7375964522361755, + "num_tokens": 14350794.0, + "step": 2756, + "train/ce_loss": 0.6735537052154541 + }, + { + "epoch": 0.27249357326478146, + "step": 2756, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.27249357326478146, + "step": 2756, + "train/total_loss": 0.13376161456108093 + }, + { + "entropy": 9.224106788635254, + "epoch": 0.272592446114297, + "mean_token_accuracy": 0.7684346437454224, + "num_tokens": 14356057.0, + "step": 2757, + "train/ce_loss": 0.9679180383682251 + }, + { + "epoch": 0.272592446114297, + "step": 2757, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.272592446114297, + "step": 2757, + "train/total_loss": 0.2100730538368225 + }, + { + "entropy": 9.142610549926758, + "epoch": 0.27269131896381255, + "mean_token_accuracy": 0.7483870983123779, + "num_tokens": 14361301.0, + "step": 2758, + "train/ce_loss": 1.4557925462722778 + }, + { + "epoch": 0.27269131896381255, + "step": 2758, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.27269131896381255, + "step": 2758, + "train/total_loss": 0.1885480135679245 + }, + { + "entropy": 10.052937507629395, + "epoch": 0.2727901918133281, + "mean_token_accuracy": 0.7412935495376587, + "num_tokens": 14366114.0, + "step": 2759, + "train/ce_loss": 1.7297953367233276 + }, + { + "epoch": 0.2727901918133281, + "step": 2759, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.2727901918133281, + "step": 2759, + "train/total_loss": 0.25501078367233276 + }, + { + "epoch": 0.2728890646628436, + "grad_norm": 0.9541262984275818, + "learning_rate": 9.320328339019929e-06, + "loss": 0.1457, + "step": 2760 + }, + { + "entropy": 9.396015167236328, + "epoch": 0.2728890646628436, + "mean_token_accuracy": 0.6876675486564636, + "num_tokens": 14371281.0, + "step": 2760, + "train/ce_loss": 1.7390131950378418 + }, + { + "epoch": 0.2728890646628436, + "step": 2760, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.2728890646628436, + "step": 2760, + "train/total_loss": 0.2559325695037842 + }, + { + "entropy": 9.222189903259277, + "epoch": 0.2729879375123591, + "mean_token_accuracy": 0.6744186282157898, + "num_tokens": 14376507.0, + "step": 2761, + "train/ce_loss": 1.4619855880737305 + }, + { + "epoch": 0.2729879375123591, + "step": 2761, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.2729879375123591, + "step": 2761, + "train/total_loss": 0.2047923058271408 + }, + { + "entropy": 9.410306930541992, + "epoch": 0.27308681036187465, + "mean_token_accuracy": 0.7361769080162048, + "num_tokens": 14381564.0, + "step": 2762, + "train/ce_loss": 0.8330072164535522 + }, + { + "epoch": 0.27308681036187465, + "step": 2762, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.27308681036187465, + "step": 2762, + "train/total_loss": 0.13408197462558746 + }, + { + "entropy": 9.200156211853027, + "epoch": 0.27318568321139014, + "mean_token_accuracy": 0.7051281929016113, + "num_tokens": 14386819.0, + "step": 2763, + "train/ce_loss": 0.8308500647544861 + }, + { + "epoch": 0.27318568321139014, + "step": 2763, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.27318568321139014, + "step": 2763, + "train/total_loss": 0.15339750051498413 + }, + { + "entropy": 8.76707935333252, + "epoch": 0.2732845560609057, + "mean_token_accuracy": 0.70010906457901, + "num_tokens": 14392131.0, + "step": 2764, + "train/ce_loss": 0.5953741669654846 + }, + { + "epoch": 0.2732845560609057, + "step": 2764, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.2732845560609057, + "step": 2764, + "train/total_loss": 0.09859991818666458 + }, + { + "entropy": 8.96510124206543, + "epoch": 0.2733834289104212, + "mean_token_accuracy": 0.7502774596214294, + "num_tokens": 14397494.0, + "step": 2765, + "train/ce_loss": 0.9151800274848938 + }, + { + "epoch": 0.2733834289104212, + "step": 2765, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.2733834289104212, + "step": 2765, + "train/total_loss": 0.15401801466941833 + }, + { + "entropy": 8.955915451049805, + "epoch": 0.2734823017599367, + "mean_token_accuracy": 0.6983805894851685, + "num_tokens": 14402944.0, + "step": 2766, + "train/ce_loss": 0.6279106140136719 + }, + { + "epoch": 0.2734823017599367, + "step": 2766, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.2734823017599367, + "step": 2766, + "train/total_loss": 0.11357231438159943 + }, + { + "entropy": 9.095291137695312, + "epoch": 0.27358117460945225, + "mean_token_accuracy": 0.746198832988739, + "num_tokens": 14408277.0, + "step": 2767, + "train/ce_loss": 0.9595216512680054 + }, + { + "epoch": 0.27358117460945225, + "step": 2767, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.27358117460945225, + "step": 2767, + "train/total_loss": 0.18970216810703278 + }, + { + "entropy": 9.259521484375, + "epoch": 0.2736800474589678, + "mean_token_accuracy": 0.7518518567085266, + "num_tokens": 14413564.0, + "step": 2768, + "train/ce_loss": 0.7817373871803284 + }, + { + "epoch": 0.2736800474589678, + "step": 2768, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.2736800474589678, + "step": 2768, + "train/total_loss": 0.15629874169826508 + }, + { + "entropy": 9.800960540771484, + "epoch": 0.2737789203084833, + "mean_token_accuracy": 0.7534791231155396, + "num_tokens": 14418497.0, + "step": 2769, + "train/ce_loss": 0.00027163056074641645 + }, + { + "epoch": 0.2737789203084833, + "step": 2769, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.2737789203084833, + "step": 2769, + "train/total_loss": 0.046902164816856384 + }, + { + "entropy": 9.137109756469727, + "epoch": 0.2738777931579988, + "mean_token_accuracy": 0.7103128433227539, + "num_tokens": 14423836.0, + "step": 2770, + "train/ce_loss": 1.4109325408935547 + }, + { + "epoch": 0.2738777931579988, + "step": 2770, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.2738777931579988, + "step": 2770, + "train/total_loss": 0.24265575408935547 + }, + { + "entropy": 9.510459899902344, + "epoch": 0.27397666600751436, + "mean_token_accuracy": 0.7751572132110596, + "num_tokens": 14428938.0, + "step": 2771, + "train/ce_loss": 3.7481873732758686e-05 + }, + { + "epoch": 0.27397666600751436, + "step": 2771, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.27397666600751436, + "step": 2771, + "train/total_loss": 0.06250374764204025 + }, + { + "entropy": 9.130108833312988, + "epoch": 0.27407553885702984, + "mean_token_accuracy": 0.7431629300117493, + "num_tokens": 14434239.0, + "step": 2772, + "train/ce_loss": 0.3732340335845947 + }, + { + "epoch": 0.27407553885702984, + "step": 2772, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.27407553885702984, + "step": 2772, + "train/total_loss": 0.06466715037822723 + }, + { + "entropy": 9.484024047851562, + "epoch": 0.2741744117065454, + "mean_token_accuracy": 0.7289073467254639, + "num_tokens": 14439399.0, + "step": 2773, + "train/ce_loss": 1.231729507446289 + }, + { + "epoch": 0.2741744117065454, + "step": 2773, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.2741744117065454, + "step": 2773, + "train/total_loss": 0.19739170372486115 + }, + { + "entropy": 9.430547714233398, + "epoch": 0.2742732845560609, + "mean_token_accuracy": 0.7442922592163086, + "num_tokens": 14444573.0, + "step": 2774, + "train/ce_loss": 1.1785828064603265e-05 + }, + { + "epoch": 0.2742732845560609, + "step": 2774, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.2742732845560609, + "step": 2774, + "train/total_loss": 0.10937617719173431 + }, + { + "entropy": 9.98629093170166, + "epoch": 0.2743721574055764, + "mean_token_accuracy": 0.7030162215232849, + "num_tokens": 14449465.0, + "step": 2775, + "train/ce_loss": 8.80569132277742e-06 + }, + { + "epoch": 0.2743721574055764, + "step": 2775, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.2743721574055764, + "step": 2775, + "train/total_loss": 0.05859462916851044 + }, + { + "entropy": 9.538305282592773, + "epoch": 0.27447103025509195, + "mean_token_accuracy": 0.7018348574638367, + "num_tokens": 14454580.0, + "step": 2776, + "train/ce_loss": 1.2678555250167847 + }, + { + "epoch": 0.27447103025509195, + "step": 2776, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.27447103025509195, + "step": 2776, + "train/total_loss": 0.23616056144237518 + }, + { + "entropy": 10.267413139343262, + "epoch": 0.2745699031046075, + "mean_token_accuracy": 0.7951807379722595, + "num_tokens": 14459267.0, + "step": 2777, + "train/ce_loss": 1.6592833995819092 + }, + { + "epoch": 0.2745699031046075, + "step": 2777, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.2745699031046075, + "step": 2777, + "train/total_loss": 0.21670959889888763 + }, + { + "entropy": 8.74622917175293, + "epoch": 0.274668775954123, + "mean_token_accuracy": 0.7353951930999756, + "num_tokens": 14464615.0, + "step": 2778, + "train/ce_loss": 0.8360884189605713 + }, + { + "epoch": 0.274668775954123, + "step": 2778, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.274668775954123, + "step": 2778, + "train/total_loss": 0.16173383593559265 + }, + { + "entropy": 8.989816665649414, + "epoch": 0.2747676488036385, + "mean_token_accuracy": 0.8011363744735718, + "num_tokens": 14469986.0, + "step": 2779, + "train/ce_loss": 1.058032512664795 + }, + { + "epoch": 0.2747676488036385, + "step": 2779, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.2747676488036385, + "step": 2779, + "train/total_loss": 0.1526782512664795 + }, + { + "epoch": 0.27486652165315406, + "grad_norm": 0.7337502837181091, + "learning_rate": 9.31538347426198e-06, + "loss": 0.1501, + "step": 2780 + }, + { + "entropy": 9.15650749206543, + "epoch": 0.27486652165315406, + "mean_token_accuracy": 0.6983758807182312, + "num_tokens": 14475357.0, + "step": 2780, + "train/ce_loss": 0.8407629728317261 + }, + { + "epoch": 0.27486652165315406, + "step": 2780, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.27486652165315406, + "step": 2780, + "train/total_loss": 0.15438880026340485 + }, + { + "entropy": 9.087913513183594, + "epoch": 0.27496539450266955, + "mean_token_accuracy": 0.7535714507102966, + "num_tokens": 14480666.0, + "step": 2781, + "train/ce_loss": 0.8932794332504272 + }, + { + "epoch": 0.27496539450266955, + "step": 2781, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.27496539450266955, + "step": 2781, + "train/total_loss": 0.17917169630527496 + }, + { + "entropy": 9.604347229003906, + "epoch": 0.2750642673521851, + "mean_token_accuracy": 0.7685325145721436, + "num_tokens": 14485781.0, + "step": 2782, + "train/ce_loss": 8.136759788612835e-06 + }, + { + "epoch": 0.2750642673521851, + "step": 2782, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.2750642673521851, + "step": 2782, + "train/total_loss": 0.07812581211328506 + }, + { + "entropy": 8.990873336791992, + "epoch": 0.27516314020170063, + "mean_token_accuracy": 0.7177497744560242, + "num_tokens": 14491339.0, + "step": 2783, + "train/ce_loss": 0.8948922157287598 + }, + { + "epoch": 0.27516314020170063, + "step": 2783, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.27516314020170063, + "step": 2783, + "train/total_loss": 0.18323922157287598 + }, + { + "entropy": 9.077988624572754, + "epoch": 0.2752620130512161, + "mean_token_accuracy": 0.7343412637710571, + "num_tokens": 14496756.0, + "step": 2784, + "train/ce_loss": 0.7399857044219971 + }, + { + "epoch": 0.2752620130512161, + "step": 2784, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.2752620130512161, + "step": 2784, + "train/total_loss": 0.09743607044219971 + }, + { + "entropy": 8.992471694946289, + "epoch": 0.27536088590073166, + "mean_token_accuracy": 0.7187817096710205, + "num_tokens": 14502240.0, + "step": 2785, + "train/ce_loss": 1.4025267362594604 + }, + { + "epoch": 0.27536088590073166, + "step": 2785, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.27536088590073166, + "step": 2785, + "train/total_loss": 0.21056517958641052 + }, + { + "entropy": 9.678565979003906, + "epoch": 0.2754597587502472, + "mean_token_accuracy": 0.7094339728355408, + "num_tokens": 14507181.0, + "step": 2786, + "train/ce_loss": 1.3096341717755422e-05 + }, + { + "epoch": 0.2754597587502472, + "step": 2786, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.2754597587502472, + "step": 2786, + "train/total_loss": 0.07422006130218506 + }, + { + "entropy": 9.417784690856934, + "epoch": 0.2755586315997627, + "mean_token_accuracy": 0.7039473652839661, + "num_tokens": 14512373.0, + "step": 2787, + "train/ce_loss": 0.9774115085601807 + }, + { + "epoch": 0.2755586315997627, + "step": 2787, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.2755586315997627, + "step": 2787, + "train/total_loss": 0.14461615681648254 + }, + { + "entropy": 9.363090515136719, + "epoch": 0.2756575044492782, + "mean_token_accuracy": 0.7112675905227661, + "num_tokens": 14517538.0, + "step": 2788, + "train/ce_loss": 1.3934249877929688 + }, + { + "epoch": 0.2756575044492782, + "step": 2788, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.2756575044492782, + "step": 2788, + "train/total_loss": 0.2174675017595291 + }, + { + "entropy": 9.457781791687012, + "epoch": 0.27575637729879376, + "mean_token_accuracy": 0.7311521768569946, + "num_tokens": 14522649.0, + "step": 2789, + "train/ce_loss": 1.1051501035690308 + }, + { + "epoch": 0.27575637729879376, + "step": 2789, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.27575637729879376, + "step": 2789, + "train/total_loss": 0.21989001333713531 + }, + { + "entropy": 9.891679763793945, + "epoch": 0.27585525014830925, + "mean_token_accuracy": 0.7549999952316284, + "num_tokens": 14527455.0, + "step": 2790, + "train/ce_loss": 1.8008737564086914 + }, + { + "epoch": 0.27585525014830925, + "step": 2790, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.27585525014830925, + "step": 2790, + "train/total_loss": 0.2582123875617981 + }, + { + "entropy": 9.288782119750977, + "epoch": 0.2759541229978248, + "mean_token_accuracy": 0.7717791199684143, + "num_tokens": 14532762.0, + "step": 2791, + "train/ce_loss": 0.8080936074256897 + }, + { + "epoch": 0.2759541229978248, + "step": 2791, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.2759541229978248, + "step": 2791, + "train/total_loss": 0.1433093547821045 + }, + { + "entropy": 9.532764434814453, + "epoch": 0.27605299584734033, + "mean_token_accuracy": 0.7211093902587891, + "num_tokens": 14537852.0, + "step": 2792, + "train/ce_loss": 0.8462459444999695 + }, + { + "epoch": 0.27605299584734033, + "step": 2792, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.27605299584734033, + "step": 2792, + "train/total_loss": 0.16665583848953247 + }, + { + "entropy": 9.049671173095703, + "epoch": 0.2761518686968558, + "mean_token_accuracy": 0.8083961009979248, + "num_tokens": 14543216.0, + "step": 2793, + "train/ce_loss": 0.6352055668830872 + }, + { + "epoch": 0.2761518686968558, + "step": 2793, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.2761518686968558, + "step": 2793, + "train/total_loss": 0.09477055817842484 + }, + { + "entropy": 9.601463317871094, + "epoch": 0.27625074154637136, + "mean_token_accuracy": 0.7698675394058228, + "num_tokens": 14548250.0, + "step": 2794, + "train/ce_loss": 0.5902164578437805 + }, + { + "epoch": 0.27625074154637136, + "step": 2794, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.27625074154637136, + "step": 2794, + "train/total_loss": 0.11761540174484253 + }, + { + "entropy": 9.143898963928223, + "epoch": 0.2763496143958869, + "mean_token_accuracy": 0.7332535982131958, + "num_tokens": 14553551.0, + "step": 2795, + "train/ce_loss": 0.4416395425796509 + }, + { + "epoch": 0.2763496143958869, + "step": 2795, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.2763496143958869, + "step": 2795, + "train/total_loss": 0.11447645723819733 + }, + { + "entropy": 9.792064666748047, + "epoch": 0.2764484872454024, + "mean_token_accuracy": 0.7258319854736328, + "num_tokens": 14558634.0, + "step": 2796, + "train/ce_loss": 0.9641293287277222 + }, + { + "epoch": 0.2764484872454024, + "step": 2796, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.2764484872454024, + "step": 2796, + "train/total_loss": 0.19016292691230774 + }, + { + "entropy": 8.983678817749023, + "epoch": 0.2765473600949179, + "mean_token_accuracy": 0.7092511057853699, + "num_tokens": 14564042.0, + "step": 2797, + "train/ce_loss": 0.49966195225715637 + }, + { + "epoch": 0.2765473600949179, + "step": 2797, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.2765473600949179, + "step": 2797, + "train/total_loss": 0.12418495118618011 + }, + { + "entropy": 9.377925872802734, + "epoch": 0.27664623294443347, + "mean_token_accuracy": 0.729194164276123, + "num_tokens": 14569246.0, + "step": 2798, + "train/ce_loss": 0.8390363454818726 + }, + { + "epoch": 0.27664623294443347, + "step": 2798, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.27664623294443347, + "step": 2798, + "train/total_loss": 0.10734113305807114 + }, + { + "entropy": 9.556909561157227, + "epoch": 0.276745105793949, + "mean_token_accuracy": 0.7122302055358887, + "num_tokens": 14574244.0, + "step": 2799, + "train/ce_loss": 5.166768460185267e-06 + }, + { + "epoch": 0.276745105793949, + "step": 2799, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.276745105793949, + "step": 2799, + "train/total_loss": 0.07421926409006119 + }, + { + "epoch": 0.2768439786434645, + "grad_norm": 0.9261484742164612, + "learning_rate": 9.310438609504032e-06, + "loss": 0.1536, + "step": 2800 + }, + { + "entropy": 9.34654426574707, + "epoch": 0.2768439786434645, + "mean_token_accuracy": 0.7158034443855286, + "num_tokens": 14579495.0, + "step": 2800, + "train/ce_loss": 0.40792229771614075 + }, + { + "epoch": 0.2768439786434645, + "step": 2800, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.2768439786434645, + "step": 2800, + "train/total_loss": 0.09547972679138184 + }, + { + "entropy": 9.522933959960938, + "epoch": 0.27694285149298004, + "mean_token_accuracy": 0.7470414042472839, + "num_tokens": 14584561.0, + "step": 2801, + "train/ce_loss": 4.356124918558635e-06 + }, + { + "epoch": 0.27694285149298004, + "step": 2801, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.27694285149298004, + "step": 2801, + "train/total_loss": 0.01953168585896492 + }, + { + "entropy": 9.155094146728516, + "epoch": 0.2770417243424956, + "mean_token_accuracy": 0.7702227234840393, + "num_tokens": 14589871.0, + "step": 2802, + "train/ce_loss": 0.8684111833572388 + }, + { + "epoch": 0.2770417243424956, + "step": 2802, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.2770417243424956, + "step": 2802, + "train/total_loss": 0.16496612131595612 + }, + { + "entropy": 10.107953071594238, + "epoch": 0.27714059719201106, + "mean_token_accuracy": 0.722347617149353, + "num_tokens": 14594700.0, + "step": 2803, + "train/ce_loss": 1.263716459274292 + }, + { + "epoch": 0.27714059719201106, + "step": 2803, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.27714059719201106, + "step": 2803, + "train/total_loss": 0.15371540188789368 + }, + { + "entropy": 9.887292861938477, + "epoch": 0.2772394700415266, + "mean_token_accuracy": 0.7801268696784973, + "num_tokens": 14599591.0, + "step": 2804, + "train/ce_loss": 1.0284250492986757e-05 + }, + { + "epoch": 0.2772394700415266, + "step": 2804, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.2772394700415266, + "step": 2804, + "train/total_loss": 0.050782278180122375 + }, + { + "entropy": 9.335623741149902, + "epoch": 0.27733834289104214, + "mean_token_accuracy": 0.710089385509491, + "num_tokens": 14604746.0, + "step": 2805, + "train/ce_loss": 0.9892579913139343 + }, + { + "epoch": 0.27733834289104214, + "step": 2805, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.27733834289104214, + "step": 2805, + "train/total_loss": 0.16533204913139343 + }, + { + "entropy": 9.14814567565918, + "epoch": 0.27743721574055763, + "mean_token_accuracy": 0.7142857313156128, + "num_tokens": 14609955.0, + "step": 2806, + "train/ce_loss": 0.6708879470825195 + }, + { + "epoch": 0.27743721574055763, + "step": 2806, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.27743721574055763, + "step": 2806, + "train/total_loss": 0.1452137976884842 + }, + { + "entropy": 8.987321853637695, + "epoch": 0.27753608859007317, + "mean_token_accuracy": 0.7251396775245667, + "num_tokens": 14615308.0, + "step": 2807, + "train/ce_loss": 0.8159539699554443 + }, + { + "epoch": 0.27753608859007317, + "step": 2807, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.27753608859007317, + "step": 2807, + "train/total_loss": 0.12847039103507996 + }, + { + "entropy": 9.317058563232422, + "epoch": 0.2776349614395887, + "mean_token_accuracy": 0.7585693001747131, + "num_tokens": 14620460.0, + "step": 2808, + "train/ce_loss": 0.7746213674545288 + }, + { + "epoch": 0.2776349614395887, + "step": 2808, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.2776349614395887, + "step": 2808, + "train/total_loss": 0.14777463674545288 + }, + { + "entropy": 9.289009094238281, + "epoch": 0.2777338342891042, + "mean_token_accuracy": 0.806609570980072, + "num_tokens": 14625717.0, + "step": 2809, + "train/ce_loss": 0.7532195448875427 + }, + { + "epoch": 0.2777338342891042, + "step": 2809, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.2777338342891042, + "step": 2809, + "train/total_loss": 0.09875945746898651 + }, + { + "entropy": 9.066307067871094, + "epoch": 0.27783270713861974, + "mean_token_accuracy": 0.7464008927345276, + "num_tokens": 14631122.0, + "step": 2810, + "train/ce_loss": 1.13222336769104 + }, + { + "epoch": 0.27783270713861974, + "step": 2810, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.27783270713861974, + "step": 2810, + "train/total_loss": 0.19525358080863953 + }, + { + "entropy": 9.542865753173828, + "epoch": 0.2779315799881353, + "mean_token_accuracy": 0.6925514936447144, + "num_tokens": 14636203.0, + "step": 2811, + "train/ce_loss": 1.0428379774093628 + }, + { + "epoch": 0.2779315799881353, + "step": 2811, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.2779315799881353, + "step": 2811, + "train/total_loss": 0.15115880966186523 + }, + { + "entropy": 9.164630889892578, + "epoch": 0.27803045283765077, + "mean_token_accuracy": 0.7756410241127014, + "num_tokens": 14641408.0, + "step": 2812, + "train/ce_loss": 0.8429121971130371 + }, + { + "epoch": 0.27803045283765077, + "step": 2812, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.27803045283765077, + "step": 2812, + "train/total_loss": 0.1702287197113037 + }, + { + "entropy": 9.204523086547852, + "epoch": 0.2781293256871663, + "mean_token_accuracy": 0.8227990865707397, + "num_tokens": 14646764.0, + "step": 2813, + "train/ce_loss": 0.4245660901069641 + }, + { + "epoch": 0.2781293256871663, + "step": 2813, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.2781293256871663, + "step": 2813, + "train/total_loss": 0.14792536199092865 + }, + { + "entropy": 9.020892143249512, + "epoch": 0.27822819853668185, + "mean_token_accuracy": 0.7318652868270874, + "num_tokens": 14651994.0, + "step": 2814, + "train/ce_loss": 0.6662297248840332 + }, + { + "epoch": 0.27822819853668185, + "step": 2814, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.27822819853668185, + "step": 2814, + "train/total_loss": 0.12521672248840332 + }, + { + "entropy": 9.035504341125488, + "epoch": 0.27832707138619733, + "mean_token_accuracy": 0.7324973940849304, + "num_tokens": 14657409.0, + "step": 2815, + "train/ce_loss": 0.6135442852973938 + }, + { + "epoch": 0.27832707138619733, + "step": 2815, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.27832707138619733, + "step": 2815, + "train/total_loss": 0.14729192852973938 + }, + { + "entropy": 9.42744255065918, + "epoch": 0.2784259442357129, + "mean_token_accuracy": 0.7241379022598267, + "num_tokens": 14662533.0, + "step": 2816, + "train/ce_loss": 1.1029224395751953 + }, + { + "epoch": 0.2784259442357129, + "step": 2816, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.2784259442357129, + "step": 2816, + "train/total_loss": 0.1884172558784485 + }, + { + "entropy": 9.399081230163574, + "epoch": 0.2785248170852284, + "mean_token_accuracy": 0.7789017558097839, + "num_tokens": 14667689.0, + "step": 2817, + "train/ce_loss": 0.41269248723983765 + }, + { + "epoch": 0.2785248170852284, + "step": 2817, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.2785248170852284, + "step": 2817, + "train/total_loss": 0.09986300021409988 + }, + { + "entropy": 9.163670539855957, + "epoch": 0.2786236899347439, + "mean_token_accuracy": 0.6848691701889038, + "num_tokens": 14673011.0, + "step": 2818, + "train/ce_loss": 1.004690170288086 + }, + { + "epoch": 0.2786236899347439, + "step": 2818, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.2786236899347439, + "step": 2818, + "train/total_loss": 0.13171902298927307 + }, + { + "entropy": 9.63107681274414, + "epoch": 0.27872256278425944, + "mean_token_accuracy": 0.709618866443634, + "num_tokens": 14677986.0, + "step": 2819, + "train/ce_loss": 0.9162476062774658 + }, + { + "epoch": 0.27872256278425944, + "step": 2819, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.27872256278425944, + "step": 2819, + "train/total_loss": 0.15021851658821106 + }, + { + "epoch": 0.278821435633775, + "grad_norm": 0.9040817618370056, + "learning_rate": 9.305493744746082e-06, + "loss": 0.1469, + "step": 2820 + }, + { + "entropy": 9.209228515625, + "epoch": 0.278821435633775, + "mean_token_accuracy": 0.7755681872367859, + "num_tokens": 14683165.0, + "step": 2820, + "train/ce_loss": 1.6597121953964233 + }, + { + "epoch": 0.278821435633775, + "step": 2820, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.278821435633775, + "step": 2820, + "train/total_loss": 0.24800246953964233 + }, + { + "entropy": 9.346404075622559, + "epoch": 0.27892030848329047, + "mean_token_accuracy": 0.7366310358047485, + "num_tokens": 14688348.0, + "step": 2821, + "train/ce_loss": 0.7856786251068115 + }, + { + "epoch": 0.27892030848329047, + "step": 2821, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.27892030848329047, + "step": 2821, + "train/total_loss": 0.14888036251068115 + }, + { + "entropy": 9.371283531188965, + "epoch": 0.279019181332806, + "mean_token_accuracy": 0.7604562640190125, + "num_tokens": 14693626.0, + "step": 2822, + "train/ce_loss": 2.148430109024048 + }, + { + "epoch": 0.279019181332806, + "step": 2822, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.279019181332806, + "step": 2822, + "train/total_loss": 0.3476555347442627 + }, + { + "entropy": 9.872785568237305, + "epoch": 0.27911805418232155, + "mean_token_accuracy": 0.7659574747085571, + "num_tokens": 14698524.0, + "step": 2823, + "train/ce_loss": 1.5956604480743408 + }, + { + "epoch": 0.27911805418232155, + "step": 2823, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.27911805418232155, + "step": 2823, + "train/total_loss": 0.24550354480743408 + }, + { + "entropy": 9.347860336303711, + "epoch": 0.27921692703183704, + "mean_token_accuracy": 0.6710700392723083, + "num_tokens": 14703738.0, + "step": 2824, + "train/ce_loss": 3.157474793624715e-06 + }, + { + "epoch": 0.27921692703183704, + "step": 2824, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.27921692703183704, + "step": 2824, + "train/total_loss": 0.06640656292438507 + }, + { + "entropy": 9.093656539916992, + "epoch": 0.2793157998813526, + "mean_token_accuracy": 0.7427184581756592, + "num_tokens": 14709041.0, + "step": 2825, + "train/ce_loss": 0.9866620302200317 + }, + { + "epoch": 0.2793157998813526, + "step": 2825, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.2793157998813526, + "step": 2825, + "train/total_loss": 0.1260099560022354 + }, + { + "entropy": 9.071894645690918, + "epoch": 0.2794146727308681, + "mean_token_accuracy": 0.6545040011405945, + "num_tokens": 14714345.0, + "step": 2826, + "train/ce_loss": 1.7742172479629517 + }, + { + "epoch": 0.2794146727308681, + "step": 2826, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.2794146727308681, + "step": 2826, + "train/total_loss": 0.23992173373699188 + }, + { + "entropy": 9.314325332641602, + "epoch": 0.2795135455803836, + "mean_token_accuracy": 0.707317054271698, + "num_tokens": 14719580.0, + "step": 2827, + "train/ce_loss": 1.368817687034607 + }, + { + "epoch": 0.2795135455803836, + "step": 2827, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.2795135455803836, + "step": 2827, + "train/total_loss": 0.1915692687034607 + }, + { + "entropy": 9.146047592163086, + "epoch": 0.27961241842989915, + "mean_token_accuracy": 0.735897421836853, + "num_tokens": 14724845.0, + "step": 2828, + "train/ce_loss": 0.8126648664474487 + }, + { + "epoch": 0.27961241842989915, + "step": 2828, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.27961241842989915, + "step": 2828, + "train/total_loss": 0.15939149260520935 + }, + { + "entropy": 9.197863578796387, + "epoch": 0.2797112912794147, + "mean_token_accuracy": 0.7515375018119812, + "num_tokens": 14730317.0, + "step": 2829, + "train/ce_loss": 0.8095092177391052 + }, + { + "epoch": 0.2797112912794147, + "step": 2829, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.2797112912794147, + "step": 2829, + "train/total_loss": 0.21376341581344604 + }, + { + "entropy": 9.530399322509766, + "epoch": 0.27981016412893017, + "mean_token_accuracy": 0.728787899017334, + "num_tokens": 14735388.0, + "step": 2830, + "train/ce_loss": 1.2663441896438599 + }, + { + "epoch": 0.27981016412893017, + "step": 2830, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.27981016412893017, + "step": 2830, + "train/total_loss": 0.146165668964386 + }, + { + "entropy": 9.17813777923584, + "epoch": 0.2799090369784457, + "mean_token_accuracy": 0.7043189406394958, + "num_tokens": 14740784.0, + "step": 2831, + "train/ce_loss": 0.6137787103652954 + }, + { + "epoch": 0.2799090369784457, + "step": 2831, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.2799090369784457, + "step": 2831, + "train/total_loss": 0.0926278680562973 + }, + { + "entropy": 9.317256927490234, + "epoch": 0.28000790982796125, + "mean_token_accuracy": 0.7368420958518982, + "num_tokens": 14745999.0, + "step": 2832, + "train/ce_loss": 0.6407901048660278 + }, + { + "epoch": 0.28000790982796125, + "step": 2832, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.28000790982796125, + "step": 2832, + "train/total_loss": 0.12267275899648666 + }, + { + "entropy": 9.02281379699707, + "epoch": 0.28010678267747674, + "mean_token_accuracy": 0.6962719559669495, + "num_tokens": 14751364.0, + "step": 2833, + "train/ce_loss": 1.4975244998931885 + }, + { + "epoch": 0.28010678267747674, + "step": 2833, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.28010678267747674, + "step": 2833, + "train/total_loss": 0.2552211880683899 + }, + { + "entropy": 9.3422269821167, + "epoch": 0.2802056555269923, + "mean_token_accuracy": 0.7568627595901489, + "num_tokens": 14756565.0, + "step": 2834, + "train/ce_loss": 0.9791196584701538 + }, + { + "epoch": 0.2802056555269923, + "step": 2834, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.2802056555269923, + "step": 2834, + "train/total_loss": 0.13697446882724762 + }, + { + "entropy": 9.067024230957031, + "epoch": 0.2803045283765078, + "mean_token_accuracy": 0.7177215218544006, + "num_tokens": 14761813.0, + "step": 2835, + "train/ce_loss": 0.9501418471336365 + }, + { + "epoch": 0.2803045283765078, + "step": 2835, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.2803045283765078, + "step": 2835, + "train/total_loss": 0.17313918471336365 + }, + { + "entropy": 9.283982276916504, + "epoch": 0.2804034012260233, + "mean_token_accuracy": 0.768831193447113, + "num_tokens": 14767079.0, + "step": 2836, + "train/ce_loss": 0.3750511407852173 + }, + { + "epoch": 0.2804034012260233, + "step": 2836, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.2804034012260233, + "step": 2836, + "train/total_loss": 0.1234426200389862 + }, + { + "entropy": 9.057904243469238, + "epoch": 0.28050227407553885, + "mean_token_accuracy": 0.7276166677474976, + "num_tokens": 14772364.0, + "step": 2837, + "train/ce_loss": 1.0986840724945068 + }, + { + "epoch": 0.28050227407553885, + "step": 2837, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.28050227407553885, + "step": 2837, + "train/total_loss": 0.16846215724945068 + }, + { + "entropy": 9.07217788696289, + "epoch": 0.2806011469250544, + "mean_token_accuracy": 0.751207709312439, + "num_tokens": 14777663.0, + "step": 2838, + "train/ce_loss": 1.0109590291976929 + }, + { + "epoch": 0.2806011469250544, + "step": 2838, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.2806011469250544, + "step": 2838, + "train/total_loss": 0.16750216484069824 + }, + { + "entropy": 9.474416732788086, + "epoch": 0.2807000197745699, + "mean_token_accuracy": 0.6593245267868042, + "num_tokens": 14782794.0, + "step": 2839, + "train/ce_loss": 1.2314399480819702 + }, + { + "epoch": 0.2807000197745699, + "step": 2839, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.2807000197745699, + "step": 2839, + "train/total_loss": 0.2520502507686615 + }, + { + "epoch": 0.2807988926240854, + "grad_norm": 0.9944719672203064, + "learning_rate": 9.300548879988133e-06, + "loss": 0.1588, + "step": 2840 + }, + { + "entropy": 10.013803482055664, + "epoch": 0.2807988926240854, + "mean_token_accuracy": 0.6607669591903687, + "num_tokens": 14787549.0, + "step": 2840, + "train/ce_loss": 9.427340046386234e-06 + }, + { + "epoch": 0.2807988926240854, + "step": 2840, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.2807988926240854, + "step": 2840, + "train/total_loss": 0.03125094249844551 + }, + { + "entropy": 9.168352127075195, + "epoch": 0.28089776547360096, + "mean_token_accuracy": 0.7252090573310852, + "num_tokens": 14792888.0, + "step": 2841, + "train/ce_loss": 1.0289748907089233 + }, + { + "epoch": 0.28089776547360096, + "step": 2841, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.28089776547360096, + "step": 2841, + "train/total_loss": 0.1927412450313568 + }, + { + "entropy": 9.441322326660156, + "epoch": 0.2809966383231165, + "mean_token_accuracy": 0.7796852588653564, + "num_tokens": 14798033.0, + "step": 2842, + "train/ce_loss": 0.7183638215065002 + }, + { + "epoch": 0.2809966383231165, + "step": 2842, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.2809966383231165, + "step": 2842, + "train/total_loss": 0.16949263215065002 + }, + { + "entropy": 9.063495635986328, + "epoch": 0.281095511172632, + "mean_token_accuracy": 0.7673377990722656, + "num_tokens": 14803361.0, + "step": 2843, + "train/ce_loss": 0.5977473258972168 + }, + { + "epoch": 0.281095511172632, + "step": 2843, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.281095511172632, + "step": 2843, + "train/total_loss": 0.1183684840798378 + }, + { + "entropy": 9.264204025268555, + "epoch": 0.2811943840221475, + "mean_token_accuracy": 0.6883604526519775, + "num_tokens": 14808698.0, + "step": 2844, + "train/ce_loss": 4.171130967733916e-06 + }, + { + "epoch": 0.2811943840221475, + "step": 2844, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.2811943840221475, + "step": 2844, + "train/total_loss": 0.05468791723251343 + }, + { + "entropy": 9.403308868408203, + "epoch": 0.28129325687166307, + "mean_token_accuracy": 0.7954220175743103, + "num_tokens": 14813867.0, + "step": 2845, + "train/ce_loss": 4.887909199169371e-06 + }, + { + "epoch": 0.28129325687166307, + "step": 2845, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.28129325687166307, + "step": 2845, + "train/total_loss": 0.0664067417383194 + }, + { + "entropy": 9.499643325805664, + "epoch": 0.28139212972117855, + "mean_token_accuracy": 0.7121211886405945, + "num_tokens": 14818982.0, + "step": 2846, + "train/ce_loss": 1.0086324214935303 + }, + { + "epoch": 0.28139212972117855, + "step": 2846, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.28139212972117855, + "step": 2846, + "train/total_loss": 0.1360194981098175 + }, + { + "entropy": 9.10026741027832, + "epoch": 0.2814910025706941, + "mean_token_accuracy": 0.7444444298744202, + "num_tokens": 14824310.0, + "step": 2847, + "train/ce_loss": 0.7505509257316589 + }, + { + "epoch": 0.2814910025706941, + "step": 2847, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.2814910025706941, + "step": 2847, + "train/total_loss": 0.1727113425731659 + }, + { + "entropy": 9.757712364196777, + "epoch": 0.28158987542020963, + "mean_token_accuracy": 0.7311643958091736, + "num_tokens": 14829337.0, + "step": 2848, + "train/ce_loss": 4.63628703073482e-06 + }, + { + "epoch": 0.28158987542020963, + "step": 2848, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.28158987542020963, + "step": 2848, + "train/total_loss": 0.03125046193599701 + }, + { + "entropy": 8.91356372833252, + "epoch": 0.2816887482697251, + "mean_token_accuracy": 0.7150395512580872, + "num_tokens": 14834547.0, + "step": 2849, + "train/ce_loss": 0.7498126029968262 + }, + { + "epoch": 0.2816887482697251, + "step": 2849, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.2816887482697251, + "step": 2849, + "train/total_loss": 0.13357502222061157 + }, + { + "entropy": 9.169994354248047, + "epoch": 0.28178762111924066, + "mean_token_accuracy": 0.7188940048217773, + "num_tokens": 14839629.0, + "step": 2850, + "train/ce_loss": 2.004420518875122 + }, + { + "epoch": 0.28178762111924066, + "step": 2850, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.28178762111924066, + "step": 2850, + "train/total_loss": 0.2824733257293701 + }, + { + "entropy": 10.078506469726562, + "epoch": 0.2818864939687562, + "mean_token_accuracy": 0.7661290168762207, + "num_tokens": 14844407.0, + "step": 2851, + "train/ce_loss": 6.761013082723366e-06 + }, + { + "epoch": 0.2818864939687562, + "step": 2851, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.2818864939687562, + "step": 2851, + "train/total_loss": 0.02734442614018917 + }, + { + "entropy": 8.773055076599121, + "epoch": 0.2819853668182717, + "mean_token_accuracy": 0.7260416746139526, + "num_tokens": 14849859.0, + "step": 2852, + "train/ce_loss": 0.8448234796524048 + }, + { + "epoch": 0.2819853668182717, + "step": 2852, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.2819853668182717, + "step": 2852, + "train/total_loss": 0.135263592004776 + }, + { + "entropy": 9.628518104553223, + "epoch": 0.28208423966778723, + "mean_token_accuracy": 0.7423934936523438, + "num_tokens": 14854786.0, + "step": 2853, + "train/ce_loss": 8.594476639700588e-06 + }, + { + "epoch": 0.28208423966778723, + "step": 2853, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.28208423966778723, + "step": 2853, + "train/total_loss": 0.09375085681676865 + }, + { + "entropy": 9.283641815185547, + "epoch": 0.28218311251730277, + "mean_token_accuracy": 0.7653478980064392, + "num_tokens": 14859967.0, + "step": 2854, + "train/ce_loss": 0.9590861201286316 + }, + { + "epoch": 0.28218311251730277, + "step": 2854, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.28218311251730277, + "step": 2854, + "train/total_loss": 0.15059611201286316 + }, + { + "entropy": 9.685651779174805, + "epoch": 0.28228198536681826, + "mean_token_accuracy": 0.7138047218322754, + "num_tokens": 14864979.0, + "step": 2855, + "train/ce_loss": 0.7458218932151794 + }, + { + "epoch": 0.28228198536681826, + "step": 2855, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.28228198536681826, + "step": 2855, + "train/total_loss": 0.15270718932151794 + }, + { + "entropy": 9.327569961547852, + "epoch": 0.2823808582163338, + "mean_token_accuracy": 0.7463863492012024, + "num_tokens": 14870208.0, + "step": 2856, + "train/ce_loss": 0.46743056178092957 + }, + { + "epoch": 0.2823808582163338, + "step": 2856, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.2823808582163338, + "step": 2856, + "train/total_loss": 0.07018055766820908 + }, + { + "entropy": 8.706644058227539, + "epoch": 0.28247973106584934, + "mean_token_accuracy": 0.7232142686843872, + "num_tokens": 14875736.0, + "step": 2857, + "train/ce_loss": 0.7347967028617859 + }, + { + "epoch": 0.28247973106584934, + "step": 2857, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.28247973106584934, + "step": 2857, + "train/total_loss": 0.09691717475652695 + }, + { + "entropy": 9.690895080566406, + "epoch": 0.2825786039153648, + "mean_token_accuracy": 0.6859813332557678, + "num_tokens": 14880704.0, + "step": 2858, + "train/ce_loss": 1.3040494918823242 + }, + { + "epoch": 0.2825786039153648, + "step": 2858, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.2825786039153648, + "step": 2858, + "train/total_loss": 0.23196744918823242 + }, + { + "entropy": 8.822907447814941, + "epoch": 0.28267747676488036, + "mean_token_accuracy": 0.7335243821144104, + "num_tokens": 14886240.0, + "step": 2859, + "train/ce_loss": 0.7813608646392822 + }, + { + "epoch": 0.28267747676488036, + "step": 2859, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.28267747676488036, + "step": 2859, + "train/total_loss": 0.11719858646392822 + }, + { + "epoch": 0.2827763496143959, + "grad_norm": 0.7708230018615723, + "learning_rate": 9.295604015230185e-06, + "loss": 0.1569, + "step": 2860 + }, + { + "entropy": 9.035205841064453, + "epoch": 0.2827763496143959, + "mean_token_accuracy": 0.7477638721466064, + "num_tokens": 14891285.0, + "step": 2860, + "train/ce_loss": 0.9769201874732971 + }, + { + "epoch": 0.2827763496143959, + "step": 2860, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.2827763496143959, + "step": 2860, + "train/total_loss": 0.15628576278686523 + }, + { + "entropy": 8.971607208251953, + "epoch": 0.2828752224639114, + "mean_token_accuracy": 0.707446813583374, + "num_tokens": 14896680.0, + "step": 2861, + "train/ce_loss": 0.7312490940093994 + }, + { + "epoch": 0.2828752224639114, + "step": 2861, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.2828752224639114, + "step": 2861, + "train/total_loss": 0.13953116536140442 + }, + { + "entropy": 9.324117660522461, + "epoch": 0.28297409531342693, + "mean_token_accuracy": 0.7440000176429749, + "num_tokens": 14901911.0, + "step": 2862, + "train/ce_loss": 0.8002615571022034 + }, + { + "epoch": 0.28297409531342693, + "step": 2862, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.28297409531342693, + "step": 2862, + "train/total_loss": 0.11127615720033646 + }, + { + "entropy": 9.020425796508789, + "epoch": 0.2830729681629425, + "mean_token_accuracy": 0.7620651125907898, + "num_tokens": 14907299.0, + "step": 2863, + "train/ce_loss": 0.40052181482315063 + }, + { + "epoch": 0.2830729681629425, + "step": 2863, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.2830729681629425, + "step": 2863, + "train/total_loss": 0.07130218297243118 + }, + { + "entropy": 9.081808090209961, + "epoch": 0.28317184101245796, + "mean_token_accuracy": 0.7048260569572449, + "num_tokens": 14912676.0, + "step": 2864, + "train/ce_loss": 1.4557803869247437 + }, + { + "epoch": 0.28317184101245796, + "step": 2864, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.28317184101245796, + "step": 2864, + "train/total_loss": 0.2549530267715454 + }, + { + "entropy": 9.343101501464844, + "epoch": 0.2832707138619735, + "mean_token_accuracy": 0.8108108043670654, + "num_tokens": 14917810.0, + "step": 2865, + "train/ce_loss": 0.6384103298187256 + }, + { + "epoch": 0.2832707138619735, + "step": 2865, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.2832707138619735, + "step": 2865, + "train/total_loss": 0.09509103745222092 + }, + { + "entropy": 9.158092498779297, + "epoch": 0.28336958671148904, + "mean_token_accuracy": 0.7088607549667358, + "num_tokens": 14923164.0, + "step": 2866, + "train/ce_loss": 0.77399080991745 + }, + { + "epoch": 0.28336958671148904, + "step": 2866, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.28336958671148904, + "step": 2866, + "train/total_loss": 0.16724282503128052 + }, + { + "entropy": 9.385732650756836, + "epoch": 0.2834684595610045, + "mean_token_accuracy": 0.7286295890808105, + "num_tokens": 14928346.0, + "step": 2867, + "train/ce_loss": 1.191542387008667 + }, + { + "epoch": 0.2834684595610045, + "step": 2867, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.2834684595610045, + "step": 2867, + "train/total_loss": 0.18556049466133118 + }, + { + "entropy": 8.915904998779297, + "epoch": 0.28356733241052007, + "mean_token_accuracy": 0.710208535194397, + "num_tokens": 14933734.0, + "step": 2868, + "train/ce_loss": 1.2880035638809204 + }, + { + "epoch": 0.28356733241052007, + "step": 2868, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.28356733241052007, + "step": 2868, + "train/total_loss": 0.19130036234855652 + }, + { + "entropy": 9.131593704223633, + "epoch": 0.2836662052600356, + "mean_token_accuracy": 0.7538631558418274, + "num_tokens": 14939093.0, + "step": 2869, + "train/ce_loss": 0.5264643430709839 + }, + { + "epoch": 0.2836662052600356, + "step": 2869, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.2836662052600356, + "step": 2869, + "train/total_loss": 0.11514643579721451 + }, + { + "entropy": 8.972114562988281, + "epoch": 0.2837650781095511, + "mean_token_accuracy": 0.7587336301803589, + "num_tokens": 14944519.0, + "step": 2870, + "train/ce_loss": 0.5326451063156128 + }, + { + "epoch": 0.2837650781095511, + "step": 2870, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.2837650781095511, + "step": 2870, + "train/total_loss": 0.13920201361179352 + }, + { + "entropy": 9.17553424835205, + "epoch": 0.28386395095906664, + "mean_token_accuracy": 0.6920454502105713, + "num_tokens": 14949813.0, + "step": 2871, + "train/ce_loss": 1.0072126388549805 + }, + { + "epoch": 0.28386395095906664, + "step": 2871, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.28386395095906664, + "step": 2871, + "train/total_loss": 0.14759626984596252 + }, + { + "entropy": 8.945381164550781, + "epoch": 0.2839628238085822, + "mean_token_accuracy": 0.728249192237854, + "num_tokens": 14955235.0, + "step": 2872, + "train/ce_loss": 0.6857571601867676 + }, + { + "epoch": 0.2839628238085822, + "step": 2872, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.2839628238085822, + "step": 2872, + "train/total_loss": 0.12326321750879288 + }, + { + "entropy": 8.911649703979492, + "epoch": 0.28406169665809766, + "mean_token_accuracy": 0.7469135522842407, + "num_tokens": 14960550.0, + "step": 2873, + "train/ce_loss": 0.6256386041641235 + }, + { + "epoch": 0.28406169665809766, + "step": 2873, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.28406169665809766, + "step": 2873, + "train/total_loss": 0.12506386637687683 + }, + { + "entropy": 9.24261474609375, + "epoch": 0.2841605695076132, + "mean_token_accuracy": 0.7627118825912476, + "num_tokens": 14965777.0, + "step": 2874, + "train/ce_loss": 0.5531490445137024 + }, + { + "epoch": 0.2841605695076132, + "step": 2874, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.2841605695076132, + "step": 2874, + "train/total_loss": 0.09437740594148636 + }, + { + "entropy": 9.203690528869629, + "epoch": 0.28425944235712874, + "mean_token_accuracy": 0.7638888955116272, + "num_tokens": 14970899.0, + "step": 2875, + "train/ce_loss": 1.4430269402510021e-05 + }, + { + "epoch": 0.28425944235712874, + "step": 2875, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.28425944235712874, + "step": 2875, + "train/total_loss": 0.046876441687345505 + }, + { + "entropy": 8.744983673095703, + "epoch": 0.28435831520664423, + "mean_token_accuracy": 0.7970244288444519, + "num_tokens": 14976265.0, + "step": 2876, + "train/ce_loss": 0.37529221177101135 + }, + { + "epoch": 0.28435831520664423, + "step": 2876, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.28435831520664423, + "step": 2876, + "train/total_loss": 0.06487297266721725 + }, + { + "entropy": 9.226240158081055, + "epoch": 0.28445718805615977, + "mean_token_accuracy": 0.7182044982910156, + "num_tokens": 14981516.0, + "step": 2877, + "train/ce_loss": 2.5748761345312232e-06 + }, + { + "epoch": 0.28445718805615977, + "step": 2877, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.28445718805615977, + "step": 2877, + "train/total_loss": 0.06250026077032089 + }, + { + "entropy": 8.894309043884277, + "epoch": 0.2845560609056753, + "mean_token_accuracy": 0.7683315873146057, + "num_tokens": 14986940.0, + "step": 2878, + "train/ce_loss": 0.6498753428459167 + }, + { + "epoch": 0.2845560609056753, + "step": 2878, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.2845560609056753, + "step": 2878, + "train/total_loss": 0.13139379024505615 + }, + { + "entropy": 9.124774932861328, + "epoch": 0.2846549337551908, + "mean_token_accuracy": 0.738095223903656, + "num_tokens": 14992231.0, + "step": 2879, + "train/ce_loss": 1.1597203016281128 + }, + { + "epoch": 0.2846549337551908, + "step": 2879, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.2846549337551908, + "step": 2879, + "train/total_loss": 0.24878454208374023 + }, + { + "epoch": 0.28475380660470634, + "grad_norm": 0.9130677580833435, + "learning_rate": 9.290659150472235e-06, + "loss": 0.1444, + "step": 2880 + }, + { + "entropy": 9.95883560180664, + "epoch": 0.28475380660470634, + "mean_token_accuracy": 0.7068607211112976, + "num_tokens": 14997134.0, + "step": 2880, + "train/ce_loss": 1.6533236503601074 + }, + { + "epoch": 0.28475380660470634, + "step": 2880, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.28475380660470634, + "step": 2880, + "train/total_loss": 0.2903323769569397 + }, + { + "entropy": 9.849489212036133, + "epoch": 0.2848526794542219, + "mean_token_accuracy": 0.7683615684509277, + "num_tokens": 15002099.0, + "step": 2881, + "train/ce_loss": 1.461854338645935 + }, + { + "epoch": 0.2848526794542219, + "step": 2881, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.2848526794542219, + "step": 2881, + "train/total_loss": 0.22431044280529022 + }, + { + "entropy": 9.121145248413086, + "epoch": 0.2849515523037374, + "mean_token_accuracy": 0.7242646813392639, + "num_tokens": 15007399.0, + "step": 2882, + "train/ce_loss": 0.46917709708213806 + }, + { + "epoch": 0.2849515523037374, + "step": 2882, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.2849515523037374, + "step": 2882, + "train/total_loss": 0.11332395672798157 + }, + { + "entropy": 8.879717826843262, + "epoch": 0.2850504251532529, + "mean_token_accuracy": 0.7605459094047546, + "num_tokens": 15012741.0, + "step": 2883, + "train/ce_loss": 0.7481593489646912 + }, + { + "epoch": 0.2850504251532529, + "step": 2883, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.2850504251532529, + "step": 2883, + "train/total_loss": 0.18419092893600464 + }, + { + "entropy": 9.377570152282715, + "epoch": 0.28514929800276845, + "mean_token_accuracy": 0.7293035387992859, + "num_tokens": 15017922.0, + "step": 2884, + "train/ce_loss": 1.36378014087677 + }, + { + "epoch": 0.28514929800276845, + "step": 2884, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.28514929800276845, + "step": 2884, + "train/total_loss": 0.17153427004814148 + }, + { + "entropy": 9.331884384155273, + "epoch": 0.285248170852284, + "mean_token_accuracy": 0.7418879270553589, + "num_tokens": 15023068.0, + "step": 2885, + "train/ce_loss": 1.1111305866506882e-05 + }, + { + "epoch": 0.285248170852284, + "step": 2885, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.285248170852284, + "step": 2885, + "train/total_loss": 0.02734486199915409 + }, + { + "entropy": 8.684528350830078, + "epoch": 0.2853470437017995, + "mean_token_accuracy": 0.7504363059997559, + "num_tokens": 15028713.0, + "step": 2886, + "train/ce_loss": 0.8095030784606934 + }, + { + "epoch": 0.2853470437017995, + "step": 2886, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.2853470437017995, + "step": 2886, + "train/total_loss": 0.1747003197669983 + }, + { + "entropy": 9.024653434753418, + "epoch": 0.285445916551315, + "mean_token_accuracy": 0.7375296950340271, + "num_tokens": 15034053.0, + "step": 2887, + "train/ce_loss": 1.1938775777816772 + }, + { + "epoch": 0.285445916551315, + "step": 2887, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.285445916551315, + "step": 2887, + "train/total_loss": 0.23657526075839996 + }, + { + "entropy": 8.832128524780273, + "epoch": 0.28554478940083056, + "mean_token_accuracy": 0.701508641242981, + "num_tokens": 15039481.0, + "step": 2888, + "train/ce_loss": 1.0464129447937012 + }, + { + "epoch": 0.28554478940083056, + "step": 2888, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.28554478940083056, + "step": 2888, + "train/total_loss": 0.21011003851890564 + }, + { + "entropy": 9.106720924377441, + "epoch": 0.28564366225034604, + "mean_token_accuracy": 0.7845982313156128, + "num_tokens": 15044874.0, + "step": 2889, + "train/ce_loss": 0.8580244183540344 + }, + { + "epoch": 0.28564366225034604, + "step": 2889, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.28564366225034604, + "step": 2889, + "train/total_loss": 0.16783368587493896 + }, + { + "entropy": 9.683690071105957, + "epoch": 0.2857425350998616, + "mean_token_accuracy": 0.7377049326896667, + "num_tokens": 15049889.0, + "step": 2890, + "train/ce_loss": 0.7985544204711914 + }, + { + "epoch": 0.2857425350998616, + "step": 2890, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.2857425350998616, + "step": 2890, + "train/total_loss": 0.13844919204711914 + }, + { + "entropy": 9.650266647338867, + "epoch": 0.2858414079493771, + "mean_token_accuracy": 0.6741573214530945, + "num_tokens": 15054946.0, + "step": 2891, + "train/ce_loss": 1.537845492362976 + }, + { + "epoch": 0.2858414079493771, + "step": 2891, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.2858414079493771, + "step": 2891, + "train/total_loss": 0.2592533230781555 + }, + { + "entropy": 9.020576477050781, + "epoch": 0.2859402807988926, + "mean_token_accuracy": 0.7421320080757141, + "num_tokens": 15060366.0, + "step": 2892, + "train/ce_loss": 0.669991672039032 + }, + { + "epoch": 0.2859402807988926, + "step": 2892, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.2859402807988926, + "step": 2892, + "train/total_loss": 0.0943429172039032 + }, + { + "entropy": 8.795308113098145, + "epoch": 0.28603915364840815, + "mean_token_accuracy": 0.789875864982605, + "num_tokens": 15065882.0, + "step": 2893, + "train/ce_loss": 0.2111099511384964 + }, + { + "epoch": 0.28603915364840815, + "step": 2893, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.28603915364840815, + "step": 2893, + "train/total_loss": 0.04454849660396576 + }, + { + "entropy": 9.871052742004395, + "epoch": 0.2861380264979237, + "mean_token_accuracy": 0.7721822261810303, + "num_tokens": 15070716.0, + "step": 2894, + "train/ce_loss": 2.063713788986206 + }, + { + "epoch": 0.2861380264979237, + "step": 2894, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.2861380264979237, + "step": 2894, + "train/total_loss": 0.27668386697769165 + }, + { + "entropy": 9.305549621582031, + "epoch": 0.2862368993474392, + "mean_token_accuracy": 0.6906290054321289, + "num_tokens": 15075962.0, + "step": 2895, + "train/ce_loss": 1.2572402954101562 + }, + { + "epoch": 0.2862368993474392, + "step": 2895, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.2862368993474392, + "step": 2895, + "train/total_loss": 0.20384903252124786 + }, + { + "entropy": 9.614995002746582, + "epoch": 0.2863357721969547, + "mean_token_accuracy": 0.7966386675834656, + "num_tokens": 15081032.0, + "step": 2896, + "train/ce_loss": 1.1156418323516846 + }, + { + "epoch": 0.2863357721969547, + "step": 2896, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.2863357721969547, + "step": 2896, + "train/total_loss": 0.16625168919563293 + }, + { + "entropy": 9.07950210571289, + "epoch": 0.28643464504647026, + "mean_token_accuracy": 0.7869023084640503, + "num_tokens": 15086442.0, + "step": 2897, + "train/ce_loss": 0.8098610043525696 + }, + { + "epoch": 0.28643464504647026, + "step": 2897, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.28643464504647026, + "step": 2897, + "train/total_loss": 0.1630173623561859 + }, + { + "entropy": 9.426172256469727, + "epoch": 0.28653351789598575, + "mean_token_accuracy": 0.7753247022628784, + "num_tokens": 15091634.0, + "step": 2898, + "train/ce_loss": 0.95197993516922 + }, + { + "epoch": 0.28653351789598575, + "step": 2898, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.28653351789598575, + "step": 2898, + "train/total_loss": 0.13816675543785095 + }, + { + "entropy": 9.329826354980469, + "epoch": 0.2866323907455013, + "mean_token_accuracy": 0.7532981634140015, + "num_tokens": 15096781.0, + "step": 2899, + "train/ce_loss": 0.6716318130493164 + }, + { + "epoch": 0.2866323907455013, + "step": 2899, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.2866323907455013, + "step": 2899, + "train/total_loss": 0.13747568428516388 + }, + { + "epoch": 0.2867312635950168, + "grad_norm": 0.7907260060310364, + "learning_rate": 9.285714285714288e-06, + "loss": 0.1448, + "step": 2900 + }, + { + "entropy": 9.610153198242188, + "epoch": 0.2867312635950168, + "mean_token_accuracy": 0.7203647494316101, + "num_tokens": 15102227.0, + "step": 2900, + "train/ce_loss": 1.2978801727294922 + }, + { + "epoch": 0.2867312635950168, + "step": 2900, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.2867312635950168, + "step": 2900, + "train/total_loss": 0.21572552621364594 + }, + { + "entropy": 9.642109870910645, + "epoch": 0.2868301364445323, + "mean_token_accuracy": 0.745794415473938, + "num_tokens": 15107202.0, + "step": 2901, + "train/ce_loss": 3.4009508453891613e-06 + }, + { + "epoch": 0.2868301364445323, + "step": 2901, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.2868301364445323, + "step": 2901, + "train/total_loss": 0.04296908900141716 + }, + { + "entropy": 9.003314971923828, + "epoch": 0.28692900929404785, + "mean_token_accuracy": 0.7480998635292053, + "num_tokens": 15112616.0, + "step": 2902, + "train/ce_loss": 0.8984993696212769 + }, + { + "epoch": 0.28692900929404785, + "step": 2902, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.28692900929404785, + "step": 2902, + "train/total_loss": 0.21094369888305664 + }, + { + "entropy": 9.450581550598145, + "epoch": 0.2870278821435634, + "mean_token_accuracy": 0.6834094524383545, + "num_tokens": 15117680.0, + "step": 2903, + "train/ce_loss": 1.4190950393676758 + }, + { + "epoch": 0.2870278821435634, + "step": 2903, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.2870278821435634, + "step": 2903, + "train/total_loss": 0.23565950989723206 + }, + { + "entropy": 9.08843994140625, + "epoch": 0.2871267549930789, + "mean_token_accuracy": 0.6920199394226074, + "num_tokens": 15122975.0, + "step": 2904, + "train/ce_loss": 1.9272888898849487 + }, + { + "epoch": 0.2871267549930789, + "step": 2904, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.2871267549930789, + "step": 2904, + "train/total_loss": 0.2669476270675659 + }, + { + "entropy": 9.10743522644043, + "epoch": 0.2872256278425944, + "mean_token_accuracy": 0.7439903616905212, + "num_tokens": 15128284.0, + "step": 2905, + "train/ce_loss": 0.8795641660690308 + }, + { + "epoch": 0.2872256278425944, + "step": 2905, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.2872256278425944, + "step": 2905, + "train/total_loss": 0.17780017852783203 + }, + { + "entropy": 9.162273406982422, + "epoch": 0.28732450069210996, + "mean_token_accuracy": 0.7629009485244751, + "num_tokens": 15133392.0, + "step": 2906, + "train/ce_loss": 0.5848463773727417 + }, + { + "epoch": 0.28732450069210996, + "step": 2906, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.28732450069210996, + "step": 2906, + "train/total_loss": 0.10145339369773865 + }, + { + "entropy": 9.031187057495117, + "epoch": 0.28742337354162545, + "mean_token_accuracy": 0.700796365737915, + "num_tokens": 15138727.0, + "step": 2907, + "train/ce_loss": 0.8902770280838013 + }, + { + "epoch": 0.28742337354162545, + "step": 2907, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.28742337354162545, + "step": 2907, + "train/total_loss": 0.15152770280838013 + }, + { + "entropy": 9.404449462890625, + "epoch": 0.287522246391141, + "mean_token_accuracy": 0.807479202747345, + "num_tokens": 15143927.0, + "step": 2908, + "train/ce_loss": 0.7218267917633057 + }, + { + "epoch": 0.287522246391141, + "step": 2908, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.287522246391141, + "step": 2908, + "train/total_loss": 0.09562017768621445 + }, + { + "entropy": 9.481965065002441, + "epoch": 0.28762111924065653, + "mean_token_accuracy": 0.7522796392440796, + "num_tokens": 15149001.0, + "step": 2909, + "train/ce_loss": 4.39267796537024e-06 + }, + { + "epoch": 0.28762111924065653, + "step": 2909, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.28762111924065653, + "step": 2909, + "train/total_loss": 0.03906293958425522 + }, + { + "entropy": 9.307743072509766, + "epoch": 0.287719992090172, + "mean_token_accuracy": 0.7609561681747437, + "num_tokens": 15154226.0, + "step": 2910, + "train/ce_loss": 0.664038896560669 + }, + { + "epoch": 0.287719992090172, + "step": 2910, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.287719992090172, + "step": 2910, + "train/total_loss": 0.10546638816595078 + }, + { + "entropy": 9.571969985961914, + "epoch": 0.28781886493968756, + "mean_token_accuracy": 0.7578616142272949, + "num_tokens": 15159339.0, + "step": 2911, + "train/ce_loss": 0.739711344242096 + }, + { + "epoch": 0.28781886493968756, + "step": 2911, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.28781886493968756, + "step": 2911, + "train/total_loss": 0.16381488740444183 + }, + { + "entropy": 9.01880931854248, + "epoch": 0.2879177377892031, + "mean_token_accuracy": 0.7094017267227173, + "num_tokens": 15164629.0, + "step": 2912, + "train/ce_loss": 0.7169989943504333 + }, + { + "epoch": 0.2879177377892031, + "step": 2912, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.2879177377892031, + "step": 2912, + "train/total_loss": 0.13419990241527557 + }, + { + "entropy": 8.842731475830078, + "epoch": 0.2880166106387186, + "mean_token_accuracy": 0.6873683929443359, + "num_tokens": 15170096.0, + "step": 2913, + "train/ce_loss": 0.895346462726593 + }, + { + "epoch": 0.2880166106387186, + "step": 2913, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.2880166106387186, + "step": 2913, + "train/total_loss": 0.23797214031219482 + }, + { + "entropy": 9.337254524230957, + "epoch": 0.2881154834882341, + "mean_token_accuracy": 0.7176470756530762, + "num_tokens": 15175243.0, + "step": 2914, + "train/ce_loss": 1.541421890258789 + }, + { + "epoch": 0.2881154834882341, + "step": 2914, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.2881154834882341, + "step": 2914, + "train/total_loss": 0.23226718604564667 + }, + { + "entropy": 9.131725311279297, + "epoch": 0.28821435633774967, + "mean_token_accuracy": 0.715976357460022, + "num_tokens": 15180551.0, + "step": 2915, + "train/ce_loss": 0.9469427466392517 + }, + { + "epoch": 0.28821435633774967, + "step": 2915, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.28821435633774967, + "step": 2915, + "train/total_loss": 0.12985053658485413 + }, + { + "entropy": 8.793785095214844, + "epoch": 0.28831322918726515, + "mean_token_accuracy": 0.7376705408096313, + "num_tokens": 15185952.0, + "step": 2916, + "train/ce_loss": 0.4122583568096161 + }, + { + "epoch": 0.28831322918726515, + "step": 2916, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.28831322918726515, + "step": 2916, + "train/total_loss": 0.05685083568096161 + }, + { + "entropy": 9.526094436645508, + "epoch": 0.2884121020367807, + "mean_token_accuracy": 0.7099999785423279, + "num_tokens": 15191005.0, + "step": 2917, + "train/ce_loss": 1.3818445205688477 + }, + { + "epoch": 0.2884121020367807, + "step": 2917, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.2884121020367807, + "step": 2917, + "train/total_loss": 0.22021570801734924 + }, + { + "entropy": 9.113298416137695, + "epoch": 0.28851097488629623, + "mean_token_accuracy": 0.7255594730377197, + "num_tokens": 15196322.0, + "step": 2918, + "train/ce_loss": 0.6022658348083496 + }, + { + "epoch": 0.28851097488629623, + "step": 2918, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.28851097488629623, + "step": 2918, + "train/total_loss": 0.12663283944129944 + }, + { + "entropy": 9.03260612487793, + "epoch": 0.2886098477358117, + "mean_token_accuracy": 0.7084826827049255, + "num_tokens": 15201584.0, + "step": 2919, + "train/ce_loss": 1.2507596015930176 + }, + { + "epoch": 0.2886098477358117, + "step": 2919, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.2886098477358117, + "step": 2919, + "train/total_loss": 0.24226346611976624 + }, + { + "epoch": 0.28870872058532726, + "grad_norm": 0.8603032231330872, + "learning_rate": 9.280769420956338e-06, + "loss": 0.1559, + "step": 2920 + }, + { + "entropy": 9.646169662475586, + "epoch": 0.28870872058532726, + "mean_token_accuracy": 0.71378093957901, + "num_tokens": 15206600.0, + "step": 2920, + "train/ce_loss": 1.6396628618240356 + }, + { + "epoch": 0.28870872058532726, + "step": 2920, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.28870872058532726, + "step": 2920, + "train/total_loss": 0.2538100481033325 + }, + { + "entropy": 9.373332977294922, + "epoch": 0.2888075934348428, + "mean_token_accuracy": 0.7517630457878113, + "num_tokens": 15211718.0, + "step": 2921, + "train/ce_loss": 1.1262335777282715 + }, + { + "epoch": 0.2888075934348428, + "step": 2921, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.2888075934348428, + "step": 2921, + "train/total_loss": 0.17121711373329163 + }, + { + "entropy": 9.027542114257812, + "epoch": 0.2889064662843583, + "mean_token_accuracy": 0.76949542760849, + "num_tokens": 15217065.0, + "step": 2922, + "train/ce_loss": 1.0222415924072266 + }, + { + "epoch": 0.2889064662843583, + "step": 2922, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.2889064662843583, + "step": 2922, + "train/total_loss": 0.1647241711616516 + }, + { + "entropy": 8.895853996276855, + "epoch": 0.28900533913387383, + "mean_token_accuracy": 0.7224880456924438, + "num_tokens": 15222387.0, + "step": 2923, + "train/ce_loss": 0.9339638948440552 + }, + { + "epoch": 0.28900533913387383, + "step": 2923, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.28900533913387383, + "step": 2923, + "train/total_loss": 0.16370889544487 + }, + { + "entropy": 9.915444374084473, + "epoch": 0.28910421198338937, + "mean_token_accuracy": 0.7452229261398315, + "num_tokens": 15227275.0, + "step": 2924, + "train/ce_loss": 1.3535692691802979 + }, + { + "epoch": 0.28910421198338937, + "step": 2924, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.28910421198338937, + "step": 2924, + "train/total_loss": 0.20957568287849426 + }, + { + "entropy": 9.574451446533203, + "epoch": 0.2892030848329049, + "mean_token_accuracy": 0.7394958138465881, + "num_tokens": 15232294.0, + "step": 2925, + "train/ce_loss": 0.8031234741210938 + }, + { + "epoch": 0.2892030848329049, + "step": 2925, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.2892030848329049, + "step": 2925, + "train/total_loss": 0.1428123414516449 + }, + { + "entropy": 9.17717170715332, + "epoch": 0.2893019576824204, + "mean_token_accuracy": 0.7391842007637024, + "num_tokens": 15237572.0, + "step": 2926, + "train/ce_loss": 0.6628701686859131 + }, + { + "epoch": 0.2893019576824204, + "step": 2926, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.2893019576824204, + "step": 2926, + "train/total_loss": 0.10534951835870743 + }, + { + "entropy": 9.602263450622559, + "epoch": 0.28940083053193594, + "mean_token_accuracy": 0.7700170278549194, + "num_tokens": 15242616.0, + "step": 2927, + "train/ce_loss": 1.0057820081710815 + }, + { + "epoch": 0.28940083053193594, + "step": 2927, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.28940083053193594, + "step": 2927, + "train/total_loss": 0.2021407037973404 + }, + { + "entropy": 9.086362838745117, + "epoch": 0.2894997033814515, + "mean_token_accuracy": 0.6846733689308167, + "num_tokens": 15247803.0, + "step": 2928, + "train/ce_loss": 0.859923779964447 + }, + { + "epoch": 0.2894997033814515, + "step": 2928, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.2894997033814515, + "step": 2928, + "train/total_loss": 0.16411738097667694 + }, + { + "entropy": 9.409735679626465, + "epoch": 0.28959857623096696, + "mean_token_accuracy": 0.7667140960693359, + "num_tokens": 15252974.0, + "step": 2929, + "train/ce_loss": 0.8224440217018127 + }, + { + "epoch": 0.28959857623096696, + "step": 2929, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.28959857623096696, + "step": 2929, + "train/total_loss": 0.1095881536602974 + }, + { + "entropy": 9.117687225341797, + "epoch": 0.2896974490804825, + "mean_token_accuracy": 0.7347995042800903, + "num_tokens": 15258217.0, + "step": 2930, + "train/ce_loss": 0.755094587802887 + }, + { + "epoch": 0.2896974490804825, + "step": 2930, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.2896974490804825, + "step": 2930, + "train/total_loss": 0.1184782087802887 + }, + { + "entropy": 9.611143112182617, + "epoch": 0.28979632192999805, + "mean_token_accuracy": 0.686274528503418, + "num_tokens": 15263206.0, + "step": 2931, + "train/ce_loss": 0.620786190032959 + }, + { + "epoch": 0.28979632192999805, + "step": 2931, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.28979632192999805, + "step": 2931, + "train/total_loss": 0.14801612496376038 + }, + { + "entropy": 8.850191116333008, + "epoch": 0.28989519477951353, + "mean_token_accuracy": 0.7307692170143127, + "num_tokens": 15268699.0, + "step": 2932, + "train/ce_loss": 0.6986292004585266 + }, + { + "epoch": 0.28989519477951353, + "step": 2932, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.28989519477951353, + "step": 2932, + "train/total_loss": 0.09720667451620102 + }, + { + "entropy": 9.25748348236084, + "epoch": 0.2899940676290291, + "mean_token_accuracy": 0.7631579041481018, + "num_tokens": 15273784.0, + "step": 2933, + "train/ce_loss": 1.0593773126602173 + }, + { + "epoch": 0.2899940676290291, + "step": 2933, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.2899940676290291, + "step": 2933, + "train/total_loss": 0.16453148424625397 + }, + { + "entropy": 9.362494468688965, + "epoch": 0.2900929404785446, + "mean_token_accuracy": 0.7336010932922363, + "num_tokens": 15278971.0, + "step": 2934, + "train/ce_loss": 0.6346144676208496 + }, + { + "epoch": 0.2900929404785446, + "step": 2934, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.2900929404785446, + "step": 2934, + "train/total_loss": 0.14158645272254944 + }, + { + "entropy": 8.851987838745117, + "epoch": 0.2901918133280601, + "mean_token_accuracy": 0.7468220591545105, + "num_tokens": 15284426.0, + "step": 2935, + "train/ce_loss": 0.7906885743141174 + }, + { + "epoch": 0.2901918133280601, + "step": 2935, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.2901918133280601, + "step": 2935, + "train/total_loss": 0.1220376119017601 + }, + { + "entropy": 9.00190544128418, + "epoch": 0.29029068617757564, + "mean_token_accuracy": 0.6767337918281555, + "num_tokens": 15289749.0, + "step": 2936, + "train/ce_loss": 1.0030051469802856 + }, + { + "epoch": 0.29029068617757564, + "step": 2936, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.29029068617757564, + "step": 2936, + "train/total_loss": 0.19405052065849304 + }, + { + "entropy": 8.663482666015625, + "epoch": 0.2903895590270912, + "mean_token_accuracy": 0.7568134069442749, + "num_tokens": 15295187.0, + "step": 2937, + "train/ce_loss": 1.220179557800293 + }, + { + "epoch": 0.2903895590270912, + "step": 2937, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.2903895590270912, + "step": 2937, + "train/total_loss": 0.18061169981956482 + }, + { + "entropy": 9.416010856628418, + "epoch": 0.29048843187660667, + "mean_token_accuracy": 0.6734992861747742, + "num_tokens": 15300387.0, + "step": 2938, + "train/ce_loss": 1.1120859384536743 + }, + { + "epoch": 0.29048843187660667, + "step": 2938, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.29048843187660667, + "step": 2938, + "train/total_loss": 0.16589608788490295 + }, + { + "entropy": 8.690788269042969, + "epoch": 0.2905873047261222, + "mean_token_accuracy": 0.7480403184890747, + "num_tokens": 15305747.0, + "step": 2939, + "train/ce_loss": 0.9094756245613098 + }, + { + "epoch": 0.2905873047261222, + "step": 2939, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.2905873047261222, + "step": 2939, + "train/total_loss": 0.12610381841659546 + }, + { + "epoch": 0.29068617757563775, + "grad_norm": 0.9364753365516663, + "learning_rate": 9.275824556198389e-06, + "loss": 0.1539, + "step": 2940 + }, + { + "entropy": 9.060997009277344, + "epoch": 0.29068617757563775, + "mean_token_accuracy": 0.7299363017082214, + "num_tokens": 15310994.0, + "step": 2940, + "train/ce_loss": 0.6110745668411255 + }, + { + "epoch": 0.29068617757563775, + "step": 2940, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.29068617757563775, + "step": 2940, + "train/total_loss": 0.16657620668411255 + }, + { + "entropy": 9.382147789001465, + "epoch": 0.29078505042515324, + "mean_token_accuracy": 0.716617226600647, + "num_tokens": 15316134.0, + "step": 2941, + "train/ce_loss": 1.2948169708251953 + }, + { + "epoch": 0.29078505042515324, + "step": 2941, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.29078505042515324, + "step": 2941, + "train/total_loss": 0.199794203042984 + }, + { + "entropy": 8.88481616973877, + "epoch": 0.2908839232746688, + "mean_token_accuracy": 0.6993464231491089, + "num_tokens": 15321475.0, + "step": 2942, + "train/ce_loss": 0.6280501484870911 + }, + { + "epoch": 0.2908839232746688, + "step": 2942, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.2908839232746688, + "step": 2942, + "train/total_loss": 0.11358626931905746 + }, + { + "entropy": 9.055749893188477, + "epoch": 0.2909827961241843, + "mean_token_accuracy": 0.7310513257980347, + "num_tokens": 15326761.0, + "step": 2943, + "train/ce_loss": 0.5801316499710083 + }, + { + "epoch": 0.2909827961241843, + "step": 2943, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.2909827961241843, + "step": 2943, + "train/total_loss": 0.11270067095756531 + }, + { + "entropy": 9.614754676818848, + "epoch": 0.2910816689736998, + "mean_token_accuracy": 0.6486486196517944, + "num_tokens": 15331621.0, + "step": 2944, + "train/ce_loss": 3.4918501377105713 + }, + { + "epoch": 0.2910816689736998, + "step": 2944, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.2910816689736998, + "step": 2944, + "train/total_loss": 0.4819975197315216 + }, + { + "entropy": 9.08978271484375, + "epoch": 0.29118054182321534, + "mean_token_accuracy": 0.7785235047340393, + "num_tokens": 15336991.0, + "step": 2945, + "train/ce_loss": 0.48373153805732727 + }, + { + "epoch": 0.29118054182321534, + "step": 2945, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.29118054182321534, + "step": 2945, + "train/total_loss": 0.06790440529584885 + }, + { + "entropy": 8.795520782470703, + "epoch": 0.2912794146727309, + "mean_token_accuracy": 0.7587511539459229, + "num_tokens": 15342530.0, + "step": 2946, + "train/ce_loss": 0.6220370531082153 + }, + { + "epoch": 0.2912794146727309, + "step": 2946, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.2912794146727309, + "step": 2946, + "train/total_loss": 0.13251620531082153 + }, + { + "entropy": 9.074284553527832, + "epoch": 0.29137828752224637, + "mean_token_accuracy": 0.6840193867683411, + "num_tokens": 15347811.0, + "step": 2947, + "train/ce_loss": 1.5848338603973389 + }, + { + "epoch": 0.29137828752224637, + "step": 2947, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.29137828752224637, + "step": 2947, + "train/total_loss": 0.2170771360397339 + }, + { + "entropy": 9.856287956237793, + "epoch": 0.2914771603717619, + "mean_token_accuracy": 0.7050691246986389, + "num_tokens": 15352696.0, + "step": 2948, + "train/ce_loss": 0.8591023087501526 + }, + { + "epoch": 0.2914771603717619, + "step": 2948, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.2914771603717619, + "step": 2948, + "train/total_loss": 0.13278523087501526 + }, + { + "entropy": 9.76569938659668, + "epoch": 0.29157603322127745, + "mean_token_accuracy": 0.7038834691047668, + "num_tokens": 15357498.0, + "step": 2949, + "train/ce_loss": 2.4213409423828125 + }, + { + "epoch": 0.29157603322127745, + "step": 2949, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.29157603322127745, + "step": 2949, + "train/total_loss": 0.34369659423828125 + }, + { + "entropy": 9.00585651397705, + "epoch": 0.29167490607079294, + "mean_token_accuracy": 0.7393422722816467, + "num_tokens": 15362774.0, + "step": 2950, + "train/ce_loss": 1.1268589496612549 + }, + { + "epoch": 0.29167490607079294, + "step": 2950, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.29167490607079294, + "step": 2950, + "train/total_loss": 0.194717139005661 + }, + { + "entropy": 9.489755630493164, + "epoch": 0.2917737789203085, + "mean_token_accuracy": 0.7415929436683655, + "num_tokens": 15367806.0, + "step": 2951, + "train/ce_loss": 5.05991420141072e-06 + }, + { + "epoch": 0.2917737789203085, + "step": 2951, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.2917737789203085, + "step": 2951, + "train/total_loss": 0.04687550663948059 + }, + { + "entropy": 8.950197219848633, + "epoch": 0.291872651769824, + "mean_token_accuracy": 0.7437020540237427, + "num_tokens": 15373187.0, + "step": 2952, + "train/ce_loss": 0.34434279799461365 + }, + { + "epoch": 0.291872651769824, + "step": 2952, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.291872651769824, + "step": 2952, + "train/total_loss": 0.10474678128957748 + }, + { + "entropy": 9.185539245605469, + "epoch": 0.2919715246193395, + "mean_token_accuracy": 0.7609289884567261, + "num_tokens": 15378396.0, + "step": 2953, + "train/ce_loss": 0.8392499089241028 + }, + { + "epoch": 0.2919715246193395, + "step": 2953, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.2919715246193395, + "step": 2953, + "train/total_loss": 0.12689374387264252 + }, + { + "entropy": 9.294303894042969, + "epoch": 0.29207039746885505, + "mean_token_accuracy": 0.7472527623176575, + "num_tokens": 15383579.0, + "step": 2954, + "train/ce_loss": 1.0456651449203491 + }, + { + "epoch": 0.29207039746885505, + "step": 2954, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.29207039746885505, + "step": 2954, + "train/total_loss": 0.1319102644920349 + }, + { + "entropy": 9.182682037353516, + "epoch": 0.2921692703183706, + "mean_token_accuracy": 0.7450722455978394, + "num_tokens": 15388851.0, + "step": 2955, + "train/ce_loss": 7.724240276729688e-05 + }, + { + "epoch": 0.2921692703183706, + "step": 2955, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.2921692703183706, + "step": 2955, + "train/total_loss": 0.04688272252678871 + }, + { + "entropy": 9.01830005645752, + "epoch": 0.2922681431678861, + "mean_token_accuracy": 0.7248018383979797, + "num_tokens": 15394197.0, + "step": 2956, + "train/ce_loss": 0.949018657207489 + }, + { + "epoch": 0.2922681431678861, + "step": 2956, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.2922681431678861, + "step": 2956, + "train/total_loss": 0.20427685976028442 + }, + { + "entropy": 9.368885040283203, + "epoch": 0.2923670160174016, + "mean_token_accuracy": 0.7099023461341858, + "num_tokens": 15399370.0, + "step": 2957, + "train/ce_loss": 1.0313037633895874 + }, + { + "epoch": 0.2923670160174016, + "step": 2957, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.2923670160174016, + "step": 2957, + "train/total_loss": 0.15000537037849426 + }, + { + "entropy": 9.373347282409668, + "epoch": 0.29246588886691716, + "mean_token_accuracy": 0.704827606678009, + "num_tokens": 15404561.0, + "step": 2958, + "train/ce_loss": 0.7711067795753479 + }, + { + "epoch": 0.29246588886691716, + "step": 2958, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.29246588886691716, + "step": 2958, + "train/total_loss": 0.1278919279575348 + }, + { + "entropy": 9.563098907470703, + "epoch": 0.29256476171643264, + "mean_token_accuracy": 0.7034220695495605, + "num_tokens": 15409563.0, + "step": 2959, + "train/ce_loss": 0.7286401987075806 + }, + { + "epoch": 0.29256476171643264, + "step": 2959, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.29256476171643264, + "step": 2959, + "train/total_loss": 0.15880152583122253 + }, + { + "epoch": 0.2926636345659482, + "grad_norm": 1.0534138679504395, + "learning_rate": 9.270879691440439e-06, + "loss": 0.1555, + "step": 2960 + }, + { + "entropy": 9.002914428710938, + "epoch": 0.2926636345659482, + "mean_token_accuracy": 0.7153284549713135, + "num_tokens": 15414801.0, + "step": 2960, + "train/ce_loss": 0.8889736533164978 + }, + { + "epoch": 0.2926636345659482, + "step": 2960, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.2926636345659482, + "step": 2960, + "train/total_loss": 0.13577237725257874 + }, + { + "entropy": 9.103862762451172, + "epoch": 0.2927625074154637, + "mean_token_accuracy": 0.7592829465866089, + "num_tokens": 15420053.0, + "step": 2961, + "train/ce_loss": 0.4983813464641571 + }, + { + "epoch": 0.2927625074154637, + "step": 2961, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.2927625074154637, + "step": 2961, + "train/total_loss": 0.12015064060688019 + }, + { + "entropy": 9.445127487182617, + "epoch": 0.2928613802649792, + "mean_token_accuracy": 0.7324561476707458, + "num_tokens": 15425156.0, + "step": 2962, + "train/ce_loss": 0.8944116830825806 + }, + { + "epoch": 0.2928613802649792, + "step": 2962, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.2928613802649792, + "step": 2962, + "train/total_loss": 0.163659930229187 + }, + { + "entropy": 9.726035118103027, + "epoch": 0.29296025311449475, + "mean_token_accuracy": 0.6761363744735718, + "num_tokens": 15430123.0, + "step": 2963, + "train/ce_loss": 4.313818408263614e-06 + }, + { + "epoch": 0.29296025311449475, + "step": 2963, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.29296025311449475, + "step": 2963, + "train/total_loss": 0.01953168213367462 + }, + { + "entropy": 9.181549072265625, + "epoch": 0.2930591259640103, + "mean_token_accuracy": 0.7400768399238586, + "num_tokens": 15435323.0, + "step": 2964, + "train/ce_loss": 0.5938209891319275 + }, + { + "epoch": 0.2930591259640103, + "step": 2964, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.2930591259640103, + "step": 2964, + "train/total_loss": 0.10235084593296051 + }, + { + "entropy": 9.563488006591797, + "epoch": 0.29315799881352583, + "mean_token_accuracy": 0.7197986841201782, + "num_tokens": 15440341.0, + "step": 2965, + "train/ce_loss": 1.5780949592590332 + }, + { + "epoch": 0.29315799881352583, + "step": 2965, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.29315799881352583, + "step": 2965, + "train/total_loss": 0.2710907459259033 + }, + { + "entropy": 9.043390274047852, + "epoch": 0.2932568716630413, + "mean_token_accuracy": 0.7263948321342468, + "num_tokens": 15445852.0, + "step": 2966, + "train/ce_loss": 5.926921858190326e-06 + }, + { + "epoch": 0.2932568716630413, + "step": 2966, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.2932568716630413, + "step": 2966, + "train/total_loss": 0.042969342321157455 + }, + { + "entropy": 9.691654205322266, + "epoch": 0.29335574451255686, + "mean_token_accuracy": 0.734375, + "num_tokens": 15450772.0, + "step": 2967, + "train/ce_loss": 1.099898099899292 + }, + { + "epoch": 0.29335574451255686, + "step": 2967, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.29335574451255686, + "step": 2967, + "train/total_loss": 0.19202107191085815 + }, + { + "entropy": 9.398199081420898, + "epoch": 0.2934546173620724, + "mean_token_accuracy": 0.7475317120552063, + "num_tokens": 15455899.0, + "step": 2968, + "train/ce_loss": 5.713059636036633e-06 + }, + { + "epoch": 0.2934546173620724, + "step": 2968, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.2934546173620724, + "step": 2968, + "train/total_loss": 0.031250569969415665 + }, + { + "entropy": 9.10877799987793, + "epoch": 0.2935534902115879, + "mean_token_accuracy": 0.7523696422576904, + "num_tokens": 15461222.0, + "step": 2969, + "train/ce_loss": 0.6074536442756653 + }, + { + "epoch": 0.2935534902115879, + "step": 2969, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.2935534902115879, + "step": 2969, + "train/total_loss": 0.13887035846710205 + }, + { + "entropy": 8.610416412353516, + "epoch": 0.2936523630611034, + "mean_token_accuracy": 0.7567567825317383, + "num_tokens": 15466487.0, + "step": 2970, + "train/ce_loss": 1.0411075353622437 + }, + { + "epoch": 0.2936523630611034, + "step": 2970, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.2936523630611034, + "step": 2970, + "train/total_loss": 0.1783294975757599 + }, + { + "entropy": 9.788820266723633, + "epoch": 0.29375123591061897, + "mean_token_accuracy": 0.7931034564971924, + "num_tokens": 15471259.0, + "step": 2971, + "train/ce_loss": 1.6351121664047241 + }, + { + "epoch": 0.29375123591061897, + "step": 2971, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.29375123591061897, + "step": 2971, + "train/total_loss": 0.2260112166404724 + }, + { + "entropy": 8.552578926086426, + "epoch": 0.29385010876013445, + "mean_token_accuracy": 0.717597484588623, + "num_tokens": 15476722.0, + "step": 2972, + "train/ce_loss": 1.4389077425003052 + }, + { + "epoch": 0.29385010876013445, + "step": 2972, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.29385010876013445, + "step": 2972, + "train/total_loss": 0.21810953319072723 + }, + { + "entropy": 9.41911506652832, + "epoch": 0.29394898160965, + "mean_token_accuracy": 0.7859424948692322, + "num_tokens": 15481802.0, + "step": 2973, + "train/ce_loss": 0.8379352688789368 + }, + { + "epoch": 0.29394898160965, + "step": 2973, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.29394898160965, + "step": 2973, + "train/total_loss": 0.1189497783780098 + }, + { + "entropy": 9.36585807800293, + "epoch": 0.29404785445916554, + "mean_token_accuracy": 0.7714748978614807, + "num_tokens": 15486875.0, + "step": 2974, + "train/ce_loss": 1.2797170877456665 + }, + { + "epoch": 0.29404785445916554, + "step": 2974, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.29404785445916554, + "step": 2974, + "train/total_loss": 0.21390920877456665 + }, + { + "entropy": 8.743827819824219, + "epoch": 0.294146727308681, + "mean_token_accuracy": 0.7288317084312439, + "num_tokens": 15492311.0, + "step": 2975, + "train/ce_loss": 0.7437622547149658 + }, + { + "epoch": 0.294146727308681, + "step": 2975, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.294146727308681, + "step": 2975, + "train/total_loss": 0.12906372547149658 + }, + { + "entropy": 9.55495834350586, + "epoch": 0.29424560015819656, + "mean_token_accuracy": 0.7990115284919739, + "num_tokens": 15497382.0, + "step": 2976, + "train/ce_loss": 0.7216197848320007 + }, + { + "epoch": 0.29424560015819656, + "step": 2976, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.29424560015819656, + "step": 2976, + "train/total_loss": 0.09169322997331619 + }, + { + "entropy": 9.320695877075195, + "epoch": 0.2943444730077121, + "mean_token_accuracy": 0.6904177069664001, + "num_tokens": 15502633.0, + "step": 2977, + "train/ce_loss": 1.0655757188796997 + }, + { + "epoch": 0.2943444730077121, + "step": 2977, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.2943444730077121, + "step": 2977, + "train/total_loss": 0.20812007784843445 + }, + { + "entropy": 9.47970199584961, + "epoch": 0.2944433458572276, + "mean_token_accuracy": 0.7394495606422424, + "num_tokens": 15507596.0, + "step": 2978, + "train/ce_loss": 4.474019533518003e-06 + }, + { + "epoch": 0.2944433458572276, + "step": 2978, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.2944433458572276, + "step": 2978, + "train/total_loss": 0.06250044703483582 + }, + { + "entropy": 9.713456153869629, + "epoch": 0.29454221870674313, + "mean_token_accuracy": 0.7357001900672913, + "num_tokens": 15512496.0, + "step": 2979, + "train/ce_loss": 1.6623332500457764 + }, + { + "epoch": 0.29454221870674313, + "step": 2979, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.29454221870674313, + "step": 2979, + "train/total_loss": 0.23654583096504211 + }, + { + "epoch": 0.29464109155625867, + "grad_norm": 0.9678965210914612, + "learning_rate": 9.265934826682491e-06, + "loss": 0.1494, + "step": 2980 + }, + { + "entropy": 9.160664558410645, + "epoch": 0.29464109155625867, + "mean_token_accuracy": 0.7824324369430542, + "num_tokens": 15517731.0, + "step": 2980, + "train/ce_loss": 0.5237296223640442 + }, + { + "epoch": 0.29464109155625867, + "step": 2980, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.29464109155625867, + "step": 2980, + "train/total_loss": 0.10706046223640442 + }, + { + "entropy": 9.939748764038086, + "epoch": 0.29473996440577416, + "mean_token_accuracy": 0.7386363744735718, + "num_tokens": 15522727.0, + "step": 2981, + "train/ce_loss": 2.172889471054077 + }, + { + "epoch": 0.29473996440577416, + "step": 2981, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.29473996440577416, + "step": 2981, + "train/total_loss": 0.24463270604610443 + }, + { + "entropy": 8.616790771484375, + "epoch": 0.2948388372552897, + "mean_token_accuracy": 0.7509191036224365, + "num_tokens": 15528327.0, + "step": 2982, + "train/ce_loss": 1.0044151544570923 + }, + { + "epoch": 0.2948388372552897, + "step": 2982, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.2948388372552897, + "step": 2982, + "train/total_loss": 0.20981651544570923 + }, + { + "entropy": 9.805368423461914, + "epoch": 0.29493771010480524, + "mean_token_accuracy": 0.680898904800415, + "num_tokens": 15533212.0, + "step": 2983, + "train/ce_loss": 1.1280209037067834e-05 + }, + { + "epoch": 0.29493771010480524, + "step": 2983, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.29493771010480524, + "step": 2983, + "train/total_loss": 0.039063628762960434 + }, + { + "entropy": 9.743290901184082, + "epoch": 0.2950365829543207, + "mean_token_accuracy": 0.7201565504074097, + "num_tokens": 15538143.0, + "step": 2984, + "train/ce_loss": 0.9904972910881042 + }, + { + "epoch": 0.2950365829543207, + "step": 2984, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.2950365829543207, + "step": 2984, + "train/total_loss": 0.14201848208904266 + }, + { + "entropy": 9.05086898803711, + "epoch": 0.29513545580383627, + "mean_token_accuracy": 0.746760904788971, + "num_tokens": 15543493.0, + "step": 2985, + "train/ce_loss": 0.6934479475021362 + }, + { + "epoch": 0.29513545580383627, + "step": 2985, + "train/sim_loss": 0.15625 + }, + { + "epoch": 0.29513545580383627, + "step": 2985, + "train/total_loss": 0.22559478878974915 + }, + { + "entropy": 9.10383415222168, + "epoch": 0.2952343286533518, + "mean_token_accuracy": 0.7514654397964478, + "num_tokens": 15548867.0, + "step": 2986, + "train/ce_loss": 1.0702656507492065 + }, + { + "epoch": 0.2952343286533518, + "step": 2986, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.2952343286533518, + "step": 2986, + "train/total_loss": 0.1773390769958496 + }, + { + "entropy": 9.471105575561523, + "epoch": 0.2953332015028673, + "mean_token_accuracy": 0.7421758770942688, + "num_tokens": 15553924.0, + "step": 2987, + "train/ce_loss": 0.9849151372909546 + }, + { + "epoch": 0.2953332015028673, + "step": 2987, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.2953332015028673, + "step": 2987, + "train/total_loss": 0.15708526968955994 + }, + { + "entropy": 9.303641319274902, + "epoch": 0.29543207435238283, + "mean_token_accuracy": 0.7553865909576416, + "num_tokens": 15559154.0, + "step": 2988, + "train/ce_loss": 1.1377649307250977 + }, + { + "epoch": 0.29543207435238283, + "step": 2988, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.29543207435238283, + "step": 2988, + "train/total_loss": 0.20362025499343872 + }, + { + "entropy": 9.135931015014648, + "epoch": 0.2955309472018984, + "mean_token_accuracy": 0.7256637215614319, + "num_tokens": 15564441.0, + "step": 2989, + "train/ce_loss": 0.7426436543464661 + }, + { + "epoch": 0.2955309472018984, + "step": 2989, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.2955309472018984, + "step": 2989, + "train/total_loss": 0.15629562735557556 + }, + { + "entropy": 9.353001594543457, + "epoch": 0.29562982005141386, + "mean_token_accuracy": 0.749576985836029, + "num_tokens": 15569484.0, + "step": 2990, + "train/ce_loss": 1.0848103761672974 + }, + { + "epoch": 0.29562982005141386, + "step": 2990, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.29562982005141386, + "step": 2990, + "train/total_loss": 0.2022310495376587 + }, + { + "entropy": 9.00560474395752, + "epoch": 0.2957286929009294, + "mean_token_accuracy": 0.7407878041267395, + "num_tokens": 15574769.0, + "step": 2991, + "train/ce_loss": 0.5350890159606934 + }, + { + "epoch": 0.2957286929009294, + "step": 2991, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.2957286929009294, + "step": 2991, + "train/total_loss": 0.11600890755653381 + }, + { + "entropy": 9.298500061035156, + "epoch": 0.29582756575044494, + "mean_token_accuracy": 0.71659916639328, + "num_tokens": 15579900.0, + "step": 2992, + "train/ce_loss": 1.0368627309799194 + }, + { + "epoch": 0.29582756575044494, + "step": 2992, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.29582756575044494, + "step": 2992, + "train/total_loss": 0.15446752309799194 + }, + { + "entropy": 10.281341552734375, + "epoch": 0.29592643859996043, + "mean_token_accuracy": 0.6860068440437317, + "num_tokens": 15584585.0, + "step": 2993, + "train/ce_loss": 1.00157221822883e-05 + }, + { + "epoch": 0.29592643859996043, + "step": 2993, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.29592643859996043, + "step": 2993, + "train/total_loss": 0.07031349837779999 + }, + { + "entropy": 8.819000244140625, + "epoch": 0.29602531144947597, + "mean_token_accuracy": 0.6941176652908325, + "num_tokens": 15589974.0, + "step": 2994, + "train/ce_loss": 0.7955414056777954 + }, + { + "epoch": 0.29602531144947597, + "step": 2994, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.29602531144947597, + "step": 2994, + "train/total_loss": 0.11861664056777954 + }, + { + "entropy": 8.97614574432373, + "epoch": 0.2961241842989915, + "mean_token_accuracy": 0.6965174078941345, + "num_tokens": 15595279.0, + "step": 2995, + "train/ce_loss": 0.32965347170829773 + }, + { + "epoch": 0.2961241842989915, + "step": 2995, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.2961241842989915, + "step": 2995, + "train/total_loss": 0.05249659717082977 + }, + { + "entropy": 9.65750503540039, + "epoch": 0.296223057148507, + "mean_token_accuracy": 0.739130437374115, + "num_tokens": 15600178.0, + "step": 2996, + "train/ce_loss": 4.405275831231847e-06 + }, + { + "epoch": 0.296223057148507, + "step": 2996, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.296223057148507, + "step": 2996, + "train/total_loss": 0.019531691446900368 + }, + { + "entropy": 9.083763122558594, + "epoch": 0.29632192999802254, + "mean_token_accuracy": 0.7827102541923523, + "num_tokens": 15605535.0, + "step": 2997, + "train/ce_loss": 0.523099958896637 + }, + { + "epoch": 0.29632192999802254, + "step": 2997, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.29632192999802254, + "step": 2997, + "train/total_loss": 0.11480999737977982 + }, + { + "entropy": 9.692913055419922, + "epoch": 0.2964208028475381, + "mean_token_accuracy": 0.7287522554397583, + "num_tokens": 15610637.0, + "step": 2998, + "train/ce_loss": 2.1075310707092285 + }, + { + "epoch": 0.2964208028475381, + "step": 2998, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.2964208028475381, + "step": 2998, + "train/total_loss": 0.34356561303138733 + }, + { + "entropy": 9.991308212280273, + "epoch": 0.29651967569705356, + "mean_token_accuracy": 0.6886075735092163, + "num_tokens": 15615399.0, + "step": 2999, + "train/ce_loss": 5.973896350042196e-06 + }, + { + "epoch": 0.29651967569705356, + "step": 2999, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.29651967569705356, + "step": 2999, + "train/total_loss": 0.06250059604644775 + }, + { + "epoch": 0.2966185485465691, + "grad_norm": 0.9837698340415955, + "learning_rate": 9.260989961924542e-06, + "loss": 0.151, + "step": 3000 + }, + { + "entropy": 8.74220085144043, + "epoch": 0.2966185485465691, + "mean_token_accuracy": 0.7542277574539185, + "num_tokens": 15620771.0, + "step": 3000, + "train/ce_loss": 1.4459656476974487 + }, + { + "epoch": 0.2966185485465691, + "step": 3000, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.2966185485465691, + "step": 3000, + "train/total_loss": 0.23053406178951263 + }, + { + "entropy": 9.76188850402832, + "epoch": 0.29671742139608465, + "mean_token_accuracy": 0.7573964595794678, + "num_tokens": 15625897.0, + "step": 3001, + "train/ce_loss": 7.043448931653984e-06 + }, + { + "epoch": 0.29671742139608465, + "step": 3001, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.29671742139608465, + "step": 3001, + "train/total_loss": 0.05859445407986641 + }, + { + "entropy": 8.959300994873047, + "epoch": 0.29681629424560013, + "mean_token_accuracy": 0.6892911195755005, + "num_tokens": 15631045.0, + "step": 3002, + "train/ce_loss": 1.4792759429838043e-05 + }, + { + "epoch": 0.29681629424560013, + "step": 3002, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.29681629424560013, + "step": 3002, + "train/total_loss": 0.04687647894024849 + }, + { + "entropy": 8.936592102050781, + "epoch": 0.2969151670951157, + "mean_token_accuracy": 0.723192036151886, + "num_tokens": 15636344.0, + "step": 3003, + "train/ce_loss": 0.9106271862983704 + }, + { + "epoch": 0.2969151670951157, + "step": 3003, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.2969151670951157, + "step": 3003, + "train/total_loss": 0.1262189745903015 + }, + { + "entropy": 9.652338027954102, + "epoch": 0.2970140399446312, + "mean_token_accuracy": 0.709193229675293, + "num_tokens": 15641357.0, + "step": 3004, + "train/ce_loss": 0.7016077041625977 + }, + { + "epoch": 0.2970140399446312, + "step": 3004, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.2970140399446312, + "step": 3004, + "train/total_loss": 0.13266077637672424 + }, + { + "entropy": 9.529338836669922, + "epoch": 0.2971129127941467, + "mean_token_accuracy": 0.7367576360702515, + "num_tokens": 15646415.0, + "step": 3005, + "train/ce_loss": 4.004227321274811e-06 + }, + { + "epoch": 0.2971129127941467, + "step": 3005, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.2971129127941467, + "step": 3005, + "train/total_loss": 0.07812540233135223 + }, + { + "entropy": 9.7982177734375, + "epoch": 0.29721178564366224, + "mean_token_accuracy": 0.6998341679573059, + "num_tokens": 15651450.0, + "step": 3006, + "train/ce_loss": 1.2795770168304443 + }, + { + "epoch": 0.29721178564366224, + "step": 3006, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.29721178564366224, + "step": 3006, + "train/total_loss": 0.17483270168304443 + }, + { + "entropy": 9.549211502075195, + "epoch": 0.2973106584931778, + "mean_token_accuracy": 0.7585585713386536, + "num_tokens": 15656415.0, + "step": 3007, + "train/ce_loss": 1.0237561464309692 + }, + { + "epoch": 0.2973106584931778, + "step": 3007, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.2973106584931778, + "step": 3007, + "train/total_loss": 0.14925062656402588 + }, + { + "entropy": 9.022342681884766, + "epoch": 0.2974095313426933, + "mean_token_accuracy": 0.7371967434883118, + "num_tokens": 15661813.0, + "step": 3008, + "train/ce_loss": 0.7531060576438904 + }, + { + "epoch": 0.2974095313426933, + "step": 3008, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.2974095313426933, + "step": 3008, + "train/total_loss": 0.184685617685318 + }, + { + "entropy": 9.182098388671875, + "epoch": 0.2975084041922088, + "mean_token_accuracy": 0.7335957884788513, + "num_tokens": 15667055.0, + "step": 3009, + "train/ce_loss": 1.872267723083496 + }, + { + "epoch": 0.2975084041922088, + "step": 3009, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.2975084041922088, + "step": 3009, + "train/total_loss": 0.2575392723083496 + }, + { + "entropy": 9.481335639953613, + "epoch": 0.29760727704172435, + "mean_token_accuracy": 0.7445141077041626, + "num_tokens": 15672150.0, + "step": 3010, + "train/ce_loss": 4.337508471508045e-06 + }, + { + "epoch": 0.29760727704172435, + "step": 3010, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.29760727704172435, + "step": 3010, + "train/total_loss": 0.05468793213367462 + }, + { + "entropy": 9.197444915771484, + "epoch": 0.2977061498912399, + "mean_token_accuracy": 0.7210718393325806, + "num_tokens": 15677443.0, + "step": 3011, + "train/ce_loss": 1.0222855806350708 + }, + { + "epoch": 0.2977061498912399, + "step": 3011, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.2977061498912399, + "step": 3011, + "train/total_loss": 0.2116035521030426 + }, + { + "entropy": 8.939278602600098, + "epoch": 0.2978050227407554, + "mean_token_accuracy": 0.7448036670684814, + "num_tokens": 15682864.0, + "step": 3012, + "train/ce_loss": 0.4343223571777344 + }, + { + "epoch": 0.2978050227407554, + "step": 3012, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.2978050227407554, + "step": 3012, + "train/total_loss": 0.13327598571777344 + }, + { + "entropy": 9.049263000488281, + "epoch": 0.2979038955902709, + "mean_token_accuracy": 0.7024901509284973, + "num_tokens": 15688110.0, + "step": 3013, + "train/ce_loss": 0.6559205651283264 + }, + { + "epoch": 0.2979038955902709, + "step": 3013, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.2979038955902709, + "step": 3013, + "train/total_loss": 0.13981080055236816 + }, + { + "entropy": 9.697021484375, + "epoch": 0.29800276843978646, + "mean_token_accuracy": 0.7936508059501648, + "num_tokens": 15692993.0, + "step": 3014, + "train/ce_loss": 7.438169177476084e-06 + }, + { + "epoch": 0.29800276843978646, + "step": 3014, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.29800276843978646, + "step": 3014, + "train/total_loss": 0.05078199505805969 + }, + { + "entropy": 9.081084251403809, + "epoch": 0.29810164128930194, + "mean_token_accuracy": 0.7256532311439514, + "num_tokens": 15698285.0, + "step": 3015, + "train/ce_loss": 1.2116591930389404 + }, + { + "epoch": 0.29810164128930194, + "step": 3015, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.29810164128930194, + "step": 3015, + "train/total_loss": 0.230540931224823 + }, + { + "entropy": 9.241430282592773, + "epoch": 0.2982005141388175, + "mean_token_accuracy": 0.7848557829856873, + "num_tokens": 15703559.0, + "step": 3016, + "train/ce_loss": 0.7737722396850586 + }, + { + "epoch": 0.2982005141388175, + "step": 3016, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.2982005141388175, + "step": 3016, + "train/total_loss": 0.14378347992897034 + }, + { + "entropy": 9.451658248901367, + "epoch": 0.298299386988333, + "mean_token_accuracy": 0.7496296167373657, + "num_tokens": 15708647.0, + "step": 3017, + "train/ce_loss": 0.33514443039894104 + }, + { + "epoch": 0.298299386988333, + "step": 3017, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.298299386988333, + "step": 3017, + "train/total_loss": 0.08038944005966187 + }, + { + "entropy": 9.318082809448242, + "epoch": 0.2983982598378485, + "mean_token_accuracy": 0.7401574850082397, + "num_tokens": 15713885.0, + "step": 3018, + "train/ce_loss": 1.3564872741699219 + }, + { + "epoch": 0.2983982598378485, + "step": 3018, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.2983982598378485, + "step": 3018, + "train/total_loss": 0.2098674774169922 + }, + { + "entropy": 8.701761245727539, + "epoch": 0.29849713268736405, + "mean_token_accuracy": 0.7411225438117981, + "num_tokens": 15719261.0, + "step": 3019, + "train/ce_loss": 1.5156086683273315 + }, + { + "epoch": 0.29849713268736405, + "step": 3019, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.29849713268736405, + "step": 3019, + "train/total_loss": 0.19843587279319763 + }, + { + "epoch": 0.2985960055368796, + "grad_norm": 0.8209056854248047, + "learning_rate": 9.256045097166592e-06, + "loss": 0.1495, + "step": 3020 + }, + { + "entropy": 8.67845344543457, + "epoch": 0.2985960055368796, + "mean_token_accuracy": 0.7316784858703613, + "num_tokens": 15724599.0, + "step": 3020, + "train/ce_loss": 0.8231831192970276 + }, + { + "epoch": 0.2985960055368796, + "step": 3020, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.2985960055368796, + "step": 3020, + "train/total_loss": 0.12919330596923828 + }, + { + "entropy": 9.154563903808594, + "epoch": 0.2986948783863951, + "mean_token_accuracy": 0.7467362880706787, + "num_tokens": 15729836.0, + "step": 3021, + "train/ce_loss": 0.7567043900489807 + }, + { + "epoch": 0.2986948783863951, + "step": 3021, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.2986948783863951, + "step": 3021, + "train/total_loss": 0.12645170092582703 + }, + { + "entropy": 8.827190399169922, + "epoch": 0.2987937512359106, + "mean_token_accuracy": 0.7154762148857117, + "num_tokens": 15735176.0, + "step": 3022, + "train/ce_loss": 0.410447359085083 + }, + { + "epoch": 0.2987937512359106, + "step": 3022, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.2987937512359106, + "step": 3022, + "train/total_loss": 0.13088849186897278 + }, + { + "entropy": 9.213386535644531, + "epoch": 0.29889262408542616, + "mean_token_accuracy": 0.7359412908554077, + "num_tokens": 15740429.0, + "step": 3023, + "train/ce_loss": 0.957610011100769 + }, + { + "epoch": 0.29889262408542616, + "step": 3023, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.29889262408542616, + "step": 3023, + "train/total_loss": 0.1738860011100769 + }, + { + "entropy": 9.915075302124023, + "epoch": 0.29899149693494165, + "mean_token_accuracy": 0.7661795616149902, + "num_tokens": 15745358.0, + "step": 3024, + "train/ce_loss": 4.308172265155008e-06 + }, + { + "epoch": 0.29899149693494165, + "step": 3024, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.29899149693494165, + "step": 3024, + "train/total_loss": 0.06640668213367462 + }, + { + "entropy": 9.263346672058105, + "epoch": 0.2990903697844572, + "mean_token_accuracy": 0.6996245384216309, + "num_tokens": 15750767.0, + "step": 3025, + "train/ce_loss": 1.3881711959838867 + }, + { + "epoch": 0.2990903697844572, + "step": 3025, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.2990903697844572, + "step": 3025, + "train/total_loss": 0.2599108815193176 + }, + { + "entropy": 9.625162124633789, + "epoch": 0.29918924263397273, + "mean_token_accuracy": 0.7014925479888916, + "num_tokens": 15755797.0, + "step": 3026, + "train/ce_loss": 6.861389920231886e-06 + }, + { + "epoch": 0.29918924263397273, + "step": 3026, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.29918924263397273, + "step": 3026, + "train/total_loss": 0.05468818545341492 + }, + { + "entropy": 9.5260648727417, + "epoch": 0.2992881154834882, + "mean_token_accuracy": 0.7708674073219299, + "num_tokens": 15761032.0, + "step": 3027, + "train/ce_loss": 0.8745700716972351 + }, + { + "epoch": 0.2992881154834882, + "step": 3027, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.2992881154834882, + "step": 3027, + "train/total_loss": 0.15386325120925903 + }, + { + "entropy": 10.308094024658203, + "epoch": 0.29938698833300376, + "mean_token_accuracy": 0.7317073345184326, + "num_tokens": 15765748.0, + "step": 3028, + "train/ce_loss": 6.836529337306274e-06 + }, + { + "epoch": 0.29938698833300376, + "step": 3028, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.29938698833300376, + "step": 3028, + "train/total_loss": 0.019531933590769768 + }, + { + "entropy": 8.922569274902344, + "epoch": 0.2994858611825193, + "mean_token_accuracy": 0.7791342735290527, + "num_tokens": 15771169.0, + "step": 3029, + "train/ce_loss": 0.5738962292671204 + }, + { + "epoch": 0.2994858611825193, + "step": 3029, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.2994858611825193, + "step": 3029, + "train/total_loss": 0.14723336696624756 + }, + { + "entropy": 10.167108535766602, + "epoch": 0.2995847340320348, + "mean_token_accuracy": 0.8157894611358643, + "num_tokens": 15775953.0, + "step": 3030, + "train/ce_loss": 7.155690582294483e-06 + }, + { + "epoch": 0.2995847340320348, + "step": 3030, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.2995847340320348, + "step": 3030, + "train/total_loss": 0.027344465255737305 + }, + { + "entropy": 9.04146957397461, + "epoch": 0.2996836068815503, + "mean_token_accuracy": 0.7317073345184326, + "num_tokens": 15781355.0, + "step": 3031, + "train/ce_loss": 0.710956335067749 + }, + { + "epoch": 0.2996836068815503, + "step": 3031, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.2996836068815503, + "step": 3031, + "train/total_loss": 0.14531439542770386 + }, + { + "entropy": 9.568553924560547, + "epoch": 0.29978247973106587, + "mean_token_accuracy": 0.8229665160179138, + "num_tokens": 15786602.0, + "step": 3032, + "train/ce_loss": 1.3454652616928797e-05 + }, + { + "epoch": 0.29978247973106587, + "step": 3032, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.29978247973106587, + "step": 3032, + "train/total_loss": 0.054688844829797745 + }, + { + "entropy": 9.415324211120605, + "epoch": 0.29988135258058135, + "mean_token_accuracy": 0.8056337833404541, + "num_tokens": 15791777.0, + "step": 3033, + "train/ce_loss": 0.6456983685493469 + }, + { + "epoch": 0.29988135258058135, + "step": 3033, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.29988135258058135, + "step": 3033, + "train/total_loss": 0.09581983834505081 + }, + { + "entropy": 10.041857719421387, + "epoch": 0.2999802254300969, + "mean_token_accuracy": 0.7089337110519409, + "num_tokens": 15796545.0, + "step": 3034, + "train/ce_loss": 1.1669424566207454e-05 + }, + { + "epoch": 0.2999802254300969, + "step": 3034, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.2999802254300969, + "step": 3034, + "train/total_loss": 0.05468866601586342 + }, + { + "entropy": 9.296567916870117, + "epoch": 0.30007909827961243, + "mean_token_accuracy": 0.6926751732826233, + "num_tokens": 15801647.0, + "step": 3035, + "train/ce_loss": 3.612391992646735e-06 + }, + { + "epoch": 0.30007909827961243, + "step": 3035, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.30007909827961243, + "step": 3035, + "train/total_loss": 0.07421910762786865 + }, + { + "entropy": 9.452152252197266, + "epoch": 0.3001779711291279, + "mean_token_accuracy": 0.7451253533363342, + "num_tokens": 15806994.0, + "step": 3036, + "train/ce_loss": 1.4053936004638672 + }, + { + "epoch": 0.3001779711291279, + "step": 3036, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.3001779711291279, + "step": 3036, + "train/total_loss": 0.22647686302661896 + }, + { + "entropy": 9.989208221435547, + "epoch": 0.30027684397864346, + "mean_token_accuracy": 0.7664670944213867, + "num_tokens": 15811913.0, + "step": 3037, + "train/ce_loss": 4.4016423998982646e-06 + }, + { + "epoch": 0.30027684397864346, + "step": 3037, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.30027684397864346, + "step": 3037, + "train/total_loss": 0.07421918958425522 + }, + { + "entropy": 9.653634071350098, + "epoch": 0.300375716828159, + "mean_token_accuracy": 0.7589454054832458, + "num_tokens": 15816888.0, + "step": 3038, + "train/ce_loss": 0.9903601408004761 + }, + { + "epoch": 0.300375716828159, + "step": 3038, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.300375716828159, + "step": 3038, + "train/total_loss": 0.16934850811958313 + }, + { + "entropy": 9.67619514465332, + "epoch": 0.3004745896776745, + "mean_token_accuracy": 0.771799623966217, + "num_tokens": 15821864.0, + "step": 3039, + "train/ce_loss": 3.5057651075476315e-06 + }, + { + "epoch": 0.3004745896776745, + "step": 3039, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.3004745896776745, + "step": 3039, + "train/total_loss": 0.015625350177288055 + }, + { + "epoch": 0.30057346252719, + "grad_norm": 0.781501829624176, + "learning_rate": 9.251100232408645e-06, + "loss": 0.143, + "step": 3040 + }, + { + "entropy": 9.31530475616455, + "epoch": 0.30057346252719, + "mean_token_accuracy": 0.7032679915428162, + "num_tokens": 15827127.0, + "step": 3040, + "train/ce_loss": 1.4572179317474365 + }, + { + "epoch": 0.30057346252719, + "step": 3040, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.30057346252719, + "step": 3040, + "train/total_loss": 0.22775304317474365 + }, + { + "entropy": 8.973918914794922, + "epoch": 0.30067233537670557, + "mean_token_accuracy": 0.7213459610939026, + "num_tokens": 15832582.0, + "step": 3041, + "train/ce_loss": 0.9301877021789551 + }, + { + "epoch": 0.30067233537670557, + "step": 3041, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.30067233537670557, + "step": 3041, + "train/total_loss": 0.1477062702178955 + }, + { + "entropy": 9.18991756439209, + "epoch": 0.30077120822622105, + "mean_token_accuracy": 0.7188329100608826, + "num_tokens": 15837739.0, + "step": 3042, + "train/ce_loss": 7.996571184776258e-06 + }, + { + "epoch": 0.30077120822622105, + "step": 3042, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.30077120822622105, + "step": 3042, + "train/total_loss": 0.06250079721212387 + }, + { + "entropy": 10.266804695129395, + "epoch": 0.3008700810757366, + "mean_token_accuracy": 0.7838827967643738, + "num_tokens": 15842382.0, + "step": 3043, + "train/ce_loss": 2.7267353534698486 + }, + { + "epoch": 0.3008700810757366, + "step": 3043, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.3008700810757366, + "step": 3043, + "train/total_loss": 0.3586110472679138 + }, + { + "entropy": 9.074047088623047, + "epoch": 0.30096895392525214, + "mean_token_accuracy": 0.7470725774765015, + "num_tokens": 15847691.0, + "step": 3044, + "train/ce_loss": 0.5398385524749756 + }, + { + "epoch": 0.30096895392525214, + "step": 3044, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.30096895392525214, + "step": 3044, + "train/total_loss": 0.10476510226726532 + }, + { + "entropy": 9.435815811157227, + "epoch": 0.3010678267747676, + "mean_token_accuracy": 0.7376788258552551, + "num_tokens": 15852747.0, + "step": 3045, + "train/ce_loss": 4.20673950429773e-06 + }, + { + "epoch": 0.3010678267747676, + "step": 3045, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.3010678267747676, + "step": 3045, + "train/total_loss": 0.039062920957803726 + }, + { + "entropy": 9.279495239257812, + "epoch": 0.30116669962428316, + "mean_token_accuracy": 0.7272727489471436, + "num_tokens": 15857797.0, + "step": 3046, + "train/ce_loss": 1.2155992984771729 + }, + { + "epoch": 0.30116669962428316, + "step": 3046, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.30116669962428316, + "step": 3046, + "train/total_loss": 0.22702868282794952 + }, + { + "entropy": 9.83897876739502, + "epoch": 0.3012655724737987, + "mean_token_accuracy": 0.7052631378173828, + "num_tokens": 15862659.0, + "step": 3047, + "train/ce_loss": 2.269005537033081 + }, + { + "epoch": 0.3012655724737987, + "step": 3047, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.3012655724737987, + "step": 3047, + "train/total_loss": 0.312838077545166 + }, + { + "entropy": 8.666114807128906, + "epoch": 0.30136444532331425, + "mean_token_accuracy": 0.7765362858772278, + "num_tokens": 15868075.0, + "step": 3048, + "train/ce_loss": 0.27126842737197876 + }, + { + "epoch": 0.30136444532331425, + "step": 3048, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.30136444532331425, + "step": 3048, + "train/total_loss": 0.05056434124708176 + }, + { + "entropy": 9.339645385742188, + "epoch": 0.30146331817282973, + "mean_token_accuracy": 0.7465648651123047, + "num_tokens": 15873217.0, + "step": 3049, + "train/ce_loss": 0.7426149249076843 + }, + { + "epoch": 0.30146331817282973, + "step": 3049, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.30146331817282973, + "step": 3049, + "train/total_loss": 0.13676148653030396 + }, + { + "entropy": 9.430669784545898, + "epoch": 0.30156219102234527, + "mean_token_accuracy": 0.7079136967658997, + "num_tokens": 15878324.0, + "step": 3050, + "train/ce_loss": 1.067453384399414 + }, + { + "epoch": 0.30156219102234527, + "step": 3050, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.30156219102234527, + "step": 3050, + "train/total_loss": 0.19658908247947693 + }, + { + "entropy": 9.262472152709961, + "epoch": 0.3016610638718608, + "mean_token_accuracy": 0.7813712954521179, + "num_tokens": 15883535.0, + "step": 3051, + "train/ce_loss": 2.010103702545166 + }, + { + "epoch": 0.3016610638718608, + "step": 3051, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.3016610638718608, + "step": 3051, + "train/total_loss": 0.3103853762149811 + }, + { + "entropy": 9.47055721282959, + "epoch": 0.3017599367213763, + "mean_token_accuracy": 0.7318435907363892, + "num_tokens": 15888691.0, + "step": 3052, + "train/ce_loss": 1.2855441570281982 + }, + { + "epoch": 0.3017599367213763, + "step": 3052, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.3017599367213763, + "step": 3052, + "train/total_loss": 0.15980441868305206 + }, + { + "entropy": 9.202533721923828, + "epoch": 0.30185880957089184, + "mean_token_accuracy": 0.774631917476654, + "num_tokens": 15893983.0, + "step": 3053, + "train/ce_loss": 0.7063089609146118 + }, + { + "epoch": 0.30185880957089184, + "step": 3053, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.30185880957089184, + "step": 3053, + "train/total_loss": 0.11750590056180954 + }, + { + "entropy": 10.076286315917969, + "epoch": 0.3019576824204074, + "mean_token_accuracy": 0.8044009804725647, + "num_tokens": 15898767.0, + "step": 3054, + "train/ce_loss": 1.245094895362854 + }, + { + "epoch": 0.3019576824204074, + "step": 3054, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.3019576824204074, + "step": 3054, + "train/total_loss": 0.19482198357582092 + }, + { + "entropy": 9.487077713012695, + "epoch": 0.30205655526992287, + "mean_token_accuracy": 0.7615384459495544, + "num_tokens": 15903852.0, + "step": 3055, + "train/ce_loss": 0.783889651298523 + }, + { + "epoch": 0.30205655526992287, + "step": 3055, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.30205655526992287, + "step": 3055, + "train/total_loss": 0.10182646661996841 + }, + { + "entropy": 8.927887916564941, + "epoch": 0.3021554281194384, + "mean_token_accuracy": 0.6800433993339539, + "num_tokens": 15909207.0, + "step": 3056, + "train/ce_loss": 1.4836653470993042 + }, + { + "epoch": 0.3021554281194384, + "step": 3056, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.3021554281194384, + "step": 3056, + "train/total_loss": 0.2811790406703949 + }, + { + "entropy": 8.685641288757324, + "epoch": 0.30225430096895395, + "mean_token_accuracy": 0.7129999995231628, + "num_tokens": 15914643.0, + "step": 3057, + "train/ce_loss": 0.6042128801345825 + }, + { + "epoch": 0.30225430096895395, + "step": 3057, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.30225430096895395, + "step": 3057, + "train/total_loss": 0.13854628801345825 + }, + { + "entropy": 9.32431411743164, + "epoch": 0.30235317381846943, + "mean_token_accuracy": 0.7949852347373962, + "num_tokens": 15919782.0, + "step": 3058, + "train/ce_loss": 1.0000278949737549 + }, + { + "epoch": 0.30235317381846943, + "step": 3058, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.30235317381846943, + "step": 3058, + "train/total_loss": 0.15469029545783997 + }, + { + "entropy": 9.148672103881836, + "epoch": 0.302452046667985, + "mean_token_accuracy": 0.7779204249382019, + "num_tokens": 15925208.0, + "step": 3059, + "train/ce_loss": 0.49779924750328064 + }, + { + "epoch": 0.302452046667985, + "step": 3059, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.302452046667985, + "step": 3059, + "train/total_loss": 0.16306117177009583 + }, + { + "epoch": 0.3025509195175005, + "grad_norm": 0.8780242204666138, + "learning_rate": 9.246155367650695e-06, + "loss": 0.1425, + "step": 3060 + }, + { + "entropy": 8.764741897583008, + "epoch": 0.3025509195175005, + "mean_token_accuracy": 0.6839577555656433, + "num_tokens": 15930688.0, + "step": 3060, + "train/ce_loss": 1.202653408050537 + }, + { + "epoch": 0.3025509195175005, + "step": 3060, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.3025509195175005, + "step": 3060, + "train/total_loss": 0.19057783484458923 + }, + { + "entropy": 9.180252075195312, + "epoch": 0.302649792367016, + "mean_token_accuracy": 0.7259615659713745, + "num_tokens": 15935943.0, + "step": 3061, + "train/ce_loss": 0.8565216064453125 + }, + { + "epoch": 0.302649792367016, + "step": 3061, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.302649792367016, + "step": 3061, + "train/total_loss": 0.1286209225654602 + }, + { + "entropy": 8.950722694396973, + "epoch": 0.30274866521653154, + "mean_token_accuracy": 0.6937377452850342, + "num_tokens": 15941391.0, + "step": 3062, + "train/ce_loss": 1.1386116743087769 + }, + { + "epoch": 0.30274866521653154, + "step": 3062, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.30274866521653154, + "step": 3062, + "train/total_loss": 0.16073617339134216 + }, + { + "entropy": 9.415882110595703, + "epoch": 0.3028475380660471, + "mean_token_accuracy": 0.763271152973175, + "num_tokens": 15946498.0, + "step": 3063, + "train/ce_loss": 5.109886387799634e-06 + }, + { + "epoch": 0.3028475380660471, + "step": 3063, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.3028475380660471, + "step": 3063, + "train/total_loss": 0.02734426036477089 + }, + { + "entropy": 9.254497528076172, + "epoch": 0.30294641091556257, + "mean_token_accuracy": 0.7294469475746155, + "num_tokens": 15951646.0, + "step": 3064, + "train/ce_loss": 0.635911762714386 + }, + { + "epoch": 0.30294641091556257, + "step": 3064, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.30294641091556257, + "step": 3064, + "train/total_loss": 0.16124743223190308 + }, + { + "entropy": 10.180620193481445, + "epoch": 0.3030452837650781, + "mean_token_accuracy": 0.7554945349693298, + "num_tokens": 15956420.0, + "step": 3065, + "train/ce_loss": 1.0206608772277832 + }, + { + "epoch": 0.3030452837650781, + "step": 3065, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.3030452837650781, + "step": 3065, + "train/total_loss": 0.20362859964370728 + }, + { + "entropy": 9.169899940490723, + "epoch": 0.30314415661459365, + "mean_token_accuracy": 0.707317054271698, + "num_tokens": 15961613.0, + "step": 3066, + "train/ce_loss": 1.2100774049758911 + }, + { + "epoch": 0.30314415661459365, + "step": 3066, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.30314415661459365, + "step": 3066, + "train/total_loss": 0.1796014904975891 + }, + { + "entropy": 9.298154830932617, + "epoch": 0.30324302946410914, + "mean_token_accuracy": 0.7533632516860962, + "num_tokens": 15966738.0, + "step": 3067, + "train/ce_loss": 0.850917637348175 + }, + { + "epoch": 0.30324302946410914, + "step": 3067, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.30324302946410914, + "step": 3067, + "train/total_loss": 0.19056051969528198 + }, + { + "entropy": 9.30412483215332, + "epoch": 0.3033419023136247, + "mean_token_accuracy": 0.7869565486907959, + "num_tokens": 15971864.0, + "step": 3068, + "train/ce_loss": 0.6976792216300964 + }, + { + "epoch": 0.3033419023136247, + "step": 3068, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.3033419023136247, + "step": 3068, + "train/total_loss": 0.08539292216300964 + }, + { + "entropy": 9.682281494140625, + "epoch": 0.3034407751631402, + "mean_token_accuracy": 0.7980952262878418, + "num_tokens": 15976825.0, + "step": 3069, + "train/ce_loss": 0.8077563643455505 + }, + { + "epoch": 0.3034407751631402, + "step": 3069, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.3034407751631402, + "step": 3069, + "train/total_loss": 0.158900648355484 + }, + { + "entropy": 9.056129455566406, + "epoch": 0.3035396480126557, + "mean_token_accuracy": 0.800000011920929, + "num_tokens": 15981974.0, + "step": 3070, + "train/ce_loss": 0.5365228056907654 + }, + { + "epoch": 0.3035396480126557, + "step": 3070, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.3035396480126557, + "step": 3070, + "train/total_loss": 0.12396478652954102 + }, + { + "entropy": 8.618220329284668, + "epoch": 0.30363852086217125, + "mean_token_accuracy": 0.7547547817230225, + "num_tokens": 15987522.0, + "step": 3071, + "train/ce_loss": 0.6690239906311035 + }, + { + "epoch": 0.30363852086217125, + "step": 3071, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.30363852086217125, + "step": 3071, + "train/total_loss": 0.14893364906311035 + }, + { + "entropy": 9.58163833618164, + "epoch": 0.3037373937116868, + "mean_token_accuracy": 0.7361563444137573, + "num_tokens": 15992550.0, + "step": 3072, + "train/ce_loss": 2.2256726879277267e-06 + }, + { + "epoch": 0.3037373937116868, + "step": 3072, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.3037373937116868, + "step": 3072, + "train/total_loss": 0.01953147165477276 + }, + { + "entropy": 9.08739185333252, + "epoch": 0.3038362665612023, + "mean_token_accuracy": 0.7316455841064453, + "num_tokens": 15997828.0, + "step": 3073, + "train/ce_loss": 1.5055323839187622 + }, + { + "epoch": 0.3038362665612023, + "step": 3073, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.3038362665612023, + "step": 3073, + "train/total_loss": 0.22867824137210846 + }, + { + "entropy": 8.78681755065918, + "epoch": 0.3039351394107178, + "mean_token_accuracy": 0.7214206457138062, + "num_tokens": 16003209.0, + "step": 3074, + "train/ce_loss": 1.348905086517334 + }, + { + "epoch": 0.3039351394107178, + "step": 3074, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.3039351394107178, + "step": 3074, + "train/total_loss": 0.23645301163196564 + }, + { + "entropy": 9.146982192993164, + "epoch": 0.30403401226023336, + "mean_token_accuracy": 0.7832258343696594, + "num_tokens": 16008418.0, + "step": 3075, + "train/ce_loss": 0.4269832372665405 + }, + { + "epoch": 0.30403401226023336, + "step": 3075, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.30403401226023336, + "step": 3075, + "train/total_loss": 0.10129207372665405 + }, + { + "entropy": 9.387057304382324, + "epoch": 0.30413288510974884, + "mean_token_accuracy": 0.728314220905304, + "num_tokens": 16013486.0, + "step": 3076, + "train/ce_loss": 0.9136193990707397 + }, + { + "epoch": 0.30413288510974884, + "step": 3076, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.30413288510974884, + "step": 3076, + "train/total_loss": 0.14214318990707397 + }, + { + "entropy": 9.939265251159668, + "epoch": 0.3042317579592644, + "mean_token_accuracy": 0.6820276379585266, + "num_tokens": 16018300.0, + "step": 3077, + "train/ce_loss": 3.037581443786621 + }, + { + "epoch": 0.3042317579592644, + "step": 3077, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.3042317579592644, + "step": 3077, + "train/total_loss": 0.4014143943786621 + }, + { + "entropy": 9.897211074829102, + "epoch": 0.3043306308087799, + "mean_token_accuracy": 0.7265822887420654, + "num_tokens": 16023133.0, + "step": 3078, + "train/ce_loss": 1.5921827554702759 + }, + { + "epoch": 0.3043306308087799, + "step": 3078, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.3043306308087799, + "step": 3078, + "train/total_loss": 0.28031203150749207 + }, + { + "entropy": 9.130783081054688, + "epoch": 0.3044295036582954, + "mean_token_accuracy": 0.6957638263702393, + "num_tokens": 16028389.0, + "step": 3079, + "train/ce_loss": 0.8683300614356995 + }, + { + "epoch": 0.3044295036582954, + "step": 3079, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.3044295036582954, + "step": 3079, + "train/total_loss": 0.11417675763368607 + }, + { + "epoch": 0.30452837650781095, + "grad_norm": 0.9272714853286743, + "learning_rate": 9.241210502892747e-06, + "loss": 0.1476, + "step": 3080 + }, + { + "entropy": 9.466758728027344, + "epoch": 0.30452837650781095, + "mean_token_accuracy": 0.7138211131095886, + "num_tokens": 16033449.0, + "step": 3080, + "train/ce_loss": 3.2655682389304275e-06 + }, + { + "epoch": 0.30452837650781095, + "step": 3080, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.30452837650781095, + "step": 3080, + "train/total_loss": 0.054687827825546265 + }, + { + "entropy": 8.598503112792969, + "epoch": 0.3046272493573265, + "mean_token_accuracy": 0.6991720199584961, + "num_tokens": 16039052.0, + "step": 3081, + "train/ce_loss": 0.7666484713554382 + }, + { + "epoch": 0.3046272493573265, + "step": 3081, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3046272493573265, + "step": 3081, + "train/total_loss": 0.13135235011577606 + }, + { + "entropy": 9.395477294921875, + "epoch": 0.304726122206842, + "mean_token_accuracy": 0.730434775352478, + "num_tokens": 16044176.0, + "step": 3082, + "train/ce_loss": 0.900779128074646 + }, + { + "epoch": 0.304726122206842, + "step": 3082, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.304726122206842, + "step": 3082, + "train/total_loss": 0.12914040684700012 + }, + { + "entropy": 9.107585906982422, + "epoch": 0.3048249950563575, + "mean_token_accuracy": 0.7666666507720947, + "num_tokens": 16049500.0, + "step": 3083, + "train/ce_loss": 0.46048715710639954 + }, + { + "epoch": 0.3048249950563575, + "step": 3083, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.3048249950563575, + "step": 3083, + "train/total_loss": 0.10464246571063995 + }, + { + "entropy": 9.299919128417969, + "epoch": 0.30492386790587306, + "mean_token_accuracy": 0.7155612111091614, + "num_tokens": 16054733.0, + "step": 3084, + "train/ce_loss": 1.1572389602661133 + }, + { + "epoch": 0.30492386790587306, + "step": 3084, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.30492386790587306, + "step": 3084, + "train/total_loss": 0.16650515794754028 + }, + { + "entropy": 9.56911849975586, + "epoch": 0.30502274075538854, + "mean_token_accuracy": 0.7896440029144287, + "num_tokens": 16059787.0, + "step": 3085, + "train/ce_loss": 8.066250302363187e-05 + }, + { + "epoch": 0.30502274075538854, + "step": 3085, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.30502274075538854, + "step": 3085, + "train/total_loss": 0.03516431525349617 + }, + { + "entropy": 9.23663330078125, + "epoch": 0.3051216136049041, + "mean_token_accuracy": 0.699312686920166, + "num_tokens": 16064835.0, + "step": 3086, + "train/ce_loss": 1.2652978897094727 + }, + { + "epoch": 0.3051216136049041, + "step": 3086, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3051216136049041, + "step": 3086, + "train/total_loss": 0.18121729791164398 + }, + { + "entropy": 9.628506660461426, + "epoch": 0.3052204864544196, + "mean_token_accuracy": 0.7040650248527527, + "num_tokens": 16069860.0, + "step": 3087, + "train/ce_loss": 0.7562285661697388 + }, + { + "epoch": 0.3052204864544196, + "step": 3087, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.3052204864544196, + "step": 3087, + "train/total_loss": 0.13812285661697388 + }, + { + "entropy": 9.595111846923828, + "epoch": 0.3053193593039351, + "mean_token_accuracy": 0.7810107469558716, + "num_tokens": 16074935.0, + "step": 3088, + "train/ce_loss": 0.593614935874939 + }, + { + "epoch": 0.3053193593039351, + "step": 3088, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.3053193593039351, + "step": 3088, + "train/total_loss": 0.07498649507761002 + }, + { + "entropy": 8.711251258850098, + "epoch": 0.30541823215345065, + "mean_token_accuracy": 0.7074527144432068, + "num_tokens": 16080314.0, + "step": 3089, + "train/ce_loss": 1.2984533309936523 + }, + { + "epoch": 0.30541823215345065, + "step": 3089, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.30541823215345065, + "step": 3089, + "train/total_loss": 0.22750158607959747 + }, + { + "entropy": 9.0078763961792, + "epoch": 0.3055171050029662, + "mean_token_accuracy": 0.7614269852638245, + "num_tokens": 16085713.0, + "step": 3090, + "train/ce_loss": 1.0311739444732666 + }, + { + "epoch": 0.3055171050029662, + "step": 3090, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.3055171050029662, + "step": 3090, + "train/total_loss": 0.17342990636825562 + }, + { + "entropy": 8.846348762512207, + "epoch": 0.30561597785248173, + "mean_token_accuracy": 0.7243852615356445, + "num_tokens": 16091151.0, + "step": 3091, + "train/ce_loss": 0.6214485168457031 + }, + { + "epoch": 0.30561597785248173, + "step": 3091, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.30561597785248173, + "step": 3091, + "train/total_loss": 0.09730110317468643 + }, + { + "entropy": 9.06347942352295, + "epoch": 0.3057148507019972, + "mean_token_accuracy": 0.667037844657898, + "num_tokens": 16096528.0, + "step": 3092, + "train/ce_loss": 0.662187933921814 + }, + { + "epoch": 0.3057148507019972, + "step": 3092, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.3057148507019972, + "step": 3092, + "train/total_loss": 0.1599687933921814 + }, + { + "entropy": 9.287656784057617, + "epoch": 0.30581372355151276, + "mean_token_accuracy": 0.7780612111091614, + "num_tokens": 16101772.0, + "step": 3093, + "train/ce_loss": 0.5878052711486816 + }, + { + "epoch": 0.30581372355151276, + "step": 3093, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.30581372355151276, + "step": 3093, + "train/total_loss": 0.08612427860498428 + }, + { + "entropy": 10.067656517028809, + "epoch": 0.3059125964010283, + "mean_token_accuracy": 0.6492146849632263, + "num_tokens": 16106532.0, + "step": 3094, + "train/ce_loss": 2.511756181716919 + }, + { + "epoch": 0.3059125964010283, + "step": 3094, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.3059125964010283, + "step": 3094, + "train/total_loss": 0.3605506122112274 + }, + { + "entropy": 8.915959358215332, + "epoch": 0.3060114692505438, + "mean_token_accuracy": 0.7392290234565735, + "num_tokens": 16111942.0, + "step": 3095, + "train/ce_loss": 0.5130492448806763 + }, + { + "epoch": 0.3060114692505438, + "step": 3095, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3060114692505438, + "step": 3095, + "train/total_loss": 0.10599242150783539 + }, + { + "entropy": 9.439672470092773, + "epoch": 0.30611034210005933, + "mean_token_accuracy": 0.7339622378349304, + "num_tokens": 16116897.0, + "step": 3096, + "train/ce_loss": 4.731972694571596e-06 + }, + { + "epoch": 0.30611034210005933, + "step": 3096, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.30611034210005933, + "step": 3096, + "train/total_loss": 0.027344223111867905 + }, + { + "entropy": 9.428709983825684, + "epoch": 0.30620921494957487, + "mean_token_accuracy": 0.7720706462860107, + "num_tokens": 16121998.0, + "step": 3097, + "train/ce_loss": 0.6409549117088318 + }, + { + "epoch": 0.30620921494957487, + "step": 3097, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.30620921494957487, + "step": 3097, + "train/total_loss": 0.08753298968076706 + }, + { + "entropy": 8.80047607421875, + "epoch": 0.30630808779909036, + "mean_token_accuracy": 0.6861878633499146, + "num_tokens": 16127368.0, + "step": 3098, + "train/ce_loss": 1.2583460807800293 + }, + { + "epoch": 0.30630808779909036, + "step": 3098, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.30630808779909036, + "step": 3098, + "train/total_loss": 0.1648971140384674 + }, + { + "entropy": 9.62500286102295, + "epoch": 0.3064069606486059, + "mean_token_accuracy": 0.6607999801635742, + "num_tokens": 16132408.0, + "step": 3099, + "train/ce_loss": 1.3447495698928833 + }, + { + "epoch": 0.3064069606486059, + "step": 3099, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.3064069606486059, + "step": 3099, + "train/total_loss": 0.2008812129497528 + }, + { + "epoch": 0.30650583349812144, + "grad_norm": 0.9475986957550049, + "learning_rate": 9.236265638134798e-06, + "loss": 0.1548, + "step": 3100 + }, + { + "entropy": 8.950824737548828, + "epoch": 0.30650583349812144, + "mean_token_accuracy": 0.7482100129127502, + "num_tokens": 16137687.0, + "step": 3100, + "train/ce_loss": 0.47615012526512146 + }, + { + "epoch": 0.30650583349812144, + "step": 3100, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.30650583349812144, + "step": 3100, + "train/total_loss": 0.11792751401662827 + }, + { + "entropy": 9.56070327758789, + "epoch": 0.3066047063476369, + "mean_token_accuracy": 0.7605633735656738, + "num_tokens": 16142761.0, + "step": 3101, + "train/ce_loss": 3.0056301056902157e-06 + }, + { + "epoch": 0.3066047063476369, + "step": 3101, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.3066047063476369, + "step": 3101, + "train/total_loss": 0.023437799885869026 + }, + { + "entropy": 8.973735809326172, + "epoch": 0.30670357919715246, + "mean_token_accuracy": 0.7640320658683777, + "num_tokens": 16148088.0, + "step": 3102, + "train/ce_loss": 0.608039915561676 + }, + { + "epoch": 0.30670357919715246, + "step": 3102, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.30670357919715246, + "step": 3102, + "train/total_loss": 0.09596024453639984 + }, + { + "entropy": 9.528715133666992, + "epoch": 0.306802452046668, + "mean_token_accuracy": 0.7862714529037476, + "num_tokens": 16153141.0, + "step": 3103, + "train/ce_loss": 0.6295854449272156 + }, + { + "epoch": 0.306802452046668, + "step": 3103, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.306802452046668, + "step": 3103, + "train/total_loss": 0.16061478853225708 + }, + { + "entropy": 9.321884155273438, + "epoch": 0.3069013248961835, + "mean_token_accuracy": 0.8230769038200378, + "num_tokens": 16158235.0, + "step": 3104, + "train/ce_loss": 0.7484152317047119 + }, + { + "epoch": 0.3069013248961835, + "step": 3104, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.3069013248961835, + "step": 3104, + "train/total_loss": 0.10609152168035507 + }, + { + "entropy": 9.431598663330078, + "epoch": 0.30700019774569903, + "mean_token_accuracy": 0.7129032015800476, + "num_tokens": 16163312.0, + "step": 3105, + "train/ce_loss": 0.6860657930374146 + }, + { + "epoch": 0.30700019774569903, + "step": 3105, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.30700019774569903, + "step": 3105, + "train/total_loss": 0.11548157781362534 + }, + { + "entropy": 8.806082725524902, + "epoch": 0.3070990705952146, + "mean_token_accuracy": 0.7071651220321655, + "num_tokens": 16168751.0, + "step": 3106, + "train/ce_loss": 1.0154913663864136 + }, + { + "epoch": 0.3070990705952146, + "step": 3106, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.3070990705952146, + "step": 3106, + "train/total_loss": 0.1601428985595703 + }, + { + "entropy": 9.546571731567383, + "epoch": 0.30719794344473006, + "mean_token_accuracy": 0.7418181896209717, + "num_tokens": 16173742.0, + "step": 3107, + "train/ce_loss": 0.7134097218513489 + }, + { + "epoch": 0.30719794344473006, + "step": 3107, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.30719794344473006, + "step": 3107, + "train/total_loss": 0.14555972814559937 + }, + { + "entropy": 9.867581367492676, + "epoch": 0.3072968162942456, + "mean_token_accuracy": 0.7926267385482788, + "num_tokens": 16178597.0, + "step": 3108, + "train/ce_loss": 6.592382760572946e-06 + }, + { + "epoch": 0.3072968162942456, + "step": 3108, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.3072968162942456, + "step": 3108, + "train/total_loss": 0.03125065937638283 + }, + { + "entropy": 9.264135360717773, + "epoch": 0.30739568914376114, + "mean_token_accuracy": 0.6494565010070801, + "num_tokens": 16183800.0, + "step": 3109, + "train/ce_loss": 1.8196673393249512 + }, + { + "epoch": 0.30739568914376114, + "step": 3109, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.30739568914376114, + "step": 3109, + "train/total_loss": 0.29915422201156616 + }, + { + "entropy": 9.16361141204834, + "epoch": 0.3074945619932766, + "mean_token_accuracy": 0.6535341739654541, + "num_tokens": 16189111.0, + "step": 3110, + "train/ce_loss": 1.6297848224639893 + }, + { + "epoch": 0.3074945619932766, + "step": 3110, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.3074945619932766, + "step": 3110, + "train/total_loss": 0.23329098522663116 + }, + { + "entropy": 8.848861694335938, + "epoch": 0.30759343484279217, + "mean_token_accuracy": 0.7415611743927002, + "num_tokens": 16194602.0, + "step": 3111, + "train/ce_loss": 1.1059902906417847 + }, + { + "epoch": 0.30759343484279217, + "step": 3111, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.30759343484279217, + "step": 3111, + "train/total_loss": 0.20044279098510742 + }, + { + "entropy": 9.377516746520996, + "epoch": 0.3076923076923077, + "mean_token_accuracy": 0.7333333492279053, + "num_tokens": 16199684.0, + "step": 3112, + "train/ce_loss": 0.9643053412437439 + }, + { + "epoch": 0.3076923076923077, + "step": 3112, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.3076923076923077, + "step": 3112, + "train/total_loss": 0.22143054008483887 + }, + { + "entropy": 9.034387588500977, + "epoch": 0.3077911805418232, + "mean_token_accuracy": 0.759036123752594, + "num_tokens": 16204989.0, + "step": 3113, + "train/ce_loss": 1.4106206893920898 + }, + { + "epoch": 0.3077911805418232, + "step": 3113, + "train/sim_loss": 0.1640625 + }, + { + "epoch": 0.3077911805418232, + "step": 3113, + "train/total_loss": 0.30512458086013794 + }, + { + "entropy": 9.227992057800293, + "epoch": 0.30789005339133874, + "mean_token_accuracy": 0.7476635575294495, + "num_tokens": 16210224.0, + "step": 3114, + "train/ce_loss": 1.1176414489746094 + }, + { + "epoch": 0.30789005339133874, + "step": 3114, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.30789005339133874, + "step": 3114, + "train/total_loss": 0.16645164787769318 + }, + { + "entropy": 9.282114028930664, + "epoch": 0.3079889262408543, + "mean_token_accuracy": 0.7710674405097961, + "num_tokens": 16215366.0, + "step": 3115, + "train/ce_loss": 0.7169881463050842 + }, + { + "epoch": 0.3079889262408543, + "step": 3115, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.3079889262408543, + "step": 3115, + "train/total_loss": 0.15373006463050842 + }, + { + "entropy": 8.980093002319336, + "epoch": 0.30808779909036976, + "mean_token_accuracy": 0.6628895401954651, + "num_tokens": 16220547.0, + "step": 3116, + "train/ce_loss": 1.4373340606689453 + }, + { + "epoch": 0.30808779909036976, + "step": 3116, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.30808779909036976, + "step": 3116, + "train/total_loss": 0.257014662027359 + }, + { + "entropy": 9.515814781188965, + "epoch": 0.3081866719398853, + "mean_token_accuracy": 0.7389240264892578, + "num_tokens": 16225621.0, + "step": 3117, + "train/ce_loss": 3.717155323101906e-06 + }, + { + "epoch": 0.3081866719398853, + "step": 3117, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.3081866719398853, + "step": 3117, + "train/total_loss": 0.12500037252902985 + }, + { + "entropy": 9.999287605285645, + "epoch": 0.30828554478940084, + "mean_token_accuracy": 0.8034397959709167, + "num_tokens": 16230406.0, + "step": 3118, + "train/ce_loss": 3.893018401868176e-06 + }, + { + "epoch": 0.30828554478940084, + "step": 3118, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.30828554478940084, + "step": 3118, + "train/total_loss": 0.04296914115548134 + }, + { + "entropy": 8.90539836883545, + "epoch": 0.30838441763891633, + "mean_token_accuracy": 0.7177508473396301, + "num_tokens": 16235807.0, + "step": 3119, + "train/ce_loss": 0.8543577194213867 + }, + { + "epoch": 0.30838441763891633, + "step": 3119, + "train/sim_loss": 0.1640625 + }, + { + "epoch": 0.30838441763891633, + "step": 3119, + "train/total_loss": 0.24949827790260315 + }, + { + "epoch": 0.30848329048843187, + "grad_norm": 0.9575387239456177, + "learning_rate": 9.231320773376848e-06, + "loss": 0.1514, + "step": 3120 + }, + { + "entropy": 8.715691566467285, + "epoch": 0.30848329048843187, + "mean_token_accuracy": 0.7260406613349915, + "num_tokens": 16241355.0, + "step": 3120, + "train/ce_loss": 0.4119970500469208 + }, + { + "epoch": 0.30848329048843187, + "step": 3120, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.30848329048843187, + "step": 3120, + "train/total_loss": 0.0568247064948082 + }, + { + "entropy": 9.690220832824707, + "epoch": 0.3085821633379474, + "mean_token_accuracy": 0.6921606063842773, + "num_tokens": 16246318.0, + "step": 3121, + "train/ce_loss": 1.8799567222595215 + }, + { + "epoch": 0.3085821633379474, + "step": 3121, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.3085821633379474, + "step": 3121, + "train/total_loss": 0.23877692222595215 + }, + { + "entropy": 9.361258506774902, + "epoch": 0.3086810361874629, + "mean_token_accuracy": 0.8055987358093262, + "num_tokens": 16251400.0, + "step": 3122, + "train/ce_loss": 0.8268361687660217 + }, + { + "epoch": 0.3086810361874629, + "step": 3122, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.3086810361874629, + "step": 3122, + "train/total_loss": 0.12565237283706665 + }, + { + "entropy": 9.38145637512207, + "epoch": 0.30877990903697844, + "mean_token_accuracy": 0.6562905311584473, + "num_tokens": 16256645.0, + "step": 3123, + "train/ce_loss": 2.5330162048339844 + }, + { + "epoch": 0.30877990903697844, + "step": 3123, + "train/sim_loss": 0.15234375 + }, + { + "epoch": 0.30877990903697844, + "step": 3123, + "train/total_loss": 0.40564537048339844 + }, + { + "entropy": 9.34083366394043, + "epoch": 0.308878781886494, + "mean_token_accuracy": 0.7066051959991455, + "num_tokens": 16261805.0, + "step": 3124, + "train/ce_loss": 1.233890414237976 + }, + { + "epoch": 0.308878781886494, + "step": 3124, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.308878781886494, + "step": 3124, + "train/total_loss": 0.22104528546333313 + }, + { + "entropy": 9.12912368774414, + "epoch": 0.30897765473600947, + "mean_token_accuracy": 0.7390244007110596, + "num_tokens": 16267104.0, + "step": 3125, + "train/ce_loss": 0.8176206946372986 + }, + { + "epoch": 0.30897765473600947, + "step": 3125, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.30897765473600947, + "step": 3125, + "train/total_loss": 0.14426207542419434 + }, + { + "entropy": 9.835182189941406, + "epoch": 0.309076527585525, + "mean_token_accuracy": 0.7484909296035767, + "num_tokens": 16272039.0, + "step": 3126, + "train/ce_loss": 8.67279049998615e-06 + }, + { + "epoch": 0.309076527585525, + "step": 3126, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.309076527585525, + "step": 3126, + "train/total_loss": 0.03125086799263954 + }, + { + "entropy": 10.046555519104004, + "epoch": 0.30917540043504055, + "mean_token_accuracy": 0.7424242496490479, + "num_tokens": 16276819.0, + "step": 3127, + "train/ce_loss": 1.4793922901153564 + }, + { + "epoch": 0.30917540043504055, + "step": 3127, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.30917540043504055, + "step": 3127, + "train/total_loss": 0.22606423497200012 + }, + { + "entropy": 8.99412727355957, + "epoch": 0.30927427328455603, + "mean_token_accuracy": 0.6790606379508972, + "num_tokens": 16282326.0, + "step": 3128, + "train/ce_loss": 0.9208659529685974 + }, + { + "epoch": 0.30927427328455603, + "step": 3128, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.30927427328455603, + "step": 3128, + "train/total_loss": 0.18583659827709198 + }, + { + "entropy": 10.206575393676758, + "epoch": 0.3093731461340716, + "mean_token_accuracy": 0.75314861536026, + "num_tokens": 16287075.0, + "step": 3129, + "train/ce_loss": 1.8498486280441284 + }, + { + "epoch": 0.3093731461340716, + "step": 3129, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.3093731461340716, + "step": 3129, + "train/total_loss": 0.27482861280441284 + }, + { + "entropy": 9.601837158203125, + "epoch": 0.3094720189835871, + "mean_token_accuracy": 0.6882882714271545, + "num_tokens": 16292083.0, + "step": 3130, + "train/ce_loss": 3.6322981031844392e-06 + }, + { + "epoch": 0.3094720189835871, + "step": 3130, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.3094720189835871, + "step": 3130, + "train/total_loss": 0.0273441132158041 + }, + { + "entropy": 9.262863159179688, + "epoch": 0.30957089183310266, + "mean_token_accuracy": 0.7254672646522522, + "num_tokens": 16297395.0, + "step": 3131, + "train/ce_loss": 0.8118441104888916 + }, + { + "epoch": 0.30957089183310266, + "step": 3131, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.30957089183310266, + "step": 3131, + "train/total_loss": 0.13587191700935364 + }, + { + "entropy": 8.937446594238281, + "epoch": 0.30966976468261814, + "mean_token_accuracy": 0.7626903653144836, + "num_tokens": 16302727.0, + "step": 3132, + "train/ce_loss": 0.764589786529541 + }, + { + "epoch": 0.30966976468261814, + "step": 3132, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.30966976468261814, + "step": 3132, + "train/total_loss": 0.13505274057388306 + }, + { + "entropy": 9.000936508178711, + "epoch": 0.3097686375321337, + "mean_token_accuracy": 0.707196056842804, + "num_tokens": 16308051.0, + "step": 3133, + "train/ce_loss": 1.7300879955291748 + }, + { + "epoch": 0.3097686375321337, + "step": 3133, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.3097686375321337, + "step": 3133, + "train/total_loss": 0.24722754955291748 + }, + { + "entropy": 9.17640209197998, + "epoch": 0.3098675103816492, + "mean_token_accuracy": 0.7517814636230469, + "num_tokens": 16313356.0, + "step": 3134, + "train/ce_loss": 0.6477929353713989 + }, + { + "epoch": 0.3098675103816492, + "step": 3134, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.3098675103816492, + "step": 3134, + "train/total_loss": 0.09212304651737213 + }, + { + "entropy": 8.919665336608887, + "epoch": 0.3099663832311647, + "mean_token_accuracy": 0.7678795456886292, + "num_tokens": 16318630.0, + "step": 3135, + "train/ce_loss": 0.49486202001571655 + }, + { + "epoch": 0.3099663832311647, + "step": 3135, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3099663832311647, + "step": 3135, + "train/total_loss": 0.0963612049818039 + }, + { + "entropy": 9.14445686340332, + "epoch": 0.31006525608068025, + "mean_token_accuracy": 0.6593785881996155, + "num_tokens": 16323970.0, + "step": 3136, + "train/ce_loss": 1.3321263790130615 + }, + { + "epoch": 0.31006525608068025, + "step": 3136, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.31006525608068025, + "step": 3136, + "train/total_loss": 0.2113376408815384 + }, + { + "entropy": 8.572418212890625, + "epoch": 0.3101641289301958, + "mean_token_accuracy": 0.755156934261322, + "num_tokens": 16329557.0, + "step": 3137, + "train/ce_loss": 0.6957716345787048 + }, + { + "epoch": 0.3101641289301958, + "step": 3137, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.3101641289301958, + "step": 3137, + "train/total_loss": 0.0891084149479866 + }, + { + "entropy": 10.049623489379883, + "epoch": 0.3102630017797113, + "mean_token_accuracy": 0.7021276354789734, + "num_tokens": 16334419.0, + "step": 3138, + "train/ce_loss": 2.307546377182007 + }, + { + "epoch": 0.3102630017797113, + "step": 3138, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.3102630017797113, + "step": 3138, + "train/total_loss": 0.29325464367866516 + }, + { + "entropy": 9.033727645874023, + "epoch": 0.3103618746292268, + "mean_token_accuracy": 0.7027027010917664, + "num_tokens": 16339795.0, + "step": 3139, + "train/ce_loss": 0.7809948921203613 + }, + { + "epoch": 0.3103618746292268, + "step": 3139, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.3103618746292268, + "step": 3139, + "train/total_loss": 0.14059948921203613 + }, + { + "epoch": 0.31046074747874236, + "grad_norm": 0.7521636486053467, + "learning_rate": 9.2263759086189e-06, + "loss": 0.1506, + "step": 3140 + }, + { + "entropy": 9.636505126953125, + "epoch": 0.31046074747874236, + "mean_token_accuracy": 0.7461240291595459, + "num_tokens": 16344756.0, + "step": 3140, + "train/ce_loss": 0.6385220289230347 + }, + { + "epoch": 0.31046074747874236, + "step": 3140, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.31046074747874236, + "step": 3140, + "train/total_loss": 0.1693209558725357 + }, + { + "entropy": 9.397228240966797, + "epoch": 0.31055962032825785, + "mean_token_accuracy": 0.7478134036064148, + "num_tokens": 16349847.0, + "step": 3141, + "train/ce_loss": 1.0517762899398804 + }, + { + "epoch": 0.31055962032825785, + "step": 3141, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.31055962032825785, + "step": 3141, + "train/total_loss": 0.167677640914917 + }, + { + "entropy": 9.273998260498047, + "epoch": 0.3106584931777734, + "mean_token_accuracy": 0.7318840622901917, + "num_tokens": 16354945.0, + "step": 3142, + "train/ce_loss": 1.263601541519165 + }, + { + "epoch": 0.3106584931777734, + "step": 3142, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.3106584931777734, + "step": 3142, + "train/total_loss": 0.22792266309261322 + }, + { + "entropy": 9.283475875854492, + "epoch": 0.31075736602728893, + "mean_token_accuracy": 0.7634561061859131, + "num_tokens": 16360131.0, + "step": 3143, + "train/ce_loss": 0.5632201433181763 + }, + { + "epoch": 0.31075736602728893, + "step": 3143, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.31075736602728893, + "step": 3143, + "train/total_loss": 0.13444700837135315 + }, + { + "entropy": 9.175178527832031, + "epoch": 0.3108562388768044, + "mean_token_accuracy": 0.7473053932189941, + "num_tokens": 16365359.0, + "step": 3144, + "train/ce_loss": 0.8894725441932678 + }, + { + "epoch": 0.3108562388768044, + "step": 3144, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.3108562388768044, + "step": 3144, + "train/total_loss": 0.13972851634025574 + }, + { + "entropy": 9.034858703613281, + "epoch": 0.31095511172631995, + "mean_token_accuracy": 0.7553443908691406, + "num_tokens": 16370678.0, + "step": 3145, + "train/ce_loss": 0.49990347027778625 + }, + { + "epoch": 0.31095511172631995, + "step": 3145, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.31095511172631995, + "step": 3145, + "train/total_loss": 0.10467784851789474 + }, + { + "entropy": 9.623411178588867, + "epoch": 0.3110539845758355, + "mean_token_accuracy": 0.7321428656578064, + "num_tokens": 16375657.0, + "step": 3146, + "train/ce_loss": 5.4201768762141e-06 + }, + { + "epoch": 0.3110539845758355, + "step": 3146, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.3110539845758355, + "step": 3146, + "train/total_loss": 0.04296929016709328 + }, + { + "entropy": 9.056873321533203, + "epoch": 0.311152857425351, + "mean_token_accuracy": 0.7586206793785095, + "num_tokens": 16381050.0, + "step": 3147, + "train/ce_loss": 0.5513096451759338 + }, + { + "epoch": 0.311152857425351, + "step": 3147, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.311152857425351, + "step": 3147, + "train/total_loss": 0.1059122160077095 + }, + { + "entropy": 9.472996711730957, + "epoch": 0.3112517302748665, + "mean_token_accuracy": 0.7312977313995361, + "num_tokens": 16386171.0, + "step": 3148, + "train/ce_loss": 5.088227680971613e-06 + }, + { + "epoch": 0.3112517302748665, + "step": 3148, + "train/sim_loss": 0.14453125 + }, + { + "epoch": 0.3112517302748665, + "step": 3148, + "train/total_loss": 0.1445317566394806 + }, + { + "entropy": 9.353708267211914, + "epoch": 0.31135060312438206, + "mean_token_accuracy": 0.698727011680603, + "num_tokens": 16391339.0, + "step": 3149, + "train/ce_loss": 1.2778841257095337 + }, + { + "epoch": 0.31135060312438206, + "step": 3149, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.31135060312438206, + "step": 3149, + "train/total_loss": 0.22153840959072113 + }, + { + "entropy": 9.361830711364746, + "epoch": 0.31144947597389755, + "mean_token_accuracy": 0.6990740895271301, + "num_tokens": 16396433.0, + "step": 3150, + "train/ce_loss": 1.6144299507141113 + }, + { + "epoch": 0.31144947597389755, + "step": 3150, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.31144947597389755, + "step": 3150, + "train/total_loss": 0.28644299507141113 + }, + { + "entropy": 9.052051544189453, + "epoch": 0.3115483488234131, + "mean_token_accuracy": 0.7449344396591187, + "num_tokens": 16401689.0, + "step": 3151, + "train/ce_loss": 0.8088662624359131 + }, + { + "epoch": 0.3115483488234131, + "step": 3151, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.3115483488234131, + "step": 3151, + "train/total_loss": 0.1902616322040558 + }, + { + "entropy": 9.249351501464844, + "epoch": 0.31164722167292863, + "mean_token_accuracy": 0.7616580128669739, + "num_tokens": 16406969.0, + "step": 3152, + "train/ce_loss": 0.9997432231903076 + }, + { + "epoch": 0.31164722167292863, + "step": 3152, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.31164722167292863, + "step": 3152, + "train/total_loss": 0.20153683423995972 + }, + { + "entropy": 9.221841812133789, + "epoch": 0.3117460945224441, + "mean_token_accuracy": 0.6978609561920166, + "num_tokens": 16412160.0, + "step": 3153, + "train/ce_loss": 1.0066462755203247 + }, + { + "epoch": 0.3117460945224441, + "step": 3153, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.3117460945224441, + "step": 3153, + "train/total_loss": 0.1866021305322647 + }, + { + "entropy": 9.281400680541992, + "epoch": 0.31184496737195966, + "mean_token_accuracy": 0.7664835453033447, + "num_tokens": 16417330.0, + "step": 3154, + "train/ce_loss": 0.7727817296981812 + }, + { + "epoch": 0.31184496737195966, + "step": 3154, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.31184496737195966, + "step": 3154, + "train/total_loss": 0.10852817445993423 + }, + { + "entropy": 8.738920211791992, + "epoch": 0.3119438402214752, + "mean_token_accuracy": 0.7442424297332764, + "num_tokens": 16422602.0, + "step": 3155, + "train/ce_loss": 0.9020947217941284 + }, + { + "epoch": 0.3119438402214752, + "step": 3155, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.3119438402214752, + "step": 3155, + "train/total_loss": 0.1527094841003418 + }, + { + "entropy": 9.108892440795898, + "epoch": 0.3120427130709907, + "mean_token_accuracy": 0.743682324886322, + "num_tokens": 16427880.0, + "step": 3156, + "train/ce_loss": 0.610110342502594 + }, + { + "epoch": 0.3120427130709907, + "step": 3156, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.3120427130709907, + "step": 3156, + "train/total_loss": 0.13522978127002716 + }, + { + "entropy": 8.891284942626953, + "epoch": 0.3121415859205062, + "mean_token_accuracy": 0.7242798209190369, + "num_tokens": 16433372.0, + "step": 3157, + "train/ce_loss": 0.8658456206321716 + }, + { + "epoch": 0.3121415859205062, + "step": 3157, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.3121415859205062, + "step": 3157, + "train/total_loss": 0.17252206802368164 + }, + { + "entropy": 9.185648918151855, + "epoch": 0.31224045877002177, + "mean_token_accuracy": 0.7341935634613037, + "num_tokens": 16438572.0, + "step": 3158, + "train/ce_loss": 0.590491771697998 + }, + { + "epoch": 0.31224045877002177, + "step": 3158, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.31224045877002177, + "step": 3158, + "train/total_loss": 0.09420542418956757 + }, + { + "entropy": 9.610228538513184, + "epoch": 0.31233933161953725, + "mean_token_accuracy": 0.6815742254257202, + "num_tokens": 16443682.0, + "step": 3159, + "train/ce_loss": 0.9321441054344177 + }, + { + "epoch": 0.31233933161953725, + "step": 3159, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.31233933161953725, + "step": 3159, + "train/total_loss": 0.20258942246437073 + }, + { + "epoch": 0.3124382044690528, + "grad_norm": 0.9434493780136108, + "learning_rate": 9.221431043860951e-06, + "loss": 0.1525, + "step": 3160 + }, + { + "entropy": 8.815746307373047, + "epoch": 0.3124382044690528, + "mean_token_accuracy": 0.7195301055908203, + "num_tokens": 16448886.0, + "step": 3160, + "train/ce_loss": 0.8756749033927917 + }, + { + "epoch": 0.3124382044690528, + "step": 3160, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.3124382044690528, + "step": 3160, + "train/total_loss": 0.1695987433195114 + }, + { + "entropy": 9.118791580200195, + "epoch": 0.31253707731856833, + "mean_token_accuracy": 0.7247259616851807, + "num_tokens": 16454171.0, + "step": 3161, + "train/ce_loss": 1.127454161643982 + }, + { + "epoch": 0.31253707731856833, + "step": 3161, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.31253707731856833, + "step": 3161, + "train/total_loss": 0.16743291914463043 + }, + { + "entropy": 9.115190505981445, + "epoch": 0.3126359501680838, + "mean_token_accuracy": 0.723122239112854, + "num_tokens": 16459335.0, + "step": 3162, + "train/ce_loss": 1.120043158531189 + }, + { + "epoch": 0.3126359501680838, + "step": 3162, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.3126359501680838, + "step": 3162, + "train/total_loss": 0.19012930989265442 + }, + { + "entropy": 9.974798202514648, + "epoch": 0.31273482301759936, + "mean_token_accuracy": 0.6629955768585205, + "num_tokens": 16464216.0, + "step": 3163, + "train/ce_loss": 2.5433461666107178 + }, + { + "epoch": 0.31273482301759936, + "step": 3163, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.31273482301759936, + "step": 3163, + "train/total_loss": 0.36370962858200073 + }, + { + "entropy": 9.074464797973633, + "epoch": 0.3128336958671149, + "mean_token_accuracy": 0.7586981058120728, + "num_tokens": 16469597.0, + "step": 3164, + "train/ce_loss": 0.9268357157707214 + }, + { + "epoch": 0.3128336958671149, + "step": 3164, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.3128336958671149, + "step": 3164, + "train/total_loss": 0.1356523334980011 + }, + { + "entropy": 8.859774589538574, + "epoch": 0.3129325687166304, + "mean_token_accuracy": 0.7575107216835022, + "num_tokens": 16475021.0, + "step": 3165, + "train/ce_loss": 0.7807568907737732 + }, + { + "epoch": 0.3129325687166304, + "step": 3165, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.3129325687166304, + "step": 3165, + "train/total_loss": 0.12885694205760956 + }, + { + "entropy": 9.364564895629883, + "epoch": 0.31303144156614593, + "mean_token_accuracy": 0.8024523258209229, + "num_tokens": 16480188.0, + "step": 3166, + "train/ce_loss": 2.2346878267853754e-06 + }, + { + "epoch": 0.31303144156614593, + "step": 3166, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.31303144156614593, + "step": 3166, + "train/total_loss": 0.019531473517417908 + }, + { + "entropy": 9.303370475769043, + "epoch": 0.31313031441566147, + "mean_token_accuracy": 0.7154762148857117, + "num_tokens": 16485415.0, + "step": 3167, + "train/ce_loss": 1.0272390842437744 + }, + { + "epoch": 0.31313031441566147, + "step": 3167, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.31313031441566147, + "step": 3167, + "train/total_loss": 0.16913016140460968 + }, + { + "entropy": 9.708673477172852, + "epoch": 0.31322918726517696, + "mean_token_accuracy": 0.7376146912574768, + "num_tokens": 16490363.0, + "step": 3168, + "train/ce_loss": 0.5770002603530884 + }, + { + "epoch": 0.31322918726517696, + "step": 3168, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.31322918726517696, + "step": 3168, + "train/total_loss": 0.1280125230550766 + }, + { + "entropy": 9.506532669067383, + "epoch": 0.3133280601146925, + "mean_token_accuracy": 0.6943164467811584, + "num_tokens": 16495481.0, + "step": 3169, + "train/ce_loss": 0.9109711647033691 + }, + { + "epoch": 0.3133280601146925, + "step": 3169, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.3133280601146925, + "step": 3169, + "train/total_loss": 0.18875336647033691 + }, + { + "entropy": 9.18989372253418, + "epoch": 0.31342693296420804, + "mean_token_accuracy": 0.7279999852180481, + "num_tokens": 16500676.0, + "step": 3170, + "train/ce_loss": 0.7564029097557068 + }, + { + "epoch": 0.31342693296420804, + "step": 3170, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.31342693296420804, + "step": 3170, + "train/total_loss": 0.15376529097557068 + }, + { + "entropy": 9.426542282104492, + "epoch": 0.3135258058137236, + "mean_token_accuracy": 0.7111716866493225, + "num_tokens": 16505864.0, + "step": 3171, + "train/ce_loss": 0.4606419801712036 + }, + { + "epoch": 0.3135258058137236, + "step": 3171, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.3135258058137236, + "step": 3171, + "train/total_loss": 0.11637669801712036 + }, + { + "entropy": 9.041022300720215, + "epoch": 0.31362467866323906, + "mean_token_accuracy": 0.7745803594589233, + "num_tokens": 16511167.0, + "step": 3172, + "train/ce_loss": 0.7756233215332031 + }, + { + "epoch": 0.31362467866323906, + "step": 3172, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.31362467866323906, + "step": 3172, + "train/total_loss": 0.1517810821533203 + }, + { + "entropy": 9.487861633300781, + "epoch": 0.3137235515127546, + "mean_token_accuracy": 0.7473867535591125, + "num_tokens": 16516217.0, + "step": 3173, + "train/ce_loss": 1.0670230388641357 + }, + { + "epoch": 0.3137235515127546, + "step": 3173, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3137235515127546, + "step": 3173, + "train/total_loss": 0.1535772979259491 + }, + { + "entropy": 9.665082931518555, + "epoch": 0.31382242436227015, + "mean_token_accuracy": 0.7163904309272766, + "num_tokens": 16521222.0, + "step": 3174, + "train/ce_loss": 1.644775390625 + }, + { + "epoch": 0.31382242436227015, + "step": 3174, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.31382242436227015, + "step": 3174, + "train/total_loss": 0.28166502714157104 + }, + { + "entropy": 8.908178329467773, + "epoch": 0.31392129721178563, + "mean_token_accuracy": 0.7110311985015869, + "num_tokens": 16526518.0, + "step": 3175, + "train/ce_loss": 0.7185203433036804 + }, + { + "epoch": 0.31392129721178563, + "step": 3175, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.31392129721178563, + "step": 3175, + "train/total_loss": 0.09528953582048416 + }, + { + "entropy": 9.595062255859375, + "epoch": 0.3140201700613012, + "mean_token_accuracy": 0.6961538195610046, + "num_tokens": 16531477.0, + "step": 3176, + "train/ce_loss": 0.9679924845695496 + }, + { + "epoch": 0.3140201700613012, + "step": 3176, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.3140201700613012, + "step": 3176, + "train/total_loss": 0.12804925441741943 + }, + { + "entropy": 8.992053985595703, + "epoch": 0.3141190429108167, + "mean_token_accuracy": 0.7331838607788086, + "num_tokens": 16536837.0, + "step": 3177, + "train/ce_loss": 0.8361531496047974 + }, + { + "epoch": 0.3141190429108167, + "step": 3177, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.3141190429108167, + "step": 3177, + "train/total_loss": 0.16564656794071198 + }, + { + "entropy": 9.319987297058105, + "epoch": 0.3142179157603322, + "mean_token_accuracy": 0.7346683144569397, + "num_tokens": 16542118.0, + "step": 3178, + "train/ce_loss": 1.495343804359436 + }, + { + "epoch": 0.3142179157603322, + "step": 3178, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.3142179157603322, + "step": 3178, + "train/total_loss": 0.23547188937664032 + }, + { + "entropy": 9.595830917358398, + "epoch": 0.31431678860984774, + "mean_token_accuracy": 0.7257575988769531, + "num_tokens": 16547215.0, + "step": 3179, + "train/ce_loss": 0.7838876247406006 + }, + { + "epoch": 0.31431678860984774, + "step": 3179, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.31431678860984774, + "step": 3179, + "train/total_loss": 0.1487012654542923 + }, + { + "epoch": 0.3144156614593633, + "grad_norm": 0.7691072225570679, + "learning_rate": 9.216486179103003e-06, + "loss": 0.1531, + "step": 3180 + }, + { + "entropy": 9.449749946594238, + "epoch": 0.3144156614593633, + "mean_token_accuracy": 0.6988636255264282, + "num_tokens": 16552342.0, + "step": 3180, + "train/ce_loss": 1.6050139665603638 + }, + { + "epoch": 0.3144156614593633, + "step": 3180, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.3144156614593633, + "step": 3180, + "train/total_loss": 0.2776889204978943 + }, + { + "entropy": 8.785375595092773, + "epoch": 0.31451453430887877, + "mean_token_accuracy": 0.7557436227798462, + "num_tokens": 16557692.0, + "step": 3181, + "train/ce_loss": 1.0190057754516602 + }, + { + "epoch": 0.31451453430887877, + "step": 3181, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.31451453430887877, + "step": 3181, + "train/total_loss": 0.15658807754516602 + }, + { + "entropy": 10.074111938476562, + "epoch": 0.3146134071583943, + "mean_token_accuracy": 0.7526881694793701, + "num_tokens": 16562523.0, + "step": 3182, + "train/ce_loss": 1.4351893663406372 + }, + { + "epoch": 0.3146134071583943, + "step": 3182, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3146134071583943, + "step": 3182, + "train/total_loss": 0.19039393961429596 + }, + { + "entropy": 9.068748474121094, + "epoch": 0.31471228000790985, + "mean_token_accuracy": 0.7479191422462463, + "num_tokens": 16567865.0, + "step": 3183, + "train/ce_loss": 0.7203229665756226 + }, + { + "epoch": 0.31471228000790985, + "step": 3183, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.31471228000790985, + "step": 3183, + "train/total_loss": 0.20093855261802673 + }, + { + "entropy": 9.319864273071289, + "epoch": 0.31481115285742534, + "mean_token_accuracy": 0.7212205529212952, + "num_tokens": 16573022.0, + "step": 3184, + "train/ce_loss": 0.5705752372741699 + }, + { + "epoch": 0.31481115285742534, + "step": 3184, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.31481115285742534, + "step": 3184, + "train/total_loss": 0.12346377968788147 + }, + { + "entropy": 8.951447486877441, + "epoch": 0.3149100257069409, + "mean_token_accuracy": 0.7768691778182983, + "num_tokens": 16578319.0, + "step": 3185, + "train/ce_loss": 0.7091139554977417 + }, + { + "epoch": 0.3149100257069409, + "step": 3185, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.3149100257069409, + "step": 3185, + "train/total_loss": 0.09044265002012253 + }, + { + "entropy": 9.045388221740723, + "epoch": 0.3150088985564564, + "mean_token_accuracy": 0.6927710771560669, + "num_tokens": 16583633.0, + "step": 3186, + "train/ce_loss": 0.8720293045043945 + }, + { + "epoch": 0.3150088985564564, + "step": 3186, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.3150088985564564, + "step": 3186, + "train/total_loss": 0.20048418641090393 + }, + { + "entropy": 8.728281021118164, + "epoch": 0.3151077714059719, + "mean_token_accuracy": 0.7602880597114563, + "num_tokens": 16589125.0, + "step": 3187, + "train/ce_loss": 0.6601799726486206 + }, + { + "epoch": 0.3151077714059719, + "step": 3187, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.3151077714059719, + "step": 3187, + "train/total_loss": 0.0972680002450943 + }, + { + "entropy": 9.076705932617188, + "epoch": 0.31520664425548744, + "mean_token_accuracy": 0.7634961605072021, + "num_tokens": 16594360.0, + "step": 3188, + "train/ce_loss": 0.8506819605827332 + }, + { + "epoch": 0.31520664425548744, + "step": 3188, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.31520664425548744, + "step": 3188, + "train/total_loss": 0.14366194605827332 + }, + { + "entropy": 8.745386123657227, + "epoch": 0.315305517105003, + "mean_token_accuracy": 0.7200435996055603, + "num_tokens": 16599729.0, + "step": 3189, + "train/ce_loss": 1.0710463523864746 + }, + { + "epoch": 0.315305517105003, + "step": 3189, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.315305517105003, + "step": 3189, + "train/total_loss": 0.18522962927818298 + }, + { + "entropy": 9.436115264892578, + "epoch": 0.31540438995451847, + "mean_token_accuracy": 0.703125, + "num_tokens": 16604919.0, + "step": 3190, + "train/ce_loss": 2.183468818664551 + }, + { + "epoch": 0.31540438995451847, + "step": 3190, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.31540438995451847, + "step": 3190, + "train/total_loss": 0.32381564378738403 + }, + { + "entropy": 9.276498794555664, + "epoch": 0.315503262804034, + "mean_token_accuracy": 0.7179487347602844, + "num_tokens": 16610501.0, + "step": 3191, + "train/ce_loss": 0.4966878294944763 + }, + { + "epoch": 0.315503262804034, + "step": 3191, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.315503262804034, + "step": 3191, + "train/total_loss": 0.08091878890991211 + }, + { + "entropy": 8.970152854919434, + "epoch": 0.31560213565354955, + "mean_token_accuracy": 0.6796785593032837, + "num_tokens": 16615819.0, + "step": 3192, + "train/ce_loss": 0.5181173086166382 + }, + { + "epoch": 0.31560213565354955, + "step": 3192, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.31560213565354955, + "step": 3192, + "train/total_loss": 0.09087423235177994 + }, + { + "entropy": 9.527421951293945, + "epoch": 0.31570100850306504, + "mean_token_accuracy": 0.7154471278190613, + "num_tokens": 16620892.0, + "step": 3193, + "train/ce_loss": 0.9545297026634216 + }, + { + "epoch": 0.31570100850306504, + "step": 3193, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.31570100850306504, + "step": 3193, + "train/total_loss": 0.20092171430587769 + }, + { + "entropy": 9.597978591918945, + "epoch": 0.3157998813525806, + "mean_token_accuracy": 0.7486534714698792, + "num_tokens": 16625888.0, + "step": 3194, + "train/ce_loss": 0.6437567472457886 + }, + { + "epoch": 0.3157998813525806, + "step": 3194, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.3157998813525806, + "step": 3194, + "train/total_loss": 0.12296942621469498 + }, + { + "entropy": 8.84429931640625, + "epoch": 0.3158987542020961, + "mean_token_accuracy": 0.7665369510650635, + "num_tokens": 16631432.0, + "step": 3195, + "train/ce_loss": 0.8552486896514893 + }, + { + "epoch": 0.3158987542020961, + "step": 3195, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.3158987542020961, + "step": 3195, + "train/total_loss": 0.14802487194538116 + }, + { + "entropy": 9.271564483642578, + "epoch": 0.3159976270516116, + "mean_token_accuracy": 0.750952959060669, + "num_tokens": 16636665.0, + "step": 3196, + "train/ce_loss": 0.6849063634872437 + }, + { + "epoch": 0.3159976270516116, + "step": 3196, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.3159976270516116, + "step": 3196, + "train/total_loss": 0.1466156393289566 + }, + { + "entropy": 9.034075736999512, + "epoch": 0.31609649990112715, + "mean_token_accuracy": 0.7160919308662415, + "num_tokens": 16642031.0, + "step": 3197, + "train/ce_loss": 0.8857916593551636 + }, + { + "epoch": 0.31609649990112715, + "step": 3197, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.31609649990112715, + "step": 3197, + "train/total_loss": 0.1393604278564453 + }, + { + "entropy": 9.200294494628906, + "epoch": 0.3161953727506427, + "mean_token_accuracy": 0.7126168012619019, + "num_tokens": 16647359.0, + "step": 3198, + "train/ce_loss": 1.015448808670044 + }, + { + "epoch": 0.3161953727506427, + "step": 3198, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3161953727506427, + "step": 3198, + "train/total_loss": 0.14841988682746887 + }, + { + "entropy": 9.951388359069824, + "epoch": 0.3162942456001582, + "mean_token_accuracy": 0.7136563658714294, + "num_tokens": 16652276.0, + "step": 3199, + "train/ce_loss": 1.0957013368606567 + }, + { + "epoch": 0.3162942456001582, + "step": 3199, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.3162942456001582, + "step": 3199, + "train/total_loss": 0.15253889560699463 + }, + { + "epoch": 0.3163931184496737, + "grad_norm": 0.8756003379821777, + "learning_rate": 9.211541314345054e-06, + "loss": 0.1548, + "step": 3200 + }, + { + "entropy": 9.194119453430176, + "epoch": 0.3163931184496737, + "mean_token_accuracy": 0.7058823704719543, + "num_tokens": 16657483.0, + "step": 3200, + "train/ce_loss": 0.7935322523117065 + }, + { + "epoch": 0.3163931184496737, + "step": 3200, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3163931184496737, + "step": 3200, + "train/total_loss": 0.1262282282114029 + }, + { + "entropy": 9.994163513183594, + "epoch": 0.31649199129918926, + "mean_token_accuracy": 0.756302535533905, + "num_tokens": 16662287.0, + "step": 3201, + "train/ce_loss": 7.051830834825523e-06 + }, + { + "epoch": 0.31649199129918926, + "step": 3201, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.31649199129918926, + "step": 3201, + "train/total_loss": 0.04296945407986641 + }, + { + "entropy": 9.194380760192871, + "epoch": 0.31659086414870474, + "mean_token_accuracy": 0.7325870394706726, + "num_tokens": 16667538.0, + "step": 3202, + "train/ce_loss": 1.6637096405029297 + }, + { + "epoch": 0.31659086414870474, + "step": 3202, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.31659086414870474, + "step": 3202, + "train/total_loss": 0.2562147378921509 + }, + { + "entropy": 9.233200073242188, + "epoch": 0.3166897369982203, + "mean_token_accuracy": 0.7308584451675415, + "num_tokens": 16672857.0, + "step": 3203, + "train/ce_loss": 0.989677369594574 + }, + { + "epoch": 0.3166897369982203, + "step": 3203, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.3166897369982203, + "step": 3203, + "train/total_loss": 0.18881148099899292 + }, + { + "entropy": 8.693717956542969, + "epoch": 0.3167886098477358, + "mean_token_accuracy": 0.7471697926521301, + "num_tokens": 16678392.0, + "step": 3204, + "train/ce_loss": 0.99169921875 + }, + { + "epoch": 0.3167886098477358, + "step": 3204, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.3167886098477358, + "step": 3204, + "train/total_loss": 0.18510742485523224 + }, + { + "entropy": 9.406003952026367, + "epoch": 0.3168874826972513, + "mean_token_accuracy": 0.7399463653564453, + "num_tokens": 16683567.0, + "step": 3205, + "train/ce_loss": 1.0676230192184448 + }, + { + "epoch": 0.3168874826972513, + "step": 3205, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.3168874826972513, + "step": 3205, + "train/total_loss": 0.17316855490207672 + }, + { + "entropy": 9.885889053344727, + "epoch": 0.31698635554676685, + "mean_token_accuracy": 0.6584269404411316, + "num_tokens": 16688440.0, + "step": 3206, + "train/ce_loss": 1.2701112031936646 + }, + { + "epoch": 0.31698635554676685, + "step": 3206, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.31698635554676685, + "step": 3206, + "train/total_loss": 0.16607362031936646 + }, + { + "entropy": 9.325813293457031, + "epoch": 0.3170852283962824, + "mean_token_accuracy": 0.7401477694511414, + "num_tokens": 16693729.0, + "step": 3207, + "train/ce_loss": 1.0808814764022827 + }, + { + "epoch": 0.3170852283962824, + "step": 3207, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3170852283962824, + "step": 3207, + "train/total_loss": 0.1549631506204605 + }, + { + "entropy": 9.387101173400879, + "epoch": 0.3171841012457979, + "mean_token_accuracy": 0.7160000205039978, + "num_tokens": 16698937.0, + "step": 3208, + "train/ce_loss": 0.9226993918418884 + }, + { + "epoch": 0.3171841012457979, + "step": 3208, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.3171841012457979, + "step": 3208, + "train/total_loss": 0.17430119216442108 + }, + { + "entropy": 8.81142520904541, + "epoch": 0.3172829740953134, + "mean_token_accuracy": 0.7158403992652893, + "num_tokens": 16704208.0, + "step": 3209, + "train/ce_loss": 1.0430004596710205 + }, + { + "epoch": 0.3172829740953134, + "step": 3209, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3172829740953134, + "step": 3209, + "train/total_loss": 0.15898755192756653 + }, + { + "entropy": 9.000001907348633, + "epoch": 0.31738184694482896, + "mean_token_accuracy": 0.7566702365875244, + "num_tokens": 16709636.0, + "step": 3210, + "train/ce_loss": 0.6519407629966736 + }, + { + "epoch": 0.31738184694482896, + "step": 3210, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.31738184694482896, + "step": 3210, + "train/total_loss": 0.09644407778978348 + }, + { + "entropy": 9.263021469116211, + "epoch": 0.31748071979434445, + "mean_token_accuracy": 0.756926953792572, + "num_tokens": 16714887.0, + "step": 3211, + "train/ce_loss": 0.7199892401695251 + }, + { + "epoch": 0.31748071979434445, + "step": 3211, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.31748071979434445, + "step": 3211, + "train/total_loss": 0.15012392401695251 + }, + { + "entropy": 8.933192253112793, + "epoch": 0.31757959264386, + "mean_token_accuracy": 0.769487738609314, + "num_tokens": 16720282.0, + "step": 3212, + "train/ce_loss": 0.8922109603881836 + }, + { + "epoch": 0.31757959264386, + "step": 3212, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.31757959264386, + "step": 3212, + "train/total_loss": 0.11656484752893448 + }, + { + "entropy": 8.905010223388672, + "epoch": 0.31767846549337553, + "mean_token_accuracy": 0.7305524349212646, + "num_tokens": 16725652.0, + "step": 3213, + "train/ce_loss": 0.9052978754043579 + }, + { + "epoch": 0.31767846549337553, + "step": 3213, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.31767846549337553, + "step": 3213, + "train/total_loss": 0.15302979946136475 + }, + { + "entropy": 10.404428482055664, + "epoch": 0.31777733834289107, + "mean_token_accuracy": 0.8070175647735596, + "num_tokens": 16730211.0, + "step": 3214, + "train/ce_loss": 0.00039690217818133533 + }, + { + "epoch": 0.31777733834289107, + "step": 3214, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.31777733834289107, + "step": 3214, + "train/total_loss": 0.05082093924283981 + }, + { + "entropy": 8.677364349365234, + "epoch": 0.31787621119240655, + "mean_token_accuracy": 0.7495238184928894, + "num_tokens": 16735741.0, + "step": 3215, + "train/ce_loss": 1.1565896272659302 + }, + { + "epoch": 0.31787621119240655, + "step": 3215, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.31787621119240655, + "step": 3215, + "train/total_loss": 0.1390964686870575 + }, + { + "entropy": 9.153155326843262, + "epoch": 0.3179750840419221, + "mean_token_accuracy": 0.7590027451515198, + "num_tokens": 16740934.0, + "step": 3216, + "train/ce_loss": 0.5160759091377258 + }, + { + "epoch": 0.3179750840419221, + "step": 3216, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3179750840419221, + "step": 3216, + "train/total_loss": 0.10629509389400482 + }, + { + "entropy": 9.285148620605469, + "epoch": 0.31807395689143764, + "mean_token_accuracy": 0.7387499809265137, + "num_tokens": 16746179.0, + "step": 3217, + "train/ce_loss": 1.7083173990249634 + }, + { + "epoch": 0.31807395689143764, + "step": 3217, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.31807395689143764, + "step": 3217, + "train/total_loss": 0.24505048990249634 + }, + { + "entropy": 10.631352424621582, + "epoch": 0.3181728297409531, + "mean_token_accuracy": 0.7166666388511658, + "num_tokens": 16750712.0, + "step": 3218, + "train/ce_loss": 1.9401524696149863e-05 + }, + { + "epoch": 0.3181728297409531, + "step": 3218, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.3181728297409531, + "step": 3218, + "train/total_loss": 0.0234394408762455 + }, + { + "entropy": 10.037153244018555, + "epoch": 0.31827170259046866, + "mean_token_accuracy": 0.7359412908554077, + "num_tokens": 16755485.0, + "step": 3219, + "train/ce_loss": 4.458036073629046e-06 + }, + { + "epoch": 0.31827170259046866, + "step": 3219, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.31827170259046866, + "step": 3219, + "train/total_loss": 0.023437945172190666 + }, + { + "epoch": 0.3183705754399842, + "grad_norm": 0.8842136263847351, + "learning_rate": 9.206596449587104e-06, + "loss": 0.1428, + "step": 3220 + }, + { + "entropy": 9.433752059936523, + "epoch": 0.3183705754399842, + "mean_token_accuracy": 0.7534013390541077, + "num_tokens": 16760520.0, + "step": 3220, + "train/ce_loss": 1.7352644205093384 + }, + { + "epoch": 0.3183705754399842, + "step": 3220, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.3183705754399842, + "step": 3220, + "train/total_loss": 0.23212020099163055 + }, + { + "entropy": 9.202911376953125, + "epoch": 0.3184694482894997, + "mean_token_accuracy": 0.7560975551605225, + "num_tokens": 16765810.0, + "step": 3221, + "train/ce_loss": 0.5386577844619751 + }, + { + "epoch": 0.3184694482894997, + "step": 3221, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.3184694482894997, + "step": 3221, + "train/total_loss": 0.08120952546596527 + }, + { + "entropy": 9.050951957702637, + "epoch": 0.31856832113901523, + "mean_token_accuracy": 0.7635402679443359, + "num_tokens": 16771075.0, + "step": 3222, + "train/ce_loss": 0.6109569668769836 + }, + { + "epoch": 0.31856832113901523, + "step": 3222, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.31856832113901523, + "step": 3222, + "train/total_loss": 0.1118769496679306 + }, + { + "entropy": 8.684123039245605, + "epoch": 0.3186671939885308, + "mean_token_accuracy": 0.7515257000923157, + "num_tokens": 16776859.0, + "step": 3223, + "train/ce_loss": 0.7818444967269897 + }, + { + "epoch": 0.3186671939885308, + "step": 3223, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.3186671939885308, + "step": 3223, + "train/total_loss": 0.13677820563316345 + }, + { + "entropy": 9.767860412597656, + "epoch": 0.31876606683804626, + "mean_token_accuracy": 0.7011494040489197, + "num_tokens": 16781818.0, + "step": 3224, + "train/ce_loss": 1.282180905342102 + }, + { + "epoch": 0.31876606683804626, + "step": 3224, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.31876606683804626, + "step": 3224, + "train/total_loss": 0.16728059947490692 + }, + { + "entropy": 9.266471862792969, + "epoch": 0.3188649396875618, + "mean_token_accuracy": 0.7120822668075562, + "num_tokens": 16787079.0, + "step": 3225, + "train/ce_loss": 0.778668999671936 + }, + { + "epoch": 0.3188649396875618, + "step": 3225, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.3188649396875618, + "step": 3225, + "train/total_loss": 0.11692940443754196 + }, + { + "entropy": 9.97616958618164, + "epoch": 0.31896381253707734, + "mean_token_accuracy": 0.7937219738960266, + "num_tokens": 16791900.0, + "step": 3226, + "train/ce_loss": 3.3997418995568296e-06 + }, + { + "epoch": 0.31896381253707734, + "step": 3226, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.31896381253707734, + "step": 3226, + "train/total_loss": 0.02343784086406231 + }, + { + "entropy": 9.26513957977295, + "epoch": 0.3190626853865928, + "mean_token_accuracy": 0.7091836929321289, + "num_tokens": 16797154.0, + "step": 3227, + "train/ce_loss": 0.9920081496238708 + }, + { + "epoch": 0.3190626853865928, + "step": 3227, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.3190626853865928, + "step": 3227, + "train/total_loss": 0.22420081496238708 + }, + { + "entropy": 9.090368270874023, + "epoch": 0.31916155823610837, + "mean_token_accuracy": 0.7353723645210266, + "num_tokens": 16802313.0, + "step": 3228, + "train/ce_loss": 1.317256212234497 + }, + { + "epoch": 0.31916155823610837, + "step": 3228, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.31916155823610837, + "step": 3228, + "train/total_loss": 0.22547562420368195 + }, + { + "entropy": 10.012613296508789, + "epoch": 0.3192604310856239, + "mean_token_accuracy": 0.7511627674102783, + "num_tokens": 16807139.0, + "step": 3229, + "train/ce_loss": 0.7629386186599731 + }, + { + "epoch": 0.3192604310856239, + "step": 3229, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.3192604310856239, + "step": 3229, + "train/total_loss": 0.11145011335611343 + }, + { + "entropy": 9.738869667053223, + "epoch": 0.3193593039351394, + "mean_token_accuracy": 0.7278911471366882, + "num_tokens": 16812152.0, + "step": 3230, + "train/ce_loss": 4.822842583962483e-06 + }, + { + "epoch": 0.3193593039351394, + "step": 3230, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.3193593039351394, + "step": 3230, + "train/total_loss": 0.0664067342877388 + }, + { + "entropy": 9.756131172180176, + "epoch": 0.31945817678465493, + "mean_token_accuracy": 0.676300585269928, + "num_tokens": 16817074.0, + "step": 3231, + "train/ce_loss": 4.654988060792675e-06 + }, + { + "epoch": 0.31945817678465493, + "step": 3231, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.31945817678465493, + "step": 3231, + "train/total_loss": 0.08593796193599701 + }, + { + "entropy": 9.703554153442383, + "epoch": 0.3195570496341705, + "mean_token_accuracy": 0.7551020383834839, + "num_tokens": 16822074.0, + "step": 3232, + "train/ce_loss": 5.237433470028918e-06 + }, + { + "epoch": 0.3195570496341705, + "step": 3232, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3195570496341705, + "step": 3232, + "train/total_loss": 0.05468802526593208 + }, + { + "entropy": 9.267679214477539, + "epoch": 0.31965592248368596, + "mean_token_accuracy": 0.8261421322822571, + "num_tokens": 16827324.0, + "step": 3233, + "train/ce_loss": 0.4245811104774475 + }, + { + "epoch": 0.31965592248368596, + "step": 3233, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.31965592248368596, + "step": 3233, + "train/total_loss": 0.06589561700820923 + }, + { + "entropy": 8.957054138183594, + "epoch": 0.3197547953332015, + "mean_token_accuracy": 0.732300877571106, + "num_tokens": 16832679.0, + "step": 3234, + "train/ce_loss": 0.6917397975921631 + }, + { + "epoch": 0.3197547953332015, + "step": 3234, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.3197547953332015, + "step": 3234, + "train/total_loss": 0.15901774168014526 + }, + { + "entropy": 9.423365592956543, + "epoch": 0.31985366818271704, + "mean_token_accuracy": 0.6760828495025635, + "num_tokens": 16837632.0, + "step": 3235, + "train/ce_loss": 2.117717981338501 + }, + { + "epoch": 0.31985366818271704, + "step": 3235, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.31985366818271704, + "step": 3235, + "train/total_loss": 0.33286553621292114 + }, + { + "entropy": 9.442780494689941, + "epoch": 0.31995254103223253, + "mean_token_accuracy": 0.7576243877410889, + "num_tokens": 16842769.0, + "step": 3236, + "train/ce_loss": 1.2201064825057983 + }, + { + "epoch": 0.31995254103223253, + "step": 3236, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.31995254103223253, + "step": 3236, + "train/total_loss": 0.23919814825057983 + }, + { + "entropy": 9.171239852905273, + "epoch": 0.32005141388174807, + "mean_token_accuracy": 0.7342105507850647, + "num_tokens": 16848004.0, + "step": 3237, + "train/ce_loss": 0.7349594831466675 + }, + { + "epoch": 0.32005141388174807, + "step": 3237, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.32005141388174807, + "step": 3237, + "train/total_loss": 0.10474594682455063 + }, + { + "entropy": 9.50808334350586, + "epoch": 0.3201502867312636, + "mean_token_accuracy": 0.7857142686843872, + "num_tokens": 16853036.0, + "step": 3238, + "train/ce_loss": 0.8079267144203186 + }, + { + "epoch": 0.3201502867312636, + "step": 3238, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3201502867312636, + "step": 3238, + "train/total_loss": 0.12766766548156738 + }, + { + "entropy": 9.111824989318848, + "epoch": 0.3202491595807791, + "mean_token_accuracy": 0.7210965156555176, + "num_tokens": 16858318.0, + "step": 3239, + "train/ce_loss": 0.6129790544509888 + }, + { + "epoch": 0.3202491595807791, + "step": 3239, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.3202491595807791, + "step": 3239, + "train/total_loss": 0.08082915842533112 + }, + { + "epoch": 0.32034803243029464, + "grad_norm": 0.7715190052986145, + "learning_rate": 9.201651584829155e-06, + "loss": 0.1442, + "step": 3240 + }, + { + "entropy": 9.485016822814941, + "epoch": 0.32034803243029464, + "mean_token_accuracy": 0.8025078177452087, + "num_tokens": 16863377.0, + "step": 3240, + "train/ce_loss": 3.0977219012129353e-06 + }, + { + "epoch": 0.32034803243029464, + "step": 3240, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.32034803243029464, + "step": 3240, + "train/total_loss": 0.05859405919909477 + }, + { + "entropy": 9.315765380859375, + "epoch": 0.3204469052798102, + "mean_token_accuracy": 0.7462121248245239, + "num_tokens": 16868608.0, + "step": 3241, + "train/ce_loss": 0.7169209718704224 + }, + { + "epoch": 0.3204469052798102, + "step": 3241, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.3204469052798102, + "step": 3241, + "train/total_loss": 0.1068483516573906 + }, + { + "entropy": 9.324462890625, + "epoch": 0.32054577812932566, + "mean_token_accuracy": 0.7270408272743225, + "num_tokens": 16873884.0, + "step": 3242, + "train/ce_loss": 1.1520270109176636 + }, + { + "epoch": 0.32054577812932566, + "step": 3242, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.32054577812932566, + "step": 3242, + "train/total_loss": 0.23629644513130188 + }, + { + "entropy": 8.780170440673828, + "epoch": 0.3206446509788412, + "mean_token_accuracy": 0.7125129103660583, + "num_tokens": 16879468.0, + "step": 3243, + "train/ce_loss": 1.032793641090393 + }, + { + "epoch": 0.3206446509788412, + "step": 3243, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.3206446509788412, + "step": 3243, + "train/total_loss": 0.19312311708927155 + }, + { + "entropy": 8.811084747314453, + "epoch": 0.32074352382835675, + "mean_token_accuracy": 0.6864801645278931, + "num_tokens": 16884823.0, + "step": 3244, + "train/ce_loss": 0.7729099988937378 + }, + { + "epoch": 0.32074352382835675, + "step": 3244, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.32074352382835675, + "step": 3244, + "train/total_loss": 0.17104101181030273 + }, + { + "entropy": 8.963603019714355, + "epoch": 0.32084239667787223, + "mean_token_accuracy": 0.6889564394950867, + "num_tokens": 16890253.0, + "step": 3245, + "train/ce_loss": 1.0300863981246948 + }, + { + "epoch": 0.32084239667787223, + "step": 3245, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.32084239667787223, + "step": 3245, + "train/total_loss": 0.15378989279270172 + }, + { + "entropy": 8.786639213562012, + "epoch": 0.3209412695273878, + "mean_token_accuracy": 0.7233644723892212, + "num_tokens": 16895816.0, + "step": 3246, + "train/ce_loss": 0.6837566494941711 + }, + { + "epoch": 0.3209412695273878, + "step": 3246, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.3209412695273878, + "step": 3246, + "train/total_loss": 0.15431317687034607 + }, + { + "entropy": 8.63482666015625, + "epoch": 0.3210401423769033, + "mean_token_accuracy": 0.7578431367874146, + "num_tokens": 16901363.0, + "step": 3247, + "train/ce_loss": 1.2165522575378418 + }, + { + "epoch": 0.3210401423769033, + "step": 3247, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.3210401423769033, + "step": 3247, + "train/total_loss": 0.17243647575378418 + }, + { + "entropy": 9.373018264770508, + "epoch": 0.3211390152264188, + "mean_token_accuracy": 0.7123696208000183, + "num_tokens": 16906484.0, + "step": 3248, + "train/ce_loss": 2.632043106132187e-06 + }, + { + "epoch": 0.3211390152264188, + "step": 3248, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.3211390152264188, + "step": 3248, + "train/total_loss": 0.05078151449561119 + }, + { + "entropy": 8.7036771774292, + "epoch": 0.32123788807593434, + "mean_token_accuracy": 0.8161478638648987, + "num_tokens": 16911995.0, + "step": 3249, + "train/ce_loss": 0.49719151854515076 + }, + { + "epoch": 0.32123788807593434, + "step": 3249, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.32123788807593434, + "step": 3249, + "train/total_loss": 0.08096915483474731 + }, + { + "entropy": 9.056543350219727, + "epoch": 0.3213367609254499, + "mean_token_accuracy": 0.7183979749679565, + "num_tokens": 16917262.0, + "step": 3250, + "train/ce_loss": 0.9369495511054993 + }, + { + "epoch": 0.3213367609254499, + "step": 3250, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.3213367609254499, + "step": 3250, + "train/total_loss": 0.15228870511054993 + }, + { + "entropy": 8.583107948303223, + "epoch": 0.32143563377496537, + "mean_token_accuracy": 0.6989351511001587, + "num_tokens": 16922768.0, + "step": 3251, + "train/ce_loss": 0.5158079266548157 + }, + { + "epoch": 0.32143563377496537, + "step": 3251, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.32143563377496537, + "step": 3251, + "train/total_loss": 0.10236204415559769 + }, + { + "entropy": 9.377542495727539, + "epoch": 0.3215345066244809, + "mean_token_accuracy": 0.7456555962562561, + "num_tokens": 16927804.0, + "step": 3252, + "train/ce_loss": 1.2434115409851074 + }, + { + "epoch": 0.3215345066244809, + "step": 3252, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.3215345066244809, + "step": 3252, + "train/total_loss": 0.18293491005897522 + }, + { + "entropy": 9.138757705688477, + "epoch": 0.32163337947399645, + "mean_token_accuracy": 0.7271557450294495, + "num_tokens": 16933053.0, + "step": 3253, + "train/ce_loss": 0.7804774641990662 + }, + { + "epoch": 0.32163337947399645, + "step": 3253, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.32163337947399645, + "step": 3253, + "train/total_loss": 0.1483602523803711 + }, + { + "entropy": 9.449091911315918, + "epoch": 0.321732252323512, + "mean_token_accuracy": 0.7568093538284302, + "num_tokens": 16938054.0, + "step": 3254, + "train/ce_loss": 0.8022903800010681 + }, + { + "epoch": 0.321732252323512, + "step": 3254, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.321732252323512, + "step": 3254, + "train/total_loss": 0.1505415439605713 + }, + { + "entropy": 9.11605453491211, + "epoch": 0.3218311251730275, + "mean_token_accuracy": 0.7584269642829895, + "num_tokens": 16943467.0, + "step": 3255, + "train/ce_loss": 1.0913804769515991 + }, + { + "epoch": 0.3218311251730275, + "step": 3255, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.3218311251730275, + "step": 3255, + "train/total_loss": 0.15991929173469543 + }, + { + "entropy": 9.231228828430176, + "epoch": 0.321929998022543, + "mean_token_accuracy": 0.7351154088973999, + "num_tokens": 16948885.0, + "step": 3256, + "train/ce_loss": 0.8510516285896301 + }, + { + "epoch": 0.321929998022543, + "step": 3256, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.321929998022543, + "step": 3256, + "train/total_loss": 0.12416766583919525 + }, + { + "entropy": 10.03917407989502, + "epoch": 0.32202887087205856, + "mean_token_accuracy": 0.7881773114204407, + "num_tokens": 16953734.0, + "step": 3257, + "train/ce_loss": 4.570816599880345e-06 + }, + { + "epoch": 0.32202887087205856, + "step": 3257, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.32202887087205856, + "step": 3257, + "train/total_loss": 0.04296920821070671 + }, + { + "entropy": 9.223027229309082, + "epoch": 0.32212774372157404, + "mean_token_accuracy": 0.7317073345184326, + "num_tokens": 16958776.0, + "step": 3258, + "train/ce_loss": 1.3286868333816528 + }, + { + "epoch": 0.32212774372157404, + "step": 3258, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.32212774372157404, + "step": 3258, + "train/total_loss": 0.203181192278862 + }, + { + "entropy": 9.222864151000977, + "epoch": 0.3222266165710896, + "mean_token_accuracy": 0.6962190270423889, + "num_tokens": 16963943.0, + "step": 3259, + "train/ce_loss": 1.0266321897506714 + }, + { + "epoch": 0.3222266165710896, + "step": 3259, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.3222266165710896, + "step": 3259, + "train/total_loss": 0.16906946897506714 + }, + { + "epoch": 0.3223254894206051, + "grad_norm": 1.0479072332382202, + "learning_rate": 9.196706720071207e-06, + "loss": 0.1523, + "step": 3260 + }, + { + "entropy": 8.734139442443848, + "epoch": 0.3223254894206051, + "mean_token_accuracy": 0.774685800075531, + "num_tokens": 16969556.0, + "step": 3260, + "train/ce_loss": 0.42346420884132385 + }, + { + "epoch": 0.3223254894206051, + "step": 3260, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.3223254894206051, + "step": 3260, + "train/total_loss": 0.06578391790390015 + }, + { + "entropy": 8.73713493347168, + "epoch": 0.3224243622701206, + "mean_token_accuracy": 0.7389221787452698, + "num_tokens": 16974904.0, + "step": 3261, + "train/ce_loss": 1.1096874475479126 + }, + { + "epoch": 0.3224243622701206, + "step": 3261, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.3224243622701206, + "step": 3261, + "train/total_loss": 0.17737498879432678 + }, + { + "entropy": 9.207422256469727, + "epoch": 0.32252323511963615, + "mean_token_accuracy": 0.7205188870429993, + "num_tokens": 16980227.0, + "step": 3262, + "train/ce_loss": 0.8753836750984192 + }, + { + "epoch": 0.32252323511963615, + "step": 3262, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.32252323511963615, + "step": 3262, + "train/total_loss": 0.18519461154937744 + }, + { + "entropy": 10.055367469787598, + "epoch": 0.3226221079691517, + "mean_token_accuracy": 0.6889952421188354, + "num_tokens": 16985060.0, + "step": 3263, + "train/ce_loss": 1.4159187078475952 + }, + { + "epoch": 0.3226221079691517, + "step": 3263, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.3226221079691517, + "step": 3263, + "train/total_loss": 0.266591876745224 + }, + { + "entropy": 9.026175498962402, + "epoch": 0.3227209808186672, + "mean_token_accuracy": 0.7289719581604004, + "num_tokens": 16990358.0, + "step": 3264, + "train/ce_loss": 0.7006011605262756 + }, + { + "epoch": 0.3227209808186672, + "step": 3264, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.3227209808186672, + "step": 3264, + "train/total_loss": 0.1091226190328598 + }, + { + "entropy": 10.099632263183594, + "epoch": 0.3228198536681827, + "mean_token_accuracy": 0.77173912525177, + "num_tokens": 16995104.0, + "step": 3265, + "train/ce_loss": 6.077885245758807e-06 + }, + { + "epoch": 0.3228198536681827, + "step": 3265, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.3228198536681827, + "step": 3265, + "train/total_loss": 0.05078185722231865 + }, + { + "entropy": 9.232105255126953, + "epoch": 0.32291872651769826, + "mean_token_accuracy": 0.724252462387085, + "num_tokens": 17000159.0, + "step": 3266, + "train/ce_loss": 0.7505518198013306 + }, + { + "epoch": 0.32291872651769826, + "step": 3266, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.32291872651769826, + "step": 3266, + "train/total_loss": 0.11411768198013306 + }, + { + "entropy": 9.911155700683594, + "epoch": 0.32301759936721375, + "mean_token_accuracy": 0.7787056565284729, + "num_tokens": 17005083.0, + "step": 3267, + "train/ce_loss": 1.1564180850982666 + }, + { + "epoch": 0.32301759936721375, + "step": 3267, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.32301759936721375, + "step": 3267, + "train/total_loss": 0.14689180254936218 + }, + { + "entropy": 9.332996368408203, + "epoch": 0.3231164722167293, + "mean_token_accuracy": 0.7897648811340332, + "num_tokens": 17010302.0, + "step": 3268, + "train/ce_loss": 0.8998430967330933 + }, + { + "epoch": 0.3231164722167293, + "step": 3268, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.3231164722167293, + "step": 3268, + "train/total_loss": 0.16420306265354156 + }, + { + "entropy": 9.12976360321045, + "epoch": 0.32321534506624483, + "mean_token_accuracy": 0.7894737124443054, + "num_tokens": 17015635.0, + "step": 3269, + "train/ce_loss": 0.6157534122467041 + }, + { + "epoch": 0.32321534506624483, + "step": 3269, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.32321534506624483, + "step": 3269, + "train/total_loss": 0.08891908824443817 + }, + { + "entropy": 9.445563316345215, + "epoch": 0.3233142179157603, + "mean_token_accuracy": 0.7508772015571594, + "num_tokens": 17020678.0, + "step": 3270, + "train/ce_loss": 0.9528619050979614 + }, + { + "epoch": 0.3233142179157603, + "step": 3270, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.3233142179157603, + "step": 3270, + "train/total_loss": 0.14606744050979614 + }, + { + "entropy": 9.506591796875, + "epoch": 0.32341309076527586, + "mean_token_accuracy": 0.7190812826156616, + "num_tokens": 17025649.0, + "step": 3271, + "train/ce_loss": 2.4868256787158316e-06 + }, + { + "epoch": 0.32341309076527586, + "step": 3271, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.32341309076527586, + "step": 3271, + "train/total_loss": 0.019531499594449997 + }, + { + "entropy": 10.046039581298828, + "epoch": 0.3235119636147914, + "mean_token_accuracy": 0.7460317611694336, + "num_tokens": 17030429.0, + "step": 3272, + "train/ce_loss": 3.1560873594571603e-06 + }, + { + "epoch": 0.3235119636147914, + "step": 3272, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.3235119636147914, + "step": 3272, + "train/total_loss": 0.03125031664967537 + }, + { + "entropy": 9.029333114624023, + "epoch": 0.3236108364643069, + "mean_token_accuracy": 0.7631160616874695, + "num_tokens": 17035544.0, + "step": 3273, + "train/ce_loss": 1.2163193225860596 + }, + { + "epoch": 0.3236108364643069, + "step": 3273, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.3236108364643069, + "step": 3273, + "train/total_loss": 0.1880381852388382 + }, + { + "entropy": 9.000850677490234, + "epoch": 0.3237097093138224, + "mean_token_accuracy": 0.7205128073692322, + "num_tokens": 17040768.0, + "step": 3274, + "train/ce_loss": 1.0964419841766357 + }, + { + "epoch": 0.3237097093138224, + "step": 3274, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.3237097093138224, + "step": 3274, + "train/total_loss": 0.16823795437812805 + }, + { + "entropy": 9.481927871704102, + "epoch": 0.32380858216333797, + "mean_token_accuracy": 0.7527777552604675, + "num_tokens": 17045945.0, + "step": 3275, + "train/ce_loss": 1.2007566690444946 + }, + { + "epoch": 0.32380858216333797, + "step": 3275, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.32380858216333797, + "step": 3275, + "train/total_loss": 0.17476317286491394 + }, + { + "entropy": 8.948715209960938, + "epoch": 0.32390745501285345, + "mean_token_accuracy": 0.7540322542190552, + "num_tokens": 17051436.0, + "step": 3276, + "train/ce_loss": 0.9103338122367859 + }, + { + "epoch": 0.32390745501285345, + "step": 3276, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.32390745501285345, + "step": 3276, + "train/total_loss": 0.15353338420391083 + }, + { + "entropy": 9.306329727172852, + "epoch": 0.324006327862369, + "mean_token_accuracy": 0.7360248565673828, + "num_tokens": 17056539.0, + "step": 3277, + "train/ce_loss": 0.7820467948913574 + }, + { + "epoch": 0.324006327862369, + "step": 3277, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.324006327862369, + "step": 3277, + "train/total_loss": 0.1719546914100647 + }, + { + "entropy": 8.889620780944824, + "epoch": 0.32410520071188453, + "mean_token_accuracy": 0.6985781788825989, + "num_tokens": 17062255.0, + "step": 3278, + "train/ce_loss": 0.6463266611099243 + }, + { + "epoch": 0.32410520071188453, + "step": 3278, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.32410520071188453, + "step": 3278, + "train/total_loss": 0.11932016909122467 + }, + { + "entropy": 9.355274200439453, + "epoch": 0.3242040735614, + "mean_token_accuracy": 0.7539797425270081, + "num_tokens": 17067389.0, + "step": 3279, + "train/ce_loss": 0.6823098659515381 + }, + { + "epoch": 0.3242040735614, + "step": 3279, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.3242040735614, + "step": 3279, + "train/total_loss": 0.10338723659515381 + }, + { + "epoch": 0.32430294641091556, + "grad_norm": 0.8628454208374023, + "learning_rate": 9.191761855313257e-06, + "loss": 0.1487, + "step": 3280 + }, + { + "entropy": 9.686016082763672, + "epoch": 0.32430294641091556, + "mean_token_accuracy": 0.7093275785446167, + "num_tokens": 17072308.0, + "step": 3280, + "train/ce_loss": 2.1156375408172607 + }, + { + "epoch": 0.32430294641091556, + "step": 3280, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.32430294641091556, + "step": 3280, + "train/total_loss": 0.28187626600265503 + }, + { + "entropy": 8.727543830871582, + "epoch": 0.3244018192604311, + "mean_token_accuracy": 0.834343433380127, + "num_tokens": 17077830.0, + "step": 3281, + "train/ce_loss": 0.9242648482322693 + }, + { + "epoch": 0.3244018192604311, + "step": 3281, + "train/sim_loss": 0.16796875 + }, + { + "epoch": 0.3244018192604311, + "step": 3281, + "train/total_loss": 0.26039522886276245 + }, + { + "entropy": 9.704157829284668, + "epoch": 0.3245006921099466, + "mean_token_accuracy": 0.777365505695343, + "num_tokens": 17082804.0, + "step": 3282, + "train/ce_loss": 1.0163599252700806 + }, + { + "epoch": 0.3245006921099466, + "step": 3282, + "train/sim_loss": 0.15625 + }, + { + "epoch": 0.3245006921099466, + "step": 3282, + "train/total_loss": 0.25788599252700806 + }, + { + "entropy": 9.161046981811523, + "epoch": 0.32459956495946213, + "mean_token_accuracy": 0.76106196641922, + "num_tokens": 17088103.0, + "step": 3283, + "train/ce_loss": 0.3052811026573181 + }, + { + "epoch": 0.32459956495946213, + "step": 3283, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.32459956495946213, + "step": 3283, + "train/total_loss": 0.05005936324596405 + }, + { + "entropy": 9.814420700073242, + "epoch": 0.32469843780897767, + "mean_token_accuracy": 0.8077753782272339, + "num_tokens": 17092978.0, + "step": 3284, + "train/ce_loss": 9.27206565393135e-06 + }, + { + "epoch": 0.32469843780897767, + "step": 3284, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.32469843780897767, + "step": 3284, + "train/total_loss": 0.04687592759728432 + }, + { + "entropy": 8.795757293701172, + "epoch": 0.32479731065849315, + "mean_token_accuracy": 0.7015834450721741, + "num_tokens": 17098316.0, + "step": 3285, + "train/ce_loss": 0.6898893713951111 + }, + { + "epoch": 0.32479731065849315, + "step": 3285, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.32479731065849315, + "step": 3285, + "train/total_loss": 0.11977019160985947 + }, + { + "entropy": 8.67201042175293, + "epoch": 0.3248961835080087, + "mean_token_accuracy": 0.7737603187561035, + "num_tokens": 17103730.0, + "step": 3286, + "train/ce_loss": 0.6129236221313477 + }, + { + "epoch": 0.3248961835080087, + "step": 3286, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.3248961835080087, + "step": 3286, + "train/total_loss": 0.088636115193367 + }, + { + "entropy": 8.647956848144531, + "epoch": 0.32499505635752424, + "mean_token_accuracy": 0.7273631691932678, + "num_tokens": 17109229.0, + "step": 3287, + "train/ce_loss": 0.9534184336662292 + }, + { + "epoch": 0.32499505635752424, + "step": 3287, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.32499505635752424, + "step": 3287, + "train/total_loss": 0.19299809634685516 + }, + { + "entropy": 9.613130569458008, + "epoch": 0.3250939292070397, + "mean_token_accuracy": 0.7248120307922363, + "num_tokens": 17114307.0, + "step": 3288, + "train/ce_loss": 1.9667793367261766e-06 + }, + { + "epoch": 0.3250939292070397, + "step": 3288, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.3250939292070397, + "step": 3288, + "train/total_loss": 0.01953144744038582 + }, + { + "entropy": 9.082571029663086, + "epoch": 0.32519280205655526, + "mean_token_accuracy": 0.7468208074569702, + "num_tokens": 17119641.0, + "step": 3289, + "train/ce_loss": 0.3608906865119934 + }, + { + "epoch": 0.32519280205655526, + "step": 3289, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.32519280205655526, + "step": 3289, + "train/total_loss": 0.07905782014131546 + }, + { + "entropy": 9.360298156738281, + "epoch": 0.3252916749060708, + "mean_token_accuracy": 0.7407407164573669, + "num_tokens": 17124711.0, + "step": 3290, + "train/ce_loss": 0.6039575934410095 + }, + { + "epoch": 0.3252916749060708, + "step": 3290, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3252916749060708, + "step": 3290, + "train/total_loss": 0.11508326232433319 + }, + { + "entropy": 9.86099624633789, + "epoch": 0.3253905477555863, + "mean_token_accuracy": 0.7488687634468079, + "num_tokens": 17129567.0, + "step": 3291, + "train/ce_loss": 1.2142945528030396 + }, + { + "epoch": 0.3253905477555863, + "step": 3291, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.3253905477555863, + "step": 3291, + "train/total_loss": 0.1800232082605362 + }, + { + "entropy": 8.741572380065918, + "epoch": 0.32548942060510183, + "mean_token_accuracy": 0.7155172228813171, + "num_tokens": 17135114.0, + "step": 3292, + "train/ce_loss": 0.803793728351593 + }, + { + "epoch": 0.32548942060510183, + "step": 3292, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.32548942060510183, + "step": 3292, + "train/total_loss": 0.14678561687469482 + }, + { + "entropy": 9.086403846740723, + "epoch": 0.3255882934546174, + "mean_token_accuracy": 0.796785295009613, + "num_tokens": 17140425.0, + "step": 3293, + "train/ce_loss": 0.5832430720329285 + }, + { + "epoch": 0.3255882934546174, + "step": 3293, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.3255882934546174, + "step": 3293, + "train/total_loss": 0.10910555720329285 + }, + { + "entropy": 9.40902328491211, + "epoch": 0.32568716630413286, + "mean_token_accuracy": 0.8159999847412109, + "num_tokens": 17145567.0, + "step": 3294, + "train/ce_loss": 1.3668820884049637e-06 + }, + { + "epoch": 0.32568716630413286, + "step": 3294, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.32568716630413286, + "step": 3294, + "train/total_loss": 0.023437635973095894 + }, + { + "entropy": 9.294602394104004, + "epoch": 0.3257860391536484, + "mean_token_accuracy": 0.7448275685310364, + "num_tokens": 17150691.0, + "step": 3295, + "train/ce_loss": 0.8498972058296204 + }, + { + "epoch": 0.3257860391536484, + "step": 3295, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3257860391536484, + "step": 3295, + "train/total_loss": 0.1396772265434265 + }, + { + "entropy": 9.750741958618164, + "epoch": 0.32588491200316394, + "mean_token_accuracy": 0.6804123520851135, + "num_tokens": 17155490.0, + "step": 3296, + "train/ce_loss": 5.232382591202622e-06 + }, + { + "epoch": 0.32588491200316394, + "step": 3296, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.32588491200316394, + "step": 3296, + "train/total_loss": 0.023438023403286934 + }, + { + "entropy": 9.786356925964355, + "epoch": 0.3259837848526795, + "mean_token_accuracy": 0.7335701584815979, + "num_tokens": 17160469.0, + "step": 3297, + "train/ce_loss": 1.3135344982147217 + }, + { + "epoch": 0.3259837848526795, + "step": 3297, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.3259837848526795, + "step": 3297, + "train/total_loss": 0.2016659528017044 + }, + { + "entropy": 8.91646957397461, + "epoch": 0.32608265770219497, + "mean_token_accuracy": 0.770691990852356, + "num_tokens": 17165637.0, + "step": 3298, + "train/ce_loss": 0.9684990644454956 + }, + { + "epoch": 0.32608265770219497, + "step": 3298, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.32608265770219497, + "step": 3298, + "train/total_loss": 0.22575616836547852 + }, + { + "entropy": 8.74300479888916, + "epoch": 0.3261815305517105, + "mean_token_accuracy": 0.7272727489471436, + "num_tokens": 17171049.0, + "step": 3299, + "train/ce_loss": 1.2194308042526245 + }, + { + "epoch": 0.3261815305517105, + "step": 3299, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.3261815305517105, + "step": 3299, + "train/total_loss": 0.20397433638572693 + }, + { + "epoch": 0.32628040340122605, + "grad_norm": 0.7850518822669983, + "learning_rate": 9.18681699055531e-06, + "loss": 0.1409, + "step": 3300 + }, + { + "entropy": 8.887176513671875, + "epoch": 0.32628040340122605, + "mean_token_accuracy": 0.7696506381034851, + "num_tokens": 17176416.0, + "step": 3300, + "train/ce_loss": 0.8500009775161743 + }, + { + "epoch": 0.32628040340122605, + "step": 3300, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.32628040340122605, + "step": 3300, + "train/total_loss": 0.19437509775161743 + }, + { + "entropy": 9.179983139038086, + "epoch": 0.32637927625074153, + "mean_token_accuracy": 0.7182254195213318, + "num_tokens": 17181676.0, + "step": 3301, + "train/ce_loss": 0.6478583216667175 + }, + { + "epoch": 0.32637927625074153, + "step": 3301, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.32637927625074153, + "step": 3301, + "train/total_loss": 0.09603583067655563 + }, + { + "entropy": 8.650181770324707, + "epoch": 0.3264781491002571, + "mean_token_accuracy": 0.7620087265968323, + "num_tokens": 17187096.0, + "step": 3302, + "train/ce_loss": 0.6787617802619934 + }, + { + "epoch": 0.3264781491002571, + "step": 3302, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.3264781491002571, + "step": 3302, + "train/total_loss": 0.1733449399471283 + }, + { + "entropy": 9.029155731201172, + "epoch": 0.3265770219497726, + "mean_token_accuracy": 0.7097142934799194, + "num_tokens": 17192413.0, + "step": 3303, + "train/ce_loss": 0.9886357188224792 + }, + { + "epoch": 0.3265770219497726, + "step": 3303, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3265770219497726, + "step": 3303, + "train/total_loss": 0.15355107188224792 + }, + { + "entropy": 8.551504135131836, + "epoch": 0.3266758947992881, + "mean_token_accuracy": 0.6666666865348816, + "num_tokens": 17197919.0, + "step": 3304, + "train/ce_loss": 1.1812376976013184 + }, + { + "epoch": 0.3266758947992881, + "step": 3304, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.3266758947992881, + "step": 3304, + "train/total_loss": 0.21187376976013184 + }, + { + "entropy": 9.83358383178711, + "epoch": 0.32677476764880364, + "mean_token_accuracy": 0.8218181729316711, + "num_tokens": 17202870.0, + "step": 3305, + "train/ce_loss": 2.4499606752215186e-06 + }, + { + "epoch": 0.32677476764880364, + "step": 3305, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.32677476764880364, + "step": 3305, + "train/total_loss": 0.0156252458691597 + }, + { + "entropy": 9.238372802734375, + "epoch": 0.3268736404983192, + "mean_token_accuracy": 0.7621145248413086, + "num_tokens": 17208061.0, + "step": 3306, + "train/ce_loss": 3.0834571589366533e-06 + }, + { + "epoch": 0.3268736404983192, + "step": 3306, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.3268736404983192, + "step": 3306, + "train/total_loss": 0.08593780547380447 + }, + { + "entropy": 9.290925025939941, + "epoch": 0.32697251334783467, + "mean_token_accuracy": 0.7260677218437195, + "num_tokens": 17213221.0, + "step": 3307, + "train/ce_loss": 1.2993706464767456 + }, + { + "epoch": 0.32697251334783467, + "step": 3307, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.32697251334783467, + "step": 3307, + "train/total_loss": 0.2393120676279068 + }, + { + "entropy": 9.71102523803711, + "epoch": 0.3270713861973502, + "mean_token_accuracy": 0.7114093899726868, + "num_tokens": 17218246.0, + "step": 3308, + "train/ce_loss": 1.9544768292689696e-06 + }, + { + "epoch": 0.3270713861973502, + "step": 3308, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3270713861973502, + "step": 3308, + "train/total_loss": 0.04687519371509552 + }, + { + "entropy": 9.074646949768066, + "epoch": 0.32717025904686575, + "mean_token_accuracy": 0.7472035884857178, + "num_tokens": 17223649.0, + "step": 3309, + "train/ce_loss": 0.647305428981781 + }, + { + "epoch": 0.32717025904686575, + "step": 3309, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.32717025904686575, + "step": 3309, + "train/total_loss": 0.12723055481910706 + }, + { + "entropy": 8.787233352661133, + "epoch": 0.32726913189638124, + "mean_token_accuracy": 0.7602397799491882, + "num_tokens": 17229133.0, + "step": 3310, + "train/ce_loss": 0.5999415516853333 + }, + { + "epoch": 0.32726913189638124, + "step": 3310, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.32726913189638124, + "step": 3310, + "train/total_loss": 0.0990566611289978 + }, + { + "entropy": 8.729236602783203, + "epoch": 0.3273680047458968, + "mean_token_accuracy": 0.7316620349884033, + "num_tokens": 17234668.0, + "step": 3311, + "train/ce_loss": 1.5345820188522339 + }, + { + "epoch": 0.3273680047458968, + "step": 3311, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.3273680047458968, + "step": 3311, + "train/total_loss": 0.21986445784568787 + }, + { + "entropy": 8.944244384765625, + "epoch": 0.3274668775954123, + "mean_token_accuracy": 0.7325194478034973, + "num_tokens": 17240048.0, + "step": 3312, + "train/ce_loss": 0.7197909951210022 + }, + { + "epoch": 0.3274668775954123, + "step": 3312, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.3274668775954123, + "step": 3312, + "train/total_loss": 0.1032290980219841 + }, + { + "entropy": 9.22994613647461, + "epoch": 0.3275657504449278, + "mean_token_accuracy": 0.7666666507720947, + "num_tokens": 17245188.0, + "step": 3313, + "train/ce_loss": 0.8360373377799988 + }, + { + "epoch": 0.3275657504449278, + "step": 3313, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3275657504449278, + "step": 3313, + "train/total_loss": 0.13829123973846436 + }, + { + "entropy": 9.057548522949219, + "epoch": 0.32766462329444335, + "mean_token_accuracy": 0.7311960458755493, + "num_tokens": 17250437.0, + "step": 3314, + "train/ce_loss": 0.7472837567329407 + }, + { + "epoch": 0.32766462329444335, + "step": 3314, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.32766462329444335, + "step": 3314, + "train/total_loss": 0.12160337716341019 + }, + { + "entropy": 9.754228591918945, + "epoch": 0.3277634961439589, + "mean_token_accuracy": 0.6967418789863586, + "num_tokens": 17255304.0, + "step": 3315, + "train/ce_loss": 1.4389730495167896e-05 + }, + { + "epoch": 0.3277634961439589, + "step": 3315, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3277634961439589, + "step": 3315, + "train/total_loss": 0.046876437962055206 + }, + { + "entropy": 9.672286033630371, + "epoch": 0.3278623689934744, + "mean_token_accuracy": 0.7417218685150146, + "num_tokens": 17260334.0, + "step": 3316, + "train/ce_loss": 1.4596654176712036 + }, + { + "epoch": 0.3278623689934744, + "step": 3316, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.3278623689934744, + "step": 3316, + "train/total_loss": 0.2944040298461914 + }, + { + "entropy": 8.372819900512695, + "epoch": 0.3279612418429899, + "mean_token_accuracy": 0.7288888692855835, + "num_tokens": 17265775.0, + "step": 3317, + "train/ce_loss": 1.578393816947937 + }, + { + "epoch": 0.3279612418429899, + "step": 3317, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.3279612418429899, + "step": 3317, + "train/total_loss": 0.2906518876552582 + }, + { + "entropy": 9.448688507080078, + "epoch": 0.32806011469250546, + "mean_token_accuracy": 0.7260638475418091, + "num_tokens": 17270928.0, + "step": 3318, + "train/ce_loss": 1.353439211845398 + }, + { + "epoch": 0.32806011469250546, + "step": 3318, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.32806011469250546, + "step": 3318, + "train/total_loss": 0.22909392416477203 + }, + { + "entropy": 8.928474426269531, + "epoch": 0.32815898754202094, + "mean_token_accuracy": 0.6913319230079651, + "num_tokens": 17276352.0, + "step": 3319, + "train/ce_loss": 0.34745627641677856 + }, + { + "epoch": 0.32815898754202094, + "step": 3319, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.32815898754202094, + "step": 3319, + "train/total_loss": 0.09333938360214233 + }, + { + "epoch": 0.3282578603915365, + "grad_norm": 0.7752724885940552, + "learning_rate": 9.18187212579736e-06, + "loss": 0.1568, + "step": 3320 + }, + { + "entropy": 9.880072593688965, + "epoch": 0.3282578603915365, + "mean_token_accuracy": 0.7188405990600586, + "num_tokens": 17281088.0, + "step": 3320, + "train/ce_loss": 4.62160005554324e-06 + }, + { + "epoch": 0.3282578603915365, + "step": 3320, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.3282578603915365, + "step": 3320, + "train/total_loss": 0.05078171193599701 + }, + { + "entropy": 9.501537322998047, + "epoch": 0.328356733241052, + "mean_token_accuracy": 0.7589820623397827, + "num_tokens": 17286241.0, + "step": 3321, + "train/ce_loss": 1.1275746822357178 + }, + { + "epoch": 0.328356733241052, + "step": 3321, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.328356733241052, + "step": 3321, + "train/total_loss": 0.17525747418403625 + }, + { + "entropy": 9.380807876586914, + "epoch": 0.3284556060905675, + "mean_token_accuracy": 0.8179271817207336, + "num_tokens": 17291434.0, + "step": 3322, + "train/ce_loss": 0.5504809617996216 + }, + { + "epoch": 0.3284556060905675, + "step": 3322, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3284556060905675, + "step": 3322, + "train/total_loss": 0.10973559319972992 + }, + { + "entropy": 9.613485336303711, + "epoch": 0.32855447894008305, + "mean_token_accuracy": 0.8215962648391724, + "num_tokens": 17296491.0, + "step": 3323, + "train/ce_loss": 0.5445230603218079 + }, + { + "epoch": 0.32855447894008305, + "step": 3323, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.32855447894008305, + "step": 3323, + "train/total_loss": 0.0778898075222969 + }, + { + "entropy": 9.144057273864746, + "epoch": 0.3286533517895986, + "mean_token_accuracy": 0.747474730014801, + "num_tokens": 17301778.0, + "step": 3324, + "train/ce_loss": 0.9967942833900452 + }, + { + "epoch": 0.3286533517895986, + "step": 3324, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.3286533517895986, + "step": 3324, + "train/total_loss": 0.15046069025993347 + }, + { + "entropy": 9.607072830200195, + "epoch": 0.3287522246391141, + "mean_token_accuracy": 0.7135325074195862, + "num_tokens": 17306792.0, + "step": 3325, + "train/ce_loss": 3.519417987263296e-06 + }, + { + "epoch": 0.3287522246391141, + "step": 3325, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3287522246391141, + "step": 3325, + "train/total_loss": 0.054687850177288055 + }, + { + "entropy": 9.660314559936523, + "epoch": 0.3288510974886296, + "mean_token_accuracy": 0.7192716002464294, + "num_tokens": 17311899.0, + "step": 3326, + "train/ce_loss": 1.3766443729400635 + }, + { + "epoch": 0.3288510974886296, + "step": 3326, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.3288510974886296, + "step": 3326, + "train/total_loss": 0.20016443729400635 + }, + { + "entropy": 9.089285850524902, + "epoch": 0.32894997033814516, + "mean_token_accuracy": 0.6810228824615479, + "num_tokens": 17317099.0, + "step": 3327, + "train/ce_loss": 0.921363890171051 + }, + { + "epoch": 0.32894997033814516, + "step": 3327, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.32894997033814516, + "step": 3327, + "train/total_loss": 0.14291763305664062 + }, + { + "entropy": 9.025397300720215, + "epoch": 0.32904884318766064, + "mean_token_accuracy": 0.8428720235824585, + "num_tokens": 17322534.0, + "step": 3328, + "train/ce_loss": 0.7051679491996765 + }, + { + "epoch": 0.32904884318766064, + "step": 3328, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.32904884318766064, + "step": 3328, + "train/total_loss": 0.16426679491996765 + }, + { + "entropy": 9.427388191223145, + "epoch": 0.3291477160371762, + "mean_token_accuracy": 0.7158774137496948, + "num_tokens": 17327690.0, + "step": 3329, + "train/ce_loss": 0.8867825269699097 + }, + { + "epoch": 0.3291477160371762, + "step": 3329, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.3291477160371762, + "step": 3329, + "train/total_loss": 0.1394595056772232 + }, + { + "entropy": 9.08204460144043, + "epoch": 0.3292465888866917, + "mean_token_accuracy": 0.7878412008285522, + "num_tokens": 17332978.0, + "step": 3330, + "train/ce_loss": 6.495631623693043e-06 + }, + { + "epoch": 0.3292465888866917, + "step": 3330, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3292465888866917, + "step": 3330, + "train/total_loss": 0.04687564820051193 + }, + { + "entropy": 8.862564086914062, + "epoch": 0.3293454617362072, + "mean_token_accuracy": 0.7699293494224548, + "num_tokens": 17338440.0, + "step": 3331, + "train/ce_loss": 0.8806781768798828 + }, + { + "epoch": 0.3293454617362072, + "step": 3331, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.3293454617362072, + "step": 3331, + "train/total_loss": 0.16228657960891724 + }, + { + "entropy": 8.995798110961914, + "epoch": 0.32944433458572275, + "mean_token_accuracy": 0.7753396034240723, + "num_tokens": 17343931.0, + "step": 3332, + "train/ce_loss": 0.5379678606987 + }, + { + "epoch": 0.32944433458572275, + "step": 3332, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.32944433458572275, + "step": 3332, + "train/total_loss": 0.12020303308963776 + }, + { + "entropy": 9.290687561035156, + "epoch": 0.3295432074352383, + "mean_token_accuracy": 0.7165697813034058, + "num_tokens": 17349064.0, + "step": 3333, + "train/ce_loss": 1.9717219856829615e-06 + }, + { + "epoch": 0.3295432074352383, + "step": 3333, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3295432074352383, + "step": 3333, + "train/total_loss": 0.04687519744038582 + }, + { + "entropy": 9.608075141906738, + "epoch": 0.3296420802847538, + "mean_token_accuracy": 0.80738365650177, + "num_tokens": 17354153.0, + "step": 3334, + "train/ce_loss": 2.263906480948208e-06 + }, + { + "epoch": 0.3296420802847538, + "step": 3334, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.3296420802847538, + "step": 3334, + "train/total_loss": 0.039062727242708206 + }, + { + "entropy": 9.076807975769043, + "epoch": 0.3297409531342693, + "mean_token_accuracy": 0.7295742034912109, + "num_tokens": 17359534.0, + "step": 3335, + "train/ce_loss": 0.7153067588806152 + }, + { + "epoch": 0.3297409531342693, + "step": 3335, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.3297409531342693, + "step": 3335, + "train/total_loss": 0.14574941992759705 + }, + { + "entropy": 9.463798522949219, + "epoch": 0.32983982598378486, + "mean_token_accuracy": 0.7430249452590942, + "num_tokens": 17364666.0, + "step": 3336, + "train/ce_loss": 1.0005092008213978e-05 + }, + { + "epoch": 0.32983982598378486, + "step": 3336, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.32983982598378486, + "step": 3336, + "train/total_loss": 0.039063502103090286 + }, + { + "entropy": 9.013387680053711, + "epoch": 0.3299386988333004, + "mean_token_accuracy": 0.7323529124259949, + "num_tokens": 17369975.0, + "step": 3337, + "train/ce_loss": 1.050665020942688 + }, + { + "epoch": 0.3299386988333004, + "step": 3337, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.3299386988333004, + "step": 3337, + "train/total_loss": 0.16756650805473328 + }, + { + "entropy": 9.326025009155273, + "epoch": 0.3300375716828159, + "mean_token_accuracy": 0.7771428823471069, + "num_tokens": 17375113.0, + "step": 3338, + "train/ce_loss": 0.417694628238678 + }, + { + "epoch": 0.3300375716828159, + "step": 3338, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.3300375716828159, + "step": 3338, + "train/total_loss": 0.06911320984363556 + }, + { + "entropy": 8.72673511505127, + "epoch": 0.33013644453233143, + "mean_token_accuracy": 0.7530120611190796, + "num_tokens": 17380644.0, + "step": 3339, + "train/ce_loss": 0.4539792835712433 + }, + { + "epoch": 0.33013644453233143, + "step": 3339, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.33013644453233143, + "step": 3339, + "train/total_loss": 0.08446042984724045 + }, + { + "epoch": 0.33023531738184697, + "grad_norm": 0.7518821954727173, + "learning_rate": 9.17692726103941e-06, + "loss": 0.1369, + "step": 3340 + }, + { + "entropy": 8.83997917175293, + "epoch": 0.33023531738184697, + "mean_token_accuracy": 0.7636761665344238, + "num_tokens": 17386031.0, + "step": 3340, + "train/ce_loss": 0.7052160501480103 + }, + { + "epoch": 0.33023531738184697, + "step": 3340, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.33023531738184697, + "step": 3340, + "train/total_loss": 0.15255285799503326 + }, + { + "entropy": 8.709097862243652, + "epoch": 0.33033419023136246, + "mean_token_accuracy": 0.6950549483299255, + "num_tokens": 17391586.0, + "step": 3341, + "train/ce_loss": 0.5241850018501282 + }, + { + "epoch": 0.33033419023136246, + "step": 3341, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.33033419023136246, + "step": 3341, + "train/total_loss": 0.13054350018501282 + }, + { + "entropy": 9.87069034576416, + "epoch": 0.330433063080878, + "mean_token_accuracy": 0.8385744094848633, + "num_tokens": 17396475.0, + "step": 3342, + "train/ce_loss": 2.3771021915308665e-06 + }, + { + "epoch": 0.330433063080878, + "step": 3342, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.330433063080878, + "step": 3342, + "train/total_loss": 0.0234377384185791 + }, + { + "entropy": 10.025890350341797, + "epoch": 0.33053193593039354, + "mean_token_accuracy": 0.751207709312439, + "num_tokens": 17401317.0, + "step": 3343, + "train/ce_loss": 2.90927482637926e-06 + }, + { + "epoch": 0.33053193593039354, + "step": 3343, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.33053193593039354, + "step": 3343, + "train/total_loss": 0.06250029057264328 + }, + { + "entropy": 9.361654281616211, + "epoch": 0.330630808779909, + "mean_token_accuracy": 0.7660484910011292, + "num_tokens": 17406504.0, + "step": 3344, + "train/ce_loss": 0.613772988319397 + }, + { + "epoch": 0.330630808779909, + "step": 3344, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.330630808779909, + "step": 3344, + "train/total_loss": 0.07700230181217194 + }, + { + "entropy": 9.017574310302734, + "epoch": 0.33072968162942457, + "mean_token_accuracy": 0.7683615684509277, + "num_tokens": 17411911.0, + "step": 3345, + "train/ce_loss": 0.8716453313827515 + }, + { + "epoch": 0.33072968162942457, + "step": 3345, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.33072968162942457, + "step": 3345, + "train/total_loss": 0.15747703611850739 + }, + { + "entropy": 9.815315246582031, + "epoch": 0.3308285544789401, + "mean_token_accuracy": 0.6613636612892151, + "num_tokens": 17416713.0, + "step": 3346, + "train/ce_loss": 9.277481694880407e-06 + }, + { + "epoch": 0.3308285544789401, + "step": 3346, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.3308285544789401, + "step": 3346, + "train/total_loss": 0.11718843132257462 + }, + { + "entropy": 9.554647445678711, + "epoch": 0.3309274273284556, + "mean_token_accuracy": 0.7161654233932495, + "num_tokens": 17421720.0, + "step": 3347, + "train/ce_loss": 1.4893670082092285 + }, + { + "epoch": 0.3309274273284556, + "step": 3347, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.3309274273284556, + "step": 3347, + "train/total_loss": 0.2348742038011551 + }, + { + "entropy": 8.847419738769531, + "epoch": 0.33102630017797113, + "mean_token_accuracy": 0.7027601003646851, + "num_tokens": 17427104.0, + "step": 3348, + "train/ce_loss": 1.1632755994796753 + }, + { + "epoch": 0.33102630017797113, + "step": 3348, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.33102630017797113, + "step": 3348, + "train/total_loss": 0.19835880398750305 + }, + { + "entropy": 9.26319694519043, + "epoch": 0.3311251730274867, + "mean_token_accuracy": 0.7883333563804626, + "num_tokens": 17432137.0, + "step": 3349, + "train/ce_loss": 3.58709644388e-06 + }, + { + "epoch": 0.3311251730274867, + "step": 3349, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.3311251730274867, + "step": 3349, + "train/total_loss": 0.05078160762786865 + }, + { + "entropy": 8.61819076538086, + "epoch": 0.33122404587700216, + "mean_token_accuracy": 0.7494692206382751, + "num_tokens": 17437573.0, + "step": 3350, + "train/ce_loss": 0.7018135786056519 + }, + { + "epoch": 0.33122404587700216, + "step": 3350, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.33122404587700216, + "step": 3350, + "train/total_loss": 0.12877511978149414 + }, + { + "entropy": 9.104698181152344, + "epoch": 0.3313229187265177, + "mean_token_accuracy": 0.7133917212486267, + "num_tokens": 17442841.0, + "step": 3351, + "train/ce_loss": 1.2091833353042603 + }, + { + "epoch": 0.3313229187265177, + "step": 3351, + "train/sim_loss": 0.19921875 + }, + { + "epoch": 0.3313229187265177, + "step": 3351, + "train/total_loss": 0.320137083530426 + }, + { + "entropy": 8.993057250976562, + "epoch": 0.33142179157603324, + "mean_token_accuracy": 0.7660256624221802, + "num_tokens": 17448257.0, + "step": 3352, + "train/ce_loss": 0.6769647598266602 + }, + { + "epoch": 0.33142179157603324, + "step": 3352, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.33142179157603324, + "step": 3352, + "train/total_loss": 0.1341027319431305 + }, + { + "entropy": 9.338879585266113, + "epoch": 0.33152066442554873, + "mean_token_accuracy": 0.7210242748260498, + "num_tokens": 17453449.0, + "step": 3353, + "train/ce_loss": 0.9190797209739685 + }, + { + "epoch": 0.33152066442554873, + "step": 3353, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.33152066442554873, + "step": 3353, + "train/total_loss": 0.13878297805786133 + }, + { + "entropy": 9.517727851867676, + "epoch": 0.33161953727506427, + "mean_token_accuracy": 0.7896296381950378, + "num_tokens": 17458526.0, + "step": 3354, + "train/ce_loss": 0.8352819085121155 + }, + { + "epoch": 0.33161953727506427, + "step": 3354, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.33161953727506427, + "step": 3354, + "train/total_loss": 0.11087194085121155 + }, + { + "entropy": 8.94295883178711, + "epoch": 0.3317184101245798, + "mean_token_accuracy": 0.723127007484436, + "num_tokens": 17463947.0, + "step": 3355, + "train/ce_loss": 0.895768404006958 + }, + { + "epoch": 0.3317184101245798, + "step": 3355, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.3317184101245798, + "step": 3355, + "train/total_loss": 0.1481705904006958 + }, + { + "entropy": 9.064741134643555, + "epoch": 0.3318172829740953, + "mean_token_accuracy": 0.6784037351608276, + "num_tokens": 17469284.0, + "step": 3356, + "train/ce_loss": 1.3284772634506226 + }, + { + "epoch": 0.3318172829740953, + "step": 3356, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.3318172829740953, + "step": 3356, + "train/total_loss": 0.24612897634506226 + }, + { + "entropy": 9.327483177185059, + "epoch": 0.33191615582361084, + "mean_token_accuracy": 0.8005390763282776, + "num_tokens": 17474422.0, + "step": 3357, + "train/ce_loss": 0.6446126103401184 + }, + { + "epoch": 0.33191615582361084, + "step": 3357, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.33191615582361084, + "step": 3357, + "train/total_loss": 0.08789876103401184 + }, + { + "entropy": 8.652473449707031, + "epoch": 0.3320150286731264, + "mean_token_accuracy": 0.7570776343345642, + "num_tokens": 17480060.0, + "step": 3358, + "train/ce_loss": 0.8517627716064453 + }, + { + "epoch": 0.3320150286731264, + "step": 3358, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3320150286731264, + "step": 3358, + "train/total_loss": 0.1398637890815735 + }, + { + "entropy": 8.441993713378906, + "epoch": 0.33211390152264186, + "mean_token_accuracy": 0.7657308578491211, + "num_tokens": 17485646.0, + "step": 3359, + "train/ce_loss": 0.44434165954589844 + }, + { + "epoch": 0.33211390152264186, + "step": 3359, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.33211390152264186, + "step": 3359, + "train/total_loss": 0.11474666744470596 + }, + { + "epoch": 0.3322127743721574, + "grad_norm": 0.7457917332649231, + "learning_rate": 9.171982396281463e-06, + "loss": 0.1451, + "step": 3360 + }, + { + "entropy": 9.506464004516602, + "epoch": 0.3322127743721574, + "mean_token_accuracy": 0.722129762172699, + "num_tokens": 17490669.0, + "step": 3360, + "train/ce_loss": 1.7210886478424072 + }, + { + "epoch": 0.3322127743721574, + "step": 3360, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3322127743721574, + "step": 3360, + "train/total_loss": 0.22679637372493744 + }, + { + "entropy": 9.958076477050781, + "epoch": 0.33231164722167295, + "mean_token_accuracy": 0.7402597665786743, + "num_tokens": 17495499.0, + "step": 3361, + "train/ce_loss": 2.4337151050567627 + }, + { + "epoch": 0.33231164722167295, + "step": 3361, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.33231164722167295, + "step": 3361, + "train/total_loss": 0.29415276646614075 + }, + { + "entropy": 9.22468090057373, + "epoch": 0.33241052007118843, + "mean_token_accuracy": 0.7092511057853699, + "num_tokens": 17500605.0, + "step": 3362, + "train/ce_loss": 0.7624481320381165 + }, + { + "epoch": 0.33241052007118843, + "step": 3362, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.33241052007118843, + "step": 3362, + "train/total_loss": 0.10749481618404388 + }, + { + "entropy": 9.560653686523438, + "epoch": 0.33250939292070397, + "mean_token_accuracy": 0.7523659467697144, + "num_tokens": 17505674.0, + "step": 3363, + "train/ce_loss": 0.8970683217048645 + }, + { + "epoch": 0.33250939292070397, + "step": 3363, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.33250939292070397, + "step": 3363, + "train/total_loss": 0.10923808068037033 + }, + { + "entropy": 9.083115577697754, + "epoch": 0.3326082657702195, + "mean_token_accuracy": 0.7306843400001526, + "num_tokens": 17511066.0, + "step": 3364, + "train/ce_loss": 0.8307674527168274 + }, + { + "epoch": 0.3326082657702195, + "step": 3364, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.3326082657702195, + "step": 3364, + "train/total_loss": 0.14948299527168274 + }, + { + "entropy": 9.192109107971191, + "epoch": 0.332707138619735, + "mean_token_accuracy": 0.7402912378311157, + "num_tokens": 17516323.0, + "step": 3365, + "train/ce_loss": 0.43051204085350037 + }, + { + "epoch": 0.332707138619735, + "step": 3365, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.332707138619735, + "step": 3365, + "train/total_loss": 0.06648870557546616 + }, + { + "entropy": 9.800932884216309, + "epoch": 0.33280601146925054, + "mean_token_accuracy": 0.7151394486427307, + "num_tokens": 17521236.0, + "step": 3366, + "train/ce_loss": 2.51992560151848e-06 + }, + { + "epoch": 0.33280601146925054, + "step": 3366, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.33280601146925054, + "step": 3366, + "train/total_loss": 0.023437751457095146 + }, + { + "entropy": 9.11934757232666, + "epoch": 0.3329048843187661, + "mean_token_accuracy": 0.7839080691337585, + "num_tokens": 17526605.0, + "step": 3367, + "train/ce_loss": 1.0057429075241089 + }, + { + "epoch": 0.3329048843187661, + "step": 3367, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.3329048843187661, + "step": 3367, + "train/total_loss": 0.12401179224252701 + }, + { + "entropy": 9.207134246826172, + "epoch": 0.33300375716828157, + "mean_token_accuracy": 0.7644736766815186, + "num_tokens": 17531820.0, + "step": 3368, + "train/ce_loss": 0.9864628911018372 + }, + { + "epoch": 0.33300375716828157, + "step": 3368, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.33300375716828157, + "step": 3368, + "train/total_loss": 0.16114628314971924 + }, + { + "entropy": 9.180716514587402, + "epoch": 0.3331026300177971, + "mean_token_accuracy": 0.7611510753631592, + "num_tokens": 17537022.0, + "step": 3369, + "train/ce_loss": 1.3197712898254395 + }, + { + "epoch": 0.3331026300177971, + "step": 3369, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.3331026300177971, + "step": 3369, + "train/total_loss": 0.2257271260023117 + }, + { + "entropy": 10.049286842346191, + "epoch": 0.33320150286731265, + "mean_token_accuracy": 0.7659574747085571, + "num_tokens": 17541755.0, + "step": 3370, + "train/ce_loss": 1.915870189666748 + }, + { + "epoch": 0.33320150286731265, + "step": 3370, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.33320150286731265, + "step": 3370, + "train/total_loss": 0.27752453088760376 + }, + { + "entropy": 9.705484390258789, + "epoch": 0.33330037571682813, + "mean_token_accuracy": 0.7814313173294067, + "num_tokens": 17546717.0, + "step": 3371, + "train/ce_loss": 2.390544295849395e-06 + }, + { + "epoch": 0.33330037571682813, + "step": 3371, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.33330037571682813, + "step": 3371, + "train/total_loss": 0.0468752384185791 + }, + { + "entropy": 9.473861694335938, + "epoch": 0.3333992485663437, + "mean_token_accuracy": 0.7057010531425476, + "num_tokens": 17551795.0, + "step": 3372, + "train/ce_loss": 2.3074352741241455 + }, + { + "epoch": 0.3333992485663437, + "step": 3372, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.3333992485663437, + "step": 3372, + "train/total_loss": 0.32449352741241455 + }, + { + "entropy": 9.567621231079102, + "epoch": 0.3334981214158592, + "mean_token_accuracy": 0.6865149140357971, + "num_tokens": 17556808.0, + "step": 3373, + "train/ce_loss": 1.390802025794983 + }, + { + "epoch": 0.3334981214158592, + "step": 3373, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.3334981214158592, + "step": 3373, + "train/total_loss": 0.178142711520195 + }, + { + "entropy": 9.133459091186523, + "epoch": 0.3335969942653747, + "mean_token_accuracy": 0.7361878156661987, + "num_tokens": 17562021.0, + "step": 3374, + "train/ce_loss": 2.002731434913585e-06 + }, + { + "epoch": 0.3335969942653747, + "step": 3374, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3335969942653747, + "step": 3374, + "train/total_loss": 0.04687520116567612 + }, + { + "entropy": 8.745182037353516, + "epoch": 0.33369586711489024, + "mean_token_accuracy": 0.7504743933677673, + "num_tokens": 17567587.0, + "step": 3375, + "train/ce_loss": 1.0268542766571045 + }, + { + "epoch": 0.33369586711489024, + "step": 3375, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.33369586711489024, + "step": 3375, + "train/total_loss": 0.16909167170524597 + }, + { + "entropy": 8.663030624389648, + "epoch": 0.3337947399644058, + "mean_token_accuracy": 0.7615545988082886, + "num_tokens": 17572966.0, + "step": 3376, + "train/ce_loss": 0.5454652905464172 + }, + { + "epoch": 0.3337947399644058, + "step": 3376, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.3337947399644058, + "step": 3376, + "train/total_loss": 0.1209527850151062 + }, + { + "entropy": 9.087337493896484, + "epoch": 0.33389361281392127, + "mean_token_accuracy": 0.7656427621841431, + "num_tokens": 17578301.0, + "step": 3377, + "train/ce_loss": 0.6339848637580872 + }, + { + "epoch": 0.33389361281392127, + "step": 3377, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.33389361281392127, + "step": 3377, + "train/total_loss": 0.13371098041534424 + }, + { + "entropy": 9.56787395477295, + "epoch": 0.3339924856634368, + "mean_token_accuracy": 0.7808219194412231, + "num_tokens": 17583251.0, + "step": 3378, + "train/ce_loss": 1.6264250461972551e-06 + }, + { + "epoch": 0.3339924856634368, + "step": 3378, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3339924856634368, + "step": 3378, + "train/total_loss": 0.04687516391277313 + }, + { + "entropy": 9.46949577331543, + "epoch": 0.33409135851295235, + "mean_token_accuracy": 0.7528571486473083, + "num_tokens": 17588362.0, + "step": 3379, + "train/ce_loss": 1.3034005165100098 + }, + { + "epoch": 0.33409135851295235, + "step": 3379, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.33409135851295235, + "step": 3379, + "train/total_loss": 0.18893380463123322 + }, + { + "epoch": 0.3341902313624679, + "grad_norm": 0.7638806104660034, + "learning_rate": 9.167037531523513e-06, + "loss": 0.1375, + "step": 3380 + }, + { + "entropy": 9.141291618347168, + "epoch": 0.3341902313624679, + "mean_token_accuracy": 0.740440309047699, + "num_tokens": 17593694.0, + "step": 3380, + "train/ce_loss": 0.5845214128494263 + }, + { + "epoch": 0.3341902313624679, + "step": 3380, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.3341902313624679, + "step": 3380, + "train/total_loss": 0.08970214426517487 + }, + { + "entropy": 9.934433937072754, + "epoch": 0.3342891042119834, + "mean_token_accuracy": 0.791208803653717, + "num_tokens": 17598577.0, + "step": 3381, + "train/ce_loss": 2.761363248282578e-06 + }, + { + "epoch": 0.3342891042119834, + "step": 3381, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3342891042119834, + "step": 3381, + "train/total_loss": 0.046875275671482086 + }, + { + "entropy": 9.295759201049805, + "epoch": 0.3343879770614989, + "mean_token_accuracy": 0.7364864945411682, + "num_tokens": 17603765.0, + "step": 3382, + "train/ce_loss": 0.5177283883094788 + }, + { + "epoch": 0.3343879770614989, + "step": 3382, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3343879770614989, + "step": 3382, + "train/total_loss": 0.098647840321064 + }, + { + "entropy": 9.589226722717285, + "epoch": 0.33448684991101446, + "mean_token_accuracy": 0.7853492498397827, + "num_tokens": 17608778.0, + "step": 3383, + "train/ce_loss": 0.7475631237030029 + }, + { + "epoch": 0.33448684991101446, + "step": 3383, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.33448684991101446, + "step": 3383, + "train/total_loss": 0.14116257429122925 + }, + { + "entropy": 9.307208061218262, + "epoch": 0.33458572276052995, + "mean_token_accuracy": 0.7857142686843872, + "num_tokens": 17613969.0, + "step": 3384, + "train/ce_loss": 1.0145411491394043 + }, + { + "epoch": 0.33458572276052995, + "step": 3384, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.33458572276052995, + "step": 3384, + "train/total_loss": 0.14832910895347595 + }, + { + "entropy": 8.7385835647583, + "epoch": 0.3346845956100455, + "mean_token_accuracy": 0.8264462947845459, + "num_tokens": 17619475.0, + "step": 3385, + "train/ce_loss": 0.6401126980781555 + }, + { + "epoch": 0.3346845956100455, + "step": 3385, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.3346845956100455, + "step": 3385, + "train/total_loss": 0.09135501831769943 + }, + { + "entropy": 9.472051620483398, + "epoch": 0.33478346845956103, + "mean_token_accuracy": 0.6609195470809937, + "num_tokens": 17624580.0, + "step": 3386, + "train/ce_loss": 1.3405520915985107 + }, + { + "epoch": 0.33478346845956103, + "step": 3386, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.33478346845956103, + "step": 3386, + "train/total_loss": 0.2317114621400833 + }, + { + "entropy": 9.845640182495117, + "epoch": 0.3348823413090765, + "mean_token_accuracy": 0.7963917255401611, + "num_tokens": 17629396.0, + "step": 3387, + "train/ce_loss": 3.4778004192048684e-06 + }, + { + "epoch": 0.3348823413090765, + "step": 3387, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3348823413090765, + "step": 3387, + "train/total_loss": 0.04687534645199776 + }, + { + "entropy": 9.08883285522461, + "epoch": 0.33498121415859206, + "mean_token_accuracy": 0.6870588064193726, + "num_tokens": 17634731.0, + "step": 3388, + "train/ce_loss": 0.7789780497550964 + }, + { + "epoch": 0.33498121415859206, + "step": 3388, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.33498121415859206, + "step": 3388, + "train/total_loss": 0.1599290668964386 + }, + { + "entropy": 9.02785873413086, + "epoch": 0.3350800870081076, + "mean_token_accuracy": 0.765625, + "num_tokens": 17640107.0, + "step": 3389, + "train/ce_loss": 0.5221772789955139 + }, + { + "epoch": 0.3350800870081076, + "step": 3389, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.3350800870081076, + "step": 3389, + "train/total_loss": 0.07565522938966751 + }, + { + "entropy": 9.07229995727539, + "epoch": 0.3351789598576231, + "mean_token_accuracy": 0.7211538553237915, + "num_tokens": 17645437.0, + "step": 3390, + "train/ce_loss": 0.6404953598976135 + }, + { + "epoch": 0.3351789598576231, + "step": 3390, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.3351789598576231, + "step": 3390, + "train/total_loss": 0.16561204195022583 + }, + { + "entropy": 9.288196563720703, + "epoch": 0.3352778327071386, + "mean_token_accuracy": 0.694779098033905, + "num_tokens": 17650690.0, + "step": 3391, + "train/ce_loss": 0.983015239238739 + }, + { + "epoch": 0.3352778327071386, + "step": 3391, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.3352778327071386, + "step": 3391, + "train/total_loss": 0.23111402988433838 + }, + { + "entropy": 10.127918243408203, + "epoch": 0.33537670555665416, + "mean_token_accuracy": 0.8166666626930237, + "num_tokens": 17655411.0, + "step": 3392, + "train/ce_loss": 3.3920389341801638e-06 + }, + { + "epoch": 0.33537670555665416, + "step": 3392, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.33537670555665416, + "step": 3392, + "train/total_loss": 0.05078158900141716 + }, + { + "entropy": 9.554422378540039, + "epoch": 0.33547557840616965, + "mean_token_accuracy": 0.7822784781455994, + "num_tokens": 17660218.0, + "step": 3393, + "train/ce_loss": 1.4625734090805054 + }, + { + "epoch": 0.33547557840616965, + "step": 3393, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.33547557840616965, + "step": 3393, + "train/total_loss": 0.22047609090805054 + }, + { + "entropy": 8.788867950439453, + "epoch": 0.3355744512556852, + "mean_token_accuracy": 0.75, + "num_tokens": 17665591.0, + "step": 3394, + "train/ce_loss": 0.4802239239215851 + }, + { + "epoch": 0.3355744512556852, + "step": 3394, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.3355744512556852, + "step": 3394, + "train/total_loss": 0.10661613941192627 + }, + { + "entropy": 8.993380546569824, + "epoch": 0.33567332410520073, + "mean_token_accuracy": 0.7278761267662048, + "num_tokens": 17670979.0, + "step": 3395, + "train/ce_loss": 0.7232365012168884 + }, + { + "epoch": 0.33567332410520073, + "step": 3395, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.33567332410520073, + "step": 3395, + "train/total_loss": 0.12310490012168884 + }, + { + "entropy": 9.209710121154785, + "epoch": 0.3357721969547162, + "mean_token_accuracy": 0.7336561679840088, + "num_tokens": 17676220.0, + "step": 3396, + "train/ce_loss": 1.664810299873352 + }, + { + "epoch": 0.3357721969547162, + "step": 3396, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.3357721969547162, + "step": 3396, + "train/total_loss": 0.25241851806640625 + }, + { + "entropy": 9.402828216552734, + "epoch": 0.33587106980423176, + "mean_token_accuracy": 0.7558139562606812, + "num_tokens": 17681440.0, + "step": 3397, + "train/ce_loss": 1.0300356149673462 + }, + { + "epoch": 0.33587106980423176, + "step": 3397, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.33587106980423176, + "step": 3397, + "train/total_loss": 0.21628481149673462 + }, + { + "entropy": 9.31285572052002, + "epoch": 0.3359699426537473, + "mean_token_accuracy": 0.8018741607666016, + "num_tokens": 17686694.0, + "step": 3398, + "train/ce_loss": 0.5148482918739319 + }, + { + "epoch": 0.3359699426537473, + "step": 3398, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.3359699426537473, + "step": 3398, + "train/total_loss": 0.08273483067750931 + }, + { + "entropy": 8.956930160522461, + "epoch": 0.3360688155032628, + "mean_token_accuracy": 0.7489224076271057, + "num_tokens": 17692096.0, + "step": 3399, + "train/ce_loss": 0.6533750295639038 + }, + { + "epoch": 0.3360688155032628, + "step": 3399, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.3360688155032628, + "step": 3399, + "train/total_loss": 0.13955625891685486 + }, + { + "epoch": 0.3361676883527783, + "grad_norm": 0.7196058630943298, + "learning_rate": 9.162092666765566e-06, + "loss": 0.1369, + "step": 3400 + }, + { + "entropy": 9.541491508483887, + "epoch": 0.3361676883527783, + "mean_token_accuracy": 0.7649842500686646, + "num_tokens": 17697189.0, + "step": 3400, + "train/ce_loss": 1.5455391348950798e-06 + }, + { + "epoch": 0.3361676883527783, + "step": 3400, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.3361676883527783, + "step": 3400, + "train/total_loss": 0.05078140273690224 + }, + { + "entropy": 9.987796783447266, + "epoch": 0.33626656120229387, + "mean_token_accuracy": 0.7028688788414001, + "num_tokens": 17702107.0, + "step": 3401, + "train/ce_loss": 2.4917209148406982 + }, + { + "epoch": 0.33626656120229387, + "step": 3401, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.33626656120229387, + "step": 3401, + "train/total_loss": 0.3351095914840698 + }, + { + "entropy": 9.113516807556152, + "epoch": 0.33636543405180935, + "mean_token_accuracy": 0.7106273770332336, + "num_tokens": 17707393.0, + "step": 3402, + "train/ce_loss": 1.3251246213912964 + }, + { + "epoch": 0.33636543405180935, + "step": 3402, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.33636543405180935, + "step": 3402, + "train/total_loss": 0.19891871511936188 + }, + { + "entropy": 9.230640411376953, + "epoch": 0.3364643069013249, + "mean_token_accuracy": 0.7346405386924744, + "num_tokens": 17712659.0, + "step": 3403, + "train/ce_loss": 0.8238956332206726 + }, + { + "epoch": 0.3364643069013249, + "step": 3403, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.3364643069013249, + "step": 3403, + "train/total_loss": 0.15270206332206726 + }, + { + "entropy": 8.905786514282227, + "epoch": 0.33656317975084044, + "mean_token_accuracy": 0.7210065722465515, + "num_tokens": 17718082.0, + "step": 3404, + "train/ce_loss": 0.9621824622154236 + }, + { + "epoch": 0.33656317975084044, + "step": 3404, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.33656317975084044, + "step": 3404, + "train/total_loss": 0.2134057581424713 + }, + { + "entropy": 9.548670768737793, + "epoch": 0.3366620526003559, + "mean_token_accuracy": 0.7258347868919373, + "num_tokens": 17723049.0, + "step": 3405, + "train/ce_loss": 1.2460538148880005 + }, + { + "epoch": 0.3366620526003559, + "step": 3405, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.3366620526003559, + "step": 3405, + "train/total_loss": 0.20663663744926453 + }, + { + "entropy": 9.482522010803223, + "epoch": 0.33676092544987146, + "mean_token_accuracy": 0.6987951993942261, + "num_tokens": 17728115.0, + "step": 3406, + "train/ce_loss": 1.601701259613037 + }, + { + "epoch": 0.33676092544987146, + "step": 3406, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.33676092544987146, + "step": 3406, + "train/total_loss": 0.21485762298107147 + }, + { + "entropy": 8.978238105773926, + "epoch": 0.336859798299387, + "mean_token_accuracy": 0.7990430593490601, + "num_tokens": 17733398.0, + "step": 3407, + "train/ce_loss": 0.6218247413635254 + }, + { + "epoch": 0.336859798299387, + "step": 3407, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.336859798299387, + "step": 3407, + "train/total_loss": 0.1520262211561203 + }, + { + "entropy": 9.083845138549805, + "epoch": 0.3369586711489025, + "mean_token_accuracy": 0.7783018946647644, + "num_tokens": 17738756.0, + "step": 3408, + "train/ce_loss": 0.5001293420791626 + }, + { + "epoch": 0.3369586711489025, + "step": 3408, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.3369586711489025, + "step": 3408, + "train/total_loss": 0.10860668122768402 + }, + { + "entropy": 9.258260726928711, + "epoch": 0.33705754399841803, + "mean_token_accuracy": 0.7490909099578857, + "num_tokens": 17743993.0, + "step": 3409, + "train/ce_loss": 1.1198776960372925 + }, + { + "epoch": 0.33705754399841803, + "step": 3409, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.33705754399841803, + "step": 3409, + "train/total_loss": 0.18620651960372925 + }, + { + "entropy": 8.85833740234375, + "epoch": 0.33715641684793357, + "mean_token_accuracy": 0.6942528486251831, + "num_tokens": 17749358.0, + "step": 3410, + "train/ce_loss": 0.5631909370422363 + }, + { + "epoch": 0.33715641684793357, + "step": 3410, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.33715641684793357, + "step": 3410, + "train/total_loss": 0.15006908774375916 + }, + { + "entropy": 9.585782051086426, + "epoch": 0.33725528969744906, + "mean_token_accuracy": 0.7329545617103577, + "num_tokens": 17754258.0, + "step": 3411, + "train/ce_loss": 1.4850685596466064 + }, + { + "epoch": 0.33725528969744906, + "step": 3411, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.33725528969744906, + "step": 3411, + "train/total_loss": 0.28522562980651855 + }, + { + "entropy": 9.434508323669434, + "epoch": 0.3373541625469646, + "mean_token_accuracy": 0.7887538075447083, + "num_tokens": 17759407.0, + "step": 3412, + "train/ce_loss": 0.9562662839889526 + }, + { + "epoch": 0.3373541625469646, + "step": 3412, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.3373541625469646, + "step": 3412, + "train/total_loss": 0.1737516224384308 + }, + { + "entropy": 8.994601249694824, + "epoch": 0.33745303539648014, + "mean_token_accuracy": 0.7205720543861389, + "num_tokens": 17764786.0, + "step": 3413, + "train/ce_loss": 0.573242723941803 + }, + { + "epoch": 0.33745303539648014, + "step": 3413, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.33745303539648014, + "step": 3413, + "train/total_loss": 0.08076177537441254 + }, + { + "entropy": 9.084918975830078, + "epoch": 0.3375519082459956, + "mean_token_accuracy": 0.7689486742019653, + "num_tokens": 17770056.0, + "step": 3414, + "train/ce_loss": 0.9779831171035767 + }, + { + "epoch": 0.3375519082459956, + "step": 3414, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.3375519082459956, + "step": 3414, + "train/total_loss": 0.18764206767082214 + }, + { + "entropy": 8.784640312194824, + "epoch": 0.33765078109551117, + "mean_token_accuracy": 0.7269193530082703, + "num_tokens": 17775607.0, + "step": 3415, + "train/ce_loss": 1.16817307472229 + }, + { + "epoch": 0.33765078109551117, + "step": 3415, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.33765078109551117, + "step": 3415, + "train/total_loss": 0.19103606045246124 + }, + { + "entropy": 9.306246757507324, + "epoch": 0.3377496539450267, + "mean_token_accuracy": 0.728923499584198, + "num_tokens": 17780854.0, + "step": 3416, + "train/ce_loss": 1.4378396272659302 + }, + { + "epoch": 0.3377496539450267, + "step": 3416, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.3377496539450267, + "step": 3416, + "train/total_loss": 0.24534647166728973 + }, + { + "entropy": 9.247640609741211, + "epoch": 0.3378485267945422, + "mean_token_accuracy": 0.7110266089439392, + "num_tokens": 17786050.0, + "step": 3417, + "train/ce_loss": 0.8138503432273865 + }, + { + "epoch": 0.3378485267945422, + "step": 3417, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.3378485267945422, + "step": 3417, + "train/total_loss": 0.108728788793087 + }, + { + "entropy": 9.1106595993042, + "epoch": 0.33794739964405773, + "mean_token_accuracy": 0.7891492247581482, + "num_tokens": 17791337.0, + "step": 3418, + "train/ce_loss": 0.39447054266929626 + }, + { + "epoch": 0.33794739964405773, + "step": 3418, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.33794739964405773, + "step": 3418, + "train/total_loss": 0.06679080426692963 + }, + { + "entropy": 9.286377906799316, + "epoch": 0.3380462724935733, + "mean_token_accuracy": 0.75660640001297, + "num_tokens": 17796494.0, + "step": 3419, + "train/ce_loss": 0.7106546759605408 + }, + { + "epoch": 0.3380462724935733, + "step": 3419, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.3380462724935733, + "step": 3419, + "train/total_loss": 0.12184672057628632 + }, + { + "epoch": 0.3381451453430888, + "grad_norm": 0.9079579710960388, + "learning_rate": 9.157147802007616e-06, + "loss": 0.152, + "step": 3420 + }, + { + "entropy": 9.076547622680664, + "epoch": 0.3381451453430888, + "mean_token_accuracy": 0.7792068719863892, + "num_tokens": 17801885.0, + "step": 3420, + "train/ce_loss": 0.3072172999382019 + }, + { + "epoch": 0.3381451453430888, + "step": 3420, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.3381451453430888, + "step": 3420, + "train/total_loss": 0.09322173148393631 + }, + { + "entropy": 8.976940155029297, + "epoch": 0.3382440181926043, + "mean_token_accuracy": 0.7343251705169678, + "num_tokens": 17807318.0, + "step": 3421, + "train/ce_loss": 0.8840020895004272 + }, + { + "epoch": 0.3382440181926043, + "step": 3421, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.3382440181926043, + "step": 3421, + "train/total_loss": 0.1665252149105072 + }, + { + "entropy": 9.07284164428711, + "epoch": 0.33834289104211984, + "mean_token_accuracy": 0.7465277910232544, + "num_tokens": 17812630.0, + "step": 3422, + "train/ce_loss": 1.0425264835357666 + }, + { + "epoch": 0.33834289104211984, + "step": 3422, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.33834289104211984, + "step": 3422, + "train/total_loss": 0.1511276513338089 + }, + { + "entropy": 9.675848007202148, + "epoch": 0.3384417638916354, + "mean_token_accuracy": 0.7110389471054077, + "num_tokens": 17817679.0, + "step": 3423, + "train/ce_loss": 1.2692245244979858 + }, + { + "epoch": 0.3384417638916354, + "step": 3423, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.3384417638916354, + "step": 3423, + "train/total_loss": 0.19723495841026306 + }, + { + "entropy": 9.075258255004883, + "epoch": 0.33854063674115087, + "mean_token_accuracy": 0.7951807379722595, + "num_tokens": 17822976.0, + "step": 3424, + "train/ce_loss": 0.8383387327194214 + }, + { + "epoch": 0.33854063674115087, + "step": 3424, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.33854063674115087, + "step": 3424, + "train/total_loss": 0.18539637327194214 + }, + { + "entropy": 9.544485092163086, + "epoch": 0.3386395095906664, + "mean_token_accuracy": 0.7225913405418396, + "num_tokens": 17828044.0, + "step": 3425, + "train/ce_loss": 1.075201153755188 + }, + { + "epoch": 0.3386395095906664, + "step": 3425, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.3386395095906664, + "step": 3425, + "train/total_loss": 0.19736386835575104 + }, + { + "entropy": 9.646242141723633, + "epoch": 0.33873838244018195, + "mean_token_accuracy": 0.7439862489700317, + "num_tokens": 17833065.0, + "step": 3426, + "train/ce_loss": 0.8191995024681091 + }, + { + "epoch": 0.33873838244018195, + "step": 3426, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.33873838244018195, + "step": 3426, + "train/total_loss": 0.13660745322704315 + }, + { + "entropy": 9.409704208374023, + "epoch": 0.33883725528969744, + "mean_token_accuracy": 0.7482219338417053, + "num_tokens": 17838195.0, + "step": 3427, + "train/ce_loss": 0.432102769613266 + }, + { + "epoch": 0.33883725528969744, + "step": 3427, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.33883725528969744, + "step": 3427, + "train/total_loss": 0.07836653292179108 + }, + { + "entropy": 9.23155403137207, + "epoch": 0.338936128139213, + "mean_token_accuracy": 0.7944663763046265, + "num_tokens": 17843405.0, + "step": 3428, + "train/ce_loss": 0.49656328558921814 + }, + { + "epoch": 0.338936128139213, + "step": 3428, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.338936128139213, + "step": 3428, + "train/total_loss": 0.12387508153915405 + }, + { + "entropy": 9.279233932495117, + "epoch": 0.3390350009887285, + "mean_token_accuracy": 0.7304469347000122, + "num_tokens": 17848578.0, + "step": 3429, + "train/ce_loss": 0.7106434106826782 + }, + { + "epoch": 0.3390350009887285, + "step": 3429, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.3390350009887285, + "step": 3429, + "train/total_loss": 0.11012684553861618 + }, + { + "entropy": 9.007104873657227, + "epoch": 0.339133873838244, + "mean_token_accuracy": 0.7529411911964417, + "num_tokens": 17853896.0, + "step": 3430, + "train/ce_loss": 0.4403240382671356 + }, + { + "epoch": 0.339133873838244, + "step": 3430, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.339133873838244, + "step": 3430, + "train/total_loss": 0.07137615978717804 + }, + { + "entropy": 9.360210418701172, + "epoch": 0.33923274668775955, + "mean_token_accuracy": 0.6985173225402832, + "num_tokens": 17858938.0, + "step": 3431, + "train/ce_loss": 5.252084520179778e-06 + }, + { + "epoch": 0.33923274668775955, + "step": 3431, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.33923274668775955, + "step": 3431, + "train/total_loss": 0.04687552526593208 + }, + { + "entropy": 9.197908401489258, + "epoch": 0.3393316195372751, + "mean_token_accuracy": 0.7270194888114929, + "num_tokens": 17864127.0, + "step": 3432, + "train/ce_loss": 0.9348120093345642 + }, + { + "epoch": 0.3393316195372751, + "step": 3432, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.3393316195372751, + "step": 3432, + "train/total_loss": 0.17160621285438538 + }, + { + "entropy": 9.071158409118652, + "epoch": 0.33943049238679057, + "mean_token_accuracy": 0.6770708560943604, + "num_tokens": 17869420.0, + "step": 3433, + "train/ce_loss": 1.0051276683807373 + }, + { + "epoch": 0.33943049238679057, + "step": 3433, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.33943049238679057, + "step": 3433, + "train/total_loss": 0.1591065227985382 + }, + { + "entropy": 9.667444229125977, + "epoch": 0.3395293652363061, + "mean_token_accuracy": 0.673758864402771, + "num_tokens": 17874441.0, + "step": 3434, + "train/ce_loss": 1.7448782920837402 + }, + { + "epoch": 0.3395293652363061, + "step": 3434, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.3395293652363061, + "step": 3434, + "train/total_loss": 0.299487829208374 + }, + { + "entropy": 9.665374755859375, + "epoch": 0.33962823808582165, + "mean_token_accuracy": 0.7579832077026367, + "num_tokens": 17879426.0, + "step": 3435, + "train/ce_loss": 4.009837084595347e-06 + }, + { + "epoch": 0.33962823808582165, + "step": 3435, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.33962823808582165, + "step": 3435, + "train/total_loss": 0.054687902331352234 + }, + { + "entropy": 9.500116348266602, + "epoch": 0.33972711093533714, + "mean_token_accuracy": 0.8029412031173706, + "num_tokens": 17884537.0, + "step": 3436, + "train/ce_loss": 1.0814032554626465 + }, + { + "epoch": 0.33972711093533714, + "step": 3436, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.33972711093533714, + "step": 3436, + "train/total_loss": 0.19407781958580017 + }, + { + "entropy": 9.49143123626709, + "epoch": 0.3398259837848527, + "mean_token_accuracy": 0.7176128029823303, + "num_tokens": 17889653.0, + "step": 3437, + "train/ce_loss": 1.7539986371994019 + }, + { + "epoch": 0.3398259837848527, + "step": 3437, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.3398259837848527, + "step": 3437, + "train/total_loss": 0.26914986968040466 + }, + { + "entropy": 9.031991958618164, + "epoch": 0.3399248566343682, + "mean_token_accuracy": 0.746582567691803, + "num_tokens": 17895113.0, + "step": 3438, + "train/ce_loss": 0.5026720762252808 + }, + { + "epoch": 0.3399248566343682, + "step": 3438, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3399248566343682, + "step": 3438, + "train/total_loss": 0.09714220464229584 + }, + { + "entropy": 9.607033729553223, + "epoch": 0.3400237294838837, + "mean_token_accuracy": 0.7119740843772888, + "num_tokens": 17900170.0, + "step": 3439, + "train/ce_loss": 1.0218220949172974 + }, + { + "epoch": 0.3400237294838837, + "step": 3439, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.3400237294838837, + "step": 3439, + "train/total_loss": 0.13343220949172974 + }, + { + "epoch": 0.34012260233339925, + "grad_norm": 0.9295529127120972, + "learning_rate": 9.152202937249667e-06, + "loss": 0.1506, + "step": 3440 + }, + { + "entropy": 9.612879753112793, + "epoch": 0.34012260233339925, + "mean_token_accuracy": 0.7244094610214233, + "num_tokens": 17905243.0, + "step": 3440, + "train/ce_loss": 1.4253815412521362 + }, + { + "epoch": 0.34012260233339925, + "step": 3440, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.34012260233339925, + "step": 3440, + "train/total_loss": 0.2050381600856781 + }, + { + "entropy": 9.158390045166016, + "epoch": 0.3402214751829148, + "mean_token_accuracy": 0.7317365407943726, + "num_tokens": 17910674.0, + "step": 3441, + "train/ce_loss": 0.8184359669685364 + }, + { + "epoch": 0.3402214751829148, + "step": 3441, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.3402214751829148, + "step": 3441, + "train/total_loss": 0.14043734967708588 + }, + { + "entropy": 9.40418815612793, + "epoch": 0.3403203480324303, + "mean_token_accuracy": 0.6985074877738953, + "num_tokens": 17915825.0, + "step": 3442, + "train/ce_loss": 1.0082809925079346 + }, + { + "epoch": 0.3403203480324303, + "step": 3442, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.3403203480324303, + "step": 3442, + "train/total_loss": 0.1633281111717224 + }, + { + "entropy": 8.87204360961914, + "epoch": 0.3404192208819458, + "mean_token_accuracy": 0.7135576009750366, + "num_tokens": 17921358.0, + "step": 3443, + "train/ce_loss": 1.1382791996002197 + }, + { + "epoch": 0.3404192208819458, + "step": 3443, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.3404192208819458, + "step": 3443, + "train/total_loss": 0.1997654139995575 + }, + { + "entropy": 9.070274353027344, + "epoch": 0.34051809373146136, + "mean_token_accuracy": 0.7418546080589294, + "num_tokens": 17926609.0, + "step": 3444, + "train/ce_loss": 1.0991380214691162 + }, + { + "epoch": 0.34051809373146136, + "step": 3444, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.34051809373146136, + "step": 3444, + "train/total_loss": 0.21928879618644714 + }, + { + "entropy": 9.582058906555176, + "epoch": 0.34061696658097684, + "mean_token_accuracy": 0.7063903212547302, + "num_tokens": 17931675.0, + "step": 3445, + "train/ce_loss": 1.2208040971017908e-05 + }, + { + "epoch": 0.34061696658097684, + "step": 3445, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.34061696658097684, + "step": 3445, + "train/total_loss": 0.0859387218952179 + }, + { + "entropy": 9.25374984741211, + "epoch": 0.3407158394304924, + "mean_token_accuracy": 0.7633987069129944, + "num_tokens": 17936890.0, + "step": 3446, + "train/ce_loss": 0.5326510667800903 + }, + { + "epoch": 0.3407158394304924, + "step": 3446, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.3407158394304924, + "step": 3446, + "train/total_loss": 0.15092135965824127 + }, + { + "entropy": 9.824382781982422, + "epoch": 0.3408147122800079, + "mean_token_accuracy": 0.6777777671813965, + "num_tokens": 17941772.0, + "step": 3447, + "train/ce_loss": 3.712680381795508e-06 + }, + { + "epoch": 0.3408147122800079, + "step": 3447, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.3408147122800079, + "step": 3447, + "train/total_loss": 0.058594122529029846 + }, + { + "entropy": 9.640596389770508, + "epoch": 0.3409135851295234, + "mean_token_accuracy": 0.7375415563583374, + "num_tokens": 17946862.0, + "step": 3448, + "train/ce_loss": 0.8982908725738525 + }, + { + "epoch": 0.3409135851295234, + "step": 3448, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.3409135851295234, + "step": 3448, + "train/total_loss": 0.14061033725738525 + }, + { + "entropy": 8.825933456420898, + "epoch": 0.34101245797903895, + "mean_token_accuracy": 0.748314619064331, + "num_tokens": 17952236.0, + "step": 3449, + "train/ce_loss": 0.6523842811584473 + }, + { + "epoch": 0.34101245797903895, + "step": 3449, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.34101245797903895, + "step": 3449, + "train/total_loss": 0.13555093109607697 + }, + { + "entropy": 9.274543762207031, + "epoch": 0.3411113308285545, + "mean_token_accuracy": 0.7023086547851562, + "num_tokens": 17957479.0, + "step": 3450, + "train/ce_loss": 1.0586771965026855 + }, + { + "epoch": 0.3411113308285545, + "step": 3450, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.3411113308285545, + "step": 3450, + "train/total_loss": 0.21133646368980408 + }, + { + "entropy": 8.989279747009277, + "epoch": 0.34121020367807, + "mean_token_accuracy": 0.7483370304107666, + "num_tokens": 17962874.0, + "step": 3451, + "train/ce_loss": 0.4404990077018738 + }, + { + "epoch": 0.34121020367807, + "step": 3451, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.34121020367807, + "step": 3451, + "train/total_loss": 0.07139365375041962 + }, + { + "entropy": 9.191324234008789, + "epoch": 0.3413090765275855, + "mean_token_accuracy": 0.6980440020561218, + "num_tokens": 17968098.0, + "step": 3452, + "train/ce_loss": 1.2236419916152954 + }, + { + "epoch": 0.3413090765275855, + "step": 3452, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.3413090765275855, + "step": 3452, + "train/total_loss": 0.19267669320106506 + }, + { + "entropy": 9.291399002075195, + "epoch": 0.34140794937710106, + "mean_token_accuracy": 0.6732919216156006, + "num_tokens": 17973394.0, + "step": 3453, + "train/ce_loss": 1.0472372196090873e-06 + }, + { + "epoch": 0.34140794937710106, + "step": 3453, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.34140794937710106, + "step": 3453, + "train/total_loss": 0.023437604308128357 + }, + { + "entropy": 9.780986785888672, + "epoch": 0.34150682222661655, + "mean_token_accuracy": 0.8097165822982788, + "num_tokens": 17978311.0, + "step": 3454, + "train/ce_loss": 1.6872035264968872 + }, + { + "epoch": 0.34150682222661655, + "step": 3454, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.34150682222661655, + "step": 3454, + "train/total_loss": 0.2820016145706177 + }, + { + "entropy": 8.907554626464844, + "epoch": 0.3416056950761321, + "mean_token_accuracy": 0.7538779973983765, + "num_tokens": 17983789.0, + "step": 3455, + "train/ce_loss": 0.6267127990722656 + }, + { + "epoch": 0.3416056950761321, + "step": 3455, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.3416056950761321, + "step": 3455, + "train/total_loss": 0.08220253139734268 + }, + { + "entropy": 9.012369155883789, + "epoch": 0.34170456792564763, + "mean_token_accuracy": 0.7421307563781738, + "num_tokens": 17989126.0, + "step": 3456, + "train/ce_loss": 0.5928884744644165 + }, + { + "epoch": 0.34170456792564763, + "step": 3456, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.34170456792564763, + "step": 3456, + "train/total_loss": 0.09835134446620941 + }, + { + "entropy": 9.332815170288086, + "epoch": 0.3418034407751631, + "mean_token_accuracy": 0.7809523940086365, + "num_tokens": 17994327.0, + "step": 3457, + "train/ce_loss": 0.9819399118423462 + }, + { + "epoch": 0.3418034407751631, + "step": 3457, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.3418034407751631, + "step": 3457, + "train/total_loss": 0.12944400310516357 + }, + { + "entropy": 9.30552864074707, + "epoch": 0.34190231362467866, + "mean_token_accuracy": 0.7752613425254822, + "num_tokens": 17999369.0, + "step": 3458, + "train/ce_loss": 1.0199953317642212 + }, + { + "epoch": 0.34190231362467866, + "step": 3458, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.34190231362467866, + "step": 3458, + "train/total_loss": 0.13715578615665436 + }, + { + "entropy": 8.830644607543945, + "epoch": 0.3420011864741942, + "mean_token_accuracy": 0.7116104960441589, + "num_tokens": 18004662.0, + "step": 3459, + "train/ce_loss": 1.4183692932128906 + }, + { + "epoch": 0.3420011864741942, + "step": 3459, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.3420011864741942, + "step": 3459, + "train/total_loss": 0.290274441242218 + }, + { + "epoch": 0.3421000593237097, + "grad_norm": 0.8935452103614807, + "learning_rate": 9.147258072491719e-06, + "loss": 0.1516, + "step": 3460 + }, + { + "entropy": 9.550859451293945, + "epoch": 0.3421000593237097, + "mean_token_accuracy": 0.6639871597290039, + "num_tokens": 18009731.0, + "step": 3460, + "train/ce_loss": 1.6412410736083984 + }, + { + "epoch": 0.3421000593237097, + "step": 3460, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.3421000593237097, + "step": 3460, + "train/total_loss": 0.20709286630153656 + }, + { + "entropy": 9.644804000854492, + "epoch": 0.3421989321732252, + "mean_token_accuracy": 0.8066298365592957, + "num_tokens": 18014723.0, + "step": 3461, + "train/ce_loss": 0.7327430844306946 + }, + { + "epoch": 0.3421989321732252, + "step": 3461, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.3421989321732252, + "step": 3461, + "train/total_loss": 0.12405555695295334 + }, + { + "entropy": 9.740586280822754, + "epoch": 0.34229780502274076, + "mean_token_accuracy": 0.7336152195930481, + "num_tokens": 18019589.0, + "step": 3462, + "train/ce_loss": 1.472940444946289 + }, + { + "epoch": 0.34229780502274076, + "step": 3462, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.34229780502274076, + "step": 3462, + "train/total_loss": 0.2019815444946289 + }, + { + "entropy": 8.780726432800293, + "epoch": 0.3423966778722563, + "mean_token_accuracy": 0.6923901438713074, + "num_tokens": 18025027.0, + "step": 3463, + "train/ce_loss": 1.0836405754089355 + }, + { + "epoch": 0.3423966778722563, + "step": 3463, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.3423966778722563, + "step": 3463, + "train/total_loss": 0.1708640605211258 + }, + { + "entropy": 9.670867919921875, + "epoch": 0.3424955507217718, + "mean_token_accuracy": 0.7439446449279785, + "num_tokens": 18030076.0, + "step": 3464, + "train/ce_loss": 0.7355344891548157 + }, + { + "epoch": 0.3424955507217718, + "step": 3464, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.3424955507217718, + "step": 3464, + "train/total_loss": 0.11652220040559769 + }, + { + "entropy": 9.505711555480957, + "epoch": 0.34259442357128733, + "mean_token_accuracy": 0.727142870426178, + "num_tokens": 18035219.0, + "step": 3465, + "train/ce_loss": 0.724556565284729 + }, + { + "epoch": 0.34259442357128733, + "step": 3465, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.34259442357128733, + "step": 3465, + "train/total_loss": 0.14276815950870514 + }, + { + "entropy": 9.53701114654541, + "epoch": 0.3426932964208029, + "mean_token_accuracy": 0.7606298923492432, + "num_tokens": 18040267.0, + "step": 3466, + "train/ce_loss": 0.8044400215148926 + }, + { + "epoch": 0.3426932964208029, + "step": 3466, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3426932964208029, + "step": 3466, + "train/total_loss": 0.12731900811195374 + }, + { + "entropy": 9.149065971374512, + "epoch": 0.34279216927031836, + "mean_token_accuracy": 0.7601390480995178, + "num_tokens": 18045617.0, + "step": 3467, + "train/ce_loss": 7.459978405677248e-06 + }, + { + "epoch": 0.34279216927031836, + "step": 3467, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.34279216927031836, + "step": 3467, + "train/total_loss": 0.05468824505805969 + }, + { + "entropy": 10.015134811401367, + "epoch": 0.3428910421198339, + "mean_token_accuracy": 0.699999988079071, + "num_tokens": 18050470.0, + "step": 3468, + "train/ce_loss": 3.0289363861083984 + }, + { + "epoch": 0.3428910421198339, + "step": 3468, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.3428910421198339, + "step": 3468, + "train/total_loss": 0.39664363861083984 + }, + { + "entropy": 9.345232963562012, + "epoch": 0.34298991496934944, + "mean_token_accuracy": 0.726047933101654, + "num_tokens": 18055605.0, + "step": 3469, + "train/ce_loss": 0.5725339651107788 + }, + { + "epoch": 0.34298991496934944, + "step": 3469, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.34298991496934944, + "step": 3469, + "train/total_loss": 0.092409648001194 + }, + { + "entropy": 9.524428367614746, + "epoch": 0.3430887878188649, + "mean_token_accuracy": 0.7312703728675842, + "num_tokens": 18060629.0, + "step": 3470, + "train/ce_loss": 0.5799019932746887 + }, + { + "epoch": 0.3430887878188649, + "step": 3470, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3430887878188649, + "step": 3470, + "train/total_loss": 0.10486520081758499 + }, + { + "entropy": 9.27010726928711, + "epoch": 0.34318766066838047, + "mean_token_accuracy": 0.7620751261711121, + "num_tokens": 18065629.0, + "step": 3471, + "train/ce_loss": 0.9724286198616028 + }, + { + "epoch": 0.34318766066838047, + "step": 3471, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.34318766066838047, + "step": 3471, + "train/total_loss": 0.14411786198616028 + }, + { + "entropy": 8.874292373657227, + "epoch": 0.343286533517896, + "mean_token_accuracy": 0.7732426524162292, + "num_tokens": 18070997.0, + "step": 3472, + "train/ce_loss": 0.6216502785682678 + }, + { + "epoch": 0.343286533517896, + "step": 3472, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.343286533517896, + "step": 3472, + "train/total_loss": 0.0895087793469429 + }, + { + "entropy": 8.968184471130371, + "epoch": 0.3433854063674115, + "mean_token_accuracy": 0.6972677707672119, + "num_tokens": 18076395.0, + "step": 3473, + "train/ce_loss": 0.8104959726333618 + }, + { + "epoch": 0.3433854063674115, + "step": 3473, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.3433854063674115, + "step": 3473, + "train/total_loss": 0.1747995913028717 + }, + { + "entropy": 9.562467575073242, + "epoch": 0.34348427921692704, + "mean_token_accuracy": 0.7288428544998169, + "num_tokens": 18081397.0, + "step": 3474, + "train/ce_loss": 8.978898222267162e-06 + }, + { + "epoch": 0.34348427921692704, + "step": 3474, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.34348427921692704, + "step": 3474, + "train/total_loss": 0.02734464779496193 + }, + { + "entropy": 9.386184692382812, + "epoch": 0.3435831520664426, + "mean_token_accuracy": 0.7627840638160706, + "num_tokens": 18086573.0, + "step": 3475, + "train/ce_loss": 2.367150500504067e-06 + }, + { + "epoch": 0.3435831520664426, + "step": 3475, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.3435831520664426, + "step": 3475, + "train/total_loss": 0.0390627384185791 + }, + { + "entropy": 9.22478199005127, + "epoch": 0.34368202491595806, + "mean_token_accuracy": 0.7738814949989319, + "num_tokens": 18091858.0, + "step": 3476, + "train/ce_loss": 0.6053165793418884 + }, + { + "epoch": 0.34368202491595806, + "step": 3476, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.34368202491595806, + "step": 3476, + "train/total_loss": 0.09959416091442108 + }, + { + "entropy": 8.968910217285156, + "epoch": 0.3437808977654736, + "mean_token_accuracy": 0.7267637252807617, + "num_tokens": 18097212.0, + "step": 3477, + "train/ce_loss": 1.425128698348999 + }, + { + "epoch": 0.3437808977654736, + "step": 3477, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.3437808977654736, + "step": 3477, + "train/total_loss": 0.25579410791397095 + }, + { + "entropy": 9.667119026184082, + "epoch": 0.34387977061498914, + "mean_token_accuracy": 0.7336769700050354, + "num_tokens": 18102192.0, + "step": 3478, + "train/ce_loss": 0.7826544642448425 + }, + { + "epoch": 0.34387977061498914, + "step": 3478, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.34387977061498914, + "step": 3478, + "train/total_loss": 0.1368592083454132 + }, + { + "entropy": 9.635202407836914, + "epoch": 0.34397864346450463, + "mean_token_accuracy": 0.6964285969734192, + "num_tokens": 18107051.0, + "step": 3479, + "train/ce_loss": 4.103856554138474e-06 + }, + { + "epoch": 0.34397864346450463, + "step": 3479, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.34397864346450463, + "step": 3479, + "train/total_loss": 0.04296915978193283 + }, + { + "epoch": 0.34407751631402017, + "grad_norm": 1.0079654455184937, + "learning_rate": 9.14231320773377e-06, + "loss": 0.141, + "step": 3480 + }, + { + "entropy": 8.853361129760742, + "epoch": 0.34407751631402017, + "mean_token_accuracy": 0.7193158864974976, + "num_tokens": 18112485.0, + "step": 3480, + "train/ce_loss": 0.8331007957458496 + }, + { + "epoch": 0.34407751631402017, + "step": 3480, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.34407751631402017, + "step": 3480, + "train/total_loss": 0.1575288325548172 + }, + { + "entropy": 9.428400039672852, + "epoch": 0.3441763891635357, + "mean_token_accuracy": 0.6616848111152649, + "num_tokens": 18117660.0, + "step": 3481, + "train/ce_loss": 2.4557323455810547 + }, + { + "epoch": 0.3441763891635357, + "step": 3481, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.3441763891635357, + "step": 3481, + "train/total_loss": 0.3510419726371765 + }, + { + "entropy": 8.761262893676758, + "epoch": 0.3442752620130512, + "mean_token_accuracy": 0.7782857418060303, + "num_tokens": 18123011.0, + "step": 3482, + "train/ce_loss": 0.6788957715034485 + }, + { + "epoch": 0.3442752620130512, + "step": 3482, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3442752620130512, + "step": 3482, + "train/total_loss": 0.12257707864046097 + }, + { + "entropy": 10.270698547363281, + "epoch": 0.34437413486256674, + "mean_token_accuracy": 0.6736111044883728, + "num_tokens": 18127668.0, + "step": 3483, + "train/ce_loss": 4.814478415937629e-06 + }, + { + "epoch": 0.34437413486256674, + "step": 3483, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.34437413486256674, + "step": 3483, + "train/total_loss": 0.0195317305624485 + }, + { + "entropy": 9.341055870056152, + "epoch": 0.3444730077120823, + "mean_token_accuracy": 0.6681286692619324, + "num_tokens": 18132822.0, + "step": 3484, + "train/ce_loss": 1.518288254737854 + }, + { + "epoch": 0.3444730077120823, + "step": 3484, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.3444730077120823, + "step": 3484, + "train/total_loss": 0.2416725754737854 + }, + { + "entropy": 9.43017578125, + "epoch": 0.34457188056159777, + "mean_token_accuracy": 0.7330508232116699, + "num_tokens": 18138000.0, + "step": 3485, + "train/ce_loss": 0.5761935114860535 + }, + { + "epoch": 0.34457188056159777, + "step": 3485, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.34457188056159777, + "step": 3485, + "train/total_loss": 0.10449434816837311 + }, + { + "entropy": 8.839698791503906, + "epoch": 0.3446707534111133, + "mean_token_accuracy": 0.7234927415847778, + "num_tokens": 18143459.0, + "step": 3486, + "train/ce_loss": 0.9702885746955872 + }, + { + "epoch": 0.3446707534111133, + "step": 3486, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.3446707534111133, + "step": 3486, + "train/total_loss": 0.13218510150909424 + }, + { + "entropy": 9.350061416625977, + "epoch": 0.34476962626062885, + "mean_token_accuracy": 0.7810107469558716, + "num_tokens": 18148562.0, + "step": 3487, + "train/ce_loss": 1.4726591871294659e-05 + }, + { + "epoch": 0.34476962626062885, + "step": 3487, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.34476962626062885, + "step": 3487, + "train/total_loss": 0.06640772521495819 + }, + { + "entropy": 9.24201774597168, + "epoch": 0.34486849911014433, + "mean_token_accuracy": 0.6737288236618042, + "num_tokens": 18153731.0, + "step": 3488, + "train/ce_loss": 1.0004758834838867 + }, + { + "epoch": 0.34486849911014433, + "step": 3488, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.34486849911014433, + "step": 3488, + "train/total_loss": 0.16254758834838867 + }, + { + "entropy": 9.01718521118164, + "epoch": 0.3449673719596599, + "mean_token_accuracy": 0.6993710398674011, + "num_tokens": 18159006.0, + "step": 3489, + "train/ce_loss": 0.4811428189277649 + }, + { + "epoch": 0.3449673719596599, + "step": 3489, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3449673719596599, + "step": 3489, + "train/total_loss": 0.10280178487300873 + }, + { + "entropy": 9.156839370727539, + "epoch": 0.3450662448091754, + "mean_token_accuracy": 0.7376623153686523, + "num_tokens": 18164259.0, + "step": 3490, + "train/ce_loss": 1.078993797302246 + }, + { + "epoch": 0.3450662448091754, + "step": 3490, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.3450662448091754, + "step": 3490, + "train/total_loss": 0.17039938271045685 + }, + { + "entropy": 9.458637237548828, + "epoch": 0.3451651176586909, + "mean_token_accuracy": 0.7687296271324158, + "num_tokens": 18169353.0, + "step": 3491, + "train/ce_loss": 1.1278468370437622 + }, + { + "epoch": 0.3451651176586909, + "step": 3491, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.3451651176586909, + "step": 3491, + "train/total_loss": 0.17528468370437622 + }, + { + "entropy": 9.081297874450684, + "epoch": 0.34526399050820644, + "mean_token_accuracy": 0.7879133224487305, + "num_tokens": 18174714.0, + "step": 3492, + "train/ce_loss": 0.8648220300674438 + }, + { + "epoch": 0.34526399050820644, + "step": 3492, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.34526399050820644, + "step": 3492, + "train/total_loss": 0.1567946970462799 + }, + { + "entropy": 8.773237228393555, + "epoch": 0.345362863357722, + "mean_token_accuracy": 0.7347908616065979, + "num_tokens": 18180212.0, + "step": 3493, + "train/ce_loss": 1.1458483934402466 + }, + { + "epoch": 0.345362863357722, + "step": 3493, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.345362863357722, + "step": 3493, + "train/total_loss": 0.14583483338356018 + }, + { + "entropy": 8.975872039794922, + "epoch": 0.34546173620723747, + "mean_token_accuracy": 0.7310252785682678, + "num_tokens": 18185412.0, + "step": 3494, + "train/ce_loss": 0.642943799495697 + }, + { + "epoch": 0.34546173620723747, + "step": 3494, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.34546173620723747, + "step": 3494, + "train/total_loss": 0.10726313292980194 + }, + { + "entropy": 8.932552337646484, + "epoch": 0.345560609056753, + "mean_token_accuracy": 0.7300613522529602, + "num_tokens": 18190755.0, + "step": 3495, + "train/ce_loss": 1.3272416591644287 + }, + { + "epoch": 0.345560609056753, + "step": 3495, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.345560609056753, + "step": 3495, + "train/total_loss": 0.18350541591644287 + }, + { + "entropy": 9.683507919311523, + "epoch": 0.34565948190626855, + "mean_token_accuracy": 0.7188678979873657, + "num_tokens": 18195738.0, + "step": 3496, + "train/ce_loss": 0.8042916655540466 + }, + { + "epoch": 0.34565948190626855, + "step": 3496, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.34565948190626855, + "step": 3496, + "train/total_loss": 0.09605416655540466 + }, + { + "entropy": 8.802057266235352, + "epoch": 0.34575835475578404, + "mean_token_accuracy": 0.7775306105613708, + "num_tokens": 18201125.0, + "step": 3497, + "train/ce_loss": 0.6206459403038025 + }, + { + "epoch": 0.34575835475578404, + "step": 3497, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.34575835475578404, + "step": 3497, + "train/total_loss": 0.08940834552049637 + }, + { + "entropy": 9.39529037475586, + "epoch": 0.3458572276052996, + "mean_token_accuracy": 0.7643835544586182, + "num_tokens": 18206320.0, + "step": 3498, + "train/ce_loss": 0.6989363431930542 + }, + { + "epoch": 0.3458572276052996, + "step": 3498, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.3458572276052996, + "step": 3498, + "train/total_loss": 0.16364362835884094 + }, + { + "entropy": 9.509500503540039, + "epoch": 0.3459561004548151, + "mean_token_accuracy": 0.777414083480835, + "num_tokens": 18211365.0, + "step": 3499, + "train/ce_loss": 1.8198801399194053e-06 + }, + { + "epoch": 0.3459561004548151, + "step": 3499, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.3459561004548151, + "step": 3499, + "train/total_loss": 0.039062682539224625 + }, + { + "epoch": 0.3460549733043306, + "grad_norm": 0.6896134614944458, + "learning_rate": 9.137368342975821e-06, + "loss": 0.1465, + "step": 3500 + }, + { + "entropy": 9.481149673461914, + "epoch": 0.3460549733043306, + "mean_token_accuracy": 0.720812201499939, + "num_tokens": 18216378.0, + "step": 3500, + "train/ce_loss": 1.0319294929504395 + }, + { + "epoch": 0.3460549733043306, + "step": 3500, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.3460549733043306, + "step": 3500, + "train/total_loss": 0.16178670525550842 + }, + { + "entropy": 9.102712631225586, + "epoch": 0.34615384615384615, + "mean_token_accuracy": 0.7146596908569336, + "num_tokens": 18221548.0, + "step": 3501, + "train/ce_loss": 1.060309886932373 + }, + { + "epoch": 0.34615384615384615, + "step": 3501, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.34615384615384615, + "step": 3501, + "train/total_loss": 0.18024975061416626 + }, + { + "entropy": 9.14307975769043, + "epoch": 0.3462527190033617, + "mean_token_accuracy": 0.7654808759689331, + "num_tokens": 18226792.0, + "step": 3502, + "train/ce_loss": 0.6257838010787964 + }, + { + "epoch": 0.3462527190033617, + "step": 3502, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.3462527190033617, + "step": 3502, + "train/total_loss": 0.10164088010787964 + }, + { + "entropy": 9.397144317626953, + "epoch": 0.3463515918528772, + "mean_token_accuracy": 0.6901615262031555, + "num_tokens": 18231912.0, + "step": 3503, + "train/ce_loss": 2.106091187670245e-06 + }, + { + "epoch": 0.3463515918528772, + "step": 3503, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.3463515918528772, + "step": 3503, + "train/total_loss": 0.08203145861625671 + }, + { + "entropy": 8.943714141845703, + "epoch": 0.3464504647023927, + "mean_token_accuracy": 0.6891566514968872, + "num_tokens": 18237203.0, + "step": 3504, + "train/ce_loss": 1.4671072959899902 + }, + { + "epoch": 0.3464504647023927, + "step": 3504, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.3464504647023927, + "step": 3504, + "train/total_loss": 0.20530448853969574 + }, + { + "entropy": 9.217236518859863, + "epoch": 0.34654933755190825, + "mean_token_accuracy": 0.7440944910049438, + "num_tokens": 18242389.0, + "step": 3505, + "train/ce_loss": 0.7367294430732727 + }, + { + "epoch": 0.34654933755190825, + "step": 3505, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.34654933755190825, + "step": 3505, + "train/total_loss": 0.09711044281721115 + }, + { + "entropy": 8.967019081115723, + "epoch": 0.3466482104014238, + "mean_token_accuracy": 0.6997929811477661, + "num_tokens": 18247769.0, + "step": 3506, + "train/ce_loss": 0.7110524773597717 + }, + { + "epoch": 0.3466482104014238, + "step": 3506, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.3466482104014238, + "step": 3506, + "train/total_loss": 0.11016774922609329 + }, + { + "entropy": 9.190967559814453, + "epoch": 0.3467470832509393, + "mean_token_accuracy": 0.6906710267066956, + "num_tokens": 18252815.0, + "step": 3507, + "train/ce_loss": 1.5314048528671265 + }, + { + "epoch": 0.3467470832509393, + "step": 3507, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.3467470832509393, + "step": 3507, + "train/total_loss": 0.27814048528671265 + }, + { + "entropy": 8.84999942779541, + "epoch": 0.3468459561004548, + "mean_token_accuracy": 0.6917900443077087, + "num_tokens": 18258039.0, + "step": 3508, + "train/ce_loss": 0.9880190491676331 + }, + { + "epoch": 0.3468459561004548, + "step": 3508, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.3468459561004548, + "step": 3508, + "train/total_loss": 0.14177066087722778 + }, + { + "entropy": 9.307038307189941, + "epoch": 0.34694482894997036, + "mean_token_accuracy": 0.6992366313934326, + "num_tokens": 18263149.0, + "step": 3509, + "train/ce_loss": 1.0231679677963257 + }, + { + "epoch": 0.34694482894997036, + "step": 3509, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.34694482894997036, + "step": 3509, + "train/total_loss": 0.16872304677963257 + }, + { + "entropy": 9.371397018432617, + "epoch": 0.34704370179948585, + "mean_token_accuracy": 0.687589168548584, + "num_tokens": 18268301.0, + "step": 3510, + "train/ce_loss": 4.765811354445759e-06 + }, + { + "epoch": 0.34704370179948585, + "step": 3510, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.34704370179948585, + "step": 3510, + "train/total_loss": 0.0429692268371582 + }, + { + "entropy": 8.887444496154785, + "epoch": 0.3471425746490014, + "mean_token_accuracy": 0.8310502171516418, + "num_tokens": 18273666.0, + "step": 3511, + "train/ce_loss": 0.693034291267395 + }, + { + "epoch": 0.3471425746490014, + "step": 3511, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.3471425746490014, + "step": 3511, + "train/total_loss": 0.15524092316627502 + }, + { + "entropy": 9.004448890686035, + "epoch": 0.34724144749851693, + "mean_token_accuracy": 0.6916201114654541, + "num_tokens": 18278990.0, + "step": 3512, + "train/ce_loss": 0.901794970035553 + }, + { + "epoch": 0.34724144749851693, + "step": 3512, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.34724144749851693, + "step": 3512, + "train/total_loss": 0.14486700296401978 + }, + { + "entropy": 8.916728019714355, + "epoch": 0.3473403203480324, + "mean_token_accuracy": 0.7067669034004211, + "num_tokens": 18284412.0, + "step": 3513, + "train/ce_loss": 0.8450969457626343 + }, + { + "epoch": 0.3473403203480324, + "step": 3513, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3473403203480324, + "step": 3513, + "train/total_loss": 0.1313847005367279 + }, + { + "entropy": 9.046451568603516, + "epoch": 0.34743919319754796, + "mean_token_accuracy": 0.7526754140853882, + "num_tokens": 18289752.0, + "step": 3514, + "train/ce_loss": 0.6713233590126038 + }, + { + "epoch": 0.34743919319754796, + "step": 3514, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.34743919319754796, + "step": 3514, + "train/total_loss": 0.12181983888149261 + }, + { + "entropy": 9.24777603149414, + "epoch": 0.3475380660470635, + "mean_token_accuracy": 0.7366771101951599, + "num_tokens": 18294871.0, + "step": 3515, + "train/ce_loss": 1.0542774200439453 + }, + { + "epoch": 0.3475380660470635, + "step": 3515, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3475380660470635, + "step": 3515, + "train/total_loss": 0.16011524200439453 + }, + { + "entropy": 9.478700637817383, + "epoch": 0.347636938896579, + "mean_token_accuracy": 0.7149532437324524, + "num_tokens": 18299954.0, + "step": 3516, + "train/ce_loss": 1.3608589172363281 + }, + { + "epoch": 0.347636938896579, + "step": 3516, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.347636938896579, + "step": 3516, + "train/total_loss": 0.1829608976840973 + }, + { + "entropy": 8.81755256652832, + "epoch": 0.3477358117460945, + "mean_token_accuracy": 0.6959064602851868, + "num_tokens": 18305277.0, + "step": 3517, + "train/ce_loss": 0.6051265001296997 + }, + { + "epoch": 0.3477358117460945, + "step": 3517, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.3477358117460945, + "step": 3517, + "train/total_loss": 0.12301264703273773 + }, + { + "entropy": 8.472495079040527, + "epoch": 0.34783468459561007, + "mean_token_accuracy": 0.7226970791816711, + "num_tokens": 18310916.0, + "step": 3518, + "train/ce_loss": 0.5649114847183228 + }, + { + "epoch": 0.34783468459561007, + "step": 3518, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.34783468459561007, + "step": 3518, + "train/total_loss": 0.09164740145206451 + }, + { + "entropy": 9.438545227050781, + "epoch": 0.34793355744512555, + "mean_token_accuracy": 0.7453504800796509, + "num_tokens": 18316089.0, + "step": 3519, + "train/ce_loss": 0.6985089182853699 + }, + { + "epoch": 0.34793355744512555, + "step": 3519, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.34793355744512555, + "step": 3519, + "train/total_loss": 0.09328839182853699 + }, + { + "epoch": 0.3480324302946411, + "grad_norm": 0.8204121589660645, + "learning_rate": 9.132423478217872e-06, + "loss": 0.1556, + "step": 3520 + }, + { + "entropy": 9.595911026000977, + "epoch": 0.3480324302946411, + "mean_token_accuracy": 0.7762237787246704, + "num_tokens": 18321068.0, + "step": 3520, + "train/ce_loss": 1.5138328990360606e-06 + }, + { + "epoch": 0.3480324302946411, + "step": 3520, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.3480324302946411, + "step": 3520, + "train/total_loss": 0.015625150874257088 + }, + { + "entropy": 9.212435722351074, + "epoch": 0.34813130314415663, + "mean_token_accuracy": 0.7172236442565918, + "num_tokens": 18326301.0, + "step": 3521, + "train/ce_loss": 0.6783554553985596 + }, + { + "epoch": 0.34813130314415663, + "step": 3521, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.34813130314415663, + "step": 3521, + "train/total_loss": 0.11471054702997208 + }, + { + "entropy": 9.33420467376709, + "epoch": 0.3482301759936721, + "mean_token_accuracy": 0.7138554453849792, + "num_tokens": 18331470.0, + "step": 3522, + "train/ce_loss": 1.7616900205612183 + }, + { + "epoch": 0.3482301759936721, + "step": 3522, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.3482301759936721, + "step": 3522, + "train/total_loss": 0.3167940080165863 + }, + { + "entropy": 9.230966567993164, + "epoch": 0.34832904884318766, + "mean_token_accuracy": 0.7760563492774963, + "num_tokens": 18336621.0, + "step": 3523, + "train/ce_loss": 0.7635095715522766 + }, + { + "epoch": 0.34832904884318766, + "step": 3523, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.34832904884318766, + "step": 3523, + "train/total_loss": 0.14666345715522766 + }, + { + "entropy": 9.205339431762695, + "epoch": 0.3484279216927032, + "mean_token_accuracy": 0.6947852969169617, + "num_tokens": 18341745.0, + "step": 3524, + "train/ce_loss": 0.9602331519126892 + }, + { + "epoch": 0.3484279216927032, + "step": 3524, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.3484279216927032, + "step": 3524, + "train/total_loss": 0.1663358211517334 + }, + { + "entropy": 8.775196075439453, + "epoch": 0.3485267945422187, + "mean_token_accuracy": 0.7293318510055542, + "num_tokens": 18347080.0, + "step": 3525, + "train/ce_loss": 0.6082144379615784 + }, + { + "epoch": 0.3485267945422187, + "step": 3525, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3485267945422187, + "step": 3525, + "train/total_loss": 0.10769644379615784 + }, + { + "entropy": 9.457630157470703, + "epoch": 0.34862566739173423, + "mean_token_accuracy": 0.7979274392127991, + "num_tokens": 18352084.0, + "step": 3526, + "train/ce_loss": 0.6494618058204651 + }, + { + "epoch": 0.34862566739173423, + "step": 3526, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.34862566739173423, + "step": 3526, + "train/total_loss": 0.09228993207216263 + }, + { + "entropy": 9.636478424072266, + "epoch": 0.34872454024124977, + "mean_token_accuracy": 0.7107023596763611, + "num_tokens": 18357101.0, + "step": 3527, + "train/ce_loss": 1.1041392087936401 + }, + { + "epoch": 0.34872454024124977, + "step": 3527, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.34872454024124977, + "step": 3527, + "train/total_loss": 0.15338267385959625 + }, + { + "entropy": 9.457572937011719, + "epoch": 0.34882341309076526, + "mean_token_accuracy": 0.7376811504364014, + "num_tokens": 18362231.0, + "step": 3528, + "train/ce_loss": 1.1171162128448486 + }, + { + "epoch": 0.34882341309076526, + "step": 3528, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.34882341309076526, + "step": 3528, + "train/total_loss": 0.16639912128448486 + }, + { + "entropy": 9.173391342163086, + "epoch": 0.3489222859402808, + "mean_token_accuracy": 0.6962785124778748, + "num_tokens": 18367491.0, + "step": 3529, + "train/ce_loss": 0.6444375514984131 + }, + { + "epoch": 0.3489222859402808, + "step": 3529, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.3489222859402808, + "step": 3529, + "train/total_loss": 0.09569375962018967 + }, + { + "entropy": 9.359664916992188, + "epoch": 0.34902115878979634, + "mean_token_accuracy": 0.779552698135376, + "num_tokens": 18372553.0, + "step": 3530, + "train/ce_loss": 0.6456083655357361 + }, + { + "epoch": 0.34902115878979634, + "step": 3530, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.34902115878979634, + "step": 3530, + "train/total_loss": 0.09971708804368973 + }, + { + "entropy": 9.433663368225098, + "epoch": 0.3491200316393118, + "mean_token_accuracy": 0.7432217001914978, + "num_tokens": 18377618.0, + "step": 3531, + "train/ce_loss": 9.10674953047419e-06 + }, + { + "epoch": 0.3491200316393118, + "step": 3531, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.3491200316393118, + "step": 3531, + "train/total_loss": 0.08593840897083282 + }, + { + "entropy": 9.148578643798828, + "epoch": 0.34921890448882736, + "mean_token_accuracy": 0.7959427237510681, + "num_tokens": 18382937.0, + "step": 3532, + "train/ce_loss": 0.6363712549209595 + }, + { + "epoch": 0.34921890448882736, + "step": 3532, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.34921890448882736, + "step": 3532, + "train/total_loss": 0.0870746299624443 + }, + { + "entropy": 8.85528564453125, + "epoch": 0.3493177773383429, + "mean_token_accuracy": 0.7173038125038147, + "num_tokens": 18388528.0, + "step": 3533, + "train/ce_loss": 1.2859928607940674 + }, + { + "epoch": 0.3493177773383429, + "step": 3533, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.3493177773383429, + "step": 3533, + "train/total_loss": 0.21453678607940674 + }, + { + "entropy": 9.364201545715332, + "epoch": 0.3494166501878584, + "mean_token_accuracy": 0.7604976892471313, + "num_tokens": 18393649.0, + "step": 3534, + "train/ce_loss": 0.7129225134849548 + }, + { + "epoch": 0.3494166501878584, + "step": 3534, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.3494166501878584, + "step": 3534, + "train/total_loss": 0.12207350134849548 + }, + { + "entropy": 9.06539535522461, + "epoch": 0.34951552303737393, + "mean_token_accuracy": 0.7291169166564941, + "num_tokens": 18398963.0, + "step": 3535, + "train/ce_loss": 0.6554270386695862 + }, + { + "epoch": 0.34951552303737393, + "step": 3535, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.34951552303737393, + "step": 3535, + "train/total_loss": 0.12023020535707474 + }, + { + "entropy": 9.511955261230469, + "epoch": 0.3496143958868895, + "mean_token_accuracy": 0.7662771344184875, + "num_tokens": 18404061.0, + "step": 3536, + "train/ce_loss": 0.5374231338500977 + }, + { + "epoch": 0.3496143958868895, + "step": 3536, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.3496143958868895, + "step": 3536, + "train/total_loss": 0.11624231934547424 + }, + { + "entropy": 8.863699913024902, + "epoch": 0.34971326873640496, + "mean_token_accuracy": 0.8055271506309509, + "num_tokens": 18409487.0, + "step": 3537, + "train/ce_loss": 0.40641874074935913 + }, + { + "epoch": 0.34971326873640496, + "step": 3537, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.34971326873640496, + "step": 3537, + "train/total_loss": 0.07579812407493591 + }, + { + "entropy": 8.911162376403809, + "epoch": 0.3498121415859205, + "mean_token_accuracy": 0.7116279006004333, + "num_tokens": 18414808.0, + "step": 3538, + "train/ce_loss": 1.2232792377471924 + }, + { + "epoch": 0.3498121415859205, + "step": 3538, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.3498121415859205, + "step": 3538, + "train/total_loss": 0.19264042377471924 + }, + { + "entropy": 9.118431091308594, + "epoch": 0.34991101443543604, + "mean_token_accuracy": 0.7496932744979858, + "num_tokens": 18420151.0, + "step": 3539, + "train/ce_loss": 1.101403832435608 + }, + { + "epoch": 0.34991101443543604, + "step": 3539, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.34991101443543604, + "step": 3539, + "train/total_loss": 0.1921716332435608 + }, + { + "epoch": 0.3500098872849515, + "grad_norm": 0.7718408703804016, + "learning_rate": 9.127478613459922e-06, + "loss": 0.1463, + "step": 3540 + }, + { + "entropy": 9.095006942749023, + "epoch": 0.3500098872849515, + "mean_token_accuracy": 0.7156334519386292, + "num_tokens": 18425384.0, + "step": 3540, + "train/ce_loss": 1.2889493703842163 + }, + { + "epoch": 0.3500098872849515, + "step": 3540, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.3500098872849515, + "step": 3540, + "train/total_loss": 0.19920744001865387 + }, + { + "entropy": 9.312450408935547, + "epoch": 0.35010876013446707, + "mean_token_accuracy": 0.7492997050285339, + "num_tokens": 18430574.0, + "step": 3541, + "train/ce_loss": 1.968036940525053e-06 + }, + { + "epoch": 0.35010876013446707, + "step": 3541, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.35010876013446707, + "step": 3541, + "train/total_loss": 0.07812519371509552 + }, + { + "entropy": 8.899011611938477, + "epoch": 0.3502076329839826, + "mean_token_accuracy": 0.6872928142547607, + "num_tokens": 18435941.0, + "step": 3542, + "train/ce_loss": 0.9003424644470215 + }, + { + "epoch": 0.3502076329839826, + "step": 3542, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.3502076329839826, + "step": 3542, + "train/total_loss": 0.18378424644470215 + }, + { + "entropy": 9.032732009887695, + "epoch": 0.3503065058334981, + "mean_token_accuracy": 0.7408804893493652, + "num_tokens": 18441250.0, + "step": 3543, + "train/ce_loss": 0.49240124225616455 + }, + { + "epoch": 0.3503065058334981, + "step": 3543, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.3503065058334981, + "step": 3543, + "train/total_loss": 0.1078338772058487 + }, + { + "entropy": 9.10744857788086, + "epoch": 0.35040537868301364, + "mean_token_accuracy": 0.7295690774917603, + "num_tokens": 18446369.0, + "step": 3544, + "train/ce_loss": 0.6408175230026245 + }, + { + "epoch": 0.35040537868301364, + "step": 3544, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.35040537868301364, + "step": 3544, + "train/total_loss": 0.12267550081014633 + }, + { + "entropy": 8.559624671936035, + "epoch": 0.3505042515325292, + "mean_token_accuracy": 0.7497507333755493, + "num_tokens": 18451885.0, + "step": 3545, + "train/ce_loss": 1.0121359825134277 + }, + { + "epoch": 0.3505042515325292, + "step": 3545, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.3505042515325292, + "step": 3545, + "train/total_loss": 0.19886985421180725 + }, + { + "entropy": 8.960552215576172, + "epoch": 0.3506031243820447, + "mean_token_accuracy": 0.7813853025436401, + "num_tokens": 18457299.0, + "step": 3546, + "train/ce_loss": 0.47213152050971985 + }, + { + "epoch": 0.3506031243820447, + "step": 3546, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.3506031243820447, + "step": 3546, + "train/total_loss": 0.14486940205097198 + }, + { + "entropy": 9.17786979675293, + "epoch": 0.3507019972315602, + "mean_token_accuracy": 0.7319316864013672, + "num_tokens": 18462511.0, + "step": 3547, + "train/ce_loss": 0.768038809299469 + }, + { + "epoch": 0.3507019972315602, + "step": 3547, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3507019972315602, + "step": 3547, + "train/total_loss": 0.13149139285087585 + }, + { + "entropy": 9.23261833190918, + "epoch": 0.35080087008107574, + "mean_token_accuracy": 0.7530201077461243, + "num_tokens": 18467667.0, + "step": 3548, + "train/ce_loss": 1.1861225366592407 + }, + { + "epoch": 0.35080087008107574, + "step": 3548, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.35080087008107574, + "step": 3548, + "train/total_loss": 0.19283100962638855 + }, + { + "entropy": 9.488293647766113, + "epoch": 0.3508997429305913, + "mean_token_accuracy": 0.7153846025466919, + "num_tokens": 18472733.0, + "step": 3549, + "train/ce_loss": 0.6497202515602112 + }, + { + "epoch": 0.3508997429305913, + "step": 3549, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.3508997429305913, + "step": 3549, + "train/total_loss": 0.11575327813625336 + }, + { + "entropy": 8.950626373291016, + "epoch": 0.35099861578010677, + "mean_token_accuracy": 0.7555555701255798, + "num_tokens": 18478118.0, + "step": 3550, + "train/ce_loss": 0.7787980437278748 + }, + { + "epoch": 0.35099861578010677, + "step": 3550, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.35099861578010677, + "step": 3550, + "train/total_loss": 0.14819231629371643 + }, + { + "entropy": 9.40908432006836, + "epoch": 0.3510974886296223, + "mean_token_accuracy": 0.6710963249206543, + "num_tokens": 18483194.0, + "step": 3551, + "train/ce_loss": 1.643475890159607 + }, + { + "epoch": 0.3510974886296223, + "step": 3551, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.3510974886296223, + "step": 3551, + "train/total_loss": 0.2151288390159607 + }, + { + "entropy": 9.369651794433594, + "epoch": 0.35119636147913785, + "mean_token_accuracy": 0.8061674237251282, + "num_tokens": 18488319.0, + "step": 3552, + "train/ce_loss": 1.0746724605560303 + }, + { + "epoch": 0.35119636147913785, + "step": 3552, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.35119636147913785, + "step": 3552, + "train/total_loss": 0.19731099903583527 + }, + { + "entropy": 9.777578353881836, + "epoch": 0.35129523432865334, + "mean_token_accuracy": 0.705234169960022, + "num_tokens": 18493039.0, + "step": 3553, + "train/ce_loss": 4.909882136416854e-06 + }, + { + "epoch": 0.35129523432865334, + "step": 3553, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.35129523432865334, + "step": 3553, + "train/total_loss": 0.0664067417383194 + }, + { + "entropy": 8.643465042114258, + "epoch": 0.3513941071781689, + "mean_token_accuracy": 0.7199074029922485, + "num_tokens": 18498387.0, + "step": 3554, + "train/ce_loss": 0.8254873752593994 + }, + { + "epoch": 0.3513941071781689, + "step": 3554, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.3513941071781689, + "step": 3554, + "train/total_loss": 0.16067373752593994 + }, + { + "entropy": 9.082072257995605, + "epoch": 0.3514929800276844, + "mean_token_accuracy": 0.7390804886817932, + "num_tokens": 18503715.0, + "step": 3555, + "train/ce_loss": 0.5262618660926819 + }, + { + "epoch": 0.3514929800276844, + "step": 3555, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.3514929800276844, + "step": 3555, + "train/total_loss": 0.11512619256973267 + }, + { + "entropy": 9.256733894348145, + "epoch": 0.3515918528771999, + "mean_token_accuracy": 0.7245178818702698, + "num_tokens": 18508925.0, + "step": 3556, + "train/ce_loss": 0.7370480895042419 + }, + { + "epoch": 0.3515918528771999, + "step": 3556, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.3515918528771999, + "step": 3556, + "train/total_loss": 0.1479235589504242 + }, + { + "entropy": 9.786665916442871, + "epoch": 0.35169072572671545, + "mean_token_accuracy": 0.7542017102241516, + "num_tokens": 18513840.0, + "step": 3557, + "train/ce_loss": 3.821523478109157e-06 + }, + { + "epoch": 0.35169072572671545, + "step": 3557, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.35169072572671545, + "step": 3557, + "train/total_loss": 0.05078163370490074 + }, + { + "entropy": 10.097780227661133, + "epoch": 0.351789598576231, + "mean_token_accuracy": 0.7157190442085266, + "num_tokens": 18518562.0, + "step": 3558, + "train/ce_loss": 1.937991976737976 + }, + { + "epoch": 0.351789598576231, + "step": 3558, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.351789598576231, + "step": 3558, + "train/total_loss": 0.2523929476737976 + }, + { + "entropy": 9.427690505981445, + "epoch": 0.3518884714257465, + "mean_token_accuracy": 0.7645429372787476, + "num_tokens": 18523769.0, + "step": 3559, + "train/ce_loss": 0.6540150046348572 + }, + { + "epoch": 0.3518884714257465, + "step": 3559, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.3518884714257465, + "step": 3559, + "train/total_loss": 0.15524524450302124 + }, + { + "epoch": 0.351987344275262, + "grad_norm": 0.7682899236679077, + "learning_rate": 9.122533748701975e-06, + "loss": 0.1492, + "step": 3560 + }, + { + "entropy": 9.192558288574219, + "epoch": 0.351987344275262, + "mean_token_accuracy": 0.7062663435935974, + "num_tokens": 18528996.0, + "step": 3560, + "train/ce_loss": 0.8725659251213074 + }, + { + "epoch": 0.351987344275262, + "step": 3560, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.351987344275262, + "step": 3560, + "train/total_loss": 0.18491284549236298 + }, + { + "entropy": 10.469324111938477, + "epoch": 0.35208621712477756, + "mean_token_accuracy": 1.0, + "num_tokens": 18533420.0, + "step": 3561, + "train/ce_loss": 9.297148790210485e-05 + }, + { + "epoch": 0.35208621712477756, + "step": 3561, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.35208621712477756, + "step": 3561, + "train/total_loss": 0.04297804832458496 + }, + { + "entropy": 9.145017623901367, + "epoch": 0.35218508997429304, + "mean_token_accuracy": 0.7345911860466003, + "num_tokens": 18538685.0, + "step": 3562, + "train/ce_loss": 1.5550450086593628 + }, + { + "epoch": 0.35218508997429304, + "step": 3562, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.35218508997429304, + "step": 3562, + "train/total_loss": 0.249254509806633 + }, + { + "entropy": 8.81939697265625, + "epoch": 0.3522839628238086, + "mean_token_accuracy": 0.701694905757904, + "num_tokens": 18544032.0, + "step": 3563, + "train/ce_loss": 1.2731009721755981 + }, + { + "epoch": 0.3522839628238086, + "step": 3563, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.3522839628238086, + "step": 3563, + "train/total_loss": 0.19371634721755981 + }, + { + "entropy": 8.905570983886719, + "epoch": 0.3523828356733241, + "mean_token_accuracy": 0.7117318511009216, + "num_tokens": 18549406.0, + "step": 3564, + "train/ce_loss": 0.6244997978210449 + }, + { + "epoch": 0.3523828356733241, + "step": 3564, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.3523828356733241, + "step": 3564, + "train/total_loss": 0.14057497680187225 + }, + { + "entropy": 8.975542068481445, + "epoch": 0.3524817085228396, + "mean_token_accuracy": 0.7894737124443054, + "num_tokens": 18554792.0, + "step": 3565, + "train/ce_loss": 0.7156611084938049 + }, + { + "epoch": 0.3524817085228396, + "step": 3565, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.3524817085228396, + "step": 3565, + "train/total_loss": 0.14578485488891602 + }, + { + "entropy": 9.697669982910156, + "epoch": 0.35258058137235515, + "mean_token_accuracy": 0.7649006843566895, + "num_tokens": 18559778.0, + "step": 3566, + "train/ce_loss": 1.360660433769226 + }, + { + "epoch": 0.35258058137235515, + "step": 3566, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.35258058137235515, + "step": 3566, + "train/total_loss": 0.18294104933738708 + }, + { + "entropy": 8.912150382995605, + "epoch": 0.3526794542218707, + "mean_token_accuracy": 0.7199124693870544, + "num_tokens": 18565133.0, + "step": 3567, + "train/ce_loss": 1.1734925508499146 + }, + { + "epoch": 0.3526794542218707, + "step": 3567, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3526794542218707, + "step": 3567, + "train/total_loss": 0.1720367670059204 + }, + { + "entropy": 9.849849700927734, + "epoch": 0.3527783270713862, + "mean_token_accuracy": 0.7318681478500366, + "num_tokens": 18570001.0, + "step": 3568, + "train/ce_loss": 1.5497937056352384e-05 + }, + { + "epoch": 0.3527783270713862, + "step": 3568, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.3527783270713862, + "step": 3568, + "train/total_loss": 0.01953279972076416 + }, + { + "entropy": 9.48647689819336, + "epoch": 0.3528771999209017, + "mean_token_accuracy": 0.7436708807945251, + "num_tokens": 18575056.0, + "step": 3569, + "train/ce_loss": 1.4351483583450317 + }, + { + "epoch": 0.3528771999209017, + "step": 3569, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3528771999209017, + "step": 3569, + "train/total_loss": 0.19820234179496765 + }, + { + "entropy": 8.485700607299805, + "epoch": 0.35297607277041726, + "mean_token_accuracy": 0.7702991366386414, + "num_tokens": 18580543.0, + "step": 3570, + "train/ce_loss": 0.6253058910369873 + }, + { + "epoch": 0.35297607277041726, + "step": 3570, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.35297607277041726, + "step": 3570, + "train/total_loss": 0.08987434208393097 + }, + { + "entropy": 9.469551086425781, + "epoch": 0.35307494561993275, + "mean_token_accuracy": 0.7402234673500061, + "num_tokens": 18585719.0, + "step": 3571, + "train/ce_loss": 1.6994985116980388e-06 + }, + { + "epoch": 0.35307494561993275, + "step": 3571, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.35307494561993275, + "step": 3571, + "train/total_loss": 0.08984392136335373 + }, + { + "entropy": 9.067724227905273, + "epoch": 0.3531738184694483, + "mean_token_accuracy": 0.7933579087257385, + "num_tokens": 18591174.0, + "step": 3572, + "train/ce_loss": 0.7888551354408264 + }, + { + "epoch": 0.3531738184694483, + "step": 3572, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.3531738184694483, + "step": 3572, + "train/total_loss": 0.1491980254650116 + }, + { + "entropy": 9.695735931396484, + "epoch": 0.3532726913189638, + "mean_token_accuracy": 0.7185500860214233, + "num_tokens": 18596056.0, + "step": 3573, + "train/ce_loss": 1.1860848665237427 + }, + { + "epoch": 0.3532726913189638, + "step": 3573, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.3532726913189638, + "step": 3573, + "train/total_loss": 0.1693897396326065 + }, + { + "entropy": 9.05880355834961, + "epoch": 0.3533715641684793, + "mean_token_accuracy": 0.7036625742912292, + "num_tokens": 18601450.0, + "step": 3574, + "train/ce_loss": 1.1042697429656982 + }, + { + "epoch": 0.3533715641684793, + "step": 3574, + "train/sim_loss": 0.14453125 + }, + { + "epoch": 0.3533715641684793, + "step": 3574, + "train/total_loss": 0.25495821237564087 + }, + { + "entropy": 9.551956176757812, + "epoch": 0.35347043701799485, + "mean_token_accuracy": 0.7982906103134155, + "num_tokens": 18606441.0, + "step": 3575, + "train/ce_loss": 2.0116110590606695e-06 + }, + { + "epoch": 0.35347043701799485, + "step": 3575, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.35347043701799485, + "step": 3575, + "train/total_loss": 0.06640645116567612 + }, + { + "entropy": 9.449283599853516, + "epoch": 0.3535693098675104, + "mean_token_accuracy": 0.7388059496879578, + "num_tokens": 18611435.0, + "step": 3576, + "train/ce_loss": 1.50284743309021 + }, + { + "epoch": 0.3535693098675104, + "step": 3576, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.3535693098675104, + "step": 3576, + "train/total_loss": 0.2557535171508789 + }, + { + "entropy": 9.170859336853027, + "epoch": 0.3536681827170259, + "mean_token_accuracy": 0.8545994162559509, + "num_tokens": 18616592.0, + "step": 3577, + "train/ce_loss": 0.6139150261878967 + }, + { + "epoch": 0.3536681827170259, + "step": 3577, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3536681827170259, + "step": 3577, + "train/total_loss": 0.11607900261878967 + }, + { + "entropy": 9.522819519042969, + "epoch": 0.3537670555665414, + "mean_token_accuracy": 0.7272727489471436, + "num_tokens": 18621748.0, + "step": 3578, + "train/ce_loss": 1.217820644378662 + }, + { + "epoch": 0.3537670555665414, + "step": 3578, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3537670555665414, + "step": 3578, + "train/total_loss": 0.1686570644378662 + }, + { + "entropy": 8.783790588378906, + "epoch": 0.35386592841605696, + "mean_token_accuracy": 0.7079002261161804, + "num_tokens": 18627189.0, + "step": 3579, + "train/ce_loss": 0.8037675023078918 + }, + { + "epoch": 0.35386592841605696, + "step": 3579, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.35386592841605696, + "step": 3579, + "train/total_loss": 0.1272517442703247 + }, + { + "epoch": 0.35396480126557245, + "grad_norm": 0.8489325642585754, + "learning_rate": 9.117588883944025e-06, + "loss": 0.14, + "step": 3580 + }, + { + "entropy": 9.09628677368164, + "epoch": 0.35396480126557245, + "mean_token_accuracy": 0.7522624731063843, + "num_tokens": 18632524.0, + "step": 3580, + "train/ce_loss": 0.4255582392215729 + }, + { + "epoch": 0.35396480126557245, + "step": 3580, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.35396480126557245, + "step": 3580, + "train/total_loss": 0.10505582392215729 + }, + { + "entropy": 9.367462158203125, + "epoch": 0.354063674115088, + "mean_token_accuracy": 0.8267831206321716, + "num_tokens": 18637674.0, + "step": 3581, + "train/ce_loss": 0.4969341456890106 + }, + { + "epoch": 0.354063674115088, + "step": 3581, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.354063674115088, + "step": 3581, + "train/total_loss": 0.08875592052936554 + }, + { + "entropy": 9.681544303894043, + "epoch": 0.35416254696460353, + "mean_token_accuracy": 0.7083333134651184, + "num_tokens": 18642660.0, + "step": 3582, + "train/ce_loss": 0.9497576951980591 + }, + { + "epoch": 0.35416254696460353, + "step": 3582, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.35416254696460353, + "step": 3582, + "train/total_loss": 0.1379445195198059 + }, + { + "entropy": 9.453405380249023, + "epoch": 0.354261419814119, + "mean_token_accuracy": 0.6682927012443542, + "num_tokens": 18647712.0, + "step": 3583, + "train/ce_loss": 1.421942442902946e-06 + }, + { + "epoch": 0.354261419814119, + "step": 3583, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.354261419814119, + "step": 3583, + "train/total_loss": 0.01562514156103134 + }, + { + "entropy": 9.411760330200195, + "epoch": 0.35436029266363456, + "mean_token_accuracy": 0.7788732647895813, + "num_tokens": 18652869.0, + "step": 3584, + "train/ce_loss": 0.6261138319969177 + }, + { + "epoch": 0.35436029266363456, + "step": 3584, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.35436029266363456, + "step": 3584, + "train/total_loss": 0.08604888617992401 + }, + { + "entropy": 8.78024959564209, + "epoch": 0.3544591655131501, + "mean_token_accuracy": 0.8067581653594971, + "num_tokens": 18658316.0, + "step": 3585, + "train/ce_loss": 0.5842998027801514 + }, + { + "epoch": 0.3544591655131501, + "step": 3585, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.3544591655131501, + "step": 3585, + "train/total_loss": 0.08967998623847961 + }, + { + "entropy": 9.422521591186523, + "epoch": 0.35455803836266564, + "mean_token_accuracy": 0.791208803653717, + "num_tokens": 18663258.0, + "step": 3586, + "train/ce_loss": 4.0228596844826825e-06 + }, + { + "epoch": 0.35455803836266564, + "step": 3586, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.35455803836266564, + "step": 3586, + "train/total_loss": 0.06640665233135223 + }, + { + "entropy": 9.755395889282227, + "epoch": 0.3546569112121811, + "mean_token_accuracy": 0.741847813129425, + "num_tokens": 18668243.0, + "step": 3587, + "train/ce_loss": 1.243598222732544 + }, + { + "epoch": 0.3546569112121811, + "step": 3587, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.3546569112121811, + "step": 3587, + "train/total_loss": 0.23764106631278992 + }, + { + "entropy": 10.178962707519531, + "epoch": 0.35475578406169667, + "mean_token_accuracy": 0.7535014152526855, + "num_tokens": 18673022.0, + "step": 3588, + "train/ce_loss": 1.7247364521026611 + }, + { + "epoch": 0.35475578406169667, + "step": 3588, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.35475578406169667, + "step": 3588, + "train/total_loss": 0.250598669052124 + }, + { + "entropy": 9.204524993896484, + "epoch": 0.3548546569112122, + "mean_token_accuracy": 0.7333333492279053, + "num_tokens": 18678305.0, + "step": 3589, + "train/ce_loss": 1.0631451606750488 + }, + { + "epoch": 0.3548546569112122, + "step": 3589, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3548546569112122, + "step": 3589, + "train/total_loss": 0.1610020101070404 + }, + { + "entropy": 8.949295043945312, + "epoch": 0.3549535297607277, + "mean_token_accuracy": 0.7013888955116272, + "num_tokens": 18683571.0, + "step": 3590, + "train/ce_loss": 1.1846740245819092 + }, + { + "epoch": 0.3549535297607277, + "step": 3590, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.3549535297607277, + "step": 3590, + "train/total_loss": 0.15752990543842316 + }, + { + "entropy": 9.124601364135742, + "epoch": 0.35505240261024323, + "mean_token_accuracy": 0.7449344396591187, + "num_tokens": 18688918.0, + "step": 3591, + "train/ce_loss": 0.7621549963951111 + }, + { + "epoch": 0.35505240261024323, + "step": 3591, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.35505240261024323, + "step": 3591, + "train/total_loss": 0.11137174814939499 + }, + { + "entropy": 9.815519332885742, + "epoch": 0.3551512754597588, + "mean_token_accuracy": 0.7307692170143127, + "num_tokens": 18693812.0, + "step": 3592, + "train/ce_loss": 1.170204758644104 + }, + { + "epoch": 0.3551512754597588, + "step": 3592, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.3551512754597588, + "step": 3592, + "train/total_loss": 0.17561423778533936 + }, + { + "entropy": 8.898412704467773, + "epoch": 0.35525014830927426, + "mean_token_accuracy": 0.7038251161575317, + "num_tokens": 18699240.0, + "step": 3593, + "train/ce_loss": 0.7210386395454407 + }, + { + "epoch": 0.35525014830927426, + "step": 3593, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.35525014830927426, + "step": 3593, + "train/total_loss": 0.1814788579940796 + }, + { + "entropy": 8.81280517578125, + "epoch": 0.3553490211587898, + "mean_token_accuracy": 0.7233368754386902, + "num_tokens": 18704639.0, + "step": 3594, + "train/ce_loss": 0.8128060102462769 + }, + { + "epoch": 0.3553490211587898, + "step": 3594, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.3553490211587898, + "step": 3594, + "train/total_loss": 0.15940560400485992 + }, + { + "entropy": 10.054403305053711, + "epoch": 0.35544789400830534, + "mean_token_accuracy": 0.7058823704719543, + "num_tokens": 18709449.0, + "step": 3595, + "train/ce_loss": 5.0441635721654166e-06 + }, + { + "epoch": 0.35544789400830534, + "step": 3595, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.35544789400830534, + "step": 3595, + "train/total_loss": 0.03125050291419029 + }, + { + "entropy": 9.578506469726562, + "epoch": 0.35554676685782083, + "mean_token_accuracy": 0.7337662577629089, + "num_tokens": 18714481.0, + "step": 3596, + "train/ce_loss": 1.2608743906021118 + }, + { + "epoch": 0.35554676685782083, + "step": 3596, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.35554676685782083, + "step": 3596, + "train/total_loss": 0.16124369204044342 + }, + { + "entropy": 9.35792064666748, + "epoch": 0.35564563970733637, + "mean_token_accuracy": 0.7272727489471436, + "num_tokens": 18719708.0, + "step": 3597, + "train/ce_loss": 1.0480810403823853 + }, + { + "epoch": 0.35564563970733637, + "step": 3597, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.35564563970733637, + "step": 3597, + "train/total_loss": 0.16730810701847076 + }, + { + "entropy": 8.493367195129395, + "epoch": 0.3557445125568519, + "mean_token_accuracy": 0.7280898690223694, + "num_tokens": 18725065.0, + "step": 3598, + "train/ce_loss": 1.273380994796753 + }, + { + "epoch": 0.3557445125568519, + "step": 3598, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.3557445125568519, + "step": 3598, + "train/total_loss": 0.20155684649944305 + }, + { + "entropy": 9.373510360717773, + "epoch": 0.3558433854063674, + "mean_token_accuracy": 0.7911646366119385, + "num_tokens": 18730248.0, + "step": 3599, + "train/ce_loss": 0.8422386646270752 + }, + { + "epoch": 0.3558433854063674, + "step": 3599, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.3558433854063674, + "step": 3599, + "train/total_loss": 0.12328636646270752 + }, + { + "epoch": 0.35594225825588294, + "grad_norm": 0.6824468374252319, + "learning_rate": 9.112644019186077e-06, + "loss": 0.141, + "step": 3600 + }, + { + "entropy": 9.461212158203125, + "epoch": 0.35594225825588294, + "mean_token_accuracy": 0.8100889921188354, + "num_tokens": 18735348.0, + "step": 3600, + "train/ce_loss": 0.6437177062034607 + }, + { + "epoch": 0.35594225825588294, + "step": 3600, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.35594225825588294, + "step": 3600, + "train/total_loss": 0.09952802211046219 + }, + { + "entropy": 8.997678756713867, + "epoch": 0.3560411311053985, + "mean_token_accuracy": 0.7733473181724548, + "num_tokens": 18740765.0, + "step": 3601, + "train/ce_loss": 0.7001959085464478 + }, + { + "epoch": 0.3560411311053985, + "step": 3601, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.3560411311053985, + "step": 3601, + "train/total_loss": 0.08564459532499313 + }, + { + "entropy": 9.560249328613281, + "epoch": 0.35614000395491396, + "mean_token_accuracy": 0.7255520224571228, + "num_tokens": 18745855.0, + "step": 3602, + "train/ce_loss": 0.8443648815155029 + }, + { + "epoch": 0.35614000395491396, + "step": 3602, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.35614000395491396, + "step": 3602, + "train/total_loss": 0.13131149113178253 + }, + { + "entropy": 9.580425262451172, + "epoch": 0.3562388768044295, + "mean_token_accuracy": 0.6866764426231384, + "num_tokens": 18750999.0, + "step": 3603, + "train/ce_loss": 1.130759596824646 + }, + { + "epoch": 0.3562388768044295, + "step": 3603, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.3562388768044295, + "step": 3603, + "train/total_loss": 0.19120097160339355 + }, + { + "entropy": 9.033404350280762, + "epoch": 0.35633774965394505, + "mean_token_accuracy": 0.7333333492279053, + "num_tokens": 18756297.0, + "step": 3604, + "train/ce_loss": 0.8131483793258667 + }, + { + "epoch": 0.35633774965394505, + "step": 3604, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.35633774965394505, + "step": 3604, + "train/total_loss": 0.1438148319721222 + }, + { + "entropy": 9.32512378692627, + "epoch": 0.35643662250346053, + "mean_token_accuracy": 0.7956204414367676, + "num_tokens": 18761380.0, + "step": 3605, + "train/ce_loss": 0.7058120369911194 + }, + { + "epoch": 0.35643662250346053, + "step": 3605, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.35643662250346053, + "step": 3605, + "train/total_loss": 0.09011245518922806 + }, + { + "entropy": 9.25851821899414, + "epoch": 0.3565354953529761, + "mean_token_accuracy": 0.7393689751625061, + "num_tokens": 18766558.0, + "step": 3606, + "train/ce_loss": 0.5824702382087708 + }, + { + "epoch": 0.3565354953529761, + "step": 3606, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.3565354953529761, + "step": 3606, + "train/total_loss": 0.14418452978134155 + }, + { + "entropy": 9.170404434204102, + "epoch": 0.3566343682024916, + "mean_token_accuracy": 0.7585784196853638, + "num_tokens": 18771844.0, + "step": 3607, + "train/ce_loss": 0.38632792234420776 + }, + { + "epoch": 0.3566343682024916, + "step": 3607, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3566343682024916, + "step": 3607, + "train/total_loss": 0.08550779521465302 + }, + { + "entropy": 9.091513633728027, + "epoch": 0.3567332410520071, + "mean_token_accuracy": 0.7013630867004395, + "num_tokens": 18777148.0, + "step": 3608, + "train/ce_loss": 1.1107176542282104 + }, + { + "epoch": 0.3567332410520071, + "step": 3608, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.3567332410520071, + "step": 3608, + "train/total_loss": 0.22435301542282104 + }, + { + "entropy": 8.946985244750977, + "epoch": 0.35683211390152264, + "mean_token_accuracy": 0.71875, + "num_tokens": 18782443.0, + "step": 3609, + "train/ce_loss": 1.255902647972107 + }, + { + "epoch": 0.35683211390152264, + "step": 3609, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.35683211390152264, + "step": 3609, + "train/total_loss": 0.1959027647972107 + }, + { + "entropy": 8.945352554321289, + "epoch": 0.3569309867510382, + "mean_token_accuracy": 0.7400419116020203, + "num_tokens": 18787880.0, + "step": 3610, + "train/ce_loss": 0.8099817037582397 + }, + { + "epoch": 0.3569309867510382, + "step": 3610, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.3569309867510382, + "step": 3610, + "train/total_loss": 0.19818568229675293 + }, + { + "entropy": 9.248785972595215, + "epoch": 0.35702985960055367, + "mean_token_accuracy": 0.7289256453514099, + "num_tokens": 18792973.0, + "step": 3611, + "train/ce_loss": 1.0842132568359375 + }, + { + "epoch": 0.35702985960055367, + "step": 3611, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.35702985960055367, + "step": 3611, + "train/total_loss": 0.20607757568359375 + }, + { + "entropy": 9.346733093261719, + "epoch": 0.3571287324500692, + "mean_token_accuracy": 0.7813299298286438, + "num_tokens": 18798198.0, + "step": 3612, + "train/ce_loss": 0.5885779857635498 + }, + { + "epoch": 0.3571287324500692, + "step": 3612, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3571287324500692, + "step": 3612, + "train/total_loss": 0.11354529857635498 + }, + { + "entropy": 9.392707824707031, + "epoch": 0.35722760529958475, + "mean_token_accuracy": 0.75789475440979, + "num_tokens": 18803418.0, + "step": 3613, + "train/ce_loss": 0.734527587890625 + }, + { + "epoch": 0.35722760529958475, + "step": 3613, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.35722760529958475, + "step": 3613, + "train/total_loss": 0.20235902070999146 + }, + { + "entropy": 9.530153274536133, + "epoch": 0.35732647814910024, + "mean_token_accuracy": 0.6506550312042236, + "num_tokens": 18808536.0, + "step": 3614, + "train/ce_loss": 2.2162020206451416 + }, + { + "epoch": 0.35732647814910024, + "step": 3614, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.35732647814910024, + "step": 3614, + "train/total_loss": 0.33880770206451416 + }, + { + "entropy": 8.940558433532715, + "epoch": 0.3574253509986158, + "mean_token_accuracy": 0.7585470080375671, + "num_tokens": 18813971.0, + "step": 3615, + "train/ce_loss": 1.023219108581543 + }, + { + "epoch": 0.3574253509986158, + "step": 3615, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.3574253509986158, + "step": 3615, + "train/total_loss": 0.23513442277908325 + }, + { + "entropy": 9.895393371582031, + "epoch": 0.3575242238481313, + "mean_token_accuracy": 0.7389557957649231, + "num_tokens": 18818888.0, + "step": 3616, + "train/ce_loss": 0.701510488986969 + }, + { + "epoch": 0.3575242238481313, + "step": 3616, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.3575242238481313, + "step": 3616, + "train/total_loss": 0.14827606081962585 + }, + { + "entropy": 8.8057861328125, + "epoch": 0.3576230966976468, + "mean_token_accuracy": 0.7281845808029175, + "num_tokens": 18824365.0, + "step": 3617, + "train/ce_loss": 0.2709960341453552 + }, + { + "epoch": 0.3576230966976468, + "step": 3617, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.3576230966976468, + "step": 3617, + "train/total_loss": 0.0505371019244194 + }, + { + "entropy": 8.984687805175781, + "epoch": 0.35772196954716234, + "mean_token_accuracy": 0.7944622039794922, + "num_tokens": 18829811.0, + "step": 3618, + "train/ce_loss": 0.5673206448554993 + }, + { + "epoch": 0.35772196954716234, + "step": 3618, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.35772196954716234, + "step": 3618, + "train/total_loss": 0.08016956597566605 + }, + { + "entropy": 9.221881866455078, + "epoch": 0.3578208423966779, + "mean_token_accuracy": 0.771556556224823, + "num_tokens": 18835097.0, + "step": 3619, + "train/ce_loss": 0.7672033309936523 + }, + { + "epoch": 0.3578208423966779, + "step": 3619, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3578208423966779, + "step": 3619, + "train/total_loss": 0.12359533458948135 + }, + { + "epoch": 0.35791971524619337, + "grad_norm": 0.5600435733795166, + "learning_rate": 9.107699154428126e-06, + "loss": 0.1408, + "step": 3620 + }, + { + "entropy": 9.152168273925781, + "epoch": 0.35791971524619337, + "mean_token_accuracy": 0.732824444770813, + "num_tokens": 18840361.0, + "step": 3620, + "train/ce_loss": 0.8965581655502319 + }, + { + "epoch": 0.35791971524619337, + "step": 3620, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.35791971524619337, + "step": 3620, + "train/total_loss": 0.1482495665550232 + }, + { + "entropy": 8.964424133300781, + "epoch": 0.3580185880957089, + "mean_token_accuracy": 0.6876190304756165, + "num_tokens": 18845887.0, + "step": 3621, + "train/ce_loss": 0.6420177817344666 + }, + { + "epoch": 0.3580185880957089, + "step": 3621, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.3580185880957089, + "step": 3621, + "train/total_loss": 0.16967052221298218 + }, + { + "entropy": 9.759793281555176, + "epoch": 0.35811746094522445, + "mean_token_accuracy": 0.7165217399597168, + "num_tokens": 18850933.0, + "step": 3622, + "train/ce_loss": 1.5218240022659302 + }, + { + "epoch": 0.35811746094522445, + "step": 3622, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.35811746094522445, + "step": 3622, + "train/total_loss": 0.21858865022659302 + }, + { + "entropy": 9.021388053894043, + "epoch": 0.35821633379473994, + "mean_token_accuracy": 0.699999988079071, + "num_tokens": 18856409.0, + "step": 3623, + "train/ce_loss": 0.7790200710296631 + }, + { + "epoch": 0.35821633379473994, + "step": 3623, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.35821633379473994, + "step": 3623, + "train/total_loss": 0.13649576902389526 + }, + { + "entropy": 9.041007041931152, + "epoch": 0.3583152066442555, + "mean_token_accuracy": 0.7039312124252319, + "num_tokens": 18861669.0, + "step": 3624, + "train/ce_loss": 0.6578070521354675 + }, + { + "epoch": 0.3583152066442555, + "step": 3624, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.3583152066442555, + "step": 3624, + "train/total_loss": 0.13999944925308228 + }, + { + "entropy": 8.717838287353516, + "epoch": 0.358414079493771, + "mean_token_accuracy": 0.7262672781944275, + "num_tokens": 18867235.0, + "step": 3625, + "train/ce_loss": 1.4119700193405151 + }, + { + "epoch": 0.358414079493771, + "step": 3625, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.358414079493771, + "step": 3625, + "train/total_loss": 0.23104076087474823 + }, + { + "entropy": 9.458995819091797, + "epoch": 0.3585129523432865, + "mean_token_accuracy": 0.7584269642829895, + "num_tokens": 18872254.0, + "step": 3626, + "train/ce_loss": 0.7267706394195557 + }, + { + "epoch": 0.3585129523432865, + "step": 3626, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.3585129523432865, + "step": 3626, + "train/total_loss": 0.13517707586288452 + }, + { + "entropy": 8.873926162719727, + "epoch": 0.35861182519280205, + "mean_token_accuracy": 0.7026143670082092, + "num_tokens": 18877645.0, + "step": 3627, + "train/ce_loss": 0.7436491250991821 + }, + { + "epoch": 0.35861182519280205, + "step": 3627, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.35861182519280205, + "step": 3627, + "train/total_loss": 0.16811491549015045 + }, + { + "entropy": 8.854890823364258, + "epoch": 0.3587106980423176, + "mean_token_accuracy": 0.8172757625579834, + "num_tokens": 18883029.0, + "step": 3628, + "train/ce_loss": 0.6892380714416504 + }, + { + "epoch": 0.3587106980423176, + "step": 3628, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.3587106980423176, + "step": 3628, + "train/total_loss": 0.13923630118370056 + }, + { + "entropy": 8.848349571228027, + "epoch": 0.35880957089183313, + "mean_token_accuracy": 0.7848837375640869, + "num_tokens": 18888473.0, + "step": 3629, + "train/ce_loss": 0.6006161570549011 + }, + { + "epoch": 0.35880957089183313, + "step": 3629, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.35880957089183313, + "step": 3629, + "train/total_loss": 0.09912411868572235 + }, + { + "entropy": 9.404335021972656, + "epoch": 0.3589084437413486, + "mean_token_accuracy": 0.6823362112045288, + "num_tokens": 18893603.0, + "step": 3630, + "train/ce_loss": 1.101285696029663 + }, + { + "epoch": 0.3589084437413486, + "step": 3630, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.3589084437413486, + "step": 3630, + "train/total_loss": 0.16872233152389526 + }, + { + "entropy": 9.75284481048584, + "epoch": 0.35900731659086416, + "mean_token_accuracy": 0.7788844704627991, + "num_tokens": 18898517.0, + "step": 3631, + "train/ce_loss": 8.16061128716683e-06 + }, + { + "epoch": 0.35900731659086416, + "step": 3631, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.35900731659086416, + "step": 3631, + "train/total_loss": 0.03906331583857536 + }, + { + "entropy": 9.497733116149902, + "epoch": 0.3591061894403797, + "mean_token_accuracy": 0.7729323506355286, + "num_tokens": 18903681.0, + "step": 3632, + "train/ce_loss": 2.0427783056220505e-06 + }, + { + "epoch": 0.3591061894403797, + "step": 3632, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.3591061894403797, + "step": 3632, + "train/total_loss": 0.042968954890966415 + }, + { + "entropy": 8.947818756103516, + "epoch": 0.3592050622898952, + "mean_token_accuracy": 0.7249224185943604, + "num_tokens": 18909102.0, + "step": 3633, + "train/ce_loss": 1.2241902351379395 + }, + { + "epoch": 0.3592050622898952, + "step": 3633, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.3592050622898952, + "step": 3633, + "train/total_loss": 0.23570027947425842 + }, + { + "entropy": 9.941341400146484, + "epoch": 0.3593039351394107, + "mean_token_accuracy": 0.6866484880447388, + "num_tokens": 18913868.0, + "step": 3634, + "train/ce_loss": 2.229797601699829 + }, + { + "epoch": 0.3593039351394107, + "step": 3634, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.3593039351394107, + "step": 3634, + "train/total_loss": 0.2932922840118408 + }, + { + "entropy": 8.790292739868164, + "epoch": 0.35940280798892627, + "mean_token_accuracy": 0.6800422668457031, + "num_tokens": 18919251.0, + "step": 3635, + "train/ce_loss": 0.8259347677230835 + }, + { + "epoch": 0.35940280798892627, + "step": 3635, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.35940280798892627, + "step": 3635, + "train/total_loss": 0.12165597826242447 + }, + { + "entropy": 9.229532241821289, + "epoch": 0.35950168083844175, + "mean_token_accuracy": 0.7695364356040955, + "num_tokens": 18924461.0, + "step": 3636, + "train/ce_loss": 0.5599935054779053 + }, + { + "epoch": 0.35950168083844175, + "step": 3636, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.35950168083844175, + "step": 3636, + "train/total_loss": 0.11459310352802277 + }, + { + "entropy": 9.133469581604004, + "epoch": 0.3596005536879573, + "mean_token_accuracy": 0.7037037014961243, + "num_tokens": 18929719.0, + "step": 3637, + "train/ce_loss": 0.480461448431015 + }, + { + "epoch": 0.3596005536879573, + "step": 3637, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.3596005536879573, + "step": 3637, + "train/total_loss": 0.07929614186286926 + }, + { + "entropy": 9.364177703857422, + "epoch": 0.35969942653747283, + "mean_token_accuracy": 0.738095223903656, + "num_tokens": 18934832.0, + "step": 3638, + "train/ce_loss": 0.6780454516410828 + }, + { + "epoch": 0.35969942653747283, + "step": 3638, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.35969942653747283, + "step": 3638, + "train/total_loss": 0.11858579516410828 + }, + { + "entropy": 9.644359588623047, + "epoch": 0.3597982993869883, + "mean_token_accuracy": 0.8057692050933838, + "num_tokens": 18939785.0, + "step": 3639, + "train/ce_loss": 2.528952109059901e-06 + }, + { + "epoch": 0.3597982993869883, + "step": 3639, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.3597982993869883, + "step": 3639, + "train/total_loss": 0.050781503319740295 + }, + { + "epoch": 0.35989717223650386, + "grad_norm": 0.7416434288024902, + "learning_rate": 9.102754289670178e-06, + "loss": 0.1487, + "step": 3640 + }, + { + "entropy": 10.403352737426758, + "epoch": 0.35989717223650386, + "mean_token_accuracy": 0.7214611768722534, + "num_tokens": 18944388.0, + "step": 3640, + "train/ce_loss": 3.3780710697174072 + }, + { + "epoch": 0.35989717223650386, + "step": 3640, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.35989717223650386, + "step": 3640, + "train/total_loss": 0.3924946188926697 + }, + { + "entropy": 9.390605926513672, + "epoch": 0.3599960450860194, + "mean_token_accuracy": 0.7394468784332275, + "num_tokens": 18949553.0, + "step": 3641, + "train/ce_loss": 3.4617044093465665e-06 + }, + { + "epoch": 0.3599960450860194, + "step": 3641, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.3599960450860194, + "step": 3641, + "train/total_loss": 0.06250034272670746 + }, + { + "entropy": 9.734277725219727, + "epoch": 0.3600949179355349, + "mean_token_accuracy": 0.8051689863204956, + "num_tokens": 18954459.0, + "step": 3642, + "train/ce_loss": 1.4679824113845825 + }, + { + "epoch": 0.3600949179355349, + "step": 3642, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.3600949179355349, + "step": 3642, + "train/total_loss": 0.166329488158226 + }, + { + "entropy": 9.015350341796875, + "epoch": 0.3601937907850504, + "mean_token_accuracy": 0.7357142567634583, + "num_tokens": 18959821.0, + "step": 3643, + "train/ce_loss": 1.2550936937332153 + }, + { + "epoch": 0.3601937907850504, + "step": 3643, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.3601937907850504, + "step": 3643, + "train/total_loss": 0.2192593663930893 + }, + { + "entropy": 9.254035949707031, + "epoch": 0.36029266363456597, + "mean_token_accuracy": 0.7264276146888733, + "num_tokens": 18965046.0, + "step": 3644, + "train/ce_loss": 3.5190134894946823e-06 + }, + { + "epoch": 0.36029266363456597, + "step": 3644, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.36029266363456597, + "step": 3644, + "train/total_loss": 0.054687850177288055 + }, + { + "entropy": 9.217218399047852, + "epoch": 0.36039153648408145, + "mean_token_accuracy": 0.704635739326477, + "num_tokens": 18970310.0, + "step": 3645, + "train/ce_loss": 0.49777185916900635 + }, + { + "epoch": 0.36039153648408145, + "step": 3645, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.36039153648408145, + "step": 3645, + "train/total_loss": 0.12008968740701675 + }, + { + "entropy": 9.125036239624023, + "epoch": 0.360490409333597, + "mean_token_accuracy": 0.7408906817436218, + "num_tokens": 18975548.0, + "step": 3646, + "train/ce_loss": 1.0844553709030151 + }, + { + "epoch": 0.360490409333597, + "step": 3646, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.360490409333597, + "step": 3646, + "train/total_loss": 0.15532054007053375 + }, + { + "entropy": 9.504810333251953, + "epoch": 0.36058928218311254, + "mean_token_accuracy": 0.6901172399520874, + "num_tokens": 18980597.0, + "step": 3647, + "train/ce_loss": 2.1324833596736426e-06 + }, + { + "epoch": 0.36058928218311254, + "step": 3647, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.36058928218311254, + "step": 3647, + "train/total_loss": 0.05468771234154701 + }, + { + "entropy": 9.096872329711914, + "epoch": 0.360688155032628, + "mean_token_accuracy": 0.7618497014045715, + "num_tokens": 18985912.0, + "step": 3648, + "train/ce_loss": 1.5423396462210803e-06 + }, + { + "epoch": 0.360688155032628, + "step": 3648, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.360688155032628, + "step": 3648, + "train/total_loss": 0.06250015646219254 + }, + { + "entropy": 9.308656692504883, + "epoch": 0.36078702788214356, + "mean_token_accuracy": 0.7789633870124817, + "num_tokens": 18990979.0, + "step": 3649, + "train/ce_loss": 0.8892550468444824 + }, + { + "epoch": 0.36078702788214356, + "step": 3649, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.36078702788214356, + "step": 3649, + "train/total_loss": 0.13580051064491272 + }, + { + "entropy": 9.134403228759766, + "epoch": 0.3608859007316591, + "mean_token_accuracy": 0.7423638701438904, + "num_tokens": 18996217.0, + "step": 3650, + "train/ce_loss": 0.9975591897964478 + }, + { + "epoch": 0.3608859007316591, + "step": 3650, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.3608859007316591, + "step": 3650, + "train/total_loss": 0.1388184130191803 + }, + { + "entropy": 9.531702995300293, + "epoch": 0.3609847735811746, + "mean_token_accuracy": 0.7331136465072632, + "num_tokens": 19001236.0, + "step": 3651, + "train/ce_loss": 1.3827279806137085 + }, + { + "epoch": 0.3609847735811746, + "step": 3651, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.3609847735811746, + "step": 3651, + "train/total_loss": 0.21249155700206757 + }, + { + "entropy": 8.824071884155273, + "epoch": 0.36108364643069013, + "mean_token_accuracy": 0.7599999904632568, + "num_tokens": 19006597.0, + "step": 3652, + "train/ce_loss": 0.6945485472679138 + }, + { + "epoch": 0.36108364643069013, + "step": 3652, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.36108364643069013, + "step": 3652, + "train/total_loss": 0.0967986062169075 + }, + { + "entropy": 9.394201278686523, + "epoch": 0.36118251928020567, + "mean_token_accuracy": 0.7309185266494751, + "num_tokens": 19011785.0, + "step": 3653, + "train/ce_loss": 1.196912407875061 + }, + { + "epoch": 0.36118251928020567, + "step": 3653, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.36118251928020567, + "step": 3653, + "train/total_loss": 0.19781625270843506 + }, + { + "entropy": 9.342180252075195, + "epoch": 0.36128139212972116, + "mean_token_accuracy": 0.7170329689979553, + "num_tokens": 19016984.0, + "step": 3654, + "train/ce_loss": 0.7816472053527832 + }, + { + "epoch": 0.36128139212972116, + "step": 3654, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.36128139212972116, + "step": 3654, + "train/total_loss": 0.1445709764957428 + }, + { + "entropy": 9.088046073913574, + "epoch": 0.3613802649792367, + "mean_token_accuracy": 0.7180451154708862, + "num_tokens": 19022215.0, + "step": 3655, + "train/ce_loss": 1.2801862955093384 + }, + { + "epoch": 0.3613802649792367, + "step": 3655, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.3613802649792367, + "step": 3655, + "train/total_loss": 0.20223738253116608 + }, + { + "entropy": 8.835012435913086, + "epoch": 0.36147913782875224, + "mean_token_accuracy": 0.7701525092124939, + "num_tokens": 19027634.0, + "step": 3656, + "train/ce_loss": 0.7795095443725586 + }, + { + "epoch": 0.36147913782875224, + "step": 3656, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.36147913782875224, + "step": 3656, + "train/total_loss": 0.14435720443725586 + }, + { + "entropy": 8.987771987915039, + "epoch": 0.3615780106782677, + "mean_token_accuracy": 0.7279322743415833, + "num_tokens": 19032920.0, + "step": 3657, + "train/ce_loss": 0.6514043211936951 + }, + { + "epoch": 0.3615780106782677, + "step": 3657, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3615780106782677, + "step": 3657, + "train/total_loss": 0.11201543360948563 + }, + { + "entropy": 9.48392391204834, + "epoch": 0.36167688352778327, + "mean_token_accuracy": 0.7801302671432495, + "num_tokens": 19037973.0, + "step": 3658, + "train/ce_loss": 0.8745675683021545 + }, + { + "epoch": 0.36167688352778327, + "step": 3658, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.36167688352778327, + "step": 3658, + "train/total_loss": 0.19683176279067993 + }, + { + "entropy": 8.97184944152832, + "epoch": 0.3617757563772988, + "mean_token_accuracy": 0.7978494763374329, + "num_tokens": 19043372.0, + "step": 3659, + "train/ce_loss": 0.7526065111160278 + }, + { + "epoch": 0.3617757563772988, + "step": 3659, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.3617757563772988, + "step": 3659, + "train/total_loss": 0.10651065409183502 + }, + { + "epoch": 0.3618746292268143, + "grad_norm": 0.7123227715492249, + "learning_rate": 9.097809424912229e-06, + "loss": 0.1448, + "step": 3660 + }, + { + "entropy": 9.43770694732666, + "epoch": 0.3618746292268143, + "mean_token_accuracy": 0.7228915691375732, + "num_tokens": 19048447.0, + "step": 3660, + "train/ce_loss": 1.235249400138855 + }, + { + "epoch": 0.3618746292268143, + "step": 3660, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.3618746292268143, + "step": 3660, + "train/total_loss": 0.20946243405342102 + }, + { + "entropy": 9.074600219726562, + "epoch": 0.36197350207632983, + "mean_token_accuracy": 0.7693333625793457, + "num_tokens": 19053679.0, + "step": 3661, + "train/ce_loss": 0.6530929207801819 + }, + { + "epoch": 0.36197350207632983, + "step": 3661, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.36197350207632983, + "step": 3661, + "train/total_loss": 0.1473405361175537 + }, + { + "entropy": 8.966264724731445, + "epoch": 0.3620723749258454, + "mean_token_accuracy": 0.765116274356842, + "num_tokens": 19059013.0, + "step": 3662, + "train/ce_loss": 0.5833165645599365 + }, + { + "epoch": 0.3620723749258454, + "step": 3662, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.3620723749258454, + "step": 3662, + "train/total_loss": 0.08567540347576141 + }, + { + "entropy": 9.553458213806152, + "epoch": 0.36217124777536086, + "mean_token_accuracy": 0.7155476808547974, + "num_tokens": 19064032.0, + "step": 3663, + "train/ce_loss": 5.074959517514799e-06 + }, + { + "epoch": 0.36217124777536086, + "step": 3663, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.36217124777536086, + "step": 3663, + "train/total_loss": 0.06250050663948059 + }, + { + "entropy": 9.122953414916992, + "epoch": 0.3622701206248764, + "mean_token_accuracy": 0.7018140554428101, + "num_tokens": 19069381.0, + "step": 3664, + "train/ce_loss": 0.9899625778198242 + }, + { + "epoch": 0.3622701206248764, + "step": 3664, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.3622701206248764, + "step": 3664, + "train/total_loss": 0.17321500182151794 + }, + { + "entropy": 9.707911491394043, + "epoch": 0.36236899347439194, + "mean_token_accuracy": 0.7546468377113342, + "num_tokens": 19074348.0, + "step": 3665, + "train/ce_loss": 1.7823099369707052e-06 + }, + { + "epoch": 0.36236899347439194, + "step": 3665, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.36236899347439194, + "step": 3665, + "train/total_loss": 0.058593928813934326 + }, + { + "entropy": 9.818957328796387, + "epoch": 0.36246786632390743, + "mean_token_accuracy": 0.8381502628326416, + "num_tokens": 19079290.0, + "step": 3666, + "train/ce_loss": 3.033597977264435e-06 + }, + { + "epoch": 0.36246786632390743, + "step": 3666, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.36246786632390743, + "step": 3666, + "train/total_loss": 0.07421905547380447 + }, + { + "entropy": 8.935047149658203, + "epoch": 0.36256673917342297, + "mean_token_accuracy": 0.7860026955604553, + "num_tokens": 19084590.0, + "step": 3667, + "train/ce_loss": 1.4201887097442523e-06 + }, + { + "epoch": 0.36256673917342297, + "step": 3667, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.36256673917342297, + "step": 3667, + "train/total_loss": 0.07421889156103134 + }, + { + "entropy": 9.333765029907227, + "epoch": 0.3626656120229385, + "mean_token_accuracy": 0.7039999961853027, + "num_tokens": 19089664.0, + "step": 3668, + "train/ce_loss": 0.8184553980827332 + }, + { + "epoch": 0.3626656120229385, + "step": 3668, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.3626656120229385, + "step": 3668, + "train/total_loss": 0.14043930172920227 + }, + { + "entropy": 8.790042877197266, + "epoch": 0.36276448487245405, + "mean_token_accuracy": 0.7546584010124207, + "num_tokens": 19095120.0, + "step": 3669, + "train/ce_loss": 0.6159719228744507 + }, + { + "epoch": 0.36276448487245405, + "step": 3669, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.36276448487245405, + "step": 3669, + "train/total_loss": 0.07722219824790955 + }, + { + "entropy": 9.340641021728516, + "epoch": 0.36286335772196954, + "mean_token_accuracy": 0.7985714077949524, + "num_tokens": 19100293.0, + "step": 3670, + "train/ce_loss": 0.8443805575370789 + }, + { + "epoch": 0.36286335772196954, + "step": 3670, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.36286335772196954, + "step": 3670, + "train/total_loss": 0.12740680575370789 + }, + { + "entropy": 9.315393447875977, + "epoch": 0.3629622305714851, + "mean_token_accuracy": 0.7367773652076721, + "num_tokens": 19105574.0, + "step": 3671, + "train/ce_loss": 1.1162911653518677 + }, + { + "epoch": 0.3629622305714851, + "step": 3671, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.3629622305714851, + "step": 3671, + "train/total_loss": 0.17412912845611572 + }, + { + "entropy": 9.151384353637695, + "epoch": 0.3630611034210006, + "mean_token_accuracy": 0.7649208307266235, + "num_tokens": 19110853.0, + "step": 3672, + "train/ce_loss": 0.6073848009109497 + }, + { + "epoch": 0.3630611034210006, + "step": 3672, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.3630611034210006, + "step": 3672, + "train/total_loss": 0.1388634741306305 + }, + { + "entropy": 9.221534729003906, + "epoch": 0.3631599762705161, + "mean_token_accuracy": 0.7123473286628723, + "num_tokens": 19116059.0, + "step": 3673, + "train/ce_loss": 1.1618515253067017 + }, + { + "epoch": 0.3631599762705161, + "step": 3673, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.3631599762705161, + "step": 3673, + "train/total_loss": 0.24118515849113464 + }, + { + "entropy": 9.611105918884277, + "epoch": 0.36325884912003165, + "mean_token_accuracy": 0.7008032202720642, + "num_tokens": 19120998.0, + "step": 3674, + "train/ce_loss": 1.4367402791976929 + }, + { + "epoch": 0.36325884912003165, + "step": 3674, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.36325884912003165, + "step": 3674, + "train/total_loss": 0.21398653090000153 + }, + { + "entropy": 9.03463077545166, + "epoch": 0.3633577219695472, + "mean_token_accuracy": 0.7253270149230957, + "num_tokens": 19126289.0, + "step": 3675, + "train/ce_loss": 0.9281492233276367 + }, + { + "epoch": 0.3633577219695472, + "step": 3675, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.3633577219695472, + "step": 3675, + "train/total_loss": 0.16703367233276367 + }, + { + "entropy": 9.68355655670166, + "epoch": 0.3634565948190627, + "mean_token_accuracy": 0.6813559532165527, + "num_tokens": 19131350.0, + "step": 3676, + "train/ce_loss": 1.1487598419189453 + }, + { + "epoch": 0.3634565948190627, + "step": 3676, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.3634565948190627, + "step": 3676, + "train/total_loss": 0.20081348717212677 + }, + { + "entropy": 8.875577926635742, + "epoch": 0.3635554676685782, + "mean_token_accuracy": 0.7400000095367432, + "num_tokens": 19136685.0, + "step": 3677, + "train/ce_loss": 1.3505109548568726 + }, + { + "epoch": 0.3635554676685782, + "step": 3677, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.3635554676685782, + "step": 3677, + "train/total_loss": 0.20926985144615173 + }, + { + "entropy": 9.280202865600586, + "epoch": 0.36365434051809375, + "mean_token_accuracy": 0.7714987993240356, + "num_tokens": 19141958.0, + "step": 3678, + "train/ce_loss": 1.2463619709014893 + }, + { + "epoch": 0.36365434051809375, + "step": 3678, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.36365434051809375, + "step": 3678, + "train/total_loss": 0.2066674530506134 + }, + { + "entropy": 9.189447402954102, + "epoch": 0.36375321336760924, + "mean_token_accuracy": 0.73557049036026, + "num_tokens": 19147179.0, + "step": 3679, + "train/ce_loss": 0.4812721312046051 + }, + { + "epoch": 0.36375321336760924, + "step": 3679, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.36375321336760924, + "step": 3679, + "train/total_loss": 0.06375221908092499 + }, + { + "epoch": 0.3638520862171248, + "grad_norm": 0.7537246346473694, + "learning_rate": 9.092864560154281e-06, + "loss": 0.1415, + "step": 3680 + }, + { + "entropy": 8.884530067443848, + "epoch": 0.3638520862171248, + "mean_token_accuracy": 0.7246963381767273, + "num_tokens": 19152817.0, + "step": 3680, + "train/ce_loss": 0.4940720796585083 + }, + { + "epoch": 0.3638520862171248, + "step": 3680, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.3638520862171248, + "step": 3680, + "train/total_loss": 0.11190721392631531 + }, + { + "entropy": 9.291428565979004, + "epoch": 0.3639509590666403, + "mean_token_accuracy": 0.728715717792511, + "num_tokens": 19157923.0, + "step": 3681, + "train/ce_loss": 1.2066850662231445 + }, + { + "epoch": 0.3639509590666403, + "step": 3681, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.3639509590666403, + "step": 3681, + "train/total_loss": 0.15582475066184998 + }, + { + "entropy": 9.171480178833008, + "epoch": 0.3640498319161558, + "mean_token_accuracy": 0.7663981318473816, + "num_tokens": 19163236.0, + "step": 3682, + "train/ce_loss": 1.0046883821487427 + }, + { + "epoch": 0.3640498319161558, + "step": 3682, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.3640498319161558, + "step": 3682, + "train/total_loss": 0.12781259417533875 + }, + { + "entropy": 9.120622634887695, + "epoch": 0.36414870476567135, + "mean_token_accuracy": 0.7536814212799072, + "num_tokens": 19168476.0, + "step": 3683, + "train/ce_loss": 0.875571608543396 + }, + { + "epoch": 0.36414870476567135, + "step": 3683, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.36414870476567135, + "step": 3683, + "train/total_loss": 0.16568216681480408 + }, + { + "entropy": 8.828895568847656, + "epoch": 0.3642475776151869, + "mean_token_accuracy": 0.7705286741256714, + "num_tokens": 19173865.0, + "step": 3684, + "train/ce_loss": 0.8268486857414246 + }, + { + "epoch": 0.3642475776151869, + "step": 3684, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.3642475776151869, + "step": 3684, + "train/total_loss": 0.15299737453460693 + }, + { + "entropy": 9.62015438079834, + "epoch": 0.3643464504647024, + "mean_token_accuracy": 0.7137096524238586, + "num_tokens": 19178787.0, + "step": 3685, + "train/ce_loss": 0.9937043786048889 + }, + { + "epoch": 0.3643464504647024, + "step": 3685, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3643464504647024, + "step": 3685, + "train/total_loss": 0.15405794978141785 + }, + { + "entropy": 9.672927856445312, + "epoch": 0.3644453233142179, + "mean_token_accuracy": 0.7844203114509583, + "num_tokens": 19183738.0, + "step": 3686, + "train/ce_loss": 1.1332205533981323 + }, + { + "epoch": 0.3644453233142179, + "step": 3686, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.3644453233142179, + "step": 3686, + "train/total_loss": 0.22660329937934875 + }, + { + "entropy": 8.847968101501465, + "epoch": 0.36454419616373346, + "mean_token_accuracy": 0.7869177460670471, + "num_tokens": 19189270.0, + "step": 3687, + "train/ce_loss": 0.6084604859352112 + }, + { + "epoch": 0.36454419616373346, + "step": 3687, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.36454419616373346, + "step": 3687, + "train/total_loss": 0.09990854561328888 + }, + { + "entropy": 8.820528984069824, + "epoch": 0.36464306901324894, + "mean_token_accuracy": 0.7956273555755615, + "num_tokens": 19194795.0, + "step": 3688, + "train/ce_loss": 0.5975689888000488 + }, + { + "epoch": 0.36464306901324894, + "step": 3688, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.36464306901324894, + "step": 3688, + "train/total_loss": 0.12616315484046936 + }, + { + "entropy": 9.499797821044922, + "epoch": 0.3647419418627645, + "mean_token_accuracy": 0.7402032017707825, + "num_tokens": 19199912.0, + "step": 3689, + "train/ce_loss": 1.0911719799041748 + }, + { + "epoch": 0.3647419418627645, + "step": 3689, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.3647419418627645, + "step": 3689, + "train/total_loss": 0.24974220991134644 + }, + { + "entropy": 9.30494499206543, + "epoch": 0.36484081471228, + "mean_token_accuracy": 0.7166866660118103, + "num_tokens": 19205211.0, + "step": 3690, + "train/ce_loss": 0.6577334403991699 + }, + { + "epoch": 0.36484081471228, + "step": 3690, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.36484081471228, + "step": 3690, + "train/total_loss": 0.13608583807945251 + }, + { + "entropy": 8.974608421325684, + "epoch": 0.3649396875617955, + "mean_token_accuracy": 0.7792887091636658, + "num_tokens": 19210653.0, + "step": 3691, + "train/ce_loss": 0.9531930685043335 + }, + { + "epoch": 0.3649396875617955, + "step": 3691, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.3649396875617955, + "step": 3691, + "train/total_loss": 0.23203805088996887 + }, + { + "entropy": 9.016339302062988, + "epoch": 0.36503856041131105, + "mean_token_accuracy": 0.7889273166656494, + "num_tokens": 19215949.0, + "step": 3692, + "train/ce_loss": 0.6592164635658264 + }, + { + "epoch": 0.36503856041131105, + "step": 3692, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.36503856041131105, + "step": 3692, + "train/total_loss": 0.10498414933681488 + }, + { + "entropy": 8.699670791625977, + "epoch": 0.3651374332608266, + "mean_token_accuracy": 0.7269681692123413, + "num_tokens": 19221613.0, + "step": 3693, + "train/ce_loss": 0.4130028486251831 + }, + { + "epoch": 0.3651374332608266, + "step": 3693, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.3651374332608266, + "step": 3693, + "train/total_loss": 0.06473778188228607 + }, + { + "entropy": 9.287849426269531, + "epoch": 0.3652363061103421, + "mean_token_accuracy": 0.7048345804214478, + "num_tokens": 19226835.0, + "step": 3694, + "train/ce_loss": 1.6899060010910034 + }, + { + "epoch": 0.3652363061103421, + "step": 3694, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3652363061103421, + "step": 3694, + "train/total_loss": 0.2236780971288681 + }, + { + "entropy": 8.74072551727295, + "epoch": 0.3653351789598576, + "mean_token_accuracy": 0.7337883710861206, + "num_tokens": 19232182.0, + "step": 3695, + "train/ce_loss": 0.9204556941986084 + }, + { + "epoch": 0.3653351789598576, + "step": 3695, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.3653351789598576, + "step": 3695, + "train/total_loss": 0.12720182538032532 + }, + { + "entropy": 9.471048355102539, + "epoch": 0.36543405180937316, + "mean_token_accuracy": 0.7438162565231323, + "num_tokens": 19237276.0, + "step": 3696, + "train/ce_loss": 0.9572367668151855 + }, + { + "epoch": 0.36543405180937316, + "step": 3696, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.36543405180937316, + "step": 3696, + "train/total_loss": 0.1543174386024475 + }, + { + "entropy": 9.895501136779785, + "epoch": 0.36553292465888865, + "mean_token_accuracy": 0.7692307829856873, + "num_tokens": 19242142.0, + "step": 3697, + "train/ce_loss": 0.5988715291023254 + }, + { + "epoch": 0.36553292465888865, + "step": 3697, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.36553292465888865, + "step": 3697, + "train/total_loss": 0.16144965589046478 + }, + { + "entropy": 9.960525512695312, + "epoch": 0.3656317975084042, + "mean_token_accuracy": 0.6813953518867493, + "num_tokens": 19246972.0, + "step": 3698, + "train/ce_loss": 1.8187681436538696 + }, + { + "epoch": 0.3656317975084042, + "step": 3698, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.3656317975084042, + "step": 3698, + "train/total_loss": 0.24047057330608368 + }, + { + "entropy": 8.915639877319336, + "epoch": 0.36573067035791973, + "mean_token_accuracy": 0.7660332322120667, + "num_tokens": 19252320.0, + "step": 3699, + "train/ce_loss": 0.39387181401252747 + }, + { + "epoch": 0.36573067035791973, + "step": 3699, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.36573067035791973, + "step": 3699, + "train/total_loss": 0.06282468140125275 + }, + { + "epoch": 0.3658295432074352, + "grad_norm": 0.7704654335975647, + "learning_rate": 9.087919695396332e-06, + "loss": 0.141, + "step": 3700 + }, + { + "entropy": 9.320602416992188, + "epoch": 0.3658295432074352, + "mean_token_accuracy": 0.7112299203872681, + "num_tokens": 19257515.0, + "step": 3700, + "train/ce_loss": 0.9579900503158569 + }, + { + "epoch": 0.3658295432074352, + "step": 3700, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.3658295432074352, + "step": 3700, + "train/total_loss": 0.15829899907112122 + }, + { + "entropy": 9.27962875366211, + "epoch": 0.36592841605695076, + "mean_token_accuracy": 0.7882187962532043, + "num_tokens": 19262693.0, + "step": 3701, + "train/ce_loss": 0.8543885946273804 + }, + { + "epoch": 0.36592841605695076, + "step": 3701, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.36592841605695076, + "step": 3701, + "train/total_loss": 0.12450136244297028 + }, + { + "entropy": 9.562347412109375, + "epoch": 0.3660272889064663, + "mean_token_accuracy": 0.7322970628738403, + "num_tokens": 19267699.0, + "step": 3702, + "train/ce_loss": 1.6200648546218872 + }, + { + "epoch": 0.3660272889064663, + "step": 3702, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.3660272889064663, + "step": 3702, + "train/total_loss": 0.22060023248195648 + }, + { + "entropy": 9.190065383911133, + "epoch": 0.3661261617559818, + "mean_token_accuracy": 0.7539936304092407, + "num_tokens": 19272757.0, + "step": 3703, + "train/ce_loss": 0.5445184111595154 + }, + { + "epoch": 0.3661261617559818, + "step": 3703, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3661261617559818, + "step": 3703, + "train/total_loss": 0.1091393381357193 + }, + { + "entropy": 9.860889434814453, + "epoch": 0.3662250346054973, + "mean_token_accuracy": 0.8409090638160706, + "num_tokens": 19277586.0, + "step": 3704, + "train/ce_loss": 1.1505147218704224 + }, + { + "epoch": 0.3662250346054973, + "step": 3704, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.3662250346054973, + "step": 3704, + "train/total_loss": 0.1423952281475067 + }, + { + "entropy": 9.17490291595459, + "epoch": 0.36632390745501286, + "mean_token_accuracy": 0.7366504669189453, + "num_tokens": 19283053.0, + "step": 3705, + "train/ce_loss": 0.5870340466499329 + }, + { + "epoch": 0.36632390745501286, + "step": 3705, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.36632390745501286, + "step": 3705, + "train/total_loss": 0.09385965764522552 + }, + { + "entropy": 8.471872329711914, + "epoch": 0.36642278030452835, + "mean_token_accuracy": 0.6974874138832092, + "num_tokens": 19288551.0, + "step": 3706, + "train/ce_loss": 0.6062349677085876 + }, + { + "epoch": 0.36642278030452835, + "step": 3706, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.36642278030452835, + "step": 3706, + "train/total_loss": 0.14656099677085876 + }, + { + "entropy": 8.921075820922852, + "epoch": 0.3665216531540439, + "mean_token_accuracy": 0.7591792941093445, + "num_tokens": 19293922.0, + "step": 3707, + "train/ce_loss": 0.659113883972168 + }, + { + "epoch": 0.3665216531540439, + "step": 3707, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.3665216531540439, + "step": 3707, + "train/total_loss": 0.08934888988733292 + }, + { + "entropy": 9.633601188659668, + "epoch": 0.36662052600355943, + "mean_token_accuracy": 0.7111486196517944, + "num_tokens": 19298925.0, + "step": 3708, + "train/ce_loss": 1.052996039390564 + }, + { + "epoch": 0.36662052600355943, + "step": 3708, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.36662052600355943, + "step": 3708, + "train/total_loss": 0.14436210691928864 + }, + { + "entropy": 8.578943252563477, + "epoch": 0.3667193988530749, + "mean_token_accuracy": 0.7411988377571106, + "num_tokens": 19304451.0, + "step": 3709, + "train/ce_loss": 1.1131523847579956 + }, + { + "epoch": 0.3667193988530749, + "step": 3709, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.3667193988530749, + "step": 3709, + "train/total_loss": 0.2519402503967285 + }, + { + "entropy": 9.549026489257812, + "epoch": 0.36681827170259046, + "mean_token_accuracy": 0.7397769689559937, + "num_tokens": 19309414.0, + "step": 3710, + "train/ce_loss": 1.154641032218933 + }, + { + "epoch": 0.36681827170259046, + "step": 3710, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.36681827170259046, + "step": 3710, + "train/total_loss": 0.16624535620212555 + }, + { + "entropy": 9.582513809204102, + "epoch": 0.366917144552106, + "mean_token_accuracy": 0.757785439491272, + "num_tokens": 19314484.0, + "step": 3711, + "train/ce_loss": 0.8000929951667786 + }, + { + "epoch": 0.366917144552106, + "step": 3711, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.366917144552106, + "step": 3711, + "train/total_loss": 0.1346968114376068 + }, + { + "entropy": 9.365667343139648, + "epoch": 0.36701601740162154, + "mean_token_accuracy": 0.7093185186386108, + "num_tokens": 19319616.0, + "step": 3712, + "train/ce_loss": 0.8726776242256165 + }, + { + "epoch": 0.36701601740162154, + "step": 3712, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.36701601740162154, + "step": 3712, + "train/total_loss": 0.13414275646209717 + }, + { + "entropy": 8.654797554016113, + "epoch": 0.367114890251137, + "mean_token_accuracy": 0.7637088894844055, + "num_tokens": 19325105.0, + "step": 3713, + "train/ce_loss": 0.8119332194328308 + }, + { + "epoch": 0.367114890251137, + "step": 3713, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.367114890251137, + "step": 3713, + "train/total_loss": 0.12806832790374756 + }, + { + "entropy": 9.25880241394043, + "epoch": 0.36721376310065257, + "mean_token_accuracy": 0.7027741074562073, + "num_tokens": 19330350.0, + "step": 3714, + "train/ce_loss": 1.0818811655044556 + }, + { + "epoch": 0.36721376310065257, + "step": 3714, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.36721376310065257, + "step": 3714, + "train/total_loss": 0.14334437251091003 + }, + { + "entropy": 9.742050170898438, + "epoch": 0.3673126359501681, + "mean_token_accuracy": 0.7057416439056396, + "num_tokens": 19335185.0, + "step": 3715, + "train/ce_loss": 1.0356760867580306e-05 + }, + { + "epoch": 0.3673126359501681, + "step": 3715, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.3673126359501681, + "step": 3715, + "train/total_loss": 0.03515728563070297 + }, + { + "entropy": 9.491357803344727, + "epoch": 0.3674115087996836, + "mean_token_accuracy": 0.8006535768508911, + "num_tokens": 19340254.0, + "step": 3716, + "train/ce_loss": 0.784355878829956 + }, + { + "epoch": 0.3674115087996836, + "step": 3716, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.3674115087996836, + "step": 3716, + "train/total_loss": 0.11749809235334396 + }, + { + "entropy": 9.089653968811035, + "epoch": 0.36751038164919914, + "mean_token_accuracy": 0.7354037165641785, + "num_tokens": 19345497.0, + "step": 3717, + "train/ce_loss": 0.49602407217025757 + }, + { + "epoch": 0.36751038164919914, + "step": 3717, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.36751038164919914, + "step": 3717, + "train/total_loss": 0.12382115423679352 + }, + { + "entropy": 9.133569717407227, + "epoch": 0.3676092544987147, + "mean_token_accuracy": 0.7141134142875671, + "num_tokens": 19350770.0, + "step": 3718, + "train/ce_loss": 1.0562591552734375 + }, + { + "epoch": 0.3676092544987147, + "step": 3718, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.3676092544987147, + "step": 3718, + "train/total_loss": 0.1759384274482727 + }, + { + "entropy": 9.450826644897461, + "epoch": 0.36770812734823016, + "mean_token_accuracy": 0.7879282236099243, + "num_tokens": 19355863.0, + "step": 3719, + "train/ce_loss": 7.788343282300048e-06 + }, + { + "epoch": 0.36770812734823016, + "step": 3719, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.36770812734823016, + "step": 3719, + "train/total_loss": 0.06640703231096268 + }, + { + "epoch": 0.3678070001977457, + "grad_norm": 0.6311770677566528, + "learning_rate": 9.082974830638382e-06, + "loss": 0.1422, + "step": 3720 + }, + { + "entropy": 8.871721267700195, + "epoch": 0.3678070001977457, + "mean_token_accuracy": 0.7399380803108215, + "num_tokens": 19361335.0, + "step": 3720, + "train/ce_loss": 0.5923374891281128 + }, + { + "epoch": 0.3678070001977457, + "step": 3720, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3678070001977457, + "step": 3720, + "train/total_loss": 0.10610875487327576 + }, + { + "entropy": 9.379371643066406, + "epoch": 0.36790587304726124, + "mean_token_accuracy": 0.7760252356529236, + "num_tokens": 19366365.0, + "step": 3721, + "train/ce_loss": 3.045043285965221e-06 + }, + { + "epoch": 0.36790587304726124, + "step": 3721, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.36790587304726124, + "step": 3721, + "train/total_loss": 0.058594055473804474 + }, + { + "entropy": 8.701794624328613, + "epoch": 0.36800474589677673, + "mean_token_accuracy": 0.7163197994232178, + "num_tokens": 19371750.0, + "step": 3722, + "train/ce_loss": 1.0332472324371338 + }, + { + "epoch": 0.36800474589677673, + "step": 3722, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.36800474589677673, + "step": 3722, + "train/total_loss": 0.13457472622394562 + }, + { + "entropy": 9.15493392944336, + "epoch": 0.36810361874629227, + "mean_token_accuracy": 0.7419354915618896, + "num_tokens": 19377018.0, + "step": 3723, + "train/ce_loss": 0.6793240308761597 + }, + { + "epoch": 0.36810361874629227, + "step": 3723, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.36810361874629227, + "step": 3723, + "train/total_loss": 0.08355740457773209 + }, + { + "entropy": 9.095837593078613, + "epoch": 0.3682024915958078, + "mean_token_accuracy": 0.7668463587760925, + "num_tokens": 19382280.0, + "step": 3724, + "train/ce_loss": 3.6239430301066022e-06 + }, + { + "epoch": 0.3682024915958078, + "step": 3724, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.3682024915958078, + "step": 3724, + "train/total_loss": 0.06250036507844925 + }, + { + "entropy": 9.607484817504883, + "epoch": 0.3683013644453233, + "mean_token_accuracy": 0.7284403443336487, + "num_tokens": 19387237.0, + "step": 3725, + "train/ce_loss": 1.377042531967163 + }, + { + "epoch": 0.3683013644453233, + "step": 3725, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3683013644453233, + "step": 3725, + "train/total_loss": 0.1923917531967163 + }, + { + "entropy": 9.639066696166992, + "epoch": 0.36840023729483884, + "mean_token_accuracy": 0.686274528503418, + "num_tokens": 19392196.0, + "step": 3726, + "train/ce_loss": 4.535922016657423e-06 + }, + { + "epoch": 0.36840023729483884, + "step": 3726, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.36840023729483884, + "step": 3726, + "train/total_loss": 0.05859420448541641 + }, + { + "entropy": 8.723943710327148, + "epoch": 0.3684991101443544, + "mean_token_accuracy": 0.7697993516921997, + "num_tokens": 19397686.0, + "step": 3727, + "train/ce_loss": 0.8708702325820923 + }, + { + "epoch": 0.3684991101443544, + "step": 3727, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.3684991101443544, + "step": 3727, + "train/total_loss": 0.14568078517913818 + }, + { + "entropy": 9.060258865356445, + "epoch": 0.36859798299386987, + "mean_token_accuracy": 0.7626112699508667, + "num_tokens": 19402807.0, + "step": 3728, + "train/ce_loss": 1.1802632808685303 + }, + { + "epoch": 0.36859798299386987, + "step": 3728, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.36859798299386987, + "step": 3728, + "train/total_loss": 0.18052633106708527 + }, + { + "entropy": 9.597804069519043, + "epoch": 0.3686968558433854, + "mean_token_accuracy": 0.7648351788520813, + "num_tokens": 19407668.0, + "step": 3729, + "train/ce_loss": 1.4211921691894531 + }, + { + "epoch": 0.3686968558433854, + "step": 3729, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.3686968558433854, + "step": 3729, + "train/total_loss": 0.18508796393871307 + }, + { + "entropy": 9.590475082397461, + "epoch": 0.36879572869290095, + "mean_token_accuracy": 0.7209677696228027, + "num_tokens": 19412752.0, + "step": 3730, + "train/ce_loss": 0.8765764832496643 + }, + { + "epoch": 0.36879572869290095, + "step": 3730, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.36879572869290095, + "step": 3730, + "train/total_loss": 0.11890765279531479 + }, + { + "entropy": 9.332487106323242, + "epoch": 0.36889460154241643, + "mean_token_accuracy": 0.8088012337684631, + "num_tokens": 19417831.0, + "step": 3731, + "train/ce_loss": 6.978231340326602e-06 + }, + { + "epoch": 0.36889460154241643, + "step": 3731, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.36889460154241643, + "step": 3731, + "train/total_loss": 0.05078194662928581 + }, + { + "entropy": 9.907342910766602, + "epoch": 0.368993474391932, + "mean_token_accuracy": 0.7770270109176636, + "num_tokens": 19422694.0, + "step": 3732, + "train/ce_loss": 1.3829725980758667 + }, + { + "epoch": 0.368993474391932, + "step": 3732, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.368993474391932, + "step": 3732, + "train/total_loss": 0.16954725980758667 + }, + { + "entropy": 8.675329208374023, + "epoch": 0.3690923472414475, + "mean_token_accuracy": 0.769011378288269, + "num_tokens": 19428250.0, + "step": 3733, + "train/ce_loss": 1.1538902521133423 + }, + { + "epoch": 0.3690923472414475, + "step": 3733, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.3690923472414475, + "step": 3733, + "train/total_loss": 0.18179526925086975 + }, + { + "entropy": 8.950559616088867, + "epoch": 0.369191220090963, + "mean_token_accuracy": 0.733031690120697, + "num_tokens": 19433631.0, + "step": 3734, + "train/ce_loss": 1.0515861511230469 + }, + { + "epoch": 0.369191220090963, + "step": 3734, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.369191220090963, + "step": 3734, + "train/total_loss": 0.15203362703323364 + }, + { + "entropy": 8.948648452758789, + "epoch": 0.36929009294047854, + "mean_token_accuracy": 0.7572254538536072, + "num_tokens": 19438924.0, + "step": 3735, + "train/ce_loss": 0.8543544411659241 + }, + { + "epoch": 0.36929009294047854, + "step": 3735, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.36929009294047854, + "step": 3735, + "train/total_loss": 0.17137295007705688 + }, + { + "entropy": 9.056068420410156, + "epoch": 0.3693889657899941, + "mean_token_accuracy": 0.7157894968986511, + "num_tokens": 19444153.0, + "step": 3736, + "train/ce_loss": 0.5833434462547302 + }, + { + "epoch": 0.3693889657899941, + "step": 3736, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.3693889657899941, + "step": 3736, + "train/total_loss": 0.1286468505859375 + }, + { + "entropy": 9.409212112426758, + "epoch": 0.36948783863950957, + "mean_token_accuracy": 0.739130437374115, + "num_tokens": 19449169.0, + "step": 3737, + "train/ce_loss": 1.196805715560913 + }, + { + "epoch": 0.36948783863950957, + "step": 3737, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.36948783863950957, + "step": 3737, + "train/total_loss": 0.17827433347702026 + }, + { + "entropy": 9.145063400268555, + "epoch": 0.3695867114890251, + "mean_token_accuracy": 0.7054263353347778, + "num_tokens": 19454369.0, + "step": 3738, + "train/ce_loss": 0.6760798096656799 + }, + { + "epoch": 0.3695867114890251, + "step": 3738, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.3695867114890251, + "step": 3738, + "train/total_loss": 0.13792048394680023 + }, + { + "entropy": 8.771507263183594, + "epoch": 0.36968558433854065, + "mean_token_accuracy": 0.7543054223060608, + "num_tokens": 19459770.0, + "step": 3739, + "train/ce_loss": 1.1303457021713257 + }, + { + "epoch": 0.36968558433854065, + "step": 3739, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.36968558433854065, + "step": 3739, + "train/total_loss": 0.16772207617759705 + }, + { + "epoch": 0.36978445718805614, + "grad_norm": 0.6994771361351013, + "learning_rate": 9.078029965880434e-06, + "loss": 0.1386, + "step": 3740 + }, + { + "entropy": 9.018630027770996, + "epoch": 0.36978445718805614, + "mean_token_accuracy": 0.7392900586128235, + "num_tokens": 19465066.0, + "step": 3740, + "train/ce_loss": 0.9878615736961365 + }, + { + "epoch": 0.36978445718805614, + "step": 3740, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.36978445718805614, + "step": 3740, + "train/total_loss": 0.1651924103498459 + }, + { + "entropy": 9.008108139038086, + "epoch": 0.3698833300375717, + "mean_token_accuracy": 0.7908979058265686, + "num_tokens": 19470352.0, + "step": 3741, + "train/ce_loss": 0.7387509346008301 + }, + { + "epoch": 0.3698833300375717, + "step": 3741, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.3698833300375717, + "step": 3741, + "train/total_loss": 0.09731259196996689 + }, + { + "entropy": 9.422561645507812, + "epoch": 0.3699822028870872, + "mean_token_accuracy": 0.7617260813713074, + "num_tokens": 19475356.0, + "step": 3742, + "train/ce_loss": 2.9977215945109492e-06 + }, + { + "epoch": 0.3699822028870872, + "step": 3742, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.3699822028870872, + "step": 3742, + "train/total_loss": 0.06250029802322388 + }, + { + "entropy": 9.204683303833008, + "epoch": 0.3700810757366027, + "mean_token_accuracy": 0.7597222328186035, + "num_tokens": 19480550.0, + "step": 3743, + "train/ce_loss": 3.021031261596363e-06 + }, + { + "epoch": 0.3700810757366027, + "step": 3743, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.3700810757366027, + "step": 3743, + "train/total_loss": 0.07421905547380447 + }, + { + "entropy": 9.255865097045898, + "epoch": 0.37017994858611825, + "mean_token_accuracy": 0.7313432693481445, + "num_tokens": 19485791.0, + "step": 3744, + "train/ce_loss": 1.0754189491271973 + }, + { + "epoch": 0.37017994858611825, + "step": 3744, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.37017994858611825, + "step": 3744, + "train/total_loss": 0.18176063895225525 + }, + { + "entropy": 8.607366561889648, + "epoch": 0.3702788214356338, + "mean_token_accuracy": 0.7419354915618896, + "num_tokens": 19491292.0, + "step": 3745, + "train/ce_loss": 0.5163165926933289 + }, + { + "epoch": 0.3702788214356338, + "step": 3745, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.3702788214356338, + "step": 3745, + "train/total_loss": 0.13756915926933289 + }, + { + "entropy": 9.31511116027832, + "epoch": 0.3703776942851493, + "mean_token_accuracy": 0.7166947722434998, + "num_tokens": 19496345.0, + "step": 3746, + "train/ce_loss": 2.6260327103955206e-06 + }, + { + "epoch": 0.3703776942851493, + "step": 3746, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.3703776942851493, + "step": 3746, + "train/total_loss": 0.03125026077032089 + }, + { + "entropy": 8.843770980834961, + "epoch": 0.3704765671346648, + "mean_token_accuracy": 0.7981220483779907, + "num_tokens": 19501675.0, + "step": 3747, + "train/ce_loss": 0.416376531124115 + }, + { + "epoch": 0.3704765671346648, + "step": 3747, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.3704765671346648, + "step": 3747, + "train/total_loss": 0.10804390907287598 + }, + { + "entropy": 8.891765594482422, + "epoch": 0.37057543998418035, + "mean_token_accuracy": 0.6957123279571533, + "num_tokens": 19506866.0, + "step": 3748, + "train/ce_loss": 1.1417073011398315 + }, + { + "epoch": 0.37057543998418035, + "step": 3748, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.37057543998418035, + "step": 3748, + "train/total_loss": 0.21573323011398315 + }, + { + "entropy": 8.899946212768555, + "epoch": 0.37067431283369584, + "mean_token_accuracy": 0.7761732935905457, + "num_tokens": 19512168.0, + "step": 3749, + "train/ce_loss": 0.7011145949363708 + }, + { + "epoch": 0.37067431283369584, + "step": 3749, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.37067431283369584, + "step": 3749, + "train/total_loss": 0.1208927109837532 + }, + { + "entropy": 8.65135383605957, + "epoch": 0.3707731856832114, + "mean_token_accuracy": 0.7608225345611572, + "num_tokens": 19517536.0, + "step": 3750, + "train/ce_loss": 0.9012789130210876 + }, + { + "epoch": 0.3707731856832114, + "step": 3750, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.3707731856832114, + "step": 3750, + "train/total_loss": 0.1565341353416443 + }, + { + "entropy": 8.757684707641602, + "epoch": 0.3708720585327269, + "mean_token_accuracy": 0.6804009079933167, + "num_tokens": 19522891.0, + "step": 3751, + "train/ce_loss": 0.7830634117126465 + }, + { + "epoch": 0.3708720585327269, + "step": 3751, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.3708720585327269, + "step": 3751, + "train/total_loss": 0.12127508968114853 + }, + { + "entropy": 8.842325210571289, + "epoch": 0.37097093138224246, + "mean_token_accuracy": 0.7467455863952637, + "num_tokens": 19528197.0, + "step": 3752, + "train/ce_loss": 0.8398581743240356 + }, + { + "epoch": 0.37097093138224246, + "step": 3752, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.37097093138224246, + "step": 3752, + "train/total_loss": 0.1738295704126358 + }, + { + "entropy": 9.236410140991211, + "epoch": 0.37106980423175795, + "mean_token_accuracy": 0.8135592937469482, + "num_tokens": 19533245.0, + "step": 3753, + "train/ce_loss": 0.872316837310791 + }, + { + "epoch": 0.37106980423175795, + "step": 3753, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.37106980423175795, + "step": 3753, + "train/total_loss": 0.13801294565200806 + }, + { + "entropy": 8.678380966186523, + "epoch": 0.3711686770812735, + "mean_token_accuracy": 0.7435367107391357, + "num_tokens": 19538721.0, + "step": 3754, + "train/ce_loss": 1.0098522901535034 + }, + { + "epoch": 0.3711686770812735, + "step": 3754, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.3711686770812735, + "step": 3754, + "train/total_loss": 0.18692272901535034 + }, + { + "entropy": 9.058197975158691, + "epoch": 0.37126754993078903, + "mean_token_accuracy": 0.7084870934486389, + "num_tokens": 19544029.0, + "step": 3755, + "train/ce_loss": 1.0240709781646729 + }, + { + "epoch": 0.37126754993078903, + "step": 3755, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.37126754993078903, + "step": 3755, + "train/total_loss": 0.15318834781646729 + }, + { + "entropy": 9.473255157470703, + "epoch": 0.3713664227803045, + "mean_token_accuracy": 0.7191780805587769, + "num_tokens": 19549017.0, + "step": 3756, + "train/ce_loss": 1.2590043544769287 + }, + { + "epoch": 0.3713664227803045, + "step": 3756, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.3713664227803045, + "step": 3756, + "train/total_loss": 0.18840043246746063 + }, + { + "entropy": 9.128877639770508, + "epoch": 0.37146529562982006, + "mean_token_accuracy": 0.7311139702796936, + "num_tokens": 19554241.0, + "step": 3757, + "train/ce_loss": 0.7337197661399841 + }, + { + "epoch": 0.37146529562982006, + "step": 3757, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.37146529562982006, + "step": 3757, + "train/total_loss": 0.1319657266139984 + }, + { + "entropy": 9.343884468078613, + "epoch": 0.3715641684793356, + "mean_token_accuracy": 0.7074722051620483, + "num_tokens": 19559295.0, + "step": 3758, + "train/ce_loss": 1.0605524778366089 + }, + { + "epoch": 0.3715641684793356, + "step": 3758, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.3715641684793356, + "step": 3758, + "train/total_loss": 0.18808650970458984 + }, + { + "entropy": 9.300300598144531, + "epoch": 0.3716630413288511, + "mean_token_accuracy": 0.7159686088562012, + "num_tokens": 19564515.0, + "step": 3759, + "train/ce_loss": 2.6648469884094084e-06 + }, + { + "epoch": 0.3716630413288511, + "step": 3759, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.3716630413288511, + "step": 3759, + "train/total_loss": 0.03906276822090149 + }, + { + "epoch": 0.3717619141783666, + "grad_norm": 0.9004268646240234, + "learning_rate": 9.073085101122485e-06, + "loss": 0.1443, + "step": 3760 + }, + { + "entropy": 9.299562454223633, + "epoch": 0.3717619141783666, + "mean_token_accuracy": 0.7503268122673035, + "num_tokens": 19569737.0, + "step": 3760, + "train/ce_loss": 1.3224796056747437 + }, + { + "epoch": 0.3717619141783666, + "step": 3760, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.3717619141783666, + "step": 3760, + "train/total_loss": 0.20256046950817108 + }, + { + "entropy": 8.562389373779297, + "epoch": 0.37186078702788217, + "mean_token_accuracy": 0.732083797454834, + "num_tokens": 19575171.0, + "step": 3761, + "train/ce_loss": 0.6876528263092041 + }, + { + "epoch": 0.37186078702788217, + "step": 3761, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.37186078702788217, + "step": 3761, + "train/total_loss": 0.08439028263092041 + }, + { + "entropy": 9.407758712768555, + "epoch": 0.37195965987739765, + "mean_token_accuracy": 0.7322946190834045, + "num_tokens": 19580292.0, + "step": 3762, + "train/ce_loss": 0.7906780242919922 + }, + { + "epoch": 0.37195965987739765, + "step": 3762, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.37195965987739765, + "step": 3762, + "train/total_loss": 0.09078655391931534 + }, + { + "entropy": 9.819665908813477, + "epoch": 0.3720585327269132, + "mean_token_accuracy": 0.7414187788963318, + "num_tokens": 19585112.0, + "step": 3763, + "train/ce_loss": 1.981115698814392 + }, + { + "epoch": 0.3720585327269132, + "step": 3763, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.3720585327269132, + "step": 3763, + "train/total_loss": 0.2723303437232971 + }, + { + "entropy": 8.616861343383789, + "epoch": 0.37215740557642873, + "mean_token_accuracy": 0.726685106754303, + "num_tokens": 19590774.0, + "step": 3764, + "train/ce_loss": 1.3146535158157349 + }, + { + "epoch": 0.37215740557642873, + "step": 3764, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.37215740557642873, + "step": 3764, + "train/total_loss": 0.1978716105222702 + }, + { + "entropy": 9.059759140014648, + "epoch": 0.3722562784259442, + "mean_token_accuracy": 0.7260115742683411, + "num_tokens": 19596094.0, + "step": 3765, + "train/ce_loss": 0.7476335167884827 + }, + { + "epoch": 0.3722562784259442, + "step": 3765, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.3722562784259442, + "step": 3765, + "train/total_loss": 0.16460710763931274 + }, + { + "entropy": 9.937812805175781, + "epoch": 0.37235515127545976, + "mean_token_accuracy": 0.7248677015304565, + "num_tokens": 19600862.0, + "step": 3766, + "train/ce_loss": 2.5392666884727078e-06 + }, + { + "epoch": 0.37235515127545976, + "step": 3766, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.37235515127545976, + "step": 3766, + "train/total_loss": 0.027344003319740295 + }, + { + "entropy": 9.089120864868164, + "epoch": 0.3724540241249753, + "mean_token_accuracy": 0.7475177049636841, + "num_tokens": 19606050.0, + "step": 3767, + "train/ce_loss": 1.9539186268957565e-06 + }, + { + "epoch": 0.3724540241249753, + "step": 3767, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.3724540241249753, + "step": 3767, + "train/total_loss": 0.08593769371509552 + }, + { + "entropy": 8.93012809753418, + "epoch": 0.3725528969744908, + "mean_token_accuracy": 0.7232635021209717, + "num_tokens": 19611421.0, + "step": 3768, + "train/ce_loss": 0.8828412890434265 + }, + { + "epoch": 0.3725528969744908, + "step": 3768, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.3725528969744908, + "step": 3768, + "train/total_loss": 0.20937788486480713 + }, + { + "entropy": 8.709407806396484, + "epoch": 0.37265176982400633, + "mean_token_accuracy": 0.7229524850845337, + "num_tokens": 19616908.0, + "step": 3769, + "train/ce_loss": 0.7235414981842041 + }, + { + "epoch": 0.37265176982400633, + "step": 3769, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.37265176982400633, + "step": 3769, + "train/total_loss": 0.12313540279865265 + }, + { + "entropy": 8.594377517700195, + "epoch": 0.37275064267352187, + "mean_token_accuracy": 0.7008032202720642, + "num_tokens": 19622364.0, + "step": 3770, + "train/ce_loss": 0.5381051898002625 + }, + { + "epoch": 0.37275064267352187, + "step": 3770, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.37275064267352187, + "step": 3770, + "train/total_loss": 0.08115427196025848 + }, + { + "entropy": 8.811583518981934, + "epoch": 0.37284951552303736, + "mean_token_accuracy": 0.7488636374473572, + "num_tokens": 19627722.0, + "step": 3771, + "train/ce_loss": 1.0054184198379517 + }, + { + "epoch": 0.37284951552303736, + "step": 3771, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.37284951552303736, + "step": 3771, + "train/total_loss": 0.1474168449640274 + }, + { + "entropy": 8.790473937988281, + "epoch": 0.3729483883725529, + "mean_token_accuracy": 0.7562437653541565, + "num_tokens": 19633180.0, + "step": 3772, + "train/ce_loss": 0.5944498181343079 + }, + { + "epoch": 0.3729483883725529, + "step": 3772, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.3729483883725529, + "step": 3772, + "train/total_loss": 0.14538247883319855 + }, + { + "entropy": 10.097342491149902, + "epoch": 0.37304726122206844, + "mean_token_accuracy": 0.7429718971252441, + "num_tokens": 19637830.0, + "step": 3773, + "train/ce_loss": 7.948490747367032e-06 + }, + { + "epoch": 0.37304726122206844, + "step": 3773, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.37304726122206844, + "step": 3773, + "train/total_loss": 0.04687579348683357 + }, + { + "entropy": 9.866277694702148, + "epoch": 0.3731461340715839, + "mean_token_accuracy": 0.7611607313156128, + "num_tokens": 19642660.0, + "step": 3774, + "train/ce_loss": 2.121217903550132e-06 + }, + { + "epoch": 0.3731461340715839, + "step": 3774, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.3731461340715839, + "step": 3774, + "train/total_loss": 0.019531462341547012 + }, + { + "entropy": 8.923385620117188, + "epoch": 0.37324500692109946, + "mean_token_accuracy": 0.7015834450721741, + "num_tokens": 19647911.0, + "step": 3775, + "train/ce_loss": 0.6671419143676758 + }, + { + "epoch": 0.37324500692109946, + "step": 3775, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.37324500692109946, + "step": 3775, + "train/total_loss": 0.15265169739723206 + }, + { + "entropy": 9.472021102905273, + "epoch": 0.373343879770615, + "mean_token_accuracy": 0.7642405033111572, + "num_tokens": 19652943.0, + "step": 3776, + "train/ce_loss": 2.8804249723179964e-06 + }, + { + "epoch": 0.373343879770615, + "step": 3776, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.373343879770615, + "step": 3776, + "train/total_loss": 0.05078153684735298 + }, + { + "entropy": 8.827619552612305, + "epoch": 0.3734427526201305, + "mean_token_accuracy": 0.6972375512123108, + "num_tokens": 19658312.0, + "step": 3777, + "train/ce_loss": 1.2203423976898193 + }, + { + "epoch": 0.3734427526201305, + "step": 3777, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.3734427526201305, + "step": 3777, + "train/total_loss": 0.1728155016899109 + }, + { + "entropy": 9.172332763671875, + "epoch": 0.37354162546964603, + "mean_token_accuracy": 0.7862694263458252, + "num_tokens": 19663559.0, + "step": 3778, + "train/ce_loss": 0.989366888999939 + }, + { + "epoch": 0.37354162546964603, + "step": 3778, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.37354162546964603, + "step": 3778, + "train/total_loss": 0.18096794188022614 + }, + { + "entropy": 8.897441864013672, + "epoch": 0.3736404983191616, + "mean_token_accuracy": 0.7415485382080078, + "num_tokens": 19668924.0, + "step": 3779, + "train/ce_loss": 0.6184482574462891 + }, + { + "epoch": 0.3736404983191616, + "step": 3779, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.3736404983191616, + "step": 3779, + "train/total_loss": 0.0852823257446289 + }, + { + "epoch": 0.37373937116867706, + "grad_norm": 0.7398377060890198, + "learning_rate": 9.068140236364537e-06, + "loss": 0.1467, + "step": 3780 + }, + { + "entropy": 8.876144409179688, + "epoch": 0.37373937116867706, + "mean_token_accuracy": 0.7533936500549316, + "num_tokens": 19674200.0, + "step": 3780, + "train/ce_loss": 0.6192217469215393 + }, + { + "epoch": 0.37373937116867706, + "step": 3780, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.37373937116867706, + "step": 3780, + "train/total_loss": 0.12051592767238617 + }, + { + "entropy": 9.752557754516602, + "epoch": 0.3738382440181926, + "mean_token_accuracy": 0.7612524628639221, + "num_tokens": 19679089.0, + "step": 3781, + "train/ce_loss": 1.2465006113052368 + }, + { + "epoch": 0.3738382440181926, + "step": 3781, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3738382440181926, + "step": 3781, + "train/total_loss": 0.17933756113052368 + }, + { + "entropy": 8.792906761169434, + "epoch": 0.37393711686770814, + "mean_token_accuracy": 0.734468936920166, + "num_tokens": 19684562.0, + "step": 3782, + "train/ce_loss": 0.5887306928634644 + }, + { + "epoch": 0.37393711686770814, + "step": 3782, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.37393711686770814, + "step": 3782, + "train/total_loss": 0.08231057226657867 + }, + { + "entropy": 9.0285005569458, + "epoch": 0.3740359897172236, + "mean_token_accuracy": 0.7572916746139526, + "num_tokens": 19689998.0, + "step": 3783, + "train/ce_loss": 1.0398918390274048 + }, + { + "epoch": 0.3740359897172236, + "step": 3783, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.3740359897172236, + "step": 3783, + "train/total_loss": 0.17430168390274048 + }, + { + "entropy": 8.794363021850586, + "epoch": 0.37413486256673917, + "mean_token_accuracy": 0.7221134901046753, + "num_tokens": 19695471.0, + "step": 3784, + "train/ce_loss": 0.9877570271492004 + }, + { + "epoch": 0.37413486256673917, + "step": 3784, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.37413486256673917, + "step": 3784, + "train/total_loss": 0.184713214635849 + }, + { + "entropy": 9.161565780639648, + "epoch": 0.3742337354162547, + "mean_token_accuracy": 0.6902777552604675, + "num_tokens": 19700659.0, + "step": 3785, + "train/ce_loss": 0.7769063115119934 + }, + { + "epoch": 0.3742337354162547, + "step": 3785, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.3742337354162547, + "step": 3785, + "train/total_loss": 0.15190938115119934 + }, + { + "entropy": 9.485732078552246, + "epoch": 0.3743326082657702, + "mean_token_accuracy": 0.8172231912612915, + "num_tokens": 19705675.0, + "step": 3786, + "train/ce_loss": 3.4305571716686245e-06 + }, + { + "epoch": 0.3743326082657702, + "step": 3786, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.3743326082657702, + "step": 3786, + "train/total_loss": 0.06640659272670746 + }, + { + "entropy": 9.806770324707031, + "epoch": 0.37443148111528574, + "mean_token_accuracy": 0.6660447716712952, + "num_tokens": 19710646.0, + "step": 3787, + "train/ce_loss": 1.9555909633636475 + }, + { + "epoch": 0.37443148111528574, + "step": 3787, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.37443148111528574, + "step": 3787, + "train/total_loss": 0.2580590844154358 + }, + { + "entropy": 9.27357006072998, + "epoch": 0.3745303539648013, + "mean_token_accuracy": 0.7506361603736877, + "num_tokens": 19715895.0, + "step": 3788, + "train/ce_loss": 0.4121975004673004 + }, + { + "epoch": 0.3745303539648013, + "step": 3788, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.3745303539648013, + "step": 3788, + "train/total_loss": 0.09200100600719452 + }, + { + "entropy": 8.91637897491455, + "epoch": 0.37462922681431676, + "mean_token_accuracy": 0.8262910842895508, + "num_tokens": 19721253.0, + "step": 3789, + "train/ce_loss": 0.73322993516922 + }, + { + "epoch": 0.37462922681431676, + "step": 3789, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.37462922681431676, + "step": 3789, + "train/total_loss": 0.13972924649715424 + }, + { + "entropy": 9.351829528808594, + "epoch": 0.3747280996638323, + "mean_token_accuracy": 0.722520112991333, + "num_tokens": 19726465.0, + "step": 3790, + "train/ce_loss": 1.9493904801493045e-06 + }, + { + "epoch": 0.3747280996638323, + "step": 3790, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.3747280996638323, + "step": 3790, + "train/total_loss": 0.04296894371509552 + }, + { + "entropy": 9.415502548217773, + "epoch": 0.37482697251334784, + "mean_token_accuracy": 0.7718120813369751, + "num_tokens": 19731496.0, + "step": 3791, + "train/ce_loss": 0.9651876091957092 + }, + { + "epoch": 0.37482697251334784, + "step": 3791, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.37482697251334784, + "step": 3791, + "train/total_loss": 0.15901875495910645 + }, + { + "entropy": 9.059098243713379, + "epoch": 0.37492584536286333, + "mean_token_accuracy": 0.7314578294754028, + "num_tokens": 19736796.0, + "step": 3792, + "train/ce_loss": 0.6716099381446838 + }, + { + "epoch": 0.37492584536286333, + "step": 3792, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.37492584536286333, + "step": 3792, + "train/total_loss": 0.11794224381446838 + }, + { + "entropy": 9.588038444519043, + "epoch": 0.37502471821237887, + "mean_token_accuracy": 0.7354085445404053, + "num_tokens": 19741696.0, + "step": 3793, + "train/ce_loss": 1.068302035331726 + }, + { + "epoch": 0.37502471821237887, + "step": 3793, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.37502471821237887, + "step": 3793, + "train/total_loss": 0.14589270949363708 + }, + { + "entropy": 8.875151634216309, + "epoch": 0.3751235910618944, + "mean_token_accuracy": 0.703201949596405, + "num_tokens": 19747127.0, + "step": 3794, + "train/ce_loss": 0.8356339931488037 + }, + { + "epoch": 0.3751235910618944, + "step": 3794, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.3751235910618944, + "step": 3794, + "train/total_loss": 0.1929384022951126 + }, + { + "entropy": 9.004408836364746, + "epoch": 0.37522246391140995, + "mean_token_accuracy": 0.6998950839042664, + "num_tokens": 19752553.0, + "step": 3795, + "train/ce_loss": 0.7078590989112854 + }, + { + "epoch": 0.37522246391140995, + "step": 3795, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.37522246391140995, + "step": 3795, + "train/total_loss": 0.10984840989112854 + }, + { + "entropy": 8.763167381286621, + "epoch": 0.37532133676092544, + "mean_token_accuracy": 0.7606635093688965, + "num_tokens": 19757833.0, + "step": 3796, + "train/ce_loss": 0.671595573425293 + }, + { + "epoch": 0.37532133676092544, + "step": 3796, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.37532133676092544, + "step": 3796, + "train/total_loss": 0.10231580585241318 + }, + { + "entropy": 8.933568954467773, + "epoch": 0.375420209610441, + "mean_token_accuracy": 0.801369845867157, + "num_tokens": 19763197.0, + "step": 3797, + "train/ce_loss": 0.5379621386528015 + }, + { + "epoch": 0.375420209610441, + "step": 3797, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.375420209610441, + "step": 3797, + "train/total_loss": 0.11238996684551239 + }, + { + "entropy": 9.240084648132324, + "epoch": 0.3755190824599565, + "mean_token_accuracy": 0.7153945565223694, + "num_tokens": 19768420.0, + "step": 3798, + "train/ce_loss": 0.882220447063446 + }, + { + "epoch": 0.3755190824599565, + "step": 3798, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.3755190824599565, + "step": 3798, + "train/total_loss": 0.13900330662727356 + }, + { + "entropy": 9.321863174438477, + "epoch": 0.375617955309472, + "mean_token_accuracy": 0.7720994353294373, + "num_tokens": 19773583.0, + "step": 3799, + "train/ce_loss": 1.9414674170548096e-06 + }, + { + "epoch": 0.375617955309472, + "step": 3799, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.375617955309472, + "step": 3799, + "train/total_loss": 0.05468769371509552 + }, + { + "epoch": 0.37571682815898755, + "grad_norm": 0.704355001449585, + "learning_rate": 9.063195371606588e-06, + "loss": 0.1405, + "step": 3800 + }, + { + "entropy": 9.065167427062988, + "epoch": 0.37571682815898755, + "mean_token_accuracy": 0.7735849022865295, + "num_tokens": 19778870.0, + "step": 3800, + "train/ce_loss": 0.713546872138977 + }, + { + "epoch": 0.37571682815898755, + "step": 3800, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.37571682815898755, + "step": 3800, + "train/total_loss": 0.1104171872138977 + }, + { + "entropy": 9.010021209716797, + "epoch": 0.3758157010085031, + "mean_token_accuracy": 0.7234273552894592, + "num_tokens": 19784226.0, + "step": 3801, + "train/ce_loss": 0.9238497614860535 + }, + { + "epoch": 0.3758157010085031, + "step": 3801, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.3758157010085031, + "step": 3801, + "train/total_loss": 0.11972872912883759 + }, + { + "entropy": 9.002856254577637, + "epoch": 0.3759145738580186, + "mean_token_accuracy": 0.7265822887420654, + "num_tokens": 19789545.0, + "step": 3802, + "train/ce_loss": 1.0768572092056274 + }, + { + "epoch": 0.3759145738580186, + "step": 3802, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.3759145738580186, + "step": 3802, + "train/total_loss": 0.20924821496009827 + }, + { + "entropy": 9.009138107299805, + "epoch": 0.3760134467075341, + "mean_token_accuracy": 0.7669441103935242, + "num_tokens": 19794881.0, + "step": 3803, + "train/ce_loss": 1.5670578479766846 + }, + { + "epoch": 0.3760134467075341, + "step": 3803, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.3760134467075341, + "step": 3803, + "train/total_loss": 0.2660807967185974 + }, + { + "entropy": 9.229785919189453, + "epoch": 0.37611231955704966, + "mean_token_accuracy": 0.7591036558151245, + "num_tokens": 19800032.0, + "step": 3804, + "train/ce_loss": 0.8593621850013733 + }, + { + "epoch": 0.37611231955704966, + "step": 3804, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.37611231955704966, + "step": 3804, + "train/total_loss": 0.14843621850013733 + }, + { + "entropy": 9.098108291625977, + "epoch": 0.37621119240656514, + "mean_token_accuracy": 0.7685834765434265, + "num_tokens": 19805187.0, + "step": 3805, + "train/ce_loss": 0.8727970719337463 + }, + { + "epoch": 0.37621119240656514, + "step": 3805, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.37621119240656514, + "step": 3805, + "train/total_loss": 0.13806095719337463 + }, + { + "entropy": 9.53138542175293, + "epoch": 0.3763100652560807, + "mean_token_accuracy": 0.7435455918312073, + "num_tokens": 19810177.0, + "step": 3806, + "train/ce_loss": 2.8374862670898438 + }, + { + "epoch": 0.3763100652560807, + "step": 3806, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.3763100652560807, + "step": 3806, + "train/total_loss": 0.3501548767089844 + }, + { + "entropy": 9.33613109588623, + "epoch": 0.3764089381055962, + "mean_token_accuracy": 0.7612156271934509, + "num_tokens": 19815315.0, + "step": 3807, + "train/ce_loss": 0.5548412799835205 + }, + { + "epoch": 0.3764089381055962, + "step": 3807, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3764089381055962, + "step": 3807, + "train/total_loss": 0.11017163097858429 + }, + { + "entropy": 9.431605339050293, + "epoch": 0.3765078109551117, + "mean_token_accuracy": 0.7742782235145569, + "num_tokens": 19820492.0, + "step": 3808, + "train/ce_loss": 0.8345779776573181 + }, + { + "epoch": 0.3765078109551117, + "step": 3808, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.3765078109551117, + "step": 3808, + "train/total_loss": 0.10298904776573181 + }, + { + "entropy": 8.796649932861328, + "epoch": 0.37660668380462725, + "mean_token_accuracy": 0.7225501537322998, + "num_tokens": 19825756.0, + "step": 3809, + "train/ce_loss": 1.249238133430481 + }, + { + "epoch": 0.37660668380462725, + "step": 3809, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.37660668380462725, + "step": 3809, + "train/total_loss": 0.23039257526397705 + }, + { + "entropy": 9.011679649353027, + "epoch": 0.3767055566541428, + "mean_token_accuracy": 0.7263033390045166, + "num_tokens": 19831070.0, + "step": 3810, + "train/ce_loss": 0.7903326153755188 + }, + { + "epoch": 0.3767055566541428, + "step": 3810, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.3767055566541428, + "step": 3810, + "train/total_loss": 0.122002013027668 + }, + { + "entropy": 9.251346588134766, + "epoch": 0.3768044295036583, + "mean_token_accuracy": 0.7066166996955872, + "num_tokens": 19836485.0, + "step": 3811, + "train/ce_loss": 0.6743360757827759 + }, + { + "epoch": 0.3768044295036583, + "step": 3811, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3768044295036583, + "step": 3811, + "train/total_loss": 0.11430861055850983 + }, + { + "entropy": 9.102466583251953, + "epoch": 0.3769033023531738, + "mean_token_accuracy": 0.7443708777427673, + "num_tokens": 19841875.0, + "step": 3812, + "train/ce_loss": 1.1818798780441284 + }, + { + "epoch": 0.3769033023531738, + "step": 3812, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.3769033023531738, + "step": 3812, + "train/total_loss": 0.22756299376487732 + }, + { + "entropy": 9.400089263916016, + "epoch": 0.37700217520268936, + "mean_token_accuracy": 0.7442528605461121, + "num_tokens": 19847000.0, + "step": 3813, + "train/ce_loss": 1.1718734502792358 + }, + { + "epoch": 0.37700217520268936, + "step": 3813, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.37700217520268936, + "step": 3813, + "train/total_loss": 0.16796860098838806 + }, + { + "entropy": 9.248943328857422, + "epoch": 0.37710104805220485, + "mean_token_accuracy": 0.7553957104682922, + "num_tokens": 19852179.0, + "step": 3814, + "train/ce_loss": 0.893125057220459 + }, + { + "epoch": 0.37710104805220485, + "step": 3814, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.37710104805220485, + "step": 3814, + "train/total_loss": 0.14790625870227814 + }, + { + "entropy": 9.229852676391602, + "epoch": 0.3771999209017204, + "mean_token_accuracy": 0.8393632173538208, + "num_tokens": 19857393.0, + "step": 3815, + "train/ce_loss": 0.6361907124519348 + }, + { + "epoch": 0.3771999209017204, + "step": 3815, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.3771999209017204, + "step": 3815, + "train/total_loss": 0.13393157720565796 + }, + { + "entropy": 9.12453556060791, + "epoch": 0.37729879375123593, + "mean_token_accuracy": 0.7630208134651184, + "num_tokens": 19862651.0, + "step": 3816, + "train/ce_loss": 0.6658760905265808 + }, + { + "epoch": 0.37729879375123593, + "step": 3816, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.37729879375123593, + "step": 3816, + "train/total_loss": 0.11346261203289032 + }, + { + "entropy": 8.986717224121094, + "epoch": 0.3773976666007514, + "mean_token_accuracy": 0.7001209259033203, + "num_tokens": 19867960.0, + "step": 3817, + "train/ce_loss": 0.6120560169219971 + }, + { + "epoch": 0.3773976666007514, + "step": 3817, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.3773976666007514, + "step": 3817, + "train/total_loss": 0.13542434573173523 + }, + { + "entropy": 9.174938201904297, + "epoch": 0.37749653945026695, + "mean_token_accuracy": 0.7614678740501404, + "num_tokens": 19873224.0, + "step": 3818, + "train/ce_loss": 0.6425613164901733 + }, + { + "epoch": 0.37749653945026695, + "step": 3818, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.37749653945026695, + "step": 3818, + "train/total_loss": 0.08769363164901733 + }, + { + "entropy": 8.792425155639648, + "epoch": 0.3775954122997825, + "mean_token_accuracy": 0.6988416910171509, + "num_tokens": 19878472.0, + "step": 3819, + "train/ce_loss": 1.3050808906555176 + }, + { + "epoch": 0.3775954122997825, + "step": 3819, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.3775954122997825, + "step": 3819, + "train/total_loss": 0.18128934502601624 + }, + { + "epoch": 0.377694285149298, + "grad_norm": 0.8487709760665894, + "learning_rate": 9.058250506848638e-06, + "loss": 0.1432, + "step": 3820 + }, + { + "entropy": 9.246591567993164, + "epoch": 0.377694285149298, + "mean_token_accuracy": 0.7724795937538147, + "num_tokens": 19883687.0, + "step": 3820, + "train/ce_loss": 1.2820429801940918 + }, + { + "epoch": 0.377694285149298, + "step": 3820, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.377694285149298, + "step": 3820, + "train/total_loss": 0.22586055099964142 + }, + { + "entropy": 9.413658142089844, + "epoch": 0.3777931579988135, + "mean_token_accuracy": 0.7532281279563904, + "num_tokens": 19888821.0, + "step": 3821, + "train/ce_loss": 0.7351582050323486 + }, + { + "epoch": 0.3777931579988135, + "step": 3821, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.3777931579988135, + "step": 3821, + "train/total_loss": 0.13601583242416382 + }, + { + "entropy": 8.993918418884277, + "epoch": 0.37789203084832906, + "mean_token_accuracy": 0.7847642302513123, + "num_tokens": 19894132.0, + "step": 3822, + "train/ce_loss": 0.4223306179046631 + }, + { + "epoch": 0.37789203084832906, + "step": 3822, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.37789203084832906, + "step": 3822, + "train/total_loss": 0.08129556477069855 + }, + { + "entropy": 9.318099021911621, + "epoch": 0.37799090369784455, + "mean_token_accuracy": 0.7189348936080933, + "num_tokens": 19899215.0, + "step": 3823, + "train/ce_loss": 1.242138182533381e-06 + }, + { + "epoch": 0.37799090369784455, + "step": 3823, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.37799090369784455, + "step": 3823, + "train/total_loss": 0.03125012293457985 + }, + { + "entropy": 9.346855163574219, + "epoch": 0.3780897765473601, + "mean_token_accuracy": 0.6827676296234131, + "num_tokens": 19904443.0, + "step": 3824, + "train/ce_loss": 0.6489213705062866 + }, + { + "epoch": 0.3780897765473601, + "step": 3824, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3780897765473601, + "step": 3824, + "train/total_loss": 0.11957963556051254 + }, + { + "entropy": 9.698902130126953, + "epoch": 0.37818864939687563, + "mean_token_accuracy": 0.7413793206214905, + "num_tokens": 19909412.0, + "step": 3825, + "train/ce_loss": 0.7541821599006653 + }, + { + "epoch": 0.37818864939687563, + "step": 3825, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.37818864939687563, + "step": 3825, + "train/total_loss": 0.10666821897029877 + }, + { + "entropy": 9.310688972473145, + "epoch": 0.3782875222463911, + "mean_token_accuracy": 0.737300455570221, + "num_tokens": 19914583.0, + "step": 3826, + "train/ce_loss": 0.7176569700241089 + }, + { + "epoch": 0.3782875222463911, + "step": 3826, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3782875222463911, + "step": 3826, + "train/total_loss": 0.11864069849252701 + }, + { + "entropy": 9.054895401000977, + "epoch": 0.37838639509590666, + "mean_token_accuracy": 0.7268232107162476, + "num_tokens": 19919852.0, + "step": 3827, + "train/ce_loss": 0.7547178864479065 + }, + { + "epoch": 0.37838639509590666, + "step": 3827, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.37838639509590666, + "step": 3827, + "train/total_loss": 0.11844053864479065 + }, + { + "entropy": 9.694982528686523, + "epoch": 0.3784852679454222, + "mean_token_accuracy": 0.7440147399902344, + "num_tokens": 19924806.0, + "step": 3828, + "train/ce_loss": 2.6799655188369798e-06 + }, + { + "epoch": 0.3784852679454222, + "step": 3828, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.3784852679454222, + "step": 3828, + "train/total_loss": 0.06250026822090149 + }, + { + "entropy": 8.97214126586914, + "epoch": 0.3785841407949377, + "mean_token_accuracy": 0.7381443381309509, + "num_tokens": 19930262.0, + "step": 3829, + "train/ce_loss": 0.3500278890132904 + }, + { + "epoch": 0.3785841407949377, + "step": 3829, + "train/sim_loss": 0.171875 + }, + { + "epoch": 0.3785841407949377, + "step": 3829, + "train/total_loss": 0.20687779784202576 + }, + { + "entropy": 9.158282279968262, + "epoch": 0.3786830136444532, + "mean_token_accuracy": 0.7079545259475708, + "num_tokens": 19935623.0, + "step": 3830, + "train/ce_loss": 0.7968263626098633 + }, + { + "epoch": 0.3786830136444532, + "step": 3830, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3786830136444532, + "step": 3830, + "train/total_loss": 0.13437014818191528 + }, + { + "entropy": 8.646978378295898, + "epoch": 0.37878188649396877, + "mean_token_accuracy": 0.7291462421417236, + "num_tokens": 19941082.0, + "step": 3831, + "train/ce_loss": 0.9026403427124023 + }, + { + "epoch": 0.37878188649396877, + "step": 3831, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.37878188649396877, + "step": 3831, + "train/total_loss": 0.14104528725147247 + }, + { + "entropy": 9.50467300415039, + "epoch": 0.37888075934348425, + "mean_token_accuracy": 0.7625786066055298, + "num_tokens": 19946125.0, + "step": 3832, + "train/ce_loss": 4.323581379139796e-06 + }, + { + "epoch": 0.37888075934348425, + "step": 3832, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.37888075934348425, + "step": 3832, + "train/total_loss": 0.02734418213367462 + }, + { + "entropy": 9.258719444274902, + "epoch": 0.3789796321929998, + "mean_token_accuracy": 0.7279999852180481, + "num_tokens": 19951367.0, + "step": 3833, + "train/ce_loss": 1.208894968032837 + }, + { + "epoch": 0.3789796321929998, + "step": 3833, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.3789796321929998, + "step": 3833, + "train/total_loss": 0.18729574978351593 + }, + { + "entropy": 8.93228530883789, + "epoch": 0.37907850504251533, + "mean_token_accuracy": 0.6701164245605469, + "num_tokens": 19956618.0, + "step": 3834, + "train/ce_loss": 1.5145224332809448 + }, + { + "epoch": 0.37907850504251533, + "step": 3834, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.37907850504251533, + "step": 3834, + "train/total_loss": 0.23738974332809448 + }, + { + "entropy": 8.946246147155762, + "epoch": 0.3791773778920309, + "mean_token_accuracy": 0.7346711158752441, + "num_tokens": 19962033.0, + "step": 3835, + "train/ce_loss": 0.5565978288650513 + }, + { + "epoch": 0.3791773778920309, + "step": 3835, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.3791773778920309, + "step": 3835, + "train/total_loss": 0.11425353586673737 + }, + { + "entropy": 9.340824127197266, + "epoch": 0.37927625074154636, + "mean_token_accuracy": 0.757446825504303, + "num_tokens": 19967175.0, + "step": 3836, + "train/ce_loss": 1.0683367252349854 + }, + { + "epoch": 0.37927625074154636, + "step": 3836, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.37927625074154636, + "step": 3836, + "train/total_loss": 0.22402116656303406 + }, + { + "entropy": 8.507728576660156, + "epoch": 0.3793751235910619, + "mean_token_accuracy": 0.782608687877655, + "num_tokens": 19972781.0, + "step": 3837, + "train/ce_loss": 0.6930010914802551 + }, + { + "epoch": 0.3793751235910619, + "step": 3837, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.3793751235910619, + "step": 3837, + "train/total_loss": 0.17867511510849 + }, + { + "entropy": 8.375957489013672, + "epoch": 0.37947399644057744, + "mean_token_accuracy": 0.7422680258750916, + "num_tokens": 19978170.0, + "step": 3838, + "train/ce_loss": 0.8468574285507202 + }, + { + "epoch": 0.37947399644057744, + "step": 3838, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.37947399644057744, + "step": 3838, + "train/total_loss": 0.13546699285507202 + }, + { + "entropy": 9.039892196655273, + "epoch": 0.37957286929009293, + "mean_token_accuracy": 0.7541401386260986, + "num_tokens": 19983454.0, + "step": 3839, + "train/ce_loss": 0.5882134437561035 + }, + { + "epoch": 0.37957286929009293, + "step": 3839, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.37957286929009293, + "step": 3839, + "train/total_loss": 0.10179010033607483 + }, + { + "epoch": 0.37967174213960847, + "grad_norm": 0.7944733500480652, + "learning_rate": 9.05330564209069e-06, + "loss": 0.1476, + "step": 3840 + }, + { + "entropy": 9.035563468933105, + "epoch": 0.37967174213960847, + "mean_token_accuracy": 0.6805251836776733, + "num_tokens": 19988803.0, + "step": 3840, + "train/ce_loss": 1.2471295595169067 + }, + { + "epoch": 0.37967174213960847, + "step": 3840, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.37967174213960847, + "step": 3840, + "train/total_loss": 0.2067442089319229 + }, + { + "entropy": 8.570268630981445, + "epoch": 0.379770614989124, + "mean_token_accuracy": 0.7588832378387451, + "num_tokens": 19994460.0, + "step": 3841, + "train/ce_loss": 0.7722548246383667 + }, + { + "epoch": 0.379770614989124, + "step": 3841, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.379770614989124, + "step": 3841, + "train/total_loss": 0.10456923395395279 + }, + { + "entropy": 9.15269660949707, + "epoch": 0.3798694878386395, + "mean_token_accuracy": 0.7444589138031006, + "num_tokens": 19999705.0, + "step": 3842, + "train/ce_loss": 1.028852105140686 + }, + { + "epoch": 0.3798694878386395, + "step": 3842, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.3798694878386395, + "step": 3842, + "train/total_loss": 0.16147896647453308 + }, + { + "entropy": 8.799553871154785, + "epoch": 0.37996836068815504, + "mean_token_accuracy": 0.8022598624229431, + "num_tokens": 20005092.0, + "step": 3843, + "train/ce_loss": 0.8331433534622192 + }, + { + "epoch": 0.37996836068815504, + "step": 3843, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.37996836068815504, + "step": 3843, + "train/total_loss": 0.14972057938575745 + }, + { + "entropy": 9.060023307800293, + "epoch": 0.3800672335376706, + "mean_token_accuracy": 0.7175140976905823, + "num_tokens": 20010417.0, + "step": 3844, + "train/ce_loss": 1.1553999185562134 + }, + { + "epoch": 0.3800672335376706, + "step": 3844, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.3800672335376706, + "step": 3844, + "train/total_loss": 0.17413374781608582 + }, + { + "entropy": 9.151344299316406, + "epoch": 0.38016610638718606, + "mean_token_accuracy": 0.778124988079071, + "num_tokens": 20015500.0, + "step": 3845, + "train/ce_loss": 0.6391857266426086 + }, + { + "epoch": 0.38016610638718606, + "step": 3845, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.38016610638718606, + "step": 3845, + "train/total_loss": 0.1225123256444931 + }, + { + "entropy": 9.005912780761719, + "epoch": 0.3802649792367016, + "mean_token_accuracy": 0.7820343375205994, + "num_tokens": 20020706.0, + "step": 3846, + "train/ce_loss": 0.8451929092407227 + }, + { + "epoch": 0.3802649792367016, + "step": 3846, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.3802649792367016, + "step": 3846, + "train/total_loss": 0.15092554688453674 + }, + { + "entropy": 8.730301856994629, + "epoch": 0.38036385208621715, + "mean_token_accuracy": 0.7464115023612976, + "num_tokens": 20026203.0, + "step": 3847, + "train/ce_loss": 0.6203240752220154 + }, + { + "epoch": 0.38036385208621715, + "step": 3847, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.38036385208621715, + "step": 3847, + "train/total_loss": 0.08937615901231766 + }, + { + "entropy": 8.813580513000488, + "epoch": 0.38046272493573263, + "mean_token_accuracy": 0.7479091882705688, + "num_tokens": 20031517.0, + "step": 3848, + "train/ce_loss": 1.0179005861282349 + }, + { + "epoch": 0.38046272493573263, + "step": 3848, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.38046272493573263, + "step": 3848, + "train/total_loss": 0.18772757053375244 + }, + { + "entropy": 8.879956245422363, + "epoch": 0.3805615977852482, + "mean_token_accuracy": 0.7799999713897705, + "num_tokens": 20036851.0, + "step": 3849, + "train/ce_loss": 0.49752455949783325 + }, + { + "epoch": 0.3805615977852482, + "step": 3849, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.3805615977852482, + "step": 3849, + "train/total_loss": 0.08881495893001556 + }, + { + "entropy": 8.920181274414062, + "epoch": 0.3806604706347637, + "mean_token_accuracy": 0.7253446578979492, + "num_tokens": 20042271.0, + "step": 3850, + "train/ce_loss": 0.4901108145713806 + }, + { + "epoch": 0.3806604706347637, + "step": 3850, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.3806604706347637, + "step": 3850, + "train/total_loss": 0.06854233145713806 + }, + { + "entropy": 9.092105865478516, + "epoch": 0.3807593434842792, + "mean_token_accuracy": 0.7521613836288452, + "num_tokens": 20047454.0, + "step": 3851, + "train/ce_loss": 0.42295077443122864 + }, + { + "epoch": 0.3807593434842792, + "step": 3851, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3807593434842792, + "step": 3851, + "train/total_loss": 0.08917008340358734 + }, + { + "entropy": 9.74991226196289, + "epoch": 0.38085821633379474, + "mean_token_accuracy": 0.727078914642334, + "num_tokens": 20052354.0, + "step": 3852, + "train/ce_loss": 5.97653524891939e-06 + }, + { + "epoch": 0.38085821633379474, + "step": 3852, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.38085821633379474, + "step": 3852, + "train/total_loss": 0.09375059604644775 + }, + { + "entropy": 9.296963691711426, + "epoch": 0.3809570891833103, + "mean_token_accuracy": 0.7491856813430786, + "num_tokens": 20057444.0, + "step": 3853, + "train/ce_loss": 1.38630211353302 + }, + { + "epoch": 0.3809570891833103, + "step": 3853, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.3809570891833103, + "step": 3853, + "train/total_loss": 0.220661461353302 + }, + { + "entropy": 8.965834617614746, + "epoch": 0.38105596203282577, + "mean_token_accuracy": 0.7508690357208252, + "num_tokens": 20062746.0, + "step": 3854, + "train/ce_loss": 1.291054368019104 + }, + { + "epoch": 0.38105596203282577, + "step": 3854, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.38105596203282577, + "step": 3854, + "train/total_loss": 0.18379293382167816 + }, + { + "entropy": 8.771659851074219, + "epoch": 0.3811548348823413, + "mean_token_accuracy": 0.7436399459838867, + "num_tokens": 20068285.0, + "step": 3855, + "train/ce_loss": 1.536427617073059 + }, + { + "epoch": 0.3811548348823413, + "step": 3855, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.3811548348823413, + "step": 3855, + "train/total_loss": 0.30208027362823486 + }, + { + "entropy": 10.270421981811523, + "epoch": 0.38125370773185685, + "mean_token_accuracy": 0.6927083134651184, + "num_tokens": 20072909.0, + "step": 3856, + "train/ce_loss": 3.842646837234497 + }, + { + "epoch": 0.38125370773185685, + "step": 3856, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.38125370773185685, + "step": 3856, + "train/total_loss": 0.45067092776298523 + }, + { + "entropy": 9.520597457885742, + "epoch": 0.38135258058137234, + "mean_token_accuracy": 0.6839728951454163, + "num_tokens": 20077756.0, + "step": 3857, + "train/ce_loss": 2.0288615226745605 + }, + { + "epoch": 0.38135258058137234, + "step": 3857, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.38135258058137234, + "step": 3857, + "train/total_loss": 0.24976114928722382 + }, + { + "entropy": 8.86798095703125, + "epoch": 0.3814514534308879, + "mean_token_accuracy": 0.8033707737922668, + "num_tokens": 20083112.0, + "step": 3858, + "train/ce_loss": 0.5570780038833618 + }, + { + "epoch": 0.3814514534308879, + "step": 3858, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.3814514534308879, + "step": 3858, + "train/total_loss": 0.08305154740810394 + }, + { + "entropy": 8.96121883392334, + "epoch": 0.3815503262804034, + "mean_token_accuracy": 0.7189819812774658, + "num_tokens": 20088544.0, + "step": 3859, + "train/ce_loss": 0.33686110377311707 + }, + { + "epoch": 0.3815503262804034, + "step": 3859, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.3815503262804034, + "step": 3859, + "train/total_loss": 0.08446736633777618 + }, + { + "epoch": 0.3816491991299189, + "grad_norm": 0.899243175983429, + "learning_rate": 9.04836077733274e-06, + "loss": 0.141, + "step": 3860 + }, + { + "entropy": 9.109146118164062, + "epoch": 0.3816491991299189, + "mean_token_accuracy": 0.7835820913314819, + "num_tokens": 20093784.0, + "step": 3860, + "train/ce_loss": 0.7749420404434204 + }, + { + "epoch": 0.3816491991299189, + "step": 3860, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.3816491991299189, + "step": 3860, + "train/total_loss": 0.15171295404434204 + }, + { + "entropy": 9.232582092285156, + "epoch": 0.38174807197943444, + "mean_token_accuracy": 0.7138314843177795, + "num_tokens": 20098878.0, + "step": 3861, + "train/ce_loss": 1.1303707361221313 + }, + { + "epoch": 0.38174807197943444, + "step": 3861, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.38174807197943444, + "step": 3861, + "train/total_loss": 0.1755370795726776 + }, + { + "entropy": 8.834117889404297, + "epoch": 0.38184694482895, + "mean_token_accuracy": 0.7311111092567444, + "num_tokens": 20104249.0, + "step": 3862, + "train/ce_loss": 0.5333002805709839 + }, + { + "epoch": 0.38184694482895, + "step": 3862, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.38184694482895, + "step": 3862, + "train/total_loss": 0.11973628401756287 + }, + { + "entropy": 9.02542495727539, + "epoch": 0.38194581767846547, + "mean_token_accuracy": 0.7144444584846497, + "num_tokens": 20109640.0, + "step": 3863, + "train/ce_loss": 1.5636701583862305 + }, + { + "epoch": 0.38194581767846547, + "step": 3863, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.38194581767846547, + "step": 3863, + "train/total_loss": 0.2579295039176941 + }, + { + "entropy": 8.74275016784668, + "epoch": 0.382044690527981, + "mean_token_accuracy": 0.7470308542251587, + "num_tokens": 20114926.0, + "step": 3864, + "train/ce_loss": 1.2879526615142822 + }, + { + "epoch": 0.382044690527981, + "step": 3864, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.382044690527981, + "step": 3864, + "train/total_loss": 0.19910776615142822 + }, + { + "entropy": 8.74221420288086, + "epoch": 0.38214356337749655, + "mean_token_accuracy": 0.7284700870513916, + "num_tokens": 20120403.0, + "step": 3865, + "train/ce_loss": 1.0889049768447876 + }, + { + "epoch": 0.38214356337749655, + "step": 3865, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.38214356337749655, + "step": 3865, + "train/total_loss": 0.21826550364494324 + }, + { + "entropy": 8.940792083740234, + "epoch": 0.38224243622701204, + "mean_token_accuracy": 0.745591938495636, + "num_tokens": 20125626.0, + "step": 3866, + "train/ce_loss": 0.9817897081375122 + }, + { + "epoch": 0.38224243622701204, + "step": 3866, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.38224243622701204, + "step": 3866, + "train/total_loss": 0.17239773273468018 + }, + { + "entropy": 8.570219039916992, + "epoch": 0.3823413090765276, + "mean_token_accuracy": 0.6934911012649536, + "num_tokens": 20130975.0, + "step": 3867, + "train/ce_loss": 0.5346583127975464 + }, + { + "epoch": 0.3823413090765276, + "step": 3867, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.3823413090765276, + "step": 3867, + "train/total_loss": 0.1120595782995224 + }, + { + "entropy": 9.852130889892578, + "epoch": 0.3824401819260431, + "mean_token_accuracy": 0.7345309257507324, + "num_tokens": 20136069.0, + "step": 3868, + "train/ce_loss": 2.2908136543264845e-06 + }, + { + "epoch": 0.3824401819260431, + "step": 3868, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.3824401819260431, + "step": 3868, + "train/total_loss": 0.039062727242708206 + }, + { + "entropy": 8.938485145568848, + "epoch": 0.3825390547755586, + "mean_token_accuracy": 0.7442660331726074, + "num_tokens": 20141461.0, + "step": 3869, + "train/ce_loss": 0.779180109500885 + }, + { + "epoch": 0.3825390547755586, + "step": 3869, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.3825390547755586, + "step": 3869, + "train/total_loss": 0.15604302287101746 + }, + { + "entropy": 8.845773696899414, + "epoch": 0.38263792762507415, + "mean_token_accuracy": 0.7202796936035156, + "num_tokens": 20146942.0, + "step": 3870, + "train/ce_loss": 0.7213982343673706 + }, + { + "epoch": 0.38263792762507415, + "step": 3870, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.38263792762507415, + "step": 3870, + "train/total_loss": 0.11120232194662094 + }, + { + "entropy": 9.102130889892578, + "epoch": 0.3827368004745897, + "mean_token_accuracy": 0.6877990365028381, + "num_tokens": 20152244.0, + "step": 3871, + "train/ce_loss": 1.3690425157546997 + }, + { + "epoch": 0.3827368004745897, + "step": 3871, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.3827368004745897, + "step": 3871, + "train/total_loss": 0.250185489654541 + }, + { + "entropy": 9.01791000366211, + "epoch": 0.3828356733241052, + "mean_token_accuracy": 0.7416666746139526, + "num_tokens": 20157552.0, + "step": 3872, + "train/ce_loss": 1.1151341199874878 + }, + { + "epoch": 0.3828356733241052, + "step": 3872, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3828356733241052, + "step": 3872, + "train/total_loss": 0.1583884060382843 + }, + { + "entropy": 9.082117080688477, + "epoch": 0.3829345461736207, + "mean_token_accuracy": 0.6948198080062866, + "num_tokens": 20162918.0, + "step": 3873, + "train/ce_loss": 0.9492505192756653 + }, + { + "epoch": 0.3829345461736207, + "step": 3873, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.3829345461736207, + "step": 3873, + "train/total_loss": 0.20039379596710205 + }, + { + "entropy": 9.00233268737793, + "epoch": 0.38303341902313626, + "mean_token_accuracy": 0.7436241507530212, + "num_tokens": 20168095.0, + "step": 3874, + "train/ce_loss": 0.8107348680496216 + }, + { + "epoch": 0.38303341902313626, + "step": 3874, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.38303341902313626, + "step": 3874, + "train/total_loss": 0.12404223531484604 + }, + { + "entropy": 8.733619689941406, + "epoch": 0.38313229187265174, + "mean_token_accuracy": 0.7614269852638245, + "num_tokens": 20173430.0, + "step": 3875, + "train/ce_loss": 0.821361780166626 + }, + { + "epoch": 0.38313229187265174, + "step": 3875, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.38313229187265174, + "step": 3875, + "train/total_loss": 0.14072993397712708 + }, + { + "entropy": 9.802820205688477, + "epoch": 0.3832311647221673, + "mean_token_accuracy": 0.6971279382705688, + "num_tokens": 20178219.0, + "step": 3876, + "train/ce_loss": 5.660176157107344e-06 + }, + { + "epoch": 0.3832311647221673, + "step": 3876, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.3832311647221673, + "step": 3876, + "train/total_loss": 0.07031306624412537 + }, + { + "entropy": 10.102174758911133, + "epoch": 0.3833300375716828, + "mean_token_accuracy": 0.7095709443092346, + "num_tokens": 20182941.0, + "step": 3877, + "train/ce_loss": 7.492818440368865e-06 + }, + { + "epoch": 0.3833300375716828, + "step": 3877, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.3833300375716828, + "step": 3877, + "train/total_loss": 0.05859449878334999 + }, + { + "entropy": 9.372560501098633, + "epoch": 0.38342891042119837, + "mean_token_accuracy": 0.7775768637657166, + "num_tokens": 20187923.0, + "step": 3878, + "train/ce_loss": 0.7272844910621643 + }, + { + "epoch": 0.38342891042119837, + "step": 3878, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.38342891042119837, + "step": 3878, + "train/total_loss": 0.1899159550666809 + }, + { + "entropy": 8.713886260986328, + "epoch": 0.38352778327071385, + "mean_token_accuracy": 0.692307710647583, + "num_tokens": 20193302.0, + "step": 3879, + "train/ce_loss": 1.1886383295059204 + }, + { + "epoch": 0.38352778327071385, + "step": 3879, + "train/sim_loss": 0.1875 + }, + { + "epoch": 0.38352778327071385, + "step": 3879, + "train/total_loss": 0.3063638210296631 + }, + { + "epoch": 0.3836266561202294, + "grad_norm": 1.0428014993667603, + "learning_rate": 9.043415912574793e-06, + "loss": 0.1569, + "step": 3880 + }, + { + "entropy": 9.826498031616211, + "epoch": 0.3836266561202294, + "mean_token_accuracy": 0.7492063641548157, + "num_tokens": 20198195.0, + "step": 3880, + "train/ce_loss": 2.52473219006788e-05 + }, + { + "epoch": 0.3836266561202294, + "step": 3880, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3836266561202294, + "step": 3880, + "train/total_loss": 0.04687752574682236 + }, + { + "entropy": 8.948516845703125, + "epoch": 0.38372552896974493, + "mean_token_accuracy": 0.7144607901573181, + "num_tokens": 20203491.0, + "step": 3881, + "train/ce_loss": 1.069422960281372 + }, + { + "epoch": 0.38372552896974493, + "step": 3881, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.38372552896974493, + "step": 3881, + "train/total_loss": 0.1772547960281372 + }, + { + "entropy": 9.142729759216309, + "epoch": 0.3838244018192604, + "mean_token_accuracy": 0.8066860437393188, + "num_tokens": 20208652.0, + "step": 3882, + "train/ce_loss": 2.052043100775336e-06 + }, + { + "epoch": 0.3838244018192604, + "step": 3882, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.3838244018192604, + "step": 3882, + "train/total_loss": 0.07421895861625671 + }, + { + "entropy": 8.838113784790039, + "epoch": 0.38392327466877596, + "mean_token_accuracy": 0.7088477611541748, + "num_tokens": 20214129.0, + "step": 3883, + "train/ce_loss": 0.6217888593673706 + }, + { + "epoch": 0.38392327466877596, + "step": 3883, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.38392327466877596, + "step": 3883, + "train/total_loss": 0.15202262997627258 + }, + { + "entropy": 8.645570755004883, + "epoch": 0.3840221475182915, + "mean_token_accuracy": 0.7366212010383606, + "num_tokens": 20219603.0, + "step": 3884, + "train/ce_loss": 0.6932132840156555 + }, + { + "epoch": 0.3840221475182915, + "step": 3884, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.3840221475182915, + "step": 3884, + "train/total_loss": 0.08885257691144943 + }, + { + "entropy": 8.908561706542969, + "epoch": 0.384121020367807, + "mean_token_accuracy": 0.7578378319740295, + "num_tokens": 20225007.0, + "step": 3885, + "train/ce_loss": 0.4770452082157135 + }, + { + "epoch": 0.384121020367807, + "step": 3885, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.384121020367807, + "step": 3885, + "train/total_loss": 0.07114201784133911 + }, + { + "entropy": 9.26992130279541, + "epoch": 0.38421989321732253, + "mean_token_accuracy": 0.7266010046005249, + "num_tokens": 20230242.0, + "step": 3886, + "train/ce_loss": 2.017613724092371e-06 + }, + { + "epoch": 0.38421989321732253, + "step": 3886, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.38421989321732253, + "step": 3886, + "train/total_loss": 0.023437701165676117 + }, + { + "entropy": 8.833425521850586, + "epoch": 0.38431876606683807, + "mean_token_accuracy": 0.7277969717979431, + "num_tokens": 20235624.0, + "step": 3887, + "train/ce_loss": 0.8311039805412292 + }, + { + "epoch": 0.38431876606683807, + "step": 3887, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.38431876606683807, + "step": 3887, + "train/total_loss": 0.14951664209365845 + }, + { + "entropy": 9.60293197631836, + "epoch": 0.38441763891635355, + "mean_token_accuracy": 0.7222222089767456, + "num_tokens": 20240654.0, + "step": 3888, + "train/ce_loss": 1.3117893104208633e-05 + }, + { + "epoch": 0.38441763891635355, + "step": 3888, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.38441763891635355, + "step": 3888, + "train/total_loss": 0.07422006130218506 + }, + { + "entropy": 9.436559677124023, + "epoch": 0.3845165117658691, + "mean_token_accuracy": 0.7728285193443298, + "num_tokens": 20245585.0, + "step": 3889, + "train/ce_loss": 0.940013587474823 + }, + { + "epoch": 0.3845165117658691, + "step": 3889, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.3845165117658691, + "step": 3889, + "train/total_loss": 0.15650135278701782 + }, + { + "entropy": 9.050984382629395, + "epoch": 0.38461538461538464, + "mean_token_accuracy": 0.7281213402748108, + "num_tokens": 20250916.0, + "step": 3890, + "train/ce_loss": 0.9562475085258484 + }, + { + "epoch": 0.38461538461538464, + "step": 3890, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.38461538461538464, + "step": 3890, + "train/total_loss": 0.15421849489212036 + }, + { + "entropy": 9.18235969543457, + "epoch": 0.3847142574649001, + "mean_token_accuracy": 0.7667560577392578, + "num_tokens": 20256155.0, + "step": 3891, + "train/ce_loss": 1.0980581045150757 + }, + { + "epoch": 0.3847142574649001, + "step": 3891, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.3847142574649001, + "step": 3891, + "train/total_loss": 0.18011832237243652 + }, + { + "entropy": 8.994253158569336, + "epoch": 0.38481313031441566, + "mean_token_accuracy": 0.7229336500167847, + "num_tokens": 20261514.0, + "step": 3892, + "train/ce_loss": 6.4584337451378815e-06 + }, + { + "epoch": 0.38481313031441566, + "step": 3892, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.38481313031441566, + "step": 3892, + "train/total_loss": 0.046875644475221634 + }, + { + "entropy": 10.120834350585938, + "epoch": 0.3849120031639312, + "mean_token_accuracy": 0.8395061492919922, + "num_tokens": 20266243.0, + "step": 3893, + "train/ce_loss": 1.5382053852081299 + }, + { + "epoch": 0.3849120031639312, + "step": 3893, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.3849120031639312, + "step": 3893, + "train/total_loss": 0.20460179448127747 + }, + { + "entropy": 9.669536590576172, + "epoch": 0.3850108760134467, + "mean_token_accuracy": 0.7058823704719543, + "num_tokens": 20271159.0, + "step": 3894, + "train/ce_loss": 1.501869559288025 + }, + { + "epoch": 0.3850108760134467, + "step": 3894, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.3850108760134467, + "step": 3894, + "train/total_loss": 0.2244057059288025 + }, + { + "entropy": 9.655606269836426, + "epoch": 0.38510974886296223, + "mean_token_accuracy": 0.7670156955718994, + "num_tokens": 20275968.0, + "step": 3895, + "train/ce_loss": 3.86687361242366e-06 + }, + { + "epoch": 0.38510974886296223, + "step": 3895, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.38510974886296223, + "step": 3895, + "train/total_loss": 0.05468788743019104 + }, + { + "entropy": 8.863473892211914, + "epoch": 0.3852086217124778, + "mean_token_accuracy": 0.7544731497764587, + "num_tokens": 20281499.0, + "step": 3896, + "train/ce_loss": 0.7062889337539673 + }, + { + "epoch": 0.3852086217124778, + "step": 3896, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3852086217124778, + "step": 3896, + "train/total_loss": 0.11750389635562897 + }, + { + "entropy": 8.718485832214355, + "epoch": 0.38530749456199326, + "mean_token_accuracy": 0.7341317534446716, + "num_tokens": 20286796.0, + "step": 3897, + "train/ce_loss": 0.5802757740020752 + }, + { + "epoch": 0.38530749456199326, + "step": 3897, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.38530749456199326, + "step": 3897, + "train/total_loss": 0.08146508038043976 + }, + { + "entropy": 9.35566520690918, + "epoch": 0.3854063674115088, + "mean_token_accuracy": 0.7277701497077942, + "num_tokens": 20291948.0, + "step": 3898, + "train/ce_loss": 0.38343045115470886 + }, + { + "epoch": 0.3854063674115088, + "step": 3898, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.3854063674115088, + "step": 3898, + "train/total_loss": 0.061780545860528946 + }, + { + "entropy": 9.213844299316406, + "epoch": 0.38550524026102434, + "mean_token_accuracy": 0.7834645509719849, + "num_tokens": 20297178.0, + "step": 3899, + "train/ce_loss": 1.0819836854934692 + }, + { + "epoch": 0.38550524026102434, + "step": 3899, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.38550524026102434, + "step": 3899, + "train/total_loss": 0.1394483745098114 + }, + { + "epoch": 0.3856041131105398, + "grad_norm": 0.8254612684249878, + "learning_rate": 9.038471047816842e-06, + "loss": 0.1416, + "step": 3900 + }, + { + "entropy": 8.882987976074219, + "epoch": 0.3856041131105398, + "mean_token_accuracy": 0.7002341747283936, + "num_tokens": 20302591.0, + "step": 3900, + "train/ce_loss": 0.5525063276290894 + }, + { + "epoch": 0.3856041131105398, + "step": 3900, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.3856041131105398, + "step": 3900, + "train/total_loss": 0.1411881297826767 + }, + { + "entropy": 9.601035118103027, + "epoch": 0.38570298596005537, + "mean_token_accuracy": 0.6929982304573059, + "num_tokens": 20307595.0, + "step": 3901, + "train/ce_loss": 1.6465243101119995 + }, + { + "epoch": 0.38570298596005537, + "step": 3901, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.38570298596005537, + "step": 3901, + "train/total_loss": 0.26621493697166443 + }, + { + "entropy": 9.398444175720215, + "epoch": 0.3858018588095709, + "mean_token_accuracy": 0.7452531456947327, + "num_tokens": 20312693.0, + "step": 3902, + "train/ce_loss": 1.2758805751800537 + }, + { + "epoch": 0.3858018588095709, + "step": 3902, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.3858018588095709, + "step": 3902, + "train/total_loss": 0.16665056347846985 + }, + { + "entropy": 9.34980297088623, + "epoch": 0.3859007316590864, + "mean_token_accuracy": 0.7067371010780334, + "num_tokens": 20317920.0, + "step": 3903, + "train/ce_loss": 0.8974692225456238 + }, + { + "epoch": 0.3859007316590864, + "step": 3903, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.3859007316590864, + "step": 3903, + "train/total_loss": 0.18740317225456238 + }, + { + "entropy": 9.25191879272461, + "epoch": 0.38599960450860193, + "mean_token_accuracy": 0.8173785209655762, + "num_tokens": 20323057.0, + "step": 3904, + "train/ce_loss": 0.8279136419296265 + }, + { + "epoch": 0.38599960450860193, + "step": 3904, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.38599960450860193, + "step": 3904, + "train/total_loss": 0.12966635823249817 + }, + { + "entropy": 9.476541519165039, + "epoch": 0.3860984773581175, + "mean_token_accuracy": 0.7361563444137573, + "num_tokens": 20328091.0, + "step": 3905, + "train/ce_loss": 0.7952406406402588 + }, + { + "epoch": 0.3860984773581175, + "step": 3905, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3860984773581175, + "step": 3905, + "train/total_loss": 0.13421157002449036 + }, + { + "entropy": 9.087745666503906, + "epoch": 0.38619735020763296, + "mean_token_accuracy": 0.754601240158081, + "num_tokens": 20333386.0, + "step": 3906, + "train/ce_loss": 0.7630630135536194 + }, + { + "epoch": 0.38619735020763296, + "step": 3906, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.38619735020763296, + "step": 3906, + "train/total_loss": 0.0997438058257103 + }, + { + "entropy": 9.5733642578125, + "epoch": 0.3862962230571485, + "mean_token_accuracy": 0.7789473533630371, + "num_tokens": 20338391.0, + "step": 3907, + "train/ce_loss": 4.043435637868242e-06 + }, + { + "epoch": 0.3862962230571485, + "step": 3907, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3862962230571485, + "step": 3907, + "train/total_loss": 0.04687540605664253 + }, + { + "entropy": 8.956235885620117, + "epoch": 0.38639509590666404, + "mean_token_accuracy": 0.7097862958908081, + "num_tokens": 20343760.0, + "step": 3908, + "train/ce_loss": 1.3375918865203857 + }, + { + "epoch": 0.38639509590666404, + "step": 3908, + "train/sim_loss": 0.1796875 + }, + { + "epoch": 0.38639509590666404, + "step": 3908, + "train/total_loss": 0.31344670057296753 + }, + { + "entropy": 9.310017585754395, + "epoch": 0.38649396875617953, + "mean_token_accuracy": 0.7263157963752747, + "num_tokens": 20348858.0, + "step": 3909, + "train/ce_loss": 1.3528118133544922 + }, + { + "epoch": 0.38649396875617953, + "step": 3909, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.38649396875617953, + "step": 3909, + "train/total_loss": 0.24856244027614594 + }, + { + "entropy": 9.043222427368164, + "epoch": 0.38659284160569507, + "mean_token_accuracy": 0.7064934968948364, + "num_tokens": 20354094.0, + "step": 3910, + "train/ce_loss": 0.9024978280067444 + }, + { + "epoch": 0.38659284160569507, + "step": 3910, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.38659284160569507, + "step": 3910, + "train/total_loss": 0.14884352684020996 + }, + { + "entropy": 8.958993911743164, + "epoch": 0.3866917144552106, + "mean_token_accuracy": 0.7493887543678284, + "num_tokens": 20359382.0, + "step": 3911, + "train/ce_loss": 0.911043107509613 + }, + { + "epoch": 0.3866917144552106, + "step": 3911, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.3866917144552106, + "step": 3911, + "train/total_loss": 0.16141681373119354 + }, + { + "entropy": 8.577729225158691, + "epoch": 0.3867905873047261, + "mean_token_accuracy": 0.6969146728515625, + "num_tokens": 20365011.0, + "step": 3912, + "train/ce_loss": 1.1239310503005981 + }, + { + "epoch": 0.3867905873047261, + "step": 3912, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.3867905873047261, + "step": 3912, + "train/total_loss": 0.2061431109905243 + }, + { + "entropy": 9.092487335205078, + "epoch": 0.38688946015424164, + "mean_token_accuracy": 0.7576923370361328, + "num_tokens": 20370270.0, + "step": 3913, + "train/ce_loss": 1.0013328790664673 + }, + { + "epoch": 0.38688946015424164, + "step": 3913, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.38688946015424164, + "step": 3913, + "train/total_loss": 0.13919579982757568 + }, + { + "entropy": 9.505253791809082, + "epoch": 0.3869883330037572, + "mean_token_accuracy": 0.6710097789764404, + "num_tokens": 20375311.0, + "step": 3914, + "train/ce_loss": 0.8630321621894836 + }, + { + "epoch": 0.3869883330037572, + "step": 3914, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.3869883330037572, + "step": 3914, + "train/total_loss": 0.1175532191991806 + }, + { + "entropy": 10.00143051147461, + "epoch": 0.38708720585327266, + "mean_token_accuracy": 0.7777777910232544, + "num_tokens": 20380117.0, + "step": 3915, + "train/ce_loss": 1.1587531566619873 + }, + { + "epoch": 0.38708720585327266, + "step": 3915, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.38708720585327266, + "step": 3915, + "train/total_loss": 0.14321906864643097 + }, + { + "entropy": 9.14274787902832, + "epoch": 0.3871860787027882, + "mean_token_accuracy": 0.6725888252258301, + "num_tokens": 20385412.0, + "step": 3916, + "train/ce_loss": 0.8936051726341248 + }, + { + "epoch": 0.3871860787027882, + "step": 3916, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.3871860787027882, + "step": 3916, + "train/total_loss": 0.20654802024364471 + }, + { + "entropy": 9.054058074951172, + "epoch": 0.38728495155230375, + "mean_token_accuracy": 0.7832929491996765, + "num_tokens": 20390771.0, + "step": 3917, + "train/ce_loss": 0.706035315990448 + }, + { + "epoch": 0.38728495155230375, + "step": 3917, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.38728495155230375, + "step": 3917, + "train/total_loss": 0.10966603457927704 + }, + { + "entropy": 9.652740478515625, + "epoch": 0.3873838244018193, + "mean_token_accuracy": 0.7368420958518982, + "num_tokens": 20395784.0, + "step": 3918, + "train/ce_loss": 1.7945903539657593 + }, + { + "epoch": 0.3873838244018193, + "step": 3918, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.3873838244018193, + "step": 3918, + "train/total_loss": 0.2810215353965759 + }, + { + "entropy": 9.720868110656738, + "epoch": 0.3874826972513348, + "mean_token_accuracy": 0.7803030014038086, + "num_tokens": 20400784.0, + "step": 3919, + "train/ce_loss": 0.9807897806167603 + }, + { + "epoch": 0.3874826972513348, + "step": 3919, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.3874826972513348, + "step": 3919, + "train/total_loss": 0.19182898104190826 + }, + { + "epoch": 0.3875815701008503, + "grad_norm": 0.7641163468360901, + "learning_rate": 9.033526183058894e-06, + "loss": 0.1508, + "step": 3920 + }, + { + "entropy": 9.394466400146484, + "epoch": 0.3875815701008503, + "mean_token_accuracy": 0.7298049926757812, + "num_tokens": 20405958.0, + "step": 3920, + "train/ce_loss": 2.060258150100708 + }, + { + "epoch": 0.3875815701008503, + "step": 3920, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.3875815701008503, + "step": 3920, + "train/total_loss": 0.2763383388519287 + }, + { + "entropy": 9.030954360961914, + "epoch": 0.38768044295036586, + "mean_token_accuracy": 0.7218543291091919, + "num_tokens": 20411205.0, + "step": 3921, + "train/ce_loss": 1.163877248764038 + }, + { + "epoch": 0.38768044295036586, + "step": 3921, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.38768044295036586, + "step": 3921, + "train/total_loss": 0.1945127248764038 + }, + { + "entropy": 9.603231430053711, + "epoch": 0.38777931579988134, + "mean_token_accuracy": 0.7215999960899353, + "num_tokens": 20416287.0, + "step": 3922, + "train/ce_loss": 3.7166912534303265e-06 + }, + { + "epoch": 0.38777931579988134, + "step": 3922, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.38777931579988134, + "step": 3922, + "train/total_loss": 0.06250037252902985 + }, + { + "entropy": 9.065607070922852, + "epoch": 0.3878781886493969, + "mean_token_accuracy": 0.7531865835189819, + "num_tokens": 20421601.0, + "step": 3923, + "train/ce_loss": 0.4875900149345398 + }, + { + "epoch": 0.3878781886493969, + "step": 3923, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.3878781886493969, + "step": 3923, + "train/total_loss": 0.13860274851322174 + }, + { + "entropy": 9.044381141662598, + "epoch": 0.3879770614989124, + "mean_token_accuracy": 0.7794285416603088, + "num_tokens": 20426943.0, + "step": 3924, + "train/ce_loss": 0.6263932585716248 + }, + { + "epoch": 0.3879770614989124, + "step": 3924, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.3879770614989124, + "step": 3924, + "train/total_loss": 0.13685807585716248 + }, + { + "entropy": 8.900918006896973, + "epoch": 0.3880759343484279, + "mean_token_accuracy": 0.72365802526474, + "num_tokens": 20432472.0, + "step": 3925, + "train/ce_loss": 0.7355908751487732 + }, + { + "epoch": 0.3880759343484279, + "step": 3925, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.3880759343484279, + "step": 3925, + "train/total_loss": 0.17121534049510956 + }, + { + "entropy": 9.017250061035156, + "epoch": 0.38817480719794345, + "mean_token_accuracy": 0.7524971961975098, + "num_tokens": 20437837.0, + "step": 3926, + "train/ce_loss": 0.7789159417152405 + }, + { + "epoch": 0.38817480719794345, + "step": 3926, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.38817480719794345, + "step": 3926, + "train/total_loss": 0.14820408821105957 + }, + { + "entropy": 8.924264907836914, + "epoch": 0.388273680047459, + "mean_token_accuracy": 0.7144444584846497, + "num_tokens": 20443220.0, + "step": 3927, + "train/ce_loss": 1.333925724029541 + }, + { + "epoch": 0.388273680047459, + "step": 3927, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.388273680047459, + "step": 3927, + "train/total_loss": 0.2154238224029541 + }, + { + "entropy": 9.6730375289917, + "epoch": 0.3883725528969745, + "mean_token_accuracy": 0.7755101919174194, + "num_tokens": 20448240.0, + "step": 3928, + "train/ce_loss": 3.7251754747558152e-06 + }, + { + "epoch": 0.3883725528969745, + "step": 3928, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3883725528969745, + "step": 3928, + "train/total_loss": 0.046875372529029846 + }, + { + "entropy": 8.855838775634766, + "epoch": 0.38847142574649, + "mean_token_accuracy": 0.7216721773147583, + "num_tokens": 20453696.0, + "step": 3929, + "train/ce_loss": 1.4190943241119385 + }, + { + "epoch": 0.38847142574649, + "step": 3929, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.38847142574649, + "step": 3929, + "train/total_loss": 0.2239406853914261 + }, + { + "entropy": 9.049814224243164, + "epoch": 0.38857029859600556, + "mean_token_accuracy": 0.740656852722168, + "num_tokens": 20459047.0, + "step": 3930, + "train/ce_loss": 0.8216625452041626 + }, + { + "epoch": 0.38857029859600556, + "step": 3930, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.38857029859600556, + "step": 3930, + "train/total_loss": 0.16029125452041626 + }, + { + "entropy": 9.331798553466797, + "epoch": 0.38866917144552104, + "mean_token_accuracy": 0.7557544708251953, + "num_tokens": 20464285.0, + "step": 3931, + "train/ce_loss": 0.8100857734680176 + }, + { + "epoch": 0.38866917144552104, + "step": 3931, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.38866917144552104, + "step": 3931, + "train/total_loss": 0.11225857585668564 + }, + { + "entropy": 10.082314491271973, + "epoch": 0.3887680442950366, + "mean_token_accuracy": 0.7195122241973877, + "num_tokens": 20469067.0, + "step": 3932, + "train/ce_loss": 9.16772842174396e-06 + }, + { + "epoch": 0.3887680442950366, + "step": 3932, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.3887680442950366, + "step": 3932, + "train/total_loss": 0.01562591642141342 + }, + { + "entropy": 8.685100555419922, + "epoch": 0.3888669171445521, + "mean_token_accuracy": 0.7468030452728271, + "num_tokens": 20474377.0, + "step": 3933, + "train/ce_loss": 0.8407461643218994 + }, + { + "epoch": 0.3888669171445521, + "step": 3933, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.3888669171445521, + "step": 3933, + "train/total_loss": 0.14266836643218994 + }, + { + "entropy": 9.024192810058594, + "epoch": 0.3889657899940676, + "mean_token_accuracy": 0.7377398610115051, + "num_tokens": 20479779.0, + "step": 3934, + "train/ce_loss": 0.5858728289604187 + }, + { + "epoch": 0.3889657899940676, + "step": 3934, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.3889657899940676, + "step": 3934, + "train/total_loss": 0.07421228289604187 + }, + { + "entropy": 9.15198040008545, + "epoch": 0.38906466284358315, + "mean_token_accuracy": 0.7543604373931885, + "num_tokens": 20484968.0, + "step": 3935, + "train/ce_loss": 1.2744925022125244 + }, + { + "epoch": 0.38906466284358315, + "step": 3935, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.38906466284358315, + "step": 3935, + "train/total_loss": 0.22119925916194916 + }, + { + "entropy": 9.710990905761719, + "epoch": 0.3891635356930987, + "mean_token_accuracy": 0.683501660823822, + "num_tokens": 20489983.0, + "step": 3936, + "train/ce_loss": 1.713331937789917 + }, + { + "epoch": 0.3891635356930987, + "step": 3936, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.3891635356930987, + "step": 3936, + "train/total_loss": 0.2768019437789917 + }, + { + "entropy": 9.299409866333008, + "epoch": 0.3892624085426142, + "mean_token_accuracy": 0.760869562625885, + "num_tokens": 20495206.0, + "step": 3937, + "train/ce_loss": 0.7358940839767456 + }, + { + "epoch": 0.3892624085426142, + "step": 3937, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.3892624085426142, + "step": 3937, + "train/total_loss": 0.10483940690755844 + }, + { + "entropy": 9.159974098205566, + "epoch": 0.3893612813921297, + "mean_token_accuracy": 0.780379056930542, + "num_tokens": 20500729.0, + "step": 3938, + "train/ce_loss": 0.6641556620597839 + }, + { + "epoch": 0.3893612813921297, + "step": 3938, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.3893612813921297, + "step": 3938, + "train/total_loss": 0.08594682067632675 + }, + { + "entropy": 9.078100204467773, + "epoch": 0.38946015424164526, + "mean_token_accuracy": 0.7342073917388916, + "num_tokens": 20506056.0, + "step": 3939, + "train/ce_loss": 1.0068473815917969 + }, + { + "epoch": 0.38946015424164526, + "step": 3939, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.38946015424164526, + "step": 3939, + "train/total_loss": 0.1905284821987152 + }, + { + "epoch": 0.38955902709116075, + "grad_norm": 0.7789677381515503, + "learning_rate": 9.028581318300944e-06, + "loss": 0.146, + "step": 3940 + }, + { + "entropy": 9.419309616088867, + "epoch": 0.38955902709116075, + "mean_token_accuracy": 0.7227866649627686, + "num_tokens": 20511234.0, + "step": 3940, + "train/ce_loss": 1.1209956407546997 + }, + { + "epoch": 0.38955902709116075, + "step": 3940, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.38955902709116075, + "step": 3940, + "train/total_loss": 0.1706933081150055 + }, + { + "entropy": 9.43798828125, + "epoch": 0.3896578999406763, + "mean_token_accuracy": 0.7090163826942444, + "num_tokens": 20516380.0, + "step": 3941, + "train/ce_loss": 1.5165958404541016 + }, + { + "epoch": 0.3896578999406763, + "step": 3941, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.3896578999406763, + "step": 3941, + "train/total_loss": 0.25322210788726807 + }, + { + "entropy": 9.000892639160156, + "epoch": 0.38975677279019183, + "mean_token_accuracy": 0.7237762212753296, + "num_tokens": 20521719.0, + "step": 3942, + "train/ce_loss": 0.4196535050868988 + }, + { + "epoch": 0.38975677279019183, + "step": 3942, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.38975677279019183, + "step": 3942, + "train/total_loss": 0.06540285050868988 + }, + { + "entropy": 8.770895004272461, + "epoch": 0.3898556456397073, + "mean_token_accuracy": 0.7023809552192688, + "num_tokens": 20527120.0, + "step": 3943, + "train/ce_loss": 0.820130467414856 + }, + { + "epoch": 0.3898556456397073, + "step": 3943, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.3898556456397073, + "step": 3943, + "train/total_loss": 0.14451304078102112 + }, + { + "entropy": 9.20180606842041, + "epoch": 0.38995451848922286, + "mean_token_accuracy": 0.746051013469696, + "num_tokens": 20532579.0, + "step": 3944, + "train/ce_loss": 0.7872023582458496 + }, + { + "epoch": 0.38995451848922286, + "step": 3944, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.38995451848922286, + "step": 3944, + "train/total_loss": 0.14122024178504944 + }, + { + "entropy": 9.434796333312988, + "epoch": 0.3900533913387384, + "mean_token_accuracy": 0.7356828451156616, + "num_tokens": 20537628.0, + "step": 3945, + "train/ce_loss": 1.1515980958938599 + }, + { + "epoch": 0.3900533913387384, + "step": 3945, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3900533913387384, + "step": 3945, + "train/total_loss": 0.169847309589386 + }, + { + "entropy": 9.186416625976562, + "epoch": 0.3901522641882539, + "mean_token_accuracy": 0.7122692465782166, + "num_tokens": 20542993.0, + "step": 3946, + "train/ce_loss": 0.8618674874305725 + }, + { + "epoch": 0.3901522641882539, + "step": 3946, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3901522641882539, + "step": 3946, + "train/total_loss": 0.1330617517232895 + }, + { + "entropy": 9.214897155761719, + "epoch": 0.3902511370377694, + "mean_token_accuracy": 0.7219387888908386, + "num_tokens": 20548235.0, + "step": 3947, + "train/ce_loss": 1.2426503896713257 + }, + { + "epoch": 0.3902511370377694, + "step": 3947, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.3902511370377694, + "step": 3947, + "train/total_loss": 0.22192129492759705 + }, + { + "entropy": 10.047253608703613, + "epoch": 0.39035000988728497, + "mean_token_accuracy": 0.7130681872367859, + "num_tokens": 20553022.0, + "step": 3948, + "train/ce_loss": 1.644702434539795 + }, + { + "epoch": 0.39035000988728497, + "step": 3948, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.39035000988728497, + "step": 3948, + "train/total_loss": 0.21525149047374725 + }, + { + "entropy": 9.5345458984375, + "epoch": 0.39044888273680045, + "mean_token_accuracy": 0.7266082167625427, + "num_tokens": 20558172.0, + "step": 3949, + "train/ce_loss": 1.3004337549209595 + }, + { + "epoch": 0.39044888273680045, + "step": 3949, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.39044888273680045, + "step": 3949, + "train/total_loss": 0.2355121225118637 + }, + { + "entropy": 8.942769050598145, + "epoch": 0.390547755586316, + "mean_token_accuracy": 0.7530364394187927, + "num_tokens": 20563603.0, + "step": 3950, + "train/ce_loss": 0.6398596167564392 + }, + { + "epoch": 0.390547755586316, + "step": 3950, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.390547755586316, + "step": 3950, + "train/total_loss": 0.11086096614599228 + }, + { + "entropy": 10.712668418884277, + "epoch": 0.39064662843583153, + "mean_token_accuracy": 1.0, + "num_tokens": 20568005.0, + "step": 3951, + "train/ce_loss": 0.00010620328976074234 + }, + { + "epoch": 0.39064662843583153, + "step": 3951, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.39064662843583153, + "step": 3951, + "train/total_loss": 0.042979370802640915 + }, + { + "entropy": 9.488534927368164, + "epoch": 0.390745501285347, + "mean_token_accuracy": 0.738095223903656, + "num_tokens": 20573047.0, + "step": 3952, + "train/ce_loss": 0.8325533270835876 + }, + { + "epoch": 0.390745501285347, + "step": 3952, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.390745501285347, + "step": 3952, + "train/total_loss": 0.157474085688591 + }, + { + "entropy": 9.086153030395508, + "epoch": 0.39084437413486256, + "mean_token_accuracy": 0.6866059899330139, + "num_tokens": 20578328.0, + "step": 3953, + "train/ce_loss": 0.8276225924491882 + }, + { + "epoch": 0.39084437413486256, + "step": 3953, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.39084437413486256, + "step": 3953, + "train/total_loss": 0.16088727116584778 + }, + { + "entropy": 9.038970947265625, + "epoch": 0.3909432469843781, + "mean_token_accuracy": 0.7560175061225891, + "num_tokens": 20583712.0, + "step": 3954, + "train/ce_loss": 0.6037119626998901 + }, + { + "epoch": 0.3909432469843781, + "step": 3954, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.3909432469843781, + "step": 3954, + "train/total_loss": 0.07990244776010513 + }, + { + "entropy": 9.257122039794922, + "epoch": 0.3910421198338936, + "mean_token_accuracy": 0.7065693140029907, + "num_tokens": 20588855.0, + "step": 3955, + "train/ce_loss": 0.9117102026939392 + }, + { + "epoch": 0.3910421198338936, + "step": 3955, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3910421198338936, + "step": 3955, + "train/total_loss": 0.1380460262298584 + }, + { + "entropy": 9.421026229858398, + "epoch": 0.3911409926834091, + "mean_token_accuracy": 0.7773167490959167, + "num_tokens": 20594046.0, + "step": 3956, + "train/ce_loss": 0.7747334241867065 + }, + { + "epoch": 0.3911409926834091, + "step": 3956, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.3911409926834091, + "step": 3956, + "train/total_loss": 0.11262959241867065 + }, + { + "entropy": 9.129478454589844, + "epoch": 0.39123986553292467, + "mean_token_accuracy": 0.776566743850708, + "num_tokens": 20599254.0, + "step": 3957, + "train/ce_loss": 0.5620206594467163 + }, + { + "epoch": 0.39123986553292467, + "step": 3957, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.39123986553292467, + "step": 3957, + "train/total_loss": 0.07963956892490387 + }, + { + "entropy": 9.739778518676758, + "epoch": 0.39133873838244015, + "mean_token_accuracy": 0.8050000071525574, + "num_tokens": 20604094.0, + "step": 3958, + "train/ce_loss": 1.341671109199524 + }, + { + "epoch": 0.39133873838244015, + "step": 3958, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.39133873838244015, + "step": 3958, + "train/total_loss": 0.1810421198606491 + }, + { + "entropy": 9.148651123046875, + "epoch": 0.3914376112319557, + "mean_token_accuracy": 0.7365119457244873, + "num_tokens": 20609350.0, + "step": 3959, + "train/ce_loss": 1.4228763580322266 + }, + { + "epoch": 0.3914376112319557, + "step": 3959, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.3914376112319557, + "step": 3959, + "train/total_loss": 0.22041264176368713 + }, + { + "epoch": 0.39153648408147124, + "grad_norm": 0.7743656635284424, + "learning_rate": 9.023636453542997e-06, + "loss": 0.1417, + "step": 3960 + }, + { + "entropy": 8.91891860961914, + "epoch": 0.39153648408147124, + "mean_token_accuracy": 0.7766081690788269, + "num_tokens": 20614672.0, + "step": 3960, + "train/ce_loss": 0.6227753162384033 + }, + { + "epoch": 0.39153648408147124, + "step": 3960, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.39153648408147124, + "step": 3960, + "train/total_loss": 0.11305878311395645 + }, + { + "entropy": 8.867339134216309, + "epoch": 0.3916353569309868, + "mean_token_accuracy": 0.7745591998100281, + "num_tokens": 20619968.0, + "step": 3961, + "train/ce_loss": 0.700613260269165 + }, + { + "epoch": 0.3916353569309868, + "step": 3961, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.3916353569309868, + "step": 3961, + "train/total_loss": 0.0856863260269165 + }, + { + "entropy": 9.311765670776367, + "epoch": 0.39173422978050226, + "mean_token_accuracy": 0.7412095665931702, + "num_tokens": 20625139.0, + "step": 3962, + "train/ce_loss": 1.589390516281128 + }, + { + "epoch": 0.39173422978050226, + "step": 3962, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.39173422978050226, + "step": 3962, + "train/total_loss": 0.22143904864788055 + }, + { + "entropy": 9.160322189331055, + "epoch": 0.3918331026300178, + "mean_token_accuracy": 0.7806122303009033, + "num_tokens": 20630350.0, + "step": 3963, + "train/ce_loss": 0.8469388484954834 + }, + { + "epoch": 0.3918331026300178, + "step": 3963, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3918331026300178, + "step": 3963, + "train/total_loss": 0.13938137888908386 + }, + { + "entropy": 9.489648818969727, + "epoch": 0.39193197547953335, + "mean_token_accuracy": 0.7718750238418579, + "num_tokens": 20635472.0, + "step": 3964, + "train/ce_loss": 2.640196498759906e-06 + }, + { + "epoch": 0.39193197547953335, + "step": 3964, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.39193197547953335, + "step": 3964, + "train/total_loss": 0.02343776449561119 + }, + { + "entropy": 9.338415145874023, + "epoch": 0.39203084832904883, + "mean_token_accuracy": 0.7424483299255371, + "num_tokens": 20640570.0, + "step": 3965, + "train/ce_loss": 0.6520880460739136 + }, + { + "epoch": 0.39203084832904883, + "step": 3965, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.39203084832904883, + "step": 3965, + "train/total_loss": 0.1277088075876236 + }, + { + "entropy": 9.578125, + "epoch": 0.39212972117856437, + "mean_token_accuracy": 0.7858347296714783, + "num_tokens": 20645601.0, + "step": 3966, + "train/ce_loss": 1.149623155593872 + }, + { + "epoch": 0.39212972117856437, + "step": 3966, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.39212972117856437, + "step": 3966, + "train/total_loss": 0.13058730959892273 + }, + { + "entropy": 9.139913558959961, + "epoch": 0.3922285940280799, + "mean_token_accuracy": 0.7357051968574524, + "num_tokens": 20650852.0, + "step": 3967, + "train/ce_loss": 0.43606361746788025 + }, + { + "epoch": 0.3922285940280799, + "step": 3967, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.3922285940280799, + "step": 3967, + "train/total_loss": 0.07095011323690414 + }, + { + "entropy": 9.946247100830078, + "epoch": 0.3923274668775954, + "mean_token_accuracy": 0.7689969539642334, + "num_tokens": 20655632.0, + "step": 3968, + "train/ce_loss": 1.7134802341461182 + }, + { + "epoch": 0.3923274668775954, + "step": 3968, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3923274668775954, + "step": 3968, + "train/total_loss": 0.21822302043437958 + }, + { + "entropy": 9.227418899536133, + "epoch": 0.39242633972711094, + "mean_token_accuracy": 0.7559171319007874, + "num_tokens": 20660746.0, + "step": 3969, + "train/ce_loss": 0.6671918034553528 + }, + { + "epoch": 0.39242633972711094, + "step": 3969, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.39242633972711094, + "step": 3969, + "train/total_loss": 0.1096879318356514 + }, + { + "entropy": 8.94714069366455, + "epoch": 0.3925252125766265, + "mean_token_accuracy": 0.7716436386108398, + "num_tokens": 20666020.0, + "step": 3970, + "train/ce_loss": 0.8409286141395569 + }, + { + "epoch": 0.3925252125766265, + "step": 3970, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.3925252125766265, + "step": 3970, + "train/total_loss": 0.1426866054534912 + }, + { + "entropy": 8.83590030670166, + "epoch": 0.39262408542614197, + "mean_token_accuracy": 0.680272102355957, + "num_tokens": 20671340.0, + "step": 3971, + "train/ce_loss": 0.7781625390052795 + }, + { + "epoch": 0.39262408542614197, + "step": 3971, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.39262408542614197, + "step": 3971, + "train/total_loss": 0.14031624794006348 + }, + { + "entropy": 9.116752624511719, + "epoch": 0.3927229582756575, + "mean_token_accuracy": 0.7691256999969482, + "num_tokens": 20676527.0, + "step": 3972, + "train/ce_loss": 0.5245115160942078 + }, + { + "epoch": 0.3927229582756575, + "step": 3972, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.3927229582756575, + "step": 3972, + "train/total_loss": 0.08370114862918854 + }, + { + "entropy": 9.668756484985352, + "epoch": 0.39282183112517305, + "mean_token_accuracy": 0.7322970628738403, + "num_tokens": 20681526.0, + "step": 3973, + "train/ce_loss": 1.531264066696167 + }, + { + "epoch": 0.39282183112517305, + "step": 3973, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.39282183112517305, + "step": 3973, + "train/total_loss": 0.21953265368938446 + }, + { + "entropy": 8.670350074768066, + "epoch": 0.39292070397468853, + "mean_token_accuracy": 0.6647331714630127, + "num_tokens": 20686872.0, + "step": 3974, + "train/ce_loss": 1.3226938247680664 + }, + { + "epoch": 0.39292070397468853, + "step": 3974, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.39292070397468853, + "step": 3974, + "train/total_loss": 0.18305063247680664 + }, + { + "entropy": 9.280973434448242, + "epoch": 0.3930195768242041, + "mean_token_accuracy": 0.7009569406509399, + "num_tokens": 20692188.0, + "step": 3975, + "train/ce_loss": 1.4136747121810913 + }, + { + "epoch": 0.3930195768242041, + "step": 3975, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.3930195768242041, + "step": 3975, + "train/total_loss": 0.22339873015880585 + }, + { + "entropy": 9.94870376586914, + "epoch": 0.3931184496737196, + "mean_token_accuracy": 0.7412399053573608, + "num_tokens": 20696937.0, + "step": 3976, + "train/ce_loss": 1.8449875116348267 + }, + { + "epoch": 0.3931184496737196, + "step": 3976, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3931184496737196, + "step": 3976, + "train/total_loss": 0.23918625712394714 + }, + { + "entropy": 10.018001556396484, + "epoch": 0.3932173225232351, + "mean_token_accuracy": 0.7804877758026123, + "num_tokens": 20701772.0, + "step": 3977, + "train/ce_loss": 6.123746970843058e-06 + }, + { + "epoch": 0.3932173225232351, + "step": 3977, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.3932173225232351, + "step": 3977, + "train/total_loss": 0.05859436094760895 + }, + { + "entropy": 8.99118423461914, + "epoch": 0.39331619537275064, + "mean_token_accuracy": 0.761904776096344, + "num_tokens": 20707067.0, + "step": 3978, + "train/ce_loss": 1.0223777294158936 + }, + { + "epoch": 0.39331619537275064, + "step": 3978, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.39331619537275064, + "step": 3978, + "train/total_loss": 0.1452065259218216 + }, + { + "entropy": 9.36185073852539, + "epoch": 0.3934150682222662, + "mean_token_accuracy": 0.7319728136062622, + "num_tokens": 20712434.0, + "step": 3979, + "train/ce_loss": 0.9951471090316772 + }, + { + "epoch": 0.3934150682222662, + "step": 3979, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.3934150682222662, + "step": 3979, + "train/total_loss": 0.16592097282409668 + }, + { + "epoch": 0.39351394107178167, + "grad_norm": 0.7715674638748169, + "learning_rate": 9.018691588785047e-06, + "loss": 0.1368, + "step": 3980 + }, + { + "entropy": 8.942005157470703, + "epoch": 0.39351394107178167, + "mean_token_accuracy": 0.757777750492096, + "num_tokens": 20717817.0, + "step": 3980, + "train/ce_loss": 0.8228210210800171 + }, + { + "epoch": 0.39351394107178167, + "step": 3980, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.39351394107178167, + "step": 3980, + "train/total_loss": 0.17993834614753723 + }, + { + "entropy": 9.453428268432617, + "epoch": 0.3936128139212972, + "mean_token_accuracy": 0.7148817777633667, + "num_tokens": 20723013.0, + "step": 3981, + "train/ce_loss": 0.5984261631965637 + }, + { + "epoch": 0.3936128139212972, + "step": 3981, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.3936128139212972, + "step": 3981, + "train/total_loss": 0.08718636631965637 + }, + { + "entropy": 8.952157974243164, + "epoch": 0.39371168677081275, + "mean_token_accuracy": 0.7518796920776367, + "num_tokens": 20728459.0, + "step": 3982, + "train/ce_loss": 0.4479600787162781 + }, + { + "epoch": 0.39371168677081275, + "step": 3982, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.39371168677081275, + "step": 3982, + "train/total_loss": 0.09948350489139557 + }, + { + "entropy": 9.203887939453125, + "epoch": 0.39381055962032824, + "mean_token_accuracy": 0.7377423048019409, + "num_tokens": 20733804.0, + "step": 3983, + "train/ce_loss": 0.4607117474079132 + }, + { + "epoch": 0.39381055962032824, + "step": 3983, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.39381055962032824, + "step": 3983, + "train/total_loss": 0.10857117176055908 + }, + { + "entropy": 8.882675170898438, + "epoch": 0.3939094324698438, + "mean_token_accuracy": 0.7382857203483582, + "num_tokens": 20739260.0, + "step": 3984, + "train/ce_loss": 1.2117985486984253 + }, + { + "epoch": 0.3939094324698438, + "step": 3984, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.3939094324698438, + "step": 3984, + "train/total_loss": 0.19930484890937805 + }, + { + "entropy": 8.734657287597656, + "epoch": 0.3940083053193593, + "mean_token_accuracy": 0.7133758068084717, + "num_tokens": 20744666.0, + "step": 3985, + "train/ce_loss": 0.7336761355400085 + }, + { + "epoch": 0.3940083053193593, + "step": 3985, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.3940083053193593, + "step": 3985, + "train/total_loss": 0.1514926254749298 + }, + { + "entropy": 9.270834922790527, + "epoch": 0.3941071781688748, + "mean_token_accuracy": 0.7160161733627319, + "num_tokens": 20749855.0, + "step": 3986, + "train/ce_loss": 0.663804292678833 + }, + { + "epoch": 0.3941071781688748, + "step": 3986, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.3941071781688748, + "step": 3986, + "train/total_loss": 0.10934918373823166 + }, + { + "entropy": 8.473923683166504, + "epoch": 0.39420605101839035, + "mean_token_accuracy": 0.7654075622558594, + "num_tokens": 20755331.0, + "step": 3987, + "train/ce_loss": 0.7835092544555664 + }, + { + "epoch": 0.39420605101839035, + "step": 3987, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.39420605101839035, + "step": 3987, + "train/total_loss": 0.13694468140602112 + }, + { + "entropy": 9.105308532714844, + "epoch": 0.3943049238679059, + "mean_token_accuracy": 0.6714456677436829, + "num_tokens": 20760625.0, + "step": 3988, + "train/ce_loss": 0.9587419033050537 + }, + { + "epoch": 0.3943049238679059, + "step": 3988, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3943049238679059, + "step": 3988, + "train/total_loss": 0.14274919033050537 + }, + { + "entropy": 9.182525634765625, + "epoch": 0.3944037967174214, + "mean_token_accuracy": 0.7326478362083435, + "num_tokens": 20765824.0, + "step": 3989, + "train/ce_loss": 0.9583200216293335 + }, + { + "epoch": 0.3944037967174214, + "step": 3989, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.3944037967174214, + "step": 3989, + "train/total_loss": 0.1778632551431656 + }, + { + "entropy": 9.696810722351074, + "epoch": 0.3945026695669369, + "mean_token_accuracy": 0.7580340504646301, + "num_tokens": 20770796.0, + "step": 3990, + "train/ce_loss": 1.025506615638733 + }, + { + "epoch": 0.3945026695669369, + "step": 3990, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.3945026695669369, + "step": 3990, + "train/total_loss": 0.1806756556034088 + }, + { + "entropy": 9.201448440551758, + "epoch": 0.39460154241645246, + "mean_token_accuracy": 0.7397260069847107, + "num_tokens": 20775989.0, + "step": 3991, + "train/ce_loss": 0.9222592115402222 + }, + { + "epoch": 0.39460154241645246, + "step": 3991, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.39460154241645246, + "step": 3991, + "train/total_loss": 0.13519467413425446 + }, + { + "entropy": 8.77785587310791, + "epoch": 0.39470041526596794, + "mean_token_accuracy": 0.7600411772727966, + "num_tokens": 20781468.0, + "step": 3992, + "train/ce_loss": 0.945110559463501 + }, + { + "epoch": 0.39470041526596794, + "step": 3992, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.39470041526596794, + "step": 3992, + "train/total_loss": 0.19607356190681458 + }, + { + "entropy": 9.376846313476562, + "epoch": 0.3947992881154835, + "mean_token_accuracy": 0.6996123790740967, + "num_tokens": 20786429.0, + "step": 3993, + "train/ce_loss": 1.9209712743759155 + }, + { + "epoch": 0.3947992881154835, + "step": 3993, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.3947992881154835, + "step": 3993, + "train/total_loss": 0.29756587743759155 + }, + { + "entropy": 9.21438217163086, + "epoch": 0.394898160964999, + "mean_token_accuracy": 0.673374593257904, + "num_tokens": 20791554.0, + "step": 3994, + "train/ce_loss": 5.2261252676544245e-06 + }, + { + "epoch": 0.394898160964999, + "step": 3994, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.394898160964999, + "step": 3994, + "train/total_loss": 0.039063021540641785 + }, + { + "entropy": 9.785654067993164, + "epoch": 0.3949970338145145, + "mean_token_accuracy": 0.7127882838249207, + "num_tokens": 20796429.0, + "step": 3995, + "train/ce_loss": 1.522355079650879 + }, + { + "epoch": 0.3949970338145145, + "step": 3995, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.3949970338145145, + "step": 3995, + "train/total_loss": 0.1912980079650879 + }, + { + "entropy": 9.108304977416992, + "epoch": 0.39509590666403005, + "mean_token_accuracy": 0.7588739395141602, + "num_tokens": 20801696.0, + "step": 3996, + "train/ce_loss": 0.9614579081535339 + }, + { + "epoch": 0.39509590666403005, + "step": 3996, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.39509590666403005, + "step": 3996, + "train/total_loss": 0.18989579379558563 + }, + { + "entropy": 9.596675872802734, + "epoch": 0.3951947795135456, + "mean_token_accuracy": 0.7289562225341797, + "num_tokens": 20806701.0, + "step": 3997, + "train/ce_loss": 1.8334210380999139e-06 + }, + { + "epoch": 0.3951947795135456, + "step": 3997, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.3951947795135456, + "step": 3997, + "train/total_loss": 0.06640643626451492 + }, + { + "entropy": 8.852028846740723, + "epoch": 0.3952936523630611, + "mean_token_accuracy": 0.748344361782074, + "num_tokens": 20812123.0, + "step": 3998, + "train/ce_loss": 0.32659050822257996 + }, + { + "epoch": 0.3952936523630611, + "step": 3998, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3952936523630611, + "step": 3998, + "train/total_loss": 0.08734655380249023 + }, + { + "entropy": 9.218839645385742, + "epoch": 0.3953925252125766, + "mean_token_accuracy": 0.7477477192878723, + "num_tokens": 20817347.0, + "step": 3999, + "train/ce_loss": 0.91793292760849 + }, + { + "epoch": 0.3953925252125766, + "step": 3999, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.3953925252125766, + "step": 3999, + "train/total_loss": 0.15038704872131348 + }, + { + "epoch": 0.39549139806209216, + "grad_norm": 0.7181991934776306, + "learning_rate": 9.013746724027098e-06, + "loss": 0.155, + "step": 4000 + }, + { + "entropy": 9.046833038330078, + "epoch": 0.39549139806209216, + "mean_token_accuracy": 0.7191435694694519, + "num_tokens": 5203.0, + "step": 4000, + "train/ce_loss": 0.9499548077583313 + }, + { + "epoch": 0.39549139806209216, + "step": 4000, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.39549139806209216, + "step": 4000, + "train/total_loss": 0.16530798375606537 + }, + { + "entropy": 8.831292152404785, + "epoch": 0.3955902709116077, + "mean_token_accuracy": 0.7470775842666626, + "num_tokens": 10644.0, + "step": 4001, + "train/ce_loss": 1.0550512075424194 + }, + { + "epoch": 0.3955902709116077, + "step": 4001, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.3955902709116077, + "step": 4001, + "train/total_loss": 0.19534887373447418 + }, + { + "entropy": 9.527122497558594, + "epoch": 0.3956891437611232, + "mean_token_accuracy": 0.7454545497894287, + "num_tokens": 15582.0, + "step": 4002, + "train/ce_loss": 1.7859098306871601e-06 + }, + { + "epoch": 0.3956891437611232, + "step": 4002, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3956891437611232, + "step": 4002, + "train/total_loss": 0.054687678813934326 + }, + { + "entropy": 8.842239379882812, + "epoch": 0.3957880166106387, + "mean_token_accuracy": 0.7266436219215393, + "num_tokens": 20895.0, + "step": 4003, + "train/ce_loss": 0.4752326011657715 + }, + { + "epoch": 0.3957880166106387, + "step": 4003, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.3957880166106387, + "step": 4003, + "train/total_loss": 0.10221076011657715 + }, + { + "entropy": 9.415111541748047, + "epoch": 0.39588688946015427, + "mean_token_accuracy": 0.72398841381073, + "num_tokens": 26048.0, + "step": 4004, + "train/ce_loss": 0.8236679434776306 + }, + { + "epoch": 0.39588688946015427, + "step": 4004, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.39588688946015427, + "step": 4004, + "train/total_loss": 0.13314804434776306 + }, + { + "entropy": 9.289140701293945, + "epoch": 0.39598576230966975, + "mean_token_accuracy": 0.7312775254249573, + "num_tokens": 31206.0, + "step": 4005, + "train/ce_loss": 0.7567775249481201 + }, + { + "epoch": 0.39598576230966975, + "step": 4005, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.39598576230966975, + "step": 4005, + "train/total_loss": 0.153802752494812 + }, + { + "entropy": 8.88846492767334, + "epoch": 0.3960846351591853, + "mean_token_accuracy": 0.74210524559021, + "num_tokens": 36670.0, + "step": 4006, + "train/ce_loss": 0.7667742967605591 + }, + { + "epoch": 0.3960846351591853, + "step": 4006, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.3960846351591853, + "step": 4006, + "train/total_loss": 0.16652119159698486 + }, + { + "entropy": 9.517263412475586, + "epoch": 0.39618350800870084, + "mean_token_accuracy": 0.757328987121582, + "num_tokens": 41714.0, + "step": 4007, + "train/ce_loss": 1.5702300970588112e-06 + }, + { + "epoch": 0.39618350800870084, + "step": 4007, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.39618350800870084, + "step": 4007, + "train/total_loss": 0.046875156462192535 + }, + { + "entropy": 9.251596450805664, + "epoch": 0.3962823808582163, + "mean_token_accuracy": 0.7770082950592041, + "num_tokens": 46944.0, + "step": 4008, + "train/ce_loss": 1.188407301902771 + }, + { + "epoch": 0.3962823808582163, + "step": 4008, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.3962823808582163, + "step": 4008, + "train/total_loss": 0.20477822422981262 + }, + { + "entropy": 8.82939624786377, + "epoch": 0.39638125370773186, + "mean_token_accuracy": 0.7306122183799744, + "num_tokens": 52395.0, + "step": 4009, + "train/ce_loss": 0.7744306325912476 + }, + { + "epoch": 0.39638125370773186, + "step": 4009, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.39638125370773186, + "step": 4009, + "train/total_loss": 0.15556806325912476 + }, + { + "entropy": 9.621936798095703, + "epoch": 0.3964801265572474, + "mean_token_accuracy": 0.6951026916503906, + "num_tokens": 57442.0, + "step": 4010, + "train/ce_loss": 1.723623514175415 + }, + { + "epoch": 0.3964801265572474, + "step": 4010, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.3964801265572474, + "step": 4010, + "train/total_loss": 0.309081107378006 + }, + { + "entropy": 9.005047798156738, + "epoch": 0.3965789994067629, + "mean_token_accuracy": 0.7599093914031982, + "num_tokens": 62814.0, + "step": 4011, + "train/ce_loss": 0.543965756893158 + }, + { + "epoch": 0.3965789994067629, + "step": 4011, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.3965789994067629, + "step": 4011, + "train/total_loss": 0.09345907717943192 + }, + { + "entropy": 9.017001152038574, + "epoch": 0.39667787225627843, + "mean_token_accuracy": 0.7417452931404114, + "num_tokens": 68119.0, + "step": 4012, + "train/ce_loss": 0.8745249509811401 + }, + { + "epoch": 0.39667787225627843, + "step": 4012, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.39667787225627843, + "step": 4012, + "train/total_loss": 0.1421400010585785 + }, + { + "entropy": 9.530156135559082, + "epoch": 0.39677674510579397, + "mean_token_accuracy": 0.7612403035163879, + "num_tokens": 73200.0, + "step": 4013, + "train/ce_loss": 1.0481228828430176 + }, + { + "epoch": 0.39677674510579397, + "step": 4013, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.39677674510579397, + "step": 4013, + "train/total_loss": 0.15168729424476624 + }, + { + "entropy": 8.660186767578125, + "epoch": 0.39687561795530946, + "mean_token_accuracy": 0.7231578826904297, + "num_tokens": 78622.0, + "step": 4014, + "train/ce_loss": 1.055025339126587 + }, + { + "epoch": 0.39687561795530946, + "step": 4014, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.39687561795530946, + "step": 4014, + "train/total_loss": 0.14847129583358765 + }, + { + "entropy": 9.057394027709961, + "epoch": 0.396974490804825, + "mean_token_accuracy": 0.7364621162414551, + "num_tokens": 83914.0, + "step": 4015, + "train/ce_loss": 0.8721376657485962 + }, + { + "epoch": 0.396974490804825, + "step": 4015, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.396974490804825, + "step": 4015, + "train/total_loss": 0.14190126955509186 + }, + { + "entropy": 9.577649116516113, + "epoch": 0.39707336365434054, + "mean_token_accuracy": 0.6562032699584961, + "num_tokens": 89023.0, + "step": 4016, + "train/ce_loss": 2.242205482616555e-06 + }, + { + "epoch": 0.39707336365434054, + "step": 4016, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.39707336365434054, + "step": 4016, + "train/total_loss": 0.03515647351741791 + }, + { + "entropy": 9.50831413269043, + "epoch": 0.397172236503856, + "mean_token_accuracy": 0.6845238208770752, + "num_tokens": 94210.0, + "step": 4017, + "train/ce_loss": 1.4990187883377075 + }, + { + "epoch": 0.397172236503856, + "step": 4017, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.397172236503856, + "step": 4017, + "train/total_loss": 0.196776881814003 + }, + { + "entropy": 9.12798023223877, + "epoch": 0.39727110935337157, + "mean_token_accuracy": 0.6982543468475342, + "num_tokens": 99452.0, + "step": 4018, + "train/ce_loss": 1.048298716545105 + }, + { + "epoch": 0.39727110935337157, + "step": 4018, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.39727110935337157, + "step": 4018, + "train/total_loss": 0.18295487761497498 + }, + { + "entropy": 8.695804595947266, + "epoch": 0.3973699822028871, + "mean_token_accuracy": 0.7866419553756714, + "num_tokens": 105007.0, + "step": 4019, + "train/ce_loss": 0.5710273385047913 + }, + { + "epoch": 0.3973699822028871, + "step": 4019, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.3973699822028871, + "step": 4019, + "train/total_loss": 0.1742902398109436 + }, + { + "epoch": 0.3974688550524026, + "grad_norm": 0.6420843005180359, + "learning_rate": 9.00880185926915e-06, + "loss": 0.1499, + "step": 4020 + }, + { + "entropy": 9.110542297363281, + "epoch": 0.3974688550524026, + "mean_token_accuracy": 0.738386332988739, + "num_tokens": 110297.0, + "step": 4020, + "train/ce_loss": 0.6451484560966492 + }, + { + "epoch": 0.3974688550524026, + "step": 4020, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.3974688550524026, + "step": 4020, + "train/total_loss": 0.13482734560966492 + }, + { + "entropy": 9.310352325439453, + "epoch": 0.39756772790191813, + "mean_token_accuracy": 0.7536423802375793, + "num_tokens": 115507.0, + "step": 4021, + "train/ce_loss": 0.8712981343269348 + }, + { + "epoch": 0.39756772790191813, + "step": 4021, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.39756772790191813, + "step": 4021, + "train/total_loss": 0.11837981641292572 + }, + { + "entropy": 8.997873306274414, + "epoch": 0.3976666007514337, + "mean_token_accuracy": 0.7185821533203125, + "num_tokens": 120905.0, + "step": 4022, + "train/ce_loss": 0.6323196291923523 + }, + { + "epoch": 0.3976666007514337, + "step": 4022, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3976666007514337, + "step": 4022, + "train/total_loss": 0.11010696738958359 + }, + { + "entropy": 8.847343444824219, + "epoch": 0.39776547360094916, + "mean_token_accuracy": 0.7548022866249084, + "num_tokens": 126255.0, + "step": 4023, + "train/ce_loss": 0.6254032850265503 + }, + { + "epoch": 0.39776547360094916, + "step": 4023, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.39776547360094916, + "step": 4023, + "train/total_loss": 0.14457157254219055 + }, + { + "entropy": 9.098240852355957, + "epoch": 0.3978643464504647, + "mean_token_accuracy": 0.7392900586128235, + "num_tokens": 131579.0, + "step": 4024, + "train/ce_loss": 0.5944740772247314 + }, + { + "epoch": 0.3978643464504647, + "step": 4024, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.3978643464504647, + "step": 4024, + "train/total_loss": 0.08288490772247314 + }, + { + "entropy": 8.962140083312988, + "epoch": 0.39796321929998024, + "mean_token_accuracy": 0.6808510422706604, + "num_tokens": 136828.0, + "step": 4025, + "train/ce_loss": 0.6933006048202515 + }, + { + "epoch": 0.39796321929998024, + "step": 4025, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.39796321929998024, + "step": 4025, + "train/total_loss": 0.14745506644248962 + }, + { + "entropy": 9.307699203491211, + "epoch": 0.3980620921494957, + "mean_token_accuracy": 0.6658163070678711, + "num_tokens": 142239.0, + "step": 4026, + "train/ce_loss": 2.1176609992980957 + }, + { + "epoch": 0.3980620921494957, + "step": 4026, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.3980620921494957, + "step": 4026, + "train/total_loss": 0.2859848737716675 + }, + { + "entropy": 9.146602630615234, + "epoch": 0.39816096499901127, + "mean_token_accuracy": 0.7664429545402527, + "num_tokens": 147485.0, + "step": 4027, + "train/ce_loss": 0.6791132092475891 + }, + { + "epoch": 0.39816096499901127, + "step": 4027, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.39816096499901127, + "step": 4027, + "train/total_loss": 0.1265050768852234 + }, + { + "entropy": 8.8102388381958, + "epoch": 0.3982598378485268, + "mean_token_accuracy": 0.7497593760490417, + "num_tokens": 152976.0, + "step": 4028, + "train/ce_loss": 0.8654963374137878 + }, + { + "epoch": 0.3982598378485268, + "step": 4028, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.3982598378485268, + "step": 4028, + "train/total_loss": 0.17248713970184326 + }, + { + "entropy": 8.74563217163086, + "epoch": 0.3983587106980423, + "mean_token_accuracy": 0.6796380281448364, + "num_tokens": 158604.0, + "step": 4029, + "train/ce_loss": 1.4658328294754028 + }, + { + "epoch": 0.3983587106980423, + "step": 4029, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.3983587106980423, + "step": 4029, + "train/total_loss": 0.21689578890800476 + }, + { + "entropy": 10.073447227478027, + "epoch": 0.39845758354755784, + "mean_token_accuracy": 0.7485380172729492, + "num_tokens": 163393.0, + "step": 4030, + "train/ce_loss": 2.0892558097839355 + }, + { + "epoch": 0.39845758354755784, + "step": 4030, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.39845758354755784, + "step": 4030, + "train/total_loss": 0.23626933991909027 + }, + { + "entropy": 9.204407691955566, + "epoch": 0.3985564563970734, + "mean_token_accuracy": 0.710303008556366, + "num_tokens": 168700.0, + "step": 4031, + "train/ce_loss": 1.2037955522537231 + }, + { + "epoch": 0.3985564563970734, + "step": 4031, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.3985564563970734, + "step": 4031, + "train/total_loss": 0.23366081714630127 + }, + { + "entropy": 9.018072128295898, + "epoch": 0.39865532924658886, + "mean_token_accuracy": 0.7207637429237366, + "num_tokens": 174018.0, + "step": 4032, + "train/ce_loss": 0.6557134389877319 + }, + { + "epoch": 0.39865532924658886, + "step": 4032, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.39865532924658886, + "step": 4032, + "train/total_loss": 0.10072759538888931 + }, + { + "entropy": 9.474685668945312, + "epoch": 0.3987542020961044, + "mean_token_accuracy": 0.7649006843566895, + "num_tokens": 179124.0, + "step": 4033, + "train/ce_loss": 1.5377615690231323 + }, + { + "epoch": 0.3987542020961044, + "step": 4033, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.3987542020961044, + "step": 4033, + "train/total_loss": 0.239713653922081 + }, + { + "entropy": 9.250163078308105, + "epoch": 0.39885307494561995, + "mean_token_accuracy": 0.7286713123321533, + "num_tokens": 184274.0, + "step": 4034, + "train/ce_loss": 0.9188748598098755 + }, + { + "epoch": 0.39885307494561995, + "step": 4034, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.39885307494561995, + "step": 4034, + "train/total_loss": 0.1348562389612198 + }, + { + "entropy": 10.157920837402344, + "epoch": 0.39895194779513543, + "mean_token_accuracy": 0.7976539731025696, + "num_tokens": 189047.0, + "step": 4035, + "train/ce_loss": 3.7278227864590008e-06 + }, + { + "epoch": 0.39895194779513543, + "step": 4035, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.39895194779513543, + "step": 4035, + "train/total_loss": 0.054687872529029846 + }, + { + "entropy": 8.8870849609375, + "epoch": 0.39905082064465097, + "mean_token_accuracy": 0.7160633206367493, + "num_tokens": 194408.0, + "step": 4036, + "train/ce_loss": 0.7803953289985657 + }, + { + "epoch": 0.39905082064465097, + "step": 4036, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.39905082064465097, + "step": 4036, + "train/total_loss": 0.1522582769393921 + }, + { + "entropy": 8.751253128051758, + "epoch": 0.3991496934941665, + "mean_token_accuracy": 0.7391742467880249, + "num_tokens": 199845.0, + "step": 4037, + "train/ce_loss": 0.8194625973701477 + }, + { + "epoch": 0.3991496934941665, + "step": 4037, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3991496934941665, + "step": 4037, + "train/total_loss": 0.1288212537765503 + }, + { + "entropy": 8.811112403869629, + "epoch": 0.399248566343682, + "mean_token_accuracy": 0.7383720874786377, + "num_tokens": 205327.0, + "step": 4038, + "train/ce_loss": 0.7337889671325684 + }, + { + "epoch": 0.399248566343682, + "step": 4038, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.399248566343682, + "step": 4038, + "train/total_loss": 0.12806639075279236 + }, + { + "entropy": 9.679691314697266, + "epoch": 0.39934743919319754, + "mean_token_accuracy": 0.6666666865348816, + "num_tokens": 210204.0, + "step": 4039, + "train/ce_loss": 1.6742430943850195e-06 + }, + { + "epoch": 0.39934743919319754, + "step": 4039, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.39934743919319754, + "step": 4039, + "train/total_loss": 0.02343766763806343 + }, + { + "epoch": 0.3994463120427131, + "grad_norm": 0.9456307291984558, + "learning_rate": 9.0038569945112e-06, + "loss": 0.15, + "step": 4040 + }, + { + "entropy": 9.541059494018555, + "epoch": 0.3994463120427131, + "mean_token_accuracy": 0.7266054749488831, + "num_tokens": 215164.0, + "step": 4040, + "train/ce_loss": 3.007882595062256 + }, + { + "epoch": 0.3994463120427131, + "step": 4040, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.3994463120427131, + "step": 4040, + "train/total_loss": 0.4062570035457611 + }, + { + "entropy": 9.531622886657715, + "epoch": 0.39954518489222857, + "mean_token_accuracy": 0.698113203048706, + "num_tokens": 220296.0, + "step": 4041, + "train/ce_loss": 1.396550487697823e-06 + }, + { + "epoch": 0.39954518489222857, + "step": 4041, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.39954518489222857, + "step": 4041, + "train/total_loss": 0.03125013783574104 + }, + { + "entropy": 9.091503143310547, + "epoch": 0.3996440577417441, + "mean_token_accuracy": 0.8155940771102905, + "num_tokens": 225589.0, + "step": 4042, + "train/ce_loss": 0.5756384134292603 + }, + { + "epoch": 0.3996440577417441, + "step": 4042, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.3996440577417441, + "step": 4042, + "train/total_loss": 0.12787634134292603 + }, + { + "entropy": 8.81747817993164, + "epoch": 0.39974293059125965, + "mean_token_accuracy": 0.7452547550201416, + "num_tokens": 231007.0, + "step": 4043, + "train/ce_loss": 0.8199257254600525 + }, + { + "epoch": 0.39974293059125965, + "step": 4043, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.39974293059125965, + "step": 4043, + "train/total_loss": 0.14058631658554077 + }, + { + "entropy": 9.08637523651123, + "epoch": 0.3998418034407752, + "mean_token_accuracy": 0.7206266522407532, + "num_tokens": 236225.0, + "step": 4044, + "train/ce_loss": 1.1336801052093506 + }, + { + "epoch": 0.3998418034407752, + "step": 4044, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.3998418034407752, + "step": 4044, + "train/total_loss": 0.14461800456047058 + }, + { + "entropy": 9.368659019470215, + "epoch": 0.3999406762902907, + "mean_token_accuracy": 0.7040441036224365, + "num_tokens": 241239.0, + "step": 4045, + "train/ce_loss": 1.5637998580932617 + }, + { + "epoch": 0.3999406762902907, + "step": 4045, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.3999406762902907, + "step": 4045, + "train/total_loss": 0.20325498282909393 + }, + { + "entropy": 9.326437950134277, + "epoch": 0.4000395491398062, + "mean_token_accuracy": 0.7401960492134094, + "num_tokens": 246301.0, + "step": 4046, + "train/ce_loss": 0.9135624766349792 + }, + { + "epoch": 0.4000395491398062, + "step": 4046, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.4000395491398062, + "step": 4046, + "train/total_loss": 0.15776249766349792 + }, + { + "entropy": 8.795669555664062, + "epoch": 0.40013842198932176, + "mean_token_accuracy": 0.7771428823471069, + "num_tokens": 251820.0, + "step": 4047, + "train/ce_loss": 0.8271172046661377 + }, + { + "epoch": 0.40013842198932176, + "step": 4047, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.40013842198932176, + "step": 4047, + "train/total_loss": 0.13349297642707825 + }, + { + "entropy": 8.837505340576172, + "epoch": 0.40023729483883724, + "mean_token_accuracy": 0.7815040946006775, + "num_tokens": 257247.0, + "step": 4048, + "train/ce_loss": 0.5455641746520996 + }, + { + "epoch": 0.40023729483883724, + "step": 4048, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.40023729483883724, + "step": 4048, + "train/total_loss": 0.11705641448497772 + }, + { + "entropy": 9.698233604431152, + "epoch": 0.4003361676883528, + "mean_token_accuracy": 0.670040488243103, + "num_tokens": 262179.0, + "step": 4049, + "train/ce_loss": 2.601757287979126 + }, + { + "epoch": 0.4003361676883528, + "step": 4049, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.4003361676883528, + "step": 4049, + "train/total_loss": 0.3383007347583771 + }, + { + "entropy": 9.253774642944336, + "epoch": 0.4004350405378683, + "mean_token_accuracy": 0.7510259747505188, + "num_tokens": 267369.0, + "step": 4050, + "train/ce_loss": 0.7595701813697815 + }, + { + "epoch": 0.4004350405378683, + "step": 4050, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.4004350405378683, + "step": 4050, + "train/total_loss": 0.10720702260732651 + }, + { + "entropy": 9.428849220275879, + "epoch": 0.4005339133873838, + "mean_token_accuracy": 0.7894002795219421, + "num_tokens": 272478.0, + "step": 4051, + "train/ce_loss": 0.5480824112892151 + }, + { + "epoch": 0.4005339133873838, + "step": 4051, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.4005339133873838, + "step": 4051, + "train/total_loss": 0.07433949410915375 + }, + { + "entropy": 8.907392501831055, + "epoch": 0.40063278623689935, + "mean_token_accuracy": 0.7654054164886475, + "num_tokens": 277944.0, + "step": 4052, + "train/ce_loss": 0.570549488067627 + }, + { + "epoch": 0.40063278623689935, + "step": 4052, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.40063278623689935, + "step": 4052, + "train/total_loss": 0.10002370178699493 + }, + { + "entropy": 8.90610408782959, + "epoch": 0.4007316590864149, + "mean_token_accuracy": 0.7560483813285828, + "num_tokens": 283426.0, + "step": 4053, + "train/ce_loss": 0.9984250068664551 + }, + { + "epoch": 0.4007316590864149, + "step": 4053, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.4007316590864149, + "step": 4053, + "train/total_loss": 0.17406125366687775 + }, + { + "entropy": 9.252974510192871, + "epoch": 0.4008305319359304, + "mean_token_accuracy": 0.7737127542495728, + "num_tokens": 288633.0, + "step": 4054, + "train/ce_loss": 0.7401827573776245 + }, + { + "epoch": 0.4008305319359304, + "step": 4054, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.4008305319359304, + "step": 4054, + "train/total_loss": 0.11308077722787857 + }, + { + "entropy": 9.408609390258789, + "epoch": 0.4009294047854459, + "mean_token_accuracy": 0.7308781743049622, + "num_tokens": 293964.0, + "step": 4055, + "train/ce_loss": 1.6208299398422241 + }, + { + "epoch": 0.4009294047854459, + "step": 4055, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.4009294047854459, + "step": 4055, + "train/total_loss": 0.2480204999446869 + }, + { + "entropy": 9.414863586425781, + "epoch": 0.40102827763496146, + "mean_token_accuracy": 0.7586206793785095, + "num_tokens": 298968.0, + "step": 4056, + "train/ce_loss": 1.4367305993800983e-06 + }, + { + "epoch": 0.40102827763496146, + "step": 4056, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.40102827763496146, + "step": 4056, + "train/total_loss": 0.03125014528632164 + }, + { + "entropy": 9.196757316589355, + "epoch": 0.40112715048447695, + "mean_token_accuracy": 0.7459893226623535, + "num_tokens": 304217.0, + "step": 4057, + "train/ce_loss": 0.9032488465309143 + }, + { + "epoch": 0.40112715048447695, + "step": 4057, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.40112715048447695, + "step": 4057, + "train/total_loss": 0.17626237869262695 + }, + { + "entropy": 9.397751808166504, + "epoch": 0.4012260233339925, + "mean_token_accuracy": 0.7250755429267883, + "num_tokens": 309337.0, + "step": 4058, + "train/ce_loss": 1.0142661333084106 + }, + { + "epoch": 0.4012260233339925, + "step": 4058, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4012260233339925, + "step": 4058, + "train/total_loss": 0.1561141163110733 + }, + { + "entropy": 9.22425651550293, + "epoch": 0.40132489618350803, + "mean_token_accuracy": 0.7647768259048462, + "num_tokens": 314625.0, + "step": 4059, + "train/ce_loss": 0.8428784012794495 + }, + { + "epoch": 0.40132489618350803, + "step": 4059, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.40132489618350803, + "step": 4059, + "train/total_loss": 0.1546003520488739 + }, + { + "epoch": 0.4014237690330235, + "grad_norm": 0.7428350448608398, + "learning_rate": 8.998912129753253e-06, + "loss": 0.1401, + "step": 4060 + }, + { + "entropy": 9.935905456542969, + "epoch": 0.4014237690330235, + "mean_token_accuracy": 0.7200000286102295, + "num_tokens": 319406.0, + "step": 4060, + "train/ce_loss": 1.5060312747955322 + }, + { + "epoch": 0.4014237690330235, + "step": 4060, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.4014237690330235, + "step": 4060, + "train/total_loss": 0.23654063045978546 + }, + { + "entropy": 9.732525825500488, + "epoch": 0.40152264188253906, + "mean_token_accuracy": 0.7864583134651184, + "num_tokens": 324443.0, + "step": 4061, + "train/ce_loss": 1.0543136596679688 + }, + { + "epoch": 0.40152264188253906, + "step": 4061, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.40152264188253906, + "step": 4061, + "train/total_loss": 0.23043137788772583 + }, + { + "entropy": 8.742220878601074, + "epoch": 0.4016215147320546, + "mean_token_accuracy": 0.7975584864616394, + "num_tokens": 329879.0, + "step": 4062, + "train/ce_loss": 0.620628297328949 + }, + { + "epoch": 0.4016215147320546, + "step": 4062, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.4016215147320546, + "step": 4062, + "train/total_loss": 0.0933128297328949 + }, + { + "entropy": 9.26221752166748, + "epoch": 0.4017203875815701, + "mean_token_accuracy": 0.7550251483917236, + "num_tokens": 335157.0, + "step": 4063, + "train/ce_loss": 0.947535514831543 + }, + { + "epoch": 0.4017203875815701, + "step": 4063, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.4017203875815701, + "step": 4063, + "train/total_loss": 0.15725356340408325 + }, + { + "entropy": 9.272913932800293, + "epoch": 0.4018192604310856, + "mean_token_accuracy": 0.7151898741722107, + "num_tokens": 340404.0, + "step": 4064, + "train/ce_loss": 1.4104467630386353 + }, + { + "epoch": 0.4018192604310856, + "step": 4064, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.4018192604310856, + "step": 4064, + "train/total_loss": 0.20745092630386353 + }, + { + "entropy": 9.822046279907227, + "epoch": 0.40191813328060116, + "mean_token_accuracy": 0.7368420958518982, + "num_tokens": 345124.0, + "step": 4065, + "train/ce_loss": 1.7242532968521118 + }, + { + "epoch": 0.40191813328060116, + "step": 4065, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.40191813328060116, + "step": 4065, + "train/total_loss": 0.21930032968521118 + }, + { + "entropy": 8.932167053222656, + "epoch": 0.40201700613011665, + "mean_token_accuracy": 0.7044100165367126, + "num_tokens": 350486.0, + "step": 4066, + "train/ce_loss": 0.748856782913208 + }, + { + "epoch": 0.40201700613011665, + "step": 4066, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.40201700613011665, + "step": 4066, + "train/total_loss": 0.14129193127155304 + }, + { + "entropy": 9.134580612182617, + "epoch": 0.4021158789796322, + "mean_token_accuracy": 0.7713178396224976, + "num_tokens": 355764.0, + "step": 4067, + "train/ce_loss": 0.8877795934677124 + }, + { + "epoch": 0.4021158789796322, + "step": 4067, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.4021158789796322, + "step": 4067, + "train/total_loss": 0.17080920934677124 + }, + { + "entropy": 9.582756042480469, + "epoch": 0.40221475182914773, + "mean_token_accuracy": 0.6673228144645691, + "num_tokens": 360713.0, + "step": 4068, + "train/ce_loss": 0.9494542479515076 + }, + { + "epoch": 0.40221475182914773, + "step": 4068, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.40221475182914773, + "step": 4068, + "train/total_loss": 0.14572668075561523 + }, + { + "entropy": 9.266700744628906, + "epoch": 0.4023136246786632, + "mean_token_accuracy": 0.7130434513092041, + "num_tokens": 365887.0, + "step": 4069, + "train/ce_loss": 0.8378681540489197 + }, + { + "epoch": 0.4023136246786632, + "step": 4069, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.4023136246786632, + "step": 4069, + "train/total_loss": 0.14628681540489197 + }, + { + "entropy": 9.064632415771484, + "epoch": 0.40241249752817876, + "mean_token_accuracy": 0.740645170211792, + "num_tokens": 371197.0, + "step": 4070, + "train/ce_loss": 0.9820173978805542 + }, + { + "epoch": 0.40241249752817876, + "step": 4070, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.40241249752817876, + "step": 4070, + "train/total_loss": 0.15679550170898438 + }, + { + "entropy": 9.865793228149414, + "epoch": 0.4025113703776943, + "mean_token_accuracy": 0.7670885920524597, + "num_tokens": 375991.0, + "step": 4071, + "train/ce_loss": 1.585090160369873 + }, + { + "epoch": 0.4025113703776943, + "step": 4071, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.4025113703776943, + "step": 4071, + "train/total_loss": 0.1897590160369873 + }, + { + "entropy": 9.119791030883789, + "epoch": 0.4026102432272098, + "mean_token_accuracy": 0.7205387353897095, + "num_tokens": 381359.0, + "step": 4072, + "train/ce_loss": 1.0067178010940552 + }, + { + "epoch": 0.4026102432272098, + "step": 4072, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.4026102432272098, + "step": 4072, + "train/total_loss": 0.21395303308963776 + }, + { + "entropy": 9.519779205322266, + "epoch": 0.4027091160767253, + "mean_token_accuracy": 0.692307710647583, + "num_tokens": 386436.0, + "step": 4073, + "train/ce_loss": 1.344114707535482e-06 + }, + { + "epoch": 0.4027091160767253, + "step": 4073, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.4027091160767253, + "step": 4073, + "train/total_loss": 0.039062634110450745 + }, + { + "entropy": 8.639932632446289, + "epoch": 0.40280798892624087, + "mean_token_accuracy": 0.7018572688102722, + "num_tokens": 391914.0, + "step": 4074, + "train/ce_loss": 1.4248647689819336 + }, + { + "epoch": 0.40280798892624087, + "step": 4074, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.40280798892624087, + "step": 4074, + "train/total_loss": 0.24014273285865784 + }, + { + "entropy": 9.420963287353516, + "epoch": 0.40290686177575635, + "mean_token_accuracy": 0.7201017737388611, + "num_tokens": 397177.0, + "step": 4075, + "train/ce_loss": 1.538402557343943e-06 + }, + { + "epoch": 0.40290686177575635, + "step": 4075, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.40290686177575635, + "step": 4075, + "train/total_loss": 0.023437654599547386 + }, + { + "entropy": 8.777081489562988, + "epoch": 0.4030057346252719, + "mean_token_accuracy": 0.7351778745651245, + "num_tokens": 402650.0, + "step": 4076, + "train/ce_loss": 0.6327884793281555 + }, + { + "epoch": 0.4030057346252719, + "step": 4076, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.4030057346252719, + "step": 4076, + "train/total_loss": 0.09452884644269943 + }, + { + "entropy": 9.25904369354248, + "epoch": 0.40310460747478744, + "mean_token_accuracy": 0.7472826242446899, + "num_tokens": 407881.0, + "step": 4077, + "train/ce_loss": 0.6763638257980347 + }, + { + "epoch": 0.40310460747478744, + "step": 4077, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.40310460747478744, + "step": 4077, + "train/total_loss": 0.1027926355600357 + }, + { + "entropy": 9.023961067199707, + "epoch": 0.4032034803243029, + "mean_token_accuracy": 0.723247230052948, + "num_tokens": 413130.0, + "step": 4078, + "train/ce_loss": 1.1932096481323242 + }, + { + "epoch": 0.4032034803243029, + "step": 4078, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.4032034803243029, + "step": 4078, + "train/total_loss": 0.23650845885276794 + }, + { + "entropy": 9.099632263183594, + "epoch": 0.40330235317381846, + "mean_token_accuracy": 0.7875416874885559, + "num_tokens": 418475.0, + "step": 4079, + "train/ce_loss": 0.6685423851013184 + }, + { + "epoch": 0.40330235317381846, + "step": 4079, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.40330235317381846, + "step": 4079, + "train/total_loss": 0.12935423851013184 + }, + { + "epoch": 0.403401226023334, + "grad_norm": 0.6031910181045532, + "learning_rate": 8.993967264995303e-06, + "loss": 0.1484, + "step": 4080 + }, + { + "entropy": 9.174674987792969, + "epoch": 0.403401226023334, + "mean_token_accuracy": 0.8140770196914673, + "num_tokens": 423713.0, + "step": 4080, + "train/ce_loss": 0.45924171805381775 + }, + { + "epoch": 0.403401226023334, + "step": 4080, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.403401226023334, + "step": 4080, + "train/total_loss": 0.07326792180538177 + }, + { + "entropy": 8.881291389465332, + "epoch": 0.4035000988728495, + "mean_token_accuracy": 0.713567852973938, + "num_tokens": 428986.0, + "step": 4081, + "train/ce_loss": 0.8884828686714172 + }, + { + "epoch": 0.4035000988728495, + "step": 4081, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4035000988728495, + "step": 4081, + "train/total_loss": 0.1435357928276062 + }, + { + "entropy": 9.463319778442383, + "epoch": 0.40359897172236503, + "mean_token_accuracy": 0.749576985836029, + "num_tokens": 434015.0, + "step": 4082, + "train/ce_loss": 0.9069718718528748 + }, + { + "epoch": 0.40359897172236503, + "step": 4082, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.40359897172236503, + "step": 4082, + "train/total_loss": 0.12585344910621643 + }, + { + "entropy": 9.164590835571289, + "epoch": 0.40369784457188057, + "mean_token_accuracy": 0.7601390480995178, + "num_tokens": 439342.0, + "step": 4083, + "train/ce_loss": 0.6753251552581787 + }, + { + "epoch": 0.40369784457188057, + "step": 4083, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.40369784457188057, + "step": 4083, + "train/total_loss": 0.1261262595653534 + }, + { + "entropy": 8.680729866027832, + "epoch": 0.4037967174213961, + "mean_token_accuracy": 0.7681007385253906, + "num_tokens": 444758.0, + "step": 4084, + "train/ce_loss": 0.6548745036125183 + }, + { + "epoch": 0.4037967174213961, + "step": 4084, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4037967174213961, + "step": 4084, + "train/total_loss": 0.12017495185136795 + }, + { + "entropy": 9.053796768188477, + "epoch": 0.4038955902709116, + "mean_token_accuracy": 0.7254237532615662, + "num_tokens": 450083.0, + "step": 4085, + "train/ce_loss": 0.6766396164894104 + }, + { + "epoch": 0.4038955902709116, + "step": 4085, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.4038955902709116, + "step": 4085, + "train/total_loss": 0.14969521760940552 + }, + { + "entropy": 8.84349250793457, + "epoch": 0.40399446312042714, + "mean_token_accuracy": 0.7057613134384155, + "num_tokens": 455495.0, + "step": 4086, + "train/ce_loss": 1.072960376739502 + }, + { + "epoch": 0.40399446312042714, + "step": 4086, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.40399446312042714, + "step": 4086, + "train/total_loss": 0.15417104959487915 + }, + { + "entropy": 9.179718017578125, + "epoch": 0.4040933359699427, + "mean_token_accuracy": 0.7775148153305054, + "num_tokens": 460781.0, + "step": 4087, + "train/ce_loss": 0.4510173797607422 + }, + { + "epoch": 0.4040933359699427, + "step": 4087, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.4040933359699427, + "step": 4087, + "train/total_loss": 0.06853923946619034 + }, + { + "entropy": 9.581786155700684, + "epoch": 0.40419220881945817, + "mean_token_accuracy": 0.7359550595283508, + "num_tokens": 465717.0, + "step": 4088, + "train/ce_loss": 1.939404455697513e-06 + }, + { + "epoch": 0.40419220881945817, + "step": 4088, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.40419220881945817, + "step": 4088, + "train/total_loss": 0.03906269371509552 + }, + { + "entropy": 8.916521072387695, + "epoch": 0.4042910816689737, + "mean_token_accuracy": 0.7421320080757141, + "num_tokens": 471190.0, + "step": 4089, + "train/ce_loss": 0.8648796081542969 + }, + { + "epoch": 0.4042910816689737, + "step": 4089, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.4042910816689737, + "step": 4089, + "train/total_loss": 0.14898796379566193 + }, + { + "entropy": 8.816831588745117, + "epoch": 0.40438995451848925, + "mean_token_accuracy": 0.7719836235046387, + "num_tokens": 476652.0, + "step": 4090, + "train/ce_loss": 0.5155811905860901 + }, + { + "epoch": 0.40438995451848925, + "step": 4090, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.40438995451848925, + "step": 4090, + "train/total_loss": 0.06718312203884125 + }, + { + "entropy": 8.626619338989258, + "epoch": 0.40448882736800473, + "mean_token_accuracy": 0.7412280440330505, + "num_tokens": 482049.0, + "step": 4091, + "train/ce_loss": 0.579270601272583 + }, + { + "epoch": 0.40448882736800473, + "step": 4091, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.40448882736800473, + "step": 4091, + "train/total_loss": 0.08136455714702606 + }, + { + "entropy": 9.28750228881836, + "epoch": 0.4045877002175203, + "mean_token_accuracy": 0.66847825050354, + "num_tokens": 487210.0, + "step": 4092, + "train/ce_loss": 1.484215658820176e-06 + }, + { + "epoch": 0.4045877002175203, + "step": 4092, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.4045877002175203, + "step": 4092, + "train/total_loss": 0.06640639901161194 + }, + { + "entropy": 9.270790100097656, + "epoch": 0.4046865730670358, + "mean_token_accuracy": 0.7216066718101501, + "num_tokens": 492397.0, + "step": 4093, + "train/ce_loss": 0.3460865616798401 + }, + { + "epoch": 0.4046865730670358, + "step": 4093, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.4046865730670358, + "step": 4093, + "train/total_loss": 0.09710866212844849 + }, + { + "entropy": 10.077104568481445, + "epoch": 0.4047854459165513, + "mean_token_accuracy": 0.739534854888916, + "num_tokens": 497040.0, + "step": 4094, + "train/ce_loss": 6.597715582756791e-06 + }, + { + "epoch": 0.4047854459165513, + "step": 4094, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.4047854459165513, + "step": 4094, + "train/total_loss": 0.06640691310167313 + }, + { + "entropy": 9.576848030090332, + "epoch": 0.40488431876606684, + "mean_token_accuracy": 0.7211538553237915, + "num_tokens": 502022.0, + "step": 4095, + "train/ce_loss": 1.0170562267303467 + }, + { + "epoch": 0.40488431876606684, + "step": 4095, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.40488431876606684, + "step": 4095, + "train/total_loss": 0.1876431256532669 + }, + { + "entropy": 9.736879348754883, + "epoch": 0.4049831916155824, + "mean_token_accuracy": 0.7448630332946777, + "num_tokens": 507008.0, + "step": 4096, + "train/ce_loss": 1.0227447748184204 + }, + { + "epoch": 0.4049831916155824, + "step": 4096, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.4049831916155824, + "step": 4096, + "train/total_loss": 0.20774322748184204 + }, + { + "entropy": 9.458332061767578, + "epoch": 0.40508206446509787, + "mean_token_accuracy": 0.7558320164680481, + "num_tokens": 512081.0, + "step": 4097, + "train/ce_loss": 0.6700068116188049 + }, + { + "epoch": 0.40508206446509787, + "step": 4097, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.40508206446509787, + "step": 4097, + "train/total_loss": 0.09825067967176437 + }, + { + "entropy": 9.689407348632812, + "epoch": 0.4051809373146134, + "mean_token_accuracy": 0.7162471413612366, + "num_tokens": 516974.0, + "step": 4098, + "train/ce_loss": 2.7539579150470672e-06 + }, + { + "epoch": 0.4051809373146134, + "step": 4098, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.4051809373146134, + "step": 4098, + "train/total_loss": 0.07031277567148209 + }, + { + "entropy": 9.511401176452637, + "epoch": 0.40527981016412895, + "mean_token_accuracy": 0.7224264740943909, + "num_tokens": 521918.0, + "step": 4099, + "train/ce_loss": 0.7716624140739441 + }, + { + "epoch": 0.40527981016412895, + "step": 4099, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.40527981016412895, + "step": 4099, + "train/total_loss": 0.13185374438762665 + }, + { + "epoch": 0.40537868301364444, + "grad_norm": 0.7709155082702637, + "learning_rate": 8.989022400237354e-06, + "loss": 0.1442, + "step": 4100 + }, + { + "entropy": 8.770706176757812, + "epoch": 0.40537868301364444, + "mean_token_accuracy": 0.7266880869865417, + "num_tokens": 527264.0, + "step": 4100, + "train/ce_loss": 1.0688512325286865 + }, + { + "epoch": 0.40537868301364444, + "step": 4100, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.40537868301364444, + "step": 4100, + "train/total_loss": 0.1615726351737976 + }, + { + "entropy": 9.146575927734375, + "epoch": 0.40547755586316, + "mean_token_accuracy": 0.747863233089447, + "num_tokens": 532406.0, + "step": 4101, + "train/ce_loss": 0.6212337613105774 + }, + { + "epoch": 0.40547755586316, + "step": 4101, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.40547755586316, + "step": 4101, + "train/total_loss": 0.0894671231508255 + }, + { + "entropy": 8.769676208496094, + "epoch": 0.4055764287126755, + "mean_token_accuracy": 0.7096070051193237, + "num_tokens": 537843.0, + "step": 4102, + "train/ce_loss": 1.2739291191101074 + }, + { + "epoch": 0.4055764287126755, + "step": 4102, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.4055764287126755, + "step": 4102, + "train/total_loss": 0.22114291787147522 + }, + { + "entropy": 9.421545028686523, + "epoch": 0.405675301562191, + "mean_token_accuracy": 0.761695921421051, + "num_tokens": 542962.0, + "step": 4103, + "train/ce_loss": 3.845986611850094e-06 + }, + { + "epoch": 0.405675301562191, + "step": 4103, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.405675301562191, + "step": 4103, + "train/total_loss": 0.05078163370490074 + }, + { + "entropy": 9.3009033203125, + "epoch": 0.40577417441170655, + "mean_token_accuracy": 0.7651195526123047, + "num_tokens": 548130.0, + "step": 4104, + "train/ce_loss": 0.5734964609146118 + }, + { + "epoch": 0.40577417441170655, + "step": 4104, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.40577417441170655, + "step": 4104, + "train/total_loss": 0.12375590205192566 + }, + { + "entropy": 9.662715911865234, + "epoch": 0.4058730472612221, + "mean_token_accuracy": 0.7347294688224792, + "num_tokens": 553113.0, + "step": 4105, + "train/ce_loss": 0.7485767006874084 + }, + { + "epoch": 0.4058730472612221, + "step": 4105, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4058730472612221, + "step": 4105, + "train/total_loss": 0.1295451819896698 + }, + { + "entropy": 8.815567970275879, + "epoch": 0.40597192011073757, + "mean_token_accuracy": 0.711275041103363, + "num_tokens": 558553.0, + "step": 4106, + "train/ce_loss": 0.6791121363639832 + }, + { + "epoch": 0.40597192011073757, + "step": 4106, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.40597192011073757, + "step": 4106, + "train/total_loss": 0.11478621512651443 + }, + { + "entropy": 9.781185150146484, + "epoch": 0.4060707929602531, + "mean_token_accuracy": 0.7198953032493591, + "num_tokens": 563349.0, + "step": 4107, + "train/ce_loss": 4.859100954490714e-06 + }, + { + "epoch": 0.4060707929602531, + "step": 4107, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.4060707929602531, + "step": 4107, + "train/total_loss": 0.0664067342877388 + }, + { + "entropy": 9.460572242736816, + "epoch": 0.40616966580976865, + "mean_token_accuracy": 0.7015625238418579, + "num_tokens": 568465.0, + "step": 4108, + "train/ce_loss": 7.071306754369289e-06 + }, + { + "epoch": 0.40616966580976865, + "step": 4108, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.40616966580976865, + "step": 4108, + "train/total_loss": 0.03906320780515671 + }, + { + "entropy": 9.23255729675293, + "epoch": 0.40626853865928414, + "mean_token_accuracy": 0.769336998462677, + "num_tokens": 573626.0, + "step": 4109, + "train/ce_loss": 0.6542064547538757 + }, + { + "epoch": 0.40626853865928414, + "step": 4109, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.40626853865928414, + "step": 4109, + "train/total_loss": 0.10838939994573593 + }, + { + "entropy": 9.546173095703125, + "epoch": 0.4063674115087997, + "mean_token_accuracy": 0.7324841022491455, + "num_tokens": 578639.0, + "step": 4110, + "train/ce_loss": 0.6791893243789673 + }, + { + "epoch": 0.4063674115087997, + "step": 4110, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.4063674115087997, + "step": 4110, + "train/total_loss": 0.13041892647743225 + }, + { + "entropy": 9.005331993103027, + "epoch": 0.4064662843583152, + "mean_token_accuracy": 0.7317380309104919, + "num_tokens": 583958.0, + "step": 4111, + "train/ce_loss": 0.8003168106079102 + }, + { + "epoch": 0.4064662843583152, + "step": 4111, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.4064662843583152, + "step": 4111, + "train/total_loss": 0.12690669298171997 + }, + { + "entropy": 9.159075736999512, + "epoch": 0.4065651572078307, + "mean_token_accuracy": 0.7363966107368469, + "num_tokens": 589231.0, + "step": 4112, + "train/ce_loss": 0.8423686027526855 + }, + { + "epoch": 0.4065651572078307, + "step": 4112, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.4065651572078307, + "step": 4112, + "train/total_loss": 0.12720561027526855 + }, + { + "entropy": 8.986069679260254, + "epoch": 0.40666403005734625, + "mean_token_accuracy": 0.7165071964263916, + "num_tokens": 594554.0, + "step": 4113, + "train/ce_loss": 0.9497794508934021 + }, + { + "epoch": 0.40666403005734625, + "step": 4113, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.40666403005734625, + "step": 4113, + "train/total_loss": 0.1809154450893402 + }, + { + "entropy": 8.952409744262695, + "epoch": 0.4067629029068618, + "mean_token_accuracy": 0.7755101919174194, + "num_tokens": 599955.0, + "step": 4114, + "train/ce_loss": 0.6269693970680237 + }, + { + "epoch": 0.4067629029068618, + "step": 4114, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.4067629029068618, + "step": 4114, + "train/total_loss": 0.08222819119691849 + }, + { + "entropy": 8.910983085632324, + "epoch": 0.4068617757563773, + "mean_token_accuracy": 0.7552447319030762, + "num_tokens": 605377.0, + "step": 4115, + "train/ce_loss": 0.5977582931518555 + }, + { + "epoch": 0.4068617757563773, + "step": 4115, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.4068617757563773, + "step": 4115, + "train/total_loss": 0.18086957931518555 + }, + { + "entropy": 9.15473747253418, + "epoch": 0.4069606486058928, + "mean_token_accuracy": 0.7110552787780762, + "num_tokens": 610645.0, + "step": 4116, + "train/ce_loss": 8.161274308804423e-06 + }, + { + "epoch": 0.4069606486058928, + "step": 4116, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.4069606486058928, + "step": 4116, + "train/total_loss": 0.06640706956386566 + }, + { + "entropy": 8.895200729370117, + "epoch": 0.40705952145540836, + "mean_token_accuracy": 0.7756041288375854, + "num_tokens": 615997.0, + "step": 4117, + "train/ce_loss": 0.7694007754325867 + }, + { + "epoch": 0.40705952145540836, + "step": 4117, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.40705952145540836, + "step": 4117, + "train/total_loss": 0.13944008946418762 + }, + { + "entropy": 9.071516990661621, + "epoch": 0.40715839430492384, + "mean_token_accuracy": 0.718471348285675, + "num_tokens": 621268.0, + "step": 4118, + "train/ce_loss": 1.1130056381225586 + }, + { + "epoch": 0.40715839430492384, + "step": 4118, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.40715839430492384, + "step": 4118, + "train/total_loss": 0.14255055785179138 + }, + { + "entropy": 8.723745346069336, + "epoch": 0.4072572671544394, + "mean_token_accuracy": 0.7781955003738403, + "num_tokens": 626806.0, + "step": 4119, + "train/ce_loss": 0.3095889985561371 + }, + { + "epoch": 0.4072572671544394, + "step": 4119, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.4072572671544394, + "step": 4119, + "train/total_loss": 0.05049014836549759 + }, + { + "epoch": 0.4073561400039549, + "grad_norm": 0.59541255235672, + "learning_rate": 8.984077535479406e-06, + "loss": 0.1382, + "step": 4120 + }, + { + "entropy": 10.090121269226074, + "epoch": 0.4073561400039549, + "mean_token_accuracy": 0.7755610942840576, + "num_tokens": 631611.0, + "step": 4120, + "train/ce_loss": 1.8787650333251804e-06 + }, + { + "epoch": 0.4073561400039549, + "step": 4120, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.4073561400039549, + "step": 4120, + "train/total_loss": 0.019531438127160072 + }, + { + "entropy": 8.96327018737793, + "epoch": 0.4074550128534704, + "mean_token_accuracy": 0.6701461672782898, + "num_tokens": 637197.0, + "step": 4121, + "train/ce_loss": 0.7925607562065125 + }, + { + "epoch": 0.4074550128534704, + "step": 4121, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.4074550128534704, + "step": 4121, + "train/total_loss": 0.1222248300909996 + }, + { + "entropy": 10.416955947875977, + "epoch": 0.40755388570298595, + "mean_token_accuracy": 0.7746478915214539, + "num_tokens": 641789.0, + "step": 4122, + "train/ce_loss": 4.615934358298546e-06 + }, + { + "epoch": 0.40755388570298595, + "step": 4122, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.40755388570298595, + "step": 4122, + "train/total_loss": 0.01562546193599701 + }, + { + "entropy": 8.998491287231445, + "epoch": 0.4076527585525015, + "mean_token_accuracy": 0.7345013618469238, + "num_tokens": 646980.0, + "step": 4123, + "train/ce_loss": 1.7812578678131104 + }, + { + "epoch": 0.4076527585525015, + "step": 4123, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.4076527585525015, + "step": 4123, + "train/total_loss": 0.25625079870224 + }, + { + "entropy": 9.302095413208008, + "epoch": 0.407751631402017, + "mean_token_accuracy": 0.6926407217979431, + "num_tokens": 652290.0, + "step": 4124, + "train/ce_loss": 0.5359441637992859 + }, + { + "epoch": 0.407751631402017, + "step": 4124, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.407751631402017, + "step": 4124, + "train/total_loss": 0.10437566787004471 + }, + { + "entropy": 8.885763168334961, + "epoch": 0.4078505042515325, + "mean_token_accuracy": 0.7222222089767456, + "num_tokens": 657511.0, + "step": 4125, + "train/ce_loss": 0.9504994750022888 + }, + { + "epoch": 0.4078505042515325, + "step": 4125, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4078505042515325, + "step": 4125, + "train/total_loss": 0.14973744750022888 + }, + { + "entropy": 8.990278244018555, + "epoch": 0.40794937710104806, + "mean_token_accuracy": 0.7949336767196655, + "num_tokens": 662855.0, + "step": 4126, + "train/ce_loss": 0.8072158098220825 + }, + { + "epoch": 0.40794937710104806, + "step": 4126, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.40794937710104806, + "step": 4126, + "train/total_loss": 0.14712783694267273 + }, + { + "entropy": 8.927145004272461, + "epoch": 0.4080482499505636, + "mean_token_accuracy": 0.7793939113616943, + "num_tokens": 668150.0, + "step": 4127, + "train/ce_loss": 0.6687441468238831 + }, + { + "epoch": 0.4080482499505636, + "step": 4127, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.4080482499505636, + "step": 4127, + "train/total_loss": 0.1879681646823883 + }, + { + "entropy": 9.023634910583496, + "epoch": 0.4081471228000791, + "mean_token_accuracy": 0.7296416759490967, + "num_tokens": 673574.0, + "step": 4128, + "train/ce_loss": 0.825593888759613 + }, + { + "epoch": 0.4081471228000791, + "step": 4128, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.4081471228000791, + "step": 4128, + "train/total_loss": 0.15677814185619354 + }, + { + "entropy": 9.965190887451172, + "epoch": 0.40824599564959463, + "mean_token_accuracy": 0.7906976938247681, + "num_tokens": 678333.0, + "step": 4129, + "train/ce_loss": 1.6968340873718262 + }, + { + "epoch": 0.40824599564959463, + "step": 4129, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.40824599564959463, + "step": 4129, + "train/total_loss": 0.20483966171741486 + }, + { + "entropy": 9.249101638793945, + "epoch": 0.40834486849911017, + "mean_token_accuracy": 0.7264437675476074, + "num_tokens": 683440.0, + "step": 4130, + "train/ce_loss": 1.220321536064148 + }, + { + "epoch": 0.40834486849911017, + "step": 4130, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.40834486849911017, + "step": 4130, + "train/total_loss": 0.21187591552734375 + }, + { + "entropy": 9.138092041015625, + "epoch": 0.40844374134862566, + "mean_token_accuracy": 0.7407407164573669, + "num_tokens": 688917.0, + "step": 4131, + "train/ce_loss": 0.9167888760566711 + }, + { + "epoch": 0.40844374134862566, + "step": 4131, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.40844374134862566, + "step": 4131, + "train/total_loss": 0.16199138760566711 + }, + { + "entropy": 9.164538383483887, + "epoch": 0.4085426141981412, + "mean_token_accuracy": 0.7535853981971741, + "num_tokens": 694046.0, + "step": 4132, + "train/ce_loss": 0.8670970797538757 + }, + { + "epoch": 0.4085426141981412, + "step": 4132, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.4085426141981412, + "step": 4132, + "train/total_loss": 0.13358470797538757 + }, + { + "entropy": 8.78746223449707, + "epoch": 0.40864148704765674, + "mean_token_accuracy": 0.6848341226577759, + "num_tokens": 699336.0, + "step": 4133, + "train/ce_loss": 0.9098950624465942 + }, + { + "epoch": 0.40864148704765674, + "step": 4133, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.40864148704765674, + "step": 4133, + "train/total_loss": 0.20817700028419495 + }, + { + "entropy": 10.089083671569824, + "epoch": 0.4087403598971722, + "mean_token_accuracy": 0.7387387156486511, + "num_tokens": 703964.0, + "step": 4134, + "train/ce_loss": 8.100925697362982e-06 + }, + { + "epoch": 0.4087403598971722, + "step": 4134, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.4087403598971722, + "step": 4134, + "train/total_loss": 0.027344560250639915 + }, + { + "entropy": 9.525264739990234, + "epoch": 0.40883923274668776, + "mean_token_accuracy": 0.7131537199020386, + "num_tokens": 709024.0, + "step": 4135, + "train/ce_loss": 0.7740110754966736 + }, + { + "epoch": 0.40883923274668776, + "step": 4135, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.40883923274668776, + "step": 4135, + "train/total_loss": 0.10865110903978348 + }, + { + "entropy": 8.619873046875, + "epoch": 0.4089381055962033, + "mean_token_accuracy": 0.740170955657959, + "num_tokens": 714657.0, + "step": 4136, + "train/ce_loss": 0.383798211812973 + }, + { + "epoch": 0.4089381055962033, + "step": 4136, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.4089381055962033, + "step": 4136, + "train/total_loss": 0.09697356820106506 + }, + { + "entropy": 9.893526077270508, + "epoch": 0.4090369784457188, + "mean_token_accuracy": 0.7224576473236084, + "num_tokens": 719549.0, + "step": 4137, + "train/ce_loss": 2.5385968685150146 + }, + { + "epoch": 0.4090369784457188, + "step": 4137, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.4090369784457188, + "step": 4137, + "train/total_loss": 0.3163596987724304 + }, + { + "entropy": 9.709188461303711, + "epoch": 0.40913585129523433, + "mean_token_accuracy": 0.7655038833618164, + "num_tokens": 724521.0, + "step": 4138, + "train/ce_loss": 2.8100243980588857e-06 + }, + { + "epoch": 0.40913585129523433, + "step": 4138, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.40913585129523433, + "step": 4138, + "train/total_loss": 0.058594029396772385 + }, + { + "entropy": 9.046256065368652, + "epoch": 0.4092347241447499, + "mean_token_accuracy": 0.7681970596313477, + "num_tokens": 729901.0, + "step": 4139, + "train/ce_loss": 0.5904944539070129 + }, + { + "epoch": 0.4092347241447499, + "step": 4139, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.4092347241447499, + "step": 4139, + "train/total_loss": 0.10983069241046906 + }, + { + "epoch": 0.40933359699426536, + "grad_norm": 0.652369499206543, + "learning_rate": 8.979132670721456e-06, + "loss": 0.1384, + "step": 4140 + }, + { + "entropy": 9.250444412231445, + "epoch": 0.40933359699426536, + "mean_token_accuracy": 0.7360594868659973, + "num_tokens": 735138.0, + "step": 4140, + "train/ce_loss": 1.768407940864563 + }, + { + "epoch": 0.40933359699426536, + "step": 4140, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.40933359699426536, + "step": 4140, + "train/total_loss": 0.22762204706668854 + }, + { + "entropy": 8.794022560119629, + "epoch": 0.4094324698437809, + "mean_token_accuracy": 0.703157901763916, + "num_tokens": 740592.0, + "step": 4141, + "train/ce_loss": 1.0213520526885986 + }, + { + "epoch": 0.4094324698437809, + "step": 4141, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.4094324698437809, + "step": 4141, + "train/total_loss": 0.21541646122932434 + }, + { + "entropy": 9.040531158447266, + "epoch": 0.40953134269329644, + "mean_token_accuracy": 0.7473822236061096, + "num_tokens": 745841.0, + "step": 4142, + "train/ce_loss": 0.9278818368911743 + }, + { + "epoch": 0.40953134269329644, + "step": 4142, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.40953134269329644, + "step": 4142, + "train/total_loss": 0.1513819396495819 + }, + { + "entropy": 9.186511039733887, + "epoch": 0.4096302155428119, + "mean_token_accuracy": 0.7512500286102295, + "num_tokens": 751087.0, + "step": 4143, + "train/ce_loss": 1.1020587180610164e-06 + }, + { + "epoch": 0.4096302155428119, + "step": 4143, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.4096302155428119, + "step": 4143, + "train/total_loss": 0.035156361758708954 + }, + { + "entropy": 8.611774444580078, + "epoch": 0.40972908839232747, + "mean_token_accuracy": 0.7169615030288696, + "num_tokens": 756512.0, + "step": 4144, + "train/ce_loss": 1.0824406147003174 + }, + { + "epoch": 0.40972908839232747, + "step": 4144, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.40972908839232747, + "step": 4144, + "train/total_loss": 0.16683781147003174 + }, + { + "entropy": 9.649202346801758, + "epoch": 0.409827961241843, + "mean_token_accuracy": 0.7317460179328918, + "num_tokens": 761563.0, + "step": 4145, + "train/ce_loss": 1.1951160430908203 + }, + { + "epoch": 0.409827961241843, + "step": 4145, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.409827961241843, + "step": 4145, + "train/total_loss": 0.18201160430908203 + }, + { + "entropy": 8.656286239624023, + "epoch": 0.4099268340913585, + "mean_token_accuracy": 0.7485822439193726, + "num_tokens": 767104.0, + "step": 4146, + "train/ce_loss": 0.3673864006996155 + }, + { + "epoch": 0.4099268340913585, + "step": 4146, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4099268340913585, + "step": 4146, + "train/total_loss": 0.09142614156007767 + }, + { + "entropy": 9.47769546508789, + "epoch": 0.41002570694087404, + "mean_token_accuracy": 0.7534013390541077, + "num_tokens": 772128.0, + "step": 4147, + "train/ce_loss": 2.0078127818123903e-06 + }, + { + "epoch": 0.41002570694087404, + "step": 4147, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.41002570694087404, + "step": 4147, + "train/total_loss": 0.05859395116567612 + }, + { + "entropy": 9.3026123046875, + "epoch": 0.4101245797903896, + "mean_token_accuracy": 0.7651821970939636, + "num_tokens": 777339.0, + "step": 4148, + "train/ce_loss": 0.8742901682853699 + }, + { + "epoch": 0.4101245797903896, + "step": 4148, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.4101245797903896, + "step": 4148, + "train/total_loss": 0.138210266828537 + }, + { + "entropy": 8.528619766235352, + "epoch": 0.41022345263990506, + "mean_token_accuracy": 0.8164300322532654, + "num_tokens": 782830.0, + "step": 4149, + "train/ce_loss": 0.5077932476997375 + }, + { + "epoch": 0.41022345263990506, + "step": 4149, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.41022345263990506, + "step": 4149, + "train/total_loss": 0.070310577750206 + }, + { + "entropy": 9.738550186157227, + "epoch": 0.4103223254894206, + "mean_token_accuracy": 0.8369781374931335, + "num_tokens": 787749.0, + "step": 4150, + "train/ce_loss": 2.5044425910891732e-06 + }, + { + "epoch": 0.4103223254894206, + "step": 4150, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4103223254894206, + "step": 4150, + "train/total_loss": 0.05468774959445 + }, + { + "entropy": 9.343286514282227, + "epoch": 0.41042119833893614, + "mean_token_accuracy": 0.7023977637290955, + "num_tokens": 792922.0, + "step": 4151, + "train/ce_loss": 0.5836455225944519 + }, + { + "epoch": 0.41042119833893614, + "step": 4151, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.41042119833893614, + "step": 4151, + "train/total_loss": 0.12086455523967743 + }, + { + "entropy": 9.220907211303711, + "epoch": 0.41052007118845163, + "mean_token_accuracy": 0.7028713822364807, + "num_tokens": 798218.0, + "step": 4152, + "train/ce_loss": 1.197948932647705 + }, + { + "epoch": 0.41052007118845163, + "step": 4152, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.41052007118845163, + "step": 4152, + "train/total_loss": 0.21354490518569946 + }, + { + "entropy": 9.451131820678711, + "epoch": 0.41061894403796717, + "mean_token_accuracy": 0.7160493731498718, + "num_tokens": 803372.0, + "step": 4153, + "train/ce_loss": 0.8334396481513977 + }, + { + "epoch": 0.41061894403796717, + "step": 4153, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.41061894403796717, + "step": 4153, + "train/total_loss": 0.192718967795372 + }, + { + "entropy": 9.613548278808594, + "epoch": 0.4107178168874827, + "mean_token_accuracy": 0.7343173623085022, + "num_tokens": 808338.0, + "step": 4154, + "train/ce_loss": 1.4410486221313477 + }, + { + "epoch": 0.4107178168874827, + "step": 4154, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.4107178168874827, + "step": 4154, + "train/total_loss": 0.20269861817359924 + }, + { + "entropy": 9.733526229858398, + "epoch": 0.4108166897369982, + "mean_token_accuracy": 0.7347368597984314, + "num_tokens": 813253.0, + "step": 4155, + "train/ce_loss": 1.0818486213684082 + }, + { + "epoch": 0.4108166897369982, + "step": 4155, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.4108166897369982, + "step": 4155, + "train/total_loss": 0.15505987405776978 + }, + { + "entropy": 9.090243339538574, + "epoch": 0.41091556258651374, + "mean_token_accuracy": 0.6897767186164856, + "num_tokens": 818527.0, + "step": 4156, + "train/ce_loss": 1.2623934745788574 + }, + { + "epoch": 0.41091556258651374, + "step": 4156, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.41091556258651374, + "step": 4156, + "train/total_loss": 0.2668643593788147 + }, + { + "entropy": 8.927118301391602, + "epoch": 0.4110144354360293, + "mean_token_accuracy": 0.6821621656417847, + "num_tokens": 823946.0, + "step": 4157, + "train/ce_loss": 1.2227681875228882 + }, + { + "epoch": 0.4110144354360293, + "step": 4157, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.4110144354360293, + "step": 4157, + "train/total_loss": 0.22383931279182434 + }, + { + "entropy": 9.305601119995117, + "epoch": 0.41111330828554477, + "mean_token_accuracy": 0.7320703864097595, + "num_tokens": 829164.0, + "step": 4158, + "train/ce_loss": 0.6768001914024353 + }, + { + "epoch": 0.41111330828554477, + "step": 4158, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.41111330828554477, + "step": 4158, + "train/total_loss": 0.12236752361059189 + }, + { + "entropy": 9.253849983215332, + "epoch": 0.4112121811350603, + "mean_token_accuracy": 0.7406483888626099, + "num_tokens": 834422.0, + "step": 4159, + "train/ce_loss": 1.0460083484649658 + }, + { + "epoch": 0.4112121811350603, + "step": 4159, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.4112121811350603, + "step": 4159, + "train/total_loss": 0.21397584676742554 + }, + { + "epoch": 0.41131105398457585, + "grad_norm": 0.7883924245834351, + "learning_rate": 8.974187805963509e-06, + "loss": 0.1431, + "step": 4160 + }, + { + "entropy": 9.040374755859375, + "epoch": 0.41131105398457585, + "mean_token_accuracy": 0.7023153305053711, + "num_tokens": 839765.0, + "step": 4160, + "train/ce_loss": 1.358425498008728 + }, + { + "epoch": 0.41131105398457585, + "step": 4160, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.41131105398457585, + "step": 4160, + "train/total_loss": 0.19053004682064056 + }, + { + "entropy": 9.092089653015137, + "epoch": 0.41140992683409133, + "mean_token_accuracy": 0.7114177942276001, + "num_tokens": 845004.0, + "step": 4161, + "train/ce_loss": 0.8708246350288391 + }, + { + "epoch": 0.41140992683409133, + "step": 4161, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.41140992683409133, + "step": 4161, + "train/total_loss": 0.15348872542381287 + }, + { + "entropy": 8.921030044555664, + "epoch": 0.4115087996836069, + "mean_token_accuracy": 0.7410617470741272, + "num_tokens": 850422.0, + "step": 4162, + "train/ce_loss": 0.5317978262901306 + }, + { + "epoch": 0.4115087996836069, + "step": 4162, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.4115087996836069, + "step": 4162, + "train/total_loss": 0.1156797856092453 + }, + { + "entropy": 8.912398338317871, + "epoch": 0.4116076725331224, + "mean_token_accuracy": 0.7205284833908081, + "num_tokens": 855846.0, + "step": 4163, + "train/ce_loss": 1.3807965517044067 + }, + { + "epoch": 0.4116076725331224, + "step": 4163, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.4116076725331224, + "step": 4163, + "train/total_loss": 0.1849546581506729 + }, + { + "entropy": 9.397573471069336, + "epoch": 0.4117065453826379, + "mean_token_accuracy": 0.7416918277740479, + "num_tokens": 860972.0, + "step": 4164, + "train/ce_loss": 0.8983836770057678 + }, + { + "epoch": 0.4117065453826379, + "step": 4164, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.4117065453826379, + "step": 4164, + "train/total_loss": 0.16015087068080902 + }, + { + "entropy": 8.739873886108398, + "epoch": 0.41180541823215344, + "mean_token_accuracy": 0.7679324746131897, + "num_tokens": 866399.0, + "step": 4165, + "train/ce_loss": 0.801369309425354 + }, + { + "epoch": 0.41180541823215344, + "step": 4165, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.41180541823215344, + "step": 4165, + "train/total_loss": 0.10748068243265152 + }, + { + "entropy": 9.21189022064209, + "epoch": 0.411904291081669, + "mean_token_accuracy": 0.7422552704811096, + "num_tokens": 871674.0, + "step": 4166, + "train/ce_loss": 0.5884183645248413 + }, + { + "epoch": 0.411904291081669, + "step": 4166, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.411904291081669, + "step": 4166, + "train/total_loss": 0.12134183943271637 + }, + { + "entropy": 9.931614875793457, + "epoch": 0.4120031639311845, + "mean_token_accuracy": 0.738095223903656, + "num_tokens": 876489.0, + "step": 4167, + "train/ce_loss": 1.0798530578613281 + }, + { + "epoch": 0.4120031639311845, + "step": 4167, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.4120031639311845, + "step": 4167, + "train/total_loss": 0.15876656770706177 + }, + { + "entropy": 9.207361221313477, + "epoch": 0.4121020367807, + "mean_token_accuracy": 0.7567164301872253, + "num_tokens": 881592.0, + "step": 4168, + "train/ce_loss": 2.0488355403358582e-06 + }, + { + "epoch": 0.4121020367807, + "step": 4168, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.4121020367807, + "step": 4168, + "train/total_loss": 0.046875204890966415 + }, + { + "entropy": 8.989534378051758, + "epoch": 0.41220090963021555, + "mean_token_accuracy": 0.7125550508499146, + "num_tokens": 886980.0, + "step": 4169, + "train/ce_loss": 0.9053176045417786 + }, + { + "epoch": 0.41220090963021555, + "step": 4169, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.41220090963021555, + "step": 4169, + "train/total_loss": 0.12959426641464233 + }, + { + "entropy": 9.171548843383789, + "epoch": 0.4122997824797311, + "mean_token_accuracy": 0.7469879388809204, + "num_tokens": 892232.0, + "step": 4170, + "train/ce_loss": 0.6478603482246399 + }, + { + "epoch": 0.4122997824797311, + "step": 4170, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.4122997824797311, + "step": 4170, + "train/total_loss": 0.09212978929281235 + }, + { + "entropy": 8.943717956542969, + "epoch": 0.4123986553292466, + "mean_token_accuracy": 0.7210065722465515, + "num_tokens": 897634.0, + "step": 4171, + "train/ce_loss": 1.3900470733642578 + }, + { + "epoch": 0.4123986553292466, + "step": 4171, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.4123986553292466, + "step": 4171, + "train/total_loss": 0.23666095733642578 + }, + { + "entropy": 8.863798141479492, + "epoch": 0.4124975281787621, + "mean_token_accuracy": 0.7757575511932373, + "num_tokens": 903065.0, + "step": 4172, + "train/ce_loss": 1.0701133012771606 + }, + { + "epoch": 0.4124975281787621, + "step": 4172, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.4124975281787621, + "step": 4172, + "train/total_loss": 0.1734175831079483 + }, + { + "entropy": 9.983399391174316, + "epoch": 0.41259640102827766, + "mean_token_accuracy": 0.751366138458252, + "num_tokens": 907822.0, + "step": 4173, + "train/ce_loss": 2.0375791791593656e-06 + }, + { + "epoch": 0.41259640102827766, + "step": 4173, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.41259640102827766, + "step": 4173, + "train/total_loss": 0.023437703028321266 + }, + { + "entropy": 9.877622604370117, + "epoch": 0.41269527387779315, + "mean_token_accuracy": 0.7553191781044006, + "num_tokens": 912435.0, + "step": 4174, + "train/ce_loss": 6.041376764187589e-06 + }, + { + "epoch": 0.41269527387779315, + "step": 4174, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.41269527387779315, + "step": 4174, + "train/total_loss": 0.03515685349702835 + }, + { + "entropy": 9.246767044067383, + "epoch": 0.4127941467273087, + "mean_token_accuracy": 0.6970803141593933, + "num_tokens": 917902.0, + "step": 4175, + "train/ce_loss": 0.69617760181427 + }, + { + "epoch": 0.4127941467273087, + "step": 4175, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.4127941467273087, + "step": 4175, + "train/total_loss": 0.12039901316165924 + }, + { + "entropy": 9.241673469543457, + "epoch": 0.4128930195768242, + "mean_token_accuracy": 0.7374005317687988, + "num_tokens": 923107.0, + "step": 4176, + "train/ce_loss": 0.7027705907821655 + }, + { + "epoch": 0.4128930195768242, + "step": 4176, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.4128930195768242, + "step": 4176, + "train/total_loss": 0.15230831503868103 + }, + { + "entropy": 9.122090339660645, + "epoch": 0.4129918924263397, + "mean_token_accuracy": 0.7830423712730408, + "num_tokens": 928396.0, + "step": 4177, + "train/ce_loss": 0.9528135657310486 + }, + { + "epoch": 0.4129918924263397, + "step": 4177, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.4129918924263397, + "step": 4177, + "train/total_loss": 0.14215636253356934 + }, + { + "entropy": 9.369904518127441, + "epoch": 0.41309076527585525, + "mean_token_accuracy": 0.8083735704421997, + "num_tokens": 933458.0, + "step": 4178, + "train/ce_loss": 0.718553900718689 + }, + { + "epoch": 0.41309076527585525, + "step": 4178, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.41309076527585525, + "step": 4178, + "train/total_loss": 0.16169914603233337 + }, + { + "entropy": 9.105664253234863, + "epoch": 0.4131896381253708, + "mean_token_accuracy": 0.7049180269241333, + "num_tokens": 938746.0, + "step": 4179, + "train/ce_loss": 1.1738409996032715 + }, + { + "epoch": 0.4131896381253708, + "step": 4179, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.4131896381253708, + "step": 4179, + "train/total_loss": 0.19160285592079163 + }, + { + "epoch": 0.4132885109748863, + "grad_norm": 0.8040392994880676, + "learning_rate": 8.969242941205559e-06, + "loss": 0.1405, + "step": 4180 + }, + { + "entropy": 8.808820724487305, + "epoch": 0.4132885109748863, + "mean_token_accuracy": 0.6976987719535828, + "num_tokens": 944186.0, + "step": 4180, + "train/ce_loss": 1.2142090797424316 + }, + { + "epoch": 0.4132885109748863, + "step": 4180, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4132885109748863, + "step": 4180, + "train/total_loss": 0.17610841989517212 + }, + { + "entropy": 8.927715301513672, + "epoch": 0.4133873838244018, + "mean_token_accuracy": 0.7618510127067566, + "num_tokens": 949529.0, + "step": 4181, + "train/ce_loss": 0.5040786266326904 + }, + { + "epoch": 0.4133873838244018, + "step": 4181, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.4133873838244018, + "step": 4181, + "train/total_loss": 0.07775161415338516 + }, + { + "entropy": 9.648799896240234, + "epoch": 0.41348625667391736, + "mean_token_accuracy": 0.7495256066322327, + "num_tokens": 954500.0, + "step": 4182, + "train/ce_loss": 0.7559933066368103 + }, + { + "epoch": 0.41348625667391736, + "step": 4182, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.41348625667391736, + "step": 4182, + "train/total_loss": 0.10294308513402939 + }, + { + "entropy": 9.496209144592285, + "epoch": 0.41358512952343285, + "mean_token_accuracy": 0.7256198525428772, + "num_tokens": 959543.0, + "step": 4183, + "train/ce_loss": 1.2499913282226771e-06 + }, + { + "epoch": 0.41358512952343285, + "step": 4183, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.41358512952343285, + "step": 4183, + "train/total_loss": 0.027343874797225 + }, + { + "entropy": 8.89802360534668, + "epoch": 0.4136840023729484, + "mean_token_accuracy": 0.7200461030006409, + "num_tokens": 964881.0, + "step": 4184, + "train/ce_loss": 0.7619472742080688 + }, + { + "epoch": 0.4136840023729484, + "step": 4184, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.4136840023729484, + "step": 4184, + "train/total_loss": 0.13478848338127136 + }, + { + "entropy": 9.495051383972168, + "epoch": 0.41378287522246393, + "mean_token_accuracy": 0.6853002309799194, + "num_tokens": 969783.0, + "step": 4185, + "train/ce_loss": 5.939029506407678e-06 + }, + { + "epoch": 0.41378287522246393, + "step": 4185, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.41378287522246393, + "step": 4185, + "train/total_loss": 0.07031309604644775 + }, + { + "entropy": 9.305354118347168, + "epoch": 0.4138817480719794, + "mean_token_accuracy": 0.7168758511543274, + "num_tokens": 974986.0, + "step": 4186, + "train/ce_loss": 0.5816423892974854 + }, + { + "epoch": 0.4138817480719794, + "step": 4186, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.4138817480719794, + "step": 4186, + "train/total_loss": 0.13628923892974854 + }, + { + "entropy": 9.138479232788086, + "epoch": 0.41398062092149496, + "mean_token_accuracy": 0.6998770236968994, + "num_tokens": 980283.0, + "step": 4187, + "train/ce_loss": 1.1875509023666382 + }, + { + "epoch": 0.41398062092149496, + "step": 4187, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.41398062092149496, + "step": 4187, + "train/total_loss": 0.14219260215759277 + }, + { + "entropy": 8.918331146240234, + "epoch": 0.4140794937710105, + "mean_token_accuracy": 0.7854785323143005, + "num_tokens": 985695.0, + "step": 4188, + "train/ce_loss": 0.6920308470726013 + }, + { + "epoch": 0.4140794937710105, + "step": 4188, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.4140794937710105, + "step": 4188, + "train/total_loss": 0.13170307874679565 + }, + { + "entropy": 9.225728988647461, + "epoch": 0.414178366620526, + "mean_token_accuracy": 0.7077131271362305, + "num_tokens": 990919.0, + "step": 4189, + "train/ce_loss": 1.3891641401642119e-06 + }, + { + "epoch": 0.414178366620526, + "step": 4189, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.414178366620526, + "step": 4189, + "train/total_loss": 0.05468763783574104 + }, + { + "entropy": 9.02457332611084, + "epoch": 0.4142772394700415, + "mean_token_accuracy": 0.762566864490509, + "num_tokens": 996528.0, + "step": 4190, + "train/ce_loss": 0.4660285413265228 + }, + { + "epoch": 0.4142772394700415, + "step": 4190, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.4142772394700415, + "step": 4190, + "train/total_loss": 0.12472786009311676 + }, + { + "entropy": 8.8408784866333, + "epoch": 0.41437611231955707, + "mean_token_accuracy": 0.7253599166870117, + "num_tokens": 1001897.0, + "step": 4191, + "train/ce_loss": 0.8709747791290283 + }, + { + "epoch": 0.41437611231955707, + "step": 4191, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.41437611231955707, + "step": 4191, + "train/total_loss": 0.15740998089313507 + }, + { + "entropy": 9.4285888671875, + "epoch": 0.41447498516907255, + "mean_token_accuracy": 0.773809552192688, + "num_tokens": 1007005.0, + "step": 4192, + "train/ce_loss": 0.7056515216827393 + }, + { + "epoch": 0.41447498516907255, + "step": 4192, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.41447498516907255, + "step": 4192, + "train/total_loss": 0.16431516408920288 + }, + { + "entropy": 9.37752628326416, + "epoch": 0.4145738580185881, + "mean_token_accuracy": 0.7282127141952515, + "num_tokens": 1012202.0, + "step": 4193, + "train/ce_loss": 0.6693522930145264 + }, + { + "epoch": 0.4145738580185881, + "step": 4193, + "train/sim_loss": 0.15625 + }, + { + "epoch": 0.4145738580185881, + "step": 4193, + "train/total_loss": 0.2231852412223816 + }, + { + "entropy": 9.460760116577148, + "epoch": 0.41467273086810363, + "mean_token_accuracy": 0.7652892470359802, + "num_tokens": 1017277.0, + "step": 4194, + "train/ce_loss": 1.9088233709335327 + }, + { + "epoch": 0.41467273086810363, + "step": 4194, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.41467273086810363, + "step": 4194, + "train/total_loss": 0.3197885751724243 + }, + { + "entropy": 9.202156066894531, + "epoch": 0.4147716037176191, + "mean_token_accuracy": 0.7761836647987366, + "num_tokens": 1022458.0, + "step": 4195, + "train/ce_loss": 0.7725761532783508 + }, + { + "epoch": 0.4147716037176191, + "step": 4195, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.4147716037176191, + "step": 4195, + "train/total_loss": 0.10069511830806732 + }, + { + "entropy": 9.696306228637695, + "epoch": 0.41487047656713466, + "mean_token_accuracy": 0.789383590221405, + "num_tokens": 1027490.0, + "step": 4196, + "train/ce_loss": 3.918094080290757e-06 + }, + { + "epoch": 0.41487047656713466, + "step": 4196, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.41487047656713466, + "step": 4196, + "train/total_loss": 0.05859414115548134 + }, + { + "entropy": 8.785720825195312, + "epoch": 0.4149693494166502, + "mean_token_accuracy": 0.7217742204666138, + "num_tokens": 1033026.0, + "step": 4197, + "train/ce_loss": 1.0272657871246338 + }, + { + "epoch": 0.4149693494166502, + "step": 4197, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.4149693494166502, + "step": 4197, + "train/total_loss": 0.14569532871246338 + }, + { + "entropy": 9.278945922851562, + "epoch": 0.4150682222661657, + "mean_token_accuracy": 0.7827869057655334, + "num_tokens": 1038239.0, + "step": 4198, + "train/ce_loss": 0.6755519509315491 + }, + { + "epoch": 0.4150682222661657, + "step": 4198, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.4150682222661657, + "step": 4198, + "train/total_loss": 0.09099269658327103 + }, + { + "entropy": 9.39266586303711, + "epoch": 0.41516709511568123, + "mean_token_accuracy": 0.7037037014961243, + "num_tokens": 1043308.0, + "step": 4199, + "train/ce_loss": 2.017041879298631e-06 + }, + { + "epoch": 0.41516709511568123, + "step": 4199, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.41516709511568123, + "step": 4199, + "train/total_loss": 0.06250020116567612 + }, + { + "epoch": 0.41526596796519677, + "grad_norm": 0.8774731755256653, + "learning_rate": 8.96429807644761e-06, + "loss": 0.147, + "step": 4200 + }, + { + "entropy": 8.4367094039917, + "epoch": 0.41526596796519677, + "mean_token_accuracy": 0.7406989932060242, + "num_tokens": 1048683.0, + "step": 4200, + "train/ce_loss": 1.0317161083221436 + }, + { + "epoch": 0.41526596796519677, + "step": 4200, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.41526596796519677, + "step": 4200, + "train/total_loss": 0.18129661679267883 + }, + { + "entropy": 9.368021011352539, + "epoch": 0.41536484081471226, + "mean_token_accuracy": 0.699999988079071, + "num_tokens": 1053755.0, + "step": 4201, + "train/ce_loss": 2.232920451206155e-06 + }, + { + "epoch": 0.41536484081471226, + "step": 4201, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.41536484081471226, + "step": 4201, + "train/total_loss": 0.03515647351741791 + }, + { + "entropy": 10.30126953125, + "epoch": 0.4154637136642278, + "mean_token_accuracy": 0.8042704463005066, + "num_tokens": 1058581.0, + "step": 4202, + "train/ce_loss": 1.648161768913269 + }, + { + "epoch": 0.4154637136642278, + "step": 4202, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.4154637136642278, + "step": 4202, + "train/total_loss": 0.2741912007331848 + }, + { + "entropy": 8.870027542114258, + "epoch": 0.41556258651374334, + "mean_token_accuracy": 0.7354685664176941, + "num_tokens": 1063856.0, + "step": 4203, + "train/ce_loss": 1.175770878791809 + }, + { + "epoch": 0.41556258651374334, + "step": 4203, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.41556258651374334, + "step": 4203, + "train/total_loss": 0.15273334085941315 + }, + { + "entropy": 9.552102088928223, + "epoch": 0.4156614593632588, + "mean_token_accuracy": 0.7178502678871155, + "num_tokens": 1068804.0, + "step": 4204, + "train/ce_loss": 1.5738786458969116 + }, + { + "epoch": 0.4156614593632588, + "step": 4204, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.4156614593632588, + "step": 4204, + "train/total_loss": 0.2628566026687622 + }, + { + "entropy": 9.039390563964844, + "epoch": 0.41576033221277436, + "mean_token_accuracy": 0.7024221420288086, + "num_tokens": 1074141.0, + "step": 4205, + "train/ce_loss": 0.701492190361023 + }, + { + "epoch": 0.41576033221277436, + "step": 4205, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.41576033221277436, + "step": 4205, + "train/total_loss": 0.10530547052621841 + }, + { + "entropy": 9.621667861938477, + "epoch": 0.4158592050622899, + "mean_token_accuracy": 0.7717041969299316, + "num_tokens": 1079197.0, + "step": 4206, + "train/ce_loss": 0.9569666981697083 + }, + { + "epoch": 0.4158592050622899, + "step": 4206, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.4158592050622899, + "step": 4206, + "train/total_loss": 0.14647792279720306 + }, + { + "entropy": 9.241792678833008, + "epoch": 0.4159580779118054, + "mean_token_accuracy": 0.6965811848640442, + "num_tokens": 1084368.0, + "step": 4207, + "train/ce_loss": 1.1052935123443604 + }, + { + "epoch": 0.4159580779118054, + "step": 4207, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.4159580779118054, + "step": 4207, + "train/total_loss": 0.196466863155365 + }, + { + "entropy": 8.973245620727539, + "epoch": 0.41605695076132093, + "mean_token_accuracy": 0.6955017447471619, + "num_tokens": 1089641.0, + "step": 4208, + "train/ce_loss": 0.8473589420318604 + }, + { + "epoch": 0.41605695076132093, + "step": 4208, + "train/sim_loss": 0.16015625 + }, + { + "epoch": 0.41605695076132093, + "step": 4208, + "train/total_loss": 0.2448921501636505 + }, + { + "entropy": 8.825902938842773, + "epoch": 0.4161558236108365, + "mean_token_accuracy": 0.7387387156486511, + "num_tokens": 1095097.0, + "step": 4209, + "train/ce_loss": 0.43971484899520874 + }, + { + "epoch": 0.4161558236108365, + "step": 4209, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4161558236108365, + "step": 4209, + "train/total_loss": 0.098658986389637 + }, + { + "entropy": 9.571261405944824, + "epoch": 0.416254696460352, + "mean_token_accuracy": 0.675000011920929, + "num_tokens": 1100225.0, + "step": 4210, + "train/ce_loss": 1.4352977275848389 + }, + { + "epoch": 0.416254696460352, + "step": 4210, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.416254696460352, + "step": 4210, + "train/total_loss": 0.2411860227584839 + }, + { + "entropy": 8.91400146484375, + "epoch": 0.4163535693098675, + "mean_token_accuracy": 0.7109634280204773, + "num_tokens": 1105585.0, + "step": 4211, + "train/ce_loss": 0.828546404838562 + }, + { + "epoch": 0.4163535693098675, + "step": 4211, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.4163535693098675, + "step": 4211, + "train/total_loss": 0.12972964346408844 + }, + { + "entropy": 9.401948928833008, + "epoch": 0.41645244215938304, + "mean_token_accuracy": 0.7531055808067322, + "num_tokens": 1110588.0, + "step": 4212, + "train/ce_loss": 1.419521689414978 + }, + { + "epoch": 0.41645244215938304, + "step": 4212, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.41645244215938304, + "step": 4212, + "train/total_loss": 0.23570217192173004 + }, + { + "entropy": 9.189818382263184, + "epoch": 0.4165513150088986, + "mean_token_accuracy": 0.7932816743850708, + "num_tokens": 1115833.0, + "step": 4213, + "train/ce_loss": 0.5189548134803772 + }, + { + "epoch": 0.4165513150088986, + "step": 4213, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.4165513150088986, + "step": 4213, + "train/total_loss": 0.07533298432826996 + }, + { + "entropy": 8.877925872802734, + "epoch": 0.41665018785841407, + "mean_token_accuracy": 0.703398585319519, + "num_tokens": 1121296.0, + "step": 4214, + "train/ce_loss": 0.8614786267280579 + }, + { + "epoch": 0.41665018785841407, + "step": 4214, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.41665018785841407, + "step": 4214, + "train/total_loss": 0.18771037459373474 + }, + { + "entropy": 9.34935474395752, + "epoch": 0.4167490607079296, + "mean_token_accuracy": 0.7023643851280212, + "num_tokens": 1126472.0, + "step": 4215, + "train/ce_loss": 0.5411409139633179 + }, + { + "epoch": 0.4167490607079296, + "step": 4215, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.4167490607079296, + "step": 4215, + "train/total_loss": 0.11661408841609955 + }, + { + "entropy": 9.215014457702637, + "epoch": 0.41684793355744515, + "mean_token_accuracy": 0.6658536791801453, + "num_tokens": 1131774.0, + "step": 4216, + "train/ce_loss": 1.4825798273086548 + }, + { + "epoch": 0.41684793355744515, + "step": 4216, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.41684793355744515, + "step": 4216, + "train/total_loss": 0.21857048571109772 + }, + { + "entropy": 9.464516639709473, + "epoch": 0.41694680640696064, + "mean_token_accuracy": 0.8243451714515686, + "num_tokens": 1136869.0, + "step": 4217, + "train/ce_loss": 1.5847562053750153e-06 + }, + { + "epoch": 0.41694680640696064, + "step": 4217, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.41694680640696064, + "step": 4217, + "train/total_loss": 0.027343908324837685 + }, + { + "entropy": 9.304022789001465, + "epoch": 0.4170456792564762, + "mean_token_accuracy": 0.747787594795227, + "num_tokens": 1142053.0, + "step": 4218, + "train/ce_loss": 1.9297674498375272e-06 + }, + { + "epoch": 0.4170456792564762, + "step": 4218, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.4170456792564762, + "step": 4218, + "train/total_loss": 0.05078144371509552 + }, + { + "entropy": 9.988414764404297, + "epoch": 0.4171445521059917, + "mean_token_accuracy": 0.7318681478500366, + "num_tokens": 1146940.0, + "step": 4219, + "train/ce_loss": 5.972405233478639e-06 + }, + { + "epoch": 0.4171445521059917, + "step": 4219, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.4171445521059917, + "step": 4219, + "train/total_loss": 0.046875596046447754 + }, + { + "epoch": 0.4172434249555072, + "grad_norm": 0.9290772080421448, + "learning_rate": 8.95935321168966e-06, + "loss": 0.1533, + "step": 4220 + }, + { + "entropy": 8.99875259399414, + "epoch": 0.4172434249555072, + "mean_token_accuracy": 0.7621302008628845, + "num_tokens": 1152240.0, + "step": 4220, + "train/ce_loss": 0.849389910697937 + }, + { + "epoch": 0.4172434249555072, + "step": 4220, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.4172434249555072, + "step": 4220, + "train/total_loss": 0.10837649554014206 + }, + { + "entropy": 9.093700408935547, + "epoch": 0.41734229780502274, + "mean_token_accuracy": 0.7468982338905334, + "num_tokens": 1157422.0, + "step": 4221, + "train/ce_loss": 0.7816839218139648 + }, + { + "epoch": 0.41734229780502274, + "step": 4221, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.41734229780502274, + "step": 4221, + "train/total_loss": 0.14066839218139648 + }, + { + "entropy": 9.681419372558594, + "epoch": 0.4174411706545383, + "mean_token_accuracy": 0.7091836929321289, + "num_tokens": 1162433.0, + "step": 4222, + "train/ce_loss": 1.0733975172042847 + }, + { + "epoch": 0.4174411706545383, + "step": 4222, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.4174411706545383, + "step": 4222, + "train/total_loss": 0.1464022547006607 + }, + { + "entropy": 8.821057319641113, + "epoch": 0.41754004350405377, + "mean_token_accuracy": 0.7184684872627258, + "num_tokens": 1167752.0, + "step": 4223, + "train/ce_loss": 0.6913005113601685 + }, + { + "epoch": 0.41754004350405377, + "step": 4223, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.41754004350405377, + "step": 4223, + "train/total_loss": 0.1042863056063652 + }, + { + "entropy": 9.29666519165039, + "epoch": 0.4176389163535693, + "mean_token_accuracy": 0.7994186282157898, + "num_tokens": 1172913.0, + "step": 4224, + "train/ce_loss": 0.7657065391540527 + }, + { + "epoch": 0.4176389163535693, + "step": 4224, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.4176389163535693, + "step": 4224, + "train/total_loss": 0.11172690242528915 + }, + { + "entropy": 9.670294761657715, + "epoch": 0.41773778920308485, + "mean_token_accuracy": 0.7474226951599121, + "num_tokens": 1177937.0, + "step": 4225, + "train/ce_loss": 0.6599021553993225 + }, + { + "epoch": 0.41773778920308485, + "step": 4225, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.41773778920308485, + "step": 4225, + "train/total_loss": 0.12458396703004837 + }, + { + "entropy": 8.872920989990234, + "epoch": 0.41783666205260034, + "mean_token_accuracy": 0.7144653797149658, + "num_tokens": 1183207.0, + "step": 4226, + "train/ce_loss": 1.1690150499343872 + }, + { + "epoch": 0.41783666205260034, + "step": 4226, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.41783666205260034, + "step": 4226, + "train/total_loss": 0.18330776691436768 + }, + { + "entropy": 8.915066719055176, + "epoch": 0.4179355349021159, + "mean_token_accuracy": 0.7074999809265137, + "num_tokens": 1188463.0, + "step": 4227, + "train/ce_loss": 0.9992348551750183 + }, + { + "epoch": 0.4179355349021159, + "step": 4227, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.4179355349021159, + "step": 4227, + "train/total_loss": 0.1389859914779663 + }, + { + "entropy": 10.26605224609375, + "epoch": 0.4180344077516314, + "mean_token_accuracy": 0.6666666865348816, + "num_tokens": 1193198.0, + "step": 4228, + "train/ce_loss": 1.8083083629608154 + }, + { + "epoch": 0.4180344077516314, + "step": 4228, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.4180344077516314, + "step": 4228, + "train/total_loss": 0.23942458629608154 + }, + { + "entropy": 9.408763885498047, + "epoch": 0.4181332806011469, + "mean_token_accuracy": 0.7004950642585754, + "num_tokens": 1198058.0, + "step": 4229, + "train/ce_loss": 2.6393215656280518 + }, + { + "epoch": 0.4181332806011469, + "step": 4229, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.4181332806011469, + "step": 4229, + "train/total_loss": 0.35768216848373413 + }, + { + "entropy": 9.29364013671875, + "epoch": 0.41823215345066245, + "mean_token_accuracy": 0.7573632597923279, + "num_tokens": 1203223.0, + "step": 4230, + "train/ce_loss": 1.110211730003357 + }, + { + "epoch": 0.41823215345066245, + "step": 4230, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.41823215345066245, + "step": 4230, + "train/total_loss": 0.13836492598056793 + }, + { + "entropy": 9.558351516723633, + "epoch": 0.418331026300178, + "mean_token_accuracy": 0.7590163946151733, + "num_tokens": 1208246.0, + "step": 4231, + "train/ce_loss": 1.1962000131607056 + }, + { + "epoch": 0.418331026300178, + "step": 4231, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.418331026300178, + "step": 4231, + "train/total_loss": 0.24852624535560608 + }, + { + "entropy": 9.735548973083496, + "epoch": 0.4184298991496935, + "mean_token_accuracy": 0.73046875, + "num_tokens": 1213179.0, + "step": 4232, + "train/ce_loss": 0.875798761844635 + }, + { + "epoch": 0.4184298991496935, + "step": 4232, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.4184298991496935, + "step": 4232, + "train/total_loss": 0.2047673761844635 + }, + { + "entropy": 9.036125183105469, + "epoch": 0.418528771999209, + "mean_token_accuracy": 0.7426210045814514, + "num_tokens": 1218482.0, + "step": 4233, + "train/ce_loss": 0.6533321142196655 + }, + { + "epoch": 0.418528771999209, + "step": 4233, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.418528771999209, + "step": 4233, + "train/total_loss": 0.10439570993185043 + }, + { + "entropy": 9.350015640258789, + "epoch": 0.41862764484872456, + "mean_token_accuracy": 0.824999988079071, + "num_tokens": 1223655.0, + "step": 4234, + "train/ce_loss": 0.5197864174842834 + }, + { + "epoch": 0.41862764484872456, + "step": 4234, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.41862764484872456, + "step": 4234, + "train/total_loss": 0.09494739770889282 + }, + { + "entropy": 8.948705673217773, + "epoch": 0.41872651769824004, + "mean_token_accuracy": 0.7506426572799683, + "num_tokens": 1228878.0, + "step": 4235, + "train/ce_loss": 0.4958691895008087 + }, + { + "epoch": 0.41872651769824004, + "step": 4235, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.41872651769824004, + "step": 4235, + "train/total_loss": 0.08083692193031311 + }, + { + "entropy": 9.025856018066406, + "epoch": 0.4188253905477556, + "mean_token_accuracy": 0.6833713054656982, + "num_tokens": 1234207.0, + "step": 4236, + "train/ce_loss": 1.5610315799713135 + }, + { + "epoch": 0.4188253905477556, + "step": 4236, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.4188253905477556, + "step": 4236, + "train/total_loss": 0.23422816395759583 + }, + { + "entropy": 8.754049301147461, + "epoch": 0.4189242633972711, + "mean_token_accuracy": 0.7425025701522827, + "num_tokens": 1239659.0, + "step": 4237, + "train/ce_loss": 0.5289074182510376 + }, + { + "epoch": 0.4189242633972711, + "step": 4237, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.4189242633972711, + "step": 4237, + "train/total_loss": 0.08414074778556824 + }, + { + "entropy": 9.412731170654297, + "epoch": 0.4190231362467866, + "mean_token_accuracy": 0.7376543283462524, + "num_tokens": 1244737.0, + "step": 4238, + "train/ce_loss": 1.6346156597137451 + }, + { + "epoch": 0.4190231362467866, + "step": 4238, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.4190231362467866, + "step": 4238, + "train/total_loss": 0.2337740659713745 + }, + { + "entropy": 9.039575576782227, + "epoch": 0.41912200909630215, + "mean_token_accuracy": 0.7407878041267395, + "num_tokens": 1250042.0, + "step": 4239, + "train/ce_loss": 0.9413285255432129 + }, + { + "epoch": 0.41912200909630215, + "step": 4239, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.41912200909630215, + "step": 4239, + "train/total_loss": 0.18397660553455353 + }, + { + "epoch": 0.4192208819458177, + "grad_norm": 0.8705694079399109, + "learning_rate": 8.954408346931712e-06, + "loss": 0.144, + "step": 4240 + }, + { + "entropy": 9.725851058959961, + "epoch": 0.4192208819458177, + "mean_token_accuracy": 0.7514340281486511, + "num_tokens": 1254948.0, + "step": 4240, + "train/ce_loss": 1.937035083770752 + }, + { + "epoch": 0.4192208819458177, + "step": 4240, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.4192208819458177, + "step": 4240, + "train/total_loss": 0.2522972822189331 + }, + { + "entropy": 9.782069206237793, + "epoch": 0.4193197547953332, + "mean_token_accuracy": 0.6924999952316284, + "num_tokens": 1259763.0, + "step": 4241, + "train/ce_loss": 1.8429124355316162 + }, + { + "epoch": 0.4193197547953332, + "step": 4241, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.4193197547953332, + "step": 4241, + "train/total_loss": 0.2975724935531616 + }, + { + "entropy": 9.174646377563477, + "epoch": 0.4194186276448487, + "mean_token_accuracy": 0.6773761510848999, + "num_tokens": 1264942.0, + "step": 4242, + "train/ce_loss": 1.1125339269638062 + }, + { + "epoch": 0.4194186276448487, + "step": 4242, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.4194186276448487, + "step": 4242, + "train/total_loss": 0.19719089567661285 + }, + { + "entropy": 9.241769790649414, + "epoch": 0.41951750049436426, + "mean_token_accuracy": 0.774193525314331, + "num_tokens": 1270001.0, + "step": 4243, + "train/ce_loss": 1.6668464013491757e-06 + }, + { + "epoch": 0.41951750049436426, + "step": 4243, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.41951750049436426, + "step": 4243, + "train/total_loss": 0.06250016391277313 + }, + { + "entropy": 8.818593978881836, + "epoch": 0.41961637334387974, + "mean_token_accuracy": 0.7326139211654663, + "num_tokens": 1275281.0, + "step": 4244, + "train/ce_loss": 0.7373816967010498 + }, + { + "epoch": 0.41961637334387974, + "step": 4244, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.41961637334387974, + "step": 4244, + "train/total_loss": 0.17139442265033722 + }, + { + "entropy": 8.850120544433594, + "epoch": 0.4197152461933953, + "mean_token_accuracy": 0.7569988965988159, + "num_tokens": 1280574.0, + "step": 4245, + "train/ce_loss": 0.9278842806816101 + }, + { + "epoch": 0.4197152461933953, + "step": 4245, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.4197152461933953, + "step": 4245, + "train/total_loss": 0.135757178068161 + }, + { + "entropy": 8.878185272216797, + "epoch": 0.4198141190429108, + "mean_token_accuracy": 0.7516411542892456, + "num_tokens": 1285915.0, + "step": 4246, + "train/ce_loss": 0.8967689871788025 + }, + { + "epoch": 0.4198141190429108, + "step": 4246, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.4198141190429108, + "step": 4246, + "train/total_loss": 0.1521769016981125 + }, + { + "entropy": 8.870865821838379, + "epoch": 0.4199129918924263, + "mean_token_accuracy": 0.6959064602851868, + "num_tokens": 1291257.0, + "step": 4247, + "train/ce_loss": 0.5926907658576965 + }, + { + "epoch": 0.4199129918924263, + "step": 4247, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.4199129918924263, + "step": 4247, + "train/total_loss": 0.10223782807588577 + }, + { + "entropy": 9.483327865600586, + "epoch": 0.42001186474194185, + "mean_token_accuracy": 0.8003802299499512, + "num_tokens": 1296220.0, + "step": 4248, + "train/ce_loss": 0.8804767727851868 + }, + { + "epoch": 0.42001186474194185, + "step": 4248, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.42001186474194185, + "step": 4248, + "train/total_loss": 0.11148517578840256 + }, + { + "entropy": 9.329161643981934, + "epoch": 0.4201107375914574, + "mean_token_accuracy": 0.7123551964759827, + "num_tokens": 1301156.0, + "step": 4249, + "train/ce_loss": 1.829318642616272 + }, + { + "epoch": 0.4201107375914574, + "step": 4249, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.4201107375914574, + "step": 4249, + "train/total_loss": 0.24152562022209167 + }, + { + "entropy": 9.395343780517578, + "epoch": 0.42020961044097294, + "mean_token_accuracy": 0.7155025601387024, + "num_tokens": 1306213.0, + "step": 4250, + "train/ce_loss": 1.8414853811264038 + }, + { + "epoch": 0.42020961044097294, + "step": 4250, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.42020961044097294, + "step": 4250, + "train/total_loss": 0.25836730003356934 + }, + { + "entropy": 9.611597061157227, + "epoch": 0.4203084832904884, + "mean_token_accuracy": 0.8132635354995728, + "num_tokens": 1311164.0, + "step": 4251, + "train/ce_loss": 0.9971811175346375 + }, + { + "epoch": 0.4203084832904884, + "step": 4251, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.4203084832904884, + "step": 4251, + "train/total_loss": 0.1153431162238121 + }, + { + "entropy": 9.493104934692383, + "epoch": 0.42040735614000396, + "mean_token_accuracy": 0.7350000143051147, + "num_tokens": 1316167.0, + "step": 4252, + "train/ce_loss": 0.7681896686553955 + }, + { + "epoch": 0.42040735614000396, + "step": 4252, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.42040735614000396, + "step": 4252, + "train/total_loss": 0.18228772282600403 + }, + { + "entropy": 9.07140827178955, + "epoch": 0.4205062289895195, + "mean_token_accuracy": 0.7261641025543213, + "num_tokens": 1321550.0, + "step": 4253, + "train/ce_loss": 0.9427207112312317 + }, + { + "epoch": 0.4205062289895195, + "step": 4253, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.4205062289895195, + "step": 4253, + "train/total_loss": 0.14114707708358765 + }, + { + "entropy": 8.669411659240723, + "epoch": 0.420605101839035, + "mean_token_accuracy": 0.7895287871360779, + "num_tokens": 1326977.0, + "step": 4254, + "train/ce_loss": 0.7670964002609253 + }, + { + "epoch": 0.420605101839035, + "step": 4254, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.420605101839035, + "step": 4254, + "train/total_loss": 0.10405339300632477 + }, + { + "entropy": 9.015830993652344, + "epoch": 0.42070397468855053, + "mean_token_accuracy": 0.7553443908691406, + "num_tokens": 1332264.0, + "step": 4255, + "train/ce_loss": 0.7133545279502869 + }, + { + "epoch": 0.42070397468855053, + "step": 4255, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.42070397468855053, + "step": 4255, + "train/total_loss": 0.11039795726537704 + }, + { + "entropy": 9.15105152130127, + "epoch": 0.42080284753806607, + "mean_token_accuracy": 0.7324749827384949, + "num_tokens": 1337405.0, + "step": 4256, + "train/ce_loss": 1.7517278365630773e-06 + }, + { + "epoch": 0.42080284753806607, + "step": 4256, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.42080284753806607, + "step": 4256, + "train/total_loss": 0.05468767508864403 + }, + { + "entropy": 9.284518241882324, + "epoch": 0.42090172038758156, + "mean_token_accuracy": 0.717783510684967, + "num_tokens": 1342615.0, + "step": 4257, + "train/ce_loss": 0.5699039101600647 + }, + { + "epoch": 0.42090172038758156, + "step": 4257, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.42090172038758156, + "step": 4257, + "train/total_loss": 0.08433414250612259 + }, + { + "entropy": 8.994129180908203, + "epoch": 0.4210005932370971, + "mean_token_accuracy": 0.7823129296302795, + "num_tokens": 1347959.0, + "step": 4258, + "train/ce_loss": 0.3339727818965912 + }, + { + "epoch": 0.4210005932370971, + "step": 4258, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.4210005932370971, + "step": 4258, + "train/total_loss": 0.15449103713035583 + }, + { + "entropy": 9.126960754394531, + "epoch": 0.42109946608661264, + "mean_token_accuracy": 0.7601476311683655, + "num_tokens": 1353254.0, + "step": 4259, + "train/ce_loss": 0.7362488508224487 + }, + { + "epoch": 0.42109946608661264, + "step": 4259, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.42109946608661264, + "step": 4259, + "train/total_loss": 0.1361248791217804 + }, + { + "epoch": 0.4211983389361281, + "grad_norm": 0.6947392225265503, + "learning_rate": 8.949463482173763e-06, + "loss": 0.1431, + "step": 4260 + }, + { + "entropy": 9.342792510986328, + "epoch": 0.4211983389361281, + "mean_token_accuracy": 0.82343989610672, + "num_tokens": 1358361.0, + "step": 4260, + "train/ce_loss": 0.482095330953598 + }, + { + "epoch": 0.4211983389361281, + "step": 4260, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.4211983389361281, + "step": 4260, + "train/total_loss": 0.1263345330953598 + }, + { + "entropy": 9.324569702148438, + "epoch": 0.42129721178564367, + "mean_token_accuracy": 0.7637231349945068, + "num_tokens": 1363211.0, + "step": 4261, + "train/ce_loss": 1.7761139869689941 + }, + { + "epoch": 0.42129721178564367, + "step": 4261, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.42129721178564367, + "step": 4261, + "train/total_loss": 0.28698641061782837 + }, + { + "entropy": 9.17307186126709, + "epoch": 0.4213960846351592, + "mean_token_accuracy": 0.7001338601112366, + "num_tokens": 1368428.0, + "step": 4262, + "train/ce_loss": 0.8786683082580566 + }, + { + "epoch": 0.4213960846351592, + "step": 4262, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4213960846351592, + "step": 4262, + "train/total_loss": 0.14255434274673462 + }, + { + "entropy": 9.052959442138672, + "epoch": 0.4214949574846747, + "mean_token_accuracy": 0.7310426831245422, + "num_tokens": 1373711.0, + "step": 4263, + "train/ce_loss": 0.8060519695281982 + }, + { + "epoch": 0.4214949574846747, + "step": 4263, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4214949574846747, + "step": 4263, + "train/total_loss": 0.13529270887374878 + }, + { + "entropy": 9.800721168518066, + "epoch": 0.42159383033419023, + "mean_token_accuracy": 0.7330595254898071, + "num_tokens": 1378592.0, + "step": 4264, + "train/ce_loss": 1.6116140386657207e-06 + }, + { + "epoch": 0.42159383033419023, + "step": 4264, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.42159383033419023, + "step": 4264, + "train/total_loss": 0.023437662050127983 + }, + { + "entropy": 8.876489639282227, + "epoch": 0.4216927031837058, + "mean_token_accuracy": 0.7310606241226196, + "num_tokens": 1383858.0, + "step": 4265, + "train/ce_loss": 0.5489982962608337 + }, + { + "epoch": 0.4216927031837058, + "step": 4265, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.4216927031837058, + "step": 4265, + "train/total_loss": 0.09396232664585114 + }, + { + "entropy": 9.334985733032227, + "epoch": 0.42179157603322126, + "mean_token_accuracy": 0.746347963809967, + "num_tokens": 1389034.0, + "step": 4266, + "train/ce_loss": 1.1328575055813417e-06 + }, + { + "epoch": 0.42179157603322126, + "step": 4266, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.42179157603322126, + "step": 4266, + "train/total_loss": 0.023437613621354103 + }, + { + "entropy": 9.425538063049316, + "epoch": 0.4218904488827368, + "mean_token_accuracy": 0.6842105388641357, + "num_tokens": 1394115.0, + "step": 4267, + "train/ce_loss": 2.066435172309866e-06 + }, + { + "epoch": 0.4218904488827368, + "step": 4267, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.4218904488827368, + "step": 4267, + "train/total_loss": 0.015625206753611565 + }, + { + "entropy": 9.141765594482422, + "epoch": 0.42198932173225234, + "mean_token_accuracy": 0.7256410121917725, + "num_tokens": 1399353.0, + "step": 4268, + "train/ce_loss": 0.8137962222099304 + }, + { + "epoch": 0.42198932173225234, + "step": 4268, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.42198932173225234, + "step": 4268, + "train/total_loss": 0.15950462222099304 + }, + { + "entropy": 9.34766960144043, + "epoch": 0.42208819458176783, + "mean_token_accuracy": 0.7127329111099243, + "num_tokens": 1404438.0, + "step": 4269, + "train/ce_loss": 1.6777497648945427e-06 + }, + { + "epoch": 0.42208819458176783, + "step": 4269, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.42208819458176783, + "step": 4269, + "train/total_loss": 0.01953141763806343 + }, + { + "entropy": 9.749281883239746, + "epoch": 0.42218706743128337, + "mean_token_accuracy": 0.7399576902389526, + "num_tokens": 1409340.0, + "step": 4270, + "train/ce_loss": 0.8929030299186707 + }, + { + "epoch": 0.42218706743128337, + "step": 4270, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.42218706743128337, + "step": 4270, + "train/total_loss": 0.1361653059720993 + }, + { + "entropy": 9.200545310974121, + "epoch": 0.4222859402807989, + "mean_token_accuracy": 0.7155067324638367, + "num_tokens": 1414669.0, + "step": 4271, + "train/ce_loss": 0.6485669016838074 + }, + { + "epoch": 0.4222859402807989, + "step": 4271, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.4222859402807989, + "step": 4271, + "train/total_loss": 0.11173169314861298 + }, + { + "entropy": 8.720863342285156, + "epoch": 0.4223848131303144, + "mean_token_accuracy": 0.6928879022598267, + "num_tokens": 1420107.0, + "step": 4272, + "train/ce_loss": 1.576992154121399 + }, + { + "epoch": 0.4223848131303144, + "step": 4272, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.4223848131303144, + "step": 4272, + "train/total_loss": 0.27488672733306885 + }, + { + "entropy": 10.158475875854492, + "epoch": 0.42248368597982994, + "mean_token_accuracy": 0.7209302186965942, + "num_tokens": 1424809.0, + "step": 4273, + "train/ce_loss": 3.0050621262489585e-06 + }, + { + "epoch": 0.42248368597982994, + "step": 4273, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.42248368597982994, + "step": 4273, + "train/total_loss": 0.042969051748514175 + }, + { + "entropy": 9.288434982299805, + "epoch": 0.4225825588293455, + "mean_token_accuracy": 0.7050359845161438, + "num_tokens": 1429909.0, + "step": 4274, + "train/ce_loss": 0.8266807794570923 + }, + { + "epoch": 0.4225825588293455, + "step": 4274, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.4225825588293455, + "step": 4274, + "train/total_loss": 0.19985558092594147 + }, + { + "entropy": 9.331501007080078, + "epoch": 0.42268143167886096, + "mean_token_accuracy": 0.7471751570701599, + "num_tokens": 1435102.0, + "step": 4275, + "train/ce_loss": 1.366287112236023 + }, + { + "epoch": 0.42268143167886096, + "step": 4275, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.42268143167886096, + "step": 4275, + "train/total_loss": 0.21475371718406677 + }, + { + "entropy": 9.115610122680664, + "epoch": 0.4227803045283765, + "mean_token_accuracy": 0.6718562841415405, + "num_tokens": 1440426.0, + "step": 4276, + "train/ce_loss": 0.6482602953910828 + }, + { + "epoch": 0.4227803045283765, + "step": 4276, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.4227803045283765, + "step": 4276, + "train/total_loss": 0.13513854146003723 + }, + { + "entropy": 8.894174575805664, + "epoch": 0.42287917737789205, + "mean_token_accuracy": 0.710010290145874, + "num_tokens": 1445880.0, + "step": 4277, + "train/ce_loss": 0.5530392527580261 + }, + { + "epoch": 0.42287917737789205, + "step": 4277, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.42287917737789205, + "step": 4277, + "train/total_loss": 0.1412414312362671 + }, + { + "entropy": 8.725154876708984, + "epoch": 0.42297805022740753, + "mean_token_accuracy": 0.75208580493927, + "num_tokens": 1451202.0, + "step": 4278, + "train/ce_loss": 0.7852307558059692 + }, + { + "epoch": 0.42297805022740753, + "step": 4278, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.42297805022740753, + "step": 4278, + "train/total_loss": 0.10586682707071304 + }, + { + "entropy": 8.779287338256836, + "epoch": 0.4230769230769231, + "mean_token_accuracy": 0.7813440561294556, + "num_tokens": 1456646.0, + "step": 4279, + "train/ce_loss": 0.8656463623046875 + }, + { + "epoch": 0.4230769230769231, + "step": 4279, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.4230769230769231, + "step": 4279, + "train/total_loss": 0.11390838772058487 + }, + { + "epoch": 0.4231757959264386, + "grad_norm": 0.6826988458633423, + "learning_rate": 8.944518617415815e-06, + "loss": 0.1495, + "step": 4280 + }, + { + "entropy": 8.758885383605957, + "epoch": 0.4231757959264386, + "mean_token_accuracy": 0.7528089880943298, + "num_tokens": 1462266.0, + "step": 4280, + "train/ce_loss": 0.8109167814254761 + }, + { + "epoch": 0.4231757959264386, + "step": 4280, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.4231757959264386, + "step": 4280, + "train/total_loss": 0.18265417218208313 + }, + { + "entropy": 9.55448055267334, + "epoch": 0.4232746687759541, + "mean_token_accuracy": 0.7217805981636047, + "num_tokens": 1467293.0, + "step": 4281, + "train/ce_loss": 2.5310926048405236e-06 + }, + { + "epoch": 0.4232746687759541, + "step": 4281, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.4232746687759541, + "step": 4281, + "train/total_loss": 0.0664065033197403 + }, + { + "entropy": 9.334671974182129, + "epoch": 0.42337354162546964, + "mean_token_accuracy": 0.7144754528999329, + "num_tokens": 1472462.0, + "step": 4282, + "train/ce_loss": 3.7898300888628e-06 + }, + { + "epoch": 0.42337354162546964, + "step": 4282, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.42337354162546964, + "step": 4282, + "train/total_loss": 0.04687537997961044 + }, + { + "entropy": 9.370052337646484, + "epoch": 0.4234724144749852, + "mean_token_accuracy": 0.8143274784088135, + "num_tokens": 1477587.0, + "step": 4283, + "train/ce_loss": 0.45702382922172546 + }, + { + "epoch": 0.4234724144749852, + "step": 4283, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.4234724144749852, + "step": 4283, + "train/total_loss": 0.08085863292217255 + }, + { + "entropy": 9.209254264831543, + "epoch": 0.42357128732450067, + "mean_token_accuracy": 0.7461928725242615, + "num_tokens": 1482840.0, + "step": 4284, + "train/ce_loss": 0.7366754412651062 + }, + { + "epoch": 0.42357128732450067, + "step": 4284, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.42357128732450067, + "step": 4284, + "train/total_loss": 0.15960505604743958 + }, + { + "entropy": 9.531827926635742, + "epoch": 0.4236701601740162, + "mean_token_accuracy": 0.7326732873916626, + "num_tokens": 1487818.0, + "step": 4285, + "train/ce_loss": 2.4422627120657125e-06 + }, + { + "epoch": 0.4236701601740162, + "step": 4285, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.4236701601740162, + "step": 4285, + "train/total_loss": 0.0625002458691597 + }, + { + "entropy": 8.964927673339844, + "epoch": 0.42376903302353175, + "mean_token_accuracy": 0.7400000095367432, + "num_tokens": 1493137.0, + "step": 4286, + "train/ce_loss": 1.0670584440231323 + }, + { + "epoch": 0.42376903302353175, + "step": 4286, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.42376903302353175, + "step": 4286, + "train/total_loss": 0.18092459440231323 + }, + { + "entropy": 9.768739700317383, + "epoch": 0.42386790587304723, + "mean_token_accuracy": 0.6830986142158508, + "num_tokens": 1497995.0, + "step": 4287, + "train/ce_loss": 2.5790059566497803 + }, + { + "epoch": 0.42386790587304723, + "step": 4287, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.42386790587304723, + "step": 4287, + "train/total_loss": 0.359463095664978 + }, + { + "entropy": 8.72038459777832, + "epoch": 0.4239667787225628, + "mean_token_accuracy": 0.7435653209686279, + "num_tokens": 1503577.0, + "step": 4288, + "train/ce_loss": 0.43054938316345215 + }, + { + "epoch": 0.4239667787225628, + "step": 4288, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.4239667787225628, + "step": 4288, + "train/total_loss": 0.06258618831634521 + }, + { + "entropy": 9.362419128417969, + "epoch": 0.4240656515720783, + "mean_token_accuracy": 0.7729323506355286, + "num_tokens": 1508689.0, + "step": 4289, + "train/ce_loss": 1.0027906894683838 + }, + { + "epoch": 0.4240656515720783, + "step": 4289, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.4240656515720783, + "step": 4289, + "train/total_loss": 0.1705915629863739 + }, + { + "entropy": 9.029733657836914, + "epoch": 0.4241645244215938, + "mean_token_accuracy": 0.6946848034858704, + "num_tokens": 1513942.0, + "step": 4290, + "train/ce_loss": 1.0345737934112549 + }, + { + "epoch": 0.4241645244215938, + "step": 4290, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.4241645244215938, + "step": 4290, + "train/total_loss": 0.19720739126205444 + }, + { + "entropy": 9.854422569274902, + "epoch": 0.42426339727110934, + "mean_token_accuracy": 0.6627358198165894, + "num_tokens": 1518783.0, + "step": 4291, + "train/ce_loss": 2.7461735498945927e-06 + }, + { + "epoch": 0.42426339727110934, + "step": 4291, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.42426339727110934, + "step": 4291, + "train/total_loss": 0.06250027567148209 + }, + { + "entropy": 8.991606712341309, + "epoch": 0.4243622701206249, + "mean_token_accuracy": 0.758368194103241, + "num_tokens": 1524212.0, + "step": 4292, + "train/ce_loss": 0.4276218116283417 + }, + { + "epoch": 0.4243622701206249, + "step": 4292, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.4243622701206249, + "step": 4292, + "train/total_loss": 0.06229343265295029 + }, + { + "entropy": 9.449363708496094, + "epoch": 0.4244611429701404, + "mean_token_accuracy": 0.791540801525116, + "num_tokens": 1529334.0, + "step": 4293, + "train/ce_loss": 0.5087490677833557 + }, + { + "epoch": 0.4244611429701404, + "step": 4293, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.4244611429701404, + "step": 4293, + "train/total_loss": 0.06649990379810333 + }, + { + "entropy": 8.642279624938965, + "epoch": 0.4245600158196559, + "mean_token_accuracy": 0.7688171863555908, + "num_tokens": 1534765.0, + "step": 4294, + "train/ce_loss": 0.7547642588615417 + }, + { + "epoch": 0.4245600158196559, + "step": 4294, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.4245600158196559, + "step": 4294, + "train/total_loss": 0.15360143780708313 + }, + { + "entropy": 9.176563262939453, + "epoch": 0.42465888866917145, + "mean_token_accuracy": 0.7251908183097839, + "num_tokens": 1539943.0, + "step": 4295, + "train/ce_loss": 1.0541415214538574 + }, + { + "epoch": 0.42465888866917145, + "step": 4295, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.42465888866917145, + "step": 4295, + "train/total_loss": 0.15228915214538574 + }, + { + "entropy": 9.577062606811523, + "epoch": 0.424757761518687, + "mean_token_accuracy": 0.7086092829704285, + "num_tokens": 1545004.0, + "step": 4296, + "train/ce_loss": 1.3417185544967651 + }, + { + "epoch": 0.424757761518687, + "step": 4296, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.424757761518687, + "step": 4296, + "train/total_loss": 0.18495310842990875 + }, + { + "entropy": 9.091691970825195, + "epoch": 0.4248566343682025, + "mean_token_accuracy": 0.7349665760993958, + "num_tokens": 1550347.0, + "step": 4297, + "train/ce_loss": 1.865136742591858 + }, + { + "epoch": 0.4248566343682025, + "step": 4297, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.4248566343682025, + "step": 4297, + "train/total_loss": 0.24510742723941803 + }, + { + "entropy": 8.988584518432617, + "epoch": 0.424955507217718, + "mean_token_accuracy": 0.7348777055740356, + "num_tokens": 1555663.0, + "step": 4298, + "train/ce_loss": 1.3776904344558716 + }, + { + "epoch": 0.424955507217718, + "step": 4298, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.424955507217718, + "step": 4298, + "train/total_loss": 0.19245654344558716 + }, + { + "entropy": 9.33218002319336, + "epoch": 0.42505438006723356, + "mean_token_accuracy": 0.6875, + "num_tokens": 1560946.0, + "step": 4299, + "train/ce_loss": 0.9154267907142639 + }, + { + "epoch": 0.42505438006723356, + "step": 4299, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.42505438006723356, + "step": 4299, + "train/total_loss": 0.20873019099235535 + }, + { + "epoch": 0.42515325291674905, + "grad_norm": 0.8996347188949585, + "learning_rate": 8.939573752657866e-06, + "loss": 0.1505, + "step": 4300 + }, + { + "entropy": 9.435907363891602, + "epoch": 0.42515325291674905, + "mean_token_accuracy": 0.7455782294273376, + "num_tokens": 1566058.0, + "step": 4300, + "train/ce_loss": 0.8440230488777161 + }, + { + "epoch": 0.42515325291674905, + "step": 4300, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.42515325291674905, + "step": 4300, + "train/total_loss": 0.12346480786800385 + }, + { + "entropy": 9.667113304138184, + "epoch": 0.4252521257662646, + "mean_token_accuracy": 0.7288732528686523, + "num_tokens": 1571051.0, + "step": 4301, + "train/ce_loss": 1.0589016675949097 + }, + { + "epoch": 0.4252521257662646, + "step": 4301, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.4252521257662646, + "step": 4301, + "train/total_loss": 0.1722964197397232 + }, + { + "entropy": 8.825034141540527, + "epoch": 0.42535099861578013, + "mean_token_accuracy": 0.8170731663703918, + "num_tokens": 1576471.0, + "step": 4302, + "train/ce_loss": 0.45814794301986694 + }, + { + "epoch": 0.42535099861578013, + "step": 4302, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.42535099861578013, + "step": 4302, + "train/total_loss": 0.07315854728221893 + }, + { + "entropy": 8.938833236694336, + "epoch": 0.4254498714652956, + "mean_token_accuracy": 0.6905537247657776, + "num_tokens": 1581861.0, + "step": 4303, + "train/ce_loss": 1.396524429321289 + }, + { + "epoch": 0.4254498714652956, + "step": 4303, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.4254498714652956, + "step": 4303, + "train/total_loss": 0.21387119591236115 + }, + { + "entropy": 9.038439750671387, + "epoch": 0.42554874431481116, + "mean_token_accuracy": 0.7139713764190674, + "num_tokens": 1587255.0, + "step": 4304, + "train/ce_loss": 0.8129435777664185 + }, + { + "epoch": 0.42554874431481116, + "step": 4304, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.42554874431481116, + "step": 4304, + "train/total_loss": 0.13988810777664185 + }, + { + "entropy": 9.519832611083984, + "epoch": 0.4256476171643267, + "mean_token_accuracy": 0.7729195952415466, + "num_tokens": 1592395.0, + "step": 4305, + "train/ce_loss": 0.9152183532714844 + }, + { + "epoch": 0.4256476171643267, + "step": 4305, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.4256476171643267, + "step": 4305, + "train/total_loss": 0.15402182936668396 + }, + { + "entropy": 9.419305801391602, + "epoch": 0.4257464900138422, + "mean_token_accuracy": 0.7150837779045105, + "num_tokens": 1597364.0, + "step": 4306, + "train/ce_loss": 2.3006148239801405e-06 + }, + { + "epoch": 0.4257464900138422, + "step": 4306, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.4257464900138422, + "step": 4306, + "train/total_loss": 0.023437730967998505 + }, + { + "entropy": 9.322014808654785, + "epoch": 0.4258453628633577, + "mean_token_accuracy": 0.7663185596466064, + "num_tokens": 1602577.0, + "step": 4307, + "train/ce_loss": 0.5652867555618286 + }, + { + "epoch": 0.4258453628633577, + "step": 4307, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.4258453628633577, + "step": 4307, + "train/total_loss": 0.10730992257595062 + }, + { + "entropy": 9.617816925048828, + "epoch": 0.42594423571287326, + "mean_token_accuracy": 0.7290909290313721, + "num_tokens": 1607532.0, + "step": 4308, + "train/ce_loss": 0.9149979948997498 + }, + { + "epoch": 0.42594423571287326, + "step": 4308, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.42594423571287326, + "step": 4308, + "train/total_loss": 0.16181230545043945 + }, + { + "entropy": 9.620207786560059, + "epoch": 0.42604310856238875, + "mean_token_accuracy": 0.7269681692123413, + "num_tokens": 1612599.0, + "step": 4309, + "train/ce_loss": 1.7339175939559937 + }, + { + "epoch": 0.42604310856238875, + "step": 4309, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.42604310856238875, + "step": 4309, + "train/total_loss": 0.23979800939559937 + }, + { + "entropy": 9.4932222366333, + "epoch": 0.4261419814119043, + "mean_token_accuracy": 0.7774524092674255, + "num_tokens": 1617730.0, + "step": 4310, + "train/ce_loss": 0.7773568630218506 + }, + { + "epoch": 0.4261419814119043, + "step": 4310, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.4261419814119043, + "step": 4310, + "train/total_loss": 0.12461068481206894 + }, + { + "entropy": 9.982711791992188, + "epoch": 0.42624085426141983, + "mean_token_accuracy": 0.6958763003349304, + "num_tokens": 1622529.0, + "step": 4311, + "train/ce_loss": 2.5695910453796387 + }, + { + "epoch": 0.42624085426141983, + "step": 4311, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.42624085426141983, + "step": 4311, + "train/total_loss": 0.30383411049842834 + }, + { + "entropy": 9.458898544311523, + "epoch": 0.4263397271109353, + "mean_token_accuracy": 0.766153872013092, + "num_tokens": 1627604.0, + "step": 4312, + "train/ce_loss": 4.623173936124658e-06 + }, + { + "epoch": 0.4263397271109353, + "step": 4312, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.4263397271109353, + "step": 4312, + "train/total_loss": 0.06250046193599701 + }, + { + "entropy": 8.879953384399414, + "epoch": 0.42643859996045086, + "mean_token_accuracy": 0.7508342862129211, + "num_tokens": 1632977.0, + "step": 4313, + "train/ce_loss": 0.8125248551368713 + }, + { + "epoch": 0.42643859996045086, + "step": 4313, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.42643859996045086, + "step": 4313, + "train/total_loss": 0.14375248551368713 + }, + { + "entropy": 9.249433517456055, + "epoch": 0.4265374728099664, + "mean_token_accuracy": 0.7860082387924194, + "num_tokens": 1638216.0, + "step": 4314, + "train/ce_loss": 0.9679844379425049 + }, + { + "epoch": 0.4265374728099664, + "step": 4314, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.4265374728099664, + "step": 4314, + "train/total_loss": 0.13976719975471497 + }, + { + "entropy": 9.489304542541504, + "epoch": 0.4266363456594819, + "mean_token_accuracy": 0.7419354915618896, + "num_tokens": 1643212.0, + "step": 4315, + "train/ce_loss": 2.237540456917486e-06 + }, + { + "epoch": 0.4266363456594819, + "step": 4315, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.4266363456594819, + "step": 4315, + "train/total_loss": 0.03906272351741791 + }, + { + "entropy": 9.41923713684082, + "epoch": 0.4267352185089974, + "mean_token_accuracy": 0.7376000285148621, + "num_tokens": 1648301.0, + "step": 4316, + "train/ce_loss": 3.542704234860139e-06 + }, + { + "epoch": 0.4267352185089974, + "step": 4316, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.4267352185089974, + "step": 4316, + "train/total_loss": 0.046875353902578354 + }, + { + "entropy": 9.066547393798828, + "epoch": 0.42683409135851297, + "mean_token_accuracy": 0.7229050397872925, + "num_tokens": 1653821.0, + "step": 4317, + "train/ce_loss": 0.4514893889427185 + }, + { + "epoch": 0.42683409135851297, + "step": 4317, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.42683409135851297, + "step": 4317, + "train/total_loss": 0.15061768889427185 + }, + { + "entropy": 9.121603965759277, + "epoch": 0.42693296420802845, + "mean_token_accuracy": 0.698019802570343, + "num_tokens": 1659091.0, + "step": 4318, + "train/ce_loss": 1.3111400604248047 + }, + { + "epoch": 0.42693296420802845, + "step": 4318, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.42693296420802845, + "step": 4318, + "train/total_loss": 0.20533275604248047 + }, + { + "entropy": 9.072944641113281, + "epoch": 0.427031837057544, + "mean_token_accuracy": 0.6868250370025635, + "num_tokens": 1664538.0, + "step": 4319, + "train/ce_loss": 0.5692331194877625 + }, + { + "epoch": 0.427031837057544, + "step": 4319, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.427031837057544, + "step": 4319, + "train/total_loss": 0.13114206492900848 + }, + { + "epoch": 0.42713070990705954, + "grad_norm": 0.729607880115509, + "learning_rate": 8.934628887899916e-06, + "loss": 0.1417, + "step": 4320 + }, + { + "entropy": 9.600298881530762, + "epoch": 0.42713070990705954, + "mean_token_accuracy": 0.7055920958518982, + "num_tokens": 1669550.0, + "step": 4320, + "train/ce_loss": 1.7043551206588745 + }, + { + "epoch": 0.42713070990705954, + "step": 4320, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.42713070990705954, + "step": 4320, + "train/total_loss": 0.23293551802635193 + }, + { + "entropy": 8.856857299804688, + "epoch": 0.427229582756575, + "mean_token_accuracy": 0.7200000286102295, + "num_tokens": 1674829.0, + "step": 4321, + "train/ce_loss": 0.8603862524032593 + }, + { + "epoch": 0.427229582756575, + "step": 4321, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.427229582756575, + "step": 4321, + "train/total_loss": 0.13291361927986145 + }, + { + "entropy": 9.070219039916992, + "epoch": 0.42732845560609056, + "mean_token_accuracy": 0.7744966149330139, + "num_tokens": 1680009.0, + "step": 4322, + "train/ce_loss": 0.5782650709152222 + }, + { + "epoch": 0.42732845560609056, + "step": 4322, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.42732845560609056, + "step": 4322, + "train/total_loss": 0.10470150411128998 + }, + { + "entropy": 8.989773750305176, + "epoch": 0.4274273284556061, + "mean_token_accuracy": 0.7233532667160034, + "num_tokens": 1685396.0, + "step": 4323, + "train/ce_loss": 0.9252065420150757 + }, + { + "epoch": 0.4274273284556061, + "step": 4323, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4274273284556061, + "step": 4323, + "train/total_loss": 0.14720815420150757 + }, + { + "entropy": 9.072076797485352, + "epoch": 0.4275262013051216, + "mean_token_accuracy": 0.7448200583457947, + "num_tokens": 1690761.0, + "step": 4324, + "train/ce_loss": 0.8619343638420105 + }, + { + "epoch": 0.4275262013051216, + "step": 4324, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.4275262013051216, + "step": 4324, + "train/total_loss": 0.18775594234466553 + }, + { + "entropy": 9.002479553222656, + "epoch": 0.42762507415463713, + "mean_token_accuracy": 0.7133890986442566, + "num_tokens": 1696123.0, + "step": 4325, + "train/ce_loss": 1.1929075717926025 + }, + { + "epoch": 0.42762507415463713, + "step": 4325, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.42762507415463713, + "step": 4325, + "train/total_loss": 0.1622595191001892 + }, + { + "entropy": 8.67890739440918, + "epoch": 0.42772394700415267, + "mean_token_accuracy": 0.800000011920929, + "num_tokens": 1701618.0, + "step": 4326, + "train/ce_loss": 0.34899601340293884 + }, + { + "epoch": 0.42772394700415267, + "step": 4326, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.42772394700415267, + "step": 4326, + "train/total_loss": 0.09349335730075836 + }, + { + "entropy": 9.073480606079102, + "epoch": 0.42782281985366816, + "mean_token_accuracy": 0.7301775217056274, + "num_tokens": 1706933.0, + "step": 4327, + "train/ce_loss": 0.7541035413742065 + }, + { + "epoch": 0.42782281985366816, + "step": 4327, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.42782281985366816, + "step": 4327, + "train/total_loss": 0.11447285860776901 + }, + { + "entropy": 8.725275039672852, + "epoch": 0.4279216927031837, + "mean_token_accuracy": 0.7545367479324341, + "num_tokens": 1712425.0, + "step": 4328, + "train/ce_loss": 0.6070301532745361 + }, + { + "epoch": 0.4279216927031837, + "step": 4328, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.4279216927031837, + "step": 4328, + "train/total_loss": 0.08023426681756973 + }, + { + "entropy": 9.50638198852539, + "epoch": 0.42802056555269924, + "mean_token_accuracy": 0.7130559682846069, + "num_tokens": 1717579.0, + "step": 4329, + "train/ce_loss": 3.69126155419508e-06 + }, + { + "epoch": 0.42802056555269924, + "step": 4329, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.42802056555269924, + "step": 4329, + "train/total_loss": 0.05859411880373955 + }, + { + "entropy": 9.952825546264648, + "epoch": 0.4281194384022147, + "mean_token_accuracy": 0.7051281929016113, + "num_tokens": 1722420.0, + "step": 4330, + "train/ce_loss": 3.110465968347853e-06 + }, + { + "epoch": 0.4281194384022147, + "step": 4330, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.4281194384022147, + "step": 4330, + "train/total_loss": 0.02734406106173992 + }, + { + "entropy": 8.836921691894531, + "epoch": 0.42821831125173027, + "mean_token_accuracy": 0.7042410969734192, + "num_tokens": 1727834.0, + "step": 4331, + "train/ce_loss": 1.8216817378997803 + }, + { + "epoch": 0.42821831125173027, + "step": 4331, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.42821831125173027, + "step": 4331, + "train/total_loss": 0.260293185710907 + }, + { + "entropy": 9.085926055908203, + "epoch": 0.4283171841012458, + "mean_token_accuracy": 0.7299363017082214, + "num_tokens": 1733113.0, + "step": 4332, + "train/ce_loss": 0.8461929559707642 + }, + { + "epoch": 0.4283171841012458, + "step": 4332, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.4283171841012458, + "step": 4332, + "train/total_loss": 0.16274429857730865 + }, + { + "entropy": 9.921957015991211, + "epoch": 0.42841605695076135, + "mean_token_accuracy": 0.7659574747085571, + "num_tokens": 1737805.0, + "step": 4333, + "train/ce_loss": 1.589762806892395 + }, + { + "epoch": 0.42841605695076135, + "step": 4333, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.42841605695076135, + "step": 4333, + "train/total_loss": 0.23710128664970398 + }, + { + "entropy": 9.177725791931152, + "epoch": 0.42851492980027683, + "mean_token_accuracy": 0.7027707695960999, + "num_tokens": 1743085.0, + "step": 4334, + "train/ce_loss": 1.3322070837020874 + }, + { + "epoch": 0.42851492980027683, + "step": 4334, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.42851492980027683, + "step": 4334, + "train/total_loss": 0.18400196731090546 + }, + { + "entropy": 8.909058570861816, + "epoch": 0.4286138026497924, + "mean_token_accuracy": 0.7259668707847595, + "num_tokens": 1748419.0, + "step": 4335, + "train/ce_loss": 0.4872305989265442 + }, + { + "epoch": 0.4286138026497924, + "step": 4335, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.4286138026497924, + "step": 4335, + "train/total_loss": 0.06825430691242218 + }, + { + "entropy": 8.963470458984375, + "epoch": 0.4287126754993079, + "mean_token_accuracy": 0.7875136733055115, + "num_tokens": 1753772.0, + "step": 4336, + "train/ce_loss": 0.7437989711761475 + }, + { + "epoch": 0.4287126754993079, + "step": 4336, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.4287126754993079, + "step": 4336, + "train/total_loss": 0.18766114115715027 + }, + { + "entropy": 9.545357704162598, + "epoch": 0.4288115483488234, + "mean_token_accuracy": 0.6684684753417969, + "num_tokens": 1758747.0, + "step": 4337, + "train/ce_loss": 4.981597157893702e-06 + }, + { + "epoch": 0.4288115483488234, + "step": 4337, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.4288115483488234, + "step": 4337, + "train/total_loss": 0.0820317491889 + }, + { + "entropy": 8.90612506866455, + "epoch": 0.42891042119833894, + "mean_token_accuracy": 0.7734042406082153, + "num_tokens": 1764338.0, + "step": 4338, + "train/ce_loss": 0.468054860830307 + }, + { + "epoch": 0.42891042119833894, + "step": 4338, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.42891042119833894, + "step": 4338, + "train/total_loss": 0.1366492360830307 + }, + { + "entropy": 9.132699966430664, + "epoch": 0.4290092940478545, + "mean_token_accuracy": 0.6747159361839294, + "num_tokens": 1769519.0, + "step": 4339, + "train/ce_loss": 0.6216713190078735 + }, + { + "epoch": 0.4290092940478545, + "step": 4339, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.4290092940478545, + "step": 4339, + "train/total_loss": 0.13247963786125183 + }, + { + "epoch": 0.42910816689736997, + "grad_norm": 0.904602587223053, + "learning_rate": 8.929684023141968e-06, + "loss": 0.1537, + "step": 4340 + }, + { + "entropy": 8.995580673217773, + "epoch": 0.42910816689736997, + "mean_token_accuracy": 0.6687631011009216, + "num_tokens": 1774949.0, + "step": 4340, + "train/ce_loss": 0.8476549983024597 + }, + { + "epoch": 0.42910816689736997, + "step": 4340, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.42910816689736997, + "step": 4340, + "train/total_loss": 0.12382800132036209 + }, + { + "entropy": 8.944741249084473, + "epoch": 0.4292070397468855, + "mean_token_accuracy": 0.707975447177887, + "num_tokens": 1780237.0, + "step": 4341, + "train/ce_loss": 1.2588831186294556 + }, + { + "epoch": 0.4292070397468855, + "step": 4341, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.4292070397468855, + "step": 4341, + "train/total_loss": 0.20791956782341003 + }, + { + "entropy": 9.220817565917969, + "epoch": 0.42930591259640105, + "mean_token_accuracy": 0.6861979365348816, + "num_tokens": 1785491.0, + "step": 4342, + "train/ce_loss": 1.0993037223815918 + }, + { + "epoch": 0.42930591259640105, + "step": 4342, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.42930591259640105, + "step": 4342, + "train/total_loss": 0.1607116162776947 + }, + { + "entropy": 9.002422332763672, + "epoch": 0.42940478544591654, + "mean_token_accuracy": 0.7678571343421936, + "num_tokens": 1790793.0, + "step": 4343, + "train/ce_loss": 0.8925429582595825 + }, + { + "epoch": 0.42940478544591654, + "step": 4343, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.42940478544591654, + "step": 4343, + "train/total_loss": 0.14394178986549377 + }, + { + "entropy": 8.725269317626953, + "epoch": 0.4295036582954321, + "mean_token_accuracy": 0.7561797499656677, + "num_tokens": 1796128.0, + "step": 4344, + "train/ce_loss": 0.9660356044769287 + }, + { + "epoch": 0.4295036582954321, + "step": 4344, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.4295036582954321, + "step": 4344, + "train/total_loss": 0.16300982236862183 + }, + { + "entropy": 8.95657730102539, + "epoch": 0.4296025311449476, + "mean_token_accuracy": 0.7015834450721741, + "num_tokens": 1801405.0, + "step": 4345, + "train/ce_loss": 0.7507035136222839 + }, + { + "epoch": 0.4296025311449476, + "step": 4345, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.4296025311449476, + "step": 4345, + "train/total_loss": 0.1766328513622284 + }, + { + "entropy": 9.237102508544922, + "epoch": 0.4297014039944631, + "mean_token_accuracy": 0.6799007654190063, + "num_tokens": 1806678.0, + "step": 4346, + "train/ce_loss": 1.0277491807937622 + }, + { + "epoch": 0.4297014039944631, + "step": 4346, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.4297014039944631, + "step": 4346, + "train/total_loss": 0.16527491807937622 + }, + { + "entropy": 9.291812896728516, + "epoch": 0.42980027684397865, + "mean_token_accuracy": 0.6936115026473999, + "num_tokens": 1811933.0, + "step": 4347, + "train/ce_loss": 0.6227318048477173 + }, + { + "epoch": 0.42980027684397865, + "step": 4347, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.42980027684397865, + "step": 4347, + "train/total_loss": 0.10524193197488785 + }, + { + "entropy": 8.870292663574219, + "epoch": 0.4298991496934942, + "mean_token_accuracy": 0.7926315665245056, + "num_tokens": 1817358.0, + "step": 4348, + "train/ce_loss": 0.9479617476463318 + }, + { + "epoch": 0.4298991496934942, + "step": 4348, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.4298991496934942, + "step": 4348, + "train/total_loss": 0.15729618072509766 + }, + { + "entropy": 9.563199996948242, + "epoch": 0.4299980225430097, + "mean_token_accuracy": 0.7112902998924255, + "num_tokens": 1822386.0, + "step": 4349, + "train/ce_loss": 1.2201199531555176 + }, + { + "epoch": 0.4299980225430097, + "step": 4349, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.4299980225430097, + "step": 4349, + "train/total_loss": 0.21185573935508728 + }, + { + "entropy": 9.326417922973633, + "epoch": 0.4300968953925252, + "mean_token_accuracy": 0.6901595592498779, + "num_tokens": 1827622.0, + "step": 4350, + "train/ce_loss": 0.9021326303482056 + }, + { + "epoch": 0.4300968953925252, + "step": 4350, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.4300968953925252, + "step": 4350, + "train/total_loss": 0.14099451899528503 + }, + { + "entropy": 9.50046443939209, + "epoch": 0.43019576824204075, + "mean_token_accuracy": 0.7844112515449524, + "num_tokens": 1832669.0, + "step": 4351, + "train/ce_loss": 0.6640459895133972 + }, + { + "epoch": 0.43019576824204075, + "step": 4351, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.43019576824204075, + "step": 4351, + "train/total_loss": 0.08202960342168808 + }, + { + "entropy": 8.944676399230957, + "epoch": 0.43029464109155624, + "mean_token_accuracy": 0.7688098549842834, + "num_tokens": 1837863.0, + "step": 4352, + "train/ce_loss": 0.4522591531276703 + }, + { + "epoch": 0.43029464109155624, + "step": 4352, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.43029464109155624, + "step": 4352, + "train/total_loss": 0.13506966829299927 + }, + { + "entropy": 9.45470905303955, + "epoch": 0.4303935139410718, + "mean_token_accuracy": 0.8481012582778931, + "num_tokens": 1842920.0, + "step": 4353, + "train/ce_loss": 1.6522606074431678e-06 + }, + { + "epoch": 0.4303935139410718, + "step": 4353, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.4303935139410718, + "step": 4353, + "train/total_loss": 0.05859391391277313 + }, + { + "entropy": 9.531600952148438, + "epoch": 0.4304923867905873, + "mean_token_accuracy": 0.7777777910232544, + "num_tokens": 1847968.0, + "step": 4354, + "train/ce_loss": 0.3012424409389496 + }, + { + "epoch": 0.4304923867905873, + "step": 4354, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.4304923867905873, + "step": 4354, + "train/total_loss": 0.1160617470741272 + }, + { + "entropy": 9.445377349853516, + "epoch": 0.4305912596401028, + "mean_token_accuracy": 0.7200621962547302, + "num_tokens": 1853092.0, + "step": 4355, + "train/ce_loss": 1.6494358777999878 + }, + { + "epoch": 0.4305912596401028, + "step": 4355, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.4305912596401028, + "step": 4355, + "train/total_loss": 0.23525609076023102 + }, + { + "entropy": 8.41752815246582, + "epoch": 0.43069013248961835, + "mean_token_accuracy": 0.7389340400695801, + "num_tokens": 1858729.0, + "step": 4356, + "train/ce_loss": 1.3742927312850952 + }, + { + "epoch": 0.43069013248961835, + "step": 4356, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.43069013248961835, + "step": 4356, + "train/total_loss": 0.19602303206920624 + }, + { + "entropy": 9.577810287475586, + "epoch": 0.4307890053391339, + "mean_token_accuracy": 0.7698541283607483, + "num_tokens": 1863928.0, + "step": 4357, + "train/ce_loss": 0.918196439743042 + }, + { + "epoch": 0.4307890053391339, + "step": 4357, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.4307890053391339, + "step": 4357, + "train/total_loss": 0.1855696439743042 + }, + { + "entropy": 8.904155731201172, + "epoch": 0.4308878781886494, + "mean_token_accuracy": 0.7665505409240723, + "num_tokens": 1869201.0, + "step": 4358, + "train/ce_loss": 0.5582960844039917 + }, + { + "epoch": 0.4308878781886494, + "step": 4358, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4308878781886494, + "step": 4358, + "train/total_loss": 0.11051711440086365 + }, + { + "entropy": 9.43018627166748, + "epoch": 0.4309867510381649, + "mean_token_accuracy": 0.7713884711265564, + "num_tokens": 1874546.0, + "step": 4359, + "train/ce_loss": 1.0824960470199585 + }, + { + "epoch": 0.4309867510381649, + "step": 4359, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.4309867510381649, + "step": 4359, + "train/total_loss": 0.23715585470199585 + }, + { + "epoch": 0.43108562388768046, + "grad_norm": 0.731275737285614, + "learning_rate": 8.924739158384019e-06, + "loss": 0.1463, + "step": 4360 + }, + { + "entropy": 9.604022026062012, + "epoch": 0.43108562388768046, + "mean_token_accuracy": 0.7116564512252808, + "num_tokens": 1879636.0, + "step": 4360, + "train/ce_loss": 1.1926167011260986 + }, + { + "epoch": 0.43108562388768046, + "step": 4360, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.43108562388768046, + "step": 4360, + "train/total_loss": 0.23254293203353882 + }, + { + "entropy": 8.92547607421875, + "epoch": 0.43118449673719594, + "mean_token_accuracy": 0.7513157725334167, + "num_tokens": 1884836.0, + "step": 4361, + "train/ce_loss": 0.5982657670974731 + }, + { + "epoch": 0.43118449673719594, + "step": 4361, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.43118449673719594, + "step": 4361, + "train/total_loss": 0.1379515826702118 + }, + { + "entropy": 8.663128852844238, + "epoch": 0.4312833695867115, + "mean_token_accuracy": 0.7497414946556091, + "num_tokens": 1890228.0, + "step": 4362, + "train/ce_loss": 1.0565632581710815 + }, + { + "epoch": 0.4312833695867115, + "step": 4362, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.4312833695867115, + "step": 4362, + "train/total_loss": 0.18378132581710815 + }, + { + "entropy": 8.881734848022461, + "epoch": 0.431382242436227, + "mean_token_accuracy": 0.7706422209739685, + "num_tokens": 1895571.0, + "step": 4363, + "train/ce_loss": 0.6953521966934204 + }, + { + "epoch": 0.431382242436227, + "step": 4363, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.431382242436227, + "step": 4363, + "train/total_loss": 0.10469146817922592 + }, + { + "entropy": 8.88752555847168, + "epoch": 0.4314811152857425, + "mean_token_accuracy": 0.7144457101821899, + "num_tokens": 1900906.0, + "step": 4364, + "train/ce_loss": 0.595391571521759 + }, + { + "epoch": 0.4314811152857425, + "step": 4364, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.4314811152857425, + "step": 4364, + "train/total_loss": 0.08297665417194366 + }, + { + "entropy": 8.735705375671387, + "epoch": 0.43157998813525805, + "mean_token_accuracy": 0.7157360315322876, + "num_tokens": 1906313.0, + "step": 4365, + "train/ce_loss": 1.6949741840362549 + }, + { + "epoch": 0.43157998813525805, + "step": 4365, + "train/sim_loss": 0.15234375 + }, + { + "epoch": 0.43157998813525805, + "step": 4365, + "train/total_loss": 0.32184118032455444 + }, + { + "entropy": 9.029319763183594, + "epoch": 0.4316788609847736, + "mean_token_accuracy": 0.6994818449020386, + "num_tokens": 1911645.0, + "step": 4366, + "train/ce_loss": 1.2209709882736206 + }, + { + "epoch": 0.4316788609847736, + "step": 4366, + "train/sim_loss": 0.15625 + }, + { + "epoch": 0.4316788609847736, + "step": 4366, + "train/total_loss": 0.27834710478782654 + }, + { + "entropy": 9.10180377960205, + "epoch": 0.4317777338342891, + "mean_token_accuracy": 0.7441540360450745, + "num_tokens": 1916849.0, + "step": 4367, + "train/ce_loss": 0.9811787009239197 + }, + { + "epoch": 0.4317777338342891, + "step": 4367, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.4317777338342891, + "step": 4367, + "train/total_loss": 0.1723366230726242 + }, + { + "entropy": 9.039645195007324, + "epoch": 0.4318766066838046, + "mean_token_accuracy": 0.6883604526519775, + "num_tokens": 1922098.0, + "step": 4368, + "train/ce_loss": 0.7841721177101135 + }, + { + "epoch": 0.4318766066838046, + "step": 4368, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.4318766066838046, + "step": 4368, + "train/total_loss": 0.10966721177101135 + }, + { + "entropy": 8.731348037719727, + "epoch": 0.43197547953332016, + "mean_token_accuracy": 0.7702381014823914, + "num_tokens": 1927427.0, + "step": 4369, + "train/ce_loss": 0.7687444686889648 + }, + { + "epoch": 0.43197547953332016, + "step": 4369, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.43197547953332016, + "step": 4369, + "train/total_loss": 0.11984319984912872 + }, + { + "entropy": 8.865472793579102, + "epoch": 0.43207435238283565, + "mean_token_accuracy": 0.7451971769332886, + "num_tokens": 1932875.0, + "step": 4370, + "train/ce_loss": 0.7522867321968079 + }, + { + "epoch": 0.43207435238283565, + "step": 4370, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.43207435238283565, + "step": 4370, + "train/total_loss": 0.09475992619991302 + }, + { + "entropy": 9.40871810913086, + "epoch": 0.4321732252323512, + "mean_token_accuracy": 0.6855828166007996, + "num_tokens": 1937990.0, + "step": 4371, + "train/ce_loss": 1.3217101097106934 + }, + { + "epoch": 0.4321732252323512, + "step": 4371, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.4321732252323512, + "step": 4371, + "train/total_loss": 0.19076476991176605 + }, + { + "entropy": 8.581138610839844, + "epoch": 0.43227209808186673, + "mean_token_accuracy": 0.764762818813324, + "num_tokens": 1943478.0, + "step": 4372, + "train/ce_loss": 0.5864698886871338 + }, + { + "epoch": 0.43227209808186673, + "step": 4372, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.43227209808186673, + "step": 4372, + "train/total_loss": 0.08208449184894562 + }, + { + "entropy": 9.11349105834961, + "epoch": 0.4323709709313822, + "mean_token_accuracy": 0.753947377204895, + "num_tokens": 1948706.0, + "step": 4373, + "train/ce_loss": 0.831530749797821 + }, + { + "epoch": 0.4323709709313822, + "step": 4373, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.4323709709313822, + "step": 4373, + "train/total_loss": 0.16127806901931763 + }, + { + "entropy": 9.241002082824707, + "epoch": 0.43246984378089776, + "mean_token_accuracy": 0.7766624689102173, + "num_tokens": 1953911.0, + "step": 4374, + "train/ce_loss": 2.589736368463491e-06 + }, + { + "epoch": 0.43246984378089776, + "step": 4374, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.43246984378089776, + "step": 4374, + "train/total_loss": 0.027344008907675743 + }, + { + "entropy": 9.518906593322754, + "epoch": 0.4325687166304133, + "mean_token_accuracy": 0.7632450461387634, + "num_tokens": 1958934.0, + "step": 4375, + "train/ce_loss": 1.8838533163070679 + }, + { + "epoch": 0.4325687166304133, + "step": 4375, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.4325687166304133, + "step": 4375, + "train/total_loss": 0.24697908759117126 + }, + { + "entropy": 8.939252853393555, + "epoch": 0.43266758947992884, + "mean_token_accuracy": 0.7582128643989563, + "num_tokens": 1964186.0, + "step": 4376, + "train/ce_loss": 0.8972609043121338 + }, + { + "epoch": 0.43266758947992884, + "step": 4376, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.43266758947992884, + "step": 4376, + "train/total_loss": 0.14050734043121338 + }, + { + "entropy": 9.001517295837402, + "epoch": 0.4327664623294443, + "mean_token_accuracy": 0.7444987893104553, + "num_tokens": 1969436.0, + "step": 4377, + "train/ce_loss": 0.8296143412590027 + }, + { + "epoch": 0.4327664623294443, + "step": 4377, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.4327664623294443, + "step": 4377, + "train/total_loss": 0.11421143263578415 + }, + { + "entropy": 9.012580871582031, + "epoch": 0.43286533517895986, + "mean_token_accuracy": 0.7726218104362488, + "num_tokens": 1974738.0, + "step": 4378, + "train/ce_loss": 0.8610218167304993 + }, + { + "epoch": 0.43286533517895986, + "step": 4378, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.43286533517895986, + "step": 4378, + "train/total_loss": 0.10172718018293381 + }, + { + "entropy": 9.393769264221191, + "epoch": 0.4329642080284754, + "mean_token_accuracy": 0.7551928758621216, + "num_tokens": 1979917.0, + "step": 4379, + "train/ce_loss": 0.5917772054672241 + }, + { + "epoch": 0.4329642080284754, + "step": 4379, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.4329642080284754, + "step": 4379, + "train/total_loss": 0.09042772650718689 + }, + { + "epoch": 0.4330630808779909, + "grad_norm": 0.8429343700408936, + "learning_rate": 8.91979429362607e-06, + "loss": 0.1395, + "step": 4380 + }, + { + "entropy": 9.748490333557129, + "epoch": 0.4330630808779909, + "mean_token_accuracy": 0.8401727676391602, + "num_tokens": 1984804.0, + "step": 4380, + "train/ce_loss": 0.794469952583313 + }, + { + "epoch": 0.4330630808779909, + "step": 4380, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.4330630808779909, + "step": 4380, + "train/total_loss": 0.10288449376821518 + }, + { + "entropy": 8.892608642578125, + "epoch": 0.43316195372750643, + "mean_token_accuracy": 0.7439824938774109, + "num_tokens": 1990143.0, + "step": 4381, + "train/ce_loss": 0.8625227808952332 + }, + { + "epoch": 0.43316195372750643, + "step": 4381, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.43316195372750643, + "step": 4381, + "train/total_loss": 0.10187727957963943 + }, + { + "entropy": 9.310935974121094, + "epoch": 0.433260826577022, + "mean_token_accuracy": 0.7307692170143127, + "num_tokens": 1995295.0, + "step": 4382, + "train/ce_loss": 1.0909391641616821 + }, + { + "epoch": 0.433260826577022, + "step": 4382, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.433260826577022, + "step": 4382, + "train/total_loss": 0.14815641939640045 + }, + { + "entropy": 8.98115062713623, + "epoch": 0.43335969942653746, + "mean_token_accuracy": 0.7350427508354187, + "num_tokens": 2000609.0, + "step": 4383, + "train/ce_loss": 1.1102403402328491 + }, + { + "epoch": 0.43335969942653746, + "step": 4383, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.43335969942653746, + "step": 4383, + "train/total_loss": 0.14618028700351715 + }, + { + "entropy": 8.781379699707031, + "epoch": 0.433458572276053, + "mean_token_accuracy": 0.7820796370506287, + "num_tokens": 2006028.0, + "step": 4384, + "train/ce_loss": 0.9370577931404114 + }, + { + "epoch": 0.433458572276053, + "step": 4384, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.433458572276053, + "step": 4384, + "train/total_loss": 0.10933078080415726 + }, + { + "entropy": 9.149648666381836, + "epoch": 0.43355744512556854, + "mean_token_accuracy": 0.7650063633918762, + "num_tokens": 2011440.0, + "step": 4385, + "train/ce_loss": 0.5976223945617676 + }, + { + "epoch": 0.43355744512556854, + "step": 4385, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.43355744512556854, + "step": 4385, + "train/total_loss": 0.08710598945617676 + }, + { + "entropy": 9.48122501373291, + "epoch": 0.433656317975084, + "mean_token_accuracy": 0.687821626663208, + "num_tokens": 2016434.0, + "step": 4386, + "train/ce_loss": 1.4672801853521378e-06 + }, + { + "epoch": 0.433656317975084, + "step": 4386, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.433656317975084, + "step": 4386, + "train/total_loss": 0.02343764714896679 + }, + { + "entropy": 9.54377555847168, + "epoch": 0.43375519082459957, + "mean_token_accuracy": 0.7569444179534912, + "num_tokens": 2021561.0, + "step": 4387, + "train/ce_loss": 0.7149176597595215 + }, + { + "epoch": 0.43375519082459957, + "step": 4387, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.43375519082459957, + "step": 4387, + "train/total_loss": 0.0910230204463005 + }, + { + "entropy": 9.069219589233398, + "epoch": 0.4338540636741151, + "mean_token_accuracy": 0.7426108121871948, + "num_tokens": 2026994.0, + "step": 4388, + "train/ce_loss": 0.4522751569747925 + }, + { + "epoch": 0.4338540636741151, + "step": 4388, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.4338540636741151, + "step": 4388, + "train/total_loss": 0.09600876271724701 + }, + { + "entropy": 8.795243263244629, + "epoch": 0.4339529365236306, + "mean_token_accuracy": 0.7527812123298645, + "num_tokens": 2032231.0, + "step": 4389, + "train/ce_loss": 0.7872169613838196 + }, + { + "epoch": 0.4339529365236306, + "step": 4389, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.4339529365236306, + "step": 4389, + "train/total_loss": 0.14512795209884644 + }, + { + "entropy": 9.059467315673828, + "epoch": 0.43405180937314614, + "mean_token_accuracy": 0.7508571147918701, + "num_tokens": 2037591.0, + "step": 4390, + "train/ce_loss": 0.9338961243629456 + }, + { + "epoch": 0.43405180937314614, + "step": 4390, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.43405180937314614, + "step": 4390, + "train/total_loss": 0.1246396154165268 + }, + { + "entropy": 9.725197792053223, + "epoch": 0.4341506822226617, + "mean_token_accuracy": 0.7208872437477112, + "num_tokens": 2042575.0, + "step": 4391, + "train/ce_loss": 1.528663992881775 + }, + { + "epoch": 0.4341506822226617, + "step": 4391, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.4341506822226617, + "step": 4391, + "train/total_loss": 0.2153664082288742 + }, + { + "entropy": 9.387313842773438, + "epoch": 0.43424955507217716, + "mean_token_accuracy": 0.7525773048400879, + "num_tokens": 2047701.0, + "step": 4392, + "train/ce_loss": 0.5898157954216003 + }, + { + "epoch": 0.43424955507217716, + "step": 4392, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.43424955507217716, + "step": 4392, + "train/total_loss": 0.09804408252239227 + }, + { + "entropy": 8.884873390197754, + "epoch": 0.4343484279216927, + "mean_token_accuracy": 0.7536231875419617, + "num_tokens": 2053147.0, + "step": 4393, + "train/ce_loss": 0.7696712613105774 + }, + { + "epoch": 0.4343484279216927, + "step": 4393, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.4343484279216927, + "step": 4393, + "train/total_loss": 0.11993587762117386 + }, + { + "entropy": 8.939346313476562, + "epoch": 0.43444730077120824, + "mean_token_accuracy": 0.7422360181808472, + "num_tokens": 2058579.0, + "step": 4394, + "train/ce_loss": 0.7860175371170044 + }, + { + "epoch": 0.43444730077120824, + "step": 4394, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.43444730077120824, + "step": 4394, + "train/total_loss": 0.15282049775123596 + }, + { + "entropy": 9.010202407836914, + "epoch": 0.43454617362072373, + "mean_token_accuracy": 0.6784840822219849, + "num_tokens": 2063801.0, + "step": 4395, + "train/ce_loss": 2.32151460647583 + }, + { + "epoch": 0.43454617362072373, + "step": 4395, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.43454617362072373, + "step": 4395, + "train/total_loss": 0.30637019872665405 + }, + { + "entropy": 9.382925033569336, + "epoch": 0.43464504647023927, + "mean_token_accuracy": 0.7042062282562256, + "num_tokens": 2069058.0, + "step": 4396, + "train/ce_loss": 1.4940139055252075 + }, + { + "epoch": 0.43464504647023927, + "step": 4396, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.43464504647023927, + "step": 4396, + "train/total_loss": 0.27049514651298523 + }, + { + "entropy": 9.745540618896484, + "epoch": 0.4347439193197548, + "mean_token_accuracy": 0.7615230679512024, + "num_tokens": 2074018.0, + "step": 4397, + "train/ce_loss": 2.7070050236943644e-06 + }, + { + "epoch": 0.4347439193197548, + "step": 4397, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.4347439193197548, + "step": 4397, + "train/total_loss": 0.01953152008354664 + }, + { + "entropy": 8.980493545532227, + "epoch": 0.4348427921692703, + "mean_token_accuracy": 0.7279236316680908, + "num_tokens": 2079314.0, + "step": 4398, + "train/ce_loss": 1.029262900352478 + }, + { + "epoch": 0.4348427921692703, + "step": 4398, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.4348427921692703, + "step": 4398, + "train/total_loss": 0.13417628407478333 + }, + { + "entropy": 9.474332809448242, + "epoch": 0.43494166501878584, + "mean_token_accuracy": 0.7854729890823364, + "num_tokens": 2084509.0, + "step": 4399, + "train/ce_loss": 1.1530356407165527 + }, + { + "epoch": 0.43494166501878584, + "step": 4399, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.43494166501878584, + "step": 4399, + "train/total_loss": 0.19733482599258423 + }, + { + "epoch": 0.4350405378683014, + "grad_norm": 0.7104471921920776, + "learning_rate": 8.914849428868121e-06, + "loss": 0.1403, + "step": 4400 + }, + { + "entropy": 9.791597366333008, + "epoch": 0.4350405378683014, + "mean_token_accuracy": 0.8131313323974609, + "num_tokens": 2089348.0, + "step": 4400, + "train/ce_loss": 1.4423400163650513 + }, + { + "epoch": 0.4350405378683014, + "step": 4400, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.4350405378683014, + "step": 4400, + "train/total_loss": 0.18329650163650513 + }, + { + "entropy": 8.359624862670898, + "epoch": 0.43513941071781687, + "mean_token_accuracy": 0.7226890921592712, + "num_tokens": 2094914.0, + "step": 4401, + "train/ce_loss": 1.043874740600586 + }, + { + "epoch": 0.43513941071781687, + "step": 4401, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.43513941071781687, + "step": 4401, + "train/total_loss": 0.16688747704029083 + }, + { + "entropy": 9.358190536499023, + "epoch": 0.4352382835673324, + "mean_token_accuracy": 0.6929460763931274, + "num_tokens": 2100115.0, + "step": 4402, + "train/ce_loss": 2.138016700744629 + }, + { + "epoch": 0.4352382835673324, + "step": 4402, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.4352382835673324, + "step": 4402, + "train/total_loss": 0.29973918199539185 + }, + { + "entropy": 9.750505447387695, + "epoch": 0.43533715641684795, + "mean_token_accuracy": 0.7675438523292542, + "num_tokens": 2104988.0, + "step": 4403, + "train/ce_loss": 0.8936744928359985 + }, + { + "epoch": 0.43533715641684795, + "step": 4403, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.43533715641684795, + "step": 4403, + "train/total_loss": 0.13624244928359985 + }, + { + "entropy": 9.501214027404785, + "epoch": 0.43543602926636343, + "mean_token_accuracy": 0.7021604776382446, + "num_tokens": 2110080.0, + "step": 4404, + "train/ce_loss": 1.308893084526062 + }, + { + "epoch": 0.43543602926636343, + "step": 4404, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.43543602926636343, + "step": 4404, + "train/total_loss": 0.16213931143283844 + }, + { + "entropy": 9.20964241027832, + "epoch": 0.435534902115879, + "mean_token_accuracy": 0.7559171319007874, + "num_tokens": 2115387.0, + "step": 4405, + "train/ce_loss": 2.1970881789457053e-06 + }, + { + "epoch": 0.435534902115879, + "step": 4405, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.435534902115879, + "step": 4405, + "train/total_loss": 0.03515646979212761 + }, + { + "entropy": 8.804841995239258, + "epoch": 0.4356337749653945, + "mean_token_accuracy": 0.7403433322906494, + "num_tokens": 2120795.0, + "step": 4406, + "train/ce_loss": 0.4965910017490387 + }, + { + "epoch": 0.4356337749653945, + "step": 4406, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.4356337749653945, + "step": 4406, + "train/total_loss": 0.11215910315513611 + }, + { + "entropy": 8.861150741577148, + "epoch": 0.43573264781491, + "mean_token_accuracy": 0.7491785287857056, + "num_tokens": 2126175.0, + "step": 4407, + "train/ce_loss": 0.5222152471542358 + }, + { + "epoch": 0.43573264781491, + "step": 4407, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.43573264781491, + "step": 4407, + "train/total_loss": 0.08347152173519135 + }, + { + "entropy": 9.679935455322266, + "epoch": 0.43583152066442554, + "mean_token_accuracy": 0.6915887594223022, + "num_tokens": 2131153.0, + "step": 4408, + "train/ce_loss": 1.8620322942733765 + }, + { + "epoch": 0.43583152066442554, + "step": 4408, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.43583152066442554, + "step": 4408, + "train/total_loss": 0.2799532413482666 + }, + { + "entropy": 9.366382598876953, + "epoch": 0.4359303935139411, + "mean_token_accuracy": 0.7406876683235168, + "num_tokens": 2136351.0, + "step": 4409, + "train/ce_loss": 0.778279721736908 + }, + { + "epoch": 0.4359303935139411, + "step": 4409, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.4359303935139411, + "step": 4409, + "train/total_loss": 0.15204672515392303 + }, + { + "entropy": 9.035079956054688, + "epoch": 0.43602926636345657, + "mean_token_accuracy": 0.7981545329093933, + "num_tokens": 2141681.0, + "step": 4410, + "train/ce_loss": 0.4029959738254547 + }, + { + "epoch": 0.43602926636345657, + "step": 4410, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.43602926636345657, + "step": 4410, + "train/total_loss": 0.08326834440231323 + }, + { + "entropy": 9.631559371948242, + "epoch": 0.4361281392129721, + "mean_token_accuracy": 0.7532894611358643, + "num_tokens": 2146726.0, + "step": 4411, + "train/ce_loss": 0.7673465609550476 + }, + { + "epoch": 0.4361281392129721, + "step": 4411, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.4361281392129721, + "step": 4411, + "train/total_loss": 0.12751591205596924 + }, + { + "entropy": 9.441352844238281, + "epoch": 0.43622701206248765, + "mean_token_accuracy": 0.7333333492279053, + "num_tokens": 2151798.0, + "step": 4412, + "train/ce_loss": 1.9639170169830322 + }, + { + "epoch": 0.43622701206248765, + "step": 4412, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.43622701206248765, + "step": 4412, + "train/total_loss": 0.23545420169830322 + }, + { + "entropy": 8.656509399414062, + "epoch": 0.43632588491200314, + "mean_token_accuracy": 0.7327365875244141, + "num_tokens": 2157047.0, + "step": 4413, + "train/ce_loss": 0.7206571698188782 + }, + { + "epoch": 0.43632588491200314, + "step": 4413, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.43632588491200314, + "step": 4413, + "train/total_loss": 0.14237821102142334 + }, + { + "entropy": 9.049860000610352, + "epoch": 0.4364247577615187, + "mean_token_accuracy": 0.7032418847084045, + "num_tokens": 2162305.0, + "step": 4414, + "train/ce_loss": 0.868624210357666 + }, + { + "epoch": 0.4364247577615187, + "step": 4414, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.4364247577615187, + "step": 4414, + "train/total_loss": 0.10248742252588272 + }, + { + "entropy": 9.341044425964355, + "epoch": 0.4365236306110342, + "mean_token_accuracy": 0.7268128395080566, + "num_tokens": 2167355.0, + "step": 4415, + "train/ce_loss": 1.2310768365859985 + }, + { + "epoch": 0.4365236306110342, + "step": 4415, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.4365236306110342, + "step": 4415, + "train/total_loss": 0.1817014366388321 + }, + { + "entropy": 8.874947547912598, + "epoch": 0.43662250346054976, + "mean_token_accuracy": 0.7199017405509949, + "num_tokens": 2172702.0, + "step": 4416, + "train/ce_loss": 0.9515064358711243 + }, + { + "epoch": 0.43662250346054976, + "step": 4416, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.43662250346054976, + "step": 4416, + "train/total_loss": 0.1381193995475769 + }, + { + "entropy": 8.804052352905273, + "epoch": 0.43672137631006525, + "mean_token_accuracy": 0.732833981513977, + "num_tokens": 2178037.0, + "step": 4417, + "train/ce_loss": 1.0769851207733154 + }, + { + "epoch": 0.43672137631006525, + "step": 4417, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.43672137631006525, + "step": 4417, + "train/total_loss": 0.21707351505756378 + }, + { + "entropy": 9.064543724060059, + "epoch": 0.4368202491595808, + "mean_token_accuracy": 0.6856435537338257, + "num_tokens": 2183303.0, + "step": 4418, + "train/ce_loss": 0.9741151928901672 + }, + { + "epoch": 0.4368202491595808, + "step": 4418, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.4368202491595808, + "step": 4418, + "train/total_loss": 0.18725526332855225 + }, + { + "entropy": 9.179658889770508, + "epoch": 0.43691912200909633, + "mean_token_accuracy": 0.7299168705940247, + "num_tokens": 2188446.0, + "step": 4419, + "train/ce_loss": 0.6126939058303833 + }, + { + "epoch": 0.43691912200909633, + "step": 4419, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.43691912200909633, + "step": 4419, + "train/total_loss": 0.11595688760280609 + }, + { + "epoch": 0.4370179948586118, + "grad_norm": 0.7463781833648682, + "learning_rate": 8.909904564110172e-06, + "loss": 0.1475, + "step": 4420 + }, + { + "entropy": 9.411497116088867, + "epoch": 0.4370179948586118, + "mean_token_accuracy": 0.7161290049552917, + "num_tokens": 2193535.0, + "step": 4420, + "train/ce_loss": 1.4859209060668945 + }, + { + "epoch": 0.4370179948586118, + "step": 4420, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4370179948586118, + "step": 4420, + "train/total_loss": 0.20327959954738617 + }, + { + "entropy": 9.652908325195312, + "epoch": 0.43711686770812735, + "mean_token_accuracy": 0.7307692170143127, + "num_tokens": 2198343.0, + "step": 4421, + "train/ce_loss": 1.3125678300857544 + }, + { + "epoch": 0.43711686770812735, + "step": 4421, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.43711686770812735, + "step": 4421, + "train/total_loss": 0.20156928896903992 + }, + { + "entropy": 9.79127311706543, + "epoch": 0.4372157405576429, + "mean_token_accuracy": 0.7169373631477356, + "num_tokens": 2203158.0, + "step": 4422, + "train/ce_loss": 3.1071267127990723 + }, + { + "epoch": 0.4372157405576429, + "step": 4422, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.4372157405576429, + "step": 4422, + "train/total_loss": 0.39274391531944275 + }, + { + "entropy": 9.302753448486328, + "epoch": 0.4373146134071584, + "mean_token_accuracy": 0.7614424228668213, + "num_tokens": 2208297.0, + "step": 4423, + "train/ce_loss": 1.1405718326568604 + }, + { + "epoch": 0.4373146134071584, + "step": 4423, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.4373146134071584, + "step": 4423, + "train/total_loss": 0.20390093326568604 + }, + { + "entropy": 8.907490730285645, + "epoch": 0.4374134862566739, + "mean_token_accuracy": 0.7476525902748108, + "num_tokens": 2213654.0, + "step": 4424, + "train/ce_loss": 0.684638500213623 + }, + { + "epoch": 0.4374134862566739, + "step": 4424, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.4374134862566739, + "step": 4424, + "train/total_loss": 0.13096386194229126 + }, + { + "entropy": 8.990495681762695, + "epoch": 0.43751235910618946, + "mean_token_accuracy": 0.7110214829444885, + "num_tokens": 2218858.0, + "step": 4425, + "train/ce_loss": 7.0683554440620355e-06 + }, + { + "epoch": 0.43751235910618946, + "step": 4425, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.43751235910618946, + "step": 4425, + "train/total_loss": 0.05468820780515671 + }, + { + "entropy": 9.02608585357666, + "epoch": 0.43761123195570495, + "mean_token_accuracy": 0.7302483320236206, + "num_tokens": 2224232.0, + "step": 4426, + "train/ce_loss": 0.8047318458557129 + }, + { + "epoch": 0.43761123195570495, + "step": 4426, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.43761123195570495, + "step": 4426, + "train/total_loss": 0.1273481845855713 + }, + { + "entropy": 8.982162475585938, + "epoch": 0.4377101048052205, + "mean_token_accuracy": 0.7677419185638428, + "num_tokens": 2229540.0, + "step": 4427, + "train/ce_loss": 0.6013153791427612 + }, + { + "epoch": 0.4377101048052205, + "step": 4427, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.4377101048052205, + "step": 4427, + "train/total_loss": 0.09138153493404388 + }, + { + "entropy": 9.857841491699219, + "epoch": 0.43780897765473603, + "mean_token_accuracy": 0.7625330090522766, + "num_tokens": 2234303.0, + "step": 4428, + "train/ce_loss": 1.9261402485426515e-05 + }, + { + "epoch": 0.43780897765473603, + "step": 4428, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.43780897765473603, + "step": 4428, + "train/total_loss": 0.035158175975084305 + }, + { + "entropy": 9.193056106567383, + "epoch": 0.4379078505042515, + "mean_token_accuracy": 0.7736132144927979, + "num_tokens": 2239384.0, + "step": 4429, + "train/ce_loss": 1.0085554122924805 + }, + { + "epoch": 0.4379078505042515, + "step": 4429, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.4379078505042515, + "step": 4429, + "train/total_loss": 0.18679304420948029 + }, + { + "entropy": 10.10301399230957, + "epoch": 0.43800672335376706, + "mean_token_accuracy": 0.75390625, + "num_tokens": 2244042.0, + "step": 4430, + "train/ce_loss": 1.0564612239249982e-05 + }, + { + "epoch": 0.43800672335376706, + "step": 4430, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.43800672335376706, + "step": 4430, + "train/total_loss": 0.04687605798244476 + }, + { + "entropy": 8.985715866088867, + "epoch": 0.4381055962032826, + "mean_token_accuracy": 0.7023977637290955, + "num_tokens": 2249240.0, + "step": 4431, + "train/ce_loss": 1.4215041399002075 + }, + { + "epoch": 0.4381055962032826, + "step": 4431, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.4381055962032826, + "step": 4431, + "train/total_loss": 0.216369166970253 + }, + { + "entropy": 9.483869552612305, + "epoch": 0.4382044690527981, + "mean_token_accuracy": 0.7144948840141296, + "num_tokens": 2254372.0, + "step": 4432, + "train/ce_loss": 0.6930326223373413 + }, + { + "epoch": 0.4382044690527981, + "step": 4432, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.4382044690527981, + "step": 4432, + "train/total_loss": 0.15133452415466309 + }, + { + "entropy": 8.84086799621582, + "epoch": 0.4383033419023136, + "mean_token_accuracy": 0.7124518752098083, + "num_tokens": 2259648.0, + "step": 4433, + "train/ce_loss": 0.6114106774330139 + }, + { + "epoch": 0.4383033419023136, + "step": 4433, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.4383033419023136, + "step": 4433, + "train/total_loss": 0.15098482370376587 + }, + { + "entropy": 9.003669738769531, + "epoch": 0.43840221475182917, + "mean_token_accuracy": 0.7843137383460999, + "num_tokens": 2265037.0, + "step": 4434, + "train/ce_loss": 0.8309410214424133 + }, + { + "epoch": 0.43840221475182917, + "step": 4434, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.43840221475182917, + "step": 4434, + "train/total_loss": 0.13778160512447357 + }, + { + "entropy": 8.685771942138672, + "epoch": 0.43850108760134465, + "mean_token_accuracy": 0.7375954389572144, + "num_tokens": 2270553.0, + "step": 4435, + "train/ce_loss": 1.0269848108291626 + }, + { + "epoch": 0.43850108760134465, + "step": 4435, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.43850108760134465, + "step": 4435, + "train/total_loss": 0.15347972512245178 + }, + { + "entropy": 9.298337936401367, + "epoch": 0.4385999604508602, + "mean_token_accuracy": 0.6993288397789001, + "num_tokens": 2275770.0, + "step": 4436, + "train/ce_loss": 1.4781383275985718 + }, + { + "epoch": 0.4385999604508602, + "step": 4436, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.4385999604508602, + "step": 4436, + "train/total_loss": 0.1751575917005539 + }, + { + "entropy": 8.53411865234375, + "epoch": 0.43869883330037573, + "mean_token_accuracy": 0.7323818206787109, + "num_tokens": 2281335.0, + "step": 4437, + "train/ce_loss": 0.9833084940910339 + }, + { + "epoch": 0.43869883330037573, + "step": 4437, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.43869883330037573, + "step": 4437, + "train/total_loss": 0.17645585536956787 + }, + { + "entropy": 9.266082763671875, + "epoch": 0.4387977061498912, + "mean_token_accuracy": 0.7793493866920471, + "num_tokens": 2286490.0, + "step": 4438, + "train/ce_loss": 0.9420968890190125 + }, + { + "epoch": 0.4387977061498912, + "step": 4438, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.4387977061498912, + "step": 4438, + "train/total_loss": 0.1723347008228302 + }, + { + "entropy": 9.278593063354492, + "epoch": 0.43889657899940676, + "mean_token_accuracy": 0.743658185005188, + "num_tokens": 2291650.0, + "step": 4439, + "train/ce_loss": 0.7231143712997437 + }, + { + "epoch": 0.43889657899940676, + "step": 4439, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.43889657899940676, + "step": 4439, + "train/total_loss": 0.1543426811695099 + }, + { + "epoch": 0.4389954518489223, + "grad_norm": 0.769729495048523, + "learning_rate": 8.904959699352224e-06, + "loss": 0.1442, + "step": 4440 + }, + { + "entropy": 9.450221061706543, + "epoch": 0.4389954518489223, + "mean_token_accuracy": 0.7487091422080994, + "num_tokens": 2296647.0, + "step": 4440, + "train/ce_loss": 0.8850860595703125 + }, + { + "epoch": 0.4389954518489223, + "step": 4440, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.4389954518489223, + "step": 4440, + "train/total_loss": 0.12366485595703125 + }, + { + "entropy": 9.613666534423828, + "epoch": 0.4390943246984378, + "mean_token_accuracy": 0.692307710647583, + "num_tokens": 2301592.0, + "step": 4441, + "train/ce_loss": 2.9959271614643512e-06 + }, + { + "epoch": 0.4390943246984378, + "step": 4441, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.4390943246984378, + "step": 4441, + "train/total_loss": 0.07031279802322388 + }, + { + "entropy": 9.27306079864502, + "epoch": 0.43919319754795333, + "mean_token_accuracy": 0.7120211124420166, + "num_tokens": 2306781.0, + "step": 4442, + "train/ce_loss": 2.1641550064086914 + }, + { + "epoch": 0.43919319754795333, + "step": 4442, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.43919319754795333, + "step": 4442, + "train/total_loss": 0.30235302448272705 + }, + { + "entropy": 8.658539772033691, + "epoch": 0.43929207039746887, + "mean_token_accuracy": 0.7223427295684814, + "num_tokens": 2312149.0, + "step": 4443, + "train/ce_loss": 1.2649974822998047 + }, + { + "epoch": 0.43929207039746887, + "step": 4443, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.43929207039746887, + "step": 4443, + "train/total_loss": 0.19290600717067719 + }, + { + "entropy": 9.389749526977539, + "epoch": 0.43939094324698436, + "mean_token_accuracy": 0.7435897588729858, + "num_tokens": 2317269.0, + "step": 4444, + "train/ce_loss": 0.3333715796470642 + }, + { + "epoch": 0.43939094324698436, + "step": 4444, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.43939094324698436, + "step": 4444, + "train/total_loss": 0.09583716094493866 + }, + { + "entropy": 9.218347549438477, + "epoch": 0.4394898160964999, + "mean_token_accuracy": 0.6879084706306458, + "num_tokens": 2322329.0, + "step": 4445, + "train/ce_loss": 1.192376971244812 + }, + { + "epoch": 0.4394898160964999, + "step": 4445, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.4394898160964999, + "step": 4445, + "train/total_loss": 0.20908144116401672 + }, + { + "entropy": 9.563655853271484, + "epoch": 0.43958868894601544, + "mean_token_accuracy": 0.664893627166748, + "num_tokens": 2327339.0, + "step": 4446, + "train/ce_loss": 0.9157935380935669 + }, + { + "epoch": 0.43958868894601544, + "step": 4446, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.43958868894601544, + "step": 4446, + "train/total_loss": 0.1423605978488922 + }, + { + "entropy": 9.286184310913086, + "epoch": 0.4396875617955309, + "mean_token_accuracy": 0.7344322204589844, + "num_tokens": 2332353.0, + "step": 4447, + "train/ce_loss": 1.0957647562026978 + }, + { + "epoch": 0.4396875617955309, + "step": 4447, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.4396875617955309, + "step": 4447, + "train/total_loss": 0.17988897860050201 + }, + { + "entropy": 9.33708381652832, + "epoch": 0.43978643464504646, + "mean_token_accuracy": 0.7566037774085999, + "num_tokens": 2337337.0, + "step": 4448, + "train/ce_loss": 3.0838871225569164e-06 + }, + { + "epoch": 0.43978643464504646, + "step": 4448, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.43978643464504646, + "step": 4448, + "train/total_loss": 0.04687530919909477 + }, + { + "entropy": 9.299051284790039, + "epoch": 0.439885307494562, + "mean_token_accuracy": 0.7994467616081238, + "num_tokens": 2342497.0, + "step": 4449, + "train/ce_loss": 5.771404630650068e-06 + }, + { + "epoch": 0.439885307494562, + "step": 4449, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.439885307494562, + "step": 4449, + "train/total_loss": 0.05859432741999626 + }, + { + "entropy": 9.380624771118164, + "epoch": 0.4399841803440775, + "mean_token_accuracy": 0.7617765665054321, + "num_tokens": 2347718.0, + "step": 4450, + "train/ce_loss": 0.9750670790672302 + }, + { + "epoch": 0.4399841803440775, + "step": 4450, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.4399841803440775, + "step": 4450, + "train/total_loss": 0.17563170194625854 + }, + { + "entropy": 8.716615676879883, + "epoch": 0.44008305319359303, + "mean_token_accuracy": 0.7680981755256653, + "num_tokens": 2353076.0, + "step": 4451, + "train/ce_loss": 1.348904013633728 + }, + { + "epoch": 0.44008305319359303, + "step": 4451, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.44008305319359303, + "step": 4451, + "train/total_loss": 0.22864040732383728 + }, + { + "entropy": 9.949045181274414, + "epoch": 0.4401819260431086, + "mean_token_accuracy": 0.7323601245880127, + "num_tokens": 2357858.0, + "step": 4452, + "train/ce_loss": 1.8708515167236328 + }, + { + "epoch": 0.4401819260431086, + "step": 4452, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4401819260431086, + "step": 4452, + "train/total_loss": 0.24177265167236328 + }, + { + "entropy": 9.444894790649414, + "epoch": 0.44028079889262406, + "mean_token_accuracy": 0.7267080545425415, + "num_tokens": 2362948.0, + "step": 4453, + "train/ce_loss": 1.2766855955123901 + }, + { + "epoch": 0.44028079889262406, + "step": 4453, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.44028079889262406, + "step": 4453, + "train/total_loss": 0.155012309551239 + }, + { + "entropy": 10.00452995300293, + "epoch": 0.4403796717421396, + "mean_token_accuracy": 0.7047353982925415, + "num_tokens": 2367711.0, + "step": 4454, + "train/ce_loss": 5.856952611793531e-06 + }, + { + "epoch": 0.4403796717421396, + "step": 4454, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.4403796717421396, + "step": 4454, + "train/total_loss": 0.05078183487057686 + }, + { + "entropy": 8.824409484863281, + "epoch": 0.44047854459165514, + "mean_token_accuracy": 0.7411764860153198, + "num_tokens": 2373109.0, + "step": 4455, + "train/ce_loss": 0.769832193851471 + }, + { + "epoch": 0.44047854459165514, + "step": 4455, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.44047854459165514, + "step": 4455, + "train/total_loss": 0.11213947087526321 + }, + { + "entropy": 8.866456985473633, + "epoch": 0.4405774174411707, + "mean_token_accuracy": 0.716803789138794, + "num_tokens": 2378478.0, + "step": 4456, + "train/ce_loss": 1.0894641876220703 + }, + { + "epoch": 0.4405774174411707, + "step": 4456, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4405774174411707, + "step": 4456, + "train/total_loss": 0.16363391280174255 + }, + { + "entropy": 9.864400863647461, + "epoch": 0.44067629029068617, + "mean_token_accuracy": 0.7126213312149048, + "num_tokens": 2383367.0, + "step": 4457, + "train/ce_loss": 1.4397059679031372 + }, + { + "epoch": 0.44067629029068617, + "step": 4457, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.44067629029068617, + "step": 4457, + "train/total_loss": 0.22990809381008148 + }, + { + "entropy": 8.804418563842773, + "epoch": 0.4407751631402017, + "mean_token_accuracy": 0.7957219481468201, + "num_tokens": 2388794.0, + "step": 4458, + "train/ce_loss": 0.3089240491390228 + }, + { + "epoch": 0.4407751631402017, + "step": 4458, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.4407751631402017, + "step": 4458, + "train/total_loss": 0.04261115565896034 + }, + { + "entropy": 8.95989990234375, + "epoch": 0.44087403598971725, + "mean_token_accuracy": 0.7293144464492798, + "num_tokens": 2394072.0, + "step": 4459, + "train/ce_loss": 1.2242991924285889 + }, + { + "epoch": 0.44087403598971725, + "step": 4459, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.44087403598971725, + "step": 4459, + "train/total_loss": 0.22399242222309113 + }, + { + "epoch": 0.44097290883923274, + "grad_norm": 0.662868082523346, + "learning_rate": 8.900014834594275e-06, + "loss": 0.1468, + "step": 4460 + }, + { + "entropy": 9.7796049118042, + "epoch": 0.44097290883923274, + "mean_token_accuracy": 0.6968504190444946, + "num_tokens": 2399007.0, + "step": 4460, + "train/ce_loss": 2.228492498397827 + }, + { + "epoch": 0.44097290883923274, + "step": 4460, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.44097290883923274, + "step": 4460, + "train/total_loss": 0.2853492498397827 + }, + { + "entropy": 9.543054580688477, + "epoch": 0.4410717816887483, + "mean_token_accuracy": 0.8024263381958008, + "num_tokens": 2404063.0, + "step": 4461, + "train/ce_loss": 0.8705273866653442 + }, + { + "epoch": 0.4410717816887483, + "step": 4461, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.4410717816887483, + "step": 4461, + "train/total_loss": 0.13783398270606995 + }, + { + "entropy": 9.04907512664795, + "epoch": 0.4411706545382638, + "mean_token_accuracy": 0.7178649306297302, + "num_tokens": 2409424.0, + "step": 4462, + "train/ce_loss": 0.821657121181488 + }, + { + "epoch": 0.4411706545382638, + "step": 4462, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.4411706545382638, + "step": 4462, + "train/total_loss": 0.14075946807861328 + }, + { + "entropy": 8.752744674682617, + "epoch": 0.4412695273877793, + "mean_token_accuracy": 0.7126303315162659, + "num_tokens": 2414826.0, + "step": 4463, + "train/ce_loss": 1.4678646326065063 + }, + { + "epoch": 0.4412695273877793, + "step": 4463, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.4412695273877793, + "step": 4463, + "train/total_loss": 0.20538021624088287 + }, + { + "entropy": 9.18133544921875, + "epoch": 0.44136840023729484, + "mean_token_accuracy": 0.7166416645050049, + "num_tokens": 2420018.0, + "step": 4464, + "train/ce_loss": 0.4874807596206665 + }, + { + "epoch": 0.44136840023729484, + "step": 4464, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.44136840023729484, + "step": 4464, + "train/total_loss": 0.10734182596206665 + }, + { + "entropy": 10.004258155822754, + "epoch": 0.4414672730868104, + "mean_token_accuracy": 0.7440476417541504, + "num_tokens": 2424788.0, + "step": 4465, + "train/ce_loss": 1.915676474571228 + }, + { + "epoch": 0.4414672730868104, + "step": 4465, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.4414672730868104, + "step": 4465, + "train/total_loss": 0.28141140937805176 + }, + { + "entropy": 10.212564468383789, + "epoch": 0.44156614593632587, + "mean_token_accuracy": 0.7654867172241211, + "num_tokens": 2429405.0, + "step": 4466, + "train/ce_loss": 5.95388155488763e-06 + }, + { + "epoch": 0.44156614593632587, + "step": 4466, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.44156614593632587, + "step": 4466, + "train/total_loss": 0.046875596046447754 + }, + { + "entropy": 8.973442077636719, + "epoch": 0.4416650187858414, + "mean_token_accuracy": 0.763832688331604, + "num_tokens": 2434591.0, + "step": 4467, + "train/ce_loss": 0.42459407448768616 + }, + { + "epoch": 0.4416650187858414, + "step": 4467, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.4416650187858414, + "step": 4467, + "train/total_loss": 0.0737094134092331 + }, + { + "entropy": 8.980101585388184, + "epoch": 0.44176389163535695, + "mean_token_accuracy": 0.7095671892166138, + "num_tokens": 2439939.0, + "step": 4468, + "train/ce_loss": 1.2070839405059814 + }, + { + "epoch": 0.44176389163535695, + "step": 4468, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.44176389163535695, + "step": 4468, + "train/total_loss": 0.1441459059715271 + }, + { + "entropy": 9.259428977966309, + "epoch": 0.44186276448487244, + "mean_token_accuracy": 0.7447090148925781, + "num_tokens": 2445089.0, + "step": 4469, + "train/ce_loss": 1.2775579690933228 + }, + { + "epoch": 0.44186276448487244, + "step": 4469, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.44186276448487244, + "step": 4469, + "train/total_loss": 0.174630805850029 + }, + { + "entropy": 9.76385498046875, + "epoch": 0.441961637334388, + "mean_token_accuracy": 0.766590416431427, + "num_tokens": 2449918.0, + "step": 4470, + "train/ce_loss": 2.6808668280864367e-06 + }, + { + "epoch": 0.441961637334388, + "step": 4470, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.441961637334388, + "step": 4470, + "train/total_loss": 0.06250026822090149 + }, + { + "entropy": 9.008390426635742, + "epoch": 0.4420605101839035, + "mean_token_accuracy": 0.8142250776290894, + "num_tokens": 2455348.0, + "step": 4471, + "train/ce_loss": 0.43461742997169495 + }, + { + "epoch": 0.4420605101839035, + "step": 4471, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.4420605101839035, + "step": 4471, + "train/total_loss": 0.06689924001693726 + }, + { + "entropy": 9.000526428222656, + "epoch": 0.442159383033419, + "mean_token_accuracy": 0.712435245513916, + "num_tokens": 2460570.0, + "step": 4472, + "train/ce_loss": 0.7038297653198242 + }, + { + "epoch": 0.442159383033419, + "step": 4472, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.442159383033419, + "step": 4472, + "train/total_loss": 0.1055392250418663 + }, + { + "entropy": 8.812034606933594, + "epoch": 0.44225825588293455, + "mean_token_accuracy": 0.7597330212593079, + "num_tokens": 2465975.0, + "step": 4473, + "train/ce_loss": 0.5284525156021118 + }, + { + "epoch": 0.44225825588293455, + "step": 4473, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.44225825588293455, + "step": 4473, + "train/total_loss": 0.07628275454044342 + }, + { + "entropy": 9.443574905395508, + "epoch": 0.4423571287324501, + "mean_token_accuracy": 0.7342767119407654, + "num_tokens": 2471047.0, + "step": 4474, + "train/ce_loss": 3.8014932215446606e-06 + }, + { + "epoch": 0.4423571287324501, + "step": 4474, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.4423571287324501, + "step": 4474, + "train/total_loss": 0.07031287997961044 + }, + { + "entropy": 9.411504745483398, + "epoch": 0.4424560015819656, + "mean_token_accuracy": 0.7269303202629089, + "num_tokens": 2476012.0, + "step": 4475, + "train/ce_loss": 0.9485357999801636 + }, + { + "epoch": 0.4424560015819656, + "step": 4475, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.4424560015819656, + "step": 4475, + "train/total_loss": 0.18469732999801636 + }, + { + "entropy": 9.112486839294434, + "epoch": 0.4425548744314811, + "mean_token_accuracy": 0.7319201827049255, + "num_tokens": 2481280.0, + "step": 4476, + "train/ce_loss": 0.4585082232952118 + }, + { + "epoch": 0.4425548744314811, + "step": 4476, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.4425548744314811, + "step": 4476, + "train/total_loss": 0.08881957828998566 + }, + { + "entropy": 9.011128425598145, + "epoch": 0.44265374728099666, + "mean_token_accuracy": 0.7423887848854065, + "num_tokens": 2486582.0, + "step": 4477, + "train/ce_loss": 0.679169774055481 + }, + { + "epoch": 0.44265374728099666, + "step": 4477, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.44265374728099666, + "step": 4477, + "train/total_loss": 0.16947948932647705 + }, + { + "entropy": 8.964334487915039, + "epoch": 0.44275262013051214, + "mean_token_accuracy": 0.6931949257850647, + "num_tokens": 2491911.0, + "step": 4478, + "train/ce_loss": 0.6913797855377197 + }, + { + "epoch": 0.44275262013051214, + "step": 4478, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.44275262013051214, + "step": 4478, + "train/total_loss": 0.11601298302412033 + }, + { + "entropy": 9.322576522827148, + "epoch": 0.4428514929800277, + "mean_token_accuracy": 0.7546897530555725, + "num_tokens": 2497061.0, + "step": 4479, + "train/ce_loss": 0.8054964542388916 + }, + { + "epoch": 0.4428514929800277, + "step": 4479, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4428514929800277, + "step": 4479, + "train/total_loss": 0.13523715734481812 + }, + { + "epoch": 0.4429503658295432, + "grad_norm": 0.699345052242279, + "learning_rate": 8.895069969836325e-06, + "loss": 0.1442, + "step": 4480 + }, + { + "entropy": 9.255762100219727, + "epoch": 0.4429503658295432, + "mean_token_accuracy": 0.7518796920776367, + "num_tokens": 2502221.0, + "step": 4480, + "train/ce_loss": 0.8112313747406006 + }, + { + "epoch": 0.4429503658295432, + "step": 4480, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.4429503658295432, + "step": 4480, + "train/total_loss": 0.10065438598394394 + }, + { + "entropy": 9.210655212402344, + "epoch": 0.4430492386790587, + "mean_token_accuracy": 0.7523584961891174, + "num_tokens": 2507517.0, + "step": 4481, + "train/ce_loss": 0.8111066818237305 + }, + { + "epoch": 0.4430492386790587, + "step": 4481, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.4430492386790587, + "step": 4481, + "train/total_loss": 0.12798567116260529 + }, + { + "entropy": 8.875221252441406, + "epoch": 0.44314811152857425, + "mean_token_accuracy": 0.7130852341651917, + "num_tokens": 2512856.0, + "step": 4482, + "train/ce_loss": 1.2064754962921143 + }, + { + "epoch": 0.44314811152857425, + "step": 4482, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.44314811152857425, + "step": 4482, + "train/total_loss": 0.15189754962921143 + }, + { + "entropy": 9.424863815307617, + "epoch": 0.4432469843780898, + "mean_token_accuracy": 0.7870778441429138, + "num_tokens": 2518023.0, + "step": 4483, + "train/ce_loss": 1.2678290605545044 + }, + { + "epoch": 0.4432469843780898, + "step": 4483, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.4432469843780898, + "step": 4483, + "train/total_loss": 0.22834540903568268 + }, + { + "entropy": 9.196966171264648, + "epoch": 0.4433458572276053, + "mean_token_accuracy": 0.736775815486908, + "num_tokens": 2523289.0, + "step": 4484, + "train/ce_loss": 1.2598520517349243 + }, + { + "epoch": 0.4433458572276053, + "step": 4484, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.4433458572276053, + "step": 4484, + "train/total_loss": 0.17676645517349243 + }, + { + "entropy": 8.98042106628418, + "epoch": 0.4434447300771208, + "mean_token_accuracy": 0.7055960893630981, + "num_tokens": 2528596.0, + "step": 4485, + "train/ce_loss": 0.34039002656936646 + }, + { + "epoch": 0.4434447300771208, + "step": 4485, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.4434447300771208, + "step": 4485, + "train/total_loss": 0.057476501911878586 + }, + { + "entropy": 9.068885803222656, + "epoch": 0.44354360292663636, + "mean_token_accuracy": 0.7556390762329102, + "num_tokens": 2533928.0, + "step": 4486, + "train/ce_loss": 0.46264269948005676 + }, + { + "epoch": 0.44354360292663636, + "step": 4486, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.44354360292663636, + "step": 4486, + "train/total_loss": 0.10485802590847015 + }, + { + "entropy": 8.65289306640625, + "epoch": 0.44364247577615185, + "mean_token_accuracy": 0.7263875603675842, + "num_tokens": 2539488.0, + "step": 4487, + "train/ce_loss": 0.7177829146385193 + }, + { + "epoch": 0.44364247577615185, + "step": 4487, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.44364247577615185, + "step": 4487, + "train/total_loss": 0.1733407974243164 + }, + { + "entropy": 8.838767051696777, + "epoch": 0.4437413486256674, + "mean_token_accuracy": 0.7434210777282715, + "num_tokens": 2545037.0, + "step": 4488, + "train/ce_loss": 0.638717770576477 + }, + { + "epoch": 0.4437413486256674, + "step": 4488, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.4437413486256674, + "step": 4488, + "train/total_loss": 0.12246552854776382 + }, + { + "entropy": 9.20651912689209, + "epoch": 0.44384022147518293, + "mean_token_accuracy": 0.7527011036872864, + "num_tokens": 2550320.0, + "step": 4489, + "train/ce_loss": 0.790439248085022 + }, + { + "epoch": 0.44384022147518293, + "step": 4489, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.44384022147518293, + "step": 4489, + "train/total_loss": 0.1649814248085022 + }, + { + "entropy": 9.543380737304688, + "epoch": 0.4439390943246984, + "mean_token_accuracy": 0.699999988079071, + "num_tokens": 2555328.0, + "step": 4490, + "train/ce_loss": 1.331195592880249 + }, + { + "epoch": 0.4439390943246984, + "step": 4490, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.4439390943246984, + "step": 4490, + "train/total_loss": 0.21515081822872162 + }, + { + "entropy": 9.06105899810791, + "epoch": 0.44403796717421395, + "mean_token_accuracy": 0.7285180687904358, + "num_tokens": 2560606.0, + "step": 4491, + "train/ce_loss": 1.2990325689315796 + }, + { + "epoch": 0.44403796717421395, + "step": 4491, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.44403796717421395, + "step": 4491, + "train/total_loss": 0.21974700689315796 + }, + { + "entropy": 9.204246520996094, + "epoch": 0.4441368400237295, + "mean_token_accuracy": 0.7277628183364868, + "num_tokens": 2565842.0, + "step": 4492, + "train/ce_loss": 0.6739773154258728 + }, + { + "epoch": 0.4441368400237295, + "step": 4492, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.4441368400237295, + "step": 4492, + "train/total_loss": 0.11427273601293564 + }, + { + "entropy": 9.881002426147461, + "epoch": 0.444235712873245, + "mean_token_accuracy": 0.8098039031028748, + "num_tokens": 2570756.0, + "step": 4493, + "train/ce_loss": 1.4940541177566047e-06 + }, + { + "epoch": 0.444235712873245, + "step": 4493, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.444235712873245, + "step": 4493, + "train/total_loss": 0.02343764901161194 + }, + { + "entropy": 9.324443817138672, + "epoch": 0.4443345857227605, + "mean_token_accuracy": 0.7636363506317139, + "num_tokens": 2575777.0, + "step": 4494, + "train/ce_loss": 1.4793231457588263e-06 + }, + { + "epoch": 0.4443345857227605, + "step": 4494, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.4443345857227605, + "step": 4494, + "train/total_loss": 0.07421889901161194 + }, + { + "entropy": 9.813060760498047, + "epoch": 0.44443345857227606, + "mean_token_accuracy": 0.7376344203948975, + "num_tokens": 2580655.0, + "step": 4495, + "train/ce_loss": 1.8447369711793726e-06 + }, + { + "epoch": 0.44443345857227606, + "step": 4495, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.44443345857227606, + "step": 4495, + "train/total_loss": 0.03125018626451492 + }, + { + "entropy": 9.63151741027832, + "epoch": 0.44453233142179155, + "mean_token_accuracy": 0.7180384993553162, + "num_tokens": 2585676.0, + "step": 4496, + "train/ce_loss": 1.6704481840133667 + }, + { + "epoch": 0.44453233142179155, + "step": 4496, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.44453233142179155, + "step": 4496, + "train/total_loss": 0.31548231840133667 + }, + { + "entropy": 9.124244689941406, + "epoch": 0.4446312042713071, + "mean_token_accuracy": 0.7172236442565918, + "num_tokens": 2590930.0, + "step": 4497, + "train/ce_loss": 1.4694288969039917 + }, + { + "epoch": 0.4446312042713071, + "step": 4497, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.4446312042713071, + "step": 4497, + "train/total_loss": 0.2094428986310959 + }, + { + "entropy": 9.301881790161133, + "epoch": 0.44473007712082263, + "mean_token_accuracy": 0.7427386045455933, + "num_tokens": 2596133.0, + "step": 4498, + "train/ce_loss": 0.8499370813369751 + }, + { + "epoch": 0.44473007712082263, + "step": 4498, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.44473007712082263, + "step": 4498, + "train/total_loss": 0.15921247005462646 + }, + { + "entropy": 9.577167510986328, + "epoch": 0.44482894997033817, + "mean_token_accuracy": 0.7610294222831726, + "num_tokens": 2601100.0, + "step": 4499, + "train/ce_loss": 1.0237698554992676 + }, + { + "epoch": 0.44482894997033817, + "step": 4499, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.44482894997033817, + "step": 4499, + "train/total_loss": 0.1961269974708557 + }, + { + "epoch": 0.44492782281985366, + "grad_norm": 0.7579295039176941, + "learning_rate": 8.890125105078377e-06, + "loss": 0.1471, + "step": 4500 + }, + { + "entropy": 8.780494689941406, + "epoch": 0.44492782281985366, + "mean_token_accuracy": 0.7605911493301392, + "num_tokens": 2606616.0, + "step": 4500, + "train/ce_loss": 0.8086697459220886 + }, + { + "epoch": 0.44492782281985366, + "step": 4500, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.44492782281985366, + "step": 4500, + "train/total_loss": 0.1746169775724411 + }, + { + "entropy": 9.408340454101562, + "epoch": 0.4450266956693692, + "mean_token_accuracy": 0.7112010717391968, + "num_tokens": 2611793.0, + "step": 4501, + "train/ce_loss": 1.1653521060943604 + }, + { + "epoch": 0.4450266956693692, + "step": 4501, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.4450266956693692, + "step": 4501, + "train/total_loss": 0.2180977165699005 + }, + { + "entropy": 8.905416488647461, + "epoch": 0.44512556851888474, + "mean_token_accuracy": 0.692307710647583, + "num_tokens": 2616968.0, + "step": 4502, + "train/ce_loss": 0.7832385897636414 + }, + { + "epoch": 0.44512556851888474, + "step": 4502, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.44512556851888474, + "step": 4502, + "train/total_loss": 0.1330113708972931 + }, + { + "entropy": 8.959816932678223, + "epoch": 0.4452244413684002, + "mean_token_accuracy": 0.720200777053833, + "num_tokens": 2622259.0, + "step": 4503, + "train/ce_loss": 0.5568627715110779 + }, + { + "epoch": 0.4452244413684002, + "step": 4503, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.4452244413684002, + "step": 4503, + "train/total_loss": 0.12209253013134003 + }, + { + "entropy": 9.612812042236328, + "epoch": 0.44532331421791577, + "mean_token_accuracy": 0.7937062978744507, + "num_tokens": 2627241.0, + "step": 4504, + "train/ce_loss": 0.8046440482139587 + }, + { + "epoch": 0.44532331421791577, + "step": 4504, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.44532331421791577, + "step": 4504, + "train/total_loss": 0.11562065780162811 + }, + { + "entropy": 9.064831733703613, + "epoch": 0.4454221870674313, + "mean_token_accuracy": 0.7515006065368652, + "num_tokens": 2632580.0, + "step": 4505, + "train/ce_loss": 1.301162600517273 + }, + { + "epoch": 0.4454221870674313, + "step": 4505, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.4454221870674313, + "step": 4505, + "train/total_loss": 0.196522518992424 + }, + { + "entropy": 9.75990104675293, + "epoch": 0.4455210599169468, + "mean_token_accuracy": 0.7781690359115601, + "num_tokens": 2637573.0, + "step": 4506, + "train/ce_loss": 1.1523325443267822 + }, + { + "epoch": 0.4455210599169468, + "step": 4506, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.4455210599169468, + "step": 4506, + "train/total_loss": 0.18554575741291046 + }, + { + "entropy": 9.029922485351562, + "epoch": 0.44561993276646233, + "mean_token_accuracy": 0.7578125, + "num_tokens": 2642930.0, + "step": 4507, + "train/ce_loss": 0.8819659352302551 + }, + { + "epoch": 0.44561993276646233, + "step": 4507, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.44561993276646233, + "step": 4507, + "train/total_loss": 0.15460285544395447 + }, + { + "entropy": 8.70202922821045, + "epoch": 0.4457188056159779, + "mean_token_accuracy": 0.7034883499145508, + "num_tokens": 2648300.0, + "step": 4508, + "train/ce_loss": 0.7986609935760498 + }, + { + "epoch": 0.4457188056159779, + "step": 4508, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.4457188056159779, + "step": 4508, + "train/total_loss": 0.22830361127853394 + }, + { + "entropy": 9.651217460632324, + "epoch": 0.44581767846549336, + "mean_token_accuracy": 0.7282230257987976, + "num_tokens": 2653330.0, + "step": 4509, + "train/ce_loss": 1.7078146934509277 + }, + { + "epoch": 0.44581767846549336, + "step": 4509, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.44581767846549336, + "step": 4509, + "train/total_loss": 0.2918752431869507 + }, + { + "entropy": 9.287551879882812, + "epoch": 0.4459165513150089, + "mean_token_accuracy": 0.7611026167869568, + "num_tokens": 2658628.0, + "step": 4510, + "train/ce_loss": 1.1713571548461914 + }, + { + "epoch": 0.4459165513150089, + "step": 4510, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.4459165513150089, + "step": 4510, + "train/total_loss": 0.23041696846485138 + }, + { + "entropy": 8.904544830322266, + "epoch": 0.44601542416452444, + "mean_token_accuracy": 0.8007850646972656, + "num_tokens": 2664116.0, + "step": 4511, + "train/ce_loss": 0.8100970387458801 + }, + { + "epoch": 0.44601542416452444, + "step": 4511, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.44601542416452444, + "step": 4511, + "train/total_loss": 0.15132221579551697 + }, + { + "entropy": 8.725772857666016, + "epoch": 0.44611429701403993, + "mean_token_accuracy": 0.7802197933197021, + "num_tokens": 2669594.0, + "step": 4512, + "train/ce_loss": 0.774977445602417 + }, + { + "epoch": 0.44611429701403993, + "step": 4512, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.44611429701403993, + "step": 4512, + "train/total_loss": 0.19468525052070618 + }, + { + "entropy": 9.246919631958008, + "epoch": 0.44621316986355547, + "mean_token_accuracy": 0.7734877467155457, + "num_tokens": 2674776.0, + "step": 4513, + "train/ce_loss": 0.3518202304840088 + }, + { + "epoch": 0.44621316986355547, + "step": 4513, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.44621316986355547, + "step": 4513, + "train/total_loss": 0.12502577900886536 + }, + { + "entropy": 9.12303352355957, + "epoch": 0.446312042713071, + "mean_token_accuracy": 0.6736842393875122, + "num_tokens": 2679914.0, + "step": 4514, + "train/ce_loss": 1.3278347253799438 + }, + { + "epoch": 0.446312042713071, + "step": 4514, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.446312042713071, + "step": 4514, + "train/total_loss": 0.18356472253799438 + }, + { + "entropy": 9.493539810180664, + "epoch": 0.4464109155625865, + "mean_token_accuracy": 0.7622504830360413, + "num_tokens": 2684898.0, + "step": 4515, + "train/ce_loss": 0.47448375821113586 + }, + { + "epoch": 0.4464109155625865, + "step": 4515, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.4464109155625865, + "step": 4515, + "train/total_loss": 0.07479213178157806 + }, + { + "entropy": 8.801179885864258, + "epoch": 0.44650978841210204, + "mean_token_accuracy": 0.7257732152938843, + "num_tokens": 2690354.0, + "step": 4516, + "train/ce_loss": 0.6246917843818665 + }, + { + "epoch": 0.44650978841210204, + "step": 4516, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.44650978841210204, + "step": 4516, + "train/total_loss": 0.12106293439865112 + }, + { + "entropy": 9.418305397033691, + "epoch": 0.4466086612616176, + "mean_token_accuracy": 0.7054010033607483, + "num_tokens": 2695422.0, + "step": 4517, + "train/ce_loss": 1.2904903888702393 + }, + { + "epoch": 0.4466086612616176, + "step": 4517, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.4466086612616176, + "step": 4517, + "train/total_loss": 0.23842404782772064 + }, + { + "entropy": 9.153996467590332, + "epoch": 0.44670753411113306, + "mean_token_accuracy": 0.6802973747253418, + "num_tokens": 2700763.0, + "step": 4518, + "train/ce_loss": 1.680827021598816 + }, + { + "epoch": 0.44670753411113306, + "step": 4518, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.44670753411113306, + "step": 4518, + "train/total_loss": 0.26183271408081055 + }, + { + "entropy": 8.895081520080566, + "epoch": 0.4468064069606486, + "mean_token_accuracy": 0.7473903894424438, + "num_tokens": 2706165.0, + "step": 4519, + "train/ce_loss": 1.1039037704467773 + }, + { + "epoch": 0.4468064069606486, + "step": 4519, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.4468064069606486, + "step": 4519, + "train/total_loss": 0.18460913002490997 + }, + { + "epoch": 0.44690527981016415, + "grad_norm": 0.7490431070327759, + "learning_rate": 8.885180240320428e-06, + "loss": 0.1494, + "step": 4520 + }, + { + "entropy": 9.420427322387695, + "epoch": 0.44690527981016415, + "mean_token_accuracy": 0.7320675253868103, + "num_tokens": 2711037.0, + "step": 4520, + "train/ce_loss": 0.6447953581809998 + }, + { + "epoch": 0.44690527981016415, + "step": 4520, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.44690527981016415, + "step": 4520, + "train/total_loss": 0.1074482873082161 + }, + { + "entropy": 9.204364776611328, + "epoch": 0.44700415265967963, + "mean_token_accuracy": 0.734455943107605, + "num_tokens": 2716309.0, + "step": 4521, + "train/ce_loss": 1.1876529455184937 + }, + { + "epoch": 0.44700415265967963, + "step": 4521, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.44700415265967963, + "step": 4521, + "train/total_loss": 0.20470279455184937 + }, + { + "entropy": 9.333433151245117, + "epoch": 0.4471030255091952, + "mean_token_accuracy": 0.7256515622138977, + "num_tokens": 2721504.0, + "step": 4522, + "train/ce_loss": 1.3652865886688232 + }, + { + "epoch": 0.4471030255091952, + "step": 4522, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.4471030255091952, + "step": 4522, + "train/total_loss": 0.21855990588665009 + }, + { + "entropy": 8.984024047851562, + "epoch": 0.4472018983587107, + "mean_token_accuracy": 0.7811484336853027, + "num_tokens": 2726904.0, + "step": 4523, + "train/ce_loss": 0.9568396210670471 + }, + { + "epoch": 0.4472018983587107, + "step": 4523, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.4472018983587107, + "step": 4523, + "train/total_loss": 0.11912146210670471 + }, + { + "entropy": 9.023128509521484, + "epoch": 0.4473007712082262, + "mean_token_accuracy": 0.7204030156135559, + "num_tokens": 2732127.0, + "step": 4524, + "train/ce_loss": 0.5748513340950012 + }, + { + "epoch": 0.4473007712082262, + "step": 4524, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.4473007712082262, + "step": 4524, + "train/total_loss": 0.08092263340950012 + }, + { + "entropy": 9.544071197509766, + "epoch": 0.44739964405774174, + "mean_token_accuracy": 0.7534013390541077, + "num_tokens": 2737155.0, + "step": 4525, + "train/ce_loss": 0.6580440998077393 + }, + { + "epoch": 0.44739964405774174, + "step": 4525, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.44739964405774174, + "step": 4525, + "train/total_loss": 0.10877316445112228 + }, + { + "entropy": 9.339006423950195, + "epoch": 0.4474985169072573, + "mean_token_accuracy": 0.7637906670570374, + "num_tokens": 2742304.0, + "step": 4526, + "train/ce_loss": 0.7049911618232727 + }, + { + "epoch": 0.4474985169072573, + "step": 4526, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.4474985169072573, + "step": 4526, + "train/total_loss": 0.09003036469221115 + }, + { + "entropy": 9.086111068725586, + "epoch": 0.44759738975677277, + "mean_token_accuracy": 0.7605294585227966, + "num_tokens": 2747559.0, + "step": 4527, + "train/ce_loss": 0.6927676796913147 + }, + { + "epoch": 0.44759738975677277, + "step": 4527, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.44759738975677277, + "step": 4527, + "train/total_loss": 0.13958927989006042 + }, + { + "entropy": 9.318374633789062, + "epoch": 0.4476962626062883, + "mean_token_accuracy": 0.7939777970314026, + "num_tokens": 2752659.0, + "step": 4528, + "train/ce_loss": 1.3353852033615112 + }, + { + "epoch": 0.4476962626062883, + "step": 4528, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.4476962626062883, + "step": 4528, + "train/total_loss": 0.24291352927684784 + }, + { + "entropy": 9.387085914611816, + "epoch": 0.44779513545580385, + "mean_token_accuracy": 0.7215999960899353, + "num_tokens": 2757745.0, + "step": 4529, + "train/ce_loss": 1.371522307395935 + }, + { + "epoch": 0.44779513545580385, + "step": 4529, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.44779513545580385, + "step": 4529, + "train/total_loss": 0.19574598968029022 + }, + { + "entropy": 8.896390914916992, + "epoch": 0.44789400830531934, + "mean_token_accuracy": 0.764374315738678, + "num_tokens": 2763071.0, + "step": 4530, + "train/ce_loss": 0.7218606472015381 + }, + { + "epoch": 0.44789400830531934, + "step": 4530, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.44789400830531934, + "step": 4530, + "train/total_loss": 0.10734231770038605 + }, + { + "entropy": 9.415645599365234, + "epoch": 0.4479928811548349, + "mean_token_accuracy": 0.759530782699585, + "num_tokens": 2768197.0, + "step": 4531, + "train/ce_loss": 1.0284645668434678e-06 + }, + { + "epoch": 0.4479928811548349, + "step": 4531, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.4479928811548349, + "step": 4531, + "train/total_loss": 0.023437602445483208 + }, + { + "entropy": 9.233901023864746, + "epoch": 0.4480917540043504, + "mean_token_accuracy": 0.6820428371429443, + "num_tokens": 2773273.0, + "step": 4532, + "train/ce_loss": 1.1894171237945557 + }, + { + "epoch": 0.4480917540043504, + "step": 4532, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.4480917540043504, + "step": 4532, + "train/total_loss": 0.16191047430038452 + }, + { + "entropy": 8.682958602905273, + "epoch": 0.4481906268538659, + "mean_token_accuracy": 0.7135416865348816, + "num_tokens": 2778736.0, + "step": 4533, + "train/ce_loss": 1.315225601196289 + }, + { + "epoch": 0.4481906268538659, + "step": 4533, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.4481906268538659, + "step": 4533, + "train/total_loss": 0.18230381608009338 + }, + { + "entropy": 9.534332275390625, + "epoch": 0.44828949970338144, + "mean_token_accuracy": 0.7224409580230713, + "num_tokens": 2783689.0, + "step": 4534, + "train/ce_loss": 1.1777727603912354 + }, + { + "epoch": 0.44828949970338144, + "step": 4534, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.44828949970338144, + "step": 4534, + "train/total_loss": 0.1568397879600525 + }, + { + "entropy": 8.914642333984375, + "epoch": 0.448388372552897, + "mean_token_accuracy": 0.7865030765533447, + "num_tokens": 2788972.0, + "step": 4535, + "train/ce_loss": 0.7574781775474548 + }, + { + "epoch": 0.448388372552897, + "step": 4535, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.448388372552897, + "step": 4535, + "train/total_loss": 0.13434156775474548 + }, + { + "entropy": 9.556293487548828, + "epoch": 0.44848724540241247, + "mean_token_accuracy": 0.7428571581840515, + "num_tokens": 2793841.0, + "step": 4536, + "train/ce_loss": 2.064066171646118 + }, + { + "epoch": 0.44848724540241247, + "step": 4536, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.44848724540241247, + "step": 4536, + "train/total_loss": 0.2884378731250763 + }, + { + "entropy": 9.276677131652832, + "epoch": 0.448586118251928, + "mean_token_accuracy": 0.7107309699058533, + "num_tokens": 2798925.0, + "step": 4537, + "train/ce_loss": 0.9554111361503601 + }, + { + "epoch": 0.448586118251928, + "step": 4537, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.448586118251928, + "step": 4537, + "train/total_loss": 0.1541348695755005 + }, + { + "entropy": 8.949641227722168, + "epoch": 0.44868499110144355, + "mean_token_accuracy": 0.752136766910553, + "num_tokens": 2804230.0, + "step": 4538, + "train/ce_loss": 1.0398943424224854 + }, + { + "epoch": 0.44868499110144355, + "step": 4538, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.44868499110144355, + "step": 4538, + "train/total_loss": 0.17039568722248077 + }, + { + "entropy": 9.061538696289062, + "epoch": 0.4487838639509591, + "mean_token_accuracy": 0.7519999742507935, + "num_tokens": 2809537.0, + "step": 4539, + "train/ce_loss": 0.9472593069076538 + }, + { + "epoch": 0.4487838639509591, + "step": 4539, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4487838639509591, + "step": 4539, + "train/total_loss": 0.14941343665122986 + }, + { + "epoch": 0.4488827368004746, + "grad_norm": 0.688106894493103, + "learning_rate": 8.88023537556248e-06, + "loss": 0.1371, + "step": 4540 + }, + { + "entropy": 8.731950759887695, + "epoch": 0.4488827368004746, + "mean_token_accuracy": 0.7028824687004089, + "num_tokens": 2814906.0, + "step": 4540, + "train/ce_loss": 0.5285282135009766 + }, + { + "epoch": 0.4488827368004746, + "step": 4540, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.4488827368004746, + "step": 4540, + "train/total_loss": 0.0762903243303299 + }, + { + "entropy": 9.08930492401123, + "epoch": 0.4489816096499901, + "mean_token_accuracy": 0.789002537727356, + "num_tokens": 2820117.0, + "step": 4541, + "train/ce_loss": 1.020498275756836 + }, + { + "epoch": 0.4489816096499901, + "step": 4541, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.4489816096499901, + "step": 4541, + "train/total_loss": 0.1606435775756836 + }, + { + "entropy": 9.241561889648438, + "epoch": 0.44908048249950566, + "mean_token_accuracy": 0.7077562212944031, + "num_tokens": 2825421.0, + "step": 4542, + "train/ce_loss": 0.7609159350395203 + }, + { + "epoch": 0.44908048249950566, + "step": 4542, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.44908048249950566, + "step": 4542, + "train/total_loss": 0.14249783754348755 + }, + { + "entropy": 9.402660369873047, + "epoch": 0.44917935534902115, + "mean_token_accuracy": 0.7670068144798279, + "num_tokens": 2830467.0, + "step": 4543, + "train/ce_loss": 0.6022214889526367 + }, + { + "epoch": 0.44917935534902115, + "step": 4543, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.44917935534902115, + "step": 4543, + "train/total_loss": 0.11100339889526367 + }, + { + "entropy": 9.114690780639648, + "epoch": 0.4492782281985367, + "mean_token_accuracy": 0.7185929417610168, + "num_tokens": 2835649.0, + "step": 4544, + "train/ce_loss": 0.9506949186325073 + }, + { + "epoch": 0.4492782281985367, + "step": 4544, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.4492782281985367, + "step": 4544, + "train/total_loss": 0.1771007478237152 + }, + { + "entropy": 9.531662940979004, + "epoch": 0.44937710104805223, + "mean_token_accuracy": 0.7565084099769592, + "num_tokens": 2841024.0, + "step": 4545, + "train/ce_loss": 1.1496831178665161 + }, + { + "epoch": 0.44937710104805223, + "step": 4545, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.44937710104805223, + "step": 4545, + "train/total_loss": 0.19309331476688385 + }, + { + "entropy": 8.738716125488281, + "epoch": 0.4494759738975677, + "mean_token_accuracy": 0.7577497363090515, + "num_tokens": 2846354.0, + "step": 4546, + "train/ce_loss": 0.7805805802345276 + }, + { + "epoch": 0.4494759738975677, + "step": 4546, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.4494759738975677, + "step": 4546, + "train/total_loss": 0.15618306398391724 + }, + { + "entropy": 8.934268951416016, + "epoch": 0.44957484674708326, + "mean_token_accuracy": 0.7142857313156128, + "num_tokens": 2851684.0, + "step": 4547, + "train/ce_loss": 0.7433626055717468 + }, + { + "epoch": 0.44957484674708326, + "step": 4547, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.44957484674708326, + "step": 4547, + "train/total_loss": 0.12511751055717468 + }, + { + "entropy": 8.42918586730957, + "epoch": 0.4496737195965988, + "mean_token_accuracy": 0.748024582862854, + "num_tokens": 2857267.0, + "step": 4548, + "train/ce_loss": 0.5866866111755371 + }, + { + "epoch": 0.4496737195965988, + "step": 4548, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.4496737195965988, + "step": 4548, + "train/total_loss": 0.07819990813732147 + }, + { + "entropy": 9.045125961303711, + "epoch": 0.4497725924461143, + "mean_token_accuracy": 0.7625298500061035, + "num_tokens": 2862571.0, + "step": 4549, + "train/ce_loss": 0.5434145927429199 + }, + { + "epoch": 0.4497725924461143, + "step": 4549, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.4497725924461143, + "step": 4549, + "train/total_loss": 0.12074771523475647 + }, + { + "entropy": 9.409574508666992, + "epoch": 0.4498714652956298, + "mean_token_accuracy": 0.7969798445701599, + "num_tokens": 2867581.0, + "step": 4550, + "train/ce_loss": 3.5709465464606183e-06 + }, + { + "epoch": 0.4498714652956298, + "step": 4550, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.4498714652956298, + "step": 4550, + "train/total_loss": 0.03906285762786865 + }, + { + "entropy": 9.27652359008789, + "epoch": 0.44997033814514537, + "mean_token_accuracy": 0.6906946301460266, + "num_tokens": 2872818.0, + "step": 4551, + "train/ce_loss": 0.7897971868515015 + }, + { + "epoch": 0.44997033814514537, + "step": 4551, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.44997033814514537, + "step": 4551, + "train/total_loss": 0.1571047306060791 + }, + { + "entropy": 8.805782318115234, + "epoch": 0.45006921099466085, + "mean_token_accuracy": 0.795258641242981, + "num_tokens": 2878284.0, + "step": 4552, + "train/ce_loss": 0.5840756893157959 + }, + { + "epoch": 0.45006921099466085, + "step": 4552, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.45006921099466085, + "step": 4552, + "train/total_loss": 0.09747007489204407 + }, + { + "entropy": 8.947723388671875, + "epoch": 0.4501680838441764, + "mean_token_accuracy": 0.7066817879676819, + "num_tokens": 2883651.0, + "step": 4553, + "train/ce_loss": 0.7233148217201233 + }, + { + "epoch": 0.4501680838441764, + "step": 4553, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.4501680838441764, + "step": 4553, + "train/total_loss": 0.1465502381324768 + }, + { + "entropy": 8.907604217529297, + "epoch": 0.45026695669369193, + "mean_token_accuracy": 0.7900485396385193, + "num_tokens": 2888999.0, + "step": 4554, + "train/ce_loss": 0.7272635698318481 + }, + { + "epoch": 0.45026695669369193, + "step": 4554, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.45026695669369193, + "step": 4554, + "train/total_loss": 0.11178886145353317 + }, + { + "entropy": 9.367353439331055, + "epoch": 0.4503658295432074, + "mean_token_accuracy": 0.7003610134124756, + "num_tokens": 2894023.0, + "step": 4555, + "train/ce_loss": 0.8232207894325256 + }, + { + "epoch": 0.4503658295432074, + "step": 4555, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4503658295432074, + "step": 4555, + "train/total_loss": 0.13700959086418152 + }, + { + "entropy": 9.503122329711914, + "epoch": 0.45046470239272296, + "mean_token_accuracy": 0.7219662070274353, + "num_tokens": 2899121.0, + "step": 4556, + "train/ce_loss": 1.519469141960144 + }, + { + "epoch": 0.45046470239272296, + "step": 4556, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.45046470239272296, + "step": 4556, + "train/total_loss": 0.24960316717624664 + }, + { + "entropy": 8.840147018432617, + "epoch": 0.4505635752422385, + "mean_token_accuracy": 0.7513397932052612, + "num_tokens": 2904536.0, + "step": 4557, + "train/ce_loss": 0.4257037341594696 + }, + { + "epoch": 0.4505635752422385, + "step": 4557, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.4505635752422385, + "step": 4557, + "train/total_loss": 0.06600787490606308 + }, + { + "entropy": 8.88971996307373, + "epoch": 0.450662448091754, + "mean_token_accuracy": 0.7338618636131287, + "num_tokens": 2909872.0, + "step": 4558, + "train/ce_loss": 0.4039987623691559 + }, + { + "epoch": 0.450662448091754, + "step": 4558, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.450662448091754, + "step": 4558, + "train/total_loss": 0.05602487549185753 + }, + { + "entropy": 8.467979431152344, + "epoch": 0.4507613209412695, + "mean_token_accuracy": 0.7732341885566711, + "num_tokens": 2915440.0, + "step": 4559, + "train/ce_loss": 0.8622865080833435 + }, + { + "epoch": 0.4507613209412695, + "step": 4559, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.4507613209412695, + "step": 4559, + "train/total_loss": 0.11357240378856659 + }, + { + "epoch": 0.45086019379078507, + "grad_norm": 0.6074954867362976, + "learning_rate": 8.87529051080453e-06, + "loss": 0.1409, + "step": 4560 + }, + { + "entropy": 8.623495101928711, + "epoch": 0.45086019379078507, + "mean_token_accuracy": 0.7909091114997864, + "num_tokens": 2921052.0, + "step": 4560, + "train/ce_loss": 0.5332843661308289 + }, + { + "epoch": 0.45086019379078507, + "step": 4560, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.45086019379078507, + "step": 4560, + "train/total_loss": 0.07285968959331512 + }, + { + "entropy": 9.29909896850586, + "epoch": 0.45095906664030055, + "mean_token_accuracy": 0.7981510162353516, + "num_tokens": 2926179.0, + "step": 4561, + "train/ce_loss": 2.328898744963226e-06 + }, + { + "epoch": 0.45095906664030055, + "step": 4561, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.45095906664030055, + "step": 4561, + "train/total_loss": 0.027343982830643654 + }, + { + "entropy": 9.259490013122559, + "epoch": 0.4510579394898161, + "mean_token_accuracy": 0.6875, + "num_tokens": 2931325.0, + "step": 4562, + "train/ce_loss": 0.7132740616798401 + }, + { + "epoch": 0.4510579394898161, + "step": 4562, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.4510579394898161, + "step": 4562, + "train/total_loss": 0.13773366808891296 + }, + { + "entropy": 9.044837951660156, + "epoch": 0.45115681233933164, + "mean_token_accuracy": 0.7777777910232544, + "num_tokens": 2936648.0, + "step": 4563, + "train/ce_loss": 0.6716727018356323 + }, + { + "epoch": 0.45115681233933164, + "step": 4563, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.45115681233933164, + "step": 4563, + "train/total_loss": 0.1452922821044922 + }, + { + "entropy": 9.083380699157715, + "epoch": 0.4512556851888471, + "mean_token_accuracy": 0.6751207709312439, + "num_tokens": 2941976.0, + "step": 4564, + "train/ce_loss": 1.7078438997268677 + }, + { + "epoch": 0.4512556851888471, + "step": 4564, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.4512556851888471, + "step": 4564, + "train/total_loss": 0.20203439891338348 + }, + { + "entropy": 9.328924179077148, + "epoch": 0.45135455803836266, + "mean_token_accuracy": 0.675302267074585, + "num_tokens": 2946985.0, + "step": 4565, + "train/ce_loss": 2.5967685360228643e-05 + }, + { + "epoch": 0.45135455803836266, + "step": 4565, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.45135455803836266, + "step": 4565, + "train/total_loss": 0.027346346527338028 + }, + { + "entropy": 8.78829574584961, + "epoch": 0.4514534308878782, + "mean_token_accuracy": 0.7212249040603638, + "num_tokens": 2952418.0, + "step": 4566, + "train/ce_loss": 0.8941664099693298 + }, + { + "epoch": 0.4514534308878782, + "step": 4566, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.4514534308878782, + "step": 4566, + "train/total_loss": 0.15191665291786194 + }, + { + "entropy": 9.640069961547852, + "epoch": 0.4515523037373937, + "mean_token_accuracy": 0.7910714149475098, + "num_tokens": 2957380.0, + "step": 4567, + "train/ce_loss": 6.438088348659221e-06 + }, + { + "epoch": 0.4515523037373937, + "step": 4567, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.4515523037373937, + "step": 4567, + "train/total_loss": 0.046875644475221634 + }, + { + "entropy": 8.637200355529785, + "epoch": 0.45165117658690923, + "mean_token_accuracy": 0.7096296548843384, + "num_tokens": 2962547.0, + "step": 4568, + "train/ce_loss": 1.6558645963668823 + }, + { + "epoch": 0.45165117658690923, + "step": 4568, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.45165117658690923, + "step": 4568, + "train/total_loss": 0.2515239715576172 + }, + { + "entropy": 9.362472534179688, + "epoch": 0.45175004943642477, + "mean_token_accuracy": 0.7953125238418579, + "num_tokens": 2967679.0, + "step": 4569, + "train/ce_loss": 1.5980865555320634e-06 + }, + { + "epoch": 0.45175004943642477, + "step": 4569, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.45175004943642477, + "step": 4569, + "train/total_loss": 0.06250015646219254 + }, + { + "entropy": 9.715399742126465, + "epoch": 0.45184892228594026, + "mean_token_accuracy": 0.7405303120613098, + "num_tokens": 2972687.0, + "step": 4570, + "train/ce_loss": 1.0586374998092651 + }, + { + "epoch": 0.45184892228594026, + "step": 4570, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.45184892228594026, + "step": 4570, + "train/total_loss": 0.1996137499809265 + }, + { + "entropy": 8.944488525390625, + "epoch": 0.4519477951354558, + "mean_token_accuracy": 0.7268722653388977, + "num_tokens": 2978053.0, + "step": 4571, + "train/ce_loss": 0.7541816234588623 + }, + { + "epoch": 0.4519477951354558, + "step": 4571, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.4519477951354558, + "step": 4571, + "train/total_loss": 0.12229316681623459 + }, + { + "entropy": 8.754767417907715, + "epoch": 0.45204666798497134, + "mean_token_accuracy": 0.6900212168693542, + "num_tokens": 2983438.0, + "step": 4572, + "train/ce_loss": 1.0171220302581787 + }, + { + "epoch": 0.45204666798497134, + "step": 4572, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.45204666798497134, + "step": 4572, + "train/total_loss": 0.1837434470653534 + }, + { + "entropy": 9.43647575378418, + "epoch": 0.4521455408344868, + "mean_token_accuracy": 0.7944079041481018, + "num_tokens": 2988504.0, + "step": 4573, + "train/ce_loss": 2.7032151592720766e-06 + }, + { + "epoch": 0.4521455408344868, + "step": 4573, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.4521455408344868, + "step": 4573, + "train/total_loss": 0.05078152194619179 + }, + { + "entropy": 8.957502365112305, + "epoch": 0.45224441368400237, + "mean_token_accuracy": 0.7759815454483032, + "num_tokens": 2993820.0, + "step": 4574, + "train/ce_loss": 0.5547528862953186 + }, + { + "epoch": 0.45224441368400237, + "step": 4574, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.45224441368400237, + "step": 4574, + "train/total_loss": 0.08672529458999634 + }, + { + "entropy": 9.467229843139648, + "epoch": 0.4523432865335179, + "mean_token_accuracy": 0.7510373592376709, + "num_tokens": 2998736.0, + "step": 4575, + "train/ce_loss": 1.649519443511963 + }, + { + "epoch": 0.4523432865335179, + "step": 4575, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.4523432865335179, + "step": 4575, + "train/total_loss": 0.30557695031166077 + }, + { + "entropy": 8.758031845092773, + "epoch": 0.4524421593830334, + "mean_token_accuracy": 0.7468827962875366, + "num_tokens": 3004071.0, + "step": 4576, + "train/ce_loss": 0.7991833686828613 + }, + { + "epoch": 0.4524421593830334, + "step": 4576, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.4524421593830334, + "step": 4576, + "train/total_loss": 0.12679333984851837 + }, + { + "entropy": 9.06222915649414, + "epoch": 0.45254103223254893, + "mean_token_accuracy": 0.7110438942909241, + "num_tokens": 3009153.0, + "step": 4577, + "train/ce_loss": 0.9158294796943665 + }, + { + "epoch": 0.45254103223254893, + "step": 4577, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.45254103223254893, + "step": 4577, + "train/total_loss": 0.11892669647932053 + }, + { + "entropy": 8.862366676330566, + "epoch": 0.4526399050820645, + "mean_token_accuracy": 0.7243852615356445, + "num_tokens": 3014601.0, + "step": 4578, + "train/ce_loss": 1.1242470741271973 + }, + { + "epoch": 0.4526399050820645, + "step": 4578, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.4526399050820645, + "step": 4578, + "train/total_loss": 0.19836220145225525 + }, + { + "entropy": 9.116423606872559, + "epoch": 0.45273877793157996, + "mean_token_accuracy": 0.7236255407333374, + "num_tokens": 3019671.0, + "step": 4579, + "train/ce_loss": 0.6466188430786133 + }, + { + "epoch": 0.45273877793157996, + "step": 4579, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.45273877793157996, + "step": 4579, + "train/total_loss": 0.10372438281774521 + }, + { + "epoch": 0.4528376507810955, + "grad_norm": 0.7198318839073181, + "learning_rate": 8.870345646046581e-06, + "loss": 0.1398, + "step": 4580 + }, + { + "entropy": 8.525514602661133, + "epoch": 0.4528376507810955, + "mean_token_accuracy": 0.7980072498321533, + "num_tokens": 3025328.0, + "step": 4580, + "train/ce_loss": 0.5483117699623108 + }, + { + "epoch": 0.4528376507810955, + "step": 4580, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.4528376507810955, + "step": 4580, + "train/total_loss": 0.14858117699623108 + }, + { + "entropy": 9.093343734741211, + "epoch": 0.45293652363061104, + "mean_token_accuracy": 0.7529722452163696, + "num_tokens": 3030532.0, + "step": 4581, + "train/ce_loss": 1.0639290809631348 + }, + { + "epoch": 0.45293652363061104, + "step": 4581, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.45293652363061104, + "step": 4581, + "train/total_loss": 0.14154917001724243 + }, + { + "entropy": 9.027142524719238, + "epoch": 0.4530353964801266, + "mean_token_accuracy": 0.7376294732093811, + "num_tokens": 3035887.0, + "step": 4582, + "train/ce_loss": 0.9235092997550964 + }, + { + "epoch": 0.4530353964801266, + "step": 4582, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4530353964801266, + "step": 4582, + "train/total_loss": 0.14703842997550964 + }, + { + "entropy": 9.52957534790039, + "epoch": 0.45313426932964207, + "mean_token_accuracy": 0.7444444298744202, + "num_tokens": 3040888.0, + "step": 4583, + "train/ce_loss": 1.0812699794769287 + }, + { + "epoch": 0.45313426932964207, + "step": 4583, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.45313426932964207, + "step": 4583, + "train/total_loss": 0.15109574794769287 + }, + { + "entropy": 8.962906837463379, + "epoch": 0.4532331421791576, + "mean_token_accuracy": 0.7603796124458313, + "num_tokens": 3046130.0, + "step": 4584, + "train/ce_loss": 0.6397481560707092 + }, + { + "epoch": 0.4532331421791576, + "step": 4584, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.4532331421791576, + "step": 4584, + "train/total_loss": 0.09522482007741928 + }, + { + "entropy": 8.768574714660645, + "epoch": 0.45333201502867315, + "mean_token_accuracy": 0.7129135727882385, + "num_tokens": 3051537.0, + "step": 4585, + "train/ce_loss": 0.5469233989715576 + }, + { + "epoch": 0.45333201502867315, + "step": 4585, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.45333201502867315, + "step": 4585, + "train/total_loss": 0.113286092877388 + }, + { + "entropy": 8.842185974121094, + "epoch": 0.45343088787818864, + "mean_token_accuracy": 0.7220982313156128, + "num_tokens": 3056886.0, + "step": 4586, + "train/ce_loss": 0.767518937587738 + }, + { + "epoch": 0.45343088787818864, + "step": 4586, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.45343088787818864, + "step": 4586, + "train/total_loss": 0.11581439524888992 + }, + { + "entropy": 9.495777130126953, + "epoch": 0.4535297607277042, + "mean_token_accuracy": 0.752525269985199, + "num_tokens": 3061907.0, + "step": 4587, + "train/ce_loss": 1.1620025634765625 + }, + { + "epoch": 0.4535297607277042, + "step": 4587, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.4535297607277042, + "step": 4587, + "train/total_loss": 0.2490127682685852 + }, + { + "entropy": 8.66620922088623, + "epoch": 0.4536286335772197, + "mean_token_accuracy": 0.7509652376174927, + "num_tokens": 3067414.0, + "step": 4588, + "train/ce_loss": 0.8208426237106323 + }, + { + "epoch": 0.4536286335772197, + "step": 4588, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.4536286335772197, + "step": 4588, + "train/total_loss": 0.12114676088094711 + }, + { + "entropy": 9.191309928894043, + "epoch": 0.4537275064267352, + "mean_token_accuracy": 0.7652173638343811, + "num_tokens": 3072543.0, + "step": 4589, + "train/ce_loss": 0.4373510777950287 + }, + { + "epoch": 0.4537275064267352, + "step": 4589, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.4537275064267352, + "step": 4589, + "train/total_loss": 0.06717260926961899 + }, + { + "entropy": 9.141670227050781, + "epoch": 0.45382637927625075, + "mean_token_accuracy": 0.7683615684509277, + "num_tokens": 3077693.0, + "step": 4590, + "train/ce_loss": 0.6883249878883362 + }, + { + "epoch": 0.45382637927625075, + "step": 4590, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.45382637927625075, + "step": 4590, + "train/total_loss": 0.16258250176906586 + }, + { + "entropy": 8.69253158569336, + "epoch": 0.4539252521257663, + "mean_token_accuracy": 0.7345225811004639, + "num_tokens": 3083064.0, + "step": 4591, + "train/ce_loss": 1.4125851392745972 + }, + { + "epoch": 0.4539252521257663, + "step": 4591, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.4539252521257663, + "step": 4591, + "train/total_loss": 0.21157102286815643 + }, + { + "entropy": 8.656013488769531, + "epoch": 0.4540241249752818, + "mean_token_accuracy": 0.7139664888381958, + "num_tokens": 3088403.0, + "step": 4592, + "train/ce_loss": 0.8411061763763428 + }, + { + "epoch": 0.4540241249752818, + "step": 4592, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4540241249752818, + "step": 4592, + "train/total_loss": 0.13879811763763428 + }, + { + "entropy": 9.005350112915039, + "epoch": 0.4541229978247973, + "mean_token_accuracy": 0.7493857741355896, + "num_tokens": 3093682.0, + "step": 4593, + "train/ce_loss": 0.5997017621994019 + }, + { + "epoch": 0.4541229978247973, + "step": 4593, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.4541229978247973, + "step": 4593, + "train/total_loss": 0.0834076777100563 + }, + { + "entropy": 8.391681671142578, + "epoch": 0.45422187067431286, + "mean_token_accuracy": 0.7315497994422913, + "num_tokens": 3099261.0, + "step": 4594, + "train/ce_loss": 1.0334621667861938 + }, + { + "epoch": 0.45422187067431286, + "step": 4594, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.45422187067431286, + "step": 4594, + "train/total_loss": 0.14631497859954834 + }, + { + "entropy": 9.18332290649414, + "epoch": 0.45432074352382834, + "mean_token_accuracy": 0.752043604850769, + "num_tokens": 3104615.0, + "step": 4595, + "train/ce_loss": 0.8786539435386658 + }, + { + "epoch": 0.45432074352382834, + "step": 4595, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.45432074352382834, + "step": 4595, + "train/total_loss": 0.20114664733409882 + }, + { + "entropy": 8.647392272949219, + "epoch": 0.4544196163733439, + "mean_token_accuracy": 0.7243186831474304, + "num_tokens": 3110035.0, + "step": 4596, + "train/ce_loss": 1.0830390453338623 + }, + { + "epoch": 0.4544196163733439, + "step": 4596, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.4544196163733439, + "step": 4596, + "train/total_loss": 0.19814765453338623 + }, + { + "entropy": 9.091204643249512, + "epoch": 0.4545184892228594, + "mean_token_accuracy": 0.7458279728889465, + "num_tokens": 3115283.0, + "step": 4597, + "train/ce_loss": 1.1098209619522095 + }, + { + "epoch": 0.4545184892228594, + "step": 4597, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.4545184892228594, + "step": 4597, + "train/total_loss": 0.20082584023475647 + }, + { + "entropy": 8.805851936340332, + "epoch": 0.4546173620723749, + "mean_token_accuracy": 0.6880615949630737, + "num_tokens": 3120521.0, + "step": 4598, + "train/ce_loss": 1.021937608718872 + }, + { + "epoch": 0.4546173620723749, + "step": 4598, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.4546173620723749, + "step": 4598, + "train/total_loss": 0.19594377279281616 + }, + { + "entropy": 8.611743927001953, + "epoch": 0.45471623492189045, + "mean_token_accuracy": 0.7456575632095337, + "num_tokens": 3125841.0, + "step": 4599, + "train/ce_loss": 0.5259397029876709 + }, + { + "epoch": 0.45471623492189045, + "step": 4599, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.45471623492189045, + "step": 4599, + "train/total_loss": 0.09556272625923157 + }, + { + "epoch": 0.454815107771406, + "grad_norm": 0.6952683925628662, + "learning_rate": 8.865400781288632e-06, + "loss": 0.1411, + "step": 4600 + }, + { + "entropy": 9.587837219238281, + "epoch": 0.454815107771406, + "mean_token_accuracy": 0.8041958212852478, + "num_tokens": 3130691.0, + "step": 4600, + "train/ce_loss": 2.384063691351912e-06 + }, + { + "epoch": 0.454815107771406, + "step": 4600, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.454815107771406, + "step": 4600, + "train/total_loss": 0.0468752384185791 + }, + { + "entropy": 8.851499557495117, + "epoch": 0.4549139806209215, + "mean_token_accuracy": 0.7266355156898499, + "num_tokens": 3136026.0, + "step": 4601, + "train/ce_loss": 1.0402942895889282 + }, + { + "epoch": 0.4549139806209215, + "step": 4601, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.4549139806209215, + "step": 4601, + "train/total_loss": 0.16652943193912506 + }, + { + "entropy": 9.041091918945312, + "epoch": 0.455012853470437, + "mean_token_accuracy": 0.6972602605819702, + "num_tokens": 3141205.0, + "step": 4602, + "train/ce_loss": 1.1488741636276245 + }, + { + "epoch": 0.455012853470437, + "step": 4602, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.455012853470437, + "step": 4602, + "train/total_loss": 0.20473116636276245 + }, + { + "entropy": 9.126296997070312, + "epoch": 0.45511172631995256, + "mean_token_accuracy": 0.7178423404693604, + "num_tokens": 3146394.0, + "step": 4603, + "train/ce_loss": 1.7863327264785767 + }, + { + "epoch": 0.45511172631995256, + "step": 4603, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.45511172631995256, + "step": 4603, + "train/total_loss": 0.26457077264785767 + }, + { + "entropy": 9.33647632598877, + "epoch": 0.45521059916946804, + "mean_token_accuracy": 0.7474302649497986, + "num_tokens": 3151505.0, + "step": 4604, + "train/ce_loss": 1.4108805656433105 + }, + { + "epoch": 0.45521059916946804, + "step": 4604, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.45521059916946804, + "step": 4604, + "train/total_loss": 0.19577555358409882 + }, + { + "entropy": 8.726951599121094, + "epoch": 0.4553094720189836, + "mean_token_accuracy": 0.6741154789924622, + "num_tokens": 3157061.0, + "step": 4605, + "train/ce_loss": 1.2111716270446777 + }, + { + "epoch": 0.4553094720189836, + "step": 4605, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.4553094720189836, + "step": 4605, + "train/total_loss": 0.21877342462539673 + }, + { + "entropy": 8.776931762695312, + "epoch": 0.4554083448684991, + "mean_token_accuracy": 0.782608687877655, + "num_tokens": 3162345.0, + "step": 4606, + "train/ce_loss": 1.0162129402160645 + }, + { + "epoch": 0.4554083448684991, + "step": 4606, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.4554083448684991, + "step": 4606, + "train/total_loss": 0.16412129998207092 + }, + { + "entropy": 9.911093711853027, + "epoch": 0.4555072177180146, + "mean_token_accuracy": 0.6822157502174377, + "num_tokens": 3167103.0, + "step": 4607, + "train/ce_loss": 3.440540075302124 + }, + { + "epoch": 0.4555072177180146, + "step": 4607, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.4555072177180146, + "step": 4607, + "train/total_loss": 0.4026477634906769 + }, + { + "entropy": 9.38325023651123, + "epoch": 0.45560609056753015, + "mean_token_accuracy": 0.77920001745224, + "num_tokens": 3172183.0, + "step": 4608, + "train/ce_loss": 1.2304880619049072 + }, + { + "epoch": 0.45560609056753015, + "step": 4608, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.45560609056753015, + "step": 4608, + "train/total_loss": 0.1816425621509552 + }, + { + "entropy": 8.965509414672852, + "epoch": 0.4557049634170457, + "mean_token_accuracy": 0.7760358452796936, + "num_tokens": 3177528.0, + "step": 4609, + "train/ce_loss": 0.6849386692047119 + }, + { + "epoch": 0.4557049634170457, + "step": 4609, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.4557049634170457, + "step": 4609, + "train/total_loss": 0.17005637288093567 + }, + { + "entropy": 9.186532020568848, + "epoch": 0.4558038362665612, + "mean_token_accuracy": 0.7201017737388611, + "num_tokens": 3182878.0, + "step": 4610, + "train/ce_loss": 0.9957976341247559 + }, + { + "epoch": 0.4558038362665612, + "step": 4610, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.4558038362665612, + "step": 4610, + "train/total_loss": 0.17770476639270782 + }, + { + "entropy": 9.407671928405762, + "epoch": 0.4559027091160767, + "mean_token_accuracy": 0.6867284178733826, + "num_tokens": 3187978.0, + "step": 4611, + "train/ce_loss": 1.1797021627426147 + }, + { + "epoch": 0.4559027091160767, + "step": 4611, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.4559027091160767, + "step": 4611, + "train/total_loss": 0.18047022819519043 + }, + { + "entropy": 9.029430389404297, + "epoch": 0.45600158196559226, + "mean_token_accuracy": 0.7273809313774109, + "num_tokens": 3193290.0, + "step": 4612, + "train/ce_loss": 0.9973271489143372 + }, + { + "epoch": 0.45600158196559226, + "step": 4612, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.45600158196559226, + "step": 4612, + "train/total_loss": 0.14270147681236267 + }, + { + "entropy": 9.043212890625, + "epoch": 0.45610045481510775, + "mean_token_accuracy": 0.762666642665863, + "num_tokens": 3198428.0, + "step": 4613, + "train/ce_loss": 0.5948737263679504 + }, + { + "epoch": 0.45610045481510775, + "step": 4613, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.45610045481510775, + "step": 4613, + "train/total_loss": 0.11026862263679504 + }, + { + "entropy": 8.697711944580078, + "epoch": 0.4561993276646233, + "mean_token_accuracy": 0.7046413421630859, + "num_tokens": 3203786.0, + "step": 4614, + "train/ce_loss": 0.8781384229660034 + }, + { + "epoch": 0.4561993276646233, + "step": 4614, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.4561993276646233, + "step": 4614, + "train/total_loss": 0.1229700967669487 + }, + { + "entropy": 8.974030494689941, + "epoch": 0.45629820051413883, + "mean_token_accuracy": 0.7385542392730713, + "num_tokens": 3209084.0, + "step": 4615, + "train/ce_loss": 0.943402886390686 + }, + { + "epoch": 0.45629820051413883, + "step": 4615, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.45629820051413883, + "step": 4615, + "train/total_loss": 0.14512154459953308 + }, + { + "entropy": 9.349186897277832, + "epoch": 0.4563970733636543, + "mean_token_accuracy": 0.7431610822677612, + "num_tokens": 3214154.0, + "step": 4616, + "train/ce_loss": 2.604249402793357e-06 + }, + { + "epoch": 0.4563970733636543, + "step": 4616, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.4563970733636543, + "step": 4616, + "train/total_loss": 0.06250026077032089 + }, + { + "entropy": 8.613601684570312, + "epoch": 0.45649594621316986, + "mean_token_accuracy": 0.7882797718048096, + "num_tokens": 3219722.0, + "step": 4617, + "train/ce_loss": 1.5095481872558594 + }, + { + "epoch": 0.45649594621316986, + "step": 4617, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.45649594621316986, + "step": 4617, + "train/total_loss": 0.22126732766628265 + }, + { + "entropy": 8.749364852905273, + "epoch": 0.4565948190626854, + "mean_token_accuracy": 0.7271317839622498, + "num_tokens": 3224794.0, + "step": 4618, + "train/ce_loss": 1.9744768451346317e-06 + }, + { + "epoch": 0.4565948190626854, + "step": 4618, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.4565948190626854, + "step": 4618, + "train/total_loss": 0.04687519744038582 + }, + { + "entropy": 9.825479507446289, + "epoch": 0.4566936919122009, + "mean_token_accuracy": 0.7286821603775024, + "num_tokens": 3229581.0, + "step": 4619, + "train/ce_loss": 1.1511327028274536 + }, + { + "epoch": 0.4566936919122009, + "step": 4619, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.4566936919122009, + "step": 4619, + "train/total_loss": 0.1307382732629776 + }, + { + "epoch": 0.4567925647617164, + "grad_norm": 0.8595569729804993, + "learning_rate": 8.860455916530684e-06, + "loss": 0.1447, + "step": 4620 + }, + { + "entropy": 9.360201835632324, + "epoch": 0.4567925647617164, + "mean_token_accuracy": 0.7547826170921326, + "num_tokens": 3234576.0, + "step": 4620, + "train/ce_loss": 1.0287901163101196 + }, + { + "epoch": 0.4567925647617164, + "step": 4620, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4567925647617164, + "step": 4620, + "train/total_loss": 0.15756651759147644 + }, + { + "entropy": 9.104970932006836, + "epoch": 0.45689143761123197, + "mean_token_accuracy": 0.7006993293762207, + "num_tokens": 3239726.0, + "step": 4621, + "train/ce_loss": 0.5973265171051025 + }, + { + "epoch": 0.45689143761123197, + "step": 4621, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.45689143761123197, + "step": 4621, + "train/total_loss": 0.09098265320062637 + }, + { + "entropy": 8.684115409851074, + "epoch": 0.4569903104607475, + "mean_token_accuracy": 0.7793522477149963, + "num_tokens": 3245193.0, + "step": 4622, + "train/ce_loss": 0.4985904097557068 + }, + { + "epoch": 0.4569903104607475, + "step": 4622, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.4569903104607475, + "step": 4622, + "train/total_loss": 0.09673404693603516 + }, + { + "entropy": 9.446758270263672, + "epoch": 0.457089183310263, + "mean_token_accuracy": 0.7712305188179016, + "num_tokens": 3250230.0, + "step": 4623, + "train/ce_loss": 1.3273210525512695 + }, + { + "epoch": 0.457089183310263, + "step": 4623, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.457089183310263, + "step": 4623, + "train/total_loss": 0.1717946082353592 + }, + { + "entropy": 9.79252815246582, + "epoch": 0.45718805615977853, + "mean_token_accuracy": 0.7248157262802124, + "num_tokens": 3255021.0, + "step": 4624, + "train/ce_loss": 5.8113864724873565e-06 + }, + { + "epoch": 0.45718805615977853, + "step": 4624, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.45718805615977853, + "step": 4624, + "train/total_loss": 0.05078183114528656 + }, + { + "entropy": 9.614484786987305, + "epoch": 0.4572869290092941, + "mean_token_accuracy": 0.7348993420600891, + "num_tokens": 3260106.0, + "step": 4625, + "train/ce_loss": 1.4386826753616333 + }, + { + "epoch": 0.4572869290092941, + "step": 4625, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.4572869290092941, + "step": 4625, + "train/total_loss": 0.21808701753616333 + }, + { + "entropy": 9.694557189941406, + "epoch": 0.45738580185880956, + "mean_token_accuracy": 0.7306967973709106, + "num_tokens": 3265061.0, + "step": 4626, + "train/ce_loss": 4.065531811647816e-06 + }, + { + "epoch": 0.45738580185880956, + "step": 4626, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.45738580185880956, + "step": 4626, + "train/total_loss": 0.03906290605664253 + }, + { + "entropy": 8.791769027709961, + "epoch": 0.4574846747083251, + "mean_token_accuracy": 0.7226074934005737, + "num_tokens": 3270298.0, + "step": 4627, + "train/ce_loss": 1.35926353931427 + }, + { + "epoch": 0.4574846747083251, + "step": 4627, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.4574846747083251, + "step": 4627, + "train/total_loss": 0.22186385095119476 + }, + { + "entropy": 9.312559127807617, + "epoch": 0.45758354755784064, + "mean_token_accuracy": 0.7418879270553589, + "num_tokens": 3275443.0, + "step": 4628, + "train/ce_loss": 0.955916702747345 + }, + { + "epoch": 0.45758354755784064, + "step": 4628, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.45758354755784064, + "step": 4628, + "train/total_loss": 0.15027916431427002 + }, + { + "entropy": 8.64016056060791, + "epoch": 0.4576824204073561, + "mean_token_accuracy": 0.7103717923164368, + "num_tokens": 3280924.0, + "step": 4629, + "train/ce_loss": 1.3245102167129517 + }, + { + "epoch": 0.4576824204073561, + "step": 4629, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.4576824204073561, + "step": 4629, + "train/total_loss": 0.22229477763175964 + }, + { + "entropy": 8.87697982788086, + "epoch": 0.45778129325687167, + "mean_token_accuracy": 0.698090672492981, + "num_tokens": 3286223.0, + "step": 4630, + "train/ce_loss": 0.564035177230835 + }, + { + "epoch": 0.45778129325687167, + "step": 4630, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.45778129325687167, + "step": 4630, + "train/total_loss": 0.1228097677230835 + }, + { + "entropy": 9.272031784057617, + "epoch": 0.4578801661063872, + "mean_token_accuracy": 0.647826075553894, + "num_tokens": 3291343.0, + "step": 4631, + "train/ce_loss": 1.9904353618621826 + }, + { + "epoch": 0.4578801661063872, + "step": 4631, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.4578801661063872, + "step": 4631, + "train/total_loss": 0.32404354214668274 + }, + { + "entropy": 8.951705932617188, + "epoch": 0.4579790389559027, + "mean_token_accuracy": 0.7830626368522644, + "num_tokens": 3296658.0, + "step": 4632, + "train/ce_loss": 0.6698645949363708 + }, + { + "epoch": 0.4579790389559027, + "step": 4632, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.4579790389559027, + "step": 4632, + "train/total_loss": 0.09433021396398544 + }, + { + "entropy": 8.547548294067383, + "epoch": 0.45807791180541824, + "mean_token_accuracy": 0.7302573323249817, + "num_tokens": 3302243.0, + "step": 4633, + "train/ce_loss": 1.1055241823196411 + }, + { + "epoch": 0.45807791180541824, + "step": 4633, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.45807791180541824, + "step": 4633, + "train/total_loss": 0.17695868015289307 + }, + { + "entropy": 9.031946182250977, + "epoch": 0.4581767846549338, + "mean_token_accuracy": 0.6810126304626465, + "num_tokens": 3307521.0, + "step": 4634, + "train/ce_loss": 0.7403306365013123 + }, + { + "epoch": 0.4581767846549338, + "step": 4634, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.4581767846549338, + "step": 4634, + "train/total_loss": 0.13653306663036346 + }, + { + "entropy": 9.08590316772461, + "epoch": 0.45827565750444926, + "mean_token_accuracy": 0.7239353656768799, + "num_tokens": 3312651.0, + "step": 4635, + "train/ce_loss": 1.0999616384506226 + }, + { + "epoch": 0.45827565750444926, + "step": 4635, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.45827565750444926, + "step": 4635, + "train/total_loss": 0.13733991980552673 + }, + { + "entropy": 8.861583709716797, + "epoch": 0.4583745303539648, + "mean_token_accuracy": 0.760221004486084, + "num_tokens": 3318036.0, + "step": 4636, + "train/ce_loss": 0.7783376574516296 + }, + { + "epoch": 0.4583745303539648, + "step": 4636, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.4583745303539648, + "step": 4636, + "train/total_loss": 0.12470876425504684 + }, + { + "entropy": 9.740151405334473, + "epoch": 0.45847340320348035, + "mean_token_accuracy": 0.6293245553970337, + "num_tokens": 3323038.0, + "step": 4637, + "train/ce_loss": 1.4067628383636475 + }, + { + "epoch": 0.45847340320348035, + "step": 4637, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.45847340320348035, + "step": 4637, + "train/total_loss": 0.18755128979682922 + }, + { + "entropy": 9.749921798706055, + "epoch": 0.45857227605299583, + "mean_token_accuracy": 0.7439613342285156, + "num_tokens": 3327889.0, + "step": 4638, + "train/ce_loss": 2.1111593468958745e-06 + }, + { + "epoch": 0.45857227605299583, + "step": 4638, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.45857227605299583, + "step": 4638, + "train/total_loss": 0.019531460478901863 + }, + { + "entropy": 9.055946350097656, + "epoch": 0.45867114890251137, + "mean_token_accuracy": 0.678518533706665, + "num_tokens": 3333090.0, + "step": 4639, + "train/ce_loss": 0.7745179533958435 + }, + { + "epoch": 0.45867114890251137, + "step": 4639, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.45867114890251137, + "step": 4639, + "train/total_loss": 0.14385804533958435 + }, + { + "epoch": 0.4587700217520269, + "grad_norm": 0.9158977270126343, + "learning_rate": 8.855511051772734e-06, + "loss": 0.1441, + "step": 4640 + }, + { + "entropy": 9.711057662963867, + "epoch": 0.4587700217520269, + "mean_token_accuracy": 0.6635338068008423, + "num_tokens": 3338025.0, + "step": 4640, + "train/ce_loss": 1.2220512628555298 + }, + { + "epoch": 0.4587700217520269, + "step": 4640, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4587700217520269, + "step": 4640, + "train/total_loss": 0.17689263820648193 + }, + { + "entropy": 9.27509880065918, + "epoch": 0.4588688946015424, + "mean_token_accuracy": 0.7221324443817139, + "num_tokens": 3343035.0, + "step": 4641, + "train/ce_loss": 0.8792186975479126 + }, + { + "epoch": 0.4588688946015424, + "step": 4641, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.4588688946015424, + "step": 4641, + "train/total_loss": 0.1347968727350235 + }, + { + "entropy": 8.594276428222656, + "epoch": 0.45896776745105794, + "mean_token_accuracy": 0.703568160533905, + "num_tokens": 3348620.0, + "step": 4642, + "train/ce_loss": 1.3725320100784302 + }, + { + "epoch": 0.45896776745105794, + "step": 4642, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.45896776745105794, + "step": 4642, + "train/total_loss": 0.2856907248497009 + }, + { + "entropy": 9.057815551757812, + "epoch": 0.4590666403005735, + "mean_token_accuracy": 0.7582547068595886, + "num_tokens": 3353899.0, + "step": 4643, + "train/ce_loss": 0.656229555606842 + }, + { + "epoch": 0.4590666403005735, + "step": 4643, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.4590666403005735, + "step": 4643, + "train/total_loss": 0.1476542055606842 + }, + { + "entropy": 9.26467227935791, + "epoch": 0.45916551315008897, + "mean_token_accuracy": 0.763610303401947, + "num_tokens": 3359044.0, + "step": 4644, + "train/ce_loss": 0.8465744256973267 + }, + { + "epoch": 0.45916551315008897, + "step": 4644, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.45916551315008897, + "step": 4644, + "train/total_loss": 0.1510636955499649 + }, + { + "entropy": 9.152750015258789, + "epoch": 0.4592643859996045, + "mean_token_accuracy": 0.7753530144691467, + "num_tokens": 3364215.0, + "step": 4645, + "train/ce_loss": 0.9924351572990417 + }, + { + "epoch": 0.4592643859996045, + "step": 4645, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.4592643859996045, + "step": 4645, + "train/total_loss": 0.15002477169036865 + }, + { + "entropy": 9.042369842529297, + "epoch": 0.45936325884912005, + "mean_token_accuracy": 0.7220588326454163, + "num_tokens": 3369432.0, + "step": 4646, + "train/ce_loss": 2.187622547149658 + }, + { + "epoch": 0.45936325884912005, + "step": 4646, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.45936325884912005, + "step": 4646, + "train/total_loss": 0.27344977855682373 + }, + { + "entropy": 8.96348762512207, + "epoch": 0.45946213169863553, + "mean_token_accuracy": 0.7243852615356445, + "num_tokens": 3374877.0, + "step": 4647, + "train/ce_loss": 0.5222508311271667 + }, + { + "epoch": 0.45946213169863553, + "step": 4647, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.45946213169863553, + "step": 4647, + "train/total_loss": 0.09128758311271667 + }, + { + "entropy": 8.688159942626953, + "epoch": 0.4595610045481511, + "mean_token_accuracy": 0.8160237669944763, + "num_tokens": 3380348.0, + "step": 4648, + "train/ce_loss": 0.5302615761756897 + }, + { + "epoch": 0.4595610045481511, + "step": 4648, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.4595610045481511, + "step": 4648, + "train/total_loss": 0.09599490463733673 + }, + { + "entropy": 9.019515991210938, + "epoch": 0.4596598773976666, + "mean_token_accuracy": 0.759096622467041, + "num_tokens": 3385587.0, + "step": 4649, + "train/ce_loss": 0.7856865525245667 + }, + { + "epoch": 0.4596598773976666, + "step": 4649, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.4596598773976666, + "step": 4649, + "train/total_loss": 0.10591240972280502 + }, + { + "entropy": 9.325714111328125, + "epoch": 0.4597587502471821, + "mean_token_accuracy": 0.7252747416496277, + "num_tokens": 3390662.0, + "step": 4650, + "train/ce_loss": 1.121762990951538 + }, + { + "epoch": 0.4597587502471821, + "step": 4650, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.4597587502471821, + "step": 4650, + "train/total_loss": 0.1551450490951538 + }, + { + "entropy": 9.195514678955078, + "epoch": 0.45985762309669764, + "mean_token_accuracy": 0.718137264251709, + "num_tokens": 3396107.0, + "step": 4651, + "train/ce_loss": 1.328965425491333 + }, + { + "epoch": 0.45985762309669764, + "step": 4651, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.45985762309669764, + "step": 4651, + "train/total_loss": 0.1993027925491333 + }, + { + "entropy": 9.12911605834961, + "epoch": 0.4599564959462132, + "mean_token_accuracy": 0.7340686321258545, + "num_tokens": 3401403.0, + "step": 4652, + "train/ce_loss": 0.9378371238708496 + }, + { + "epoch": 0.4599564959462132, + "step": 4652, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.4599564959462132, + "step": 4652, + "train/total_loss": 0.16409620642662048 + }, + { + "entropy": 9.175090789794922, + "epoch": 0.46005536879572867, + "mean_token_accuracy": 0.8226857781410217, + "num_tokens": 3406640.0, + "step": 4653, + "train/ce_loss": 1.1295832109681214e-06 + }, + { + "epoch": 0.46005536879572867, + "step": 4653, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.46005536879572867, + "step": 4653, + "train/total_loss": 0.023437613621354103 + }, + { + "entropy": 9.279150009155273, + "epoch": 0.4601542416452442, + "mean_token_accuracy": 0.7282758355140686, + "num_tokens": 3411757.0, + "step": 4654, + "train/ce_loss": 1.823083758354187 + }, + { + "epoch": 0.4601542416452442, + "step": 4654, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.4601542416452442, + "step": 4654, + "train/total_loss": 0.3073083758354187 + }, + { + "entropy": 8.993045806884766, + "epoch": 0.46025311449475975, + "mean_token_accuracy": 0.7315855026245117, + "num_tokens": 3417034.0, + "step": 4655, + "train/ce_loss": 0.4706941246986389 + }, + { + "epoch": 0.46025311449475975, + "step": 4655, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.46025311449475975, + "step": 4655, + "train/total_loss": 0.14863191545009613 + }, + { + "entropy": 9.490364074707031, + "epoch": 0.46035198734427524, + "mean_token_accuracy": 0.7389830350875854, + "num_tokens": 3422013.0, + "step": 4656, + "train/ce_loss": 1.3873450756072998 + }, + { + "epoch": 0.46035198734427524, + "step": 4656, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.46035198734427524, + "step": 4656, + "train/total_loss": 0.16607825458049774 + }, + { + "entropy": 9.437137603759766, + "epoch": 0.4604508601937908, + "mean_token_accuracy": 0.739635169506073, + "num_tokens": 3427080.0, + "step": 4657, + "train/ce_loss": 2.3839854748075595e-06 + }, + { + "epoch": 0.4604508601937908, + "step": 4657, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.4604508601937908, + "step": 4657, + "train/total_loss": 0.0351564884185791 + }, + { + "entropy": 9.324195861816406, + "epoch": 0.4605497330433063, + "mean_token_accuracy": 0.7063252925872803, + "num_tokens": 3432225.0, + "step": 4658, + "train/ce_loss": 6.33997342447401e-06 + }, + { + "epoch": 0.4605497330433063, + "step": 4658, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.4605497330433063, + "step": 4658, + "train/total_loss": 0.04296938329935074 + }, + { + "entropy": 9.167675971984863, + "epoch": 0.4606486058928218, + "mean_token_accuracy": 0.6957186460494995, + "num_tokens": 3437359.0, + "step": 4659, + "train/ce_loss": 6.890105851198314e-06 + }, + { + "epoch": 0.4606486058928218, + "step": 4659, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.4606486058928218, + "step": 4659, + "train/total_loss": 0.06250068545341492 + }, + { + "epoch": 0.46074747874233735, + "grad_norm": 0.9255831241607666, + "learning_rate": 8.850566187014787e-06, + "loss": 0.1386, + "step": 4660 + }, + { + "entropy": 8.594034194946289, + "epoch": 0.46074747874233735, + "mean_token_accuracy": 0.7196562886238098, + "num_tokens": 3442765.0, + "step": 4660, + "train/ce_loss": 0.7374122738838196 + }, + { + "epoch": 0.46074747874233735, + "step": 4660, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.46074747874233735, + "step": 4660, + "train/total_loss": 0.12842872738838196 + }, + { + "entropy": 9.10422134399414, + "epoch": 0.4608463515918529, + "mean_token_accuracy": 0.7849604487419128, + "num_tokens": 3447949.0, + "step": 4661, + "train/ce_loss": 0.9659251570701599 + }, + { + "epoch": 0.4608463515918529, + "step": 4661, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.4608463515918529, + "step": 4661, + "train/total_loss": 0.182530015707016 + }, + { + "entropy": 9.422708511352539, + "epoch": 0.4609452244413684, + "mean_token_accuracy": 0.7255244851112366, + "num_tokens": 3452957.0, + "step": 4662, + "train/ce_loss": 0.9071168899536133 + }, + { + "epoch": 0.4609452244413684, + "step": 4662, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.4609452244413684, + "step": 4662, + "train/total_loss": 0.12586793303489685 + }, + { + "entropy": 8.967698097229004, + "epoch": 0.4610440972908839, + "mean_token_accuracy": 0.6709601879119873, + "num_tokens": 3458249.0, + "step": 4663, + "train/ce_loss": 1.326856017112732 + }, + { + "epoch": 0.4610440972908839, + "step": 4663, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4610440972908839, + "step": 4663, + "train/total_loss": 0.1873731017112732 + }, + { + "entropy": 9.37660026550293, + "epoch": 0.46114297014039946, + "mean_token_accuracy": 0.7287630438804626, + "num_tokens": 3463275.0, + "step": 4664, + "train/ce_loss": 1.7267590237679542e-06 + }, + { + "epoch": 0.46114297014039946, + "step": 4664, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.46114297014039946, + "step": 4664, + "train/total_loss": 0.02734392322599888 + }, + { + "entropy": 9.615408897399902, + "epoch": 0.461241842989915, + "mean_token_accuracy": 0.6708860993385315, + "num_tokens": 3468158.0, + "step": 4665, + "train/ce_loss": 2.5690736770629883 + }, + { + "epoch": 0.461241842989915, + "step": 4665, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.461241842989915, + "step": 4665, + "train/total_loss": 0.2881573736667633 + }, + { + "entropy": 8.768453598022461, + "epoch": 0.4613407158394305, + "mean_token_accuracy": 0.7710437774658203, + "num_tokens": 3473519.0, + "step": 4666, + "train/ce_loss": 1.3710461854934692 + }, + { + "epoch": 0.4613407158394305, + "step": 4666, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.4613407158394305, + "step": 4666, + "train/total_loss": 0.21522961556911469 + }, + { + "entropy": 9.292464256286621, + "epoch": 0.461439588688946, + "mean_token_accuracy": 0.762536883354187, + "num_tokens": 3478663.0, + "step": 4667, + "train/ce_loss": 1.1530660390853882 + }, + { + "epoch": 0.461439588688946, + "step": 4667, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.461439588688946, + "step": 4667, + "train/total_loss": 0.16608786582946777 + }, + { + "entropy": 8.926875114440918, + "epoch": 0.46153846153846156, + "mean_token_accuracy": 0.7527114748954773, + "num_tokens": 3484023.0, + "step": 4668, + "train/ce_loss": 0.3579964339733124 + }, + { + "epoch": 0.46153846153846156, + "step": 4668, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.46153846153846156, + "step": 4668, + "train/total_loss": 0.09829964488744736 + }, + { + "entropy": 8.580286979675293, + "epoch": 0.46163733438797705, + "mean_token_accuracy": 0.7695202231407166, + "num_tokens": 3489564.0, + "step": 4669, + "train/ce_loss": 0.6817471981048584 + }, + { + "epoch": 0.46163733438797705, + "step": 4669, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.46163733438797705, + "step": 4669, + "train/total_loss": 0.10333096981048584 + }, + { + "entropy": 9.17463493347168, + "epoch": 0.4617362072374926, + "mean_token_accuracy": 0.7695364356040955, + "num_tokens": 3494709.0, + "step": 4670, + "train/ce_loss": 0.9873186945915222 + }, + { + "epoch": 0.4617362072374926, + "step": 4670, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.4617362072374926, + "step": 4670, + "train/total_loss": 0.1299818754196167 + }, + { + "entropy": 9.358234405517578, + "epoch": 0.46183508008700813, + "mean_token_accuracy": 0.7796102166175842, + "num_tokens": 3499961.0, + "step": 4671, + "train/ce_loss": 1.2731976509094238 + }, + { + "epoch": 0.46183508008700813, + "step": 4671, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.46183508008700813, + "step": 4671, + "train/total_loss": 0.22106976807117462 + }, + { + "entropy": 9.134790420532227, + "epoch": 0.4619339529365236, + "mean_token_accuracy": 0.7370967864990234, + "num_tokens": 3505003.0, + "step": 4672, + "train/ce_loss": 1.8665708921616897e-05 + }, + { + "epoch": 0.4619339529365236, + "step": 4672, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.4619339529365236, + "step": 4672, + "train/total_loss": 0.09375187009572983 + }, + { + "entropy": 8.932821273803711, + "epoch": 0.46203282578603916, + "mean_token_accuracy": 0.7169811129570007, + "num_tokens": 3510269.0, + "step": 4673, + "train/ce_loss": 2.768101921901689e-06 + }, + { + "epoch": 0.46203282578603916, + "step": 4673, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.46203282578603916, + "step": 4673, + "train/total_loss": 0.06640652567148209 + }, + { + "entropy": 9.077056884765625, + "epoch": 0.4621316986355547, + "mean_token_accuracy": 0.7108433842658997, + "num_tokens": 3515392.0, + "step": 4674, + "train/ce_loss": 1.7194793224334717 + }, + { + "epoch": 0.4621316986355547, + "step": 4674, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.4621316986355547, + "step": 4674, + "train/total_loss": 0.2813229560852051 + }, + { + "entropy": 8.934788703918457, + "epoch": 0.4622305714850702, + "mean_token_accuracy": 0.6678487062454224, + "num_tokens": 3520759.0, + "step": 4675, + "train/ce_loss": 1.6158264875411987 + }, + { + "epoch": 0.4622305714850702, + "step": 4675, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.4622305714850702, + "step": 4675, + "train/total_loss": 0.23189514875411987 + }, + { + "entropy": 9.20814037322998, + "epoch": 0.4623294443345857, + "mean_token_accuracy": 0.7157434225082397, + "num_tokens": 3525791.0, + "step": 4676, + "train/ce_loss": 0.8203097581863403 + }, + { + "epoch": 0.4623294443345857, + "step": 4676, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.4623294443345857, + "step": 4676, + "train/total_loss": 0.1406247317790985 + }, + { + "entropy": 8.74182415008545, + "epoch": 0.46242831718410127, + "mean_token_accuracy": 0.7483588457107544, + "num_tokens": 3531185.0, + "step": 4677, + "train/ce_loss": 0.6870768666267395 + }, + { + "epoch": 0.46242831718410127, + "step": 4677, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.46242831718410127, + "step": 4677, + "train/total_loss": 0.11558268964290619 + }, + { + "entropy": 8.711596488952637, + "epoch": 0.46252719003361675, + "mean_token_accuracy": 0.7013274431228638, + "num_tokens": 3536573.0, + "step": 4678, + "train/ce_loss": 1.0243852138519287 + }, + { + "epoch": 0.46252719003361675, + "step": 4678, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.46252719003361675, + "step": 4678, + "train/total_loss": 0.1454072743654251 + }, + { + "entropy": 9.993766784667969, + "epoch": 0.4626260628831323, + "mean_token_accuracy": 0.8398268222808838, + "num_tokens": 3541209.0, + "step": 4679, + "train/ce_loss": 1.9226688146591187 + }, + { + "epoch": 0.4626260628831323, + "step": 4679, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.4626260628831323, + "step": 4679, + "train/total_loss": 0.23914188146591187 + }, + { + "epoch": 0.46272493573264784, + "grad_norm": 0.8097333312034607, + "learning_rate": 8.845621322256837e-06, + "loss": 0.1412, + "step": 4680 + }, + { + "entropy": 9.061151504516602, + "epoch": 0.46272493573264784, + "mean_token_accuracy": 0.6990423798561096, + "num_tokens": 3546400.0, + "step": 4680, + "train/ce_loss": 1.1291780471801758 + }, + { + "epoch": 0.46272493573264784, + "step": 4680, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.46272493573264784, + "step": 4680, + "train/total_loss": 0.16760531067848206 + }, + { + "entropy": 9.135835647583008, + "epoch": 0.4628238085821633, + "mean_token_accuracy": 0.8241758346557617, + "num_tokens": 3551608.0, + "step": 4681, + "train/ce_loss": 0.6338585019111633 + }, + { + "epoch": 0.4628238085821633, + "step": 4681, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.4628238085821633, + "step": 4681, + "train/total_loss": 0.08291710168123245 + }, + { + "entropy": 9.02741813659668, + "epoch": 0.46292268143167886, + "mean_token_accuracy": 0.7194805145263672, + "num_tokens": 3556816.0, + "step": 4682, + "train/ce_loss": 0.8726161122322083 + }, + { + "epoch": 0.46292268143167886, + "step": 4682, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.46292268143167886, + "step": 4682, + "train/total_loss": 0.1185116097331047 + }, + { + "entropy": 9.087461471557617, + "epoch": 0.4630215542811944, + "mean_token_accuracy": 0.7063291072845459, + "num_tokens": 3562086.0, + "step": 4683, + "train/ce_loss": 0.5230680704116821 + }, + { + "epoch": 0.4630215542811944, + "step": 4683, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.4630215542811944, + "step": 4683, + "train/total_loss": 0.15386930108070374 + }, + { + "entropy": 9.792854309082031, + "epoch": 0.4631204271307099, + "mean_token_accuracy": 0.6943128108978271, + "num_tokens": 3566934.0, + "step": 4684, + "train/ce_loss": 1.461733102798462 + }, + { + "epoch": 0.4631204271307099, + "step": 4684, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.4631204271307099, + "step": 4684, + "train/total_loss": 0.22820456326007843 + }, + { + "entropy": 9.342538833618164, + "epoch": 0.46321929998022543, + "mean_token_accuracy": 0.7240896224975586, + "num_tokens": 3572019.0, + "step": 4685, + "train/ce_loss": 1.6679964065551758 + }, + { + "epoch": 0.46321929998022543, + "step": 4685, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.46321929998022543, + "step": 4685, + "train/total_loss": 0.2917996644973755 + }, + { + "entropy": 9.116839408874512, + "epoch": 0.46331817282974097, + "mean_token_accuracy": 0.7314285635948181, + "num_tokens": 3577174.0, + "step": 4686, + "train/ce_loss": 2.474521807016572e-06 + }, + { + "epoch": 0.46331817282974097, + "step": 4686, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.46331817282974097, + "step": 4686, + "train/total_loss": 0.0312502458691597 + }, + { + "entropy": 8.821943283081055, + "epoch": 0.46341704567925646, + "mean_token_accuracy": 0.6961583495140076, + "num_tokens": 3582491.0, + "step": 4687, + "train/ce_loss": 0.4736084043979645 + }, + { + "epoch": 0.46341704567925646, + "step": 4687, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.46341704567925646, + "step": 4687, + "train/total_loss": 0.09423583745956421 + }, + { + "entropy": 9.354621887207031, + "epoch": 0.463515918528772, + "mean_token_accuracy": 0.6916524767875671, + "num_tokens": 3587542.0, + "step": 4688, + "train/ce_loss": 0.9794625639915466 + }, + { + "epoch": 0.463515918528772, + "step": 4688, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.463515918528772, + "step": 4688, + "train/total_loss": 0.13700875639915466 + }, + { + "entropy": 8.913888931274414, + "epoch": 0.46361479137828754, + "mean_token_accuracy": 0.7779056429862976, + "num_tokens": 3592914.0, + "step": 4689, + "train/ce_loss": 0.6756730079650879 + }, + { + "epoch": 0.46361479137828754, + "step": 4689, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.46361479137828754, + "step": 4689, + "train/total_loss": 0.13397355377674103 + }, + { + "entropy": 8.87656021118164, + "epoch": 0.463713664227803, + "mean_token_accuracy": 0.6682986617088318, + "num_tokens": 3598233.0, + "step": 4690, + "train/ce_loss": 0.47903019189834595 + }, + { + "epoch": 0.463713664227803, + "step": 4690, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.463713664227803, + "step": 4690, + "train/total_loss": 0.09477801620960236 + }, + { + "entropy": 8.997529983520508, + "epoch": 0.46381253707731857, + "mean_token_accuracy": 0.7174940705299377, + "num_tokens": 3603499.0, + "step": 4691, + "train/ce_loss": 0.7582955360412598 + }, + { + "epoch": 0.46381253707731857, + "step": 4691, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.46381253707731857, + "step": 4691, + "train/total_loss": 0.12270455807447433 + }, + { + "entropy": 8.872171401977539, + "epoch": 0.4639114099268341, + "mean_token_accuracy": 0.7293689250946045, + "num_tokens": 3608845.0, + "step": 4692, + "train/ce_loss": 0.7552477121353149 + }, + { + "epoch": 0.4639114099268341, + "step": 4692, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.4639114099268341, + "step": 4692, + "train/total_loss": 0.12239976972341537 + }, + { + "entropy": 9.48823356628418, + "epoch": 0.4640102827763496, + "mean_token_accuracy": 0.681208074092865, + "num_tokens": 3613840.0, + "step": 4693, + "train/ce_loss": 0.6615340709686279 + }, + { + "epoch": 0.4640102827763496, + "step": 4693, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.4640102827763496, + "step": 4693, + "train/total_loss": 0.12474715709686279 + }, + { + "entropy": 8.860837936401367, + "epoch": 0.46410915562586513, + "mean_token_accuracy": 0.7738748788833618, + "num_tokens": 3619213.0, + "step": 4694, + "train/ce_loss": 0.5516573786735535 + }, + { + "epoch": 0.46410915562586513, + "step": 4694, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.46410915562586513, + "step": 4694, + "train/total_loss": 0.07860323786735535 + }, + { + "entropy": 8.766907691955566, + "epoch": 0.4642080284753807, + "mean_token_accuracy": 0.7964988946914673, + "num_tokens": 3624624.0, + "step": 4695, + "train/ce_loss": 0.9928861856460571 + }, + { + "epoch": 0.4642080284753807, + "step": 4695, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.4642080284753807, + "step": 4695, + "train/total_loss": 0.13835111260414124 + }, + { + "entropy": 8.777790069580078, + "epoch": 0.46430690132489616, + "mean_token_accuracy": 0.7126436829566956, + "num_tokens": 3630097.0, + "step": 4696, + "train/ce_loss": 1.4684127569198608 + }, + { + "epoch": 0.46430690132489616, + "step": 4696, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.46430690132489616, + "step": 4696, + "train/total_loss": 0.22887252271175385 + }, + { + "entropy": 9.39077377319336, + "epoch": 0.4644057741744117, + "mean_token_accuracy": 0.7222222089767456, + "num_tokens": 3635247.0, + "step": 4697, + "train/ce_loss": 1.6382369995117188 + }, + { + "epoch": 0.4644057741744117, + "step": 4697, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.4644057741744117, + "step": 4697, + "train/total_loss": 0.2341362088918686 + }, + { + "entropy": 9.113056182861328, + "epoch": 0.46450464702392724, + "mean_token_accuracy": 0.7508532404899597, + "num_tokens": 3640724.0, + "step": 4698, + "train/ce_loss": 0.7351180911064148 + }, + { + "epoch": 0.46450464702392724, + "step": 4698, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.46450464702392724, + "step": 4698, + "train/total_loss": 0.18288680911064148 + }, + { + "entropy": 9.436710357666016, + "epoch": 0.4646035198734427, + "mean_token_accuracy": 0.7511811256408691, + "num_tokens": 3645922.0, + "step": 4699, + "train/ce_loss": 0.9424399733543396 + }, + { + "epoch": 0.4646035198734427, + "step": 4699, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.4646035198734427, + "step": 4699, + "train/total_loss": 0.12940025329589844 + }, + { + "epoch": 0.46470239272295827, + "grad_norm": 0.6447728276252747, + "learning_rate": 8.840676457498888e-06, + "loss": 0.1453, + "step": 4700 + }, + { + "entropy": 9.457176208496094, + "epoch": 0.46470239272295827, + "mean_token_accuracy": 0.672913134098053, + "num_tokens": 3650937.0, + "step": 4700, + "train/ce_loss": 0.9132030010223389 + }, + { + "epoch": 0.46470239272295827, + "step": 4700, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.46470239272295827, + "step": 4700, + "train/total_loss": 0.16163280606269836 + }, + { + "entropy": 9.087905883789062, + "epoch": 0.4648012655724738, + "mean_token_accuracy": 0.7549574971199036, + "num_tokens": 3656119.0, + "step": 4701, + "train/ce_loss": 1.379743218421936 + }, + { + "epoch": 0.4648012655724738, + "step": 4701, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.4648012655724738, + "step": 4701, + "train/total_loss": 0.1809430718421936 + }, + { + "entropy": 8.700032234191895, + "epoch": 0.4649001384219893, + "mean_token_accuracy": 0.736785352230072, + "num_tokens": 3661523.0, + "step": 4702, + "train/ce_loss": 0.7521803975105286 + }, + { + "epoch": 0.4649001384219893, + "step": 4702, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.4649001384219893, + "step": 4702, + "train/total_loss": 0.11037429422140121 + }, + { + "entropy": 9.367010116577148, + "epoch": 0.46499901127150484, + "mean_token_accuracy": 0.7763779759407043, + "num_tokens": 3666600.0, + "step": 4703, + "train/ce_loss": 0.8053690791130066 + }, + { + "epoch": 0.46499901127150484, + "step": 4703, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.46499901127150484, + "step": 4703, + "train/total_loss": 0.10006815940141678 + }, + { + "entropy": 9.050535202026367, + "epoch": 0.4650978841210204, + "mean_token_accuracy": 0.7229219079017639, + "num_tokens": 3671840.0, + "step": 4704, + "train/ce_loss": 1.0198999643325806 + }, + { + "epoch": 0.4650978841210204, + "step": 4704, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.4650978841210204, + "step": 4704, + "train/total_loss": 0.1879274994134903 + }, + { + "entropy": 9.247036933898926, + "epoch": 0.4651967569705359, + "mean_token_accuracy": 0.7175572514533997, + "num_tokens": 3676818.0, + "step": 4705, + "train/ce_loss": 0.8245893120765686 + }, + { + "epoch": 0.4651967569705359, + "step": 4705, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.4651967569705359, + "step": 4705, + "train/total_loss": 0.10980268567800522 + }, + { + "entropy": 9.679970741271973, + "epoch": 0.4652956298200514, + "mean_token_accuracy": 0.7616387605667114, + "num_tokens": 3681838.0, + "step": 4706, + "train/ce_loss": 1.5281262903954484e-06 + }, + { + "epoch": 0.4652956298200514, + "step": 4706, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.4652956298200514, + "step": 4706, + "train/total_loss": 0.03906265273690224 + }, + { + "entropy": 8.594926834106445, + "epoch": 0.46539450266956695, + "mean_token_accuracy": 0.7148981690406799, + "num_tokens": 3687284.0, + "step": 4707, + "train/ce_loss": 1.0162886381149292 + }, + { + "epoch": 0.46539450266956695, + "step": 4707, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.46539450266956695, + "step": 4707, + "train/total_loss": 0.1289726197719574 + }, + { + "entropy": 9.453357696533203, + "epoch": 0.4654933755190825, + "mean_token_accuracy": 0.7584459185600281, + "num_tokens": 3692380.0, + "step": 4708, + "train/ce_loss": 0.9438441395759583 + }, + { + "epoch": 0.4654933755190825, + "step": 4708, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.4654933755190825, + "step": 4708, + "train/total_loss": 0.15297816693782806 + }, + { + "entropy": 8.775794982910156, + "epoch": 0.46559224836859797, + "mean_token_accuracy": 0.7706635594367981, + "num_tokens": 3697690.0, + "step": 4709, + "train/ce_loss": 0.7100697755813599 + }, + { + "epoch": 0.46559224836859797, + "step": 4709, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.46559224836859797, + "step": 4709, + "train/total_loss": 0.09444447606801987 + }, + { + "entropy": 8.7677583694458, + "epoch": 0.4656911212181135, + "mean_token_accuracy": 0.7585825324058533, + "num_tokens": 3703064.0, + "step": 4710, + "train/ce_loss": 0.8214384317398071 + }, + { + "epoch": 0.4656911212181135, + "step": 4710, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.4656911212181135, + "step": 4710, + "train/total_loss": 0.11730009317398071 + }, + { + "entropy": 8.944156646728516, + "epoch": 0.46578999406762905, + "mean_token_accuracy": 0.7311960458755493, + "num_tokens": 3708316.0, + "step": 4711, + "train/ce_loss": 0.7888990640640259 + }, + { + "epoch": 0.46578999406762905, + "step": 4711, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.46578999406762905, + "step": 4711, + "train/total_loss": 0.1648274064064026 + }, + { + "entropy": 9.67165756225586, + "epoch": 0.46588886691714454, + "mean_token_accuracy": 0.7676767706871033, + "num_tokens": 3713239.0, + "step": 4712, + "train/ce_loss": 4.874921614828054e-06 + }, + { + "epoch": 0.46588886691714454, + "step": 4712, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.46588886691714454, + "step": 4712, + "train/total_loss": 0.0625004842877388 + }, + { + "entropy": 9.428511619567871, + "epoch": 0.4659877397666601, + "mean_token_accuracy": 0.7713841199874878, + "num_tokens": 3718263.0, + "step": 4713, + "train/ce_loss": 0.6457332372665405 + }, + { + "epoch": 0.4659877397666601, + "step": 4713, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.4659877397666601, + "step": 4713, + "train/total_loss": 0.10754207521677017 + }, + { + "entropy": 9.480663299560547, + "epoch": 0.4660866126161756, + "mean_token_accuracy": 0.7697368264198303, + "num_tokens": 3723331.0, + "step": 4714, + "train/ce_loss": 0.9293239712715149 + }, + { + "epoch": 0.4660866126161756, + "step": 4714, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.4660866126161756, + "step": 4714, + "train/total_loss": 0.11636989563703537 + }, + { + "entropy": 9.312999725341797, + "epoch": 0.4661854854656911, + "mean_token_accuracy": 0.7088791728019714, + "num_tokens": 3728472.0, + "step": 4715, + "train/ce_loss": 2.715044502110686e-06 + }, + { + "epoch": 0.4661854854656911, + "step": 4715, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.4661854854656911, + "step": 4715, + "train/total_loss": 0.07031276822090149 + }, + { + "entropy": 9.983501434326172, + "epoch": 0.46628435831520665, + "mean_token_accuracy": 0.7093595862388611, + "num_tokens": 3733317.0, + "step": 4716, + "train/ce_loss": 1.5866858120716643e-06 + }, + { + "epoch": 0.46628435831520665, + "step": 4716, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.46628435831520665, + "step": 4716, + "train/total_loss": 0.019531408324837685 + }, + { + "entropy": 8.879308700561523, + "epoch": 0.4663832311647222, + "mean_token_accuracy": 0.7541370987892151, + "num_tokens": 3738656.0, + "step": 4717, + "train/ce_loss": 1.1307368278503418 + }, + { + "epoch": 0.4663832311647222, + "step": 4717, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.4663832311647222, + "step": 4717, + "train/total_loss": 0.2107299268245697 + }, + { + "entropy": 9.347103118896484, + "epoch": 0.4664821040142377, + "mean_token_accuracy": 0.7087827324867249, + "num_tokens": 3743732.0, + "step": 4718, + "train/ce_loss": 0.7969828844070435 + }, + { + "epoch": 0.4664821040142377, + "step": 4718, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.4664821040142377, + "step": 4718, + "train/total_loss": 0.15391704440116882 + }, + { + "entropy": 8.976900100708008, + "epoch": 0.4665809768637532, + "mean_token_accuracy": 0.7645536661148071, + "num_tokens": 3749005.0, + "step": 4719, + "train/ce_loss": 0.543332576751709 + }, + { + "epoch": 0.4665809768637532, + "step": 4719, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.4665809768637532, + "step": 4719, + "train/total_loss": 0.09730200469493866 + }, + { + "epoch": 0.46667984971326876, + "grad_norm": 0.6814486384391785, + "learning_rate": 8.83573159274094e-06, + "loss": 0.1401, + "step": 4720 + }, + { + "entropy": 9.519277572631836, + "epoch": 0.46667984971326876, + "mean_token_accuracy": 0.6865203976631165, + "num_tokens": 3754073.0, + "step": 4720, + "train/ce_loss": 1.6626695394515991 + }, + { + "epoch": 0.46667984971326876, + "step": 4720, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.46667984971326876, + "step": 4720, + "train/total_loss": 0.22876696288585663 + }, + { + "entropy": 9.098243713378906, + "epoch": 0.46677872256278424, + "mean_token_accuracy": 0.7195571660995483, + "num_tokens": 3759381.0, + "step": 4721, + "train/ce_loss": 0.9952742457389832 + }, + { + "epoch": 0.46677872256278424, + "step": 4721, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.46677872256278424, + "step": 4721, + "train/total_loss": 0.16202741861343384 + }, + { + "entropy": 8.878311157226562, + "epoch": 0.4668775954122998, + "mean_token_accuracy": 0.709172248840332, + "num_tokens": 3764737.0, + "step": 4722, + "train/ce_loss": 0.7218380570411682 + }, + { + "epoch": 0.4668775954122998, + "step": 4722, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.4668775954122998, + "step": 4722, + "train/total_loss": 0.15812131762504578 + }, + { + "entropy": 9.033000946044922, + "epoch": 0.4669764682618153, + "mean_token_accuracy": 0.7345678806304932, + "num_tokens": 3770020.0, + "step": 4723, + "train/ce_loss": 0.5463669300079346 + }, + { + "epoch": 0.4669764682618153, + "step": 4723, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.4669764682618153, + "step": 4723, + "train/total_loss": 0.13276168704032898 + }, + { + "entropy": 9.114192962646484, + "epoch": 0.4670753411113308, + "mean_token_accuracy": 0.8257575631141663, + "num_tokens": 3775272.0, + "step": 4724, + "train/ce_loss": 0.45774757862091064 + }, + { + "epoch": 0.4670753411113308, + "step": 4724, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.4670753411113308, + "step": 4724, + "train/total_loss": 0.061399757862091064 + }, + { + "entropy": 9.174012184143066, + "epoch": 0.46717421396084635, + "mean_token_accuracy": 0.7595474123954773, + "num_tokens": 3780403.0, + "step": 4725, + "train/ce_loss": 0.662388265132904 + }, + { + "epoch": 0.46717421396084635, + "step": 4725, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.46717421396084635, + "step": 4725, + "train/total_loss": 0.09748882800340652 + }, + { + "entropy": 9.906539916992188, + "epoch": 0.4672730868103619, + "mean_token_accuracy": 0.8241758346557617, + "num_tokens": 3785262.0, + "step": 4726, + "train/ce_loss": 0.9337340593338013 + }, + { + "epoch": 0.4672730868103619, + "step": 4726, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.4672730868103619, + "step": 4726, + "train/total_loss": 0.10899841040372849 + }, + { + "entropy": 8.694759368896484, + "epoch": 0.4673719596598774, + "mean_token_accuracy": 0.7390710115432739, + "num_tokens": 3790497.0, + "step": 4727, + "train/ce_loss": 0.44753503799438477 + }, + { + "epoch": 0.4673719596598774, + "step": 4727, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.4673719596598774, + "step": 4727, + "train/total_loss": 0.08381600677967072 + }, + { + "entropy": 9.153018951416016, + "epoch": 0.4674708325093929, + "mean_token_accuracy": 0.7115384340286255, + "num_tokens": 3795913.0, + "step": 4728, + "train/ce_loss": 1.1834046840667725 + }, + { + "epoch": 0.4674708325093929, + "step": 4728, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.4674708325093929, + "step": 4728, + "train/total_loss": 0.14959046244621277 + }, + { + "entropy": 9.155204772949219, + "epoch": 0.46756970535890846, + "mean_token_accuracy": 0.7116912603378296, + "num_tokens": 3801267.0, + "step": 4729, + "train/ce_loss": 0.9349053502082825 + }, + { + "epoch": 0.46756970535890846, + "step": 4729, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.46756970535890846, + "step": 4729, + "train/total_loss": 0.14817804098129272 + }, + { + "entropy": 9.545472145080566, + "epoch": 0.46766857820842395, + "mean_token_accuracy": 0.7176259160041809, + "num_tokens": 3806325.0, + "step": 4730, + "train/ce_loss": 1.6894537111511454e-06 + }, + { + "epoch": 0.46766857820842395, + "step": 4730, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.46766857820842395, + "step": 4730, + "train/total_loss": 0.05078141763806343 + }, + { + "entropy": 9.513253211975098, + "epoch": 0.4677674510579395, + "mean_token_accuracy": 0.7858508825302124, + "num_tokens": 3811283.0, + "step": 4731, + "train/ce_loss": 7.440812169079436e-06 + }, + { + "epoch": 0.4677674510579395, + "step": 4731, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.4677674510579395, + "step": 4731, + "train/total_loss": 0.03515699505805969 + }, + { + "entropy": 8.996169090270996, + "epoch": 0.46786632390745503, + "mean_token_accuracy": 0.7416020631790161, + "num_tokens": 3816490.0, + "step": 4732, + "train/ce_loss": 1.2266058921813965 + }, + { + "epoch": 0.46786632390745503, + "step": 4732, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.46786632390745503, + "step": 4732, + "train/total_loss": 0.2046918421983719 + }, + { + "entropy": 9.178853988647461, + "epoch": 0.4679651967569705, + "mean_token_accuracy": 0.6740237474441528, + "num_tokens": 3821569.0, + "step": 4733, + "train/ce_loss": 1.269809603691101 + }, + { + "epoch": 0.4679651967569705, + "step": 4733, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4679651967569705, + "step": 4733, + "train/total_loss": 0.1816684603691101 + }, + { + "entropy": 9.312187194824219, + "epoch": 0.46806406960648606, + "mean_token_accuracy": 0.7680000066757202, + "num_tokens": 3826623.0, + "step": 4734, + "train/ce_loss": 3.813394187091035e-06 + }, + { + "epoch": 0.46806406960648606, + "step": 4734, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.46806406960648606, + "step": 4734, + "train/total_loss": 0.05078162997961044 + }, + { + "entropy": 9.21867847442627, + "epoch": 0.4681629424560016, + "mean_token_accuracy": 0.7842031121253967, + "num_tokens": 3831784.0, + "step": 4735, + "train/ce_loss": 0.38276079297065735 + }, + { + "epoch": 0.4681629424560016, + "step": 4735, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.4681629424560016, + "step": 4735, + "train/total_loss": 0.1164010763168335 + }, + { + "entropy": 9.087265014648438, + "epoch": 0.4682618153055171, + "mean_token_accuracy": 0.7552631497383118, + "num_tokens": 3836996.0, + "step": 4736, + "train/ce_loss": 0.8710118532180786 + }, + { + "epoch": 0.4682618153055171, + "step": 4736, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.4682618153055171, + "step": 4736, + "train/total_loss": 0.19256994128227234 + }, + { + "entropy": 8.832172393798828, + "epoch": 0.4683606881550326, + "mean_token_accuracy": 0.7178571224212646, + "num_tokens": 3842416.0, + "step": 4737, + "train/ce_loss": 1.8286237716674805 + }, + { + "epoch": 0.4683606881550326, + "step": 4737, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.4683606881550326, + "step": 4737, + "train/total_loss": 0.24926863610744476 + }, + { + "entropy": 8.908565521240234, + "epoch": 0.46845956100454816, + "mean_token_accuracy": 0.7598522305488586, + "num_tokens": 3847732.0, + "step": 4738, + "train/ce_loss": 0.5367914438247681 + }, + { + "epoch": 0.46845956100454816, + "step": 4738, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.46845956100454816, + "step": 4738, + "train/total_loss": 0.17867913842201233 + }, + { + "entropy": 9.52511215209961, + "epoch": 0.46855843385406365, + "mean_token_accuracy": 0.7535934448242188, + "num_tokens": 3852654.0, + "step": 4739, + "train/ce_loss": 0.764208197593689 + }, + { + "epoch": 0.46855843385406365, + "step": 4739, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.46855843385406365, + "step": 4739, + "train/total_loss": 0.15454581379890442 + }, + { + "epoch": 0.4686573067035792, + "grad_norm": 0.761462390422821, + "learning_rate": 8.83078672798299e-06, + "loss": 0.1419, + "step": 4740 + }, + { + "entropy": 8.753978729248047, + "epoch": 0.4686573067035792, + "mean_token_accuracy": 0.7377398610115051, + "num_tokens": 3858050.0, + "step": 4740, + "train/ce_loss": 0.5637397766113281 + }, + { + "epoch": 0.4686573067035792, + "step": 4740, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.4686573067035792, + "step": 4740, + "train/total_loss": 0.10715523362159729 + }, + { + "entropy": 9.044390678405762, + "epoch": 0.46875617955309473, + "mean_token_accuracy": 0.7644171714782715, + "num_tokens": 3863329.0, + "step": 4741, + "train/ce_loss": 0.9783340096473694 + }, + { + "epoch": 0.46875617955309473, + "step": 4741, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.46875617955309473, + "step": 4741, + "train/total_loss": 0.15642714500427246 + }, + { + "entropy": 9.420166015625, + "epoch": 0.4688550524026102, + "mean_token_accuracy": 0.7177033424377441, + "num_tokens": 3868419.0, + "step": 4742, + "train/ce_loss": 1.453926682472229 + }, + { + "epoch": 0.4688550524026102, + "step": 4742, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.4688550524026102, + "step": 4742, + "train/total_loss": 0.26648640632629395 + }, + { + "entropy": 9.0964994430542, + "epoch": 0.46895392525212576, + "mean_token_accuracy": 0.7308743000030518, + "num_tokens": 3873570.0, + "step": 4743, + "train/ce_loss": 1.4857808992019272e-06 + }, + { + "epoch": 0.46895392525212576, + "step": 4743, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.46895392525212576, + "step": 4743, + "train/total_loss": 0.05078139901161194 + }, + { + "entropy": 9.663649559020996, + "epoch": 0.4690527981016413, + "mean_token_accuracy": 0.7560553550720215, + "num_tokens": 3878602.0, + "step": 4744, + "train/ce_loss": 1.3290947675704956 + }, + { + "epoch": 0.4690527981016413, + "step": 4744, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.4690527981016413, + "step": 4744, + "train/total_loss": 0.19150322675704956 + }, + { + "entropy": 9.284122467041016, + "epoch": 0.4691516709511568, + "mean_token_accuracy": 0.671159029006958, + "num_tokens": 3883815.0, + "step": 4745, + "train/ce_loss": 1.277754783630371 + }, + { + "epoch": 0.4691516709511568, + "step": 4745, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.4691516709511568, + "step": 4745, + "train/total_loss": 0.19418172538280487 + }, + { + "entropy": 9.04500961303711, + "epoch": 0.4692505438006723, + "mean_token_accuracy": 0.7883211970329285, + "num_tokens": 3889091.0, + "step": 4746, + "train/ce_loss": 0.38756975531578064 + }, + { + "epoch": 0.4692505438006723, + "step": 4746, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.4692505438006723, + "step": 4746, + "train/total_loss": 0.06219447776675224 + }, + { + "entropy": 9.679605484008789, + "epoch": 0.46934941665018787, + "mean_token_accuracy": 0.7313974499702454, + "num_tokens": 3894034.0, + "step": 4747, + "train/ce_loss": 0.7483991384506226 + }, + { + "epoch": 0.46934941665018787, + "step": 4747, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.46934941665018787, + "step": 4747, + "train/total_loss": 0.16468366980552673 + }, + { + "entropy": 9.190399169921875, + "epoch": 0.4694482894997034, + "mean_token_accuracy": 0.7104247212409973, + "num_tokens": 3899261.0, + "step": 4748, + "train/ce_loss": 1.2331979274749756 + }, + { + "epoch": 0.4694482894997034, + "step": 4748, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4694482894997034, + "step": 4748, + "train/total_loss": 0.1780073046684265 + }, + { + "entropy": 9.05646800994873, + "epoch": 0.4695471623492189, + "mean_token_accuracy": 0.6671069860458374, + "num_tokens": 3904435.0, + "step": 4749, + "train/ce_loss": 1.1537171602249146 + }, + { + "epoch": 0.4695471623492189, + "step": 4749, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.4695471623492189, + "step": 4749, + "train/total_loss": 0.1817779690027237 + }, + { + "entropy": 9.588630676269531, + "epoch": 0.46964603519873444, + "mean_token_accuracy": 0.73046875, + "num_tokens": 3909387.0, + "step": 4750, + "train/ce_loss": 0.6952922344207764 + }, + { + "epoch": 0.46964603519873444, + "step": 4750, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.46964603519873444, + "step": 4750, + "train/total_loss": 0.108591727912426 + }, + { + "entropy": 9.769367218017578, + "epoch": 0.46974490804825, + "mean_token_accuracy": 0.6651480793952942, + "num_tokens": 3914229.0, + "step": 4751, + "train/ce_loss": 1.5452396869659424 + }, + { + "epoch": 0.46974490804825, + "step": 4751, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.46974490804825, + "step": 4751, + "train/total_loss": 0.23264896869659424 + }, + { + "entropy": 9.275885581970215, + "epoch": 0.46984378089776546, + "mean_token_accuracy": 0.7324137687683105, + "num_tokens": 3919407.0, + "step": 4752, + "train/ce_loss": 2.936223154392792e-06 + }, + { + "epoch": 0.46984378089776546, + "step": 4752, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.46984378089776546, + "step": 4752, + "train/total_loss": 0.05078154429793358 + }, + { + "entropy": 10.161367416381836, + "epoch": 0.469942653747281, + "mean_token_accuracy": 0.7241379022598267, + "num_tokens": 3924120.0, + "step": 4753, + "train/ce_loss": 4.123912731301971e-06 + }, + { + "epoch": 0.469942653747281, + "step": 4753, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.469942653747281, + "step": 4753, + "train/total_loss": 0.03515666350722313 + }, + { + "entropy": 9.288694381713867, + "epoch": 0.47004152659679654, + "mean_token_accuracy": 0.729903519153595, + "num_tokens": 3929184.0, + "step": 4754, + "train/ce_loss": 1.5101879853318678e-06 + }, + { + "epoch": 0.47004152659679654, + "step": 4754, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.47004152659679654, + "step": 4754, + "train/total_loss": 0.019531400874257088 + }, + { + "entropy": 8.630434036254883, + "epoch": 0.47014039944631203, + "mean_token_accuracy": 0.7230046987533569, + "num_tokens": 3934624.0, + "step": 4755, + "train/ce_loss": 0.6190818548202515 + }, + { + "epoch": 0.47014039944631203, + "step": 4755, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.47014039944631203, + "step": 4755, + "train/total_loss": 0.10487693548202515 + }, + { + "entropy": 9.039437294006348, + "epoch": 0.47023927229582757, + "mean_token_accuracy": 0.7242743968963623, + "num_tokens": 3939846.0, + "step": 4756, + "train/ce_loss": 0.5676820874214172 + }, + { + "epoch": 0.47023927229582757, + "step": 4756, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.47023927229582757, + "step": 4756, + "train/total_loss": 0.09192445874214172 + }, + { + "entropy": 9.227334022521973, + "epoch": 0.4703381451453431, + "mean_token_accuracy": 0.7220744490623474, + "num_tokens": 3945039.0, + "step": 4757, + "train/ce_loss": 0.6822482943534851 + }, + { + "epoch": 0.4703381451453431, + "step": 4757, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.4703381451453431, + "step": 4757, + "train/total_loss": 0.14244358241558075 + }, + { + "entropy": 8.718744277954102, + "epoch": 0.4704370179948586, + "mean_token_accuracy": 0.708149790763855, + "num_tokens": 3950437.0, + "step": 4758, + "train/ce_loss": 0.8345645070075989 + }, + { + "epoch": 0.4704370179948586, + "step": 4758, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.4704370179948586, + "step": 4758, + "train/total_loss": 0.10689394921064377 + }, + { + "entropy": 9.396023750305176, + "epoch": 0.47053589084437414, + "mean_token_accuracy": 0.7441497445106506, + "num_tokens": 3955485.0, + "step": 4759, + "train/ce_loss": 1.5894752740859985 + }, + { + "epoch": 0.47053589084437414, + "step": 4759, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.47053589084437414, + "step": 4759, + "train/total_loss": 0.23316627740859985 + }, + { + "epoch": 0.4706347636938897, + "grad_norm": 0.7747063040733337, + "learning_rate": 8.825841863225042e-06, + "loss": 0.1533, + "step": 4760 + }, + { + "entropy": 9.248870849609375, + "epoch": 0.4706347636938897, + "mean_token_accuracy": 0.6353523135185242, + "num_tokens": 3960740.0, + "step": 4760, + "train/ce_loss": 1.3720840570385917e-06 + }, + { + "epoch": 0.4706347636938897, + "step": 4760, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.4706347636938897, + "step": 4760, + "train/total_loss": 0.03125013783574104 + }, + { + "entropy": 9.110275268554688, + "epoch": 0.47073363654340517, + "mean_token_accuracy": 0.7091194987297058, + "num_tokens": 3965831.0, + "step": 4761, + "train/ce_loss": 1.8087035417556763 + }, + { + "epoch": 0.47073363654340517, + "step": 4761, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.47073363654340517, + "step": 4761, + "train/total_loss": 0.2668078541755676 + }, + { + "entropy": 9.457289695739746, + "epoch": 0.4708325093929207, + "mean_token_accuracy": 0.7428115010261536, + "num_tokens": 3970849.0, + "step": 4762, + "train/ce_loss": 0.5878061652183533 + }, + { + "epoch": 0.4708325093929207, + "step": 4762, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.4708325093929207, + "step": 4762, + "train/total_loss": 0.13299936056137085 + }, + { + "entropy": 9.001456260681152, + "epoch": 0.47093138224243625, + "mean_token_accuracy": 0.7251613140106201, + "num_tokens": 3976068.0, + "step": 4763, + "train/ce_loss": 0.7660927772521973 + }, + { + "epoch": 0.47093138224243625, + "step": 4763, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.47093138224243625, + "step": 4763, + "train/total_loss": 0.10004677623510361 + }, + { + "entropy": 9.152788162231445, + "epoch": 0.47103025509195173, + "mean_token_accuracy": 0.6974790096282959, + "num_tokens": 3981326.0, + "step": 4764, + "train/ce_loss": 1.1601786613464355 + }, + { + "epoch": 0.47103025509195173, + "step": 4764, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.47103025509195173, + "step": 4764, + "train/total_loss": 0.1707053780555725 + }, + { + "entropy": 9.557083129882812, + "epoch": 0.4711291279414673, + "mean_token_accuracy": 0.7285068035125732, + "num_tokens": 3986385.0, + "step": 4765, + "train/ce_loss": 1.3284010887145996 + }, + { + "epoch": 0.4711291279414673, + "step": 4765, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.4711291279414673, + "step": 4765, + "train/total_loss": 0.2109651118516922 + }, + { + "entropy": 9.038352966308594, + "epoch": 0.4712280007909828, + "mean_token_accuracy": 0.7330827116966248, + "num_tokens": 3991665.0, + "step": 4766, + "train/ce_loss": 0.9846564531326294 + }, + { + "epoch": 0.4712280007909828, + "step": 4766, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.4712280007909828, + "step": 4766, + "train/total_loss": 0.20002815127372742 + }, + { + "entropy": 9.46235466003418, + "epoch": 0.4713268736404983, + "mean_token_accuracy": 0.6992248296737671, + "num_tokens": 3996722.0, + "step": 4767, + "train/ce_loss": 1.7705824375152588 + }, + { + "epoch": 0.4713268736404983, + "step": 4767, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.4713268736404983, + "step": 4767, + "train/total_loss": 0.28643324971199036 + }, + { + "entropy": 9.06019115447998, + "epoch": 0.47142574649001384, + "mean_token_accuracy": 0.743107795715332, + "num_tokens": 4002040.0, + "step": 4768, + "train/ce_loss": 0.8644979000091553 + }, + { + "epoch": 0.47142574649001384, + "step": 4768, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.47142574649001384, + "step": 4768, + "train/total_loss": 0.13332480192184448 + }, + { + "entropy": 8.78596305847168, + "epoch": 0.4715246193395294, + "mean_token_accuracy": 0.7808510661125183, + "num_tokens": 4007471.0, + "step": 4769, + "train/ce_loss": 0.38614627718925476 + }, + { + "epoch": 0.4715246193395294, + "step": 4769, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.4715246193395294, + "step": 4769, + "train/total_loss": 0.062052126973867416 + }, + { + "entropy": 9.571614265441895, + "epoch": 0.47162349218904487, + "mean_token_accuracy": 0.7563636302947998, + "num_tokens": 4012473.0, + "step": 4770, + "train/ce_loss": 0.6596222519874573 + }, + { + "epoch": 0.47162349218904487, + "step": 4770, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.47162349218904487, + "step": 4770, + "train/total_loss": 0.15189972519874573 + }, + { + "entropy": 9.231966972351074, + "epoch": 0.4717223650385604, + "mean_token_accuracy": 0.7604617476463318, + "num_tokens": 4017597.0, + "step": 4771, + "train/ce_loss": 9.691642844700254e-06 + }, + { + "epoch": 0.4717223650385604, + "step": 4771, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.4717223650385604, + "step": 4771, + "train/total_loss": 0.0312509685754776 + }, + { + "entropy": 8.980993270874023, + "epoch": 0.47182123788807595, + "mean_token_accuracy": 0.698924720287323, + "num_tokens": 4022916.0, + "step": 4772, + "train/ce_loss": 0.8316338062286377 + }, + { + "epoch": 0.47182123788807595, + "step": 4772, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.47182123788807595, + "step": 4772, + "train/total_loss": 0.12613213062286377 + }, + { + "entropy": 8.69944953918457, + "epoch": 0.47192011073759144, + "mean_token_accuracy": 0.7211538553237915, + "num_tokens": 4028488.0, + "step": 4773, + "train/ce_loss": 1.073030710220337 + }, + { + "epoch": 0.47192011073759144, + "step": 4773, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.47192011073759144, + "step": 4773, + "train/total_loss": 0.16589683294296265 + }, + { + "entropy": 9.96009635925293, + "epoch": 0.472018983587107, + "mean_token_accuracy": 0.7681564092636108, + "num_tokens": 4033253.0, + "step": 4774, + "train/ce_loss": 2.6304535367671633e-06 + }, + { + "epoch": 0.472018983587107, + "step": 4774, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.472018983587107, + "step": 4774, + "train/total_loss": 0.03906276449561119 + }, + { + "entropy": 10.047544479370117, + "epoch": 0.4721178564366225, + "mean_token_accuracy": 0.7305699586868286, + "num_tokens": 4038059.0, + "step": 4775, + "train/ce_loss": 3.5401205877860775e-06 + }, + { + "epoch": 0.4721178564366225, + "step": 4775, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.4721178564366225, + "step": 4775, + "train/total_loss": 0.07031285762786865 + }, + { + "entropy": 9.181392669677734, + "epoch": 0.472216729286138, + "mean_token_accuracy": 0.6970720887184143, + "num_tokens": 4043580.0, + "step": 4776, + "train/ce_loss": 0.6841987371444702 + }, + { + "epoch": 0.472216729286138, + "step": 4776, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.472216729286138, + "step": 4776, + "train/total_loss": 0.14654487371444702 + }, + { + "entropy": 8.766311645507812, + "epoch": 0.47231560213565355, + "mean_token_accuracy": 0.8162650465965271, + "num_tokens": 4049084.0, + "step": 4777, + "train/ce_loss": 0.48067957162857056 + }, + { + "epoch": 0.47231560213565355, + "step": 4777, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.47231560213565355, + "step": 4777, + "train/total_loss": 0.13009920716285706 + }, + { + "entropy": 9.338191032409668, + "epoch": 0.4724144749851691, + "mean_token_accuracy": 0.7837370038032532, + "num_tokens": 4054198.0, + "step": 4778, + "train/ce_loss": 0.6879231929779053 + }, + { + "epoch": 0.4724144749851691, + "step": 4778, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4724144749851691, + "step": 4778, + "train/total_loss": 0.12347982078790665 + }, + { + "entropy": 8.6314697265625, + "epoch": 0.47251334783468457, + "mean_token_accuracy": 0.7299270033836365, + "num_tokens": 4059783.0, + "step": 4779, + "train/ce_loss": 0.6439928412437439 + }, + { + "epoch": 0.47251334783468457, + "step": 4779, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.47251334783468457, + "step": 4779, + "train/total_loss": 0.12689928710460663 + }, + { + "epoch": 0.4726122206842001, + "grad_norm": 0.6961796283721924, + "learning_rate": 8.820896998467093e-06, + "loss": 0.1489, + "step": 4780 + }, + { + "entropy": 9.529073715209961, + "epoch": 0.4726122206842001, + "mean_token_accuracy": 0.7566909790039062, + "num_tokens": 4064637.0, + "step": 4780, + "train/ce_loss": 1.0975415706634521 + }, + { + "epoch": 0.4726122206842001, + "step": 4780, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.4726122206842001, + "step": 4780, + "train/total_loss": 0.21912916004657745 + }, + { + "entropy": 9.070890426635742, + "epoch": 0.47271109353371565, + "mean_token_accuracy": 0.6548004150390625, + "num_tokens": 4070038.0, + "step": 4781, + "train/ce_loss": 1.7903450727462769 + }, + { + "epoch": 0.47271109353371565, + "step": 4781, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.47271109353371565, + "step": 4781, + "train/total_loss": 0.2688782811164856 + }, + { + "entropy": 8.831971168518066, + "epoch": 0.47280996638323114, + "mean_token_accuracy": 0.7632135152816772, + "num_tokens": 4075474.0, + "step": 4782, + "train/ce_loss": 0.7210497260093689 + }, + { + "epoch": 0.47280996638323114, + "step": 4782, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.47280996638323114, + "step": 4782, + "train/total_loss": 0.21272997558116913 + }, + { + "entropy": 9.82503604888916, + "epoch": 0.4729088392327467, + "mean_token_accuracy": 0.7314049601554871, + "num_tokens": 4080392.0, + "step": 4783, + "train/ce_loss": 1.004560112953186 + }, + { + "epoch": 0.4729088392327467, + "step": 4783, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.4729088392327467, + "step": 4783, + "train/total_loss": 0.16295601427555084 + }, + { + "entropy": 10.069318771362305, + "epoch": 0.4730077120822622, + "mean_token_accuracy": 0.7079207897186279, + "num_tokens": 4085156.0, + "step": 4784, + "train/ce_loss": 2.222496747970581 + }, + { + "epoch": 0.4730077120822622, + "step": 4784, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.4730077120822622, + "step": 4784, + "train/total_loss": 0.26131218671798706 + }, + { + "entropy": 8.968809127807617, + "epoch": 0.4731065849317777, + "mean_token_accuracy": 0.7136611938476562, + "num_tokens": 4090538.0, + "step": 4785, + "train/ce_loss": 1.2665410041809082 + }, + { + "epoch": 0.4731065849317777, + "step": 4785, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.4731065849317777, + "step": 4785, + "train/total_loss": 0.15790410339832306 + }, + { + "entropy": 9.298772811889648, + "epoch": 0.47320545778129325, + "mean_token_accuracy": 0.7463768124580383, + "num_tokens": 4095662.0, + "step": 4786, + "train/ce_loss": 1.8103519678115845 + }, + { + "epoch": 0.47320545778129325, + "step": 4786, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.47320545778129325, + "step": 4786, + "train/total_loss": 0.25525397062301636 + }, + { + "entropy": 9.402922630310059, + "epoch": 0.4733043306308088, + "mean_token_accuracy": 0.6640746593475342, + "num_tokens": 4100759.0, + "step": 4787, + "train/ce_loss": 2.30370831489563 + }, + { + "epoch": 0.4733043306308088, + "step": 4787, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.4733043306308088, + "step": 4787, + "train/total_loss": 0.30068331956863403 + }, + { + "entropy": 9.477928161621094, + "epoch": 0.47340320348032433, + "mean_token_accuracy": 0.7245509028434753, + "num_tokens": 4105864.0, + "step": 4788, + "train/ce_loss": 0.534600019454956 + }, + { + "epoch": 0.47340320348032433, + "step": 4788, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.47340320348032433, + "step": 4788, + "train/total_loss": 0.0847100019454956 + }, + { + "entropy": 9.502606391906738, + "epoch": 0.4735020763298398, + "mean_token_accuracy": 0.7243243455886841, + "num_tokens": 4110876.0, + "step": 4789, + "train/ce_loss": 5.169010819372488e-06 + }, + { + "epoch": 0.4735020763298398, + "step": 4789, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.4735020763298398, + "step": 4789, + "train/total_loss": 0.031250517815351486 + }, + { + "entropy": 9.144216537475586, + "epoch": 0.47360094917935536, + "mean_token_accuracy": 0.7207943797111511, + "num_tokens": 4116224.0, + "step": 4790, + "train/ce_loss": 0.9613037109375 + }, + { + "epoch": 0.47360094917935536, + "step": 4790, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.47360094917935536, + "step": 4790, + "train/total_loss": 0.22503662109375 + }, + { + "entropy": 9.424751281738281, + "epoch": 0.4736998220288709, + "mean_token_accuracy": 0.7784090638160706, + "num_tokens": 4121385.0, + "step": 4791, + "train/ce_loss": 0.6650147438049316 + }, + { + "epoch": 0.4736998220288709, + "step": 4791, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.4736998220288709, + "step": 4791, + "train/total_loss": 0.08993897587060928 + }, + { + "entropy": 9.131213188171387, + "epoch": 0.4737986948783864, + "mean_token_accuracy": 0.7556080222129822, + "num_tokens": 4126648.0, + "step": 4792, + "train/ce_loss": 0.741764485836029 + }, + { + "epoch": 0.4737986948783864, + "step": 4792, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.4737986948783864, + "step": 4792, + "train/total_loss": 0.14058271050453186 + }, + { + "entropy": 9.20728874206543, + "epoch": 0.4738975677279019, + "mean_token_accuracy": 0.7727891206741333, + "num_tokens": 4131876.0, + "step": 4793, + "train/ce_loss": 0.5385979413986206 + }, + { + "epoch": 0.4738975677279019, + "step": 4793, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.4738975677279019, + "step": 4793, + "train/total_loss": 0.09292230010032654 + }, + { + "entropy": 9.474047660827637, + "epoch": 0.47399644057741747, + "mean_token_accuracy": 0.7229102253913879, + "num_tokens": 4136981.0, + "step": 4794, + "train/ce_loss": 1.6584529876708984 + }, + { + "epoch": 0.47399644057741747, + "step": 4794, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.47399644057741747, + "step": 4794, + "train/total_loss": 0.22834530472755432 + }, + { + "entropy": 9.055792808532715, + "epoch": 0.47409531342693295, + "mean_token_accuracy": 0.7245657444000244, + "num_tokens": 4142283.0, + "step": 4795, + "train/ce_loss": 1.4621721506118774 + }, + { + "epoch": 0.47409531342693295, + "step": 4795, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.47409531342693295, + "step": 4795, + "train/total_loss": 0.1891859620809555 + }, + { + "entropy": 9.447093963623047, + "epoch": 0.4741941862764485, + "mean_token_accuracy": 0.7011685967445374, + "num_tokens": 4147299.0, + "step": 4796, + "train/ce_loss": 1.4766745567321777 + }, + { + "epoch": 0.4741941862764485, + "step": 4796, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.4741941862764485, + "step": 4796, + "train/total_loss": 0.25313621759414673 + }, + { + "entropy": 9.53965950012207, + "epoch": 0.47429305912596403, + "mean_token_accuracy": 0.7479131817817688, + "num_tokens": 4152289.0, + "step": 4797, + "train/ce_loss": 0.9872329831123352 + }, + { + "epoch": 0.47429305912596403, + "step": 4797, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.47429305912596403, + "step": 4797, + "train/total_loss": 0.16122329235076904 + }, + { + "entropy": 9.425228118896484, + "epoch": 0.4743919319754795, + "mean_token_accuracy": 0.8389512896537781, + "num_tokens": 4157244.0, + "step": 4798, + "train/ce_loss": 1.3840967416763306 + }, + { + "epoch": 0.4743919319754795, + "step": 4798, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.4743919319754795, + "step": 4798, + "train/total_loss": 0.19700342416763306 + }, + { + "entropy": 9.233570098876953, + "epoch": 0.47449080482499506, + "mean_token_accuracy": 0.7161290049552917, + "num_tokens": 4162349.0, + "step": 4799, + "train/ce_loss": 0.9661892652511597 + }, + { + "epoch": 0.47449080482499506, + "step": 4799, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.47449080482499506, + "step": 4799, + "train/total_loss": 0.1669314205646515 + }, + { + "epoch": 0.4745896776745106, + "grad_norm": 0.8365682363510132, + "learning_rate": 8.815952133709143e-06, + "loss": 0.1458, + "step": 4800 + }, + { + "entropy": 9.029359817504883, + "epoch": 0.4745896776745106, + "mean_token_accuracy": 0.7110582590103149, + "num_tokens": 4167644.0, + "step": 4800, + "train/ce_loss": 0.8061527609825134 + }, + { + "epoch": 0.4745896776745106, + "step": 4800, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.4745896776745106, + "step": 4800, + "train/total_loss": 0.11967777460813522 + }, + { + "entropy": 9.284017562866211, + "epoch": 0.4746885505240261, + "mean_token_accuracy": 0.7239999771118164, + "num_tokens": 4172992.0, + "step": 4801, + "train/ce_loss": 1.5423961877822876 + }, + { + "epoch": 0.4746885505240261, + "step": 4801, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.4746885505240261, + "step": 4801, + "train/total_loss": 0.23236462473869324 + }, + { + "entropy": 9.442846298217773, + "epoch": 0.47478742337354163, + "mean_token_accuracy": 0.7652302980422974, + "num_tokens": 4178067.0, + "step": 4802, + "train/ce_loss": 0.841571033000946 + }, + { + "epoch": 0.47478742337354163, + "step": 4802, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.47478742337354163, + "step": 4802, + "train/total_loss": 0.17009460926055908 + }, + { + "entropy": 9.568414688110352, + "epoch": 0.47488629622305717, + "mean_token_accuracy": 0.76936936378479, + "num_tokens": 4183034.0, + "step": 4803, + "train/ce_loss": 0.863416850566864 + }, + { + "epoch": 0.47488629622305717, + "step": 4803, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.47488629622305717, + "step": 4803, + "train/total_loss": 0.12931042909622192 + }, + { + "entropy": 9.1487455368042, + "epoch": 0.47498516907257265, + "mean_token_accuracy": 0.7169559597969055, + "num_tokens": 4188328.0, + "step": 4804, + "train/ce_loss": 1.3763349056243896 + }, + { + "epoch": 0.47498516907257265, + "step": 4804, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.47498516907257265, + "step": 4804, + "train/total_loss": 0.17669598758220673 + }, + { + "entropy": 9.035324096679688, + "epoch": 0.4750840419220882, + "mean_token_accuracy": 0.6836086511611938, + "num_tokens": 4193629.0, + "step": 4805, + "train/ce_loss": 1.2081576585769653 + }, + { + "epoch": 0.4750840419220882, + "step": 4805, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.4750840419220882, + "step": 4805, + "train/total_loss": 0.17159701883792877 + }, + { + "entropy": 9.174358367919922, + "epoch": 0.47518291477160374, + "mean_token_accuracy": 0.7875317931175232, + "num_tokens": 4198844.0, + "step": 4806, + "train/ce_loss": 0.580572783946991 + }, + { + "epoch": 0.47518291477160374, + "step": 4806, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.47518291477160374, + "step": 4806, + "train/total_loss": 0.1127447783946991 + }, + { + "entropy": 8.938118934631348, + "epoch": 0.4752817876211192, + "mean_token_accuracy": 0.7669903039932251, + "num_tokens": 4204115.0, + "step": 4807, + "train/ce_loss": 0.3910205662250519 + }, + { + "epoch": 0.4752817876211192, + "step": 4807, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.4752817876211192, + "step": 4807, + "train/total_loss": 0.10160206258296967 + }, + { + "entropy": 9.694165229797363, + "epoch": 0.47538066047063476, + "mean_token_accuracy": 0.7274549007415771, + "num_tokens": 4209030.0, + "step": 4808, + "train/ce_loss": 1.0041385889053345 + }, + { + "epoch": 0.47538066047063476, + "step": 4808, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.47538066047063476, + "step": 4808, + "train/total_loss": 0.19416385889053345 + }, + { + "entropy": 8.995406150817871, + "epoch": 0.4754795333201503, + "mean_token_accuracy": 0.7344827651977539, + "num_tokens": 4214411.0, + "step": 4809, + "train/ce_loss": 0.7896780371665955 + }, + { + "epoch": 0.4754795333201503, + "step": 4809, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.4754795333201503, + "step": 4809, + "train/total_loss": 0.10631155222654343 + }, + { + "entropy": 8.896883964538574, + "epoch": 0.4755784061696658, + "mean_token_accuracy": 0.7447236180305481, + "num_tokens": 4220024.0, + "step": 4810, + "train/ce_loss": 0.6986748576164246 + }, + { + "epoch": 0.4755784061696658, + "step": 4810, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.4755784061696658, + "step": 4810, + "train/total_loss": 0.08549248427152634 + }, + { + "entropy": 9.272575378417969, + "epoch": 0.47567727901918133, + "mean_token_accuracy": 0.7357910871505737, + "num_tokens": 4225167.0, + "step": 4811, + "train/ce_loss": 1.4026121561983018e-06 + }, + { + "epoch": 0.47567727901918133, + "step": 4811, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.47567727901918133, + "step": 4811, + "train/total_loss": 0.04687514156103134 + }, + { + "entropy": 9.968191146850586, + "epoch": 0.4757761518686969, + "mean_token_accuracy": 0.7743902206420898, + "num_tokens": 4229949.0, + "step": 4812, + "train/ce_loss": 6.063038654247066e-06 + }, + { + "epoch": 0.4757761518686969, + "step": 4812, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.4757761518686969, + "step": 4812, + "train/total_loss": 0.04687560722231865 + }, + { + "entropy": 9.479616165161133, + "epoch": 0.47587502471821236, + "mean_token_accuracy": 0.7243402004241943, + "num_tokens": 4235067.0, + "step": 4813, + "train/ce_loss": 1.4710017442703247 + }, + { + "epoch": 0.47587502471821236, + "step": 4813, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.47587502471821236, + "step": 4813, + "train/total_loss": 0.21741268038749695 + }, + { + "entropy": 9.045944213867188, + "epoch": 0.4759738975677279, + "mean_token_accuracy": 0.7221006751060486, + "num_tokens": 4240480.0, + "step": 4814, + "train/ce_loss": 1.0600768327713013 + }, + { + "epoch": 0.4759738975677279, + "step": 4814, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.4759738975677279, + "step": 4814, + "train/total_loss": 0.15678894519805908 + }, + { + "entropy": 8.691741943359375, + "epoch": 0.47607277041724344, + "mean_token_accuracy": 0.7612565159797668, + "num_tokens": 4245936.0, + "step": 4815, + "train/ce_loss": 0.5213047862052917 + }, + { + "epoch": 0.47607277041724344, + "step": 4815, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.47607277041724344, + "step": 4815, + "train/total_loss": 0.09119297564029694 + }, + { + "entropy": 9.274696350097656, + "epoch": 0.4761716432667589, + "mean_token_accuracy": 0.7519747018814087, + "num_tokens": 4251038.0, + "step": 4816, + "train/ce_loss": 0.9562940001487732 + }, + { + "epoch": 0.4761716432667589, + "step": 4816, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.4761716432667589, + "step": 4816, + "train/total_loss": 0.16594189405441284 + }, + { + "entropy": 9.602999687194824, + "epoch": 0.47627051611627447, + "mean_token_accuracy": 0.7448747158050537, + "num_tokens": 4255925.0, + "step": 4817, + "train/ce_loss": 1.7758719650373678e-06 + }, + { + "epoch": 0.47627051611627447, + "step": 4817, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.47627051611627447, + "step": 4817, + "train/total_loss": 0.050781428813934326 + }, + { + "entropy": 8.861591339111328, + "epoch": 0.47636938896579, + "mean_token_accuracy": 0.7080820202827454, + "num_tokens": 4261243.0, + "step": 4818, + "train/ce_loss": 1.1278631687164307 + }, + { + "epoch": 0.47636938896579, + "step": 4818, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.47636938896579, + "step": 4818, + "train/total_loss": 0.21825507283210754 + }, + { + "entropy": 9.071257591247559, + "epoch": 0.4764682618153055, + "mean_token_accuracy": 0.7853535413742065, + "num_tokens": 4266473.0, + "step": 4819, + "train/ce_loss": 0.601017415523529 + }, + { + "epoch": 0.4764682618153055, + "step": 4819, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.4764682618153055, + "step": 4819, + "train/total_loss": 0.07572674751281738 + }, + { + "epoch": 0.47656713466482103, + "grad_norm": 0.683932900428772, + "learning_rate": 8.811007268951196e-06, + "loss": 0.1375, + "step": 4820 + }, + { + "entropy": 8.734341621398926, + "epoch": 0.47656713466482103, + "mean_token_accuracy": 0.7373225092887878, + "num_tokens": 4272090.0, + "step": 4820, + "train/ce_loss": 1.2590872049331665 + }, + { + "epoch": 0.47656713466482103, + "step": 4820, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.47656713466482103, + "step": 4820, + "train/total_loss": 0.2352837175130844 + }, + { + "entropy": 9.463361740112305, + "epoch": 0.4766660075143366, + "mean_token_accuracy": 0.7535545229911804, + "num_tokens": 4277158.0, + "step": 4821, + "train/ce_loss": 0.7862908244132996 + }, + { + "epoch": 0.4766660075143366, + "step": 4821, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.4766660075143366, + "step": 4821, + "train/total_loss": 0.12159783393144608 + }, + { + "entropy": 8.814202308654785, + "epoch": 0.47676488036385206, + "mean_token_accuracy": 0.7385892271995544, + "num_tokens": 4282650.0, + "step": 4822, + "train/ce_loss": 1.111302375793457 + }, + { + "epoch": 0.47676488036385206, + "step": 4822, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.47676488036385206, + "step": 4822, + "train/total_loss": 0.1970677375793457 + }, + { + "entropy": 8.99859619140625, + "epoch": 0.4768637532133676, + "mean_token_accuracy": 0.7559366822242737, + "num_tokens": 4287895.0, + "step": 4823, + "train/ce_loss": 0.7384012937545776 + }, + { + "epoch": 0.4768637532133676, + "step": 4823, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.4768637532133676, + "step": 4823, + "train/total_loss": 0.15196514129638672 + }, + { + "entropy": 8.797042846679688, + "epoch": 0.47696262606288314, + "mean_token_accuracy": 0.7273743152618408, + "num_tokens": 4293266.0, + "step": 4824, + "train/ce_loss": 0.8045260310173035 + }, + { + "epoch": 0.47696262606288314, + "step": 4824, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.47696262606288314, + "step": 4824, + "train/total_loss": 0.15857760608196259 + }, + { + "entropy": 9.423627853393555, + "epoch": 0.47706149891239863, + "mean_token_accuracy": 0.7994056344032288, + "num_tokens": 4298359.0, + "step": 4825, + "train/ce_loss": 0.8552690744400024 + }, + { + "epoch": 0.47706149891239863, + "step": 4825, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.47706149891239863, + "step": 4825, + "train/total_loss": 0.13240191340446472 + }, + { + "entropy": 9.24445629119873, + "epoch": 0.47716037176191417, + "mean_token_accuracy": 0.7112902998924255, + "num_tokens": 4303382.0, + "step": 4826, + "train/ce_loss": 1.6132769584655762 + }, + { + "epoch": 0.47716037176191417, + "step": 4826, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.47716037176191417, + "step": 4826, + "train/total_loss": 0.23945270478725433 + }, + { + "entropy": 9.483026504516602, + "epoch": 0.4772592446114297, + "mean_token_accuracy": 0.7545605301856995, + "num_tokens": 4308358.0, + "step": 4827, + "train/ce_loss": 0.869056224822998 + }, + { + "epoch": 0.4772592446114297, + "step": 4827, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.4772592446114297, + "step": 4827, + "train/total_loss": 0.12596812844276428 + }, + { + "entropy": 9.725470542907715, + "epoch": 0.4773581174609452, + "mean_token_accuracy": 0.7925636172294617, + "num_tokens": 4313474.0, + "step": 4828, + "train/ce_loss": 1.4528536796569824 + }, + { + "epoch": 0.4773581174609452, + "step": 4828, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.4773581174609452, + "step": 4828, + "train/total_loss": 0.21950411796569824 + }, + { + "entropy": 9.19894027709961, + "epoch": 0.47745699031046074, + "mean_token_accuracy": 0.6939040422439575, + "num_tokens": 4318687.0, + "step": 4829, + "train/ce_loss": 1.0451105833053589 + }, + { + "epoch": 0.47745699031046074, + "step": 4829, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.47745699031046074, + "step": 4829, + "train/total_loss": 0.1631048023700714 + }, + { + "entropy": 9.40733814239502, + "epoch": 0.4775558631599763, + "mean_token_accuracy": 0.7928994297981262, + "num_tokens": 4323836.0, + "step": 4830, + "train/ce_loss": 5.014096586819505e-06 + }, + { + "epoch": 0.4775558631599763, + "step": 4830, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.4775558631599763, + "step": 4830, + "train/total_loss": 0.05078175291419029 + }, + { + "entropy": 9.185754776000977, + "epoch": 0.4776547360094918, + "mean_token_accuracy": 0.7546418905258179, + "num_tokens": 4329014.0, + "step": 4831, + "train/ce_loss": 0.9354567527770996 + }, + { + "epoch": 0.4776547360094918, + "step": 4831, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.4776547360094918, + "step": 4831, + "train/total_loss": 0.14042067527770996 + }, + { + "entropy": 9.714949607849121, + "epoch": 0.4777536088590073, + "mean_token_accuracy": 0.7931034564971924, + "num_tokens": 4333916.0, + "step": 4832, + "train/ce_loss": 1.6440129280090332 + }, + { + "epoch": 0.4777536088590073, + "step": 4832, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.4777536088590073, + "step": 4832, + "train/total_loss": 0.23862004280090332 + }, + { + "entropy": 9.325277328491211, + "epoch": 0.47785248170852285, + "mean_token_accuracy": 0.7177321910858154, + "num_tokens": 4339216.0, + "step": 4833, + "train/ce_loss": 1.1986669505859027e-06 + }, + { + "epoch": 0.47785248170852285, + "step": 4833, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.47785248170852285, + "step": 4833, + "train/total_loss": 0.03906261920928955 + }, + { + "entropy": 9.78929328918457, + "epoch": 0.4779513545580384, + "mean_token_accuracy": 0.8075221180915833, + "num_tokens": 4344103.0, + "step": 4834, + "train/ce_loss": 1.0145269632339478 + }, + { + "epoch": 0.4779513545580384, + "step": 4834, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4779513545580384, + "step": 4834, + "train/total_loss": 0.15614020824432373 + }, + { + "entropy": 9.039548873901367, + "epoch": 0.4780502274075539, + "mean_token_accuracy": 0.730681836605072, + "num_tokens": 4349480.0, + "step": 4835, + "train/ce_loss": 0.5511088371276855 + }, + { + "epoch": 0.4780502274075539, + "step": 4835, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.4780502274075539, + "step": 4835, + "train/total_loss": 0.1137046366930008 + }, + { + "entropy": 8.990793228149414, + "epoch": 0.4781491002570694, + "mean_token_accuracy": 0.6390804648399353, + "num_tokens": 4354945.0, + "step": 4836, + "train/ce_loss": 1.185433030128479 + }, + { + "epoch": 0.4781491002570694, + "step": 4836, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.4781491002570694, + "step": 4836, + "train/total_loss": 0.18104329705238342 + }, + { + "entropy": 9.270225524902344, + "epoch": 0.47824797310658496, + "mean_token_accuracy": 0.7189542651176453, + "num_tokens": 4360128.0, + "step": 4837, + "train/ce_loss": 0.6311092376708984 + }, + { + "epoch": 0.47824797310658496, + "step": 4837, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.47824797310658496, + "step": 4837, + "train/total_loss": 0.12561091780662537 + }, + { + "entropy": 9.547813415527344, + "epoch": 0.47834684595610044, + "mean_token_accuracy": 0.8039867281913757, + "num_tokens": 4365148.0, + "step": 4838, + "train/ce_loss": 0.6666879653930664 + }, + { + "epoch": 0.47834684595610044, + "step": 4838, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.47834684595610044, + "step": 4838, + "train/total_loss": 0.105731301009655 + }, + { + "entropy": 9.311174392700195, + "epoch": 0.478445718805616, + "mean_token_accuracy": 0.7417582273483276, + "num_tokens": 4370360.0, + "step": 4839, + "train/ce_loss": 0.6862412691116333 + }, + { + "epoch": 0.478445718805616, + "step": 4839, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.478445718805616, + "step": 4839, + "train/total_loss": 0.16237413883209229 + }, + { + "epoch": 0.4785445916551315, + "grad_norm": 0.6970511674880981, + "learning_rate": 8.806062404193246e-06, + "loss": 0.1414, + "step": 4840 + }, + { + "entropy": 9.550680160522461, + "epoch": 0.4785445916551315, + "mean_token_accuracy": 0.7301587462425232, + "num_tokens": 4375402.0, + "step": 4840, + "train/ce_loss": 0.864875316619873 + }, + { + "epoch": 0.4785445916551315, + "step": 4840, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.4785445916551315, + "step": 4840, + "train/total_loss": 0.1646125316619873 + }, + { + "entropy": 9.34185791015625, + "epoch": 0.478643464504647, + "mean_token_accuracy": 0.6724637746810913, + "num_tokens": 4380581.0, + "step": 4841, + "train/ce_loss": 1.8873491287231445 + }, + { + "epoch": 0.478643464504647, + "step": 4841, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.478643464504647, + "step": 4841, + "train/total_loss": 0.32935991883277893 + }, + { + "entropy": 9.676137924194336, + "epoch": 0.47874233735416255, + "mean_token_accuracy": 0.784380316734314, + "num_tokens": 4385628.0, + "step": 4842, + "train/ce_loss": 3.004107156812097e-06 + }, + { + "epoch": 0.47874233735416255, + "step": 4842, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.47874233735416255, + "step": 4842, + "train/total_loss": 0.07812529802322388 + }, + { + "entropy": 9.124101638793945, + "epoch": 0.4788412102036781, + "mean_token_accuracy": 0.8434210419654846, + "num_tokens": 4390852.0, + "step": 4843, + "train/ce_loss": 0.5420211553573608 + }, + { + "epoch": 0.4788412102036781, + "step": 4843, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.4788412102036781, + "step": 4843, + "train/total_loss": 0.0737333670258522 + }, + { + "entropy": 9.000953674316406, + "epoch": 0.4789400830531936, + "mean_token_accuracy": 0.7689393758773804, + "num_tokens": 4396117.0, + "step": 4844, + "train/ce_loss": 0.6013636589050293 + }, + { + "epoch": 0.4789400830531936, + "step": 4844, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.4789400830531936, + "step": 4844, + "train/total_loss": 0.1695113629102707 + }, + { + "entropy": 8.986391067504883, + "epoch": 0.4790389559027091, + "mean_token_accuracy": 0.7383784055709839, + "num_tokens": 4401542.0, + "step": 4845, + "train/ce_loss": 1.416501760482788 + }, + { + "epoch": 0.4790389559027091, + "step": 4845, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.4790389559027091, + "step": 4845, + "train/total_loss": 0.2510251998901367 + }, + { + "entropy": 9.099742889404297, + "epoch": 0.47913782875222466, + "mean_token_accuracy": 0.7825000286102295, + "num_tokens": 4406841.0, + "step": 4846, + "train/ce_loss": 1.2701870203018188 + }, + { + "epoch": 0.47913782875222466, + "step": 4846, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.47913782875222466, + "step": 4846, + "train/total_loss": 0.20514370501041412 + }, + { + "entropy": 8.608301162719727, + "epoch": 0.47923670160174014, + "mean_token_accuracy": 0.7108571529388428, + "num_tokens": 4412193.0, + "step": 4847, + "train/ce_loss": 0.48382073640823364 + }, + { + "epoch": 0.47923670160174014, + "step": 4847, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.47923670160174014, + "step": 4847, + "train/total_loss": 0.15775707364082336 + }, + { + "entropy": 8.367998123168945, + "epoch": 0.4793355744512557, + "mean_token_accuracy": 0.6746666431427002, + "num_tokens": 4417897.0, + "step": 4848, + "train/ce_loss": 1.1643277406692505 + }, + { + "epoch": 0.4793355744512557, + "step": 4848, + "train/sim_loss": 0.17578125 + }, + { + "epoch": 0.4793355744512557, + "step": 4848, + "train/total_loss": 0.292214035987854 + }, + { + "entropy": 8.996297836303711, + "epoch": 0.4794344473007712, + "mean_token_accuracy": 0.7831021547317505, + "num_tokens": 4423119.0, + "step": 4849, + "train/ce_loss": 0.7570226788520813 + }, + { + "epoch": 0.4794344473007712, + "step": 4849, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.4794344473007712, + "step": 4849, + "train/total_loss": 0.11867102235555649 + }, + { + "entropy": 9.125381469726562, + "epoch": 0.4795333201502867, + "mean_token_accuracy": 0.7447090148925781, + "num_tokens": 4428353.0, + "step": 4850, + "train/ce_loss": 0.8565332293510437 + }, + { + "epoch": 0.4795333201502867, + "step": 4850, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.4795333201502867, + "step": 4850, + "train/total_loss": 0.16768458485603333 + }, + { + "entropy": 9.312715530395508, + "epoch": 0.47963219299980225, + "mean_token_accuracy": 0.7510204315185547, + "num_tokens": 4433506.0, + "step": 4851, + "train/ce_loss": 1.4466944932937622 + }, + { + "epoch": 0.47963219299980225, + "step": 4851, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.47963219299980225, + "step": 4851, + "train/total_loss": 0.23841945827007294 + }, + { + "entropy": 9.121928215026855, + "epoch": 0.4797310658493178, + "mean_token_accuracy": 0.7729918360710144, + "num_tokens": 4438799.0, + "step": 4852, + "train/ce_loss": 0.7193275094032288 + }, + { + "epoch": 0.4797310658493178, + "step": 4852, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.4797310658493178, + "step": 4852, + "train/total_loss": 0.13833901286125183 + }, + { + "entropy": 9.010169982910156, + "epoch": 0.4798299386988333, + "mean_token_accuracy": 0.7578418850898743, + "num_tokens": 4444061.0, + "step": 4853, + "train/ce_loss": 0.5989353656768799 + }, + { + "epoch": 0.4798299386988333, + "step": 4853, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.4798299386988333, + "step": 4853, + "train/total_loss": 0.11848728358745575 + }, + { + "entropy": 8.95584487915039, + "epoch": 0.4799288115483488, + "mean_token_accuracy": 0.8355827927589417, + "num_tokens": 4449334.0, + "step": 4854, + "train/ce_loss": 0.47817134857177734 + }, + { + "epoch": 0.4799288115483488, + "step": 4854, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.4799288115483488, + "step": 4854, + "train/total_loss": 0.07906714081764221 + }, + { + "entropy": 9.436650276184082, + "epoch": 0.48002768439786436, + "mean_token_accuracy": 0.7937008142471313, + "num_tokens": 4454463.0, + "step": 4855, + "train/ce_loss": 0.5432444214820862 + }, + { + "epoch": 0.48002768439786436, + "step": 4855, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.48002768439786436, + "step": 4855, + "train/total_loss": 0.1011994481086731 + }, + { + "entropy": 8.903467178344727, + "epoch": 0.48012655724737985, + "mean_token_accuracy": 0.7973856329917908, + "num_tokens": 4459866.0, + "step": 4856, + "train/ce_loss": 0.42568978667259216 + }, + { + "epoch": 0.48012655724737985, + "step": 4856, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.48012655724737985, + "step": 4856, + "train/total_loss": 0.08163148164749146 + }, + { + "entropy": 8.95402717590332, + "epoch": 0.4802254300968954, + "mean_token_accuracy": 0.6666666865348816, + "num_tokens": 4465094.0, + "step": 4857, + "train/ce_loss": 0.8045952320098877 + }, + { + "epoch": 0.4802254300968954, + "step": 4857, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.4802254300968954, + "step": 4857, + "train/total_loss": 0.16249078512191772 + }, + { + "entropy": 8.827604293823242, + "epoch": 0.48032430294641093, + "mean_token_accuracy": 0.791374146938324, + "num_tokens": 4470541.0, + "step": 4858, + "train/ce_loss": 0.48000288009643555 + }, + { + "epoch": 0.48032430294641093, + "step": 4858, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.48032430294641093, + "step": 4858, + "train/total_loss": 0.0714377909898758 + }, + { + "entropy": 9.142309188842773, + "epoch": 0.4804231757959264, + "mean_token_accuracy": 0.7300435304641724, + "num_tokens": 4475717.0, + "step": 4859, + "train/ce_loss": 1.0548945665359497 + }, + { + "epoch": 0.4804231757959264, + "step": 4859, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.4804231757959264, + "step": 4859, + "train/total_loss": 0.19142696261405945 + }, + { + "epoch": 0.48052204864544196, + "grad_norm": 0.7656567096710205, + "learning_rate": 8.801117539435298e-06, + "loss": 0.1385, + "step": 4860 + }, + { + "entropy": 8.858229637145996, + "epoch": 0.48052204864544196, + "mean_token_accuracy": 0.767756462097168, + "num_tokens": 4481076.0, + "step": 4860, + "train/ce_loss": 0.43485215306282043 + }, + { + "epoch": 0.48052204864544196, + "step": 4860, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.48052204864544196, + "step": 4860, + "train/total_loss": 0.07864146679639816 + }, + { + "entropy": 9.248090744018555, + "epoch": 0.4806209214949575, + "mean_token_accuracy": 0.7274096608161926, + "num_tokens": 4486161.0, + "step": 4861, + "train/ce_loss": 1.2672001123428345 + }, + { + "epoch": 0.4806209214949575, + "step": 4861, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.4806209214949575, + "step": 4861, + "train/total_loss": 0.19312626123428345 + }, + { + "entropy": 10.162076950073242, + "epoch": 0.480719794344473, + "mean_token_accuracy": 0.75, + "num_tokens": 4490733.0, + "step": 4862, + "train/ce_loss": 5.364162461773958e-06 + }, + { + "epoch": 0.480719794344473, + "step": 4862, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.480719794344473, + "step": 4862, + "train/total_loss": 0.03125053644180298 + }, + { + "entropy": 8.947798728942871, + "epoch": 0.4808186671939885, + "mean_token_accuracy": 0.7198660969734192, + "num_tokens": 4496133.0, + "step": 4863, + "train/ce_loss": 1.493488073348999 + }, + { + "epoch": 0.4808186671939885, + "step": 4863, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.4808186671939885, + "step": 4863, + "train/total_loss": 0.21575506031513214 + }, + { + "entropy": 9.071576118469238, + "epoch": 0.48091754004350407, + "mean_token_accuracy": 0.7739899158477783, + "num_tokens": 4501368.0, + "step": 4864, + "train/ce_loss": 0.396329402923584 + }, + { + "epoch": 0.48091754004350407, + "step": 4864, + "train/sim_loss": 0.15625 + }, + { + "epoch": 0.48091754004350407, + "step": 4864, + "train/total_loss": 0.19588294625282288 + }, + { + "entropy": 8.7305908203125, + "epoch": 0.48101641289301955, + "mean_token_accuracy": 0.7126545906066895, + "num_tokens": 4506935.0, + "step": 4865, + "train/ce_loss": 1.3752968311309814 + }, + { + "epoch": 0.48101641289301955, + "step": 4865, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.48101641289301955, + "step": 4865, + "train/total_loss": 0.21174843609333038 + }, + { + "entropy": 9.435068130493164, + "epoch": 0.4811152857425351, + "mean_token_accuracy": 0.7475728392601013, + "num_tokens": 4511938.0, + "step": 4866, + "train/ce_loss": 0.795413613319397 + }, + { + "epoch": 0.4811152857425351, + "step": 4866, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.4811152857425351, + "step": 4866, + "train/total_loss": 0.16547885537147522 + }, + { + "entropy": 8.641799926757812, + "epoch": 0.48121415859205063, + "mean_token_accuracy": 0.7578268647193909, + "num_tokens": 4517491.0, + "step": 4867, + "train/ce_loss": 0.7943580746650696 + }, + { + "epoch": 0.48121415859205063, + "step": 4867, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.48121415859205063, + "step": 4867, + "train/total_loss": 0.1419358104467392 + }, + { + "entropy": 9.303564071655273, + "epoch": 0.4813130314415661, + "mean_token_accuracy": 0.7377567291259766, + "num_tokens": 4522600.0, + "step": 4868, + "train/ce_loss": 3.0106859867373714e-06 + }, + { + "epoch": 0.4813130314415661, + "step": 4868, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.4813130314415661, + "step": 4868, + "train/total_loss": 0.046875301748514175 + }, + { + "entropy": 8.76177978515625, + "epoch": 0.48141190429108166, + "mean_token_accuracy": 0.7208791375160217, + "num_tokens": 4527967.0, + "step": 4869, + "train/ce_loss": 0.673498272895813 + }, + { + "epoch": 0.48141190429108166, + "step": 4869, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.48141190429108166, + "step": 4869, + "train/total_loss": 0.12203732877969742 + }, + { + "entropy": 8.733413696289062, + "epoch": 0.4815107771405972, + "mean_token_accuracy": 0.7470588088035583, + "num_tokens": 4533451.0, + "step": 4870, + "train/ce_loss": 0.949754536151886 + }, + { + "epoch": 0.4815107771405972, + "step": 4870, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.4815107771405972, + "step": 4870, + "train/total_loss": 0.16919420659542084 + }, + { + "entropy": 8.815168380737305, + "epoch": 0.48160964999011274, + "mean_token_accuracy": 0.7336841821670532, + "num_tokens": 4538863.0, + "step": 4871, + "train/ce_loss": 0.7179884314537048 + }, + { + "epoch": 0.48160964999011274, + "step": 4871, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.48160964999011274, + "step": 4871, + "train/total_loss": 0.11086134612560272 + }, + { + "entropy": 8.572389602661133, + "epoch": 0.48170852283962823, + "mean_token_accuracy": 0.6777777671813965, + "num_tokens": 4544282.0, + "step": 4872, + "train/ce_loss": 1.2481721639633179 + }, + { + "epoch": 0.48170852283962823, + "step": 4872, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.48170852283962823, + "step": 4872, + "train/total_loss": 0.23809847235679626 + }, + { + "entropy": 8.487792015075684, + "epoch": 0.48180739568914377, + "mean_token_accuracy": 0.6997244954109192, + "num_tokens": 4549887.0, + "step": 4873, + "train/ce_loss": 1.1593464612960815 + }, + { + "epoch": 0.48180739568914377, + "step": 4873, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.48180739568914377, + "step": 4873, + "train/total_loss": 0.17062214016914368 + }, + { + "entropy": 8.484737396240234, + "epoch": 0.4819062685386593, + "mean_token_accuracy": 0.7349081635475159, + "num_tokens": 4555535.0, + "step": 4874, + "train/ce_loss": 0.9921157360076904 + }, + { + "epoch": 0.4819062685386593, + "step": 4874, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4819062685386593, + "step": 4874, + "train/total_loss": 0.15389907360076904 + }, + { + "entropy": 9.836902618408203, + "epoch": 0.4820051413881748, + "mean_token_accuracy": 0.7613365054130554, + "num_tokens": 4560363.0, + "step": 4875, + "train/ce_loss": 1.13887619972229 + }, + { + "epoch": 0.4820051413881748, + "step": 4875, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.4820051413881748, + "step": 4875, + "train/total_loss": 0.17248137295246124 + }, + { + "entropy": 9.677080154418945, + "epoch": 0.48210401423769034, + "mean_token_accuracy": 0.7024070024490356, + "num_tokens": 4565259.0, + "step": 4876, + "train/ce_loss": 1.0439310244692024e-05 + }, + { + "epoch": 0.48210401423769034, + "step": 4876, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.48210401423769034, + "step": 4876, + "train/total_loss": 0.06250104308128357 + }, + { + "entropy": 8.960569381713867, + "epoch": 0.4822028870872059, + "mean_token_accuracy": 0.7482993006706238, + "num_tokens": 4570583.0, + "step": 4877, + "train/ce_loss": 0.8461055755615234 + }, + { + "epoch": 0.4822028870872059, + "step": 4877, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.4822028870872059, + "step": 4877, + "train/total_loss": 0.13148555159568787 + }, + { + "entropy": 9.241521835327148, + "epoch": 0.48230175993672136, + "mean_token_accuracy": 0.6995581984519958, + "num_tokens": 4575710.0, + "step": 4878, + "train/ce_loss": 0.7538856863975525 + }, + { + "epoch": 0.48230175993672136, + "step": 4878, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.48230175993672136, + "step": 4878, + "train/total_loss": 0.11835732311010361 + }, + { + "entropy": 8.968240737915039, + "epoch": 0.4824006327862369, + "mean_token_accuracy": 0.7649208307266235, + "num_tokens": 4580976.0, + "step": 4879, + "train/ce_loss": 0.5129885077476501 + }, + { + "epoch": 0.4824006327862369, + "step": 4879, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.4824006327862369, + "step": 4879, + "train/total_loss": 0.07864260673522949 + }, + { + "epoch": 0.48249950563575245, + "grad_norm": 0.7346249222755432, + "learning_rate": 8.796172674677347e-06, + "loss": 0.1431, + "step": 4880 + }, + { + "entropy": 9.064022064208984, + "epoch": 0.48249950563575245, + "mean_token_accuracy": 0.7308228611946106, + "num_tokens": 4586152.0, + "step": 4880, + "train/ce_loss": 2.089444706143695e-06 + }, + { + "epoch": 0.48249950563575245, + "step": 4880, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.48249950563575245, + "step": 4880, + "train/total_loss": 0.042968958616256714 + }, + { + "entropy": 9.454401016235352, + "epoch": 0.48259837848526793, + "mean_token_accuracy": 0.6894824504852295, + "num_tokens": 4591169.0, + "step": 4881, + "train/ce_loss": 0.8874126672744751 + }, + { + "epoch": 0.48259837848526793, + "step": 4881, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.48259837848526793, + "step": 4881, + "train/total_loss": 0.135616272687912 + }, + { + "entropy": 9.163633346557617, + "epoch": 0.4826972513347835, + "mean_token_accuracy": 0.7462887763977051, + "num_tokens": 4596377.0, + "step": 4882, + "train/ce_loss": 1.0831094980239868 + }, + { + "epoch": 0.4826972513347835, + "step": 4882, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.4826972513347835, + "step": 4882, + "train/total_loss": 0.21768595278263092 + }, + { + "entropy": 9.48274040222168, + "epoch": 0.482796124184299, + "mean_token_accuracy": 0.7407407164573669, + "num_tokens": 4601453.0, + "step": 4883, + "train/ce_loss": 0.6060320734977722 + }, + { + "epoch": 0.482796124184299, + "step": 4883, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.482796124184299, + "step": 4883, + "train/total_loss": 0.08404070883989334 + }, + { + "entropy": 8.87118148803711, + "epoch": 0.4828949970338145, + "mean_token_accuracy": 0.738990306854248, + "num_tokens": 4606906.0, + "step": 4884, + "train/ce_loss": 0.6846893429756165 + }, + { + "epoch": 0.4828949970338145, + "step": 4884, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.4828949970338145, + "step": 4884, + "train/total_loss": 0.10362518578767776 + }, + { + "entropy": 9.23701000213623, + "epoch": 0.48299386988333004, + "mean_token_accuracy": 0.7627118825912476, + "num_tokens": 4612078.0, + "step": 4885, + "train/ce_loss": 0.6392655372619629 + }, + { + "epoch": 0.48299386988333004, + "step": 4885, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.48299386988333004, + "step": 4885, + "train/total_loss": 0.09517655521631241 + }, + { + "entropy": 9.396322250366211, + "epoch": 0.4830927427328456, + "mean_token_accuracy": 0.726190447807312, + "num_tokens": 4617226.0, + "step": 4886, + "train/ce_loss": 2.1839709281921387 + }, + { + "epoch": 0.4830927427328456, + "step": 4886, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.4830927427328456, + "step": 4886, + "train/total_loss": 0.3316783308982849 + }, + { + "entropy": 8.550373077392578, + "epoch": 0.48319161558236107, + "mean_token_accuracy": 0.7473683953285217, + "num_tokens": 4622645.0, + "step": 4887, + "train/ce_loss": 1.0314589738845825 + }, + { + "epoch": 0.48319161558236107, + "step": 4887, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.48319161558236107, + "step": 4887, + "train/total_loss": 0.16564589738845825 + }, + { + "entropy": 9.357982635498047, + "epoch": 0.4832904884318766, + "mean_token_accuracy": 0.7330447435379028, + "num_tokens": 4627808.0, + "step": 4888, + "train/ce_loss": 0.5194808840751648 + }, + { + "epoch": 0.4832904884318766, + "step": 4888, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.4832904884318766, + "step": 4888, + "train/total_loss": 0.11835433542728424 + }, + { + "entropy": 9.263805389404297, + "epoch": 0.48338936128139215, + "mean_token_accuracy": 0.7716763019561768, + "num_tokens": 4632900.0, + "step": 4889, + "train/ce_loss": 0.3726261556148529 + }, + { + "epoch": 0.48338936128139215, + "step": 4889, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.48338936128139215, + "step": 4889, + "train/total_loss": 0.07632511854171753 + }, + { + "entropy": 9.518759727478027, + "epoch": 0.48348823413090763, + "mean_token_accuracy": 0.7730956077575684, + "num_tokens": 4637976.0, + "step": 4890, + "train/ce_loss": 0.6279394030570984 + }, + { + "epoch": 0.48348823413090763, + "step": 4890, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.48348823413090763, + "step": 4890, + "train/total_loss": 0.07841894030570984 + }, + { + "entropy": 8.828695297241211, + "epoch": 0.4835871069804232, + "mean_token_accuracy": 0.6888889074325562, + "num_tokens": 4643377.0, + "step": 4891, + "train/ce_loss": 0.5468465089797974 + }, + { + "epoch": 0.4835871069804232, + "step": 4891, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.4835871069804232, + "step": 4891, + "train/total_loss": 0.10546590387821198 + }, + { + "entropy": 8.950201034545898, + "epoch": 0.4836859798299387, + "mean_token_accuracy": 0.7175368070602417, + "num_tokens": 4648600.0, + "step": 4892, + "train/ce_loss": 1.4159440994262695 + }, + { + "epoch": 0.4836859798299387, + "step": 4892, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.4836859798299387, + "step": 4892, + "train/total_loss": 0.23925065994262695 + }, + { + "entropy": 9.095492362976074, + "epoch": 0.4837848526794542, + "mean_token_accuracy": 0.7314148545265198, + "num_tokens": 4653960.0, + "step": 4893, + "train/ce_loss": 1.207137942314148 + }, + { + "epoch": 0.4837848526794542, + "step": 4893, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.4837848526794542, + "step": 4893, + "train/total_loss": 0.20274505019187927 + }, + { + "entropy": 8.578197479248047, + "epoch": 0.48388372552896974, + "mean_token_accuracy": 0.7013100385665894, + "num_tokens": 4659591.0, + "step": 4894, + "train/ce_loss": 1.2564072608947754 + }, + { + "epoch": 0.48388372552896974, + "step": 4894, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.48388372552896974, + "step": 4894, + "train/total_loss": 0.17251573503017426 + }, + { + "entropy": 9.875054359436035, + "epoch": 0.4839825983784853, + "mean_token_accuracy": 0.7128099203109741, + "num_tokens": 4664503.0, + "step": 4895, + "train/ce_loss": 1.451451063156128 + }, + { + "epoch": 0.4839825983784853, + "step": 4895, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.4839825983784853, + "step": 4895, + "train/total_loss": 0.18420760333538055 + }, + { + "entropy": 8.972516059875488, + "epoch": 0.48408147122800077, + "mean_token_accuracy": 0.732758641242981, + "num_tokens": 4669931.0, + "step": 4896, + "train/ce_loss": 0.4388304054737091 + }, + { + "epoch": 0.48408147122800077, + "step": 4896, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.48408147122800077, + "step": 4896, + "train/total_loss": 0.1337267905473709 + }, + { + "entropy": 9.007209777832031, + "epoch": 0.4841803440775163, + "mean_token_accuracy": 0.7642015218734741, + "num_tokens": 4675331.0, + "step": 4897, + "train/ce_loss": 0.7995807528495789 + }, + { + "epoch": 0.4841803440775163, + "step": 4897, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.4841803440775163, + "step": 4897, + "train/total_loss": 0.14636433124542236 + }, + { + "entropy": 8.663717269897461, + "epoch": 0.48427921692703185, + "mean_token_accuracy": 0.7679924368858337, + "num_tokens": 4680850.0, + "step": 4898, + "train/ce_loss": 0.3658985495567322 + }, + { + "epoch": 0.48427921692703185, + "step": 4898, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.48427921692703185, + "step": 4898, + "train/total_loss": 0.0639336109161377 + }, + { + "entropy": 9.340051651000977, + "epoch": 0.48437808977654734, + "mean_token_accuracy": 0.7296918630599976, + "num_tokens": 4686050.0, + "step": 4899, + "train/ce_loss": 0.7124799489974976 + }, + { + "epoch": 0.48437808977654734, + "step": 4899, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.48437808977654734, + "step": 4899, + "train/total_loss": 0.11421674489974976 + }, + { + "epoch": 0.4844769626260629, + "grad_norm": 0.7022703289985657, + "learning_rate": 8.7912278099194e-06, + "loss": 0.1444, + "step": 4900 + }, + { + "entropy": 9.204004287719727, + "epoch": 0.4844769626260629, + "mean_token_accuracy": 0.7441558241844177, + "num_tokens": 4691321.0, + "step": 4900, + "train/ce_loss": 0.453791081905365 + }, + { + "epoch": 0.4844769626260629, + "step": 4900, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.4844769626260629, + "step": 4900, + "train/total_loss": 0.06491035968065262 + }, + { + "entropy": 8.931779861450195, + "epoch": 0.4845758354755784, + "mean_token_accuracy": 0.7771618366241455, + "num_tokens": 4696679.0, + "step": 4901, + "train/ce_loss": 0.5937322974205017 + }, + { + "epoch": 0.4845758354755784, + "step": 4901, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.4845758354755784, + "step": 4901, + "train/total_loss": 0.07499822974205017 + }, + { + "entropy": 9.884674072265625, + "epoch": 0.4846747083250939, + "mean_token_accuracy": 0.710106372833252, + "num_tokens": 4701478.0, + "step": 4902, + "train/ce_loss": 2.7379010134609416e-06 + }, + { + "epoch": 0.4846747083250939, + "step": 4902, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.4846747083250939, + "step": 4902, + "train/total_loss": 0.07031277567148209 + }, + { + "entropy": 8.596389770507812, + "epoch": 0.48477358117460945, + "mean_token_accuracy": 0.7246804237365723, + "num_tokens": 4707014.0, + "step": 4903, + "train/ce_loss": 0.7283797264099121 + }, + { + "epoch": 0.48477358117460945, + "step": 4903, + "train/sim_loss": 0.17578125 + }, + { + "epoch": 0.48477358117460945, + "step": 4903, + "train/total_loss": 0.2486192286014557 + }, + { + "entropy": 8.942275047302246, + "epoch": 0.484872454024125, + "mean_token_accuracy": 0.7307236194610596, + "num_tokens": 4712354.0, + "step": 4904, + "train/ce_loss": 0.9904487133026123 + }, + { + "epoch": 0.484872454024125, + "step": 4904, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.484872454024125, + "step": 4904, + "train/total_loss": 0.21623237431049347 + }, + { + "entropy": 9.129148483276367, + "epoch": 0.4849713268736405, + "mean_token_accuracy": 0.709392249584198, + "num_tokens": 4717708.0, + "step": 4905, + "train/ce_loss": 9.562788818584522e-07 + }, + { + "epoch": 0.4849713268736405, + "step": 4905, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.4849713268736405, + "step": 4905, + "train/total_loss": 0.011718845926225185 + }, + { + "entropy": 9.028470993041992, + "epoch": 0.485070199723156, + "mean_token_accuracy": 0.7608951926231384, + "num_tokens": 4723023.0, + "step": 4906, + "train/ce_loss": 0.7763880491256714 + }, + { + "epoch": 0.485070199723156, + "step": 4906, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.485070199723156, + "step": 4906, + "train/total_loss": 0.14013880491256714 + }, + { + "entropy": 8.938543319702148, + "epoch": 0.48516907257267156, + "mean_token_accuracy": 0.7885652780532837, + "num_tokens": 4728351.0, + "step": 4907, + "train/ce_loss": 0.7680575847625732 + }, + { + "epoch": 0.48516907257267156, + "step": 4907, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.48516907257267156, + "step": 4907, + "train/total_loss": 0.16274327039718628 + }, + { + "entropy": 8.846967697143555, + "epoch": 0.48526794542218704, + "mean_token_accuracy": 0.7796178460121155, + "num_tokens": 4733658.0, + "step": 4908, + "train/ce_loss": 0.6358514428138733 + }, + { + "epoch": 0.48526794542218704, + "step": 4908, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.48526794542218704, + "step": 4908, + "train/total_loss": 0.08702264726161957 + }, + { + "entropy": 8.666057586669922, + "epoch": 0.4853668182717026, + "mean_token_accuracy": 0.7359050512313843, + "num_tokens": 4739205.0, + "step": 4909, + "train/ce_loss": 0.7128942608833313 + }, + { + "epoch": 0.4853668182717026, + "step": 4909, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.4853668182717026, + "step": 4909, + "train/total_loss": 0.11035192757844925 + }, + { + "entropy": 8.993949890136719, + "epoch": 0.4854656911212181, + "mean_token_accuracy": 0.7426390647888184, + "num_tokens": 4744604.0, + "step": 4910, + "train/ce_loss": 0.5839024186134338 + }, + { + "epoch": 0.4854656911212181, + "step": 4910, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.4854656911212181, + "step": 4910, + "train/total_loss": 0.10526524484157562 + }, + { + "entropy": 9.8870849609375, + "epoch": 0.4855645639707336, + "mean_token_accuracy": 0.7641196250915527, + "num_tokens": 4749306.0, + "step": 4911, + "train/ce_loss": 1.7026309967041016 + }, + { + "epoch": 0.4855645639707336, + "step": 4911, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.4855645639707336, + "step": 4911, + "train/total_loss": 0.23666934669017792 + }, + { + "entropy": 9.434379577636719, + "epoch": 0.48566343682024915, + "mean_token_accuracy": 0.8050000071525574, + "num_tokens": 4754367.0, + "step": 4912, + "train/ce_loss": 0.8110925555229187 + }, + { + "epoch": 0.48566343682024915, + "step": 4912, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.48566343682024915, + "step": 4912, + "train/total_loss": 0.10064050555229187 + }, + { + "entropy": 9.033245086669922, + "epoch": 0.4857623096697647, + "mean_token_accuracy": 0.7487499713897705, + "num_tokens": 4759630.0, + "step": 4913, + "train/ce_loss": 0.7126909494400024 + }, + { + "epoch": 0.4857623096697647, + "step": 4913, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.4857623096697647, + "step": 4913, + "train/total_loss": 0.13376909494400024 + }, + { + "entropy": 9.069683074951172, + "epoch": 0.48586118251928023, + "mean_token_accuracy": 0.7380627393722534, + "num_tokens": 4764817.0, + "step": 4914, + "train/ce_loss": 0.6253024339675903 + }, + { + "epoch": 0.48586118251928023, + "step": 4914, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.48586118251928023, + "step": 4914, + "train/total_loss": 0.08596774190664291 + }, + { + "entropy": 9.71072769165039, + "epoch": 0.4859600553687957, + "mean_token_accuracy": 0.7920792102813721, + "num_tokens": 4769611.0, + "step": 4915, + "train/ce_loss": 1.6909226179122925 + }, + { + "epoch": 0.4859600553687957, + "step": 4915, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.4859600553687957, + "step": 4915, + "train/total_loss": 0.21987351775169373 + }, + { + "entropy": 9.659954071044922, + "epoch": 0.48605892821831126, + "mean_token_accuracy": 0.7441314458847046, + "num_tokens": 4774478.0, + "step": 4916, + "train/ce_loss": 1.0239133189315908e-05 + }, + { + "epoch": 0.48605892821831126, + "step": 4916, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.48605892821831126, + "step": 4916, + "train/total_loss": 0.06250102072954178 + }, + { + "entropy": 9.636788368225098, + "epoch": 0.4861578010678268, + "mean_token_accuracy": 0.7456979155540466, + "num_tokens": 4779410.0, + "step": 4917, + "train/ce_loss": 0.8914626836776733 + }, + { + "epoch": 0.4861578010678268, + "step": 4917, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.4861578010678268, + "step": 4917, + "train/total_loss": 0.14774002134799957 + }, + { + "entropy": 9.167765617370605, + "epoch": 0.4862566739173423, + "mean_token_accuracy": 0.7288359999656677, + "num_tokens": 4784536.0, + "step": 4918, + "train/ce_loss": 0.6476504802703857 + }, + { + "epoch": 0.4862566739173423, + "step": 4918, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4862566739173423, + "step": 4918, + "train/total_loss": 0.11945255100727081 + }, + { + "entropy": 9.297834396362305, + "epoch": 0.4863555467668578, + "mean_token_accuracy": 0.74301677942276, + "num_tokens": 4789682.0, + "step": 4919, + "train/ce_loss": 1.2960455417633057 + }, + { + "epoch": 0.4863555467668578, + "step": 4919, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.4863555467668578, + "step": 4919, + "train/total_loss": 0.18038581311702728 + }, + { + "epoch": 0.48645441961637337, + "grad_norm": 0.7376166582107544, + "learning_rate": 8.78628294516145e-06, + "loss": 0.1446, + "step": 4920 + }, + { + "entropy": 9.638121604919434, + "epoch": 0.48645441961637337, + "mean_token_accuracy": 0.7441016435623169, + "num_tokens": 4794733.0, + "step": 4920, + "train/ce_loss": 0.8663396239280701 + }, + { + "epoch": 0.48645441961637337, + "step": 4920, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.48645441961637337, + "step": 4920, + "train/total_loss": 0.14522771537303925 + }, + { + "entropy": 8.860578536987305, + "epoch": 0.48655329246588885, + "mean_token_accuracy": 0.7983014583587646, + "num_tokens": 4800140.0, + "step": 4921, + "train/ce_loss": 0.8453391194343567 + }, + { + "epoch": 0.48655329246588885, + "step": 4921, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.48655329246588885, + "step": 4921, + "train/total_loss": 0.1314089149236679 + }, + { + "entropy": 8.960800170898438, + "epoch": 0.4866521653154044, + "mean_token_accuracy": 0.7310647368431091, + "num_tokens": 4805546.0, + "step": 4922, + "train/ce_loss": 0.9981918334960938 + }, + { + "epoch": 0.4866521653154044, + "step": 4922, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.4866521653154044, + "step": 4922, + "train/total_loss": 0.16622543334960938 + }, + { + "entropy": 9.311447143554688, + "epoch": 0.48675103816491994, + "mean_token_accuracy": 0.7135134935379028, + "num_tokens": 4810700.0, + "step": 4923, + "train/ce_loss": 1.1179834604263306 + }, + { + "epoch": 0.48675103816491994, + "step": 4923, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.48675103816491994, + "step": 4923, + "train/total_loss": 0.14304834604263306 + }, + { + "entropy": 9.43608283996582, + "epoch": 0.4868499110144354, + "mean_token_accuracy": 0.7465887069702148, + "num_tokens": 4815708.0, + "step": 4924, + "train/ce_loss": 0.8741804957389832 + }, + { + "epoch": 0.4868499110144354, + "step": 4924, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.4868499110144354, + "step": 4924, + "train/total_loss": 0.16554304957389832 + }, + { + "entropy": 9.55827522277832, + "epoch": 0.48694878386395096, + "mean_token_accuracy": 0.7261029481887817, + "num_tokens": 4820691.0, + "step": 4925, + "train/ce_loss": 1.1498408317565918 + }, + { + "epoch": 0.48694878386395096, + "step": 4925, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.48694878386395096, + "step": 4925, + "train/total_loss": 0.17748409509658813 + }, + { + "entropy": 9.66183853149414, + "epoch": 0.4870476567134665, + "mean_token_accuracy": 0.7651331424713135, + "num_tokens": 4825551.0, + "step": 4926, + "train/ce_loss": 3.2787506825115997e-06 + }, + { + "epoch": 0.4870476567134665, + "step": 4926, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.4870476567134665, + "step": 4926, + "train/total_loss": 0.046875327825546265 + }, + { + "entropy": 9.215739250183105, + "epoch": 0.487146529562982, + "mean_token_accuracy": 0.7790697813034058, + "num_tokens": 4830744.0, + "step": 4927, + "train/ce_loss": 0.6452703475952148 + }, + { + "epoch": 0.487146529562982, + "step": 4927, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.487146529562982, + "step": 4927, + "train/total_loss": 0.10358953475952148 + }, + { + "entropy": 8.913700103759766, + "epoch": 0.48724540241249753, + "mean_token_accuracy": 0.7453183531761169, + "num_tokens": 4836066.0, + "step": 4928, + "train/ce_loss": 1.0108767747879028 + }, + { + "epoch": 0.48724540241249753, + "step": 4928, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.48724540241249753, + "step": 4928, + "train/total_loss": 0.20655643939971924 + }, + { + "entropy": 9.374261856079102, + "epoch": 0.48734427526201307, + "mean_token_accuracy": 0.7281690239906311, + "num_tokens": 4841163.0, + "step": 4929, + "train/ce_loss": 1.19328773021698 + }, + { + "epoch": 0.48734427526201307, + "step": 4929, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.48734427526201307, + "step": 4929, + "train/total_loss": 0.17401626706123352 + }, + { + "entropy": 9.023553848266602, + "epoch": 0.48744314811152856, + "mean_token_accuracy": 0.7415143847465515, + "num_tokens": 4846455.0, + "step": 4930, + "train/ce_loss": 1.073759913444519 + }, + { + "epoch": 0.48744314811152856, + "step": 4930, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.48744314811152856, + "step": 4930, + "train/total_loss": 0.13862599432468414 + }, + { + "entropy": 9.224691390991211, + "epoch": 0.4875420209610441, + "mean_token_accuracy": 0.6523736715316772, + "num_tokens": 4851542.0, + "step": 4931, + "train/ce_loss": 1.0722086429595947 + }, + { + "epoch": 0.4875420209610441, + "step": 4931, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.4875420209610441, + "step": 4931, + "train/total_loss": 0.165814608335495 + }, + { + "entropy": 9.236860275268555, + "epoch": 0.48764089381055964, + "mean_token_accuracy": 0.7120822668075562, + "num_tokens": 4856762.0, + "step": 4932, + "train/ce_loss": 0.8498473167419434 + }, + { + "epoch": 0.48764089381055964, + "step": 4932, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.48764089381055964, + "step": 4932, + "train/total_loss": 0.17092223465442657 + }, + { + "entropy": 8.873945236206055, + "epoch": 0.4877397666600751, + "mean_token_accuracy": 0.7639344334602356, + "num_tokens": 4862133.0, + "step": 4933, + "train/ce_loss": 0.6059170365333557 + }, + { + "epoch": 0.4877397666600751, + "step": 4933, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.4877397666600751, + "step": 4933, + "train/total_loss": 0.09965420514345169 + }, + { + "entropy": 9.093036651611328, + "epoch": 0.48783863950959067, + "mean_token_accuracy": 0.6778350472450256, + "num_tokens": 4867340.0, + "step": 4934, + "train/ce_loss": 1.148424871644238e-06 + }, + { + "epoch": 0.48783863950959067, + "step": 4934, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.48783863950959067, + "step": 4934, + "train/total_loss": 0.07812511175870895 + }, + { + "entropy": 9.153524398803711, + "epoch": 0.4879375123591062, + "mean_token_accuracy": 0.7430025339126587, + "num_tokens": 4872608.0, + "step": 4935, + "train/ce_loss": 1.041776418685913 + }, + { + "epoch": 0.4879375123591062, + "step": 4935, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.4879375123591062, + "step": 4935, + "train/total_loss": 0.22917765378952026 + }, + { + "entropy": 8.765460014343262, + "epoch": 0.4880363852086217, + "mean_token_accuracy": 0.71875, + "num_tokens": 4878047.0, + "step": 4936, + "train/ce_loss": 0.9821575880050659 + }, + { + "epoch": 0.4880363852086217, + "step": 4936, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.4880363852086217, + "step": 4936, + "train/total_loss": 0.1568095088005066 + }, + { + "entropy": 9.62529182434082, + "epoch": 0.48813525805813723, + "mean_token_accuracy": 0.715859055519104, + "num_tokens": 4882896.0, + "step": 4937, + "train/ce_loss": 2.728528897932847e-06 + }, + { + "epoch": 0.48813525805813723, + "step": 4937, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.48813525805813723, + "step": 4937, + "train/total_loss": 0.05468777194619179 + }, + { + "entropy": 9.226823806762695, + "epoch": 0.4882341309076528, + "mean_token_accuracy": 0.7292225360870361, + "num_tokens": 4888112.0, + "step": 4938, + "train/ce_loss": 0.8293290138244629 + }, + { + "epoch": 0.4882341309076528, + "step": 4938, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4882341309076528, + "step": 4938, + "train/total_loss": 0.13762040436267853 + }, + { + "entropy": 8.93838119506836, + "epoch": 0.48833300375716826, + "mean_token_accuracy": 0.7661574482917786, + "num_tokens": 4893505.0, + "step": 4939, + "train/ce_loss": 0.6429872512817383 + }, + { + "epoch": 0.48833300375716826, + "step": 4939, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.48833300375716826, + "step": 4939, + "train/total_loss": 0.11117372661828995 + }, + { + "epoch": 0.4884318766066838, + "grad_norm": 0.7874643206596375, + "learning_rate": 8.781338080403502e-06, + "loss": 0.1473, + "step": 4940 + }, + { + "entropy": 8.490642547607422, + "epoch": 0.4884318766066838, + "mean_token_accuracy": 0.7120419144630432, + "num_tokens": 4898952.0, + "step": 4940, + "train/ce_loss": 1.5694804191589355 + }, + { + "epoch": 0.4884318766066838, + "step": 4940, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.4884318766066838, + "step": 4940, + "train/total_loss": 0.2233542948961258 + }, + { + "entropy": 9.060588836669922, + "epoch": 0.48853074945619934, + "mean_token_accuracy": 0.7146596908569336, + "num_tokens": 4904169.0, + "step": 4941, + "train/ce_loss": 0.8909388184547424 + }, + { + "epoch": 0.48853074945619934, + "step": 4941, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.48853074945619934, + "step": 4941, + "train/total_loss": 0.1594063937664032 + }, + { + "entropy": 9.662264823913574, + "epoch": 0.48862962230571483, + "mean_token_accuracy": 0.75314861536026, + "num_tokens": 4908980.0, + "step": 4942, + "train/ce_loss": 1.9760382175445557 + }, + { + "epoch": 0.48862962230571483, + "step": 4942, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.48862962230571483, + "step": 4942, + "train/total_loss": 0.23666632175445557 + }, + { + "entropy": 9.120369911193848, + "epoch": 0.48872849515523037, + "mean_token_accuracy": 0.7997010350227356, + "num_tokens": 4914156.0, + "step": 4943, + "train/ce_loss": 2.164554189221235e-06 + }, + { + "epoch": 0.48872849515523037, + "step": 4943, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.48872849515523037, + "step": 4943, + "train/total_loss": 0.07812521606683731 + }, + { + "entropy": 10.052497863769531, + "epoch": 0.4888273680047459, + "mean_token_accuracy": 0.7403100728988647, + "num_tokens": 4918831.0, + "step": 4944, + "train/ce_loss": 1.1864396583405323e-05 + }, + { + "epoch": 0.4888273680047459, + "step": 4944, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.4888273680047459, + "step": 4944, + "train/total_loss": 0.05078243464231491 + }, + { + "entropy": 8.989214897155762, + "epoch": 0.4889262408542614, + "mean_token_accuracy": 0.7435265183448792, + "num_tokens": 4924090.0, + "step": 4945, + "train/ce_loss": 0.7429519295692444 + }, + { + "epoch": 0.4889262408542614, + "step": 4945, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.4889262408542614, + "step": 4945, + "train/total_loss": 0.08601394295692444 + }, + { + "entropy": 8.845163345336914, + "epoch": 0.48902511370377694, + "mean_token_accuracy": 0.7583603262901306, + "num_tokens": 4929551.0, + "step": 4946, + "train/ce_loss": 0.6005305051803589 + }, + { + "epoch": 0.48902511370377694, + "step": 4946, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.48902511370377694, + "step": 4946, + "train/total_loss": 0.09520930051803589 + }, + { + "entropy": 9.364870071411133, + "epoch": 0.4891239865532925, + "mean_token_accuracy": 0.7204116582870483, + "num_tokens": 4934529.0, + "step": 4947, + "train/ce_loss": 1.5267508029937744 + }, + { + "epoch": 0.4891239865532925, + "step": 4947, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.4891239865532925, + "step": 4947, + "train/total_loss": 0.2190813273191452 + }, + { + "entropy": 9.116228103637695, + "epoch": 0.48922285940280796, + "mean_token_accuracy": 0.7240437269210815, + "num_tokens": 4939707.0, + "step": 4948, + "train/ce_loss": 0.4577290713787079 + }, + { + "epoch": 0.48922285940280796, + "step": 4948, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.48922285940280796, + "step": 4948, + "train/total_loss": 0.08483541011810303 + }, + { + "entropy": 9.415693283081055, + "epoch": 0.4893217322523235, + "mean_token_accuracy": 0.7996794581413269, + "num_tokens": 4944829.0, + "step": 4949, + "train/ce_loss": 1.471177339553833 + }, + { + "epoch": 0.4893217322523235, + "step": 4949, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.4893217322523235, + "step": 4949, + "train/total_loss": 0.2174302339553833 + }, + { + "entropy": 9.266605377197266, + "epoch": 0.48942060510183905, + "mean_token_accuracy": 0.7365661859512329, + "num_tokens": 4949989.0, + "step": 4950, + "train/ce_loss": 0.6825771927833557 + }, + { + "epoch": 0.48942060510183905, + "step": 4950, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.48942060510183905, + "step": 4950, + "train/total_loss": 0.13857021927833557 + }, + { + "entropy": 9.919319152832031, + "epoch": 0.48951947795135453, + "mean_token_accuracy": 0.76115483045578, + "num_tokens": 4954809.0, + "step": 4951, + "train/ce_loss": 2.257694177387748e-06 + }, + { + "epoch": 0.48951947795135453, + "step": 4951, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.48951947795135453, + "step": 4951, + "train/total_loss": 0.023437725380063057 + }, + { + "entropy": 9.214216232299805, + "epoch": 0.4896183508008701, + "mean_token_accuracy": 0.7194968461990356, + "num_tokens": 4960064.0, + "step": 4952, + "train/ce_loss": 1.0852882862091064 + }, + { + "epoch": 0.4896183508008701, + "step": 4952, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.4896183508008701, + "step": 4952, + "train/total_loss": 0.14759132266044617 + }, + { + "entropy": 9.708972930908203, + "epoch": 0.4897172236503856, + "mean_token_accuracy": 0.7222222089767456, + "num_tokens": 4964884.0, + "step": 4953, + "train/ce_loss": 1.529525252408348e-05 + }, + { + "epoch": 0.4897172236503856, + "step": 4953, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.4897172236503856, + "step": 4953, + "train/total_loss": 0.02343902923166752 + }, + { + "entropy": 9.27254867553711, + "epoch": 0.48981609649990115, + "mean_token_accuracy": 0.7869565486907959, + "num_tokens": 4970010.0, + "step": 4954, + "train/ce_loss": 1.4466725587844849 + }, + { + "epoch": 0.48981609649990115, + "step": 4954, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.48981609649990115, + "step": 4954, + "train/total_loss": 0.19935475289821625 + }, + { + "entropy": 8.7774019241333, + "epoch": 0.48991496934941664, + "mean_token_accuracy": 0.7177264094352722, + "num_tokens": 4975524.0, + "step": 4955, + "train/ce_loss": 1.0846054553985596 + }, + { + "epoch": 0.48991496934941664, + "step": 4955, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.48991496934941664, + "step": 4955, + "train/total_loss": 0.15924179553985596 + }, + { + "entropy": 9.477668762207031, + "epoch": 0.4900138421989322, + "mean_token_accuracy": 0.7560073733329773, + "num_tokens": 4980501.0, + "step": 4956, + "train/ce_loss": 1.5517706871032715 + }, + { + "epoch": 0.4900138421989322, + "step": 4956, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.4900138421989322, + "step": 4956, + "train/total_loss": 0.2020520716905594 + }, + { + "entropy": 9.712724685668945, + "epoch": 0.4901127150484477, + "mean_token_accuracy": 0.7151514887809753, + "num_tokens": 4985440.0, + "step": 4957, + "train/ce_loss": 1.111464262008667 + }, + { + "epoch": 0.4901127150484477, + "step": 4957, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.4901127150484477, + "step": 4957, + "train/total_loss": 0.15802142024040222 + }, + { + "entropy": 8.931743621826172, + "epoch": 0.4902115878979632, + "mean_token_accuracy": 0.7174638509750366, + "num_tokens": 4990855.0, + "step": 4958, + "train/ce_loss": 0.7936546206474304 + }, + { + "epoch": 0.4902115878979632, + "step": 4958, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.4902115878979632, + "step": 4958, + "train/total_loss": 0.14186546206474304 + }, + { + "entropy": 8.859094619750977, + "epoch": 0.49031046074747875, + "mean_token_accuracy": 0.7553072571754456, + "num_tokens": 4996280.0, + "step": 4959, + "train/ce_loss": 0.5950724482536316 + }, + { + "epoch": 0.49031046074747875, + "step": 4959, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.49031046074747875, + "step": 4959, + "train/total_loss": 0.10638225078582764 + }, + { + "epoch": 0.4904093335969943, + "grad_norm": 0.6664409637451172, + "learning_rate": 8.776393215645553e-06, + "loss": 0.1418, + "step": 4960 + }, + { + "entropy": 10.08526611328125, + "epoch": 0.4904093335969943, + "mean_token_accuracy": 0.732087254524231, + "num_tokens": 5000962.0, + "step": 4960, + "train/ce_loss": 0.8471253514289856 + }, + { + "epoch": 0.4904093335969943, + "step": 4960, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.4904093335969943, + "step": 4960, + "train/total_loss": 0.14330628514289856 + }, + { + "entropy": 9.833306312561035, + "epoch": 0.4905082064465098, + "mean_token_accuracy": 0.7071239948272705, + "num_tokens": 5005775.0, + "step": 4961, + "train/ce_loss": 2.1891096366744023e-06 + }, + { + "epoch": 0.4905082064465098, + "step": 4961, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.4905082064465098, + "step": 4961, + "train/total_loss": 0.03125021979212761 + }, + { + "entropy": 9.284728050231934, + "epoch": 0.4906070792960253, + "mean_token_accuracy": 0.7516778707504272, + "num_tokens": 5010963.0, + "step": 4962, + "train/ce_loss": 0.6591690182685852 + }, + { + "epoch": 0.4906070792960253, + "step": 4962, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.4906070792960253, + "step": 4962, + "train/total_loss": 0.10888565331697464 + }, + { + "entropy": 8.825736045837402, + "epoch": 0.49070595214554086, + "mean_token_accuracy": 0.7436181902885437, + "num_tokens": 5016355.0, + "step": 4963, + "train/ce_loss": 0.7082291841506958 + }, + { + "epoch": 0.49070595214554086, + "step": 4963, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.49070595214554086, + "step": 4963, + "train/total_loss": 0.09426041692495346 + }, + { + "entropy": 9.79039478302002, + "epoch": 0.49080482499505634, + "mean_token_accuracy": 0.7394366264343262, + "num_tokens": 5021170.0, + "step": 4964, + "train/ce_loss": 2.0843520164489746 + }, + { + "epoch": 0.49080482499505634, + "step": 4964, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.49080482499505634, + "step": 4964, + "train/total_loss": 0.27874770760536194 + }, + { + "entropy": 9.001119613647461, + "epoch": 0.4909036978445719, + "mean_token_accuracy": 0.7306175827980042, + "num_tokens": 5026413.0, + "step": 4965, + "train/ce_loss": 0.8822122812271118 + }, + { + "epoch": 0.4909036978445719, + "step": 4965, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.4909036978445719, + "step": 4965, + "train/total_loss": 0.1663462221622467 + }, + { + "entropy": 9.187026977539062, + "epoch": 0.4910025706940874, + "mean_token_accuracy": 0.7484737634658813, + "num_tokens": 5031678.0, + "step": 4966, + "train/ce_loss": 0.9261685013771057 + }, + { + "epoch": 0.4910025706940874, + "step": 4966, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.4910025706940874, + "step": 4966, + "train/total_loss": 0.15511685609817505 + }, + { + "entropy": 9.472979545593262, + "epoch": 0.4911014435436029, + "mean_token_accuracy": 0.6653944253921509, + "num_tokens": 5036938.0, + "step": 4967, + "train/ce_loss": 2.3817548751831055 + }, + { + "epoch": 0.4911014435436029, + "step": 4967, + "train/sim_loss": 0.171875 + }, + { + "epoch": 0.4911014435436029, + "step": 4967, + "train/total_loss": 0.41005051136016846 + }, + { + "entropy": 8.875371932983398, + "epoch": 0.49120031639311845, + "mean_token_accuracy": 0.7755857110023499, + "num_tokens": 5042166.0, + "step": 4968, + "train/ce_loss": 8.783146768109873e-06 + }, + { + "epoch": 0.49120031639311845, + "step": 4968, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.49120031639311845, + "step": 4968, + "train/total_loss": 0.03906337916851044 + }, + { + "entropy": 9.897775650024414, + "epoch": 0.491299189242634, + "mean_token_accuracy": 0.7455621361732483, + "num_tokens": 5047056.0, + "step": 4969, + "train/ce_loss": 0.8315859436988831 + }, + { + "epoch": 0.491299189242634, + "step": 4969, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.491299189242634, + "step": 4969, + "train/total_loss": 0.13393984735012054 + }, + { + "entropy": 9.3680419921875, + "epoch": 0.4913980620921495, + "mean_token_accuracy": 0.725874125957489, + "num_tokens": 5052190.0, + "step": 4970, + "train/ce_loss": 1.4078030586242676 + }, + { + "epoch": 0.4913980620921495, + "step": 4970, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.4913980620921495, + "step": 4970, + "train/total_loss": 0.19156156480312347 + }, + { + "entropy": 9.263813018798828, + "epoch": 0.491496934941665, + "mean_token_accuracy": 0.6920152306556702, + "num_tokens": 5057409.0, + "step": 4971, + "train/ce_loss": 1.0340186236135196e-06 + }, + { + "epoch": 0.491496934941665, + "step": 4971, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.491496934941665, + "step": 4971, + "train/total_loss": 0.023437604308128357 + }, + { + "entropy": 9.463968276977539, + "epoch": 0.49159580779118056, + "mean_token_accuracy": 0.7051724195480347, + "num_tokens": 5062430.0, + "step": 4972, + "train/ce_loss": 2.8040617507940624e-06 + }, + { + "epoch": 0.49159580779118056, + "step": 4972, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.49159580779118056, + "step": 4972, + "train/total_loss": 0.039062779396772385 + }, + { + "entropy": 9.55792236328125, + "epoch": 0.49169468064069605, + "mean_token_accuracy": 0.675302267074585, + "num_tokens": 5067455.0, + "step": 4973, + "train/ce_loss": 1.6470224863951444e-06 + }, + { + "epoch": 0.49169468064069605, + "step": 4973, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.49169468064069605, + "step": 4973, + "train/total_loss": 0.05078141391277313 + }, + { + "entropy": 9.557991027832031, + "epoch": 0.4917935534902116, + "mean_token_accuracy": 0.7660256624221802, + "num_tokens": 5072514.0, + "step": 4974, + "train/ce_loss": 2.7783050882135285e-06 + }, + { + "epoch": 0.4917935534902116, + "step": 4974, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.4917935534902116, + "step": 4974, + "train/total_loss": 0.050781529396772385 + }, + { + "entropy": 9.60000228881836, + "epoch": 0.49189242633972713, + "mean_token_accuracy": 0.7406143546104431, + "num_tokens": 5077508.0, + "step": 4975, + "train/ce_loss": 1.644429403313552e-06 + }, + { + "epoch": 0.49189242633972713, + "step": 4975, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.49189242633972713, + "step": 4975, + "train/total_loss": 0.05078141391277313 + }, + { + "entropy": 9.294367790222168, + "epoch": 0.4919912991892426, + "mean_token_accuracy": 0.7238689661026001, + "num_tokens": 5082613.0, + "step": 4976, + "train/ce_loss": 1.3646399974822998 + }, + { + "epoch": 0.4919912991892426, + "step": 4976, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.4919912991892426, + "step": 4976, + "train/total_loss": 0.15990149974822998 + }, + { + "entropy": 9.06892204284668, + "epoch": 0.49209017203875816, + "mean_token_accuracy": 0.7161290049552917, + "num_tokens": 5087988.0, + "step": 4977, + "train/ce_loss": 1.189186930656433 + }, + { + "epoch": 0.49209017203875816, + "step": 4977, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.49209017203875816, + "step": 4977, + "train/total_loss": 0.22438743710517883 + }, + { + "entropy": 9.453010559082031, + "epoch": 0.4921890448882737, + "mean_token_accuracy": 0.7511811256408691, + "num_tokens": 5093055.0, + "step": 4978, + "train/ce_loss": 1.4672602415084839 + }, + { + "epoch": 0.4921890448882737, + "step": 4978, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.4921890448882737, + "step": 4978, + "train/total_loss": 0.22485102713108063 + }, + { + "entropy": 8.757181167602539, + "epoch": 0.4922879177377892, + "mean_token_accuracy": 0.7450593113899231, + "num_tokens": 5098498.0, + "step": 4979, + "train/ce_loss": 0.9394068121910095 + }, + { + "epoch": 0.4922879177377892, + "step": 4979, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.4922879177377892, + "step": 4979, + "train/total_loss": 0.16425317525863647 + }, + { + "epoch": 0.4923867905873047, + "grad_norm": 0.6681188941001892, + "learning_rate": 8.771448350887603e-06, + "loss": 0.1469, + "step": 4980 + }, + { + "entropy": 9.40768051147461, + "epoch": 0.4923867905873047, + "mean_token_accuracy": 0.7666666507720947, + "num_tokens": 5103610.0, + "step": 4980, + "train/ce_loss": 0.6188808083534241 + }, + { + "epoch": 0.4923867905873047, + "step": 4980, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.4923867905873047, + "step": 4980, + "train/total_loss": 0.12048183381557465 + }, + { + "entropy": 8.912360191345215, + "epoch": 0.49248566343682026, + "mean_token_accuracy": 0.7740046977996826, + "num_tokens": 5108923.0, + "step": 4981, + "train/ce_loss": 1.0425912141799927 + }, + { + "epoch": 0.49248566343682026, + "step": 4981, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.49248566343682026, + "step": 4981, + "train/total_loss": 0.15504038333892822 + }, + { + "entropy": 10.315068244934082, + "epoch": 0.49258453628633575, + "mean_token_accuracy": 0.7553191781044006, + "num_tokens": 5113504.0, + "step": 4982, + "train/ce_loss": 1.1097929927927908e-05 + }, + { + "epoch": 0.49258453628633575, + "step": 4982, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.49258453628633575, + "step": 4982, + "train/total_loss": 0.03515736013650894 + }, + { + "entropy": 9.888456344604492, + "epoch": 0.4926834091358513, + "mean_token_accuracy": 0.7823529243469238, + "num_tokens": 5118268.0, + "step": 4983, + "train/ce_loss": 0.65955650806427 + }, + { + "epoch": 0.4926834091358513, + "step": 4983, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.4926834091358513, + "step": 4983, + "train/total_loss": 0.09720565378665924 + }, + { + "entropy": 9.141186714172363, + "epoch": 0.49278228198536683, + "mean_token_accuracy": 0.737051784992218, + "num_tokens": 5123492.0, + "step": 4984, + "train/ce_loss": 0.6550917625427246 + }, + { + "epoch": 0.49278228198536683, + "step": 4984, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.49278228198536683, + "step": 4984, + "train/total_loss": 0.12800917029380798 + }, + { + "entropy": 9.001733779907227, + "epoch": 0.4928811548348823, + "mean_token_accuracy": 0.7353951930999756, + "num_tokens": 5128838.0, + "step": 4985, + "train/ce_loss": 0.6808602213859558 + }, + { + "epoch": 0.4928811548348823, + "step": 4985, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.4928811548348823, + "step": 4985, + "train/total_loss": 0.11496102064847946 + }, + { + "entropy": 9.17730712890625, + "epoch": 0.49298002768439786, + "mean_token_accuracy": 0.6987951993942261, + "num_tokens": 5134064.0, + "step": 4986, + "train/ce_loss": 1.2225298881530762 + }, + { + "epoch": 0.49298002768439786, + "step": 4986, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.49298002768439786, + "step": 4986, + "train/total_loss": 0.21600300073623657 + }, + { + "entropy": 9.292339324951172, + "epoch": 0.4930789005339134, + "mean_token_accuracy": 0.7302452325820923, + "num_tokens": 5139220.0, + "step": 4987, + "train/ce_loss": 1.093064308166504 + }, + { + "epoch": 0.4930789005339134, + "step": 4987, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.4930789005339134, + "step": 4987, + "train/total_loss": 0.1561814248561859 + }, + { + "entropy": 9.520135879516602, + "epoch": 0.4931777733834289, + "mean_token_accuracy": 0.6859259009361267, + "num_tokens": 5144375.0, + "step": 4988, + "train/ce_loss": 0.881460428237915 + }, + { + "epoch": 0.4931777733834289, + "step": 4988, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.4931777733834289, + "step": 4988, + "train/total_loss": 0.17408354580402374 + }, + { + "entropy": 9.749824523925781, + "epoch": 0.4932766462329444, + "mean_token_accuracy": 0.75262051820755, + "num_tokens": 5149261.0, + "step": 4989, + "train/ce_loss": 3.8940765989536885e-06 + }, + { + "epoch": 0.4932766462329444, + "step": 4989, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.4932766462329444, + "step": 4989, + "train/total_loss": 0.06250038743019104 + }, + { + "entropy": 8.942789077758789, + "epoch": 0.49337551908245997, + "mean_token_accuracy": 0.7583429217338562, + "num_tokens": 5154595.0, + "step": 4990, + "train/ce_loss": 0.8096892237663269 + }, + { + "epoch": 0.49337551908245997, + "step": 4990, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.49337551908245997, + "step": 4990, + "train/total_loss": 0.1395626664161682 + }, + { + "entropy": 9.447273254394531, + "epoch": 0.49347439193197545, + "mean_token_accuracy": 0.7243067026138306, + "num_tokens": 5159656.0, + "step": 4991, + "train/ce_loss": 0.8232438564300537 + }, + { + "epoch": 0.49347439193197545, + "step": 4991, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.49347439193197545, + "step": 4991, + "train/total_loss": 0.10185563564300537 + }, + { + "entropy": 9.18798542022705, + "epoch": 0.493573264781491, + "mean_token_accuracy": 0.7008872032165527, + "num_tokens": 5165099.0, + "step": 4992, + "train/ce_loss": 1.2660822868347168 + }, + { + "epoch": 0.493573264781491, + "step": 4992, + "train/sim_loss": 0.15625 + }, + { + "epoch": 0.493573264781491, + "step": 4992, + "train/total_loss": 0.2828582525253296 + }, + { + "entropy": 8.92338752746582, + "epoch": 0.49367213763100654, + "mean_token_accuracy": 0.7684674859046936, + "num_tokens": 5170491.0, + "step": 4993, + "train/ce_loss": 0.6353029608726501 + }, + { + "epoch": 0.49367213763100654, + "step": 4993, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.49367213763100654, + "step": 4993, + "train/total_loss": 0.07915529608726501 + }, + { + "entropy": 8.678300857543945, + "epoch": 0.493771010480522, + "mean_token_accuracy": 0.7744680643081665, + "num_tokens": 5175913.0, + "step": 4994, + "train/ce_loss": 0.8314459323883057 + }, + { + "epoch": 0.493771010480522, + "step": 4994, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.493771010480522, + "step": 4994, + "train/total_loss": 0.12611335515975952 + }, + { + "entropy": 8.942366600036621, + "epoch": 0.49386988333003756, + "mean_token_accuracy": 0.7802874445915222, + "num_tokens": 5181510.0, + "step": 4995, + "train/ce_loss": 0.6299203038215637 + }, + { + "epoch": 0.49386988333003756, + "step": 4995, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.49386988333003756, + "step": 4995, + "train/total_loss": 0.12939828634262085 + }, + { + "entropy": 9.119218826293945, + "epoch": 0.4939687561795531, + "mean_token_accuracy": 0.72826087474823, + "num_tokens": 5186811.0, + "step": 4996, + "train/ce_loss": 0.5254350304603577 + }, + { + "epoch": 0.4939687561795531, + "step": 4996, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.4939687561795531, + "step": 4996, + "train/total_loss": 0.075981006026268 + }, + { + "entropy": 9.664339065551758, + "epoch": 0.49406762902906864, + "mean_token_accuracy": 0.7256944179534912, + "num_tokens": 5191835.0, + "step": 4997, + "train/ce_loss": 0.9205860495567322 + }, + { + "epoch": 0.49406762902906864, + "step": 4997, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.49406762902906864, + "step": 4997, + "train/total_loss": 0.13893359899520874 + }, + { + "entropy": 8.853754043579102, + "epoch": 0.49416650187858413, + "mean_token_accuracy": 0.7513691186904907, + "num_tokens": 5197280.0, + "step": 4998, + "train/ce_loss": 0.4986410140991211 + }, + { + "epoch": 0.49416650187858413, + "step": 4998, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.49416650187858413, + "step": 4998, + "train/total_loss": 0.13189534842967987 + }, + { + "entropy": 9.307087898254395, + "epoch": 0.49426537472809967, + "mean_token_accuracy": 0.6903669834136963, + "num_tokens": 5202112.0, + "step": 4999, + "train/ce_loss": 2.536309242248535 + }, + { + "epoch": 0.49426537472809967, + "step": 4999, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.49426537472809967, + "step": 4999, + "train/total_loss": 0.31613093614578247 + }, + { + "epoch": 0.4943642475776152, + "grad_norm": 0.978468656539917, + "learning_rate": 8.766503486129655e-06, + "loss": 0.1417, + "step": 5000 + }, + { + "entropy": 9.330076217651367, + "epoch": 0.4943642475776152, + "mean_token_accuracy": 0.7440559267997742, + "num_tokens": 5207271.0, + "step": 5000, + "train/ce_loss": 1.2702337503433228 + }, + { + "epoch": 0.4943642475776152, + "step": 5000, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.4943642475776152, + "step": 5000, + "train/total_loss": 0.2559296488761902 + }, + { + "entropy": 8.971585273742676, + "epoch": 0.4944631204271307, + "mean_token_accuracy": 0.7370203137397766, + "num_tokens": 5212763.0, + "step": 5001, + "train/ce_loss": 0.84300297498703 + }, + { + "epoch": 0.4944631204271307, + "step": 5001, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.4944631204271307, + "step": 5001, + "train/total_loss": 0.16242530941963196 + }, + { + "entropy": 8.718502044677734, + "epoch": 0.49456199327664624, + "mean_token_accuracy": 0.7404162287712097, + "num_tokens": 5218152.0, + "step": 5002, + "train/ce_loss": 1.145225167274475 + }, + { + "epoch": 0.49456199327664624, + "step": 5002, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.49456199327664624, + "step": 5002, + "train/total_loss": 0.2082725167274475 + }, + { + "entropy": 9.640294075012207, + "epoch": 0.4946608661261618, + "mean_token_accuracy": 0.7985865473747253, + "num_tokens": 5223156.0, + "step": 5003, + "train/ce_loss": 0.8569324612617493 + }, + { + "epoch": 0.4946608661261618, + "step": 5003, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.4946608661261618, + "step": 5003, + "train/total_loss": 0.12866199016571045 + }, + { + "entropy": 9.10590934753418, + "epoch": 0.49475973897567727, + "mean_token_accuracy": 0.6654135584831238, + "num_tokens": 5228396.0, + "step": 5004, + "train/ce_loss": 2.4376211058552144e-06 + }, + { + "epoch": 0.49475973897567727, + "step": 5004, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.49475973897567727, + "step": 5004, + "train/total_loss": 0.0468752421438694 + }, + { + "entropy": 8.985590934753418, + "epoch": 0.4948586118251928, + "mean_token_accuracy": 0.724602222442627, + "num_tokens": 5233636.0, + "step": 5005, + "train/ce_loss": 0.7069535255432129 + }, + { + "epoch": 0.4948586118251928, + "step": 5005, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.4948586118251928, + "step": 5005, + "train/total_loss": 0.11757035553455353 + }, + { + "entropy": 9.321735382080078, + "epoch": 0.49495748467470835, + "mean_token_accuracy": 0.7353760600090027, + "num_tokens": 5238815.0, + "step": 5006, + "train/ce_loss": 0.682883083820343 + }, + { + "epoch": 0.49495748467470835, + "step": 5006, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.49495748467470835, + "step": 5006, + "train/total_loss": 0.13078831136226654 + }, + { + "entropy": 9.03097152709961, + "epoch": 0.49505635752422383, + "mean_token_accuracy": 0.6895734667778015, + "num_tokens": 5244130.0, + "step": 5007, + "train/ce_loss": 1.4150376319885254 + }, + { + "epoch": 0.49505635752422383, + "step": 5007, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.49505635752422383, + "step": 5007, + "train/total_loss": 0.18447251617908478 + }, + { + "entropy": 9.243738174438477, + "epoch": 0.4951552303737394, + "mean_token_accuracy": 0.82201087474823, + "num_tokens": 5249280.0, + "step": 5008, + "train/ce_loss": 0.3235234320163727 + }, + { + "epoch": 0.4951552303737394, + "step": 5008, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.4951552303737394, + "step": 5008, + "train/total_loss": 0.05969609320163727 + }, + { + "entropy": 9.48786735534668, + "epoch": 0.4952541032232549, + "mean_token_accuracy": 0.7474600672721863, + "num_tokens": 5254421.0, + "step": 5009, + "train/ce_loss": 0.7793648838996887 + }, + { + "epoch": 0.4952541032232549, + "step": 5009, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.4952541032232549, + "step": 5009, + "train/total_loss": 0.18731150031089783 + }, + { + "entropy": 9.293548583984375, + "epoch": 0.4953529760727704, + "mean_token_accuracy": 0.647606372833252, + "num_tokens": 5259564.0, + "step": 5010, + "train/ce_loss": 8.262034612016578e-07 + }, + { + "epoch": 0.4953529760727704, + "step": 5010, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.4953529760727704, + "step": 5010, + "train/total_loss": 0.023437581956386566 + }, + { + "entropy": 8.798707962036133, + "epoch": 0.49545184892228594, + "mean_token_accuracy": 0.7243852615356445, + "num_tokens": 5265004.0, + "step": 5011, + "train/ce_loss": 0.8004510402679443 + }, + { + "epoch": 0.49545184892228594, + "step": 5011, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.49545184892228594, + "step": 5011, + "train/total_loss": 0.15035760402679443 + }, + { + "entropy": 8.94283676147461, + "epoch": 0.4955507217718015, + "mean_token_accuracy": 0.7059496641159058, + "num_tokens": 5270370.0, + "step": 5012, + "train/ce_loss": 0.8203482031822205 + }, + { + "epoch": 0.4955507217718015, + "step": 5012, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.4955507217718015, + "step": 5012, + "train/total_loss": 0.17969107627868652 + }, + { + "entropy": 9.285341262817383, + "epoch": 0.49564959462131697, + "mean_token_accuracy": 0.7302799224853516, + "num_tokens": 5275594.0, + "step": 5013, + "train/ce_loss": 0.4494679272174835 + }, + { + "epoch": 0.49564959462131697, + "step": 5013, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.49564959462131697, + "step": 5013, + "train/total_loss": 0.11916553974151611 + }, + { + "entropy": 8.98812198638916, + "epoch": 0.4957484674708325, + "mean_token_accuracy": 0.7211764454841614, + "num_tokens": 5280922.0, + "step": 5014, + "train/ce_loss": 0.7902920246124268 + }, + { + "epoch": 0.4957484674708325, + "step": 5014, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.4957484674708325, + "step": 5014, + "train/total_loss": 0.11027920246124268 + }, + { + "entropy": 8.832348823547363, + "epoch": 0.49584734032034805, + "mean_token_accuracy": 0.7766179442405701, + "num_tokens": 5286345.0, + "step": 5015, + "train/ce_loss": 0.5260584354400635 + }, + { + "epoch": 0.49584734032034805, + "step": 5015, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.49584734032034805, + "step": 5015, + "train/total_loss": 0.07213709503412247 + }, + { + "entropy": 9.457469940185547, + "epoch": 0.49594621316986354, + "mean_token_accuracy": 0.7450330853462219, + "num_tokens": 5291401.0, + "step": 5016, + "train/ce_loss": 0.8888313174247742 + }, + { + "epoch": 0.49594621316986354, + "step": 5016, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.49594621316986354, + "step": 5016, + "train/total_loss": 0.18653938174247742 + }, + { + "entropy": 8.61086368560791, + "epoch": 0.4960450860193791, + "mean_token_accuracy": 0.7685185074806213, + "num_tokens": 5296942.0, + "step": 5017, + "train/ce_loss": 0.8579637408256531 + }, + { + "epoch": 0.4960450860193791, + "step": 5017, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.4960450860193791, + "step": 5017, + "train/total_loss": 0.14829638600349426 + }, + { + "entropy": 8.979242324829102, + "epoch": 0.4961439588688946, + "mean_token_accuracy": 0.7670329809188843, + "num_tokens": 5302309.0, + "step": 5018, + "train/ce_loss": 0.7281983494758606 + }, + { + "epoch": 0.4961439588688946, + "step": 5018, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.4961439588688946, + "step": 5018, + "train/total_loss": 0.12360108643770218 + }, + { + "entropy": 8.694494247436523, + "epoch": 0.4962428317184101, + "mean_token_accuracy": 0.76382976770401, + "num_tokens": 5307742.0, + "step": 5019, + "train/ce_loss": 0.7149852514266968 + }, + { + "epoch": 0.4962428317184101, + "step": 5019, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.4962428317184101, + "step": 5019, + "train/total_loss": 0.09493602812290192 + }, + { + "epoch": 0.49634170456792565, + "grad_norm": 0.708148181438446, + "learning_rate": 8.761558621371706e-06, + "loss": 0.1442, + "step": 5020 + }, + { + "entropy": 9.002897262573242, + "epoch": 0.49634170456792565, + "mean_token_accuracy": 0.7137203216552734, + "num_tokens": 5312990.0, + "step": 5020, + "train/ce_loss": 0.8256528377532959 + }, + { + "epoch": 0.49634170456792565, + "step": 5020, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.49634170456792565, + "step": 5020, + "train/total_loss": 0.1333465278148651 + }, + { + "entropy": 8.83686351776123, + "epoch": 0.4964405774174412, + "mean_token_accuracy": 0.7586981058120728, + "num_tokens": 5318355.0, + "step": 5021, + "train/ce_loss": 0.8046140670776367 + }, + { + "epoch": 0.4964405774174412, + "step": 5021, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.4964405774174412, + "step": 5021, + "train/total_loss": 0.12343015521764755 + }, + { + "entropy": 9.446584701538086, + "epoch": 0.4965394502669567, + "mean_token_accuracy": 0.7009803652763367, + "num_tokens": 5323403.0, + "step": 5022, + "train/ce_loss": 3.4185343338322127e-06 + }, + { + "epoch": 0.4965394502669567, + "step": 5022, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.4965394502669567, + "step": 5022, + "train/total_loss": 0.03125034272670746 + }, + { + "entropy": 9.473800659179688, + "epoch": 0.4966383231164722, + "mean_token_accuracy": 0.754687488079071, + "num_tokens": 5328506.0, + "step": 5023, + "train/ce_loss": 1.2287180423736572 + }, + { + "epoch": 0.4966383231164722, + "step": 5023, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.4966383231164722, + "step": 5023, + "train/total_loss": 0.19709056615829468 + }, + { + "entropy": 8.936258316040039, + "epoch": 0.49673719596598775, + "mean_token_accuracy": 0.7270588278770447, + "num_tokens": 5333952.0, + "step": 5024, + "train/ce_loss": 1.431602120399475 + }, + { + "epoch": 0.49673719596598775, + "step": 5024, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.49673719596598775, + "step": 5024, + "train/total_loss": 0.24862895905971527 + }, + { + "entropy": 8.994499206542969, + "epoch": 0.49683606881550324, + "mean_token_accuracy": 0.7741203308105469, + "num_tokens": 5339275.0, + "step": 5025, + "train/ce_loss": 0.5269079208374023 + }, + { + "epoch": 0.49683606881550324, + "step": 5025, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.49683606881550324, + "step": 5025, + "train/total_loss": 0.064409539103508 + }, + { + "entropy": 8.922952651977539, + "epoch": 0.4969349416650188, + "mean_token_accuracy": 0.7412333488464355, + "num_tokens": 5344477.0, + "step": 5026, + "train/ce_loss": 0.9101450443267822 + }, + { + "epoch": 0.4969349416650188, + "step": 5026, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.4969349416650188, + "step": 5026, + "train/total_loss": 0.13007700443267822 + }, + { + "entropy": 8.915870666503906, + "epoch": 0.4970338145145343, + "mean_token_accuracy": 0.756394624710083, + "num_tokens": 5349750.0, + "step": 5027, + "train/ce_loss": 0.9538937211036682 + }, + { + "epoch": 0.4970338145145343, + "step": 5027, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.4970338145145343, + "step": 5027, + "train/total_loss": 0.18132686614990234 + }, + { + "entropy": 8.895801544189453, + "epoch": 0.4971326873640498, + "mean_token_accuracy": 0.7528795599937439, + "num_tokens": 5355175.0, + "step": 5028, + "train/ce_loss": 0.5815815329551697 + }, + { + "epoch": 0.4971326873640498, + "step": 5028, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4971326873640498, + "step": 5028, + "train/total_loss": 0.11284565925598145 + }, + { + "entropy": 8.86722469329834, + "epoch": 0.49723156021356535, + "mean_token_accuracy": 0.7579185366630554, + "num_tokens": 5360546.0, + "step": 5029, + "train/ce_loss": 0.5811032652854919 + }, + { + "epoch": 0.49723156021356535, + "step": 5029, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.49723156021356535, + "step": 5029, + "train/total_loss": 0.1127978265285492 + }, + { + "entropy": 9.113153457641602, + "epoch": 0.4973304330630809, + "mean_token_accuracy": 0.7327001094818115, + "num_tokens": 5365752.0, + "step": 5030, + "train/ce_loss": 0.8052600026130676 + }, + { + "epoch": 0.4973304330630809, + "step": 5030, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4973304330630809, + "step": 5030, + "train/total_loss": 0.13521349430084229 + }, + { + "entropy": 9.06174087524414, + "epoch": 0.4974293059125964, + "mean_token_accuracy": 0.7424072027206421, + "num_tokens": 5371100.0, + "step": 5031, + "train/ce_loss": 0.5513002872467041 + }, + { + "epoch": 0.4974293059125964, + "step": 5031, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.4974293059125964, + "step": 5031, + "train/total_loss": 0.08247378468513489 + }, + { + "entropy": 9.803543090820312, + "epoch": 0.4975281787621119, + "mean_token_accuracy": 0.7523364424705505, + "num_tokens": 5375937.0, + "step": 5032, + "train/ce_loss": 3.624947339631035e-06 + }, + { + "epoch": 0.4975281787621119, + "step": 5032, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.4975281787621119, + "step": 5032, + "train/total_loss": 0.0234378632158041 + }, + { + "entropy": 8.803306579589844, + "epoch": 0.49762705161162746, + "mean_token_accuracy": 0.7314629554748535, + "num_tokens": 5381452.0, + "step": 5033, + "train/ce_loss": 0.923446774482727 + }, + { + "epoch": 0.49762705161162746, + "step": 5033, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.49762705161162746, + "step": 5033, + "train/total_loss": 0.19781342148780823 + }, + { + "entropy": 9.446455001831055, + "epoch": 0.49772592446114294, + "mean_token_accuracy": 0.7828371524810791, + "num_tokens": 5386481.0, + "step": 5034, + "train/ce_loss": 1.076431393623352 + }, + { + "epoch": 0.49772592446114294, + "step": 5034, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.49772592446114294, + "step": 5034, + "train/total_loss": 0.13889314234256744 + }, + { + "entropy": 9.671392440795898, + "epoch": 0.4978247973106585, + "mean_token_accuracy": 0.8042105436325073, + "num_tokens": 5391399.0, + "step": 5035, + "train/ce_loss": 0.9019742012023926 + }, + { + "epoch": 0.4978247973106585, + "step": 5035, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.4978247973106585, + "step": 5035, + "train/total_loss": 0.12144742161035538 + }, + { + "entropy": 8.512903213500977, + "epoch": 0.497923670160174, + "mean_token_accuracy": 0.7120291590690613, + "num_tokens": 5396742.0, + "step": 5036, + "train/ce_loss": 1.0357922315597534 + }, + { + "epoch": 0.497923670160174, + "step": 5036, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.497923670160174, + "step": 5036, + "train/total_loss": 0.13482922315597534 + }, + { + "entropy": 9.497211456298828, + "epoch": 0.49802254300968957, + "mean_token_accuracy": 0.7265501022338867, + "num_tokens": 5401805.0, + "step": 5037, + "train/ce_loss": 0.9477840065956116 + }, + { + "epoch": 0.49802254300968957, + "step": 5037, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.49802254300968957, + "step": 5037, + "train/total_loss": 0.1611846536397934 + }, + { + "entropy": 9.765968322753906, + "epoch": 0.49812141585920505, + "mean_token_accuracy": 0.745920717716217, + "num_tokens": 5406604.0, + "step": 5038, + "train/ce_loss": 3.671036438390729e-06 + }, + { + "epoch": 0.49812141585920505, + "step": 5038, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.49812141585920505, + "step": 5038, + "train/total_loss": 0.03125036880373955 + }, + { + "entropy": 8.735824584960938, + "epoch": 0.4982202887087206, + "mean_token_accuracy": 0.7946635484695435, + "num_tokens": 5411940.0, + "step": 5039, + "train/ce_loss": 0.4332225024700165 + }, + { + "epoch": 0.4982202887087206, + "step": 5039, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.4982202887087206, + "step": 5039, + "train/total_loss": 0.07066600024700165 + }, + { + "epoch": 0.49831916155823613, + "grad_norm": 0.6704682111740112, + "learning_rate": 8.756613756613758e-06, + "loss": 0.1323, + "step": 5040 + }, + { + "entropy": 9.419351577758789, + "epoch": 0.49831916155823613, + "mean_token_accuracy": 0.702786386013031, + "num_tokens": 5417029.0, + "step": 5040, + "train/ce_loss": 1.07545006275177 + }, + { + "epoch": 0.49831916155823613, + "step": 5040, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.49831916155823613, + "step": 5040, + "train/total_loss": 0.16223251819610596 + }, + { + "entropy": 9.154134750366211, + "epoch": 0.4984180344077516, + "mean_token_accuracy": 0.7070844769477844, + "num_tokens": 5422191.0, + "step": 5041, + "train/ce_loss": 0.9596898555755615 + }, + { + "epoch": 0.4984180344077516, + "step": 5041, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4984180344077516, + "step": 5041, + "train/total_loss": 0.15065649151802063 + }, + { + "entropy": 8.867704391479492, + "epoch": 0.49851690725726716, + "mean_token_accuracy": 0.7425968050956726, + "num_tokens": 5427569.0, + "step": 5042, + "train/ce_loss": 1.0225627422332764 + }, + { + "epoch": 0.49851690725726716, + "step": 5042, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.49851690725726716, + "step": 5042, + "train/total_loss": 0.19210001826286316 + }, + { + "entropy": 8.442657470703125, + "epoch": 0.4986157801067827, + "mean_token_accuracy": 0.7573149800300598, + "num_tokens": 5433229.0, + "step": 5043, + "train/ce_loss": 0.5535759329795837 + }, + { + "epoch": 0.4986157801067827, + "step": 5043, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.4986157801067827, + "step": 5043, + "train/total_loss": 0.08270134031772614 + }, + { + "entropy": 8.905779838562012, + "epoch": 0.4987146529562982, + "mean_token_accuracy": 0.6979293823242188, + "num_tokens": 5438570.0, + "step": 5044, + "train/ce_loss": 1.03871750831604 + }, + { + "epoch": 0.4987146529562982, + "step": 5044, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.4987146529562982, + "step": 5044, + "train/total_loss": 0.14293426275253296 + }, + { + "entropy": 8.809632301330566, + "epoch": 0.49881352580581373, + "mean_token_accuracy": 0.7377220392227173, + "num_tokens": 5443984.0, + "step": 5045, + "train/ce_loss": 0.9378743767738342 + }, + { + "epoch": 0.49881352580581373, + "step": 5045, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.49881352580581373, + "step": 5045, + "train/total_loss": 0.16019368171691895 + }, + { + "entropy": 9.212902069091797, + "epoch": 0.49891239865532927, + "mean_token_accuracy": 0.7313432693481445, + "num_tokens": 5449114.0, + "step": 5046, + "train/ce_loss": 0.8752282857894897 + }, + { + "epoch": 0.49891239865532927, + "step": 5046, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.49891239865532927, + "step": 5046, + "train/total_loss": 0.20861658453941345 + }, + { + "entropy": 9.310138702392578, + "epoch": 0.49901127150484476, + "mean_token_accuracy": 0.7603833675384521, + "num_tokens": 5454223.0, + "step": 5047, + "train/ce_loss": 0.8113348484039307 + }, + { + "epoch": 0.49901127150484476, + "step": 5047, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.49901127150484476, + "step": 5047, + "train/total_loss": 0.11628973484039307 + }, + { + "entropy": 8.914920806884766, + "epoch": 0.4991101443543603, + "mean_token_accuracy": 0.6916950941085815, + "num_tokens": 5459572.0, + "step": 5048, + "train/ce_loss": 0.7768692970275879 + }, + { + "epoch": 0.4991101443543603, + "step": 5048, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.4991101443543603, + "step": 5048, + "train/total_loss": 0.14799943566322327 + }, + { + "entropy": 9.293927192687988, + "epoch": 0.49920901720387584, + "mean_token_accuracy": 0.7607913613319397, + "num_tokens": 5464588.0, + "step": 5049, + "train/ce_loss": 0.628460705280304 + }, + { + "epoch": 0.49920901720387584, + "step": 5049, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.49920901720387584, + "step": 5049, + "train/total_loss": 0.14097106456756592 + }, + { + "entropy": 8.986357688903809, + "epoch": 0.4993078900533913, + "mean_token_accuracy": 0.7927711009979248, + "num_tokens": 5469900.0, + "step": 5050, + "train/ce_loss": 0.8069027662277222 + }, + { + "epoch": 0.4993078900533913, + "step": 5050, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.4993078900533913, + "step": 5050, + "train/total_loss": 0.17053402960300446 + }, + { + "entropy": 8.951379776000977, + "epoch": 0.49940676290290686, + "mean_token_accuracy": 0.7124999761581421, + "num_tokens": 5475275.0, + "step": 5051, + "train/ce_loss": 1.2251209020614624 + }, + { + "epoch": 0.49940676290290686, + "step": 5051, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.49940676290290686, + "step": 5051, + "train/total_loss": 0.1576683521270752 + }, + { + "entropy": 8.93557071685791, + "epoch": 0.4995056357524224, + "mean_token_accuracy": 0.7454545497894287, + "num_tokens": 5480505.0, + "step": 5052, + "train/ce_loss": 0.6807805895805359 + }, + { + "epoch": 0.4995056357524224, + "step": 5052, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.4995056357524224, + "step": 5052, + "train/total_loss": 0.09542181342840195 + }, + { + "entropy": 8.674575805664062, + "epoch": 0.4996045086019379, + "mean_token_accuracy": 0.7017892599105835, + "num_tokens": 5485975.0, + "step": 5053, + "train/ce_loss": 0.5238415598869324 + }, + { + "epoch": 0.4996045086019379, + "step": 5053, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.4996045086019379, + "step": 5053, + "train/total_loss": 0.107071653008461 + }, + { + "entropy": 9.003825187683105, + "epoch": 0.49970338145145343, + "mean_token_accuracy": 0.7698323726654053, + "num_tokens": 5491353.0, + "step": 5054, + "train/ce_loss": 0.5558789372444153 + }, + { + "epoch": 0.49970338145145343, + "step": 5054, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.49970338145145343, + "step": 5054, + "train/total_loss": 0.10246289521455765 + }, + { + "entropy": 9.294960021972656, + "epoch": 0.499802254300969, + "mean_token_accuracy": 0.6948148012161255, + "num_tokens": 5496551.0, + "step": 5055, + "train/ce_loss": 1.2571916580200195 + }, + { + "epoch": 0.499802254300969, + "step": 5055, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.499802254300969, + "step": 5055, + "train/total_loss": 0.19603167474269867 + }, + { + "entropy": 8.980306625366211, + "epoch": 0.49990112715048446, + "mean_token_accuracy": 0.7334109544754028, + "num_tokens": 5501879.0, + "step": 5056, + "train/ce_loss": 0.9166693091392517 + }, + { + "epoch": 0.49990112715048446, + "step": 5056, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.49990112715048446, + "step": 5056, + "train/total_loss": 0.14635443687438965 + }, + { + "entropy": 8.993450164794922, + "epoch": 0.5, + "mean_token_accuracy": 0.7285714149475098, + "num_tokens": 5507152.0, + "step": 5057, + "train/ce_loss": 1.3876967430114746 + }, + { + "epoch": 0.5, + "step": 5057, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5, + "step": 5057, + "train/total_loss": 0.20126967132091522 + }, + { + "entropy": 9.219903945922852, + "epoch": 0.5000988728495155, + "mean_token_accuracy": 0.7168141603469849, + "num_tokens": 5512277.0, + "step": 5058, + "train/ce_loss": 0.6238693594932556 + }, + { + "epoch": 0.5000988728495155, + "step": 5058, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5000988728495155, + "step": 5058, + "train/total_loss": 0.10144943743944168 + }, + { + "entropy": 8.974245071411133, + "epoch": 0.5001977456990311, + "mean_token_accuracy": 0.7571234703063965, + "num_tokens": 5517552.0, + "step": 5059, + "train/ce_loss": 1.0427740812301636 + }, + { + "epoch": 0.5001977456990311, + "step": 5059, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5001977456990311, + "step": 5059, + "train/total_loss": 0.16677740216255188 + }, + { + "epoch": 0.5002966185485466, + "grad_norm": 0.7067264914512634, + "learning_rate": 8.751668891855809e-06, + "loss": 0.148, + "step": 5060 + }, + { + "entropy": 9.101495742797852, + "epoch": 0.5002966185485466, + "mean_token_accuracy": 0.7624831199645996, + "num_tokens": 5522769.0, + "step": 5060, + "train/ce_loss": 0.7306716442108154 + }, + { + "epoch": 0.5002966185485466, + "step": 5060, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5002966185485466, + "step": 5060, + "train/total_loss": 0.13556715846061707 + }, + { + "entropy": 9.395573616027832, + "epoch": 0.500395491398062, + "mean_token_accuracy": 0.7682119011878967, + "num_tokens": 5527923.0, + "step": 5061, + "train/ce_loss": 1.0372035503387451 + }, + { + "epoch": 0.500395491398062, + "step": 5061, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.500395491398062, + "step": 5061, + "train/total_loss": 0.12715786695480347 + }, + { + "entropy": 8.775077819824219, + "epoch": 0.5004943642475776, + "mean_token_accuracy": 0.7476922869682312, + "num_tokens": 5533379.0, + "step": 5062, + "train/ce_loss": 0.8463733196258545 + }, + { + "epoch": 0.5004943642475776, + "step": 5062, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.5004943642475776, + "step": 5062, + "train/total_loss": 0.1666685938835144 + }, + { + "entropy": 9.09598159790039, + "epoch": 0.5005932370970931, + "mean_token_accuracy": 0.7230273485183716, + "num_tokens": 5538438.0, + "step": 5063, + "train/ce_loss": 1.003156304359436 + }, + { + "epoch": 0.5005932370970931, + "step": 5063, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5005932370970931, + "step": 5063, + "train/total_loss": 0.1550031304359436 + }, + { + "entropy": 9.751112937927246, + "epoch": 0.5006921099466086, + "mean_token_accuracy": 0.7576419115066528, + "num_tokens": 5543300.0, + "step": 5064, + "train/ce_loss": 1.6366160480174585e-06 + }, + { + "epoch": 0.5006921099466086, + "step": 5064, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5006921099466086, + "step": 5064, + "train/total_loss": 0.019531413912773132 + }, + { + "entropy": 9.934198379516602, + "epoch": 0.5007909827961242, + "mean_token_accuracy": 0.7223719954490662, + "num_tokens": 5548099.0, + "step": 5065, + "train/ce_loss": 1.7864806522993604e-06 + }, + { + "epoch": 0.5007909827961242, + "step": 5065, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5007909827961242, + "step": 5065, + "train/total_loss": 0.06250017881393433 + }, + { + "entropy": 9.23078727722168, + "epoch": 0.5008898556456397, + "mean_token_accuracy": 0.695067286491394, + "num_tokens": 5553218.0, + "step": 5066, + "train/ce_loss": 2.113192067554337e-06 + }, + { + "epoch": 0.5008898556456397, + "step": 5066, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5008898556456397, + "step": 5066, + "train/total_loss": 0.05468771234154701 + }, + { + "entropy": 8.907807350158691, + "epoch": 0.5009887284951552, + "mean_token_accuracy": 0.7356828451156616, + "num_tokens": 5558378.0, + "step": 5067, + "train/ce_loss": 3.088776111326297e-06 + }, + { + "epoch": 0.5009887284951552, + "step": 5067, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5009887284951552, + "step": 5067, + "train/total_loss": 0.03125030919909477 + }, + { + "entropy": 8.918107986450195, + "epoch": 0.5010876013446708, + "mean_token_accuracy": 0.7873620986938477, + "num_tokens": 5563990.0, + "step": 5068, + "train/ce_loss": 0.8910037279129028 + }, + { + "epoch": 0.5010876013446708, + "step": 5068, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5010876013446708, + "step": 5068, + "train/total_loss": 0.15550662577152252 + }, + { + "entropy": 9.20634937286377, + "epoch": 0.5011864741941863, + "mean_token_accuracy": 0.8256275057792664, + "num_tokens": 5569206.0, + "step": 5069, + "train/ce_loss": 1.3271516081658774e-06 + }, + { + "epoch": 0.5011864741941863, + "step": 5069, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.5011864741941863, + "step": 5069, + "train/total_loss": 0.08203138411045074 + }, + { + "entropy": 9.636856079101562, + "epoch": 0.5012853470437018, + "mean_token_accuracy": 0.8415637612342834, + "num_tokens": 5574152.0, + "step": 5070, + "train/ce_loss": 2.1191829091549153e-06 + }, + { + "epoch": 0.5012853470437018, + "step": 5070, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5012853470437018, + "step": 5070, + "train/total_loss": 0.03906271234154701 + }, + { + "entropy": 9.437459945678711, + "epoch": 0.5013842198932174, + "mean_token_accuracy": 0.7422680258750916, + "num_tokens": 5579077.0, + "step": 5071, + "train/ce_loss": 1.5218898852253915e-06 + }, + { + "epoch": 0.5013842198932174, + "step": 5071, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5013842198932174, + "step": 5071, + "train/total_loss": 0.04296890273690224 + }, + { + "entropy": 8.63110637664795, + "epoch": 0.5014830927427328, + "mean_token_accuracy": 0.7207123041152954, + "num_tokens": 5584571.0, + "step": 5072, + "train/ce_loss": 1.1009140014648438 + }, + { + "epoch": 0.5014830927427328, + "step": 5072, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.5014830927427328, + "step": 5072, + "train/total_loss": 0.2116539031267166 + }, + { + "entropy": 8.963809967041016, + "epoch": 0.5015819655922483, + "mean_token_accuracy": 0.7305764555931091, + "num_tokens": 5589899.0, + "step": 5073, + "train/ce_loss": 0.9008622169494629 + }, + { + "epoch": 0.5015819655922483, + "step": 5073, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.5015819655922483, + "step": 5073, + "train/total_loss": 0.1721174716949463 + }, + { + "entropy": 8.869132041931152, + "epoch": 0.5016808384417639, + "mean_token_accuracy": 0.7274800539016724, + "num_tokens": 5595264.0, + "step": 5074, + "train/ce_loss": 0.7225847244262695 + }, + { + "epoch": 0.5016808384417639, + "step": 5074, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5016808384417639, + "step": 5074, + "train/total_loss": 0.09569597244262695 + }, + { + "entropy": 9.230053901672363, + "epoch": 0.5017797112912794, + "mean_token_accuracy": 0.7642045617103577, + "num_tokens": 5600426.0, + "step": 5075, + "train/ce_loss": 1.804285034268105e-06 + }, + { + "epoch": 0.5017797112912794, + "step": 5075, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.5017797112912794, + "step": 5075, + "train/total_loss": 0.08203142881393433 + }, + { + "entropy": 8.775399208068848, + "epoch": 0.5018785841407949, + "mean_token_accuracy": 0.7524038553237915, + "num_tokens": 5605725.0, + "step": 5076, + "train/ce_loss": 0.5871142745018005 + }, + { + "epoch": 0.5018785841407949, + "step": 5076, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5018785841407949, + "step": 5076, + "train/total_loss": 0.10168017446994781 + }, + { + "entropy": 9.455656051635742, + "epoch": 0.5019774569903105, + "mean_token_accuracy": 0.7743785977363586, + "num_tokens": 5610706.0, + "step": 5077, + "train/ce_loss": 1.207939863204956 + }, + { + "epoch": 0.5019774569903105, + "step": 5077, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5019774569903105, + "step": 5077, + "train/total_loss": 0.16376274824142456 + }, + { + "entropy": 8.894990921020508, + "epoch": 0.502076329839826, + "mean_token_accuracy": 0.6934023499488831, + "num_tokens": 5615949.0, + "step": 5078, + "train/ce_loss": 0.9794240593910217 + }, + { + "epoch": 0.502076329839826, + "step": 5078, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.502076329839826, + "step": 5078, + "train/total_loss": 0.19950491189956665 + }, + { + "entropy": 8.97636604309082, + "epoch": 0.5021752026893415, + "mean_token_accuracy": 0.6861042380332947, + "num_tokens": 5621186.0, + "step": 5079, + "train/ce_loss": 1.6270090341567993 + }, + { + "epoch": 0.5021752026893415, + "step": 5079, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5021752026893415, + "step": 5079, + "train/total_loss": 0.21348215639591217 + }, + { + "epoch": 0.5022740755388571, + "grad_norm": 0.8323706984519958, + "learning_rate": 8.746724027097859e-06, + "loss": 0.1363, + "step": 5080 + }, + { + "entropy": 9.041561126708984, + "epoch": 0.5022740755388571, + "mean_token_accuracy": 0.6832579374313354, + "num_tokens": 5626592.0, + "step": 5080, + "train/ce_loss": 0.9357867240905762 + }, + { + "epoch": 0.5022740755388571, + "step": 5080, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5022740755388571, + "step": 5080, + "train/total_loss": 0.15217241644859314 + }, + { + "entropy": 9.809600830078125, + "epoch": 0.5023729483883725, + "mean_token_accuracy": 0.6909871101379395, + "num_tokens": 5631449.0, + "step": 5081, + "train/ce_loss": 2.0013485482195392e-06 + }, + { + "epoch": 0.5023729483883725, + "step": 5081, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5023729483883725, + "step": 5081, + "train/total_loss": 0.05078145116567612 + }, + { + "entropy": 9.92713737487793, + "epoch": 0.502471821237888, + "mean_token_accuracy": 0.6908315420150757, + "num_tokens": 5636336.0, + "step": 5082, + "train/ce_loss": 1.3149290084838867 + }, + { + "epoch": 0.502471821237888, + "step": 5082, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.502471821237888, + "step": 5082, + "train/total_loss": 0.16274289786815643 + }, + { + "entropy": 8.473834037780762, + "epoch": 0.5025706940874036, + "mean_token_accuracy": 0.7618147730827332, + "num_tokens": 5641881.0, + "step": 5083, + "train/ce_loss": 0.5337604284286499 + }, + { + "epoch": 0.5025706940874036, + "step": 5083, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5025706940874036, + "step": 5083, + "train/total_loss": 0.10806354880332947 + }, + { + "entropy": 8.596364974975586, + "epoch": 0.5026695669369191, + "mean_token_accuracy": 0.738070011138916, + "num_tokens": 5647324.0, + "step": 5084, + "train/ce_loss": 0.8980840444564819 + }, + { + "epoch": 0.5026695669369191, + "step": 5084, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5026695669369191, + "step": 5084, + "train/total_loss": 0.1288709044456482 + }, + { + "entropy": 9.061573028564453, + "epoch": 0.5027684397864346, + "mean_token_accuracy": 0.772020697593689, + "num_tokens": 5652538.0, + "step": 5085, + "train/ce_loss": 0.4668557643890381 + }, + { + "epoch": 0.5027684397864346, + "step": 5085, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5027684397864346, + "step": 5085, + "train/total_loss": 0.09746682643890381 + }, + { + "entropy": 9.521434783935547, + "epoch": 0.5028673126359502, + "mean_token_accuracy": 0.761168360710144, + "num_tokens": 5657519.0, + "step": 5086, + "train/ce_loss": 1.008385419845581 + }, + { + "epoch": 0.5028673126359502, + "step": 5086, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5028673126359502, + "step": 5086, + "train/total_loss": 0.1711510419845581 + }, + { + "entropy": 9.393722534179688, + "epoch": 0.5029661854854657, + "mean_token_accuracy": 0.7013698816299438, + "num_tokens": 5662607.0, + "step": 5087, + "train/ce_loss": 0.8183227777481079 + }, + { + "epoch": 0.5029661854854657, + "step": 5087, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5029661854854657, + "step": 5087, + "train/total_loss": 0.14042603969573975 + }, + { + "entropy": 9.471672058105469, + "epoch": 0.5030650583349812, + "mean_token_accuracy": 0.7374045848846436, + "num_tokens": 5667655.0, + "step": 5088, + "train/ce_loss": 1.2775394916534424 + }, + { + "epoch": 0.5030650583349812, + "step": 5088, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5030650583349812, + "step": 5088, + "train/total_loss": 0.17462895810604095 + }, + { + "entropy": 8.942428588867188, + "epoch": 0.5031639311844968, + "mean_token_accuracy": 0.7096773982048035, + "num_tokens": 5673086.0, + "step": 5089, + "train/ce_loss": 0.5040650367736816 + }, + { + "epoch": 0.5031639311844968, + "step": 5089, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.5031639311844968, + "step": 5089, + "train/total_loss": 0.13243775069713593 + }, + { + "entropy": 9.104717254638672, + "epoch": 0.5032628040340122, + "mean_token_accuracy": 0.7839999794960022, + "num_tokens": 5678425.0, + "step": 5090, + "train/ce_loss": 0.7472172379493713 + }, + { + "epoch": 0.5032628040340122, + "step": 5090, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5032628040340122, + "step": 5090, + "train/total_loss": 0.10597172379493713 + }, + { + "entropy": 8.89947509765625, + "epoch": 0.5033616768835277, + "mean_token_accuracy": 0.7226697206497192, + "num_tokens": 5683794.0, + "step": 5091, + "train/ce_loss": 1.0725902318954468 + }, + { + "epoch": 0.5033616768835277, + "step": 5091, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5033616768835277, + "step": 5091, + "train/total_loss": 0.16975903511047363 + }, + { + "entropy": 9.385111808776855, + "epoch": 0.5034605497330433, + "mean_token_accuracy": 0.7701492309570312, + "num_tokens": 5688932.0, + "step": 5092, + "train/ce_loss": 0.46031317114830017 + }, + { + "epoch": 0.5034605497330433, + "step": 5092, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5034605497330433, + "step": 5092, + "train/total_loss": 0.07728131860494614 + }, + { + "entropy": 9.301116943359375, + "epoch": 0.5035594225825588, + "mean_token_accuracy": 0.7307132482528687, + "num_tokens": 5694057.0, + "step": 5093, + "train/ce_loss": 0.8710409998893738 + }, + { + "epoch": 0.5035594225825588, + "step": 5093, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5035594225825588, + "step": 5093, + "train/total_loss": 0.13788536190986633 + }, + { + "entropy": 9.771774291992188, + "epoch": 0.5036582954320743, + "mean_token_accuracy": 0.752598762512207, + "num_tokens": 5698972.0, + "step": 5094, + "train/ce_loss": 1.4751821756362915 + }, + { + "epoch": 0.5036582954320743, + "step": 5094, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5036582954320743, + "step": 5094, + "train/total_loss": 0.18267446756362915 + }, + { + "entropy": 9.59885311126709, + "epoch": 0.5037571682815899, + "mean_token_accuracy": 0.8051947951316833, + "num_tokens": 5704009.0, + "step": 5095, + "train/ce_loss": 1.8043161844616407e-06 + }, + { + "epoch": 0.5037571682815899, + "step": 5095, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5037571682815899, + "step": 5095, + "train/total_loss": 0.042968928813934326 + }, + { + "entropy": 9.618340492248535, + "epoch": 0.5038560411311054, + "mean_token_accuracy": 0.7326202988624573, + "num_tokens": 5708996.0, + "step": 5096, + "train/ce_loss": 1.0726532764238073e-06 + }, + { + "epoch": 0.5038560411311054, + "step": 5096, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5038560411311054, + "step": 5096, + "train/total_loss": 0.023437608033418655 + }, + { + "entropy": 8.889970779418945, + "epoch": 0.5039549139806209, + "mean_token_accuracy": 0.7784877419471741, + "num_tokens": 5714431.0, + "step": 5097, + "train/ce_loss": 1.165142297744751 + }, + { + "epoch": 0.5039549139806209, + "step": 5097, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.5039549139806209, + "step": 5097, + "train/total_loss": 0.21417048573493958 + }, + { + "entropy": 9.063730239868164, + "epoch": 0.5040537868301365, + "mean_token_accuracy": 0.6886792182922363, + "num_tokens": 5719672.0, + "step": 5098, + "train/ce_loss": 1.0833317041397095 + }, + { + "epoch": 0.5040537868301365, + "step": 5098, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5040537868301365, + "step": 5098, + "train/total_loss": 0.13958317041397095 + }, + { + "entropy": 9.895206451416016, + "epoch": 0.504152659679652, + "mean_token_accuracy": 0.7579908967018127, + "num_tokens": 5724532.0, + "step": 5099, + "train/ce_loss": 1.2715253829956055 + }, + { + "epoch": 0.504152659679652, + "step": 5099, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.504152659679652, + "step": 5099, + "train/total_loss": 0.26777756214141846 + }, + { + "epoch": 0.5042515325291675, + "grad_norm": 0.8440897464752197, + "learning_rate": 8.741779162339911e-06, + "loss": 0.136, + "step": 5100 + }, + { + "entropy": 9.225937843322754, + "epoch": 0.5042515325291675, + "mean_token_accuracy": 0.7293333411216736, + "num_tokens": 5729720.0, + "step": 5100, + "train/ce_loss": 0.8220096230506897 + }, + { + "epoch": 0.5042515325291675, + "step": 5100, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5042515325291675, + "step": 5100, + "train/total_loss": 0.15251347422599792 + }, + { + "entropy": 9.440406799316406, + "epoch": 0.504350405378683, + "mean_token_accuracy": 0.7401032447814941, + "num_tokens": 5734798.0, + "step": 5101, + "train/ce_loss": 1.4222238063812256 + }, + { + "epoch": 0.504350405378683, + "step": 5101, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.504350405378683, + "step": 5101, + "train/total_loss": 0.22815988957881927 + }, + { + "entropy": 9.28703498840332, + "epoch": 0.5044492782281985, + "mean_token_accuracy": 0.7796609997749329, + "num_tokens": 5739901.0, + "step": 5102, + "train/ce_loss": 2.876155122066848e-06 + }, + { + "epoch": 0.5044492782281985, + "step": 5102, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5044492782281985, + "step": 5102, + "train/total_loss": 0.05468778684735298 + }, + { + "entropy": 8.814054489135742, + "epoch": 0.5045481510777141, + "mean_token_accuracy": 0.7667638659477234, + "num_tokens": 5745440.0, + "step": 5103, + "train/ce_loss": 0.5586997866630554 + }, + { + "epoch": 0.5045481510777141, + "step": 5103, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5045481510777141, + "step": 5103, + "train/total_loss": 0.07540123164653778 + }, + { + "entropy": 9.398179054260254, + "epoch": 0.5046470239272296, + "mean_token_accuracy": 0.7651515007019043, + "num_tokens": 5750815.0, + "step": 5104, + "train/ce_loss": 1.1095727682113647 + }, + { + "epoch": 0.5046470239272296, + "step": 5104, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.5046470239272296, + "step": 5104, + "train/total_loss": 0.1851760298013687 + }, + { + "entropy": 9.207182884216309, + "epoch": 0.5047458967767451, + "mean_token_accuracy": 0.7735334038734436, + "num_tokens": 5756036.0, + "step": 5105, + "train/ce_loss": 0.7678409814834595 + }, + { + "epoch": 0.5047458967767451, + "step": 5105, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.5047458967767451, + "step": 5105, + "train/total_loss": 0.10412784665822983 + }, + { + "entropy": 9.221953392028809, + "epoch": 0.5048447696262607, + "mean_token_accuracy": 0.7303370833396912, + "num_tokens": 5761183.0, + "step": 5106, + "train/ce_loss": 1.2139334678649902 + }, + { + "epoch": 0.5048447696262607, + "step": 5106, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5048447696262607, + "step": 5106, + "train/total_loss": 0.1760808527469635 + }, + { + "entropy": 8.855124473571777, + "epoch": 0.5049436424757762, + "mean_token_accuracy": 0.7108306884765625, + "num_tokens": 5766601.0, + "step": 5107, + "train/ce_loss": 1.0811680555343628 + }, + { + "epoch": 0.5049436424757762, + "step": 5107, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5049436424757762, + "step": 5107, + "train/total_loss": 0.16280430555343628 + }, + { + "entropy": 8.905277252197266, + "epoch": 0.5050425153252917, + "mean_token_accuracy": 0.7352085113525391, + "num_tokens": 5772022.0, + "step": 5108, + "train/ce_loss": 0.4442138969898224 + }, + { + "epoch": 0.5050425153252917, + "step": 5108, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5050425153252917, + "step": 5108, + "train/total_loss": 0.09129638969898224 + }, + { + "entropy": 9.026247024536133, + "epoch": 0.5051413881748072, + "mean_token_accuracy": 0.7465224266052246, + "num_tokens": 5777074.0, + "step": 5109, + "train/ce_loss": 1.2243313789367676 + }, + { + "epoch": 0.5051413881748072, + "step": 5109, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.5051413881748072, + "step": 5109, + "train/total_loss": 0.200558140873909 + }, + { + "entropy": 8.925554275512695, + "epoch": 0.5052402610243227, + "mean_token_accuracy": 0.7467144727706909, + "num_tokens": 5782392.0, + "step": 5110, + "train/ce_loss": 0.7173311114311218 + }, + { + "epoch": 0.5052402610243227, + "step": 5110, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5052402610243227, + "step": 5110, + "train/total_loss": 0.11470185965299606 + }, + { + "entropy": 9.078937530517578, + "epoch": 0.5053391338738382, + "mean_token_accuracy": 0.7120000123977661, + "num_tokens": 5787672.0, + "step": 5111, + "train/ce_loss": 1.3853713274002075 + }, + { + "epoch": 0.5053391338738382, + "step": 5111, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5053391338738382, + "step": 5111, + "train/total_loss": 0.20103713870048523 + }, + { + "entropy": 9.134469985961914, + "epoch": 0.5054380067233538, + "mean_token_accuracy": 0.7195122241973877, + "num_tokens": 5792963.0, + "step": 5112, + "train/ce_loss": 0.4597097933292389 + }, + { + "epoch": 0.5054380067233538, + "step": 5112, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5054380067233538, + "step": 5112, + "train/total_loss": 0.09284597635269165 + }, + { + "entropy": 8.931024551391602, + "epoch": 0.5055368795728693, + "mean_token_accuracy": 0.7422459721565247, + "num_tokens": 5798547.0, + "step": 5113, + "train/ce_loss": 1.419727087020874 + }, + { + "epoch": 0.5055368795728693, + "step": 5113, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.5055368795728693, + "step": 5113, + "train/total_loss": 0.24744145572185516 + }, + { + "entropy": 8.888985633850098, + "epoch": 0.5056357524223848, + "mean_token_accuracy": 0.7433217167854309, + "num_tokens": 5803878.0, + "step": 5114, + "train/ce_loss": 0.7526900768280029 + }, + { + "epoch": 0.5056357524223848, + "step": 5114, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.5056357524223848, + "step": 5114, + "train/total_loss": 0.17683151364326477 + }, + { + "entropy": 8.805142402648926, + "epoch": 0.5057346252719004, + "mean_token_accuracy": 0.7789815664291382, + "num_tokens": 5809279.0, + "step": 5115, + "train/ce_loss": 0.848304808139801 + }, + { + "epoch": 0.5057346252719004, + "step": 5115, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5057346252719004, + "step": 5115, + "train/total_loss": 0.13561174273490906 + }, + { + "entropy": 9.011064529418945, + "epoch": 0.5058334981214159, + "mean_token_accuracy": 0.7626146674156189, + "num_tokens": 5814627.0, + "step": 5116, + "train/ce_loss": 0.5869022607803345 + }, + { + "epoch": 0.5058334981214159, + "step": 5116, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5058334981214159, + "step": 5116, + "train/total_loss": 0.12119022756814957 + }, + { + "entropy": 9.190732955932617, + "epoch": 0.5059323709709314, + "mean_token_accuracy": 0.7644628286361694, + "num_tokens": 5819870.0, + "step": 5117, + "train/ce_loss": 1.3621413472719723e-06 + }, + { + "epoch": 0.5059323709709314, + "step": 5117, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5059323709709314, + "step": 5117, + "train/total_loss": 0.03515638783574104 + }, + { + "entropy": 9.612802505493164, + "epoch": 0.506031243820447, + "mean_token_accuracy": 0.7743902206420898, + "num_tokens": 5824818.0, + "step": 5118, + "train/ce_loss": 1.2309508323669434 + }, + { + "epoch": 0.506031243820447, + "step": 5118, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.506031243820447, + "step": 5118, + "train/total_loss": 0.1543450951576233 + }, + { + "entropy": 8.730239868164062, + "epoch": 0.5061301166699624, + "mean_token_accuracy": 0.7402597665786743, + "num_tokens": 5830222.0, + "step": 5119, + "train/ce_loss": 1.0233420133590698 + }, + { + "epoch": 0.5061301166699624, + "step": 5119, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.5061301166699624, + "step": 5119, + "train/total_loss": 0.18827170133590698 + }, + { + "epoch": 0.5062289895194779, + "grad_norm": 0.7438586354255676, + "learning_rate": 8.736834297581962e-06, + "loss": 0.1341, + "step": 5120 + }, + { + "entropy": 9.60583209991455, + "epoch": 0.5062289895194779, + "mean_token_accuracy": 0.7167530059814453, + "num_tokens": 5835240.0, + "step": 5120, + "train/ce_loss": 0.6151160001754761 + }, + { + "epoch": 0.5062289895194779, + "step": 5120, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5062289895194779, + "step": 5120, + "train/total_loss": 0.10057410597801208 + }, + { + "entropy": 9.251870155334473, + "epoch": 0.5063278623689935, + "mean_token_accuracy": 0.7232267260551453, + "num_tokens": 5840406.0, + "step": 5121, + "train/ce_loss": 1.176452087747748e-06 + }, + { + "epoch": 0.5063278623689935, + "step": 5121, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5063278623689935, + "step": 5121, + "train/total_loss": 0.05078136920928955 + }, + { + "entropy": 8.833154678344727, + "epoch": 0.506426735218509, + "mean_token_accuracy": 0.7489919066429138, + "num_tokens": 5845853.0, + "step": 5122, + "train/ce_loss": 0.9704189896583557 + }, + { + "epoch": 0.506426735218509, + "step": 5122, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.506426735218509, + "step": 5122, + "train/total_loss": 0.17516690492630005 + }, + { + "entropy": 8.725252151489258, + "epoch": 0.5065256080680245, + "mean_token_accuracy": 0.7568534016609192, + "num_tokens": 5851194.0, + "step": 5123, + "train/ce_loss": 1.1533256769180298 + }, + { + "epoch": 0.5065256080680245, + "step": 5123, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.5065256080680245, + "step": 5123, + "train/total_loss": 0.20908257365226746 + }, + { + "entropy": 8.7609224319458, + "epoch": 0.5066244809175401, + "mean_token_accuracy": 0.7646474838256836, + "num_tokens": 5856675.0, + "step": 5124, + "train/ce_loss": 1.1014946699142456 + }, + { + "epoch": 0.5066244809175401, + "step": 5124, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5066244809175401, + "step": 5124, + "train/total_loss": 0.14530572295188904 + }, + { + "entropy": 8.80714225769043, + "epoch": 0.5067233537670556, + "mean_token_accuracy": 0.7259439826011658, + "num_tokens": 5861970.0, + "step": 5125, + "train/ce_loss": 0.9872626662254333 + }, + { + "epoch": 0.5067233537670556, + "step": 5125, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5067233537670556, + "step": 5125, + "train/total_loss": 0.1456012725830078 + }, + { + "entropy": 9.590246200561523, + "epoch": 0.5068222266165711, + "mean_token_accuracy": 0.7184000015258789, + "num_tokens": 5867050.0, + "step": 5126, + "train/ce_loss": 1.390322208404541 + }, + { + "epoch": 0.5068222266165711, + "step": 5126, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5068222266165711, + "step": 5126, + "train/total_loss": 0.20543847978115082 + }, + { + "entropy": 9.42148208618164, + "epoch": 0.5069210994660867, + "mean_token_accuracy": 0.7205169796943665, + "num_tokens": 5872079.0, + "step": 5127, + "train/ce_loss": 1.5720489025115967 + }, + { + "epoch": 0.5069210994660867, + "step": 5127, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5069210994660867, + "step": 5127, + "train/total_loss": 0.21579864621162415 + }, + { + "entropy": 9.098977088928223, + "epoch": 0.5070199723156021, + "mean_token_accuracy": 0.7375504970550537, + "num_tokens": 5877255.0, + "step": 5128, + "train/ce_loss": 0.5713717937469482 + }, + { + "epoch": 0.5070199723156021, + "step": 5128, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5070199723156021, + "step": 5128, + "train/total_loss": 0.10401217639446259 + }, + { + "entropy": 8.864845275878906, + "epoch": 0.5071188451651176, + "mean_token_accuracy": 0.790673553943634, + "num_tokens": 5882717.0, + "step": 5129, + "train/ce_loss": 0.46934014558792114 + }, + { + "epoch": 0.5071188451651176, + "step": 5129, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5071188451651176, + "step": 5129, + "train/total_loss": 0.07818401604890823 + }, + { + "entropy": 9.221084594726562, + "epoch": 0.5072177180146332, + "mean_token_accuracy": 0.7324561476707458, + "num_tokens": 5887871.0, + "step": 5130, + "train/ce_loss": 1.0543817281723022 + }, + { + "epoch": 0.5072177180146332, + "step": 5130, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5072177180146332, + "step": 5130, + "train/total_loss": 0.14840692281723022 + }, + { + "entropy": 9.405488967895508, + "epoch": 0.5073165908641487, + "mean_token_accuracy": 0.7632450461387634, + "num_tokens": 5892911.0, + "step": 5131, + "train/ce_loss": 0.8800402283668518 + }, + { + "epoch": 0.5073165908641487, + "step": 5131, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5073165908641487, + "step": 5131, + "train/total_loss": 0.15441027283668518 + }, + { + "entropy": 8.841413497924805, + "epoch": 0.5074154637136642, + "mean_token_accuracy": 0.7932535409927368, + "num_tokens": 5898280.0, + "step": 5132, + "train/ce_loss": 0.5812978744506836 + }, + { + "epoch": 0.5074154637136642, + "step": 5132, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5074154637136642, + "step": 5132, + "train/total_loss": 0.10109853744506836 + }, + { + "entropy": 9.707983016967773, + "epoch": 0.5075143365631798, + "mean_token_accuracy": 0.78899085521698, + "num_tokens": 5903242.0, + "step": 5133, + "train/ce_loss": 0.8480736613273621 + }, + { + "epoch": 0.5075143365631798, + "step": 5133, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5075143365631798, + "step": 5133, + "train/total_loss": 0.1199636161327362 + }, + { + "entropy": 8.927094459533691, + "epoch": 0.5076132094126953, + "mean_token_accuracy": 0.6925795078277588, + "num_tokens": 5908548.0, + "step": 5134, + "train/ce_loss": 1.1896045207977295 + }, + { + "epoch": 0.5076132094126953, + "step": 5134, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.5076132094126953, + "step": 5134, + "train/total_loss": 0.2361479550600052 + }, + { + "entropy": 9.103012084960938, + "epoch": 0.5077120822622108, + "mean_token_accuracy": 0.7661388516426086, + "num_tokens": 5913852.0, + "step": 5135, + "train/ce_loss": 0.8272993564605713 + }, + { + "epoch": 0.5077120822622108, + "step": 5135, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5077120822622108, + "step": 5135, + "train/total_loss": 0.14132368564605713 + }, + { + "entropy": 9.943624496459961, + "epoch": 0.5078109551117264, + "mean_token_accuracy": 0.695035457611084, + "num_tokens": 5918679.0, + "step": 5136, + "train/ce_loss": 2.4764817680988926e-06 + }, + { + "epoch": 0.5078109551117264, + "step": 5136, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5078109551117264, + "step": 5136, + "train/total_loss": 0.0507814958691597 + }, + { + "entropy": 9.284172058105469, + "epoch": 0.5079098279612418, + "mean_token_accuracy": 0.7684729099273682, + "num_tokens": 5923756.0, + "step": 5137, + "train/ce_loss": 1.1524968147277832 + }, + { + "epoch": 0.5079098279612418, + "step": 5137, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.5079098279612418, + "step": 5137, + "train/total_loss": 0.14259344339370728 + }, + { + "entropy": 9.191142082214355, + "epoch": 0.5080087008107573, + "mean_token_accuracy": 0.7150635123252869, + "num_tokens": 5928742.0, + "step": 5138, + "train/ce_loss": 0.8419607877731323 + }, + { + "epoch": 0.5080087008107573, + "step": 5138, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5080087008107573, + "step": 5138, + "train/total_loss": 0.1388835906982422 + }, + { + "entropy": 9.891863822937012, + "epoch": 0.5081075736602729, + "mean_token_accuracy": 0.8214285969734192, + "num_tokens": 5933591.0, + "step": 5139, + "train/ce_loss": 2.745048732322175e-06 + }, + { + "epoch": 0.5081075736602729, + "step": 5139, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5081075736602729, + "step": 5139, + "train/total_loss": 0.023437773808836937 + }, + { + "epoch": 0.5082064465097884, + "grad_norm": 0.8406258225440979, + "learning_rate": 8.731889432824014e-06, + "loss": 0.1341, + "step": 5140 + }, + { + "entropy": 9.00814151763916, + "epoch": 0.5082064465097884, + "mean_token_accuracy": 0.7981545329093933, + "num_tokens": 5938939.0, + "step": 5140, + "train/ce_loss": 0.4677835702896118 + }, + { + "epoch": 0.5082064465097884, + "step": 5140, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5082064465097884, + "step": 5140, + "train/total_loss": 0.0663096085190773 + }, + { + "entropy": 9.357422828674316, + "epoch": 0.5083053193593039, + "mean_token_accuracy": 0.716911792755127, + "num_tokens": 5943938.0, + "step": 5141, + "train/ce_loss": 0.9411023259162903 + }, + { + "epoch": 0.5083053193593039, + "step": 5141, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5083053193593039, + "step": 5141, + "train/total_loss": 0.14879773557186127 + }, + { + "entropy": 9.110038757324219, + "epoch": 0.5084041922088195, + "mean_token_accuracy": 0.7844611406326294, + "num_tokens": 5949220.0, + "step": 5142, + "train/ce_loss": 0.8312951326370239 + }, + { + "epoch": 0.5084041922088195, + "step": 5142, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5084041922088195, + "step": 5142, + "train/total_loss": 0.14562952518463135 + }, + { + "entropy": 9.562170028686523, + "epoch": 0.508503065058335, + "mean_token_accuracy": 0.7077465057373047, + "num_tokens": 5954258.0, + "step": 5143, + "train/ce_loss": 2.580452701295144e-06 + }, + { + "epoch": 0.508503065058335, + "step": 5143, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.508503065058335, + "step": 5143, + "train/total_loss": 0.08593776077032089 + }, + { + "entropy": 9.183984756469727, + "epoch": 0.5086019379078505, + "mean_token_accuracy": 0.7325000166893005, + "num_tokens": 5959454.0, + "step": 5144, + "train/ce_loss": 0.8328563570976257 + }, + { + "epoch": 0.5086019379078505, + "step": 5144, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5086019379078505, + "step": 5144, + "train/total_loss": 0.1457856297492981 + }, + { + "entropy": 9.396947860717773, + "epoch": 0.5087008107573661, + "mean_token_accuracy": 0.7349768877029419, + "num_tokens": 5964502.0, + "step": 5145, + "train/ce_loss": 1.4666695594787598 + }, + { + "epoch": 0.5087008107573661, + "step": 5145, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5087008107573661, + "step": 5145, + "train/total_loss": 0.19354195892810822 + }, + { + "entropy": 9.139756202697754, + "epoch": 0.5087996836068815, + "mean_token_accuracy": 0.7391874194145203, + "num_tokens": 5969739.0, + "step": 5146, + "train/ce_loss": 0.9517946839332581 + }, + { + "epoch": 0.5087996836068815, + "step": 5146, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.5087996836068815, + "step": 5146, + "train/total_loss": 0.1850232183933258 + }, + { + "entropy": 8.929136276245117, + "epoch": 0.508898556456397, + "mean_token_accuracy": 0.7115117907524109, + "num_tokens": 5974947.0, + "step": 5147, + "train/ce_loss": 0.612743616104126 + }, + { + "epoch": 0.508898556456397, + "step": 5147, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.508898556456397, + "step": 5147, + "train/total_loss": 0.11205561459064484 + }, + { + "entropy": 9.182073593139648, + "epoch": 0.5089974293059126, + "mean_token_accuracy": 0.692307710647583, + "num_tokens": 5980120.0, + "step": 5148, + "train/ce_loss": 0.8845396041870117 + }, + { + "epoch": 0.5089974293059126, + "step": 5148, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5089974293059126, + "step": 5148, + "train/total_loss": 0.1548602133989334 + }, + { + "entropy": 9.10268783569336, + "epoch": 0.5090963021554281, + "mean_token_accuracy": 0.698952853679657, + "num_tokens": 5985355.0, + "step": 5149, + "train/ce_loss": 0.7536455988883972 + }, + { + "epoch": 0.5090963021554281, + "step": 5149, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5090963021554281, + "step": 5149, + "train/total_loss": 0.11833330988883972 + }, + { + "entropy": 9.613290786743164, + "epoch": 0.5091951750049436, + "mean_token_accuracy": 0.7426470518112183, + "num_tokens": 5990114.0, + "step": 5150, + "train/ce_loss": 2.3226072788238525 + }, + { + "epoch": 0.5091951750049436, + "step": 5150, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5091951750049436, + "step": 5150, + "train/total_loss": 0.28694823384284973 + }, + { + "entropy": 9.525121688842773, + "epoch": 0.5092940478544592, + "mean_token_accuracy": 0.7599999904632568, + "num_tokens": 5995171.0, + "step": 5151, + "train/ce_loss": 1.2991708517074585 + }, + { + "epoch": 0.5092940478544592, + "step": 5151, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.5092940478544592, + "step": 5151, + "train/total_loss": 0.20413583517074585 + }, + { + "entropy": 8.902762413024902, + "epoch": 0.5093929207039747, + "mean_token_accuracy": 0.7444589138031006, + "num_tokens": 6000442.0, + "step": 5152, + "train/ce_loss": 1.308106541633606 + }, + { + "epoch": 0.5093929207039747, + "step": 5152, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.5093929207039747, + "step": 5152, + "train/total_loss": 0.2050294131040573 + }, + { + "entropy": 9.54294204711914, + "epoch": 0.5094917935534902, + "mean_token_accuracy": 0.7561779022216797, + "num_tokens": 6005433.0, + "step": 5153, + "train/ce_loss": 0.9077314138412476 + }, + { + "epoch": 0.5094917935534902, + "step": 5153, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.5094917935534902, + "step": 5153, + "train/total_loss": 0.16889813542366028 + }, + { + "entropy": 9.558610916137695, + "epoch": 0.5095906664030058, + "mean_token_accuracy": 0.7542662024497986, + "num_tokens": 6010451.0, + "step": 5154, + "train/ce_loss": 0.6818474531173706 + }, + { + "epoch": 0.5095906664030058, + "step": 5154, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5095906664030058, + "step": 5154, + "train/total_loss": 0.1228722482919693 + }, + { + "entropy": 9.01298713684082, + "epoch": 0.5096895392525213, + "mean_token_accuracy": 0.7458563446998596, + "num_tokens": 6015843.0, + "step": 5155, + "train/ce_loss": 1.1367805004119873 + }, + { + "epoch": 0.5096895392525213, + "step": 5155, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.5096895392525213, + "step": 5155, + "train/total_loss": 0.21524055302143097 + }, + { + "entropy": 8.672857284545898, + "epoch": 0.5097884121020367, + "mean_token_accuracy": 0.7292870879173279, + "num_tokens": 6021363.0, + "step": 5156, + "train/ce_loss": 1.090519905090332 + }, + { + "epoch": 0.5097884121020367, + "step": 5156, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.5097884121020367, + "step": 5156, + "train/total_loss": 0.18717700242996216 + }, + { + "entropy": 9.39059829711914, + "epoch": 0.5098872849515523, + "mean_token_accuracy": 0.7779456377029419, + "num_tokens": 6026521.0, + "step": 5157, + "train/ce_loss": 0.7553310990333557 + }, + { + "epoch": 0.5098872849515523, + "step": 5157, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5098872849515523, + "step": 5157, + "train/total_loss": 0.12631437182426453 + }, + { + "entropy": 9.064903259277344, + "epoch": 0.5099861578010678, + "mean_token_accuracy": 0.7238442897796631, + "num_tokens": 6031797.0, + "step": 5158, + "train/ce_loss": 1.1853233575820923 + }, + { + "epoch": 0.5099861578010678, + "step": 5158, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.5099861578010678, + "step": 5158, + "train/total_loss": 0.23181357979774475 + }, + { + "entropy": 9.55142593383789, + "epoch": 0.5100850306505833, + "mean_token_accuracy": 0.7651122808456421, + "num_tokens": 6036802.0, + "step": 5159, + "train/ce_loss": 0.8801589608192444 + }, + { + "epoch": 0.5100850306505833, + "step": 5159, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.5100850306505833, + "step": 5159, + "train/total_loss": 0.16223464906215668 + }, + { + "epoch": 0.5101839035000989, + "grad_norm": 0.7660681009292603, + "learning_rate": 8.726944568066063e-06, + "loss": 0.1418, + "step": 5160 + }, + { + "entropy": 8.865266799926758, + "epoch": 0.5101839035000989, + "mean_token_accuracy": 0.7838745713233948, + "num_tokens": 6042172.0, + "step": 5160, + "train/ce_loss": 0.6156168580055237 + }, + { + "epoch": 0.5101839035000989, + "step": 5160, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5101839035000989, + "step": 5160, + "train/total_loss": 0.0928116887807846 + }, + { + "entropy": 9.807842254638672, + "epoch": 0.5102827763496144, + "mean_token_accuracy": 0.7559633255004883, + "num_tokens": 6047103.0, + "step": 5161, + "train/ce_loss": 1.5274415016174316 + }, + { + "epoch": 0.5102827763496144, + "step": 5161, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5102827763496144, + "step": 5161, + "train/total_loss": 0.22305665910243988 + }, + { + "entropy": 8.848272323608398, + "epoch": 0.5103816491991299, + "mean_token_accuracy": 0.7283511161804199, + "num_tokens": 6052342.0, + "step": 5162, + "train/ce_loss": 0.5998438596725464 + }, + { + "epoch": 0.5103816491991299, + "step": 5162, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5103816491991299, + "step": 5162, + "train/total_loss": 0.13029688596725464 + }, + { + "entropy": 9.544805526733398, + "epoch": 0.5104805220486455, + "mean_token_accuracy": 0.6873747706413269, + "num_tokens": 6057300.0, + "step": 5163, + "train/ce_loss": 1.176200032234192 + }, + { + "epoch": 0.5104805220486455, + "step": 5163, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5104805220486455, + "step": 5163, + "train/total_loss": 0.17230750620365143 + }, + { + "entropy": 9.44500732421875, + "epoch": 0.510579394898161, + "mean_token_accuracy": 0.7471264600753784, + "num_tokens": 6062382.0, + "step": 5164, + "train/ce_loss": 0.8489091992378235 + }, + { + "epoch": 0.510579394898161, + "step": 5164, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.510579394898161, + "step": 5164, + "train/total_loss": 0.1005159243941307 + }, + { + "entropy": 9.15986442565918, + "epoch": 0.5106782677476764, + "mean_token_accuracy": 0.715068519115448, + "num_tokens": 6067603.0, + "step": 5165, + "train/ce_loss": 0.9437806606292725 + }, + { + "epoch": 0.5106782677476764, + "step": 5165, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5106782677476764, + "step": 5165, + "train/total_loss": 0.12953431904315948 + }, + { + "entropy": 9.356000900268555, + "epoch": 0.510777140597192, + "mean_token_accuracy": 0.8156862854957581, + "num_tokens": 6072833.0, + "step": 5166, + "train/ce_loss": 0.4159621596336365 + }, + { + "epoch": 0.510777140597192, + "step": 5166, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.510777140597192, + "step": 5166, + "train/total_loss": 0.05722121521830559 + }, + { + "entropy": 8.838285446166992, + "epoch": 0.5108760134467075, + "mean_token_accuracy": 0.7408313155174255, + "num_tokens": 6078100.0, + "step": 5167, + "train/ce_loss": 0.7657850980758667 + }, + { + "epoch": 0.5108760134467075, + "step": 5167, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5108760134467075, + "step": 5167, + "train/total_loss": 0.11173476278781891 + }, + { + "entropy": 9.36933708190918, + "epoch": 0.510974886296223, + "mean_token_accuracy": 0.7630813717842102, + "num_tokens": 6083249.0, + "step": 5168, + "train/ce_loss": 1.2221713066101074 + }, + { + "epoch": 0.510974886296223, + "step": 5168, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.510974886296223, + "step": 5168, + "train/total_loss": 0.21206088364124298 + }, + { + "entropy": 9.319719314575195, + "epoch": 0.5110737591457386, + "mean_token_accuracy": 0.7013372778892517, + "num_tokens": 6088357.0, + "step": 5169, + "train/ce_loss": 9.366481776851288e-07 + }, + { + "epoch": 0.5110737591457386, + "step": 5169, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5110737591457386, + "step": 5169, + "train/total_loss": 0.01953134313225746 + }, + { + "entropy": 9.192684173583984, + "epoch": 0.5111726319952541, + "mean_token_accuracy": 0.7037037014961243, + "num_tokens": 6093557.0, + "step": 5170, + "train/ce_loss": 0.6855148077011108 + }, + { + "epoch": 0.5111726319952541, + "step": 5170, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5111726319952541, + "step": 5170, + "train/total_loss": 0.11152023077011108 + }, + { + "entropy": 8.690927505493164, + "epoch": 0.5112715048447696, + "mean_token_accuracy": 0.7698113322257996, + "num_tokens": 6099088.0, + "step": 5171, + "train/ce_loss": 0.9583162069320679 + }, + { + "epoch": 0.5112715048447696, + "step": 5171, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.5112715048447696, + "step": 5171, + "train/total_loss": 0.19348788261413574 + }, + { + "entropy": 9.323888778686523, + "epoch": 0.5113703776942852, + "mean_token_accuracy": 0.7824859023094177, + "num_tokens": 6104239.0, + "step": 5172, + "train/ce_loss": 0.878183126449585 + }, + { + "epoch": 0.5113703776942852, + "step": 5172, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5113703776942852, + "step": 5172, + "train/total_loss": 0.11125581711530685 + }, + { + "entropy": 9.09682559967041, + "epoch": 0.5114692505438007, + "mean_token_accuracy": 0.7463235259056091, + "num_tokens": 6109557.0, + "step": 5173, + "train/ce_loss": 0.5591075420379639 + }, + { + "epoch": 0.5114692505438007, + "step": 5173, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5114692505438007, + "step": 5173, + "train/total_loss": 0.11450450122356415 + }, + { + "entropy": 9.161510467529297, + "epoch": 0.5115681233933161, + "mean_token_accuracy": 0.7394958138465881, + "num_tokens": 6114768.0, + "step": 5174, + "train/ce_loss": 0.5803232192993164 + }, + { + "epoch": 0.5115681233933161, + "step": 5174, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.5115681233933161, + "step": 5174, + "train/total_loss": 0.1439698189496994 + }, + { + "entropy": 9.101654052734375, + "epoch": 0.5116669962428317, + "mean_token_accuracy": 0.7426108121871948, + "num_tokens": 6120001.0, + "step": 5175, + "train/ce_loss": 1.0444610118865967 + }, + { + "epoch": 0.5116669962428317, + "step": 5175, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5116669962428317, + "step": 5175, + "train/total_loss": 0.16303986310958862 + }, + { + "entropy": 8.89405632019043, + "epoch": 0.5117658690923472, + "mean_token_accuracy": 0.7247706651687622, + "num_tokens": 6125273.0, + "step": 5176, + "train/ce_loss": 0.5537621974945068 + }, + { + "epoch": 0.5117658690923472, + "step": 5176, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5117658690923472, + "step": 5176, + "train/total_loss": 0.11396996676921844 + }, + { + "entropy": 9.195863723754883, + "epoch": 0.5118647419418627, + "mean_token_accuracy": 0.7087666988372803, + "num_tokens": 6130373.0, + "step": 5177, + "train/ce_loss": 0.8780604600906372 + }, + { + "epoch": 0.5118647419418627, + "step": 5177, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5118647419418627, + "step": 5177, + "train/total_loss": 0.13468104600906372 + }, + { + "entropy": 8.717813491821289, + "epoch": 0.5119636147913783, + "mean_token_accuracy": 0.7383177280426025, + "num_tokens": 6135845.0, + "step": 5178, + "train/ce_loss": 1.014733910560608 + }, + { + "epoch": 0.5119636147913783, + "step": 5178, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.5119636147913783, + "step": 5178, + "train/total_loss": 0.1952233910560608 + }, + { + "entropy": 9.682064056396484, + "epoch": 0.5120624876408938, + "mean_token_accuracy": 0.6962843537330627, + "num_tokens": 6140897.0, + "step": 5179, + "train/ce_loss": 2.111259698867798 + }, + { + "epoch": 0.5120624876408938, + "step": 5179, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.5120624876408938, + "step": 5179, + "train/total_loss": 0.2892509698867798 + }, + { + "epoch": 0.5121613604904093, + "grad_norm": 0.7663262486457825, + "learning_rate": 8.721999703308115e-06, + "loss": 0.1382, + "step": 5180 + }, + { + "entropy": 9.39062213897705, + "epoch": 0.5121613604904093, + "mean_token_accuracy": 0.7942073345184326, + "num_tokens": 6146025.0, + "step": 5180, + "train/ce_loss": 2.2146377887111157e-06 + }, + { + "epoch": 0.5121613604904093, + "step": 5180, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5121613604904093, + "step": 5180, + "train/total_loss": 0.03906271979212761 + }, + { + "entropy": 9.252467155456543, + "epoch": 0.5122602333399249, + "mean_token_accuracy": 0.7272727489471436, + "num_tokens": 6151153.0, + "step": 5181, + "train/ce_loss": 1.0725518465042114 + }, + { + "epoch": 0.5122602333399249, + "step": 5181, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.5122602333399249, + "step": 5181, + "train/total_loss": 0.18928644061088562 + }, + { + "entropy": 9.536457061767578, + "epoch": 0.5123591061894404, + "mean_token_accuracy": 0.7960526347160339, + "num_tokens": 6156175.0, + "step": 5182, + "train/ce_loss": 0.8920789957046509 + }, + { + "epoch": 0.5123591061894404, + "step": 5182, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.5123591061894404, + "step": 5182, + "train/total_loss": 0.11655165255069733 + }, + { + "entropy": 9.23127555847168, + "epoch": 0.512457979038956, + "mean_token_accuracy": 0.743697464466095, + "num_tokens": 6161354.0, + "step": 5183, + "train/ce_loss": 1.3253401517868042 + }, + { + "epoch": 0.512457979038956, + "step": 5183, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.512457979038956, + "step": 5183, + "train/total_loss": 0.19503401219844818 + }, + { + "entropy": 8.898605346679688, + "epoch": 0.5125568518884714, + "mean_token_accuracy": 0.7256944179534912, + "num_tokens": 6166738.0, + "step": 5184, + "train/ce_loss": 0.4349287450313568 + }, + { + "epoch": 0.5125568518884714, + "step": 5184, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5125568518884714, + "step": 5184, + "train/total_loss": 0.0669303759932518 + }, + { + "entropy": 8.950806617736816, + "epoch": 0.5126557247379869, + "mean_token_accuracy": 0.7852272987365723, + "num_tokens": 6172117.0, + "step": 5185, + "train/ce_loss": 0.8759341239929199 + }, + { + "epoch": 0.5126557247379869, + "step": 5185, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5126557247379869, + "step": 5185, + "train/total_loss": 0.14618715643882751 + }, + { + "entropy": 8.852334022521973, + "epoch": 0.5127545975875025, + "mean_token_accuracy": 0.7770069241523743, + "num_tokens": 6177593.0, + "step": 5186, + "train/ce_loss": 0.664239764213562 + }, + { + "epoch": 0.5127545975875025, + "step": 5186, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.5127545975875025, + "step": 5186, + "train/total_loss": 0.08204897493124008 + }, + { + "entropy": 9.912579536437988, + "epoch": 0.512853470437018, + "mean_token_accuracy": 0.6643192768096924, + "num_tokens": 6182401.0, + "step": 5187, + "train/ce_loss": 1.1493152379989624 + }, + { + "epoch": 0.512853470437018, + "step": 5187, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.512853470437018, + "step": 5187, + "train/total_loss": 0.17743152379989624 + }, + { + "entropy": 9.319049835205078, + "epoch": 0.5129523432865335, + "mean_token_accuracy": 0.7262872457504272, + "num_tokens": 6187575.0, + "step": 5188, + "train/ce_loss": 1.1227772235870361 + }, + { + "epoch": 0.5129523432865335, + "step": 5188, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5129523432865335, + "step": 5188, + "train/total_loss": 0.16696521639823914 + }, + { + "entropy": 9.464544296264648, + "epoch": 0.5130512161360491, + "mean_token_accuracy": 0.7881844639778137, + "num_tokens": 6192667.0, + "step": 5189, + "train/ce_loss": 1.174917459487915 + }, + { + "epoch": 0.5130512161360491, + "step": 5189, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5130512161360491, + "step": 5189, + "train/total_loss": 0.13702300190925598 + }, + { + "entropy": 9.355015754699707, + "epoch": 0.5131500889855646, + "mean_token_accuracy": 0.7685714364051819, + "num_tokens": 6197860.0, + "step": 5190, + "train/ce_loss": 0.5055925846099854 + }, + { + "epoch": 0.5131500889855646, + "step": 5190, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.5131500889855646, + "step": 5190, + "train/total_loss": 0.07790300995111465 + }, + { + "entropy": 9.103748321533203, + "epoch": 0.5132489618350801, + "mean_token_accuracy": 0.7351225018501282, + "num_tokens": 6203222.0, + "step": 5191, + "train/ce_loss": 0.31338703632354736 + }, + { + "epoch": 0.5132489618350801, + "step": 5191, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5132489618350801, + "step": 5191, + "train/total_loss": 0.05086995288729668 + }, + { + "entropy": 9.010952949523926, + "epoch": 0.5133478346845957, + "mean_token_accuracy": 0.7311557531356812, + "num_tokens": 6208479.0, + "step": 5192, + "train/ce_loss": 0.680903434753418 + }, + { + "epoch": 0.5133478346845957, + "step": 5192, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5133478346845957, + "step": 5192, + "train/total_loss": 0.12668409943580627 + }, + { + "entropy": 8.905805587768555, + "epoch": 0.5134467075341111, + "mean_token_accuracy": 0.7019562721252441, + "num_tokens": 6213812.0, + "step": 5193, + "train/ce_loss": 0.8126767873764038 + }, + { + "epoch": 0.5134467075341111, + "step": 5193, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.5134467075341111, + "step": 5193, + "train/total_loss": 0.16329893469810486 + }, + { + "entropy": 9.774221420288086, + "epoch": 0.5135455803836266, + "mean_token_accuracy": 0.7419962286949158, + "num_tokens": 6218729.0, + "step": 5194, + "train/ce_loss": 1.7258882962778443e-06 + }, + { + "epoch": 0.5135455803836266, + "step": 5194, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5135455803836266, + "step": 5194, + "train/total_loss": 0.01953142322599888 + }, + { + "entropy": 10.152777671813965, + "epoch": 0.5136444532331422, + "mean_token_accuracy": 0.7647058963775635, + "num_tokens": 6223498.0, + "step": 5195, + "train/ce_loss": 1.5547560453414917 + }, + { + "epoch": 0.5136444532331422, + "step": 5195, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5136444532331422, + "step": 5195, + "train/total_loss": 0.19063185155391693 + }, + { + "entropy": 8.596878051757812, + "epoch": 0.5137433260826577, + "mean_token_accuracy": 0.7590909004211426, + "num_tokens": 6229028.0, + "step": 5196, + "train/ce_loss": 0.8892180323600769 + }, + { + "epoch": 0.5137433260826577, + "step": 5196, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5137433260826577, + "step": 5196, + "train/total_loss": 0.13970306515693665 + }, + { + "entropy": 8.842941284179688, + "epoch": 0.5138421989321732, + "mean_token_accuracy": 0.796798050403595, + "num_tokens": 6234258.0, + "step": 5197, + "train/ce_loss": 0.36208295822143555 + }, + { + "epoch": 0.5138421989321732, + "step": 5197, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.5138421989321732, + "step": 5197, + "train/total_loss": 0.16902080178260803 + }, + { + "entropy": 9.162590026855469, + "epoch": 0.5139410717816888, + "mean_token_accuracy": 0.7354085445404053, + "num_tokens": 6239484.0, + "step": 5198, + "train/ce_loss": 1.4314345121383667 + }, + { + "epoch": 0.5139410717816888, + "step": 5198, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.5139410717816888, + "step": 5198, + "train/total_loss": 0.2212684601545334 + }, + { + "entropy": 9.713400840759277, + "epoch": 0.5140399446312043, + "mean_token_accuracy": 0.7263339161872864, + "num_tokens": 6244500.0, + "step": 5199, + "train/ce_loss": 0.7679638862609863 + }, + { + "epoch": 0.5140399446312043, + "step": 5199, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5140399446312043, + "step": 5199, + "train/total_loss": 0.10804639011621475 + }, + { + "epoch": 0.5141388174807198, + "grad_norm": 0.7476158142089844, + "learning_rate": 8.717054838550165e-06, + "loss": 0.1343, + "step": 5200 + }, + { + "entropy": 9.169939041137695, + "epoch": 0.5141388174807198, + "mean_token_accuracy": 0.7317396998405457, + "num_tokens": 6249713.0, + "step": 5200, + "train/ce_loss": 0.6050461530685425 + }, + { + "epoch": 0.5141388174807198, + "step": 5200, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5141388174807198, + "step": 5200, + "train/total_loss": 0.12300461530685425 + }, + { + "entropy": 9.089229583740234, + "epoch": 0.5142376903302354, + "mean_token_accuracy": 0.7239263653755188, + "num_tokens": 6255006.0, + "step": 5201, + "train/ce_loss": 0.9613548517227173 + }, + { + "epoch": 0.5142376903302354, + "step": 5201, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.5142376903302354, + "step": 5201, + "train/total_loss": 0.19769799709320068 + }, + { + "entropy": 9.296426773071289, + "epoch": 0.5143365631797508, + "mean_token_accuracy": 0.713004469871521, + "num_tokens": 6260124.0, + "step": 5202, + "train/ce_loss": 0.9375792145729065 + }, + { + "epoch": 0.5143365631797508, + "step": 5202, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5143365631797508, + "step": 5202, + "train/total_loss": 0.13672667741775513 + }, + { + "entropy": 8.995756149291992, + "epoch": 0.5144354360292663, + "mean_token_accuracy": 0.7040572762489319, + "num_tokens": 6265506.0, + "step": 5203, + "train/ce_loss": 1.0593247413635254 + }, + { + "epoch": 0.5144354360292663, + "step": 5203, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.5144354360292663, + "step": 5203, + "train/total_loss": 0.18015122413635254 + }, + { + "entropy": 9.792580604553223, + "epoch": 0.5145343088787819, + "mean_token_accuracy": 0.7164556980133057, + "num_tokens": 6270339.0, + "step": 5204, + "train/ce_loss": 1.2554758787155151 + }, + { + "epoch": 0.5145343088787819, + "step": 5204, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5145343088787819, + "step": 5204, + "train/total_loss": 0.1567975878715515 + }, + { + "entropy": 9.043886184692383, + "epoch": 0.5146331817282974, + "mean_token_accuracy": 0.7170263528823853, + "num_tokens": 6275629.0, + "step": 5205, + "train/ce_loss": 1.027775764465332 + }, + { + "epoch": 0.5146331817282974, + "step": 5205, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.5146331817282974, + "step": 5205, + "train/total_loss": 0.18871507048606873 + }, + { + "entropy": 8.943634033203125, + "epoch": 0.5147320545778129, + "mean_token_accuracy": 0.7334109544754028, + "num_tokens": 6280937.0, + "step": 5206, + "train/ce_loss": 1.068730115890503 + }, + { + "epoch": 0.5147320545778129, + "step": 5206, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5147320545778129, + "step": 5206, + "train/total_loss": 0.17718550562858582 + }, + { + "entropy": 9.01152229309082, + "epoch": 0.5148309274273285, + "mean_token_accuracy": 0.7050997614860535, + "num_tokens": 6286307.0, + "step": 5207, + "train/ce_loss": 0.5275875329971313 + }, + { + "epoch": 0.5148309274273285, + "step": 5207, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.5148309274273285, + "step": 5207, + "train/total_loss": 0.13479000329971313 + }, + { + "entropy": 9.549314498901367, + "epoch": 0.514929800276844, + "mean_token_accuracy": 0.7138508558273315, + "num_tokens": 6291365.0, + "step": 5208, + "train/ce_loss": 1.2059669494628906 + }, + { + "epoch": 0.514929800276844, + "step": 5208, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.514929800276844, + "step": 5208, + "train/total_loss": 0.14794045686721802 + }, + { + "entropy": 9.489936828613281, + "epoch": 0.5150286731263595, + "mean_token_accuracy": 0.7491748929023743, + "num_tokens": 6296472.0, + "step": 5209, + "train/ce_loss": 1.0462239980697632 + }, + { + "epoch": 0.5150286731263595, + "step": 5209, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5150286731263595, + "step": 5209, + "train/total_loss": 0.17493489384651184 + }, + { + "entropy": 8.975013732910156, + "epoch": 0.5151275459758751, + "mean_token_accuracy": 0.7044392228126526, + "num_tokens": 6301820.0, + "step": 5210, + "train/ce_loss": 0.8023355007171631 + }, + { + "epoch": 0.5151275459758751, + "step": 5210, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5151275459758751, + "step": 5210, + "train/total_loss": 0.13492104411125183 + }, + { + "entropy": 8.907506942749023, + "epoch": 0.5152264188253906, + "mean_token_accuracy": 0.7075055241584778, + "num_tokens": 6307203.0, + "step": 5211, + "train/ce_loss": 0.6670815348625183 + }, + { + "epoch": 0.5152264188253906, + "step": 5211, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5152264188253906, + "step": 5211, + "train/total_loss": 0.10967690497636795 + }, + { + "entropy": 8.65451431274414, + "epoch": 0.515325291674906, + "mean_token_accuracy": 0.7407054305076599, + "num_tokens": 6312751.0, + "step": 5212, + "train/ce_loss": 0.5702404379844666 + }, + { + "epoch": 0.515325291674906, + "step": 5212, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.515325291674906, + "step": 5212, + "train/total_loss": 0.0765552967786789 + }, + { + "entropy": 9.231220245361328, + "epoch": 0.5154241645244216, + "mean_token_accuracy": 0.7912687659263611, + "num_tokens": 6317975.0, + "step": 5213, + "train/ce_loss": 0.46404504776000977 + }, + { + "epoch": 0.5154241645244216, + "step": 5213, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.5154241645244216, + "step": 5213, + "train/total_loss": 0.15187326073646545 + }, + { + "entropy": 9.298812866210938, + "epoch": 0.5155230373739371, + "mean_token_accuracy": 0.75, + "num_tokens": 6323085.0, + "step": 5214, + "train/ce_loss": 1.2744684219360352 + }, + { + "epoch": 0.5155230373739371, + "step": 5214, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5155230373739371, + "step": 5214, + "train/total_loss": 0.18604059517383575 + }, + { + "entropy": 8.690983772277832, + "epoch": 0.5156219102234526, + "mean_token_accuracy": 0.7608453631401062, + "num_tokens": 6328435.0, + "step": 5215, + "train/ce_loss": 0.4523555636405945 + }, + { + "epoch": 0.5156219102234526, + "step": 5215, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.5156219102234526, + "step": 5215, + "train/total_loss": 0.1350793093442917 + }, + { + "entropy": 9.051522254943848, + "epoch": 0.5157207830729682, + "mean_token_accuracy": 0.8401322960853577, + "num_tokens": 6333767.0, + "step": 5216, + "train/ce_loss": 0.3867724537849426 + }, + { + "epoch": 0.5157207830729682, + "step": 5216, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5157207830729682, + "step": 5216, + "train/total_loss": 0.05820849537849426 + }, + { + "entropy": 9.134048461914062, + "epoch": 0.5158196559224837, + "mean_token_accuracy": 0.7589403986930847, + "num_tokens": 6338941.0, + "step": 5217, + "train/ce_loss": 0.5398648381233215 + }, + { + "epoch": 0.5158196559224837, + "step": 5217, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5158196559224837, + "step": 5217, + "train/total_loss": 0.09304898977279663 + }, + { + "entropy": 9.101371765136719, + "epoch": 0.5159185287719992, + "mean_token_accuracy": 0.7251700758934021, + "num_tokens": 6344129.0, + "step": 5218, + "train/ce_loss": 1.1684240102767944 + }, + { + "epoch": 0.5159185287719992, + "step": 5218, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.5159185287719992, + "step": 5218, + "train/total_loss": 0.21449865400791168 + }, + { + "entropy": 9.00767707824707, + "epoch": 0.5160174016215148, + "mean_token_accuracy": 0.7233532667160034, + "num_tokens": 6349427.0, + "step": 5219, + "train/ce_loss": 1.1756572723388672 + }, + { + "epoch": 0.5160174016215148, + "step": 5219, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5160174016215148, + "step": 5219, + "train/total_loss": 0.17615947127342224 + }, + { + "epoch": 0.5161162744710303, + "grad_norm": 0.8571836352348328, + "learning_rate": 8.712109973792218e-06, + "loss": 0.1413, + "step": 5220 + }, + { + "entropy": 9.020292282104492, + "epoch": 0.5161162744710303, + "mean_token_accuracy": 0.745743453502655, + "num_tokens": 6354758.0, + "step": 5220, + "train/ce_loss": 1.1463453769683838 + }, + { + "epoch": 0.5161162744710303, + "step": 5220, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.5161162744710303, + "step": 5220, + "train/total_loss": 0.19275954365730286 + }, + { + "entropy": 9.523673057556152, + "epoch": 0.5162151473205457, + "mean_token_accuracy": 0.7344537973403931, + "num_tokens": 6359794.0, + "step": 5221, + "train/ce_loss": 1.286370038986206 + }, + { + "epoch": 0.5162151473205457, + "step": 5221, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5162151473205457, + "step": 5221, + "train/total_loss": 0.17551200091838837 + }, + { + "entropy": 9.838347434997559, + "epoch": 0.5163140201700613, + "mean_token_accuracy": 0.6756151914596558, + "num_tokens": 6364656.0, + "step": 5222, + "train/ce_loss": 2.0478384494781494 + }, + { + "epoch": 0.5163140201700613, + "step": 5222, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.5163140201700613, + "step": 5222, + "train/total_loss": 0.2868151068687439 + }, + { + "entropy": 8.906373977661133, + "epoch": 0.5164128930195768, + "mean_token_accuracy": 0.7170658707618713, + "num_tokens": 6369716.0, + "step": 5223, + "train/ce_loss": 0.5118294954299927 + }, + { + "epoch": 0.5164128930195768, + "step": 5223, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5164128930195768, + "step": 5223, + "train/total_loss": 0.11368295550346375 + }, + { + "entropy": 9.602432250976562, + "epoch": 0.5165117658690923, + "mean_token_accuracy": 0.7116104960441589, + "num_tokens": 6374679.0, + "step": 5224, + "train/ce_loss": 2.8455499432311626e-06 + }, + { + "epoch": 0.5165117658690923, + "step": 5224, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5165117658690923, + "step": 5224, + "train/total_loss": 0.04687528312206268 + }, + { + "entropy": 9.013307571411133, + "epoch": 0.5166106387186079, + "mean_token_accuracy": 0.7280488014221191, + "num_tokens": 6379944.0, + "step": 5225, + "train/ce_loss": 0.9952422380447388 + }, + { + "epoch": 0.5166106387186079, + "step": 5225, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5166106387186079, + "step": 5225, + "train/total_loss": 0.16593047976493835 + }, + { + "entropy": 10.098270416259766, + "epoch": 0.5167095115681234, + "mean_token_accuracy": 0.7131367325782776, + "num_tokens": 6384717.0, + "step": 5226, + "train/ce_loss": 1.511696457862854 + }, + { + "epoch": 0.5167095115681234, + "step": 5226, + "train/sim_loss": 0.14453125 + }, + { + "epoch": 0.5167095115681234, + "step": 5226, + "train/total_loss": 0.29570090770721436 + }, + { + "entropy": 9.121529579162598, + "epoch": 0.5168083844176389, + "mean_token_accuracy": 0.6698337197303772, + "num_tokens": 6389984.0, + "step": 5227, + "train/ce_loss": 1.220581293106079 + }, + { + "epoch": 0.5168083844176389, + "step": 5227, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5168083844176389, + "step": 5227, + "train/total_loss": 0.18065187335014343 + }, + { + "entropy": 9.042387008666992, + "epoch": 0.5169072572671545, + "mean_token_accuracy": 0.7225501537322998, + "num_tokens": 6395307.0, + "step": 5228, + "train/ce_loss": 1.320287823677063 + }, + { + "epoch": 0.5169072572671545, + "step": 5228, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5169072572671545, + "step": 5228, + "train/total_loss": 0.19843503832817078 + }, + { + "entropy": 9.300015449523926, + "epoch": 0.51700613011667, + "mean_token_accuracy": 0.7410072088241577, + "num_tokens": 6400469.0, + "step": 5229, + "train/ce_loss": 0.9012818336486816 + }, + { + "epoch": 0.51700613011667, + "step": 5229, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.51700613011667, + "step": 5229, + "train/total_loss": 0.13309693336486816 + }, + { + "entropy": 8.792694091796875, + "epoch": 0.5171050029661854, + "mean_token_accuracy": 0.7126193046569824, + "num_tokens": 6405925.0, + "step": 5230, + "train/ce_loss": 0.6694502830505371 + }, + { + "epoch": 0.5171050029661854, + "step": 5230, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5171050029661854, + "step": 5230, + "train/total_loss": 0.10210128128528595 + }, + { + "entropy": 9.179935455322266, + "epoch": 0.517203875815701, + "mean_token_accuracy": 0.7749077677726746, + "num_tokens": 6411195.0, + "step": 5231, + "train/ce_loss": 0.8501054048538208 + }, + { + "epoch": 0.517203875815701, + "step": 5231, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.517203875815701, + "step": 5231, + "train/total_loss": 0.16704179346561432 + }, + { + "entropy": 9.284443855285645, + "epoch": 0.5173027486652165, + "mean_token_accuracy": 0.6839762330055237, + "num_tokens": 6416325.0, + "step": 5232, + "train/ce_loss": 4.317288585298229e-06 + }, + { + "epoch": 0.5173027486652165, + "step": 5232, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5173027486652165, + "step": 5232, + "train/total_loss": 0.03125043213367462 + }, + { + "entropy": 9.127395629882812, + "epoch": 0.517401621514732, + "mean_token_accuracy": 0.6614457964897156, + "num_tokens": 6421602.0, + "step": 5233, + "train/ce_loss": 0.7827679514884949 + }, + { + "epoch": 0.517401621514732, + "step": 5233, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.517401621514732, + "step": 5233, + "train/total_loss": 0.17202679812908173 + }, + { + "entropy": 8.866366386413574, + "epoch": 0.5175004943642476, + "mean_token_accuracy": 0.7063007950782776, + "num_tokens": 6427049.0, + "step": 5234, + "train/ce_loss": 1.2991992235183716 + }, + { + "epoch": 0.5175004943642476, + "step": 5234, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.5175004943642476, + "step": 5234, + "train/total_loss": 0.24710743129253387 + }, + { + "entropy": 9.905988693237305, + "epoch": 0.5175993672137631, + "mean_token_accuracy": 0.8165374398231506, + "num_tokens": 6431811.0, + "step": 5235, + "train/ce_loss": 7.886806088208687e-06 + }, + { + "epoch": 0.5175993672137631, + "step": 5235, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5175993672137631, + "step": 5235, + "train/total_loss": 0.050782039761543274 + }, + { + "entropy": 8.81964111328125, + "epoch": 0.5176982400632786, + "mean_token_accuracy": 0.732833981513977, + "num_tokens": 6437136.0, + "step": 5236, + "train/ce_loss": 0.5881039500236511 + }, + { + "epoch": 0.5176982400632786, + "step": 5236, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5176982400632786, + "step": 5236, + "train/total_loss": 0.11740414798259735 + }, + { + "entropy": 9.448318481445312, + "epoch": 0.5177971129127942, + "mean_token_accuracy": 0.7641509175300598, + "num_tokens": 6442205.0, + "step": 5237, + "train/ce_loss": 1.2880504131317139 + }, + { + "epoch": 0.5177971129127942, + "step": 5237, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.5177971129127942, + "step": 5237, + "train/total_loss": 0.2225550413131714 + }, + { + "entropy": 9.1721830368042, + "epoch": 0.5178959857623097, + "mean_token_accuracy": 0.7208480834960938, + "num_tokens": 6447550.0, + "step": 5238, + "train/ce_loss": 1.1573699712753296 + }, + { + "epoch": 0.5178959857623097, + "step": 5238, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.5178959857623097, + "step": 5238, + "train/total_loss": 0.22511199116706848 + }, + { + "entropy": 9.189175605773926, + "epoch": 0.5179948586118251, + "mean_token_accuracy": 0.7442747950553894, + "num_tokens": 6452801.0, + "step": 5239, + "train/ce_loss": 0.9737340807914734 + }, + { + "epoch": 0.5179948586118251, + "step": 5239, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5179948586118251, + "step": 5239, + "train/total_loss": 0.12081091105937958 + }, + { + "epoch": 0.5180937314613407, + "grad_norm": 0.6664369702339172, + "learning_rate": 8.707165109034268e-06, + "loss": 0.1519, + "step": 5240 + }, + { + "entropy": 8.873210906982422, + "epoch": 0.5180937314613407, + "mean_token_accuracy": 0.7202441692352295, + "num_tokens": 6458195.0, + "step": 5240, + "train/ce_loss": 0.8848466277122498 + }, + { + "epoch": 0.5180937314613407, + "step": 5240, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5180937314613407, + "step": 5240, + "train/total_loss": 0.11973466724157333 + }, + { + "entropy": 9.209671974182129, + "epoch": 0.5181926043108562, + "mean_token_accuracy": 0.7430093288421631, + "num_tokens": 6463421.0, + "step": 5241, + "train/ce_loss": 0.3739120066165924 + }, + { + "epoch": 0.5181926043108562, + "step": 5241, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.5181926043108562, + "step": 5241, + "train/total_loss": 0.15457870066165924 + }, + { + "entropy": 8.89012336730957, + "epoch": 0.5182914771603717, + "mean_token_accuracy": 0.746666669845581, + "num_tokens": 6468589.0, + "step": 5242, + "train/ce_loss": 0.6734580397605896 + }, + { + "epoch": 0.5182914771603717, + "step": 5242, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.5182914771603717, + "step": 5242, + "train/total_loss": 0.14937704801559448 + }, + { + "entropy": 9.542342185974121, + "epoch": 0.5183903500098873, + "mean_token_accuracy": 0.7629513144493103, + "num_tokens": 6473656.0, + "step": 5243, + "train/ce_loss": 1.0343583822250366 + }, + { + "epoch": 0.5183903500098873, + "step": 5243, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5183903500098873, + "step": 5243, + "train/total_loss": 0.14640459418296814 + }, + { + "entropy": 9.919364929199219, + "epoch": 0.5184892228594028, + "mean_token_accuracy": 0.740359902381897, + "num_tokens": 6478456.0, + "step": 5244, + "train/ce_loss": 1.3556137084960938 + }, + { + "epoch": 0.5184892228594028, + "step": 5244, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5184892228594028, + "step": 5244, + "train/total_loss": 0.17462387681007385 + }, + { + "entropy": 9.06728458404541, + "epoch": 0.5185880957089183, + "mean_token_accuracy": 0.7038043737411499, + "num_tokens": 6483656.0, + "step": 5245, + "train/ce_loss": 0.7342151403427124 + }, + { + "epoch": 0.5185880957089183, + "step": 5245, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.5185880957089183, + "step": 5245, + "train/total_loss": 0.15935900807380676 + }, + { + "entropy": 9.586074829101562, + "epoch": 0.5186869685584339, + "mean_token_accuracy": 0.7088607549667358, + "num_tokens": 6488459.0, + "step": 5246, + "train/ce_loss": 3.950816790165845e-06 + }, + { + "epoch": 0.5186869685584339, + "step": 5246, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5186869685584339, + "step": 5246, + "train/total_loss": 0.05859414488077164 + }, + { + "entropy": 8.880306243896484, + "epoch": 0.5187858414079494, + "mean_token_accuracy": 0.751870334148407, + "num_tokens": 6493690.0, + "step": 5247, + "train/ce_loss": 0.5954692363739014 + }, + { + "epoch": 0.5187858414079494, + "step": 5247, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.5187858414079494, + "step": 5247, + "train/total_loss": 0.08689067512750626 + }, + { + "entropy": 9.145792007446289, + "epoch": 0.5188847142574649, + "mean_token_accuracy": 0.7543624043464661, + "num_tokens": 6498911.0, + "step": 5248, + "train/ce_loss": 1.5778733491897583 + }, + { + "epoch": 0.5188847142574649, + "step": 5248, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.5188847142574649, + "step": 5248, + "train/total_loss": 0.2671623229980469 + }, + { + "entropy": 9.40733528137207, + "epoch": 0.5189835871069804, + "mean_token_accuracy": 0.716946005821228, + "num_tokens": 6503915.0, + "step": 5249, + "train/ce_loss": 1.213287353515625 + }, + { + "epoch": 0.5189835871069804, + "step": 5249, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5189835871069804, + "step": 5249, + "train/total_loss": 0.15257874131202698 + }, + { + "entropy": 8.57093620300293, + "epoch": 0.5190824599564959, + "mean_token_accuracy": 0.7903845906257629, + "num_tokens": 6509667.0, + "step": 5250, + "train/ce_loss": 0.6045466065406799 + }, + { + "epoch": 0.5190824599564959, + "step": 5250, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5190824599564959, + "step": 5250, + "train/total_loss": 0.09951716661453247 + }, + { + "entropy": 9.538387298583984, + "epoch": 0.5191813328060114, + "mean_token_accuracy": 0.7171717286109924, + "num_tokens": 6514623.0, + "step": 5251, + "train/ce_loss": 1.3213231563568115 + }, + { + "epoch": 0.5191813328060114, + "step": 5251, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5191813328060114, + "step": 5251, + "train/total_loss": 0.18681982159614563 + }, + { + "entropy": 9.209657669067383, + "epoch": 0.519280205655527, + "mean_token_accuracy": 0.7266187071800232, + "num_tokens": 6519770.0, + "step": 5252, + "train/ce_loss": 0.8175265789031982 + }, + { + "epoch": 0.519280205655527, + "step": 5252, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.519280205655527, + "step": 5252, + "train/total_loss": 0.12862765789031982 + }, + { + "entropy": 9.48554801940918, + "epoch": 0.5193790785050425, + "mean_token_accuracy": 0.6927176117897034, + "num_tokens": 6524802.0, + "step": 5253, + "train/ce_loss": 1.8125488758087158 + }, + { + "epoch": 0.5193790785050425, + "step": 5253, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.5193790785050425, + "step": 5253, + "train/total_loss": 0.25547364354133606 + }, + { + "entropy": 9.817337036132812, + "epoch": 0.519477951354558, + "mean_token_accuracy": 0.7514563202857971, + "num_tokens": 6529724.0, + "step": 5254, + "train/ce_loss": 0.7927056550979614 + }, + { + "epoch": 0.519477951354558, + "step": 5254, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.519477951354558, + "step": 5254, + "train/total_loss": 0.14177057147026062 + }, + { + "entropy": 9.053857803344727, + "epoch": 0.5195768242040736, + "mean_token_accuracy": 0.7128146290779114, + "num_tokens": 6535045.0, + "step": 5255, + "train/ce_loss": 0.7244611978530884 + }, + { + "epoch": 0.5195768242040736, + "step": 5255, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5195768242040736, + "step": 5255, + "train/total_loss": 0.12713362276554108 + }, + { + "entropy": 9.557379722595215, + "epoch": 0.5196756970535891, + "mean_token_accuracy": 0.7753743529319763, + "num_tokens": 6540094.0, + "step": 5256, + "train/ce_loss": 1.0860475301742554 + }, + { + "epoch": 0.5196756970535891, + "step": 5256, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.5196756970535891, + "step": 5256, + "train/total_loss": 0.12422975152730942 + }, + { + "entropy": 9.26852798461914, + "epoch": 0.5197745699031046, + "mean_token_accuracy": 0.7614213228225708, + "num_tokens": 6545321.0, + "step": 5257, + "train/ce_loss": 0.38823726773262024 + }, + { + "epoch": 0.5197745699031046, + "step": 5257, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5197745699031046, + "step": 5257, + "train/total_loss": 0.058354977518320084 + }, + { + "entropy": 9.255789756774902, + "epoch": 0.5198734427526202, + "mean_token_accuracy": 0.6846965551376343, + "num_tokens": 6550532.0, + "step": 5258, + "train/ce_loss": 1.7871288061141968 + }, + { + "epoch": 0.5198734427526202, + "step": 5258, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.5198734427526202, + "step": 5258, + "train/total_loss": 0.2880879044532776 + }, + { + "entropy": 9.217986106872559, + "epoch": 0.5199723156021356, + "mean_token_accuracy": 0.6962864995002747, + "num_tokens": 6555745.0, + "step": 5259, + "train/ce_loss": 0.5425359010696411 + }, + { + "epoch": 0.5199723156021356, + "step": 5259, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5199723156021356, + "step": 5259, + "train/total_loss": 0.11675359308719635 + }, + { + "epoch": 0.5200711884516511, + "grad_norm": 0.7438312768936157, + "learning_rate": 8.702220244276319e-06, + "loss": 0.1409, + "step": 5260 + }, + { + "entropy": 9.3577880859375, + "epoch": 0.5200711884516511, + "mean_token_accuracy": 0.7811271548271179, + "num_tokens": 6560923.0, + "step": 5260, + "train/ce_loss": 1.415432848261844e-06 + }, + { + "epoch": 0.5200711884516511, + "step": 5260, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5200711884516511, + "step": 5260, + "train/total_loss": 0.06250014156103134 + }, + { + "entropy": 9.415997505187988, + "epoch": 0.5201700613011667, + "mean_token_accuracy": 0.7555555701255798, + "num_tokens": 6565925.0, + "step": 5261, + "train/ce_loss": 2.781298690024414e-06 + }, + { + "epoch": 0.5201700613011667, + "step": 5261, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5201700613011667, + "step": 5261, + "train/total_loss": 0.06250027567148209 + }, + { + "entropy": 8.994209289550781, + "epoch": 0.5202689341506822, + "mean_token_accuracy": 0.7211660146713257, + "num_tokens": 6571218.0, + "step": 5262, + "train/ce_loss": 1.2079790830612183 + }, + { + "epoch": 0.5202689341506822, + "step": 5262, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.5202689341506822, + "step": 5262, + "train/total_loss": 0.23017290234565735 + }, + { + "entropy": 9.183069229125977, + "epoch": 0.5203678070001977, + "mean_token_accuracy": 0.7012448310852051, + "num_tokens": 6576396.0, + "step": 5263, + "train/ce_loss": 0.9855268001556396 + }, + { + "epoch": 0.5203678070001977, + "step": 5263, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5203678070001977, + "step": 5263, + "train/total_loss": 0.1337089240550995 + }, + { + "entropy": 9.500748634338379, + "epoch": 0.5204666798497133, + "mean_token_accuracy": 0.6936936974525452, + "num_tokens": 6581366.0, + "step": 5264, + "train/ce_loss": 3.6748667753272457e-06 + }, + { + "epoch": 0.5204666798497133, + "step": 5264, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5204666798497133, + "step": 5264, + "train/total_loss": 0.0195316169410944 + }, + { + "entropy": 9.311800956726074, + "epoch": 0.5205655526992288, + "mean_token_accuracy": 0.7102922201156616, + "num_tokens": 6586606.0, + "step": 5265, + "train/ce_loss": 1.0904486179351807 + }, + { + "epoch": 0.5205655526992288, + "step": 5265, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.5205655526992288, + "step": 5265, + "train/total_loss": 0.1949823647737503 + }, + { + "entropy": 9.069988250732422, + "epoch": 0.5206644255487444, + "mean_token_accuracy": 0.71775221824646, + "num_tokens": 6591843.0, + "step": 5266, + "train/ce_loss": 0.8036020994186401 + }, + { + "epoch": 0.5206644255487444, + "step": 5266, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5206644255487444, + "step": 5266, + "train/total_loss": 0.14286020398139954 + }, + { + "entropy": 9.300836563110352, + "epoch": 0.5207632983982599, + "mean_token_accuracy": 0.7404958605766296, + "num_tokens": 6596956.0, + "step": 5267, + "train/ce_loss": 0.7432721257209778 + }, + { + "epoch": 0.5207632983982599, + "step": 5267, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5207632983982599, + "step": 5267, + "train/total_loss": 0.11338971555233002 + }, + { + "entropy": 8.987645149230957, + "epoch": 0.5208621712477753, + "mean_token_accuracy": 0.7181687951087952, + "num_tokens": 6602135.0, + "step": 5268, + "train/ce_loss": 1.1967494487762451 + }, + { + "epoch": 0.5208621712477753, + "step": 5268, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.5208621712477753, + "step": 5268, + "train/total_loss": 0.193893700838089 + }, + { + "entropy": 9.507927894592285, + "epoch": 0.5209610440972909, + "mean_token_accuracy": 0.6975308656692505, + "num_tokens": 6607210.0, + "step": 5269, + "train/ce_loss": 1.2455633878707886 + }, + { + "epoch": 0.5209610440972909, + "step": 5269, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.5209610440972909, + "step": 5269, + "train/total_loss": 0.21440008282661438 + }, + { + "entropy": 9.510259628295898, + "epoch": 0.5210599169468064, + "mean_token_accuracy": 0.6652047038078308, + "num_tokens": 6612311.0, + "step": 5270, + "train/ce_loss": 1.3466548919677734 + }, + { + "epoch": 0.5210599169468064, + "step": 5270, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5210599169468064, + "step": 5270, + "train/total_loss": 0.20497798919677734 + }, + { + "entropy": 9.148332595825195, + "epoch": 0.5211587897963219, + "mean_token_accuracy": 0.7044943571090698, + "num_tokens": 6617811.0, + "step": 5271, + "train/ce_loss": 0.7625870108604431 + }, + { + "epoch": 0.5211587897963219, + "step": 5271, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5211587897963219, + "step": 5271, + "train/total_loss": 0.13485245406627655 + }, + { + "entropy": 9.175765037536621, + "epoch": 0.5212576626458375, + "mean_token_accuracy": 0.7534818649291992, + "num_tokens": 6623000.0, + "step": 5272, + "train/ce_loss": 0.9953817129135132 + }, + { + "epoch": 0.5212576626458375, + "step": 5272, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5212576626458375, + "step": 5272, + "train/total_loss": 0.1620381772518158 + }, + { + "entropy": 9.28423023223877, + "epoch": 0.521356535495353, + "mean_token_accuracy": 0.8075187802314758, + "num_tokens": 6628153.0, + "step": 5273, + "train/ce_loss": 0.4667331874370575 + }, + { + "epoch": 0.521356535495353, + "step": 5273, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.521356535495353, + "step": 5273, + "train/total_loss": 0.08964207023382187 + }, + { + "entropy": 8.493019104003906, + "epoch": 0.5214554083448685, + "mean_token_accuracy": 0.6848204135894775, + "num_tokens": 6633475.0, + "step": 5274, + "train/ce_loss": 0.7903342843055725 + }, + { + "epoch": 0.5214554083448685, + "step": 5274, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5214554083448685, + "step": 5274, + "train/total_loss": 0.13372093439102173 + }, + { + "entropy": 9.316062927246094, + "epoch": 0.5215542811943841, + "mean_token_accuracy": 0.7250803709030151, + "num_tokens": 6638602.0, + "step": 5275, + "train/ce_loss": 2.0475013116083574e-06 + }, + { + "epoch": 0.5215542811943841, + "step": 5275, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5215542811943841, + "step": 5275, + "train/total_loss": 0.035156454890966415 + }, + { + "entropy": 9.227267265319824, + "epoch": 0.5216531540438996, + "mean_token_accuracy": 0.7750309109687805, + "num_tokens": 6643871.0, + "step": 5276, + "train/ce_loss": 0.4963589310646057 + }, + { + "epoch": 0.5216531540438996, + "step": 5276, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.5216531540438996, + "step": 5276, + "train/total_loss": 0.07697964459657669 + }, + { + "entropy": 8.601722717285156, + "epoch": 0.521752026893415, + "mean_token_accuracy": 0.743984580039978, + "num_tokens": 6649451.0, + "step": 5277, + "train/ce_loss": 0.956932544708252 + }, + { + "epoch": 0.521752026893415, + "step": 5277, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.521752026893415, + "step": 5277, + "train/total_loss": 0.14647451043128967 + }, + { + "entropy": 9.529146194458008, + "epoch": 0.5218508997429306, + "mean_token_accuracy": 0.7466410994529724, + "num_tokens": 6654425.0, + "step": 5278, + "train/ce_loss": 0.8007546663284302 + }, + { + "epoch": 0.5218508997429306, + "step": 5278, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5218508997429306, + "step": 5278, + "train/total_loss": 0.1230442151427269 + }, + { + "entropy": 9.57694149017334, + "epoch": 0.5219497725924461, + "mean_token_accuracy": 0.7283333539962769, + "num_tokens": 6659479.0, + "step": 5279, + "train/ce_loss": 0.7926003932952881 + }, + { + "epoch": 0.5219497725924461, + "step": 5279, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.5219497725924461, + "step": 5279, + "train/total_loss": 0.15738505125045776 + }, + { + "epoch": 0.5220486454419616, + "grad_norm": 0.7354840636253357, + "learning_rate": 8.697275379518371e-06, + "loss": 0.1431, + "step": 5280 + }, + { + "entropy": 9.120805740356445, + "epoch": 0.5220486454419616, + "mean_token_accuracy": 0.720963180065155, + "num_tokens": 6664626.0, + "step": 5280, + "train/ce_loss": 1.787015776244516e-06 + }, + { + "epoch": 0.5220486454419616, + "step": 5280, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5220486454419616, + "step": 5280, + "train/total_loss": 0.042968928813934326 + }, + { + "entropy": 9.510129928588867, + "epoch": 0.5221475182914772, + "mean_token_accuracy": 0.7185184955596924, + "num_tokens": 6669642.0, + "step": 5281, + "train/ce_loss": 0.986981213092804 + }, + { + "epoch": 0.5221475182914772, + "step": 5281, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5221475182914772, + "step": 5281, + "train/total_loss": 0.16510437428951263 + }, + { + "entropy": 9.158186912536621, + "epoch": 0.5222463911409927, + "mean_token_accuracy": 0.7033132314682007, + "num_tokens": 6674784.0, + "step": 5282, + "train/ce_loss": 2.32814621925354 + }, + { + "epoch": 0.5222463911409927, + "step": 5282, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5222463911409927, + "step": 5282, + "train/total_loss": 0.30312711000442505 + }, + { + "entropy": 9.896245956420898, + "epoch": 0.5223452639905082, + "mean_token_accuracy": 0.7933070659637451, + "num_tokens": 6679664.0, + "step": 5283, + "train/ce_loss": 1.2164676945758401e-06 + }, + { + "epoch": 0.5223452639905082, + "step": 5283, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.5223452639905082, + "step": 5283, + "train/total_loss": 0.0156251210719347 + }, + { + "entropy": 9.684252738952637, + "epoch": 0.5224441368400238, + "mean_token_accuracy": 0.7310344576835632, + "num_tokens": 6684510.0, + "step": 5284, + "train/ce_loss": 2.1813316345214844 + }, + { + "epoch": 0.5224441368400238, + "step": 5284, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.5224441368400238, + "step": 5284, + "train/total_loss": 0.3040706515312195 + }, + { + "entropy": 9.001659393310547, + "epoch": 0.5225430096895393, + "mean_token_accuracy": 0.7677119374275208, + "num_tokens": 6689853.0, + "step": 5285, + "train/ce_loss": 0.3454212248325348 + }, + { + "epoch": 0.5225430096895393, + "step": 5285, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5225430096895393, + "step": 5285, + "train/total_loss": 0.07360462844371796 + }, + { + "entropy": 8.513310432434082, + "epoch": 0.5226418825390547, + "mean_token_accuracy": 0.7743403315544128, + "num_tokens": 6695444.0, + "step": 5286, + "train/ce_loss": 0.8925740122795105 + }, + { + "epoch": 0.5226418825390547, + "step": 5286, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.5226418825390547, + "step": 5286, + "train/total_loss": 0.1712886542081833 + }, + { + "entropy": 8.87952995300293, + "epoch": 0.5227407553885703, + "mean_token_accuracy": 0.7326120734214783, + "num_tokens": 6700523.0, + "step": 5287, + "train/ce_loss": 9.726073585625272e-06 + }, + { + "epoch": 0.5227407553885703, + "step": 5287, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5227407553885703, + "step": 5287, + "train/total_loss": 0.0429697223007679 + }, + { + "entropy": 9.098947525024414, + "epoch": 0.5228396282380858, + "mean_token_accuracy": 0.773964524269104, + "num_tokens": 6705833.0, + "step": 5288, + "train/ce_loss": 0.6194140911102295 + }, + { + "epoch": 0.5228396282380858, + "step": 5288, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5228396282380858, + "step": 5288, + "train/total_loss": 0.11662891507148743 + }, + { + "entropy": 9.876415252685547, + "epoch": 0.5229385010876013, + "mean_token_accuracy": 0.7610389590263367, + "num_tokens": 6710620.0, + "step": 5289, + "train/ce_loss": 1.7455840110778809 + }, + { + "epoch": 0.5229385010876013, + "step": 5289, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.5229385010876013, + "step": 5289, + "train/total_loss": 0.2722146511077881 + }, + { + "entropy": 9.425487518310547, + "epoch": 0.5230373739371169, + "mean_token_accuracy": 0.769599974155426, + "num_tokens": 6715668.0, + "step": 5290, + "train/ce_loss": 8.170946443897265e-07 + }, + { + "epoch": 0.5230373739371169, + "step": 5290, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5230373739371169, + "step": 5290, + "train/total_loss": 0.019531331956386566 + }, + { + "entropy": 8.816116333007812, + "epoch": 0.5231362467866324, + "mean_token_accuracy": 0.7628541588783264, + "num_tokens": 6721053.0, + "step": 5291, + "train/ce_loss": 1.1168615818023682 + }, + { + "epoch": 0.5231362467866324, + "step": 5291, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.5231362467866324, + "step": 5291, + "train/total_loss": 0.19762367010116577 + }, + { + "entropy": 9.165191650390625, + "epoch": 0.5232351196361479, + "mean_token_accuracy": 0.7084367275238037, + "num_tokens": 6726314.0, + "step": 5292, + "train/ce_loss": 0.8726534247398376 + }, + { + "epoch": 0.5232351196361479, + "step": 5292, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5232351196361479, + "step": 5292, + "train/total_loss": 0.14195284247398376 + }, + { + "entropy": 9.813233375549316, + "epoch": 0.5233339924856635, + "mean_token_accuracy": 0.7095343470573425, + "num_tokens": 6731179.0, + "step": 5293, + "train/ce_loss": 5.149100161361275e-06 + }, + { + "epoch": 0.5233339924856635, + "step": 5293, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5233339924856635, + "step": 5293, + "train/total_loss": 0.06250051409006119 + }, + { + "entropy": 9.578452110290527, + "epoch": 0.523432865335179, + "mean_token_accuracy": 0.6940966248512268, + "num_tokens": 6736110.0, + "step": 5294, + "train/ce_loss": 8.76888805123599e-07 + }, + { + "epoch": 0.523432865335179, + "step": 5294, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.523432865335179, + "step": 5294, + "train/total_loss": 0.015625087544322014 + }, + { + "entropy": 9.136296272277832, + "epoch": 0.5235317381846945, + "mean_token_accuracy": 0.728923499584198, + "num_tokens": 6741354.0, + "step": 5295, + "train/ce_loss": 0.9489161968231201 + }, + { + "epoch": 0.5235317381846945, + "step": 5295, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5235317381846945, + "step": 5295, + "train/total_loss": 0.16520412266254425 + }, + { + "entropy": 8.934793472290039, + "epoch": 0.52363061103421, + "mean_token_accuracy": 0.7049808502197266, + "num_tokens": 6746643.0, + "step": 5296, + "train/ce_loss": 1.0172444581985474 + }, + { + "epoch": 0.52363061103421, + "step": 5296, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.52363061103421, + "step": 5296, + "train/total_loss": 0.18375569581985474 + }, + { + "entropy": 8.824613571166992, + "epoch": 0.5237294838837255, + "mean_token_accuracy": 0.7030302882194519, + "num_tokens": 6752077.0, + "step": 5297, + "train/ce_loss": 1.4362319707870483 + }, + { + "epoch": 0.5237294838837255, + "step": 5297, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.5237294838837255, + "step": 5297, + "train/total_loss": 0.2451857030391693 + }, + { + "entropy": 8.722902297973633, + "epoch": 0.523828356733241, + "mean_token_accuracy": 0.732421875, + "num_tokens": 6757588.0, + "step": 5298, + "train/ce_loss": 0.6351065039634705 + }, + { + "epoch": 0.523828356733241, + "step": 5298, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.523828356733241, + "step": 5298, + "train/total_loss": 0.11429189890623093 + }, + { + "entropy": 9.172910690307617, + "epoch": 0.5239272295827566, + "mean_token_accuracy": 0.732899010181427, + "num_tokens": 6762677.0, + "step": 5299, + "train/ce_loss": 1.0685198307037354 + }, + { + "epoch": 0.5239272295827566, + "step": 5299, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5239272295827566, + "step": 5299, + "train/total_loss": 0.1654457449913025 + }, + { + "epoch": 0.5240261024322721, + "grad_norm": 0.7753241062164307, + "learning_rate": 8.692330514760421e-06, + "loss": 0.142, + "step": 5300 + }, + { + "entropy": 9.261425018310547, + "epoch": 0.5240261024322721, + "mean_token_accuracy": 0.7984732985496521, + "num_tokens": 6767778.0, + "step": 5300, + "train/ce_loss": 0.6743336319923401 + }, + { + "epoch": 0.5240261024322721, + "step": 5300, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5240261024322721, + "step": 5300, + "train/total_loss": 0.10649586468935013 + }, + { + "entropy": 8.882976531982422, + "epoch": 0.5241249752817876, + "mean_token_accuracy": 0.7522624731063843, + "num_tokens": 6773118.0, + "step": 5301, + "train/ce_loss": 0.9140323996543884 + }, + { + "epoch": 0.5241249752817876, + "step": 5301, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5241249752817876, + "step": 5301, + "train/total_loss": 0.13046574592590332 + }, + { + "entropy": 8.891812324523926, + "epoch": 0.5242238481313032, + "mean_token_accuracy": 0.7790178656578064, + "num_tokens": 6778453.0, + "step": 5302, + "train/ce_loss": 0.5919695496559143 + }, + { + "epoch": 0.5242238481313032, + "step": 5302, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5242238481313032, + "step": 5302, + "train/total_loss": 0.08263445645570755 + }, + { + "entropy": 9.686471939086914, + "epoch": 0.5243227209808187, + "mean_token_accuracy": 0.7914572954177856, + "num_tokens": 6783293.0, + "step": 5303, + "train/ce_loss": 3.1267711619875627e-06 + }, + { + "epoch": 0.5243227209808187, + "step": 5303, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5243227209808187, + "step": 5303, + "train/total_loss": 0.06250031292438507 + }, + { + "entropy": 10.26594066619873, + "epoch": 0.5244215938303342, + "mean_token_accuracy": 0.7287449240684509, + "num_tokens": 6787955.0, + "step": 5304, + "train/ce_loss": 1.0271916835336015e-05 + }, + { + "epoch": 0.5244215938303342, + "step": 5304, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5244215938303342, + "step": 5304, + "train/total_loss": 0.039063528180122375 + }, + { + "entropy": 9.083431243896484, + "epoch": 0.5245204666798498, + "mean_token_accuracy": 0.7402746081352234, + "num_tokens": 6793496.0, + "step": 5305, + "train/ce_loss": 1.131496548652649 + }, + { + "epoch": 0.5245204666798498, + "step": 5305, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5245204666798498, + "step": 5305, + "train/total_loss": 0.18346215784549713 + }, + { + "entropy": 8.841108322143555, + "epoch": 0.5246193395293652, + "mean_token_accuracy": 0.6474164128303528, + "num_tokens": 6798999.0, + "step": 5306, + "train/ce_loss": 1.169258952140808 + }, + { + "epoch": 0.5246193395293652, + "step": 5306, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.5246193395293652, + "step": 5306, + "train/total_loss": 0.2106758952140808 + }, + { + "entropy": 9.429069519042969, + "epoch": 0.5247182123788807, + "mean_token_accuracy": 0.7676281929016113, + "num_tokens": 6804040.0, + "step": 5307, + "train/ce_loss": 1.165016531944275 + }, + { + "epoch": 0.5247182123788807, + "step": 5307, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5247182123788807, + "step": 5307, + "train/total_loss": 0.13993915915489197 + }, + { + "entropy": 9.255001068115234, + "epoch": 0.5248170852283963, + "mean_token_accuracy": 0.7620320916175842, + "num_tokens": 6809250.0, + "step": 5308, + "train/ce_loss": 0.6999289989471436 + }, + { + "epoch": 0.5248170852283963, + "step": 5308, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.5248170852283963, + "step": 5308, + "train/total_loss": 0.15202414989471436 + }, + { + "entropy": 9.225317001342773, + "epoch": 0.5249159580779118, + "mean_token_accuracy": 0.6915760636329651, + "num_tokens": 6814455.0, + "step": 5309, + "train/ce_loss": 1.4170299768447876 + }, + { + "epoch": 0.5249159580779118, + "step": 5309, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5249159580779118, + "step": 5309, + "train/total_loss": 0.20810924470424652 + }, + { + "entropy": 9.13276481628418, + "epoch": 0.5250148309274273, + "mean_token_accuracy": 0.7012345790863037, + "num_tokens": 6819739.0, + "step": 5310, + "train/ce_loss": 0.6636089086532593 + }, + { + "epoch": 0.5250148309274273, + "step": 5310, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5250148309274273, + "step": 5310, + "train/total_loss": 0.12104839086532593 + }, + { + "entropy": 9.337963104248047, + "epoch": 0.5251137037769429, + "mean_token_accuracy": 0.7988422513008118, + "num_tokens": 6824894.0, + "step": 5311, + "train/ce_loss": 0.8194974064826965 + }, + { + "epoch": 0.5251137037769429, + "step": 5311, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5251137037769429, + "step": 5311, + "train/total_loss": 0.13663724064826965 + }, + { + "entropy": 9.612142562866211, + "epoch": 0.5252125766264584, + "mean_token_accuracy": 0.7628083229064941, + "num_tokens": 6829864.0, + "step": 5312, + "train/ce_loss": 1.1538187265396118 + }, + { + "epoch": 0.5252125766264584, + "step": 5312, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5252125766264584, + "step": 5312, + "train/total_loss": 0.1544443666934967 + }, + { + "entropy": 9.37071704864502, + "epoch": 0.5253114494759739, + "mean_token_accuracy": 0.7274011373519897, + "num_tokens": 6835021.0, + "step": 5313, + "train/ce_loss": 4.946698027197272e-06 + }, + { + "epoch": 0.5253114494759739, + "step": 5313, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.5253114494759739, + "step": 5313, + "train/total_loss": 0.027344245463609695 + }, + { + "entropy": 8.565313339233398, + "epoch": 0.5254103223254895, + "mean_token_accuracy": 0.7023661136627197, + "num_tokens": 6840341.0, + "step": 5314, + "train/ce_loss": 0.6946756839752197 + }, + { + "epoch": 0.5254103223254895, + "step": 5314, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5254103223254895, + "step": 5314, + "train/total_loss": 0.10853006690740585 + }, + { + "entropy": 9.333951950073242, + "epoch": 0.5255091951750049, + "mean_token_accuracy": 0.7058823704719543, + "num_tokens": 6845504.0, + "step": 5315, + "train/ce_loss": 1.2145005464553833 + }, + { + "epoch": 0.5255091951750049, + "step": 5315, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.5255091951750049, + "step": 5315, + "train/total_loss": 0.19957506656646729 + }, + { + "entropy": 9.258594512939453, + "epoch": 0.5256080680245204, + "mean_token_accuracy": 0.723308265209198, + "num_tokens": 6850575.0, + "step": 5316, + "train/ce_loss": 1.0137473344802856 + }, + { + "epoch": 0.5256080680245204, + "step": 5316, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5256080680245204, + "step": 5316, + "train/total_loss": 0.15215599536895752 + }, + { + "entropy": 9.147095680236816, + "epoch": 0.525706940874036, + "mean_token_accuracy": 0.7363861203193665, + "num_tokens": 6855829.0, + "step": 5317, + "train/ce_loss": 5.505980880116113e-07 + }, + { + "epoch": 0.525706940874036, + "step": 5317, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.525706940874036, + "step": 5317, + "train/total_loss": 0.019531305879354477 + }, + { + "entropy": 9.644766807556152, + "epoch": 0.5258058137235515, + "mean_token_accuracy": 0.7862903475761414, + "num_tokens": 6860726.0, + "step": 5318, + "train/ce_loss": 1.0257605254082591e-06 + }, + { + "epoch": 0.5258058137235515, + "step": 5318, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5258058137235515, + "step": 5318, + "train/total_loss": 0.03125010430812836 + }, + { + "entropy": 9.015385627746582, + "epoch": 0.525904686573067, + "mean_token_accuracy": 0.6856464743614197, + "num_tokens": 6866068.0, + "step": 5319, + "train/ce_loss": 1.1722711324691772 + }, + { + "epoch": 0.525904686573067, + "step": 5319, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.525904686573067, + "step": 5319, + "train/total_loss": 0.19925835728645325 + }, + { + "epoch": 0.5260035594225826, + "grad_norm": 0.7373986840248108, + "learning_rate": 8.687385650002474e-06, + "loss": 0.1406, + "step": 5320 + }, + { + "entropy": 9.100988388061523, + "epoch": 0.5260035594225826, + "mean_token_accuracy": 0.7322946190834045, + "num_tokens": 6871259.0, + "step": 5320, + "train/ce_loss": 0.6726405620574951 + }, + { + "epoch": 0.5260035594225826, + "step": 5320, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5260035594225826, + "step": 5320, + "train/total_loss": 0.10242030769586563 + }, + { + "entropy": 9.619433403015137, + "epoch": 0.5261024322720981, + "mean_token_accuracy": 0.7317647337913513, + "num_tokens": 6876083.0, + "step": 5321, + "train/ce_loss": 2.4636622129037278e-06 + }, + { + "epoch": 0.5261024322720981, + "step": 5321, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5261024322720981, + "step": 5321, + "train/total_loss": 0.0312502458691597 + }, + { + "entropy": 9.355940818786621, + "epoch": 0.5262013051216136, + "mean_token_accuracy": 0.7342342138290405, + "num_tokens": 6881193.0, + "step": 5322, + "train/ce_loss": 0.9472965002059937 + }, + { + "epoch": 0.5262013051216136, + "step": 5322, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.5262013051216136, + "step": 5322, + "train/total_loss": 0.19629216194152832 + }, + { + "entropy": 9.124074935913086, + "epoch": 0.5263001779711292, + "mean_token_accuracy": 0.7120141386985779, + "num_tokens": 6886202.0, + "step": 5323, + "train/ce_loss": 2.04034381567908e-06 + }, + { + "epoch": 0.5263001779711292, + "step": 5323, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5263001779711292, + "step": 5323, + "train/total_loss": 0.035156454890966415 + }, + { + "entropy": 9.902753829956055, + "epoch": 0.5263990508206446, + "mean_token_accuracy": 0.6672897338867188, + "num_tokens": 6891114.0, + "step": 5324, + "train/ce_loss": 9.634625257604057e-07 + }, + { + "epoch": 0.5263990508206446, + "step": 5324, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5263990508206446, + "step": 5324, + "train/total_loss": 0.01953134685754776 + }, + { + "entropy": 9.061259269714355, + "epoch": 0.5264979236701601, + "mean_token_accuracy": 0.729393482208252, + "num_tokens": 6896169.0, + "step": 5325, + "train/ce_loss": 1.6091874837875366 + }, + { + "epoch": 0.5264979236701601, + "step": 5325, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5264979236701601, + "step": 5325, + "train/total_loss": 0.22341875731945038 + }, + { + "entropy": 8.957914352416992, + "epoch": 0.5265967965196757, + "mean_token_accuracy": 0.7336394786834717, + "num_tokens": 6901464.0, + "step": 5326, + "train/ce_loss": 0.5018377304077148 + }, + { + "epoch": 0.5265967965196757, + "step": 5326, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.5265967965196757, + "step": 5326, + "train/total_loss": 0.13612127304077148 + }, + { + "entropy": 8.64019775390625, + "epoch": 0.5266956693691912, + "mean_token_accuracy": 0.6845564246177673, + "num_tokens": 6907038.0, + "step": 5327, + "train/ce_loss": 0.9129147529602051 + }, + { + "epoch": 0.5266956693691912, + "step": 5327, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5266956693691912, + "step": 5327, + "train/total_loss": 0.13816648721694946 + }, + { + "entropy": 9.07165241241455, + "epoch": 0.5267945422187067, + "mean_token_accuracy": 0.7884841561317444, + "num_tokens": 6912332.0, + "step": 5328, + "train/ce_loss": 0.5771158337593079 + }, + { + "epoch": 0.5267945422187067, + "step": 5328, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5267945422187067, + "step": 5328, + "train/total_loss": 0.12021158635616302 + }, + { + "entropy": 9.02506160736084, + "epoch": 0.5268934150682223, + "mean_token_accuracy": 0.6881496906280518, + "num_tokens": 6917741.0, + "step": 5329, + "train/ce_loss": 1.7925119400024414 + }, + { + "epoch": 0.5268934150682223, + "step": 5329, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.5268934150682223, + "step": 5329, + "train/total_loss": 0.27690744400024414 + }, + { + "entropy": 9.574190139770508, + "epoch": 0.5269922879177378, + "mean_token_accuracy": 0.7754442691802979, + "num_tokens": 6922979.0, + "step": 5330, + "train/ce_loss": 2.2391729999071686e-06 + }, + { + "epoch": 0.5269922879177378, + "step": 5330, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5269922879177378, + "step": 5330, + "train/total_loss": 0.05859397351741791 + }, + { + "entropy": 8.93344783782959, + "epoch": 0.5270911607672533, + "mean_token_accuracy": 0.769138753414154, + "num_tokens": 6928305.0, + "step": 5331, + "train/ce_loss": 0.5848641395568848 + }, + { + "epoch": 0.5270911607672533, + "step": 5331, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5270911607672533, + "step": 5331, + "train/total_loss": 0.12098641693592072 + }, + { + "entropy": 8.794695854187012, + "epoch": 0.5271900336167689, + "mean_token_accuracy": 0.7119628190994263, + "num_tokens": 6933642.0, + "step": 5332, + "train/ce_loss": 0.5715578198432922 + }, + { + "epoch": 0.5271900336167689, + "step": 5332, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5271900336167689, + "step": 5332, + "train/total_loss": 0.1157495379447937 + }, + { + "entropy": 9.645194053649902, + "epoch": 0.5272889064662843, + "mean_token_accuracy": 0.7186897993087769, + "num_tokens": 6938782.0, + "step": 5333, + "train/ce_loss": 1.0152158438359038e-06 + }, + { + "epoch": 0.5272889064662843, + "step": 5333, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.5272889064662843, + "step": 5333, + "train/total_loss": 0.015625102445483208 + }, + { + "entropy": 9.530113220214844, + "epoch": 0.5273877793157998, + "mean_token_accuracy": 0.752212405204773, + "num_tokens": 6943810.0, + "step": 5334, + "train/ce_loss": 0.7390918731689453 + }, + { + "epoch": 0.5273877793157998, + "step": 5334, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5273877793157998, + "step": 5334, + "train/total_loss": 0.128596693277359 + }, + { + "entropy": 8.728327751159668, + "epoch": 0.5274866521653154, + "mean_token_accuracy": 0.7289617657661438, + "num_tokens": 6949243.0, + "step": 5335, + "train/ce_loss": 1.2972229719161987 + }, + { + "epoch": 0.5274866521653154, + "step": 5335, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5274866521653154, + "step": 5335, + "train/total_loss": 0.18440979719161987 + }, + { + "entropy": 8.749822616577148, + "epoch": 0.5275855250148309, + "mean_token_accuracy": 0.7312961220741272, + "num_tokens": 6954625.0, + "step": 5336, + "train/ce_loss": 0.8289675116539001 + }, + { + "epoch": 0.5275855250148309, + "step": 5336, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5275855250148309, + "step": 5336, + "train/total_loss": 0.13758425414562225 + }, + { + "entropy": 9.186654090881348, + "epoch": 0.5276843978643464, + "mean_token_accuracy": 0.7318932414054871, + "num_tokens": 6959816.0, + "step": 5337, + "train/ce_loss": 0.9297966361045837 + }, + { + "epoch": 0.5276843978643464, + "step": 5337, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5276843978643464, + "step": 5337, + "train/total_loss": 0.13204216957092285 + }, + { + "entropy": 9.81143856048584, + "epoch": 0.527783270713862, + "mean_token_accuracy": 0.692307710647583, + "num_tokens": 6964574.0, + "step": 5338, + "train/ce_loss": 2.8371430289553246e-06 + }, + { + "epoch": 0.527783270713862, + "step": 5338, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.527783270713862, + "step": 5338, + "train/total_loss": 0.03906278312206268 + }, + { + "entropy": 9.589973449707031, + "epoch": 0.5278821435633775, + "mean_token_accuracy": 0.7482394576072693, + "num_tokens": 6969565.0, + "step": 5339, + "train/ce_loss": 1.5491751432418823 + }, + { + "epoch": 0.5278821435633775, + "step": 5339, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.5278821435633775, + "step": 5339, + "train/total_loss": 0.23304252326488495 + }, + { + "epoch": 0.527981016412893, + "grad_norm": 0.6893176436424255, + "learning_rate": 8.682440785244524e-06, + "loss": 0.1464, + "step": 5340 + }, + { + "entropy": 9.429092407226562, + "epoch": 0.527981016412893, + "mean_token_accuracy": 0.7631999850273132, + "num_tokens": 6974619.0, + "step": 5340, + "train/ce_loss": 1.4807854890823364 + }, + { + "epoch": 0.527981016412893, + "step": 5340, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.527981016412893, + "step": 5340, + "train/total_loss": 0.2105785459280014 + }, + { + "entropy": 8.987339973449707, + "epoch": 0.5280798892624086, + "mean_token_accuracy": 0.7665505409240723, + "num_tokens": 6979942.0, + "step": 5341, + "train/ce_loss": 0.435159295797348 + }, + { + "epoch": 0.5280798892624086, + "step": 5341, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.5280798892624086, + "step": 5341, + "train/total_loss": 0.07085968554019928 + }, + { + "entropy": 9.35151481628418, + "epoch": 0.528178762111924, + "mean_token_accuracy": 0.756533682346344, + "num_tokens": 6985119.0, + "step": 5342, + "train/ce_loss": 1.3392177820205688 + }, + { + "epoch": 0.528178762111924, + "step": 5342, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.528178762111924, + "step": 5342, + "train/total_loss": 0.2042342871427536 + }, + { + "entropy": 8.658845901489258, + "epoch": 0.5282776349614395, + "mean_token_accuracy": 0.7789585590362549, + "num_tokens": 6990565.0, + "step": 5343, + "train/ce_loss": 0.3492826521396637 + }, + { + "epoch": 0.5282776349614395, + "step": 5343, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5282776349614395, + "step": 5343, + "train/total_loss": 0.07399076223373413 + }, + { + "entropy": 8.939874649047852, + "epoch": 0.5283765078109551, + "mean_token_accuracy": 0.7493734359741211, + "num_tokens": 6995821.0, + "step": 5344, + "train/ce_loss": 0.746372640132904 + }, + { + "epoch": 0.5283765078109551, + "step": 5344, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.5283765078109551, + "step": 5344, + "train/total_loss": 0.1801060140132904 + }, + { + "entropy": 8.87545108795166, + "epoch": 0.5284753806604706, + "mean_token_accuracy": 0.7189781069755554, + "num_tokens": 7001091.0, + "step": 5345, + "train/ce_loss": 1.1034655570983887 + }, + { + "epoch": 0.5284753806604706, + "step": 5345, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5284753806604706, + "step": 5345, + "train/total_loss": 0.14940905570983887 + }, + { + "entropy": 9.050783157348633, + "epoch": 0.5285742535099861, + "mean_token_accuracy": 0.7047619223594666, + "num_tokens": 7006373.0, + "step": 5346, + "train/ce_loss": 0.8942811489105225 + }, + { + "epoch": 0.5285742535099861, + "step": 5346, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5285742535099861, + "step": 5346, + "train/total_loss": 0.1558343768119812 + }, + { + "entropy": 8.830314636230469, + "epoch": 0.5286731263595017, + "mean_token_accuracy": 0.744508683681488, + "num_tokens": 7011710.0, + "step": 5347, + "train/ce_loss": 0.5430724620819092 + }, + { + "epoch": 0.5286731263595017, + "step": 5347, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5286731263595017, + "step": 5347, + "train/total_loss": 0.1246197521686554 + }, + { + "entropy": 9.55591106414795, + "epoch": 0.5287719992090172, + "mean_token_accuracy": 0.7323688864707947, + "num_tokens": 7016726.0, + "step": 5348, + "train/ce_loss": 1.0442057847976685 + }, + { + "epoch": 0.5287719992090172, + "step": 5348, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.5287719992090172, + "step": 5348, + "train/total_loss": 0.18254557251930237 + }, + { + "entropy": 9.241381645202637, + "epoch": 0.5288708720585328, + "mean_token_accuracy": 0.7426035404205322, + "num_tokens": 7021787.0, + "step": 5349, + "train/ce_loss": 1.3119507684677956e-06 + }, + { + "epoch": 0.5288708720585328, + "step": 5349, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.5288708720585328, + "step": 5349, + "train/total_loss": 0.07812513411045074 + }, + { + "entropy": 8.90439224243164, + "epoch": 0.5289697449080483, + "mean_token_accuracy": 0.7367021441459656, + "num_tokens": 7027049.0, + "step": 5350, + "train/ce_loss": 0.9882033467292786 + }, + { + "epoch": 0.5289697449080483, + "step": 5350, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5289697449080483, + "step": 5350, + "train/total_loss": 0.15350782871246338 + }, + { + "entropy": 9.076597213745117, + "epoch": 0.5290686177575638, + "mean_token_accuracy": 0.7213695645332336, + "num_tokens": 7032379.0, + "step": 5351, + "train/ce_loss": 0.6717884540557861 + }, + { + "epoch": 0.5290686177575638, + "step": 5351, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5290686177575638, + "step": 5351, + "train/total_loss": 0.10233509540557861 + }, + { + "entropy": 9.232297897338867, + "epoch": 0.5291674906070793, + "mean_token_accuracy": 0.73221755027771, + "num_tokens": 7037574.0, + "step": 5352, + "train/ce_loss": 0.601751983165741 + }, + { + "epoch": 0.5291674906070793, + "step": 5352, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5291674906070793, + "step": 5352, + "train/total_loss": 0.09923769533634186 + }, + { + "entropy": 9.587291717529297, + "epoch": 0.5292663634565948, + "mean_token_accuracy": 0.7495527863502502, + "num_tokens": 7042573.0, + "step": 5353, + "train/ce_loss": 1.2889903783798218 + }, + { + "epoch": 0.5292663634565948, + "step": 5353, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.5292663634565948, + "step": 5353, + "train/total_loss": 0.20702403783798218 + }, + { + "entropy": 9.119341850280762, + "epoch": 0.5293652363061103, + "mean_token_accuracy": 0.8156911730766296, + "num_tokens": 7047852.0, + "step": 5354, + "train/ce_loss": 0.966998279094696 + }, + { + "epoch": 0.5293652363061103, + "step": 5354, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.5293652363061103, + "step": 5354, + "train/total_loss": 0.17482483386993408 + }, + { + "entropy": 8.981361389160156, + "epoch": 0.5294641091556259, + "mean_token_accuracy": 0.7072625756263733, + "num_tokens": 7053239.0, + "step": 5355, + "train/ce_loss": 1.3036599159240723 + }, + { + "epoch": 0.5294641091556259, + "step": 5355, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5294641091556259, + "step": 5355, + "train/total_loss": 0.1928659975528717 + }, + { + "entropy": 9.078542709350586, + "epoch": 0.5295629820051414, + "mean_token_accuracy": 0.7303493618965149, + "num_tokens": 7058558.0, + "step": 5356, + "train/ce_loss": 1.2763357162475586 + }, + { + "epoch": 0.5295629820051414, + "step": 5356, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5295629820051414, + "step": 5356, + "train/total_loss": 0.18232107162475586 + }, + { + "entropy": 9.39804458618164, + "epoch": 0.5296618548546569, + "mean_token_accuracy": 0.7639344334602356, + "num_tokens": 7063629.0, + "step": 5357, + "train/ce_loss": 1.943985807884019e-06 + }, + { + "epoch": 0.5296618548546569, + "step": 5357, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5296618548546569, + "step": 5357, + "train/total_loss": 0.06250019371509552 + }, + { + "entropy": 9.844568252563477, + "epoch": 0.5297607277041725, + "mean_token_accuracy": 0.7413395047187805, + "num_tokens": 7068489.0, + "step": 5358, + "train/ce_loss": 1.7062684297561646 + }, + { + "epoch": 0.5297607277041725, + "step": 5358, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.5297607277041725, + "step": 5358, + "train/total_loss": 0.27218934893608093 + }, + { + "entropy": 10.298467636108398, + "epoch": 0.529859600553688, + "mean_token_accuracy": 0.7388888597488403, + "num_tokens": 7073075.0, + "step": 5359, + "train/ce_loss": 9.308042535849381e-06 + }, + { + "epoch": 0.529859600553688, + "step": 5359, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.529859600553688, + "step": 5359, + "train/total_loss": 0.046875931322574615 + }, + { + "epoch": 0.5299584734032035, + "grad_norm": 1.147903323173523, + "learning_rate": 8.677495920486575e-06, + "loss": 0.1397, + "step": 5360 + }, + { + "entropy": 9.023246765136719, + "epoch": 0.5299584734032035, + "mean_token_accuracy": 0.7118881344795227, + "num_tokens": 7078321.0, + "step": 5360, + "train/ce_loss": 0.8227757811546326 + }, + { + "epoch": 0.5299584734032035, + "step": 5360, + "train/sim_loss": 0.14453125 + }, + { + "epoch": 0.5299584734032035, + "step": 5360, + "train/total_loss": 0.2268088310956955 + }, + { + "entropy": 8.935810089111328, + "epoch": 0.530057346252719, + "mean_token_accuracy": 0.7245370149612427, + "num_tokens": 7083701.0, + "step": 5361, + "train/ce_loss": 0.690895140171051 + }, + { + "epoch": 0.530057346252719, + "step": 5361, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.530057346252719, + "step": 5361, + "train/total_loss": 0.14330826699733734 + }, + { + "entropy": 9.249588012695312, + "epoch": 0.5301562191022345, + "mean_token_accuracy": 0.6653465628623962, + "num_tokens": 7088584.0, + "step": 5362, + "train/ce_loss": 3.4195520584034966e-06 + }, + { + "epoch": 0.5301562191022345, + "step": 5362, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5301562191022345, + "step": 5362, + "train/total_loss": 0.03906284272670746 + }, + { + "entropy": 9.306418418884277, + "epoch": 0.53025509195175, + "mean_token_accuracy": 0.73893803358078, + "num_tokens": 7093689.0, + "step": 5363, + "train/ce_loss": 1.1926301717758179 + }, + { + "epoch": 0.53025509195175, + "step": 5363, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.53025509195175, + "step": 5363, + "train/total_loss": 0.15441927313804626 + }, + { + "entropy": 9.027421951293945, + "epoch": 0.5303539648012656, + "mean_token_accuracy": 0.7839721441268921, + "num_tokens": 7099034.0, + "step": 5364, + "train/ce_loss": 0.5805795788764954 + }, + { + "epoch": 0.5303539648012656, + "step": 5364, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5303539648012656, + "step": 5364, + "train/total_loss": 0.10883921384811401 + }, + { + "entropy": 9.983185768127441, + "epoch": 0.5304528376507811, + "mean_token_accuracy": 0.6724137663841248, + "num_tokens": 7103776.0, + "step": 5365, + "train/ce_loss": 1.7935803953150753e-06 + }, + { + "epoch": 0.5304528376507811, + "step": 5365, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.5304528376507811, + "step": 5365, + "train/total_loss": 0.015625178813934326 + }, + { + "entropy": 9.114487648010254, + "epoch": 0.5305517105002966, + "mean_token_accuracy": 0.7476882338523865, + "num_tokens": 7109028.0, + "step": 5366, + "train/ce_loss": 1.0102288722991943 + }, + { + "epoch": 0.5305517105002966, + "step": 5366, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5305517105002966, + "step": 5366, + "train/total_loss": 0.1400853991508484 + }, + { + "entropy": 9.042924880981445, + "epoch": 0.5306505833498122, + "mean_token_accuracy": 0.7706935405731201, + "num_tokens": 7114372.0, + "step": 5367, + "train/ce_loss": 0.9071125388145447 + }, + { + "epoch": 0.5306505833498122, + "step": 5367, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5306505833498122, + "step": 5367, + "train/total_loss": 0.15321126580238342 + }, + { + "entropy": 8.998058319091797, + "epoch": 0.5307494561993277, + "mean_token_accuracy": 0.7087979912757874, + "num_tokens": 7119683.0, + "step": 5368, + "train/ce_loss": 1.2738776206970215 + }, + { + "epoch": 0.5307494561993277, + "step": 5368, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5307494561993277, + "step": 5368, + "train/total_loss": 0.17426276206970215 + }, + { + "entropy": 9.279787063598633, + "epoch": 0.5308483290488432, + "mean_token_accuracy": 0.767169177532196, + "num_tokens": 7124710.0, + "step": 5369, + "train/ce_loss": 0.9067485332489014 + }, + { + "epoch": 0.5308483290488432, + "step": 5369, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5308483290488432, + "step": 5369, + "train/total_loss": 0.11020610481500626 + }, + { + "entropy": 9.459781646728516, + "epoch": 0.5309472018983588, + "mean_token_accuracy": 0.7230046987533569, + "num_tokens": 7129771.0, + "step": 5370, + "train/ce_loss": 1.6413638591766357 + }, + { + "epoch": 0.5309472018983588, + "step": 5370, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5309472018983588, + "step": 5370, + "train/total_loss": 0.1992926448583603 + }, + { + "entropy": 9.348337173461914, + "epoch": 0.5310460747478742, + "mean_token_accuracy": 0.7108238935470581, + "num_tokens": 7134836.0, + "step": 5371, + "train/ce_loss": 1.2921637296676636 + }, + { + "epoch": 0.5310460747478742, + "step": 5371, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5310460747478742, + "step": 5371, + "train/total_loss": 0.17999762296676636 + }, + { + "entropy": 8.956254005432129, + "epoch": 0.5311449475973897, + "mean_token_accuracy": 0.7180910110473633, + "num_tokens": 7140224.0, + "step": 5372, + "train/ce_loss": 1.1430314779281616 + }, + { + "epoch": 0.5311449475973897, + "step": 5372, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.5311449475973897, + "step": 5372, + "train/total_loss": 0.24320939183235168 + }, + { + "entropy": 9.061437606811523, + "epoch": 0.5312438204469053, + "mean_token_accuracy": 0.7108886241912842, + "num_tokens": 7145494.0, + "step": 5373, + "train/ce_loss": 1.20707106590271 + }, + { + "epoch": 0.5312438204469053, + "step": 5373, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5312438204469053, + "step": 5373, + "train/total_loss": 0.15195710957050323 + }, + { + "entropy": 9.329811096191406, + "epoch": 0.5313426932964208, + "mean_token_accuracy": 0.7623066306114197, + "num_tokens": 7150651.0, + "step": 5374, + "train/ce_loss": 1.5130629539489746 + }, + { + "epoch": 0.5313426932964208, + "step": 5374, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.5313426932964208, + "step": 5374, + "train/total_loss": 0.24896255135536194 + }, + { + "entropy": 9.067573547363281, + "epoch": 0.5314415661459363, + "mean_token_accuracy": 0.7314211130142212, + "num_tokens": 7155870.0, + "step": 5375, + "train/ce_loss": 9.660790283305687e-07 + }, + { + "epoch": 0.5314415661459363, + "step": 5375, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5314415661459363, + "step": 5375, + "train/total_loss": 0.04687509685754776 + }, + { + "entropy": 9.270325660705566, + "epoch": 0.5315404389954519, + "mean_token_accuracy": 0.7004104256629944, + "num_tokens": 7161085.0, + "step": 5376, + "train/ce_loss": 0.6302452087402344 + }, + { + "epoch": 0.5315404389954519, + "step": 5376, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5315404389954519, + "step": 5376, + "train/total_loss": 0.10208702087402344 + }, + { + "entropy": 9.019856452941895, + "epoch": 0.5316393118449674, + "mean_token_accuracy": 0.7462871074676514, + "num_tokens": 7166398.0, + "step": 5377, + "train/ce_loss": 1.1178202629089355 + }, + { + "epoch": 0.5316393118449674, + "step": 5377, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.5316393118449674, + "step": 5377, + "train/total_loss": 0.1938132792711258 + }, + { + "entropy": 9.057889938354492, + "epoch": 0.5317381846944829, + "mean_token_accuracy": 0.7750611305236816, + "num_tokens": 7171683.0, + "step": 5378, + "train/ce_loss": 1.0019845962524414 + }, + { + "epoch": 0.5317381846944829, + "step": 5378, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.5317381846944829, + "step": 5378, + "train/total_loss": 0.20566721260547638 + }, + { + "entropy": 9.298063278198242, + "epoch": 0.5318370575439985, + "mean_token_accuracy": 0.7192254662513733, + "num_tokens": 7177019.0, + "step": 5379, + "train/ce_loss": 0.8760280013084412 + }, + { + "epoch": 0.5318370575439985, + "step": 5379, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.5318370575439985, + "step": 5379, + "train/total_loss": 0.18916529417037964 + }, + { + "epoch": 0.531935930393514, + "grad_norm": 0.729664146900177, + "learning_rate": 8.672551055728627e-06, + "loss": 0.1479, + "step": 5380 + }, + { + "entropy": 8.049644470214844, + "epoch": 0.531935930393514, + "mean_token_accuracy": 0.6997219920158386, + "num_tokens": 7182594.0, + "step": 5380, + "train/ce_loss": 0.6018348336219788 + }, + { + "epoch": 0.531935930393514, + "step": 5380, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.531935930393514, + "step": 5380, + "train/total_loss": 0.12268348038196564 + }, + { + "entropy": 8.955841064453125, + "epoch": 0.5320348032430294, + "mean_token_accuracy": 0.698060929775238, + "num_tokens": 7187783.0, + "step": 5381, + "train/ce_loss": 0.8994425535202026 + }, + { + "epoch": 0.5320348032430294, + "step": 5381, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5320348032430294, + "step": 5381, + "train/total_loss": 0.1485380083322525 + }, + { + "entropy": 9.328859329223633, + "epoch": 0.532133676092545, + "mean_token_accuracy": 0.7279151678085327, + "num_tokens": 7192787.0, + "step": 5382, + "train/ce_loss": 1.0890889167785645 + }, + { + "epoch": 0.532133676092545, + "step": 5382, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.532133676092545, + "step": 5382, + "train/total_loss": 0.14406514167785645 + }, + { + "entropy": 9.188087463378906, + "epoch": 0.5322325489420605, + "mean_token_accuracy": 0.6950182318687439, + "num_tokens": 7198064.0, + "step": 5383, + "train/ce_loss": 1.312862753868103 + }, + { + "epoch": 0.5322325489420605, + "step": 5383, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5322325489420605, + "step": 5383, + "train/total_loss": 0.18206752836704254 + }, + { + "entropy": 8.715286254882812, + "epoch": 0.532331421791576, + "mean_token_accuracy": 0.7806072235107422, + "num_tokens": 7203502.0, + "step": 5384, + "train/ce_loss": 0.5634164214134216 + }, + { + "epoch": 0.532331421791576, + "step": 5384, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.532331421791576, + "step": 5384, + "train/total_loss": 0.08368539810180664 + }, + { + "entropy": 9.236735343933105, + "epoch": 0.5324302946410916, + "mean_token_accuracy": 0.6630434989929199, + "num_tokens": 7208754.0, + "step": 5385, + "train/ce_loss": 0.845870316028595 + }, + { + "epoch": 0.5324302946410916, + "step": 5385, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5324302946410916, + "step": 5385, + "train/total_loss": 0.13536828756332397 + }, + { + "entropy": 9.177253723144531, + "epoch": 0.5325291674906071, + "mean_token_accuracy": 0.7102342844009399, + "num_tokens": 7214044.0, + "step": 5386, + "train/ce_loss": 0.7868425250053406 + }, + { + "epoch": 0.5325291674906071, + "step": 5386, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5325291674906071, + "step": 5386, + "train/total_loss": 0.1255592554807663 + }, + { + "entropy": 9.212908744812012, + "epoch": 0.5326280403401226, + "mean_token_accuracy": 0.7291960716247559, + "num_tokens": 7219250.0, + "step": 5387, + "train/ce_loss": 0.8648903965950012 + }, + { + "epoch": 0.5326280403401226, + "step": 5387, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5326280403401226, + "step": 5387, + "train/total_loss": 0.14508280158042908 + }, + { + "entropy": 9.522449493408203, + "epoch": 0.5327269131896382, + "mean_token_accuracy": 0.6977124214172363, + "num_tokens": 7224314.0, + "step": 5388, + "train/ce_loss": 7.962746622069972e-07 + }, + { + "epoch": 0.5327269131896382, + "step": 5388, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.5327269131896382, + "step": 5388, + "train/total_loss": 0.027343830093741417 + }, + { + "entropy": 8.765491485595703, + "epoch": 0.5328257860391536, + "mean_token_accuracy": 0.7887005805969238, + "num_tokens": 7229679.0, + "step": 5389, + "train/ce_loss": 0.7940881848335266 + }, + { + "epoch": 0.5328257860391536, + "step": 5389, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5328257860391536, + "step": 5389, + "train/total_loss": 0.14972132444381714 + }, + { + "entropy": 8.809341430664062, + "epoch": 0.5329246588886691, + "mean_token_accuracy": 0.7212249040603638, + "num_tokens": 7235097.0, + "step": 5390, + "train/ce_loss": 0.7956955432891846 + }, + { + "epoch": 0.5329246588886691, + "step": 5390, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5329246588886691, + "step": 5390, + "train/total_loss": 0.14597579836845398 + }, + { + "entropy": 9.053311347961426, + "epoch": 0.5330235317381847, + "mean_token_accuracy": 0.782608687877655, + "num_tokens": 7240432.0, + "step": 5391, + "train/ce_loss": 0.7339907884597778 + }, + { + "epoch": 0.5330235317381847, + "step": 5391, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5330235317381847, + "step": 5391, + "train/total_loss": 0.12418033182621002 + }, + { + "entropy": 8.960689544677734, + "epoch": 0.5331224045877002, + "mean_token_accuracy": 0.8105975389480591, + "num_tokens": 7245778.0, + "step": 5392, + "train/ce_loss": 0.5632300972938538 + }, + { + "epoch": 0.5331224045877002, + "step": 5392, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5331224045877002, + "step": 5392, + "train/total_loss": 0.11101050674915314 + }, + { + "entropy": 8.973613739013672, + "epoch": 0.5332212774372157, + "mean_token_accuracy": 0.7176079750061035, + "num_tokens": 7251181.0, + "step": 5393, + "train/ce_loss": 0.6810163259506226 + }, + { + "epoch": 0.5332212774372157, + "step": 5393, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5332212774372157, + "step": 5393, + "train/total_loss": 0.09153913706541061 + }, + { + "entropy": 10.398794174194336, + "epoch": 0.5333201502867313, + "mean_token_accuracy": 0.6315789222717285, + "num_tokens": 7255809.0, + "step": 5394, + "train/ce_loss": 1.6151298041222617e-06 + }, + { + "epoch": 0.5333201502867313, + "step": 5394, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5333201502867313, + "step": 5394, + "train/total_loss": 0.023437662050127983 + }, + { + "entropy": 9.192532539367676, + "epoch": 0.5334190231362468, + "mean_token_accuracy": 0.7105262875556946, + "num_tokens": 7261046.0, + "step": 5395, + "train/ce_loss": 0.6318694353103638 + }, + { + "epoch": 0.5334190231362468, + "step": 5395, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.5334190231362468, + "step": 5395, + "train/total_loss": 0.18818694353103638 + }, + { + "entropy": 8.94017505645752, + "epoch": 0.5335178959857623, + "mean_token_accuracy": 0.7543859481811523, + "num_tokens": 7266242.0, + "step": 5396, + "train/ce_loss": 0.6685409545898438 + }, + { + "epoch": 0.5335178959857623, + "step": 5396, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5335178959857623, + "step": 5396, + "train/total_loss": 0.1137290969491005 + }, + { + "entropy": 9.322994232177734, + "epoch": 0.5336167688352779, + "mean_token_accuracy": 0.7636022567749023, + "num_tokens": 7271206.0, + "step": 5397, + "train/ce_loss": 0.7755517363548279 + }, + { + "epoch": 0.5336167688352779, + "step": 5397, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.5336167688352779, + "step": 5397, + "train/total_loss": 0.10489892214536667 + }, + { + "entropy": 9.49734878540039, + "epoch": 0.5337156416847934, + "mean_token_accuracy": 0.7454873919487, + "num_tokens": 7276177.0, + "step": 5398, + "train/ce_loss": 1.4217755794525146 + }, + { + "epoch": 0.5337156416847934, + "step": 5398, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5337156416847934, + "step": 5398, + "train/total_loss": 0.19295881688594818 + }, + { + "entropy": 9.841611862182617, + "epoch": 0.5338145145343088, + "mean_token_accuracy": 0.7167919874191284, + "num_tokens": 7280977.0, + "step": 5399, + "train/ce_loss": 1.1508462876008707e-06 + }, + { + "epoch": 0.5338145145343088, + "step": 5399, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.5338145145343088, + "step": 5399, + "train/total_loss": 0.011718865483999252 + }, + { + "epoch": 0.5339133873838244, + "grad_norm": 1.0195847749710083, + "learning_rate": 8.667606190970677e-06, + "loss": 0.1459, + "step": 5400 + }, + { + "entropy": 9.270522117614746, + "epoch": 0.5339133873838244, + "mean_token_accuracy": 0.7745901346206665, + "num_tokens": 7286157.0, + "step": 5400, + "train/ce_loss": 0.9633910655975342 + }, + { + "epoch": 0.5339133873838244, + "step": 5400, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5339133873838244, + "step": 5400, + "train/total_loss": 0.15102660655975342 + }, + { + "entropy": 8.760650634765625, + "epoch": 0.5340122602333399, + "mean_token_accuracy": 0.7218309640884399, + "num_tokens": 7291532.0, + "step": 5401, + "train/ce_loss": 1.5196077823638916 + }, + { + "epoch": 0.5340122602333399, + "step": 5401, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5340122602333399, + "step": 5401, + "train/total_loss": 0.19883577525615692 + }, + { + "entropy": 9.711226463317871, + "epoch": 0.5341111330828554, + "mean_token_accuracy": 0.747474730014801, + "num_tokens": 7296457.0, + "step": 5402, + "train/ce_loss": 2.4552828108426183e-06 + }, + { + "epoch": 0.5341111330828554, + "step": 5402, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5341111330828554, + "step": 5402, + "train/total_loss": 0.0429689958691597 + }, + { + "entropy": 10.111250877380371, + "epoch": 0.534210005932371, + "mean_token_accuracy": 0.71074378490448, + "num_tokens": 7301196.0, + "step": 5403, + "train/ce_loss": 2.5295214653015137 + }, + { + "epoch": 0.534210005932371, + "step": 5403, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.534210005932371, + "step": 5403, + "train/total_loss": 0.3193584084510803 + }, + { + "entropy": 9.483728408813477, + "epoch": 0.5343088787818865, + "mean_token_accuracy": 0.7246835231781006, + "num_tokens": 7306240.0, + "step": 5404, + "train/ce_loss": 1.2337623834609985 + }, + { + "epoch": 0.5343088787818865, + "step": 5404, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5343088787818865, + "step": 5404, + "train/total_loss": 0.1741575002670288 + }, + { + "entropy": 9.128599166870117, + "epoch": 0.534407751631402, + "mean_token_accuracy": 0.75, + "num_tokens": 7311356.0, + "step": 5405, + "train/ce_loss": 0.9270554780960083 + }, + { + "epoch": 0.534407751631402, + "step": 5405, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.534407751631402, + "step": 5405, + "train/total_loss": 0.16301804780960083 + }, + { + "entropy": 9.280820846557617, + "epoch": 0.5345066244809176, + "mean_token_accuracy": 0.7132459878921509, + "num_tokens": 7316523.0, + "step": 5406, + "train/ce_loss": 1.3654454946517944 + }, + { + "epoch": 0.5345066244809176, + "step": 5406, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5345066244809176, + "step": 5406, + "train/total_loss": 0.19123205542564392 + }, + { + "entropy": 8.945045471191406, + "epoch": 0.534605497330433, + "mean_token_accuracy": 0.7852193713188171, + "num_tokens": 7321865.0, + "step": 5407, + "train/ce_loss": 0.6675707101821899 + }, + { + "epoch": 0.534605497330433, + "step": 5407, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.534605497330433, + "step": 5407, + "train/total_loss": 0.14488208293914795 + }, + { + "entropy": 9.157123565673828, + "epoch": 0.5347043701799485, + "mean_token_accuracy": 0.6819338202476501, + "num_tokens": 7327096.0, + "step": 5408, + "train/ce_loss": 1.4747229215572588e-06 + }, + { + "epoch": 0.5347043701799485, + "step": 5408, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5347043701799485, + "step": 5408, + "train/total_loss": 0.06640639901161194 + }, + { + "entropy": 9.102767944335938, + "epoch": 0.5348032430294641, + "mean_token_accuracy": 0.7913950681686401, + "num_tokens": 7332254.0, + "step": 5409, + "train/ce_loss": 0.6586277484893799 + }, + { + "epoch": 0.5348032430294641, + "step": 5409, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5348032430294641, + "step": 5409, + "train/total_loss": 0.11273777484893799 + }, + { + "entropy": 8.996744155883789, + "epoch": 0.5349021158789796, + "mean_token_accuracy": 0.7413173913955688, + "num_tokens": 7337577.0, + "step": 5410, + "train/ce_loss": 0.6847343444824219 + }, + { + "epoch": 0.5349021158789796, + "step": 5410, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.5349021158789796, + "step": 5410, + "train/total_loss": 0.1583171784877777 + }, + { + "entropy": 8.977319717407227, + "epoch": 0.5350009887284951, + "mean_token_accuracy": 0.7600922584533691, + "num_tokens": 7342971.0, + "step": 5411, + "train/ce_loss": 0.6952391266822815 + }, + { + "epoch": 0.5350009887284951, + "step": 5411, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5350009887284951, + "step": 5411, + "train/total_loss": 0.09296141564846039 + }, + { + "entropy": 8.610834121704102, + "epoch": 0.5350998615780107, + "mean_token_accuracy": 0.7481080889701843, + "num_tokens": 7348356.0, + "step": 5412, + "train/ce_loss": 0.9105154871940613 + }, + { + "epoch": 0.5350998615780107, + "step": 5412, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.5350998615780107, + "step": 5412, + "train/total_loss": 0.11839529871940613 + }, + { + "entropy": 9.360595703125, + "epoch": 0.5351987344275262, + "mean_token_accuracy": 0.767103374004364, + "num_tokens": 7353507.0, + "step": 5413, + "train/ce_loss": 1.1175366640090942 + }, + { + "epoch": 0.5351987344275262, + "step": 5413, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5351987344275262, + "step": 5413, + "train/total_loss": 0.1508161723613739 + }, + { + "entropy": 9.405769348144531, + "epoch": 0.5352976072770417, + "mean_token_accuracy": 0.7421602606773376, + "num_tokens": 7358524.0, + "step": 5414, + "train/ce_loss": 0.7853061556816101 + }, + { + "epoch": 0.5352976072770417, + "step": 5414, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5352976072770417, + "step": 5414, + "train/total_loss": 0.10978061705827713 + }, + { + "entropy": 8.846992492675781, + "epoch": 0.5353964801265573, + "mean_token_accuracy": 0.7455882430076599, + "num_tokens": 7363711.0, + "step": 5415, + "train/ce_loss": 1.6724973917007446 + }, + { + "epoch": 0.5353964801265573, + "step": 5415, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5353964801265573, + "step": 5415, + "train/total_loss": 0.19068723917007446 + }, + { + "entropy": 8.983139991760254, + "epoch": 0.5354953529760728, + "mean_token_accuracy": 0.7798658013343811, + "num_tokens": 7368920.0, + "step": 5416, + "train/ce_loss": 1.1571681852728943e-06 + }, + { + "epoch": 0.5354953529760728, + "step": 5416, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5354953529760728, + "step": 5416, + "train/total_loss": 0.05078136548399925 + }, + { + "entropy": 9.146753311157227, + "epoch": 0.5355942258255882, + "mean_token_accuracy": 0.7410072088241577, + "num_tokens": 7374226.0, + "step": 5417, + "train/ce_loss": 0.9910473227500916 + }, + { + "epoch": 0.5355942258255882, + "step": 5417, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.5355942258255882, + "step": 5417, + "train/total_loss": 0.18113598227500916 + }, + { + "entropy": 8.94820499420166, + "epoch": 0.5356930986751038, + "mean_token_accuracy": 0.7194163799285889, + "num_tokens": 7379585.0, + "step": 5418, + "train/ce_loss": 0.9936345219612122 + }, + { + "epoch": 0.5356930986751038, + "step": 5418, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5356930986751038, + "step": 5418, + "train/total_loss": 0.13451969623565674 + }, + { + "entropy": 9.583169937133789, + "epoch": 0.5357919715246193, + "mean_token_accuracy": 0.803108811378479, + "num_tokens": 7384551.0, + "step": 5419, + "train/ce_loss": 0.7731736302375793 + }, + { + "epoch": 0.5357919715246193, + "step": 5419, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5357919715246193, + "step": 5419, + "train/total_loss": 0.13200485706329346 + }, + { + "epoch": 0.5358908443741348, + "grad_norm": 0.5803881287574768, + "learning_rate": 8.66266132621273e-06, + "loss": 0.1316, + "step": 5420 + }, + { + "entropy": 8.793757438659668, + "epoch": 0.5358908443741348, + "mean_token_accuracy": 0.758368194103241, + "num_tokens": 7390020.0, + "step": 5420, + "train/ce_loss": 0.31864622235298157 + }, + { + "epoch": 0.5358908443741348, + "step": 5420, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5358908443741348, + "step": 5420, + "train/total_loss": 0.10217712819576263 + }, + { + "entropy": 9.193429946899414, + "epoch": 0.5359897172236504, + "mean_token_accuracy": 0.7450722455978394, + "num_tokens": 7395276.0, + "step": 5421, + "train/ce_loss": 0.626777172088623 + }, + { + "epoch": 0.5359897172236504, + "step": 5421, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5359897172236504, + "step": 5421, + "train/total_loss": 0.11736521869897842 + }, + { + "entropy": 9.44847583770752, + "epoch": 0.5360885900731659, + "mean_token_accuracy": 0.7686212658882141, + "num_tokens": 7400331.0, + "step": 5422, + "train/ce_loss": 1.4848267255729297e-06 + }, + { + "epoch": 0.5360885900731659, + "step": 5422, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5360885900731659, + "step": 5422, + "train/total_loss": 0.05078139901161194 + }, + { + "entropy": 9.161100387573242, + "epoch": 0.5361874629226814, + "mean_token_accuracy": 0.6960784196853638, + "num_tokens": 7405450.0, + "step": 5423, + "train/ce_loss": 6.965454986129771e-07 + }, + { + "epoch": 0.5361874629226814, + "step": 5423, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5361874629226814, + "step": 5423, + "train/total_loss": 0.01953131891787052 + }, + { + "entropy": 8.943927764892578, + "epoch": 0.536286335772197, + "mean_token_accuracy": 0.7231638431549072, + "num_tokens": 7410778.0, + "step": 5424, + "train/ce_loss": 0.6913469433784485 + }, + { + "epoch": 0.536286335772197, + "step": 5424, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.536286335772197, + "step": 5424, + "train/total_loss": 0.11210344731807709 + }, + { + "entropy": 9.350214958190918, + "epoch": 0.5363852086217125, + "mean_token_accuracy": 0.6972860097885132, + "num_tokens": 7415717.0, + "step": 5425, + "train/ce_loss": 1.726108166621998e-06 + }, + { + "epoch": 0.5363852086217125, + "step": 5425, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5363852086217125, + "step": 5425, + "train/total_loss": 0.03906267136335373 + }, + { + "entropy": 8.813923835754395, + "epoch": 0.536484081471228, + "mean_token_accuracy": 0.7530864477157593, + "num_tokens": 7421082.0, + "step": 5426, + "train/ce_loss": 0.9239147901535034 + }, + { + "epoch": 0.536484081471228, + "step": 5426, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.536484081471228, + "step": 5426, + "train/total_loss": 0.1470789909362793 + }, + { + "entropy": 8.823338508605957, + "epoch": 0.5365829543207435, + "mean_token_accuracy": 0.7474518418312073, + "num_tokens": 7426419.0, + "step": 5427, + "train/ce_loss": 0.8750184774398804 + }, + { + "epoch": 0.5365829543207435, + "step": 5427, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.5365829543207435, + "step": 5427, + "train/total_loss": 0.16953310370445251 + }, + { + "entropy": 9.309595108032227, + "epoch": 0.536681827170259, + "mean_token_accuracy": 0.7337559461593628, + "num_tokens": 7431482.0, + "step": 5428, + "train/ce_loss": 0.5525712370872498 + }, + { + "epoch": 0.536681827170259, + "step": 5428, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.536681827170259, + "step": 5428, + "train/total_loss": 0.14119462668895721 + }, + { + "entropy": 8.547271728515625, + "epoch": 0.5367807000197745, + "mean_token_accuracy": 0.7235932946205139, + "num_tokens": 7437169.0, + "step": 5429, + "train/ce_loss": 0.5903127789497375 + }, + { + "epoch": 0.5367807000197745, + "step": 5429, + "train/sim_loss": 0.171875 + }, + { + "epoch": 0.5367807000197745, + "step": 5429, + "train/total_loss": 0.23090627789497375 + }, + { + "entropy": 8.891470909118652, + "epoch": 0.5368795728692901, + "mean_token_accuracy": 0.7647702693939209, + "num_tokens": 7442568.0, + "step": 5430, + "train/ce_loss": 0.40449684858322144 + }, + { + "epoch": 0.5368795728692901, + "step": 5430, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5368795728692901, + "step": 5430, + "train/total_loss": 0.06388718634843826 + }, + { + "entropy": 8.927657127380371, + "epoch": 0.5369784457188056, + "mean_token_accuracy": 0.7164790034294128, + "num_tokens": 7447986.0, + "step": 5431, + "train/ce_loss": 1.090582013130188 + }, + { + "epoch": 0.5369784457188056, + "step": 5431, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5369784457188056, + "step": 5431, + "train/total_loss": 0.1676519513130188 + }, + { + "entropy": 9.070655822753906, + "epoch": 0.5370773185683212, + "mean_token_accuracy": 0.7209653258323669, + "num_tokens": 7453096.0, + "step": 5432, + "train/ce_loss": 1.8256173133850098 + }, + { + "epoch": 0.5370773185683212, + "step": 5432, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.5370773185683212, + "step": 5432, + "train/total_loss": 0.2567805051803589 + }, + { + "entropy": 9.026044845581055, + "epoch": 0.5371761914178367, + "mean_token_accuracy": 0.7105590105056763, + "num_tokens": 7458387.0, + "step": 5433, + "train/ce_loss": 0.8001567721366882 + }, + { + "epoch": 0.5371761914178367, + "step": 5433, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5371761914178367, + "step": 5433, + "train/total_loss": 0.11126568168401718 + }, + { + "entropy": 9.867725372314453, + "epoch": 0.5372750642673522, + "mean_token_accuracy": 0.7709359526634216, + "num_tokens": 7463151.0, + "step": 5434, + "train/ce_loss": 3.672146021926892e-06 + }, + { + "epoch": 0.5372750642673522, + "step": 5434, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5372750642673522, + "step": 5434, + "train/total_loss": 0.05468786880373955 + }, + { + "entropy": 9.443593978881836, + "epoch": 0.5373739371168678, + "mean_token_accuracy": 0.7107023596763611, + "num_tokens": 7468198.0, + "step": 5435, + "train/ce_loss": 1.6333290338516235 + }, + { + "epoch": 0.5373739371168678, + "step": 5435, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.5373739371168678, + "step": 5435, + "train/total_loss": 0.24927040934562683 + }, + { + "entropy": 8.999428749084473, + "epoch": 0.5374728099663832, + "mean_token_accuracy": 0.8150510191917419, + "num_tokens": 7473460.0, + "step": 5436, + "train/ce_loss": 0.6773163676261902 + }, + { + "epoch": 0.5374728099663832, + "step": 5436, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5374728099663832, + "step": 5436, + "train/total_loss": 0.11070039123296738 + }, + { + "entropy": 9.100971221923828, + "epoch": 0.5375716828158987, + "mean_token_accuracy": 0.7493606209754944, + "num_tokens": 7478680.0, + "step": 5437, + "train/ce_loss": 0.798964262008667 + }, + { + "epoch": 0.5375716828158987, + "step": 5437, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5375716828158987, + "step": 5437, + "train/total_loss": 0.11505267769098282 + }, + { + "entropy": 9.791324615478516, + "epoch": 0.5376705556654143, + "mean_token_accuracy": 0.767241358757019, + "num_tokens": 7483522.0, + "step": 5438, + "train/ce_loss": 1.0660107135772705 + }, + { + "epoch": 0.5376705556654143, + "step": 5438, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5376705556654143, + "step": 5438, + "train/total_loss": 0.1691010743379593 + }, + { + "entropy": 9.7655611038208, + "epoch": 0.5377694285149298, + "mean_token_accuracy": 0.7222222089767456, + "num_tokens": 7488401.0, + "step": 5439, + "train/ce_loss": 0.7153486013412476 + }, + { + "epoch": 0.5377694285149298, + "step": 5439, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5377694285149298, + "step": 5439, + "train/total_loss": 0.1301286220550537 + }, + { + "epoch": 0.5378683013644453, + "grad_norm": 0.9586021900177002, + "learning_rate": 8.65771646145478e-06, + "loss": 0.1486, + "step": 5440 + }, + { + "entropy": 8.779112815856934, + "epoch": 0.5378683013644453, + "mean_token_accuracy": 0.7090336084365845, + "num_tokens": 7493878.0, + "step": 5440, + "train/ce_loss": 1.2352663278579712 + }, + { + "epoch": 0.5378683013644453, + "step": 5440, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.5378683013644453, + "step": 5440, + "train/total_loss": 0.22118288278579712 + }, + { + "entropy": 9.081262588500977, + "epoch": 0.5379671742139609, + "mean_token_accuracy": 0.7427241206169128, + "num_tokens": 7499203.0, + "step": 5441, + "train/ce_loss": 0.5701096653938293 + }, + { + "epoch": 0.5379671742139609, + "step": 5441, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.5379671742139609, + "step": 5441, + "train/total_loss": 0.1312297135591507 + }, + { + "entropy": 9.301429748535156, + "epoch": 0.5380660470634764, + "mean_token_accuracy": 0.8223140239715576, + "num_tokens": 7504379.0, + "step": 5442, + "train/ce_loss": 0.6167430281639099 + }, + { + "epoch": 0.5380660470634764, + "step": 5442, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5380660470634764, + "step": 5442, + "train/total_loss": 0.08120555430650711 + }, + { + "entropy": 8.818253517150879, + "epoch": 0.5381649199129919, + "mean_token_accuracy": 0.7458704113960266, + "num_tokens": 7509628.0, + "step": 5443, + "train/ce_loss": 1.226873755455017 + }, + { + "epoch": 0.5381649199129919, + "step": 5443, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5381649199129919, + "step": 5443, + "train/total_loss": 0.16174986958503723 + }, + { + "entropy": 8.970584869384766, + "epoch": 0.5382637927625075, + "mean_token_accuracy": 0.7167070508003235, + "num_tokens": 7515002.0, + "step": 5444, + "train/ce_loss": 0.552111029624939 + }, + { + "epoch": 0.5382637927625075, + "step": 5444, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.5382637927625075, + "step": 5444, + "train/total_loss": 0.08255485445261002 + }, + { + "entropy": 10.215428352355957, + "epoch": 0.538362665612023, + "mean_token_accuracy": 0.6946107745170593, + "num_tokens": 7519579.0, + "step": 5445, + "train/ce_loss": 6.035854312358424e-06 + }, + { + "epoch": 0.538362665612023, + "step": 5445, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.538362665612023, + "step": 5445, + "train/total_loss": 0.04687560349702835 + }, + { + "entropy": 9.030383110046387, + "epoch": 0.5384615384615384, + "mean_token_accuracy": 0.7371794581413269, + "num_tokens": 7524659.0, + "step": 5446, + "train/ce_loss": 1.2019760333714657e-06 + }, + { + "epoch": 0.5384615384615384, + "step": 5446, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5384615384615384, + "step": 5446, + "train/total_loss": 0.05468761920928955 + }, + { + "entropy": 9.103740692138672, + "epoch": 0.538560411311054, + "mean_token_accuracy": 0.7551282048225403, + "num_tokens": 7529886.0, + "step": 5447, + "train/ce_loss": 0.6153397560119629 + }, + { + "epoch": 0.538560411311054, + "step": 5447, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.538560411311054, + "step": 5447, + "train/total_loss": 0.10840897262096405 + }, + { + "entropy": 8.652833938598633, + "epoch": 0.5386592841605695, + "mean_token_accuracy": 0.7880377769470215, + "num_tokens": 7535394.0, + "step": 5448, + "train/ce_loss": 0.5609210729598999 + }, + { + "epoch": 0.5386592841605695, + "step": 5448, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5386592841605695, + "step": 5448, + "train/total_loss": 0.08734211325645447 + }, + { + "entropy": 9.259515762329102, + "epoch": 0.538758157010085, + "mean_token_accuracy": 0.751288652420044, + "num_tokens": 7540611.0, + "step": 5449, + "train/ce_loss": 1.5452930927276611 + }, + { + "epoch": 0.538758157010085, + "step": 5449, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.538758157010085, + "step": 5449, + "train/total_loss": 0.24046681821346283 + }, + { + "entropy": 8.676910400390625, + "epoch": 0.5388570298596006, + "mean_token_accuracy": 0.740818440914154, + "num_tokens": 7545979.0, + "step": 5450, + "train/ce_loss": 1.0719397068023682 + }, + { + "epoch": 0.5388570298596006, + "step": 5450, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5388570298596006, + "step": 5450, + "train/total_loss": 0.1657877266407013 + }, + { + "entropy": 8.894207000732422, + "epoch": 0.5389559027091161, + "mean_token_accuracy": 0.7266272306442261, + "num_tokens": 7551300.0, + "step": 5451, + "train/ce_loss": 0.7713016271591187 + }, + { + "epoch": 0.5389559027091161, + "step": 5451, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5389559027091161, + "step": 5451, + "train/total_loss": 0.12009891122579575 + }, + { + "entropy": 9.597565650939941, + "epoch": 0.5390547755586316, + "mean_token_accuracy": 0.7495291829109192, + "num_tokens": 7556265.0, + "step": 5452, + "train/ce_loss": 0.6753806471824646 + }, + { + "epoch": 0.5390547755586316, + "step": 5452, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.5390547755586316, + "step": 5452, + "train/total_loss": 0.1456630676984787 + }, + { + "entropy": 9.219755172729492, + "epoch": 0.5391536484081472, + "mean_token_accuracy": 0.7556818127632141, + "num_tokens": 7561428.0, + "step": 5453, + "train/ce_loss": 1.0690592527389526 + }, + { + "epoch": 0.5391536484081472, + "step": 5453, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5391536484081472, + "step": 5453, + "train/total_loss": 0.16159343719482422 + }, + { + "entropy": 8.821651458740234, + "epoch": 0.5392525212576627, + "mean_token_accuracy": 0.7505399584770203, + "num_tokens": 7566844.0, + "step": 5454, + "train/ce_loss": 0.6399568915367126 + }, + { + "epoch": 0.5392525212576627, + "step": 5454, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5392525212576627, + "step": 5454, + "train/total_loss": 0.08743318915367126 + }, + { + "entropy": 9.432109832763672, + "epoch": 0.5393513941071781, + "mean_token_accuracy": 0.730215847492218, + "num_tokens": 7571766.0, + "step": 5455, + "train/ce_loss": 1.1695923805236816 + }, + { + "epoch": 0.5393513941071781, + "step": 5455, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5393513941071781, + "step": 5455, + "train/total_loss": 0.17945924401283264 + }, + { + "entropy": 8.712631225585938, + "epoch": 0.5394502669566937, + "mean_token_accuracy": 0.7405140995979309, + "num_tokens": 7577428.0, + "step": 5456, + "train/ce_loss": 0.5700088143348694 + }, + { + "epoch": 0.5394502669566937, + "step": 5456, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.5394502669566937, + "step": 5456, + "train/total_loss": 0.17418837547302246 + }, + { + "entropy": 9.612346649169922, + "epoch": 0.5395491398062092, + "mean_token_accuracy": 0.7223300933837891, + "num_tokens": 7582393.0, + "step": 5457, + "train/ce_loss": 1.0203193596680649e-06 + }, + { + "epoch": 0.5395491398062092, + "step": 5457, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5395491398062092, + "step": 5457, + "train/total_loss": 0.019531352445483208 + }, + { + "entropy": 8.942792892456055, + "epoch": 0.5396480126557247, + "mean_token_accuracy": 0.6880733966827393, + "num_tokens": 7587664.0, + "step": 5458, + "train/ce_loss": 1.8375366926193237 + }, + { + "epoch": 0.5396480126557247, + "step": 5458, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.5396480126557247, + "step": 5458, + "train/total_loss": 0.2853161692619324 + }, + { + "entropy": 9.137945175170898, + "epoch": 0.5397468855052403, + "mean_token_accuracy": 0.7172932624816895, + "num_tokens": 7592807.0, + "step": 5459, + "train/ce_loss": 1.0247173309326172 + }, + { + "epoch": 0.5397468855052403, + "step": 5459, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5397468855052403, + "step": 5459, + "train/total_loss": 0.1259092390537262 + }, + { + "epoch": 0.5398457583547558, + "grad_norm": 0.7609133124351501, + "learning_rate": 8.65277159669683e-06, + "loss": 0.1393, + "step": 5460 + }, + { + "entropy": 9.026578903198242, + "epoch": 0.5398457583547558, + "mean_token_accuracy": 0.6619718074798584, + "num_tokens": 7598038.0, + "step": 5460, + "train/ce_loss": 1.3797636029266869e-06 + }, + { + "epoch": 0.5398457583547558, + "step": 5460, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5398457583547558, + "step": 5460, + "train/total_loss": 0.03125013783574104 + }, + { + "entropy": 8.902261734008789, + "epoch": 0.5399446312042713, + "mean_token_accuracy": 0.7545661926269531, + "num_tokens": 7603388.0, + "step": 5461, + "train/ce_loss": 0.5844436287879944 + }, + { + "epoch": 0.5399446312042713, + "step": 5461, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.5399446312042713, + "step": 5461, + "train/total_loss": 0.07406936585903168 + }, + { + "entropy": 9.208501815795898, + "epoch": 0.5400435040537869, + "mean_token_accuracy": 0.6607142686843872, + "num_tokens": 7608472.0, + "step": 5462, + "train/ce_loss": 1.7073651552200317 + }, + { + "epoch": 0.5400435040537869, + "step": 5462, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5400435040537869, + "step": 5462, + "train/total_loss": 0.23323652148246765 + }, + { + "entropy": 9.232449531555176, + "epoch": 0.5401423769033024, + "mean_token_accuracy": 0.7136929631233215, + "num_tokens": 7613602.0, + "step": 5463, + "train/ce_loss": 1.0019513368606567 + }, + { + "epoch": 0.5401423769033024, + "step": 5463, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5401423769033024, + "step": 5463, + "train/total_loss": 0.17050763964653015 + }, + { + "entropy": 9.604730606079102, + "epoch": 0.5402412497528178, + "mean_token_accuracy": 0.7513914704322815, + "num_tokens": 7618566.0, + "step": 5464, + "train/ce_loss": 0.5866101384162903 + }, + { + "epoch": 0.5402412497528178, + "step": 5464, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.5402412497528178, + "step": 5464, + "train/total_loss": 0.13287976384162903 + }, + { + "entropy": 9.419134140014648, + "epoch": 0.5403401226023334, + "mean_token_accuracy": 0.778388261795044, + "num_tokens": 7623545.0, + "step": 5465, + "train/ce_loss": 1.16068696975708 + }, + { + "epoch": 0.5403401226023334, + "step": 5465, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5403401226023334, + "step": 5465, + "train/total_loss": 0.13559994101524353 + }, + { + "entropy": 9.015172958374023, + "epoch": 0.5404389954518489, + "mean_token_accuracy": 0.7227949500083923, + "num_tokens": 7629071.0, + "step": 5466, + "train/ce_loss": 1.0439870357513428 + }, + { + "epoch": 0.5404389954518489, + "step": 5466, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5404389954518489, + "step": 5466, + "train/total_loss": 0.1590861976146698 + }, + { + "entropy": 9.223745346069336, + "epoch": 0.5405378683013644, + "mean_token_accuracy": 0.7883755564689636, + "num_tokens": 7634255.0, + "step": 5467, + "train/ce_loss": 0.5337609052658081 + }, + { + "epoch": 0.5405378683013644, + "step": 5467, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5405378683013644, + "step": 5467, + "train/total_loss": 0.09243859350681305 + }, + { + "entropy": 9.87230110168457, + "epoch": 0.54063674115088, + "mean_token_accuracy": 0.6977329850196838, + "num_tokens": 7639037.0, + "step": 5468, + "train/ce_loss": 2.147436816812842e-06 + }, + { + "epoch": 0.54063674115088, + "step": 5468, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.54063674115088, + "step": 5468, + "train/total_loss": 0.03515646606683731 + }, + { + "entropy": 9.589224815368652, + "epoch": 0.5407356140003955, + "mean_token_accuracy": 0.7479507923126221, + "num_tokens": 7643967.0, + "step": 5469, + "train/ce_loss": 1.1842515732496395e-06 + }, + { + "epoch": 0.5407356140003955, + "step": 5469, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.5407356140003955, + "step": 5469, + "train/total_loss": 0.02734386920928955 + }, + { + "entropy": 9.101972579956055, + "epoch": 0.540834486849911, + "mean_token_accuracy": 0.7326589822769165, + "num_tokens": 7649081.0, + "step": 5470, + "train/ce_loss": 1.2851073741912842 + }, + { + "epoch": 0.540834486849911, + "step": 5470, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.540834486849911, + "step": 5470, + "train/total_loss": 0.1871044933795929 + }, + { + "entropy": 9.178415298461914, + "epoch": 0.5409333596994266, + "mean_token_accuracy": 0.7444608807563782, + "num_tokens": 7654207.0, + "step": 5471, + "train/ce_loss": 0.9581825137138367 + }, + { + "epoch": 0.5409333596994266, + "step": 5471, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.5409333596994266, + "step": 5471, + "train/total_loss": 0.12316200137138367 + }, + { + "entropy": 8.825922966003418, + "epoch": 0.5410322325489421, + "mean_token_accuracy": 0.7145946025848389, + "num_tokens": 7659633.0, + "step": 5472, + "train/ce_loss": 1.07634437084198 + }, + { + "epoch": 0.5410322325489421, + "step": 5472, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.5410322325489421, + "step": 5472, + "train/total_loss": 0.18966569006443024 + }, + { + "entropy": 9.041023254394531, + "epoch": 0.5411311053984575, + "mean_token_accuracy": 0.801001250743866, + "num_tokens": 7664920.0, + "step": 5473, + "train/ce_loss": 0.6893028020858765 + }, + { + "epoch": 0.5411311053984575, + "step": 5473, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.5411311053984575, + "step": 5473, + "train/total_loss": 0.08455528318881989 + }, + { + "entropy": 9.221052169799805, + "epoch": 0.5412299782479731, + "mean_token_accuracy": 0.7625330090522766, + "num_tokens": 7670148.0, + "step": 5474, + "train/ce_loss": 0.7779694199562073 + }, + { + "epoch": 0.5412299782479731, + "step": 5474, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5412299782479731, + "step": 5474, + "train/total_loss": 0.09732819348573685 + }, + { + "entropy": 8.973783493041992, + "epoch": 0.5413288510974886, + "mean_token_accuracy": 0.7426981925964355, + "num_tokens": 7675305.0, + "step": 5475, + "train/ce_loss": 0.9248960018157959 + }, + { + "epoch": 0.5413288510974886, + "step": 5475, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5413288510974886, + "step": 5475, + "train/total_loss": 0.1549896001815796 + }, + { + "entropy": 8.819339752197266, + "epoch": 0.5414277239470041, + "mean_token_accuracy": 0.754478394985199, + "num_tokens": 7680705.0, + "step": 5476, + "train/ce_loss": 0.8168224692344666 + }, + { + "epoch": 0.5414277239470041, + "step": 5476, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5414277239470041, + "step": 5476, + "train/total_loss": 0.1129322499036789 + }, + { + "entropy": 9.046710968017578, + "epoch": 0.5415265967965197, + "mean_token_accuracy": 0.7265353202819824, + "num_tokens": 7686073.0, + "step": 5477, + "train/ce_loss": 0.7705683708190918 + }, + { + "epoch": 0.5415265967965197, + "step": 5477, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.5415265967965197, + "step": 5477, + "train/total_loss": 0.18643184006214142 + }, + { + "entropy": 9.227375030517578, + "epoch": 0.5416254696460352, + "mean_token_accuracy": 0.7328858971595764, + "num_tokens": 7691261.0, + "step": 5478, + "train/ce_loss": 9.011659471980238e-07 + }, + { + "epoch": 0.5416254696460352, + "step": 5478, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.5416254696460352, + "step": 5478, + "train/total_loss": 0.015625089406967163 + }, + { + "entropy": 8.970365524291992, + "epoch": 0.5417243424955507, + "mean_token_accuracy": 0.747178316116333, + "num_tokens": 7696551.0, + "step": 5479, + "train/ce_loss": 0.7276411652565002 + }, + { + "epoch": 0.5417243424955507, + "step": 5479, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.5417243424955507, + "step": 5479, + "train/total_loss": 0.15870162844657898 + }, + { + "epoch": 0.5418232153450663, + "grad_norm": 0.612647533416748, + "learning_rate": 8.647826731938883e-06, + "loss": 0.1365, + "step": 5480 + }, + { + "entropy": 9.256315231323242, + "epoch": 0.5418232153450663, + "mean_token_accuracy": 0.6985583305358887, + "num_tokens": 7701738.0, + "step": 5480, + "train/ce_loss": 0.77440345287323 + }, + { + "epoch": 0.5418232153450663, + "step": 5480, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5418232153450663, + "step": 5480, + "train/total_loss": 0.11650284379720688 + }, + { + "entropy": 9.61116886138916, + "epoch": 0.5419220881945818, + "mean_token_accuracy": 0.774946928024292, + "num_tokens": 7706607.0, + "step": 5481, + "train/ce_loss": 2.2646183879260207e-06 + }, + { + "epoch": 0.5419220881945818, + "step": 5481, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5419220881945818, + "step": 5481, + "train/total_loss": 0.039062727242708206 + }, + { + "entropy": 9.658862113952637, + "epoch": 0.5420209610440972, + "mean_token_accuracy": 0.8372092843055725, + "num_tokens": 7711598.0, + "step": 5482, + "train/ce_loss": 2.0505208340182435e-06 + }, + { + "epoch": 0.5420209610440972, + "step": 5482, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.5420209610440972, + "step": 5482, + "train/total_loss": 0.011718954890966415 + }, + { + "entropy": 9.218994140625, + "epoch": 0.5421198338936128, + "mean_token_accuracy": 0.7635869383811951, + "num_tokens": 7716768.0, + "step": 5483, + "train/ce_loss": 1.0125045776367188 + }, + { + "epoch": 0.5421198338936128, + "step": 5483, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.5421198338936128, + "step": 5483, + "train/total_loss": 0.21843796968460083 + }, + { + "entropy": 9.483529090881348, + "epoch": 0.5422187067431283, + "mean_token_accuracy": 0.7338345646858215, + "num_tokens": 7721835.0, + "step": 5484, + "train/ce_loss": 0.6087969541549683 + }, + { + "epoch": 0.5422187067431283, + "step": 5484, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5422187067431283, + "step": 5484, + "train/total_loss": 0.1272859424352646 + }, + { + "entropy": 9.390131950378418, + "epoch": 0.5423175795926438, + "mean_token_accuracy": 0.6901840567588806, + "num_tokens": 7726922.0, + "step": 5485, + "train/ce_loss": 1.0740700960159302 + }, + { + "epoch": 0.5423175795926438, + "step": 5485, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5423175795926438, + "step": 5485, + "train/total_loss": 0.15818825364112854 + }, + { + "entropy": 9.398968696594238, + "epoch": 0.5424164524421594, + "mean_token_accuracy": 0.8065395355224609, + "num_tokens": 7732264.0, + "step": 5486, + "train/ce_loss": 2.0825088995479746e-06 + }, + { + "epoch": 0.5424164524421594, + "step": 5486, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5424164524421594, + "step": 5486, + "train/total_loss": 0.06250020861625671 + }, + { + "entropy": 8.707170486450195, + "epoch": 0.5425153252916749, + "mean_token_accuracy": 0.7239958643913269, + "num_tokens": 7737670.0, + "step": 5487, + "train/ce_loss": 0.9993120431900024 + }, + { + "epoch": 0.5425153252916749, + "step": 5487, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5425153252916749, + "step": 5487, + "train/total_loss": 0.16243121027946472 + }, + { + "entropy": 9.389225006103516, + "epoch": 0.5426141981411904, + "mean_token_accuracy": 0.816216230392456, + "num_tokens": 7742673.0, + "step": 5488, + "train/ce_loss": 0.7702173590660095 + }, + { + "epoch": 0.5426141981411904, + "step": 5488, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5426141981411904, + "step": 5488, + "train/total_loss": 0.09655299037694931 + }, + { + "entropy": 8.623525619506836, + "epoch": 0.542713070990706, + "mean_token_accuracy": 0.7491961121559143, + "num_tokens": 7748137.0, + "step": 5489, + "train/ce_loss": 0.44330134987831116 + }, + { + "epoch": 0.542713070990706, + "step": 5489, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.542713070990706, + "step": 5489, + "train/total_loss": 0.10683013498783112 + }, + { + "entropy": 9.485376358032227, + "epoch": 0.5428119438402215, + "mean_token_accuracy": 0.7637271285057068, + "num_tokens": 7753158.0, + "step": 5490, + "train/ce_loss": 1.3455654652716476e-06 + }, + { + "epoch": 0.5428119438402215, + "step": 5490, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5428119438402215, + "step": 5490, + "train/total_loss": 0.031250134110450745 + }, + { + "entropy": 8.845754623413086, + "epoch": 0.542910816689737, + "mean_token_accuracy": 0.7085533142089844, + "num_tokens": 7758749.0, + "step": 5491, + "train/ce_loss": 0.8674387335777283 + }, + { + "epoch": 0.542910816689737, + "step": 5491, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.542910816689737, + "step": 5491, + "train/total_loss": 0.12971262633800507 + }, + { + "entropy": 9.163055419921875, + "epoch": 0.5430096895392525, + "mean_token_accuracy": 0.6792699098587036, + "num_tokens": 7764038.0, + "step": 5492, + "train/ce_loss": 0.6201307773590088 + }, + { + "epoch": 0.5430096895392525, + "step": 5492, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5430096895392525, + "step": 5492, + "train/total_loss": 0.12451307475566864 + }, + { + "entropy": 9.007641792297363, + "epoch": 0.543108562388768, + "mean_token_accuracy": 0.7056737542152405, + "num_tokens": 7769504.0, + "step": 5493, + "train/ce_loss": 0.7358147501945496 + }, + { + "epoch": 0.543108562388768, + "step": 5493, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.543108562388768, + "step": 5493, + "train/total_loss": 0.10873772948980331 + }, + { + "entropy": 9.138314247131348, + "epoch": 0.5432074352382835, + "mean_token_accuracy": 0.8242424130439758, + "num_tokens": 7774588.0, + "step": 5494, + "train/ce_loss": 1.0121489140146878e-06 + }, + { + "epoch": 0.5432074352382835, + "step": 5494, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5432074352382835, + "step": 5494, + "train/total_loss": 0.05859385058283806 + }, + { + "entropy": 9.288567543029785, + "epoch": 0.5433063080877991, + "mean_token_accuracy": 0.7085889577865601, + "num_tokens": 7779660.0, + "step": 5495, + "train/ce_loss": 2.142662879123236e-06 + }, + { + "epoch": 0.5433063080877991, + "step": 5495, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5433063080877991, + "step": 5495, + "train/total_loss": 0.05468771606683731 + }, + { + "entropy": 8.77684211730957, + "epoch": 0.5434051809373146, + "mean_token_accuracy": 0.8189300298690796, + "num_tokens": 7785143.0, + "step": 5496, + "train/ce_loss": 0.680293083190918 + }, + { + "epoch": 0.5434051809373146, + "step": 5496, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5434051809373146, + "step": 5496, + "train/total_loss": 0.11490430682897568 + }, + { + "entropy": 8.529176712036133, + "epoch": 0.5435040537868301, + "mean_token_accuracy": 0.7483370304107666, + "num_tokens": 7790530.0, + "step": 5497, + "train/ce_loss": 0.8081743717193604 + }, + { + "epoch": 0.5435040537868301, + "step": 5497, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5435040537868301, + "step": 5497, + "train/total_loss": 0.13159868121147156 + }, + { + "entropy": 9.397544860839844, + "epoch": 0.5436029266363457, + "mean_token_accuracy": 0.7481698393821716, + "num_tokens": 7795632.0, + "step": 5498, + "train/ce_loss": 1.8115889588443679e-06 + }, + { + "epoch": 0.5436029266363457, + "step": 5498, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5436029266363457, + "step": 5498, + "train/total_loss": 0.035156432539224625 + }, + { + "entropy": 9.139359474182129, + "epoch": 0.5437017994858612, + "mean_token_accuracy": 0.7417027354240417, + "num_tokens": 7800751.0, + "step": 5499, + "train/ce_loss": 9.818531907512806e-07 + }, + { + "epoch": 0.5437017994858612, + "step": 5499, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.5437017994858612, + "step": 5499, + "train/total_loss": 0.07812509685754776 + }, + { + "epoch": 0.5438006723353767, + "grad_norm": 0.7031065225601196, + "learning_rate": 8.642881867180933e-06, + "loss": 0.1337, + "step": 5500 + }, + { + "entropy": 9.415504455566406, + "epoch": 0.5438006723353767, + "mean_token_accuracy": 0.7769110798835754, + "num_tokens": 7805849.0, + "step": 5500, + "train/ce_loss": 1.227493405342102 + }, + { + "epoch": 0.5438006723353767, + "step": 5500, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5438006723353767, + "step": 5500, + "train/total_loss": 0.16571809351444244 + }, + { + "entropy": 9.082967758178711, + "epoch": 0.5438995451848923, + "mean_token_accuracy": 0.7361809015274048, + "num_tokens": 7811094.0, + "step": 5501, + "train/ce_loss": 0.47862014174461365 + }, + { + "epoch": 0.5438995451848923, + "step": 5501, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5438995451848923, + "step": 5501, + "train/total_loss": 0.07911201566457748 + }, + { + "entropy": 9.015972137451172, + "epoch": 0.5439984180344077, + "mean_token_accuracy": 0.7757731676101685, + "num_tokens": 7816367.0, + "step": 5502, + "train/ce_loss": 0.6524413824081421 + }, + { + "epoch": 0.5439984180344077, + "step": 5502, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5439984180344077, + "step": 5502, + "train/total_loss": 0.10040038824081421 + }, + { + "entropy": 8.995569229125977, + "epoch": 0.5440972908839232, + "mean_token_accuracy": 0.6952941417694092, + "num_tokens": 7821677.0, + "step": 5503, + "train/ce_loss": 1.2559822835100931e-06 + }, + { + "epoch": 0.5440972908839232, + "step": 5503, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5440972908839232, + "step": 5503, + "train/total_loss": 0.06250012665987015 + }, + { + "entropy": 8.838412284851074, + "epoch": 0.5441961637334388, + "mean_token_accuracy": 0.7453415989875793, + "num_tokens": 7826935.0, + "step": 5504, + "train/ce_loss": 1.2809569835662842 + }, + { + "epoch": 0.5441961637334388, + "step": 5504, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5441961637334388, + "step": 5504, + "train/total_loss": 0.19450195133686066 + }, + { + "entropy": 10.003866195678711, + "epoch": 0.5442950365829543, + "mean_token_accuracy": 0.8797953724861145, + "num_tokens": 7831694.0, + "step": 5505, + "train/ce_loss": 1.4319297075271606 + }, + { + "epoch": 0.5442950365829543, + "step": 5505, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5442950365829543, + "step": 5505, + "train/total_loss": 0.16663047671318054 + }, + { + "entropy": 9.064350128173828, + "epoch": 0.5443939094324698, + "mean_token_accuracy": 0.7067484855651855, + "num_tokens": 7836989.0, + "step": 5506, + "train/ce_loss": 1.2671979665756226 + }, + { + "epoch": 0.5443939094324698, + "step": 5506, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5443939094324698, + "step": 5506, + "train/total_loss": 0.15796980261802673 + }, + { + "entropy": 9.026290893554688, + "epoch": 0.5444927822819854, + "mean_token_accuracy": 0.7358490824699402, + "num_tokens": 7842327.0, + "step": 5507, + "train/ce_loss": 1.0443733930587769 + }, + { + "epoch": 0.5444927822819854, + "step": 5507, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5444927822819854, + "step": 5507, + "train/total_loss": 0.16303110122680664 + }, + { + "entropy": 9.018562316894531, + "epoch": 0.5445916551315009, + "mean_token_accuracy": 0.6709007024765015, + "num_tokens": 7847639.0, + "step": 5508, + "train/ce_loss": 0.595085084438324 + }, + { + "epoch": 0.5445916551315009, + "step": 5508, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5445916551315009, + "step": 5508, + "train/total_loss": 0.09466475993394852 + }, + { + "entropy": 9.13880729675293, + "epoch": 0.5446905279810164, + "mean_token_accuracy": 0.700507640838623, + "num_tokens": 7852881.0, + "step": 5509, + "train/ce_loss": 0.6465105414390564 + }, + { + "epoch": 0.5446905279810164, + "step": 5509, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5446905279810164, + "step": 5509, + "train/total_loss": 0.09980730712413788 + }, + { + "entropy": 9.208677291870117, + "epoch": 0.544789400830532, + "mean_token_accuracy": 0.7856257557868958, + "num_tokens": 7858340.0, + "step": 5510, + "train/ce_loss": 0.6704494953155518 + }, + { + "epoch": 0.544789400830532, + "step": 5510, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.544789400830532, + "step": 5510, + "train/total_loss": 0.0826699510216713 + }, + { + "entropy": 9.106315612792969, + "epoch": 0.5448882736800474, + "mean_token_accuracy": 0.7473404407501221, + "num_tokens": 7863577.0, + "step": 5511, + "train/ce_loss": 0.7330856323242188 + }, + { + "epoch": 0.5448882736800474, + "step": 5511, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5448882736800474, + "step": 5511, + "train/total_loss": 0.120183564722538 + }, + { + "entropy": 9.01171875, + "epoch": 0.5449871465295629, + "mean_token_accuracy": 0.7030812501907349, + "num_tokens": 7868762.0, + "step": 5512, + "train/ce_loss": 1.0409945249557495 + }, + { + "epoch": 0.5449871465295629, + "step": 5512, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5449871465295629, + "step": 5512, + "train/total_loss": 0.13534945249557495 + }, + { + "entropy": 9.56721305847168, + "epoch": 0.5450860193790785, + "mean_token_accuracy": 0.7206572890281677, + "num_tokens": 7873649.0, + "step": 5513, + "train/ce_loss": 3.4404743018967565e-06 + }, + { + "epoch": 0.5450860193790785, + "step": 5513, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5450860193790785, + "step": 5513, + "train/total_loss": 0.06640659272670746 + }, + { + "entropy": 9.214876174926758, + "epoch": 0.545184892228594, + "mean_token_accuracy": 0.7772151827812195, + "num_tokens": 7878876.0, + "step": 5514, + "train/ce_loss": 0.36950212717056274 + }, + { + "epoch": 0.545184892228594, + "step": 5514, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.545184892228594, + "step": 5514, + "train/total_loss": 0.056481461971998215 + }, + { + "entropy": 8.57907485961914, + "epoch": 0.5452837650781096, + "mean_token_accuracy": 0.7252985835075378, + "num_tokens": 7884243.0, + "step": 5515, + "train/ce_loss": 0.5983100533485413 + }, + { + "epoch": 0.5452837650781096, + "step": 5515, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5452837650781096, + "step": 5515, + "train/total_loss": 0.12233100831508636 + }, + { + "entropy": 9.1329345703125, + "epoch": 0.5453826379276251, + "mean_token_accuracy": 0.7596899271011353, + "num_tokens": 7889492.0, + "step": 5516, + "train/ce_loss": 0.3740823566913605 + }, + { + "epoch": 0.5453826379276251, + "step": 5516, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.5453826379276251, + "step": 5516, + "train/total_loss": 0.12334573268890381 + }, + { + "entropy": 9.813523292541504, + "epoch": 0.5454815107771406, + "mean_token_accuracy": 0.7532467246055603, + "num_tokens": 7894327.0, + "step": 5517, + "train/ce_loss": 3.1217628020385746e-06 + }, + { + "epoch": 0.5454815107771406, + "step": 5517, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.5454815107771406, + "step": 5517, + "train/total_loss": 0.08593781292438507 + }, + { + "entropy": 9.10891342163086, + "epoch": 0.5455803836266562, + "mean_token_accuracy": 0.7839335203170776, + "num_tokens": 7899509.0, + "step": 5518, + "train/ce_loss": 1.0317003726959229 + }, + { + "epoch": 0.5455803836266562, + "step": 5518, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5455803836266562, + "step": 5518, + "train/total_loss": 0.12270128726959229 + }, + { + "entropy": 8.824468612670898, + "epoch": 0.5456792564761717, + "mean_token_accuracy": 0.7766203880310059, + "num_tokens": 7904810.0, + "step": 5519, + "train/ce_loss": 0.4866676330566406 + }, + { + "epoch": 0.5456792564761717, + "step": 5519, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5456792564761717, + "step": 5519, + "train/total_loss": 0.09163551032543182 + }, + { + "epoch": 0.5457781293256871, + "grad_norm": 0.563425600528717, + "learning_rate": 8.637937002422985e-06, + "loss": 0.133, + "step": 5520 + }, + { + "entropy": 9.161569595336914, + "epoch": 0.5457781293256871, + "mean_token_accuracy": 0.7279596924781799, + "num_tokens": 7910043.0, + "step": 5520, + "train/ce_loss": 5.308772301759745e-07 + }, + { + "epoch": 0.5457781293256871, + "step": 5520, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5457781293256871, + "step": 5520, + "train/total_loss": 0.023437554016709328 + }, + { + "entropy": 8.903478622436523, + "epoch": 0.5458770021752027, + "mean_token_accuracy": 0.7416563630104065, + "num_tokens": 7915287.0, + "step": 5521, + "train/ce_loss": 1.7618522644042969 + }, + { + "epoch": 0.5458770021752027, + "step": 5521, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.5458770021752027, + "step": 5521, + "train/total_loss": 0.2621227502822876 + }, + { + "entropy": 9.213225364685059, + "epoch": 0.5459758750247182, + "mean_token_accuracy": 0.7211394309997559, + "num_tokens": 7920387.0, + "step": 5522, + "train/ce_loss": 1.2949116230010986 + }, + { + "epoch": 0.5459758750247182, + "step": 5522, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5459758750247182, + "step": 5522, + "train/total_loss": 0.1998036652803421 + }, + { + "entropy": 9.422765731811523, + "epoch": 0.5460747478742337, + "mean_token_accuracy": 0.7446457743644714, + "num_tokens": 7925440.0, + "step": 5523, + "train/ce_loss": 5.0349794946669135e-06 + }, + { + "epoch": 0.5460747478742337, + "step": 5523, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5460747478742337, + "step": 5523, + "train/total_loss": 0.04296925291419029 + }, + { + "entropy": 10.037985801696777, + "epoch": 0.5461736207237493, + "mean_token_accuracy": 0.7230320572853088, + "num_tokens": 7930181.0, + "step": 5524, + "train/ce_loss": 1.330657958984375 + }, + { + "epoch": 0.5461736207237493, + "step": 5524, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5461736207237493, + "step": 5524, + "train/total_loss": 0.18775330483913422 + }, + { + "entropy": 9.685091018676758, + "epoch": 0.5462724935732648, + "mean_token_accuracy": 0.6944971680641174, + "num_tokens": 7935138.0, + "step": 5525, + "train/ce_loss": 1.6756025615904946e-06 + }, + { + "epoch": 0.5462724935732648, + "step": 5525, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5462724935732648, + "step": 5525, + "train/total_loss": 0.07031266391277313 + }, + { + "entropy": 9.215286254882812, + "epoch": 0.5463713664227803, + "mean_token_accuracy": 0.7022398114204407, + "num_tokens": 7940341.0, + "step": 5526, + "train/ce_loss": 1.028321623802185 + }, + { + "epoch": 0.5463713664227803, + "step": 5526, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5463713664227803, + "step": 5526, + "train/total_loss": 0.16923841834068298 + }, + { + "entropy": 8.892183303833008, + "epoch": 0.5464702392722959, + "mean_token_accuracy": 0.7200000286102295, + "num_tokens": 7945663.0, + "step": 5527, + "train/ce_loss": 0.6684654951095581 + }, + { + "epoch": 0.5464702392722959, + "step": 5527, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5464702392722959, + "step": 5527, + "train/total_loss": 0.10590904951095581 + }, + { + "entropy": 8.921552658081055, + "epoch": 0.5465691121218114, + "mean_token_accuracy": 0.743139386177063, + "num_tokens": 7951040.0, + "step": 5528, + "train/ce_loss": 0.8332463502883911 + }, + { + "epoch": 0.5465691121218114, + "step": 5528, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.5465691121218114, + "step": 5528, + "train/total_loss": 0.1731683909893036 + }, + { + "entropy": 9.126616477966309, + "epoch": 0.5466679849713268, + "mean_token_accuracy": 0.6770833134651184, + "num_tokens": 7956156.0, + "step": 5529, + "train/ce_loss": 1.0017226934432983 + }, + { + "epoch": 0.5466679849713268, + "step": 5529, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.5466679849713268, + "step": 5529, + "train/total_loss": 0.1900160312652588 + }, + { + "entropy": 8.91090202331543, + "epoch": 0.5467668578208424, + "mean_token_accuracy": 0.7716346383094788, + "num_tokens": 7961411.0, + "step": 5530, + "train/ce_loss": 0.8838936686515808 + }, + { + "epoch": 0.5467668578208424, + "step": 5530, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5467668578208424, + "step": 5530, + "train/total_loss": 0.13526436686515808 + }, + { + "entropy": 8.90369987487793, + "epoch": 0.5468657306703579, + "mean_token_accuracy": 0.710918128490448, + "num_tokens": 7966651.0, + "step": 5531, + "train/ce_loss": 1.5553159713745117 + }, + { + "epoch": 0.5468657306703579, + "step": 5531, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.5468657306703579, + "step": 5531, + "train/total_loss": 0.2375628501176834 + }, + { + "entropy": 9.05916690826416, + "epoch": 0.5469646035198734, + "mean_token_accuracy": 0.7317073345184326, + "num_tokens": 7972028.0, + "step": 5532, + "train/ce_loss": 0.7818222045898438 + }, + { + "epoch": 0.5469646035198734, + "step": 5532, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5469646035198734, + "step": 5532, + "train/total_loss": 0.10161972045898438 + }, + { + "entropy": 8.867383003234863, + "epoch": 0.547063476369389, + "mean_token_accuracy": 0.7449947595596313, + "num_tokens": 7977475.0, + "step": 5533, + "train/ce_loss": 1.104671597480774 + }, + { + "epoch": 0.547063476369389, + "step": 5533, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.547063476369389, + "step": 5533, + "train/total_loss": 0.18859216570854187 + }, + { + "entropy": 9.046788215637207, + "epoch": 0.5471623492189045, + "mean_token_accuracy": 0.7216783165931702, + "num_tokens": 7982661.0, + "step": 5534, + "train/ce_loss": 1.2839608192443848 + }, + { + "epoch": 0.5471623492189045, + "step": 5534, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5471623492189045, + "step": 5534, + "train/total_loss": 0.19480232894420624 + }, + { + "entropy": 9.152159690856934, + "epoch": 0.54726122206842, + "mean_token_accuracy": 0.70257967710495, + "num_tokens": 7987819.0, + "step": 5535, + "train/ce_loss": 1.0716601610183716 + }, + { + "epoch": 0.54726122206842, + "step": 5535, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.54726122206842, + "step": 5535, + "train/total_loss": 0.15013477206230164 + }, + { + "entropy": 9.02014446258545, + "epoch": 0.5473600949179356, + "mean_token_accuracy": 0.7812879681587219, + "num_tokens": 7993097.0, + "step": 5536, + "train/ce_loss": 0.5995551943778992 + }, + { + "epoch": 0.5473600949179356, + "step": 5536, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5473600949179356, + "step": 5536, + "train/total_loss": 0.07948677241802216 + }, + { + "entropy": 8.631769180297852, + "epoch": 0.5474589677674511, + "mean_token_accuracy": 0.7827273011207581, + "num_tokens": 7998676.0, + "step": 5537, + "train/ce_loss": 0.4905291199684143 + }, + { + "epoch": 0.5474589677674511, + "step": 5537, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.5474589677674511, + "step": 5537, + "train/total_loss": 0.07639665901660919 + }, + { + "entropy": 8.562850952148438, + "epoch": 0.5475578406169666, + "mean_token_accuracy": 0.7535884976387024, + "num_tokens": 8003958.0, + "step": 5538, + "train/ce_loss": 0.5495491623878479 + }, + { + "epoch": 0.5475578406169666, + "step": 5538, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5475578406169666, + "step": 5538, + "train/total_loss": 0.11745491623878479 + }, + { + "entropy": 9.187483787536621, + "epoch": 0.5476567134664821, + "mean_token_accuracy": 0.7204610705375671, + "num_tokens": 8009118.0, + "step": 5539, + "train/ce_loss": 1.957935182872461e-06 + }, + { + "epoch": 0.5476567134664821, + "step": 5539, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5476567134664821, + "step": 5539, + "train/total_loss": 0.03906269744038582 + }, + { + "epoch": 0.5477555863159976, + "grad_norm": 0.8513239026069641, + "learning_rate": 8.632992137665036e-06, + "loss": 0.1451, + "step": 5540 + }, + { + "entropy": 9.163148880004883, + "epoch": 0.5477555863159976, + "mean_token_accuracy": 0.7201540470123291, + "num_tokens": 8014486.0, + "step": 5540, + "train/ce_loss": 1.322680115699768 + }, + { + "epoch": 0.5477555863159976, + "step": 5540, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.5477555863159976, + "step": 5540, + "train/total_loss": 0.2689867615699768 + }, + { + "entropy": 9.031539916992188, + "epoch": 0.5478544591655131, + "mean_token_accuracy": 0.7678160667419434, + "num_tokens": 8019808.0, + "step": 5541, + "train/ce_loss": 0.5397918224334717 + }, + { + "epoch": 0.5478544591655131, + "step": 5541, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5478544591655131, + "step": 5541, + "train/total_loss": 0.09694793820381165 + }, + { + "entropy": 9.817032814025879, + "epoch": 0.5479533320150287, + "mean_token_accuracy": 0.7709251046180725, + "num_tokens": 8024639.0, + "step": 5542, + "train/ce_loss": 1.1984164714813232 + }, + { + "epoch": 0.5479533320150287, + "step": 5542, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5479533320150287, + "step": 5542, + "train/total_loss": 0.17062290012836456 + }, + { + "entropy": 8.763422012329102, + "epoch": 0.5480522048645442, + "mean_token_accuracy": 0.7785087823867798, + "num_tokens": 8030041.0, + "step": 5543, + "train/ce_loss": 0.48883992433547974 + }, + { + "epoch": 0.5480522048645442, + "step": 5543, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5480522048645442, + "step": 5543, + "train/total_loss": 0.06841523945331573 + }, + { + "entropy": 8.496919631958008, + "epoch": 0.5481510777140597, + "mean_token_accuracy": 0.736580491065979, + "num_tokens": 8035507.0, + "step": 5544, + "train/ce_loss": 1.1177259683609009 + }, + { + "epoch": 0.5481510777140597, + "step": 5544, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.5481510777140597, + "step": 5544, + "train/total_loss": 0.2055225968360901 + }, + { + "entropy": 9.677846908569336, + "epoch": 0.5482499505635753, + "mean_token_accuracy": 0.7397849559783936, + "num_tokens": 8040397.0, + "step": 5545, + "train/ce_loss": 2.08055212169711e-06 + }, + { + "epoch": 0.5482499505635753, + "step": 5545, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5482499505635753, + "step": 5545, + "train/total_loss": 0.039062708616256714 + }, + { + "entropy": 9.176383972167969, + "epoch": 0.5483488234130908, + "mean_token_accuracy": 0.7653791308403015, + "num_tokens": 8045550.0, + "step": 5546, + "train/ce_loss": 0.9093326926231384 + }, + { + "epoch": 0.5483488234130908, + "step": 5546, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5483488234130908, + "step": 5546, + "train/total_loss": 0.13780826330184937 + }, + { + "entropy": 9.060911178588867, + "epoch": 0.5484476962626063, + "mean_token_accuracy": 0.7210348844528198, + "num_tokens": 8050908.0, + "step": 5547, + "train/ce_loss": 0.5934411883354187 + }, + { + "epoch": 0.5484476962626063, + "step": 5547, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.5484476962626063, + "step": 5547, + "train/total_loss": 0.1374691128730774 + }, + { + "entropy": 8.97642993927002, + "epoch": 0.5485465691121219, + "mean_token_accuracy": 0.7463414669036865, + "num_tokens": 8056164.0, + "step": 5548, + "train/ce_loss": 0.7455832362174988 + }, + { + "epoch": 0.5485465691121219, + "step": 5548, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.5485465691121219, + "step": 5548, + "train/total_loss": 0.1722145676612854 + }, + { + "entropy": 8.86227798461914, + "epoch": 0.5486454419616373, + "mean_token_accuracy": 0.8031784892082214, + "num_tokens": 8061437.0, + "step": 5549, + "train/ce_loss": 0.524359941482544 + }, + { + "epoch": 0.5486454419616373, + "step": 5549, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.5486454419616373, + "step": 5549, + "train/total_loss": 0.0641547441482544 + }, + { + "entropy": 9.484848976135254, + "epoch": 0.5487443148111528, + "mean_token_accuracy": 0.709618866443634, + "num_tokens": 8066449.0, + "step": 5550, + "train/ce_loss": 1.797608733177185 + }, + { + "epoch": 0.5487443148111528, + "step": 5550, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5487443148111528, + "step": 5550, + "train/total_loss": 0.2305421233177185 + }, + { + "entropy": 9.054540634155273, + "epoch": 0.5488431876606684, + "mean_token_accuracy": 0.7254902124404907, + "num_tokens": 8071599.0, + "step": 5551, + "train/ce_loss": 0.9352777004241943 + }, + { + "epoch": 0.5488431876606684, + "step": 5551, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5488431876606684, + "step": 5551, + "train/total_loss": 0.13259026408195496 + }, + { + "entropy": 8.990945816040039, + "epoch": 0.5489420605101839, + "mean_token_accuracy": 0.7782805562019348, + "num_tokens": 8076884.0, + "step": 5552, + "train/ce_loss": 0.7061032652854919 + }, + { + "epoch": 0.5489420605101839, + "step": 5552, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.5489420605101839, + "step": 5552, + "train/total_loss": 0.09795407950878143 + }, + { + "entropy": 8.656270027160645, + "epoch": 0.5490409333596994, + "mean_token_accuracy": 0.7148148417472839, + "num_tokens": 8082111.0, + "step": 5553, + "train/ce_loss": 0.8703933358192444 + }, + { + "epoch": 0.5490409333596994, + "step": 5553, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5490409333596994, + "step": 5553, + "train/total_loss": 0.14172683656215668 + }, + { + "entropy": 9.270323753356934, + "epoch": 0.549139806209215, + "mean_token_accuracy": 0.7340720295906067, + "num_tokens": 8087280.0, + "step": 5554, + "train/ce_loss": 0.9211452603340149 + }, + { + "epoch": 0.549139806209215, + "step": 5554, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.549139806209215, + "step": 5554, + "train/total_loss": 0.13508328795433044 + }, + { + "entropy": 9.821088790893555, + "epoch": 0.5492386790587305, + "mean_token_accuracy": 0.7368420958518982, + "num_tokens": 8092123.0, + "step": 5555, + "train/ce_loss": 2.184018135070801 + }, + { + "epoch": 0.5492386790587305, + "step": 5555, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5492386790587305, + "step": 5555, + "train/total_loss": 0.27308931946754456 + }, + { + "entropy": 9.16021728515625, + "epoch": 0.549337551908246, + "mean_token_accuracy": 0.681556224822998, + "num_tokens": 8097253.0, + "step": 5556, + "train/ce_loss": 0.7490461468696594 + }, + { + "epoch": 0.549337551908246, + "step": 5556, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.549337551908246, + "step": 5556, + "train/total_loss": 0.11396711319684982 + }, + { + "entropy": 9.095970153808594, + "epoch": 0.5494364247577616, + "mean_token_accuracy": 0.7917241454124451, + "num_tokens": 8102413.0, + "step": 5557, + "train/ce_loss": 0.5837256908416748 + }, + { + "epoch": 0.5494364247577616, + "step": 5557, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5494364247577616, + "step": 5557, + "train/total_loss": 0.09352882206439972 + }, + { + "entropy": 9.32501220703125, + "epoch": 0.549535297607277, + "mean_token_accuracy": 0.7417103052139282, + "num_tokens": 8107402.0, + "step": 5558, + "train/ce_loss": 3.2791404009913094e-06 + }, + { + "epoch": 0.549535297607277, + "step": 5558, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.549535297607277, + "step": 5558, + "train/total_loss": 0.054687827825546265 + }, + { + "entropy": 8.988893508911133, + "epoch": 0.5496341704567925, + "mean_token_accuracy": 0.7346241474151611, + "num_tokens": 8112744.0, + "step": 5559, + "train/ce_loss": 0.9331421852111816 + }, + { + "epoch": 0.5496341704567925, + "step": 5559, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5496341704567925, + "step": 5559, + "train/total_loss": 0.14409548044204712 + }, + { + "epoch": 0.5497330433063081, + "grad_norm": 0.7400034070014954, + "learning_rate": 8.628047272907086e-06, + "loss": 0.138, + "step": 5560 + }, + { + "entropy": 9.708757400512695, + "epoch": 0.5497330433063081, + "mean_token_accuracy": 0.7265306115150452, + "num_tokens": 8117594.0, + "step": 5560, + "train/ce_loss": 8.806768505564833e-07 + }, + { + "epoch": 0.5497330433063081, + "step": 5560, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.5497330433063081, + "step": 5560, + "train/total_loss": 0.011718838475644588 + }, + { + "entropy": 9.42172622680664, + "epoch": 0.5498319161558236, + "mean_token_accuracy": 0.731249988079071, + "num_tokens": 8122664.0, + "step": 5561, + "train/ce_loss": 1.261497139930725 + }, + { + "epoch": 0.5498319161558236, + "step": 5561, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5498319161558236, + "step": 5561, + "train/total_loss": 0.1769309639930725 + }, + { + "entropy": 9.3308744430542, + "epoch": 0.5499307890053391, + "mean_token_accuracy": 0.6599063873291016, + "num_tokens": 8127792.0, + "step": 5562, + "train/ce_loss": 1.3462740182876587 + }, + { + "epoch": 0.5499307890053391, + "step": 5562, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.5499307890053391, + "step": 5562, + "train/total_loss": 0.20884615182876587 + }, + { + "entropy": 8.472457885742188, + "epoch": 0.5500296618548547, + "mean_token_accuracy": 0.7492983937263489, + "num_tokens": 8133348.0, + "step": 5563, + "train/ce_loss": 0.9793670177459717 + }, + { + "epoch": 0.5500296618548547, + "step": 5563, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5500296618548547, + "step": 5563, + "train/total_loss": 0.1526242047548294 + }, + { + "entropy": 8.841888427734375, + "epoch": 0.5501285347043702, + "mean_token_accuracy": 0.6864516139030457, + "num_tokens": 8138586.0, + "step": 5564, + "train/ce_loss": 1.2656185626983643 + }, + { + "epoch": 0.5501285347043702, + "step": 5564, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5501285347043702, + "step": 5564, + "train/total_loss": 0.17343686521053314 + }, + { + "entropy": 9.570108413696289, + "epoch": 0.5502274075538857, + "mean_token_accuracy": 0.7607476711273193, + "num_tokens": 8143549.0, + "step": 5565, + "train/ce_loss": 1.570382096360845e-06 + }, + { + "epoch": 0.5502274075538857, + "step": 5565, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5502274075538857, + "step": 5565, + "train/total_loss": 0.058593906462192535 + }, + { + "entropy": 9.095592498779297, + "epoch": 0.5503262804034013, + "mean_token_accuracy": 0.7247838377952576, + "num_tokens": 8148730.0, + "step": 5566, + "train/ce_loss": 0.39141520857810974 + }, + { + "epoch": 0.5503262804034013, + "step": 5566, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5503262804034013, + "step": 5566, + "train/total_loss": 0.07820402085781097 + }, + { + "entropy": 9.185461044311523, + "epoch": 0.5504251532529167, + "mean_token_accuracy": 0.7051926255226135, + "num_tokens": 8153783.0, + "step": 5567, + "train/ce_loss": 0.9862339496612549 + }, + { + "epoch": 0.5504251532529167, + "step": 5567, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5504251532529167, + "step": 5567, + "train/total_loss": 0.1650296449661255 + }, + { + "entropy": 9.859312057495117, + "epoch": 0.5505240261024322, + "mean_token_accuracy": 0.7260273694992065, + "num_tokens": 8158558.0, + "step": 5568, + "train/ce_loss": 1.0583114089968149e-05 + }, + { + "epoch": 0.5505240261024322, + "step": 5568, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5505240261024322, + "step": 5568, + "train/total_loss": 0.04296980798244476 + }, + { + "entropy": 9.152839660644531, + "epoch": 0.5506228989519478, + "mean_token_accuracy": 0.7062069177627563, + "num_tokens": 8163728.0, + "step": 5569, + "train/ce_loss": 4.242161821821355e-07 + }, + { + "epoch": 0.5506228989519478, + "step": 5569, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.5506228989519478, + "step": 5569, + "train/total_loss": 0.027343792840838432 + }, + { + "entropy": 8.736828804016113, + "epoch": 0.5507217718014633, + "mean_token_accuracy": 0.7492323517799377, + "num_tokens": 8169180.0, + "step": 5570, + "train/ce_loss": 0.9878089427947998 + }, + { + "epoch": 0.5507217718014633, + "step": 5570, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5507217718014633, + "step": 5570, + "train/total_loss": 0.16128090023994446 + }, + { + "entropy": 9.42058277130127, + "epoch": 0.5508206446509788, + "mean_token_accuracy": 0.7604562640190125, + "num_tokens": 8174198.0, + "step": 5571, + "train/ce_loss": 0.9325904846191406 + }, + { + "epoch": 0.5508206446509788, + "step": 5571, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5508206446509788, + "step": 5571, + "train/total_loss": 0.1635715514421463 + }, + { + "entropy": 8.800806045532227, + "epoch": 0.5509195175004944, + "mean_token_accuracy": 0.7384792566299438, + "num_tokens": 8179559.0, + "step": 5572, + "train/ce_loss": 1.2294607162475586 + }, + { + "epoch": 0.5509195175004944, + "step": 5572, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.5509195175004944, + "step": 5572, + "train/total_loss": 0.20107108354568481 + }, + { + "entropy": 8.983238220214844, + "epoch": 0.5510183903500099, + "mean_token_accuracy": 0.7275822758674622, + "num_tokens": 8184923.0, + "step": 5573, + "train/ce_loss": 0.8919557332992554 + }, + { + "epoch": 0.5510183903500099, + "step": 5573, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5510183903500099, + "step": 5573, + "train/total_loss": 0.13997682929039001 + }, + { + "entropy": 9.278079986572266, + "epoch": 0.5511172631995254, + "mean_token_accuracy": 0.7447405457496643, + "num_tokens": 8190221.0, + "step": 5574, + "train/ce_loss": 1.7935465166374343e-06 + }, + { + "epoch": 0.5511172631995254, + "step": 5574, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5511172631995254, + "step": 5574, + "train/total_loss": 0.07031267881393433 + }, + { + "entropy": 9.665382385253906, + "epoch": 0.551216136049041, + "mean_token_accuracy": 0.8256704807281494, + "num_tokens": 8195140.0, + "step": 5575, + "train/ce_loss": 8.778425808486645e-07 + }, + { + "epoch": 0.551216136049041, + "step": 5575, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.551216136049041, + "step": 5575, + "train/total_loss": 0.011718837544322014 + }, + { + "entropy": 8.645299911499023, + "epoch": 0.5513150088985564, + "mean_token_accuracy": 0.7113187909126282, + "num_tokens": 8200579.0, + "step": 5576, + "train/ce_loss": 1.155021071434021 + }, + { + "epoch": 0.5513150088985564, + "step": 5576, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5513150088985564, + "step": 5576, + "train/total_loss": 0.17018961906433105 + }, + { + "entropy": 8.342084884643555, + "epoch": 0.5514138817480719, + "mean_token_accuracy": 0.8221845626831055, + "num_tokens": 8206263.0, + "step": 5577, + "train/ce_loss": 0.45889559388160706 + }, + { + "epoch": 0.5514138817480719, + "step": 5577, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5514138817480719, + "step": 5577, + "train/total_loss": 0.06932705640792847 + }, + { + "entropy": 9.861316680908203, + "epoch": 0.5515127545975875, + "mean_token_accuracy": 0.6985294222831726, + "num_tokens": 8211083.0, + "step": 5578, + "train/ce_loss": 1.9402579069137573 + }, + { + "epoch": 0.5515127545975875, + "step": 5578, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5515127545975875, + "step": 5578, + "train/total_loss": 0.24871329963207245 + }, + { + "entropy": 8.840457916259766, + "epoch": 0.551611627447103, + "mean_token_accuracy": 0.7086419463157654, + "num_tokens": 8216389.0, + "step": 5579, + "train/ce_loss": 1.1512106657028198 + }, + { + "epoch": 0.551611627447103, + "step": 5579, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.551611627447103, + "step": 5579, + "train/total_loss": 0.21668356657028198 + }, + { + "epoch": 0.5517105002966185, + "grad_norm": 0.9113156795501709, + "learning_rate": 8.623102408149137e-06, + "loss": 0.1391, + "step": 5580 + }, + { + "entropy": 8.88958740234375, + "epoch": 0.5517105002966185, + "mean_token_accuracy": 0.707317054271698, + "num_tokens": 8221609.0, + "step": 5580, + "train/ce_loss": 1.4289519786834717 + }, + { + "epoch": 0.5517105002966185, + "step": 5580, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5517105002966185, + "step": 5580, + "train/total_loss": 0.17805145680904388 + }, + { + "entropy": 8.861713409423828, + "epoch": 0.5518093731461341, + "mean_token_accuracy": 0.6754478216171265, + "num_tokens": 8227034.0, + "step": 5581, + "train/ce_loss": 1.0220152139663696 + }, + { + "epoch": 0.5518093731461341, + "step": 5581, + "train/sim_loss": 0.171875 + }, + { + "epoch": 0.5518093731461341, + "step": 5581, + "train/total_loss": 0.27407652139663696 + }, + { + "entropy": 9.000300407409668, + "epoch": 0.5519082459956496, + "mean_token_accuracy": 0.750952959060669, + "num_tokens": 8232332.0, + "step": 5582, + "train/ce_loss": 0.7720044255256653 + }, + { + "epoch": 0.5519082459956496, + "step": 5582, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5519082459956496, + "step": 5582, + "train/total_loss": 0.14751294255256653 + }, + { + "entropy": 8.809768676757812, + "epoch": 0.5520071188451651, + "mean_token_accuracy": 0.6777777671813965, + "num_tokens": 8237716.0, + "step": 5583, + "train/ce_loss": 0.761299729347229 + }, + { + "epoch": 0.5520071188451651, + "step": 5583, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5520071188451651, + "step": 5583, + "train/total_loss": 0.1308174729347229 + }, + { + "entropy": 9.893903732299805, + "epoch": 0.5521059916946807, + "mean_token_accuracy": 0.8021390438079834, + "num_tokens": 8242522.0, + "step": 5584, + "train/ce_loss": 1.2303241874178639e-06 + }, + { + "epoch": 0.5521059916946807, + "step": 5584, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5521059916946807, + "step": 5584, + "train/total_loss": 0.02343762293457985 + }, + { + "entropy": 8.786431312561035, + "epoch": 0.5522048645441962, + "mean_token_accuracy": 0.6747252941131592, + "num_tokens": 8247842.0, + "step": 5585, + "train/ce_loss": 1.6742024421691895 + }, + { + "epoch": 0.5522048645441962, + "step": 5585, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5522048645441962, + "step": 5585, + "train/total_loss": 0.20648275315761566 + }, + { + "entropy": 9.696439743041992, + "epoch": 0.5523037373937116, + "mean_token_accuracy": 0.8199446201324463, + "num_tokens": 8252630.0, + "step": 5586, + "train/ce_loss": 7.355477464443538e-06 + }, + { + "epoch": 0.5523037373937116, + "step": 5586, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5523037373937116, + "step": 5586, + "train/total_loss": 0.0703132376074791 + }, + { + "entropy": 8.938692092895508, + "epoch": 0.5524026102432272, + "mean_token_accuracy": 0.7350993156433105, + "num_tokens": 8258009.0, + "step": 5587, + "train/ce_loss": 1.184370517730713 + }, + { + "epoch": 0.5524026102432272, + "step": 5587, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.5524026102432272, + "step": 5587, + "train/total_loss": 0.1926558017730713 + }, + { + "entropy": 8.986148834228516, + "epoch": 0.5525014830927427, + "mean_token_accuracy": 0.732375979423523, + "num_tokens": 8263220.0, + "step": 5588, + "train/ce_loss": 1.660227656364441 + }, + { + "epoch": 0.5525014830927427, + "step": 5588, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5525014830927427, + "step": 5588, + "train/total_loss": 0.23633526265621185 + }, + { + "entropy": 9.082693099975586, + "epoch": 0.5526003559422582, + "mean_token_accuracy": 0.6867924332618713, + "num_tokens": 8268527.0, + "step": 5589, + "train/ce_loss": 0.9311652183532715 + }, + { + "epoch": 0.5526003559422582, + "step": 5589, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.5526003559422582, + "step": 5589, + "train/total_loss": 0.21030402183532715 + }, + { + "entropy": 8.685200691223145, + "epoch": 0.5526992287917738, + "mean_token_accuracy": 0.7639344334602356, + "num_tokens": 8273918.0, + "step": 5590, + "train/ce_loss": 0.8735048770904541 + }, + { + "epoch": 0.5526992287917738, + "step": 5590, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5526992287917738, + "step": 5590, + "train/total_loss": 0.1459442377090454 + }, + { + "entropy": 8.771509170532227, + "epoch": 0.5527981016412893, + "mean_token_accuracy": 0.7356828451156616, + "num_tokens": 8279213.0, + "step": 5591, + "train/ce_loss": 1.3483002185821533 + }, + { + "epoch": 0.5527981016412893, + "step": 5591, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5527981016412893, + "step": 5591, + "train/total_loss": 0.1973300278186798 + }, + { + "entropy": 9.093210220336914, + "epoch": 0.5528969744908048, + "mean_token_accuracy": 0.7544827461242676, + "num_tokens": 8284376.0, + "step": 5592, + "train/ce_loss": 0.5866910815238953 + }, + { + "epoch": 0.5528969744908048, + "step": 5592, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5528969744908048, + "step": 5592, + "train/total_loss": 0.11335660517215729 + }, + { + "entropy": 8.567116737365723, + "epoch": 0.5529958473403204, + "mean_token_accuracy": 0.7652958631515503, + "num_tokens": 8289849.0, + "step": 5593, + "train/ce_loss": 0.9828934073448181 + }, + { + "epoch": 0.5529958473403204, + "step": 5593, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5529958473403204, + "step": 5593, + "train/total_loss": 0.1607893407344818 + }, + { + "entropy": 8.970977783203125, + "epoch": 0.5530947201898359, + "mean_token_accuracy": 0.7647058963775635, + "num_tokens": 8295138.0, + "step": 5594, + "train/ce_loss": 1.2613295316696167 + }, + { + "epoch": 0.5530947201898359, + "step": 5594, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5530947201898359, + "step": 5594, + "train/total_loss": 0.16910170018672943 + }, + { + "entropy": 8.619521141052246, + "epoch": 0.5531935930393513, + "mean_token_accuracy": 0.7246192693710327, + "num_tokens": 8300383.0, + "step": 5595, + "train/ce_loss": 0.591346263885498 + }, + { + "epoch": 0.5531935930393513, + "step": 5595, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.5531935930393513, + "step": 5595, + "train/total_loss": 0.15288463234901428 + }, + { + "entropy": 8.945405006408691, + "epoch": 0.5532924658888669, + "mean_token_accuracy": 0.730555534362793, + "num_tokens": 8305601.0, + "step": 5596, + "train/ce_loss": 1.1461552381515503 + }, + { + "epoch": 0.5532924658888669, + "step": 5596, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5532924658888669, + "step": 5596, + "train/total_loss": 0.1849280297756195 + }, + { + "entropy": 8.84506607055664, + "epoch": 0.5533913387383824, + "mean_token_accuracy": 0.7098265886306763, + "num_tokens": 8310946.0, + "step": 5597, + "train/ce_loss": 0.8909079432487488 + }, + { + "epoch": 0.5533913387383824, + "step": 5597, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5533913387383824, + "step": 5597, + "train/total_loss": 0.12815329432487488 + }, + { + "entropy": 8.550594329833984, + "epoch": 0.553490211587898, + "mean_token_accuracy": 0.7557471394538879, + "num_tokens": 8316458.0, + "step": 5598, + "train/ce_loss": 0.9047664403915405 + }, + { + "epoch": 0.553490211587898, + "step": 5598, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.553490211587898, + "step": 5598, + "train/total_loss": 0.1725078970193863 + }, + { + "entropy": 9.361294746398926, + "epoch": 0.5535890844374135, + "mean_token_accuracy": 0.7869822382926941, + "num_tokens": 8321599.0, + "step": 5599, + "train/ce_loss": 0.7582494020462036 + }, + { + "epoch": 0.5535890844374135, + "step": 5599, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5535890844374135, + "step": 5599, + "train/total_loss": 0.13832494616508484 + }, + { + "epoch": 0.553687957286929, + "grad_norm": 0.6769205331802368, + "learning_rate": 8.61815754339119e-06, + "loss": 0.1456, + "step": 5600 + }, + { + "entropy": 9.160236358642578, + "epoch": 0.553687957286929, + "mean_token_accuracy": 0.7388535141944885, + "num_tokens": 8326844.0, + "step": 5600, + "train/ce_loss": 1.309563572249317e-06 + }, + { + "epoch": 0.553687957286929, + "step": 5600, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.553687957286929, + "step": 5600, + "train/total_loss": 0.058593880385160446 + }, + { + "entropy": 8.7738618850708, + "epoch": 0.5537868301364446, + "mean_token_accuracy": 0.773099422454834, + "num_tokens": 8332131.0, + "step": 5601, + "train/ce_loss": 0.9834937453269958 + }, + { + "epoch": 0.5537868301364446, + "step": 5601, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5537868301364446, + "step": 5601, + "train/total_loss": 0.16866187751293182 + }, + { + "entropy": 9.020515441894531, + "epoch": 0.5538857029859601, + "mean_token_accuracy": 0.7438308000564575, + "num_tokens": 8337447.0, + "step": 5602, + "train/ce_loss": 0.47414088249206543 + }, + { + "epoch": 0.5538857029859601, + "step": 5602, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5538857029859601, + "step": 5602, + "train/total_loss": 0.10991409420967102 + }, + { + "entropy": 8.730883598327637, + "epoch": 0.5539845758354756, + "mean_token_accuracy": 0.75, + "num_tokens": 8342565.0, + "step": 5603, + "train/ce_loss": 0.5624790191650391 + }, + { + "epoch": 0.5539845758354756, + "step": 5603, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5539845758354756, + "step": 5603, + "train/total_loss": 0.09531040489673615 + }, + { + "entropy": 8.995072364807129, + "epoch": 0.5540834486849912, + "mean_token_accuracy": 0.7682020664215088, + "num_tokens": 8347687.0, + "step": 5604, + "train/ce_loss": 6.226739060366526e-06 + }, + { + "epoch": 0.5540834486849912, + "step": 5604, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5540834486849912, + "step": 5604, + "train/total_loss": 0.06250062584877014 + }, + { + "entropy": 9.090400695800781, + "epoch": 0.5541823215345066, + "mean_token_accuracy": 0.697017252445221, + "num_tokens": 8352752.0, + "step": 5605, + "train/ce_loss": 1.042167067527771 + }, + { + "epoch": 0.5541823215345066, + "step": 5605, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.5541823215345066, + "step": 5605, + "train/total_loss": 0.19796670973300934 + }, + { + "entropy": 8.763729095458984, + "epoch": 0.5542811943840221, + "mean_token_accuracy": 0.7481162548065186, + "num_tokens": 8358183.0, + "step": 5606, + "train/ce_loss": 0.9994056224822998 + }, + { + "epoch": 0.5542811943840221, + "step": 5606, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5542811943840221, + "step": 5606, + "train/total_loss": 0.15853431820869446 + }, + { + "entropy": 8.979375839233398, + "epoch": 0.5543800672335377, + "mean_token_accuracy": 0.7763158082962036, + "num_tokens": 8363442.0, + "step": 5607, + "train/ce_loss": 0.508840799331665 + }, + { + "epoch": 0.5543800672335377, + "step": 5607, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.5543800672335377, + "step": 5607, + "train/total_loss": 0.07822783291339874 + }, + { + "entropy": 9.339065551757812, + "epoch": 0.5544789400830532, + "mean_token_accuracy": 0.7257142663002014, + "num_tokens": 8368436.0, + "step": 5608, + "train/ce_loss": 0.7017092108726501 + }, + { + "epoch": 0.5544789400830532, + "step": 5608, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5544789400830532, + "step": 5608, + "train/total_loss": 0.10923342406749725 + }, + { + "entropy": 8.672113418579102, + "epoch": 0.5545778129325687, + "mean_token_accuracy": 0.7023977637290955, + "num_tokens": 8373616.0, + "step": 5609, + "train/ce_loss": 1.250449538230896 + }, + { + "epoch": 0.5545778129325687, + "step": 5609, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5545778129325687, + "step": 5609, + "train/total_loss": 0.17191995680332184 + }, + { + "entropy": 8.980615615844727, + "epoch": 0.5546766857820843, + "mean_token_accuracy": 0.6982455849647522, + "num_tokens": 8378942.0, + "step": 5610, + "train/ce_loss": 1.1419041156768799 + }, + { + "epoch": 0.5546766857820843, + "step": 5610, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.5546766857820843, + "step": 5610, + "train/total_loss": 0.21184666454792023 + }, + { + "entropy": 8.500741004943848, + "epoch": 0.5547755586315998, + "mean_token_accuracy": 0.7479423880577087, + "num_tokens": 8384412.0, + "step": 5611, + "train/ce_loss": 0.6746719479560852 + }, + { + "epoch": 0.5547755586315998, + "step": 5611, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.5547755586315998, + "step": 5611, + "train/total_loss": 0.08309219777584076 + }, + { + "entropy": 8.913674354553223, + "epoch": 0.5548744314811153, + "mean_token_accuracy": 0.8062111735343933, + "num_tokens": 8389686.0, + "step": 5612, + "train/ce_loss": 0.6028916239738464 + }, + { + "epoch": 0.5548744314811153, + "step": 5612, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5548744314811153, + "step": 5612, + "train/total_loss": 0.1306016594171524 + }, + { + "entropy": 9.013097763061523, + "epoch": 0.5549733043306309, + "mean_token_accuracy": 0.7185697555541992, + "num_tokens": 8395083.0, + "step": 5613, + "train/ce_loss": 1.010940432548523 + }, + { + "epoch": 0.5549733043306309, + "step": 5613, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5549733043306309, + "step": 5613, + "train/total_loss": 0.15578153729438782 + }, + { + "entropy": 9.437446594238281, + "epoch": 0.5550721771801463, + "mean_token_accuracy": 0.7008403539657593, + "num_tokens": 8400156.0, + "step": 5614, + "train/ce_loss": 9.103745810534747e-07 + }, + { + "epoch": 0.5550721771801463, + "step": 5614, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5550721771801463, + "step": 5614, + "train/total_loss": 0.05859383940696716 + }, + { + "entropy": 9.192410469055176, + "epoch": 0.5551710500296618, + "mean_token_accuracy": 0.7471410632133484, + "num_tokens": 8405442.0, + "step": 5615, + "train/ce_loss": 0.9829228520393372 + }, + { + "epoch": 0.5551710500296618, + "step": 5615, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5551710500296618, + "step": 5615, + "train/total_loss": 0.1451672911643982 + }, + { + "entropy": 8.884618759155273, + "epoch": 0.5552699228791774, + "mean_token_accuracy": 0.7849944233894348, + "num_tokens": 8410775.0, + "step": 5616, + "train/ce_loss": 0.7053366899490356 + }, + { + "epoch": 0.5552699228791774, + "step": 5616, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5552699228791774, + "step": 5616, + "train/total_loss": 0.11740867048501968 + }, + { + "entropy": 9.2008695602417, + "epoch": 0.5553687957286929, + "mean_token_accuracy": 0.7620967626571655, + "num_tokens": 8415928.0, + "step": 5617, + "train/ce_loss": 1.3845499753952026 + }, + { + "epoch": 0.5553687957286929, + "step": 5617, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5553687957286929, + "step": 5617, + "train/total_loss": 0.19704875349998474 + }, + { + "entropy": 8.806562423706055, + "epoch": 0.5554676685782084, + "mean_token_accuracy": 0.7621621489524841, + "num_tokens": 8421297.0, + "step": 5618, + "train/ce_loss": 0.6530529856681824 + }, + { + "epoch": 0.5554676685782084, + "step": 5618, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5554676685782084, + "step": 5618, + "train/total_loss": 0.12780529260635376 + }, + { + "entropy": 9.03904914855957, + "epoch": 0.555566541427724, + "mean_token_accuracy": 0.6881720423698425, + "num_tokens": 8426593.0, + "step": 5619, + "train/ce_loss": 0.9342097043991089 + }, + { + "epoch": 0.555566541427724, + "step": 5619, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.555566541427724, + "step": 5619, + "train/total_loss": 0.12857723236083984 + }, + { + "epoch": 0.5556654142772395, + "grad_norm": 0.8386627435684204, + "learning_rate": 8.61321267863324e-06, + "loss": 0.1416, + "step": 5620 + }, + { + "entropy": 9.006744384765625, + "epoch": 0.5556654142772395, + "mean_token_accuracy": 0.7448856830596924, + "num_tokens": 8431952.0, + "step": 5620, + "train/ce_loss": 0.7422356605529785 + }, + { + "epoch": 0.5556654142772395, + "step": 5620, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5556654142772395, + "step": 5620, + "train/total_loss": 0.1406298279762268 + }, + { + "entropy": 9.266400337219238, + "epoch": 0.555764287126755, + "mean_token_accuracy": 0.7294750809669495, + "num_tokens": 8437140.0, + "step": 5621, + "train/ce_loss": 0.8088457584381104 + }, + { + "epoch": 0.555764287126755, + "step": 5621, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.555764287126755, + "step": 5621, + "train/total_loss": 0.12775957584381104 + }, + { + "entropy": 9.065841674804688, + "epoch": 0.5558631599762706, + "mean_token_accuracy": 0.7953431606292725, + "num_tokens": 8442452.0, + "step": 5622, + "train/ce_loss": 0.9971874356269836 + }, + { + "epoch": 0.5558631599762706, + "step": 5622, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5558631599762706, + "step": 5622, + "train/total_loss": 0.16612499952316284 + }, + { + "entropy": 9.286073684692383, + "epoch": 0.555962032825786, + "mean_token_accuracy": 0.7595356702804565, + "num_tokens": 8447535.0, + "step": 5623, + "train/ce_loss": 1.1132365465164185 + }, + { + "epoch": 0.555962032825786, + "step": 5623, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.555962032825786, + "step": 5623, + "train/total_loss": 0.16210490465164185 + }, + { + "entropy": 9.020340919494629, + "epoch": 0.5560609056753015, + "mean_token_accuracy": 0.7164705991744995, + "num_tokens": 8452899.0, + "step": 5624, + "train/ce_loss": 1.5640804767608643 + }, + { + "epoch": 0.5560609056753015, + "step": 5624, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5560609056753015, + "step": 5624, + "train/total_loss": 0.20328305661678314 + }, + { + "entropy": 8.8846435546875, + "epoch": 0.5561597785248171, + "mean_token_accuracy": 0.7750582695007324, + "num_tokens": 8458215.0, + "step": 5625, + "train/ce_loss": 0.4915861487388611 + }, + { + "epoch": 0.5561597785248171, + "step": 5625, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5561597785248171, + "step": 5625, + "train/total_loss": 0.06868986785411835 + }, + { + "entropy": 9.303529739379883, + "epoch": 0.5562586513743326, + "mean_token_accuracy": 0.7274052500724792, + "num_tokens": 8463356.0, + "step": 5626, + "train/ce_loss": 9.550376489642076e-07 + }, + { + "epoch": 0.5562586513743326, + "step": 5626, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5562586513743326, + "step": 5626, + "train/total_loss": 0.03125009685754776 + }, + { + "entropy": 9.091670989990234, + "epoch": 0.5563575242238481, + "mean_token_accuracy": 0.7559523582458496, + "num_tokens": 8468465.0, + "step": 5627, + "train/ce_loss": 1.0190938711166382 + }, + { + "epoch": 0.5563575242238481, + "step": 5627, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5563575242238481, + "step": 5627, + "train/total_loss": 0.17222189903259277 + }, + { + "entropy": 9.180963516235352, + "epoch": 0.5564563970733637, + "mean_token_accuracy": 0.7305389046669006, + "num_tokens": 8473808.0, + "step": 5628, + "train/ce_loss": 1.656849980354309 + }, + { + "epoch": 0.5564563970733637, + "step": 5628, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5564563970733637, + "step": 5628, + "train/total_loss": 0.2203724980354309 + }, + { + "entropy": 9.32286262512207, + "epoch": 0.5565552699228792, + "mean_token_accuracy": 0.7824143171310425, + "num_tokens": 8478937.0, + "step": 5629, + "train/ce_loss": 1.4140739494905574e-06 + }, + { + "epoch": 0.5565552699228792, + "step": 5629, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5565552699228792, + "step": 5629, + "train/total_loss": 0.05468764156103134 + }, + { + "entropy": 9.391812324523926, + "epoch": 0.5566541427723947, + "mean_token_accuracy": 0.7224805951118469, + "num_tokens": 8484065.0, + "step": 5630, + "train/ce_loss": 1.5405515796373948e-06 + }, + { + "epoch": 0.5566541427723947, + "step": 5630, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5566541427723947, + "step": 5630, + "train/total_loss": 0.03906265273690224 + }, + { + "entropy": 8.987224578857422, + "epoch": 0.5567530156219103, + "mean_token_accuracy": 0.7538280487060547, + "num_tokens": 8489403.0, + "step": 5631, + "train/ce_loss": 0.8874568939208984 + }, + { + "epoch": 0.5567530156219103, + "step": 5631, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5567530156219103, + "step": 5631, + "train/total_loss": 0.14343318343162537 + }, + { + "entropy": 8.805734634399414, + "epoch": 0.5568518884714257, + "mean_token_accuracy": 0.7350332736968994, + "num_tokens": 8494812.0, + "step": 5632, + "train/ce_loss": 0.5978017449378967 + }, + { + "epoch": 0.5568518884714257, + "step": 5632, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.5568518884714257, + "step": 5632, + "train/total_loss": 0.08712393045425415 + }, + { + "entropy": 9.129465103149414, + "epoch": 0.5569507613209412, + "mean_token_accuracy": 0.7413554787635803, + "num_tokens": 8500012.0, + "step": 5633, + "train/ce_loss": 1.4004480838775635 + }, + { + "epoch": 0.5569507613209412, + "step": 5633, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.5569507613209412, + "step": 5633, + "train/total_loss": 0.22598230838775635 + }, + { + "entropy": 9.708211898803711, + "epoch": 0.5570496341704568, + "mean_token_accuracy": 0.7262693047523499, + "num_tokens": 8504885.0, + "step": 5634, + "train/ce_loss": 2.0937819480895996 + }, + { + "epoch": 0.5570496341704568, + "step": 5634, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5570496341704568, + "step": 5634, + "train/total_loss": 0.271878182888031 + }, + { + "entropy": 8.60002326965332, + "epoch": 0.5571485070199723, + "mean_token_accuracy": 0.7036669850349426, + "num_tokens": 8510350.0, + "step": 5635, + "train/ce_loss": 0.8146705031394958 + }, + { + "epoch": 0.5571485070199723, + "step": 5635, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5571485070199723, + "step": 5635, + "train/total_loss": 0.12443580478429794 + }, + { + "entropy": 9.08309268951416, + "epoch": 0.5572473798694878, + "mean_token_accuracy": 0.7432065010070801, + "num_tokens": 8515556.0, + "step": 5636, + "train/ce_loss": 0.4288504719734192 + }, + { + "epoch": 0.5572473798694878, + "step": 5636, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5572473798694878, + "step": 5636, + "train/total_loss": 0.09366630017757416 + }, + { + "entropy": 8.686668395996094, + "epoch": 0.5573462527190034, + "mean_token_accuracy": 0.7473806738853455, + "num_tokens": 8520870.0, + "step": 5637, + "train/ce_loss": 0.8942636847496033 + }, + { + "epoch": 0.5573462527190034, + "step": 5637, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5573462527190034, + "step": 5637, + "train/total_loss": 0.14020761847496033 + }, + { + "entropy": 9.009510040283203, + "epoch": 0.5574451255685189, + "mean_token_accuracy": 0.7184594869613647, + "num_tokens": 8526107.0, + "step": 5638, + "train/ce_loss": 0.5916759967803955 + }, + { + "epoch": 0.5574451255685189, + "step": 5638, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5574451255685189, + "step": 5638, + "train/total_loss": 0.12166760116815567 + }, + { + "entropy": 9.53514289855957, + "epoch": 0.5575439984180344, + "mean_token_accuracy": 0.7396551966667175, + "num_tokens": 8531106.0, + "step": 5639, + "train/ce_loss": 1.0941106081008911 + }, + { + "epoch": 0.5575439984180344, + "step": 5639, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.5575439984180344, + "step": 5639, + "train/total_loss": 0.2187860608100891 + }, + { + "epoch": 0.55764287126755, + "grad_norm": 0.7694854140281677, + "learning_rate": 8.608267813875292e-06, + "loss": 0.14, + "step": 5640 + }, + { + "entropy": 9.507564544677734, + "epoch": 0.55764287126755, + "mean_token_accuracy": 0.7487520575523376, + "num_tokens": 8536317.0, + "step": 5640, + "train/ce_loss": 0.8715117573738098 + }, + { + "epoch": 0.55764287126755, + "step": 5640, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.55764287126755, + "step": 5640, + "train/total_loss": 0.1926199197769165 + }, + { + "entropy": 8.974124908447266, + "epoch": 0.5577417441170655, + "mean_token_accuracy": 0.7629213333129883, + "num_tokens": 8541712.0, + "step": 5641, + "train/ce_loss": 0.6484330892562866 + }, + { + "epoch": 0.5577417441170655, + "step": 5641, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5577417441170655, + "step": 5641, + "train/total_loss": 0.0999995619058609 + }, + { + "entropy": 8.264684677124023, + "epoch": 0.5578406169665809, + "mean_token_accuracy": 0.7664377093315125, + "num_tokens": 8547168.0, + "step": 5642, + "train/ce_loss": 0.7456291913986206 + }, + { + "epoch": 0.5578406169665809, + "step": 5642, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5578406169665809, + "step": 5642, + "train/total_loss": 0.1214379221200943 + }, + { + "entropy": 8.468953132629395, + "epoch": 0.5579394898160965, + "mean_token_accuracy": 0.735609769821167, + "num_tokens": 8552677.0, + "step": 5643, + "train/ce_loss": 0.7391076683998108 + }, + { + "epoch": 0.5579394898160965, + "step": 5643, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5579394898160965, + "step": 5643, + "train/total_loss": 0.14031702280044556 + }, + { + "entropy": 8.90363597869873, + "epoch": 0.558038362665612, + "mean_token_accuracy": 0.7963855266571045, + "num_tokens": 8557937.0, + "step": 5644, + "train/ce_loss": 0.4510335922241211 + }, + { + "epoch": 0.558038362665612, + "step": 5644, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.558038362665612, + "step": 5644, + "train/total_loss": 0.08025960624217987 + }, + { + "entropy": 8.877609252929688, + "epoch": 0.5581372355151275, + "mean_token_accuracy": 0.7982359528541565, + "num_tokens": 8563385.0, + "step": 5645, + "train/ce_loss": 0.7714386582374573 + }, + { + "epoch": 0.5581372355151275, + "step": 5645, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.5581372355151275, + "step": 5645, + "train/total_loss": 0.18261262774467468 + }, + { + "entropy": 8.755309104919434, + "epoch": 0.5582361083646431, + "mean_token_accuracy": 0.748110830783844, + "num_tokens": 8568666.0, + "step": 5646, + "train/ce_loss": 0.8779288530349731 + }, + { + "epoch": 0.5582361083646431, + "step": 5646, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5582361083646431, + "step": 5646, + "train/total_loss": 0.15029288828372955 + }, + { + "entropy": 9.328277587890625, + "epoch": 0.5583349812141586, + "mean_token_accuracy": 0.7593880295753479, + "num_tokens": 8573972.0, + "step": 5647, + "train/ce_loss": 8.177023573807674e-07 + }, + { + "epoch": 0.5583349812141586, + "step": 5647, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5583349812141586, + "step": 5647, + "train/total_loss": 0.031250081956386566 + }, + { + "entropy": 8.753901481628418, + "epoch": 0.5584338540636741, + "mean_token_accuracy": 0.7020000219345093, + "num_tokens": 8579475.0, + "step": 5648, + "train/ce_loss": 0.7852165699005127 + }, + { + "epoch": 0.5584338540636741, + "step": 5648, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.5584338540636741, + "step": 5648, + "train/total_loss": 0.17617791891098022 + }, + { + "entropy": 9.581437110900879, + "epoch": 0.5585327269131897, + "mean_token_accuracy": 0.7967742085456848, + "num_tokens": 8584686.0, + "step": 5649, + "train/ce_loss": 0.6837697625160217 + }, + { + "epoch": 0.5585327269131897, + "step": 5649, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5585327269131897, + "step": 5649, + "train/total_loss": 0.08790823072195053 + }, + { + "entropy": 9.493171691894531, + "epoch": 0.5586315997627052, + "mean_token_accuracy": 0.6952381134033203, + "num_tokens": 8589667.0, + "step": 5650, + "train/ce_loss": 1.4867403507232666 + }, + { + "epoch": 0.5586315997627052, + "step": 5650, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5586315997627052, + "step": 5650, + "train/total_loss": 0.21117404103279114 + }, + { + "entropy": 8.709896087646484, + "epoch": 0.5587304726122206, + "mean_token_accuracy": 0.7894117832183838, + "num_tokens": 8595022.0, + "step": 5651, + "train/ce_loss": 0.9483495950698853 + }, + { + "epoch": 0.5587304726122206, + "step": 5651, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5587304726122206, + "step": 5651, + "train/total_loss": 0.13389745354652405 + }, + { + "entropy": 9.296415328979492, + "epoch": 0.5588293454617362, + "mean_token_accuracy": 0.7744107842445374, + "num_tokens": 8600050.0, + "step": 5652, + "train/ce_loss": 1.5259263363986975e-06 + }, + { + "epoch": 0.5588293454617362, + "step": 5652, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.5588293454617362, + "step": 5652, + "train/total_loss": 0.027343902736902237 + }, + { + "entropy": 9.139948844909668, + "epoch": 0.5589282183112517, + "mean_token_accuracy": 0.7699346542358398, + "num_tokens": 8605289.0, + "step": 5653, + "train/ce_loss": 0.6834624409675598 + }, + { + "epoch": 0.5589282183112517, + "step": 5653, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5589282183112517, + "step": 5653, + "train/total_loss": 0.09178374707698822 + }, + { + "entropy": 9.215531349182129, + "epoch": 0.5590270911607672, + "mean_token_accuracy": 0.8176583647727966, + "num_tokens": 8610231.0, + "step": 5654, + "train/ce_loss": 1.2036710977554321 + }, + { + "epoch": 0.5590270911607672, + "step": 5654, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.5590270911607672, + "step": 5654, + "train/total_loss": 0.2023983597755432 + }, + { + "entropy": 9.8184232711792, + "epoch": 0.5591259640102828, + "mean_token_accuracy": 0.7412280440330505, + "num_tokens": 8615130.0, + "step": 5655, + "train/ce_loss": 1.816309350033407e-06 + }, + { + "epoch": 0.5591259640102828, + "step": 5655, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5591259640102828, + "step": 5655, + "train/total_loss": 0.06640642881393433 + }, + { + "entropy": 8.991920471191406, + "epoch": 0.5592248368597983, + "mean_token_accuracy": 0.7032085657119751, + "num_tokens": 8620348.0, + "step": 5656, + "train/ce_loss": 0.9642362594604492 + }, + { + "epoch": 0.5592248368597983, + "step": 5656, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.5592248368597983, + "step": 5656, + "train/total_loss": 0.20970487594604492 + }, + { + "entropy": 8.72332763671875, + "epoch": 0.5593237097093138, + "mean_token_accuracy": 0.7709251046180725, + "num_tokens": 8625746.0, + "step": 5657, + "train/ce_loss": 0.8530164361000061 + }, + { + "epoch": 0.5593237097093138, + "step": 5657, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5593237097093138, + "step": 5657, + "train/total_loss": 0.12436414510011673 + }, + { + "entropy": 9.364713668823242, + "epoch": 0.5594225825588294, + "mean_token_accuracy": 0.7829457521438599, + "num_tokens": 8630747.0, + "step": 5658, + "train/ce_loss": 1.0951863527297974 + }, + { + "epoch": 0.5594225825588294, + "step": 5658, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.5594225825588294, + "step": 5658, + "train/total_loss": 0.1837373971939087 + }, + { + "entropy": 8.714923858642578, + "epoch": 0.5595214554083449, + "mean_token_accuracy": 0.7194412350654602, + "num_tokens": 8636109.0, + "step": 5659, + "train/ce_loss": 0.4874354898929596 + }, + { + "epoch": 0.5595214554083449, + "step": 5659, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5595214554083449, + "step": 5659, + "train/total_loss": 0.09561854600906372 + }, + { + "epoch": 0.5596203282578603, + "grad_norm": 0.7099284529685974, + "learning_rate": 8.603322949117342e-06, + "loss": 0.1325, + "step": 5660 + }, + { + "entropy": 8.914743423461914, + "epoch": 0.5596203282578603, + "mean_token_accuracy": 0.7547169923782349, + "num_tokens": 8641401.0, + "step": 5660, + "train/ce_loss": 0.7595081329345703 + }, + { + "epoch": 0.5596203282578603, + "step": 5660, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.5596203282578603, + "step": 5660, + "train/total_loss": 0.15016956627368927 + }, + { + "entropy": 9.22370433807373, + "epoch": 0.5597192011073759, + "mean_token_accuracy": 0.7557603716850281, + "num_tokens": 8646495.0, + "step": 5661, + "train/ce_loss": 1.8618015928950626e-06 + }, + { + "epoch": 0.5597192011073759, + "step": 5661, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.5597192011073759, + "step": 5661, + "train/total_loss": 0.027343936264514923 + }, + { + "entropy": 8.675178527832031, + "epoch": 0.5598180739568914, + "mean_token_accuracy": 0.711904764175415, + "num_tokens": 8651791.0, + "step": 5662, + "train/ce_loss": 1.1464154720306396 + }, + { + "epoch": 0.5598180739568914, + "step": 5662, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.5598180739568914, + "step": 5662, + "train/total_loss": 0.24354779720306396 + }, + { + "entropy": 8.901063919067383, + "epoch": 0.5599169468064069, + "mean_token_accuracy": 0.7570694088935852, + "num_tokens": 8657026.0, + "step": 5663, + "train/ce_loss": 0.45017698407173157 + }, + { + "epoch": 0.5599169468064069, + "step": 5663, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5599169468064069, + "step": 5663, + "train/total_loss": 0.09970520436763763 + }, + { + "entropy": 9.59109115600586, + "epoch": 0.5600158196559225, + "mean_token_accuracy": 0.7450658082962036, + "num_tokens": 8662195.0, + "step": 5664, + "train/ce_loss": 1.289162278175354 + }, + { + "epoch": 0.5600158196559225, + "step": 5664, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.5600158196559225, + "step": 5664, + "train/total_loss": 0.20704123377799988 + }, + { + "entropy": 9.324518203735352, + "epoch": 0.560114692505438, + "mean_token_accuracy": 0.7675111889839172, + "num_tokens": 8667321.0, + "step": 5665, + "train/ce_loss": 0.3714592456817627 + }, + { + "epoch": 0.560114692505438, + "step": 5665, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.560114692505438, + "step": 5665, + "train/total_loss": 0.08011467754840851 + }, + { + "entropy": 8.930580139160156, + "epoch": 0.5602135653549535, + "mean_token_accuracy": 0.7133758068084717, + "num_tokens": 8672583.0, + "step": 5666, + "train/ce_loss": 0.8320925831794739 + }, + { + "epoch": 0.5602135653549535, + "step": 5666, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5602135653549535, + "step": 5666, + "train/total_loss": 0.11445926129817963 + }, + { + "entropy": 9.13973331451416, + "epoch": 0.5603124382044691, + "mean_token_accuracy": 0.7386519908905029, + "num_tokens": 8677824.0, + "step": 5667, + "train/ce_loss": 0.716332197189331 + }, + { + "epoch": 0.5603124382044691, + "step": 5667, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5603124382044691, + "step": 5667, + "train/total_loss": 0.1302269697189331 + }, + { + "entropy": 9.639233589172363, + "epoch": 0.5604113110539846, + "mean_token_accuracy": 0.7438691854476929, + "num_tokens": 8682613.0, + "step": 5668, + "train/ce_loss": 1.7468125820159912 + }, + { + "epoch": 0.5604113110539846, + "step": 5668, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5604113110539846, + "step": 5668, + "train/total_loss": 0.21374376118183136 + }, + { + "entropy": 9.132619857788086, + "epoch": 0.5605101839035, + "mean_token_accuracy": 0.7081760764122009, + "num_tokens": 8687793.0, + "step": 5669, + "train/ce_loss": 0.6844165921211243 + }, + { + "epoch": 0.5605101839035, + "step": 5669, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5605101839035, + "step": 5669, + "train/total_loss": 0.10750415921211243 + }, + { + "entropy": 9.330621719360352, + "epoch": 0.5606090567530156, + "mean_token_accuracy": 0.7121661901473999, + "num_tokens": 8692919.0, + "step": 5670, + "train/ce_loss": 3.438371777519933e-06 + }, + { + "epoch": 0.5606090567530156, + "step": 5670, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5606090567530156, + "step": 5670, + "train/total_loss": 0.04687534272670746 + }, + { + "entropy": 8.55720329284668, + "epoch": 0.5607079296025311, + "mean_token_accuracy": 0.7661691308021545, + "num_tokens": 8698438.0, + "step": 5671, + "train/ce_loss": 0.4348050355911255 + }, + { + "epoch": 0.5607079296025311, + "step": 5671, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5607079296025311, + "step": 5671, + "train/total_loss": 0.06691800057888031 + }, + { + "entropy": 8.937002182006836, + "epoch": 0.5608068024520466, + "mean_token_accuracy": 0.7334801554679871, + "num_tokens": 8703803.0, + "step": 5672, + "train/ce_loss": 0.9411271810531616 + }, + { + "epoch": 0.5608068024520466, + "step": 5672, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5608068024520466, + "step": 5672, + "train/total_loss": 0.14489397406578064 + }, + { + "entropy": 8.997447967529297, + "epoch": 0.5609056753015622, + "mean_token_accuracy": 0.7618438005447388, + "num_tokens": 8709092.0, + "step": 5673, + "train/ce_loss": 0.7748585939407349 + }, + { + "epoch": 0.5609056753015622, + "step": 5673, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5609056753015622, + "step": 5673, + "train/total_loss": 0.11264210939407349 + }, + { + "entropy": 9.029390335083008, + "epoch": 0.5610045481510777, + "mean_token_accuracy": 0.7461809515953064, + "num_tokens": 8714419.0, + "step": 5674, + "train/ce_loss": 1.474387526512146 + }, + { + "epoch": 0.5610045481510777, + "step": 5674, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5610045481510777, + "step": 5674, + "train/total_loss": 0.20212624967098236 + }, + { + "entropy": 8.8839750289917, + "epoch": 0.5611034210005932, + "mean_token_accuracy": 0.7093167901039124, + "num_tokens": 8719669.0, + "step": 5675, + "train/ce_loss": 0.7157660722732544 + }, + { + "epoch": 0.5611034210005932, + "step": 5675, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5611034210005932, + "step": 5675, + "train/total_loss": 0.11063911020755768 + }, + { + "entropy": 8.97752571105957, + "epoch": 0.5612022938501088, + "mean_token_accuracy": 0.7981651425361633, + "num_tokens": 8724910.0, + "step": 5676, + "train/ce_loss": 0.7324572205543518 + }, + { + "epoch": 0.5612022938501088, + "step": 5676, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5612022938501088, + "step": 5676, + "train/total_loss": 0.13574573397636414 + }, + { + "entropy": 9.560317039489746, + "epoch": 0.5613011666996243, + "mean_token_accuracy": 0.810606062412262, + "num_tokens": 8729871.0, + "step": 5677, + "train/ce_loss": 0.867394745349884 + }, + { + "epoch": 0.5613011666996243, + "step": 5677, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5613011666996243, + "step": 5677, + "train/total_loss": 0.11017697304487228 + }, + { + "entropy": 9.277361869812012, + "epoch": 0.5614000395491398, + "mean_token_accuracy": 0.7605396509170532, + "num_tokens": 8734895.0, + "step": 5678, + "train/ce_loss": 0.9489947557449341 + }, + { + "epoch": 0.5614000395491398, + "step": 5678, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5614000395491398, + "step": 5678, + "train/total_loss": 0.1456807255744934 + }, + { + "entropy": 8.804372787475586, + "epoch": 0.5614989123986553, + "mean_token_accuracy": 0.7472160458564758, + "num_tokens": 8740298.0, + "step": 5679, + "train/ce_loss": 1.4822356700897217 + }, + { + "epoch": 0.5614989123986553, + "step": 5679, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.5614989123986553, + "step": 5679, + "train/total_loss": 0.2615048289299011 + }, + { + "epoch": 0.5615977852481708, + "grad_norm": 0.7704647183418274, + "learning_rate": 8.598378084359393e-06, + "loss": 0.1372, + "step": 5680 + }, + { + "entropy": 8.915640830993652, + "epoch": 0.5615977852481708, + "mean_token_accuracy": 0.7661574482917786, + "num_tokens": 8745618.0, + "step": 5680, + "train/ce_loss": 0.8491262793540955 + }, + { + "epoch": 0.5615977852481708, + "step": 5680, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.5615977852481708, + "step": 5680, + "train/total_loss": 0.10053762793540955 + }, + { + "entropy": 9.401718139648438, + "epoch": 0.5616966580976864, + "mean_token_accuracy": 0.7157894968986511, + "num_tokens": 8750598.0, + "step": 5681, + "train/ce_loss": 2.3388354778289795 + }, + { + "epoch": 0.5616966580976864, + "step": 5681, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.5616966580976864, + "step": 5681, + "train/total_loss": 0.3198210597038269 + }, + { + "entropy": 8.893880844116211, + "epoch": 0.5617955309472019, + "mean_token_accuracy": 0.7247706651687622, + "num_tokens": 8755811.0, + "step": 5682, + "train/ce_loss": 0.6774603724479675 + }, + { + "epoch": 0.5617955309472019, + "step": 5682, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5617955309472019, + "step": 5682, + "train/total_loss": 0.11852728575468063 + }, + { + "entropy": 9.121014595031738, + "epoch": 0.5618944037967174, + "mean_token_accuracy": 0.792682945728302, + "num_tokens": 8760992.0, + "step": 5683, + "train/ce_loss": 0.852364718914032 + }, + { + "epoch": 0.5618944037967174, + "step": 5683, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.5618944037967174, + "step": 5683, + "train/total_loss": 0.20242397487163544 + }, + { + "entropy": 9.24246597290039, + "epoch": 0.561993276646233, + "mean_token_accuracy": 0.7210526466369629, + "num_tokens": 8765999.0, + "step": 5684, + "train/ce_loss": 0.7150706648826599 + }, + { + "epoch": 0.561993276646233, + "step": 5684, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.561993276646233, + "step": 5684, + "train/total_loss": 0.09494456648826599 + }, + { + "entropy": 9.18613338470459, + "epoch": 0.5620921494957485, + "mean_token_accuracy": 0.6899350881576538, + "num_tokens": 8771036.0, + "step": 5685, + "train/ce_loss": 5.070203883406066e-07 + }, + { + "epoch": 0.5620921494957485, + "step": 5685, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5620921494957485, + "step": 5685, + "train/total_loss": 0.01953130029141903 + }, + { + "entropy": 8.672883987426758, + "epoch": 0.562191022345264, + "mean_token_accuracy": 0.7551440596580505, + "num_tokens": 8776466.0, + "step": 5686, + "train/ce_loss": 0.8789169192314148 + }, + { + "epoch": 0.562191022345264, + "step": 5686, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.562191022345264, + "step": 5686, + "train/total_loss": 0.18945419788360596 + }, + { + "entropy": 9.249173164367676, + "epoch": 0.5622898951947796, + "mean_token_accuracy": 0.7518796920776367, + "num_tokens": 8781440.0, + "step": 5687, + "train/ce_loss": 0.6717962026596069 + }, + { + "epoch": 0.5622898951947796, + "step": 5687, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5622898951947796, + "step": 5687, + "train/total_loss": 0.1062421202659607 + }, + { + "entropy": 9.638214111328125, + "epoch": 0.562388768044295, + "mean_token_accuracy": 0.7749999761581421, + "num_tokens": 8786281.0, + "step": 5688, + "train/ce_loss": 1.647937297821045 + }, + { + "epoch": 0.562388768044295, + "step": 5688, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.562388768044295, + "step": 5688, + "train/total_loss": 0.2116687297821045 + }, + { + "entropy": 8.669907569885254, + "epoch": 0.5624876408938105, + "mean_token_accuracy": 0.7421109676361084, + "num_tokens": 8791594.0, + "step": 5689, + "train/ce_loss": 0.7021977305412292 + }, + { + "epoch": 0.5624876408938105, + "step": 5689, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5624876408938105, + "step": 5689, + "train/total_loss": 0.12881353497505188 + }, + { + "entropy": 8.93558406829834, + "epoch": 0.5625865137433261, + "mean_token_accuracy": 0.7482876777648926, + "num_tokens": 8796644.0, + "step": 5690, + "train/ce_loss": 1.319148063659668 + }, + { + "epoch": 0.5625865137433261, + "step": 5690, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5625865137433261, + "step": 5690, + "train/total_loss": 0.19050855934619904 + }, + { + "entropy": 8.610952377319336, + "epoch": 0.5626853865928416, + "mean_token_accuracy": 0.7578475475311279, + "num_tokens": 8802089.0, + "step": 5691, + "train/ce_loss": 0.718697190284729 + }, + { + "epoch": 0.5626853865928416, + "step": 5691, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5626853865928416, + "step": 5691, + "train/total_loss": 0.14218223094940186 + }, + { + "entropy": 9.175633430480957, + "epoch": 0.5627842594423571, + "mean_token_accuracy": 0.7532656192779541, + "num_tokens": 8807383.0, + "step": 5692, + "train/ce_loss": 0.9090287089347839 + }, + { + "epoch": 0.5627842594423571, + "step": 5692, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5627842594423571, + "step": 5692, + "train/total_loss": 0.13387161493301392 + }, + { + "entropy": 8.94670295715332, + "epoch": 0.5628831322918727, + "mean_token_accuracy": 0.793379008769989, + "num_tokens": 8812742.0, + "step": 5693, + "train/ce_loss": 0.7289281487464905 + }, + { + "epoch": 0.5628831322918727, + "step": 5693, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.5628831322918727, + "step": 5693, + "train/total_loss": 0.08851781487464905 + }, + { + "entropy": 8.990468978881836, + "epoch": 0.5629820051413882, + "mean_token_accuracy": 0.791293203830719, + "num_tokens": 8818001.0, + "step": 5694, + "train/ce_loss": 0.6125853657722473 + }, + { + "epoch": 0.5629820051413882, + "step": 5694, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5629820051413882, + "step": 5694, + "train/total_loss": 0.10813353955745697 + }, + { + "entropy": 8.891982078552246, + "epoch": 0.5630808779909037, + "mean_token_accuracy": 0.7771493196487427, + "num_tokens": 8823232.0, + "step": 5695, + "train/ce_loss": 0.6236464381217957 + }, + { + "epoch": 0.5630808779909037, + "step": 5695, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5630808779909037, + "step": 5695, + "train/total_loss": 0.08580214530229568 + }, + { + "entropy": 9.578967094421387, + "epoch": 0.5631797508404193, + "mean_token_accuracy": 0.7374100685119629, + "num_tokens": 8828219.0, + "step": 5696, + "train/ce_loss": 1.7185809610964498e-06 + }, + { + "epoch": 0.5631797508404193, + "step": 5696, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5631797508404193, + "step": 5696, + "train/total_loss": 0.06640642136335373 + }, + { + "entropy": 8.8345947265625, + "epoch": 0.5632786236899348, + "mean_token_accuracy": 0.7513455152511597, + "num_tokens": 8833644.0, + "step": 5697, + "train/ce_loss": 1.2025315761566162 + }, + { + "epoch": 0.5632786236899348, + "step": 5697, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.5632786236899348, + "step": 5697, + "train/total_loss": 0.22962816059589386 + }, + { + "entropy": 9.412832260131836, + "epoch": 0.5633774965394502, + "mean_token_accuracy": 0.7787742614746094, + "num_tokens": 8838743.0, + "step": 5698, + "train/ce_loss": 1.0013915300369263 + }, + { + "epoch": 0.5633774965394502, + "step": 5698, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5633774965394502, + "step": 5698, + "train/total_loss": 0.14701415598392487 + }, + { + "entropy": 9.53570556640625, + "epoch": 0.5634763693889658, + "mean_token_accuracy": 0.7682333588600159, + "num_tokens": 8843736.0, + "step": 5699, + "train/ce_loss": 2.7366942958906293e-05 + }, + { + "epoch": 0.5634763693889658, + "step": 5699, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5634763693889658, + "step": 5699, + "train/total_loss": 0.02344023622572422 + }, + { + "epoch": 0.5635752422384813, + "grad_norm": 0.6368483901023865, + "learning_rate": 8.593433219601445e-06, + "loss": 0.129, + "step": 5700 + }, + { + "entropy": 8.884232521057129, + "epoch": 0.5635752422384813, + "mean_token_accuracy": 0.7505567669868469, + "num_tokens": 8849050.0, + "step": 5700, + "train/ce_loss": 0.9633924961090088 + }, + { + "epoch": 0.5635752422384813, + "step": 5700, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5635752422384813, + "step": 5700, + "train/total_loss": 0.15102675557136536 + }, + { + "entropy": 8.801679611206055, + "epoch": 0.5636741150879968, + "mean_token_accuracy": 0.733485221862793, + "num_tokens": 8854369.0, + "step": 5701, + "train/ce_loss": 1.0334956645965576 + }, + { + "epoch": 0.5636741150879968, + "step": 5701, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.5636741150879968, + "step": 5701, + "train/total_loss": 0.18147456645965576 + }, + { + "entropy": 8.707024574279785, + "epoch": 0.5637729879375124, + "mean_token_accuracy": 0.7114093899726868, + "num_tokens": 8859908.0, + "step": 5702, + "train/ce_loss": 0.5167938470840454 + }, + { + "epoch": 0.5637729879375124, + "step": 5702, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.5637729879375124, + "step": 5702, + "train/total_loss": 0.06730438768863678 + }, + { + "entropy": 9.086158752441406, + "epoch": 0.5638718607870279, + "mean_token_accuracy": 0.6867779493331909, + "num_tokens": 8865165.0, + "step": 5703, + "train/ce_loss": 1.7837257385253906 + }, + { + "epoch": 0.5638718607870279, + "step": 5703, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5638718607870279, + "step": 5703, + "train/total_loss": 0.2447788268327713 + }, + { + "entropy": 9.026750564575195, + "epoch": 0.5639707336365434, + "mean_token_accuracy": 0.6993630528450012, + "num_tokens": 8870416.0, + "step": 5704, + "train/ce_loss": 0.8165999054908752 + }, + { + "epoch": 0.5639707336365434, + "step": 5704, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5639707336365434, + "step": 5704, + "train/total_loss": 0.11290999501943588 + }, + { + "entropy": 9.089848518371582, + "epoch": 0.564069606486059, + "mean_token_accuracy": 0.7402777671813965, + "num_tokens": 8875584.0, + "step": 5705, + "train/ce_loss": 4.269746114005102e-06 + }, + { + "epoch": 0.564069606486059, + "step": 5705, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.564069606486059, + "step": 5705, + "train/total_loss": 0.03906292840838432 + }, + { + "entropy": 9.04459285736084, + "epoch": 0.5641684793355745, + "mean_token_accuracy": 0.739847719669342, + "num_tokens": 8880899.0, + "step": 5706, + "train/ce_loss": 0.8907245993614197 + }, + { + "epoch": 0.5641684793355745, + "step": 5706, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5641684793355745, + "step": 5706, + "train/total_loss": 0.15938496589660645 + }, + { + "entropy": 8.861656188964844, + "epoch": 0.5642673521850899, + "mean_token_accuracy": 0.65625, + "num_tokens": 8886216.0, + "step": 5707, + "train/ce_loss": 1.6117205619812012 + }, + { + "epoch": 0.5642673521850899, + "step": 5707, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.5642673521850899, + "step": 5707, + "train/total_loss": 0.2549220621585846 + }, + { + "entropy": 9.780906677246094, + "epoch": 0.5643662250346055, + "mean_token_accuracy": 0.7160493731498718, + "num_tokens": 8891058.0, + "step": 5708, + "train/ce_loss": 2.286335984535981e-06 + }, + { + "epoch": 0.5643662250346055, + "step": 5708, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5643662250346055, + "step": 5708, + "train/total_loss": 0.039062727242708206 + }, + { + "entropy": 9.029703140258789, + "epoch": 0.564465097884121, + "mean_token_accuracy": 0.7704517841339111, + "num_tokens": 8896294.0, + "step": 5709, + "train/ce_loss": 0.8209226727485657 + }, + { + "epoch": 0.564465097884121, + "step": 5709, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.564465097884121, + "step": 5709, + "train/total_loss": 0.1367797702550888 + }, + { + "entropy": 8.482695579528809, + "epoch": 0.5645639707336365, + "mean_token_accuracy": 0.7508772015571594, + "num_tokens": 8901869.0, + "step": 5710, + "train/ce_loss": 1.0111846923828125 + }, + { + "epoch": 0.5645639707336365, + "step": 5710, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5645639707336365, + "step": 5710, + "train/total_loss": 0.15189972519874573 + }, + { + "entropy": 9.391302108764648, + "epoch": 0.5646628435831521, + "mean_token_accuracy": 0.7514880895614624, + "num_tokens": 8906958.0, + "step": 5711, + "train/ce_loss": 0.8454544544219971 + }, + { + "epoch": 0.5646628435831521, + "step": 5711, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.5646628435831521, + "step": 5711, + "train/total_loss": 0.11188919842243195 + }, + { + "entropy": 9.042204856872559, + "epoch": 0.5647617164326676, + "mean_token_accuracy": 0.7222222089767456, + "num_tokens": 8912267.0, + "step": 5712, + "train/ce_loss": 0.9595696926116943 + }, + { + "epoch": 0.5647617164326676, + "step": 5712, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5647617164326676, + "step": 5712, + "train/total_loss": 0.1623632311820984 + }, + { + "entropy": 9.324300765991211, + "epoch": 0.5648605892821831, + "mean_token_accuracy": 0.7474892139434814, + "num_tokens": 8917395.0, + "step": 5713, + "train/ce_loss": 0.9969939589500427 + }, + { + "epoch": 0.5648605892821831, + "step": 5713, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5648605892821831, + "step": 5713, + "train/total_loss": 0.16219940781593323 + }, + { + "entropy": 8.86493968963623, + "epoch": 0.5649594621316987, + "mean_token_accuracy": 0.6859956383705139, + "num_tokens": 8922773.0, + "step": 5714, + "train/ce_loss": 0.708761990070343 + }, + { + "epoch": 0.5649594621316987, + "step": 5714, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5649594621316987, + "step": 5714, + "train/total_loss": 0.12946996092796326 + }, + { + "entropy": 9.991195678710938, + "epoch": 0.5650583349812142, + "mean_token_accuracy": 0.739130437374115, + "num_tokens": 8927471.0, + "step": 5715, + "train/ce_loss": 1.235878348350525 + }, + { + "epoch": 0.5650583349812142, + "step": 5715, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5650583349812142, + "step": 5715, + "train/total_loss": 0.17436909675598145 + }, + { + "entropy": 8.891902923583984, + "epoch": 0.5651572078307296, + "mean_token_accuracy": 0.7307262420654297, + "num_tokens": 8932795.0, + "step": 5716, + "train/ce_loss": 0.42581817507743835 + }, + { + "epoch": 0.5651572078307296, + "step": 5716, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5651572078307296, + "step": 5716, + "train/total_loss": 0.06601931899785995 + }, + { + "entropy": 9.35824966430664, + "epoch": 0.5652560806802452, + "mean_token_accuracy": 0.7387820482254028, + "num_tokens": 8937883.0, + "step": 5717, + "train/ce_loss": 1.1495380401611328 + }, + { + "epoch": 0.5652560806802452, + "step": 5717, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5652560806802452, + "step": 5717, + "train/total_loss": 0.18526631593704224 + }, + { + "entropy": 9.609567642211914, + "epoch": 0.5653549535297607, + "mean_token_accuracy": 0.7029288411140442, + "num_tokens": 8942819.0, + "step": 5718, + "train/ce_loss": 1.4963005696699838e-06 + }, + { + "epoch": 0.5653549535297607, + "step": 5718, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5653549535297607, + "step": 5718, + "train/total_loss": 0.03515639901161194 + }, + { + "entropy": 9.406478881835938, + "epoch": 0.5654538263792762, + "mean_token_accuracy": 0.6909937858581543, + "num_tokens": 8947942.0, + "step": 5719, + "train/ce_loss": 1.1935173915844643e-06 + }, + { + "epoch": 0.5654538263792762, + "step": 5719, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5654538263792762, + "step": 5719, + "train/total_loss": 0.06250011920928955 + }, + { + "epoch": 0.5655526992287918, + "grad_norm": 0.7832507491111755, + "learning_rate": 8.588488354843496e-06, + "loss": 0.1492, + "step": 5720 + }, + { + "entropy": 8.786664009094238, + "epoch": 0.5655526992287918, + "mean_token_accuracy": 0.7355035543441772, + "num_tokens": 8953406.0, + "step": 5720, + "train/ce_loss": 1.0731121301651 + }, + { + "epoch": 0.5655526992287918, + "step": 5720, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5655526992287918, + "step": 5720, + "train/total_loss": 0.1541862189769745 + }, + { + "entropy": 9.160669326782227, + "epoch": 0.5656515720783073, + "mean_token_accuracy": 0.673202633857727, + "num_tokens": 8958626.0, + "step": 5721, + "train/ce_loss": 0.8698038458824158 + }, + { + "epoch": 0.5656515720783073, + "step": 5721, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5656515720783073, + "step": 5721, + "train/total_loss": 0.15729288756847382 + }, + { + "entropy": 9.497705459594727, + "epoch": 0.5657504449278228, + "mean_token_accuracy": 0.7447154521942139, + "num_tokens": 8963702.0, + "step": 5722, + "train/ce_loss": 0.8238604068756104 + }, + { + "epoch": 0.5657504449278228, + "step": 5722, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5657504449278228, + "step": 5722, + "train/total_loss": 0.1292610466480255 + }, + { + "entropy": 8.987220764160156, + "epoch": 0.5658493177773384, + "mean_token_accuracy": 0.7350835204124451, + "num_tokens": 8968973.0, + "step": 5723, + "train/ce_loss": 0.6659753918647766 + }, + { + "epoch": 0.5658493177773384, + "step": 5723, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5658493177773384, + "step": 5723, + "train/total_loss": 0.12128504365682602 + }, + { + "entropy": 8.96373176574707, + "epoch": 0.5659481906268539, + "mean_token_accuracy": 0.7341317534446716, + "num_tokens": 8974256.0, + "step": 5724, + "train/ce_loss": 0.8970287442207336 + }, + { + "epoch": 0.5659481906268539, + "step": 5724, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5659481906268539, + "step": 5724, + "train/total_loss": 0.16001537442207336 + }, + { + "entropy": 8.677887916564941, + "epoch": 0.5660470634763693, + "mean_token_accuracy": 0.7993630766868591, + "num_tokens": 8979680.0, + "step": 5725, + "train/ce_loss": 0.8367934226989746 + }, + { + "epoch": 0.5660470634763693, + "step": 5725, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5660470634763693, + "step": 5725, + "train/total_loss": 0.12664809823036194 + }, + { + "entropy": 9.57216739654541, + "epoch": 0.566145936325885, + "mean_token_accuracy": 0.7227926254272461, + "num_tokens": 8984589.0, + "step": 5726, + "train/ce_loss": 8.030754543142393e-06 + }, + { + "epoch": 0.566145936325885, + "step": 5726, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.566145936325885, + "step": 5726, + "train/total_loss": 0.07031330466270447 + }, + { + "entropy": 9.588593482971191, + "epoch": 0.5662448091754004, + "mean_token_accuracy": 0.7297297120094299, + "num_tokens": 8989477.0, + "step": 5727, + "train/ce_loss": 4.534702839009697e-06 + }, + { + "epoch": 0.5662448091754004, + "step": 5727, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5662448091754004, + "step": 5727, + "train/total_loss": 0.05468795448541641 + }, + { + "entropy": 8.820411682128906, + "epoch": 0.5663436820249159, + "mean_token_accuracy": 0.7184615135192871, + "num_tokens": 8994590.0, + "step": 5728, + "train/ce_loss": 1.2334914207458496 + }, + { + "epoch": 0.5663436820249159, + "step": 5728, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5663436820249159, + "step": 5728, + "train/total_loss": 0.1702241450548172 + }, + { + "entropy": 8.630937576293945, + "epoch": 0.5664425548744315, + "mean_token_accuracy": 0.785977840423584, + "num_tokens": 9000125.0, + "step": 5729, + "train/ce_loss": 0.677941083908081 + }, + { + "epoch": 0.5664425548744315, + "step": 5729, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.5664425548744315, + "step": 5729, + "train/total_loss": 0.07951285690069199 + }, + { + "entropy": 9.175809860229492, + "epoch": 0.566541427723947, + "mean_token_accuracy": 0.7271448373794556, + "num_tokens": 9005298.0, + "step": 5730, + "train/ce_loss": 1.6741763353347778 + }, + { + "epoch": 0.566541427723947, + "step": 5730, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.566541427723947, + "step": 5730, + "train/total_loss": 0.22991763055324554 + }, + { + "entropy": 8.94044017791748, + "epoch": 0.5666403005734625, + "mean_token_accuracy": 0.7077087759971619, + "num_tokens": 9010654.0, + "step": 5731, + "train/ce_loss": 0.7423639297485352 + }, + { + "epoch": 0.5666403005734625, + "step": 5731, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5666403005734625, + "step": 5731, + "train/total_loss": 0.10939264297485352 + }, + { + "entropy": 8.686186790466309, + "epoch": 0.5667391734229781, + "mean_token_accuracy": 0.737535297870636, + "num_tokens": 9016243.0, + "step": 5732, + "train/ce_loss": 0.42112496495246887 + }, + { + "epoch": 0.5667391734229781, + "step": 5732, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5667391734229781, + "step": 5732, + "train/total_loss": 0.06164374575018883 + }, + { + "entropy": 9.956960678100586, + "epoch": 0.5668380462724936, + "mean_token_accuracy": 0.6780487895011902, + "num_tokens": 9021067.0, + "step": 5733, + "train/ce_loss": 1.3502155979949748e-06 + }, + { + "epoch": 0.5668380462724936, + "step": 5733, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5668380462724936, + "step": 5733, + "train/total_loss": 0.050781384110450745 + }, + { + "entropy": 8.977663040161133, + "epoch": 0.566936919122009, + "mean_token_accuracy": 0.8058968186378479, + "num_tokens": 9026333.0, + "step": 5734, + "train/ce_loss": 0.6080503463745117 + }, + { + "epoch": 0.566936919122009, + "step": 5734, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.566936919122009, + "step": 5734, + "train/total_loss": 0.11939878761768341 + }, + { + "entropy": 9.010187149047852, + "epoch": 0.5670357919715246, + "mean_token_accuracy": 0.7435610294342041, + "num_tokens": 9031660.0, + "step": 5735, + "train/ce_loss": 0.48231545090675354 + }, + { + "epoch": 0.5670357919715246, + "step": 5735, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5670357919715246, + "step": 5735, + "train/total_loss": 0.08338779211044312 + }, + { + "entropy": 9.17400074005127, + "epoch": 0.5671346648210401, + "mean_token_accuracy": 0.7989130616188049, + "num_tokens": 9036913.0, + "step": 5736, + "train/ce_loss": 0.842216968536377 + }, + { + "epoch": 0.5671346648210401, + "step": 5736, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.5671346648210401, + "step": 5736, + "train/total_loss": 0.18969044089317322 + }, + { + "entropy": 8.912348747253418, + "epoch": 0.5672335376705556, + "mean_token_accuracy": 0.7251700758934021, + "num_tokens": 9042115.0, + "step": 5737, + "train/ce_loss": 0.7594404220581055 + }, + { + "epoch": 0.5672335376705556, + "step": 5737, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.5672335376705556, + "step": 5737, + "train/total_loss": 0.17750653624534607 + }, + { + "entropy": 8.93150520324707, + "epoch": 0.5673324105200712, + "mean_token_accuracy": 0.7063106894493103, + "num_tokens": 9047416.0, + "step": 5738, + "train/ce_loss": 1.0738160610198975 + }, + { + "epoch": 0.5673324105200712, + "step": 5738, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.5673324105200712, + "step": 5738, + "train/total_loss": 0.22066286206245422 + }, + { + "entropy": 9.037921905517578, + "epoch": 0.5674312833695867, + "mean_token_accuracy": 0.714677631855011, + "num_tokens": 9052630.0, + "step": 5739, + "train/ce_loss": 0.9769010543823242 + }, + { + "epoch": 0.5674312833695867, + "step": 5739, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5674312833695867, + "step": 5739, + "train/total_loss": 0.12894010543823242 + }, + { + "epoch": 0.5675301562191022, + "grad_norm": 0.79930579662323, + "learning_rate": 8.583543490085546e-06, + "loss": 0.1415, + "step": 5740 + }, + { + "entropy": 8.723766326904297, + "epoch": 0.5675301562191022, + "mean_token_accuracy": 0.6941308975219727, + "num_tokens": 9057993.0, + "step": 5740, + "train/ce_loss": 1.2526596784591675 + }, + { + "epoch": 0.5675301562191022, + "step": 5740, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.5675301562191022, + "step": 5740, + "train/total_loss": 0.238547220826149 + }, + { + "entropy": 9.474671363830566, + "epoch": 0.5676290290686178, + "mean_token_accuracy": 0.7772194147109985, + "num_tokens": 9063069.0, + "step": 5741, + "train/ce_loss": 1.3601796808870859e-06 + }, + { + "epoch": 0.5676290290686178, + "step": 5741, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.5676290290686178, + "step": 5741, + "train/total_loss": 0.08203138411045074 + }, + { + "entropy": 8.784934043884277, + "epoch": 0.5677279019181333, + "mean_token_accuracy": 0.8124330043792725, + "num_tokens": 9068503.0, + "step": 5742, + "train/ce_loss": 0.6312729716300964 + }, + { + "epoch": 0.5677279019181333, + "step": 5742, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5677279019181333, + "step": 5742, + "train/total_loss": 0.110002301633358 + }, + { + "entropy": 9.346965789794922, + "epoch": 0.5678267747676488, + "mean_token_accuracy": 0.792151153087616, + "num_tokens": 9073622.0, + "step": 5743, + "train/ce_loss": 4.456435931388114e-07 + }, + { + "epoch": 0.5678267747676488, + "step": 5743, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.5678267747676488, + "step": 5743, + "train/total_loss": 0.02734379470348358 + }, + { + "entropy": 8.720890045166016, + "epoch": 0.5679256476171644, + "mean_token_accuracy": 0.7175324559211731, + "num_tokens": 9079060.0, + "step": 5744, + "train/ce_loss": 0.8249804377555847 + }, + { + "epoch": 0.5679256476171644, + "step": 5744, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5679256476171644, + "step": 5744, + "train/total_loss": 0.13718554377555847 + }, + { + "entropy": 9.119203567504883, + "epoch": 0.5680245204666798, + "mean_token_accuracy": 0.7270668148994446, + "num_tokens": 9084379.0, + "step": 5745, + "train/ce_loss": 0.8372083902359009 + }, + { + "epoch": 0.5680245204666798, + "step": 5745, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5680245204666798, + "step": 5745, + "train/total_loss": 0.1384083330631256 + }, + { + "entropy": 8.878551483154297, + "epoch": 0.5681233933161953, + "mean_token_accuracy": 0.7617411017417908, + "num_tokens": 9089678.0, + "step": 5746, + "train/ce_loss": 0.8750964403152466 + }, + { + "epoch": 0.5681233933161953, + "step": 5746, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5681233933161953, + "step": 5746, + "train/total_loss": 0.1539158970117569 + }, + { + "entropy": 8.994913101196289, + "epoch": 0.5682222661657109, + "mean_token_accuracy": 0.7730496525764465, + "num_tokens": 9095020.0, + "step": 5747, + "train/ce_loss": 1.2646600008010864 + }, + { + "epoch": 0.5682222661657109, + "step": 5747, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5682222661657109, + "step": 5747, + "train/total_loss": 0.19287225604057312 + }, + { + "entropy": 9.466012954711914, + "epoch": 0.5683211390152264, + "mean_token_accuracy": 0.7690762877464294, + "num_tokens": 9099930.0, + "step": 5748, + "train/ce_loss": 8.090804044513789e-07 + }, + { + "epoch": 0.5683211390152264, + "step": 5748, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5683211390152264, + "step": 5748, + "train/total_loss": 0.023437580093741417 + }, + { + "entropy": 9.00436782836914, + "epoch": 0.5684200118647419, + "mean_token_accuracy": 0.7311435341835022, + "num_tokens": 9105172.0, + "step": 5749, + "train/ce_loss": 0.501306414604187 + }, + { + "epoch": 0.5684200118647419, + "step": 5749, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5684200118647419, + "step": 5749, + "train/total_loss": 0.10481814295053482 + }, + { + "entropy": 8.950907707214355, + "epoch": 0.5685188847142575, + "mean_token_accuracy": 0.7416020631790161, + "num_tokens": 9110386.0, + "step": 5750, + "train/ce_loss": 0.7944943308830261 + }, + { + "epoch": 0.5685188847142575, + "step": 5750, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5685188847142575, + "step": 5750, + "train/total_loss": 0.13023069500923157 + }, + { + "entropy": 9.473054885864258, + "epoch": 0.568617757563773, + "mean_token_accuracy": 0.6756272315979004, + "num_tokens": 9115382.0, + "step": 5751, + "train/ce_loss": 1.7320858205494005e-06 + }, + { + "epoch": 0.568617757563773, + "step": 5751, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.568617757563773, + "step": 5751, + "train/total_loss": 0.04687517136335373 + }, + { + "entropy": 8.896703720092773, + "epoch": 0.5687166304132885, + "mean_token_accuracy": 0.7521058917045593, + "num_tokens": 9120872.0, + "step": 5752, + "train/ce_loss": 0.5250265002250671 + }, + { + "epoch": 0.5687166304132885, + "step": 5752, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5687166304132885, + "step": 5752, + "train/total_loss": 0.09156514704227448 + }, + { + "entropy": 8.722356796264648, + "epoch": 0.5688155032628041, + "mean_token_accuracy": 0.7077244520187378, + "num_tokens": 9126317.0, + "step": 5753, + "train/ce_loss": 0.8879641890525818 + }, + { + "epoch": 0.5688155032628041, + "step": 5753, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.5688155032628041, + "step": 5753, + "train/total_loss": 0.19035892188549042 + }, + { + "entropy": 9.647493362426758, + "epoch": 0.5689143761123195, + "mean_token_accuracy": 0.6870415806770325, + "num_tokens": 9131139.0, + "step": 5754, + "train/ce_loss": 1.3407045571511844e-06 + }, + { + "epoch": 0.5689143761123195, + "step": 5754, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5689143761123195, + "step": 5754, + "train/total_loss": 0.054687634110450745 + }, + { + "entropy": 9.187198638916016, + "epoch": 0.569013248961835, + "mean_token_accuracy": 0.7637194991111755, + "num_tokens": 9136225.0, + "step": 5755, + "train/ce_loss": 0.8071531057357788 + }, + { + "epoch": 0.569013248961835, + "step": 5755, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.569013248961835, + "step": 5755, + "train/total_loss": 0.11977781355381012 + }, + { + "entropy": 8.6917724609375, + "epoch": 0.5691121218113506, + "mean_token_accuracy": 0.7009063363075256, + "num_tokens": 9141759.0, + "step": 5756, + "train/ce_loss": 0.8537380695343018 + }, + { + "epoch": 0.5691121218113506, + "step": 5756, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5691121218113506, + "step": 5756, + "train/total_loss": 0.13224881887435913 + }, + { + "entropy": 8.852621078491211, + "epoch": 0.5692109946608661, + "mean_token_accuracy": 0.6953316926956177, + "num_tokens": 9147044.0, + "step": 5757, + "train/ce_loss": 1.4994142055511475 + }, + { + "epoch": 0.5692109946608661, + "step": 5757, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5692109946608661, + "step": 5757, + "train/total_loss": 0.18900392949581146 + }, + { + "entropy": 9.199543952941895, + "epoch": 0.5693098675103816, + "mean_token_accuracy": 0.7749999761581421, + "num_tokens": 9152071.0, + "step": 5758, + "train/ce_loss": 0.7222161889076233 + }, + { + "epoch": 0.5693098675103816, + "step": 5758, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.5693098675103816, + "step": 5758, + "train/total_loss": 0.09956537187099457 + }, + { + "entropy": 8.63757038116455, + "epoch": 0.5694087403598972, + "mean_token_accuracy": 0.7416666746139526, + "num_tokens": 9157491.0, + "step": 5759, + "train/ce_loss": 0.6470235586166382 + }, + { + "epoch": 0.5694087403598972, + "step": 5759, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5694087403598972, + "step": 5759, + "train/total_loss": 0.1076711043715477 + }, + { + "epoch": 0.5695076132094127, + "grad_norm": 0.621943473815918, + "learning_rate": 8.578598625327598e-06, + "loss": 0.1418, + "step": 5760 + }, + { + "entropy": 9.794168472290039, + "epoch": 0.5695076132094127, + "mean_token_accuracy": 0.7098445892333984, + "num_tokens": 9162299.0, + "step": 5760, + "train/ce_loss": 1.574930191040039 + }, + { + "epoch": 0.5695076132094127, + "step": 5760, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.5695076132094127, + "step": 5760, + "train/total_loss": 0.24343052506446838 + }, + { + "entropy": 9.196226119995117, + "epoch": 0.5696064860589282, + "mean_token_accuracy": 0.7082152962684631, + "num_tokens": 9167435.0, + "step": 5761, + "train/ce_loss": 1.043987512588501 + }, + { + "epoch": 0.5696064860589282, + "step": 5761, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5696064860589282, + "step": 5761, + "train/total_loss": 0.12392999976873398 + }, + { + "entropy": 9.358050346374512, + "epoch": 0.5697053589084438, + "mean_token_accuracy": 0.8081841468811035, + "num_tokens": 9172263.0, + "step": 5762, + "train/ce_loss": 2.002396968237008e-06 + }, + { + "epoch": 0.5697053589084438, + "step": 5762, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5697053589084438, + "step": 5762, + "train/total_loss": 0.03515645116567612 + }, + { + "entropy": 9.067380905151367, + "epoch": 0.5698042317579592, + "mean_token_accuracy": 0.7419354915618896, + "num_tokens": 9177485.0, + "step": 5763, + "train/ce_loss": 1.0077073574066162 + }, + { + "epoch": 0.5698042317579592, + "step": 5763, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5698042317579592, + "step": 5763, + "train/total_loss": 0.1632707417011261 + }, + { + "entropy": 9.170974731445312, + "epoch": 0.5699031046074748, + "mean_token_accuracy": 0.7475035786628723, + "num_tokens": 9182640.0, + "step": 5764, + "train/ce_loss": 0.7894394993782043 + }, + { + "epoch": 0.5699031046074748, + "step": 5764, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5699031046074748, + "step": 5764, + "train/total_loss": 0.14144395291805267 + }, + { + "entropy": 8.615402221679688, + "epoch": 0.5700019774569903, + "mean_token_accuracy": 0.7594339847564697, + "num_tokens": 9187950.0, + "step": 5765, + "train/ce_loss": 1.069886326789856 + }, + { + "epoch": 0.5700019774569903, + "step": 5765, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.5700019774569903, + "step": 5765, + "train/total_loss": 0.18511363863945007 + }, + { + "entropy": 9.6326904296875, + "epoch": 0.5701008503065058, + "mean_token_accuracy": 0.7012302279472351, + "num_tokens": 9192954.0, + "step": 5766, + "train/ce_loss": 1.0751601848824066e-06 + }, + { + "epoch": 0.5701008503065058, + "step": 5766, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5701008503065058, + "step": 5766, + "train/total_loss": 0.023437608033418655 + }, + { + "entropy": 8.751548767089844, + "epoch": 0.5701997231560214, + "mean_token_accuracy": 0.770531415939331, + "num_tokens": 9198273.0, + "step": 5767, + "train/ce_loss": 0.7971166372299194 + }, + { + "epoch": 0.5701997231560214, + "step": 5767, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5701997231560214, + "step": 5767, + "train/total_loss": 0.1187741681933403 + }, + { + "entropy": 9.144685745239258, + "epoch": 0.5702985960055369, + "mean_token_accuracy": 0.7083333134651184, + "num_tokens": 9203449.0, + "step": 5768, + "train/ce_loss": 1.262489914894104 + }, + { + "epoch": 0.5702985960055369, + "step": 5768, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.5702985960055369, + "step": 5768, + "train/total_loss": 0.20437400043010712 + }, + { + "entropy": 9.466706275939941, + "epoch": 0.5703974688550524, + "mean_token_accuracy": 0.6808118224143982, + "num_tokens": 9208382.0, + "step": 5769, + "train/ce_loss": 1.224777340888977 + }, + { + "epoch": 0.5703974688550524, + "step": 5769, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5703974688550524, + "step": 5769, + "train/total_loss": 0.18107149004936218 + }, + { + "entropy": 8.870538711547852, + "epoch": 0.570496341704568, + "mean_token_accuracy": 0.7292954325675964, + "num_tokens": 9213684.0, + "step": 5770, + "train/ce_loss": 0.5599069595336914 + }, + { + "epoch": 0.570496341704568, + "step": 5770, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.570496341704568, + "step": 5770, + "train/total_loss": 0.06770944595336914 + }, + { + "entropy": 9.689857482910156, + "epoch": 0.5705952145540835, + "mean_token_accuracy": 0.8302752375602722, + "num_tokens": 9218571.0, + "step": 5771, + "train/ce_loss": 1.0707685947418213 + }, + { + "epoch": 0.5705952145540835, + "step": 5771, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5705952145540835, + "step": 5771, + "train/total_loss": 0.16957685351371765 + }, + { + "entropy": 8.794843673706055, + "epoch": 0.570694087403599, + "mean_token_accuracy": 0.7710084319114685, + "num_tokens": 9223957.0, + "step": 5772, + "train/ce_loss": 1.0351547002792358 + }, + { + "epoch": 0.570694087403599, + "step": 5772, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.570694087403599, + "step": 5772, + "train/total_loss": 0.16601547598838806 + }, + { + "entropy": 9.70067024230957, + "epoch": 0.5707929602531145, + "mean_token_accuracy": 0.7928388714790344, + "num_tokens": 9228788.0, + "step": 5773, + "train/ce_loss": 1.7402708530426025 + }, + { + "epoch": 0.5707929602531145, + "step": 5773, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.5707929602531145, + "step": 5773, + "train/total_loss": 0.24824583530426025 + }, + { + "entropy": 9.039346694946289, + "epoch": 0.57089183310263, + "mean_token_accuracy": 0.7471697926521301, + "num_tokens": 9234018.0, + "step": 5774, + "train/ce_loss": 0.7928961515426636 + }, + { + "epoch": 0.57089183310263, + "step": 5774, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.57089183310263, + "step": 5774, + "train/total_loss": 0.09882086515426636 + }, + { + "entropy": 9.668989181518555, + "epoch": 0.5709907059521455, + "mean_token_accuracy": 0.7594339847564697, + "num_tokens": 9238894.0, + "step": 5775, + "train/ce_loss": 3.404971039344673e-06 + }, + { + "epoch": 0.5709907059521455, + "step": 5775, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5709907059521455, + "step": 5775, + "train/total_loss": 0.04687533900141716 + }, + { + "entropy": 9.012740135192871, + "epoch": 0.5710895788016611, + "mean_token_accuracy": 0.6688227653503418, + "num_tokens": 9244148.0, + "step": 5776, + "train/ce_loss": 1.3640942573547363 + }, + { + "epoch": 0.5710895788016611, + "step": 5776, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5710895788016611, + "step": 5776, + "train/total_loss": 0.2028156816959381 + }, + { + "entropy": 9.636526107788086, + "epoch": 0.5711884516511766, + "mean_token_accuracy": 0.7274678349494934, + "num_tokens": 9249080.0, + "step": 5777, + "train/ce_loss": 3.842779278784292e-06 + }, + { + "epoch": 0.5711884516511766, + "step": 5777, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5711884516511766, + "step": 5777, + "train/total_loss": 0.05468788370490074 + }, + { + "entropy": 9.943527221679688, + "epoch": 0.5712873245006921, + "mean_token_accuracy": 0.7570093274116516, + "num_tokens": 9253821.0, + "step": 5778, + "train/ce_loss": 1.4900537729263306 + }, + { + "epoch": 0.5712873245006921, + "step": 5778, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5712873245006921, + "step": 5778, + "train/total_loss": 0.18416163325309753 + }, + { + "entropy": 9.047167778015137, + "epoch": 0.5713861973502077, + "mean_token_accuracy": 0.759096622467041, + "num_tokens": 9259251.0, + "step": 5779, + "train/ce_loss": 0.7313524484634399 + }, + { + "epoch": 0.5713861973502077, + "step": 5779, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5713861973502077, + "step": 5779, + "train/total_loss": 0.09657274931669235 + }, + { + "epoch": 0.5714850701997232, + "grad_norm": 0.7609543800354004, + "learning_rate": 8.573653760569649e-06, + "loss": 0.1392, + "step": 5780 + }, + { + "entropy": 9.630558013916016, + "epoch": 0.5714850701997232, + "mean_token_accuracy": 0.7282850742340088, + "num_tokens": 9264079.0, + "step": 5780, + "train/ce_loss": 9.653415418142686e-07 + }, + { + "epoch": 0.5714850701997232, + "step": 5780, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5714850701997232, + "step": 5780, + "train/total_loss": 0.02343759685754776 + }, + { + "entropy": 9.017946243286133, + "epoch": 0.5715839430492387, + "mean_token_accuracy": 0.7664740085601807, + "num_tokens": 9269406.0, + "step": 5781, + "train/ce_loss": 0.6722099781036377 + }, + { + "epoch": 0.5715839430492387, + "step": 5781, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5715839430492387, + "step": 5781, + "train/total_loss": 0.08675225079059601 + }, + { + "entropy": 9.25473403930664, + "epoch": 0.5716828158987542, + "mean_token_accuracy": 0.8168557286262512, + "num_tokens": 9274471.0, + "step": 5782, + "train/ce_loss": 0.4532519280910492 + }, + { + "epoch": 0.5716828158987542, + "step": 5782, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.5716828158987542, + "step": 5782, + "train/total_loss": 0.06095019355416298 + }, + { + "entropy": 9.218679428100586, + "epoch": 0.5717816887482697, + "mean_token_accuracy": 0.7699999809265137, + "num_tokens": 9279683.0, + "step": 5783, + "train/ce_loss": 0.7935691475868225 + }, + { + "epoch": 0.5717816887482697, + "step": 5783, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5717816887482697, + "step": 5783, + "train/total_loss": 0.11841941624879837 + }, + { + "entropy": 9.08060073852539, + "epoch": 0.5718805615977852, + "mean_token_accuracy": 0.7420634627342224, + "num_tokens": 9284892.0, + "step": 5784, + "train/ce_loss": 0.7454713582992554 + }, + { + "epoch": 0.5718805615977852, + "step": 5784, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.5718805615977852, + "step": 5784, + "train/total_loss": 0.17220339179039001 + }, + { + "entropy": 8.969976425170898, + "epoch": 0.5719794344473008, + "mean_token_accuracy": 0.7424058318138123, + "num_tokens": 9290175.0, + "step": 5785, + "train/ce_loss": 1.1526658535003662 + }, + { + "epoch": 0.5719794344473008, + "step": 5785, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5719794344473008, + "step": 5785, + "train/total_loss": 0.1465165913105011 + }, + { + "entropy": 8.666062355041504, + "epoch": 0.5720783072968163, + "mean_token_accuracy": 0.7295373678207397, + "num_tokens": 9295493.0, + "step": 5786, + "train/ce_loss": 1.085310935974121 + }, + { + "epoch": 0.5720783072968163, + "step": 5786, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5720783072968163, + "step": 5786, + "train/total_loss": 0.16712483763694763 + }, + { + "entropy": 9.163665771484375, + "epoch": 0.5721771801463318, + "mean_token_accuracy": 0.7680140733718872, + "num_tokens": 9300502.0, + "step": 5787, + "train/ce_loss": 0.8502943515777588 + }, + { + "epoch": 0.5721771801463318, + "step": 5787, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.5721771801463318, + "step": 5787, + "train/total_loss": 0.15924818813800812 + }, + { + "entropy": 9.09556770324707, + "epoch": 0.5722760529958474, + "mean_token_accuracy": 0.8070175647735596, + "num_tokens": 9305699.0, + "step": 5788, + "train/ce_loss": 0.9993642568588257 + }, + { + "epoch": 0.5722760529958474, + "step": 5788, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5722760529958474, + "step": 5788, + "train/total_loss": 0.16243642568588257 + }, + { + "entropy": 8.845906257629395, + "epoch": 0.5723749258453629, + "mean_token_accuracy": 0.7489451766014099, + "num_tokens": 9311136.0, + "step": 5789, + "train/ce_loss": 0.7303217053413391 + }, + { + "epoch": 0.5723749258453629, + "step": 5789, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.5723749258453629, + "step": 5789, + "train/total_loss": 0.1667821705341339 + }, + { + "entropy": 9.882627487182617, + "epoch": 0.5724737986948784, + "mean_token_accuracy": 0.8067227005958557, + "num_tokens": 9315885.0, + "step": 5790, + "train/ce_loss": 1.4576836824417114 + }, + { + "epoch": 0.5724737986948784, + "step": 5790, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5724737986948784, + "step": 5790, + "train/total_loss": 0.20436212420463562 + }, + { + "entropy": 9.175952911376953, + "epoch": 0.572572671544394, + "mean_token_accuracy": 0.800582230091095, + "num_tokens": 9321042.0, + "step": 5791, + "train/ce_loss": 0.7040167450904846 + }, + { + "epoch": 0.572572671544394, + "step": 5791, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.572572671544394, + "step": 5791, + "train/total_loss": 0.10555792599916458 + }, + { + "entropy": 8.598827362060547, + "epoch": 0.5726715443939094, + "mean_token_accuracy": 0.8154696226119995, + "num_tokens": 9326428.0, + "step": 5792, + "train/ce_loss": 0.6725344061851501 + }, + { + "epoch": 0.5726715443939094, + "step": 5792, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.5726715443939094, + "step": 5792, + "train/total_loss": 0.14147219061851501 + }, + { + "entropy": 9.333423614501953, + "epoch": 0.5727704172434249, + "mean_token_accuracy": 0.7334630489349365, + "num_tokens": 9331390.0, + "step": 5793, + "train/ce_loss": 1.299883484840393 + }, + { + "epoch": 0.5727704172434249, + "step": 5793, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5727704172434249, + "step": 5793, + "train/total_loss": 0.19639460742473602 + }, + { + "entropy": 8.871832847595215, + "epoch": 0.5728692900929405, + "mean_token_accuracy": 0.7634854912757874, + "num_tokens": 9336780.0, + "step": 5794, + "train/ce_loss": 1.0562324523925781 + }, + { + "epoch": 0.5728692900929405, + "step": 5794, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.5728692900929405, + "step": 5794, + "train/total_loss": 0.2110919952392578 + }, + { + "entropy": 8.454917907714844, + "epoch": 0.572968162942456, + "mean_token_accuracy": 0.7221584320068359, + "num_tokens": 9342120.0, + "step": 5795, + "train/ce_loss": 0.7007420659065247 + }, + { + "epoch": 0.572968162942456, + "step": 5795, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.572968162942456, + "step": 5795, + "train/total_loss": 0.128667950630188 + }, + { + "entropy": 8.864130020141602, + "epoch": 0.5730670357919715, + "mean_token_accuracy": 0.7142857313156128, + "num_tokens": 9347303.0, + "step": 5796, + "train/ce_loss": 1.1294949054718018 + }, + { + "epoch": 0.5730670357919715, + "step": 5796, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5730670357919715, + "step": 5796, + "train/total_loss": 0.17544949054718018 + }, + { + "entropy": 8.857627868652344, + "epoch": 0.5731659086414871, + "mean_token_accuracy": 0.7629629373550415, + "num_tokens": 9352662.0, + "step": 5797, + "train/ce_loss": 0.7051044702529907 + }, + { + "epoch": 0.5731659086414871, + "step": 5797, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.5731659086414871, + "step": 5797, + "train/total_loss": 0.14863544702529907 + }, + { + "entropy": 9.339004516601562, + "epoch": 0.5732647814910026, + "mean_token_accuracy": 0.7263843417167664, + "num_tokens": 9357745.0, + "step": 5798, + "train/ce_loss": 1.1820474863052368 + }, + { + "epoch": 0.5732647814910026, + "step": 5798, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5732647814910026, + "step": 5798, + "train/total_loss": 0.1494547426700592 + }, + { + "entropy": 9.302085876464844, + "epoch": 0.5733636543405181, + "mean_token_accuracy": 0.723809540271759, + "num_tokens": 9362844.0, + "step": 5799, + "train/ce_loss": 0.4738314151763916 + }, + { + "epoch": 0.5733636543405181, + "step": 5799, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5733636543405181, + "step": 5799, + "train/total_loss": 0.1020706444978714 + }, + { + "epoch": 0.5734625271900337, + "grad_norm": 0.695505678653717, + "learning_rate": 8.568708895811701e-06, + "loss": 0.1252, + "step": 5800 + }, + { + "entropy": 9.141944885253906, + "epoch": 0.5734625271900337, + "mean_token_accuracy": 0.7878788113594055, + "num_tokens": 9367985.0, + "step": 5800, + "train/ce_loss": 0.6440024375915527 + }, + { + "epoch": 0.5734625271900337, + "step": 5800, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.5734625271900337, + "step": 5800, + "train/total_loss": 0.14643150568008423 + }, + { + "entropy": 8.993462562561035, + "epoch": 0.5735614000395491, + "mean_token_accuracy": 0.7708830833435059, + "num_tokens": 9373301.0, + "step": 5801, + "train/ce_loss": 0.6326388716697693 + }, + { + "epoch": 0.5735614000395491, + "step": 5801, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.5735614000395491, + "step": 5801, + "train/total_loss": 0.1452951431274414 + }, + { + "entropy": 9.004293441772461, + "epoch": 0.5736602728890646, + "mean_token_accuracy": 0.6816431283950806, + "num_tokens": 9378565.0, + "step": 5802, + "train/ce_loss": 0.40664011240005493 + }, + { + "epoch": 0.5736602728890646, + "step": 5802, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5736602728890646, + "step": 5802, + "train/total_loss": 0.08753901720046997 + }, + { + "entropy": 9.30048942565918, + "epoch": 0.5737591457385802, + "mean_token_accuracy": 0.7446103096008301, + "num_tokens": 9383657.0, + "step": 5803, + "train/ce_loss": 2.19503613152483e-06 + }, + { + "epoch": 0.5737591457385802, + "step": 5803, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5737591457385802, + "step": 5803, + "train/total_loss": 0.03906271979212761 + }, + { + "entropy": 8.672707557678223, + "epoch": 0.5738580185880957, + "mean_token_accuracy": 0.7139852643013, + "num_tokens": 9389126.0, + "step": 5804, + "train/ce_loss": 0.5802134275436401 + }, + { + "epoch": 0.5738580185880957, + "step": 5804, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5738580185880957, + "step": 5804, + "train/total_loss": 0.10880259424448013 + }, + { + "entropy": 8.985300064086914, + "epoch": 0.5739568914376112, + "mean_token_accuracy": 0.7011904716491699, + "num_tokens": 9394402.0, + "step": 5805, + "train/ce_loss": 0.8810886740684509 + }, + { + "epoch": 0.5739568914376112, + "step": 5805, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.5739568914376112, + "step": 5805, + "train/total_loss": 0.1740463674068451 + }, + { + "entropy": 8.950839042663574, + "epoch": 0.5740557642871268, + "mean_token_accuracy": 0.7651098966598511, + "num_tokens": 9399648.0, + "step": 5806, + "train/ce_loss": 0.8348586559295654 + }, + { + "epoch": 0.5740557642871268, + "step": 5806, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.5740557642871268, + "step": 5806, + "train/total_loss": 0.16161087155342102 + }, + { + "entropy": 9.569741249084473, + "epoch": 0.5741546371366423, + "mean_token_accuracy": 0.7185929417610168, + "num_tokens": 9404471.0, + "step": 5807, + "train/ce_loss": 1.4021042585372925 + }, + { + "epoch": 0.5741546371366423, + "step": 5807, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5741546371366423, + "step": 5807, + "train/total_loss": 0.20271043479442596 + }, + { + "entropy": 8.872579574584961, + "epoch": 0.5742535099861578, + "mean_token_accuracy": 0.7598608136177063, + "num_tokens": 9409830.0, + "step": 5808, + "train/ce_loss": 1.0803982019424438 + }, + { + "epoch": 0.5742535099861578, + "step": 5808, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.5742535099861578, + "step": 5808, + "train/total_loss": 0.20960232615470886 + }, + { + "entropy": 9.010385513305664, + "epoch": 0.5743523828356734, + "mean_token_accuracy": 0.7640750408172607, + "num_tokens": 9415072.0, + "step": 5809, + "train/ce_loss": 0.39146870374679565 + }, + { + "epoch": 0.5743523828356734, + "step": 5809, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5743523828356734, + "step": 5809, + "train/total_loss": 0.08992812037467957 + }, + { + "entropy": 9.353796005249023, + "epoch": 0.5744512556851888, + "mean_token_accuracy": 0.7794612646102905, + "num_tokens": 9420094.0, + "step": 5810, + "train/ce_loss": 0.9817899465560913 + }, + { + "epoch": 0.5744512556851888, + "step": 5810, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5744512556851888, + "step": 5810, + "train/total_loss": 0.14896024763584137 + }, + { + "entropy": 8.967628479003906, + "epoch": 0.5745501285347043, + "mean_token_accuracy": 0.7174825072288513, + "num_tokens": 9425254.0, + "step": 5811, + "train/ce_loss": 1.0944974422454834 + }, + { + "epoch": 0.5745501285347043, + "step": 5811, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5745501285347043, + "step": 5811, + "train/total_loss": 0.17194974422454834 + }, + { + "entropy": 9.016559600830078, + "epoch": 0.5746490013842199, + "mean_token_accuracy": 0.808041512966156, + "num_tokens": 9430450.0, + "step": 5812, + "train/ce_loss": 1.0918627977371216 + }, + { + "epoch": 0.5746490013842199, + "step": 5812, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.5746490013842199, + "step": 5812, + "train/total_loss": 0.1873112916946411 + }, + { + "entropy": 8.808783531188965, + "epoch": 0.5747478742337354, + "mean_token_accuracy": 0.7251908183097839, + "num_tokens": 9436021.0, + "step": 5813, + "train/ce_loss": 0.7345530986785889 + }, + { + "epoch": 0.5747478742337354, + "step": 5813, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.5747478742337354, + "step": 5813, + "train/total_loss": 0.1867365539073944 + }, + { + "entropy": 9.164575576782227, + "epoch": 0.5748467470832509, + "mean_token_accuracy": 0.7396121621131897, + "num_tokens": 9441244.0, + "step": 5814, + "train/ce_loss": 1.2660578489303589 + }, + { + "epoch": 0.5748467470832509, + "step": 5814, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5748467470832509, + "step": 5814, + "train/total_loss": 0.1930120438337326 + }, + { + "entropy": 8.717116355895996, + "epoch": 0.5749456199327665, + "mean_token_accuracy": 0.7172839641571045, + "num_tokens": 9446579.0, + "step": 5815, + "train/ce_loss": 0.9173863530158997 + }, + { + "epoch": 0.5749456199327665, + "step": 5815, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.5749456199327665, + "step": 5815, + "train/total_loss": 0.20892614126205444 + }, + { + "entropy": 8.604945182800293, + "epoch": 0.575044492782282, + "mean_token_accuracy": 0.7659157514572144, + "num_tokens": 9452069.0, + "step": 5816, + "train/ce_loss": 1.1520651578903198 + }, + { + "epoch": 0.575044492782282, + "step": 5816, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.575044492782282, + "step": 5816, + "train/total_loss": 0.1816127598285675 + }, + { + "entropy": 8.624781608581543, + "epoch": 0.5751433656317975, + "mean_token_accuracy": 0.705450713634491, + "num_tokens": 9457675.0, + "step": 5817, + "train/ce_loss": 1.019660234451294 + }, + { + "epoch": 0.5751433656317975, + "step": 5817, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5751433656317975, + "step": 5817, + "train/total_loss": 0.1722785234451294 + }, + { + "entropy": 8.754276275634766, + "epoch": 0.5752422384813131, + "mean_token_accuracy": 0.7320442199707031, + "num_tokens": 9462870.0, + "step": 5818, + "train/ce_loss": 0.7815767526626587 + }, + { + "epoch": 0.5752422384813131, + "step": 5818, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5752422384813131, + "step": 5818, + "train/total_loss": 0.1367514282464981 + }, + { + "entropy": 9.46786117553711, + "epoch": 0.5753411113308285, + "mean_token_accuracy": 0.7357512712478638, + "num_tokens": 9467855.0, + "step": 5819, + "train/ce_loss": 0.8608783483505249 + }, + { + "epoch": 0.5753411113308285, + "step": 5819, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5753411113308285, + "step": 5819, + "train/total_loss": 0.12905658781528473 + }, + { + "epoch": 0.575439984180344, + "grad_norm": 0.7835647463798523, + "learning_rate": 8.563764031053752e-06, + "loss": 0.1469, + "step": 5820 + }, + { + "entropy": 8.44367790222168, + "epoch": 0.575439984180344, + "mean_token_accuracy": 0.6938547492027283, + "num_tokens": 9473237.0, + "step": 5820, + "train/ce_loss": 0.3752153515815735 + }, + { + "epoch": 0.575439984180344, + "step": 5820, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.575439984180344, + "step": 5820, + "train/total_loss": 0.06877154111862183 + }, + { + "entropy": 9.075521469116211, + "epoch": 0.5755388570298596, + "mean_token_accuracy": 0.6924198269844055, + "num_tokens": 9478365.0, + "step": 5821, + "train/ce_loss": 1.483864426612854 + }, + { + "epoch": 0.5755388570298596, + "step": 5821, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.5755388570298596, + "step": 5821, + "train/total_loss": 0.23823019862174988 + }, + { + "entropy": 8.797341346740723, + "epoch": 0.5756377298793751, + "mean_token_accuracy": 0.7902023196220398, + "num_tokens": 9483787.0, + "step": 5822, + "train/ce_loss": 0.4620964527130127 + }, + { + "epoch": 0.5756377298793751, + "step": 5822, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5756377298793751, + "step": 5822, + "train/total_loss": 0.08527214825153351 + }, + { + "entropy": 9.989152908325195, + "epoch": 0.5757366027288906, + "mean_token_accuracy": 0.7836065292358398, + "num_tokens": 9488520.0, + "step": 5823, + "train/ce_loss": 0.9272093772888184 + }, + { + "epoch": 0.5757366027288906, + "step": 5823, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5757366027288906, + "step": 5823, + "train/total_loss": 0.13959594070911407 + }, + { + "entropy": 9.14828109741211, + "epoch": 0.5758354755784062, + "mean_token_accuracy": 0.7337367534637451, + "num_tokens": 9493639.0, + "step": 5824, + "train/ce_loss": 1.4593510627746582 + }, + { + "epoch": 0.5758354755784062, + "step": 5824, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.5758354755784062, + "step": 5824, + "train/total_loss": 0.22015385329723358 + }, + { + "entropy": 9.220829010009766, + "epoch": 0.5759343484279217, + "mean_token_accuracy": 0.7876505851745605, + "num_tokens": 9498773.0, + "step": 5825, + "train/ce_loss": 0.6129733920097351 + }, + { + "epoch": 0.5759343484279217, + "step": 5825, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5759343484279217, + "step": 5825, + "train/total_loss": 0.08082859218120575 + }, + { + "entropy": 9.097583770751953, + "epoch": 0.5760332212774372, + "mean_token_accuracy": 0.7509778141975403, + "num_tokens": 9504007.0, + "step": 5826, + "train/ce_loss": 0.567491888999939 + }, + { + "epoch": 0.5760332212774372, + "step": 5826, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5760332212774372, + "step": 5826, + "train/total_loss": 0.09971794486045837 + }, + { + "entropy": 9.62552261352539, + "epoch": 0.5761320941269528, + "mean_token_accuracy": 0.7119341492652893, + "num_tokens": 9508872.0, + "step": 5827, + "train/ce_loss": 1.5119009049158194e-06 + }, + { + "epoch": 0.5761320941269528, + "step": 5827, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5761320941269528, + "step": 5827, + "train/total_loss": 0.019531400874257088 + }, + { + "entropy": 9.689656257629395, + "epoch": 0.5762309669764683, + "mean_token_accuracy": 0.756157636642456, + "num_tokens": 9513696.0, + "step": 5828, + "train/ce_loss": 3.948211087845266e-06 + }, + { + "epoch": 0.5762309669764683, + "step": 5828, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.5762309669764683, + "step": 5828, + "train/total_loss": 0.08593789488077164 + }, + { + "entropy": 9.26791763305664, + "epoch": 0.5763298398259837, + "mean_token_accuracy": 0.6971608996391296, + "num_tokens": 9518737.0, + "step": 5829, + "train/ce_loss": 2.2869637632538797e-06 + }, + { + "epoch": 0.5763298398259837, + "step": 5829, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5763298398259837, + "step": 5829, + "train/total_loss": 0.0625002309679985 + }, + { + "entropy": 9.628270149230957, + "epoch": 0.5764287126754993, + "mean_token_accuracy": 0.7145833373069763, + "num_tokens": 9523842.0, + "step": 5830, + "train/ce_loss": 1.0487631559371948 + }, + { + "epoch": 0.5764287126754993, + "step": 5830, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5764287126754993, + "step": 5830, + "train/total_loss": 0.163470059633255 + }, + { + "entropy": 9.06503677368164, + "epoch": 0.5765275855250148, + "mean_token_accuracy": 0.7355769276618958, + "num_tokens": 9528929.0, + "step": 5831, + "train/ce_loss": 0.6332504749298096 + }, + { + "epoch": 0.5765275855250148, + "step": 5831, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5765275855250148, + "step": 5831, + "train/total_loss": 0.11020004749298096 + }, + { + "entropy": 8.997516632080078, + "epoch": 0.5766264583745303, + "mean_token_accuracy": 0.7780821919441223, + "num_tokens": 9534134.0, + "step": 5832, + "train/ce_loss": 0.7698352336883545 + }, + { + "epoch": 0.5766264583745303, + "step": 5832, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.5766264583745303, + "step": 5832, + "train/total_loss": 0.1629210263490677 + }, + { + "entropy": 8.555920600891113, + "epoch": 0.5767253312240459, + "mean_token_accuracy": 0.761800229549408, + "num_tokens": 9539559.0, + "step": 5833, + "train/ce_loss": 0.46046629548072815 + }, + { + "epoch": 0.5767253312240459, + "step": 5833, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5767253312240459, + "step": 5833, + "train/total_loss": 0.10073412954807281 + }, + { + "entropy": 8.977663040161133, + "epoch": 0.5768242040735614, + "mean_token_accuracy": 0.7265258431434631, + "num_tokens": 9544888.0, + "step": 5834, + "train/ce_loss": 0.5896692276000977 + }, + { + "epoch": 0.5768242040735614, + "step": 5834, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5768242040735614, + "step": 5834, + "train/total_loss": 0.09412316977977753 + }, + { + "entropy": 8.919035911560059, + "epoch": 0.5769230769230769, + "mean_token_accuracy": 0.7393548488616943, + "num_tokens": 9550121.0, + "step": 5835, + "train/ce_loss": 1.3863357305526733 + }, + { + "epoch": 0.5769230769230769, + "step": 5835, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.5769230769230769, + "step": 5835, + "train/total_loss": 0.2206648290157318 + }, + { + "entropy": 9.460243225097656, + "epoch": 0.5770219497725925, + "mean_token_accuracy": 0.6178571581840515, + "num_tokens": 9555092.0, + "step": 5836, + "train/ce_loss": 1.983798623085022 + }, + { + "epoch": 0.5770219497725925, + "step": 5836, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.5770219497725925, + "step": 5836, + "train/total_loss": 0.29994237422943115 + }, + { + "entropy": 8.946954727172852, + "epoch": 0.577120822622108, + "mean_token_accuracy": 0.7691197395324707, + "num_tokens": 9560222.0, + "step": 5837, + "train/ce_loss": 1.1302258968353271 + }, + { + "epoch": 0.577120822622108, + "step": 5837, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.577120822622108, + "step": 5837, + "train/total_loss": 0.1716163456439972 + }, + { + "entropy": 8.636548042297363, + "epoch": 0.5772196954716234, + "mean_token_accuracy": 0.7297979593276978, + "num_tokens": 9565480.0, + "step": 5838, + "train/ce_loss": 0.8710353374481201 + }, + { + "epoch": 0.5772196954716234, + "step": 5838, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5772196954716234, + "step": 5838, + "train/total_loss": 0.15741604566574097 + }, + { + "entropy": 8.944669723510742, + "epoch": 0.577318568321139, + "mean_token_accuracy": 0.7527624368667603, + "num_tokens": 9570631.0, + "step": 5839, + "train/ce_loss": 1.3659089803695679 + }, + { + "epoch": 0.577318568321139, + "step": 5839, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.577318568321139, + "step": 5839, + "train/total_loss": 0.2108096480369568 + }, + { + "epoch": 0.5774174411706545, + "grad_norm": 0.6676998734474182, + "learning_rate": 8.558819166295802e-06, + "loss": 0.1389, + "step": 5840 + }, + { + "entropy": 8.932061195373535, + "epoch": 0.5774174411706545, + "mean_token_accuracy": 0.7333333492279053, + "num_tokens": 9575973.0, + "step": 5840, + "train/ce_loss": 1.2954726219177246 + }, + { + "epoch": 0.5774174411706545, + "step": 5840, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5774174411706545, + "step": 5840, + "train/total_loss": 0.19204726815223694 + }, + { + "entropy": 8.650957107543945, + "epoch": 0.57751631402017, + "mean_token_accuracy": 0.7390829920768738, + "num_tokens": 9581377.0, + "step": 5841, + "train/ce_loss": 1.229686975479126 + }, + { + "epoch": 0.57751631402017, + "step": 5841, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.57751631402017, + "step": 5841, + "train/total_loss": 0.16203120350837708 + }, + { + "entropy": 9.259391784667969, + "epoch": 0.5776151868696856, + "mean_token_accuracy": 0.7354497313499451, + "num_tokens": 9586347.0, + "step": 5842, + "train/ce_loss": 1.0366559028625488 + }, + { + "epoch": 0.5776151868696856, + "step": 5842, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.5776151868696856, + "step": 5842, + "train/total_loss": 0.11929059028625488 + }, + { + "entropy": 9.26965618133545, + "epoch": 0.5777140597192011, + "mean_token_accuracy": 0.6484641432762146, + "num_tokens": 9591429.0, + "step": 5843, + "train/ce_loss": 1.5358511209487915 + }, + { + "epoch": 0.5777140597192011, + "step": 5843, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5777140597192011, + "step": 5843, + "train/total_loss": 0.22389762103557587 + }, + { + "entropy": 8.18938159942627, + "epoch": 0.5778129325687166, + "mean_token_accuracy": 0.7324301600456238, + "num_tokens": 9597107.0, + "step": 5844, + "train/ce_loss": 0.6811568737030029 + }, + { + "epoch": 0.5778129325687166, + "step": 5844, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5778129325687166, + "step": 5844, + "train/total_loss": 0.10717818886041641 + }, + { + "entropy": 9.126882553100586, + "epoch": 0.5779118054182322, + "mean_token_accuracy": 0.7203728556632996, + "num_tokens": 9602312.0, + "step": 5845, + "train/ce_loss": 0.791706919670105 + }, + { + "epoch": 0.5779118054182322, + "step": 5845, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5779118054182322, + "step": 5845, + "train/total_loss": 0.11042069643735886 + }, + { + "entropy": 8.713577270507812, + "epoch": 0.5780106782677477, + "mean_token_accuracy": 0.7227822542190552, + "num_tokens": 9607747.0, + "step": 5846, + "train/ce_loss": 0.7399176359176636 + }, + { + "epoch": 0.5780106782677477, + "step": 5846, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5780106782677477, + "step": 5846, + "train/total_loss": 0.11696051806211472 + }, + { + "entropy": 9.557413101196289, + "epoch": 0.5781095511172633, + "mean_token_accuracy": 0.7734940052032471, + "num_tokens": 9612553.0, + "step": 5847, + "train/ce_loss": 1.399521827697754 + }, + { + "epoch": 0.5781095511172633, + "step": 5847, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5781095511172633, + "step": 5847, + "train/total_loss": 0.1985459327697754 + }, + { + "entropy": 9.565277099609375, + "epoch": 0.5782084239667787, + "mean_token_accuracy": 0.6928251385688782, + "num_tokens": 9617413.0, + "step": 5848, + "train/ce_loss": 0.7634159922599792 + }, + { + "epoch": 0.5782084239667787, + "step": 5848, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.5782084239667787, + "step": 5848, + "train/total_loss": 0.09196659922599792 + }, + { + "entropy": 8.671602249145508, + "epoch": 0.5783072968162942, + "mean_token_accuracy": 0.6481876373291016, + "num_tokens": 9622817.0, + "step": 5849, + "train/ce_loss": 1.070180058479309 + }, + { + "epoch": 0.5783072968162942, + "step": 5849, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.5783072968162942, + "step": 5849, + "train/total_loss": 0.23201800882816315 + }, + { + "entropy": 9.2224760055542, + "epoch": 0.5784061696658098, + "mean_token_accuracy": 0.7203007340431213, + "num_tokens": 9627905.0, + "step": 5850, + "train/ce_loss": 1.6534047517779982e-06 + }, + { + "epoch": 0.5784061696658098, + "step": 5850, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5784061696658098, + "step": 5850, + "train/total_loss": 0.03906266391277313 + }, + { + "entropy": 8.414937973022461, + "epoch": 0.5785050425153253, + "mean_token_accuracy": 0.7024128437042236, + "num_tokens": 9633457.0, + "step": 5851, + "train/ce_loss": 1.6479556560516357 + }, + { + "epoch": 0.5785050425153253, + "step": 5851, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5785050425153253, + "step": 5851, + "train/total_loss": 0.21557681262493134 + }, + { + "entropy": 8.665975570678711, + "epoch": 0.5786039153648408, + "mean_token_accuracy": 0.6990291476249695, + "num_tokens": 9638836.0, + "step": 5852, + "train/ce_loss": 1.2352977991104126 + }, + { + "epoch": 0.5786039153648408, + "step": 5852, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5786039153648408, + "step": 5852, + "train/total_loss": 0.18993604183197021 + }, + { + "entropy": 8.959815979003906, + "epoch": 0.5787027882143564, + "mean_token_accuracy": 0.7120419144630432, + "num_tokens": 9644063.0, + "step": 5853, + "train/ce_loss": 1.505104899406433 + }, + { + "epoch": 0.5787027882143564, + "step": 5853, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5787027882143564, + "step": 5853, + "train/total_loss": 0.2091042399406433 + }, + { + "entropy": 9.069355010986328, + "epoch": 0.5788016610638719, + "mean_token_accuracy": 0.7569974660873413, + "num_tokens": 9649366.0, + "step": 5854, + "train/ce_loss": 0.9077438712120056 + }, + { + "epoch": 0.5788016610638719, + "step": 5854, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.5788016610638719, + "step": 5854, + "train/total_loss": 0.17671188712120056 + }, + { + "entropy": 9.358163833618164, + "epoch": 0.5789005339133874, + "mean_token_accuracy": 0.7469135522842407, + "num_tokens": 9654286.0, + "step": 5855, + "train/ce_loss": 1.1978092193603516 + }, + { + "epoch": 0.5789005339133874, + "step": 5855, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.5789005339133874, + "step": 5855, + "train/total_loss": 0.14712467789649963 + }, + { + "entropy": 9.114377975463867, + "epoch": 0.578999406762903, + "mean_token_accuracy": 0.727129340171814, + "num_tokens": 9659334.0, + "step": 5856, + "train/ce_loss": 8.964946687228803e-07 + }, + { + "epoch": 0.578999406762903, + "step": 5856, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.578999406762903, + "step": 5856, + "train/total_loss": 0.04296883940696716 + }, + { + "entropy": 8.770225524902344, + "epoch": 0.5790982796124184, + "mean_token_accuracy": 0.7616707682609558, + "num_tokens": 9664628.0, + "step": 5857, + "train/ce_loss": 0.5496033430099487 + }, + { + "epoch": 0.5790982796124184, + "step": 5857, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5790982796124184, + "step": 5857, + "train/total_loss": 0.07839784026145935 + }, + { + "entropy": 8.775310516357422, + "epoch": 0.5791971524619339, + "mean_token_accuracy": 0.793379008769989, + "num_tokens": 9669980.0, + "step": 5858, + "train/ce_loss": 0.48683834075927734 + }, + { + "epoch": 0.5791971524619339, + "step": 5858, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5791971524619339, + "step": 5858, + "train/total_loss": 0.07212133705615997 + }, + { + "entropy": 8.731611251831055, + "epoch": 0.5792960253114495, + "mean_token_accuracy": 0.7424083948135376, + "num_tokens": 9675405.0, + "step": 5859, + "train/ce_loss": 0.6361439228057861 + }, + { + "epoch": 0.5792960253114495, + "step": 5859, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.5792960253114495, + "step": 5859, + "train/total_loss": 0.1456456482410431 + }, + { + "epoch": 0.579394898160965, + "grad_norm": 0.7735334634780884, + "learning_rate": 8.553874301537853e-06, + "loss": 0.1499, + "step": 5860 + }, + { + "entropy": 9.669195175170898, + "epoch": 0.579394898160965, + "mean_token_accuracy": 0.7727272510528564, + "num_tokens": 9680187.0, + "step": 5860, + "train/ce_loss": 1.6363650560379028 + }, + { + "epoch": 0.579394898160965, + "step": 5860, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.579394898160965, + "step": 5860, + "train/total_loss": 0.21832400560379028 + }, + { + "entropy": 8.610940933227539, + "epoch": 0.5794937710104805, + "mean_token_accuracy": 0.7306079864501953, + "num_tokens": 9685589.0, + "step": 5861, + "train/ce_loss": 0.8293500542640686 + }, + { + "epoch": 0.5794937710104805, + "step": 5861, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5794937710104805, + "step": 5861, + "train/total_loss": 0.13762250542640686 + }, + { + "entropy": 9.000852584838867, + "epoch": 0.5795926438599961, + "mean_token_accuracy": 0.7827715277671814, + "num_tokens": 9690833.0, + "step": 5862, + "train/ce_loss": 0.8612764477729797 + }, + { + "epoch": 0.5795926438599961, + "step": 5862, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5795926438599961, + "step": 5862, + "train/total_loss": 0.1369088888168335 + }, + { + "entropy": 8.683488845825195, + "epoch": 0.5796915167095116, + "mean_token_accuracy": 0.7246654033660889, + "num_tokens": 9696357.0, + "step": 5863, + "train/ce_loss": 1.3177578449249268 + }, + { + "epoch": 0.5796915167095116, + "step": 5863, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5796915167095116, + "step": 5863, + "train/total_loss": 0.19818203151226044 + }, + { + "entropy": 8.988357543945312, + "epoch": 0.5797903895590271, + "mean_token_accuracy": 0.7926470637321472, + "num_tokens": 9701515.0, + "step": 5864, + "train/ce_loss": 0.8962416648864746 + }, + { + "epoch": 0.5797903895590271, + "step": 5864, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5797903895590271, + "step": 5864, + "train/total_loss": 0.12087416648864746 + }, + { + "entropy": 8.894834518432617, + "epoch": 0.5798892624085427, + "mean_token_accuracy": 0.7848605513572693, + "num_tokens": 9706752.0, + "step": 5865, + "train/ce_loss": 0.5083255171775818 + }, + { + "epoch": 0.5798892624085427, + "step": 5865, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5798892624085427, + "step": 5865, + "train/total_loss": 0.09770755469799042 + }, + { + "entropy": 9.450010299682617, + "epoch": 0.5799881352580581, + "mean_token_accuracy": 0.7307060956954956, + "num_tokens": 9711791.0, + "step": 5866, + "train/ce_loss": 1.6645127516312641e-06 + }, + { + "epoch": 0.5799881352580581, + "step": 5866, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5799881352580581, + "step": 5866, + "train/total_loss": 0.03906266763806343 + }, + { + "entropy": 9.376605033874512, + "epoch": 0.5800870081075736, + "mean_token_accuracy": 0.7685950398445129, + "num_tokens": 9716736.0, + "step": 5867, + "train/ce_loss": 0.6073023676872253 + }, + { + "epoch": 0.5800870081075736, + "step": 5867, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5800870081075736, + "step": 5867, + "train/total_loss": 0.0919802337884903 + }, + { + "entropy": 8.476332664489746, + "epoch": 0.5801858809570892, + "mean_token_accuracy": 0.7014613747596741, + "num_tokens": 9722183.0, + "step": 5868, + "train/ce_loss": 1.0599383115768433 + }, + { + "epoch": 0.5801858809570892, + "step": 5868, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5801858809570892, + "step": 5868, + "train/total_loss": 0.1724000871181488 + }, + { + "entropy": 9.360960006713867, + "epoch": 0.5802847538066047, + "mean_token_accuracy": 0.734133780002594, + "num_tokens": 9727209.0, + "step": 5869, + "train/ce_loss": 1.5586105585098267 + }, + { + "epoch": 0.5802847538066047, + "step": 5869, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.5802847538066047, + "step": 5869, + "train/total_loss": 0.23398606479167938 + }, + { + "entropy": 9.488181114196777, + "epoch": 0.5803836266561202, + "mean_token_accuracy": 0.695049524307251, + "num_tokens": 9732150.0, + "step": 5870, + "train/ce_loss": 1.8666412415768718e-06 + }, + { + "epoch": 0.5803836266561202, + "step": 5870, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5803836266561202, + "step": 5870, + "train/total_loss": 0.04296893626451492 + }, + { + "entropy": 9.39095687866211, + "epoch": 0.5804824995056358, + "mean_token_accuracy": 0.7685664892196655, + "num_tokens": 9737124.0, + "step": 5871, + "train/ce_loss": 1.294190764427185 + }, + { + "epoch": 0.5804824995056358, + "step": 5871, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.5804824995056358, + "step": 5871, + "train/total_loss": 0.20754407346248627 + }, + { + "entropy": 8.90446662902832, + "epoch": 0.5805813723551513, + "mean_token_accuracy": 0.7427123188972473, + "num_tokens": 9742313.0, + "step": 5872, + "train/ce_loss": 1.3353374004364014 + }, + { + "epoch": 0.5805813723551513, + "step": 5872, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5805813723551513, + "step": 5872, + "train/total_loss": 0.18040874600410461 + }, + { + "entropy": 8.926846504211426, + "epoch": 0.5806802452046668, + "mean_token_accuracy": 0.7269503474235535, + "num_tokens": 9747598.0, + "step": 5873, + "train/ce_loss": 0.8143326640129089 + }, + { + "epoch": 0.5806802452046668, + "step": 5873, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5806802452046668, + "step": 5873, + "train/total_loss": 0.1400270164012909 + }, + { + "entropy": 9.394083976745605, + "epoch": 0.5807791180541824, + "mean_token_accuracy": 0.7317487001419067, + "num_tokens": 9752631.0, + "step": 5874, + "train/ce_loss": 1.077366828918457 + }, + { + "epoch": 0.5807791180541824, + "step": 5874, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.5807791180541824, + "step": 5874, + "train/total_loss": 0.21320542693138123 + }, + { + "entropy": 9.731376647949219, + "epoch": 0.5808779909036978, + "mean_token_accuracy": 0.7923627495765686, + "num_tokens": 9757435.0, + "step": 5875, + "train/ce_loss": 1.2870942782683414e-06 + }, + { + "epoch": 0.5808779909036978, + "step": 5875, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.5808779909036978, + "step": 5875, + "train/total_loss": 0.015625128522515297 + }, + { + "entropy": 9.461321830749512, + "epoch": 0.5809768637532133, + "mean_token_accuracy": 0.7592592835426331, + "num_tokens": 9762418.0, + "step": 5876, + "train/ce_loss": 0.8081197142601013 + }, + { + "epoch": 0.5809768637532133, + "step": 5876, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5809768637532133, + "step": 5876, + "train/total_loss": 0.12378071993589401 + }, + { + "entropy": 8.56025505065918, + "epoch": 0.5810757366027289, + "mean_token_accuracy": 0.746666669845581, + "num_tokens": 9767992.0, + "step": 5877, + "train/ce_loss": 0.9183955788612366 + }, + { + "epoch": 0.5810757366027289, + "step": 5877, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5810757366027289, + "step": 5877, + "train/total_loss": 0.15824580192565918 + }, + { + "entropy": 8.989036560058594, + "epoch": 0.5811746094522444, + "mean_token_accuracy": 0.7418967485427856, + "num_tokens": 9773297.0, + "step": 5878, + "train/ce_loss": 1.1786950826644897 + }, + { + "epoch": 0.5811746094522444, + "step": 5878, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.5811746094522444, + "step": 5878, + "train/total_loss": 0.2077132612466812 + }, + { + "entropy": 9.000946998596191, + "epoch": 0.5812734823017599, + "mean_token_accuracy": 0.7270029783248901, + "num_tokens": 9778400.0, + "step": 5879, + "train/ce_loss": 1.591559648513794 + }, + { + "epoch": 0.5812734823017599, + "step": 5879, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5812734823017599, + "step": 5879, + "train/total_loss": 0.1982184648513794 + }, + { + "epoch": 0.5813723551512755, + "grad_norm": 0.7643924951553345, + "learning_rate": 8.548929436779905e-06, + "loss": 0.134, + "step": 5880 + }, + { + "entropy": 8.803194046020508, + "epoch": 0.5813723551512755, + "mean_token_accuracy": 0.743888258934021, + "num_tokens": 9783757.0, + "step": 5880, + "train/ce_loss": 1.131672978401184 + }, + { + "epoch": 0.5813723551512755, + "step": 5880, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5813723551512755, + "step": 5880, + "train/total_loss": 0.16004230082035065 + }, + { + "entropy": 9.034435272216797, + "epoch": 0.581471228000791, + "mean_token_accuracy": 0.7707006335258484, + "num_tokens": 9788827.0, + "step": 5881, + "train/ce_loss": 0.5767542123794556 + }, + { + "epoch": 0.581471228000791, + "step": 5881, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.581471228000791, + "step": 5881, + "train/total_loss": 0.10845667123794556 + }, + { + "entropy": 8.828594207763672, + "epoch": 0.5815701008503065, + "mean_token_accuracy": 0.681034505367279, + "num_tokens": 9794306.0, + "step": 5882, + "train/ce_loss": 1.3235604763031006 + }, + { + "epoch": 0.5815701008503065, + "step": 5882, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5815701008503065, + "step": 5882, + "train/total_loss": 0.16360604763031006 + }, + { + "entropy": 9.103014945983887, + "epoch": 0.5816689736998221, + "mean_token_accuracy": 0.7027778029441833, + "num_tokens": 9799447.0, + "step": 5883, + "train/ce_loss": 1.4092310667037964 + }, + { + "epoch": 0.5816689736998221, + "step": 5883, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5816689736998221, + "step": 5883, + "train/total_loss": 0.18779811263084412 + }, + { + "entropy": 8.762706756591797, + "epoch": 0.5817678465493376, + "mean_token_accuracy": 0.7728776335716248, + "num_tokens": 9804852.0, + "step": 5884, + "train/ce_loss": 0.5362897515296936 + }, + { + "epoch": 0.5817678465493376, + "step": 5884, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.5817678465493376, + "step": 5884, + "train/total_loss": 0.08097273111343384 + }, + { + "entropy": 8.828049659729004, + "epoch": 0.581866719398853, + "mean_token_accuracy": 0.7747858166694641, + "num_tokens": 9810111.0, + "step": 5885, + "train/ce_loss": 0.6966593861579895 + }, + { + "epoch": 0.581866719398853, + "step": 5885, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.581866719398853, + "step": 5885, + "train/total_loss": 0.13607218861579895 + }, + { + "entropy": 8.948248863220215, + "epoch": 0.5819655922483686, + "mean_token_accuracy": 0.70126873254776, + "num_tokens": 9815483.0, + "step": 5886, + "train/ce_loss": 0.7913849949836731 + }, + { + "epoch": 0.5819655922483686, + "step": 5886, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.5819655922483686, + "step": 5886, + "train/total_loss": 0.10648225247859955 + }, + { + "entropy": 9.371553421020508, + "epoch": 0.5820644650978841, + "mean_token_accuracy": 0.679411768913269, + "num_tokens": 9820572.0, + "step": 5887, + "train/ce_loss": 1.750806632117019e-06 + }, + { + "epoch": 0.5820644650978841, + "step": 5887, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5820644650978841, + "step": 5887, + "train/total_loss": 0.023437675088644028 + }, + { + "entropy": 9.152580261230469, + "epoch": 0.5821633379473996, + "mean_token_accuracy": 0.7692307829856873, + "num_tokens": 9825757.0, + "step": 5888, + "train/ce_loss": 0.8407130837440491 + }, + { + "epoch": 0.5821633379473996, + "step": 5888, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5821633379473996, + "step": 5888, + "train/total_loss": 0.1387588083744049 + }, + { + "entropy": 8.81597900390625, + "epoch": 0.5822622107969152, + "mean_token_accuracy": 0.7447335720062256, + "num_tokens": 9831034.0, + "step": 5889, + "train/ce_loss": 1.1196482181549072 + }, + { + "epoch": 0.5822622107969152, + "step": 5889, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.5822622107969152, + "step": 5889, + "train/total_loss": 0.18618357181549072 + }, + { + "entropy": 9.071273803710938, + "epoch": 0.5823610836464307, + "mean_token_accuracy": 0.787162184715271, + "num_tokens": 9836069.0, + "step": 5890, + "train/ce_loss": 0.8394007086753845 + }, + { + "epoch": 0.5823610836464307, + "step": 5890, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5823610836464307, + "step": 5890, + "train/total_loss": 0.11909632384777069 + }, + { + "entropy": 8.615422248840332, + "epoch": 0.5824599564959462, + "mean_token_accuracy": 0.7299492359161377, + "num_tokens": 9841517.0, + "step": 5891, + "train/ce_loss": 0.6269214153289795 + }, + { + "epoch": 0.5824599564959462, + "step": 5891, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5824599564959462, + "step": 5891, + "train/total_loss": 0.08612964302301407 + }, + { + "entropy": 8.989612579345703, + "epoch": 0.5825588293454618, + "mean_token_accuracy": 0.7749999761581421, + "num_tokens": 9846669.0, + "step": 5892, + "train/ce_loss": 0.9876495599746704 + }, + { + "epoch": 0.5825588293454618, + "step": 5892, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5825588293454618, + "step": 5892, + "train/total_loss": 0.13782745599746704 + }, + { + "entropy": 8.770111083984375, + "epoch": 0.5826577021949773, + "mean_token_accuracy": 0.7146017551422119, + "num_tokens": 9852008.0, + "step": 5893, + "train/ce_loss": 0.8659339547157288 + }, + { + "epoch": 0.5826577021949773, + "step": 5893, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.5826577021949773, + "step": 5893, + "train/total_loss": 0.2154996395111084 + }, + { + "entropy": 8.816354751586914, + "epoch": 0.5827565750444927, + "mean_token_accuracy": 0.7404305934906006, + "num_tokens": 9857316.0, + "step": 5894, + "train/ce_loss": 0.8825889229774475 + }, + { + "epoch": 0.5827565750444927, + "step": 5894, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5827565750444927, + "step": 5894, + "train/total_loss": 0.15075889229774475 + }, + { + "entropy": 8.934374809265137, + "epoch": 0.5828554478940083, + "mean_token_accuracy": 0.7435232996940613, + "num_tokens": 9862594.0, + "step": 5895, + "train/ce_loss": 0.8922495245933533 + }, + { + "epoch": 0.5828554478940083, + "step": 5895, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.5828554478940083, + "step": 5895, + "train/total_loss": 0.19859996438026428 + }, + { + "entropy": 8.723108291625977, + "epoch": 0.5829543207435238, + "mean_token_accuracy": 0.7158836722373962, + "num_tokens": 9867987.0, + "step": 5896, + "train/ce_loss": 1.0666035413742065 + }, + { + "epoch": 0.5829543207435238, + "step": 5896, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.5829543207435238, + "step": 5896, + "train/total_loss": 0.2394728660583496 + }, + { + "entropy": 8.87197494506836, + "epoch": 0.5830531935930393, + "mean_token_accuracy": 0.720441997051239, + "num_tokens": 9873373.0, + "step": 5897, + "train/ce_loss": 0.532087504863739 + }, + { + "epoch": 0.5830531935930393, + "step": 5897, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5830531935930393, + "step": 5897, + "train/total_loss": 0.07664625346660614 + }, + { + "entropy": 8.925093650817871, + "epoch": 0.5831520664425549, + "mean_token_accuracy": 0.7520958185195923, + "num_tokens": 9878614.0, + "step": 5898, + "train/ce_loss": 1.1608963012695312 + }, + { + "epoch": 0.5831520664425549, + "step": 5898, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5831520664425549, + "step": 5898, + "train/total_loss": 0.17077714204788208 + }, + { + "entropy": 8.685224533081055, + "epoch": 0.5832509392920704, + "mean_token_accuracy": 0.7431610822677612, + "num_tokens": 9883770.0, + "step": 5899, + "train/ce_loss": 1.1857091188430786 + }, + { + "epoch": 0.5832509392920704, + "step": 5899, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5832509392920704, + "step": 5899, + "train/total_loss": 0.16544592380523682 + }, + { + "epoch": 0.5833498121415859, + "grad_norm": 0.8614020347595215, + "learning_rate": 8.543984572021955e-06, + "loss": 0.1414, + "step": 5900 + }, + { + "entropy": 8.918161392211914, + "epoch": 0.5833498121415859, + "mean_token_accuracy": 0.7391930818557739, + "num_tokens": 9889001.0, + "step": 5900, + "train/ce_loss": 0.8995566368103027 + }, + { + "epoch": 0.5833498121415859, + "step": 5900, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5833498121415859, + "step": 5900, + "train/total_loss": 0.1446431577205658 + }, + { + "entropy": 9.084014892578125, + "epoch": 0.5834486849911015, + "mean_token_accuracy": 0.7184594869613647, + "num_tokens": 9894231.0, + "step": 5901, + "train/ce_loss": 1.231453537940979 + }, + { + "epoch": 0.5834486849911015, + "step": 5901, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5834486849911015, + "step": 5901, + "train/total_loss": 0.15439535677433014 + }, + { + "entropy": 8.931279182434082, + "epoch": 0.583547557840617, + "mean_token_accuracy": 0.6940749883651733, + "num_tokens": 9899487.0, + "step": 5902, + "train/ce_loss": 0.7057415246963501 + }, + { + "epoch": 0.583547557840617, + "step": 5902, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.583547557840617, + "step": 5902, + "train/total_loss": 0.12135540693998337 + }, + { + "entropy": 8.705514907836914, + "epoch": 0.5836464306901324, + "mean_token_accuracy": 0.787994921207428, + "num_tokens": 9904759.0, + "step": 5903, + "train/ce_loss": 0.4530998170375824 + }, + { + "epoch": 0.5836464306901324, + "step": 5903, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5836464306901324, + "step": 5903, + "train/total_loss": 0.06874748319387436 + }, + { + "entropy": 9.598268508911133, + "epoch": 0.583745303539648, + "mean_token_accuracy": 0.8318385481834412, + "num_tokens": 9909639.0, + "step": 5904, + "train/ce_loss": 9.429682563677488e-07 + }, + { + "epoch": 0.583745303539648, + "step": 5904, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.583745303539648, + "step": 5904, + "train/total_loss": 0.011718844063580036 + }, + { + "entropy": 10.027769088745117, + "epoch": 0.5838441763891635, + "mean_token_accuracy": 0.8108108043670654, + "num_tokens": 9914188.0, + "step": 5905, + "train/ce_loss": 2.373533248901367 + }, + { + "epoch": 0.5838441763891635, + "step": 5905, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5838441763891635, + "step": 5905, + "train/total_loss": 0.2686033248901367 + }, + { + "entropy": 9.258383750915527, + "epoch": 0.583943049238679, + "mean_token_accuracy": 0.7267950773239136, + "num_tokens": 9919172.0, + "step": 5906, + "train/ce_loss": 7.632613119312737e-07 + }, + { + "epoch": 0.583943049238679, + "step": 5906, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.583943049238679, + "step": 5906, + "train/total_loss": 0.02343757636845112 + }, + { + "entropy": 9.364818572998047, + "epoch": 0.5840419220881946, + "mean_token_accuracy": 0.7480559945106506, + "num_tokens": 9924213.0, + "step": 5907, + "train/ce_loss": 0.9966214299201965 + }, + { + "epoch": 0.5840419220881946, + "step": 5907, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5840419220881946, + "step": 5907, + "train/total_loss": 0.1465371549129486 + }, + { + "entropy": 9.138063430786133, + "epoch": 0.5841407949377101, + "mean_token_accuracy": 0.7645466923713684, + "num_tokens": 9929419.0, + "step": 5908, + "train/ce_loss": 0.7238141298294067 + }, + { + "epoch": 0.5841407949377101, + "step": 5908, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5841407949377101, + "step": 5908, + "train/total_loss": 0.1153501644730568 + }, + { + "entropy": 8.901884078979492, + "epoch": 0.5842396677872256, + "mean_token_accuracy": 0.7815231084823608, + "num_tokens": 9934694.0, + "step": 5909, + "train/ce_loss": 0.6918627619743347 + }, + { + "epoch": 0.5842396677872256, + "step": 5909, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.5842396677872256, + "step": 5909, + "train/total_loss": 0.143405020236969 + }, + { + "entropy": 8.765707015991211, + "epoch": 0.5843385406367412, + "mean_token_accuracy": 0.7502837777137756, + "num_tokens": 9940041.0, + "step": 5910, + "train/ce_loss": 0.906067430973053 + }, + { + "epoch": 0.5843385406367412, + "step": 5910, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.5843385406367412, + "step": 5910, + "train/total_loss": 0.19998174905776978 + }, + { + "entropy": 8.78848648071289, + "epoch": 0.5844374134862567, + "mean_token_accuracy": 0.746302604675293, + "num_tokens": 9945462.0, + "step": 5911, + "train/ce_loss": 1.031307578086853 + }, + { + "epoch": 0.5844374134862567, + "step": 5911, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5844374134862567, + "step": 5911, + "train/total_loss": 0.1695370078086853 + }, + { + "entropy": 8.435439109802246, + "epoch": 0.5845362863357721, + "mean_token_accuracy": 0.7630661725997925, + "num_tokens": 9950858.0, + "step": 5912, + "train/ce_loss": 0.4197191894054413 + }, + { + "epoch": 0.5845362863357721, + "step": 5912, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5845362863357721, + "step": 5912, + "train/total_loss": 0.10837817192077637 + }, + { + "entropy": 8.845929145812988, + "epoch": 0.5846351591852877, + "mean_token_accuracy": 0.7647753953933716, + "num_tokens": 9956161.0, + "step": 5913, + "train/ce_loss": 1.0874028205871582 + }, + { + "epoch": 0.5846351591852877, + "step": 5913, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5846351591852877, + "step": 5913, + "train/total_loss": 0.15561528503894806 + }, + { + "entropy": 9.359431266784668, + "epoch": 0.5847340320348032, + "mean_token_accuracy": 0.7215384840965271, + "num_tokens": 9961182.0, + "step": 5914, + "train/ce_loss": 0.7800878286361694 + }, + { + "epoch": 0.5847340320348032, + "step": 5914, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5847340320348032, + "step": 5914, + "train/total_loss": 0.10925878584384918 + }, + { + "entropy": 8.976252555847168, + "epoch": 0.5848329048843187, + "mean_token_accuracy": 0.7380607724189758, + "num_tokens": 9966365.0, + "step": 5915, + "train/ce_loss": 0.6915847063064575 + }, + { + "epoch": 0.5848329048843187, + "step": 5915, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5848329048843187, + "step": 5915, + "train/total_loss": 0.11993972212076187 + }, + { + "entropy": 9.099713325500488, + "epoch": 0.5849317777338343, + "mean_token_accuracy": 0.7078014016151428, + "num_tokens": 9971523.0, + "step": 5916, + "train/ce_loss": 1.966060835911776e-06 + }, + { + "epoch": 0.5849317777338343, + "step": 5916, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5849317777338343, + "step": 5916, + "train/total_loss": 0.05468769744038582 + }, + { + "entropy": 9.637526512145996, + "epoch": 0.5850306505833498, + "mean_token_accuracy": 0.7004504799842834, + "num_tokens": 9976436.0, + "step": 5917, + "train/ce_loss": 1.4811153050686698e-05 + }, + { + "epoch": 0.5850306505833498, + "step": 5917, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5850306505833498, + "step": 5917, + "train/total_loss": 0.07031398266553879 + }, + { + "entropy": 9.061832427978516, + "epoch": 0.5851295234328653, + "mean_token_accuracy": 0.7558620572090149, + "num_tokens": 9981625.0, + "step": 5918, + "train/ce_loss": 0.8422671556472778 + }, + { + "epoch": 0.5851295234328653, + "step": 5918, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5851295234328653, + "step": 5918, + "train/total_loss": 0.12719547748565674 + }, + { + "entropy": 9.495265007019043, + "epoch": 0.5852283962823809, + "mean_token_accuracy": 0.7879924774169922, + "num_tokens": 9986595.0, + "step": 5919, + "train/ce_loss": 1.5633907318115234 + }, + { + "epoch": 0.5852283962823809, + "step": 5919, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5852283962823809, + "step": 5919, + "train/total_loss": 0.17977657914161682 + }, + { + "epoch": 0.5853272691318964, + "grad_norm": 0.660368025302887, + "learning_rate": 8.539039707264008e-06, + "loss": 0.1305, + "step": 5920 + }, + { + "entropy": 8.801708221435547, + "epoch": 0.5853272691318964, + "mean_token_accuracy": 0.779411792755127, + "num_tokens": 9992008.0, + "step": 5920, + "train/ce_loss": 0.7756808996200562 + }, + { + "epoch": 0.5853272691318964, + "step": 5920, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5853272691318964, + "step": 5920, + "train/total_loss": 0.14397433400154114 + }, + { + "entropy": 9.236620903015137, + "epoch": 0.5854261419814119, + "mean_token_accuracy": 0.767123281955719, + "num_tokens": 9997165.0, + "step": 5921, + "train/ce_loss": 0.737661600112915 + }, + { + "epoch": 0.5854261419814119, + "step": 5921, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5854261419814119, + "step": 5921, + "train/total_loss": 0.14017242193222046 + }, + { + "entropy": 8.805265426635742, + "epoch": 0.5855250148309274, + "mean_token_accuracy": 0.7797872424125671, + "num_tokens": 10002562.0, + "step": 5922, + "train/ce_loss": 1.2521300315856934 + }, + { + "epoch": 0.5855250148309274, + "step": 5922, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.5855250148309274, + "step": 5922, + "train/total_loss": 0.20333801209926605 + }, + { + "entropy": 9.093343734741211, + "epoch": 0.5856238876804429, + "mean_token_accuracy": 0.7347242832183838, + "num_tokens": 10007870.0, + "step": 5923, + "train/ce_loss": 1.0051777362823486 + }, + { + "epoch": 0.5856238876804429, + "step": 5923, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5856238876804429, + "step": 5923, + "train/total_loss": 0.14739277958869934 + }, + { + "entropy": 9.039249420166016, + "epoch": 0.5857227605299584, + "mean_token_accuracy": 0.6990678906440735, + "num_tokens": 10013038.0, + "step": 5924, + "train/ce_loss": 1.0364798307418823 + }, + { + "epoch": 0.5857227605299584, + "step": 5924, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5857227605299584, + "step": 5924, + "train/total_loss": 0.15442922711372375 + }, + { + "entropy": 8.729142189025879, + "epoch": 0.585821633379474, + "mean_token_accuracy": 0.824940025806427, + "num_tokens": 10018383.0, + "step": 5925, + "train/ce_loss": 0.5556420087814331 + }, + { + "epoch": 0.585821633379474, + "step": 5925, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.585821633379474, + "step": 5925, + "train/total_loss": 0.07509545236825943 + }, + { + "entropy": 9.028743743896484, + "epoch": 0.5859205062289895, + "mean_token_accuracy": 0.7197723984718323, + "num_tokens": 10023580.0, + "step": 5926, + "train/ce_loss": 0.7546852231025696 + }, + { + "epoch": 0.5859205062289895, + "step": 5926, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5859205062289895, + "step": 5926, + "train/total_loss": 0.1301560252904892 + }, + { + "entropy": 9.160871505737305, + "epoch": 0.586019379078505, + "mean_token_accuracy": 0.8131386637687683, + "num_tokens": 10028723.0, + "step": 5927, + "train/ce_loss": 0.00012569209502544254 + }, + { + "epoch": 0.586019379078505, + "step": 5927, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.586019379078505, + "step": 5927, + "train/total_loss": 0.07032506912946701 + }, + { + "entropy": 9.165404319763184, + "epoch": 0.5861182519280206, + "mean_token_accuracy": 0.7296898365020752, + "num_tokens": 10033886.0, + "step": 5928, + "train/ce_loss": 1.1629759073257446 + }, + { + "epoch": 0.5861182519280206, + "step": 5928, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5861182519280206, + "step": 5928, + "train/total_loss": 0.18661010265350342 + }, + { + "entropy": 9.300680160522461, + "epoch": 0.5862171247775361, + "mean_token_accuracy": 0.7137930989265442, + "num_tokens": 10038959.0, + "step": 5929, + "train/ce_loss": 1.3878321647644043 + }, + { + "epoch": 0.5862171247775361, + "step": 5929, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5862171247775361, + "step": 5929, + "train/total_loss": 0.17003321647644043 + }, + { + "entropy": 8.649507522583008, + "epoch": 0.5863159976270517, + "mean_token_accuracy": 0.834645688533783, + "num_tokens": 10044446.0, + "step": 5930, + "train/ce_loss": 0.8138261437416077 + }, + { + "epoch": 0.5863159976270517, + "step": 5930, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5863159976270517, + "step": 5930, + "train/total_loss": 0.151695117354393 + }, + { + "entropy": 9.599285125732422, + "epoch": 0.5864148704765672, + "mean_token_accuracy": 0.7827869057655334, + "num_tokens": 10049321.0, + "step": 5931, + "train/ce_loss": 1.53306245803833 + }, + { + "epoch": 0.5864148704765672, + "step": 5931, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.5864148704765672, + "step": 5931, + "train/total_loss": 0.227524995803833 + }, + { + "entropy": 9.229707717895508, + "epoch": 0.5865137433260826, + "mean_token_accuracy": 0.7536231875419617, + "num_tokens": 10054310.0, + "step": 5932, + "train/ce_loss": 0.8070537447929382 + }, + { + "epoch": 0.5865137433260826, + "step": 5932, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5865137433260826, + "step": 5932, + "train/total_loss": 0.12758037447929382 + }, + { + "entropy": 8.597070693969727, + "epoch": 0.5866126161755982, + "mean_token_accuracy": 0.7008547186851501, + "num_tokens": 10059742.0, + "step": 5933, + "train/ce_loss": 0.6685133576393127 + }, + { + "epoch": 0.5866126161755982, + "step": 5933, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5866126161755982, + "step": 5933, + "train/total_loss": 0.10982009023427963 + }, + { + "entropy": 9.315040588378906, + "epoch": 0.5867114890251137, + "mean_token_accuracy": 0.7763158082962036, + "num_tokens": 10064796.0, + "step": 5934, + "train/ce_loss": 0.8337532877922058 + }, + { + "epoch": 0.5867114890251137, + "step": 5934, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5867114890251137, + "step": 5934, + "train/total_loss": 0.10290657728910446 + }, + { + "entropy": 9.534631729125977, + "epoch": 0.5868103618746292, + "mean_token_accuracy": 0.6329787373542786, + "num_tokens": 10069802.0, + "step": 5935, + "train/ce_loss": 0.7574530839920044 + }, + { + "epoch": 0.5868103618746292, + "step": 5935, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5868103618746292, + "step": 5935, + "train/total_loss": 0.12262030690908432 + }, + { + "entropy": 8.960814476013184, + "epoch": 0.5869092347241448, + "mean_token_accuracy": 0.7309644818305969, + "num_tokens": 10075081.0, + "step": 5936, + "train/ce_loss": 0.8639675378799438 + }, + { + "epoch": 0.5869092347241448, + "step": 5936, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5869092347241448, + "step": 5936, + "train/total_loss": 0.15280300378799438 + }, + { + "entropy": 9.429279327392578, + "epoch": 0.5870081075736603, + "mean_token_accuracy": 0.7545126080513, + "num_tokens": 10080048.0, + "step": 5937, + "train/ce_loss": 0.908481240272522 + }, + { + "epoch": 0.5870081075736603, + "step": 5937, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5870081075736603, + "step": 5937, + "train/total_loss": 0.14162936806678772 + }, + { + "entropy": 9.159167289733887, + "epoch": 0.5871069804231758, + "mean_token_accuracy": 0.7885802388191223, + "num_tokens": 10085161.0, + "step": 5938, + "train/ce_loss": 0.7795634269714355 + }, + { + "epoch": 0.5871069804231758, + "step": 5938, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5871069804231758, + "step": 5938, + "train/total_loss": 0.09748759120702744 + }, + { + "entropy": 8.759031295776367, + "epoch": 0.5872058532726914, + "mean_token_accuracy": 0.7425742745399475, + "num_tokens": 10090646.0, + "step": 5939, + "train/ce_loss": 0.8374418020248413 + }, + { + "epoch": 0.5872058532726914, + "step": 5939, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.5872058532726914, + "step": 5939, + "train/total_loss": 0.17749418318271637 + }, + { + "epoch": 0.5873047261222069, + "grad_norm": 0.7194293737411499, + "learning_rate": 8.534094842506058e-06, + "loss": 0.1329, + "step": 5940 + }, + { + "entropy": 8.76194953918457, + "epoch": 0.5873047261222069, + "mean_token_accuracy": 0.6855733394622803, + "num_tokens": 10095911.0, + "step": 5940, + "train/ce_loss": 0.8222272396087646 + }, + { + "epoch": 0.5873047261222069, + "step": 5940, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5873047261222069, + "step": 5940, + "train/total_loss": 0.14862897992134094 + }, + { + "entropy": 8.507280349731445, + "epoch": 0.5874035989717223, + "mean_token_accuracy": 0.7427293062210083, + "num_tokens": 10101265.0, + "step": 5941, + "train/ce_loss": 1.1149815320968628 + }, + { + "epoch": 0.5874035989717223, + "step": 5941, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5874035989717223, + "step": 5941, + "train/total_loss": 0.1466543972492218 + }, + { + "entropy": 9.517021179199219, + "epoch": 0.5875024718212379, + "mean_token_accuracy": 0.7517730593681335, + "num_tokens": 10106253.0, + "step": 5942, + "train/ce_loss": 1.2174076573501225e-06 + }, + { + "epoch": 0.5875024718212379, + "step": 5942, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.5875024718212379, + "step": 5942, + "train/total_loss": 0.0156251210719347 + }, + { + "entropy": 8.679609298706055, + "epoch": 0.5876013446707534, + "mean_token_accuracy": 0.7927400469779968, + "num_tokens": 10111574.0, + "step": 5943, + "train/ce_loss": 0.6443299651145935 + }, + { + "epoch": 0.5876013446707534, + "step": 5943, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.5876013446707534, + "step": 5943, + "train/total_loss": 0.1933392584323883 + }, + { + "entropy": 9.07916259765625, + "epoch": 0.5877002175202689, + "mean_token_accuracy": 0.7250945568084717, + "num_tokens": 10116808.0, + "step": 5944, + "train/ce_loss": 0.5388218760490417 + }, + { + "epoch": 0.5877002175202689, + "step": 5944, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5877002175202689, + "step": 5944, + "train/total_loss": 0.0929446890950203 + }, + { + "entropy": 8.75620174407959, + "epoch": 0.5877990903697845, + "mean_token_accuracy": 0.7218863368034363, + "num_tokens": 10122108.0, + "step": 5945, + "train/ce_loss": 0.9487734436988831 + }, + { + "epoch": 0.5877990903697845, + "step": 5945, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5877990903697845, + "step": 5945, + "train/total_loss": 0.13784609735012054 + }, + { + "entropy": 9.748626708984375, + "epoch": 0.5878979632193, + "mean_token_accuracy": 0.6862170100212097, + "num_tokens": 10126858.0, + "step": 5946, + "train/ce_loss": 2.1903035640716553 + }, + { + "epoch": 0.5878979632193, + "step": 5946, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.5878979632193, + "step": 5946, + "train/total_loss": 0.35574913024902344 + }, + { + "entropy": 8.954833984375, + "epoch": 0.5879968360688155, + "mean_token_accuracy": 0.7099125385284424, + "num_tokens": 10131995.0, + "step": 5947, + "train/ce_loss": 1.7198902368545532 + }, + { + "epoch": 0.5879968360688155, + "step": 5947, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5879968360688155, + "step": 5947, + "train/total_loss": 0.19542652368545532 + }, + { + "entropy": 9.191041946411133, + "epoch": 0.5880957089183311, + "mean_token_accuracy": 0.6752827167510986, + "num_tokens": 10137079.0, + "step": 5948, + "train/ce_loss": 1.7161764844786376e-06 + }, + { + "epoch": 0.5880957089183311, + "step": 5948, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5880957089183311, + "step": 5948, + "train/total_loss": 0.03515642136335373 + }, + { + "entropy": 8.782689094543457, + "epoch": 0.5881945817678466, + "mean_token_accuracy": 0.7277432680130005, + "num_tokens": 10142694.0, + "step": 5949, + "train/ce_loss": 0.9652318954467773 + }, + { + "epoch": 0.5881945817678466, + "step": 5949, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.5881945817678466, + "step": 5949, + "train/total_loss": 0.1746481955051422 + }, + { + "entropy": 8.81184196472168, + "epoch": 0.588293454617362, + "mean_token_accuracy": 0.7011111378669739, + "num_tokens": 10148015.0, + "step": 5950, + "train/ce_loss": 1.3074750900268555 + }, + { + "epoch": 0.588293454617362, + "step": 5950, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.588293454617362, + "step": 5950, + "train/total_loss": 0.16981001198291779 + }, + { + "entropy": 8.900465965270996, + "epoch": 0.5883923274668776, + "mean_token_accuracy": 0.7444314360618591, + "num_tokens": 10153377.0, + "step": 5951, + "train/ce_loss": 0.5374799370765686 + }, + { + "epoch": 0.5883923274668776, + "step": 5951, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.5883923274668776, + "step": 5951, + "train/total_loss": 0.1318729966878891 + }, + { + "entropy": 9.416954040527344, + "epoch": 0.5884912003163931, + "mean_token_accuracy": 0.7321131229400635, + "num_tokens": 10158414.0, + "step": 5952, + "train/ce_loss": 1.132142186164856 + }, + { + "epoch": 0.5884912003163931, + "step": 5952, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5884912003163931, + "step": 5952, + "train/total_loss": 0.16790172457695007 + }, + { + "entropy": 9.676469802856445, + "epoch": 0.5885900731659086, + "mean_token_accuracy": 0.8164557218551636, + "num_tokens": 10163165.0, + "step": 5953, + "train/ce_loss": 1.314548134803772 + }, + { + "epoch": 0.5885900731659086, + "step": 5953, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.5885900731659086, + "step": 5953, + "train/total_loss": 0.20957981050014496 + }, + { + "entropy": 8.674261093139648, + "epoch": 0.5886889460154242, + "mean_token_accuracy": 0.6891133785247803, + "num_tokens": 10168631.0, + "step": 5954, + "train/ce_loss": 0.7551501989364624 + }, + { + "epoch": 0.5886889460154242, + "step": 5954, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5886889460154242, + "step": 5954, + "train/total_loss": 0.1341087818145752 + }, + { + "entropy": 9.235870361328125, + "epoch": 0.5887878188649397, + "mean_token_accuracy": 0.7565485239028931, + "num_tokens": 10173749.0, + "step": 5955, + "train/ce_loss": 1.3685823678970337 + }, + { + "epoch": 0.5887878188649397, + "step": 5955, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5887878188649397, + "step": 5955, + "train/total_loss": 0.2071707397699356 + }, + { + "entropy": 9.224539756774902, + "epoch": 0.5888866917144552, + "mean_token_accuracy": 0.6733601093292236, + "num_tokens": 10178962.0, + "step": 5956, + "train/ce_loss": 1.0337599515914917 + }, + { + "epoch": 0.5888866917144552, + "step": 5956, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5888866917144552, + "step": 5956, + "train/total_loss": 0.16196975111961365 + }, + { + "entropy": 9.61436653137207, + "epoch": 0.5889855645639708, + "mean_token_accuracy": 0.7746478915214539, + "num_tokens": 10183818.0, + "step": 5957, + "train/ce_loss": 0.8637999892234802 + }, + { + "epoch": 0.5889855645639708, + "step": 5957, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5889855645639708, + "step": 5957, + "train/total_loss": 0.1215362474322319 + }, + { + "entropy": 9.203855514526367, + "epoch": 0.5890844374134863, + "mean_token_accuracy": 0.7799696326255798, + "num_tokens": 10188971.0, + "step": 5958, + "train/ce_loss": 1.0645607709884644 + }, + { + "epoch": 0.5890844374134863, + "step": 5958, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5890844374134863, + "step": 5958, + "train/total_loss": 0.16114357113838196 + }, + { + "entropy": 9.107339859008789, + "epoch": 0.5891833102630017, + "mean_token_accuracy": 0.7337837815284729, + "num_tokens": 10194205.0, + "step": 5959, + "train/ce_loss": 1.004172444343567 + }, + { + "epoch": 0.5891833102630017, + "step": 5959, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5891833102630017, + "step": 5959, + "train/total_loss": 0.15901100635528564 + }, + { + "epoch": 0.5892821831125173, + "grad_norm": 0.751305103302002, + "learning_rate": 8.529149977748109e-06, + "loss": 0.1461, + "step": 5960 + }, + { + "entropy": 8.705224990844727, + "epoch": 0.5892821831125173, + "mean_token_accuracy": 0.6809881925582886, + "num_tokens": 10199614.0, + "step": 5960, + "train/ce_loss": 1.2652076482772827 + }, + { + "epoch": 0.5892821831125173, + "step": 5960, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5892821831125173, + "step": 5960, + "train/total_loss": 0.1773020178079605 + }, + { + "entropy": 9.094744682312012, + "epoch": 0.5893810559620328, + "mean_token_accuracy": 0.6996148824691772, + "num_tokens": 10204796.0, + "step": 5961, + "train/ce_loss": 0.6137571334838867 + }, + { + "epoch": 0.5893810559620328, + "step": 5961, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5893810559620328, + "step": 5961, + "train/total_loss": 0.08481321483850479 + }, + { + "entropy": 9.080042839050293, + "epoch": 0.5894799288115483, + "mean_token_accuracy": 0.7582278251647949, + "num_tokens": 10210082.0, + "step": 5962, + "train/ce_loss": 0.7311226725578308 + }, + { + "epoch": 0.5894799288115483, + "step": 5962, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5894799288115483, + "step": 5962, + "train/total_loss": 0.09264352172613144 + }, + { + "entropy": 9.18613052368164, + "epoch": 0.5895788016610639, + "mean_token_accuracy": 0.7206133008003235, + "num_tokens": 10215123.0, + "step": 5963, + "train/ce_loss": 1.1980276107788086 + }, + { + "epoch": 0.5895788016610639, + "step": 5963, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.5895788016610639, + "step": 5963, + "train/total_loss": 0.21355277299880981 + }, + { + "entropy": 8.924026489257812, + "epoch": 0.5896776745105794, + "mean_token_accuracy": 0.6829574108123779, + "num_tokens": 10220411.0, + "step": 5964, + "train/ce_loss": 1.4706043004989624 + }, + { + "epoch": 0.5896776745105794, + "step": 5964, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.5896776745105794, + "step": 5964, + "train/total_loss": 0.24471668899059296 + }, + { + "entropy": 9.242369651794434, + "epoch": 0.5897765473600949, + "mean_token_accuracy": 0.8139534592628479, + "num_tokens": 10225578.0, + "step": 5965, + "train/ce_loss": 0.7793514728546143 + }, + { + "epoch": 0.5897765473600949, + "step": 5965, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5897765473600949, + "step": 5965, + "train/total_loss": 0.14043515920639038 + }, + { + "entropy": 8.867958068847656, + "epoch": 0.5898754202096105, + "mean_token_accuracy": 0.7586206793785095, + "num_tokens": 10230797.0, + "step": 5966, + "train/ce_loss": 1.9965153932571411 + }, + { + "epoch": 0.5898754202096105, + "step": 5966, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.5898754202096105, + "step": 5966, + "train/total_loss": 0.3168390393257141 + }, + { + "entropy": 8.562590599060059, + "epoch": 0.589974293059126, + "mean_token_accuracy": 0.712990939617157, + "num_tokens": 10236311.0, + "step": 5967, + "train/ce_loss": 0.9384071826934814 + }, + { + "epoch": 0.589974293059126, + "step": 5967, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.589974293059126, + "step": 5967, + "train/total_loss": 0.17196571826934814 + }, + { + "entropy": 9.240837097167969, + "epoch": 0.5900731659086415, + "mean_token_accuracy": 0.6978852152824402, + "num_tokens": 10241585.0, + "step": 5968, + "train/ce_loss": 0.8842914700508118 + }, + { + "epoch": 0.5900731659086415, + "step": 5968, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.5900731659086415, + "step": 5968, + "train/total_loss": 0.18217915296554565 + }, + { + "entropy": 8.418539047241211, + "epoch": 0.590172038758157, + "mean_token_accuracy": 0.7446808218955994, + "num_tokens": 10247084.0, + "step": 5969, + "train/ce_loss": 0.591464638710022 + }, + { + "epoch": 0.590172038758157, + "step": 5969, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.590172038758157, + "step": 5969, + "train/total_loss": 0.0903964638710022 + }, + { + "entropy": 9.232823371887207, + "epoch": 0.5902709116076725, + "mean_token_accuracy": 0.7107195258140564, + "num_tokens": 10252373.0, + "step": 5970, + "train/ce_loss": 1.2386114597320557 + }, + { + "epoch": 0.5902709116076725, + "step": 5970, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5902709116076725, + "step": 5970, + "train/total_loss": 0.1863611489534378 + }, + { + "entropy": 9.246217727661133, + "epoch": 0.590369784457188, + "mean_token_accuracy": 0.7820324301719666, + "num_tokens": 10257523.0, + "step": 5971, + "train/ce_loss": 0.5982750654220581 + }, + { + "epoch": 0.590369784457188, + "step": 5971, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.590369784457188, + "step": 5971, + "train/total_loss": 0.09498375654220581 + }, + { + "entropy": 8.83492660522461, + "epoch": 0.5904686573067036, + "mean_token_accuracy": 0.7675489187240601, + "num_tokens": 10262878.0, + "step": 5972, + "train/ce_loss": 1.4663736820220947 + }, + { + "epoch": 0.5904686573067036, + "step": 5972, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.5904686573067036, + "step": 5972, + "train/total_loss": 0.16226236522197723 + }, + { + "entropy": 9.27004623413086, + "epoch": 0.5905675301562191, + "mean_token_accuracy": 0.7492997050285339, + "num_tokens": 10268046.0, + "step": 5973, + "train/ce_loss": 0.9438756704330444 + }, + { + "epoch": 0.5905675301562191, + "step": 5973, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5905675301562191, + "step": 5973, + "train/total_loss": 0.11391881853342056 + }, + { + "entropy": 8.810379028320312, + "epoch": 0.5906664030057346, + "mean_token_accuracy": 0.7188940048217773, + "num_tokens": 10273367.0, + "step": 5974, + "train/ce_loss": 0.8861865997314453 + }, + { + "epoch": 0.5906664030057346, + "step": 5974, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5906664030057346, + "step": 5974, + "train/total_loss": 0.12377490848302841 + }, + { + "entropy": 9.149025917053223, + "epoch": 0.5907652758552502, + "mean_token_accuracy": 0.7546916604042053, + "num_tokens": 10278541.0, + "step": 5975, + "train/ce_loss": 0.8311251997947693 + }, + { + "epoch": 0.5907652758552502, + "step": 5975, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.5907652758552502, + "step": 5975, + "train/total_loss": 0.16905002295970917 + }, + { + "entropy": 8.849111557006836, + "epoch": 0.5908641487047657, + "mean_token_accuracy": 0.7751091718673706, + "num_tokens": 10283950.0, + "step": 5976, + "train/ce_loss": 0.901870608329773 + }, + { + "epoch": 0.5908641487047657, + "step": 5976, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5908641487047657, + "step": 5976, + "train/total_loss": 0.14878082275390625 + }, + { + "entropy": 8.77665901184082, + "epoch": 0.5909630215542812, + "mean_token_accuracy": 0.6936842203140259, + "num_tokens": 10289340.0, + "step": 5977, + "train/ce_loss": 1.39908766746521 + }, + { + "epoch": 0.5909630215542812, + "step": 5977, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.5909630215542812, + "step": 5977, + "train/total_loss": 0.2297525256872177 + }, + { + "entropy": 9.108182907104492, + "epoch": 0.5910618944037968, + "mean_token_accuracy": 0.8383084535598755, + "num_tokens": 10294535.0, + "step": 5978, + "train/ce_loss": 0.6628880500793457 + }, + { + "epoch": 0.5910618944037968, + "step": 5978, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5910618944037968, + "step": 5978, + "train/total_loss": 0.08582005649805069 + }, + { + "entropy": 8.767389297485352, + "epoch": 0.5911607672533122, + "mean_token_accuracy": 0.7164339423179626, + "num_tokens": 10299907.0, + "step": 5979, + "train/ce_loss": 0.8791292309761047 + }, + { + "epoch": 0.5911607672533122, + "step": 5979, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5911607672533122, + "step": 5979, + "train/total_loss": 0.134787917137146 + }, + { + "epoch": 0.5912596401028277, + "grad_norm": 0.6115033626556396, + "learning_rate": 8.52420511299016e-06, + "loss": 0.1422, + "step": 5980 + }, + { + "entropy": 9.157818794250488, + "epoch": 0.5912596401028277, + "mean_token_accuracy": 0.7220670580863953, + "num_tokens": 10305099.0, + "step": 5980, + "train/ce_loss": 1.5501344203948975 + }, + { + "epoch": 0.5912596401028277, + "step": 5980, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5912596401028277, + "step": 5980, + "train/total_loss": 0.21751344203948975 + }, + { + "entropy": 9.237415313720703, + "epoch": 0.5913585129523433, + "mean_token_accuracy": 0.7444794774055481, + "num_tokens": 10310359.0, + "step": 5981, + "train/ce_loss": 0.811318576335907 + }, + { + "epoch": 0.5913585129523433, + "step": 5981, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.5913585129523433, + "step": 5981, + "train/total_loss": 0.20222561061382294 + }, + { + "entropy": 9.38630485534668, + "epoch": 0.5914573858018588, + "mean_token_accuracy": 0.7755681872367859, + "num_tokens": 10315509.0, + "step": 5982, + "train/ce_loss": 0.9271148443222046 + }, + { + "epoch": 0.5914573858018588, + "step": 5982, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.5914573858018588, + "step": 5982, + "train/total_loss": 0.20599272847175598 + }, + { + "entropy": 8.738609313964844, + "epoch": 0.5915562586513743, + "mean_token_accuracy": 0.7485648393630981, + "num_tokens": 10320852.0, + "step": 5983, + "train/ce_loss": 0.9920079112052917 + }, + { + "epoch": 0.5915562586513743, + "step": 5983, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5915562586513743, + "step": 5983, + "train/total_loss": 0.1617007851600647 + }, + { + "entropy": 9.557707786560059, + "epoch": 0.5916551315008899, + "mean_token_accuracy": 0.7597955465316772, + "num_tokens": 10325898.0, + "step": 5984, + "train/ce_loss": 0.8455591201782227 + }, + { + "epoch": 0.5916551315008899, + "step": 5984, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5916551315008899, + "step": 5984, + "train/total_loss": 0.13533717393875122 + }, + { + "entropy": 8.817047119140625, + "epoch": 0.5917540043504054, + "mean_token_accuracy": 0.7829099297523499, + "num_tokens": 10331184.0, + "step": 5985, + "train/ce_loss": 0.9472475647926331 + }, + { + "epoch": 0.5917540043504054, + "step": 5985, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5917540043504054, + "step": 5985, + "train/total_loss": 0.14941225945949554 + }, + { + "entropy": 9.066792488098145, + "epoch": 0.5918528771999209, + "mean_token_accuracy": 0.7258297204971313, + "num_tokens": 10336377.0, + "step": 5986, + "train/ce_loss": 3.119315124422428e-06 + }, + { + "epoch": 0.5918528771999209, + "step": 5986, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5918528771999209, + "step": 5986, + "train/total_loss": 0.05859406292438507 + }, + { + "entropy": 8.724923133850098, + "epoch": 0.5919517500494365, + "mean_token_accuracy": 0.7150654792785645, + "num_tokens": 10341818.0, + "step": 5987, + "train/ce_loss": 0.7353115081787109 + }, + { + "epoch": 0.5919517500494365, + "step": 5987, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.5919517500494365, + "step": 5987, + "train/total_loss": 0.1594686508178711 + }, + { + "entropy": 9.618450164794922, + "epoch": 0.5920506228989519, + "mean_token_accuracy": 0.7329649925231934, + "num_tokens": 10346779.0, + "step": 5988, + "train/ce_loss": 2.397352933883667 + }, + { + "epoch": 0.5920506228989519, + "step": 5988, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5920506228989519, + "step": 5988, + "train/total_loss": 0.27489155530929565 + }, + { + "entropy": 8.573394775390625, + "epoch": 0.5921494957484674, + "mean_token_accuracy": 0.7003710865974426, + "num_tokens": 10352328.0, + "step": 5989, + "train/ce_loss": 0.8444566130638123 + }, + { + "epoch": 0.5921494957484674, + "step": 5989, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5921494957484674, + "step": 5989, + "train/total_loss": 0.12350816279649734 + }, + { + "entropy": 8.723489761352539, + "epoch": 0.592248368597983, + "mean_token_accuracy": 0.7549824118614197, + "num_tokens": 10357679.0, + "step": 5990, + "train/ce_loss": 0.5021554231643677 + }, + { + "epoch": 0.592248368597983, + "step": 5990, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.592248368597983, + "step": 5990, + "train/total_loss": 0.08927804231643677 + }, + { + "entropy": 9.586736679077148, + "epoch": 0.5923472414474985, + "mean_token_accuracy": 0.6982142925262451, + "num_tokens": 10362687.0, + "step": 5991, + "train/ce_loss": 1.5734952967250138e-06 + }, + { + "epoch": 0.5923472414474985, + "step": 5991, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5923472414474985, + "step": 5991, + "train/total_loss": 0.06250015646219254 + }, + { + "entropy": 9.379465103149414, + "epoch": 0.592446114297014, + "mean_token_accuracy": 0.6677471399307251, + "num_tokens": 10367729.0, + "step": 5992, + "train/ce_loss": 1.6429595947265625 + }, + { + "epoch": 0.592446114297014, + "step": 5992, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.592446114297014, + "step": 5992, + "train/total_loss": 0.211170956492424 + }, + { + "entropy": 9.429786682128906, + "epoch": 0.5925449871465296, + "mean_token_accuracy": 0.7732864618301392, + "num_tokens": 10372714.0, + "step": 5993, + "train/ce_loss": 0.835451066493988 + }, + { + "epoch": 0.5925449871465296, + "step": 5993, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.5925449871465296, + "step": 5993, + "train/total_loss": 0.16167011857032776 + }, + { + "entropy": 9.059162139892578, + "epoch": 0.5926438599960451, + "mean_token_accuracy": 0.6840620636940002, + "num_tokens": 10377885.0, + "step": 5994, + "train/ce_loss": 4.421832727530273e-06 + }, + { + "epoch": 0.5926438599960451, + "step": 5994, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5926438599960451, + "step": 5994, + "train/total_loss": 0.04687544330954552 + }, + { + "entropy": 9.235883712768555, + "epoch": 0.5927427328455606, + "mean_token_accuracy": 0.7774648070335388, + "num_tokens": 10383039.0, + "step": 5995, + "train/ce_loss": 0.6720249056816101 + }, + { + "epoch": 0.5927427328455606, + "step": 5995, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.5927427328455606, + "step": 5995, + "train/total_loss": 0.20392124354839325 + }, + { + "entropy": 9.269105911254883, + "epoch": 0.5928416056950762, + "mean_token_accuracy": 0.7384615540504456, + "num_tokens": 10388308.0, + "step": 5996, + "train/ce_loss": 1.1394755840301514 + }, + { + "epoch": 0.5928416056950762, + "step": 5996, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5928416056950762, + "step": 5996, + "train/total_loss": 0.1803538203239441 + }, + { + "entropy": 9.188507080078125, + "epoch": 0.5929404785445916, + "mean_token_accuracy": 0.7356475591659546, + "num_tokens": 10393529.0, + "step": 5997, + "train/ce_loss": 0.7828561663627625 + }, + { + "epoch": 0.5929404785445916, + "step": 5997, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5929404785445916, + "step": 5997, + "train/total_loss": 0.12516061961650848 + }, + { + "entropy": 8.738287925720215, + "epoch": 0.5930393513941071, + "mean_token_accuracy": 0.7173333168029785, + "num_tokens": 10398722.0, + "step": 5998, + "train/ce_loss": 0.8791782259941101 + }, + { + "epoch": 0.5930393513941071, + "step": 5998, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.5930393513941071, + "step": 5998, + "train/total_loss": 0.16604283452033997 + }, + { + "entropy": 9.403421401977539, + "epoch": 0.5931382242436227, + "mean_token_accuracy": 0.6938775777816772, + "num_tokens": 10403738.0, + "step": 5999, + "train/ce_loss": 1.8218058347702026 + }, + { + "epoch": 0.5931382242436227, + "step": 5999, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5931382242436227, + "step": 5999, + "train/total_loss": 0.25249308347702026 + }, + { + "epoch": 0.5932370970931382, + "grad_norm": 0.835832953453064, + "learning_rate": 8.519260248232211e-06, + "loss": 0.1488, + "step": 6000 + }, + { + "entropy": 9.12228775024414, + "epoch": 0.5932370970931382, + "mean_token_accuracy": 0.7197368144989014, + "num_tokens": 10408978.0, + "step": 6000, + "train/ce_loss": 0.6845834851264954 + }, + { + "epoch": 0.5932370970931382, + "step": 6000, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.5932370970931382, + "step": 6000, + "train/total_loss": 0.15048959851264954 + }, + { + "entropy": 8.558881759643555, + "epoch": 0.5933359699426537, + "mean_token_accuracy": 0.7311272025108337, + "num_tokens": 10414416.0, + "step": 6001, + "train/ce_loss": 0.7443356513977051 + }, + { + "epoch": 0.5933359699426537, + "step": 6001, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5933359699426537, + "step": 6001, + "train/total_loss": 0.1252148151397705 + }, + { + "entropy": 8.505014419555664, + "epoch": 0.5934348427921693, + "mean_token_accuracy": 0.7942631244659424, + "num_tokens": 10419925.0, + "step": 6002, + "train/ce_loss": 0.6493032574653625 + }, + { + "epoch": 0.5934348427921693, + "step": 6002, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5934348427921693, + "step": 6002, + "train/total_loss": 0.10008657723665237 + }, + { + "entropy": 9.107410430908203, + "epoch": 0.5935337156416848, + "mean_token_accuracy": 0.7581967115402222, + "num_tokens": 10425124.0, + "step": 6003, + "train/ce_loss": 0.5387682914733887 + }, + { + "epoch": 0.5935337156416848, + "step": 6003, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5935337156416848, + "step": 6003, + "train/total_loss": 0.1007518321275711 + }, + { + "entropy": 9.057453155517578, + "epoch": 0.5936325884912003, + "mean_token_accuracy": 0.7148760557174683, + "num_tokens": 10430300.0, + "step": 6004, + "train/ce_loss": 1.7691458463668823 + }, + { + "epoch": 0.5936325884912003, + "step": 6004, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5936325884912003, + "step": 6004, + "train/total_loss": 0.21988333761692047 + }, + { + "entropy": 8.704456329345703, + "epoch": 0.5937314613407159, + "mean_token_accuracy": 0.7312373518943787, + "num_tokens": 10435742.0, + "step": 6005, + "train/ce_loss": 0.41926902532577515 + }, + { + "epoch": 0.5937314613407159, + "step": 6005, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.5937314613407159, + "step": 6005, + "train/total_loss": 0.12005190551280975 + }, + { + "entropy": 8.764145851135254, + "epoch": 0.5938303341902313, + "mean_token_accuracy": 0.7616060376167297, + "num_tokens": 10441015.0, + "step": 6006, + "train/ce_loss": 1.517891764640808 + }, + { + "epoch": 0.5938303341902313, + "step": 6006, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5938303341902313, + "step": 6006, + "train/total_loss": 0.20257042348384857 + }, + { + "entropy": 8.942367553710938, + "epoch": 0.5939292070397468, + "mean_token_accuracy": 0.7817142605781555, + "num_tokens": 10446363.0, + "step": 6007, + "train/ce_loss": 0.6082258820533752 + }, + { + "epoch": 0.5939292070397468, + "step": 6007, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.5939292070397468, + "step": 6007, + "train/total_loss": 0.16238509118556976 + }, + { + "entropy": 9.576613426208496, + "epoch": 0.5940280798892624, + "mean_token_accuracy": 0.7442922592163086, + "num_tokens": 10451277.0, + "step": 6008, + "train/ce_loss": 2.372051540078246e-06 + }, + { + "epoch": 0.5940280798892624, + "step": 6008, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.5940280798892624, + "step": 6008, + "train/total_loss": 0.027343986555933952 + }, + { + "entropy": 9.160564422607422, + "epoch": 0.5941269527387779, + "mean_token_accuracy": 0.6874135732650757, + "num_tokens": 10456436.0, + "step": 6009, + "train/ce_loss": 0.8028679490089417 + }, + { + "epoch": 0.5941269527387779, + "step": 6009, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.5941269527387779, + "step": 6009, + "train/total_loss": 0.16231805086135864 + }, + { + "entropy": 9.258647918701172, + "epoch": 0.5942258255882934, + "mean_token_accuracy": 0.7239263653755188, + "num_tokens": 10461502.0, + "step": 6010, + "train/ce_loss": 2.2997858195594745e-06 + }, + { + "epoch": 0.5942258255882934, + "step": 6010, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5942258255882934, + "step": 6010, + "train/total_loss": 0.050781480967998505 + }, + { + "entropy": 8.781116485595703, + "epoch": 0.594324698437809, + "mean_token_accuracy": 0.7196819186210632, + "num_tokens": 10466969.0, + "step": 6011, + "train/ce_loss": 0.5423060059547424 + }, + { + "epoch": 0.594324698437809, + "step": 6011, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.594324698437809, + "step": 6011, + "train/total_loss": 0.08157435059547424 + }, + { + "entropy": 9.20504379272461, + "epoch": 0.5944235712873245, + "mean_token_accuracy": 0.7269841432571411, + "num_tokens": 10472046.0, + "step": 6012, + "train/ce_loss": 1.0770103244794882e-06 + }, + { + "epoch": 0.5944235712873245, + "step": 6012, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5944235712873245, + "step": 6012, + "train/total_loss": 0.031250108033418655 + }, + { + "entropy": 9.047160148620605, + "epoch": 0.5945224441368401, + "mean_token_accuracy": 0.7447698712348938, + "num_tokens": 10477235.0, + "step": 6013, + "train/ce_loss": 0.8327636122703552 + }, + { + "epoch": 0.5945224441368401, + "step": 6013, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5945224441368401, + "step": 6013, + "train/total_loss": 0.10671386122703552 + }, + { + "entropy": 9.16783332824707, + "epoch": 0.5946213169863556, + "mean_token_accuracy": 0.7651515007019043, + "num_tokens": 10482350.0, + "step": 6014, + "train/ce_loss": 0.9572628140449524 + }, + { + "epoch": 0.5946213169863556, + "step": 6014, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.5946213169863556, + "step": 6014, + "train/total_loss": 0.17775753140449524 + }, + { + "entropy": 9.01803970336914, + "epoch": 0.594720189835871, + "mean_token_accuracy": 0.7959442138671875, + "num_tokens": 10487629.0, + "step": 6015, + "train/ce_loss": 0.5711748003959656 + }, + { + "epoch": 0.594720189835871, + "step": 6015, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.594720189835871, + "step": 6015, + "train/total_loss": 0.11180497705936432 + }, + { + "entropy": 9.087404251098633, + "epoch": 0.5948190626853866, + "mean_token_accuracy": 0.7900000214576721, + "num_tokens": 10492839.0, + "step": 6016, + "train/ce_loss": 0.7471591234207153 + }, + { + "epoch": 0.5948190626853866, + "step": 6016, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5948190626853866, + "step": 6016, + "train/total_loss": 0.11377841234207153 + }, + { + "entropy": 8.938117980957031, + "epoch": 0.5949179355349021, + "mean_token_accuracy": 0.7794316411018372, + "num_tokens": 10498024.0, + "step": 6017, + "train/ce_loss": 0.7339110970497131 + }, + { + "epoch": 0.5949179355349021, + "step": 6017, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.5949179355349021, + "step": 6017, + "train/total_loss": 0.12417235970497131 + }, + { + "entropy": 8.622573852539062, + "epoch": 0.5950168083844176, + "mean_token_accuracy": 0.7354211807250977, + "num_tokens": 10503454.0, + "step": 6018, + "train/ce_loss": 1.3949031829833984 + }, + { + "epoch": 0.5950168083844176, + "step": 6018, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5950168083844176, + "step": 6018, + "train/total_loss": 0.20589657127857208 + }, + { + "entropy": 8.990918159484863, + "epoch": 0.5951156812339332, + "mean_token_accuracy": 0.735336184501648, + "num_tokens": 10508657.0, + "step": 6019, + "train/ce_loss": 0.8849911093711853 + }, + { + "epoch": 0.5951156812339332, + "step": 6019, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5951156812339332, + "step": 6019, + "train/total_loss": 0.14318661391735077 + }, + { + "epoch": 0.5952145540834487, + "grad_norm": 0.8095764517784119, + "learning_rate": 8.514315383474263e-06, + "loss": 0.1375, + "step": 6020 + }, + { + "entropy": 8.447221755981445, + "epoch": 0.5952145540834487, + "mean_token_accuracy": 0.7607361674308777, + "num_tokens": 10514128.0, + "step": 6020, + "train/ce_loss": 1.076474905014038 + }, + { + "epoch": 0.5952145540834487, + "step": 6020, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5952145540834487, + "step": 6020, + "train/total_loss": 0.15452249348163605 + }, + { + "entropy": 9.175122261047363, + "epoch": 0.5953134269329642, + "mean_token_accuracy": 0.7266982793807983, + "num_tokens": 10519221.0, + "step": 6021, + "train/ce_loss": 1.7625705003738403 + }, + { + "epoch": 0.5953134269329642, + "step": 6021, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.5953134269329642, + "step": 6021, + "train/total_loss": 0.27000707387924194 + }, + { + "entropy": 9.325368881225586, + "epoch": 0.5954122997824798, + "mean_token_accuracy": 0.7176870703697205, + "num_tokens": 10524291.0, + "step": 6022, + "train/ce_loss": 0.9657346606254578 + }, + { + "epoch": 0.5954122997824798, + "step": 6022, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5954122997824798, + "step": 6022, + "train/total_loss": 0.15126097202301025 + }, + { + "entropy": 8.491939544677734, + "epoch": 0.5955111726319953, + "mean_token_accuracy": 0.7690721750259399, + "num_tokens": 10529793.0, + "step": 6023, + "train/ce_loss": 0.39177945256233215 + }, + { + "epoch": 0.5955111726319953, + "step": 6023, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5955111726319953, + "step": 6023, + "train/total_loss": 0.07042794674634933 + }, + { + "entropy": 9.656220436096191, + "epoch": 0.5956100454815108, + "mean_token_accuracy": 0.733031690120697, + "num_tokens": 10534646.0, + "step": 6024, + "train/ce_loss": 1.2732746601104736 + }, + { + "epoch": 0.5956100454815108, + "step": 6024, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.5956100454815108, + "step": 6024, + "train/total_loss": 0.17029622197151184 + }, + { + "entropy": 9.38523006439209, + "epoch": 0.5957089183310263, + "mean_token_accuracy": 0.6588419675827026, + "num_tokens": 10539704.0, + "step": 6025, + "train/ce_loss": 1.8997355699539185 + }, + { + "epoch": 0.5957089183310263, + "step": 6025, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5957089183310263, + "step": 6025, + "train/total_loss": 0.24466106295585632 + }, + { + "entropy": 8.721146583557129, + "epoch": 0.5958077911805418, + "mean_token_accuracy": 0.7536800503730774, + "num_tokens": 10545233.0, + "step": 6026, + "train/ce_loss": 0.39012497663497925 + }, + { + "epoch": 0.5958077911805418, + "step": 6026, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5958077911805418, + "step": 6026, + "train/total_loss": 0.058543749153614044 + }, + { + "entropy": 9.056755065917969, + "epoch": 0.5959066640300573, + "mean_token_accuracy": 0.7568305730819702, + "num_tokens": 10550422.0, + "step": 6027, + "train/ce_loss": 0.6360731720924377 + }, + { + "epoch": 0.5959066640300573, + "step": 6027, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5959066640300573, + "step": 6027, + "train/total_loss": 0.08704482018947601 + }, + { + "entropy": 8.49323844909668, + "epoch": 0.5960055368795729, + "mean_token_accuracy": 0.6995798349380493, + "num_tokens": 10555866.0, + "step": 6028, + "train/ce_loss": 1.7057461738586426 + }, + { + "epoch": 0.5960055368795729, + "step": 6028, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5960055368795729, + "step": 6028, + "train/total_loss": 0.2096371203660965 + }, + { + "entropy": 9.338155746459961, + "epoch": 0.5961044097290884, + "mean_token_accuracy": 0.7523961663246155, + "num_tokens": 10560934.0, + "step": 6029, + "train/ce_loss": 0.5146098136901855 + }, + { + "epoch": 0.5961044097290884, + "step": 6029, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5961044097290884, + "step": 6029, + "train/total_loss": 0.11396098136901855 + }, + { + "entropy": 9.117780685424805, + "epoch": 0.5962032825786039, + "mean_token_accuracy": 0.7988826632499695, + "num_tokens": 10566128.0, + "step": 6030, + "train/ce_loss": 0.658829927444458 + }, + { + "epoch": 0.5962032825786039, + "step": 6030, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5962032825786039, + "step": 6030, + "train/total_loss": 0.08541424572467804 + }, + { + "entropy": 8.720762252807617, + "epoch": 0.5963021554281195, + "mean_token_accuracy": 0.7643391489982605, + "num_tokens": 10571325.0, + "step": 6031, + "train/ce_loss": 1.2273279428482056 + }, + { + "epoch": 0.5963021554281195, + "step": 6031, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5963021554281195, + "step": 6031, + "train/total_loss": 0.18132653832435608 + }, + { + "entropy": 8.94140625, + "epoch": 0.596401028277635, + "mean_token_accuracy": 0.8042269349098206, + "num_tokens": 10576682.0, + "step": 6032, + "train/ce_loss": 0.35526126623153687 + }, + { + "epoch": 0.596401028277635, + "step": 6032, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.596401028277635, + "step": 6032, + "train/total_loss": 0.051151126623153687 + }, + { + "entropy": 9.405384063720703, + "epoch": 0.5964999011271505, + "mean_token_accuracy": 0.7786116600036621, + "num_tokens": 10581685.0, + "step": 6033, + "train/ce_loss": 0.8011324405670166 + }, + { + "epoch": 0.5964999011271505, + "step": 6033, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.5964999011271505, + "step": 6033, + "train/total_loss": 0.1113632470369339 + }, + { + "entropy": 9.004520416259766, + "epoch": 0.596598773976666, + "mean_token_accuracy": 0.7183908224105835, + "num_tokens": 10586975.0, + "step": 6034, + "train/ce_loss": 2.220550775527954 + }, + { + "epoch": 0.596598773976666, + "step": 6034, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.596598773976666, + "step": 6034, + "train/total_loss": 0.2845550775527954 + }, + { + "entropy": 9.192344665527344, + "epoch": 0.5966976468261815, + "mean_token_accuracy": 0.6920821070671082, + "num_tokens": 10592166.0, + "step": 6035, + "train/ce_loss": 0.9314236640930176 + }, + { + "epoch": 0.5966976468261815, + "step": 6035, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.5966976468261815, + "step": 6035, + "train/total_loss": 0.16736111044883728 + }, + { + "entropy": 8.852960586547852, + "epoch": 0.596796519675697, + "mean_token_accuracy": 0.7294981479644775, + "num_tokens": 10597426.0, + "step": 6036, + "train/ce_loss": 0.7451646327972412 + }, + { + "epoch": 0.596796519675697, + "step": 6036, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.596796519675697, + "step": 6036, + "train/total_loss": 0.10576646775007248 + }, + { + "entropy": 8.74105453491211, + "epoch": 0.5968953925252126, + "mean_token_accuracy": 0.6511024832725525, + "num_tokens": 10602694.0, + "step": 6037, + "train/ce_loss": 1.219413161277771 + }, + { + "epoch": 0.5968953925252126, + "step": 6037, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.5968953925252126, + "step": 6037, + "train/total_loss": 0.22350382804870605 + }, + { + "entropy": 8.915050506591797, + "epoch": 0.5969942653747281, + "mean_token_accuracy": 0.7262569665908813, + "num_tokens": 10607843.0, + "step": 6038, + "train/ce_loss": 0.4067918658256531 + }, + { + "epoch": 0.5969942653747281, + "step": 6038, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5969942653747281, + "step": 6038, + "train/total_loss": 0.09927293658256531 + }, + { + "entropy": 9.184982299804688, + "epoch": 0.5970931382242436, + "mean_token_accuracy": 0.7351852059364319, + "num_tokens": 10612812.0, + "step": 6039, + "train/ce_loss": 0.997426450252533 + }, + { + "epoch": 0.5970931382242436, + "step": 6039, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5970931382242436, + "step": 6039, + "train/total_loss": 0.15833640098571777 + }, + { + "epoch": 0.5971920110737592, + "grad_norm": 0.8017409443855286, + "learning_rate": 8.509370518716314e-06, + "loss": 0.1375, + "step": 6040 + }, + { + "entropy": 8.831388473510742, + "epoch": 0.5971920110737592, + "mean_token_accuracy": 0.7482837438583374, + "num_tokens": 10618167.0, + "step": 6040, + "train/ce_loss": 1.6475485153932823e-06 + }, + { + "epoch": 0.5971920110737592, + "step": 6040, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5971920110737592, + "step": 6040, + "train/total_loss": 0.03906266391277313 + }, + { + "entropy": 9.149188995361328, + "epoch": 0.5972908839232747, + "mean_token_accuracy": 0.7090619802474976, + "num_tokens": 10623212.0, + "step": 6041, + "train/ce_loss": 2.4567166292399634e-06 + }, + { + "epoch": 0.5972908839232747, + "step": 6041, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5972908839232747, + "step": 6041, + "train/total_loss": 0.0351564958691597 + }, + { + "entropy": 9.088918685913086, + "epoch": 0.5973897567727902, + "mean_token_accuracy": 0.7598314881324768, + "num_tokens": 10628336.0, + "step": 6042, + "train/ce_loss": 1.2687687873840332 + }, + { + "epoch": 0.5973897567727902, + "step": 6042, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5973897567727902, + "step": 6042, + "train/total_loss": 0.17375187575817108 + }, + { + "entropy": 9.040409088134766, + "epoch": 0.5974886296223058, + "mean_token_accuracy": 0.7434841990470886, + "num_tokens": 10633439.0, + "step": 6043, + "train/ce_loss": 1.3814735412597656 + }, + { + "epoch": 0.5974886296223058, + "step": 6043, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.5974886296223058, + "step": 6043, + "train/total_loss": 0.20064735412597656 + }, + { + "entropy": 9.518924713134766, + "epoch": 0.5975875024718212, + "mean_token_accuracy": 0.7703180313110352, + "num_tokens": 10638424.0, + "step": 6044, + "train/ce_loss": 1.0740244388580322 + }, + { + "epoch": 0.5975875024718212, + "step": 6044, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5975875024718212, + "step": 6044, + "train/total_loss": 0.17380869388580322 + }, + { + "entropy": 8.724679946899414, + "epoch": 0.5976863753213367, + "mean_token_accuracy": 0.731452465057373, + "num_tokens": 10643886.0, + "step": 6045, + "train/ce_loss": 0.7668578028678894 + }, + { + "epoch": 0.5976863753213367, + "step": 6045, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.5976863753213367, + "step": 6045, + "train/total_loss": 0.15871703624725342 + }, + { + "entropy": 9.082891464233398, + "epoch": 0.5977852481708523, + "mean_token_accuracy": 0.6972602605819702, + "num_tokens": 10648984.0, + "step": 6046, + "train/ce_loss": 2.7422502171248198e-06 + }, + { + "epoch": 0.5977852481708523, + "step": 6046, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.5977852481708523, + "step": 6046, + "train/total_loss": 0.046875275671482086 + }, + { + "entropy": 9.151057243347168, + "epoch": 0.5978841210203678, + "mean_token_accuracy": 0.832647442817688, + "num_tokens": 10654140.0, + "step": 6047, + "train/ce_loss": 5.389683792600408e-07 + }, + { + "epoch": 0.5978841210203678, + "step": 6047, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5978841210203678, + "step": 6047, + "train/total_loss": 0.019531304016709328 + }, + { + "entropy": 9.38824462890625, + "epoch": 0.5979829938698833, + "mean_token_accuracy": 0.7116736769676208, + "num_tokens": 10659254.0, + "step": 6048, + "train/ce_loss": 0.9343109130859375 + }, + { + "epoch": 0.5979829938698833, + "step": 6048, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5979829938698833, + "step": 6048, + "train/total_loss": 0.11296234279870987 + }, + { + "entropy": 9.886924743652344, + "epoch": 0.5980818667193989, + "mean_token_accuracy": 0.6893203854560852, + "num_tokens": 10664054.0, + "step": 6049, + "train/ce_loss": 1.4633288383483887 + }, + { + "epoch": 0.5980818667193989, + "step": 6049, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.5980818667193989, + "step": 6049, + "train/total_loss": 0.21273913979530334 + }, + { + "entropy": 9.636835098266602, + "epoch": 0.5981807395689144, + "mean_token_accuracy": 0.7022900581359863, + "num_tokens": 10668863.0, + "step": 6050, + "train/ce_loss": 2.3065342903137207 + }, + { + "epoch": 0.5981807395689144, + "step": 6050, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.5981807395689144, + "step": 6050, + "train/total_loss": 0.34002843499183655 + }, + { + "entropy": 8.877485275268555, + "epoch": 0.5982796124184299, + "mean_token_accuracy": 0.8215129971504211, + "num_tokens": 10674161.0, + "step": 6051, + "train/ce_loss": 0.936775803565979 + }, + { + "epoch": 0.5982796124184299, + "step": 6051, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5982796124184299, + "step": 6051, + "train/total_loss": 0.1171150803565979 + }, + { + "entropy": 9.124277114868164, + "epoch": 0.5983784852679455, + "mean_token_accuracy": 0.7508590817451477, + "num_tokens": 10679211.0, + "step": 6052, + "train/ce_loss": 3.786348315770738e-06 + }, + { + "epoch": 0.5983784852679455, + "step": 6052, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5983784852679455, + "step": 6052, + "train/total_loss": 0.03515662997961044 + }, + { + "entropy": 9.106124877929688, + "epoch": 0.598477358117461, + "mean_token_accuracy": 0.7250945568084717, + "num_tokens": 10684475.0, + "step": 6053, + "train/ce_loss": 0.4788050949573517 + }, + { + "epoch": 0.598477358117461, + "step": 6053, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.598477358117461, + "step": 6053, + "train/total_loss": 0.08303676545619965 + }, + { + "entropy": 8.839000701904297, + "epoch": 0.5985762309669764, + "mean_token_accuracy": 0.7988505959510803, + "num_tokens": 10689626.0, + "step": 6054, + "train/ce_loss": 2.159317546102102e-06 + }, + { + "epoch": 0.5985762309669764, + "step": 6054, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.5985762309669764, + "step": 6054, + "train/total_loss": 0.03906271606683731 + }, + { + "entropy": 9.599370956420898, + "epoch": 0.598675103816492, + "mean_token_accuracy": 0.6694214940071106, + "num_tokens": 10694559.0, + "step": 6055, + "train/ce_loss": 1.485620941821253e-06 + }, + { + "epoch": 0.598675103816492, + "step": 6055, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.598675103816492, + "step": 6055, + "train/total_loss": 0.04296889901161194 + }, + { + "entropy": 9.249372482299805, + "epoch": 0.5987739766660075, + "mean_token_accuracy": 0.7612179517745972, + "num_tokens": 10699634.0, + "step": 6056, + "train/ce_loss": 1.3209097385406494 + }, + { + "epoch": 0.5987739766660075, + "step": 6056, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.5987739766660075, + "step": 6056, + "train/total_loss": 0.1906847208738327 + }, + { + "entropy": 9.658489227294922, + "epoch": 0.598872849515523, + "mean_token_accuracy": 0.7472727298736572, + "num_tokens": 10704629.0, + "step": 6057, + "train/ce_loss": 0.8913617730140686 + }, + { + "epoch": 0.598872849515523, + "step": 6057, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.598872849515523, + "step": 6057, + "train/total_loss": 0.16726118326187134 + }, + { + "entropy": 9.664912223815918, + "epoch": 0.5989717223650386, + "mean_token_accuracy": 0.8282442688941956, + "num_tokens": 10709560.0, + "step": 6058, + "train/ce_loss": 9.336384323432867e-07 + }, + { + "epoch": 0.5989717223650386, + "step": 6058, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.5989717223650386, + "step": 6058, + "train/total_loss": 0.01953134313225746 + }, + { + "entropy": 8.653030395507812, + "epoch": 0.5990705952145541, + "mean_token_accuracy": 0.7394514679908752, + "num_tokens": 10714994.0, + "step": 6059, + "train/ce_loss": 0.7872945666313171 + }, + { + "epoch": 0.5990705952145541, + "step": 6059, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.5990705952145541, + "step": 6059, + "train/total_loss": 0.13341695070266724 + }, + { + "epoch": 0.5991694680640696, + "grad_norm": 0.7073284983634949, + "learning_rate": 8.504425653958364e-06, + "loss": 0.1326, + "step": 6060 + }, + { + "entropy": 9.837320327758789, + "epoch": 0.5991694680640696, + "mean_token_accuracy": 0.6827794313430786, + "num_tokens": 10719723.0, + "step": 6060, + "train/ce_loss": 1.9000295400619507 + }, + { + "epoch": 0.5991694680640696, + "step": 6060, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.5991694680640696, + "step": 6060, + "train/total_loss": 0.260315477848053 + }, + { + "entropy": 9.144196510314941, + "epoch": 0.5992683409135852, + "mean_token_accuracy": 0.7172897458076477, + "num_tokens": 10724995.0, + "step": 6061, + "train/ce_loss": 5.91021830587124e-07 + }, + { + "epoch": 0.5992683409135852, + "step": 6061, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.5992683409135852, + "step": 6061, + "train/total_loss": 0.015625059604644775 + }, + { + "entropy": 9.109617233276367, + "epoch": 0.5993672137631006, + "mean_token_accuracy": 0.7466843724250793, + "num_tokens": 10730213.0, + "step": 6062, + "train/ce_loss": 0.5911141037940979 + }, + { + "epoch": 0.5993672137631006, + "step": 6062, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.5993672137631006, + "step": 6062, + "train/total_loss": 0.07473641633987427 + }, + { + "entropy": 8.925853729248047, + "epoch": 0.5994660866126161, + "mean_token_accuracy": 0.6997663378715515, + "num_tokens": 10735582.0, + "step": 6063, + "train/ce_loss": 1.1986355781555176 + }, + { + "epoch": 0.5994660866126161, + "step": 6063, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.5994660866126161, + "step": 6063, + "train/total_loss": 0.2331448197364807 + }, + { + "entropy": 9.185383796691895, + "epoch": 0.5995649594621317, + "mean_token_accuracy": 0.7010869383811951, + "num_tokens": 10740831.0, + "step": 6064, + "train/ce_loss": 0.6725782155990601 + }, + { + "epoch": 0.5995649594621317, + "step": 6064, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.5995649594621317, + "step": 6064, + "train/total_loss": 0.102414071559906 + }, + { + "entropy": 9.35740852355957, + "epoch": 0.5996638323116472, + "mean_token_accuracy": 0.7684563994407654, + "num_tokens": 10745859.0, + "step": 6065, + "train/ce_loss": 0.8081445097923279 + }, + { + "epoch": 0.5996638323116472, + "step": 6065, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.5996638323116472, + "step": 6065, + "train/total_loss": 0.1550332009792328 + }, + { + "entropy": 8.718416213989258, + "epoch": 0.5997627051611627, + "mean_token_accuracy": 0.7359490990638733, + "num_tokens": 10751274.0, + "step": 6066, + "train/ce_loss": 0.8912520408630371 + }, + { + "epoch": 0.5997627051611627, + "step": 6066, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.5997627051611627, + "step": 6066, + "train/total_loss": 0.17506271600723267 + }, + { + "entropy": 9.426103591918945, + "epoch": 0.5998615780106783, + "mean_token_accuracy": 0.8238636255264282, + "num_tokens": 10756195.0, + "step": 6067, + "train/ce_loss": 0.6718910932540894 + }, + { + "epoch": 0.5998615780106783, + "step": 6067, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.5998615780106783, + "step": 6067, + "train/total_loss": 0.09062661230564117 + }, + { + "entropy": 8.437259674072266, + "epoch": 0.5999604508601938, + "mean_token_accuracy": 0.7057521939277649, + "num_tokens": 10761553.0, + "step": 6068, + "train/ce_loss": 1.0020898580551147 + }, + { + "epoch": 0.5999604508601938, + "step": 6068, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.5999604508601938, + "step": 6068, + "train/total_loss": 0.21739649772644043 + }, + { + "entropy": 9.396541595458984, + "epoch": 0.6000593237097093, + "mean_token_accuracy": 0.7037617564201355, + "num_tokens": 10766650.0, + "step": 6069, + "train/ce_loss": 6.18084868619917e-07 + }, + { + "epoch": 0.6000593237097093, + "step": 6069, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6000593237097093, + "step": 6069, + "train/total_loss": 0.035156313329935074 + }, + { + "entropy": 9.112813949584961, + "epoch": 0.6001581965592249, + "mean_token_accuracy": 0.744911789894104, + "num_tokens": 10771817.0, + "step": 6070, + "train/ce_loss": 0.3863801956176758 + }, + { + "epoch": 0.6001581965592249, + "step": 6070, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.6001581965592249, + "step": 6070, + "train/total_loss": 0.06598177552223206 + }, + { + "entropy": 8.795565605163574, + "epoch": 0.6002570694087404, + "mean_token_accuracy": 0.7511110901832581, + "num_tokens": 10777164.0, + "step": 6071, + "train/ce_loss": 0.993456244468689 + }, + { + "epoch": 0.6002570694087404, + "step": 6071, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.6002570694087404, + "step": 6071, + "train/total_loss": 0.1735643744468689 + }, + { + "entropy": 9.250688552856445, + "epoch": 0.6003559422582558, + "mean_token_accuracy": 0.7766830921173096, + "num_tokens": 10782202.0, + "step": 6072, + "train/ce_loss": 1.302918553352356 + }, + { + "epoch": 0.6003559422582558, + "step": 6072, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6003559422582558, + "step": 6072, + "train/total_loss": 0.1810731142759323 + }, + { + "entropy": 8.888847351074219, + "epoch": 0.6004548151077714, + "mean_token_accuracy": 0.7934272289276123, + "num_tokens": 10787539.0, + "step": 6073, + "train/ce_loss": 0.9093997478485107 + }, + { + "epoch": 0.6004548151077714, + "step": 6073, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.6004548151077714, + "step": 6073, + "train/total_loss": 0.1182837262749672 + }, + { + "entropy": 8.762311935424805, + "epoch": 0.6005536879572869, + "mean_token_accuracy": 0.6850321292877197, + "num_tokens": 10793142.0, + "step": 6074, + "train/ce_loss": 0.8352211117744446 + }, + { + "epoch": 0.6005536879572869, + "step": 6074, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6005536879572869, + "step": 6074, + "train/total_loss": 0.13820961117744446 + }, + { + "entropy": 9.677602767944336, + "epoch": 0.6006525608068024, + "mean_token_accuracy": 0.7514285445213318, + "num_tokens": 10797971.0, + "step": 6075, + "train/ce_loss": 4.10594248023699e-06 + }, + { + "epoch": 0.6006525608068024, + "step": 6075, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6006525608068024, + "step": 6075, + "train/total_loss": 0.03125040978193283 + }, + { + "entropy": 9.393272399902344, + "epoch": 0.600751433656318, + "mean_token_accuracy": 0.7361111044883728, + "num_tokens": 10803038.0, + "step": 6076, + "train/ce_loss": 1.02798593044281 + }, + { + "epoch": 0.600751433656318, + "step": 6076, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.600751433656318, + "step": 6076, + "train/total_loss": 0.22389234602451324 + }, + { + "entropy": 9.127182006835938, + "epoch": 0.6008503065058335, + "mean_token_accuracy": 0.6969696879386902, + "num_tokens": 10808164.0, + "step": 6077, + "train/ce_loss": 1.790145993232727 + }, + { + "epoch": 0.6008503065058335, + "step": 6077, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6008503065058335, + "step": 6077, + "train/total_loss": 0.23370210826396942 + }, + { + "entropy": 8.687410354614258, + "epoch": 0.600949179355349, + "mean_token_accuracy": 0.743849515914917, + "num_tokens": 10813354.0, + "step": 6078, + "train/ce_loss": 0.8146626353263855 + }, + { + "epoch": 0.600949179355349, + "step": 6078, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.600949179355349, + "step": 6078, + "train/total_loss": 0.16740375757217407 + }, + { + "entropy": 9.673949241638184, + "epoch": 0.6010480522048646, + "mean_token_accuracy": 0.7717121839523315, + "num_tokens": 10818247.0, + "step": 6079, + "train/ce_loss": 1.099326252937317 + }, + { + "epoch": 0.6010480522048646, + "step": 6079, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6010480522048646, + "step": 6079, + "train/total_loss": 0.17633888125419617 + }, + { + "epoch": 0.60114692505438, + "grad_norm": 0.8666896820068359, + "learning_rate": 8.499480789200417e-06, + "loss": 0.1416, + "step": 6080 + }, + { + "entropy": 8.867826461791992, + "epoch": 0.60114692505438, + "mean_token_accuracy": 0.7034883499145508, + "num_tokens": 10823618.0, + "step": 6080, + "train/ce_loss": 0.535771369934082 + }, + { + "epoch": 0.60114692505438, + "step": 6080, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.60114692505438, + "step": 6080, + "train/total_loss": 0.12779588997364044 + }, + { + "entropy": 8.964005470275879, + "epoch": 0.6012457979038955, + "mean_token_accuracy": 0.7682619690895081, + "num_tokens": 10828970.0, + "step": 6081, + "train/ce_loss": 0.8758470416069031 + }, + { + "epoch": 0.6012457979038955, + "step": 6081, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6012457979038955, + "step": 6081, + "train/total_loss": 0.1657097041606903 + }, + { + "entropy": 9.141044616699219, + "epoch": 0.6013446707534111, + "mean_token_accuracy": 0.7322485446929932, + "num_tokens": 10834050.0, + "step": 6082, + "train/ce_loss": 1.4010944366455078 + }, + { + "epoch": 0.6013446707534111, + "step": 6082, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6013446707534111, + "step": 6082, + "train/total_loss": 0.20260944962501526 + }, + { + "entropy": 8.897449493408203, + "epoch": 0.6014435436029266, + "mean_token_accuracy": 0.7319587469100952, + "num_tokens": 10839401.0, + "step": 6083, + "train/ce_loss": 0.9998795390129089 + }, + { + "epoch": 0.6014435436029266, + "step": 6083, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6014435436029266, + "step": 6083, + "train/total_loss": 0.1390504539012909 + }, + { + "entropy": 8.964387893676758, + "epoch": 0.6015424164524421, + "mean_token_accuracy": 0.7765432000160217, + "num_tokens": 10844696.0, + "step": 6084, + "train/ce_loss": 0.6983901858329773 + }, + { + "epoch": 0.6015424164524421, + "step": 6084, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6015424164524421, + "step": 6084, + "train/total_loss": 0.12062027305364609 + }, + { + "entropy": 9.072749137878418, + "epoch": 0.6016412893019577, + "mean_token_accuracy": 0.707257091999054, + "num_tokens": 10849973.0, + "step": 6085, + "train/ce_loss": 1.4259077310562134 + }, + { + "epoch": 0.6016412893019577, + "step": 6085, + "train/sim_loss": 0.15625 + }, + { + "epoch": 0.6016412893019577, + "step": 6085, + "train/total_loss": 0.2988407611846924 + }, + { + "entropy": 8.899450302124023, + "epoch": 0.6017401621514732, + "mean_token_accuracy": 0.7363834381103516, + "num_tokens": 10855356.0, + "step": 6086, + "train/ce_loss": 8.118449841276743e-07 + }, + { + "epoch": 0.6017401621514732, + "step": 6086, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6017401621514732, + "step": 6086, + "train/total_loss": 0.046875081956386566 + }, + { + "entropy": 8.617920875549316, + "epoch": 0.6018390350009887, + "mean_token_accuracy": 0.8015102744102478, + "num_tokens": 10860763.0, + "step": 6087, + "train/ce_loss": 0.8570935726165771 + }, + { + "epoch": 0.6018390350009887, + "step": 6087, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.6018390350009887, + "step": 6087, + "train/total_loss": 0.1794593632221222 + }, + { + "entropy": 9.354616165161133, + "epoch": 0.6019379078505043, + "mean_token_accuracy": 0.7318611741065979, + "num_tokens": 10865843.0, + "step": 6088, + "train/ce_loss": 0.7066981792449951 + }, + { + "epoch": 0.6019379078505043, + "step": 6088, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6019379078505043, + "step": 6088, + "train/total_loss": 0.13316982984542847 + }, + { + "entropy": 9.205799102783203, + "epoch": 0.6020367807000198, + "mean_token_accuracy": 0.7127516865730286, + "num_tokens": 10871069.0, + "step": 6089, + "train/ce_loss": 1.0004031658172607 + }, + { + "epoch": 0.6020367807000198, + "step": 6089, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6020367807000198, + "step": 6089, + "train/total_loss": 0.16254031658172607 + }, + { + "entropy": 9.151839256286621, + "epoch": 0.6021356535495352, + "mean_token_accuracy": 0.76579350233078, + "num_tokens": 10876100.0, + "step": 6090, + "train/ce_loss": 0.9818342924118042 + }, + { + "epoch": 0.6021356535495352, + "step": 6090, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.6021356535495352, + "step": 6090, + "train/total_loss": 0.21146467328071594 + }, + { + "entropy": 8.641899108886719, + "epoch": 0.6022345263990508, + "mean_token_accuracy": 0.7617647051811218, + "num_tokens": 10881611.0, + "step": 6091, + "train/ce_loss": 0.9895736575126648 + }, + { + "epoch": 0.6022345263990508, + "step": 6091, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.6022345263990508, + "step": 6091, + "train/total_loss": 0.173176109790802 + }, + { + "entropy": 9.105466842651367, + "epoch": 0.6023333992485663, + "mean_token_accuracy": 0.7323037385940552, + "num_tokens": 10886756.0, + "step": 6092, + "train/ce_loss": 6.099965048633749e-07 + }, + { + "epoch": 0.6023333992485663, + "step": 6092, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6023333992485663, + "step": 6092, + "train/total_loss": 0.023437561467289925 + }, + { + "entropy": 9.237039566040039, + "epoch": 0.6024322720980818, + "mean_token_accuracy": 0.8252426981925964, + "num_tokens": 10891950.0, + "step": 6093, + "train/ce_loss": 1.0687254667282104 + }, + { + "epoch": 0.6024322720980818, + "step": 6093, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6024322720980818, + "step": 6093, + "train/total_loss": 0.14593505859375 + }, + { + "entropy": 9.210630416870117, + "epoch": 0.6025311449475974, + "mean_token_accuracy": 0.7210599780082703, + "num_tokens": 10897113.0, + "step": 6094, + "train/ce_loss": 1.0980263948440552 + }, + { + "epoch": 0.6025311449475974, + "step": 6094, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6025311449475974, + "step": 6094, + "train/total_loss": 0.16058388352394104 + }, + { + "entropy": 10.069806098937988, + "epoch": 0.6026300177971129, + "mean_token_accuracy": 0.7323232293128967, + "num_tokens": 10901655.0, + "step": 6095, + "train/ce_loss": 7.5997854764864314e-06 + }, + { + "epoch": 0.6026300177971129, + "step": 6095, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.6026300177971129, + "step": 6095, + "train/total_loss": 0.08203200995922089 + }, + { + "entropy": 9.082328796386719, + "epoch": 0.6027288906466285, + "mean_token_accuracy": 0.7697121500968933, + "num_tokens": 10906921.0, + "step": 6096, + "train/ce_loss": 0.46874570846557617 + }, + { + "epoch": 0.6027288906466285, + "step": 6096, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6027288906466285, + "step": 6096, + "train/total_loss": 0.08593706786632538 + }, + { + "entropy": 9.460693359375, + "epoch": 0.602827763496144, + "mean_token_accuracy": 0.7108209133148193, + "num_tokens": 10911897.0, + "step": 6097, + "train/ce_loss": 2.3511090603278717e-06 + }, + { + "epoch": 0.602827763496144, + "step": 6097, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.602827763496144, + "step": 6097, + "train/total_loss": 0.0429689846932888 + }, + { + "entropy": 9.391380310058594, + "epoch": 0.6029266363456595, + "mean_token_accuracy": 0.772357702255249, + "num_tokens": 10916950.0, + "step": 6098, + "train/ce_loss": 1.1386464834213257 + }, + { + "epoch": 0.6029266363456595, + "step": 6098, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.6029266363456595, + "step": 6098, + "train/total_loss": 0.19980216026306152 + }, + { + "entropy": 9.208715438842773, + "epoch": 0.6030255091951751, + "mean_token_accuracy": 0.7718023061752319, + "num_tokens": 10922149.0, + "step": 6099, + "train/ce_loss": 1.0111721167049836e-06 + }, + { + "epoch": 0.6030255091951751, + "step": 6099, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6030255091951751, + "step": 6099, + "train/total_loss": 0.05468760058283806 + }, + { + "epoch": 0.6031243820446905, + "grad_norm": 0.7141965627670288, + "learning_rate": 8.494535924442467e-06, + "loss": 0.1357, + "step": 6100 + }, + { + "entropy": 8.944578170776367, + "epoch": 0.6031243820446905, + "mean_token_accuracy": 0.6878364086151123, + "num_tokens": 10927696.0, + "step": 6100, + "train/ce_loss": 0.508734941482544 + }, + { + "epoch": 0.6031243820446905, + "step": 6100, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6031243820446905, + "step": 6100, + "train/total_loss": 0.08993599563837051 + }, + { + "entropy": 9.312324523925781, + "epoch": 0.603223254894206, + "mean_token_accuracy": 0.7329192757606506, + "num_tokens": 10932760.0, + "step": 6101, + "train/ce_loss": 1.2184512615203857 + }, + { + "epoch": 0.603223254894206, + "step": 6101, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.603223254894206, + "step": 6101, + "train/total_loss": 0.19997012615203857 + }, + { + "entropy": 9.61783218383789, + "epoch": 0.6033221277437216, + "mean_token_accuracy": 0.7670156955718994, + "num_tokens": 10937608.0, + "step": 6102, + "train/ce_loss": 1.2987922430038452 + }, + { + "epoch": 0.6033221277437216, + "step": 6102, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6033221277437216, + "step": 6102, + "train/total_loss": 0.18456672132015228 + }, + { + "entropy": 8.94619369506836, + "epoch": 0.6034210005932371, + "mean_token_accuracy": 0.7599039673805237, + "num_tokens": 10942895.0, + "step": 6103, + "train/ce_loss": 0.8675341606140137 + }, + { + "epoch": 0.6034210005932371, + "step": 6103, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6034210005932371, + "step": 6103, + "train/total_loss": 0.14925342798233032 + }, + { + "entropy": 9.454904556274414, + "epoch": 0.6035198734427526, + "mean_token_accuracy": 0.771266520023346, + "num_tokens": 10947834.0, + "step": 6104, + "train/ce_loss": 0.5550150871276855 + }, + { + "epoch": 0.6035198734427526, + "step": 6104, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6035198734427526, + "step": 6104, + "train/total_loss": 0.07503275573253632 + }, + { + "entropy": 8.96200942993164, + "epoch": 0.6036187462922682, + "mean_token_accuracy": 0.7113526463508606, + "num_tokens": 10953158.0, + "step": 6105, + "train/ce_loss": 1.275610327720642 + }, + { + "epoch": 0.6036187462922682, + "step": 6105, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6036187462922682, + "step": 6105, + "train/total_loss": 0.1900610327720642 + }, + { + "entropy": 8.748249053955078, + "epoch": 0.6037176191417837, + "mean_token_accuracy": 0.7409909963607788, + "num_tokens": 10958513.0, + "step": 6106, + "train/ce_loss": 0.9752168655395508 + }, + { + "epoch": 0.6037176191417837, + "step": 6106, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6037176191417837, + "step": 6106, + "train/total_loss": 0.15220919251441956 + }, + { + "entropy": 9.620986938476562, + "epoch": 0.6038164919912992, + "mean_token_accuracy": 0.6916058659553528, + "num_tokens": 10963525.0, + "step": 6107, + "train/ce_loss": 1.447430968284607 + }, + { + "epoch": 0.6038164919912992, + "step": 6107, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.6038164919912992, + "step": 6107, + "train/total_loss": 0.23068059980869293 + }, + { + "entropy": 8.569491386413574, + "epoch": 0.6039153648408148, + "mean_token_accuracy": 0.7436241507530212, + "num_tokens": 10968760.0, + "step": 6108, + "train/ce_loss": 1.0208266973495483 + }, + { + "epoch": 0.6039153648408148, + "step": 6108, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6039153648408148, + "step": 6108, + "train/total_loss": 0.14114516973495483 + }, + { + "entropy": 8.876388549804688, + "epoch": 0.6040142376903302, + "mean_token_accuracy": 0.7449495196342468, + "num_tokens": 10974002.0, + "step": 6109, + "train/ce_loss": 0.6652390956878662 + }, + { + "epoch": 0.6040142376903302, + "step": 6109, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6040142376903302, + "step": 6109, + "train/total_loss": 0.10949265956878662 + }, + { + "entropy": 8.956596374511719, + "epoch": 0.6041131105398457, + "mean_token_accuracy": 0.8051947951316833, + "num_tokens": 10979256.0, + "step": 6110, + "train/ce_loss": 2.024814193646307e-06 + }, + { + "epoch": 0.6041131105398457, + "step": 6110, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6041131105398457, + "step": 6110, + "train/total_loss": 0.05468770116567612 + }, + { + "entropy": 8.775054931640625, + "epoch": 0.6042119833893613, + "mean_token_accuracy": 0.7146198749542236, + "num_tokens": 10984617.0, + "step": 6111, + "train/ce_loss": 1.138087272644043 + }, + { + "epoch": 0.6042119833893613, + "step": 6111, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.6042119833893613, + "step": 6111, + "train/total_loss": 0.19583997130393982 + }, + { + "entropy": 8.835660934448242, + "epoch": 0.6043108562388768, + "mean_token_accuracy": 0.6846330165863037, + "num_tokens": 10989989.0, + "step": 6112, + "train/ce_loss": 0.7771117687225342 + }, + { + "epoch": 0.6043108562388768, + "step": 6112, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6043108562388768, + "step": 6112, + "train/total_loss": 0.15583617985248566 + }, + { + "entropy": 9.560922622680664, + "epoch": 0.6044097290883923, + "mean_token_accuracy": 0.7399617433547974, + "num_tokens": 10994934.0, + "step": 6113, + "train/ce_loss": 2.5262149847549153e-06 + }, + { + "epoch": 0.6044097290883923, + "step": 6113, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6044097290883923, + "step": 6113, + "train/total_loss": 0.0781252533197403 + }, + { + "entropy": 9.276796340942383, + "epoch": 0.6045086019379079, + "mean_token_accuracy": 0.7439544796943665, + "num_tokens": 11000044.0, + "step": 6114, + "train/ce_loss": 1.1325781345367432 + }, + { + "epoch": 0.6045086019379079, + "step": 6114, + "train/sim_loss": 0.16015625 + }, + { + "epoch": 0.6045086019379079, + "step": 6114, + "train/total_loss": 0.27341407537460327 + }, + { + "entropy": 9.39217758178711, + "epoch": 0.6046074747874234, + "mean_token_accuracy": 0.7068965435028076, + "num_tokens": 11005089.0, + "step": 6115, + "train/ce_loss": 1.7699488807920716e-06 + }, + { + "epoch": 0.6046074747874234, + "step": 6115, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6046074747874234, + "step": 6115, + "train/total_loss": 0.035156428813934326 + }, + { + "entropy": 8.850866317749023, + "epoch": 0.6047063476369389, + "mean_token_accuracy": 0.7711442708969116, + "num_tokens": 11010548.0, + "step": 6116, + "train/ce_loss": 0.9158958792686462 + }, + { + "epoch": 0.6047063476369389, + "step": 6116, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6047063476369389, + "step": 6116, + "train/total_loss": 0.12674584984779358 + }, + { + "entropy": 8.85025691986084, + "epoch": 0.6048052204864545, + "mean_token_accuracy": 0.7938144207000732, + "num_tokens": 11015930.0, + "step": 6117, + "train/ce_loss": 0.8996186852455139 + }, + { + "epoch": 0.6048052204864545, + "step": 6117, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6048052204864545, + "step": 6117, + "train/total_loss": 0.13683687150478363 + }, + { + "entropy": 8.638938903808594, + "epoch": 0.60490409333597, + "mean_token_accuracy": 0.6982182860374451, + "num_tokens": 11021276.0, + "step": 6118, + "train/ce_loss": 0.8769782781600952 + }, + { + "epoch": 0.60490409333597, + "step": 6118, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.60490409333597, + "step": 6118, + "train/total_loss": 0.134572833776474 + }, + { + "entropy": 9.259170532226562, + "epoch": 0.6050029661854854, + "mean_token_accuracy": 0.7200000286102295, + "num_tokens": 11026382.0, + "step": 6119, + "train/ce_loss": 0.7559342980384827 + }, + { + "epoch": 0.6050029661854854, + "step": 6119, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6050029661854854, + "step": 6119, + "train/total_loss": 0.11856218427419662 + }, + { + "epoch": 0.605101839035001, + "grad_norm": 0.718431830406189, + "learning_rate": 8.48959105968452e-06, + "loss": 0.1397, + "step": 6120 + }, + { + "entropy": 9.40379524230957, + "epoch": 0.605101839035001, + "mean_token_accuracy": 0.6692667603492737, + "num_tokens": 11031435.0, + "step": 6120, + "train/ce_loss": 1.3104554414749146 + }, + { + "epoch": 0.605101839035001, + "step": 6120, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.605101839035001, + "step": 6120, + "train/total_loss": 0.19354555010795593 + }, + { + "entropy": 9.068265914916992, + "epoch": 0.6052007118845165, + "mean_token_accuracy": 0.7567114233970642, + "num_tokens": 11036463.0, + "step": 6121, + "train/ce_loss": 0.6880939602851868 + }, + { + "epoch": 0.6052007118845165, + "step": 6121, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6052007118845165, + "step": 6121, + "train/total_loss": 0.0922468975186348 + }, + { + "entropy": 8.987520217895508, + "epoch": 0.605299584734032, + "mean_token_accuracy": 0.7322834730148315, + "num_tokens": 11041702.0, + "step": 6122, + "train/ce_loss": 0.7852272391319275 + }, + { + "epoch": 0.605299584734032, + "step": 6122, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.605299584734032, + "step": 6122, + "train/total_loss": 0.148835226893425 + }, + { + "entropy": 8.811985969543457, + "epoch": 0.6053984575835476, + "mean_token_accuracy": 0.783643901348114, + "num_tokens": 11047111.0, + "step": 6123, + "train/ce_loss": 0.3789895176887512 + }, + { + "epoch": 0.6053984575835476, + "step": 6123, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6053984575835476, + "step": 6123, + "train/total_loss": 0.0574302040040493 + }, + { + "entropy": 9.542729377746582, + "epoch": 0.6054973304330631, + "mean_token_accuracy": 0.7797979712486267, + "num_tokens": 11052093.0, + "step": 6124, + "train/ce_loss": 0.9592763185501099 + }, + { + "epoch": 0.6054973304330631, + "step": 6124, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.6054973304330631, + "step": 6124, + "train/total_loss": 0.1701463758945465 + }, + { + "entropy": 8.710733413696289, + "epoch": 0.6055962032825786, + "mean_token_accuracy": 0.7696477174758911, + "num_tokens": 11057289.0, + "step": 6125, + "train/ce_loss": 0.8214151263237 + }, + { + "epoch": 0.6055962032825786, + "step": 6125, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6055962032825786, + "step": 6125, + "train/total_loss": 0.11729776114225388 + }, + { + "entropy": 9.01161003112793, + "epoch": 0.6056950761320942, + "mean_token_accuracy": 0.7325443625450134, + "num_tokens": 11062602.0, + "step": 6126, + "train/ce_loss": 1.3645176887512207 + }, + { + "epoch": 0.6056950761320942, + "step": 6126, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.6056950761320942, + "step": 6126, + "train/total_loss": 0.22238926589488983 + }, + { + "entropy": 8.993681907653809, + "epoch": 0.6057939489816097, + "mean_token_accuracy": 0.7953431606292725, + "num_tokens": 11067867.0, + "step": 6127, + "train/ce_loss": 1.6927732531257789e-06 + }, + { + "epoch": 0.6057939489816097, + "step": 6127, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.6057939489816097, + "step": 6127, + "train/total_loss": 0.08593767136335373 + }, + { + "entropy": 9.10866928100586, + "epoch": 0.6058928218311251, + "mean_token_accuracy": 0.7489878535270691, + "num_tokens": 11073058.0, + "step": 6128, + "train/ce_loss": 0.9587780237197876 + }, + { + "epoch": 0.6058928218311251, + "step": 6128, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6058928218311251, + "step": 6128, + "train/total_loss": 0.17400279641151428 + }, + { + "entropy": 9.751190185546875, + "epoch": 0.6059916946806407, + "mean_token_accuracy": 0.75, + "num_tokens": 11077883.0, + "step": 6129, + "train/ce_loss": 4.459578576643253e-06 + }, + { + "epoch": 0.6059916946806407, + "step": 6129, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6059916946806407, + "step": 6129, + "train/total_loss": 0.046875447034835815 + }, + { + "entropy": 9.06528091430664, + "epoch": 0.6060905675301562, + "mean_token_accuracy": 0.7226277589797974, + "num_tokens": 11082982.0, + "step": 6130, + "train/ce_loss": 0.836621105670929 + }, + { + "epoch": 0.6060905675301562, + "step": 6130, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6060905675301562, + "step": 6130, + "train/total_loss": 0.14225587248802185 + }, + { + "entropy": 8.767593383789062, + "epoch": 0.6061894403796717, + "mean_token_accuracy": 0.7079038023948669, + "num_tokens": 11088295.0, + "step": 6131, + "train/ce_loss": 1.151301622390747 + }, + { + "epoch": 0.6061894403796717, + "step": 6131, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6061894403796717, + "step": 6131, + "train/total_loss": 0.15419265627861023 + }, + { + "entropy": 8.503793716430664, + "epoch": 0.6062883132291873, + "mean_token_accuracy": 0.7227227091789246, + "num_tokens": 11093754.0, + "step": 6132, + "train/ce_loss": 0.7511727809906006 + }, + { + "epoch": 0.6062883132291873, + "step": 6132, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6062883132291873, + "step": 6132, + "train/total_loss": 0.12199228256940842 + }, + { + "entropy": 9.331968307495117, + "epoch": 0.6063871860787028, + "mean_token_accuracy": 0.778294563293457, + "num_tokens": 11098842.0, + "step": 6133, + "train/ce_loss": 0.6059866547584534 + }, + { + "epoch": 0.6063871860787028, + "step": 6133, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6063871860787028, + "step": 6133, + "train/total_loss": 0.13091117143630981 + }, + { + "entropy": 9.549592971801758, + "epoch": 0.6064860589282183, + "mean_token_accuracy": 0.6774716377258301, + "num_tokens": 11104053.0, + "step": 6134, + "train/ce_loss": 2.494525194168091 + }, + { + "epoch": 0.6064860589282183, + "step": 6134, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.6064860589282183, + "step": 6134, + "train/total_loss": 0.32367128133773804 + }, + { + "entropy": 9.277434349060059, + "epoch": 0.6065849317777339, + "mean_token_accuracy": 0.7071651220321655, + "num_tokens": 11109204.0, + "step": 6135, + "train/ce_loss": 1.1674458980560303 + }, + { + "epoch": 0.6065849317777339, + "step": 6135, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6065849317777339, + "step": 6135, + "train/total_loss": 0.17533834278583527 + }, + { + "entropy": 9.13475227355957, + "epoch": 0.6066838046272494, + "mean_token_accuracy": 0.7972972989082336, + "num_tokens": 11114530.0, + "step": 6136, + "train/ce_loss": 0.8006073832511902 + }, + { + "epoch": 0.6066838046272494, + "step": 6136, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6066838046272494, + "step": 6136, + "train/total_loss": 0.13474825024604797 + }, + { + "entropy": 9.748363494873047, + "epoch": 0.6067826774767648, + "mean_token_accuracy": 0.6660377383232117, + "num_tokens": 11119636.0, + "step": 6137, + "train/ce_loss": 2.632155179977417 + }, + { + "epoch": 0.6067826774767648, + "step": 6137, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.6067826774767648, + "step": 6137, + "train/total_loss": 0.3608717620372772 + }, + { + "entropy": 9.102104187011719, + "epoch": 0.6068815503262804, + "mean_token_accuracy": 0.8008241653442383, + "num_tokens": 11124881.0, + "step": 6138, + "train/ce_loss": 0.5030428767204285 + }, + { + "epoch": 0.6068815503262804, + "step": 6138, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6068815503262804, + "step": 6138, + "train/total_loss": 0.10108554363250732 + }, + { + "entropy": 9.025053024291992, + "epoch": 0.6069804231757959, + "mean_token_accuracy": 0.7277108430862427, + "num_tokens": 11130172.0, + "step": 6139, + "train/ce_loss": 0.48416703939437866 + }, + { + "epoch": 0.6069804231757959, + "step": 6139, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6069804231757959, + "step": 6139, + "train/total_loss": 0.08747920393943787 + }, + { + "epoch": 0.6070792960253114, + "grad_norm": 0.6249412298202515, + "learning_rate": 8.48464619492657e-06, + "loss": 0.1389, + "step": 6140 + }, + { + "entropy": 8.812685012817383, + "epoch": 0.6070792960253114, + "mean_token_accuracy": 0.681922197341919, + "num_tokens": 11135547.0, + "step": 6140, + "train/ce_loss": 1.019753336906433 + }, + { + "epoch": 0.6070792960253114, + "step": 6140, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6070792960253114, + "step": 6140, + "train/total_loss": 0.18010033667087555 + }, + { + "entropy": 8.79395866394043, + "epoch": 0.607178168874827, + "mean_token_accuracy": 0.7002262473106384, + "num_tokens": 11140890.0, + "step": 6141, + "train/ce_loss": 1.1063992977142334 + }, + { + "epoch": 0.607178168874827, + "step": 6141, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.607178168874827, + "step": 6141, + "train/total_loss": 0.22392117977142334 + }, + { + "entropy": 9.144994735717773, + "epoch": 0.6072770417243425, + "mean_token_accuracy": 0.7426981925964355, + "num_tokens": 11146055.0, + "step": 6142, + "train/ce_loss": 0.8788449764251709 + }, + { + "epoch": 0.6072770417243425, + "step": 6142, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6072770417243425, + "step": 6142, + "train/total_loss": 0.13475950062274933 + }, + { + "entropy": 8.9202880859375, + "epoch": 0.607375914573858, + "mean_token_accuracy": 0.7448186278343201, + "num_tokens": 11151298.0, + "step": 6143, + "train/ce_loss": 0.9303768873214722 + }, + { + "epoch": 0.607375914573858, + "step": 6143, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.607375914573858, + "step": 6143, + "train/total_loss": 0.1555376946926117 + }, + { + "entropy": 9.32742691040039, + "epoch": 0.6074747874233736, + "mean_token_accuracy": 0.6774193644523621, + "num_tokens": 11156392.0, + "step": 6144, + "train/ce_loss": 2.0724892616271973 + }, + { + "epoch": 0.6074747874233736, + "step": 6144, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.6074747874233736, + "step": 6144, + "train/total_loss": 0.3127176761627197 + }, + { + "entropy": 9.137593269348145, + "epoch": 0.6075736602728891, + "mean_token_accuracy": 0.7714646458625793, + "num_tokens": 11161622.0, + "step": 6145, + "train/ce_loss": 0.6917032599449158 + }, + { + "epoch": 0.6075736602728891, + "step": 6145, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6075736602728891, + "step": 6145, + "train/total_loss": 0.10823282599449158 + }, + { + "entropy": 9.317638397216797, + "epoch": 0.6076725331224045, + "mean_token_accuracy": 0.7710674405097961, + "num_tokens": 11166792.0, + "step": 6146, + "train/ce_loss": 0.8067945837974548 + }, + { + "epoch": 0.6076725331224045, + "step": 6146, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6076725331224045, + "step": 6146, + "train/total_loss": 0.11192946135997772 + }, + { + "entropy": 9.131617546081543, + "epoch": 0.6077714059719201, + "mean_token_accuracy": 0.752173900604248, + "num_tokens": 11171923.0, + "step": 6147, + "train/ce_loss": 1.4311491250991821 + }, + { + "epoch": 0.6077714059719201, + "step": 6147, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6077714059719201, + "step": 6147, + "train/total_loss": 0.22123990952968597 + }, + { + "entropy": 8.97225570678711, + "epoch": 0.6078702788214356, + "mean_token_accuracy": 0.7412513494491577, + "num_tokens": 11177457.0, + "step": 6148, + "train/ce_loss": 0.9629305005073547 + }, + { + "epoch": 0.6078702788214356, + "step": 6148, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6078702788214356, + "step": 6148, + "train/total_loss": 0.13926181197166443 + }, + { + "entropy": 8.673298835754395, + "epoch": 0.6079691516709511, + "mean_token_accuracy": 0.7277227640151978, + "num_tokens": 11182857.0, + "step": 6149, + "train/ce_loss": 0.9916688799858093 + }, + { + "epoch": 0.6079691516709511, + "step": 6149, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6079691516709511, + "step": 6149, + "train/total_loss": 0.1460418999195099 + }, + { + "entropy": 8.975536346435547, + "epoch": 0.6080680245204667, + "mean_token_accuracy": 0.7418086528778076, + "num_tokens": 11188098.0, + "step": 6150, + "train/ce_loss": 1.0984766483306885 + }, + { + "epoch": 0.6080680245204667, + "step": 6150, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6080680245204667, + "step": 6150, + "train/total_loss": 0.14891016483306885 + }, + { + "entropy": 9.112666130065918, + "epoch": 0.6081668973699822, + "mean_token_accuracy": 0.6454917788505554, + "num_tokens": 11193008.0, + "step": 6151, + "train/ce_loss": 2.2162926197052 + }, + { + "epoch": 0.6081668973699822, + "step": 6151, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.6081668973699822, + "step": 6151, + "train/total_loss": 0.33881676197052 + }, + { + "entropy": 8.722833633422852, + "epoch": 0.6082657702194977, + "mean_token_accuracy": 0.6876310110092163, + "num_tokens": 11198484.0, + "step": 6152, + "train/ce_loss": 1.057938575744629 + }, + { + "epoch": 0.6082657702194977, + "step": 6152, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6082657702194977, + "step": 6152, + "train/total_loss": 0.16048136353492737 + }, + { + "entropy": 8.93104076385498, + "epoch": 0.6083646430690133, + "mean_token_accuracy": 0.760401725769043, + "num_tokens": 11203652.0, + "step": 6153, + "train/ce_loss": 0.45881423354148865 + }, + { + "epoch": 0.6083646430690133, + "step": 6153, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6083646430690133, + "step": 6153, + "train/total_loss": 0.08885017037391663 + }, + { + "entropy": 9.29659652709961, + "epoch": 0.6084635159185288, + "mean_token_accuracy": 0.6714060306549072, + "num_tokens": 11208667.0, + "step": 6154, + "train/ce_loss": 1.8182228803634644 + }, + { + "epoch": 0.6084635159185288, + "step": 6154, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.6084635159185288, + "step": 6154, + "train/total_loss": 0.2990097999572754 + }, + { + "entropy": 8.691879272460938, + "epoch": 0.6085623887680442, + "mean_token_accuracy": 0.7736966609954834, + "num_tokens": 11213986.0, + "step": 6155, + "train/ce_loss": 0.9295175671577454 + }, + { + "epoch": 0.6085623887680442, + "step": 6155, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.6085623887680442, + "step": 6155, + "train/total_loss": 0.19060800969600677 + }, + { + "entropy": 9.033947944641113, + "epoch": 0.6086612616175598, + "mean_token_accuracy": 0.7052767276763916, + "num_tokens": 11219280.0, + "step": 6156, + "train/ce_loss": 0.701568067073822 + }, + { + "epoch": 0.6086612616175598, + "step": 6156, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6086612616175598, + "step": 6156, + "train/total_loss": 0.12484430521726608 + }, + { + "entropy": 9.38157844543457, + "epoch": 0.6087601344670753, + "mean_token_accuracy": 0.7710145115852356, + "num_tokens": 11224413.0, + "step": 6157, + "train/ce_loss": 4.388226102491899e-07 + }, + { + "epoch": 0.6087601344670753, + "step": 6157, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.6087601344670753, + "step": 6157, + "train/total_loss": 0.01562504470348358 + }, + { + "entropy": 8.847856521606445, + "epoch": 0.6088590073165908, + "mean_token_accuracy": 0.7508896589279175, + "num_tokens": 11229683.0, + "step": 6158, + "train/ce_loss": 1.0196502208709717 + }, + { + "epoch": 0.6088590073165908, + "step": 6158, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6088590073165908, + "step": 6158, + "train/total_loss": 0.1371212750673294 + }, + { + "entropy": 8.70765495300293, + "epoch": 0.6089578801661064, + "mean_token_accuracy": 0.7148891091346741, + "num_tokens": 11235072.0, + "step": 6159, + "train/ce_loss": 1.02029550075531 + }, + { + "epoch": 0.6089578801661064, + "step": 6159, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6089578801661064, + "step": 6159, + "train/total_loss": 0.16452956199645996 + }, + { + "epoch": 0.6090567530156219, + "grad_norm": 0.6889066100120544, + "learning_rate": 8.47970133016862e-06, + "loss": 0.1488, + "step": 6160 + }, + { + "entropy": 9.873184204101562, + "epoch": 0.6090567530156219, + "mean_token_accuracy": 0.7747524976730347, + "num_tokens": 11239868.0, + "step": 6160, + "train/ce_loss": 8.155694217748533e-07 + }, + { + "epoch": 0.6090567530156219, + "step": 6160, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.6090567530156219, + "step": 6160, + "train/total_loss": 0.015625081956386566 + }, + { + "entropy": 9.006669998168945, + "epoch": 0.6091556258651374, + "mean_token_accuracy": 0.707379162311554, + "num_tokens": 11245147.0, + "step": 6161, + "train/ce_loss": 1.3221015930175781 + }, + { + "epoch": 0.6091556258651374, + "step": 6161, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6091556258651374, + "step": 6161, + "train/total_loss": 0.1751789152622223 + }, + { + "entropy": 8.94556713104248, + "epoch": 0.609254498714653, + "mean_token_accuracy": 0.724252462387085, + "num_tokens": 11250504.0, + "step": 6162, + "train/ce_loss": 0.4792410135269165 + }, + { + "epoch": 0.609254498714653, + "step": 6162, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.609254498714653, + "step": 6162, + "train/total_loss": 0.12214285135269165 + }, + { + "entropy": 8.423084259033203, + "epoch": 0.6093533715641685, + "mean_token_accuracy": 0.678260862827301, + "num_tokens": 11255785.0, + "step": 6163, + "train/ce_loss": 1.0342999696731567 + }, + { + "epoch": 0.6093533715641685, + "step": 6163, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6093533715641685, + "step": 6163, + "train/total_loss": 0.15030500292778015 + }, + { + "entropy": 9.02614974975586, + "epoch": 0.609452244413684, + "mean_token_accuracy": 0.7075471878051758, + "num_tokens": 11260945.0, + "step": 6164, + "train/ce_loss": 1.1955517530441284 + }, + { + "epoch": 0.609452244413684, + "step": 6164, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.609452244413684, + "step": 6164, + "train/total_loss": 0.17814892530441284 + }, + { + "entropy": 9.500265121459961, + "epoch": 0.6095511172631995, + "mean_token_accuracy": 0.7566909790039062, + "num_tokens": 11265750.0, + "step": 6165, + "train/ce_loss": 0.8175063729286194 + }, + { + "epoch": 0.6095511172631995, + "step": 6165, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6095511172631995, + "step": 6165, + "train/total_loss": 0.15987563133239746 + }, + { + "entropy": 9.123942375183105, + "epoch": 0.609649990112715, + "mean_token_accuracy": 0.7383177280426025, + "num_tokens": 11270842.0, + "step": 6166, + "train/ce_loss": 0.5500314831733704 + }, + { + "epoch": 0.609649990112715, + "step": 6166, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.609649990112715, + "step": 6166, + "train/total_loss": 0.09406565129756927 + }, + { + "entropy": 8.76285457611084, + "epoch": 0.6097488629622305, + "mean_token_accuracy": 0.7166469693183899, + "num_tokens": 11276114.0, + "step": 6167, + "train/ce_loss": 0.7664913535118103 + }, + { + "epoch": 0.6097488629622305, + "step": 6167, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6097488629622305, + "step": 6167, + "train/total_loss": 0.10789913684129715 + }, + { + "entropy": 8.750094413757324, + "epoch": 0.6098477358117461, + "mean_token_accuracy": 0.7453488111495972, + "num_tokens": 11281455.0, + "step": 6168, + "train/ce_loss": 0.5771999359130859 + }, + { + "epoch": 0.6098477358117461, + "step": 6168, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.6098477358117461, + "step": 6168, + "train/total_loss": 0.08506374061107635 + }, + { + "entropy": 9.110280990600586, + "epoch": 0.6099466086612616, + "mean_token_accuracy": 0.7319587469100952, + "num_tokens": 11286631.0, + "step": 6169, + "train/ce_loss": 1.0423439741134644 + }, + { + "epoch": 0.6099466086612616, + "step": 6169, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6099466086612616, + "step": 6169, + "train/total_loss": 0.15110939741134644 + }, + { + "entropy": 8.88326644897461, + "epoch": 0.6100454815107771, + "mean_token_accuracy": 0.723127007484436, + "num_tokens": 11292023.0, + "step": 6170, + "train/ce_loss": 0.7747926712036133 + }, + { + "epoch": 0.6100454815107771, + "step": 6170, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6100454815107771, + "step": 6170, + "train/total_loss": 0.10872926563024521 + }, + { + "entropy": 9.328957557678223, + "epoch": 0.6101443543602927, + "mean_token_accuracy": 0.699312686920166, + "num_tokens": 11297033.0, + "step": 6171, + "train/ce_loss": 1.3747528555541066e-06 + }, + { + "epoch": 0.6101443543602927, + "step": 6171, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6101443543602927, + "step": 6171, + "train/total_loss": 0.06250013411045074 + }, + { + "entropy": 9.24807357788086, + "epoch": 0.6102432272098082, + "mean_token_accuracy": 0.7193877696990967, + "num_tokens": 11302105.0, + "step": 6172, + "train/ce_loss": 0.7044182419776917 + }, + { + "epoch": 0.6102432272098082, + "step": 6172, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6102432272098082, + "step": 6172, + "train/total_loss": 0.1329418271780014 + }, + { + "entropy": 8.854923248291016, + "epoch": 0.6103421000593237, + "mean_token_accuracy": 0.6898016929626465, + "num_tokens": 11307296.0, + "step": 6173, + "train/ce_loss": 0.6190934777259827 + }, + { + "epoch": 0.6103421000593237, + "step": 6173, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6103421000593237, + "step": 6173, + "train/total_loss": 0.10097184777259827 + }, + { + "entropy": 8.727234840393066, + "epoch": 0.6104409729088393, + "mean_token_accuracy": 0.7434988021850586, + "num_tokens": 11312632.0, + "step": 6174, + "train/ce_loss": 0.9979443550109863 + }, + { + "epoch": 0.6104409729088393, + "step": 6174, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6104409729088393, + "step": 6174, + "train/total_loss": 0.1505756974220276 + }, + { + "entropy": 10.119205474853516, + "epoch": 0.6105398457583547, + "mean_token_accuracy": 0.8416422009468079, + "num_tokens": 11317340.0, + "step": 6175, + "train/ce_loss": 2.1631919935316546e-06 + }, + { + "epoch": 0.6105398457583547, + "step": 6175, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6105398457583547, + "step": 6175, + "train/total_loss": 0.02343771606683731 + }, + { + "entropy": 8.866151809692383, + "epoch": 0.6106387186078702, + "mean_token_accuracy": 0.7371244430541992, + "num_tokens": 11322773.0, + "step": 6176, + "train/ce_loss": 0.47456467151641846 + }, + { + "epoch": 0.6106387186078702, + "step": 6176, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.6106387186078702, + "step": 6176, + "train/total_loss": 0.06308147311210632 + }, + { + "entropy": 9.114995002746582, + "epoch": 0.6107375914573858, + "mean_token_accuracy": 0.741605818271637, + "num_tokens": 11327909.0, + "step": 6177, + "train/ce_loss": 0.40683189034461975 + }, + { + "epoch": 0.6107375914573858, + "step": 6177, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.6107375914573858, + "step": 6177, + "train/total_loss": 0.06802694499492645 + }, + { + "entropy": 8.819618225097656, + "epoch": 0.6108364643069013, + "mean_token_accuracy": 0.7382199168205261, + "num_tokens": 11333130.0, + "step": 6178, + "train/ce_loss": 0.6157546043395996 + }, + { + "epoch": 0.6108364643069013, + "step": 6178, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.6108364643069013, + "step": 6178, + "train/total_loss": 0.07720045745372772 + }, + { + "entropy": 8.793582916259766, + "epoch": 0.6109353371564169, + "mean_token_accuracy": 0.7600446343421936, + "num_tokens": 11338471.0, + "step": 6179, + "train/ce_loss": 0.6357402801513672 + }, + { + "epoch": 0.6109353371564169, + "step": 6179, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6109353371564169, + "step": 6179, + "train/total_loss": 0.13388653099536896 + }, + { + "epoch": 0.6110342100059324, + "grad_norm": 0.6334864497184753, + "learning_rate": 8.474756465410673e-06, + "loss": 0.137, + "step": 6180 + }, + { + "entropy": 8.904747009277344, + "epoch": 0.6110342100059324, + "mean_token_accuracy": 0.7613122463226318, + "num_tokens": 11343840.0, + "step": 6180, + "train/ce_loss": 0.6107622981071472 + }, + { + "epoch": 0.6110342100059324, + "step": 6180, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6110342100059324, + "step": 6180, + "train/total_loss": 0.13138872385025024 + }, + { + "entropy": 8.89372444152832, + "epoch": 0.6111330828554479, + "mean_token_accuracy": 0.8065241575241089, + "num_tokens": 11349062.0, + "step": 6181, + "train/ce_loss": 0.8261597752571106 + }, + { + "epoch": 0.6111330828554479, + "step": 6181, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6111330828554479, + "step": 6181, + "train/total_loss": 0.14511597156524658 + }, + { + "entropy": 8.715995788574219, + "epoch": 0.6112319557049635, + "mean_token_accuracy": 0.6978508234024048, + "num_tokens": 11354371.0, + "step": 6182, + "train/ce_loss": 0.6819457411766052 + }, + { + "epoch": 0.6112319557049635, + "step": 6182, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.6112319557049635, + "step": 6182, + "train/total_loss": 0.09553832560777664 + }, + { + "entropy": 9.236974716186523, + "epoch": 0.611330828554479, + "mean_token_accuracy": 0.7191600799560547, + "num_tokens": 11359717.0, + "step": 6183, + "train/ce_loss": 2.3581983441545162e-06 + }, + { + "epoch": 0.611330828554479, + "step": 6183, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.611330828554479, + "step": 6183, + "train/total_loss": 0.0859377384185791 + }, + { + "entropy": 8.74993896484375, + "epoch": 0.6114297014039944, + "mean_token_accuracy": 0.7932900190353394, + "num_tokens": 11365105.0, + "step": 6184, + "train/ce_loss": 0.482496052980423 + }, + { + "epoch": 0.6114297014039944, + "step": 6184, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6114297014039944, + "step": 6184, + "train/total_loss": 0.07168710231781006 + }, + { + "entropy": 9.187471389770508, + "epoch": 0.61152857425351, + "mean_token_accuracy": 0.7275494933128357, + "num_tokens": 11370198.0, + "step": 6185, + "train/ce_loss": 1.1196050643920898 + }, + { + "epoch": 0.61152857425351, + "step": 6185, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.61152857425351, + "step": 6185, + "train/total_loss": 0.1627417504787445 + }, + { + "entropy": 9.1384859085083, + "epoch": 0.6116274471030255, + "mean_token_accuracy": 0.7666068077087402, + "num_tokens": 11375240.0, + "step": 6186, + "train/ce_loss": 0.7825351357460022 + }, + { + "epoch": 0.6116274471030255, + "step": 6186, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.6116274471030255, + "step": 6186, + "train/total_loss": 0.15247225761413574 + }, + { + "entropy": 8.774885177612305, + "epoch": 0.611726319952541, + "mean_token_accuracy": 0.7636786699295044, + "num_tokens": 11380570.0, + "step": 6187, + "train/ce_loss": 0.5543928146362305 + }, + { + "epoch": 0.611726319952541, + "step": 6187, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.611726319952541, + "step": 6187, + "train/total_loss": 0.11012677848339081 + }, + { + "entropy": 8.961973190307617, + "epoch": 0.6118251928020566, + "mean_token_accuracy": 0.7341935634613037, + "num_tokens": 11385743.0, + "step": 6188, + "train/ce_loss": 1.0541927814483643 + }, + { + "epoch": 0.6118251928020566, + "step": 6188, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.6118251928020566, + "step": 6188, + "train/total_loss": 0.17963802814483643 + }, + { + "entropy": 9.709959030151367, + "epoch": 0.6119240656515721, + "mean_token_accuracy": 0.7247058749198914, + "num_tokens": 11390595.0, + "step": 6189, + "train/ce_loss": 1.1530605554580688 + }, + { + "epoch": 0.6119240656515721, + "step": 6189, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.6119240656515721, + "step": 6189, + "train/total_loss": 0.1895247995853424 + }, + { + "entropy": 8.58003044128418, + "epoch": 0.6120229385010876, + "mean_token_accuracy": 0.7289271950721741, + "num_tokens": 11396092.0, + "step": 6190, + "train/ce_loss": 0.7498189806938171 + }, + { + "epoch": 0.6120229385010876, + "step": 6190, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6120229385010876, + "step": 6190, + "train/total_loss": 0.1296693980693817 + }, + { + "entropy": 9.221874237060547, + "epoch": 0.6121218113506032, + "mean_token_accuracy": 0.7179487347602844, + "num_tokens": 11401173.0, + "step": 6191, + "train/ce_loss": 1.188353180885315 + }, + { + "epoch": 0.6121218113506032, + "step": 6191, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6121218113506032, + "step": 6191, + "train/total_loss": 0.18914783000946045 + }, + { + "entropy": 9.272602081298828, + "epoch": 0.6122206842001187, + "mean_token_accuracy": 0.7354409098625183, + "num_tokens": 11406245.0, + "step": 6192, + "train/ce_loss": 1.5668947526137345e-06 + }, + { + "epoch": 0.6122206842001187, + "step": 6192, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6122206842001187, + "step": 6192, + "train/total_loss": 0.046875156462192535 + }, + { + "entropy": 8.841050148010254, + "epoch": 0.6123195570496341, + "mean_token_accuracy": 0.724304735660553, + "num_tokens": 11411568.0, + "step": 6193, + "train/ce_loss": 0.6666838526725769 + }, + { + "epoch": 0.6123195570496341, + "step": 6193, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.6123195570496341, + "step": 6193, + "train/total_loss": 0.15260589122772217 + }, + { + "entropy": 9.335506439208984, + "epoch": 0.6124184298991497, + "mean_token_accuracy": 0.7161290049552917, + "num_tokens": 11416482.0, + "step": 6194, + "train/ce_loss": 1.1841667890548706 + }, + { + "epoch": 0.6124184298991497, + "step": 6194, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6124184298991497, + "step": 6194, + "train/total_loss": 0.1731041818857193 + }, + { + "entropy": 8.814682006835938, + "epoch": 0.6125173027486652, + "mean_token_accuracy": 0.7270269989967346, + "num_tokens": 11421743.0, + "step": 6195, + "train/ce_loss": 1.20308256149292 + }, + { + "epoch": 0.6125173027486652, + "step": 6195, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.6125173027486652, + "step": 6195, + "train/total_loss": 0.21796450018882751 + }, + { + "entropy": 9.396188735961914, + "epoch": 0.6126161755981807, + "mean_token_accuracy": 0.7756314873695374, + "num_tokens": 11426869.0, + "step": 6196, + "train/ce_loss": 0.9634108543395996 + }, + { + "epoch": 0.6126161755981807, + "step": 6196, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.6126161755981807, + "step": 6196, + "train/total_loss": 0.1783723384141922 + }, + { + "entropy": 8.846820831298828, + "epoch": 0.6127150484476963, + "mean_token_accuracy": 0.7310252785682678, + "num_tokens": 11432067.0, + "step": 6197, + "train/ce_loss": 1.1840417385101318 + }, + { + "epoch": 0.6127150484476963, + "step": 6197, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6127150484476963, + "step": 6197, + "train/total_loss": 0.16918542981147766 + }, + { + "entropy": 8.889070510864258, + "epoch": 0.6128139212972118, + "mean_token_accuracy": 0.7651006579399109, + "num_tokens": 11437402.0, + "step": 6198, + "train/ce_loss": 0.6077854633331299 + }, + { + "epoch": 0.6128139212972118, + "step": 6198, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6128139212972118, + "step": 6198, + "train/total_loss": 0.09202854335308075 + }, + { + "entropy": 8.941230773925781, + "epoch": 0.6129127941467273, + "mean_token_accuracy": 0.771136999130249, + "num_tokens": 11442586.0, + "step": 6199, + "train/ce_loss": 0.756876528263092 + }, + { + "epoch": 0.6129127941467273, + "step": 6199, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6129127941467273, + "step": 6199, + "train/total_loss": 0.13818764686584473 + }, + { + "epoch": 0.6130116669962429, + "grad_norm": 0.8004269599914551, + "learning_rate": 8.469811600652723e-06, + "loss": 0.1392, + "step": 6200 + }, + { + "entropy": 9.248784065246582, + "epoch": 0.6130116669962429, + "mean_token_accuracy": 0.6237244606018066, + "num_tokens": 11447810.0, + "step": 6200, + "train/ce_loss": 0.616258978843689 + }, + { + "epoch": 0.6130116669962429, + "step": 6200, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6130116669962429, + "step": 6200, + "train/total_loss": 0.1280321478843689 + }, + { + "entropy": 8.512928009033203, + "epoch": 0.6131105398457584, + "mean_token_accuracy": 0.7426120042800903, + "num_tokens": 11453340.0, + "step": 6201, + "train/ce_loss": 0.7950108647346497 + }, + { + "epoch": 0.6131105398457584, + "step": 6201, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6131105398457584, + "step": 6201, + "train/total_loss": 0.10293858498334885 + }, + { + "entropy": 9.26449966430664, + "epoch": 0.6132094126952738, + "mean_token_accuracy": 0.6986899375915527, + "num_tokens": 11458402.0, + "step": 6202, + "train/ce_loss": 1.1567436456680298 + }, + { + "epoch": 0.6132094126952738, + "step": 6202, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6132094126952738, + "step": 6202, + "train/total_loss": 0.15864312648773193 + }, + { + "entropy": 8.734781265258789, + "epoch": 0.6133082855447894, + "mean_token_accuracy": 0.7141104340553284, + "num_tokens": 11463654.0, + "step": 6203, + "train/ce_loss": 0.7052172422409058 + }, + { + "epoch": 0.6133082855447894, + "step": 6203, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6133082855447894, + "step": 6203, + "train/total_loss": 0.12520922720432281 + }, + { + "entropy": 8.867867469787598, + "epoch": 0.6134071583943049, + "mean_token_accuracy": 0.7478890419006348, + "num_tokens": 11468962.0, + "step": 6204, + "train/ce_loss": 0.589647114276886 + }, + { + "epoch": 0.6134071583943049, + "step": 6204, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6134071583943049, + "step": 6204, + "train/total_loss": 0.09412096440792084 + }, + { + "entropy": 8.96460247039795, + "epoch": 0.6135060312438204, + "mean_token_accuracy": 0.7923875451087952, + "num_tokens": 11474277.0, + "step": 6205, + "train/ce_loss": 0.5550974607467651 + }, + { + "epoch": 0.6135060312438204, + "step": 6205, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.6135060312438204, + "step": 6205, + "train/total_loss": 0.08285349607467651 + }, + { + "entropy": 9.070470809936523, + "epoch": 0.613604904093336, + "mean_token_accuracy": 0.7423398494720459, + "num_tokens": 11479483.0, + "step": 6206, + "train/ce_loss": 1.1447519063949585 + }, + { + "epoch": 0.613604904093336, + "step": 6206, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.613604904093336, + "step": 6206, + "train/total_loss": 0.15353769063949585 + }, + { + "entropy": 9.126496315002441, + "epoch": 0.6137037769428515, + "mean_token_accuracy": 0.7736318111419678, + "num_tokens": 11484742.0, + "step": 6207, + "train/ce_loss": 0.7532952427864075 + }, + { + "epoch": 0.6137037769428515, + "step": 6207, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6137037769428515, + "step": 6207, + "train/total_loss": 0.10657952725887299 + }, + { + "entropy": 8.78580093383789, + "epoch": 0.613802649792367, + "mean_token_accuracy": 0.7870563864707947, + "num_tokens": 11490166.0, + "step": 6208, + "train/ce_loss": 0.644048810005188 + }, + { + "epoch": 0.613802649792367, + "step": 6208, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.613802649792367, + "step": 6208, + "train/total_loss": 0.14252987504005432 + }, + { + "entropy": 9.546608924865723, + "epoch": 0.6139015226418826, + "mean_token_accuracy": 0.6877133250236511, + "num_tokens": 11495191.0, + "step": 6209, + "train/ce_loss": 1.6582438945770264 + }, + { + "epoch": 0.6139015226418826, + "step": 6209, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.6139015226418826, + "step": 6209, + "train/total_loss": 0.26348066329956055 + }, + { + "entropy": 8.778793334960938, + "epoch": 0.6140003954913981, + "mean_token_accuracy": 0.7088273763656616, + "num_tokens": 11500432.0, + "step": 6210, + "train/ce_loss": 1.4049540758132935 + }, + { + "epoch": 0.6140003954913981, + "step": 6210, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.6140003954913981, + "step": 6210, + "train/total_loss": 0.2147141546010971 + }, + { + "entropy": 8.937134742736816, + "epoch": 0.6140992683409136, + "mean_token_accuracy": 0.7649402618408203, + "num_tokens": 11505654.0, + "step": 6211, + "train/ce_loss": 0.6439159512519836 + }, + { + "epoch": 0.6140992683409136, + "step": 6211, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6140992683409136, + "step": 6211, + "train/total_loss": 0.1151728481054306 + }, + { + "entropy": 9.340566635131836, + "epoch": 0.6141981411904291, + "mean_token_accuracy": 0.7306451797485352, + "num_tokens": 11510704.0, + "step": 6212, + "train/ce_loss": 0.6049285531044006 + }, + { + "epoch": 0.6141981411904291, + "step": 6212, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6141981411904291, + "step": 6212, + "train/total_loss": 0.1151803582906723 + }, + { + "entropy": 8.783777236938477, + "epoch": 0.6142970140399446, + "mean_token_accuracy": 0.7322677373886108, + "num_tokens": 11516233.0, + "step": 6213, + "train/ce_loss": 0.45691490173339844 + }, + { + "epoch": 0.6142970140399446, + "step": 6213, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6142970140399446, + "step": 6213, + "train/total_loss": 0.10819149017333984 + }, + { + "entropy": 9.364067077636719, + "epoch": 0.6143958868894601, + "mean_token_accuracy": 0.6932849287986755, + "num_tokens": 11521220.0, + "step": 6214, + "train/ce_loss": 1.3702287673950195 + }, + { + "epoch": 0.6143958868894601, + "step": 6214, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6143958868894601, + "step": 6214, + "train/total_loss": 0.19561663269996643 + }, + { + "entropy": 9.151406288146973, + "epoch": 0.6144947597389757, + "mean_token_accuracy": 0.7899860739707947, + "num_tokens": 11526383.0, + "step": 6215, + "train/ce_loss": 0.522618293762207 + }, + { + "epoch": 0.6144947597389757, + "step": 6215, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6144947597389757, + "step": 6215, + "train/total_loss": 0.0717930793762207 + }, + { + "entropy": 8.809883117675781, + "epoch": 0.6145936325884912, + "mean_token_accuracy": 0.7736625671386719, + "num_tokens": 11531831.0, + "step": 6216, + "train/ce_loss": 0.34042075276374817 + }, + { + "epoch": 0.6145936325884912, + "step": 6216, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.6145936325884912, + "step": 6216, + "train/total_loss": 0.04966707527637482 + }, + { + "entropy": 9.537630081176758, + "epoch": 0.6146925054380067, + "mean_token_accuracy": 0.8206785321235657, + "num_tokens": 11536843.0, + "step": 6217, + "train/ce_loss": 0.9631327986717224 + }, + { + "epoch": 0.6146925054380067, + "step": 6217, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6146925054380067, + "step": 6217, + "train/total_loss": 0.11975078284740448 + }, + { + "entropy": 9.146952629089355, + "epoch": 0.6147913782875223, + "mean_token_accuracy": 0.7847328186035156, + "num_tokens": 11541948.0, + "step": 6218, + "train/ce_loss": 0.7013130784034729 + }, + { + "epoch": 0.6147913782875223, + "step": 6218, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.6147913782875223, + "step": 6218, + "train/total_loss": 0.1795063018798828 + }, + { + "entropy": 8.299921035766602, + "epoch": 0.6148902511370378, + "mean_token_accuracy": 0.7516198754310608, + "num_tokens": 11547343.0, + "step": 6219, + "train/ce_loss": 0.756189227104187 + }, + { + "epoch": 0.6148902511370378, + "step": 6219, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6148902511370378, + "step": 6219, + "train/total_loss": 0.1342126727104187 + }, + { + "epoch": 0.6149891239865533, + "grad_norm": 0.6907902956008911, + "learning_rate": 8.464866735894775e-06, + "loss": 0.1332, + "step": 6220 + }, + { + "entropy": 9.126428604125977, + "epoch": 0.6149891239865533, + "mean_token_accuracy": 0.7591836452484131, + "num_tokens": 11552520.0, + "step": 6220, + "train/ce_loss": 1.4551453590393066 + }, + { + "epoch": 0.6149891239865533, + "step": 6220, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6149891239865533, + "step": 6220, + "train/total_loss": 0.20801453292369843 + }, + { + "entropy": 9.308960914611816, + "epoch": 0.6150879968360689, + "mean_token_accuracy": 0.7242105007171631, + "num_tokens": 11557445.0, + "step": 6221, + "train/ce_loss": 1.0523173809051514 + }, + { + "epoch": 0.6150879968360689, + "step": 6221, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6150879968360689, + "step": 6221, + "train/total_loss": 0.15210673213005066 + }, + { + "entropy": 9.870338439941406, + "epoch": 0.6151868696855843, + "mean_token_accuracy": 0.780927836894989, + "num_tokens": 11562238.0, + "step": 6222, + "train/ce_loss": 1.603911280632019 + }, + { + "epoch": 0.6151868696855843, + "step": 6222, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.6151868696855843, + "step": 6222, + "train/total_loss": 0.24242238700389862 + }, + { + "entropy": 8.37800121307373, + "epoch": 0.6152857425350998, + "mean_token_accuracy": 0.731225311756134, + "num_tokens": 11567710.0, + "step": 6223, + "train/ce_loss": 0.9325500130653381 + }, + { + "epoch": 0.6152857425350998, + "step": 6223, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6152857425350998, + "step": 6223, + "train/total_loss": 0.14013001322746277 + }, + { + "entropy": 9.188399314880371, + "epoch": 0.6153846153846154, + "mean_token_accuracy": 0.7624223828315735, + "num_tokens": 11572785.0, + "step": 6224, + "train/ce_loss": 9.269812153434032e-07 + }, + { + "epoch": 0.6153846153846154, + "step": 6224, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6153846153846154, + "step": 6224, + "train/total_loss": 0.07031258940696716 + }, + { + "entropy": 9.501968383789062, + "epoch": 0.6154834882341309, + "mean_token_accuracy": 0.776627242565155, + "num_tokens": 11577929.0, + "step": 6225, + "train/ce_loss": 0.9787071943283081 + }, + { + "epoch": 0.6154834882341309, + "step": 6225, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6154834882341309, + "step": 6225, + "train/total_loss": 0.12130822241306305 + }, + { + "entropy": 8.997016906738281, + "epoch": 0.6155823610836464, + "mean_token_accuracy": 0.7281323671340942, + "num_tokens": 11583168.0, + "step": 6226, + "train/ce_loss": 1.01045823097229 + }, + { + "epoch": 0.6155823610836464, + "step": 6226, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6155823610836464, + "step": 6226, + "train/total_loss": 0.13229581713676453 + }, + { + "entropy": 8.857669830322266, + "epoch": 0.615681233933162, + "mean_token_accuracy": 0.7765042781829834, + "num_tokens": 11588293.0, + "step": 6227, + "train/ce_loss": 1.1743441820144653 + }, + { + "epoch": 0.615681233933162, + "step": 6227, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.615681233933162, + "step": 6227, + "train/total_loss": 0.15259066224098206 + }, + { + "entropy": 8.81214714050293, + "epoch": 0.6157801067826775, + "mean_token_accuracy": 0.7611940503120422, + "num_tokens": 11593576.0, + "step": 6228, + "train/ce_loss": 0.9884230494499207 + }, + { + "epoch": 0.6157801067826775, + "step": 6228, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.6157801067826775, + "step": 6228, + "train/total_loss": 0.1261860579252243 + }, + { + "entropy": 8.675318717956543, + "epoch": 0.615878979632193, + "mean_token_accuracy": 0.7145969271659851, + "num_tokens": 11598959.0, + "step": 6229, + "train/ce_loss": 0.6410172581672668 + }, + { + "epoch": 0.615878979632193, + "step": 6229, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.615878979632193, + "step": 6229, + "train/total_loss": 0.14222672581672668 + }, + { + "entropy": 8.503095626831055, + "epoch": 0.6159778524817086, + "mean_token_accuracy": 0.7460484504699707, + "num_tokens": 11604406.0, + "step": 6230, + "train/ce_loss": 0.8307203650474548 + }, + { + "epoch": 0.6159778524817086, + "step": 6230, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.6159778524817086, + "step": 6230, + "train/total_loss": 0.17682203650474548 + }, + { + "entropy": 8.74378776550293, + "epoch": 0.616076725331224, + "mean_token_accuracy": 0.774193525314331, + "num_tokens": 11609913.0, + "step": 6231, + "train/ce_loss": 0.58668053150177 + }, + { + "epoch": 0.616076725331224, + "step": 6231, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.616076725331224, + "step": 6231, + "train/total_loss": 0.07429305464029312 + }, + { + "entropy": 8.621826171875, + "epoch": 0.6161755981807395, + "mean_token_accuracy": 0.7532728910446167, + "num_tokens": 11615550.0, + "step": 6232, + "train/ce_loss": 0.9261634945869446 + }, + { + "epoch": 0.6161755981807395, + "step": 6232, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.6161755981807395, + "step": 6232, + "train/total_loss": 0.16683509945869446 + }, + { + "entropy": 9.242183685302734, + "epoch": 0.6162744710302551, + "mean_token_accuracy": 0.7163233160972595, + "num_tokens": 11620645.0, + "step": 6233, + "train/ce_loss": 1.4938048124313354 + }, + { + "epoch": 0.6162744710302551, + "step": 6233, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6162744710302551, + "step": 6233, + "train/total_loss": 0.22750549018383026 + }, + { + "entropy": 9.009716033935547, + "epoch": 0.6163733438797706, + "mean_token_accuracy": 0.7153284549713135, + "num_tokens": 11625815.0, + "step": 6234, + "train/ce_loss": 7.141983360270387e-07 + }, + { + "epoch": 0.6163733438797706, + "step": 6234, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6163733438797706, + "step": 6234, + "train/total_loss": 0.07812507450580597 + }, + { + "entropy": 9.606576919555664, + "epoch": 0.6164722167292861, + "mean_token_accuracy": 0.7597172856330872, + "num_tokens": 11630837.0, + "step": 6235, + "train/ce_loss": 0.7407086491584778 + }, + { + "epoch": 0.6164722167292861, + "step": 6235, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6164722167292861, + "step": 6235, + "train/total_loss": 0.12094586342573166 + }, + { + "entropy": 9.218091011047363, + "epoch": 0.6165710895788017, + "mean_token_accuracy": 0.7296848893165588, + "num_tokens": 11635860.0, + "step": 6236, + "train/ce_loss": 0.9628185629844666 + }, + { + "epoch": 0.6165710895788017, + "step": 6236, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.6165710895788017, + "step": 6236, + "train/total_loss": 0.17050060629844666 + }, + { + "entropy": 10.20110034942627, + "epoch": 0.6166699624283172, + "mean_token_accuracy": 0.757446825504303, + "num_tokens": 11640456.0, + "step": 6237, + "train/ce_loss": 2.8913418645970523e-06 + }, + { + "epoch": 0.6166699624283172, + "step": 6237, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6166699624283172, + "step": 6237, + "train/total_loss": 0.02343778870999813 + }, + { + "entropy": 8.834396362304688, + "epoch": 0.6167688352778327, + "mean_token_accuracy": 0.7523584961891174, + "num_tokens": 11645773.0, + "step": 6238, + "train/ce_loss": 1.054317831993103 + }, + { + "epoch": 0.6167688352778327, + "step": 6238, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6167688352778327, + "step": 6238, + "train/total_loss": 0.14840054512023926 + }, + { + "entropy": 9.45189094543457, + "epoch": 0.6168677081273483, + "mean_token_accuracy": 0.7443946003913879, + "num_tokens": 11650648.0, + "step": 6239, + "train/ce_loss": 1.6271706044790335e-06 + }, + { + "epoch": 0.6168677081273483, + "step": 6239, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6168677081273483, + "step": 6239, + "train/total_loss": 0.04296891391277313 + }, + { + "epoch": 0.6169665809768637, + "grad_norm": 0.835460901260376, + "learning_rate": 8.459921871136824e-06, + "loss": 0.1377, + "step": 6240 + }, + { + "entropy": 8.653219223022461, + "epoch": 0.6169665809768637, + "mean_token_accuracy": 0.7077363729476929, + "num_tokens": 11656180.0, + "step": 6240, + "train/ce_loss": 1.6680773496627808 + }, + { + "epoch": 0.6169665809768637, + "step": 6240, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6169665809768637, + "step": 6240, + "train/total_loss": 0.23712024092674255 + }, + { + "entropy": 9.151749610900879, + "epoch": 0.6170654538263792, + "mean_token_accuracy": 0.7945945858955383, + "num_tokens": 11661209.0, + "step": 6241, + "train/ce_loss": 0.5691468119621277 + }, + { + "epoch": 0.6170654538263792, + "step": 6241, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6170654538263792, + "step": 6241, + "train/total_loss": 0.11550843715667725 + }, + { + "entropy": 8.956352233886719, + "epoch": 0.6171643266758948, + "mean_token_accuracy": 0.6747967600822449, + "num_tokens": 11666563.0, + "step": 6242, + "train/ce_loss": 0.6857337355613708 + }, + { + "epoch": 0.6171643266758948, + "step": 6242, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6171643266758948, + "step": 6242, + "train/total_loss": 0.09201087802648544 + }, + { + "entropy": 8.7849702835083, + "epoch": 0.6172631995254103, + "mean_token_accuracy": 0.7371638417243958, + "num_tokens": 11671848.0, + "step": 6243, + "train/ce_loss": 0.596751868724823 + }, + { + "epoch": 0.6172631995254103, + "step": 6243, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6172631995254103, + "step": 6243, + "train/total_loss": 0.0948314368724823 + }, + { + "entropy": 9.008289337158203, + "epoch": 0.6173620723749258, + "mean_token_accuracy": 0.8005865216255188, + "num_tokens": 11676988.0, + "step": 6244, + "train/ce_loss": 0.6968074440956116 + }, + { + "epoch": 0.6173620723749258, + "step": 6244, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6173620723749258, + "step": 6244, + "train/total_loss": 0.10093074291944504 + }, + { + "entropy": 9.25759220123291, + "epoch": 0.6174609452244414, + "mean_token_accuracy": 0.7796609997749329, + "num_tokens": 11682104.0, + "step": 6245, + "train/ce_loss": 1.2047686576843262 + }, + { + "epoch": 0.6174609452244414, + "step": 6245, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.6174609452244414, + "step": 6245, + "train/total_loss": 0.2259456217288971 + }, + { + "entropy": 8.841020584106445, + "epoch": 0.6175598180739569, + "mean_token_accuracy": 0.7383592128753662, + "num_tokens": 11687445.0, + "step": 6246, + "train/ce_loss": 0.7534462809562683 + }, + { + "epoch": 0.6175598180739569, + "step": 6246, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6175598180739569, + "step": 6246, + "train/total_loss": 0.12221962958574295 + }, + { + "entropy": 8.788390159606934, + "epoch": 0.6176586909234724, + "mean_token_accuracy": 0.7828418016433716, + "num_tokens": 11692736.0, + "step": 6247, + "train/ce_loss": 0.6744476556777954 + }, + { + "epoch": 0.6176586909234724, + "step": 6247, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6176586909234724, + "step": 6247, + "train/total_loss": 0.09088226407766342 + }, + { + "entropy": 9.2723388671875, + "epoch": 0.617757563772988, + "mean_token_accuracy": 0.7493036389350891, + "num_tokens": 11697864.0, + "step": 6248, + "train/ce_loss": 0.7439592480659485 + }, + { + "epoch": 0.617757563772988, + "step": 6248, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.617757563772988, + "step": 6248, + "train/total_loss": 0.14470842480659485 + }, + { + "entropy": 9.078840255737305, + "epoch": 0.6178564366225034, + "mean_token_accuracy": 0.6967560052871704, + "num_tokens": 11703003.0, + "step": 6249, + "train/ce_loss": 1.248230218887329 + }, + { + "epoch": 0.6178564366225034, + "step": 6249, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6178564366225034, + "step": 6249, + "train/total_loss": 0.19122928380966187 + }, + { + "entropy": 9.284374237060547, + "epoch": 0.6179553094720189, + "mean_token_accuracy": 0.6666666865348816, + "num_tokens": 11708102.0, + "step": 6250, + "train/ce_loss": 1.8740671873092651 + }, + { + "epoch": 0.6179553094720189, + "step": 6250, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.6179553094720189, + "step": 6250, + "train/total_loss": 0.2733442187309265 + }, + { + "entropy": 9.415840148925781, + "epoch": 0.6180541823215345, + "mean_token_accuracy": 0.7890772223472595, + "num_tokens": 11713093.0, + "step": 6251, + "train/ce_loss": 1.5481805801391602 + }, + { + "epoch": 0.6180541823215345, + "step": 6251, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6180541823215345, + "step": 6251, + "train/total_loss": 0.22513055801391602 + }, + { + "entropy": 8.931975364685059, + "epoch": 0.61815305517105, + "mean_token_accuracy": 0.7244284152984619, + "num_tokens": 11718370.0, + "step": 6252, + "train/ce_loss": 0.5441824793815613 + }, + { + "epoch": 0.61815305517105, + "step": 6252, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.61815305517105, + "step": 6252, + "train/total_loss": 0.10519950091838837 + }, + { + "entropy": 9.544601440429688, + "epoch": 0.6182519280205655, + "mean_token_accuracy": 0.7572559118270874, + "num_tokens": 11723170.0, + "step": 6253, + "train/ce_loss": 2.360188545935671e-06 + }, + { + "epoch": 0.6182519280205655, + "step": 6253, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6182519280205655, + "step": 6253, + "train/total_loss": 0.0546877346932888 + }, + { + "entropy": 8.967365264892578, + "epoch": 0.6183508008700811, + "mean_token_accuracy": 0.7658142447471619, + "num_tokens": 11728380.0, + "step": 6254, + "train/ce_loss": 0.2919480502605438 + }, + { + "epoch": 0.6183508008700811, + "step": 6254, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.6183508008700811, + "step": 6254, + "train/total_loss": 0.05653855577111244 + }, + { + "entropy": 9.081618309020996, + "epoch": 0.6184496737195966, + "mean_token_accuracy": 0.795484721660614, + "num_tokens": 11733605.0, + "step": 6255, + "train/ce_loss": 1.271119253942743e-05 + }, + { + "epoch": 0.6184496737195966, + "step": 6255, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6184496737195966, + "step": 6255, + "train/total_loss": 0.06640752404928207 + }, + { + "entropy": 8.944194793701172, + "epoch": 0.6185485465691121, + "mean_token_accuracy": 0.7186261415481567, + "num_tokens": 11738795.0, + "step": 6256, + "train/ce_loss": 0.8012778162956238 + }, + { + "epoch": 0.6185485465691121, + "step": 6256, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6185485465691121, + "step": 6256, + "train/total_loss": 0.1230965331196785 + }, + { + "entropy": 8.654861450195312, + "epoch": 0.6186474194186277, + "mean_token_accuracy": 0.7975663542747498, + "num_tokens": 11744109.0, + "step": 6257, + "train/ce_loss": 0.4617496728897095 + }, + { + "epoch": 0.6186474194186277, + "step": 6257, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6186474194186277, + "step": 6257, + "train/total_loss": 0.08914372324943542 + }, + { + "entropy": 9.075235366821289, + "epoch": 0.6187462922681431, + "mean_token_accuracy": 0.7780784964561462, + "num_tokens": 11749328.0, + "step": 6258, + "train/ce_loss": 0.7430073022842407 + }, + { + "epoch": 0.6187462922681431, + "step": 6258, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6187462922681431, + "step": 6258, + "train/total_loss": 0.12898823618888855 + }, + { + "entropy": 8.558425903320312, + "epoch": 0.6188451651176586, + "mean_token_accuracy": 0.7475622892379761, + "num_tokens": 11754749.0, + "step": 6259, + "train/ce_loss": 0.8147485256195068 + }, + { + "epoch": 0.6188451651176586, + "step": 6259, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.6188451651176586, + "step": 6259, + "train/total_loss": 0.17131860554218292 + }, + { + "epoch": 0.6189440379671742, + "grad_norm": 0.6895498633384705, + "learning_rate": 8.454977006378876e-06, + "loss": 0.1334, + "step": 6260 + }, + { + "entropy": 8.543374061584473, + "epoch": 0.6189440379671742, + "mean_token_accuracy": 0.7225490212440491, + "num_tokens": 11760286.0, + "step": 6260, + "train/ce_loss": 1.124602198600769 + }, + { + "epoch": 0.6189440379671742, + "step": 6260, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.6189440379671742, + "step": 6260, + "train/total_loss": 0.19449147582054138 + }, + { + "entropy": 9.300775527954102, + "epoch": 0.6190429108166897, + "mean_token_accuracy": 0.7272727489471436, + "num_tokens": 11765489.0, + "step": 6261, + "train/ce_loss": 0.8462640643119812 + }, + { + "epoch": 0.6190429108166897, + "step": 6261, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6190429108166897, + "step": 6261, + "train/total_loss": 0.11978265643119812 + }, + { + "entropy": 8.763044357299805, + "epoch": 0.6191417836662053, + "mean_token_accuracy": 0.7466216087341309, + "num_tokens": 11770834.0, + "step": 6262, + "train/ce_loss": 0.8813503980636597 + }, + { + "epoch": 0.6191417836662053, + "step": 6262, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6191417836662053, + "step": 6262, + "train/total_loss": 0.1428225338459015 + }, + { + "entropy": 8.738752365112305, + "epoch": 0.6192406565157208, + "mean_token_accuracy": 0.7152406573295593, + "num_tokens": 11775999.0, + "step": 6263, + "train/ce_loss": 0.6721773147583008 + }, + { + "epoch": 0.6192406565157208, + "step": 6263, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6192406565157208, + "step": 6263, + "train/total_loss": 0.11018647998571396 + }, + { + "entropy": 8.763019561767578, + "epoch": 0.6193395293652363, + "mean_token_accuracy": 0.6934097409248352, + "num_tokens": 11781148.0, + "step": 6264, + "train/ce_loss": 1.057170033454895 + }, + { + "epoch": 0.6193395293652363, + "step": 6264, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6193395293652363, + "step": 6264, + "train/total_loss": 0.1369670033454895 + }, + { + "entropy": 8.852022171020508, + "epoch": 0.6194384022147519, + "mean_token_accuracy": 0.7561880946159363, + "num_tokens": 11786434.0, + "step": 6265, + "train/ce_loss": 0.650295615196228 + }, + { + "epoch": 0.6194384022147519, + "step": 6265, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6194384022147519, + "step": 6265, + "train/total_loss": 0.0845608115196228 + }, + { + "entropy": 8.8131685256958, + "epoch": 0.6195372750642674, + "mean_token_accuracy": 0.7301587462425232, + "num_tokens": 11791750.0, + "step": 6266, + "train/ce_loss": 1.1719716787338257 + }, + { + "epoch": 0.6195372750642674, + "step": 6266, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6195372750642674, + "step": 6266, + "train/total_loss": 0.1640721708536148 + }, + { + "entropy": 9.048507690429688, + "epoch": 0.6196361479137829, + "mean_token_accuracy": 0.75, + "num_tokens": 11797015.0, + "step": 6267, + "train/ce_loss": 0.5786001086235046 + }, + { + "epoch": 0.6196361479137829, + "step": 6267, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6196361479137829, + "step": 6267, + "train/total_loss": 0.13598501682281494 + }, + { + "entropy": 9.642312049865723, + "epoch": 0.6197350207632984, + "mean_token_accuracy": 0.7819253206253052, + "num_tokens": 11801960.0, + "step": 6268, + "train/ce_loss": 0.6379991769790649 + }, + { + "epoch": 0.6197350207632984, + "step": 6268, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6197350207632984, + "step": 6268, + "train/total_loss": 0.0833311676979065 + }, + { + "entropy": 9.178560256958008, + "epoch": 0.6198338936128139, + "mean_token_accuracy": 0.7324159145355225, + "num_tokens": 11807065.0, + "step": 6269, + "train/ce_loss": 8.792806625024241e-07 + }, + { + "epoch": 0.6198338936128139, + "step": 6269, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6198338936128139, + "step": 6269, + "train/total_loss": 0.03125008940696716 + }, + { + "entropy": 8.404592514038086, + "epoch": 0.6199327664623294, + "mean_token_accuracy": 0.7387914061546326, + "num_tokens": 11812520.0, + "step": 6270, + "train/ce_loss": 0.7952864170074463 + }, + { + "epoch": 0.6199327664623294, + "step": 6270, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6199327664623294, + "step": 6270, + "train/total_loss": 0.10296614468097687 + }, + { + "entropy": 8.78640365600586, + "epoch": 0.620031639311845, + "mean_token_accuracy": 0.7868852615356445, + "num_tokens": 11817911.0, + "step": 6271, + "train/ce_loss": 0.78782057762146 + }, + { + "epoch": 0.620031639311845, + "step": 6271, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.620031639311845, + "step": 6271, + "train/total_loss": 0.12956330180168152 + }, + { + "entropy": 8.49759292602539, + "epoch": 0.6201305121613605, + "mean_token_accuracy": 0.7430703639984131, + "num_tokens": 11823337.0, + "step": 6272, + "train/ce_loss": 0.6456509232521057 + }, + { + "epoch": 0.6201305121613605, + "step": 6272, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.6201305121613605, + "step": 6272, + "train/total_loss": 0.09190884232521057 + }, + { + "entropy": 9.274723052978516, + "epoch": 0.620229385010876, + "mean_token_accuracy": 0.7439758777618408, + "num_tokens": 11828369.0, + "step": 6273, + "train/ce_loss": 0.9972986578941345 + }, + { + "epoch": 0.620229385010876, + "step": 6273, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.620229385010876, + "step": 6273, + "train/total_loss": 0.14660486578941345 + }, + { + "entropy": 8.786995887756348, + "epoch": 0.6203282578603916, + "mean_token_accuracy": 0.73380446434021, + "num_tokens": 11833698.0, + "step": 6274, + "train/ce_loss": 1.1607381105422974 + }, + { + "epoch": 0.6203282578603916, + "step": 6274, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6203282578603916, + "step": 6274, + "train/total_loss": 0.1824800670146942 + }, + { + "entropy": 8.677114486694336, + "epoch": 0.6204271307099071, + "mean_token_accuracy": 0.698074996471405, + "num_tokens": 11839113.0, + "step": 6275, + "train/ce_loss": 0.9493786692619324 + }, + { + "epoch": 0.6204271307099071, + "step": 6275, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6204271307099071, + "step": 6275, + "train/total_loss": 0.11837536841630936 + }, + { + "entropy": 8.763256072998047, + "epoch": 0.6205260035594226, + "mean_token_accuracy": 0.7807737588882446, + "num_tokens": 11844481.0, + "step": 6276, + "train/ce_loss": 0.3372906446456909 + }, + { + "epoch": 0.6205260035594226, + "step": 6276, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.6205260035594226, + "step": 6276, + "train/total_loss": 0.04935406520962715 + }, + { + "entropy": 8.933353424072266, + "epoch": 0.6206248764089382, + "mean_token_accuracy": 0.733742356300354, + "num_tokens": 11849706.0, + "step": 6277, + "train/ce_loss": 0.6353260278701782 + }, + { + "epoch": 0.6206248764089382, + "step": 6277, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6206248764089382, + "step": 6277, + "train/total_loss": 0.14165760576725006 + }, + { + "entropy": 8.695978164672852, + "epoch": 0.6207237492584536, + "mean_token_accuracy": 0.7405900359153748, + "num_tokens": 11855180.0, + "step": 6278, + "train/ce_loss": 0.6998786926269531 + }, + { + "epoch": 0.6207237492584536, + "step": 6278, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6207237492584536, + "step": 6278, + "train/total_loss": 0.14030036330223083 + }, + { + "entropy": 9.205509185791016, + "epoch": 0.6208226221079691, + "mean_token_accuracy": 0.7695418000221252, + "num_tokens": 11860317.0, + "step": 6279, + "train/ce_loss": 4.41487060243162e-07 + }, + { + "epoch": 0.6208226221079691, + "step": 6279, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6208226221079691, + "step": 6279, + "train/total_loss": 0.01953129470348358 + }, + { + "epoch": 0.6209214949574847, + "grad_norm": 0.5874653458595276, + "learning_rate": 8.450032141620927e-06, + "loss": 0.1324, + "step": 6280 + }, + { + "entropy": 9.59320068359375, + "epoch": 0.6209214949574847, + "mean_token_accuracy": 0.7029703259468079, + "num_tokens": 11865216.0, + "step": 6280, + "train/ce_loss": 9.11168342554447e-07 + }, + { + "epoch": 0.6209214949574847, + "step": 6280, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6209214949574847, + "step": 6280, + "train/total_loss": 0.03515633940696716 + }, + { + "entropy": 9.023270606994629, + "epoch": 0.6210203678070002, + "mean_token_accuracy": 0.686274528503418, + "num_tokens": 11870455.0, + "step": 6281, + "train/ce_loss": 0.9922044277191162 + }, + { + "epoch": 0.6210203678070002, + "step": 6281, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.6210203678070002, + "step": 6281, + "train/total_loss": 0.18906420469284058 + }, + { + "entropy": 9.550705909729004, + "epoch": 0.6211192406565157, + "mean_token_accuracy": 0.7443609237670898, + "num_tokens": 11875282.0, + "step": 6282, + "train/ce_loss": 2.086510903609451e-06 + }, + { + "epoch": 0.6211192406565157, + "step": 6282, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6211192406565157, + "step": 6282, + "train/total_loss": 0.039062708616256714 + }, + { + "entropy": 9.219742774963379, + "epoch": 0.6212181135060313, + "mean_token_accuracy": 0.7130434513092041, + "num_tokens": 11880299.0, + "step": 6283, + "train/ce_loss": 1.163111686706543 + }, + { + "epoch": 0.6212181135060313, + "step": 6283, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6212181135060313, + "step": 6283, + "train/total_loss": 0.15537366271018982 + }, + { + "entropy": 9.4982271194458, + "epoch": 0.6213169863555468, + "mean_token_accuracy": 0.701646089553833, + "num_tokens": 11885192.0, + "step": 6284, + "train/ce_loss": 2.250600814819336 + }, + { + "epoch": 0.6213169863555468, + "step": 6284, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.6213169863555468, + "step": 6284, + "train/total_loss": 0.3188101053237915 + }, + { + "entropy": 9.323200225830078, + "epoch": 0.6214158592050623, + "mean_token_accuracy": 0.8364197611808777, + "num_tokens": 11890271.0, + "step": 6285, + "train/ce_loss": 0.8418030142784119 + }, + { + "epoch": 0.6214158592050623, + "step": 6285, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6214158592050623, + "step": 6285, + "train/total_loss": 0.1232428029179573 + }, + { + "entropy": 8.596650123596191, + "epoch": 0.6215147320545779, + "mean_token_accuracy": 0.7080745100975037, + "num_tokens": 11895712.0, + "step": 6286, + "train/ce_loss": 1.0133122205734253 + }, + { + "epoch": 0.6215147320545779, + "step": 6286, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6215147320545779, + "step": 6286, + "train/total_loss": 0.15992498397827148 + }, + { + "entropy": 8.95394515991211, + "epoch": 0.6216136049040933, + "mean_token_accuracy": 0.7142857313156128, + "num_tokens": 11901026.0, + "step": 6287, + "train/ce_loss": 1.2561620473861694 + }, + { + "epoch": 0.6216136049040933, + "step": 6287, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.6216136049040933, + "step": 6287, + "train/total_loss": 0.21936620771884918 + }, + { + "entropy": 9.9725341796875, + "epoch": 0.6217124777536088, + "mean_token_accuracy": 0.8008849620819092, + "num_tokens": 11905646.0, + "step": 6288, + "train/ce_loss": 2.3084328174591064 + }, + { + "epoch": 0.6217124777536088, + "step": 6288, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6217124777536088, + "step": 6288, + "train/total_loss": 0.27771830558776855 + }, + { + "entropy": 8.914336204528809, + "epoch": 0.6218113506031244, + "mean_token_accuracy": 0.7150062918663025, + "num_tokens": 11910907.0, + "step": 6289, + "train/ce_loss": 1.248734712600708 + }, + { + "epoch": 0.6218113506031244, + "step": 6289, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6218113506031244, + "step": 6289, + "train/total_loss": 0.18737347424030304 + }, + { + "entropy": 8.865591049194336, + "epoch": 0.6219102234526399, + "mean_token_accuracy": 0.6875712871551514, + "num_tokens": 11916245.0, + "step": 6290, + "train/ce_loss": 1.488316297531128 + }, + { + "epoch": 0.6219102234526399, + "step": 6290, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6219102234526399, + "step": 6290, + "train/total_loss": 0.18008163571357727 + }, + { + "entropy": 9.334177017211914, + "epoch": 0.6220090963021554, + "mean_token_accuracy": 0.7226890921592712, + "num_tokens": 11921281.0, + "step": 6291, + "train/ce_loss": 1.2111235857009888 + }, + { + "epoch": 0.6220090963021554, + "step": 6291, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6220090963021554, + "step": 6291, + "train/total_loss": 0.16798736155033112 + }, + { + "entropy": 8.802948951721191, + "epoch": 0.622107969151671, + "mean_token_accuracy": 0.7705128192901611, + "num_tokens": 11926530.0, + "step": 6292, + "train/ce_loss": 0.5217919945716858 + }, + { + "epoch": 0.622107969151671, + "step": 6292, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.622107969151671, + "step": 6292, + "train/total_loss": 0.11467920243740082 + }, + { + "entropy": 9.367215156555176, + "epoch": 0.6222068420011865, + "mean_token_accuracy": 0.6818181872367859, + "num_tokens": 11931571.0, + "step": 6293, + "train/ce_loss": 1.4454087018966675 + }, + { + "epoch": 0.6222068420011865, + "step": 6293, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6222068420011865, + "step": 6293, + "train/total_loss": 0.21094712615013123 + }, + { + "entropy": 8.841418266296387, + "epoch": 0.622305714850702, + "mean_token_accuracy": 0.7589802742004395, + "num_tokens": 11937241.0, + "step": 6294, + "train/ce_loss": 0.7581228613853455 + }, + { + "epoch": 0.622305714850702, + "step": 6294, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.622305714850702, + "step": 6294, + "train/total_loss": 0.13831228017807007 + }, + { + "entropy": 8.810118675231934, + "epoch": 0.6224045877002176, + "mean_token_accuracy": 0.7293986678123474, + "num_tokens": 11942602.0, + "step": 6295, + "train/ce_loss": 0.8448886275291443 + }, + { + "epoch": 0.6224045877002176, + "step": 6295, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6224045877002176, + "step": 6295, + "train/total_loss": 0.11573886126279831 + }, + { + "entropy": 8.876449584960938, + "epoch": 0.622503460549733, + "mean_token_accuracy": 0.7777777910232544, + "num_tokens": 11947897.0, + "step": 6296, + "train/ce_loss": 0.6751101613044739 + }, + { + "epoch": 0.622503460549733, + "step": 6296, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.622503460549733, + "step": 6296, + "train/total_loss": 0.09876101464033127 + }, + { + "entropy": 9.05276107788086, + "epoch": 0.6226023333992485, + "mean_token_accuracy": 0.7639751434326172, + "num_tokens": 11953155.0, + "step": 6297, + "train/ce_loss": 0.8308619260787964 + }, + { + "epoch": 0.6226023333992485, + "step": 6297, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.6226023333992485, + "step": 6297, + "train/total_loss": 0.17292994260787964 + }, + { + "entropy": 9.669578552246094, + "epoch": 0.6227012062487641, + "mean_token_accuracy": 0.7356321811676025, + "num_tokens": 11957943.0, + "step": 6298, + "train/ce_loss": 1.627619981765747 + }, + { + "epoch": 0.6227012062487641, + "step": 6298, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6227012062487641, + "step": 6298, + "train/total_loss": 0.22916825115680695 + }, + { + "entropy": 8.85727596282959, + "epoch": 0.6228000790982796, + "mean_token_accuracy": 0.7109066843986511, + "num_tokens": 11963151.0, + "step": 6299, + "train/ce_loss": 1.0590969324111938 + }, + { + "epoch": 0.6228000790982796, + "step": 6299, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6228000790982796, + "step": 6299, + "train/total_loss": 0.14106595516204834 + }, + { + "epoch": 0.6228989519477951, + "grad_norm": 0.6998578906059265, + "learning_rate": 8.445087276862979e-06, + "loss": 0.1341, + "step": 6300 + }, + { + "entropy": 9.62222671508789, + "epoch": 0.6228989519477951, + "mean_token_accuracy": 0.7442307472229004, + "num_tokens": 11968136.0, + "step": 6300, + "train/ce_loss": 0.5273857116699219 + }, + { + "epoch": 0.6228989519477951, + "step": 6300, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6228989519477951, + "step": 6300, + "train/total_loss": 0.09180107712745667 + }, + { + "entropy": 9.020427703857422, + "epoch": 0.6229978247973107, + "mean_token_accuracy": 0.7334167957305908, + "num_tokens": 11973427.0, + "step": 6301, + "train/ce_loss": 0.9961581230163574 + }, + { + "epoch": 0.6229978247973107, + "step": 6301, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.6229978247973107, + "step": 6301, + "train/total_loss": 0.19336581230163574 + }, + { + "entropy": 9.166910171508789, + "epoch": 0.6230966976468262, + "mean_token_accuracy": 0.7630137205123901, + "num_tokens": 11978577.0, + "step": 6302, + "train/ce_loss": 0.8610888123512268 + }, + { + "epoch": 0.6230966976468262, + "step": 6302, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.6230966976468262, + "step": 6302, + "train/total_loss": 0.18767139315605164 + }, + { + "entropy": 8.74166488647461, + "epoch": 0.6231955704963417, + "mean_token_accuracy": 0.7590497732162476, + "num_tokens": 11983959.0, + "step": 6303, + "train/ce_loss": 0.8055565357208252 + }, + { + "epoch": 0.6231955704963417, + "step": 6303, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6231955704963417, + "step": 6303, + "train/total_loss": 0.12352440506219864 + }, + { + "entropy": 9.161663055419922, + "epoch": 0.6232944433458573, + "mean_token_accuracy": 0.8209876418113708, + "num_tokens": 11989073.0, + "step": 6304, + "train/ce_loss": 4.1683088056743145e-06 + }, + { + "epoch": 0.6232944433458573, + "step": 6304, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6232944433458573, + "step": 6304, + "train/total_loss": 0.023437917232513428 + }, + { + "entropy": 8.7100830078125, + "epoch": 0.6233933161953727, + "mean_token_accuracy": 0.7294994592666626, + "num_tokens": 11994481.0, + "step": 6305, + "train/ce_loss": 1.027742624282837 + }, + { + "epoch": 0.6233933161953727, + "step": 6305, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6233933161953727, + "step": 6305, + "train/total_loss": 0.1574617624282837 + }, + { + "entropy": 8.905563354492188, + "epoch": 0.6234921890448882, + "mean_token_accuracy": 0.728672981262207, + "num_tokens": 11999823.0, + "step": 6306, + "train/ce_loss": 1.1297070980072021 + }, + { + "epoch": 0.6234921890448882, + "step": 6306, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6234921890448882, + "step": 6306, + "train/total_loss": 0.19109570980072021 + }, + { + "entropy": 8.815194129943848, + "epoch": 0.6235910618944038, + "mean_token_accuracy": 0.7109634280204773, + "num_tokens": 12005220.0, + "step": 6307, + "train/ce_loss": 1.033612608909607 + }, + { + "epoch": 0.6235910618944038, + "step": 6307, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6235910618944038, + "step": 6307, + "train/total_loss": 0.18148626387119293 + }, + { + "entropy": 9.242063522338867, + "epoch": 0.6236899347439193, + "mean_token_accuracy": 0.7588757276535034, + "num_tokens": 12010285.0, + "step": 6308, + "train/ce_loss": 1.2583582247316372e-06 + }, + { + "epoch": 0.6236899347439193, + "step": 6308, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.6236899347439193, + "step": 6308, + "train/total_loss": 0.027343876659870148 + }, + { + "entropy": 8.727019309997559, + "epoch": 0.6237888075934348, + "mean_token_accuracy": 0.7207637429237366, + "num_tokens": 12015584.0, + "step": 6309, + "train/ce_loss": 0.8390503525733948 + }, + { + "epoch": 0.6237888075934348, + "step": 6309, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.6237888075934348, + "step": 6309, + "train/total_loss": 0.20499879121780396 + }, + { + "entropy": 8.89903450012207, + "epoch": 0.6238876804429504, + "mean_token_accuracy": 0.7966963052749634, + "num_tokens": 12020875.0, + "step": 6310, + "train/ce_loss": 0.5423491597175598 + }, + { + "epoch": 0.6238876804429504, + "step": 6310, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.6238876804429504, + "step": 6310, + "train/total_loss": 0.19095367193222046 + }, + { + "entropy": 9.328275680541992, + "epoch": 0.6239865532924659, + "mean_token_accuracy": 0.7196030020713806, + "num_tokens": 12025714.0, + "step": 6311, + "train/ce_loss": 2.41526198387146 + }, + { + "epoch": 0.6239865532924659, + "step": 6311, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6239865532924659, + "step": 6311, + "train/total_loss": 0.29621368646621704 + }, + { + "entropy": 9.257919311523438, + "epoch": 0.6240854261419814, + "mean_token_accuracy": 0.730659008026123, + "num_tokens": 12030887.0, + "step": 6312, + "train/ce_loss": 0.5913631319999695 + }, + { + "epoch": 0.6240854261419814, + "step": 6312, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.6240854261419814, + "step": 6312, + "train/total_loss": 0.1411675661802292 + }, + { + "entropy": 8.812848091125488, + "epoch": 0.624184298991497, + "mean_token_accuracy": 0.7369697093963623, + "num_tokens": 12036388.0, + "step": 6313, + "train/ce_loss": 0.6528817415237427 + }, + { + "epoch": 0.624184298991497, + "step": 6313, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.624184298991497, + "step": 6313, + "train/total_loss": 0.12388192862272263 + }, + { + "entropy": 8.925192832946777, + "epoch": 0.6242831718410125, + "mean_token_accuracy": 0.7349260449409485, + "num_tokens": 12041744.0, + "step": 6314, + "train/ce_loss": 0.751113772392273 + }, + { + "epoch": 0.6242831718410125, + "step": 6314, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6242831718410125, + "step": 6314, + "train/total_loss": 0.12589263916015625 + }, + { + "entropy": 9.45772933959961, + "epoch": 0.6243820446905279, + "mean_token_accuracy": 0.7693575024604797, + "num_tokens": 12046786.0, + "step": 6315, + "train/ce_loss": 0.7212726473808289 + }, + { + "epoch": 0.6243820446905279, + "step": 6315, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6243820446905279, + "step": 6315, + "train/total_loss": 0.10337726771831512 + }, + { + "entropy": 9.179908752441406, + "epoch": 0.6244809175400435, + "mean_token_accuracy": 0.7732793688774109, + "num_tokens": 12051926.0, + "step": 6316, + "train/ce_loss": 0.7527188062667847 + }, + { + "epoch": 0.6244809175400435, + "step": 6316, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6244809175400435, + "step": 6316, + "train/total_loss": 0.10652188211679459 + }, + { + "entropy": 8.725812911987305, + "epoch": 0.624579790389559, + "mean_token_accuracy": 0.6998950839042664, + "num_tokens": 12057407.0, + "step": 6317, + "train/ce_loss": 1.3678371906280518 + }, + { + "epoch": 0.624579790389559, + "step": 6317, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.624579790389559, + "step": 6317, + "train/total_loss": 0.15631496906280518 + }, + { + "entropy": 8.594905853271484, + "epoch": 0.6246786632390745, + "mean_token_accuracy": 0.7718191146850586, + "num_tokens": 12062862.0, + "step": 6318, + "train/ce_loss": 0.7105671763420105 + }, + { + "epoch": 0.6246786632390745, + "step": 6318, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6246786632390745, + "step": 6318, + "train/total_loss": 0.11793171614408493 + }, + { + "entropy": 8.683724403381348, + "epoch": 0.6247775360885901, + "mean_token_accuracy": 0.8104794025421143, + "num_tokens": 12068199.0, + "step": 6319, + "train/ce_loss": 0.5049375891685486 + }, + { + "epoch": 0.6247775360885901, + "step": 6319, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.6247775360885901, + "step": 6319, + "train/total_loss": 0.0778375118970871 + }, + { + "epoch": 0.6248764089381056, + "grad_norm": 0.549186110496521, + "learning_rate": 8.44014241210503e-06, + "loss": 0.1383, + "step": 6320 + }, + { + "entropy": 9.419797897338867, + "epoch": 0.6248764089381056, + "mean_token_accuracy": 0.6722129583358765, + "num_tokens": 12073177.0, + "step": 6320, + "train/ce_loss": 1.4812781810760498 + }, + { + "epoch": 0.6248764089381056, + "step": 6320, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6248764089381056, + "step": 6320, + "train/total_loss": 0.22625282406806946 + }, + { + "entropy": 9.006487846374512, + "epoch": 0.6249752817876211, + "mean_token_accuracy": 0.7139561772346497, + "num_tokens": 12078506.0, + "step": 6321, + "train/ce_loss": 1.4044777154922485 + }, + { + "epoch": 0.6249752817876211, + "step": 6321, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6249752817876211, + "step": 6321, + "train/total_loss": 0.17951028048992157 + }, + { + "entropy": 8.784879684448242, + "epoch": 0.6250741546371367, + "mean_token_accuracy": 0.7167947292327881, + "num_tokens": 12083920.0, + "step": 6322, + "train/ce_loss": 0.6876106858253479 + }, + { + "epoch": 0.6250741546371367, + "step": 6322, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.6250741546371367, + "step": 6322, + "train/total_loss": 0.18985483050346375 + }, + { + "entropy": 9.060832977294922, + "epoch": 0.6251730274866522, + "mean_token_accuracy": 0.7107232213020325, + "num_tokens": 12089167.0, + "step": 6323, + "train/ce_loss": 0.8286123871803284 + }, + { + "epoch": 0.6251730274866522, + "step": 6323, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6251730274866522, + "step": 6323, + "train/total_loss": 0.14536124467849731 + }, + { + "entropy": 9.480755805969238, + "epoch": 0.6252719003361676, + "mean_token_accuracy": 0.7591623067855835, + "num_tokens": 12093963.0, + "step": 6324, + "train/ce_loss": 1.2280468940734863 + }, + { + "epoch": 0.6252719003361676, + "step": 6324, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6252719003361676, + "step": 6324, + "train/total_loss": 0.1892109513282776 + }, + { + "entropy": 9.252645492553711, + "epoch": 0.6253707731856832, + "mean_token_accuracy": 0.6762226223945618, + "num_tokens": 12099018.0, + "step": 6325, + "train/ce_loss": 1.6568008661270142 + }, + { + "epoch": 0.6253707731856832, + "step": 6325, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6253707731856832, + "step": 6325, + "train/total_loss": 0.22427384555339813 + }, + { + "entropy": 9.285510063171387, + "epoch": 0.6254696460351987, + "mean_token_accuracy": 0.7503828406333923, + "num_tokens": 12104055.0, + "step": 6326, + "train/ce_loss": 0.9159083366394043 + }, + { + "epoch": 0.6254696460351987, + "step": 6326, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.6254696460351987, + "step": 6326, + "train/total_loss": 0.21659083664417267 + }, + { + "entropy": 9.442464828491211, + "epoch": 0.6255685188847142, + "mean_token_accuracy": 0.7549019455909729, + "num_tokens": 12109112.0, + "step": 6327, + "train/ce_loss": 1.071745753288269 + }, + { + "epoch": 0.6255685188847142, + "step": 6327, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6255685188847142, + "step": 6327, + "train/total_loss": 0.1501433253288269 + }, + { + "entropy": 8.657928466796875, + "epoch": 0.6256673917342298, + "mean_token_accuracy": 0.7690762877464294, + "num_tokens": 12114601.0, + "step": 6328, + "train/ce_loss": 0.8865790367126465 + }, + { + "epoch": 0.6256673917342298, + "step": 6328, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.6256673917342298, + "step": 6328, + "train/total_loss": 0.116001658141613 + }, + { + "entropy": 9.665094375610352, + "epoch": 0.6257662645837453, + "mean_token_accuracy": 0.7170731425285339, + "num_tokens": 12119404.0, + "step": 6329, + "train/ce_loss": 1.880294919013977 + }, + { + "epoch": 0.6257662645837453, + "step": 6329, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6257662645837453, + "step": 6329, + "train/total_loss": 0.23490449786186218 + }, + { + "entropy": 9.09188461303711, + "epoch": 0.6258651374332608, + "mean_token_accuracy": 0.7213656306266785, + "num_tokens": 12124754.0, + "step": 6330, + "train/ce_loss": 1.0878103971481323 + }, + { + "epoch": 0.6258651374332608, + "step": 6330, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6258651374332608, + "step": 6330, + "train/total_loss": 0.15565603971481323 + }, + { + "entropy": 8.86585807800293, + "epoch": 0.6259640102827764, + "mean_token_accuracy": 0.6994949579238892, + "num_tokens": 12130026.0, + "step": 6331, + "train/ce_loss": 0.7202993631362915 + }, + { + "epoch": 0.6259640102827764, + "step": 6331, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6259640102827764, + "step": 6331, + "train/total_loss": 0.1501549482345581 + }, + { + "entropy": 9.033773422241211, + "epoch": 0.6260628831322919, + "mean_token_accuracy": 0.800000011920929, + "num_tokens": 12135253.0, + "step": 6332, + "train/ce_loss": 1.0049701586467563e-06 + }, + { + "epoch": 0.6260628831322919, + "step": 6332, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6260628831322919, + "step": 6332, + "train/total_loss": 0.06640634685754776 + }, + { + "entropy": 8.9903564453125, + "epoch": 0.6261617559818073, + "mean_token_accuracy": 0.7041420340538025, + "num_tokens": 12140546.0, + "step": 6333, + "train/ce_loss": 1.2116358280181885 + }, + { + "epoch": 0.6261617559818073, + "step": 6333, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6261617559818073, + "step": 6333, + "train/total_loss": 0.17194482684135437 + }, + { + "entropy": 8.666067123413086, + "epoch": 0.6262606288313229, + "mean_token_accuracy": 0.7283422350883484, + "num_tokens": 12145945.0, + "step": 6334, + "train/ce_loss": 0.9988052248954773 + }, + { + "epoch": 0.6262606288313229, + "step": 6334, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6262606288313229, + "step": 6334, + "train/total_loss": 0.13503676652908325 + }, + { + "entropy": 8.50477409362793, + "epoch": 0.6263595016808384, + "mean_token_accuracy": 0.7868852615356445, + "num_tokens": 12151558.0, + "step": 6335, + "train/ce_loss": 0.24824056029319763 + }, + { + "epoch": 0.6263595016808384, + "step": 6335, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6263595016808384, + "step": 6335, + "train/total_loss": 0.04826155677437782 + }, + { + "entropy": 8.916728973388672, + "epoch": 0.6264583745303539, + "mean_token_accuracy": 0.7371134161949158, + "num_tokens": 12156812.0, + "step": 6336, + "train/ce_loss": 0.47676023840904236 + }, + { + "epoch": 0.6264583745303539, + "step": 6336, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6264583745303539, + "step": 6336, + "train/total_loss": 0.09845727682113647 + }, + { + "entropy": 8.71766471862793, + "epoch": 0.6265572473798695, + "mean_token_accuracy": 0.749417245388031, + "num_tokens": 12162113.0, + "step": 6337, + "train/ce_loss": 0.6502630114555359 + }, + { + "epoch": 0.6265572473798695, + "step": 6337, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6265572473798695, + "step": 6337, + "train/total_loss": 0.13533881306648254 + }, + { + "entropy": 8.337970733642578, + "epoch": 0.626656120229385, + "mean_token_accuracy": 0.7336448431015015, + "num_tokens": 12167685.0, + "step": 6338, + "train/ce_loss": 0.7648563981056213 + }, + { + "epoch": 0.626656120229385, + "step": 6338, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.626656120229385, + "step": 6338, + "train/total_loss": 0.09211064130067825 + }, + { + "entropy": 9.744911193847656, + "epoch": 0.6267549930789005, + "mean_token_accuracy": 0.7819905281066895, + "num_tokens": 12172556.0, + "step": 6339, + "train/ce_loss": 2.3105878881324315e-06 + }, + { + "epoch": 0.6267549930789005, + "step": 6339, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6267549930789005, + "step": 6339, + "train/total_loss": 0.046875230967998505 + }, + { + "epoch": 0.6268538659284161, + "grad_norm": 0.67138671875, + "learning_rate": 8.43519754734708e-06, + "loss": 0.1386, + "step": 6340 + }, + { + "entropy": 8.503151893615723, + "epoch": 0.6268538659284161, + "mean_token_accuracy": 0.7273972630500793, + "num_tokens": 12177778.0, + "step": 6340, + "train/ce_loss": 0.588840126991272 + }, + { + "epoch": 0.6268538659284161, + "step": 6340, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.6268538659284161, + "step": 6340, + "train/total_loss": 0.14872775971889496 + }, + { + "entropy": 9.482362747192383, + "epoch": 0.6269527387779316, + "mean_token_accuracy": 0.779552698135376, + "num_tokens": 12182813.0, + "step": 6341, + "train/ce_loss": 0.9476832151412964 + }, + { + "epoch": 0.6269527387779316, + "step": 6341, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6269527387779316, + "step": 6341, + "train/total_loss": 0.15726831555366516 + }, + { + "entropy": 9.053950309753418, + "epoch": 0.6270516116274472, + "mean_token_accuracy": 0.7777777910232544, + "num_tokens": 12188013.0, + "step": 6342, + "train/ce_loss": 0.9743828773498535 + }, + { + "epoch": 0.6270516116274472, + "step": 6342, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6270516116274472, + "step": 6342, + "train/total_loss": 0.1521257907152176 + }, + { + "entropy": 9.118183135986328, + "epoch": 0.6271504844769626, + "mean_token_accuracy": 0.7409972548484802, + "num_tokens": 12193201.0, + "step": 6343, + "train/ce_loss": 0.8215731978416443 + }, + { + "epoch": 0.6271504844769626, + "step": 6343, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.6271504844769626, + "step": 6343, + "train/total_loss": 0.18371981382369995 + }, + { + "entropy": 9.800601959228516, + "epoch": 0.6272493573264781, + "mean_token_accuracy": 0.7318007946014404, + "num_tokens": 12198142.0, + "step": 6344, + "train/ce_loss": 1.7427619695663452 + }, + { + "epoch": 0.6272493573264781, + "step": 6344, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6272493573264781, + "step": 6344, + "train/total_loss": 0.240682452917099 + }, + { + "entropy": 9.138225555419922, + "epoch": 0.6273482301759937, + "mean_token_accuracy": 0.7191176414489746, + "num_tokens": 12203236.0, + "step": 6345, + "train/ce_loss": 1.5853768587112427 + }, + { + "epoch": 0.6273482301759937, + "step": 6345, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6273482301759937, + "step": 6345, + "train/total_loss": 0.18978768587112427 + }, + { + "entropy": 8.585261344909668, + "epoch": 0.6274471030255092, + "mean_token_accuracy": 0.7087967395782471, + "num_tokens": 12208698.0, + "step": 6346, + "train/ce_loss": 0.6669883131980896 + }, + { + "epoch": 0.6274471030255092, + "step": 6346, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6274471030255092, + "step": 6346, + "train/total_loss": 0.1252925843000412 + }, + { + "entropy": 8.82328987121582, + "epoch": 0.6275459758750247, + "mean_token_accuracy": 0.7107329964637756, + "num_tokens": 12213926.0, + "step": 6347, + "train/ce_loss": 0.7975558042526245 + }, + { + "epoch": 0.6275459758750247, + "step": 6347, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6275459758750247, + "step": 6347, + "train/total_loss": 0.15006807446479797 + }, + { + "entropy": 8.962874412536621, + "epoch": 0.6276448487245403, + "mean_token_accuracy": 0.7805164456367493, + "num_tokens": 12219260.0, + "step": 6348, + "train/ce_loss": 0.5232795476913452 + }, + { + "epoch": 0.6276448487245403, + "step": 6348, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6276448487245403, + "step": 6348, + "train/total_loss": 0.075765460729599 + }, + { + "entropy": 9.269477844238281, + "epoch": 0.6277437215740558, + "mean_token_accuracy": 0.7456647157669067, + "num_tokens": 12224416.0, + "step": 6349, + "train/ce_loss": 1.0365246534347534 + }, + { + "epoch": 0.6277437215740558, + "step": 6349, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.6277437215740558, + "step": 6349, + "train/total_loss": 0.1934962272644043 + }, + { + "entropy": 8.638045310974121, + "epoch": 0.6278425944235713, + "mean_token_accuracy": 0.7481323480606079, + "num_tokens": 12229876.0, + "step": 6350, + "train/ce_loss": 0.6363872289657593 + }, + { + "epoch": 0.6278425944235713, + "step": 6350, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.6278425944235713, + "step": 6350, + "train/total_loss": 0.09098247438669205 + }, + { + "entropy": 9.32752799987793, + "epoch": 0.6279414672730869, + "mean_token_accuracy": 0.7577807903289795, + "num_tokens": 12235072.0, + "step": 6351, + "train/ce_loss": 0.5960943698883057 + }, + { + "epoch": 0.6279414672730869, + "step": 6351, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.6279414672730869, + "step": 6351, + "train/total_loss": 0.07523444294929504 + }, + { + "entropy": 8.9277925491333, + "epoch": 0.6280403401226023, + "mean_token_accuracy": 0.7244501709938049, + "num_tokens": 12240321.0, + "step": 6352, + "train/ce_loss": 0.8829100728034973 + }, + { + "epoch": 0.6280403401226023, + "step": 6352, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6280403401226023, + "step": 6352, + "train/total_loss": 0.1312597692012787 + }, + { + "entropy": 9.443146705627441, + "epoch": 0.6281392129721178, + "mean_token_accuracy": 0.7212317585945129, + "num_tokens": 12245356.0, + "step": 6353, + "train/ce_loss": 1.0199133157730103 + }, + { + "epoch": 0.6281392129721178, + "step": 6353, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6281392129721178, + "step": 6353, + "train/total_loss": 0.16449132561683655 + }, + { + "entropy": 9.374839782714844, + "epoch": 0.6282380858216334, + "mean_token_accuracy": 0.7766666412353516, + "num_tokens": 12250374.0, + "step": 6354, + "train/ce_loss": 0.8880688548088074 + }, + { + "epoch": 0.6282380858216334, + "step": 6354, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6282380858216334, + "step": 6354, + "train/total_loss": 0.1395881474018097 + }, + { + "entropy": 8.867729187011719, + "epoch": 0.6283369586711489, + "mean_token_accuracy": 0.732119619846344, + "num_tokens": 12255585.0, + "step": 6355, + "train/ce_loss": 1.7234163284301758 + }, + { + "epoch": 0.6283369586711489, + "step": 6355, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6283369586711489, + "step": 6355, + "train/total_loss": 0.25046664476394653 + }, + { + "entropy": 9.388571739196777, + "epoch": 0.6284358315206644, + "mean_token_accuracy": 0.7986577153205872, + "num_tokens": 12260623.0, + "step": 6356, + "train/ce_loss": 0.6396639943122864 + }, + { + "epoch": 0.6284358315206644, + "step": 6356, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6284358315206644, + "step": 6356, + "train/total_loss": 0.12256015092134476 + }, + { + "entropy": 8.816532135009766, + "epoch": 0.62853470437018, + "mean_token_accuracy": 0.8092672228813171, + "num_tokens": 12266060.0, + "step": 6357, + "train/ce_loss": 0.7067915201187134 + }, + { + "epoch": 0.62853470437018, + "step": 6357, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.62853470437018, + "step": 6357, + "train/total_loss": 0.12536665797233582 + }, + { + "entropy": 8.95190715789795, + "epoch": 0.6286335772196955, + "mean_token_accuracy": 0.7230320572853088, + "num_tokens": 12271182.0, + "step": 6358, + "train/ce_loss": 0.9889823794364929 + }, + { + "epoch": 0.6286335772196955, + "step": 6358, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6286335772196955, + "step": 6358, + "train/total_loss": 0.16530448198318481 + }, + { + "entropy": 8.838409423828125, + "epoch": 0.628732450069211, + "mean_token_accuracy": 0.7487437129020691, + "num_tokens": 12276478.0, + "step": 6359, + "train/ce_loss": 0.9863818883895874 + }, + { + "epoch": 0.628732450069211, + "step": 6359, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.628732450069211, + "step": 6359, + "train/total_loss": 0.15332569181919098 + }, + { + "epoch": 0.6288313229187266, + "grad_norm": 0.8530322313308716, + "learning_rate": 8.430252682589132e-06, + "loss": 0.1424, + "step": 6360 + }, + { + "entropy": 9.095026016235352, + "epoch": 0.6288313229187266, + "mean_token_accuracy": 0.74609375, + "num_tokens": 12281716.0, + "step": 6360, + "train/ce_loss": 1.0583113431930542 + }, + { + "epoch": 0.6288313229187266, + "step": 6360, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6288313229187266, + "step": 6360, + "train/total_loss": 0.15661239624023438 + }, + { + "entropy": 9.41407585144043, + "epoch": 0.628930195768242, + "mean_token_accuracy": 0.7508305907249451, + "num_tokens": 12286747.0, + "step": 6361, + "train/ce_loss": 1.2370884418487549 + }, + { + "epoch": 0.628930195768242, + "step": 6361, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.628930195768242, + "step": 6361, + "train/total_loss": 0.1705838441848755 + }, + { + "entropy": 9.389739036560059, + "epoch": 0.6290290686177575, + "mean_token_accuracy": 0.715925395488739, + "num_tokens": 12291911.0, + "step": 6362, + "train/ce_loss": 1.855309247970581 + }, + { + "epoch": 0.6290290686177575, + "step": 6362, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.6290290686177575, + "step": 6362, + "train/total_loss": 0.2675621807575226 + }, + { + "entropy": 8.758782386779785, + "epoch": 0.6291279414672731, + "mean_token_accuracy": 0.7784290909767151, + "num_tokens": 12297251.0, + "step": 6363, + "train/ce_loss": 1.0631740093231201 + }, + { + "epoch": 0.6291279414672731, + "step": 6363, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6291279414672731, + "step": 6363, + "train/total_loss": 0.168817400932312 + }, + { + "entropy": 9.42778491973877, + "epoch": 0.6292268143167886, + "mean_token_accuracy": 0.7454545497894287, + "num_tokens": 12302070.0, + "step": 6364, + "train/ce_loss": 3.4973986657860223e-06 + }, + { + "epoch": 0.6292268143167886, + "step": 6364, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6292268143167886, + "step": 6364, + "train/total_loss": 0.050781600177288055 + }, + { + "entropy": 9.25421142578125, + "epoch": 0.6293256871663041, + "mean_token_accuracy": 0.7796852588653564, + "num_tokens": 12307195.0, + "step": 6365, + "train/ce_loss": 0.7625394463539124 + }, + { + "epoch": 0.6293256871663041, + "step": 6365, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6293256871663041, + "step": 6365, + "train/total_loss": 0.1270352005958557 + }, + { + "entropy": 9.317729949951172, + "epoch": 0.6294245600158197, + "mean_token_accuracy": 0.737730085849762, + "num_tokens": 12312297.0, + "step": 6366, + "train/ce_loss": 1.4506595134735107 + }, + { + "epoch": 0.6294245600158197, + "step": 6366, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6294245600158197, + "step": 6366, + "train/total_loss": 0.19975344836711884 + }, + { + "entropy": 9.388843536376953, + "epoch": 0.6295234328653352, + "mean_token_accuracy": 0.7343096137046814, + "num_tokens": 12317217.0, + "step": 6367, + "train/ce_loss": 4.443568286660593e-06 + }, + { + "epoch": 0.6295234328653352, + "step": 6367, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6295234328653352, + "step": 6367, + "train/total_loss": 0.04296919330954552 + }, + { + "entropy": 9.684952735900879, + "epoch": 0.6296223057148507, + "mean_token_accuracy": 0.6673684120178223, + "num_tokens": 12322082.0, + "step": 6368, + "train/ce_loss": 2.226114511489868 + }, + { + "epoch": 0.6296223057148507, + "step": 6368, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6296223057148507, + "step": 6368, + "train/total_loss": 0.2929239571094513 + }, + { + "entropy": 9.337738037109375, + "epoch": 0.6297211785643663, + "mean_token_accuracy": 0.7398753762245178, + "num_tokens": 12327158.0, + "step": 6369, + "train/ce_loss": 1.3865617513656616 + }, + { + "epoch": 0.6297211785643663, + "step": 6369, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6297211785643663, + "step": 6369, + "train/total_loss": 0.20506243407726288 + }, + { + "entropy": 9.073951721191406, + "epoch": 0.6298200514138818, + "mean_token_accuracy": 0.7574578523635864, + "num_tokens": 12332418.0, + "step": 6370, + "train/ce_loss": 0.981586217880249 + }, + { + "epoch": 0.6298200514138818, + "step": 6370, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6298200514138818, + "step": 6370, + "train/total_loss": 0.16456487774848938 + }, + { + "entropy": 9.461869239807129, + "epoch": 0.6299189242633972, + "mean_token_accuracy": 0.7388888597488403, + "num_tokens": 12337395.0, + "step": 6371, + "train/ce_loss": 1.225242018699646 + }, + { + "epoch": 0.6299189242633972, + "step": 6371, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.6299189242633972, + "step": 6371, + "train/total_loss": 0.2084617018699646 + }, + { + "entropy": 8.798147201538086, + "epoch": 0.6300177971129128, + "mean_token_accuracy": 0.7759162187576294, + "num_tokens": 12342868.0, + "step": 6372, + "train/ce_loss": 0.590668797492981 + }, + { + "epoch": 0.6300177971129128, + "step": 6372, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6300177971129128, + "step": 6372, + "train/total_loss": 0.07859812676906586 + }, + { + "entropy": 8.785633087158203, + "epoch": 0.6301166699624283, + "mean_token_accuracy": 0.7649824023246765, + "num_tokens": 12348262.0, + "step": 6373, + "train/ce_loss": 0.49392861127853394 + }, + { + "epoch": 0.6301166699624283, + "step": 6373, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6301166699624283, + "step": 6373, + "train/total_loss": 0.10798661410808563 + }, + { + "entropy": 9.257098197937012, + "epoch": 0.6302155428119438, + "mean_token_accuracy": 0.7217742204666138, + "num_tokens": 12353458.0, + "step": 6374, + "train/ce_loss": 1.1889922618865967 + }, + { + "epoch": 0.6302155428119438, + "step": 6374, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.6302155428119438, + "step": 6374, + "train/total_loss": 0.20483672618865967 + }, + { + "entropy": 8.840173721313477, + "epoch": 0.6303144156614594, + "mean_token_accuracy": 0.7063491940498352, + "num_tokens": 12358819.0, + "step": 6375, + "train/ce_loss": 1.1618380546569824 + }, + { + "epoch": 0.6303144156614594, + "step": 6375, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.6303144156614594, + "step": 6375, + "train/total_loss": 0.2021213173866272 + }, + { + "entropy": 9.291227340698242, + "epoch": 0.6304132885109749, + "mean_token_accuracy": 0.761049747467041, + "num_tokens": 12363973.0, + "step": 6376, + "train/ce_loss": 1.1883305311203003 + }, + { + "epoch": 0.6304132885109749, + "step": 6376, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.6304132885109749, + "step": 6376, + "train/total_loss": 0.20477056503295898 + }, + { + "entropy": 8.358701705932617, + "epoch": 0.6305121613604904, + "mean_token_accuracy": 0.7315130829811096, + "num_tokens": 12369348.0, + "step": 6377, + "train/ce_loss": 0.6801207661628723 + }, + { + "epoch": 0.6305121613604904, + "step": 6377, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6305121613604904, + "step": 6377, + "train/total_loss": 0.1305120885372162 + }, + { + "entropy": 8.610776901245117, + "epoch": 0.630611034210006, + "mean_token_accuracy": 0.728105902671814, + "num_tokens": 12374810.0, + "step": 6378, + "train/ce_loss": 1.374704122543335 + }, + { + "epoch": 0.630611034210006, + "step": 6378, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.630611034210006, + "step": 6378, + "train/total_loss": 0.23122040927410126 + }, + { + "entropy": 8.675466537475586, + "epoch": 0.6307099070595215, + "mean_token_accuracy": 0.7681159377098083, + "num_tokens": 12380439.0, + "step": 6379, + "train/ce_loss": 1.0569945573806763 + }, + { + "epoch": 0.6307099070595215, + "step": 6379, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.6307099070595215, + "step": 6379, + "train/total_loss": 0.19163694977760315 + }, + { + "epoch": 0.6308087799090369, + "grad_norm": 0.5189346671104431, + "learning_rate": 8.425307817831183e-06, + "loss": 0.1383, + "step": 6380 + }, + { + "entropy": 8.942543029785156, + "epoch": 0.6308087799090369, + "mean_token_accuracy": 0.7570093274116516, + "num_tokens": 12385839.0, + "step": 6380, + "train/ce_loss": 1.4659301042556763 + }, + { + "epoch": 0.6308087799090369, + "step": 6380, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6308087799090369, + "step": 6380, + "train/total_loss": 0.21299926936626434 + }, + { + "entropy": 8.634477615356445, + "epoch": 0.6309076527585525, + "mean_token_accuracy": 0.7297979593276978, + "num_tokens": 12391104.0, + "step": 6381, + "train/ce_loss": 0.9466655254364014 + }, + { + "epoch": 0.6309076527585525, + "step": 6381, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6309076527585525, + "step": 6381, + "train/total_loss": 0.15716655552387238 + }, + { + "entropy": 9.339715957641602, + "epoch": 0.631006525608068, + "mean_token_accuracy": 0.6768377423286438, + "num_tokens": 12396287.0, + "step": 6382, + "train/ce_loss": 1.5789258480072021 + }, + { + "epoch": 0.631006525608068, + "step": 6382, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.631006525608068, + "step": 6382, + "train/total_loss": 0.23211133480072021 + }, + { + "entropy": 8.758424758911133, + "epoch": 0.6311053984575835, + "mean_token_accuracy": 0.6982492208480835, + "num_tokens": 12401773.0, + "step": 6383, + "train/ce_loss": 0.5240266919136047 + }, + { + "epoch": 0.6311053984575835, + "step": 6383, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6311053984575835, + "step": 6383, + "train/total_loss": 0.09927767515182495 + }, + { + "entropy": 8.72703742980957, + "epoch": 0.6312042713070991, + "mean_token_accuracy": 0.7908878326416016, + "num_tokens": 12407133.0, + "step": 6384, + "train/ce_loss": 0.3531772792339325 + }, + { + "epoch": 0.6312042713070991, + "step": 6384, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6312042713070991, + "step": 6384, + "train/total_loss": 0.05875523015856743 + }, + { + "entropy": 9.161531448364258, + "epoch": 0.6313031441566146, + "mean_token_accuracy": 0.7280831933021545, + "num_tokens": 12412248.0, + "step": 6385, + "train/ce_loss": 0.7096350789070129 + }, + { + "epoch": 0.6313031441566146, + "step": 6385, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6313031441566146, + "step": 6385, + "train/total_loss": 0.12565100193023682 + }, + { + "entropy": 9.38093376159668, + "epoch": 0.6314020170061301, + "mean_token_accuracy": 0.7904929518699646, + "num_tokens": 12417271.0, + "step": 6386, + "train/ce_loss": 1.4010456652613357e-06 + }, + { + "epoch": 0.6314020170061301, + "step": 6386, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6314020170061301, + "step": 6386, + "train/total_loss": 0.04687514156103134 + }, + { + "entropy": 9.508302688598633, + "epoch": 0.6315008898556457, + "mean_token_accuracy": 0.7487603425979614, + "num_tokens": 12422339.0, + "step": 6387, + "train/ce_loss": 1.1079400777816772 + }, + { + "epoch": 0.6315008898556457, + "step": 6387, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6315008898556457, + "step": 6387, + "train/total_loss": 0.17329400777816772 + }, + { + "entropy": 9.090853691101074, + "epoch": 0.6315997627051612, + "mean_token_accuracy": 0.7806913256645203, + "num_tokens": 12427658.0, + "step": 6388, + "train/ce_loss": 1.1928500498470385e-06 + }, + { + "epoch": 0.6315997627051612, + "step": 6388, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6315997627051612, + "step": 6388, + "train/total_loss": 0.04687511920928955 + }, + { + "entropy": 9.914019584655762, + "epoch": 0.6316986355546766, + "mean_token_accuracy": 0.7288135886192322, + "num_tokens": 12432343.0, + "step": 6389, + "train/ce_loss": 1.8008298873901367 + }, + { + "epoch": 0.6316986355546766, + "step": 6389, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.6316986355546766, + "step": 6389, + "train/total_loss": 0.2660204768180847 + }, + { + "entropy": 9.699853897094727, + "epoch": 0.6317975084041922, + "mean_token_accuracy": 0.8121951222419739, + "num_tokens": 12437172.0, + "step": 6390, + "train/ce_loss": 1.049293875694275 + }, + { + "epoch": 0.6317975084041922, + "step": 6390, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.6317975084041922, + "step": 6390, + "train/total_loss": 0.12055438756942749 + }, + { + "entropy": 9.337457656860352, + "epoch": 0.6318963812537077, + "mean_token_accuracy": 0.6853766441345215, + "num_tokens": 12442307.0, + "step": 6391, + "train/ce_loss": 1.1118484735488892 + }, + { + "epoch": 0.6318963812537077, + "step": 6391, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6318963812537077, + "step": 6391, + "train/total_loss": 0.18149735033512115 + }, + { + "entropy": 9.267208099365234, + "epoch": 0.6319952541032232, + "mean_token_accuracy": 0.7442622780799866, + "num_tokens": 12447362.0, + "step": 6392, + "train/ce_loss": 1.1730303764343262 + }, + { + "epoch": 0.6319952541032232, + "step": 6392, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6319952541032232, + "step": 6392, + "train/total_loss": 0.1954280436038971 + }, + { + "entropy": 8.493053436279297, + "epoch": 0.6320941269527388, + "mean_token_accuracy": 0.8197908401489258, + "num_tokens": 12453064.0, + "step": 6393, + "train/ce_loss": 0.5338155627250671 + }, + { + "epoch": 0.6320941269527388, + "step": 6393, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.6320941269527388, + "step": 6393, + "train/total_loss": 0.08072531223297119 + }, + { + "entropy": 8.964149475097656, + "epoch": 0.6321929998022543, + "mean_token_accuracy": 0.692396342754364, + "num_tokens": 12458397.0, + "step": 6394, + "train/ce_loss": 0.34347906708717346 + }, + { + "epoch": 0.6321929998022543, + "step": 6394, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6321929998022543, + "step": 6394, + "train/total_loss": 0.06559790670871735 + }, + { + "entropy": 8.968805313110352, + "epoch": 0.6322918726517698, + "mean_token_accuracy": 0.7429577708244324, + "num_tokens": 12463717.0, + "step": 6395, + "train/ce_loss": 1.5259385108947754 + }, + { + "epoch": 0.6322918726517698, + "step": 6395, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6322918726517698, + "step": 6395, + "train/total_loss": 0.21900010108947754 + }, + { + "entropy": 9.250955581665039, + "epoch": 0.6323907455012854, + "mean_token_accuracy": 0.661556601524353, + "num_tokens": 12469048.0, + "step": 6396, + "train/ce_loss": 0.9907170534133911 + }, + { + "epoch": 0.6323907455012854, + "step": 6396, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6323907455012854, + "step": 6396, + "train/total_loss": 0.1420404613018036 + }, + { + "entropy": 9.317476272583008, + "epoch": 0.6324896183508009, + "mean_token_accuracy": 0.7774193286895752, + "num_tokens": 12474121.0, + "step": 6397, + "train/ce_loss": 1.778739147084707e-06 + }, + { + "epoch": 0.6324896183508009, + "step": 6397, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6324896183508009, + "step": 6397, + "train/total_loss": 0.042968928813934326 + }, + { + "entropy": 9.001506805419922, + "epoch": 0.6325884912003163, + "mean_token_accuracy": 0.7237196564674377, + "num_tokens": 12479371.0, + "step": 6398, + "train/ce_loss": 0.8848857879638672 + }, + { + "epoch": 0.6325884912003163, + "step": 6398, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.6325884912003163, + "step": 6398, + "train/total_loss": 0.17442607879638672 + }, + { + "entropy": 9.355804443359375, + "epoch": 0.632687364049832, + "mean_token_accuracy": 0.7774086594581604, + "num_tokens": 12484437.0, + "step": 6399, + "train/ce_loss": 0.8253676295280457 + }, + { + "epoch": 0.632687364049832, + "step": 6399, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.632687364049832, + "step": 6399, + "train/total_loss": 0.10597426444292068 + }, + { + "epoch": 0.6327862368993474, + "grad_norm": 0.7332557439804077, + "learning_rate": 8.420362953073235e-06, + "loss": 0.1347, + "step": 6400 + }, + { + "entropy": 8.898918151855469, + "epoch": 0.6327862368993474, + "mean_token_accuracy": 0.7534246444702148, + "num_tokens": 12489843.0, + "step": 6400, + "train/ce_loss": 1.0735923051834106 + }, + { + "epoch": 0.6327862368993474, + "step": 6400, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6327862368993474, + "step": 6400, + "train/total_loss": 0.16595298051834106 + }, + { + "entropy": 8.987698554992676, + "epoch": 0.6328851097488629, + "mean_token_accuracy": 0.732824444770813, + "num_tokens": 12495149.0, + "step": 6401, + "train/ce_loss": 1.0781718492507935 + }, + { + "epoch": 0.6328851097488629, + "step": 6401, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.6328851097488629, + "step": 6401, + "train/total_loss": 0.19375468790531158 + }, + { + "entropy": 9.062227249145508, + "epoch": 0.6329839825983785, + "mean_token_accuracy": 0.7556008100509644, + "num_tokens": 12500041.0, + "step": 6402, + "train/ce_loss": 1.5401432165162987e-06 + }, + { + "epoch": 0.6329839825983785, + "step": 6402, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6329839825983785, + "step": 6402, + "train/total_loss": 0.03515640273690224 + }, + { + "entropy": 9.318317413330078, + "epoch": 0.633082855447894, + "mean_token_accuracy": 0.6694560647010803, + "num_tokens": 12505169.0, + "step": 6403, + "train/ce_loss": 4.1232300418414525e-07 + }, + { + "epoch": 0.633082855447894, + "step": 6403, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.633082855447894, + "step": 6403, + "train/total_loss": 0.011718790978193283 + }, + { + "entropy": 8.892905235290527, + "epoch": 0.6331817282974095, + "mean_token_accuracy": 0.8035503029823303, + "num_tokens": 12510449.0, + "step": 6404, + "train/ce_loss": 0.6142004132270813 + }, + { + "epoch": 0.6331817282974095, + "step": 6404, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6331817282974095, + "step": 6404, + "train/total_loss": 0.1317325383424759 + }, + { + "entropy": 8.429710388183594, + "epoch": 0.6332806011469251, + "mean_token_accuracy": 0.7497621178627014, + "num_tokens": 12515875.0, + "step": 6405, + "train/ce_loss": 0.46298351883888245 + }, + { + "epoch": 0.6332806011469251, + "step": 6405, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6332806011469251, + "step": 6405, + "train/total_loss": 0.09707960486412048 + }, + { + "entropy": 9.63508129119873, + "epoch": 0.6333794739964406, + "mean_token_accuracy": 0.6765676736831665, + "num_tokens": 12520900.0, + "step": 6406, + "train/ce_loss": 1.3361752033233643 + }, + { + "epoch": 0.6333794739964406, + "step": 6406, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.6333794739964406, + "step": 6406, + "train/total_loss": 0.22736752033233643 + }, + { + "entropy": 8.758066177368164, + "epoch": 0.633478346845956, + "mean_token_accuracy": 0.7649572491645813, + "num_tokens": 12526352.0, + "step": 6407, + "train/ce_loss": 0.8186273574829102 + }, + { + "epoch": 0.633478346845956, + "step": 6407, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.633478346845956, + "step": 6407, + "train/total_loss": 0.17170649766921997 + }, + { + "entropy": 9.324136734008789, + "epoch": 0.6335772196954716, + "mean_token_accuracy": 0.7474600672721863, + "num_tokens": 12531502.0, + "step": 6408, + "train/ce_loss": 7.128099923647824e-07 + }, + { + "epoch": 0.6335772196954716, + "step": 6408, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.6335772196954716, + "step": 6408, + "train/total_loss": 0.02734382078051567 + }, + { + "entropy": 9.120576858520508, + "epoch": 0.6336760925449871, + "mean_token_accuracy": 0.7368420958518982, + "num_tokens": 12536623.0, + "step": 6409, + "train/ce_loss": 1.019844651222229 + }, + { + "epoch": 0.6336760925449871, + "step": 6409, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6336760925449871, + "step": 6409, + "train/total_loss": 0.15667197108268738 + }, + { + "entropy": 9.586650848388672, + "epoch": 0.6337749653945026, + "mean_token_accuracy": 0.7752212285995483, + "num_tokens": 12541606.0, + "step": 6410, + "train/ce_loss": 0.8787671327590942 + }, + { + "epoch": 0.6337749653945026, + "step": 6410, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6337749653945026, + "step": 6410, + "train/total_loss": 0.11131421476602554 + }, + { + "entropy": 9.47934341430664, + "epoch": 0.6338738382440182, + "mean_token_accuracy": 0.7429149746894836, + "num_tokens": 12546526.0, + "step": 6411, + "train/ce_loss": 1.7796356678009033 + }, + { + "epoch": 0.6338738382440182, + "step": 6411, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.6338738382440182, + "step": 6411, + "train/total_loss": 0.2795260548591614 + }, + { + "entropy": 9.546884536743164, + "epoch": 0.6339727110935337, + "mean_token_accuracy": 0.7328000068664551, + "num_tokens": 12551552.0, + "step": 6412, + "train/ce_loss": 1.4202111959457397 + }, + { + "epoch": 0.6339727110935337, + "step": 6412, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6339727110935337, + "step": 6412, + "train/total_loss": 0.16545861959457397 + }, + { + "entropy": 9.052999496459961, + "epoch": 0.6340715839430492, + "mean_token_accuracy": 0.7513020634651184, + "num_tokens": 12556784.0, + "step": 6413, + "train/ce_loss": 1.0188919305801392 + }, + { + "epoch": 0.6340715839430492, + "step": 6413, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.6340715839430492, + "step": 6413, + "train/total_loss": 0.17610794305801392 + }, + { + "entropy": 9.16131591796875, + "epoch": 0.6341704567925648, + "mean_token_accuracy": 0.7312312126159668, + "num_tokens": 12561880.0, + "step": 6414, + "train/ce_loss": 1.1434515714645386 + }, + { + "epoch": 0.6341704567925648, + "step": 6414, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6341704567925648, + "step": 6414, + "train/total_loss": 0.14559516310691833 + }, + { + "entropy": 8.769760131835938, + "epoch": 0.6342693296420803, + "mean_token_accuracy": 0.740818440914154, + "num_tokens": 12567279.0, + "step": 6415, + "train/ce_loss": 0.8456613421440125 + }, + { + "epoch": 0.6342693296420803, + "step": 6415, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6342693296420803, + "step": 6415, + "train/total_loss": 0.1470661461353302 + }, + { + "entropy": 8.74283218383789, + "epoch": 0.6343682024915958, + "mean_token_accuracy": 0.6560170650482178, + "num_tokens": 12572704.0, + "step": 6416, + "train/ce_loss": 0.8355754613876343 + }, + { + "epoch": 0.6343682024915958, + "step": 6416, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6343682024915958, + "step": 6416, + "train/total_loss": 0.12652629613876343 + }, + { + "entropy": 8.902482986450195, + "epoch": 0.6344670753411114, + "mean_token_accuracy": 0.7208765745162964, + "num_tokens": 12578091.0, + "step": 6417, + "train/ce_loss": 1.4132318496704102 + }, + { + "epoch": 0.6344670753411114, + "step": 6417, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6344670753411114, + "step": 6417, + "train/total_loss": 0.18819819390773773 + }, + { + "entropy": 9.10424518585205, + "epoch": 0.6345659481906268, + "mean_token_accuracy": 0.7798408269882202, + "num_tokens": 12583317.0, + "step": 6418, + "train/ce_loss": 2.373131792410277e-06 + }, + { + "epoch": 0.6345659481906268, + "step": 6418, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6345659481906268, + "step": 6418, + "train/total_loss": 0.0585939884185791 + }, + { + "entropy": 8.467370986938477, + "epoch": 0.6346648210401423, + "mean_token_accuracy": 0.7738446593284607, + "num_tokens": 12588812.0, + "step": 6419, + "train/ce_loss": 0.7450070977210999 + }, + { + "epoch": 0.6346648210401423, + "step": 6419, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6346648210401423, + "step": 6419, + "train/total_loss": 0.11746945977210999 + }, + { + "epoch": 0.6347636938896579, + "grad_norm": 0.7006480097770691, + "learning_rate": 8.415418088315285e-06, + "loss": 0.1424, + "step": 6420 + }, + { + "entropy": 8.973926544189453, + "epoch": 0.6347636938896579, + "mean_token_accuracy": 0.7991746664047241, + "num_tokens": 12594006.0, + "step": 6420, + "train/ce_loss": 0.6745920777320862 + }, + { + "epoch": 0.6347636938896579, + "step": 6420, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6347636938896579, + "step": 6420, + "train/total_loss": 0.10261546075344086 + }, + { + "entropy": 10.236392974853516, + "epoch": 0.6348625667391734, + "mean_token_accuracy": 0.6958333253860474, + "num_tokens": 12598612.0, + "step": 6421, + "train/ce_loss": 3.722998826560797e-06 + }, + { + "epoch": 0.6348625667391734, + "step": 6421, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.6348625667391734, + "step": 6421, + "train/total_loss": 0.08593787252902985 + }, + { + "entropy": 9.714025497436523, + "epoch": 0.6349614395886889, + "mean_token_accuracy": 0.7431906461715698, + "num_tokens": 12603525.0, + "step": 6422, + "train/ce_loss": 0.7071661353111267 + }, + { + "epoch": 0.6349614395886889, + "step": 6422, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6349614395886889, + "step": 6422, + "train/total_loss": 0.11759161204099655 + }, + { + "entropy": 8.769353866577148, + "epoch": 0.6350603124382045, + "mean_token_accuracy": 0.7455242872238159, + "num_tokens": 12608775.0, + "step": 6423, + "train/ce_loss": 0.9130332469940186 + }, + { + "epoch": 0.6350603124382045, + "step": 6423, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6350603124382045, + "step": 6423, + "train/total_loss": 0.13817831873893738 + }, + { + "entropy": 9.194217681884766, + "epoch": 0.63515918528772, + "mean_token_accuracy": 0.8032069802284241, + "num_tokens": 12613870.0, + "step": 6424, + "train/ce_loss": 0.7883514165878296 + }, + { + "epoch": 0.63515918528772, + "step": 6424, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.63515918528772, + "step": 6424, + "train/total_loss": 0.0944601446390152 + }, + { + "entropy": 8.776252746582031, + "epoch": 0.6352580581372356, + "mean_token_accuracy": 0.7296379804611206, + "num_tokens": 12619275.0, + "step": 6425, + "train/ce_loss": 0.7394523024559021 + }, + { + "epoch": 0.6352580581372356, + "step": 6425, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6352580581372356, + "step": 6425, + "train/total_loss": 0.13644522428512573 + }, + { + "entropy": 8.840118408203125, + "epoch": 0.6353569309867511, + "mean_token_accuracy": 0.6878109574317932, + "num_tokens": 12624502.0, + "step": 6426, + "train/ce_loss": 1.818405270576477 + }, + { + "epoch": 0.6353569309867511, + "step": 6426, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6353569309867511, + "step": 6426, + "train/total_loss": 0.24434052407741547 + }, + { + "entropy": 8.930981636047363, + "epoch": 0.6354558038362665, + "mean_token_accuracy": 0.7754654884338379, + "num_tokens": 12629847.0, + "step": 6427, + "train/ce_loss": 0.5735181570053101 + }, + { + "epoch": 0.6354558038362665, + "step": 6427, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6354558038362665, + "step": 6427, + "train/total_loss": 0.08078931272029877 + }, + { + "entropy": 9.04982852935791, + "epoch": 0.6355546766857821, + "mean_token_accuracy": 0.6829971075057983, + "num_tokens": 12635037.0, + "step": 6428, + "train/ce_loss": 0.8141086101531982 + }, + { + "epoch": 0.6355546766857821, + "step": 6428, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6355546766857821, + "step": 6428, + "train/total_loss": 0.15172335505485535 + }, + { + "entropy": 8.66606330871582, + "epoch": 0.6356535495352976, + "mean_token_accuracy": 0.7685631513595581, + "num_tokens": 12640547.0, + "step": 6429, + "train/ce_loss": 0.5794277787208557 + }, + { + "epoch": 0.6356535495352976, + "step": 6429, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.6356535495352976, + "step": 6429, + "train/total_loss": 0.08528652787208557 + }, + { + "entropy": 9.186049461364746, + "epoch": 0.6357524223848131, + "mean_token_accuracy": 0.7824561595916748, + "num_tokens": 12645563.0, + "step": 6430, + "train/ce_loss": 0.9713578224182129 + }, + { + "epoch": 0.6357524223848131, + "step": 6430, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6357524223848131, + "step": 6430, + "train/total_loss": 0.1479170322418213 + }, + { + "entropy": 9.148721694946289, + "epoch": 0.6358512952343287, + "mean_token_accuracy": 0.7407894730567932, + "num_tokens": 12650779.0, + "step": 6431, + "train/ce_loss": 1.3405790328979492 + }, + { + "epoch": 0.6358512952343287, + "step": 6431, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.6358512952343287, + "step": 6431, + "train/total_loss": 0.1614016592502594 + }, + { + "entropy": 8.77899169921875, + "epoch": 0.6359501680838442, + "mean_token_accuracy": 0.7346465587615967, + "num_tokens": 12656145.0, + "step": 6432, + "train/ce_loss": 0.7334739565849304 + }, + { + "epoch": 0.6359501680838442, + "step": 6432, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6359501680838442, + "step": 6432, + "train/total_loss": 0.11631614714860916 + }, + { + "entropy": 9.036822319030762, + "epoch": 0.6360490409333597, + "mean_token_accuracy": 0.7083870768547058, + "num_tokens": 12661438.0, + "step": 6433, + "train/ce_loss": 0.9512785077095032 + }, + { + "epoch": 0.6360490409333597, + "step": 6433, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.6360490409333597, + "step": 6433, + "train/total_loss": 0.18497160077095032 + }, + { + "entropy": 9.052738189697266, + "epoch": 0.6361479137828753, + "mean_token_accuracy": 0.7056604027748108, + "num_tokens": 12666695.0, + "step": 6434, + "train/ce_loss": 1.518971562385559 + }, + { + "epoch": 0.6361479137828753, + "step": 6434, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6361479137828753, + "step": 6434, + "train/total_loss": 0.21049091219902039 + }, + { + "entropy": 9.298844337463379, + "epoch": 0.6362467866323908, + "mean_token_accuracy": 0.7064846158027649, + "num_tokens": 12671751.0, + "step": 6435, + "train/ce_loss": 1.7070856301870663e-06 + }, + { + "epoch": 0.6362467866323908, + "step": 6435, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6362467866323908, + "step": 6435, + "train/total_loss": 0.07031267136335373 + }, + { + "entropy": 8.464192390441895, + "epoch": 0.6363456594819062, + "mean_token_accuracy": 0.7317351698875427, + "num_tokens": 12677078.0, + "step": 6436, + "train/ce_loss": 0.853071928024292 + }, + { + "epoch": 0.6363456594819062, + "step": 6436, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6363456594819062, + "step": 6436, + "train/total_loss": 0.10874469578266144 + }, + { + "entropy": 9.107930183410645, + "epoch": 0.6364445323314218, + "mean_token_accuracy": 0.7402032017707825, + "num_tokens": 12682158.0, + "step": 6437, + "train/ce_loss": 0.8887556791305542 + }, + { + "epoch": 0.6364445323314218, + "step": 6437, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6364445323314218, + "step": 6437, + "train/total_loss": 0.12012556940317154 + }, + { + "entropy": 8.584230422973633, + "epoch": 0.6365434051809373, + "mean_token_accuracy": 0.7628541588783264, + "num_tokens": 12687593.0, + "step": 6438, + "train/ce_loss": 0.7757463455200195 + }, + { + "epoch": 0.6365434051809373, + "step": 6438, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6365434051809373, + "step": 6438, + "train/total_loss": 0.11663713306188583 + }, + { + "entropy": 9.337575912475586, + "epoch": 0.6366422780304528, + "mean_token_accuracy": 0.7447916865348816, + "num_tokens": 12692596.0, + "step": 6439, + "train/ce_loss": 1.4264662265777588 + }, + { + "epoch": 0.6366422780304528, + "step": 6439, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6366422780304528, + "step": 6439, + "train/total_loss": 0.18561537563800812 + }, + { + "epoch": 0.6367411508799684, + "grad_norm": 0.7183775305747986, + "learning_rate": 8.410473223557336e-06, + "loss": 0.1363, + "step": 6440 + }, + { + "entropy": 10.048894882202148, + "epoch": 0.6367411508799684, + "mean_token_accuracy": 0.6693877577781677, + "num_tokens": 12697211.0, + "step": 6440, + "train/ce_loss": 2.288748646606109e-06 + }, + { + "epoch": 0.6367411508799684, + "step": 6440, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6367411508799684, + "step": 6440, + "train/total_loss": 0.023437729105353355 + }, + { + "entropy": 8.808197021484375, + "epoch": 0.6368400237294839, + "mean_token_accuracy": 0.7426966428756714, + "num_tokens": 12702526.0, + "step": 6441, + "train/ce_loss": 1.0223617553710938 + }, + { + "epoch": 0.6368400237294839, + "step": 6441, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6368400237294839, + "step": 6441, + "train/total_loss": 0.14520493149757385 + }, + { + "entropy": 8.888721466064453, + "epoch": 0.6369388965789994, + "mean_token_accuracy": 0.757656455039978, + "num_tokens": 12707726.0, + "step": 6442, + "train/ce_loss": 1.2178153991699219 + }, + { + "epoch": 0.6369388965789994, + "step": 6442, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.6369388965789994, + "step": 6442, + "train/total_loss": 0.20771904289722443 + }, + { + "entropy": 9.222532272338867, + "epoch": 0.637037769428515, + "mean_token_accuracy": 0.7290909290313721, + "num_tokens": 12712704.0, + "step": 6443, + "train/ce_loss": 1.4290187358856201 + }, + { + "epoch": 0.637037769428515, + "step": 6443, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.637037769428515, + "step": 6443, + "train/total_loss": 0.21321438252925873 + }, + { + "entropy": 9.138294219970703, + "epoch": 0.6371366422780305, + "mean_token_accuracy": 0.7117263674736023, + "num_tokens": 12717775.0, + "step": 6444, + "train/ce_loss": 9.109377856475476e-07 + }, + { + "epoch": 0.6371366422780305, + "step": 6444, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6371366422780305, + "step": 6444, + "train/total_loss": 0.03515633940696716 + }, + { + "entropy": 9.019227027893066, + "epoch": 0.637235515127546, + "mean_token_accuracy": 0.7219387888908386, + "num_tokens": 12723013.0, + "step": 6445, + "train/ce_loss": 0.775183379650116 + }, + { + "epoch": 0.637235515127546, + "step": 6445, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.637235515127546, + "step": 6445, + "train/total_loss": 0.17517459392547607 + }, + { + "entropy": 8.739466667175293, + "epoch": 0.6373343879770615, + "mean_token_accuracy": 0.776992917060852, + "num_tokens": 12728471.0, + "step": 6446, + "train/ce_loss": 0.9948819875717163 + }, + { + "epoch": 0.6373343879770615, + "step": 6446, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.6373343879770615, + "step": 6446, + "train/total_loss": 0.17370694875717163 + }, + { + "entropy": 9.282242774963379, + "epoch": 0.637433260826577, + "mean_token_accuracy": 0.7416173815727234, + "num_tokens": 12733394.0, + "step": 6447, + "train/ce_loss": 1.1608867645263672 + }, + { + "epoch": 0.637433260826577, + "step": 6447, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.637433260826577, + "step": 6447, + "train/total_loss": 0.15515118837356567 + }, + { + "entropy": 9.284667015075684, + "epoch": 0.6375321336760925, + "mean_token_accuracy": 0.7423999905586243, + "num_tokens": 12738428.0, + "step": 6448, + "train/ce_loss": 0.6794313192367554 + }, + { + "epoch": 0.6375321336760925, + "step": 6448, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6375321336760925, + "step": 6448, + "train/total_loss": 0.12263063341379166 + }, + { + "entropy": 8.700347900390625, + "epoch": 0.6376310065256081, + "mean_token_accuracy": 0.773950457572937, + "num_tokens": 12743853.0, + "step": 6449, + "train/ce_loss": 0.7626098990440369 + }, + { + "epoch": 0.6376310065256081, + "step": 6449, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.6376310065256081, + "step": 6449, + "train/total_loss": 0.1739172339439392 + }, + { + "entropy": 8.891304016113281, + "epoch": 0.6377298793751236, + "mean_token_accuracy": 0.7259615659713745, + "num_tokens": 12749120.0, + "step": 6450, + "train/ce_loss": 0.8201631307601929 + }, + { + "epoch": 0.6377298793751236, + "step": 6450, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6377298793751236, + "step": 6450, + "train/total_loss": 0.14451631903648376 + }, + { + "entropy": 9.129953384399414, + "epoch": 0.6378287522246391, + "mean_token_accuracy": 0.7582873106002808, + "num_tokens": 12754400.0, + "step": 6451, + "train/ce_loss": 9.450710081182478e-07 + }, + { + "epoch": 0.6378287522246391, + "step": 6451, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6378287522246391, + "step": 6451, + "train/total_loss": 0.05859384313225746 + }, + { + "entropy": 9.122050285339355, + "epoch": 0.6379276250741547, + "mean_token_accuracy": 0.759365975856781, + "num_tokens": 12759548.0, + "step": 6452, + "train/ce_loss": 1.265630841255188 + }, + { + "epoch": 0.6379276250741547, + "step": 6452, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6379276250741547, + "step": 6452, + "train/total_loss": 0.15781308710575104 + }, + { + "entropy": 8.692692756652832, + "epoch": 0.6380264979236702, + "mean_token_accuracy": 0.695652186870575, + "num_tokens": 12765035.0, + "step": 6453, + "train/ce_loss": 1.3813843727111816 + }, + { + "epoch": 0.6380264979236702, + "step": 6453, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6380264979236702, + "step": 6453, + "train/total_loss": 0.19673219323158264 + }, + { + "entropy": 9.10655689239502, + "epoch": 0.6381253707731857, + "mean_token_accuracy": 0.7168508172035217, + "num_tokens": 12770169.0, + "step": 6454, + "train/ce_loss": 1.2932935953140259 + }, + { + "epoch": 0.6381253707731857, + "step": 6454, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.6381253707731857, + "step": 6454, + "train/total_loss": 0.2035481184720993 + }, + { + "entropy": 9.16287612915039, + "epoch": 0.6382242436227012, + "mean_token_accuracy": 0.7213114500045776, + "num_tokens": 12775363.0, + "step": 6455, + "train/ce_loss": 0.8710024356842041 + }, + { + "epoch": 0.6382242436227012, + "step": 6455, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.6382242436227012, + "step": 6455, + "train/total_loss": 0.16913148760795593 + }, + { + "entropy": 8.762480735778809, + "epoch": 0.6383231164722167, + "mean_token_accuracy": 0.7190082669258118, + "num_tokens": 12780679.0, + "step": 6456, + "train/ce_loss": 1.3025904893875122 + }, + { + "epoch": 0.6383231164722167, + "step": 6456, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.6383231164722167, + "step": 6456, + "train/total_loss": 0.20447780191898346 + }, + { + "entropy": 8.96703815460205, + "epoch": 0.6384219893217322, + "mean_token_accuracy": 0.7301790118217468, + "num_tokens": 12785913.0, + "step": 6457, + "train/ce_loss": 0.31815260648727417 + }, + { + "epoch": 0.6384219893217322, + "step": 6457, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6384219893217322, + "step": 6457, + "train/total_loss": 0.08650276064872742 + }, + { + "entropy": 8.742262840270996, + "epoch": 0.6385208621712478, + "mean_token_accuracy": 0.7894201278686523, + "num_tokens": 12791331.0, + "step": 6458, + "train/ce_loss": 0.7575759291648865 + }, + { + "epoch": 0.6385208621712478, + "step": 6458, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6385208621712478, + "step": 6458, + "train/total_loss": 0.11872634291648865 + }, + { + "entropy": 9.20828628540039, + "epoch": 0.6386197350207633, + "mean_token_accuracy": 0.7433751821517944, + "num_tokens": 12796509.0, + "step": 6459, + "train/ce_loss": 0.7584218382835388 + }, + { + "epoch": 0.6386197350207633, + "step": 6459, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6386197350207633, + "step": 6459, + "train/total_loss": 0.14224843680858612 + }, + { + "epoch": 0.6387186078702788, + "grad_norm": 0.6138080358505249, + "learning_rate": 8.405528358799388e-06, + "loss": 0.1387, + "step": 6460 + }, + { + "entropy": 8.801475524902344, + "epoch": 0.6387186078702788, + "mean_token_accuracy": 0.7757009267807007, + "num_tokens": 12801787.0, + "step": 6460, + "train/ce_loss": 0.7655203938484192 + }, + { + "epoch": 0.6387186078702788, + "step": 6460, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6387186078702788, + "step": 6460, + "train/total_loss": 0.11952079087495804 + }, + { + "entropy": 9.13096809387207, + "epoch": 0.6388174807197944, + "mean_token_accuracy": 0.7729393243789673, + "num_tokens": 12806895.0, + "step": 6461, + "train/ce_loss": 0.6062048077583313 + }, + { + "epoch": 0.6388174807197944, + "step": 6461, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.6388174807197944, + "step": 6461, + "train/total_loss": 0.1543704867362976 + }, + { + "entropy": 9.962745666503906, + "epoch": 0.6389163535693099, + "mean_token_accuracy": 0.7582417726516724, + "num_tokens": 12811478.0, + "step": 6462, + "train/ce_loss": 3.825109888566658e-05 + }, + { + "epoch": 0.6389163535693099, + "step": 6462, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.6389163535693099, + "step": 6462, + "train/total_loss": 0.02734757587313652 + }, + { + "entropy": 9.132343292236328, + "epoch": 0.6390152264188254, + "mean_token_accuracy": 0.7365438938140869, + "num_tokens": 12816662.0, + "step": 6463, + "train/ce_loss": 1.3348407745361328 + }, + { + "epoch": 0.6390152264188254, + "step": 6463, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6390152264188254, + "step": 6463, + "train/total_loss": 0.21160908043384552 + }, + { + "entropy": 9.118932723999023, + "epoch": 0.639114099268341, + "mean_token_accuracy": 0.7469586133956909, + "num_tokens": 12821922.0, + "step": 6464, + "train/ce_loss": 1.0696030855178833 + }, + { + "epoch": 0.639114099268341, + "step": 6464, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.639114099268341, + "step": 6464, + "train/total_loss": 0.18508531153202057 + }, + { + "entropy": 8.88484001159668, + "epoch": 0.6392129721178564, + "mean_token_accuracy": 0.7077844142913818, + "num_tokens": 12827176.0, + "step": 6465, + "train/ce_loss": 0.5346351265907288 + }, + { + "epoch": 0.6392129721178564, + "step": 6465, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.6392129721178564, + "step": 6465, + "train/total_loss": 0.13549476861953735 + }, + { + "entropy": 9.236489295959473, + "epoch": 0.6393118449673719, + "mean_token_accuracy": 0.7913562059402466, + "num_tokens": 12832291.0, + "step": 6466, + "train/ce_loss": 0.8808301091194153 + }, + { + "epoch": 0.6393118449673719, + "step": 6466, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6393118449673719, + "step": 6466, + "train/total_loss": 0.13495801389217377 + }, + { + "entropy": 8.801457405090332, + "epoch": 0.6394107178168875, + "mean_token_accuracy": 0.7459839582443237, + "num_tokens": 12837734.0, + "step": 6467, + "train/ce_loss": 0.6924058794975281 + }, + { + "epoch": 0.6394107178168875, + "step": 6467, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6394107178168875, + "step": 6467, + "train/total_loss": 0.10830309242010117 + }, + { + "entropy": 8.825029373168945, + "epoch": 0.639509590666403, + "mean_token_accuracy": 0.724252462387085, + "num_tokens": 12843146.0, + "step": 6468, + "train/ce_loss": 9.435718766326318e-07 + }, + { + "epoch": 0.639509590666403, + "step": 6468, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.639509590666403, + "step": 6468, + "train/total_loss": 0.03125009313225746 + }, + { + "entropy": 8.679088592529297, + "epoch": 0.6396084635159185, + "mean_token_accuracy": 0.6833667159080505, + "num_tokens": 12848727.0, + "step": 6469, + "train/ce_loss": 0.9084466695785522 + }, + { + "epoch": 0.6396084635159185, + "step": 6469, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6396084635159185, + "step": 6469, + "train/total_loss": 0.15725091099739075 + }, + { + "entropy": 9.108712196350098, + "epoch": 0.6397073363654341, + "mean_token_accuracy": 0.7720403075218201, + "num_tokens": 12853991.0, + "step": 6470, + "train/ce_loss": 0.39327272772789 + }, + { + "epoch": 0.6397073363654341, + "step": 6470, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6397073363654341, + "step": 6470, + "train/total_loss": 0.05885852500796318 + }, + { + "entropy": 8.573925971984863, + "epoch": 0.6398062092149496, + "mean_token_accuracy": 0.7548240423202515, + "num_tokens": 12859513.0, + "step": 6471, + "train/ce_loss": 1.0028175115585327 + }, + { + "epoch": 0.6398062092149496, + "step": 6471, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6398062092149496, + "step": 6471, + "train/total_loss": 0.1393442451953888 + }, + { + "entropy": 9.224263191223145, + "epoch": 0.6399050820644651, + "mean_token_accuracy": 0.758400022983551, + "num_tokens": 12864631.0, + "step": 6472, + "train/ce_loss": 0.7310265898704529 + }, + { + "epoch": 0.6399050820644651, + "step": 6472, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6399050820644651, + "step": 6472, + "train/total_loss": 0.10825891047716141 + }, + { + "entropy": 9.070579528808594, + "epoch": 0.6400039549139807, + "mean_token_accuracy": 0.7385203838348389, + "num_tokens": 12869839.0, + "step": 6473, + "train/ce_loss": 1.044979214668274 + }, + { + "epoch": 0.6400039549139807, + "step": 6473, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6400039549139807, + "step": 6473, + "train/total_loss": 0.14356042444705963 + }, + { + "entropy": 8.707610130310059, + "epoch": 0.6401028277634961, + "mean_token_accuracy": 0.7283549904823303, + "num_tokens": 12875225.0, + "step": 6474, + "train/ce_loss": 0.8016955256462097 + }, + { + "epoch": 0.6401028277634961, + "step": 6474, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6401028277634961, + "step": 6474, + "train/total_loss": 0.15829455852508545 + }, + { + "entropy": 9.587264060974121, + "epoch": 0.6402017006130116, + "mean_token_accuracy": 0.7548746466636658, + "num_tokens": 12880039.0, + "step": 6475, + "train/ce_loss": 2.041363813987118e-06 + }, + { + "epoch": 0.6402017006130116, + "step": 6475, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6402017006130116, + "step": 6475, + "train/total_loss": 0.035156454890966415 + }, + { + "entropy": 9.856155395507812, + "epoch": 0.6403005734625272, + "mean_token_accuracy": 0.7080745100975037, + "num_tokens": 12884958.0, + "step": 6476, + "train/ce_loss": 0.956764817237854 + }, + { + "epoch": 0.6403005734625272, + "step": 6476, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6403005734625272, + "step": 6476, + "train/total_loss": 0.1620827317237854 + }, + { + "entropy": 9.147016525268555, + "epoch": 0.6403994463120427, + "mean_token_accuracy": 0.722482442855835, + "num_tokens": 12890305.0, + "step": 6477, + "train/ce_loss": 0.8601493239402771 + }, + { + "epoch": 0.6403994463120427, + "step": 6477, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6403994463120427, + "step": 6477, + "train/total_loss": 0.16413992643356323 + }, + { + "entropy": 9.434209823608398, + "epoch": 0.6404983191615582, + "mean_token_accuracy": 0.7606679201126099, + "num_tokens": 12895298.0, + "step": 6478, + "train/ce_loss": 0.7040955424308777 + }, + { + "epoch": 0.6404983191615582, + "step": 6478, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6404983191615582, + "step": 6478, + "train/total_loss": 0.10556580871343613 + }, + { + "entropy": 8.699009895324707, + "epoch": 0.6405971920110738, + "mean_token_accuracy": 0.7092288136482239, + "num_tokens": 12900625.0, + "step": 6479, + "train/ce_loss": 1.285062313079834 + }, + { + "epoch": 0.6405971920110738, + "step": 6479, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6405971920110738, + "step": 6479, + "train/total_loss": 0.18319372832775116 + }, + { + "epoch": 0.6406960648605893, + "grad_norm": 0.7683160901069641, + "learning_rate": 8.400583494041439e-06, + "loss": 0.1412, + "step": 6480 + }, + { + "entropy": 9.59773063659668, + "epoch": 0.6406960648605893, + "mean_token_accuracy": 0.724252462387085, + "num_tokens": 12905647.0, + "step": 6480, + "train/ce_loss": 0.8121297955513 + }, + { + "epoch": 0.6406960648605893, + "step": 6480, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6406960648605893, + "step": 6480, + "train/total_loss": 0.10465048253536224 + }, + { + "entropy": 8.975669860839844, + "epoch": 0.6407949377101048, + "mean_token_accuracy": 0.7036144733428955, + "num_tokens": 12910915.0, + "step": 6481, + "train/ce_loss": 1.2544474601745605 + }, + { + "epoch": 0.6407949377101048, + "step": 6481, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.6407949377101048, + "step": 6481, + "train/total_loss": 0.23481975495815277 + }, + { + "entropy": 9.157267570495605, + "epoch": 0.6408938105596204, + "mean_token_accuracy": 0.730708658695221, + "num_tokens": 12915981.0, + "step": 6482, + "train/ce_loss": 1.0704869031906128 + }, + { + "epoch": 0.6408938105596204, + "step": 6482, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6408938105596204, + "step": 6482, + "train/total_loss": 0.17736119031906128 + }, + { + "entropy": 9.217813491821289, + "epoch": 0.6409926834091358, + "mean_token_accuracy": 0.7742382287979126, + "num_tokens": 12921133.0, + "step": 6483, + "train/ce_loss": 0.6621120572090149 + }, + { + "epoch": 0.6409926834091358, + "step": 6483, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6409926834091358, + "step": 6483, + "train/total_loss": 0.10527370870113373 + }, + { + "entropy": 9.174644470214844, + "epoch": 0.6410915562586513, + "mean_token_accuracy": 0.7536231875419617, + "num_tokens": 12926156.0, + "step": 6484, + "train/ce_loss": 0.8040371537208557 + }, + { + "epoch": 0.6410915562586513, + "step": 6484, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6410915562586513, + "step": 6484, + "train/total_loss": 0.11555996537208557 + }, + { + "entropy": 9.635343551635742, + "epoch": 0.6411904291081669, + "mean_token_accuracy": 0.7782177925109863, + "num_tokens": 12931054.0, + "step": 6485, + "train/ce_loss": 9.885932286124444e-07 + }, + { + "epoch": 0.6411904291081669, + "step": 6485, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6411904291081669, + "step": 6485, + "train/total_loss": 0.01953134872019291 + }, + { + "entropy": 8.949685096740723, + "epoch": 0.6412893019576824, + "mean_token_accuracy": 0.7885952591896057, + "num_tokens": 12936237.0, + "step": 6486, + "train/ce_loss": 0.8853443264961243 + }, + { + "epoch": 0.6412893019576824, + "step": 6486, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.6412893019576824, + "step": 6486, + "train/total_loss": 0.16275319457054138 + }, + { + "entropy": 8.84429931640625, + "epoch": 0.6413881748071979, + "mean_token_accuracy": 0.7311320900917053, + "num_tokens": 12941576.0, + "step": 6487, + "train/ce_loss": 1.2528860569000244 + }, + { + "epoch": 0.6413881748071979, + "step": 6487, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6413881748071979, + "step": 6487, + "train/total_loss": 0.17606985569000244 + }, + { + "entropy": 9.086837768554688, + "epoch": 0.6414870476567135, + "mean_token_accuracy": 0.7229064106941223, + "num_tokens": 12946900.0, + "step": 6488, + "train/ce_loss": 0.6264514923095703 + }, + { + "epoch": 0.6414870476567135, + "step": 6488, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6414870476567135, + "step": 6488, + "train/total_loss": 0.12514515221118927 + }, + { + "entropy": 9.181316375732422, + "epoch": 0.641585920506229, + "mean_token_accuracy": 0.6954612135887146, + "num_tokens": 12952055.0, + "step": 6489, + "train/ce_loss": 1.2988181114196777 + }, + { + "epoch": 0.641585920506229, + "step": 6489, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.641585920506229, + "step": 6489, + "train/total_loss": 0.1494130641222 + }, + { + "entropy": 9.338220596313477, + "epoch": 0.6416847933557445, + "mean_token_accuracy": 0.7082683444023132, + "num_tokens": 12957049.0, + "step": 6490, + "train/ce_loss": 1.2148973941802979 + }, + { + "epoch": 0.6416847933557445, + "step": 6490, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6416847933557445, + "step": 6490, + "train/total_loss": 0.1722709834575653 + }, + { + "entropy": 9.005218505859375, + "epoch": 0.6417836662052601, + "mean_token_accuracy": 0.7276028990745544, + "num_tokens": 12962344.0, + "step": 6491, + "train/ce_loss": 0.8991209268569946 + }, + { + "epoch": 0.6417836662052601, + "step": 6491, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6417836662052601, + "step": 6491, + "train/total_loss": 0.14459958672523499 + }, + { + "entropy": 8.843526840209961, + "epoch": 0.6418825390547755, + "mean_token_accuracy": 0.7493403553962708, + "num_tokens": 12967605.0, + "step": 6492, + "train/ce_loss": 1.0386401414871216 + }, + { + "epoch": 0.6418825390547755, + "step": 6492, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6418825390547755, + "step": 6492, + "train/total_loss": 0.15855151414871216 + }, + { + "entropy": 9.558185577392578, + "epoch": 0.641981411904291, + "mean_token_accuracy": 0.7782177925109863, + "num_tokens": 12972513.0, + "step": 6493, + "train/ce_loss": 2.0642099380493164 + }, + { + "epoch": 0.641981411904291, + "step": 6493, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.641981411904291, + "step": 6493, + "train/total_loss": 0.28845226764678955 + }, + { + "entropy": 8.830387115478516, + "epoch": 0.6420802847538066, + "mean_token_accuracy": 0.7817258834838867, + "num_tokens": 12977975.0, + "step": 6494, + "train/ce_loss": 0.7758707404136658 + }, + { + "epoch": 0.6420802847538066, + "step": 6494, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.6420802847538066, + "step": 6494, + "train/total_loss": 0.1049308255314827 + }, + { + "entropy": 9.699666023254395, + "epoch": 0.6421791576033221, + "mean_token_accuracy": 0.7637795209884644, + "num_tokens": 12982785.0, + "step": 6495, + "train/ce_loss": 0.19067265093326569 + }, + { + "epoch": 0.6421791576033221, + "step": 6495, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6421791576033221, + "step": 6495, + "train/total_loss": 0.06984851509332657 + }, + { + "entropy": 8.90860366821289, + "epoch": 0.6422780304528376, + "mean_token_accuracy": 0.7694090604782104, + "num_tokens": 12988131.0, + "step": 6496, + "train/ce_loss": 0.4023219347000122 + }, + { + "epoch": 0.6422780304528376, + "step": 6496, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6422780304528376, + "step": 6496, + "train/total_loss": 0.07538844645023346 + }, + { + "entropy": 8.920339584350586, + "epoch": 0.6423769033023532, + "mean_token_accuracy": 0.7369077205657959, + "num_tokens": 12993367.0, + "step": 6497, + "train/ce_loss": 0.7874730229377747 + }, + { + "epoch": 0.6423769033023532, + "step": 6497, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6423769033023532, + "step": 6497, + "train/total_loss": 0.14515355229377747 + }, + { + "entropy": 9.975255966186523, + "epoch": 0.6424757761518687, + "mean_token_accuracy": 0.7456647157669067, + "num_tokens": 12998140.0, + "step": 6498, + "train/ce_loss": 0.11420520395040512 + }, + { + "epoch": 0.6424757761518687, + "step": 6498, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6424757761518687, + "step": 6498, + "train/total_loss": 0.07392051815986633 + }, + { + "entropy": 9.006590843200684, + "epoch": 0.6425746490013842, + "mean_token_accuracy": 0.6763224005699158, + "num_tokens": 13003335.0, + "step": 6499, + "train/ce_loss": 0.0605606734752655 + }, + { + "epoch": 0.6425746490013842, + "step": 6499, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6425746490013842, + "step": 6499, + "train/total_loss": 0.02949356660246849 + }, + { + "epoch": 0.6426735218508998, + "grad_norm": 0.769160270690918, + "learning_rate": 8.395638629283491e-06, + "loss": 0.1434, + "step": 6500 + }, + { + "entropy": 9.092881202697754, + "epoch": 0.6426735218508998, + "mean_token_accuracy": 0.7147335410118103, + "num_tokens": 13008415.0, + "step": 6500, + "train/ce_loss": 0.1218390017747879 + }, + { + "epoch": 0.6426735218508998, + "step": 6500, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6426735218508998, + "step": 6500, + "train/total_loss": 0.07468389719724655 + }, + { + "entropy": 9.512760162353516, + "epoch": 0.6427723947004153, + "mean_token_accuracy": 0.7199282050132751, + "num_tokens": 13013385.0, + "step": 6501, + "train/ce_loss": 0.9416092038154602 + }, + { + "epoch": 0.6427723947004153, + "step": 6501, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6427723947004153, + "step": 6501, + "train/total_loss": 0.15275466442108154 + }, + { + "entropy": 9.541532516479492, + "epoch": 0.6428712675499307, + "mean_token_accuracy": 0.722347617149353, + "num_tokens": 13018275.0, + "step": 6502, + "train/ce_loss": 0.13504788279533386 + }, + { + "epoch": 0.6428712675499307, + "step": 6502, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.6428712675499307, + "step": 6502, + "train/total_loss": 0.08772353827953339 + }, + { + "entropy": 9.089735984802246, + "epoch": 0.6429701403994463, + "mean_token_accuracy": 0.7394495606422424, + "num_tokens": 13023238.0, + "step": 6503, + "train/ce_loss": 0.05251012742519379 + }, + { + "epoch": 0.6429701403994463, + "step": 6503, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6429701403994463, + "step": 6503, + "train/total_loss": 0.02868851274251938 + }, + { + "entropy": 9.593996047973633, + "epoch": 0.6430690132489618, + "mean_token_accuracy": 0.7515789270401001, + "num_tokens": 13028148.0, + "step": 6504, + "train/ce_loss": 0.05433094874024391 + }, + { + "epoch": 0.6430690132489618, + "step": 6504, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.6430690132489618, + "step": 6504, + "train/total_loss": 0.02105809561908245 + }, + { + "entropy": 9.474388122558594, + "epoch": 0.6431678860984773, + "mean_token_accuracy": 0.7316561937332153, + "num_tokens": 13033014.0, + "step": 6505, + "train/ce_loss": 1.4684100151062012 + }, + { + "epoch": 0.6431678860984773, + "step": 6505, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.6431678860984773, + "step": 6505, + "train/total_loss": 0.23668475449085236 + }, + { + "entropy": 9.409302711486816, + "epoch": 0.6432667589479929, + "mean_token_accuracy": 0.7596490979194641, + "num_tokens": 13038017.0, + "step": 6506, + "train/ce_loss": 1.0788112878799438 + }, + { + "epoch": 0.6432667589479929, + "step": 6506, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6432667589479929, + "step": 6506, + "train/total_loss": 0.13913112878799438 + }, + { + "entropy": 9.000192642211914, + "epoch": 0.6433656317975084, + "mean_token_accuracy": 0.7247956395149231, + "num_tokens": 13043239.0, + "step": 6507, + "train/ce_loss": 0.7980868816375732 + }, + { + "epoch": 0.6433656317975084, + "step": 6507, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.6433656317975084, + "step": 6507, + "train/total_loss": 0.16183993220329285 + }, + { + "entropy": 9.12574577331543, + "epoch": 0.643464504647024, + "mean_token_accuracy": 0.7312101721763611, + "num_tokens": 13048482.0, + "step": 6508, + "train/ce_loss": 0.012120846658945084 + }, + { + "epoch": 0.643464504647024, + "step": 6508, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.643464504647024, + "step": 6508, + "train/total_loss": 0.01683708466589451 + }, + { + "entropy": 8.820637702941895, + "epoch": 0.6435633774965395, + "mean_token_accuracy": 0.7223684191703796, + "num_tokens": 13053694.0, + "step": 6509, + "train/ce_loss": 0.47832316160202026 + }, + { + "epoch": 0.6435633774965395, + "step": 6509, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6435633774965395, + "step": 6509, + "train/total_loss": 0.12595731019973755 + }, + { + "entropy": 8.384625434875488, + "epoch": 0.643662250346055, + "mean_token_accuracy": 0.732891857624054, + "num_tokens": 13059088.0, + "step": 6510, + "train/ce_loss": 0.9610019326210022 + }, + { + "epoch": 0.643662250346055, + "step": 6510, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.643662250346055, + "step": 6510, + "train/total_loss": 0.15860019624233246 + }, + { + "entropy": 9.12200927734375, + "epoch": 0.6437611231955706, + "mean_token_accuracy": 0.7317554354667664, + "num_tokens": 13064041.0, + "step": 6511, + "train/ce_loss": 1.9280868768692017 + }, + { + "epoch": 0.6437611231955706, + "step": 6511, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6437611231955706, + "step": 6511, + "train/total_loss": 0.24358993768692017 + }, + { + "entropy": 8.986196517944336, + "epoch": 0.643859996045086, + "mean_token_accuracy": 0.8145454525947571, + "num_tokens": 13069315.0, + "step": 6512, + "train/ce_loss": 8.834888285491616e-05 + }, + { + "epoch": 0.643859996045086, + "step": 6512, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.643859996045086, + "step": 6512, + "train/total_loss": 0.015633834525942802 + }, + { + "entropy": 9.146592140197754, + "epoch": 0.6439588688946015, + "mean_token_accuracy": 0.7810107469558716, + "num_tokens": 13074388.0, + "step": 6513, + "train/ce_loss": 1.715042233467102 + }, + { + "epoch": 0.6439588688946015, + "step": 6513, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6439588688946015, + "step": 6513, + "train/total_loss": 0.23400422930717468 + }, + { + "entropy": 8.818115234375, + "epoch": 0.6440577417441171, + "mean_token_accuracy": 0.7394958138465881, + "num_tokens": 13079425.0, + "step": 6514, + "train/ce_loss": 1.3264988660812378 + }, + { + "epoch": 0.6440577417441171, + "step": 6514, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6440577417441171, + "step": 6514, + "train/total_loss": 0.20296238362789154 + }, + { + "entropy": 9.274868965148926, + "epoch": 0.6441566145936326, + "mean_token_accuracy": 0.703832745552063, + "num_tokens": 13084444.0, + "step": 6515, + "train/ce_loss": 1.360609769821167 + }, + { + "epoch": 0.6441566145936326, + "step": 6515, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.6441566145936326, + "step": 6515, + "train/total_loss": 0.21809223294258118 + }, + { + "entropy": 9.169843673706055, + "epoch": 0.6442554874431481, + "mean_token_accuracy": 0.717423141002655, + "num_tokens": 13089704.0, + "step": 6516, + "train/ce_loss": 1.4866522178635933e-05 + }, + { + "epoch": 0.6442554874431481, + "step": 6516, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6442554874431481, + "step": 6516, + "train/total_loss": 0.039063986390829086 + }, + { + "entropy": 8.721879005432129, + "epoch": 0.6443543602926637, + "mean_token_accuracy": 0.7563510537147522, + "num_tokens": 13095174.0, + "step": 6517, + "train/ce_loss": 1.2111377716064453 + }, + { + "epoch": 0.6443543602926637, + "step": 6517, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6443543602926637, + "step": 6517, + "train/total_loss": 0.16798877716064453 + }, + { + "entropy": 8.488058090209961, + "epoch": 0.6444532331421792, + "mean_token_accuracy": 0.7695473432540894, + "num_tokens": 13100639.0, + "step": 6518, + "train/ce_loss": 0.6677665710449219 + }, + { + "epoch": 0.6444532331421792, + "step": 6518, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6444532331421792, + "step": 6518, + "train/total_loss": 0.08630790561437607 + }, + { + "entropy": 9.38595199584961, + "epoch": 0.6445521059916947, + "mean_token_accuracy": 0.709756076335907, + "num_tokens": 13105459.0, + "step": 6519, + "train/ce_loss": 1.2585313320159912 + }, + { + "epoch": 0.6445521059916947, + "step": 6519, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6445521059916947, + "step": 6519, + "train/total_loss": 0.17663438618183136 + }, + { + "epoch": 0.6446509788412103, + "grad_norm": 0.8845223784446716, + "learning_rate": 8.39069376452554e-06, + "loss": 0.1413, + "step": 6520 + }, + { + "entropy": 8.890151023864746, + "epoch": 0.6446509788412103, + "mean_token_accuracy": 0.7569974660873413, + "num_tokens": 13110715.0, + "step": 6520, + "train/ce_loss": 0.5002956390380859 + }, + { + "epoch": 0.6446509788412103, + "step": 6520, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6446509788412103, + "step": 6520, + "train/total_loss": 0.09690456092357635 + }, + { + "entropy": 9.051708221435547, + "epoch": 0.6447498516907257, + "mean_token_accuracy": 0.7416666746139526, + "num_tokens": 13115781.0, + "step": 6521, + "train/ce_loss": 1.3550637959269807e-05 + }, + { + "epoch": 0.6447498516907257, + "step": 6521, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.6447498516907257, + "step": 6521, + "train/total_loss": 0.09375135600566864 + }, + { + "entropy": 8.863210678100586, + "epoch": 0.6448487245402412, + "mean_token_accuracy": 0.8128272294998169, + "num_tokens": 13121016.0, + "step": 6522, + "train/ce_loss": 0.3710916340351105 + }, + { + "epoch": 0.6448487245402412, + "step": 6522, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6448487245402412, + "step": 6522, + "train/total_loss": 0.06835916638374329 + }, + { + "entropy": 8.455900192260742, + "epoch": 0.6449475973897568, + "mean_token_accuracy": 0.7523809671401978, + "num_tokens": 13126344.0, + "step": 6523, + "train/ce_loss": 1.0311092138290405 + }, + { + "epoch": 0.6449475973897568, + "step": 6523, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.6449475973897568, + "step": 6523, + "train/total_loss": 0.2007671743631363 + }, + { + "entropy": 8.69780445098877, + "epoch": 0.6450464702392723, + "mean_token_accuracy": 0.7423469424247742, + "num_tokens": 13131605.0, + "step": 6524, + "train/ce_loss": 1.0256714820861816 + }, + { + "epoch": 0.6450464702392723, + "step": 6524, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6450464702392723, + "step": 6524, + "train/total_loss": 0.1650671511888504 + }, + { + "entropy": 9.350138664245605, + "epoch": 0.6451453430887878, + "mean_token_accuracy": 0.718654453754425, + "num_tokens": 13136683.0, + "step": 6525, + "train/ce_loss": 1.7895563840866089 + }, + { + "epoch": 0.6451453430887878, + "step": 6525, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6451453430887878, + "step": 6525, + "train/total_loss": 0.24926814436912537 + }, + { + "entropy": 9.344786643981934, + "epoch": 0.6452442159383034, + "mean_token_accuracy": 0.6934046149253845, + "num_tokens": 13141698.0, + "step": 6526, + "train/ce_loss": 1.5403414964675903 + }, + { + "epoch": 0.6452442159383034, + "step": 6526, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6452442159383034, + "step": 6526, + "train/total_loss": 0.22044040262699127 + }, + { + "entropy": 9.177766799926758, + "epoch": 0.6453430887878189, + "mean_token_accuracy": 0.8082901835441589, + "num_tokens": 13146733.0, + "step": 6527, + "train/ce_loss": 6.455883067246759e-06 + }, + { + "epoch": 0.6453430887878189, + "step": 6527, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.6453430887878189, + "step": 6527, + "train/total_loss": 0.015625646337866783 + }, + { + "entropy": 9.098952293395996, + "epoch": 0.6454419616373344, + "mean_token_accuracy": 0.7235772609710693, + "num_tokens": 13151751.0, + "step": 6528, + "train/ce_loss": 4.270254066796042e-06 + }, + { + "epoch": 0.6454419616373344, + "step": 6528, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6454419616373344, + "step": 6528, + "train/total_loss": 0.06640667468309402 + }, + { + "entropy": 8.696406364440918, + "epoch": 0.64554083448685, + "mean_token_accuracy": 0.7783251404762268, + "num_tokens": 13157039.0, + "step": 6529, + "train/ce_loss": 0.7242198586463928 + }, + { + "epoch": 0.64554083448685, + "step": 6529, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.64554083448685, + "step": 6529, + "train/total_loss": 0.12710949778556824 + }, + { + "entropy": 8.717599868774414, + "epoch": 0.6456397073363654, + "mean_token_accuracy": 0.7016215920448303, + "num_tokens": 13162397.0, + "step": 6530, + "train/ce_loss": 0.6972009539604187 + }, + { + "epoch": 0.6456397073363654, + "step": 6530, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6456397073363654, + "step": 6530, + "train/total_loss": 0.09315759688615799 + }, + { + "entropy": 8.455299377441406, + "epoch": 0.6457385801858809, + "mean_token_accuracy": 0.7663461565971375, + "num_tokens": 13167951.0, + "step": 6531, + "train/ce_loss": 0.9205726981163025 + }, + { + "epoch": 0.6457385801858809, + "step": 6531, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6457385801858809, + "step": 6531, + "train/total_loss": 0.1545572727918625 + }, + { + "entropy": 8.538186073303223, + "epoch": 0.6458374530353965, + "mean_token_accuracy": 0.7246073484420776, + "num_tokens": 13173394.0, + "step": 6532, + "train/ce_loss": 1.1803375482559204 + }, + { + "epoch": 0.6458374530353965, + "step": 6532, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6458374530353965, + "step": 6532, + "train/total_loss": 0.180533766746521 + }, + { + "entropy": 8.439651489257812, + "epoch": 0.645936325884912, + "mean_token_accuracy": 0.7225647568702698, + "num_tokens": 13178673.0, + "step": 6533, + "train/ce_loss": 0.6351827383041382 + }, + { + "epoch": 0.645936325884912, + "step": 6533, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.645936325884912, + "step": 6533, + "train/total_loss": 0.14164328575134277 + }, + { + "entropy": 8.515107154846191, + "epoch": 0.6460351987344275, + "mean_token_accuracy": 0.7172839641571045, + "num_tokens": 13183946.0, + "step": 6534, + "train/ce_loss": 0.9283249974250793 + }, + { + "epoch": 0.6460351987344275, + "step": 6534, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6460351987344275, + "step": 6534, + "train/total_loss": 0.1514262557029724 + }, + { + "entropy": 9.123884201049805, + "epoch": 0.6461340715839431, + "mean_token_accuracy": 0.6688311696052551, + "num_tokens": 13189038.0, + "step": 6535, + "train/ce_loss": 1.2555750608444214 + }, + { + "epoch": 0.6461340715839431, + "step": 6535, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6461340715839431, + "step": 6535, + "train/total_loss": 0.17243251204490662 + }, + { + "entropy": 8.950576782226562, + "epoch": 0.6462329444334586, + "mean_token_accuracy": 0.7177305221557617, + "num_tokens": 13194225.0, + "step": 6536, + "train/ce_loss": 1.3022525310516357 + }, + { + "epoch": 0.6462329444334586, + "step": 6536, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.6462329444334586, + "step": 6536, + "train/total_loss": 0.2161627560853958 + }, + { + "entropy": 8.180878639221191, + "epoch": 0.6463318172829741, + "mean_token_accuracy": 0.790450930595398, + "num_tokens": 13199834.0, + "step": 6537, + "train/ce_loss": 0.5404284000396729 + }, + { + "epoch": 0.6463318172829741, + "step": 6537, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.6463318172829741, + "step": 6537, + "train/total_loss": 0.06966784596443176 + }, + { + "entropy": 8.464290618896484, + "epoch": 0.6464306901324897, + "mean_token_accuracy": 0.756424605846405, + "num_tokens": 13205184.0, + "step": 6538, + "train/ce_loss": 1.1266599893569946 + }, + { + "epoch": 0.6464306901324897, + "step": 6538, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.6464306901324897, + "step": 6538, + "train/total_loss": 0.19860351085662842 + }, + { + "entropy": 9.445873260498047, + "epoch": 0.6465295629820051, + "mean_token_accuracy": 0.8164557218551636, + "num_tokens": 13210030.0, + "step": 6539, + "train/ce_loss": 1.4183518886566162 + }, + { + "epoch": 0.6465295629820051, + "step": 6539, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6465295629820051, + "step": 6539, + "train/total_loss": 0.16136644780635834 + }, + { + "epoch": 0.6466284358315206, + "grad_norm": 0.6027600765228271, + "learning_rate": 8.385748899767592e-06, + "loss": 0.1377, + "step": 6540 + }, + { + "entropy": 8.796388626098633, + "epoch": 0.6466284358315206, + "mean_token_accuracy": 0.7493261694908142, + "num_tokens": 13215144.0, + "step": 6540, + "train/ce_loss": 1.2235296964645386 + }, + { + "epoch": 0.6466284358315206, + "step": 6540, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6466284358315206, + "step": 6540, + "train/total_loss": 0.2004779726266861 + }, + { + "entropy": 9.158843994140625, + "epoch": 0.6467273086810362, + "mean_token_accuracy": 0.7662835121154785, + "num_tokens": 13220099.0, + "step": 6541, + "train/ce_loss": 0.7735692262649536 + }, + { + "epoch": 0.6467273086810362, + "step": 6541, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.6467273086810362, + "step": 6541, + "train/total_loss": 0.16329443454742432 + }, + { + "entropy": 8.371902465820312, + "epoch": 0.6468261815305517, + "mean_token_accuracy": 0.6707921028137207, + "num_tokens": 13225390.0, + "step": 6542, + "train/ce_loss": 0.6213967204093933 + }, + { + "epoch": 0.6468261815305517, + "step": 6542, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6468261815305517, + "step": 6542, + "train/total_loss": 0.12463967502117157 + }, + { + "entropy": 9.107194900512695, + "epoch": 0.6469250543800672, + "mean_token_accuracy": 0.7919161915779114, + "num_tokens": 13230499.0, + "step": 6543, + "train/ce_loss": 1.2945265769958496 + }, + { + "epoch": 0.6469250543800672, + "step": 6543, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.6469250543800672, + "step": 6543, + "train/total_loss": 0.2153901606798172 + }, + { + "entropy": 8.621914863586426, + "epoch": 0.6470239272295828, + "mean_token_accuracy": 0.7265536785125732, + "num_tokens": 13235845.0, + "step": 6544, + "train/ce_loss": 0.813605546951294 + }, + { + "epoch": 0.6470239272295828, + "step": 6544, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6470239272295828, + "step": 6544, + "train/total_loss": 0.11261055618524551 + }, + { + "entropy": 8.217103958129883, + "epoch": 0.6471228000790983, + "mean_token_accuracy": 0.6928229928016663, + "num_tokens": 13241403.0, + "step": 6545, + "train/ce_loss": 1.2843297719955444 + }, + { + "epoch": 0.6471228000790983, + "step": 6545, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6471228000790983, + "step": 6545, + "train/total_loss": 0.2065579742193222 + }, + { + "entropy": 8.447120666503906, + "epoch": 0.6472216729286138, + "mean_token_accuracy": 0.8052356243133545, + "num_tokens": 13246812.0, + "step": 6546, + "train/ce_loss": 0.758453905582428 + }, + { + "epoch": 0.6472216729286138, + "step": 6546, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6472216729286138, + "step": 6546, + "train/total_loss": 0.1422516405582428 + }, + { + "entropy": 8.966032981872559, + "epoch": 0.6473205457781294, + "mean_token_accuracy": 0.6984333992004395, + "num_tokens": 13252015.0, + "step": 6547, + "train/ce_loss": 0.8154184818267822 + }, + { + "epoch": 0.6473205457781294, + "step": 6547, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6473205457781294, + "step": 6547, + "train/total_loss": 0.13622935116291046 + }, + { + "entropy": 8.51400375366211, + "epoch": 0.6474194186276448, + "mean_token_accuracy": 0.7518636584281921, + "num_tokens": 13257630.0, + "step": 6548, + "train/ce_loss": 0.7422522306442261 + }, + { + "epoch": 0.6474194186276448, + "step": 6548, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.6474194186276448, + "step": 6548, + "train/total_loss": 0.16406896710395813 + }, + { + "entropy": 9.053725242614746, + "epoch": 0.6475182914771603, + "mean_token_accuracy": 0.7325383424758911, + "num_tokens": 13262667.0, + "step": 6549, + "train/ce_loss": 0.7162719964981079 + }, + { + "epoch": 0.6475182914771603, + "step": 6549, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6475182914771603, + "step": 6549, + "train/total_loss": 0.11068969964981079 + }, + { + "entropy": 8.683499336242676, + "epoch": 0.6476171643266759, + "mean_token_accuracy": 0.8201342225074768, + "num_tokens": 13267911.0, + "step": 6550, + "train/ce_loss": 0.8026498556137085 + }, + { + "epoch": 0.6476171643266759, + "step": 6550, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6476171643266759, + "step": 6550, + "train/total_loss": 0.12713998556137085 + }, + { + "entropy": 8.909549713134766, + "epoch": 0.6477160371761914, + "mean_token_accuracy": 0.8012422323226929, + "num_tokens": 13273067.0, + "step": 6551, + "train/ce_loss": 0.8117330074310303 + }, + { + "epoch": 0.6477160371761914, + "step": 6551, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.6477160371761914, + "step": 6551, + "train/total_loss": 0.09679830074310303 + }, + { + "entropy": 8.517176628112793, + "epoch": 0.6478149100257069, + "mean_token_accuracy": 0.7692307829856873, + "num_tokens": 13278501.0, + "step": 6552, + "train/ce_loss": 0.9131290316581726 + }, + { + "epoch": 0.6478149100257069, + "step": 6552, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.6478149100257069, + "step": 6552, + "train/total_loss": 0.23193791508674622 + }, + { + "entropy": 8.51297378540039, + "epoch": 0.6479137828752225, + "mean_token_accuracy": 0.7438370585441589, + "num_tokens": 13284024.0, + "step": 6553, + "train/ce_loss": 0.7460567355155945 + }, + { + "epoch": 0.6479137828752225, + "step": 6553, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6479137828752225, + "step": 6553, + "train/total_loss": 0.12929317355155945 + }, + { + "entropy": 8.837274551391602, + "epoch": 0.648012655724738, + "mean_token_accuracy": 0.7346938848495483, + "num_tokens": 13289135.0, + "step": 6554, + "train/ce_loss": 1.1686605215072632 + }, + { + "epoch": 0.648012655724738, + "step": 6554, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.648012655724738, + "step": 6554, + "train/total_loss": 0.18717855215072632 + }, + { + "entropy": 8.470785140991211, + "epoch": 0.6481115285742535, + "mean_token_accuracy": 0.7346723079681396, + "num_tokens": 13294540.0, + "step": 6555, + "train/ce_loss": 1.2083443403244019 + }, + { + "epoch": 0.6481115285742535, + "step": 6555, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6481115285742535, + "step": 6555, + "train/total_loss": 0.15989693999290466 + }, + { + "entropy": 8.662359237670898, + "epoch": 0.6482104014237691, + "mean_token_accuracy": 0.6998841166496277, + "num_tokens": 13299849.0, + "step": 6556, + "train/ce_loss": 0.5744127035140991 + }, + { + "epoch": 0.6482104014237691, + "step": 6556, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6482104014237691, + "step": 6556, + "train/total_loss": 0.11994127184152603 + }, + { + "entropy": 8.591453552246094, + "epoch": 0.6483092742732846, + "mean_token_accuracy": 0.7582417726516724, + "num_tokens": 13305145.0, + "step": 6557, + "train/ce_loss": 0.6765773892402649 + }, + { + "epoch": 0.6483092742732846, + "step": 6557, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6483092742732846, + "step": 6557, + "train/total_loss": 0.09890773892402649 + }, + { + "entropy": 8.676153182983398, + "epoch": 0.6484081471228, + "mean_token_accuracy": 0.7421965599060059, + "num_tokens": 13310466.0, + "step": 6558, + "train/ce_loss": 0.8314873576164246 + }, + { + "epoch": 0.6484081471228, + "step": 6558, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6484081471228, + "step": 6558, + "train/total_loss": 0.1456487476825714 + }, + { + "entropy": 9.11447525024414, + "epoch": 0.6485070199723156, + "mean_token_accuracy": 0.7424749135971069, + "num_tokens": 13315514.0, + "step": 6559, + "train/ce_loss": 2.266011279061786e-06 + }, + { + "epoch": 0.6485070199723156, + "step": 6559, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6485070199723156, + "step": 6559, + "train/total_loss": 0.031250227242708206 + }, + { + "epoch": 0.6486058928218311, + "grad_norm": 0.6868183016777039, + "learning_rate": 8.380804035009642e-06, + "loss": 0.1334, + "step": 6560 + }, + { + "entropy": 9.356553077697754, + "epoch": 0.6486058928218311, + "mean_token_accuracy": 0.747706413269043, + "num_tokens": 13320406.0, + "step": 6560, + "train/ce_loss": 1.664231300354004 + }, + { + "epoch": 0.6486058928218311, + "step": 6560, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6486058928218311, + "step": 6560, + "train/total_loss": 0.23282937705516815 + }, + { + "entropy": 9.0035400390625, + "epoch": 0.6487047656713466, + "mean_token_accuracy": 0.7130919098854065, + "num_tokens": 13325561.0, + "step": 6561, + "train/ce_loss": 1.0942925214767456 + }, + { + "epoch": 0.6487047656713466, + "step": 6561, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6487047656713466, + "step": 6561, + "train/total_loss": 0.1406792551279068 + }, + { + "entropy": 8.760725021362305, + "epoch": 0.6488036385208622, + "mean_token_accuracy": 0.7328671216964722, + "num_tokens": 13330740.0, + "step": 6562, + "train/ce_loss": 1.063631296157837 + }, + { + "epoch": 0.6488036385208622, + "step": 6562, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.6488036385208622, + "step": 6562, + "train/total_loss": 0.18839438259601593 + }, + { + "entropy": 8.661602020263672, + "epoch": 0.6489025113703777, + "mean_token_accuracy": 0.7060975432395935, + "num_tokens": 13336050.0, + "step": 6563, + "train/ce_loss": 1.7457458972930908 + }, + { + "epoch": 0.6489025113703777, + "step": 6563, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6489025113703777, + "step": 6563, + "train/total_loss": 0.2175433486700058 + }, + { + "entropy": 8.36601448059082, + "epoch": 0.6490013842198932, + "mean_token_accuracy": 0.6859323978424072, + "num_tokens": 13341495.0, + "step": 6564, + "train/ce_loss": 0.8401528596878052 + }, + { + "epoch": 0.6490013842198932, + "step": 6564, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6490013842198932, + "step": 6564, + "train/total_loss": 0.16214028000831604 + }, + { + "entropy": 8.529932022094727, + "epoch": 0.6491002570694088, + "mean_token_accuracy": 0.7721354365348816, + "num_tokens": 13346721.0, + "step": 6565, + "train/ce_loss": 0.5375370383262634 + }, + { + "epoch": 0.6491002570694088, + "step": 6565, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6491002570694088, + "step": 6565, + "train/total_loss": 0.10844120383262634 + }, + { + "entropy": 8.754827499389648, + "epoch": 0.6491991299189243, + "mean_token_accuracy": 0.6712141633033752, + "num_tokens": 13351907.0, + "step": 6566, + "train/ce_loss": 1.1823458671569824 + }, + { + "epoch": 0.6491991299189243, + "step": 6566, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6491991299189243, + "step": 6566, + "train/total_loss": 0.18073458969593048 + }, + { + "entropy": 9.000566482543945, + "epoch": 0.6492980027684397, + "mean_token_accuracy": 0.7286096215248108, + "num_tokens": 13357104.0, + "step": 6567, + "train/ce_loss": 0.52986079454422 + }, + { + "epoch": 0.6492980027684397, + "step": 6567, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6492980027684397, + "step": 6567, + "train/total_loss": 0.10376733541488647 + }, + { + "entropy": 8.475728034973145, + "epoch": 0.6493968756179553, + "mean_token_accuracy": 0.7357001900672913, + "num_tokens": 13362637.0, + "step": 6568, + "train/ce_loss": 0.8425010442733765 + }, + { + "epoch": 0.6493968756179553, + "step": 6568, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.6493968756179553, + "step": 6568, + "train/total_loss": 0.17018760740756989 + }, + { + "entropy": 8.903318405151367, + "epoch": 0.6494957484674708, + "mean_token_accuracy": 0.7919161915779114, + "num_tokens": 13367755.0, + "step": 6569, + "train/ce_loss": 1.1492321618788992e-06 + }, + { + "epoch": 0.6494957484674708, + "step": 6569, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6494957484674708, + "step": 6569, + "train/total_loss": 0.03125011548399925 + }, + { + "entropy": 8.71034049987793, + "epoch": 0.6495946213169863, + "mean_token_accuracy": 0.8113924264907837, + "num_tokens": 13373034.0, + "step": 6570, + "train/ce_loss": 0.5691157579421997 + }, + { + "epoch": 0.6495946213169863, + "step": 6570, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6495946213169863, + "step": 6570, + "train/total_loss": 0.08816157281398773 + }, + { + "entropy": 9.267087936401367, + "epoch": 0.6496934941665019, + "mean_token_accuracy": 0.7537091970443726, + "num_tokens": 13377814.0, + "step": 6571, + "train/ce_loss": 5.840172434545821e-06 + }, + { + "epoch": 0.6496934941665019, + "step": 6571, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6496934941665019, + "step": 6571, + "train/total_loss": 0.02343808487057686 + }, + { + "entropy": 8.709084510803223, + "epoch": 0.6497923670160174, + "mean_token_accuracy": 0.7563804984092712, + "num_tokens": 13383181.0, + "step": 6572, + "train/ce_loss": 0.5205847024917603 + }, + { + "epoch": 0.6497923670160174, + "step": 6572, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6497923670160174, + "step": 6572, + "train/total_loss": 0.08721472322940826 + }, + { + "entropy": 8.620328903198242, + "epoch": 0.6498912398655329, + "mean_token_accuracy": 0.6743383407592773, + "num_tokens": 13388552.0, + "step": 6573, + "train/ce_loss": 1.2111564874649048 + }, + { + "epoch": 0.6498912398655329, + "step": 6573, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6498912398655329, + "step": 6573, + "train/total_loss": 0.19142815470695496 + }, + { + "entropy": 8.471712112426758, + "epoch": 0.6499901127150485, + "mean_token_accuracy": 0.6955530047416687, + "num_tokens": 13393887.0, + "step": 6574, + "train/ce_loss": 0.9428759813308716 + }, + { + "epoch": 0.6499901127150485, + "step": 6574, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6499901127150485, + "step": 6574, + "train/total_loss": 0.16069385409355164 + }, + { + "entropy": 8.578201293945312, + "epoch": 0.650088985564564, + "mean_token_accuracy": 0.7132075428962708, + "num_tokens": 13399164.0, + "step": 6575, + "train/ce_loss": 0.6358181834220886 + }, + { + "epoch": 0.650088985564564, + "step": 6575, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.650088985564564, + "step": 6575, + "train/total_loss": 0.14561307430267334 + }, + { + "entropy": 8.559331893920898, + "epoch": 0.6501878584140794, + "mean_token_accuracy": 0.7372340559959412, + "num_tokens": 13404543.0, + "step": 6576, + "train/ce_loss": 1.044264793395996 + }, + { + "epoch": 0.6501878584140794, + "step": 6576, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6501878584140794, + "step": 6576, + "train/total_loss": 0.14739522337913513 + }, + { + "entropy": 8.661439895629883, + "epoch": 0.650286731263595, + "mean_token_accuracy": 0.7629629373550415, + "num_tokens": 13409931.0, + "step": 6577, + "train/ce_loss": 0.8685452342033386 + }, + { + "epoch": 0.650286731263595, + "step": 6577, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.650286731263595, + "step": 6577, + "train/total_loss": 0.14154201745986938 + }, + { + "entropy": 8.24040699005127, + "epoch": 0.6503856041131105, + "mean_token_accuracy": 0.7872582674026489, + "num_tokens": 13415295.0, + "step": 6578, + "train/ce_loss": 0.36787208914756775 + }, + { + "epoch": 0.6503856041131105, + "step": 6578, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6503856041131105, + "step": 6578, + "train/total_loss": 0.06803721189498901 + }, + { + "entropy": 8.668405532836914, + "epoch": 0.650484476962626, + "mean_token_accuracy": 0.7052767276763916, + "num_tokens": 13420605.0, + "step": 6579, + "train/ce_loss": 1.0733392238616943 + }, + { + "epoch": 0.650484476962626, + "step": 6579, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.650484476962626, + "step": 6579, + "train/total_loss": 0.1503026783466339 + }, + { + "epoch": 0.6505833498121416, + "grad_norm": 0.765305757522583, + "learning_rate": 8.375859170251695e-06, + "loss": 0.1413, + "step": 6580 + }, + { + "entropy": 8.656425476074219, + "epoch": 0.6505833498121416, + "mean_token_accuracy": 0.7190876603126526, + "num_tokens": 13425946.0, + "step": 6580, + "train/ce_loss": 0.8743836283683777 + }, + { + "epoch": 0.6505833498121416, + "step": 6580, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6505833498121416, + "step": 6580, + "train/total_loss": 0.15775087475776672 + }, + { + "entropy": 8.495885848999023, + "epoch": 0.6506822226616571, + "mean_token_accuracy": 0.7882927060127258, + "num_tokens": 13431438.0, + "step": 6581, + "train/ce_loss": 0.6086604595184326 + }, + { + "epoch": 0.6506822226616571, + "step": 6581, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.6506822226616571, + "step": 6581, + "train/total_loss": 0.13508479297161102 + }, + { + "entropy": 9.197243690490723, + "epoch": 0.6507810955111726, + "mean_token_accuracy": 0.7380136847496033, + "num_tokens": 13436441.0, + "step": 6582, + "train/ce_loss": 0.9340671896934509 + }, + { + "epoch": 0.6507810955111726, + "step": 6582, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6507810955111726, + "step": 6582, + "train/total_loss": 0.13637547194957733 + }, + { + "entropy": 8.552262306213379, + "epoch": 0.6508799683606882, + "mean_token_accuracy": 0.7147766351699829, + "num_tokens": 13441800.0, + "step": 6583, + "train/ce_loss": 1.2501468658447266 + }, + { + "epoch": 0.6508799683606882, + "step": 6583, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6508799683606882, + "step": 6583, + "train/total_loss": 0.17188969254493713 + }, + { + "entropy": 8.310001373291016, + "epoch": 0.6509788412102037, + "mean_token_accuracy": 0.7433722019195557, + "num_tokens": 13447242.0, + "step": 6584, + "train/ce_loss": 0.8037084937095642 + }, + { + "epoch": 0.6509788412102037, + "step": 6584, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6509788412102037, + "step": 6584, + "train/total_loss": 0.12724584341049194 + }, + { + "entropy": 8.924371719360352, + "epoch": 0.6510777140597191, + "mean_token_accuracy": 0.6680107712745667, + "num_tokens": 13452410.0, + "step": 6585, + "train/ce_loss": 1.5992698669433594 + }, + { + "epoch": 0.6510777140597191, + "step": 6585, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6510777140597191, + "step": 6585, + "train/total_loss": 0.21461449563503265 + }, + { + "entropy": 8.38753890991211, + "epoch": 0.6511765869092347, + "mean_token_accuracy": 0.7152103781700134, + "num_tokens": 13457842.0, + "step": 6586, + "train/ce_loss": 0.8172120451927185 + }, + { + "epoch": 0.6511765869092347, + "step": 6586, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6511765869092347, + "step": 6586, + "train/total_loss": 0.12468995898962021 + }, + { + "entropy": 8.405166625976562, + "epoch": 0.6512754597587502, + "mean_token_accuracy": 0.687637984752655, + "num_tokens": 13463232.0, + "step": 6587, + "train/ce_loss": 0.8654608130455017 + }, + { + "epoch": 0.6512754597587502, + "step": 6587, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.6512754597587502, + "step": 6587, + "train/total_loss": 0.18420234322547913 + }, + { + "entropy": 8.861724853515625, + "epoch": 0.6513743326082657, + "mean_token_accuracy": 0.7378516793251038, + "num_tokens": 13468498.0, + "step": 6588, + "train/ce_loss": 0.8934001326560974 + }, + { + "epoch": 0.6513743326082657, + "step": 6588, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6513743326082657, + "step": 6588, + "train/total_loss": 0.15184001624584198 + }, + { + "entropy": 8.825806617736816, + "epoch": 0.6514732054577813, + "mean_token_accuracy": 0.7155388593673706, + "num_tokens": 13473776.0, + "step": 6589, + "train/ce_loss": 0.9997677803039551 + }, + { + "epoch": 0.6514732054577813, + "step": 6589, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.6514732054577813, + "step": 6589, + "train/total_loss": 0.1898205280303955 + }, + { + "entropy": 8.642440795898438, + "epoch": 0.6515720783072968, + "mean_token_accuracy": 0.7457212805747986, + "num_tokens": 13479071.0, + "step": 6590, + "train/ce_loss": 0.771452784538269 + }, + { + "epoch": 0.6515720783072968, + "step": 6590, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6515720783072968, + "step": 6590, + "train/total_loss": 0.1552702784538269 + }, + { + "entropy": 8.950058937072754, + "epoch": 0.6516709511568124, + "mean_token_accuracy": 0.7306064963340759, + "num_tokens": 13484300.0, + "step": 6591, + "train/ce_loss": 1.9471641280688345e-05 + }, + { + "epoch": 0.6516709511568124, + "step": 6591, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6516709511568124, + "step": 6591, + "train/total_loss": 0.050783198326826096 + }, + { + "entropy": 9.265976905822754, + "epoch": 0.6517698240063279, + "mean_token_accuracy": 0.7322404384613037, + "num_tokens": 13489275.0, + "step": 6592, + "train/ce_loss": 1.37646484375 + }, + { + "epoch": 0.6517698240063279, + "step": 6592, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6517698240063279, + "step": 6592, + "train/total_loss": 0.21577148139476776 + }, + { + "entropy": 9.448829650878906, + "epoch": 0.6518686968558434, + "mean_token_accuracy": 0.7547974586486816, + "num_tokens": 13494129.0, + "step": 6593, + "train/ce_loss": 1.212557077407837 + }, + { + "epoch": 0.6518686968558434, + "step": 6593, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6518686968558434, + "step": 6593, + "train/total_loss": 0.15641196072101593 + }, + { + "entropy": 9.146718978881836, + "epoch": 0.651967569705359, + "mean_token_accuracy": 0.7548291087150574, + "num_tokens": 13499291.0, + "step": 6594, + "train/ce_loss": 0.8624992966651917 + }, + { + "epoch": 0.651967569705359, + "step": 6594, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.651967569705359, + "step": 6594, + "train/total_loss": 0.1018749326467514 + }, + { + "entropy": 8.7354154586792, + "epoch": 0.6520664425548744, + "mean_token_accuracy": 0.7263157963752747, + "num_tokens": 13504541.0, + "step": 6595, + "train/ce_loss": 0.8909603953361511 + }, + { + "epoch": 0.6520664425548744, + "step": 6595, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.6520664425548744, + "step": 6595, + "train/total_loss": 0.11643978953361511 + }, + { + "entropy": 9.494924545288086, + "epoch": 0.6521653154043899, + "mean_token_accuracy": 0.7104557752609253, + "num_tokens": 13509364.0, + "step": 6596, + "train/ce_loss": 3.237225246266462e-05 + }, + { + "epoch": 0.6521653154043899, + "step": 6596, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6521653154043899, + "step": 6596, + "train/total_loss": 0.04687823727726936 + }, + { + "entropy": 8.990909576416016, + "epoch": 0.6522641882539055, + "mean_token_accuracy": 0.7382671236991882, + "num_tokens": 13514338.0, + "step": 6597, + "train/ce_loss": 5.00813303005998e-06 + }, + { + "epoch": 0.6522641882539055, + "step": 6597, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6522641882539055, + "step": 6597, + "train/total_loss": 0.050781749188899994 + }, + { + "entropy": 9.145400047302246, + "epoch": 0.652363061103421, + "mean_token_accuracy": 0.7071547508239746, + "num_tokens": 13519384.0, + "step": 6598, + "train/ce_loss": 0.9238376617431641 + }, + { + "epoch": 0.652363061103421, + "step": 6598, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.652363061103421, + "step": 6598, + "train/total_loss": 0.12754002213478088 + }, + { + "entropy": 8.485532760620117, + "epoch": 0.6524619339529365, + "mean_token_accuracy": 0.8067520260810852, + "num_tokens": 13524707.0, + "step": 6599, + "train/ce_loss": 0.9799271821975708 + }, + { + "epoch": 0.6524619339529365, + "step": 6599, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6524619339529365, + "step": 6599, + "train/total_loss": 0.15268021821975708 + }, + { + "epoch": 0.6525608068024521, + "grad_norm": 0.6757383942604065, + "learning_rate": 8.370914305493745e-06, + "loss": 0.143, + "step": 6600 + }, + { + "entropy": 8.593257904052734, + "epoch": 0.6525608068024521, + "mean_token_accuracy": 0.7207586765289307, + "num_tokens": 13530165.0, + "step": 6600, + "train/ce_loss": 2.1290764808654785 + }, + { + "epoch": 0.6525608068024521, + "step": 6600, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.6525608068024521, + "step": 6600, + "train/total_loss": 0.28712642192840576 + }, + { + "entropy": 8.746946334838867, + "epoch": 0.6526596796519676, + "mean_token_accuracy": 0.7280187606811523, + "num_tokens": 13535521.0, + "step": 6601, + "train/ce_loss": 0.9030402302742004 + }, + { + "epoch": 0.6526596796519676, + "step": 6601, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6526596796519676, + "step": 6601, + "train/total_loss": 0.12155402451753616 + }, + { + "entropy": 8.988773345947266, + "epoch": 0.6527585525014831, + "mean_token_accuracy": 0.7064343094825745, + "num_tokens": 13540744.0, + "step": 6602, + "train/ce_loss": 0.9543810486793518 + }, + { + "epoch": 0.6527585525014831, + "step": 6602, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6527585525014831, + "step": 6602, + "train/total_loss": 0.15793810784816742 + }, + { + "entropy": 9.077032089233398, + "epoch": 0.6528574253509987, + "mean_token_accuracy": 0.7015113234519958, + "num_tokens": 13546016.0, + "step": 6603, + "train/ce_loss": 0.969390332698822 + }, + { + "epoch": 0.6528574253509987, + "step": 6603, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6528574253509987, + "step": 6603, + "train/total_loss": 0.15943902730941772 + }, + { + "entropy": 8.732562065124512, + "epoch": 0.6529562982005142, + "mean_token_accuracy": 0.7128927707672119, + "num_tokens": 13551411.0, + "step": 6604, + "train/ce_loss": 1.147282600402832 + }, + { + "epoch": 0.6529562982005142, + "step": 6604, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6529562982005142, + "step": 6604, + "train/total_loss": 0.17722827196121216 + }, + { + "entropy": 8.944042205810547, + "epoch": 0.6530551710500296, + "mean_token_accuracy": 0.7874214053153992, + "num_tokens": 13556656.0, + "step": 6605, + "train/ce_loss": 0.3569697141647339 + }, + { + "epoch": 0.6530551710500296, + "step": 6605, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6530551710500296, + "step": 6605, + "train/total_loss": 0.05522822216153145 + }, + { + "entropy": 9.391716957092285, + "epoch": 0.6531540438995452, + "mean_token_accuracy": 0.6883116960525513, + "num_tokens": 13561482.0, + "step": 6606, + "train/ce_loss": 9.041209705173969e-05 + }, + { + "epoch": 0.6531540438995452, + "step": 6606, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6531540438995452, + "step": 6606, + "train/total_loss": 0.04688404127955437 + }, + { + "entropy": 8.47945785522461, + "epoch": 0.6532529167490607, + "mean_token_accuracy": 0.7885228395462036, + "num_tokens": 13566904.0, + "step": 6607, + "train/ce_loss": 0.5496372580528259 + }, + { + "epoch": 0.6532529167490607, + "step": 6607, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6532529167490607, + "step": 6607, + "train/total_loss": 0.10183872282505035 + }, + { + "entropy": 8.949787139892578, + "epoch": 0.6533517895985762, + "mean_token_accuracy": 0.6934523582458496, + "num_tokens": 13572036.0, + "step": 6608, + "train/ce_loss": 0.8607617616653442 + }, + { + "epoch": 0.6533517895985762, + "step": 6608, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6533517895985762, + "step": 6608, + "train/total_loss": 0.14076367020606995 + }, + { + "entropy": 8.818286895751953, + "epoch": 0.6534506624480918, + "mean_token_accuracy": 0.6873508095741272, + "num_tokens": 13577314.0, + "step": 6609, + "train/ce_loss": 0.4437519609928131 + }, + { + "epoch": 0.6534506624480918, + "step": 6609, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.6534506624480918, + "step": 6609, + "train/total_loss": 0.07171894609928131 + }, + { + "entropy": 8.649569511413574, + "epoch": 0.6535495352976073, + "mean_token_accuracy": 0.7800776362419128, + "num_tokens": 13582591.0, + "step": 6610, + "train/ce_loss": 0.5533372163772583 + }, + { + "epoch": 0.6535495352976073, + "step": 6610, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6535495352976073, + "step": 6610, + "train/total_loss": 0.09439621865749359 + }, + { + "entropy": 8.685169219970703, + "epoch": 0.6536484081471228, + "mean_token_accuracy": 0.7140974998474121, + "num_tokens": 13587857.0, + "step": 6611, + "train/ce_loss": 0.3878539204597473 + }, + { + "epoch": 0.6536484081471228, + "step": 6611, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6536484081471228, + "step": 6611, + "train/total_loss": 0.08566039800643921 + }, + { + "entropy": 8.785577774047852, + "epoch": 0.6537472809966384, + "mean_token_accuracy": 0.7546099424362183, + "num_tokens": 13593052.0, + "step": 6612, + "train/ce_loss": 0.5305328369140625 + }, + { + "epoch": 0.6537472809966384, + "step": 6612, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6537472809966384, + "step": 6612, + "train/total_loss": 0.11164703965187073 + }, + { + "entropy": 9.1278076171875, + "epoch": 0.6538461538461539, + "mean_token_accuracy": 0.7575187683105469, + "num_tokens": 13598061.0, + "step": 6613, + "train/ce_loss": 0.8386176824569702 + }, + { + "epoch": 0.6538461538461539, + "step": 6613, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6538461538461539, + "step": 6613, + "train/total_loss": 0.14636176824569702 + }, + { + "entropy": 9.5991792678833, + "epoch": 0.6539450266956693, + "mean_token_accuracy": 0.7457143068313599, + "num_tokens": 13602844.0, + "step": 6614, + "train/ce_loss": 6.30193535471335e-05 + }, + { + "epoch": 0.6539450266956693, + "step": 6614, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6539450266956693, + "step": 6614, + "train/total_loss": 0.035162553191185 + }, + { + "entropy": 8.444253921508789, + "epoch": 0.6540438995451849, + "mean_token_accuracy": 0.7926221489906311, + "num_tokens": 13608378.0, + "step": 6615, + "train/ce_loss": 0.5664340257644653 + }, + { + "epoch": 0.6540438995451849, + "step": 6615, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6540438995451849, + "step": 6615, + "train/total_loss": 0.07617465406656265 + }, + { + "entropy": 8.906087875366211, + "epoch": 0.6541427723947004, + "mean_token_accuracy": 0.7394366264343262, + "num_tokens": 13613689.0, + "step": 6616, + "train/ce_loss": 1.4852736285320134e-06 + }, + { + "epoch": 0.6541427723947004, + "step": 6616, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.6541427723947004, + "step": 6616, + "train/total_loss": 0.02734389901161194 + }, + { + "entropy": 9.077184677124023, + "epoch": 0.6542416452442159, + "mean_token_accuracy": 0.7466266751289368, + "num_tokens": 13618816.0, + "step": 6617, + "train/ce_loss": 1.4459218978881836 + }, + { + "epoch": 0.6542416452442159, + "step": 6617, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.6542416452442159, + "step": 6617, + "train/total_loss": 0.26568594574928284 + }, + { + "entropy": 9.16096305847168, + "epoch": 0.6543405180937315, + "mean_token_accuracy": 0.7279279232025146, + "num_tokens": 13623874.0, + "step": 6618, + "train/ce_loss": 1.1337878277117852e-05 + }, + { + "epoch": 0.6543405180937315, + "step": 6618, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6543405180937315, + "step": 6618, + "train/total_loss": 0.05859488248825073 + }, + { + "entropy": 9.032449722290039, + "epoch": 0.654439390943247, + "mean_token_accuracy": 0.7461773753166199, + "num_tokens": 13628992.0, + "step": 6619, + "train/ce_loss": 1.344244122505188 + }, + { + "epoch": 0.654439390943247, + "step": 6619, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.654439390943247, + "step": 6619, + "train/total_loss": 0.24770566821098328 + }, + { + "epoch": 0.6545382637927625, + "grad_norm": 0.7162269353866577, + "learning_rate": 8.365969440735796e-06, + "loss": 0.1417, + "step": 6620 + }, + { + "entropy": 8.860625267028809, + "epoch": 0.6545382637927625, + "mean_token_accuracy": 0.7525906562805176, + "num_tokens": 13634159.0, + "step": 6620, + "train/ce_loss": 0.3801497220993042 + }, + { + "epoch": 0.6545382637927625, + "step": 6620, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6545382637927625, + "step": 6620, + "train/total_loss": 0.0692649781703949 + }, + { + "entropy": 8.876182556152344, + "epoch": 0.6546371366422781, + "mean_token_accuracy": 0.688249409198761, + "num_tokens": 13639487.0, + "step": 6621, + "train/ce_loss": 0.8826169967651367 + }, + { + "epoch": 0.6546371366422781, + "step": 6621, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6546371366422781, + "step": 6621, + "train/total_loss": 0.1468554437160492 + }, + { + "entropy": 9.362860679626465, + "epoch": 0.6547360094917936, + "mean_token_accuracy": 0.7879858613014221, + "num_tokens": 13644483.0, + "step": 6622, + "train/ce_loss": 3.3644362247287063e-06 + }, + { + "epoch": 0.6547360094917936, + "step": 6622, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6547360094917936, + "step": 6622, + "train/total_loss": 0.05078158527612686 + }, + { + "entropy": 8.86358642578125, + "epoch": 0.654834882341309, + "mean_token_accuracy": 0.7381545901298523, + "num_tokens": 13649743.0, + "step": 6623, + "train/ce_loss": 0.5250521302223206 + }, + { + "epoch": 0.654834882341309, + "step": 6623, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.654834882341309, + "step": 6623, + "train/total_loss": 0.09156771004199982 + }, + { + "entropy": 8.727371215820312, + "epoch": 0.6549337551908246, + "mean_token_accuracy": 0.7717265486717224, + "num_tokens": 13655058.0, + "step": 6624, + "train/ce_loss": 0.3160201609134674 + }, + { + "epoch": 0.6549337551908246, + "step": 6624, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6549337551908246, + "step": 6624, + "train/total_loss": 0.07066451758146286 + }, + { + "entropy": 8.269651412963867, + "epoch": 0.6550326280403401, + "mean_token_accuracy": 0.7158300876617432, + "num_tokens": 13660802.0, + "step": 6625, + "train/ce_loss": 0.7030523419380188 + }, + { + "epoch": 0.6550326280403401, + "step": 6625, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6550326280403401, + "step": 6625, + "train/total_loss": 0.093742735683918 + }, + { + "entropy": 9.112279891967773, + "epoch": 0.6551315008898556, + "mean_token_accuracy": 0.7423312664031982, + "num_tokens": 13665900.0, + "step": 6626, + "train/ce_loss": 1.0576657056808472 + }, + { + "epoch": 0.6551315008898556, + "step": 6626, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6551315008898556, + "step": 6626, + "train/total_loss": 0.16436031460762024 + }, + { + "entropy": 8.331881523132324, + "epoch": 0.6552303737393712, + "mean_token_accuracy": 0.7643442749977112, + "num_tokens": 13671380.0, + "step": 6627, + "train/ce_loss": 0.5838476419448853 + }, + { + "epoch": 0.6552303737393712, + "step": 6627, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6552303737393712, + "step": 6627, + "train/total_loss": 0.07791601121425629 + }, + { + "entropy": 8.773751258850098, + "epoch": 0.6553292465888867, + "mean_token_accuracy": 0.7782964110374451, + "num_tokens": 13676686.0, + "step": 6628, + "train/ce_loss": 0.771364152431488 + }, + { + "epoch": 0.6553292465888867, + "step": 6628, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.6553292465888867, + "step": 6628, + "train/total_loss": 0.09276141971349716 + }, + { + "entropy": 9.157133102416992, + "epoch": 0.6554281194384022, + "mean_token_accuracy": 0.7109737396240234, + "num_tokens": 13681782.0, + "step": 6629, + "train/ce_loss": 1.915988802909851 + }, + { + "epoch": 0.6554281194384022, + "step": 6629, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6554281194384022, + "step": 6629, + "train/total_loss": 0.25800514221191406 + }, + { + "entropy": 9.058282852172852, + "epoch": 0.6555269922879178, + "mean_token_accuracy": 0.7451612949371338, + "num_tokens": 13686850.0, + "step": 6630, + "train/ce_loss": 0.9509789943695068 + }, + { + "epoch": 0.6555269922879178, + "step": 6630, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6555269922879178, + "step": 6630, + "train/total_loss": 0.14587914943695068 + }, + { + "entropy": 9.235963821411133, + "epoch": 0.6556258651374333, + "mean_token_accuracy": 0.7064220309257507, + "num_tokens": 13691920.0, + "step": 6631, + "train/ce_loss": 1.46619713306427 + }, + { + "epoch": 0.6556258651374333, + "step": 6631, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6556258651374333, + "step": 6631, + "train/total_loss": 0.20130722224712372 + }, + { + "entropy": 9.007339477539062, + "epoch": 0.6557247379869487, + "mean_token_accuracy": 0.7758620977401733, + "num_tokens": 13697016.0, + "step": 6632, + "train/ce_loss": 0.962904691696167 + }, + { + "epoch": 0.6557247379869487, + "step": 6632, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.6557247379869487, + "step": 6632, + "train/total_loss": 0.1119154691696167 + }, + { + "entropy": 8.575750350952148, + "epoch": 0.6558236108364643, + "mean_token_accuracy": 0.7450593113899231, + "num_tokens": 13702497.0, + "step": 6633, + "train/ce_loss": 0.6679680347442627 + }, + { + "epoch": 0.6558236108364643, + "step": 6633, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.6558236108364643, + "step": 6633, + "train/total_loss": 0.08242180198431015 + }, + { + "entropy": 8.739509582519531, + "epoch": 0.6559224836859798, + "mean_token_accuracy": 0.6725490093231201, + "num_tokens": 13707967.0, + "step": 6634, + "train/ce_loss": 1.4301599264144897 + }, + { + "epoch": 0.6559224836859798, + "step": 6634, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.6559224836859798, + "step": 6634, + "train/total_loss": 0.2484847456216812 + }, + { + "entropy": 8.879558563232422, + "epoch": 0.6560213565354953, + "mean_token_accuracy": 0.7448275685310364, + "num_tokens": 13713279.0, + "step": 6635, + "train/ce_loss": 1.0661180019378662 + }, + { + "epoch": 0.6560213565354953, + "step": 6635, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6560213565354953, + "step": 6635, + "train/total_loss": 0.16129930317401886 + }, + { + "entropy": 8.58578872680664, + "epoch": 0.6561202293850109, + "mean_token_accuracy": 0.7660878300666809, + "num_tokens": 13718709.0, + "step": 6636, + "train/ce_loss": 0.4772024154663086 + }, + { + "epoch": 0.6561202293850109, + "step": 6636, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6561202293850109, + "step": 6636, + "train/total_loss": 0.09459523856639862 + }, + { + "entropy": 8.64152717590332, + "epoch": 0.6562191022345264, + "mean_token_accuracy": 0.7270916104316711, + "num_tokens": 13724209.0, + "step": 6637, + "train/ce_loss": 0.7879598736763 + }, + { + "epoch": 0.6562191022345264, + "step": 6637, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.6562191022345264, + "step": 6637, + "train/total_loss": 0.19207724928855896 + }, + { + "entropy": 8.790847778320312, + "epoch": 0.6563179750840419, + "mean_token_accuracy": 0.755359411239624, + "num_tokens": 13729482.0, + "step": 6638, + "train/ce_loss": 0.8471806049346924 + }, + { + "epoch": 0.6563179750840419, + "step": 6638, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6563179750840419, + "step": 6638, + "train/total_loss": 0.13940556347370148 + }, + { + "entropy": 8.453397750854492, + "epoch": 0.6564168479335575, + "mean_token_accuracy": 0.8231083750724792, + "num_tokens": 13734927.0, + "step": 6639, + "train/ce_loss": 0.5777555704116821 + }, + { + "epoch": 0.6564168479335575, + "step": 6639, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.6564168479335575, + "step": 6639, + "train/total_loss": 0.07340055704116821 + }, + { + "epoch": 0.656515720783073, + "grad_norm": 0.5281113386154175, + "learning_rate": 8.361024575977848e-06, + "loss": 0.1352, + "step": 6640 + }, + { + "entropy": 8.344795227050781, + "epoch": 0.656515720783073, + "mean_token_accuracy": 0.7142857313156128, + "num_tokens": 13740538.0, + "step": 6640, + "train/ce_loss": 1.1282984018325806 + }, + { + "epoch": 0.656515720783073, + "step": 6640, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.656515720783073, + "step": 6640, + "train/total_loss": 0.18704858422279358 + }, + { + "entropy": 9.365694046020508, + "epoch": 0.6566145936325885, + "mean_token_accuracy": 0.7252336740493774, + "num_tokens": 13745421.0, + "step": 6641, + "train/ce_loss": 2.0379106998443604 + }, + { + "epoch": 0.6566145936325885, + "step": 6641, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6566145936325885, + "step": 6641, + "train/total_loss": 0.250666081905365 + }, + { + "entropy": 9.762799263000488, + "epoch": 0.656713466482104, + "mean_token_accuracy": 0.688034176826477, + "num_tokens": 13750082.0, + "step": 6642, + "train/ce_loss": 4.048784255981445 + }, + { + "epoch": 0.656713466482104, + "step": 6642, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.656713466482104, + "step": 6642, + "train/total_loss": 0.4361284375190735 + }, + { + "entropy": 8.665780067443848, + "epoch": 0.6568123393316195, + "mean_token_accuracy": 0.7496463656425476, + "num_tokens": 13755510.0, + "step": 6643, + "train/ce_loss": 0.9824205040931702 + }, + { + "epoch": 0.6568123393316195, + "step": 6643, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.6568123393316195, + "step": 6643, + "train/total_loss": 0.18808579444885254 + }, + { + "entropy": 8.95378589630127, + "epoch": 0.656911212181135, + "mean_token_accuracy": 0.737864077091217, + "num_tokens": 13760950.0, + "step": 6644, + "train/ce_loss": 0.618170440196991 + }, + { + "epoch": 0.656911212181135, + "step": 6644, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.656911212181135, + "step": 6644, + "train/total_loss": 0.10087954998016357 + }, + { + "entropy": 8.267267227172852, + "epoch": 0.6570100850306506, + "mean_token_accuracy": 0.6891766786575317, + "num_tokens": 13766521.0, + "step": 6645, + "train/ce_loss": 0.5283001065254211 + }, + { + "epoch": 0.6570100850306506, + "step": 6645, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6570100850306506, + "step": 6645, + "train/total_loss": 0.09579876065254211 + }, + { + "entropy": 8.782012939453125, + "epoch": 0.6571089578801661, + "mean_token_accuracy": 0.7252090573310852, + "num_tokens": 13771810.0, + "step": 6646, + "train/ce_loss": 0.9844553470611572 + }, + { + "epoch": 0.6571089578801661, + "step": 6646, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6571089578801661, + "step": 6646, + "train/total_loss": 0.16485178470611572 + }, + { + "entropy": 9.146493911743164, + "epoch": 0.6572078307296816, + "mean_token_accuracy": 0.7063252925872803, + "num_tokens": 13776957.0, + "step": 6647, + "train/ce_loss": 0.8593490719795227 + }, + { + "epoch": 0.6572078307296816, + "step": 6647, + "train/sim_loss": 0.1875 + }, + { + "epoch": 0.6572078307296816, + "step": 6647, + "train/total_loss": 0.27343490719795227 + }, + { + "entropy": 8.893928527832031, + "epoch": 0.6573067035791972, + "mean_token_accuracy": 0.7760097980499268, + "num_tokens": 13782278.0, + "step": 6648, + "train/ce_loss": 1.0574133396148682 + }, + { + "epoch": 0.6573067035791972, + "step": 6648, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6573067035791972, + "step": 6648, + "train/total_loss": 0.16042883694171906 + }, + { + "entropy": 8.998648643493652, + "epoch": 0.6574055764287127, + "mean_token_accuracy": 0.7454545497894287, + "num_tokens": 13787487.0, + "step": 6649, + "train/ce_loss": 0.7707734107971191 + }, + { + "epoch": 0.6574055764287127, + "step": 6649, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6574055764287127, + "step": 6649, + "train/total_loss": 0.14738984405994415 + }, + { + "entropy": 8.66889762878418, + "epoch": 0.6575044492782282, + "mean_token_accuracy": 0.8360995650291443, + "num_tokens": 13792893.0, + "step": 6650, + "train/ce_loss": 0.8410592675209045 + }, + { + "epoch": 0.6575044492782282, + "step": 6650, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6575044492782282, + "step": 6650, + "train/total_loss": 0.12316843122243881 + }, + { + "entropy": 8.6030912399292, + "epoch": 0.6576033221277438, + "mean_token_accuracy": 0.7995283007621765, + "num_tokens": 13798209.0, + "step": 6651, + "train/ce_loss": 0.335406094789505 + }, + { + "epoch": 0.6576033221277438, + "step": 6651, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6576033221277438, + "step": 6651, + "train/total_loss": 0.05307186022400856 + }, + { + "entropy": 8.725292205810547, + "epoch": 0.6577021949772592, + "mean_token_accuracy": 0.7394285798072815, + "num_tokens": 13803603.0, + "step": 6652, + "train/ce_loss": 0.9964840412139893 + }, + { + "epoch": 0.6577021949772592, + "step": 6652, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6577021949772592, + "step": 6652, + "train/total_loss": 0.16996091604232788 + }, + { + "entropy": 8.928272247314453, + "epoch": 0.6578010678267747, + "mean_token_accuracy": 0.7513889074325562, + "num_tokens": 13808781.0, + "step": 6653, + "train/ce_loss": 0.7644780278205872 + }, + { + "epoch": 0.6578010678267747, + "step": 6653, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.6578010678267747, + "step": 6653, + "train/total_loss": 0.17019781470298767 + }, + { + "entropy": 8.922335624694824, + "epoch": 0.6578999406762903, + "mean_token_accuracy": 0.7458279728889465, + "num_tokens": 13814004.0, + "step": 6654, + "train/ce_loss": 0.4588564932346344 + }, + { + "epoch": 0.6578999406762903, + "step": 6654, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6578999406762903, + "step": 6654, + "train/total_loss": 0.08885440230369568 + }, + { + "entropy": 9.43021011352539, + "epoch": 0.6579988135258058, + "mean_token_accuracy": 0.66847825050354, + "num_tokens": 13818946.0, + "step": 6655, + "train/ce_loss": 2.87549187305558e-06 + }, + { + "epoch": 0.6579988135258058, + "step": 6655, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6579988135258058, + "step": 6655, + "train/total_loss": 0.01953153684735298 + }, + { + "entropy": 8.475024223327637, + "epoch": 0.6580976863753213, + "mean_token_accuracy": 0.7195571660995483, + "num_tokens": 13824213.0, + "step": 6656, + "train/ce_loss": 0.807493269443512 + }, + { + "epoch": 0.6580976863753213, + "step": 6656, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6580976863753213, + "step": 6656, + "train/total_loss": 0.11199932545423508 + }, + { + "entropy": 8.546957015991211, + "epoch": 0.6581965592248369, + "mean_token_accuracy": 0.7383177280426025, + "num_tokens": 13829565.0, + "step": 6657, + "train/ce_loss": 0.5284896492958069 + }, + { + "epoch": 0.6581965592248369, + "step": 6657, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6581965592248369, + "step": 6657, + "train/total_loss": 0.11144271492958069 + }, + { + "entropy": 9.003338813781738, + "epoch": 0.6582954320743524, + "mean_token_accuracy": 0.7419354915618896, + "num_tokens": 13834659.0, + "step": 6658, + "train/ce_loss": 1.4813705682754517 + }, + { + "epoch": 0.6582954320743524, + "step": 6658, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.6582954320743524, + "step": 6658, + "train/total_loss": 0.25360581278800964 + }, + { + "entropy": 9.55681037902832, + "epoch": 0.6583943049238679, + "mean_token_accuracy": 0.6799163222312927, + "num_tokens": 13839510.0, + "step": 6659, + "train/ce_loss": 1.2218444347381592 + }, + { + "epoch": 0.6583943049238679, + "step": 6659, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6583943049238679, + "step": 6659, + "train/total_loss": 0.16515320539474487 + }, + { + "epoch": 0.6584931777733835, + "grad_norm": 0.8593719601631165, + "learning_rate": 8.356079711219898e-06, + "loss": 0.1409, + "step": 6660 + }, + { + "entropy": 8.629777908325195, + "epoch": 0.6584931777733835, + "mean_token_accuracy": 0.7558494210243225, + "num_tokens": 13844922.0, + "step": 6660, + "train/ce_loss": 0.8216527700424194 + }, + { + "epoch": 0.6584931777733835, + "step": 6660, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6584931777733835, + "step": 6660, + "train/total_loss": 0.11341527849435806 + }, + { + "entropy": 9.151214599609375, + "epoch": 0.6585920506228989, + "mean_token_accuracy": 0.7333333492279053, + "num_tokens": 13850144.0, + "step": 6661, + "train/ce_loss": 0.5413286685943604 + }, + { + "epoch": 0.6585920506228989, + "step": 6661, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6585920506228989, + "step": 6661, + "train/total_loss": 0.0971016138792038 + }, + { + "entropy": 8.66889762878418, + "epoch": 0.6586909234724144, + "mean_token_accuracy": 0.7402452826499939, + "num_tokens": 13855484.0, + "step": 6662, + "train/ce_loss": 0.7611150741577148 + }, + { + "epoch": 0.6586909234724144, + "step": 6662, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6586909234724144, + "step": 6662, + "train/total_loss": 0.09954901039600372 + }, + { + "entropy": 8.713974952697754, + "epoch": 0.65878979632193, + "mean_token_accuracy": 0.7663934230804443, + "num_tokens": 13860949.0, + "step": 6663, + "train/ce_loss": 0.49784982204437256 + }, + { + "epoch": 0.65878979632193, + "step": 6663, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.65878979632193, + "step": 6663, + "train/total_loss": 0.12400373816490173 + }, + { + "entropy": 9.036233901977539, + "epoch": 0.6588886691714455, + "mean_token_accuracy": 0.7224025726318359, + "num_tokens": 13866025.0, + "step": 6664, + "train/ce_loss": 3.2771895348560065e-06 + }, + { + "epoch": 0.6588886691714455, + "step": 6664, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.6588886691714455, + "step": 6664, + "train/total_loss": 0.08203157782554626 + }, + { + "entropy": 9.548892974853516, + "epoch": 0.658987542020961, + "mean_token_accuracy": 0.7231759428977966, + "num_tokens": 13870876.0, + "step": 6665, + "train/ce_loss": 1.8042532246909104e-05 + }, + { + "epoch": 0.658987542020961, + "step": 6665, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.658987542020961, + "step": 6665, + "train/total_loss": 0.023439304903149605 + }, + { + "entropy": 9.140039443969727, + "epoch": 0.6590864148704766, + "mean_token_accuracy": 0.7146739363670349, + "num_tokens": 13876055.0, + "step": 6666, + "train/ce_loss": 1.681594729423523 + }, + { + "epoch": 0.6590864148704766, + "step": 6666, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.6590864148704766, + "step": 6666, + "train/total_loss": 0.27362823486328125 + }, + { + "entropy": 8.506306648254395, + "epoch": 0.6591852877199921, + "mean_token_accuracy": 0.752525269985199, + "num_tokens": 13881475.0, + "step": 6667, + "train/ce_loss": 0.8816389441490173 + }, + { + "epoch": 0.6591852877199921, + "step": 6667, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6591852877199921, + "step": 6667, + "train/total_loss": 0.14285139739513397 + }, + { + "entropy": 9.267065048217773, + "epoch": 0.6592841605695076, + "mean_token_accuracy": 0.7409090995788574, + "num_tokens": 13886542.0, + "step": 6668, + "train/ce_loss": 1.1963961124420166 + }, + { + "epoch": 0.6592841605695076, + "step": 6668, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6592841605695076, + "step": 6668, + "train/total_loss": 0.17432710528373718 + }, + { + "entropy": 8.78718090057373, + "epoch": 0.6593830334190232, + "mean_token_accuracy": 0.790281355381012, + "num_tokens": 13891755.0, + "step": 6669, + "train/ce_loss": 0.8355273604393005 + }, + { + "epoch": 0.6593830334190232, + "step": 6669, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6593830334190232, + "step": 6669, + "train/total_loss": 0.134333997964859 + }, + { + "entropy": 8.821840286254883, + "epoch": 0.6594819062685386, + "mean_token_accuracy": 0.7264770269393921, + "num_tokens": 13897144.0, + "step": 6670, + "train/ce_loss": 0.5888134241104126 + }, + { + "epoch": 0.6594819062685386, + "step": 6670, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6594819062685386, + "step": 6670, + "train/total_loss": 0.13700634241104126 + }, + { + "entropy": 8.783811569213867, + "epoch": 0.6595807791180541, + "mean_token_accuracy": 0.6647531390190125, + "num_tokens": 13902488.0, + "step": 6671, + "train/ce_loss": 1.449289083480835 + }, + { + "epoch": 0.6595807791180541, + "step": 6671, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6595807791180541, + "step": 6671, + "train/total_loss": 0.2230539172887802 + }, + { + "entropy": 9.072467803955078, + "epoch": 0.6596796519675697, + "mean_token_accuracy": 0.7473958134651184, + "num_tokens": 13907714.0, + "step": 6672, + "train/ce_loss": 0.7859037518501282 + }, + { + "epoch": 0.6596796519675697, + "step": 6672, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6596796519675697, + "step": 6672, + "train/total_loss": 0.12546537816524506 + }, + { + "entropy": 8.862828254699707, + "epoch": 0.6597785248170852, + "mean_token_accuracy": 0.7766749262809753, + "num_tokens": 13913005.0, + "step": 6673, + "train/ce_loss": 0.4724079370498657 + }, + { + "epoch": 0.6597785248170852, + "step": 6673, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6597785248170852, + "step": 6673, + "train/total_loss": 0.10974079370498657 + }, + { + "entropy": 9.026915550231934, + "epoch": 0.6598773976666008, + "mean_token_accuracy": 0.7480559945106506, + "num_tokens": 13918089.0, + "step": 6674, + "train/ce_loss": 1.6545954942703247 + }, + { + "epoch": 0.6598773976666008, + "step": 6674, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.6598773976666008, + "step": 6674, + "train/total_loss": 0.2396783083677292 + }, + { + "entropy": 8.767784118652344, + "epoch": 0.6599762705161163, + "mean_token_accuracy": 0.6919642686843872, + "num_tokens": 13923458.0, + "step": 6675, + "train/ce_loss": 0.8346720933914185 + }, + { + "epoch": 0.6599762705161163, + "step": 6675, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.6599762705161163, + "step": 6675, + "train/total_loss": 0.19674846529960632 + }, + { + "entropy": 8.52509593963623, + "epoch": 0.6600751433656318, + "mean_token_accuracy": 0.7439320683479309, + "num_tokens": 13928834.0, + "step": 6676, + "train/ce_loss": 0.4277784526348114 + }, + { + "epoch": 0.6600751433656318, + "step": 6676, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6600751433656318, + "step": 6676, + "train/total_loss": 0.06621535122394562 + }, + { + "entropy": 8.762564659118652, + "epoch": 0.6601740162151474, + "mean_token_accuracy": 0.6973094344139099, + "num_tokens": 13934187.0, + "step": 6677, + "train/ce_loss": 0.502404510974884 + }, + { + "epoch": 0.6601740162151474, + "step": 6677, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6601740162151474, + "step": 6677, + "train/total_loss": 0.10492795705795288 + }, + { + "entropy": 8.436891555786133, + "epoch": 0.6602728890646629, + "mean_token_accuracy": 0.7667103409767151, + "num_tokens": 13939522.0, + "step": 6678, + "train/ce_loss": 0.5824763178825378 + }, + { + "epoch": 0.6602728890646629, + "step": 6678, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6602728890646629, + "step": 6678, + "train/total_loss": 0.0973101332783699 + }, + { + "entropy": 8.870096206665039, + "epoch": 0.6603717619141783, + "mean_token_accuracy": 0.7449101805686951, + "num_tokens": 13944801.0, + "step": 6679, + "train/ce_loss": 0.6867674589157104 + }, + { + "epoch": 0.6603717619141783, + "step": 6679, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6603717619141783, + "step": 6679, + "train/total_loss": 0.11555174738168716 + }, + { + "epoch": 0.6604706347636939, + "grad_norm": 0.6254637837409973, + "learning_rate": 8.35113484646195e-06, + "loss": 0.1439, + "step": 6680 + }, + { + "entropy": 8.857760429382324, + "epoch": 0.6604706347636939, + "mean_token_accuracy": 0.7624633312225342, + "num_tokens": 13949960.0, + "step": 6680, + "train/ce_loss": 0.8578407168388367 + }, + { + "epoch": 0.6604706347636939, + "step": 6680, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6604706347636939, + "step": 6680, + "train/total_loss": 0.12484657019376755 + }, + { + "entropy": 9.655352592468262, + "epoch": 0.6605695076132094, + "mean_token_accuracy": 0.75, + "num_tokens": 13954767.0, + "step": 6681, + "train/ce_loss": 3.15669763040205e-06 + }, + { + "epoch": 0.6605695076132094, + "step": 6681, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.6605695076132094, + "step": 6681, + "train/total_loss": 0.01562531478703022 + }, + { + "entropy": 8.700724601745605, + "epoch": 0.6606683804627249, + "mean_token_accuracy": 0.7291440963745117, + "num_tokens": 13960159.0, + "step": 6682, + "train/ce_loss": 0.2854388654232025 + }, + { + "epoch": 0.6606683804627249, + "step": 6682, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6606683804627249, + "step": 6682, + "train/total_loss": 0.04807513952255249 + }, + { + "entropy": 8.941474914550781, + "epoch": 0.6607672533122405, + "mean_token_accuracy": 0.7733674645423889, + "num_tokens": 13965383.0, + "step": 6683, + "train/ce_loss": 0.7744351625442505 + }, + { + "epoch": 0.6607672533122405, + "step": 6683, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6607672533122405, + "step": 6683, + "train/total_loss": 0.10088101774454117 + }, + { + "entropy": 8.106334686279297, + "epoch": 0.660866126161756, + "mean_token_accuracy": 0.7571174502372742, + "num_tokens": 13971010.0, + "step": 6684, + "train/ce_loss": 0.8612960577011108 + }, + { + "epoch": 0.660866126161756, + "step": 6684, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.660866126161756, + "step": 6684, + "train/total_loss": 0.15253585577011108 + }, + { + "entropy": 8.876348495483398, + "epoch": 0.6609649990112715, + "mean_token_accuracy": 0.689830482006073, + "num_tokens": 13976062.0, + "step": 6685, + "train/ce_loss": 0.8690265417098999 + }, + { + "epoch": 0.6609649990112715, + "step": 6685, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6609649990112715, + "step": 6685, + "train/total_loss": 0.1533088982105255 + }, + { + "entropy": 8.80894660949707, + "epoch": 0.6610638718607871, + "mean_token_accuracy": 0.7735602259635925, + "num_tokens": 13981283.0, + "step": 6686, + "train/ce_loss": 0.5743243098258972 + }, + { + "epoch": 0.6610638718607871, + "step": 6686, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6610638718607871, + "step": 6686, + "train/total_loss": 0.07696367800235748 + }, + { + "entropy": 9.32143783569336, + "epoch": 0.6611627447103026, + "mean_token_accuracy": 0.745233952999115, + "num_tokens": 13986289.0, + "step": 6687, + "train/ce_loss": 4.4183077989146113e-05 + }, + { + "epoch": 0.6611627447103026, + "step": 6687, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6611627447103026, + "step": 6687, + "train/total_loss": 0.039066918194293976 + }, + { + "entropy": 9.051506042480469, + "epoch": 0.661261617559818, + "mean_token_accuracy": 0.7278911471366882, + "num_tokens": 13991496.0, + "step": 6688, + "train/ce_loss": 0.7195901870727539 + }, + { + "epoch": 0.661261617559818, + "step": 6688, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.661261617559818, + "step": 6688, + "train/total_loss": 0.11102151870727539 + }, + { + "entropy": 8.631498336791992, + "epoch": 0.6613604904093336, + "mean_token_accuracy": 0.7300683259963989, + "num_tokens": 13996889.0, + "step": 6689, + "train/ce_loss": 0.8472098112106323 + }, + { + "epoch": 0.6613604904093336, + "step": 6689, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6613604904093336, + "step": 6689, + "train/total_loss": 0.16284598410129547 + }, + { + "entropy": 8.36520767211914, + "epoch": 0.6614593632588491, + "mean_token_accuracy": 0.7360097169876099, + "num_tokens": 14002188.0, + "step": 6690, + "train/ce_loss": 1.0956363677978516 + }, + { + "epoch": 0.6614593632588491, + "step": 6690, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6614593632588491, + "step": 6690, + "train/total_loss": 0.1720636487007141 + }, + { + "entropy": 8.532737731933594, + "epoch": 0.6615582361083646, + "mean_token_accuracy": 0.7497527003288269, + "num_tokens": 14007707.0, + "step": 6691, + "train/ce_loss": 0.7537171840667725 + }, + { + "epoch": 0.6615582361083646, + "step": 6691, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.6615582361083646, + "step": 6691, + "train/total_loss": 0.10271546989679337 + }, + { + "entropy": 9.208215713500977, + "epoch": 0.6616571089578802, + "mean_token_accuracy": 0.6937212944030762, + "num_tokens": 14012788.0, + "step": 6692, + "train/ce_loss": 0.7378474473953247 + }, + { + "epoch": 0.6616571089578802, + "step": 6692, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6616571089578802, + "step": 6692, + "train/total_loss": 0.11284724622964859 + }, + { + "entropy": 9.050117492675781, + "epoch": 0.6617559818073957, + "mean_token_accuracy": 0.6995447874069214, + "num_tokens": 14017857.0, + "step": 6693, + "train/ce_loss": 1.3012582063674927 + }, + { + "epoch": 0.6617559818073957, + "step": 6693, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.6617559818073957, + "step": 6693, + "train/total_loss": 0.22387582063674927 + }, + { + "entropy": 9.351299285888672, + "epoch": 0.6618548546569112, + "mean_token_accuracy": 0.6732673048973083, + "num_tokens": 14022792.0, + "step": 6694, + "train/ce_loss": 2.2628509998321533 + }, + { + "epoch": 0.6618548546569112, + "step": 6694, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6618548546569112, + "step": 6694, + "train/total_loss": 0.24581634998321533 + }, + { + "entropy": 8.946569442749023, + "epoch": 0.6619537275064268, + "mean_token_accuracy": 0.7586206793785095, + "num_tokens": 14028030.0, + "step": 6695, + "train/ce_loss": 0.7077171206474304 + }, + { + "epoch": 0.6619537275064268, + "step": 6695, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6619537275064268, + "step": 6695, + "train/total_loss": 0.1176467165350914 + }, + { + "entropy": 8.753110885620117, + "epoch": 0.6620526003559423, + "mean_token_accuracy": 0.7614796161651611, + "num_tokens": 14033286.0, + "step": 6696, + "train/ce_loss": 0.7196674942970276 + }, + { + "epoch": 0.6620526003559423, + "step": 6696, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6620526003559423, + "step": 6696, + "train/total_loss": 0.126654252409935 + }, + { + "entropy": 8.630273818969727, + "epoch": 0.6621514732054578, + "mean_token_accuracy": 0.7274969220161438, + "num_tokens": 14038602.0, + "step": 6697, + "train/ce_loss": 0.9384482502937317 + }, + { + "epoch": 0.6621514732054578, + "step": 6697, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6621514732054578, + "step": 6697, + "train/total_loss": 0.13290733098983765 + }, + { + "entropy": 8.813023567199707, + "epoch": 0.6622503460549733, + "mean_token_accuracy": 0.7220259308815002, + "num_tokens": 14043908.0, + "step": 6698, + "train/ce_loss": 0.8345155119895935 + }, + { + "epoch": 0.6622503460549733, + "step": 6698, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6622503460549733, + "step": 6698, + "train/total_loss": 0.1615765541791916 + }, + { + "entropy": 9.198127746582031, + "epoch": 0.6623492189044888, + "mean_token_accuracy": 0.7068645358085632, + "num_tokens": 14048906.0, + "step": 6699, + "train/ce_loss": 0.8945678472518921 + }, + { + "epoch": 0.6623492189044888, + "step": 6699, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6623492189044888, + "step": 6699, + "train/total_loss": 0.10898803919553757 + }, + { + "epoch": 0.6624480917540043, + "grad_norm": 0.8893702626228333, + "learning_rate": 8.346189981704001e-06, + "loss": 0.141, + "step": 6700 + }, + { + "entropy": 8.877273559570312, + "epoch": 0.6624480917540043, + "mean_token_accuracy": 0.7578814625740051, + "num_tokens": 14054187.0, + "step": 6700, + "train/ce_loss": 0.5039969682693481 + }, + { + "epoch": 0.6624480917540043, + "step": 6700, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6624480917540043, + "step": 6700, + "train/total_loss": 0.06993094831705093 + }, + { + "entropy": 8.706487655639648, + "epoch": 0.6625469646035199, + "mean_token_accuracy": 0.7451456189155579, + "num_tokens": 14059441.0, + "step": 6701, + "train/ce_loss": 0.6316986083984375 + }, + { + "epoch": 0.6625469646035199, + "step": 6701, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6625469646035199, + "step": 6701, + "train/total_loss": 0.09832610934972763 + }, + { + "entropy": 9.48685359954834, + "epoch": 0.6626458374530354, + "mean_token_accuracy": 0.7756563425064087, + "num_tokens": 14064301.0, + "step": 6702, + "train/ce_loss": 1.4424176216125488 + }, + { + "epoch": 0.6626458374530354, + "step": 6702, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6626458374530354, + "step": 6702, + "train/total_loss": 0.18721051514148712 + }, + { + "entropy": 8.722457885742188, + "epoch": 0.6627447103025509, + "mean_token_accuracy": 0.6818675398826599, + "num_tokens": 14069707.0, + "step": 6703, + "train/ce_loss": 1.3143783807754517 + }, + { + "epoch": 0.6627447103025509, + "step": 6703, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6627447103025509, + "step": 6703, + "train/total_loss": 0.18221908807754517 + }, + { + "entropy": 8.672255516052246, + "epoch": 0.6628435831520665, + "mean_token_accuracy": 0.7947434186935425, + "num_tokens": 14074947.0, + "step": 6704, + "train/ce_loss": 0.7217705845832825 + }, + { + "epoch": 0.6628435831520665, + "step": 6704, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.6628435831520665, + "step": 6704, + "train/total_loss": 0.09952080994844437 + }, + { + "entropy": 8.845043182373047, + "epoch": 0.662942456001582, + "mean_token_accuracy": 0.7313797473907471, + "num_tokens": 14080276.0, + "step": 6705, + "train/ce_loss": 1.2335104942321777 + }, + { + "epoch": 0.662942456001582, + "step": 6705, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.662942456001582, + "step": 6705, + "train/total_loss": 0.19756980240345 + }, + { + "entropy": 8.619894027709961, + "epoch": 0.6630413288510975, + "mean_token_accuracy": 0.7848244905471802, + "num_tokens": 14085779.0, + "step": 6706, + "train/ce_loss": 0.6108676195144653 + }, + { + "epoch": 0.6630413288510975, + "step": 6706, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6630413288510975, + "step": 6706, + "train/total_loss": 0.1079617589712143 + }, + { + "entropy": 8.909207344055176, + "epoch": 0.663140201700613, + "mean_token_accuracy": 0.7596899271011353, + "num_tokens": 14090955.0, + "step": 6707, + "train/ce_loss": 1.4759429693222046 + }, + { + "epoch": 0.663140201700613, + "step": 6707, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.663140201700613, + "step": 6707, + "train/total_loss": 0.19837555289268494 + }, + { + "entropy": 9.152202606201172, + "epoch": 0.6632390745501285, + "mean_token_accuracy": 0.7245901823043823, + "num_tokens": 14096014.0, + "step": 6708, + "train/ce_loss": 0.8172332048416138 + }, + { + "epoch": 0.6632390745501285, + "step": 6708, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6632390745501285, + "step": 6708, + "train/total_loss": 0.12078582495450974 + }, + { + "entropy": 9.569263458251953, + "epoch": 0.663337947399644, + "mean_token_accuracy": 0.7236467003822327, + "num_tokens": 14100751.0, + "step": 6709, + "train/ce_loss": 1.4401479959487915 + }, + { + "epoch": 0.663337947399644, + "step": 6709, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.663337947399644, + "step": 6709, + "train/total_loss": 0.22213980555534363 + }, + { + "entropy": 9.026773452758789, + "epoch": 0.6634368202491596, + "mean_token_accuracy": 0.7635983228683472, + "num_tokens": 14105697.0, + "step": 6710, + "train/ce_loss": 2.6175093807978556e-06 + }, + { + "epoch": 0.6634368202491596, + "step": 6710, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6634368202491596, + "step": 6710, + "train/total_loss": 0.05859401077032089 + }, + { + "entropy": 8.975640296936035, + "epoch": 0.6635356930986751, + "mean_token_accuracy": 0.6936488151550293, + "num_tokens": 14111118.0, + "step": 6711, + "train/ce_loss": 0.9944825172424316 + }, + { + "epoch": 0.6635356930986751, + "step": 6711, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6635356930986751, + "step": 6711, + "train/total_loss": 0.14632326364517212 + }, + { + "entropy": 8.736495971679688, + "epoch": 0.6636345659481906, + "mean_token_accuracy": 0.7727839946746826, + "num_tokens": 14116383.0, + "step": 6712, + "train/ce_loss": 0.6491132974624634 + }, + { + "epoch": 0.6636345659481906, + "step": 6712, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6636345659481906, + "step": 6712, + "train/total_loss": 0.11178632825613022 + }, + { + "entropy": 9.282264709472656, + "epoch": 0.6637334387977062, + "mean_token_accuracy": 0.7543859481811523, + "num_tokens": 14121401.0, + "step": 6713, + "train/ce_loss": 0.7997955679893494 + }, + { + "epoch": 0.6637334387977062, + "step": 6713, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6637334387977062, + "step": 6713, + "train/total_loss": 0.1268545687198639 + }, + { + "entropy": 9.13487434387207, + "epoch": 0.6638323116472217, + "mean_token_accuracy": 0.7386363744735718, + "num_tokens": 14126447.0, + "step": 6714, + "train/ce_loss": 1.0122233629226685 + }, + { + "epoch": 0.6638323116472217, + "step": 6714, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6638323116472217, + "step": 6714, + "train/total_loss": 0.14809733629226685 + }, + { + "entropy": 8.932573318481445, + "epoch": 0.6639311844967372, + "mean_token_accuracy": 0.770039439201355, + "num_tokens": 14131682.0, + "step": 6715, + "train/ce_loss": 0.47455939650535583 + }, + { + "epoch": 0.6639311844967372, + "step": 6715, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6639311844967372, + "step": 6715, + "train/total_loss": 0.10214343667030334 + }, + { + "entropy": 8.547382354736328, + "epoch": 0.6640300573462528, + "mean_token_accuracy": 0.7574094533920288, + "num_tokens": 14137106.0, + "step": 6716, + "train/ce_loss": 0.6185110211372375 + }, + { + "epoch": 0.6640300573462528, + "step": 6716, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6640300573462528, + "step": 6716, + "train/total_loss": 0.08528859913349152 + }, + { + "entropy": 8.829623222351074, + "epoch": 0.6641289301957682, + "mean_token_accuracy": 0.7074742317199707, + "num_tokens": 14142346.0, + "step": 6717, + "train/ce_loss": 0.6980037689208984 + }, + { + "epoch": 0.6641289301957682, + "step": 6717, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6641289301957682, + "step": 6717, + "train/total_loss": 0.11667537689208984 + }, + { + "entropy": 8.666450500488281, + "epoch": 0.6642278030452837, + "mean_token_accuracy": 0.7428198456764221, + "num_tokens": 14147590.0, + "step": 6718, + "train/ce_loss": 0.786668598651886 + }, + { + "epoch": 0.6642278030452837, + "step": 6718, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6642278030452837, + "step": 6718, + "train/total_loss": 0.14897936582565308 + }, + { + "entropy": 9.03660774230957, + "epoch": 0.6643266758947993, + "mean_token_accuracy": 0.6939314007759094, + "num_tokens": 14152833.0, + "step": 6719, + "train/ce_loss": 7.933153028716333e-06 + }, + { + "epoch": 0.6643266758947993, + "step": 6719, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6643266758947993, + "step": 6719, + "train/total_loss": 0.06250078976154327 + }, + { + "epoch": 0.6644255487443148, + "grad_norm": 0.8787350058555603, + "learning_rate": 8.341245116946052e-06, + "loss": 0.1347, + "step": 6720 + }, + { + "entropy": 8.631905555725098, + "epoch": 0.6644255487443148, + "mean_token_accuracy": 0.7658303380012512, + "num_tokens": 14158167.0, + "step": 6720, + "train/ce_loss": 0.6893506050109863 + }, + { + "epoch": 0.6644255487443148, + "step": 6720, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6644255487443148, + "step": 6720, + "train/total_loss": 0.1275288164615631 + }, + { + "entropy": 9.196617126464844, + "epoch": 0.6645244215938303, + "mean_token_accuracy": 0.7628865838050842, + "num_tokens": 14163183.0, + "step": 6721, + "train/ce_loss": 1.015169620513916 + }, + { + "epoch": 0.6645244215938303, + "step": 6721, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6645244215938303, + "step": 6721, + "train/total_loss": 0.1483919620513916 + }, + { + "entropy": 8.924755096435547, + "epoch": 0.6646232944433459, + "mean_token_accuracy": 0.7825520634651184, + "num_tokens": 14168390.0, + "step": 6722, + "train/ce_loss": 0.9925330281257629 + }, + { + "epoch": 0.6646232944433459, + "step": 6722, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6646232944433459, + "step": 6722, + "train/total_loss": 0.11878455430269241 + }, + { + "entropy": 8.297525405883789, + "epoch": 0.6647221672928614, + "mean_token_accuracy": 0.7615176439285278, + "num_tokens": 14173964.0, + "step": 6723, + "train/ce_loss": 0.5884344577789307 + }, + { + "epoch": 0.6647221672928614, + "step": 6723, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.6647221672928614, + "step": 6723, + "train/total_loss": 0.0744684487581253 + }, + { + "entropy": 9.536665916442871, + "epoch": 0.6648210401423769, + "mean_token_accuracy": 0.7642105221748352, + "num_tokens": 14178859.0, + "step": 6724, + "train/ce_loss": 1.7731702327728271 + }, + { + "epoch": 0.6648210401423769, + "step": 6724, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6648210401423769, + "step": 6724, + "train/total_loss": 0.24762952327728271 + }, + { + "entropy": 9.07353401184082, + "epoch": 0.6649199129918925, + "mean_token_accuracy": 0.7966616153717041, + "num_tokens": 14183943.0, + "step": 6725, + "train/ce_loss": 0.5030363202095032 + }, + { + "epoch": 0.6649199129918925, + "step": 6725, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6649199129918925, + "step": 6725, + "train/total_loss": 0.0932723879814148 + }, + { + "entropy": 8.971988677978516, + "epoch": 0.6650187858414079, + "mean_token_accuracy": 0.7762619256973267, + "num_tokens": 14189111.0, + "step": 6726, + "train/ce_loss": 0.7912498712539673 + }, + { + "epoch": 0.6650187858414079, + "step": 6726, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.6650187858414079, + "step": 6726, + "train/total_loss": 0.10646873712539673 + }, + { + "entropy": 9.1195068359375, + "epoch": 0.6651176586909234, + "mean_token_accuracy": 0.7333333492279053, + "num_tokens": 14194174.0, + "step": 6727, + "train/ce_loss": 5.986967153148726e-06 + }, + { + "epoch": 0.6651176586909234, + "step": 6727, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6651176586909234, + "step": 6727, + "train/total_loss": 0.03906309977173805 + }, + { + "entropy": 8.535524368286133, + "epoch": 0.665216531540439, + "mean_token_accuracy": 0.7098265886306763, + "num_tokens": 14199506.0, + "step": 6728, + "train/ce_loss": 0.8491111397743225 + }, + { + "epoch": 0.665216531540439, + "step": 6728, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.665216531540439, + "step": 6728, + "train/total_loss": 0.16694235801696777 + }, + { + "entropy": 8.829087257385254, + "epoch": 0.6653154043899545, + "mean_token_accuracy": 0.7119078040122986, + "num_tokens": 14204739.0, + "step": 6729, + "train/ce_loss": 1.2258275747299194 + }, + { + "epoch": 0.6653154043899545, + "step": 6729, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6653154043899545, + "step": 6729, + "train/total_loss": 0.16945776343345642 + }, + { + "entropy": 9.08847427368164, + "epoch": 0.66541427723947, + "mean_token_accuracy": 0.7658119797706604, + "num_tokens": 14209774.0, + "step": 6730, + "train/ce_loss": 0.6165642142295837 + }, + { + "epoch": 0.66541427723947, + "step": 6730, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.66541427723947, + "step": 6730, + "train/total_loss": 0.0772814229130745 + }, + { + "entropy": 8.732650756835938, + "epoch": 0.6655131500889856, + "mean_token_accuracy": 0.7557715773582458, + "num_tokens": 14215036.0, + "step": 6731, + "train/ce_loss": 0.8895928859710693 + }, + { + "epoch": 0.6655131500889856, + "step": 6731, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6655131500889856, + "step": 6731, + "train/total_loss": 0.13192804157733917 + }, + { + "entropy": 9.088386535644531, + "epoch": 0.6656120229385011, + "mean_token_accuracy": 0.757785439491272, + "num_tokens": 14220092.0, + "step": 6732, + "train/ce_loss": 1.3937000403529964e-05 + }, + { + "epoch": 0.6656120229385011, + "step": 6732, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6656120229385011, + "step": 6732, + "train/total_loss": 0.054688893258571625 + }, + { + "entropy": 8.419425964355469, + "epoch": 0.6657108957880166, + "mean_token_accuracy": 0.8412538170814514, + "num_tokens": 14225602.0, + "step": 6733, + "train/ce_loss": 0.497548907995224 + }, + { + "epoch": 0.6657108957880166, + "step": 6733, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.6657108957880166, + "step": 6733, + "train/total_loss": 0.15131738781929016 + }, + { + "entropy": 9.298429489135742, + "epoch": 0.6658097686375322, + "mean_token_accuracy": 0.75, + "num_tokens": 14230566.0, + "step": 6734, + "train/ce_loss": 1.1033003829652444e-05 + }, + { + "epoch": 0.6658097686375322, + "step": 6734, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.6658097686375322, + "step": 6734, + "train/total_loss": 0.07421985268592834 + }, + { + "entropy": 9.47636890411377, + "epoch": 0.6659086414870476, + "mean_token_accuracy": 0.7843137383460999, + "num_tokens": 14235495.0, + "step": 6735, + "train/ce_loss": 0.6626639366149902 + }, + { + "epoch": 0.6659086414870476, + "step": 6735, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6659086414870476, + "step": 6735, + "train/total_loss": 0.12095389515161514 + }, + { + "entropy": 8.497119903564453, + "epoch": 0.6660075143365631, + "mean_token_accuracy": 0.781361997127533, + "num_tokens": 14240819.0, + "step": 6736, + "train/ce_loss": 0.9271575808525085 + }, + { + "epoch": 0.6660075143365631, + "step": 6736, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6660075143365631, + "step": 6736, + "train/total_loss": 0.1474032700061798 + }, + { + "entropy": 8.962126731872559, + "epoch": 0.6661063871860787, + "mean_token_accuracy": 0.754358172416687, + "num_tokens": 14245889.0, + "step": 6737, + "train/ce_loss": 4.2335354919487145e-06 + }, + { + "epoch": 0.6661063871860787, + "step": 6737, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6661063871860787, + "step": 6737, + "train/total_loss": 0.023437922820448875 + }, + { + "entropy": 8.75960922241211, + "epoch": 0.6662052600355942, + "mean_token_accuracy": 0.7316076159477234, + "num_tokens": 14251090.0, + "step": 6738, + "train/ce_loss": 0.414532333612442 + }, + { + "epoch": 0.6662052600355942, + "step": 6738, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6662052600355942, + "step": 6738, + "train/total_loss": 0.06489073485136032 + }, + { + "entropy": 9.062548637390137, + "epoch": 0.6663041328851097, + "mean_token_accuracy": 0.6617050170898438, + "num_tokens": 14256276.0, + "step": 6739, + "train/ce_loss": 0.9967235922813416 + }, + { + "epoch": 0.6663041328851097, + "step": 6739, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6663041328851097, + "step": 6739, + "train/total_loss": 0.1426411122083664 + }, + { + "epoch": 0.6664030057346253, + "grad_norm": 0.7395882606506348, + "learning_rate": 8.336300252188104e-06, + "loss": 0.1269, + "step": 6740 + }, + { + "entropy": 8.70880126953125, + "epoch": 0.6664030057346253, + "mean_token_accuracy": 0.7319004535675049, + "num_tokens": 14261661.0, + "step": 6740, + "train/ce_loss": 0.8813509941101074 + }, + { + "epoch": 0.6664030057346253, + "step": 6740, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6664030057346253, + "step": 6740, + "train/total_loss": 0.13110384345054626 + }, + { + "entropy": 8.729297637939453, + "epoch": 0.6665018785841408, + "mean_token_accuracy": 0.8273615837097168, + "num_tokens": 14267084.0, + "step": 6741, + "train/ce_loss": 0.31738346815109253 + }, + { + "epoch": 0.6665018785841408, + "step": 6741, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6665018785841408, + "step": 6741, + "train/total_loss": 0.05517584830522537 + }, + { + "entropy": 9.303884506225586, + "epoch": 0.6666007514336563, + "mean_token_accuracy": 0.721875011920929, + "num_tokens": 14272200.0, + "step": 6742, + "train/ce_loss": 2.1535837650299072 + }, + { + "epoch": 0.6666007514336563, + "step": 6742, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.6666007514336563, + "step": 6742, + "train/total_loss": 0.3520771265029907 + }, + { + "entropy": 9.582070350646973, + "epoch": 0.6666996242831719, + "mean_token_accuracy": 0.7639999985694885, + "num_tokens": 14277111.0, + "step": 6743, + "train/ce_loss": 0.5690571069717407 + }, + { + "epoch": 0.6666996242831719, + "step": 6743, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6666996242831719, + "step": 6743, + "train/total_loss": 0.08815571665763855 + }, + { + "entropy": 8.569799423217773, + "epoch": 0.6667984971326874, + "mean_token_accuracy": 0.6907216310501099, + "num_tokens": 14282536.0, + "step": 6744, + "train/ce_loss": 0.659394383430481 + }, + { + "epoch": 0.6667984971326874, + "step": 6744, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6667984971326874, + "step": 6744, + "train/total_loss": 0.12062694132328033 + }, + { + "entropy": 9.198576927185059, + "epoch": 0.6668973699822028, + "mean_token_accuracy": 0.6841155290603638, + "num_tokens": 14287538.0, + "step": 6745, + "train/ce_loss": 0.8136149644851685 + }, + { + "epoch": 0.6668973699822028, + "step": 6745, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6668973699822028, + "step": 6745, + "train/total_loss": 0.12433024495840073 + }, + { + "entropy": 9.104917526245117, + "epoch": 0.6669962428317184, + "mean_token_accuracy": 0.7232415676116943, + "num_tokens": 14292623.0, + "step": 6746, + "train/ce_loss": 1.055641531944275 + }, + { + "epoch": 0.6669962428317184, + "step": 6746, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6669962428317184, + "step": 6746, + "train/total_loss": 0.152439147233963 + }, + { + "entropy": 8.609912872314453, + "epoch": 0.6670951156812339, + "mean_token_accuracy": 0.7218468189239502, + "num_tokens": 14298041.0, + "step": 6747, + "train/ce_loss": 0.8968522548675537 + }, + { + "epoch": 0.6670951156812339, + "step": 6747, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.6670951156812339, + "step": 6747, + "train/total_loss": 0.20296648144721985 + }, + { + "entropy": 9.488840103149414, + "epoch": 0.6671939885307494, + "mean_token_accuracy": 0.6616702079772949, + "num_tokens": 14302963.0, + "step": 6748, + "train/ce_loss": 3.5787792205810547 + }, + { + "epoch": 0.6671939885307494, + "step": 6748, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6671939885307494, + "step": 6748, + "train/total_loss": 0.4281904399394989 + }, + { + "entropy": 8.588305473327637, + "epoch": 0.667292861380265, + "mean_token_accuracy": 0.7162446975708008, + "num_tokens": 14308357.0, + "step": 6749, + "train/ce_loss": 0.5696796178817749 + }, + { + "epoch": 0.667292861380265, + "step": 6749, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.667292861380265, + "step": 6749, + "train/total_loss": 0.10384295880794525 + }, + { + "entropy": 9.04929256439209, + "epoch": 0.6673917342297805, + "mean_token_accuracy": 0.687158465385437, + "num_tokens": 14313550.0, + "step": 6750, + "train/ce_loss": 1.2263870239257812 + }, + { + "epoch": 0.6673917342297805, + "step": 6750, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.6673917342297805, + "step": 6750, + "train/total_loss": 0.19685745239257812 + }, + { + "entropy": 8.779260635375977, + "epoch": 0.667490607079296, + "mean_token_accuracy": 0.7692307829856873, + "num_tokens": 14318931.0, + "step": 6751, + "train/ce_loss": 0.47107505798339844 + }, + { + "epoch": 0.667490607079296, + "step": 6751, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.667490607079296, + "step": 6751, + "train/total_loss": 0.1721075028181076 + }, + { + "entropy": 8.935807228088379, + "epoch": 0.6675894799288116, + "mean_token_accuracy": 0.7761989235877991, + "num_tokens": 14323949.0, + "step": 6752, + "train/ce_loss": 1.0147687196731567 + }, + { + "epoch": 0.6675894799288116, + "step": 6752, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6675894799288116, + "step": 6752, + "train/total_loss": 0.13663312792778015 + }, + { + "entropy": 8.626876831054688, + "epoch": 0.667688352778327, + "mean_token_accuracy": 0.7016574740409851, + "num_tokens": 14329367.0, + "step": 6753, + "train/ce_loss": 1.08138906955719 + }, + { + "epoch": 0.667688352778327, + "step": 6753, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.667688352778327, + "step": 6753, + "train/total_loss": 0.19798266887664795 + }, + { + "entropy": 8.819351196289062, + "epoch": 0.6677872256278425, + "mean_token_accuracy": 0.7192053198814392, + "num_tokens": 14334578.0, + "step": 6754, + "train/ce_loss": 2.0306320948293433e-05 + }, + { + "epoch": 0.6677872256278425, + "step": 6754, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6677872256278425, + "step": 6754, + "train/total_loss": 0.023439530283212662 + }, + { + "entropy": 8.370071411132812, + "epoch": 0.6678860984773581, + "mean_token_accuracy": 0.7449078559875488, + "num_tokens": 14340035.0, + "step": 6755, + "train/ce_loss": 0.7535091042518616 + }, + { + "epoch": 0.6678860984773581, + "step": 6755, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6678860984773581, + "step": 6755, + "train/total_loss": 0.13003841042518616 + }, + { + "entropy": 8.58914566040039, + "epoch": 0.6679849713268736, + "mean_token_accuracy": 0.7662061452865601, + "num_tokens": 14345496.0, + "step": 6756, + "train/ce_loss": 1.0949351787567139 + }, + { + "epoch": 0.6679849713268736, + "step": 6756, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6679849713268736, + "step": 6756, + "train/total_loss": 0.17980602383613586 + }, + { + "entropy": 9.329448699951172, + "epoch": 0.6680838441763892, + "mean_token_accuracy": 0.7410358786582947, + "num_tokens": 14350425.0, + "step": 6757, + "train/ce_loss": 0.9834825992584229 + }, + { + "epoch": 0.6680838441763892, + "step": 6757, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6680838441763892, + "step": 6757, + "train/total_loss": 0.13350450992584229 + }, + { + "entropy": 9.280977249145508, + "epoch": 0.6681827170259047, + "mean_token_accuracy": 0.7773787975311279, + "num_tokens": 14355429.0, + "step": 6758, + "train/ce_loss": 1.32757568359375 + }, + { + "epoch": 0.6681827170259047, + "step": 6758, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6681827170259047, + "step": 6758, + "train/total_loss": 0.17963257431983948 + }, + { + "entropy": 8.651885032653809, + "epoch": 0.6682815898754202, + "mean_token_accuracy": 0.7680995464324951, + "num_tokens": 14360828.0, + "step": 6759, + "train/ce_loss": 1.0034481287002563 + }, + { + "epoch": 0.6682815898754202, + "step": 6759, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6682815898754202, + "step": 6759, + "train/total_loss": 0.15112605690956116 + }, + { + "epoch": 0.6683804627249358, + "grad_norm": 0.6059948205947876, + "learning_rate": 8.331355387430154e-06, + "loss": 0.146, + "step": 6760 + }, + { + "entropy": 8.796469688415527, + "epoch": 0.6683804627249358, + "mean_token_accuracy": 0.7750906944274902, + "num_tokens": 14366073.0, + "step": 6760, + "train/ce_loss": 0.7929439544677734 + }, + { + "epoch": 0.6683804627249358, + "step": 6760, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.6683804627249358, + "step": 6760, + "train/total_loss": 0.10663814842700958 + }, + { + "entropy": 9.272623062133789, + "epoch": 0.6684793355744513, + "mean_token_accuracy": 0.7857142686843872, + "num_tokens": 14371091.0, + "step": 6761, + "train/ce_loss": 0.6518434286117554 + }, + { + "epoch": 0.6684793355744513, + "step": 6761, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6684793355744513, + "step": 6761, + "train/total_loss": 0.1237780973315239 + }, + { + "entropy": 9.03161907196045, + "epoch": 0.6685782084239668, + "mean_token_accuracy": 0.6916548609733582, + "num_tokens": 14376243.0, + "step": 6762, + "train/ce_loss": 1.4648661613464355 + }, + { + "epoch": 0.6685782084239668, + "step": 6762, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.6685782084239668, + "step": 6762, + "train/total_loss": 0.23242412507534027 + }, + { + "entropy": 8.697053909301758, + "epoch": 0.6686770812734824, + "mean_token_accuracy": 0.7248520851135254, + "num_tokens": 14381295.0, + "step": 6763, + "train/ce_loss": 1.4075952768325806 + }, + { + "epoch": 0.6686770812734824, + "step": 6763, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6686770812734824, + "step": 6763, + "train/total_loss": 0.18372827768325806 + }, + { + "entropy": 9.094472885131836, + "epoch": 0.6687759541229978, + "mean_token_accuracy": 0.7451274394989014, + "num_tokens": 14386423.0, + "step": 6764, + "train/ce_loss": 6.359361123031704e-06 + }, + { + "epoch": 0.6687759541229978, + "step": 6764, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6687759541229978, + "step": 6764, + "train/total_loss": 0.03515688702464104 + }, + { + "entropy": 9.747419357299805, + "epoch": 0.6688748269725133, + "mean_token_accuracy": 0.7487179636955261, + "num_tokens": 14391225.0, + "step": 6765, + "train/ce_loss": 7.125037427613279e-06 + }, + { + "epoch": 0.6688748269725133, + "step": 6765, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6688748269725133, + "step": 6765, + "train/total_loss": 0.0625007152557373 + }, + { + "entropy": 8.462725639343262, + "epoch": 0.6689736998220289, + "mean_token_accuracy": 0.7445887327194214, + "num_tokens": 14396633.0, + "step": 6766, + "train/ce_loss": 0.8765015602111816 + }, + { + "epoch": 0.6689736998220289, + "step": 6766, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.6689736998220289, + "step": 6766, + "train/total_loss": 0.1892126500606537 + }, + { + "entropy": 8.475957870483398, + "epoch": 0.6690725726715444, + "mean_token_accuracy": 0.743682324886322, + "num_tokens": 14401977.0, + "step": 6767, + "train/ce_loss": 0.8576280474662781 + }, + { + "epoch": 0.6690725726715444, + "step": 6767, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.6690725726715444, + "step": 6767, + "train/total_loss": 0.11310655623674393 + }, + { + "entropy": 9.403925895690918, + "epoch": 0.6691714455210599, + "mean_token_accuracy": 0.7103717923164368, + "num_tokens": 14406898.0, + "step": 6768, + "train/ce_loss": 2.383197069168091 + }, + { + "epoch": 0.6691714455210599, + "step": 6768, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6691714455210599, + "step": 6768, + "train/total_loss": 0.2812884449958801 + }, + { + "entropy": 8.560701370239258, + "epoch": 0.6692703183705755, + "mean_token_accuracy": 0.7568756937980652, + "num_tokens": 14412318.0, + "step": 6769, + "train/ce_loss": 0.4928951561450958 + }, + { + "epoch": 0.6692703183705755, + "step": 6769, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.6692703183705755, + "step": 6769, + "train/total_loss": 0.0649145171046257 + }, + { + "entropy": 9.07827377319336, + "epoch": 0.669369191220091, + "mean_token_accuracy": 0.7397260069847107, + "num_tokens": 14417363.0, + "step": 6770, + "train/ce_loss": 0.8147823214530945 + }, + { + "epoch": 0.669369191220091, + "step": 6770, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.669369191220091, + "step": 6770, + "train/total_loss": 0.11272823065519333 + }, + { + "entropy": 8.791725158691406, + "epoch": 0.6694680640696065, + "mean_token_accuracy": 0.699881374835968, + "num_tokens": 14422677.0, + "step": 6771, + "train/ce_loss": 1.2302576303482056 + }, + { + "epoch": 0.6694680640696065, + "step": 6771, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6694680640696065, + "step": 6771, + "train/total_loss": 0.1738070249557495 + }, + { + "entropy": 9.144021034240723, + "epoch": 0.6695669369191221, + "mean_token_accuracy": 0.7154605388641357, + "num_tokens": 14427707.0, + "step": 6772, + "train/ce_loss": 0.8733604550361633 + }, + { + "epoch": 0.6695669369191221, + "step": 6772, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6695669369191221, + "step": 6772, + "train/total_loss": 0.14592979848384857 + }, + { + "entropy": 9.025733947753906, + "epoch": 0.6696658097686375, + "mean_token_accuracy": 0.7476038336753845, + "num_tokens": 14432767.0, + "step": 6773, + "train/ce_loss": 0.9594969153404236 + }, + { + "epoch": 0.6696658097686375, + "step": 6773, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6696658097686375, + "step": 6773, + "train/total_loss": 0.1623559445142746 + }, + { + "entropy": 9.640586853027344, + "epoch": 0.669764682618153, + "mean_token_accuracy": 0.6961451172828674, + "num_tokens": 14437596.0, + "step": 6774, + "train/ce_loss": 1.403988242149353 + }, + { + "epoch": 0.669764682618153, + "step": 6774, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.669764682618153, + "step": 6774, + "train/total_loss": 0.19508633017539978 + }, + { + "entropy": 9.733133316040039, + "epoch": 0.6698635554676686, + "mean_token_accuracy": 0.7267904281616211, + "num_tokens": 14442353.0, + "step": 6775, + "train/ce_loss": 2.2351179122924805 + }, + { + "epoch": 0.6698635554676686, + "step": 6775, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.6698635554676686, + "step": 6775, + "train/total_loss": 0.34069931507110596 + }, + { + "entropy": 9.393617630004883, + "epoch": 0.6699624283171841, + "mean_token_accuracy": 0.689130425453186, + "num_tokens": 14447230.0, + "step": 6776, + "train/ce_loss": 6.242476956686005e-06 + }, + { + "epoch": 0.6699624283171841, + "step": 6776, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6699624283171841, + "step": 6776, + "train/total_loss": 0.05468812584877014 + }, + { + "entropy": 8.555450439453125, + "epoch": 0.6700613011666996, + "mean_token_accuracy": 0.7341463565826416, + "num_tokens": 14452488.0, + "step": 6777, + "train/ce_loss": 0.9241055846214294 + }, + { + "epoch": 0.6700613011666996, + "step": 6777, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6700613011666996, + "step": 6777, + "train/total_loss": 0.14709806442260742 + }, + { + "entropy": 8.696893692016602, + "epoch": 0.6701601740162152, + "mean_token_accuracy": 0.7157464027404785, + "num_tokens": 14457945.0, + "step": 6778, + "train/ce_loss": 0.891961932182312 + }, + { + "epoch": 0.6701601740162152, + "step": 6778, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6701601740162152, + "step": 6778, + "train/total_loss": 0.14388370513916016 + }, + { + "entropy": 8.653599739074707, + "epoch": 0.6702590468657307, + "mean_token_accuracy": 0.6867470145225525, + "num_tokens": 14463368.0, + "step": 6779, + "train/ce_loss": 0.9441421627998352 + }, + { + "epoch": 0.6702590468657307, + "step": 6779, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.6702590468657307, + "step": 6779, + "train/total_loss": 0.17644546926021576 + }, + { + "epoch": 0.6703579197152462, + "grad_norm": 0.7936645150184631, + "learning_rate": 8.326410522672206e-06, + "loss": 0.1447, + "step": 6780 + }, + { + "entropy": 8.938943862915039, + "epoch": 0.6703579197152462, + "mean_token_accuracy": 0.7451984882354736, + "num_tokens": 14468595.0, + "step": 6780, + "train/ce_loss": 0.7573527693748474 + }, + { + "epoch": 0.6703579197152462, + "step": 6780, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6703579197152462, + "step": 6780, + "train/total_loss": 0.11089152842760086 + }, + { + "entropy": 8.220525741577148, + "epoch": 0.6704567925647618, + "mean_token_accuracy": 0.7124413251876831, + "num_tokens": 14473916.0, + "step": 6781, + "train/ce_loss": 0.3497462272644043 + }, + { + "epoch": 0.6704567925647618, + "step": 6781, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6704567925647618, + "step": 6781, + "train/total_loss": 0.08575586974620819 + }, + { + "entropy": 8.594995498657227, + "epoch": 0.6705556654142772, + "mean_token_accuracy": 0.7618534564971924, + "num_tokens": 14479332.0, + "step": 6782, + "train/ce_loss": 0.8362644910812378 + }, + { + "epoch": 0.6705556654142772, + "step": 6782, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6705556654142772, + "step": 6782, + "train/total_loss": 0.13831394910812378 + }, + { + "entropy": 8.570377349853516, + "epoch": 0.6706545382637927, + "mean_token_accuracy": 0.7267876267433167, + "num_tokens": 14484733.0, + "step": 6783, + "train/ce_loss": 1.20476233959198 + }, + { + "epoch": 0.6706545382637927, + "step": 6783, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6706545382637927, + "step": 6783, + "train/total_loss": 0.18688249588012695 + }, + { + "entropy": 8.916112899780273, + "epoch": 0.6707534111133083, + "mean_token_accuracy": 0.7323369383811951, + "num_tokens": 14489945.0, + "step": 6784, + "train/ce_loss": 0.5036496520042419 + }, + { + "epoch": 0.6707534111133083, + "step": 6784, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6707534111133083, + "step": 6784, + "train/total_loss": 0.10114622116088867 + }, + { + "entropy": 8.686356544494629, + "epoch": 0.6708522839628238, + "mean_token_accuracy": 0.7149999737739563, + "num_tokens": 14495205.0, + "step": 6785, + "train/ce_loss": 0.35887444019317627 + }, + { + "epoch": 0.6708522839628238, + "step": 6785, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6708522839628238, + "step": 6785, + "train/total_loss": 0.0788561999797821 + }, + { + "entropy": 9.628941535949707, + "epoch": 0.6709511568123393, + "mean_token_accuracy": 0.7025641202926636, + "num_tokens": 14500025.0, + "step": 6786, + "train/ce_loss": 7.966723387653474e-06 + }, + { + "epoch": 0.6709511568123393, + "step": 6786, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6709511568123393, + "step": 6786, + "train/total_loss": 0.05859454721212387 + }, + { + "entropy": 8.674129486083984, + "epoch": 0.6710500296618549, + "mean_token_accuracy": 0.7329341173171997, + "num_tokens": 14505339.0, + "step": 6787, + "train/ce_loss": 0.7985280752182007 + }, + { + "epoch": 0.6710500296618549, + "step": 6787, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.6710500296618549, + "step": 6787, + "train/total_loss": 0.16188406944274902 + }, + { + "entropy": 8.701408386230469, + "epoch": 0.6711489025113704, + "mean_token_accuracy": 0.7905982732772827, + "num_tokens": 14510482.0, + "step": 6788, + "train/ce_loss": 0.5930907130241394 + }, + { + "epoch": 0.6711489025113704, + "step": 6788, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6711489025113704, + "step": 6788, + "train/total_loss": 0.11399657279253006 + }, + { + "entropy": 9.007550239562988, + "epoch": 0.6712477753608859, + "mean_token_accuracy": 0.7002801299095154, + "num_tokens": 14515652.0, + "step": 6789, + "train/ce_loss": 4.562507001537597e-06 + }, + { + "epoch": 0.6712477753608859, + "step": 6789, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6712477753608859, + "step": 6789, + "train/total_loss": 0.04687545448541641 + }, + { + "entropy": 9.074921607971191, + "epoch": 0.6713466482104015, + "mean_token_accuracy": 0.733846127986908, + "num_tokens": 14520731.0, + "step": 6790, + "train/ce_loss": 1.7275176048278809 + }, + { + "epoch": 0.6713466482104015, + "step": 6790, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6713466482104015, + "step": 6790, + "train/total_loss": 0.2391580194234848 + }, + { + "entropy": 8.621129035949707, + "epoch": 0.671445521059917, + "mean_token_accuracy": 0.7228145003318787, + "num_tokens": 14526142.0, + "step": 6791, + "train/ce_loss": 0.8564296960830688 + }, + { + "epoch": 0.671445521059917, + "step": 6791, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.671445521059917, + "step": 6791, + "train/total_loss": 0.1403304636478424 + }, + { + "entropy": 9.489485740661621, + "epoch": 0.6715443939094324, + "mean_token_accuracy": 0.7197580933570862, + "num_tokens": 14531241.0, + "step": 6792, + "train/ce_loss": 0.8726058602333069 + }, + { + "epoch": 0.6715443939094324, + "step": 6792, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.6715443939094324, + "step": 6792, + "train/total_loss": 0.19663558900356293 + }, + { + "entropy": 8.867376327514648, + "epoch": 0.671643266758948, + "mean_token_accuracy": 0.7956621050834656, + "num_tokens": 14536649.0, + "step": 6793, + "train/ce_loss": 0.6209725737571716 + }, + { + "epoch": 0.671643266758948, + "step": 6793, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.671643266758948, + "step": 6793, + "train/total_loss": 0.08944100886583328 + }, + { + "entropy": 8.421364784240723, + "epoch": 0.6717421396084635, + "mean_token_accuracy": 0.7338618636131287, + "num_tokens": 14542010.0, + "step": 6794, + "train/ce_loss": 0.8923219442367554 + }, + { + "epoch": 0.6717421396084635, + "step": 6794, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6717421396084635, + "step": 6794, + "train/total_loss": 0.1361072063446045 + }, + { + "entropy": 9.235627174377441, + "epoch": 0.671841012457979, + "mean_token_accuracy": 0.7441471815109253, + "num_tokens": 14547029.0, + "step": 6795, + "train/ce_loss": 5.730522843805375e-06 + }, + { + "epoch": 0.671841012457979, + "step": 6795, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.671841012457979, + "step": 6795, + "train/total_loss": 0.07031307369470596 + }, + { + "entropy": 8.904651641845703, + "epoch": 0.6719398853074946, + "mean_token_accuracy": 0.6785714030265808, + "num_tokens": 14552398.0, + "step": 6796, + "train/ce_loss": 0.8459588885307312 + }, + { + "epoch": 0.6719398853074946, + "step": 6796, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.6719398853074946, + "step": 6796, + "train/total_loss": 0.22522088885307312 + }, + { + "entropy": 9.159074783325195, + "epoch": 0.6720387581570101, + "mean_token_accuracy": 0.7045100927352905, + "num_tokens": 14557506.0, + "step": 6797, + "train/ce_loss": 1.0695310831069946 + }, + { + "epoch": 0.6720387581570101, + "step": 6797, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.6720387581570101, + "step": 6797, + "train/total_loss": 0.18117186427116394 + }, + { + "entropy": 8.746875762939453, + "epoch": 0.6721376310065256, + "mean_token_accuracy": 0.737726092338562, + "num_tokens": 14562784.0, + "step": 6798, + "train/ce_loss": 0.9978029131889343 + }, + { + "epoch": 0.6721376310065256, + "step": 6798, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6721376310065256, + "step": 6798, + "train/total_loss": 0.15056154131889343 + }, + { + "entropy": 8.332484245300293, + "epoch": 0.6722365038560412, + "mean_token_accuracy": 0.7210215926170349, + "num_tokens": 14568293.0, + "step": 6799, + "train/ce_loss": 0.6993699073791504 + }, + { + "epoch": 0.6722365038560412, + "step": 6799, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6722365038560412, + "step": 6799, + "train/total_loss": 0.12462449073791504 + }, + { + "epoch": 0.6723353767055567, + "grad_norm": 0.6852823495864868, + "learning_rate": 8.321465657914257e-06, + "loss": 0.1423, + "step": 6800 + }, + { + "entropy": 8.945253372192383, + "epoch": 0.6723353767055567, + "mean_token_accuracy": 0.7063882350921631, + "num_tokens": 14573574.0, + "step": 6800, + "train/ce_loss": 1.25028657913208 + }, + { + "epoch": 0.6723353767055567, + "step": 6800, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6723353767055567, + "step": 6800, + "train/total_loss": 0.16799740493297577 + }, + { + "entropy": 9.010234832763672, + "epoch": 0.6724342495550721, + "mean_token_accuracy": 0.7916666865348816, + "num_tokens": 14578724.0, + "step": 6801, + "train/ce_loss": 0.6684384346008301 + }, + { + "epoch": 0.6724342495550721, + "step": 6801, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6724342495550721, + "step": 6801, + "train/total_loss": 0.10200009495019913 + }, + { + "entropy": 9.172411918640137, + "epoch": 0.6725331224045877, + "mean_token_accuracy": 0.7311643958091736, + "num_tokens": 14583733.0, + "step": 6802, + "train/ce_loss": 5.024392521590926e-06 + }, + { + "epoch": 0.6725331224045877, + "step": 6802, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.6725331224045877, + "step": 6802, + "train/total_loss": 0.027344252914190292 + }, + { + "entropy": 8.602190017700195, + "epoch": 0.6726319952541032, + "mean_token_accuracy": 0.7981330156326294, + "num_tokens": 14589126.0, + "step": 6803, + "train/ce_loss": 0.8179534077644348 + }, + { + "epoch": 0.6726319952541032, + "step": 6803, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6726319952541032, + "step": 6803, + "train/total_loss": 0.140389084815979 + }, + { + "entropy": 8.877935409545898, + "epoch": 0.6727308681036187, + "mean_token_accuracy": 0.7146666646003723, + "num_tokens": 14594385.0, + "step": 6804, + "train/ce_loss": 1.2639179229736328 + }, + { + "epoch": 0.6727308681036187, + "step": 6804, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.6727308681036187, + "step": 6804, + "train/total_loss": 0.22795429825782776 + }, + { + "entropy": 8.61834716796875, + "epoch": 0.6728297409531343, + "mean_token_accuracy": 0.7352246046066284, + "num_tokens": 14599790.0, + "step": 6805, + "train/ce_loss": 0.7303071618080139 + }, + { + "epoch": 0.6728297409531343, + "step": 6805, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.6728297409531343, + "step": 6805, + "train/total_loss": 0.14724946022033691 + }, + { + "entropy": 8.895477294921875, + "epoch": 0.6729286138026498, + "mean_token_accuracy": 0.8366477489471436, + "num_tokens": 14604943.0, + "step": 6806, + "train/ce_loss": 0.6171497702598572 + }, + { + "epoch": 0.6729286138026498, + "step": 6806, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6729286138026498, + "step": 6806, + "train/total_loss": 0.12421497702598572 + }, + { + "entropy": 8.971907615661621, + "epoch": 0.6730274866521653, + "mean_token_accuracy": 0.7116212248802185, + "num_tokens": 14610104.0, + "step": 6807, + "train/ce_loss": 1.6657736523484346e-06 + }, + { + "epoch": 0.6730274866521653, + "step": 6807, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6730274866521653, + "step": 6807, + "train/total_loss": 0.05468766763806343 + }, + { + "entropy": 8.885343551635742, + "epoch": 0.6731263595016809, + "mean_token_accuracy": 0.7256515622138977, + "num_tokens": 14615334.0, + "step": 6808, + "train/ce_loss": 0.7519450783729553 + }, + { + "epoch": 0.6731263595016809, + "step": 6808, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6731263595016809, + "step": 6808, + "train/total_loss": 0.15331950783729553 + }, + { + "entropy": 8.883771896362305, + "epoch": 0.6732252323511964, + "mean_token_accuracy": 0.7266355156898499, + "num_tokens": 14620641.0, + "step": 6809, + "train/ce_loss": 0.9417589902877808 + }, + { + "epoch": 0.6732252323511964, + "step": 6809, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6732252323511964, + "step": 6809, + "train/total_loss": 0.14886340498924255 + }, + { + "entropy": 9.174736022949219, + "epoch": 0.6733241052007118, + "mean_token_accuracy": 0.7337837815284729, + "num_tokens": 14625817.0, + "step": 6810, + "train/ce_loss": 0.47125375270843506 + }, + { + "epoch": 0.6733241052007118, + "step": 6810, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6733241052007118, + "step": 6810, + "train/total_loss": 0.10962537676095963 + }, + { + "entropy": 8.47231674194336, + "epoch": 0.6734229780502274, + "mean_token_accuracy": 0.7186098694801331, + "num_tokens": 14631156.0, + "step": 6811, + "train/ce_loss": 1.0583274364471436 + }, + { + "epoch": 0.6734229780502274, + "step": 6811, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6734229780502274, + "step": 6811, + "train/total_loss": 0.1488015055656433 + }, + { + "entropy": 8.644718170166016, + "epoch": 0.6735218508997429, + "mean_token_accuracy": 0.7537961006164551, + "num_tokens": 14636570.0, + "step": 6812, + "train/ce_loss": 0.8156536221504211 + }, + { + "epoch": 0.6735218508997429, + "step": 6812, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6735218508997429, + "step": 6812, + "train/total_loss": 0.10109661519527435 + }, + { + "entropy": 9.330245971679688, + "epoch": 0.6736207237492584, + "mean_token_accuracy": 0.7583333253860474, + "num_tokens": 14641591.0, + "step": 6813, + "train/ce_loss": 1.5970101356506348 + }, + { + "epoch": 0.6736207237492584, + "step": 6813, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.6736207237492584, + "step": 6813, + "train/total_loss": 0.18704476952552795 + }, + { + "entropy": 8.915796279907227, + "epoch": 0.673719596598774, + "mean_token_accuracy": 0.6687578558921814, + "num_tokens": 14646856.0, + "step": 6814, + "train/ce_loss": 0.5850891470909119 + }, + { + "epoch": 0.673719596598774, + "step": 6814, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.673719596598774, + "step": 6814, + "train/total_loss": 0.14054016768932343 + }, + { + "entropy": 8.369819641113281, + "epoch": 0.6738184694482895, + "mean_token_accuracy": 0.7742214798927307, + "num_tokens": 14652496.0, + "step": 6815, + "train/ce_loss": 0.9914873838424683 + }, + { + "epoch": 0.6738184694482895, + "step": 6815, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6738184694482895, + "step": 6815, + "train/total_loss": 0.14993000030517578 + }, + { + "entropy": 8.665181159973145, + "epoch": 0.673917342297805, + "mean_token_accuracy": 0.7642857432365417, + "num_tokens": 14657829.0, + "step": 6816, + "train/ce_loss": 1.0319336652755737 + }, + { + "epoch": 0.673917342297805, + "step": 6816, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.673917342297805, + "step": 6816, + "train/total_loss": 0.16959962248802185 + }, + { + "entropy": 8.193489074707031, + "epoch": 0.6740162151473206, + "mean_token_accuracy": 0.7354085445404053, + "num_tokens": 14663450.0, + "step": 6817, + "train/ce_loss": 1.1664539575576782 + }, + { + "epoch": 0.6740162151473206, + "step": 6817, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.6740162151473206, + "step": 6817, + "train/total_loss": 0.21430164575576782 + }, + { + "entropy": 8.494543075561523, + "epoch": 0.6741150879968361, + "mean_token_accuracy": 0.7384230494499207, + "num_tokens": 14668760.0, + "step": 6818, + "train/ce_loss": 0.7610848546028137 + }, + { + "epoch": 0.6741150879968361, + "step": 6818, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6741150879968361, + "step": 6818, + "train/total_loss": 0.14642098546028137 + }, + { + "entropy": 9.040728569030762, + "epoch": 0.6742139608463515, + "mean_token_accuracy": 0.7747092843055725, + "num_tokens": 14673904.0, + "step": 6819, + "train/ce_loss": 0.5640541911125183 + }, + { + "epoch": 0.6742139608463515, + "step": 6819, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6742139608463515, + "step": 6819, + "train/total_loss": 0.10328042507171631 + }, + { + "epoch": 0.6743128336958671, + "grad_norm": 0.661296546459198, + "learning_rate": 8.316520793156307e-06, + "loss": 0.1371, + "step": 6820 + }, + { + "entropy": 9.017964363098145, + "epoch": 0.6743128336958671, + "mean_token_accuracy": 0.7128129601478577, + "num_tokens": 14679041.0, + "step": 6820, + "train/ce_loss": 0.4595761299133301 + }, + { + "epoch": 0.6743128336958671, + "step": 6820, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.6743128336958671, + "step": 6820, + "train/total_loss": 0.13970761001110077 + }, + { + "entropy": 9.40074348449707, + "epoch": 0.6744117065453826, + "mean_token_accuracy": 0.7176684737205505, + "num_tokens": 14683965.0, + "step": 6821, + "train/ce_loss": 4.357888883532723e-06 + }, + { + "epoch": 0.6744117065453826, + "step": 6821, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6744117065453826, + "step": 6821, + "train/total_loss": 0.02343793585896492 + }, + { + "entropy": 8.930868148803711, + "epoch": 0.6745105793948981, + "mean_token_accuracy": 0.7690058350563049, + "num_tokens": 14689138.0, + "step": 6822, + "train/ce_loss": 3.26072949974332e-06 + }, + { + "epoch": 0.6745105793948981, + "step": 6822, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6745105793948981, + "step": 6822, + "train/total_loss": 0.039062827825546265 + }, + { + "entropy": 8.487780570983887, + "epoch": 0.6746094522444137, + "mean_token_accuracy": 0.7112582921981812, + "num_tokens": 14694351.0, + "step": 6823, + "train/ce_loss": 1.1790664196014404 + }, + { + "epoch": 0.6746094522444137, + "step": 6823, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6746094522444137, + "step": 6823, + "train/total_loss": 0.16478164494037628 + }, + { + "entropy": 8.819021224975586, + "epoch": 0.6747083250939292, + "mean_token_accuracy": 0.7446556091308594, + "num_tokens": 14699733.0, + "step": 6824, + "train/ce_loss": 0.4874028265476227 + }, + { + "epoch": 0.6747083250939292, + "step": 6824, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6747083250939292, + "step": 6824, + "train/total_loss": 0.11124028265476227 + }, + { + "entropy": 8.674089431762695, + "epoch": 0.6748071979434447, + "mean_token_accuracy": 0.7688171863555908, + "num_tokens": 14704947.0, + "step": 6825, + "train/ce_loss": 0.40436989068984985 + }, + { + "epoch": 0.6748071979434447, + "step": 6825, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6748071979434447, + "step": 6825, + "train/total_loss": 0.1029369905591011 + }, + { + "entropy": 9.120749473571777, + "epoch": 0.6749060707929603, + "mean_token_accuracy": 0.7311320900917053, + "num_tokens": 14709988.0, + "step": 6826, + "train/ce_loss": 1.4153249263763428 + }, + { + "epoch": 0.6749060707929603, + "step": 6826, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6749060707929603, + "step": 6826, + "train/total_loss": 0.19231374561786652 + }, + { + "entropy": 8.94398307800293, + "epoch": 0.6750049436424758, + "mean_token_accuracy": 0.7023959755897522, + "num_tokens": 14715260.0, + "step": 6827, + "train/ce_loss": 0.6066616773605347 + }, + { + "epoch": 0.6750049436424758, + "step": 6827, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6750049436424758, + "step": 6827, + "train/total_loss": 0.09972867369651794 + }, + { + "entropy": 8.429058074951172, + "epoch": 0.6751038164919912, + "mean_token_accuracy": 0.7477295398712158, + "num_tokens": 14720683.0, + "step": 6828, + "train/ce_loss": 1.1365456581115723 + }, + { + "epoch": 0.6751038164919912, + "step": 6828, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.6751038164919912, + "step": 6828, + "train/total_loss": 0.24256081879138947 + }, + { + "entropy": 8.282581329345703, + "epoch": 0.6752026893415068, + "mean_token_accuracy": 0.7324913740158081, + "num_tokens": 14726027.0, + "step": 6829, + "train/ce_loss": 1.027675747871399 + }, + { + "epoch": 0.6752026893415068, + "step": 6829, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.6752026893415068, + "step": 6829, + "train/total_loss": 0.19261133670806885 + }, + { + "entropy": 8.828989028930664, + "epoch": 0.6753015621910223, + "mean_token_accuracy": 0.7523696422576904, + "num_tokens": 14731516.0, + "step": 6830, + "train/ce_loss": 0.8268953561782837 + }, + { + "epoch": 0.6753015621910223, + "step": 6830, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6753015621910223, + "step": 6830, + "train/total_loss": 0.1530020385980606 + }, + { + "entropy": 9.216299057006836, + "epoch": 0.6754004350405378, + "mean_token_accuracy": 0.8033794164657593, + "num_tokens": 14736587.0, + "step": 6831, + "train/ce_loss": 0.6765879988670349 + }, + { + "epoch": 0.6754004350405378, + "step": 6831, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.6754004350405378, + "step": 6831, + "train/total_loss": 0.15359631180763245 + }, + { + "entropy": 9.358110427856445, + "epoch": 0.6754993078900534, + "mean_token_accuracy": 0.719298243522644, + "num_tokens": 14741464.0, + "step": 6832, + "train/ce_loss": 2.968577064166311e-06 + }, + { + "epoch": 0.6754993078900534, + "step": 6832, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6754993078900534, + "step": 6832, + "train/total_loss": 0.04296904802322388 + }, + { + "entropy": 9.096577644348145, + "epoch": 0.6755981807395689, + "mean_token_accuracy": 0.7557522058486938, + "num_tokens": 14746443.0, + "step": 6833, + "train/ce_loss": 1.0856497287750244 + }, + { + "epoch": 0.6755981807395689, + "step": 6833, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6755981807395689, + "step": 6833, + "train/total_loss": 0.15543997287750244 + }, + { + "entropy": 8.473155975341797, + "epoch": 0.6756970535890844, + "mean_token_accuracy": 0.7213459610939026, + "num_tokens": 14751898.0, + "step": 6834, + "train/ce_loss": 1.297159194946289 + }, + { + "epoch": 0.6756970535890844, + "step": 6834, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6756970535890844, + "step": 6834, + "train/total_loss": 0.1961221694946289 + }, + { + "entropy": 9.074554443359375, + "epoch": 0.6757959264386, + "mean_token_accuracy": 0.7377938628196716, + "num_tokens": 14756840.0, + "step": 6835, + "train/ce_loss": 3.831556114164414e-06 + }, + { + "epoch": 0.6757959264386, + "step": 6835, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6757959264386, + "step": 6835, + "train/total_loss": 0.03906288370490074 + }, + { + "entropy": 8.668545722961426, + "epoch": 0.6758947992881155, + "mean_token_accuracy": 0.7134703397750854, + "num_tokens": 14762208.0, + "step": 6836, + "train/ce_loss": 0.8337442874908447 + }, + { + "epoch": 0.6758947992881155, + "step": 6836, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6758947992881155, + "step": 6836, + "train/total_loss": 0.14587444067001343 + }, + { + "entropy": 8.646568298339844, + "epoch": 0.675993672137631, + "mean_token_accuracy": 0.7409909963607788, + "num_tokens": 14767590.0, + "step": 6837, + "train/ce_loss": 0.5781188011169434 + }, + { + "epoch": 0.675993672137631, + "step": 6837, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.675993672137631, + "step": 6837, + "train/total_loss": 0.10468688607215881 + }, + { + "entropy": 8.695724487304688, + "epoch": 0.6760925449871465, + "mean_token_accuracy": 0.7177321910858154, + "num_tokens": 14772971.0, + "step": 6838, + "train/ce_loss": 0.9211334586143494 + }, + { + "epoch": 0.6760925449871465, + "step": 6838, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6760925449871465, + "step": 6838, + "train/total_loss": 0.13508209586143494 + }, + { + "entropy": 8.353857040405273, + "epoch": 0.676191417836662, + "mean_token_accuracy": 0.7373448014259338, + "num_tokens": 14778468.0, + "step": 6839, + "train/ce_loss": 1.2006250619888306 + }, + { + "epoch": 0.676191417836662, + "step": 6839, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.676191417836662, + "step": 6839, + "train/total_loss": 0.17865625023841858 + }, + { + "epoch": 0.6762902906861776, + "grad_norm": 0.6604064702987671, + "learning_rate": 8.311575928398358e-06, + "loss": 0.1354, + "step": 6840 + }, + { + "entropy": 8.77322006225586, + "epoch": 0.6762902906861776, + "mean_token_accuracy": 0.8018134832382202, + "num_tokens": 14783842.0, + "step": 6840, + "train/ce_loss": 1.3195806741714478 + }, + { + "epoch": 0.6762902906861776, + "step": 6840, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6762902906861776, + "step": 6840, + "train/total_loss": 0.19445806741714478 + }, + { + "entropy": 9.559065818786621, + "epoch": 0.6763891635356931, + "mean_token_accuracy": 0.7227488160133362, + "num_tokens": 14788688.0, + "step": 6841, + "train/ce_loss": 0.7800194621086121 + }, + { + "epoch": 0.6763891635356931, + "step": 6841, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6763891635356931, + "step": 6841, + "train/total_loss": 0.1248769462108612 + }, + { + "entropy": 9.368246078491211, + "epoch": 0.6764880363852086, + "mean_token_accuracy": 0.7395498156547546, + "num_tokens": 14793765.0, + "step": 6842, + "train/ce_loss": 1.6094815731048584 + }, + { + "epoch": 0.6764880363852086, + "step": 6842, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6764880363852086, + "step": 6842, + "train/total_loss": 0.22344815731048584 + }, + { + "entropy": 8.946149826049805, + "epoch": 0.6765869092347242, + "mean_token_accuracy": 0.7457886934280396, + "num_tokens": 14798847.0, + "step": 6843, + "train/ce_loss": 0.8735904693603516 + }, + { + "epoch": 0.6765869092347242, + "step": 6843, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6765869092347242, + "step": 6843, + "train/total_loss": 0.14985904097557068 + }, + { + "entropy": 9.335906982421875, + "epoch": 0.6766857820842397, + "mean_token_accuracy": 0.7568027377128601, + "num_tokens": 14803898.0, + "step": 6844, + "train/ce_loss": 4.403523234941531e-06 + }, + { + "epoch": 0.6766857820842397, + "step": 6844, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6766857820842397, + "step": 6844, + "train/total_loss": 0.02343793958425522 + }, + { + "entropy": 8.541854858398438, + "epoch": 0.6767846549337552, + "mean_token_accuracy": 0.6717724204063416, + "num_tokens": 14809218.0, + "step": 6845, + "train/ce_loss": 0.6349949240684509 + }, + { + "epoch": 0.6767846549337552, + "step": 6845, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6767846549337552, + "step": 6845, + "train/total_loss": 0.10646824538707733 + }, + { + "entropy": 8.886573791503906, + "epoch": 0.6768835277832708, + "mean_token_accuracy": 0.7605633735656738, + "num_tokens": 14814368.0, + "step": 6846, + "train/ce_loss": 1.423653244972229 + }, + { + "epoch": 0.6768835277832708, + "step": 6846, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.6768835277832708, + "step": 6846, + "train/total_loss": 0.27517783641815186 + }, + { + "entropy": 9.543976783752441, + "epoch": 0.6769824006327863, + "mean_token_accuracy": 0.7192575335502625, + "num_tokens": 14819205.0, + "step": 6847, + "train/ce_loss": 9.700500413600821e-06 + }, + { + "epoch": 0.6769824006327863, + "step": 6847, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6769824006327863, + "step": 6847, + "train/total_loss": 0.0585947185754776 + }, + { + "entropy": 8.52188491821289, + "epoch": 0.6770812734823017, + "mean_token_accuracy": 0.7841945290565491, + "num_tokens": 14824616.0, + "step": 6848, + "train/ce_loss": 1.2499181032180786 + }, + { + "epoch": 0.6770812734823017, + "step": 6848, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6770812734823017, + "step": 6848, + "train/total_loss": 0.19530430436134338 + }, + { + "entropy": 9.283021926879883, + "epoch": 0.6771801463318173, + "mean_token_accuracy": 0.7885714173316956, + "num_tokens": 14829594.0, + "step": 6849, + "train/ce_loss": 0.9894952178001404 + }, + { + "epoch": 0.6771801463318173, + "step": 6849, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.6771801463318173, + "step": 6849, + "train/total_loss": 0.18098077178001404 + }, + { + "entropy": 8.678661346435547, + "epoch": 0.6772790191813328, + "mean_token_accuracy": 0.7326086759567261, + "num_tokens": 14834961.0, + "step": 6850, + "train/ce_loss": 0.7352795004844666 + }, + { + "epoch": 0.6772790191813328, + "step": 6850, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6772790191813328, + "step": 6850, + "train/total_loss": 0.09696545451879501 + }, + { + "entropy": 9.54398250579834, + "epoch": 0.6773778920308483, + "mean_token_accuracy": 0.7697674632072449, + "num_tokens": 14839716.0, + "step": 6851, + "train/ce_loss": 3.1513832254859153e-06 + }, + { + "epoch": 0.6773778920308483, + "step": 6851, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6773778920308483, + "step": 6851, + "train/total_loss": 0.04687531664967537 + }, + { + "entropy": 8.723682403564453, + "epoch": 0.6774767648803639, + "mean_token_accuracy": 0.7455012798309326, + "num_tokens": 14844916.0, + "step": 6852, + "train/ce_loss": 0.7275230288505554 + }, + { + "epoch": 0.6774767648803639, + "step": 6852, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6774767648803639, + "step": 6852, + "train/total_loss": 0.13915854692459106 + }, + { + "entropy": 8.776834487915039, + "epoch": 0.6775756377298794, + "mean_token_accuracy": 0.7715404629707336, + "num_tokens": 14850185.0, + "step": 6853, + "train/ce_loss": 0.847054123878479 + }, + { + "epoch": 0.6775756377298794, + "step": 6853, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.6775756377298794, + "step": 6853, + "train/total_loss": 0.1589241623878479 + }, + { + "entropy": 9.03692626953125, + "epoch": 0.6776745105793949, + "mean_token_accuracy": 0.7419354915618896, + "num_tokens": 14855253.0, + "step": 6854, + "train/ce_loss": 5.175889782549348e-06 + }, + { + "epoch": 0.6776745105793949, + "step": 6854, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6776745105793949, + "step": 6854, + "train/total_loss": 0.054688017815351486 + }, + { + "entropy": 9.113094329833984, + "epoch": 0.6777733834289105, + "mean_token_accuracy": 0.7496522665023804, + "num_tokens": 14860449.0, + "step": 6855, + "train/ce_loss": 6.646732799708843e-06 + }, + { + "epoch": 0.6777733834289105, + "step": 6855, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.6777733834289105, + "step": 6855, + "train/total_loss": 0.08593816310167313 + }, + { + "entropy": 8.981744766235352, + "epoch": 0.677872256278426, + "mean_token_accuracy": 0.7337748408317566, + "num_tokens": 14865638.0, + "step": 6856, + "train/ce_loss": 1.427980661392212 + }, + { + "epoch": 0.677872256278426, + "step": 6856, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.677872256278426, + "step": 6856, + "train/total_loss": 0.2131105661392212 + }, + { + "entropy": 8.741118431091309, + "epoch": 0.6779711291279414, + "mean_token_accuracy": 0.7225950956344604, + "num_tokens": 14870977.0, + "step": 6857, + "train/ce_loss": 0.7294755578041077 + }, + { + "epoch": 0.6779711291279414, + "step": 6857, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.6779711291279414, + "step": 6857, + "train/total_loss": 0.14716631174087524 + }, + { + "entropy": 8.609580039978027, + "epoch": 0.678070001977457, + "mean_token_accuracy": 0.7299578189849854, + "num_tokens": 14876388.0, + "step": 6858, + "train/ce_loss": 0.8117855787277222 + }, + { + "epoch": 0.678070001977457, + "step": 6858, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.678070001977457, + "step": 6858, + "train/total_loss": 0.13977231085300446 + }, + { + "entropy": 8.76502799987793, + "epoch": 0.6781688748269725, + "mean_token_accuracy": 0.6953846216201782, + "num_tokens": 14881842.0, + "step": 6859, + "train/ce_loss": 0.9069038033485413 + }, + { + "epoch": 0.6781688748269725, + "step": 6859, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.6781688748269725, + "step": 6859, + "train/total_loss": 0.18053412437438965 + }, + { + "epoch": 0.678267747676488, + "grad_norm": 0.7154859304428101, + "learning_rate": 8.30663106364041e-06, + "loss": 0.1401, + "step": 6860 + }, + { + "entropy": 9.234578132629395, + "epoch": 0.678267747676488, + "mean_token_accuracy": 0.7308319807052612, + "num_tokens": 14886916.0, + "step": 6860, + "train/ce_loss": 1.1456135511398315 + }, + { + "epoch": 0.678267747676488, + "step": 6860, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.678267747676488, + "step": 6860, + "train/total_loss": 0.22784259915351868 + }, + { + "entropy": 9.708675384521484, + "epoch": 0.6783666205260036, + "mean_token_accuracy": 0.6778523325920105, + "num_tokens": 14891772.0, + "step": 6861, + "train/ce_loss": 2.165543556213379 + }, + { + "epoch": 0.6783666205260036, + "step": 6861, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.6783666205260036, + "step": 6861, + "train/total_loss": 0.32592934370040894 + }, + { + "entropy": 8.549935340881348, + "epoch": 0.6784654933755191, + "mean_token_accuracy": 0.7832335233688354, + "num_tokens": 14897086.0, + "step": 6862, + "train/ce_loss": 0.6527609825134277 + }, + { + "epoch": 0.6784654933755191, + "step": 6862, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6784654933755191, + "step": 6862, + "train/total_loss": 0.10433860123157501 + }, + { + "entropy": 9.58963394165039, + "epoch": 0.6785643662250346, + "mean_token_accuracy": 0.8432835936546326, + "num_tokens": 14901904.0, + "step": 6863, + "train/ce_loss": 5.530352154892171e-06 + }, + { + "epoch": 0.6785643662250346, + "step": 6863, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6785643662250346, + "step": 6863, + "train/total_loss": 0.03906305134296417 + }, + { + "entropy": 9.278046607971191, + "epoch": 0.6786632390745502, + "mean_token_accuracy": 0.7939698696136475, + "num_tokens": 14906965.0, + "step": 6864, + "train/ce_loss": 1.0491366386413574 + }, + { + "epoch": 0.6786632390745502, + "step": 6864, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6786632390745502, + "step": 6864, + "train/total_loss": 0.17522616684436798 + }, + { + "entropy": 8.817192077636719, + "epoch": 0.6787621119240657, + "mean_token_accuracy": 0.7254408001899719, + "num_tokens": 14912234.0, + "step": 6865, + "train/ce_loss": 0.6184538006782532 + }, + { + "epoch": 0.6787621119240657, + "step": 6865, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6787621119240657, + "step": 6865, + "train/total_loss": 0.10872037708759308 + }, + { + "entropy": 8.771540641784668, + "epoch": 0.6788609847735811, + "mean_token_accuracy": 0.6845729947090149, + "num_tokens": 14917446.0, + "step": 6866, + "train/ce_loss": 2.3287134170532227 + }, + { + "epoch": 0.6788609847735811, + "step": 6866, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6788609847735811, + "step": 6866, + "train/total_loss": 0.2758401036262512 + }, + { + "entropy": 8.880404472351074, + "epoch": 0.6789598576230967, + "mean_token_accuracy": 0.768757700920105, + "num_tokens": 14922702.0, + "step": 6867, + "train/ce_loss": 0.3743933141231537 + }, + { + "epoch": 0.6789598576230967, + "step": 6867, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6789598576230967, + "step": 6867, + "train/total_loss": 0.08431433141231537 + }, + { + "entropy": 8.87601375579834, + "epoch": 0.6790587304726122, + "mean_token_accuracy": 0.7733989953994751, + "num_tokens": 14928006.0, + "step": 6868, + "train/ce_loss": 0.48566004633903503 + }, + { + "epoch": 0.6790587304726122, + "step": 6868, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.6790587304726122, + "step": 6868, + "train/total_loss": 0.06419100612401962 + }, + { + "entropy": 8.96407413482666, + "epoch": 0.6791576033221277, + "mean_token_accuracy": 0.7106825113296509, + "num_tokens": 14933135.0, + "step": 6869, + "train/ce_loss": 1.0607415437698364 + }, + { + "epoch": 0.6791576033221277, + "step": 6869, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6791576033221277, + "step": 6869, + "train/total_loss": 0.14904290437698364 + }, + { + "entropy": 9.414806365966797, + "epoch": 0.6792564761716433, + "mean_token_accuracy": 0.7077175974845886, + "num_tokens": 14938181.0, + "step": 6870, + "train/ce_loss": 1.7140169143676758 + }, + { + "epoch": 0.6792564761716433, + "step": 6870, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.6792564761716433, + "step": 6870, + "train/total_loss": 0.2612454295158386 + }, + { + "entropy": 8.594289779663086, + "epoch": 0.6793553490211588, + "mean_token_accuracy": 0.7311475276947021, + "num_tokens": 14943598.0, + "step": 6871, + "train/ce_loss": 1.0748900175094604 + }, + { + "epoch": 0.6793553490211588, + "step": 6871, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.6793553490211588, + "step": 6871, + "train/total_loss": 0.13483275473117828 + }, + { + "entropy": 9.227981567382812, + "epoch": 0.6794542218706743, + "mean_token_accuracy": 0.6536585092544556, + "num_tokens": 14948691.0, + "step": 6872, + "train/ce_loss": 1.4994652701716404e-06 + }, + { + "epoch": 0.6794542218706743, + "step": 6872, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6794542218706743, + "step": 6872, + "train/total_loss": 0.03515639901161194 + }, + { + "entropy": 8.879727363586426, + "epoch": 0.6795530947201899, + "mean_token_accuracy": 0.7410714030265808, + "num_tokens": 14953970.0, + "step": 6873, + "train/ce_loss": 0.49233755469322205 + }, + { + "epoch": 0.6795530947201899, + "step": 6873, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6795530947201899, + "step": 6873, + "train/total_loss": 0.08439000695943832 + }, + { + "entropy": 8.354992866516113, + "epoch": 0.6796519675697054, + "mean_token_accuracy": 0.7238709926605225, + "num_tokens": 14959246.0, + "step": 6874, + "train/ce_loss": 1.4569038152694702 + }, + { + "epoch": 0.6796519675697054, + "step": 6874, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.6796519675697054, + "step": 6874, + "train/total_loss": 0.255065381526947 + }, + { + "entropy": 8.69202995300293, + "epoch": 0.6797508404192208, + "mean_token_accuracy": 0.643468976020813, + "num_tokens": 14964645.0, + "step": 6875, + "train/ce_loss": 1.3519893884658813 + }, + { + "epoch": 0.6797508404192208, + "step": 6875, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.6797508404192208, + "step": 6875, + "train/total_loss": 0.2289489358663559 + }, + { + "entropy": 8.685219764709473, + "epoch": 0.6798497132687364, + "mean_token_accuracy": 0.8020954728126526, + "num_tokens": 14969947.0, + "step": 6876, + "train/ce_loss": 0.3879144787788391 + }, + { + "epoch": 0.6798497132687364, + "step": 6876, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6798497132687364, + "step": 6876, + "train/total_loss": 0.08566644787788391 + }, + { + "entropy": 8.884944915771484, + "epoch": 0.6799485861182519, + "mean_token_accuracy": 0.7363494634628296, + "num_tokens": 14975010.0, + "step": 6877, + "train/ce_loss": 1.5219731330871582 + }, + { + "epoch": 0.6799485861182519, + "step": 6877, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6799485861182519, + "step": 6877, + "train/total_loss": 0.23032231628894806 + }, + { + "entropy": 9.34930419921875, + "epoch": 0.6800474589677674, + "mean_token_accuracy": 0.7857142686843872, + "num_tokens": 14980055.0, + "step": 6878, + "train/ce_loss": 6.238361947907833e-06 + }, + { + "epoch": 0.6800474589677674, + "step": 6878, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.6800474589677674, + "step": 6878, + "train/total_loss": 0.11718812584877014 + }, + { + "entropy": 8.546012878417969, + "epoch": 0.680146331817283, + "mean_token_accuracy": 0.6915477514266968, + "num_tokens": 14985474.0, + "step": 6879, + "train/ce_loss": 1.1794265508651733 + }, + { + "epoch": 0.680146331817283, + "step": 6879, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.680146331817283, + "step": 6879, + "train/total_loss": 0.1687239110469818 + }, + { + "epoch": 0.6802452046667985, + "grad_norm": 0.7139967083930969, + "learning_rate": 8.30168619888246e-06, + "loss": 0.1433, + "step": 6880 + }, + { + "entropy": 8.262923240661621, + "epoch": 0.6802452046667985, + "mean_token_accuracy": 0.7223360538482666, + "num_tokens": 14990985.0, + "step": 6880, + "train/ce_loss": 0.9992165565490723 + }, + { + "epoch": 0.6802452046667985, + "step": 6880, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6802452046667985, + "step": 6880, + "train/total_loss": 0.15851540863513947 + }, + { + "entropy": 8.59264087677002, + "epoch": 0.680344077516314, + "mean_token_accuracy": 0.7144362330436707, + "num_tokens": 14996403.0, + "step": 6881, + "train/ce_loss": 0.867514967918396 + }, + { + "epoch": 0.680344077516314, + "step": 6881, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.680344077516314, + "step": 6881, + "train/total_loss": 0.12190774828195572 + }, + { + "entropy": 8.539525032043457, + "epoch": 0.6804429503658296, + "mean_token_accuracy": 0.7626774907112122, + "num_tokens": 15001873.0, + "step": 6882, + "train/ce_loss": 0.5471914410591125 + }, + { + "epoch": 0.6804429503658296, + "step": 6882, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6804429503658296, + "step": 6882, + "train/total_loss": 0.07425040006637573 + }, + { + "entropy": 8.81664752960205, + "epoch": 0.6805418232153451, + "mean_token_accuracy": 0.7230576276779175, + "num_tokens": 15007133.0, + "step": 6883, + "train/ce_loss": 0.8308492302894592 + }, + { + "epoch": 0.6805418232153451, + "step": 6883, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.6805418232153451, + "step": 6883, + "train/total_loss": 0.17292867600917816 + }, + { + "entropy": 8.927459716796875, + "epoch": 0.6806406960648606, + "mean_token_accuracy": 0.7009803652763367, + "num_tokens": 15012392.0, + "step": 6884, + "train/ce_loss": 0.5564685463905334 + }, + { + "epoch": 0.6806406960648606, + "step": 6884, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6806406960648606, + "step": 6884, + "train/total_loss": 0.1142406016588211 + }, + { + "entropy": 8.939496994018555, + "epoch": 0.6807395689143761, + "mean_token_accuracy": 0.8058551549911499, + "num_tokens": 15017479.0, + "step": 6885, + "train/ce_loss": 0.8924233317375183 + }, + { + "epoch": 0.6807395689143761, + "step": 6885, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6807395689143761, + "step": 6885, + "train/total_loss": 0.1556485891342163 + }, + { + "entropy": 8.974817276000977, + "epoch": 0.6808384417638916, + "mean_token_accuracy": 0.7827337980270386, + "num_tokens": 15022605.0, + "step": 6886, + "train/ce_loss": 1.5032484043331351e-05 + }, + { + "epoch": 0.6808384417638916, + "step": 6886, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6808384417638916, + "step": 6886, + "train/total_loss": 0.04297025501728058 + }, + { + "entropy": 9.356979370117188, + "epoch": 0.6809373146134071, + "mean_token_accuracy": 0.7393483519554138, + "num_tokens": 15027462.0, + "step": 6887, + "train/ce_loss": 5.043874807597604e-06 + }, + { + "epoch": 0.6809373146134071, + "step": 6887, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.6809373146134071, + "step": 6887, + "train/total_loss": 0.07421925663948059 + }, + { + "entropy": 8.717205047607422, + "epoch": 0.6810361874629227, + "mean_token_accuracy": 0.7674999833106995, + "num_tokens": 15032714.0, + "step": 6888, + "train/ce_loss": 0.8920142650604248 + }, + { + "epoch": 0.6810361874629227, + "step": 6888, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6810361874629227, + "step": 6888, + "train/total_loss": 0.139982670545578 + }, + { + "entropy": 9.096343994140625, + "epoch": 0.6811350603124382, + "mean_token_accuracy": 0.7919555902481079, + "num_tokens": 15037871.0, + "step": 6889, + "train/ce_loss": 1.001779556274414 + }, + { + "epoch": 0.6811350603124382, + "step": 6889, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.6811350603124382, + "step": 6889, + "train/total_loss": 0.17439670860767365 + }, + { + "entropy": 8.990443229675293, + "epoch": 0.6812339331619537, + "mean_token_accuracy": 0.7565789222717285, + "num_tokens": 15043033.0, + "step": 6890, + "train/ce_loss": 1.350875973701477 + }, + { + "epoch": 0.6812339331619537, + "step": 6890, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6812339331619537, + "step": 6890, + "train/total_loss": 0.19758759438991547 + }, + { + "entropy": 9.394641876220703, + "epoch": 0.6813328060114693, + "mean_token_accuracy": 0.8042226433753967, + "num_tokens": 15048019.0, + "step": 6891, + "train/ce_loss": 0.8413051962852478 + }, + { + "epoch": 0.6813328060114693, + "step": 6891, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6813328060114693, + "step": 6891, + "train/total_loss": 0.14663052558898926 + }, + { + "entropy": 8.438233375549316, + "epoch": 0.6814316788609848, + "mean_token_accuracy": 0.7343283295631409, + "num_tokens": 15053504.0, + "step": 6892, + "train/ce_loss": 0.39659619331359863 + }, + { + "epoch": 0.6814316788609848, + "step": 6892, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6814316788609848, + "step": 6892, + "train/total_loss": 0.06309711933135986 + }, + { + "entropy": 8.297380447387695, + "epoch": 0.6815305517105003, + "mean_token_accuracy": 0.7411873936653137, + "num_tokens": 15059003.0, + "step": 6893, + "train/ce_loss": 0.9297574758529663 + }, + { + "epoch": 0.6815305517105003, + "step": 6893, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6815305517105003, + "step": 6893, + "train/total_loss": 0.14375700056552887 + }, + { + "entropy": 9.346956253051758, + "epoch": 0.6816294245600159, + "mean_token_accuracy": 0.7327731251716614, + "num_tokens": 15064046.0, + "step": 6894, + "train/ce_loss": 0.6615893840789795 + }, + { + "epoch": 0.6816294245600159, + "step": 6894, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6816294245600159, + "step": 6894, + "train/total_loss": 0.11303394287824631 + }, + { + "entropy": 8.94331169128418, + "epoch": 0.6817282974095313, + "mean_token_accuracy": 0.7450722455978394, + "num_tokens": 15069294.0, + "step": 6895, + "train/ce_loss": 0.8234104514122009 + }, + { + "epoch": 0.6817282974095313, + "step": 6895, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.6817282974095313, + "step": 6895, + "train/total_loss": 0.1839035451412201 + }, + { + "entropy": 8.835611343383789, + "epoch": 0.6818271702590468, + "mean_token_accuracy": 0.7842261791229248, + "num_tokens": 15074427.0, + "step": 6896, + "train/ce_loss": 0.6595916748046875 + }, + { + "epoch": 0.6818271702590468, + "step": 6896, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6818271702590468, + "step": 6896, + "train/total_loss": 0.08549042046070099 + }, + { + "entropy": 9.11497974395752, + "epoch": 0.6819260431085624, + "mean_token_accuracy": 0.7549406886100769, + "num_tokens": 15079369.0, + "step": 6897, + "train/ce_loss": 0.7551848292350769 + }, + { + "epoch": 0.6819260431085624, + "step": 6897, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6819260431085624, + "step": 6897, + "train/total_loss": 0.10676848143339157 + }, + { + "entropy": 8.623621940612793, + "epoch": 0.6820249159580779, + "mean_token_accuracy": 0.7431507110595703, + "num_tokens": 15084689.0, + "step": 6898, + "train/ce_loss": 0.6764382719993591 + }, + { + "epoch": 0.6820249159580779, + "step": 6898, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6820249159580779, + "step": 6898, + "train/total_loss": 0.08717507869005203 + }, + { + "entropy": 8.41831111907959, + "epoch": 0.6821237888075934, + "mean_token_accuracy": 0.7379958033561707, + "num_tokens": 15090114.0, + "step": 6899, + "train/ce_loss": 0.6716561913490295 + }, + { + "epoch": 0.6821237888075934, + "step": 6899, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6821237888075934, + "step": 6899, + "train/total_loss": 0.11404062062501907 + }, + { + "epoch": 0.682222661657109, + "grad_norm": 0.6683655381202698, + "learning_rate": 8.296741334124513e-06, + "loss": 0.1307, + "step": 6900 + }, + { + "entropy": 8.801922798156738, + "epoch": 0.682222661657109, + "mean_token_accuracy": 0.7189384698867798, + "num_tokens": 15095403.0, + "step": 6900, + "train/ce_loss": 0.71654212474823 + }, + { + "epoch": 0.682222661657109, + "step": 6900, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.682222661657109, + "step": 6900, + "train/total_loss": 0.09118546545505524 + }, + { + "entropy": 8.632822036743164, + "epoch": 0.6823215345066245, + "mean_token_accuracy": 0.8171206116676331, + "num_tokens": 15100588.0, + "step": 6901, + "train/ce_loss": 0.5940370559692383 + }, + { + "epoch": 0.6823215345066245, + "step": 6901, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.6823215345066245, + "step": 6901, + "train/total_loss": 0.1453412026166916 + }, + { + "entropy": 8.848738670349121, + "epoch": 0.68242040735614, + "mean_token_accuracy": 0.6931540369987488, + "num_tokens": 15105866.0, + "step": 6902, + "train/ce_loss": 0.3797646164894104 + }, + { + "epoch": 0.68242040735614, + "step": 6902, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.68242040735614, + "step": 6902, + "train/total_loss": 0.0731327086687088 + }, + { + "entropy": 8.843427658081055, + "epoch": 0.6825192802056556, + "mean_token_accuracy": 0.7657067775726318, + "num_tokens": 15111116.0, + "step": 6903, + "train/ce_loss": 1.0031598806381226 + }, + { + "epoch": 0.6825192802056556, + "step": 6903, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6825192802056556, + "step": 6903, + "train/total_loss": 0.15890973806381226 + }, + { + "entropy": 8.921822547912598, + "epoch": 0.682618153055171, + "mean_token_accuracy": 0.6976743936538696, + "num_tokens": 15116235.0, + "step": 6904, + "train/ce_loss": 1.6693660020828247 + }, + { + "epoch": 0.682618153055171, + "step": 6904, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.682618153055171, + "step": 6904, + "train/total_loss": 0.22553035616874695 + }, + { + "entropy": 9.12919807434082, + "epoch": 0.6827170259046865, + "mean_token_accuracy": 0.7301255464553833, + "num_tokens": 15121201.0, + "step": 6905, + "train/ce_loss": 2.707676410675049 + }, + { + "epoch": 0.6827170259046865, + "step": 6905, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6827170259046865, + "step": 6905, + "train/total_loss": 0.3410801589488983 + }, + { + "entropy": 8.956316947937012, + "epoch": 0.6828158987542021, + "mean_token_accuracy": 0.7127799987792969, + "num_tokens": 15126365.0, + "step": 6906, + "train/ce_loss": 1.3406143188476562 + }, + { + "epoch": 0.6828158987542021, + "step": 6906, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6828158987542021, + "step": 6906, + "train/total_loss": 0.18093644082546234 + }, + { + "entropy": 9.401857376098633, + "epoch": 0.6829147716037176, + "mean_token_accuracy": 0.7478448152542114, + "num_tokens": 15131228.0, + "step": 6907, + "train/ce_loss": 1.821577279770281e-05 + }, + { + "epoch": 0.6829147716037176, + "step": 6907, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6829147716037176, + "step": 6907, + "train/total_loss": 0.03906432166695595 + }, + { + "entropy": 9.001886367797852, + "epoch": 0.6830136444532331, + "mean_token_accuracy": 0.7763496041297913, + "num_tokens": 15136434.0, + "step": 6908, + "train/ce_loss": 2.1235814529063646e-06 + }, + { + "epoch": 0.6830136444532331, + "step": 6908, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6830136444532331, + "step": 6908, + "train/total_loss": 0.05078146234154701 + }, + { + "entropy": 8.753898620605469, + "epoch": 0.6831125173027487, + "mean_token_accuracy": 0.743030309677124, + "num_tokens": 15141705.0, + "step": 6909, + "train/ce_loss": 0.5286942720413208 + }, + { + "epoch": 0.6831125173027487, + "step": 6909, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6831125173027487, + "step": 6909, + "train/total_loss": 0.07630692422389984 + }, + { + "entropy": 9.249065399169922, + "epoch": 0.6832113901522642, + "mean_token_accuracy": 0.7991543412208557, + "num_tokens": 15146602.0, + "step": 6910, + "train/ce_loss": 9.989611498895101e-06 + }, + { + "epoch": 0.6832113901522642, + "step": 6910, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6832113901522642, + "step": 6910, + "train/total_loss": 0.03125099837779999 + }, + { + "entropy": 9.217145919799805, + "epoch": 0.6833102630017797, + "mean_token_accuracy": 0.7015151381492615, + "num_tokens": 15151695.0, + "step": 6911, + "train/ce_loss": 1.6542013883590698 + }, + { + "epoch": 0.6833102630017797, + "step": 6911, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6833102630017797, + "step": 6911, + "train/total_loss": 0.19667014479637146 + }, + { + "entropy": 8.901766777038574, + "epoch": 0.6834091358512953, + "mean_token_accuracy": 0.7896774411201477, + "num_tokens": 15157100.0, + "step": 6912, + "train/ce_loss": 0.6305601596832275 + }, + { + "epoch": 0.6834091358512953, + "step": 6912, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6834091358512953, + "step": 6912, + "train/total_loss": 0.11774351447820663 + }, + { + "entropy": 8.767526626586914, + "epoch": 0.6835080087008107, + "mean_token_accuracy": 0.7545564770698547, + "num_tokens": 15162419.0, + "step": 6913, + "train/ce_loss": 0.6704851984977722 + }, + { + "epoch": 0.6835080087008107, + "step": 6913, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6835080087008107, + "step": 6913, + "train/total_loss": 0.12173601984977722 + }, + { + "entropy": 8.950695037841797, + "epoch": 0.6836068815503262, + "mean_token_accuracy": 0.7896138429641724, + "num_tokens": 15167594.0, + "step": 6914, + "train/ce_loss": 0.7056196331977844 + }, + { + "epoch": 0.6836068815503262, + "step": 6914, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6836068815503262, + "step": 6914, + "train/total_loss": 0.1096244677901268 + }, + { + "entropy": 8.735855102539062, + "epoch": 0.6837057543998418, + "mean_token_accuracy": 0.7021013498306274, + "num_tokens": 15172778.0, + "step": 6915, + "train/ce_loss": 1.1340030431747437 + }, + { + "epoch": 0.6837057543998418, + "step": 6915, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.6837057543998418, + "step": 6915, + "train/total_loss": 0.18761906027793884 + }, + { + "entropy": 9.025125503540039, + "epoch": 0.6838046272493573, + "mean_token_accuracy": 0.7478134036064148, + "num_tokens": 15177895.0, + "step": 6916, + "train/ce_loss": 1.2451808452606201 + }, + { + "epoch": 0.6838046272493573, + "step": 6916, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6838046272493573, + "step": 6916, + "train/total_loss": 0.18311184644699097 + }, + { + "entropy": 9.271974563598633, + "epoch": 0.6839035000988728, + "mean_token_accuracy": 0.7707641124725342, + "num_tokens": 15182925.0, + "step": 6917, + "train/ce_loss": 0.7653732299804688 + }, + { + "epoch": 0.6839035000988728, + "step": 6917, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6839035000988728, + "step": 6917, + "train/total_loss": 0.1273185759782791 + }, + { + "entropy": 9.092538833618164, + "epoch": 0.6840023729483884, + "mean_token_accuracy": 0.7117552161216736, + "num_tokens": 15187949.0, + "step": 6918, + "train/ce_loss": 2.0715394839498913e-06 + }, + { + "epoch": 0.6840023729483884, + "step": 6918, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6840023729483884, + "step": 6918, + "train/total_loss": 0.035156458616256714 + }, + { + "entropy": 8.716814994812012, + "epoch": 0.6841012457979039, + "mean_token_accuracy": 0.7954545617103577, + "num_tokens": 15193317.0, + "step": 6919, + "train/ce_loss": 0.5797393321990967 + }, + { + "epoch": 0.6841012457979039, + "step": 6919, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6841012457979039, + "step": 6919, + "train/total_loss": 0.1243801862001419 + }, + { + "epoch": 0.6842001186474194, + "grad_norm": 0.5404684543609619, + "learning_rate": 8.291796469366563e-06, + "loss": 0.1353, + "step": 6920 + }, + { + "entropy": 9.481237411499023, + "epoch": 0.6842001186474194, + "mean_token_accuracy": 0.7302325367927551, + "num_tokens": 15198166.0, + "step": 6920, + "train/ce_loss": 1.2219781875610352 + }, + { + "epoch": 0.6842001186474194, + "step": 6920, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.6842001186474194, + "step": 6920, + "train/total_loss": 0.20422907173633575 + }, + { + "entropy": 9.129714965820312, + "epoch": 0.684298991496935, + "mean_token_accuracy": 0.7300613522529602, + "num_tokens": 15203304.0, + "step": 6921, + "train/ce_loss": 4.9342497732141055e-06 + }, + { + "epoch": 0.684298991496935, + "step": 6921, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.684298991496935, + "step": 6921, + "train/total_loss": 0.0312504917383194 + }, + { + "entropy": 8.877391815185547, + "epoch": 0.6843978643464504, + "mean_token_accuracy": 0.727148711681366, + "num_tokens": 15208470.0, + "step": 6922, + "train/ce_loss": 0.9647364020347595 + }, + { + "epoch": 0.6843978643464504, + "step": 6922, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.6843978643464504, + "step": 6922, + "train/total_loss": 0.17850488424301147 + }, + { + "entropy": 9.041803359985352, + "epoch": 0.684496737195966, + "mean_token_accuracy": 0.7361769080162048, + "num_tokens": 15213600.0, + "step": 6923, + "train/ce_loss": 0.6440777778625488 + }, + { + "epoch": 0.684496737195966, + "step": 6923, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.684496737195966, + "step": 6923, + "train/total_loss": 0.12690778076648712 + }, + { + "entropy": 8.622148513793945, + "epoch": 0.6845956100454815, + "mean_token_accuracy": 0.7507886290550232, + "num_tokens": 15218999.0, + "step": 6924, + "train/ce_loss": 0.5754372477531433 + }, + { + "epoch": 0.6845956100454815, + "step": 6924, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6845956100454815, + "step": 6924, + "train/total_loss": 0.07707497477531433 + }, + { + "entropy": 8.552651405334473, + "epoch": 0.684694482894997, + "mean_token_accuracy": 0.7321226000785828, + "num_tokens": 15224346.0, + "step": 6925, + "train/ce_loss": 0.9482977390289307 + }, + { + "epoch": 0.684694482894997, + "step": 6925, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.684694482894997, + "step": 6925, + "train/total_loss": 0.1299860179424286 + }, + { + "entropy": 8.672773361206055, + "epoch": 0.6847933557445126, + "mean_token_accuracy": 0.7335600852966309, + "num_tokens": 15229690.0, + "step": 6926, + "train/ce_loss": 0.8891081213951111 + }, + { + "epoch": 0.6847933557445126, + "step": 6926, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6847933557445126, + "step": 6926, + "train/total_loss": 0.15922331809997559 + }, + { + "entropy": 9.116071701049805, + "epoch": 0.6848922285940281, + "mean_token_accuracy": 0.7018927335739136, + "num_tokens": 15234745.0, + "step": 6927, + "train/ce_loss": 1.9434056282043457 + }, + { + "epoch": 0.6848922285940281, + "step": 6927, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6848922285940281, + "step": 6927, + "train/total_loss": 0.2490280717611313 + }, + { + "entropy": 8.96804428100586, + "epoch": 0.6849911014435436, + "mean_token_accuracy": 0.6753424406051636, + "num_tokens": 15239930.0, + "step": 6928, + "train/ce_loss": 1.9519343376159668 + }, + { + "epoch": 0.6849911014435436, + "step": 6928, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6849911014435436, + "step": 6928, + "train/total_loss": 0.25769343972206116 + }, + { + "entropy": 9.231451034545898, + "epoch": 0.6850899742930592, + "mean_token_accuracy": 0.70216304063797, + "num_tokens": 15244839.0, + "step": 6929, + "train/ce_loss": 1.6754367351531982 + }, + { + "epoch": 0.6850899742930592, + "step": 6929, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6850899742930592, + "step": 6929, + "train/total_loss": 0.2261374294757843 + }, + { + "entropy": 8.91139030456543, + "epoch": 0.6851888471425747, + "mean_token_accuracy": 0.7185473442077637, + "num_tokens": 15250010.0, + "step": 6930, + "train/ce_loss": 2.334519194846507e-06 + }, + { + "epoch": 0.6851888471425747, + "step": 6930, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.6851888471425747, + "step": 6930, + "train/total_loss": 0.015625232830643654 + }, + { + "entropy": 8.973052978515625, + "epoch": 0.6852877199920901, + "mean_token_accuracy": 0.7477242946624756, + "num_tokens": 15255302.0, + "step": 6931, + "train/ce_loss": 0.5091356039047241 + }, + { + "epoch": 0.6852877199920901, + "step": 6931, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6852877199920901, + "step": 6931, + "train/total_loss": 0.11341355741024017 + }, + { + "entropy": 8.891918182373047, + "epoch": 0.6853865928416057, + "mean_token_accuracy": 0.7184466123580933, + "num_tokens": 15260457.0, + "step": 6932, + "train/ce_loss": 0.8679772615432739 + }, + { + "epoch": 0.6853865928416057, + "step": 6932, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6853865928416057, + "step": 6932, + "train/total_loss": 0.15711022913455963 + }, + { + "entropy": 9.115850448608398, + "epoch": 0.6854854656911212, + "mean_token_accuracy": 0.675000011920929, + "num_tokens": 15265470.0, + "step": 6933, + "train/ce_loss": 2.707383632659912 + }, + { + "epoch": 0.6854854656911212, + "step": 6933, + "train/sim_loss": 0.16796875 + }, + { + "epoch": 0.6854854656911212, + "step": 6933, + "train/total_loss": 0.4387071132659912 + }, + { + "entropy": 8.753941535949707, + "epoch": 0.6855843385406367, + "mean_token_accuracy": 0.7397727370262146, + "num_tokens": 15270798.0, + "step": 6934, + "train/ce_loss": 0.8424127697944641 + }, + { + "epoch": 0.6855843385406367, + "step": 6934, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6855843385406367, + "step": 6934, + "train/total_loss": 0.12721002101898193 + }, + { + "entropy": 8.697265625, + "epoch": 0.6856832113901523, + "mean_token_accuracy": 0.6972677707672119, + "num_tokens": 15276176.0, + "step": 6935, + "train/ce_loss": 0.8581960797309875 + }, + { + "epoch": 0.6856832113901523, + "step": 6935, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6856832113901523, + "step": 6935, + "train/total_loss": 0.11706960946321487 + }, + { + "entropy": 9.301244735717773, + "epoch": 0.6857820842396678, + "mean_token_accuracy": 0.8045454621315002, + "num_tokens": 15281233.0, + "step": 6936, + "train/ce_loss": 0.6674541234970093 + }, + { + "epoch": 0.6857820842396678, + "step": 6936, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.6857820842396678, + "step": 6936, + "train/total_loss": 0.15658916532993317 + }, + { + "entropy": 9.335037231445312, + "epoch": 0.6858809570891833, + "mean_token_accuracy": 0.7811158895492554, + "num_tokens": 15286143.0, + "step": 6937, + "train/ce_loss": 2.9457798973453464e-06 + }, + { + "epoch": 0.6858809570891833, + "step": 6937, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6858809570891833, + "step": 6937, + "train/total_loss": 0.04687529429793358 + }, + { + "entropy": 9.344409942626953, + "epoch": 0.6859798299386989, + "mean_token_accuracy": 0.751724123954773, + "num_tokens": 15291190.0, + "step": 6938, + "train/ce_loss": 1.1120717525482178 + }, + { + "epoch": 0.6859798299386989, + "step": 6938, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.6859798299386989, + "step": 6938, + "train/total_loss": 0.19323843717575073 + }, + { + "entropy": 8.513089179992676, + "epoch": 0.6860787027882144, + "mean_token_accuracy": 0.702531635761261, + "num_tokens": 15296660.0, + "step": 6939, + "train/ce_loss": 0.7875324487686157 + }, + { + "epoch": 0.6860787027882144, + "step": 6939, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6860787027882144, + "step": 6939, + "train/total_loss": 0.12172199785709381 + }, + { + "epoch": 0.6861775756377299, + "grad_norm": 0.817496657371521, + "learning_rate": 8.286851604608614e-06, + "loss": 0.1428, + "step": 6940 + }, + { + "entropy": 8.629898071289062, + "epoch": 0.6861775756377299, + "mean_token_accuracy": 0.7637028098106384, + "num_tokens": 15301930.0, + "step": 6940, + "train/ce_loss": 1.129717230796814 + }, + { + "epoch": 0.6861775756377299, + "step": 6940, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.6861775756377299, + "step": 6940, + "train/total_loss": 0.1403154730796814 + }, + { + "entropy": 8.15090560913086, + "epoch": 0.6862764484872454, + "mean_token_accuracy": 0.7578058838844299, + "num_tokens": 15307624.0, + "step": 6941, + "train/ce_loss": 0.6505463719367981 + }, + { + "epoch": 0.6862764484872454, + "step": 6941, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.6862764484872454, + "step": 6941, + "train/total_loss": 0.08067964017391205 + }, + { + "entropy": 8.585698127746582, + "epoch": 0.6863753213367609, + "mean_token_accuracy": 0.7148289084434509, + "num_tokens": 15313145.0, + "step": 6942, + "train/ce_loss": 0.7779737114906311 + }, + { + "epoch": 0.6863753213367609, + "step": 6942, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6863753213367609, + "step": 6942, + "train/total_loss": 0.11685987561941147 + }, + { + "entropy": 8.508909225463867, + "epoch": 0.6864741941862764, + "mean_token_accuracy": 0.7624728679656982, + "num_tokens": 15318585.0, + "step": 6943, + "train/ce_loss": 0.967738687992096 + }, + { + "epoch": 0.6864741941862764, + "step": 6943, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.6864741941862764, + "step": 6943, + "train/total_loss": 0.24521136283874512 + }, + { + "entropy": 8.761722564697266, + "epoch": 0.686573067035792, + "mean_token_accuracy": 0.7230955362319946, + "num_tokens": 15323892.0, + "step": 6944, + "train/ce_loss": 0.7384951114654541 + }, + { + "epoch": 0.686573067035792, + "step": 6944, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.686573067035792, + "step": 6944, + "train/total_loss": 0.10509951412677765 + }, + { + "entropy": 8.465885162353516, + "epoch": 0.6866719398853075, + "mean_token_accuracy": 0.7373637557029724, + "num_tokens": 15329350.0, + "step": 6945, + "train/ce_loss": 0.5195974111557007 + }, + { + "epoch": 0.6866719398853075, + "step": 6945, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6866719398853075, + "step": 6945, + "train/total_loss": 0.07149098813533783 + }, + { + "entropy": 8.980575561523438, + "epoch": 0.686770812734823, + "mean_token_accuracy": 0.804964542388916, + "num_tokens": 15334364.0, + "step": 6946, + "train/ce_loss": 0.8190268278121948 + }, + { + "epoch": 0.686770812734823, + "step": 6946, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.686770812734823, + "step": 6946, + "train/total_loss": 0.14830893278121948 + }, + { + "entropy": 8.666390419006348, + "epoch": 0.6868696855843386, + "mean_token_accuracy": 0.7522580623626709, + "num_tokens": 15339643.0, + "step": 6947, + "train/ce_loss": 1.0155224800109863 + }, + { + "epoch": 0.6868696855843386, + "step": 6947, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6868696855843386, + "step": 6947, + "train/total_loss": 0.15233349800109863 + }, + { + "entropy": 8.394342422485352, + "epoch": 0.6869685584338541, + "mean_token_accuracy": 0.7468926310539246, + "num_tokens": 15345031.0, + "step": 6948, + "train/ce_loss": 0.4751631021499634 + }, + { + "epoch": 0.6869685584338541, + "step": 6948, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.6869685584338541, + "step": 6948, + "train/total_loss": 0.13345381617546082 + }, + { + "entropy": 8.674410820007324, + "epoch": 0.6870674312833696, + "mean_token_accuracy": 0.7519466280937195, + "num_tokens": 15350339.0, + "step": 6949, + "train/ce_loss": 0.7661105394363403 + }, + { + "epoch": 0.6870674312833696, + "step": 6949, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6870674312833696, + "step": 6949, + "train/total_loss": 0.13520480692386627 + }, + { + "entropy": 8.750062942504883, + "epoch": 0.6871663041328852, + "mean_token_accuracy": 0.7936893105506897, + "num_tokens": 15355609.0, + "step": 6950, + "train/ce_loss": 0.45663222670555115 + }, + { + "epoch": 0.6871663041328852, + "step": 6950, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6871663041328852, + "step": 6950, + "train/total_loss": 0.08863197267055511 + }, + { + "entropy": 8.613815307617188, + "epoch": 0.6872651769824006, + "mean_token_accuracy": 0.7497291564941406, + "num_tokens": 15361043.0, + "step": 6951, + "train/ce_loss": 0.8819339871406555 + }, + { + "epoch": 0.6872651769824006, + "step": 6951, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6872651769824006, + "step": 6951, + "train/total_loss": 0.1663184016942978 + }, + { + "entropy": 8.531137466430664, + "epoch": 0.6873640498319161, + "mean_token_accuracy": 0.7067415714263916, + "num_tokens": 15366352.0, + "step": 6952, + "train/ce_loss": 0.9505997896194458 + }, + { + "epoch": 0.6873640498319161, + "step": 6952, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6873640498319161, + "step": 6952, + "train/total_loss": 0.13412249088287354 + }, + { + "entropy": 8.574007034301758, + "epoch": 0.6874629226814317, + "mean_token_accuracy": 0.7746614813804626, + "num_tokens": 15372090.0, + "step": 6953, + "train/ce_loss": 0.6122711300849915 + }, + { + "epoch": 0.6874629226814317, + "step": 6953, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6874629226814317, + "step": 6953, + "train/total_loss": 0.13153961300849915 + }, + { + "entropy": 9.415061950683594, + "epoch": 0.6875617955309472, + "mean_token_accuracy": 0.7694117426872253, + "num_tokens": 15377107.0, + "step": 6954, + "train/ce_loss": 3.785308081205585e-06 + }, + { + "epoch": 0.6875617955309472, + "step": 6954, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6875617955309472, + "step": 6954, + "train/total_loss": 0.04296912997961044 + }, + { + "entropy": 9.409917831420898, + "epoch": 0.6876606683804627, + "mean_token_accuracy": 0.7870967984199524, + "num_tokens": 15381964.0, + "step": 6955, + "train/ce_loss": 3.528351726345136e-06 + }, + { + "epoch": 0.6876606683804627, + "step": 6955, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6876606683804627, + "step": 6955, + "train/total_loss": 0.042969103902578354 + }, + { + "entropy": 9.558332443237305, + "epoch": 0.6877595412299783, + "mean_token_accuracy": 0.703797459602356, + "num_tokens": 15386747.0, + "step": 6956, + "train/ce_loss": 2.1576952934265137 + }, + { + "epoch": 0.6877595412299783, + "step": 6956, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6877595412299783, + "step": 6956, + "train/total_loss": 0.25092577934265137 + }, + { + "entropy": 8.30762767791748, + "epoch": 0.6878584140794938, + "mean_token_accuracy": 0.7465091347694397, + "num_tokens": 15392211.0, + "step": 6957, + "train/ce_loss": 0.5005062222480774 + }, + { + "epoch": 0.6878584140794938, + "step": 6957, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.6878584140794938, + "step": 6957, + "train/total_loss": 0.15942561626434326 + }, + { + "entropy": 8.810013771057129, + "epoch": 0.6879572869290093, + "mean_token_accuracy": 0.7215189933776855, + "num_tokens": 15397392.0, + "step": 6958, + "train/ce_loss": 1.6074625253677368 + }, + { + "epoch": 0.6879572869290093, + "step": 6958, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6879572869290093, + "step": 6958, + "train/total_loss": 0.2115275114774704 + }, + { + "entropy": 8.816883087158203, + "epoch": 0.6880561597785249, + "mean_token_accuracy": 0.8195187449455261, + "num_tokens": 15402706.0, + "step": 6959, + "train/ce_loss": 0.6443217992782593 + }, + { + "epoch": 0.6880561597785249, + "step": 6959, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.6880561597785249, + "step": 6959, + "train/total_loss": 0.08005718141794205 + }, + { + "epoch": 0.6881550326280403, + "grad_norm": 0.6103768944740295, + "learning_rate": 8.281906739850666e-06, + "loss": 0.1311, + "step": 6960 + }, + { + "entropy": 9.008502960205078, + "epoch": 0.6881550326280403, + "mean_token_accuracy": 0.709269642829895, + "num_tokens": 15407864.0, + "step": 6960, + "train/ce_loss": 0.8353558778762817 + }, + { + "epoch": 0.6881550326280403, + "step": 6960, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6881550326280403, + "step": 6960, + "train/total_loss": 0.1069730892777443 + }, + { + "entropy": 9.035163879394531, + "epoch": 0.6882539054775558, + "mean_token_accuracy": 0.7142857313156128, + "num_tokens": 15412753.0, + "step": 6961, + "train/ce_loss": 3.2206226023845375e-06 + }, + { + "epoch": 0.6882539054775558, + "step": 6961, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6882539054775558, + "step": 6961, + "train/total_loss": 0.04296907037496567 + }, + { + "entropy": 9.355962753295898, + "epoch": 0.6883527783270714, + "mean_token_accuracy": 0.742222249507904, + "num_tokens": 15417653.0, + "step": 6962, + "train/ce_loss": 3.1448560093849665e-06 + }, + { + "epoch": 0.6883527783270714, + "step": 6962, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6883527783270714, + "step": 6962, + "train/total_loss": 0.03515656292438507 + }, + { + "entropy": 9.3391752243042, + "epoch": 0.6884516511765869, + "mean_token_accuracy": 0.6926229596138, + "num_tokens": 15422612.0, + "step": 6963, + "train/ce_loss": 2.5738637447357178 + }, + { + "epoch": 0.6884516511765869, + "step": 6963, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6884516511765869, + "step": 6963, + "train/total_loss": 0.32379263639450073 + }, + { + "entropy": 9.015137672424316, + "epoch": 0.6885505240261024, + "mean_token_accuracy": 0.7619718313217163, + "num_tokens": 15427819.0, + "step": 6964, + "train/ce_loss": 2.477358066244051e-06 + }, + { + "epoch": 0.6885505240261024, + "step": 6964, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6885505240261024, + "step": 6964, + "train/total_loss": 0.019531497731804848 + }, + { + "entropy": 8.540361404418945, + "epoch": 0.688649396875618, + "mean_token_accuracy": 0.7202295660972595, + "num_tokens": 15432999.0, + "step": 6965, + "train/ce_loss": 1.135185718536377 + }, + { + "epoch": 0.688649396875618, + "step": 6965, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.688649396875618, + "step": 6965, + "train/total_loss": 0.14476856589317322 + }, + { + "entropy": 8.603002548217773, + "epoch": 0.6887482697251335, + "mean_token_accuracy": 0.7921653985977173, + "num_tokens": 15438421.0, + "step": 6966, + "train/ce_loss": 0.5192272067070007 + }, + { + "epoch": 0.6887482697251335, + "step": 6966, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.6887482697251335, + "step": 6966, + "train/total_loss": 0.1378602236509323 + }, + { + "entropy": 8.734756469726562, + "epoch": 0.688847142574649, + "mean_token_accuracy": 0.7278645634651184, + "num_tokens": 15443616.0, + "step": 6967, + "train/ce_loss": 0.6984837651252747 + }, + { + "epoch": 0.688847142574649, + "step": 6967, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.688847142574649, + "step": 6967, + "train/total_loss": 0.11281713098287582 + }, + { + "entropy": 8.821735382080078, + "epoch": 0.6889460154241646, + "mean_token_accuracy": 0.7677664756774902, + "num_tokens": 15448912.0, + "step": 6968, + "train/ce_loss": 0.8653164505958557 + }, + { + "epoch": 0.6889460154241646, + "step": 6968, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.6889460154241646, + "step": 6968, + "train/total_loss": 0.1685628890991211 + }, + { + "entropy": 8.957123756408691, + "epoch": 0.68904488827368, + "mean_token_accuracy": 0.6975903511047363, + "num_tokens": 15454188.0, + "step": 6969, + "train/ce_loss": 3.484945636955672e-06 + }, + { + "epoch": 0.68904488827368, + "step": 6969, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.68904488827368, + "step": 6969, + "train/total_loss": 0.08203160017728806 + }, + { + "entropy": 8.231283187866211, + "epoch": 0.6891437611231955, + "mean_token_accuracy": 0.7591313123703003, + "num_tokens": 15459651.0, + "step": 6970, + "train/ce_loss": 0.5685218572616577 + }, + { + "epoch": 0.6891437611231955, + "step": 6970, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.6891437611231955, + "step": 6970, + "train/total_loss": 0.07247719168663025 + }, + { + "entropy": 8.5908203125, + "epoch": 0.6892426339727111, + "mean_token_accuracy": 0.7651006579399109, + "num_tokens": 15465033.0, + "step": 6971, + "train/ce_loss": 1.4566595554351807 + }, + { + "epoch": 0.6892426339727111, + "step": 6971, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6892426339727111, + "step": 6971, + "train/total_loss": 0.2159784585237503 + }, + { + "entropy": 8.771675109863281, + "epoch": 0.6893415068222266, + "mean_token_accuracy": 0.7152230739593506, + "num_tokens": 15470295.0, + "step": 6972, + "train/ce_loss": 0.8336161375045776 + }, + { + "epoch": 0.6893415068222266, + "step": 6972, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6893415068222266, + "step": 6972, + "train/total_loss": 0.10679911822080612 + }, + { + "entropy": 9.065987586975098, + "epoch": 0.6894403796717421, + "mean_token_accuracy": 0.7348178029060364, + "num_tokens": 15475175.0, + "step": 6973, + "train/ce_loss": 1.7554562091827393 + }, + { + "epoch": 0.6894403796717421, + "step": 6973, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.6894403796717421, + "step": 6973, + "train/total_loss": 0.2419518679380417 + }, + { + "entropy": 8.591651916503906, + "epoch": 0.6895392525212577, + "mean_token_accuracy": 0.7198124527931213, + "num_tokens": 15480486.0, + "step": 6974, + "train/ce_loss": 0.9288971424102783 + }, + { + "epoch": 0.6895392525212577, + "step": 6974, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.6895392525212577, + "step": 6974, + "train/total_loss": 0.1944522261619568 + }, + { + "entropy": 8.860366821289062, + "epoch": 0.6896381253707732, + "mean_token_accuracy": 0.6860730648040771, + "num_tokens": 15486018.0, + "step": 6975, + "train/ce_loss": 1.2888222932815552 + }, + { + "epoch": 0.6896381253707732, + "step": 6975, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.6896381253707732, + "step": 6975, + "train/total_loss": 0.2577884793281555 + }, + { + "entropy": 9.116162300109863, + "epoch": 0.6897369982202887, + "mean_token_accuracy": 0.7212543487548828, + "num_tokens": 15491020.0, + "step": 6976, + "train/ce_loss": 1.1505359411239624 + }, + { + "epoch": 0.6897369982202887, + "step": 6976, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6897369982202887, + "step": 6976, + "train/total_loss": 0.15020984411239624 + }, + { + "entropy": 8.815119743347168, + "epoch": 0.6898358710698043, + "mean_token_accuracy": 0.781862735748291, + "num_tokens": 15496330.0, + "step": 6977, + "train/ce_loss": 0.4676681160926819 + }, + { + "epoch": 0.6898358710698043, + "step": 6977, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.6898358710698043, + "step": 6977, + "train/total_loss": 0.06239181384444237 + }, + { + "entropy": 8.83273696899414, + "epoch": 0.6899347439193197, + "mean_token_accuracy": 0.7268408536911011, + "num_tokens": 15501593.0, + "step": 6978, + "train/ce_loss": 1.1178123950958252 + }, + { + "epoch": 0.6899347439193197, + "step": 6978, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6899347439193197, + "step": 6978, + "train/total_loss": 0.17428123950958252 + }, + { + "entropy": 8.750473022460938, + "epoch": 0.6900336167688352, + "mean_token_accuracy": 0.7281213402748108, + "num_tokens": 15506965.0, + "step": 6979, + "train/ce_loss": 0.5950695276260376 + }, + { + "epoch": 0.6900336167688352, + "step": 6979, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6900336167688352, + "step": 6979, + "train/total_loss": 0.10247570276260376 + }, + { + "epoch": 0.6901324896183508, + "grad_norm": 0.6545323133468628, + "learning_rate": 8.276961875092717e-06, + "loss": 0.1403, + "step": 6980 + }, + { + "entropy": 8.452592849731445, + "epoch": 0.6901324896183508, + "mean_token_accuracy": 0.7842170000076294, + "num_tokens": 15512283.0, + "step": 6980, + "train/ce_loss": 0.5714089274406433 + }, + { + "epoch": 0.6901324896183508, + "step": 6980, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6901324896183508, + "step": 6980, + "train/total_loss": 0.11573464423418045 + }, + { + "entropy": 8.647806167602539, + "epoch": 0.6902313624678663, + "mean_token_accuracy": 0.7115628719329834, + "num_tokens": 15517574.0, + "step": 6981, + "train/ce_loss": 0.7578251361846924 + }, + { + "epoch": 0.6902313624678663, + "step": 6981, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6902313624678663, + "step": 6981, + "train/total_loss": 0.13437625765800476 + }, + { + "entropy": 8.560383796691895, + "epoch": 0.6903302353173818, + "mean_token_accuracy": 0.7291220426559448, + "num_tokens": 15522949.0, + "step": 6982, + "train/ce_loss": 0.8393318057060242 + }, + { + "epoch": 0.6903302353173818, + "step": 6982, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6903302353173818, + "step": 6982, + "train/total_loss": 0.12299568206071854 + }, + { + "entropy": 9.18415355682373, + "epoch": 0.6904291081668974, + "mean_token_accuracy": 0.7688266038894653, + "num_tokens": 15527967.0, + "step": 6983, + "train/ce_loss": 3.3392479963367805e-06 + }, + { + "epoch": 0.6904291081668974, + "step": 6983, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.6904291081668974, + "step": 6983, + "train/total_loss": 0.015625333413481712 + }, + { + "entropy": 8.457925796508789, + "epoch": 0.6905279810164129, + "mean_token_accuracy": 0.7348242998123169, + "num_tokens": 15533315.0, + "step": 6984, + "train/ce_loss": 0.8995658159255981 + }, + { + "epoch": 0.6905279810164129, + "step": 6984, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.6905279810164129, + "step": 6984, + "train/total_loss": 0.10558158159255981 + }, + { + "entropy": 8.641708374023438, + "epoch": 0.6906268538659284, + "mean_token_accuracy": 0.8094576001167297, + "num_tokens": 15538496.0, + "step": 6985, + "train/ce_loss": 0.9776160717010498 + }, + { + "epoch": 0.6906268538659284, + "step": 6985, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6906268538659284, + "step": 6985, + "train/total_loss": 0.1485428512096405 + }, + { + "entropy": 9.258674621582031, + "epoch": 0.690725726715444, + "mean_token_accuracy": 0.69786536693573, + "num_tokens": 15543563.0, + "step": 6986, + "train/ce_loss": 2.1139408090675715e-06 + }, + { + "epoch": 0.690725726715444, + "step": 6986, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.690725726715444, + "step": 6986, + "train/total_loss": 0.015625210478901863 + }, + { + "entropy": 9.310544967651367, + "epoch": 0.6908245995649595, + "mean_token_accuracy": 0.6595237851142883, + "num_tokens": 15548412.0, + "step": 6987, + "train/ce_loss": 3.0720837116241455 + }, + { + "epoch": 0.6908245995649595, + "step": 6987, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6908245995649595, + "step": 6987, + "train/total_loss": 0.361895889043808 + }, + { + "entropy": 8.935279846191406, + "epoch": 0.6909234724144749, + "mean_token_accuracy": 0.7120419144630432, + "num_tokens": 15553773.0, + "step": 6988, + "train/ce_loss": 1.2668113708496094 + }, + { + "epoch": 0.6909234724144749, + "step": 6988, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.6909234724144749, + "step": 6988, + "train/total_loss": 0.2321498841047287 + }, + { + "entropy": 8.701950073242188, + "epoch": 0.6910223452639905, + "mean_token_accuracy": 0.6670190095901489, + "num_tokens": 15559163.0, + "step": 6989, + "train/ce_loss": 0.9159974455833435 + }, + { + "epoch": 0.6910223452639905, + "step": 6989, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6910223452639905, + "step": 6989, + "train/total_loss": 0.1697247475385666 + }, + { + "entropy": 8.946187973022461, + "epoch": 0.691121218113506, + "mean_token_accuracy": 0.7379518151283264, + "num_tokens": 15564289.0, + "step": 6990, + "train/ce_loss": 1.1444780826568604 + }, + { + "epoch": 0.691121218113506, + "step": 6990, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.691121218113506, + "step": 6990, + "train/total_loss": 0.18085405230522156 + }, + { + "entropy": 9.32418441772461, + "epoch": 0.6912200909630215, + "mean_token_accuracy": 0.7272727489471436, + "num_tokens": 15569168.0, + "step": 6991, + "train/ce_loss": 2.120774030685425 + }, + { + "epoch": 0.6912200909630215, + "step": 6991, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.6912200909630215, + "step": 6991, + "train/total_loss": 0.30582740902900696 + }, + { + "entropy": 8.605212211608887, + "epoch": 0.6913189638125371, + "mean_token_accuracy": 0.7746650576591492, + "num_tokens": 15574463.0, + "step": 6992, + "train/ce_loss": 0.7690951824188232 + }, + { + "epoch": 0.6913189638125371, + "step": 6992, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.6913189638125371, + "step": 6992, + "train/total_loss": 0.16284701228141785 + }, + { + "entropy": 8.168206214904785, + "epoch": 0.6914178366620526, + "mean_token_accuracy": 0.7494226098060608, + "num_tokens": 15579807.0, + "step": 6993, + "train/ce_loss": 0.6665575504302979 + }, + { + "epoch": 0.6914178366620526, + "step": 6993, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.6914178366620526, + "step": 6993, + "train/total_loss": 0.09790575504302979 + }, + { + "entropy": 9.690252304077148, + "epoch": 0.6915167095115681, + "mean_token_accuracy": 0.7416666746139526, + "num_tokens": 15584586.0, + "step": 6994, + "train/ce_loss": 2.0575296878814697 + }, + { + "epoch": 0.6915167095115681, + "step": 6994, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6915167095115681, + "step": 6994, + "train/total_loss": 0.283877968788147 + }, + { + "entropy": 8.993197441101074, + "epoch": 0.6916155823610837, + "mean_token_accuracy": 0.746268630027771, + "num_tokens": 15589573.0, + "step": 6995, + "train/ce_loss": 0.8488063812255859 + }, + { + "epoch": 0.6916155823610837, + "step": 6995, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.6916155823610837, + "step": 6995, + "train/total_loss": 0.16691190004348755 + }, + { + "entropy": 9.218578338623047, + "epoch": 0.6917144552105992, + "mean_token_accuracy": 0.727707028388977, + "num_tokens": 15594633.0, + "step": 6996, + "train/ce_loss": 1.031732201576233 + }, + { + "epoch": 0.6917144552105992, + "step": 6996, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6917144552105992, + "step": 6996, + "train/total_loss": 0.14614197611808777 + }, + { + "entropy": 8.436715126037598, + "epoch": 0.6918133280601146, + "mean_token_accuracy": 0.7288801670074463, + "num_tokens": 15600086.0, + "step": 6997, + "train/ce_loss": 0.8681232929229736 + }, + { + "epoch": 0.6918133280601146, + "step": 6997, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6918133280601146, + "step": 6997, + "train/total_loss": 0.1297810822725296 + }, + { + "entropy": 8.853265762329102, + "epoch": 0.6919122009096302, + "mean_token_accuracy": 0.7205438017845154, + "num_tokens": 15605201.0, + "step": 6998, + "train/ce_loss": 1.191149353981018 + }, + { + "epoch": 0.6919122009096302, + "step": 6998, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6919122009096302, + "step": 6998, + "train/total_loss": 0.1738024353981018 + }, + { + "entropy": 8.89388656616211, + "epoch": 0.6920110737591457, + "mean_token_accuracy": 0.7541229128837585, + "num_tokens": 15610312.0, + "step": 6999, + "train/ce_loss": 1.0421494245529175 + }, + { + "epoch": 0.6920110737591457, + "step": 6999, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.6920110737591457, + "step": 6999, + "train/total_loss": 0.18624618649482727 + }, + { + "epoch": 0.6921099466086612, + "grad_norm": 0.7092252373695374, + "learning_rate": 8.272017010334769e-06, + "loss": 0.1383, + "step": 7000 + }, + { + "entropy": 9.542508125305176, + "epoch": 0.6921099466086612, + "mean_token_accuracy": 0.7443609237670898, + "num_tokens": 15615112.0, + "step": 7000, + "train/ce_loss": 5.2732480071426835e-06 + }, + { + "epoch": 0.6921099466086612, + "step": 7000, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6921099466086612, + "step": 7000, + "train/total_loss": 0.04687552899122238 + }, + { + "entropy": 9.186712265014648, + "epoch": 0.6922088194581768, + "mean_token_accuracy": 0.7618243098258972, + "num_tokens": 15620157.0, + "step": 7001, + "train/ce_loss": 0.7200567722320557 + }, + { + "epoch": 0.6922088194581768, + "step": 7001, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6922088194581768, + "step": 7001, + "train/total_loss": 0.09544318169355392 + }, + { + "entropy": 9.247856140136719, + "epoch": 0.6923076923076923, + "mean_token_accuracy": 0.7417103052139282, + "num_tokens": 15625177.0, + "step": 7002, + "train/ce_loss": 1.290831446647644 + }, + { + "epoch": 0.6923076923076923, + "step": 7002, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6923076923076923, + "step": 7002, + "train/total_loss": 0.17986439168453217 + }, + { + "entropy": 8.667369842529297, + "epoch": 0.6924065651572078, + "mean_token_accuracy": 0.7571251392364502, + "num_tokens": 15630470.0, + "step": 7003, + "train/ce_loss": 0.7783504724502563 + }, + { + "epoch": 0.6924065651572078, + "step": 7003, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6924065651572078, + "step": 7003, + "train/total_loss": 0.09736629575490952 + }, + { + "entropy": 8.759040832519531, + "epoch": 0.6925054380067234, + "mean_token_accuracy": 0.7424441576004028, + "num_tokens": 15635698.0, + "step": 7004, + "train/ce_loss": 0.7566330432891846 + }, + { + "epoch": 0.6925054380067234, + "step": 7004, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6925054380067234, + "step": 7004, + "train/total_loss": 0.13816329836845398 + }, + { + "entropy": 8.970389366149902, + "epoch": 0.6926043108562389, + "mean_token_accuracy": 0.7370689511299133, + "num_tokens": 15640832.0, + "step": 7005, + "train/ce_loss": 1.5509412288665771 + }, + { + "epoch": 0.6926043108562389, + "step": 7005, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6926043108562389, + "step": 7005, + "train/total_loss": 0.20196913182735443 + }, + { + "entropy": 9.440950393676758, + "epoch": 0.6927031837057545, + "mean_token_accuracy": 0.7175572514533997, + "num_tokens": 15645719.0, + "step": 7006, + "train/ce_loss": 5.76427519263234e-05 + }, + { + "epoch": 0.6927031837057545, + "step": 7006, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6927031837057545, + "step": 7006, + "train/total_loss": 0.02344326488673687 + }, + { + "entropy": 8.93044662475586, + "epoch": 0.6928020565552699, + "mean_token_accuracy": 0.7400000095367432, + "num_tokens": 15651044.0, + "step": 7007, + "train/ce_loss": 2.412987214484019e-06 + }, + { + "epoch": 0.6928020565552699, + "step": 7007, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6928020565552699, + "step": 7007, + "train/total_loss": 0.0195314921438694 + }, + { + "entropy": 8.99925422668457, + "epoch": 0.6929009294047854, + "mean_token_accuracy": 0.7074742317199707, + "num_tokens": 15656305.0, + "step": 7008, + "train/ce_loss": 1.2787649631500244 + }, + { + "epoch": 0.6929009294047854, + "step": 7008, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6929009294047854, + "step": 7008, + "train/total_loss": 0.17865775525569916 + }, + { + "entropy": 9.029271125793457, + "epoch": 0.692999802254301, + "mean_token_accuracy": 0.81717449426651, + "num_tokens": 15661522.0, + "step": 7009, + "train/ce_loss": 0.8705811500549316 + }, + { + "epoch": 0.692999802254301, + "step": 7009, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.692999802254301, + "step": 7009, + "train/total_loss": 0.13393312692642212 + }, + { + "entropy": 8.652618408203125, + "epoch": 0.6930986751038165, + "mean_token_accuracy": 0.6761229038238525, + "num_tokens": 15666832.0, + "step": 7010, + "train/ce_loss": 0.34817835688591003 + }, + { + "epoch": 0.6930986751038165, + "step": 7010, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6930986751038165, + "step": 7010, + "train/total_loss": 0.08169283717870712 + }, + { + "entropy": 8.780120849609375, + "epoch": 0.693197547953332, + "mean_token_accuracy": 0.7528344392776489, + "num_tokens": 15672164.0, + "step": 7011, + "train/ce_loss": 0.5572643876075745 + }, + { + "epoch": 0.693197547953332, + "step": 7011, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.693197547953332, + "step": 7011, + "train/total_loss": 0.13385143876075745 + }, + { + "entropy": 8.186322212219238, + "epoch": 0.6932964208028476, + "mean_token_accuracy": 0.6894736886024475, + "num_tokens": 15677614.0, + "step": 7012, + "train/ce_loss": 1.051735281944275 + }, + { + "epoch": 0.6932964208028476, + "step": 7012, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.6932964208028476, + "step": 7012, + "train/total_loss": 0.1793922781944275 + }, + { + "entropy": 8.736364364624023, + "epoch": 0.6933952936523631, + "mean_token_accuracy": 0.734375, + "num_tokens": 15682890.0, + "step": 7013, + "train/ce_loss": 1.169240117073059 + }, + { + "epoch": 0.6933952936523631, + "step": 7013, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.6933952936523631, + "step": 7013, + "train/total_loss": 0.14426776766777039 + }, + { + "entropy": 8.883325576782227, + "epoch": 0.6934941665018786, + "mean_token_accuracy": 0.7451523542404175, + "num_tokens": 15688037.0, + "step": 7014, + "train/ce_loss": 0.5323801040649414 + }, + { + "epoch": 0.6934941665018786, + "step": 7014, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.6934941665018786, + "step": 7014, + "train/total_loss": 0.08839426189661026 + }, + { + "entropy": 8.72563648223877, + "epoch": 0.6935930393513942, + "mean_token_accuracy": 0.7043189406394958, + "num_tokens": 15693454.0, + "step": 7015, + "train/ce_loss": 0.7103878259658813 + }, + { + "epoch": 0.6935930393513942, + "step": 7015, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6935930393513942, + "step": 7015, + "train/total_loss": 0.12182003259658813 + }, + { + "entropy": 9.101256370544434, + "epoch": 0.6936919122009096, + "mean_token_accuracy": 0.7774342894554138, + "num_tokens": 15698527.0, + "step": 7016, + "train/ce_loss": 0.7961004972457886 + }, + { + "epoch": 0.6936919122009096, + "step": 7016, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6936919122009096, + "step": 7016, + "train/total_loss": 0.09914129972457886 + }, + { + "entropy": 9.542841911315918, + "epoch": 0.6937907850504251, + "mean_token_accuracy": 0.790123462677002, + "num_tokens": 15703310.0, + "step": 7017, + "train/ce_loss": 1.63310968875885 + }, + { + "epoch": 0.6937907850504251, + "step": 7017, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6937907850504251, + "step": 7017, + "train/total_loss": 0.2101859748363495 + }, + { + "entropy": 9.061891555786133, + "epoch": 0.6938896578999407, + "mean_token_accuracy": 0.673559844493866, + "num_tokens": 15708451.0, + "step": 7018, + "train/ce_loss": 1.4210647344589233 + }, + { + "epoch": 0.6938896578999407, + "step": 7018, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6938896578999407, + "step": 7018, + "train/total_loss": 0.18898147344589233 + }, + { + "entropy": 8.897470474243164, + "epoch": 0.6939885307494562, + "mean_token_accuracy": 0.7648686170578003, + "num_tokens": 15713599.0, + "step": 7019, + "train/ce_loss": 0.8759534955024719 + }, + { + "epoch": 0.6939885307494562, + "step": 7019, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6939885307494562, + "step": 7019, + "train/total_loss": 0.15009534358978271 + }, + { + "epoch": 0.6940874035989717, + "grad_norm": 0.6430907845497131, + "learning_rate": 8.26707214557682e-06, + "loss": 0.1372, + "step": 7020 + }, + { + "entropy": 9.013862609863281, + "epoch": 0.6940874035989717, + "mean_token_accuracy": 0.7945619225502014, + "num_tokens": 15718791.0, + "step": 7020, + "train/ce_loss": 0.6018125414848328 + }, + { + "epoch": 0.6940874035989717, + "step": 7020, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6940874035989717, + "step": 7020, + "train/total_loss": 0.10315001010894775 + }, + { + "entropy": 9.094813346862793, + "epoch": 0.6941862764484873, + "mean_token_accuracy": 0.7145214676856995, + "num_tokens": 15723836.0, + "step": 7021, + "train/ce_loss": 1.3392627239227295 + }, + { + "epoch": 0.6941862764484873, + "step": 7021, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6941862764484873, + "step": 7021, + "train/total_loss": 0.19642627239227295 + }, + { + "entropy": 9.047096252441406, + "epoch": 0.6942851492980028, + "mean_token_accuracy": 0.7503949403762817, + "num_tokens": 15729062.0, + "step": 7022, + "train/ce_loss": 0.79969322681427 + }, + { + "epoch": 0.6942851492980028, + "step": 7022, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.6942851492980028, + "step": 7022, + "train/total_loss": 0.20887556672096252 + }, + { + "entropy": 9.305543899536133, + "epoch": 0.6943840221475183, + "mean_token_accuracy": 0.7069536447525024, + "num_tokens": 15734089.0, + "step": 7023, + "train/ce_loss": 1.635424017906189 + }, + { + "epoch": 0.6943840221475183, + "step": 7023, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6943840221475183, + "step": 7023, + "train/total_loss": 0.20260490477085114 + }, + { + "entropy": 8.614456176757812, + "epoch": 0.6944828949970339, + "mean_token_accuracy": 0.6590662598609924, + "num_tokens": 15739476.0, + "step": 7024, + "train/ce_loss": 1.0922640562057495 + }, + { + "epoch": 0.6944828949970339, + "step": 7024, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.6944828949970339, + "step": 7024, + "train/total_loss": 0.21860140562057495 + }, + { + "entropy": 9.667367935180664, + "epoch": 0.6945817678465493, + "mean_token_accuracy": 0.7654028534889221, + "num_tokens": 15744277.0, + "step": 7025, + "train/ce_loss": 1.4774781465530396 + }, + { + "epoch": 0.6945817678465493, + "step": 7025, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6945817678465493, + "step": 7025, + "train/total_loss": 0.17118531465530396 + }, + { + "entropy": 9.445836067199707, + "epoch": 0.6946806406960648, + "mean_token_accuracy": 0.764976978302002, + "num_tokens": 15749147.0, + "step": 7026, + "train/ce_loss": 0.7500922083854675 + }, + { + "epoch": 0.6946806406960648, + "step": 7026, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6946806406960648, + "step": 7026, + "train/total_loss": 0.09454046934843063 + }, + { + "entropy": 8.696281433105469, + "epoch": 0.6947795135455804, + "mean_token_accuracy": 0.6740331649780273, + "num_tokens": 15754495.0, + "step": 7027, + "train/ce_loss": 1.7816426753997803 + }, + { + "epoch": 0.6947795135455804, + "step": 7027, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.6947795135455804, + "step": 7027, + "train/total_loss": 0.2797267735004425 + }, + { + "entropy": 8.684164047241211, + "epoch": 0.6948783863950959, + "mean_token_accuracy": 0.7372781038284302, + "num_tokens": 15759817.0, + "step": 7028, + "train/ce_loss": 0.6079049706459045 + }, + { + "epoch": 0.6948783863950959, + "step": 7028, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6948783863950959, + "step": 7028, + "train/total_loss": 0.08422799408435822 + }, + { + "entropy": 9.313911437988281, + "epoch": 0.6949772592446114, + "mean_token_accuracy": 0.750629723072052, + "num_tokens": 15764658.0, + "step": 7029, + "train/ce_loss": 1.0196951627731323 + }, + { + "epoch": 0.6949772592446114, + "step": 7029, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.6949772592446114, + "step": 7029, + "train/total_loss": 0.11368826776742935 + }, + { + "entropy": 9.19039535522461, + "epoch": 0.695076132094127, + "mean_token_accuracy": 0.7332268357276917, + "num_tokens": 15769758.0, + "step": 7030, + "train/ce_loss": 0.7941244840621948 + }, + { + "epoch": 0.695076132094127, + "step": 7030, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.695076132094127, + "step": 7030, + "train/total_loss": 0.10284995287656784 + }, + { + "entropy": 8.487091064453125, + "epoch": 0.6951750049436425, + "mean_token_accuracy": 0.7599545121192932, + "num_tokens": 15775114.0, + "step": 7031, + "train/ce_loss": 1.244439721107483 + }, + { + "epoch": 0.6951750049436425, + "step": 7031, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.6951750049436425, + "step": 7031, + "train/total_loss": 0.19475647807121277 + }, + { + "entropy": 8.887763023376465, + "epoch": 0.695273877793158, + "mean_token_accuracy": 0.7837837934494019, + "num_tokens": 15780342.0, + "step": 7032, + "train/ce_loss": 0.9753705263137817 + }, + { + "epoch": 0.695273877793158, + "step": 7032, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.695273877793158, + "step": 7032, + "train/total_loss": 0.2069120556116104 + }, + { + "entropy": 8.522140502929688, + "epoch": 0.6953727506426736, + "mean_token_accuracy": 0.7234273552894592, + "num_tokens": 15785701.0, + "step": 7033, + "train/ce_loss": 1.1009597778320312 + }, + { + "epoch": 0.6953727506426736, + "step": 7033, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6953727506426736, + "step": 7033, + "train/total_loss": 0.16087722778320312 + }, + { + "entropy": 8.658458709716797, + "epoch": 0.695471623492189, + "mean_token_accuracy": 0.8036999106407166, + "num_tokens": 15791293.0, + "step": 7034, + "train/ce_loss": 0.6290245056152344 + }, + { + "epoch": 0.695471623492189, + "step": 7034, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.695471623492189, + "step": 7034, + "train/total_loss": 0.12540245056152344 + }, + { + "entropy": 9.100859642028809, + "epoch": 0.6955704963417045, + "mean_token_accuracy": 0.7009202241897583, + "num_tokens": 15796324.0, + "step": 7035, + "train/ce_loss": 2.258258973597549e-06 + }, + { + "epoch": 0.6955704963417045, + "step": 7035, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6955704963417045, + "step": 7035, + "train/total_loss": 0.042968977242708206 + }, + { + "entropy": 8.710677146911621, + "epoch": 0.6956693691912201, + "mean_token_accuracy": 0.7430051565170288, + "num_tokens": 15801770.0, + "step": 7036, + "train/ce_loss": 0.6250700950622559 + }, + { + "epoch": 0.6956693691912201, + "step": 7036, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6956693691912201, + "step": 7036, + "train/total_loss": 0.1015695109963417 + }, + { + "entropy": 9.15246868133545, + "epoch": 0.6957682420407356, + "mean_token_accuracy": 0.7771317958831787, + "num_tokens": 15806704.0, + "step": 7037, + "train/ce_loss": 1.804560661315918 + }, + { + "epoch": 0.6957682420407356, + "step": 7037, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.6957682420407356, + "step": 7037, + "train/total_loss": 0.2624873220920563 + }, + { + "entropy": 8.971076011657715, + "epoch": 0.6958671148902511, + "mean_token_accuracy": 0.7671394944190979, + "num_tokens": 15812041.0, + "step": 7038, + "train/ce_loss": 1.947181317518698e-06 + }, + { + "epoch": 0.6958671148902511, + "step": 7038, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6958671148902511, + "step": 7038, + "train/total_loss": 0.05078144371509552 + }, + { + "entropy": 8.714999198913574, + "epoch": 0.6959659877397667, + "mean_token_accuracy": 0.7703225612640381, + "num_tokens": 15817481.0, + "step": 7039, + "train/ce_loss": 0.8191965222358704 + }, + { + "epoch": 0.6959659877397667, + "step": 7039, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6959659877397667, + "step": 7039, + "train/total_loss": 0.14441965520381927 + }, + { + "epoch": 0.6960648605892822, + "grad_norm": 0.6332975625991821, + "learning_rate": 8.26212728081887e-06, + "loss": 0.1369, + "step": 7040 + }, + { + "entropy": 8.384191513061523, + "epoch": 0.6960648605892822, + "mean_token_accuracy": 0.7045454382896423, + "num_tokens": 15822919.0, + "step": 7040, + "train/ce_loss": 0.9338988065719604 + }, + { + "epoch": 0.6960648605892822, + "step": 7040, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.6960648605892822, + "step": 7040, + "train/total_loss": 0.19885863363742828 + }, + { + "entropy": 8.649618148803711, + "epoch": 0.6961637334387977, + "mean_token_accuracy": 0.7686403393745422, + "num_tokens": 15828269.0, + "step": 7041, + "train/ce_loss": 0.6279864311218262 + }, + { + "epoch": 0.6961637334387977, + "step": 7041, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6961637334387977, + "step": 7041, + "train/total_loss": 0.0862361416220665 + }, + { + "entropy": 8.825262069702148, + "epoch": 0.6962626062883133, + "mean_token_accuracy": 0.7746913433074951, + "num_tokens": 15833417.0, + "step": 7042, + "train/ce_loss": 0.810846209526062 + }, + { + "epoch": 0.6962626062883133, + "step": 7042, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6962626062883133, + "step": 7042, + "train/total_loss": 0.10061587393283844 + }, + { + "entropy": 8.882431983947754, + "epoch": 0.6963614791378288, + "mean_token_accuracy": 0.7299168705940247, + "num_tokens": 15838598.0, + "step": 7043, + "train/ce_loss": 1.1015716791152954 + }, + { + "epoch": 0.6963614791378288, + "step": 7043, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6963614791378288, + "step": 7043, + "train/total_loss": 0.16484466195106506 + }, + { + "entropy": 8.79694938659668, + "epoch": 0.6964603519873442, + "mean_token_accuracy": 0.7814726829528809, + "num_tokens": 15843913.0, + "step": 7044, + "train/ce_loss": 0.45228341221809387 + }, + { + "epoch": 0.6964603519873442, + "step": 7044, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6964603519873442, + "step": 7044, + "train/total_loss": 0.06475959718227386 + }, + { + "entropy": 8.557976722717285, + "epoch": 0.6965592248368598, + "mean_token_accuracy": 0.7971863746643066, + "num_tokens": 15849218.0, + "step": 7045, + "train/ce_loss": 0.6079275608062744 + }, + { + "epoch": 0.6965592248368598, + "step": 7045, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6965592248368598, + "step": 7045, + "train/total_loss": 0.11938650906085968 + }, + { + "entropy": 8.67821216583252, + "epoch": 0.6966580976863753, + "mean_token_accuracy": 0.7239868640899658, + "num_tokens": 15854649.0, + "step": 7046, + "train/ce_loss": 0.6561583876609802 + }, + { + "epoch": 0.6966580976863753, + "step": 7046, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.6966580976863753, + "step": 7046, + "train/total_loss": 0.09295959025621414 + }, + { + "entropy": 8.828751564025879, + "epoch": 0.6967569705358908, + "mean_token_accuracy": 0.7404255270957947, + "num_tokens": 15859791.0, + "step": 7047, + "train/ce_loss": 4.0537565837439615e-06 + }, + { + "epoch": 0.6967569705358908, + "step": 7047, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6967569705358908, + "step": 7047, + "train/total_loss": 0.05468790605664253 + }, + { + "entropy": 8.84826946258545, + "epoch": 0.6968558433854064, + "mean_token_accuracy": 0.7540983557701111, + "num_tokens": 15864923.0, + "step": 7048, + "train/ce_loss": 1.7639780708123e-05 + }, + { + "epoch": 0.6968558433854064, + "step": 7048, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.6968558433854064, + "step": 7048, + "train/total_loss": 0.08593926578760147 + }, + { + "entropy": 8.52353286743164, + "epoch": 0.6969547162349219, + "mean_token_accuracy": 0.8147714138031006, + "num_tokens": 15870225.0, + "step": 7049, + "train/ce_loss": 0.5272038578987122 + }, + { + "epoch": 0.6969547162349219, + "step": 7049, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6969547162349219, + "step": 7049, + "train/total_loss": 0.07225163280963898 + }, + { + "entropy": 8.890176773071289, + "epoch": 0.6970535890844374, + "mean_token_accuracy": 0.748851478099823, + "num_tokens": 15875310.0, + "step": 7050, + "train/ce_loss": 1.0641462802886963 + }, + { + "epoch": 0.6970535890844374, + "step": 7050, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.6970535890844374, + "step": 7050, + "train/total_loss": 0.16110213100910187 + }, + { + "entropy": 8.700510025024414, + "epoch": 0.697152461933953, + "mean_token_accuracy": 0.7427577972412109, + "num_tokens": 15880645.0, + "step": 7051, + "train/ce_loss": 1.2868669033050537 + }, + { + "epoch": 0.697152461933953, + "step": 7051, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.697152461933953, + "step": 7051, + "train/total_loss": 0.17946794629096985 + }, + { + "entropy": 9.045878410339355, + "epoch": 0.6972513347834685, + "mean_token_accuracy": 0.7814815044403076, + "num_tokens": 15885580.0, + "step": 7052, + "train/ce_loss": 1.1209051609039307 + }, + { + "epoch": 0.6972513347834685, + "step": 7052, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6972513347834685, + "step": 7052, + "train/total_loss": 0.15115302801132202 + }, + { + "entropy": 9.706929206848145, + "epoch": 0.6973502076329839, + "mean_token_accuracy": 0.7095709443092346, + "num_tokens": 15890263.0, + "step": 7053, + "train/ce_loss": 1.4180285930633545 + }, + { + "epoch": 0.6973502076329839, + "step": 7053, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6973502076329839, + "step": 7053, + "train/total_loss": 0.1847716122865677 + }, + { + "entropy": 8.205760955810547, + "epoch": 0.6974490804824995, + "mean_token_accuracy": 0.7109295129776001, + "num_tokens": 15895745.0, + "step": 7054, + "train/ce_loss": 1.4638735055923462 + }, + { + "epoch": 0.6974490804824995, + "step": 7054, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6974490804824995, + "step": 7054, + "train/total_loss": 0.20498110353946686 + }, + { + "entropy": 8.700922012329102, + "epoch": 0.697547953332015, + "mean_token_accuracy": 0.6952381134033203, + "num_tokens": 15900979.0, + "step": 7055, + "train/ce_loss": 1.427583932876587 + }, + { + "epoch": 0.697547953332015, + "step": 7055, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.697547953332015, + "step": 7055, + "train/total_loss": 0.20525839924812317 + }, + { + "entropy": 9.129181861877441, + "epoch": 0.6976468261815305, + "mean_token_accuracy": 0.699999988079071, + "num_tokens": 15906045.0, + "step": 7056, + "train/ce_loss": 1.8817038536071777 + }, + { + "epoch": 0.6976468261815305, + "step": 7056, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.6976468261815305, + "step": 7056, + "train/total_loss": 0.23113913834095 + }, + { + "entropy": 8.812071800231934, + "epoch": 0.6977456990310461, + "mean_token_accuracy": 0.7074999809265137, + "num_tokens": 15911278.0, + "step": 7057, + "train/ce_loss": 1.5500264167785645 + }, + { + "epoch": 0.6977456990310461, + "step": 7057, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6977456990310461, + "step": 7057, + "train/total_loss": 0.2018776386976242 + }, + { + "entropy": 8.717105865478516, + "epoch": 0.6978445718805616, + "mean_token_accuracy": 0.7850356101989746, + "num_tokens": 15916581.0, + "step": 7058, + "train/ce_loss": 1.033908724784851 + }, + { + "epoch": 0.6978445718805616, + "step": 7058, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6978445718805616, + "step": 7058, + "train/total_loss": 0.1424533724784851 + }, + { + "entropy": 8.572164535522461, + "epoch": 0.6979434447300771, + "mean_token_accuracy": 0.731517493724823, + "num_tokens": 15922098.0, + "step": 7059, + "train/ce_loss": 1.026472568511963 + }, + { + "epoch": 0.6979434447300771, + "step": 7059, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.6979434447300771, + "step": 7059, + "train/total_loss": 0.16514725983142853 + }, + { + "epoch": 0.6980423175795927, + "grad_norm": 0.5985382199287415, + "learning_rate": 8.257182416060922e-06, + "loss": 0.1269, + "step": 7060 + }, + { + "entropy": 8.374982833862305, + "epoch": 0.6980423175795927, + "mean_token_accuracy": 0.7595682144165039, + "num_tokens": 15927602.0, + "step": 7060, + "train/ce_loss": 0.7092851996421814 + }, + { + "epoch": 0.6980423175795927, + "step": 7060, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6980423175795927, + "step": 7060, + "train/total_loss": 0.12170977145433426 + }, + { + "entropy": 8.375333786010742, + "epoch": 0.6981411904291082, + "mean_token_accuracy": 0.7082917094230652, + "num_tokens": 15933067.0, + "step": 7061, + "train/ce_loss": 1.6360722780227661 + }, + { + "epoch": 0.6981411904291082, + "step": 7061, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.6981411904291082, + "step": 7061, + "train/total_loss": 0.27298223972320557 + }, + { + "entropy": 9.378819465637207, + "epoch": 0.6982400632786236, + "mean_token_accuracy": 0.7847357988357544, + "num_tokens": 15938013.0, + "step": 7062, + "train/ce_loss": 0.7179803252220154 + }, + { + "epoch": 0.6982400632786236, + "step": 7062, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6982400632786236, + "step": 7062, + "train/total_loss": 0.09132928401231766 + }, + { + "entropy": 9.801604270935059, + "epoch": 0.6983389361281392, + "mean_token_accuracy": 0.7347826361656189, + "num_tokens": 15942619.0, + "step": 7063, + "train/ce_loss": 6.8908921093679965e-06 + }, + { + "epoch": 0.6983389361281392, + "step": 7063, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6983389361281392, + "step": 7063, + "train/total_loss": 0.039063189178705215 + }, + { + "entropy": 8.975824356079102, + "epoch": 0.6984378089776547, + "mean_token_accuracy": 0.7554980516433716, + "num_tokens": 15947894.0, + "step": 7064, + "train/ce_loss": 0.6070204377174377 + }, + { + "epoch": 0.6984378089776547, + "step": 7064, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.6984378089776547, + "step": 7064, + "train/total_loss": 0.15445204079151154 + }, + { + "entropy": 8.47336196899414, + "epoch": 0.6985366818271702, + "mean_token_accuracy": 0.7288135886192322, + "num_tokens": 15953294.0, + "step": 7065, + "train/ce_loss": 0.7014027833938599 + }, + { + "epoch": 0.6985366818271702, + "step": 7065, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.6985366818271702, + "step": 7065, + "train/total_loss": 0.0857652798295021 + }, + { + "entropy": 9.08005142211914, + "epoch": 0.6986355546766858, + "mean_token_accuracy": 0.7300509214401245, + "num_tokens": 15958338.0, + "step": 7066, + "train/ce_loss": 0.7763445973396301 + }, + { + "epoch": 0.6986355546766858, + "step": 7066, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.6986355546766858, + "step": 7066, + "train/total_loss": 0.19091570377349854 + }, + { + "entropy": 8.79062271118164, + "epoch": 0.6987344275262013, + "mean_token_accuracy": 0.7575392127037048, + "num_tokens": 15963659.0, + "step": 7067, + "train/ce_loss": 0.7277435660362244 + }, + { + "epoch": 0.6987344275262013, + "step": 7067, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6987344275262013, + "step": 7067, + "train/total_loss": 0.09621185809373856 + }, + { + "entropy": 9.04910659790039, + "epoch": 0.6988333003757168, + "mean_token_accuracy": 0.6647887229919434, + "num_tokens": 15968820.0, + "step": 7068, + "train/ce_loss": 1.0253169536590576 + }, + { + "epoch": 0.6988333003757168, + "step": 7068, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.6988333003757168, + "step": 7068, + "train/total_loss": 0.16112545132637024 + }, + { + "entropy": 9.114204406738281, + "epoch": 0.6989321732252324, + "mean_token_accuracy": 0.8054298758506775, + "num_tokens": 15973904.0, + "step": 7069, + "train/ce_loss": 2.230848394901841e-06 + }, + { + "epoch": 0.6989321732252324, + "step": 7069, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.6989321732252324, + "step": 7069, + "train/total_loss": 0.019531473517417908 + }, + { + "entropy": 8.6248779296875, + "epoch": 0.6990310460747479, + "mean_token_accuracy": 0.781737208366394, + "num_tokens": 15979295.0, + "step": 7070, + "train/ce_loss": 0.9150219559669495 + }, + { + "epoch": 0.6990310460747479, + "step": 7070, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.6990310460747479, + "step": 7070, + "train/total_loss": 0.11884594708681107 + }, + { + "entropy": 9.219822883605957, + "epoch": 0.6991299189242633, + "mean_token_accuracy": 0.7191780805587769, + "num_tokens": 15984285.0, + "step": 7071, + "train/ce_loss": 4.236349923303351e-06 + }, + { + "epoch": 0.6991299189242633, + "step": 7071, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.6991299189242633, + "step": 7071, + "train/total_loss": 0.050781674683094025 + }, + { + "entropy": 8.964593887329102, + "epoch": 0.699228791773779, + "mean_token_accuracy": 0.8073654174804688, + "num_tokens": 15989435.0, + "step": 7072, + "train/ce_loss": 0.8848397731781006 + }, + { + "epoch": 0.699228791773779, + "step": 7072, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.699228791773779, + "step": 7072, + "train/total_loss": 0.11973398178815842 + }, + { + "entropy": 8.831852912902832, + "epoch": 0.6993276646232944, + "mean_token_accuracy": 0.7979942560195923, + "num_tokens": 15994636.0, + "step": 7073, + "train/ce_loss": 0.48496463894844055 + }, + { + "epoch": 0.6993276646232944, + "step": 7073, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.6993276646232944, + "step": 7073, + "train/total_loss": 0.06412146985530853 + }, + { + "entropy": 8.867591857910156, + "epoch": 0.6994265374728099, + "mean_token_accuracy": 0.7411095499992371, + "num_tokens": 15999758.0, + "step": 7074, + "train/ce_loss": 0.6474717855453491 + }, + { + "epoch": 0.6994265374728099, + "step": 7074, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.6994265374728099, + "step": 7074, + "train/total_loss": 0.08818467706441879 + }, + { + "entropy": 8.66320514678955, + "epoch": 0.6995254103223255, + "mean_token_accuracy": 0.7608951926231384, + "num_tokens": 16005014.0, + "step": 7075, + "train/ce_loss": 1.0257648229599 + }, + { + "epoch": 0.6995254103223255, + "step": 7075, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.6995254103223255, + "step": 7075, + "train/total_loss": 0.18070149421691895 + }, + { + "entropy": 9.197005271911621, + "epoch": 0.699624283171841, + "mean_token_accuracy": 0.6825174689292908, + "num_tokens": 16010200.0, + "step": 7076, + "train/ce_loss": 0.7518903017044067 + }, + { + "epoch": 0.699624283171841, + "step": 7076, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.699624283171841, + "step": 7076, + "train/total_loss": 0.1455015242099762 + }, + { + "entropy": 8.708137512207031, + "epoch": 0.6997231560213565, + "mean_token_accuracy": 0.778064489364624, + "num_tokens": 16015397.0, + "step": 7077, + "train/ce_loss": 1.0956438779830933 + }, + { + "epoch": 0.6997231560213565, + "step": 7077, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.6997231560213565, + "step": 7077, + "train/total_loss": 0.1564393937587738 + }, + { + "entropy": 8.628875732421875, + "epoch": 0.6998220288708721, + "mean_token_accuracy": 0.7569866180419922, + "num_tokens": 16020699.0, + "step": 7078, + "train/ce_loss": 0.5898162126541138 + }, + { + "epoch": 0.6998220288708721, + "step": 7078, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.6998220288708721, + "step": 7078, + "train/total_loss": 0.09804412722587585 + }, + { + "entropy": 8.9287691116333, + "epoch": 0.6999209017203876, + "mean_token_accuracy": 0.7357512712478638, + "num_tokens": 16025927.0, + "step": 7079, + "train/ce_loss": 0.9454010725021362 + }, + { + "epoch": 0.6999209017203876, + "step": 7079, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.6999209017203876, + "step": 7079, + "train/total_loss": 0.18047761917114258 + }, + { + "epoch": 0.700019774569903, + "grad_norm": 0.6242483258247375, + "learning_rate": 8.252237551302973e-06, + "loss": 0.1294, + "step": 7080 + }, + { + "entropy": 8.967345237731934, + "epoch": 0.700019774569903, + "mean_token_accuracy": 0.7030848264694214, + "num_tokens": 16031203.0, + "step": 7080, + "train/ce_loss": 0.6736631989479065 + }, + { + "epoch": 0.700019774569903, + "step": 7080, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.700019774569903, + "step": 7080, + "train/total_loss": 0.09080382436513901 + }, + { + "entropy": 8.525558471679688, + "epoch": 0.7001186474194186, + "mean_token_accuracy": 0.7220930457115173, + "num_tokens": 16036539.0, + "step": 7081, + "train/ce_loss": 1.2200530767440796 + }, + { + "epoch": 0.7001186474194186, + "step": 7081, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7001186474194186, + "step": 7081, + "train/total_loss": 0.16106781363487244 + }, + { + "entropy": 9.250432014465332, + "epoch": 0.7002175202689341, + "mean_token_accuracy": 0.743879497051239, + "num_tokens": 16041435.0, + "step": 7082, + "train/ce_loss": 1.3173713684082031 + }, + { + "epoch": 0.7002175202689341, + "step": 7082, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7002175202689341, + "step": 7082, + "train/total_loss": 0.1629871428012848 + }, + { + "entropy": 8.72872257232666, + "epoch": 0.7003163931184496, + "mean_token_accuracy": 0.74609375, + "num_tokens": 16046697.0, + "step": 7083, + "train/ce_loss": 4.460508080228465e-06 + }, + { + "epoch": 0.7003163931184496, + "step": 7083, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7003163931184496, + "step": 7083, + "train/total_loss": 0.050781697034835815 + }, + { + "entropy": 8.655278205871582, + "epoch": 0.7004152659679652, + "mean_token_accuracy": 0.7363515496253967, + "num_tokens": 16051930.0, + "step": 7084, + "train/ce_loss": 0.8728628754615784 + }, + { + "epoch": 0.7004152659679652, + "step": 7084, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7004152659679652, + "step": 7084, + "train/total_loss": 0.13025504350662231 + }, + { + "entropy": 8.845071792602539, + "epoch": 0.7005141388174807, + "mean_token_accuracy": 0.7242857217788696, + "num_tokens": 16057115.0, + "step": 7085, + "train/ce_loss": 0.8364428877830505 + }, + { + "epoch": 0.7005141388174807, + "step": 7085, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7005141388174807, + "step": 7085, + "train/total_loss": 0.134425550699234 + }, + { + "entropy": 8.940481185913086, + "epoch": 0.7006130116669962, + "mean_token_accuracy": 0.7395973205566406, + "num_tokens": 16062287.0, + "step": 7086, + "train/ce_loss": 1.0155926942825317 + }, + { + "epoch": 0.7006130116669962, + "step": 7086, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7006130116669962, + "step": 7086, + "train/total_loss": 0.13671553134918213 + }, + { + "entropy": 8.607136726379395, + "epoch": 0.7007118845165118, + "mean_token_accuracy": 0.698285698890686, + "num_tokens": 16067616.0, + "step": 7087, + "train/ce_loss": 0.9275574684143066 + }, + { + "epoch": 0.7007118845165118, + "step": 7087, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7007118845165118, + "step": 7087, + "train/total_loss": 0.1357244998216629 + }, + { + "entropy": 8.59605598449707, + "epoch": 0.7008107573660273, + "mean_token_accuracy": 0.7386478185653687, + "num_tokens": 16073144.0, + "step": 7088, + "train/ce_loss": 0.5148741602897644 + }, + { + "epoch": 0.7008107573660273, + "step": 7088, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7008107573660273, + "step": 7088, + "train/total_loss": 0.09445616602897644 + }, + { + "entropy": 8.678980827331543, + "epoch": 0.7009096302155429, + "mean_token_accuracy": 0.7221621870994568, + "num_tokens": 16078536.0, + "step": 7089, + "train/ce_loss": 1.5042873620986938 + }, + { + "epoch": 0.7009096302155429, + "step": 7089, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7009096302155429, + "step": 7089, + "train/total_loss": 0.19339749217033386 + }, + { + "entropy": 8.27509880065918, + "epoch": 0.7010085030650584, + "mean_token_accuracy": 0.747553825378418, + "num_tokens": 16084061.0, + "step": 7090, + "train/ce_loss": 1.1672831773757935 + }, + { + "epoch": 0.7010085030650584, + "step": 7090, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.7010085030650584, + "step": 7090, + "train/total_loss": 0.19094707071781158 + }, + { + "entropy": 8.718148231506348, + "epoch": 0.7011073759145738, + "mean_token_accuracy": 0.748062014579773, + "num_tokens": 16089291.0, + "step": 7091, + "train/ce_loss": 1.0415087938308716 + }, + { + "epoch": 0.7011073759145738, + "step": 7091, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.7011073759145738, + "step": 7091, + "train/total_loss": 0.1822758913040161 + }, + { + "entropy": 8.659074783325195, + "epoch": 0.7012062487640894, + "mean_token_accuracy": 0.7489823698997498, + "num_tokens": 16094534.0, + "step": 7092, + "train/ce_loss": 0.36011967062950134 + }, + { + "epoch": 0.7012062487640894, + "step": 7092, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7012062487640894, + "step": 7092, + "train/total_loss": 0.0633557140827179 + }, + { + "entropy": 9.10936164855957, + "epoch": 0.7013051216136049, + "mean_token_accuracy": 0.7325383424758911, + "num_tokens": 16099542.0, + "step": 7093, + "train/ce_loss": 1.0369967222213745 + }, + { + "epoch": 0.7013051216136049, + "step": 7093, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7013051216136049, + "step": 7093, + "train/total_loss": 0.1310434341430664 + }, + { + "entropy": 8.790109634399414, + "epoch": 0.7014039944631204, + "mean_token_accuracy": 0.7553310990333557, + "num_tokens": 16104956.0, + "step": 7094, + "train/ce_loss": 0.7927346229553223 + }, + { + "epoch": 0.7014039944631204, + "step": 7094, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.7014039944631204, + "step": 7094, + "train/total_loss": 0.16521096229553223 + }, + { + "entropy": 8.922645568847656, + "epoch": 0.701502867312636, + "mean_token_accuracy": 0.7646198868751526, + "num_tokens": 16110125.0, + "step": 7095, + "train/ce_loss": 0.5583441257476807 + }, + { + "epoch": 0.701502867312636, + "step": 7095, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.701502867312636, + "step": 7095, + "train/total_loss": 0.09489691257476807 + }, + { + "entropy": 9.202375411987305, + "epoch": 0.7016017401621515, + "mean_token_accuracy": 0.7791519165039062, + "num_tokens": 16115083.0, + "step": 7096, + "train/ce_loss": 1.1116911172866821 + }, + { + "epoch": 0.7016017401621515, + "step": 7096, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7016017401621515, + "step": 7096, + "train/total_loss": 0.15413786470890045 + }, + { + "entropy": 9.530303955078125, + "epoch": 0.701700613011667, + "mean_token_accuracy": 0.7134387493133545, + "num_tokens": 16119981.0, + "step": 7097, + "train/ce_loss": 2.1672567527275532e-05 + }, + { + "epoch": 0.701700613011667, + "step": 7097, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.701700613011667, + "step": 7097, + "train/total_loss": 0.054689668118953705 + }, + { + "entropy": 9.085861206054688, + "epoch": 0.7017994858611826, + "mean_token_accuracy": 0.7403314709663391, + "num_tokens": 16124953.0, + "step": 7098, + "train/ce_loss": 4.617771992343478e-06 + }, + { + "epoch": 0.7017994858611826, + "step": 7098, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7017994858611826, + "step": 7098, + "train/total_loss": 0.05078171193599701 + }, + { + "entropy": 8.683131217956543, + "epoch": 0.7018983587106981, + "mean_token_accuracy": 0.7448512315750122, + "num_tokens": 16130276.0, + "step": 7099, + "train/ce_loss": 1.2270652055740356 + }, + { + "epoch": 0.7018983587106981, + "step": 7099, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.7018983587106981, + "step": 7099, + "train/total_loss": 0.22817528247833252 + }, + { + "epoch": 0.7019972315602135, + "grad_norm": 0.6079980134963989, + "learning_rate": 8.247292686545023e-06, + "loss": 0.1378, + "step": 7100 + }, + { + "entropy": 9.169426918029785, + "epoch": 0.7019972315602135, + "mean_token_accuracy": 0.7962675094604492, + "num_tokens": 16135351.0, + "step": 7100, + "train/ce_loss": 0.6476722359657288 + }, + { + "epoch": 0.7019972315602135, + "step": 7100, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7019972315602135, + "step": 7100, + "train/total_loss": 0.11554847657680511 + }, + { + "entropy": 8.599437713623047, + "epoch": 0.7020961044097291, + "mean_token_accuracy": 0.695652186870575, + "num_tokens": 16140657.0, + "step": 7101, + "train/ce_loss": 1.5919551849365234 + }, + { + "epoch": 0.7020961044097291, + "step": 7101, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7020961044097291, + "step": 7101, + "train/total_loss": 0.21388302743434906 + }, + { + "entropy": 8.517477989196777, + "epoch": 0.7021949772592446, + "mean_token_accuracy": 0.7347368597984314, + "num_tokens": 16146108.0, + "step": 7102, + "train/ce_loss": 0.6423676609992981 + }, + { + "epoch": 0.7021949772592446, + "step": 7102, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7021949772592446, + "step": 7102, + "train/total_loss": 0.11501801759004593 + }, + { + "entropy": 8.598403930664062, + "epoch": 0.7022938501087601, + "mean_token_accuracy": 0.6909765005111694, + "num_tokens": 16151414.0, + "step": 7103, + "train/ce_loss": 0.7955908179283142 + }, + { + "epoch": 0.7022938501087601, + "step": 7103, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.7022938501087601, + "step": 7103, + "train/total_loss": 0.1459653377532959 + }, + { + "entropy": 9.130369186401367, + "epoch": 0.7023927229582757, + "mean_token_accuracy": 0.7560975551605225, + "num_tokens": 16156419.0, + "step": 7104, + "train/ce_loss": 0.9296554327011108 + }, + { + "epoch": 0.7023927229582757, + "step": 7104, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.7023927229582757, + "step": 7104, + "train/total_loss": 0.10468429327011108 + }, + { + "entropy": 8.827374458312988, + "epoch": 0.7024915958077912, + "mean_token_accuracy": 0.7533875107765198, + "num_tokens": 16161646.0, + "step": 7105, + "train/ce_loss": 0.5184999108314514 + }, + { + "epoch": 0.7024915958077912, + "step": 7105, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7024915958077912, + "step": 7105, + "train/total_loss": 0.07919374108314514 + }, + { + "entropy": 8.807904243469238, + "epoch": 0.7025904686573067, + "mean_token_accuracy": 0.7344444394111633, + "num_tokens": 16166945.0, + "step": 7106, + "train/ce_loss": 0.5310591459274292 + }, + { + "epoch": 0.7025904686573067, + "step": 7106, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7025904686573067, + "step": 7106, + "train/total_loss": 0.1156059205532074 + }, + { + "entropy": 9.410507202148438, + "epoch": 0.7026893415068223, + "mean_token_accuracy": 0.748971164226532, + "num_tokens": 16171838.0, + "step": 7107, + "train/ce_loss": 3.303274752397556e-06 + }, + { + "epoch": 0.7026893415068223, + "step": 7107, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7026893415068223, + "step": 7107, + "train/total_loss": 0.04687533155083656 + }, + { + "entropy": 9.01647663116455, + "epoch": 0.7027882143563378, + "mean_token_accuracy": 0.7506082653999329, + "num_tokens": 16177259.0, + "step": 7108, + "train/ce_loss": 6.459321866714163e-06 + }, + { + "epoch": 0.7027882143563378, + "step": 7108, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7027882143563378, + "step": 7108, + "train/total_loss": 0.023438146337866783 + }, + { + "entropy": 8.497390747070312, + "epoch": 0.7028870872058532, + "mean_token_accuracy": 0.747583270072937, + "num_tokens": 16182649.0, + "step": 7109, + "train/ce_loss": 0.850794792175293 + }, + { + "epoch": 0.7028870872058532, + "step": 7109, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7028870872058532, + "step": 7109, + "train/total_loss": 0.13195449113845825 + }, + { + "entropy": 8.878206253051758, + "epoch": 0.7029859600553688, + "mean_token_accuracy": 0.6658415794372559, + "num_tokens": 16187907.0, + "step": 7110, + "train/ce_loss": 0.9725397825241089 + }, + { + "epoch": 0.7029859600553688, + "step": 7110, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.7029859600553688, + "step": 7110, + "train/total_loss": 0.2261602282524109 + }, + { + "entropy": 8.743565559387207, + "epoch": 0.7030848329048843, + "mean_token_accuracy": 0.6086448431015015, + "num_tokens": 16193219.0, + "step": 7111, + "train/ce_loss": 0.97925865650177 + }, + { + "epoch": 0.7030848329048843, + "step": 7111, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.7030848329048843, + "step": 7111, + "train/total_loss": 0.19167587161064148 + }, + { + "entropy": 8.753701210021973, + "epoch": 0.7031837057543998, + "mean_token_accuracy": 0.7647058963775635, + "num_tokens": 16198592.0, + "step": 7112, + "train/ce_loss": 0.5746150612831116 + }, + { + "epoch": 0.7031837057543998, + "step": 7112, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7031837057543998, + "step": 7112, + "train/total_loss": 0.08089900761842728 + }, + { + "entropy": 8.577747344970703, + "epoch": 0.7032825786039154, + "mean_token_accuracy": 0.7411225438117981, + "num_tokens": 16203972.0, + "step": 7113, + "train/ce_loss": 1.0217781066894531 + }, + { + "epoch": 0.7032825786039154, + "step": 7113, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7032825786039154, + "step": 7113, + "train/total_loss": 0.16467781364917755 + }, + { + "entropy": 8.97984504699707, + "epoch": 0.7033814514534309, + "mean_token_accuracy": 0.7781690359115601, + "num_tokens": 16209017.0, + "step": 7114, + "train/ce_loss": 0.6582441926002502 + }, + { + "epoch": 0.7033814514534309, + "step": 7114, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.7033814514534309, + "step": 7114, + "train/total_loss": 0.15566816926002502 + }, + { + "entropy": 8.744083404541016, + "epoch": 0.7034803243029464, + "mean_token_accuracy": 0.7279411554336548, + "num_tokens": 16214460.0, + "step": 7115, + "train/ce_loss": 1.2731103897094727 + }, + { + "epoch": 0.7034803243029464, + "step": 7115, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7034803243029464, + "step": 7115, + "train/total_loss": 0.17418603599071503 + }, + { + "entropy": 8.947891235351562, + "epoch": 0.703579197152462, + "mean_token_accuracy": 0.7562408447265625, + "num_tokens": 16219600.0, + "step": 7116, + "train/ce_loss": 0.5274356603622437 + }, + { + "epoch": 0.703579197152462, + "step": 7116, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.703579197152462, + "step": 7116, + "train/total_loss": 0.1035248190164566 + }, + { + "entropy": 8.943184852600098, + "epoch": 0.7036780700019775, + "mean_token_accuracy": 0.6681922078132629, + "num_tokens": 16224930.0, + "step": 7117, + "train/ce_loss": 1.4683958292007446 + }, + { + "epoch": 0.7036780700019775, + "step": 7117, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7036780700019775, + "step": 7117, + "train/total_loss": 0.20933958888053894 + }, + { + "entropy": 9.058347702026367, + "epoch": 0.703776942851493, + "mean_token_accuracy": 0.7681607604026794, + "num_tokens": 16229995.0, + "step": 7118, + "train/ce_loss": 0.9557143449783325 + }, + { + "epoch": 0.703776942851493, + "step": 7118, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.703776942851493, + "step": 7118, + "train/total_loss": 0.12291518598794937 + }, + { + "entropy": 8.758286476135254, + "epoch": 0.7038758157010085, + "mean_token_accuracy": 0.7527272701263428, + "num_tokens": 16235290.0, + "step": 7119, + "train/ce_loss": 1.1078318357467651 + }, + { + "epoch": 0.7038758157010085, + "step": 7119, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7038758157010085, + "step": 7119, + "train/total_loss": 0.169376939535141 + }, + { + "epoch": 0.703974688550524, + "grad_norm": 0.5794971585273743, + "learning_rate": 8.242347821787075e-06, + "loss": 0.1395, + "step": 7120 + }, + { + "entropy": 8.493532180786133, + "epoch": 0.703974688550524, + "mean_token_accuracy": 0.758169949054718, + "num_tokens": 16240724.0, + "step": 7120, + "train/ce_loss": 0.9743674397468567 + }, + { + "epoch": 0.703974688550524, + "step": 7120, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.703974688550524, + "step": 7120, + "train/total_loss": 0.13649925589561462 + }, + { + "entropy": 8.739421844482422, + "epoch": 0.7040735614000395, + "mean_token_accuracy": 0.7146371603012085, + "num_tokens": 16245995.0, + "step": 7121, + "train/ce_loss": 0.9262082576751709 + }, + { + "epoch": 0.7040735614000395, + "step": 7121, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7040735614000395, + "step": 7121, + "train/total_loss": 0.12387082725763321 + }, + { + "entropy": 8.64365005493164, + "epoch": 0.7041724342495551, + "mean_token_accuracy": 0.7761557102203369, + "num_tokens": 16251308.0, + "step": 7122, + "train/ce_loss": 0.5316778421401978 + }, + { + "epoch": 0.7041724342495551, + "step": 7122, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7041724342495551, + "step": 7122, + "train/total_loss": 0.09613654017448425 + }, + { + "entropy": 9.161237716674805, + "epoch": 0.7042713070990706, + "mean_token_accuracy": 0.7161290049552917, + "num_tokens": 16256355.0, + "step": 7123, + "train/ce_loss": 1.7950961589813232 + }, + { + "epoch": 0.7042713070990706, + "step": 7123, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.7042713070990706, + "step": 7123, + "train/total_loss": 0.28497838973999023 + }, + { + "entropy": 8.801056861877441, + "epoch": 0.7043701799485861, + "mean_token_accuracy": 0.7650200128555298, + "num_tokens": 16261530.0, + "step": 7124, + "train/ce_loss": 0.7621773481369019 + }, + { + "epoch": 0.7043701799485861, + "step": 7124, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7043701799485861, + "step": 7124, + "train/total_loss": 0.11918648332357407 + }, + { + "entropy": 8.350522994995117, + "epoch": 0.7044690527981017, + "mean_token_accuracy": 0.6791236996650696, + "num_tokens": 16266768.0, + "step": 7125, + "train/ce_loss": 1.3533482551574707 + }, + { + "epoch": 0.7044690527981017, + "step": 7125, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.7044690527981017, + "step": 7125, + "train/total_loss": 0.2251785844564438 + }, + { + "entropy": 8.447582244873047, + "epoch": 0.7045679256476172, + "mean_token_accuracy": 0.7481662631034851, + "num_tokens": 16272084.0, + "step": 7126, + "train/ce_loss": 0.9055770635604858 + }, + { + "epoch": 0.7045679256476172, + "step": 7126, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.7045679256476172, + "step": 7126, + "train/total_loss": 0.18821395933628082 + }, + { + "entropy": 9.425670623779297, + "epoch": 0.7046667984971327, + "mean_token_accuracy": 0.7310924530029297, + "num_tokens": 16276860.0, + "step": 7127, + "train/ce_loss": 7.22577397027635e-06 + }, + { + "epoch": 0.7046667984971327, + "step": 7127, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7046667984971327, + "step": 7127, + "train/total_loss": 0.0390632227063179 + }, + { + "entropy": 8.652997970581055, + "epoch": 0.7047656713466482, + "mean_token_accuracy": 0.7416201233863831, + "num_tokens": 16282076.0, + "step": 7128, + "train/ce_loss": 1.3002984523773193 + }, + { + "epoch": 0.7047656713466482, + "step": 7128, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.7047656713466482, + "step": 7128, + "train/total_loss": 0.2394048422574997 + }, + { + "entropy": 8.795637130737305, + "epoch": 0.7048645441961637, + "mean_token_accuracy": 0.7727839946746826, + "num_tokens": 16287340.0, + "step": 7129, + "train/ce_loss": 1.3004807233810425 + }, + { + "epoch": 0.7048645441961637, + "step": 7129, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7048645441961637, + "step": 7129, + "train/total_loss": 0.17301683127880096 + }, + { + "entropy": 9.394787788391113, + "epoch": 0.7049634170456792, + "mean_token_accuracy": 0.7645630836486816, + "num_tokens": 16292204.0, + "step": 7130, + "train/ce_loss": 1.435694694519043 + }, + { + "epoch": 0.7049634170456792, + "step": 7130, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.7049634170456792, + "step": 7130, + "train/total_loss": 0.2099757194519043 + }, + { + "entropy": 9.610006332397461, + "epoch": 0.7050622898951948, + "mean_token_accuracy": 0.7081544995307922, + "num_tokens": 16297116.0, + "step": 7131, + "train/ce_loss": 1.6964994529189426e-06 + }, + { + "epoch": 0.7050622898951948, + "step": 7131, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.7050622898951948, + "step": 7131, + "train/total_loss": 0.06640642136335373 + }, + { + "entropy": 8.901695251464844, + "epoch": 0.7051611627447103, + "mean_token_accuracy": 0.6830891966819763, + "num_tokens": 16302446.0, + "step": 7132, + "train/ce_loss": 4.321337655710522e-06 + }, + { + "epoch": 0.7051611627447103, + "step": 7132, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7051611627447103, + "step": 7132, + "train/total_loss": 0.05078168213367462 + }, + { + "entropy": 8.928903579711914, + "epoch": 0.7052600355942258, + "mean_token_accuracy": 0.7589158415794373, + "num_tokens": 16307597.0, + "step": 7133, + "train/ce_loss": 0.7011563777923584 + }, + { + "epoch": 0.7052600355942258, + "step": 7133, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7052600355942258, + "step": 7133, + "train/total_loss": 0.12089689075946808 + }, + { + "entropy": 8.908486366271973, + "epoch": 0.7053589084437414, + "mean_token_accuracy": 0.7802907824516296, + "num_tokens": 16312610.0, + "step": 7134, + "train/ce_loss": 1.5629417475793161e-06 + }, + { + "epoch": 0.7053589084437414, + "step": 7134, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7053589084437414, + "step": 7134, + "train/total_loss": 0.023437656462192535 + }, + { + "entropy": 8.977813720703125, + "epoch": 0.7054577812932569, + "mean_token_accuracy": 0.7375504970550537, + "num_tokens": 16317834.0, + "step": 7135, + "train/ce_loss": 1.288329005241394 + }, + { + "epoch": 0.7054577812932569, + "step": 7135, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7054577812932569, + "step": 7135, + "train/total_loss": 0.19133290648460388 + }, + { + "entropy": 8.279001235961914, + "epoch": 0.7055566541427724, + "mean_token_accuracy": 0.7865055203437805, + "num_tokens": 16323305.0, + "step": 7136, + "train/ce_loss": 0.8539165258407593 + }, + { + "epoch": 0.7055566541427724, + "step": 7136, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7055566541427724, + "step": 7136, + "train/total_loss": 0.11273540556430817 + }, + { + "entropy": 9.084806442260742, + "epoch": 0.705655526992288, + "mean_token_accuracy": 0.7059925198554993, + "num_tokens": 16328261.0, + "step": 7137, + "train/ce_loss": 1.0540443658828735 + }, + { + "epoch": 0.705655526992288, + "step": 7137, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.705655526992288, + "step": 7137, + "train/total_loss": 0.15227943658828735 + }, + { + "entropy": 8.779060363769531, + "epoch": 0.7057543998418034, + "mean_token_accuracy": 0.7353308200836182, + "num_tokens": 16333537.0, + "step": 7138, + "train/ce_loss": 0.8341813683509827 + }, + { + "epoch": 0.7057543998418034, + "step": 7138, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7057543998418034, + "step": 7138, + "train/total_loss": 0.1420118808746338 + }, + { + "entropy": 8.768844604492188, + "epoch": 0.7058532726913189, + "mean_token_accuracy": 0.7597402334213257, + "num_tokens": 16338772.0, + "step": 7139, + "train/ce_loss": 0.41492050886154175 + }, + { + "epoch": 0.7058532726913189, + "step": 7139, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7058532726913189, + "step": 7139, + "train/total_loss": 0.0961795523762703 + }, + { + "epoch": 0.7059521455408345, + "grad_norm": 0.652470052242279, + "learning_rate": 8.237402957029126e-06, + "loss": 0.133, + "step": 7140 + }, + { + "entropy": 8.763189315795898, + "epoch": 0.7059521455408345, + "mean_token_accuracy": 0.7318007946014404, + "num_tokens": 16344046.0, + "step": 7140, + "train/ce_loss": 1.0437805652618408 + }, + { + "epoch": 0.7059521455408345, + "step": 7140, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.7059521455408345, + "step": 7140, + "train/total_loss": 0.17859680950641632 + }, + { + "entropy": 8.776391983032227, + "epoch": 0.70605101839035, + "mean_token_accuracy": 0.746051013469696, + "num_tokens": 16349338.0, + "step": 7141, + "train/ce_loss": 0.7831575274467468 + }, + { + "epoch": 0.70605101839035, + "step": 7141, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.70605101839035, + "step": 7141, + "train/total_loss": 0.12909701466560364 + }, + { + "entropy": 8.896232604980469, + "epoch": 0.7061498912398655, + "mean_token_accuracy": 0.736775815486908, + "num_tokens": 16354619.0, + "step": 7142, + "train/ce_loss": 0.939310610294342 + }, + { + "epoch": 0.7061498912398655, + "step": 7142, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7061498912398655, + "step": 7142, + "train/total_loss": 0.15643106400966644 + }, + { + "entropy": 8.795890808105469, + "epoch": 0.7062487640893811, + "mean_token_accuracy": 0.7345678806304932, + "num_tokens": 16360063.0, + "step": 7143, + "train/ce_loss": 0.6977701783180237 + }, + { + "epoch": 0.7062487640893811, + "step": 7143, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7062487640893811, + "step": 7143, + "train/total_loss": 0.12055826932191849 + }, + { + "entropy": 8.57271957397461, + "epoch": 0.7063476369388966, + "mean_token_accuracy": 0.7506082653999329, + "num_tokens": 16365378.0, + "step": 7144, + "train/ce_loss": 0.5306150317192078 + }, + { + "epoch": 0.7063476369388966, + "step": 7144, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7063476369388966, + "step": 7144, + "train/total_loss": 0.08040525019168854 + }, + { + "entropy": 9.348947525024414, + "epoch": 0.7064465097884121, + "mean_token_accuracy": 0.7150635123252869, + "num_tokens": 16370323.0, + "step": 7145, + "train/ce_loss": 1.9032906293869019 + }, + { + "epoch": 0.7064465097884121, + "step": 7145, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7064465097884121, + "step": 7145, + "train/total_loss": 0.22157905995845795 + }, + { + "entropy": 8.477828025817871, + "epoch": 0.7065453826379277, + "mean_token_accuracy": 0.7294238805770874, + "num_tokens": 16375744.0, + "step": 7146, + "train/ce_loss": 0.8453947901725769 + }, + { + "epoch": 0.7065453826379277, + "step": 7146, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7065453826379277, + "step": 7146, + "train/total_loss": 0.1548519730567932 + }, + { + "entropy": 8.79666805267334, + "epoch": 0.7066442554874431, + "mean_token_accuracy": 0.7016128897666931, + "num_tokens": 16381072.0, + "step": 7147, + "train/ce_loss": 1.7105617189372424e-06 + }, + { + "epoch": 0.7066442554874431, + "step": 7147, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7066442554874431, + "step": 7147, + "train/total_loss": 0.02343767136335373 + }, + { + "entropy": 8.350767135620117, + "epoch": 0.7067431283369586, + "mean_token_accuracy": 0.7127450704574585, + "num_tokens": 16386528.0, + "step": 7148, + "train/ce_loss": 1.3043437004089355 + }, + { + "epoch": 0.7067431283369586, + "step": 7148, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7067431283369586, + "step": 7148, + "train/total_loss": 0.18512187898159027 + }, + { + "entropy": 8.232901573181152, + "epoch": 0.7068420011864742, + "mean_token_accuracy": 0.7346723079681396, + "num_tokens": 16391963.0, + "step": 7149, + "train/ce_loss": 1.0024423599243164 + }, + { + "epoch": 0.7068420011864742, + "step": 7149, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7068420011864742, + "step": 7149, + "train/total_loss": 0.17055673897266388 + }, + { + "entropy": 8.889684677124023, + "epoch": 0.7069408740359897, + "mean_token_accuracy": 0.7333333492279053, + "num_tokens": 16397043.0, + "step": 7150, + "train/ce_loss": 0.8943809866905212 + }, + { + "epoch": 0.7069408740359897, + "step": 7150, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7069408740359897, + "step": 7150, + "train/total_loss": 0.12068810313940048 + }, + { + "entropy": 8.396048545837402, + "epoch": 0.7070397468855052, + "mean_token_accuracy": 0.8077325224876404, + "num_tokens": 16402546.0, + "step": 7151, + "train/ce_loss": 0.43090003728866577 + }, + { + "epoch": 0.7070397468855052, + "step": 7151, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7070397468855052, + "step": 7151, + "train/total_loss": 0.06652750074863434 + }, + { + "entropy": 8.451786994934082, + "epoch": 0.7071386197350208, + "mean_token_accuracy": 0.7062146663665771, + "num_tokens": 16407882.0, + "step": 7152, + "train/ce_loss": 0.7465851306915283 + }, + { + "epoch": 0.7071386197350208, + "step": 7152, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7071386197350208, + "step": 7152, + "train/total_loss": 0.12543976306915283 + }, + { + "entropy": 8.876398086547852, + "epoch": 0.7072374925845363, + "mean_token_accuracy": 0.7353433966636658, + "num_tokens": 16412986.0, + "step": 7153, + "train/ce_loss": 0.7852999567985535 + }, + { + "epoch": 0.7072374925845363, + "step": 7153, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7072374925845363, + "step": 7153, + "train/total_loss": 0.11759249866008759 + }, + { + "entropy": 8.655569076538086, + "epoch": 0.7073363654340518, + "mean_token_accuracy": 0.7562130093574524, + "num_tokens": 16418289.0, + "step": 7154, + "train/ce_loss": 0.7815631031990051 + }, + { + "epoch": 0.7073363654340518, + "step": 7154, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.7073363654340518, + "step": 7154, + "train/total_loss": 0.15628132224082947 + }, + { + "entropy": 8.82831859588623, + "epoch": 0.7074352382835674, + "mean_token_accuracy": 0.7213930487632751, + "num_tokens": 16423545.0, + "step": 7155, + "train/ce_loss": 0.7690247297286987 + }, + { + "epoch": 0.7074352382835674, + "step": 7155, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7074352382835674, + "step": 7155, + "train/total_loss": 0.12768372893333435 + }, + { + "entropy": 8.179976463317871, + "epoch": 0.7075341111330828, + "mean_token_accuracy": 0.6920821070671082, + "num_tokens": 16429042.0, + "step": 7156, + "train/ce_loss": 0.8388369679450989 + }, + { + "epoch": 0.7075341111330828, + "step": 7156, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.7075341111330828, + "step": 7156, + "train/total_loss": 0.16200870275497437 + }, + { + "entropy": 9.279035568237305, + "epoch": 0.7076329839825983, + "mean_token_accuracy": 0.6789883375167847, + "num_tokens": 16434141.0, + "step": 7157, + "train/ce_loss": 1.4697717428207397 + }, + { + "epoch": 0.7076329839825983, + "step": 7157, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7076329839825983, + "step": 7157, + "train/total_loss": 0.21728967130184174 + }, + { + "entropy": 8.525737762451172, + "epoch": 0.7077318568321139, + "mean_token_accuracy": 0.720652163028717, + "num_tokens": 16439476.0, + "step": 7158, + "train/ce_loss": 0.44447314739227295 + }, + { + "epoch": 0.7077318568321139, + "step": 7158, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7077318568321139, + "step": 7158, + "train/total_loss": 0.08350981771945953 + }, + { + "entropy": 8.48501968383789, + "epoch": 0.7078307296816294, + "mean_token_accuracy": 0.7268195152282715, + "num_tokens": 16444987.0, + "step": 7159, + "train/ce_loss": 1.2266879081726074 + }, + { + "epoch": 0.7078307296816294, + "step": 7159, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.7078307296816294, + "step": 7159, + "train/total_loss": 0.2007938027381897 + }, + { + "epoch": 0.7079296025311449, + "grad_norm": 0.7253368496894836, + "learning_rate": 8.232458092271178e-06, + "loss": 0.1443, + "step": 7160 + }, + { + "entropy": 8.727168083190918, + "epoch": 0.7079296025311449, + "mean_token_accuracy": 0.7009102702140808, + "num_tokens": 16450221.0, + "step": 7160, + "train/ce_loss": 0.8683127164840698 + }, + { + "epoch": 0.7079296025311449, + "step": 7160, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.7079296025311449, + "step": 7160, + "train/total_loss": 0.18448752164840698 + }, + { + "entropy": 8.723089218139648, + "epoch": 0.7080284753806605, + "mean_token_accuracy": 0.7420118451118469, + "num_tokens": 16455509.0, + "step": 7161, + "train/ce_loss": 0.4825913906097412 + }, + { + "epoch": 0.7080284753806605, + "step": 7161, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7080284753806605, + "step": 7161, + "train/total_loss": 0.09904038906097412 + }, + { + "entropy": 8.638715744018555, + "epoch": 0.708127348230176, + "mean_token_accuracy": 0.7156177163124084, + "num_tokens": 16460895.0, + "step": 7162, + "train/ce_loss": 0.9495337605476379 + }, + { + "epoch": 0.708127348230176, + "step": 7162, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.708127348230176, + "step": 7162, + "train/total_loss": 0.17698463797569275 + }, + { + "entropy": 8.273295402526855, + "epoch": 0.7082262210796915, + "mean_token_accuracy": 0.7514318227767944, + "num_tokens": 16466245.0, + "step": 7163, + "train/ce_loss": 1.0944613218307495 + }, + { + "epoch": 0.7082262210796915, + "step": 7163, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.7082262210796915, + "step": 7163, + "train/total_loss": 0.17585238814353943 + }, + { + "entropy": 8.572425842285156, + "epoch": 0.7083250939292071, + "mean_token_accuracy": 0.7860310673713684, + "num_tokens": 16471615.0, + "step": 7164, + "train/ce_loss": 0.6559258699417114 + }, + { + "epoch": 0.7083250939292071, + "step": 7164, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.7083250939292071, + "step": 7164, + "train/total_loss": 0.07731133699417114 + }, + { + "entropy": 8.556367874145508, + "epoch": 0.7084239667787225, + "mean_token_accuracy": 0.7476732134819031, + "num_tokens": 16477086.0, + "step": 7165, + "train/ce_loss": 0.7664522528648376 + }, + { + "epoch": 0.7084239667787225, + "step": 7165, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7084239667787225, + "step": 7165, + "train/total_loss": 0.13914522528648376 + }, + { + "entropy": 8.366483688354492, + "epoch": 0.708522839628238, + "mean_token_accuracy": 0.7300000190734863, + "num_tokens": 16482415.0, + "step": 7166, + "train/ce_loss": 1.4523415565490723 + }, + { + "epoch": 0.708522839628238, + "step": 7166, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.708522839628238, + "step": 7166, + "train/total_loss": 0.180390402674675 + }, + { + "entropy": 8.758155822753906, + "epoch": 0.7086217124777536, + "mean_token_accuracy": 0.7078651785850525, + "num_tokens": 16487429.0, + "step": 7167, + "train/ce_loss": 1.5139796733856201 + }, + { + "epoch": 0.7086217124777536, + "step": 7167, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.7086217124777536, + "step": 7167, + "train/total_loss": 0.2373354732990265 + }, + { + "entropy": 8.730156898498535, + "epoch": 0.7087205853272691, + "mean_token_accuracy": 0.7630619406700134, + "num_tokens": 16492721.0, + "step": 7168, + "train/ce_loss": 0.6302322745323181 + }, + { + "epoch": 0.7087205853272691, + "step": 7168, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.7087205853272691, + "step": 7168, + "train/total_loss": 0.16458573937416077 + }, + { + "entropy": 8.71129322052002, + "epoch": 0.7088194581767846, + "mean_token_accuracy": 0.6772428750991821, + "num_tokens": 16498044.0, + "step": 7169, + "train/ce_loss": 1.1904356479644775 + }, + { + "epoch": 0.7088194581767846, + "step": 7169, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7088194581767846, + "step": 7169, + "train/total_loss": 0.16591855883598328 + }, + { + "entropy": 8.920693397521973, + "epoch": 0.7089183310263002, + "mean_token_accuracy": 0.7358229756355286, + "num_tokens": 16503273.0, + "step": 7170, + "train/ce_loss": 1.2857366800308228 + }, + { + "epoch": 0.7089183310263002, + "step": 7170, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.7089183310263002, + "step": 7170, + "train/total_loss": 0.21841742098331451 + }, + { + "entropy": 9.01522445678711, + "epoch": 0.7090172038758157, + "mean_token_accuracy": 0.7606111764907837, + "num_tokens": 16508300.0, + "step": 7171, + "train/ce_loss": 0.4783221483230591 + }, + { + "epoch": 0.7090172038758157, + "step": 7171, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.7090172038758157, + "step": 7171, + "train/total_loss": 0.12205097079277039 + }, + { + "entropy": 8.480701446533203, + "epoch": 0.7091160767253313, + "mean_token_accuracy": 0.7057633996009827, + "num_tokens": 16513825.0, + "step": 7172, + "train/ce_loss": 0.7974771857261658 + }, + { + "epoch": 0.7091160767253313, + "step": 7172, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7091160767253313, + "step": 7172, + "train/total_loss": 0.11099772155284882 + }, + { + "entropy": 8.477746963500977, + "epoch": 0.7092149495748468, + "mean_token_accuracy": 0.7269545197486877, + "num_tokens": 16519141.0, + "step": 7173, + "train/ce_loss": 0.4708613455295563 + }, + { + "epoch": 0.7092149495748468, + "step": 7173, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7092149495748468, + "step": 7173, + "train/total_loss": 0.07442988455295563 + }, + { + "entropy": 8.391982078552246, + "epoch": 0.7093138224243623, + "mean_token_accuracy": 0.6977459192276001, + "num_tokens": 16524613.0, + "step": 7174, + "train/ce_loss": 1.1097391843795776 + }, + { + "epoch": 0.7093138224243623, + "step": 7174, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7093138224243623, + "step": 7174, + "train/total_loss": 0.16175517439842224 + }, + { + "entropy": 9.193445205688477, + "epoch": 0.7094126952738778, + "mean_token_accuracy": 0.7680412530899048, + "num_tokens": 16529469.0, + "step": 7175, + "train/ce_loss": 1.7883644104003906 + }, + { + "epoch": 0.7094126952738778, + "step": 7175, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7094126952738778, + "step": 7175, + "train/total_loss": 0.22961769998073578 + }, + { + "entropy": 9.193803787231445, + "epoch": 0.7095115681233933, + "mean_token_accuracy": 0.8093883395195007, + "num_tokens": 16534618.0, + "step": 7176, + "train/ce_loss": 0.8265652656555176 + }, + { + "epoch": 0.7095115681233933, + "step": 7176, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.7095115681233933, + "step": 7176, + "train/total_loss": 0.14906278252601624 + }, + { + "entropy": 8.67904281616211, + "epoch": 0.7096104409729088, + "mean_token_accuracy": 0.73209547996521, + "num_tokens": 16539769.0, + "step": 7177, + "train/ce_loss": 0.670390784740448 + }, + { + "epoch": 0.7096104409729088, + "step": 7177, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7096104409729088, + "step": 7177, + "train/total_loss": 0.12953907251358032 + }, + { + "entropy": 9.012173652648926, + "epoch": 0.7097093138224244, + "mean_token_accuracy": 0.7704447507858276, + "num_tokens": 16544894.0, + "step": 7178, + "train/ce_loss": 0.8676934838294983 + }, + { + "epoch": 0.7097093138224244, + "step": 7178, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.7097093138224244, + "step": 7178, + "train/total_loss": 0.17270684242248535 + }, + { + "entropy": 8.632959365844727, + "epoch": 0.7098081866719399, + "mean_token_accuracy": 0.7363238334655762, + "num_tokens": 16550413.0, + "step": 7179, + "train/ce_loss": 1.1974496841430664 + }, + { + "epoch": 0.7098081866719399, + "step": 7179, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.7098081866719399, + "step": 7179, + "train/total_loss": 0.22130747139453888 + }, + { + "epoch": 0.7099070595214554, + "grad_norm": 0.6061992645263672, + "learning_rate": 8.227513227513229e-06, + "loss": 0.1407, + "step": 7180 + }, + { + "entropy": 8.753702163696289, + "epoch": 0.7099070595214554, + "mean_token_accuracy": 0.7413366436958313, + "num_tokens": 16555710.0, + "step": 7180, + "train/ce_loss": 1.0147998332977295 + }, + { + "epoch": 0.7099070595214554, + "step": 7180, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.7099070595214554, + "step": 7180, + "train/total_loss": 0.20694872736930847 + }, + { + "entropy": 9.063572883605957, + "epoch": 0.710005932370971, + "mean_token_accuracy": 0.7465437650680542, + "num_tokens": 16560745.0, + "step": 7181, + "train/ce_loss": 0.8497380018234253 + }, + { + "epoch": 0.710005932370971, + "step": 7181, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.710005932370971, + "step": 7181, + "train/total_loss": 0.18653631210327148 + }, + { + "entropy": 9.492816925048828, + "epoch": 0.7101048052204865, + "mean_token_accuracy": 0.6989796161651611, + "num_tokens": 16565562.0, + "step": 7182, + "train/ce_loss": 1.6922687292099 + }, + { + "epoch": 0.7101048052204865, + "step": 7182, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7101048052204865, + "step": 7182, + "train/total_loss": 0.19266436994075775 + }, + { + "entropy": 8.737098693847656, + "epoch": 0.710203678070002, + "mean_token_accuracy": 0.7596795558929443, + "num_tokens": 16570949.0, + "step": 7183, + "train/ce_loss": 0.9820594191551208 + }, + { + "epoch": 0.710203678070002, + "step": 7183, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.710203678070002, + "step": 7183, + "train/total_loss": 0.20758095383644104 + }, + { + "entropy": 9.075728416442871, + "epoch": 0.7103025509195176, + "mean_token_accuracy": 0.7591836452484131, + "num_tokens": 16575852.0, + "step": 7184, + "train/ce_loss": 3.5117921015626052e-06 + }, + { + "epoch": 0.7103025509195176, + "step": 7184, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7103025509195176, + "step": 7184, + "train/total_loss": 0.042969100177288055 + }, + { + "entropy": 8.564920425415039, + "epoch": 0.710401423769033, + "mean_token_accuracy": 0.7653301954269409, + "num_tokens": 16581185.0, + "step": 7185, + "train/ce_loss": 0.5567665696144104 + }, + { + "epoch": 0.710401423769033, + "step": 7185, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.710401423769033, + "step": 7185, + "train/total_loss": 0.0947391539812088 + }, + { + "entropy": 8.391427993774414, + "epoch": 0.7105002966185485, + "mean_token_accuracy": 0.7432170510292053, + "num_tokens": 16586717.0, + "step": 7186, + "train/ce_loss": 0.5869799852371216 + }, + { + "epoch": 0.7105002966185485, + "step": 7186, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.7105002966185485, + "step": 7186, + "train/total_loss": 0.14072924852371216 + }, + { + "entropy": 8.852867126464844, + "epoch": 0.7105991694680641, + "mean_token_accuracy": 0.739570140838623, + "num_tokens": 16591969.0, + "step": 7187, + "train/ce_loss": 1.23922598361969 + }, + { + "epoch": 0.7105991694680641, + "step": 7187, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.7105991694680641, + "step": 7187, + "train/total_loss": 0.20204760134220123 + }, + { + "entropy": 9.074911117553711, + "epoch": 0.7106980423175796, + "mean_token_accuracy": 0.6957831382751465, + "num_tokens": 16597053.0, + "step": 7188, + "train/ce_loss": 1.0194846391677856 + }, + { + "epoch": 0.7106980423175796, + "step": 7188, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7106980423175796, + "step": 7188, + "train/total_loss": 0.12929221987724304 + }, + { + "entropy": 9.232538223266602, + "epoch": 0.7107969151670951, + "mean_token_accuracy": 0.761904776096344, + "num_tokens": 16601888.0, + "step": 7189, + "train/ce_loss": 2.124577522277832 + }, + { + "epoch": 0.7107969151670951, + "step": 7189, + "train/sim_loss": 0.15625 + }, + { + "epoch": 0.7107969151670951, + "step": 7189, + "train/total_loss": 0.3687077760696411 + }, + { + "entropy": 8.746088981628418, + "epoch": 0.7108957880166107, + "mean_token_accuracy": 0.6945031881332397, + "num_tokens": 16607285.0, + "step": 7190, + "train/ce_loss": 1.0544683933258057 + }, + { + "epoch": 0.7108957880166107, + "step": 7190, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7108957880166107, + "step": 7190, + "train/total_loss": 0.13669684529304504 + }, + { + "entropy": 9.125240325927734, + "epoch": 0.7109946608661262, + "mean_token_accuracy": 0.8063872456550598, + "num_tokens": 16612238.0, + "step": 7191, + "train/ce_loss": 2.3707571017439477e-06 + }, + { + "epoch": 0.7109946608661262, + "step": 7191, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.7109946608661262, + "step": 7191, + "train/total_loss": 0.0781252384185791 + }, + { + "entropy": 8.543449401855469, + "epoch": 0.7110935337156417, + "mean_token_accuracy": 0.7433832287788391, + "num_tokens": 16617597.0, + "step": 7192, + "train/ce_loss": 1.0866926908493042 + }, + { + "epoch": 0.7110935337156417, + "step": 7192, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7110935337156417, + "step": 7192, + "train/total_loss": 0.15945053100585938 + }, + { + "entropy": 9.435101509094238, + "epoch": 0.7111924065651573, + "mean_token_accuracy": 0.6426734924316406, + "num_tokens": 16622413.0, + "step": 7193, + "train/ce_loss": 8.009789780771825e-06 + }, + { + "epoch": 0.7111924065651573, + "step": 7193, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7111924065651573, + "step": 7193, + "train/total_loss": 0.03906330093741417 + }, + { + "entropy": 8.519792556762695, + "epoch": 0.7112912794146727, + "mean_token_accuracy": 0.7894088625907898, + "num_tokens": 16627750.0, + "step": 7194, + "train/ce_loss": 0.3221474289894104 + }, + { + "epoch": 0.7112912794146727, + "step": 7194, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7112912794146727, + "step": 7194, + "train/total_loss": 0.04783974215388298 + }, + { + "entropy": 8.549459457397461, + "epoch": 0.7113901522641882, + "mean_token_accuracy": 0.6720741391181946, + "num_tokens": 16633098.0, + "step": 7195, + "train/ce_loss": 1.1505359411239624 + }, + { + "epoch": 0.7113901522641882, + "step": 7195, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7113901522641882, + "step": 7195, + "train/total_loss": 0.16583484411239624 + }, + { + "entropy": 8.939401626586914, + "epoch": 0.7114890251137038, + "mean_token_accuracy": 0.7618343234062195, + "num_tokens": 16638198.0, + "step": 7196, + "train/ce_loss": 1.1567680835723877 + }, + { + "epoch": 0.7114890251137038, + "step": 7196, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.7114890251137038, + "step": 7196, + "train/total_loss": 0.18989557027816772 + }, + { + "entropy": 8.785555839538574, + "epoch": 0.7115878979632193, + "mean_token_accuracy": 0.807212233543396, + "num_tokens": 16643387.0, + "step": 7197, + "train/ce_loss": 1.0497312545776367 + }, + { + "epoch": 0.7115878979632193, + "step": 7197, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7115878979632193, + "step": 7197, + "train/total_loss": 0.17528563737869263 + }, + { + "entropy": 8.629180908203125, + "epoch": 0.7116867708127348, + "mean_token_accuracy": 0.7587769031524658, + "num_tokens": 16648693.0, + "step": 7198, + "train/ce_loss": 0.8789370656013489 + }, + { + "epoch": 0.7116867708127348, + "step": 7198, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7116867708127348, + "step": 7198, + "train/total_loss": 0.15820620954036713 + }, + { + "entropy": 9.374748229980469, + "epoch": 0.7117856436622504, + "mean_token_accuracy": 0.7395833134651184, + "num_tokens": 16653588.0, + "step": 7199, + "train/ce_loss": 3.264047563789063e-06 + }, + { + "epoch": 0.7117856436622504, + "step": 7199, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7117856436622504, + "step": 7199, + "train/total_loss": 0.046875327825546265 + }, + { + "epoch": 0.7118845165117659, + "grad_norm": 0.7894151210784912, + "learning_rate": 8.222568362755279e-06, + "loss": 0.1379, + "step": 7200 + }, + { + "entropy": 9.173515319824219, + "epoch": 0.7118845165117659, + "mean_token_accuracy": 0.75, + "num_tokens": 16658565.0, + "step": 7200, + "train/ce_loss": 3.8327752918121405e-06 + }, + { + "epoch": 0.7118845165117659, + "step": 7200, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7118845165117659, + "step": 7200, + "train/total_loss": 0.03906288370490074 + }, + { + "entropy": 8.6094970703125, + "epoch": 0.7119833893612814, + "mean_token_accuracy": 0.6755725145339966, + "num_tokens": 16663803.0, + "step": 7201, + "train/ce_loss": 3.767704765778035e-05 + }, + { + "epoch": 0.7119833893612814, + "step": 7201, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7119833893612814, + "step": 7201, + "train/total_loss": 0.031253766268491745 + }, + { + "entropy": 8.67281436920166, + "epoch": 0.712082262210797, + "mean_token_accuracy": 0.7536889910697937, + "num_tokens": 16669137.0, + "step": 7202, + "train/ce_loss": 0.8251853585243225 + }, + { + "epoch": 0.712082262210797, + "step": 7202, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.712082262210797, + "step": 7202, + "train/total_loss": 0.1918935477733612 + }, + { + "entropy": 8.776318550109863, + "epoch": 0.7121811350603124, + "mean_token_accuracy": 0.759856641292572, + "num_tokens": 16674566.0, + "step": 7203, + "train/ce_loss": 0.6415863037109375 + }, + { + "epoch": 0.7121811350603124, + "step": 7203, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.7121811350603124, + "step": 7203, + "train/total_loss": 0.161814883351326 + }, + { + "entropy": 8.146249771118164, + "epoch": 0.7122800079098279, + "mean_token_accuracy": 0.7675840854644775, + "num_tokens": 16680003.0, + "step": 7204, + "train/ce_loss": 0.5550179481506348 + }, + { + "epoch": 0.7122800079098279, + "step": 7204, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7122800079098279, + "step": 7204, + "train/total_loss": 0.125814288854599 + }, + { + "entropy": 8.744900703430176, + "epoch": 0.7123788807593435, + "mean_token_accuracy": 0.727173924446106, + "num_tokens": 16685414.0, + "step": 7205, + "train/ce_loss": 0.7248499393463135 + }, + { + "epoch": 0.7123788807593435, + "step": 7205, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7123788807593435, + "step": 7205, + "train/total_loss": 0.11545374244451523 + }, + { + "entropy": 8.859378814697266, + "epoch": 0.712477753608859, + "mean_token_accuracy": 0.75, + "num_tokens": 16690588.0, + "step": 7206, + "train/ce_loss": 0.5503898859024048 + }, + { + "epoch": 0.712477753608859, + "step": 7206, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.712477753608859, + "step": 7206, + "train/total_loss": 0.07847648859024048 + }, + { + "entropy": 9.277009963989258, + "epoch": 0.7125766264583745, + "mean_token_accuracy": 0.7433264851570129, + "num_tokens": 16695516.0, + "step": 7207, + "train/ce_loss": 1.363459825515747 + }, + { + "epoch": 0.7125766264583745, + "step": 7207, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7125766264583745, + "step": 7207, + "train/total_loss": 0.1988459825515747 + }, + { + "entropy": 8.88760757446289, + "epoch": 0.7126754993078901, + "mean_token_accuracy": 0.7215026021003723, + "num_tokens": 16700755.0, + "step": 7208, + "train/ce_loss": 0.5441802144050598 + }, + { + "epoch": 0.7126754993078901, + "step": 7208, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7126754993078901, + "step": 7208, + "train/total_loss": 0.08566802740097046 + }, + { + "entropy": 8.83308219909668, + "epoch": 0.7127743721574056, + "mean_token_accuracy": 0.751329779624939, + "num_tokens": 16705945.0, + "step": 7209, + "train/ce_loss": 0.4552195072174072 + }, + { + "epoch": 0.7127743721574056, + "step": 7209, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7127743721574056, + "step": 7209, + "train/total_loss": 0.10411570221185684 + }, + { + "entropy": 8.83424186706543, + "epoch": 0.7128732450069211, + "mean_token_accuracy": 0.8017789125442505, + "num_tokens": 16711160.0, + "step": 7210, + "train/ce_loss": 0.930652916431427 + }, + { + "epoch": 0.7128732450069211, + "step": 7210, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7128732450069211, + "step": 7210, + "train/total_loss": 0.1243152916431427 + }, + { + "entropy": 8.777759552001953, + "epoch": 0.7129721178564367, + "mean_token_accuracy": 0.7257204055786133, + "num_tokens": 16716523.0, + "step": 7211, + "train/ce_loss": 0.6090152859687805 + }, + { + "epoch": 0.7129721178564367, + "step": 7211, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7129721178564367, + "step": 7211, + "train/total_loss": 0.08433903008699417 + }, + { + "entropy": 8.86879825592041, + "epoch": 0.7130709907059521, + "mean_token_accuracy": 0.7698630094528198, + "num_tokens": 16721891.0, + "step": 7212, + "train/ce_loss": 0.30563464760780334 + }, + { + "epoch": 0.7130709907059521, + "step": 7212, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7130709907059521, + "step": 7212, + "train/total_loss": 0.050094716250896454 + }, + { + "entropy": 9.277276992797852, + "epoch": 0.7131698635554676, + "mean_token_accuracy": 0.7074379920959473, + "num_tokens": 16727058.0, + "step": 7213, + "train/ce_loss": 1.1419485807418823 + }, + { + "epoch": 0.7131698635554676, + "step": 7213, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.7131698635554676, + "step": 7213, + "train/total_loss": 0.1962261199951172 + }, + { + "entropy": 8.50288200378418, + "epoch": 0.7132687364049832, + "mean_token_accuracy": 0.7432712316513062, + "num_tokens": 16732460.0, + "step": 7214, + "train/ce_loss": 0.844182550907135 + }, + { + "epoch": 0.7132687364049832, + "step": 7214, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7132687364049832, + "step": 7214, + "train/total_loss": 0.15473076701164246 + }, + { + "entropy": 8.25242805480957, + "epoch": 0.7133676092544987, + "mean_token_accuracy": 0.8155699968338013, + "num_tokens": 16738011.0, + "step": 7215, + "train/ce_loss": 0.46100690960884094 + }, + { + "epoch": 0.7133676092544987, + "step": 7215, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.7133676092544987, + "step": 7215, + "train/total_loss": 0.057819440960884094 + }, + { + "entropy": 8.90095329284668, + "epoch": 0.7134664821040142, + "mean_token_accuracy": 0.7601199150085449, + "num_tokens": 16743169.0, + "step": 7216, + "train/ce_loss": 1.0977258682250977 + }, + { + "epoch": 0.7134664821040142, + "step": 7216, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7134664821040142, + "step": 7216, + "train/total_loss": 0.14102259278297424 + }, + { + "entropy": 8.77444076538086, + "epoch": 0.7135653549535298, + "mean_token_accuracy": 0.6976456046104431, + "num_tokens": 16748427.0, + "step": 7217, + "train/ce_loss": 1.2158559560775757 + }, + { + "epoch": 0.7135653549535298, + "step": 7217, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.7135653549535298, + "step": 7217, + "train/total_loss": 0.20752310752868652 + }, + { + "entropy": 8.828907012939453, + "epoch": 0.7136642278030453, + "mean_token_accuracy": 0.7189265489578247, + "num_tokens": 16753620.0, + "step": 7218, + "train/ce_loss": 1.1605870723724365 + }, + { + "epoch": 0.7136642278030453, + "step": 7218, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7136642278030453, + "step": 7218, + "train/total_loss": 0.17855870723724365 + }, + { + "entropy": 9.13592529296875, + "epoch": 0.7137631006525608, + "mean_token_accuracy": 0.7054794430732727, + "num_tokens": 16758634.0, + "step": 7219, + "train/ce_loss": 1.2255994081497192 + }, + { + "epoch": 0.7137631006525608, + "step": 7219, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.7137631006525608, + "step": 7219, + "train/total_loss": 0.20068493485450745 + }, + { + "epoch": 0.7138619735020764, + "grad_norm": 0.8070234656333923, + "learning_rate": 8.21762349799733e-06, + "loss": 0.1408, + "step": 7220 + }, + { + "entropy": 8.727354049682617, + "epoch": 0.7138619735020764, + "mean_token_accuracy": 0.6777523159980774, + "num_tokens": 16763997.0, + "step": 7220, + "train/ce_loss": 0.7659044861793518 + }, + { + "epoch": 0.7138619735020764, + "step": 7220, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7138619735020764, + "step": 7220, + "train/total_loss": 0.12346544861793518 + }, + { + "entropy": 8.890021324157715, + "epoch": 0.7139608463515918, + "mean_token_accuracy": 0.767912745475769, + "num_tokens": 16769055.0, + "step": 7221, + "train/ce_loss": 0.47009187936782837 + }, + { + "epoch": 0.7139608463515918, + "step": 7221, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7139608463515918, + "step": 7221, + "train/total_loss": 0.0860716849565506 + }, + { + "entropy": 8.97850513458252, + "epoch": 0.7140597192011073, + "mean_token_accuracy": 0.6974110007286072, + "num_tokens": 16774132.0, + "step": 7222, + "train/ce_loss": 1.646481905481778e-06 + }, + { + "epoch": 0.7140597192011073, + "step": 7222, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.7140597192011073, + "step": 7222, + "train/total_loss": 0.07421891391277313 + }, + { + "entropy": 8.553703308105469, + "epoch": 0.7141585920506229, + "mean_token_accuracy": 0.714970052242279, + "num_tokens": 16779472.0, + "step": 7223, + "train/ce_loss": 0.48592522740364075 + }, + { + "epoch": 0.7141585920506229, + "step": 7223, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7141585920506229, + "step": 7223, + "train/total_loss": 0.07984252274036407 + }, + { + "entropy": 9.40013313293457, + "epoch": 0.7142574649001384, + "mean_token_accuracy": 0.8220140337944031, + "num_tokens": 16784296.0, + "step": 7224, + "train/ce_loss": 1.2307887077331543 + }, + { + "epoch": 0.7142574649001384, + "step": 7224, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7142574649001384, + "step": 7224, + "train/total_loss": 0.14651638269424438 + }, + { + "entropy": 9.15133285522461, + "epoch": 0.7143563377496539, + "mean_token_accuracy": 0.7839506268501282, + "num_tokens": 16789368.0, + "step": 7225, + "train/ce_loss": 0.8047212958335876 + }, + { + "epoch": 0.7143563377496539, + "step": 7225, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7143563377496539, + "step": 7225, + "train/total_loss": 0.10390963405370712 + }, + { + "entropy": 8.94349193572998, + "epoch": 0.7144552105991695, + "mean_token_accuracy": 0.7742424011230469, + "num_tokens": 16794518.0, + "step": 7226, + "train/ce_loss": 1.8928044482890982e-06 + }, + { + "epoch": 0.7144552105991695, + "step": 7226, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7144552105991695, + "step": 7226, + "train/total_loss": 0.05859393998980522 + }, + { + "entropy": 9.260454177856445, + "epoch": 0.714554083448685, + "mean_token_accuracy": 0.7296360731124878, + "num_tokens": 16799497.0, + "step": 7227, + "train/ce_loss": 2.46664899350435e-06 + }, + { + "epoch": 0.714554083448685, + "step": 7227, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.714554083448685, + "step": 7227, + "train/total_loss": 0.0195314958691597 + }, + { + "entropy": 8.678625106811523, + "epoch": 0.7146529562982005, + "mean_token_accuracy": 0.7012820243835449, + "num_tokens": 16804724.0, + "step": 7228, + "train/ce_loss": 1.9900438785552979 + }, + { + "epoch": 0.7146529562982005, + "step": 7228, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.7146529562982005, + "step": 7228, + "train/total_loss": 0.2771294116973877 + }, + { + "entropy": 8.781715393066406, + "epoch": 0.7147518291477161, + "mean_token_accuracy": 0.748251736164093, + "num_tokens": 16809933.0, + "step": 7229, + "train/ce_loss": 1.2562072277069092 + }, + { + "epoch": 0.7147518291477161, + "step": 7229, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7147518291477161, + "step": 7229, + "train/total_loss": 0.18812072277069092 + }, + { + "entropy": 8.724804878234863, + "epoch": 0.7148507019972316, + "mean_token_accuracy": 0.7318652868270874, + "num_tokens": 16815153.0, + "step": 7230, + "train/ce_loss": 0.6815114617347717 + }, + { + "epoch": 0.7148507019972316, + "step": 7230, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7148507019972316, + "step": 7230, + "train/total_loss": 0.11893239617347717 + }, + { + "entropy": 8.732213973999023, + "epoch": 0.714949574846747, + "mean_token_accuracy": 0.6647264361381531, + "num_tokens": 16820635.0, + "step": 7231, + "train/ce_loss": 1.2498207092285156 + }, + { + "epoch": 0.714949574846747, + "step": 7231, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.714949574846747, + "step": 7231, + "train/total_loss": 0.1640445739030838 + }, + { + "entropy": 8.821627616882324, + "epoch": 0.7150484476962626, + "mean_token_accuracy": 0.6881851553916931, + "num_tokens": 16825913.0, + "step": 7232, + "train/ce_loss": 0.9049714803695679 + }, + { + "epoch": 0.7150484476962626, + "step": 7232, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7150484476962626, + "step": 7232, + "train/total_loss": 0.15299715101718903 + }, + { + "entropy": 9.065165519714355, + "epoch": 0.7151473205457781, + "mean_token_accuracy": 0.7344992160797119, + "num_tokens": 16831015.0, + "step": 7233, + "train/ce_loss": 0.6679177284240723 + }, + { + "epoch": 0.7151473205457781, + "step": 7233, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7151473205457781, + "step": 7233, + "train/total_loss": 0.11366677284240723 + }, + { + "entropy": 9.262226104736328, + "epoch": 0.7152461933952936, + "mean_token_accuracy": 0.7128027677536011, + "num_tokens": 16836047.0, + "step": 7234, + "train/ce_loss": 1.1703163385391235 + }, + { + "epoch": 0.7152461933952936, + "step": 7234, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.7152461933952936, + "step": 7234, + "train/total_loss": 0.19906288385391235 + }, + { + "entropy": 8.538326263427734, + "epoch": 0.7153450662448092, + "mean_token_accuracy": 0.6952841877937317, + "num_tokens": 16841310.0, + "step": 7235, + "train/ce_loss": 0.514238178730011 + }, + { + "epoch": 0.7153450662448092, + "step": 7235, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7153450662448092, + "step": 7235, + "train/total_loss": 0.0982988178730011 + }, + { + "entropy": 8.30468463897705, + "epoch": 0.7154439390943247, + "mean_token_accuracy": 0.7168743014335632, + "num_tokens": 16846712.0, + "step": 7236, + "train/ce_loss": 1.3380621671676636 + }, + { + "epoch": 0.7154439390943247, + "step": 7236, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7154439390943247, + "step": 7236, + "train/total_loss": 0.19239996373653412 + }, + { + "entropy": 8.470191955566406, + "epoch": 0.7155428119438402, + "mean_token_accuracy": 0.7670772671699524, + "num_tokens": 16852113.0, + "step": 7237, + "train/ce_loss": 0.958454430103302 + }, + { + "epoch": 0.7155428119438402, + "step": 7237, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.7155428119438402, + "step": 7237, + "train/total_loss": 0.18568919599056244 + }, + { + "entropy": 9.10818099975586, + "epoch": 0.7156416847933558, + "mean_token_accuracy": 0.7554585337638855, + "num_tokens": 16857213.0, + "step": 7238, + "train/ce_loss": 0.8512943983078003 + }, + { + "epoch": 0.7156416847933558, + "step": 7238, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7156416847933558, + "step": 7238, + "train/total_loss": 0.13200443983078003 + }, + { + "entropy": 8.787921905517578, + "epoch": 0.7157405576428713, + "mean_token_accuracy": 0.7117117047309875, + "num_tokens": 16862428.0, + "step": 7239, + "train/ce_loss": 0.7441038489341736 + }, + { + "epoch": 0.7157405576428713, + "step": 7239, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7157405576428713, + "step": 7239, + "train/total_loss": 0.12128538638353348 + }, + { + "epoch": 0.7158394304923867, + "grad_norm": 0.7717053890228271, + "learning_rate": 8.212678633239382e-06, + "loss": 0.1401, + "step": 7240 + }, + { + "entropy": 8.294328689575195, + "epoch": 0.7158394304923867, + "mean_token_accuracy": 0.7929901480674744, + "num_tokens": 16867827.0, + "step": 7240, + "train/ce_loss": 0.516572892665863 + }, + { + "epoch": 0.7158394304923867, + "step": 7240, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7158394304923867, + "step": 7240, + "train/total_loss": 0.1141572892665863 + }, + { + "entropy": 8.661676406860352, + "epoch": 0.7159383033419023, + "mean_token_accuracy": 0.6924939751625061, + "num_tokens": 16873311.0, + "step": 7241, + "train/ce_loss": 1.3083157539367676 + }, + { + "epoch": 0.7159383033419023, + "step": 7241, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.7159383033419023, + "step": 7241, + "train/total_loss": 0.21676908433437347 + }, + { + "entropy": 9.006423950195312, + "epoch": 0.7160371761914178, + "mean_token_accuracy": 0.7820737957954407, + "num_tokens": 16878342.0, + "step": 7242, + "train/ce_loss": 1.0763325691223145 + }, + { + "epoch": 0.7160371761914178, + "step": 7242, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7160371761914178, + "step": 7242, + "train/total_loss": 0.16622701287269592 + }, + { + "entropy": 8.977418899536133, + "epoch": 0.7161360490409333, + "mean_token_accuracy": 0.7130681872367859, + "num_tokens": 16883553.0, + "step": 7243, + "train/ce_loss": 1.2317839860916138 + }, + { + "epoch": 0.7161360490409333, + "step": 7243, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.7161360490409333, + "step": 7243, + "train/total_loss": 0.2247408926486969 + }, + { + "entropy": 8.649625778198242, + "epoch": 0.7162349218904489, + "mean_token_accuracy": 0.7537747025489807, + "num_tokens": 16888914.0, + "step": 7244, + "train/ce_loss": 0.5398925542831421 + }, + { + "epoch": 0.7162349218904489, + "step": 7244, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7162349218904489, + "step": 7244, + "train/total_loss": 0.06961426138877869 + }, + { + "entropy": 8.799467086791992, + "epoch": 0.7163337947399644, + "mean_token_accuracy": 0.6811594367027283, + "num_tokens": 16894264.0, + "step": 7245, + "train/ce_loss": 1.9662058353424072 + }, + { + "epoch": 0.7163337947399644, + "step": 7245, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.7163337947399644, + "step": 7245, + "train/total_loss": 0.2747455835342407 + }, + { + "entropy": 8.755645751953125, + "epoch": 0.7164326675894799, + "mean_token_accuracy": 0.7402299046516418, + "num_tokens": 16899628.0, + "step": 7246, + "train/ce_loss": 1.0392379760742188 + }, + { + "epoch": 0.7164326675894799, + "step": 7246, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.7164326675894799, + "step": 7246, + "train/total_loss": 0.19767379760742188 + }, + { + "entropy": 9.167597770690918, + "epoch": 0.7165315404389955, + "mean_token_accuracy": 0.7532228231430054, + "num_tokens": 16904611.0, + "step": 7247, + "train/ce_loss": 1.9028851738767116e-06 + }, + { + "epoch": 0.7165315404389955, + "step": 7247, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.7165315404389955, + "step": 7247, + "train/total_loss": 0.08203144371509552 + }, + { + "entropy": 9.476949691772461, + "epoch": 0.716630413288511, + "mean_token_accuracy": 0.7322275042533875, + "num_tokens": 16909444.0, + "step": 7248, + "train/ce_loss": 6.365969511534786e-06 + }, + { + "epoch": 0.716630413288511, + "step": 7248, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.716630413288511, + "step": 7248, + "train/total_loss": 0.04687563702464104 + }, + { + "entropy": 9.324455261230469, + "epoch": 0.7167292861380264, + "mean_token_accuracy": 0.8163716793060303, + "num_tokens": 16914355.0, + "step": 7249, + "train/ce_loss": 1.9346645785844885e-05 + }, + { + "epoch": 0.7167292861380264, + "step": 7249, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7167292861380264, + "step": 7249, + "train/total_loss": 0.0625019371509552 + }, + { + "entropy": 8.972284317016602, + "epoch": 0.716828158987542, + "mean_token_accuracy": 0.746081531047821, + "num_tokens": 16919452.0, + "step": 7250, + "train/ce_loss": 0.9190874695777893 + }, + { + "epoch": 0.716828158987542, + "step": 7250, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.716828158987542, + "step": 7250, + "train/total_loss": 0.1348775029182434 + }, + { + "entropy": 8.439289093017578, + "epoch": 0.7169270318370575, + "mean_token_accuracy": 0.735052764415741, + "num_tokens": 16924781.0, + "step": 7251, + "train/ce_loss": 1.137518048286438 + }, + { + "epoch": 0.7169270318370575, + "step": 7251, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.7169270318370575, + "step": 7251, + "train/total_loss": 0.20359554886817932 + }, + { + "entropy": 8.556591033935547, + "epoch": 0.717025904686573, + "mean_token_accuracy": 0.709227442741394, + "num_tokens": 16930205.0, + "step": 7252, + "train/ce_loss": 1.2083420753479004 + }, + { + "epoch": 0.717025904686573, + "step": 7252, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.717025904686573, + "step": 7252, + "train/total_loss": 0.19114670157432556 + }, + { + "entropy": 8.435426712036133, + "epoch": 0.7171247775360886, + "mean_token_accuracy": 0.7184079885482788, + "num_tokens": 16935732.0, + "step": 7253, + "train/ce_loss": 0.6173237562179565 + }, + { + "epoch": 0.7171247775360886, + "step": 7253, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.7171247775360886, + "step": 7253, + "train/total_loss": 0.13985738158226013 + }, + { + "entropy": 8.789794921875, + "epoch": 0.7172236503856041, + "mean_token_accuracy": 0.710659921169281, + "num_tokens": 16940861.0, + "step": 7254, + "train/ce_loss": 1.0131810903549194 + }, + { + "epoch": 0.7172236503856041, + "step": 7254, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7172236503856041, + "step": 7254, + "train/total_loss": 0.1599118709564209 + }, + { + "entropy": 8.52991008758545, + "epoch": 0.7173225232351197, + "mean_token_accuracy": 0.7250000238418579, + "num_tokens": 16946073.0, + "step": 7255, + "train/ce_loss": 1.1255245208740234 + }, + { + "epoch": 0.7173225232351197, + "step": 7255, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7173225232351197, + "step": 7255, + "train/total_loss": 0.1555212140083313 + }, + { + "entropy": 8.874298095703125, + "epoch": 0.7174213960846352, + "mean_token_accuracy": 0.7180851101875305, + "num_tokens": 16951307.0, + "step": 7256, + "train/ce_loss": 1.0606045722961426 + }, + { + "epoch": 0.7174213960846352, + "step": 7256, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7174213960846352, + "step": 7256, + "train/total_loss": 0.1646542102098465 + }, + { + "entropy": 8.793537139892578, + "epoch": 0.7175202689341507, + "mean_token_accuracy": 0.7648725509643555, + "num_tokens": 16956482.0, + "step": 7257, + "train/ce_loss": 1.007021188735962 + }, + { + "epoch": 0.7175202689341507, + "step": 7257, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7175202689341507, + "step": 7257, + "train/total_loss": 0.14367087185382843 + }, + { + "entropy": 8.745311737060547, + "epoch": 0.7176191417836663, + "mean_token_accuracy": 0.7070600390434265, + "num_tokens": 16961909.0, + "step": 7258, + "train/ce_loss": 0.41854217648506165 + }, + { + "epoch": 0.7176191417836663, + "step": 7258, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7176191417836663, + "step": 7258, + "train/total_loss": 0.06919796764850616 + }, + { + "entropy": 9.132467269897461, + "epoch": 0.7177180146331817, + "mean_token_accuracy": 0.7160278558731079, + "num_tokens": 16966932.0, + "step": 7259, + "train/ce_loss": 2.1726157665252686 + }, + { + "epoch": 0.7177180146331817, + "step": 7259, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.7177180146331817, + "step": 7259, + "train/total_loss": 0.29148033261299133 + }, + { + "epoch": 0.7178168874826972, + "grad_norm": 0.8078387379646301, + "learning_rate": 8.207733768481432e-06, + "loss": 0.1427, + "step": 7260 + }, + { + "entropy": 8.826803207397461, + "epoch": 0.7178168874826972, + "mean_token_accuracy": 0.707732617855072, + "num_tokens": 16972171.0, + "step": 7260, + "train/ce_loss": 0.8902512788772583 + }, + { + "epoch": 0.7178168874826972, + "step": 7260, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7178168874826972, + "step": 7260, + "train/total_loss": 0.15152513980865479 + }, + { + "entropy": 8.603378295898438, + "epoch": 0.7179157603322128, + "mean_token_accuracy": 0.7553443908691406, + "num_tokens": 16977437.0, + "step": 7261, + "train/ce_loss": 0.762908399105072 + }, + { + "epoch": 0.7179157603322128, + "step": 7261, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7179157603322128, + "step": 7261, + "train/total_loss": 0.11925958842039108 + }, + { + "entropy": 9.200379371643066, + "epoch": 0.7180146331817283, + "mean_token_accuracy": 0.7368420958518982, + "num_tokens": 16982388.0, + "step": 7262, + "train/ce_loss": 0.7837957143783569 + }, + { + "epoch": 0.7180146331817283, + "step": 7262, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7180146331817283, + "step": 7262, + "train/total_loss": 0.1135358214378357 + }, + { + "entropy": 9.373811721801758, + "epoch": 0.7181135060312438, + "mean_token_accuracy": 0.7566137313842773, + "num_tokens": 16987174.0, + "step": 7263, + "train/ce_loss": 3.704700475282152e-06 + }, + { + "epoch": 0.7181135060312438, + "step": 7263, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7181135060312438, + "step": 7263, + "train/total_loss": 0.019531620666384697 + }, + { + "entropy": 8.62483024597168, + "epoch": 0.7182123788807594, + "mean_token_accuracy": 0.6995661854743958, + "num_tokens": 16992550.0, + "step": 7264, + "train/ce_loss": 1.0524500608444214 + }, + { + "epoch": 0.7182123788807594, + "step": 7264, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7182123788807594, + "step": 7264, + "train/total_loss": 0.15993250906467438 + }, + { + "entropy": 8.923933029174805, + "epoch": 0.7183112517302749, + "mean_token_accuracy": 0.7289433479309082, + "num_tokens": 16997680.0, + "step": 7265, + "train/ce_loss": 0.675235390663147 + }, + { + "epoch": 0.7183112517302749, + "step": 7265, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7183112517302749, + "step": 7265, + "train/total_loss": 0.1026797890663147 + }, + { + "entropy": 8.639825820922852, + "epoch": 0.7184101245797904, + "mean_token_accuracy": 0.735195517539978, + "num_tokens": 17003217.0, + "step": 7266, + "train/ce_loss": 1.0886646509170532 + }, + { + "epoch": 0.7184101245797904, + "step": 7266, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.7184101245797904, + "step": 7266, + "train/total_loss": 0.18699146807193756 + }, + { + "entropy": 8.842220306396484, + "epoch": 0.718508997429306, + "mean_token_accuracy": 0.7212121486663818, + "num_tokens": 17008323.0, + "step": 7267, + "train/ce_loss": 1.6363429722332512e-06 + }, + { + "epoch": 0.718508997429306, + "step": 7267, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.718508997429306, + "step": 7267, + "train/total_loss": 0.03515641391277313 + }, + { + "entropy": 8.581207275390625, + "epoch": 0.7186078702788214, + "mean_token_accuracy": 0.7721046209335327, + "num_tokens": 17013564.0, + "step": 7268, + "train/ce_loss": 0.8590487837791443 + }, + { + "epoch": 0.7186078702788214, + "step": 7268, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7186078702788214, + "step": 7268, + "train/total_loss": 0.14059238135814667 + }, + { + "entropy": 8.644612312316895, + "epoch": 0.7187067431283369, + "mean_token_accuracy": 0.801672637462616, + "num_tokens": 17018854.0, + "step": 7269, + "train/ce_loss": 0.5526071786880493 + }, + { + "epoch": 0.7187067431283369, + "step": 7269, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7187067431283369, + "step": 7269, + "train/total_loss": 0.10213571786880493 + }, + { + "entropy": 8.574300765991211, + "epoch": 0.7188056159778525, + "mean_token_accuracy": 0.6941431760787964, + "num_tokens": 17024202.0, + "step": 7270, + "train/ce_loss": 0.7288708686828613 + }, + { + "epoch": 0.7188056159778525, + "step": 7270, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.7188056159778525, + "step": 7270, + "train/total_loss": 0.1471058428287506 + }, + { + "entropy": 9.051513671875, + "epoch": 0.718904488827368, + "mean_token_accuracy": 0.6909385323524475, + "num_tokens": 17029259.0, + "step": 7271, + "train/ce_loss": 1.2106337547302246 + }, + { + "epoch": 0.718904488827368, + "step": 7271, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.718904488827368, + "step": 7271, + "train/total_loss": 0.14840713143348694 + }, + { + "entropy": 9.47685718536377, + "epoch": 0.7190033616768835, + "mean_token_accuracy": 0.7555066347122192, + "num_tokens": 17034070.0, + "step": 7272, + "train/ce_loss": 1.5516616106033325 + }, + { + "epoch": 0.7190033616768835, + "step": 7272, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7190033616768835, + "step": 7272, + "train/total_loss": 0.2020411640405655 + }, + { + "entropy": 8.486614227294922, + "epoch": 0.7191022345263991, + "mean_token_accuracy": 0.8106796145439148, + "num_tokens": 17039398.0, + "step": 7273, + "train/ce_loss": 0.6014370322227478 + }, + { + "epoch": 0.7191022345263991, + "step": 7273, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7191022345263991, + "step": 7273, + "train/total_loss": 0.08358120918273926 + }, + { + "entropy": 8.953771591186523, + "epoch": 0.7192011073759146, + "mean_token_accuracy": 0.73884516954422, + "num_tokens": 17044604.0, + "step": 7274, + "train/ce_loss": 1.0438170433044434 + }, + { + "epoch": 0.7192011073759146, + "step": 7274, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7192011073759146, + "step": 7274, + "train/total_loss": 0.12391295284032822 + }, + { + "entropy": 8.621545791625977, + "epoch": 0.7192999802254301, + "mean_token_accuracy": 0.7087979912757874, + "num_tokens": 17049927.0, + "step": 7275, + "train/ce_loss": 1.3095701932907104 + }, + { + "epoch": 0.7192999802254301, + "step": 7275, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.7192999802254301, + "step": 7275, + "train/total_loss": 0.20517577230930328 + }, + { + "entropy": 8.50368881225586, + "epoch": 0.7193988530749457, + "mean_token_accuracy": 0.7884427309036255, + "num_tokens": 17055383.0, + "step": 7276, + "train/ce_loss": 0.5625127553939819 + }, + { + "epoch": 0.7193988530749457, + "step": 7276, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.7193988530749457, + "step": 7276, + "train/total_loss": 0.13437627255916595 + }, + { + "entropy": 8.556852340698242, + "epoch": 0.7194977259244612, + "mean_token_accuracy": 0.7288557291030884, + "num_tokens": 17060665.0, + "step": 7277, + "train/ce_loss": 0.8305554986000061 + }, + { + "epoch": 0.7194977259244612, + "step": 7277, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7194977259244612, + "step": 7277, + "train/total_loss": 0.1377430558204651 + }, + { + "entropy": 8.74986743927002, + "epoch": 0.7195965987739766, + "mean_token_accuracy": 0.7279999852180481, + "num_tokens": 17065820.0, + "step": 7278, + "train/ce_loss": 0.9675143361091614 + }, + { + "epoch": 0.7195965987739766, + "step": 7278, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7195965987739766, + "step": 7278, + "train/total_loss": 0.13972018659114838 + }, + { + "entropy": 9.176345825195312, + "epoch": 0.7196954716234922, + "mean_token_accuracy": 0.7654028534889221, + "num_tokens": 17070658.0, + "step": 7279, + "train/ce_loss": 9.67250616668025e-06 + }, + { + "epoch": 0.7196954716234922, + "step": 7279, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7196954716234922, + "step": 7279, + "train/total_loss": 0.0429697185754776 + }, + { + "epoch": 0.7197943444730077, + "grad_norm": 0.7864909172058105, + "learning_rate": 8.202788903723484e-06, + "loss": 0.135, + "step": 7280 + }, + { + "entropy": 8.759176254272461, + "epoch": 0.7197943444730077, + "mean_token_accuracy": 0.7441860437393188, + "num_tokens": 17075801.0, + "step": 7280, + "train/ce_loss": 1.1582366228103638 + }, + { + "epoch": 0.7197943444730077, + "step": 7280, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.7197943444730077, + "step": 7280, + "train/total_loss": 0.1822299063205719 + }, + { + "entropy": 8.721479415893555, + "epoch": 0.7198932173225232, + "mean_token_accuracy": 0.8014440536499023, + "num_tokens": 17081088.0, + "step": 7281, + "train/ce_loss": 0.3420823812484741 + }, + { + "epoch": 0.7198932173225232, + "step": 7281, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7198932173225232, + "step": 7281, + "train/total_loss": 0.08108323812484741 + }, + { + "entropy": 8.305360794067383, + "epoch": 0.7199920901720388, + "mean_token_accuracy": 0.7468030452728271, + "num_tokens": 17086360.0, + "step": 7282, + "train/ce_loss": 1.2415152788162231 + }, + { + "epoch": 0.7199920901720388, + "step": 7282, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7199920901720388, + "step": 7282, + "train/total_loss": 0.19446402788162231 + }, + { + "entropy": 8.77110481262207, + "epoch": 0.7200909630215543, + "mean_token_accuracy": 0.7856173515319824, + "num_tokens": 17091557.0, + "step": 7283, + "train/ce_loss": 1.6788532093414688e-06 + }, + { + "epoch": 0.7200909630215543, + "step": 7283, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7200909630215543, + "step": 7283, + "train/total_loss": 0.03125016763806343 + }, + { + "entropy": 8.664463996887207, + "epoch": 0.7201898358710698, + "mean_token_accuracy": 0.7898550629615784, + "num_tokens": 17096810.0, + "step": 7284, + "train/ce_loss": 0.7395745515823364 + }, + { + "epoch": 0.7201898358710698, + "step": 7284, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7201898358710698, + "step": 7284, + "train/total_loss": 0.09348870813846588 + }, + { + "entropy": 8.494865417480469, + "epoch": 0.7202887087205854, + "mean_token_accuracy": 0.7291428446769714, + "num_tokens": 17102316.0, + "step": 7285, + "train/ce_loss": 1.0033003091812134 + }, + { + "epoch": 0.7202887087205854, + "step": 7285, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.7202887087205854, + "step": 7285, + "train/total_loss": 0.19798627495765686 + }, + { + "entropy": 8.049318313598633, + "epoch": 0.7203875815701009, + "mean_token_accuracy": 0.7228571176528931, + "num_tokens": 17107833.0, + "step": 7286, + "train/ce_loss": 0.5071789026260376 + }, + { + "epoch": 0.7203875815701009, + "step": 7286, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7203875815701009, + "step": 7286, + "train/total_loss": 0.10931164026260376 + }, + { + "entropy": 8.946247100830078, + "epoch": 0.7204864544196163, + "mean_token_accuracy": 0.7163531184196472, + "num_tokens": 17113141.0, + "step": 7287, + "train/ce_loss": 1.2641355991363525 + }, + { + "epoch": 0.7204864544196163, + "step": 7287, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7204864544196163, + "step": 7287, + "train/total_loss": 0.18891356885433197 + }, + { + "entropy": 8.590167045593262, + "epoch": 0.7205853272691319, + "mean_token_accuracy": 0.7274826765060425, + "num_tokens": 17118500.0, + "step": 7288, + "train/ce_loss": 0.917041003704071 + }, + { + "epoch": 0.7205853272691319, + "step": 7288, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.7205853272691319, + "step": 7288, + "train/total_loss": 0.1776416003704071 + }, + { + "entropy": 8.500893592834473, + "epoch": 0.7206842001186474, + "mean_token_accuracy": 0.7496740818023682, + "num_tokens": 17123731.0, + "step": 7289, + "train/ce_loss": 0.7513278126716614 + }, + { + "epoch": 0.7206842001186474, + "step": 7289, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.7206842001186474, + "step": 7289, + "train/total_loss": 0.22357028722763062 + }, + { + "entropy": 8.950748443603516, + "epoch": 0.7207830729681629, + "mean_token_accuracy": 0.7232796549797058, + "num_tokens": 17128851.0, + "step": 7290, + "train/ce_loss": 1.2981815338134766 + }, + { + "epoch": 0.7207830729681629, + "step": 7290, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.7207830729681629, + "step": 7290, + "train/total_loss": 0.2313806563615799 + }, + { + "entropy": 8.955303192138672, + "epoch": 0.7208819458176785, + "mean_token_accuracy": 0.747706413269043, + "num_tokens": 17133935.0, + "step": 7291, + "train/ce_loss": 0.7134157419204712 + }, + { + "epoch": 0.7208819458176785, + "step": 7291, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7208819458176785, + "step": 7291, + "train/total_loss": 0.14165407419204712 + }, + { + "entropy": 9.262459754943848, + "epoch": 0.720980818667194, + "mean_token_accuracy": 0.7288888692855835, + "num_tokens": 17138528.0, + "step": 7292, + "train/ce_loss": 3.491542884148657e-05 + }, + { + "epoch": 0.720980818667194, + "step": 7292, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.720980818667194, + "step": 7292, + "train/total_loss": 0.07812849432229996 + }, + { + "entropy": 8.342453002929688, + "epoch": 0.7210796915167095, + "mean_token_accuracy": 0.6983184814453125, + "num_tokens": 17143950.0, + "step": 7293, + "train/ce_loss": 0.5059213042259216 + }, + { + "epoch": 0.7210796915167095, + "step": 7293, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7210796915167095, + "step": 7293, + "train/total_loss": 0.08184213191270828 + }, + { + "entropy": 8.34473991394043, + "epoch": 0.7211785643662251, + "mean_token_accuracy": 0.7457447052001953, + "num_tokens": 17149397.0, + "step": 7294, + "train/ce_loss": 1.351395606994629 + }, + { + "epoch": 0.7211785643662251, + "step": 7294, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.7211785643662251, + "step": 7294, + "train/total_loss": 0.2093583196401596 + }, + { + "entropy": 8.92292308807373, + "epoch": 0.7212774372157406, + "mean_token_accuracy": 0.7750343084335327, + "num_tokens": 17154570.0, + "step": 7295, + "train/ce_loss": 0.6525868773460388 + }, + { + "epoch": 0.7212774372157406, + "step": 7295, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7212774372157406, + "step": 7295, + "train/total_loss": 0.11603993922472 + }, + { + "entropy": 8.965505599975586, + "epoch": 0.721376310065256, + "mean_token_accuracy": 0.7849116921424866, + "num_tokens": 17159649.0, + "step": 7296, + "train/ce_loss": 1.060815691947937 + }, + { + "epoch": 0.721376310065256, + "step": 7296, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.721376310065256, + "step": 7296, + "train/total_loss": 0.15686282515525818 + }, + { + "entropy": 8.488422393798828, + "epoch": 0.7214751829147716, + "mean_token_accuracy": 0.707975447177887, + "num_tokens": 17164944.0, + "step": 7297, + "train/ce_loss": 0.5313419699668884 + }, + { + "epoch": 0.7214751829147716, + "step": 7297, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7214751829147716, + "step": 7297, + "train/total_loss": 0.07266545295715332 + }, + { + "entropy": 8.473865509033203, + "epoch": 0.7215740557642871, + "mean_token_accuracy": 0.7348642945289612, + "num_tokens": 17170337.0, + "step": 7298, + "train/ce_loss": 0.6764245629310608 + }, + { + "epoch": 0.7215740557642871, + "step": 7298, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7215740557642871, + "step": 7298, + "train/total_loss": 0.1184237077832222 + }, + { + "entropy": 8.944363594055176, + "epoch": 0.7216729286138026, + "mean_token_accuracy": 0.75, + "num_tokens": 17175397.0, + "step": 7299, + "train/ce_loss": 4.032572178402916e-06 + }, + { + "epoch": 0.7216729286138026, + "step": 7299, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7216729286138026, + "step": 7299, + "train/total_loss": 0.039062902331352234 + }, + { + "epoch": 0.7217718014633182, + "grad_norm": 0.6683271527290344, + "learning_rate": 8.197844038965535e-06, + "loss": 0.1385, + "step": 7300 + }, + { + "entropy": 8.849038124084473, + "epoch": 0.7217718014633182, + "mean_token_accuracy": 0.7812929749488831, + "num_tokens": 17180597.0, + "step": 7300, + "train/ce_loss": 0.8105028867721558 + }, + { + "epoch": 0.7217718014633182, + "step": 7300, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7217718014633182, + "step": 7300, + "train/total_loss": 0.13964404165744781 + }, + { + "entropy": 8.331449508666992, + "epoch": 0.7218706743128337, + "mean_token_accuracy": 0.7185500860214233, + "num_tokens": 17186034.0, + "step": 7301, + "train/ce_loss": 0.6758619546890259 + }, + { + "epoch": 0.7218706743128337, + "step": 7301, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7218706743128337, + "step": 7301, + "train/total_loss": 0.12617994844913483 + }, + { + "entropy": 9.267677307128906, + "epoch": 0.7219695471623492, + "mean_token_accuracy": 0.6851485371589661, + "num_tokens": 17190994.0, + "step": 7302, + "train/ce_loss": 1.6048048734664917 + }, + { + "epoch": 0.7219695471623492, + "step": 7302, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7219695471623492, + "step": 7302, + "train/total_loss": 0.21907423436641693 + }, + { + "entropy": 8.935868263244629, + "epoch": 0.7220684200118648, + "mean_token_accuracy": 0.7830769419670105, + "num_tokens": 17196129.0, + "step": 7303, + "train/ce_loss": 2.780159775284119e-06 + }, + { + "epoch": 0.7220684200118648, + "step": 7303, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7220684200118648, + "step": 7303, + "train/total_loss": 0.039062779396772385 + }, + { + "entropy": 8.983610153198242, + "epoch": 0.7221672928613803, + "mean_token_accuracy": 0.7756873965263367, + "num_tokens": 17201279.0, + "step": 7304, + "train/ce_loss": 0.7668209671974182 + }, + { + "epoch": 0.7221672928613803, + "step": 7304, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7221672928613803, + "step": 7304, + "train/total_loss": 0.14699459075927734 + }, + { + "entropy": 8.59881591796875, + "epoch": 0.7222661657108957, + "mean_token_accuracy": 0.7738232016563416, + "num_tokens": 17206657.0, + "step": 7305, + "train/ce_loss": 1.3534693717956543 + }, + { + "epoch": 0.7222661657108957, + "step": 7305, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7222661657108957, + "step": 7305, + "train/total_loss": 0.1900344341993332 + }, + { + "entropy": 8.371246337890625, + "epoch": 0.7223650385604113, + "mean_token_accuracy": 0.6880530714988708, + "num_tokens": 17212033.0, + "step": 7306, + "train/ce_loss": 1.5280489921569824 + }, + { + "epoch": 0.7223650385604113, + "step": 7306, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.7223650385604113, + "step": 7306, + "train/total_loss": 0.2621799111366272 + }, + { + "entropy": 8.451948165893555, + "epoch": 0.7224639114099268, + "mean_token_accuracy": 0.7366737723350525, + "num_tokens": 17217434.0, + "step": 7307, + "train/ce_loss": 0.8553075790405273 + }, + { + "epoch": 0.7224639114099268, + "step": 7307, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.7224639114099268, + "step": 7307, + "train/total_loss": 0.17146825790405273 + }, + { + "entropy": 8.971637725830078, + "epoch": 0.7225627842594423, + "mean_token_accuracy": 0.7372488379478455, + "num_tokens": 17222565.0, + "step": 7308, + "train/ce_loss": 1.1359513998031616 + }, + { + "epoch": 0.7225627842594423, + "step": 7308, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7225627842594423, + "step": 7308, + "train/total_loss": 0.1604701429605484 + }, + { + "entropy": 9.254190444946289, + "epoch": 0.7226616571089579, + "mean_token_accuracy": 0.7188612222671509, + "num_tokens": 17227609.0, + "step": 7309, + "train/ce_loss": 1.335377812385559 + }, + { + "epoch": 0.7226616571089579, + "step": 7309, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7226616571089579, + "step": 7309, + "train/total_loss": 0.18822528421878815 + }, + { + "entropy": 8.650724411010742, + "epoch": 0.7227605299584734, + "mean_token_accuracy": 0.7820025086402893, + "num_tokens": 17232854.0, + "step": 7310, + "train/ce_loss": 0.7430127263069153 + }, + { + "epoch": 0.7227605299584734, + "step": 7310, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7227605299584734, + "step": 7310, + "train/total_loss": 0.12898877263069153 + }, + { + "entropy": 8.497949600219727, + "epoch": 0.7228594028079889, + "mean_token_accuracy": 0.6936936974525452, + "num_tokens": 17238307.0, + "step": 7311, + "train/ce_loss": 0.9521242380142212 + }, + { + "epoch": 0.7228594028079889, + "step": 7311, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7228594028079889, + "step": 7311, + "train/total_loss": 0.1381811797618866 + }, + { + "entropy": 9.520938873291016, + "epoch": 0.7229582756575045, + "mean_token_accuracy": 0.7518072128295898, + "num_tokens": 17243117.0, + "step": 7312, + "train/ce_loss": 1.2061069011688232 + }, + { + "epoch": 0.7229582756575045, + "step": 7312, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7229582756575045, + "step": 7312, + "train/total_loss": 0.16748568415641785 + }, + { + "entropy": 8.96164321899414, + "epoch": 0.72305714850702, + "mean_token_accuracy": 0.7125645279884338, + "num_tokens": 17248119.0, + "step": 7313, + "train/ce_loss": 1.5244228839874268 + }, + { + "epoch": 0.72305714850702, + "step": 7313, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.72305714850702, + "step": 7313, + "train/total_loss": 0.20712979137897491 + }, + { + "entropy": 8.680285453796387, + "epoch": 0.7231560213565355, + "mean_token_accuracy": 0.7047146558761597, + "num_tokens": 17253399.0, + "step": 7314, + "train/ce_loss": 2.0357666015625 + }, + { + "epoch": 0.7231560213565355, + "step": 7314, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7231560213565355, + "step": 7314, + "train/total_loss": 0.2660766839981079 + }, + { + "entropy": 8.502523422241211, + "epoch": 0.723254894206051, + "mean_token_accuracy": 0.7521008253097534, + "num_tokens": 17258839.0, + "step": 7315, + "train/ce_loss": 1.0317186117172241 + }, + { + "epoch": 0.723254894206051, + "step": 7315, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.723254894206051, + "step": 7315, + "train/total_loss": 0.15004685521125793 + }, + { + "entropy": 8.51579475402832, + "epoch": 0.7233537670555665, + "mean_token_accuracy": 0.7277904152870178, + "num_tokens": 17264243.0, + "step": 7316, + "train/ce_loss": 0.7120223641395569 + }, + { + "epoch": 0.7233537670555665, + "step": 7316, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.7233537670555665, + "step": 7316, + "train/total_loss": 0.16104599833488464 + }, + { + "entropy": 8.494817733764648, + "epoch": 0.723452639905082, + "mean_token_accuracy": 0.7609195113182068, + "num_tokens": 17269687.0, + "step": 7317, + "train/ce_loss": 0.6494558453559875 + }, + { + "epoch": 0.723452639905082, + "step": 7317, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.723452639905082, + "step": 7317, + "train/total_loss": 0.16260182857513428 + }, + { + "entropy": 8.299484252929688, + "epoch": 0.7235515127545976, + "mean_token_accuracy": 0.7377210259437561, + "num_tokens": 17275225.0, + "step": 7318, + "train/ce_loss": 1.2110645771026611 + }, + { + "epoch": 0.7235515127545976, + "step": 7318, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7235515127545976, + "step": 7318, + "train/total_loss": 0.17970021069049835 + }, + { + "entropy": 9.056045532226562, + "epoch": 0.7236503856041131, + "mean_token_accuracy": 0.759358286857605, + "num_tokens": 17280252.0, + "step": 7319, + "train/ce_loss": 3.6766282391909044e-06 + }, + { + "epoch": 0.7236503856041131, + "step": 7319, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7236503856041131, + "step": 7319, + "train/total_loss": 0.03125036880373955 + }, + { + "epoch": 0.7237492584536286, + "grad_norm": 0.7412518262863159, + "learning_rate": 8.192899174207585e-06, + "loss": 0.1361, + "step": 7320 + }, + { + "entropy": 8.961812973022461, + "epoch": 0.7237492584536286, + "mean_token_accuracy": 0.7068965435028076, + "num_tokens": 17285276.0, + "step": 7320, + "train/ce_loss": 1.3280572891235352 + }, + { + "epoch": 0.7237492584536286, + "step": 7320, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7237492584536286, + "step": 7320, + "train/total_loss": 0.203118234872818 + }, + { + "entropy": 8.624178886413574, + "epoch": 0.7238481313031442, + "mean_token_accuracy": 0.7888888716697693, + "num_tokens": 17290638.0, + "step": 7321, + "train/ce_loss": 0.7367948293685913 + }, + { + "epoch": 0.7238481313031442, + "step": 7321, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7238481313031442, + "step": 7321, + "train/total_loss": 0.08930448442697525 + }, + { + "entropy": 8.59969711303711, + "epoch": 0.7239470041526597, + "mean_token_accuracy": 0.7271605134010315, + "num_tokens": 17295983.0, + "step": 7322, + "train/ce_loss": 0.4919517934322357 + }, + { + "epoch": 0.7239470041526597, + "step": 7322, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7239470041526597, + "step": 7322, + "train/total_loss": 0.08044518530368805 + }, + { + "entropy": 9.034954071044922, + "epoch": 0.7240458770021752, + "mean_token_accuracy": 0.716911792755127, + "num_tokens": 17300973.0, + "step": 7323, + "train/ce_loss": 0.9119350910186768 + }, + { + "epoch": 0.7240458770021752, + "step": 7323, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.7240458770021752, + "step": 7323, + "train/total_loss": 0.19275601208209991 + }, + { + "entropy": 8.415181159973145, + "epoch": 0.7241447498516907, + "mean_token_accuracy": 0.7527233362197876, + "num_tokens": 17306318.0, + "step": 7324, + "train/ce_loss": 1.0075483322143555 + }, + { + "epoch": 0.7241447498516907, + "step": 7324, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7241447498516907, + "step": 7324, + "train/total_loss": 0.16325482726097107 + }, + { + "entropy": 8.788187980651855, + "epoch": 0.7242436227012062, + "mean_token_accuracy": 0.7599999904632568, + "num_tokens": 17311610.0, + "step": 7325, + "train/ce_loss": 0.5460984706878662 + }, + { + "epoch": 0.7242436227012062, + "step": 7325, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7242436227012062, + "step": 7325, + "train/total_loss": 0.10539110004901886 + }, + { + "entropy": 9.16337776184082, + "epoch": 0.7243424955507217, + "mean_token_accuracy": 0.7110186815261841, + "num_tokens": 17316551.0, + "step": 7326, + "train/ce_loss": 1.2475454807281494 + }, + { + "epoch": 0.7243424955507217, + "step": 7326, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7243424955507217, + "step": 7326, + "train/total_loss": 0.18725454807281494 + }, + { + "entropy": 8.655111312866211, + "epoch": 0.7244413684002373, + "mean_token_accuracy": 0.7433751821517944, + "num_tokens": 17321755.0, + "step": 7327, + "train/ce_loss": 1.4795926809310913 + }, + { + "epoch": 0.7244413684002373, + "step": 7327, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7244413684002373, + "step": 7327, + "train/total_loss": 0.21045927703380585 + }, + { + "entropy": 9.506175994873047, + "epoch": 0.7245402412497528, + "mean_token_accuracy": 0.6496163606643677, + "num_tokens": 17326533.0, + "step": 7328, + "train/ce_loss": 1.2679378986358643 + }, + { + "epoch": 0.7245402412497528, + "step": 7328, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7245402412497528, + "step": 7328, + "train/total_loss": 0.1697625368833542 + }, + { + "entropy": 8.775362014770508, + "epoch": 0.7246391140992683, + "mean_token_accuracy": 0.731218695640564, + "num_tokens": 17331595.0, + "step": 7329, + "train/ce_loss": 4.083109615748981e-06 + }, + { + "epoch": 0.7246391140992683, + "step": 7329, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7246391140992683, + "step": 7329, + "train/total_loss": 0.02734415791928768 + }, + { + "entropy": 8.553563117980957, + "epoch": 0.7247379869487839, + "mean_token_accuracy": 0.7808598875999451, + "num_tokens": 17336795.0, + "step": 7330, + "train/ce_loss": 1.1927872896194458 + }, + { + "epoch": 0.7247379869487839, + "step": 7330, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7247379869487839, + "step": 7330, + "train/total_loss": 0.17396622896194458 + }, + { + "entropy": 8.947246551513672, + "epoch": 0.7248368597982994, + "mean_token_accuracy": 0.6691957712173462, + "num_tokens": 17341876.0, + "step": 7331, + "train/ce_loss": 9.290523053095967e-07 + }, + { + "epoch": 0.7248368597982994, + "step": 7331, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7248368597982994, + "step": 7331, + "train/total_loss": 0.03515634313225746 + }, + { + "entropy": 8.291561126708984, + "epoch": 0.7249357326478149, + "mean_token_accuracy": 0.7085152864456177, + "num_tokens": 17347287.0, + "step": 7332, + "train/ce_loss": 0.6597549915313721 + }, + { + "epoch": 0.7249357326478149, + "step": 7332, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.7249357326478149, + "step": 7332, + "train/total_loss": 0.15191300213336945 + }, + { + "entropy": 8.774089813232422, + "epoch": 0.7250346054973305, + "mean_token_accuracy": 0.7362204790115356, + "num_tokens": 17352474.0, + "step": 7333, + "train/ce_loss": 0.8822577595710754 + }, + { + "epoch": 0.7250346054973305, + "step": 7333, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.7250346054973305, + "step": 7333, + "train/total_loss": 0.17806953191757202 + }, + { + "entropy": 9.008880615234375, + "epoch": 0.7251334783468459, + "mean_token_accuracy": 0.7876105904579163, + "num_tokens": 17357450.0, + "step": 7334, + "train/ce_loss": 1.1267569065093994 + }, + { + "epoch": 0.7251334783468459, + "step": 7334, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7251334783468459, + "step": 7334, + "train/total_loss": 0.17517569661140442 + }, + { + "entropy": 8.721375465393066, + "epoch": 0.7252323511963614, + "mean_token_accuracy": 0.7621878981590271, + "num_tokens": 17362752.0, + "step": 7335, + "train/ce_loss": 0.8404841423034668 + }, + { + "epoch": 0.7252323511963614, + "step": 7335, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.7252323511963614, + "step": 7335, + "train/total_loss": 0.15045467019081116 + }, + { + "entropy": 8.608890533447266, + "epoch": 0.725331224045877, + "mean_token_accuracy": 0.7049180269241333, + "num_tokens": 17368205.0, + "step": 7336, + "train/ce_loss": 0.607792317867279 + }, + { + "epoch": 0.725331224045877, + "step": 7336, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.725331224045877, + "step": 7336, + "train/total_loss": 0.11156047880649567 + }, + { + "entropy": 8.911270141601562, + "epoch": 0.7254300968953925, + "mean_token_accuracy": 0.8027523159980774, + "num_tokens": 17373265.0, + "step": 7337, + "train/ce_loss": 0.5394410490989685 + }, + { + "epoch": 0.7254300968953925, + "step": 7337, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7254300968953925, + "step": 7337, + "train/total_loss": 0.09300661087036133 + }, + { + "entropy": 9.362253189086914, + "epoch": 0.7255289697449081, + "mean_token_accuracy": 0.8140589594841003, + "num_tokens": 17378321.0, + "step": 7338, + "train/ce_loss": 2.5051738248293987e-06 + }, + { + "epoch": 0.7255289697449081, + "step": 7338, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7255289697449081, + "step": 7338, + "train/total_loss": 0.023437749594449997 + }, + { + "entropy": 8.757553100585938, + "epoch": 0.7256278425944236, + "mean_token_accuracy": 0.6928281188011169, + "num_tokens": 17383680.0, + "step": 7339, + "train/ce_loss": 1.2112091779708862 + }, + { + "epoch": 0.7256278425944236, + "step": 7339, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.7256278425944236, + "step": 7339, + "train/total_loss": 0.18752717971801758 + }, + { + "epoch": 0.7257267154439391, + "grad_norm": 0.7014778852462769, + "learning_rate": 8.187954309449638e-06, + "loss": 0.1341, + "step": 7340 + }, + { + "entropy": 9.233248710632324, + "epoch": 0.7257267154439391, + "mean_token_accuracy": 0.723809540271759, + "num_tokens": 17388614.0, + "step": 7340, + "train/ce_loss": 1.8752094507217407 + }, + { + "epoch": 0.7257267154439391, + "step": 7340, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.7257267154439391, + "step": 7340, + "train/total_loss": 0.26955220103263855 + }, + { + "entropy": 9.112678527832031, + "epoch": 0.7258255882934547, + "mean_token_accuracy": 0.7578397393226624, + "num_tokens": 17393630.0, + "step": 7341, + "train/ce_loss": 1.0484298467636108 + }, + { + "epoch": 0.7258255882934547, + "step": 7341, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7258255882934547, + "step": 7341, + "train/total_loss": 0.14390549063682556 + }, + { + "entropy": 9.32995319366455, + "epoch": 0.7259244611429702, + "mean_token_accuracy": 0.657706081867218, + "num_tokens": 17398628.0, + "step": 7342, + "train/ce_loss": 1.2173696756362915 + }, + { + "epoch": 0.7259244611429702, + "step": 7342, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7259244611429702, + "step": 7342, + "train/total_loss": 0.16470572352409363 + }, + { + "entropy": 8.545857429504395, + "epoch": 0.7260233339924856, + "mean_token_accuracy": 0.813034176826477, + "num_tokens": 17404007.0, + "step": 7343, + "train/ce_loss": 0.4299650490283966 + }, + { + "epoch": 0.7260233339924856, + "step": 7343, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7260233339924856, + "step": 7343, + "train/total_loss": 0.05862150713801384 + }, + { + "entropy": 8.629613876342773, + "epoch": 0.7261222068420012, + "mean_token_accuracy": 0.7901785969734192, + "num_tokens": 17409341.0, + "step": 7344, + "train/ce_loss": 0.8379325866699219 + }, + { + "epoch": 0.7261222068420012, + "step": 7344, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7261222068420012, + "step": 7344, + "train/total_loss": 0.1384807527065277 + }, + { + "entropy": 8.712474822998047, + "epoch": 0.7262210796915167, + "mean_token_accuracy": 0.7073474526405334, + "num_tokens": 17414572.0, + "step": 7345, + "train/ce_loss": 1.0608352422714233 + }, + { + "epoch": 0.7262210796915167, + "step": 7345, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7262210796915167, + "step": 7345, + "train/total_loss": 0.17639602720737457 + }, + { + "entropy": 9.008268356323242, + "epoch": 0.7263199525410322, + "mean_token_accuracy": 0.7863247990608215, + "num_tokens": 17419675.0, + "step": 7346, + "train/ce_loss": 0.894185483455658 + }, + { + "epoch": 0.7263199525410322, + "step": 7346, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7263199525410322, + "step": 7346, + "train/total_loss": 0.12066855281591415 + }, + { + "entropy": 8.653308868408203, + "epoch": 0.7264188253905478, + "mean_token_accuracy": 0.7242990732192993, + "num_tokens": 17424975.0, + "step": 7347, + "train/ce_loss": 0.6597151756286621 + }, + { + "epoch": 0.7264188253905478, + "step": 7347, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7264188253905478, + "step": 7347, + "train/total_loss": 0.08159651607275009 + }, + { + "entropy": 8.451759338378906, + "epoch": 0.7265176982400633, + "mean_token_accuracy": 0.7743830680847168, + "num_tokens": 17430523.0, + "step": 7348, + "train/ce_loss": 0.42508605122566223 + }, + { + "epoch": 0.7265176982400633, + "step": 7348, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7265176982400633, + "step": 7348, + "train/total_loss": 0.08157110214233398 + }, + { + "entropy": 8.61266803741455, + "epoch": 0.7266165710895788, + "mean_token_accuracy": 0.7376623153686523, + "num_tokens": 17435977.0, + "step": 7349, + "train/ce_loss": 1.0716931819915771 + }, + { + "epoch": 0.7266165710895788, + "step": 7349, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7266165710895788, + "step": 7349, + "train/total_loss": 0.13451308012008667 + }, + { + "entropy": 8.098893165588379, + "epoch": 0.7267154439390944, + "mean_token_accuracy": 0.7740992903709412, + "num_tokens": 17441465.0, + "step": 7350, + "train/ce_loss": 0.5998178720474243 + }, + { + "epoch": 0.7267154439390944, + "step": 7350, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7267154439390944, + "step": 7350, + "train/total_loss": 0.10685679316520691 + }, + { + "entropy": 9.199495315551758, + "epoch": 0.7268143167886099, + "mean_token_accuracy": 0.7086882591247559, + "num_tokens": 17446501.0, + "step": 7351, + "train/ce_loss": 0.7616429924964905 + }, + { + "epoch": 0.7268143167886099, + "step": 7351, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7268143167886099, + "step": 7351, + "train/total_loss": 0.10741429775953293 + }, + { + "entropy": 9.160688400268555, + "epoch": 0.7269131896381253, + "mean_token_accuracy": 0.7421602606773376, + "num_tokens": 17451525.0, + "step": 7352, + "train/ce_loss": 0.6872962713241577 + }, + { + "epoch": 0.7269131896381253, + "step": 7352, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.7269131896381253, + "step": 7352, + "train/total_loss": 0.08044838160276413 + }, + { + "entropy": 8.840906143188477, + "epoch": 0.7270120624876409, + "mean_token_accuracy": 0.8368263244628906, + "num_tokens": 17456701.0, + "step": 7353, + "train/ce_loss": 0.7152250409126282 + }, + { + "epoch": 0.7270120624876409, + "step": 7353, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.7270120624876409, + "step": 7353, + "train/total_loss": 0.16917875409126282 + }, + { + "entropy": 8.859922409057617, + "epoch": 0.7271109353371564, + "mean_token_accuracy": 0.7874464988708496, + "num_tokens": 17461877.0, + "step": 7354, + "train/ce_loss": 0.7153387069702148 + }, + { + "epoch": 0.7271109353371564, + "step": 7354, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.7271109353371564, + "step": 7354, + "train/total_loss": 0.15356512367725372 + }, + { + "entropy": 8.345499038696289, + "epoch": 0.7272098081866719, + "mean_token_accuracy": 0.7487636208534241, + "num_tokens": 17467412.0, + "step": 7355, + "train/ce_loss": 0.3188781440258026 + }, + { + "epoch": 0.7272098081866719, + "step": 7355, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7272098081866719, + "step": 7355, + "train/total_loss": 0.05532531440258026 + }, + { + "entropy": 8.838354110717773, + "epoch": 0.7273086810361875, + "mean_token_accuracy": 0.7466443181037903, + "num_tokens": 17472610.0, + "step": 7356, + "train/ce_loss": 1.1068003177642822 + }, + { + "epoch": 0.7273086810361875, + "step": 7356, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.7273086810361875, + "step": 7356, + "train/total_loss": 0.20833629369735718 + }, + { + "entropy": 8.740463256835938, + "epoch": 0.727407553885703, + "mean_token_accuracy": 0.8121212124824524, + "num_tokens": 17477790.0, + "step": 7357, + "train/ce_loss": 0.8163862824440002 + }, + { + "epoch": 0.727407553885703, + "step": 7357, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.727407553885703, + "step": 7357, + "train/total_loss": 0.1207011267542839 + }, + { + "entropy": 8.466141700744629, + "epoch": 0.7275064267352185, + "mean_token_accuracy": 0.7346513867378235, + "num_tokens": 17483220.0, + "step": 7358, + "train/ce_loss": 0.9694086313247681 + }, + { + "epoch": 0.7275064267352185, + "step": 7358, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7275064267352185, + "step": 7358, + "train/total_loss": 0.13600337505340576 + }, + { + "entropy": 9.09575080871582, + "epoch": 0.7276052995847341, + "mean_token_accuracy": 0.7530487775802612, + "num_tokens": 17488329.0, + "step": 7359, + "train/ce_loss": 2.8264503271202557e-06 + }, + { + "epoch": 0.7276052995847341, + "step": 7359, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7276052995847341, + "step": 7359, + "train/total_loss": 0.027344033122062683 + }, + { + "epoch": 0.7277041724342496, + "grad_norm": 0.715436577796936, + "learning_rate": 8.183009444691688e-06, + "loss": 0.1286, + "step": 7360 + }, + { + "entropy": 8.299179077148438, + "epoch": 0.7277041724342496, + "mean_token_accuracy": 0.7663366198539734, + "num_tokens": 17493841.0, + "step": 7360, + "train/ce_loss": 0.8519513010978699 + }, + { + "epoch": 0.7277041724342496, + "step": 7360, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7277041724342496, + "step": 7360, + "train/total_loss": 0.1242576315999031 + }, + { + "entropy": 8.913398742675781, + "epoch": 0.727803045283765, + "mean_token_accuracy": 0.7658402323722839, + "num_tokens": 17499035.0, + "step": 7361, + "train/ce_loss": 0.6264690160751343 + }, + { + "epoch": 0.727803045283765, + "step": 7361, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.727803045283765, + "step": 7361, + "train/total_loss": 0.08999065309762955 + }, + { + "entropy": 9.48381233215332, + "epoch": 0.7279019181332806, + "mean_token_accuracy": 0.7548746466636658, + "num_tokens": 17503785.0, + "step": 7362, + "train/ce_loss": 1.407195031788433e-05 + }, + { + "epoch": 0.7279019181332806, + "step": 7362, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7279019181332806, + "step": 7362, + "train/total_loss": 0.02343890629708767 + }, + { + "entropy": 9.410112380981445, + "epoch": 0.7280007909827961, + "mean_token_accuracy": 0.6206185817718506, + "num_tokens": 17508709.0, + "step": 7363, + "train/ce_loss": 3.2126429232448572e-06 + }, + { + "epoch": 0.7280007909827961, + "step": 7363, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7280007909827961, + "step": 7363, + "train/total_loss": 0.023437820374965668 + }, + { + "entropy": 8.998542785644531, + "epoch": 0.7280996638323116, + "mean_token_accuracy": 0.7198879718780518, + "num_tokens": 17513839.0, + "step": 7364, + "train/ce_loss": 1.8226782083511353 + }, + { + "epoch": 0.7280996638323116, + "step": 7364, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7280996638323116, + "step": 7364, + "train/total_loss": 0.22133032977581024 + }, + { + "entropy": 8.419729232788086, + "epoch": 0.7281985366818272, + "mean_token_accuracy": 0.7397260069847107, + "num_tokens": 17519340.0, + "step": 7365, + "train/ce_loss": 0.5257212519645691 + }, + { + "epoch": 0.7281985366818272, + "step": 7365, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7281985366818272, + "step": 7365, + "train/total_loss": 0.06819713115692139 + }, + { + "entropy": 8.369890213012695, + "epoch": 0.7282974095313427, + "mean_token_accuracy": 0.7153284549713135, + "num_tokens": 17524796.0, + "step": 7366, + "train/ce_loss": 1.0927273035049438 + }, + { + "epoch": 0.7282974095313427, + "step": 7366, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7282974095313427, + "step": 7366, + "train/total_loss": 0.14833523333072662 + }, + { + "entropy": 9.245950698852539, + "epoch": 0.7283962823808582, + "mean_token_accuracy": 0.7298187613487244, + "num_tokens": 17530022.0, + "step": 7367, + "train/ce_loss": 0.9309850335121155 + }, + { + "epoch": 0.7283962823808582, + "step": 7367, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7283962823808582, + "step": 7367, + "train/total_loss": 0.1438797563314438 + }, + { + "entropy": 9.127169609069824, + "epoch": 0.7284951552303738, + "mean_token_accuracy": 0.7284172773361206, + "num_tokens": 17535072.0, + "step": 7368, + "train/ce_loss": 1.8420739706925815e-06 + }, + { + "epoch": 0.7284951552303738, + "step": 7368, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7284951552303738, + "step": 7368, + "train/total_loss": 0.031250182539224625 + }, + { + "entropy": 8.681436538696289, + "epoch": 0.7285940280798893, + "mean_token_accuracy": 0.7388613820075989, + "num_tokens": 17540395.0, + "step": 7369, + "train/ce_loss": 1.1861965656280518 + }, + { + "epoch": 0.7285940280798893, + "step": 7369, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7285940280798893, + "step": 7369, + "train/total_loss": 0.1654946506023407 + }, + { + "entropy": 9.279239654541016, + "epoch": 0.7286929009294048, + "mean_token_accuracy": 0.7154639363288879, + "num_tokens": 17545286.0, + "step": 7370, + "train/ce_loss": 1.6929576531765633e-06 + }, + { + "epoch": 0.7286929009294048, + "step": 7370, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7286929009294048, + "step": 7370, + "train/total_loss": 0.01953141950070858 + }, + { + "entropy": 8.704438209533691, + "epoch": 0.7287917737789203, + "mean_token_accuracy": 0.7281323671340942, + "num_tokens": 17550780.0, + "step": 7371, + "train/ce_loss": 0.9354274272918701 + }, + { + "epoch": 0.7287917737789203, + "step": 7371, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.7287917737789203, + "step": 7371, + "train/total_loss": 0.18338650465011597 + }, + { + "entropy": 9.061699867248535, + "epoch": 0.7288906466284358, + "mean_token_accuracy": 0.7403684854507446, + "num_tokens": 17555835.0, + "step": 7372, + "train/ce_loss": 0.9406352043151855 + }, + { + "epoch": 0.7288906466284358, + "step": 7372, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7288906466284358, + "step": 7372, + "train/total_loss": 0.10968852043151855 + }, + { + "entropy": 8.256986618041992, + "epoch": 0.7289895194779513, + "mean_token_accuracy": 0.8081587553024292, + "num_tokens": 17561260.0, + "step": 7373, + "train/ce_loss": 0.6097618937492371 + }, + { + "epoch": 0.7289895194779513, + "step": 7373, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.7289895194779513, + "step": 7373, + "train/total_loss": 0.12738244235515594 + }, + { + "entropy": 9.185291290283203, + "epoch": 0.7290883923274669, + "mean_token_accuracy": 0.7113970518112183, + "num_tokens": 17566203.0, + "step": 7374, + "train/ce_loss": 1.5345923900604248 + }, + { + "epoch": 0.7290883923274669, + "step": 7374, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7290883923274669, + "step": 7374, + "train/total_loss": 0.21595923602581024 + }, + { + "entropy": 8.327561378479004, + "epoch": 0.7291872651769824, + "mean_token_accuracy": 0.7195817232131958, + "num_tokens": 17571715.0, + "step": 7375, + "train/ce_loss": 0.7341464161872864 + }, + { + "epoch": 0.7291872651769824, + "step": 7375, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7291872651769824, + "step": 7375, + "train/total_loss": 0.089039646089077 + }, + { + "entropy": 9.330865859985352, + "epoch": 0.7292861380264979, + "mean_token_accuracy": 0.6661211252212524, + "num_tokens": 17576749.0, + "step": 7376, + "train/ce_loss": 1.2426577806472778 + }, + { + "epoch": 0.7292861380264979, + "step": 7376, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.7292861380264979, + "step": 7376, + "train/total_loss": 0.21020328998565674 + }, + { + "entropy": 8.291744232177734, + "epoch": 0.7293850108760135, + "mean_token_accuracy": 0.7259752750396729, + "num_tokens": 17582272.0, + "step": 7377, + "train/ce_loss": 0.8136835098266602 + }, + { + "epoch": 0.7293850108760135, + "step": 7377, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.7293850108760135, + "step": 7377, + "train/total_loss": 0.1673058569431305 + }, + { + "entropy": 9.165674209594727, + "epoch": 0.729483883725529, + "mean_token_accuracy": 0.7454844117164612, + "num_tokens": 17587321.0, + "step": 7378, + "train/ce_loss": 1.0950554609298706 + }, + { + "epoch": 0.729483883725529, + "step": 7378, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.729483883725529, + "step": 7378, + "train/total_loss": 0.1993492990732193 + }, + { + "entropy": 9.01347541809082, + "epoch": 0.7295827565750445, + "mean_token_accuracy": 0.6778916716575623, + "num_tokens": 17592451.0, + "step": 7379, + "train/ce_loss": 1.272599458694458 + }, + { + "epoch": 0.7295827565750445, + "step": 7379, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7295827565750445, + "step": 7379, + "train/total_loss": 0.18585370481014252 + }, + { + "epoch": 0.72968162942456, + "grad_norm": 0.7320528030395508, + "learning_rate": 8.17806457993374e-06, + "loss": 0.1383, + "step": 7380 + }, + { + "entropy": 9.084978103637695, + "epoch": 0.72968162942456, + "mean_token_accuracy": 0.8264462947845459, + "num_tokens": 17597583.0, + "step": 7380, + "train/ce_loss": 0.6765223145484924 + }, + { + "epoch": 0.72968162942456, + "step": 7380, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.72968162942456, + "step": 7380, + "train/total_loss": 0.10671473294496536 + }, + { + "entropy": 9.180556297302246, + "epoch": 0.7297805022740755, + "mean_token_accuracy": 0.711033284664154, + "num_tokens": 17602516.0, + "step": 7381, + "train/ce_loss": 1.0049020051956177 + }, + { + "epoch": 0.7297805022740755, + "step": 7381, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7297805022740755, + "step": 7381, + "train/total_loss": 0.15127146244049072 + }, + { + "entropy": 8.788009643554688, + "epoch": 0.729879375123591, + "mean_token_accuracy": 0.692307710647583, + "num_tokens": 17607628.0, + "step": 7382, + "train/ce_loss": 1.089476227760315 + }, + { + "epoch": 0.729879375123591, + "step": 7382, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.729879375123591, + "step": 7382, + "train/total_loss": 0.18316638469696045 + }, + { + "entropy": 8.832174301147461, + "epoch": 0.7299782479731066, + "mean_token_accuracy": 0.791023850440979, + "num_tokens": 17612691.0, + "step": 7383, + "train/ce_loss": 0.3496512472629547 + }, + { + "epoch": 0.7299782479731066, + "step": 7383, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7299782479731066, + "step": 7383, + "train/total_loss": 0.07012137770652771 + }, + { + "entropy": 8.969341278076172, + "epoch": 0.7300771208226221, + "mean_token_accuracy": 0.757656455039978, + "num_tokens": 17617886.0, + "step": 7384, + "train/ce_loss": 1.3367009162902832 + }, + { + "epoch": 0.7300771208226221, + "step": 7384, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7300771208226221, + "step": 7384, + "train/total_loss": 0.19617009162902832 + }, + { + "entropy": 9.211132049560547, + "epoch": 0.7301759936721376, + "mean_token_accuracy": 0.7120315432548523, + "num_tokens": 17622788.0, + "step": 7385, + "train/ce_loss": 1.2451320886611938 + }, + { + "epoch": 0.7301759936721376, + "step": 7385, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7301759936721376, + "step": 7385, + "train/total_loss": 0.17920070886611938 + }, + { + "entropy": 8.61956787109375, + "epoch": 0.7302748665216532, + "mean_token_accuracy": 0.7086614370346069, + "num_tokens": 17627988.0, + "step": 7386, + "train/ce_loss": 1.3923940658569336 + }, + { + "epoch": 0.7302748665216532, + "step": 7386, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.7302748665216532, + "step": 7386, + "train/total_loss": 0.22127066552639008 + }, + { + "entropy": 8.674093246459961, + "epoch": 0.7303737393711687, + "mean_token_accuracy": 0.7386934757232666, + "num_tokens": 17633417.0, + "step": 7387, + "train/ce_loss": 0.6746419668197632 + }, + { + "epoch": 0.7303737393711687, + "step": 7387, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.7303737393711687, + "step": 7387, + "train/total_loss": 0.1573079526424408 + }, + { + "entropy": 8.782123565673828, + "epoch": 0.7304726122206842, + "mean_token_accuracy": 0.6891891956329346, + "num_tokens": 17638583.0, + "step": 7388, + "train/ce_loss": 1.1684306859970093 + }, + { + "epoch": 0.7304726122206842, + "step": 7388, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7304726122206842, + "step": 7388, + "train/total_loss": 0.1598118245601654 + }, + { + "entropy": 8.279651641845703, + "epoch": 0.7305714850701998, + "mean_token_accuracy": 0.7507629990577698, + "num_tokens": 17644035.0, + "step": 7389, + "train/ce_loss": 0.5885629653930664 + }, + { + "epoch": 0.7305714850701998, + "step": 7389, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7305714850701998, + "step": 7389, + "train/total_loss": 0.0940125435590744 + }, + { + "entropy": 8.533639907836914, + "epoch": 0.7306703579197152, + "mean_token_accuracy": 0.6977567672729492, + "num_tokens": 17649324.0, + "step": 7390, + "train/ce_loss": 1.4970142841339111 + }, + { + "epoch": 0.7306703579197152, + "step": 7390, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7306703579197152, + "step": 7390, + "train/total_loss": 0.18095143139362335 + }, + { + "entropy": 8.730866432189941, + "epoch": 0.7307692307692307, + "mean_token_accuracy": 0.78311687707901, + "num_tokens": 17654555.0, + "step": 7391, + "train/ce_loss": 0.7643266916275024 + }, + { + "epoch": 0.7307692307692307, + "step": 7391, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7307692307692307, + "step": 7391, + "train/total_loss": 0.12721392512321472 + }, + { + "entropy": 8.391874313354492, + "epoch": 0.7308681036187463, + "mean_token_accuracy": 0.7694672346115112, + "num_tokens": 17660014.0, + "step": 7392, + "train/ce_loss": 0.7122501134872437 + }, + { + "epoch": 0.7308681036187463, + "step": 7392, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7308681036187463, + "step": 7392, + "train/total_loss": 0.09075625985860825 + }, + { + "entropy": 8.80854320526123, + "epoch": 0.7309669764682618, + "mean_token_accuracy": 0.7176287174224854, + "num_tokens": 17665096.0, + "step": 7393, + "train/ce_loss": 2.03812837600708 + }, + { + "epoch": 0.7309669764682618, + "step": 7393, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.7309669764682618, + "step": 7393, + "train/total_loss": 0.289750337600708 + }, + { + "entropy": 8.550348281860352, + "epoch": 0.7310658493177773, + "mean_token_accuracy": 0.718826413154602, + "num_tokens": 17670382.0, + "step": 7394, + "train/ce_loss": 0.45230749249458313 + }, + { + "epoch": 0.7310658493177773, + "step": 7394, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7310658493177773, + "step": 7394, + "train/total_loss": 0.09210574626922607 + }, + { + "entropy": 8.48996353149414, + "epoch": 0.7311647221672929, + "mean_token_accuracy": 0.7989795804023743, + "num_tokens": 17675835.0, + "step": 7395, + "train/ce_loss": 0.4476233720779419 + }, + { + "epoch": 0.7311647221672929, + "step": 7395, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7311647221672929, + "step": 7395, + "train/total_loss": 0.07210609316825867 + }, + { + "entropy": 8.452526092529297, + "epoch": 0.7312635950168084, + "mean_token_accuracy": 0.7200854420661926, + "num_tokens": 17681280.0, + "step": 7396, + "train/ce_loss": 0.8974115252494812 + }, + { + "epoch": 0.7312635950168084, + "step": 7396, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7312635950168084, + "step": 7396, + "train/total_loss": 0.16005365550518036 + }, + { + "entropy": 8.42809009552002, + "epoch": 0.7313624678663239, + "mean_token_accuracy": 0.7561235427856445, + "num_tokens": 17686692.0, + "step": 7397, + "train/ce_loss": 0.931861162185669 + }, + { + "epoch": 0.7313624678663239, + "step": 7397, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.7313624678663239, + "step": 7397, + "train/total_loss": 0.17521736025810242 + }, + { + "entropy": 8.542633056640625, + "epoch": 0.7314613407158395, + "mean_token_accuracy": 0.746835470199585, + "num_tokens": 17691887.0, + "step": 7398, + "train/ce_loss": 1.00189208984375 + }, + { + "epoch": 0.7314613407158395, + "step": 7398, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7314613407158395, + "step": 7398, + "train/total_loss": 0.150970458984375 + }, + { + "entropy": 8.98475170135498, + "epoch": 0.7315602135653549, + "mean_token_accuracy": 0.7868338823318481, + "num_tokens": 17696949.0, + "step": 7399, + "train/ce_loss": 3.0302765026135603e-06 + }, + { + "epoch": 0.7315602135653549, + "step": 7399, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7315602135653549, + "step": 7399, + "train/total_loss": 0.019531553611159325 + }, + { + "epoch": 0.7316590864148704, + "grad_norm": 0.5891684889793396, + "learning_rate": 8.173119715175791e-06, + "loss": 0.1299, + "step": 7400 + }, + { + "entropy": 8.926944732666016, + "epoch": 0.7316590864148704, + "mean_token_accuracy": 0.711240291595459, + "num_tokens": 17701897.0, + "step": 7400, + "train/ce_loss": 1.4225362539291382 + }, + { + "epoch": 0.7316590864148704, + "step": 7400, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.7316590864148704, + "step": 7400, + "train/total_loss": 0.20865987241268158 + }, + { + "entropy": 8.862375259399414, + "epoch": 0.731757959264386, + "mean_token_accuracy": 0.7735247015953064, + "num_tokens": 17707019.0, + "step": 7401, + "train/ce_loss": 0.6153345704078674 + }, + { + "epoch": 0.731757959264386, + "step": 7401, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.731757959264386, + "step": 7401, + "train/total_loss": 0.15528345108032227 + }, + { + "entropy": 8.799311637878418, + "epoch": 0.7318568321139015, + "mean_token_accuracy": 0.6855955719947815, + "num_tokens": 17712364.0, + "step": 7402, + "train/ce_loss": 1.2858757972717285 + }, + { + "epoch": 0.7318568321139015, + "step": 7402, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.7318568321139015, + "step": 7402, + "train/total_loss": 0.24186883866786957 + }, + { + "entropy": 8.513522148132324, + "epoch": 0.731955704963417, + "mean_token_accuracy": 0.7071239948272705, + "num_tokens": 17717553.0, + "step": 7403, + "train/ce_loss": 0.8545153737068176 + }, + { + "epoch": 0.731955704963417, + "step": 7403, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.731955704963417, + "step": 7403, + "train/total_loss": 0.10888903588056564 + }, + { + "entropy": 8.651004791259766, + "epoch": 0.7320545778129326, + "mean_token_accuracy": 0.8285714387893677, + "num_tokens": 17722810.0, + "step": 7404, + "train/ce_loss": 0.6369662880897522 + }, + { + "epoch": 0.7320545778129326, + "step": 7404, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7320545778129326, + "step": 7404, + "train/total_loss": 0.07932163029909134 + }, + { + "entropy": 8.672464370727539, + "epoch": 0.7321534506624481, + "mean_token_accuracy": 0.736774206161499, + "num_tokens": 17728047.0, + "step": 7405, + "train/ce_loss": 0.9165966510772705 + }, + { + "epoch": 0.7321534506624481, + "step": 7405, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7321534506624481, + "step": 7405, + "train/total_loss": 0.11509716510772705 + }, + { + "entropy": 8.93012523651123, + "epoch": 0.7322523235119636, + "mean_token_accuracy": 0.6988950371742249, + "num_tokens": 17733210.0, + "step": 7406, + "train/ce_loss": 0.8794297575950623 + }, + { + "epoch": 0.7322523235119636, + "step": 7406, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.7322523235119636, + "step": 7406, + "train/total_loss": 0.15434923768043518 + }, + { + "entropy": 8.988134384155273, + "epoch": 0.7323511963614792, + "mean_token_accuracy": 0.6939501762390137, + "num_tokens": 17738243.0, + "step": 7407, + "train/ce_loss": 1.4761024713516235 + }, + { + "epoch": 0.7323511963614792, + "step": 7407, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.7323511963614792, + "step": 7407, + "train/total_loss": 0.22182899713516235 + }, + { + "entropy": 8.872218132019043, + "epoch": 0.7324500692109946, + "mean_token_accuracy": 0.715179979801178, + "num_tokens": 17743510.0, + "step": 7408, + "train/ce_loss": 1.2351499795913696 + }, + { + "epoch": 0.7324500692109946, + "step": 7408, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.7324500692109946, + "step": 7408, + "train/total_loss": 0.20554625988006592 + }, + { + "entropy": 8.71223258972168, + "epoch": 0.7325489420605101, + "mean_token_accuracy": 0.7410179376602173, + "num_tokens": 17748656.0, + "step": 7409, + "train/ce_loss": 0.929199755191803 + }, + { + "epoch": 0.7325489420605101, + "step": 7409, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7325489420605101, + "step": 7409, + "train/total_loss": 0.1163574755191803 + }, + { + "entropy": 8.58144760131836, + "epoch": 0.7326478149100257, + "mean_token_accuracy": 0.7557160258293152, + "num_tokens": 17753983.0, + "step": 7410, + "train/ce_loss": 0.9028260111808777 + }, + { + "epoch": 0.7326478149100257, + "step": 7410, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7326478149100257, + "step": 7410, + "train/total_loss": 0.14497010409832 + }, + { + "entropy": 8.340339660644531, + "epoch": 0.7327466877595412, + "mean_token_accuracy": 0.7973421812057495, + "num_tokens": 17759376.0, + "step": 7411, + "train/ce_loss": 0.46473783254623413 + }, + { + "epoch": 0.7327466877595412, + "step": 7411, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7327466877595412, + "step": 7411, + "train/total_loss": 0.07381753623485565 + }, + { + "entropy": 8.653460502624512, + "epoch": 0.7328455606090567, + "mean_token_accuracy": 0.6598424911499023, + "num_tokens": 17764447.0, + "step": 7412, + "train/ce_loss": 2.751971483230591 + }, + { + "epoch": 0.7328455606090567, + "step": 7412, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7328455606090567, + "step": 7412, + "train/total_loss": 0.3337908983230591 + }, + { + "entropy": 8.45848274230957, + "epoch": 0.7329444334585723, + "mean_token_accuracy": 0.7096070051193237, + "num_tokens": 17769851.0, + "step": 7413, + "train/ce_loss": 0.9773561358451843 + }, + { + "epoch": 0.7329444334585723, + "step": 7413, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.7329444334585723, + "step": 7413, + "train/total_loss": 0.19148561358451843 + }, + { + "entropy": 8.733375549316406, + "epoch": 0.7330433063080878, + "mean_token_accuracy": 0.8272109031677246, + "num_tokens": 17775073.0, + "step": 7414, + "train/ce_loss": 0.7675660252571106 + }, + { + "epoch": 0.7330433063080878, + "step": 7414, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7330433063080878, + "step": 7414, + "train/total_loss": 0.09238160401582718 + }, + { + "entropy": 8.829120635986328, + "epoch": 0.7331421791576033, + "mean_token_accuracy": 0.7294617295265198, + "num_tokens": 17780221.0, + "step": 7415, + "train/ce_loss": 0.580898106098175 + }, + { + "epoch": 0.7331421791576033, + "step": 7415, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7331421791576033, + "step": 7415, + "train/total_loss": 0.12058980762958527 + }, + { + "entropy": 8.948328018188477, + "epoch": 0.7332410520071189, + "mean_token_accuracy": 0.7348703145980835, + "num_tokens": 17785351.0, + "step": 7416, + "train/ce_loss": 1.471127986907959 + }, + { + "epoch": 0.7332410520071189, + "step": 7416, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7332410520071189, + "step": 7416, + "train/total_loss": 0.17836280167102814 + }, + { + "entropy": 9.177257537841797, + "epoch": 0.7333399248566344, + "mean_token_accuracy": 0.7938596606254578, + "num_tokens": 17790211.0, + "step": 7417, + "train/ce_loss": 3.0804694688413292e-06 + }, + { + "epoch": 0.7333399248566344, + "step": 7417, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7333399248566344, + "step": 7417, + "train/total_loss": 0.05468780919909477 + }, + { + "entropy": 8.93316650390625, + "epoch": 0.7334387977061498, + "mean_token_accuracy": 0.6968085169792175, + "num_tokens": 17795436.0, + "step": 7418, + "train/ce_loss": 0.7184810042381287 + }, + { + "epoch": 0.7334387977061498, + "step": 7418, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7334387977061498, + "step": 7418, + "train/total_loss": 0.09137935191392899 + }, + { + "entropy": 8.684490203857422, + "epoch": 0.7335376705556654, + "mean_token_accuracy": 0.7452229261398315, + "num_tokens": 17800717.0, + "step": 7419, + "train/ce_loss": 0.981926679611206 + }, + { + "epoch": 0.7335376705556654, + "step": 7419, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7335376705556654, + "step": 7419, + "train/total_loss": 0.13334891200065613 + }, + { + "epoch": 0.7336365434051809, + "grad_norm": 0.6303120851516724, + "learning_rate": 8.168174850417841e-06, + "loss": 0.1366, + "step": 7420 + }, + { + "entropy": 8.528745651245117, + "epoch": 0.7336365434051809, + "mean_token_accuracy": 0.7373613119125366, + "num_tokens": 17806026.0, + "step": 7420, + "train/ce_loss": 1.3812599182128906 + }, + { + "epoch": 0.7336365434051809, + "step": 7420, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.7336365434051809, + "step": 7420, + "train/total_loss": 0.259219765663147 + }, + { + "entropy": 8.751842498779297, + "epoch": 0.7337354162546965, + "mean_token_accuracy": 0.8198433518409729, + "num_tokens": 17811277.0, + "step": 7421, + "train/ce_loss": 0.6977095007896423 + }, + { + "epoch": 0.7337354162546965, + "step": 7421, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7337354162546965, + "step": 7421, + "train/total_loss": 0.1322709619998932 + }, + { + "entropy": 8.914030075073242, + "epoch": 0.733834289104212, + "mean_token_accuracy": 0.7413073778152466, + "num_tokens": 17816436.0, + "step": 7422, + "train/ce_loss": 0.6798083186149597 + }, + { + "epoch": 0.733834289104212, + "step": 7422, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.733834289104212, + "step": 7422, + "train/total_loss": 0.1500120759010315 + }, + { + "entropy": 8.907278060913086, + "epoch": 0.7339331619537275, + "mean_token_accuracy": 0.7296072244644165, + "num_tokens": 17821531.0, + "step": 7423, + "train/ce_loss": 1.2015002965927124 + }, + { + "epoch": 0.7339331619537275, + "step": 7423, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7339331619537275, + "step": 7423, + "train/total_loss": 0.15530627965927124 + }, + { + "entropy": 8.852396011352539, + "epoch": 0.7340320348032431, + "mean_token_accuracy": 0.7387606501579285, + "num_tokens": 17826820.0, + "step": 7424, + "train/ce_loss": 0.818252444267273 + }, + { + "epoch": 0.7340320348032431, + "step": 7424, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.7340320348032431, + "step": 7424, + "train/total_loss": 0.17166900634765625 + }, + { + "entropy": 8.934038162231445, + "epoch": 0.7341309076527586, + "mean_token_accuracy": 0.6975398063659668, + "num_tokens": 17831949.0, + "step": 7425, + "train/ce_loss": 2.117769718170166 + }, + { + "epoch": 0.7341309076527586, + "step": 7425, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.7341309076527586, + "step": 7425, + "train/total_loss": 0.3289644718170166 + }, + { + "entropy": 8.69120979309082, + "epoch": 0.734229780502274, + "mean_token_accuracy": 0.7587500214576721, + "num_tokens": 17837188.0, + "step": 7426, + "train/ce_loss": 0.7669753432273865 + }, + { + "epoch": 0.734229780502274, + "step": 7426, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.734229780502274, + "step": 7426, + "train/total_loss": 0.11966628581285477 + }, + { + "entropy": 8.386311531066895, + "epoch": 0.7343286533517897, + "mean_token_accuracy": 0.7668508291244507, + "num_tokens": 17842613.0, + "step": 7427, + "train/ce_loss": 0.4831239581108093 + }, + { + "epoch": 0.7343286533517897, + "step": 7427, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7343286533517897, + "step": 7427, + "train/total_loss": 0.06393739581108093 + }, + { + "entropy": 8.612841606140137, + "epoch": 0.7344275262013051, + "mean_token_accuracy": 0.7655259966850281, + "num_tokens": 17847857.0, + "step": 7428, + "train/ce_loss": 0.6544772982597351 + }, + { + "epoch": 0.7344275262013051, + "step": 7428, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7344275262013051, + "step": 7428, + "train/total_loss": 0.11622898280620575 + }, + { + "entropy": 8.549814224243164, + "epoch": 0.7345263990508206, + "mean_token_accuracy": 0.7464324831962585, + "num_tokens": 17853239.0, + "step": 7429, + "train/ce_loss": 0.5706326365470886 + }, + { + "epoch": 0.7345263990508206, + "step": 7429, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7345263990508206, + "step": 7429, + "train/total_loss": 0.0844070166349411 + }, + { + "entropy": 8.911842346191406, + "epoch": 0.7346252719003362, + "mean_token_accuracy": 0.7272727489471436, + "num_tokens": 17858466.0, + "step": 7430, + "train/ce_loss": 0.8218401670455933 + }, + { + "epoch": 0.7346252719003362, + "step": 7430, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7346252719003362, + "step": 7430, + "train/total_loss": 0.12905901670455933 + }, + { + "entropy": 8.468302726745605, + "epoch": 0.7347241447498517, + "mean_token_accuracy": 0.7347826361656189, + "num_tokens": 17863842.0, + "step": 7431, + "train/ce_loss": 0.739936113357544 + }, + { + "epoch": 0.7347241447498517, + "step": 7431, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.7347241447498517, + "step": 7431, + "train/total_loss": 0.14039987325668335 + }, + { + "entropy": 8.911083221435547, + "epoch": 0.7348230175993672, + "mean_token_accuracy": 0.741428554058075, + "num_tokens": 17868967.0, + "step": 7432, + "train/ce_loss": 1.0359693765640259 + }, + { + "epoch": 0.7348230175993672, + "step": 7432, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.7348230175993672, + "step": 7432, + "train/total_loss": 0.18562819063663483 + }, + { + "entropy": 8.585333824157715, + "epoch": 0.7349218904488828, + "mean_token_accuracy": 0.7360946536064148, + "num_tokens": 17874280.0, + "step": 7433, + "train/ce_loss": 0.9600635170936584 + }, + { + "epoch": 0.7349218904488828, + "step": 7433, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.7349218904488828, + "step": 7433, + "train/total_loss": 0.1819438636302948 + }, + { + "entropy": 8.341687202453613, + "epoch": 0.7350207632983983, + "mean_token_accuracy": 0.7622789740562439, + "num_tokens": 17879742.0, + "step": 7434, + "train/ce_loss": 0.5410976409912109 + }, + { + "epoch": 0.7350207632983983, + "step": 7434, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7350207632983983, + "step": 7434, + "train/total_loss": 0.09707851707935333 + }, + { + "entropy": 8.966690063476562, + "epoch": 0.7351196361479138, + "mean_token_accuracy": 0.7216890454292297, + "num_tokens": 17884712.0, + "step": 7435, + "train/ce_loss": 1.1198730135220103e-05 + }, + { + "epoch": 0.7351196361479138, + "step": 7435, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7351196361479138, + "step": 7435, + "train/total_loss": 0.05859487131237984 + }, + { + "entropy": 9.215240478515625, + "epoch": 0.7352185089974294, + "mean_token_accuracy": 0.7508361339569092, + "num_tokens": 17889779.0, + "step": 7436, + "train/ce_loss": 2.024491550400853e-06 + }, + { + "epoch": 0.7352185089974294, + "step": 7436, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7352185089974294, + "step": 7436, + "train/total_loss": 0.06250020116567612 + }, + { + "entropy": 8.692525863647461, + "epoch": 0.7353173818469448, + "mean_token_accuracy": 0.7033374309539795, + "num_tokens": 17895091.0, + "step": 7437, + "train/ce_loss": 1.055518627166748 + }, + { + "epoch": 0.7353173818469448, + "step": 7437, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7353173818469448, + "step": 7437, + "train/total_loss": 0.16023936867713928 + }, + { + "entropy": 8.578010559082031, + "epoch": 0.7354162546964603, + "mean_token_accuracy": 0.7098501324653625, + "num_tokens": 17900483.0, + "step": 7438, + "train/ce_loss": 0.8790786266326904 + }, + { + "epoch": 0.7354162546964603, + "step": 7438, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7354162546964603, + "step": 7438, + "train/total_loss": 0.13478286564350128 + }, + { + "entropy": 8.591001510620117, + "epoch": 0.7355151275459759, + "mean_token_accuracy": 0.7846952080726624, + "num_tokens": 17905716.0, + "step": 7439, + "train/ce_loss": 1.0676848888397217 + }, + { + "epoch": 0.7355151275459759, + "step": 7439, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.7355151275459759, + "step": 7439, + "train/total_loss": 0.20442473888397217 + }, + { + "epoch": 0.7356140003954914, + "grad_norm": 0.6831417083740234, + "learning_rate": 8.163229985659894e-06, + "loss": 0.1384, + "step": 7440 + }, + { + "entropy": 8.783339500427246, + "epoch": 0.7356140003954914, + "mean_token_accuracy": 0.6866666674613953, + "num_tokens": 17910920.0, + "step": 7440, + "train/ce_loss": 1.181344747543335 + }, + { + "epoch": 0.7356140003954914, + "step": 7440, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7356140003954914, + "step": 7440, + "train/total_loss": 0.16891571879386902 + }, + { + "entropy": 8.621808052062988, + "epoch": 0.7357128732450069, + "mean_token_accuracy": 0.747474730014801, + "num_tokens": 17916289.0, + "step": 7441, + "train/ce_loss": 1.483172059059143 + }, + { + "epoch": 0.7357128732450069, + "step": 7441, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7357128732450069, + "step": 7441, + "train/total_loss": 0.19519220292568207 + }, + { + "entropy": 8.434520721435547, + "epoch": 0.7358117460945225, + "mean_token_accuracy": 0.740618109703064, + "num_tokens": 17921665.0, + "step": 7442, + "train/ce_loss": 0.37312033772468567 + }, + { + "epoch": 0.7358117460945225, + "step": 7442, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7358117460945225, + "step": 7442, + "train/total_loss": 0.07637453079223633 + }, + { + "entropy": 8.742439270019531, + "epoch": 0.735910618944038, + "mean_token_accuracy": 0.7351274490356445, + "num_tokens": 17926775.0, + "step": 7443, + "train/ce_loss": 1.4379651546478271 + }, + { + "epoch": 0.735910618944038, + "step": 7443, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.735910618944038, + "step": 7443, + "train/total_loss": 0.28442150354385376 + }, + { + "entropy": 8.910379409790039, + "epoch": 0.7360094917935535, + "mean_token_accuracy": 0.75, + "num_tokens": 17931831.0, + "step": 7444, + "train/ce_loss": 2.8590995952981757e-06 + }, + { + "epoch": 0.7360094917935535, + "step": 7444, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7360094917935535, + "step": 7444, + "train/total_loss": 0.023437784984707832 + }, + { + "entropy": 8.449346542358398, + "epoch": 0.7361083646430691, + "mean_token_accuracy": 0.7605473399162292, + "num_tokens": 17937201.0, + "step": 7445, + "train/ce_loss": 0.5927489399909973 + }, + { + "epoch": 0.7361083646430691, + "step": 7445, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.7361083646430691, + "step": 7445, + "train/total_loss": 0.12568114697933197 + }, + { + "entropy": 8.496419906616211, + "epoch": 0.7362072374925845, + "mean_token_accuracy": 0.7832699418067932, + "num_tokens": 17942474.0, + "step": 7446, + "train/ce_loss": 0.5286972522735596 + }, + { + "epoch": 0.7362072374925845, + "step": 7446, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7362072374925845, + "step": 7446, + "train/total_loss": 0.09583847224712372 + }, + { + "entropy": 9.497404098510742, + "epoch": 0.7363061103421, + "mean_token_accuracy": 0.7071583271026611, + "num_tokens": 17947345.0, + "step": 7447, + "train/ce_loss": 1.1761665344238281 + }, + { + "epoch": 0.7363061103421, + "step": 7447, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7363061103421, + "step": 7447, + "train/total_loss": 0.1371479034423828 + }, + { + "entropy": 8.603475570678711, + "epoch": 0.7364049831916156, + "mean_token_accuracy": 0.7022988796234131, + "num_tokens": 17952687.0, + "step": 7448, + "train/ce_loss": 0.9062698483467102 + }, + { + "epoch": 0.7364049831916156, + "step": 7448, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7364049831916156, + "step": 7448, + "train/total_loss": 0.11797073483467102 + }, + { + "entropy": 9.307031631469727, + "epoch": 0.7365038560411311, + "mean_token_accuracy": 0.7286324501037598, + "num_tokens": 17957579.0, + "step": 7449, + "train/ce_loss": 1.983763337135315 + }, + { + "epoch": 0.7365038560411311, + "step": 7449, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.7365038560411311, + "step": 7449, + "train/total_loss": 0.2725951075553894 + }, + { + "entropy": 8.819190979003906, + "epoch": 0.7366027288906466, + "mean_token_accuracy": 0.7057926654815674, + "num_tokens": 17962639.0, + "step": 7450, + "train/ce_loss": 1.0233535766601562 + }, + { + "epoch": 0.7366027288906466, + "step": 7450, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.7366027288906466, + "step": 7450, + "train/total_loss": 0.1765541136264801 + }, + { + "entropy": 8.637828826904297, + "epoch": 0.7367016017401622, + "mean_token_accuracy": 0.7119901180267334, + "num_tokens": 17967934.0, + "step": 7451, + "train/ce_loss": 1.9882410764694214 + }, + { + "epoch": 0.7367016017401622, + "step": 7451, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7367016017401622, + "step": 7451, + "train/total_loss": 0.24960535764694214 + }, + { + "entropy": 8.599782943725586, + "epoch": 0.7368004745896777, + "mean_token_accuracy": 0.7238442897796631, + "num_tokens": 17973243.0, + "step": 7452, + "train/ce_loss": 0.9183986186981201 + }, + { + "epoch": 0.7368004745896777, + "step": 7452, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.7368004745896777, + "step": 7452, + "train/total_loss": 0.17777736485004425 + }, + { + "entropy": 8.703730583190918, + "epoch": 0.7368993474391932, + "mean_token_accuracy": 0.7442660331726074, + "num_tokens": 17978597.0, + "step": 7453, + "train/ce_loss": 0.710777759552002 + }, + { + "epoch": 0.7368993474391932, + "step": 7453, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.7368993474391932, + "step": 7453, + "train/total_loss": 0.16873402893543243 + }, + { + "entropy": 8.349955558776855, + "epoch": 0.7369982202887088, + "mean_token_accuracy": 0.7752193212509155, + "num_tokens": 17983918.0, + "step": 7454, + "train/ce_loss": 0.5250358581542969 + }, + { + "epoch": 0.7369982202887088, + "step": 7454, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7369982202887088, + "step": 7454, + "train/total_loss": 0.09547233581542969 + }, + { + "entropy": 9.179643630981445, + "epoch": 0.7370970931382242, + "mean_token_accuracy": 0.7523629665374756, + "num_tokens": 17988831.0, + "step": 7455, + "train/ce_loss": 1.054730772972107 + }, + { + "epoch": 0.7370970931382242, + "step": 7455, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7370970931382242, + "step": 7455, + "train/total_loss": 0.12891057133674622 + }, + { + "entropy": 8.195453643798828, + "epoch": 0.7371959659877397, + "mean_token_accuracy": 0.7772685885429382, + "num_tokens": 17994422.0, + "step": 7456, + "train/ce_loss": 0.589515209197998 + }, + { + "epoch": 0.7371959659877397, + "step": 7456, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7371959659877397, + "step": 7456, + "train/total_loss": 0.12145152688026428 + }, + { + "entropy": 8.321382522583008, + "epoch": 0.7372948388372553, + "mean_token_accuracy": 0.7400000095367432, + "num_tokens": 17999746.0, + "step": 7457, + "train/ce_loss": 1.3899191617965698 + }, + { + "epoch": 0.7372948388372553, + "step": 7457, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.7372948388372553, + "step": 7457, + "train/total_loss": 0.22102317214012146 + }, + { + "entropy": 8.427468299865723, + "epoch": 0.7373937116867708, + "mean_token_accuracy": 0.7523609399795532, + "num_tokens": 18005183.0, + "step": 7458, + "train/ce_loss": 0.47106844186782837 + }, + { + "epoch": 0.7373937116867708, + "step": 7458, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7373937116867708, + "step": 7458, + "train/total_loss": 0.06273184716701508 + }, + { + "entropy": 8.995831489562988, + "epoch": 0.7374925845362863, + "mean_token_accuracy": 0.686821699142456, + "num_tokens": 18010282.0, + "step": 7459, + "train/ce_loss": 1.6602399349212646 + }, + { + "epoch": 0.7374925845362863, + "step": 7459, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.7374925845362863, + "step": 7459, + "train/total_loss": 0.30664899945259094 + }, + { + "epoch": 0.7375914573858019, + "grad_norm": 0.7434929609298706, + "learning_rate": 8.158285120901944e-06, + "loss": 0.1457, + "step": 7460 + }, + { + "entropy": 8.676469802856445, + "epoch": 0.7375914573858019, + "mean_token_accuracy": 0.7600595951080322, + "num_tokens": 18015401.0, + "step": 7460, + "train/ce_loss": 0.8739643096923828 + }, + { + "epoch": 0.7375914573858019, + "step": 7460, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7375914573858019, + "step": 7460, + "train/total_loss": 0.14208394289016724 + }, + { + "entropy": 8.663125038146973, + "epoch": 0.7376903302353174, + "mean_token_accuracy": 0.7645390033721924, + "num_tokens": 18020579.0, + "step": 7461, + "train/ce_loss": 1.3094218969345093 + }, + { + "epoch": 0.7376903302353174, + "step": 7461, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7376903302353174, + "step": 7461, + "train/total_loss": 0.1700046956539154 + }, + { + "entropy": 8.195062637329102, + "epoch": 0.7377892030848329, + "mean_token_accuracy": 0.7237623929977417, + "num_tokens": 18026087.0, + "step": 7462, + "train/ce_loss": 1.2017582654953003 + }, + { + "epoch": 0.7377892030848329, + "step": 7462, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7377892030848329, + "step": 7462, + "train/total_loss": 0.15923833847045898 + }, + { + "entropy": 8.615524291992188, + "epoch": 0.7378880759343485, + "mean_token_accuracy": 0.7533265352249146, + "num_tokens": 18031543.0, + "step": 7463, + "train/ce_loss": 0.5794716477394104 + }, + { + "epoch": 0.7378880759343485, + "step": 7463, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7378880759343485, + "step": 7463, + "train/total_loss": 0.09310341626405716 + }, + { + "entropy": 8.52651309967041, + "epoch": 0.737986948783864, + "mean_token_accuracy": 0.7455782294273376, + "num_tokens": 18036708.0, + "step": 7464, + "train/ce_loss": 0.650346040725708 + }, + { + "epoch": 0.737986948783864, + "step": 7464, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.737986948783864, + "step": 7464, + "train/total_loss": 0.10019085556268692 + }, + { + "entropy": 9.093423843383789, + "epoch": 0.7380858216333794, + "mean_token_accuracy": 0.6904761791229248, + "num_tokens": 18041777.0, + "step": 7465, + "train/ce_loss": 0.5290768146514893 + }, + { + "epoch": 0.7380858216333794, + "step": 7465, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7380858216333794, + "step": 7465, + "train/total_loss": 0.08806393295526505 + }, + { + "entropy": 8.408145904541016, + "epoch": 0.738184694482895, + "mean_token_accuracy": 0.7427341341972351, + "num_tokens": 18047248.0, + "step": 7466, + "train/ce_loss": 1.373179316520691 + }, + { + "epoch": 0.738184694482895, + "step": 7466, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.738184694482895, + "step": 7466, + "train/total_loss": 0.266224205493927 + }, + { + "entropy": 9.112894058227539, + "epoch": 0.7382835673324105, + "mean_token_accuracy": 0.7475149035453796, + "num_tokens": 18052163.0, + "step": 7467, + "train/ce_loss": 1.8712252378463745 + }, + { + "epoch": 0.7382835673324105, + "step": 7467, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7382835673324105, + "step": 7467, + "train/total_loss": 0.23790377378463745 + }, + { + "entropy": 8.533920288085938, + "epoch": 0.738382440181926, + "mean_token_accuracy": 0.7494252920150757, + "num_tokens": 18057560.0, + "step": 7468, + "train/ce_loss": 0.5045980215072632 + }, + { + "epoch": 0.738382440181926, + "step": 7468, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.738382440181926, + "step": 7468, + "train/total_loss": 0.14030355215072632 + }, + { + "entropy": 8.695446968078613, + "epoch": 0.7384813130314416, + "mean_token_accuracy": 0.7431421279907227, + "num_tokens": 18062915.0, + "step": 7469, + "train/ce_loss": 1.0216988325119019 + }, + { + "epoch": 0.7384813130314416, + "step": 7469, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.7384813130314416, + "step": 7469, + "train/total_loss": 0.17638863623142242 + }, + { + "entropy": 8.989627838134766, + "epoch": 0.7385801858809571, + "mean_token_accuracy": 0.7777777910232544, + "num_tokens": 18068143.0, + "step": 7470, + "train/ce_loss": 1.1923831701278687 + }, + { + "epoch": 0.7385801858809571, + "step": 7470, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.7385801858809571, + "step": 7470, + "train/total_loss": 0.19345706701278687 + }, + { + "entropy": 9.072559356689453, + "epoch": 0.7386790587304726, + "mean_token_accuracy": 0.7285463809967041, + "num_tokens": 18073159.0, + "step": 7471, + "train/ce_loss": 4.317763796279905e-06 + }, + { + "epoch": 0.7386790587304726, + "step": 7471, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7386790587304726, + "step": 7471, + "train/total_loss": 0.04296918213367462 + }, + { + "entropy": 8.951403617858887, + "epoch": 0.7387779315799882, + "mean_token_accuracy": 0.7418397665023804, + "num_tokens": 18078321.0, + "step": 7472, + "train/ce_loss": 1.0193686485290527 + }, + { + "epoch": 0.7387779315799882, + "step": 7472, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.7387779315799882, + "step": 7472, + "train/total_loss": 0.18006187677383423 + }, + { + "entropy": 8.52254867553711, + "epoch": 0.7388768044295037, + "mean_token_accuracy": 0.7313883304595947, + "num_tokens": 18083782.0, + "step": 7473, + "train/ce_loss": 0.5721186399459839 + }, + { + "epoch": 0.7388768044295037, + "step": 7473, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7388768044295037, + "step": 7473, + "train/total_loss": 0.08455561101436615 + }, + { + "entropy": 8.624273300170898, + "epoch": 0.7389756772790191, + "mean_token_accuracy": 0.7298387289047241, + "num_tokens": 18088950.0, + "step": 7474, + "train/ce_loss": 0.4974902868270874 + }, + { + "epoch": 0.7389756772790191, + "step": 7474, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7389756772790191, + "step": 7474, + "train/total_loss": 0.08881153166294098 + }, + { + "entropy": 8.725934982299805, + "epoch": 0.7390745501285347, + "mean_token_accuracy": 0.7341115474700928, + "num_tokens": 18094363.0, + "step": 7475, + "train/ce_loss": 0.9648977518081665 + }, + { + "epoch": 0.7390745501285347, + "step": 7475, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.7390745501285347, + "step": 7475, + "train/total_loss": 0.1746147871017456 + }, + { + "entropy": 8.515832901000977, + "epoch": 0.7391734229780502, + "mean_token_accuracy": 0.75, + "num_tokens": 18099660.0, + "step": 7476, + "train/ce_loss": 0.5893865823745728 + }, + { + "epoch": 0.7391734229780502, + "step": 7476, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7391734229780502, + "step": 7476, + "train/total_loss": 0.1175324097275734 + }, + { + "entropy": 8.935949325561523, + "epoch": 0.7392722958275657, + "mean_token_accuracy": 0.7350427508354187, + "num_tokens": 18104844.0, + "step": 7477, + "train/ce_loss": 1.5920840041871998e-06 + }, + { + "epoch": 0.7392722958275657, + "step": 7477, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7392722958275657, + "step": 7477, + "train/total_loss": 0.031250160187482834 + }, + { + "entropy": 8.676297187805176, + "epoch": 0.7393711686770813, + "mean_token_accuracy": 0.7168674468994141, + "num_tokens": 18109939.0, + "step": 7478, + "train/ce_loss": 1.316830039024353 + }, + { + "epoch": 0.7393711686770813, + "step": 7478, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7393711686770813, + "step": 7478, + "train/total_loss": 0.19418300688266754 + }, + { + "entropy": 8.472021102905273, + "epoch": 0.7394700415265968, + "mean_token_accuracy": 0.7288135886192322, + "num_tokens": 18115328.0, + "step": 7479, + "train/ce_loss": 0.778196394443512 + }, + { + "epoch": 0.7394700415265968, + "step": 7479, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7394700415265968, + "step": 7479, + "train/total_loss": 0.10125713795423508 + }, + { + "epoch": 0.7395689143761123, + "grad_norm": 0.6186052560806274, + "learning_rate": 8.153340256143996e-06, + "loss": 0.1388, + "step": 7480 + }, + { + "entropy": 8.667085647583008, + "epoch": 0.7395689143761123, + "mean_token_accuracy": 0.7706093192100525, + "num_tokens": 18120639.0, + "step": 7480, + "train/ce_loss": 0.9406066536903381 + }, + { + "epoch": 0.7395689143761123, + "step": 7480, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7395689143761123, + "step": 7480, + "train/total_loss": 0.14874815940856934 + }, + { + "entropy": 8.820192337036133, + "epoch": 0.7396677872256279, + "mean_token_accuracy": 0.7028493881225586, + "num_tokens": 18125825.0, + "step": 7481, + "train/ce_loss": 1.157432507170597e-06 + }, + { + "epoch": 0.7396677872256279, + "step": 7481, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7396677872256279, + "step": 7481, + "train/total_loss": 0.03906261548399925 + }, + { + "entropy": 8.352201461791992, + "epoch": 0.7397666600751434, + "mean_token_accuracy": 0.7907253503799438, + "num_tokens": 18131117.0, + "step": 7482, + "train/ce_loss": 0.5861807465553284 + }, + { + "epoch": 0.7397666600751434, + "step": 7482, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7397666600751434, + "step": 7482, + "train/total_loss": 0.08986807614564896 + }, + { + "entropy": 8.777387619018555, + "epoch": 0.7398655329246588, + "mean_token_accuracy": 0.7474489808082581, + "num_tokens": 18136348.0, + "step": 7483, + "train/ce_loss": 0.7887819409370422 + }, + { + "epoch": 0.7398655329246588, + "step": 7483, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7398655329246588, + "step": 7483, + "train/total_loss": 0.12184694409370422 + }, + { + "entropy": 8.668893814086914, + "epoch": 0.7399644057741744, + "mean_token_accuracy": 0.7419354915618896, + "num_tokens": 18141641.0, + "step": 7484, + "train/ce_loss": 0.732936680316925 + }, + { + "epoch": 0.7399644057741744, + "step": 7484, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7399644057741744, + "step": 7484, + "train/total_loss": 0.14360617101192474 + }, + { + "entropy": 8.698103904724121, + "epoch": 0.7400632786236899, + "mean_token_accuracy": 0.7807351350784302, + "num_tokens": 18146859.0, + "step": 7485, + "train/ce_loss": 0.6472761631011963 + }, + { + "epoch": 0.7400632786236899, + "step": 7485, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7400632786236899, + "step": 7485, + "train/total_loss": 0.08035261929035187 + }, + { + "entropy": 8.32318115234375, + "epoch": 0.7401621514732054, + "mean_token_accuracy": 0.7098692059516907, + "num_tokens": 18152188.0, + "step": 7486, + "train/ce_loss": 0.5486804842948914 + }, + { + "epoch": 0.7401621514732054, + "step": 7486, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7401621514732054, + "step": 7486, + "train/total_loss": 0.10955554991960526 + }, + { + "entropy": 8.622991561889648, + "epoch": 0.740261024322721, + "mean_token_accuracy": 0.7683772444725037, + "num_tokens": 18157390.0, + "step": 7487, + "train/ce_loss": 0.8550941348075867 + }, + { + "epoch": 0.740261024322721, + "step": 7487, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.740261024322721, + "step": 7487, + "train/total_loss": 0.14800941944122314 + }, + { + "entropy": 9.3809814453125, + "epoch": 0.7403598971722365, + "mean_token_accuracy": 0.7139784693717957, + "num_tokens": 18162235.0, + "step": 7488, + "train/ce_loss": 5.4315864872478414e-06 + }, + { + "epoch": 0.7403598971722365, + "step": 7488, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7403598971722365, + "step": 7488, + "train/total_loss": 0.015625543892383575 + }, + { + "entropy": 8.999673843383789, + "epoch": 0.740458770021752, + "mean_token_accuracy": 0.8227091431617737, + "num_tokens": 18167216.0, + "step": 7489, + "train/ce_loss": 3.1229728847392835e-06 + }, + { + "epoch": 0.740458770021752, + "step": 7489, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.740458770021752, + "step": 7489, + "train/total_loss": 0.04296906292438507 + }, + { + "entropy": 9.476018905639648, + "epoch": 0.7405576428712676, + "mean_token_accuracy": 0.7777777910232544, + "num_tokens": 18172017.0, + "step": 7490, + "train/ce_loss": 1.7611380815505981 + }, + { + "epoch": 0.7405576428712676, + "step": 7490, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7405576428712676, + "step": 7490, + "train/total_loss": 0.2386138141155243 + }, + { + "entropy": 8.836692810058594, + "epoch": 0.7406565157207831, + "mean_token_accuracy": 0.7779291272163391, + "num_tokens": 18177190.0, + "step": 7491, + "train/ce_loss": 9.796309541343362e-07 + }, + { + "epoch": 0.7406565157207831, + "step": 7491, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7406565157207831, + "step": 7491, + "train/total_loss": 0.02343759872019291 + }, + { + "entropy": 8.639360427856445, + "epoch": 0.7407553885702985, + "mean_token_accuracy": 0.798751950263977, + "num_tokens": 18182267.0, + "step": 7492, + "train/ce_loss": 0.6451224088668823 + }, + { + "epoch": 0.7407553885702985, + "step": 7492, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7407553885702985, + "step": 7492, + "train/total_loss": 0.08794974535703659 + }, + { + "entropy": 9.31312370300293, + "epoch": 0.7408542614198141, + "mean_token_accuracy": 0.8333333134651184, + "num_tokens": 18187189.0, + "step": 7493, + "train/ce_loss": 1.679405613685958e-06 + }, + { + "epoch": 0.7408542614198141, + "step": 7493, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7408542614198141, + "step": 7493, + "train/total_loss": 0.02343766763806343 + }, + { + "entropy": 8.655989646911621, + "epoch": 0.7409531342693296, + "mean_token_accuracy": 0.7229729890823364, + "num_tokens": 18192370.0, + "step": 7494, + "train/ce_loss": 0.5090431571006775 + }, + { + "epoch": 0.7409531342693296, + "step": 7494, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7409531342693296, + "step": 7494, + "train/total_loss": 0.09777931869029999 + }, + { + "entropy": 8.696633338928223, + "epoch": 0.7410520071188451, + "mean_token_accuracy": 0.7646276354789734, + "num_tokens": 18197603.0, + "step": 7495, + "train/ce_loss": 0.6268374919891357 + }, + { + "epoch": 0.7410520071188451, + "step": 7495, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7410520071188451, + "step": 7495, + "train/total_loss": 0.08221500366926193 + }, + { + "entropy": 8.735063552856445, + "epoch": 0.7411508799683607, + "mean_token_accuracy": 0.7355263233184814, + "num_tokens": 18202861.0, + "step": 7496, + "train/ce_loss": 1.2398719787597656 + }, + { + "epoch": 0.7411508799683607, + "step": 7496, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7411508799683607, + "step": 7496, + "train/total_loss": 0.15523719787597656 + }, + { + "entropy": 8.043102264404297, + "epoch": 0.7412497528178762, + "mean_token_accuracy": 0.6976320743560791, + "num_tokens": 18208422.0, + "step": 7497, + "train/ce_loss": 0.8832078576087952 + }, + { + "epoch": 0.7412497528178762, + "step": 7497, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.7412497528178762, + "step": 7497, + "train/total_loss": 0.166445791721344 + }, + { + "entropy": 8.297633171081543, + "epoch": 0.7413486256673917, + "mean_token_accuracy": 0.770567774772644, + "num_tokens": 18213775.0, + "step": 7498, + "train/ce_loss": 0.9094583988189697 + }, + { + "epoch": 0.7413486256673917, + "step": 7498, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7413486256673917, + "step": 7498, + "train/total_loss": 0.14172708988189697 + }, + { + "entropy": 8.361133575439453, + "epoch": 0.7414474985169073, + "mean_token_accuracy": 0.7287405729293823, + "num_tokens": 18219158.0, + "step": 7499, + "train/ce_loss": 1.0043796300888062 + }, + { + "epoch": 0.7414474985169073, + "step": 7499, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7414474985169073, + "step": 7499, + "train/total_loss": 0.1590317189693451 + }, + { + "epoch": 0.7415463713664228, + "grad_norm": 0.6593493819236755, + "learning_rate": 8.148395391386045e-06, + "loss": 0.1274, + "step": 7500 + }, + { + "entropy": 8.695594787597656, + "epoch": 0.7415463713664228, + "mean_token_accuracy": 0.7369165420532227, + "num_tokens": 18224300.0, + "step": 7500, + "train/ce_loss": 0.4475291073322296 + }, + { + "epoch": 0.7415463713664228, + "step": 7500, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.7415463713664228, + "step": 7500, + "train/total_loss": 0.12678416073322296 + }, + { + "entropy": 8.78538703918457, + "epoch": 0.7416452442159382, + "mean_token_accuracy": 0.711796224117279, + "num_tokens": 18229483.0, + "step": 7501, + "train/ce_loss": 2.1044628620147705 + }, + { + "epoch": 0.7416452442159382, + "step": 7501, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7416452442159382, + "step": 7501, + "train/total_loss": 0.257321298122406 + }, + { + "entropy": 8.628006935119629, + "epoch": 0.7417441170654538, + "mean_token_accuracy": 0.7457386255264282, + "num_tokens": 18234634.0, + "step": 7502, + "train/ce_loss": 0.9490795135498047 + }, + { + "epoch": 0.7417441170654538, + "step": 7502, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7417441170654538, + "step": 7502, + "train/total_loss": 0.1261579543352127 + }, + { + "entropy": 8.950165748596191, + "epoch": 0.7418429899149693, + "mean_token_accuracy": 0.7557471394538879, + "num_tokens": 18239778.0, + "step": 7503, + "train/ce_loss": 1.7057231664657593 + }, + { + "epoch": 0.7418429899149693, + "step": 7503, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7418429899149693, + "step": 7503, + "train/total_loss": 0.22916607558727264 + }, + { + "entropy": 8.387922286987305, + "epoch": 0.7419418627644849, + "mean_token_accuracy": 0.739130437374115, + "num_tokens": 18245170.0, + "step": 7504, + "train/ce_loss": 0.7675881385803223 + }, + { + "epoch": 0.7419418627644849, + "step": 7504, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7419418627644849, + "step": 7504, + "train/total_loss": 0.12363381683826447 + }, + { + "entropy": 8.603219985961914, + "epoch": 0.7420407356140004, + "mean_token_accuracy": 0.6928961873054504, + "num_tokens": 18250566.0, + "step": 7505, + "train/ce_loss": 1.4200619459152222 + }, + { + "epoch": 0.7420407356140004, + "step": 7505, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.7420407356140004, + "step": 7505, + "train/total_loss": 0.22403745353221893 + }, + { + "entropy": 8.27988052368164, + "epoch": 0.7421396084635159, + "mean_token_accuracy": 0.7318918704986572, + "num_tokens": 18255985.0, + "step": 7506, + "train/ce_loss": 0.6595481038093567 + }, + { + "epoch": 0.7421396084635159, + "step": 7506, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7421396084635159, + "step": 7506, + "train/total_loss": 0.08548606187105179 + }, + { + "entropy": 8.62784194946289, + "epoch": 0.7422384813130315, + "mean_token_accuracy": 0.7718501687049866, + "num_tokens": 18261498.0, + "step": 7507, + "train/ce_loss": 1.0468218326568604 + }, + { + "epoch": 0.7422384813130315, + "step": 7507, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.7422384813130315, + "step": 7507, + "train/total_loss": 0.25311967730522156 + }, + { + "entropy": 8.539094924926758, + "epoch": 0.742337354162547, + "mean_token_accuracy": 0.7296416759490967, + "num_tokens": 18266566.0, + "step": 7508, + "train/ce_loss": 2.49420668296807e-06 + }, + { + "epoch": 0.742337354162547, + "step": 7508, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.742337354162547, + "step": 7508, + "train/total_loss": 0.03515649959445 + }, + { + "entropy": 9.223367691040039, + "epoch": 0.7424362270120625, + "mean_token_accuracy": 0.7969052195549011, + "num_tokens": 18271480.0, + "step": 7509, + "train/ce_loss": 1.2301596403121948 + }, + { + "epoch": 0.7424362270120625, + "step": 7509, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7424362270120625, + "step": 7509, + "train/total_loss": 0.19332846999168396 + }, + { + "entropy": 9.014839172363281, + "epoch": 0.7425350998615781, + "mean_token_accuracy": 0.7388724088668823, + "num_tokens": 18276636.0, + "step": 7510, + "train/ce_loss": 0.7081224322319031 + }, + { + "epoch": 0.7425350998615781, + "step": 7510, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7425350998615781, + "step": 7510, + "train/total_loss": 0.10206224769353867 + }, + { + "entropy": 9.475015640258789, + "epoch": 0.7426339727110935, + "mean_token_accuracy": 0.7409470677375793, + "num_tokens": 18281409.0, + "step": 7511, + "train/ce_loss": 4.576662377075991e-06 + }, + { + "epoch": 0.7426339727110935, + "step": 7511, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7426339727110935, + "step": 7511, + "train/total_loss": 0.04687545821070671 + }, + { + "entropy": 8.920570373535156, + "epoch": 0.742732845560609, + "mean_token_accuracy": 0.7527910470962524, + "num_tokens": 18286444.0, + "step": 7512, + "train/ce_loss": 6.4199302869383246e-06 + }, + { + "epoch": 0.742732845560609, + "step": 7512, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.742732845560609, + "step": 7512, + "train/total_loss": 0.027344392612576485 + }, + { + "entropy": 8.559412956237793, + "epoch": 0.7428317184101246, + "mean_token_accuracy": 0.8051391839981079, + "num_tokens": 18292050.0, + "step": 7513, + "train/ce_loss": 0.3435446619987488 + }, + { + "epoch": 0.7428317184101246, + "step": 7513, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7428317184101246, + "step": 7513, + "train/total_loss": 0.08513571321964264 + }, + { + "entropy": 9.073625564575195, + "epoch": 0.7429305912596401, + "mean_token_accuracy": 0.7381294965744019, + "num_tokens": 18297199.0, + "step": 7514, + "train/ce_loss": 1.1100710253231227e-06 + }, + { + "epoch": 0.7429305912596401, + "step": 7514, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7429305912596401, + "step": 7514, + "train/total_loss": 0.015625111758708954 + }, + { + "entropy": 8.76772689819336, + "epoch": 0.7430294641091556, + "mean_token_accuracy": 0.7468879818916321, + "num_tokens": 18302398.0, + "step": 7515, + "train/ce_loss": 1.4463998079299927 + }, + { + "epoch": 0.7430294641091556, + "step": 7515, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7430294641091556, + "step": 7515, + "train/total_loss": 0.2032337337732315 + }, + { + "entropy": 8.771977424621582, + "epoch": 0.7431283369586712, + "mean_token_accuracy": 0.7443820238113403, + "num_tokens": 18307595.0, + "step": 7516, + "train/ce_loss": 0.7369849681854248 + }, + { + "epoch": 0.7431283369586712, + "step": 7516, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7431283369586712, + "step": 7516, + "train/total_loss": 0.0932297483086586 + }, + { + "entropy": 8.773170471191406, + "epoch": 0.7432272098081867, + "mean_token_accuracy": 0.7547169923782349, + "num_tokens": 18312728.0, + "step": 7517, + "train/ce_loss": 0.5180942416191101 + }, + { + "epoch": 0.7432272098081867, + "step": 7517, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7432272098081867, + "step": 7517, + "train/total_loss": 0.07524693012237549 + }, + { + "entropy": 8.073772430419922, + "epoch": 0.7433260826577022, + "mean_token_accuracy": 0.748633861541748, + "num_tokens": 18318329.0, + "step": 7518, + "train/ce_loss": 1.114916443824768 + }, + { + "epoch": 0.7433260826577022, + "step": 7518, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7433260826577022, + "step": 7518, + "train/total_loss": 0.13102290034294128 + }, + { + "entropy": 8.676217079162598, + "epoch": 0.7434249555072178, + "mean_token_accuracy": 0.7458333373069763, + "num_tokens": 18323542.0, + "step": 7519, + "train/ce_loss": 0.2892405688762665 + }, + { + "epoch": 0.7434249555072178, + "step": 7519, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7434249555072178, + "step": 7519, + "train/total_loss": 0.06017405539751053 + }, + { + "epoch": 0.7435238283567333, + "grad_norm": 0.657289981842041, + "learning_rate": 8.143450526628097e-06, + "loss": 0.1319, + "step": 7520 + }, + { + "entropy": 8.515302658081055, + "epoch": 0.7435238283567333, + "mean_token_accuracy": 0.7568420767784119, + "num_tokens": 18328982.0, + "step": 7520, + "train/ce_loss": 0.7898205518722534 + }, + { + "epoch": 0.7435238283567333, + "step": 7520, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7435238283567333, + "step": 7520, + "train/total_loss": 0.12195080518722534 + }, + { + "entropy": 8.461028099060059, + "epoch": 0.7436227012062487, + "mean_token_accuracy": 0.7594537734985352, + "num_tokens": 18334450.0, + "step": 7521, + "train/ce_loss": 0.7316333055496216 + }, + { + "epoch": 0.7436227012062487, + "step": 7521, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7436227012062487, + "step": 7521, + "train/total_loss": 0.09660083055496216 + }, + { + "entropy": 8.769364356994629, + "epoch": 0.7437215740557643, + "mean_token_accuracy": 0.7709720134735107, + "num_tokens": 18339689.0, + "step": 7522, + "train/ce_loss": 0.9942920207977295 + }, + { + "epoch": 0.7437215740557643, + "step": 7522, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.7437215740557643, + "step": 7522, + "train/total_loss": 0.1970854550600052 + }, + { + "entropy": 8.664652824401855, + "epoch": 0.7438204469052798, + "mean_token_accuracy": 0.6864801645278931, + "num_tokens": 18345008.0, + "step": 7523, + "train/ce_loss": 0.9823068976402283 + }, + { + "epoch": 0.7438204469052798, + "step": 7523, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7438204469052798, + "step": 7523, + "train/total_loss": 0.16073068976402283 + }, + { + "entropy": 8.796646118164062, + "epoch": 0.7439193197547953, + "mean_token_accuracy": 0.7348203063011169, + "num_tokens": 18350282.0, + "step": 7524, + "train/ce_loss": 0.9658528566360474 + }, + { + "epoch": 0.7439193197547953, + "step": 7524, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7439193197547953, + "step": 7524, + "train/total_loss": 0.15908528864383698 + }, + { + "entropy": 8.759809494018555, + "epoch": 0.7440181926043109, + "mean_token_accuracy": 0.7451253533363342, + "num_tokens": 18355493.0, + "step": 7525, + "train/ce_loss": 0.9836140871047974 + }, + { + "epoch": 0.7440181926043109, + "step": 7525, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7440181926043109, + "step": 7525, + "train/total_loss": 0.12570515275001526 + }, + { + "entropy": 9.079217910766602, + "epoch": 0.7441170654538264, + "mean_token_accuracy": 0.7358803749084473, + "num_tokens": 18360503.0, + "step": 7526, + "train/ce_loss": 1.0410563945770264 + }, + { + "epoch": 0.7441170654538264, + "step": 7526, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7441170654538264, + "step": 7526, + "train/total_loss": 0.1666056513786316 + }, + { + "entropy": 8.668304443359375, + "epoch": 0.7442159383033419, + "mean_token_accuracy": 0.7101865410804749, + "num_tokens": 18365645.0, + "step": 7527, + "train/ce_loss": 1.1230249404907227 + }, + { + "epoch": 0.7442159383033419, + "step": 7527, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.7442159383033419, + "step": 7527, + "train/total_loss": 0.1865212470293045 + }, + { + "entropy": 8.49547004699707, + "epoch": 0.7443148111528575, + "mean_token_accuracy": 0.7277167439460754, + "num_tokens": 18371001.0, + "step": 7528, + "train/ce_loss": 1.0620167255401611 + }, + { + "epoch": 0.7443148111528575, + "step": 7528, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7443148111528575, + "step": 7528, + "train/total_loss": 0.1257329285144806 + }, + { + "entropy": 8.6134033203125, + "epoch": 0.744413684002373, + "mean_token_accuracy": 0.7319819927215576, + "num_tokens": 18376333.0, + "step": 7529, + "train/ce_loss": 0.4151674509048462 + }, + { + "epoch": 0.744413684002373, + "step": 7529, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.744413684002373, + "step": 7529, + "train/total_loss": 0.0883917510509491 + }, + { + "entropy": 8.853605270385742, + "epoch": 0.7445125568518884, + "mean_token_accuracy": 0.6931297779083252, + "num_tokens": 18381429.0, + "step": 7530, + "train/ce_loss": 1.5182098150253296 + }, + { + "epoch": 0.7445125568518884, + "step": 7530, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7445125568518884, + "step": 7530, + "train/total_loss": 0.18697723746299744 + }, + { + "entropy": 9.093571662902832, + "epoch": 0.744611429701404, + "mean_token_accuracy": 0.76382976770401, + "num_tokens": 18386300.0, + "step": 7531, + "train/ce_loss": 1.4884306192398071 + }, + { + "epoch": 0.744611429701404, + "step": 7531, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.744611429701404, + "step": 7531, + "train/total_loss": 0.20353056490421295 + }, + { + "entropy": 8.799421310424805, + "epoch": 0.7447103025509195, + "mean_token_accuracy": 0.7830045819282532, + "num_tokens": 18391446.0, + "step": 7532, + "train/ce_loss": 0.8592729568481445 + }, + { + "epoch": 0.7447103025509195, + "step": 7532, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7447103025509195, + "step": 7532, + "train/total_loss": 0.12498980015516281 + }, + { + "entropy": 9.006401062011719, + "epoch": 0.744809175400435, + "mean_token_accuracy": 0.7160714268684387, + "num_tokens": 18396445.0, + "step": 7533, + "train/ce_loss": 0.8229637145996094 + }, + { + "epoch": 0.744809175400435, + "step": 7533, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.744809175400435, + "step": 7533, + "train/total_loss": 0.13698387145996094 + }, + { + "entropy": 8.820967674255371, + "epoch": 0.7449080482499506, + "mean_token_accuracy": 0.742214560508728, + "num_tokens": 18401464.0, + "step": 7534, + "train/ce_loss": 1.3239574432373047 + }, + { + "epoch": 0.7449080482499506, + "step": 7534, + "train/sim_loss": 0.16015625 + }, + { + "epoch": 0.7449080482499506, + "step": 7534, + "train/total_loss": 0.29255199432373047 + }, + { + "entropy": 8.689075469970703, + "epoch": 0.7450069210994661, + "mean_token_accuracy": 0.7237308025360107, + "num_tokens": 18406801.0, + "step": 7535, + "train/ce_loss": 0.7591511011123657 + }, + { + "epoch": 0.7450069210994661, + "step": 7535, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.7450069210994661, + "step": 7535, + "train/total_loss": 0.1618526130914688 + }, + { + "entropy": 9.02638053894043, + "epoch": 0.7451057939489816, + "mean_token_accuracy": 0.6994134783744812, + "num_tokens": 18411902.0, + "step": 7536, + "train/ce_loss": 0.9574239253997803 + }, + { + "epoch": 0.7451057939489816, + "step": 7536, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7451057939489816, + "step": 7536, + "train/total_loss": 0.15042990446090698 + }, + { + "entropy": 8.805293083190918, + "epoch": 0.7452046667984972, + "mean_token_accuracy": 0.763005793094635, + "num_tokens": 18417236.0, + "step": 7537, + "train/ce_loss": 0.5509923696517944 + }, + { + "epoch": 0.7452046667984972, + "step": 7537, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7452046667984972, + "step": 7537, + "train/total_loss": 0.0707242339849472 + }, + { + "entropy": 8.43828010559082, + "epoch": 0.7453035396480127, + "mean_token_accuracy": 0.7420675754547119, + "num_tokens": 18422720.0, + "step": 7538, + "train/ce_loss": 0.5752097368240356 + }, + { + "epoch": 0.7453035396480127, + "step": 7538, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7453035396480127, + "step": 7538, + "train/total_loss": 0.07314597070217133 + }, + { + "entropy": 8.483229637145996, + "epoch": 0.7454024124975281, + "mean_token_accuracy": 0.814213216304779, + "num_tokens": 18428155.0, + "step": 7539, + "train/ce_loss": 0.6831315755844116 + }, + { + "epoch": 0.7454024124975281, + "step": 7539, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.7454024124975281, + "step": 7539, + "train/total_loss": 0.08003190904855728 + }, + { + "epoch": 0.7455012853470437, + "grad_norm": 0.5534847974777222, + "learning_rate": 8.138505661870148e-06, + "loss": 0.1358, + "step": 7540 + }, + { + "entropy": 8.548550605773926, + "epoch": 0.7455012853470437, + "mean_token_accuracy": 0.7884828448295593, + "num_tokens": 18433584.0, + "step": 7540, + "train/ce_loss": 1.0471758842468262 + }, + { + "epoch": 0.7455012853470437, + "step": 7540, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7455012853470437, + "step": 7540, + "train/total_loss": 0.12815508246421814 + }, + { + "entropy": 8.895271301269531, + "epoch": 0.7456001581965592, + "mean_token_accuracy": 0.6723459959030151, + "num_tokens": 18438833.0, + "step": 7541, + "train/ce_loss": 2.0626556873321533 + }, + { + "epoch": 0.7456001581965592, + "step": 7541, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.7456001581965592, + "step": 7541, + "train/total_loss": 0.32735931873321533 + }, + { + "entropy": 8.48635196685791, + "epoch": 0.7456990310460747, + "mean_token_accuracy": 0.7440100908279419, + "num_tokens": 18444086.0, + "step": 7542, + "train/ce_loss": 1.2792021036148071 + }, + { + "epoch": 0.7456990310460747, + "step": 7542, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7456990310460747, + "step": 7542, + "train/total_loss": 0.1787014603614807 + }, + { + "entropy": 9.208368301391602, + "epoch": 0.7457979038955903, + "mean_token_accuracy": 0.740667998790741, + "num_tokens": 18449036.0, + "step": 7543, + "train/ce_loss": 1.4675124883651733 + }, + { + "epoch": 0.7457979038955903, + "step": 7543, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7457979038955903, + "step": 7543, + "train/total_loss": 0.2014387547969818 + }, + { + "entropy": 8.466072082519531, + "epoch": 0.7458967767451058, + "mean_token_accuracy": 0.7203290462493896, + "num_tokens": 18454544.0, + "step": 7544, + "train/ce_loss": 1.236045241355896 + }, + { + "epoch": 0.7458967767451058, + "step": 7544, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7458967767451058, + "step": 7544, + "train/total_loss": 0.18219828605651855 + }, + { + "entropy": 8.675228118896484, + "epoch": 0.7459956495946213, + "mean_token_accuracy": 0.7056451439857483, + "num_tokens": 18459762.0, + "step": 7545, + "train/ce_loss": 1.0474457740783691 + }, + { + "epoch": 0.7459956495946213, + "step": 7545, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7459956495946213, + "step": 7545, + "train/total_loss": 0.1633383333683014 + }, + { + "entropy": 8.85736083984375, + "epoch": 0.7460945224441369, + "mean_token_accuracy": 0.7022106647491455, + "num_tokens": 18465006.0, + "step": 7546, + "train/ce_loss": 0.8944984078407288 + }, + { + "epoch": 0.7460945224441369, + "step": 7546, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.7460945224441369, + "step": 7546, + "train/total_loss": 0.17538735270500183 + }, + { + "entropy": 9.092981338500977, + "epoch": 0.7461933952936524, + "mean_token_accuracy": 0.690559446811676, + "num_tokens": 18470058.0, + "step": 7547, + "train/ce_loss": 1.8033366203308105 + }, + { + "epoch": 0.7461933952936524, + "step": 7547, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7461933952936524, + "step": 7547, + "train/total_loss": 0.25064617395401 + }, + { + "entropy": 8.797235488891602, + "epoch": 0.7462922681431678, + "mean_token_accuracy": 0.7824427485466003, + "num_tokens": 18475304.0, + "step": 7548, + "train/ce_loss": 0.7003121972084045 + }, + { + "epoch": 0.7462922681431678, + "step": 7548, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.7462922681431678, + "step": 7548, + "train/total_loss": 0.15987497568130493 + }, + { + "entropy": 8.513991355895996, + "epoch": 0.7463911409926834, + "mean_token_accuracy": 0.7707838416099548, + "num_tokens": 18480603.0, + "step": 7549, + "train/ce_loss": 0.6448712944984436 + }, + { + "epoch": 0.7463911409926834, + "step": 7549, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7463911409926834, + "step": 7549, + "train/total_loss": 0.11526837944984436 + }, + { + "entropy": 8.103286743164062, + "epoch": 0.7464900138421989, + "mean_token_accuracy": 0.7144185900688171, + "num_tokens": 18486163.0, + "step": 7550, + "train/ce_loss": 0.8816394805908203 + }, + { + "epoch": 0.7464900138421989, + "step": 7550, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.7464900138421989, + "step": 7550, + "train/total_loss": 0.15457019209861755 + }, + { + "entropy": 8.91938591003418, + "epoch": 0.7465888866917144, + "mean_token_accuracy": 0.7802768349647522, + "num_tokens": 18491183.0, + "step": 7551, + "train/ce_loss": 1.2602639198303223 + }, + { + "epoch": 0.7465888866917144, + "step": 7551, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.7465888866917144, + "step": 7551, + "train/total_loss": 0.20024514198303223 + }, + { + "entropy": 9.720104217529297, + "epoch": 0.74668775954123, + "mean_token_accuracy": 0.7919074892997742, + "num_tokens": 18495760.0, + "step": 7552, + "train/ce_loss": 1.2329982382652815e-05 + }, + { + "epoch": 0.74668775954123, + "step": 7552, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.74668775954123, + "step": 7552, + "train/total_loss": 0.04296998307108879 + }, + { + "entropy": 8.548629760742188, + "epoch": 0.7467866323907455, + "mean_token_accuracy": 0.7142857313156128, + "num_tokens": 18500989.0, + "step": 7553, + "train/ce_loss": 0.7419317364692688 + }, + { + "epoch": 0.7467866323907455, + "step": 7553, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7467866323907455, + "step": 7553, + "train/total_loss": 0.11716192215681076 + }, + { + "entropy": 8.859134674072266, + "epoch": 0.746885505240261, + "mean_token_accuracy": 0.7392686605453491, + "num_tokens": 18506091.0, + "step": 7554, + "train/ce_loss": 3.27682710121735e-06 + }, + { + "epoch": 0.746885505240261, + "step": 7554, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.746885505240261, + "step": 7554, + "train/total_loss": 0.042969077825546265 + }, + { + "entropy": 8.791265487670898, + "epoch": 0.7469843780897766, + "mean_token_accuracy": 0.7300944924354553, + "num_tokens": 18511294.0, + "step": 7555, + "train/ce_loss": 0.7456115484237671 + }, + { + "epoch": 0.7469843780897766, + "step": 7555, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7469843780897766, + "step": 7555, + "train/total_loss": 0.13315489888191223 + }, + { + "entropy": 8.044285774230957, + "epoch": 0.7470832509392921, + "mean_token_accuracy": 0.7624565362930298, + "num_tokens": 18516656.0, + "step": 7556, + "train/ce_loss": 0.5414119362831116 + }, + { + "epoch": 0.7470832509392921, + "step": 7556, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.7470832509392921, + "step": 7556, + "train/total_loss": 0.19085994362831116 + }, + { + "entropy": 9.396928787231445, + "epoch": 0.7471821237888076, + "mean_token_accuracy": 0.7343096137046814, + "num_tokens": 18521562.0, + "step": 7557, + "train/ce_loss": 1.8854216250474565e-05 + }, + { + "epoch": 0.7471821237888076, + "step": 7557, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7471821237888076, + "step": 7557, + "train/total_loss": 0.03125188499689102 + }, + { + "entropy": 8.831316947937012, + "epoch": 0.7472809966383231, + "mean_token_accuracy": 0.7898550629615784, + "num_tokens": 18526662.0, + "step": 7558, + "train/ce_loss": 1.0223673582077026 + }, + { + "epoch": 0.7472809966383231, + "step": 7558, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7472809966383231, + "step": 7558, + "train/total_loss": 0.13348674774169922 + }, + { + "entropy": 8.615570068359375, + "epoch": 0.7473798694878386, + "mean_token_accuracy": 0.8077889680862427, + "num_tokens": 18531932.0, + "step": 7559, + "train/ce_loss": 0.5703041553497314 + }, + { + "epoch": 0.7473798694878386, + "step": 7559, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7473798694878386, + "step": 7559, + "train/total_loss": 0.11171791702508926 + }, + { + "epoch": 0.7474787423373541, + "grad_norm": 0.6037012934684753, + "learning_rate": 8.1335607971122e-06, + "loss": 0.1366, + "step": 7560 + }, + { + "entropy": 8.575265884399414, + "epoch": 0.7474787423373541, + "mean_token_accuracy": 0.697621762752533, + "num_tokens": 18537320.0, + "step": 7560, + "train/ce_loss": 1.0120177268981934 + }, + { + "epoch": 0.7474787423373541, + "step": 7560, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.7474787423373541, + "step": 7560, + "train/total_loss": 0.17932677268981934 + }, + { + "entropy": 9.304473876953125, + "epoch": 0.7475776151868697, + "mean_token_accuracy": 0.7781955003738403, + "num_tokens": 18542236.0, + "step": 7561, + "train/ce_loss": 2.2549270397576038e-06 + }, + { + "epoch": 0.7475776151868697, + "step": 7561, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7475776151868697, + "step": 7561, + "train/total_loss": 0.019531475380063057 + }, + { + "entropy": 8.95811653137207, + "epoch": 0.7476764880363852, + "mean_token_accuracy": 0.7650793790817261, + "num_tokens": 18547337.0, + "step": 7562, + "train/ce_loss": 0.82717365026474 + }, + { + "epoch": 0.7476764880363852, + "step": 7562, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.7476764880363852, + "step": 7562, + "train/total_loss": 0.16084235906600952 + }, + { + "entropy": 8.916593551635742, + "epoch": 0.7477753608859007, + "mean_token_accuracy": 0.7220670580863953, + "num_tokens": 18552464.0, + "step": 7563, + "train/ce_loss": 1.4096484184265137 + }, + { + "epoch": 0.7477753608859007, + "step": 7563, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7477753608859007, + "step": 7563, + "train/total_loss": 0.17612110078334808 + }, + { + "entropy": 8.862186431884766, + "epoch": 0.7478742337354163, + "mean_token_accuracy": 0.7115716934204102, + "num_tokens": 18557431.0, + "step": 7564, + "train/ce_loss": 1.0325582027435303 + }, + { + "epoch": 0.7478742337354163, + "step": 7564, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7478742337354163, + "step": 7564, + "train/total_loss": 0.14231832325458527 + }, + { + "entropy": 8.625448226928711, + "epoch": 0.7479731065849318, + "mean_token_accuracy": 0.7784430980682373, + "num_tokens": 18562730.0, + "step": 7565, + "train/ce_loss": 0.6577219367027283 + }, + { + "epoch": 0.7479731065849318, + "step": 7565, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.7479731065849318, + "step": 7565, + "train/total_loss": 0.19077220559120178 + }, + { + "entropy": 8.983163833618164, + "epoch": 0.7480719794344473, + "mean_token_accuracy": 0.6866952776908875, + "num_tokens": 18567875.0, + "step": 7566, + "train/ce_loss": 0.9836313724517822 + }, + { + "epoch": 0.7480719794344473, + "step": 7566, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.7480719794344473, + "step": 7566, + "train/total_loss": 0.19211313128471375 + }, + { + "entropy": 9.013215065002441, + "epoch": 0.7481708522839629, + "mean_token_accuracy": 0.733137845993042, + "num_tokens": 18572949.0, + "step": 7567, + "train/ce_loss": 1.7395390272140503 + }, + { + "epoch": 0.7481708522839629, + "step": 7567, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7481708522839629, + "step": 7567, + "train/total_loss": 0.23254765570163727 + }, + { + "entropy": 8.51715087890625, + "epoch": 0.7482697251334783, + "mean_token_accuracy": 0.7427577972412109, + "num_tokens": 18578264.0, + "step": 7568, + "train/ce_loss": 0.794825553894043 + }, + { + "epoch": 0.7482697251334783, + "step": 7568, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7482697251334783, + "step": 7568, + "train/total_loss": 0.1302638053894043 + }, + { + "entropy": 8.655542373657227, + "epoch": 0.7483685979829938, + "mean_token_accuracy": 0.7131428718566895, + "num_tokens": 18583553.0, + "step": 7569, + "train/ce_loss": 0.9008631706237793 + }, + { + "epoch": 0.7483685979829938, + "step": 7569, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7483685979829938, + "step": 7569, + "train/total_loss": 0.15258631110191345 + }, + { + "entropy": 8.622589111328125, + "epoch": 0.7484674708325094, + "mean_token_accuracy": 0.746532142162323, + "num_tokens": 18588816.0, + "step": 7570, + "train/ce_loss": 0.6112365126609802 + }, + { + "epoch": 0.7484674708325094, + "step": 7570, + "train/sim_loss": 0.12109375 + }, + { + "epoch": 0.7484674708325094, + "step": 7570, + "train/total_loss": 0.18221740424633026 + }, + { + "entropy": 8.559322357177734, + "epoch": 0.7485663436820249, + "mean_token_accuracy": 0.7550111413002014, + "num_tokens": 18594348.0, + "step": 7571, + "train/ce_loss": 0.8936256170272827 + }, + { + "epoch": 0.7485663436820249, + "step": 7571, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7485663436820249, + "step": 7571, + "train/total_loss": 0.15186256170272827 + }, + { + "entropy": 8.6543550491333, + "epoch": 0.7486652165315404, + "mean_token_accuracy": 0.6327043771743774, + "num_tokens": 18599599.0, + "step": 7572, + "train/ce_loss": 1.2990891933441162 + }, + { + "epoch": 0.7486652165315404, + "step": 7572, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.7486652165315404, + "step": 7572, + "train/total_loss": 0.24319016933441162 + }, + { + "entropy": 8.520683288574219, + "epoch": 0.748764089381056, + "mean_token_accuracy": 0.7318500876426697, + "num_tokens": 18604890.0, + "step": 7573, + "train/ce_loss": 0.5030612349510193 + }, + { + "epoch": 0.748764089381056, + "step": 7573, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.748764089381056, + "step": 7573, + "train/total_loss": 0.10889987647533417 + }, + { + "entropy": 8.476627349853516, + "epoch": 0.7488629622305715, + "mean_token_accuracy": 0.683783769607544, + "num_tokens": 18610104.0, + "step": 7574, + "train/ce_loss": 1.7495384216308594 + }, + { + "epoch": 0.7488629622305715, + "step": 7574, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.7488629622305715, + "step": 7574, + "train/total_loss": 0.2882350981235504 + }, + { + "entropy": 8.270191192626953, + "epoch": 0.748961835080087, + "mean_token_accuracy": 0.6940000057220459, + "num_tokens": 18615569.0, + "step": 7575, + "train/ce_loss": 0.976158082485199 + }, + { + "epoch": 0.748961835080087, + "step": 7575, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.748961835080087, + "step": 7575, + "train/total_loss": 0.1835533082485199 + }, + { + "entropy": 7.990413665771484, + "epoch": 0.7490607079296026, + "mean_token_accuracy": 0.763129711151123, + "num_tokens": 18620977.0, + "step": 7576, + "train/ce_loss": 0.5641739368438721 + }, + { + "epoch": 0.7490607079296026, + "step": 7576, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7490607079296026, + "step": 7576, + "train/total_loss": 0.11891739070415497 + }, + { + "entropy": 8.844701766967773, + "epoch": 0.749159580779118, + "mean_token_accuracy": 0.6914498209953308, + "num_tokens": 18625855.0, + "step": 7577, + "train/ce_loss": 0.5731430053710938 + }, + { + "epoch": 0.749159580779118, + "step": 7577, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.749159580779118, + "step": 7577, + "train/total_loss": 0.08856430649757385 + }, + { + "entropy": 9.015901565551758, + "epoch": 0.7492584536286335, + "mean_token_accuracy": 0.7592892050743103, + "num_tokens": 18630950.0, + "step": 7578, + "train/ce_loss": 0.7445675134658813 + }, + { + "epoch": 0.7492584536286335, + "step": 7578, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7492584536286335, + "step": 7578, + "train/total_loss": 0.12133175134658813 + }, + { + "entropy": 8.592124938964844, + "epoch": 0.7493573264781491, + "mean_token_accuracy": 0.7462483048439026, + "num_tokens": 18636177.0, + "step": 7579, + "train/ce_loss": 0.8068994283676147 + }, + { + "epoch": 0.7493573264781491, + "step": 7579, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7493573264781491, + "step": 7579, + "train/total_loss": 0.131471186876297 + }, + { + "epoch": 0.7494561993276646, + "grad_norm": 0.6941366195678711, + "learning_rate": 8.12861593235425e-06, + "loss": 0.1481, + "step": 7580 + }, + { + "entropy": 8.709858894348145, + "epoch": 0.7494561993276646, + "mean_token_accuracy": 0.7689969539642334, + "num_tokens": 18641324.0, + "step": 7580, + "train/ce_loss": 0.4913293123245239 + }, + { + "epoch": 0.7494561993276646, + "step": 7580, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7494561993276646, + "step": 7580, + "train/total_loss": 0.09991417825222015 + }, + { + "entropy": 8.855411529541016, + "epoch": 0.7495550721771801, + "mean_token_accuracy": 0.7503234148025513, + "num_tokens": 18646557.0, + "step": 7581, + "train/ce_loss": 1.4032633304595947 + }, + { + "epoch": 0.7495550721771801, + "step": 7581, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.7495550721771801, + "step": 7581, + "train/total_loss": 0.2457950860261917 + }, + { + "entropy": 8.342367172241211, + "epoch": 0.7496539450266957, + "mean_token_accuracy": 0.7551020383834839, + "num_tokens": 18651977.0, + "step": 7582, + "train/ce_loss": 0.5137928128242493 + }, + { + "epoch": 0.7496539450266957, + "step": 7582, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7496539450266957, + "step": 7582, + "train/total_loss": 0.09825427830219269 + }, + { + "entropy": 8.741762161254883, + "epoch": 0.7497528178762112, + "mean_token_accuracy": 0.7442424297332764, + "num_tokens": 18657241.0, + "step": 7583, + "train/ce_loss": 1.6014753327908693e-06 + }, + { + "epoch": 0.7497528178762112, + "step": 7583, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7497528178762112, + "step": 7583, + "train/total_loss": 0.015625160187482834 + }, + { + "entropy": 8.494527816772461, + "epoch": 0.7498516907257267, + "mean_token_accuracy": 0.7305936217308044, + "num_tokens": 18662545.0, + "step": 7584, + "train/ce_loss": 0.8228175640106201 + }, + { + "epoch": 0.7498516907257267, + "step": 7584, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7498516907257267, + "step": 7584, + "train/total_loss": 0.14087551832199097 + }, + { + "entropy": 8.599166870117188, + "epoch": 0.7499505635752423, + "mean_token_accuracy": 0.7303988933563232, + "num_tokens": 18667757.0, + "step": 7585, + "train/ce_loss": 0.6843902468681335 + }, + { + "epoch": 0.7499505635752423, + "step": 7585, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7499505635752423, + "step": 7585, + "train/total_loss": 0.12312652915716171 + }, + { + "entropy": 8.907880783081055, + "epoch": 0.7500494364247577, + "mean_token_accuracy": 0.7748031616210938, + "num_tokens": 18672716.0, + "step": 7586, + "train/ce_loss": 4.061507752339821e-06 + }, + { + "epoch": 0.7500494364247577, + "step": 7586, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7500494364247577, + "step": 7586, + "train/total_loss": 0.023437906056642532 + }, + { + "entropy": 8.357831954956055, + "epoch": 0.7501483092742733, + "mean_token_accuracy": 0.688720166683197, + "num_tokens": 18678023.0, + "step": 7587, + "train/ce_loss": 1.2316768169403076 + }, + { + "epoch": 0.7501483092742733, + "step": 7587, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7501483092742733, + "step": 7587, + "train/total_loss": 0.16223019361495972 + }, + { + "entropy": 8.60870361328125, + "epoch": 0.7502471821237888, + "mean_token_accuracy": 0.7943166494369507, + "num_tokens": 18683248.0, + "step": 7588, + "train/ce_loss": 0.751041829586029 + }, + { + "epoch": 0.7502471821237888, + "step": 7588, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7502471821237888, + "step": 7588, + "train/total_loss": 0.13760417699813843 + }, + { + "entropy": 8.520055770874023, + "epoch": 0.7503460549733043, + "mean_token_accuracy": 0.7567901015281677, + "num_tokens": 18688529.0, + "step": 7589, + "train/ce_loss": 0.8611323833465576 + }, + { + "epoch": 0.7503460549733043, + "step": 7589, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7503460549733043, + "step": 7589, + "train/total_loss": 0.13298824429512024 + }, + { + "entropy": 8.39891242980957, + "epoch": 0.7504449278228199, + "mean_token_accuracy": 0.7409700751304626, + "num_tokens": 18693959.0, + "step": 7590, + "train/ce_loss": 0.603983998298645 + }, + { + "epoch": 0.7504449278228199, + "step": 7590, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7504449278228199, + "step": 7590, + "train/total_loss": 0.0994608998298645 + }, + { + "entropy": 8.5487699508667, + "epoch": 0.7505438006723354, + "mean_token_accuracy": 0.79368656873703, + "num_tokens": 18699309.0, + "step": 7591, + "train/ce_loss": 0.9199407696723938 + }, + { + "epoch": 0.7505438006723354, + "step": 7591, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7505438006723354, + "step": 7591, + "train/total_loss": 0.15058782696723938 + }, + { + "entropy": 8.455150604248047, + "epoch": 0.7506426735218509, + "mean_token_accuracy": 0.8018540143966675, + "num_tokens": 18704662.0, + "step": 7592, + "train/ce_loss": 0.48203450441360474 + }, + { + "epoch": 0.7506426735218509, + "step": 7592, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7506426735218509, + "step": 7592, + "train/total_loss": 0.10289095342159271 + }, + { + "entropy": 8.719837188720703, + "epoch": 0.7507415463713665, + "mean_token_accuracy": 0.7514705657958984, + "num_tokens": 18709771.0, + "step": 7593, + "train/ce_loss": 1.3186227083206177 + }, + { + "epoch": 0.7507415463713665, + "step": 7593, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7507415463713665, + "step": 7593, + "train/total_loss": 0.18264351785182953 + }, + { + "entropy": 8.980011940002441, + "epoch": 0.750840419220882, + "mean_token_accuracy": 0.7803278565406799, + "num_tokens": 18714836.0, + "step": 7594, + "train/ce_loss": 0.9642717242240906 + }, + { + "epoch": 0.750840419220882, + "step": 7594, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.750840419220882, + "step": 7594, + "train/total_loss": 0.13158342242240906 + }, + { + "entropy": 8.246875762939453, + "epoch": 0.7509392920703974, + "mean_token_accuracy": 0.6725025773048401, + "num_tokens": 18720288.0, + "step": 7595, + "train/ce_loss": 0.7636457085609436 + }, + { + "epoch": 0.7509392920703974, + "step": 7595, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7509392920703974, + "step": 7595, + "train/total_loss": 0.13495832681655884 + }, + { + "entropy": 8.382661819458008, + "epoch": 0.751038164919913, + "mean_token_accuracy": 0.7502527832984924, + "num_tokens": 18725702.0, + "step": 7596, + "train/ce_loss": 1.2867059707641602 + }, + { + "epoch": 0.751038164919913, + "step": 7596, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.751038164919913, + "step": 7596, + "train/total_loss": 0.2185143530368805 + }, + { + "entropy": 9.54419231414795, + "epoch": 0.7511370377694285, + "mean_token_accuracy": 0.7117347121238708, + "num_tokens": 18730487.0, + "step": 7597, + "train/ce_loss": 1.4519308805465698 + }, + { + "epoch": 0.7511370377694285, + "step": 7597, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7511370377694285, + "step": 7597, + "train/total_loss": 0.18816183507442474 + }, + { + "entropy": 8.40174674987793, + "epoch": 0.751235910618944, + "mean_token_accuracy": 0.7516778707504272, + "num_tokens": 18735741.0, + "step": 7598, + "train/ce_loss": 0.98127681016922 + }, + { + "epoch": 0.751235910618944, + "step": 7598, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.751235910618944, + "step": 7598, + "train/total_loss": 0.16062769293785095 + }, + { + "entropy": 8.947654724121094, + "epoch": 0.7513347834684596, + "mean_token_accuracy": 0.7711864113807678, + "num_tokens": 18740836.0, + "step": 7599, + "train/ce_loss": 1.3571666479110718 + }, + { + "epoch": 0.7513347834684596, + "step": 7599, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7513347834684596, + "step": 7599, + "train/total_loss": 0.15524791181087494 + }, + { + "epoch": 0.7514336563179751, + "grad_norm": 0.5190445184707642, + "learning_rate": 8.123671067596301e-06, + "loss": 0.127, + "step": 7600 + }, + { + "entropy": 8.397933006286621, + "epoch": 0.7514336563179751, + "mean_token_accuracy": 0.8217922449111938, + "num_tokens": 18746269.0, + "step": 7600, + "train/ce_loss": 0.34773579239845276 + }, + { + "epoch": 0.7514336563179751, + "step": 7600, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7514336563179751, + "step": 7600, + "train/total_loss": 0.054304830729961395 + }, + { + "entropy": 8.61817741394043, + "epoch": 0.7515325291674906, + "mean_token_accuracy": 0.7633495330810547, + "num_tokens": 18751603.0, + "step": 7601, + "train/ce_loss": 0.6802176833152771 + }, + { + "epoch": 0.7515325291674906, + "step": 7601, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7515325291674906, + "step": 7601, + "train/total_loss": 0.09145926684141159 + }, + { + "entropy": 9.059032440185547, + "epoch": 0.7516314020170062, + "mean_token_accuracy": 0.7538726329803467, + "num_tokens": 18756648.0, + "step": 7602, + "train/ce_loss": 0.9057416319847107 + }, + { + "epoch": 0.7516314020170062, + "step": 7602, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.7516314020170062, + "step": 7602, + "train/total_loss": 0.16869917511940002 + }, + { + "entropy": 9.00408935546875, + "epoch": 0.7517302748665217, + "mean_token_accuracy": 0.6993569135665894, + "num_tokens": 18761939.0, + "step": 7603, + "train/ce_loss": 0.6494431495666504 + }, + { + "epoch": 0.7517302748665217, + "step": 7603, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7517302748665217, + "step": 7603, + "train/total_loss": 0.1040068194270134 + }, + { + "entropy": 9.162939071655273, + "epoch": 0.7518291477160371, + "mean_token_accuracy": 0.7204301357269287, + "num_tokens": 18766923.0, + "step": 7604, + "train/ce_loss": 1.313880205154419 + }, + { + "epoch": 0.7518291477160371, + "step": 7604, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7518291477160371, + "step": 7604, + "train/total_loss": 0.17045052349567413 + }, + { + "entropy": 8.271470069885254, + "epoch": 0.7519280205655527, + "mean_token_accuracy": 0.777990460395813, + "num_tokens": 18772439.0, + "step": 7605, + "train/ce_loss": 0.4249177575111389 + }, + { + "epoch": 0.7519280205655527, + "step": 7605, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7519280205655527, + "step": 7605, + "train/total_loss": 0.06983552873134613 + }, + { + "entropy": 8.742707252502441, + "epoch": 0.7520268934150682, + "mean_token_accuracy": 0.7394468784332275, + "num_tokens": 18777605.0, + "step": 7606, + "train/ce_loss": 0.6090832948684692 + }, + { + "epoch": 0.7520268934150682, + "step": 7606, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7520268934150682, + "step": 7606, + "train/total_loss": 0.09606458246707916 + }, + { + "entropy": 8.429786682128906, + "epoch": 0.7521257662645837, + "mean_token_accuracy": 0.7768508791923523, + "num_tokens": 18782997.0, + "step": 7607, + "train/ce_loss": 0.7540313601493835 + }, + { + "epoch": 0.7521257662645837, + "step": 7607, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7521257662645837, + "step": 7607, + "train/total_loss": 0.0988406389951706 + }, + { + "entropy": 8.465465545654297, + "epoch": 0.7522246391140993, + "mean_token_accuracy": 0.727918803691864, + "num_tokens": 18788440.0, + "step": 7608, + "train/ce_loss": 0.9121677875518799 + }, + { + "epoch": 0.7522246391140993, + "step": 7608, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.7522246391140993, + "step": 7608, + "train/total_loss": 0.1849667727947235 + }, + { + "entropy": 8.796218872070312, + "epoch": 0.7523235119636148, + "mean_token_accuracy": 0.7910271286964417, + "num_tokens": 18793748.0, + "step": 7609, + "train/ce_loss": 0.5593013167381287 + }, + { + "epoch": 0.7523235119636148, + "step": 7609, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.7523235119636148, + "step": 7609, + "train/total_loss": 0.14186763763427734 + }, + { + "entropy": 8.514887809753418, + "epoch": 0.7524223848131303, + "mean_token_accuracy": 0.7386091351509094, + "num_tokens": 18799074.0, + "step": 7610, + "train/ce_loss": 1.0005974769592285 + }, + { + "epoch": 0.7524223848131303, + "step": 7610, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7524223848131303, + "step": 7610, + "train/total_loss": 0.17037224769592285 + }, + { + "entropy": 8.344120025634766, + "epoch": 0.7525212576626459, + "mean_token_accuracy": 0.7127312421798706, + "num_tokens": 18804445.0, + "step": 7611, + "train/ce_loss": 1.3133176565170288 + }, + { + "epoch": 0.7525212576626459, + "step": 7611, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.7525212576626459, + "step": 7611, + "train/total_loss": 0.22117552161216736 + }, + { + "entropy": 8.522525787353516, + "epoch": 0.7526201305121614, + "mean_token_accuracy": 0.7158836722373962, + "num_tokens": 18809776.0, + "step": 7612, + "train/ce_loss": 1.3812496662139893 + }, + { + "epoch": 0.7526201305121614, + "step": 7612, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7526201305121614, + "step": 7612, + "train/total_loss": 0.1928124725818634 + }, + { + "entropy": 8.93623161315918, + "epoch": 0.7527190033616769, + "mean_token_accuracy": 0.7377278804779053, + "num_tokens": 18814946.0, + "step": 7613, + "train/ce_loss": 1.4814845323562622 + }, + { + "epoch": 0.7527190033616769, + "step": 7613, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.7527190033616769, + "step": 7613, + "train/total_loss": 0.23017971217632294 + }, + { + "entropy": 9.092214584350586, + "epoch": 0.7528178762111924, + "mean_token_accuracy": 0.7739999890327454, + "num_tokens": 18819872.0, + "step": 7614, + "train/ce_loss": 1.1034413576126099 + }, + { + "epoch": 0.7528178762111924, + "step": 7614, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7528178762111924, + "step": 7614, + "train/total_loss": 0.16893789172172546 + }, + { + "entropy": 9.46321964263916, + "epoch": 0.7529167490607079, + "mean_token_accuracy": 0.7542372941970825, + "num_tokens": 18824640.0, + "step": 7615, + "train/ce_loss": 1.9673778297146782e-05 + }, + { + "epoch": 0.7529167490607079, + "step": 7615, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7529167490607079, + "step": 7615, + "train/total_loss": 0.06250196695327759 + }, + { + "entropy": 8.586030960083008, + "epoch": 0.7530156219102234, + "mean_token_accuracy": 0.7251114249229431, + "num_tokens": 18829748.0, + "step": 7616, + "train/ce_loss": 1.3748188018798828 + }, + { + "epoch": 0.7530156219102234, + "step": 7616, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7530156219102234, + "step": 7616, + "train/total_loss": 0.17654438316822052 + }, + { + "entropy": 8.770456314086914, + "epoch": 0.753114494759739, + "mean_token_accuracy": 0.7137546539306641, + "num_tokens": 18834980.0, + "step": 7617, + "train/ce_loss": 1.3555060625076294 + }, + { + "epoch": 0.753114494759739, + "step": 7617, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.753114494759739, + "step": 7617, + "train/total_loss": 0.1902381032705307 + }, + { + "entropy": 8.679096221923828, + "epoch": 0.7532133676092545, + "mean_token_accuracy": 0.7106325626373291, + "num_tokens": 18840183.0, + "step": 7618, + "train/ce_loss": 0.9374057054519653 + }, + { + "epoch": 0.7532133676092545, + "step": 7618, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7532133676092545, + "step": 7618, + "train/total_loss": 0.1640530824661255 + }, + { + "entropy": 8.480308532714844, + "epoch": 0.75331224045877, + "mean_token_accuracy": 0.785495400428772, + "num_tokens": 18845601.0, + "step": 7619, + "train/ce_loss": 0.5295276641845703 + }, + { + "epoch": 0.75331224045877, + "step": 7619, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.75331224045877, + "step": 7619, + "train/total_loss": 0.09982776641845703 + }, + { + "epoch": 0.7534111133082856, + "grad_norm": 0.5392074584960938, + "learning_rate": 8.118726202838353e-06, + "loss": 0.1327, + "step": 7620 + }, + { + "entropy": 8.986780166625977, + "epoch": 0.7534111133082856, + "mean_token_accuracy": 0.7759259343147278, + "num_tokens": 18850607.0, + "step": 7620, + "train/ce_loss": 1.2527285814285278 + }, + { + "epoch": 0.7534111133082856, + "step": 7620, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7534111133082856, + "step": 7620, + "train/total_loss": 0.17605410516262054 + }, + { + "entropy": 8.76120376586914, + "epoch": 0.7535099861578011, + "mean_token_accuracy": 0.7209677696228027, + "num_tokens": 18855644.0, + "step": 7621, + "train/ce_loss": 0.6488677263259888 + }, + { + "epoch": 0.7535099861578011, + "step": 7621, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7535099861578011, + "step": 7621, + "train/total_loss": 0.10394927114248276 + }, + { + "entropy": 8.42733383178711, + "epoch": 0.7536088590073166, + "mean_token_accuracy": 0.73204106092453, + "num_tokens": 18861003.0, + "step": 7622, + "train/ce_loss": 0.6457306146621704 + }, + { + "epoch": 0.7536088590073166, + "step": 7622, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7536088590073166, + "step": 7622, + "train/total_loss": 0.09191681444644928 + }, + { + "entropy": 9.14065170288086, + "epoch": 0.7537077318568322, + "mean_token_accuracy": 0.7386363744735718, + "num_tokens": 18866022.0, + "step": 7623, + "train/ce_loss": 1.131330966949463 + }, + { + "epoch": 0.7537077318568322, + "step": 7623, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7537077318568322, + "step": 7623, + "train/total_loss": 0.14828935265541077 + }, + { + "entropy": 8.719907760620117, + "epoch": 0.7538066047063476, + "mean_token_accuracy": 0.725806474685669, + "num_tokens": 18871251.0, + "step": 7624, + "train/ce_loss": 0.8957783579826355 + }, + { + "epoch": 0.7538066047063476, + "step": 7624, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7538066047063476, + "step": 7624, + "train/total_loss": 0.1364528387784958 + }, + { + "entropy": 8.517269134521484, + "epoch": 0.7539054775558631, + "mean_token_accuracy": 0.8009478449821472, + "num_tokens": 18876563.0, + "step": 7625, + "train/ce_loss": 0.5261335372924805 + }, + { + "epoch": 0.7539054775558631, + "step": 7625, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7539054775558631, + "step": 7625, + "train/total_loss": 0.07214460521936417 + }, + { + "entropy": 9.236428260803223, + "epoch": 0.7540043504053787, + "mean_token_accuracy": 0.8019230961799622, + "num_tokens": 18881525.0, + "step": 7626, + "train/ce_loss": 1.5224268436431885 + }, + { + "epoch": 0.7540043504053787, + "step": 7626, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7540043504053787, + "step": 7626, + "train/total_loss": 0.21474269032478333 + }, + { + "entropy": 8.618507385253906, + "epoch": 0.7541032232548942, + "mean_token_accuracy": 0.7896138429641724, + "num_tokens": 18886766.0, + "step": 7627, + "train/ce_loss": 0.7982921600341797 + }, + { + "epoch": 0.7541032232548942, + "step": 7627, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7541032232548942, + "step": 7627, + "train/total_loss": 0.11889171600341797 + }, + { + "entropy": 8.485706329345703, + "epoch": 0.7542020961044097, + "mean_token_accuracy": 0.7259439826011658, + "num_tokens": 18892020.0, + "step": 7628, + "train/ce_loss": 0.44417646527290344 + }, + { + "epoch": 0.7542020961044097, + "step": 7628, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7542020961044097, + "step": 7628, + "train/total_loss": 0.10691764950752258 + }, + { + "entropy": 8.981820106506348, + "epoch": 0.7543009689539253, + "mean_token_accuracy": 0.8368263244628906, + "num_tokens": 18897145.0, + "step": 7629, + "train/ce_loss": 1.4848389582766686e-05 + }, + { + "epoch": 0.7543009689539253, + "step": 7629, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7543009689539253, + "step": 7629, + "train/total_loss": 0.035157736390829086 + }, + { + "entropy": 8.820948600769043, + "epoch": 0.7543998418034408, + "mean_token_accuracy": 0.7624161243438721, + "num_tokens": 18902319.0, + "step": 7630, + "train/ce_loss": 1.2992298603057861 + }, + { + "epoch": 0.7543998418034408, + "step": 7630, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.7543998418034408, + "step": 7630, + "train/total_loss": 0.2158604860305786 + }, + { + "entropy": 8.083815574645996, + "epoch": 0.7544987146529563, + "mean_token_accuracy": 0.7002996802330017, + "num_tokens": 18907893.0, + "step": 7631, + "train/ce_loss": 0.42063286900520325 + }, + { + "epoch": 0.7544987146529563, + "step": 7631, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7544987146529563, + "step": 7631, + "train/total_loss": 0.08893828839063644 + }, + { + "entropy": 8.371919631958008, + "epoch": 0.7545975875024719, + "mean_token_accuracy": 0.7872105836868286, + "num_tokens": 18913288.0, + "step": 7632, + "train/ce_loss": 0.41169917583465576 + }, + { + "epoch": 0.7545975875024719, + "step": 7632, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7545975875024719, + "step": 7632, + "train/total_loss": 0.056794919073581696 + }, + { + "entropy": 8.150374412536621, + "epoch": 0.7546964603519873, + "mean_token_accuracy": 0.7114846110343933, + "num_tokens": 18919016.0, + "step": 7633, + "train/ce_loss": 0.9580610394477844 + }, + { + "epoch": 0.7546964603519873, + "step": 7633, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7546964603519873, + "step": 7633, + "train/total_loss": 0.14658735692501068 + }, + { + "entropy": 8.695228576660156, + "epoch": 0.7547953332015028, + "mean_token_accuracy": 0.7293233275413513, + "num_tokens": 18924318.0, + "step": 7634, + "train/ce_loss": 0.9011909365653992 + }, + { + "epoch": 0.7547953332015028, + "step": 7634, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.7547953332015028, + "step": 7634, + "train/total_loss": 0.19558784365653992 + }, + { + "entropy": 8.712812423706055, + "epoch": 0.7548942060510184, + "mean_token_accuracy": 0.7779237627983093, + "num_tokens": 18929530.0, + "step": 7635, + "train/ce_loss": 0.6864867210388184 + }, + { + "epoch": 0.7548942060510184, + "step": 7635, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7548942060510184, + "step": 7635, + "train/total_loss": 0.08427367359399796 + }, + { + "entropy": 8.558393478393555, + "epoch": 0.7549930789005339, + "mean_token_accuracy": 0.7250280380249023, + "num_tokens": 18934873.0, + "step": 7636, + "train/ce_loss": 0.49897968769073486 + }, + { + "epoch": 0.7549930789005339, + "step": 7636, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7549930789005339, + "step": 7636, + "train/total_loss": 0.10849171876907349 + }, + { + "entropy": 9.561351776123047, + "epoch": 0.7550919517500494, + "mean_token_accuracy": 0.7879580855369568, + "num_tokens": 18939694.0, + "step": 7637, + "train/ce_loss": 4.886520400759764e-06 + }, + { + "epoch": 0.7550919517500494, + "step": 7637, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7550919517500494, + "step": 7637, + "train/total_loss": 0.0351567380130291 + }, + { + "entropy": 9.224124908447266, + "epoch": 0.755190824599565, + "mean_token_accuracy": 0.7718120813369751, + "num_tokens": 18944557.0, + "step": 7638, + "train/ce_loss": 1.5212565660476685 + }, + { + "epoch": 0.755190824599565, + "step": 7638, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.755190824599565, + "step": 7638, + "train/total_loss": 0.21462565660476685 + }, + { + "entropy": 8.878040313720703, + "epoch": 0.7552896974490805, + "mean_token_accuracy": 0.7241379022598267, + "num_tokens": 18949594.0, + "step": 7639, + "train/ce_loss": 1.5049303770065308 + }, + { + "epoch": 0.7552896974490805, + "step": 7639, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.7552896974490805, + "step": 7639, + "train/total_loss": 0.2833055257797241 + }, + { + "epoch": 0.755388570298596, + "grad_norm": 0.750734269618988, + "learning_rate": 8.113781338080404e-06, + "loss": 0.1272, + "step": 7640 + }, + { + "entropy": 9.231822967529297, + "epoch": 0.755388570298596, + "mean_token_accuracy": 0.7209653258323669, + "num_tokens": 18954725.0, + "step": 7640, + "train/ce_loss": 0.9756774306297302 + }, + { + "epoch": 0.755388570298596, + "step": 7640, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.755388570298596, + "step": 7640, + "train/total_loss": 0.15225523710250854 + }, + { + "entropy": 8.80136489868164, + "epoch": 0.7554874431481116, + "mean_token_accuracy": 0.8112947940826416, + "num_tokens": 18959893.0, + "step": 7641, + "train/ce_loss": 0.6539521813392639 + }, + { + "epoch": 0.7554874431481116, + "step": 7641, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7554874431481116, + "step": 7641, + "train/total_loss": 0.11227022111415863 + }, + { + "entropy": 8.258249282836914, + "epoch": 0.755586315997627, + "mean_token_accuracy": 0.7637571096420288, + "num_tokens": 18965483.0, + "step": 7642, + "train/ce_loss": 0.41775640845298767 + }, + { + "epoch": 0.755586315997627, + "step": 7642, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.755586315997627, + "step": 7642, + "train/total_loss": 0.065213143825531 + }, + { + "entropy": 8.918859481811523, + "epoch": 0.7556851888471425, + "mean_token_accuracy": 0.6951219439506531, + "num_tokens": 18970580.0, + "step": 7643, + "train/ce_loss": 1.2000300884246826 + }, + { + "epoch": 0.7556851888471425, + "step": 7643, + "train/sim_loss": 0.0078125 + }, + { + "epoch": 0.7556851888471425, + "step": 7643, + "train/total_loss": 0.12781551480293274 + }, + { + "entropy": 8.945146560668945, + "epoch": 0.7557840616966581, + "mean_token_accuracy": 0.7384615540504456, + "num_tokens": 18975692.0, + "step": 7644, + "train/ce_loss": 0.9329941868782043 + }, + { + "epoch": 0.7557840616966581, + "step": 7644, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7557840616966581, + "step": 7644, + "train/total_loss": 0.13236191868782043 + }, + { + "entropy": 8.990866661071777, + "epoch": 0.7558829345461736, + "mean_token_accuracy": 0.7046783566474915, + "num_tokens": 18980792.0, + "step": 7645, + "train/ce_loss": 0.6352463960647583 + }, + { + "epoch": 0.7558829345461736, + "step": 7645, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7558829345461736, + "step": 7645, + "train/total_loss": 0.09868089109659195 + }, + { + "entropy": 8.912968635559082, + "epoch": 0.7559818073956891, + "mean_token_accuracy": 0.7124277353286743, + "num_tokens": 18985920.0, + "step": 7646, + "train/ce_loss": 1.3944169282913208 + }, + { + "epoch": 0.7559818073956891, + "step": 7646, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7559818073956891, + "step": 7646, + "train/total_loss": 0.19412919878959656 + }, + { + "entropy": 8.82077693939209, + "epoch": 0.7560806802452047, + "mean_token_accuracy": 0.7394822239875793, + "num_tokens": 18990998.0, + "step": 7647, + "train/ce_loss": 1.771406888961792 + }, + { + "epoch": 0.7560806802452047, + "step": 7647, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.7560806802452047, + "step": 7647, + "train/total_loss": 0.2747969627380371 + }, + { + "entropy": 8.739612579345703, + "epoch": 0.7561795530947202, + "mean_token_accuracy": 0.7270340919494629, + "num_tokens": 18996379.0, + "step": 7648, + "train/ce_loss": 1.1098839044570923 + }, + { + "epoch": 0.7561795530947202, + "step": 7648, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7561795530947202, + "step": 7648, + "train/total_loss": 0.16958214342594147 + }, + { + "entropy": 8.741659164428711, + "epoch": 0.7562784259442357, + "mean_token_accuracy": 0.7295454740524292, + "num_tokens": 19001690.0, + "step": 7649, + "train/ce_loss": 0.9927825927734375 + }, + { + "epoch": 0.7562784259442357, + "step": 7649, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7562784259442357, + "step": 7649, + "train/total_loss": 0.1305282711982727 + }, + { + "entropy": 8.605138778686523, + "epoch": 0.7563772987937513, + "mean_token_accuracy": 0.7696078419685364, + "num_tokens": 19006970.0, + "step": 7650, + "train/ce_loss": 0.8369215130805969 + }, + { + "epoch": 0.7563772987937513, + "step": 7650, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.7563772987937513, + "step": 7650, + "train/total_loss": 0.17744216322898865 + }, + { + "entropy": 8.186044692993164, + "epoch": 0.7564761716432667, + "mean_token_accuracy": 0.737758457660675, + "num_tokens": 19012562.0, + "step": 7651, + "train/ce_loss": 0.9426973462104797 + }, + { + "epoch": 0.7564761716432667, + "step": 7651, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.7564761716432667, + "step": 7651, + "train/total_loss": 0.1958322376012802 + }, + { + "entropy": 8.739931106567383, + "epoch": 0.7565750444927822, + "mean_token_accuracy": 0.7268232107162476, + "num_tokens": 19017841.0, + "step": 7652, + "train/ce_loss": 0.5349259972572327 + }, + { + "epoch": 0.7565750444927822, + "step": 7652, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7565750444927822, + "step": 7652, + "train/total_loss": 0.09646135568618774 + }, + { + "entropy": 8.697602272033691, + "epoch": 0.7566739173422978, + "mean_token_accuracy": 0.7437425255775452, + "num_tokens": 19023154.0, + "step": 7653, + "train/ce_loss": 1.1454800367355347 + }, + { + "epoch": 0.7566739173422978, + "step": 7653, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.7566739173422978, + "step": 7653, + "train/total_loss": 0.208297997713089 + }, + { + "entropy": 8.589914321899414, + "epoch": 0.7567727901918133, + "mean_token_accuracy": 0.7968127727508545, + "num_tokens": 19028452.0, + "step": 7654, + "train/ce_loss": 0.7477198839187622 + }, + { + "epoch": 0.7567727901918133, + "step": 7654, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7567727901918133, + "step": 7654, + "train/total_loss": 0.12945950031280518 + }, + { + "entropy": 8.548648834228516, + "epoch": 0.7568716630413288, + "mean_token_accuracy": 0.7398452758789062, + "num_tokens": 19033960.0, + "step": 7655, + "train/ce_loss": 0.7226057648658752 + }, + { + "epoch": 0.7568716630413288, + "step": 7655, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.7568716630413288, + "step": 7655, + "train/total_loss": 0.08397933095693588 + }, + { + "entropy": 8.774779319763184, + "epoch": 0.7569705358908444, + "mean_token_accuracy": 0.7493638396263123, + "num_tokens": 19039250.0, + "step": 7656, + "train/ce_loss": 0.4386369287967682 + }, + { + "epoch": 0.7569705358908444, + "step": 7656, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7569705358908444, + "step": 7656, + "train/total_loss": 0.0907386988401413 + }, + { + "entropy": 8.82728099822998, + "epoch": 0.7570694087403599, + "mean_token_accuracy": 0.7601156234741211, + "num_tokens": 19044417.0, + "step": 7657, + "train/ce_loss": 0.7033409476280212 + }, + { + "epoch": 0.7570694087403599, + "step": 7657, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7570694087403599, + "step": 7657, + "train/total_loss": 0.11720909923315048 + }, + { + "entropy": 9.062734603881836, + "epoch": 0.7571682815898754, + "mean_token_accuracy": 0.7332268357276917, + "num_tokens": 19049517.0, + "step": 7658, + "train/ce_loss": 0.7107903957366943 + }, + { + "epoch": 0.7571682815898754, + "step": 7658, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7571682815898754, + "step": 7658, + "train/total_loss": 0.1335790455341339 + }, + { + "entropy": 8.766202926635742, + "epoch": 0.757267154439391, + "mean_token_accuracy": 0.7806190848350525, + "num_tokens": 19054753.0, + "step": 7659, + "train/ce_loss": 0.40278881788253784 + }, + { + "epoch": 0.757267154439391, + "step": 7659, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.757267154439391, + "step": 7659, + "train/total_loss": 0.07152888178825378 + }, + { + "epoch": 0.7573660272889065, + "grad_norm": 0.594711184501648, + "learning_rate": 8.108836473322456e-06, + "loss": 0.1295, + "step": 7660 + }, + { + "entropy": 9.129072189331055, + "epoch": 0.7573660272889065, + "mean_token_accuracy": 0.7789473533630371, + "num_tokens": 19059764.0, + "step": 7660, + "train/ce_loss": 0.7323489785194397 + }, + { + "epoch": 0.7573660272889065, + "step": 7660, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7573660272889065, + "step": 7660, + "train/total_loss": 0.12401615083217621 + }, + { + "entropy": 8.544218063354492, + "epoch": 0.7574649001384219, + "mean_token_accuracy": 0.7217194437980652, + "num_tokens": 19065131.0, + "step": 7661, + "train/ce_loss": 0.8963987827301025 + }, + { + "epoch": 0.7574649001384219, + "step": 7661, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7574649001384219, + "step": 7661, + "train/total_loss": 0.11307737976312637 + }, + { + "entropy": 8.525057792663574, + "epoch": 0.7575637729879375, + "mean_token_accuracy": 0.7340301871299744, + "num_tokens": 19070457.0, + "step": 7662, + "train/ce_loss": 0.6648871898651123 + }, + { + "epoch": 0.7575637729879375, + "step": 7662, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7575637729879375, + "step": 7662, + "train/total_loss": 0.10555122047662735 + }, + { + "entropy": 8.885919570922852, + "epoch": 0.757662645837453, + "mean_token_accuracy": 0.7124773859977722, + "num_tokens": 19075429.0, + "step": 7663, + "train/ce_loss": 1.6520631334060454e-06 + }, + { + "epoch": 0.757662645837453, + "step": 7663, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.757662645837453, + "step": 7663, + "train/total_loss": 0.05859391391277313 + }, + { + "entropy": 9.214609146118164, + "epoch": 0.7577615186869685, + "mean_token_accuracy": 0.6887966990470886, + "num_tokens": 19080345.0, + "step": 7664, + "train/ce_loss": 3.675885182019556e-06 + }, + { + "epoch": 0.7577615186869685, + "step": 7664, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7577615186869685, + "step": 7664, + "train/total_loss": 0.03906286880373955 + }, + { + "entropy": 8.944068908691406, + "epoch": 0.7578603915364841, + "mean_token_accuracy": 0.696825385093689, + "num_tokens": 19085386.0, + "step": 7665, + "train/ce_loss": 0.9921472072601318 + }, + { + "epoch": 0.7578603915364841, + "step": 7665, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7578603915364841, + "step": 7665, + "train/total_loss": 0.13827723264694214 + }, + { + "entropy": 8.536942481994629, + "epoch": 0.7579592643859996, + "mean_token_accuracy": 0.7810304164886475, + "num_tokens": 19090667.0, + "step": 7666, + "train/ce_loss": 0.8015487790107727 + }, + { + "epoch": 0.7579592643859996, + "step": 7666, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7579592643859996, + "step": 7666, + "train/total_loss": 0.09968613088130951 + }, + { + "entropy": 8.433038711547852, + "epoch": 0.7580581372355151, + "mean_token_accuracy": 0.7573891878128052, + "num_tokens": 19095914.0, + "step": 7667, + "train/ce_loss": 0.6970519423484802 + }, + { + "epoch": 0.7580581372355151, + "step": 7667, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.7580581372355151, + "step": 7667, + "train/total_loss": 0.15564268827438354 + }, + { + "entropy": 9.189191818237305, + "epoch": 0.7581570100850307, + "mean_token_accuracy": 0.8009592294692993, + "num_tokens": 19100740.0, + "step": 7668, + "train/ce_loss": 1.1578792333602905 + }, + { + "epoch": 0.7581570100850307, + "step": 7668, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7581570100850307, + "step": 7668, + "train/total_loss": 0.18610042333602905 + }, + { + "entropy": 8.543998718261719, + "epoch": 0.7582558829345462, + "mean_token_accuracy": 0.7582417726516724, + "num_tokens": 19106022.0, + "step": 7669, + "train/ce_loss": 0.551040530204773 + }, + { + "epoch": 0.7582558829345462, + "step": 7669, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7582558829345462, + "step": 7669, + "train/total_loss": 0.08635405451059341 + }, + { + "entropy": 8.42531967163086, + "epoch": 0.7583547557840618, + "mean_token_accuracy": 0.7783669233322144, + "num_tokens": 19111455.0, + "step": 7670, + "train/ce_loss": 0.9934836626052856 + }, + { + "epoch": 0.7583547557840618, + "step": 7670, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7583547557840618, + "step": 7670, + "train/total_loss": 0.16184836626052856 + }, + { + "entropy": 8.398706436157227, + "epoch": 0.7584536286335772, + "mean_token_accuracy": 0.770893394947052, + "num_tokens": 19116607.0, + "step": 7671, + "train/ce_loss": 0.5111764073371887 + }, + { + "epoch": 0.7584536286335772, + "step": 7671, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7584536286335772, + "step": 7671, + "train/total_loss": 0.09799264371395111 + }, + { + "entropy": 8.344941139221191, + "epoch": 0.7585525014830927, + "mean_token_accuracy": 0.7381930351257324, + "num_tokens": 19122070.0, + "step": 7672, + "train/ce_loss": 1.1934471130371094 + }, + { + "epoch": 0.7585525014830927, + "step": 7672, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.7585525014830927, + "step": 7672, + "train/total_loss": 0.19746971130371094 + }, + { + "entropy": 8.620504379272461, + "epoch": 0.7586513743326083, + "mean_token_accuracy": 0.7267637252807617, + "num_tokens": 19127440.0, + "step": 7673, + "train/ce_loss": 1.2869994640350342 + }, + { + "epoch": 0.7586513743326083, + "step": 7673, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.7586513743326083, + "step": 7673, + "train/total_loss": 0.23807494342327118 + }, + { + "entropy": 8.172987937927246, + "epoch": 0.7587502471821238, + "mean_token_accuracy": 0.7366120219230652, + "num_tokens": 19132849.0, + "step": 7674, + "train/ce_loss": 0.7104565501213074 + }, + { + "epoch": 0.7587502471821238, + "step": 7674, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7587502471821238, + "step": 7674, + "train/total_loss": 0.1022956594824791 + }, + { + "entropy": 8.5946683883667, + "epoch": 0.7588491200316393, + "mean_token_accuracy": 0.8109685182571411, + "num_tokens": 19138180.0, + "step": 7675, + "train/ce_loss": 0.5208238363265991 + }, + { + "epoch": 0.7588491200316393, + "step": 7675, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7588491200316393, + "step": 7675, + "train/total_loss": 0.07161363959312439 + }, + { + "entropy": 8.85075569152832, + "epoch": 0.7589479928811549, + "mean_token_accuracy": 0.8359073400497437, + "num_tokens": 19143134.0, + "step": 7676, + "train/ce_loss": 1.2283612489700317 + }, + { + "epoch": 0.7589479928811549, + "step": 7676, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7589479928811549, + "step": 7676, + "train/total_loss": 0.1540861278772354 + }, + { + "entropy": 9.028573989868164, + "epoch": 0.7590468657306704, + "mean_token_accuracy": 0.7280265092849731, + "num_tokens": 19148196.0, + "step": 7677, + "train/ce_loss": 1.2335330247879028 + }, + { + "epoch": 0.7590468657306704, + "step": 7677, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.7590468657306704, + "step": 7677, + "train/total_loss": 0.21710330247879028 + }, + { + "entropy": 8.929908752441406, + "epoch": 0.7591457385801859, + "mean_token_accuracy": 0.7253731489181519, + "num_tokens": 19153459.0, + "step": 7678, + "train/ce_loss": 1.0745279788970947 + }, + { + "epoch": 0.7591457385801859, + "step": 7678, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7591457385801859, + "step": 7678, + "train/total_loss": 0.15042155981063843 + }, + { + "entropy": 8.580034255981445, + "epoch": 0.7592446114297015, + "mean_token_accuracy": 0.7003567218780518, + "num_tokens": 19158817.0, + "step": 7679, + "train/ce_loss": 0.9963113069534302 + }, + { + "epoch": 0.7592446114297015, + "step": 7679, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.7592446114297015, + "step": 7679, + "train/total_loss": 0.16603738069534302 + }, + { + "epoch": 0.7593434842792169, + "grad_norm": 0.6969523429870605, + "learning_rate": 8.103891608564506e-06, + "loss": 0.1333, + "step": 7680 + }, + { + "entropy": 8.960241317749023, + "epoch": 0.7593434842792169, + "mean_token_accuracy": 0.7313974499702454, + "num_tokens": 19163806.0, + "step": 7680, + "train/ce_loss": 1.0162924528121948 + }, + { + "epoch": 0.7593434842792169, + "step": 7680, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.7593434842792169, + "step": 7680, + "train/total_loss": 0.21100425720214844 + }, + { + "entropy": 8.218679428100586, + "epoch": 0.7594423571287324, + "mean_token_accuracy": 0.7494692206382751, + "num_tokens": 19169271.0, + "step": 7681, + "train/ce_loss": 0.5055243968963623 + }, + { + "epoch": 0.7594423571287324, + "step": 7681, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7594423571287324, + "step": 7681, + "train/total_loss": 0.06617744266986847 + }, + { + "entropy": 8.814424514770508, + "epoch": 0.759541229978248, + "mean_token_accuracy": 0.6930533051490784, + "num_tokens": 19174363.0, + "step": 7682, + "train/ce_loss": 1.045782446861267 + }, + { + "epoch": 0.759541229978248, + "step": 7682, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.759541229978248, + "step": 7682, + "train/total_loss": 0.15145325660705566 + }, + { + "entropy": 8.498937606811523, + "epoch": 0.7596401028277635, + "mean_token_accuracy": 0.6663055419921875, + "num_tokens": 19179733.0, + "step": 7683, + "train/ce_loss": 1.204154372215271 + }, + { + "epoch": 0.7596401028277635, + "step": 7683, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7596401028277635, + "step": 7683, + "train/total_loss": 0.15557169914245605 + }, + { + "entropy": 8.231889724731445, + "epoch": 0.759738975677279, + "mean_token_accuracy": 0.7150654792785645, + "num_tokens": 19185097.0, + "step": 7684, + "train/ce_loss": 0.5256108045578003 + }, + { + "epoch": 0.759738975677279, + "step": 7684, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.759738975677279, + "step": 7684, + "train/total_loss": 0.10334233194589615 + }, + { + "entropy": 8.79952621459961, + "epoch": 0.7598378485267946, + "mean_token_accuracy": 0.7369901537895203, + "num_tokens": 19190257.0, + "step": 7685, + "train/ce_loss": 2.6327656996727455e-06 + }, + { + "epoch": 0.7598378485267946, + "step": 7685, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7598378485267946, + "step": 7685, + "train/total_loss": 0.03515651449561119 + }, + { + "entropy": 8.854053497314453, + "epoch": 0.7599367213763101, + "mean_token_accuracy": 0.7942177057266235, + "num_tokens": 19195280.0, + "step": 7686, + "train/ce_loss": 2.11320639209589e-06 + }, + { + "epoch": 0.7599367213763101, + "step": 7686, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7599367213763101, + "step": 7686, + "train/total_loss": 0.03515646234154701 + }, + { + "entropy": 8.679353713989258, + "epoch": 0.7600355942258256, + "mean_token_accuracy": 0.7257217764854431, + "num_tokens": 19200515.0, + "step": 7687, + "train/ce_loss": 1.6108123064041138 + }, + { + "epoch": 0.7600355942258256, + "step": 7687, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7600355942258256, + "step": 7687, + "train/total_loss": 0.2157687395811081 + }, + { + "entropy": 8.521926879882812, + "epoch": 0.7601344670753412, + "mean_token_accuracy": 0.7382199168205261, + "num_tokens": 19205680.0, + "step": 7688, + "train/ce_loss": 0.5365400910377502 + }, + { + "epoch": 0.7601344670753412, + "step": 7688, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7601344670753412, + "step": 7688, + "train/total_loss": 0.1161540150642395 + }, + { + "entropy": 8.752731323242188, + "epoch": 0.7602333399248566, + "mean_token_accuracy": 0.6937212944030762, + "num_tokens": 19210798.0, + "step": 7689, + "train/ce_loss": 7.696427246628446e-07 + }, + { + "epoch": 0.7602333399248566, + "step": 7689, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7602333399248566, + "step": 7689, + "train/total_loss": 0.01562507636845112 + }, + { + "entropy": 9.152226448059082, + "epoch": 0.7603322127743721, + "mean_token_accuracy": 0.8024096488952637, + "num_tokens": 19215634.0, + "step": 7690, + "train/ce_loss": 2.3737229639664292e-06 + }, + { + "epoch": 0.7603322127743721, + "step": 7690, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7603322127743721, + "step": 7690, + "train/total_loss": 0.0429689884185791 + }, + { + "entropy": 8.405892372131348, + "epoch": 0.7604310856238877, + "mean_token_accuracy": 0.7084308862686157, + "num_tokens": 19220961.0, + "step": 7691, + "train/ce_loss": 1.2519909143447876 + }, + { + "epoch": 0.7604310856238877, + "step": 7691, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7604310856238877, + "step": 7691, + "train/total_loss": 0.179886594414711 + }, + { + "entropy": 8.562359809875488, + "epoch": 0.7605299584734032, + "mean_token_accuracy": 0.7176079750061035, + "num_tokens": 19226386.0, + "step": 7692, + "train/ce_loss": 0.8045241236686707 + }, + { + "epoch": 0.7605299584734032, + "step": 7692, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7605299584734032, + "step": 7692, + "train/total_loss": 0.13123366236686707 + }, + { + "entropy": 8.601509094238281, + "epoch": 0.7606288313229187, + "mean_token_accuracy": 0.7753058671951294, + "num_tokens": 19231750.0, + "step": 7693, + "train/ce_loss": 1.0604190826416016 + }, + { + "epoch": 0.7606288313229187, + "step": 7693, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.7606288313229187, + "step": 7693, + "train/total_loss": 0.17244815826416016 + }, + { + "entropy": 8.766267776489258, + "epoch": 0.7607277041724343, + "mean_token_accuracy": 0.7398843765258789, + "num_tokens": 19236937.0, + "step": 7694, + "train/ce_loss": 0.9179567694664001 + }, + { + "epoch": 0.7607277041724343, + "step": 7694, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.7607277041724343, + "step": 7694, + "train/total_loss": 0.1582019329071045 + }, + { + "entropy": 9.615768432617188, + "epoch": 0.7608265770219498, + "mean_token_accuracy": 0.7142857313156128, + "num_tokens": 19241813.0, + "step": 7695, + "train/ce_loss": 1.6348321437835693 + }, + { + "epoch": 0.7608265770219498, + "step": 7695, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.7608265770219498, + "step": 7695, + "train/total_loss": 0.23770196735858917 + }, + { + "entropy": 9.021735191345215, + "epoch": 0.7609254498714653, + "mean_token_accuracy": 0.7733812928199768, + "num_tokens": 19246819.0, + "step": 7696, + "train/ce_loss": 1.9381379843252944e-06 + }, + { + "epoch": 0.7609254498714653, + "step": 7696, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7609254498714653, + "step": 7696, + "train/total_loss": 0.04296894371509552 + }, + { + "entropy": 8.358237266540527, + "epoch": 0.7610243227209809, + "mean_token_accuracy": 0.7392776608467102, + "num_tokens": 19252203.0, + "step": 7697, + "train/ce_loss": 0.967032790184021 + }, + { + "epoch": 0.7610243227209809, + "step": 7697, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.7610243227209809, + "step": 7697, + "train/total_loss": 0.17092204093933105 + }, + { + "entropy": 8.386516571044922, + "epoch": 0.7611231955704963, + "mean_token_accuracy": 0.7639034390449524, + "num_tokens": 19257644.0, + "step": 7698, + "train/ce_loss": 0.7730222344398499 + }, + { + "epoch": 0.7611231955704963, + "step": 7698, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7611231955704963, + "step": 7698, + "train/total_loss": 0.1280834674835205 + }, + { + "entropy": 9.069533348083496, + "epoch": 0.7612220684200118, + "mean_token_accuracy": 0.7192192077636719, + "num_tokens": 19262769.0, + "step": 7699, + "train/ce_loss": 0.921979546546936 + }, + { + "epoch": 0.7612220684200118, + "step": 7699, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.7612220684200118, + "step": 7699, + "train/total_loss": 0.1664167046546936 + }, + { + "epoch": 0.7613209412695274, + "grad_norm": 0.6902703046798706, + "learning_rate": 8.098946743806557e-06, + "loss": 0.1421, + "step": 7700 + }, + { + "entropy": 8.63135051727295, + "epoch": 0.7613209412695274, + "mean_token_accuracy": 0.6991676688194275, + "num_tokens": 19268070.0, + "step": 7700, + "train/ce_loss": 0.6126058101654053 + }, + { + "epoch": 0.7613209412695274, + "step": 7700, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7613209412695274, + "step": 7700, + "train/total_loss": 0.08860433101654053 + }, + { + "entropy": 9.117762565612793, + "epoch": 0.7614198141190429, + "mean_token_accuracy": 0.7804877758026123, + "num_tokens": 19273207.0, + "step": 7701, + "train/ce_loss": 0.7227744460105896 + }, + { + "epoch": 0.7614198141190429, + "step": 7701, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7614198141190429, + "step": 7701, + "train/total_loss": 0.14258995652198792 + }, + { + "entropy": 8.944087982177734, + "epoch": 0.7615186869685584, + "mean_token_accuracy": 0.7549341917037964, + "num_tokens": 19278312.0, + "step": 7702, + "train/ce_loss": 0.7299985885620117 + }, + { + "epoch": 0.7615186869685584, + "step": 7702, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7615186869685584, + "step": 7702, + "train/total_loss": 0.11206235736608505 + }, + { + "entropy": 8.509510040283203, + "epoch": 0.761617559818074, + "mean_token_accuracy": 0.7464967966079712, + "num_tokens": 19283519.0, + "step": 7703, + "train/ce_loss": 1.4568054676055908 + }, + { + "epoch": 0.761617559818074, + "step": 7703, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.761617559818074, + "step": 7703, + "train/total_loss": 0.20818054676055908 + }, + { + "entropy": 8.294575691223145, + "epoch": 0.7617164326675895, + "mean_token_accuracy": 0.7869177460670471, + "num_tokens": 19289017.0, + "step": 7704, + "train/ce_loss": 0.7559100389480591 + }, + { + "epoch": 0.7617164326675895, + "step": 7704, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7617164326675895, + "step": 7704, + "train/total_loss": 0.11855975538492203 + }, + { + "entropy": 8.896890640258789, + "epoch": 0.761815305517105, + "mean_token_accuracy": 0.7196382284164429, + "num_tokens": 19294196.0, + "step": 7705, + "train/ce_loss": 1.29708731174469 + }, + { + "epoch": 0.761815305517105, + "step": 7705, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.761815305517105, + "step": 7705, + "train/total_loss": 0.16486498713493347 + }, + { + "entropy": 8.722881317138672, + "epoch": 0.7619141783666206, + "mean_token_accuracy": 0.7140718698501587, + "num_tokens": 19299325.0, + "step": 7706, + "train/ce_loss": 1.6583621501922607 + }, + { + "epoch": 0.7619141783666206, + "step": 7706, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7619141783666206, + "step": 7706, + "train/total_loss": 0.23614871501922607 + }, + { + "entropy": 9.242901802062988, + "epoch": 0.762013051216136, + "mean_token_accuracy": 0.7539823055267334, + "num_tokens": 19304310.0, + "step": 7707, + "train/ce_loss": 1.672262191772461 + }, + { + "epoch": 0.762013051216136, + "step": 7707, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.762013051216136, + "step": 7707, + "train/total_loss": 0.23363247513771057 + }, + { + "entropy": 8.592472076416016, + "epoch": 0.7621119240656515, + "mean_token_accuracy": 0.7590798735618591, + "num_tokens": 19309635.0, + "step": 7708, + "train/ce_loss": 0.5775005221366882 + }, + { + "epoch": 0.7621119240656515, + "step": 7708, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7621119240656515, + "step": 7708, + "train/total_loss": 0.08900005370378494 + }, + { + "entropy": 8.906074523925781, + "epoch": 0.7622107969151671, + "mean_token_accuracy": 0.7651098966598511, + "num_tokens": 19314794.0, + "step": 7709, + "train/ce_loss": 0.581382155418396 + }, + { + "epoch": 0.7622107969151671, + "step": 7709, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7622107969151671, + "step": 7709, + "train/total_loss": 0.08938822150230408 + }, + { + "entropy": 8.652992248535156, + "epoch": 0.7623096697646826, + "mean_token_accuracy": 0.7920299172401428, + "num_tokens": 19320053.0, + "step": 7710, + "train/ce_loss": 1.3876551389694214 + }, + { + "epoch": 0.7623096697646826, + "step": 7710, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.7623096697646826, + "step": 7710, + "train/total_loss": 0.22860926389694214 + }, + { + "entropy": 9.118717193603516, + "epoch": 0.7624085426141981, + "mean_token_accuracy": 0.7709029912948608, + "num_tokens": 19325094.0, + "step": 7711, + "train/ce_loss": 1.1129260063171387 + }, + { + "epoch": 0.7624085426141981, + "step": 7711, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.7624085426141981, + "step": 7711, + "train/total_loss": 0.18941760063171387 + }, + { + "entropy": 9.099111557006836, + "epoch": 0.7625074154637137, + "mean_token_accuracy": 0.7534013390541077, + "num_tokens": 19330099.0, + "step": 7712, + "train/ce_loss": 0.6406109929084778 + }, + { + "epoch": 0.7625074154637137, + "step": 7712, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7625074154637137, + "step": 7712, + "train/total_loss": 0.08749859780073166 + }, + { + "entropy": 9.474559783935547, + "epoch": 0.7626062883132292, + "mean_token_accuracy": 0.7681940793991089, + "num_tokens": 19334908.0, + "step": 7713, + "train/ce_loss": 2.5190438464051113e-06 + }, + { + "epoch": 0.7626062883132292, + "step": 7713, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.7626062883132292, + "step": 7713, + "train/total_loss": 0.0742190033197403 + }, + { + "entropy": 8.941295623779297, + "epoch": 0.7627051611627447, + "mean_token_accuracy": 0.7441217303276062, + "num_tokens": 19340115.0, + "step": 7714, + "train/ce_loss": 0.86009681224823 + }, + { + "epoch": 0.7627051611627447, + "step": 7714, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7627051611627447, + "step": 7714, + "train/total_loss": 0.148509681224823 + }, + { + "entropy": 9.293888092041016, + "epoch": 0.7628040340122603, + "mean_token_accuracy": 0.7445651888847351, + "num_tokens": 19345188.0, + "step": 7715, + "train/ce_loss": 6.578445663762977e-06 + }, + { + "epoch": 0.7628040340122603, + "step": 7715, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7628040340122603, + "step": 7715, + "train/total_loss": 0.01562565751373768 + }, + { + "entropy": 8.775178909301758, + "epoch": 0.7629029068617758, + "mean_token_accuracy": 0.7128713130950928, + "num_tokens": 19350483.0, + "step": 7716, + "train/ce_loss": 0.5081300139427185 + }, + { + "epoch": 0.7629029068617758, + "step": 7716, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7629029068617758, + "step": 7716, + "train/total_loss": 0.10550050437450409 + }, + { + "entropy": 8.708213806152344, + "epoch": 0.7630017797112912, + "mean_token_accuracy": 0.7303128242492676, + "num_tokens": 19356076.0, + "step": 7717, + "train/ce_loss": 0.9680280089378357 + }, + { + "epoch": 0.7630017797112912, + "step": 7717, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.7630017797112912, + "step": 7717, + "train/total_loss": 0.17492780089378357 + }, + { + "entropy": 9.063175201416016, + "epoch": 0.7631006525608068, + "mean_token_accuracy": 0.7174280881881714, + "num_tokens": 19361135.0, + "step": 7718, + "train/ce_loss": 1.0929538011550903 + }, + { + "epoch": 0.7631006525608068, + "step": 7718, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.7631006525608068, + "step": 7718, + "train/total_loss": 0.18351413309574127 + }, + { + "entropy": 8.598108291625977, + "epoch": 0.7631995254103223, + "mean_token_accuracy": 0.7684848308563232, + "num_tokens": 19366456.0, + "step": 7719, + "train/ce_loss": 0.6938397288322449 + }, + { + "epoch": 0.7631995254103223, + "step": 7719, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7631995254103223, + "step": 7719, + "train/total_loss": 0.11625897139310837 + }, + { + "epoch": 0.7632983982598378, + "grad_norm": 0.7268943786621094, + "learning_rate": 8.09400187904861e-06, + "loss": 0.1302, + "step": 7720 + }, + { + "entropy": 9.239712715148926, + "epoch": 0.7632983982598378, + "mean_token_accuracy": 0.7887640595436096, + "num_tokens": 19371358.0, + "step": 7720, + "train/ce_loss": 0.9097400903701782 + }, + { + "epoch": 0.7632983982598378, + "step": 7720, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7632983982598378, + "step": 7720, + "train/total_loss": 0.11831776052713394 + }, + { + "entropy": 8.484024047851562, + "epoch": 0.7633972711093534, + "mean_token_accuracy": 0.8020594716072083, + "num_tokens": 19376667.0, + "step": 7721, + "train/ce_loss": 0.716927170753479 + }, + { + "epoch": 0.7633972711093534, + "step": 7721, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7633972711093534, + "step": 7721, + "train/total_loss": 0.13028647005558014 + }, + { + "entropy": 8.522327423095703, + "epoch": 0.7634961439588689, + "mean_token_accuracy": 0.7739975452423096, + "num_tokens": 19381971.0, + "step": 7722, + "train/ce_loss": 0.5556038022041321 + }, + { + "epoch": 0.7634961439588689, + "step": 7722, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7634961439588689, + "step": 7722, + "train/total_loss": 0.08681038022041321 + }, + { + "entropy": 8.600162506103516, + "epoch": 0.7635950168083844, + "mean_token_accuracy": 0.7875862121582031, + "num_tokens": 19387182.0, + "step": 7723, + "train/ce_loss": 0.3888521194458008 + }, + { + "epoch": 0.7635950168083844, + "step": 7723, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7635950168083844, + "step": 7723, + "train/total_loss": 0.0662289634346962 + }, + { + "entropy": 8.18398380279541, + "epoch": 0.7636938896579, + "mean_token_accuracy": 0.7144240140914917, + "num_tokens": 19392683.0, + "step": 7724, + "train/ce_loss": 0.916076123714447 + }, + { + "epoch": 0.7636938896579, + "step": 7724, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7636938896579, + "step": 7724, + "train/total_loss": 0.11113886535167694 + }, + { + "entropy": 9.19622802734375, + "epoch": 0.7637927625074155, + "mean_token_accuracy": 0.7403846383094788, + "num_tokens": 19397348.0, + "step": 7725, + "train/ce_loss": 3.756938167498447e-05 + }, + { + "epoch": 0.7637927625074155, + "step": 7725, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7637927625074155, + "step": 7725, + "train/total_loss": 0.04687875509262085 + }, + { + "entropy": 8.722827911376953, + "epoch": 0.7638916353569309, + "mean_token_accuracy": 0.7213695645332336, + "num_tokens": 19402684.0, + "step": 7726, + "train/ce_loss": 0.4669986367225647 + }, + { + "epoch": 0.7638916353569309, + "step": 7726, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7638916353569309, + "step": 7726, + "train/total_loss": 0.07404361665248871 + }, + { + "entropy": 9.330000877380371, + "epoch": 0.7639905082064465, + "mean_token_accuracy": 0.7877358198165894, + "num_tokens": 19407544.0, + "step": 7727, + "train/ce_loss": 3.991949597548228e-06 + }, + { + "epoch": 0.7639905082064465, + "step": 7727, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7639905082064465, + "step": 7727, + "train/total_loss": 0.031250398606061935 + }, + { + "entropy": 8.632328987121582, + "epoch": 0.764089381055962, + "mean_token_accuracy": 0.732824444770813, + "num_tokens": 19412908.0, + "step": 7728, + "train/ce_loss": 0.6812427043914795 + }, + { + "epoch": 0.764089381055962, + "step": 7728, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.764089381055962, + "step": 7728, + "train/total_loss": 0.09937427192926407 + }, + { + "entropy": 9.08932113647461, + "epoch": 0.7641882539054775, + "mean_token_accuracy": 0.692307710647583, + "num_tokens": 19417906.0, + "step": 7729, + "train/ce_loss": 2.049284375971183e-06 + }, + { + "epoch": 0.7641882539054775, + "step": 7729, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7641882539054775, + "step": 7729, + "train/total_loss": 0.042968954890966415 + }, + { + "entropy": 9.123553276062012, + "epoch": 0.7642871267549931, + "mean_token_accuracy": 0.7763370871543884, + "num_tokens": 19422907.0, + "step": 7730, + "train/ce_loss": 1.2972058057785034 + }, + { + "epoch": 0.7642871267549931, + "step": 7730, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7642871267549931, + "step": 7730, + "train/total_loss": 0.16097058355808258 + }, + { + "entropy": 9.108598709106445, + "epoch": 0.7643859996045086, + "mean_token_accuracy": 0.7902207970619202, + "num_tokens": 19427959.0, + "step": 7731, + "train/ce_loss": 0.8736315965652466 + }, + { + "epoch": 0.7643859996045086, + "step": 7731, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7643859996045086, + "step": 7731, + "train/total_loss": 0.11470691114664078 + }, + { + "entropy": 9.450088500976562, + "epoch": 0.7644848724540241, + "mean_token_accuracy": 0.7854166626930237, + "num_tokens": 19432839.0, + "step": 7732, + "train/ce_loss": 1.6410548369094613e-06 + }, + { + "epoch": 0.7644848724540241, + "step": 7732, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7644848724540241, + "step": 7732, + "train/total_loss": 0.015625163912773132 + }, + { + "entropy": 9.395781517028809, + "epoch": 0.7645837453035397, + "mean_token_accuracy": 0.7614678740501404, + "num_tokens": 19437698.0, + "step": 7733, + "train/ce_loss": 8.319220796693116e-06 + }, + { + "epoch": 0.7645837453035397, + "step": 7733, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7645837453035397, + "step": 7733, + "train/total_loss": 0.03906333073973656 + }, + { + "entropy": 9.136443138122559, + "epoch": 0.7646826181530552, + "mean_token_accuracy": 0.7687296271324158, + "num_tokens": 19442776.0, + "step": 7734, + "train/ce_loss": 1.2883407407571212e-06 + }, + { + "epoch": 0.7646826181530552, + "step": 7734, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7646826181530552, + "step": 7734, + "train/total_loss": 0.015625128522515297 + }, + { + "entropy": 9.37185287475586, + "epoch": 0.7647814910025706, + "mean_token_accuracy": 0.7629063129425049, + "num_tokens": 19447748.0, + "step": 7735, + "train/ce_loss": 0.5439682602882385 + }, + { + "epoch": 0.7647814910025706, + "step": 7735, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7647814910025706, + "step": 7735, + "train/total_loss": 0.11689682304859161 + }, + { + "entropy": 9.62160587310791, + "epoch": 0.7648803638520862, + "mean_token_accuracy": 0.7392995953559875, + "num_tokens": 19452418.0, + "step": 7736, + "train/ce_loss": 4.22458151660976e-06 + }, + { + "epoch": 0.7648803638520862, + "step": 7736, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7648803638520862, + "step": 7736, + "train/total_loss": 0.042969170957803726 + }, + { + "entropy": 9.938680648803711, + "epoch": 0.7649792367016017, + "mean_token_accuracy": 0.7355931997299194, + "num_tokens": 19457076.0, + "step": 7737, + "train/ce_loss": 3.6609737890103133e-06 + }, + { + "epoch": 0.7649792367016017, + "step": 7737, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.7649792367016017, + "step": 7737, + "train/total_loss": 0.08203161507844925 + }, + { + "entropy": 8.995643615722656, + "epoch": 0.7650781095511172, + "mean_token_accuracy": 0.7534818649291992, + "num_tokens": 19462245.0, + "step": 7738, + "train/ce_loss": 0.4997852146625519 + }, + { + "epoch": 0.7650781095511172, + "step": 7738, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7650781095511172, + "step": 7738, + "train/total_loss": 0.11247852444648743 + }, + { + "entropy": 8.558038711547852, + "epoch": 0.7651769824006328, + "mean_token_accuracy": 0.7941550016403198, + "num_tokens": 19467522.0, + "step": 7739, + "train/ce_loss": 0.5175499320030212 + }, + { + "epoch": 0.7651769824006328, + "step": 7739, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.7651769824006328, + "step": 7739, + "train/total_loss": 0.14550499618053436 + }, + { + "epoch": 0.7652758552501483, + "grad_norm": 0.7859126925468445, + "learning_rate": 8.08905701429066e-06, + "loss": 0.1202, + "step": 7740 + }, + { + "entropy": 8.586603164672852, + "epoch": 0.7652758552501483, + "mean_token_accuracy": 0.6853233575820923, + "num_tokens": 19472819.0, + "step": 7740, + "train/ce_loss": 1.1033329963684082 + }, + { + "epoch": 0.7652758552501483, + "step": 7740, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7652758552501483, + "step": 7740, + "train/total_loss": 0.15330204367637634 + }, + { + "entropy": 9.153886795043945, + "epoch": 0.7653747280996638, + "mean_token_accuracy": 0.7885532379150391, + "num_tokens": 19477902.0, + "step": 7741, + "train/ce_loss": 1.0411827564239502 + }, + { + "epoch": 0.7653747280996638, + "step": 7741, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7653747280996638, + "step": 7741, + "train/total_loss": 0.14708703756332397 + }, + { + "entropy": 8.211365699768066, + "epoch": 0.7654736009491794, + "mean_token_accuracy": 0.7018633484840393, + "num_tokens": 19483491.0, + "step": 7742, + "train/ce_loss": 1.3559170961380005 + }, + { + "epoch": 0.7654736009491794, + "step": 7742, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7654736009491794, + "step": 7742, + "train/total_loss": 0.20590421557426453 + }, + { + "entropy": 8.63299560546875, + "epoch": 0.7655724737986949, + "mean_token_accuracy": 0.7731529474258423, + "num_tokens": 19489070.0, + "step": 7743, + "train/ce_loss": 0.27736401557922363 + }, + { + "epoch": 0.7655724737986949, + "step": 7743, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7655724737986949, + "step": 7743, + "train/total_loss": 0.04726765304803848 + }, + { + "entropy": 8.707715034484863, + "epoch": 0.7656713466482103, + "mean_token_accuracy": 0.7670384049415588, + "num_tokens": 19494288.0, + "step": 7744, + "train/ce_loss": 1.2483643293380737 + }, + { + "epoch": 0.7656713466482103, + "step": 7744, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7656713466482103, + "step": 7744, + "train/total_loss": 0.15999269485473633 + }, + { + "entropy": 8.702041625976562, + "epoch": 0.765770219497726, + "mean_token_accuracy": 0.7841463685035706, + "num_tokens": 19499582.0, + "step": 7745, + "train/ce_loss": 0.7944613695144653 + }, + { + "epoch": 0.765770219497726, + "step": 7745, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.765770219497726, + "step": 7745, + "train/total_loss": 0.16928988695144653 + }, + { + "entropy": 8.619491577148438, + "epoch": 0.7658690923472414, + "mean_token_accuracy": 0.7274590134620667, + "num_tokens": 19505160.0, + "step": 7746, + "train/ce_loss": 0.8761892318725586 + }, + { + "epoch": 0.7658690923472414, + "step": 7746, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7658690923472414, + "step": 7746, + "train/total_loss": 0.12277517467737198 + }, + { + "entropy": 8.589058876037598, + "epoch": 0.7659679651967569, + "mean_token_accuracy": 0.7404494285583496, + "num_tokens": 19510518.0, + "step": 7747, + "train/ce_loss": 1.9129421710968018 + }, + { + "epoch": 0.7659679651967569, + "step": 7747, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7659679651967569, + "step": 7747, + "train/total_loss": 0.24207547307014465 + }, + { + "entropy": 8.918684005737305, + "epoch": 0.7660668380462725, + "mean_token_accuracy": 0.7492957711219788, + "num_tokens": 19515653.0, + "step": 7748, + "train/ce_loss": 0.5139737129211426 + }, + { + "epoch": 0.7660668380462725, + "step": 7748, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7660668380462725, + "step": 7748, + "train/total_loss": 0.10999111831188202 + }, + { + "entropy": 8.723108291625977, + "epoch": 0.766165710895788, + "mean_token_accuracy": 0.7662178874015808, + "num_tokens": 19520907.0, + "step": 7749, + "train/ce_loss": 0.4100935757160187 + }, + { + "epoch": 0.766165710895788, + "step": 7749, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.766165710895788, + "step": 7749, + "train/total_loss": 0.07616560906171799 + }, + { + "entropy": 8.271391868591309, + "epoch": 0.7662645837453035, + "mean_token_accuracy": 0.744027316570282, + "num_tokens": 19526263.0, + "step": 7750, + "train/ce_loss": 0.9203963875770569 + }, + { + "epoch": 0.7662645837453035, + "step": 7750, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.7662645837453035, + "step": 7750, + "train/total_loss": 0.15844589471817017 + }, + { + "entropy": 8.823776245117188, + "epoch": 0.7663634565948191, + "mean_token_accuracy": 0.726123571395874, + "num_tokens": 19531429.0, + "step": 7751, + "train/ce_loss": 0.8118509650230408 + }, + { + "epoch": 0.7663634565948191, + "step": 7751, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7663634565948191, + "step": 7751, + "train/total_loss": 0.13587260246276855 + }, + { + "entropy": 9.326594352722168, + "epoch": 0.7664623294443346, + "mean_token_accuracy": 0.7789255976676941, + "num_tokens": 19536297.0, + "step": 7752, + "train/ce_loss": 2.020547071879264e-06 + }, + { + "epoch": 0.7664623294443346, + "step": 7752, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7664623294443346, + "step": 7752, + "train/total_loss": 0.027343951165676117 + }, + { + "entropy": 8.197793960571289, + "epoch": 0.7665612022938502, + "mean_token_accuracy": 0.7110352516174316, + "num_tokens": 19541592.0, + "step": 7753, + "train/ce_loss": 0.9869194030761719 + }, + { + "epoch": 0.7665612022938502, + "step": 7753, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7665612022938502, + "step": 7753, + "train/total_loss": 0.1533794403076172 + }, + { + "entropy": 8.259000778198242, + "epoch": 0.7666600751433656, + "mean_token_accuracy": 0.7468785643577576, + "num_tokens": 19546926.0, + "step": 7754, + "train/ce_loss": 0.7809675931930542 + }, + { + "epoch": 0.7666600751433656, + "step": 7754, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7666600751433656, + "step": 7754, + "train/total_loss": 0.09762801229953766 + }, + { + "entropy": 8.879019737243652, + "epoch": 0.7667589479928811, + "mean_token_accuracy": 0.7185473442077637, + "num_tokens": 19552104.0, + "step": 7755, + "train/ce_loss": 1.5787826776504517 + }, + { + "epoch": 0.7667589479928811, + "step": 7755, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.7667589479928811, + "step": 7755, + "train/total_loss": 0.2867845296859741 + }, + { + "entropy": 8.442885398864746, + "epoch": 0.7668578208423967, + "mean_token_accuracy": 0.6954148411750793, + "num_tokens": 19557481.0, + "step": 7756, + "train/ce_loss": 1.7079004049301147 + }, + { + "epoch": 0.7668578208423967, + "step": 7756, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.7668578208423967, + "step": 7756, + "train/total_loss": 0.26844629645347595 + }, + { + "entropy": 8.493907928466797, + "epoch": 0.7669566936919122, + "mean_token_accuracy": 0.7236994504928589, + "num_tokens": 19562820.0, + "step": 7757, + "train/ce_loss": 0.990061342716217 + }, + { + "epoch": 0.7669566936919122, + "step": 7757, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7669566936919122, + "step": 7757, + "train/total_loss": 0.12634989619255066 + }, + { + "entropy": 8.28232192993164, + "epoch": 0.7670555665414277, + "mean_token_accuracy": 0.6814891695976257, + "num_tokens": 19568273.0, + "step": 7758, + "train/ce_loss": 0.9605075120925903 + }, + { + "epoch": 0.7670555665414277, + "step": 7758, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.7670555665414277, + "step": 7758, + "train/total_loss": 0.17026950418949127 + }, + { + "entropy": 8.471275329589844, + "epoch": 0.7671544393909433, + "mean_token_accuracy": 0.7878788113594055, + "num_tokens": 19573710.0, + "step": 7759, + "train/ce_loss": 0.8600389361381531 + }, + { + "epoch": 0.7671544393909433, + "step": 7759, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7671544393909433, + "step": 7759, + "train/total_loss": 0.14069139957427979 + }, + { + "epoch": 0.7672533122404588, + "grad_norm": 0.5270076990127563, + "learning_rate": 8.084112149532712e-06, + "loss": 0.134, + "step": 7760 + }, + { + "entropy": 8.327375411987305, + "epoch": 0.7672533122404588, + "mean_token_accuracy": 0.7164750695228577, + "num_tokens": 19579186.0, + "step": 7760, + "train/ce_loss": 0.4279042184352875 + }, + { + "epoch": 0.7672533122404588, + "step": 7760, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7672533122404588, + "step": 7760, + "train/total_loss": 0.08185292780399323 + }, + { + "entropy": 8.624473571777344, + "epoch": 0.7673521850899743, + "mean_token_accuracy": 0.7292225360870361, + "num_tokens": 19584373.0, + "step": 7761, + "train/ce_loss": 1.4330039448395837e-06 + }, + { + "epoch": 0.7673521850899743, + "step": 7761, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7673521850899743, + "step": 7761, + "train/total_loss": 0.03125014156103134 + }, + { + "entropy": 8.348209381103516, + "epoch": 0.7674510579394899, + "mean_token_accuracy": 0.769070029258728, + "num_tokens": 19589785.0, + "step": 7762, + "train/ce_loss": 0.5385426878929138 + }, + { + "epoch": 0.7674510579394899, + "step": 7762, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7674510579394899, + "step": 7762, + "train/total_loss": 0.06947927176952362 + }, + { + "entropy": 8.514656066894531, + "epoch": 0.7675499307890054, + "mean_token_accuracy": 0.6915422677993774, + "num_tokens": 19594851.0, + "step": 7763, + "train/ce_loss": 2.1961233615875244 + }, + { + "epoch": 0.7675499307890054, + "step": 7763, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7675499307890054, + "step": 7763, + "train/total_loss": 0.27820611000061035 + }, + { + "entropy": 8.7510347366333, + "epoch": 0.7676488036385208, + "mean_token_accuracy": 0.7577937841415405, + "num_tokens": 19600108.0, + "step": 7764, + "train/ce_loss": 0.3642151355743408 + }, + { + "epoch": 0.7676488036385208, + "step": 7764, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7676488036385208, + "step": 7764, + "train/total_loss": 0.0832965150475502 + }, + { + "entropy": 8.535284042358398, + "epoch": 0.7677476764880364, + "mean_token_accuracy": 0.7045454382896423, + "num_tokens": 19605586.0, + "step": 7765, + "train/ce_loss": 0.8237572312355042 + }, + { + "epoch": 0.7677476764880364, + "step": 7765, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7677476764880364, + "step": 7765, + "train/total_loss": 0.10971947759389877 + }, + { + "entropy": 8.440315246582031, + "epoch": 0.7678465493375519, + "mean_token_accuracy": 0.7381489872932434, + "num_tokens": 19610983.0, + "step": 7766, + "train/ce_loss": 0.9059736132621765 + }, + { + "epoch": 0.7678465493375519, + "step": 7766, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7678465493375519, + "step": 7766, + "train/total_loss": 0.11794111132621765 + }, + { + "entropy": 8.703001022338867, + "epoch": 0.7679454221870674, + "mean_token_accuracy": 0.6919592022895813, + "num_tokens": 19616312.0, + "step": 7767, + "train/ce_loss": 1.5078632831573486 + }, + { + "epoch": 0.7679454221870674, + "step": 7767, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.7679454221870674, + "step": 7767, + "train/total_loss": 0.24453632533550262 + }, + { + "entropy": 9.096664428710938, + "epoch": 0.768044295036583, + "mean_token_accuracy": 0.7035830616950989, + "num_tokens": 19621410.0, + "step": 7768, + "train/ce_loss": 1.1963167190551758 + }, + { + "epoch": 0.768044295036583, + "step": 7768, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.768044295036583, + "step": 7768, + "train/total_loss": 0.21728792786598206 + }, + { + "entropy": 8.492799758911133, + "epoch": 0.7681431678860985, + "mean_token_accuracy": 0.7325301170349121, + "num_tokens": 19626713.0, + "step": 7769, + "train/ce_loss": 0.6306684613227844 + }, + { + "epoch": 0.7681431678860985, + "step": 7769, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.7681431678860985, + "step": 7769, + "train/total_loss": 0.14119184017181396 + }, + { + "entropy": 8.575288772583008, + "epoch": 0.768242040735614, + "mean_token_accuracy": 0.7642276287078857, + "num_tokens": 19632059.0, + "step": 7770, + "train/ce_loss": 0.5693222284317017 + }, + { + "epoch": 0.768242040735614, + "step": 7770, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.768242040735614, + "step": 7770, + "train/total_loss": 0.0920884758234024 + }, + { + "entropy": 8.951510429382324, + "epoch": 0.7683409135851296, + "mean_token_accuracy": 0.6703296899795532, + "num_tokens": 19637046.0, + "step": 7771, + "train/ce_loss": 2.1792502403259277 + }, + { + "epoch": 0.7683409135851296, + "step": 7771, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7683409135851296, + "step": 7771, + "train/total_loss": 0.2726125121116638 + }, + { + "entropy": 8.60496711730957, + "epoch": 0.7684397864346451, + "mean_token_accuracy": 0.7296416759490967, + "num_tokens": 19642430.0, + "step": 7772, + "train/ce_loss": 0.6584407687187195 + }, + { + "epoch": 0.7684397864346451, + "step": 7772, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7684397864346451, + "step": 7772, + "train/total_loss": 0.1127190813422203 + }, + { + "entropy": 8.587507247924805, + "epoch": 0.7685386592841605, + "mean_token_accuracy": 0.7205567359924316, + "num_tokens": 19647828.0, + "step": 7773, + "train/ce_loss": 0.542307436466217 + }, + { + "epoch": 0.7685386592841605, + "step": 7773, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7685386592841605, + "step": 7773, + "train/total_loss": 0.09329324960708618 + }, + { + "entropy": 8.893842697143555, + "epoch": 0.7686375321336761, + "mean_token_accuracy": 0.770380437374115, + "num_tokens": 19652950.0, + "step": 7774, + "train/ce_loss": 1.1178854703903198 + }, + { + "epoch": 0.7686375321336761, + "step": 7774, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.7686375321336761, + "step": 7774, + "train/total_loss": 0.2289760410785675 + }, + { + "entropy": 8.655986785888672, + "epoch": 0.7687364049831916, + "mean_token_accuracy": 0.7772215008735657, + "num_tokens": 19658209.0, + "step": 7775, + "train/ce_loss": 0.47028452157974243 + }, + { + "epoch": 0.7687364049831916, + "step": 7775, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7687364049831916, + "step": 7775, + "train/total_loss": 0.08218470215797424 + }, + { + "entropy": 9.172534942626953, + "epoch": 0.7688352778327071, + "mean_token_accuracy": 0.7229129672050476, + "num_tokens": 19663189.0, + "step": 7776, + "train/ce_loss": 1.1840168099297443e-06 + }, + { + "epoch": 0.7688352778327071, + "step": 7776, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7688352778327071, + "step": 7776, + "train/total_loss": 0.01562511920928955 + }, + { + "entropy": 8.723504066467285, + "epoch": 0.7689341506822227, + "mean_token_accuracy": 0.7420494556427002, + "num_tokens": 19668212.0, + "step": 7777, + "train/ce_loss": 1.058593988418579 + }, + { + "epoch": 0.7689341506822227, + "step": 7777, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7689341506822227, + "step": 7777, + "train/total_loss": 0.1527343988418579 + }, + { + "entropy": 8.734945297241211, + "epoch": 0.7690330235317382, + "mean_token_accuracy": 0.7875568866729736, + "num_tokens": 19673230.0, + "step": 7778, + "train/ce_loss": 0.6322168111801147 + }, + { + "epoch": 0.7690330235317382, + "step": 7778, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7690330235317382, + "step": 7778, + "train/total_loss": 0.12181543558835983 + }, + { + "entropy": 8.808191299438477, + "epoch": 0.7691318963812537, + "mean_token_accuracy": 0.7994100451469421, + "num_tokens": 19678355.0, + "step": 7779, + "train/ce_loss": 0.9891080260276794 + }, + { + "epoch": 0.7691318963812537, + "step": 7779, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7691318963812537, + "step": 7779, + "train/total_loss": 0.13406705856323242 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 0.5392338633537292, + "learning_rate": 8.07916728477476e-06, + "loss": 0.1423, + "step": 7780 + }, + { + "entropy": 8.632076263427734, + "epoch": 0.7692307692307693, + "mean_token_accuracy": 0.740406334400177, + "num_tokens": 19683747.0, + "step": 7780, + "train/ce_loss": 0.6915012001991272 + }, + { + "epoch": 0.7692307692307693, + "step": 7780, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.7692307692307693, + "step": 7780, + "train/total_loss": 0.13555637001991272 + }, + { + "entropy": 8.572942733764648, + "epoch": 0.7693296420802848, + "mean_token_accuracy": 0.7436463832855225, + "num_tokens": 19689155.0, + "step": 7781, + "train/ce_loss": 1.505046010017395 + }, + { + "epoch": 0.7693296420802848, + "step": 7781, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7693296420802848, + "step": 7781, + "train/total_loss": 0.19737960398197174 + }, + { + "entropy": 8.728029251098633, + "epoch": 0.7694285149298002, + "mean_token_accuracy": 0.7652778029441833, + "num_tokens": 19694368.0, + "step": 7782, + "train/ce_loss": 0.4747575521469116 + }, + { + "epoch": 0.7694285149298002, + "step": 7782, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7694285149298002, + "step": 7782, + "train/total_loss": 0.11778825521469116 + }, + { + "entropy": 8.992225646972656, + "epoch": 0.7695273877793158, + "mean_token_accuracy": 0.8357142806053162, + "num_tokens": 19699488.0, + "step": 7783, + "train/ce_loss": 0.41708889603614807 + }, + { + "epoch": 0.7695273877793158, + "step": 7783, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7695273877793158, + "step": 7783, + "train/total_loss": 0.07295888662338257 + }, + { + "entropy": 9.007128715515137, + "epoch": 0.7696262606288313, + "mean_token_accuracy": 0.7003424763679504, + "num_tokens": 19704472.0, + "step": 7784, + "train/ce_loss": 2.804467840178404e-06 + }, + { + "epoch": 0.7696262606288313, + "step": 7784, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.7696262606288313, + "step": 7784, + "train/total_loss": 0.08203153312206268 + }, + { + "entropy": 8.808753967285156, + "epoch": 0.7697251334783468, + "mean_token_accuracy": 0.7394578456878662, + "num_tokens": 19709610.0, + "step": 7785, + "train/ce_loss": 5.0079838729288895e-06 + }, + { + "epoch": 0.7697251334783468, + "step": 7785, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7697251334783468, + "step": 7785, + "train/total_loss": 0.042969249188899994 + }, + { + "entropy": 8.428844451904297, + "epoch": 0.7698240063278624, + "mean_token_accuracy": 0.7607496976852417, + "num_tokens": 19715017.0, + "step": 7786, + "train/ce_loss": 0.3310222625732422 + }, + { + "epoch": 0.7698240063278624, + "step": 7786, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7698240063278624, + "step": 7786, + "train/total_loss": 0.07997722923755646 + }, + { + "entropy": 8.50446605682373, + "epoch": 0.7699228791773779, + "mean_token_accuracy": 0.8143203854560852, + "num_tokens": 19720320.0, + "step": 7787, + "train/ce_loss": 0.5868676900863647 + }, + { + "epoch": 0.7699228791773779, + "step": 7787, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.7699228791773779, + "step": 7787, + "train/total_loss": 0.0704055204987526 + }, + { + "entropy": 8.214300155639648, + "epoch": 0.7700217520268934, + "mean_token_accuracy": 0.7140077948570251, + "num_tokens": 19725838.0, + "step": 7788, + "train/ce_loss": 0.5881697535514832 + }, + { + "epoch": 0.7700217520268934, + "step": 7788, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7700217520268934, + "step": 7788, + "train/total_loss": 0.09787947684526443 + }, + { + "entropy": 8.687908172607422, + "epoch": 0.770120624876409, + "mean_token_accuracy": 0.7725381255149841, + "num_tokens": 19731025.0, + "step": 7789, + "train/ce_loss": 0.4396521747112274 + }, + { + "epoch": 0.770120624876409, + "step": 7789, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.770120624876409, + "step": 7789, + "train/total_loss": 0.07521522045135498 + }, + { + "entropy": 8.553461074829102, + "epoch": 0.7702194977259245, + "mean_token_accuracy": 0.7950581312179565, + "num_tokens": 19736225.0, + "step": 7790, + "train/ce_loss": 0.9271291494369507 + }, + { + "epoch": 0.7702194977259245, + "step": 7790, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7702194977259245, + "step": 7790, + "train/total_loss": 0.1317754089832306 + }, + { + "entropy": 8.774941444396973, + "epoch": 0.77031837057544, + "mean_token_accuracy": 0.7512626051902771, + "num_tokens": 19741488.0, + "step": 7791, + "train/ce_loss": 1.02925443649292 + }, + { + "epoch": 0.77031837057544, + "step": 7791, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.77031837057544, + "step": 7791, + "train/total_loss": 0.17323794960975647 + }, + { + "entropy": 8.831399917602539, + "epoch": 0.7704172434249555, + "mean_token_accuracy": 0.7074742317199707, + "num_tokens": 19746683.0, + "step": 7792, + "train/ce_loss": 1.3657745122909546 + }, + { + "epoch": 0.7704172434249555, + "step": 7792, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7704172434249555, + "step": 7792, + "train/total_loss": 0.20688995718955994 + }, + { + "entropy": 8.7943115234375, + "epoch": 0.770516116274471, + "mean_token_accuracy": 0.7727952003479004, + "num_tokens": 19751793.0, + "step": 7793, + "train/ce_loss": 0.578055202960968 + }, + { + "epoch": 0.770516116274471, + "step": 7793, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.770516116274471, + "step": 7793, + "train/total_loss": 0.14374302327632904 + }, + { + "entropy": 8.386873245239258, + "epoch": 0.7706149891239865, + "mean_token_accuracy": 0.7150714993476868, + "num_tokens": 19757159.0, + "step": 7794, + "train/ce_loss": 0.5064057111740112 + }, + { + "epoch": 0.7706149891239865, + "step": 7794, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7706149891239865, + "step": 7794, + "train/total_loss": 0.07017181813716888 + }, + { + "entropy": 8.634651184082031, + "epoch": 0.7707138619735021, + "mean_token_accuracy": 0.7295454740524292, + "num_tokens": 19762451.0, + "step": 7795, + "train/ce_loss": 0.6535912752151489 + }, + { + "epoch": 0.7707138619735021, + "step": 7795, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7707138619735021, + "step": 7795, + "train/total_loss": 0.10051538050174713 + }, + { + "entropy": 8.393540382385254, + "epoch": 0.7708127348230176, + "mean_token_accuracy": 0.7106382846832275, + "num_tokens": 19768056.0, + "step": 7796, + "train/ce_loss": 0.6739010810852051 + }, + { + "epoch": 0.7708127348230176, + "step": 7796, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7708127348230176, + "step": 7796, + "train/total_loss": 0.11817135661840439 + }, + { + "entropy": 8.74815845489502, + "epoch": 0.7709116076725331, + "mean_token_accuracy": 0.7723463773727417, + "num_tokens": 19773324.0, + "step": 7797, + "train/ce_loss": 0.4139121174812317 + }, + { + "epoch": 0.7709116076725331, + "step": 7797, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7709116076725331, + "step": 7797, + "train/total_loss": 0.07264120876789093 + }, + { + "entropy": 9.048473358154297, + "epoch": 0.7710104805220487, + "mean_token_accuracy": 0.7356114983558655, + "num_tokens": 19778355.0, + "step": 7798, + "train/ce_loss": 2.1130688310222467e-06 + }, + { + "epoch": 0.7710104805220487, + "step": 7798, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7710104805220487, + "step": 7798, + "train/total_loss": 0.05859396234154701 + }, + { + "entropy": 8.616962432861328, + "epoch": 0.7711093533715642, + "mean_token_accuracy": 0.7152858972549438, + "num_tokens": 19783653.0, + "step": 7799, + "train/ce_loss": 0.5495577454566956 + }, + { + "epoch": 0.7711093533715642, + "step": 7799, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7711093533715642, + "step": 7799, + "train/total_loss": 0.08620578050613403 + }, + { + "epoch": 0.7712082262210797, + "grad_norm": 0.6130492091178894, + "learning_rate": 8.074222420016813e-06, + "loss": 0.13, + "step": 7800 + }, + { + "entropy": 8.810811996459961, + "epoch": 0.7712082262210797, + "mean_token_accuracy": 0.7312661409378052, + "num_tokens": 19788897.0, + "step": 7800, + "train/ce_loss": 0.4379151165485382 + }, + { + "epoch": 0.7712082262210797, + "step": 7800, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7712082262210797, + "step": 7800, + "train/total_loss": 0.0945727676153183 + }, + { + "entropy": 8.515292167663574, + "epoch": 0.7713070990705952, + "mean_token_accuracy": 0.7429577708244324, + "num_tokens": 19794191.0, + "step": 7801, + "train/ce_loss": 0.9319631457328796 + }, + { + "epoch": 0.7713070990705952, + "step": 7801, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7713070990705952, + "step": 7801, + "train/total_loss": 0.1635088175535202 + }, + { + "entropy": 9.327791213989258, + "epoch": 0.7714059719201107, + "mean_token_accuracy": 0.7540650367736816, + "num_tokens": 19799088.0, + "step": 7802, + "train/ce_loss": 4.191006610199111e-06 + }, + { + "epoch": 0.7714059719201107, + "step": 7802, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7714059719201107, + "step": 7802, + "train/total_loss": 0.027344169095158577 + }, + { + "entropy": 8.99296760559082, + "epoch": 0.7715048447696262, + "mean_token_accuracy": 0.7601810097694397, + "num_tokens": 19804183.0, + "step": 7803, + "train/ce_loss": 0.6521633863449097 + }, + { + "epoch": 0.7715048447696262, + "step": 7803, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7715048447696262, + "step": 7803, + "train/total_loss": 0.08865384012460709 + }, + { + "entropy": 8.937102317810059, + "epoch": 0.7716037176191418, + "mean_token_accuracy": 0.7731829285621643, + "num_tokens": 19809453.0, + "step": 7804, + "train/ce_loss": 1.5798154890944716e-06 + }, + { + "epoch": 0.7716037176191418, + "step": 7804, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7716037176191418, + "step": 7804, + "train/total_loss": 0.039062656462192535 + }, + { + "entropy": 9.016761779785156, + "epoch": 0.7717025904686573, + "mean_token_accuracy": 0.7131931185722351, + "num_tokens": 19814573.0, + "step": 7805, + "train/ce_loss": 1.0731233358383179 + }, + { + "epoch": 0.7717025904686573, + "step": 7805, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7717025904686573, + "step": 7805, + "train/total_loss": 0.13856233656406403 + }, + { + "entropy": 8.710519790649414, + "epoch": 0.7718014633181728, + "mean_token_accuracy": 0.7759398221969604, + "num_tokens": 19819705.0, + "step": 7806, + "train/ce_loss": 0.8306262493133545 + }, + { + "epoch": 0.7718014633181728, + "step": 7806, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7718014633181728, + "step": 7806, + "train/total_loss": 0.15337511897087097 + }, + { + "entropy": 8.52718734741211, + "epoch": 0.7719003361676884, + "mean_token_accuracy": 0.7378410696983337, + "num_tokens": 19825064.0, + "step": 7807, + "train/ce_loss": 0.7315281629562378 + }, + { + "epoch": 0.7719003361676884, + "step": 7807, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7719003361676884, + "step": 7807, + "train/total_loss": 0.1356528103351593 + }, + { + "entropy": 8.339141845703125, + "epoch": 0.7719992090172039, + "mean_token_accuracy": 0.7184035181999207, + "num_tokens": 19830435.0, + "step": 7808, + "train/ce_loss": 0.5292571783065796 + }, + { + "epoch": 0.7719992090172039, + "step": 7808, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7719992090172039, + "step": 7808, + "train/total_loss": 0.0802694708108902 + }, + { + "entropy": 9.055339813232422, + "epoch": 0.7720980818667194, + "mean_token_accuracy": 0.7195122241973877, + "num_tokens": 19835580.0, + "step": 7809, + "train/ce_loss": 1.4505902528762817 + }, + { + "epoch": 0.7720980818667194, + "step": 7809, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7720980818667194, + "step": 7809, + "train/total_loss": 0.1958402842283249 + }, + { + "entropy": 9.082110404968262, + "epoch": 0.772196954716235, + "mean_token_accuracy": 0.7423664331436157, + "num_tokens": 19840588.0, + "step": 7810, + "train/ce_loss": 1.3220585584640503 + }, + { + "epoch": 0.772196954716235, + "step": 7810, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.772196954716235, + "step": 7810, + "train/total_loss": 0.18298710882663727 + }, + { + "entropy": 8.77676773071289, + "epoch": 0.7722958275657504, + "mean_token_accuracy": 0.7403461933135986, + "num_tokens": 19845779.0, + "step": 7811, + "train/ce_loss": 0.6185296177864075 + }, + { + "epoch": 0.7722958275657504, + "step": 7811, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7722958275657504, + "step": 7811, + "train/total_loss": 0.09310296177864075 + }, + { + "entropy": 8.527267456054688, + "epoch": 0.7723947004152659, + "mean_token_accuracy": 0.7119205594062805, + "num_tokens": 19851209.0, + "step": 7812, + "train/ce_loss": 0.42909958958625793 + }, + { + "epoch": 0.7723947004152659, + "step": 7812, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7723947004152659, + "step": 7812, + "train/total_loss": 0.08587871491909027 + }, + { + "entropy": 8.80923080444336, + "epoch": 0.7724935732647815, + "mean_token_accuracy": 0.7142857313156128, + "num_tokens": 19856396.0, + "step": 7813, + "train/ce_loss": 1.7006196975708008 + }, + { + "epoch": 0.7724935732647815, + "step": 7813, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7724935732647815, + "step": 7813, + "train/total_loss": 0.24037447571754456 + }, + { + "entropy": 8.203269958496094, + "epoch": 0.772592446114297, + "mean_token_accuracy": 0.7153153419494629, + "num_tokens": 19861948.0, + "step": 7814, + "train/ce_loss": 0.7045519948005676 + }, + { + "epoch": 0.772592446114297, + "step": 7814, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.772592446114297, + "step": 7814, + "train/total_loss": 0.12123645097017288 + }, + { + "entropy": 9.388669967651367, + "epoch": 0.7726913189638125, + "mean_token_accuracy": 0.7602591514587402, + "num_tokens": 19866819.0, + "step": 7815, + "train/ce_loss": 1.1423147916793823 + }, + { + "epoch": 0.7726913189638125, + "step": 7815, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.7726913189638125, + "step": 7815, + "train/total_loss": 0.20407523214817047 + }, + { + "entropy": 9.166075706481934, + "epoch": 0.7727901918133281, + "mean_token_accuracy": 0.8114104866981506, + "num_tokens": 19871845.0, + "step": 7816, + "train/ce_loss": 0.8951804041862488 + }, + { + "epoch": 0.7727901918133281, + "step": 7816, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7727901918133281, + "step": 7816, + "train/total_loss": 0.11295554041862488 + }, + { + "entropy": 8.625469207763672, + "epoch": 0.7728890646628436, + "mean_token_accuracy": 0.7141280174255371, + "num_tokens": 19877227.0, + "step": 7817, + "train/ce_loss": 0.49586185812950134 + }, + { + "epoch": 0.7728890646628436, + "step": 7817, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.7728890646628436, + "step": 7817, + "train/total_loss": 0.1433361917734146 + }, + { + "entropy": 8.812108993530273, + "epoch": 0.7729879375123591, + "mean_token_accuracy": 0.7391952276229858, + "num_tokens": 19882372.0, + "step": 7818, + "train/ce_loss": 1.0758458375930786 + }, + { + "epoch": 0.7729879375123591, + "step": 7818, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.7729879375123591, + "step": 7818, + "train/total_loss": 0.18570959568023682 + }, + { + "entropy": 9.89470100402832, + "epoch": 0.7730868103618747, + "mean_token_accuracy": 0.7422680258750916, + "num_tokens": 19886928.0, + "step": 7819, + "train/ce_loss": 2.782318460958777e-06 + }, + { + "epoch": 0.7730868103618747, + "step": 7819, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7730868103618747, + "step": 7819, + "train/total_loss": 0.015625277534127235 + }, + { + "epoch": 0.7731856832113901, + "grad_norm": 1.0795270204544067, + "learning_rate": 8.069277555258863e-06, + "loss": 0.1332, + "step": 7820 + }, + { + "entropy": 8.894759178161621, + "epoch": 0.7731856832113901, + "mean_token_accuracy": 0.7464183568954468, + "num_tokens": 19892023.0, + "step": 7820, + "train/ce_loss": 0.7307823896408081 + }, + { + "epoch": 0.7731856832113901, + "step": 7820, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7731856832113901, + "step": 7820, + "train/total_loss": 0.11604698747396469 + }, + { + "entropy": 9.182503700256348, + "epoch": 0.7732845560609056, + "mean_token_accuracy": 0.759013295173645, + "num_tokens": 19897012.0, + "step": 7821, + "train/ce_loss": 1.2509524822235107 + }, + { + "epoch": 0.7732845560609056, + "step": 7821, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7732845560609056, + "step": 7821, + "train/total_loss": 0.15634524822235107 + }, + { + "entropy": 8.66482925415039, + "epoch": 0.7733834289104212, + "mean_token_accuracy": 0.709549069404602, + "num_tokens": 19902223.0, + "step": 7822, + "train/ce_loss": 1.0512527227401733 + }, + { + "epoch": 0.7733834289104212, + "step": 7822, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7733834289104212, + "step": 7822, + "train/total_loss": 0.1598127782344818 + }, + { + "entropy": 8.769386291503906, + "epoch": 0.7734823017599367, + "mean_token_accuracy": 0.810693621635437, + "num_tokens": 19907394.0, + "step": 7823, + "train/ce_loss": 0.879429817199707 + }, + { + "epoch": 0.7734823017599367, + "step": 7823, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.7734823017599367, + "step": 7823, + "train/total_loss": 0.16216173768043518 + }, + { + "entropy": 8.26472282409668, + "epoch": 0.7735811746094522, + "mean_token_accuracy": 0.8351115584373474, + "num_tokens": 19912907.0, + "step": 7824, + "train/ce_loss": 0.8675022125244141 + }, + { + "epoch": 0.7735811746094522, + "step": 7824, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.7735811746094522, + "step": 7824, + "train/total_loss": 0.18440647423267365 + }, + { + "entropy": 8.358294486999512, + "epoch": 0.7736800474589678, + "mean_token_accuracy": 0.6705202460289001, + "num_tokens": 19918464.0, + "step": 7825, + "train/ce_loss": 0.9174467325210571 + }, + { + "epoch": 0.7736800474589678, + "step": 7825, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.7736800474589678, + "step": 7825, + "train/total_loss": 0.16986967623233795 + }, + { + "entropy": 8.729463577270508, + "epoch": 0.7737789203084833, + "mean_token_accuracy": 0.7240259647369385, + "num_tokens": 19923848.0, + "step": 7826, + "train/ce_loss": 0.843796968460083 + }, + { + "epoch": 0.7737789203084833, + "step": 7826, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.7737789203084833, + "step": 7826, + "train/total_loss": 0.22109845280647278 + }, + { + "entropy": 8.410392761230469, + "epoch": 0.7738777931579988, + "mean_token_accuracy": 0.7503410577774048, + "num_tokens": 19929120.0, + "step": 7827, + "train/ce_loss": 0.8885999321937561 + }, + { + "epoch": 0.7738777931579988, + "step": 7827, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7738777931579988, + "step": 7827, + "train/total_loss": 0.12010999768972397 + }, + { + "entropy": 8.993410110473633, + "epoch": 0.7739766660075144, + "mean_token_accuracy": 0.800000011920929, + "num_tokens": 19934179.0, + "step": 7828, + "train/ce_loss": 2.7117188437841833e-05 + }, + { + "epoch": 0.7739766660075144, + "step": 7828, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7739766660075144, + "step": 7828, + "train/total_loss": 0.03906521201133728 + }, + { + "entropy": 9.25328254699707, + "epoch": 0.7740755388570298, + "mean_token_accuracy": 0.7326202988624573, + "num_tokens": 19939189.0, + "step": 7829, + "train/ce_loss": 1.108992099761963 + }, + { + "epoch": 0.7740755388570298, + "step": 7829, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7740755388570298, + "step": 7829, + "train/total_loss": 0.1499617099761963 + }, + { + "entropy": 9.52667236328125, + "epoch": 0.7741744117065453, + "mean_token_accuracy": 0.8308605551719666, + "num_tokens": 19943900.0, + "step": 7830, + "train/ce_loss": 2.575121698100702e-06 + }, + { + "epoch": 0.7741744117065453, + "step": 7830, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.7741744117065453, + "step": 7830, + "train/total_loss": 0.011719007976353168 + }, + { + "entropy": 9.501398086547852, + "epoch": 0.7742732845560609, + "mean_token_accuracy": 0.8183807730674744, + "num_tokens": 19948748.0, + "step": 7831, + "train/ce_loss": 2.076280452456558e-06 + }, + { + "epoch": 0.7742732845560609, + "step": 7831, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7742732845560609, + "step": 7831, + "train/total_loss": 0.023437706753611565 + }, + { + "entropy": 8.508647918701172, + "epoch": 0.7743721574055764, + "mean_token_accuracy": 0.7374100685119629, + "num_tokens": 19954051.0, + "step": 7832, + "train/ce_loss": 0.8255497217178345 + }, + { + "epoch": 0.7743721574055764, + "step": 7832, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7743721574055764, + "step": 7832, + "train/total_loss": 0.14114871621131897 + }, + { + "entropy": 8.713500022888184, + "epoch": 0.7744710302550919, + "mean_token_accuracy": 0.7908496856689453, + "num_tokens": 19959266.0, + "step": 7833, + "train/ce_loss": 1.030476450920105 + }, + { + "epoch": 0.7744710302550919, + "step": 7833, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.7744710302550919, + "step": 7833, + "train/total_loss": 0.17726638913154602 + }, + { + "entropy": 8.520734786987305, + "epoch": 0.7745699031046075, + "mean_token_accuracy": 0.8053553104400635, + "num_tokens": 19964739.0, + "step": 7834, + "train/ce_loss": 0.7346962094306946 + }, + { + "epoch": 0.7745699031046075, + "step": 7834, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7745699031046075, + "step": 7834, + "train/total_loss": 0.1164383739233017 + }, + { + "entropy": 8.950132369995117, + "epoch": 0.774668775954123, + "mean_token_accuracy": 0.7404129505157471, + "num_tokens": 19969880.0, + "step": 7835, + "train/ce_loss": 0.5527032613754272 + }, + { + "epoch": 0.774668775954123, + "step": 7835, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.774668775954123, + "step": 7835, + "train/total_loss": 0.09823907911777496 + }, + { + "entropy": 8.807478904724121, + "epoch": 0.7747676488036386, + "mean_token_accuracy": 0.7493734359741211, + "num_tokens": 19975201.0, + "step": 7836, + "train/ce_loss": 1.1441510915756226 + }, + { + "epoch": 0.7747676488036386, + "step": 7836, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7747676488036386, + "step": 7836, + "train/total_loss": 0.17691510915756226 + }, + { + "entropy": 8.433201789855957, + "epoch": 0.7748665216531541, + "mean_token_accuracy": 0.6755037307739258, + "num_tokens": 19980635.0, + "step": 7837, + "train/ce_loss": 0.5758907198905945 + }, + { + "epoch": 0.7748665216531541, + "step": 7837, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7748665216531541, + "step": 7837, + "train/total_loss": 0.1279015690088272 + }, + { + "entropy": 8.277987480163574, + "epoch": 0.7749653945026695, + "mean_token_accuracy": 0.7234762907028198, + "num_tokens": 19986007.0, + "step": 7838, + "train/ce_loss": 0.9494882822036743 + }, + { + "epoch": 0.7749653945026695, + "step": 7838, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7749653945026695, + "step": 7838, + "train/total_loss": 0.13401132822036743 + }, + { + "entropy": 8.675048828125, + "epoch": 0.7750642673521851, + "mean_token_accuracy": 0.7599999904632568, + "num_tokens": 19991128.0, + "step": 7839, + "train/ce_loss": 1.2529450259535224e-06 + }, + { + "epoch": 0.7750642673521851, + "step": 7839, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7750642673521851, + "step": 7839, + "train/total_loss": 0.05078137665987015 + }, + { + "epoch": 0.7751631402017006, + "grad_norm": 0.7123085260391235, + "learning_rate": 8.064332690500916e-06, + "loss": 0.1327, + "step": 7840 + }, + { + "entropy": 9.40606689453125, + "epoch": 0.7751631402017006, + "mean_token_accuracy": 0.7947976589202881, + "num_tokens": 19995897.0, + "step": 7840, + "train/ce_loss": 2.4185565052903257e-06 + }, + { + "epoch": 0.7751631402017006, + "step": 7840, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.7751631402017006, + "step": 7840, + "train/total_loss": 0.0781252384185791 + }, + { + "entropy": 9.181451797485352, + "epoch": 0.7752620130512161, + "mean_token_accuracy": 0.75, + "num_tokens": 20000705.0, + "step": 7841, + "train/ce_loss": 1.2251102924346924 + }, + { + "epoch": 0.7752620130512161, + "step": 7841, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.7752620130512161, + "step": 7841, + "train/total_loss": 0.20454227924346924 + }, + { + "entropy": 8.59832763671875, + "epoch": 0.7753608859007317, + "mean_token_accuracy": 0.8022598624229431, + "num_tokens": 20006031.0, + "step": 7842, + "train/ce_loss": 0.5766798853874207 + }, + { + "epoch": 0.7753608859007317, + "step": 7842, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7753608859007317, + "step": 7842, + "train/total_loss": 0.11626173555850983 + }, + { + "entropy": 8.97331428527832, + "epoch": 0.7754597587502472, + "mean_token_accuracy": 0.7272727489471436, + "num_tokens": 20011111.0, + "step": 7843, + "train/ce_loss": 0.6290687918663025 + }, + { + "epoch": 0.7754597587502472, + "step": 7843, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7754597587502472, + "step": 7843, + "train/total_loss": 0.12150063365697861 + }, + { + "entropy": 8.379244804382324, + "epoch": 0.7755586315997627, + "mean_token_accuracy": 0.7770419716835022, + "num_tokens": 20016430.0, + "step": 7844, + "train/ce_loss": 0.4407392740249634 + }, + { + "epoch": 0.7755586315997627, + "step": 7844, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7755586315997627, + "step": 7844, + "train/total_loss": 0.0753239244222641 + }, + { + "entropy": 8.849149703979492, + "epoch": 0.7756575044492783, + "mean_token_accuracy": 0.7721179723739624, + "num_tokens": 20021657.0, + "step": 7845, + "train/ce_loss": 0.9204382300376892 + }, + { + "epoch": 0.7756575044492783, + "step": 7845, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7756575044492783, + "step": 7845, + "train/total_loss": 0.16235631704330444 + }, + { + "entropy": 8.597208023071289, + "epoch": 0.7757563772987938, + "mean_token_accuracy": 0.6629588603973389, + "num_tokens": 20027039.0, + "step": 7846, + "train/ce_loss": 1.005993366241455 + }, + { + "epoch": 0.7757563772987938, + "step": 7846, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.7757563772987938, + "step": 7846, + "train/total_loss": 0.17872434854507446 + }, + { + "entropy": 8.525615692138672, + "epoch": 0.7758552501483093, + "mean_token_accuracy": 0.7469879388809204, + "num_tokens": 20032241.0, + "step": 7847, + "train/ce_loss": 0.6244814395904541 + }, + { + "epoch": 0.7758552501483093, + "step": 7847, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7758552501483093, + "step": 7847, + "train/total_loss": 0.10541689395904541 + }, + { + "entropy": 8.570363998413086, + "epoch": 0.7759541229978248, + "mean_token_accuracy": 0.8177676796913147, + "num_tokens": 20037628.0, + "step": 7848, + "train/ce_loss": 0.5132327675819397 + }, + { + "epoch": 0.7759541229978248, + "step": 7848, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7759541229978248, + "step": 7848, + "train/total_loss": 0.09819827973842621 + }, + { + "entropy": 8.65130615234375, + "epoch": 0.7760529958473403, + "mean_token_accuracy": 0.7279821634292603, + "num_tokens": 20042958.0, + "step": 7849, + "train/ce_loss": 0.5276586413383484 + }, + { + "epoch": 0.7760529958473403, + "step": 7849, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7760529958473403, + "step": 7849, + "train/total_loss": 0.0996408611536026 + }, + { + "entropy": 8.43891716003418, + "epoch": 0.7761518686968558, + "mean_token_accuracy": 0.6803699731826782, + "num_tokens": 20048392.0, + "step": 7850, + "train/ce_loss": 0.39941999316215515 + }, + { + "epoch": 0.7761518686968558, + "step": 7850, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7761518686968558, + "step": 7850, + "train/total_loss": 0.08291074633598328 + }, + { + "entropy": 8.427473068237305, + "epoch": 0.7762507415463714, + "mean_token_accuracy": 0.7265135645866394, + "num_tokens": 20053815.0, + "step": 7851, + "train/ce_loss": 0.7450786828994751 + }, + { + "epoch": 0.7762507415463714, + "step": 7851, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.7762507415463714, + "step": 7851, + "train/total_loss": 0.14872661232948303 + }, + { + "entropy": 8.747206687927246, + "epoch": 0.7763496143958869, + "mean_token_accuracy": 0.6984318494796753, + "num_tokens": 20059083.0, + "step": 7852, + "train/ce_loss": 1.3575971126556396 + }, + { + "epoch": 0.7763496143958869, + "step": 7852, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.7763496143958869, + "step": 7852, + "train/total_loss": 0.22169721126556396 + }, + { + "entropy": 9.182523727416992, + "epoch": 0.7764484872454024, + "mean_token_accuracy": 0.7379844784736633, + "num_tokens": 20064155.0, + "step": 7853, + "train/ce_loss": 1.1574733257293701 + }, + { + "epoch": 0.7764484872454024, + "step": 7853, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.7764484872454024, + "step": 7853, + "train/total_loss": 0.193872332572937 + }, + { + "entropy": 9.358688354492188, + "epoch": 0.776547360094918, + "mean_token_accuracy": 0.8188976645469666, + "num_tokens": 20068980.0, + "step": 7854, + "train/ce_loss": 0.8725805282592773 + }, + { + "epoch": 0.776547360094918, + "step": 7854, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.776547360094918, + "step": 7854, + "train/total_loss": 0.10288305580615997 + }, + { + "entropy": 9.307382583618164, + "epoch": 0.7766462329444335, + "mean_token_accuracy": 0.744990885257721, + "num_tokens": 20073911.0, + "step": 7855, + "train/ce_loss": 1.0040943622589111 + }, + { + "epoch": 0.7766462329444335, + "step": 7855, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7766462329444335, + "step": 7855, + "train/total_loss": 0.13947194814682007 + }, + { + "entropy": 9.696178436279297, + "epoch": 0.776745105793949, + "mean_token_accuracy": 0.6959064602851868, + "num_tokens": 20078626.0, + "step": 7856, + "train/ce_loss": 2.161552906036377 + }, + { + "epoch": 0.776745105793949, + "step": 7856, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.776745105793949, + "step": 7856, + "train/total_loss": 0.3255302906036377 + }, + { + "entropy": 9.63293743133545, + "epoch": 0.7768439786434645, + "mean_token_accuracy": 0.8255813717842102, + "num_tokens": 20083370.0, + "step": 7857, + "train/ce_loss": 1.1936830282211304 + }, + { + "epoch": 0.7768439786434645, + "step": 7857, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7768439786434645, + "step": 7857, + "train/total_loss": 0.162337064743042 + }, + { + "entropy": 8.957457542419434, + "epoch": 0.77694285149298, + "mean_token_accuracy": 0.7108014225959778, + "num_tokens": 20088397.0, + "step": 7858, + "train/ce_loss": 0.5364453196525574 + }, + { + "epoch": 0.77694285149298, + "step": 7858, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.77694285149298, + "step": 7858, + "train/total_loss": 0.07708203792572021 + }, + { + "entropy": 8.190643310546875, + "epoch": 0.7770417243424955, + "mean_token_accuracy": 0.7280939221382141, + "num_tokens": 20093960.0, + "step": 7859, + "train/ce_loss": 0.6429663896560669 + }, + { + "epoch": 0.7770417243424955, + "step": 7859, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7770417243424955, + "step": 7859, + "train/total_loss": 0.12289039045572281 + }, + { + "epoch": 0.7771405971920111, + "grad_norm": 0.6672192215919495, + "learning_rate": 8.059387825742966e-06, + "loss": 0.1312, + "step": 7860 + }, + { + "entropy": 9.07497501373291, + "epoch": 0.7771405971920111, + "mean_token_accuracy": 0.7948275804519653, + "num_tokens": 20098993.0, + "step": 7860, + "train/ce_loss": 0.8659631609916687 + }, + { + "epoch": 0.7771405971920111, + "step": 7860, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7771405971920111, + "step": 7860, + "train/total_loss": 0.1256588101387024 + }, + { + "entropy": 8.92231559753418, + "epoch": 0.7772394700415266, + "mean_token_accuracy": 0.7153846025466919, + "num_tokens": 20104232.0, + "step": 7861, + "train/ce_loss": 0.7884999513626099 + }, + { + "epoch": 0.7772394700415266, + "step": 7861, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7772394700415266, + "step": 7861, + "train/total_loss": 0.11791249364614487 + }, + { + "entropy": 8.849882125854492, + "epoch": 0.7773383428910421, + "mean_token_accuracy": 0.7496598362922668, + "num_tokens": 20109437.0, + "step": 7862, + "train/ce_loss": 1.1555202007293701 + }, + { + "epoch": 0.7773383428910421, + "step": 7862, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7773383428910421, + "step": 7862, + "train/total_loss": 0.16633327305316925 + }, + { + "entropy": 8.588375091552734, + "epoch": 0.7774372157405577, + "mean_token_accuracy": 0.759115993976593, + "num_tokens": 20114778.0, + "step": 7863, + "train/ce_loss": 0.625299334526062 + }, + { + "epoch": 0.7774372157405577, + "step": 7863, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7774372157405577, + "step": 7863, + "train/total_loss": 0.12502993643283844 + }, + { + "entropy": 8.494949340820312, + "epoch": 0.7775360885900732, + "mean_token_accuracy": 0.7898658514022827, + "num_tokens": 20119964.0, + "step": 7864, + "train/ce_loss": 0.4647732079029083 + }, + { + "epoch": 0.7775360885900732, + "step": 7864, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.7775360885900732, + "step": 7864, + "train/total_loss": 0.1480398178100586 + }, + { + "entropy": 9.21607780456543, + "epoch": 0.7776349614395887, + "mean_token_accuracy": 0.8204697966575623, + "num_tokens": 20124948.0, + "step": 7865, + "train/ce_loss": 8.904492574401957e-07 + }, + { + "epoch": 0.7776349614395887, + "step": 7865, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7776349614395887, + "step": 7865, + "train/total_loss": 0.023437589406967163 + }, + { + "entropy": 9.210044860839844, + "epoch": 0.7777338342891043, + "mean_token_accuracy": 0.7599999904632568, + "num_tokens": 20130002.0, + "step": 7866, + "train/ce_loss": 1.7427250895707402e-06 + }, + { + "epoch": 0.7777338342891043, + "step": 7866, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7777338342891043, + "step": 7866, + "train/total_loss": 0.03906267508864403 + }, + { + "entropy": 8.752881050109863, + "epoch": 0.7778327071386197, + "mean_token_accuracy": 0.699999988079071, + "num_tokens": 20135281.0, + "step": 7867, + "train/ce_loss": 0.5289360284805298 + }, + { + "epoch": 0.7778327071386197, + "step": 7867, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7778327071386197, + "step": 7867, + "train/total_loss": 0.09976860880851746 + }, + { + "entropy": 9.038463592529297, + "epoch": 0.7779315799881352, + "mean_token_accuracy": 0.6651446223258972, + "num_tokens": 20140378.0, + "step": 7868, + "train/ce_loss": 1.573218822479248 + }, + { + "epoch": 0.7779315799881352, + "step": 7868, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7779315799881352, + "step": 7868, + "train/total_loss": 0.22763438522815704 + }, + { + "entropy": 8.640933990478516, + "epoch": 0.7780304528376508, + "mean_token_accuracy": 0.7322134375572205, + "num_tokens": 20145873.0, + "step": 7869, + "train/ce_loss": 1.3555995225906372 + }, + { + "epoch": 0.7780304528376508, + "step": 7869, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7780304528376508, + "step": 7869, + "train/total_loss": 0.19415371119976044 + }, + { + "entropy": 8.888750076293945, + "epoch": 0.7781293256871663, + "mean_token_accuracy": 0.7715517282485962, + "num_tokens": 20151019.0, + "step": 7870, + "train/ce_loss": 1.0199583768844604 + }, + { + "epoch": 0.7781293256871663, + "step": 7870, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7781293256871663, + "step": 7870, + "train/total_loss": 0.14887084066867828 + }, + { + "entropy": 8.983304977416992, + "epoch": 0.7782281985366818, + "mean_token_accuracy": 0.7315541505813599, + "num_tokens": 20156135.0, + "step": 7871, + "train/ce_loss": 3.7230190628179116e-06 + }, + { + "epoch": 0.7782281985366818, + "step": 7871, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7782281985366818, + "step": 7871, + "train/total_loss": 0.06250037252902985 + }, + { + "entropy": 8.64794921875, + "epoch": 0.7783270713861974, + "mean_token_accuracy": 0.7871345281600952, + "num_tokens": 20161460.0, + "step": 7872, + "train/ce_loss": 0.6233188509941101 + }, + { + "epoch": 0.7783270713861974, + "step": 7872, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7783270713861974, + "step": 7872, + "train/total_loss": 0.10139438509941101 + }, + { + "entropy": 8.281718254089355, + "epoch": 0.7784259442357129, + "mean_token_accuracy": 0.7824831604957581, + "num_tokens": 20166968.0, + "step": 7873, + "train/ce_loss": 0.7039061784744263 + }, + { + "epoch": 0.7784259442357129, + "step": 7873, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7784259442357129, + "step": 7873, + "train/total_loss": 0.12117186933755875 + }, + { + "entropy": 8.540313720703125, + "epoch": 0.7785248170852284, + "mean_token_accuracy": 0.7196765542030334, + "num_tokens": 20172248.0, + "step": 7874, + "train/ce_loss": 0.5756349563598633 + }, + { + "epoch": 0.7785248170852284, + "step": 7874, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7785248170852284, + "step": 7874, + "train/total_loss": 0.12006349861621857 + }, + { + "entropy": 8.901520729064941, + "epoch": 0.778623689934744, + "mean_token_accuracy": 0.733433723449707, + "num_tokens": 20177335.0, + "step": 7875, + "train/ce_loss": 2.54775773100846e-06 + }, + { + "epoch": 0.778623689934744, + "step": 7875, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.778623689934744, + "step": 7875, + "train/total_loss": 0.0937502533197403 + }, + { + "entropy": 8.65632438659668, + "epoch": 0.7787225627842594, + "mean_token_accuracy": 0.8341708779335022, + "num_tokens": 20182542.0, + "step": 7876, + "train/ce_loss": 0.6827567219734192 + }, + { + "epoch": 0.7787225627842594, + "step": 7876, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7787225627842594, + "step": 7876, + "train/total_loss": 0.09952567517757416 + }, + { + "entropy": 9.449702262878418, + "epoch": 0.7788214356337749, + "mean_token_accuracy": 0.7385321259498596, + "num_tokens": 20187400.0, + "step": 7877, + "train/ce_loss": 1.2082908153533936 + }, + { + "epoch": 0.7788214356337749, + "step": 7877, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.7788214356337749, + "step": 7877, + "train/total_loss": 0.19504782557487488 + }, + { + "entropy": 8.83372688293457, + "epoch": 0.7789203084832905, + "mean_token_accuracy": 0.7173333168029785, + "num_tokens": 20192618.0, + "step": 7878, + "train/ce_loss": 0.7673982977867126 + }, + { + "epoch": 0.7789203084832905, + "step": 7878, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7789203084832905, + "step": 7878, + "train/total_loss": 0.1001773327589035 + }, + { + "entropy": 8.807306289672852, + "epoch": 0.779019181332806, + "mean_token_accuracy": 0.7661290168762207, + "num_tokens": 20197706.0, + "step": 7879, + "train/ce_loss": 3.00373426398437e-06 + }, + { + "epoch": 0.779019181332806, + "step": 7879, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.779019181332806, + "step": 7879, + "train/total_loss": 0.06250029802322388 + }, + { + "epoch": 0.7791180541823215, + "grad_norm": 0.6747611165046692, + "learning_rate": 8.054442960985017e-06, + "loss": 0.1315, + "step": 7880 + }, + { + "entropy": 8.52204704284668, + "epoch": 0.7791180541823215, + "mean_token_accuracy": 0.7667436599731445, + "num_tokens": 20203057.0, + "step": 7880, + "train/ce_loss": 0.5791946053504944 + }, + { + "epoch": 0.7791180541823215, + "step": 7880, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7791180541823215, + "step": 7880, + "train/total_loss": 0.1008882075548172 + }, + { + "entropy": 9.394721984863281, + "epoch": 0.7792169270318371, + "mean_token_accuracy": 0.7935871481895447, + "num_tokens": 20207943.0, + "step": 7881, + "train/ce_loss": 1.1336095333099365 + }, + { + "epoch": 0.7792169270318371, + "step": 7881, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7792169270318371, + "step": 7881, + "train/total_loss": 0.1602359563112259 + }, + { + "entropy": 8.673254013061523, + "epoch": 0.7793157998813526, + "mean_token_accuracy": 0.7245509028434753, + "num_tokens": 20213260.0, + "step": 7882, + "train/ce_loss": 0.628462016582489 + }, + { + "epoch": 0.7793157998813526, + "step": 7882, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7793157998813526, + "step": 7882, + "train/total_loss": 0.10190870612859726 + }, + { + "entropy": 8.758913040161133, + "epoch": 0.7794146727308681, + "mean_token_accuracy": 0.7386215925216675, + "num_tokens": 20218504.0, + "step": 7883, + "train/ce_loss": 0.5018093585968018 + }, + { + "epoch": 0.7794146727308681, + "step": 7883, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7794146727308681, + "step": 7883, + "train/total_loss": 0.10486844182014465 + }, + { + "entropy": 9.086111068725586, + "epoch": 0.7795135455803837, + "mean_token_accuracy": 0.68006432056427, + "num_tokens": 20223621.0, + "step": 7884, + "train/ce_loss": 1.9067862033843994 + }, + { + "epoch": 0.7795135455803837, + "step": 7884, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.7795135455803837, + "step": 7884, + "train/total_loss": 0.2570848762989044 + }, + { + "entropy": 8.832433700561523, + "epoch": 0.7796124184298991, + "mean_token_accuracy": 0.7552356123924255, + "num_tokens": 20228894.0, + "step": 7885, + "train/ce_loss": 0.9636496305465698 + }, + { + "epoch": 0.7796124184298991, + "step": 7885, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7796124184298991, + "step": 7885, + "train/total_loss": 0.15886497497558594 + }, + { + "entropy": 9.566644668579102, + "epoch": 0.7797112912794146, + "mean_token_accuracy": 0.7421875, + "num_tokens": 20233709.0, + "step": 7886, + "train/ce_loss": 2.084218978881836 + }, + { + "epoch": 0.7797112912794146, + "step": 7886, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7797112912794146, + "step": 7886, + "train/total_loss": 0.25920313596725464 + }, + { + "entropy": 8.960962295532227, + "epoch": 0.7798101641289302, + "mean_token_accuracy": 0.6954838633537292, + "num_tokens": 20238938.0, + "step": 7887, + "train/ce_loss": 1.7520071268081665 + }, + { + "epoch": 0.7798101641289302, + "step": 7887, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.7798101641289302, + "step": 7887, + "train/total_loss": 0.2650444507598877 + }, + { + "entropy": 9.25937271118164, + "epoch": 0.7799090369784457, + "mean_token_accuracy": 0.7517730593681335, + "num_tokens": 20243936.0, + "step": 7888, + "train/ce_loss": 1.1822519302368164 + }, + { + "epoch": 0.7799090369784457, + "step": 7888, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7799090369784457, + "step": 7888, + "train/total_loss": 0.15728768706321716 + }, + { + "entropy": 8.589618682861328, + "epoch": 0.7800079098279612, + "mean_token_accuracy": 0.7505091428756714, + "num_tokens": 20249400.0, + "step": 7889, + "train/ce_loss": 0.8180780410766602 + }, + { + "epoch": 0.7800079098279612, + "step": 7889, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7800079098279612, + "step": 7889, + "train/total_loss": 0.12868280708789825 + }, + { + "entropy": 8.764129638671875, + "epoch": 0.7801067826774768, + "mean_token_accuracy": 0.7448107600212097, + "num_tokens": 20254624.0, + "step": 7890, + "train/ce_loss": 0.923783540725708 + }, + { + "epoch": 0.7801067826774768, + "step": 7890, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7801067826774768, + "step": 7890, + "train/total_loss": 0.12753459811210632 + }, + { + "entropy": 8.824312210083008, + "epoch": 0.7802056555269923, + "mean_token_accuracy": 0.7082429528236389, + "num_tokens": 20259950.0, + "step": 7891, + "train/ce_loss": 0.5625573396682739 + }, + { + "epoch": 0.7802056555269923, + "step": 7891, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7802056555269923, + "step": 7891, + "train/total_loss": 0.10313073545694351 + }, + { + "entropy": 8.657915115356445, + "epoch": 0.7803045283765078, + "mean_token_accuracy": 0.7373167872428894, + "num_tokens": 20265329.0, + "step": 7892, + "train/ce_loss": 0.8548310399055481 + }, + { + "epoch": 0.7803045283765078, + "step": 7892, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.7803045283765078, + "step": 7892, + "train/total_loss": 0.1636081039905548 + }, + { + "entropy": 8.728321075439453, + "epoch": 0.7804034012260234, + "mean_token_accuracy": 0.7177419066429138, + "num_tokens": 20270510.0, + "step": 7893, + "train/ce_loss": 1.0501341819763184 + }, + { + "epoch": 0.7804034012260234, + "step": 7893, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.7804034012260234, + "step": 7893, + "train/total_loss": 0.1987634301185608 + }, + { + "entropy": 8.53168773651123, + "epoch": 0.7805022740755388, + "mean_token_accuracy": 0.7386243343353271, + "num_tokens": 20275934.0, + "step": 7894, + "train/ce_loss": 0.6309541463851929 + }, + { + "epoch": 0.7805022740755388, + "step": 7894, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7805022740755388, + "step": 7894, + "train/total_loss": 0.08262666314840317 + }, + { + "entropy": 8.515279769897461, + "epoch": 0.7806011469250543, + "mean_token_accuracy": 0.7110874056816101, + "num_tokens": 20281360.0, + "step": 7895, + "train/ce_loss": 0.46416860818862915 + }, + { + "epoch": 0.7806011469250543, + "step": 7895, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.7806011469250543, + "step": 7895, + "train/total_loss": 0.12063561379909515 + }, + { + "entropy": 8.935840606689453, + "epoch": 0.7807000197745699, + "mean_token_accuracy": 0.75, + "num_tokens": 20286587.0, + "step": 7896, + "train/ce_loss": 1.288562536239624 + }, + { + "epoch": 0.7807000197745699, + "step": 7896, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7807000197745699, + "step": 7896, + "train/total_loss": 0.14838750660419464 + }, + { + "entropy": 9.0083589553833, + "epoch": 0.7807988926240854, + "mean_token_accuracy": 0.7654135227203369, + "num_tokens": 20291683.0, + "step": 7897, + "train/ce_loss": 0.8810030817985535 + }, + { + "epoch": 0.7807988926240854, + "step": 7897, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7807988926240854, + "step": 7897, + "train/total_loss": 0.13888156414031982 + }, + { + "entropy": 9.198946952819824, + "epoch": 0.7808977654736009, + "mean_token_accuracy": 0.6476923227310181, + "num_tokens": 20296804.0, + "step": 7898, + "train/ce_loss": 1.6421705484390259 + }, + { + "epoch": 0.7808977654736009, + "step": 7898, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.7808977654736009, + "step": 7898, + "train/total_loss": 0.2384358048439026 + }, + { + "entropy": 8.83854866027832, + "epoch": 0.7809966383231165, + "mean_token_accuracy": 0.7840490937232971, + "num_tokens": 20302066.0, + "step": 7899, + "train/ce_loss": 0.9571102857589722 + }, + { + "epoch": 0.7809966383231165, + "step": 7899, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7809966383231165, + "step": 7899, + "train/total_loss": 0.16602352261543274 + }, + { + "epoch": 0.781095511172632, + "grad_norm": 0.6194737553596497, + "learning_rate": 8.049498096227069e-06, + "loss": 0.1419, + "step": 7900 + }, + { + "entropy": 8.932303428649902, + "epoch": 0.781095511172632, + "mean_token_accuracy": 0.7843137383460999, + "num_tokens": 20306917.0, + "step": 7900, + "train/ce_loss": 1.1366647481918335 + }, + { + "epoch": 0.781095511172632, + "step": 7900, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.781095511172632, + "step": 7900, + "train/total_loss": 0.14491647481918335 + }, + { + "entropy": 8.959232330322266, + "epoch": 0.7811943840221475, + "mean_token_accuracy": 0.7389885783195496, + "num_tokens": 20311969.0, + "step": 7901, + "train/ce_loss": 2.387908125456306e-06 + }, + { + "epoch": 0.7811943840221475, + "step": 7901, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7811943840221475, + "step": 7901, + "train/total_loss": 0.0390627384185791 + }, + { + "entropy": 8.497319221496582, + "epoch": 0.7812932568716631, + "mean_token_accuracy": 0.7304452657699585, + "num_tokens": 20317288.0, + "step": 7902, + "train/ce_loss": 0.5628913044929504 + }, + { + "epoch": 0.7812932568716631, + "step": 7902, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7812932568716631, + "step": 7902, + "train/total_loss": 0.10316413640975952 + }, + { + "entropy": 8.847746849060059, + "epoch": 0.7813921297211786, + "mean_token_accuracy": 0.75698322057724, + "num_tokens": 20322479.0, + "step": 7903, + "train/ce_loss": 2.7168525775778107e-06 + }, + { + "epoch": 0.7813921297211786, + "step": 7903, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7813921297211786, + "step": 7903, + "train/total_loss": 0.03515652194619179 + }, + { + "entropy": 8.57381820678711, + "epoch": 0.781491002570694, + "mean_token_accuracy": 0.7816377282142639, + "num_tokens": 20327782.0, + "step": 7904, + "train/ce_loss": 0.9726953506469727 + }, + { + "epoch": 0.781491002570694, + "step": 7904, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.781491002570694, + "step": 7904, + "train/total_loss": 0.16367578506469727 + }, + { + "entropy": 8.720494270324707, + "epoch": 0.7815898754202096, + "mean_token_accuracy": 0.7451456189155579, + "num_tokens": 20333043.0, + "step": 7905, + "train/ce_loss": 0.9451776742935181 + }, + { + "epoch": 0.7815898754202096, + "step": 7905, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7815898754202096, + "step": 7905, + "train/total_loss": 0.1296740174293518 + }, + { + "entropy": 8.519255638122559, + "epoch": 0.7816887482697251, + "mean_token_accuracy": 0.7466487884521484, + "num_tokens": 20338247.0, + "step": 7906, + "train/ce_loss": 0.9122191071510315 + }, + { + "epoch": 0.7816887482697251, + "step": 7906, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.7816887482697251, + "step": 7906, + "train/total_loss": 0.1732531636953354 + }, + { + "entropy": 8.998676300048828, + "epoch": 0.7817876211192406, + "mean_token_accuracy": 0.7673667073249817, + "num_tokens": 20343328.0, + "step": 7907, + "train/ce_loss": 2.213766947534168e-06 + }, + { + "epoch": 0.7817876211192406, + "step": 7907, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7817876211192406, + "step": 7907, + "train/total_loss": 0.03906271979212761 + }, + { + "entropy": 9.285778999328613, + "epoch": 0.7818864939687562, + "mean_token_accuracy": 0.7413793206214905, + "num_tokens": 20348267.0, + "step": 7908, + "train/ce_loss": 1.0977128744125366 + }, + { + "epoch": 0.7818864939687562, + "step": 7908, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7818864939687562, + "step": 7908, + "train/total_loss": 0.13320878148078918 + }, + { + "entropy": 8.340143203735352, + "epoch": 0.7819853668182717, + "mean_token_accuracy": 0.7763578295707703, + "num_tokens": 20353650.0, + "step": 7909, + "train/ce_loss": 0.7174997925758362 + }, + { + "epoch": 0.7819853668182717, + "step": 7909, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7819853668182717, + "step": 7909, + "train/total_loss": 0.1108124777674675 + }, + { + "entropy": 9.001302719116211, + "epoch": 0.7820842396677872, + "mean_token_accuracy": 0.7584615349769592, + "num_tokens": 20358736.0, + "step": 7910, + "train/ce_loss": 0.7865976095199585 + }, + { + "epoch": 0.7820842396677872, + "step": 7910, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7820842396677872, + "step": 7910, + "train/total_loss": 0.10990976542234421 + }, + { + "entropy": 8.796815872192383, + "epoch": 0.7821831125173028, + "mean_token_accuracy": 0.7554697394371033, + "num_tokens": 20363966.0, + "step": 7911, + "train/ce_loss": 0.8870112895965576 + }, + { + "epoch": 0.7821831125173028, + "step": 7911, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7821831125173028, + "step": 7911, + "train/total_loss": 0.14729487895965576 + }, + { + "entropy": 8.566937446594238, + "epoch": 0.7822819853668183, + "mean_token_accuracy": 0.7266111969947815, + "num_tokens": 20369407.0, + "step": 7912, + "train/ce_loss": 0.5319351553916931 + }, + { + "epoch": 0.7822819853668183, + "step": 7912, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.7822819853668183, + "step": 7912, + "train/total_loss": 0.13131850957870483 + }, + { + "entropy": 8.470523834228516, + "epoch": 0.7823808582163337, + "mean_token_accuracy": 0.7873210906982422, + "num_tokens": 20374871.0, + "step": 7913, + "train/ce_loss": 0.5656315088272095 + }, + { + "epoch": 0.7823808582163337, + "step": 7913, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.7823808582163337, + "step": 7913, + "train/total_loss": 0.12296940386295319 + }, + { + "entropy": 8.459247589111328, + "epoch": 0.7824797310658493, + "mean_token_accuracy": 0.747826099395752, + "num_tokens": 20380133.0, + "step": 7914, + "train/ce_loss": 1.336376667022705 + }, + { + "epoch": 0.7824797310658493, + "step": 7914, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7824797310658493, + "step": 7914, + "train/total_loss": 0.1570751667022705 + }, + { + "entropy": 8.66012191772461, + "epoch": 0.7825786039153648, + "mean_token_accuracy": 0.7441558241844177, + "num_tokens": 20385392.0, + "step": 7915, + "train/ce_loss": 0.6318830251693726 + }, + { + "epoch": 0.7825786039153648, + "step": 7915, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7825786039153648, + "step": 7915, + "train/total_loss": 0.09834455698728561 + }, + { + "entropy": 8.976568222045898, + "epoch": 0.7826774767648803, + "mean_token_accuracy": 0.6805348992347717, + "num_tokens": 20390493.0, + "step": 7916, + "train/ce_loss": 0.5899818539619446 + }, + { + "epoch": 0.7826774767648803, + "step": 7916, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7826774767648803, + "step": 7916, + "train/total_loss": 0.08243568241596222 + }, + { + "entropy": 8.40031623840332, + "epoch": 0.7827763496143959, + "mean_token_accuracy": 0.7359437942504883, + "num_tokens": 20395980.0, + "step": 7917, + "train/ce_loss": 0.8957025408744812 + }, + { + "epoch": 0.7827763496143959, + "step": 7917, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7827763496143959, + "step": 7917, + "train/total_loss": 0.12863275408744812 + }, + { + "entropy": 8.910615921020508, + "epoch": 0.7828752224639114, + "mean_token_accuracy": 0.7111756205558777, + "num_tokens": 20401136.0, + "step": 7918, + "train/ce_loss": 0.6762477159500122 + }, + { + "epoch": 0.7828752224639114, + "step": 7918, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7828752224639114, + "step": 7918, + "train/total_loss": 0.1301247775554657 + }, + { + "entropy": 8.72070598602295, + "epoch": 0.782974095313427, + "mean_token_accuracy": 0.7680723071098328, + "num_tokens": 20406278.0, + "step": 7919, + "train/ce_loss": 1.0571833848953247 + }, + { + "epoch": 0.782974095313427, + "step": 7919, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.782974095313427, + "step": 7919, + "train/total_loss": 0.16431209444999695 + }, + { + "epoch": 0.7830729681629425, + "grad_norm": 0.6485387682914734, + "learning_rate": 8.04455323146912e-06, + "loss": 0.1316, + "step": 7920 + }, + { + "entropy": 8.218687057495117, + "epoch": 0.7830729681629425, + "mean_token_accuracy": 0.8042895197868347, + "num_tokens": 20411890.0, + "step": 7920, + "train/ce_loss": 1.124735713005066 + }, + { + "epoch": 0.7830729681629425, + "step": 7920, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7830729681629425, + "step": 7920, + "train/total_loss": 0.15934857726097107 + }, + { + "entropy": 8.609413146972656, + "epoch": 0.783171841012458, + "mean_token_accuracy": 0.7561797499656677, + "num_tokens": 20417281.0, + "step": 7921, + "train/ce_loss": 0.5667004585266113 + }, + { + "epoch": 0.783171841012458, + "step": 7921, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.783171841012458, + "step": 7921, + "train/total_loss": 0.15432628989219666 + }, + { + "entropy": 9.017016410827637, + "epoch": 0.7832707138619736, + "mean_token_accuracy": 0.7774389982223511, + "num_tokens": 20422422.0, + "step": 7922, + "train/ce_loss": 1.0670339634089032e-06 + }, + { + "epoch": 0.7832707138619736, + "step": 7922, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.7832707138619736, + "step": 7922, + "train/total_loss": 0.01171885710209608 + }, + { + "entropy": 8.53697395324707, + "epoch": 0.783369586711489, + "mean_token_accuracy": 0.807603657245636, + "num_tokens": 20427795.0, + "step": 7923, + "train/ce_loss": 0.6338130235671997 + }, + { + "epoch": 0.783369586711489, + "step": 7923, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.783369586711489, + "step": 7923, + "train/total_loss": 0.10635005682706833 + }, + { + "entropy": 8.64338493347168, + "epoch": 0.7834684595610045, + "mean_token_accuracy": 0.7033492922782898, + "num_tokens": 20433096.0, + "step": 7924, + "train/ce_loss": 0.6053569316864014 + }, + { + "epoch": 0.7834684595610045, + "step": 7924, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.7834684595610045, + "step": 7924, + "train/total_loss": 0.14647319912910461 + }, + { + "entropy": 8.749120712280273, + "epoch": 0.7835673324105201, + "mean_token_accuracy": 0.7053571343421936, + "num_tokens": 20438470.0, + "step": 7925, + "train/ce_loss": 0.5116437673568726 + }, + { + "epoch": 0.7835673324105201, + "step": 7925, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7835673324105201, + "step": 7925, + "train/total_loss": 0.07850812375545502 + }, + { + "entropy": 8.944839477539062, + "epoch": 0.7836662052600356, + "mean_token_accuracy": 0.7326732873916626, + "num_tokens": 20443596.0, + "step": 7926, + "train/ce_loss": 1.0792173147201538 + }, + { + "epoch": 0.7836662052600356, + "step": 7926, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7836662052600356, + "step": 7926, + "train/total_loss": 0.13917173445224762 + }, + { + "entropy": 8.53488540649414, + "epoch": 0.7837650781095511, + "mean_token_accuracy": 0.7253814339637756, + "num_tokens": 20448762.0, + "step": 7927, + "train/ce_loss": 0.893704891204834 + }, + { + "epoch": 0.7837650781095511, + "step": 7927, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.7837650781095511, + "step": 7927, + "train/total_loss": 0.1674954891204834 + }, + { + "entropy": 8.753499984741211, + "epoch": 0.7838639509590667, + "mean_token_accuracy": 0.6675094962120056, + "num_tokens": 20454029.0, + "step": 7928, + "train/ce_loss": 1.5897996425628662 + }, + { + "epoch": 0.7838639509590667, + "step": 7928, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.7838639509590667, + "step": 7928, + "train/total_loss": 0.22538621723651886 + }, + { + "entropy": 8.803192138671875, + "epoch": 0.7839628238085822, + "mean_token_accuracy": 0.7758620977401733, + "num_tokens": 20459208.0, + "step": 7929, + "train/ce_loss": 0.9285788536071777 + }, + { + "epoch": 0.7839628238085822, + "step": 7929, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7839628238085822, + "step": 7929, + "train/total_loss": 0.16317039728164673 + }, + { + "entropy": 8.868133544921875, + "epoch": 0.7840616966580977, + "mean_token_accuracy": 0.7783641219139099, + "num_tokens": 20464413.0, + "step": 7930, + "train/ce_loss": 1.8015189198195003e-06 + }, + { + "epoch": 0.7840616966580977, + "step": 7930, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7840616966580977, + "step": 7930, + "train/total_loss": 0.042968928813934326 + }, + { + "entropy": 9.242012023925781, + "epoch": 0.7841605695076133, + "mean_token_accuracy": 0.7279411554336548, + "num_tokens": 20469376.0, + "step": 7931, + "train/ce_loss": 0.6100656390190125 + }, + { + "epoch": 0.7841605695076133, + "step": 7931, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7841605695076133, + "step": 7931, + "train/total_loss": 0.096162810921669 + }, + { + "entropy": 8.578420639038086, + "epoch": 0.7842594423571287, + "mean_token_accuracy": 0.6862967014312744, + "num_tokens": 20474698.0, + "step": 7932, + "train/ce_loss": 1.4359699487686157 + }, + { + "epoch": 0.7842594423571287, + "step": 7932, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7842594423571287, + "step": 7932, + "train/total_loss": 0.17875324189662933 + }, + { + "entropy": 8.901908874511719, + "epoch": 0.7843583152066442, + "mean_token_accuracy": 0.7245222926139832, + "num_tokens": 20479970.0, + "step": 7933, + "train/ce_loss": 0.8366997838020325 + }, + { + "epoch": 0.7843583152066442, + "step": 7933, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.7843583152066442, + "step": 7933, + "train/total_loss": 0.1696074903011322 + }, + { + "entropy": 9.048408508300781, + "epoch": 0.7844571880561598, + "mean_token_accuracy": 0.7338235378265381, + "num_tokens": 20485079.0, + "step": 7934, + "train/ce_loss": 0.9843783974647522 + }, + { + "epoch": 0.7844571880561598, + "step": 7934, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7844571880561598, + "step": 7934, + "train/total_loss": 0.1609378457069397 + }, + { + "entropy": 8.651874542236328, + "epoch": 0.7845560609056753, + "mean_token_accuracy": 0.7883771657943726, + "num_tokens": 20490458.0, + "step": 7935, + "train/ce_loss": 0.5401686429977417 + }, + { + "epoch": 0.7845560609056753, + "step": 7935, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7845560609056753, + "step": 7935, + "train/total_loss": 0.07745436578989029 + }, + { + "entropy": 8.963913917541504, + "epoch": 0.7846549337551908, + "mean_token_accuracy": 0.7274011373519897, + "num_tokens": 20495576.0, + "step": 7936, + "train/ce_loss": 0.631771445274353 + }, + { + "epoch": 0.7846549337551908, + "step": 7936, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7846549337551908, + "step": 7936, + "train/total_loss": 0.11786464601755142 + }, + { + "entropy": 8.555036544799805, + "epoch": 0.7847538066047064, + "mean_token_accuracy": 0.7471967339515686, + "num_tokens": 20501050.0, + "step": 7937, + "train/ce_loss": 0.7506260275840759 + }, + { + "epoch": 0.7847538066047064, + "step": 7937, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.7847538066047064, + "step": 7937, + "train/total_loss": 0.1492813527584076 + }, + { + "entropy": 8.483772277832031, + "epoch": 0.7848526794542219, + "mean_token_accuracy": 0.7716371417045593, + "num_tokens": 20506422.0, + "step": 7938, + "train/ce_loss": 0.5519025325775146 + }, + { + "epoch": 0.7848526794542219, + "step": 7938, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7848526794542219, + "step": 7938, + "train/total_loss": 0.10206525027751923 + }, + { + "entropy": 8.557881355285645, + "epoch": 0.7849515523037374, + "mean_token_accuracy": 0.7925764322280884, + "num_tokens": 20511788.0, + "step": 7939, + "train/ce_loss": 0.6271799206733704 + }, + { + "epoch": 0.7849515523037374, + "step": 7939, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7849515523037374, + "step": 7939, + "train/total_loss": 0.1056867465376854 + }, + { + "epoch": 0.785050425153253, + "grad_norm": 0.5147098898887634, + "learning_rate": 8.039608366711172e-06, + "loss": 0.1314, + "step": 7940 + }, + { + "entropy": 9.699087142944336, + "epoch": 0.785050425153253, + "mean_token_accuracy": 0.7768816947937012, + "num_tokens": 20516521.0, + "step": 7940, + "train/ce_loss": 1.6855492503964342e-06 + }, + { + "epoch": 0.785050425153253, + "step": 7940, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.785050425153253, + "step": 7940, + "train/total_loss": 0.01953141763806343 + }, + { + "entropy": 8.913328170776367, + "epoch": 0.7851492980027684, + "mean_token_accuracy": 0.7882736325263977, + "num_tokens": 20521566.0, + "step": 7941, + "train/ce_loss": 0.806064248085022 + }, + { + "epoch": 0.7851492980027684, + "step": 7941, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7851492980027684, + "step": 7941, + "train/total_loss": 0.09623142331838608 + }, + { + "entropy": 9.045415878295898, + "epoch": 0.7852481708522839, + "mean_token_accuracy": 0.7320703864097595, + "num_tokens": 20526796.0, + "step": 7942, + "train/ce_loss": 5.9377862271503545e-06 + }, + { + "epoch": 0.7852481708522839, + "step": 7942, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7852481708522839, + "step": 7942, + "train/total_loss": 0.054688092321157455 + }, + { + "entropy": 8.949482917785645, + "epoch": 0.7853470437017995, + "mean_token_accuracy": 0.6782729625701904, + "num_tokens": 20531958.0, + "step": 7943, + "train/ce_loss": 1.5178576707839966 + }, + { + "epoch": 0.7853470437017995, + "step": 7943, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7853470437017995, + "step": 7943, + "train/total_loss": 0.18303577601909637 + }, + { + "entropy": 8.98634147644043, + "epoch": 0.785445916551315, + "mean_token_accuracy": 0.739130437374115, + "num_tokens": 20537152.0, + "step": 7944, + "train/ce_loss": 1.258015751838684 + }, + { + "epoch": 0.785445916551315, + "step": 7944, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.785445916551315, + "step": 7944, + "train/total_loss": 0.18439532816410065 + }, + { + "entropy": 8.618897438049316, + "epoch": 0.7855447894008305, + "mean_token_accuracy": 0.7541713118553162, + "num_tokens": 20542613.0, + "step": 7945, + "train/ce_loss": 0.8079213500022888 + }, + { + "epoch": 0.7855447894008305, + "step": 7945, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.7855447894008305, + "step": 7945, + "train/total_loss": 0.1706358790397644 + }, + { + "entropy": 9.174298286437988, + "epoch": 0.7856436622503461, + "mean_token_accuracy": 0.7657807469367981, + "num_tokens": 20547689.0, + "step": 7946, + "train/ce_loss": 5.641299139824696e-06 + }, + { + "epoch": 0.7856436622503461, + "step": 7946, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7856436622503461, + "step": 7946, + "train/total_loss": 0.05468806251883507 + }, + { + "entropy": 9.220314025878906, + "epoch": 0.7857425350998616, + "mean_token_accuracy": 0.7574257254600525, + "num_tokens": 20552669.0, + "step": 7947, + "train/ce_loss": 1.3368439674377441 + }, + { + "epoch": 0.7857425350998616, + "step": 7947, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7857425350998616, + "step": 7947, + "train/total_loss": 0.19227814674377441 + }, + { + "entropy": 8.648846626281738, + "epoch": 0.7858414079493771, + "mean_token_accuracy": 0.7247058749198914, + "num_tokens": 20557996.0, + "step": 7948, + "train/ce_loss": 0.4762094020843506 + }, + { + "epoch": 0.7858414079493771, + "step": 7948, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7858414079493771, + "step": 7948, + "train/total_loss": 0.10621468722820282 + }, + { + "entropy": 8.662435531616211, + "epoch": 0.7859402807988927, + "mean_token_accuracy": 0.7303522825241089, + "num_tokens": 20563177.0, + "step": 7949, + "train/ce_loss": 1.3563436269760132 + }, + { + "epoch": 0.7859402807988927, + "step": 7949, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7859402807988927, + "step": 7949, + "train/total_loss": 0.17469686269760132 + }, + { + "entropy": 9.52070426940918, + "epoch": 0.7860391536484082, + "mean_token_accuracy": 0.7683284282684326, + "num_tokens": 20567934.0, + "step": 7950, + "train/ce_loss": 1.9786020857281983e-06 + }, + { + "epoch": 0.7860391536484082, + "step": 7950, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7860391536484082, + "step": 7950, + "train/total_loss": 0.03906269744038582 + }, + { + "entropy": 8.594144821166992, + "epoch": 0.7861380264979236, + "mean_token_accuracy": 0.7701793909072876, + "num_tokens": 20573264.0, + "step": 7951, + "train/ce_loss": 0.4847142994403839 + }, + { + "epoch": 0.7861380264979236, + "step": 7951, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7861380264979236, + "step": 7951, + "train/total_loss": 0.08753393590450287 + }, + { + "entropy": 8.669903755187988, + "epoch": 0.7862368993474392, + "mean_token_accuracy": 0.7776025533676147, + "num_tokens": 20578340.0, + "step": 7952, + "train/ce_loss": 0.6803780198097229 + }, + { + "epoch": 0.7862368993474392, + "step": 7952, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7862368993474392, + "step": 7952, + "train/total_loss": 0.08366280049085617 + }, + { + "entropy": 8.677490234375, + "epoch": 0.7863357721969547, + "mean_token_accuracy": 0.7487562298774719, + "num_tokens": 20583615.0, + "step": 7953, + "train/ce_loss": 0.7094335556030273 + }, + { + "epoch": 0.7863357721969547, + "step": 7953, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7863357721969547, + "step": 7953, + "train/total_loss": 0.11000585556030273 + }, + { + "entropy": 8.330732345581055, + "epoch": 0.7864346450464702, + "mean_token_accuracy": 0.7355035543441772, + "num_tokens": 20589087.0, + "step": 7954, + "train/ce_loss": 0.6643909215927124 + }, + { + "epoch": 0.7864346450464702, + "step": 7954, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.7864346450464702, + "step": 7954, + "train/total_loss": 0.14456409215927124 + }, + { + "entropy": 8.504324913024902, + "epoch": 0.7865335178959858, + "mean_token_accuracy": 0.7805677056312561, + "num_tokens": 20594501.0, + "step": 7955, + "train/ce_loss": 0.6007000803947449 + }, + { + "epoch": 0.7865335178959858, + "step": 7955, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7865335178959858, + "step": 7955, + "train/total_loss": 0.11866375803947449 + }, + { + "entropy": 9.33056640625, + "epoch": 0.7866323907455013, + "mean_token_accuracy": 0.7306967973709106, + "num_tokens": 20599418.0, + "step": 7956, + "train/ce_loss": 1.3013379573822021 + }, + { + "epoch": 0.7866323907455013, + "step": 7956, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7866323907455013, + "step": 7956, + "train/total_loss": 0.18872754275798798 + }, + { + "entropy": 8.516487121582031, + "epoch": 0.7867312635950168, + "mean_token_accuracy": 0.6837030053138733, + "num_tokens": 20604922.0, + "step": 7957, + "train/ce_loss": 1.3547759056091309 + }, + { + "epoch": 0.7867312635950168, + "step": 7957, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.7867312635950168, + "step": 7957, + "train/total_loss": 0.21360258758068085 + }, + { + "entropy": 8.600809097290039, + "epoch": 0.7868301364445324, + "mean_token_accuracy": 0.7753201127052307, + "num_tokens": 20610223.0, + "step": 7958, + "train/ce_loss": 0.9719870686531067 + }, + { + "epoch": 0.7868301364445324, + "step": 7958, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7868301364445324, + "step": 7958, + "train/total_loss": 0.1596987098455429 + }, + { + "entropy": 8.850839614868164, + "epoch": 0.7869290092940479, + "mean_token_accuracy": 0.7395944595336914, + "num_tokens": 20615630.0, + "step": 7959, + "train/ce_loss": 1.0459824800491333 + }, + { + "epoch": 0.7869290092940479, + "step": 7959, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7869290092940479, + "step": 7959, + "train/total_loss": 0.1514732539653778 + }, + { + "epoch": 0.7870278821435633, + "grad_norm": 0.7744261622428894, + "learning_rate": 8.034663501953222e-06, + "loss": 0.1369, + "step": 7960 + }, + { + "entropy": 8.881771087646484, + "epoch": 0.7870278821435633, + "mean_token_accuracy": 0.7635036706924438, + "num_tokens": 20620788.0, + "step": 7960, + "train/ce_loss": 0.6608197689056396 + }, + { + "epoch": 0.7870278821435633, + "step": 7960, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7870278821435633, + "step": 7960, + "train/total_loss": 0.08170697838068008 + }, + { + "entropy": 8.501601219177246, + "epoch": 0.7871267549930789, + "mean_token_accuracy": 0.7590233683586121, + "num_tokens": 20626220.0, + "step": 7961, + "train/ce_loss": 0.9903839230537415 + }, + { + "epoch": 0.7871267549930789, + "step": 7961, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7871267549930789, + "step": 7961, + "train/total_loss": 0.14591339230537415 + }, + { + "entropy": 8.645923614501953, + "epoch": 0.7872256278425944, + "mean_token_accuracy": 0.6985210180282593, + "num_tokens": 20631757.0, + "step": 7962, + "train/ce_loss": 1.2180404663085938 + }, + { + "epoch": 0.7872256278425944, + "step": 7962, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.7872256278425944, + "step": 7962, + "train/total_loss": 0.21164780855178833 + }, + { + "entropy": 8.543001174926758, + "epoch": 0.7873245006921099, + "mean_token_accuracy": 0.7823721170425415, + "num_tokens": 20637151.0, + "step": 7963, + "train/ce_loss": 0.7809872627258301 + }, + { + "epoch": 0.7873245006921099, + "step": 7963, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7873245006921099, + "step": 7963, + "train/total_loss": 0.09372372925281525 + }, + { + "entropy": 8.694469451904297, + "epoch": 0.7874233735416255, + "mean_token_accuracy": 0.7221029996871948, + "num_tokens": 20642564.0, + "step": 7964, + "train/ce_loss": 0.7001290321350098 + }, + { + "epoch": 0.7874233735416255, + "step": 7964, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7874233735416255, + "step": 7964, + "train/total_loss": 0.1207941547036171 + }, + { + "entropy": 9.358198165893555, + "epoch": 0.787522246391141, + "mean_token_accuracy": 0.7299145460128784, + "num_tokens": 20647740.0, + "step": 7965, + "train/ce_loss": 0.8079087734222412 + }, + { + "epoch": 0.787522246391141, + "step": 7965, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.787522246391141, + "step": 7965, + "train/total_loss": 0.13157212734222412 + }, + { + "entropy": 8.975767135620117, + "epoch": 0.7876211192406565, + "mean_token_accuracy": 0.7756654024124146, + "num_tokens": 20653030.0, + "step": 7966, + "train/ce_loss": 1.059606671333313 + }, + { + "epoch": 0.7876211192406565, + "step": 7966, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7876211192406565, + "step": 7966, + "train/total_loss": 0.1528356671333313 + }, + { + "entropy": 8.476731300354004, + "epoch": 0.7877199920901721, + "mean_token_accuracy": 0.7681007385253906, + "num_tokens": 20658446.0, + "step": 7967, + "train/ce_loss": 0.644286572933197 + }, + { + "epoch": 0.7877199920901721, + "step": 7967, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7877199920901721, + "step": 7967, + "train/total_loss": 0.0800536572933197 + }, + { + "entropy": 8.317113876342773, + "epoch": 0.7878188649396876, + "mean_token_accuracy": 0.7533039450645447, + "num_tokens": 20663819.0, + "step": 7968, + "train/ce_loss": 0.49962639808654785 + }, + { + "epoch": 0.7878188649396876, + "step": 7968, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7878188649396876, + "step": 7968, + "train/total_loss": 0.08902513980865479 + }, + { + "entropy": 9.088321685791016, + "epoch": 0.787917737789203, + "mean_token_accuracy": 0.7987321615219116, + "num_tokens": 20668916.0, + "step": 7969, + "train/ce_loss": 1.279579520225525 + }, + { + "epoch": 0.787917737789203, + "step": 7969, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.787917737789203, + "step": 7969, + "train/total_loss": 0.17092670500278473 + }, + { + "entropy": 8.945789337158203, + "epoch": 0.7880166106387186, + "mean_token_accuracy": 0.7312588691711426, + "num_tokens": 20674098.0, + "step": 7970, + "train/ce_loss": 0.9316537380218506 + }, + { + "epoch": 0.7880166106387186, + "step": 7970, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7880166106387186, + "step": 7970, + "train/total_loss": 0.14394661784172058 + }, + { + "entropy": 8.59889030456543, + "epoch": 0.7881154834882341, + "mean_token_accuracy": 0.667382001876831, + "num_tokens": 20679676.0, + "step": 7971, + "train/ce_loss": 0.9962515830993652 + }, + { + "epoch": 0.7881154834882341, + "step": 7971, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7881154834882341, + "step": 7971, + "train/total_loss": 0.15431267023086548 + }, + { + "entropy": 8.534368515014648, + "epoch": 0.7882143563377496, + "mean_token_accuracy": 0.7758007049560547, + "num_tokens": 20684968.0, + "step": 7972, + "train/ce_loss": 0.6547650098800659 + }, + { + "epoch": 0.7882143563377496, + "step": 7972, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.7882143563377496, + "step": 7972, + "train/total_loss": 0.13188275694847107 + }, + { + "entropy": 8.79300594329834, + "epoch": 0.7883132291872652, + "mean_token_accuracy": 0.7480106353759766, + "num_tokens": 20690174.0, + "step": 7973, + "train/ce_loss": 0.600091278553009 + }, + { + "epoch": 0.7883132291872652, + "step": 7973, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7883132291872652, + "step": 7973, + "train/total_loss": 0.13032162189483643 + }, + { + "entropy": 9.573885917663574, + "epoch": 0.7884121020367807, + "mean_token_accuracy": 0.6720430254936218, + "num_tokens": 20694967.0, + "step": 7974, + "train/ce_loss": 2.0439798831939697 + }, + { + "epoch": 0.7884121020367807, + "step": 7974, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7884121020367807, + "step": 7974, + "train/total_loss": 0.2473667412996292 + }, + { + "entropy": 9.054139137268066, + "epoch": 0.7885109748862962, + "mean_token_accuracy": 0.7531914710998535, + "num_tokens": 20700189.0, + "step": 7975, + "train/ce_loss": 0.9281861186027527 + }, + { + "epoch": 0.7885109748862962, + "step": 7975, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7885109748862962, + "step": 7975, + "train/total_loss": 0.14750611782073975 + }, + { + "entropy": 8.449117660522461, + "epoch": 0.7886098477358118, + "mean_token_accuracy": 0.7948139905929565, + "num_tokens": 20705548.0, + "step": 7976, + "train/ce_loss": 0.6490321755409241 + }, + { + "epoch": 0.7886098477358118, + "step": 7976, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7886098477358118, + "step": 7976, + "train/total_loss": 0.12349697202444077 + }, + { + "entropy": 8.732007026672363, + "epoch": 0.7887087205853273, + "mean_token_accuracy": 0.7657784223556519, + "num_tokens": 20710665.0, + "step": 7977, + "train/ce_loss": 0.5330588817596436 + }, + { + "epoch": 0.7887087205853273, + "step": 7977, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7887087205853273, + "step": 7977, + "train/total_loss": 0.07674339413642883 + }, + { + "entropy": 9.389777183532715, + "epoch": 0.7888075934348427, + "mean_token_accuracy": 0.7830508351325989, + "num_tokens": 20715658.0, + "step": 7978, + "train/ce_loss": 6.085639938646636e-07 + }, + { + "epoch": 0.7888075934348427, + "step": 7978, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7888075934348427, + "step": 7978, + "train/total_loss": 0.019531311467289925 + }, + { + "entropy": 8.771812438964844, + "epoch": 0.7889064662843583, + "mean_token_accuracy": 0.7468944191932678, + "num_tokens": 20720768.0, + "step": 7979, + "train/ce_loss": 1.1544526815414429 + }, + { + "epoch": 0.7889064662843583, + "step": 7979, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7889064662843583, + "step": 7979, + "train/total_loss": 0.13107027113437653 + }, + { + "epoch": 0.7890053391338738, + "grad_norm": 0.67234867811203, + "learning_rate": 8.029718637195273e-06, + "loss": 0.1305, + "step": 7980 + }, + { + "entropy": 8.965940475463867, + "epoch": 0.7890053391338738, + "mean_token_accuracy": 0.7742448449134827, + "num_tokens": 20725814.0, + "step": 7980, + "train/ce_loss": 1.0769838094711304 + }, + { + "epoch": 0.7890053391338738, + "step": 7980, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7890053391338738, + "step": 7980, + "train/total_loss": 0.16238588094711304 + }, + { + "entropy": 8.755380630493164, + "epoch": 0.7891042119833893, + "mean_token_accuracy": 0.7167567610740662, + "num_tokens": 20731220.0, + "step": 7981, + "train/ce_loss": 1.0310207605361938 + }, + { + "epoch": 0.7891042119833893, + "step": 7981, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7891042119833893, + "step": 7981, + "train/total_loss": 0.17341458797454834 + }, + { + "entropy": 9.05705451965332, + "epoch": 0.7892030848329049, + "mean_token_accuracy": 0.7604166865348816, + "num_tokens": 20736324.0, + "step": 7982, + "train/ce_loss": 1.1232382348680403e-06 + }, + { + "epoch": 0.7892030848329049, + "step": 7982, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7892030848329049, + "step": 7982, + "train/total_loss": 0.046875111758708954 + }, + { + "entropy": 8.760584831237793, + "epoch": 0.7893019576824204, + "mean_token_accuracy": 0.7796013951301575, + "num_tokens": 20741626.0, + "step": 7983, + "train/ce_loss": 1.0431007146835327 + }, + { + "epoch": 0.7893019576824204, + "step": 7983, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7893019576824204, + "step": 7983, + "train/total_loss": 0.1433725655078888 + }, + { + "entropy": 9.176156997680664, + "epoch": 0.7894008305319359, + "mean_token_accuracy": 0.7445651888847351, + "num_tokens": 20746585.0, + "step": 7984, + "train/ce_loss": 5.271598638501018e-06 + }, + { + "epoch": 0.7894008305319359, + "step": 7984, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7894008305319359, + "step": 7984, + "train/total_loss": 0.04296927899122238 + }, + { + "entropy": 8.639911651611328, + "epoch": 0.7894997033814515, + "mean_token_accuracy": 0.6930022835731506, + "num_tokens": 20751976.0, + "step": 7985, + "train/ce_loss": 1.131714105606079 + }, + { + "epoch": 0.7894997033814515, + "step": 7985, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.7894997033814515, + "step": 7985, + "train/total_loss": 0.17957766354084015 + }, + { + "entropy": 9.173746109008789, + "epoch": 0.789598576230967, + "mean_token_accuracy": 0.7767145037651062, + "num_tokens": 20757029.0, + "step": 7986, + "train/ce_loss": 0.716386079788208 + }, + { + "epoch": 0.789598576230967, + "step": 7986, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.789598576230967, + "step": 7986, + "train/total_loss": 0.08726360648870468 + }, + { + "entropy": 8.37651252746582, + "epoch": 0.7896974490804824, + "mean_token_accuracy": 0.7713097929954529, + "num_tokens": 20762505.0, + "step": 7987, + "train/ce_loss": 1.0379140377044678 + }, + { + "epoch": 0.7896974490804824, + "step": 7987, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7896974490804824, + "step": 7987, + "train/total_loss": 0.13504141569137573 + }, + { + "entropy": 8.751646041870117, + "epoch": 0.789796321929998, + "mean_token_accuracy": 0.7278287410736084, + "num_tokens": 20767694.0, + "step": 7988, + "train/ce_loss": 1.356079906145169e-06 + }, + { + "epoch": 0.789796321929998, + "step": 7988, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.789796321929998, + "step": 7988, + "train/total_loss": 0.027343885973095894 + }, + { + "entropy": 9.12930679321289, + "epoch": 0.7898951947795135, + "mean_token_accuracy": 0.7540983557701111, + "num_tokens": 20772824.0, + "step": 7989, + "train/ce_loss": 0.7866105437278748 + }, + { + "epoch": 0.7898951947795135, + "step": 7989, + "train/sim_loss": 0.14453125 + }, + { + "epoch": 0.7898951947795135, + "step": 7989, + "train/total_loss": 0.22319230437278748 + }, + { + "entropy": 8.300350189208984, + "epoch": 0.789994067629029, + "mean_token_accuracy": 0.7761467695236206, + "num_tokens": 20778429.0, + "step": 7990, + "train/ce_loss": 0.7868548631668091 + }, + { + "epoch": 0.789994067629029, + "step": 7990, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.789994067629029, + "step": 7990, + "train/total_loss": 0.14899799227714539 + }, + { + "entropy": 8.613931655883789, + "epoch": 0.7900929404785446, + "mean_token_accuracy": 0.8106951713562012, + "num_tokens": 20783832.0, + "step": 7991, + "train/ce_loss": 0.6341633796691895 + }, + { + "epoch": 0.7900929404785446, + "step": 7991, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7900929404785446, + "step": 7991, + "train/total_loss": 0.07904133945703506 + }, + { + "entropy": 8.594196319580078, + "epoch": 0.7901918133280601, + "mean_token_accuracy": 0.7136514782905579, + "num_tokens": 20789196.0, + "step": 7992, + "train/ce_loss": 1.3270422220230103 + }, + { + "epoch": 0.7901918133280601, + "step": 7992, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7901918133280601, + "step": 7992, + "train/total_loss": 0.2030167281627655 + }, + { + "entropy": 8.562560081481934, + "epoch": 0.7902906861775756, + "mean_token_accuracy": 0.7399267554283142, + "num_tokens": 20794525.0, + "step": 7993, + "train/ce_loss": 0.4412449896335602 + }, + { + "epoch": 0.7902906861775756, + "step": 7993, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.7902906861775756, + "step": 7993, + "train/total_loss": 0.05584324896335602 + }, + { + "entropy": 8.837336540222168, + "epoch": 0.7903895590270912, + "mean_token_accuracy": 0.7310705184936523, + "num_tokens": 20800023.0, + "step": 7994, + "train/ce_loss": 0.6181901693344116 + }, + { + "epoch": 0.7903895590270912, + "step": 7994, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7903895590270912, + "step": 7994, + "train/total_loss": 0.13213151693344116 + }, + { + "entropy": 8.306492805480957, + "epoch": 0.7904884318766067, + "mean_token_accuracy": 0.8111979365348816, + "num_tokens": 20805260.0, + "step": 7995, + "train/ce_loss": 2.1152385670575313e-05 + }, + { + "epoch": 0.7904884318766067, + "step": 7995, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7904884318766067, + "step": 7995, + "train/total_loss": 0.039064615964889526 + }, + { + "entropy": 8.57056999206543, + "epoch": 0.7905873047261222, + "mean_token_accuracy": 0.7406143546104431, + "num_tokens": 20810589.0, + "step": 7996, + "train/ce_loss": 0.44312337040901184 + }, + { + "epoch": 0.7905873047261222, + "step": 7996, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7905873047261222, + "step": 7996, + "train/total_loss": 0.06384359300136566 + }, + { + "entropy": 8.586959838867188, + "epoch": 0.7906861775756377, + "mean_token_accuracy": 0.7104018926620483, + "num_tokens": 20815889.0, + "step": 7997, + "train/ce_loss": 1.5188014507293701 + }, + { + "epoch": 0.7906861775756377, + "step": 7997, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7906861775756377, + "step": 7997, + "train/total_loss": 0.194848895072937 + }, + { + "entropy": 8.29849910736084, + "epoch": 0.7907850504251532, + "mean_token_accuracy": 0.7167235612869263, + "num_tokens": 20821299.0, + "step": 7998, + "train/ce_loss": 0.7832455039024353 + }, + { + "epoch": 0.7907850504251532, + "step": 7998, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7907850504251532, + "step": 7998, + "train/total_loss": 0.129105806350708 + }, + { + "entropy": 8.774063110351562, + "epoch": 0.7908839232746687, + "mean_token_accuracy": 0.7605262994766235, + "num_tokens": 20826559.0, + "step": 7999, + "train/ce_loss": 0.4658834636211395 + }, + { + "epoch": 0.7908839232746687, + "step": 7999, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7908839232746687, + "step": 7999, + "train/total_loss": 0.09736959636211395 + }, + { + "epoch": 0.7909827961241843, + "grad_norm": 0.6165838837623596, + "learning_rate": 8.024773772437325e-06, + "loss": 0.1267, + "step": 8000 + }, + { + "entropy": 8.382328987121582, + "epoch": 0.7909827961241843, + "mean_token_accuracy": 0.6791630387306213, + "num_tokens": 20832156.0, + "step": 8000, + "train/ce_loss": 1.113782286643982 + }, + { + "epoch": 0.7909827961241843, + "step": 8000, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.7909827961241843, + "step": 8000, + "train/total_loss": 0.19731572270393372 + }, + { + "entropy": 8.665933609008789, + "epoch": 0.7910816689736998, + "mean_token_accuracy": 0.7242206335067749, + "num_tokens": 20837632.0, + "step": 8001, + "train/ce_loss": 1.1206034421920776 + }, + { + "epoch": 0.7910816689736998, + "step": 8001, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.7910816689736998, + "step": 8001, + "train/total_loss": 0.1862790882587433 + }, + { + "entropy": 8.408807754516602, + "epoch": 0.7911805418232154, + "mean_token_accuracy": 0.7311936020851135, + "num_tokens": 20843124.0, + "step": 8002, + "train/ce_loss": 1.1915003061294556 + }, + { + "epoch": 0.7911805418232154, + "step": 8002, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.7911805418232154, + "step": 8002, + "train/total_loss": 0.2129000425338745 + }, + { + "entropy": 8.198397636413574, + "epoch": 0.7912794146727309, + "mean_token_accuracy": 0.8179509043693542, + "num_tokens": 20848756.0, + "step": 8003, + "train/ce_loss": 0.5002986788749695 + }, + { + "epoch": 0.7912794146727309, + "step": 8003, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7912794146727309, + "step": 8003, + "train/total_loss": 0.06956112384796143 + }, + { + "entropy": 8.506534576416016, + "epoch": 0.7913782875222464, + "mean_token_accuracy": 0.7067209482192993, + "num_tokens": 20854222.0, + "step": 8004, + "train/ce_loss": 0.7447643876075745 + }, + { + "epoch": 0.7913782875222464, + "step": 8004, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7913782875222464, + "step": 8004, + "train/total_loss": 0.1369764506816864 + }, + { + "entropy": 8.983867645263672, + "epoch": 0.791477160371762, + "mean_token_accuracy": 0.7689393758773804, + "num_tokens": 20859472.0, + "step": 8005, + "train/ce_loss": 0.9216192960739136 + }, + { + "epoch": 0.791477160371762, + "step": 8005, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.791477160371762, + "step": 8005, + "train/total_loss": 0.20153692364692688 + }, + { + "entropy": 8.91226577758789, + "epoch": 0.7915760332212775, + "mean_token_accuracy": 0.7316715717315674, + "num_tokens": 20864596.0, + "step": 8006, + "train/ce_loss": 1.0091112852096558 + }, + { + "epoch": 0.7915760332212775, + "step": 8006, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.7915760332212775, + "step": 8006, + "train/total_loss": 0.19466114044189453 + }, + { + "entropy": 9.32400894165039, + "epoch": 0.7916749060707929, + "mean_token_accuracy": 0.7775735259056091, + "num_tokens": 20869586.0, + "step": 8007, + "train/ce_loss": 2.9520026600948768e-06 + }, + { + "epoch": 0.7916749060707929, + "step": 8007, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7916749060707929, + "step": 8007, + "train/total_loss": 0.05078154429793358 + }, + { + "entropy": 8.588582992553711, + "epoch": 0.7917737789203085, + "mean_token_accuracy": 0.7236692905426025, + "num_tokens": 20874962.0, + "step": 8008, + "train/ce_loss": 1.1916790008544922 + }, + { + "epoch": 0.7917737789203085, + "step": 8008, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7917737789203085, + "step": 8008, + "train/total_loss": 0.13869914412498474 + }, + { + "entropy": 8.967981338500977, + "epoch": 0.791872651769824, + "mean_token_accuracy": 0.753926694393158, + "num_tokens": 20880203.0, + "step": 8009, + "train/ce_loss": 0.5056743621826172 + }, + { + "epoch": 0.791872651769824, + "step": 8009, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.791872651769824, + "step": 8009, + "train/total_loss": 0.11697368323802948 + }, + { + "entropy": 9.474925994873047, + "epoch": 0.7919715246193395, + "mean_token_accuracy": 0.7214699983596802, + "num_tokens": 20885184.0, + "step": 8010, + "train/ce_loss": 1.2627744674682617 + }, + { + "epoch": 0.7919715246193395, + "step": 8010, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7919715246193395, + "step": 8010, + "train/total_loss": 0.18487119674682617 + }, + { + "entropy": 9.434576988220215, + "epoch": 0.7920703974688551, + "mean_token_accuracy": 0.6704225540161133, + "num_tokens": 20889996.0, + "step": 8011, + "train/ce_loss": 0.5485074520111084 + }, + { + "epoch": 0.7920703974688551, + "step": 8011, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.7920703974688551, + "step": 8011, + "train/total_loss": 0.1290694922208786 + }, + { + "entropy": 8.907894134521484, + "epoch": 0.7921692703183706, + "mean_token_accuracy": 0.7462499737739563, + "num_tokens": 20895250.0, + "step": 8012, + "train/ce_loss": 0.9570124745368958 + }, + { + "epoch": 0.7921692703183706, + "step": 8012, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7921692703183706, + "step": 8012, + "train/total_loss": 0.11523249745368958 + }, + { + "entropy": 8.427125930786133, + "epoch": 0.7922681431678861, + "mean_token_accuracy": 0.7129337787628174, + "num_tokens": 20900657.0, + "step": 8013, + "train/ce_loss": 1.1414391994476318 + }, + { + "epoch": 0.7922681431678861, + "step": 8013, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7922681431678861, + "step": 8013, + "train/total_loss": 0.16101892292499542 + }, + { + "entropy": 8.855546951293945, + "epoch": 0.7923670160174017, + "mean_token_accuracy": 0.7314356565475464, + "num_tokens": 20905914.0, + "step": 8014, + "train/ce_loss": 1.0533287525177002 + }, + { + "epoch": 0.7923670160174017, + "step": 8014, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7923670160174017, + "step": 8014, + "train/total_loss": 0.1483016312122345 + }, + { + "entropy": 9.2638521194458, + "epoch": 0.7924658888669172, + "mean_token_accuracy": 0.7286245226860046, + "num_tokens": 20910943.0, + "step": 8015, + "train/ce_loss": 0.7868447303771973 + }, + { + "epoch": 0.7924658888669172, + "step": 8015, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7924658888669172, + "step": 8015, + "train/total_loss": 0.1333719789981842 + }, + { + "entropy": 8.576969146728516, + "epoch": 0.7925647617164326, + "mean_token_accuracy": 0.720710039138794, + "num_tokens": 20916322.0, + "step": 8016, + "train/ce_loss": 0.5343302488327026 + }, + { + "epoch": 0.7925647617164326, + "step": 8016, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7925647617164326, + "step": 8016, + "train/total_loss": 0.08858928084373474 + }, + { + "entropy": 8.793830871582031, + "epoch": 0.7926636345659482, + "mean_token_accuracy": 0.7413554787635803, + "num_tokens": 20921481.0, + "step": 8017, + "train/ce_loss": 0.7343287467956543 + }, + { + "epoch": 0.7926636345659482, + "step": 8017, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7926636345659482, + "step": 8017, + "train/total_loss": 0.12421412765979767 + }, + { + "entropy": 8.47189712524414, + "epoch": 0.7927625074154637, + "mean_token_accuracy": 0.7388414144515991, + "num_tokens": 20927218.0, + "step": 8018, + "train/ce_loss": 0.5181703567504883 + }, + { + "epoch": 0.7927625074154637, + "step": 8018, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.7927625074154637, + "step": 8018, + "train/total_loss": 0.11822328716516495 + }, + { + "entropy": 8.603368759155273, + "epoch": 0.7928613802649792, + "mean_token_accuracy": 0.7450058460235596, + "num_tokens": 20932722.0, + "step": 8019, + "train/ce_loss": 0.7076160907745361 + }, + { + "epoch": 0.7928613802649792, + "step": 8019, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7928613802649792, + "step": 8019, + "train/total_loss": 0.10591786354780197 + }, + { + "epoch": 0.7929602531144948, + "grad_norm": 0.6106243133544922, + "learning_rate": 8.019828907679375e-06, + "loss": 0.1447, + "step": 8020 + }, + { + "entropy": 8.636609077453613, + "epoch": 0.7929602531144948, + "mean_token_accuracy": 0.6854565739631653, + "num_tokens": 20938063.0, + "step": 8020, + "train/ce_loss": 1.7223507165908813 + }, + { + "epoch": 0.7929602531144948, + "step": 8020, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7929602531144948, + "step": 8020, + "train/total_loss": 0.21129757165908813 + }, + { + "entropy": 8.813518524169922, + "epoch": 0.7930591259640103, + "mean_token_accuracy": 0.7285902500152588, + "num_tokens": 20943288.0, + "step": 8021, + "train/ce_loss": 0.8579605221748352 + }, + { + "epoch": 0.7930591259640103, + "step": 8021, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7930591259640103, + "step": 8021, + "train/total_loss": 0.136577308177948 + }, + { + "entropy": 8.812105178833008, + "epoch": 0.7931579988135258, + "mean_token_accuracy": 0.7507122755050659, + "num_tokens": 20948458.0, + "step": 8022, + "train/ce_loss": 1.2979592084884644 + }, + { + "epoch": 0.7931579988135258, + "step": 8022, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7931579988135258, + "step": 8022, + "train/total_loss": 0.15713967382907867 + }, + { + "entropy": 8.767423629760742, + "epoch": 0.7932568716630414, + "mean_token_accuracy": 0.7603305578231812, + "num_tokens": 20953932.0, + "step": 8023, + "train/ce_loss": 0.4973965585231781 + }, + { + "epoch": 0.7932568716630414, + "step": 8023, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7932568716630414, + "step": 8023, + "train/total_loss": 0.07317715883255005 + }, + { + "entropy": 8.79323673248291, + "epoch": 0.7933557445125569, + "mean_token_accuracy": 0.7204058766365051, + "num_tokens": 20959292.0, + "step": 8024, + "train/ce_loss": 0.7244845628738403 + }, + { + "epoch": 0.7933557445125569, + "step": 8024, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7933557445125569, + "step": 8024, + "train/total_loss": 0.09588595479726791 + }, + { + "entropy": 8.509698867797852, + "epoch": 0.7934546173620723, + "mean_token_accuracy": 0.7055630683898926, + "num_tokens": 20964491.0, + "step": 8025, + "train/ce_loss": 0.5640966892242432 + }, + { + "epoch": 0.7934546173620723, + "step": 8025, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7934546173620723, + "step": 8025, + "train/total_loss": 0.10719092190265656 + }, + { + "entropy": 8.53420352935791, + "epoch": 0.7935534902115879, + "mean_token_accuracy": 0.7690721750259399, + "num_tokens": 20969951.0, + "step": 8026, + "train/ce_loss": 0.4616752564907074 + }, + { + "epoch": 0.7935534902115879, + "step": 8026, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7935534902115879, + "step": 8026, + "train/total_loss": 0.0735112726688385 + }, + { + "entropy": 9.335734367370605, + "epoch": 0.7936523630611034, + "mean_token_accuracy": 0.7820267677307129, + "num_tokens": 20974926.0, + "step": 8027, + "train/ce_loss": 1.5045855045318604 + }, + { + "epoch": 0.7936523630611034, + "step": 8027, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.7936523630611034, + "step": 8027, + "train/total_loss": 0.23639605939388275 + }, + { + "entropy": 8.86979866027832, + "epoch": 0.7937512359106189, + "mean_token_accuracy": 0.7473560571670532, + "num_tokens": 20980290.0, + "step": 8028, + "train/ce_loss": 0.551834762096405 + }, + { + "epoch": 0.7937512359106189, + "step": 8028, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7937512359106189, + "step": 8028, + "train/total_loss": 0.09033972769975662 + }, + { + "entropy": 9.030107498168945, + "epoch": 0.7938501087601345, + "mean_token_accuracy": 0.7059679627418518, + "num_tokens": 20985458.0, + "step": 8029, + "train/ce_loss": 0.9660660624504089 + }, + { + "epoch": 0.7938501087601345, + "step": 8029, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7938501087601345, + "step": 8029, + "train/total_loss": 0.11613785475492477 + }, + { + "entropy": 9.329030990600586, + "epoch": 0.79394898160965, + "mean_token_accuracy": 0.7026239037513733, + "num_tokens": 20990211.0, + "step": 8030, + "train/ce_loss": 2.4383382424275624e-06 + }, + { + "epoch": 0.79394898160965, + "step": 8030, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.79394898160965, + "step": 8030, + "train/total_loss": 0.0390627421438694 + }, + { + "entropy": 9.173990249633789, + "epoch": 0.7940478544591655, + "mean_token_accuracy": 0.7726597189903259, + "num_tokens": 20995291.0, + "step": 8031, + "train/ce_loss": 0.7765879034996033 + }, + { + "epoch": 0.7940478544591655, + "step": 8031, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7940478544591655, + "step": 8031, + "train/total_loss": 0.12844005227088928 + }, + { + "entropy": 9.423505783081055, + "epoch": 0.7941467273086811, + "mean_token_accuracy": 0.6603773832321167, + "num_tokens": 21000213.0, + "step": 8032, + "train/ce_loss": 4.033063305541873e-06 + }, + { + "epoch": 0.7941467273086811, + "step": 8032, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7941467273086811, + "step": 8032, + "train/total_loss": 0.054687902331352234 + }, + { + "entropy": 8.714384078979492, + "epoch": 0.7942456001581966, + "mean_token_accuracy": 0.744516134262085, + "num_tokens": 21005463.0, + "step": 8033, + "train/ce_loss": 0.688065767288208 + }, + { + "epoch": 0.7942456001581966, + "step": 8033, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7942456001581966, + "step": 8033, + "train/total_loss": 0.09615033119916916 + }, + { + "entropy": 8.982434272766113, + "epoch": 0.794344473007712, + "mean_token_accuracy": 0.6989721059799194, + "num_tokens": 21010597.0, + "step": 8034, + "train/ce_loss": 1.1527574062347412 + }, + { + "epoch": 0.794344473007712, + "step": 8034, + "train/sim_loss": 0.14453125 + }, + { + "epoch": 0.794344473007712, + "step": 8034, + "train/total_loss": 0.2598069906234741 + }, + { + "entropy": 9.549768447875977, + "epoch": 0.7944433458572276, + "mean_token_accuracy": 0.6770833134651184, + "num_tokens": 21015279.0, + "step": 8035, + "train/ce_loss": 4.862302830588305e-06 + }, + { + "epoch": 0.7944433458572276, + "step": 8035, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7944433458572276, + "step": 8035, + "train/total_loss": 0.0546879880130291 + }, + { + "entropy": 8.494789123535156, + "epoch": 0.7945422187067431, + "mean_token_accuracy": 0.7814776301383972, + "num_tokens": 21020720.0, + "step": 8036, + "train/ce_loss": 0.793206512928009 + }, + { + "epoch": 0.7945422187067431, + "step": 8036, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7945422187067431, + "step": 8036, + "train/total_loss": 0.13400815427303314 + }, + { + "entropy": 8.888826370239258, + "epoch": 0.7946410915562586, + "mean_token_accuracy": 0.7537091970443726, + "num_tokens": 21025807.0, + "step": 8037, + "train/ce_loss": 0.4757600724697113 + }, + { + "epoch": 0.7946410915562586, + "step": 8037, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.7946410915562586, + "step": 8037, + "train/total_loss": 0.14132601022720337 + }, + { + "entropy": 8.643922805786133, + "epoch": 0.7947399644057742, + "mean_token_accuracy": 0.7627695798873901, + "num_tokens": 21031175.0, + "step": 8038, + "train/ce_loss": 0.4287779927253723 + }, + { + "epoch": 0.7947399644057742, + "step": 8038, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7947399644057742, + "step": 8038, + "train/total_loss": 0.06240905076265335 + }, + { + "entropy": 8.62867546081543, + "epoch": 0.7948388372552897, + "mean_token_accuracy": 0.7097480893135071, + "num_tokens": 21036536.0, + "step": 8039, + "train/ce_loss": 1.2380282878875732 + }, + { + "epoch": 0.7948388372552897, + "step": 8039, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.7948388372552897, + "step": 8039, + "train/total_loss": 0.2644278407096863 + }, + { + "epoch": 0.7949377101048052, + "grad_norm": 0.6808719635009766, + "learning_rate": 8.014884042921427e-06, + "loss": 0.1365, + "step": 8040 + }, + { + "entropy": 9.270933151245117, + "epoch": 0.7949377101048052, + "mean_token_accuracy": 0.7643442749977112, + "num_tokens": 21041484.0, + "step": 8040, + "train/ce_loss": 1.065250277519226 + }, + { + "epoch": 0.7949377101048052, + "step": 8040, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7949377101048052, + "step": 8040, + "train/total_loss": 0.14558753371238708 + }, + { + "entropy": 8.76633358001709, + "epoch": 0.7950365829543208, + "mean_token_accuracy": 0.7448609471321106, + "num_tokens": 21046722.0, + "step": 8041, + "train/ce_loss": 0.8189980387687683 + }, + { + "epoch": 0.7950365829543208, + "step": 8041, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7950365829543208, + "step": 8041, + "train/total_loss": 0.12096230685710907 + }, + { + "entropy": 9.02629280090332, + "epoch": 0.7951354558038363, + "mean_token_accuracy": 0.7824143171310425, + "num_tokens": 21051861.0, + "step": 8042, + "train/ce_loss": 1.332713007926941 + }, + { + "epoch": 0.7951354558038363, + "step": 8042, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.7951354558038363, + "step": 8042, + "train/total_loss": 0.19967755675315857 + }, + { + "entropy": 8.792691230773926, + "epoch": 0.7952343286533518, + "mean_token_accuracy": 0.7832929491996765, + "num_tokens": 21057169.0, + "step": 8043, + "train/ce_loss": 0.7716123461723328 + }, + { + "epoch": 0.7952343286533518, + "step": 8043, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.7952343286533518, + "step": 8043, + "train/total_loss": 0.10059873759746552 + }, + { + "entropy": 8.570995330810547, + "epoch": 0.7953332015028673, + "mean_token_accuracy": 0.7436932325363159, + "num_tokens": 21062602.0, + "step": 8044, + "train/ce_loss": 1.0313103199005127 + }, + { + "epoch": 0.7953332015028673, + "step": 8044, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7953332015028673, + "step": 8044, + "train/total_loss": 0.1500060260295868 + }, + { + "entropy": 8.827428817749023, + "epoch": 0.7954320743523828, + "mean_token_accuracy": 0.7553072571754456, + "num_tokens": 21067957.0, + "step": 8045, + "train/ce_loss": 0.8951282501220703 + }, + { + "epoch": 0.7954320743523828, + "step": 8045, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.7954320743523828, + "step": 8045, + "train/total_loss": 0.17935657501220703 + }, + { + "entropy": 8.760117530822754, + "epoch": 0.7955309472018983, + "mean_token_accuracy": 0.7811704874038696, + "num_tokens": 21073205.0, + "step": 8046, + "train/ce_loss": 1.0642935037612915 + }, + { + "epoch": 0.7955309472018983, + "step": 8046, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7955309472018983, + "step": 8046, + "train/total_loss": 0.1493981033563614 + }, + { + "entropy": 8.706624031066895, + "epoch": 0.7956298200514139, + "mean_token_accuracy": 0.7051442861557007, + "num_tokens": 21078485.0, + "step": 8047, + "train/ce_loss": 0.5215659141540527 + }, + { + "epoch": 0.7956298200514139, + "step": 8047, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.7956298200514139, + "step": 8047, + "train/total_loss": 0.12637534737586975 + }, + { + "entropy": 8.552886962890625, + "epoch": 0.7957286929009294, + "mean_token_accuracy": 0.769328236579895, + "num_tokens": 21083766.0, + "step": 8048, + "train/ce_loss": 0.7825904488563538 + }, + { + "epoch": 0.7957286929009294, + "step": 8048, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7957286929009294, + "step": 8048, + "train/total_loss": 0.13294655084609985 + }, + { + "entropy": 8.60584831237793, + "epoch": 0.7958275657504449, + "mean_token_accuracy": 0.740899384021759, + "num_tokens": 21089199.0, + "step": 8049, + "train/ce_loss": 0.49789944291114807 + }, + { + "epoch": 0.7958275657504449, + "step": 8049, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7958275657504449, + "step": 8049, + "train/total_loss": 0.10447745025157928 + }, + { + "entropy": 8.944599151611328, + "epoch": 0.7959264385999605, + "mean_token_accuracy": 0.7732843160629272, + "num_tokens": 21094487.0, + "step": 8050, + "train/ce_loss": 1.007731318473816 + }, + { + "epoch": 0.7959264385999605, + "step": 8050, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7959264385999605, + "step": 8050, + "train/total_loss": 0.13202312588691711 + }, + { + "entropy": 8.915923118591309, + "epoch": 0.796025311449476, + "mean_token_accuracy": 0.7136498689651489, + "num_tokens": 21099605.0, + "step": 8051, + "train/ce_loss": 1.4537798166275024 + }, + { + "epoch": 0.796025311449476, + "step": 8051, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.796025311449476, + "step": 8051, + "train/total_loss": 0.219596728682518 + }, + { + "entropy": 9.284667015075684, + "epoch": 0.7961241842989915, + "mean_token_accuracy": 0.762666642665863, + "num_tokens": 21104423.0, + "step": 8052, + "train/ce_loss": 1.6215224266052246 + }, + { + "epoch": 0.7961241842989915, + "step": 8052, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7961241842989915, + "step": 8052, + "train/total_loss": 0.1973084956407547 + }, + { + "entropy": 9.210882186889648, + "epoch": 0.796223057148507, + "mean_token_accuracy": 0.7402597665786743, + "num_tokens": 21109519.0, + "step": 8053, + "train/ce_loss": 1.3008627891540527 + }, + { + "epoch": 0.796223057148507, + "step": 8053, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.796223057148507, + "step": 8053, + "train/total_loss": 0.235555037856102 + }, + { + "entropy": 9.005620956420898, + "epoch": 0.7963219299980225, + "mean_token_accuracy": 0.7664974331855774, + "num_tokens": 21114766.0, + "step": 8054, + "train/ce_loss": 0.9678490161895752 + }, + { + "epoch": 0.7963219299980225, + "step": 8054, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7963219299980225, + "step": 8054, + "train/total_loss": 0.13975365459918976 + }, + { + "entropy": 8.489038467407227, + "epoch": 0.796420802847538, + "mean_token_accuracy": 0.7626546621322632, + "num_tokens": 21120156.0, + "step": 8055, + "train/ce_loss": 0.6151663661003113 + }, + { + "epoch": 0.796420802847538, + "step": 8055, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.796420802847538, + "step": 8055, + "train/total_loss": 0.1747978925704956 + }, + { + "entropy": 9.020843505859375, + "epoch": 0.7965196756970536, + "mean_token_accuracy": 0.682539701461792, + "num_tokens": 21125371.0, + "step": 8056, + "train/ce_loss": 1.6898895502090454 + }, + { + "epoch": 0.7965196756970536, + "step": 8056, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.7965196756970536, + "step": 8056, + "train/total_loss": 0.24711395800113678 + }, + { + "entropy": 9.3050537109375, + "epoch": 0.7966185485465691, + "mean_token_accuracy": 0.7439446449279785, + "num_tokens": 21130385.0, + "step": 8057, + "train/ce_loss": 1.4615914821624756 + }, + { + "epoch": 0.7966185485465691, + "step": 8057, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7966185485465691, + "step": 8057, + "train/total_loss": 0.19694040715694427 + }, + { + "entropy": 9.728808403015137, + "epoch": 0.7967174213960846, + "mean_token_accuracy": 0.7196765542030334, + "num_tokens": 21135193.0, + "step": 8058, + "train/ce_loss": 8.455596798739862e-06 + }, + { + "epoch": 0.7967174213960846, + "step": 8058, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7967174213960846, + "step": 8058, + "train/total_loss": 0.04296959564089775 + }, + { + "entropy": 8.709403038024902, + "epoch": 0.7968162942456002, + "mean_token_accuracy": 0.7583047151565552, + "num_tokens": 21140515.0, + "step": 8059, + "train/ce_loss": 0.709441065788269 + }, + { + "epoch": 0.7968162942456002, + "step": 8059, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.7968162942456002, + "step": 8059, + "train/total_loss": 0.10610035806894302 + }, + { + "epoch": 0.7969151670951157, + "grad_norm": 0.6506040692329407, + "learning_rate": 8.009939178163478e-06, + "loss": 0.1338, + "step": 8060 + }, + { + "entropy": 8.993186950683594, + "epoch": 0.7969151670951157, + "mean_token_accuracy": 0.7208976149559021, + "num_tokens": 21145651.0, + "step": 8060, + "train/ce_loss": 0.6721799373626709 + }, + { + "epoch": 0.7969151670951157, + "step": 8060, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.7969151670951157, + "step": 8060, + "train/total_loss": 0.15315550565719604 + }, + { + "entropy": 9.022912979125977, + "epoch": 0.7970140399446312, + "mean_token_accuracy": 0.6871657967567444, + "num_tokens": 21150860.0, + "step": 8061, + "train/ce_loss": 1.7690759897232056 + }, + { + "epoch": 0.7970140399446312, + "step": 8061, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7970140399446312, + "step": 8061, + "train/total_loss": 0.21597009897232056 + }, + { + "entropy": 9.806182861328125, + "epoch": 0.7971129127941468, + "mean_token_accuracy": 0.8284023404121399, + "num_tokens": 21155635.0, + "step": 8062, + "train/ce_loss": 5.1401830205577426e-06 + }, + { + "epoch": 0.7971129127941468, + "step": 8062, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.7971129127941468, + "step": 8062, + "train/total_loss": 0.015625514090061188 + }, + { + "entropy": 9.324712753295898, + "epoch": 0.7972117856436622, + "mean_token_accuracy": 0.7597172856330872, + "num_tokens": 21160607.0, + "step": 8063, + "train/ce_loss": 1.064432978630066 + }, + { + "epoch": 0.7972117856436622, + "step": 8063, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7972117856436622, + "step": 8063, + "train/total_loss": 0.15722455084323883 + }, + { + "entropy": 8.54730224609375, + "epoch": 0.7973106584931777, + "mean_token_accuracy": 0.7011363506317139, + "num_tokens": 21165949.0, + "step": 8064, + "train/ce_loss": 0.995823085308075 + }, + { + "epoch": 0.7973106584931777, + "step": 8064, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7973106584931777, + "step": 8064, + "train/total_loss": 0.15426981449127197 + }, + { + "entropy": 8.729358673095703, + "epoch": 0.7974095313426933, + "mean_token_accuracy": 0.7662790417671204, + "num_tokens": 21171319.0, + "step": 8065, + "train/ce_loss": 0.858303427696228 + }, + { + "epoch": 0.7974095313426933, + "step": 8065, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.7974095313426933, + "step": 8065, + "train/total_loss": 0.13270534574985504 + }, + { + "entropy": 8.9976806640625, + "epoch": 0.7975084041922088, + "mean_token_accuracy": 0.7122905254364014, + "num_tokens": 21176505.0, + "step": 8066, + "train/ce_loss": 2.1025643348693848 + }, + { + "epoch": 0.7975084041922088, + "step": 8066, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.7975084041922088, + "step": 8066, + "train/total_loss": 0.2922877073287964 + }, + { + "entropy": 8.699651718139648, + "epoch": 0.7976072770417243, + "mean_token_accuracy": 0.7576754093170166, + "num_tokens": 21181869.0, + "step": 8067, + "train/ce_loss": 1.1232519149780273 + }, + { + "epoch": 0.7976072770417243, + "step": 8067, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.7976072770417243, + "step": 8067, + "train/total_loss": 0.24513769149780273 + }, + { + "entropy": 8.542829513549805, + "epoch": 0.7977061498912399, + "mean_token_accuracy": 0.694774329662323, + "num_tokens": 21187195.0, + "step": 8068, + "train/ce_loss": 1.2314989566802979 + }, + { + "epoch": 0.7977061498912399, + "step": 8068, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7977061498912399, + "step": 8068, + "train/total_loss": 0.17393115162849426 + }, + { + "entropy": 9.735811233520508, + "epoch": 0.7978050227407554, + "mean_token_accuracy": 0.7394958138465881, + "num_tokens": 21191956.0, + "step": 8069, + "train/ce_loss": 1.8081225156784058 + }, + { + "epoch": 0.7978050227407554, + "step": 8069, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.7978050227407554, + "step": 8069, + "train/total_loss": 0.24331225454807281 + }, + { + "entropy": 9.444601058959961, + "epoch": 0.7979038955902709, + "mean_token_accuracy": 0.7428571581840515, + "num_tokens": 21196910.0, + "step": 8070, + "train/ce_loss": 2.05026503863337e-06 + }, + { + "epoch": 0.7979038955902709, + "step": 8070, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.7979038955902709, + "step": 8070, + "train/total_loss": 0.019531454890966415 + }, + { + "entropy": 9.299373626708984, + "epoch": 0.7980027684397865, + "mean_token_accuracy": 0.7241379022598267, + "num_tokens": 21201912.0, + "step": 8071, + "train/ce_loss": 1.0788629055023193 + }, + { + "epoch": 0.7980027684397865, + "step": 8071, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.7980027684397865, + "step": 8071, + "train/total_loss": 0.17819878458976746 + }, + { + "entropy": 9.13897705078125, + "epoch": 0.7981016412893019, + "mean_token_accuracy": 0.744966447353363, + "num_tokens": 21206880.0, + "step": 8072, + "train/ce_loss": 3.742259877981269e-06 + }, + { + "epoch": 0.7981016412893019, + "step": 8072, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7981016412893019, + "step": 8072, + "train/total_loss": 0.039062872529029846 + }, + { + "entropy": 8.684549331665039, + "epoch": 0.7982005141388174, + "mean_token_accuracy": 0.764011800289154, + "num_tokens": 21212030.0, + "step": 8073, + "train/ce_loss": 0.8428314328193665 + }, + { + "epoch": 0.7982005141388174, + "step": 8073, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.7982005141388174, + "step": 8073, + "train/total_loss": 0.12725189328193665 + }, + { + "entropy": 9.31668758392334, + "epoch": 0.798299386988333, + "mean_token_accuracy": 0.77625572681427, + "num_tokens": 21217085.0, + "step": 8074, + "train/ce_loss": 5.788366706838133e-06 + }, + { + "epoch": 0.798299386988333, + "step": 8074, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.798299386988333, + "step": 8074, + "train/total_loss": 0.08593808114528656 + }, + { + "entropy": 8.60556411743164, + "epoch": 0.7983982598378485, + "mean_token_accuracy": 0.7404305934906006, + "num_tokens": 21222449.0, + "step": 8075, + "train/ce_loss": 0.6787205934524536 + }, + { + "epoch": 0.7983982598378485, + "step": 8075, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.7983982598378485, + "step": 8075, + "train/total_loss": 0.1616220623254776 + }, + { + "entropy": 9.245166778564453, + "epoch": 0.798497132687364, + "mean_token_accuracy": 0.7532051205635071, + "num_tokens": 21227519.0, + "step": 8076, + "train/ce_loss": 1.3790321350097656 + }, + { + "epoch": 0.798497132687364, + "step": 8076, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.798497132687364, + "step": 8076, + "train/total_loss": 0.24337196350097656 + }, + { + "entropy": 8.748480796813965, + "epoch": 0.7985960055368796, + "mean_token_accuracy": 0.7029339671134949, + "num_tokens": 21232773.0, + "step": 8077, + "train/ce_loss": 0.5744321346282959 + }, + { + "epoch": 0.7985960055368796, + "step": 8077, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.7985960055368796, + "step": 8077, + "train/total_loss": 0.15900571644306183 + }, + { + "entropy": 8.951897621154785, + "epoch": 0.7986948783863951, + "mean_token_accuracy": 0.6483516693115234, + "num_tokens": 21237912.0, + "step": 8078, + "train/ce_loss": 1.4018466472625732 + }, + { + "epoch": 0.7986948783863951, + "step": 8078, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.7986948783863951, + "step": 8078, + "train/total_loss": 0.2144034206867218 + }, + { + "entropy": 9.206893920898438, + "epoch": 0.7987937512359106, + "mean_token_accuracy": 0.7587253451347351, + "num_tokens": 21243002.0, + "step": 8079, + "train/ce_loss": 1.1328591108322144 + }, + { + "epoch": 0.7987937512359106, + "step": 8079, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.7987937512359106, + "step": 8079, + "train/total_loss": 0.16797341406345367 + }, + { + "epoch": 0.7988926240854262, + "grad_norm": 0.7565287947654724, + "learning_rate": 8.004994313405528e-06, + "loss": 0.1375, + "step": 8080 + }, + { + "entropy": 8.812400817871094, + "epoch": 0.7988926240854262, + "mean_token_accuracy": 0.7281166911125183, + "num_tokens": 21248212.0, + "step": 8080, + "train/ce_loss": 0.8350350856781006 + }, + { + "epoch": 0.7988926240854262, + "step": 8080, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.7988926240854262, + "step": 8080, + "train/total_loss": 0.14209726452827454 + }, + { + "entropy": 8.763813018798828, + "epoch": 0.7989914969349416, + "mean_token_accuracy": 0.6979866027832031, + "num_tokens": 21253595.0, + "step": 8081, + "train/ce_loss": 1.2388736009597778 + }, + { + "epoch": 0.7989914969349416, + "step": 8081, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.7989914969349416, + "step": 8081, + "train/total_loss": 0.19029361009597778 + }, + { + "entropy": 9.12805461883545, + "epoch": 0.7990903697844571, + "mean_token_accuracy": 0.760869562625885, + "num_tokens": 21258659.0, + "step": 8082, + "train/ce_loss": 0.6981890201568604 + }, + { + "epoch": 0.7990903697844571, + "step": 8082, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.7990903697844571, + "step": 8082, + "train/total_loss": 0.1088814064860344 + }, + { + "entropy": 8.688815116882324, + "epoch": 0.7991892426339727, + "mean_token_accuracy": 0.7022653818130493, + "num_tokens": 21264000.0, + "step": 8083, + "train/ce_loss": 1.0230445861816406 + }, + { + "epoch": 0.7991892426339727, + "step": 8083, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.7991892426339727, + "step": 8083, + "train/total_loss": 0.15308570861816406 + }, + { + "entropy": 8.801924705505371, + "epoch": 0.7992881154834882, + "mean_token_accuracy": 0.737864077091217, + "num_tokens": 21269218.0, + "step": 8084, + "train/ce_loss": 0.7019436955451965 + }, + { + "epoch": 0.7992881154834882, + "step": 8084, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7992881154834882, + "step": 8084, + "train/total_loss": 0.09753812104463577 + }, + { + "entropy": 8.858683586120605, + "epoch": 0.7993869883330038, + "mean_token_accuracy": 0.6863309144973755, + "num_tokens": 21274397.0, + "step": 8085, + "train/ce_loss": 1.792798399925232 + }, + { + "epoch": 0.7993869883330038, + "step": 8085, + "train/sim_loss": 0.1953125 + }, + { + "epoch": 0.7993869883330038, + "step": 8085, + "train/total_loss": 0.3745923638343811 + }, + { + "entropy": 9.004164695739746, + "epoch": 0.7994858611825193, + "mean_token_accuracy": 0.7594594359397888, + "num_tokens": 21279601.0, + "step": 8086, + "train/ce_loss": 0.7385054230690002 + }, + { + "epoch": 0.7994858611825193, + "step": 8086, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.7994858611825193, + "step": 8086, + "train/total_loss": 0.18322554230690002 + }, + { + "entropy": 10.292235374450684, + "epoch": 0.7995847340320348, + "mean_token_accuracy": 1.0, + "num_tokens": 21283981.0, + "step": 8087, + "train/ce_loss": 4.2019906686618924e-05 + }, + { + "epoch": 0.7995847340320348, + "step": 8087, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.7995847340320348, + "step": 8087, + "train/total_loss": 0.06641045212745667 + }, + { + "entropy": 8.526718139648438, + "epoch": 0.7996836068815504, + "mean_token_accuracy": 0.7063829898834229, + "num_tokens": 21289353.0, + "step": 8088, + "train/ce_loss": 0.9399168491363525 + }, + { + "epoch": 0.7996836068815504, + "step": 8088, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.7996836068815504, + "step": 8088, + "train/total_loss": 0.1252416968345642 + }, + { + "entropy": 8.619998931884766, + "epoch": 0.7997824797310659, + "mean_token_accuracy": 0.7592592835426331, + "num_tokens": 21294710.0, + "step": 8089, + "train/ce_loss": 0.638979434967041 + }, + { + "epoch": 0.7997824797310659, + "step": 8089, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.7997824797310659, + "step": 8089, + "train/total_loss": 0.13030418753623962 + }, + { + "entropy": 8.820860862731934, + "epoch": 0.7998813525805814, + "mean_token_accuracy": 0.7468671798706055, + "num_tokens": 21299985.0, + "step": 8090, + "train/ce_loss": 0.9259432554244995 + }, + { + "epoch": 0.7998813525805814, + "step": 8090, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.7998813525805814, + "step": 8090, + "train/total_loss": 0.11993807554244995 + }, + { + "entropy": 8.713180541992188, + "epoch": 0.799980225430097, + "mean_token_accuracy": 0.7084826827049255, + "num_tokens": 21305282.0, + "step": 8091, + "train/ce_loss": 1.0818867683410645 + }, + { + "epoch": 0.799980225430097, + "step": 8091, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.799980225430097, + "step": 8091, + "train/total_loss": 0.1863136887550354 + }, + { + "entropy": 8.716011047363281, + "epoch": 0.8000790982796124, + "mean_token_accuracy": 0.7217788100242615, + "num_tokens": 21310640.0, + "step": 8092, + "train/ce_loss": 0.6730086803436279 + }, + { + "epoch": 0.8000790982796124, + "step": 8092, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8000790982796124, + "step": 8092, + "train/total_loss": 0.12980087101459503 + }, + { + "entropy": 8.94961166381836, + "epoch": 0.8001779711291279, + "mean_token_accuracy": 0.7549669146537781, + "num_tokens": 21315713.0, + "step": 8093, + "train/ce_loss": 1.4025733470916748 + }, + { + "epoch": 0.8001779711291279, + "step": 8093, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.8001779711291279, + "step": 8093, + "train/total_loss": 0.2222885936498642 + }, + { + "entropy": 8.678363800048828, + "epoch": 0.8002768439786435, + "mean_token_accuracy": 0.6886574029922485, + "num_tokens": 21321056.0, + "step": 8094, + "train/ce_loss": 1.2181483507156372 + }, + { + "epoch": 0.8002768439786435, + "step": 8094, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.8002768439786435, + "step": 8094, + "train/total_loss": 0.19603359699249268 + }, + { + "entropy": 8.87493896484375, + "epoch": 0.800375716828159, + "mean_token_accuracy": 0.7653213739395142, + "num_tokens": 21326136.0, + "step": 8095, + "train/ce_loss": 0.7125213742256165 + }, + { + "epoch": 0.800375716828159, + "step": 8095, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.800375716828159, + "step": 8095, + "train/total_loss": 0.08687713742256165 + }, + { + "entropy": 8.643080711364746, + "epoch": 0.8004745896776745, + "mean_token_accuracy": 0.7255638837814331, + "num_tokens": 21331607.0, + "step": 8096, + "train/ce_loss": 0.5503685474395752 + }, + { + "epoch": 0.8004745896776745, + "step": 8096, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8004745896776745, + "step": 8096, + "train/total_loss": 0.09019310772418976 + }, + { + "entropy": 8.950005531311035, + "epoch": 0.8005734625271901, + "mean_token_accuracy": 0.7805907130241394, + "num_tokens": 21336787.0, + "step": 8097, + "train/ce_loss": 0.9524298310279846 + }, + { + "epoch": 0.8005734625271901, + "step": 8097, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8005734625271901, + "step": 8097, + "train/total_loss": 0.11868048459291458 + }, + { + "entropy": 8.641998291015625, + "epoch": 0.8006723353767056, + "mean_token_accuracy": 0.7713178396224976, + "num_tokens": 21342041.0, + "step": 8098, + "train/ce_loss": 0.896429181098938 + }, + { + "epoch": 0.8006723353767056, + "step": 8098, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8006723353767056, + "step": 8098, + "train/total_loss": 0.15214291214942932 + }, + { + "entropy": 9.116812705993652, + "epoch": 0.800771208226221, + "mean_token_accuracy": 0.7391952276229858, + "num_tokens": 21347123.0, + "step": 8099, + "train/ce_loss": 3.7170384530327283e-06 + }, + { + "epoch": 0.800771208226221, + "step": 8099, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.800771208226221, + "step": 8099, + "train/total_loss": 0.023437872529029846 + }, + { + "epoch": 0.8008700810757367, + "grad_norm": 0.6553953886032104, + "learning_rate": 8.00004944864758e-06, + "loss": 0.1359, + "step": 8100 + }, + { + "entropy": 8.752761840820312, + "epoch": 0.8008700810757367, + "mean_token_accuracy": 0.7319347262382507, + "num_tokens": 21352467.0, + "step": 8100, + "train/ce_loss": 0.5254290103912354 + }, + { + "epoch": 0.8008700810757367, + "step": 8100, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8008700810757367, + "step": 8100, + "train/total_loss": 0.06816790252923965 + }, + { + "entropy": 8.795303344726562, + "epoch": 0.8009689539252521, + "mean_token_accuracy": 0.7713936567306519, + "num_tokens": 21357741.0, + "step": 8101, + "train/ce_loss": 0.5583730936050415 + }, + { + "epoch": 0.8009689539252521, + "step": 8101, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8009689539252521, + "step": 8101, + "train/total_loss": 0.12614980340003967 + }, + { + "entropy": 8.594700813293457, + "epoch": 0.8010678267747676, + "mean_token_accuracy": 0.7375565767288208, + "num_tokens": 21363153.0, + "step": 8102, + "train/ce_loss": 1.0188671350479126 + }, + { + "epoch": 0.8010678267747676, + "step": 8102, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.8010678267747676, + "step": 8102, + "train/total_loss": 0.18001171946525574 + }, + { + "entropy": 8.538590431213379, + "epoch": 0.8011666996242832, + "mean_token_accuracy": 0.7801578640937805, + "num_tokens": 21368469.0, + "step": 8103, + "train/ce_loss": 0.700299084186554 + }, + { + "epoch": 0.8011666996242832, + "step": 8103, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8011666996242832, + "step": 8103, + "train/total_loss": 0.11299865692853928 + }, + { + "entropy": 9.091485977172852, + "epoch": 0.8012655724737987, + "mean_token_accuracy": 0.7887067198753357, + "num_tokens": 21373453.0, + "step": 8104, + "train/ce_loss": 1.0037702322006226 + }, + { + "epoch": 0.8012655724737987, + "step": 8104, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.8012655724737987, + "step": 8104, + "train/total_loss": 0.17459577322006226 + }, + { + "entropy": 9.3186674118042, + "epoch": 0.8013644453233142, + "mean_token_accuracy": 0.78925621509552, + "num_tokens": 21378354.0, + "step": 8105, + "train/ce_loss": 1.93576056517486e-06 + }, + { + "epoch": 0.8013644453233142, + "step": 8105, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8013644453233142, + "step": 8105, + "train/total_loss": 0.05078144371509552 + }, + { + "entropy": 8.80603313446045, + "epoch": 0.8014633181728298, + "mean_token_accuracy": 0.7463087439537048, + "num_tokens": 21383590.0, + "step": 8106, + "train/ce_loss": 1.2183705568313599 + }, + { + "epoch": 0.8014633181728298, + "step": 8106, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.8014633181728298, + "step": 8106, + "train/total_loss": 0.1999620497226715 + }, + { + "entropy": 9.395215034484863, + "epoch": 0.8015621910223453, + "mean_token_accuracy": 0.7768595218658447, + "num_tokens": 21388364.0, + "step": 8107, + "train/ce_loss": 1.455410361289978 + }, + { + "epoch": 0.8015621910223453, + "step": 8107, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8015621910223453, + "step": 8107, + "train/total_loss": 0.19632229208946228 + }, + { + "entropy": 8.796661376953125, + "epoch": 0.8016610638718608, + "mean_token_accuracy": 0.7198124527931213, + "num_tokens": 21393700.0, + "step": 8108, + "train/ce_loss": 0.7744140625 + }, + { + "epoch": 0.8016610638718608, + "step": 8108, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.8016610638718608, + "step": 8108, + "train/total_loss": 0.17119140923023224 + }, + { + "entropy": 8.673404693603516, + "epoch": 0.8017599367213764, + "mean_token_accuracy": 0.7109470963478088, + "num_tokens": 21398962.0, + "step": 8109, + "train/ce_loss": 1.0927854776382446 + }, + { + "epoch": 0.8017599367213764, + "step": 8109, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8017599367213764, + "step": 8109, + "train/total_loss": 0.17177855968475342 + }, + { + "entropy": 8.964365005493164, + "epoch": 0.8018588095708918, + "mean_token_accuracy": 0.7806748747825623, + "num_tokens": 21404080.0, + "step": 8110, + "train/ce_loss": 3.7938925743219443e-06 + }, + { + "epoch": 0.8018588095708918, + "step": 8110, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8018588095708918, + "step": 8110, + "train/total_loss": 0.04296912997961044 + }, + { + "entropy": 8.62191104888916, + "epoch": 0.8019576824204073, + "mean_token_accuracy": 0.7578288316726685, + "num_tokens": 21409560.0, + "step": 8111, + "train/ce_loss": 0.6696912050247192 + }, + { + "epoch": 0.8019576824204073, + "step": 8111, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.8019576824204073, + "step": 8111, + "train/total_loss": 0.0865003690123558 + }, + { + "entropy": 8.387670516967773, + "epoch": 0.8020565552699229, + "mean_token_accuracy": 0.7623188495635986, + "num_tokens": 21415084.0, + "step": 8112, + "train/ce_loss": 0.8153111338615417 + }, + { + "epoch": 0.8020565552699229, + "step": 8112, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8020565552699229, + "step": 8112, + "train/total_loss": 0.1362186074256897 + }, + { + "entropy": 8.499277114868164, + "epoch": 0.8021554281194384, + "mean_token_accuracy": 0.7409224510192871, + "num_tokens": 21420575.0, + "step": 8113, + "train/ce_loss": 0.9124782085418701 + }, + { + "epoch": 0.8021554281194384, + "step": 8113, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.8021554281194384, + "step": 8113, + "train/total_loss": 0.1654665768146515 + }, + { + "entropy": 8.523259162902832, + "epoch": 0.8022543009689539, + "mean_token_accuracy": 0.7751396894454956, + "num_tokens": 21425784.0, + "step": 8114, + "train/ce_loss": 1.2068164348602295 + }, + { + "epoch": 0.8022543009689539, + "step": 8114, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8022543009689539, + "step": 8114, + "train/total_loss": 0.14802539348602295 + }, + { + "entropy": 8.784333229064941, + "epoch": 0.8023531738184695, + "mean_token_accuracy": 0.7310252785682678, + "num_tokens": 21430948.0, + "step": 8115, + "train/ce_loss": 0.9043253064155579 + }, + { + "epoch": 0.8023531738184695, + "step": 8115, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8023531738184695, + "step": 8115, + "train/total_loss": 0.1451200246810913 + }, + { + "entropy": 9.411741256713867, + "epoch": 0.802452046667985, + "mean_token_accuracy": 0.7320099472999573, + "num_tokens": 21435781.0, + "step": 8116, + "train/ce_loss": 1.9460113048553467 + }, + { + "epoch": 0.802452046667985, + "step": 8116, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.802452046667985, + "step": 8116, + "train/total_loss": 0.3078823685646057 + }, + { + "entropy": 8.769495010375977, + "epoch": 0.8025509195175005, + "mean_token_accuracy": 0.7507836818695068, + "num_tokens": 21440880.0, + "step": 8117, + "train/ce_loss": 0.932226300239563 + }, + { + "epoch": 0.8025509195175005, + "step": 8117, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8025509195175005, + "step": 8117, + "train/total_loss": 0.11666013300418854 + }, + { + "entropy": 9.231952667236328, + "epoch": 0.8026497923670161, + "mean_token_accuracy": 0.7342550158500671, + "num_tokens": 21446017.0, + "step": 8118, + "train/ce_loss": 1.7037280797958374 + }, + { + "epoch": 0.8026497923670161, + "step": 8118, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.8026497923670161, + "step": 8118, + "train/total_loss": 0.2641228139400482 + }, + { + "entropy": 8.689058303833008, + "epoch": 0.8027486652165315, + "mean_token_accuracy": 0.7462499737739563, + "num_tokens": 21451367.0, + "step": 8119, + "train/ce_loss": 0.5837413668632507 + }, + { + "epoch": 0.8027486652165315, + "step": 8119, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8027486652165315, + "step": 8119, + "train/total_loss": 0.10915538668632507 + }, + { + "epoch": 0.802847538066047, + "grad_norm": 0.6518300175666809, + "learning_rate": 7.995104583889631e-06, + "loss": 0.1314, + "step": 8120 + }, + { + "entropy": 9.131936073303223, + "epoch": 0.802847538066047, + "mean_token_accuracy": 0.8208954930305481, + "num_tokens": 21456214.0, + "step": 8120, + "train/ce_loss": 4.40249687017058e-06 + }, + { + "epoch": 0.802847538066047, + "step": 8120, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.802847538066047, + "step": 8120, + "train/total_loss": 0.05859418958425522 + }, + { + "entropy": 9.137779235839844, + "epoch": 0.8029464109155626, + "mean_token_accuracy": 0.765925943851471, + "num_tokens": 21461376.0, + "step": 8121, + "train/ce_loss": 0.7983360290527344 + }, + { + "epoch": 0.8029464109155626, + "step": 8121, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8029464109155626, + "step": 8121, + "train/total_loss": 0.12670859694480896 + }, + { + "entropy": 8.53607177734375, + "epoch": 0.8030452837650781, + "mean_token_accuracy": 0.7017017006874084, + "num_tokens": 21466842.0, + "step": 8122, + "train/ce_loss": 0.70408034324646 + }, + { + "epoch": 0.8030452837650781, + "step": 8122, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.8030452837650781, + "step": 8122, + "train/total_loss": 0.08212678879499435 + }, + { + "entropy": 8.714844703674316, + "epoch": 0.8031441566145936, + "mean_token_accuracy": 0.7058252692222595, + "num_tokens": 21472370.0, + "step": 8123, + "train/ce_loss": 1.137970209121704 + }, + { + "epoch": 0.8031441566145936, + "step": 8123, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8031441566145936, + "step": 8123, + "train/total_loss": 0.13723452389240265 + }, + { + "entropy": 8.661134719848633, + "epoch": 0.8032430294641092, + "mean_token_accuracy": 0.756440281867981, + "num_tokens": 21477721.0, + "step": 8124, + "train/ce_loss": 0.646306037902832 + }, + { + "epoch": 0.8032430294641092, + "step": 8124, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8032430294641092, + "step": 8124, + "train/total_loss": 0.08806810528039932 + }, + { + "entropy": 9.107043266296387, + "epoch": 0.8033419023136247, + "mean_token_accuracy": 0.7185840606689453, + "num_tokens": 21482741.0, + "step": 8125, + "train/ce_loss": 1.4271183013916016 + }, + { + "epoch": 0.8033419023136247, + "step": 8125, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8033419023136247, + "step": 8125, + "train/total_loss": 0.1817743331193924 + }, + { + "entropy": 9.141106605529785, + "epoch": 0.8034407751631402, + "mean_token_accuracy": 0.687609076499939, + "num_tokens": 21487739.0, + "step": 8126, + "train/ce_loss": 1.2258257865905762 + }, + { + "epoch": 0.8034407751631402, + "step": 8126, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.8034407751631402, + "step": 8126, + "train/total_loss": 0.1421138346195221 + }, + { + "entropy": 8.606668472290039, + "epoch": 0.8035396480126558, + "mean_token_accuracy": 0.7492130398750305, + "num_tokens": 21493147.0, + "step": 8127, + "train/ce_loss": 0.83812415599823 + }, + { + "epoch": 0.8035396480126558, + "step": 8127, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.8035396480126558, + "step": 8127, + "train/total_loss": 0.224437415599823 + }, + { + "entropy": 9.192540168762207, + "epoch": 0.8036385208621712, + "mean_token_accuracy": 0.7893129587173462, + "num_tokens": 21498259.0, + "step": 8128, + "train/ce_loss": 2.97106703328609e-06 + }, + { + "epoch": 0.8036385208621712, + "step": 8128, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8036385208621712, + "step": 8128, + "train/total_loss": 0.015625298023223877 + }, + { + "entropy": 8.542991638183594, + "epoch": 0.8037373937116867, + "mean_token_accuracy": 0.7178378105163574, + "num_tokens": 21503648.0, + "step": 8129, + "train/ce_loss": 0.6392026543617249 + }, + { + "epoch": 0.8037373937116867, + "step": 8129, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8037373937116867, + "step": 8129, + "train/total_loss": 0.1029827669262886 + }, + { + "entropy": 8.937554359436035, + "epoch": 0.8038362665612023, + "mean_token_accuracy": 0.7507507801055908, + "num_tokens": 21508919.0, + "step": 8130, + "train/ce_loss": 4.179452662356198e-05 + }, + { + "epoch": 0.8038362665612023, + "step": 8130, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8038362665612023, + "step": 8130, + "train/total_loss": 0.035160429775714874 + }, + { + "entropy": 8.675518989562988, + "epoch": 0.8039351394107178, + "mean_token_accuracy": 0.6987804770469666, + "num_tokens": 21514166.0, + "step": 8131, + "train/ce_loss": 0.961373507976532 + }, + { + "epoch": 0.8039351394107178, + "step": 8131, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8039351394107178, + "step": 8131, + "train/total_loss": 0.15863734483718872 + }, + { + "entropy": 9.307016372680664, + "epoch": 0.8040340122602333, + "mean_token_accuracy": 0.7984790802001953, + "num_tokens": 21519108.0, + "step": 8132, + "train/ce_loss": 0.9115478992462158 + }, + { + "epoch": 0.8040340122602333, + "step": 8132, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8040340122602333, + "step": 8132, + "train/total_loss": 0.1145922914147377 + }, + { + "entropy": 8.928248405456543, + "epoch": 0.8041328851097489, + "mean_token_accuracy": 0.7286184430122375, + "num_tokens": 21524159.0, + "step": 8133, + "train/ce_loss": 0.7917159199714661 + }, + { + "epoch": 0.8041328851097489, + "step": 8133, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8041328851097489, + "step": 8133, + "train/total_loss": 0.11823409050703049 + }, + { + "entropy": 8.766800880432129, + "epoch": 0.8042317579592644, + "mean_token_accuracy": 0.7781732082366943, + "num_tokens": 21529497.0, + "step": 8134, + "train/ce_loss": 1.123258113861084 + }, + { + "epoch": 0.8042317579592644, + "step": 8134, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8042317579592644, + "step": 8134, + "train/total_loss": 0.15529456734657288 + }, + { + "entropy": 8.762529373168945, + "epoch": 0.8043306308087799, + "mean_token_accuracy": 0.7233333587646484, + "num_tokens": 21534555.0, + "step": 8135, + "train/ce_loss": 1.5933242138999049e-06 + }, + { + "epoch": 0.8043306308087799, + "step": 8135, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8043306308087799, + "step": 8135, + "train/total_loss": 0.07031265646219254 + }, + { + "entropy": 9.216514587402344, + "epoch": 0.8044295036582955, + "mean_token_accuracy": 0.7174721360206604, + "num_tokens": 21539554.0, + "step": 8136, + "train/ce_loss": 4.063111191499047e-06 + }, + { + "epoch": 0.8044295036582955, + "step": 8136, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8044295036582955, + "step": 8136, + "train/total_loss": 0.05859415605664253 + }, + { + "entropy": 9.50296401977539, + "epoch": 0.804528376507811, + "mean_token_accuracy": 0.7253668904304504, + "num_tokens": 21544465.0, + "step": 8137, + "train/ce_loss": 2.0438973903656006 + }, + { + "epoch": 0.804528376507811, + "step": 8137, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.804528376507811, + "step": 8137, + "train/total_loss": 0.251264750957489 + }, + { + "entropy": 9.341086387634277, + "epoch": 0.8046272493573264, + "mean_token_accuracy": 0.7492625117301941, + "num_tokens": 21549378.0, + "step": 8138, + "train/ce_loss": 1.6934353113174438 + }, + { + "epoch": 0.8046272493573264, + "step": 8138, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.8046272493573264, + "step": 8138, + "train/total_loss": 0.2513747811317444 + }, + { + "entropy": 8.868955612182617, + "epoch": 0.804726122206842, + "mean_token_accuracy": 0.6798365116119385, + "num_tokens": 21554554.0, + "step": 8139, + "train/ce_loss": 1.5643260478973389 + }, + { + "epoch": 0.804726122206842, + "step": 8139, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.804726122206842, + "step": 8139, + "train/total_loss": 0.2150263637304306 + }, + { + "epoch": 0.8048249950563575, + "grad_norm": 0.7821770906448364, + "learning_rate": 7.990159719131683e-06, + "loss": 0.1326, + "step": 8140 + }, + { + "entropy": 8.8527250289917, + "epoch": 0.8048249950563575, + "mean_token_accuracy": 0.7285902500152588, + "num_tokens": 21559769.0, + "step": 8140, + "train/ce_loss": 1.3271492719650269 + }, + { + "epoch": 0.8048249950563575, + "step": 8140, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8048249950563575, + "step": 8140, + "train/total_loss": 0.16005867719650269 + }, + { + "entropy": 9.02514362335205, + "epoch": 0.804923867905873, + "mean_token_accuracy": 0.7098214030265808, + "num_tokens": 21564937.0, + "step": 8141, + "train/ce_loss": 1.5891838073730469 + }, + { + "epoch": 0.804923867905873, + "step": 8141, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.804923867905873, + "step": 8141, + "train/total_loss": 0.2331371307373047 + }, + { + "entropy": 8.961231231689453, + "epoch": 0.8050227407553886, + "mean_token_accuracy": 0.7310230731964111, + "num_tokens": 21569969.0, + "step": 8142, + "train/ce_loss": 0.8391844034194946 + }, + { + "epoch": 0.8050227407553886, + "step": 8142, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8050227407553886, + "step": 8142, + "train/total_loss": 0.15032470226287842 + }, + { + "entropy": 8.839179992675781, + "epoch": 0.8051216136049041, + "mean_token_accuracy": 0.7161997556686401, + "num_tokens": 21575231.0, + "step": 8143, + "train/ce_loss": 1.0139042139053345 + }, + { + "epoch": 0.8051216136049041, + "step": 8143, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.8051216136049041, + "step": 8143, + "train/total_loss": 0.17560917139053345 + }, + { + "entropy": 9.141321182250977, + "epoch": 0.8052204864544196, + "mean_token_accuracy": 0.7846715450286865, + "num_tokens": 21580195.0, + "step": 8144, + "train/ce_loss": 0.9555160403251648 + }, + { + "epoch": 0.8052204864544196, + "step": 8144, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8052204864544196, + "step": 8144, + "train/total_loss": 0.12680160999298096 + }, + { + "entropy": 8.899325370788574, + "epoch": 0.8053193593039352, + "mean_token_accuracy": 0.7378129363059998, + "num_tokens": 21585434.0, + "step": 8145, + "train/ce_loss": 1.0690226554870605 + }, + { + "epoch": 0.8053193593039352, + "step": 8145, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.8053193593039352, + "step": 8145, + "train/total_loss": 0.21237102150917053 + }, + { + "entropy": 8.712418556213379, + "epoch": 0.8054182321534507, + "mean_token_accuracy": 0.8041236996650696, + "num_tokens": 21590656.0, + "step": 8146, + "train/ce_loss": 0.6086009740829468 + }, + { + "epoch": 0.8054182321534507, + "step": 8146, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8054182321534507, + "step": 8146, + "train/total_loss": 0.07648509740829468 + }, + { + "entropy": 9.335983276367188, + "epoch": 0.8055171050029661, + "mean_token_accuracy": 0.7619834542274475, + "num_tokens": 21595696.0, + "step": 8147, + "train/ce_loss": 0.8511669039726257 + }, + { + "epoch": 0.8055171050029661, + "step": 8147, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.8055171050029661, + "step": 8147, + "train/total_loss": 0.1632416844367981 + }, + { + "entropy": 8.933341026306152, + "epoch": 0.8056159778524817, + "mean_token_accuracy": 0.7549574971199036, + "num_tokens": 21600821.0, + "step": 8148, + "train/ce_loss": 1.0629116296768188 + }, + { + "epoch": 0.8056159778524817, + "step": 8148, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8056159778524817, + "step": 8148, + "train/total_loss": 0.14144742488861084 + }, + { + "entropy": 8.205276489257812, + "epoch": 0.8057148507019972, + "mean_token_accuracy": 0.7639310956001282, + "num_tokens": 21606265.0, + "step": 8149, + "train/ce_loss": 1.0072635412216187 + }, + { + "epoch": 0.8057148507019972, + "step": 8149, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.8057148507019972, + "step": 8149, + "train/total_loss": 0.20228886604309082 + }, + { + "entropy": 9.068489074707031, + "epoch": 0.8058137235515127, + "mean_token_accuracy": 0.707025408744812, + "num_tokens": 21611427.0, + "step": 8150, + "train/ce_loss": 4.91906757815741e-06 + }, + { + "epoch": 0.8058137235515127, + "step": 8150, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8058137235515127, + "step": 8150, + "train/total_loss": 0.0468754917383194 + }, + { + "entropy": 8.444740295410156, + "epoch": 0.8059125964010283, + "mean_token_accuracy": 0.7334058880805969, + "num_tokens": 21616869.0, + "step": 8151, + "train/ce_loss": 1.6713114976882935 + }, + { + "epoch": 0.8059125964010283, + "step": 8151, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8059125964010283, + "step": 8151, + "train/total_loss": 0.23744365572929382 + }, + { + "entropy": 9.057316780090332, + "epoch": 0.8060114692505438, + "mean_token_accuracy": 0.7879341840744019, + "num_tokens": 21621889.0, + "step": 8152, + "train/ce_loss": 4.6114215024317673e-07 + }, + { + "epoch": 0.8060114692505438, + "step": 8152, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8060114692505438, + "step": 8152, + "train/total_loss": 0.03906254470348358 + }, + { + "entropy": 8.672420501708984, + "epoch": 0.8061103421000593, + "mean_token_accuracy": 0.800936758518219, + "num_tokens": 21627189.0, + "step": 8153, + "train/ce_loss": 0.46436673402786255 + }, + { + "epoch": 0.8061103421000593, + "step": 8153, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8061103421000593, + "step": 8153, + "train/total_loss": 0.09331167489290237 + }, + { + "entropy": 8.96420669555664, + "epoch": 0.8062092149495749, + "mean_token_accuracy": 0.7317743897438049, + "num_tokens": 21632434.0, + "step": 8154, + "train/ce_loss": 1.1457781791687012 + }, + { + "epoch": 0.8062092149495749, + "step": 8154, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.8062092149495749, + "step": 8154, + "train/total_loss": 0.18879657983779907 + }, + { + "entropy": 8.43289566040039, + "epoch": 0.8063080877990904, + "mean_token_accuracy": 0.6918798685073853, + "num_tokens": 21637818.0, + "step": 8155, + "train/ce_loss": 0.8233134746551514 + }, + { + "epoch": 0.8063080877990904, + "step": 8155, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8063080877990904, + "step": 8155, + "train/total_loss": 0.1292063593864441 + }, + { + "entropy": 9.024656295776367, + "epoch": 0.8064069606486058, + "mean_token_accuracy": 0.7409162521362305, + "num_tokens": 21642914.0, + "step": 8156, + "train/ce_loss": 0.43868541717529297 + }, + { + "epoch": 0.8064069606486058, + "step": 8156, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8064069606486058, + "step": 8156, + "train/total_loss": 0.0751185417175293 + }, + { + "entropy": 8.357614517211914, + "epoch": 0.8065058334981214, + "mean_token_accuracy": 0.6878452897071838, + "num_tokens": 21648492.0, + "step": 8157, + "train/ce_loss": 1.4950826168060303 + }, + { + "epoch": 0.8065058334981214, + "step": 8157, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8065058334981214, + "step": 8157, + "train/total_loss": 0.1807582676410675 + }, + { + "entropy": 8.726078033447266, + "epoch": 0.8066047063476369, + "mean_token_accuracy": 0.7450110912322998, + "num_tokens": 21653859.0, + "step": 8158, + "train/ce_loss": 0.613193154335022 + }, + { + "epoch": 0.8066047063476369, + "step": 8158, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8066047063476369, + "step": 8158, + "train/total_loss": 0.09256932139396667 + }, + { + "entropy": 9.057610511779785, + "epoch": 0.8067035791971524, + "mean_token_accuracy": 0.773181140422821, + "num_tokens": 21659023.0, + "step": 8159, + "train/ce_loss": 1.34531831741333 + }, + { + "epoch": 0.8067035791971524, + "step": 8159, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8067035791971524, + "step": 8159, + "train/total_loss": 0.16187559068202972 + }, + { + "epoch": 0.806802452046668, + "grad_norm": 0.655254602432251, + "learning_rate": 7.985214854373734e-06, + "loss": 0.1312, + "step": 8160 + }, + { + "entropy": 8.64984130859375, + "epoch": 0.806802452046668, + "mean_token_accuracy": 0.7977805137634277, + "num_tokens": 21664315.0, + "step": 8160, + "train/ce_loss": 0.7628602981567383 + }, + { + "epoch": 0.806802452046668, + "step": 8160, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.806802452046668, + "step": 8160, + "train/total_loss": 0.09972353279590607 + }, + { + "entropy": 9.269373893737793, + "epoch": 0.8069013248961835, + "mean_token_accuracy": 0.67405766248703, + "num_tokens": 21669226.0, + "step": 8161, + "train/ce_loss": 1.1899515390396118 + }, + { + "epoch": 0.8069013248961835, + "step": 8161, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.8069013248961835, + "step": 8161, + "train/total_loss": 0.13071390986442566 + }, + { + "entropy": 9.361366271972656, + "epoch": 0.807000197745699, + "mean_token_accuracy": 0.8279069662094116, + "num_tokens": 21674053.0, + "step": 8162, + "train/ce_loss": 1.273462176322937 + }, + { + "epoch": 0.807000197745699, + "step": 8162, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.807000197745699, + "step": 8162, + "train/total_loss": 0.1585962176322937 + }, + { + "entropy": 8.576358795166016, + "epoch": 0.8070990705952146, + "mean_token_accuracy": 0.7305315136909485, + "num_tokens": 21679304.0, + "step": 8163, + "train/ce_loss": 1.026901364326477 + }, + { + "epoch": 0.8070990705952146, + "step": 8163, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8070990705952146, + "step": 8163, + "train/total_loss": 0.14565888047218323 + }, + { + "entropy": 8.602506637573242, + "epoch": 0.8071979434447301, + "mean_token_accuracy": 0.7146092653274536, + "num_tokens": 21684674.0, + "step": 8164, + "train/ce_loss": 0.6335917115211487 + }, + { + "epoch": 0.8071979434447301, + "step": 8164, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8071979434447301, + "step": 8164, + "train/total_loss": 0.12585917115211487 + }, + { + "entropy": 9.482227325439453, + "epoch": 0.8072968162942455, + "mean_token_accuracy": 0.7589454054832458, + "num_tokens": 21689605.0, + "step": 8165, + "train/ce_loss": 6.770823119950364e-07 + }, + { + "epoch": 0.8072968162942455, + "step": 8165, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.8072968162942455, + "step": 8165, + "train/total_loss": 0.019531317055225372 + }, + { + "entropy": 8.98696231842041, + "epoch": 0.8073956891437611, + "mean_token_accuracy": 0.7142857313156128, + "num_tokens": 21694766.0, + "step": 8166, + "train/ce_loss": 0.6035987138748169 + }, + { + "epoch": 0.8073956891437611, + "step": 8166, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8073956891437611, + "step": 8166, + "train/total_loss": 0.10332862287759781 + }, + { + "entropy": 8.953849792480469, + "epoch": 0.8074945619932766, + "mean_token_accuracy": 0.717391312122345, + "num_tokens": 21699946.0, + "step": 8167, + "train/ce_loss": 1.9634870290756226 + }, + { + "epoch": 0.8074945619932766, + "step": 8167, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8074945619932766, + "step": 8167, + "train/total_loss": 0.22759871184825897 + }, + { + "entropy": 8.882444381713867, + "epoch": 0.8075934348427922, + "mean_token_accuracy": 0.7415730357170105, + "num_tokens": 21704924.0, + "step": 8168, + "train/ce_loss": 0.8021706342697144 + }, + { + "epoch": 0.8075934348427922, + "step": 8168, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8075934348427922, + "step": 8168, + "train/total_loss": 0.13099831342697144 + }, + { + "entropy": 8.663026809692383, + "epoch": 0.8076923076923077, + "mean_token_accuracy": 0.802879273891449, + "num_tokens": 21710294.0, + "step": 8169, + "train/ce_loss": 0.42362141609191895 + }, + { + "epoch": 0.8076923076923077, + "step": 8169, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8076923076923077, + "step": 8169, + "train/total_loss": 0.057987142354249954 + }, + { + "entropy": 9.483896255493164, + "epoch": 0.8077911805418232, + "mean_token_accuracy": 0.7542856931686401, + "num_tokens": 21715068.0, + "step": 8170, + "train/ce_loss": 5.225174845691072e-06 + }, + { + "epoch": 0.8077911805418232, + "step": 8170, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8077911805418232, + "step": 8170, + "train/total_loss": 0.058594271540641785 + }, + { + "entropy": 8.993762969970703, + "epoch": 0.8078900533913388, + "mean_token_accuracy": 0.6810228824615479, + "num_tokens": 21720460.0, + "step": 8171, + "train/ce_loss": 1.189120894196094e-06 + }, + { + "epoch": 0.8078900533913388, + "step": 8171, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8078900533913388, + "step": 8171, + "train/total_loss": 0.01562511920928955 + }, + { + "entropy": 8.781641006469727, + "epoch": 0.8079889262408543, + "mean_token_accuracy": 0.7355769276618958, + "num_tokens": 21725690.0, + "step": 8172, + "train/ce_loss": 0.5689041614532471 + }, + { + "epoch": 0.8079889262408543, + "step": 8172, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8079889262408543, + "step": 8172, + "train/total_loss": 0.11157791316509247 + }, + { + "entropy": 9.35464859008789, + "epoch": 0.8080877990903698, + "mean_token_accuracy": 0.8356807231903076, + "num_tokens": 21730563.0, + "step": 8173, + "train/ce_loss": 1.0675036907196045 + }, + { + "epoch": 0.8080877990903698, + "step": 8173, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8080877990903698, + "step": 8173, + "train/total_loss": 0.14190661907196045 + }, + { + "entropy": 8.79731273651123, + "epoch": 0.8081866719398854, + "mean_token_accuracy": 0.6955530047416687, + "num_tokens": 21735924.0, + "step": 8174, + "train/ce_loss": 0.9024097919464111 + }, + { + "epoch": 0.8081866719398854, + "step": 8174, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.8081866719398854, + "step": 8174, + "train/total_loss": 0.1722722351551056 + }, + { + "entropy": 8.88913631439209, + "epoch": 0.8082855447894008, + "mean_token_accuracy": 0.7214191555976868, + "num_tokens": 21741107.0, + "step": 8175, + "train/ce_loss": 0.5039877891540527 + }, + { + "epoch": 0.8082855447894008, + "step": 8175, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8082855447894008, + "step": 8175, + "train/total_loss": 0.12071128189563751 + }, + { + "entropy": 8.999834060668945, + "epoch": 0.8083844176389163, + "mean_token_accuracy": 0.7393162250518799, + "num_tokens": 21746213.0, + "step": 8176, + "train/ce_loss": 1.1778316497802734 + }, + { + "epoch": 0.8083844176389163, + "step": 8176, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8083844176389163, + "step": 8176, + "train/total_loss": 0.18418940901756287 + }, + { + "entropy": 9.247943878173828, + "epoch": 0.8084832904884319, + "mean_token_accuracy": 0.6855670213699341, + "num_tokens": 21751221.0, + "step": 8177, + "train/ce_loss": 1.1590492725372314 + }, + { + "epoch": 0.8084832904884319, + "step": 8177, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8084832904884319, + "step": 8177, + "train/total_loss": 0.16277992725372314 + }, + { + "entropy": 8.619396209716797, + "epoch": 0.8085821633379474, + "mean_token_accuracy": 0.7241014838218689, + "num_tokens": 21756648.0, + "step": 8178, + "train/ce_loss": 2.023618221282959 + }, + { + "epoch": 0.8085821633379474, + "step": 8178, + "train/sim_loss": 0.15625 + }, + { + "epoch": 0.8085821633379474, + "step": 8178, + "train/total_loss": 0.3586118221282959 + }, + { + "entropy": 8.900561332702637, + "epoch": 0.8086810361874629, + "mean_token_accuracy": 0.7817638516426086, + "num_tokens": 21761811.0, + "step": 8179, + "train/ce_loss": 0.7690855860710144 + }, + { + "epoch": 0.8086810361874629, + "step": 8179, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8086810361874629, + "step": 8179, + "train/total_loss": 0.11597105860710144 + }, + { + "epoch": 0.8087799090369785, + "grad_norm": 0.5436288118362427, + "learning_rate": 7.980269989615784e-06, + "loss": 0.1367, + "step": 8180 + }, + { + "entropy": 8.950399398803711, + "epoch": 0.8087799090369785, + "mean_token_accuracy": 0.7471979856491089, + "num_tokens": 21767080.0, + "step": 8180, + "train/ce_loss": 1.2610423564910889 + }, + { + "epoch": 0.8087799090369785, + "step": 8180, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8087799090369785, + "step": 8180, + "train/total_loss": 0.1925104856491089 + }, + { + "entropy": 8.72748851776123, + "epoch": 0.808878781886494, + "mean_token_accuracy": 0.7057444453239441, + "num_tokens": 21772390.0, + "step": 8181, + "train/ce_loss": 1.424280047416687 + }, + { + "epoch": 0.808878781886494, + "step": 8181, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.808878781886494, + "step": 8181, + "train/total_loss": 0.2518030107021332 + }, + { + "entropy": 9.251352310180664, + "epoch": 0.8089776547360095, + "mean_token_accuracy": 0.7716049551963806, + "num_tokens": 21777306.0, + "step": 8182, + "train/ce_loss": 3.123627266177209e-06 + }, + { + "epoch": 0.8089776547360095, + "step": 8182, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8089776547360095, + "step": 8182, + "train/total_loss": 0.04296906292438507 + }, + { + "entropy": 9.193682670593262, + "epoch": 0.8090765275855251, + "mean_token_accuracy": 0.7491638660430908, + "num_tokens": 21782342.0, + "step": 8183, + "train/ce_loss": 2.006776809692383 + }, + { + "epoch": 0.8090765275855251, + "step": 8183, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.8090765275855251, + "step": 8183, + "train/total_loss": 0.28270894289016724 + }, + { + "entropy": 8.846532821655273, + "epoch": 0.8091754004350405, + "mean_token_accuracy": 0.7214533090591431, + "num_tokens": 21787379.0, + "step": 8184, + "train/ce_loss": 8.488844400744711e-07 + }, + { + "epoch": 0.8091754004350405, + "step": 8184, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8091754004350405, + "step": 8184, + "train/total_loss": 0.06250008195638657 + }, + { + "entropy": 9.142698287963867, + "epoch": 0.809274273284556, + "mean_token_accuracy": 0.6946688294410706, + "num_tokens": 21792454.0, + "step": 8185, + "train/ce_loss": 4.0535462176194414e-05 + }, + { + "epoch": 0.809274273284556, + "step": 8185, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.809274273284556, + "step": 8185, + "train/total_loss": 0.03516030311584473 + }, + { + "entropy": 9.179564476013184, + "epoch": 0.8093731461340716, + "mean_token_accuracy": 0.7639639377593994, + "num_tokens": 21797410.0, + "step": 8186, + "train/ce_loss": 6.890414852023241e-07 + }, + { + "epoch": 0.8093731461340716, + "step": 8186, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8093731461340716, + "step": 8186, + "train/total_loss": 0.04296881705522537 + }, + { + "entropy": 9.330995559692383, + "epoch": 0.8094720189835871, + "mean_token_accuracy": 0.6957928538322449, + "num_tokens": 21802469.0, + "step": 8187, + "train/ce_loss": 1.5272752046585083 + }, + { + "epoch": 0.8094720189835871, + "step": 8187, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8094720189835871, + "step": 8187, + "train/total_loss": 0.21522752940654755 + }, + { + "entropy": 9.198179244995117, + "epoch": 0.8095708918331026, + "mean_token_accuracy": 0.7378151416778564, + "num_tokens": 21807485.0, + "step": 8188, + "train/ce_loss": 1.382462739944458 + }, + { + "epoch": 0.8095708918331026, + "step": 8188, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8095708918331026, + "step": 8188, + "train/total_loss": 0.19684003293514252 + }, + { + "entropy": 8.477127075195312, + "epoch": 0.8096697646826182, + "mean_token_accuracy": 0.7734375, + "num_tokens": 21812875.0, + "step": 8189, + "train/ce_loss": 0.6133180260658264 + }, + { + "epoch": 0.8096697646826182, + "step": 8189, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.8096697646826182, + "step": 8189, + "train/total_loss": 0.07305055856704712 + }, + { + "entropy": 8.476020812988281, + "epoch": 0.8097686375321337, + "mean_token_accuracy": 0.7351290583610535, + "num_tokens": 21818262.0, + "step": 8190, + "train/ce_loss": 0.6485268473625183 + }, + { + "epoch": 0.8097686375321337, + "step": 8190, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8097686375321337, + "step": 8190, + "train/total_loss": 0.10782143473625183 + }, + { + "entropy": 8.579954147338867, + "epoch": 0.8098675103816492, + "mean_token_accuracy": 0.7418032884597778, + "num_tokens": 21823665.0, + "step": 8191, + "train/ce_loss": 0.7341662645339966 + }, + { + "epoch": 0.8098675103816492, + "step": 8191, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8098675103816492, + "step": 8191, + "train/total_loss": 0.12029162794351578 + }, + { + "entropy": 9.22266674041748, + "epoch": 0.8099663832311648, + "mean_token_accuracy": 0.7193763852119446, + "num_tokens": 21828619.0, + "step": 8192, + "train/ce_loss": 2.1391375064849854 + }, + { + "epoch": 0.8099663832311648, + "step": 8192, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8099663832311648, + "step": 8192, + "train/total_loss": 0.2803199887275696 + }, + { + "entropy": 8.551570892333984, + "epoch": 0.8100652560806803, + "mean_token_accuracy": 0.7967479825019836, + "num_tokens": 21834084.0, + "step": 8193, + "train/ce_loss": 0.7586435675621033 + }, + { + "epoch": 0.8100652560806803, + "step": 8193, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8100652560806803, + "step": 8193, + "train/total_loss": 0.10320810973644257 + }, + { + "entropy": 9.315771102905273, + "epoch": 0.8101641289301957, + "mean_token_accuracy": 0.7569573521614075, + "num_tokens": 21839027.0, + "step": 8194, + "train/ce_loss": 1.9297350645065308 + }, + { + "epoch": 0.8101641289301957, + "step": 8194, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8101641289301957, + "step": 8194, + "train/total_loss": 0.24766100943088531 + }, + { + "entropy": 8.790742874145508, + "epoch": 0.8102630017797113, + "mean_token_accuracy": 0.7670454382896423, + "num_tokens": 21844536.0, + "step": 8195, + "train/ce_loss": 0.6125141382217407 + }, + { + "epoch": 0.8102630017797113, + "step": 8195, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.8102630017797113, + "step": 8195, + "train/total_loss": 0.1354701668024063 + }, + { + "entropy": 9.010497093200684, + "epoch": 0.8103618746292268, + "mean_token_accuracy": 0.7312925457954407, + "num_tokens": 21849575.0, + "step": 8196, + "train/ce_loss": 0.7210679650306702 + }, + { + "epoch": 0.8103618746292268, + "step": 8196, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8103618746292268, + "step": 8196, + "train/total_loss": 0.11898180097341537 + }, + { + "entropy": 8.528179168701172, + "epoch": 0.8104607474787423, + "mean_token_accuracy": 0.7588516473770142, + "num_tokens": 21855248.0, + "step": 8197, + "train/ce_loss": 0.9351038932800293 + }, + { + "epoch": 0.8104607474787423, + "step": 8197, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.8104607474787423, + "step": 8197, + "train/total_loss": 0.17944788932800293 + }, + { + "entropy": 9.088539123535156, + "epoch": 0.8105596203282579, + "mean_token_accuracy": 0.7485148310661316, + "num_tokens": 21860220.0, + "step": 8198, + "train/ce_loss": 2.4155744540621527e-05 + }, + { + "epoch": 0.8105596203282579, + "step": 8198, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8105596203282579, + "step": 8198, + "train/total_loss": 0.0351586639881134 + }, + { + "entropy": 8.88365364074707, + "epoch": 0.8106584931777734, + "mean_token_accuracy": 0.7418397665023804, + "num_tokens": 21865383.0, + "step": 8199, + "train/ce_loss": 1.06198251247406 + }, + { + "epoch": 0.8106584931777734, + "step": 8199, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.8106584931777734, + "step": 8199, + "train/total_loss": 0.184323251247406 + }, + { + "epoch": 0.8107573660272889, + "grad_norm": 0.7262281775474548, + "learning_rate": 7.975325124857835e-06, + "loss": 0.1342, + "step": 8200 + }, + { + "entropy": 8.694978713989258, + "epoch": 0.8107573660272889, + "mean_token_accuracy": 0.7109470963478088, + "num_tokens": 21870697.0, + "step": 8200, + "train/ce_loss": 1.0218790769577026 + }, + { + "epoch": 0.8107573660272889, + "step": 8200, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8107573660272889, + "step": 8200, + "train/total_loss": 0.1685941517353058 + }, + { + "entropy": 9.071819305419922, + "epoch": 0.8108562388768045, + "mean_token_accuracy": 0.7636612057685852, + "num_tokens": 21875848.0, + "step": 8201, + "train/ce_loss": 1.4440220594406128 + }, + { + "epoch": 0.8108562388768045, + "step": 8201, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8108562388768045, + "step": 8201, + "train/total_loss": 0.19518345594406128 + }, + { + "entropy": 9.03312873840332, + "epoch": 0.81095511172632, + "mean_token_accuracy": 0.7526717782020569, + "num_tokens": 21880982.0, + "step": 8202, + "train/ce_loss": 0.9207428097724915 + }, + { + "epoch": 0.81095511172632, + "step": 8202, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.81095511172632, + "step": 8202, + "train/total_loss": 0.13894927501678467 + }, + { + "entropy": 8.990741729736328, + "epoch": 0.8110539845758354, + "mean_token_accuracy": 0.7273918986320496, + "num_tokens": 21886231.0, + "step": 8203, + "train/ce_loss": 0.988568127155304 + }, + { + "epoch": 0.8110539845758354, + "step": 8203, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8110539845758354, + "step": 8203, + "train/total_loss": 0.15354430675506592 + }, + { + "entropy": 9.083792686462402, + "epoch": 0.811152857425351, + "mean_token_accuracy": 0.7047308087348938, + "num_tokens": 21891320.0, + "step": 8204, + "train/ce_loss": 1.0045556336990558e-05 + }, + { + "epoch": 0.811152857425351, + "step": 8204, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.811152857425351, + "step": 8204, + "train/total_loss": 0.046876005828380585 + }, + { + "entropy": 8.488182067871094, + "epoch": 0.8112517302748665, + "mean_token_accuracy": 0.7369033694267273, + "num_tokens": 21896692.0, + "step": 8205, + "train/ce_loss": 1.4631035327911377 + }, + { + "epoch": 0.8112517302748665, + "step": 8205, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8112517302748665, + "step": 8205, + "train/total_loss": 0.19709160923957825 + }, + { + "entropy": 9.184462547302246, + "epoch": 0.811350603124382, + "mean_token_accuracy": 0.7267658114433289, + "num_tokens": 21901678.0, + "step": 8206, + "train/ce_loss": 1.351523995399475 + }, + { + "epoch": 0.811350603124382, + "step": 8206, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.811350603124382, + "step": 8206, + "train/total_loss": 0.1820273995399475 + }, + { + "entropy": 8.80001449584961, + "epoch": 0.8114494759738976, + "mean_token_accuracy": 0.6877419352531433, + "num_tokens": 21906948.0, + "step": 8207, + "train/ce_loss": 7.886482489993796e-05 + }, + { + "epoch": 0.8114494759738976, + "step": 8207, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8114494759738976, + "step": 8207, + "train/total_loss": 0.023445386439561844 + }, + { + "entropy": 9.19816780090332, + "epoch": 0.8115483488234131, + "mean_token_accuracy": 0.7311828136444092, + "num_tokens": 21912085.0, + "step": 8208, + "train/ce_loss": 0.5881509780883789 + }, + { + "epoch": 0.8115483488234131, + "step": 8208, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8115483488234131, + "step": 8208, + "train/total_loss": 0.1291275918483734 + }, + { + "entropy": 8.79111385345459, + "epoch": 0.8116472216729286, + "mean_token_accuracy": 0.6975609660148621, + "num_tokens": 21917155.0, + "step": 8209, + "train/ce_loss": 1.086192011833191 + }, + { + "epoch": 0.8116472216729286, + "step": 8209, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.8116472216729286, + "step": 8209, + "train/total_loss": 0.19846296310424805 + }, + { + "entropy": 8.186117172241211, + "epoch": 0.8117460945224442, + "mean_token_accuracy": 0.7790403962135315, + "num_tokens": 21922386.0, + "step": 8210, + "train/ce_loss": 0.6409353017807007 + }, + { + "epoch": 0.8117460945224442, + "step": 8210, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8117460945224442, + "step": 8210, + "train/total_loss": 0.08753103017807007 + }, + { + "entropy": 8.486915588378906, + "epoch": 0.8118449673719597, + "mean_token_accuracy": 0.7954545617103577, + "num_tokens": 21927842.0, + "step": 8211, + "train/ce_loss": 0.6763221621513367 + }, + { + "epoch": 0.8118449673719597, + "step": 8211, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.8118449673719597, + "step": 8211, + "train/total_loss": 0.08716347068548203 + }, + { + "entropy": 8.522668838500977, + "epoch": 0.8119438402214751, + "mean_token_accuracy": 0.7724137902259827, + "num_tokens": 21933210.0, + "step": 8212, + "train/ce_loss": 0.39027079939842224 + }, + { + "epoch": 0.8119438402214751, + "step": 8212, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8119438402214751, + "step": 8212, + "train/total_loss": 0.07418332993984222 + }, + { + "entropy": 9.335029602050781, + "epoch": 0.8120427130709907, + "mean_token_accuracy": 0.751396656036377, + "num_tokens": 21938016.0, + "step": 8213, + "train/ce_loss": 0.0015728664584457874 + }, + { + "epoch": 0.8120427130709907, + "step": 8213, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8120427130709907, + "step": 8213, + "train/total_loss": 0.03531353548169136 + }, + { + "entropy": 8.462377548217773, + "epoch": 0.8121415859205062, + "mean_token_accuracy": 0.7784945964813232, + "num_tokens": 21943407.0, + "step": 8214, + "train/ce_loss": 0.7918002605438232 + }, + { + "epoch": 0.8121415859205062, + "step": 8214, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8121415859205062, + "step": 8214, + "train/total_loss": 0.1143362745642662 + }, + { + "entropy": 8.499216079711914, + "epoch": 0.8122404587700217, + "mean_token_accuracy": 0.7302423715591431, + "num_tokens": 21948804.0, + "step": 8215, + "train/ce_loss": 0.5783441066741943 + }, + { + "epoch": 0.8122404587700217, + "step": 8215, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8122404587700217, + "step": 8215, + "train/total_loss": 0.09689691662788391 + }, + { + "entropy": 9.778457641601562, + "epoch": 0.8123393316195373, + "mean_token_accuracy": 0.7428571581840515, + "num_tokens": 21953551.0, + "step": 8216, + "train/ce_loss": 1.4274528439273126e-06 + }, + { + "epoch": 0.8123393316195373, + "step": 8216, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8123393316195373, + "step": 8216, + "train/total_loss": 0.02734389342367649 + }, + { + "entropy": 8.859435081481934, + "epoch": 0.8124382044690528, + "mean_token_accuracy": 0.7398601174354553, + "num_tokens": 21959061.0, + "step": 8217, + "train/ce_loss": 0.7559324502944946 + }, + { + "epoch": 0.8124382044690528, + "step": 8217, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.8124382044690528, + "step": 8217, + "train/total_loss": 0.1615307480096817 + }, + { + "entropy": 8.961817741394043, + "epoch": 0.8125370773185683, + "mean_token_accuracy": 0.780802309513092, + "num_tokens": 21964232.0, + "step": 8218, + "train/ce_loss": 0.7763864398002625 + }, + { + "epoch": 0.8125370773185683, + "step": 8218, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.8125370773185683, + "step": 8218, + "train/total_loss": 0.1713886559009552 + }, + { + "entropy": 8.722240447998047, + "epoch": 0.8126359501680839, + "mean_token_accuracy": 0.7825000286102295, + "num_tokens": 21969525.0, + "step": 8219, + "train/ce_loss": 0.6948031783103943 + }, + { + "epoch": 0.8126359501680839, + "step": 8219, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8126359501680839, + "step": 8219, + "train/total_loss": 0.11635532230138779 + }, + { + "epoch": 0.8127348230175994, + "grad_norm": 0.5876989960670471, + "learning_rate": 7.970380260099887e-06, + "loss": 0.1333, + "step": 8220 + }, + { + "entropy": 9.117151260375977, + "epoch": 0.8127348230175994, + "mean_token_accuracy": 0.7641242742538452, + "num_tokens": 21974903.0, + "step": 8220, + "train/ce_loss": 0.6785506010055542 + }, + { + "epoch": 0.8127348230175994, + "step": 8220, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8127348230175994, + "step": 8220, + "train/total_loss": 0.12254256010055542 + }, + { + "entropy": 8.457649230957031, + "epoch": 0.8128336958671148, + "mean_token_accuracy": 0.7218309640884399, + "num_tokens": 21980267.0, + "step": 8221, + "train/ce_loss": 1.3428711891174316 + }, + { + "epoch": 0.8128336958671148, + "step": 8221, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.8128336958671148, + "step": 8221, + "train/total_loss": 0.20850586891174316 + }, + { + "entropy": 9.19150161743164, + "epoch": 0.8129325687166304, + "mean_token_accuracy": 0.740234375, + "num_tokens": 21985217.0, + "step": 8222, + "train/ce_loss": 0.7980934381484985 + }, + { + "epoch": 0.8129325687166304, + "step": 8222, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8129325687166304, + "step": 8222, + "train/total_loss": 0.12277809530496597 + }, + { + "entropy": 9.105156898498535, + "epoch": 0.8130314415661459, + "mean_token_accuracy": 0.6800000071525574, + "num_tokens": 21990177.0, + "step": 8223, + "train/ce_loss": 0.8357672095298767 + }, + { + "epoch": 0.8130314415661459, + "step": 8223, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8130314415661459, + "step": 8223, + "train/total_loss": 0.11873297393321991 + }, + { + "entropy": 8.884305953979492, + "epoch": 0.8131303144156614, + "mean_token_accuracy": 0.7631579041481018, + "num_tokens": 21995414.0, + "step": 8224, + "train/ce_loss": 3.9317649225267814e-07 + }, + { + "epoch": 0.8131303144156614, + "step": 8224, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8131303144156614, + "step": 8224, + "train/total_loss": 0.03906254097819328 + }, + { + "entropy": 8.017794609069824, + "epoch": 0.813229187265177, + "mean_token_accuracy": 0.7348754405975342, + "num_tokens": 22001014.0, + "step": 8225, + "train/ce_loss": 1.0558550357818604 + }, + { + "epoch": 0.813229187265177, + "step": 8225, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.813229187265177, + "step": 8225, + "train/total_loss": 0.160273015499115 + }, + { + "entropy": 8.938722610473633, + "epoch": 0.8133280601146925, + "mean_token_accuracy": 0.7399702668190002, + "num_tokens": 22006115.0, + "step": 8226, + "train/ce_loss": 1.6660056114196777 + }, + { + "epoch": 0.8133280601146925, + "step": 8226, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8133280601146925, + "step": 8226, + "train/total_loss": 0.2291005700826645 + }, + { + "entropy": 9.419342994689941, + "epoch": 0.813426932964208, + "mean_token_accuracy": 0.7194244861602783, + "num_tokens": 22010946.0, + "step": 8227, + "train/ce_loss": 4.596821554514463e-07 + }, + { + "epoch": 0.813426932964208, + "step": 8227, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.813426932964208, + "step": 8227, + "train/total_loss": 0.03125004470348358 + }, + { + "entropy": 9.03645133972168, + "epoch": 0.8135258058137236, + "mean_token_accuracy": 0.7803468108177185, + "num_tokens": 22016062.0, + "step": 8228, + "train/ce_loss": 6.380624313351291e-07 + }, + { + "epoch": 0.8135258058137236, + "step": 8228, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8135258058137236, + "step": 8228, + "train/total_loss": 0.042968813329935074 + }, + { + "entropy": 8.901385307312012, + "epoch": 0.8136246786632391, + "mean_token_accuracy": 0.692187488079071, + "num_tokens": 22021166.0, + "step": 8229, + "train/ce_loss": 6.219978786248248e-06 + }, + { + "epoch": 0.8136246786632391, + "step": 8229, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8136246786632391, + "step": 8229, + "train/total_loss": 0.03515687212347984 + }, + { + "entropy": 8.753562927246094, + "epoch": 0.8137235515127546, + "mean_token_accuracy": 0.7471910119056702, + "num_tokens": 22026560.0, + "step": 8230, + "train/ce_loss": 0.8991544246673584 + }, + { + "epoch": 0.8137235515127546, + "step": 8230, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8137235515127546, + "step": 8230, + "train/total_loss": 0.1446029543876648 + }, + { + "entropy": 9.053617477416992, + "epoch": 0.8138224243622701, + "mean_token_accuracy": 0.7304624915122986, + "num_tokens": 22031656.0, + "step": 8231, + "train/ce_loss": 6.243632242330932e-07 + }, + { + "epoch": 0.8138224243622701, + "step": 8231, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8138224243622701, + "step": 8231, + "train/total_loss": 0.06250005960464478 + }, + { + "entropy": 8.620099067687988, + "epoch": 0.8139212972117856, + "mean_token_accuracy": 0.6655328869819641, + "num_tokens": 22036981.0, + "step": 8232, + "train/ce_loss": 1.7731776237487793 + }, + { + "epoch": 0.8139212972117856, + "step": 8232, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.8139212972117856, + "step": 8232, + "train/total_loss": 0.2593490183353424 + }, + { + "entropy": 8.56728744506836, + "epoch": 0.8140201700613011, + "mean_token_accuracy": 0.746582567691803, + "num_tokens": 22042453.0, + "step": 8233, + "train/ce_loss": 0.9670370221138 + }, + { + "epoch": 0.8140201700613011, + "step": 8233, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8140201700613011, + "step": 8233, + "train/total_loss": 0.16310995817184448 + }, + { + "entropy": 8.869913101196289, + "epoch": 0.8141190429108167, + "mean_token_accuracy": 0.7425068020820618, + "num_tokens": 22047647.0, + "step": 8234, + "train/ce_loss": 0.49576520919799805 + }, + { + "epoch": 0.8141190429108167, + "step": 8234, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.8141190429108167, + "step": 8234, + "train/total_loss": 0.0691077709197998 + }, + { + "entropy": 8.869132995605469, + "epoch": 0.8142179157603322, + "mean_token_accuracy": 0.7114361524581909, + "num_tokens": 22052916.0, + "step": 8235, + "train/ce_loss": 0.7772649526596069 + }, + { + "epoch": 0.8142179157603322, + "step": 8235, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8142179157603322, + "step": 8235, + "train/total_loss": 0.11678899824619293 + }, + { + "entropy": 8.645116806030273, + "epoch": 0.8143167886098477, + "mean_token_accuracy": 0.7642679810523987, + "num_tokens": 22058211.0, + "step": 8236, + "train/ce_loss": 1.0038399696350098 + }, + { + "epoch": 0.8143167886098477, + "step": 8236, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8143167886098477, + "step": 8236, + "train/total_loss": 0.12772774696350098 + }, + { + "entropy": 8.418092727661133, + "epoch": 0.8144156614593633, + "mean_token_accuracy": 0.7742663621902466, + "num_tokens": 22063569.0, + "step": 8237, + "train/ce_loss": 0.8072574734687805 + }, + { + "epoch": 0.8144156614593633, + "step": 8237, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8144156614593633, + "step": 8237, + "train/total_loss": 0.135413259267807 + }, + { + "entropy": 8.732414245605469, + "epoch": 0.8145145343088788, + "mean_token_accuracy": 0.7214111685752869, + "num_tokens": 22068807.0, + "step": 8238, + "train/ce_loss": 0.7674160599708557 + }, + { + "epoch": 0.8145145343088788, + "step": 8238, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8145145343088788, + "step": 8238, + "train/total_loss": 0.12361660599708557 + }, + { + "entropy": 8.821417808532715, + "epoch": 0.8146134071583943, + "mean_token_accuracy": 0.6559571623802185, + "num_tokens": 22074002.0, + "step": 8239, + "train/ce_loss": 1.8144460916519165 + }, + { + "epoch": 0.8146134071583943, + "step": 8239, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8146134071583943, + "step": 8239, + "train/total_loss": 0.21660086512565613 + }, + { + "epoch": 0.8147122800079099, + "grad_norm": 0.7151651382446289, + "learning_rate": 7.965435395341938e-06, + "loss": 0.1417, + "step": 8240 + }, + { + "entropy": 9.104612350463867, + "epoch": 0.8147122800079099, + "mean_token_accuracy": 0.779321014881134, + "num_tokens": 22079096.0, + "step": 8240, + "train/ce_loss": 0.8043537735939026 + }, + { + "epoch": 0.8147122800079099, + "step": 8240, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.8147122800079099, + "step": 8240, + "train/total_loss": 0.0921541303396225 + }, + { + "entropy": 9.01150131225586, + "epoch": 0.8148111528574253, + "mean_token_accuracy": 0.8212209343910217, + "num_tokens": 22084214.0, + "step": 8241, + "train/ce_loss": 0.7468549013137817 + }, + { + "epoch": 0.8148111528574253, + "step": 8241, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8148111528574253, + "step": 8241, + "train/total_loss": 0.0903104916214943 + }, + { + "entropy": 8.795052528381348, + "epoch": 0.8149100257069408, + "mean_token_accuracy": 0.7451403737068176, + "num_tokens": 22089596.0, + "step": 8242, + "train/ce_loss": 0.8570025563240051 + }, + { + "epoch": 0.8149100257069408, + "step": 8242, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8149100257069408, + "step": 8242, + "train/total_loss": 0.15601275861263275 + }, + { + "entropy": 8.701278686523438, + "epoch": 0.8150088985564564, + "mean_token_accuracy": 0.7772151827812195, + "num_tokens": 22094869.0, + "step": 8243, + "train/ce_loss": 0.9589567184448242 + }, + { + "epoch": 0.8150088985564564, + "step": 8243, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.8150088985564564, + "step": 8243, + "train/total_loss": 0.1974581778049469 + }, + { + "entropy": 8.662206649780273, + "epoch": 0.8151077714059719, + "mean_token_accuracy": 0.6931034326553345, + "num_tokens": 22099893.0, + "step": 8244, + "train/ce_loss": 1.9629923105239868 + }, + { + "epoch": 0.8151077714059719, + "step": 8244, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8151077714059719, + "step": 8244, + "train/total_loss": 0.2509867548942566 + }, + { + "entropy": 9.216302871704102, + "epoch": 0.8152066442554874, + "mean_token_accuracy": 0.7698675394058228, + "num_tokens": 22104907.0, + "step": 8245, + "train/ce_loss": 0.9153091907501221 + }, + { + "epoch": 0.8152066442554874, + "step": 8245, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8152066442554874, + "step": 8245, + "train/total_loss": 0.1266871690750122 + }, + { + "entropy": 8.466188430786133, + "epoch": 0.815305517105003, + "mean_token_accuracy": 0.7120253443717957, + "num_tokens": 22110327.0, + "step": 8246, + "train/ce_loss": 0.749636173248291 + }, + { + "epoch": 0.815305517105003, + "step": 8246, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.815305517105003, + "step": 8246, + "train/total_loss": 0.17261987924575806 + }, + { + "entropy": 8.246451377868652, + "epoch": 0.8154043899545185, + "mean_token_accuracy": 0.7100409865379333, + "num_tokens": 22115812.0, + "step": 8247, + "train/ce_loss": 1.0034377574920654 + }, + { + "epoch": 0.8154043899545185, + "step": 8247, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8154043899545185, + "step": 8247, + "train/total_loss": 0.15893752872943878 + }, + { + "entropy": 8.852086067199707, + "epoch": 0.815503262804034, + "mean_token_accuracy": 0.7243510484695435, + "num_tokens": 22121040.0, + "step": 8248, + "train/ce_loss": 0.5238107442855835 + }, + { + "epoch": 0.815503262804034, + "step": 8248, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.815503262804034, + "step": 8248, + "train/total_loss": 0.11097482591867447 + }, + { + "entropy": 8.911182403564453, + "epoch": 0.8156021356535496, + "mean_token_accuracy": 0.7021563053131104, + "num_tokens": 22126240.0, + "step": 8249, + "train/ce_loss": 0.6195603609085083 + }, + { + "epoch": 0.8156021356535496, + "step": 8249, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8156021356535496, + "step": 8249, + "train/total_loss": 0.10492478311061859 + }, + { + "entropy": 9.07373046875, + "epoch": 0.815701008503065, + "mean_token_accuracy": 0.730182945728302, + "num_tokens": 22131352.0, + "step": 8250, + "train/ce_loss": 0.567441999912262 + }, + { + "epoch": 0.815701008503065, + "step": 8250, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.815701008503065, + "step": 8250, + "train/total_loss": 0.09580670297145844 + }, + { + "entropy": 8.514007568359375, + "epoch": 0.8157998813525806, + "mean_token_accuracy": 0.7369614243507385, + "num_tokens": 22136710.0, + "step": 8251, + "train/ce_loss": 0.41268324851989746 + }, + { + "epoch": 0.8157998813525806, + "step": 8251, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8157998813525806, + "step": 8251, + "train/total_loss": 0.06470582634210587 + }, + { + "entropy": 8.258570671081543, + "epoch": 0.8158987542020961, + "mean_token_accuracy": 0.6890848875045776, + "num_tokens": 22142144.0, + "step": 8252, + "train/ce_loss": 1.0246692895889282 + }, + { + "epoch": 0.8158987542020961, + "step": 8252, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8158987542020961, + "step": 8252, + "train/total_loss": 0.14934194087982178 + }, + { + "entropy": 8.888803482055664, + "epoch": 0.8159976270516116, + "mean_token_accuracy": 0.7424441576004028, + "num_tokens": 22147351.0, + "step": 8253, + "train/ce_loss": 0.8449608087539673 + }, + { + "epoch": 0.8159976270516116, + "step": 8253, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8159976270516116, + "step": 8253, + "train/total_loss": 0.11574608087539673 + }, + { + "entropy": 8.89084243774414, + "epoch": 0.8160964999011272, + "mean_token_accuracy": 0.7249190807342529, + "num_tokens": 22152469.0, + "step": 8254, + "train/ce_loss": 0.8302783370018005 + }, + { + "epoch": 0.8160964999011272, + "step": 8254, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8160964999011272, + "step": 8254, + "train/total_loss": 0.14552783966064453 + }, + { + "entropy": 9.082358360290527, + "epoch": 0.8161953727506427, + "mean_token_accuracy": 0.7819444537162781, + "num_tokens": 22157606.0, + "step": 8255, + "train/ce_loss": 1.2188773155212402 + }, + { + "epoch": 0.8161953727506427, + "step": 8255, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8161953727506427, + "step": 8255, + "train/total_loss": 0.17657524347305298 + }, + { + "entropy": 8.382397651672363, + "epoch": 0.8162942456001582, + "mean_token_accuracy": 0.7291471362113953, + "num_tokens": 22163092.0, + "step": 8256, + "train/ce_loss": 0.9671027064323425 + }, + { + "epoch": 0.8162942456001582, + "step": 8256, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8162942456001582, + "step": 8256, + "train/total_loss": 0.15530401468276978 + }, + { + "entropy": 8.917795181274414, + "epoch": 0.8163931184496738, + "mean_token_accuracy": 0.7467249035835266, + "num_tokens": 22168221.0, + "step": 8257, + "train/ce_loss": 8.391322126044543e-07 + }, + { + "epoch": 0.8163931184496738, + "step": 8257, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.8163931184496738, + "step": 8257, + "train/total_loss": 0.08203133195638657 + }, + { + "entropy": 9.265806198120117, + "epoch": 0.8164919912991893, + "mean_token_accuracy": 0.7693877816200256, + "num_tokens": 22173150.0, + "step": 8258, + "train/ce_loss": 1.5727033615112305 + }, + { + "epoch": 0.8164919912991893, + "step": 8258, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8164919912991893, + "step": 8258, + "train/total_loss": 0.19242659211158752 + }, + { + "entropy": 8.78251838684082, + "epoch": 0.8165908641487047, + "mean_token_accuracy": 0.8165829181671143, + "num_tokens": 22178403.0, + "step": 8259, + "train/ce_loss": 0.8774014711380005 + }, + { + "epoch": 0.8165908641487047, + "step": 8259, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8165908641487047, + "step": 8259, + "train/total_loss": 0.11117764562368393 + }, + { + "epoch": 0.8166897369982203, + "grad_norm": 0.6642946004867554, + "learning_rate": 7.96049053058399e-06, + "loss": 0.134, + "step": 8260 + }, + { + "entropy": 8.415128707885742, + "epoch": 0.8166897369982203, + "mean_token_accuracy": 0.6840882897377014, + "num_tokens": 22183787.0, + "step": 8260, + "train/ce_loss": 0.7524656653404236 + }, + { + "epoch": 0.8166897369982203, + "step": 8260, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.8166897369982203, + "step": 8260, + "train/total_loss": 0.14946532249450684 + }, + { + "entropy": 8.952371597290039, + "epoch": 0.8167886098477358, + "mean_token_accuracy": 0.7789165377616882, + "num_tokens": 22188922.0, + "step": 8261, + "train/ce_loss": 1.5190229415893555 + }, + { + "epoch": 0.8167886098477358, + "step": 8261, + "train/sim_loss": 0.1484375 + }, + { + "epoch": 0.8167886098477358, + "step": 8261, + "train/total_loss": 0.30033981800079346 + }, + { + "entropy": 8.918517112731934, + "epoch": 0.8168874826972513, + "mean_token_accuracy": 0.6932153105735779, + "num_tokens": 22194040.0, + "step": 8262, + "train/ce_loss": 1.3680672645568848 + }, + { + "epoch": 0.8168874826972513, + "step": 8262, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8168874826972513, + "step": 8262, + "train/total_loss": 0.18368172645568848 + }, + { + "entropy": 8.656283378601074, + "epoch": 0.8169863555467669, + "mean_token_accuracy": 0.7665418386459351, + "num_tokens": 22199355.0, + "step": 8263, + "train/ce_loss": 0.9361574053764343 + }, + { + "epoch": 0.8169863555467669, + "step": 8263, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8169863555467669, + "step": 8263, + "train/total_loss": 0.15611574053764343 + }, + { + "entropy": 8.89004898071289, + "epoch": 0.8170852283962824, + "mean_token_accuracy": 0.767160177230835, + "num_tokens": 22204549.0, + "step": 8264, + "train/ce_loss": 0.7157173156738281 + }, + { + "epoch": 0.8170852283962824, + "step": 8264, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8170852283962824, + "step": 8264, + "train/total_loss": 0.1106342300772667 + }, + { + "entropy": 8.740985870361328, + "epoch": 0.8171841012457979, + "mean_token_accuracy": 0.7036144733428955, + "num_tokens": 22209881.0, + "step": 8265, + "train/ce_loss": 1.5244280099868774 + }, + { + "epoch": 0.8171841012457979, + "step": 8265, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.8171841012457979, + "step": 8265, + "train/total_loss": 0.2422865480184555 + }, + { + "entropy": 8.232596397399902, + "epoch": 0.8172829740953135, + "mean_token_accuracy": 0.7505694627761841, + "num_tokens": 22215296.0, + "step": 8266, + "train/ce_loss": 0.7841481566429138 + }, + { + "epoch": 0.8172829740953135, + "step": 8266, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8172829740953135, + "step": 8266, + "train/total_loss": 0.11747732013463974 + }, + { + "entropy": 8.950629234313965, + "epoch": 0.817381846944829, + "mean_token_accuracy": 0.7545582056045532, + "num_tokens": 22220480.0, + "step": 8267, + "train/ce_loss": 0.7292152643203735 + }, + { + "epoch": 0.817381846944829, + "step": 8267, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.817381846944829, + "step": 8267, + "train/total_loss": 0.1393277794122696 + }, + { + "entropy": 8.48008918762207, + "epoch": 0.8174807197943444, + "mean_token_accuracy": 0.6881313323974609, + "num_tokens": 22225729.0, + "step": 8268, + "train/ce_loss": 1.4195605899658403e-06 + }, + { + "epoch": 0.8174807197943444, + "step": 8268, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8174807197943444, + "step": 8268, + "train/total_loss": 0.05078139156103134 + }, + { + "entropy": 9.015676498413086, + "epoch": 0.81757959264386, + "mean_token_accuracy": 0.7643312215805054, + "num_tokens": 22230803.0, + "step": 8269, + "train/ce_loss": 0.6524415016174316 + }, + { + "epoch": 0.81757959264386, + "step": 8269, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.81757959264386, + "step": 8269, + "train/total_loss": 0.1355566531419754 + }, + { + "entropy": 8.901986122131348, + "epoch": 0.8176784654933755, + "mean_token_accuracy": 0.7915493249893188, + "num_tokens": 22235973.0, + "step": 8270, + "train/ce_loss": 0.9602965712547302 + }, + { + "epoch": 0.8176784654933755, + "step": 8270, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8176784654933755, + "step": 8270, + "train/total_loss": 0.16243591904640198 + }, + { + "entropy": 8.636728286743164, + "epoch": 0.817777338342891, + "mean_token_accuracy": 0.7170022130012512, + "num_tokens": 22241334.0, + "step": 8271, + "train/ce_loss": 1.8001161813735962 + }, + { + "epoch": 0.817777338342891, + "step": 8271, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.817777338342891, + "step": 8271, + "train/total_loss": 0.22688661515712738 + }, + { + "entropy": 9.415346145629883, + "epoch": 0.8178762111924066, + "mean_token_accuracy": 0.7160883545875549, + "num_tokens": 22246057.0, + "step": 8272, + "train/ce_loss": 2.0104639530181885 + }, + { + "epoch": 0.8178762111924066, + "step": 8272, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8178762111924066, + "step": 8272, + "train/total_loss": 0.2635464072227478 + }, + { + "entropy": 9.019721031188965, + "epoch": 0.8179750840419221, + "mean_token_accuracy": 0.7765957713127136, + "num_tokens": 22251219.0, + "step": 8273, + "train/ce_loss": 0.819446325302124 + }, + { + "epoch": 0.8179750840419221, + "step": 8273, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8179750840419221, + "step": 8273, + "train/total_loss": 0.12100713700056076 + }, + { + "entropy": 8.677478790283203, + "epoch": 0.8180739568914376, + "mean_token_accuracy": 0.7400274872779846, + "num_tokens": 22256400.0, + "step": 8274, + "train/ce_loss": 5.961961164757668e-07 + }, + { + "epoch": 0.8180739568914376, + "step": 8274, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.8180739568914376, + "step": 8274, + "train/total_loss": 0.019531309604644775 + }, + { + "entropy": 8.468635559082031, + "epoch": 0.8181728297409532, + "mean_token_accuracy": 0.8016194105148315, + "num_tokens": 22261867.0, + "step": 8275, + "train/ce_loss": 0.6289389729499817 + }, + { + "epoch": 0.8181728297409532, + "step": 8275, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8181728297409532, + "step": 8275, + "train/total_loss": 0.12539389729499817 + }, + { + "entropy": 8.647253036499023, + "epoch": 0.8182717025904687, + "mean_token_accuracy": 0.734375, + "num_tokens": 22267152.0, + "step": 8276, + "train/ce_loss": 0.6211887001991272 + }, + { + "epoch": 0.8182717025904687, + "step": 8276, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8182717025904687, + "step": 8276, + "train/total_loss": 0.09336887300014496 + }, + { + "entropy": 9.016571998596191, + "epoch": 0.8183705754399841, + "mean_token_accuracy": 0.765739381313324, + "num_tokens": 22272265.0, + "step": 8277, + "train/ce_loss": 1.0259438753128052 + }, + { + "epoch": 0.8183705754399841, + "step": 8277, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.8183705754399841, + "step": 8277, + "train/total_loss": 0.18071939051151276 + }, + { + "entropy": 8.80375862121582, + "epoch": 0.8184694482894997, + "mean_token_accuracy": 0.7493734359741211, + "num_tokens": 22277490.0, + "step": 8278, + "train/ce_loss": 5.999174277349084e-07 + }, + { + "epoch": 0.8184694482894997, + "step": 8278, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8184694482894997, + "step": 8278, + "train/total_loss": 0.035156309604644775 + }, + { + "entropy": 8.638599395751953, + "epoch": 0.8185683211390152, + "mean_token_accuracy": 0.7134292721748352, + "num_tokens": 22282797.0, + "step": 8279, + "train/ce_loss": 0.7618140578269958 + }, + { + "epoch": 0.8185683211390152, + "step": 8279, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8185683211390152, + "step": 8279, + "train/total_loss": 0.12696266174316406 + }, + { + "epoch": 0.8186671939885307, + "grad_norm": 0.7438392043113708, + "learning_rate": 7.95554566582604e-06, + "loss": 0.1353, + "step": 8280 + }, + { + "entropy": 9.337332725524902, + "epoch": 0.8186671939885307, + "mean_token_accuracy": 0.7757847309112549, + "num_tokens": 22287669.0, + "step": 8280, + "train/ce_loss": 1.153381586074829 + }, + { + "epoch": 0.8186671939885307, + "step": 8280, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.8186671939885307, + "step": 8280, + "train/total_loss": 0.19736941158771515 + }, + { + "entropy": 8.556232452392578, + "epoch": 0.8187660668380463, + "mean_token_accuracy": 0.692118227481842, + "num_tokens": 22292920.0, + "step": 8281, + "train/ce_loss": 0.8099319338798523 + }, + { + "epoch": 0.8187660668380463, + "step": 8281, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8187660668380463, + "step": 8281, + "train/total_loss": 0.10443069785833359 + }, + { + "entropy": 8.515803337097168, + "epoch": 0.8188649396875618, + "mean_token_accuracy": 0.6752136945724487, + "num_tokens": 22298330.0, + "step": 8282, + "train/ce_loss": 1.8313559293746948 + }, + { + "epoch": 0.8188649396875618, + "step": 8282, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8188649396875618, + "step": 8282, + "train/total_loss": 0.22219809889793396 + }, + { + "entropy": 9.142642974853516, + "epoch": 0.8189638125370773, + "mean_token_accuracy": 0.7852664589881897, + "num_tokens": 22303398.0, + "step": 8283, + "train/ce_loss": 1.0307633876800537 + }, + { + "epoch": 0.8189638125370773, + "step": 8283, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8189638125370773, + "step": 8283, + "train/total_loss": 0.15776383876800537 + }, + { + "entropy": 8.419479370117188, + "epoch": 0.8190626853865929, + "mean_token_accuracy": 0.7898229956626892, + "num_tokens": 22308773.0, + "step": 8284, + "train/ce_loss": 0.5398300886154175 + }, + { + "epoch": 0.8190626853865929, + "step": 8284, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8190626853865929, + "step": 8284, + "train/total_loss": 0.10867051035165787 + }, + { + "entropy": 8.660914421081543, + "epoch": 0.8191615582361084, + "mean_token_accuracy": 0.7525309324264526, + "num_tokens": 22314115.0, + "step": 8285, + "train/ce_loss": 1.1084849834442139 + }, + { + "epoch": 0.8191615582361084, + "step": 8285, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8191615582361084, + "step": 8285, + "train/total_loss": 0.15772350132465363 + }, + { + "entropy": 8.585163116455078, + "epoch": 0.8192604310856239, + "mean_token_accuracy": 0.8113878965377808, + "num_tokens": 22319453.0, + "step": 8286, + "train/ce_loss": 0.3588511645793915 + }, + { + "epoch": 0.8192604310856239, + "step": 8286, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.8192604310856239, + "step": 8286, + "train/total_loss": 0.11401011794805527 + }, + { + "entropy": 8.49917221069336, + "epoch": 0.8193593039351394, + "mean_token_accuracy": 0.7005405426025391, + "num_tokens": 22324888.0, + "step": 8287, + "train/ce_loss": 0.6729772090911865 + }, + { + "epoch": 0.8193593039351394, + "step": 8287, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.8193593039351394, + "step": 8287, + "train/total_loss": 0.14151647686958313 + }, + { + "entropy": 9.326393127441406, + "epoch": 0.8194581767846549, + "mean_token_accuracy": 0.723809540271759, + "num_tokens": 22329744.0, + "step": 8288, + "train/ce_loss": 1.513746976852417 + }, + { + "epoch": 0.8194581767846549, + "step": 8288, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.8194581767846549, + "step": 8288, + "train/total_loss": 0.2451246976852417 + }, + { + "entropy": 8.601880073547363, + "epoch": 0.8195570496341704, + "mean_token_accuracy": 0.7397260069847107, + "num_tokens": 22335017.0, + "step": 8289, + "train/ce_loss": 0.7906304001808167 + }, + { + "epoch": 0.8195570496341704, + "step": 8289, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8195570496341704, + "step": 8289, + "train/total_loss": 0.1181255429983139 + }, + { + "entropy": 8.5064115524292, + "epoch": 0.819655922483686, + "mean_token_accuracy": 0.7393674850463867, + "num_tokens": 22340411.0, + "step": 8290, + "train/ce_loss": 0.7451791167259216 + }, + { + "epoch": 0.819655922483686, + "step": 8290, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.819655922483686, + "step": 8290, + "train/total_loss": 0.09014291316270828 + }, + { + "entropy": 9.029192924499512, + "epoch": 0.8197547953332015, + "mean_token_accuracy": 0.7339743375778198, + "num_tokens": 22345485.0, + "step": 8291, + "train/ce_loss": 0.9517990350723267 + }, + { + "epoch": 0.8197547953332015, + "step": 8291, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8197547953332015, + "step": 8291, + "train/total_loss": 0.15377366542816162 + }, + { + "entropy": 8.917953491210938, + "epoch": 0.819853668182717, + "mean_token_accuracy": 0.7058823704719543, + "num_tokens": 22350742.0, + "step": 8292, + "train/ce_loss": 1.5822000705156825e-06 + }, + { + "epoch": 0.819853668182717, + "step": 8292, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.819853668182717, + "step": 8292, + "train/total_loss": 0.031250156462192535 + }, + { + "entropy": 9.037485122680664, + "epoch": 0.8199525410322326, + "mean_token_accuracy": 0.7503876090049744, + "num_tokens": 22355827.0, + "step": 8293, + "train/ce_loss": 1.3138306140899658 + }, + { + "epoch": 0.8199525410322326, + "step": 8293, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8199525410322326, + "step": 8293, + "train/total_loss": 0.19778931140899658 + }, + { + "entropy": 9.30839729309082, + "epoch": 0.8200514138817481, + "mean_token_accuracy": 0.7744361162185669, + "num_tokens": 22360627.0, + "step": 8294, + "train/ce_loss": 1.0988813638687134 + }, + { + "epoch": 0.8200514138817481, + "step": 8294, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8200514138817481, + "step": 8294, + "train/total_loss": 0.14895063638687134 + }, + { + "entropy": 8.727283477783203, + "epoch": 0.8201502867312636, + "mean_token_accuracy": 0.7434210777282715, + "num_tokens": 22365805.0, + "step": 8295, + "train/ce_loss": 1.1040056943893433 + }, + { + "epoch": 0.8201502867312636, + "step": 8295, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8201502867312636, + "step": 8295, + "train/total_loss": 0.14165057241916656 + }, + { + "entropy": 8.835163116455078, + "epoch": 0.8202491595807792, + "mean_token_accuracy": 0.7066666483879089, + "num_tokens": 22371104.0, + "step": 8296, + "train/ce_loss": 1.369777798652649 + }, + { + "epoch": 0.8202491595807792, + "step": 8296, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8202491595807792, + "step": 8296, + "train/total_loss": 0.19557152688503265 + }, + { + "entropy": 8.338752746582031, + "epoch": 0.8203480324302946, + "mean_token_accuracy": 0.748913049697876, + "num_tokens": 22376719.0, + "step": 8297, + "train/ce_loss": 0.8425332307815552 + }, + { + "epoch": 0.8203480324302946, + "step": 8297, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8203480324302946, + "step": 8297, + "train/total_loss": 0.14675332605838776 + }, + { + "entropy": 8.253923416137695, + "epoch": 0.8204469052798101, + "mean_token_accuracy": 0.7366803288459778, + "num_tokens": 22382188.0, + "step": 8298, + "train/ce_loss": 0.9377067685127258 + }, + { + "epoch": 0.8204469052798101, + "step": 8298, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8204469052798101, + "step": 8298, + "train/total_loss": 0.14845818281173706 + }, + { + "entropy": 9.257837295532227, + "epoch": 0.8205457781293257, + "mean_token_accuracy": 0.728314220905304, + "num_tokens": 22387243.0, + "step": 8299, + "train/ce_loss": 0.8520750403404236 + }, + { + "epoch": 0.8205457781293257, + "step": 8299, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8205457781293257, + "step": 8299, + "train/total_loss": 0.1203637570142746 + }, + { + "epoch": 0.8206446509788412, + "grad_norm": 0.7130552530288696, + "learning_rate": 7.950600801068091e-06, + "loss": 0.1332, + "step": 8300 + }, + { + "entropy": 8.64598274230957, + "epoch": 0.8206446509788412, + "mean_token_accuracy": 0.7901375889778137, + "num_tokens": 22392555.0, + "step": 8300, + "train/ce_loss": 0.6230838894844055 + }, + { + "epoch": 0.8206446509788412, + "step": 8300, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.8206446509788412, + "step": 8300, + "train/total_loss": 0.1755896359682083 + }, + { + "entropy": 9.163591384887695, + "epoch": 0.8207435238283567, + "mean_token_accuracy": 0.7315112352371216, + "num_tokens": 22397643.0, + "step": 8301, + "train/ce_loss": 1.5040645599365234 + }, + { + "epoch": 0.8207435238283567, + "step": 8301, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.8207435238283567, + "step": 8301, + "train/total_loss": 0.22853146493434906 + }, + { + "entropy": 9.302888870239258, + "epoch": 0.8208423966778723, + "mean_token_accuracy": 0.7225000262260437, + "num_tokens": 22402433.0, + "step": 8302, + "train/ce_loss": 1.322007392445812e-06 + }, + { + "epoch": 0.8208423966778723, + "step": 8302, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8208423966778723, + "step": 8302, + "train/total_loss": 0.046875130385160446 + }, + { + "entropy": 9.107100486755371, + "epoch": 0.8209412695273878, + "mean_token_accuracy": 0.6994134783744812, + "num_tokens": 22407533.0, + "step": 8303, + "train/ce_loss": 5.614239171336521e-07 + }, + { + "epoch": 0.8209412695273878, + "step": 8303, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8209412695273878, + "step": 8303, + "train/total_loss": 0.03515630587935448 + }, + { + "entropy": 8.753803253173828, + "epoch": 0.8210401423769033, + "mean_token_accuracy": 0.746185839176178, + "num_tokens": 22412736.0, + "step": 8304, + "train/ce_loss": 0.6114625334739685 + }, + { + "epoch": 0.8210401423769033, + "step": 8304, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8210401423769033, + "step": 8304, + "train/total_loss": 0.11192750930786133 + }, + { + "entropy": 8.770864486694336, + "epoch": 0.8211390152264189, + "mean_token_accuracy": 0.6758373379707336, + "num_tokens": 22418028.0, + "step": 8305, + "train/ce_loss": 1.266835331916809 + }, + { + "epoch": 0.8211390152264189, + "step": 8305, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8211390152264189, + "step": 8305, + "train/total_loss": 0.1969960331916809 + }, + { + "entropy": 9.309890747070312, + "epoch": 0.8212378880759343, + "mean_token_accuracy": 0.7563451528549194, + "num_tokens": 22423032.0, + "step": 8306, + "train/ce_loss": 1.0493959188461304 + }, + { + "epoch": 0.8212378880759343, + "step": 8306, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.8212378880759343, + "step": 8306, + "train/total_loss": 0.19868959486484528 + }, + { + "entropy": 9.099388122558594, + "epoch": 0.8213367609254498, + "mean_token_accuracy": 0.8033536672592163, + "num_tokens": 22428139.0, + "step": 8307, + "train/ce_loss": 0.6932326555252075 + }, + { + "epoch": 0.8213367609254498, + "step": 8307, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8213367609254498, + "step": 8307, + "train/total_loss": 0.08494826406240463 + }, + { + "entropy": 8.80750846862793, + "epoch": 0.8214356337749654, + "mean_token_accuracy": 0.8082840442657471, + "num_tokens": 22433438.0, + "step": 8308, + "train/ce_loss": 0.5369587540626526 + }, + { + "epoch": 0.8214356337749654, + "step": 8308, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8214356337749654, + "step": 8308, + "train/total_loss": 0.08494587242603302 + }, + { + "entropy": 8.695101737976074, + "epoch": 0.8215345066244809, + "mean_token_accuracy": 0.7209302186965942, + "num_tokens": 22438765.0, + "step": 8309, + "train/ce_loss": 1.1398298740386963 + }, + { + "epoch": 0.8215345066244809, + "step": 8309, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8215345066244809, + "step": 8309, + "train/total_loss": 0.18038924038410187 + }, + { + "entropy": 9.937353134155273, + "epoch": 0.8216333794739964, + "mean_token_accuracy": 0.8384279608726501, + "num_tokens": 22443405.0, + "step": 8310, + "train/ce_loss": 1.3805160961055662e-06 + }, + { + "epoch": 0.8216333794739964, + "step": 8310, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.8216333794739964, + "step": 8310, + "train/total_loss": 0.019531387835741043 + }, + { + "entropy": 9.165075302124023, + "epoch": 0.821732252323512, + "mean_token_accuracy": 0.7601476311683655, + "num_tokens": 22448388.0, + "step": 8311, + "train/ce_loss": 0.6348003149032593 + }, + { + "epoch": 0.821732252323512, + "step": 8311, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.821732252323512, + "step": 8311, + "train/total_loss": 0.09082378447055817 + }, + { + "entropy": 9.357953071594238, + "epoch": 0.8218311251730275, + "mean_token_accuracy": 0.7235293984413147, + "num_tokens": 22453288.0, + "step": 8312, + "train/ce_loss": 1.2712736129760742 + }, + { + "epoch": 0.8218311251730275, + "step": 8312, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8218311251730275, + "step": 8312, + "train/total_loss": 0.16618986427783966 + }, + { + "entropy": 8.546711921691895, + "epoch": 0.821929998022543, + "mean_token_accuracy": 0.7825160026550293, + "num_tokens": 22458686.0, + "step": 8313, + "train/ce_loss": 0.5331897735595703 + }, + { + "epoch": 0.821929998022543, + "step": 8313, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.821929998022543, + "step": 8313, + "train/total_loss": 0.06894397735595703 + }, + { + "entropy": 8.55560302734375, + "epoch": 0.8220288708720586, + "mean_token_accuracy": 0.7384615540504456, + "num_tokens": 22464101.0, + "step": 8314, + "train/ce_loss": 0.44517233967781067 + }, + { + "epoch": 0.8220288708720586, + "step": 8314, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8220288708720586, + "step": 8314, + "train/total_loss": 0.07186098396778107 + }, + { + "entropy": 8.67685317993164, + "epoch": 0.822127743721574, + "mean_token_accuracy": 0.7303225994110107, + "num_tokens": 22469378.0, + "step": 8315, + "train/ce_loss": 0.9484623670578003 + }, + { + "epoch": 0.822127743721574, + "step": 8315, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.822127743721574, + "step": 8315, + "train/total_loss": 0.15734624862670898 + }, + { + "entropy": 8.476226806640625, + "epoch": 0.8222266165710895, + "mean_token_accuracy": 0.7451550364494324, + "num_tokens": 22474939.0, + "step": 8316, + "train/ce_loss": 1.000063419342041 + }, + { + "epoch": 0.8222266165710895, + "step": 8316, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8222266165710895, + "step": 8316, + "train/total_loss": 0.1468813419342041 + }, + { + "entropy": 9.247249603271484, + "epoch": 0.8223254894206051, + "mean_token_accuracy": 0.6802217960357666, + "num_tokens": 22479948.0, + "step": 8317, + "train/ce_loss": 1.2007057666778564 + }, + { + "epoch": 0.8223254894206051, + "step": 8317, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.8223254894206051, + "step": 8317, + "train/total_loss": 0.21772682666778564 + }, + { + "entropy": 9.541007995605469, + "epoch": 0.8224243622701206, + "mean_token_accuracy": 0.6674311757087708, + "num_tokens": 22484793.0, + "step": 8318, + "train/ce_loss": 3.321171561765368e-06 + }, + { + "epoch": 0.8224243622701206, + "step": 8318, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8224243622701206, + "step": 8318, + "train/total_loss": 0.023437831550836563 + }, + { + "entropy": 8.837417602539062, + "epoch": 0.8225232351196361, + "mean_token_accuracy": 0.6903954744338989, + "num_tokens": 22490131.0, + "step": 8319, + "train/ce_loss": 1.2030870914459229 + }, + { + "epoch": 0.8225232351196361, + "step": 8319, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8225232351196361, + "step": 8319, + "train/total_loss": 0.19062121212482452 + }, + { + "epoch": 0.8226221079691517, + "grad_norm": 0.7055894732475281, + "learning_rate": 7.945655936310143e-06, + "loss": 0.1319, + "step": 8320 + }, + { + "entropy": 8.6664400100708, + "epoch": 0.8226221079691517, + "mean_token_accuracy": 0.7209821343421936, + "num_tokens": 22495530.0, + "step": 8320, + "train/ce_loss": 1.1936757564544678 + }, + { + "epoch": 0.8226221079691517, + "step": 8320, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8226221079691517, + "step": 8320, + "train/total_loss": 0.1740550696849823 + }, + { + "entropy": 9.077009201049805, + "epoch": 0.8227209808186672, + "mean_token_accuracy": 0.7269230484962463, + "num_tokens": 22500516.0, + "step": 8321, + "train/ce_loss": 0.8068293929100037 + }, + { + "epoch": 0.8227209808186672, + "step": 8321, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8227209808186672, + "step": 8321, + "train/total_loss": 0.11974544078111649 + }, + { + "entropy": 8.769912719726562, + "epoch": 0.8228198536681827, + "mean_token_accuracy": 0.6963350772857666, + "num_tokens": 22505729.0, + "step": 8322, + "train/ce_loss": 8.714407158549875e-06 + }, + { + "epoch": 0.8228198536681827, + "step": 8322, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8228198536681827, + "step": 8322, + "train/total_loss": 0.02734462171792984 + }, + { + "entropy": 9.142882347106934, + "epoch": 0.8229187265176983, + "mean_token_accuracy": 0.6964285969734192, + "num_tokens": 22510860.0, + "step": 8323, + "train/ce_loss": 0.9637071490287781 + }, + { + "epoch": 0.8229187265176983, + "step": 8323, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.8229187265176983, + "step": 8323, + "train/total_loss": 0.18230822682380676 + }, + { + "entropy": 9.229522705078125, + "epoch": 0.8230175993672137, + "mean_token_accuracy": 0.7593167424201965, + "num_tokens": 22515897.0, + "step": 8324, + "train/ce_loss": 5.397641871240921e-07 + }, + { + "epoch": 0.8230175993672137, + "step": 8324, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.8230175993672137, + "step": 8324, + "train/total_loss": 0.011718804016709328 + }, + { + "entropy": 8.921040534973145, + "epoch": 0.8231164722167292, + "mean_token_accuracy": 0.7547847032546997, + "num_tokens": 22521199.0, + "step": 8325, + "train/ce_loss": 0.5268176794052124 + }, + { + "epoch": 0.8231164722167292, + "step": 8325, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8231164722167292, + "step": 8325, + "train/total_loss": 0.09174427390098572 + }, + { + "entropy": 9.184289932250977, + "epoch": 0.8232153450662448, + "mean_token_accuracy": 0.7299703359603882, + "num_tokens": 22526335.0, + "step": 8326, + "train/ce_loss": 0.8263388872146606 + }, + { + "epoch": 0.8232153450662448, + "step": 8326, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.8232153450662448, + "step": 8326, + "train/total_loss": 0.1646651327610016 + }, + { + "entropy": 8.599185943603516, + "epoch": 0.8233142179157603, + "mean_token_accuracy": 0.7813725471496582, + "num_tokens": 22531824.0, + "step": 8327, + "train/ce_loss": 0.7705698609352112 + }, + { + "epoch": 0.8233142179157603, + "step": 8327, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.8233142179157603, + "step": 8327, + "train/total_loss": 0.09658823907375336 + }, + { + "entropy": 9.578577995300293, + "epoch": 0.8234130907652758, + "mean_token_accuracy": 0.7731958627700806, + "num_tokens": 22536644.0, + "step": 8328, + "train/ce_loss": 1.8175444438384147e-06 + }, + { + "epoch": 0.8234130907652758, + "step": 8328, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8234130907652758, + "step": 8328, + "train/total_loss": 0.042968932539224625 + }, + { + "entropy": 8.849867820739746, + "epoch": 0.8235119636147914, + "mean_token_accuracy": 0.7250280380249023, + "num_tokens": 22541954.0, + "step": 8329, + "train/ce_loss": 1.332899808883667 + }, + { + "epoch": 0.8235119636147914, + "step": 8329, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8235119636147914, + "step": 8329, + "train/total_loss": 0.16063372790813446 + }, + { + "entropy": 8.916130065917969, + "epoch": 0.8236108364643069, + "mean_token_accuracy": 0.7496318221092224, + "num_tokens": 22547085.0, + "step": 8330, + "train/ce_loss": 0.9665127992630005 + }, + { + "epoch": 0.8236108364643069, + "step": 8330, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8236108364643069, + "step": 8330, + "train/total_loss": 0.14743253588676453 + }, + { + "entropy": 8.864173889160156, + "epoch": 0.8237097093138224, + "mean_token_accuracy": 0.7624861001968384, + "num_tokens": 22552479.0, + "step": 8331, + "train/ce_loss": 0.5066568851470947 + }, + { + "epoch": 0.8237097093138224, + "step": 8331, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.8237097093138224, + "step": 8331, + "train/total_loss": 0.1366031914949417 + }, + { + "entropy": 8.682044982910156, + "epoch": 0.823808582163338, + "mean_token_accuracy": 0.7339832782745361, + "num_tokens": 22557646.0, + "step": 8332, + "train/ce_loss": 0.9429482817649841 + }, + { + "epoch": 0.823808582163338, + "step": 8332, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.823808582163338, + "step": 8332, + "train/total_loss": 0.20366983115673065 + }, + { + "entropy": 8.33643627166748, + "epoch": 0.8239074550128535, + "mean_token_accuracy": 0.7113022208213806, + "num_tokens": 22562989.0, + "step": 8333, + "train/ce_loss": 1.1955353021621704 + }, + { + "epoch": 0.8239074550128535, + "step": 8333, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.8239074550128535, + "step": 8333, + "train/total_loss": 0.21330353617668152 + }, + { + "entropy": 9.05999755859375, + "epoch": 0.824006327862369, + "mean_token_accuracy": 0.7610872387886047, + "num_tokens": 22568175.0, + "step": 8334, + "train/ce_loss": 4.788768706021074e-07 + }, + { + "epoch": 0.824006327862369, + "step": 8334, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.824006327862369, + "step": 8334, + "train/total_loss": 0.011718797497451305 + }, + { + "entropy": 8.767662048339844, + "epoch": 0.8241052007118845, + "mean_token_accuracy": 0.7311828136444092, + "num_tokens": 22573392.0, + "step": 8335, + "train/ce_loss": 0.5941759347915649 + }, + { + "epoch": 0.8241052007118845, + "step": 8335, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8241052007118845, + "step": 8335, + "train/total_loss": 0.11410509049892426 + }, + { + "entropy": 8.826053619384766, + "epoch": 0.8242040735614, + "mean_token_accuracy": 0.7337662577629089, + "num_tokens": 22578487.0, + "step": 8336, + "train/ce_loss": 4.6122145249682944e-06 + }, + { + "epoch": 0.8242040735614, + "step": 8336, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8242040735614, + "step": 8336, + "train/total_loss": 0.03515671193599701 + }, + { + "entropy": 9.424985885620117, + "epoch": 0.8243029464109156, + "mean_token_accuracy": 0.7524752616882324, + "num_tokens": 22583428.0, + "step": 8337, + "train/ce_loss": 0.8262380957603455 + }, + { + "epoch": 0.8243029464109156, + "step": 8337, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8243029464109156, + "step": 8337, + "train/total_loss": 0.13731130957603455 + }, + { + "entropy": 9.327249526977539, + "epoch": 0.8244018192604311, + "mean_token_accuracy": 0.7720306515693665, + "num_tokens": 22588375.0, + "step": 8338, + "train/ce_loss": 0.8780062198638916 + }, + { + "epoch": 0.8244018192604311, + "step": 8338, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8244018192604311, + "step": 8338, + "train/total_loss": 0.12686312198638916 + }, + { + "entropy": 8.657686233520508, + "epoch": 0.8245006921099466, + "mean_token_accuracy": 0.7360405921936035, + "num_tokens": 22593823.0, + "step": 8339, + "train/ce_loss": 1.0873697996139526 + }, + { + "epoch": 0.8245006921099466, + "step": 8339, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8245006921099466, + "step": 8339, + "train/total_loss": 0.15951824188232422 + }, + { + "epoch": 0.8245995649594622, + "grad_norm": 0.573781430721283, + "learning_rate": 7.940711071552194e-06, + "loss": 0.1365, + "step": 8340 + }, + { + "entropy": 8.919189453125, + "epoch": 0.8245995649594622, + "mean_token_accuracy": 0.6984318494796753, + "num_tokens": 22599110.0, + "step": 8340, + "train/ce_loss": 1.0977706909179688 + }, + { + "epoch": 0.8245995649594622, + "step": 8340, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8245995649594622, + "step": 8340, + "train/total_loss": 0.1449333131313324 + }, + { + "entropy": 8.751079559326172, + "epoch": 0.8246984378089777, + "mean_token_accuracy": 0.6935867071151733, + "num_tokens": 22604477.0, + "step": 8341, + "train/ce_loss": 1.095212459564209 + }, + { + "epoch": 0.8246984378089777, + "step": 8341, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8246984378089777, + "step": 8341, + "train/total_loss": 0.16811498999595642 + }, + { + "entropy": 8.826139450073242, + "epoch": 0.8247973106584932, + "mean_token_accuracy": 0.7035236954689026, + "num_tokens": 22609834.0, + "step": 8342, + "train/ce_loss": 1.364446759223938 + }, + { + "epoch": 0.8247973106584932, + "step": 8342, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.8247973106584932, + "step": 8342, + "train/total_loss": 0.24191342294216156 + }, + { + "entropy": 9.234088897705078, + "epoch": 0.8248961835080088, + "mean_token_accuracy": 0.7575277090072632, + "num_tokens": 22614883.0, + "step": 8343, + "train/ce_loss": 1.440025806427002 + }, + { + "epoch": 0.8248961835080088, + "step": 8343, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8248961835080088, + "step": 8343, + "train/total_loss": 0.16744008660316467 + }, + { + "entropy": 9.1658935546875, + "epoch": 0.8249950563575242, + "mean_token_accuracy": 0.743813693523407, + "num_tokens": 22620039.0, + "step": 8344, + "train/ce_loss": 1.5082443952560425 + }, + { + "epoch": 0.8249950563575242, + "step": 8344, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8249950563575242, + "step": 8344, + "train/total_loss": 0.1976994425058365 + }, + { + "entropy": 9.100385665893555, + "epoch": 0.8250939292070397, + "mean_token_accuracy": 0.761689305305481, + "num_tokens": 22625123.0, + "step": 8345, + "train/ce_loss": 8.795655048743356e-07 + }, + { + "epoch": 0.8250939292070397, + "step": 8345, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8250939292070397, + "step": 8345, + "train/total_loss": 0.05859383940696716 + }, + { + "entropy": 9.031524658203125, + "epoch": 0.8251928020565553, + "mean_token_accuracy": 0.7572078704833984, + "num_tokens": 22630206.0, + "step": 8346, + "train/ce_loss": 0.6743995547294617 + }, + { + "epoch": 0.8251928020565553, + "step": 8346, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8251928020565553, + "step": 8346, + "train/total_loss": 0.0947837084531784 + }, + { + "entropy": 8.112039566040039, + "epoch": 0.8252916749060708, + "mean_token_accuracy": 0.7008771896362305, + "num_tokens": 22635773.0, + "step": 8347, + "train/ce_loss": 1.1441271305084229 + }, + { + "epoch": 0.8252916749060708, + "step": 8347, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.8252916749060708, + "step": 8347, + "train/total_loss": 0.20035022497177124 + }, + { + "entropy": 8.473743438720703, + "epoch": 0.8253905477555863, + "mean_token_accuracy": 0.7245370149612427, + "num_tokens": 22641116.0, + "step": 8348, + "train/ce_loss": 0.8390929698944092 + }, + { + "epoch": 0.8253905477555863, + "step": 8348, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8253905477555863, + "step": 8348, + "train/total_loss": 0.1542218029499054 + }, + { + "entropy": 8.437721252441406, + "epoch": 0.8254894206051019, + "mean_token_accuracy": 0.725806474685669, + "num_tokens": 22646642.0, + "step": 8349, + "train/ce_loss": 0.5069792866706848 + }, + { + "epoch": 0.8254894206051019, + "step": 8349, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8254894206051019, + "step": 8349, + "train/total_loss": 0.1014791801571846 + }, + { + "entropy": 8.793678283691406, + "epoch": 0.8255882934546174, + "mean_token_accuracy": 0.7767969965934753, + "num_tokens": 22651962.0, + "step": 8350, + "train/ce_loss": 0.43854376673698425 + }, + { + "epoch": 0.8255882934546174, + "step": 8350, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8255882934546174, + "step": 8350, + "train/total_loss": 0.09463562816381454 + }, + { + "entropy": 8.788917541503906, + "epoch": 0.8256871663041329, + "mean_token_accuracy": 0.7319728136062622, + "num_tokens": 22657170.0, + "step": 8351, + "train/ce_loss": 0.5017361640930176 + }, + { + "epoch": 0.8256871663041329, + "step": 8351, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8256871663041329, + "step": 8351, + "train/total_loss": 0.09314236789941788 + }, + { + "entropy": 8.618673324584961, + "epoch": 0.8257860391536485, + "mean_token_accuracy": 0.6952965259552002, + "num_tokens": 22662603.0, + "step": 8352, + "train/ce_loss": 1.5782723426818848 + }, + { + "epoch": 0.8257860391536485, + "step": 8352, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.8257860391536485, + "step": 8352, + "train/total_loss": 0.2359522432088852 + }, + { + "entropy": 8.515067100524902, + "epoch": 0.8258849120031639, + "mean_token_accuracy": 0.7948718070983887, + "num_tokens": 22667913.0, + "step": 8353, + "train/ce_loss": 0.9898000359535217 + }, + { + "epoch": 0.8258849120031639, + "step": 8353, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8258849120031639, + "step": 8353, + "train/total_loss": 0.14585500955581665 + }, + { + "entropy": 8.51097297668457, + "epoch": 0.8259837848526794, + "mean_token_accuracy": 0.7309874892234802, + "num_tokens": 22673272.0, + "step": 8354, + "train/ce_loss": 0.6297245621681213 + }, + { + "epoch": 0.8259837848526794, + "step": 8354, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8259837848526794, + "step": 8354, + "train/total_loss": 0.09422245621681213 + }, + { + "entropy": 8.786314010620117, + "epoch": 0.826082657702195, + "mean_token_accuracy": 0.6924968957901001, + "num_tokens": 22678561.0, + "step": 8355, + "train/ce_loss": 0.7038342952728271 + }, + { + "epoch": 0.826082657702195, + "step": 8355, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.826082657702195, + "step": 8355, + "train/total_loss": 0.10553967952728271 + }, + { + "entropy": 9.134121894836426, + "epoch": 0.8261815305517105, + "mean_token_accuracy": 0.8154860138893127, + "num_tokens": 22683557.0, + "step": 8356, + "train/ce_loss": 5.090377044325578e-07 + }, + { + "epoch": 0.8261815305517105, + "step": 8356, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8261815305517105, + "step": 8356, + "train/total_loss": 0.04687505215406418 + }, + { + "entropy": 8.495359420776367, + "epoch": 0.826280403401226, + "mean_token_accuracy": 0.6715686321258545, + "num_tokens": 22689015.0, + "step": 8357, + "train/ce_loss": 1.5298234224319458 + }, + { + "epoch": 0.826280403401226, + "step": 8357, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.826280403401226, + "step": 8357, + "train/total_loss": 0.19204483926296234 + }, + { + "entropy": 8.934258460998535, + "epoch": 0.8263792762507416, + "mean_token_accuracy": 0.75, + "num_tokens": 22694265.0, + "step": 8358, + "train/ce_loss": 0.9743590950965881 + }, + { + "epoch": 0.8263792762507416, + "step": 8358, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.8263792762507416, + "step": 8358, + "train/total_loss": 0.17556092143058777 + }, + { + "entropy": 9.121557235717773, + "epoch": 0.8264781491002571, + "mean_token_accuracy": 0.680672287940979, + "num_tokens": 22699467.0, + "step": 8359, + "train/ce_loss": 0.8887518048286438 + }, + { + "epoch": 0.8264781491002571, + "step": 8359, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8264781491002571, + "step": 8359, + "train/total_loss": 0.1357501745223999 + }, + { + "epoch": 0.8265770219497726, + "grad_norm": 0.7569577693939209, + "learning_rate": 7.935766206794246e-06, + "loss": 0.1418, + "step": 8360 + }, + { + "entropy": 8.900482177734375, + "epoch": 0.8265770219497726, + "mean_token_accuracy": 0.7642276287078857, + "num_tokens": 22704663.0, + "step": 8360, + "train/ce_loss": 0.5398333072662354 + }, + { + "epoch": 0.8265770219497726, + "step": 8360, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8265770219497726, + "step": 8360, + "train/total_loss": 0.11648333072662354 + }, + { + "entropy": 9.118875503540039, + "epoch": 0.8266758947992882, + "mean_token_accuracy": 0.6995153427124023, + "num_tokens": 22709685.0, + "step": 8361, + "train/ce_loss": 1.1485044524306431e-06 + }, + { + "epoch": 0.8266758947992882, + "step": 8361, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8266758947992882, + "step": 8361, + "train/total_loss": 0.03515636548399925 + }, + { + "entropy": 8.710692405700684, + "epoch": 0.8267747676488036, + "mean_token_accuracy": 0.7369020581245422, + "num_tokens": 22715058.0, + "step": 8362, + "train/ce_loss": 0.6468256711959839 + }, + { + "epoch": 0.8267747676488036, + "step": 8362, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.8267747676488036, + "step": 8362, + "train/total_loss": 0.17015132308006287 + }, + { + "entropy": 9.265089988708496, + "epoch": 0.8268736404983191, + "mean_token_accuracy": 0.7182866334915161, + "num_tokens": 22720280.0, + "step": 8363, + "train/ce_loss": 1.0972142219543457 + }, + { + "epoch": 0.8268736404983191, + "step": 8363, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8268736404983191, + "step": 8363, + "train/total_loss": 0.18003392219543457 + }, + { + "entropy": 8.912721633911133, + "epoch": 0.8269725133478347, + "mean_token_accuracy": 0.7468531727790833, + "num_tokens": 22725493.0, + "step": 8364, + "train/ce_loss": 1.1656962897177436e-06 + }, + { + "epoch": 0.8269725133478347, + "step": 8364, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8269725133478347, + "step": 8364, + "train/total_loss": 0.05859386548399925 + }, + { + "entropy": 9.387989044189453, + "epoch": 0.8270713861973502, + "mean_token_accuracy": 0.8360071182250977, + "num_tokens": 22730463.0, + "step": 8365, + "train/ce_loss": 0.6007579565048218 + }, + { + "epoch": 0.8270713861973502, + "step": 8365, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.8270713861973502, + "step": 8365, + "train/total_loss": 0.1460132896900177 + }, + { + "entropy": 8.675314903259277, + "epoch": 0.8271702590468657, + "mean_token_accuracy": 0.7235772609710693, + "num_tokens": 22735762.0, + "step": 8366, + "train/ce_loss": 0.8701668977737427 + }, + { + "epoch": 0.8271702590468657, + "step": 8366, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.8271702590468657, + "step": 8366, + "train/total_loss": 0.20029795169830322 + }, + { + "entropy": 9.281265258789062, + "epoch": 0.8272691318963813, + "mean_token_accuracy": 0.7655601501464844, + "num_tokens": 22740697.0, + "step": 8367, + "train/ce_loss": 0.726944625377655 + }, + { + "epoch": 0.8272691318963813, + "step": 8367, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8272691318963813, + "step": 8367, + "train/total_loss": 0.12347571551799774 + }, + { + "entropy": 9.316499710083008, + "epoch": 0.8273680047458968, + "mean_token_accuracy": 0.7140411138534546, + "num_tokens": 22745762.0, + "step": 8368, + "train/ce_loss": 1.2663293773584883e-06 + }, + { + "epoch": 0.8273680047458968, + "step": 8368, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8273680047458968, + "step": 8368, + "train/total_loss": 0.04296887665987015 + }, + { + "entropy": 8.754581451416016, + "epoch": 0.8274668775954123, + "mean_token_accuracy": 0.7535884976387024, + "num_tokens": 22751003.0, + "step": 8369, + "train/ce_loss": 1.0439002513885498 + }, + { + "epoch": 0.8274668775954123, + "step": 8369, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8274668775954123, + "step": 8369, + "train/total_loss": 0.13954627513885498 + }, + { + "entropy": 9.088933944702148, + "epoch": 0.8275657504449279, + "mean_token_accuracy": 0.745814323425293, + "num_tokens": 22756149.0, + "step": 8370, + "train/ce_loss": 0.6570780873298645 + }, + { + "epoch": 0.8275657504449279, + "step": 8370, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.8275657504449279, + "step": 8370, + "train/total_loss": 0.15164530277252197 + }, + { + "entropy": 8.768074035644531, + "epoch": 0.8276646232944433, + "mean_token_accuracy": 0.7833982110023499, + "num_tokens": 22761399.0, + "step": 8371, + "train/ce_loss": 0.6726462244987488 + }, + { + "epoch": 0.8276646232944433, + "step": 8371, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8276646232944433, + "step": 8371, + "train/total_loss": 0.094608373939991 + }, + { + "entropy": 8.775765419006348, + "epoch": 0.8277634961439588, + "mean_token_accuracy": 0.7537227869033813, + "num_tokens": 22766702.0, + "step": 8372, + "train/ce_loss": 0.612133800983429 + }, + { + "epoch": 0.8277634961439588, + "step": 8372, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8277634961439588, + "step": 8372, + "train/total_loss": 0.08465088158845901 + }, + { + "entropy": 9.579498291015625, + "epoch": 0.8278623689934744, + "mean_token_accuracy": 0.6849710941314697, + "num_tokens": 22771445.0, + "step": 8373, + "train/ce_loss": 1.8132412433624268 + }, + { + "epoch": 0.8278623689934744, + "step": 8373, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8278623689934744, + "step": 8373, + "train/total_loss": 0.23991787433624268 + }, + { + "entropy": 8.698003768920898, + "epoch": 0.8279612418429899, + "mean_token_accuracy": 0.7521276473999023, + "num_tokens": 22776877.0, + "step": 8374, + "train/ce_loss": 0.5366253852844238 + }, + { + "epoch": 0.8279612418429899, + "step": 8374, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8279612418429899, + "step": 8374, + "train/total_loss": 0.10835003852844238 + }, + { + "entropy": 9.23279857635498, + "epoch": 0.8280601146925054, + "mean_token_accuracy": 0.7651376128196716, + "num_tokens": 22781854.0, + "step": 8375, + "train/ce_loss": 0.7307769656181335 + }, + { + "epoch": 0.8280601146925054, + "step": 8375, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8280601146925054, + "step": 8375, + "train/total_loss": 0.11214020103216171 + }, + { + "entropy": 8.741460800170898, + "epoch": 0.828158987542021, + "mean_token_accuracy": 0.7210884094238281, + "num_tokens": 22787014.0, + "step": 8376, + "train/ce_loss": 0.9084756374359131 + }, + { + "epoch": 0.828158987542021, + "step": 8376, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.828158987542021, + "step": 8376, + "train/total_loss": 0.14944131672382355 + }, + { + "entropy": 8.50411605834961, + "epoch": 0.8282578603915365, + "mean_token_accuracy": 0.7203065156936646, + "num_tokens": 22792294.0, + "step": 8377, + "train/ce_loss": 1.2492197751998901 + }, + { + "epoch": 0.8282578603915365, + "step": 8377, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.8282578603915365, + "step": 8377, + "train/total_loss": 0.210859477519989 + }, + { + "entropy": 8.712186813354492, + "epoch": 0.828356733241052, + "mean_token_accuracy": 0.7301587462425232, + "num_tokens": 22797685.0, + "step": 8378, + "train/ce_loss": 0.7416990995407104 + }, + { + "epoch": 0.828356733241052, + "step": 8378, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.828356733241052, + "step": 8378, + "train/total_loss": 0.12495116144418716 + }, + { + "entropy": 8.593022346496582, + "epoch": 0.8284556060905676, + "mean_token_accuracy": 0.7089552283287048, + "num_tokens": 22802988.0, + "step": 8379, + "train/ce_loss": 0.6530058979988098 + }, + { + "epoch": 0.8284556060905676, + "step": 8379, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.8284556060905676, + "step": 8379, + "train/total_loss": 0.0848318412899971 + }, + { + "epoch": 0.828554478940083, + "grad_norm": 0.8061201572418213, + "learning_rate": 7.930821342036296e-06, + "loss": 0.1408, + "step": 8380 + }, + { + "entropy": 8.493386268615723, + "epoch": 0.828554478940083, + "mean_token_accuracy": 0.7535545229911804, + "num_tokens": 22808315.0, + "step": 8380, + "train/ce_loss": 3.5311497867951402e-06 + }, + { + "epoch": 0.828554478940083, + "step": 8380, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.828554478940083, + "step": 8380, + "train/total_loss": 0.035156603902578354 + }, + { + "entropy": 8.784348487854004, + "epoch": 0.8286533517895985, + "mean_token_accuracy": 0.674054741859436, + "num_tokens": 22813511.0, + "step": 8381, + "train/ce_loss": 1.5281237363815308 + }, + { + "epoch": 0.8286533517895985, + "step": 8381, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.8286533517895985, + "step": 8381, + "train/total_loss": 0.24656237661838531 + }, + { + "entropy": 8.662527084350586, + "epoch": 0.8287522246391141, + "mean_token_accuracy": 0.7210144996643066, + "num_tokens": 22818804.0, + "step": 8382, + "train/ce_loss": 0.9179139137268066 + }, + { + "epoch": 0.8287522246391141, + "step": 8382, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8287522246391141, + "step": 8382, + "train/total_loss": 0.15819764137268066 + }, + { + "entropy": 9.229565620422363, + "epoch": 0.8288510974886296, + "mean_token_accuracy": 0.7356687784194946, + "num_tokens": 22823921.0, + "step": 8383, + "train/ce_loss": 0.8467808365821838 + }, + { + "epoch": 0.8288510974886296, + "step": 8383, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8288510974886296, + "step": 8383, + "train/total_loss": 0.14717808365821838 + }, + { + "entropy": 9.11532974243164, + "epoch": 0.8289499703381451, + "mean_token_accuracy": 0.7118353247642517, + "num_tokens": 22828969.0, + "step": 8384, + "train/ce_loss": 3.157426118850708 + }, + { + "epoch": 0.8289499703381451, + "step": 8384, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8289499703381451, + "step": 8384, + "train/total_loss": 0.3704301118850708 + }, + { + "entropy": 8.697054862976074, + "epoch": 0.8290488431876607, + "mean_token_accuracy": 0.736540675163269, + "num_tokens": 22834322.0, + "step": 8385, + "train/ce_loss": 0.3840124309062958 + }, + { + "epoch": 0.8290488431876607, + "step": 8385, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8290488431876607, + "step": 8385, + "train/total_loss": 0.08136999607086182 + }, + { + "entropy": 8.476293563842773, + "epoch": 0.8291477160371762, + "mean_token_accuracy": 0.7830578684806824, + "num_tokens": 22839780.0, + "step": 8386, + "train/ce_loss": 0.36207708716392517 + }, + { + "epoch": 0.8291477160371762, + "step": 8386, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8291477160371762, + "step": 8386, + "train/total_loss": 0.05183270946145058 + }, + { + "entropy": 8.90697956085205, + "epoch": 0.8292465888866917, + "mean_token_accuracy": 0.705633819103241, + "num_tokens": 22844896.0, + "step": 8387, + "train/ce_loss": 1.3907127380371094 + }, + { + "epoch": 0.8292465888866917, + "step": 8387, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8292465888866917, + "step": 8387, + "train/total_loss": 0.1937587708234787 + }, + { + "entropy": 9.319096565246582, + "epoch": 0.8293454617362073, + "mean_token_accuracy": 0.69140625, + "num_tokens": 22849833.0, + "step": 8388, + "train/ce_loss": 7.63372918299865e-06 + }, + { + "epoch": 0.8293454617362073, + "step": 8388, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8293454617362073, + "step": 8388, + "train/total_loss": 0.015625763684511185 + }, + { + "entropy": 9.05936050415039, + "epoch": 0.8294443345857228, + "mean_token_accuracy": 0.6970198750495911, + "num_tokens": 22854904.0, + "step": 8389, + "train/ce_loss": 1.126064419746399 + }, + { + "epoch": 0.8294443345857228, + "step": 8389, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8294443345857228, + "step": 8389, + "train/total_loss": 0.17510643601417542 + }, + { + "entropy": 9.503225326538086, + "epoch": 0.8295432074352382, + "mean_token_accuracy": 0.7247706651687622, + "num_tokens": 22859662.0, + "step": 8390, + "train/ce_loss": 1.6506280644534854e-06 + }, + { + "epoch": 0.8295432074352382, + "step": 8390, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8295432074352382, + "step": 8390, + "train/total_loss": 0.03906266391277313 + }, + { + "entropy": 8.62103271484375, + "epoch": 0.8296420802847538, + "mean_token_accuracy": 0.7653429508209229, + "num_tokens": 22864963.0, + "step": 8391, + "train/ce_loss": 0.7772435545921326 + }, + { + "epoch": 0.8296420802847538, + "step": 8391, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8296420802847538, + "step": 8391, + "train/total_loss": 0.1363181173801422 + }, + { + "entropy": 8.943082809448242, + "epoch": 0.8297409531342693, + "mean_token_accuracy": 0.6905370950698853, + "num_tokens": 22870227.0, + "step": 8392, + "train/ce_loss": 1.8764548301696777 + }, + { + "epoch": 0.8297409531342693, + "step": 8392, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.8297409531342693, + "step": 8392, + "train/total_loss": 0.30092674493789673 + }, + { + "entropy": 9.290210723876953, + "epoch": 0.8298398259837848, + "mean_token_accuracy": 0.7767857313156128, + "num_tokens": 22875191.0, + "step": 8393, + "train/ce_loss": 1.4096912145614624 + }, + { + "epoch": 0.8298398259837848, + "step": 8393, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8298398259837848, + "step": 8393, + "train/total_loss": 0.19175037741661072 + }, + { + "entropy": 9.422948837280273, + "epoch": 0.8299386988333004, + "mean_token_accuracy": 0.7759036421775818, + "num_tokens": 22880006.0, + "step": 8394, + "train/ce_loss": 3.2801135603222065e-06 + }, + { + "epoch": 0.8299386988333004, + "step": 8394, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.8299386988333004, + "step": 8394, + "train/total_loss": 0.019531577825546265 + }, + { + "entropy": 8.709457397460938, + "epoch": 0.8300375716828159, + "mean_token_accuracy": 0.6770833134651184, + "num_tokens": 22885445.0, + "step": 8395, + "train/ce_loss": 1.0227208137512207 + }, + { + "epoch": 0.8300375716828159, + "step": 8395, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.8300375716828159, + "step": 8395, + "train/total_loss": 0.18430334329605103 + }, + { + "entropy": 8.994638442993164, + "epoch": 0.8301364445323314, + "mean_token_accuracy": 0.7828842997550964, + "num_tokens": 22890524.0, + "step": 8396, + "train/ce_loss": 1.1766117811203003 + }, + { + "epoch": 0.8301364445323314, + "step": 8396, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8301364445323314, + "step": 8396, + "train/total_loss": 0.17234867811203003 + }, + { + "entropy": 8.95175552368164, + "epoch": 0.830235317381847, + "mean_token_accuracy": 0.7016011476516724, + "num_tokens": 22895628.0, + "step": 8397, + "train/ce_loss": 1.218043565750122 + }, + { + "epoch": 0.830235317381847, + "step": 8397, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.830235317381847, + "step": 8397, + "train/total_loss": 0.1764918565750122 + }, + { + "entropy": 8.634076118469238, + "epoch": 0.8303341902313625, + "mean_token_accuracy": 0.7063106894493103, + "num_tokens": 22900889.0, + "step": 8398, + "train/ce_loss": 0.709955096244812 + }, + { + "epoch": 0.8303341902313625, + "step": 8398, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8303341902313625, + "step": 8398, + "train/total_loss": 0.1061517596244812 + }, + { + "entropy": 8.409835815429688, + "epoch": 0.8304330630808779, + "mean_token_accuracy": 0.7674418687820435, + "num_tokens": 22906310.0, + "step": 8399, + "train/ce_loss": 0.748572826385498 + }, + { + "epoch": 0.8304330630808779, + "step": 8399, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8304330630808779, + "step": 8399, + "train/total_loss": 0.13735729455947876 + }, + { + "epoch": 0.8305319359303935, + "grad_norm": 0.5774850845336914, + "learning_rate": 7.925876477278347e-06, + "loss": 0.1424, + "step": 8400 + }, + { + "entropy": 9.510812759399414, + "epoch": 0.8305319359303935, + "mean_token_accuracy": 0.8356807231903076, + "num_tokens": 22911129.0, + "step": 8400, + "train/ce_loss": 1.0387249176346813e-06 + }, + { + "epoch": 0.8305319359303935, + "step": 8400, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8305319359303935, + "step": 8400, + "train/total_loss": 0.015625104308128357 + }, + { + "entropy": 8.4102783203125, + "epoch": 0.830630808779909, + "mean_token_accuracy": 0.7367829084396362, + "num_tokens": 22916518.0, + "step": 8401, + "train/ce_loss": 0.835471510887146 + }, + { + "epoch": 0.830630808779909, + "step": 8401, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.830630808779909, + "step": 8401, + "train/total_loss": 0.14995339512825012 + }, + { + "entropy": 8.879125595092773, + "epoch": 0.8307296816294245, + "mean_token_accuracy": 0.7929373979568481, + "num_tokens": 22921647.0, + "step": 8402, + "train/ce_loss": 0.6748191118240356 + }, + { + "epoch": 0.8307296816294245, + "step": 8402, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8307296816294245, + "step": 8402, + "train/total_loss": 0.09091941267251968 + }, + { + "entropy": 9.066545486450195, + "epoch": 0.8308285544789401, + "mean_token_accuracy": 0.7420634627342224, + "num_tokens": 22926624.0, + "step": 8403, + "train/ce_loss": 0.577014148235321 + }, + { + "epoch": 0.8308285544789401, + "step": 8403, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8308285544789401, + "step": 8403, + "train/total_loss": 0.08895141631364822 + }, + { + "entropy": 8.172971725463867, + "epoch": 0.8309274273284556, + "mean_token_accuracy": 0.7911571264266968, + "num_tokens": 22932192.0, + "step": 8404, + "train/ce_loss": 1.1235606670379639 + }, + { + "epoch": 0.8309274273284556, + "step": 8404, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.8309274273284556, + "step": 8404, + "train/total_loss": 0.1865748167037964 + }, + { + "entropy": 8.688688278198242, + "epoch": 0.8310263001779711, + "mean_token_accuracy": 0.7402597665786743, + "num_tokens": 22937426.0, + "step": 8405, + "train/ce_loss": 0.8759625554084778 + }, + { + "epoch": 0.8310263001779711, + "step": 8405, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8310263001779711, + "step": 8405, + "train/total_loss": 0.11103376001119614 + }, + { + "entropy": 8.854475021362305, + "epoch": 0.8311251730274867, + "mean_token_accuracy": 0.7823033928871155, + "num_tokens": 22942586.0, + "step": 8406, + "train/ce_loss": 0.6007594466209412 + }, + { + "epoch": 0.8311251730274867, + "step": 8406, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8311251730274867, + "step": 8406, + "train/total_loss": 0.11085719615221024 + }, + { + "entropy": 8.476692199707031, + "epoch": 0.8312240458770022, + "mean_token_accuracy": 0.7699680328369141, + "num_tokens": 22948003.0, + "step": 8407, + "train/ce_loss": 0.741219162940979 + }, + { + "epoch": 0.8312240458770022, + "step": 8407, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8312240458770022, + "step": 8407, + "train/total_loss": 0.08974691480398178 + }, + { + "entropy": 8.197213172912598, + "epoch": 0.8313229187265176, + "mean_token_accuracy": 0.7603121399879456, + "num_tokens": 22953378.0, + "step": 8408, + "train/ce_loss": 0.6864122748374939 + }, + { + "epoch": 0.8313229187265176, + "step": 8408, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8313229187265176, + "step": 8408, + "train/total_loss": 0.11160998046398163 + }, + { + "entropy": 8.923828125, + "epoch": 0.8314217915760332, + "mean_token_accuracy": 0.6744548082351685, + "num_tokens": 22958449.0, + "step": 8409, + "train/ce_loss": 1.0419980753795244e-06 + }, + { + "epoch": 0.8314217915760332, + "step": 8409, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8314217915760332, + "step": 8409, + "train/total_loss": 0.04296885430812836 + }, + { + "entropy": 9.024328231811523, + "epoch": 0.8315206644255487, + "mean_token_accuracy": 0.7577413320541382, + "num_tokens": 22963474.0, + "step": 8410, + "train/ce_loss": 0.8398553729057312 + }, + { + "epoch": 0.8315206644255487, + "step": 8410, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8315206644255487, + "step": 8410, + "train/total_loss": 0.10742303729057312 + }, + { + "entropy": 8.459671020507812, + "epoch": 0.8316195372750642, + "mean_token_accuracy": 0.6199377179145813, + "num_tokens": 22968874.0, + "step": 8411, + "train/ce_loss": 0.92495197057724 + }, + { + "epoch": 0.8316195372750642, + "step": 8411, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8316195372750642, + "step": 8411, + "train/total_loss": 0.15499520301818848 + }, + { + "entropy": 8.697453498840332, + "epoch": 0.8317184101245798, + "mean_token_accuracy": 0.7323232293128967, + "num_tokens": 22974116.0, + "step": 8412, + "train/ce_loss": 0.7704938650131226 + }, + { + "epoch": 0.8317184101245798, + "step": 8412, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8317184101245798, + "step": 8412, + "train/total_loss": 0.1278306394815445 + }, + { + "entropy": 8.825996398925781, + "epoch": 0.8318172829740953, + "mean_token_accuracy": 0.7680251002311707, + "num_tokens": 22979196.0, + "step": 8413, + "train/ce_loss": 1.0850144624710083 + }, + { + "epoch": 0.8318172829740953, + "step": 8413, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8318172829740953, + "step": 8413, + "train/total_loss": 0.13193894922733307 + }, + { + "entropy": 8.536054611206055, + "epoch": 0.8319161558236108, + "mean_token_accuracy": 0.7847380638122559, + "num_tokens": 22984495.0, + "step": 8414, + "train/ce_loss": 0.5865741968154907 + }, + { + "epoch": 0.8319161558236108, + "step": 8414, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8319161558236108, + "step": 8414, + "train/total_loss": 0.08600117266178131 + }, + { + "entropy": 8.073843002319336, + "epoch": 0.8320150286731264, + "mean_token_accuracy": 0.741847813129425, + "num_tokens": 22990053.0, + "step": 8415, + "train/ce_loss": 0.7983760833740234 + }, + { + "epoch": 0.8320150286731264, + "step": 8415, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8320150286731264, + "step": 8415, + "train/total_loss": 0.1306188702583313 + }, + { + "entropy": 8.630202293395996, + "epoch": 0.8321139015226419, + "mean_token_accuracy": 0.7215189933776855, + "num_tokens": 22995255.0, + "step": 8416, + "train/ce_loss": 0.43194296956062317 + }, + { + "epoch": 0.8321139015226419, + "step": 8416, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8321139015226419, + "step": 8416, + "train/total_loss": 0.10569429397583008 + }, + { + "entropy": 8.924360275268555, + "epoch": 0.8322127743721575, + "mean_token_accuracy": 0.7236024737358093, + "num_tokens": 23000323.0, + "step": 8417, + "train/ce_loss": 1.4973204135894775 + }, + { + "epoch": 0.8322127743721575, + "step": 8417, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8322127743721575, + "step": 8417, + "train/total_loss": 0.21613828837871552 + }, + { + "entropy": 8.600046157836914, + "epoch": 0.832311647221673, + "mean_token_accuracy": 0.7043596506118774, + "num_tokens": 23005480.0, + "step": 8418, + "train/ce_loss": 0.9622127413749695 + }, + { + "epoch": 0.832311647221673, + "step": 8418, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.832311647221673, + "step": 8418, + "train/total_loss": 0.17825251817703247 + }, + { + "entropy": 8.23831558227539, + "epoch": 0.8324105200711884, + "mean_token_accuracy": 0.7659574747085571, + "num_tokens": 23011068.0, + "step": 8419, + "train/ce_loss": 1.023498296737671 + }, + { + "epoch": 0.8324105200711884, + "step": 8419, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8324105200711884, + "step": 8419, + "train/total_loss": 0.13359983265399933 + }, + { + "epoch": 0.832509392920704, + "grad_norm": 0.5148611664772034, + "learning_rate": 7.920931612520399e-06, + "loss": 0.1285, + "step": 8420 + }, + { + "entropy": 8.815807342529297, + "epoch": 0.832509392920704, + "mean_token_accuracy": 0.793795645236969, + "num_tokens": 23016096.0, + "step": 8420, + "train/ce_loss": 0.9615424871444702 + }, + { + "epoch": 0.832509392920704, + "step": 8420, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.832509392920704, + "step": 8420, + "train/total_loss": 0.14302924275398254 + }, + { + "entropy": 9.1303129196167, + "epoch": 0.8326082657702195, + "mean_token_accuracy": 0.7280858755111694, + "num_tokens": 23021063.0, + "step": 8421, + "train/ce_loss": 0.7835027575492859 + }, + { + "epoch": 0.8326082657702195, + "step": 8421, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8326082657702195, + "step": 8421, + "train/total_loss": 0.10178777575492859 + }, + { + "entropy": 8.770580291748047, + "epoch": 0.832707138619735, + "mean_token_accuracy": 0.7217742204666138, + "num_tokens": 23026414.0, + "step": 8422, + "train/ce_loss": 0.8574286103248596 + }, + { + "epoch": 0.832707138619735, + "step": 8422, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.832707138619735, + "step": 8422, + "train/total_loss": 0.12480536103248596 + }, + { + "entropy": 8.346721649169922, + "epoch": 0.8328060114692506, + "mean_token_accuracy": 0.7166344523429871, + "num_tokens": 23032154.0, + "step": 8423, + "train/ce_loss": 0.6669735908508301 + }, + { + "epoch": 0.8328060114692506, + "step": 8423, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.8328060114692506, + "step": 8423, + "train/total_loss": 0.144822359085083 + }, + { + "entropy": 8.428167343139648, + "epoch": 0.8329048843187661, + "mean_token_accuracy": 0.6963037252426147, + "num_tokens": 23037691.0, + "step": 8424, + "train/ce_loss": 1.1446759700775146 + }, + { + "epoch": 0.8329048843187661, + "step": 8424, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.8329048843187661, + "step": 8424, + "train/total_loss": 0.192592591047287 + }, + { + "entropy": 9.567185401916504, + "epoch": 0.8330037571682816, + "mean_token_accuracy": 0.7131147384643555, + "num_tokens": 23042447.0, + "step": 8425, + "train/ce_loss": 1.6128241213664296e-06 + }, + { + "epoch": 0.8330037571682816, + "step": 8425, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8330037571682816, + "step": 8425, + "train/total_loss": 0.039062660187482834 + }, + { + "entropy": 8.869767189025879, + "epoch": 0.8331026300177972, + "mean_token_accuracy": 0.757377028465271, + "num_tokens": 23047558.0, + "step": 8426, + "train/ce_loss": 0.9084510803222656 + }, + { + "epoch": 0.8331026300177972, + "step": 8426, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8331026300177972, + "step": 8426, + "train/total_loss": 0.14162635803222656 + }, + { + "entropy": 8.885663986206055, + "epoch": 0.8332015028673126, + "mean_token_accuracy": 0.7267002463340759, + "num_tokens": 23052820.0, + "step": 8427, + "train/ce_loss": 0.9693460464477539 + }, + { + "epoch": 0.8332015028673126, + "step": 8427, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8332015028673126, + "step": 8427, + "train/total_loss": 0.12818461656570435 + }, + { + "entropy": 8.810038566589355, + "epoch": 0.8333003757168281, + "mean_token_accuracy": 0.779724657535553, + "num_tokens": 23058062.0, + "step": 8428, + "train/ce_loss": 0.5381803512573242 + }, + { + "epoch": 0.8333003757168281, + "step": 8428, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8333003757168281, + "step": 8428, + "train/total_loss": 0.08116178214550018 + }, + { + "entropy": 9.27984619140625, + "epoch": 0.8333992485663437, + "mean_token_accuracy": 0.694200336933136, + "num_tokens": 23063100.0, + "step": 8429, + "train/ce_loss": 2.27346134185791 + }, + { + "epoch": 0.8333992485663437, + "step": 8429, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.8333992485663437, + "step": 8429, + "train/total_loss": 0.33281487226486206 + }, + { + "entropy": 9.171561241149902, + "epoch": 0.8334981214158592, + "mean_token_accuracy": 0.7915254235267639, + "num_tokens": 23068114.0, + "step": 8430, + "train/ce_loss": 1.2283937849133508e-06 + }, + { + "epoch": 0.8334981214158592, + "step": 8430, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8334981214158592, + "step": 8430, + "train/total_loss": 0.04687512293457985 + }, + { + "entropy": 8.703938484191895, + "epoch": 0.8335969942653747, + "mean_token_accuracy": 0.7080045342445374, + "num_tokens": 23073502.0, + "step": 8431, + "train/ce_loss": 1.0218861103057861 + }, + { + "epoch": 0.8335969942653747, + "step": 8431, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.8335969942653747, + "step": 8431, + "train/total_loss": 0.1803136169910431 + }, + { + "entropy": 8.560479164123535, + "epoch": 0.8336958671148903, + "mean_token_accuracy": 0.7452300786972046, + "num_tokens": 23078827.0, + "step": 8432, + "train/ce_loss": 0.6318869590759277 + }, + { + "epoch": 0.8336958671148903, + "step": 8432, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8336958671148903, + "step": 8432, + "train/total_loss": 0.11006369441747665 + }, + { + "entropy": 8.864185333251953, + "epoch": 0.8337947399644058, + "mean_token_accuracy": 0.7435897588729858, + "num_tokens": 23083971.0, + "step": 8433, + "train/ce_loss": 1.4856287240982056 + }, + { + "epoch": 0.8337947399644058, + "step": 8433, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8337947399644058, + "step": 8433, + "train/total_loss": 0.19934412837028503 + }, + { + "entropy": 8.569899559020996, + "epoch": 0.8338936128139213, + "mean_token_accuracy": 0.7508690357208252, + "num_tokens": 23089307.0, + "step": 8434, + "train/ce_loss": 0.5698906183242798 + }, + { + "epoch": 0.8338936128139213, + "step": 8434, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8338936128139213, + "step": 8434, + "train/total_loss": 0.10386405885219574 + }, + { + "entropy": 8.418647766113281, + "epoch": 0.8339924856634369, + "mean_token_accuracy": 0.7304609417915344, + "num_tokens": 23094802.0, + "step": 8435, + "train/ce_loss": 0.8897053599357605 + }, + { + "epoch": 0.8339924856634369, + "step": 8435, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8339924856634369, + "step": 8435, + "train/total_loss": 0.14365804195404053 + }, + { + "entropy": 8.659425735473633, + "epoch": 0.8340913585129524, + "mean_token_accuracy": 0.78125, + "num_tokens": 23100106.0, + "step": 8436, + "train/ce_loss": 0.287148654460907 + }, + { + "epoch": 0.8340913585129524, + "step": 8436, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8340913585129524, + "step": 8436, + "train/total_loss": 0.0521523654460907 + }, + { + "entropy": 8.957656860351562, + "epoch": 0.8341902313624678, + "mean_token_accuracy": 0.8513761758804321, + "num_tokens": 23105083.0, + "step": 8437, + "train/ce_loss": 0.9667260050773621 + }, + { + "epoch": 0.8341902313624678, + "step": 8437, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8341902313624678, + "step": 8437, + "train/total_loss": 0.11229760199785233 + }, + { + "entropy": 8.240507125854492, + "epoch": 0.8342891042119834, + "mean_token_accuracy": 0.772009015083313, + "num_tokens": 23110549.0, + "step": 8438, + "train/ce_loss": 0.5577598214149475 + }, + { + "epoch": 0.8342891042119834, + "step": 8438, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8342891042119834, + "step": 8438, + "train/total_loss": 0.09483848512172699 + }, + { + "entropy": 8.941741943359375, + "epoch": 0.8343879770614989, + "mean_token_accuracy": 0.7020057439804077, + "num_tokens": 23115686.0, + "step": 8439, + "train/ce_loss": 1.318642258644104 + }, + { + "epoch": 0.8343879770614989, + "step": 8439, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.8343879770614989, + "step": 8439, + "train/total_loss": 0.21389548480510712 + }, + { + "epoch": 0.8344868499110144, + "grad_norm": 0.7413841485977173, + "learning_rate": 7.91598674776245e-06, + "loss": 0.1269, + "step": 8440 + }, + { + "entropy": 9.051006317138672, + "epoch": 0.8344868499110144, + "mean_token_accuracy": 0.6917688250541687, + "num_tokens": 23120688.0, + "step": 8440, + "train/ce_loss": 1.116180181503296 + }, + { + "epoch": 0.8344868499110144, + "step": 8440, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8344868499110144, + "step": 8440, + "train/total_loss": 0.1350555121898651 + }, + { + "entropy": 8.562871932983398, + "epoch": 0.83458572276053, + "mean_token_accuracy": 0.751091718673706, + "num_tokens": 23125839.0, + "step": 8441, + "train/ce_loss": 1.2514069080352783 + }, + { + "epoch": 0.83458572276053, + "step": 8441, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.83458572276053, + "step": 8441, + "train/total_loss": 0.1798281967639923 + }, + { + "entropy": 8.867137908935547, + "epoch": 0.8346845956100455, + "mean_token_accuracy": 0.7886075973510742, + "num_tokens": 23131065.0, + "step": 8442, + "train/ce_loss": 0.664675772190094 + }, + { + "epoch": 0.8346845956100455, + "step": 8442, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8346845956100455, + "step": 8442, + "train/total_loss": 0.12115507572889328 + }, + { + "entropy": 8.775603294372559, + "epoch": 0.834783468459561, + "mean_token_accuracy": 0.7020602226257324, + "num_tokens": 23136150.0, + "step": 8443, + "train/ce_loss": 0.9842575788497925 + }, + { + "epoch": 0.834783468459561, + "step": 8443, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.834783468459561, + "step": 8443, + "train/total_loss": 0.1296757608652115 + }, + { + "entropy": 8.78519344329834, + "epoch": 0.8348823413090766, + "mean_token_accuracy": 0.6872811913490295, + "num_tokens": 23141520.0, + "step": 8444, + "train/ce_loss": 1.3550491333007812 + }, + { + "epoch": 0.8348823413090766, + "step": 8444, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.8348823413090766, + "step": 8444, + "train/total_loss": 0.22925491631031036 + }, + { + "entropy": 8.138087272644043, + "epoch": 0.8349812141585921, + "mean_token_accuracy": 0.7243589758872986, + "num_tokens": 23146908.0, + "step": 8445, + "train/ce_loss": 0.7495837211608887 + }, + { + "epoch": 0.8349812141585921, + "step": 8445, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8349812141585921, + "step": 8445, + "train/total_loss": 0.13745838403701782 + }, + { + "entropy": 8.91369342803955, + "epoch": 0.8350800870081075, + "mean_token_accuracy": 0.7582781314849854, + "num_tokens": 23151983.0, + "step": 8446, + "train/ce_loss": 0.5326682925224304 + }, + { + "epoch": 0.8350800870081075, + "step": 8446, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8350800870081075, + "step": 8446, + "train/total_loss": 0.08451683074235916 + }, + { + "entropy": 8.554800033569336, + "epoch": 0.8351789598576231, + "mean_token_accuracy": 0.7573529481887817, + "num_tokens": 23157260.0, + "step": 8447, + "train/ce_loss": 0.7137629389762878 + }, + { + "epoch": 0.8351789598576231, + "step": 8447, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8351789598576231, + "step": 8447, + "train/total_loss": 0.13387629389762878 + }, + { + "entropy": 9.390222549438477, + "epoch": 0.8352778327071386, + "mean_token_accuracy": 0.742222249507904, + "num_tokens": 23162099.0, + "step": 8448, + "train/ce_loss": 4.685093244916061e-06 + }, + { + "epoch": 0.8352778327071386, + "step": 8448, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8352778327071386, + "step": 8448, + "train/total_loss": 0.050781719386577606 + }, + { + "entropy": 8.345687866210938, + "epoch": 0.8353767055566541, + "mean_token_accuracy": 0.7475345134735107, + "num_tokens": 23167566.0, + "step": 8449, + "train/ce_loss": 0.34656718373298645 + }, + { + "epoch": 0.8353767055566541, + "step": 8449, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8353767055566541, + "step": 8449, + "train/total_loss": 0.050281718373298645 + }, + { + "entropy": 9.12002182006836, + "epoch": 0.8354755784061697, + "mean_token_accuracy": 0.7476190328598022, + "num_tokens": 23172605.0, + "step": 8450, + "train/ce_loss": 1.1007190942764282 + }, + { + "epoch": 0.8354755784061697, + "step": 8450, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8354755784061697, + "step": 8450, + "train/total_loss": 0.13741566240787506 + }, + { + "entropy": 9.263957977294922, + "epoch": 0.8355744512556852, + "mean_token_accuracy": 0.7510040402412415, + "num_tokens": 23177672.0, + "step": 8451, + "train/ce_loss": 2.7391645289753797e-06 + }, + { + "epoch": 0.8355744512556852, + "step": 8451, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8355744512556852, + "step": 8451, + "train/total_loss": 0.035156525671482086 + }, + { + "entropy": 8.432599067687988, + "epoch": 0.8356733241052007, + "mean_token_accuracy": 0.7297297120094299, + "num_tokens": 23183068.0, + "step": 8452, + "train/ce_loss": 0.8153901100158691 + }, + { + "epoch": 0.8356733241052007, + "step": 8452, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8356733241052007, + "step": 8452, + "train/total_loss": 0.09716401249170303 + }, + { + "entropy": 8.373237609863281, + "epoch": 0.8357721969547163, + "mean_token_accuracy": 0.7285407781600952, + "num_tokens": 23188483.0, + "step": 8453, + "train/ce_loss": 1.45299232006073 + }, + { + "epoch": 0.8357721969547163, + "step": 8453, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8357721969547163, + "step": 8453, + "train/total_loss": 0.20779924094676971 + }, + { + "entropy": 8.922636032104492, + "epoch": 0.8358710698042318, + "mean_token_accuracy": 0.7309644818305969, + "num_tokens": 23193585.0, + "step": 8454, + "train/ce_loss": 0.8643047213554382 + }, + { + "epoch": 0.8358710698042318, + "step": 8454, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8358710698042318, + "step": 8454, + "train/total_loss": 0.15674297511577606 + }, + { + "entropy": 8.581033706665039, + "epoch": 0.8359699426537472, + "mean_token_accuracy": 0.761904776096344, + "num_tokens": 23198944.0, + "step": 8455, + "train/ce_loss": 0.6584579944610596 + }, + { + "epoch": 0.8359699426537472, + "step": 8455, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8359699426537472, + "step": 8455, + "train/total_loss": 0.1088145524263382 + }, + { + "entropy": 8.840801239013672, + "epoch": 0.8360688155032628, + "mean_token_accuracy": 0.754054069519043, + "num_tokens": 23204141.0, + "step": 8456, + "train/ce_loss": 0.6707462072372437 + }, + { + "epoch": 0.8360688155032628, + "step": 8456, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8360688155032628, + "step": 8456, + "train/total_loss": 0.13738712668418884 + }, + { + "entropy": 9.381414413452148, + "epoch": 0.8361676883527783, + "mean_token_accuracy": 0.8161616325378418, + "num_tokens": 23209045.0, + "step": 8457, + "train/ce_loss": 6.694864396195044e-07 + }, + { + "epoch": 0.8361676883527783, + "step": 8457, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8361676883527783, + "step": 8457, + "train/total_loss": 0.015625067055225372 + }, + { + "entropy": 8.514923095703125, + "epoch": 0.8362665612022938, + "mean_token_accuracy": 0.7355371713638306, + "num_tokens": 23214327.0, + "step": 8458, + "train/ce_loss": 1.0225625038146973 + }, + { + "epoch": 0.8362665612022938, + "step": 8458, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8362665612022938, + "step": 8458, + "train/total_loss": 0.15694375336170197 + }, + { + "entropy": 8.720748901367188, + "epoch": 0.8363654340518094, + "mean_token_accuracy": 0.7418967485427856, + "num_tokens": 23219806.0, + "step": 8459, + "train/ce_loss": 1.3102567195892334 + }, + { + "epoch": 0.8363654340518094, + "step": 8459, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.8363654340518094, + "step": 8459, + "train/total_loss": 0.21696317195892334 + }, + { + "epoch": 0.8364643069013249, + "grad_norm": 0.6300832033157349, + "learning_rate": 7.9110418830045e-06, + "loss": 0.1355, + "step": 8460 + }, + { + "entropy": 8.613868713378906, + "epoch": 0.8364643069013249, + "mean_token_accuracy": 0.7577142715454102, + "num_tokens": 23225144.0, + "step": 8460, + "train/ce_loss": 0.8855839967727661 + }, + { + "epoch": 0.8364643069013249, + "step": 8460, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8364643069013249, + "step": 8460, + "train/total_loss": 0.12371464818716049 + }, + { + "entropy": 8.367238998413086, + "epoch": 0.8365631797508404, + "mean_token_accuracy": 0.7315436005592346, + "num_tokens": 23230505.0, + "step": 8461, + "train/ce_loss": 1.0768791437149048 + }, + { + "epoch": 0.8365631797508404, + "step": 8461, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8365631797508404, + "step": 8461, + "train/total_loss": 0.17800042033195496 + }, + { + "entropy": 8.522979736328125, + "epoch": 0.836662052600356, + "mean_token_accuracy": 0.7502837777137756, + "num_tokens": 23235848.0, + "step": 8462, + "train/ce_loss": 0.714676022529602 + }, + { + "epoch": 0.836662052600356, + "step": 8462, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.836662052600356, + "step": 8462, + "train/total_loss": 0.13006135821342468 + }, + { + "entropy": 9.270840644836426, + "epoch": 0.8367609254498715, + "mean_token_accuracy": 0.762135922908783, + "num_tokens": 23240680.0, + "step": 8463, + "train/ce_loss": 2.330828692720388e-06 + }, + { + "epoch": 0.8367609254498715, + "step": 8463, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8367609254498715, + "step": 8463, + "train/total_loss": 0.0312502346932888 + }, + { + "entropy": 9.12474536895752, + "epoch": 0.836859798299387, + "mean_token_accuracy": 0.760869562625885, + "num_tokens": 23245576.0, + "step": 8464, + "train/ce_loss": 0.8324379324913025 + }, + { + "epoch": 0.836859798299387, + "step": 8464, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.836859798299387, + "step": 8464, + "train/total_loss": 0.12621253728866577 + }, + { + "entropy": 8.247017860412598, + "epoch": 0.8369586711489025, + "mean_token_accuracy": 0.7172264456748962, + "num_tokens": 23250968.0, + "step": 8465, + "train/ce_loss": 1.229318618774414 + }, + { + "epoch": 0.8369586711489025, + "step": 8465, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8369586711489025, + "step": 8465, + "train/total_loss": 0.19324436783790588 + }, + { + "entropy": 9.369348526000977, + "epoch": 0.837057543998418, + "mean_token_accuracy": 0.7494033575057983, + "num_tokens": 23255798.0, + "step": 8466, + "train/ce_loss": 1.835410237312317 + }, + { + "epoch": 0.837057543998418, + "step": 8466, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.837057543998418, + "step": 8466, + "train/total_loss": 0.26557227969169617 + }, + { + "entropy": 8.333223342895508, + "epoch": 0.8371564168479335, + "mean_token_accuracy": 0.7402885556221008, + "num_tokens": 23261203.0, + "step": 8467, + "train/ce_loss": 0.9438441395759583 + }, + { + "epoch": 0.8371564168479335, + "step": 8467, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8371564168479335, + "step": 8467, + "train/total_loss": 0.15297816693782806 + }, + { + "entropy": 8.755775451660156, + "epoch": 0.8372552896974491, + "mean_token_accuracy": 0.7366504669189453, + "num_tokens": 23266466.0, + "step": 8468, + "train/ce_loss": 0.7262039184570312 + }, + { + "epoch": 0.8372552896974491, + "step": 8468, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8372552896974491, + "step": 8468, + "train/total_loss": 0.11949539184570312 + }, + { + "entropy": 9.043832778930664, + "epoch": 0.8373541625469646, + "mean_token_accuracy": 0.7504159808158875, + "num_tokens": 23271467.0, + "step": 8469, + "train/ce_loss": 1.2775529623031616 + }, + { + "epoch": 0.8373541625469646, + "step": 8469, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8373541625469646, + "step": 8469, + "train/total_loss": 0.1668177992105484 + }, + { + "entropy": 9.056379318237305, + "epoch": 0.8374530353964801, + "mean_token_accuracy": 0.7526132464408875, + "num_tokens": 23276509.0, + "step": 8470, + "train/ce_loss": 0.983027458190918 + }, + { + "epoch": 0.8374530353964801, + "step": 8470, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.8374530353964801, + "step": 8470, + "train/total_loss": 0.17252150177955627 + }, + { + "entropy": 9.1365385055542, + "epoch": 0.8375519082459957, + "mean_token_accuracy": 0.7578008770942688, + "num_tokens": 23281638.0, + "step": 8471, + "train/ce_loss": 7.340035494962649e-07 + }, + { + "epoch": 0.8375519082459957, + "step": 8471, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8375519082459957, + "step": 8471, + "train/total_loss": 0.03125007450580597 + }, + { + "entropy": 8.997591018676758, + "epoch": 0.8376507810955112, + "mean_token_accuracy": 0.7923728823661804, + "num_tokens": 23286777.0, + "step": 8472, + "train/ce_loss": 0.713519275188446 + }, + { + "epoch": 0.8376507810955112, + "step": 8472, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8376507810955112, + "step": 8472, + "train/total_loss": 0.08697693049907684 + }, + { + "entropy": 8.927431106567383, + "epoch": 0.8377496539450267, + "mean_token_accuracy": 0.6694214940071106, + "num_tokens": 23291899.0, + "step": 8473, + "train/ce_loss": 1.389837384223938 + }, + { + "epoch": 0.8377496539450267, + "step": 8473, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8377496539450267, + "step": 8473, + "train/total_loss": 0.19367124140262604 + }, + { + "entropy": 8.895759582519531, + "epoch": 0.8378485267945422, + "mean_token_accuracy": 0.718068540096283, + "num_tokens": 23297049.0, + "step": 8474, + "train/ce_loss": 0.7009792327880859 + }, + { + "epoch": 0.8378485267945422, + "step": 8474, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8378485267945422, + "step": 8474, + "train/total_loss": 0.0935354232788086 + }, + { + "entropy": 9.136212348937988, + "epoch": 0.8379473996440577, + "mean_token_accuracy": 0.7834862470626831, + "num_tokens": 23302024.0, + "step": 8475, + "train/ce_loss": 0.6531566381454468 + }, + { + "epoch": 0.8379473996440577, + "step": 8475, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8379473996440577, + "step": 8475, + "train/total_loss": 0.12781566381454468 + }, + { + "entropy": 8.840536117553711, + "epoch": 0.8380462724935732, + "mean_token_accuracy": 0.7091906666755676, + "num_tokens": 23307208.0, + "step": 8476, + "train/ce_loss": 1.3356698751449585 + }, + { + "epoch": 0.8380462724935732, + "step": 8476, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8380462724935732, + "step": 8476, + "train/total_loss": 0.2038794904947281 + }, + { + "entropy": 8.715561866760254, + "epoch": 0.8381451453430888, + "mean_token_accuracy": 0.7680000066757202, + "num_tokens": 23312434.0, + "step": 8477, + "train/ce_loss": 0.7491929531097412 + }, + { + "epoch": 0.8381451453430888, + "step": 8477, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.8381451453430888, + "step": 8477, + "train/total_loss": 0.08663804829120636 + }, + { + "entropy": 8.269938468933105, + "epoch": 0.8382440181926043, + "mean_token_accuracy": 0.6903846263885498, + "num_tokens": 23317965.0, + "step": 8478, + "train/ce_loss": 0.7817928791046143 + }, + { + "epoch": 0.8382440181926043, + "step": 8478, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8382440181926043, + "step": 8478, + "train/total_loss": 0.14849179983139038 + }, + { + "entropy": 8.730112075805664, + "epoch": 0.8383428910421198, + "mean_token_accuracy": 0.7268588542938232, + "num_tokens": 23323042.0, + "step": 8479, + "train/ce_loss": 1.4868693351745605 + }, + { + "epoch": 0.8383428910421198, + "step": 8479, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8383428910421198, + "step": 8479, + "train/total_loss": 0.18384318053722382 + }, + { + "epoch": 0.8384417638916354, + "grad_norm": 0.7349727749824524, + "learning_rate": 7.90609701824655e-06, + "loss": 0.1321, + "step": 8480 + }, + { + "entropy": 9.112332344055176, + "epoch": 0.8384417638916354, + "mean_token_accuracy": 0.7666666507720947, + "num_tokens": 23328042.0, + "step": 8480, + "train/ce_loss": 1.523605465888977 + }, + { + "epoch": 0.8384417638916354, + "step": 8480, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.8384417638916354, + "step": 8480, + "train/total_loss": 0.23048554360866547 + }, + { + "entropy": 8.693452835083008, + "epoch": 0.8385406367411509, + "mean_token_accuracy": 0.7580246925354004, + "num_tokens": 23333344.0, + "step": 8481, + "train/ce_loss": 0.7240076661109924 + }, + { + "epoch": 0.8385406367411509, + "step": 8481, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8385406367411509, + "step": 8481, + "train/total_loss": 0.1270882785320282 + }, + { + "entropy": 9.14535140991211, + "epoch": 0.8386395095906664, + "mean_token_accuracy": 0.735988199710846, + "num_tokens": 23338414.0, + "step": 8482, + "train/ce_loss": 0.896630585193634 + }, + { + "epoch": 0.8386395095906664, + "step": 8482, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.8386395095906664, + "step": 8482, + "train/total_loss": 0.1013818085193634 + }, + { + "entropy": 8.412368774414062, + "epoch": 0.838738382440182, + "mean_token_accuracy": 0.7041965126991272, + "num_tokens": 23344043.0, + "step": 8483, + "train/ce_loss": 1.0863405466079712 + }, + { + "epoch": 0.838738382440182, + "step": 8483, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.838738382440182, + "step": 8483, + "train/total_loss": 0.16722780466079712 + }, + { + "entropy": 9.090755462646484, + "epoch": 0.8388372552896974, + "mean_token_accuracy": 0.7093425393104553, + "num_tokens": 23349056.0, + "step": 8484, + "train/ce_loss": 1.0335745811462402 + }, + { + "epoch": 0.8388372552896974, + "step": 8484, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8388372552896974, + "step": 8484, + "train/total_loss": 0.1580449640750885 + }, + { + "entropy": 9.019186973571777, + "epoch": 0.8389361281392129, + "mean_token_accuracy": 0.800000011920929, + "num_tokens": 23354014.0, + "step": 8485, + "train/ce_loss": 9.221578238793882e-07 + }, + { + "epoch": 0.8389361281392129, + "step": 8485, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8389361281392129, + "step": 8485, + "train/total_loss": 0.03125009313225746 + }, + { + "entropy": 8.70306396484375, + "epoch": 0.8390350009887285, + "mean_token_accuracy": 0.7714681625366211, + "num_tokens": 23359218.0, + "step": 8486, + "train/ce_loss": 0.4649391770362854 + }, + { + "epoch": 0.8390350009887285, + "step": 8486, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.8390350009887285, + "step": 8486, + "train/total_loss": 0.06602516770362854 + }, + { + "entropy": 8.45088005065918, + "epoch": 0.839133873838244, + "mean_token_accuracy": 0.7324613332748413, + "num_tokens": 23364545.0, + "step": 8487, + "train/ce_loss": 0.860693097114563 + }, + { + "epoch": 0.839133873838244, + "step": 8487, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.839133873838244, + "step": 8487, + "train/total_loss": 0.14856931567192078 + }, + { + "entropy": 8.545808792114258, + "epoch": 0.8392327466877595, + "mean_token_accuracy": 0.7394015192985535, + "num_tokens": 23369802.0, + "step": 8488, + "train/ce_loss": 0.7315399646759033 + }, + { + "epoch": 0.8392327466877595, + "step": 8488, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8392327466877595, + "step": 8488, + "train/total_loss": 0.10049774497747421 + }, + { + "entropy": 8.705459594726562, + "epoch": 0.8393316195372751, + "mean_token_accuracy": 0.7708830833435059, + "num_tokens": 23375068.0, + "step": 8489, + "train/ce_loss": 0.6915591359138489 + }, + { + "epoch": 0.8393316195372751, + "step": 8489, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8393316195372751, + "step": 8489, + "train/total_loss": 0.11603091657161713 + }, + { + "entropy": 8.906850814819336, + "epoch": 0.8394304923867906, + "mean_token_accuracy": 0.7515822649002075, + "num_tokens": 23380162.0, + "step": 8490, + "train/ce_loss": 0.8677210807800293 + }, + { + "epoch": 0.8394304923867906, + "step": 8490, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.8394304923867906, + "step": 8490, + "train/total_loss": 0.1609908640384674 + }, + { + "entropy": 8.584822654724121, + "epoch": 0.8395293652363061, + "mean_token_accuracy": 0.7032257914543152, + "num_tokens": 23385409.0, + "step": 8491, + "train/ce_loss": 1.0996503829956055 + }, + { + "epoch": 0.8395293652363061, + "step": 8491, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8395293652363061, + "step": 8491, + "train/total_loss": 0.18027754127979279 + }, + { + "entropy": 8.949121475219727, + "epoch": 0.8396282380858217, + "mean_token_accuracy": 0.771345853805542, + "num_tokens": 23390561.0, + "step": 8492, + "train/ce_loss": 0.966265857219696 + }, + { + "epoch": 0.8396282380858217, + "step": 8492, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8396282380858217, + "step": 8492, + "train/total_loss": 0.12006408721208572 + }, + { + "entropy": 9.422487258911133, + "epoch": 0.8397271109353371, + "mean_token_accuracy": 0.7461240291595459, + "num_tokens": 23395468.0, + "step": 8493, + "train/ce_loss": 1.1239761114120483 + }, + { + "epoch": 0.8397271109353371, + "step": 8493, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.8397271109353371, + "step": 8493, + "train/total_loss": 0.18661636114120483 + }, + { + "entropy": 8.569074630737305, + "epoch": 0.8398259837848526, + "mean_token_accuracy": 0.7707602381706238, + "num_tokens": 23400763.0, + "step": 8494, + "train/ce_loss": 0.8903215527534485 + }, + { + "epoch": 0.8398259837848526, + "step": 8494, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8398259837848526, + "step": 8494, + "train/total_loss": 0.1359071582555771 + }, + { + "entropy": 9.111557960510254, + "epoch": 0.8399248566343682, + "mean_token_accuracy": 0.7453798651695251, + "num_tokens": 23405671.0, + "step": 8495, + "train/ce_loss": 1.4308420419692993 + }, + { + "epoch": 0.8399248566343682, + "step": 8495, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8399248566343682, + "step": 8495, + "train/total_loss": 0.21339671313762665 + }, + { + "entropy": 8.813003540039062, + "epoch": 0.8400237294838837, + "mean_token_accuracy": 0.801536500453949, + "num_tokens": 23411051.0, + "step": 8496, + "train/ce_loss": 0.3880203366279602 + }, + { + "epoch": 0.8400237294838837, + "step": 8496, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8400237294838837, + "step": 8496, + "train/total_loss": 0.07395828515291214 + }, + { + "entropy": 9.437934875488281, + "epoch": 0.8401226023333992, + "mean_token_accuracy": 0.6849315166473389, + "num_tokens": 23415900.0, + "step": 8497, + "train/ce_loss": 1.8858469724655151 + }, + { + "epoch": 0.8401226023333992, + "step": 8497, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.8401226023333992, + "step": 8497, + "train/total_loss": 0.29795968532562256 + }, + { + "entropy": 8.867807388305664, + "epoch": 0.8402214751829148, + "mean_token_accuracy": 0.7607142925262451, + "num_tokens": 23420951.0, + "step": 8498, + "train/ce_loss": 1.3461689150062739e-06 + }, + { + "epoch": 0.8402214751829148, + "step": 8498, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8402214751829148, + "step": 8498, + "train/total_loss": 0.050781384110450745 + }, + { + "entropy": 8.785367965698242, + "epoch": 0.8403203480324303, + "mean_token_accuracy": 0.7249602675437927, + "num_tokens": 23426042.0, + "step": 8499, + "train/ce_loss": 6.686180995529867e-07 + }, + { + "epoch": 0.8403203480324303, + "step": 8499, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8403203480324303, + "step": 8499, + "train/total_loss": 0.06250006705522537 + }, + { + "epoch": 0.8404192208819459, + "grad_norm": 0.6899054646492004, + "learning_rate": 7.901152153488603e-06, + "loss": 0.1273, + "step": 8500 + }, + { + "entropy": 8.333955764770508, + "epoch": 0.8404192208819459, + "mean_token_accuracy": 0.7285569906234741, + "num_tokens": 23431545.0, + "step": 8500, + "train/ce_loss": 1.2725051641464233 + }, + { + "epoch": 0.8404192208819459, + "step": 8500, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.8404192208819459, + "step": 8500, + "train/total_loss": 0.2053755223751068 + }, + { + "entropy": 8.942061424255371, + "epoch": 0.8405180937314614, + "mean_token_accuracy": 0.7556270360946655, + "num_tokens": 23436600.0, + "step": 8501, + "train/ce_loss": 0.9197985529899597 + }, + { + "epoch": 0.8405180937314614, + "step": 8501, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8405180937314614, + "step": 8501, + "train/total_loss": 0.13885486125946045 + }, + { + "entropy": 9.137856483459473, + "epoch": 0.8406169665809768, + "mean_token_accuracy": 0.8154761791229248, + "num_tokens": 23441542.0, + "step": 8502, + "train/ce_loss": 1.5246752500534058 + }, + { + "epoch": 0.8406169665809768, + "step": 8502, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8406169665809768, + "step": 8502, + "train/total_loss": 0.2032487839460373 + }, + { + "entropy": 8.56517219543457, + "epoch": 0.8407158394304924, + "mean_token_accuracy": 0.7099056839942932, + "num_tokens": 23446869.0, + "step": 8503, + "train/ce_loss": 0.5327627062797546 + }, + { + "epoch": 0.8407158394304924, + "step": 8503, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8407158394304924, + "step": 8503, + "train/total_loss": 0.11968252062797546 + }, + { + "entropy": 8.913006782531738, + "epoch": 0.8408147122800079, + "mean_token_accuracy": 0.7347221970558167, + "num_tokens": 23451981.0, + "step": 8504, + "train/ce_loss": 1.4104857444763184 + }, + { + "epoch": 0.8408147122800079, + "step": 8504, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8408147122800079, + "step": 8504, + "train/total_loss": 0.1957360804080963 + }, + { + "entropy": 8.650247573852539, + "epoch": 0.8409135851295234, + "mean_token_accuracy": 0.6997635960578918, + "num_tokens": 23457235.0, + "step": 8505, + "train/ce_loss": 1.308672547340393 + }, + { + "epoch": 0.8409135851295234, + "step": 8505, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.8409135851295234, + "step": 8505, + "train/total_loss": 0.21680475771427155 + }, + { + "entropy": 8.899694442749023, + "epoch": 0.841012457979039, + "mean_token_accuracy": 0.7393526434898376, + "num_tokens": 23462282.0, + "step": 8506, + "train/ce_loss": 4.857674298364145e-07 + }, + { + "epoch": 0.841012457979039, + "step": 8506, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.841012457979039, + "step": 8506, + "train/total_loss": 0.01953129842877388 + }, + { + "entropy": 8.43980884552002, + "epoch": 0.8411113308285545, + "mean_token_accuracy": 0.751091718673706, + "num_tokens": 23467612.0, + "step": 8507, + "train/ce_loss": 0.5670673847198486 + }, + { + "epoch": 0.8411113308285545, + "step": 8507, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8411113308285545, + "step": 8507, + "train/total_loss": 0.0840504914522171 + }, + { + "entropy": 8.516087532043457, + "epoch": 0.84121020367807, + "mean_token_accuracy": 0.7193396091461182, + "num_tokens": 23472937.0, + "step": 8508, + "train/ce_loss": 0.6912945508956909 + }, + { + "epoch": 0.84121020367807, + "step": 8508, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.84121020367807, + "step": 8508, + "train/total_loss": 0.12772321701049805 + }, + { + "entropy": 9.018800735473633, + "epoch": 0.8413090765275856, + "mean_token_accuracy": 0.7811934947967529, + "num_tokens": 23477954.0, + "step": 8509, + "train/ce_loss": 7.553363730039564e-07 + }, + { + "epoch": 0.8413090765275856, + "step": 8509, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8413090765275856, + "step": 8509, + "train/total_loss": 0.02734382636845112 + }, + { + "entropy": 8.852306365966797, + "epoch": 0.8414079493771011, + "mean_token_accuracy": 0.7095046639442444, + "num_tokens": 23483141.0, + "step": 8510, + "train/ce_loss": 1.117521047592163 + }, + { + "epoch": 0.8414079493771011, + "step": 8510, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.8414079493771011, + "step": 8510, + "train/total_loss": 0.18597085773944855 + }, + { + "entropy": 8.106832504272461, + "epoch": 0.8415068222266165, + "mean_token_accuracy": 0.6650148630142212, + "num_tokens": 23488644.0, + "step": 8511, + "train/ce_loss": 1.5925387144088745 + }, + { + "epoch": 0.8415068222266165, + "step": 8511, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8415068222266165, + "step": 8511, + "train/total_loss": 0.18659763038158417 + }, + { + "entropy": 8.672381401062012, + "epoch": 0.8416056950761321, + "mean_token_accuracy": 0.7424425482749939, + "num_tokens": 23493957.0, + "step": 8512, + "train/ce_loss": 1.1097936630249023 + }, + { + "epoch": 0.8416056950761321, + "step": 8512, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8416056950761321, + "step": 8512, + "train/total_loss": 0.1656668782234192 + }, + { + "entropy": 8.412344932556152, + "epoch": 0.8417045679256476, + "mean_token_accuracy": 0.7193638682365417, + "num_tokens": 23499553.0, + "step": 8513, + "train/ce_loss": 0.8414641618728638 + }, + { + "epoch": 0.8417045679256476, + "step": 8513, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8417045679256476, + "step": 8513, + "train/total_loss": 0.1388339102268219 + }, + { + "entropy": 8.977106094360352, + "epoch": 0.8418034407751631, + "mean_token_accuracy": 0.7268656492233276, + "num_tokens": 23504695.0, + "step": 8514, + "train/ce_loss": 1.0664161443710327 + }, + { + "epoch": 0.8418034407751631, + "step": 8514, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8418034407751631, + "step": 8514, + "train/total_loss": 0.17695412039756775 + }, + { + "entropy": 8.835411071777344, + "epoch": 0.8419023136246787, + "mean_token_accuracy": 0.731121301651001, + "num_tokens": 23510047.0, + "step": 8515, + "train/ce_loss": 0.530247688293457 + }, + { + "epoch": 0.8419023136246787, + "step": 8515, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8419023136246787, + "step": 8515, + "train/total_loss": 0.0920872688293457 + }, + { + "entropy": 8.61327075958252, + "epoch": 0.8420011864741942, + "mean_token_accuracy": 0.7627856135368347, + "num_tokens": 23515473.0, + "step": 8516, + "train/ce_loss": 1.1188116073608398 + }, + { + "epoch": 0.8420011864741942, + "step": 8516, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.8420011864741942, + "step": 8516, + "train/total_loss": 0.23688116669654846 + }, + { + "entropy": 9.671828269958496, + "epoch": 0.8421000593237097, + "mean_token_accuracy": 0.834645688533783, + "num_tokens": 23520133.0, + "step": 8517, + "train/ce_loss": 3.985355306213023e-06 + }, + { + "epoch": 0.8421000593237097, + "step": 8517, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8421000593237097, + "step": 8517, + "train/total_loss": 0.07031289488077164 + }, + { + "entropy": 8.810830116271973, + "epoch": 0.8421989321732253, + "mean_token_accuracy": 0.7390710115432739, + "num_tokens": 23525305.0, + "step": 8518, + "train/ce_loss": 1.1688063144683838 + }, + { + "epoch": 0.8421989321732253, + "step": 8518, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8421989321732253, + "step": 8518, + "train/total_loss": 0.1832868754863739 + }, + { + "entropy": 9.445963859558105, + "epoch": 0.8422978050227408, + "mean_token_accuracy": 0.7844611406326294, + "num_tokens": 23530088.0, + "step": 8519, + "train/ce_loss": 2.614632194308797e-06 + }, + { + "epoch": 0.8422978050227408, + "step": 8519, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8422978050227408, + "step": 8519, + "train/total_loss": 0.027344010770320892 + }, + { + "epoch": 0.8423966778722562, + "grad_norm": 0.7130556702613831, + "learning_rate": 7.896207288730653e-06, + "loss": 0.1326, + "step": 8520 + }, + { + "entropy": 9.060623168945312, + "epoch": 0.8423966778722562, + "mean_token_accuracy": 0.733564019203186, + "num_tokens": 23535111.0, + "step": 8520, + "train/ce_loss": 1.1208001375198364 + }, + { + "epoch": 0.8423966778722562, + "step": 8520, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.8423966778722562, + "step": 8520, + "train/total_loss": 0.13161125779151917 + }, + { + "entropy": 8.928070068359375, + "epoch": 0.8424955507217718, + "mean_token_accuracy": 0.7264630794525146, + "num_tokens": 23540361.0, + "step": 8521, + "train/ce_loss": 1.4061381816864014 + }, + { + "epoch": 0.8424955507217718, + "step": 8521, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8424955507217718, + "step": 8521, + "train/total_loss": 0.20311382412910461 + }, + { + "entropy": 8.617700576782227, + "epoch": 0.8425944235712873, + "mean_token_accuracy": 0.6704730987548828, + "num_tokens": 23545414.0, + "step": 8522, + "train/ce_loss": 2.2443900108337402 + }, + { + "epoch": 0.8425944235712873, + "step": 8522, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8425944235712873, + "step": 8522, + "train/total_loss": 0.29475152492523193 + }, + { + "entropy": 8.72222900390625, + "epoch": 0.8426932964208028, + "mean_token_accuracy": 0.7211201786994934, + "num_tokens": 23550708.0, + "step": 8523, + "train/ce_loss": 0.5025374889373779 + }, + { + "epoch": 0.8426932964208028, + "step": 8523, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8426932964208028, + "step": 8523, + "train/total_loss": 0.08540999889373779 + }, + { + "entropy": 8.840371131896973, + "epoch": 0.8427921692703184, + "mean_token_accuracy": 0.7091690301895142, + "num_tokens": 23555829.0, + "step": 8524, + "train/ce_loss": 1.5976157188415527 + }, + { + "epoch": 0.8427921692703184, + "step": 8524, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8427921692703184, + "step": 8524, + "train/total_loss": 0.21054282784461975 + }, + { + "entropy": 8.897891998291016, + "epoch": 0.8428910421198339, + "mean_token_accuracy": 0.7764876484870911, + "num_tokens": 23561122.0, + "step": 8525, + "train/ce_loss": 0.9834213852882385 + }, + { + "epoch": 0.8428910421198339, + "step": 8525, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8428910421198339, + "step": 8525, + "train/total_loss": 0.1256859004497528 + }, + { + "entropy": 9.54948616027832, + "epoch": 0.8429899149693494, + "mean_token_accuracy": 0.7002288103103638, + "num_tokens": 23565997.0, + "step": 8526, + "train/ce_loss": 1.4457448287430452e-06 + }, + { + "epoch": 0.8429899149693494, + "step": 8526, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8429899149693494, + "step": 8526, + "train/total_loss": 0.06640639156103134 + }, + { + "entropy": 8.320509910583496, + "epoch": 0.843088787818865, + "mean_token_accuracy": 0.7177914381027222, + "num_tokens": 23571297.0, + "step": 8527, + "train/ce_loss": 0.9710502624511719 + }, + { + "epoch": 0.843088787818865, + "step": 8527, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.843088787818865, + "step": 8527, + "train/total_loss": 0.1283550262451172 + }, + { + "entropy": 8.480062484741211, + "epoch": 0.8431876606683805, + "mean_token_accuracy": 0.7616875767707825, + "num_tokens": 23576669.0, + "step": 8528, + "train/ce_loss": 0.5755949020385742 + }, + { + "epoch": 0.8431876606683805, + "step": 8528, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8431876606683805, + "step": 8528, + "train/total_loss": 0.07318449020385742 + }, + { + "entropy": 9.013786315917969, + "epoch": 0.843286533517896, + "mean_token_accuracy": 0.7514124512672424, + "num_tokens": 23581811.0, + "step": 8529, + "train/ce_loss": 1.2379498481750488 + }, + { + "epoch": 0.843286533517896, + "step": 8529, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.843286533517896, + "step": 8529, + "train/total_loss": 0.18238873779773712 + }, + { + "entropy": 9.010156631469727, + "epoch": 0.8433854063674115, + "mean_token_accuracy": 0.7237654328346252, + "num_tokens": 23586901.0, + "step": 8530, + "train/ce_loss": 0.8623937964439392 + }, + { + "epoch": 0.8433854063674115, + "step": 8530, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8433854063674115, + "step": 8530, + "train/total_loss": 0.11358313262462616 + }, + { + "entropy": 8.625732421875, + "epoch": 0.843484279216927, + "mean_token_accuracy": 0.7690504193305969, + "num_tokens": 23592195.0, + "step": 8531, + "train/ce_loss": 0.524692714214325 + }, + { + "epoch": 0.843484279216927, + "step": 8531, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.843484279216927, + "step": 8531, + "train/total_loss": 0.13450051844120026 + }, + { + "entropy": 9.116499900817871, + "epoch": 0.8435831520664425, + "mean_token_accuracy": 0.7751572132110596, + "num_tokens": 23597256.0, + "step": 8532, + "train/ce_loss": 1.6770809888839722 + }, + { + "epoch": 0.8435831520664425, + "step": 8532, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8435831520664425, + "step": 8532, + "train/total_loss": 0.22239559888839722 + }, + { + "entropy": 9.061075210571289, + "epoch": 0.8436820249159581, + "mean_token_accuracy": 0.7697160840034485, + "num_tokens": 23602309.0, + "step": 8533, + "train/ce_loss": 0.5759576559066772 + }, + { + "epoch": 0.8436820249159581, + "step": 8533, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8436820249159581, + "step": 8533, + "train/total_loss": 0.09275201708078384 + }, + { + "entropy": 9.309283256530762, + "epoch": 0.8437808977654736, + "mean_token_accuracy": 0.7600849270820618, + "num_tokens": 23607234.0, + "step": 8534, + "train/ce_loss": 5.001173235541501e-07 + }, + { + "epoch": 0.8437808977654736, + "step": 8534, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.8437808977654736, + "step": 8534, + "train/total_loss": 0.01953130029141903 + }, + { + "entropy": 9.241487503051758, + "epoch": 0.8438797706149891, + "mean_token_accuracy": 0.6947040557861328, + "num_tokens": 23612257.0, + "step": 8535, + "train/ce_loss": 1.3567728996276855 + }, + { + "epoch": 0.8438797706149891, + "step": 8535, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8438797706149891, + "step": 8535, + "train/total_loss": 0.2059897929430008 + }, + { + "entropy": 8.804858207702637, + "epoch": 0.8439786434645047, + "mean_token_accuracy": 0.6940194964408875, + "num_tokens": 23617462.0, + "step": 8536, + "train/ce_loss": 1.312057614326477 + }, + { + "epoch": 0.8439786434645047, + "step": 8536, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8439786434645047, + "step": 8536, + "train/total_loss": 0.17026826739311218 + }, + { + "entropy": 8.754676818847656, + "epoch": 0.8440775163140202, + "mean_token_accuracy": 0.7330623269081116, + "num_tokens": 23622683.0, + "step": 8537, + "train/ce_loss": 0.9614264369010925 + }, + { + "epoch": 0.8440775163140202, + "step": 8537, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8440775163140202, + "step": 8537, + "train/total_loss": 0.13129889965057373 + }, + { + "entropy": 8.331369400024414, + "epoch": 0.8441763891635357, + "mean_token_accuracy": 0.7461140155792236, + "num_tokens": 23628181.0, + "step": 8538, + "train/ce_loss": 0.43463143706321716 + }, + { + "epoch": 0.8441763891635357, + "step": 8538, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.8441763891635357, + "step": 8538, + "train/total_loss": 0.06299439072608948 + }, + { + "entropy": 8.614924430847168, + "epoch": 0.8442752620130513, + "mean_token_accuracy": 0.7688098549842834, + "num_tokens": 23633417.0, + "step": 8539, + "train/ce_loss": 0.895168125629425 + }, + { + "epoch": 0.8442752620130513, + "step": 8539, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.8442752620130513, + "step": 8539, + "train/total_loss": 0.18326681852340698 + }, + { + "epoch": 0.8443741348625667, + "grad_norm": 0.6965118646621704, + "learning_rate": 7.891262423972705e-06, + "loss": 0.132, + "step": 8540 + }, + { + "entropy": 9.057525634765625, + "epoch": 0.8443741348625667, + "mean_token_accuracy": 0.7484939694404602, + "num_tokens": 23638500.0, + "step": 8540, + "train/ce_loss": 4.804210789188801e-07 + }, + { + "epoch": 0.8443741348625667, + "step": 8540, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.8443741348625667, + "step": 8540, + "train/total_loss": 0.01171879842877388 + }, + { + "entropy": 8.604438781738281, + "epoch": 0.8444730077120822, + "mean_token_accuracy": 0.7147846221923828, + "num_tokens": 23643836.0, + "step": 8541, + "train/ce_loss": 1.0626955032348633 + }, + { + "epoch": 0.8444730077120822, + "step": 8541, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8444730077120822, + "step": 8541, + "train/total_loss": 0.15314455330371857 + }, + { + "entropy": 8.803327560424805, + "epoch": 0.8445718805615978, + "mean_token_accuracy": 0.7428977489471436, + "num_tokens": 23649029.0, + "step": 8542, + "train/ce_loss": 0.7747955918312073 + }, + { + "epoch": 0.8445718805615978, + "step": 8542, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.8445718805615978, + "step": 8542, + "train/total_loss": 0.15951082110404968 + }, + { + "entropy": 8.597539901733398, + "epoch": 0.8446707534111133, + "mean_token_accuracy": 0.7333333492279053, + "num_tokens": 23654293.0, + "step": 8543, + "train/ce_loss": 0.9350537657737732 + }, + { + "epoch": 0.8446707534111133, + "step": 8543, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8446707534111133, + "step": 8543, + "train/total_loss": 0.1364741325378418 + }, + { + "entropy": 8.989640235900879, + "epoch": 0.8447696262606288, + "mean_token_accuracy": 0.6534771919250488, + "num_tokens": 23659552.0, + "step": 8544, + "train/ce_loss": 2.057020902633667 + }, + { + "epoch": 0.8447696262606288, + "step": 8544, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.8447696262606288, + "step": 8544, + "train/total_loss": 0.3072645962238312 + }, + { + "entropy": 8.553593635559082, + "epoch": 0.8448684991101444, + "mean_token_accuracy": 0.7130801677703857, + "num_tokens": 23664935.0, + "step": 8545, + "train/ce_loss": 0.978156328201294 + }, + { + "epoch": 0.8448684991101444, + "step": 8545, + "train/sim_loss": 0.16015625 + }, + { + "epoch": 0.8448684991101444, + "step": 8545, + "train/total_loss": 0.2579718828201294 + }, + { + "entropy": 8.832277297973633, + "epoch": 0.8449673719596599, + "mean_token_accuracy": 0.7181926369667053, + "num_tokens": 23670259.0, + "step": 8546, + "train/ce_loss": 0.8887196779251099 + }, + { + "epoch": 0.8449673719596599, + "step": 8546, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8449673719596599, + "step": 8546, + "train/total_loss": 0.13965322077274323 + }, + { + "entropy": 9.03927993774414, + "epoch": 0.8450662448091754, + "mean_token_accuracy": 0.7805194854736328, + "num_tokens": 23675503.0, + "step": 8547, + "train/ce_loss": 0.836915910243988 + }, + { + "epoch": 0.8450662448091754, + "step": 8547, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8450662448091754, + "step": 8547, + "train/total_loss": 0.10712908953428268 + }, + { + "entropy": 9.511265754699707, + "epoch": 0.845165117658691, + "mean_token_accuracy": 0.7051281929016113, + "num_tokens": 23680415.0, + "step": 8548, + "train/ce_loss": 1.317632794380188 + }, + { + "epoch": 0.845165117658691, + "step": 8548, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.845165117658691, + "step": 8548, + "train/total_loss": 0.1669195294380188 + }, + { + "entropy": 8.35056209564209, + "epoch": 0.8452639905082064, + "mean_token_accuracy": 0.6816367506980896, + "num_tokens": 23685928.0, + "step": 8549, + "train/ce_loss": 0.5039235353469849 + }, + { + "epoch": 0.8452639905082064, + "step": 8549, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8452639905082064, + "step": 8549, + "train/total_loss": 0.11289235949516296 + }, + { + "entropy": 8.82366943359375, + "epoch": 0.8453628633577219, + "mean_token_accuracy": 0.6337500214576721, + "num_tokens": 23691174.0, + "step": 8550, + "train/ce_loss": 1.8179452419281006 + }, + { + "epoch": 0.8453628633577219, + "step": 8550, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.8453628633577219, + "step": 8550, + "train/total_loss": 0.28335702419281006 + }, + { + "entropy": 8.933710098266602, + "epoch": 0.8454617362072375, + "mean_token_accuracy": 0.7696709632873535, + "num_tokens": 23696311.0, + "step": 8551, + "train/ce_loss": 0.43073558807373047 + }, + { + "epoch": 0.8454617362072375, + "step": 8551, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8454617362072375, + "step": 8551, + "train/total_loss": 0.09776106476783752 + }, + { + "entropy": 8.46588134765625, + "epoch": 0.845560609056753, + "mean_token_accuracy": 0.7494845390319824, + "num_tokens": 23701739.0, + "step": 8552, + "train/ce_loss": 1.047522783279419 + }, + { + "epoch": 0.845560609056753, + "step": 8552, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.845560609056753, + "step": 8552, + "train/total_loss": 0.12818977236747742 + }, + { + "entropy": 8.805068969726562, + "epoch": 0.8456594819062685, + "mean_token_accuracy": 0.7956104278564453, + "num_tokens": 23706978.0, + "step": 8553, + "train/ce_loss": 1.2623344218809507e-06 + }, + { + "epoch": 0.8456594819062685, + "step": 8553, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8456594819062685, + "step": 8553, + "train/total_loss": 0.05468762665987015 + }, + { + "entropy": 8.793703079223633, + "epoch": 0.8457583547557841, + "mean_token_accuracy": 0.7777777910232544, + "num_tokens": 23712031.0, + "step": 8554, + "train/ce_loss": 0.9132997989654541 + }, + { + "epoch": 0.8457583547557841, + "step": 8554, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8457583547557841, + "step": 8554, + "train/total_loss": 0.13039249181747437 + }, + { + "entropy": 8.934954643249512, + "epoch": 0.8458572276052996, + "mean_token_accuracy": 0.6742532253265381, + "num_tokens": 23717178.0, + "step": 8555, + "train/ce_loss": 4.5444750185197336e-07 + }, + { + "epoch": 0.8458572276052996, + "step": 8555, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8458572276052996, + "step": 8555, + "train/total_loss": 0.01562504470348358 + }, + { + "entropy": 8.994284629821777, + "epoch": 0.8459561004548151, + "mean_token_accuracy": 0.7478134036064148, + "num_tokens": 23722343.0, + "step": 8556, + "train/ce_loss": 1.5355756282806396 + }, + { + "epoch": 0.8459561004548151, + "step": 8556, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8459561004548151, + "step": 8556, + "train/total_loss": 0.21996381878852844 + }, + { + "entropy": 8.861419677734375, + "epoch": 0.8460549733043307, + "mean_token_accuracy": 0.7296416759490967, + "num_tokens": 23727741.0, + "step": 8557, + "train/ce_loss": 1.1478617191314697 + }, + { + "epoch": 0.8460549733043307, + "step": 8557, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.8460549733043307, + "step": 8557, + "train/total_loss": 0.23197367787361145 + }, + { + "entropy": 9.423959732055664, + "epoch": 0.8461538461538461, + "mean_token_accuracy": 0.7896679043769836, + "num_tokens": 23732738.0, + "step": 8558, + "train/ce_loss": 0.9041481018066406 + }, + { + "epoch": 0.8461538461538461, + "step": 8558, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8461538461538461, + "step": 8558, + "train/total_loss": 0.16072732210159302 + }, + { + "entropy": 8.978797912597656, + "epoch": 0.8462527190033616, + "mean_token_accuracy": 0.7086092829704285, + "num_tokens": 23737967.0, + "step": 8559, + "train/ce_loss": 0.9606765508651733 + }, + { + "epoch": 0.8462527190033616, + "step": 8559, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.8462527190033616, + "step": 8559, + "train/total_loss": 0.1741926670074463 + }, + { + "epoch": 0.8463515918528772, + "grad_norm": 0.7191409468650818, + "learning_rate": 7.886317559214756e-06, + "loss": 0.1419, + "step": 8560 + }, + { + "entropy": 8.879575729370117, + "epoch": 0.8463515918528772, + "mean_token_accuracy": 0.7217962741851807, + "num_tokens": 23743346.0, + "step": 8560, + "train/ce_loss": 0.960830807685852 + }, + { + "epoch": 0.8463515918528772, + "step": 8560, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.8463515918528772, + "step": 8560, + "train/total_loss": 0.18592682480812073 + }, + { + "entropy": 9.213173866271973, + "epoch": 0.8464504647023927, + "mean_token_accuracy": 0.7724252343177795, + "num_tokens": 23748358.0, + "step": 8561, + "train/ce_loss": 0.8338235020637512 + }, + { + "epoch": 0.8464504647023927, + "step": 8561, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8464504647023927, + "step": 8561, + "train/total_loss": 0.11853860318660736 + }, + { + "entropy": 8.947521209716797, + "epoch": 0.8465493375519082, + "mean_token_accuracy": 0.7548387050628662, + "num_tokens": 23753612.0, + "step": 8562, + "train/ce_loss": 0.5646962523460388 + }, + { + "epoch": 0.8465493375519082, + "step": 8562, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.8465493375519082, + "step": 8562, + "train/total_loss": 0.1541258692741394 + }, + { + "entropy": 9.613811492919922, + "epoch": 0.8466482104014238, + "mean_token_accuracy": 0.7535211443901062, + "num_tokens": 23758428.0, + "step": 8563, + "train/ce_loss": 7.110681963240495e-06 + }, + { + "epoch": 0.8466482104014238, + "step": 8563, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8466482104014238, + "step": 8563, + "train/total_loss": 0.050781961530447006 + }, + { + "entropy": 9.07571029663086, + "epoch": 0.8467470832509393, + "mean_token_accuracy": 0.6820987462997437, + "num_tokens": 23763537.0, + "step": 8564, + "train/ce_loss": 0.8961170315742493 + }, + { + "epoch": 0.8467470832509393, + "step": 8564, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8467470832509393, + "step": 8564, + "train/total_loss": 0.1286742091178894 + }, + { + "entropy": 8.552220344543457, + "epoch": 0.8468459561004548, + "mean_token_accuracy": 0.7806385159492493, + "num_tokens": 23768918.0, + "step": 8565, + "train/ce_loss": 0.8199772238731384 + }, + { + "epoch": 0.8468459561004548, + "step": 8565, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8468459561004548, + "step": 8565, + "train/total_loss": 0.11715397238731384 + }, + { + "entropy": 9.405799865722656, + "epoch": 0.8469448289499704, + "mean_token_accuracy": 0.7757936716079712, + "num_tokens": 23773824.0, + "step": 8566, + "train/ce_loss": 1.1489684581756592 + }, + { + "epoch": 0.8469448289499704, + "step": 8566, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8469448289499704, + "step": 8566, + "train/total_loss": 0.13052184879779816 + }, + { + "entropy": 8.768903732299805, + "epoch": 0.8470437017994858, + "mean_token_accuracy": 0.7369093298912048, + "num_tokens": 23779078.0, + "step": 8567, + "train/ce_loss": 1.2097439765930176 + }, + { + "epoch": 0.8470437017994858, + "step": 8567, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8470437017994858, + "step": 8567, + "train/total_loss": 0.17566189169883728 + }, + { + "entropy": 8.474605560302734, + "epoch": 0.8471425746490013, + "mean_token_accuracy": 0.7535714507102966, + "num_tokens": 23784381.0, + "step": 8568, + "train/ce_loss": 0.6563302874565125 + }, + { + "epoch": 0.8471425746490013, + "step": 8568, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.8471425746490013, + "step": 8568, + "train/total_loss": 0.07735177874565125 + }, + { + "entropy": 8.380297660827637, + "epoch": 0.8472414474985169, + "mean_token_accuracy": 0.7276736497879028, + "num_tokens": 23789765.0, + "step": 8569, + "train/ce_loss": 1.0520323514938354 + }, + { + "epoch": 0.8472414474985169, + "step": 8569, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8472414474985169, + "step": 8569, + "train/total_loss": 0.14426574110984802 + }, + { + "entropy": 9.462567329406738, + "epoch": 0.8473403203480324, + "mean_token_accuracy": 0.75, + "num_tokens": 23794751.0, + "step": 8570, + "train/ce_loss": 1.1305701264063828e-06 + }, + { + "epoch": 0.8473403203480324, + "step": 8570, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8473403203480324, + "step": 8570, + "train/total_loss": 0.039062611758708954 + }, + { + "entropy": 8.370973587036133, + "epoch": 0.8474391931975479, + "mean_token_accuracy": 0.7162954211235046, + "num_tokens": 23800133.0, + "step": 8571, + "train/ce_loss": 0.7542774677276611 + }, + { + "epoch": 0.8474391931975479, + "step": 8571, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8474391931975479, + "step": 8571, + "train/total_loss": 0.10277149826288223 + }, + { + "entropy": 8.593929290771484, + "epoch": 0.8475380660470635, + "mean_token_accuracy": 0.7829294204711914, + "num_tokens": 23805505.0, + "step": 8572, + "train/ce_loss": 1.0726655721664429 + }, + { + "epoch": 0.8475380660470635, + "step": 8572, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8475380660470635, + "step": 8572, + "train/total_loss": 0.17757906019687653 + }, + { + "entropy": 8.853445053100586, + "epoch": 0.847636938896579, + "mean_token_accuracy": 0.7512755393981934, + "num_tokens": 23810777.0, + "step": 8573, + "train/ce_loss": 1.2246817350387573 + }, + { + "epoch": 0.847636938896579, + "step": 8573, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.847636938896579, + "step": 8573, + "train/total_loss": 0.21621817350387573 + }, + { + "entropy": 8.767151832580566, + "epoch": 0.8477358117460945, + "mean_token_accuracy": 0.7639665007591248, + "num_tokens": 23815947.0, + "step": 8574, + "train/ce_loss": 0.9610381126403809 + }, + { + "epoch": 0.8477358117460945, + "step": 8574, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8477358117460945, + "step": 8574, + "train/total_loss": 0.13907256722450256 + }, + { + "entropy": 8.860359191894531, + "epoch": 0.8478346845956101, + "mean_token_accuracy": 0.7936893105506897, + "num_tokens": 23821210.0, + "step": 8575, + "train/ce_loss": 0.795008659362793 + }, + { + "epoch": 0.8478346845956101, + "step": 8575, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8478346845956101, + "step": 8575, + "train/total_loss": 0.13028211891651154 + }, + { + "entropy": 8.468084335327148, + "epoch": 0.8479335574451256, + "mean_token_accuracy": 0.6940928101539612, + "num_tokens": 23826593.0, + "step": 8576, + "train/ce_loss": 0.7455776333808899 + }, + { + "epoch": 0.8479335574451256, + "step": 8576, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8479335574451256, + "step": 8576, + "train/total_loss": 0.11362026631832123 + }, + { + "entropy": 8.811766624450684, + "epoch": 0.848032430294641, + "mean_token_accuracy": 0.7359477281570435, + "num_tokens": 23831802.0, + "step": 8577, + "train/ce_loss": 0.7632312178611755 + }, + { + "epoch": 0.848032430294641, + "step": 8577, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.848032430294641, + "step": 8577, + "train/total_loss": 0.12710437178611755 + }, + { + "entropy": 8.497659683227539, + "epoch": 0.8481313031441566, + "mean_token_accuracy": 0.715871274471283, + "num_tokens": 23837209.0, + "step": 8578, + "train/ce_loss": 0.6656380295753479 + }, + { + "epoch": 0.8481313031441566, + "step": 8578, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8481313031441566, + "step": 8578, + "train/total_loss": 0.08218880742788315 + }, + { + "entropy": 8.694424629211426, + "epoch": 0.8482301759936721, + "mean_token_accuracy": 0.7347715497016907, + "num_tokens": 23842508.0, + "step": 8579, + "train/ce_loss": 0.8246199488639832 + }, + { + "epoch": 0.8482301759936721, + "step": 8579, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8482301759936721, + "step": 8579, + "train/total_loss": 0.10980574786663055 + }, + { + "epoch": 0.8483290488431876, + "grad_norm": 0.6876291036605835, + "learning_rate": 7.881372694456806e-06, + "loss": 0.1312, + "step": 8580 + }, + { + "entropy": 9.102163314819336, + "epoch": 0.8483290488431876, + "mean_token_accuracy": 0.8110516667366028, + "num_tokens": 23847560.0, + "step": 8580, + "train/ce_loss": 9.460008527639729e-07 + }, + { + "epoch": 0.8483290488431876, + "step": 8580, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8483290488431876, + "step": 8580, + "train/total_loss": 0.03515634313225746 + }, + { + "entropy": 8.552837371826172, + "epoch": 0.8484279216927032, + "mean_token_accuracy": 0.7890088558197021, + "num_tokens": 23853053.0, + "step": 8581, + "train/ce_loss": 0.6224937438964844 + }, + { + "epoch": 0.8484279216927032, + "step": 8581, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8484279216927032, + "step": 8581, + "train/total_loss": 0.10131187736988068 + }, + { + "entropy": 8.294087409973145, + "epoch": 0.8485267945422187, + "mean_token_accuracy": 0.7723258137702942, + "num_tokens": 23858541.0, + "step": 8582, + "train/ce_loss": 0.699788510799408 + }, + { + "epoch": 0.8485267945422187, + "step": 8582, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8485267945422187, + "step": 8582, + "train/total_loss": 0.11294760555028915 + }, + { + "entropy": 9.38934326171875, + "epoch": 0.8486256673917343, + "mean_token_accuracy": 0.7436892986297607, + "num_tokens": 23863503.0, + "step": 8583, + "train/ce_loss": 0.9503109455108643 + }, + { + "epoch": 0.8486256673917343, + "step": 8583, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8486256673917343, + "step": 8583, + "train/total_loss": 0.14581234753131866 + }, + { + "entropy": 9.223760604858398, + "epoch": 0.8487245402412498, + "mean_token_accuracy": 0.7319148778915405, + "num_tokens": 23868383.0, + "step": 8584, + "train/ce_loss": 5.062517516307707e-07 + }, + { + "epoch": 0.8487245402412498, + "step": 8584, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8487245402412498, + "step": 8584, + "train/total_loss": 0.05078130215406418 + }, + { + "entropy": 8.907994270324707, + "epoch": 0.8488234130907653, + "mean_token_accuracy": 0.7239512801170349, + "num_tokens": 23873611.0, + "step": 8585, + "train/ce_loss": 0.813693106174469 + }, + { + "epoch": 0.8488234130907653, + "step": 8585, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8488234130907653, + "step": 8585, + "train/total_loss": 0.1282443106174469 + }, + { + "entropy": 8.57661247253418, + "epoch": 0.8489222859402809, + "mean_token_accuracy": 0.7664429545402527, + "num_tokens": 23878867.0, + "step": 8586, + "train/ce_loss": 0.8562458753585815 + }, + { + "epoch": 0.8489222859402809, + "step": 8586, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8489222859402809, + "step": 8586, + "train/total_loss": 0.1168745905160904 + }, + { + "entropy": 8.623764038085938, + "epoch": 0.8490211587897963, + "mean_token_accuracy": 0.7465224266052246, + "num_tokens": 23884026.0, + "step": 8587, + "train/ce_loss": 0.5021078586578369 + }, + { + "epoch": 0.8490211587897963, + "step": 8587, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8490211587897963, + "step": 8587, + "train/total_loss": 0.11661703884601593 + }, + { + "entropy": 9.092384338378906, + "epoch": 0.8491200316393118, + "mean_token_accuracy": 0.6960926055908203, + "num_tokens": 23889140.0, + "step": 8588, + "train/ce_loss": 1.0651516914367676 + }, + { + "epoch": 0.8491200316393118, + "step": 8588, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8491200316393118, + "step": 8588, + "train/total_loss": 0.16510891914367676 + }, + { + "entropy": 8.752415657043457, + "epoch": 0.8492189044888274, + "mean_token_accuracy": 0.7476537823677063, + "num_tokens": 23894586.0, + "step": 8589, + "train/ce_loss": 0.5429462790489197 + }, + { + "epoch": 0.8492189044888274, + "step": 8589, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.8492189044888274, + "step": 8589, + "train/total_loss": 0.1324196308851242 + }, + { + "entropy": 9.215569496154785, + "epoch": 0.8493177773383429, + "mean_token_accuracy": 0.8021978139877319, + "num_tokens": 23899594.0, + "step": 8590, + "train/ce_loss": 8.209082693610981e-07 + }, + { + "epoch": 0.8493177773383429, + "step": 8590, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8493177773383429, + "step": 8590, + "train/total_loss": 0.07031258195638657 + }, + { + "entropy": 8.700742721557617, + "epoch": 0.8494166501878584, + "mean_token_accuracy": 0.7142857313156128, + "num_tokens": 23904918.0, + "step": 8591, + "train/ce_loss": 0.5889235138893127 + }, + { + "epoch": 0.8494166501878584, + "step": 8591, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8494166501878584, + "step": 8591, + "train/total_loss": 0.09795485436916351 + }, + { + "entropy": 8.850866317749023, + "epoch": 0.849515523037374, + "mean_token_accuracy": 0.7548543810844421, + "num_tokens": 23910206.0, + "step": 8592, + "train/ce_loss": 0.4724769592285156 + }, + { + "epoch": 0.849515523037374, + "step": 8592, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.849515523037374, + "step": 8592, + "train/total_loss": 0.09802894294261932 + }, + { + "entropy": 9.597957611083984, + "epoch": 0.8496143958868895, + "mean_token_accuracy": 0.738161563873291, + "num_tokens": 23914980.0, + "step": 8593, + "train/ce_loss": 1.9964507818222046 + }, + { + "epoch": 0.8496143958868895, + "step": 8593, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.8496143958868895, + "step": 8593, + "train/total_loss": 0.27386385202407837 + }, + { + "entropy": 8.678569793701172, + "epoch": 0.849713268736405, + "mean_token_accuracy": 0.6766623258590698, + "num_tokens": 23920252.0, + "step": 8594, + "train/ce_loss": 1.3998847007751465 + }, + { + "epoch": 0.849713268736405, + "step": 8594, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.849713268736405, + "step": 8594, + "train/total_loss": 0.2103009670972824 + }, + { + "entropy": 8.598791122436523, + "epoch": 0.8498121415859206, + "mean_token_accuracy": 0.7106481194496155, + "num_tokens": 23925626.0, + "step": 8595, + "train/ce_loss": 0.6554878950119019 + }, + { + "epoch": 0.8498121415859206, + "step": 8595, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8498121415859206, + "step": 8595, + "train/total_loss": 0.12804879248142242 + }, + { + "entropy": 8.57380485534668, + "epoch": 0.849911014435436, + "mean_token_accuracy": 0.738170325756073, + "num_tokens": 23930999.0, + "step": 8596, + "train/ce_loss": 1.0399413108825684 + }, + { + "epoch": 0.849911014435436, + "step": 8596, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.849911014435436, + "step": 8596, + "train/total_loss": 0.14696288108825684 + }, + { + "entropy": 8.762775421142578, + "epoch": 0.8500098872849515, + "mean_token_accuracy": 0.7331759333610535, + "num_tokens": 23936310.0, + "step": 8597, + "train/ce_loss": 0.8365949988365173 + }, + { + "epoch": 0.8500098872849515, + "step": 8597, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8500098872849515, + "step": 8597, + "train/total_loss": 0.11100324988365173 + }, + { + "entropy": 9.271051406860352, + "epoch": 0.8501087601344671, + "mean_token_accuracy": 0.7364568114280701, + "num_tokens": 23941398.0, + "step": 8598, + "train/ce_loss": 1.2323108911514282 + }, + { + "epoch": 0.8501087601344671, + "step": 8598, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8501087601344671, + "step": 8598, + "train/total_loss": 0.15838733315467834 + }, + { + "entropy": 8.725774765014648, + "epoch": 0.8502076329839826, + "mean_token_accuracy": 0.7077682018280029, + "num_tokens": 23946664.0, + "step": 8599, + "train/ce_loss": 0.5549308657646179 + }, + { + "epoch": 0.8502076329839826, + "step": 8599, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8502076329839826, + "step": 8599, + "train/total_loss": 0.11799308657646179 + }, + { + "epoch": 0.8503065058334981, + "grad_norm": 0.736488401889801, + "learning_rate": 7.876427829698859e-06, + "loss": 0.1359, + "step": 8600 + }, + { + "entropy": 8.868095397949219, + "epoch": 0.8503065058334981, + "mean_token_accuracy": 0.7949790954589844, + "num_tokens": 23951870.0, + "step": 8600, + "train/ce_loss": 0.7157239317893982 + }, + { + "epoch": 0.8503065058334981, + "step": 8600, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8503065058334981, + "step": 8600, + "train/total_loss": 0.11063489317893982 + }, + { + "entropy": 8.631006240844727, + "epoch": 0.8504053786830137, + "mean_token_accuracy": 0.7225305438041687, + "num_tokens": 23957208.0, + "step": 8601, + "train/ce_loss": 0.7594647407531738 + }, + { + "epoch": 0.8504053786830137, + "step": 8601, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8504053786830137, + "step": 8601, + "train/total_loss": 0.11891522258520126 + }, + { + "entropy": 8.989002227783203, + "epoch": 0.8505042515325292, + "mean_token_accuracy": 0.7481371164321899, + "num_tokens": 23962275.0, + "step": 8602, + "train/ce_loss": 0.7256680727005005 + }, + { + "epoch": 0.8505042515325292, + "step": 8602, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8505042515325292, + "step": 8602, + "train/total_loss": 0.11944180727005005 + }, + { + "entropy": 8.472922325134277, + "epoch": 0.8506031243820447, + "mean_token_accuracy": 0.7381423115730286, + "num_tokens": 23967743.0, + "step": 8603, + "train/ce_loss": 1.0811166763305664 + }, + { + "epoch": 0.8506031243820447, + "step": 8603, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8506031243820447, + "step": 8603, + "train/total_loss": 0.1510804295539856 + }, + { + "entropy": 8.90719223022461, + "epoch": 0.8507019972315603, + "mean_token_accuracy": 0.730654776096344, + "num_tokens": 23972855.0, + "step": 8604, + "train/ce_loss": 4.292879225431534e-07 + }, + { + "epoch": 0.8507019972315603, + "step": 8604, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8507019972315603, + "step": 8604, + "train/total_loss": 0.04687504470348358 + }, + { + "entropy": 8.540361404418945, + "epoch": 0.8508008700810757, + "mean_token_accuracy": 0.7266355156898499, + "num_tokens": 23978191.0, + "step": 8605, + "train/ce_loss": 0.5352979898452759 + }, + { + "epoch": 0.8508008700810757, + "step": 8605, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8508008700810757, + "step": 8605, + "train/total_loss": 0.07696729898452759 + }, + { + "entropy": 9.061471939086914, + "epoch": 0.8508997429305912, + "mean_token_accuracy": 0.8200590014457703, + "num_tokens": 23983318.0, + "step": 8606, + "train/ce_loss": 7.412546665364061e-07 + }, + { + "epoch": 0.8508997429305912, + "step": 8606, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8508997429305912, + "step": 8606, + "train/total_loss": 0.06250007450580597 + }, + { + "entropy": 8.739973068237305, + "epoch": 0.8509986157801068, + "mean_token_accuracy": 0.7247474789619446, + "num_tokens": 23988557.0, + "step": 8607, + "train/ce_loss": 0.7236993908882141 + }, + { + "epoch": 0.8509986157801068, + "step": 8607, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.8509986157801068, + "step": 8607, + "train/total_loss": 0.14658868312835693 + }, + { + "entropy": 9.018573760986328, + "epoch": 0.8510974886296223, + "mean_token_accuracy": 0.7268170714378357, + "num_tokens": 23994006.0, + "step": 8608, + "train/ce_loss": 1.1890531778335571 + }, + { + "epoch": 0.8510974886296223, + "step": 8608, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8510974886296223, + "step": 8608, + "train/total_loss": 0.18531157076358795 + }, + { + "entropy": 8.670655250549316, + "epoch": 0.8511963614791378, + "mean_token_accuracy": 0.7262773513793945, + "num_tokens": 23999337.0, + "step": 8609, + "train/ce_loss": 0.41880103945732117 + }, + { + "epoch": 0.8511963614791378, + "step": 8609, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8511963614791378, + "step": 8609, + "train/total_loss": 0.057505104690790176 + }, + { + "entropy": 8.736785888671875, + "epoch": 0.8512952343286534, + "mean_token_accuracy": 0.7797872424125671, + "num_tokens": 24004694.0, + "step": 8610, + "train/ce_loss": 0.5912141799926758 + }, + { + "epoch": 0.8512952343286534, + "step": 8610, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8512952343286534, + "step": 8610, + "train/total_loss": 0.09037141501903534 + }, + { + "entropy": 9.069501876831055, + "epoch": 0.8513941071781689, + "mean_token_accuracy": 0.7878260612487793, + "num_tokens": 24009678.0, + "step": 8611, + "train/ce_loss": 5.896106358704856e-07 + }, + { + "epoch": 0.8513941071781689, + "step": 8611, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8513941071781689, + "step": 8611, + "train/total_loss": 0.046875059604644775 + }, + { + "entropy": 8.639055252075195, + "epoch": 0.8514929800276844, + "mean_token_accuracy": 0.7667887806892395, + "num_tokens": 24014937.0, + "step": 8612, + "train/ce_loss": 0.49192896485328674 + }, + { + "epoch": 0.8514929800276844, + "step": 8612, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8514929800276844, + "step": 8612, + "train/total_loss": 0.0726303979754448 + }, + { + "entropy": 8.987211227416992, + "epoch": 0.8515918528772, + "mean_token_accuracy": 0.7503566145896912, + "num_tokens": 24020045.0, + "step": 8613, + "train/ce_loss": 2.2853257632959867e-06 + }, + { + "epoch": 0.8515918528772, + "step": 8613, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8515918528772, + "step": 8613, + "train/total_loss": 0.0664064809679985 + }, + { + "entropy": 8.735445976257324, + "epoch": 0.8516907257267154, + "mean_token_accuracy": 0.7709359526634216, + "num_tokens": 24025315.0, + "step": 8614, + "train/ce_loss": 0.8446625471115112 + }, + { + "epoch": 0.8516907257267154, + "step": 8614, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8516907257267154, + "step": 8614, + "train/total_loss": 0.11181000620126724 + }, + { + "entropy": 8.674492835998535, + "epoch": 0.8517895985762309, + "mean_token_accuracy": 0.8137565851211548, + "num_tokens": 24030721.0, + "step": 8615, + "train/ce_loss": 0.6308075785636902 + }, + { + "epoch": 0.8517895985762309, + "step": 8615, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8517895985762309, + "step": 8615, + "train/total_loss": 0.11776825785636902 + }, + { + "entropy": 9.038165092468262, + "epoch": 0.8518884714257465, + "mean_token_accuracy": 0.6757457852363586, + "num_tokens": 24035949.0, + "step": 8616, + "train/ce_loss": 1.302553415298462 + }, + { + "epoch": 0.8518884714257465, + "step": 8616, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8518884714257465, + "step": 8616, + "train/total_loss": 0.1849428415298462 + }, + { + "entropy": 8.956336975097656, + "epoch": 0.851987344275262, + "mean_token_accuracy": 0.7837445735931396, + "num_tokens": 24041072.0, + "step": 8617, + "train/ce_loss": 0.46485111117362976 + }, + { + "epoch": 0.851987344275262, + "step": 8617, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.851987344275262, + "step": 8617, + "train/total_loss": 0.06601636111736298 + }, + { + "entropy": 8.625548362731934, + "epoch": 0.8520862171247775, + "mean_token_accuracy": 0.7753530144691467, + "num_tokens": 24046361.0, + "step": 8618, + "train/ce_loss": 0.8610982298851013 + }, + { + "epoch": 0.8520862171247775, + "step": 8618, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.8520862171247775, + "step": 8618, + "train/total_loss": 0.10564107447862625 + }, + { + "entropy": 8.861141204833984, + "epoch": 0.8521850899742931, + "mean_token_accuracy": 0.788170576095581, + "num_tokens": 24051489.0, + "step": 8619, + "train/ce_loss": 0.5501077771186829 + }, + { + "epoch": 0.8521850899742931, + "step": 8619, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8521850899742931, + "step": 8619, + "train/total_loss": 0.12141703069210052 + }, + { + "epoch": 0.8522839628238086, + "grad_norm": 0.5112330913543701, + "learning_rate": 7.87148296494091e-06, + "loss": 0.1179, + "step": 8620 + }, + { + "entropy": 8.934500694274902, + "epoch": 0.8522839628238086, + "mean_token_accuracy": 0.727748692035675, + "num_tokens": 24056752.0, + "step": 8620, + "train/ce_loss": 0.6823888421058655 + }, + { + "epoch": 0.8522839628238086, + "step": 8620, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8522839628238086, + "step": 8620, + "train/total_loss": 0.11511388421058655 + }, + { + "entropy": 9.102067947387695, + "epoch": 0.8523828356733241, + "mean_token_accuracy": 0.704402506351471, + "num_tokens": 24061844.0, + "step": 8621, + "train/ce_loss": 1.138018012046814 + }, + { + "epoch": 0.8523828356733241, + "step": 8621, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8523828356733241, + "step": 8621, + "train/total_loss": 0.17239555716514587 + }, + { + "entropy": 8.598770141601562, + "epoch": 0.8524817085228397, + "mean_token_accuracy": 0.7490774989128113, + "num_tokens": 24067075.0, + "step": 8622, + "train/ce_loss": 0.7021009922027588 + }, + { + "epoch": 0.8524817085228397, + "step": 8622, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8524817085228397, + "step": 8622, + "train/total_loss": 0.12489759922027588 + }, + { + "entropy": 8.624722480773926, + "epoch": 0.8525805813723552, + "mean_token_accuracy": 0.7587034702301025, + "num_tokens": 24072299.0, + "step": 8623, + "train/ce_loss": 0.8237633109092712 + }, + { + "epoch": 0.8525805813723552, + "step": 8623, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8525805813723552, + "step": 8623, + "train/total_loss": 0.12143883109092712 + }, + { + "entropy": 9.024396896362305, + "epoch": 0.8526794542218706, + "mean_token_accuracy": 0.779411792755127, + "num_tokens": 24077336.0, + "step": 8624, + "train/ce_loss": 1.0187090635299683 + }, + { + "epoch": 0.8526794542218706, + "step": 8624, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8526794542218706, + "step": 8624, + "train/total_loss": 0.14483965933322906 + }, + { + "entropy": 8.925800323486328, + "epoch": 0.8527783270713862, + "mean_token_accuracy": 0.7268722653388977, + "num_tokens": 24082508.0, + "step": 8625, + "train/ce_loss": 1.5128824710845947 + }, + { + "epoch": 0.8527783270713862, + "step": 8625, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8527783270713862, + "step": 8625, + "train/total_loss": 0.2098820060491562 + }, + { + "entropy": 8.57644271850586, + "epoch": 0.8528771999209017, + "mean_token_accuracy": 0.7945075631141663, + "num_tokens": 24088036.0, + "step": 8626, + "train/ce_loss": 0.8464938402175903 + }, + { + "epoch": 0.8528771999209017, + "step": 8626, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.8528771999209017, + "step": 8626, + "train/total_loss": 0.10418063402175903 + }, + { + "entropy": 9.06151008605957, + "epoch": 0.8529760727704172, + "mean_token_accuracy": 0.7252747416496277, + "num_tokens": 24093196.0, + "step": 8627, + "train/ce_loss": 1.6834861040115356 + }, + { + "epoch": 0.8529760727704172, + "step": 8627, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8529760727704172, + "step": 8627, + "train/total_loss": 0.22303611040115356 + }, + { + "entropy": 9.575900077819824, + "epoch": 0.8530749456199328, + "mean_token_accuracy": 0.780701756477356, + "num_tokens": 24097951.0, + "step": 8628, + "train/ce_loss": 1.3008063888264587e-06 + }, + { + "epoch": 0.8530749456199328, + "step": 8628, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8530749456199328, + "step": 8628, + "train/total_loss": 0.054687630385160446 + }, + { + "entropy": 8.6054105758667, + "epoch": 0.8531738184694483, + "mean_token_accuracy": 0.695652186870575, + "num_tokens": 24103249.0, + "step": 8629, + "train/ce_loss": 0.9870397448539734 + }, + { + "epoch": 0.8531738184694483, + "step": 8629, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8531738184694483, + "step": 8629, + "train/total_loss": 0.14948523044586182 + }, + { + "entropy": 8.558268547058105, + "epoch": 0.8532726913189638, + "mean_token_accuracy": 0.8256983160972595, + "num_tokens": 24108620.0, + "step": 8630, + "train/ce_loss": 0.5947093367576599 + }, + { + "epoch": 0.8532726913189638, + "step": 8630, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8532726913189638, + "step": 8630, + "train/total_loss": 0.08290843665599823 + }, + { + "entropy": 9.279542922973633, + "epoch": 0.8533715641684794, + "mean_token_accuracy": 0.7066895365715027, + "num_tokens": 24113653.0, + "step": 8631, + "train/ce_loss": 0.893915057182312 + }, + { + "epoch": 0.8533715641684794, + "step": 8631, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.8533715641684794, + "step": 8631, + "train/total_loss": 0.16751649975776672 + }, + { + "entropy": 8.98904037475586, + "epoch": 0.8534704370179949, + "mean_token_accuracy": 0.7587301731109619, + "num_tokens": 24118699.0, + "step": 8632, + "train/ce_loss": 0.8543035387992859 + }, + { + "epoch": 0.8534704370179949, + "step": 8632, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8534704370179949, + "step": 8632, + "train/total_loss": 0.1440241038799286 + }, + { + "entropy": 9.05948257446289, + "epoch": 0.8535693098675103, + "mean_token_accuracy": 0.6973180174827576, + "num_tokens": 24123664.0, + "step": 8633, + "train/ce_loss": 1.6334046125411987 + }, + { + "epoch": 0.8535693098675103, + "step": 8633, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.8535693098675103, + "step": 8633, + "train/total_loss": 0.2453717142343521 + }, + { + "entropy": 8.830093383789062, + "epoch": 0.8536681827170259, + "mean_token_accuracy": 0.7382388710975647, + "num_tokens": 24128879.0, + "step": 8634, + "train/ce_loss": 1.3079240322113037 + }, + { + "epoch": 0.8536681827170259, + "step": 8634, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8536681827170259, + "step": 8634, + "train/total_loss": 0.19719865918159485 + }, + { + "entropy": 8.617467880249023, + "epoch": 0.8537670555665414, + "mean_token_accuracy": 0.7291910648345947, + "num_tokens": 24134170.0, + "step": 8635, + "train/ce_loss": 0.49795177578926086 + }, + { + "epoch": 0.8537670555665414, + "step": 8635, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8537670555665414, + "step": 8635, + "train/total_loss": 0.10448268055915833 + }, + { + "entropy": 8.834949493408203, + "epoch": 0.8538659284160569, + "mean_token_accuracy": 0.7502726316452026, + "num_tokens": 24139579.0, + "step": 8636, + "train/ce_loss": 0.7361787557601929 + }, + { + "epoch": 0.8538659284160569, + "step": 8636, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8538659284160569, + "step": 8636, + "train/total_loss": 0.12049287557601929 + }, + { + "entropy": 8.288768768310547, + "epoch": 0.8539648012655725, + "mean_token_accuracy": 0.7339534759521484, + "num_tokens": 24145189.0, + "step": 8637, + "train/ce_loss": 1.0247743129730225 + }, + { + "epoch": 0.8539648012655725, + "step": 8637, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8539648012655725, + "step": 8637, + "train/total_loss": 0.14935243129730225 + }, + { + "entropy": 8.689209938049316, + "epoch": 0.854063674115088, + "mean_token_accuracy": 0.7563636302947998, + "num_tokens": 24150462.0, + "step": 8638, + "train/ce_loss": 0.6432814002037048 + }, + { + "epoch": 0.854063674115088, + "step": 8638, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.854063674115088, + "step": 8638, + "train/total_loss": 0.1112031415104866 + }, + { + "entropy": 8.928628921508789, + "epoch": 0.8541625469646035, + "mean_token_accuracy": 0.7103004455566406, + "num_tokens": 24155427.0, + "step": 8639, + "train/ce_loss": 0.775364339351654 + }, + { + "epoch": 0.8541625469646035, + "step": 8639, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8541625469646035, + "step": 8639, + "train/total_loss": 0.1009739339351654 + }, + { + "epoch": 0.8542614198141191, + "grad_norm": 0.8426479697227478, + "learning_rate": 7.866538100182961e-06, + "loss": 0.1292, + "step": 8640 + }, + { + "entropy": 8.4351806640625, + "epoch": 0.8542614198141191, + "mean_token_accuracy": 0.8066465258598328, + "num_tokens": 24160886.0, + "step": 8640, + "train/ce_loss": 0.2624002695083618 + }, + { + "epoch": 0.8542614198141191, + "step": 8640, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8542614198141191, + "step": 8640, + "train/total_loss": 0.0653025284409523 + }, + { + "entropy": 9.260761260986328, + "epoch": 0.8543602926636346, + "mean_token_accuracy": 0.7677165269851685, + "num_tokens": 24165991.0, + "step": 8641, + "train/ce_loss": 4.111280986762722e-07 + }, + { + "epoch": 0.8543602926636346, + "step": 8641, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8543602926636346, + "step": 8641, + "train/total_loss": 0.015625040978193283 + }, + { + "entropy": 8.621223449707031, + "epoch": 0.85445916551315, + "mean_token_accuracy": 0.7359307408332825, + "num_tokens": 24171379.0, + "step": 8642, + "train/ce_loss": 0.6777894496917725 + }, + { + "epoch": 0.85445916551315, + "step": 8642, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.85445916551315, + "step": 8642, + "train/total_loss": 0.17715394496917725 + }, + { + "entropy": 9.42569351196289, + "epoch": 0.8545580383626656, + "mean_token_accuracy": 0.7537593841552734, + "num_tokens": 24176280.0, + "step": 8643, + "train/ce_loss": 1.375551462173462 + }, + { + "epoch": 0.8545580383626656, + "step": 8643, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8545580383626656, + "step": 8643, + "train/total_loss": 0.16099265217781067 + }, + { + "entropy": 8.520685195922852, + "epoch": 0.8546569112121811, + "mean_token_accuracy": 0.7365339398384094, + "num_tokens": 24181571.0, + "step": 8644, + "train/ce_loss": 1.1391706466674805 + }, + { + "epoch": 0.8546569112121811, + "step": 8644, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8546569112121811, + "step": 8644, + "train/total_loss": 0.16860456764698029 + }, + { + "entropy": 8.459127426147461, + "epoch": 0.8547557840616966, + "mean_token_accuracy": 0.7465968728065491, + "num_tokens": 24187035.0, + "step": 8645, + "train/ce_loss": 0.6572544574737549 + }, + { + "epoch": 0.8547557840616966, + "step": 8645, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8547557840616966, + "step": 8645, + "train/total_loss": 0.08916294574737549 + }, + { + "entropy": 8.707165718078613, + "epoch": 0.8548546569112122, + "mean_token_accuracy": 0.7386519908905029, + "num_tokens": 24192243.0, + "step": 8646, + "train/ce_loss": 0.7648477554321289 + }, + { + "epoch": 0.8548546569112122, + "step": 8646, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8548546569112122, + "step": 8646, + "train/total_loss": 0.11164102703332901 + }, + { + "entropy": 8.758549690246582, + "epoch": 0.8549535297607277, + "mean_token_accuracy": 0.722908079624176, + "num_tokens": 24197457.0, + "step": 8647, + "train/ce_loss": 0.8366193771362305 + }, + { + "epoch": 0.8549535297607277, + "step": 8647, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8549535297607277, + "step": 8647, + "train/total_loss": 0.14616194367408752 + }, + { + "entropy": 8.51551628112793, + "epoch": 0.8550524026102432, + "mean_token_accuracy": 0.6890308856964111, + "num_tokens": 24202863.0, + "step": 8648, + "train/ce_loss": 1.5394750833511353 + }, + { + "epoch": 0.8550524026102432, + "step": 8648, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8550524026102432, + "step": 8648, + "train/total_loss": 0.19301001727581024 + }, + { + "entropy": 8.53546142578125, + "epoch": 0.8551512754597588, + "mean_token_accuracy": 0.703592836856842, + "num_tokens": 24208316.0, + "step": 8649, + "train/ce_loss": 0.7742993235588074 + }, + { + "epoch": 0.8551512754597588, + "step": 8649, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8551512754597588, + "step": 8649, + "train/total_loss": 0.11258618533611298 + }, + { + "entropy": 8.493489265441895, + "epoch": 0.8552501483092743, + "mean_token_accuracy": 0.7212954163551331, + "num_tokens": 24213864.0, + "step": 8650, + "train/ce_loss": 0.537199079990387 + }, + { + "epoch": 0.8552501483092743, + "step": 8650, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8552501483092743, + "step": 8650, + "train/total_loss": 0.0810636579990387 + }, + { + "entropy": 8.240386009216309, + "epoch": 0.8553490211587897, + "mean_token_accuracy": 0.748680055141449, + "num_tokens": 24219362.0, + "step": 8651, + "train/ce_loss": 1.250221610069275 + }, + { + "epoch": 0.8553490211587897, + "step": 8651, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8553490211587897, + "step": 8651, + "train/total_loss": 0.17189715802669525 + }, + { + "entropy": 9.68118667602539, + "epoch": 0.8554478940083053, + "mean_token_accuracy": 0.6902654767036438, + "num_tokens": 24224195.0, + "step": 8652, + "train/ce_loss": 1.649112343788147 + }, + { + "epoch": 0.8554478940083053, + "step": 8652, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.8554478940083053, + "step": 8652, + "train/total_loss": 0.24303624033927917 + }, + { + "entropy": 9.208187103271484, + "epoch": 0.8555467668578208, + "mean_token_accuracy": 0.7402597665786743, + "num_tokens": 24229169.0, + "step": 8653, + "train/ce_loss": 0.9814550280570984 + }, + { + "epoch": 0.8555467668578208, + "step": 8653, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8555467668578208, + "step": 8653, + "train/total_loss": 0.1254892647266388 + }, + { + "entropy": 8.391998291015625, + "epoch": 0.8556456397073363, + "mean_token_accuracy": 0.7403433322906494, + "num_tokens": 24234544.0, + "step": 8654, + "train/ce_loss": 1.4496474266052246 + }, + { + "epoch": 0.8556456397073363, + "step": 8654, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8556456397073363, + "step": 8654, + "train/total_loss": 0.20355848968029022 + }, + { + "entropy": 9.0525484085083, + "epoch": 0.8557445125568519, + "mean_token_accuracy": 0.789559543132782, + "num_tokens": 24239631.0, + "step": 8655, + "train/ce_loss": 1.1713453531265259 + }, + { + "epoch": 0.8557445125568519, + "step": 8655, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.8557445125568519, + "step": 8655, + "train/total_loss": 0.13666579127311707 + }, + { + "entropy": 9.150043487548828, + "epoch": 0.8558433854063674, + "mean_token_accuracy": 0.7431610822677612, + "num_tokens": 24244700.0, + "step": 8656, + "train/ce_loss": 2.7685500754159875e-06 + }, + { + "epoch": 0.8558433854063674, + "step": 8656, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8558433854063674, + "step": 8656, + "train/total_loss": 0.06640652567148209 + }, + { + "entropy": 8.428903579711914, + "epoch": 0.8559422582558829, + "mean_token_accuracy": 0.7721261382102966, + "num_tokens": 24250134.0, + "step": 8657, + "train/ce_loss": 0.8671013116836548 + }, + { + "epoch": 0.8559422582558829, + "step": 8657, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8559422582558829, + "step": 8657, + "train/total_loss": 0.153116375207901 + }, + { + "entropy": 9.028989791870117, + "epoch": 0.8560411311053985, + "mean_token_accuracy": 0.7668308615684509, + "num_tokens": 24255237.0, + "step": 8658, + "train/ce_loss": 0.7063049077987671 + }, + { + "epoch": 0.8560411311053985, + "step": 8658, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.8560411311053985, + "step": 8658, + "train/total_loss": 0.09016174077987671 + }, + { + "entropy": 8.776468276977539, + "epoch": 0.856140003954914, + "mean_token_accuracy": 0.7960928082466125, + "num_tokens": 24260669.0, + "step": 8659, + "train/ce_loss": 0.8988680839538574 + }, + { + "epoch": 0.856140003954914, + "step": 8659, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.856140003954914, + "step": 8659, + "train/total_loss": 0.10551180690526962 + }, + { + "epoch": 0.8562388768044294, + "grad_norm": 0.543863832950592, + "learning_rate": 7.861593235425012e-06, + "loss": 0.1278, + "step": 8660 + }, + { + "entropy": 9.166704177856445, + "epoch": 0.8562388768044294, + "mean_token_accuracy": 0.7692307829856873, + "num_tokens": 24265760.0, + "step": 8660, + "train/ce_loss": 1.2796474695205688 + }, + { + "epoch": 0.8562388768044294, + "step": 8660, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8562388768044294, + "step": 8660, + "train/total_loss": 0.19827724993228912 + }, + { + "entropy": 8.707932472229004, + "epoch": 0.856337749653945, + "mean_token_accuracy": 0.7459584474563599, + "num_tokens": 24271079.0, + "step": 8661, + "train/ce_loss": 0.7910167574882507 + }, + { + "epoch": 0.856337749653945, + "step": 8661, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.856337749653945, + "step": 8661, + "train/total_loss": 0.19238293170928955 + }, + { + "entropy": 8.982584953308105, + "epoch": 0.8564366225034605, + "mean_token_accuracy": 0.7586206793785095, + "num_tokens": 24275989.0, + "step": 8662, + "train/ce_loss": 1.5561401844024658 + }, + { + "epoch": 0.8564366225034605, + "step": 8662, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8564366225034605, + "step": 8662, + "train/total_loss": 0.21030151844024658 + }, + { + "entropy": 8.329089164733887, + "epoch": 0.856535495352976, + "mean_token_accuracy": 0.7138493061065674, + "num_tokens": 24281442.0, + "step": 8663, + "train/ce_loss": 0.9314218759536743 + }, + { + "epoch": 0.856535495352976, + "step": 8663, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.856535495352976, + "step": 8663, + "train/total_loss": 0.14001718163490295 + }, + { + "entropy": 8.926669120788574, + "epoch": 0.8566343682024916, + "mean_token_accuracy": 0.7155612111091614, + "num_tokens": 24286662.0, + "step": 8664, + "train/ce_loss": 0.7832470536231995 + }, + { + "epoch": 0.8566343682024916, + "step": 8664, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.8566343682024916, + "step": 8664, + "train/total_loss": 0.15254345536231995 + }, + { + "entropy": 8.831933975219727, + "epoch": 0.8567332410520071, + "mean_token_accuracy": 0.7798560857772827, + "num_tokens": 24291849.0, + "step": 8665, + "train/ce_loss": 2.490518511422124e-07 + }, + { + "epoch": 0.8567332410520071, + "step": 8665, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.8567332410520071, + "step": 8665, + "train/total_loss": 0.01953127421438694 + }, + { + "entropy": 8.320831298828125, + "epoch": 0.8568321139015227, + "mean_token_accuracy": 0.7524752616882324, + "num_tokens": 24297357.0, + "step": 8666, + "train/ce_loss": 0.5641217827796936 + }, + { + "epoch": 0.8568321139015227, + "step": 8666, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.8568321139015227, + "step": 8666, + "train/total_loss": 0.06813092529773712 + }, + { + "entropy": 8.73794937133789, + "epoch": 0.8569309867510382, + "mean_token_accuracy": 0.7379972338676453, + "num_tokens": 24302554.0, + "step": 8667, + "train/ce_loss": 1.0336923599243164 + }, + { + "epoch": 0.8569309867510382, + "step": 8667, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.8569309867510382, + "step": 8667, + "train/total_loss": 0.18930673599243164 + }, + { + "entropy": 9.101522445678711, + "epoch": 0.8570298596005537, + "mean_token_accuracy": 0.7204301357269287, + "num_tokens": 24307562.0, + "step": 8668, + "train/ce_loss": 1.9060012102127075 + }, + { + "epoch": 0.8570298596005537, + "step": 8668, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8570298596005537, + "step": 8668, + "train/total_loss": 0.25310012698173523 + }, + { + "entropy": 9.16499137878418, + "epoch": 0.8571287324500693, + "mean_token_accuracy": 0.741605818271637, + "num_tokens": 24312723.0, + "step": 8669, + "train/ce_loss": 2.9166920967327314e-07 + }, + { + "epoch": 0.8571287324500693, + "step": 8669, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.8571287324500693, + "step": 8669, + "train/total_loss": 0.019531279802322388 + }, + { + "entropy": 8.343894958496094, + "epoch": 0.8572276052995847, + "mean_token_accuracy": 0.7389557957649231, + "num_tokens": 24317914.0, + "step": 8670, + "train/ce_loss": 1.0072234869003296 + }, + { + "epoch": 0.8572276052995847, + "step": 8670, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8572276052995847, + "step": 8670, + "train/total_loss": 0.15150359272956848 + }, + { + "entropy": 8.694144248962402, + "epoch": 0.8573264781491002, + "mean_token_accuracy": 0.739386796951294, + "num_tokens": 24323244.0, + "step": 8671, + "train/ce_loss": 0.7169451713562012 + }, + { + "epoch": 0.8573264781491002, + "step": 8671, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.8573264781491002, + "step": 8671, + "train/total_loss": 0.1576320230960846 + }, + { + "entropy": 8.642142295837402, + "epoch": 0.8574253509986158, + "mean_token_accuracy": 0.7232142686843872, + "num_tokens": 24328527.0, + "step": 8672, + "train/ce_loss": 1.4176908731460571 + }, + { + "epoch": 0.8574253509986158, + "step": 8672, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8574253509986158, + "step": 8672, + "train/total_loss": 0.20426909625530243 + }, + { + "entropy": 8.432353973388672, + "epoch": 0.8575242238481313, + "mean_token_accuracy": 0.7180179953575134, + "num_tokens": 24334127.0, + "step": 8673, + "train/ce_loss": 0.810718834400177 + }, + { + "epoch": 0.8575242238481313, + "step": 8673, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8575242238481313, + "step": 8673, + "train/total_loss": 0.1396656334400177 + }, + { + "entropy": 8.835989952087402, + "epoch": 0.8576230966976468, + "mean_token_accuracy": 0.7057521939277649, + "num_tokens": 24339514.0, + "step": 8674, + "train/ce_loss": 1.4376308917999268 + }, + { + "epoch": 0.8576230966976468, + "step": 8674, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8576230966976468, + "step": 8674, + "train/total_loss": 0.21407559514045715 + }, + { + "entropy": 8.778553009033203, + "epoch": 0.8577219695471624, + "mean_token_accuracy": 0.7320512533187866, + "num_tokens": 24344730.0, + "step": 8675, + "train/ce_loss": 0.897465169429779 + }, + { + "epoch": 0.8577219695471624, + "step": 8675, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8577219695471624, + "step": 8675, + "train/total_loss": 0.11318401992321014 + }, + { + "entropy": 8.590299606323242, + "epoch": 0.8578208423966779, + "mean_token_accuracy": 0.7136150002479553, + "num_tokens": 24350023.0, + "step": 8676, + "train/ce_loss": 1.29165780544281 + }, + { + "epoch": 0.8578208423966779, + "step": 8676, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8578208423966779, + "step": 8676, + "train/total_loss": 0.17994703352451324 + }, + { + "entropy": 8.699274063110352, + "epoch": 0.8579197152461934, + "mean_token_accuracy": 0.7906976938247681, + "num_tokens": 24355356.0, + "step": 8677, + "train/ce_loss": 0.42230668663978577 + }, + { + "epoch": 0.8579197152461934, + "step": 8677, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8579197152461934, + "step": 8677, + "train/total_loss": 0.06957441568374634 + }, + { + "entropy": 9.221429824829102, + "epoch": 0.858018588095709, + "mean_token_accuracy": 0.7283018827438354, + "num_tokens": 24360328.0, + "step": 8678, + "train/ce_loss": 0.7309057116508484 + }, + { + "epoch": 0.858018588095709, + "step": 8678, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.858018588095709, + "step": 8678, + "train/total_loss": 0.1277780830860138 + }, + { + "entropy": 8.58169937133789, + "epoch": 0.8581174609452245, + "mean_token_accuracy": 0.7274701595306396, + "num_tokens": 24365720.0, + "step": 8679, + "train/ce_loss": 0.5765038132667542 + }, + { + "epoch": 0.8581174609452245, + "step": 8679, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8581174609452245, + "step": 8679, + "train/total_loss": 0.08108788728713989 + }, + { + "epoch": 0.8582163337947399, + "grad_norm": 0.5956012606620789, + "learning_rate": 7.856648370667062e-06, + "loss": 0.1323, + "step": 8680 + }, + { + "entropy": 9.144123077392578, + "epoch": 0.8582163337947399, + "mean_token_accuracy": 0.7272727489471436, + "num_tokens": 24370787.0, + "step": 8680, + "train/ce_loss": 0.9091194272041321 + }, + { + "epoch": 0.8582163337947399, + "step": 8680, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8582163337947399, + "step": 8680, + "train/total_loss": 0.12606820464134216 + }, + { + "entropy": 8.755536079406738, + "epoch": 0.8583152066442555, + "mean_token_accuracy": 0.7513020634651184, + "num_tokens": 24376023.0, + "step": 8681, + "train/ce_loss": 1.099310278892517 + }, + { + "epoch": 0.8583152066442555, + "step": 8681, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8583152066442555, + "step": 8681, + "train/total_loss": 0.16071227192878723 + }, + { + "entropy": 8.497855186462402, + "epoch": 0.858414079493771, + "mean_token_accuracy": 0.7962962985038757, + "num_tokens": 24381515.0, + "step": 8682, + "train/ce_loss": 0.759404182434082 + }, + { + "epoch": 0.858414079493771, + "step": 8682, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.858414079493771, + "step": 8682, + "train/total_loss": 0.13062793016433716 + }, + { + "entropy": 8.67982292175293, + "epoch": 0.8585129523432865, + "mean_token_accuracy": 0.7808219194412231, + "num_tokens": 24386825.0, + "step": 8683, + "train/ce_loss": 0.6889883875846863 + }, + { + "epoch": 0.8585129523432865, + "step": 8683, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8585129523432865, + "step": 8683, + "train/total_loss": 0.09624259173870087 + }, + { + "entropy": 8.736405372619629, + "epoch": 0.8586118251928021, + "mean_token_accuracy": 0.7019950151443481, + "num_tokens": 24392090.0, + "step": 8684, + "train/ce_loss": 1.2038031816482544 + }, + { + "epoch": 0.8586118251928021, + "step": 8684, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.8586118251928021, + "step": 8684, + "train/total_loss": 0.19850531220436096 + }, + { + "entropy": 9.02114486694336, + "epoch": 0.8587106980423176, + "mean_token_accuracy": 0.7817531228065491, + "num_tokens": 24397094.0, + "step": 8685, + "train/ce_loss": 1.209519863128662 + }, + { + "epoch": 0.8587106980423176, + "step": 8685, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8587106980423176, + "step": 8685, + "train/total_loss": 0.16001448035240173 + }, + { + "entropy": 9.204957008361816, + "epoch": 0.8588095708918331, + "mean_token_accuracy": 0.8021582961082458, + "num_tokens": 24402084.0, + "step": 8686, + "train/ce_loss": 1.132004285864241e-06 + }, + { + "epoch": 0.8588095708918331, + "step": 8686, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8588095708918331, + "step": 8686, + "train/total_loss": 0.015625113621354103 + }, + { + "entropy": 9.330758094787598, + "epoch": 0.8589084437413487, + "mean_token_accuracy": 0.8086304068565369, + "num_tokens": 24407084.0, + "step": 8687, + "train/ce_loss": 0.6505013704299927 + }, + { + "epoch": 0.8589084437413487, + "step": 8687, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.8589084437413487, + "step": 8687, + "train/total_loss": 0.0845813900232315 + }, + { + "entropy": 8.71497631072998, + "epoch": 0.8590073165908642, + "mean_token_accuracy": 0.7440318465232849, + "num_tokens": 24412280.0, + "step": 8688, + "train/ce_loss": 0.48954129219055176 + }, + { + "epoch": 0.8590073165908642, + "step": 8688, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8590073165908642, + "step": 8688, + "train/total_loss": 0.11145412921905518 + }, + { + "entropy": 8.394184112548828, + "epoch": 0.8591061894403796, + "mean_token_accuracy": 0.7381465435028076, + "num_tokens": 24417632.0, + "step": 8689, + "train/ce_loss": 0.6080519556999207 + }, + { + "epoch": 0.8591061894403796, + "step": 8689, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8591061894403796, + "step": 8689, + "train/total_loss": 0.11158645153045654 + }, + { + "entropy": 9.146610260009766, + "epoch": 0.8592050622898952, + "mean_token_accuracy": 0.7051281929016113, + "num_tokens": 24422677.0, + "step": 8690, + "train/ce_loss": 1.0803229808807373 + }, + { + "epoch": 0.8592050622898952, + "step": 8690, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8592050622898952, + "step": 8690, + "train/total_loss": 0.17443855106830597 + }, + { + "entropy": 8.834553718566895, + "epoch": 0.8593039351394107, + "mean_token_accuracy": 0.7250900268554688, + "num_tokens": 24427992.0, + "step": 8691, + "train/ce_loss": 1.0625476837158203 + }, + { + "epoch": 0.8593039351394107, + "step": 8691, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8593039351394107, + "step": 8691, + "train/total_loss": 0.16875477135181427 + }, + { + "entropy": 9.251741409301758, + "epoch": 0.8594028079889262, + "mean_token_accuracy": 0.7544169425964355, + "num_tokens": 24432969.0, + "step": 8692, + "train/ce_loss": 0.8937926888465881 + }, + { + "epoch": 0.8594028079889262, + "step": 8692, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8594028079889262, + "step": 8692, + "train/total_loss": 0.15187928080558777 + }, + { + "entropy": 9.002845764160156, + "epoch": 0.8595016808384418, + "mean_token_accuracy": 0.6908212304115295, + "num_tokens": 24437999.0, + "step": 8693, + "train/ce_loss": 1.8824050426483154 + }, + { + "epoch": 0.8595016808384418, + "step": 8693, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.8595016808384418, + "step": 8693, + "train/total_loss": 0.28589677810668945 + }, + { + "entropy": 9.49638557434082, + "epoch": 0.8596005536879573, + "mean_token_accuracy": 0.6689655184745789, + "num_tokens": 24442894.0, + "step": 8694, + "train/ce_loss": 3.628325373483676e-07 + }, + { + "epoch": 0.8596005536879573, + "step": 8694, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.8596005536879573, + "step": 8694, + "train/total_loss": 0.01171878632158041 + }, + { + "entropy": 8.463619232177734, + "epoch": 0.8596994265374728, + "mean_token_accuracy": 0.7720670104026794, + "num_tokens": 24448289.0, + "step": 8695, + "train/ce_loss": 0.5734670162200928 + }, + { + "epoch": 0.8596994265374728, + "step": 8695, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8596994265374728, + "step": 8695, + "train/total_loss": 0.08859670162200928 + }, + { + "entropy": 8.894155502319336, + "epoch": 0.8597982993869884, + "mean_token_accuracy": 0.7293127775192261, + "num_tokens": 24453448.0, + "step": 8696, + "train/ce_loss": 0.939193069934845 + }, + { + "epoch": 0.8597982993869884, + "step": 8696, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8597982993869884, + "step": 8696, + "train/total_loss": 0.1564193069934845 + }, + { + "entropy": 8.99164867401123, + "epoch": 0.8598971722365039, + "mean_token_accuracy": 0.7431906461715698, + "num_tokens": 24458622.0, + "step": 8697, + "train/ce_loss": 1.413056492805481 + }, + { + "epoch": 0.8598971722365039, + "step": 8697, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8598971722365039, + "step": 8697, + "train/total_loss": 0.18818065524101257 + }, + { + "entropy": 8.630083084106445, + "epoch": 0.8599960450860193, + "mean_token_accuracy": 0.720200777053833, + "num_tokens": 24463854.0, + "step": 8698, + "train/ce_loss": 0.9030309915542603 + }, + { + "epoch": 0.8599960450860193, + "step": 8698, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.8599960450860193, + "step": 8698, + "train/total_loss": 0.19577184319496155 + }, + { + "entropy": 8.450031280517578, + "epoch": 0.8600949179355349, + "mean_token_accuracy": 0.7494736909866333, + "num_tokens": 24469293.0, + "step": 8699, + "train/ce_loss": 0.5924932956695557 + }, + { + "epoch": 0.8600949179355349, + "step": 8699, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8600949179355349, + "step": 8699, + "train/total_loss": 0.08659307658672333 + }, + { + "epoch": 0.8601937907850504, + "grad_norm": 0.5866478085517883, + "learning_rate": 7.851703505909115e-06, + "loss": 0.1359, + "step": 8700 + }, + { + "entropy": 8.636362075805664, + "epoch": 0.8601937907850504, + "mean_token_accuracy": 0.7581620216369629, + "num_tokens": 24474585.0, + "step": 8700, + "train/ce_loss": 1.01895272731781 + }, + { + "epoch": 0.8601937907850504, + "step": 8700, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.8601937907850504, + "step": 8700, + "train/total_loss": 0.180020272731781 + }, + { + "entropy": 9.017528533935547, + "epoch": 0.8602926636345659, + "mean_token_accuracy": 0.7680412530899048, + "num_tokens": 24480023.0, + "step": 8701, + "train/ce_loss": 1.0584609508514404 + }, + { + "epoch": 0.8602926636345659, + "step": 8701, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8602926636345659, + "step": 8701, + "train/total_loss": 0.176158607006073 + }, + { + "entropy": 8.56066608428955, + "epoch": 0.8603915364840815, + "mean_token_accuracy": 0.7193585634231567, + "num_tokens": 24485343.0, + "step": 8702, + "train/ce_loss": 1.0047358274459839 + }, + { + "epoch": 0.8603915364840815, + "step": 8702, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8603915364840815, + "step": 8702, + "train/total_loss": 0.1629735827445984 + }, + { + "entropy": 8.613938331604004, + "epoch": 0.860490409333597, + "mean_token_accuracy": 0.7861557602882385, + "num_tokens": 24490596.0, + "step": 8703, + "train/ce_loss": 0.46136343479156494 + }, + { + "epoch": 0.860490409333597, + "step": 8703, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.860490409333597, + "step": 8703, + "train/total_loss": 0.10082384943962097 + }, + { + "entropy": 8.38138484954834, + "epoch": 0.8605892821831125, + "mean_token_accuracy": 0.7080609798431396, + "num_tokens": 24495944.0, + "step": 8704, + "train/ce_loss": 1.1422832012176514 + }, + { + "epoch": 0.8605892821831125, + "step": 8704, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8605892821831125, + "step": 8704, + "train/total_loss": 0.14938457310199738 + }, + { + "entropy": 8.48853588104248, + "epoch": 0.8606881550326281, + "mean_token_accuracy": 0.7091295123100281, + "num_tokens": 24501366.0, + "step": 8705, + "train/ce_loss": 0.797477662563324 + }, + { + "epoch": 0.8606881550326281, + "step": 8705, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8606881550326281, + "step": 8705, + "train/total_loss": 0.1149040162563324 + }, + { + "entropy": 9.005110740661621, + "epoch": 0.8607870278821436, + "mean_token_accuracy": 0.7484076619148254, + "num_tokens": 24506428.0, + "step": 8706, + "train/ce_loss": 1.3149898052215576 + }, + { + "epoch": 0.8607870278821436, + "step": 8706, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8607870278821436, + "step": 8706, + "train/total_loss": 0.17446772754192352 + }, + { + "entropy": 8.744300842285156, + "epoch": 0.860885900731659, + "mean_token_accuracy": 0.8268733620643616, + "num_tokens": 24511674.0, + "step": 8707, + "train/ce_loss": 0.35784006118774414 + }, + { + "epoch": 0.860885900731659, + "step": 8707, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.860885900731659, + "step": 8707, + "train/total_loss": 0.07484650611877441 + }, + { + "entropy": 8.888997077941895, + "epoch": 0.8609847735811746, + "mean_token_accuracy": 0.72541743516922, + "num_tokens": 24516650.0, + "step": 8708, + "train/ce_loss": 1.081268310546875 + }, + { + "epoch": 0.8609847735811746, + "step": 8708, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8609847735811746, + "step": 8708, + "train/total_loss": 0.15109558403491974 + }, + { + "entropy": 9.171119689941406, + "epoch": 0.8610836464306901, + "mean_token_accuracy": 0.7603550553321838, + "num_tokens": 24521774.0, + "step": 8709, + "train/ce_loss": 0.6120818257331848 + }, + { + "epoch": 0.8610836464306901, + "step": 8709, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.8610836464306901, + "step": 8709, + "train/total_loss": 0.17058318853378296 + }, + { + "entropy": 8.389473915100098, + "epoch": 0.8611825192802056, + "mean_token_accuracy": 0.7079002261161804, + "num_tokens": 24527231.0, + "step": 8710, + "train/ce_loss": 1.2086772918701172 + }, + { + "epoch": 0.8611825192802056, + "step": 8710, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8611825192802056, + "step": 8710, + "train/total_loss": 0.17946147918701172 + }, + { + "entropy": 8.300840377807617, + "epoch": 0.8612813921297212, + "mean_token_accuracy": 0.72667396068573, + "num_tokens": 24532651.0, + "step": 8711, + "train/ce_loss": 0.726784884929657 + }, + { + "epoch": 0.8612813921297212, + "step": 8711, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.8612813921297212, + "step": 8711, + "train/total_loss": 0.14689724147319794 + }, + { + "entropy": 8.55239486694336, + "epoch": 0.8613802649792367, + "mean_token_accuracy": 0.7553735971450806, + "num_tokens": 24538110.0, + "step": 8712, + "train/ce_loss": 0.4948091506958008 + }, + { + "epoch": 0.8613802649792367, + "step": 8712, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8613802649792367, + "step": 8712, + "train/total_loss": 0.07291841506958008 + }, + { + "entropy": 9.009099960327148, + "epoch": 0.8614791378287522, + "mean_token_accuracy": 0.7603305578231812, + "num_tokens": 24543177.0, + "step": 8713, + "train/ce_loss": 1.6545424461364746 + }, + { + "epoch": 0.8614791378287522, + "step": 8713, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8614791378287522, + "step": 8713, + "train/total_loss": 0.21623550355434418 + }, + { + "entropy": 8.838738441467285, + "epoch": 0.8615780106782678, + "mean_token_accuracy": 0.6797385811805725, + "num_tokens": 24548427.0, + "step": 8714, + "train/ce_loss": 0.9460775256156921 + }, + { + "epoch": 0.8615780106782678, + "step": 8714, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8615780106782678, + "step": 8714, + "train/total_loss": 0.15320150554180145 + }, + { + "entropy": 9.11565113067627, + "epoch": 0.8616768835277833, + "mean_token_accuracy": 0.7608370780944824, + "num_tokens": 24553517.0, + "step": 8715, + "train/ce_loss": 1.4782397747039795 + }, + { + "epoch": 0.8616768835277833, + "step": 8715, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8616768835277833, + "step": 8715, + "train/total_loss": 0.2103239744901657 + }, + { + "entropy": 8.60834789276123, + "epoch": 0.8617757563772988, + "mean_token_accuracy": 0.6983606815338135, + "num_tokens": 24558912.0, + "step": 8716, + "train/ce_loss": 1.4724786281585693 + }, + { + "epoch": 0.8617757563772988, + "step": 8716, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8617757563772988, + "step": 8716, + "train/total_loss": 0.17459161579608917 + }, + { + "entropy": 8.75145149230957, + "epoch": 0.8618746292268143, + "mean_token_accuracy": 0.7423133254051208, + "num_tokens": 24564083.0, + "step": 8717, + "train/ce_loss": 0.9547011256217957 + }, + { + "epoch": 0.8618746292268143, + "step": 8717, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.8618746292268143, + "step": 8717, + "train/total_loss": 0.2087513655424118 + }, + { + "entropy": 8.839400291442871, + "epoch": 0.8619735020763298, + "mean_token_accuracy": 0.7345399856567383, + "num_tokens": 24569171.0, + "step": 8718, + "train/ce_loss": 1.3656909465789795 + }, + { + "epoch": 0.8619735020763298, + "step": 8718, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8619735020763298, + "step": 8718, + "train/total_loss": 0.1951628476381302 + }, + { + "entropy": 8.544647216796875, + "epoch": 0.8620723749258453, + "mean_token_accuracy": 0.7559462189674377, + "num_tokens": 24574615.0, + "step": 8719, + "train/ce_loss": 0.7375919818878174 + }, + { + "epoch": 0.8620723749258453, + "step": 8719, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.8620723749258453, + "step": 8719, + "train/total_loss": 0.17141544818878174 + }, + { + "epoch": 0.8621712477753609, + "grad_norm": 0.6967912912368774, + "learning_rate": 7.846758641151165e-06, + "loss": 0.1358, + "step": 8720 + }, + { + "entropy": 8.95772933959961, + "epoch": 0.8621712477753609, + "mean_token_accuracy": 0.6918518543243408, + "num_tokens": 24579691.0, + "step": 8720, + "train/ce_loss": 1.047044277191162 + }, + { + "epoch": 0.8621712477753609, + "step": 8720, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.8621712477753609, + "step": 8720, + "train/total_loss": 0.18282943964004517 + }, + { + "entropy": 8.738275527954102, + "epoch": 0.8622701206248764, + "mean_token_accuracy": 0.676980197429657, + "num_tokens": 24584963.0, + "step": 8721, + "train/ce_loss": 0.7576073408126831 + }, + { + "epoch": 0.8622701206248764, + "step": 8721, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8622701206248764, + "step": 8721, + "train/total_loss": 0.13044823706150055 + }, + { + "entropy": 8.73996353149414, + "epoch": 0.8623689934743919, + "mean_token_accuracy": 0.7284533381462097, + "num_tokens": 24590246.0, + "step": 8722, + "train/ce_loss": 0.6583800315856934 + }, + { + "epoch": 0.8623689934743919, + "step": 8722, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8623689934743919, + "step": 8722, + "train/total_loss": 0.10490050166845322 + }, + { + "entropy": 8.506805419921875, + "epoch": 0.8624678663239075, + "mean_token_accuracy": 0.7607433199882507, + "num_tokens": 24595734.0, + "step": 8723, + "train/ce_loss": 0.6425955891609192 + }, + { + "epoch": 0.8624678663239075, + "step": 8723, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8624678663239075, + "step": 8723, + "train/total_loss": 0.13457205891609192 + }, + { + "entropy": 8.412109375, + "epoch": 0.862566739173423, + "mean_token_accuracy": 0.6770114898681641, + "num_tokens": 24601053.0, + "step": 8724, + "train/ce_loss": 1.2470697164535522 + }, + { + "epoch": 0.862566739173423, + "step": 8724, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.862566739173423, + "step": 8724, + "train/total_loss": 0.18330073356628418 + }, + { + "entropy": 8.725008010864258, + "epoch": 0.8626656120229385, + "mean_token_accuracy": 0.7747875452041626, + "num_tokens": 24606234.0, + "step": 8725, + "train/ce_loss": 1.2740275859832764 + }, + { + "epoch": 0.8626656120229385, + "step": 8725, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8626656120229385, + "step": 8725, + "train/total_loss": 0.19380901753902435 + }, + { + "entropy": 8.509666442871094, + "epoch": 0.862764484872454, + "mean_token_accuracy": 0.680190920829773, + "num_tokens": 24611590.0, + "step": 8726, + "train/ce_loss": 0.9858887791633606 + }, + { + "epoch": 0.862764484872454, + "step": 8726, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.862764484872454, + "step": 8726, + "train/total_loss": 0.16890138387680054 + }, + { + "entropy": 8.78453254699707, + "epoch": 0.8628633577219695, + "mean_token_accuracy": 0.7397769689559937, + "num_tokens": 24616850.0, + "step": 8727, + "train/ce_loss": 0.9157573580741882 + }, + { + "epoch": 0.8628633577219695, + "step": 8727, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8628633577219695, + "step": 8727, + "train/total_loss": 0.1384507417678833 + }, + { + "entropy": 9.509747505187988, + "epoch": 0.862962230571485, + "mean_token_accuracy": 0.6687116622924805, + "num_tokens": 24621756.0, + "step": 8728, + "train/ce_loss": 2.419189929962158 + }, + { + "epoch": 0.862962230571485, + "step": 8728, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.862962230571485, + "step": 8728, + "train/total_loss": 0.26926273107528687 + }, + { + "entropy": 8.149187088012695, + "epoch": 0.8630611034210006, + "mean_token_accuracy": 0.7740345001220703, + "num_tokens": 24627458.0, + "step": 8729, + "train/ce_loss": 0.5720027089118958 + }, + { + "epoch": 0.8630611034210006, + "step": 8729, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8630611034210006, + "step": 8729, + "train/total_loss": 0.09626276791095734 + }, + { + "entropy": 9.031068801879883, + "epoch": 0.8631599762705161, + "mean_token_accuracy": 0.7213114500045776, + "num_tokens": 24632551.0, + "step": 8730, + "train/ce_loss": 1.2604602575302124 + }, + { + "epoch": 0.8631599762705161, + "step": 8730, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8631599762705161, + "step": 8730, + "train/total_loss": 0.19635853171348572 + }, + { + "entropy": 8.85468864440918, + "epoch": 0.8632588491200316, + "mean_token_accuracy": 0.7720588445663452, + "num_tokens": 24637645.0, + "step": 8731, + "train/ce_loss": 1.0503281354904175 + }, + { + "epoch": 0.8632588491200316, + "step": 8731, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.8632588491200316, + "step": 8731, + "train/total_loss": 0.12456406652927399 + }, + { + "entropy": 9.003961563110352, + "epoch": 0.8633577219695472, + "mean_token_accuracy": 0.8041431307792664, + "num_tokens": 24642631.0, + "step": 8732, + "train/ce_loss": 0.6411572098731995 + }, + { + "epoch": 0.8633577219695472, + "step": 8732, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8633577219695472, + "step": 8732, + "train/total_loss": 0.0797407254576683 + }, + { + "entropy": 8.73344898223877, + "epoch": 0.8634565948190627, + "mean_token_accuracy": 0.8298969268798828, + "num_tokens": 24647867.0, + "step": 8733, + "train/ce_loss": 0.9211376905441284 + }, + { + "epoch": 0.8634565948190627, + "step": 8733, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8634565948190627, + "step": 8733, + "train/total_loss": 0.14680126309394836 + }, + { + "entropy": 8.371345520019531, + "epoch": 0.8635554676685782, + "mean_token_accuracy": 0.7224669456481934, + "num_tokens": 24653284.0, + "step": 8734, + "train/ce_loss": 1.3611029386520386 + }, + { + "epoch": 0.8635554676685782, + "step": 8734, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.8635554676685782, + "step": 8734, + "train/total_loss": 0.21423529088497162 + }, + { + "entropy": 8.128787994384766, + "epoch": 0.8636543405180938, + "mean_token_accuracy": 0.8126079440116882, + "num_tokens": 24658967.0, + "step": 8735, + "train/ce_loss": 0.3554665446281433 + }, + { + "epoch": 0.8636543405180938, + "step": 8735, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8636543405180938, + "step": 8735, + "train/total_loss": 0.05117165669798851 + }, + { + "entropy": 8.596895217895508, + "epoch": 0.8637532133676092, + "mean_token_accuracy": 0.750348687171936, + "num_tokens": 24664138.0, + "step": 8736, + "train/ce_loss": 0.7949276566505432 + }, + { + "epoch": 0.8637532133676092, + "step": 8736, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8637532133676092, + "step": 8736, + "train/total_loss": 0.13418027758598328 + }, + { + "entropy": 8.983320236206055, + "epoch": 0.8638520862171247, + "mean_token_accuracy": 0.7711864113807678, + "num_tokens": 24669276.0, + "step": 8737, + "train/ce_loss": 0.9564729928970337 + }, + { + "epoch": 0.8638520862171247, + "step": 8737, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.8638520862171247, + "step": 8737, + "train/total_loss": 0.21283480525016785 + }, + { + "entropy": 8.86839485168457, + "epoch": 0.8639509590666403, + "mean_token_accuracy": 0.8213740587234497, + "num_tokens": 24674384.0, + "step": 8738, + "train/ce_loss": 0.7088941931724548 + }, + { + "epoch": 0.8639509590666403, + "step": 8738, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8639509590666403, + "step": 8738, + "train/total_loss": 0.1099519208073616 + }, + { + "entropy": 8.702260971069336, + "epoch": 0.8640498319161558, + "mean_token_accuracy": 0.7682619690895081, + "num_tokens": 24679662.0, + "step": 8739, + "train/ce_loss": 0.48759010434150696 + }, + { + "epoch": 0.8640498319161558, + "step": 8739, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8640498319161558, + "step": 8739, + "train/total_loss": 0.07219651341438293 + }, + { + "epoch": 0.8641487047656713, + "grad_norm": 0.6484668850898743, + "learning_rate": 7.841813776393217e-06, + "loss": 0.1303, + "step": 8740 + }, + { + "entropy": 8.378477096557617, + "epoch": 0.8641487047656713, + "mean_token_accuracy": 0.7508571147918701, + "num_tokens": 24685015.0, + "step": 8740, + "train/ce_loss": 0.791218638420105 + }, + { + "epoch": 0.8641487047656713, + "step": 8740, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8641487047656713, + "step": 8740, + "train/total_loss": 0.13771560788154602 + }, + { + "entropy": 8.485795974731445, + "epoch": 0.8642475776151869, + "mean_token_accuracy": 0.7496991753578186, + "num_tokens": 24690284.0, + "step": 8741, + "train/ce_loss": 1.1184836626052856 + }, + { + "epoch": 0.8642475776151869, + "step": 8741, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8642475776151869, + "step": 8741, + "train/total_loss": 0.1743483692407608 + }, + { + "entropy": 8.313455581665039, + "epoch": 0.8643464504647024, + "mean_token_accuracy": 0.8098591566085815, + "num_tokens": 24695674.0, + "step": 8742, + "train/ce_loss": 0.8051064610481262 + }, + { + "epoch": 0.8643464504647024, + "step": 8742, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8643464504647024, + "step": 8742, + "train/total_loss": 0.14691689610481262 + }, + { + "entropy": 8.535341262817383, + "epoch": 0.8644453233142179, + "mean_token_accuracy": 0.7163029313087463, + "num_tokens": 24700922.0, + "step": 8743, + "train/ce_loss": 0.9688300490379333 + }, + { + "epoch": 0.8644453233142179, + "step": 8743, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8644453233142179, + "step": 8743, + "train/total_loss": 0.14766424894332886 + }, + { + "entropy": 8.891427993774414, + "epoch": 0.8645441961637335, + "mean_token_accuracy": 0.771175742149353, + "num_tokens": 24706199.0, + "step": 8744, + "train/ce_loss": 0.7586009502410889 + }, + { + "epoch": 0.8645441961637335, + "step": 8744, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8645441961637335, + "step": 8744, + "train/total_loss": 0.12664134800434113 + }, + { + "entropy": 8.649910926818848, + "epoch": 0.8646430690132489, + "mean_token_accuracy": 0.7512376308441162, + "num_tokens": 24711489.0, + "step": 8745, + "train/ce_loss": 0.7900895476341248 + }, + { + "epoch": 0.8646430690132489, + "step": 8745, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8646430690132489, + "step": 8745, + "train/total_loss": 0.13369646668434143 + }, + { + "entropy": 8.408275604248047, + "epoch": 0.8647419418627644, + "mean_token_accuracy": 0.7441386580467224, + "num_tokens": 24717007.0, + "step": 8746, + "train/ce_loss": 1.1202709674835205 + }, + { + "epoch": 0.8647419418627644, + "step": 8746, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.8647419418627644, + "step": 8746, + "train/total_loss": 0.197964608669281 + }, + { + "entropy": 8.852964401245117, + "epoch": 0.86484081471228, + "mean_token_accuracy": 0.6896046996116638, + "num_tokens": 24722128.0, + "step": 8747, + "train/ce_loss": 4.757750502903946e-06 + }, + { + "epoch": 0.86484081471228, + "step": 8747, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.86484081471228, + "step": 8747, + "train/total_loss": 0.0390629768371582 + }, + { + "entropy": 8.401473999023438, + "epoch": 0.8649396875617955, + "mean_token_accuracy": 0.7679222226142883, + "num_tokens": 24727533.0, + "step": 8748, + "train/ce_loss": 0.5992299914360046 + }, + { + "epoch": 0.8649396875617955, + "step": 8748, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8649396875617955, + "step": 8748, + "train/total_loss": 0.10289175063371658 + }, + { + "entropy": 9.06220531463623, + "epoch": 0.8650385604113111, + "mean_token_accuracy": 0.7325102686882019, + "num_tokens": 24732446.0, + "step": 8749, + "train/ce_loss": 6.240025527404214e-07 + }, + { + "epoch": 0.8650385604113111, + "step": 8749, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8650385604113111, + "step": 8749, + "train/total_loss": 0.046875063329935074 + }, + { + "entropy": 8.637744903564453, + "epoch": 0.8651374332608266, + "mean_token_accuracy": 0.7092198729515076, + "num_tokens": 24737699.0, + "step": 8750, + "train/ce_loss": 1.3729150295257568 + }, + { + "epoch": 0.8651374332608266, + "step": 8750, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8651374332608266, + "step": 8750, + "train/total_loss": 0.17635400593280792 + }, + { + "entropy": 8.494135856628418, + "epoch": 0.8652363061103421, + "mean_token_accuracy": 0.7586206793785095, + "num_tokens": 24742937.0, + "step": 8751, + "train/ce_loss": 0.7793292999267578 + }, + { + "epoch": 0.8652363061103421, + "step": 8751, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8652363061103421, + "step": 8751, + "train/total_loss": 0.1287141740322113 + }, + { + "entropy": 8.59860610961914, + "epoch": 0.8653351789598577, + "mean_token_accuracy": 0.7230255603790283, + "num_tokens": 24748247.0, + "step": 8752, + "train/ce_loss": 0.9081260561943054 + }, + { + "epoch": 0.8653351789598577, + "step": 8752, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8653351789598577, + "step": 8752, + "train/total_loss": 0.16112510859966278 + }, + { + "entropy": 8.650777816772461, + "epoch": 0.8654340518093732, + "mean_token_accuracy": 0.7725631594657898, + "num_tokens": 24753533.0, + "step": 8753, + "train/ce_loss": 0.8317409753799438 + }, + { + "epoch": 0.8654340518093732, + "step": 8753, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.8654340518093732, + "step": 8753, + "train/total_loss": 0.16911160945892334 + }, + { + "entropy": 8.657723426818848, + "epoch": 0.8655329246588886, + "mean_token_accuracy": 0.6906077265739441, + "num_tokens": 24758684.0, + "step": 8754, + "train/ce_loss": 1.6056824922561646 + }, + { + "epoch": 0.8655329246588886, + "step": 8754, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8655329246588886, + "step": 8754, + "train/total_loss": 0.2074432522058487 + }, + { + "entropy": 8.510689735412598, + "epoch": 0.8656317975084042, + "mean_token_accuracy": 0.783369779586792, + "num_tokens": 24764084.0, + "step": 8755, + "train/ce_loss": 0.9446129202842712 + }, + { + "epoch": 0.8656317975084042, + "step": 8755, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.8656317975084042, + "step": 8755, + "train/total_loss": 0.18821129202842712 + }, + { + "entropy": 8.236587524414062, + "epoch": 0.8657306703579197, + "mean_token_accuracy": 0.772357702255249, + "num_tokens": 24769425.0, + "step": 8756, + "train/ce_loss": 0.6139279007911682 + }, + { + "epoch": 0.8657306703579197, + "step": 8756, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8657306703579197, + "step": 8756, + "train/total_loss": 0.11608029156923294 + }, + { + "entropy": 8.884057998657227, + "epoch": 0.8658295432074352, + "mean_token_accuracy": 0.7586705088615417, + "num_tokens": 24774582.0, + "step": 8757, + "train/ce_loss": 0.7076722979545593 + }, + { + "epoch": 0.8658295432074352, + "step": 8757, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8658295432074352, + "step": 8757, + "train/total_loss": 0.12154848128557205 + }, + { + "entropy": 8.224753379821777, + "epoch": 0.8659284160569508, + "mean_token_accuracy": 0.7279999852180481, + "num_tokens": 24780062.0, + "step": 8758, + "train/ce_loss": 1.2939484119415283 + }, + { + "epoch": 0.8659284160569508, + "step": 8758, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8659284160569508, + "step": 8758, + "train/total_loss": 0.15283234417438507 + }, + { + "entropy": 8.87765884399414, + "epoch": 0.8660272889064663, + "mean_token_accuracy": 0.765562891960144, + "num_tokens": 24785284.0, + "step": 8759, + "train/ce_loss": 1.3594692945480347 + }, + { + "epoch": 0.8660272889064663, + "step": 8759, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8660272889064663, + "step": 8759, + "train/total_loss": 0.20235317945480347 + }, + { + "epoch": 0.8661261617559818, + "grad_norm": 0.588288426399231, + "learning_rate": 7.836868911635268e-06, + "loss": 0.1316, + "step": 8760 + }, + { + "entropy": 9.202454566955566, + "epoch": 0.8661261617559818, + "mean_token_accuracy": 0.7523452043533325, + "num_tokens": 24790218.0, + "step": 8760, + "train/ce_loss": 7.031700306470157e-07 + }, + { + "epoch": 0.8661261617559818, + "step": 8760, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8661261617559818, + "step": 8760, + "train/total_loss": 0.03515632078051567 + }, + { + "entropy": 8.66160774230957, + "epoch": 0.8662250346054974, + "mean_token_accuracy": 0.6910466551780701, + "num_tokens": 24795443.0, + "step": 8761, + "train/ce_loss": 0.6732712388038635 + }, + { + "epoch": 0.8662250346054974, + "step": 8761, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.8662250346054974, + "step": 8761, + "train/total_loss": 0.1688896268606186 + }, + { + "entropy": 9.296399116516113, + "epoch": 0.8663239074550129, + "mean_token_accuracy": 0.7829268574714661, + "num_tokens": 24800288.0, + "step": 8762, + "train/ce_loss": 9.687728379503824e-07 + }, + { + "epoch": 0.8663239074550129, + "step": 8762, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8663239074550129, + "step": 8762, + "train/total_loss": 0.02343759685754776 + }, + { + "entropy": 8.678385734558105, + "epoch": 0.8664227803045284, + "mean_token_accuracy": 0.7857142686843872, + "num_tokens": 24805656.0, + "step": 8763, + "train/ce_loss": 0.9216625094413757 + }, + { + "epoch": 0.8664227803045284, + "step": 8763, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8664227803045284, + "step": 8763, + "train/total_loss": 0.11951000243425369 + }, + { + "entropy": 9.073097229003906, + "epoch": 0.866521653154044, + "mean_token_accuracy": 0.7687074542045593, + "num_tokens": 24810682.0, + "step": 8764, + "train/ce_loss": 0.7515096664428711 + }, + { + "epoch": 0.866521653154044, + "step": 8764, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.866521653154044, + "step": 8764, + "train/total_loss": 0.08686971664428711 + }, + { + "entropy": 8.48833179473877, + "epoch": 0.8666205260035594, + "mean_token_accuracy": 0.7521263957023621, + "num_tokens": 24816012.0, + "step": 8765, + "train/ce_loss": 0.7278606295585632 + }, + { + "epoch": 0.8666205260035594, + "step": 8765, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8666205260035594, + "step": 8765, + "train/total_loss": 0.11966106295585632 + }, + { + "entropy": 8.351832389831543, + "epoch": 0.8667193988530749, + "mean_token_accuracy": 0.7191234827041626, + "num_tokens": 24821523.0, + "step": 8766, + "train/ce_loss": 1.1557978391647339 + }, + { + "epoch": 0.8667193988530749, + "step": 8766, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8667193988530749, + "step": 8766, + "train/total_loss": 0.1585485339164734 + }, + { + "entropy": 8.598573684692383, + "epoch": 0.8668182717025905, + "mean_token_accuracy": 0.7502762675285339, + "num_tokens": 24826916.0, + "step": 8767, + "train/ce_loss": 0.9814897775650024 + }, + { + "epoch": 0.8668182717025905, + "step": 8767, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.8668182717025905, + "step": 8767, + "train/total_loss": 0.17236772179603577 + }, + { + "entropy": 8.701048851013184, + "epoch": 0.866917144552106, + "mean_token_accuracy": 0.7069825530052185, + "num_tokens": 24832214.0, + "step": 8768, + "train/ce_loss": 1.222639560699463 + }, + { + "epoch": 0.866917144552106, + "step": 8768, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.866917144552106, + "step": 8768, + "train/total_loss": 0.18867021799087524 + }, + { + "entropy": 8.696617126464844, + "epoch": 0.8670160174016215, + "mean_token_accuracy": 0.7450058460235596, + "num_tokens": 24837607.0, + "step": 8769, + "train/ce_loss": 0.3104584515094757 + }, + { + "epoch": 0.8670160174016215, + "step": 8769, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8670160174016215, + "step": 8769, + "train/total_loss": 0.08573334664106369 + }, + { + "entropy": 8.912432670593262, + "epoch": 0.8671148902511371, + "mean_token_accuracy": 0.7912912964820862, + "num_tokens": 24842729.0, + "step": 8770, + "train/ce_loss": 1.3645211458206177 + }, + { + "epoch": 0.8671148902511371, + "step": 8770, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8671148902511371, + "step": 8770, + "train/total_loss": 0.20676462352275848 + }, + { + "entropy": 8.724910736083984, + "epoch": 0.8672137631006526, + "mean_token_accuracy": 0.7616279125213623, + "num_tokens": 24847877.0, + "step": 8771, + "train/ce_loss": 1.2197141647338867 + }, + { + "epoch": 0.8672137631006526, + "step": 8771, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.8672137631006526, + "step": 8771, + "train/total_loss": 0.20009642839431763 + }, + { + "entropy": 8.996208190917969, + "epoch": 0.867312635950168, + "mean_token_accuracy": 0.7210440635681152, + "num_tokens": 24852966.0, + "step": 8772, + "train/ce_loss": 1.4611930847167969 + }, + { + "epoch": 0.867312635950168, + "step": 8772, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.867312635950168, + "step": 8772, + "train/total_loss": 0.20861931145191193 + }, + { + "entropy": 8.577859878540039, + "epoch": 0.8674115087996837, + "mean_token_accuracy": 0.718191385269165, + "num_tokens": 24858363.0, + "step": 8773, + "train/ce_loss": 1.5229592323303223 + }, + { + "epoch": 0.8674115087996837, + "step": 8773, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8674115087996837, + "step": 8773, + "train/total_loss": 0.19917093217372894 + }, + { + "entropy": 8.59080982208252, + "epoch": 0.8675103816491991, + "mean_token_accuracy": 0.7405966520309448, + "num_tokens": 24863565.0, + "step": 8774, + "train/ce_loss": 0.35415276885032654 + }, + { + "epoch": 0.8675103816491991, + "step": 8774, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8675103816491991, + "step": 8774, + "train/total_loss": 0.07057152688503265 + }, + { + "entropy": 8.744940757751465, + "epoch": 0.8676092544987146, + "mean_token_accuracy": 0.7650063633918762, + "num_tokens": 24868777.0, + "step": 8775, + "train/ce_loss": 0.9832481741905212 + }, + { + "epoch": 0.8676092544987146, + "step": 8775, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.8676092544987146, + "step": 8775, + "train/total_loss": 0.21551232039928436 + }, + { + "entropy": 8.630331039428711, + "epoch": 0.8677081273482302, + "mean_token_accuracy": 0.7324913740158081, + "num_tokens": 24874099.0, + "step": 8776, + "train/ce_loss": 0.6644471883773804 + }, + { + "epoch": 0.8677081273482302, + "step": 8776, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8677081273482302, + "step": 8776, + "train/total_loss": 0.10550721734762192 + }, + { + "entropy": 8.871881484985352, + "epoch": 0.8678070001977457, + "mean_token_accuracy": 0.7710843086242676, + "num_tokens": 24879211.0, + "step": 8777, + "train/ce_loss": 0.5619823336601257 + }, + { + "epoch": 0.8678070001977457, + "step": 8777, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8678070001977457, + "step": 8777, + "train/total_loss": 0.09916698932647705 + }, + { + "entropy": 8.483622550964355, + "epoch": 0.8679058730472612, + "mean_token_accuracy": 0.7622682452201843, + "num_tokens": 24884564.0, + "step": 8778, + "train/ce_loss": 0.8081666231155396 + }, + { + "epoch": 0.8679058730472612, + "step": 8778, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8679058730472612, + "step": 8778, + "train/total_loss": 0.13941040635108948 + }, + { + "entropy": 8.644962310791016, + "epoch": 0.8680047458967768, + "mean_token_accuracy": 0.7714285850524902, + "num_tokens": 24889883.0, + "step": 8779, + "train/ce_loss": 0.7450631856918335 + }, + { + "epoch": 0.8680047458967768, + "step": 8779, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8680047458967768, + "step": 8779, + "train/total_loss": 0.11747507005929947 + }, + { + "epoch": 0.8681036187462923, + "grad_norm": 0.5960235595703125, + "learning_rate": 7.831924046877318e-06, + "loss": 0.1282, + "step": 8780 + }, + { + "entropy": 8.878667831420898, + "epoch": 0.8681036187462923, + "mean_token_accuracy": 0.730715274810791, + "num_tokens": 24895108.0, + "step": 8780, + "train/ce_loss": 0.5047854781150818 + }, + { + "epoch": 0.8681036187462923, + "step": 8780, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.8681036187462923, + "step": 8780, + "train/total_loss": 0.15204104781150818 + }, + { + "entropy": 8.98653793334961, + "epoch": 0.8682024915958078, + "mean_token_accuracy": 0.6356073021888733, + "num_tokens": 24900180.0, + "step": 8781, + "train/ce_loss": 0.7897573709487915 + }, + { + "epoch": 0.8682024915958078, + "step": 8781, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.8682024915958078, + "step": 8781, + "train/total_loss": 0.17272573709487915 + }, + { + "entropy": 9.035877227783203, + "epoch": 0.8683013644453234, + "mean_token_accuracy": 0.7129120826721191, + "num_tokens": 24905321.0, + "step": 8782, + "train/ce_loss": 0.4941462278366089 + }, + { + "epoch": 0.8683013644453234, + "step": 8782, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.8683013644453234, + "step": 8782, + "train/total_loss": 0.06894586980342865 + }, + { + "entropy": 8.775835990905762, + "epoch": 0.8684002372948388, + "mean_token_accuracy": 0.7245950102806091, + "num_tokens": 24910441.0, + "step": 8783, + "train/ce_loss": 5.589906777458964e-07 + }, + { + "epoch": 0.8684002372948388, + "step": 8783, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8684002372948388, + "step": 8783, + "train/total_loss": 0.04687505587935448 + }, + { + "entropy": 8.742904663085938, + "epoch": 0.8684991101443543, + "mean_token_accuracy": 0.7463592290878296, + "num_tokens": 24915743.0, + "step": 8784, + "train/ce_loss": 0.47416871786117554 + }, + { + "epoch": 0.8684991101443543, + "step": 8784, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8684991101443543, + "step": 8784, + "train/total_loss": 0.09819812327623367 + }, + { + "entropy": 8.247857093811035, + "epoch": 0.8685979829938699, + "mean_token_accuracy": 0.739294707775116, + "num_tokens": 24921019.0, + "step": 8785, + "train/ce_loss": 1.2385609149932861 + }, + { + "epoch": 0.8685979829938699, + "step": 8785, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8685979829938699, + "step": 8785, + "train/total_loss": 0.1668248474597931 + }, + { + "entropy": 8.880462646484375, + "epoch": 0.8686968558433854, + "mean_token_accuracy": 0.765925943851471, + "num_tokens": 24926142.0, + "step": 8786, + "train/ce_loss": 0.6230125427246094 + }, + { + "epoch": 0.8686968558433854, + "step": 8786, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8686968558433854, + "step": 8786, + "train/total_loss": 0.12480125576257706 + }, + { + "entropy": 8.575641632080078, + "epoch": 0.8687957286929009, + "mean_token_accuracy": 0.7439724206924438, + "num_tokens": 24931476.0, + "step": 8787, + "train/ce_loss": 0.9515129923820496 + }, + { + "epoch": 0.8687957286929009, + "step": 8787, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8687957286929009, + "step": 8787, + "train/total_loss": 0.15374505519866943 + }, + { + "entropy": 8.655603408813477, + "epoch": 0.8688946015424165, + "mean_token_accuracy": 0.7214885950088501, + "num_tokens": 24936757.0, + "step": 8788, + "train/ce_loss": 0.8460344672203064 + }, + { + "epoch": 0.8688946015424165, + "step": 8788, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8688946015424165, + "step": 8788, + "train/total_loss": 0.1275722086429596 + }, + { + "entropy": 8.732928276062012, + "epoch": 0.868993474391932, + "mean_token_accuracy": 0.703611433506012, + "num_tokens": 24942020.0, + "step": 8789, + "train/ce_loss": 1.4696121215820312 + }, + { + "epoch": 0.868993474391932, + "step": 8789, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.868993474391932, + "step": 8789, + "train/total_loss": 0.2524299621582031 + }, + { + "entropy": 8.986867904663086, + "epoch": 0.8690923472414475, + "mean_token_accuracy": 0.7361111044883728, + "num_tokens": 24947228.0, + "step": 8790, + "train/ce_loss": 6.576165105798282e-07 + }, + { + "epoch": 0.8690923472414475, + "step": 8790, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8690923472414475, + "step": 8790, + "train/total_loss": 0.05078131705522537 + }, + { + "entropy": 9.043817520141602, + "epoch": 0.8691912200909631, + "mean_token_accuracy": 0.7069182395935059, + "num_tokens": 24952470.0, + "step": 8791, + "train/ce_loss": 0.8918421864509583 + }, + { + "epoch": 0.8691912200909631, + "step": 8791, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8691912200909631, + "step": 8791, + "train/total_loss": 0.1321529746055603 + }, + { + "entropy": 8.527637481689453, + "epoch": 0.8692900929404785, + "mean_token_accuracy": 0.7156398296356201, + "num_tokens": 24957819.0, + "step": 8792, + "train/ce_loss": 0.6995357871055603 + }, + { + "epoch": 0.8692900929404785, + "step": 8792, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8692900929404785, + "step": 8792, + "train/total_loss": 0.10120358318090439 + }, + { + "entropy": 8.938925743103027, + "epoch": 0.869388965789994, + "mean_token_accuracy": 0.7340824007987976, + "num_tokens": 24963084.0, + "step": 8793, + "train/ce_loss": 0.5987036228179932 + }, + { + "epoch": 0.869388965789994, + "step": 8793, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.869388965789994, + "step": 8793, + "train/total_loss": 0.07549536228179932 + }, + { + "entropy": 8.46339225769043, + "epoch": 0.8694878386395096, + "mean_token_accuracy": 0.7095837593078613, + "num_tokens": 24968653.0, + "step": 8794, + "train/ce_loss": 1.06622314453125 + }, + { + "epoch": 0.8694878386395096, + "step": 8794, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8694878386395096, + "step": 8794, + "train/total_loss": 0.14959105849266052 + }, + { + "entropy": 8.62234115600586, + "epoch": 0.8695867114890251, + "mean_token_accuracy": 0.7968923449516296, + "num_tokens": 24973975.0, + "step": 8795, + "train/ce_loss": 0.8510656952857971 + }, + { + "epoch": 0.8695867114890251, + "step": 8795, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8695867114890251, + "step": 8795, + "train/total_loss": 0.10073157399892807 + }, + { + "entropy": 8.597803115844727, + "epoch": 0.8696855843385406, + "mean_token_accuracy": 0.6967545747756958, + "num_tokens": 24979419.0, + "step": 8796, + "train/ce_loss": 1.3414621353149414 + }, + { + "epoch": 0.8696855843385406, + "step": 8796, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8696855843385406, + "step": 8796, + "train/total_loss": 0.15758371353149414 + }, + { + "entropy": 8.722497940063477, + "epoch": 0.8697844571880562, + "mean_token_accuracy": 0.6937500238418579, + "num_tokens": 24984711.0, + "step": 8797, + "train/ce_loss": 0.9156614542007446 + }, + { + "epoch": 0.8697844571880562, + "step": 8797, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.8697844571880562, + "step": 8797, + "train/total_loss": 0.17359739542007446 + }, + { + "entropy": 8.773809432983398, + "epoch": 0.8698833300375717, + "mean_token_accuracy": 0.7800875306129456, + "num_tokens": 24990046.0, + "step": 8798, + "train/ce_loss": 0.6494016647338867 + }, + { + "epoch": 0.8698833300375717, + "step": 8798, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8698833300375717, + "step": 8798, + "train/total_loss": 0.08837766945362091 + }, + { + "entropy": 9.861303329467773, + "epoch": 0.8699822028870872, + "mean_token_accuracy": 0.7676056623458862, + "num_tokens": 24994736.0, + "step": 8799, + "train/ce_loss": 7.118906637515465e-07 + }, + { + "epoch": 0.8699822028870872, + "step": 8799, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8699822028870872, + "step": 8799, + "train/total_loss": 0.03515632078051567 + }, + { + "epoch": 0.8700810757366028, + "grad_norm": 0.8049528002738953, + "learning_rate": 7.82697918211937e-06, + "loss": 0.1392, + "step": 8800 + }, + { + "entropy": 9.053065299987793, + "epoch": 0.8700810757366028, + "mean_token_accuracy": 0.772357702255249, + "num_tokens": 24999798.0, + "step": 8800, + "train/ce_loss": 1.8910090923309326 + }, + { + "epoch": 0.8700810757366028, + "step": 8800, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.8700810757366028, + "step": 8800, + "train/total_loss": 0.2672259211540222 + }, + { + "entropy": 8.306099891662598, + "epoch": 0.8701799485861182, + "mean_token_accuracy": 0.7704485654830933, + "num_tokens": 25005411.0, + "step": 8801, + "train/ce_loss": 0.46456578373908997 + }, + { + "epoch": 0.8701799485861182, + "step": 8801, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.8701799485861182, + "step": 8801, + "train/total_loss": 0.06598782539367676 + }, + { + "entropy": 8.760089874267578, + "epoch": 0.8702788214356337, + "mean_token_accuracy": 0.718482255935669, + "num_tokens": 25010714.0, + "step": 8802, + "train/ce_loss": 0.6581257581710815 + }, + { + "epoch": 0.8702788214356337, + "step": 8802, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8702788214356337, + "step": 8802, + "train/total_loss": 0.10878133028745651 + }, + { + "entropy": 8.544943809509277, + "epoch": 0.8703776942851493, + "mean_token_accuracy": 0.7622149586677551, + "num_tokens": 25016044.0, + "step": 8803, + "train/ce_loss": 0.3731022775173187 + }, + { + "epoch": 0.8703776942851493, + "step": 8803, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8703776942851493, + "step": 8803, + "train/total_loss": 0.08418522775173187 + }, + { + "entropy": 9.079167366027832, + "epoch": 0.8704765671346648, + "mean_token_accuracy": 0.7332361340522766, + "num_tokens": 25021135.0, + "step": 8804, + "train/ce_loss": 1.3193233013153076 + }, + { + "epoch": 0.8704765671346648, + "step": 8804, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.8704765671346648, + "step": 8804, + "train/total_loss": 0.225682333111763 + }, + { + "entropy": 8.460450172424316, + "epoch": 0.8705754399841803, + "mean_token_accuracy": 0.7470398545265198, + "num_tokens": 25026685.0, + "step": 8805, + "train/ce_loss": 0.6708948612213135 + }, + { + "epoch": 0.8705754399841803, + "step": 8805, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8705754399841803, + "step": 8805, + "train/total_loss": 0.1139644905924797 + }, + { + "entropy": 8.694866180419922, + "epoch": 0.8706743128336959, + "mean_token_accuracy": 0.7612500190734863, + "num_tokens": 25031976.0, + "step": 8806, + "train/ce_loss": 1.2063027620315552 + }, + { + "epoch": 0.8706743128336959, + "step": 8806, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8706743128336959, + "step": 8806, + "train/total_loss": 0.17141152918338776 + }, + { + "entropy": 9.025408744812012, + "epoch": 0.8707731856832114, + "mean_token_accuracy": 0.759381890296936, + "num_tokens": 25036890.0, + "step": 8807, + "train/ce_loss": 0.5602459907531738 + }, + { + "epoch": 0.8707731856832114, + "step": 8807, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8707731856832114, + "step": 8807, + "train/total_loss": 0.10680584609508514 + }, + { + "entropy": 9.421650886535645, + "epoch": 0.8708720585327269, + "mean_token_accuracy": 0.7763713002204895, + "num_tokens": 25041789.0, + "step": 8808, + "train/ce_loss": 1.0831292867660522 + }, + { + "epoch": 0.8708720585327269, + "step": 8808, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.8708720585327269, + "step": 8808, + "train/total_loss": 0.2059691846370697 + }, + { + "entropy": 8.586231231689453, + "epoch": 0.8709709313822425, + "mean_token_accuracy": 0.7010989189147949, + "num_tokens": 25047134.0, + "step": 8809, + "train/ce_loss": 1.241375207901001 + }, + { + "epoch": 0.8709709313822425, + "step": 8809, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.8709709313822425, + "step": 8809, + "train/total_loss": 0.2100750207901001 + }, + { + "entropy": 8.946133613586426, + "epoch": 0.871069804231758, + "mean_token_accuracy": 0.7638036608695984, + "num_tokens": 25052235.0, + "step": 8810, + "train/ce_loss": 0.470211386680603 + }, + { + "epoch": 0.871069804231758, + "step": 8810, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.871069804231758, + "step": 8810, + "train/total_loss": 0.08998988568782806 + }, + { + "entropy": 9.209354400634766, + "epoch": 0.8711686770812734, + "mean_token_accuracy": 0.7810857892036438, + "num_tokens": 25057229.0, + "step": 8811, + "train/ce_loss": 3.2818891781971615e-07 + }, + { + "epoch": 0.8711686770812734, + "step": 8811, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8711686770812734, + "step": 8811, + "train/total_loss": 0.035156283527612686 + }, + { + "entropy": 9.166122436523438, + "epoch": 0.871267549930789, + "mean_token_accuracy": 0.7071197628974915, + "num_tokens": 25062294.0, + "step": 8812, + "train/ce_loss": 2.000129461288452 + }, + { + "epoch": 0.871267549930789, + "step": 8812, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.871267549930789, + "step": 8812, + "train/total_loss": 0.2625129520893097 + }, + { + "entropy": 9.153691291809082, + "epoch": 0.8713664227803045, + "mean_token_accuracy": 0.7372449040412903, + "num_tokens": 25067145.0, + "step": 8813, + "train/ce_loss": 1.0874640565816662e-06 + }, + { + "epoch": 0.8713664227803045, + "step": 8813, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8713664227803045, + "step": 8813, + "train/total_loss": 0.035156358033418655 + }, + { + "entropy": 9.020694732666016, + "epoch": 0.87146529562982, + "mean_token_accuracy": 0.7691154479980469, + "num_tokens": 25072257.0, + "step": 8814, + "train/ce_loss": 0.9739776253700256 + }, + { + "epoch": 0.87146529562982, + "step": 8814, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.87146529562982, + "step": 8814, + "train/total_loss": 0.14817902445793152 + }, + { + "entropy": 8.523090362548828, + "epoch": 0.8715641684793356, + "mean_token_accuracy": 0.7251995205879211, + "num_tokens": 25077597.0, + "step": 8815, + "train/ce_loss": 1.0241416692733765 + }, + { + "epoch": 0.8715641684793356, + "step": 8815, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8715641684793356, + "step": 8815, + "train/total_loss": 0.16100791096687317 + }, + { + "entropy": 9.017142295837402, + "epoch": 0.8716630413288511, + "mean_token_accuracy": 0.7455196976661682, + "num_tokens": 25082608.0, + "step": 8816, + "train/ce_loss": 1.1585428714752197 + }, + { + "epoch": 0.8716630413288511, + "step": 8816, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8716630413288511, + "step": 8816, + "train/total_loss": 0.18226054310798645 + }, + { + "entropy": 8.302145004272461, + "epoch": 0.8717619141783666, + "mean_token_accuracy": 0.7987151741981506, + "num_tokens": 25088050.0, + "step": 8817, + "train/ce_loss": 0.5422400236129761 + }, + { + "epoch": 0.8717619141783666, + "step": 8817, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8717619141783666, + "step": 8817, + "train/total_loss": 0.08938024938106537 + }, + { + "entropy": 8.53742790222168, + "epoch": 0.8718607870278822, + "mean_token_accuracy": 0.7130434513092041, + "num_tokens": 25093477.0, + "step": 8818, + "train/ce_loss": 0.6250754594802856 + }, + { + "epoch": 0.8718607870278822, + "step": 8818, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.8718607870278822, + "step": 8818, + "train/total_loss": 0.1718825399875641 + }, + { + "entropy": 8.350768089294434, + "epoch": 0.8719596598773977, + "mean_token_accuracy": 0.76962810754776, + "num_tokens": 25098957.0, + "step": 8819, + "train/ce_loss": 0.6983389258384705 + }, + { + "epoch": 0.8719596598773977, + "step": 8819, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8719596598773977, + "step": 8819, + "train/total_loss": 0.0932713970541954 + }, + { + "epoch": 0.8720585327269131, + "grad_norm": 0.6103883385658264, + "learning_rate": 7.822034317361421e-06, + "loss": 0.1331, + "step": 8820 + }, + { + "entropy": 8.73092269897461, + "epoch": 0.8720585327269131, + "mean_token_accuracy": 0.7148817777633667, + "num_tokens": 25104127.0, + "step": 8820, + "train/ce_loss": 8.011184604583832e-07 + }, + { + "epoch": 0.8720585327269131, + "step": 8820, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8720585327269131, + "step": 8820, + "train/total_loss": 0.031250081956386566 + }, + { + "entropy": 8.991170883178711, + "epoch": 0.8721574055764287, + "mean_token_accuracy": 0.8072837591171265, + "num_tokens": 25109231.0, + "step": 8821, + "train/ce_loss": 0.746583878993988 + }, + { + "epoch": 0.8721574055764287, + "step": 8821, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8721574055764287, + "step": 8821, + "train/total_loss": 0.09028338640928268 + }, + { + "entropy": 8.568203926086426, + "epoch": 0.8722562784259442, + "mean_token_accuracy": 0.7308707237243652, + "num_tokens": 25114476.0, + "step": 8822, + "train/ce_loss": 1.2717561721801758 + }, + { + "epoch": 0.8722562784259442, + "step": 8822, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8722562784259442, + "step": 8822, + "train/total_loss": 0.18576936423778534 + }, + { + "entropy": 8.938301086425781, + "epoch": 0.8723551512754597, + "mean_token_accuracy": 0.7442799210548401, + "num_tokens": 25119680.0, + "step": 8823, + "train/ce_loss": 1.2276751704121125e-06 + }, + { + "epoch": 0.8723551512754597, + "step": 8823, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8723551512754597, + "step": 8823, + "train/total_loss": 0.04296887293457985 + }, + { + "entropy": 8.760126113891602, + "epoch": 0.8724540241249753, + "mean_token_accuracy": 0.7533252835273743, + "num_tokens": 25124967.0, + "step": 8824, + "train/ce_loss": 1.063308835029602 + }, + { + "epoch": 0.8724540241249753, + "step": 8824, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8724540241249753, + "step": 8824, + "train/total_loss": 0.16492463648319244 + }, + { + "entropy": 8.983345031738281, + "epoch": 0.8725528969744908, + "mean_token_accuracy": 0.7981651425361633, + "num_tokens": 25130067.0, + "step": 8825, + "train/ce_loss": 1.3582448959350586 + }, + { + "epoch": 0.8725528969744908, + "step": 8825, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8725528969744908, + "step": 8825, + "train/total_loss": 0.15926198661327362 + }, + { + "entropy": 8.7062349319458, + "epoch": 0.8726517698240063, + "mean_token_accuracy": 0.7188295125961304, + "num_tokens": 25135337.0, + "step": 8826, + "train/ce_loss": 0.798263669013977 + }, + { + "epoch": 0.8726517698240063, + "step": 8826, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8726517698240063, + "step": 8826, + "train/total_loss": 0.13842011988162994 + }, + { + "entropy": 9.845602035522461, + "epoch": 0.8727506426735219, + "mean_token_accuracy": 0.6994949579238892, + "num_tokens": 25140133.0, + "step": 8827, + "train/ce_loss": 1.5620946884155273 + }, + { + "epoch": 0.8727506426735219, + "step": 8827, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8727506426735219, + "step": 8827, + "train/total_loss": 0.22261571884155273 + }, + { + "entropy": 9.002527236938477, + "epoch": 0.8728495155230374, + "mean_token_accuracy": 0.7468553185462952, + "num_tokens": 25145188.0, + "step": 8828, + "train/ce_loss": 9.026667271427868e-07 + }, + { + "epoch": 0.8728495155230374, + "step": 8828, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8728495155230374, + "step": 8828, + "train/total_loss": 0.05468758940696716 + }, + { + "entropy": 9.244731903076172, + "epoch": 0.8729483883725528, + "mean_token_accuracy": 0.7821100950241089, + "num_tokens": 25150100.0, + "step": 8829, + "train/ce_loss": 0.8010627627372742 + }, + { + "epoch": 0.8729483883725528, + "step": 8829, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8729483883725528, + "step": 8829, + "train/total_loss": 0.11916878074407578 + }, + { + "entropy": 8.634561538696289, + "epoch": 0.8730472612220684, + "mean_token_accuracy": 0.7494061589241028, + "num_tokens": 25155403.0, + "step": 8830, + "train/ce_loss": 0.6409665942192078 + }, + { + "epoch": 0.8730472612220684, + "step": 8830, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8730472612220684, + "step": 8830, + "train/total_loss": 0.09534665942192078 + }, + { + "entropy": 8.507293701171875, + "epoch": 0.8731461340715839, + "mean_token_accuracy": 0.7270641922950745, + "num_tokens": 25160774.0, + "step": 8831, + "train/ce_loss": 0.6567292809486389 + }, + { + "epoch": 0.8731461340715839, + "step": 8831, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8731461340715839, + "step": 8831, + "train/total_loss": 0.08911042660474777 + }, + { + "entropy": 8.993139266967773, + "epoch": 0.8732450069210995, + "mean_token_accuracy": 0.7669291496276855, + "num_tokens": 25165869.0, + "step": 8832, + "train/ce_loss": 1.3291172981262207 + }, + { + "epoch": 0.8732450069210995, + "step": 8832, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8732450069210995, + "step": 8832, + "train/total_loss": 0.19150547683238983 + }, + { + "entropy": 8.491973876953125, + "epoch": 0.873343879770615, + "mean_token_accuracy": 0.7161226272583008, + "num_tokens": 25171504.0, + "step": 8833, + "train/ce_loss": 0.9487578868865967 + }, + { + "epoch": 0.873343879770615, + "step": 8833, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.873343879770615, + "step": 8833, + "train/total_loss": 0.1769070327281952 + }, + { + "entropy": 9.158414840698242, + "epoch": 0.8734427526201305, + "mean_token_accuracy": 0.7287319302558899, + "num_tokens": 25176575.0, + "step": 8834, + "train/ce_loss": 1.3763008117675781 + }, + { + "epoch": 0.8734427526201305, + "step": 8834, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.8734427526201305, + "step": 8834, + "train/total_loss": 0.15716134011745453 + }, + { + "entropy": 8.496709823608398, + "epoch": 0.8735416254696461, + "mean_token_accuracy": 0.6809210777282715, + "num_tokens": 25181957.0, + "step": 8835, + "train/ce_loss": 1.1742103099822998 + }, + { + "epoch": 0.8735416254696461, + "step": 8835, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8735416254696461, + "step": 8835, + "train/total_loss": 0.15648353099822998 + }, + { + "entropy": 9.131623268127441, + "epoch": 0.8736404983191616, + "mean_token_accuracy": 0.6885812878608704, + "num_tokens": 25186968.0, + "step": 8836, + "train/ce_loss": 0.7909911274909973 + }, + { + "epoch": 0.8736404983191616, + "step": 8836, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8736404983191616, + "step": 8836, + "train/total_loss": 0.1298803687095642 + }, + { + "entropy": 8.554312705993652, + "epoch": 0.8737393711686771, + "mean_token_accuracy": 0.7601432204246521, + "num_tokens": 25192270.0, + "step": 8837, + "train/ce_loss": 0.972604513168335 + }, + { + "epoch": 0.8737393711686771, + "step": 8837, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.8737393711686771, + "step": 8837, + "train/total_loss": 0.20663544535636902 + }, + { + "entropy": 8.629166603088379, + "epoch": 0.8738382440181927, + "mean_token_accuracy": 0.7542662024497986, + "num_tokens": 25197672.0, + "step": 8838, + "train/ce_loss": 1.1455631256103516 + }, + { + "epoch": 0.8738382440181927, + "step": 8838, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8738382440181927, + "step": 8838, + "train/total_loss": 0.17705631256103516 + }, + { + "entropy": 8.445144653320312, + "epoch": 0.8739371168677081, + "mean_token_accuracy": 0.7559139728546143, + "num_tokens": 25203111.0, + "step": 8839, + "train/ce_loss": 0.46726107597351074 + }, + { + "epoch": 0.8739371168677081, + "step": 8839, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8739371168677081, + "step": 8839, + "train/total_loss": 0.07016360759735107 + }, + { + "epoch": 0.8740359897172236, + "grad_norm": 0.5828443169593811, + "learning_rate": 7.817089452603473e-06, + "loss": 0.1283, + "step": 8840 + }, + { + "entropy": 8.578930854797363, + "epoch": 0.8740359897172236, + "mean_token_accuracy": 0.716911792755127, + "num_tokens": 25208403.0, + "step": 8840, + "train/ce_loss": 0.8367867469787598 + }, + { + "epoch": 0.8740359897172236, + "step": 8840, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8740359897172236, + "step": 8840, + "train/total_loss": 0.13836617767810822 + }, + { + "entropy": 9.337748527526855, + "epoch": 0.8741348625667392, + "mean_token_accuracy": 0.7061403393745422, + "num_tokens": 25213254.0, + "step": 8841, + "train/ce_loss": 3.4821451322386565e-07 + }, + { + "epoch": 0.8741348625667392, + "step": 8841, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8741348625667392, + "step": 8841, + "train/total_loss": 0.015625035390257835 + }, + { + "entropy": 9.182525634765625, + "epoch": 0.8742337354162547, + "mean_token_accuracy": 0.7558139562606812, + "num_tokens": 25218292.0, + "step": 8842, + "train/ce_loss": 0.8731189370155334 + }, + { + "epoch": 0.8742337354162547, + "step": 8842, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8742337354162547, + "step": 8842, + "train/total_loss": 0.11074939370155334 + }, + { + "entropy": 9.116571426391602, + "epoch": 0.8743326082657702, + "mean_token_accuracy": 0.7578125, + "num_tokens": 25223217.0, + "step": 8843, + "train/ce_loss": 1.6050145626068115 + }, + { + "epoch": 0.8743326082657702, + "step": 8843, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8743326082657702, + "step": 8843, + "train/total_loss": 0.21518896520137787 + }, + { + "entropy": 8.42207145690918, + "epoch": 0.8744314811152858, + "mean_token_accuracy": 0.7744282484054565, + "num_tokens": 25228672.0, + "step": 8844, + "train/ce_loss": 0.43781447410583496 + }, + { + "epoch": 0.8744314811152858, + "step": 8844, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8744314811152858, + "step": 8844, + "train/total_loss": 0.06721894443035126 + }, + { + "entropy": 9.061729431152344, + "epoch": 0.8745303539648013, + "mean_token_accuracy": 0.7083947062492371, + "num_tokens": 25233823.0, + "step": 8845, + "train/ce_loss": 1.604802131652832 + }, + { + "epoch": 0.8745303539648013, + "step": 8845, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8745303539648013, + "step": 8845, + "train/total_loss": 0.22688646614551544 + }, + { + "entropy": 8.510478019714355, + "epoch": 0.8746292268143168, + "mean_token_accuracy": 0.748400866985321, + "num_tokens": 25239165.0, + "step": 8846, + "train/ce_loss": 0.6801382303237915 + }, + { + "epoch": 0.8746292268143168, + "step": 8846, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8746292268143168, + "step": 8846, + "train/total_loss": 0.10707632452249527 + }, + { + "entropy": 8.817245483398438, + "epoch": 0.8747280996638324, + "mean_token_accuracy": 0.7609921097755432, + "num_tokens": 25244504.0, + "step": 8847, + "train/ce_loss": 1.1438863277435303 + }, + { + "epoch": 0.8747280996638324, + "step": 8847, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8747280996638324, + "step": 8847, + "train/total_loss": 0.16907614469528198 + }, + { + "entropy": 8.993545532226562, + "epoch": 0.8748269725133478, + "mean_token_accuracy": 0.7871674299240112, + "num_tokens": 25249611.0, + "step": 8848, + "train/ce_loss": 1.3642228841781616 + }, + { + "epoch": 0.8748269725133478, + "step": 8848, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8748269725133478, + "step": 8848, + "train/total_loss": 0.1989222913980484 + }, + { + "entropy": 8.39812183380127, + "epoch": 0.8749258453628633, + "mean_token_accuracy": 0.7722457647323608, + "num_tokens": 25255046.0, + "step": 8849, + "train/ce_loss": 0.45712316036224365 + }, + { + "epoch": 0.8749258453628633, + "step": 8849, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8749258453628633, + "step": 8849, + "train/total_loss": 0.09258732199668884 + }, + { + "entropy": 8.450510025024414, + "epoch": 0.8750247182123789, + "mean_token_accuracy": 0.7152698040008545, + "num_tokens": 25260336.0, + "step": 8850, + "train/ce_loss": 0.7078292369842529 + }, + { + "epoch": 0.8750247182123789, + "step": 8850, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8750247182123789, + "step": 8850, + "train/total_loss": 0.09422042220830917 + }, + { + "entropy": 9.017822265625, + "epoch": 0.8751235910618944, + "mean_token_accuracy": 0.7021898031234741, + "num_tokens": 25265448.0, + "step": 8851, + "train/ce_loss": 0.6941998600959778 + }, + { + "epoch": 0.8751235910618944, + "step": 8851, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8751235910618944, + "step": 8851, + "train/total_loss": 0.1006699874997139 + }, + { + "entropy": 9.01706314086914, + "epoch": 0.8752224639114099, + "mean_token_accuracy": 0.7002583742141724, + "num_tokens": 25270700.0, + "step": 8852, + "train/ce_loss": 0.6723163723945618 + }, + { + "epoch": 0.8752224639114099, + "step": 8852, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8752224639114099, + "step": 8852, + "train/total_loss": 0.12191914021968842 + }, + { + "entropy": 8.469293594360352, + "epoch": 0.8753213367609255, + "mean_token_accuracy": 0.7325714230537415, + "num_tokens": 25276066.0, + "step": 8853, + "train/ce_loss": 0.9326720833778381 + }, + { + "epoch": 0.8753213367609255, + "step": 8853, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8753213367609255, + "step": 8853, + "train/total_loss": 0.13623595237731934 + }, + { + "entropy": 9.226408004760742, + "epoch": 0.875420209610441, + "mean_token_accuracy": 0.7230769395828247, + "num_tokens": 25281273.0, + "step": 8854, + "train/ce_loss": 1.058532953262329 + }, + { + "epoch": 0.875420209610441, + "step": 8854, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.875420209610441, + "step": 8854, + "train/total_loss": 0.18007203936576843 + }, + { + "entropy": 8.688737869262695, + "epoch": 0.8755190824599565, + "mean_token_accuracy": 0.7356194853782654, + "num_tokens": 25286672.0, + "step": 8855, + "train/ce_loss": 0.6025984883308411 + }, + { + "epoch": 0.8755190824599565, + "step": 8855, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8755190824599565, + "step": 8855, + "train/total_loss": 0.0915098488330841 + }, + { + "entropy": 8.628371238708496, + "epoch": 0.8756179553094721, + "mean_token_accuracy": 0.7133243680000305, + "num_tokens": 25291871.0, + "step": 8856, + "train/ce_loss": 0.9561830163002014 + }, + { + "epoch": 0.8756179553094721, + "step": 8856, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8756179553094721, + "step": 8856, + "train/total_loss": 0.15421205759048462 + }, + { + "entropy": 8.08566665649414, + "epoch": 0.8757168281589875, + "mean_token_accuracy": 0.6803196668624878, + "num_tokens": 25297367.0, + "step": 8857, + "train/ce_loss": 1.001320242881775 + }, + { + "epoch": 0.8757168281589875, + "step": 8857, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8757168281589875, + "step": 8857, + "train/total_loss": 0.162632018327713 + }, + { + "entropy": 9.131614685058594, + "epoch": 0.875815701008503, + "mean_token_accuracy": 0.7594202756881714, + "num_tokens": 25302495.0, + "step": 8858, + "train/ce_loss": 0.6407871842384338 + }, + { + "epoch": 0.875815701008503, + "step": 8858, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.875815701008503, + "step": 8858, + "train/total_loss": 0.08751621842384338 + }, + { + "entropy": 8.983501434326172, + "epoch": 0.8759145738580186, + "mean_token_accuracy": 0.75, + "num_tokens": 25307599.0, + "step": 8859, + "train/ce_loss": 1.0661547183990479 + }, + { + "epoch": 0.8759145738580186, + "step": 8859, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8759145738580186, + "step": 8859, + "train/total_loss": 0.16520923376083374 + }, + { + "epoch": 0.8760134467075341, + "grad_norm": 0.7033073306083679, + "learning_rate": 7.812144587845522e-06, + "loss": 0.1372, + "step": 8860 + }, + { + "entropy": 8.933948516845703, + "epoch": 0.8760134467075341, + "mean_token_accuracy": 0.7825342416763306, + "num_tokens": 25312658.0, + "step": 8860, + "train/ce_loss": 0.9847171902656555 + }, + { + "epoch": 0.8760134467075341, + "step": 8860, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8760134467075341, + "step": 8860, + "train/total_loss": 0.1687842309474945 + }, + { + "entropy": 8.694652557373047, + "epoch": 0.8761123195570496, + "mean_token_accuracy": 0.7148289084434509, + "num_tokens": 25317887.0, + "step": 8861, + "train/ce_loss": 1.199857473373413 + }, + { + "epoch": 0.8761123195570496, + "step": 8861, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8761123195570496, + "step": 8861, + "train/total_loss": 0.17857950925827026 + }, + { + "entropy": 8.699516296386719, + "epoch": 0.8762111924065652, + "mean_token_accuracy": 0.7503090500831604, + "num_tokens": 25323166.0, + "step": 8862, + "train/ce_loss": 0.9494368433952332 + }, + { + "epoch": 0.8762111924065652, + "step": 8862, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8762111924065652, + "step": 8862, + "train/total_loss": 0.13791243731975555 + }, + { + "entropy": 8.456817626953125, + "epoch": 0.8763100652560807, + "mean_token_accuracy": 0.8034188151359558, + "num_tokens": 25328609.0, + "step": 8863, + "train/ce_loss": 0.7831841111183167 + }, + { + "epoch": 0.8763100652560807, + "step": 8863, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8763100652560807, + "step": 8863, + "train/total_loss": 0.13300591707229614 + }, + { + "entropy": 8.716633796691895, + "epoch": 0.8764089381055962, + "mean_token_accuracy": 0.7696245908737183, + "num_tokens": 25333649.0, + "step": 8864, + "train/ce_loss": 0.5097966194152832 + }, + { + "epoch": 0.8764089381055962, + "step": 8864, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.8764089381055962, + "step": 8864, + "train/total_loss": 0.07051090896129608 + }, + { + "entropy": 9.292539596557617, + "epoch": 0.8765078109551118, + "mean_token_accuracy": 0.7435897588729858, + "num_tokens": 25338463.0, + "step": 8865, + "train/ce_loss": 1.7354304790496826 + }, + { + "epoch": 0.8765078109551118, + "step": 8865, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.8765078109551118, + "step": 8865, + "train/total_loss": 0.2907305359840393 + }, + { + "entropy": 8.276805877685547, + "epoch": 0.8766066838046273, + "mean_token_accuracy": 0.7647058963775635, + "num_tokens": 25343993.0, + "step": 8866, + "train/ce_loss": 0.9382361769676208 + }, + { + "epoch": 0.8766066838046273, + "step": 8866, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8766066838046273, + "step": 8866, + "train/total_loss": 0.1211673691868782 + }, + { + "entropy": 8.47601318359375, + "epoch": 0.8767055566541427, + "mean_token_accuracy": 0.7644970417022705, + "num_tokens": 25349328.0, + "step": 8867, + "train/ce_loss": 1.3544975519180298 + }, + { + "epoch": 0.8767055566541427, + "step": 8867, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8767055566541427, + "step": 8867, + "train/total_loss": 0.19013725221157074 + }, + { + "entropy": 8.886510848999023, + "epoch": 0.8768044295036583, + "mean_token_accuracy": 0.7770618796348572, + "num_tokens": 25354554.0, + "step": 8868, + "train/ce_loss": 0.7130325436592102 + }, + { + "epoch": 0.8768044295036583, + "step": 8868, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.8768044295036583, + "step": 8868, + "train/total_loss": 0.15724074840545654 + }, + { + "entropy": 8.782245635986328, + "epoch": 0.8769033023531738, + "mean_token_accuracy": 0.7518247961997986, + "num_tokens": 25359880.0, + "step": 8869, + "train/ce_loss": 0.6772040128707886 + }, + { + "epoch": 0.8769033023531738, + "step": 8869, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8769033023531738, + "step": 8869, + "train/total_loss": 0.09115790575742722 + }, + { + "entropy": 9.141199111938477, + "epoch": 0.8770021752026893, + "mean_token_accuracy": 0.7357357144355774, + "num_tokens": 25365008.0, + "step": 8870, + "train/ce_loss": 1.5943024158477783 + }, + { + "epoch": 0.8770021752026893, + "step": 8870, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.8770021752026893, + "step": 8870, + "train/total_loss": 0.24927400052547455 + }, + { + "entropy": 8.956819534301758, + "epoch": 0.8771010480522049, + "mean_token_accuracy": 0.752755880355835, + "num_tokens": 25370085.0, + "step": 8871, + "train/ce_loss": 0.7061737775802612 + }, + { + "epoch": 0.8771010480522049, + "step": 8871, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8771010480522049, + "step": 8871, + "train/total_loss": 0.09405487775802612 + }, + { + "entropy": 8.722127914428711, + "epoch": 0.8771999209017204, + "mean_token_accuracy": 0.7299363017082214, + "num_tokens": 25375319.0, + "step": 8872, + "train/ce_loss": 0.9193618893623352 + }, + { + "epoch": 0.8771999209017204, + "step": 8872, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8771999209017204, + "step": 8872, + "train/total_loss": 0.13881120085716248 + }, + { + "entropy": 9.018770217895508, + "epoch": 0.8772987937512359, + "mean_token_accuracy": 0.7918623089790344, + "num_tokens": 25380414.0, + "step": 8873, + "train/ce_loss": 0.9188916087150574 + }, + { + "epoch": 0.8772987937512359, + "step": 8873, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8772987937512359, + "step": 8873, + "train/total_loss": 0.1387641727924347 + }, + { + "entropy": 8.860586166381836, + "epoch": 0.8773976666007515, + "mean_token_accuracy": 0.73758864402771, + "num_tokens": 25385740.0, + "step": 8874, + "train/ce_loss": 0.7601509690284729 + }, + { + "epoch": 0.8773976666007515, + "step": 8874, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8773976666007515, + "step": 8874, + "train/total_loss": 0.14242134988307953 + }, + { + "entropy": 8.414628982543945, + "epoch": 0.877496539450267, + "mean_token_accuracy": 0.7476635575294495, + "num_tokens": 25391182.0, + "step": 8875, + "train/ce_loss": 1.0848225355148315 + }, + { + "epoch": 0.877496539450267, + "step": 8875, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.877496539450267, + "step": 8875, + "train/total_loss": 0.1358260065317154 + }, + { + "entropy": 9.601459503173828, + "epoch": 0.8775954122997824, + "mean_token_accuracy": 0.7941888570785522, + "num_tokens": 25396004.0, + "step": 8876, + "train/ce_loss": 0.9619978666305542 + }, + { + "epoch": 0.8775954122997824, + "step": 8876, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.8775954122997824, + "step": 8876, + "train/total_loss": 0.10791853815317154 + }, + { + "entropy": 8.563558578491211, + "epoch": 0.877694285149298, + "mean_token_accuracy": 0.740024209022522, + "num_tokens": 25401315.0, + "step": 8877, + "train/ce_loss": 0.6377543807029724 + }, + { + "epoch": 0.877694285149298, + "step": 8877, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.877694285149298, + "step": 8877, + "train/total_loss": 0.0794004425406456 + }, + { + "entropy": 8.761125564575195, + "epoch": 0.8777931579988135, + "mean_token_accuracy": 0.7519209384918213, + "num_tokens": 25406665.0, + "step": 8878, + "train/ce_loss": 1.182747721672058 + }, + { + "epoch": 0.8777931579988135, + "step": 8878, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8777931579988135, + "step": 8878, + "train/total_loss": 0.1807747781276703 + }, + { + "entropy": 8.149176597595215, + "epoch": 0.877892030848329, + "mean_token_accuracy": 0.7310924530029297, + "num_tokens": 25412100.0, + "step": 8879, + "train/ce_loss": 0.8335572481155396 + }, + { + "epoch": 0.877892030848329, + "step": 8879, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.877892030848329, + "step": 8879, + "train/total_loss": 0.12632447481155396 + }, + { + "epoch": 0.8779909036978446, + "grad_norm": 0.6092635989189148, + "learning_rate": 7.807199723087574e-06, + "loss": 0.1257, + "step": 8880 + }, + { + "entropy": 8.752358436584473, + "epoch": 0.8779909036978446, + "mean_token_accuracy": 0.7354497313499451, + "num_tokens": 25417280.0, + "step": 8880, + "train/ce_loss": 0.844520628452301 + }, + { + "epoch": 0.8779909036978446, + "step": 8880, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8779909036978446, + "step": 8880, + "train/total_loss": 0.1313270628452301 + }, + { + "entropy": 9.201775550842285, + "epoch": 0.8780897765473601, + "mean_token_accuracy": 0.7435158491134644, + "num_tokens": 25422467.0, + "step": 8881, + "train/ce_loss": 5.139292511557869e-07 + }, + { + "epoch": 0.8780897765473601, + "step": 8881, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8780897765473601, + "step": 8881, + "train/total_loss": 0.02343755215406418 + }, + { + "entropy": 9.13831901550293, + "epoch": 0.8781886493968756, + "mean_token_accuracy": 0.7526236772537231, + "num_tokens": 25427702.0, + "step": 8882, + "train/ce_loss": 0.7612507343292236 + }, + { + "epoch": 0.8781886493968756, + "step": 8882, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.8781886493968756, + "step": 8882, + "train/total_loss": 0.15425008535385132 + }, + { + "entropy": 8.566118240356445, + "epoch": 0.8782875222463912, + "mean_token_accuracy": 0.7414966225624084, + "num_tokens": 25433054.0, + "step": 8883, + "train/ce_loss": 0.5385515093803406 + }, + { + "epoch": 0.8782875222463912, + "step": 8883, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8782875222463912, + "step": 8883, + "train/total_loss": 0.06948015093803406 + }, + { + "entropy": 9.248607635498047, + "epoch": 0.8783863950959067, + "mean_token_accuracy": 0.6926910281181335, + "num_tokens": 25438066.0, + "step": 8884, + "train/ce_loss": 1.8416887521743774 + }, + { + "epoch": 0.8783863950959067, + "step": 8884, + "train/sim_loss": 0.1875 + }, + { + "epoch": 0.8783863950959067, + "step": 8884, + "train/total_loss": 0.37166887521743774 + }, + { + "entropy": 8.906831741333008, + "epoch": 0.8784852679454221, + "mean_token_accuracy": 0.7623888254165649, + "num_tokens": 25443328.0, + "step": 8885, + "train/ce_loss": 0.5900859832763672 + }, + { + "epoch": 0.8784852679454221, + "step": 8885, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8784852679454221, + "step": 8885, + "train/total_loss": 0.10978984832763672 + }, + { + "entropy": 8.640851020812988, + "epoch": 0.8785841407949377, + "mean_token_accuracy": 0.696351945400238, + "num_tokens": 25448728.0, + "step": 8886, + "train/ce_loss": 0.9450283050537109 + }, + { + "epoch": 0.8785841407949377, + "step": 8886, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.8785841407949377, + "step": 8886, + "train/total_loss": 0.21169033646583557 + }, + { + "entropy": 8.625853538513184, + "epoch": 0.8786830136444532, + "mean_token_accuracy": 0.7610993385314941, + "num_tokens": 25454337.0, + "step": 8887, + "train/ce_loss": 0.9452787637710571 + }, + { + "epoch": 0.8786830136444532, + "step": 8887, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8786830136444532, + "step": 8887, + "train/total_loss": 0.13749662041664124 + }, + { + "entropy": 8.575627326965332, + "epoch": 0.8787818864939687, + "mean_token_accuracy": 0.7576141953468323, + "num_tokens": 25459571.0, + "step": 8888, + "train/ce_loss": 0.5424915552139282 + }, + { + "epoch": 0.8787818864939687, + "step": 8888, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8787818864939687, + "step": 8888, + "train/total_loss": 0.07768665254116058 + }, + { + "entropy": 8.32967758178711, + "epoch": 0.8788807593434843, + "mean_token_accuracy": 0.7444238066673279, + "num_tokens": 25465173.0, + "step": 8889, + "train/ce_loss": 0.8996832966804504 + }, + { + "epoch": 0.8788807593434843, + "step": 8889, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.8788807593434843, + "step": 8889, + "train/total_loss": 0.17981207370758057 + }, + { + "entropy": 8.881383895874023, + "epoch": 0.8789796321929998, + "mean_token_accuracy": 0.7243995070457458, + "num_tokens": 25470418.0, + "step": 8890, + "train/ce_loss": 1.5166651010513306 + }, + { + "epoch": 0.8789796321929998, + "step": 8890, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.8789796321929998, + "step": 8890, + "train/total_loss": 0.280572772026062 + }, + { + "entropy": 8.884958267211914, + "epoch": 0.8790785050425153, + "mean_token_accuracy": 0.7473524808883667, + "num_tokens": 25475540.0, + "step": 8891, + "train/ce_loss": 0.95103520154953 + }, + { + "epoch": 0.8790785050425153, + "step": 8891, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8790785050425153, + "step": 8891, + "train/total_loss": 0.14197853207588196 + }, + { + "entropy": 9.408562660217285, + "epoch": 0.8791773778920309, + "mean_token_accuracy": 0.8113207817077637, + "num_tokens": 25480414.0, + "step": 8892, + "train/ce_loss": 2.3370089365926106e-06 + }, + { + "epoch": 0.8791773778920309, + "step": 8892, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8791773778920309, + "step": 8892, + "train/total_loss": 0.027343982830643654 + }, + { + "entropy": 8.868795394897461, + "epoch": 0.8792762507415464, + "mean_token_accuracy": 0.7309237122535706, + "num_tokens": 25485656.0, + "step": 8893, + "train/ce_loss": 1.4700690507888794 + }, + { + "epoch": 0.8792762507415464, + "step": 8893, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8792762507415464, + "step": 8893, + "train/total_loss": 0.21731941401958466 + }, + { + "entropy": 8.755334854125977, + "epoch": 0.8793751235910618, + "mean_token_accuracy": 0.7582278251647949, + "num_tokens": 25490928.0, + "step": 8894, + "train/ce_loss": 1.7055973557944526e-06 + }, + { + "epoch": 0.8793751235910618, + "step": 8894, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8793751235910618, + "step": 8894, + "train/total_loss": 0.03125017136335373 + }, + { + "entropy": 8.831209182739258, + "epoch": 0.8794739964405774, + "mean_token_accuracy": 0.7928921580314636, + "num_tokens": 25496223.0, + "step": 8895, + "train/ce_loss": 1.1295431852340698 + }, + { + "epoch": 0.8794739964405774, + "step": 8895, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8794739964405774, + "step": 8895, + "train/total_loss": 0.12857931852340698 + }, + { + "entropy": 8.743522644042969, + "epoch": 0.8795728692900929, + "mean_token_accuracy": 0.7045161128044128, + "num_tokens": 25501482.0, + "step": 8896, + "train/ce_loss": 0.559874951839447 + }, + { + "epoch": 0.8795728692900929, + "step": 8896, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.8795728692900929, + "step": 8896, + "train/total_loss": 0.13801874220371246 + }, + { + "entropy": 8.64323616027832, + "epoch": 0.8796717421396084, + "mean_token_accuracy": 0.7845528721809387, + "num_tokens": 25506910.0, + "step": 8897, + "train/ce_loss": 0.48229286074638367 + }, + { + "epoch": 0.8796717421396084, + "step": 8897, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8796717421396084, + "step": 8897, + "train/total_loss": 0.08729179203510284 + }, + { + "entropy": 8.610434532165527, + "epoch": 0.879770614989124, + "mean_token_accuracy": 0.7132784724235535, + "num_tokens": 25512183.0, + "step": 8898, + "train/ce_loss": 0.48742854595184326 + }, + { + "epoch": 0.879770614989124, + "step": 8898, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.879770614989124, + "step": 8898, + "train/total_loss": 0.0643678605556488 + }, + { + "entropy": 8.946176528930664, + "epoch": 0.8798694878386395, + "mean_token_accuracy": 0.6847222447395325, + "num_tokens": 25517383.0, + "step": 8899, + "train/ce_loss": 0.8376083374023438 + }, + { + "epoch": 0.8798694878386395, + "step": 8899, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8798694878386395, + "step": 8899, + "train/total_loss": 0.1384483277797699 + }, + { + "epoch": 0.879968360688155, + "grad_norm": 0.725326657295227, + "learning_rate": 7.802254858329625e-06, + "loss": 0.1384, + "step": 8900 + }, + { + "entropy": 8.435836791992188, + "epoch": 0.879968360688155, + "mean_token_accuracy": 0.7551867365837097, + "num_tokens": 25522877.0, + "step": 8900, + "train/ce_loss": 2.0146045684814453 + }, + { + "epoch": 0.879968360688155, + "step": 8900, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.879968360688155, + "step": 8900, + "train/total_loss": 0.26005423069000244 + }, + { + "entropy": 8.750149726867676, + "epoch": 0.8800672335376706, + "mean_token_accuracy": 0.7774968147277832, + "num_tokens": 25528141.0, + "step": 8901, + "train/ce_loss": 0.4915764033794403 + }, + { + "epoch": 0.8800672335376706, + "step": 8901, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8800672335376706, + "step": 8901, + "train/total_loss": 0.08040764182806015 + }, + { + "entropy": 8.452852249145508, + "epoch": 0.8801661063871861, + "mean_token_accuracy": 0.755959153175354, + "num_tokens": 25533531.0, + "step": 8902, + "train/ce_loss": 0.8060742020606995 + }, + { + "epoch": 0.8801661063871861, + "step": 8902, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.8801661063871861, + "step": 8902, + "train/total_loss": 0.21341991424560547 + }, + { + "entropy": 8.768773078918457, + "epoch": 0.8802649792367016, + "mean_token_accuracy": 0.7303370833396912, + "num_tokens": 25538688.0, + "step": 8903, + "train/ce_loss": 0.4603393077850342 + }, + { + "epoch": 0.8802649792367016, + "step": 8903, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8802649792367016, + "step": 8903, + "train/total_loss": 0.11634643375873566 + }, + { + "entropy": 8.699012756347656, + "epoch": 0.8803638520862171, + "mean_token_accuracy": 0.7281795740127563, + "num_tokens": 25543955.0, + "step": 8904, + "train/ce_loss": 1.3716063499450684 + }, + { + "epoch": 0.8803638520862171, + "step": 8904, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.8803638520862171, + "step": 8904, + "train/total_loss": 0.21137939393520355 + }, + { + "entropy": 8.342909812927246, + "epoch": 0.8804627249357326, + "mean_token_accuracy": 0.7587548494338989, + "num_tokens": 25549428.0, + "step": 8905, + "train/ce_loss": 1.0330997705459595 + }, + { + "epoch": 0.8804627249357326, + "step": 8905, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8804627249357326, + "step": 8905, + "train/total_loss": 0.1423724889755249 + }, + { + "entropy": 8.701141357421875, + "epoch": 0.8805615977852481, + "mean_token_accuracy": 0.7564895153045654, + "num_tokens": 25554642.0, + "step": 8906, + "train/ce_loss": 0.7028892040252686 + }, + { + "epoch": 0.8805615977852481, + "step": 8906, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8805615977852481, + "step": 8906, + "train/total_loss": 0.13278892636299133 + }, + { + "entropy": 8.76907730102539, + "epoch": 0.8806604706347637, + "mean_token_accuracy": 0.7080394625663757, + "num_tokens": 25559834.0, + "step": 8907, + "train/ce_loss": 0.895540177822113 + }, + { + "epoch": 0.8806604706347637, + "step": 8907, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8806604706347637, + "step": 8907, + "train/total_loss": 0.13642901182174683 + }, + { + "entropy": 8.812956809997559, + "epoch": 0.8807593434842792, + "mean_token_accuracy": 0.729411780834198, + "num_tokens": 25565162.0, + "step": 8908, + "train/ce_loss": 0.562024712562561 + }, + { + "epoch": 0.8807593434842792, + "step": 8908, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8807593434842792, + "step": 8908, + "train/total_loss": 0.0991712212562561 + }, + { + "entropy": 8.56463623046875, + "epoch": 0.8808582163337947, + "mean_token_accuracy": 0.7385892271995544, + "num_tokens": 25570584.0, + "step": 8909, + "train/ce_loss": 0.5374073386192322 + }, + { + "epoch": 0.8808582163337947, + "step": 8909, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8808582163337947, + "step": 8909, + "train/total_loss": 0.1045219898223877 + }, + { + "entropy": 8.398541450500488, + "epoch": 0.8809570891833103, + "mean_token_accuracy": 0.6953405141830444, + "num_tokens": 25575893.0, + "step": 8910, + "train/ce_loss": 0.5384745597839355 + }, + { + "epoch": 0.8809570891833103, + "step": 8910, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8809570891833103, + "step": 8910, + "train/total_loss": 0.07728496193885803 + }, + { + "entropy": 8.852869033813477, + "epoch": 0.8810559620328258, + "mean_token_accuracy": 0.7487179636955261, + "num_tokens": 25581057.0, + "step": 8911, + "train/ce_loss": 0.5818669199943542 + }, + { + "epoch": 0.8810559620328258, + "step": 8911, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8810559620328258, + "step": 8911, + "train/total_loss": 0.10506169497966766 + }, + { + "entropy": 8.566947937011719, + "epoch": 0.8811548348823414, + "mean_token_accuracy": 0.8086283206939697, + "num_tokens": 25586451.0, + "step": 8912, + "train/ce_loss": 0.47238612174987793 + }, + { + "epoch": 0.8811548348823414, + "step": 8912, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.8811548348823414, + "step": 8912, + "train/total_loss": 0.06676986813545227 + }, + { + "entropy": 8.759763717651367, + "epoch": 0.8812537077318569, + "mean_token_accuracy": 0.7574626803398132, + "num_tokens": 25591708.0, + "step": 8913, + "train/ce_loss": 0.5444103479385376 + }, + { + "epoch": 0.8812537077318569, + "step": 8913, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8812537077318569, + "step": 8913, + "train/total_loss": 0.11303478479385376 + }, + { + "entropy": 8.827457427978516, + "epoch": 0.8813525805813723, + "mean_token_accuracy": 0.7457886934280396, + "num_tokens": 25596775.0, + "step": 8914, + "train/ce_loss": 0.7621586918830872 + }, + { + "epoch": 0.8813525805813723, + "step": 8914, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8813525805813723, + "step": 8914, + "train/total_loss": 0.13871586322784424 + }, + { + "entropy": 8.827047348022461, + "epoch": 0.8814514534308879, + "mean_token_accuracy": 0.7344877123832703, + "num_tokens": 25601953.0, + "step": 8915, + "train/ce_loss": 2.1599555015563965 + }, + { + "epoch": 0.8814514534308879, + "step": 8915, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8814514534308879, + "step": 8915, + "train/total_loss": 0.28240180015563965 + }, + { + "entropy": 8.259698867797852, + "epoch": 0.8815503262804034, + "mean_token_accuracy": 0.7744755148887634, + "num_tokens": 25607565.0, + "step": 8916, + "train/ce_loss": 0.24848879873752594 + }, + { + "epoch": 0.8815503262804034, + "step": 8916, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8815503262804034, + "step": 8916, + "train/total_loss": 0.056098878383636475 + }, + { + "entropy": 9.535283088684082, + "epoch": 0.8816491991299189, + "mean_token_accuracy": 0.821670413017273, + "num_tokens": 25612404.0, + "step": 8917, + "train/ce_loss": 3.0784343607592746e-07 + }, + { + "epoch": 0.8816491991299189, + "step": 8917, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8816491991299189, + "step": 8917, + "train/total_loss": 0.015625031664967537 + }, + { + "entropy": 8.790726661682129, + "epoch": 0.8817480719794345, + "mean_token_accuracy": 0.707563042640686, + "num_tokens": 25617497.0, + "step": 8918, + "train/ce_loss": 2.574655354692368e-06 + }, + { + "epoch": 0.8817480719794345, + "step": 8918, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8817480719794345, + "step": 8918, + "train/total_loss": 0.06250026077032089 + }, + { + "entropy": 9.147619247436523, + "epoch": 0.88184694482895, + "mean_token_accuracy": 0.7005163431167603, + "num_tokens": 25622540.0, + "step": 8919, + "train/ce_loss": 8.420393555752526e-07 + }, + { + "epoch": 0.88184694482895, + "step": 8919, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.88184694482895, + "step": 8919, + "train/total_loss": 0.039062585681676865 + }, + { + "epoch": 0.8819458176784655, + "grad_norm": 0.7121282815933228, + "learning_rate": 7.797309993571677e-06, + "loss": 0.133, + "step": 8920 + }, + { + "entropy": 8.917064666748047, + "epoch": 0.8819458176784655, + "mean_token_accuracy": 0.7324262857437134, + "num_tokens": 25627895.0, + "step": 8920, + "train/ce_loss": 0.6380739808082581 + }, + { + "epoch": 0.8819458176784655, + "step": 8920, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8819458176784655, + "step": 8920, + "train/total_loss": 0.0989636480808258 + }, + { + "entropy": 9.06655502319336, + "epoch": 0.8820446905279811, + "mean_token_accuracy": 0.7949080467224121, + "num_tokens": 25633036.0, + "step": 8921, + "train/ce_loss": 0.5876515507698059 + }, + { + "epoch": 0.8820446905279811, + "step": 8921, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8820446905279811, + "step": 8921, + "train/total_loss": 0.11345265805721283 + }, + { + "entropy": 8.67611026763916, + "epoch": 0.8821435633774966, + "mean_token_accuracy": 0.7111111283302307, + "num_tokens": 25638258.0, + "step": 8922, + "train/ce_loss": 0.8048073053359985 + }, + { + "epoch": 0.8821435633774966, + "step": 8922, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8821435633774966, + "step": 8922, + "train/total_loss": 0.11563698202371597 + }, + { + "entropy": 9.230369567871094, + "epoch": 0.882242436227012, + "mean_token_accuracy": 0.7452692985534668, + "num_tokens": 25643372.0, + "step": 8923, + "train/ce_loss": 0.4094837009906769 + }, + { + "epoch": 0.882242436227012, + "step": 8923, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.882242436227012, + "step": 8923, + "train/total_loss": 0.09954212605953217 + }, + { + "entropy": 8.365276336669922, + "epoch": 0.8823413090765276, + "mean_token_accuracy": 0.7350254058837891, + "num_tokens": 25648815.0, + "step": 8924, + "train/ce_loss": 0.9520606994628906 + }, + { + "epoch": 0.8823413090765276, + "step": 8924, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.8823413090765276, + "step": 8924, + "train/total_loss": 0.18114358186721802 + }, + { + "entropy": 8.996768951416016, + "epoch": 0.8824401819260431, + "mean_token_accuracy": 0.7585185170173645, + "num_tokens": 25653965.0, + "step": 8925, + "train/ce_loss": 0.7729405164718628 + }, + { + "epoch": 0.8824401819260431, + "step": 8925, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8824401819260431, + "step": 8925, + "train/total_loss": 0.12416905164718628 + }, + { + "entropy": 8.97836971282959, + "epoch": 0.8825390547755586, + "mean_token_accuracy": 0.7607843279838562, + "num_tokens": 25659203.0, + "step": 8926, + "train/ce_loss": 0.8300597071647644 + }, + { + "epoch": 0.8825390547755586, + "step": 8926, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.8825390547755586, + "step": 8926, + "train/total_loss": 0.16113096475601196 + }, + { + "entropy": 8.835588455200195, + "epoch": 0.8826379276250742, + "mean_token_accuracy": 0.7110862135887146, + "num_tokens": 25664553.0, + "step": 8927, + "train/ce_loss": 0.7836290597915649 + }, + { + "epoch": 0.8826379276250742, + "step": 8927, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8826379276250742, + "step": 8927, + "train/total_loss": 0.10180040448904037 + }, + { + "entropy": 8.801051139831543, + "epoch": 0.8827368004745897, + "mean_token_accuracy": 0.746666669845581, + "num_tokens": 25669887.0, + "step": 8928, + "train/ce_loss": 0.8353655934333801 + }, + { + "epoch": 0.8827368004745897, + "step": 8928, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8827368004745897, + "step": 8928, + "train/total_loss": 0.1225990578532219 + }, + { + "entropy": 8.794574737548828, + "epoch": 0.8828356733241052, + "mean_token_accuracy": 0.7582292556762695, + "num_tokens": 25675246.0, + "step": 8929, + "train/ce_loss": 0.35076427459716797 + }, + { + "epoch": 0.8828356733241052, + "step": 8929, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8828356733241052, + "step": 8929, + "train/total_loss": 0.05070142820477486 + }, + { + "entropy": 9.182476997375488, + "epoch": 0.8829345461736208, + "mean_token_accuracy": 0.7186991572380066, + "num_tokens": 25680278.0, + "step": 8930, + "train/ce_loss": 9.869708037513192e-07 + }, + { + "epoch": 0.8829345461736208, + "step": 8930, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8829345461736208, + "step": 8930, + "train/total_loss": 0.03125009685754776 + }, + { + "entropy": 9.165849685668945, + "epoch": 0.8830334190231363, + "mean_token_accuracy": 0.7638669013977051, + "num_tokens": 25685336.0, + "step": 8931, + "train/ce_loss": 1.3718249797821045 + }, + { + "epoch": 0.8830334190231363, + "step": 8931, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8830334190231363, + "step": 8931, + "train/total_loss": 0.19187000393867493 + }, + { + "entropy": 9.511493682861328, + "epoch": 0.8831322918726517, + "mean_token_accuracy": 0.7290748953819275, + "num_tokens": 25690213.0, + "step": 8932, + "train/ce_loss": 2.57829879046767e-06 + }, + { + "epoch": 0.8831322918726517, + "step": 8932, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8831322918726517, + "step": 8932, + "train/total_loss": 0.054687757045030594 + }, + { + "entropy": 9.19780445098877, + "epoch": 0.8832311647221673, + "mean_token_accuracy": 0.8355704545974731, + "num_tokens": 25695253.0, + "step": 8933, + "train/ce_loss": 0.6308323740959167 + }, + { + "epoch": 0.8832311647221673, + "step": 8933, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8832311647221673, + "step": 8933, + "train/total_loss": 0.0865207388997078 + }, + { + "entropy": 8.815532684326172, + "epoch": 0.8833300375716828, + "mean_token_accuracy": 0.7622842192649841, + "num_tokens": 25700488.0, + "step": 8934, + "train/ce_loss": 0.6281819343566895 + }, + { + "epoch": 0.8833300375716828, + "step": 8934, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8833300375716828, + "step": 8934, + "train/total_loss": 0.09406819194555283 + }, + { + "entropy": 8.927444458007812, + "epoch": 0.8834289104211983, + "mean_token_accuracy": 0.7278250455856323, + "num_tokens": 25705765.0, + "step": 8935, + "train/ce_loss": 0.6441558599472046 + }, + { + "epoch": 0.8834289104211983, + "step": 8935, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8834289104211983, + "step": 8935, + "train/total_loss": 0.1151968389749527 + }, + { + "entropy": 8.61522102355957, + "epoch": 0.8835277832707139, + "mean_token_accuracy": 0.7848244905471802, + "num_tokens": 25711116.0, + "step": 8936, + "train/ce_loss": 1.077426552772522 + }, + { + "epoch": 0.8835277832707139, + "step": 8936, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8835277832707139, + "step": 8936, + "train/total_loss": 0.15071141719818115 + }, + { + "entropy": 9.279502868652344, + "epoch": 0.8836266561202294, + "mean_token_accuracy": 0.6969178318977356, + "num_tokens": 25716089.0, + "step": 8937, + "train/ce_loss": 1.4144339561462402 + }, + { + "epoch": 0.8836266561202294, + "step": 8937, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8836266561202294, + "step": 8937, + "train/total_loss": 0.1922246515750885 + }, + { + "entropy": 8.89355754852295, + "epoch": 0.8837255289697449, + "mean_token_accuracy": 0.7366504669189453, + "num_tokens": 25721422.0, + "step": 8938, + "train/ce_loss": 0.6931769847869873 + }, + { + "epoch": 0.8837255289697449, + "step": 8938, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8837255289697449, + "step": 8938, + "train/total_loss": 0.09275519847869873 + }, + { + "entropy": 8.457046508789062, + "epoch": 0.8838244018192605, + "mean_token_accuracy": 0.7530864477157593, + "num_tokens": 25726869.0, + "step": 8939, + "train/ce_loss": 0.583381712436676 + }, + { + "epoch": 0.8838244018192605, + "step": 8939, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.8838244018192605, + "step": 8939, + "train/total_loss": 0.07786942273378372 + }, + { + "epoch": 0.883923274668776, + "grad_norm": 0.5730092525482178, + "learning_rate": 7.792365128813727e-06, + "loss": 0.1301, + "step": 8940 + }, + { + "entropy": 9.440145492553711, + "epoch": 0.883923274668776, + "mean_token_accuracy": 0.6542239785194397, + "num_tokens": 25731837.0, + "step": 8940, + "train/ce_loss": 3.5538878440856934 + }, + { + "epoch": 0.883923274668776, + "step": 8940, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.883923274668776, + "step": 8940, + "train/total_loss": 0.4022637903690338 + }, + { + "entropy": 8.689183235168457, + "epoch": 0.8840221475182914, + "mean_token_accuracy": 0.7245322465896606, + "num_tokens": 25737243.0, + "step": 8941, + "train/ce_loss": 1.8543144464492798 + }, + { + "epoch": 0.8840221475182914, + "step": 8941, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8840221475182914, + "step": 8941, + "train/total_loss": 0.25574395060539246 + }, + { + "entropy": 8.877262115478516, + "epoch": 0.884121020367807, + "mean_token_accuracy": 0.7209026217460632, + "num_tokens": 25742519.0, + "step": 8942, + "train/ce_loss": 2.0073533058166504 + }, + { + "epoch": 0.884121020367807, + "step": 8942, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.884121020367807, + "step": 8942, + "train/total_loss": 0.27495408058166504 + }, + { + "entropy": 8.815057754516602, + "epoch": 0.8842198932173225, + "mean_token_accuracy": 0.7562254071235657, + "num_tokens": 25747716.0, + "step": 8943, + "train/ce_loss": 0.8791753649711609 + }, + { + "epoch": 0.8842198932173225, + "step": 8943, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8842198932173225, + "step": 8943, + "train/total_loss": 0.1504175364971161 + }, + { + "entropy": 8.566951751708984, + "epoch": 0.884318766066838, + "mean_token_accuracy": 0.689734697341919, + "num_tokens": 25753045.0, + "step": 8944, + "train/ce_loss": 1.353271245956421 + }, + { + "epoch": 0.884318766066838, + "step": 8944, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.884318766066838, + "step": 8944, + "train/total_loss": 0.24470213055610657 + }, + { + "entropy": 8.79873275756836, + "epoch": 0.8844176389163536, + "mean_token_accuracy": 0.7470167279243469, + "num_tokens": 25758511.0, + "step": 8945, + "train/ce_loss": 0.7518976330757141 + }, + { + "epoch": 0.8844176389163536, + "step": 8945, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8844176389163536, + "step": 8945, + "train/total_loss": 0.1337835192680359 + }, + { + "entropy": 9.191802024841309, + "epoch": 0.8845165117658691, + "mean_token_accuracy": 0.7674825191497803, + "num_tokens": 25763466.0, + "step": 8946, + "train/ce_loss": 0.6582119464874268 + }, + { + "epoch": 0.8845165117658691, + "step": 8946, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8845165117658691, + "step": 8946, + "train/total_loss": 0.13613370060920715 + }, + { + "entropy": 8.626323699951172, + "epoch": 0.8846153846153846, + "mean_token_accuracy": 0.7957276105880737, + "num_tokens": 25768667.0, + "step": 8947, + "train/ce_loss": 0.5319151878356934 + }, + { + "epoch": 0.8846153846153846, + "step": 8947, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.8846153846153846, + "step": 8947, + "train/total_loss": 0.07272277027368546 + }, + { + "entropy": 8.56462287902832, + "epoch": 0.8847142574649002, + "mean_token_accuracy": 0.7169590592384338, + "num_tokens": 25774038.0, + "step": 8948, + "train/ce_loss": 1.3787561655044556 + }, + { + "epoch": 0.8847142574649002, + "step": 8948, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.8847142574649002, + "step": 8948, + "train/total_loss": 0.23162561655044556 + }, + { + "entropy": 8.933222770690918, + "epoch": 0.8848131303144157, + "mean_token_accuracy": 0.7023121118545532, + "num_tokens": 25779246.0, + "step": 8949, + "train/ce_loss": 1.3690531253814697 + }, + { + "epoch": 0.8848131303144157, + "step": 8949, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8848131303144157, + "step": 8949, + "train/total_loss": 0.19159281253814697 + }, + { + "entropy": 9.15608024597168, + "epoch": 0.8849120031639311, + "mean_token_accuracy": 0.7441016435623169, + "num_tokens": 25784243.0, + "step": 8950, + "train/ce_loss": 1.1210671663284302 + }, + { + "epoch": 0.8849120031639311, + "step": 8950, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8849120031639311, + "step": 8950, + "train/total_loss": 0.17851296067237854 + }, + { + "entropy": 8.732675552368164, + "epoch": 0.8850108760134467, + "mean_token_accuracy": 0.6734475493431091, + "num_tokens": 25789648.0, + "step": 8951, + "train/ce_loss": 2.0168395042419434 + }, + { + "epoch": 0.8850108760134467, + "step": 8951, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8850108760134467, + "step": 8951, + "train/total_loss": 0.23293395340442657 + }, + { + "entropy": 8.678586959838867, + "epoch": 0.8851097488629622, + "mean_token_accuracy": 0.795134425163269, + "num_tokens": 25794883.0, + "step": 8952, + "train/ce_loss": 0.3202112317085266 + }, + { + "epoch": 0.8851097488629622, + "step": 8952, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8851097488629622, + "step": 8952, + "train/total_loss": 0.05936487391591072 + }, + { + "entropy": 8.49631118774414, + "epoch": 0.8852086217124777, + "mean_token_accuracy": 0.8096304535865784, + "num_tokens": 25800285.0, + "step": 8953, + "train/ce_loss": 0.6475458145141602 + }, + { + "epoch": 0.8852086217124777, + "step": 8953, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8852086217124777, + "step": 8953, + "train/total_loss": 0.09991083294153214 + }, + { + "entropy": 8.76569938659668, + "epoch": 0.8853074945619933, + "mean_token_accuracy": 0.7706855535507202, + "num_tokens": 25805590.0, + "step": 8954, + "train/ce_loss": 1.0109540224075317 + }, + { + "epoch": 0.8853074945619933, + "step": 8954, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.8853074945619933, + "step": 8954, + "train/total_loss": 0.18703290820121765 + }, + { + "entropy": 9.034889221191406, + "epoch": 0.8854063674115088, + "mean_token_accuracy": 0.6842105388641357, + "num_tokens": 25810630.0, + "step": 8955, + "train/ce_loss": 1.3987741470336914 + }, + { + "epoch": 0.8854063674115088, + "step": 8955, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.8854063674115088, + "step": 8955, + "train/total_loss": 0.22581492364406586 + }, + { + "entropy": 8.44428825378418, + "epoch": 0.8855052402610243, + "mean_token_accuracy": 0.7245631814002991, + "num_tokens": 25816120.0, + "step": 8956, + "train/ce_loss": 0.7968977093696594 + }, + { + "epoch": 0.8855052402610243, + "step": 8956, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.8855052402610243, + "step": 8956, + "train/total_loss": 0.09140852093696594 + }, + { + "entropy": 8.451257705688477, + "epoch": 0.8856041131105399, + "mean_token_accuracy": 0.7467532753944397, + "num_tokens": 25821561.0, + "step": 8957, + "train/ce_loss": 0.9519866108894348 + }, + { + "epoch": 0.8856041131105399, + "step": 8957, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8856041131105399, + "step": 8957, + "train/total_loss": 0.12254241108894348 + }, + { + "entropy": 9.339811325073242, + "epoch": 0.8857029859600554, + "mean_token_accuracy": 0.7164750695228577, + "num_tokens": 25826653.0, + "step": 8958, + "train/ce_loss": 1.1054483652114868 + }, + { + "epoch": 0.8857029859600554, + "step": 8958, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.8857029859600554, + "step": 8958, + "train/total_loss": 0.1847635805606842 + }, + { + "entropy": 9.519251823425293, + "epoch": 0.8858018588095709, + "mean_token_accuracy": 0.6827957034111023, + "num_tokens": 25831625.0, + "step": 8959, + "train/ce_loss": 0.9935401082038879 + }, + { + "epoch": 0.8858018588095709, + "step": 8959, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.8858018588095709, + "step": 8959, + "train/total_loss": 0.17357276380062103 + }, + { + "epoch": 0.8859007316590864, + "grad_norm": 0.7268169522285461, + "learning_rate": 7.787420264055778e-06, + "loss": 0.1342, + "step": 8960 + }, + { + "entropy": 9.06598949432373, + "epoch": 0.8859007316590864, + "mean_token_accuracy": 0.697609007358551, + "num_tokens": 25836814.0, + "step": 8960, + "train/ce_loss": 0.7908419370651245 + }, + { + "epoch": 0.8859007316590864, + "step": 8960, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8859007316590864, + "step": 8960, + "train/total_loss": 0.11424044519662857 + }, + { + "entropy": 8.391468048095703, + "epoch": 0.8859996045086019, + "mean_token_accuracy": 0.7623947858810425, + "num_tokens": 25842363.0, + "step": 8961, + "train/ce_loss": 0.9892774820327759 + }, + { + "epoch": 0.8859996045086019, + "step": 8961, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.8859996045086019, + "step": 8961, + "train/total_loss": 0.18486525118350983 + }, + { + "entropy": 9.127792358398438, + "epoch": 0.8860984773581174, + "mean_token_accuracy": 0.7111111283302307, + "num_tokens": 25847520.0, + "step": 8962, + "train/ce_loss": 0.6025692224502563 + }, + { + "epoch": 0.8860984773581174, + "step": 8962, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8860984773581174, + "step": 8962, + "train/total_loss": 0.10322567820549011 + }, + { + "entropy": 9.31008529663086, + "epoch": 0.886197350207633, + "mean_token_accuracy": 0.7643678188323975, + "num_tokens": 25852467.0, + "step": 8963, + "train/ce_loss": 1.804742693901062 + }, + { + "epoch": 0.886197350207633, + "step": 8963, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.886197350207633, + "step": 8963, + "train/total_loss": 0.24688051640987396 + }, + { + "entropy": 8.78582763671875, + "epoch": 0.8862962230571485, + "mean_token_accuracy": 0.7967032790184021, + "num_tokens": 25857661.0, + "step": 8964, + "train/ce_loss": 0.6282260417938232 + }, + { + "epoch": 0.8862962230571485, + "step": 8964, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8862962230571485, + "step": 8964, + "train/total_loss": 0.0862601026892662 + }, + { + "entropy": 9.728044509887695, + "epoch": 0.886395095906664, + "mean_token_accuracy": 0.7571884989738464, + "num_tokens": 25862386.0, + "step": 8965, + "train/ce_loss": 4.494114875797095e-07 + }, + { + "epoch": 0.886395095906664, + "step": 8965, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.886395095906664, + "step": 8965, + "train/total_loss": 0.03906254470348358 + }, + { + "entropy": 9.573162078857422, + "epoch": 0.8864939687561796, + "mean_token_accuracy": 0.7716186046600342, + "num_tokens": 25867218.0, + "step": 8966, + "train/ce_loss": 6.132223688837257e-07 + }, + { + "epoch": 0.8864939687561796, + "step": 8966, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8864939687561796, + "step": 8966, + "train/total_loss": 0.035156309604644775 + }, + { + "entropy": 8.320290565490723, + "epoch": 0.8865928416056951, + "mean_token_accuracy": 0.7339003682136536, + "num_tokens": 25872510.0, + "step": 8967, + "train/ce_loss": 0.7447497844696045 + }, + { + "epoch": 0.8865928416056951, + "step": 8967, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8865928416056951, + "step": 8967, + "train/total_loss": 0.1252562403678894 + }, + { + "entropy": 9.116350173950195, + "epoch": 0.8866917144552106, + "mean_token_accuracy": 0.7511811256408691, + "num_tokens": 25877586.0, + "step": 8968, + "train/ce_loss": 4.29521065825611e-07 + }, + { + "epoch": 0.8866917144552106, + "step": 8968, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8866917144552106, + "step": 8968, + "train/total_loss": 0.04687504470348358 + }, + { + "entropy": 8.468484878540039, + "epoch": 0.8867905873047262, + "mean_token_accuracy": 0.7777777910232544, + "num_tokens": 25882886.0, + "step": 8969, + "train/ce_loss": 0.45138150453567505 + }, + { + "epoch": 0.8867905873047262, + "step": 8969, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8867905873047262, + "step": 8969, + "train/total_loss": 0.0763881504535675 + }, + { + "entropy": 8.804032325744629, + "epoch": 0.8868894601542416, + "mean_token_accuracy": 0.7127799987792969, + "num_tokens": 25888161.0, + "step": 8970, + "train/ce_loss": 0.6077730655670166 + }, + { + "epoch": 0.8868894601542416, + "step": 8970, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.8868894601542416, + "step": 8970, + "train/total_loss": 0.08030855655670166 + }, + { + "entropy": 8.558965682983398, + "epoch": 0.8869883330037571, + "mean_token_accuracy": 0.7459633946418762, + "num_tokens": 25893581.0, + "step": 8971, + "train/ce_loss": 0.9098638296127319 + }, + { + "epoch": 0.8869883330037571, + "step": 8971, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8869883330037571, + "step": 8971, + "train/total_loss": 0.14176763594150543 + }, + { + "entropy": 9.194341659545898, + "epoch": 0.8870872058532727, + "mean_token_accuracy": 0.7659574747085571, + "num_tokens": 25898560.0, + "step": 8972, + "train/ce_loss": 1.1279083490371704 + }, + { + "epoch": 0.8870872058532727, + "step": 8972, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8870872058532727, + "step": 8972, + "train/total_loss": 0.17919708788394928 + }, + { + "entropy": 9.268182754516602, + "epoch": 0.8871860787027882, + "mean_token_accuracy": 0.7967479825019836, + "num_tokens": 25903590.0, + "step": 8973, + "train/ce_loss": 0.5328656435012817 + }, + { + "epoch": 0.8871860787027882, + "step": 8973, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8871860787027882, + "step": 8973, + "train/total_loss": 0.07672406733036041 + }, + { + "entropy": 8.511130332946777, + "epoch": 0.8872849515523037, + "mean_token_accuracy": 0.7100238800048828, + "num_tokens": 25908896.0, + "step": 8974, + "train/ce_loss": 0.8261290192604065 + }, + { + "epoch": 0.8872849515523037, + "step": 8974, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8872849515523037, + "step": 8974, + "train/total_loss": 0.12558165192604065 + }, + { + "entropy": 8.378144264221191, + "epoch": 0.8873838244018193, + "mean_token_accuracy": 0.7524971961975098, + "num_tokens": 25914255.0, + "step": 8975, + "train/ce_loss": 0.9754911661148071 + }, + { + "epoch": 0.8873838244018193, + "step": 8975, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8873838244018193, + "step": 8975, + "train/total_loss": 0.14442411065101624 + }, + { + "entropy": 9.569849014282227, + "epoch": 0.8874826972513348, + "mean_token_accuracy": 0.726190447807312, + "num_tokens": 25919130.0, + "step": 8976, + "train/ce_loss": 8.707828555998276e-07 + }, + { + "epoch": 0.8874826972513348, + "step": 8976, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8874826972513348, + "step": 8976, + "train/total_loss": 0.039062585681676865 + }, + { + "entropy": 9.503039360046387, + "epoch": 0.8875815701008503, + "mean_token_accuracy": 0.71856290102005, + "num_tokens": 25924031.0, + "step": 8977, + "train/ce_loss": 6.605205271625891e-07 + }, + { + "epoch": 0.8875815701008503, + "step": 8977, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8875815701008503, + "step": 8977, + "train/total_loss": 0.05859381705522537 + }, + { + "entropy": 9.024887084960938, + "epoch": 0.8876804429503659, + "mean_token_accuracy": 0.8118518590927124, + "num_tokens": 25929125.0, + "step": 8978, + "train/ce_loss": 0.4504447877407074 + }, + { + "epoch": 0.8876804429503659, + "step": 8978, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8876804429503659, + "step": 8978, + "train/total_loss": 0.06848198175430298 + }, + { + "entropy": 8.868086814880371, + "epoch": 0.8877793157998813, + "mean_token_accuracy": 0.7417417168617249, + "num_tokens": 25934228.0, + "step": 8979, + "train/ce_loss": 1.3392902612686157 + }, + { + "epoch": 0.8877793157998813, + "step": 8979, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8877793157998813, + "step": 8979, + "train/total_loss": 0.1847102791070938 + }, + { + "epoch": 0.8878781886493968, + "grad_norm": 0.6756343841552734, + "learning_rate": 7.78247539929783e-06, + "loss": 0.1313, + "step": 8980 + }, + { + "entropy": 9.047006607055664, + "epoch": 0.8878781886493968, + "mean_token_accuracy": 0.729907751083374, + "num_tokens": 25939570.0, + "step": 8980, + "train/ce_loss": 1.318399429321289 + }, + { + "epoch": 0.8878781886493968, + "step": 8980, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.8878781886493968, + "step": 8980, + "train/total_loss": 0.22558994591236115 + }, + { + "entropy": 8.497231483459473, + "epoch": 0.8879770614989124, + "mean_token_accuracy": 0.7470930218696594, + "num_tokens": 25945098.0, + "step": 8981, + "train/ce_loss": 0.9360732436180115 + }, + { + "epoch": 0.8879770614989124, + "step": 8981, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8879770614989124, + "step": 8981, + "train/total_loss": 0.1404823362827301 + }, + { + "entropy": 8.695652961730957, + "epoch": 0.8880759343484279, + "mean_token_accuracy": 0.734649121761322, + "num_tokens": 25950459.0, + "step": 8982, + "train/ce_loss": 0.5582740902900696 + }, + { + "epoch": 0.8880759343484279, + "step": 8982, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8880759343484279, + "step": 8982, + "train/total_loss": 0.07145240902900696 + }, + { + "entropy": 8.390104293823242, + "epoch": 0.8881748071979434, + "mean_token_accuracy": 0.6779220700263977, + "num_tokens": 25956086.0, + "step": 8983, + "train/ce_loss": 0.9199328422546387 + }, + { + "epoch": 0.8881748071979434, + "step": 8983, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8881748071979434, + "step": 8983, + "train/total_loss": 0.1310557872056961 + }, + { + "entropy": 8.763376235961914, + "epoch": 0.888273680047459, + "mean_token_accuracy": 0.765389084815979, + "num_tokens": 25961419.0, + "step": 8984, + "train/ce_loss": 0.5386528372764587 + }, + { + "epoch": 0.888273680047459, + "step": 8984, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.888273680047459, + "step": 8984, + "train/total_loss": 0.07730278372764587 + }, + { + "entropy": 8.878808975219727, + "epoch": 0.8883725528969745, + "mean_token_accuracy": 0.7311557531356812, + "num_tokens": 25966562.0, + "step": 8985, + "train/ce_loss": 0.8655556440353394 + }, + { + "epoch": 0.8883725528969745, + "step": 8985, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.8883725528969745, + "step": 8985, + "train/total_loss": 0.2037430703639984 + }, + { + "entropy": 9.184910774230957, + "epoch": 0.88847142574649, + "mean_token_accuracy": 0.7799227833747864, + "num_tokens": 25971506.0, + "step": 8986, + "train/ce_loss": 2.3841751328745886e-07 + }, + { + "epoch": 0.88847142574649, + "step": 8986, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.88847142574649, + "step": 8986, + "train/total_loss": 0.03906252235174179 + }, + { + "entropy": 8.740650177001953, + "epoch": 0.8885702985960056, + "mean_token_accuracy": 0.745932400226593, + "num_tokens": 25976772.0, + "step": 8987, + "train/ce_loss": 0.5288046598434448 + }, + { + "epoch": 0.8885702985960056, + "step": 8987, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8885702985960056, + "step": 8987, + "train/total_loss": 0.10366171598434448 + }, + { + "entropy": 8.849593162536621, + "epoch": 0.888669171445521, + "mean_token_accuracy": 0.737423300743103, + "num_tokens": 25982054.0, + "step": 8988, + "train/ce_loss": 1.2712167501449585 + }, + { + "epoch": 0.888669171445521, + "step": 8988, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.888669171445521, + "step": 8988, + "train/total_loss": 0.2130591720342636 + }, + { + "entropy": 8.474994659423828, + "epoch": 0.8887680442950365, + "mean_token_accuracy": 0.7028301954269409, + "num_tokens": 25987379.0, + "step": 8989, + "train/ce_loss": 0.5919832587242126 + }, + { + "epoch": 0.8887680442950365, + "step": 8989, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8887680442950365, + "step": 8989, + "train/total_loss": 0.09435457736253738 + }, + { + "entropy": 8.605907440185547, + "epoch": 0.8888669171445521, + "mean_token_accuracy": 0.7706310749053955, + "num_tokens": 25992705.0, + "step": 8990, + "train/ce_loss": 0.5869959592819214 + }, + { + "epoch": 0.8888669171445521, + "step": 8990, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8888669171445521, + "step": 8990, + "train/total_loss": 0.0899495929479599 + }, + { + "entropy": 8.663553237915039, + "epoch": 0.8889657899940676, + "mean_token_accuracy": 0.7052631378173828, + "num_tokens": 25998037.0, + "step": 8991, + "train/ce_loss": 0.7423734068870544 + }, + { + "epoch": 0.8889657899940676, + "step": 8991, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.8889657899940676, + "step": 8991, + "train/total_loss": 0.15626859664916992 + }, + { + "entropy": 9.869404792785645, + "epoch": 0.8890646628435831, + "mean_token_accuracy": 0.748792290687561, + "num_tokens": 26002662.0, + "step": 8992, + "train/ce_loss": 6.933688041499408e-07 + }, + { + "epoch": 0.8890646628435831, + "step": 8992, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8890646628435831, + "step": 8992, + "train/total_loss": 0.02343756891787052 + }, + { + "entropy": 8.575206756591797, + "epoch": 0.8891635356930987, + "mean_token_accuracy": 0.6972789168357849, + "num_tokens": 26008022.0, + "step": 8993, + "train/ce_loss": 0.8619474172592163 + }, + { + "epoch": 0.8891635356930987, + "step": 8993, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8891635356930987, + "step": 8993, + "train/total_loss": 0.12135099619626999 + }, + { + "entropy": 9.302907943725586, + "epoch": 0.8892624085426142, + "mean_token_accuracy": 0.7334710955619812, + "num_tokens": 26012932.0, + "step": 8994, + "train/ce_loss": 5.172261126062949e-07 + }, + { + "epoch": 0.8892624085426142, + "step": 8994, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8892624085426142, + "step": 8994, + "train/total_loss": 0.05468755215406418 + }, + { + "entropy": 8.820615768432617, + "epoch": 0.8893612813921298, + "mean_token_accuracy": 0.7038043737411499, + "num_tokens": 26018045.0, + "step": 8995, + "train/ce_loss": 8.845435331750195e-06 + }, + { + "epoch": 0.8893612813921298, + "step": 8995, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8893612813921298, + "step": 8995, + "train/total_loss": 0.058594632893800735 + }, + { + "entropy": 8.942163467407227, + "epoch": 0.8894601542416453, + "mean_token_accuracy": 0.7947368621826172, + "num_tokens": 26023442.0, + "step": 8996, + "train/ce_loss": 0.6891187429428101 + }, + { + "epoch": 0.8894601542416453, + "step": 8996, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8894601542416453, + "step": 8996, + "train/total_loss": 0.13141188025474548 + }, + { + "entropy": 8.762359619140625, + "epoch": 0.8895590270911607, + "mean_token_accuracy": 0.7494407296180725, + "num_tokens": 26028765.0, + "step": 8997, + "train/ce_loss": 1.0854436159133911 + }, + { + "epoch": 0.8895590270911607, + "step": 8997, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8895590270911607, + "step": 8997, + "train/total_loss": 0.13979436457157135 + }, + { + "entropy": 8.596555709838867, + "epoch": 0.8896578999406763, + "mean_token_accuracy": 0.8172484636306763, + "num_tokens": 26034234.0, + "step": 8998, + "train/ce_loss": 0.6125220656394958 + }, + { + "epoch": 0.8896578999406763, + "step": 8998, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8896578999406763, + "step": 8998, + "train/total_loss": 0.10422095656394958 + }, + { + "entropy": 9.128072738647461, + "epoch": 0.8897567727901918, + "mean_token_accuracy": 0.6632201075553894, + "num_tokens": 26039394.0, + "step": 8999, + "train/ce_loss": 0.9784678220748901 + }, + { + "epoch": 0.8897567727901918, + "step": 8999, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8897567727901918, + "step": 8999, + "train/total_loss": 0.15644052624702454 + }, + { + "epoch": 0.8898556456397073, + "grad_norm": 0.751347541809082, + "learning_rate": 7.77753053453988e-06, + "loss": 0.1359, + "step": 9000 + }, + { + "entropy": 8.731948852539062, + "epoch": 0.8898556456397073, + "mean_token_accuracy": 0.7823458313941956, + "num_tokens": 26044735.0, + "step": 9000, + "train/ce_loss": 0.5468152165412903 + }, + { + "epoch": 0.8898556456397073, + "step": 9000, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8898556456397073, + "step": 9000, + "train/total_loss": 0.07811902463436127 + }, + { + "entropy": 9.005437850952148, + "epoch": 0.8899545184892229, + "mean_token_accuracy": 0.695195198059082, + "num_tokens": 26049826.0, + "step": 9001, + "train/ce_loss": 0.40771716833114624 + }, + { + "epoch": 0.8899545184892229, + "step": 9001, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8899545184892229, + "step": 9001, + "train/total_loss": 0.1071779727935791 + }, + { + "entropy": 8.488256454467773, + "epoch": 0.8900533913387384, + "mean_token_accuracy": 0.75, + "num_tokens": 26055136.0, + "step": 9002, + "train/ce_loss": 0.5950515270233154 + }, + { + "epoch": 0.8900533913387384, + "step": 9002, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.8900533913387384, + "step": 9002, + "train/total_loss": 0.1532551497220993 + }, + { + "entropy": 8.892885208129883, + "epoch": 0.8901522641882539, + "mean_token_accuracy": 0.7684515118598938, + "num_tokens": 26060244.0, + "step": 9003, + "train/ce_loss": 0.6378443837165833 + }, + { + "epoch": 0.8901522641882539, + "step": 9003, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8901522641882539, + "step": 9003, + "train/total_loss": 0.09112819284200668 + }, + { + "entropy": 9.24199390411377, + "epoch": 0.8902511370377695, + "mean_token_accuracy": 0.7516629695892334, + "num_tokens": 26065111.0, + "step": 9004, + "train/ce_loss": 1.6132978200912476 + }, + { + "epoch": 0.8902511370377695, + "step": 9004, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8902511370377695, + "step": 9004, + "train/total_loss": 0.22382979094982147 + }, + { + "entropy": 9.007030487060547, + "epoch": 0.890350009887285, + "mean_token_accuracy": 0.7781201601028442, + "num_tokens": 26070247.0, + "step": 9005, + "train/ce_loss": 0.9047526121139526 + }, + { + "epoch": 0.890350009887285, + "step": 9005, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.890350009887285, + "step": 9005, + "train/total_loss": 0.12953776121139526 + }, + { + "entropy": 8.367483139038086, + "epoch": 0.8904488827368005, + "mean_token_accuracy": 0.7826552391052246, + "num_tokens": 26075689.0, + "step": 9006, + "train/ce_loss": 0.6974188089370728 + }, + { + "epoch": 0.8904488827368005, + "step": 9006, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8904488827368005, + "step": 9006, + "train/total_loss": 0.1166168823838234 + }, + { + "entropy": 8.697525024414062, + "epoch": 0.890547755586316, + "mean_token_accuracy": 0.8026666641235352, + "num_tokens": 26080902.0, + "step": 9007, + "train/ce_loss": 1.102607250213623 + }, + { + "epoch": 0.890547755586316, + "step": 9007, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.890547755586316, + "step": 9007, + "train/total_loss": 0.1571357250213623 + }, + { + "entropy": 9.06612777709961, + "epoch": 0.8906466284358315, + "mean_token_accuracy": 0.744027316570282, + "num_tokens": 26085963.0, + "step": 9008, + "train/ce_loss": 4.638117445665557e-07 + }, + { + "epoch": 0.8906466284358315, + "step": 9008, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8906466284358315, + "step": 9008, + "train/total_loss": 0.02343754656612873 + }, + { + "entropy": 8.718547821044922, + "epoch": 0.890745501285347, + "mean_token_accuracy": 0.7577388882637024, + "num_tokens": 26091164.0, + "step": 9009, + "train/ce_loss": 0.515997588634491 + }, + { + "epoch": 0.890745501285347, + "step": 9009, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.890745501285347, + "step": 9009, + "train/total_loss": 0.09066225588321686 + }, + { + "entropy": 8.434659957885742, + "epoch": 0.8908443741348626, + "mean_token_accuracy": 0.6988235116004944, + "num_tokens": 26096475.0, + "step": 9010, + "train/ce_loss": 0.7154737710952759 + }, + { + "epoch": 0.8908443741348626, + "step": 9010, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8908443741348626, + "step": 9010, + "train/total_loss": 0.11060988157987595 + }, + { + "entropy": 8.800464630126953, + "epoch": 0.8909432469843781, + "mean_token_accuracy": 0.771501898765564, + "num_tokens": 26101733.0, + "step": 9011, + "train/ce_loss": 4.872323984272953e-07 + }, + { + "epoch": 0.8909432469843781, + "step": 9011, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8909432469843781, + "step": 9011, + "train/total_loss": 0.05468754842877388 + }, + { + "entropy": 8.192608833312988, + "epoch": 0.8910421198338936, + "mean_token_accuracy": 0.7322916388511658, + "num_tokens": 26107180.0, + "step": 9012, + "train/ce_loss": 0.6412844061851501 + }, + { + "epoch": 0.8910421198338936, + "step": 9012, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8910421198338936, + "step": 9012, + "train/total_loss": 0.11490969359874725 + }, + { + "entropy": 9.043550491333008, + "epoch": 0.8911409926834092, + "mean_token_accuracy": 0.7675675749778748, + "num_tokens": 26112352.0, + "step": 9013, + "train/ce_loss": 0.4754367172718048 + }, + { + "epoch": 0.8911409926834092, + "step": 9013, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8911409926834092, + "step": 9013, + "train/total_loss": 0.10223117470741272 + }, + { + "entropy": 8.560033798217773, + "epoch": 0.8912398655329247, + "mean_token_accuracy": 0.7476303577423096, + "num_tokens": 26117652.0, + "step": 9014, + "train/ce_loss": 1.167054533958435 + }, + { + "epoch": 0.8912398655329247, + "step": 9014, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8912398655329247, + "step": 9014, + "train/total_loss": 0.17139294743537903 + }, + { + "entropy": 8.732704162597656, + "epoch": 0.8913387383824402, + "mean_token_accuracy": 0.7706043720245361, + "num_tokens": 26122837.0, + "step": 9015, + "train/ce_loss": 0.5358596444129944 + }, + { + "epoch": 0.8913387383824402, + "step": 9015, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8913387383824402, + "step": 9015, + "train/total_loss": 0.0965547114610672 + }, + { + "entropy": 8.724569320678711, + "epoch": 0.8914376112319558, + "mean_token_accuracy": 0.6988266110420227, + "num_tokens": 26128083.0, + "step": 9016, + "train/ce_loss": 0.9315149784088135 + }, + { + "epoch": 0.8914376112319558, + "step": 9016, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8914376112319558, + "step": 9016, + "train/total_loss": 0.1322140097618103 + }, + { + "entropy": 8.461874008178711, + "epoch": 0.8915364840814712, + "mean_token_accuracy": 0.7396335601806641, + "num_tokens": 26133608.0, + "step": 9017, + "train/ce_loss": 0.8199291229248047 + }, + { + "epoch": 0.8915364840814712, + "step": 9017, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.8915364840814712, + "step": 9017, + "train/total_loss": 0.18746167421340942 + }, + { + "entropy": 8.35120964050293, + "epoch": 0.8916353569309867, + "mean_token_accuracy": 0.7727272510528564, + "num_tokens": 26139153.0, + "step": 9018, + "train/ce_loss": 0.6267951130867004 + }, + { + "epoch": 0.8916353569309867, + "step": 9018, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8916353569309867, + "step": 9018, + "train/total_loss": 0.09783576428890228 + }, + { + "entropy": 9.33806037902832, + "epoch": 0.8917342297805023, + "mean_token_accuracy": 0.7386138439178467, + "num_tokens": 26144101.0, + "step": 9019, + "train/ce_loss": 1.6418176889419556 + }, + { + "epoch": 0.8917342297805023, + "step": 9019, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8917342297805023, + "step": 9019, + "train/total_loss": 0.21496301889419556 + }, + { + "epoch": 0.8918331026300178, + "grad_norm": 0.6972878575325012, + "learning_rate": 7.772585669781933e-06, + "loss": 0.1285, + "step": 9020 + }, + { + "entropy": 9.390382766723633, + "epoch": 0.8918331026300178, + "mean_token_accuracy": 0.7250608205795288, + "num_tokens": 26148919.0, + "step": 9020, + "train/ce_loss": 1.5816576480865479 + }, + { + "epoch": 0.8918331026300178, + "step": 9020, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8918331026300178, + "step": 9020, + "train/total_loss": 0.18941576778888702 + }, + { + "entropy": 9.10663890838623, + "epoch": 0.8919319754795333, + "mean_token_accuracy": 0.7777777910232544, + "num_tokens": 26154008.0, + "step": 9021, + "train/ce_loss": 1.269675850868225 + }, + { + "epoch": 0.8919319754795333, + "step": 9021, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8919319754795333, + "step": 9021, + "train/total_loss": 0.19728009402751923 + }, + { + "entropy": 9.42596435546875, + "epoch": 0.8920308483290489, + "mean_token_accuracy": 0.7516930103302002, + "num_tokens": 26158859.0, + "step": 9022, + "train/ce_loss": 4.456167630451091e-07 + }, + { + "epoch": 0.8920308483290489, + "step": 9022, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8920308483290489, + "step": 9022, + "train/total_loss": 0.03906254470348358 + }, + { + "entropy": 8.594335556030273, + "epoch": 0.8921297211785644, + "mean_token_accuracy": 0.7644287347793579, + "num_tokens": 26164362.0, + "step": 9023, + "train/ce_loss": 0.6554343104362488 + }, + { + "epoch": 0.8921297211785644, + "step": 9023, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.8921297211785644, + "step": 9023, + "train/total_loss": 0.08507468551397324 + }, + { + "entropy": 9.105646133422852, + "epoch": 0.8922285940280799, + "mean_token_accuracy": 0.692307710647583, + "num_tokens": 26169576.0, + "step": 9024, + "train/ce_loss": 1.1244890689849854 + }, + { + "epoch": 0.8922285940280799, + "step": 9024, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.8922285940280799, + "step": 9024, + "train/total_loss": 0.20229265093803406 + }, + { + "entropy": 9.05662727355957, + "epoch": 0.8923274668775955, + "mean_token_accuracy": 0.7682291865348816, + "num_tokens": 26174811.0, + "step": 9025, + "train/ce_loss": 0.9285861849784851 + }, + { + "epoch": 0.8923274668775955, + "step": 9025, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.8923274668775955, + "step": 9025, + "train/total_loss": 0.16707736253738403 + }, + { + "entropy": 8.581493377685547, + "epoch": 0.8924263397271109, + "mean_token_accuracy": 0.7704917788505554, + "num_tokens": 26180168.0, + "step": 9026, + "train/ce_loss": 0.6383937001228333 + }, + { + "epoch": 0.8924263397271109, + "step": 9026, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8924263397271109, + "step": 9026, + "train/total_loss": 0.1068081185221672 + }, + { + "entropy": 9.577193260192871, + "epoch": 0.8925252125766264, + "mean_token_accuracy": 0.782608687877655, + "num_tokens": 26184875.0, + "step": 9027, + "train/ce_loss": 0.9829044342041016 + }, + { + "epoch": 0.8925252125766264, + "step": 9027, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8925252125766264, + "step": 9027, + "train/total_loss": 0.13735294342041016 + }, + { + "entropy": 8.63807487487793, + "epoch": 0.892624085426142, + "mean_token_accuracy": 0.7050528526306152, + "num_tokens": 26190204.0, + "step": 9028, + "train/ce_loss": 0.7294370532035828 + }, + { + "epoch": 0.892624085426142, + "step": 9028, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.892624085426142, + "step": 9028, + "train/total_loss": 0.13934996724128723 + }, + { + "entropy": 8.742045402526855, + "epoch": 0.8927229582756575, + "mean_token_accuracy": 0.662162184715271, + "num_tokens": 26195414.0, + "step": 9029, + "train/ce_loss": 6.97838459018385e-07 + }, + { + "epoch": 0.8927229582756575, + "step": 9029, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8927229582756575, + "step": 9029, + "train/total_loss": 0.04687507078051567 + }, + { + "entropy": 8.24318790435791, + "epoch": 0.892821831125173, + "mean_token_accuracy": 0.7603686451911926, + "num_tokens": 26200938.0, + "step": 9030, + "train/ce_loss": 0.8421469330787659 + }, + { + "epoch": 0.892821831125173, + "step": 9030, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.892821831125173, + "step": 9030, + "train/total_loss": 0.1623396873474121 + }, + { + "entropy": 9.521178245544434, + "epoch": 0.8929207039746886, + "mean_token_accuracy": 0.6871035695075989, + "num_tokens": 26205841.0, + "step": 9031, + "train/ce_loss": 2.0296339988708496 + }, + { + "epoch": 0.8929207039746886, + "step": 9031, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8929207039746886, + "step": 9031, + "train/total_loss": 0.2654634118080139 + }, + { + "entropy": 8.726725578308105, + "epoch": 0.8930195768242041, + "mean_token_accuracy": 0.8068965673446655, + "num_tokens": 26211062.0, + "step": 9032, + "train/ce_loss": 0.7748110890388489 + }, + { + "epoch": 0.8930195768242041, + "step": 9032, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8930195768242041, + "step": 9032, + "train/total_loss": 0.09310611337423325 + }, + { + "entropy": 9.207667350769043, + "epoch": 0.8931184496737196, + "mean_token_accuracy": 0.7491227984428406, + "num_tokens": 26216088.0, + "step": 9033, + "train/ce_loss": 0.8433367609977722 + }, + { + "epoch": 0.8931184496737196, + "step": 9033, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.8931184496737196, + "step": 9033, + "train/total_loss": 0.16245868802070618 + }, + { + "entropy": 9.220256805419922, + "epoch": 0.8932173225232352, + "mean_token_accuracy": 0.7688524723052979, + "num_tokens": 26221121.0, + "step": 9034, + "train/ce_loss": 0.7536109089851379 + }, + { + "epoch": 0.8932173225232352, + "step": 9034, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8932173225232352, + "step": 9034, + "train/total_loss": 0.12614235281944275 + }, + { + "entropy": 9.368236541748047, + "epoch": 0.8933161953727506, + "mean_token_accuracy": 0.785263180732727, + "num_tokens": 26226030.0, + "step": 9035, + "train/ce_loss": 1.0121777057647705 + }, + { + "epoch": 0.8933161953727506, + "step": 9035, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8933161953727506, + "step": 9035, + "train/total_loss": 0.13637402653694153 + }, + { + "entropy": 8.360307693481445, + "epoch": 0.8934150682222661, + "mean_token_accuracy": 0.7129071354866028, + "num_tokens": 26231407.0, + "step": 9036, + "train/ce_loss": 1.0447863340377808 + }, + { + "epoch": 0.8934150682222661, + "step": 9036, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8934150682222661, + "step": 9036, + "train/total_loss": 0.1669786274433136 + }, + { + "entropy": 8.6937255859375, + "epoch": 0.8935139410717817, + "mean_token_accuracy": 0.7613762617111206, + "num_tokens": 26236818.0, + "step": 9037, + "train/ce_loss": 0.8467341661453247 + }, + { + "epoch": 0.8935139410717817, + "step": 9037, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8935139410717817, + "step": 9037, + "train/total_loss": 0.10029841959476471 + }, + { + "entropy": 8.954123497009277, + "epoch": 0.8936128139212972, + "mean_token_accuracy": 0.7614991664886475, + "num_tokens": 26241818.0, + "step": 9038, + "train/ce_loss": 0.7165358066558838 + }, + { + "epoch": 0.8936128139212972, + "step": 9038, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8936128139212972, + "step": 9038, + "train/total_loss": 0.1146223321557045 + }, + { + "entropy": 9.098472595214844, + "epoch": 0.8937116867708127, + "mean_token_accuracy": 0.7612456679344177, + "num_tokens": 26246832.0, + "step": 9039, + "train/ce_loss": 1.1427669525146484 + }, + { + "epoch": 0.8937116867708127, + "step": 9039, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8937116867708127, + "step": 9039, + "train/total_loss": 0.1650579571723938 + }, + { + "epoch": 0.8938105596203283, + "grad_norm": 0.6742735505104065, + "learning_rate": 7.767640805023983e-06, + "loss": 0.1266, + "step": 9040 + }, + { + "entropy": 8.34196662902832, + "epoch": 0.8938105596203283, + "mean_token_accuracy": 0.7777777910232544, + "num_tokens": 26252304.0, + "step": 9040, + "train/ce_loss": 1.012823462486267 + }, + { + "epoch": 0.8938105596203283, + "step": 9040, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8938105596203283, + "step": 9040, + "train/total_loss": 0.14815735816955566 + }, + { + "entropy": 8.855175971984863, + "epoch": 0.8939094324698438, + "mean_token_accuracy": 0.7379679083824158, + "num_tokens": 26257507.0, + "step": 9041, + "train/ce_loss": 1.2165783643722534 + }, + { + "epoch": 0.8939094324698438, + "step": 9041, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.8939094324698438, + "step": 9041, + "train/total_loss": 0.1958765983581543 + }, + { + "entropy": 8.561556816101074, + "epoch": 0.8940083053193593, + "mean_token_accuracy": 0.7976694703102112, + "num_tokens": 26262920.0, + "step": 9042, + "train/ce_loss": 0.725635826587677 + }, + { + "epoch": 0.8940083053193593, + "step": 9042, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.8940083053193593, + "step": 9042, + "train/total_loss": 0.09209483116865158 + }, + { + "entropy": 8.421992301940918, + "epoch": 0.8941071781688749, + "mean_token_accuracy": 0.7265269160270691, + "num_tokens": 26268483.0, + "step": 9043, + "train/ce_loss": 1.123929738998413 + }, + { + "epoch": 0.8941071781688749, + "step": 9043, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.8941071781688749, + "step": 9043, + "train/total_loss": 0.24911172688007355 + }, + { + "entropy": 8.830625534057617, + "epoch": 0.8942060510183903, + "mean_token_accuracy": 0.7846607565879822, + "num_tokens": 26273604.0, + "step": 9044, + "train/ce_loss": 1.676502506597899e-06 + }, + { + "epoch": 0.8942060510183903, + "step": 9044, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.8942060510183903, + "step": 9044, + "train/total_loss": 0.08984392136335373 + }, + { + "entropy": 9.134194374084473, + "epoch": 0.8943049238679058, + "mean_token_accuracy": 0.7523364424705505, + "num_tokens": 26278673.0, + "step": 9045, + "train/ce_loss": 0.8511750102043152 + }, + { + "epoch": 0.8943049238679058, + "step": 9045, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8943049238679058, + "step": 9045, + "train/total_loss": 0.12027375400066376 + }, + { + "entropy": 8.744175910949707, + "epoch": 0.8944037967174214, + "mean_token_accuracy": 0.70333331823349, + "num_tokens": 26284089.0, + "step": 9046, + "train/ce_loss": 1.0774354934692383 + }, + { + "epoch": 0.8944037967174214, + "step": 9046, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.8944037967174214, + "step": 9046, + "train/total_loss": 0.19368106126785278 + }, + { + "entropy": 8.416398048400879, + "epoch": 0.8945026695669369, + "mean_token_accuracy": 0.7437295317649841, + "num_tokens": 26289527.0, + "step": 9047, + "train/ce_loss": 0.5761348605155945 + }, + { + "epoch": 0.8945026695669369, + "step": 9047, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8945026695669369, + "step": 9047, + "train/total_loss": 0.10448849201202393 + }, + { + "entropy": 9.505701065063477, + "epoch": 0.8946015424164524, + "mean_token_accuracy": 0.7559241652488708, + "num_tokens": 26294365.0, + "step": 9048, + "train/ce_loss": 1.436909556388855 + }, + { + "epoch": 0.8946015424164524, + "step": 9048, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8946015424164524, + "step": 9048, + "train/total_loss": 0.18665970861911774 + }, + { + "entropy": 9.163416862487793, + "epoch": 0.894700415265968, + "mean_token_accuracy": 0.7727952003479004, + "num_tokens": 26299507.0, + "step": 9049, + "train/ce_loss": 1.247650146484375 + }, + { + "epoch": 0.894700415265968, + "step": 9049, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.894700415265968, + "step": 9049, + "train/total_loss": 0.17945250868797302 + }, + { + "entropy": 9.116547584533691, + "epoch": 0.8947992881154835, + "mean_token_accuracy": 0.7496111989021301, + "num_tokens": 26304595.0, + "step": 9050, + "train/ce_loss": 1.3614546060562134 + }, + { + "epoch": 0.8947992881154835, + "step": 9050, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.8947992881154835, + "step": 9050, + "train/total_loss": 0.2494267076253891 + }, + { + "entropy": 9.33833122253418, + "epoch": 0.894898160964999, + "mean_token_accuracy": 0.6994906663894653, + "num_tokens": 26309668.0, + "step": 9051, + "train/ce_loss": 0.9015560150146484 + }, + { + "epoch": 0.894898160964999, + "step": 9051, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.894898160964999, + "step": 9051, + "train/total_loss": 0.16046810150146484 + }, + { + "entropy": 8.719079971313477, + "epoch": 0.8949970338145146, + "mean_token_accuracy": 0.787089467048645, + "num_tokens": 26315022.0, + "step": 9052, + "train/ce_loss": 0.3523921072483063 + }, + { + "epoch": 0.8949970338145146, + "step": 9052, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.8949970338145146, + "step": 9052, + "train/total_loss": 0.04695796221494675 + }, + { + "entropy": 8.357917785644531, + "epoch": 0.89509590666403, + "mean_token_accuracy": 0.71685391664505, + "num_tokens": 26320401.0, + "step": 9053, + "train/ce_loss": 1.4288767576217651 + }, + { + "epoch": 0.89509590666403, + "step": 9053, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.89509590666403, + "step": 9053, + "train/total_loss": 0.174137681722641 + }, + { + "entropy": 9.23624038696289, + "epoch": 0.8951947795135455, + "mean_token_accuracy": 0.7727272510528564, + "num_tokens": 26325419.0, + "step": 9054, + "train/ce_loss": 0.46162018179893494 + }, + { + "epoch": 0.8951947795135455, + "step": 9054, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8951947795135455, + "step": 9054, + "train/total_loss": 0.07741202414035797 + }, + { + "entropy": 8.519001007080078, + "epoch": 0.8952936523630611, + "mean_token_accuracy": 0.7407833933830261, + "num_tokens": 26330690.0, + "step": 9055, + "train/ce_loss": 0.4581650495529175 + }, + { + "epoch": 0.8952936523630611, + "step": 9055, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8952936523630611, + "step": 9055, + "train/total_loss": 0.07706651091575623 + }, + { + "entropy": 9.182341575622559, + "epoch": 0.8953925252125766, + "mean_token_accuracy": 0.7250922322273254, + "num_tokens": 26335664.0, + "step": 9056, + "train/ce_loss": 0.8898815512657166 + }, + { + "epoch": 0.8953925252125766, + "step": 9056, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8953925252125766, + "step": 9056, + "train/total_loss": 0.14367565512657166 + }, + { + "entropy": 8.56419563293457, + "epoch": 0.8954913980620921, + "mean_token_accuracy": 0.7774358987808228, + "num_tokens": 26341103.0, + "step": 9057, + "train/ce_loss": 0.5643113851547241 + }, + { + "epoch": 0.8954913980620921, + "step": 9057, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.8954913980620921, + "step": 9057, + "train/total_loss": 0.10721239447593689 + }, + { + "entropy": 8.964850425720215, + "epoch": 0.8955902709116077, + "mean_token_accuracy": 0.7044117450714111, + "num_tokens": 26346212.0, + "step": 9058, + "train/ce_loss": 1.4405393600463867 + }, + { + "epoch": 0.8955902709116077, + "step": 9058, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.8955902709116077, + "step": 9058, + "train/total_loss": 0.22217893600463867 + }, + { + "entropy": 8.850532531738281, + "epoch": 0.8956891437611232, + "mean_token_accuracy": 0.6666666865348816, + "num_tokens": 26351549.0, + "step": 9059, + "train/ce_loss": 1.0828642845153809 + }, + { + "epoch": 0.8956891437611232, + "step": 9059, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.8956891437611232, + "step": 9059, + "train/total_loss": 0.19813019037246704 + }, + { + "epoch": 0.8957880166106387, + "grad_norm": 0.6650404930114746, + "learning_rate": 7.762695940266034e-06, + "loss": 0.1373, + "step": 9060 + }, + { + "entropy": 8.52380084991455, + "epoch": 0.8957880166106387, + "mean_token_accuracy": 0.7497337460517883, + "num_tokens": 26356938.0, + "step": 9060, + "train/ce_loss": 1.0425094366073608 + }, + { + "epoch": 0.8957880166106387, + "step": 9060, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8957880166106387, + "step": 9060, + "train/total_loss": 0.1706571877002716 + }, + { + "entropy": 8.88840103149414, + "epoch": 0.8958868894601543, + "mean_token_accuracy": 0.7521994113922119, + "num_tokens": 26362086.0, + "step": 9061, + "train/ce_loss": 1.2668105000557262e-06 + }, + { + "epoch": 0.8958868894601543, + "step": 9061, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8958868894601543, + "step": 9061, + "train/total_loss": 0.023437626659870148 + }, + { + "entropy": 8.716756820678711, + "epoch": 0.8959857623096698, + "mean_token_accuracy": 0.7196701765060425, + "num_tokens": 26367400.0, + "step": 9062, + "train/ce_loss": 0.8180581331253052 + }, + { + "epoch": 0.8959857623096698, + "step": 9062, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8959857623096698, + "step": 9062, + "train/total_loss": 0.13649332523345947 + }, + { + "entropy": 9.058746337890625, + "epoch": 0.8960846351591852, + "mean_token_accuracy": 0.7813051342964172, + "num_tokens": 26372600.0, + "step": 9063, + "train/ce_loss": 0.6065887212753296 + }, + { + "epoch": 0.8960846351591852, + "step": 9063, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8960846351591852, + "step": 9063, + "train/total_loss": 0.09581512212753296 + }, + { + "entropy": 8.757225036621094, + "epoch": 0.8961835080087008, + "mean_token_accuracy": 0.6877419352531433, + "num_tokens": 26377839.0, + "step": 9064, + "train/ce_loss": 1.433215618133545 + }, + { + "epoch": 0.8961835080087008, + "step": 9064, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8961835080087008, + "step": 9064, + "train/total_loss": 0.17457155883312225 + }, + { + "entropy": 8.651361465454102, + "epoch": 0.8962823808582163, + "mean_token_accuracy": 0.7403740286827087, + "num_tokens": 26383210.0, + "step": 9065, + "train/ce_loss": 0.8476965427398682 + }, + { + "epoch": 0.8962823808582163, + "step": 9065, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.8962823808582163, + "step": 9065, + "train/total_loss": 0.16289466619491577 + }, + { + "entropy": 9.399979591369629, + "epoch": 0.8963812537077318, + "mean_token_accuracy": 0.8402489423751831, + "num_tokens": 26388116.0, + "step": 9066, + "train/ce_loss": 1.3859827518463135 + }, + { + "epoch": 0.8963812537077318, + "step": 9066, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8963812537077318, + "step": 9066, + "train/total_loss": 0.1698482781648636 + }, + { + "entropy": 8.580986022949219, + "epoch": 0.8964801265572474, + "mean_token_accuracy": 0.7384284138679504, + "num_tokens": 26393511.0, + "step": 9067, + "train/ce_loss": 0.5452489852905273 + }, + { + "epoch": 0.8964801265572474, + "step": 9067, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8964801265572474, + "step": 9067, + "train/total_loss": 0.08186864852905273 + }, + { + "entropy": 9.107563972473145, + "epoch": 0.8965789994067629, + "mean_token_accuracy": 0.716356098651886, + "num_tokens": 26398411.0, + "step": 9068, + "train/ce_loss": 2.5687904357910156 + }, + { + "epoch": 0.8965789994067629, + "step": 9068, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.8965789994067629, + "step": 9068, + "train/total_loss": 0.354535311460495 + }, + { + "entropy": 8.618518829345703, + "epoch": 0.8966778722562784, + "mean_token_accuracy": 0.7164179086685181, + "num_tokens": 26403833.0, + "step": 9069, + "train/ce_loss": 0.5482277870178223 + }, + { + "epoch": 0.8966778722562784, + "step": 9069, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.8966778722562784, + "step": 9069, + "train/total_loss": 0.08997903019189835 + }, + { + "entropy": 8.767492294311523, + "epoch": 0.896776745105794, + "mean_token_accuracy": 0.7418263554573059, + "num_tokens": 26409137.0, + "step": 9070, + "train/ce_loss": 0.6119635105133057 + }, + { + "epoch": 0.896776745105794, + "step": 9070, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.896776745105794, + "step": 9070, + "train/total_loss": 0.17057135701179504 + }, + { + "entropy": 8.644116401672363, + "epoch": 0.8968756179553095, + "mean_token_accuracy": 0.7927107214927673, + "num_tokens": 26414534.0, + "step": 9071, + "train/ce_loss": 0.7480507493019104 + }, + { + "epoch": 0.8968756179553095, + "step": 9071, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.8968756179553095, + "step": 9071, + "train/total_loss": 0.16074258089065552 + }, + { + "entropy": 8.785004615783691, + "epoch": 0.8969744908048249, + "mean_token_accuracy": 0.7204545736312866, + "num_tokens": 26419843.0, + "step": 9072, + "train/ce_loss": 1.3798236846923828 + }, + { + "epoch": 0.8969744908048249, + "step": 9072, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.8969744908048249, + "step": 9072, + "train/total_loss": 0.20438861846923828 + }, + { + "entropy": 8.707588195800781, + "epoch": 0.8970733636543405, + "mean_token_accuracy": 0.7354685664176941, + "num_tokens": 26425176.0, + "step": 9073, + "train/ce_loss": 0.8631170988082886 + }, + { + "epoch": 0.8970733636543405, + "step": 9073, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.8970733636543405, + "step": 9073, + "train/total_loss": 0.1917804628610611 + }, + { + "entropy": 8.898002624511719, + "epoch": 0.897172236503856, + "mean_token_accuracy": 0.7440944910049438, + "num_tokens": 26430433.0, + "step": 9074, + "train/ce_loss": 0.4725923240184784 + }, + { + "epoch": 0.897172236503856, + "step": 9074, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.897172236503856, + "step": 9074, + "train/total_loss": 0.09413423389196396 + }, + { + "entropy": 9.20399284362793, + "epoch": 0.8972711093533715, + "mean_token_accuracy": 0.7118644118309021, + "num_tokens": 26435497.0, + "step": 9075, + "train/ce_loss": 1.0065940618515015 + }, + { + "epoch": 0.8972711093533715, + "step": 9075, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8972711093533715, + "step": 9075, + "train/total_loss": 0.12800315022468567 + }, + { + "entropy": 8.567405700683594, + "epoch": 0.8973699822028871, + "mean_token_accuracy": 0.7644171714782715, + "num_tokens": 26440767.0, + "step": 9076, + "train/ce_loss": 0.484589159488678 + }, + { + "epoch": 0.8973699822028871, + "step": 9076, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8973699822028871, + "step": 9076, + "train/total_loss": 0.08752141892910004 + }, + { + "entropy": 8.940652847290039, + "epoch": 0.8974688550524026, + "mean_token_accuracy": 0.750348687171936, + "num_tokens": 26445893.0, + "step": 9077, + "train/ce_loss": 0.8887761831283569 + }, + { + "epoch": 0.8974688550524026, + "step": 9077, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8974688550524026, + "step": 9077, + "train/total_loss": 0.1591901183128357 + }, + { + "entropy": 8.863910675048828, + "epoch": 0.8975677279019182, + "mean_token_accuracy": 0.7949735522270203, + "num_tokens": 26451137.0, + "step": 9078, + "train/ce_loss": 0.5831772685050964 + }, + { + "epoch": 0.8975677279019182, + "step": 9078, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8975677279019182, + "step": 9078, + "train/total_loss": 0.10519272834062576 + }, + { + "entropy": 9.978321075439453, + "epoch": 0.8976666007514337, + "mean_token_accuracy": 0.7720588445663452, + "num_tokens": 26455767.0, + "step": 9079, + "train/ce_loss": 8.344578077412734e-07 + }, + { + "epoch": 0.8976666007514337, + "step": 9079, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.8976666007514337, + "step": 9079, + "train/total_loss": 0.019531333819031715 + }, + { + "epoch": 0.8977654736009492, + "grad_norm": 0.8531478047370911, + "learning_rate": 7.757751075508086e-06, + "loss": 0.1295, + "step": 9080 + }, + { + "entropy": 9.510713577270508, + "epoch": 0.8977654736009492, + "mean_token_accuracy": 0.7338501214981079, + "num_tokens": 26460556.0, + "step": 9080, + "train/ce_loss": 1.867562174797058 + }, + { + "epoch": 0.8977654736009492, + "step": 9080, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8977654736009492, + "step": 9080, + "train/total_loss": 0.2492562234401703 + }, + { + "entropy": 8.828611373901367, + "epoch": 0.8978643464504648, + "mean_token_accuracy": 0.7374301552772522, + "num_tokens": 26465710.0, + "step": 9081, + "train/ce_loss": 0.5925540924072266 + }, + { + "epoch": 0.8978643464504648, + "step": 9081, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.8978643464504648, + "step": 9081, + "train/total_loss": 0.14519290626049042 + }, + { + "entropy": 8.482282638549805, + "epoch": 0.8979632192999802, + "mean_token_accuracy": 0.792417049407959, + "num_tokens": 26471268.0, + "step": 9082, + "train/ce_loss": 0.6010853052139282 + }, + { + "epoch": 0.8979632192999802, + "step": 9082, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.8979632192999802, + "step": 9082, + "train/total_loss": 0.13042102754116058 + }, + { + "entropy": 9.155592918395996, + "epoch": 0.8980620921494957, + "mean_token_accuracy": 0.7203539609909058, + "num_tokens": 26476288.0, + "step": 9083, + "train/ce_loss": 3.7049531442789885e-07 + }, + { + "epoch": 0.8980620921494957, + "step": 9083, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.8980620921494957, + "step": 9083, + "train/total_loss": 0.011718787252902985 + }, + { + "entropy": 8.928726196289062, + "epoch": 0.8981609649990113, + "mean_token_accuracy": 0.7413127422332764, + "num_tokens": 26481530.0, + "step": 9084, + "train/ce_loss": 0.9454926252365112 + }, + { + "epoch": 0.8981609649990113, + "step": 9084, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.8981609649990113, + "step": 9084, + "train/total_loss": 0.1765805184841156 + }, + { + "entropy": 9.107856750488281, + "epoch": 0.8982598378485268, + "mean_token_accuracy": 0.7364746928215027, + "num_tokens": 26486612.0, + "step": 9085, + "train/ce_loss": 2.3217562272748182e-07 + }, + { + "epoch": 0.8982598378485268, + "step": 9085, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.8982598378485268, + "step": 9085, + "train/total_loss": 0.02734377235174179 + }, + { + "entropy": 9.56021785736084, + "epoch": 0.8983587106980423, + "mean_token_accuracy": 0.7511848211288452, + "num_tokens": 26491412.0, + "step": 9086, + "train/ce_loss": 1.2456231117248535 + }, + { + "epoch": 0.8983587106980423, + "step": 9086, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.8983587106980423, + "step": 9086, + "train/total_loss": 0.1714373230934143 + }, + { + "entropy": 9.498257637023926, + "epoch": 0.8984575835475579, + "mean_token_accuracy": 0.7429245114326477, + "num_tokens": 26496233.0, + "step": 9087, + "train/ce_loss": 2.564113401604118e-07 + }, + { + "epoch": 0.8984575835475579, + "step": 9087, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.8984575835475579, + "step": 9087, + "train/total_loss": 0.01562502607703209 + }, + { + "entropy": 9.015697479248047, + "epoch": 0.8985564563970734, + "mean_token_accuracy": 0.7487603425979614, + "num_tokens": 26501311.0, + "step": 9088, + "train/ce_loss": 2.7880012112291297e-06 + }, + { + "epoch": 0.8985564563970734, + "step": 9088, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.8985564563970734, + "step": 9088, + "train/total_loss": 0.023437779396772385 + }, + { + "entropy": 9.054108619689941, + "epoch": 0.8986553292465889, + "mean_token_accuracy": 0.7352415323257446, + "num_tokens": 26506316.0, + "step": 9089, + "train/ce_loss": 0.9395824074745178 + }, + { + "epoch": 0.8986553292465889, + "step": 9089, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8986553292465889, + "step": 9089, + "train/total_loss": 0.14864574372768402 + }, + { + "entropy": 8.577143669128418, + "epoch": 0.8987542020961045, + "mean_token_accuracy": 0.7199612259864807, + "num_tokens": 26511840.0, + "step": 9090, + "train/ce_loss": 0.6009590029716492 + }, + { + "epoch": 0.8987542020961045, + "step": 9090, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.8987542020961045, + "step": 9090, + "train/total_loss": 0.0913459062576294 + }, + { + "entropy": 8.717094421386719, + "epoch": 0.89885307494562, + "mean_token_accuracy": 0.7784172892570496, + "num_tokens": 26516988.0, + "step": 9091, + "train/ce_loss": 1.873485643955064e-06 + }, + { + "epoch": 0.89885307494562, + "step": 9091, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.89885307494562, + "step": 9091, + "train/total_loss": 0.04687518626451492 + }, + { + "entropy": 9.038698196411133, + "epoch": 0.8989519477951354, + "mean_token_accuracy": 0.7762619256973267, + "num_tokens": 26522208.0, + "step": 9092, + "train/ce_loss": 0.8083727359771729 + }, + { + "epoch": 0.8989519477951354, + "step": 9092, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.8989519477951354, + "step": 9092, + "train/total_loss": 0.11989977210760117 + }, + { + "entropy": 8.21472454071045, + "epoch": 0.899050820644651, + "mean_token_accuracy": 0.7547169923782349, + "num_tokens": 26527721.0, + "step": 9093, + "train/ce_loss": 0.8184592723846436 + }, + { + "epoch": 0.899050820644651, + "step": 9093, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.899050820644651, + "step": 9093, + "train/total_loss": 0.10528343170881271 + }, + { + "entropy": 8.824597358703613, + "epoch": 0.8991496934941665, + "mean_token_accuracy": 0.6925031542778015, + "num_tokens": 26532997.0, + "step": 9094, + "train/ce_loss": 1.0248957872390747 + }, + { + "epoch": 0.8991496934941665, + "step": 9094, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8991496934941665, + "step": 9094, + "train/total_loss": 0.16108334064483643 + }, + { + "entropy": 8.745586395263672, + "epoch": 0.899248566343682, + "mean_token_accuracy": 0.6959999799728394, + "num_tokens": 26538335.0, + "step": 9095, + "train/ce_loss": 0.4734397232532501 + }, + { + "epoch": 0.899248566343682, + "step": 9095, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.899248566343682, + "step": 9095, + "train/total_loss": 0.09031271934509277 + }, + { + "entropy": 8.67325210571289, + "epoch": 0.8993474391931976, + "mean_token_accuracy": 0.7426666617393494, + "num_tokens": 26543510.0, + "step": 9096, + "train/ce_loss": 0.7213162183761597 + }, + { + "epoch": 0.8993474391931976, + "step": 9096, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.8993474391931976, + "step": 9096, + "train/total_loss": 0.12681913375854492 + }, + { + "entropy": 8.939607620239258, + "epoch": 0.8994463120427131, + "mean_token_accuracy": 0.7454031109809875, + "num_tokens": 26548596.0, + "step": 9097, + "train/ce_loss": 0.9978237748146057 + }, + { + "epoch": 0.8994463120427131, + "step": 9097, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.8994463120427131, + "step": 9097, + "train/total_loss": 0.16228237748146057 + }, + { + "entropy": 8.586393356323242, + "epoch": 0.8995451848922286, + "mean_token_accuracy": 0.787089467048645, + "num_tokens": 26553991.0, + "step": 9098, + "train/ce_loss": 0.5694405436515808 + }, + { + "epoch": 0.8995451848922286, + "step": 9098, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.8995451848922286, + "step": 9098, + "train/total_loss": 0.13116280734539032 + }, + { + "entropy": 9.860730171203613, + "epoch": 0.8996440577417442, + "mean_token_accuracy": 0.8520709872245789, + "num_tokens": 26558591.0, + "step": 9099, + "train/ce_loss": 1.3599603789771209e-06 + }, + { + "epoch": 0.8996440577417442, + "step": 9099, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.8996440577417442, + "step": 9099, + "train/total_loss": 0.04296888783574104 + }, + { + "epoch": 0.8997429305912596, + "grad_norm": 0.9253165125846863, + "learning_rate": 7.752806210750137e-06, + "loss": 0.1263, + "step": 9100 + }, + { + "entropy": 8.981040000915527, + "epoch": 0.8997429305912596, + "mean_token_accuracy": 0.7735849022865295, + "num_tokens": 26564011.0, + "step": 9100, + "train/ce_loss": 0.9932456016540527 + }, + { + "epoch": 0.8997429305912596, + "step": 9100, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.8997429305912596, + "step": 9100, + "train/total_loss": 0.1774495542049408 + }, + { + "entropy": 8.719240188598633, + "epoch": 0.8998418034407751, + "mean_token_accuracy": 0.7344497442245483, + "num_tokens": 26569279.0, + "step": 9101, + "train/ce_loss": 0.8734342455863953 + }, + { + "epoch": 0.8998418034407751, + "step": 9101, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.8998418034407751, + "step": 9101, + "train/total_loss": 0.14593717455863953 + }, + { + "entropy": 9.323492050170898, + "epoch": 0.8999406762902907, + "mean_token_accuracy": 0.7406716346740723, + "num_tokens": 26574252.0, + "step": 9102, + "train/ce_loss": 1.1112210750579834 + }, + { + "epoch": 0.8999406762902907, + "step": 9102, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.8999406762902907, + "step": 9102, + "train/total_loss": 0.22440335154533386 + }, + { + "entropy": 8.75655746459961, + "epoch": 0.9000395491398062, + "mean_token_accuracy": 0.7513020634651184, + "num_tokens": 26579511.0, + "step": 9103, + "train/ce_loss": 0.8895690441131592 + }, + { + "epoch": 0.9000395491398062, + "step": 9103, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.9000395491398062, + "step": 9103, + "train/total_loss": 0.15536315739154816 + }, + { + "entropy": 8.985508918762207, + "epoch": 0.9001384219893217, + "mean_token_accuracy": 0.780635416507721, + "num_tokens": 26584613.0, + "step": 9104, + "train/ce_loss": 5.280485311232042e-07 + }, + { + "epoch": 0.9001384219893217, + "step": 9104, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.9001384219893217, + "step": 9104, + "train/total_loss": 0.05859380215406418 + }, + { + "entropy": 8.611551284790039, + "epoch": 0.9002372948388373, + "mean_token_accuracy": 0.7111801505088806, + "num_tokens": 26590051.0, + "step": 9105, + "train/ce_loss": 1.0287556648254395 + }, + { + "epoch": 0.9002372948388373, + "step": 9105, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.9002372948388373, + "step": 9105, + "train/total_loss": 0.16537556052207947 + }, + { + "entropy": 8.797918319702148, + "epoch": 0.9003361676883528, + "mean_token_accuracy": 0.7944133877754211, + "num_tokens": 26595407.0, + "step": 9106, + "train/ce_loss": 0.426572322845459 + }, + { + "epoch": 0.9003361676883528, + "step": 9106, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9003361676883528, + "step": 9106, + "train/total_loss": 0.06609473377466202 + }, + { + "entropy": 8.250228881835938, + "epoch": 0.9004350405378683, + "mean_token_accuracy": 0.7409178018569946, + "num_tokens": 26600923.0, + "step": 9107, + "train/ce_loss": 1.0799992084503174 + }, + { + "epoch": 0.9004350405378683, + "step": 9107, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9004350405378683, + "step": 9107, + "train/total_loss": 0.15878117084503174 + }, + { + "entropy": 9.036908149719238, + "epoch": 0.9005339133873839, + "mean_token_accuracy": 0.7595541477203369, + "num_tokens": 26606050.0, + "step": 9108, + "train/ce_loss": 0.5973562598228455 + }, + { + "epoch": 0.9005339133873839, + "step": 9108, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.9005339133873839, + "step": 9108, + "train/total_loss": 0.12614187598228455 + }, + { + "entropy": 9.420276641845703, + "epoch": 0.9006327862368994, + "mean_token_accuracy": 0.800000011920929, + "num_tokens": 26610906.0, + "step": 9109, + "train/ce_loss": 5.389318857851322e-07 + }, + { + "epoch": 0.9006327862368994, + "step": 9109, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9006327862368994, + "step": 9109, + "train/total_loss": 0.03906255215406418 + }, + { + "entropy": 8.7485933303833, + "epoch": 0.9007316590864148, + "mean_token_accuracy": 0.7695035338401794, + "num_tokens": 26615909.0, + "step": 9110, + "train/ce_loss": 6.372333245963091e-06 + }, + { + "epoch": 0.9007316590864148, + "step": 9110, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9007316590864148, + "step": 9110, + "train/total_loss": 0.03125063702464104 + }, + { + "entropy": 9.318817138671875, + "epoch": 0.9008305319359304, + "mean_token_accuracy": 0.748603343963623, + "num_tokens": 26620884.0, + "step": 9111, + "train/ce_loss": 5.345463023331831e-07 + }, + { + "epoch": 0.9008305319359304, + "step": 9111, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9008305319359304, + "step": 9111, + "train/total_loss": 0.03906255215406418 + }, + { + "entropy": 8.58206558227539, + "epoch": 0.9009294047854459, + "mean_token_accuracy": 0.7491289377212524, + "num_tokens": 26626274.0, + "step": 9112, + "train/ce_loss": 0.705833911895752 + }, + { + "epoch": 0.9009294047854459, + "step": 9112, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9009294047854459, + "step": 9112, + "train/total_loss": 0.11745839565992355 + }, + { + "entropy": 8.656524658203125, + "epoch": 0.9010282776349614, + "mean_token_accuracy": 0.702570378780365, + "num_tokens": 26631603.0, + "step": 9113, + "train/ce_loss": 1.5656601190567017 + }, + { + "epoch": 0.9010282776349614, + "step": 9113, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9010282776349614, + "step": 9113, + "train/total_loss": 0.18000350892543793 + }, + { + "entropy": 8.704153060913086, + "epoch": 0.901127150484477, + "mean_token_accuracy": 0.7424789667129517, + "num_tokens": 26636942.0, + "step": 9114, + "train/ce_loss": 0.9390100836753845 + }, + { + "epoch": 0.901127150484477, + "step": 9114, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.901127150484477, + "step": 9114, + "train/total_loss": 0.14468225836753845 + }, + { + "entropy": 8.923505783081055, + "epoch": 0.9012260233339925, + "mean_token_accuracy": 0.7388362884521484, + "num_tokens": 26642169.0, + "step": 9115, + "train/ce_loss": 0.6768019795417786 + }, + { + "epoch": 0.9012260233339925, + "step": 9115, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.9012260233339925, + "step": 9115, + "train/total_loss": 0.1301802098751068 + }, + { + "entropy": 8.667463302612305, + "epoch": 0.901324896183508, + "mean_token_accuracy": 0.7836593985557556, + "num_tokens": 26647479.0, + "step": 9116, + "train/ce_loss": 0.4220693111419678 + }, + { + "epoch": 0.901324896183508, + "step": 9116, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.901324896183508, + "step": 9116, + "train/total_loss": 0.10080067813396454 + }, + { + "entropy": 8.569649696350098, + "epoch": 0.9014237690330236, + "mean_token_accuracy": 0.7390350699424744, + "num_tokens": 26652858.0, + "step": 9117, + "train/ce_loss": 0.9989830851554871 + }, + { + "epoch": 0.9014237690330236, + "step": 9117, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9014237690330236, + "step": 9117, + "train/total_loss": 0.1545858085155487 + }, + { + "entropy": 9.056396484375, + "epoch": 0.901522641882539, + "mean_token_accuracy": 0.7503759264945984, + "num_tokens": 26657918.0, + "step": 9118, + "train/ce_loss": 0.9660353660583496 + }, + { + "epoch": 0.901522641882539, + "step": 9118, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.901522641882539, + "step": 9118, + "train/total_loss": 0.12785354256629944 + }, + { + "entropy": 9.318904876708984, + "epoch": 0.9016215147320545, + "mean_token_accuracy": 0.7260536551475525, + "num_tokens": 26662878.0, + "step": 9119, + "train/ce_loss": 1.0541210174560547 + }, + { + "epoch": 0.9016215147320545, + "step": 9119, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9016215147320545, + "step": 9119, + "train/total_loss": 0.136662095785141 + }, + { + "epoch": 0.9017203875815701, + "grad_norm": 0.7299725413322449, + "learning_rate": 7.747861345992189e-06, + "loss": 0.1259, + "step": 9120 + }, + { + "entropy": 8.333454132080078, + "epoch": 0.9017203875815701, + "mean_token_accuracy": 0.7875458002090454, + "num_tokens": 26668243.0, + "step": 9120, + "train/ce_loss": 0.542241632938385 + }, + { + "epoch": 0.9017203875815701, + "step": 9120, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9017203875815701, + "step": 9120, + "train/total_loss": 0.1089116632938385 + }, + { + "entropy": 8.740531921386719, + "epoch": 0.9018192604310856, + "mean_token_accuracy": 0.7414966225624084, + "num_tokens": 26673584.0, + "step": 9121, + "train/ce_loss": 0.5661129951477051 + }, + { + "epoch": 0.9018192604310856, + "step": 9121, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.9018192604310856, + "step": 9121, + "train/total_loss": 0.12301754951477051 + }, + { + "entropy": 8.611451148986816, + "epoch": 0.9019181332806011, + "mean_token_accuracy": 0.7427912354469299, + "num_tokens": 26678920.0, + "step": 9122, + "train/ce_loss": 1.0637965202331543 + }, + { + "epoch": 0.9019181332806011, + "step": 9122, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.9019181332806011, + "step": 9122, + "train/total_loss": 0.1415359079837799 + }, + { + "entropy": 9.500768661499023, + "epoch": 0.9020170061301167, + "mean_token_accuracy": 0.8245614171028137, + "num_tokens": 26683884.0, + "step": 9123, + "train/ce_loss": 8.357000638170575e-07 + }, + { + "epoch": 0.9020170061301167, + "step": 9123, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9020170061301167, + "step": 9123, + "train/total_loss": 0.050781331956386566 + }, + { + "entropy": 9.072831153869629, + "epoch": 0.9021158789796322, + "mean_token_accuracy": 0.7768240571022034, + "num_tokens": 26689055.0, + "step": 9124, + "train/ce_loss": 0.9636762142181396 + }, + { + "epoch": 0.9021158789796322, + "step": 9124, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9021158789796322, + "step": 9124, + "train/total_loss": 0.12761762738227844 + }, + { + "entropy": 8.837091445922852, + "epoch": 0.9022147518291477, + "mean_token_accuracy": 0.7314702272415161, + "num_tokens": 26694300.0, + "step": 9125, + "train/ce_loss": 0.9927276372909546 + }, + { + "epoch": 0.9022147518291477, + "step": 9125, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.9022147518291477, + "step": 9125, + "train/total_loss": 0.16958525776863098 + }, + { + "entropy": 9.130998611450195, + "epoch": 0.9023136246786633, + "mean_token_accuracy": 0.7519999742507935, + "num_tokens": 26699487.0, + "step": 9126, + "train/ce_loss": 1.4159026145935059 + }, + { + "epoch": 0.9023136246786633, + "step": 9126, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9023136246786633, + "step": 9126, + "train/total_loss": 0.19627776741981506 + }, + { + "entropy": 8.802294731140137, + "epoch": 0.9024124975281788, + "mean_token_accuracy": 0.7866848111152649, + "num_tokens": 26704706.0, + "step": 9127, + "train/ce_loss": 7.055182322801556e-07 + }, + { + "epoch": 0.9024124975281788, + "step": 9127, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9024124975281788, + "step": 9127, + "train/total_loss": 0.02343757078051567 + }, + { + "entropy": 9.004886627197266, + "epoch": 0.9025113703776942, + "mean_token_accuracy": 0.7532467246055603, + "num_tokens": 26709952.0, + "step": 9128, + "train/ce_loss": 0.871261715888977 + }, + { + "epoch": 0.9025113703776942, + "step": 9128, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9025113703776942, + "step": 9128, + "train/total_loss": 0.10665742307901382 + }, + { + "entropy": 8.931509017944336, + "epoch": 0.9026102432272098, + "mean_token_accuracy": 0.7090908885002136, + "num_tokens": 26715301.0, + "step": 9129, + "train/ce_loss": 0.673997700214386 + }, + { + "epoch": 0.9026102432272098, + "step": 9129, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.9026102432272098, + "step": 9129, + "train/total_loss": 0.1338060200214386 + }, + { + "entropy": 8.808271408081055, + "epoch": 0.9027091160767253, + "mean_token_accuracy": 0.8070422410964966, + "num_tokens": 26720477.0, + "step": 9130, + "train/ce_loss": 0.6568292379379272 + }, + { + "epoch": 0.9027091160767253, + "step": 9130, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.9027091160767253, + "step": 9130, + "train/total_loss": 0.12818291783332825 + }, + { + "entropy": 8.710620880126953, + "epoch": 0.9028079889262408, + "mean_token_accuracy": 0.7081589698791504, + "num_tokens": 26725899.0, + "step": 9131, + "train/ce_loss": 1.0490682125091553 + }, + { + "epoch": 0.9028079889262408, + "step": 9131, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.9028079889262408, + "step": 9131, + "train/total_loss": 0.17521932721138 + }, + { + "entropy": 8.956960678100586, + "epoch": 0.9029068617757564, + "mean_token_accuracy": 0.7303797602653503, + "num_tokens": 26731228.0, + "step": 9132, + "train/ce_loss": 1.0235652923583984 + }, + { + "epoch": 0.9029068617757564, + "step": 9132, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.9029068617757564, + "step": 9132, + "train/total_loss": 0.21954402327537537 + }, + { + "entropy": 9.084754943847656, + "epoch": 0.9030057346252719, + "mean_token_accuracy": 0.7411003112792969, + "num_tokens": 26736233.0, + "step": 9133, + "train/ce_loss": 0.7100213766098022 + }, + { + "epoch": 0.9030057346252719, + "step": 9133, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9030057346252719, + "step": 9133, + "train/total_loss": 0.09443964064121246 + }, + { + "entropy": 8.940507888793945, + "epoch": 0.9031046074747874, + "mean_token_accuracy": 0.7477341294288635, + "num_tokens": 26741328.0, + "step": 9134, + "train/ce_loss": 0.31841954588890076 + }, + { + "epoch": 0.9031046074747874, + "step": 9134, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9031046074747874, + "step": 9134, + "train/total_loss": 0.0865294560790062 + }, + { + "entropy": 8.825894355773926, + "epoch": 0.903203480324303, + "mean_token_accuracy": 0.6662983298301697, + "num_tokens": 26746719.0, + "step": 9135, + "train/ce_loss": 1.049531102180481 + }, + { + "epoch": 0.903203480324303, + "step": 9135, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.903203480324303, + "step": 9135, + "train/total_loss": 0.1596406102180481 + }, + { + "entropy": 8.965797424316406, + "epoch": 0.9033023531738185, + "mean_token_accuracy": 0.7156084775924683, + "num_tokens": 26751905.0, + "step": 9136, + "train/ce_loss": 1.2573646306991577 + }, + { + "epoch": 0.9033023531738185, + "step": 9136, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9033023531738185, + "step": 9136, + "train/total_loss": 0.17651771008968353 + }, + { + "entropy": 8.739272117614746, + "epoch": 0.903401226023334, + "mean_token_accuracy": 0.772357702255249, + "num_tokens": 26757181.0, + "step": 9137, + "train/ce_loss": 0.5272718071937561 + }, + { + "epoch": 0.903401226023334, + "step": 9137, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.903401226023334, + "step": 9137, + "train/total_loss": 0.09569592773914337 + }, + { + "entropy": 9.129547119140625, + "epoch": 0.9035000988728495, + "mean_token_accuracy": 0.6932408809661865, + "num_tokens": 26762223.0, + "step": 9138, + "train/ce_loss": 1.9463756084442139 + }, + { + "epoch": 0.9035000988728495, + "step": 9138, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.9035000988728495, + "step": 9138, + "train/total_loss": 0.25323131680488586 + }, + { + "entropy": 8.961040496826172, + "epoch": 0.903598971722365, + "mean_token_accuracy": 0.753947377204895, + "num_tokens": 26767426.0, + "step": 9139, + "train/ce_loss": 0.9590904712677002 + }, + { + "epoch": 0.903598971722365, + "step": 9139, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.903598971722365, + "step": 9139, + "train/total_loss": 0.19747155904769897 + }, + { + "epoch": 0.9036978445718805, + "grad_norm": 0.5847175121307373, + "learning_rate": 7.742916481234238e-06, + "loss": 0.1273, + "step": 9140 + }, + { + "entropy": 8.927392959594727, + "epoch": 0.9036978445718805, + "mean_token_accuracy": 0.7094972133636475, + "num_tokens": 26772608.0, + "step": 9140, + "train/ce_loss": 3.629513116720773e-07 + }, + { + "epoch": 0.9036978445718805, + "step": 9140, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9036978445718805, + "step": 9140, + "train/total_loss": 0.015625035390257835 + }, + { + "entropy": 8.195661544799805, + "epoch": 0.9037967174213961, + "mean_token_accuracy": 0.7666666507720947, + "num_tokens": 26778244.0, + "step": 9141, + "train/ce_loss": 0.8570559024810791 + }, + { + "epoch": 0.9037967174213961, + "step": 9141, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9037967174213961, + "step": 9141, + "train/total_loss": 0.10914309322834015 + }, + { + "entropy": 8.750200271606445, + "epoch": 0.9038955902709116, + "mean_token_accuracy": 0.7545344829559326, + "num_tokens": 26783477.0, + "step": 9142, + "train/ce_loss": 0.7238687872886658 + }, + { + "epoch": 0.9038955902709116, + "step": 9142, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9038955902709116, + "step": 9142, + "train/total_loss": 0.12707439064979553 + }, + { + "entropy": 9.928706169128418, + "epoch": 0.9039944631204271, + "mean_token_accuracy": 0.841269850730896, + "num_tokens": 26788083.0, + "step": 9143, + "train/ce_loss": 1.1201769893887104e-06 + }, + { + "epoch": 0.9039944631204271, + "step": 9143, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9039944631204271, + "step": 9143, + "train/total_loss": 0.015625111758708954 + }, + { + "entropy": 8.867383003234863, + "epoch": 0.9040933359699427, + "mean_token_accuracy": 0.6898638606071472, + "num_tokens": 26793199.0, + "step": 9144, + "train/ce_loss": 1.7946258594747633e-06 + }, + { + "epoch": 0.9040933359699427, + "step": 9144, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9040933359699427, + "step": 9144, + "train/total_loss": 0.039062678813934326 + }, + { + "entropy": 8.619510650634766, + "epoch": 0.9041922088194582, + "mean_token_accuracy": 0.803983211517334, + "num_tokens": 26798589.0, + "step": 9145, + "train/ce_loss": 0.8176717758178711 + }, + { + "epoch": 0.9041922088194582, + "step": 9145, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9041922088194582, + "step": 9145, + "train/total_loss": 0.12473592907190323 + }, + { + "entropy": 9.34872817993164, + "epoch": 0.9042910816689737, + "mean_token_accuracy": 0.7686274647712708, + "num_tokens": 26803531.0, + "step": 9146, + "train/ce_loss": 9.471125963500526e-07 + }, + { + "epoch": 0.9042910816689737, + "step": 9146, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.9042910816689737, + "step": 9146, + "train/total_loss": 0.03515634313225746 + }, + { + "entropy": 8.584484100341797, + "epoch": 0.9043899545184892, + "mean_token_accuracy": 0.7789968848228455, + "num_tokens": 26808715.0, + "step": 9147, + "train/ce_loss": 1.163509726524353 + }, + { + "epoch": 0.9043899545184892, + "step": 9147, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.9043899545184892, + "step": 9147, + "train/total_loss": 0.18666347861289978 + }, + { + "entropy": 9.282931327819824, + "epoch": 0.9044888273680047, + "mean_token_accuracy": 0.6739864945411682, + "num_tokens": 26813759.0, + "step": 9148, + "train/ce_loss": 1.4836020469665527 + }, + { + "epoch": 0.9044888273680047, + "step": 9148, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.9044888273680047, + "step": 9148, + "train/total_loss": 0.2342977076768875 + }, + { + "entropy": 8.913932800292969, + "epoch": 0.9045877002175202, + "mean_token_accuracy": 0.747035562992096, + "num_tokens": 26819024.0, + "step": 9149, + "train/ce_loss": 0.3981608748435974 + }, + { + "epoch": 0.9045877002175202, + "step": 9149, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.9045877002175202, + "step": 9149, + "train/total_loss": 0.13356608152389526 + }, + { + "entropy": 8.687820434570312, + "epoch": 0.9046865730670358, + "mean_token_accuracy": 0.7634803652763367, + "num_tokens": 26824330.0, + "step": 9150, + "train/ce_loss": 0.4469693601131439 + }, + { + "epoch": 0.9046865730670358, + "step": 9150, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.9046865730670358, + "step": 9150, + "train/total_loss": 0.07985319197177887 + }, + { + "entropy": 8.851806640625, + "epoch": 0.9047854459165513, + "mean_token_accuracy": 0.7380627393722534, + "num_tokens": 26829470.0, + "step": 9151, + "train/ce_loss": 1.8865202378037793e-07 + }, + { + "epoch": 0.9047854459165513, + "step": 9151, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9047854459165513, + "step": 9151, + "train/total_loss": 0.015625018626451492 + }, + { + "entropy": 8.814598083496094, + "epoch": 0.9048843187660668, + "mean_token_accuracy": 0.7309321761131287, + "num_tokens": 26835016.0, + "step": 9152, + "train/ce_loss": 0.8167648315429688 + }, + { + "epoch": 0.9048843187660668, + "step": 9152, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9048843187660668, + "step": 9152, + "train/total_loss": 0.11292648315429688 + }, + { + "entropy": 9.176883697509766, + "epoch": 0.9049831916155824, + "mean_token_accuracy": 0.7152209281921387, + "num_tokens": 26840035.0, + "step": 9153, + "train/ce_loss": 0.8641502261161804 + }, + { + "epoch": 0.9049831916155824, + "step": 9153, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9049831916155824, + "step": 9153, + "train/total_loss": 0.12547752261161804 + }, + { + "entropy": 8.835829734802246, + "epoch": 0.9050820644650979, + "mean_token_accuracy": 0.7415881752967834, + "num_tokens": 26845249.0, + "step": 9154, + "train/ce_loss": 0.39165768027305603 + }, + { + "epoch": 0.9050820644650979, + "step": 9154, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9050820644650979, + "step": 9154, + "train/total_loss": 0.08604076504707336 + }, + { + "entropy": 8.979759216308594, + "epoch": 0.9051809373146134, + "mean_token_accuracy": 0.7575757503509521, + "num_tokens": 26850299.0, + "step": 9155, + "train/ce_loss": 1.1635652780532837 + }, + { + "epoch": 0.9051809373146134, + "step": 9155, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.9051809373146134, + "step": 9155, + "train/total_loss": 0.1944815218448639 + }, + { + "entropy": 9.043808937072754, + "epoch": 0.905279810164129, + "mean_token_accuracy": 0.7861635088920593, + "num_tokens": 26855454.0, + "step": 9156, + "train/ce_loss": 0.6003595590591431 + }, + { + "epoch": 0.905279810164129, + "step": 9156, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.905279810164129, + "step": 9156, + "train/total_loss": 0.09909845888614655 + }, + { + "entropy": 8.320194244384766, + "epoch": 0.9053786830136444, + "mean_token_accuracy": 0.7412678003311157, + "num_tokens": 26860710.0, + "step": 9157, + "train/ce_loss": 0.7466564178466797 + }, + { + "epoch": 0.9053786830136444, + "step": 9157, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.9053786830136444, + "step": 9157, + "train/total_loss": 0.1410718858242035 + }, + { + "entropy": 8.66702651977539, + "epoch": 0.9054775558631599, + "mean_token_accuracy": 0.76171875, + "num_tokens": 26865975.0, + "step": 9158, + "train/ce_loss": 0.6458284854888916 + }, + { + "epoch": 0.9054775558631599, + "step": 9158, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9054775558631599, + "step": 9158, + "train/total_loss": 0.08802034705877304 + }, + { + "entropy": 8.560710906982422, + "epoch": 0.9055764287126755, + "mean_token_accuracy": 0.7720670104026794, + "num_tokens": 26871375.0, + "step": 9159, + "train/ce_loss": 0.32802441716194153 + }, + { + "epoch": 0.9055764287126755, + "step": 9159, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9055764287126755, + "step": 9159, + "train/total_loss": 0.06405244767665863 + }, + { + "epoch": 0.905675301562191, + "grad_norm": 0.6260942816734314, + "learning_rate": 7.73797161647629e-06, + "loss": 0.1289, + "step": 9160 + }, + { + "entropy": 8.676794052124023, + "epoch": 0.905675301562191, + "mean_token_accuracy": 0.7962154150009155, + "num_tokens": 26876558.0, + "step": 9160, + "train/ce_loss": 0.5289694666862488 + }, + { + "epoch": 0.905675301562191, + "step": 9160, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.905675301562191, + "step": 9160, + "train/total_loss": 0.11930319666862488 + }, + { + "entropy": 9.213626861572266, + "epoch": 0.9057741744117066, + "mean_token_accuracy": 0.742671012878418, + "num_tokens": 26881760.0, + "step": 9161, + "train/ce_loss": 1.0828567743301392 + }, + { + "epoch": 0.9057741744117066, + "step": 9161, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9057741744117066, + "step": 9161, + "train/total_loss": 0.16297318041324615 + }, + { + "entropy": 8.690057754516602, + "epoch": 0.9058730472612221, + "mean_token_accuracy": 0.7410604357719421, + "num_tokens": 26887077.0, + "step": 9162, + "train/ce_loss": 1.0695013999938965 + }, + { + "epoch": 0.9058730472612221, + "step": 9162, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.9058730472612221, + "step": 9162, + "train/total_loss": 0.17726263403892517 + }, + { + "entropy": 8.557741165161133, + "epoch": 0.9059719201107376, + "mean_token_accuracy": 0.7709497213363647, + "num_tokens": 26892280.0, + "step": 9163, + "train/ce_loss": 0.44319382309913635 + }, + { + "epoch": 0.9059719201107376, + "step": 9163, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9059719201107376, + "step": 9163, + "train/total_loss": 0.09119438380002975 + }, + { + "entropy": 8.492541313171387, + "epoch": 0.9060707929602532, + "mean_token_accuracy": 0.6945652365684509, + "num_tokens": 26897639.0, + "step": 9164, + "train/ce_loss": 1.1665364503860474 + }, + { + "epoch": 0.9060707929602532, + "step": 9164, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.9060707929602532, + "step": 9164, + "train/total_loss": 0.2338411509990692 + }, + { + "entropy": 8.92304801940918, + "epoch": 0.9061696658097687, + "mean_token_accuracy": 0.7109588980674744, + "num_tokens": 26902832.0, + "step": 9165, + "train/ce_loss": 0.8163487911224365 + }, + { + "epoch": 0.9061696658097687, + "step": 9165, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.9061696658097687, + "step": 9165, + "train/total_loss": 0.11679112911224365 + }, + { + "entropy": 9.129404067993164, + "epoch": 0.9062685386592841, + "mean_token_accuracy": 0.7043918967247009, + "num_tokens": 26907859.0, + "step": 9166, + "train/ce_loss": 0.8976324796676636 + }, + { + "epoch": 0.9062685386592841, + "step": 9166, + "train/sim_loss": 0.11328125 + }, + { + "epoch": 0.9062685386592841, + "step": 9166, + "train/total_loss": 0.20304450392723083 + }, + { + "entropy": 9.423097610473633, + "epoch": 0.9063674115087997, + "mean_token_accuracy": 0.8117870688438416, + "num_tokens": 26912793.0, + "step": 9167, + "train/ce_loss": 2.8646294936152117e-07 + }, + { + "epoch": 0.9063674115087997, + "step": 9167, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9063674115087997, + "step": 9167, + "train/total_loss": 0.01562502793967724 + }, + { + "entropy": 8.839334487915039, + "epoch": 0.9064662843583152, + "mean_token_accuracy": 0.7293844223022461, + "num_tokens": 26918139.0, + "step": 9168, + "train/ce_loss": 1.317361831665039 + }, + { + "epoch": 0.9064662843583152, + "step": 9168, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.9064662843583152, + "step": 9168, + "train/total_loss": 0.2606424391269684 + }, + { + "entropy": 8.991203308105469, + "epoch": 0.9065651572078307, + "mean_token_accuracy": 0.7046070694923401, + "num_tokens": 26923334.0, + "step": 9169, + "train/ce_loss": 0.9669110774993896 + }, + { + "epoch": 0.9065651572078307, + "step": 9169, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.9065651572078307, + "step": 9169, + "train/total_loss": 0.1552848517894745 + }, + { + "entropy": 8.864419937133789, + "epoch": 0.9066640300573463, + "mean_token_accuracy": 0.8049853444099426, + "num_tokens": 26928485.0, + "step": 9170, + "train/ce_loss": 1.5716816506028408e-06 + }, + { + "epoch": 0.9066640300573463, + "step": 9170, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9066640300573463, + "step": 9170, + "train/total_loss": 0.039062656462192535 + }, + { + "entropy": 8.876815795898438, + "epoch": 0.9067629029068618, + "mean_token_accuracy": 0.732467532157898, + "num_tokens": 26933708.0, + "step": 9171, + "train/ce_loss": 7.270077730936464e-07 + }, + { + "epoch": 0.9067629029068618, + "step": 9171, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.9067629029068618, + "step": 9171, + "train/total_loss": 0.06250007450580597 + }, + { + "entropy": 8.742498397827148, + "epoch": 0.9068617757563773, + "mean_token_accuracy": 0.7655718922615051, + "num_tokens": 26939038.0, + "step": 9172, + "train/ce_loss": 0.66084885597229 + }, + { + "epoch": 0.9068617757563773, + "step": 9172, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.9068617757563773, + "step": 9172, + "train/total_loss": 0.13639739155769348 + }, + { + "entropy": 8.657337188720703, + "epoch": 0.9069606486058929, + "mean_token_accuracy": 0.7401197552680969, + "num_tokens": 26944377.0, + "step": 9173, + "train/ce_loss": 0.8242636322975159 + }, + { + "epoch": 0.9069606486058929, + "step": 9173, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9069606486058929, + "step": 9173, + "train/total_loss": 0.11367636173963547 + }, + { + "entropy": 8.706947326660156, + "epoch": 0.9070595214554084, + "mean_token_accuracy": 0.7205039858818054, + "num_tokens": 26949694.0, + "step": 9174, + "train/ce_loss": 1.2270557880401611 + }, + { + "epoch": 0.9070595214554084, + "step": 9174, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.9070595214554084, + "step": 9174, + "train/total_loss": 0.1891118288040161 + }, + { + "entropy": 8.381051063537598, + "epoch": 0.9071583943049238, + "mean_token_accuracy": 0.7837281227111816, + "num_tokens": 26955155.0, + "step": 9175, + "train/ce_loss": 0.6724500060081482 + }, + { + "epoch": 0.9071583943049238, + "step": 9175, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9071583943049238, + "step": 9175, + "train/total_loss": 0.1141199991106987 + }, + { + "entropy": 8.379070281982422, + "epoch": 0.9072572671544394, + "mean_token_accuracy": 0.7655755281448364, + "num_tokens": 26960616.0, + "step": 9176, + "train/ce_loss": 0.6805732250213623 + }, + { + "epoch": 0.9072572671544394, + "step": 9176, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9072572671544394, + "step": 9176, + "train/total_loss": 0.08758857101202011 + }, + { + "entropy": 9.151988983154297, + "epoch": 0.9073561400039549, + "mean_token_accuracy": 0.6783999800682068, + "num_tokens": 26965728.0, + "step": 9177, + "train/ce_loss": 1.9155022528138943e-06 + }, + { + "epoch": 0.9073561400039549, + "step": 9177, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9073561400039549, + "step": 9177, + "train/total_loss": 0.05468768998980522 + }, + { + "entropy": 9.601924896240234, + "epoch": 0.9074550128534704, + "mean_token_accuracy": 0.7032967209815979, + "num_tokens": 26970773.0, + "step": 9178, + "train/ce_loss": 1.1842122376037878e-06 + }, + { + "epoch": 0.9074550128534704, + "step": 9178, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.9074550128534704, + "step": 9178, + "train/total_loss": 0.06250011920928955 + }, + { + "entropy": 8.63504409790039, + "epoch": 0.907553885702986, + "mean_token_accuracy": 0.7269076108932495, + "num_tokens": 26975983.0, + "step": 9179, + "train/ce_loss": 1.1727397441864014 + }, + { + "epoch": 0.907553885702986, + "step": 9179, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.907553885702986, + "step": 9179, + "train/total_loss": 0.1602427363395691 + }, + { + "epoch": 0.9076527585525015, + "grad_norm": 0.6241506338119507, + "learning_rate": 7.73302675171834e-06, + "loss": 0.1372, + "step": 9180 + }, + { + "entropy": 8.744926452636719, + "epoch": 0.9076527585525015, + "mean_token_accuracy": 0.7477124333381653, + "num_tokens": 26981181.0, + "step": 9180, + "train/ce_loss": 0.494517058134079 + }, + { + "epoch": 0.9076527585525015, + "step": 9180, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.9076527585525015, + "step": 9180, + "train/total_loss": 0.10804545879364014 + }, + { + "entropy": 8.447515487670898, + "epoch": 0.907751631402017, + "mean_token_accuracy": 0.7246073484420776, + "num_tokens": 26986567.0, + "step": 9181, + "train/ce_loss": 0.8649625182151794 + }, + { + "epoch": 0.907751631402017, + "step": 9181, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.907751631402017, + "step": 9181, + "train/total_loss": 0.1294650137424469 + }, + { + "entropy": 8.460390090942383, + "epoch": 0.9078505042515326, + "mean_token_accuracy": 0.7459839582443237, + "num_tokens": 26991997.0, + "step": 9182, + "train/ce_loss": 0.8686043620109558 + }, + { + "epoch": 0.9078505042515326, + "step": 9182, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.9078505042515326, + "step": 9182, + "train/total_loss": 0.15717294812202454 + }, + { + "entropy": 9.152366638183594, + "epoch": 0.9079493771010481, + "mean_token_accuracy": 0.7311828136444092, + "num_tokens": 26997101.0, + "step": 9183, + "train/ce_loss": 7.624874456269026e-07 + }, + { + "epoch": 0.9079493771010481, + "step": 9183, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9079493771010481, + "step": 9183, + "train/total_loss": 0.04687507450580597 + }, + { + "entropy": 9.098976135253906, + "epoch": 0.9080482499505635, + "mean_token_accuracy": 0.7281553149223328, + "num_tokens": 27002238.0, + "step": 9184, + "train/ce_loss": 1.229797124862671 + }, + { + "epoch": 0.9080482499505635, + "step": 9184, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9080482499505635, + "step": 9184, + "train/total_loss": 0.16204221546649933 + }, + { + "entropy": 8.807500839233398, + "epoch": 0.9081471228000791, + "mean_token_accuracy": 0.7618438005447388, + "num_tokens": 27007489.0, + "step": 9185, + "train/ce_loss": 2.1185859111483296e-07 + }, + { + "epoch": 0.9081471228000791, + "step": 9185, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9081471228000791, + "step": 9185, + "train/total_loss": 0.01562502048909664 + }, + { + "entropy": 8.601675033569336, + "epoch": 0.9082459956495946, + "mean_token_accuracy": 0.7455310225486755, + "num_tokens": 27012864.0, + "step": 9186, + "train/ce_loss": 0.5795109272003174 + }, + { + "epoch": 0.9082459956495946, + "step": 9186, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9082459956495946, + "step": 9186, + "train/total_loss": 0.10091984272003174 + }, + { + "entropy": 8.90341567993164, + "epoch": 0.9083448684991101, + "mean_token_accuracy": 0.7690447568893433, + "num_tokens": 27018125.0, + "step": 9187, + "train/ce_loss": 0.8771971464157104 + }, + { + "epoch": 0.9083448684991101, + "step": 9187, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.9083448684991101, + "step": 9187, + "train/total_loss": 0.18928220868110657 + }, + { + "entropy": 8.82425594329834, + "epoch": 0.9084437413486257, + "mean_token_accuracy": 0.7973778247833252, + "num_tokens": 27023453.0, + "step": 9188, + "train/ce_loss": 0.5918988585472107 + }, + { + "epoch": 0.9084437413486257, + "step": 9188, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9084437413486257, + "step": 9188, + "train/total_loss": 0.10215863585472107 + }, + { + "entropy": 9.128678321838379, + "epoch": 0.9085426141981412, + "mean_token_accuracy": 0.7230769395828247, + "num_tokens": 27028529.0, + "step": 9189, + "train/ce_loss": 0.9039449095726013 + }, + { + "epoch": 0.9085426141981412, + "step": 9189, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9085426141981412, + "step": 9189, + "train/total_loss": 0.1294569969177246 + }, + { + "entropy": 8.939802169799805, + "epoch": 0.9086414870476567, + "mean_token_accuracy": 0.7655259966850281, + "num_tokens": 27033766.0, + "step": 9190, + "train/ce_loss": 0.656508207321167 + }, + { + "epoch": 0.9086414870476567, + "step": 9190, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9086414870476567, + "step": 9190, + "train/total_loss": 0.0969008207321167 + }, + { + "entropy": 9.163475036621094, + "epoch": 0.9087403598971723, + "mean_token_accuracy": 0.7784810066223145, + "num_tokens": 27038851.0, + "step": 9191, + "train/ce_loss": 3.538524424584466e-07 + }, + { + "epoch": 0.9087403598971723, + "step": 9191, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9087403598971723, + "step": 9191, + "train/total_loss": 0.015625035390257835 + }, + { + "entropy": 9.445109367370605, + "epoch": 0.9088392327466878, + "mean_token_accuracy": 0.7329192757606506, + "num_tokens": 27043810.0, + "step": 9192, + "train/ce_loss": 1.2226264476776123 + }, + { + "epoch": 0.9088392327466878, + "step": 9192, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.9088392327466878, + "step": 9192, + "train/total_loss": 0.21991890668869019 + }, + { + "entropy": 9.054713249206543, + "epoch": 0.9089381055962032, + "mean_token_accuracy": 0.7955145239830017, + "num_tokens": 27049054.0, + "step": 9193, + "train/ce_loss": 1.1977680921554565 + }, + { + "epoch": 0.9089381055962032, + "step": 9193, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.9089381055962032, + "step": 9193, + "train/total_loss": 0.20962056517601013 + }, + { + "entropy": 8.675012588500977, + "epoch": 0.9090369784457188, + "mean_token_accuracy": 0.7595212459564209, + "num_tokens": 27054398.0, + "step": 9194, + "train/ce_loss": 1.0104807615280151 + }, + { + "epoch": 0.9090369784457188, + "step": 9194, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9090369784457188, + "step": 9194, + "train/total_loss": 0.151829332113266 + }, + { + "entropy": 9.217578887939453, + "epoch": 0.9091358512952343, + "mean_token_accuracy": 0.7096296548843384, + "num_tokens": 27059540.0, + "step": 9195, + "train/ce_loss": 1.2349307537078857 + }, + { + "epoch": 0.9091358512952343, + "step": 9195, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9091358512952343, + "step": 9195, + "train/total_loss": 0.15474307537078857 + }, + { + "entropy": 9.376506805419922, + "epoch": 0.9092347241447498, + "mean_token_accuracy": 0.7658802270889282, + "num_tokens": 27064537.0, + "step": 9196, + "train/ce_loss": 1.2885411706520244e-06 + }, + { + "epoch": 0.9092347241447498, + "step": 9196, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9092347241447498, + "step": 9196, + "train/total_loss": 0.046875130385160446 + }, + { + "entropy": 8.676100730895996, + "epoch": 0.9093335969942654, + "mean_token_accuracy": 0.7641752362251282, + "num_tokens": 27069753.0, + "step": 9197, + "train/ce_loss": 0.9406799674034119 + }, + { + "epoch": 0.9093335969942654, + "step": 9197, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9093335969942654, + "step": 9197, + "train/total_loss": 0.1331304907798767 + }, + { + "entropy": 10.059473991394043, + "epoch": 0.9094324698437809, + "mean_token_accuracy": 0.7715736031532288, + "num_tokens": 27074306.0, + "step": 9198, + "train/ce_loss": 1.067428115675284e-06 + }, + { + "epoch": 0.9094324698437809, + "step": 9198, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9094324698437809, + "step": 9198, + "train/total_loss": 0.015625106170773506 + }, + { + "entropy": 9.032780647277832, + "epoch": 0.9095313426932964, + "mean_token_accuracy": 0.7309486865997314, + "num_tokens": 27079421.0, + "step": 9199, + "train/ce_loss": 0.7582253217697144 + }, + { + "epoch": 0.9095313426932964, + "step": 9199, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9095313426932964, + "step": 9199, + "train/total_loss": 0.12660378217697144 + }, + { + "epoch": 0.909630215542812, + "grad_norm": 0.6809642910957336, + "learning_rate": 7.728081886960393e-06, + "loss": 0.1245, + "step": 9200 + }, + { + "entropy": 9.335073471069336, + "epoch": 0.909630215542812, + "mean_token_accuracy": 0.7510121464729309, + "num_tokens": 27084389.0, + "step": 9200, + "train/ce_loss": 1.4134788513183594 + }, + { + "epoch": 0.909630215542812, + "step": 9200, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.909630215542812, + "step": 9200, + "train/total_loss": 0.20775413513183594 + }, + { + "entropy": 9.247215270996094, + "epoch": 0.9097290883923275, + "mean_token_accuracy": 0.726396918296814, + "num_tokens": 27089343.0, + "step": 9201, + "train/ce_loss": 1.3582441806793213 + }, + { + "epoch": 0.9097290883923275, + "step": 9201, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9097290883923275, + "step": 9201, + "train/total_loss": 0.18660567700862885 + }, + { + "entropy": 9.34528923034668, + "epoch": 0.909827961241843, + "mean_token_accuracy": 0.787564754486084, + "num_tokens": 27094157.0, + "step": 9202, + "train/ce_loss": 1.572838544845581 + }, + { + "epoch": 0.909827961241843, + "step": 9202, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.909827961241843, + "step": 9202, + "train/total_loss": 0.21197135746479034 + }, + { + "entropy": 9.446272850036621, + "epoch": 0.9099268340913585, + "mean_token_accuracy": 0.7732793688774109, + "num_tokens": 27099110.0, + "step": 9203, + "train/ce_loss": 0.8522922992706299 + }, + { + "epoch": 0.9099268340913585, + "step": 9203, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.9099268340913585, + "step": 9203, + "train/total_loss": 0.16726048290729523 + }, + { + "entropy": 8.76350212097168, + "epoch": 0.910025706940874, + "mean_token_accuracy": 0.740618109703064, + "num_tokens": 27104477.0, + "step": 9204, + "train/ce_loss": 1.0039551258087158 + }, + { + "epoch": 0.910025706940874, + "step": 9204, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.910025706940874, + "step": 9204, + "train/total_loss": 0.11602051556110382 + }, + { + "entropy": 8.420951843261719, + "epoch": 0.9101245797903895, + "mean_token_accuracy": 0.784009575843811, + "num_tokens": 27109800.0, + "step": 9205, + "train/ce_loss": 0.7582249045372009 + }, + { + "epoch": 0.9101245797903895, + "step": 9205, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9101245797903895, + "step": 9205, + "train/total_loss": 0.09535374492406845 + }, + { + "entropy": 9.049461364746094, + "epoch": 0.9102234526399051, + "mean_token_accuracy": 0.7334410548210144, + "num_tokens": 27114884.0, + "step": 9206, + "train/ce_loss": 0.9187730550765991 + }, + { + "epoch": 0.9102234526399051, + "step": 9206, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9102234526399051, + "step": 9206, + "train/total_loss": 0.11531480401754379 + }, + { + "entropy": 8.63083267211914, + "epoch": 0.9103223254894206, + "mean_token_accuracy": 0.7093023061752319, + "num_tokens": 27120259.0, + "step": 9207, + "train/ce_loss": 0.7709442377090454 + }, + { + "epoch": 0.9103223254894206, + "step": 9207, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9103223254894206, + "step": 9207, + "train/total_loss": 0.1317819356918335 + }, + { + "entropy": 8.907642364501953, + "epoch": 0.9104211983389361, + "mean_token_accuracy": 0.7167182564735413, + "num_tokens": 27125329.0, + "step": 9208, + "train/ce_loss": 1.4443556070327759 + }, + { + "epoch": 0.9104211983389361, + "step": 9208, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.9104211983389361, + "step": 9208, + "train/total_loss": 0.2147480696439743 + }, + { + "entropy": 8.556644439697266, + "epoch": 0.9105200711884517, + "mean_token_accuracy": 0.7161290049552917, + "num_tokens": 27130716.0, + "step": 9209, + "train/ce_loss": 1.3037917613983154 + }, + { + "epoch": 0.9105200711884517, + "step": 9209, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9105200711884517, + "step": 9209, + "train/total_loss": 0.17725418508052826 + }, + { + "entropy": 9.725595474243164, + "epoch": 0.9106189440379672, + "mean_token_accuracy": 0.9027777910232544, + "num_tokens": 27135372.0, + "step": 9210, + "train/ce_loss": 1.4040017504157731e-06 + }, + { + "epoch": 0.9106189440379672, + "step": 9210, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.9106189440379672, + "step": 9210, + "train/total_loss": 0.06250014156103134 + }, + { + "entropy": 8.989730834960938, + "epoch": 0.9107178168874827, + "mean_token_accuracy": 0.7469879388809204, + "num_tokens": 27140475.0, + "step": 9211, + "train/ce_loss": 1.525792121887207 + }, + { + "epoch": 0.9107178168874827, + "step": 9211, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9107178168874827, + "step": 9211, + "train/total_loss": 0.19554796814918518 + }, + { + "entropy": 8.68646240234375, + "epoch": 0.9108166897369983, + "mean_token_accuracy": 0.732467532157898, + "num_tokens": 27145687.0, + "step": 9212, + "train/ce_loss": 1.2112396955490112 + }, + { + "epoch": 0.9108166897369983, + "step": 9212, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.9108166897369983, + "step": 9212, + "train/total_loss": 0.18753021955490112 + }, + { + "entropy": 8.59811782836914, + "epoch": 0.9109155625865137, + "mean_token_accuracy": 0.7418655157089233, + "num_tokens": 27151078.0, + "step": 9213, + "train/ce_loss": 0.6997737884521484 + }, + { + "epoch": 0.9109155625865137, + "step": 9213, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9109155625865137, + "step": 9213, + "train/total_loss": 0.09341488033533096 + }, + { + "entropy": 8.52330207824707, + "epoch": 0.9110144354360292, + "mean_token_accuracy": 0.7364583611488342, + "num_tokens": 27156531.0, + "step": 9214, + "train/ce_loss": 0.8881973028182983 + }, + { + "epoch": 0.9110144354360292, + "step": 9214, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.9110144354360292, + "step": 9214, + "train/total_loss": 0.1513197422027588 + }, + { + "entropy": 9.369311332702637, + "epoch": 0.9111133082855448, + "mean_token_accuracy": 0.7292490005493164, + "num_tokens": 27161463.0, + "step": 9215, + "train/ce_loss": 4.153951522312127e-06 + }, + { + "epoch": 0.9111133082855448, + "step": 9215, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.9111133082855448, + "step": 9215, + "train/total_loss": 0.05859416723251343 + }, + { + "entropy": 8.633882522583008, + "epoch": 0.9112121811350603, + "mean_token_accuracy": 0.7447513937950134, + "num_tokens": 27166985.0, + "step": 9216, + "train/ce_loss": 0.7126514911651611 + }, + { + "epoch": 0.9112121811350603, + "step": 9216, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9112121811350603, + "step": 9216, + "train/total_loss": 0.10251515358686447 + }, + { + "entropy": 8.838998794555664, + "epoch": 0.9113110539845758, + "mean_token_accuracy": 0.7244623899459839, + "num_tokens": 27172206.0, + "step": 9217, + "train/ce_loss": 0.6789557933807373 + }, + { + "epoch": 0.9113110539845758, + "step": 9217, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9113110539845758, + "step": 9217, + "train/total_loss": 0.09523933380842209 + }, + { + "entropy": 9.317330360412598, + "epoch": 0.9114099268340914, + "mean_token_accuracy": 0.7415094375610352, + "num_tokens": 27177146.0, + "step": 9218, + "train/ce_loss": 1.1412620544433594 + }, + { + "epoch": 0.9114099268340914, + "step": 9218, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.9114099268340914, + "step": 9218, + "train/total_loss": 0.14928245544433594 + }, + { + "entropy": 8.91617202758789, + "epoch": 0.9115087996836069, + "mean_token_accuracy": 0.6881889700889587, + "num_tokens": 27182261.0, + "step": 9219, + "train/ce_loss": 1.661460041999817 + }, + { + "epoch": 0.9115087996836069, + "step": 9219, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9115087996836069, + "step": 9219, + "train/total_loss": 0.22083351016044617 + }, + { + "epoch": 0.9116076725331224, + "grad_norm": 0.727852463722229, + "learning_rate": 7.723137022202443e-06, + "loss": 0.1244, + "step": 9220 + }, + { + "entropy": 9.493191719055176, + "epoch": 0.9116076725331224, + "mean_token_accuracy": 0.7830578684806824, + "num_tokens": 27187157.0, + "step": 9220, + "train/ce_loss": 1.112162709236145 + }, + { + "epoch": 0.9116076725331224, + "step": 9220, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.9116076725331224, + "step": 9220, + "train/total_loss": 0.14637252688407898 + }, + { + "entropy": 8.813331604003906, + "epoch": 0.911706545382638, + "mean_token_accuracy": 0.7963483333587646, + "num_tokens": 27192336.0, + "step": 9221, + "train/ce_loss": 0.5560468435287476 + }, + { + "epoch": 0.911706545382638, + "step": 9221, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.911706545382638, + "step": 9221, + "train/total_loss": 0.07122968137264252 + }, + { + "entropy": 8.88023567199707, + "epoch": 0.9118054182321534, + "mean_token_accuracy": 0.7069988250732422, + "num_tokens": 27197600.0, + "step": 9222, + "train/ce_loss": 1.0730652809143066 + }, + { + "epoch": 0.9118054182321534, + "step": 9222, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9118054182321534, + "step": 9222, + "train/total_loss": 0.16199404001235962 + }, + { + "entropy": 8.567594528198242, + "epoch": 0.9119042910816689, + "mean_token_accuracy": 0.7450058460235596, + "num_tokens": 27202955.0, + "step": 9223, + "train/ce_loss": 0.6554208397865295 + }, + { + "epoch": 0.9119042910816689, + "step": 9223, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.9119042910816689, + "step": 9223, + "train/total_loss": 0.1553858369588852 + }, + { + "entropy": 8.949714660644531, + "epoch": 0.9120031639311845, + "mean_token_accuracy": 0.7708333134651184, + "num_tokens": 27208181.0, + "step": 9224, + "train/ce_loss": 0.5297962427139282 + }, + { + "epoch": 0.9120031639311845, + "step": 9224, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.9120031639311845, + "step": 9224, + "train/total_loss": 0.08813587576150894 + }, + { + "entropy": 9.567924499511719, + "epoch": 0.9121020367807, + "mean_token_accuracy": 0.7350649237632751, + "num_tokens": 27212987.0, + "step": 9225, + "train/ce_loss": 3.7651375350833405e-07 + }, + { + "epoch": 0.9121020367807, + "step": 9225, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9121020367807, + "step": 9225, + "train/total_loss": 0.015625037252902985 + }, + { + "entropy": 8.928082466125488, + "epoch": 0.9122009096302155, + "mean_token_accuracy": 0.6947040557861328, + "num_tokens": 27218082.0, + "step": 9226, + "train/ce_loss": 0.8295297622680664 + }, + { + "epoch": 0.9122009096302155, + "step": 9226, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.9122009096302155, + "step": 9226, + "train/total_loss": 0.17279672622680664 + }, + { + "entropy": 8.772733688354492, + "epoch": 0.9122997824797311, + "mean_token_accuracy": 0.734375, + "num_tokens": 27223286.0, + "step": 9227, + "train/ce_loss": 0.479086697101593 + }, + { + "epoch": 0.9122997824797311, + "step": 9227, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9122997824797311, + "step": 9227, + "train/total_loss": 0.08697117120027542 + }, + { + "entropy": 8.668851852416992, + "epoch": 0.9123986553292466, + "mean_token_accuracy": 0.7693236470222473, + "num_tokens": 27228540.0, + "step": 9228, + "train/ce_loss": 0.6925671100616455 + }, + { + "epoch": 0.9123986553292466, + "step": 9228, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9123986553292466, + "step": 9228, + "train/total_loss": 0.08488171547651291 + }, + { + "entropy": 8.739944458007812, + "epoch": 0.9124975281787621, + "mean_token_accuracy": 0.7110519409179688, + "num_tokens": 27233732.0, + "step": 9229, + "train/ce_loss": 0.7571595907211304 + }, + { + "epoch": 0.9124975281787621, + "step": 9229, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9124975281787621, + "step": 9229, + "train/total_loss": 0.13040345907211304 + }, + { + "entropy": 8.30958366394043, + "epoch": 0.9125964010282777, + "mean_token_accuracy": 0.7227227091789246, + "num_tokens": 27239173.0, + "step": 9230, + "train/ce_loss": 1.288848638534546 + }, + { + "epoch": 0.9125964010282777, + "step": 9230, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.9125964010282777, + "step": 9230, + "train/total_loss": 0.23044736683368683 + }, + { + "entropy": 8.77785587310791, + "epoch": 0.9126952738777931, + "mean_token_accuracy": 0.7530266046524048, + "num_tokens": 27244481.0, + "step": 9231, + "train/ce_loss": 0.694959282875061 + }, + { + "epoch": 0.9126952738777931, + "step": 9231, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9126952738777931, + "step": 9231, + "train/total_loss": 0.08902718126773834 + }, + { + "entropy": 8.481739044189453, + "epoch": 0.9127941467273086, + "mean_token_accuracy": 0.722806990146637, + "num_tokens": 27249811.0, + "step": 9232, + "train/ce_loss": 0.933229386806488 + }, + { + "epoch": 0.9127941467273086, + "step": 9232, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.9127941467273086, + "step": 9232, + "train/total_loss": 0.15191668272018433 + }, + { + "entropy": 8.568881034851074, + "epoch": 0.9128930195768242, + "mean_token_accuracy": 0.7235932946205139, + "num_tokens": 27255309.0, + "step": 9233, + "train/ce_loss": 0.6516080498695374 + }, + { + "epoch": 0.9128930195768242, + "step": 9233, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9128930195768242, + "step": 9233, + "train/total_loss": 0.09641080349683762 + }, + { + "entropy": 9.117914199829102, + "epoch": 0.9129918924263397, + "mean_token_accuracy": 0.8020231127738953, + "num_tokens": 27260464.0, + "step": 9234, + "train/ce_loss": 0.9868393540382385 + }, + { + "epoch": 0.9129918924263397, + "step": 9234, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9129918924263397, + "step": 9234, + "train/total_loss": 0.1299339383840561 + }, + { + "entropy": 8.817948341369629, + "epoch": 0.9130907652758552, + "mean_token_accuracy": 0.7510040402412415, + "num_tokens": 27265683.0, + "step": 9235, + "train/ce_loss": 0.8873830437660217 + }, + { + "epoch": 0.9130907652758552, + "step": 9235, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9130907652758552, + "step": 9235, + "train/total_loss": 0.1317070573568344 + }, + { + "entropy": 8.456457138061523, + "epoch": 0.9131896381253708, + "mean_token_accuracy": 0.7397563457489014, + "num_tokens": 27271089.0, + "step": 9236, + "train/ce_loss": 0.7246193885803223 + }, + { + "epoch": 0.9131896381253708, + "step": 9236, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9131896381253708, + "step": 9236, + "train/total_loss": 0.11933694034814835 + }, + { + "entropy": 8.499395370483398, + "epoch": 0.9132885109748863, + "mean_token_accuracy": 0.7075055241584778, + "num_tokens": 27276488.0, + "step": 9237, + "train/ce_loss": 0.5785139799118042 + }, + { + "epoch": 0.9132885109748863, + "step": 9237, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9132885109748863, + "step": 9237, + "train/total_loss": 0.1047264039516449 + }, + { + "entropy": 8.961029052734375, + "epoch": 0.9133873838244018, + "mean_token_accuracy": 0.6569620370864868, + "num_tokens": 27281784.0, + "step": 9238, + "train/ce_loss": 0.47236815094947815 + }, + { + "epoch": 0.9133873838244018, + "step": 9238, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9133873838244018, + "step": 9238, + "train/total_loss": 0.09411181509494781 + }, + { + "entropy": 8.695600509643555, + "epoch": 0.9134862566739174, + "mean_token_accuracy": 0.7176339030265808, + "num_tokens": 27287120.0, + "step": 9239, + "train/ce_loss": 0.7086272835731506 + }, + { + "epoch": 0.9134862566739174, + "step": 9239, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.9134862566739174, + "step": 9239, + "train/total_loss": 0.10601898282766342 + }, + { + "epoch": 0.9135851295234328, + "grad_norm": 0.606782853603363, + "learning_rate": 7.718192157444494e-06, + "loss": 0.1405, + "step": 9240 + }, + { + "entropy": 8.963159561157227, + "epoch": 0.9135851295234328, + "mean_token_accuracy": 0.771324872970581, + "num_tokens": 27292104.0, + "step": 9240, + "train/ce_loss": 1.5950840711593628 + }, + { + "epoch": 0.9135851295234328, + "step": 9240, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9135851295234328, + "step": 9240, + "train/total_loss": 0.20247715711593628 + }, + { + "entropy": 8.75002670288086, + "epoch": 0.9136840023729483, + "mean_token_accuracy": 0.78812575340271, + "num_tokens": 27297428.0, + "step": 9241, + "train/ce_loss": 0.3548147976398468 + }, + { + "epoch": 0.9136840023729483, + "step": 9241, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9136840023729483, + "step": 9241, + "train/total_loss": 0.05110647901892662 + }, + { + "entropy": 8.583799362182617, + "epoch": 0.9137828752224639, + "mean_token_accuracy": 0.7675977945327759, + "num_tokens": 27302770.0, + "step": 9242, + "train/ce_loss": 0.4433761537075043 + }, + { + "epoch": 0.9137828752224639, + "step": 9242, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9137828752224639, + "step": 9242, + "train/total_loss": 0.05996261537075043 + }, + { + "entropy": 8.733439445495605, + "epoch": 0.9138817480719794, + "mean_token_accuracy": 0.7619718313217163, + "num_tokens": 27307975.0, + "step": 9243, + "train/ce_loss": 0.8882772922515869 + }, + { + "epoch": 0.9138817480719794, + "step": 9243, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9138817480719794, + "step": 9243, + "train/total_loss": 0.11617147922515869 + }, + { + "entropy": 8.964921951293945, + "epoch": 0.913980620921495, + "mean_token_accuracy": 0.7971576452255249, + "num_tokens": 27313209.0, + "step": 9244, + "train/ce_loss": 0.5068889856338501 + }, + { + "epoch": 0.913980620921495, + "step": 9244, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.913980620921495, + "step": 9244, + "train/total_loss": 0.07412640005350113 + }, + { + "entropy": 8.99893569946289, + "epoch": 0.9140794937710105, + "mean_token_accuracy": 0.7366071343421936, + "num_tokens": 27318395.0, + "step": 9245, + "train/ce_loss": 0.6802981495857239 + }, + { + "epoch": 0.9140794937710105, + "step": 9245, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9140794937710105, + "step": 9245, + "train/total_loss": 0.10709231346845627 + }, + { + "entropy": 8.335893630981445, + "epoch": 0.914178366620526, + "mean_token_accuracy": 0.7097457647323608, + "num_tokens": 27323782.0, + "step": 9246, + "train/ce_loss": 1.2256686687469482 + }, + { + "epoch": 0.914178366620526, + "step": 9246, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.914178366620526, + "step": 9246, + "train/total_loss": 0.16944187879562378 + }, + { + "entropy": 8.798724174499512, + "epoch": 0.9142772394700416, + "mean_token_accuracy": 0.7006451487541199, + "num_tokens": 27328988.0, + "step": 9247, + "train/ce_loss": 3.9869163970251975e-07 + }, + { + "epoch": 0.9142772394700416, + "step": 9247, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9142772394700416, + "step": 9247, + "train/total_loss": 0.03125004097819328 + }, + { + "entropy": 8.986804008483887, + "epoch": 0.9143761123195571, + "mean_token_accuracy": 0.7445008754730225, + "num_tokens": 27334065.0, + "step": 9248, + "train/ce_loss": 1.5509508848190308 + }, + { + "epoch": 0.9143761123195571, + "step": 9248, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.9143761123195571, + "step": 9248, + "train/total_loss": 0.28790760040283203 + }, + { + "entropy": 8.465126037597656, + "epoch": 0.9144749851690726, + "mean_token_accuracy": 0.7416666746139526, + "num_tokens": 27339382.0, + "step": 9249, + "train/ce_loss": 1.0632635354995728 + }, + { + "epoch": 0.9144749851690726, + "step": 9249, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.9144749851690726, + "step": 9249, + "train/total_loss": 0.18835760653018951 + }, + { + "entropy": 8.786970138549805, + "epoch": 0.9145738580185881, + "mean_token_accuracy": 0.7538461685180664, + "num_tokens": 27344615.0, + "step": 9250, + "train/ce_loss": 8.582583745919692e-07 + }, + { + "epoch": 0.9145738580185881, + "step": 9250, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9145738580185881, + "step": 9250, + "train/total_loss": 0.039062585681676865 + }, + { + "entropy": 9.150355339050293, + "epoch": 0.9146727308681036, + "mean_token_accuracy": 0.6981481313705444, + "num_tokens": 27349612.0, + "step": 9251, + "train/ce_loss": 0.901170551776886 + }, + { + "epoch": 0.9146727308681036, + "step": 9251, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9146727308681036, + "step": 9251, + "train/total_loss": 0.12136705964803696 + }, + { + "entropy": 8.961719512939453, + "epoch": 0.9147716037176191, + "mean_token_accuracy": 0.7664121985435486, + "num_tokens": 27354652.0, + "step": 9252, + "train/ce_loss": 1.3180468082427979 + }, + { + "epoch": 0.9147716037176191, + "step": 9252, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.9147716037176191, + "step": 9252, + "train/total_loss": 0.2021171897649765 + }, + { + "entropy": 9.69161605834961, + "epoch": 0.9148704765671347, + "mean_token_accuracy": 0.7443820238113403, + "num_tokens": 27359403.0, + "step": 9253, + "train/ce_loss": 3.000313597567583e-07 + }, + { + "epoch": 0.9148704765671347, + "step": 9253, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9148704765671347, + "step": 9253, + "train/total_loss": 0.015625029802322388 + }, + { + "entropy": 8.836685180664062, + "epoch": 0.9149693494166502, + "mean_token_accuracy": 0.7243107557296753, + "num_tokens": 27364834.0, + "step": 9254, + "train/ce_loss": 0.7340941429138184 + }, + { + "epoch": 0.9149693494166502, + "step": 9254, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9149693494166502, + "step": 9254, + "train/total_loss": 0.12419066578149796 + }, + { + "entropy": 8.632948875427246, + "epoch": 0.9150682222661657, + "mean_token_accuracy": 0.730681836605072, + "num_tokens": 27370163.0, + "step": 9255, + "train/ce_loss": 1.0817639827728271 + }, + { + "epoch": 0.9150682222661657, + "step": 9255, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9150682222661657, + "step": 9255, + "train/total_loss": 0.14723891019821167 + }, + { + "entropy": 8.910545349121094, + "epoch": 0.9151670951156813, + "mean_token_accuracy": 0.8300395011901855, + "num_tokens": 27375394.0, + "step": 9256, + "train/ce_loss": 9.63684556154476e-07 + }, + { + "epoch": 0.9151670951156813, + "step": 9256, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9151670951156813, + "step": 9256, + "train/total_loss": 0.04687509685754776 + }, + { + "entropy": 9.36585521697998, + "epoch": 0.9152659679651968, + "mean_token_accuracy": 0.7733089327812195, + "num_tokens": 27380386.0, + "step": 9257, + "train/ce_loss": 1.8916507826816087e-07 + }, + { + "epoch": 0.9152659679651968, + "step": 9257, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9152659679651968, + "step": 9257, + "train/total_loss": 0.015625018626451492 + }, + { + "entropy": 9.008048057556152, + "epoch": 0.9153648408147123, + "mean_token_accuracy": 0.727544903755188, + "num_tokens": 27385501.0, + "step": 9258, + "train/ce_loss": 1.2201671600341797 + }, + { + "epoch": 0.9153648408147123, + "step": 9258, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.9153648408147123, + "step": 9258, + "train/total_loss": 0.19623547792434692 + }, + { + "entropy": 8.924967765808105, + "epoch": 0.9154637136642279, + "mean_token_accuracy": 0.75, + "num_tokens": 27390665.0, + "step": 9259, + "train/ce_loss": 0.3701741695404053 + }, + { + "epoch": 0.9154637136642279, + "step": 9259, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9154637136642279, + "step": 9259, + "train/total_loss": 0.07607991993427277 + }, + { + "epoch": 0.9155625865137433, + "grad_norm": 0.612991452217102, + "learning_rate": 7.713247292686546e-06, + "loss": 0.1263, + "step": 9260 + }, + { + "entropy": 8.833109855651855, + "epoch": 0.9155625865137433, + "mean_token_accuracy": 0.7330677509307861, + "num_tokens": 27395949.0, + "step": 9260, + "train/ce_loss": 0.6401509642601013 + }, + { + "epoch": 0.9155625865137433, + "step": 9260, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9155625865137433, + "step": 9260, + "train/total_loss": 0.09135884791612625 + }, + { + "entropy": 9.314435958862305, + "epoch": 0.9156614593632588, + "mean_token_accuracy": 0.7307692170143127, + "num_tokens": 27400866.0, + "step": 9261, + "train/ce_loss": 2.0196523666381836 + }, + { + "epoch": 0.9156614593632588, + "step": 9261, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9156614593632588, + "step": 9261, + "train/total_loss": 0.24493399262428284 + }, + { + "entropy": 8.92432975769043, + "epoch": 0.9157603322127744, + "mean_token_accuracy": 0.7299270033836365, + "num_tokens": 27406057.0, + "step": 9262, + "train/ce_loss": 1.7605401277542114 + }, + { + "epoch": 0.9157603322127744, + "step": 9262, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.9157603322127744, + "step": 9262, + "train/total_loss": 0.2580852508544922 + }, + { + "entropy": 9.119075775146484, + "epoch": 0.9158592050622899, + "mean_token_accuracy": 0.8072100281715393, + "num_tokens": 27411098.0, + "step": 9263, + "train/ce_loss": 0.87204909324646 + }, + { + "epoch": 0.9158592050622899, + "step": 9263, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.9158592050622899, + "step": 9263, + "train/total_loss": 0.09892366081476212 + }, + { + "entropy": 8.559356689453125, + "epoch": 0.9159580779118054, + "mean_token_accuracy": 0.6959459185600281, + "num_tokens": 27416476.0, + "step": 9264, + "train/ce_loss": 0.5877631306648254 + }, + { + "epoch": 0.9159580779118054, + "step": 9264, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9159580779118054, + "step": 9264, + "train/total_loss": 0.08612006902694702 + }, + { + "entropy": 8.968629837036133, + "epoch": 0.916056950761321, + "mean_token_accuracy": 0.7583732008934021, + "num_tokens": 27421731.0, + "step": 9265, + "train/ce_loss": 1.246213674545288 + }, + { + "epoch": 0.916056950761321, + "step": 9265, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.916056950761321, + "step": 9265, + "train/total_loss": 0.14024636149406433 + }, + { + "entropy": 9.342061042785645, + "epoch": 0.9161558236108365, + "mean_token_accuracy": 0.7292817831039429, + "num_tokens": 27426712.0, + "step": 9266, + "train/ce_loss": 1.7498931884765625 + }, + { + "epoch": 0.9161558236108365, + "step": 9266, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9161558236108365, + "step": 9266, + "train/total_loss": 0.20233307778835297 + }, + { + "entropy": 9.135245323181152, + "epoch": 0.916254696460352, + "mean_token_accuracy": 0.8283828496932983, + "num_tokens": 27431751.0, + "step": 9267, + "train/ce_loss": 3.4464170539649785e-07 + }, + { + "epoch": 0.916254696460352, + "step": 9267, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.916254696460352, + "step": 9267, + "train/total_loss": 0.054687533527612686 + }, + { + "entropy": 8.552834510803223, + "epoch": 0.9163535693098676, + "mean_token_accuracy": 0.7318059206008911, + "num_tokens": 27436990.0, + "step": 9268, + "train/ce_loss": 1.207385778427124 + }, + { + "epoch": 0.9163535693098676, + "step": 9268, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.9163535693098676, + "step": 9268, + "train/total_loss": 0.19495733082294464 + }, + { + "entropy": 8.659819602966309, + "epoch": 0.916452442159383, + "mean_token_accuracy": 0.7121387124061584, + "num_tokens": 27442306.0, + "step": 9269, + "train/ce_loss": 0.9533243179321289 + }, + { + "epoch": 0.916452442159383, + "step": 9269, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.916452442159383, + "step": 9269, + "train/total_loss": 0.12658244371414185 + }, + { + "entropy": 8.757511138916016, + "epoch": 0.9165513150088985, + "mean_token_accuracy": 0.701298713684082, + "num_tokens": 27447400.0, + "step": 9270, + "train/ce_loss": 7.575519589408941e-07 + }, + { + "epoch": 0.9165513150088985, + "step": 9270, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9165513150088985, + "step": 9270, + "train/total_loss": 0.05078132450580597 + }, + { + "entropy": 8.658183097839355, + "epoch": 0.9166501878584141, + "mean_token_accuracy": 0.70652174949646, + "num_tokens": 27452557.0, + "step": 9271, + "train/ce_loss": 0.825967013835907 + }, + { + "epoch": 0.9166501878584141, + "step": 9271, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.9166501878584141, + "step": 9271, + "train/total_loss": 0.14900295436382294 + }, + { + "entropy": 9.13985538482666, + "epoch": 0.9167490607079296, + "mean_token_accuracy": 0.7724770903587341, + "num_tokens": 27457613.0, + "step": 9272, + "train/ce_loss": 0.9535839557647705 + }, + { + "epoch": 0.9167490607079296, + "step": 9272, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.9167490607079296, + "step": 9272, + "train/total_loss": 0.10707714408636093 + }, + { + "entropy": 9.225017547607422, + "epoch": 0.9168479335574451, + "mean_token_accuracy": 0.73758864402771, + "num_tokens": 27462630.0, + "step": 9273, + "train/ce_loss": 0.764447033405304 + }, + { + "epoch": 0.9168479335574451, + "step": 9273, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9168479335574451, + "step": 9273, + "train/total_loss": 0.10769470781087875 + }, + { + "entropy": 8.432116508483887, + "epoch": 0.9169468064069607, + "mean_token_accuracy": 0.7677624821662903, + "num_tokens": 27467996.0, + "step": 9274, + "train/ce_loss": 0.9285256862640381 + }, + { + "epoch": 0.9169468064069607, + "step": 9274, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9169468064069607, + "step": 9274, + "train/total_loss": 0.12410257011651993 + }, + { + "entropy": 8.640363693237305, + "epoch": 0.9170456792564762, + "mean_token_accuracy": 0.7194994688034058, + "num_tokens": 27473427.0, + "step": 9275, + "train/ce_loss": 0.7433151602745056 + }, + { + "epoch": 0.9170456792564762, + "step": 9275, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9170456792564762, + "step": 9275, + "train/total_loss": 0.12120651453733444 + }, + { + "entropy": 10.058313369750977, + "epoch": 0.9171445521059917, + "mean_token_accuracy": 0.8109452724456787, + "num_tokens": 27477965.0, + "step": 9276, + "train/ce_loss": 1.6321387192874681e-06 + }, + { + "epoch": 0.9171445521059917, + "step": 9276, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9171445521059917, + "step": 9276, + "train/total_loss": 0.05078141391277313 + }, + { + "entropy": 8.52928352355957, + "epoch": 0.9172434249555073, + "mean_token_accuracy": 0.7412935495376587, + "num_tokens": 27483468.0, + "step": 9277, + "train/ce_loss": 0.7360466718673706 + }, + { + "epoch": 0.9172434249555073, + "step": 9277, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.9172434249555073, + "step": 9277, + "train/total_loss": 0.10876091569662094 + }, + { + "entropy": 9.13144302368164, + "epoch": 0.9173422978050227, + "mean_token_accuracy": 0.75, + "num_tokens": 27488506.0, + "step": 9278, + "train/ce_loss": 1.0652343034744263 + }, + { + "epoch": 0.9173422978050227, + "step": 9278, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9173422978050227, + "step": 9278, + "train/total_loss": 0.14558592438697815 + }, + { + "entropy": 8.72749137878418, + "epoch": 0.9174411706545382, + "mean_token_accuracy": 0.7104247212409973, + "num_tokens": 27493705.0, + "step": 9279, + "train/ce_loss": 1.1955887079238892 + }, + { + "epoch": 0.9174411706545382, + "step": 9279, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9174411706545382, + "step": 9279, + "train/total_loss": 0.16252762079238892 + }, + { + "epoch": 0.9175400435040538, + "grad_norm": 0.5956372618675232, + "learning_rate": 7.708302427928596e-06, + "loss": 0.1313, + "step": 9280 + }, + { + "entropy": 9.752147674560547, + "epoch": 0.9175400435040538, + "mean_token_accuracy": 0.7214611768722534, + "num_tokens": 27498331.0, + "step": 9280, + "train/ce_loss": 1.589429871273751e-06 + }, + { + "epoch": 0.9175400435040538, + "step": 9280, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9175400435040538, + "step": 9280, + "train/total_loss": 0.031250160187482834 + }, + { + "entropy": 8.482603073120117, + "epoch": 0.9176389163535693, + "mean_token_accuracy": 0.7118483185768127, + "num_tokens": 27503836.0, + "step": 9281, + "train/ce_loss": 1.0050632953643799 + }, + { + "epoch": 0.9176389163535693, + "step": 9281, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.9176389163535693, + "step": 9281, + "train/total_loss": 0.19035008549690247 + }, + { + "entropy": 9.328857421875, + "epoch": 0.9177377892030848, + "mean_token_accuracy": 0.7199169993400574, + "num_tokens": 27508780.0, + "step": 9282, + "train/ce_loss": 0.933283269405365 + }, + { + "epoch": 0.9177377892030848, + "step": 9282, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9177377892030848, + "step": 9282, + "train/total_loss": 0.1402033269405365 + }, + { + "entropy": 8.806611061096191, + "epoch": 0.9178366620526004, + "mean_token_accuracy": 0.793608546257019, + "num_tokens": 27513994.0, + "step": 9283, + "train/ce_loss": 0.7182518839836121 + }, + { + "epoch": 0.9178366620526004, + "step": 9283, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.9178366620526004, + "step": 9283, + "train/total_loss": 0.15776269137859344 + }, + { + "entropy": 8.265416145324707, + "epoch": 0.9179355349021159, + "mean_token_accuracy": 0.6988210082054138, + "num_tokens": 27519390.0, + "step": 9284, + "train/ce_loss": 0.8868662118911743 + }, + { + "epoch": 0.9179355349021159, + "step": 9284, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9179355349021159, + "step": 9284, + "train/total_loss": 0.13165536522865295 + }, + { + "entropy": 9.252330780029297, + "epoch": 0.9180344077516314, + "mean_token_accuracy": 0.7689655423164368, + "num_tokens": 27524408.0, + "step": 9285, + "train/ce_loss": 2.17863515672434e-07 + }, + { + "epoch": 0.9180344077516314, + "step": 9285, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9180344077516314, + "step": 9285, + "train/total_loss": 0.02343752235174179 + }, + { + "entropy": 8.831356048583984, + "epoch": 0.918133280601147, + "mean_token_accuracy": 0.7590512037277222, + "num_tokens": 27529679.0, + "step": 9286, + "train/ce_loss": 0.5693334341049194 + }, + { + "epoch": 0.918133280601147, + "step": 9286, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.918133280601147, + "step": 9286, + "train/total_loss": 0.08427709341049194 + }, + { + "entropy": 9.128816604614258, + "epoch": 0.9182321534506624, + "mean_token_accuracy": 0.7772194147109985, + "num_tokens": 27534745.0, + "step": 9287, + "train/ce_loss": 0.9913820028305054 + }, + { + "epoch": 0.9182321534506624, + "step": 9287, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.9182321534506624, + "step": 9287, + "train/total_loss": 0.16945070028305054 + }, + { + "entropy": 8.789335250854492, + "epoch": 0.9183310263001779, + "mean_token_accuracy": 0.7612121105194092, + "num_tokens": 27540036.0, + "step": 9288, + "train/ce_loss": 0.47905784845352173 + }, + { + "epoch": 0.9183310263001779, + "step": 9288, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9183310263001779, + "step": 9288, + "train/total_loss": 0.09478078782558441 + }, + { + "entropy": 10.050600051879883, + "epoch": 0.9184298991496935, + "mean_token_accuracy": 0.7607361674308777, + "num_tokens": 27544587.0, + "step": 9289, + "train/ce_loss": 2.6196956634521484 + }, + { + "epoch": 0.9184298991496935, + "step": 9289, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.9184298991496935, + "step": 9289, + "train/total_loss": 0.35962581634521484 + }, + { + "entropy": 8.77641773223877, + "epoch": 0.918528771999209, + "mean_token_accuracy": 0.717277467250824, + "num_tokens": 27549822.0, + "step": 9290, + "train/ce_loss": 1.258701205253601 + }, + { + "epoch": 0.918528771999209, + "step": 9290, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.918528771999209, + "step": 9290, + "train/total_loss": 0.17665137350559235 + }, + { + "entropy": 9.14539909362793, + "epoch": 0.9186276448487245, + "mean_token_accuracy": 0.7543553709983826, + "num_tokens": 27554853.0, + "step": 9291, + "train/ce_loss": 1.2595537900924683 + }, + { + "epoch": 0.9186276448487245, + "step": 9291, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.9186276448487245, + "step": 9291, + "train/total_loss": 0.19626788794994354 + }, + { + "entropy": 8.761069297790527, + "epoch": 0.9187265176982401, + "mean_token_accuracy": 0.7253668904304504, + "num_tokens": 27560241.0, + "step": 9292, + "train/ce_loss": 1.3040637969970703 + }, + { + "epoch": 0.9187265176982401, + "step": 9292, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9187265176982401, + "step": 9292, + "train/total_loss": 0.16946887969970703 + }, + { + "entropy": 8.862565994262695, + "epoch": 0.9188253905477556, + "mean_token_accuracy": 0.7532281279563904, + "num_tokens": 27565362.0, + "step": 9293, + "train/ce_loss": 0.617352306842804 + }, + { + "epoch": 0.9188253905477556, + "step": 9293, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9188253905477556, + "step": 9293, + "train/total_loss": 0.09298522770404816 + }, + { + "entropy": 8.723288536071777, + "epoch": 0.9189242633972711, + "mean_token_accuracy": 0.6952140927314758, + "num_tokens": 27570667.0, + "step": 9294, + "train/ce_loss": 0.7255082130432129 + }, + { + "epoch": 0.9189242633972711, + "step": 9294, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9189242633972711, + "step": 9294, + "train/total_loss": 0.09989457577466965 + }, + { + "entropy": 8.683667182922363, + "epoch": 0.9190231362467867, + "mean_token_accuracy": 0.7905982732772827, + "num_tokens": 27576021.0, + "step": 9295, + "train/ce_loss": 0.7735714316368103 + }, + { + "epoch": 0.9190231362467867, + "step": 9295, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9190231362467867, + "step": 9295, + "train/total_loss": 0.10079464316368103 + }, + { + "entropy": 8.675325393676758, + "epoch": 0.9191220090963022, + "mean_token_accuracy": 0.6952879428863525, + "num_tokens": 27581428.0, + "step": 9296, + "train/ce_loss": 0.9181460738182068 + }, + { + "epoch": 0.9191220090963022, + "step": 9296, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.9191220090963022, + "step": 9296, + "train/total_loss": 0.15431460738182068 + }, + { + "entropy": 9.235428810119629, + "epoch": 0.9192208819458176, + "mean_token_accuracy": 0.7586206793785095, + "num_tokens": 27586409.0, + "step": 9297, + "train/ce_loss": 1.6527061462402344 + }, + { + "epoch": 0.9192208819458176, + "step": 9297, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.9192208819458176, + "step": 9297, + "train/total_loss": 0.2551143765449524 + }, + { + "entropy": 9.978853225708008, + "epoch": 0.9193197547953332, + "mean_token_accuracy": 0.7686567306518555, + "num_tokens": 27591004.0, + "step": 9298, + "train/ce_loss": 1.1262466159678297e-06 + }, + { + "epoch": 0.9193197547953332, + "step": 9298, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9193197547953332, + "step": 9298, + "train/total_loss": 0.054687611758708954 + }, + { + "entropy": 8.849390983581543, + "epoch": 0.9194186276448487, + "mean_token_accuracy": 0.6966292262077332, + "num_tokens": 27596258.0, + "step": 9299, + "train/ce_loss": 1.3365767002105713 + }, + { + "epoch": 0.9194186276448487, + "step": 9299, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.9194186276448487, + "step": 9299, + "train/total_loss": 0.21959517896175385 + }, + { + "epoch": 0.9195175004943642, + "grad_norm": 0.7616487145423889, + "learning_rate": 7.703357563170648e-06, + "loss": 0.1308, + "step": 9300 + }, + { + "entropy": 9.030107498168945, + "epoch": 0.9195175004943642, + "mean_token_accuracy": 0.7249283790588379, + "num_tokens": 27601384.0, + "step": 9300, + "train/ce_loss": 0.00010385373025201261 + }, + { + "epoch": 0.9195175004943642, + "step": 9300, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9195175004943642, + "step": 9300, + "train/total_loss": 0.05469788610935211 + }, + { + "entropy": 8.875327110290527, + "epoch": 0.9196163733438798, + "mean_token_accuracy": 0.7274472117424011, + "num_tokens": 27606329.0, + "step": 9301, + "train/ce_loss": 1.3728803396224976 + }, + { + "epoch": 0.9196163733438798, + "step": 9301, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9196163733438798, + "step": 9301, + "train/total_loss": 0.18025678396224976 + }, + { + "entropy": 8.848821640014648, + "epoch": 0.9197152461933953, + "mean_token_accuracy": 0.7321637272834778, + "num_tokens": 27611646.0, + "step": 9302, + "train/ce_loss": 0.8853605389595032 + }, + { + "epoch": 0.9197152461933953, + "step": 9302, + "train/sim_loss": 0.14453125 + }, + { + "epoch": 0.9197152461933953, + "step": 9302, + "train/total_loss": 0.23306730389595032 + }, + { + "entropy": 8.887819290161133, + "epoch": 0.9198141190429108, + "mean_token_accuracy": 0.8480325937271118, + "num_tokens": 27616848.0, + "step": 9303, + "train/ce_loss": 2.756186461283505e-07 + }, + { + "epoch": 0.9198141190429108, + "step": 9303, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9198141190429108, + "step": 9303, + "train/total_loss": 0.04296877607703209 + }, + { + "entropy": 8.672847747802734, + "epoch": 0.9199129918924264, + "mean_token_accuracy": 0.7010309100151062, + "num_tokens": 27622316.0, + "step": 9304, + "train/ce_loss": 0.9316827058792114 + }, + { + "epoch": 0.9199129918924264, + "step": 9304, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.9199129918924264, + "step": 9304, + "train/total_loss": 0.15957452356815338 + }, + { + "entropy": 8.53351879119873, + "epoch": 0.9200118647419419, + "mean_token_accuracy": 0.7390350699424744, + "num_tokens": 27627718.0, + "step": 9305, + "train/ce_loss": 0.8892126083374023 + }, + { + "epoch": 0.9200118647419419, + "step": 9305, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9200118647419419, + "step": 9305, + "train/total_loss": 0.11626501381397247 + }, + { + "entropy": 9.371720314025879, + "epoch": 0.9201107375914573, + "mean_token_accuracy": 0.7847533822059631, + "num_tokens": 27632609.0, + "step": 9306, + "train/ce_loss": 3.378468704795523e-07 + }, + { + "epoch": 0.9201107375914573, + "step": 9306, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9201107375914573, + "step": 9306, + "train/total_loss": 0.039062533527612686 + }, + { + "entropy": 8.608626365661621, + "epoch": 0.9202096104409729, + "mean_token_accuracy": 0.7905759215354919, + "num_tokens": 27638025.0, + "step": 9307, + "train/ce_loss": 0.4734403192996979 + }, + { + "epoch": 0.9202096104409729, + "step": 9307, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9202096104409729, + "step": 9307, + "train/total_loss": 0.07078152894973755 + }, + { + "entropy": 8.684558868408203, + "epoch": 0.9203084832904884, + "mean_token_accuracy": 0.759036123752594, + "num_tokens": 27643228.0, + "step": 9308, + "train/ce_loss": 1.0258185863494873 + }, + { + "epoch": 0.9203084832904884, + "step": 9308, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9203084832904884, + "step": 9308, + "train/total_loss": 0.13383185863494873 + }, + { + "entropy": 8.951282501220703, + "epoch": 0.9204073561400039, + "mean_token_accuracy": 0.769336998462677, + "num_tokens": 27648405.0, + "step": 9309, + "train/ce_loss": 0.7111250162124634 + }, + { + "epoch": 0.9204073561400039, + "step": 9309, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9204073561400039, + "step": 9309, + "train/total_loss": 0.0906437560915947 + }, + { + "entropy": 9.013503074645996, + "epoch": 0.9205062289895195, + "mean_token_accuracy": 0.7174515128135681, + "num_tokens": 27653765.0, + "step": 9310, + "train/ce_loss": 1.3770428895950317 + }, + { + "epoch": 0.9205062289895195, + "step": 9310, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.9205062289895195, + "step": 9310, + "train/total_loss": 0.2353605479001999 + }, + { + "entropy": 8.536752700805664, + "epoch": 0.920605101839035, + "mean_token_accuracy": 0.760613203048706, + "num_tokens": 27659097.0, + "step": 9311, + "train/ce_loss": 0.510001003742218 + }, + { + "epoch": 0.920605101839035, + "step": 9311, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.920605101839035, + "step": 9311, + "train/total_loss": 0.08225010335445404 + }, + { + "entropy": 8.710546493530273, + "epoch": 0.9207039746885505, + "mean_token_accuracy": 0.7442396283149719, + "num_tokens": 27664485.0, + "step": 9312, + "train/ce_loss": 0.58183354139328 + }, + { + "epoch": 0.9207039746885505, + "step": 9312, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9207039746885505, + "step": 9312, + "train/total_loss": 0.11287085711956024 + }, + { + "entropy": 8.914073944091797, + "epoch": 0.9208028475380661, + "mean_token_accuracy": 0.7561929821968079, + "num_tokens": 27669741.0, + "step": 9313, + "train/ce_loss": 1.2763566970825195 + }, + { + "epoch": 0.9208028475380661, + "step": 9313, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.9208028475380661, + "step": 9313, + "train/total_loss": 0.2018544226884842 + }, + { + "entropy": 8.939849853515625, + "epoch": 0.9209017203875816, + "mean_token_accuracy": 0.6697009205818176, + "num_tokens": 27674962.0, + "step": 9314, + "train/ce_loss": 1.2684932947158813 + }, + { + "epoch": 0.9209017203875816, + "step": 9314, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.9209017203875816, + "step": 9314, + "train/total_loss": 0.20106808841228485 + }, + { + "entropy": 9.103290557861328, + "epoch": 0.921000593237097, + "mean_token_accuracy": 0.7338345646858215, + "num_tokens": 27680084.0, + "step": 9315, + "train/ce_loss": 0.7312984466552734 + }, + { + "epoch": 0.921000593237097, + "step": 9315, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.921000593237097, + "step": 9315, + "train/total_loss": 0.11609859764575958 + }, + { + "entropy": 8.95407485961914, + "epoch": 0.9210994660866126, + "mean_token_accuracy": 0.7327327132225037, + "num_tokens": 27685230.0, + "step": 9316, + "train/ce_loss": 0.873866617679596 + }, + { + "epoch": 0.9210994660866126, + "step": 9316, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.9210994660866126, + "step": 9316, + "train/total_loss": 0.15769916772842407 + }, + { + "entropy": 9.344866752624512, + "epoch": 0.9211983389361281, + "mean_token_accuracy": 0.7724957466125488, + "num_tokens": 27690251.0, + "step": 9317, + "train/ce_loss": 1.6272326774924295e-07 + }, + { + "epoch": 0.9211983389361281, + "step": 9317, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.9211983389361281, + "step": 9317, + "train/total_loss": 0.011718765832483768 + }, + { + "entropy": 8.525527000427246, + "epoch": 0.9212972117856436, + "mean_token_accuracy": 0.7003610134124756, + "num_tokens": 27695584.0, + "step": 9318, + "train/ce_loss": 0.5618838667869568 + }, + { + "epoch": 0.9212972117856436, + "step": 9318, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.9212972117856436, + "step": 9318, + "train/total_loss": 0.11478213965892792 + }, + { + "entropy": 9.122682571411133, + "epoch": 0.9213960846351592, + "mean_token_accuracy": 0.6692759394645691, + "num_tokens": 27700559.0, + "step": 9319, + "train/ce_loss": 2.500817402051325e-07 + }, + { + "epoch": 0.9213960846351592, + "step": 9319, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9213960846351592, + "step": 9319, + "train/total_loss": 0.02734377421438694 + }, + { + "epoch": 0.9214949574846747, + "grad_norm": 0.9534464478492737, + "learning_rate": 7.698412698412699e-06, + "loss": 0.1333, + "step": 9320 + }, + { + "entropy": 8.832344055175781, + "epoch": 0.9214949574846747, + "mean_token_accuracy": 0.7819650173187256, + "num_tokens": 27705765.0, + "step": 9320, + "train/ce_loss": 0.5779180526733398 + }, + { + "epoch": 0.9214949574846747, + "step": 9320, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9214949574846747, + "step": 9320, + "train/total_loss": 0.0890418067574501 + }, + { + "entropy": 9.167750358581543, + "epoch": 0.9215938303341902, + "mean_token_accuracy": 0.7516447305679321, + "num_tokens": 27710813.0, + "step": 9321, + "train/ce_loss": 1.5943471193313599 + }, + { + "epoch": 0.9215938303341902, + "step": 9321, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9215938303341902, + "step": 9321, + "train/total_loss": 0.1867784708738327 + }, + { + "entropy": 8.775861740112305, + "epoch": 0.9216927031837058, + "mean_token_accuracy": 0.7531003355979919, + "num_tokens": 27716209.0, + "step": 9322, + "train/ce_loss": 0.7420780658721924 + }, + { + "epoch": 0.9216927031837058, + "step": 9322, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9216927031837058, + "step": 9322, + "train/total_loss": 0.09373905509710312 + }, + { + "entropy": 9.585281372070312, + "epoch": 0.9217915760332213, + "mean_token_accuracy": 0.7176781296730042, + "num_tokens": 27720975.0, + "step": 9323, + "train/ce_loss": 2.351759672164917 + }, + { + "epoch": 0.9217915760332213, + "step": 9323, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9217915760332213, + "step": 9323, + "train/total_loss": 0.2820509672164917 + }, + { + "entropy": 8.793519973754883, + "epoch": 0.9218904488827367, + "mean_token_accuracy": 0.8018372654914856, + "num_tokens": 27726209.0, + "step": 9324, + "train/ce_loss": 0.6098015904426575 + }, + { + "epoch": 0.9218904488827367, + "step": 9324, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9218904488827367, + "step": 9324, + "train/total_loss": 0.08051140606403351 + }, + { + "entropy": 8.71558952331543, + "epoch": 0.9219893217322523, + "mean_token_accuracy": 0.725784420967102, + "num_tokens": 27731440.0, + "step": 9325, + "train/ce_loss": 0.9672392010688782 + }, + { + "epoch": 0.9219893217322523, + "step": 9325, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9219893217322523, + "step": 9325, + "train/total_loss": 0.12797391414642334 + }, + { + "entropy": 9.141233444213867, + "epoch": 0.9220881945817678, + "mean_token_accuracy": 0.7003105878829956, + "num_tokens": 27736475.0, + "step": 9326, + "train/ce_loss": 1.083046555519104 + }, + { + "epoch": 0.9220881945817678, + "step": 9326, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.9220881945817678, + "step": 9326, + "train/total_loss": 0.19033589959144592 + }, + { + "entropy": 8.981075286865234, + "epoch": 0.9221870674312834, + "mean_token_accuracy": 0.7578125, + "num_tokens": 27741557.0, + "step": 9327, + "train/ce_loss": 1.4328120946884155 + }, + { + "epoch": 0.9221870674312834, + "step": 9327, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.9221870674312834, + "step": 9327, + "train/total_loss": 0.21749995648860931 + }, + { + "entropy": 8.807497024536133, + "epoch": 0.9222859402807989, + "mean_token_accuracy": 0.7117486596107483, + "num_tokens": 27746743.0, + "step": 9328, + "train/ce_loss": 0.545606255531311 + }, + { + "epoch": 0.9222859402807989, + "step": 9328, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9222859402807989, + "step": 9328, + "train/total_loss": 0.10534188151359558 + }, + { + "entropy": 9.165096282958984, + "epoch": 0.9223848131303144, + "mean_token_accuracy": 0.7166324257850647, + "num_tokens": 27751672.0, + "step": 9329, + "train/ce_loss": 1.3440909385681152 + }, + { + "epoch": 0.9223848131303144, + "step": 9329, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9223848131303144, + "step": 9329, + "train/total_loss": 0.165659099817276 + }, + { + "entropy": 9.130973815917969, + "epoch": 0.92248368597983, + "mean_token_accuracy": 0.7927756905555725, + "num_tokens": 27756659.0, + "step": 9330, + "train/ce_loss": 0.8847929835319519 + }, + { + "epoch": 0.92248368597983, + "step": 9330, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.92248368597983, + "step": 9330, + "train/total_loss": 0.13535431027412415 + }, + { + "entropy": 9.225261688232422, + "epoch": 0.9225825588293455, + "mean_token_accuracy": 0.8013244867324829, + "num_tokens": 27761720.0, + "step": 9331, + "train/ce_loss": 1.2016048431396484 + }, + { + "epoch": 0.9225825588293455, + "step": 9331, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9225825588293455, + "step": 9331, + "train/total_loss": 0.14750424027442932 + }, + { + "entropy": 9.045722007751465, + "epoch": 0.922681431678861, + "mean_token_accuracy": 0.7472222447395325, + "num_tokens": 27766929.0, + "step": 9332, + "train/ce_loss": 0.681461751461029 + }, + { + "epoch": 0.922681431678861, + "step": 9332, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.922681431678861, + "step": 9332, + "train/total_loss": 0.12283367663621902 + }, + { + "entropy": 8.748706817626953, + "epoch": 0.9227803045283766, + "mean_token_accuracy": 0.6838790774345398, + "num_tokens": 27772216.0, + "step": 9333, + "train/ce_loss": 0.8040033578872681 + }, + { + "epoch": 0.9227803045283766, + "step": 9333, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9227803045283766, + "step": 9333, + "train/total_loss": 0.12336909025907516 + }, + { + "entropy": 9.276330947875977, + "epoch": 0.922879177377892, + "mean_token_accuracy": 0.6828479170799255, + "num_tokens": 27777279.0, + "step": 9334, + "train/ce_loss": 2.368741007785502e-07 + }, + { + "epoch": 0.922879177377892, + "step": 9334, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.922879177377892, + "step": 9334, + "train/total_loss": 0.01562502421438694 + }, + { + "entropy": 9.032341003417969, + "epoch": 0.9229780502274075, + "mean_token_accuracy": 0.7513736486434937, + "num_tokens": 27782429.0, + "step": 9335, + "train/ce_loss": 0.8243290781974792 + }, + { + "epoch": 0.9229780502274075, + "step": 9335, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9229780502274075, + "step": 9335, + "train/total_loss": 0.10977666079998016 + }, + { + "entropy": 8.510507583618164, + "epoch": 0.9230769230769231, + "mean_token_accuracy": 0.8008130192756653, + "num_tokens": 27787892.0, + "step": 9336, + "train/ce_loss": 0.4929116368293762 + }, + { + "epoch": 0.9230769230769231, + "step": 9336, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9230769230769231, + "step": 9336, + "train/total_loss": 0.07663491368293762 + }, + { + "entropy": 8.87246322631836, + "epoch": 0.9231757959264386, + "mean_token_accuracy": 0.7857961058616638, + "num_tokens": 27793215.0, + "step": 9337, + "train/ce_loss": 0.6730414628982544 + }, + { + "epoch": 0.9231757959264386, + "step": 9337, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.9231757959264386, + "step": 9337, + "train/total_loss": 0.12980414927005768 + }, + { + "entropy": 9.145835876464844, + "epoch": 0.9232746687759541, + "mean_token_accuracy": 0.7697160840034485, + "num_tokens": 27798264.0, + "step": 9338, + "train/ce_loss": 0.790299117565155 + }, + { + "epoch": 0.9232746687759541, + "step": 9338, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.9232746687759541, + "step": 9338, + "train/total_loss": 0.14543616771697998 + }, + { + "entropy": 8.914115905761719, + "epoch": 0.9233735416254697, + "mean_token_accuracy": 0.7496671080589294, + "num_tokens": 27803469.0, + "step": 9339, + "train/ce_loss": 0.8739914298057556 + }, + { + "epoch": 0.9233735416254697, + "step": 9339, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.9233735416254697, + "step": 9339, + "train/total_loss": 0.18505540490150452 + }, + { + "epoch": 0.9234724144749852, + "grad_norm": 0.6647242903709412, + "learning_rate": 7.69346783365475e-06, + "loss": 0.1258, + "step": 9340 + }, + { + "entropy": 8.883416175842285, + "epoch": 0.9234724144749852, + "mean_token_accuracy": 0.7220026254653931, + "num_tokens": 27808710.0, + "step": 9340, + "train/ce_loss": 1.3461934328079224 + }, + { + "epoch": 0.9234724144749852, + "step": 9340, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9234724144749852, + "step": 9340, + "train/total_loss": 0.17758809030056 + }, + { + "entropy": 8.845987319946289, + "epoch": 0.9235712873245007, + "mean_token_accuracy": 0.8050633072853088, + "num_tokens": 27813966.0, + "step": 9341, + "train/ce_loss": 0.6493423581123352 + }, + { + "epoch": 0.9235712873245007, + "step": 9341, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9235712873245007, + "step": 9341, + "train/total_loss": 0.08837173879146576 + }, + { + "entropy": 8.537727355957031, + "epoch": 0.9236701601740163, + "mean_token_accuracy": 0.7448609471321106, + "num_tokens": 27819236.0, + "step": 9342, + "train/ce_loss": 0.8598325252532959 + }, + { + "epoch": 0.9236701601740163, + "step": 9342, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9236701601740163, + "step": 9342, + "train/total_loss": 0.1250457465648651 + }, + { + "entropy": 8.727884292602539, + "epoch": 0.9237690330235317, + "mean_token_accuracy": 0.7356608510017395, + "num_tokens": 27824499.0, + "step": 9343, + "train/ce_loss": 0.755394697189331 + }, + { + "epoch": 0.9237690330235317, + "step": 9343, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9237690330235317, + "step": 9343, + "train/total_loss": 0.1302269697189331 + }, + { + "entropy": 8.807491302490234, + "epoch": 0.9238679058730472, + "mean_token_accuracy": 0.761904776096344, + "num_tokens": 27829734.0, + "step": 9344, + "train/ce_loss": 0.4971083402633667 + }, + { + "epoch": 0.9238679058730472, + "step": 9344, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9238679058730472, + "step": 9344, + "train/total_loss": 0.08096083998680115 + }, + { + "entropy": 8.90900993347168, + "epoch": 0.9239667787225628, + "mean_token_accuracy": 0.7629969716072083, + "num_tokens": 27834842.0, + "step": 9345, + "train/ce_loss": 4.6224951688600413e-07 + }, + { + "epoch": 0.9239667787225628, + "step": 9345, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9239667787225628, + "step": 9345, + "train/total_loss": 0.04296879470348358 + }, + { + "entropy": 8.945581436157227, + "epoch": 0.9240656515720783, + "mean_token_accuracy": 0.636623740196228, + "num_tokens": 27840000.0, + "step": 9346, + "train/ce_loss": 1.802316665649414 + }, + { + "epoch": 0.9240656515720783, + "step": 9346, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9240656515720783, + "step": 9346, + "train/total_loss": 0.23101292550563812 + }, + { + "entropy": 8.90842056274414, + "epoch": 0.9241645244215938, + "mean_token_accuracy": 0.7927536368370056, + "num_tokens": 27845147.0, + "step": 9347, + "train/ce_loss": 1.3514710664749146 + }, + { + "epoch": 0.9241645244215938, + "step": 9347, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.9241645244215938, + "step": 9347, + "train/total_loss": 0.1976471096277237 + }, + { + "entropy": 9.230728149414062, + "epoch": 0.9242633972711094, + "mean_token_accuracy": 0.7515337467193604, + "num_tokens": 27850251.0, + "step": 9348, + "train/ce_loss": 0.6681864857673645 + }, + { + "epoch": 0.9242633972711094, + "step": 9348, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9242633972711094, + "step": 9348, + "train/total_loss": 0.09416239708662033 + }, + { + "entropy": 8.170320510864258, + "epoch": 0.9243622701206249, + "mean_token_accuracy": 0.7357609868049622, + "num_tokens": 27855784.0, + "step": 9349, + "train/ce_loss": 1.053152084350586 + }, + { + "epoch": 0.9243622701206249, + "step": 9349, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.9243622701206249, + "step": 9349, + "train/total_loss": 0.1951589584350586 + }, + { + "entropy": 8.387100219726562, + "epoch": 0.9244611429701404, + "mean_token_accuracy": 0.7791411280632019, + "num_tokens": 27861374.0, + "step": 9350, + "train/ce_loss": 0.7524048089981079 + }, + { + "epoch": 0.9244611429701404, + "step": 9350, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.9244611429701404, + "step": 9350, + "train/total_loss": 0.15336549282073975 + }, + { + "entropy": 8.531319618225098, + "epoch": 0.924560015819656, + "mean_token_accuracy": 0.7187127470970154, + "num_tokens": 27866691.0, + "step": 9351, + "train/ce_loss": 1.0648727416992188 + }, + { + "epoch": 0.924560015819656, + "step": 9351, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.924560015819656, + "step": 9351, + "train/total_loss": 0.16117477416992188 + }, + { + "entropy": 8.408615112304688, + "epoch": 0.9246588886691715, + "mean_token_accuracy": 0.7593184113502502, + "num_tokens": 27872086.0, + "step": 9352, + "train/ce_loss": 0.732154905796051 + }, + { + "epoch": 0.9246588886691715, + "step": 9352, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9246588886691715, + "step": 9352, + "train/total_loss": 0.12399674206972122 + }, + { + "entropy": 8.75208854675293, + "epoch": 0.9247577615186869, + "mean_token_accuracy": 0.7730769515037537, + "num_tokens": 27877323.0, + "step": 9353, + "train/ce_loss": 1.2357757091522217 + }, + { + "epoch": 0.9247577615186869, + "step": 9353, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9247577615186869, + "step": 9353, + "train/total_loss": 0.1704525649547577 + }, + { + "entropy": 8.690408706665039, + "epoch": 0.9248566343682025, + "mean_token_accuracy": 0.6970338821411133, + "num_tokens": 27882606.0, + "step": 9354, + "train/ce_loss": 0.5237799286842346 + }, + { + "epoch": 0.9248566343682025, + "step": 9354, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.9248566343682025, + "step": 9354, + "train/total_loss": 0.11487799882888794 + }, + { + "entropy": 8.586967468261719, + "epoch": 0.924955507217718, + "mean_token_accuracy": 0.7449495196342468, + "num_tokens": 27887896.0, + "step": 9355, + "train/ce_loss": 0.8277714252471924 + }, + { + "epoch": 0.924955507217718, + "step": 9355, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.924955507217718, + "step": 9355, + "train/total_loss": 0.14527714252471924 + }, + { + "entropy": 8.509919166564941, + "epoch": 0.9250543800672335, + "mean_token_accuracy": 0.7360350489616394, + "num_tokens": 27893289.0, + "step": 9356, + "train/ce_loss": 0.7717025876045227 + }, + { + "epoch": 0.9250543800672335, + "step": 9356, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9250543800672335, + "step": 9356, + "train/total_loss": 0.10842026025056839 + }, + { + "entropy": 8.69703483581543, + "epoch": 0.9251532529167491, + "mean_token_accuracy": 0.7157190442085266, + "num_tokens": 27898670.0, + "step": 9357, + "train/ce_loss": 1.3000333309173584 + }, + { + "epoch": 0.9251532529167491, + "step": 9357, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.9251532529167491, + "step": 9357, + "train/total_loss": 0.19250333309173584 + }, + { + "entropy": 8.952316284179688, + "epoch": 0.9252521257662646, + "mean_token_accuracy": 0.8059490323066711, + "num_tokens": 27903898.0, + "step": 9358, + "train/ce_loss": 0.3204611837863922 + }, + { + "epoch": 0.9252521257662646, + "step": 9358, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9252521257662646, + "step": 9358, + "train/total_loss": 0.0515773706138134 + }, + { + "entropy": 9.340235710144043, + "epoch": 0.9253509986157801, + "mean_token_accuracy": 0.8120915293693542, + "num_tokens": 27908961.0, + "step": 9359, + "train/ce_loss": 0.7203558683395386 + }, + { + "epoch": 0.9253509986157801, + "step": 9359, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9253509986157801, + "step": 9359, + "train/total_loss": 0.09156683832406998 + }, + { + "epoch": 0.9254498714652957, + "grad_norm": 0.6123246550559998, + "learning_rate": 7.688522968896802e-06, + "loss": 0.1322, + "step": 9360 + }, + { + "entropy": 8.673837661743164, + "epoch": 0.9254498714652957, + "mean_token_accuracy": 0.7361446022987366, + "num_tokens": 27914459.0, + "step": 9360, + "train/ce_loss": 1.0571768283843994 + }, + { + "epoch": 0.9254498714652957, + "step": 9360, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9254498714652957, + "step": 9360, + "train/total_loss": 0.13306143879890442 + }, + { + "entropy": 8.64249038696289, + "epoch": 0.9255487443148112, + "mean_token_accuracy": 0.7620252966880798, + "num_tokens": 27919767.0, + "step": 9361, + "train/ce_loss": 0.5044848918914795 + }, + { + "epoch": 0.9255487443148112, + "step": 9361, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.9255487443148112, + "step": 9361, + "train/total_loss": 0.10904224216938019 + }, + { + "entropy": 8.920316696166992, + "epoch": 0.9256476171643266, + "mean_token_accuracy": 0.7136498689651489, + "num_tokens": 27924860.0, + "step": 9362, + "train/ce_loss": 3.5302545597915014e-07 + }, + { + "epoch": 0.9256476171643266, + "step": 9362, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9256476171643266, + "step": 9362, + "train/total_loss": 0.039062533527612686 + }, + { + "entropy": 9.195442199707031, + "epoch": 0.9257464900138422, + "mean_token_accuracy": 0.7482993006706238, + "num_tokens": 27929869.0, + "step": 9363, + "train/ce_loss": 0.5305820107460022 + }, + { + "epoch": 0.9257464900138422, + "step": 9363, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9257464900138422, + "step": 9363, + "train/total_loss": 0.0804019570350647 + }, + { + "entropy": 9.042174339294434, + "epoch": 0.9258453628633577, + "mean_token_accuracy": 0.776562511920929, + "num_tokens": 27934975.0, + "step": 9364, + "train/ce_loss": 0.9903772473335266 + }, + { + "epoch": 0.9258453628633577, + "step": 9364, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9258453628633577, + "step": 9364, + "train/total_loss": 0.12638148665428162 + }, + { + "entropy": 8.419917106628418, + "epoch": 0.9259442357128732, + "mean_token_accuracy": 0.7669452428817749, + "num_tokens": 27940552.0, + "step": 9365, + "train/ce_loss": 0.4101727306842804 + }, + { + "epoch": 0.9259442357128732, + "step": 9365, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9259442357128732, + "step": 9365, + "train/total_loss": 0.05664227530360222 + }, + { + "entropy": 8.653945922851562, + "epoch": 0.9260431085623888, + "mean_token_accuracy": 0.7473053932189941, + "num_tokens": 27945920.0, + "step": 9366, + "train/ce_loss": 0.6925281286239624 + }, + { + "epoch": 0.9260431085623888, + "step": 9366, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.9260431085623888, + "step": 9366, + "train/total_loss": 0.12784656882286072 + }, + { + "entropy": 9.342437744140625, + "epoch": 0.9261419814119043, + "mean_token_accuracy": 0.8017751574516296, + "num_tokens": 27950700.0, + "step": 9367, + "train/ce_loss": 1.431895498171798e-06 + }, + { + "epoch": 0.9261419814119043, + "step": 9367, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.9261419814119043, + "step": 9367, + "train/total_loss": 0.05859389156103134 + }, + { + "entropy": 8.90158462524414, + "epoch": 0.9262408542614198, + "mean_token_accuracy": 0.7121464014053345, + "num_tokens": 27955770.0, + "step": 9368, + "train/ce_loss": 1.037416696548462 + }, + { + "epoch": 0.9262408542614198, + "step": 9368, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9262408542614198, + "step": 9368, + "train/total_loss": 0.15452292561531067 + }, + { + "entropy": 9.231480598449707, + "epoch": 0.9263397271109354, + "mean_token_accuracy": 0.7996219396591187, + "num_tokens": 27960757.0, + "step": 9369, + "train/ce_loss": 1.3427621126174927 + }, + { + "epoch": 0.9263397271109354, + "step": 9369, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.9263397271109354, + "step": 9369, + "train/total_loss": 0.22802621126174927 + }, + { + "entropy": 9.150675773620605, + "epoch": 0.9264385999604509, + "mean_token_accuracy": 0.6822916865348816, + "num_tokens": 27965964.0, + "step": 9370, + "train/ce_loss": 1.2984963655471802 + }, + { + "epoch": 0.9264385999604509, + "step": 9370, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.9264385999604509, + "step": 9370, + "train/total_loss": 0.2196933925151825 + }, + { + "entropy": 8.527593612670898, + "epoch": 0.9265374728099663, + "mean_token_accuracy": 0.7487179636955261, + "num_tokens": 27971450.0, + "step": 9371, + "train/ce_loss": 0.7690181732177734 + }, + { + "epoch": 0.9265374728099663, + "step": 9371, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9265374728099663, + "step": 9371, + "train/total_loss": 0.10424556583166122 + }, + { + "entropy": 9.013280868530273, + "epoch": 0.9266363456594819, + "mean_token_accuracy": 0.7367668151855469, + "num_tokens": 27976582.0, + "step": 9372, + "train/ce_loss": 0.826474666595459 + }, + { + "epoch": 0.9266363456594819, + "step": 9372, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.9266363456594819, + "step": 9372, + "train/total_loss": 0.19202247262001038 + }, + { + "entropy": 9.376861572265625, + "epoch": 0.9267352185089974, + "mean_token_accuracy": 0.631130039691925, + "num_tokens": 27981476.0, + "step": 9373, + "train/ce_loss": 1.5703624486923218 + }, + { + "epoch": 0.9267352185089974, + "step": 9373, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9267352185089974, + "step": 9373, + "train/total_loss": 0.19609874486923218 + }, + { + "entropy": 9.084300994873047, + "epoch": 0.9268340913585129, + "mean_token_accuracy": 0.7346647381782532, + "num_tokens": 27986643.0, + "step": 9374, + "train/ce_loss": 1.1067943572998047 + }, + { + "epoch": 0.9268340913585129, + "step": 9374, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.9268340913585129, + "step": 9374, + "train/total_loss": 0.18489819765090942 + }, + { + "entropy": 8.769912719726562, + "epoch": 0.9269329642080285, + "mean_token_accuracy": 0.73072749376297, + "num_tokens": 27992034.0, + "step": 9375, + "train/ce_loss": 0.39563554525375366 + }, + { + "epoch": 0.9269329642080285, + "step": 9375, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9269329642080285, + "step": 9375, + "train/total_loss": 0.06690730154514313 + }, + { + "entropy": 8.79995346069336, + "epoch": 0.927031837057544, + "mean_token_accuracy": 0.7369047403335571, + "num_tokens": 27997372.0, + "step": 9376, + "train/ce_loss": 1.4369641542434692 + }, + { + "epoch": 0.927031837057544, + "step": 9376, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.927031837057544, + "step": 9376, + "train/total_loss": 0.23744641244411469 + }, + { + "entropy": 8.828529357910156, + "epoch": 0.9271307099070595, + "mean_token_accuracy": 0.6910377144813538, + "num_tokens": 28002695.0, + "step": 9377, + "train/ce_loss": 1.3177534341812134 + }, + { + "epoch": 0.9271307099070595, + "step": 9377, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9271307099070595, + "step": 9377, + "train/total_loss": 0.17865034937858582 + }, + { + "entropy": 9.035560607910156, + "epoch": 0.9272295827565751, + "mean_token_accuracy": 0.7108792662620544, + "num_tokens": 28007804.0, + "step": 9378, + "train/ce_loss": 1.0736099481582642 + }, + { + "epoch": 0.9272295827565751, + "step": 9378, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9272295827565751, + "step": 9378, + "train/total_loss": 0.13861098885536194 + }, + { + "entropy": 9.269739151000977, + "epoch": 0.9273284556060906, + "mean_token_accuracy": 0.7559523582458496, + "num_tokens": 28012757.0, + "step": 9379, + "train/ce_loss": 3.0842841169942403e-07 + }, + { + "epoch": 0.9273284556060906, + "step": 9379, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9273284556060906, + "step": 9379, + "train/total_loss": 0.023437531664967537 + }, + { + "epoch": 0.927427328455606, + "grad_norm": 0.6609386801719666, + "learning_rate": 7.683578104138852e-06, + "loss": 0.1385, + "step": 9380 + }, + { + "entropy": 9.316780090332031, + "epoch": 0.927427328455606, + "mean_token_accuracy": 0.7272727489471436, + "num_tokens": 28017765.0, + "step": 9380, + "train/ce_loss": 1.2086098194122314 + }, + { + "epoch": 0.927427328455606, + "step": 9380, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.927427328455606, + "step": 9380, + "train/total_loss": 0.2380484938621521 + }, + { + "entropy": 8.851558685302734, + "epoch": 0.9275262013051216, + "mean_token_accuracy": 0.7346938848495483, + "num_tokens": 28022978.0, + "step": 9381, + "train/ce_loss": 0.7256733179092407 + }, + { + "epoch": 0.9275262013051216, + "step": 9381, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9275262013051216, + "step": 9381, + "train/total_loss": 0.09600483626127243 + }, + { + "entropy": 8.732415199279785, + "epoch": 0.9276250741546371, + "mean_token_accuracy": 0.6887417435646057, + "num_tokens": 28028004.0, + "step": 9382, + "train/ce_loss": 1.2488353252410889 + }, + { + "epoch": 0.9276250741546371, + "step": 9382, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9276250741546371, + "step": 9382, + "train/total_loss": 0.1717585325241089 + }, + { + "entropy": 8.534761428833008, + "epoch": 0.9277239470041526, + "mean_token_accuracy": 0.6759545803070068, + "num_tokens": 28033425.0, + "step": 9383, + "train/ce_loss": 0.968550980091095 + }, + { + "epoch": 0.9277239470041526, + "step": 9383, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9277239470041526, + "step": 9383, + "train/total_loss": 0.14373010396957397 + }, + { + "entropy": 8.801989555358887, + "epoch": 0.9278228198536682, + "mean_token_accuracy": 0.824999988079071, + "num_tokens": 28038650.0, + "step": 9384, + "train/ce_loss": 0.741737425327301 + }, + { + "epoch": 0.9278228198536682, + "step": 9384, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9278228198536682, + "step": 9384, + "train/total_loss": 0.10151749104261398 + }, + { + "entropy": 8.70286750793457, + "epoch": 0.9279216927031837, + "mean_token_accuracy": 0.7546961307525635, + "num_tokens": 28043969.0, + "step": 9385, + "train/ce_loss": 0.5636538863182068 + }, + { + "epoch": 0.9279216927031837, + "step": 9385, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9279216927031837, + "step": 9385, + "train/total_loss": 0.07589663565158844 + }, + { + "entropy": 8.636743545532227, + "epoch": 0.9280205655526992, + "mean_token_accuracy": 0.767208993434906, + "num_tokens": 28049265.0, + "step": 9386, + "train/ce_loss": 0.4064827561378479 + }, + { + "epoch": 0.9280205655526992, + "step": 9386, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9280205655526992, + "step": 9386, + "train/total_loss": 0.08752328157424927 + }, + { + "entropy": 8.807705879211426, + "epoch": 0.9281194384022148, + "mean_token_accuracy": 0.7386634945869446, + "num_tokens": 28054567.0, + "step": 9387, + "train/ce_loss": 1.0495035648345947 + }, + { + "epoch": 0.9281194384022148, + "step": 9387, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9281194384022148, + "step": 9387, + "train/total_loss": 0.13229411840438843 + }, + { + "entropy": 9.348652839660645, + "epoch": 0.9282183112517303, + "mean_token_accuracy": 0.7552182078361511, + "num_tokens": 28059544.0, + "step": 9388, + "train/ce_loss": 1.1646846532821655 + }, + { + "epoch": 0.9282183112517303, + "step": 9388, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9282183112517303, + "step": 9388, + "train/total_loss": 0.13599970936775208 + }, + { + "entropy": 9.150012016296387, + "epoch": 0.9283171841012458, + "mean_token_accuracy": 0.6988906264305115, + "num_tokens": 28064639.0, + "step": 9389, + "train/ce_loss": 2.826244838161074e-07 + }, + { + "epoch": 0.9283171841012458, + "step": 9389, + "train/sim_loss": 0.0078125 + }, + { + "epoch": 0.9283171841012458, + "step": 9389, + "train/total_loss": 0.007812527939677238 + }, + { + "entropy": 9.319042205810547, + "epoch": 0.9284160569507613, + "mean_token_accuracy": 0.8239316344261169, + "num_tokens": 28069650.0, + "step": 9390, + "train/ce_loss": 0.6236292719841003 + }, + { + "epoch": 0.9284160569507613, + "step": 9390, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9284160569507613, + "step": 9390, + "train/total_loss": 0.0779879242181778 + }, + { + "entropy": 8.673835754394531, + "epoch": 0.9285149298002768, + "mean_token_accuracy": 0.7916136980056763, + "num_tokens": 28074852.0, + "step": 9391, + "train/ce_loss": 0.8313379287719727 + }, + { + "epoch": 0.9285149298002768, + "step": 9391, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9285149298002768, + "step": 9391, + "train/total_loss": 0.1339150369167328 + }, + { + "entropy": 9.542570114135742, + "epoch": 0.9286138026497923, + "mean_token_accuracy": 0.7303664684295654, + "num_tokens": 28079667.0, + "step": 9392, + "train/ce_loss": 9.037328823069402e-07 + }, + { + "epoch": 0.9286138026497923, + "step": 9392, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9286138026497923, + "step": 9392, + "train/total_loss": 0.03125008940696716 + }, + { + "entropy": 8.512406349182129, + "epoch": 0.9287126754993079, + "mean_token_accuracy": 0.7486457228660583, + "num_tokens": 28085098.0, + "step": 9393, + "train/ce_loss": 0.8508886694908142 + }, + { + "epoch": 0.9287126754993079, + "step": 9393, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.9287126754993079, + "step": 9393, + "train/total_loss": 0.16712012887001038 + }, + { + "entropy": 8.425409317016602, + "epoch": 0.9288115483488234, + "mean_token_accuracy": 0.7757973670959473, + "num_tokens": 28090656.0, + "step": 9394, + "train/ce_loss": 0.9109476208686829 + }, + { + "epoch": 0.9288115483488234, + "step": 9394, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.9288115483488234, + "step": 9394, + "train/total_loss": 0.1614072620868683 + }, + { + "entropy": 8.87820816040039, + "epoch": 0.9289104211983389, + "mean_token_accuracy": 0.8025157451629639, + "num_tokens": 28095867.0, + "step": 9395, + "train/ce_loss": 5.457933411889826e-07 + }, + { + "epoch": 0.9289104211983389, + "step": 9395, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9289104211983389, + "step": 9395, + "train/total_loss": 0.03125005587935448 + }, + { + "entropy": 8.390907287597656, + "epoch": 0.9290092940478545, + "mean_token_accuracy": 0.7643064856529236, + "num_tokens": 28101557.0, + "step": 9396, + "train/ce_loss": 0.4423627257347107 + }, + { + "epoch": 0.9290092940478545, + "step": 9396, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.9290092940478545, + "step": 9396, + "train/total_loss": 0.05595502257347107 + }, + { + "entropy": 8.457071304321289, + "epoch": 0.92910816689737, + "mean_token_accuracy": 0.7335984110832214, + "num_tokens": 28107049.0, + "step": 9397, + "train/ce_loss": 1.928679347038269 + }, + { + "epoch": 0.92910816689737, + "step": 9397, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.92910816689737, + "step": 9397, + "train/total_loss": 0.2788054347038269 + }, + { + "entropy": 8.952810287475586, + "epoch": 0.9292070397468855, + "mean_token_accuracy": 0.7200646996498108, + "num_tokens": 28112238.0, + "step": 9398, + "train/ce_loss": 1.202625036239624 + }, + { + "epoch": 0.9292070397468855, + "step": 9398, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.9292070397468855, + "step": 9398, + "train/total_loss": 0.1827625036239624 + }, + { + "entropy": 8.893738746643066, + "epoch": 0.929305912596401, + "mean_token_accuracy": 0.7468531727790833, + "num_tokens": 28117434.0, + "step": 9399, + "train/ce_loss": 0.9753656983375549 + }, + { + "epoch": 0.929305912596401, + "step": 9399, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.929305912596401, + "step": 9399, + "train/total_loss": 0.18347406387329102 + }, + { + "epoch": 0.9294047854459165, + "grad_norm": 0.6104568243026733, + "learning_rate": 7.678633239380904e-06, + "loss": 0.1266, + "step": 9400 + }, + { + "entropy": 9.174907684326172, + "epoch": 0.9294047854459165, + "mean_token_accuracy": 0.6466974020004272, + "num_tokens": 28122494.0, + "step": 9400, + "train/ce_loss": 1.2414363622665405 + }, + { + "epoch": 0.9294047854459165, + "step": 9400, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9294047854459165, + "step": 9400, + "train/total_loss": 0.17101863026618958 + }, + { + "entropy": 8.795722961425781, + "epoch": 0.929503658295432, + "mean_token_accuracy": 0.7110266089439392, + "num_tokens": 28127718.0, + "step": 9401, + "train/ce_loss": 0.6686466336250305 + }, + { + "epoch": 0.929503658295432, + "step": 9401, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.929503658295432, + "step": 9401, + "train/total_loss": 0.13717716932296753 + }, + { + "entropy": 8.616256713867188, + "epoch": 0.9296025311449476, + "mean_token_accuracy": 0.7277277112007141, + "num_tokens": 28133177.0, + "step": 9402, + "train/ce_loss": 0.44011417031288147 + }, + { + "epoch": 0.9296025311449476, + "step": 9402, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9296025311449476, + "step": 9402, + "train/total_loss": 0.07135516405105591 + }, + { + "entropy": 8.786222457885742, + "epoch": 0.9297014039944631, + "mean_token_accuracy": 0.774631917476654, + "num_tokens": 28138520.0, + "step": 9403, + "train/ce_loss": 0.43570011854171753 + }, + { + "epoch": 0.9297014039944631, + "step": 9403, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.9297014039944631, + "step": 9403, + "train/total_loss": 0.05528876185417175 + }, + { + "entropy": 8.698478698730469, + "epoch": 0.9298002768439786, + "mean_token_accuracy": 0.7355072498321533, + "num_tokens": 28143834.0, + "step": 9404, + "train/ce_loss": 1.078285813331604 + }, + { + "epoch": 0.9298002768439786, + "step": 9404, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.9298002768439786, + "step": 9404, + "train/total_loss": 0.16642233729362488 + }, + { + "entropy": 8.438079833984375, + "epoch": 0.9298991496934942, + "mean_token_accuracy": 0.7645788192749023, + "num_tokens": 28149162.0, + "step": 9405, + "train/ce_loss": 0.39540475606918335 + }, + { + "epoch": 0.9298991496934942, + "step": 9405, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.9298991496934942, + "step": 9405, + "train/total_loss": 0.10204047709703445 + }, + { + "entropy": 9.068194389343262, + "epoch": 0.9299980225430097, + "mean_token_accuracy": 0.7638484239578247, + "num_tokens": 28154305.0, + "step": 9406, + "train/ce_loss": 0.7043539881706238 + }, + { + "epoch": 0.9299980225430097, + "step": 9406, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9299980225430097, + "step": 9406, + "train/total_loss": 0.12512290477752686 + }, + { + "entropy": 8.68542766571045, + "epoch": 0.9300968953925252, + "mean_token_accuracy": 0.7389221787452698, + "num_tokens": 28159634.0, + "step": 9407, + "train/ce_loss": 1.0234767198562622 + }, + { + "epoch": 0.9300968953925252, + "step": 9407, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.9300968953925252, + "step": 9407, + "train/total_loss": 0.18437892198562622 + }, + { + "entropy": 9.482927322387695, + "epoch": 0.9301957682420408, + "mean_token_accuracy": 0.7506775259971619, + "num_tokens": 28164412.0, + "step": 9408, + "train/ce_loss": 4.923381311527919e-07 + }, + { + "epoch": 0.9301957682420408, + "step": 9408, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9301957682420408, + "step": 9408, + "train/total_loss": 0.02343754842877388 + }, + { + "entropy": 9.366888046264648, + "epoch": 0.9302946410915562, + "mean_token_accuracy": 0.7749999761581421, + "num_tokens": 28169334.0, + "step": 9409, + "train/ce_loss": 1.7896606922149658 + }, + { + "epoch": 0.9302946410915562, + "step": 9409, + "train/sim_loss": 0.12890625 + }, + { + "epoch": 0.9302946410915562, + "step": 9409, + "train/total_loss": 0.30787232518196106 + }, + { + "entropy": 8.677665710449219, + "epoch": 0.9303935139410718, + "mean_token_accuracy": 0.7423912882804871, + "num_tokens": 28174760.0, + "step": 9410, + "train/ce_loss": 0.6476882100105286 + }, + { + "epoch": 0.9303935139410718, + "step": 9410, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9303935139410718, + "step": 9410, + "train/total_loss": 0.10383132100105286 + }, + { + "entropy": 8.895466804504395, + "epoch": 0.9304923867905873, + "mean_token_accuracy": 0.7579972147941589, + "num_tokens": 28179943.0, + "step": 9411, + "train/ce_loss": 1.3835035562515259 + }, + { + "epoch": 0.9304923867905873, + "step": 9411, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.9304923867905873, + "step": 9411, + "train/total_loss": 0.22428785264492035 + }, + { + "entropy": 9.131093978881836, + "epoch": 0.9305912596401028, + "mean_token_accuracy": 0.7207207083702087, + "num_tokens": 28185245.0, + "step": 9412, + "train/ce_loss": 0.8876646757125854 + }, + { + "epoch": 0.9305912596401028, + "step": 9412, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.9305912596401028, + "step": 9412, + "train/total_loss": 0.15907897055149078 + }, + { + "entropy": 9.342243194580078, + "epoch": 0.9306901324896184, + "mean_token_accuracy": 0.75, + "num_tokens": 28190261.0, + "step": 9413, + "train/ce_loss": 0.8801150321960449 + }, + { + "epoch": 0.9306901324896184, + "step": 9413, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9306901324896184, + "step": 9413, + "train/total_loss": 0.1426990032196045 + }, + { + "entropy": 8.698667526245117, + "epoch": 0.9307890053391339, + "mean_token_accuracy": 0.7549019455909729, + "num_tokens": 28195539.0, + "step": 9414, + "train/ce_loss": 1.0844625234603882 + }, + { + "epoch": 0.9307890053391339, + "step": 9414, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9307890053391339, + "step": 9414, + "train/total_loss": 0.15532125532627106 + }, + { + "entropy": 8.73432445526123, + "epoch": 0.9308878781886494, + "mean_token_accuracy": 0.756898820400238, + "num_tokens": 28200778.0, + "step": 9415, + "train/ce_loss": 0.7432786226272583 + }, + { + "epoch": 0.9308878781886494, + "step": 9415, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9308878781886494, + "step": 9415, + "train/total_loss": 0.12901535630226135 + }, + { + "entropy": 8.85866928100586, + "epoch": 0.930986751038165, + "mean_token_accuracy": 0.6936936974525452, + "num_tokens": 28205941.0, + "step": 9416, + "train/ce_loss": 2.242464065551758 + }, + { + "epoch": 0.930986751038165, + "step": 9416, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.930986751038165, + "step": 9416, + "train/total_loss": 0.29846516251564026 + }, + { + "entropy": 8.43039321899414, + "epoch": 0.9310856238876805, + "mean_token_accuracy": 0.7508090734481812, + "num_tokens": 28211377.0, + "step": 9417, + "train/ce_loss": 0.8832802176475525 + }, + { + "epoch": 0.9310856238876805, + "step": 9417, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.9310856238876805, + "step": 9417, + "train/total_loss": 0.1469217836856842 + }, + { + "entropy": 8.777997970581055, + "epoch": 0.9311844967371959, + "mean_token_accuracy": 0.7008149027824402, + "num_tokens": 28216684.0, + "step": 9418, + "train/ce_loss": 0.7578139305114746 + }, + { + "epoch": 0.9311844967371959, + "step": 9418, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9311844967371959, + "step": 9418, + "train/total_loss": 0.12265639752149582 + }, + { + "entropy": 9.463607788085938, + "epoch": 0.9312833695867115, + "mean_token_accuracy": 0.7788461446762085, + "num_tokens": 28221485.0, + "step": 9419, + "train/ce_loss": 1.238187313079834 + }, + { + "epoch": 0.9312833695867115, + "step": 9419, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9312833695867115, + "step": 9419, + "train/total_loss": 0.16678747534751892 + }, + { + "epoch": 0.931382242436227, + "grad_norm": 0.8246861100196838, + "learning_rate": 7.673688374622955e-06, + "loss": 0.1339, + "step": 9420 + }, + { + "entropy": 8.770391464233398, + "epoch": 0.931382242436227, + "mean_token_accuracy": 0.7437499761581421, + "num_tokens": 28226591.0, + "step": 9420, + "train/ce_loss": 1.1773754358291626 + }, + { + "epoch": 0.931382242436227, + "step": 9420, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.931382242436227, + "step": 9420, + "train/total_loss": 0.2349250465631485 + }, + { + "entropy": 8.86601448059082, + "epoch": 0.9314811152857425, + "mean_token_accuracy": 0.7668097019195557, + "num_tokens": 28231728.0, + "step": 9421, + "train/ce_loss": 0.9965757727622986 + }, + { + "epoch": 0.9314811152857425, + "step": 9421, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9314811152857425, + "step": 9421, + "train/total_loss": 0.1387200802564621 + }, + { + "entropy": 9.105216979980469, + "epoch": 0.9315799881352581, + "mean_token_accuracy": 0.7777777910232544, + "num_tokens": 28236718.0, + "step": 9422, + "train/ce_loss": 1.0376790761947632 + }, + { + "epoch": 0.9315799881352581, + "step": 9422, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.9315799881352581, + "step": 9422, + "train/total_loss": 0.17408040165901184 + }, + { + "entropy": 8.490823745727539, + "epoch": 0.9316788609847736, + "mean_token_accuracy": 0.7954047918319702, + "num_tokens": 28242119.0, + "step": 9423, + "train/ce_loss": 0.6964000463485718 + }, + { + "epoch": 0.9316788609847736, + "step": 9423, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9316788609847736, + "step": 9423, + "train/total_loss": 0.12042125314474106 + }, + { + "entropy": 8.82767105102539, + "epoch": 0.9317777338342891, + "mean_token_accuracy": 0.7571428418159485, + "num_tokens": 28247158.0, + "step": 9424, + "train/ce_loss": 3.042653418106056e-07 + }, + { + "epoch": 0.9317777338342891, + "step": 9424, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9317777338342891, + "step": 9424, + "train/total_loss": 0.03906252980232239 + }, + { + "entropy": 8.905559539794922, + "epoch": 0.9318766066838047, + "mean_token_accuracy": 0.7159383296966553, + "num_tokens": 28252397.0, + "step": 9425, + "train/ce_loss": 1.7327330112457275 + }, + { + "epoch": 0.9318766066838047, + "step": 9425, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.9318766066838047, + "step": 9425, + "train/total_loss": 0.24358581006526947 + }, + { + "entropy": 8.280866622924805, + "epoch": 0.9319754795333202, + "mean_token_accuracy": 0.8251879811286926, + "num_tokens": 28257970.0, + "step": 9426, + "train/ce_loss": 0.32057952880859375 + }, + { + "epoch": 0.9319754795333202, + "step": 9426, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.9319754795333202, + "step": 9426, + "train/total_loss": 0.043776702135801315 + }, + { + "entropy": 8.459920883178711, + "epoch": 0.9320743523828356, + "mean_token_accuracy": 0.7546992301940918, + "num_tokens": 28263529.0, + "step": 9427, + "train/ce_loss": 0.8341599702835083 + }, + { + "epoch": 0.9320743523828356, + "step": 9427, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9320743523828356, + "step": 9427, + "train/total_loss": 0.13419725000858307 + }, + { + "entropy": 8.291421890258789, + "epoch": 0.9321732252323512, + "mean_token_accuracy": 0.7515856027603149, + "num_tokens": 28268893.0, + "step": 9428, + "train/ce_loss": 0.6774831414222717 + }, + { + "epoch": 0.9321732252323512, + "step": 9428, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9321732252323512, + "step": 9428, + "train/total_loss": 0.11462331563234329 + }, + { + "entropy": 8.916852951049805, + "epoch": 0.9322720980818667, + "mean_token_accuracy": 0.7021276354789734, + "num_tokens": 28274092.0, + "step": 9429, + "train/ce_loss": 1.4796149730682373 + }, + { + "epoch": 0.9322720980818667, + "step": 9429, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.9322720980818667, + "step": 9429, + "train/total_loss": 0.22218024730682373 + }, + { + "entropy": 8.927335739135742, + "epoch": 0.9323709709313822, + "mean_token_accuracy": 0.7384615540504456, + "num_tokens": 28279173.0, + "step": 9430, + "train/ce_loss": 5.237801019575272e-07 + }, + { + "epoch": 0.9323709709313822, + "step": 9430, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9323709709313822, + "step": 9430, + "train/total_loss": 0.03906255215406418 + }, + { + "entropy": 8.914237976074219, + "epoch": 0.9324698437808978, + "mean_token_accuracy": 0.7145135402679443, + "num_tokens": 28284245.0, + "step": 9431, + "train/ce_loss": 7.544079494437028e-07 + }, + { + "epoch": 0.9324698437808978, + "step": 9431, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9324698437808978, + "step": 9431, + "train/total_loss": 0.05468757450580597 + }, + { + "entropy": 9.386693954467773, + "epoch": 0.9325687166304133, + "mean_token_accuracy": 0.8069105744361877, + "num_tokens": 28289149.0, + "step": 9432, + "train/ce_loss": 2.0158906011147337e-07 + }, + { + "epoch": 0.9325687166304133, + "step": 9432, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9325687166304133, + "step": 9432, + "train/total_loss": 0.02734377048909664 + }, + { + "entropy": 8.992332458496094, + "epoch": 0.9326675894799288, + "mean_token_accuracy": 0.7102941274642944, + "num_tokens": 28294284.0, + "step": 9433, + "train/ce_loss": 3.898809097790945e-07 + }, + { + "epoch": 0.9326675894799288, + "step": 9433, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9326675894799288, + "step": 9433, + "train/total_loss": 0.039062537252902985 + }, + { + "entropy": 8.570638656616211, + "epoch": 0.9327664623294444, + "mean_token_accuracy": 0.7350119948387146, + "num_tokens": 28299590.0, + "step": 9434, + "train/ce_loss": 1.2431139945983887 + }, + { + "epoch": 0.9327664623294444, + "step": 9434, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9327664623294444, + "step": 9434, + "train/total_loss": 0.1516551524400711 + }, + { + "entropy": 9.032379150390625, + "epoch": 0.9328653351789599, + "mean_token_accuracy": 0.7147335410118103, + "num_tokens": 28304706.0, + "step": 9435, + "train/ce_loss": 1.2679431438446045 + }, + { + "epoch": 0.9328653351789599, + "step": 9435, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9328653351789599, + "step": 9435, + "train/total_loss": 0.18148182332515717 + }, + { + "entropy": 8.528156280517578, + "epoch": 0.9329642080284754, + "mean_token_accuracy": 0.7256729006767273, + "num_tokens": 28310092.0, + "step": 9436, + "train/ce_loss": 1.0612057447433472 + }, + { + "epoch": 0.9329642080284754, + "step": 9436, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9329642080284754, + "step": 9436, + "train/total_loss": 0.16080808639526367 + }, + { + "entropy": 9.329557418823242, + "epoch": 0.933063080877991, + "mean_token_accuracy": 0.7918455004692078, + "num_tokens": 28315002.0, + "step": 9437, + "train/ce_loss": 1.523861289024353 + }, + { + "epoch": 0.933063080877991, + "step": 9437, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.933063080877991, + "step": 9437, + "train/total_loss": 0.2070736289024353 + }, + { + "entropy": 8.956287384033203, + "epoch": 0.9331619537275064, + "mean_token_accuracy": 0.8003876209259033, + "num_tokens": 28319910.0, + "step": 9438, + "train/ce_loss": 6.311527727120847e-07 + }, + { + "epoch": 0.9331619537275064, + "step": 9438, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9331619537275064, + "step": 9438, + "train/total_loss": 0.039062563329935074 + }, + { + "entropy": 9.165496826171875, + "epoch": 0.9332608265770219, + "mean_token_accuracy": 0.6877256035804749, + "num_tokens": 28324890.0, + "step": 9439, + "train/ce_loss": 2.1929569244384766 + }, + { + "epoch": 0.9332608265770219, + "step": 9439, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.9332608265770219, + "step": 9439, + "train/total_loss": 0.3052331805229187 + }, + { + "epoch": 0.9333596994265375, + "grad_norm": 0.7782477736473083, + "learning_rate": 7.668743509865005e-06, + "loss": 0.1292, + "step": 9440 + }, + { + "entropy": 9.423730850219727, + "epoch": 0.9333596994265375, + "mean_token_accuracy": 0.6735632419586182, + "num_tokens": 28329728.0, + "step": 9440, + "train/ce_loss": 6.598913273592188e-07 + }, + { + "epoch": 0.9333596994265375, + "step": 9440, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9333596994265375, + "step": 9440, + "train/total_loss": 0.05468756705522537 + }, + { + "entropy": 8.39457893371582, + "epoch": 0.933458572276053, + "mean_token_accuracy": 0.8268858790397644, + "num_tokens": 28335274.0, + "step": 9441, + "train/ce_loss": 0.6306044459342957 + }, + { + "epoch": 0.933458572276053, + "step": 9441, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.933458572276053, + "step": 9441, + "train/total_loss": 0.1138416975736618 + }, + { + "entropy": 8.52895736694336, + "epoch": 0.9335574451255685, + "mean_token_accuracy": 0.7826552391052246, + "num_tokens": 28340661.0, + "step": 9442, + "train/ce_loss": 0.7003907561302185 + }, + { + "epoch": 0.9335574451255685, + "step": 9442, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9335574451255685, + "step": 9442, + "train/total_loss": 0.11691407859325409 + }, + { + "entropy": 8.414928436279297, + "epoch": 0.9336563179750841, + "mean_token_accuracy": 0.7371188402175903, + "num_tokens": 28346164.0, + "step": 9443, + "train/ce_loss": 1.0276933908462524 + }, + { + "epoch": 0.9336563179750841, + "step": 9443, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9336563179750841, + "step": 9443, + "train/total_loss": 0.14964434504508972 + }, + { + "entropy": 8.391592025756836, + "epoch": 0.9337551908245996, + "mean_token_accuracy": 0.6993339657783508, + "num_tokens": 28351747.0, + "step": 9444, + "train/ce_loss": 1.360185980796814 + }, + { + "epoch": 0.9337551908245996, + "step": 9444, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.9337551908245996, + "step": 9444, + "train/total_loss": 0.20633110404014587 + }, + { + "entropy": 9.084711074829102, + "epoch": 0.933854063674115, + "mean_token_accuracy": 0.7538226246833801, + "num_tokens": 28356847.0, + "step": 9445, + "train/ce_loss": 0.840169370174408 + }, + { + "epoch": 0.933854063674115, + "step": 9445, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.933854063674115, + "step": 9445, + "train/total_loss": 0.11526694148778915 + }, + { + "entropy": 8.835979461669922, + "epoch": 0.9339529365236307, + "mean_token_accuracy": 0.7090908885002136, + "num_tokens": 28361968.0, + "step": 9446, + "train/ce_loss": 1.1038906574249268 + }, + { + "epoch": 0.9339529365236307, + "step": 9446, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9339529365236307, + "step": 9446, + "train/total_loss": 0.13382656872272491 + }, + { + "entropy": 8.689046859741211, + "epoch": 0.9340518093731461, + "mean_token_accuracy": 0.7245509028434753, + "num_tokens": 28367278.0, + "step": 9447, + "train/ce_loss": 0.9410738348960876 + }, + { + "epoch": 0.9340518093731461, + "step": 9447, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.9340518093731461, + "step": 9447, + "train/total_loss": 0.15660738945007324 + }, + { + "entropy": 9.413528442382812, + "epoch": 0.9341506822226616, + "mean_token_accuracy": 0.7511013150215149, + "num_tokens": 28372140.0, + "step": 9448, + "train/ce_loss": 1.5572891235351562 + }, + { + "epoch": 0.9341506822226616, + "step": 9448, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9341506822226616, + "step": 9448, + "train/total_loss": 0.20260392129421234 + }, + { + "entropy": 8.6624755859375, + "epoch": 0.9342495550721772, + "mean_token_accuracy": 0.7384230494499207, + "num_tokens": 28377348.0, + "step": 9449, + "train/ce_loss": 0.7418690323829651 + }, + { + "epoch": 0.9342495550721772, + "step": 9449, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9342495550721772, + "step": 9449, + "train/total_loss": 0.11715565621852875 + }, + { + "entropy": 8.503235816955566, + "epoch": 0.9343484279216927, + "mean_token_accuracy": 0.7927736639976501, + "num_tokens": 28382754.0, + "step": 9450, + "train/ce_loss": 0.5844284296035767 + }, + { + "epoch": 0.9343484279216927, + "step": 9450, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9343484279216927, + "step": 9450, + "train/total_loss": 0.0857865959405899 + }, + { + "entropy": 8.794065475463867, + "epoch": 0.9344473007712082, + "mean_token_accuracy": 0.7134071588516235, + "num_tokens": 28388029.0, + "step": 9451, + "train/ce_loss": 1.0954326391220093 + }, + { + "epoch": 0.9344473007712082, + "step": 9451, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.9344473007712082, + "step": 9451, + "train/total_loss": 0.16813701391220093 + }, + { + "entropy": 9.619552612304688, + "epoch": 0.9345461736207238, + "mean_token_accuracy": 0.7867435216903687, + "num_tokens": 28392811.0, + "step": 9452, + "train/ce_loss": 1.8935411389975343e-06 + }, + { + "epoch": 0.9345461736207238, + "step": 9452, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9345461736207238, + "step": 9452, + "train/total_loss": 0.05078143998980522 + }, + { + "entropy": 9.024389266967773, + "epoch": 0.9346450464702393, + "mean_token_accuracy": 0.8212290406227112, + "num_tokens": 28397955.0, + "step": 9453, + "train/ce_loss": 1.036226990436262e-06 + }, + { + "epoch": 0.9346450464702393, + "step": 9453, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.9346450464702393, + "step": 9453, + "train/total_loss": 0.03515635430812836 + }, + { + "entropy": 8.194178581237793, + "epoch": 0.9347439193197548, + "mean_token_accuracy": 0.6833616495132446, + "num_tokens": 28403671.0, + "step": 9454, + "train/ce_loss": 1.6191717386245728 + }, + { + "epoch": 0.9347439193197548, + "step": 9454, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.9347439193197548, + "step": 9454, + "train/total_loss": 0.23222967982292175 + }, + { + "entropy": 9.058138847351074, + "epoch": 0.9348427921692704, + "mean_token_accuracy": 0.6903137564659119, + "num_tokens": 28408808.0, + "step": 9455, + "train/ce_loss": 1.0023647546768188 + }, + { + "epoch": 0.9348427921692704, + "step": 9455, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.9348427921692704, + "step": 9455, + "train/total_loss": 0.20179897546768188 + }, + { + "entropy": 8.40052604675293, + "epoch": 0.9349416650187858, + "mean_token_accuracy": 0.724258303642273, + "num_tokens": 28414418.0, + "step": 9456, + "train/ce_loss": 0.5172601938247681 + }, + { + "epoch": 0.9349416650187858, + "step": 9456, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9349416650187858, + "step": 9456, + "train/total_loss": 0.06735102087259293 + }, + { + "entropy": 9.11497974395752, + "epoch": 0.9350405378683013, + "mean_token_accuracy": 0.7714285850524902, + "num_tokens": 28419507.0, + "step": 9457, + "train/ce_loss": 0.9743853211402893 + }, + { + "epoch": 0.9350405378683013, + "step": 9457, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9350405378683013, + "step": 9457, + "train/total_loss": 0.12087603658437729 + }, + { + "entropy": 9.073640823364258, + "epoch": 0.9351394107178169, + "mean_token_accuracy": 0.6483180522918701, + "num_tokens": 28424605.0, + "step": 9458, + "train/ce_loss": 2.0779506826329452e-07 + }, + { + "epoch": 0.9351394107178169, + "step": 9458, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9351394107178169, + "step": 9458, + "train/total_loss": 0.01562502048909664 + }, + { + "entropy": 9.379833221435547, + "epoch": 0.9352382835673324, + "mean_token_accuracy": 0.7451403737068176, + "num_tokens": 28429526.0, + "step": 9459, + "train/ce_loss": 2.451115790336189e-07 + }, + { + "epoch": 0.9352382835673324, + "step": 9459, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9352382835673324, + "step": 9459, + "train/total_loss": 0.01562502421438694 + }, + { + "epoch": 0.9353371564168479, + "grad_norm": 0.743285059928894, + "learning_rate": 7.663798645107056e-06, + "loss": 0.1311, + "step": 9460 + }, + { + "entropy": 8.391401290893555, + "epoch": 0.9353371564168479, + "mean_token_accuracy": 0.757080614566803, + "num_tokens": 28434897.0, + "step": 9460, + "train/ce_loss": 0.6055987477302551 + }, + { + "epoch": 0.9353371564168479, + "step": 9460, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9353371564168479, + "step": 9460, + "train/total_loss": 0.08399737626314163 + }, + { + "entropy": 9.518046379089355, + "epoch": 0.9354360292663635, + "mean_token_accuracy": 0.7429466843605042, + "num_tokens": 28439582.0, + "step": 9461, + "train/ce_loss": 1.153960511146579e-06 + }, + { + "epoch": 0.9354360292663635, + "step": 9461, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9354360292663635, + "step": 9461, + "train/total_loss": 0.05468761548399925 + }, + { + "entropy": 8.630594253540039, + "epoch": 0.935534902115879, + "mean_token_accuracy": 0.7149643898010254, + "num_tokens": 28444917.0, + "step": 9462, + "train/ce_loss": 0.5538694262504578 + }, + { + "epoch": 0.935534902115879, + "step": 9462, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.935534902115879, + "step": 9462, + "train/total_loss": 0.12179319560527802 + }, + { + "entropy": 9.009743690490723, + "epoch": 0.9356337749653945, + "mean_token_accuracy": 0.7209302186965942, + "num_tokens": 28449949.0, + "step": 9463, + "train/ce_loss": 1.1485670804977417 + }, + { + "epoch": 0.9356337749653945, + "step": 9463, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.9356337749653945, + "step": 9463, + "train/total_loss": 0.18907546997070312 + }, + { + "entropy": 9.719734191894531, + "epoch": 0.9357326478149101, + "mean_token_accuracy": 0.7188405990600586, + "num_tokens": 28454663.0, + "step": 9464, + "train/ce_loss": 3.317109076306224e-07 + }, + { + "epoch": 0.9357326478149101, + "step": 9464, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.9357326478149101, + "step": 9464, + "train/total_loss": 0.011718783527612686 + }, + { + "entropy": 8.511116027832031, + "epoch": 0.9358315206644255, + "mean_token_accuracy": 0.739506185054779, + "num_tokens": 28459920.0, + "step": 9465, + "train/ce_loss": 0.5615465044975281 + }, + { + "epoch": 0.9358315206644255, + "step": 9465, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9358315206644255, + "step": 9465, + "train/total_loss": 0.09521715342998505 + }, + { + "entropy": 8.738088607788086, + "epoch": 0.935930393513941, + "mean_token_accuracy": 0.7585799098014832, + "num_tokens": 28465216.0, + "step": 9466, + "train/ce_loss": 0.5507985353469849 + }, + { + "epoch": 0.935930393513941, + "step": 9466, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.935930393513941, + "step": 9466, + "train/total_loss": 0.1019548550248146 + }, + { + "entropy": 8.782062530517578, + "epoch": 0.9360292663634566, + "mean_token_accuracy": 0.7211155295372009, + "num_tokens": 28470456.0, + "step": 9467, + "train/ce_loss": 0.7884370684623718 + }, + { + "epoch": 0.9360292663634566, + "step": 9467, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.9360292663634566, + "step": 9467, + "train/total_loss": 0.16868746280670166 + }, + { + "entropy": 8.478357315063477, + "epoch": 0.9361281392129721, + "mean_token_accuracy": 0.7252604365348816, + "num_tokens": 28475703.0, + "step": 9468, + "train/ce_loss": 0.412435382604599 + }, + { + "epoch": 0.9361281392129721, + "step": 9468, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9361281392129721, + "step": 9468, + "train/total_loss": 0.0568685382604599 + }, + { + "entropy": 9.02198314666748, + "epoch": 0.9362270120624876, + "mean_token_accuracy": 0.781345546245575, + "num_tokens": 28480785.0, + "step": 9469, + "train/ce_loss": 0.653954267501831 + }, + { + "epoch": 0.9362270120624876, + "step": 9469, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9362270120624876, + "step": 9469, + "train/total_loss": 0.10445792973041534 + }, + { + "entropy": 9.033451080322266, + "epoch": 0.9363258849120032, + "mean_token_accuracy": 0.7223340272903442, + "num_tokens": 28485735.0, + "step": 9470, + "train/ce_loss": 1.467859192416654e-06 + }, + { + "epoch": 0.9363258849120032, + "step": 9470, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.9363258849120032, + "step": 9470, + "train/total_loss": 0.06250014901161194 + }, + { + "entropy": 8.486593246459961, + "epoch": 0.9364247577615187, + "mean_token_accuracy": 0.7819972038269043, + "num_tokens": 28490908.0, + "step": 9471, + "train/ce_loss": 3.661734240267833e-07 + }, + { + "epoch": 0.9364247577615187, + "step": 9471, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9364247577615187, + "step": 9471, + "train/total_loss": 0.023437537252902985 + }, + { + "entropy": 9.105995178222656, + "epoch": 0.9365236306110342, + "mean_token_accuracy": 0.7398496270179749, + "num_tokens": 28496000.0, + "step": 9472, + "train/ce_loss": 2.3877581156739325e-07 + }, + { + "epoch": 0.9365236306110342, + "step": 9472, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9365236306110342, + "step": 9472, + "train/total_loss": 0.02343752421438694 + }, + { + "entropy": 8.429557800292969, + "epoch": 0.9366225034605498, + "mean_token_accuracy": 0.769978404045105, + "num_tokens": 28501386.0, + "step": 9473, + "train/ce_loss": 0.9409735798835754 + }, + { + "epoch": 0.9366225034605498, + "step": 9473, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9366225034605498, + "step": 9473, + "train/total_loss": 0.14487861096858978 + }, + { + "entropy": 9.37339973449707, + "epoch": 0.9367213763100652, + "mean_token_accuracy": 0.7769376039505005, + "num_tokens": 28506381.0, + "step": 9474, + "train/ce_loss": 4.75928715104601e-07 + }, + { + "epoch": 0.9367213763100652, + "step": 9474, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9367213763100652, + "step": 9474, + "train/total_loss": 0.03906254842877388 + }, + { + "entropy": 8.381181716918945, + "epoch": 0.9368202491595807, + "mean_token_accuracy": 0.782608687877655, + "num_tokens": 28511802.0, + "step": 9475, + "train/ce_loss": 0.6702274680137634 + }, + { + "epoch": 0.9368202491595807, + "step": 9475, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9368202491595807, + "step": 9475, + "train/total_loss": 0.12171024829149246 + }, + { + "entropy": 9.021618843078613, + "epoch": 0.9369191220090963, + "mean_token_accuracy": 0.7468030452728271, + "num_tokens": 28517043.0, + "step": 9476, + "train/ce_loss": 0.4683550298213959 + }, + { + "epoch": 0.9369191220090963, + "step": 9476, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9369191220090963, + "step": 9476, + "train/total_loss": 0.0663667544722557 + }, + { + "entropy": 8.575700759887695, + "epoch": 0.9370179948586118, + "mean_token_accuracy": 0.736775815486908, + "num_tokens": 28522247.0, + "step": 9477, + "train/ce_loss": 1.0939861536026 + }, + { + "epoch": 0.9370179948586118, + "step": 9477, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.9370179948586118, + "step": 9477, + "train/total_loss": 0.14455486834049225 + }, + { + "entropy": 8.39863395690918, + "epoch": 0.9371168677081273, + "mean_token_accuracy": 0.7544204592704773, + "num_tokens": 28527742.0, + "step": 9478, + "train/ce_loss": 0.6728148460388184 + }, + { + "epoch": 0.9371168677081273, + "step": 9478, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.9371168677081273, + "step": 9478, + "train/total_loss": 0.16493773460388184 + }, + { + "entropy": 8.780609130859375, + "epoch": 0.9372157405576429, + "mean_token_accuracy": 0.7668965458869934, + "num_tokens": 28532951.0, + "step": 9479, + "train/ce_loss": 1.119539499282837 + }, + { + "epoch": 0.9372157405576429, + "step": 9479, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9372157405576429, + "step": 9479, + "train/total_loss": 0.1666414439678192 + }, + { + "epoch": 0.9373146134071584, + "grad_norm": 0.5941998362541199, + "learning_rate": 7.658853780349108e-06, + "loss": 0.129, + "step": 9480 + }, + { + "entropy": 9.30009651184082, + "epoch": 0.9373146134071584, + "mean_token_accuracy": 0.7337883710861206, + "num_tokens": 28538002.0, + "step": 9480, + "train/ce_loss": 3.8732500229343714e-07 + }, + { + "epoch": 0.9373146134071584, + "step": 9480, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9373146134071584, + "step": 9480, + "train/total_loss": 0.027343789115548134 + }, + { + "entropy": 8.628240585327148, + "epoch": 0.9374134862566739, + "mean_token_accuracy": 0.692307710647583, + "num_tokens": 28543458.0, + "step": 9481, + "train/ce_loss": 1.2278821468353271 + }, + { + "epoch": 0.9374134862566739, + "step": 9481, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.9374134862566739, + "step": 9481, + "train/total_loss": 0.1813819706439972 + }, + { + "entropy": 8.353310585021973, + "epoch": 0.9375123591061895, + "mean_token_accuracy": 0.769911527633667, + "num_tokens": 28548977.0, + "step": 9482, + "train/ce_loss": 0.6554830074310303 + }, + { + "epoch": 0.9375123591061895, + "step": 9482, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.9375123591061895, + "step": 9482, + "train/total_loss": 0.12414205074310303 + }, + { + "entropy": 8.617682456970215, + "epoch": 0.937611231955705, + "mean_token_accuracy": 0.7256097793579102, + "num_tokens": 28554441.0, + "step": 9483, + "train/ce_loss": 1.6344026327133179 + }, + { + "epoch": 0.937611231955705, + "step": 9483, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.937611231955705, + "step": 9483, + "train/total_loss": 0.2571902871131897 + }, + { + "entropy": 9.161894798278809, + "epoch": 0.9377101048052204, + "mean_token_accuracy": 0.7301855087280273, + "num_tokens": 28559481.0, + "step": 9484, + "train/ce_loss": 1.361616611480713 + }, + { + "epoch": 0.9377101048052204, + "step": 9484, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9377101048052204, + "step": 9484, + "train/total_loss": 0.186942920088768 + }, + { + "entropy": 9.093269348144531, + "epoch": 0.937808977654736, + "mean_token_accuracy": 0.7424242496490479, + "num_tokens": 28564504.0, + "step": 9485, + "train/ce_loss": 8.236112307713483e-07 + }, + { + "epoch": 0.937808977654736, + "step": 9485, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.937808977654736, + "step": 9485, + "train/total_loss": 0.06250008195638657 + }, + { + "entropy": 8.668212890625, + "epoch": 0.9379078505042515, + "mean_token_accuracy": 0.693315863609314, + "num_tokens": 28569736.0, + "step": 9486, + "train/ce_loss": 1.6782050132751465 + }, + { + "epoch": 0.9379078505042515, + "step": 9486, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9379078505042515, + "step": 9486, + "train/total_loss": 0.2186017483472824 + }, + { + "entropy": 8.875524520874023, + "epoch": 0.938006723353767, + "mean_token_accuracy": 0.7543390989303589, + "num_tokens": 28574911.0, + "step": 9487, + "train/ce_loss": 0.9683337807655334 + }, + { + "epoch": 0.938006723353767, + "step": 9487, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.938006723353767, + "step": 9487, + "train/total_loss": 0.14761462807655334 + }, + { + "entropy": 8.638969421386719, + "epoch": 0.9381055962032826, + "mean_token_accuracy": 0.7071611285209656, + "num_tokens": 28580186.0, + "step": 9488, + "train/ce_loss": 0.5961774587631226 + }, + { + "epoch": 0.9381055962032826, + "step": 9488, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.9381055962032826, + "step": 9488, + "train/total_loss": 0.09477399289608002 + }, + { + "entropy": 8.519998550415039, + "epoch": 0.9382044690527981, + "mean_token_accuracy": 0.7333333492279053, + "num_tokens": 28585438.0, + "step": 9489, + "train/ce_loss": 0.6842387914657593 + }, + { + "epoch": 0.9382044690527981, + "step": 9489, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9382044690527981, + "step": 9489, + "train/total_loss": 0.11529888212680817 + }, + { + "entropy": 8.97115421295166, + "epoch": 0.9383033419023136, + "mean_token_accuracy": 0.7360115051269531, + "num_tokens": 28590564.0, + "step": 9490, + "train/ce_loss": 1.1503511667251587 + }, + { + "epoch": 0.9383033419023136, + "step": 9490, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.9383033419023136, + "step": 9490, + "train/total_loss": 0.15019136667251587 + }, + { + "entropy": 9.357909202575684, + "epoch": 0.9384022147518292, + "mean_token_accuracy": 0.7748593091964722, + "num_tokens": 28595539.0, + "step": 9491, + "train/ce_loss": 0.5976631045341492 + }, + { + "epoch": 0.9384022147518292, + "step": 9491, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9384022147518292, + "step": 9491, + "train/total_loss": 0.09882880747318268 + }, + { + "entropy": 8.69143009185791, + "epoch": 0.9385010876013447, + "mean_token_accuracy": 0.763052225112915, + "num_tokens": 28600742.0, + "step": 9492, + "train/ce_loss": 1.2426862716674805 + }, + { + "epoch": 0.9385010876013447, + "step": 9492, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.9385010876013447, + "step": 9492, + "train/total_loss": 0.19848737120628357 + }, + { + "entropy": 8.563522338867188, + "epoch": 0.9385999604508602, + "mean_token_accuracy": 0.7716450095176697, + "num_tokens": 28606121.0, + "step": 9493, + "train/ce_loss": 1.5273971557617188 + }, + { + "epoch": 0.9385999604508602, + "step": 9493, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.9385999604508602, + "step": 9493, + "train/total_loss": 0.2230522185564041 + }, + { + "entropy": 8.836255073547363, + "epoch": 0.9386988333003757, + "mean_token_accuracy": 0.7194631099700928, + "num_tokens": 28611370.0, + "step": 9494, + "train/ce_loss": 0.778255820274353 + }, + { + "epoch": 0.9386988333003757, + "step": 9494, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9386988333003757, + "step": 9494, + "train/total_loss": 0.10126308351755142 + }, + { + "entropy": 8.884140968322754, + "epoch": 0.9387977061498912, + "mean_token_accuracy": 0.7642752528190613, + "num_tokens": 28616492.0, + "step": 9495, + "train/ce_loss": 0.8069877028465271 + }, + { + "epoch": 0.9387977061498912, + "step": 9495, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.9387977061498912, + "step": 9495, + "train/total_loss": 0.13929252326488495 + }, + { + "entropy": 8.872888565063477, + "epoch": 0.9388965789994068, + "mean_token_accuracy": 0.748308539390564, + "num_tokens": 28621699.0, + "step": 9496, + "train/ce_loss": 0.8947360515594482 + }, + { + "epoch": 0.9388965789994068, + "step": 9496, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9388965789994068, + "step": 9496, + "train/total_loss": 0.13634860515594482 + }, + { + "entropy": 9.04621410369873, + "epoch": 0.9389954518489223, + "mean_token_accuracy": 0.7554054260253906, + "num_tokens": 28626910.0, + "step": 9497, + "train/ce_loss": 0.47385963797569275 + }, + { + "epoch": 0.9389954518489223, + "step": 9497, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9389954518489223, + "step": 9497, + "train/total_loss": 0.10207346081733704 + }, + { + "entropy": 8.489900588989258, + "epoch": 0.9390943246984378, + "mean_token_accuracy": 0.7533129453659058, + "num_tokens": 28632354.0, + "step": 9498, + "train/ce_loss": 0.5772135257720947 + }, + { + "epoch": 0.9390943246984378, + "step": 9498, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9390943246984378, + "step": 9498, + "train/total_loss": 0.08506510406732559 + }, + { + "entropy": 9.268988609313965, + "epoch": 0.9391931975479534, + "mean_token_accuracy": 0.782865583896637, + "num_tokens": 28637473.0, + "step": 9499, + "train/ce_loss": 0.5241722464561462 + }, + { + "epoch": 0.9391931975479534, + "step": 9499, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9391931975479534, + "step": 9499, + "train/total_loss": 0.07194847613573074 + }, + { + "epoch": 0.9392920703974689, + "grad_norm": 0.6899769306182861, + "learning_rate": 7.653908915591159e-06, + "loss": 0.1295, + "step": 9500 + }, + { + "entropy": 8.788375854492188, + "epoch": 0.9392920703974689, + "mean_token_accuracy": 0.7633832693099976, + "num_tokens": 28642838.0, + "step": 9500, + "train/ce_loss": 1.4281773567199707 + }, + { + "epoch": 0.9392920703974689, + "step": 9500, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.9392920703974689, + "step": 9500, + "train/total_loss": 0.23656773567199707 + }, + { + "entropy": 8.71684455871582, + "epoch": 0.9393909432469844, + "mean_token_accuracy": 0.7353801131248474, + "num_tokens": 28648153.0, + "step": 9501, + "train/ce_loss": 0.3031177520751953 + }, + { + "epoch": 0.9393909432469844, + "step": 9501, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9393909432469844, + "step": 9501, + "train/total_loss": 0.07328052818775177 + }, + { + "entropy": 8.878063201904297, + "epoch": 0.9394898160965, + "mean_token_accuracy": 0.7370129823684692, + "num_tokens": 28653270.0, + "step": 9502, + "train/ce_loss": 3.8936110513532185e-07 + }, + { + "epoch": 0.9394898160965, + "step": 9502, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9394898160965, + "step": 9502, + "train/total_loss": 0.046875037252902985 + }, + { + "entropy": 9.535175323486328, + "epoch": 0.9395886889460154, + "mean_token_accuracy": 0.7554945349693298, + "num_tokens": 28658044.0, + "step": 9503, + "train/ce_loss": 8.357617389265215e-07 + }, + { + "epoch": 0.9395886889460154, + "step": 9503, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.9395886889460154, + "step": 9503, + "train/total_loss": 0.035156331956386566 + }, + { + "entropy": 8.738447189331055, + "epoch": 0.9396875617955309, + "mean_token_accuracy": 0.723294734954834, + "num_tokens": 28663295.0, + "step": 9504, + "train/ce_loss": 1.1513490676879883 + }, + { + "epoch": 0.9396875617955309, + "step": 9504, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9396875617955309, + "step": 9504, + "train/total_loss": 0.15810365974903107 + }, + { + "entropy": 8.636959075927734, + "epoch": 0.9397864346450465, + "mean_token_accuracy": 0.7348484992980957, + "num_tokens": 28668564.0, + "step": 9505, + "train/ce_loss": 0.7040850520133972 + }, + { + "epoch": 0.9397864346450465, + "step": 9505, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9397864346450465, + "step": 9505, + "train/total_loss": 0.12509600818157196 + }, + { + "entropy": 8.755784034729004, + "epoch": 0.939885307494562, + "mean_token_accuracy": 0.7001166939735413, + "num_tokens": 28673910.0, + "step": 9506, + "train/ce_loss": 0.7709546685218811 + }, + { + "epoch": 0.939885307494562, + "step": 9506, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.939885307494562, + "step": 9506, + "train/total_loss": 0.10443922132253647 + }, + { + "entropy": 8.498984336853027, + "epoch": 0.9399841803440775, + "mean_token_accuracy": 0.8089758157730103, + "num_tokens": 28679269.0, + "step": 9507, + "train/ce_loss": 0.44236424565315247 + }, + { + "epoch": 0.9399841803440775, + "step": 9507, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9399841803440775, + "step": 9507, + "train/total_loss": 0.06376767158508301 + }, + { + "entropy": 9.350627899169922, + "epoch": 0.9400830531935931, + "mean_token_accuracy": 0.7412587404251099, + "num_tokens": 28684233.0, + "step": 9508, + "train/ce_loss": 1.5312329530715942 + }, + { + "epoch": 0.9400830531935931, + "step": 9508, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.9400830531935931, + "step": 9508, + "train/total_loss": 0.21171705424785614 + }, + { + "entropy": 8.621431350708008, + "epoch": 0.9401819260431086, + "mean_token_accuracy": 0.7551867365837097, + "num_tokens": 28689675.0, + "step": 9509, + "train/ce_loss": 0.46875298023223877 + }, + { + "epoch": 0.9401819260431086, + "step": 9509, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9401819260431086, + "step": 9509, + "train/total_loss": 0.06250029802322388 + }, + { + "entropy": 8.615789413452148, + "epoch": 0.9402807988926241, + "mean_token_accuracy": 0.767241358757019, + "num_tokens": 28695205.0, + "step": 9510, + "train/ce_loss": 0.8325075507164001 + }, + { + "epoch": 0.9402807988926241, + "step": 9510, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.9402807988926241, + "step": 9510, + "train/total_loss": 0.1730945110321045 + }, + { + "entropy": 9.028203964233398, + "epoch": 0.9403796717421397, + "mean_token_accuracy": 0.743145763874054, + "num_tokens": 28700348.0, + "step": 9511, + "train/ce_loss": 2.415129642940883e-07 + }, + { + "epoch": 0.9403796717421397, + "step": 9511, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9403796717421397, + "step": 9511, + "train/total_loss": 0.04296877235174179 + }, + { + "entropy": 8.917841911315918, + "epoch": 0.9404785445916551, + "mean_token_accuracy": 0.7300613522529602, + "num_tokens": 28705647.0, + "step": 9512, + "train/ce_loss": 0.7141034007072449 + }, + { + "epoch": 0.9404785445916551, + "step": 9512, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9404785445916551, + "step": 9512, + "train/total_loss": 0.09875409305095673 + }, + { + "entropy": 8.543609619140625, + "epoch": 0.9405774174411706, + "mean_token_accuracy": 0.7538101077079773, + "num_tokens": 28710939.0, + "step": 9513, + "train/ce_loss": 0.6660972833633423 + }, + { + "epoch": 0.9405774174411706, + "step": 9513, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9405774174411706, + "step": 9513, + "train/total_loss": 0.11348473280668259 + }, + { + "entropy": 9.336162567138672, + "epoch": 0.9406762902906862, + "mean_token_accuracy": 0.713385820388794, + "num_tokens": 28715956.0, + "step": 9514, + "train/ce_loss": 1.1417412757873535 + }, + { + "epoch": 0.9406762902906862, + "step": 9514, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9406762902906862, + "step": 9514, + "train/total_loss": 0.16886162757873535 + }, + { + "entropy": 8.69278335571289, + "epoch": 0.9407751631402017, + "mean_token_accuracy": 0.733668327331543, + "num_tokens": 28721273.0, + "step": 9515, + "train/ce_loss": 0.9447028636932373 + }, + { + "epoch": 0.9407751631402017, + "step": 9515, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9407751631402017, + "step": 9515, + "train/total_loss": 0.11400153487920761 + }, + { + "entropy": 8.511211395263672, + "epoch": 0.9408740359897172, + "mean_token_accuracy": 0.7770137786865234, + "num_tokens": 28726782.0, + "step": 9516, + "train/ce_loss": 0.6004266738891602 + }, + { + "epoch": 0.9408740359897172, + "step": 9516, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9408740359897172, + "step": 9516, + "train/total_loss": 0.07957391440868378 + }, + { + "entropy": 8.731058120727539, + "epoch": 0.9409729088392328, + "mean_token_accuracy": 0.7242236137390137, + "num_tokens": 28732050.0, + "step": 9517, + "train/ce_loss": 0.6168516874313354 + }, + { + "epoch": 0.9409729088392328, + "step": 9517, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9409729088392328, + "step": 9517, + "train/total_loss": 0.11246642470359802 + }, + { + "entropy": 8.753804206848145, + "epoch": 0.9410717816887483, + "mean_token_accuracy": 0.756041407585144, + "num_tokens": 28737385.0, + "step": 9518, + "train/ce_loss": 0.41143131256103516 + }, + { + "epoch": 0.9410717816887483, + "step": 9518, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9410717816887483, + "step": 9518, + "train/total_loss": 0.08411188423633575 + }, + { + "entropy": 8.625227928161621, + "epoch": 0.9411706545382638, + "mean_token_accuracy": 0.7270471453666687, + "num_tokens": 28742634.0, + "step": 9519, + "train/ce_loss": 0.6203886866569519 + }, + { + "epoch": 0.9411706545382638, + "step": 9519, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9411706545382638, + "step": 9519, + "train/total_loss": 0.10110136866569519 + }, + { + "epoch": 0.9412695273877794, + "grad_norm": 0.676220178604126, + "learning_rate": 7.64896405083321e-06, + "loss": 0.1316, + "step": 9520 + }, + { + "entropy": 9.301724433898926, + "epoch": 0.9412695273877794, + "mean_token_accuracy": 0.7676923274993896, + "num_tokens": 28747708.0, + "step": 9520, + "train/ce_loss": 0.675580620765686 + }, + { + "epoch": 0.9412695273877794, + "step": 9520, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.9412695273877794, + "step": 9520, + "train/total_loss": 0.14958931505680084 + }, + { + "entropy": 8.998207092285156, + "epoch": 0.9413684002372948, + "mean_token_accuracy": 0.7565698623657227, + "num_tokens": 28752900.0, + "step": 9521, + "train/ce_loss": 1.0089915990829468 + }, + { + "epoch": 0.9413684002372948, + "step": 9521, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.9413684002372948, + "step": 9521, + "train/total_loss": 0.13605540990829468 + }, + { + "entropy": 8.510425567626953, + "epoch": 0.9414672730868103, + "mean_token_accuracy": 0.8020133972167969, + "num_tokens": 28758310.0, + "step": 9522, + "train/ce_loss": 0.4035511314868927 + }, + { + "epoch": 0.9414672730868103, + "step": 9522, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9414672730868103, + "step": 9522, + "train/total_loss": 0.05988636240363121 + }, + { + "entropy": 8.765780448913574, + "epoch": 0.9415661459363259, + "mean_token_accuracy": 0.7340686321258545, + "num_tokens": 28763603.0, + "step": 9523, + "train/ce_loss": 1.7169021368026733 + }, + { + "epoch": 0.9415661459363259, + "step": 9523, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9415661459363259, + "step": 9523, + "train/total_loss": 0.2224714607000351 + }, + { + "entropy": 8.83626937866211, + "epoch": 0.9416650187858414, + "mean_token_accuracy": 0.7493917346000671, + "num_tokens": 28768926.0, + "step": 9524, + "train/ce_loss": 0.545708417892456 + }, + { + "epoch": 0.9416650187858414, + "step": 9524, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.9416650187858414, + "step": 9524, + "train/total_loss": 0.11707083880901337 + }, + { + "entropy": 8.736515045166016, + "epoch": 0.9417638916353569, + "mean_token_accuracy": 0.7122762203216553, + "num_tokens": 28774218.0, + "step": 9525, + "train/ce_loss": 0.6712250113487244 + }, + { + "epoch": 0.9417638916353569, + "step": 9525, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9417638916353569, + "step": 9525, + "train/total_loss": 0.11790375411510468 + }, + { + "entropy": 8.839805603027344, + "epoch": 0.9418627644848725, + "mean_token_accuracy": 0.7200474739074707, + "num_tokens": 28779521.0, + "step": 9526, + "train/ce_loss": 0.5456531047821045 + }, + { + "epoch": 0.9418627644848725, + "step": 9526, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9418627644848725, + "step": 9526, + "train/total_loss": 0.10925281047821045 + }, + { + "entropy": 9.020331382751465, + "epoch": 0.941961637334388, + "mean_token_accuracy": 0.6974790096282959, + "num_tokens": 28784698.0, + "step": 9527, + "train/ce_loss": 0.7585151791572571 + }, + { + "epoch": 0.941961637334388, + "step": 9527, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.941961637334388, + "step": 9527, + "train/total_loss": 0.14225777983665466 + }, + { + "entropy": 9.176788330078125, + "epoch": 0.9420605101839035, + "mean_token_accuracy": 0.7791798114776611, + "num_tokens": 28789834.0, + "step": 9528, + "train/ce_loss": 1.0635613203048706 + }, + { + "epoch": 0.9420605101839035, + "step": 9528, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9420605101839035, + "step": 9528, + "train/total_loss": 0.14932489395141602 + }, + { + "entropy": 9.404987335205078, + "epoch": 0.9421593830334191, + "mean_token_accuracy": 0.7586206793785095, + "num_tokens": 28794813.0, + "step": 9529, + "train/ce_loss": 2.1721525911289064e-07 + }, + { + "epoch": 0.9421593830334191, + "step": 9529, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9421593830334191, + "step": 9529, + "train/total_loss": 0.01562502235174179 + }, + { + "entropy": 8.8800048828125, + "epoch": 0.9422582558829345, + "mean_token_accuracy": 0.7600519061088562, + "num_tokens": 28800027.0, + "step": 9530, + "train/ce_loss": 0.807660698890686 + }, + { + "epoch": 0.9422582558829345, + "step": 9530, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9422582558829345, + "step": 9530, + "train/total_loss": 0.11982857435941696 + }, + { + "entropy": 8.948602676391602, + "epoch": 0.94235712873245, + "mean_token_accuracy": 0.6991150379180908, + "num_tokens": 28805025.0, + "step": 9531, + "train/ce_loss": 8.591353548581537e-07 + }, + { + "epoch": 0.94235712873245, + "step": 9531, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.94235712873245, + "step": 9531, + "train/total_loss": 0.07031258940696716 + }, + { + "entropy": 8.528826713562012, + "epoch": 0.9424560015819656, + "mean_token_accuracy": 0.7619577050209045, + "num_tokens": 28810411.0, + "step": 9532, + "train/ce_loss": 0.952943742275238 + }, + { + "epoch": 0.9424560015819656, + "step": 9532, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9424560015819656, + "step": 9532, + "train/total_loss": 0.13435688614845276 + }, + { + "entropy": 9.240312576293945, + "epoch": 0.9425548744314811, + "mean_token_accuracy": 0.7006173133850098, + "num_tokens": 28815519.0, + "step": 9533, + "train/ce_loss": 1.3274903297424316 + }, + { + "epoch": 0.9425548744314811, + "step": 9533, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9425548744314811, + "step": 9533, + "train/total_loss": 0.1796240359544754 + }, + { + "entropy": 8.97305679321289, + "epoch": 0.9426537472809966, + "mean_token_accuracy": 0.7384615540504456, + "num_tokens": 28820566.0, + "step": 9534, + "train/ce_loss": 0.9110743999481201 + }, + { + "epoch": 0.9426537472809966, + "step": 9534, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9426537472809966, + "step": 9534, + "train/total_loss": 0.11454494297504425 + }, + { + "entropy": 8.747942924499512, + "epoch": 0.9427526201305122, + "mean_token_accuracy": 0.7578828930854797, + "num_tokens": 28825885.0, + "step": 9535, + "train/ce_loss": 0.47831931710243225 + }, + { + "epoch": 0.9427526201305122, + "step": 9535, + "train/sim_loss": 0.1015625 + }, + { + "epoch": 0.9427526201305122, + "step": 9535, + "train/total_loss": 0.1493944376707077 + }, + { + "entropy": 8.6371488571167, + "epoch": 0.9428514929800277, + "mean_token_accuracy": 0.7226791977882385, + "num_tokens": 28831199.0, + "step": 9536, + "train/ce_loss": 0.798780083656311 + }, + { + "epoch": 0.9428514929800277, + "step": 9536, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9428514929800277, + "step": 9536, + "train/total_loss": 0.13456550240516663 + }, + { + "entropy": 8.563827514648438, + "epoch": 0.9429503658295432, + "mean_token_accuracy": 0.7590090036392212, + "num_tokens": 28836555.0, + "step": 9537, + "train/ce_loss": 0.910048246383667 + }, + { + "epoch": 0.9429503658295432, + "step": 9537, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9429503658295432, + "step": 9537, + "train/total_loss": 0.11053607612848282 + }, + { + "entropy": 8.62954044342041, + "epoch": 0.9430492386790588, + "mean_token_accuracy": 0.7395833134651184, + "num_tokens": 28841813.0, + "step": 9538, + "train/ce_loss": 0.6747787594795227 + }, + { + "epoch": 0.9430492386790588, + "step": 9538, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9430492386790588, + "step": 9538, + "train/total_loss": 0.10654037445783615 + }, + { + "entropy": 8.663762092590332, + "epoch": 0.9431481115285743, + "mean_token_accuracy": 0.7041942477226257, + "num_tokens": 28847202.0, + "step": 9539, + "train/ce_loss": 0.8284857273101807 + }, + { + "epoch": 0.9431481115285743, + "step": 9539, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9431481115285743, + "step": 9539, + "train/total_loss": 0.12581732869148254 + }, + { + "epoch": 0.9432469843780897, + "grad_norm": 0.6371222138404846, + "learning_rate": 7.644019186075261e-06, + "loss": 0.1344, + "step": 9540 + }, + { + "entropy": 9.424263954162598, + "epoch": 0.9432469843780897, + "mean_token_accuracy": 0.6541849970817566, + "num_tokens": 28852115.0, + "step": 9540, + "train/ce_loss": 4.169666567577224e-07 + }, + { + "epoch": 0.9432469843780897, + "step": 9540, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9432469843780897, + "step": 9540, + "train/total_loss": 0.03906254097819328 + }, + { + "entropy": 8.73521900177002, + "epoch": 0.9433458572276053, + "mean_token_accuracy": 0.7703889608383179, + "num_tokens": 28857360.0, + "step": 9541, + "train/ce_loss": 0.6447766423225403 + }, + { + "epoch": 0.9433458572276053, + "step": 9541, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.9433458572276053, + "step": 9541, + "train/total_loss": 0.15041516721248627 + }, + { + "entropy": 8.16667366027832, + "epoch": 0.9434447300771208, + "mean_token_accuracy": 0.6919233798980713, + "num_tokens": 28863070.0, + "step": 9542, + "train/ce_loss": 0.875781238079071 + }, + { + "epoch": 0.9434447300771208, + "step": 9542, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.9434447300771208, + "step": 9542, + "train/total_loss": 0.19304686784744263 + }, + { + "entropy": 8.911556243896484, + "epoch": 0.9435436029266363, + "mean_token_accuracy": 0.8066825866699219, + "num_tokens": 28868394.0, + "step": 9543, + "train/ce_loss": 1.0784525871276855 + }, + { + "epoch": 0.9435436029266363, + "step": 9543, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9435436029266363, + "step": 9543, + "train/total_loss": 0.1469077616930008 + }, + { + "entropy": 8.681024551391602, + "epoch": 0.9436424757761519, + "mean_token_accuracy": 0.7692307829856873, + "num_tokens": 28873701.0, + "step": 9544, + "train/ce_loss": 0.6686069369316101 + }, + { + "epoch": 0.9436424757761519, + "step": 9544, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9436424757761519, + "step": 9544, + "train/total_loss": 0.09029819816350937 + }, + { + "entropy": 9.512763977050781, + "epoch": 0.9437413486256674, + "mean_token_accuracy": 0.7889125943183899, + "num_tokens": 28878591.0, + "step": 9545, + "train/ce_loss": 5.38847814368637e-07 + }, + { + "epoch": 0.9437413486256674, + "step": 9545, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9437413486256674, + "step": 9545, + "train/total_loss": 0.04296880215406418 + }, + { + "entropy": 9.259182929992676, + "epoch": 0.9438402214751829, + "mean_token_accuracy": 0.7662538886070251, + "num_tokens": 28883674.0, + "step": 9546, + "train/ce_loss": 0.5185627937316895 + }, + { + "epoch": 0.9438402214751829, + "step": 9546, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9438402214751829, + "step": 9546, + "train/total_loss": 0.06748127937316895 + }, + { + "entropy": 8.751737594604492, + "epoch": 0.9439390943246985, + "mean_token_accuracy": 0.7868080139160156, + "num_tokens": 28888989.0, + "step": 9547, + "train/ce_loss": 0.4132890999317169 + }, + { + "epoch": 0.9439390943246985, + "step": 9547, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9439390943246985, + "step": 9547, + "train/total_loss": 0.09601640701293945 + }, + { + "entropy": 8.520257949829102, + "epoch": 0.944037967174214, + "mean_token_accuracy": 0.6856866478919983, + "num_tokens": 28894447.0, + "step": 9548, + "train/ce_loss": 0.6307772994041443 + }, + { + "epoch": 0.944037967174214, + "step": 9548, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.944037967174214, + "step": 9548, + "train/total_loss": 0.09432773292064667 + }, + { + "entropy": 9.019133567810059, + "epoch": 0.9441368400237294, + "mean_token_accuracy": 0.7587253451347351, + "num_tokens": 28899570.0, + "step": 9549, + "train/ce_loss": 0.7235116958618164 + }, + { + "epoch": 0.9441368400237294, + "step": 9549, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9441368400237294, + "step": 9549, + "train/total_loss": 0.11531992256641388 + }, + { + "entropy": 8.642354965209961, + "epoch": 0.944235712873245, + "mean_token_accuracy": 0.7587336301803589, + "num_tokens": 28904949.0, + "step": 9550, + "train/ce_loss": 0.7822985649108887 + }, + { + "epoch": 0.944235712873245, + "step": 9550, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.944235712873245, + "step": 9550, + "train/total_loss": 0.1094798594713211 + }, + { + "entropy": 9.277965545654297, + "epoch": 0.9443345857227605, + "mean_token_accuracy": 0.7056530117988586, + "num_tokens": 28909941.0, + "step": 9551, + "train/ce_loss": 0.6955422163009644 + }, + { + "epoch": 0.9443345857227605, + "step": 9551, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9443345857227605, + "step": 9551, + "train/total_loss": 0.11642922461032867 + }, + { + "entropy": 8.385626792907715, + "epoch": 0.944433458572276, + "mean_token_accuracy": 0.7405660152435303, + "num_tokens": 28915504.0, + "step": 9552, + "train/ce_loss": 0.6856048107147217 + }, + { + "epoch": 0.944433458572276, + "step": 9552, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.944433458572276, + "step": 9552, + "train/total_loss": 0.11152923107147217 + }, + { + "entropy": 8.710573196411133, + "epoch": 0.9445323314217916, + "mean_token_accuracy": 0.7350993156433105, + "num_tokens": 28920720.0, + "step": 9553, + "train/ce_loss": 0.7603187561035156 + }, + { + "epoch": 0.9445323314217916, + "step": 9553, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9445323314217916, + "step": 9553, + "train/total_loss": 0.1307193785905838 + }, + { + "entropy": 8.891063690185547, + "epoch": 0.9446312042713071, + "mean_token_accuracy": 0.7756097316741943, + "num_tokens": 28925965.0, + "step": 9554, + "train/ce_loss": 0.5832635760307312 + }, + { + "epoch": 0.9446312042713071, + "step": 9554, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9446312042713071, + "step": 9554, + "train/total_loss": 0.0973888635635376 + }, + { + "entropy": 8.735498428344727, + "epoch": 0.9447300771208226, + "mean_token_accuracy": 0.7409793734550476, + "num_tokens": 28931288.0, + "step": 9555, + "train/ce_loss": 0.45289525389671326 + }, + { + "epoch": 0.9447300771208226, + "step": 9555, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9447300771208226, + "step": 9555, + "train/total_loss": 0.0999770313501358 + }, + { + "entropy": 9.089837074279785, + "epoch": 0.9448289499703382, + "mean_token_accuracy": 0.7692307829856873, + "num_tokens": 28936466.0, + "step": 9556, + "train/ce_loss": 0.5755773186683655 + }, + { + "epoch": 0.9448289499703382, + "step": 9556, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9448289499703382, + "step": 9556, + "train/total_loss": 0.10052648186683655 + }, + { + "entropy": 8.925742149353027, + "epoch": 0.9449278228198537, + "mean_token_accuracy": 0.7666263580322266, + "num_tokens": 28941765.0, + "step": 9557, + "train/ce_loss": 0.8353256583213806 + }, + { + "epoch": 0.9449278228198537, + "step": 9557, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.9449278228198537, + "step": 9557, + "train/total_loss": 0.18900132179260254 + }, + { + "entropy": 9.195016860961914, + "epoch": 0.9450266956693691, + "mean_token_accuracy": 0.7957860827445984, + "num_tokens": 28947015.0, + "step": 9558, + "train/ce_loss": 1.1745822429656982 + }, + { + "epoch": 0.9450266956693691, + "step": 9558, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.9450266956693691, + "step": 9558, + "train/total_loss": 0.20730197429656982 + }, + { + "entropy": 9.235591888427734, + "epoch": 0.9451255685188847, + "mean_token_accuracy": 0.7303225994110107, + "num_tokens": 28952238.0, + "step": 9559, + "train/ce_loss": 1.6796906265881262e-07 + }, + { + "epoch": 0.9451255685188847, + "step": 9559, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9451255685188847, + "step": 9559, + "train/total_loss": 0.019531266763806343 + }, + { + "epoch": 0.9452244413684002, + "grad_norm": 0.6818765997886658, + "learning_rate": 7.639074321317312e-06, + "loss": 0.1339, + "step": 9560 + }, + { + "entropy": 8.612297058105469, + "epoch": 0.9452244413684002, + "mean_token_accuracy": 0.7153465151786804, + "num_tokens": 28957528.0, + "step": 9560, + "train/ce_loss": 1.071324110031128 + }, + { + "epoch": 0.9452244413684002, + "step": 9560, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.9452244413684002, + "step": 9560, + "train/total_loss": 0.17353865504264832 + }, + { + "entropy": 8.844977378845215, + "epoch": 0.9453233142179157, + "mean_token_accuracy": 0.7661623358726501, + "num_tokens": 28962730.0, + "step": 9561, + "train/ce_loss": 7.732745075372804e-07 + }, + { + "epoch": 0.9453233142179157, + "step": 9561, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9453233142179157, + "step": 9561, + "train/total_loss": 0.05078132823109627 + }, + { + "entropy": 8.497743606567383, + "epoch": 0.9454221870674313, + "mean_token_accuracy": 0.7420249581336975, + "num_tokens": 28967957.0, + "step": 9562, + "train/ce_loss": 1.2307610511779785 + }, + { + "epoch": 0.9454221870674313, + "step": 9562, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9454221870674313, + "step": 9562, + "train/total_loss": 0.16604486107826233 + }, + { + "entropy": 8.593058586120605, + "epoch": 0.9455210599169468, + "mean_token_accuracy": 0.7852193713188171, + "num_tokens": 28973312.0, + "step": 9563, + "train/ce_loss": 0.708541750907898 + }, + { + "epoch": 0.9455210599169468, + "step": 9563, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9455210599169468, + "step": 9563, + "train/total_loss": 0.10991667956113815 + }, + { + "entropy": 9.197546005249023, + "epoch": 0.9456199327664623, + "mean_token_accuracy": 0.7704917788505554, + "num_tokens": 28978302.0, + "step": 9564, + "train/ce_loss": 0.8882774114608765 + }, + { + "epoch": 0.9456199327664623, + "step": 9564, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9456199327664623, + "step": 9564, + "train/total_loss": 0.13960899412631989 + }, + { + "entropy": 8.964488983154297, + "epoch": 0.9457188056159779, + "mean_token_accuracy": 0.7850098609924316, + "num_tokens": 28983289.0, + "step": 9565, + "train/ce_loss": 1.0092006921768188 + }, + { + "epoch": 0.9457188056159779, + "step": 9565, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.9457188056159779, + "step": 9565, + "train/total_loss": 0.18685758113861084 + }, + { + "entropy": 8.841659545898438, + "epoch": 0.9458176784654934, + "mean_token_accuracy": 0.7730496525764465, + "num_tokens": 28988462.0, + "step": 9566, + "train/ce_loss": 2.0214712619781494 + }, + { + "epoch": 0.9458176784654934, + "step": 9566, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.9458176784654934, + "step": 9566, + "train/total_loss": 0.26855337619781494 + }, + { + "entropy": 8.560609817504883, + "epoch": 0.9459165513150088, + "mean_token_accuracy": 0.6855733394622803, + "num_tokens": 28993716.0, + "step": 9567, + "train/ce_loss": 0.9242957830429077 + }, + { + "epoch": 0.9459165513150088, + "step": 9567, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9459165513150088, + "step": 9567, + "train/total_loss": 0.14321082830429077 + }, + { + "entropy": 9.03412914276123, + "epoch": 0.9460154241645244, + "mean_token_accuracy": 0.836241602897644, + "num_tokens": 28998902.0, + "step": 9568, + "train/ce_loss": 0.48793551325798035 + }, + { + "epoch": 0.9460154241645244, + "step": 9568, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9460154241645244, + "step": 9568, + "train/total_loss": 0.07613730430603027 + }, + { + "entropy": 8.861091613769531, + "epoch": 0.9461142970140399, + "mean_token_accuracy": 0.7330447435379028, + "num_tokens": 29004087.0, + "step": 9569, + "train/ce_loss": 0.4134216010570526 + }, + { + "epoch": 0.9461142970140399, + "step": 9569, + "train/sim_loss": 0.10546875 + }, + { + "epoch": 0.9461142970140399, + "step": 9569, + "train/total_loss": 0.14681091904640198 + }, + { + "entropy": 9.295025825500488, + "epoch": 0.9462131698635554, + "mean_token_accuracy": 0.739130437374115, + "num_tokens": 29009015.0, + "step": 9570, + "train/ce_loss": 1.2365758419036865 + }, + { + "epoch": 0.9462131698635554, + "step": 9570, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.9462131698635554, + "step": 9570, + "train/total_loss": 0.18615758419036865 + }, + { + "entropy": 8.862635612487793, + "epoch": 0.946312042713071, + "mean_token_accuracy": 0.7630890011787415, + "num_tokens": 29014235.0, + "step": 9571, + "train/ce_loss": 0.8983027935028076 + }, + { + "epoch": 0.946312042713071, + "step": 9571, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.946312042713071, + "step": 9571, + "train/total_loss": 0.14451777935028076 + }, + { + "entropy": 9.31369400024414, + "epoch": 0.9464109155625865, + "mean_token_accuracy": 0.7960088849067688, + "num_tokens": 29019124.0, + "step": 9572, + "train/ce_loss": 1.9161144495010376 + }, + { + "epoch": 0.9464109155625865, + "step": 9572, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.9464109155625865, + "step": 9572, + "train/total_loss": 0.26973646879196167 + }, + { + "entropy": 9.24710464477539, + "epoch": 0.946509788412102, + "mean_token_accuracy": 0.715242862701416, + "num_tokens": 29024179.0, + "step": 9573, + "train/ce_loss": 1.3356640338897705 + }, + { + "epoch": 0.946509788412102, + "step": 9573, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.946509788412102, + "step": 9573, + "train/total_loss": 0.18044140934944153 + }, + { + "entropy": 8.64162540435791, + "epoch": 0.9466086612616176, + "mean_token_accuracy": 0.7286624312400818, + "num_tokens": 29029453.0, + "step": 9574, + "train/ce_loss": 7.203916538855992e-07 + }, + { + "epoch": 0.9466086612616176, + "step": 9574, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9466086612616176, + "step": 9574, + "train/total_loss": 0.01953132264316082 + }, + { + "entropy": 8.737985610961914, + "epoch": 0.9467075341111331, + "mean_token_accuracy": 0.6741440296173096, + "num_tokens": 29034778.0, + "step": 9575, + "train/ce_loss": 1.2737239599227905 + }, + { + "epoch": 0.9467075341111331, + "step": 9575, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.9467075341111331, + "step": 9575, + "train/total_loss": 0.2015911489725113 + }, + { + "entropy": 8.338376998901367, + "epoch": 0.9468064069606487, + "mean_token_accuracy": 0.7080808281898499, + "num_tokens": 29040270.0, + "step": 9576, + "train/ce_loss": 1.3967866897583008 + }, + { + "epoch": 0.9468064069606487, + "step": 9576, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.9468064069606487, + "step": 9576, + "train/total_loss": 0.21780367195606232 + }, + { + "entropy": 8.711580276489258, + "epoch": 0.9469052798101641, + "mean_token_accuracy": 0.7848761677742004, + "num_tokens": 29045465.0, + "step": 9577, + "train/ce_loss": 0.6465726494789124 + }, + { + "epoch": 0.9469052798101641, + "step": 9577, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.9469052798101641, + "step": 9577, + "train/total_loss": 0.1974697709083557 + }, + { + "entropy": 8.95461654663086, + "epoch": 0.9470041526596796, + "mean_token_accuracy": 0.7730711102485657, + "num_tokens": 29050545.0, + "step": 9578, + "train/ce_loss": 0.6712116599082947 + }, + { + "epoch": 0.9470041526596796, + "step": 9578, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.9470041526596796, + "step": 9578, + "train/total_loss": 0.18430867791175842 + }, + { + "entropy": 8.306659698486328, + "epoch": 0.9471030255091952, + "mean_token_accuracy": 0.7609561681747437, + "num_tokens": 29056056.0, + "step": 9579, + "train/ce_loss": 0.7028023600578308 + }, + { + "epoch": 0.9471030255091952, + "step": 9579, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9471030255091952, + "step": 9579, + "train/total_loss": 0.10153023898601532 + }, + { + "epoch": 0.9472018983587107, + "grad_norm": 0.5578201413154602, + "learning_rate": 7.634129456559364e-06, + "loss": 0.1314, + "step": 9580 + }, + { + "entropy": 8.874184608459473, + "epoch": 0.9472018983587107, + "mean_token_accuracy": 0.761049747467041, + "num_tokens": 29061251.0, + "step": 9580, + "train/ce_loss": 1.3506217002868652 + }, + { + "epoch": 0.9472018983587107, + "step": 9580, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.9472018983587107, + "step": 9580, + "train/total_loss": 0.19756217300891876 + }, + { + "entropy": 8.735138893127441, + "epoch": 0.9473007712082262, + "mean_token_accuracy": 0.7437810897827148, + "num_tokens": 29066553.0, + "step": 9581, + "train/ce_loss": 0.697642982006073 + }, + { + "epoch": 0.9473007712082262, + "step": 9581, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9473007712082262, + "step": 9581, + "train/total_loss": 0.11663930118083954 + }, + { + "entropy": 8.543025970458984, + "epoch": 0.9473996440577418, + "mean_token_accuracy": 0.7392815947532654, + "num_tokens": 29071903.0, + "step": 9582, + "train/ce_loss": 1.2665374279022217 + }, + { + "epoch": 0.9473996440577418, + "step": 9582, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.9473996440577418, + "step": 9582, + "train/total_loss": 0.2086849957704544 + }, + { + "entropy": 8.526872634887695, + "epoch": 0.9474985169072573, + "mean_token_accuracy": 0.7544987201690674, + "num_tokens": 29077161.0, + "step": 9583, + "train/ce_loss": 0.6691533923149109 + }, + { + "epoch": 0.9474985169072573, + "step": 9583, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.9474985169072573, + "step": 9583, + "train/total_loss": 0.1294153332710266 + }, + { + "entropy": 8.957462310791016, + "epoch": 0.9475973897567728, + "mean_token_accuracy": 0.7225806713104248, + "num_tokens": 29082393.0, + "step": 9584, + "train/ce_loss": 0.49990689754486084 + }, + { + "epoch": 0.9475973897567728, + "step": 9584, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9475973897567728, + "step": 9584, + "train/total_loss": 0.0695219412446022 + }, + { + "entropy": 9.148205757141113, + "epoch": 0.9476962626062884, + "mean_token_accuracy": 0.7481343150138855, + "num_tokens": 29087348.0, + "step": 9585, + "train/ce_loss": 1.5612250763297197e-06 + }, + { + "epoch": 0.9476962626062884, + "step": 9585, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9476962626062884, + "step": 9585, + "train/total_loss": 0.039062656462192535 + }, + { + "entropy": 9.202255249023438, + "epoch": 0.9477951354558038, + "mean_token_accuracy": 0.7464454770088196, + "num_tokens": 29092247.0, + "step": 9586, + "train/ce_loss": 9.728628356242552e-07 + }, + { + "epoch": 0.9477951354558038, + "step": 9586, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9477951354558038, + "step": 9586, + "train/total_loss": 0.03125009685754776 + }, + { + "entropy": 8.650840759277344, + "epoch": 0.9478940083053193, + "mean_token_accuracy": 0.7518337368965149, + "num_tokens": 29097518.0, + "step": 9587, + "train/ce_loss": 0.9363239407539368 + }, + { + "epoch": 0.9478940083053193, + "step": 9587, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.9478940083053193, + "step": 9587, + "train/total_loss": 0.17175740003585815 + }, + { + "entropy": 8.485133171081543, + "epoch": 0.9479928811548349, + "mean_token_accuracy": 0.7577720284461975, + "num_tokens": 29102713.0, + "step": 9588, + "train/ce_loss": 0.8084627389907837 + }, + { + "epoch": 0.9479928811548349, + "step": 9588, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9479928811548349, + "step": 9588, + "train/total_loss": 0.13162752985954285 + }, + { + "entropy": 8.302215576171875, + "epoch": 0.9480917540043504, + "mean_token_accuracy": 0.7535714507102966, + "num_tokens": 29108249.0, + "step": 9589, + "train/ce_loss": 0.6899745464324951 + }, + { + "epoch": 0.9480917540043504, + "step": 9589, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9480917540043504, + "step": 9589, + "train/total_loss": 0.11587245762348175 + }, + { + "entropy": 9.150361061096191, + "epoch": 0.9481906268538659, + "mean_token_accuracy": 0.720588207244873, + "num_tokens": 29113185.0, + "step": 9590, + "train/ce_loss": 9.790721833269345e-07 + }, + { + "epoch": 0.9481906268538659, + "step": 9590, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9481906268538659, + "step": 9590, + "train/total_loss": 0.03125009685754776 + }, + { + "entropy": 8.1705322265625, + "epoch": 0.9482894997033815, + "mean_token_accuracy": 0.7268232107162476, + "num_tokens": 29118477.0, + "step": 9591, + "train/ce_loss": 0.5983821749687195 + }, + { + "epoch": 0.9482894997033815, + "step": 9591, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9482894997033815, + "step": 9591, + "train/total_loss": 0.09108822047710419 + }, + { + "entropy": 8.875567436218262, + "epoch": 0.948388372552897, + "mean_token_accuracy": 0.7209302186965942, + "num_tokens": 29123381.0, + "step": 9592, + "train/ce_loss": 0.9020702242851257 + }, + { + "epoch": 0.948388372552897, + "step": 9592, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.948388372552897, + "step": 9592, + "train/total_loss": 0.11364452540874481 + }, + { + "entropy": 8.887754440307617, + "epoch": 0.9484872454024125, + "mean_token_accuracy": 0.7001434564590454, + "num_tokens": 29128591.0, + "step": 9593, + "train/ce_loss": 4.049984170251264e-07 + }, + { + "epoch": 0.9484872454024125, + "step": 9593, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9484872454024125, + "step": 9593, + "train/total_loss": 0.03906254097819328 + }, + { + "entropy": 8.799975395202637, + "epoch": 0.9485861182519281, + "mean_token_accuracy": 0.757615864276886, + "num_tokens": 29133798.0, + "step": 9594, + "train/ce_loss": 1.4511492252349854 + }, + { + "epoch": 0.9485861182519281, + "step": 9594, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.9485861182519281, + "step": 9594, + "train/total_loss": 0.231052428483963 + }, + { + "entropy": 8.386930465698242, + "epoch": 0.9486849911014436, + "mean_token_accuracy": 0.8011173009872437, + "num_tokens": 29139193.0, + "step": 9595, + "train/ce_loss": 0.666289210319519 + }, + { + "epoch": 0.9486849911014436, + "step": 9595, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9486849911014436, + "step": 9595, + "train/total_loss": 0.09006642550230026 + }, + { + "entropy": 8.39547061920166, + "epoch": 0.948783863950959, + "mean_token_accuracy": 0.7128072381019592, + "num_tokens": 29144436.0, + "step": 9596, + "train/ce_loss": 1.6115036010742188 + }, + { + "epoch": 0.948783863950959, + "step": 9596, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.948783863950959, + "step": 9596, + "train/total_loss": 0.20021286606788635 + }, + { + "entropy": 8.999906539916992, + "epoch": 0.9488827368004746, + "mean_token_accuracy": 0.7307692170143127, + "num_tokens": 29149488.0, + "step": 9597, + "train/ce_loss": 1.4049520586922881e-06 + }, + { + "epoch": 0.9488827368004746, + "step": 9597, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9488827368004746, + "step": 9597, + "train/total_loss": 0.04296889156103134 + }, + { + "entropy": 9.10368824005127, + "epoch": 0.9489816096499901, + "mean_token_accuracy": 0.830232560634613, + "num_tokens": 29154363.0, + "step": 9598, + "train/ce_loss": 1.1234755516052246 + }, + { + "epoch": 0.9489816096499901, + "step": 9598, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9489816096499901, + "step": 9598, + "train/total_loss": 0.1357850581407547 + }, + { + "entropy": 9.209823608398438, + "epoch": 0.9490804824995056, + "mean_token_accuracy": 0.8063943386077881, + "num_tokens": 29159343.0, + "step": 9599, + "train/ce_loss": 0.7923817038536072 + }, + { + "epoch": 0.9490804824995056, + "step": 9599, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9490804824995056, + "step": 9599, + "train/total_loss": 0.1222069188952446 + }, + { + "epoch": 0.9491793553490212, + "grad_norm": 0.5473429560661316, + "learning_rate": 7.629184591801415e-06, + "loss": 0.1311, + "step": 9600 + }, + { + "entropy": 8.758648872375488, + "epoch": 0.9491793553490212, + "mean_token_accuracy": 0.7156726717948914, + "num_tokens": 29164485.0, + "step": 9600, + "train/ce_loss": 1.3763682842254639 + }, + { + "epoch": 0.9491793553490212, + "step": 9600, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9491793553490212, + "step": 9600, + "train/total_loss": 0.18451182544231415 + }, + { + "entropy": 8.479524612426758, + "epoch": 0.9492782281985367, + "mean_token_accuracy": 0.7768199443817139, + "num_tokens": 29170003.0, + "step": 9601, + "train/ce_loss": 0.7482802271842957 + }, + { + "epoch": 0.9492782281985367, + "step": 9601, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9492782281985367, + "step": 9601, + "train/total_loss": 0.11389052122831345 + }, + { + "entropy": 8.709478378295898, + "epoch": 0.9493771010480522, + "mean_token_accuracy": 0.7390776872634888, + "num_tokens": 29175249.0, + "step": 9602, + "train/ce_loss": 0.5524051785469055 + }, + { + "epoch": 0.9493771010480522, + "step": 9602, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9493771010480522, + "step": 9602, + "train/total_loss": 0.10602176934480667 + }, + { + "entropy": 8.462508201599121, + "epoch": 0.9494759738975678, + "mean_token_accuracy": 0.8088942170143127, + "num_tokens": 29180548.0, + "step": 9603, + "train/ce_loss": 0.45807701349258423 + }, + { + "epoch": 0.9494759738975678, + "step": 9603, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.9494759738975678, + "step": 9603, + "train/total_loss": 0.08096395432949066 + }, + { + "entropy": 8.42574405670166, + "epoch": 0.9495748467470833, + "mean_token_accuracy": 0.7428571581840515, + "num_tokens": 29185774.0, + "step": 9604, + "train/ce_loss": 0.8734005689620972 + }, + { + "epoch": 0.9495748467470833, + "step": 9604, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.9495748467470833, + "step": 9604, + "train/total_loss": 0.12249630689620972 + }, + { + "entropy": 8.75879955291748, + "epoch": 0.9496737195965987, + "mean_token_accuracy": 0.7345844507217407, + "num_tokens": 29191030.0, + "step": 9605, + "train/ce_loss": 1.0893501043319702 + }, + { + "epoch": 0.9496737195965987, + "step": 9605, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9496737195965987, + "step": 9605, + "train/total_loss": 0.13627876341342926 + }, + { + "entropy": 8.954879760742188, + "epoch": 0.9497725924461143, + "mean_token_accuracy": 0.7334235310554504, + "num_tokens": 29196218.0, + "step": 9606, + "train/ce_loss": 1.2382944822311401 + }, + { + "epoch": 0.9497725924461143, + "step": 9606, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.9497725924461143, + "step": 9606, + "train/total_loss": 0.2214857041835785 + }, + { + "entropy": 8.716231346130371, + "epoch": 0.9498714652956298, + "mean_token_accuracy": 0.7468671798706055, + "num_tokens": 29201482.0, + "step": 9607, + "train/ce_loss": 0.9447196125984192 + }, + { + "epoch": 0.9498714652956298, + "step": 9607, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9498714652956298, + "step": 9607, + "train/total_loss": 0.13744071125984192 + }, + { + "entropy": 8.335722923278809, + "epoch": 0.9499703381451453, + "mean_token_accuracy": 0.7696390748023987, + "num_tokens": 29206827.0, + "step": 9608, + "train/ce_loss": 0.7796752452850342 + }, + { + "epoch": 0.9499703381451453, + "step": 9608, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.9499703381451453, + "step": 9608, + "train/total_loss": 0.11312377452850342 + }, + { + "entropy": 8.784276962280273, + "epoch": 0.9500692109946609, + "mean_token_accuracy": 0.7550200819969177, + "num_tokens": 29212004.0, + "step": 9609, + "train/ce_loss": 1.1044058799743652 + }, + { + "epoch": 0.9500692109946609, + "step": 9609, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.9500692109946609, + "step": 9609, + "train/total_loss": 0.21981558203697205 + }, + { + "entropy": 8.904178619384766, + "epoch": 0.9501680838441764, + "mean_token_accuracy": 0.7452830076217651, + "num_tokens": 29217230.0, + "step": 9610, + "train/ce_loss": 2.821152236265334e-07 + }, + { + "epoch": 0.9501680838441764, + "step": 9610, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9501680838441764, + "step": 9610, + "train/total_loss": 0.05468752980232239 + }, + { + "entropy": 9.369682312011719, + "epoch": 0.9502669566936919, + "mean_token_accuracy": 0.7763158082962036, + "num_tokens": 29222069.0, + "step": 9611, + "train/ce_loss": 9.107867526836344e-07 + }, + { + "epoch": 0.9502669566936919, + "step": 9611, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9502669566936919, + "step": 9611, + "train/total_loss": 0.04687508940696716 + }, + { + "entropy": 8.254669189453125, + "epoch": 0.9503658295432075, + "mean_token_accuracy": 0.6848049163818359, + "num_tokens": 29227423.0, + "step": 9612, + "train/ce_loss": 1.081194519996643 + }, + { + "epoch": 0.9503658295432075, + "step": 9612, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.9503658295432075, + "step": 9612, + "train/total_loss": 0.19015070796012878 + }, + { + "entropy": 8.887582778930664, + "epoch": 0.950464702392723, + "mean_token_accuracy": 0.7818182110786438, + "num_tokens": 29232676.0, + "step": 9613, + "train/ce_loss": 1.269497715838952e-07 + }, + { + "epoch": 0.950464702392723, + "step": 9613, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.950464702392723, + "step": 9613, + "train/total_loss": 0.011718763038516045 + }, + { + "entropy": 8.436988830566406, + "epoch": 0.9505635752422384, + "mean_token_accuracy": 0.7754459381103516, + "num_tokens": 29238129.0, + "step": 9614, + "train/ce_loss": 0.8593966364860535 + }, + { + "epoch": 0.9505635752422384, + "step": 9614, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9505635752422384, + "step": 9614, + "train/total_loss": 0.1250021755695343 + }, + { + "entropy": 8.663818359375, + "epoch": 0.950662448091754, + "mean_token_accuracy": 0.746268630027771, + "num_tokens": 29243334.0, + "step": 9615, + "train/ce_loss": 0.8685634136199951 + }, + { + "epoch": 0.950662448091754, + "step": 9615, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.950662448091754, + "step": 9615, + "train/total_loss": 0.10248134285211563 + }, + { + "entropy": 9.107805252075195, + "epoch": 0.9507613209412695, + "mean_token_accuracy": 0.7438271641731262, + "num_tokens": 29248443.0, + "step": 9616, + "train/ce_loss": 1.4567358493804932 + }, + { + "epoch": 0.9507613209412695, + "step": 9616, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.9507613209412695, + "step": 9616, + "train/total_loss": 0.21207983791828156 + }, + { + "entropy": 8.361902236938477, + "epoch": 0.950860193790785, + "mean_token_accuracy": 0.7422266602516174, + "num_tokens": 29254118.0, + "step": 9617, + "train/ce_loss": 0.8464975357055664 + }, + { + "epoch": 0.950860193790785, + "step": 9617, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.950860193790785, + "step": 9617, + "train/total_loss": 0.17449350655078888 + }, + { + "entropy": 8.67348861694336, + "epoch": 0.9509590666403006, + "mean_token_accuracy": 0.7758620977401733, + "num_tokens": 29259309.0, + "step": 9618, + "train/ce_loss": 0.9091631174087524 + }, + { + "epoch": 0.9509590666403006, + "step": 9618, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9509590666403006, + "step": 9618, + "train/total_loss": 0.13779130578041077 + }, + { + "entropy": 9.485261917114258, + "epoch": 0.9510579394898161, + "mean_token_accuracy": 0.7659090757369995, + "num_tokens": 29264202.0, + "step": 9619, + "train/ce_loss": 1.1151223588967696e-06 + }, + { + "epoch": 0.9510579394898161, + "step": 9619, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9510579394898161, + "step": 9619, + "train/total_loss": 0.031250111758708954 + }, + { + "epoch": 0.9511568123393316, + "grad_norm": 0.7135426998138428, + "learning_rate": 7.624239727043467e-06, + "loss": 0.1283, + "step": 9620 + }, + { + "entropy": 9.077592849731445, + "epoch": 0.9511568123393316, + "mean_token_accuracy": 0.7478849291801453, + "num_tokens": 29269192.0, + "step": 9620, + "train/ce_loss": 2.533438134832977e-07 + }, + { + "epoch": 0.9511568123393316, + "step": 9620, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.9511568123393316, + "step": 9620, + "train/total_loss": 0.011718775145709515 + }, + { + "entropy": 8.475610733032227, + "epoch": 0.9512556851888472, + "mean_token_accuracy": 0.7614907026290894, + "num_tokens": 29274501.0, + "step": 9621, + "train/ce_loss": 0.9760831594467163 + }, + { + "epoch": 0.9512556851888472, + "step": 9621, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.9512556851888472, + "step": 9621, + "train/total_loss": 0.16010832786560059 + }, + { + "entropy": 8.620203018188477, + "epoch": 0.9513545580383627, + "mean_token_accuracy": 0.6976484060287476, + "num_tokens": 29279850.0, + "step": 9622, + "train/ce_loss": 0.9753076434135437 + }, + { + "epoch": 0.9513545580383627, + "step": 9622, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.9513545580383627, + "step": 9622, + "train/total_loss": 0.1639370173215866 + }, + { + "entropy": 8.828330039978027, + "epoch": 0.9514534308878781, + "mean_token_accuracy": 0.6967320442199707, + "num_tokens": 29285023.0, + "step": 9623, + "train/ce_loss": 1.4751416444778442 + }, + { + "epoch": 0.9514534308878781, + "step": 9623, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.9514534308878781, + "step": 9623, + "train/total_loss": 0.21782666444778442 + }, + { + "entropy": 9.476432800292969, + "epoch": 0.9515523037373937, + "mean_token_accuracy": 0.6666666865348816, + "num_tokens": 29289840.0, + "step": 9624, + "train/ce_loss": 2.2870726585388184 + }, + { + "epoch": 0.9515523037373937, + "step": 9624, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9515523037373937, + "step": 9624, + "train/total_loss": 0.2794885039329529 + }, + { + "entropy": 8.957849502563477, + "epoch": 0.9516511765869092, + "mean_token_accuracy": 0.7439544796943665, + "num_tokens": 29294973.0, + "step": 9625, + "train/ce_loss": 0.8902990818023682 + }, + { + "epoch": 0.9516511765869092, + "step": 9625, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.9516511765869092, + "step": 9625, + "train/total_loss": 0.15152990818023682 + }, + { + "entropy": 8.880248069763184, + "epoch": 0.9517500494364247, + "mean_token_accuracy": 0.7473261952400208, + "num_tokens": 29300196.0, + "step": 9626, + "train/ce_loss": 0.47349148988723755 + }, + { + "epoch": 0.9517500494364247, + "step": 9626, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.9517500494364247, + "step": 9626, + "train/total_loss": 0.14109915494918823 + }, + { + "entropy": 8.577116966247559, + "epoch": 0.9518489222859403, + "mean_token_accuracy": 0.7583892345428467, + "num_tokens": 29305536.0, + "step": 9627, + "train/ce_loss": 0.6490175127983093 + }, + { + "epoch": 0.9518489222859403, + "step": 9627, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.9518489222859403, + "step": 9627, + "train/total_loss": 0.14693300426006317 + }, + { + "entropy": 9.41624927520752, + "epoch": 0.9519477951354558, + "mean_token_accuracy": 0.7373272180557251, + "num_tokens": 29310379.0, + "step": 9628, + "train/ce_loss": 1.0523601770401 + }, + { + "epoch": 0.9519477951354558, + "step": 9628, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9519477951354558, + "step": 9628, + "train/total_loss": 0.1560172736644745 + }, + { + "entropy": 8.359001159667969, + "epoch": 0.9520466679849713, + "mean_token_accuracy": 0.7457831501960754, + "num_tokens": 29315681.0, + "step": 9629, + "train/ce_loss": 1.1068475246429443 + }, + { + "epoch": 0.9520466679849713, + "step": 9629, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9520466679849713, + "step": 9629, + "train/total_loss": 0.12630975246429443 + }, + { + "entropy": 9.151994705200195, + "epoch": 0.9521455408344869, + "mean_token_accuracy": 0.8147059082984924, + "num_tokens": 29320810.0, + "step": 9630, + "train/ce_loss": 0.5198752284049988 + }, + { + "epoch": 0.9521455408344869, + "step": 9630, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9521455408344869, + "step": 9630, + "train/total_loss": 0.06761252880096436 + }, + { + "entropy": 9.170339584350586, + "epoch": 0.9522444136840024, + "mean_token_accuracy": 0.7709401845932007, + "num_tokens": 29325847.0, + "step": 9631, + "train/ce_loss": 0.9556100368499756 + }, + { + "epoch": 0.9522444136840024, + "step": 9631, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9522444136840024, + "step": 9631, + "train/total_loss": 0.11118600517511368 + }, + { + "entropy": 8.444564819335938, + "epoch": 0.9523432865335179, + "mean_token_accuracy": 0.7725714445114136, + "num_tokens": 29331221.0, + "step": 9632, + "train/ce_loss": 0.5062717795372009 + }, + { + "epoch": 0.9523432865335179, + "step": 9632, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9523432865335179, + "step": 9632, + "train/total_loss": 0.06625217944383621 + }, + { + "entropy": 9.152999877929688, + "epoch": 0.9524421593830334, + "mean_token_accuracy": 0.7718023061752319, + "num_tokens": 29336331.0, + "step": 9633, + "train/ce_loss": 3.02873019109029e-07 + }, + { + "epoch": 0.9524421593830334, + "step": 9633, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9524421593830334, + "step": 9633, + "train/total_loss": 0.015625029802322388 + }, + { + "entropy": 9.484378814697266, + "epoch": 0.9525410322325489, + "mean_token_accuracy": 0.7777777910232544, + "num_tokens": 29341224.0, + "step": 9634, + "train/ce_loss": 1.2847749530919828e-06 + }, + { + "epoch": 0.9525410322325489, + "step": 9634, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9525410322325489, + "step": 9634, + "train/total_loss": 0.03906262665987015 + }, + { + "entropy": 8.592009544372559, + "epoch": 0.9526399050820644, + "mean_token_accuracy": 0.7072879076004028, + "num_tokens": 29346541.0, + "step": 9635, + "train/ce_loss": 1.1064434051513672 + }, + { + "epoch": 0.9526399050820644, + "step": 9635, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9526399050820644, + "step": 9635, + "train/total_loss": 0.16533184051513672 + }, + { + "entropy": 7.667773246765137, + "epoch": 0.95273877793158, + "mean_token_accuracy": 0.6940418481826782, + "num_tokens": 29352255.0, + "step": 9636, + "train/ce_loss": 0.5391709804534912 + }, + { + "epoch": 0.95273877793158, + "step": 9636, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.95273877793158, + "step": 9636, + "train/total_loss": 0.08907334506511688 + }, + { + "entropy": 9.034952163696289, + "epoch": 0.9528376507810955, + "mean_token_accuracy": 0.7628294229507446, + "num_tokens": 29357433.0, + "step": 9637, + "train/ce_loss": 0.8349732756614685 + }, + { + "epoch": 0.9528376507810955, + "step": 9637, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.9528376507810955, + "step": 9637, + "train/total_loss": 0.1616223305463791 + }, + { + "entropy": 8.987411499023438, + "epoch": 0.952936523630611, + "mean_token_accuracy": 0.7757773995399475, + "num_tokens": 29362482.0, + "step": 9638, + "train/ce_loss": 1.2512165307998657 + }, + { + "epoch": 0.952936523630611, + "step": 9638, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.952936523630611, + "step": 9638, + "train/total_loss": 0.17590290307998657 + }, + { + "entropy": 8.629709243774414, + "epoch": 0.9530353964801266, + "mean_token_accuracy": 0.7574031949043274, + "num_tokens": 29367840.0, + "step": 9639, + "train/ce_loss": 0.7920249104499817 + }, + { + "epoch": 0.9530353964801266, + "step": 9639, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.9530353964801266, + "step": 9639, + "train/total_loss": 0.15732750296592712 + }, + { + "epoch": 0.9531342693296421, + "grad_norm": 0.6226643919944763, + "learning_rate": 7.6192948622855164e-06, + "loss": 0.1297, + "step": 9640 + }, + { + "entropy": 9.593673706054688, + "epoch": 0.9531342693296421, + "mean_token_accuracy": 0.7761557102203369, + "num_tokens": 29372689.0, + "step": 9640, + "train/ce_loss": 0.9557827115058899 + }, + { + "epoch": 0.9531342693296421, + "step": 9640, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9531342693296421, + "step": 9640, + "train/total_loss": 0.15026578307151794 + }, + { + "entropy": 8.888915061950684, + "epoch": 0.9532331421791576, + "mean_token_accuracy": 0.7330383658409119, + "num_tokens": 29377870.0, + "step": 9641, + "train/ce_loss": 1.5397324562072754 + }, + { + "epoch": 0.9532331421791576, + "step": 9641, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9532331421791576, + "step": 9641, + "train/total_loss": 0.18522325158119202 + }, + { + "entropy": 8.38487434387207, + "epoch": 0.9533320150286732, + "mean_token_accuracy": 0.7719486355781555, + "num_tokens": 29383274.0, + "step": 9642, + "train/ce_loss": 0.7030945420265198 + }, + { + "epoch": 0.9533320150286732, + "step": 9642, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9533320150286732, + "step": 9642, + "train/total_loss": 0.08593445271253586 + }, + { + "entropy": 8.955349922180176, + "epoch": 0.9534308878781886, + "mean_token_accuracy": 0.7538461685180664, + "num_tokens": 29388606.0, + "step": 9643, + "train/ce_loss": 4.794979986399994e-07 + }, + { + "epoch": 0.9534308878781886, + "step": 9643, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9534308878781886, + "step": 9643, + "train/total_loss": 0.05468754842877388 + }, + { + "entropy": 8.561123847961426, + "epoch": 0.9535297607277041, + "mean_token_accuracy": 0.8042105436325073, + "num_tokens": 29394009.0, + "step": 9644, + "train/ce_loss": 0.6880059838294983 + }, + { + "epoch": 0.9535297607277041, + "step": 9644, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9535297607277041, + "step": 9644, + "train/total_loss": 0.11176934838294983 + }, + { + "entropy": 8.632359504699707, + "epoch": 0.9536286335772197, + "mean_token_accuracy": 0.7747524976730347, + "num_tokens": 29399300.0, + "step": 9645, + "train/ce_loss": 0.5286980867385864 + }, + { + "epoch": 0.9536286335772197, + "step": 9645, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9536286335772197, + "step": 9645, + "train/total_loss": 0.08411981165409088 + }, + { + "entropy": 9.39550495147705, + "epoch": 0.9537275064267352, + "mean_token_accuracy": 0.760765552520752, + "num_tokens": 29404308.0, + "step": 9646, + "train/ce_loss": 0.5505694150924683 + }, + { + "epoch": 0.9537275064267352, + "step": 9646, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9537275064267352, + "step": 9646, + "train/total_loss": 0.07458819448947906 + }, + { + "entropy": 9.014603614807129, + "epoch": 0.9538263792762507, + "mean_token_accuracy": 0.7614285945892334, + "num_tokens": 29409535.0, + "step": 9647, + "train/ce_loss": 0.5262411832809448 + }, + { + "epoch": 0.9538263792762507, + "step": 9647, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9538263792762507, + "step": 9647, + "train/total_loss": 0.06824912130832672 + }, + { + "entropy": 9.35798168182373, + "epoch": 0.9539252521257663, + "mean_token_accuracy": 0.8034825921058655, + "num_tokens": 29414344.0, + "step": 9648, + "train/ce_loss": 1.2550104856491089 + }, + { + "epoch": 0.9539252521257663, + "step": 9648, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.9539252521257663, + "step": 9648, + "train/total_loss": 0.16065730154514313 + }, + { + "entropy": 8.385096549987793, + "epoch": 0.9540241249752818, + "mean_token_accuracy": 0.7011995911598206, + "num_tokens": 29419718.0, + "step": 9649, + "train/ce_loss": 0.684291422367096 + }, + { + "epoch": 0.9540241249752818, + "step": 9649, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9540241249752818, + "step": 9649, + "train/total_loss": 0.0996791422367096 + }, + { + "entropy": 9.243229866027832, + "epoch": 0.9541229978247973, + "mean_token_accuracy": 0.7060367465019226, + "num_tokens": 29424501.0, + "step": 9650, + "train/ce_loss": 4.0800068745738827e-07 + }, + { + "epoch": 0.9541229978247973, + "step": 9650, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9541229978247973, + "step": 9650, + "train/total_loss": 0.015625040978193283 + }, + { + "entropy": 8.5402193069458, + "epoch": 0.9542218706743129, + "mean_token_accuracy": 0.7251521348953247, + "num_tokens": 29429954.0, + "step": 9651, + "train/ce_loss": 0.5222027897834778 + }, + { + "epoch": 0.9542218706743129, + "step": 9651, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9542218706743129, + "step": 9651, + "train/total_loss": 0.07175153493881226 + }, + { + "entropy": 8.449823379516602, + "epoch": 0.9543207435238283, + "mean_token_accuracy": 0.7398906946182251, + "num_tokens": 29435328.0, + "step": 9652, + "train/ce_loss": 0.9733685255050659 + }, + { + "epoch": 0.9543207435238283, + "step": 9652, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9543207435238283, + "step": 9652, + "train/total_loss": 0.12468060106039047 + }, + { + "entropy": 8.732869148254395, + "epoch": 0.9544196163733438, + "mean_token_accuracy": 0.7914831042289734, + "num_tokens": 29440456.0, + "step": 9653, + "train/ce_loss": 0.6452661156654358 + }, + { + "epoch": 0.9544196163733438, + "step": 9653, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9544196163733438, + "step": 9653, + "train/total_loss": 0.10358911007642746 + }, + { + "entropy": 9.277456283569336, + "epoch": 0.9545184892228594, + "mean_token_accuracy": 0.8242678046226501, + "num_tokens": 29445359.0, + "step": 9654, + "train/ce_loss": 1.2522435188293457 + }, + { + "epoch": 0.9545184892228594, + "step": 9654, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.9545184892228594, + "step": 9654, + "train/total_loss": 0.13694310188293457 + }, + { + "entropy": 8.717188835144043, + "epoch": 0.9546173620723749, + "mean_token_accuracy": 0.7574257254600525, + "num_tokens": 29450639.0, + "step": 9655, + "train/ce_loss": 0.5885151624679565 + }, + { + "epoch": 0.9546173620723749, + "step": 9655, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.9546173620723749, + "step": 9655, + "train/total_loss": 0.09400776773691177 + }, + { + "entropy": 9.030670166015625, + "epoch": 0.9547162349218904, + "mean_token_accuracy": 0.7245762944221497, + "num_tokens": 29455741.0, + "step": 9656, + "train/ce_loss": 1.4345485510602884e-07 + }, + { + "epoch": 0.9547162349218904, + "step": 9656, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9547162349218904, + "step": 9656, + "train/total_loss": 0.015625014901161194 + }, + { + "entropy": 9.049999237060547, + "epoch": 0.954815107771406, + "mean_token_accuracy": 0.7521968483924866, + "num_tokens": 29460741.0, + "step": 9657, + "train/ce_loss": 0.8397353887557983 + }, + { + "epoch": 0.954815107771406, + "step": 9657, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.954815107771406, + "step": 9657, + "train/total_loss": 0.12694229185581207 + }, + { + "entropy": 8.608626365661621, + "epoch": 0.9549139806209215, + "mean_token_accuracy": 0.7580274939537048, + "num_tokens": 29466118.0, + "step": 9658, + "train/ce_loss": 0.45035797357559204 + }, + { + "epoch": 0.9549139806209215, + "step": 9658, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.9549139806209215, + "step": 9658, + "train/total_loss": 0.10362954437732697 + }, + { + "entropy": 9.093671798706055, + "epoch": 0.9550128534704371, + "mean_token_accuracy": 0.7730496525764465, + "num_tokens": 29471203.0, + "step": 9659, + "train/ce_loss": 0.7704161405563354 + }, + { + "epoch": 0.9550128534704371, + "step": 9659, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.9550128534704371, + "step": 9659, + "train/total_loss": 0.1512603759765625 + }, + { + "epoch": 0.9551117263199526, + "grad_norm": 0.7387588620185852, + "learning_rate": 7.614349997527568e-06, + "loss": 0.1241, + "step": 9660 + }, + { + "entropy": 8.689239501953125, + "epoch": 0.9551117263199526, + "mean_token_accuracy": 0.7180365324020386, + "num_tokens": 29476548.0, + "step": 9660, + "train/ce_loss": 0.4345012605190277 + }, + { + "epoch": 0.9551117263199526, + "step": 9660, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9551117263199526, + "step": 9660, + "train/total_loss": 0.09423138201236725 + }, + { + "entropy": 8.494304656982422, + "epoch": 0.955210599169468, + "mean_token_accuracy": 0.7832568883895874, + "num_tokens": 29481913.0, + "step": 9661, + "train/ce_loss": 0.9814844131469727 + }, + { + "epoch": 0.955210599169468, + "step": 9661, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.955210599169468, + "step": 9661, + "train/total_loss": 0.1254921853542328 + }, + { + "entropy": 8.977173805236816, + "epoch": 0.9553094720189836, + "mean_token_accuracy": 0.755464494228363, + "num_tokens": 29487064.0, + "step": 9662, + "train/ce_loss": 0.5466949939727783 + }, + { + "epoch": 0.9553094720189836, + "step": 9662, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9553094720189836, + "step": 9662, + "train/total_loss": 0.07420074939727783 + }, + { + "entropy": 8.666604995727539, + "epoch": 0.9554083448684991, + "mean_token_accuracy": 0.7538644671440125, + "num_tokens": 29492379.0, + "step": 9663, + "train/ce_loss": 1.08368718624115 + }, + { + "epoch": 0.9554083448684991, + "step": 9663, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9554083448684991, + "step": 9663, + "train/total_loss": 0.16305622458457947 + }, + { + "entropy": 8.567361831665039, + "epoch": 0.9555072177180146, + "mean_token_accuracy": 0.747474730014801, + "num_tokens": 29497766.0, + "step": 9664, + "train/ce_loss": 0.9273149967193604 + }, + { + "epoch": 0.9555072177180146, + "step": 9664, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.9555072177180146, + "step": 9664, + "train/total_loss": 0.2294502556324005 + }, + { + "entropy": 9.599896430969238, + "epoch": 0.9556060905675302, + "mean_token_accuracy": 0.7493734359741211, + "num_tokens": 29502580.0, + "step": 9665, + "train/ce_loss": 2.07197642326355 + }, + { + "epoch": 0.9556060905675302, + "step": 9665, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.9556060905675302, + "step": 9665, + "train/total_loss": 0.2775101661682129 + }, + { + "entropy": 8.539717674255371, + "epoch": 0.9557049634170457, + "mean_token_accuracy": 0.7021898031234741, + "num_tokens": 29507763.0, + "step": 9666, + "train/ce_loss": 0.8698290586471558 + }, + { + "epoch": 0.9557049634170457, + "step": 9666, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9557049634170457, + "step": 9666, + "train/total_loss": 0.13385790586471558 + }, + { + "entropy": 8.689737319946289, + "epoch": 0.9558038362665612, + "mean_token_accuracy": 0.7953714728355408, + "num_tokens": 29513050.0, + "step": 9667, + "train/ce_loss": 0.3238702714443207 + }, + { + "epoch": 0.9558038362665612, + "step": 9667, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9558038362665612, + "step": 9667, + "train/total_loss": 0.04801202937960625 + }, + { + "entropy": 8.805400848388672, + "epoch": 0.9559027091160768, + "mean_token_accuracy": 0.7234352231025696, + "num_tokens": 29518208.0, + "step": 9668, + "train/ce_loss": 1.4133466482162476 + }, + { + "epoch": 0.9559027091160768, + "step": 9668, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9559027091160768, + "step": 9668, + "train/total_loss": 0.180397167801857 + }, + { + "entropy": 8.859384536743164, + "epoch": 0.9560015819655923, + "mean_token_accuracy": 0.6579973697662354, + "num_tokens": 29523450.0, + "step": 9669, + "train/ce_loss": 1.8035385608673096 + }, + { + "epoch": 0.9560015819655923, + "step": 9669, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.9560015819655923, + "step": 9669, + "train/total_loss": 0.23894761502742767 + }, + { + "entropy": 8.606244087219238, + "epoch": 0.9561004548151077, + "mean_token_accuracy": 0.7643097639083862, + "num_tokens": 29528827.0, + "step": 9670, + "train/ce_loss": 0.9713116884231567 + }, + { + "epoch": 0.9561004548151077, + "step": 9670, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.9561004548151077, + "step": 9670, + "train/total_loss": 0.2065061628818512 + }, + { + "entropy": 8.814547538757324, + "epoch": 0.9561993276646233, + "mean_token_accuracy": 0.7670885920524597, + "num_tokens": 29534110.0, + "step": 9671, + "train/ce_loss": 0.8379871845245361 + }, + { + "epoch": 0.9561993276646233, + "step": 9671, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.9561993276646233, + "step": 9671, + "train/total_loss": 0.14629872143268585 + }, + { + "entropy": 8.33130931854248, + "epoch": 0.9562982005141388, + "mean_token_accuracy": 0.6969397664070129, + "num_tokens": 29539627.0, + "step": 9672, + "train/ce_loss": 0.6200782656669617 + }, + { + "epoch": 0.9562982005141388, + "step": 9672, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9562982005141388, + "step": 9672, + "train/total_loss": 0.1049765795469284 + }, + { + "entropy": 8.958049774169922, + "epoch": 0.9563970733636543, + "mean_token_accuracy": 0.7506925463676453, + "num_tokens": 29544822.0, + "step": 9673, + "train/ce_loss": 0.6942681670188904 + }, + { + "epoch": 0.9563970733636543, + "step": 9673, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.9563970733636543, + "step": 9673, + "train/total_loss": 0.13583306968212128 + }, + { + "entropy": 8.575597763061523, + "epoch": 0.9564959462131699, + "mean_token_accuracy": 0.695364236831665, + "num_tokens": 29550184.0, + "step": 9674, + "train/ce_loss": 0.8115186095237732 + }, + { + "epoch": 0.9564959462131699, + "step": 9674, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.9564959462131699, + "step": 9674, + "train/total_loss": 0.14365187287330627 + }, + { + "entropy": 8.64834213256836, + "epoch": 0.9565948190626854, + "mean_token_accuracy": 0.7287761569023132, + "num_tokens": 29555594.0, + "step": 9675, + "train/ce_loss": 0.45716869831085205 + }, + { + "epoch": 0.9565948190626854, + "step": 9675, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.9565948190626854, + "step": 9675, + "train/total_loss": 0.13946686685085297 + }, + { + "entropy": 8.666889190673828, + "epoch": 0.9566936919122009, + "mean_token_accuracy": 0.7227272987365723, + "num_tokens": 29560910.0, + "step": 9676, + "train/ce_loss": 0.3776177763938904 + }, + { + "epoch": 0.9566936919122009, + "step": 9676, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.9566936919122009, + "step": 9676, + "train/total_loss": 0.09635552763938904 + }, + { + "entropy": 8.791280746459961, + "epoch": 0.9567925647617165, + "mean_token_accuracy": 0.7641395926475525, + "num_tokens": 29566179.0, + "step": 9677, + "train/ce_loss": 0.40096956491470337 + }, + { + "epoch": 0.9567925647617165, + "step": 9677, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9567925647617165, + "step": 9677, + "train/total_loss": 0.0635344535112381 + }, + { + "entropy": 9.04377269744873, + "epoch": 0.956891437611232, + "mean_token_accuracy": 0.7180555462837219, + "num_tokens": 29571332.0, + "step": 9678, + "train/ce_loss": 1.2874047756195068 + }, + { + "epoch": 0.956891437611232, + "step": 9678, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.956891437611232, + "step": 9678, + "train/total_loss": 0.15999047458171844 + }, + { + "entropy": 9.064447402954102, + "epoch": 0.9569903104607475, + "mean_token_accuracy": 0.7198142409324646, + "num_tokens": 29576401.0, + "step": 9679, + "train/ce_loss": 1.0077346563339233 + }, + { + "epoch": 0.9569903104607475, + "step": 9679, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9569903104607475, + "step": 9679, + "train/total_loss": 0.12421096861362457 + }, + { + "epoch": 0.957089183310263, + "grad_norm": 0.8103324770927429, + "learning_rate": 7.609405132769619e-06, + "loss": 0.1383, + "step": 9680 + }, + { + "entropy": 8.435659408569336, + "epoch": 0.957089183310263, + "mean_token_accuracy": 0.7245119214057922, + "num_tokens": 29581723.0, + "step": 9680, + "train/ce_loss": 0.8044260740280151 + }, + { + "epoch": 0.957089183310263, + "step": 9680, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.957089183310263, + "step": 9680, + "train/total_loss": 0.1312238574028015 + }, + { + "entropy": 9.460543632507324, + "epoch": 0.9571880561597785, + "mean_token_accuracy": 0.7295454740524292, + "num_tokens": 29586561.0, + "step": 9681, + "train/ce_loss": 1.2621893882751465 + }, + { + "epoch": 0.9571880561597785, + "step": 9681, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9571880561597785, + "step": 9681, + "train/total_loss": 0.16528144478797913 + }, + { + "entropy": 8.640851974487305, + "epoch": 0.957286929009294, + "mean_token_accuracy": 0.7362045645713806, + "num_tokens": 29591791.0, + "step": 9682, + "train/ce_loss": 0.948087751865387 + }, + { + "epoch": 0.957286929009294, + "step": 9682, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.957286929009294, + "step": 9682, + "train/total_loss": 0.17293378710746765 + }, + { + "entropy": 9.918706893920898, + "epoch": 0.9573858018588096, + "mean_token_accuracy": 0.7696078419685364, + "num_tokens": 29596377.0, + "step": 9683, + "train/ce_loss": 2.9881794452667236 + }, + { + "epoch": 0.9573858018588096, + "step": 9683, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.9573858018588096, + "step": 9683, + "train/total_loss": 0.3886617124080658 + }, + { + "entropy": 8.680938720703125, + "epoch": 0.9574846747083251, + "mean_token_accuracy": 0.7422279715538025, + "num_tokens": 29601598.0, + "step": 9684, + "train/ce_loss": 0.5348566174507141 + }, + { + "epoch": 0.9574846747083251, + "step": 9684, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9574846747083251, + "step": 9684, + "train/total_loss": 0.10036066174507141 + }, + { + "entropy": 8.284825325012207, + "epoch": 0.9575835475578406, + "mean_token_accuracy": 0.7689906358718872, + "num_tokens": 29607074.0, + "step": 9685, + "train/ce_loss": 0.5938931703567505 + }, + { + "epoch": 0.9575835475578406, + "step": 9685, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9575835475578406, + "step": 9685, + "train/total_loss": 0.10235807299613953 + }, + { + "entropy": 8.49846363067627, + "epoch": 0.9576824204073562, + "mean_token_accuracy": 0.7897196412086487, + "num_tokens": 29612427.0, + "step": 9686, + "train/ce_loss": 0.4811134338378906 + }, + { + "epoch": 0.9576824204073562, + "step": 9686, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9576824204073562, + "step": 9686, + "train/total_loss": 0.07154884934425354 + }, + { + "entropy": 8.664314270019531, + "epoch": 0.9577812932568717, + "mean_token_accuracy": 0.720812201499939, + "num_tokens": 29617668.0, + "step": 9687, + "train/ce_loss": 0.7171815037727356 + }, + { + "epoch": 0.9577812932568717, + "step": 9687, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9577812932568717, + "step": 9687, + "train/total_loss": 0.11859314888715744 + }, + { + "entropy": 9.304027557373047, + "epoch": 0.9578801661063872, + "mean_token_accuracy": 0.7219047546386719, + "num_tokens": 29622602.0, + "step": 9688, + "train/ce_loss": 3.151641010390449e-07 + }, + { + "epoch": 0.9578801661063872, + "step": 9688, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9578801661063872, + "step": 9688, + "train/total_loss": 0.019531281664967537 + }, + { + "entropy": 9.046564102172852, + "epoch": 0.9579790389559028, + "mean_token_accuracy": 0.6889952421188354, + "num_tokens": 29627665.0, + "step": 9689, + "train/ce_loss": 1.4818629026412964 + }, + { + "epoch": 0.9579790389559028, + "step": 9689, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.9579790389559028, + "step": 9689, + "train/total_loss": 0.2575612962245941 + }, + { + "entropy": 8.488359451293945, + "epoch": 0.9580779118054182, + "mean_token_accuracy": 0.7052153944969177, + "num_tokens": 29633056.0, + "step": 9690, + "train/ce_loss": 0.4829596281051636 + }, + { + "epoch": 0.9580779118054182, + "step": 9690, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9580779118054182, + "step": 9690, + "train/total_loss": 0.07954595983028412 + }, + { + "entropy": 8.586043357849121, + "epoch": 0.9581767846549337, + "mean_token_accuracy": 0.7146092653274536, + "num_tokens": 29638386.0, + "step": 9691, + "train/ce_loss": 1.2549149990081787 + }, + { + "epoch": 0.9581767846549337, + "step": 9691, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9581767846549337, + "step": 9691, + "train/total_loss": 0.16455399990081787 + }, + { + "entropy": 8.421553611755371, + "epoch": 0.9582756575044493, + "mean_token_accuracy": 0.7447306513786316, + "num_tokens": 29643718.0, + "step": 9692, + "train/ce_loss": 0.5929149985313416 + }, + { + "epoch": 0.9582756575044493, + "step": 9692, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.9582756575044493, + "step": 9692, + "train/total_loss": 0.14522899687290192 + }, + { + "entropy": 8.817549705505371, + "epoch": 0.9583745303539648, + "mean_token_accuracy": 0.6828644275665283, + "num_tokens": 29648974.0, + "step": 9693, + "train/ce_loss": 1.9984155893325806 + }, + { + "epoch": 0.9583745303539648, + "step": 9693, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9583745303539648, + "step": 9693, + "train/total_loss": 0.24671655893325806 + }, + { + "entropy": 9.072053909301758, + "epoch": 0.9584734032034803, + "mean_token_accuracy": 0.7591911554336548, + "num_tokens": 29653978.0, + "step": 9694, + "train/ce_loss": 8.633719517092686e-07 + }, + { + "epoch": 0.9584734032034803, + "step": 9694, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.9584734032034803, + "step": 9694, + "train/total_loss": 0.058593835681676865 + }, + { + "entropy": 9.154497146606445, + "epoch": 0.9585722760529959, + "mean_token_accuracy": 0.7762646079063416, + "num_tokens": 29658966.0, + "step": 9695, + "train/ce_loss": 0.8793838620185852 + }, + { + "epoch": 0.9585722760529959, + "step": 9695, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9585722760529959, + "step": 9695, + "train/total_loss": 0.11918839067220688 + }, + { + "entropy": 8.997401237487793, + "epoch": 0.9586711489025114, + "mean_token_accuracy": 0.7197368144989014, + "num_tokens": 29664156.0, + "step": 9696, + "train/ce_loss": 0.655886173248291 + }, + { + "epoch": 0.9586711489025114, + "step": 9696, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9586711489025114, + "step": 9696, + "train/total_loss": 0.11636986583471298 + }, + { + "entropy": 8.906684875488281, + "epoch": 0.9587700217520269, + "mean_token_accuracy": 0.6468305587768555, + "num_tokens": 29669328.0, + "step": 9697, + "train/ce_loss": 0.8485145568847656 + }, + { + "epoch": 0.9587700217520269, + "step": 9697, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.9587700217520269, + "step": 9697, + "train/total_loss": 0.1668827086687088 + }, + { + "entropy": 8.718497276306152, + "epoch": 0.9588688946015425, + "mean_token_accuracy": 0.751937985420227, + "num_tokens": 29674657.0, + "step": 9698, + "train/ce_loss": 0.854433000087738 + }, + { + "epoch": 0.9588688946015425, + "step": 9698, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9588688946015425, + "step": 9698, + "train/total_loss": 0.11278705298900604 + }, + { + "entropy": 8.88694953918457, + "epoch": 0.9589677674510579, + "mean_token_accuracy": 0.6712749600410461, + "num_tokens": 29679726.0, + "step": 9699, + "train/ce_loss": 5.976818897579506e-07 + }, + { + "epoch": 0.9589677674510579, + "step": 9699, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9589677674510579, + "step": 9699, + "train/total_loss": 0.023437559604644775 + }, + { + "epoch": 0.9590666403005734, + "grad_norm": 0.7731952667236328, + "learning_rate": 7.6044602680116705e-06, + "loss": 0.1421, + "step": 9700 + }, + { + "entropy": 8.920503616333008, + "epoch": 0.9590666403005734, + "mean_token_accuracy": 0.7343957424163818, + "num_tokens": 29684953.0, + "step": 9700, + "train/ce_loss": 3.4321760722377803e-07 + }, + { + "epoch": 0.9590666403005734, + "step": 9700, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.9590666403005734, + "step": 9700, + "train/total_loss": 0.035156283527612686 + }, + { + "entropy": 9.540019989013672, + "epoch": 0.959165513150089, + "mean_token_accuracy": 0.7209302186965942, + "num_tokens": 29689795.0, + "step": 9701, + "train/ce_loss": 7.614484047735459e-07 + }, + { + "epoch": 0.959165513150089, + "step": 9701, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.959165513150089, + "step": 9701, + "train/total_loss": 0.03125007450580597 + }, + { + "entropy": 8.575620651245117, + "epoch": 0.9592643859996045, + "mean_token_accuracy": 0.7465618848800659, + "num_tokens": 29695306.0, + "step": 9702, + "train/ce_loss": 0.6707134246826172 + }, + { + "epoch": 0.9592643859996045, + "step": 9702, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.9592643859996045, + "step": 9702, + "train/total_loss": 0.1256650984287262 + }, + { + "entropy": 9.787461280822754, + "epoch": 0.95936325884912, + "mean_token_accuracy": 0.7609755992889404, + "num_tokens": 29699883.0, + "step": 9703, + "train/ce_loss": 9.148747267317958e-06 + }, + { + "epoch": 0.95936325884912, + "step": 9703, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.95936325884912, + "step": 9703, + "train/total_loss": 0.03515716642141342 + }, + { + "entropy": 8.471220016479492, + "epoch": 0.9594621316986356, + "mean_token_accuracy": 0.7642998099327087, + "num_tokens": 29705421.0, + "step": 9704, + "train/ce_loss": 0.848787248134613 + }, + { + "epoch": 0.9594621316986356, + "step": 9704, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9594621316986356, + "step": 9704, + "train/total_loss": 0.12394122779369354 + }, + { + "entropy": 9.439668655395508, + "epoch": 0.9595610045481511, + "mean_token_accuracy": 0.7807376980781555, + "num_tokens": 29710337.0, + "step": 9705, + "train/ce_loss": 0.8956339359283447 + }, + { + "epoch": 0.9595610045481511, + "step": 9705, + "train/sim_loss": 0.0859375 + }, + { + "epoch": 0.9595610045481511, + "step": 9705, + "train/total_loss": 0.17550089955329895 + }, + { + "entropy": 8.944560050964355, + "epoch": 0.9596598773976666, + "mean_token_accuracy": 0.7627737522125244, + "num_tokens": 29715623.0, + "step": 9706, + "train/ce_loss": 1.211319923400879 + }, + { + "epoch": 0.9596598773976666, + "step": 9706, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9596598773976666, + "step": 9706, + "train/total_loss": 0.1758194863796234 + }, + { + "entropy": 8.496309280395508, + "epoch": 0.9597587502471822, + "mean_token_accuracy": 0.6829004287719727, + "num_tokens": 29721051.0, + "step": 9707, + "train/ce_loss": 1.096146583557129 + }, + { + "epoch": 0.9597587502471822, + "step": 9707, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.9597587502471822, + "step": 9707, + "train/total_loss": 0.21898967027664185 + }, + { + "entropy": 8.67544937133789, + "epoch": 0.9598576230966976, + "mean_token_accuracy": 0.730140209197998, + "num_tokens": 29726388.0, + "step": 9708, + "train/ce_loss": 0.6697824597358704 + }, + { + "epoch": 0.9598576230966976, + "step": 9708, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.9598576230966976, + "step": 9708, + "train/total_loss": 0.10213449597358704 + }, + { + "entropy": 8.619380950927734, + "epoch": 0.9599564959462131, + "mean_token_accuracy": 0.689538836479187, + "num_tokens": 29731719.0, + "step": 9709, + "train/ce_loss": 0.7001871466636658 + }, + { + "epoch": 0.9599564959462131, + "step": 9709, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9599564959462131, + "step": 9709, + "train/total_loss": 0.1090812161564827 + }, + { + "entropy": 9.104570388793945, + "epoch": 0.9600553687957287, + "mean_token_accuracy": 0.7729941010475159, + "num_tokens": 29736686.0, + "step": 9710, + "train/ce_loss": 1.1940386295318604 + }, + { + "epoch": 0.9600553687957287, + "step": 9710, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.9600553687957287, + "step": 9710, + "train/total_loss": 0.1779976189136505 + }, + { + "entropy": 8.630685806274414, + "epoch": 0.9601542416452442, + "mean_token_accuracy": 0.7542168498039246, + "num_tokens": 29741962.0, + "step": 9711, + "train/ce_loss": 0.6684284210205078 + }, + { + "epoch": 0.9601542416452442, + "step": 9711, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9601542416452442, + "step": 9711, + "train/total_loss": 0.11371784657239914 + }, + { + "entropy": 9.062545776367188, + "epoch": 0.9602531144947597, + "mean_token_accuracy": 0.720447301864624, + "num_tokens": 29747047.0, + "step": 9712, + "train/ce_loss": 1.7544846534729004 + }, + { + "epoch": 0.9602531144947597, + "step": 9712, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.9602531144947597, + "step": 9712, + "train/total_loss": 0.253573477268219 + }, + { + "entropy": 10.125118255615234, + "epoch": 0.9603519873442753, + "mean_token_accuracy": 0.6926069855690002, + "num_tokens": 29751652.0, + "step": 9713, + "train/ce_loss": 6.363983402479789e-07 + }, + { + "epoch": 0.9603519873442753, + "step": 9713, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9603519873442753, + "step": 9713, + "train/total_loss": 0.019531313329935074 + }, + { + "entropy": 8.604927062988281, + "epoch": 0.9604508601937908, + "mean_token_accuracy": 0.7398785352706909, + "num_tokens": 29757145.0, + "step": 9714, + "train/ce_loss": 0.7927497625350952 + }, + { + "epoch": 0.9604508601937908, + "step": 9714, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.9604508601937908, + "step": 9714, + "train/total_loss": 0.169118732213974 + }, + { + "entropy": 8.794660568237305, + "epoch": 0.9605497330433063, + "mean_token_accuracy": 0.796316385269165, + "num_tokens": 29762488.0, + "step": 9715, + "train/ce_loss": 0.6840125918388367 + }, + { + "epoch": 0.9605497330433063, + "step": 9715, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.9605497330433063, + "step": 9715, + "train/total_loss": 0.1309012621641159 + }, + { + "entropy": 8.498865127563477, + "epoch": 0.9606486058928219, + "mean_token_accuracy": 0.7266536951065063, + "num_tokens": 29768016.0, + "step": 9716, + "train/ce_loss": 1.4905140399932861 + }, + { + "epoch": 0.9606486058928219, + "step": 9716, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.9606486058928219, + "step": 9716, + "train/total_loss": 0.2740514278411865 + }, + { + "entropy": 8.539307594299316, + "epoch": 0.9607474787423373, + "mean_token_accuracy": 0.6680244207382202, + "num_tokens": 29773440.0, + "step": 9717, + "train/ce_loss": 0.664154589176178 + }, + { + "epoch": 0.9607474787423373, + "step": 9717, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9607474787423373, + "step": 9717, + "train/total_loss": 0.0820404589176178 + }, + { + "entropy": 9.315811157226562, + "epoch": 0.9608463515918528, + "mean_token_accuracy": 0.6927710771560669, + "num_tokens": 29778373.0, + "step": 9718, + "train/ce_loss": 0.6773354411125183 + }, + { + "epoch": 0.9608463515918528, + "step": 9718, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.9608463515918528, + "step": 9718, + "train/total_loss": 0.1497648060321808 + }, + { + "entropy": 8.938533782958984, + "epoch": 0.9609452244413684, + "mean_token_accuracy": 0.6992481350898743, + "num_tokens": 29783528.0, + "step": 9719, + "train/ce_loss": 1.8118160963058472 + }, + { + "epoch": 0.9609452244413684, + "step": 9719, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.9609452244413684, + "step": 9719, + "train/total_loss": 0.2710253596305847 + }, + { + "epoch": 0.9610440972908839, + "grad_norm": 0.7465754151344299, + "learning_rate": 7.599515403253722e-06, + "loss": 0.1395, + "step": 9720 + }, + { + "entropy": 8.669353485107422, + "epoch": 0.9610440972908839, + "mean_token_accuracy": 0.7660500407218933, + "num_tokens": 29788915.0, + "step": 9720, + "train/ce_loss": 0.6009553074836731 + }, + { + "epoch": 0.9610440972908839, + "step": 9720, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9610440972908839, + "step": 9720, + "train/total_loss": 0.07962678372859955 + }, + { + "entropy": 8.840965270996094, + "epoch": 0.9611429701403994, + "mean_token_accuracy": 0.7968936562538147, + "num_tokens": 29794204.0, + "step": 9721, + "train/ce_loss": 0.5751218795776367 + }, + { + "epoch": 0.9611429701403994, + "step": 9721, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9611429701403994, + "step": 9721, + "train/total_loss": 0.07313719391822815 + }, + { + "entropy": 8.781105041503906, + "epoch": 0.961241842989915, + "mean_token_accuracy": 0.8096385598182678, + "num_tokens": 29799491.0, + "step": 9722, + "train/ce_loss": 0.7813106775283813 + }, + { + "epoch": 0.961241842989915, + "step": 9722, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.961241842989915, + "step": 9722, + "train/total_loss": 0.10547482222318649 + }, + { + "entropy": 8.61587905883789, + "epoch": 0.9613407158394305, + "mean_token_accuracy": 0.7680995464324951, + "num_tokens": 29805027.0, + "step": 9723, + "train/ce_loss": 0.7303171157836914 + }, + { + "epoch": 0.9613407158394305, + "step": 9723, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9613407158394305, + "step": 9723, + "train/total_loss": 0.1120942160487175 + }, + { + "entropy": 8.903885841369629, + "epoch": 0.961439588688946, + "mean_token_accuracy": 0.8101736903190613, + "num_tokens": 29810293.0, + "step": 9724, + "train/ce_loss": 0.5742484927177429 + }, + { + "epoch": 0.961439588688946, + "step": 9724, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.961439588688946, + "step": 9724, + "train/total_loss": 0.08086235076189041 + }, + { + "entropy": 9.229190826416016, + "epoch": 0.9615384615384616, + "mean_token_accuracy": 0.7374179363250732, + "num_tokens": 29815150.0, + "step": 9725, + "train/ce_loss": 1.182202935218811 + }, + { + "epoch": 0.9615384615384616, + "step": 9725, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.9615384615384616, + "step": 9725, + "train/total_loss": 0.12993904948234558 + }, + { + "entropy": 9.013776779174805, + "epoch": 0.961637334387977, + "mean_token_accuracy": 0.7624309659004211, + "num_tokens": 29820350.0, + "step": 9726, + "train/ce_loss": 1.519352912902832 + }, + { + "epoch": 0.961637334387977, + "step": 9726, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.961637334387977, + "step": 9726, + "train/total_loss": 0.22615404427051544 + }, + { + "entropy": 9.120112419128418, + "epoch": 0.9617362072374925, + "mean_token_accuracy": 0.7324414849281311, + "num_tokens": 29825351.0, + "step": 9727, + "train/ce_loss": 1.4603698253631592 + }, + { + "epoch": 0.9617362072374925, + "step": 9727, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9617362072374925, + "step": 9727, + "train/total_loss": 0.20072448253631592 + }, + { + "entropy": 9.247553825378418, + "epoch": 0.9618350800870081, + "mean_token_accuracy": 0.7626526951789856, + "num_tokens": 29830317.0, + "step": 9728, + "train/ce_loss": 0.6655154228210449 + }, + { + "epoch": 0.9618350800870081, + "step": 9728, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9618350800870081, + "step": 9728, + "train/total_loss": 0.11342654377222061 + }, + { + "entropy": 8.839900970458984, + "epoch": 0.9619339529365236, + "mean_token_accuracy": 0.7316129207611084, + "num_tokens": 29835544.0, + "step": 9729, + "train/ce_loss": 1.413920283317566 + }, + { + "epoch": 0.9619339529365236, + "step": 9729, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9619339529365236, + "step": 9729, + "train/total_loss": 0.1960795372724533 + }, + { + "entropy": 9.041462898254395, + "epoch": 0.9620328257860391, + "mean_token_accuracy": 0.7145161032676697, + "num_tokens": 29840625.0, + "step": 9730, + "train/ce_loss": 0.8221074342727661 + }, + { + "epoch": 0.9620328257860391, + "step": 9730, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.9620328257860391, + "step": 9730, + "train/total_loss": 0.11736699193716049 + }, + { + "entropy": 8.483409881591797, + "epoch": 0.9621316986355547, + "mean_token_accuracy": 0.7784730792045593, + "num_tokens": 29845908.0, + "step": 9731, + "train/ce_loss": 1.1115496158599854 + }, + { + "epoch": 0.9621316986355547, + "step": 9731, + "train/sim_loss": 0.1328125 + }, + { + "epoch": 0.9621316986355547, + "step": 9731, + "train/total_loss": 0.2439674735069275 + }, + { + "entropy": 8.228540420532227, + "epoch": 0.9622305714850702, + "mean_token_accuracy": 0.7142857313156128, + "num_tokens": 29851287.0, + "step": 9732, + "train/ce_loss": 0.6496933698654175 + }, + { + "epoch": 0.9622305714850702, + "step": 9732, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9622305714850702, + "step": 9732, + "train/total_loss": 0.11965683847665787 + }, + { + "entropy": 8.675930976867676, + "epoch": 0.9623294443345857, + "mean_token_accuracy": 0.8305687308311462, + "num_tokens": 29856605.0, + "step": 9733, + "train/ce_loss": 5.587366445070074e-07 + }, + { + "epoch": 0.9623294443345857, + "step": 9733, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9623294443345857, + "step": 9733, + "train/total_loss": 0.027343805879354477 + }, + { + "entropy": 9.15136432647705, + "epoch": 0.9624283171841013, + "mean_token_accuracy": 0.676800012588501, + "num_tokens": 29861676.0, + "step": 9734, + "train/ce_loss": 0.9130910038948059 + }, + { + "epoch": 0.9624283171841013, + "step": 9734, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.9624283171841013, + "step": 9734, + "train/total_loss": 0.1538091003894806 + }, + { + "entropy": 8.893739700317383, + "epoch": 0.9625271900336168, + "mean_token_accuracy": 0.7581274509429932, + "num_tokens": 29866891.0, + "step": 9735, + "train/ce_loss": 0.6514489650726318 + }, + { + "epoch": 0.9625271900336168, + "step": 9735, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9625271900336168, + "step": 9735, + "train/total_loss": 0.08076989650726318 + }, + { + "entropy": 8.613104820251465, + "epoch": 0.9626260628831322, + "mean_token_accuracy": 0.8061728477478027, + "num_tokens": 29872167.0, + "step": 9736, + "train/ce_loss": 1.2039073705673218 + }, + { + "epoch": 0.9626260628831322, + "step": 9736, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9626260628831322, + "step": 9736, + "train/total_loss": 0.14382824301719666 + }, + { + "entropy": 8.599849700927734, + "epoch": 0.9627249357326478, + "mean_token_accuracy": 0.7332563400268555, + "num_tokens": 29877656.0, + "step": 9737, + "train/ce_loss": 0.8840844631195068 + }, + { + "epoch": 0.9627249357326478, + "step": 9737, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.9627249357326478, + "step": 9737, + "train/total_loss": 0.1626271903514862 + }, + { + "entropy": 8.765642166137695, + "epoch": 0.9628238085821633, + "mean_token_accuracy": 0.7921419739723206, + "num_tokens": 29882890.0, + "step": 9738, + "train/ce_loss": 2.2119331788417185e-07 + }, + { + "epoch": 0.9628238085821633, + "step": 9738, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9628238085821633, + "step": 9738, + "train/total_loss": 0.01562502235174179 + }, + { + "entropy": 8.54442310333252, + "epoch": 0.9629226814316788, + "mean_token_accuracy": 0.7326139211654663, + "num_tokens": 29888132.0, + "step": 9739, + "train/ce_loss": 1.1796483993530273 + }, + { + "epoch": 0.9629226814316788, + "step": 9739, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9629226814316788, + "step": 9739, + "train/total_loss": 0.16874608397483826 + }, + { + "epoch": 0.9630215542811944, + "grad_norm": 0.616840124130249, + "learning_rate": 7.594570538495772e-06, + "loss": 0.1206, + "step": 9740 + }, + { + "entropy": 8.68200969696045, + "epoch": 0.9630215542811944, + "mean_token_accuracy": 0.7403846383094788, + "num_tokens": 29893468.0, + "step": 9740, + "train/ce_loss": 1.115727424621582 + }, + { + "epoch": 0.9630215542811944, + "step": 9740, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.9630215542811944, + "step": 9740, + "train/total_loss": 0.1896977424621582 + }, + { + "entropy": 9.168376922607422, + "epoch": 0.9631204271307099, + "mean_token_accuracy": 0.7841140627861023, + "num_tokens": 29898362.0, + "step": 9741, + "train/ce_loss": 1.3955055475234985 + }, + { + "epoch": 0.9631204271307099, + "step": 9741, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9631204271307099, + "step": 9741, + "train/total_loss": 0.18642555177211761 + }, + { + "entropy": 8.921894073486328, + "epoch": 0.9632192999802255, + "mean_token_accuracy": 0.723809540271759, + "num_tokens": 29903562.0, + "step": 9742, + "train/ce_loss": 0.8586784601211548 + }, + { + "epoch": 0.9632192999802255, + "step": 9742, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9632192999802255, + "step": 9742, + "train/total_loss": 0.10930534452199936 + }, + { + "entropy": 8.499792098999023, + "epoch": 0.963318172829741, + "mean_token_accuracy": 0.7182254195213318, + "num_tokens": 29908899.0, + "step": 9743, + "train/ce_loss": 0.44669750332832336 + }, + { + "epoch": 0.963318172829741, + "step": 9743, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.963318172829741, + "step": 9743, + "train/total_loss": 0.1110759973526001 + }, + { + "entropy": 9.490238189697266, + "epoch": 0.9634170456792565, + "mean_token_accuracy": 0.7023121118545532, + "num_tokens": 29913656.0, + "step": 9744, + "train/ce_loss": 1.694462537765503 + }, + { + "epoch": 0.9634170456792565, + "step": 9744, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9634170456792565, + "step": 9744, + "train/total_loss": 0.20850875973701477 + }, + { + "entropy": 9.028064727783203, + "epoch": 0.963515918528772, + "mean_token_accuracy": 0.7840909361839294, + "num_tokens": 29918706.0, + "step": 9745, + "train/ce_loss": 8.081343594312784e-07 + }, + { + "epoch": 0.963515918528772, + "step": 9745, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.963515918528772, + "step": 9745, + "train/total_loss": 0.046875081956386566 + }, + { + "entropy": 9.367633819580078, + "epoch": 0.9636147913782875, + "mean_token_accuracy": 0.7660818696022034, + "num_tokens": 29923828.0, + "step": 9746, + "train/ce_loss": 1.222727656364441 + }, + { + "epoch": 0.9636147913782875, + "step": 9746, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.9636147913782875, + "step": 9746, + "train/total_loss": 0.23164775967597961 + }, + { + "entropy": 9.461725234985352, + "epoch": 0.963713664227803, + "mean_token_accuracy": 0.7025495767593384, + "num_tokens": 29928614.0, + "step": 9747, + "train/ce_loss": 1.1468218872323632e-06 + }, + { + "epoch": 0.963713664227803, + "step": 9747, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.963713664227803, + "step": 9747, + "train/total_loss": 0.08203136175870895 + }, + { + "entropy": 9.182886123657227, + "epoch": 0.9638125370773186, + "mean_token_accuracy": 0.76897132396698, + "num_tokens": 29933684.0, + "step": 9748, + "train/ce_loss": 3.44960398024341e-07 + }, + { + "epoch": 0.9638125370773186, + "step": 9748, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9638125370773186, + "step": 9748, + "train/total_loss": 0.031250033527612686 + }, + { + "entropy": 9.002074241638184, + "epoch": 0.9639114099268341, + "mean_token_accuracy": 0.7204142212867737, + "num_tokens": 29938812.0, + "step": 9749, + "train/ce_loss": 0.9889717698097229 + }, + { + "epoch": 0.9639114099268341, + "step": 9749, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.9639114099268341, + "step": 9749, + "train/total_loss": 0.18874093890190125 + }, + { + "entropy": 8.901506423950195, + "epoch": 0.9640102827763496, + "mean_token_accuracy": 0.7672035098075867, + "num_tokens": 29943939.0, + "step": 9750, + "train/ce_loss": 0.7076161503791809 + }, + { + "epoch": 0.9640102827763496, + "step": 9750, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.9640102827763496, + "step": 9750, + "train/total_loss": 0.13326162099838257 + }, + { + "entropy": 8.964765548706055, + "epoch": 0.9641091556258652, + "mean_token_accuracy": 0.7090432643890381, + "num_tokens": 29949172.0, + "step": 9751, + "train/ce_loss": 1.442529559135437 + }, + { + "epoch": 0.9641091556258652, + "step": 9751, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.9641091556258652, + "step": 9751, + "train/total_loss": 0.2028467059135437 + }, + { + "entropy": 8.82308578491211, + "epoch": 0.9642080284753807, + "mean_token_accuracy": 0.7761006355285645, + "num_tokens": 29954325.0, + "step": 9752, + "train/ce_loss": 0.9967244267463684 + }, + { + "epoch": 0.9642080284753807, + "step": 9752, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.9642080284753807, + "step": 9752, + "train/total_loss": 0.16217243671417236 + }, + { + "entropy": 9.71257209777832, + "epoch": 0.9643069013248962, + "mean_token_accuracy": 0.7117437720298767, + "num_tokens": 29959030.0, + "step": 9753, + "train/ce_loss": 1.1878239547513658e-06 + }, + { + "epoch": 0.9643069013248962, + "step": 9753, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9643069013248962, + "step": 9753, + "train/total_loss": 0.03125011920928955 + }, + { + "entropy": 9.152084350585938, + "epoch": 0.9644057741744118, + "mean_token_accuracy": 0.7585227489471436, + "num_tokens": 29964127.0, + "step": 9754, + "train/ce_loss": 1.111953616142273 + }, + { + "epoch": 0.9644057741744118, + "step": 9754, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9644057741744118, + "step": 9754, + "train/total_loss": 0.16197660565376282 + }, + { + "entropy": 8.521442413330078, + "epoch": 0.9645046470239272, + "mean_token_accuracy": 0.7327141165733337, + "num_tokens": 29969554.0, + "step": 9755, + "train/ce_loss": 0.7364586591720581 + }, + { + "epoch": 0.9645046470239272, + "step": 9755, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9645046470239272, + "step": 9755, + "train/total_loss": 0.11270836740732193 + }, + { + "entropy": 8.969746589660645, + "epoch": 0.9646035198734427, + "mean_token_accuracy": 0.8072992563247681, + "num_tokens": 29974716.0, + "step": 9756, + "train/ce_loss": 9.724466281113564e-07 + }, + { + "epoch": 0.9646035198734427, + "step": 9756, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9646035198734427, + "step": 9756, + "train/total_loss": 0.04687509685754776 + }, + { + "entropy": 8.370565414428711, + "epoch": 0.9647023927229583, + "mean_token_accuracy": 0.7384259104728699, + "num_tokens": 29980014.0, + "step": 9757, + "train/ce_loss": 0.9871373176574707 + }, + { + "epoch": 0.9647023927229583, + "step": 9757, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9647023927229583, + "step": 9757, + "train/total_loss": 0.1455887258052826 + }, + { + "entropy": 8.649551391601562, + "epoch": 0.9648012655724738, + "mean_token_accuracy": 0.8205384016036987, + "num_tokens": 29985526.0, + "step": 9758, + "train/ce_loss": 0.5213025212287903 + }, + { + "epoch": 0.9648012655724738, + "step": 9758, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9648012655724738, + "step": 9758, + "train/total_loss": 0.07166150212287903 + }, + { + "entropy": 8.791582107543945, + "epoch": 0.9649001384219893, + "mean_token_accuracy": 0.7807424664497375, + "num_tokens": 29990858.0, + "step": 9759, + "train/ce_loss": 1.0622608661651611 + }, + { + "epoch": 0.9649001384219893, + "step": 9759, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9649001384219893, + "step": 9759, + "train/total_loss": 0.1609135866165161 + }, + { + "epoch": 0.9649990112715049, + "grad_norm": 0.5699576735496521, + "learning_rate": 7.589625673737824e-06, + "loss": 0.1311, + "step": 9760 + }, + { + "entropy": 9.449071884155273, + "epoch": 0.9649990112715049, + "mean_token_accuracy": 0.6702355742454529, + "num_tokens": 29995723.0, + "step": 9760, + "train/ce_loss": 9.597781627235236e-07 + }, + { + "epoch": 0.9649990112715049, + "step": 9760, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9649990112715049, + "step": 9760, + "train/total_loss": 0.04687509685754776 + }, + { + "entropy": 8.83869743347168, + "epoch": 0.9650978841210204, + "mean_token_accuracy": 0.6340621113777161, + "num_tokens": 30001091.0, + "step": 9761, + "train/ce_loss": 0.7668175101280212 + }, + { + "epoch": 0.9650978841210204, + "step": 9761, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.9650978841210204, + "step": 9761, + "train/total_loss": 0.17043176293373108 + }, + { + "entropy": 8.57378101348877, + "epoch": 0.9651967569705359, + "mean_token_accuracy": 0.6736842393875122, + "num_tokens": 30006428.0, + "step": 9762, + "train/ce_loss": 0.937447190284729 + }, + { + "epoch": 0.9651967569705359, + "step": 9762, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.9651967569705359, + "step": 9762, + "train/total_loss": 0.12890097498893738 + }, + { + "entropy": 8.815035820007324, + "epoch": 0.9652956298200515, + "mean_token_accuracy": 0.7747858166694641, + "num_tokens": 30011686.0, + "step": 9763, + "train/ce_loss": 0.5151112675666809 + }, + { + "epoch": 0.9652956298200515, + "step": 9763, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9652956298200515, + "step": 9763, + "train/total_loss": 0.07494862377643585 + }, + { + "entropy": 8.976507186889648, + "epoch": 0.965394502669567, + "mean_token_accuracy": 0.7398081421852112, + "num_tokens": 30016975.0, + "step": 9764, + "train/ce_loss": 0.5990418195724487 + }, + { + "epoch": 0.965394502669567, + "step": 9764, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.965394502669567, + "step": 9764, + "train/total_loss": 0.11849793791770935 + }, + { + "entropy": 9.343544960021973, + "epoch": 0.9654933755190824, + "mean_token_accuracy": 0.8410351276397705, + "num_tokens": 30021936.0, + "step": 9765, + "train/ce_loss": 0.700653612613678 + }, + { + "epoch": 0.9654933755190824, + "step": 9765, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9654933755190824, + "step": 9765, + "train/total_loss": 0.11303411424160004 + }, + { + "entropy": 8.677410125732422, + "epoch": 0.965592248368598, + "mean_token_accuracy": 0.7547393441200256, + "num_tokens": 30027192.0, + "step": 9766, + "train/ce_loss": 0.7619554996490479 + }, + { + "epoch": 0.965592248368598, + "step": 9766, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.965592248368598, + "step": 9766, + "train/total_loss": 0.12307055294513702 + }, + { + "entropy": 8.991375923156738, + "epoch": 0.9656911212181135, + "mean_token_accuracy": 0.7194656729698181, + "num_tokens": 30032130.0, + "step": 9767, + "train/ce_loss": 1.2186259031295776 + }, + { + "epoch": 0.9656911212181135, + "step": 9767, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9656911212181135, + "step": 9767, + "train/total_loss": 0.16092509031295776 + }, + { + "entropy": 8.920236587524414, + "epoch": 0.965789994067629, + "mean_token_accuracy": 0.767160177230835, + "num_tokens": 30037370.0, + "step": 9768, + "train/ce_loss": 0.3864889442920685 + }, + { + "epoch": 0.965789994067629, + "step": 9768, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.965789994067629, + "step": 9768, + "train/total_loss": 0.13630515336990356 + }, + { + "entropy": 8.582466125488281, + "epoch": 0.9658888669171446, + "mean_token_accuracy": 0.7327766418457031, + "num_tokens": 30042785.0, + "step": 9769, + "train/ce_loss": 1.0490230321884155 + }, + { + "epoch": 0.9658888669171446, + "step": 9769, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.9658888669171446, + "step": 9769, + "train/total_loss": 0.17521479725837708 + }, + { + "entropy": 8.956478118896484, + "epoch": 0.9659877397666601, + "mean_token_accuracy": 0.7528735399246216, + "num_tokens": 30047924.0, + "step": 9770, + "train/ce_loss": 1.0998936891555786 + }, + { + "epoch": 0.9659877397666601, + "step": 9770, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9659877397666601, + "step": 9770, + "train/total_loss": 0.16467687487602234 + }, + { + "entropy": 9.13542652130127, + "epoch": 0.9660866126161756, + "mean_token_accuracy": 0.6312500238418579, + "num_tokens": 30053032.0, + "step": 9771, + "train/ce_loss": 7.420662768709008e-07 + }, + { + "epoch": 0.9660866126161756, + "step": 9771, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9660866126161756, + "step": 9771, + "train/total_loss": 0.04687507450580597 + }, + { + "entropy": 8.715719223022461, + "epoch": 0.9661854854656912, + "mean_token_accuracy": 0.7609391808509827, + "num_tokens": 30058415.0, + "step": 9772, + "train/ce_loss": 0.7486229538917542 + }, + { + "epoch": 0.9661854854656912, + "step": 9772, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9661854854656912, + "step": 9772, + "train/total_loss": 0.1295498013496399 + }, + { + "entropy": 8.704704284667969, + "epoch": 0.9662843583152066, + "mean_token_accuracy": 0.7709563374519348, + "num_tokens": 30063731.0, + "step": 9773, + "train/ce_loss": 1.3971641063690186 + }, + { + "epoch": 0.9662843583152066, + "step": 9773, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.9662843583152066, + "step": 9773, + "train/total_loss": 0.20612266659736633 + }, + { + "entropy": 9.03307056427002, + "epoch": 0.9663832311647221, + "mean_token_accuracy": 0.7463414669036865, + "num_tokens": 30068963.0, + "step": 9774, + "train/ce_loss": 1.5434439182281494 + }, + { + "epoch": 0.9663832311647221, + "step": 9774, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.9663832311647221, + "step": 9774, + "train/total_loss": 0.23637564480304718 + }, + { + "entropy": 8.791257858276367, + "epoch": 0.9664821040142377, + "mean_token_accuracy": 0.7603748440742493, + "num_tokens": 30074113.0, + "step": 9775, + "train/ce_loss": 0.7728161811828613 + }, + { + "epoch": 0.9664821040142377, + "step": 9775, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9664821040142377, + "step": 9775, + "train/total_loss": 0.12415661662817001 + }, + { + "entropy": 8.576519966125488, + "epoch": 0.9665809768637532, + "mean_token_accuracy": 0.7609427571296692, + "num_tokens": 30079485.0, + "step": 9776, + "train/ce_loss": 0.6334747672080994 + }, + { + "epoch": 0.9665809768637532, + "step": 9776, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.9665809768637532, + "step": 9776, + "train/total_loss": 0.1336599886417389 + }, + { + "entropy": 9.028536796569824, + "epoch": 0.9666798497132687, + "mean_token_accuracy": 0.7583333253860474, + "num_tokens": 30084501.0, + "step": 9777, + "train/ce_loss": 1.0083562135696411 + }, + { + "epoch": 0.9666798497132687, + "step": 9777, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9666798497132687, + "step": 9777, + "train/total_loss": 0.1281793713569641 + }, + { + "entropy": 8.261909484863281, + "epoch": 0.9667787225627843, + "mean_token_accuracy": 0.6924778819084167, + "num_tokens": 30089816.0, + "step": 9778, + "train/ce_loss": 0.8240401148796082 + }, + { + "epoch": 0.9667787225627843, + "step": 9778, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9667787225627843, + "step": 9778, + "train/total_loss": 0.1292790174484253 + }, + { + "entropy": 8.421815872192383, + "epoch": 0.9668775954122998, + "mean_token_accuracy": 0.7316821217536926, + "num_tokens": 30095429.0, + "step": 9779, + "train/ce_loss": 0.6641775369644165 + }, + { + "epoch": 0.9668775954122998, + "step": 9779, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.9668775954122998, + "step": 9779, + "train/total_loss": 0.14063650369644165 + }, + { + "epoch": 0.9669764682618153, + "grad_norm": 0.6132939457893372, + "learning_rate": 7.584680808979875e-06, + "loss": 0.1351, + "step": 9780 + }, + { + "entropy": 8.637802124023438, + "epoch": 0.9669764682618153, + "mean_token_accuracy": 0.7306889295578003, + "num_tokens": 30100871.0, + "step": 9780, + "train/ce_loss": 1.974214792251587 + }, + { + "epoch": 0.9669764682618153, + "step": 9780, + "train/sim_loss": 0.13671875 + }, + { + "epoch": 0.9669764682618153, + "step": 9780, + "train/total_loss": 0.33414024114608765 + }, + { + "entropy": 8.890082359313965, + "epoch": 0.9670753411113309, + "mean_token_accuracy": 0.7455048561096191, + "num_tokens": 30106019.0, + "step": 9781, + "train/ce_loss": 0.925648033618927 + }, + { + "epoch": 0.9670753411113309, + "step": 9781, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.9670753411113309, + "step": 9781, + "train/total_loss": 0.15115855634212494 + }, + { + "entropy": 8.773796081542969, + "epoch": 0.9671742139608464, + "mean_token_accuracy": 0.7570332288742065, + "num_tokens": 30111273.0, + "step": 9782, + "train/ce_loss": 0.6232446432113647 + }, + { + "epoch": 0.9671742139608464, + "step": 9782, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9671742139608464, + "step": 9782, + "train/total_loss": 0.08576196432113647 + }, + { + "entropy": 9.439830780029297, + "epoch": 0.9672730868103618, + "mean_token_accuracy": 0.7900000214576721, + "num_tokens": 30116166.0, + "step": 9783, + "train/ce_loss": 0.6501454710960388 + }, + { + "epoch": 0.9672730868103618, + "step": 9783, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9672730868103618, + "step": 9783, + "train/total_loss": 0.11188954859972 + }, + { + "entropy": 8.262337684631348, + "epoch": 0.9673719596598774, + "mean_token_accuracy": 0.7378542423248291, + "num_tokens": 30121630.0, + "step": 9784, + "train/ce_loss": 0.834562361240387 + }, + { + "epoch": 0.9673719596598774, + "step": 9784, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9673719596598774, + "step": 9784, + "train/total_loss": 0.12251874059438705 + }, + { + "entropy": 9.01753044128418, + "epoch": 0.9674708325093929, + "mean_token_accuracy": 0.7933884263038635, + "num_tokens": 30126694.0, + "step": 9785, + "train/ce_loss": 0.9453256130218506 + }, + { + "epoch": 0.9674708325093929, + "step": 9785, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9674708325093929, + "step": 9785, + "train/total_loss": 0.1335950642824173 + }, + { + "entropy": 8.868257522583008, + "epoch": 0.9675697053589084, + "mean_token_accuracy": 0.6899999976158142, + "num_tokens": 30131991.0, + "step": 9786, + "train/ce_loss": 1.6262303590774536 + }, + { + "epoch": 0.9675697053589084, + "step": 9786, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9675697053589084, + "step": 9786, + "train/total_loss": 0.21731053292751312 + }, + { + "entropy": 8.569860458374023, + "epoch": 0.967668578208424, + "mean_token_accuracy": 0.7249712347984314, + "num_tokens": 30137306.0, + "step": 9787, + "train/ce_loss": 1.28461492061615 + }, + { + "epoch": 0.967668578208424, + "step": 9787, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.967668578208424, + "step": 9787, + "train/total_loss": 0.17533649504184723 + }, + { + "entropy": 8.76042366027832, + "epoch": 0.9677674510579395, + "mean_token_accuracy": 0.7638669013977051, + "num_tokens": 30142376.0, + "step": 9788, + "train/ce_loss": 4.821223456019652e-07 + }, + { + "epoch": 0.9677674510579395, + "step": 9788, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.9677674510579395, + "step": 9788, + "train/total_loss": 0.05859379842877388 + }, + { + "entropy": 9.251687049865723, + "epoch": 0.967866323907455, + "mean_token_accuracy": 0.7751371264457703, + "num_tokens": 30147381.0, + "step": 9789, + "train/ce_loss": 0.8808366656303406 + }, + { + "epoch": 0.967866323907455, + "step": 9789, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.967866323907455, + "step": 9789, + "train/total_loss": 0.1349586695432663 + }, + { + "entropy": 8.435419082641602, + "epoch": 0.9679651967569706, + "mean_token_accuracy": 0.811965823173523, + "num_tokens": 30152838.0, + "step": 9790, + "train/ce_loss": 0.6257777810096741 + }, + { + "epoch": 0.9679651967569706, + "step": 9790, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9679651967569706, + "step": 9790, + "train/total_loss": 0.08210902661085129 + }, + { + "entropy": 8.560461044311523, + "epoch": 0.968064069606486, + "mean_token_accuracy": 0.7419700026512146, + "num_tokens": 30158236.0, + "step": 9791, + "train/ce_loss": 0.6622475385665894 + }, + { + "epoch": 0.968064069606486, + "step": 9791, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.968064069606486, + "step": 9791, + "train/total_loss": 0.08966225385665894 + }, + { + "entropy": 8.549823760986328, + "epoch": 0.9681629424560015, + "mean_token_accuracy": 0.7554479241371155, + "num_tokens": 30163545.0, + "step": 9792, + "train/ce_loss": 0.7888723611831665 + }, + { + "epoch": 0.9681629424560015, + "step": 9792, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.9681629424560015, + "step": 9792, + "train/total_loss": 0.1374809890985489 + }, + { + "entropy": 8.473400115966797, + "epoch": 0.9682618153055171, + "mean_token_accuracy": 0.742290735244751, + "num_tokens": 30168950.0, + "step": 9793, + "train/ce_loss": 1.3957717418670654 + }, + { + "epoch": 0.9682618153055171, + "step": 9793, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9682618153055171, + "step": 9793, + "train/total_loss": 0.17082718014717102 + }, + { + "entropy": 8.768596649169922, + "epoch": 0.9683606881550326, + "mean_token_accuracy": 0.8123359680175781, + "num_tokens": 30174239.0, + "step": 9794, + "train/ce_loss": 0.8686895966529846 + }, + { + "epoch": 0.9683606881550326, + "step": 9794, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9683606881550326, + "step": 9794, + "train/total_loss": 0.10249396413564682 + }, + { + "entropy": 9.506278991699219, + "epoch": 0.9684595610045481, + "mean_token_accuracy": 0.7516930103302002, + "num_tokens": 30179078.0, + "step": 9795, + "train/ce_loss": 1.5441043376922607 + }, + { + "epoch": 0.9684595610045481, + "step": 9795, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9684595610045481, + "step": 9795, + "train/total_loss": 0.1739416867494583 + }, + { + "entropy": 9.06515121459961, + "epoch": 0.9685584338540637, + "mean_token_accuracy": 0.7006579041481018, + "num_tokens": 30184104.0, + "step": 9796, + "train/ce_loss": 1.252088189125061 + }, + { + "epoch": 0.9685584338540637, + "step": 9796, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9685584338540637, + "step": 9796, + "train/total_loss": 0.15645882487297058 + }, + { + "entropy": 8.85397720336914, + "epoch": 0.9686573067035792, + "mean_token_accuracy": 0.7564275860786438, + "num_tokens": 30189344.0, + "step": 9797, + "train/ce_loss": 0.6353757381439209 + }, + { + "epoch": 0.9686573067035792, + "step": 9797, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9686573067035792, + "step": 9797, + "train/total_loss": 0.10260007530450821 + }, + { + "entropy": 8.397137641906738, + "epoch": 0.9687561795530947, + "mean_token_accuracy": 0.7437499761581421, + "num_tokens": 30194958.0, + "step": 9798, + "train/ce_loss": 0.8359341025352478 + }, + { + "epoch": 0.9687561795530947, + "step": 9798, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9687561795530947, + "step": 9798, + "train/total_loss": 0.12656216323375702 + }, + { + "entropy": 8.923301696777344, + "epoch": 0.9688550524026103, + "mean_token_accuracy": 0.7220077514648438, + "num_tokens": 30200219.0, + "step": 9799, + "train/ce_loss": 1.037619948387146 + }, + { + "epoch": 0.9688550524026103, + "step": 9799, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.9688550524026103, + "step": 9799, + "train/total_loss": 0.19360575079917908 + }, + { + "epoch": 0.9689539252521258, + "grad_norm": 0.7058612108230591, + "learning_rate": 7.5797359442219264e-06, + "loss": 0.1257, + "step": 9800 + }, + { + "entropy": 8.534658432006836, + "epoch": 0.9689539252521258, + "mean_token_accuracy": 0.757446825504303, + "num_tokens": 30205669.0, + "step": 9800, + "train/ce_loss": 0.5837336182594299 + }, + { + "epoch": 0.9689539252521258, + "step": 9800, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9689539252521258, + "step": 9800, + "train/total_loss": 0.073998361825943 + }, + { + "entropy": 8.685020446777344, + "epoch": 0.9690527981016412, + "mean_token_accuracy": 0.7762399315834045, + "num_tokens": 30211013.0, + "step": 9801, + "train/ce_loss": 0.9707077741622925 + }, + { + "epoch": 0.9690527981016412, + "step": 9801, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.9690527981016412, + "step": 9801, + "train/total_loss": 0.15566453337669373 + }, + { + "entropy": 8.840173721313477, + "epoch": 0.9691516709511568, + "mean_token_accuracy": 0.763239860534668, + "num_tokens": 30216148.0, + "step": 9802, + "train/ce_loss": 0.9764208197593689 + }, + { + "epoch": 0.9691516709511568, + "step": 9802, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.9691516709511568, + "step": 9802, + "train/total_loss": 0.17186084389686584 + }, + { + "entropy": 9.262995719909668, + "epoch": 0.9692505438006723, + "mean_token_accuracy": 0.7594501972198486, + "num_tokens": 30221204.0, + "step": 9803, + "train/ce_loss": 1.0339422225952148 + }, + { + "epoch": 0.9692505438006723, + "step": 9803, + "train/sim_loss": 0.08984375 + }, + { + "epoch": 0.9692505438006723, + "step": 9803, + "train/total_loss": 0.19323797523975372 + }, + { + "entropy": 8.904062271118164, + "epoch": 0.9693494166501878, + "mean_token_accuracy": 0.7573964595794678, + "num_tokens": 30226408.0, + "step": 9804, + "train/ce_loss": 1.0313167572021484 + }, + { + "epoch": 0.9693494166501878, + "step": 9804, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.9693494166501878, + "step": 9804, + "train/total_loss": 0.16953793168067932 + }, + { + "entropy": 9.223979949951172, + "epoch": 0.9694482894997034, + "mean_token_accuracy": 0.7154255509376526, + "num_tokens": 30231239.0, + "step": 9805, + "train/ce_loss": 1.9309848546981812 + }, + { + "epoch": 0.9694482894997034, + "step": 9805, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9694482894997034, + "step": 9805, + "train/total_loss": 0.24387973546981812 + }, + { + "entropy": 8.908904075622559, + "epoch": 0.9695471623492189, + "mean_token_accuracy": 0.7402945160865784, + "num_tokens": 30236399.0, + "step": 9806, + "train/ce_loss": 0.8049737811088562 + }, + { + "epoch": 0.9695471623492189, + "step": 9806, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9695471623492189, + "step": 9806, + "train/total_loss": 0.1117473766207695 + }, + { + "entropy": 9.124170303344727, + "epoch": 0.9696460351987344, + "mean_token_accuracy": 0.7725321650505066, + "num_tokens": 30241528.0, + "step": 9807, + "train/ce_loss": 1.1656169891357422 + }, + { + "epoch": 0.9696460351987344, + "step": 9807, + "train/sim_loss": 0.140625 + }, + { + "epoch": 0.9696460351987344, + "step": 9807, + "train/total_loss": 0.2571867108345032 + }, + { + "entropy": 8.931862831115723, + "epoch": 0.96974490804825, + "mean_token_accuracy": 0.7455621361732483, + "num_tokens": 30246653.0, + "step": 9808, + "train/ce_loss": 1.578021764755249 + }, + { + "epoch": 0.96974490804825, + "step": 9808, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.96974490804825, + "step": 9808, + "train/total_loss": 0.22030217945575714 + }, + { + "entropy": 8.949090957641602, + "epoch": 0.9698437808977655, + "mean_token_accuracy": 0.6895973086357117, + "num_tokens": 30251677.0, + "step": 9809, + "train/ce_loss": 3.2962290674731776e-07 + }, + { + "epoch": 0.9698437808977655, + "step": 9809, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9698437808977655, + "step": 9809, + "train/total_loss": 0.039062533527612686 + }, + { + "entropy": 8.885849952697754, + "epoch": 0.969942653747281, + "mean_token_accuracy": 0.7645348906517029, + "num_tokens": 30256773.0, + "step": 9810, + "train/ce_loss": 0.618908703327179 + }, + { + "epoch": 0.969942653747281, + "step": 9810, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.969942653747281, + "step": 9810, + "train/total_loss": 0.0970471203327179 + }, + { + "entropy": 8.56889820098877, + "epoch": 0.9700415265967965, + "mean_token_accuracy": 0.805587887763977, + "num_tokens": 30262103.0, + "step": 9811, + "train/ce_loss": 0.7377899289131165 + }, + { + "epoch": 0.9700415265967965, + "step": 9811, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9700415265967965, + "step": 9811, + "train/total_loss": 0.11674774438142776 + }, + { + "entropy": 8.948206901550293, + "epoch": 0.970140399446312, + "mean_token_accuracy": 0.836454451084137, + "num_tokens": 30267351.0, + "step": 9812, + "train/ce_loss": 0.5882524251937866 + }, + { + "epoch": 0.970140399446312, + "step": 9812, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.970140399446312, + "step": 9812, + "train/total_loss": 0.08226273953914642 + }, + { + "entropy": 8.484578132629395, + "epoch": 0.9702392722958275, + "mean_token_accuracy": 0.752653956413269, + "num_tokens": 30272732.0, + "step": 9813, + "train/ce_loss": 0.6853912472724915 + }, + { + "epoch": 0.9702392722958275, + "step": 9813, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.9702392722958275, + "step": 9813, + "train/total_loss": 0.14275787770748138 + }, + { + "entropy": 8.83863353729248, + "epoch": 0.9703381451453431, + "mean_token_accuracy": 0.7642045617103577, + "num_tokens": 30277918.0, + "step": 9814, + "train/ce_loss": 1.0013800859451294 + }, + { + "epoch": 0.9703381451453431, + "step": 9814, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9703381451453431, + "step": 9814, + "train/total_loss": 0.15091925859451294 + }, + { + "entropy": 8.29507064819336, + "epoch": 0.9704370179948586, + "mean_token_accuracy": 0.7156756520271301, + "num_tokens": 30283333.0, + "step": 9815, + "train/ce_loss": 1.232089877128601 + }, + { + "epoch": 0.9704370179948586, + "step": 9815, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.9704370179948586, + "step": 9815, + "train/total_loss": 0.23258399963378906 + }, + { + "entropy": 9.107410430908203, + "epoch": 0.9705358908443741, + "mean_token_accuracy": 0.7877813577651978, + "num_tokens": 30288411.0, + "step": 9816, + "train/ce_loss": 0.7494767904281616 + }, + { + "epoch": 0.9705358908443741, + "step": 9816, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9705358908443741, + "step": 9816, + "train/total_loss": 0.09838517755270004 + }, + { + "entropy": 8.747941017150879, + "epoch": 0.9706347636938897, + "mean_token_accuracy": 0.7611940503120422, + "num_tokens": 30293477.0, + "step": 9817, + "train/ce_loss": 1.2272424697875977 + }, + { + "epoch": 0.9706347636938897, + "step": 9817, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9706347636938897, + "step": 9817, + "train/total_loss": 0.150067999958992 + }, + { + "entropy": 8.865287780761719, + "epoch": 0.9707336365434052, + "mean_token_accuracy": 0.7332214713096619, + "num_tokens": 30298540.0, + "step": 9818, + "train/ce_loss": 0.9569790363311768 + }, + { + "epoch": 0.9707336365434052, + "step": 9818, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.9707336365434052, + "step": 9818, + "train/total_loss": 0.15429165959358215 + }, + { + "entropy": 8.483968734741211, + "epoch": 0.9708325093929207, + "mean_token_accuracy": 0.7448747158050537, + "num_tokens": 30303910.0, + "step": 9819, + "train/ce_loss": 0.814598560333252 + }, + { + "epoch": 0.9708325093929207, + "step": 9819, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.9708325093929207, + "step": 9819, + "train/total_loss": 0.14005360007286072 + }, + { + "epoch": 0.9709313822424362, + "grad_norm": 0.5963754057884216, + "learning_rate": 7.574791079463977e-06, + "loss": 0.1267, + "step": 9820 + }, + { + "entropy": 9.1385498046875, + "epoch": 0.9709313822424362, + "mean_token_accuracy": 0.7041096091270447, + "num_tokens": 30309183.0, + "step": 9820, + "train/ce_loss": 1.7761496305465698 + }, + { + "epoch": 0.9709313822424362, + "step": 9820, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9709313822424362, + "step": 9820, + "train/total_loss": 0.2088649719953537 + }, + { + "entropy": 8.867751121520996, + "epoch": 0.9710302550919517, + "mean_token_accuracy": 0.6960408687591553, + "num_tokens": 30314426.0, + "step": 9821, + "train/ce_loss": 0.4713476300239563 + }, + { + "epoch": 0.9710302550919517, + "step": 9821, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9710302550919517, + "step": 9821, + "train/total_loss": 0.07057226449251175 + }, + { + "entropy": 8.420510292053223, + "epoch": 0.9711291279414672, + "mean_token_accuracy": 0.7186813354492188, + "num_tokens": 30319966.0, + "step": 9822, + "train/ce_loss": 1.010133981704712 + }, + { + "epoch": 0.9711291279414672, + "step": 9822, + "train/sim_loss": 0.09765625 + }, + { + "epoch": 0.9711291279414672, + "step": 9822, + "train/total_loss": 0.1986696422100067 + }, + { + "entropy": 9.160971641540527, + "epoch": 0.9712280007909828, + "mean_token_accuracy": 0.7020280957221985, + "num_tokens": 30325034.0, + "step": 9823, + "train/ce_loss": 1.3547375202178955 + }, + { + "epoch": 0.9712280007909828, + "step": 9823, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9712280007909828, + "step": 9823, + "train/total_loss": 0.18234875798225403 + }, + { + "entropy": 8.668876647949219, + "epoch": 0.9713268736404983, + "mean_token_accuracy": 0.6980676054954529, + "num_tokens": 30330338.0, + "step": 9824, + "train/ce_loss": 0.9471348524093628 + }, + { + "epoch": 0.9713268736404983, + "step": 9824, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.9713268736404983, + "step": 9824, + "train/total_loss": 0.1533072292804718 + }, + { + "entropy": 8.502266883850098, + "epoch": 0.9714257464900139, + "mean_token_accuracy": 0.7967391014099121, + "num_tokens": 30335731.0, + "step": 9825, + "train/ce_loss": 0.61232990026474 + }, + { + "epoch": 0.9714257464900139, + "step": 9825, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9714257464900139, + "step": 9825, + "train/total_loss": 0.10029549151659012 + }, + { + "entropy": 8.246334075927734, + "epoch": 0.9715246193395294, + "mean_token_accuracy": 0.7240592837333679, + "num_tokens": 30341120.0, + "step": 9826, + "train/ce_loss": 1.5039557218551636 + }, + { + "epoch": 0.9715246193395294, + "step": 9826, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.9715246193395294, + "step": 9826, + "train/total_loss": 0.22461432218551636 + }, + { + "entropy": 8.657670974731445, + "epoch": 0.9716234921890449, + "mean_token_accuracy": 0.7759783864021301, + "num_tokens": 30346333.0, + "step": 9827, + "train/ce_loss": 0.38430023193359375 + }, + { + "epoch": 0.9716234921890449, + "step": 9827, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9716234921890449, + "step": 9827, + "train/total_loss": 0.061867523938417435 + }, + { + "entropy": 8.815181732177734, + "epoch": 0.9717223650385605, + "mean_token_accuracy": 0.737313449382782, + "num_tokens": 30351417.0, + "step": 9828, + "train/ce_loss": 0.8075527548789978 + }, + { + "epoch": 0.9717223650385605, + "step": 9828, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9717223650385605, + "step": 9828, + "train/total_loss": 0.12372402846813202 + }, + { + "entropy": 8.354762077331543, + "epoch": 0.971821237888076, + "mean_token_accuracy": 0.768878698348999, + "num_tokens": 30356742.0, + "step": 9829, + "train/ce_loss": 0.6924364566802979 + }, + { + "epoch": 0.971821237888076, + "step": 9829, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.971821237888076, + "step": 9829, + "train/total_loss": 0.1122123971581459 + }, + { + "entropy": 8.414628982543945, + "epoch": 0.9719201107375914, + "mean_token_accuracy": 0.7400644421577454, + "num_tokens": 30362147.0, + "step": 9830, + "train/ce_loss": 0.3731345236301422 + }, + { + "epoch": 0.9719201107375914, + "step": 9830, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.9719201107375914, + "step": 9830, + "train/total_loss": 0.09981345385313034 + }, + { + "entropy": 8.397029876708984, + "epoch": 0.972018983587107, + "mean_token_accuracy": 0.759100615978241, + "num_tokens": 30367529.0, + "step": 9831, + "train/ce_loss": 0.46859368681907654 + }, + { + "epoch": 0.972018983587107, + "step": 9831, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.972018983587107, + "step": 9831, + "train/total_loss": 0.10545311868190765 + }, + { + "entropy": 8.723847389221191, + "epoch": 0.9721178564366225, + "mean_token_accuracy": 0.6708715558052063, + "num_tokens": 30372884.0, + "step": 9832, + "train/ce_loss": 0.9105349183082581 + }, + { + "epoch": 0.9721178564366225, + "step": 9832, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.9721178564366225, + "step": 9832, + "train/total_loss": 0.16917848587036133 + }, + { + "entropy": 9.426420211791992, + "epoch": 0.972216729286138, + "mean_token_accuracy": 0.6875, + "num_tokens": 30377721.0, + "step": 9833, + "train/ce_loss": 3.1561458110809326 + }, + { + "epoch": 0.972216729286138, + "step": 9833, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.972216729286138, + "step": 9833, + "train/total_loss": 0.35858333110809326 + }, + { + "entropy": 8.554241180419922, + "epoch": 0.9723156021356536, + "mean_token_accuracy": 0.7353658676147461, + "num_tokens": 30383019.0, + "step": 9834, + "train/ce_loss": 0.9683541655540466 + }, + { + "epoch": 0.9723156021356536, + "step": 9834, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9723156021356536, + "step": 9834, + "train/total_loss": 0.1437104195356369 + }, + { + "entropy": 9.29680061340332, + "epoch": 0.9724144749851691, + "mean_token_accuracy": 0.7786259651184082, + "num_tokens": 30387872.0, + "step": 9835, + "train/ce_loss": 3.724886425970908e-07 + }, + { + "epoch": 0.9724144749851691, + "step": 9835, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9724144749851691, + "step": 9835, + "train/total_loss": 0.039062537252902985 + }, + { + "entropy": 9.01911735534668, + "epoch": 0.9725133478346846, + "mean_token_accuracy": 0.7025723457336426, + "num_tokens": 30392988.0, + "step": 9836, + "train/ce_loss": 0.9779345393180847 + }, + { + "epoch": 0.9725133478346846, + "step": 9836, + "train/sim_loss": 0.109375 + }, + { + "epoch": 0.9725133478346846, + "step": 9836, + "train/total_loss": 0.20716845989227295 + }, + { + "entropy": 8.451966285705566, + "epoch": 0.9726122206842002, + "mean_token_accuracy": 0.7625570893287659, + "num_tokens": 30398387.0, + "step": 9837, + "train/ce_loss": 0.8094248175621033 + }, + { + "epoch": 0.9726122206842002, + "step": 9837, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.9726122206842002, + "step": 9837, + "train/total_loss": 0.15516123175621033 + }, + { + "entropy": 8.538702964782715, + "epoch": 0.9727110935337157, + "mean_token_accuracy": 0.7369033694267273, + "num_tokens": 30403754.0, + "step": 9838, + "train/ce_loss": 0.5979429483413696 + }, + { + "epoch": 0.9727110935337157, + "step": 9838, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.9727110935337157, + "step": 9838, + "train/total_loss": 0.15354429185390472 + }, + { + "entropy": 9.114009857177734, + "epoch": 0.9728099663832311, + "mean_token_accuracy": 0.7113401889801025, + "num_tokens": 30408920.0, + "step": 9839, + "train/ce_loss": 1.1505573987960815 + }, + { + "epoch": 0.9728099663832311, + "step": 9839, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9728099663832311, + "step": 9839, + "train/total_loss": 0.16583698987960815 + }, + { + "epoch": 0.9729088392327467, + "grad_norm": 0.6539301872253418, + "learning_rate": 7.569846214706028e-06, + "loss": 0.1425, + "step": 9840 + }, + { + "entropy": 8.513267517089844, + "epoch": 0.9729088392327467, + "mean_token_accuracy": 0.706315815448761, + "num_tokens": 30414386.0, + "step": 9840, + "train/ce_loss": 1.0598288774490356 + }, + { + "epoch": 0.9729088392327467, + "step": 9840, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.9729088392327467, + "step": 9840, + "train/total_loss": 0.18020164966583252 + }, + { + "entropy": 8.936271667480469, + "epoch": 0.9730077120822622, + "mean_token_accuracy": 0.7788461446762085, + "num_tokens": 30419528.0, + "step": 9841, + "train/ce_loss": 1.055103063583374 + }, + { + "epoch": 0.9730077120822622, + "step": 9841, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9730077120822622, + "step": 9841, + "train/total_loss": 0.16019780933856964 + }, + { + "entropy": 8.57362174987793, + "epoch": 0.9731065849317777, + "mean_token_accuracy": 0.7004279494285583, + "num_tokens": 30424689.0, + "step": 9842, + "train/ce_loss": 1.0744620561599731 + }, + { + "epoch": 0.9731065849317777, + "step": 9842, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9731065849317777, + "step": 9842, + "train/total_loss": 0.15822745859622955 + }, + { + "entropy": 9.38490104675293, + "epoch": 0.9732054577812933, + "mean_token_accuracy": 0.8166311383247375, + "num_tokens": 30429612.0, + "step": 9843, + "train/ce_loss": 1.1905419796676142e-06 + }, + { + "epoch": 0.9732054577812933, + "step": 9843, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9732054577812933, + "step": 9843, + "train/total_loss": 0.04687511920928955 + }, + { + "entropy": 9.143271446228027, + "epoch": 0.9733043306308088, + "mean_token_accuracy": 0.711033284664154, + "num_tokens": 30434596.0, + "step": 9844, + "train/ce_loss": 1.6245604753494263 + }, + { + "epoch": 0.9733043306308088, + "step": 9844, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9733043306308088, + "step": 9844, + "train/total_loss": 0.18589355051517487 + }, + { + "entropy": 9.067255020141602, + "epoch": 0.9734032034803243, + "mean_token_accuracy": 0.7121879458427429, + "num_tokens": 30439631.0, + "step": 9845, + "train/ce_loss": 2.583715286164079e-07 + }, + { + "epoch": 0.9734032034803243, + "step": 9845, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9734032034803243, + "step": 9845, + "train/total_loss": 0.01953127607703209 + }, + { + "entropy": 9.127470970153809, + "epoch": 0.9735020763298399, + "mean_token_accuracy": 0.7874763011932373, + "num_tokens": 30444609.0, + "step": 9846, + "train/ce_loss": 1.6459321975708008 + }, + { + "epoch": 0.9735020763298399, + "step": 9846, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9735020763298399, + "step": 9846, + "train/total_loss": 0.20365571975708008 + }, + { + "entropy": 8.572263717651367, + "epoch": 0.9736009491793554, + "mean_token_accuracy": 0.7341115474700928, + "num_tokens": 30449885.0, + "step": 9847, + "train/ce_loss": 1.071635127067566 + }, + { + "epoch": 0.9736009491793554, + "step": 9847, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9736009491793554, + "step": 9847, + "train/total_loss": 0.12669476866722107 + }, + { + "entropy": 8.690587997436523, + "epoch": 0.9736998220288708, + "mean_token_accuracy": 0.748633861541748, + "num_tokens": 30455114.0, + "step": 9848, + "train/ce_loss": 0.4136137366294861 + }, + { + "epoch": 0.9736998220288708, + "step": 9848, + "train/sim_loss": 0.01171875 + }, + { + "epoch": 0.9736998220288708, + "step": 9848, + "train/total_loss": 0.05308012291789055 + }, + { + "entropy": 9.048288345336914, + "epoch": 0.9737986948783864, + "mean_token_accuracy": 0.7550644278526306, + "num_tokens": 30460086.0, + "step": 9849, + "train/ce_loss": 0.7388678789138794 + }, + { + "epoch": 0.9737986948783864, + "step": 9849, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9737986948783864, + "step": 9849, + "train/total_loss": 0.10513678938150406 + }, + { + "entropy": 8.765560150146484, + "epoch": 0.9738975677279019, + "mean_token_accuracy": 0.7100840210914612, + "num_tokens": 30465212.0, + "step": 9850, + "train/ce_loss": 0.4580962061882019 + }, + { + "epoch": 0.9738975677279019, + "step": 9850, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9738975677279019, + "step": 9850, + "train/total_loss": 0.09268462657928467 + }, + { + "entropy": 8.234114646911621, + "epoch": 0.9739964405774174, + "mean_token_accuracy": 0.7294520735740662, + "num_tokens": 30470577.0, + "step": 9851, + "train/ce_loss": 0.8070613145828247 + }, + { + "epoch": 0.9739964405774174, + "step": 9851, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9739964405774174, + "step": 9851, + "train/total_loss": 0.1275811344385147 + }, + { + "entropy": 8.795877456665039, + "epoch": 0.974095313426933, + "mean_token_accuracy": 0.773553729057312, + "num_tokens": 30475656.0, + "step": 9852, + "train/ce_loss": 6.72289729664044e-07 + }, + { + "epoch": 0.974095313426933, + "step": 9852, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.974095313426933, + "step": 9852, + "train/total_loss": 0.04687506705522537 + }, + { + "entropy": 8.467682838439941, + "epoch": 0.9741941862764485, + "mean_token_accuracy": 0.7105262875556946, + "num_tokens": 30481086.0, + "step": 9853, + "train/ce_loss": 1.3465851545333862 + }, + { + "epoch": 0.9741941862764485, + "step": 9853, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.9741941862764485, + "step": 9853, + "train/total_loss": 0.21278351545333862 + }, + { + "entropy": 9.144447326660156, + "epoch": 0.974293059125964, + "mean_token_accuracy": 0.7205169796943665, + "num_tokens": 30486170.0, + "step": 9854, + "train/ce_loss": 4.991715059077251e-07 + }, + { + "epoch": 0.974293059125964, + "step": 9854, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.974293059125964, + "step": 9854, + "train/total_loss": 0.03125004842877388 + }, + { + "entropy": 9.285968780517578, + "epoch": 0.9743919319754796, + "mean_token_accuracy": 0.8068833947181702, + "num_tokens": 30491137.0, + "step": 9855, + "train/ce_loss": 0.5230701565742493 + }, + { + "epoch": 0.9743919319754796, + "step": 9855, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.9743919319754796, + "step": 9855, + "train/total_loss": 0.08746326714754105 + }, + { + "entropy": 8.926168441772461, + "epoch": 0.9744908048249951, + "mean_token_accuracy": 0.7908309698104858, + "num_tokens": 30496281.0, + "step": 9856, + "train/ce_loss": 0.7687076926231384 + }, + { + "epoch": 0.9744908048249951, + "step": 9856, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9744908048249951, + "step": 9856, + "train/total_loss": 0.11983951926231384 + }, + { + "entropy": 8.906764030456543, + "epoch": 0.9745896776745105, + "mean_token_accuracy": 0.7139175534248352, + "num_tokens": 30501532.0, + "step": 9857, + "train/ce_loss": 0.8398147821426392 + }, + { + "epoch": 0.9745896776745105, + "step": 9857, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9745896776745105, + "step": 9857, + "train/total_loss": 0.1035127267241478 + }, + { + "entropy": 8.640974044799805, + "epoch": 0.9746885505240261, + "mean_token_accuracy": 0.7666666507720947, + "num_tokens": 30506782.0, + "step": 9858, + "train/ce_loss": 0.8701812624931335 + }, + { + "epoch": 0.9746885505240261, + "step": 9858, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9746885505240261, + "step": 9858, + "train/total_loss": 0.11045562475919724 + }, + { + "entropy": 8.79356861114502, + "epoch": 0.9747874233735416, + "mean_token_accuracy": 0.7195994257926941, + "num_tokens": 30511942.0, + "step": 9859, + "train/ce_loss": 0.7857524752616882 + }, + { + "epoch": 0.9747874233735416, + "step": 9859, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.9747874233735416, + "step": 9859, + "train/total_loss": 0.1567002534866333 + }, + { + "epoch": 0.9748862962230571, + "grad_norm": 0.777504026889801, + "learning_rate": 7.56490134994808e-06, + "loss": 0.1331, + "step": 9860 + }, + { + "entropy": 9.06789779663086, + "epoch": 0.9748862962230571, + "mean_token_accuracy": 0.7567976117134094, + "num_tokens": 30517046.0, + "step": 9860, + "train/ce_loss": 1.3188773393630981 + }, + { + "epoch": 0.9748862962230571, + "step": 9860, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.9748862962230571, + "step": 9860, + "train/total_loss": 0.21001273393630981 + }, + { + "entropy": 8.430561065673828, + "epoch": 0.9749851690725727, + "mean_token_accuracy": 0.7646432518959045, + "num_tokens": 30522474.0, + "step": 9861, + "train/ce_loss": 0.9632667899131775 + }, + { + "epoch": 0.9749851690725727, + "step": 9861, + "train/sim_loss": 0.07421875 + }, + { + "epoch": 0.9749851690725727, + "step": 9861, + "train/total_loss": 0.17054542899131775 + }, + { + "entropy": 9.121234893798828, + "epoch": 0.9750840419220882, + "mean_token_accuracy": 0.7977991700172424, + "num_tokens": 30527648.0, + "step": 9862, + "train/ce_loss": 1.0044971704483032 + }, + { + "epoch": 0.9750840419220882, + "step": 9862, + "train/sim_loss": 0.05078125 + }, + { + "epoch": 0.9750840419220882, + "step": 9862, + "train/total_loss": 0.15123096108436584 + }, + { + "entropy": 8.365032196044922, + "epoch": 0.9751829147716037, + "mean_token_accuracy": 0.7340182662010193, + "num_tokens": 30533041.0, + "step": 9863, + "train/ce_loss": 1.510496973991394 + }, + { + "epoch": 0.9751829147716037, + "step": 9863, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.9751829147716037, + "step": 9863, + "train/total_loss": 0.22136220335960388 + }, + { + "entropy": 8.620687484741211, + "epoch": 0.9752817876211193, + "mean_token_accuracy": 0.7404305934906006, + "num_tokens": 30538414.0, + "step": 9864, + "train/ce_loss": 0.6511469483375549 + }, + { + "epoch": 0.9752817876211193, + "step": 9864, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9752817876211193, + "step": 9864, + "train/total_loss": 0.09636469930410385 + }, + { + "entropy": 8.600302696228027, + "epoch": 0.9753806604706348, + "mean_token_accuracy": 0.8005018830299377, + "num_tokens": 30543687.0, + "step": 9865, + "train/ce_loss": 4.128142165882309e-07 + }, + { + "epoch": 0.9753806604706348, + "step": 9865, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9753806604706348, + "step": 9865, + "train/total_loss": 0.019531290978193283 + }, + { + "entropy": 8.438641548156738, + "epoch": 0.9754795333201502, + "mean_token_accuracy": 0.7306532859802246, + "num_tokens": 30549115.0, + "step": 9866, + "train/ce_loss": 0.852030336856842 + }, + { + "epoch": 0.9754795333201502, + "step": 9866, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9754795333201502, + "step": 9866, + "train/total_loss": 0.13207803666591644 + }, + { + "entropy": 8.44935131072998, + "epoch": 0.9755784061696658, + "mean_token_accuracy": 0.7590726017951965, + "num_tokens": 30554567.0, + "step": 9867, + "train/ce_loss": 0.7305549383163452 + }, + { + "epoch": 0.9755784061696658, + "step": 9867, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9755784061696658, + "step": 9867, + "train/total_loss": 0.08868049830198288 + }, + { + "entropy": 9.04803466796875, + "epoch": 0.9756772790191813, + "mean_token_accuracy": 0.715242862701416, + "num_tokens": 30559655.0, + "step": 9868, + "train/ce_loss": 9.432754382032726e-07 + }, + { + "epoch": 0.9756772790191813, + "step": 9868, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9756772790191813, + "step": 9868, + "train/total_loss": 0.04687509313225746 + }, + { + "entropy": 8.45159912109375, + "epoch": 0.9757761518686968, + "mean_token_accuracy": 0.7975757718086243, + "num_tokens": 30564978.0, + "step": 9869, + "train/ce_loss": 0.6422327756881714 + }, + { + "epoch": 0.9757761518686968, + "step": 9869, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9757761518686968, + "step": 9869, + "train/total_loss": 0.0915670320391655 + }, + { + "entropy": 8.481781005859375, + "epoch": 0.9758750247182124, + "mean_token_accuracy": 0.7322221994400024, + "num_tokens": 30570348.0, + "step": 9870, + "train/ce_loss": 0.5732051730155945 + }, + { + "epoch": 0.9758750247182124, + "step": 9870, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9758750247182124, + "step": 9870, + "train/total_loss": 0.08857052028179169 + }, + { + "entropy": 8.707862854003906, + "epoch": 0.9759738975677279, + "mean_token_accuracy": 0.7855477929115295, + "num_tokens": 30575692.0, + "step": 9871, + "train/ce_loss": 0.9808839559555054 + }, + { + "epoch": 0.9759738975677279, + "step": 9871, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9759738975677279, + "step": 9871, + "train/total_loss": 0.14105714857578278 + }, + { + "entropy": 8.741270065307617, + "epoch": 0.9760727704172434, + "mean_token_accuracy": 0.7372549176216125, + "num_tokens": 30580956.0, + "step": 9872, + "train/ce_loss": 1.205855369567871 + }, + { + "epoch": 0.9760727704172434, + "step": 9872, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9760727704172434, + "step": 9872, + "train/total_loss": 0.16746053099632263 + }, + { + "entropy": 8.641582489013672, + "epoch": 0.976171643266759, + "mean_token_accuracy": 0.7350705862045288, + "num_tokens": 30586365.0, + "step": 9873, + "train/ce_loss": 0.45913925766944885 + }, + { + "epoch": 0.976171643266759, + "step": 9873, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.976171643266759, + "step": 9873, + "train/total_loss": 0.065445177257061 + }, + { + "entropy": 10.355825424194336, + "epoch": 0.9762705161162745, + "mean_token_accuracy": 1.0, + "num_tokens": 30590747.0, + "step": 9874, + "train/ce_loss": 6.013480378896929e-05 + }, + { + "epoch": 0.9762705161162745, + "step": 9874, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.9762705161162745, + "step": 9874, + "train/total_loss": 0.03516226261854172 + }, + { + "entropy": 8.985115051269531, + "epoch": 0.97636938896579, + "mean_token_accuracy": 0.7647058963775635, + "num_tokens": 30595756.0, + "step": 9875, + "train/ce_loss": 8.571366834075889e-07 + }, + { + "epoch": 0.97636938896579, + "step": 9875, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.97636938896579, + "step": 9875, + "train/total_loss": 0.046875085681676865 + }, + { + "entropy": 8.91389274597168, + "epoch": 0.9764682618153055, + "mean_token_accuracy": 0.783382773399353, + "num_tokens": 30600919.0, + "step": 9876, + "train/ce_loss": 3.1128601563068514e-07 + }, + { + "epoch": 0.9764682618153055, + "step": 9876, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9764682618153055, + "step": 9876, + "train/total_loss": 0.03906252980232239 + }, + { + "entropy": 8.787052154541016, + "epoch": 0.976567134664821, + "mean_token_accuracy": 0.7713903784751892, + "num_tokens": 30606118.0, + "step": 9877, + "train/ce_loss": 0.4978821277618408 + }, + { + "epoch": 0.976567134664821, + "step": 9877, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.976567134664821, + "step": 9877, + "train/total_loss": 0.0654132142663002 + }, + { + "entropy": 8.785783767700195, + "epoch": 0.9766660075143365, + "mean_token_accuracy": 0.7283018827438354, + "num_tokens": 30611376.0, + "step": 9878, + "train/ce_loss": 0.9105390310287476 + }, + { + "epoch": 0.9766660075143365, + "step": 9878, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.9766660075143365, + "step": 9878, + "train/total_loss": 0.17308515310287476 + }, + { + "entropy": 8.438060760498047, + "epoch": 0.9767648803638521, + "mean_token_accuracy": 0.7971563935279846, + "num_tokens": 30616939.0, + "step": 9879, + "train/ce_loss": 0.8996409773826599 + }, + { + "epoch": 0.9767648803638521, + "step": 9879, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9767648803638521, + "step": 9879, + "train/total_loss": 0.1368390917778015 + }, + { + "epoch": 0.9768637532133676, + "grad_norm": 0.510572075843811, + "learning_rate": 7.559956485190131e-06, + "loss": 0.1195, + "step": 9880 + }, + { + "entropy": 8.88320541381836, + "epoch": 0.9768637532133676, + "mean_token_accuracy": 0.718068540096283, + "num_tokens": 30622004.0, + "step": 9880, + "train/ce_loss": 1.1161073446273804 + }, + { + "epoch": 0.9768637532133676, + "step": 9880, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9768637532133676, + "step": 9880, + "train/total_loss": 0.13504824042320251 + }, + { + "entropy": 8.4711275100708, + "epoch": 0.9769626260628831, + "mean_token_accuracy": 0.7662650346755981, + "num_tokens": 30627319.0, + "step": 9881, + "train/ce_loss": 0.8220862746238708 + }, + { + "epoch": 0.9769626260628831, + "step": 9881, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9769626260628831, + "step": 9881, + "train/total_loss": 0.13689613342285156 + }, + { + "entropy": 8.859695434570312, + "epoch": 0.9770614989123987, + "mean_token_accuracy": 0.8397040963172913, + "num_tokens": 30632589.0, + "step": 9882, + "train/ce_loss": 0.789210855960846 + }, + { + "epoch": 0.9770614989123987, + "step": 9882, + "train/sim_loss": 0.1171875 + }, + { + "epoch": 0.9770614989123987, + "step": 9882, + "train/total_loss": 0.19610857963562012 + }, + { + "entropy": 8.766387939453125, + "epoch": 0.9771603717619142, + "mean_token_accuracy": 0.7624831199645996, + "num_tokens": 30637801.0, + "step": 9883, + "train/ce_loss": 1.42343008518219 + }, + { + "epoch": 0.9771603717619142, + "step": 9883, + "train/sim_loss": 0.125 + }, + { + "epoch": 0.9771603717619142, + "step": 9883, + "train/total_loss": 0.26734301447868347 + }, + { + "entropy": 8.970907211303711, + "epoch": 0.9772592446114297, + "mean_token_accuracy": 0.7400881052017212, + "num_tokens": 30642949.0, + "step": 9884, + "train/ce_loss": 1.1563928127288818 + }, + { + "epoch": 0.9772592446114297, + "step": 9884, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9772592446114297, + "step": 9884, + "train/total_loss": 0.17032678425312042 + }, + { + "entropy": 8.41048526763916, + "epoch": 0.9773581174609453, + "mean_token_accuracy": 0.7832817435264587, + "num_tokens": 30648399.0, + "step": 9885, + "train/ce_loss": 0.7665212154388428 + }, + { + "epoch": 0.9773581174609453, + "step": 9885, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9773581174609453, + "step": 9885, + "train/total_loss": 0.13133962452411652 + }, + { + "entropy": 8.745559692382812, + "epoch": 0.9774569903104607, + "mean_token_accuracy": 0.736923098564148, + "num_tokens": 30653559.0, + "step": 9886, + "train/ce_loss": 1.0831034183502197 + }, + { + "epoch": 0.9774569903104607, + "step": 9886, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9774569903104607, + "step": 9886, + "train/total_loss": 0.13565409183502197 + }, + { + "entropy": 8.54442024230957, + "epoch": 0.9775558631599762, + "mean_token_accuracy": 0.750295877456665, + "num_tokens": 30658783.0, + "step": 9887, + "train/ce_loss": 1.1568293571472168 + }, + { + "epoch": 0.9775558631599762, + "step": 9887, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9775558631599762, + "step": 9887, + "train/total_loss": 0.1430266797542572 + }, + { + "entropy": 8.582108497619629, + "epoch": 0.9776547360094918, + "mean_token_accuracy": 0.7840467095375061, + "num_tokens": 30664269.0, + "step": 9888, + "train/ce_loss": 0.4403001368045807 + }, + { + "epoch": 0.9776547360094918, + "step": 9888, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.9776547360094918, + "step": 9888, + "train/total_loss": 0.11043626070022583 + }, + { + "entropy": 8.791365623474121, + "epoch": 0.9777536088590073, + "mean_token_accuracy": 0.7666666507720947, + "num_tokens": 30669518.0, + "step": 9889, + "train/ce_loss": 0.5905187726020813 + }, + { + "epoch": 0.9777536088590073, + "step": 9889, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9777536088590073, + "step": 9889, + "train/total_loss": 0.08248937875032425 + }, + { + "entropy": 8.640741348266602, + "epoch": 0.9778524817085228, + "mean_token_accuracy": 0.7566371560096741, + "num_tokens": 30674853.0, + "step": 9890, + "train/ce_loss": 0.721811056137085 + }, + { + "epoch": 0.9778524817085228, + "step": 9890, + "train/sim_loss": 0.02734375 + }, + { + "epoch": 0.9778524817085228, + "step": 9890, + "train/total_loss": 0.0995248556137085 + }, + { + "entropy": 8.512459754943848, + "epoch": 0.9779513545580384, + "mean_token_accuracy": 0.7124260067939758, + "num_tokens": 30680183.0, + "step": 9891, + "train/ce_loss": 0.8118852972984314 + }, + { + "epoch": 0.9779513545580384, + "step": 9891, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9779513545580384, + "step": 9891, + "train/total_loss": 0.12806352972984314 + }, + { + "entropy": 8.836570739746094, + "epoch": 0.9780502274075539, + "mean_token_accuracy": 0.8242074847221375, + "num_tokens": 30685331.0, + "step": 9892, + "train/ce_loss": 0.5848731398582458 + }, + { + "epoch": 0.9780502274075539, + "step": 9892, + "train/sim_loss": 0.015625 + }, + { + "epoch": 0.9780502274075539, + "step": 9892, + "train/total_loss": 0.07411231100559235 + }, + { + "entropy": 8.382086753845215, + "epoch": 0.9781491002570694, + "mean_token_accuracy": 0.7394678592681885, + "num_tokens": 30690664.0, + "step": 9893, + "train/ce_loss": 0.9393115043640137 + }, + { + "epoch": 0.9781491002570694, + "step": 9893, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9781491002570694, + "step": 9893, + "train/total_loss": 0.1408061534166336 + }, + { + "entropy": 8.651121139526367, + "epoch": 0.978247973106585, + "mean_token_accuracy": 0.7192254662513733, + "num_tokens": 30695806.0, + "step": 9894, + "train/ce_loss": 1.034562110900879 + }, + { + "epoch": 0.978247973106585, + "step": 9894, + "train/sim_loss": 0.09375 + }, + { + "epoch": 0.978247973106585, + "step": 9894, + "train/total_loss": 0.19720621407032013 + }, + { + "entropy": 9.303420066833496, + "epoch": 0.9783468459561004, + "mean_token_accuracy": 0.6924528479576111, + "num_tokens": 30700778.0, + "step": 9895, + "train/ce_loss": 1.5140877962112427 + }, + { + "epoch": 0.9783468459561004, + "step": 9895, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.9783468459561004, + "step": 9895, + "train/total_loss": 0.18656502664089203 + }, + { + "entropy": 9.45178508758545, + "epoch": 0.9784457188056159, + "mean_token_accuracy": 0.7640449404716492, + "num_tokens": 30705674.0, + "step": 9896, + "train/ce_loss": 2.3216692568439612e-07 + }, + { + "epoch": 0.9784457188056159, + "step": 9896, + "train/sim_loss": 0.0078125 + }, + { + "epoch": 0.9784457188056159, + "step": 9896, + "train/total_loss": 0.007812523283064365 + }, + { + "entropy": 8.834487915039062, + "epoch": 0.9785445916551315, + "mean_token_accuracy": 0.7036328911781311, + "num_tokens": 30710659.0, + "step": 9897, + "train/ce_loss": 1.789854884147644 + }, + { + "epoch": 0.9785445916551315, + "step": 9897, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9785445916551315, + "step": 9897, + "train/total_loss": 0.22586049139499664 + }, + { + "entropy": 8.278316497802734, + "epoch": 0.978643464504647, + "mean_token_accuracy": 0.7219917178153992, + "num_tokens": 30716035.0, + "step": 9898, + "train/ce_loss": 0.8721766471862793 + }, + { + "epoch": 0.978643464504647, + "step": 9898, + "train/sim_loss": 0.05859375 + }, + { + "epoch": 0.978643464504647, + "step": 9898, + "train/total_loss": 0.14581140875816345 + }, + { + "entropy": 9.036575317382812, + "epoch": 0.9787423373541625, + "mean_token_accuracy": 0.6861110925674438, + "num_tokens": 30721229.0, + "step": 9899, + "train/ce_loss": 1.099528193473816 + }, + { + "epoch": 0.9787423373541625, + "step": 9899, + "train/sim_loss": 0.04296875 + }, + { + "epoch": 0.9787423373541625, + "step": 9899, + "train/total_loss": 0.15292157232761383 + }, + { + "epoch": 0.9788412102036781, + "grad_norm": 0.7387040853500366, + "learning_rate": 7.555011620432182e-06, + "loss": 0.1293, + "step": 9900 + }, + { + "entropy": 8.581748962402344, + "epoch": 0.9788412102036781, + "mean_token_accuracy": 0.689486563205719, + "num_tokens": 30726478.0, + "step": 9900, + "train/ce_loss": 0.6335959434509277 + }, + { + "epoch": 0.9788412102036781, + "step": 9900, + "train/sim_loss": 0.03125 + }, + { + "epoch": 0.9788412102036781, + "step": 9900, + "train/total_loss": 0.09460959583520889 + }, + { + "entropy": 8.285743713378906, + "epoch": 0.9789400830531936, + "mean_token_accuracy": 0.7516650557518005, + "num_tokens": 30732196.0, + "step": 9901, + "train/ce_loss": 1.0870752334594727 + }, + { + "epoch": 0.9789400830531936, + "step": 9901, + "train/sim_loss": 0.078125 + }, + { + "epoch": 0.9789400830531936, + "step": 9901, + "train/total_loss": 0.1868325173854828 + }, + { + "entropy": 8.982463836669922, + "epoch": 0.9790389559027091, + "mean_token_accuracy": 0.6801406145095825, + "num_tokens": 30737219.0, + "step": 9902, + "train/ce_loss": 1.3527101278305054 + }, + { + "epoch": 0.9790389559027091, + "step": 9902, + "train/sim_loss": 0.0546875 + }, + { + "epoch": 0.9790389559027091, + "step": 9902, + "train/total_loss": 0.18995851278305054 + }, + { + "entropy": 9.142914772033691, + "epoch": 0.9791378287522247, + "mean_token_accuracy": 0.7255892157554626, + "num_tokens": 30742283.0, + "step": 9903, + "train/ce_loss": 1.7833095788955688 + }, + { + "epoch": 0.9791378287522247, + "step": 9903, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.9791378287522247, + "step": 9903, + "train/total_loss": 0.24473720788955688 + }, + { + "entropy": 8.998891830444336, + "epoch": 0.9792367016017401, + "mean_token_accuracy": 0.7496296167373657, + "num_tokens": 30747397.0, + "step": 9904, + "train/ce_loss": 0.5943091511726379 + }, + { + "epoch": 0.9792367016017401, + "step": 9904, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9792367016017401, + "step": 9904, + "train/total_loss": 0.08286841213703156 + }, + { + "entropy": 8.840842247009277, + "epoch": 0.9793355744512556, + "mean_token_accuracy": 0.7252252101898193, + "num_tokens": 30752514.0, + "step": 9905, + "train/ce_loss": 1.216805338859558 + }, + { + "epoch": 0.9793355744512556, + "step": 9905, + "train/sim_loss": 0.0703125 + }, + { + "epoch": 0.9793355744512556, + "step": 9905, + "train/total_loss": 0.19199302792549133 + }, + { + "entropy": 8.774174690246582, + "epoch": 0.9794344473007712, + "mean_token_accuracy": 0.7604422569274902, + "num_tokens": 30757778.0, + "step": 9906, + "train/ce_loss": 0.9100068807601929 + }, + { + "epoch": 0.9794344473007712, + "step": 9906, + "train/sim_loss": 0.03515625 + }, + { + "epoch": 0.9794344473007712, + "step": 9906, + "train/total_loss": 0.12615694105625153 + }, + { + "entropy": 8.940999984741211, + "epoch": 0.9795333201502867, + "mean_token_accuracy": 0.7410179376602173, + "num_tokens": 30762903.0, + "step": 9907, + "train/ce_loss": 0.520973801612854 + }, + { + "epoch": 0.9795333201502867, + "step": 9907, + "train/sim_loss": 0.046875 + }, + { + "epoch": 0.9795333201502867, + "step": 9907, + "train/total_loss": 0.0989723801612854 + }, + { + "entropy": 8.454092025756836, + "epoch": 0.9796321929998023, + "mean_token_accuracy": 0.7634069323539734, + "num_tokens": 30768319.0, + "step": 9908, + "train/ce_loss": 0.9952600598335266 + }, + { + "epoch": 0.9796321929998023, + "step": 9908, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.9796321929998023, + "step": 9908, + "train/total_loss": 0.16593226790428162 + }, + { + "entropy": 8.704395294189453, + "epoch": 0.9797310658493178, + "mean_token_accuracy": 0.646039605140686, + "num_tokens": 30773610.0, + "step": 9909, + "train/ce_loss": 1.4009902477264404 + }, + { + "epoch": 0.9797310658493178, + "step": 9909, + "train/sim_loss": 0.06640625 + }, + { + "epoch": 0.9797310658493178, + "step": 9909, + "train/total_loss": 0.20650528371334076 + }, + { + "entropy": 8.383907318115234, + "epoch": 0.9798299386988333, + "mean_token_accuracy": 0.7507853507995605, + "num_tokens": 30779058.0, + "step": 9910, + "train/ce_loss": 0.7845810055732727 + }, + { + "epoch": 0.9798299386988333, + "step": 9910, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9798299386988333, + "step": 9910, + "train/total_loss": 0.11752060055732727 + }, + { + "entropy": 8.56901741027832, + "epoch": 0.9799288115483489, + "mean_token_accuracy": 0.7284946441650391, + "num_tokens": 30784258.0, + "step": 9911, + "train/ce_loss": 0.6746019721031189 + }, + { + "epoch": 0.9799288115483489, + "step": 9911, + "train/sim_loss": 0.0390625 + }, + { + "epoch": 0.9799288115483489, + "step": 9911, + "train/total_loss": 0.10652270168066025 + }, + { + "entropy": 8.998884201049805, + "epoch": 0.9800276843978644, + "mean_token_accuracy": 0.8318318128585815, + "num_tokens": 30789417.0, + "step": 9912, + "train/ce_loss": 0.777534008026123 + }, + { + "epoch": 0.9800276843978644, + "step": 9912, + "train/sim_loss": 0.01953125 + }, + { + "epoch": 0.9800276843978644, + "step": 9912, + "train/total_loss": 0.09728465229272842 + }, + { + "entropy": 8.481942176818848, + "epoch": 0.9801265572473798, + "mean_token_accuracy": 0.7428229451179504, + "num_tokens": 30794865.0, + "step": 9913, + "train/ce_loss": 0.9887452721595764 + }, + { + "epoch": 0.9801265572473798, + "step": 9913, + "train/sim_loss": 0.08203125 + }, + { + "epoch": 0.9801265572473798, + "step": 9913, + "train/total_loss": 0.1809057891368866 + }, + { + "entropy": 8.863540649414062, + "epoch": 0.9802254300968954, + "mean_token_accuracy": 0.7674094438552856, + "num_tokens": 30800041.0, + "step": 9914, + "train/ce_loss": 1.9064351320266724 + }, + { + "epoch": 0.9802254300968954, + "step": 9914, + "train/sim_loss": 0.0625 + }, + { + "epoch": 0.9802254300968954, + "step": 9914, + "train/total_loss": 0.2531435191631317 + }, + { + "entropy": 8.945619583129883, + "epoch": 0.9803243029464109, + "mean_token_accuracy": 0.7545271515846252, + "num_tokens": 30805020.0, + "step": 9915, + "train/ce_loss": 0.5835563540458679 + }, + { + "epoch": 0.9803243029464109, + "step": 9915, + "train/sim_loss": 0.0234375 + }, + { + "epoch": 0.9803243029464109, + "step": 9915, + "train/total_loss": 0.08179313689470291 + }, + { + "entropy": 9.063322067260742, + "epoch": 0.9804231757959264, + "mean_token_accuracy": 0.7410604357719421, + "num_tokens": 30810451.0, + "step": 9916, + "train/ce_loss": 0.8868422508239746 + }, + { + "epoch": 0.9804231757959264, + "step": 9916, + "train/sim_loss": 0.02734375 }, { - "entropy": 9.012129020690917, - "epoch": 0.04350405378683014, - "grad_norm": 8.741070747375488, - "learning_rate": 9.893932650941997e-06, - "loss": 0.5833, - "mean_token_accuracy": 0.8093826532363891, - "num_tokens": 2112797.0, - "step": 440 + "epoch": 0.9804231757959264, + "step": 9916, + "train/total_loss": 0.11602797359228134 }, { - "entropy": 8.85499711036682, - "epoch": 0.0454815107771406, - "grad_norm": 7.772428035736084, - "learning_rate": 9.88898778618405e-06, - "loss": 0.5931, - "mean_token_accuracy": 0.8054407477378845, - "num_tokens": 2209736.0, - "step": 460 + "entropy": 8.314630508422852, + "epoch": 0.980522048645442, + "mean_token_accuracy": 0.7217973470687866, + "num_tokens": 30815993.0, + "step": 9917, + "train/ce_loss": 0.9607058167457581 }, { - "entropy": 9.008498573303223, - "epoch": 0.04745896776745106, - "grad_norm": 5.063883304595947, - "learning_rate": 9.8840429214261e-06, - "loss": 0.5573, - "mean_token_accuracy": 0.8134046852588653, - "num_tokens": 2306375.0, - "step": 480 + "epoch": 0.980522048645442, + "step": 9917, + "train/sim_loss": 0.01953125 }, { - "entropy": 9.027544116973877, - "epoch": 0.04943642475776152, - "grad_norm": 6.074745178222656, - "learning_rate": 9.87909805666815e-06, - "loss": 0.6062, - "mean_token_accuracy": 0.7952834278345108, - "num_tokens": 2401863.0, - "step": 500 + "epoch": 0.980522048645442, + "step": 9917, + "train/total_loss": 0.11560183018445969 }, { - "entropy": 8.941827821731568, - "epoch": 0.05141388174807198, - "grad_norm": 5.661360740661621, - "learning_rate": 9.874153191910203e-06, - "loss": 0.5416, - "mean_token_accuracy": 0.8219775348901749, - "num_tokens": 96641.0, - "step": 520 + "entropy": 8.702873229980469, + "epoch": 0.9806209214949575, + "mean_token_accuracy": 0.6839160919189453, + "num_tokens": 30821165.0, + "step": 9918, + "train/ce_loss": 0.7455655336380005 }, { - "entropy": 8.973506832122803, - "epoch": 0.05339133873838244, - "grad_norm": 11.214988708496094, - "learning_rate": 9.869208327152253e-06, - "loss": 0.555, - "mean_token_accuracy": 0.8116023421287537, - "num_tokens": 192188.0, - "step": 540 + "epoch": 0.9806209214949575, + "step": 9918, + "train/sim_loss": 0.05078125 }, { - "entropy": 8.858332443237305, - "epoch": 0.0553687957286929, - "grad_norm": 5.695793151855469, - "learning_rate": 9.864263462394305e-06, - "loss": 0.521, - "mean_token_accuracy": 0.8223962157964706, - "num_tokens": 288727.0, - "step": 560 + "epoch": 0.9806209214949575, + "step": 9918, + "train/total_loss": 0.12533780932426453 }, { - "entropy": 8.82462306022644, - "epoch": 0.057346252719003364, - "grad_norm": 7.937519550323486, - "learning_rate": 9.859318597636356e-06, - "loss": 0.5524, - "mean_token_accuracy": 0.8099074214696884, - "num_tokens": 384935.0, - "step": 580 + "entropy": 8.36336612701416, + "epoch": 0.980719794344473, + "mean_token_accuracy": 0.8054474592208862, + "num_tokens": 30826831.0, + "step": 9919, + "train/ce_loss": 0.691750705242157 }, { - "entropy": 8.89127550125122, - "epoch": 0.059323709709313825, - "grad_norm": 8.627009391784668, - "learning_rate": 9.854373732878406e-06, - "loss": 0.5433, - "mean_token_accuracy": 0.8203161925077438, - "num_tokens": 481486.0, - "step": 600 + "epoch": 0.980719794344473, + "step": 9919, + "train/sim_loss": 0.0625 }, { - "entropy": 8.845260763168335, - "epoch": 0.061301166699624286, - "grad_norm": 5.2925004959106445, - "learning_rate": 9.849428868120457e-06, - "loss": 0.5003, - "mean_token_accuracy": 0.8314364552497864, - "num_tokens": 576494.0, - "step": 620 + "epoch": 0.980719794344473, + "step": 9919, + "train/total_loss": 0.13167506456375122 }, { - "entropy": 8.797648763656616, - "epoch": 0.06327862368993474, - "grad_norm": 6.669622898101807, - "learning_rate": 9.844484003362509e-06, - "loss": 0.5048, - "mean_token_accuracy": 0.8285873651504516, - "num_tokens": 672082.0, - "step": 640 + "epoch": 0.9808186671939886, + "grad_norm": 0.5420098900794983, + "learning_rate": 7.550066755674232e-06, + "loss": 0.1404, + "step": 9920 }, { - "entropy": 8.779708003997802, - "epoch": 0.0652560806802452, - "grad_norm": 7.255181312561035, - "learning_rate": 9.83953913860456e-06, - "loss": 0.5323, - "mean_token_accuracy": 0.8236057549715042, - "num_tokens": 768865.0, - "step": 660 + "entropy": 8.432720184326172, + "epoch": 0.9808186671939886, + "mean_token_accuracy": 0.6704196333885193, + "num_tokens": 30832308.0, + "step": 9920, + "train/ce_loss": 1.9303783178329468 }, { - "entropy": 9.006498098373413, - "epoch": 0.06723353767055566, - "grad_norm": 9.592479705810547, - "learning_rate": 9.834594273846612e-06, - "loss": 0.5004, - "mean_token_accuracy": 0.8292551845312118, - "num_tokens": 863537.0, - "step": 680 + "epoch": 0.9808186671939886, + "step": 9920, + "train/sim_loss": 0.04296875 }, { - "entropy": 8.846775245666503, - "epoch": 0.06921099466086612, - "grad_norm": 7.150557518005371, - "learning_rate": 9.829649409088662e-06, - "loss": 0.5216, - "mean_token_accuracy": 0.8251522511243821, - "num_tokens": 958588.0, - "step": 700 + "epoch": 0.9808186671939886, + "step": 9920, + "train/total_loss": 0.23600658774375916 }, { - "entropy": 8.872265100479126, - "epoch": 0.07118845165117658, - "grad_norm": 5.498615741729736, - "learning_rate": 9.824704544330713e-06, - "loss": 0.5196, - "mean_token_accuracy": 0.8241115421056747, - "num_tokens": 1054790.0, - "step": 720 + "entropy": 8.786901473999023, + "epoch": 0.9809175400435041, + "mean_token_accuracy": 0.7964236736297607, + "num_tokens": 30837514.0, + "step": 9921, + "train/ce_loss": 5.456970484374324e-07 }, { - "entropy": 8.910785818099976, - "epoch": 0.07316590864148705, - "grad_norm": 7.82961368560791, - "learning_rate": 9.819759679572765e-06, - "loss": 0.4852, - "mean_token_accuracy": 0.8368175506591797, - "num_tokens": 1149594.0, - "step": 740 + "epoch": 0.9809175400435041, + "step": 9921, + "train/sim_loss": 0.04296875 }, { - "entropy": 8.991975927352906, - "epoch": 0.0751433656317975, - "grad_norm": 9.478486061096191, - "learning_rate": 9.814814814814815e-06, - "loss": 0.504, - "mean_token_accuracy": 0.829120421409607, - "num_tokens": 1244950.0, - "step": 760 + "epoch": 0.9809175400435041, + "step": 9921, + "train/total_loss": 0.04296880587935448 }, { - "entropy": 9.140399312973022, - "epoch": 0.07712082262210797, - "grad_norm": 6.89947509765625, - "learning_rate": 9.809869950056868e-06, - "loss": 0.5183, - "mean_token_accuracy": 0.8244409829378128, - "num_tokens": 1339074.0, - "step": 780 + "entropy": 8.23611831665039, + "epoch": 0.9810164128930196, + "mean_token_accuracy": 0.7310061454772949, + "num_tokens": 30843032.0, + "step": 9922, + "train/ce_loss": 0.5126314163208008 }, { - "entropy": 8.921469259262086, - "epoch": 0.07909827961241843, - "grad_norm": 5.486616611480713, - "learning_rate": 9.804925085298918e-06, - "loss": 0.5331, - "mean_token_accuracy": 0.8215873599052429, - "num_tokens": 1435022.0, - "step": 800 + "epoch": 0.9810164128930196, + "step": 9922, + "train/sim_loss": 0.03515625 }, { - "entropy": 9.024704217910767, - "epoch": 0.08107573660272889, - "grad_norm": 9.051934242248535, - "learning_rate": 9.799980220540969e-06, - "loss": 0.5465, - "mean_token_accuracy": 0.8153810113668442, - "num_tokens": 1529757.0, - "step": 820 + "epoch": 0.9810164128930196, + "step": 9922, + "train/total_loss": 0.08641938865184784 }, { - "entropy": 9.114181613922119, - "epoch": 0.08305319359303935, - "grad_norm": 5.360408782958984, - "learning_rate": 9.79503535578302e-06, - "loss": 0.5314, - "mean_token_accuracy": 0.8226724088191986, - "num_tokens": 1625207.0, - "step": 840 + "entropy": 8.348541259765625, + "epoch": 0.9811152857425351, + "mean_token_accuracy": 0.776190459728241, + "num_tokens": 30848329.0, + "step": 9923, + "train/ce_loss": 0.7847875952720642 }, { - "entropy": 8.981474876403809, - "epoch": 0.08503065058334981, - "grad_norm": 7.849207878112793, - "learning_rate": 9.790090491025071e-06, - "loss": 0.5584, - "mean_token_accuracy": 0.8155747056007385, - "num_tokens": 1720380.0, - "step": 860 + "epoch": 0.9811152857425351, + "step": 9923, + "train/sim_loss": 0.078125 }, { - "entropy": 8.951270866394044, - "epoch": 0.08700810757366027, - "grad_norm": 4.745890140533447, - "learning_rate": 9.785145626267124e-06, - "loss": 0.5211, - "mean_token_accuracy": 0.8240448266267777, - "num_tokens": 1816218.0, - "step": 880 + "epoch": 0.9811152857425351, + "step": 9923, + "train/total_loss": 0.15660375356674194 }, { - "entropy": 8.851841449737549, - "epoch": 0.08898556456397073, - "grad_norm": 8.521614074707031, - "learning_rate": 9.780200761509172e-06, - "loss": 0.5764, - "mean_token_accuracy": 0.8115346252918243, - "num_tokens": 1913262.0, - "step": 900 + "entropy": 8.548707008361816, + "epoch": 0.9812141585920506, + "mean_token_accuracy": 0.7929824590682983, + "num_tokens": 30853656.0, + "step": 9924, + "train/ce_loss": 0.8370770812034607 }, { - "entropy": 9.02562928199768, - "epoch": 0.0909630215542812, - "grad_norm": 10.010477066040039, - "learning_rate": 9.775255896751225e-06, - "loss": 0.5088, - "mean_token_accuracy": 0.8275548785924911, - "num_tokens": 2008708.0, - "step": 920 + "epoch": 0.9812141585920506, + "step": 9924, + "train/sim_loss": 0.04296875 }, { - "entropy": 8.689861631393432, - "epoch": 0.09294047854459166, - "grad_norm": 4.45553731918335, - "learning_rate": 9.770311031993277e-06, - "loss": 0.5344, - "mean_token_accuracy": 0.8239773392677308, - "num_tokens": 2106069.0, - "step": 940 + "epoch": 0.9812141585920506, + "step": 9924, + "train/total_loss": 0.12667647004127502 }, { - "entropy": 8.8539484500885, - "epoch": 0.09491793553490212, - "grad_norm": 7.795210361480713, - "learning_rate": 9.765366167235327e-06, - "loss": 0.5444, - "mean_token_accuracy": 0.8206797152757644, - "num_tokens": 2202355.0, - "step": 960 + "entropy": 8.849676132202148, + "epoch": 0.9813130314415661, + "mean_token_accuracy": 0.7673179507255554, + "num_tokens": 30858608.0, + "step": 9925, + "train/ce_loss": 0.9295164346694946 }, { - "entropy": 9.191250038146972, - "epoch": 0.09689539252521258, - "grad_norm": 6.243945121765137, - "learning_rate": 9.760421302477378e-06, - "loss": 0.5247, - "mean_token_accuracy": 0.8212809175252914, - "num_tokens": 2296746.0, - "step": 980 + "epoch": 0.9813130314415661, + "step": 9925, + "train/sim_loss": 0.0390625 }, { - "entropy": 8.940995264053345, - "epoch": 0.09887284951552304, - "grad_norm": 18.972265243530273, - "learning_rate": 9.755476437719428e-06, - "loss": 0.5473, - "mean_token_accuracy": 0.817921158671379, - "num_tokens": 2392971.0, - "step": 1000 + "epoch": 0.9813130314415661, + "step": 9925, + "train/total_loss": 0.13201415538787842 }, { - "entropy": 8.960219192504884, - "epoch": 0.1008503065058335, - "grad_norm": 16.02000617980957, - "learning_rate": 9.75053157296148e-06, - "loss": 0.5339, - "mean_token_accuracy": 0.8229005843400955, - "num_tokens": 2489244.0, - "step": 1020 + "entropy": 8.875197410583496, + "epoch": 0.9814119042910817, + "mean_token_accuracy": 0.7397959232330322, + "num_tokens": 30863848.0, + "step": 9926, + "train/ce_loss": 0.45025020837783813 }, { - "entropy": 8.982712030410767, - "epoch": 0.10282776349614396, - "grad_norm": 6.925206661224365, - "learning_rate": 9.745586708203531e-06, - "loss": 0.4831, - "mean_token_accuracy": 0.8360354512929916, - "num_tokens": 2584780.0, - "step": 1040 + "epoch": 0.9814119042910817, + "step": 9926, + "train/sim_loss": 0.078125 }, { - "entropy": 8.81601586341858, - "epoch": 0.10480522048645442, - "grad_norm": 4.667802333831787, - "learning_rate": 9.740641843445583e-06, - "loss": 0.4942, - "mean_token_accuracy": 0.8328757613897324, - "num_tokens": 2681601.0, - "step": 1060 + "epoch": 0.9814119042910817, + "step": 9926, + "train/total_loss": 0.12315002083778381 }, { - "entropy": 9.202123260498047, - "epoch": 0.10678267747676488, - "grad_norm": 4.67241907119751, - "learning_rate": 9.735696978687634e-06, - "loss": 0.4656, - "mean_token_accuracy": 0.8410820543766022, - "num_tokens": 2775353.0, - "step": 1080 + "entropy": 8.888309478759766, + "epoch": 0.9815107771405972, + "mean_token_accuracy": 0.7327459454536438, + "num_tokens": 30868968.0, + "step": 9927, + "train/ce_loss": 0.6028944253921509 }, { - "entropy": 9.024459838867188, - "epoch": 0.10876013446707535, - "grad_norm": 7.67643404006958, - "learning_rate": 9.730752113929684e-06, - "loss": 0.4945, - "mean_token_accuracy": 0.8338283032178879, - "num_tokens": 2870291.0, - "step": 1100 + "epoch": 0.9815107771405972, + "step": 9927, + "train/sim_loss": 0.04296875 }, { - "entropy": 8.979987621307373, - "epoch": 0.1107375914573858, - "grad_norm": 25.34343910217285, - "learning_rate": 9.725807249171736e-06, - "loss": 0.5276, - "mean_token_accuracy": 0.8217969834804535, - "num_tokens": 2967160.0, - "step": 1120 + "epoch": 0.9815107771405972, + "step": 9927, + "train/total_loss": 0.10325819253921509 }, { - "entropy": 9.02458415031433, - "epoch": 0.11271504844769627, - "grad_norm": 10.91560173034668, - "learning_rate": 9.720862384413787e-06, - "loss": 0.5164, - "mean_token_accuracy": 0.8294859111309052, - "num_tokens": 3064487.0, - "step": 1140 + "entropy": 8.876193046569824, + "epoch": 0.9816096499901127, + "mean_token_accuracy": 0.7164179086685181, + "num_tokens": 30874115.0, + "step": 9928, + "train/ce_loss": 0.8704211115837097 }, { - "entropy": 9.044503927230835, - "epoch": 0.11469250543800673, - "grad_norm": 4.927774429321289, - "learning_rate": 9.715917519655839e-06, - "loss": 0.4593, - "mean_token_accuracy": 0.8436438351869583, - "num_tokens": 3160351.0, - "step": 1160 + "epoch": 0.9816096499901127, + "step": 9928, + "train/sim_loss": 0.03125 }, { - "entropy": 9.10939655303955, - "epoch": 0.11666996242831719, - "grad_norm": 5.526464939117432, - "learning_rate": 9.71097265489789e-06, - "loss": 0.5186, - "mean_token_accuracy": 0.8225110024213791, - "num_tokens": 3255229.0, - "step": 1180 + "epoch": 0.9816096499901127, + "step": 9928, + "train/total_loss": 0.11829211562871933 }, { - "entropy": 9.157744455337525, - "epoch": 0.11864741941862765, - "grad_norm": 16.12851905822754, - "learning_rate": 9.70602779013994e-06, - "loss": 0.5099, - "mean_token_accuracy": 0.8333958268165589, - "num_tokens": 3349813.0, - "step": 1200 + "entropy": 8.968652725219727, + "epoch": 0.9817085228396283, + "mean_token_accuracy": 0.7547169923782349, + "num_tokens": 30879250.0, + "step": 9929, + "train/ce_loss": 0.651799738407135 }, { - "entropy": 9.03145079612732, - "epoch": 0.12062487640893811, - "grad_norm": 48.44076919555664, - "learning_rate": 9.701082925381992e-06, - "loss": 0.514, - "mean_token_accuracy": 0.829283133149147, - "num_tokens": 3445128.0, - "step": 1220 + "epoch": 0.9817085228396283, + "step": 9929, + "train/sim_loss": 0.015625 }, { - "entropy": 9.046934366226196, - "epoch": 0.12260233339924857, - "grad_norm": 4.218654155731201, - "learning_rate": 9.696138060624043e-06, - "loss": 0.5524, - "mean_token_accuracy": 0.8134865045547486, - "num_tokens": 3540660.0, - "step": 1240 + "epoch": 0.9817085228396283, + "step": 9929, + "train/total_loss": 0.0808049738407135 }, { - "entropy": 8.9799147605896, - "epoch": 0.12457979038955903, - "grad_norm": 5.418064117431641, - "learning_rate": 9.691193195866095e-06, - "loss": 0.5128, - "mean_token_accuracy": 0.8257223129272461, - "num_tokens": 3637051.0, - "step": 1260 + "entropy": 8.65110969543457, + "epoch": 0.9818073956891438, + "mean_token_accuracy": 0.790043294429779, + "num_tokens": 30884875.0, + "step": 9930, + "train/ce_loss": 1.005576491355896 }, { - "entropy": 9.078458547592163, - "epoch": 0.12655724737986948, - "grad_norm": 8.336530685424805, - "learning_rate": 9.686248331108144e-06, - "loss": 0.4793, - "mean_token_accuracy": 0.8399377524852752, - "num_tokens": 3731137.0, - "step": 1280 + "epoch": 0.9818073956891438, + "step": 9930, + "train/sim_loss": 0.14453125 }, { - "entropy": 9.002950096130371, - "epoch": 0.12853470437017994, - "grad_norm": 5.724057197570801, - "learning_rate": 9.681303466350196e-06, - "loss": 0.5174, - "mean_token_accuracy": 0.8220262438058853, - "num_tokens": 3826771.0, - "step": 1300 + "epoch": 0.9818073956891438, + "step": 9930, + "train/total_loss": 0.24508890509605408 }, { - "entropy": 8.82624306678772, - "epoch": 0.1305121613604904, - "grad_norm": 7.001655101776123, - "learning_rate": 9.676358601592247e-06, - "loss": 0.4845, - "mean_token_accuracy": 0.8322299361228943, - "num_tokens": 3922689.0, - "step": 1320 + "entropy": 9.129095077514648, + "epoch": 0.9819062685386593, + "mean_token_accuracy": 0.7457627058029175, + "num_tokens": 30889864.0, + "step": 9931, + "train/ce_loss": 9.725036989038927e-07 }, { - "entropy": 8.664327621459961, - "epoch": 0.13248961835080086, - "grad_norm": 4.310056686401367, - "learning_rate": 9.671413736834299e-06, - "loss": 0.5138, - "mean_token_accuracy": 0.8264670699834824, - "num_tokens": 4018964.0, - "step": 1340 + "epoch": 0.9819062685386593, + "step": 9931, + "train/sim_loss": 0.04296875 }, { - "entropy": 8.969021081924438, - "epoch": 0.13446707534111133, - "grad_norm": 5.618369102478027, - "learning_rate": 9.66646887207635e-06, - "loss": 0.5505, - "mean_token_accuracy": 0.8132057249546051, - "num_tokens": 4113869.0, - "step": 1360 + "epoch": 0.9819062685386593, + "step": 9931, + "train/total_loss": 0.04296884685754776 }, { - "entropy": 8.82208080291748, - "epoch": 0.1364445323314218, - "grad_norm": 5.97266960144043, - "learning_rate": 9.6615240073184e-06, - "loss": 0.5058, - "mean_token_accuracy": 0.8316337525844574, - "num_tokens": 4209735.0, - "step": 1380 + "entropy": 9.076364517211914, + "epoch": 0.9820051413881749, + "mean_token_accuracy": 0.6855670213699341, + "num_tokens": 30894878.0, + "step": 9932, + "train/ce_loss": 0.7352795004844666 }, { - "entropy": 8.831556391716003, - "epoch": 0.13842198932173225, - "grad_norm": 6.052074909210205, - "learning_rate": 9.656579142560452e-06, - "loss": 0.5133, - "mean_token_accuracy": 0.8256670117378235, - "num_tokens": 4305687.0, - "step": 1400 + "epoch": 0.9820051413881749, + "step": 9932, + "train/sim_loss": 0.03125 }, { - "entropy": 8.764881277084351, - "epoch": 0.1403994463120427, - "grad_norm": 5.213918209075928, - "learning_rate": 9.651634277802503e-06, - "loss": 0.5011, - "mean_token_accuracy": 0.8285619378089905, - "num_tokens": 4401865.0, - "step": 1420 + "epoch": 0.9820051413881749, + "step": 9932, + "train/total_loss": 0.10477795451879501 }, { - "entropy": 8.800114393234253, - "epoch": 0.14237690330235317, - "grad_norm": 5.405309200286865, - "learning_rate": 9.646689413044555e-06, - "loss": 0.4639, - "mean_token_accuracy": 0.8440170794725418, - "num_tokens": 4497916.0, - "step": 1440 + "entropy": 8.525617599487305, + "epoch": 0.9821040142376903, + "mean_token_accuracy": 0.7788162231445312, + "num_tokens": 30900269.0, + "step": 9933, + "train/ce_loss": 0.4009047746658325 }, { - "entropy": 8.909861755371093, - "epoch": 0.14435436029266363, - "grad_norm": 4.966978549957275, - "learning_rate": 9.641744548286605e-06, - "loss": 0.5014, - "mean_token_accuracy": 0.8311232626438141, - "num_tokens": 4593280.0, - "step": 1460 + "epoch": 0.9821040142376903, + "step": 9933, + "train/sim_loss": 0.0390625 }, { - "entropy": 8.939611196517944, - "epoch": 0.1463318172829741, - "grad_norm": 4.919088840484619, - "learning_rate": 9.636799683528656e-06, - "loss": 0.5286, - "mean_token_accuracy": 0.8200359553098678, - "num_tokens": 4689036.0, - "step": 1480 + "epoch": 0.9821040142376903, + "step": 9933, + "train/total_loss": 0.07915297895669937 }, { - "entropy": 9.041218280792236, - "epoch": 0.14830927427328455, - "grad_norm": 6.6664299964904785, - "learning_rate": 9.631854818770708e-06, - "loss": 0.4763, - "mean_token_accuracy": 0.8372103363275528, - "num_tokens": 4784279.0, - "step": 1500 + "entropy": 8.45478343963623, + "epoch": 0.9822028870872058, + "mean_token_accuracy": 0.7869757413864136, + "num_tokens": 30905662.0, + "step": 9934, + "train/ce_loss": 0.6240054965019226 }, { - "entropy": 8.930060291290284, - "epoch": 0.150286731263595, - "grad_norm": 6.254668235778809, - "learning_rate": 9.626909954012758e-06, - "loss": 0.5204, - "mean_token_accuracy": 0.8237109780311584, - "num_tokens": 4879922.0, - "step": 1520 + "epoch": 0.9822028870872058, + "step": 9934, + "train/sim_loss": 0.03125 }, { - "entropy": 8.909379577636718, - "epoch": 0.15226418825390547, - "grad_norm": 5.818599700927734, - "learning_rate": 9.62196508925481e-06, - "loss": 0.5132, - "mean_token_accuracy": 0.825594112277031, - "num_tokens": 4974502.0, - "step": 1540 + "epoch": 0.9822028870872058, + "step": 9934, + "train/total_loss": 0.09365054965019226 }, { - "entropy": 8.820873641967774, - "epoch": 0.15424164524421594, - "grad_norm": 7.291753768920898, - "learning_rate": 9.617020224496861e-06, - "loss": 0.5176, - "mean_token_accuracy": 0.8232084572315216, - "num_tokens": 5070620.0, - "step": 1560 + "entropy": 8.567974090576172, + "epoch": 0.9823017599367214, + "mean_token_accuracy": 0.7644444704055786, + "num_tokens": 30911045.0, + "step": 9935, + "train/ce_loss": 0.5513052940368652 }, { - "entropy": 8.903107976913452, - "epoch": 0.1562191022345264, - "grad_norm": 4.824163913726807, - "learning_rate": 9.612075359738912e-06, - "loss": 0.4841, - "mean_token_accuracy": 0.8357096225023269, - "num_tokens": 5167176.0, - "step": 1580 + "epoch": 0.9823017599367214, + "step": 9935, + "train/sim_loss": 0.02734375 }, { - "entropy": 8.739392495155334, - "epoch": 0.15819655922483686, - "grad_norm": 5.428810119628906, - "learning_rate": 9.607130494980962e-06, - "loss": 0.4673, - "mean_token_accuracy": 0.8413905739784241, - "num_tokens": 5263124.0, - "step": 1600 + "epoch": 0.9823017599367214, + "step": 9935, + "train/total_loss": 0.08247427642345428 }, { - "entropy": 8.775667548179626, - "epoch": 0.16017401621514732, - "grad_norm": 5.47054386138916, - "learning_rate": 9.602185630223014e-06, - "loss": 0.501, - "mean_token_accuracy": 0.8284578830003738, - "num_tokens": 5358524.0, - "step": 1620 + "entropy": 8.663185119628906, + "epoch": 0.9824006327862369, + "mean_token_accuracy": 0.7214452028274536, + "num_tokens": 30916391.0, + "step": 9936, + "train/ce_loss": 1.0856975317001343 }, { - "entropy": 8.693215084075927, - "epoch": 0.16215147320545778, - "grad_norm": 4.900571823120117, - "learning_rate": 9.597240765465065e-06, - "loss": 0.5036, - "mean_token_accuracy": 0.8287155061960221, - "num_tokens": 5454582.0, - "step": 1640 + "epoch": 0.9824006327862369, + "step": 9936, + "train/sim_loss": 0.0625 }, { - "entropy": 8.924742317199707, - "epoch": 0.16412893019576824, - "grad_norm": 5.174020290374756, - "learning_rate": 9.592295900707115e-06, - "loss": 0.5458, - "mean_token_accuracy": 0.8197515368461609, - "num_tokens": 5549478.0, - "step": 1660 + "epoch": 0.9824006327862369, + "step": 9936, + "train/total_loss": 0.17106975615024567 }, { - "entropy": 8.825971221923828, - "epoch": 0.1661063871860787, - "grad_norm": 6.814822196960449, - "learning_rate": 9.587351035949168e-06, - "loss": 0.5041, - "mean_token_accuracy": 0.8275789350271225, - "num_tokens": 5645183.0, - "step": 1680 + "entropy": 8.67253303527832, + "epoch": 0.9824995056357524, + "mean_token_accuracy": 0.7493606209754944, + "num_tokens": 30921630.0, + "step": 9937, + "train/ce_loss": 0.9833881855010986 }, { - "entropy": 8.594632005691528, - "epoch": 0.16808384417638916, - "grad_norm": 4.346351146697998, - "learning_rate": 9.582406171191218e-06, - "loss": 0.502, - "mean_token_accuracy": 0.8311293244361877, - "num_tokens": 5741648.0, - "step": 1700 + "epoch": 0.9824995056357524, + "step": 9937, + "train/sim_loss": 0.05859375 }, { - "entropy": 8.767394065856934, - "epoch": 0.17006130116669962, - "grad_norm": 5.395818710327148, - "learning_rate": 9.57746130643327e-06, - "loss": 0.5038, - "mean_token_accuracy": 0.830024516582489, - "num_tokens": 5836587.0, - "step": 1720 + "epoch": 0.9824995056357524, + "step": 9937, + "train/total_loss": 0.15693256258964539 }, { - "entropy": 8.472971439361572, - "epoch": 0.17203875815701009, - "grad_norm": 7.172584533691406, - "learning_rate": 9.57251644167532e-06, - "loss": 0.5222, - "mean_token_accuracy": 0.8254323422908783, - "num_tokens": 5933501.0, - "step": 1740 + "entropy": 8.700638771057129, + "epoch": 0.982598378485268, + "mean_token_accuracy": 0.6666666865348816, + "num_tokens": 30926947.0, + "step": 9938, + "train/ce_loss": 0.9952269196510315 }, { - "entropy": 8.652265691757203, - "epoch": 0.17401621514732055, - "grad_norm": 5.1596150398254395, - "learning_rate": 9.567571576917371e-06, - "loss": 0.4964, - "mean_token_accuracy": 0.8323924720287323, - "num_tokens": 6029794.0, - "step": 1760 + "epoch": 0.982598378485268, + "step": 9938, + "train/sim_loss": 0.046875 }, { - "entropy": 8.885914325714111, - "epoch": 0.175993672137631, - "grad_norm": 4.945981979370117, - "learning_rate": 9.562626712159424e-06, - "loss": 0.46, - "mean_token_accuracy": 0.8397042721509933, - "num_tokens": 6125059.0, - "step": 1780 + "epoch": 0.982598378485268, + "step": 9938, + "train/total_loss": 0.1463976949453354 }, { - "entropy": 8.70250141620636, - "epoch": 0.17797112912794147, - "grad_norm": 5.440828800201416, - "learning_rate": 9.557681847401474e-06, - "loss": 0.4916, - "mean_token_accuracy": 0.8350311100482941, - "num_tokens": 6220451.0, - "step": 1800 + "entropy": 9.080299377441406, + "epoch": 0.9826972513347835, + "mean_token_accuracy": 0.8273026347160339, + "num_tokens": 30932016.0, + "step": 9939, + "train/ce_loss": 0.6579766869544983 }, { - "entropy": 8.761249303817749, - "epoch": 0.17994858611825193, - "grad_norm": 5.89918327331543, - "learning_rate": 9.552736982643526e-06, - "loss": 0.4658, - "mean_token_accuracy": 0.8416806638240815, - "num_tokens": 6316271.0, - "step": 1820 + "epoch": 0.9826972513347835, + "step": 9939, + "train/sim_loss": 0.0234375 }, { - "entropy": 8.758138608932494, - "epoch": 0.1819260431085624, - "grad_norm": 7.40993070602417, - "learning_rate": 9.547792117885577e-06, - "loss": 0.5334, - "mean_token_accuracy": 0.8154588431119919, - "num_tokens": 6411681.0, - "step": 1840 + "epoch": 0.9826972513347835, + "step": 9939, + "train/total_loss": 0.08923517167568207 }, { - "entropy": 8.75136857032776, - "epoch": 0.18390350009887285, - "grad_norm": 5.19437313079834, - "learning_rate": 9.542847253127627e-06, - "loss": 0.4862, - "mean_token_accuracy": 0.8305679202079773, - "num_tokens": 6507056.0, - "step": 1860 + "epoch": 0.982796124184299, + "grad_norm": 0.5730013251304626, + "learning_rate": 7.545121890916283e-06, + "loss": 0.1323, + "step": 9940 }, { - "entropy": 8.643301391601563, - "epoch": 0.1858809570891833, - "grad_norm": 4.272485733032227, - "learning_rate": 9.53790238836968e-06, - "loss": 0.5291, - "mean_token_accuracy": 0.8222526699304581, - "num_tokens": 6602845.0, - "step": 1880 + "entropy": 8.32789421081543, + "epoch": 0.982796124184299, + "mean_token_accuracy": 0.7092130780220032, + "num_tokens": 30937548.0, + "step": 9940, + "train/ce_loss": 1.2583142518997192 }, { - "entropy": 8.616838121414185, - "epoch": 0.18785841407949377, - "grad_norm": 6.093313694000244, - "learning_rate": 9.53295752361173e-06, - "loss": 0.3825, - "mean_token_accuracy": 0.8661695659160614, - "num_tokens": 6697828.0, - "step": 1900 + "epoch": 0.982796124184299, + "step": 9940, + "train/sim_loss": 0.04296875 }, { - "entropy": 8.651631712913513, - "epoch": 0.18983587106980424, - "grad_norm": 5.479152679443359, - "learning_rate": 9.528012658853782e-06, - "loss": 0.2848, - "mean_token_accuracy": 0.897405207157135, - "num_tokens": 6791771.0, - "step": 1920 + "epoch": 0.982796124184299, + "step": 9940, + "train/total_loss": 0.16880017518997192 }, { - "entropy": 8.556629657745361, - "epoch": 0.1918133280601147, - "grad_norm": 6.000372886657715, - "learning_rate": 9.523067794095833e-06, - "loss": 0.3001, - "mean_token_accuracy": 0.8973362028598786, - "num_tokens": 6887293.0, - "step": 1940 + "entropy": 8.597822189331055, + "epoch": 0.9828949970338146, + "mean_token_accuracy": 0.7548179626464844, + "num_tokens": 30942910.0, + "step": 9941, + "train/ce_loss": 0.8048480153083801 }, { - "entropy": 8.22054898738861, - "epoch": 0.19379078505042516, - "grad_norm": 6.345143795013428, - "learning_rate": 9.518122929337883e-06, - "loss": 0.3036, - "mean_token_accuracy": 0.8922749698162079, - "num_tokens": 6985554.0, - "step": 1960 + "epoch": 0.9828949970338146, + "step": 9941, + "train/sim_loss": 0.05859375 }, { - "entropy": 8.608243989944459, - "epoch": 0.19576824204073562, - "grad_norm": 5.1964521408081055, - "learning_rate": 9.513178064579934e-06, - "loss": 0.2367, - "mean_token_accuracy": 0.9139017850160599, - "num_tokens": 7080003.0, - "step": 1980 + "epoch": 0.9828949970338146, + "step": 9941, + "train/total_loss": 0.1390785574913025 }, { - "entropy": 8.399808406829834, - "epoch": 0.19774569903104608, - "grad_norm": 4.899987697601318, - "learning_rate": 9.508233199821986e-06, - "loss": 0.2778, - "mean_token_accuracy": 0.9018704712390899, - "num_tokens": 7177420.0, - "step": 2000 + "entropy": 8.579482078552246, + "epoch": 0.98299386988333, + "mean_token_accuracy": 0.7096773982048035, + "num_tokens": 30948202.0, + "step": 9942, + "train/ce_loss": 0.7927011251449585 }, { - "entropy": 8.568553709983826, - "epoch": 0.19972315602135654, - "grad_norm": 6.212925434112549, - "learning_rate": 9.503288335064036e-06, - "loss": 0.2629, - "mean_token_accuracy": 0.9049244940280914, - "num_tokens": 7272883.0, - "step": 2020 + "epoch": 0.98299386988333, + "step": 9942, + "train/sim_loss": 0.03125 }, { - "entropy": 8.240225672721863, - "epoch": 0.201700613011667, - "grad_norm": 53.15958023071289, - "learning_rate": 9.498343470306089e-06, - "loss": 0.2849, - "mean_token_accuracy": 0.9013453364372254, - "num_tokens": 7370192.0, - "step": 2040 + "epoch": 0.98299386988333, + "step": 9942, + "train/total_loss": 0.11052011698484421 }, { - "entropy": 8.387232661247253, - "epoch": 0.20367807000197746, - "grad_norm": 7.635287284851074, - "learning_rate": 9.493398605548139e-06, - "loss": 0.2862, - "mean_token_accuracy": 0.8973095953464508, - "num_tokens": 7466233.0, - "step": 2060 + "entropy": 8.698169708251953, + "epoch": 0.9830927427328455, + "mean_token_accuracy": 0.7479091882705688, + "num_tokens": 30953496.0, + "step": 9943, + "train/ce_loss": 0.5432027578353882 }, { - "entropy": 8.369617199897766, - "epoch": 0.20565552699228792, - "grad_norm": 7.001592636108398, - "learning_rate": 9.48845374079019e-06, - "loss": 0.2931, - "mean_token_accuracy": 0.898612916469574, - "num_tokens": 7563349.0, - "step": 2080 + "epoch": 0.9830927427328455, + "step": 9943, + "train/sim_loss": 0.05078125 }, { - "entropy": 8.632820010185242, - "epoch": 0.20763298398259838, - "grad_norm": 5.053295135498047, - "learning_rate": 9.483508876032242e-06, - "loss": 0.2253, - "mean_token_accuracy": 0.9214951753616333, - "num_tokens": 7657830.0, - "step": 2100 + "epoch": 0.9830927427328455, + "step": 9943, + "train/total_loss": 0.10510152578353882 }, { - "entropy": 8.251531720161438, - "epoch": 0.20961044097290885, - "grad_norm": 9.124457359313965, - "learning_rate": 9.478564011274292e-06, - "loss": 0.2615, - "mean_token_accuracy": 0.9079591751098632, - "num_tokens": 7754133.0, - "step": 2120 + "entropy": 9.422571182250977, + "epoch": 0.9831916155823611, + "mean_token_accuracy": 0.7852882742881775, + "num_tokens": 30958416.0, + "step": 9944, + "train/ce_loss": 1.8939794017569511e-06 }, { - "entropy": 8.382238411903382, - "epoch": 0.2115878979632193, - "grad_norm": 7.134673595428467, - "learning_rate": 9.473619146516345e-06, - "loss": 0.2719, - "mean_token_accuracy": 0.9010010421276092, - "num_tokens": 7850058.0, - "step": 2140 + "epoch": 0.9831916155823611, + "step": 9944, + "train/sim_loss": 0.03125 }, { - "entropy": 8.454780960083008, - "epoch": 0.21356535495352977, - "grad_norm": 6.973474502563477, - "learning_rate": 9.468674281758395e-06, - "loss": 0.2409, - "mean_token_accuracy": 0.9142124205827713, - "num_tokens": 7945426.0, - "step": 2160 + "epoch": 0.9831916155823611, + "step": 9944, + "train/total_loss": 0.03125018998980522 }, { - "entropy": 8.543955183029174, - "epoch": 0.21554281194384023, - "grad_norm": 5.735151290893555, - "learning_rate": 9.463729417000446e-06, - "loss": 0.2635, - "mean_token_accuracy": 0.9128041237592697, - "num_tokens": 8041052.0, - "step": 2180 + "entropy": 8.789154052734375, + "epoch": 0.9832904884318766, + "mean_token_accuracy": 0.7508854866027832, + "num_tokens": 30963690.0, + "step": 9945, + "train/ce_loss": 0.2710076868534088 }, { - "entropy": 8.450089383125306, - "epoch": 0.2175202689341507, - "grad_norm": 12.039443016052246, - "learning_rate": 9.458784552242498e-06, - "loss": 0.2284, - "mean_token_accuracy": 0.9211171269416809, - "num_tokens": 8136902.0, - "step": 2200 + "epoch": 0.9832904884318766, + "step": 9945, + "train/sim_loss": 0.01171875 }, { - "entropy": 8.344897627830505, - "epoch": 0.21949772592446115, - "grad_norm": 9.118818283081055, - "learning_rate": 9.453839687484548e-06, - "loss": 0.2475, - "mean_token_accuracy": 0.9140394806861878, - "num_tokens": 8233112.0, - "step": 2220 + "epoch": 0.9832904884318766, + "step": 9945, + "train/total_loss": 0.03881952166557312 }, { - "entropy": 8.570263671875, - "epoch": 0.2214751829147716, - "grad_norm": 5.619281768798828, - "learning_rate": 9.4488948227266e-06, - "loss": 0.2692, - "mean_token_accuracy": 0.9032078206539154, - "num_tokens": 8328963.0, - "step": 2240 + "entropy": 8.839859008789062, + "epoch": 0.9833893612813921, + "mean_token_accuracy": 0.708020031452179, + "num_tokens": 30968956.0, + "step": 9946, + "train/ce_loss": 0.5631778836250305 }, { - "entropy": 8.43959300518036, - "epoch": 0.22345263990508207, - "grad_norm": 5.591318607330322, - "learning_rate": 9.44394995796865e-06, - "loss": 0.2655, - "mean_token_accuracy": 0.9060284614562988, - "num_tokens": 8426064.0, - "step": 2260 + "epoch": 0.9833893612813921, + "step": 9946, + "train/sim_loss": 0.0703125 }, { - "entropy": 8.402120852470398, - "epoch": 0.22543009689539253, - "grad_norm": 6.271893501281738, - "learning_rate": 9.439005093210701e-06, - "loss": 0.2537, - "mean_token_accuracy": 0.9108969271183014, - "num_tokens": 8522366.0, - "step": 2280 + "epoch": 0.9833893612813921, + "step": 9946, + "train/total_loss": 0.1266302913427353 }, { - "entropy": 8.449602079391479, - "epoch": 0.227407553885703, - "grad_norm": 6.051339626312256, - "learning_rate": 9.434060228452752e-06, - "loss": 0.2335, - "mean_token_accuracy": 0.9169043987989426, - "num_tokens": 8617659.0, - "step": 2300 + "entropy": 8.943111419677734, + "epoch": 0.9834882341309077, + "mean_token_accuracy": 0.7838214635848999, + "num_tokens": 30974140.0, + "step": 9947, + "train/ce_loss": 3.6909526102135715e-07 }, { - "entropy": 8.50084822177887, - "epoch": 0.22938501087601346, - "grad_norm": 8.404824256896973, - "learning_rate": 9.429115363694804e-06, - "loss": 0.2561, - "mean_token_accuracy": 0.9115628123283386, - "num_tokens": 8713565.0, - "step": 2320 + "epoch": 0.9834882341309077, + "step": 9947, + "train/sim_loss": 0.0390625 }, { - "entropy": 8.512407898902893, - "epoch": 0.23136246786632392, - "grad_norm": 6.2526469230651855, - "learning_rate": 9.424170498936855e-06, - "loss": 0.2806, - "mean_token_accuracy": 0.9018924921751023, - "num_tokens": 8810125.0, - "step": 2340 + "epoch": 0.9834882341309077, + "step": 9947, + "train/total_loss": 0.039062537252902985 }, { - "entropy": 8.387197065353394, - "epoch": 0.23333992485663438, - "grad_norm": 4.145463466644287, - "learning_rate": 9.419225634178905e-06, - "loss": 0.2608, - "mean_token_accuracy": 0.9093194723129272, - "num_tokens": 8907004.0, - "step": 2360 + "entropy": 9.292435646057129, + "epoch": 0.9835871069804232, + "mean_token_accuracy": 0.8214936256408691, + "num_tokens": 30979104.0, + "step": 9948, + "train/ce_loss": 1.589453262340612e-07 }, { - "entropy": 8.361292695999145, - "epoch": 0.23531738184694484, - "grad_norm": 5.790935516357422, - "learning_rate": 9.414280769420957e-06, - "loss": 0.2387, - "mean_token_accuracy": 0.9160567998886109, - "num_tokens": 9003321.0, - "step": 2380 + "epoch": 0.9835871069804232, + "step": 9948, + "train/sim_loss": 0.01953125 }, { - "entropy": 8.301017570495606, - "epoch": 0.2372948388372553, - "grad_norm": 5.417795181274414, - "learning_rate": 9.409335904663008e-06, - "loss": 0.247, - "mean_token_accuracy": 0.9102832496166229, - "num_tokens": 9100423.0, - "step": 2400 + "epoch": 0.9835871069804232, + "step": 9948, + "train/total_loss": 0.019531266763806343 }, { - "entropy": 8.47871811389923, - "epoch": 0.23927229582756576, - "grad_norm": 7.506371974945068, - "learning_rate": 9.40439103990506e-06, - "loss": 0.2535, - "mean_token_accuracy": 0.911874744296074, - "num_tokens": 9195832.0, - "step": 2420 + "entropy": 8.4045991897583, + "epoch": 0.9836859798299387, + "mean_token_accuracy": 0.7729138135910034, + "num_tokens": 30984340.0, + "step": 9949, + "train/ce_loss": 0.5247573256492615 }, { - "entropy": 8.220595622062683, - "epoch": 0.24124975281787622, - "grad_norm": 4.856903553009033, - "learning_rate": 9.39944617514711e-06, - "loss": 0.2402, - "mean_token_accuracy": 0.9161158621311187, - "num_tokens": 9291370.0, - "step": 2440 + "epoch": 0.9836859798299387, + "step": 9949, + "train/sim_loss": 0.0390625 }, { - "entropy": 8.25529749393463, - "epoch": 0.24322720980818668, - "grad_norm": 7.935940742492676, - "learning_rate": 9.394501310389161e-06, - "loss": 0.2566, - "mean_token_accuracy": 0.9096111744642258, - "num_tokens": 9387905.0, - "step": 2460 + "epoch": 0.9836859798299387, + "step": 9949, + "train/total_loss": 0.09153823554515839 }, { - "entropy": 8.375088810920715, - "epoch": 0.24520466679849715, - "grad_norm": 5.770101070404053, - "learning_rate": 9.389556445631213e-06, - "loss": 0.2571, - "mean_token_accuracy": 0.9083121955394745, - "num_tokens": 9484096.0, - "step": 2480 + "entropy": 8.655339241027832, + "epoch": 0.9837848526794543, + "mean_token_accuracy": 0.8016877770423889, + "num_tokens": 30989909.0, + "step": 9950, + "train/ce_loss": 0.4090072810649872 }, { - "entropy": 8.423392510414123, - "epoch": 0.2471821237888076, - "grad_norm": 10.278122901916504, - "learning_rate": 9.384611580873264e-06, - "loss": 0.2434, - "mean_token_accuracy": 0.9145131319761276, - "num_tokens": 9579726.0, - "step": 2500 + "epoch": 0.9837848526794543, + "step": 9950, + "train/sim_loss": 0.015625 }, { - "entropy": 8.352119731903077, - "epoch": 0.24915958077911807, - "grad_norm": 6.051823139190674, - "learning_rate": 9.379666716115316e-06, - "loss": 0.2279, - "mean_token_accuracy": 0.9206470519304275, - "num_tokens": 94552.0, - "step": 2520 + "epoch": 0.9837848526794543, + "step": 9950, + "train/total_loss": 0.05652572959661484 }, { - "entropy": 8.018824338912964, - "epoch": 0.25113703776942853, - "grad_norm": 5.889877796173096, - "learning_rate": 9.374721851357365e-06, - "loss": 0.2757, - "mean_token_accuracy": 0.9056919336318969, - "num_tokens": 191458.0, - "step": 2540 + "entropy": 8.748329162597656, + "epoch": 0.9838837255289697, + "mean_token_accuracy": 0.7439024448394775, + "num_tokens": 30995145.0, + "step": 9951, + "train/ce_loss": 0.8711205720901489 }, { - "entropy": 8.595752286911011, - "epoch": 0.25311449475973896, - "grad_norm": 8.249797821044922, - "learning_rate": 9.369776986599417e-06, - "loss": 0.2336, - "mean_token_accuracy": 0.9185534924268722, - "num_tokens": 286576.0, - "step": 2560 + "epoch": 0.9838837255289697, + "step": 9951, + "train/sim_loss": 0.03515625 }, { - "entropy": 8.46731448173523, - "epoch": 0.25509195175004945, - "grad_norm": 8.037137031555176, - "learning_rate": 9.364832121841468e-06, - "loss": 0.2336, - "mean_token_accuracy": 0.9169575601816178, - "num_tokens": 381880.0, - "step": 2580 + "epoch": 0.9838837255289697, + "step": 9951, + "train/total_loss": 0.12226831167936325 }, { - "entropy": 8.412287640571595, - "epoch": 0.2570694087403599, - "grad_norm": 4.862310409545898, - "learning_rate": 9.35988725708352e-06, - "loss": 0.2481, - "mean_token_accuracy": 0.9106969445943832, - "num_tokens": 478028.0, - "step": 2600 + "entropy": 8.843263626098633, + "epoch": 0.9839825983784852, + "mean_token_accuracy": 0.7362045645713806, + "num_tokens": 31000515.0, + "step": 9952, + "train/ce_loss": 1.092915415763855 }, { - "entropy": 8.470451760292054, - "epoch": 0.2590468657306704, - "grad_norm": 5.986464023590088, - "learning_rate": 9.35494239232557e-06, - "loss": 0.2258, - "mean_token_accuracy": 0.9217559069395065, - "num_tokens": 573294.0, - "step": 2620 + "epoch": 0.9839825983784852, + "step": 9952, + "train/sim_loss": 0.10546875 }, { - "entropy": 8.314798545837402, - "epoch": 0.2610243227209808, - "grad_norm": 7.20578670501709, - "learning_rate": 9.34999752756762e-06, - "loss": 0.262, - "mean_token_accuracy": 0.9090510606765747, - "num_tokens": 671030.0, - "step": 2640 + "epoch": 0.9839825983784852, + "step": 9952, + "train/total_loss": 0.21476030349731445 }, { - "entropy": 8.340648698806763, - "epoch": 0.2630017797112913, - "grad_norm": 18.52384376525879, - "learning_rate": 9.345052662809673e-06, - "loss": 0.2528, - "mean_token_accuracy": 0.911935618519783, - "num_tokens": 767830.0, - "step": 2660 + "entropy": 8.46907901763916, + "epoch": 0.9840814712280008, + "mean_token_accuracy": 0.7589175701141357, + "num_tokens": 31005985.0, + "step": 9953, + "train/ce_loss": 0.49849000573158264 }, { - "entropy": 8.39926905632019, - "epoch": 0.26497923670160173, - "grad_norm": 6.089130878448486, - "learning_rate": 9.340107798051723e-06, - "loss": 0.2507, - "mean_token_accuracy": 0.9114709228277207, - "num_tokens": 864749.0, - "step": 2680 + "epoch": 0.9840814712280008, + "step": 9953, + "train/sim_loss": 0.0703125 }, { - "entropy": 8.36897315979004, - "epoch": 0.2669566936919122, - "grad_norm": 13.114466667175293, - "learning_rate": 9.335162933293776e-06, - "loss": 0.2604, - "mean_token_accuracy": 0.9096815407276153, - "num_tokens": 961252.0, - "step": 2700 + "epoch": 0.9840814712280008, + "step": 9953, + "train/total_loss": 0.1201615035533905 }, { - "entropy": 8.219355010986328, - "epoch": 0.26893415068222265, - "grad_norm": 7.862151145935059, - "learning_rate": 9.330218068535826e-06, - "loss": 0.2229, - "mean_token_accuracy": 0.9226343750953674, - "num_tokens": 1057981.0, - "step": 2720 + "entropy": 8.036636352539062, + "epoch": 0.9841803440775163, + "mean_token_accuracy": 0.7485265135765076, + "num_tokens": 31011469.0, + "step": 9954, + "train/ce_loss": 1.0809634923934937 }, { - "entropy": 8.18206970691681, - "epoch": 0.27091160767253314, - "grad_norm": 12.203274726867676, - "learning_rate": 9.325273203777877e-06, - "loss": 0.2213, - "mean_token_accuracy": 0.9267831236124039, - "num_tokens": 1155711.0, - "step": 2740 + "epoch": 0.9841803440775163, + "step": 9954, + "train/sim_loss": 0.05078125 }, { - "entropy": 8.399131608009338, - "epoch": 0.2728890646628436, - "grad_norm": 25.41950225830078, - "learning_rate": 9.320328339019929e-06, - "loss": 0.2235, - "mean_token_accuracy": 0.9188963174819946, - "num_tokens": 1251932.0, - "step": 2760 + "epoch": 0.9841803440775163, + "step": 9954, + "train/total_loss": 0.15887761116027832 }, { - "entropy": 8.395215249061584, - "epoch": 0.27486652165315406, - "grad_norm": 7.615808486938477, - "learning_rate": 9.31538347426198e-06, - "loss": 0.2124, - "mean_token_accuracy": 0.9265283226966858, - "num_tokens": 1347603.0, - "step": 2780 + "entropy": 9.49459457397461, + "epoch": 0.9842792169270318, + "mean_token_accuracy": 0.7575757503509521, + "num_tokens": 31016328.0, + "step": 9955, + "train/ce_loss": 1.3868564367294312 }, { - "entropy": 8.609463143348695, - "epoch": 0.2768439786434645, - "grad_norm": 6.715790271759033, - "learning_rate": 9.310438609504032e-06, - "loss": 0.2154, - "mean_token_accuracy": 0.9271300345659256, - "num_tokens": 1443656.0, - "step": 2800 + "epoch": 0.9842792169270318, + "step": 9955, + "train/sim_loss": 0.03125 }, { - "entropy": 8.485823464393615, - "epoch": 0.278821435633775, - "grad_norm": 4.69348669052124, - "learning_rate": 9.305493744746082e-06, - "loss": 0.201, - "mean_token_accuracy": 0.9283007353544235, - "num_tokens": 1539190.0, - "step": 2820 + "epoch": 0.9842792169270318, + "step": 9955, + "train/total_loss": 0.16993564367294312 }, { - "entropy": 8.613473558425904, - "epoch": 0.2807988926240854, - "grad_norm": 6.357530117034912, - "learning_rate": 9.300548879988133e-06, - "loss": 0.2073, - "mean_token_accuracy": 0.9269598335027694, - "num_tokens": 1635783.0, - "step": 2840 + "entropy": 8.62130355834961, + "epoch": 0.9843780897765474, + "mean_token_accuracy": 0.7247806787490845, + "num_tokens": 31021779.0, + "step": 9956, + "train/ce_loss": 0.7187850475311279 }, { - "entropy": 8.747143530845642, - "epoch": 0.2827763496143959, - "grad_norm": 5.147572040557861, - "learning_rate": 9.295604015230185e-06, - "loss": 0.2176, - "mean_token_accuracy": 0.9254478424787521, - "num_tokens": 1731022.0, - "step": 2860 + "epoch": 0.9843780897765474, + "step": 9956, + "train/sim_loss": 0.03515625 }, { - "entropy": 8.297242498397827, - "epoch": 0.28475380660470634, - "grad_norm": 9.231119155883789, - "learning_rate": 9.290659150472235e-06, - "loss": 0.2348, - "mean_token_accuracy": 0.9174753576517105, - "num_tokens": 1828801.0, - "step": 2880 + "epoch": 0.9843780897765474, + "step": 9956, + "train/total_loss": 0.10703475773334503 }, { - "entropy": 8.60151228904724, - "epoch": 0.2867312635950168, - "grad_norm": 6.1927971839904785, - "learning_rate": 9.285714285714288e-06, - "loss": 0.2071, - "mean_token_accuracy": 0.9249164193868638, - "num_tokens": 1925147.0, - "step": 2900 + "entropy": 8.984066009521484, + "epoch": 0.9844769626260629, + "mean_token_accuracy": 0.7504835724830627, + "num_tokens": 31026751.0, + "step": 9957, + "train/ce_loss": 0.8555747866630554 }, { - "entropy": 8.34753065109253, - "epoch": 0.28870872058532726, - "grad_norm": 8.126082420349121, - "learning_rate": 9.280769420956338e-06, - "loss": 0.2342, - "mean_token_accuracy": 0.9167996197938919, - "num_tokens": 2021743.0, - "step": 2920 + "epoch": 0.9844769626260629, + "step": 9957, + "train/sim_loss": 0.02734375 }, { - "entropy": 8.526820945739747, - "epoch": 0.29068617757563775, - "grad_norm": 7.052775859832764, - "learning_rate": 9.275824556198389e-06, - "loss": 0.2171, - "mean_token_accuracy": 0.9249680280685425, - "num_tokens": 2117695.0, - "step": 2940 + "epoch": 0.9844769626260629, + "step": 9957, + "train/total_loss": 0.1129012331366539 }, { - "entropy": 8.464531111717225, - "epoch": 0.2926636345659482, - "grad_norm": 7.4120635986328125, - "learning_rate": 9.270879691440439e-06, - "loss": 0.2255, - "mean_token_accuracy": 0.9214257657527923, - "num_tokens": 2213305.0, - "step": 2960 + "entropy": 8.909862518310547, + "epoch": 0.9845758354755784, + "mean_token_accuracy": 0.7599451541900635, + "num_tokens": 31031971.0, + "step": 9958, + "train/ce_loss": 1.0698707103729248 }, { - "entropy": 8.495063853263854, - "epoch": 0.29464109155625867, - "grad_norm": 5.081912517547607, - "learning_rate": 9.265934826682491e-06, - "loss": 0.2067, - "mean_token_accuracy": 0.9276946157217025, - "num_tokens": 2308033.0, - "step": 2980 + "epoch": 0.9845758354755784, + "step": 9958, + "train/sim_loss": 0.05859375 }, { - "entropy": 8.457685017585755, - "epoch": 0.2966185485465691, - "grad_norm": 14.727659225463867, - "learning_rate": 9.260989961924542e-06, - "loss": 0.2239, - "mean_token_accuracy": 0.9228915780782699, - "num_tokens": 2402735.0, - "step": 3000 + "epoch": 0.9845758354755784, + "step": 9958, + "train/total_loss": 0.16558082401752472 }, { - "entropy": 8.324458050727845, - "epoch": 0.2985960055368796, - "grad_norm": 6.275546550750732, - "learning_rate": 9.256045097166592e-06, - "loss": 0.2464, - "mean_token_accuracy": 0.9135557144880295, - "num_tokens": 2498388.0, - "step": 3020 + "entropy": 8.192459106445312, + "epoch": 0.984674708325094, + "mean_token_accuracy": 0.7552602291107178, + "num_tokens": 31037396.0, + "step": 9959, + "train/ce_loss": 0.49683094024658203 }, { - "entropy": 8.748064064979554, - "epoch": 0.30057346252719, - "grad_norm": 7.923785209655762, - "learning_rate": 9.251100232408645e-06, - "loss": 0.2379, - "mean_token_accuracy": 0.9163645774126052, - "num_tokens": 2592789.0, - "step": 3040 + "epoch": 0.984674708325094, + "step": 9959, + "train/sim_loss": 0.05859375 }, { - "entropy": 8.496152544021607, - "epoch": 0.3025509195175005, - "grad_norm": 8.312590599060059, - "learning_rate": 9.246155367650695e-06, - "loss": 0.2301, - "mean_token_accuracy": 0.9194170475006104, - "num_tokens": 2687931.0, - "step": 3060 + "epoch": 0.984674708325094, + "step": 9959, + "train/total_loss": 0.1082768440246582 }, { - "entropy": 8.400599956512451, - "epoch": 0.30452837650781095, - "grad_norm": 6.803915023803711, - "learning_rate": 9.241210502892747e-06, - "loss": 0.2576, - "mean_token_accuracy": 0.91059789955616, - "num_tokens": 2782908.0, - "step": 3080 + "epoch": 0.9847735811746094, + "grad_norm": 0.6442089080810547, + "learning_rate": 7.540177026158335e-06, + "loss": 0.1207, + "step": 9960 }, { - "entropy": 8.42363269329071, - "epoch": 0.30650583349812144, - "grad_norm": 7.424758434295654, - "learning_rate": 9.236265638134798e-06, - "loss": 0.2511, - "mean_token_accuracy": 0.9122053682804108, - "num_tokens": 2878717.0, - "step": 3100 + "entropy": 8.275558471679688, + "epoch": 0.9847735811746094, + "mean_token_accuracy": 0.7203311920166016, + "num_tokens": 31042946.0, + "step": 9960, + "train/ce_loss": 0.5132285356521606 }, { - "entropy": 8.354277396202088, - "epoch": 0.30848329048843187, - "grad_norm": 10.31395149230957, - "learning_rate": 9.231320773376848e-06, - "loss": 0.2632, - "mean_token_accuracy": 0.9108431339263916, - "num_tokens": 2973904.0, - "step": 3120 + "epoch": 0.9847735811746094, + "step": 9960, + "train/sim_loss": 0.03515625 }, { - "entropy": 8.381376791000367, - "epoch": 0.31046074747874236, - "grad_norm": 6.864160060882568, - "learning_rate": 9.2263759086189e-06, - "loss": 0.2618, - "mean_token_accuracy": 0.9079474300146103, - "num_tokens": 3069688.0, - "step": 3140 + "epoch": 0.9847735811746094, + "step": 9960, + "train/total_loss": 0.08647910505533218 }, { - "entropy": 8.298176574707032, - "epoch": 0.3124382044690528, - "grad_norm": 22.918903350830078, - "learning_rate": 9.221431043860951e-06, - "loss": 0.2748, - "mean_token_accuracy": 0.9050507992506027, - "num_tokens": 3165367.0, - "step": 3160 + "entropy": 8.636802673339844, + "epoch": 0.9848724540241249, + "mean_token_accuracy": 0.7493857741355896, + "num_tokens": 31048241.0, + "step": 9961, + "train/ce_loss": 0.7567731142044067 }, { - "entropy": 8.527709984779358, - "epoch": 0.3144156614593633, - "grad_norm": 6.728426456451416, - "learning_rate": 9.216486179103003e-06, - "loss": 0.2617, - "mean_token_accuracy": 0.9109597116708755, - "num_tokens": 3260693.0, - "step": 3180 + "epoch": 0.9848724540241249, + "step": 9961, + "train/sim_loss": 0.05078125 }, { - "entropy": 8.394654035568237, - "epoch": 0.3163931184496737, - "grad_norm": 10.421079635620117, - "learning_rate": 9.211541314345054e-06, - "loss": 0.2659, - "mean_token_accuracy": 0.9066603630781174, - "num_tokens": 3357544.0, - "step": 3200 + "epoch": 0.9848724540241249, + "step": 9961, + "train/total_loss": 0.1264585554599762 }, { - "entropy": 8.543966937065125, - "epoch": 0.3183705754399842, - "grad_norm": 14.038777351379395, - "learning_rate": 9.206596449587104e-06, - "loss": 0.2622, - "mean_token_accuracy": 0.9077901512384414, - "num_tokens": 3452556.0, - "step": 3220 + "entropy": 8.473033905029297, + "epoch": 0.9849713268736405, + "mean_token_accuracy": 0.7456258535385132, + "num_tokens": 31053448.0, + "step": 9962, + "train/ce_loss": 0.773692786693573 }, { - "entropy": 8.68721113204956, - "epoch": 0.32034803243029464, - "grad_norm": 9.478950500488281, - "learning_rate": 9.201651584829155e-06, - "loss": 0.2415, - "mean_token_accuracy": 0.9130460679531097, - "num_tokens": 3547183.0, - "step": 3240 + "epoch": 0.9849713268736405, + "step": 9962, + "train/sim_loss": 0.0390625 }, { - "entropy": 8.325738787651062, - "epoch": 0.3223254894206051, - "grad_norm": 26.005895614624023, - "learning_rate": 9.196706720071207e-06, - "loss": 0.2878, - "mean_token_accuracy": 0.9011870890855789, - "num_tokens": 3644597.0, - "step": 3260 + "epoch": 0.9849713268736405, + "step": 9962, + "train/total_loss": 0.11643178015947342 }, { - "entropy": 8.690507102012635, - "epoch": 0.32430294641091556, - "grad_norm": 8.384211540222168, - "learning_rate": 9.191761855313257e-06, - "loss": 0.2553, - "mean_token_accuracy": 0.9089549452066421, - "num_tokens": 3739837.0, - "step": 3280 + "entropy": 8.31374740600586, + "epoch": 0.985070199723156, + "mean_token_accuracy": 0.7385475039482117, + "num_tokens": 31058793.0, + "step": 9963, + "train/ce_loss": 1.0783629417419434 }, { - "entropy": 8.425889158248902, - "epoch": 0.32628040340122605, - "grad_norm": 6.265214443206787, - "learning_rate": 9.18681699055531e-06, - "loss": 0.2452, - "mean_token_accuracy": 0.913366386294365, - "num_tokens": 3835291.0, - "step": 3300 + "epoch": 0.985070199723156, + "step": 9963, + "train/sim_loss": 0.046875 }, { - "entropy": 8.370037078857422, - "epoch": 0.3282578603915365, - "grad_norm": 6.2291340827941895, - "learning_rate": 9.18187212579736e-06, - "loss": 0.269, - "mean_token_accuracy": 0.9074391812086106, - "num_tokens": 3932381.0, - "step": 3320 + "epoch": 0.985070199723156, + "step": 9963, + "train/total_loss": 0.1547113060951233 }, { - "entropy": 8.544364070892334, - "epoch": 0.33023531738184697, - "grad_norm": 8.258687019348145, - "learning_rate": 9.17692726103941e-06, - "loss": 0.2444, - "mean_token_accuracy": 0.9135472714900971, - "num_tokens": 4028469.0, - "step": 3340 + "entropy": 8.551907539367676, + "epoch": 0.9851690725726715, + "mean_token_accuracy": 0.7377245426177979, + "num_tokens": 31064149.0, + "step": 9964, + "train/ce_loss": 1.03456449508667 }, { - "entropy": 8.355397725105286, - "epoch": 0.3322127743721574, - "grad_norm": 7.2240705490112305, - "learning_rate": 9.171982396281463e-06, - "loss": 0.2761, - "mean_token_accuracy": 0.9035510569810867, - "num_tokens": 4125264.0, - "step": 3360 + "epoch": 0.9851690725726715, + "step": 9964, + "train/sim_loss": 0.07421875 }, { - "entropy": 8.592670917510986, - "epoch": 0.3341902313624679, - "grad_norm": 7.992181777954102, - "learning_rate": 9.167037531523513e-06, - "loss": 0.2298, - "mean_token_accuracy": 0.9165220469236374, - "num_tokens": 4219780.0, - "step": 3380 + "epoch": 0.9851690725726715, + "step": 9964, + "train/total_loss": 0.17767520248889923 }, { - "entropy": 8.532916688919068, - "epoch": 0.3361676883527783, - "grad_norm": 12.373023986816406, - "learning_rate": 9.162092666765566e-06, - "loss": 0.2386, - "mean_token_accuracy": 0.9183211266994477, - "num_tokens": 4315305.0, - "step": 3400 + "entropy": 8.632230758666992, + "epoch": 0.9852679454221871, + "mean_token_accuracy": 0.751396656036377, + "num_tokens": 31069309.0, + "step": 9965, + "train/ce_loss": 0.8975366353988647 }, { - "entropy": 8.44810152053833, - "epoch": 0.3381451453430888, - "grad_norm": 8.67602252960205, - "learning_rate": 9.157147802007616e-06, - "loss": 0.2649, - "mean_token_accuracy": 0.903523588180542, - "num_tokens": 4411489.0, - "step": 3420 + "epoch": 0.9852679454221871, + "step": 9965, + "train/sim_loss": 0.0390625 }, { - "entropy": 8.477056670188905, - "epoch": 0.34012260233339925, - "grad_norm": 6.801801681518555, - "learning_rate": 9.152202937249667e-06, - "loss": 0.2428, - "mean_token_accuracy": 0.9140433698892594, - "num_tokens": 4506957.0, - "step": 3440 + "epoch": 0.9852679454221871, + "step": 9965, + "train/total_loss": 0.128816157579422 }, { - "entropy": 8.282349920272827, - "epoch": 0.3421000593237097, - "grad_norm": 7.049509525299072, - "learning_rate": 9.147258072491719e-06, - "loss": 0.2623, - "mean_token_accuracy": 0.9076365619897843, - "num_tokens": 4603240.0, - "step": 3460 + "entropy": 8.25493049621582, + "epoch": 0.9853668182717026, + "mean_token_accuracy": 0.7153518199920654, + "num_tokens": 31074731.0, + "step": 9966, + "train/ce_loss": 1.4290907382965088 }, { - "entropy": 8.413060212135315, - "epoch": 0.34407751631402017, - "grad_norm": 5.879739761352539, - "learning_rate": 9.14231320773377e-06, - "loss": 0.2327, - "mean_token_accuracy": 0.9164053320884704, - "num_tokens": 4697425.0, - "step": 3480 + "epoch": 0.9853668182717026, + "step": 9966, + "train/sim_loss": 0.046875 }, { - "entropy": 8.434215950965882, - "epoch": 0.3460549733043306, - "grad_norm": 5.856489658355713, - "learning_rate": 9.137368342975821e-06, - "loss": 0.2592, - "mean_token_accuracy": 0.914098185300827, - "num_tokens": 4793533.0, - "step": 3500 + "epoch": 0.9853668182717026, + "step": 9966, + "train/total_loss": 0.18978407979011536 }, { - "entropy": 8.238046503067016, - "epoch": 0.3480324302946411, - "grad_norm": 6.225820064544678, - "learning_rate": 9.132423478217872e-06, - "loss": 0.255, - "mean_token_accuracy": 0.9096780836582183, - "num_tokens": 4890051.0, - "step": 3520 + "entropy": 8.804088592529297, + "epoch": 0.9854656911212181, + "mean_token_accuracy": 0.737500011920929, + "num_tokens": 31079930.0, + "step": 9967, + "train/ce_loss": 0.7617242336273193 }, { - "entropy": 8.449634981155395, - "epoch": 0.3500098872849515, - "grad_norm": 7.62793493270874, - "learning_rate": 9.127478613459922e-06, - "loss": 0.2781, - "mean_token_accuracy": 0.9047579407691956, - "num_tokens": 4985902.0, - "step": 3540 + "epoch": 0.9854656911212181, + "step": 9967, + "train/sim_loss": 0.05078125 }, { - "entropy": 8.475815391540527, - "epoch": 0.351987344275262, - "grad_norm": 10.578008651733398, - "learning_rate": 9.122533748701975e-06, - "loss": 0.2402, - "mean_token_accuracy": 0.9141593277454376, - "num_tokens": 5081308.0, - "step": 3560 + "epoch": 0.9854656911212181, + "step": 9967, + "train/total_loss": 0.12695367634296417 }, { - "entropy": 8.44127995967865, - "epoch": 0.35396480126557245, - "grad_norm": 6.785723686218262, - "learning_rate": 9.117588883944025e-06, - "loss": 0.2407, - "mean_token_accuracy": 0.9155316352844238, - "num_tokens": 5176522.0, - "step": 3580 + "entropy": 8.463518142700195, + "epoch": 0.9855645639707337, + "mean_token_accuracy": 0.7080292105674744, + "num_tokens": 31085205.0, + "step": 9968, + "train/ce_loss": 0.8394144177436829 }, { - "entropy": 8.48484799861908, - "epoch": 0.35594225825588294, - "grad_norm": 4.562641620635986, - "learning_rate": 9.112644019186077e-06, - "loss": 0.2447, - "mean_token_accuracy": 0.9130895644426346, - "num_tokens": 5271373.0, - "step": 3600 + "epoch": 0.9855645639707337, + "step": 9968, + "train/sim_loss": 0.01953125 }, { - "entropy": 8.343516564369201, - "epoch": 0.35791971524619337, - "grad_norm": 5.01414680480957, - "learning_rate": 9.107699154428126e-06, - "loss": 0.2752, - "mean_token_accuracy": 0.9001256883144378, - "num_tokens": 5368016.0, - "step": 3620 + "epoch": 0.9855645639707337, + "step": 9968, + "train/total_loss": 0.10347269475460052 }, { - "entropy": 8.318782877922057, - "epoch": 0.35989717223650386, - "grad_norm": 3.8312673568725586, - "learning_rate": 9.102754289670178e-06, - "loss": 0.2554, - "mean_token_accuracy": 0.9118220657110214, - "num_tokens": 5464498.0, - "step": 3640 + "entropy": 8.643590927124023, + "epoch": 0.9856634368202492, + "mean_token_accuracy": 0.7765700221061707, + "num_tokens": 31090452.0, + "step": 9969, + "train/ce_loss": 0.3944287896156311 }, { - "entropy": 8.310300302505492, - "epoch": 0.3618746292268143, - "grad_norm": 5.629157543182373, - "learning_rate": 9.097809424912229e-06, - "loss": 0.2464, - "mean_token_accuracy": 0.9146625936031342, - "num_tokens": 5559876.0, - "step": 3660 + "epoch": 0.9856634368202492, + "step": 9969, + "train/sim_loss": 0.015625 }, { - "entropy": 8.620506691932679, - "epoch": 0.3638520862171248, - "grad_norm": 5.579307556152344, - "learning_rate": 9.092864560154281e-06, - "loss": 0.259, - "mean_token_accuracy": 0.9087336152791977, - "num_tokens": 5655476.0, - "step": 3680 + "epoch": 0.9856634368202492, + "step": 9969, + "train/total_loss": 0.05506787821650505 }, { - "entropy": 8.480404424667359, - "epoch": 0.3658295432074352, - "grad_norm": 4.763086795806885, - "learning_rate": 9.087919695396332e-06, - "loss": 0.2513, - "mean_token_accuracy": 0.9134955763816833, - "num_tokens": 5752407.0, - "step": 3700 + "entropy": 8.440271377563477, + "epoch": 0.9857623096697646, + "mean_token_accuracy": 0.7650334239006042, + "num_tokens": 31095819.0, + "step": 9970, + "train/ce_loss": 0.8097338676452637 }, { - "entropy": 8.338110160827636, - "epoch": 0.3678070001977457, - "grad_norm": 5.0226149559021, - "learning_rate": 9.082974830638382e-06, - "loss": 0.2469, - "mean_token_accuracy": 0.9148130536079406, - "num_tokens": 5847749.0, - "step": 3720 + "epoch": 0.9857623096697646, + "step": 9970, + "train/sim_loss": 0.046875 }, { - "entropy": 8.353825235366822, - "epoch": 0.36978445718805614, - "grad_norm": 6.5569257736206055, - "learning_rate": 9.078029965880434e-06, - "loss": 0.2718, - "mean_token_accuracy": 0.9040377497673034, - "num_tokens": 5943456.0, - "step": 3740 + "epoch": 0.9857623096697646, + "step": 9970, + "train/total_loss": 0.12784838676452637 }, { - "entropy": 8.457132935523987, - "epoch": 0.3717619141783666, - "grad_norm": 5.592168807983398, - "learning_rate": 9.073085101122485e-06, - "loss": 0.2645, - "mean_token_accuracy": 0.9071003496646881, - "num_tokens": 6039992.0, - "step": 3760 + "entropy": 8.892143249511719, + "epoch": 0.9858611825192802, + "mean_token_accuracy": 0.7469244003295898, + "num_tokens": 31100858.0, + "step": 9971, + "train/ce_loss": 1.3040673732757568 }, { - "entropy": 8.454567408561706, - "epoch": 0.37373937116867706, - "grad_norm": 6.413713455200195, - "learning_rate": 9.068140236364537e-06, - "loss": 0.2459, - "mean_token_accuracy": 0.9136128157377243, - "num_tokens": 6136190.0, - "step": 3780 + "epoch": 0.9858611825192802, + "step": 9971, + "train/sim_loss": 0.04296875 }, { - "entropy": 8.557881689071655, - "epoch": 0.37571682815898755, - "grad_norm": 6.823557376861572, - "learning_rate": 9.063195371606588e-06, - "loss": 0.2626, - "mean_token_accuracy": 0.908292618393898, - "num_tokens": 6232638.0, - "step": 3800 + "epoch": 0.9858611825192802, + "step": 9971, + "train/total_loss": 0.17337548732757568 + }, + { + "entropy": 9.170970916748047, + "epoch": 0.9859600553687957, + "mean_token_accuracy": 0.8060606122016907, + "num_tokens": 31105753.0, + "step": 9972, + "train/ce_loss": 1.0176302194595337 }, { - "entropy": 8.568648982048035, - "epoch": 0.377694285149298, - "grad_norm": 14.621783256530762, - "learning_rate": 9.058250506848638e-06, - "loss": 0.276, - "mean_token_accuracy": 0.9028916656970978, - "num_tokens": 6329318.0, - "step": 3820 + "epoch": 0.9859600553687957, + "step": 9972, + "train/sim_loss": 0.015625 }, { - "entropy": 8.635245060920715, - "epoch": 0.37967174213960847, - "grad_norm": 8.16762924194336, - "learning_rate": 9.05330564209069e-06, - "loss": 0.2755, - "mean_token_accuracy": 0.9029468595981598, - "num_tokens": 6426096.0, - "step": 3840 + "epoch": 0.9859600553687957, + "step": 9972, + "train/total_loss": 0.11738802492618561 }, { - "entropy": 8.564204359054566, - "epoch": 0.3816491991299189, - "grad_norm": 5.403090476989746, - "learning_rate": 9.04836077733274e-06, - "loss": 0.2592, - "mean_token_accuracy": 0.9082671731710434, - "num_tokens": 6522976.0, - "step": 3860 + "entropy": 8.85882568359375, + "epoch": 0.9860589282183112, + "mean_token_accuracy": 0.6770708560943604, + "num_tokens": 31111062.0, + "step": 9973, + "train/ce_loss": 0.7416624426841736 }, { - "entropy": 8.420181322097779, - "epoch": 0.3836266561202294, - "grad_norm": 44.703433990478516, - "learning_rate": 9.043415912574793e-06, - "loss": 0.2432, - "mean_token_accuracy": 0.9155853986740112, - "num_tokens": 6619524.0, - "step": 3880 + "epoch": 0.9860589282183112, + "step": 9973, + "train/sim_loss": 0.0625 }, { - "entropy": 8.50045416355133, - "epoch": 0.3856041131105398, - "grad_norm": 8.709927558898926, - "learning_rate": 9.038471047816842e-06, - "loss": 0.2179, - "mean_token_accuracy": 0.9222229868173599, - "num_tokens": 6715192.0, - "step": 3900 + "epoch": 0.9860589282183112, + "step": 9973, + "train/total_loss": 0.13666623830795288 }, { - "entropy": 8.599285316467284, - "epoch": 0.3875815701008503, - "grad_norm": 5.20430850982666, - "learning_rate": 9.033526183058894e-06, - "loss": 0.2477, - "mean_token_accuracy": 0.9111745923757553, - "num_tokens": 6810590.0, - "step": 3920 + "entropy": 8.846567153930664, + "epoch": 0.9861578010678268, + "mean_token_accuracy": 0.822603702545166, + "num_tokens": 31116244.0, + "step": 9974, + "train/ce_loss": 0.8458290100097656 }, { - "entropy": 8.619631052017212, - "epoch": 0.38955902709116075, - "grad_norm": 6.483546257019043, - "learning_rate": 9.028581318300944e-06, - "loss": 0.2483, - "mean_token_accuracy": 0.9158045738935471, - "num_tokens": 6907648.0, - "step": 3940 + "epoch": 0.9861578010678268, + "step": 9974, + "train/sim_loss": 0.0390625 }, { - "entropy": 8.913666248321533, - "epoch": 0.39153648408147124, - "grad_norm": 6.434947490692139, - "learning_rate": 9.023636453542997e-06, - "loss": 0.2453, - "mean_token_accuracy": 0.9141961604356765, - "num_tokens": 7002736.0, - "step": 3960 + "epoch": 0.9861578010678268, + "step": 9974, + "train/total_loss": 0.12364540249109268 }, { - "entropy": 8.798964881896973, - "epoch": 0.39351394107178167, - "grad_norm": 5.798513889312744, - "learning_rate": 9.018691588785047e-06, - "loss": 0.2352, - "mean_token_accuracy": 0.9153059989213943, - "num_tokens": 7097610.0, - "step": 3980 + "entropy": 8.67448616027832, + "epoch": 0.9862566739173423, + "mean_token_accuracy": 0.7335957884788513, + "num_tokens": 31121455.0, + "step": 9975, + "train/ce_loss": 0.8351233601570129 }, { - "entropy": 8.49081060886383, - "epoch": 0.39549139806209216, - "grad_norm": 6.558709621429443, - "learning_rate": 9.013746724027098e-06, - "loss": 0.2547, - "mean_token_accuracy": 0.909723362326622, - "num_tokens": 7194313.0, - "step": 4000 + "epoch": 0.9862566739173423, + "step": 9975, + "train/sim_loss": 0.0703125 }, { - "entropy": 8.443391680717468, - "epoch": 0.3974688550524026, - "grad_norm": 4.921619415283203, - "learning_rate": 9.00880185926915e-06, - "loss": 0.2698, - "mean_token_accuracy": 0.9055833429098129, - "num_tokens": 7291108.0, - "step": 4020 + "epoch": 0.9862566739173423, + "step": 9975, + "train/total_loss": 0.1538248360157013 }, { - "entropy": 8.53747022151947, - "epoch": 0.3994463120427131, - "grad_norm": 15.979576110839844, - "learning_rate": 9.0038569945112e-06, - "loss": 0.2577, - "mean_token_accuracy": 0.9085649162530899, - "num_tokens": 7388098.0, - "step": 4040 + "entropy": 8.290175437927246, + "epoch": 0.9863555467668578, + "mean_token_accuracy": 0.742484986782074, + "num_tokens": 31126914.0, + "step": 9976, + "train/ce_loss": 1.2690761089324951 }, { - "entropy": 8.446025109291076, - "epoch": 0.4014237690330235, - "grad_norm": 7.959125995635986, - "learning_rate": 8.998912129753253e-06, - "loss": 0.2175, - "mean_token_accuracy": 0.9249117046594619, - "num_tokens": 7484316.0, - "step": 4060 + "epoch": 0.9863555467668578, + "step": 9976, + "train/sim_loss": 0.0546875 }, { - "entropy": 8.648867797851562, - "epoch": 0.403401226023334, - "grad_norm": 8.137770652770996, - "learning_rate": 8.993967264995303e-06, - "loss": 0.2303, - "mean_token_accuracy": 0.918008628487587, - "num_tokens": 7579956.0, - "step": 4080 + "epoch": 0.9863555467668578, + "step": 9976, + "train/total_loss": 0.181595116853714 }, { - "entropy": 8.620676064491272, - "epoch": 0.40537868301364444, - "grad_norm": 6.270977020263672, - "learning_rate": 8.989022400237354e-06, - "loss": 0.2422, - "mean_token_accuracy": 0.9142392307519913, - "num_tokens": 7675194.0, - "step": 4100 + "entropy": 8.776311874389648, + "epoch": 0.9864544196163734, + "mean_token_accuracy": 0.739534854888916, + "num_tokens": 31132012.0, + "step": 9977, + "train/ce_loss": 0.7153046131134033 }, { - "entropy": 8.534672164916993, - "epoch": 0.4073561400039549, - "grad_norm": 5.043423652648926, - "learning_rate": 8.984077535479406e-06, - "loss": 0.2668, - "mean_token_accuracy": 0.9067037135362626, - "num_tokens": 7771870.0, - "step": 4120 + "epoch": 0.9864544196163734, + "step": 9977, + "train/sim_loss": 0.0546875 }, { - "entropy": 8.64254539012909, - "epoch": 0.40933359699426536, - "grad_norm": 5.158176422119141, - "learning_rate": 8.979132670721456e-06, - "loss": 0.2212, - "mean_token_accuracy": 0.9224802881479264, - "num_tokens": 7866763.0, - "step": 4140 + "epoch": 0.9864544196163734, + "step": 9977, + "train/total_loss": 0.12621796131134033 }, { - "entropy": 8.413564467430115, - "epoch": 0.41131105398457585, - "grad_norm": 7.307987689971924, - "learning_rate": 8.974187805963509e-06, - "loss": 0.2238, - "mean_token_accuracy": 0.922067540884018, - "num_tokens": 7963074.0, - "step": 4160 + "entropy": 8.852508544921875, + "epoch": 0.9865532924658889, + "mean_token_accuracy": 0.671999990940094, + "num_tokens": 31137088.0, + "step": 9978, + "train/ce_loss": 7.408016244880855e-07 }, { - "entropy": 8.448160934448243, - "epoch": 0.4132885109748863, - "grad_norm": 8.33636474609375, - "learning_rate": 8.969242941205559e-06, - "loss": 0.2643, - "mean_token_accuracy": 0.9066998779773712, - "num_tokens": 8059194.0, - "step": 4180 + "epoch": 0.9865532924658889, + "step": 9978, + "train/sim_loss": 0.05859375 }, { - "entropy": 8.51591784954071, - "epoch": 0.41526596796519677, - "grad_norm": 7.432035446166992, - "learning_rate": 8.96429807644761e-06, - "loss": 0.2611, - "mean_token_accuracy": 0.9087202578783036, - "num_tokens": 8155548.0, - "step": 4200 + "epoch": 0.9865532924658889, + "step": 9978, + "train/total_loss": 0.05859382450580597 }, { - "entropy": 8.623217153549195, - "epoch": 0.4172434249555072, - "grad_norm": 8.370624542236328, - "learning_rate": 8.95935321168966e-06, - "loss": 0.2473, - "mean_token_accuracy": 0.9145809859037399, - "num_tokens": 8250975.0, - "step": 4220 + "entropy": 9.135875701904297, + "epoch": 0.9866521653154043, + "mean_token_accuracy": 0.7842639684677124, + "num_tokens": 31141936.0, + "step": 9979, + "train/ce_loss": 1.3566555026045535e-06 }, { - "entropy": 8.545232200622559, - "epoch": 0.4192208819458177, - "grad_norm": 9.062784194946289, - "learning_rate": 8.954408346931712e-06, - "loss": 0.2263, - "mean_token_accuracy": 0.9223273754119873, - "num_tokens": 8345874.0, - "step": 4240 + "epoch": 0.9866521653154043, + "step": 9979, + "train/sim_loss": 0.046875 }, { - "entropy": 8.55104238986969, - "epoch": 0.4211983389361281, - "grad_norm": 5.968785285949707, - "learning_rate": 8.949463482173763e-06, - "loss": 0.2192, - "mean_token_accuracy": 0.923736497759819, - "num_tokens": 8440886.0, - "step": 4260 + "epoch": 0.9866521653154043, + "step": 9979, + "train/total_loss": 0.046875134110450745 }, { - "entropy": 8.487057495117188, - "epoch": 0.4231757959264386, - "grad_norm": 7.367476940155029, - "learning_rate": 8.944518617415815e-06, - "loss": 0.2413, - "mean_token_accuracy": 0.9157112687826157, - "num_tokens": 8536072.0, - "step": 4280 + "epoch": 0.9867510381649199, + "grad_norm": 0.7542499899864197, + "learning_rate": 7.535232161400386e-06, + "loss": 0.1305, + "step": 9980 }, { - "entropy": 8.589159536361695, - "epoch": 0.42515325291674905, - "grad_norm": 6.42571496963501, - "learning_rate": 8.939573752657866e-06, - "loss": 0.2554, - "mean_token_accuracy": 0.9097007185220718, - "num_tokens": 8632162.0, - "step": 4300 + "entropy": 8.518350601196289, + "epoch": 0.9867510381649199, + "mean_token_accuracy": 0.7614973187446594, + "num_tokens": 31147269.0, + "step": 9980, + "train/ce_loss": 0.5412923693656921 }, { - "entropy": 8.636207485198975, - "epoch": 0.42713070990705954, - "grad_norm": 14.71319580078125, - "learning_rate": 8.934628887899916e-06, - "loss": 0.2423, - "mean_token_accuracy": 0.9149332791566849, - "num_tokens": 8727551.0, - "step": 4320 + "epoch": 0.9867510381649199, + "step": 9980, + "train/sim_loss": 0.015625 }, { - "entropy": 8.53875629901886, - "epoch": 0.42910816689736997, - "grad_norm": 9.17918586730957, - "learning_rate": 8.929684023141968e-06, - "loss": 0.2825, - "mean_token_accuracy": 0.9025544673204422, - "num_tokens": 8824324.0, - "step": 4340 + "epoch": 0.9867510381649199, + "step": 9980, + "train/total_loss": 0.06975424289703369 }, { - "entropy": 8.493563961982726, - "epoch": 0.43108562388768046, - "grad_norm": 11.744765281677246, - "learning_rate": 8.924739158384019e-06, - "loss": 0.2507, - "mean_token_accuracy": 0.912699818611145, - "num_tokens": 8921137.0, - "step": 4360 + "entropy": 8.914718627929688, + "epoch": 0.9868499110144354, + "mean_token_accuracy": 0.7020348906517029, + "num_tokens": 31152404.0, + "step": 9981, + "train/ce_loss": 0.8738462924957275 }, { - "entropy": 8.225626754760743, - "epoch": 0.4330630808779909, - "grad_norm": 20.236576080322266, - "learning_rate": 8.91979429362607e-06, - "loss": 0.2494, - "mean_token_accuracy": 0.9128432601690293, - "num_tokens": 9018294.0, - "step": 4380 + "epoch": 0.9868499110144354, + "step": 9981, + "train/sim_loss": 0.125 }, { - "entropy": 8.52009084224701, - "epoch": 0.4350405378683014, - "grad_norm": 8.029839515686035, - "learning_rate": 8.914849428868121e-06, - "loss": 0.2348, - "mean_token_accuracy": 0.9197594821453094, - "num_tokens": 9114681.0, - "step": 4400 + "epoch": 0.9868499110144354, + "step": 9981, + "train/total_loss": 0.2123846411705017 }, { - "entropy": 8.449236345291137, - "epoch": 0.4370179948586118, - "grad_norm": 19.661365509033203, - "learning_rate": 8.909904564110172e-06, - "loss": 0.2317, - "mean_token_accuracy": 0.92117640376091, - "num_tokens": 9210408.0, - "step": 4420 + "entropy": 8.742120742797852, + "epoch": 0.9869487838639509, + "mean_token_accuracy": 0.7226277589797974, + "num_tokens": 31157670.0, + "step": 9982, + "train/ce_loss": 0.7680981755256653 }, { - "entropy": 8.556246447563172, - "epoch": 0.4389954518489223, - "grad_norm": 11.88007926940918, - "learning_rate": 8.904959699352224e-06, - "loss": 0.2299, - "mean_token_accuracy": 0.9193400293588638, - "num_tokens": 9305409.0, - "step": 4440 + "epoch": 0.9869487838639509, + "step": 9982, + "train/sim_loss": 0.04296875 }, { - "entropy": 8.877286338806153, - "epoch": 0.44097290883923274, - "grad_norm": 6.529606342315674, - "learning_rate": 8.900014834594275e-06, - "loss": 0.2358, - "mean_token_accuracy": 0.9178378254175186, - "num_tokens": 9399622.0, - "step": 4460 + "epoch": 0.9869487838639509, + "step": 9982, + "train/total_loss": 0.11977856606245041 }, { - "entropy": 8.656371331214904, - "epoch": 0.4429503658295432, - "grad_norm": 6.3461689949035645, - "learning_rate": 8.895069969836325e-06, - "loss": 0.248, - "mean_token_accuracy": 0.9159361749887467, - "num_tokens": 9494403.0, - "step": 4480 + "entropy": 8.708446502685547, + "epoch": 0.9870476567134665, + "mean_token_accuracy": 0.7712895274162292, + "num_tokens": 31162978.0, + "step": 9983, + "train/ce_loss": 0.5740722417831421 }, { - "entropy": 8.628129410743714, - "epoch": 0.44492782281985366, - "grad_norm": 7.510733127593994, - "learning_rate": 8.890125105078377e-06, - "loss": 0.2509, - "mean_token_accuracy": 0.9142907947301865, - "num_tokens": 9590235.0, - "step": 4500 + "epoch": 0.9870476567134665, + "step": 9983, + "train/sim_loss": 0.03125 }, { - "entropy": 8.534839606285095, - "epoch": 0.44690527981016415, - "grad_norm": 5.806549072265625, - "learning_rate": 8.885180240320428e-06, - "loss": 0.2474, - "mean_token_accuracy": 0.9153155446052551, - "num_tokens": 9687098.0, - "step": 4520 + "epoch": 0.9870476567134665, + "step": 9983, + "train/total_loss": 0.08865723013877869 }, { - "entropy": 8.506877684593201, - "epoch": 0.4488827368004746, - "grad_norm": 13.601163864135742, - "learning_rate": 8.88023537556248e-06, - "loss": 0.2199, - "mean_token_accuracy": 0.9219205379486084, - "num_tokens": 9782268.0, - "step": 4540 + "entropy": 8.663925170898438, + "epoch": 0.987146529562982, + "mean_token_accuracy": 0.7137305736541748, + "num_tokens": 31168202.0, + "step": 9984, + "train/ce_loss": 1.1544108390808105 }, { - "entropy": 8.472445487976074, - "epoch": 0.45086019379078507, - "grad_norm": 6.868998050689697, - "learning_rate": 8.87529051080453e-06, - "loss": 0.2499, - "mean_token_accuracy": 0.9141937404870987, - "num_tokens": 9879964.0, - "step": 4560 + "epoch": 0.987146529562982, + "step": 9984, + "train/sim_loss": 0.03515625 }, { - "entropy": 8.975648546218872, - "epoch": 0.4528376507810955, - "grad_norm": 8.0801420211792, - "learning_rate": 8.870345646046581e-06, - "loss": 0.2521, - "mean_token_accuracy": 0.9139484345912934, - "num_tokens": 9975985.0, - "step": 4580 + "epoch": 0.987146529562982, + "step": 9984, + "train/total_loss": 0.15059733390808105 }, { - "entropy": 8.786634349822998, - "epoch": 0.454815107771406, - "grad_norm": 12.988626480102539, - "learning_rate": 8.865400781288632e-06, - "loss": 0.2982, - "mean_token_accuracy": 0.8961870938539505, - "num_tokens": 10073948.0, - "step": 4600 + "entropy": 8.859031677246094, + "epoch": 0.9872454024124975, + "mean_token_accuracy": 0.7522522807121277, + "num_tokens": 31173490.0, + "step": 9985, + "train/ce_loss": 1.1717761754989624 }, { - "entropy": 8.90425477027893, - "epoch": 0.4567925647617164, - "grad_norm": 16.124622344970703, - "learning_rate": 8.860455916530684e-06, - "loss": 0.2685, - "mean_token_accuracy": 0.9074593156576156, - "num_tokens": 10169488.0, - "step": 4620 + "epoch": 0.9872454024124975, + "step": 9985, + "train/sim_loss": 0.0859375 }, { - "entropy": 8.903997898101807, - "epoch": 0.4587700217520269, - "grad_norm": 26.714290618896484, - "learning_rate": 8.855511051772734e-06, - "loss": 0.2412, - "mean_token_accuracy": 0.9162767887115478, - "num_tokens": 10264792.0, - "step": 4640 + "epoch": 0.9872454024124975, + "step": 9985, + "train/total_loss": 0.20311512053012848 }, { - "entropy": 8.799645900726318, - "epoch": 0.46074747874233735, - "grad_norm": 7.018490314483643, - "learning_rate": 8.850566187014787e-06, - "loss": 0.2478, - "mean_token_accuracy": 0.9110076993703842, - "num_tokens": 10360854.0, - "step": 4660 + "entropy": 8.618402481079102, + "epoch": 0.9873442752620131, + "mean_token_accuracy": 0.7758620977401733, + "num_tokens": 31178664.0, + "step": 9986, + "train/ce_loss": 0.8400186896324158 }, { - "entropy": 8.944317960739136, - "epoch": 0.46272493573264784, - "grad_norm": 12.574722290039062, - "learning_rate": 8.845621322256837e-06, - "loss": 0.2566, - "mean_token_accuracy": 0.9106970399618148, - "num_tokens": 10456504.0, - "step": 4680 + "epoch": 0.9873442752620131, + "step": 9986, + "train/sim_loss": 0.03515625 }, { - "entropy": 8.824884748458862, - "epoch": 0.46470239272295827, - "grad_norm": 10.262856483459473, - "learning_rate": 8.840676457498888e-06, - "loss": 0.2482, - "mean_token_accuracy": 0.913115856051445, - "num_tokens": 10553012.0, - "step": 4700 + "epoch": 0.9873442752620131, + "step": 9986, + "train/total_loss": 0.11915811896324158 }, { - "entropy": 8.780104827880859, - "epoch": 0.46667984971326876, - "grad_norm": 6.89714241027832, - "learning_rate": 8.83573159274094e-06, - "loss": 0.2434, - "mean_token_accuracy": 0.9154404312372207, - "num_tokens": 10647887.0, - "step": 4720 + "entropy": 9.091707229614258, + "epoch": 0.9874431481115286, + "mean_token_accuracy": 0.8032000064849854, + "num_tokens": 31183757.0, + "step": 9987, + "train/ce_loss": 8.002979825505463e-07 }, { - "entropy": 8.67457902431488, - "epoch": 0.4686573067035792, - "grad_norm": 5.493983268737793, - "learning_rate": 8.83078672798299e-06, - "loss": 0.2427, - "mean_token_accuracy": 0.9139101594686508, - "num_tokens": 10743328.0, - "step": 4740 + "epoch": 0.9874431481115286, + "step": 9987, + "train/sim_loss": 0.0234375 }, { - "entropy": 8.727025961875915, - "epoch": 0.4706347636938897, - "grad_norm": 13.759496688842773, - "learning_rate": 8.825841863225042e-06, - "loss": 0.2311, - "mean_token_accuracy": 0.92018963098526, - "num_tokens": 10837953.0, - "step": 4760 + "epoch": 0.9874431481115286, + "step": 9987, + "train/total_loss": 0.023437580093741417 }, { - "entropy": 8.438578462600708, - "epoch": 0.4726122206842001, - "grad_norm": 13.486681938171387, - "learning_rate": 8.820896998467093e-06, - "loss": 0.2349, - "mean_token_accuracy": 0.9182916134595871, - "num_tokens": 10934049.0, - "step": 4780 + "entropy": 8.52845287322998, + "epoch": 0.987542020961044, + "mean_token_accuracy": 0.7927232384681702, + "num_tokens": 31189178.0, + "step": 9988, + "train/ce_loss": 0.7290910482406616 }, { - "entropy": 8.59653172492981, - "epoch": 0.4745896776745106, - "grad_norm": 8.081748008728027, - "learning_rate": 8.815952133709143e-06, - "loss": 0.2274, - "mean_token_accuracy": 0.921638372540474, - "num_tokens": 11028412.0, - "step": 4800 + "epoch": 0.987542020961044, + "step": 9988, + "train/sim_loss": 0.0546875 }, { - "entropy": 8.560403037071229, - "epoch": 0.47656713466482103, - "grad_norm": 5.403700351715088, - "learning_rate": 8.811007268951196e-06, - "loss": 0.2434, - "mean_token_accuracy": 0.914707025885582, - "num_tokens": 11124333.0, - "step": 4820 + "epoch": 0.987542020961044, + "step": 9988, + "train/total_loss": 0.12759661674499512 }, { - "entropy": 8.652185344696045, - "epoch": 0.4785445916551315, - "grad_norm": 8.639039039611816, - "learning_rate": 8.806062404193246e-06, - "loss": 0.24, - "mean_token_accuracy": 0.9131653487682343, - "num_tokens": 11220014.0, - "step": 4840 + "entropy": 9.61940860748291, + "epoch": 0.9876408938105596, + "mean_token_accuracy": 0.7434554696083069, + "num_tokens": 31193922.0, + "step": 9989, + "train/ce_loss": 3.083202955167508e-07 }, { - "entropy": 8.498576831817626, - "epoch": 0.48052204864544196, - "grad_norm": 7.509744644165039, - "learning_rate": 8.801117539435298e-06, - "loss": 0.2474, - "mean_token_accuracy": 0.9103549897670746, - "num_tokens": 11317155.0, - "step": 4860 + "epoch": 0.9876408938105596, + "step": 9989, + "train/sim_loss": 0.01171875 }, { - "entropy": 8.43241617679596, - "epoch": 0.48249950563575245, - "grad_norm": 8.70064640045166, - "learning_rate": 8.796172674677347e-06, - "loss": 0.2588, - "mean_token_accuracy": 0.9110805332660675, - "num_tokens": 11414206.0, - "step": 4880 + "epoch": 0.9876408938105596, + "step": 9989, + "train/total_loss": 0.011718780733644962 }, { - "entropy": 8.588392210006713, - "epoch": 0.4844769626260629, - "grad_norm": 9.98619270324707, - "learning_rate": 8.7912278099194e-06, - "loss": 0.2477, - "mean_token_accuracy": 0.9100183725357056, - "num_tokens": 11511069.0, - "step": 4900 + "entropy": 8.437448501586914, + "epoch": 0.9877397666600751, + "mean_token_accuracy": 0.7885652780532837, + "num_tokens": 31199360.0, + "step": 9990, + "train/ce_loss": 0.2778100073337555 }, { - "entropy": 8.61137251853943, - "epoch": 0.48645441961637337, - "grad_norm": 22.916452407836914, - "learning_rate": 8.78628294516145e-06, - "loss": 0.2463, - "mean_token_accuracy": 0.914769783616066, - "num_tokens": 11606498.0, - "step": 4920 + "epoch": 0.9877397666600751, + "step": 9990, + "train/sim_loss": 0.015625 }, { - "entropy": 8.60069227218628, - "epoch": 0.4884318766066838, - "grad_norm": 13.548232078552246, - "learning_rate": 8.781338080403502e-06, - "loss": 0.2453, - "mean_token_accuracy": 0.9153828501701355, - "num_tokens": 11702117.0, - "step": 4940 + "epoch": 0.9877397666600751, + "step": 9990, + "train/total_loss": 0.04340600222349167 }, { - "entropy": 8.531246638298034, - "epoch": 0.4904093335969943, - "grad_norm": 6.630843162536621, - "learning_rate": 8.776393215645553e-06, - "loss": 0.237, - "mean_token_accuracy": 0.9166796892881394, - "num_tokens": 11796688.0, - "step": 4960 + "entropy": 9.451400756835938, + "epoch": 0.9878386395095907, + "mean_token_accuracy": 0.747863233089447, + "num_tokens": 31204240.0, + "step": 9991, + "train/ce_loss": 0.9985411167144775 }, { - "entropy": 8.654666376113891, - "epoch": 0.4923867905873047, - "grad_norm": 7.365876197814941, - "learning_rate": 8.771448350887603e-06, - "loss": 0.2418, - "mean_token_accuracy": 0.9159041941165924, - "num_tokens": 11890708.0, - "step": 4980 + "epoch": 0.9878386395095907, + "step": 9991, + "train/sim_loss": 0.06640625 }, { - "entropy": 8.316032791137696, - "epoch": 0.4943642475776152, - "grad_norm": 9.629853248596191, - "learning_rate": 8.766503486129655e-06, - "loss": 0.24, - "mean_token_accuracy": 0.9146564483642579, - "num_tokens": 11986115.0, - "step": 5000 + "epoch": 0.9878386395095907, + "step": 9991, + "train/total_loss": 0.16626036167144775 }, { - "entropy": 8.249524021148682, - "epoch": 0.49634170456792565, - "grad_norm": 7.097714424133301, - "learning_rate": 8.761558621371706e-06, - "loss": 0.262, - "mean_token_accuracy": 0.9096025586128235, - "num_tokens": 12083533.0, - "step": 5020 + "entropy": 8.895739555358887, + "epoch": 0.9879375123591062, + "mean_token_accuracy": 0.7634561061859131, + "num_tokens": 31209503.0, + "step": 9992, + "train/ce_loss": 1.0490940809249878 }, { - "entropy": 8.394402647018433, - "epoch": 0.49831916155823613, - "grad_norm": 9.107475280761719, - "learning_rate": 8.756613756613758e-06, - "loss": 0.2518, - "mean_token_accuracy": 0.9127232819795609, - "num_tokens": 12179525.0, - "step": 5040 + "epoch": 0.9879375123591062, + "step": 9992, + "train/sim_loss": 0.02734375 }, { - "entropy": 8.438790369033814, - "epoch": 0.5002966185485466, - "grad_norm": 14.101659774780273, - "learning_rate": 8.751668891855809e-06, - "loss": 0.2526, - "mean_token_accuracy": 0.9126446783542633, - "num_tokens": 12276923.0, - "step": 5060 + "epoch": 0.9879375123591062, + "step": 9992, + "train/total_loss": 0.13225317001342773 }, { - "entropy": 8.444346523284912, - "epoch": 0.5022740755388571, - "grad_norm": 17.958641052246094, - "learning_rate": 8.746724027097859e-06, - "loss": 0.2267, - "mean_token_accuracy": 0.9193026006221772, - "num_tokens": 12372350.0, - "step": 5080 + "entropy": 9.31628704071045, + "epoch": 0.9880363852086217, + "mean_token_accuracy": 0.7696850299835205, + "num_tokens": 31214458.0, + "step": 9993, + "train/ce_loss": 0.9760105609893799 }, { - "entropy": 8.648650860786438, - "epoch": 0.5042515325291675, - "grad_norm": 9.379371643066406, - "learning_rate": 8.741779162339911e-06, - "loss": 0.226, - "mean_token_accuracy": 0.9195880085229874, - "num_tokens": 12467487.0, - "step": 5100 + "epoch": 0.9880363852086217, + "step": 9993, + "train/sim_loss": 0.01953125 }, { - "entropy": 8.385884857177734, - "epoch": 0.5062289895194779, - "grad_norm": 12.274886131286621, - "learning_rate": 8.736834297581962e-06, - "loss": 0.2765, - "mean_token_accuracy": 0.9028911709785461, - "num_tokens": 12564961.0, - "step": 5120 + "epoch": 0.9880363852086217, + "step": 9993, + "train/total_loss": 0.11713230609893799 }, { - "entropy": 8.581983947753907, - "epoch": 0.5082064465097884, - "grad_norm": 9.47417163848877, - "learning_rate": 8.731889432824014e-06, - "loss": 0.2429, - "mean_token_accuracy": 0.9126976132392883, - "num_tokens": 12660125.0, - "step": 5140 + "entropy": 8.331695556640625, + "epoch": 0.9881352580581373, + "mean_token_accuracy": 0.7680355310440063, + "num_tokens": 31219831.0, + "step": 9994, + "train/ce_loss": 0.7177839279174805 }, { - "entropy": 8.547501754760741, - "epoch": 0.5101839035000989, - "grad_norm": 6.504186630249023, - "learning_rate": 8.726944568066063e-06, - "loss": 0.2554, - "mean_token_accuracy": 0.9128349304199219, - "num_tokens": 12755125.0, - "step": 5160 + "epoch": 0.9881352580581373, + "step": 9994, + "train/sim_loss": 0.0234375 }, { - "entropy": 8.442482471466064, - "epoch": 0.5121613604904093, - "grad_norm": 6.167585849761963, - "learning_rate": 8.721999703308115e-06, - "loss": 0.2486, - "mean_token_accuracy": 0.9139046847820282, - "num_tokens": 12851013.0, - "step": 5180 + "epoch": 0.9881352580581373, + "step": 9994, + "train/total_loss": 0.09521589428186417 }, { - "entropy": 8.603999400138855, - "epoch": 0.5141388174807198, - "grad_norm": 6.748779296875, - "learning_rate": 8.717054838550165e-06, - "loss": 0.2531, - "mean_token_accuracy": 0.9108146101236343, - "num_tokens": 12946413.0, - "step": 5200 + "entropy": 8.767294883728027, + "epoch": 0.9882341309076528, + "mean_token_accuracy": 0.799501895904541, + "num_tokens": 31225104.0, + "step": 9995, + "train/ce_loss": 0.5451284050941467 }, { - "entropy": 8.391959357261658, - "epoch": 0.5161162744710303, - "grad_norm": 23.97373390197754, - "learning_rate": 8.712109973792218e-06, - "loss": 0.2499, - "mean_token_accuracy": 0.9130973368883133, - "num_tokens": 13043131.0, - "step": 5220 + "epoch": 0.9882341309076528, + "step": 9995, + "train/sim_loss": 0.0234375 }, { - "entropy": 8.548647618293762, - "epoch": 0.5180937314613407, - "grad_norm": 5.871434688568115, - "learning_rate": 8.707165109034268e-06, - "loss": 0.2586, - "mean_token_accuracy": 0.911566361784935, - "num_tokens": 13138298.0, - "step": 5240 + "epoch": 0.9882341309076528, + "step": 9995, + "train/total_loss": 0.07795034348964691 }, { - "entropy": 8.57565267086029, - "epoch": 0.5200711884516511, - "grad_norm": 7.245882511138916, - "learning_rate": 8.702220244276319e-06, - "loss": 0.2453, - "mean_token_accuracy": 0.9127212554216385, - "num_tokens": 13233040.0, - "step": 5260 + "entropy": 8.49264907836914, + "epoch": 0.9883330037571683, + "mean_token_accuracy": 0.7464065551757812, + "num_tokens": 31230563.0, + "step": 9996, + "train/ce_loss": 0.4141041040420532 }, { - "entropy": 8.294360160827637, - "epoch": 0.5220486454419616, - "grad_norm": 5.041314125061035, - "learning_rate": 8.697275379518371e-06, - "loss": 0.2576, - "mean_token_accuracy": 0.9105819523334503, - "num_tokens": 13328563.0, - "step": 5280 + "epoch": 0.9883330037571683, + "step": 9996, + "train/sim_loss": 0.015625 }, { - "entropy": 8.413219618797303, - "epoch": 0.5240261024322721, - "grad_norm": 6.246405601501465, - "learning_rate": 8.692330514760421e-06, - "loss": 0.2512, - "mean_token_accuracy": 0.9153288692235947, - "num_tokens": 13423555.0, - "step": 5300 + "epoch": 0.9883330037571683, + "step": 9996, + "train/total_loss": 0.0570354126393795 }, { - "entropy": 8.362274503707885, - "epoch": 0.5260035594225826, - "grad_norm": 7.106925010681152, - "learning_rate": 8.687385650002474e-06, - "loss": 0.2526, - "mean_token_accuracy": 0.9109732240438462, - "num_tokens": 13518741.0, - "step": 5320 + "entropy": 8.719945907592773, + "epoch": 0.9884318766066839, + "mean_token_accuracy": 0.7779156565666199, + "num_tokens": 31235811.0, + "step": 9997, + "train/ce_loss": 0.7226426005363464 }, { - "entropy": 8.177941012382508, - "epoch": 0.527981016412893, - "grad_norm": 5.926587104797363, - "learning_rate": 8.682440785244524e-06, - "loss": 0.2603, - "mean_token_accuracy": 0.9100730836391449, - "num_tokens": 13614040.0, - "step": 5340 + "epoch": 0.9884318766066839, + "step": 9997, + "train/sim_loss": 0.01953125 }, { - "entropy": 8.209478998184204, - "epoch": 0.5299584734032035, - "grad_norm": 11.204751014709473, - "learning_rate": 8.677495920486575e-06, - "loss": 0.2346, - "mean_token_accuracy": 0.9182068973779678, - "num_tokens": 13709344.0, - "step": 5360 + "epoch": 0.9884318766066839, + "step": 9997, + "train/total_loss": 0.09179551154375076 }, { - "entropy": 8.210282015800477, - "epoch": 0.531935930393514, - "grad_norm": 5.428844451904297, - "learning_rate": 8.672551055728627e-06, - "loss": 0.2461, - "mean_token_accuracy": 0.9162275105714798, - "num_tokens": 13805079.0, - "step": 5380 + "entropy": 9.418034553527832, + "epoch": 0.9885307494561993, + "mean_token_accuracy": 0.7284482717514038, + "num_tokens": 31240665.0, + "step": 9998, + "train/ce_loss": 1.6247684955596924 }, { - "entropy": 8.37671356201172, - "epoch": 0.5339133873838244, - "grad_norm": 10.085187911987305, - "learning_rate": 8.667606190970677e-06, - "loss": 0.2535, - "mean_token_accuracy": 0.9122269928455353, - "num_tokens": 13900835.0, - "step": 5400 + "epoch": 0.9885307494561993, + "step": 9998, + "train/sim_loss": 0.0703125 }, { - "entropy": 8.271014213562012, - "epoch": 0.5358908443741348, - "grad_norm": 6.77738094329834, - "learning_rate": 8.66266132621273e-06, - "loss": 0.2403, - "mean_token_accuracy": 0.912382036447525, - "num_tokens": 13996199.0, - "step": 5420 + "epoch": 0.9885307494561993, + "step": 9998, + "train/total_loss": 0.23278935253620148 }, { - "entropy": 8.354682016372681, - "epoch": 0.5378683013644453, - "grad_norm": 10.853652954101562, - "learning_rate": 8.65771646145478e-06, - "loss": 0.2395, - "mean_token_accuracy": 0.9132285386323928, - "num_tokens": 14091840.0, - "step": 5440 + "entropy": 8.844606399536133, + "epoch": 0.9886296223057148, + "mean_token_accuracy": 0.7281553149223328, + "num_tokens": 31245851.0, + "step": 9999, + "train/ce_loss": 0.621844470500946 }, { - "entropy": 8.318234205245972, - "epoch": 0.5398457583547558, - "grad_norm": 6.674023628234863, - "learning_rate": 8.65277159669683e-06, - "loss": 0.2643, - "mean_token_accuracy": 0.9089009284973144, - "num_tokens": 14188037.0, - "step": 5460 + "epoch": 0.9886296223057148, + "step": 9999, + "train/sim_loss": 0.08984375 }, { - "entropy": 8.31621437072754, - "epoch": 0.5418232153450663, - "grad_norm": 8.350377082824707, - "learning_rate": 8.647826731938883e-06, - "loss": 0.2483, - "mean_token_accuracy": 0.9131887167692184, - "num_tokens": 14283585.0, - "step": 5480 + "epoch": 0.9886296223057148, + "step": 9999, + "train/total_loss": 0.15202820301055908 }, { - "entropy": 8.442362189292908, - "epoch": 0.5438006723353767, - "grad_norm": 10.145578384399414, - "learning_rate": 8.642881867180933e-06, - "loss": 0.2415, - "mean_token_accuracy": 0.9147403508424758, - "num_tokens": 14379579.0, - "step": 5500 + "epoch": 0.9887284951552304, + "grad_norm": 0.7090817093849182, + "learning_rate": 7.5302872966424374e-06, + "loss": 0.1195, + "step": 10000 } ], "logging_steps": 20, @@ -2777,7 +183527,7 @@ "attributes": {} } }, - "total_flos": 1.2251221025336852e+18, + "total_flos": 2.41961109815296e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null