{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9887284951552304, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 3.3239192962646484, "epoch": 0, "mean_token_accuracy": 0.6472114324569702, "num_tokens": 5243.0, "step": 0, "train/ce_loss": 2.4552130699157715 }, { "epoch": 0, "step": 0, "train/sim_loss": 1.015625 }, { "epoch": 0, "step": 0, "train/total_loss": 1.2611463069915771 }, { "entropy": 3.365748882293701, "epoch": 9.887284951552304e-05, "mean_token_accuracy": 0.6583143472671509, "num_tokens": 10097.0, "step": 1, "train/ce_loss": 0.7174420952796936 }, { "epoch": 9.887284951552304e-05, "step": 1, "train/sim_loss": 1.0078125 }, { "epoch": 9.887284951552304e-05, "step": 1, "train/total_loss": 1.0795567035675049 }, { "entropy": 3.1243503093719482, "epoch": 0.00019774569903104609, "mean_token_accuracy": 0.6797671318054199, "num_tokens": 15213.0, "step": 2, "train/ce_loss": 2.3286292552948 }, { "epoch": 0.00019774569903104609, "step": 2, "train/sim_loss": 0.98828125 }, { "epoch": 0.00019774569903104609, "step": 2, "train/total_loss": 1.221144199371338 }, { "entropy": 3.2535760402679443, "epoch": 0.00029661854854656913, "mean_token_accuracy": 0.6405940651893616, "num_tokens": 20700.0, "step": 3, "train/ce_loss": 1.0501251220703125 }, { "epoch": 0.00029661854854656913, "step": 3, "train/sim_loss": 0.98828125 }, { "epoch": 0.00029661854854656913, "step": 3, "train/total_loss": 1.0932937860488892 }, { "entropy": 3.384281873703003, "epoch": 0.00039549139806209217, "mean_token_accuracy": 0.6576381325721741, "num_tokens": 26091.0, "step": 4, "train/ce_loss": 1.9366257190704346 }, { "epoch": 0.00039549139806209217, "step": 4, "train/sim_loss": 0.98828125 }, { "epoch": 0.00039549139806209217, "step": 4, "train/total_loss": 1.1819437742233276 }, { "entropy": 3.4554200172424316, "epoch": 0.0004943642475776152, "mean_token_accuracy": 0.727007269859314, "num_tokens": 31430.0, "step": 5, "train/ce_loss": 1.1327928304672241 }, { "epoch": 0.0004943642475776152, "step": 5, "train/sim_loss": 0.97265625 }, { "epoch": 0.0004943642475776152, "step": 5, "train/total_loss": 1.0859355926513672 }, { "entropy": 3.5804295539855957, "epoch": 0.0005932370970931383, "mean_token_accuracy": 0.6846153736114502, "num_tokens": 36852.0, "step": 6, "train/ce_loss": 1.342855453491211 }, { "epoch": 0.0005932370970931383, "step": 6, "train/sim_loss": 0.98828125 }, { "epoch": 0.0005932370970931383, "step": 6, "train/total_loss": 1.122566819190979 }, { "entropy": 3.361402750015259, "epoch": 0.0006921099466086612, "mean_token_accuracy": 0.6744966506958008, "num_tokens": 41935.0, "step": 7, "train/ce_loss": 1.7447583675384521 }, { "epoch": 0.0006921099466086612, "step": 7, "train/sim_loss": 0.9296875 }, { "epoch": 0.0006921099466086612, "step": 7, "train/total_loss": 1.1041632890701294 }, { "entropy": 3.931365966796875, "epoch": 0.0007909827961241843, "mean_token_accuracy": 0.6637630462646484, "num_tokens": 46986.0, "step": 8, "train/ce_loss": 0.3796125054359436 }, { "epoch": 0.0007909827961241843, "step": 8, "train/sim_loss": 0.9296875 }, { "epoch": 0.0007909827961241843, "step": 8, "train/total_loss": 0.9676487445831299 }, { "entropy": 3.9508445262908936, "epoch": 0.0008898556456397073, "mean_token_accuracy": 0.6216216087341309, "num_tokens": 52049.0, "step": 9, "train/ce_loss": 0.3588610291481018 }, { "epoch": 0.0008898556456397073, "step": 9, "train/sim_loss": 0.8828125 }, { "epoch": 0.0008898556456397073, "step": 9, "train/total_loss": 0.9186986088752747 }, { "entropy": 3.788201332092285, "epoch": 0.0009887284951552303, "mean_token_accuracy": 0.7525773048400879, "num_tokens": 56847.0, "step": 10, "train/ce_loss": 1.950152039527893 }, { "epoch": 0.0009887284951552303, "step": 10, "train/sim_loss": 0.8359375 }, { "epoch": 0.0009887284951552303, "step": 10, "train/total_loss": 1.0309526920318604 }, { "entropy": 3.670529842376709, "epoch": 0.0010876013446707534, "mean_token_accuracy": 0.725824773311615, "num_tokens": 62255.0, "step": 11, "train/ce_loss": 0.854789674282074 }, { "epoch": 0.0010876013446707534, "step": 11, "train/sim_loss": 0.7890625 }, { "epoch": 0.0010876013446707534, "step": 11, "train/total_loss": 0.8745414614677429 }, { "entropy": 4.051430702209473, "epoch": 0.0011864741941862765, "mean_token_accuracy": 0.7071713209152222, "num_tokens": 67232.0, "step": 12, "train/ce_loss": 0.32699134945869446 }, { "epoch": 0.0011864741941862765, "step": 12, "train/sim_loss": 0.7265625 }, { "epoch": 0.0011864741941862765, "step": 12, "train/total_loss": 0.7592616081237793 }, { "entropy": 4.368044853210449, "epoch": 0.0012853470437017994, "mean_token_accuracy": 0.7151514887809753, "num_tokens": 71963.0, "step": 13, "train/ce_loss": 0.49288395047187805 }, { "epoch": 0.0012853470437017994, "step": 13, "train/sim_loss": 0.64453125 }, { "epoch": 0.0012853470437017994, "step": 13, "train/total_loss": 0.6938196420669556 }, { "entropy": 4.399681091308594, "epoch": 0.0013842198932173225, "mean_token_accuracy": 0.7262210845947266, "num_tokens": 77190.0, "step": 14, "train/ce_loss": 0.9703823328018188 }, { "epoch": 0.0013842198932173225, "step": 14, "train/sim_loss": 0.62890625 }, { "epoch": 0.0013842198932173225, "step": 14, "train/total_loss": 0.725944459438324 }, { "entropy": 4.859724044799805, "epoch": 0.0014830927427328456, "mean_token_accuracy": 0.7016759514808655, "num_tokens": 82566.0, "step": 15, "train/ce_loss": 0.9824153184890747 }, { "epoch": 0.0014830927427328456, "step": 15, "train/sim_loss": 0.55078125 }, { "epoch": 0.0014830927427328456, "step": 15, "train/total_loss": 0.6490227580070496 }, { "entropy": 4.952378273010254, "epoch": 0.0015819655922483687, "mean_token_accuracy": 0.6652078628540039, "num_tokens": 87957.0, "step": 16, "train/ce_loss": 1.4136582612991333 }, { "epoch": 0.0015819655922483687, "step": 16, "train/sim_loss": 0.58984375 }, { "epoch": 0.0015819655922483687, "step": 16, "train/total_loss": 0.7312095761299133 }, { "entropy": 4.8109025955200195, "epoch": 0.0016808384417638916, "mean_token_accuracy": 0.7169811129570007, "num_tokens": 93055.0, "step": 17, "train/ce_loss": 1.1063599586486816 }, { "epoch": 0.0016808384417638916, "step": 17, "train/sim_loss": 0.5078125 }, { "epoch": 0.0016808384417638916, "step": 17, "train/total_loss": 0.6184484958648682 }, { "entropy": 4.44307804107666, "epoch": 0.0017797112912794147, "mean_token_accuracy": 0.7559681534767151, "num_tokens": 98691.0, "step": 18, "train/ce_loss": 0.914014995098114 }, { "epoch": 0.0017797112912794147, "step": 18, "train/sim_loss": 0.5078125 }, { "epoch": 0.0017797112912794147, "step": 18, "train/total_loss": 0.5992140173912048 }, { "entropy": 5.108180046081543, "epoch": 0.0018785841407949378, "mean_token_accuracy": 0.692307710647583, "num_tokens": 103954.0, "step": 19, "train/ce_loss": 1.0503363609313965 }, { "epoch": 0.0018785841407949378, "step": 19, "train/sim_loss": 0.43359375 }, { "epoch": 0.0018785841407949378, "step": 19, "train/total_loss": 0.5386273860931396 }, { "epoch": 0.0019774569903104606, "grad_norm": 1.845420241355896, "learning_rate": 9.997774810858923e-06, "loss": 0.9294, "step": 20 }, { "entropy": 4.6566619873046875, "epoch": 0.0019774569903104606, "mean_token_accuracy": 0.7223684191703796, "num_tokens": 109226.0, "step": 20, "train/ce_loss": 0.6507769227027893 }, { "epoch": 0.0019774569903104606, "step": 20, "train/sim_loss": 0.42578125 }, { "epoch": 0.0019774569903104606, "step": 20, "train/total_loss": 0.49085894227027893 }, { "entropy": 4.65466833114624, "epoch": 0.002076329839825984, "mean_token_accuracy": 0.7402912378311157, "num_tokens": 114480.0, "step": 21, "train/ce_loss": 1.0225709676742554 }, { "epoch": 0.002076329839825984, "step": 21, "train/sim_loss": 0.41015625 }, { "epoch": 0.002076329839825984, "step": 21, "train/total_loss": 0.5124133229255676 }, { "entropy": 5.295852184295654, "epoch": 0.002175202689341507, "mean_token_accuracy": 0.6411564350128174, "num_tokens": 119473.0, "step": 22, "train/ce_loss": 1.4177800416946411 }, { "epoch": 0.002175202689341507, "step": 22, "train/sim_loss": 0.4140625 }, { "epoch": 0.002175202689341507, "step": 22, "train/total_loss": 0.5558404922485352 }, { "entropy": 4.775343894958496, "epoch": 0.0022740755388570297, "mean_token_accuracy": 0.701298713684082, "num_tokens": 124877.0, "step": 23, "train/ce_loss": 1.3397403955459595 }, { "epoch": 0.0022740755388570297, "step": 23, "train/sim_loss": 0.328125 }, { "epoch": 0.0022740755388570297, "step": 23, "train/total_loss": 0.4620990455150604 }, { "entropy": 4.733779430389404, "epoch": 0.002372948388372553, "mean_token_accuracy": 0.689393937587738, "num_tokens": 130113.0, "step": 24, "train/ce_loss": 0.7229293584823608 }, { "epoch": 0.002372948388372553, "step": 24, "train/sim_loss": 0.3671875 }, { "epoch": 0.002372948388372553, "step": 24, "train/total_loss": 0.43948042392730713 }, { "entropy": 5.126186370849609, "epoch": 0.002471821237888076, "mean_token_accuracy": 0.6462736129760742, "num_tokens": 135527.0, "step": 25, "train/ce_loss": 1.0404521226882935 }, { "epoch": 0.002471821237888076, "step": 25, "train/sim_loss": 0.359375 }, { "epoch": 0.002471821237888076, "step": 25, "train/total_loss": 0.46342021226882935 }, { "entropy": 5.366857528686523, "epoch": 0.002570694087403599, "mean_token_accuracy": 0.719939112663269, "num_tokens": 140661.0, "step": 26, "train/ce_loss": 1.3868837356567383 }, { "epoch": 0.002570694087403599, "step": 26, "train/sim_loss": 0.29296875 }, { "epoch": 0.002570694087403599, "step": 26, "train/total_loss": 0.4316571354866028 }, { "entropy": 5.796281814575195, "epoch": 0.002669566936919122, "mean_token_accuracy": 0.7521514892578125, "num_tokens": 145711.0, "step": 27, "train/ce_loss": 0.20895634591579437 }, { "epoch": 0.002669566936919122, "step": 27, "train/sim_loss": 0.2890625 }, { "epoch": 0.002669566936919122, "step": 27, "train/total_loss": 0.3099581301212311 }, { "entropy": 5.276698112487793, "epoch": 0.002768439786434645, "mean_token_accuracy": 0.7659817337989807, "num_tokens": 151237.0, "step": 28, "train/ce_loss": 0.811927855014801 }, { "epoch": 0.002768439786434645, "step": 28, "train/sim_loss": 0.359375 }, { "epoch": 0.002768439786434645, "step": 28, "train/total_loss": 0.4405677914619446 }, { "entropy": 5.565755844116211, "epoch": 0.0028673126359501683, "mean_token_accuracy": 0.7230098247528076, "num_tokens": 156640.0, "step": 29, "train/ce_loss": 1.0701594352722168 }, { "epoch": 0.0028673126359501683, "step": 29, "train/sim_loss": 0.39453125 }, { "epoch": 0.0028673126359501683, "step": 29, "train/total_loss": 0.5015472173690796 }, { "entropy": 5.5919294357299805, "epoch": 0.002966185485465691, "mean_token_accuracy": 0.7300275564193726, "num_tokens": 161820.0, "step": 30, "train/ce_loss": 1.280920147895813 }, { "epoch": 0.002966185485465691, "step": 30, "train/sim_loss": 0.375 }, { "epoch": 0.002966185485465691, "step": 30, "train/total_loss": 0.5030920505523682 }, { "entropy": 5.7281036376953125, "epoch": 0.003065058334981214, "mean_token_accuracy": 0.7092568278312683, "num_tokens": 167014.0, "step": 31, "train/ce_loss": 0.6602377891540527 }, { "epoch": 0.003065058334981214, "step": 31, "train/sim_loss": 0.31640625 }, { "epoch": 0.003065058334981214, "step": 31, "train/total_loss": 0.3824300169944763 }, { "entropy": 6.002870559692383, "epoch": 0.0031639311844967374, "mean_token_accuracy": 0.7331671118736267, "num_tokens": 171815.0, "step": 32, "train/ce_loss": 1.5884499549865723 }, { "epoch": 0.0031639311844967374, "step": 32, "train/sim_loss": 0.33203125 }, { "epoch": 0.0031639311844967374, "step": 32, "train/total_loss": 0.4908762574195862 }, { "entropy": 5.498105049133301, "epoch": 0.0032628040340122602, "mean_token_accuracy": 0.7279322743415833, "num_tokens": 177099.0, "step": 33, "train/ce_loss": 0.8982157707214355 }, { "epoch": 0.0032628040340122602, "step": 33, "train/sim_loss": 0.26953125 }, { "epoch": 0.0032628040340122602, "step": 33, "train/total_loss": 0.35935282707214355 }, { "entropy": 5.875737190246582, "epoch": 0.003361676883527783, "mean_token_accuracy": 0.7120181322097778, "num_tokens": 182442.0, "step": 34, "train/ce_loss": 0.9330689907073975 }, { "epoch": 0.003361676883527783, "step": 34, "train/sim_loss": 0.26953125 }, { "epoch": 0.003361676883527783, "step": 34, "train/total_loss": 0.36283814907073975 }, { "entropy": 5.939522743225098, "epoch": 0.0034605497330433064, "mean_token_accuracy": 0.6893453001976013, "num_tokens": 187705.0, "step": 35, "train/ce_loss": 1.2632049322128296 }, { "epoch": 0.0034605497330433064, "step": 35, "train/sim_loss": 0.29296875 }, { "epoch": 0.0034605497330433064, "step": 35, "train/total_loss": 0.419289231300354 }, { "entropy": 5.7719573974609375, "epoch": 0.0035594225825588293, "mean_token_accuracy": 0.749492883682251, "num_tokens": 193137.0, "step": 36, "train/ce_loss": 0.9662005305290222 }, { "epoch": 0.0035594225825588293, "step": 36, "train/sim_loss": 0.29296875 }, { "epoch": 0.0035594225825588293, "step": 36, "train/total_loss": 0.3895888030529022 }, { "entropy": 6.139836311340332, "epoch": 0.003658295432074352, "mean_token_accuracy": 0.7349397540092468, "num_tokens": 198212.0, "step": 37, "train/ce_loss": 1.0053077936172485 }, { "epoch": 0.003658295432074352, "step": 37, "train/sim_loss": 0.2421875 }, { "epoch": 0.003658295432074352, "step": 37, "train/total_loss": 0.3427182734012604 }, { "entropy": 6.456868648529053, "epoch": 0.0037571682815898755, "mean_token_accuracy": 0.74055415391922, "num_tokens": 203017.0, "step": 38, "train/ce_loss": 0.23852603137493134 }, { "epoch": 0.0037571682815898755, "step": 38, "train/sim_loss": 0.3203125 }, { "epoch": 0.0037571682815898755, "step": 38, "train/total_loss": 0.3441651165485382 }, { "entropy": 6.685752868652344, "epoch": 0.0038560411311053984, "mean_token_accuracy": 0.6641104221343994, "num_tokens": 208114.0, "step": 39, "train/ce_loss": 2.193326711654663 }, { "epoch": 0.0038560411311053984, "step": 39, "train/sim_loss": 0.26171875 }, { "epoch": 0.0038560411311053984, "step": 39, "train/total_loss": 0.4810514450073242 }, { "epoch": 0.003954913980620921, "grad_norm": 1.2726562023162842, "learning_rate": 9.992829946100975e-06, "loss": 0.4364, "step": 40 }, { "entropy": 6.089438438415527, "epoch": 0.003954913980620921, "mean_token_accuracy": 0.7372262477874756, "num_tokens": 213442.0, "step": 40, "train/ce_loss": 0.7173500657081604 }, { "epoch": 0.003954913980620921, "step": 40, "train/sim_loss": 0.26171875 }, { "epoch": 0.003954913980620921, "step": 40, "train/total_loss": 0.3334537744522095 }, { "entropy": 6.468903064727783, "epoch": 0.004053786830136444, "mean_token_accuracy": 0.6836581826210022, "num_tokens": 218574.0, "step": 41, "train/ce_loss": 1.1456220149993896 }, { "epoch": 0.004053786830136444, "step": 41, "train/sim_loss": 0.3125 }, { "epoch": 0.004053786830136444, "step": 41, "train/total_loss": 0.4270622134208679 }, { "entropy": 5.9639973640441895, "epoch": 0.004152659679651968, "mean_token_accuracy": 0.7407407164573669, "num_tokens": 223990.0, "step": 42, "train/ce_loss": 0.8651658892631531 }, { "epoch": 0.004152659679651968, "step": 42, "train/sim_loss": 0.296875 }, { "epoch": 0.004152659679651968, "step": 42, "train/total_loss": 0.3833915889263153 }, { "entropy": 6.586842060089111, "epoch": 0.004251532529167491, "mean_token_accuracy": 0.7338129281997681, "num_tokens": 229012.0, "step": 43, "train/ce_loss": 1.1294350624084473 }, { "epoch": 0.004251532529167491, "step": 43, "train/sim_loss": 0.27734375 }, { "epoch": 0.004251532529167491, "step": 43, "train/total_loss": 0.39028725028038025 }, { "entropy": 6.888969421386719, "epoch": 0.004350405378683014, "mean_token_accuracy": 0.7364705801010132, "num_tokens": 233869.0, "step": 44, "train/ce_loss": 0.15823636949062347 }, { "epoch": 0.004350405378683014, "step": 44, "train/sim_loss": 0.265625 }, { "epoch": 0.004350405378683014, "step": 44, "train/total_loss": 0.281448632478714 }, { "entropy": 6.940787315368652, "epoch": 0.0044492782281985365, "mean_token_accuracy": 0.7454175353050232, "num_tokens": 238801.0, "step": 45, "train/ce_loss": 1.0938209295272827 }, { "epoch": 0.0044492782281985365, "step": 45, "train/sim_loss": 0.19921875 }, { "epoch": 0.0044492782281985365, "step": 45, "train/total_loss": 0.30860084295272827 }, { "entropy": 7.062989234924316, "epoch": 0.004548151077714059, "mean_token_accuracy": 0.7319587469100952, "num_tokens": 243831.0, "step": 46, "train/ce_loss": 0.1027403399348259 }, { "epoch": 0.004548151077714059, "step": 46, "train/sim_loss": 0.21875 }, { "epoch": 0.004548151077714059, "step": 46, "train/total_loss": 0.2290240377187729 }, { "entropy": 6.859626770019531, "epoch": 0.004647023927229583, "mean_token_accuracy": 0.6785079836845398, "num_tokens": 248970.0, "step": 47, "train/ce_loss": 1.9300769567489624 }, { "epoch": 0.004647023927229583, "step": 47, "train/sim_loss": 0.2421875 }, { "epoch": 0.004647023927229583, "step": 47, "train/total_loss": 0.4351952075958252 }, { "entropy": 6.56555700302124, "epoch": 0.004745896776745106, "mean_token_accuracy": 0.7164633870124817, "num_tokens": 253993.0, "step": 48, "train/ce_loss": 1.6052716970443726 }, { "epoch": 0.004745896776745106, "step": 48, "train/sim_loss": 0.21875 }, { "epoch": 0.004745896776745106, "step": 48, "train/total_loss": 0.37927716970443726 }, { "entropy": 6.6610870361328125, "epoch": 0.004844769626260629, "mean_token_accuracy": 0.7629796862602234, "num_tokens": 259345.0, "step": 49, "train/ce_loss": 0.9208154678344727 }, { "epoch": 0.004844769626260629, "step": 49, "train/sim_loss": 0.23046875 }, { "epoch": 0.004844769626260629, "step": 49, "train/total_loss": 0.32255029678344727 }, { "entropy": 7.085768699645996, "epoch": 0.004943642475776152, "mean_token_accuracy": 0.7152317762374878, "num_tokens": 264265.0, "step": 50, "train/ce_loss": 1.4471031427383423 }, { "epoch": 0.004943642475776152, "step": 50, "train/sim_loss": 0.23828125 }, { "epoch": 0.004943642475776152, "step": 50, "train/total_loss": 0.3829915523529053 }, { "entropy": 6.328176498413086, "epoch": 0.005042515325291675, "mean_token_accuracy": 0.7368420958518982, "num_tokens": 269783.0, "step": 51, "train/ce_loss": 0.6005712151527405 }, { "epoch": 0.005042515325291675, "step": 51, "train/sim_loss": 0.2265625 }, { "epoch": 0.005042515325291675, "step": 51, "train/total_loss": 0.286619633436203 }, { "entropy": 6.613964080810547, "epoch": 0.005141388174807198, "mean_token_accuracy": 0.7525196075439453, "num_tokens": 275151.0, "step": 52, "train/ce_loss": 0.982277512550354 }, { "epoch": 0.005141388174807198, "step": 52, "train/sim_loss": 0.2578125 }, { "epoch": 0.005141388174807198, "step": 52, "train/total_loss": 0.35604023933410645 }, { "entropy": 6.406770706176758, "epoch": 0.005240261024322721, "mean_token_accuracy": 0.6592427492141724, "num_tokens": 280523.0, "step": 53, "train/ce_loss": 1.1580888032913208 }, { "epoch": 0.005240261024322721, "step": 53, "train/sim_loss": 0.23828125 }, { "epoch": 0.005240261024322721, "step": 53, "train/total_loss": 0.3540901243686676 }, { "entropy": 7.066647052764893, "epoch": 0.005339133873838244, "mean_token_accuracy": 0.7491582632064819, "num_tokens": 285554.0, "step": 54, "train/ce_loss": 1.6794772148132324 }, { "epoch": 0.005339133873838244, "step": 54, "train/sim_loss": 0.25390625 }, { "epoch": 0.005339133873838244, "step": 54, "train/total_loss": 0.4218539595603943 }, { "entropy": 6.347116470336914, "epoch": 0.005438006723353767, "mean_token_accuracy": 0.7419700026512146, "num_tokens": 290978.0, "step": 55, "train/ce_loss": 0.4674301743507385 }, { "epoch": 0.005438006723353767, "step": 55, "train/sim_loss": 0.24609375 }, { "epoch": 0.005438006723353767, "step": 55, "train/total_loss": 0.2928367555141449 }, { "entropy": 6.935833930969238, "epoch": 0.00553687957286929, "mean_token_accuracy": 0.7323232293128967, "num_tokens": 296402.0, "step": 56, "train/ce_loss": 0.6243418455123901 }, { "epoch": 0.00553687957286929, "step": 56, "train/sim_loss": 0.33203125 }, { "epoch": 0.00553687957286929, "step": 56, "train/total_loss": 0.39446544647216797 }, { "entropy": 6.755821228027344, "epoch": 0.005635752422384813, "mean_token_accuracy": 0.7094240784645081, "num_tokens": 301565.0, "step": 57, "train/ce_loss": 1.3958739042282104 }, { "epoch": 0.005635752422384813, "step": 57, "train/sim_loss": 0.3125 }, { "epoch": 0.005635752422384813, "step": 57, "train/total_loss": 0.45208740234375 }, { "entropy": 6.747868061065674, "epoch": 0.005734625271900337, "mean_token_accuracy": 0.7036224007606506, "num_tokens": 306899.0, "step": 58, "train/ce_loss": 1.1025996208190918 }, { "epoch": 0.005734625271900337, "step": 58, "train/sim_loss": 0.2265625 }, { "epoch": 0.005734625271900337, "step": 58, "train/total_loss": 0.3368224501609802 }, { "entropy": 7.568367958068848, "epoch": 0.0058334981214158595, "mean_token_accuracy": 0.6982248425483704, "num_tokens": 311618.0, "step": 59, "train/ce_loss": 0.13859635591506958 }, { "epoch": 0.0058334981214158595, "step": 59, "train/sim_loss": 0.2265625 }, { "epoch": 0.0058334981214158595, "step": 59, "train/total_loss": 0.24042212963104248 }, { "epoch": 0.005932370970931382, "grad_norm": 1.4971586465835571, "learning_rate": 9.987885081343026e-06, "loss": 0.3521, "step": 60 }, { "entropy": 6.701387882232666, "epoch": 0.005932370970931382, "mean_token_accuracy": 0.6508380174636841, "num_tokens": 316843.0, "step": 60, "train/ce_loss": 1.0780704021453857 }, { "epoch": 0.005932370970931382, "step": 60, "train/sim_loss": 0.21875 }, { "epoch": 0.005932370970931382, "step": 60, "train/total_loss": 0.3265570402145386 }, { "entropy": 7.042557239532471, "epoch": 0.006031243820446905, "mean_token_accuracy": 0.7535853981971741, "num_tokens": 322073.0, "step": 61, "train/ce_loss": 0.8363357186317444 }, { "epoch": 0.006031243820446905, "step": 61, "train/sim_loss": 0.21875 }, { "epoch": 0.006031243820446905, "step": 61, "train/total_loss": 0.30238357186317444 }, { "entropy": 7.509478569030762, "epoch": 0.006130116669962428, "mean_token_accuracy": 0.7476979494094849, "num_tokens": 327050.0, "step": 62, "train/ce_loss": 1.542819857597351 }, { "epoch": 0.006130116669962428, "step": 62, "train/sim_loss": 0.21875 }, { "epoch": 0.006130116669962428, "step": 62, "train/total_loss": 0.37303197383880615 }, { "entropy": 7.262565612792969, "epoch": 0.006228989519477951, "mean_token_accuracy": 0.7211428284645081, "num_tokens": 332360.0, "step": 63, "train/ce_loss": 0.8255243897438049 }, { "epoch": 0.006228989519477951, "step": 63, "train/sim_loss": 0.2734375 }, { "epoch": 0.006228989519477951, "step": 63, "train/total_loss": 0.355989933013916 }, { "entropy": 7.196902751922607, "epoch": 0.006327862368993475, "mean_token_accuracy": 0.7398772835731506, "num_tokens": 337688.0, "step": 64, "train/ce_loss": 0.8732907176017761 }, { "epoch": 0.006327862368993475, "step": 64, "train/sim_loss": 0.20703125 }, { "epoch": 0.006327862368993475, "step": 64, "train/total_loss": 0.29436033964157104 }, { "entropy": 6.966497421264648, "epoch": 0.006426735218508998, "mean_token_accuracy": 0.7328431606292725, "num_tokens": 342964.0, "step": 65, "train/ce_loss": 1.7824984788894653 }, { "epoch": 0.006426735218508998, "step": 65, "train/sim_loss": 0.20703125 }, { "epoch": 0.006426735218508998, "step": 65, "train/total_loss": 0.3852810859680176 }, { "entropy": 6.835407257080078, "epoch": 0.0065256080680245205, "mean_token_accuracy": 0.7473170757293701, "num_tokens": 348451.0, "step": 66, "train/ce_loss": 0.9521581530570984 }, { "epoch": 0.0065256080680245205, "step": 66, "train/sim_loss": 0.21875 }, { "epoch": 0.0065256080680245205, "step": 66, "train/total_loss": 0.3139658272266388 }, { "entropy": 7.106851100921631, "epoch": 0.006624480917540043, "mean_token_accuracy": 0.7011128664016724, "num_tokens": 353568.0, "step": 67, "train/ce_loss": 2.121598243713379 }, { "epoch": 0.006624480917540043, "step": 67, "train/sim_loss": 0.20703125 }, { "epoch": 0.006624480917540043, "step": 67, "train/total_loss": 0.41919106245040894 }, { "entropy": 7.332256317138672, "epoch": 0.006723353767055566, "mean_token_accuracy": 0.7397590279579163, "num_tokens": 358854.0, "step": 68, "train/ce_loss": 0.9141528010368347 }, { "epoch": 0.006723353767055566, "step": 68, "train/sim_loss": 0.1953125 }, { "epoch": 0.006723353767055566, "step": 68, "train/total_loss": 0.28672778606414795 }, { "entropy": 7.315262317657471, "epoch": 0.00682222661657109, "mean_token_accuracy": 0.6781250238418579, "num_tokens": 363938.0, "step": 69, "train/ce_loss": 1.1206138134002686 }, { "epoch": 0.00682222661657109, "step": 69, "train/sim_loss": 0.265625 }, { "epoch": 0.00682222661657109, "step": 69, "train/total_loss": 0.37768638134002686 }, { "entropy": 7.905448913574219, "epoch": 0.006921099466086613, "mean_token_accuracy": 0.70243901014328, "num_tokens": 368995.0, "step": 70, "train/ce_loss": 0.7935793399810791 }, { "epoch": 0.006921099466086613, "step": 70, "train/sim_loss": 0.2109375 }, { "epoch": 0.006921099466086613, "step": 70, "train/total_loss": 0.29029542207717896 }, { "entropy": 7.295677185058594, "epoch": 0.007019972315602136, "mean_token_accuracy": 0.6715328693389893, "num_tokens": 374234.0, "step": 71, "train/ce_loss": 1.6432149410247803 }, { "epoch": 0.007019972315602136, "step": 71, "train/sim_loss": 0.32421875 }, { "epoch": 0.007019972315602136, "step": 71, "train/total_loss": 0.4885402321815491 }, { "entropy": 7.1108551025390625, "epoch": 0.007118845165117659, "mean_token_accuracy": 0.7644927501678467, "num_tokens": 379526.0, "step": 72, "train/ce_loss": 0.6361651420593262 }, { "epoch": 0.007118845165117659, "step": 72, "train/sim_loss": 0.19921875 }, { "epoch": 0.007118845165117659, "step": 72, "train/total_loss": 0.2628352642059326 }, { "entropy": 7.745475769042969, "epoch": 0.0072177180146331815, "mean_token_accuracy": 0.7246596217155457, "num_tokens": 384624.0, "step": 73, "train/ce_loss": 1.0735074281692505 }, { "epoch": 0.0072177180146331815, "step": 73, "train/sim_loss": 0.25 }, { "epoch": 0.0072177180146331815, "step": 73, "train/total_loss": 0.35735073685646057 }, { "entropy": 7.32032585144043, "epoch": 0.007316590864148704, "mean_token_accuracy": 0.699284017086029, "num_tokens": 389936.0, "step": 74, "train/ce_loss": 1.220793604850769 }, { "epoch": 0.007316590864148704, "step": 74, "train/sim_loss": 0.30078125 }, { "epoch": 0.007316590864148704, "step": 74, "train/total_loss": 0.42286062240600586 }, { "entropy": 7.601529121398926, "epoch": 0.007415463713664228, "mean_token_accuracy": 0.723192036151886, "num_tokens": 395205.0, "step": 75, "train/ce_loss": 1.2222076654434204 }, { "epoch": 0.007415463713664228, "step": 75, "train/sim_loss": 0.23046875 }, { "epoch": 0.007415463713664228, "step": 75, "train/total_loss": 0.3526895046234131 }, { "entropy": 7.278405666351318, "epoch": 0.007514336563179751, "mean_token_accuracy": 0.7372061014175415, "num_tokens": 400368.0, "step": 76, "train/ce_loss": 0.859329104423523 }, { "epoch": 0.007514336563179751, "step": 76, "train/sim_loss": 0.17578125 }, { "epoch": 0.007514336563179751, "step": 76, "train/total_loss": 0.2617141604423523 }, { "entropy": 7.346745491027832, "epoch": 0.007613209412695274, "mean_token_accuracy": 0.6926229596138, "num_tokens": 405502.0, "step": 77, "train/ce_loss": 0.06304176896810532 }, { "epoch": 0.007613209412695274, "step": 77, "train/sim_loss": 0.19140625 }, { "epoch": 0.007613209412695274, "step": 77, "train/total_loss": 0.19771042466163635 }, { "entropy": 7.988180160522461, "epoch": 0.007712082262210797, "mean_token_accuracy": 0.7505938410758972, "num_tokens": 410364.0, "step": 78, "train/ce_loss": 0.10073232650756836 }, { "epoch": 0.007712082262210797, "step": 78, "train/sim_loss": 0.18359375 }, { "epoch": 0.007712082262210797, "step": 78, "train/total_loss": 0.1936669796705246 }, { "entropy": 7.201304912567139, "epoch": 0.00781095511172632, "mean_token_accuracy": 0.724950909614563, "num_tokens": 415885.0, "step": 79, "train/ce_loss": 1.0036214590072632 }, { "epoch": 0.00781095511172632, "step": 79, "train/sim_loss": 0.20703125 }, { "epoch": 0.00781095511172632, "step": 79, "train/total_loss": 0.3073934018611908 }, { "epoch": 0.007909827961241843, "grad_norm": 0.834606409072876, "learning_rate": 9.982940216585078e-06, "loss": 0.3239, "step": 80 }, { "entropy": 7.307192802429199, "epoch": 0.007909827961241843, "mean_token_accuracy": 0.7760290503501892, "num_tokens": 421192.0, "step": 80, "train/ce_loss": 1.2137980461120605 }, { "epoch": 0.007909827961241843, "step": 80, "train/sim_loss": 0.28125 }, { "epoch": 0.007909827961241843, "step": 80, "train/total_loss": 0.4026297926902771 }, { "entropy": 6.885316848754883, "epoch": 0.008008700810757366, "mean_token_accuracy": 0.7960339784622192, "num_tokens": 426776.0, "step": 81, "train/ce_loss": 0.7223502397537231 }, { "epoch": 0.008008700810757366, "step": 81, "train/sim_loss": 0.19140625 }, { "epoch": 0.008008700810757366, "step": 81, "train/total_loss": 0.26364126801490784 }, { "entropy": 7.086678504943848, "epoch": 0.008107573660272888, "mean_token_accuracy": 0.7789784073829651, "num_tokens": 432456.0, "step": 82, "train/ce_loss": 0.7674393653869629 }, { "epoch": 0.008107573660272888, "step": 82, "train/sim_loss": 0.17578125 }, { "epoch": 0.008107573660272888, "step": 82, "train/total_loss": 0.2525251805782318 }, { "entropy": 8.099905967712402, "epoch": 0.008206446509788412, "mean_token_accuracy": 0.8287937641143799, "num_tokens": 437135.0, "step": 83, "train/ce_loss": 0.17125576734542847 }, { "epoch": 0.008206446509788412, "step": 83, "train/sim_loss": 0.1796875 }, { "epoch": 0.008206446509788412, "step": 83, "train/total_loss": 0.19681307673454285 }, { "entropy": 7.160604000091553, "epoch": 0.008305319359303936, "mean_token_accuracy": 0.7324613332748413, "num_tokens": 442472.0, "step": 84, "train/ce_loss": 0.8014112114906311 }, { "epoch": 0.008305319359303936, "step": 84, "train/sim_loss": 0.15625 }, { "epoch": 0.008305319359303936, "step": 84, "train/total_loss": 0.2363911271095276 }, { "entropy": 7.76218318939209, "epoch": 0.008404192208819458, "mean_token_accuracy": 0.7582237124443054, "num_tokens": 447559.0, "step": 85, "train/ce_loss": 0.9764944314956665 }, { "epoch": 0.008404192208819458, "step": 85, "train/sim_loss": 0.13671875 }, { "epoch": 0.008404192208819458, "step": 85, "train/total_loss": 0.2343682050704956 }, { "entropy": 8.122018814086914, "epoch": 0.008503065058334982, "mean_token_accuracy": 0.7223300933837891, "num_tokens": 452491.0, "step": 86, "train/ce_loss": 1.1845427751541138 }, { "epoch": 0.008503065058334982, "step": 86, "train/sim_loss": 0.1796875 }, { "epoch": 0.008503065058334982, "step": 86, "train/total_loss": 0.2981417775154114 }, { "entropy": 7.716374397277832, "epoch": 0.008601937907850504, "mean_token_accuracy": 0.7362499833106995, "num_tokens": 457972.0, "step": 87, "train/ce_loss": 0.9235560894012451 }, { "epoch": 0.008601937907850504, "step": 87, "train/sim_loss": 0.26171875 }, { "epoch": 0.008601937907850504, "step": 87, "train/total_loss": 0.3540743589401245 }, { "entropy": 7.842241287231445, "epoch": 0.008700810757366027, "mean_token_accuracy": 0.703797459602356, "num_tokens": 463188.0, "step": 88, "train/ce_loss": 1.2208952903747559 }, { "epoch": 0.008700810757366027, "step": 88, "train/sim_loss": 0.1953125 }, { "epoch": 0.008700810757366027, "step": 88, "train/total_loss": 0.31740203499794006 }, { "entropy": 7.5404253005981445, "epoch": 0.008799683606881551, "mean_token_accuracy": 0.7666666507720947, "num_tokens": 468311.0, "step": 89, "train/ce_loss": 1.0468343496322632 }, { "epoch": 0.008799683606881551, "step": 89, "train/sim_loss": 0.109375 }, { "epoch": 0.008799683606881551, "step": 89, "train/total_loss": 0.21405842900276184 }, { "entropy": 7.966272354125977, "epoch": 0.008898556456397073, "mean_token_accuracy": 0.6932907104492188, "num_tokens": 473025.0, "step": 90, "train/ce_loss": 0.13910500705242157 }, { "epoch": 0.008898556456397073, "step": 90, "train/sim_loss": 0.21484375 }, { "epoch": 0.008898556456397073, "step": 90, "train/total_loss": 0.22875425219535828 }, { "entropy": 7.517707824707031, "epoch": 0.008997429305912597, "mean_token_accuracy": 0.7390761375427246, "num_tokens": 478290.0, "step": 91, "train/ce_loss": 0.6987117528915405 }, { "epoch": 0.008997429305912597, "step": 91, "train/sim_loss": 0.1328125 }, { "epoch": 0.008997429305912597, "step": 91, "train/total_loss": 0.202683687210083 }, { "entropy": 7.53427791595459, "epoch": 0.009096302155428119, "mean_token_accuracy": 0.6783004403114319, "num_tokens": 483445.0, "step": 92, "train/ce_loss": 1.182236909866333 }, { "epoch": 0.009096302155428119, "step": 92, "train/sim_loss": 0.234375 }, { "epoch": 0.009096302155428119, "step": 92, "train/total_loss": 0.3525986969470978 }, { "entropy": 7.789042949676514, "epoch": 0.009195175004943643, "mean_token_accuracy": 0.7073529362678528, "num_tokens": 488538.0, "step": 93, "train/ce_loss": 0.06518832594156265 }, { "epoch": 0.009195175004943643, "step": 93, "train/sim_loss": 0.15234375 }, { "epoch": 0.009195175004943643, "step": 93, "train/total_loss": 0.15886257588863373 }, { "entropy": 7.619516372680664, "epoch": 0.009294047854459166, "mean_token_accuracy": 0.7179487347602844, "num_tokens": 493813.0, "step": 94, "train/ce_loss": 0.8823347687721252 }, { "epoch": 0.009294047854459166, "step": 94, "train/sim_loss": 0.18359375 }, { "epoch": 0.009294047854459166, "step": 94, "train/total_loss": 0.27182722091674805 }, { "entropy": 7.926183700561523, "epoch": 0.009392920703974688, "mean_token_accuracy": 0.7148703932762146, "num_tokens": 498978.0, "step": 95, "train/ce_loss": 0.8853976130485535 }, { "epoch": 0.009392920703974688, "step": 95, "train/sim_loss": 0.171875 }, { "epoch": 0.009392920703974688, "step": 95, "train/total_loss": 0.2604147791862488 }, { "entropy": 8.150131225585938, "epoch": 0.009491793553490212, "mean_token_accuracy": 0.7428571581840515, "num_tokens": 504097.0, "step": 96, "train/ce_loss": 0.6387143731117249 }, { "epoch": 0.009491793553490212, "step": 96, "train/sim_loss": 0.17578125 }, { "epoch": 0.009491793553490212, "step": 96, "train/total_loss": 0.23965269327163696 }, { "entropy": 7.242536544799805, "epoch": 0.009590666403005734, "mean_token_accuracy": 0.752212405204773, "num_tokens": 509554.0, "step": 97, "train/ce_loss": 0.8059234023094177 }, { "epoch": 0.009590666403005734, "step": 97, "train/sim_loss": 0.2421875 }, { "epoch": 0.009590666403005734, "step": 97, "train/total_loss": 0.3227798342704773 }, { "entropy": 7.3880696296691895, "epoch": 0.009689539252521258, "mean_token_accuracy": 0.724952757358551, "num_tokens": 515123.0, "step": 98, "train/ce_loss": 1.176647424697876 }, { "epoch": 0.009689539252521258, "step": 98, "train/sim_loss": 0.203125 }, { "epoch": 0.009689539252521258, "step": 98, "train/total_loss": 0.32078975439071655 }, { "entropy": 8.349966049194336, "epoch": 0.00978841210203678, "mean_token_accuracy": 0.749588131904602, "num_tokens": 520171.0, "step": 99, "train/ce_loss": 0.06600559502840042 }, { "epoch": 0.00978841210203678, "step": 99, "train/sim_loss": 0.203125 }, { "epoch": 0.00978841210203678, "step": 99, "train/total_loss": 0.20972555875778198 }, { "epoch": 0.009887284951552304, "grad_norm": 1.0337260961532593, "learning_rate": 9.977995351827128e-06, "loss": 0.2901, "step": 100 }, { "entropy": 8.475637435913086, "epoch": 0.009887284951552304, "mean_token_accuracy": 0.6748120188713074, "num_tokens": 525179.0, "step": 100, "train/ce_loss": 0.07647012174129486 }, { "epoch": 0.009887284951552304, "step": 100, "train/sim_loss": 0.1796875 }, { "epoch": 0.009887284951552304, "step": 100, "train/total_loss": 0.18733450770378113 }, { "entropy": 8.2037353515625, "epoch": 0.009986157801067827, "mean_token_accuracy": 0.7037037014961243, "num_tokens": 530303.0, "step": 101, "train/ce_loss": 0.7040248513221741 }, { "epoch": 0.009986157801067827, "step": 101, "train/sim_loss": 0.16796875 }, { "epoch": 0.009986157801067827, "step": 101, "train/total_loss": 0.23837123811244965 }, { "entropy": 8.168619155883789, "epoch": 0.01008503065058335, "mean_token_accuracy": 0.7303797602653503, "num_tokens": 535507.0, "step": 102, "train/ce_loss": 0.7577227354049683 }, { "epoch": 0.01008503065058335, "step": 102, "train/sim_loss": 0.18359375 }, { "epoch": 0.01008503065058335, "step": 102, "train/total_loss": 0.2593660354614258 }, { "entropy": 8.024353981018066, "epoch": 0.010183903500098873, "mean_token_accuracy": 0.6881720423698425, "num_tokens": 540582.0, "step": 103, "train/ce_loss": 1.9404637813568115 }, { "epoch": 0.010183903500098873, "step": 103, "train/sim_loss": 0.2265625 }, { "epoch": 0.010183903500098873, "step": 103, "train/total_loss": 0.42060887813568115 }, { "entropy": 8.253752708435059, "epoch": 0.010282776349614395, "mean_token_accuracy": 0.7130434513092041, "num_tokens": 545677.0, "step": 104, "train/ce_loss": 0.05784047022461891 }, { "epoch": 0.010282776349614395, "step": 104, "train/sim_loss": 0.140625 }, { "epoch": 0.010282776349614395, "step": 104, "train/total_loss": 0.1464090496301651 }, { "entropy": 7.711334705352783, "epoch": 0.010381649199129919, "mean_token_accuracy": 0.7691428661346436, "num_tokens": 551034.0, "step": 105, "train/ce_loss": 1.0411088466644287 }, { "epoch": 0.010381649199129919, "step": 105, "train/sim_loss": 0.18359375 }, { "epoch": 0.010381649199129919, "step": 105, "train/total_loss": 0.2877046465873718 }, { "entropy": 7.441492080688477, "epoch": 0.010480522048645443, "mean_token_accuracy": 0.6875, "num_tokens": 556413.0, "step": 106, "train/ce_loss": 0.6729193329811096 }, { "epoch": 0.010480522048645443, "step": 106, "train/sim_loss": 0.1640625 }, { "epoch": 0.010480522048645443, "step": 106, "train/total_loss": 0.23135444521903992 }, { "entropy": 7.938560962677002, "epoch": 0.010579394898160965, "mean_token_accuracy": 0.6752910614013672, "num_tokens": 561628.0, "step": 107, "train/ce_loss": 1.1121141910552979 }, { "epoch": 0.010579394898160965, "step": 107, "train/sim_loss": 0.15625 }, { "epoch": 0.010579394898160965, "step": 107, "train/total_loss": 0.2674614191055298 }, { "entropy": 7.696544170379639, "epoch": 0.010678267747676488, "mean_token_accuracy": 0.7175732254981995, "num_tokens": 567069.0, "step": 108, "train/ce_loss": 0.9015482068061829 }, { "epoch": 0.010678267747676488, "step": 108, "train/sim_loss": 0.109375 }, { "epoch": 0.010678267747676488, "step": 108, "train/total_loss": 0.19952982664108276 }, { "entropy": 8.239730834960938, "epoch": 0.01077714059719201, "mean_token_accuracy": 0.7056276798248291, "num_tokens": 572228.0, "step": 109, "train/ce_loss": 0.8910127282142639 }, { "epoch": 0.01077714059719201, "step": 109, "train/sim_loss": 0.1875 }, { "epoch": 0.01077714059719201, "step": 109, "train/total_loss": 0.27660128474235535 }, { "entropy": 8.282722473144531, "epoch": 0.010876013446707534, "mean_token_accuracy": 0.711757242679596, "num_tokens": 577446.0, "step": 110, "train/ce_loss": 1.4429800510406494 }, { "epoch": 0.010876013446707534, "step": 110, "train/sim_loss": 0.234375 }, { "epoch": 0.010876013446707534, "step": 110, "train/total_loss": 0.3786730170249939 }, { "entropy": 8.14548110961914, "epoch": 0.010974886296223058, "mean_token_accuracy": 0.73591548204422, "num_tokens": 582743.0, "step": 111, "train/ce_loss": 1.082576036453247 }, { "epoch": 0.010974886296223058, "step": 111, "train/sim_loss": 0.20703125 }, { "epoch": 0.010974886296223058, "step": 111, "train/total_loss": 0.31528884172439575 }, { "entropy": 8.513275146484375, "epoch": 0.01107375914573858, "mean_token_accuracy": 0.7968127727508545, "num_tokens": 587669.0, "step": 112, "train/ce_loss": 0.08766748011112213 }, { "epoch": 0.01107375914573858, "step": 112, "train/sim_loss": 0.1171875 }, { "epoch": 0.01107375914573858, "step": 112, "train/total_loss": 0.12595424056053162 }, { "entropy": 8.212099075317383, "epoch": 0.011172631995254104, "mean_token_accuracy": 0.7019002437591553, "num_tokens": 592930.0, "step": 113, "train/ce_loss": 0.9837558269500732 }, { "epoch": 0.011172631995254104, "step": 113, "train/sim_loss": 0.171875 }, { "epoch": 0.011172631995254104, "step": 113, "train/total_loss": 0.2702505886554718 }, { "entropy": 7.418394088745117, "epoch": 0.011271504844769626, "mean_token_accuracy": 0.7642626762390137, "num_tokens": 598372.0, "step": 114, "train/ce_loss": 0.8529636859893799 }, { "epoch": 0.011271504844769626, "step": 114, "train/sim_loss": 0.1796875 }, { "epoch": 0.011271504844769626, "step": 114, "train/total_loss": 0.2649838626384735 }, { "entropy": 8.68088436126709, "epoch": 0.01137037769428515, "mean_token_accuracy": 0.6915887594223022, "num_tokens": 603489.0, "step": 115, "train/ce_loss": 0.07375526428222656 }, { "epoch": 0.01137037769428515, "step": 115, "train/sim_loss": 0.14453125 }, { "epoch": 0.01137037769428515, "step": 115, "train/total_loss": 0.15190677344799042 }, { "entropy": 8.23654556274414, "epoch": 0.011469250543800673, "mean_token_accuracy": 0.6918158531188965, "num_tokens": 608730.0, "step": 116, "train/ce_loss": 1.41244375705719 }, { "epoch": 0.011469250543800673, "step": 116, "train/sim_loss": 0.26953125 }, { "epoch": 0.011469250543800673, "step": 116, "train/total_loss": 0.41077563166618347 }, { "entropy": 8.532476425170898, "epoch": 0.011568123393316195, "mean_token_accuracy": 0.7419928908348083, "num_tokens": 613721.0, "step": 117, "train/ce_loss": 0.07354797422885895 }, { "epoch": 0.011568123393316195, "step": 117, "train/sim_loss": 0.1484375 }, { "epoch": 0.011568123393316195, "step": 117, "train/total_loss": 0.15579229593276978 }, { "entropy": 8.25015640258789, "epoch": 0.011666996242831719, "mean_token_accuracy": 0.6930572390556335, "num_tokens": 618995.0, "step": 118, "train/ce_loss": 0.7497038841247559 }, { "epoch": 0.011666996242831719, "step": 118, "train/sim_loss": 0.16796875 }, { "epoch": 0.011666996242831719, "step": 118, "train/total_loss": 0.24293914437294006 }, { "entropy": 7.956493377685547, "epoch": 0.011765869092347241, "mean_token_accuracy": 0.7132115960121155, "num_tokens": 624420.0, "step": 119, "train/ce_loss": 0.7325726747512817 }, { "epoch": 0.011765869092347241, "step": 119, "train/sim_loss": 0.1484375 }, { "epoch": 0.011765869092347241, "step": 119, "train/total_loss": 0.22169476747512817 }, { "epoch": 0.011864741941862765, "grad_norm": 0.8489806056022644, "learning_rate": 9.973050487069179e-06, "loss": 0.2843, "step": 120 }, { "entropy": 7.759334564208984, "epoch": 0.011864741941862765, "mean_token_accuracy": 0.7204301357269287, "num_tokens": 629755.0, "step": 120, "train/ce_loss": 0.5303604006767273 }, { "epoch": 0.011864741941862765, "step": 120, "train/sim_loss": 0.1875 }, { "epoch": 0.011864741941862765, "step": 120, "train/total_loss": 0.24053603410720825 }, { "entropy": 8.225971221923828, "epoch": 0.011963614791378287, "mean_token_accuracy": 0.6958677768707275, "num_tokens": 634814.0, "step": 121, "train/ce_loss": 1.8621231317520142 }, { "epoch": 0.011963614791378287, "step": 121, "train/sim_loss": 0.36328125 }, { "epoch": 0.011963614791378287, "step": 121, "train/total_loss": 0.5494935512542725 }, { "entropy": 8.129207611083984, "epoch": 0.01206248764089381, "mean_token_accuracy": 0.7058823704719543, "num_tokens": 640095.0, "step": 122, "train/ce_loss": 0.8336842656135559 }, { "epoch": 0.01206248764089381, "step": 122, "train/sim_loss": 0.15234375 }, { "epoch": 0.01206248764089381, "step": 122, "train/total_loss": 0.2357121706008911 }, { "entropy": 8.380830764770508, "epoch": 0.012161360490409334, "mean_token_accuracy": 0.7931034564971924, "num_tokens": 645132.0, "step": 123, "train/ce_loss": 1.075549840927124 }, { "epoch": 0.012161360490409334, "step": 123, "train/sim_loss": 0.15625 }, { "epoch": 0.012161360490409334, "step": 123, "train/total_loss": 0.26380497217178345 }, { "entropy": 8.702407836914062, "epoch": 0.012260233339924856, "mean_token_accuracy": 0.6796460151672363, "num_tokens": 650314.0, "step": 124, "train/ce_loss": 0.9473351836204529 }, { "epoch": 0.012260233339924856, "step": 124, "train/sim_loss": 0.19921875 }, { "epoch": 0.012260233339924856, "step": 124, "train/total_loss": 0.2939522862434387 }, { "entropy": 8.119245529174805, "epoch": 0.01235910618944038, "mean_token_accuracy": 0.773099422454834, "num_tokens": 655612.0, "step": 125, "train/ce_loss": 0.8545771837234497 }, { "epoch": 0.01235910618944038, "step": 125, "train/sim_loss": 0.2421875 }, { "epoch": 0.01235910618944038, "step": 125, "train/total_loss": 0.3276452124118805 }, { "entropy": 8.00175666809082, "epoch": 0.012457979038955902, "mean_token_accuracy": 0.7394366264343262, "num_tokens": 660934.0, "step": 126, "train/ce_loss": 1.1516802310943604 }, { "epoch": 0.012457979038955902, "step": 126, "train/sim_loss": 0.234375 }, { "epoch": 0.012457979038955902, "step": 126, "train/total_loss": 0.349543035030365 }, { "entropy": 8.274015426635742, "epoch": 0.012556851888471426, "mean_token_accuracy": 0.7601156234741211, "num_tokens": 666080.0, "step": 127, "train/ce_loss": 0.6554973125457764 }, { "epoch": 0.012556851888471426, "step": 127, "train/sim_loss": 0.2109375 }, { "epoch": 0.012556851888471426, "step": 127, "train/total_loss": 0.27648723125457764 }, { "entropy": 8.076434135437012, "epoch": 0.01265572473798695, "mean_token_accuracy": 0.6938519477844238, "num_tokens": 671277.0, "step": 128, "train/ce_loss": 1.0039684772491455 }, { "epoch": 0.01265572473798695, "step": 128, "train/sim_loss": 0.22265625 }, { "epoch": 0.01265572473798695, "step": 128, "train/total_loss": 0.3230530917644501 }, { "entropy": 7.593088626861572, "epoch": 0.012754597587502471, "mean_token_accuracy": 0.6751986145973206, "num_tokens": 676804.0, "step": 129, "train/ce_loss": 1.3502253293991089 }, { "epoch": 0.012754597587502471, "step": 129, "train/sim_loss": 0.20703125 }, { "epoch": 0.012754597587502471, "step": 129, "train/total_loss": 0.34205377101898193 }, { "entropy": 8.646316528320312, "epoch": 0.012853470437017995, "mean_token_accuracy": 0.7296819686889648, "num_tokens": 681786.0, "step": 130, "train/ce_loss": 1.2546935081481934 }, { "epoch": 0.012853470437017995, "step": 130, "train/sim_loss": 0.10546875 }, { "epoch": 0.012853470437017995, "step": 130, "train/total_loss": 0.2309381067752838 }, { "entropy": 7.9116291999816895, "epoch": 0.012952343286533517, "mean_token_accuracy": 0.7299435138702393, "num_tokens": 687168.0, "step": 131, "train/ce_loss": 1.1150727272033691 }, { "epoch": 0.012952343286533517, "step": 131, "train/sim_loss": 0.09375 }, { "epoch": 0.012952343286533517, "step": 131, "train/total_loss": 0.20525726675987244 }, { "entropy": 8.308708190917969, "epoch": 0.013051216136049041, "mean_token_accuracy": 0.6818181872367859, "num_tokens": 692372.0, "step": 132, "train/ce_loss": 1.6832152605056763 }, { "epoch": 0.013051216136049041, "step": 132, "train/sim_loss": 0.2578125 }, { "epoch": 0.013051216136049041, "step": 132, "train/total_loss": 0.42613404989242554 }, { "entropy": 8.511515617370605, "epoch": 0.013150088985564565, "mean_token_accuracy": 0.7616000175476074, "num_tokens": 697466.0, "step": 133, "train/ce_loss": 1.3688126802444458 }, { "epoch": 0.013150088985564565, "step": 133, "train/sim_loss": 0.24609375 }, { "epoch": 0.013150088985564565, "step": 133, "train/total_loss": 0.3829750418663025 }, { "entropy": 8.20486068725586, "epoch": 0.013248961835080087, "mean_token_accuracy": 0.7224118113517761, "num_tokens": 702816.0, "step": 134, "train/ce_loss": 0.690566897392273 }, { "epoch": 0.013248961835080087, "step": 134, "train/sim_loss": 0.15234375 }, { "epoch": 0.013248961835080087, "step": 134, "train/total_loss": 0.2214004397392273 }, { "entropy": 8.287908554077148, "epoch": 0.01334783468459561, "mean_token_accuracy": 0.6638772487640381, "num_tokens": 708006.0, "step": 135, "train/ce_loss": 1.7614285945892334 }, { "epoch": 0.01334783468459561, "step": 135, "train/sim_loss": 0.23046875 }, { "epoch": 0.01334783468459561, "step": 135, "train/total_loss": 0.4066116213798523 }, { "entropy": 8.253705978393555, "epoch": 0.013446707534111133, "mean_token_accuracy": 0.7321041226387024, "num_tokens": 713369.0, "step": 136, "train/ce_loss": 1.069230079650879 }, { "epoch": 0.013446707534111133, "step": 136, "train/sim_loss": 0.16796875 }, { "epoch": 0.013446707534111133, "step": 136, "train/total_loss": 0.27489176392555237 }, { "entropy": 8.739953994750977, "epoch": 0.013545580383626656, "mean_token_accuracy": 0.7504000067710876, "num_tokens": 718456.0, "step": 137, "train/ce_loss": 0.06112891063094139 }, { "epoch": 0.013545580383626656, "step": 137, "train/sim_loss": 0.14453125 }, { "epoch": 0.013545580383626656, "step": 137, "train/total_loss": 0.15064413845539093 }, { "entropy": 7.99261474609375, "epoch": 0.01364445323314218, "mean_token_accuracy": 0.7210884094238281, "num_tokens": 724037.0, "step": 138, "train/ce_loss": 0.8059151768684387 }, { "epoch": 0.01364445323314218, "step": 138, "train/sim_loss": 0.20703125 }, { "epoch": 0.01364445323314218, "step": 138, "train/total_loss": 0.2876227796077728 }, { "entropy": 8.674912452697754, "epoch": 0.013743326082657702, "mean_token_accuracy": 0.6875981092453003, "num_tokens": 729068.0, "step": 139, "train/ce_loss": 0.06736356765031815 }, { "epoch": 0.013743326082657702, "step": 139, "train/sim_loss": 0.1171875 }, { "epoch": 0.013743326082657702, "step": 139, "train/total_loss": 0.12392385303974152 }, { "epoch": 0.013842198932173226, "grad_norm": 0.956753134727478, "learning_rate": 9.968105622311231e-06, "loss": 0.2816, "step": 140 }, { "entropy": 8.696903228759766, "epoch": 0.013842198932173226, "mean_token_accuracy": 0.723796010017395, "num_tokens": 734252.0, "step": 140, "train/ce_loss": 1.2009042501449585 }, { "epoch": 0.013842198932173226, "step": 140, "train/sim_loss": 0.18359375 }, { "epoch": 0.013842198932173226, "step": 140, "train/total_loss": 0.30368417501449585 }, { "entropy": 8.498991966247559, "epoch": 0.013941071781688748, "mean_token_accuracy": 0.6803030371665955, "num_tokens": 739342.0, "step": 141, "train/ce_loss": 1.6696449518203735 }, { "epoch": 0.013941071781688748, "step": 141, "train/sim_loss": 0.140625 }, { "epoch": 0.013941071781688748, "step": 141, "train/total_loss": 0.30758950114250183 }, { "entropy": 8.561853408813477, "epoch": 0.014039944631204272, "mean_token_accuracy": 0.7418967485427856, "num_tokens": 744599.0, "step": 142, "train/ce_loss": 1.0845693349838257 }, { "epoch": 0.014039944631204272, "step": 142, "train/sim_loss": 0.21875 }, { "epoch": 0.014039944631204272, "step": 142, "train/total_loss": 0.32720693945884705 }, { "entropy": 9.196142196655273, "epoch": 0.014138817480719794, "mean_token_accuracy": 0.690773069858551, "num_tokens": 749401.0, "step": 143, "train/ce_loss": 0.10551132261753082 }, { "epoch": 0.014138817480719794, "step": 143, "train/sim_loss": 0.08984375 }, { "epoch": 0.014138817480719794, "step": 143, "train/total_loss": 0.10039488226175308 }, { "entropy": 8.558753967285156, "epoch": 0.014237690330235317, "mean_token_accuracy": 0.7275362610816956, "num_tokens": 754544.0, "step": 144, "train/ce_loss": 0.8957123160362244 }, { "epoch": 0.014237690330235317, "step": 144, "train/sim_loss": 0.2109375 }, { "epoch": 0.014237690330235317, "step": 144, "train/total_loss": 0.3005087375640869 }, { "entropy": 8.242622375488281, "epoch": 0.014336563179750841, "mean_token_accuracy": 0.7183406352996826, "num_tokens": 759930.0, "step": 145, "train/ce_loss": 0.9891186952590942 }, { "epoch": 0.014336563179750841, "step": 145, "train/sim_loss": 0.234375 }, { "epoch": 0.014336563179750841, "step": 145, "train/total_loss": 0.3332868814468384 }, { "entropy": 8.081424713134766, "epoch": 0.014435436029266363, "mean_token_accuracy": 0.7152941226959229, "num_tokens": 765262.0, "step": 146, "train/ce_loss": 0.8288601040840149 }, { "epoch": 0.014435436029266363, "step": 146, "train/sim_loss": 0.1796875 }, { "epoch": 0.014435436029266363, "step": 146, "train/total_loss": 0.2625735104084015 }, { "entropy": 8.312261581420898, "epoch": 0.014534308878781887, "mean_token_accuracy": 0.7093153595924377, "num_tokens": 770605.0, "step": 147, "train/ce_loss": 0.94538414478302 }, { "epoch": 0.014534308878781887, "step": 147, "train/sim_loss": 0.203125 }, { "epoch": 0.014534308878781887, "step": 147, "train/total_loss": 0.2976634204387665 }, { "entropy": 8.404376029968262, "epoch": 0.014633181728297409, "mean_token_accuracy": 0.7289837002754211, "num_tokens": 775906.0, "step": 148, "train/ce_loss": 1.1064625978469849 }, { "epoch": 0.014633181728297409, "step": 148, "train/sim_loss": 0.21484375 }, { "epoch": 0.014633181728297409, "step": 148, "train/total_loss": 0.32548999786376953 }, { "entropy": 9.281839370727539, "epoch": 0.014732054577812933, "mean_token_accuracy": 0.6978922486305237, "num_tokens": 780756.0, "step": 149, "train/ce_loss": 0.09931197762489319 }, { "epoch": 0.014732054577812933, "step": 149, "train/sim_loss": 0.078125 }, { "epoch": 0.014732054577812933, "step": 149, "train/total_loss": 0.08805619925260544 }, { "entropy": 8.621355056762695, "epoch": 0.014830927427328456, "mean_token_accuracy": 0.7159841656684875, "num_tokens": 785977.0, "step": 150, "train/ce_loss": 0.6905444860458374 }, { "epoch": 0.014830927427328456, "step": 150, "train/sim_loss": 0.17578125 }, { "epoch": 0.014830927427328456, "step": 150, "train/total_loss": 0.24483570456504822 }, { "entropy": 8.721384048461914, "epoch": 0.014929800276843978, "mean_token_accuracy": 0.7140864729881287, "num_tokens": 791153.0, "step": 151, "train/ce_loss": 0.8488461375236511 }, { "epoch": 0.014929800276843978, "step": 151, "train/sim_loss": 0.1328125 }, { "epoch": 0.014929800276843978, "step": 151, "train/total_loss": 0.2176971137523651 }, { "entropy": 8.786897659301758, "epoch": 0.015028673126359502, "mean_token_accuracy": 0.6991018056869507, "num_tokens": 796262.0, "step": 152, "train/ce_loss": 1.2345761060714722 }, { "epoch": 0.015028673126359502, "step": 152, "train/sim_loss": 0.2109375 }, { "epoch": 0.015028673126359502, "step": 152, "train/total_loss": 0.3343951106071472 }, { "entropy": 8.233671188354492, "epoch": 0.015127545975875024, "mean_token_accuracy": 0.7185430526733398, "num_tokens": 801682.0, "step": 153, "train/ce_loss": 1.0754245519638062 }, { "epoch": 0.015127545975875024, "step": 153, "train/sim_loss": 0.2109375 }, { "epoch": 0.015127545975875024, "step": 153, "train/total_loss": 0.3184799551963806 }, { "entropy": 8.390192985534668, "epoch": 0.015226418825390548, "mean_token_accuracy": 0.725400447845459, "num_tokens": 806996.0, "step": 154, "train/ce_loss": 0.7750483751296997 }, { "epoch": 0.015226418825390548, "step": 154, "train/sim_loss": 0.1875 }, { "epoch": 0.015226418825390548, "step": 154, "train/total_loss": 0.26500484347343445 }, { "entropy": 8.63515853881836, "epoch": 0.015325291674906072, "mean_token_accuracy": 0.7489655017852783, "num_tokens": 812174.0, "step": 155, "train/ce_loss": 1.156675934791565 }, { "epoch": 0.015325291674906072, "step": 155, "train/sim_loss": 0.2109375 }, { "epoch": 0.015325291674906072, "step": 155, "train/total_loss": 0.32660508155822754 }, { "entropy": 8.505970001220703, "epoch": 0.015424164524421594, "mean_token_accuracy": 0.7043847441673279, "num_tokens": 817404.0, "step": 156, "train/ce_loss": 1.3409483432769775 }, { "epoch": 0.015424164524421594, "step": 156, "train/sim_loss": 0.16796875 }, { "epoch": 0.015424164524421594, "step": 156, "train/total_loss": 0.30206358432769775 }, { "entropy": 8.562335968017578, "epoch": 0.015523037373937117, "mean_token_accuracy": 0.703903079032898, "num_tokens": 822586.0, "step": 157, "train/ce_loss": 0.9353137612342834 }, { "epoch": 0.015523037373937117, "step": 157, "train/sim_loss": 0.1171875 }, { "epoch": 0.015523037373937117, "step": 157, "train/total_loss": 0.21071887016296387 }, { "entropy": 8.562350273132324, "epoch": 0.01562191022345264, "mean_token_accuracy": 0.7104825377464294, "num_tokens": 827639.0, "step": 158, "train/ce_loss": 2.006349802017212 }, { "epoch": 0.01562191022345264, "step": 158, "train/sim_loss": 0.21875 }, { "epoch": 0.01562191022345264, "step": 158, "train/total_loss": 0.41938498616218567 }, { "entropy": 8.904520034790039, "epoch": 0.01572078307296816, "mean_token_accuracy": 0.7043235898017883, "num_tokens": 832831.0, "step": 159, "train/ce_loss": 0.9618018865585327 }, { "epoch": 0.01572078307296816, "step": 159, "train/sim_loss": 0.140625 }, { "epoch": 0.01572078307296816, "step": 159, "train/total_loss": 0.23680520057678223 }, { "epoch": 0.015819655922483685, "grad_norm": 1.015324592590332, "learning_rate": 9.963160757553282e-06, "loss": 0.2759, "step": 160 }, { "entropy": 8.706175804138184, "epoch": 0.015819655922483685, "mean_token_accuracy": 0.6988416910171509, "num_tokens": 838103.0, "step": 160, "train/ce_loss": 0.5171559453010559 }, { "epoch": 0.015819655922483685, "step": 160, "train/sim_loss": 0.15234375 }, { "epoch": 0.015819655922483685, "step": 160, "train/total_loss": 0.20405934751033783 }, { "entropy": 8.741148948669434, "epoch": 0.01591852877199921, "mean_token_accuracy": 0.7098930478096008, "num_tokens": 843328.0, "step": 161, "train/ce_loss": 1.5813908576965332 }, { "epoch": 0.01591852877199921, "step": 161, "train/sim_loss": 0.07421875 }, { "epoch": 0.01591852877199921, "step": 161, "train/total_loss": 0.23235784471035004 }, { "entropy": 8.80101203918457, "epoch": 0.016017401621514733, "mean_token_accuracy": 0.7891246676445007, "num_tokens": 848505.0, "step": 162, "train/ce_loss": 0.05155276134610176 }, { "epoch": 0.016017401621514733, "step": 162, "train/sim_loss": 0.1328125 }, { "epoch": 0.016017401621514733, "step": 162, "train/total_loss": 0.1379677802324295 }, { "entropy": 9.00037956237793, "epoch": 0.016116274471030256, "mean_token_accuracy": 0.708737850189209, "num_tokens": 853575.0, "step": 163, "train/ce_loss": 0.06782928854227066 }, { "epoch": 0.016116274471030256, "step": 163, "train/sim_loss": 0.14453125 }, { "epoch": 0.016116274471030256, "step": 163, "train/total_loss": 0.15131418406963348 }, { "entropy": 8.507973670959473, "epoch": 0.016215147320545777, "mean_token_accuracy": 0.7311370968818665, "num_tokens": 858942.0, "step": 164, "train/ce_loss": 0.906366765499115 }, { "epoch": 0.016215147320545777, "step": 164, "train/sim_loss": 0.17578125 }, { "epoch": 0.016215147320545777, "step": 164, "train/total_loss": 0.266417920589447 }, { "entropy": 8.641292572021484, "epoch": 0.0163140201700613, "mean_token_accuracy": 0.6557788848876953, "num_tokens": 864259.0, "step": 165, "train/ce_loss": 1.7006336450576782 }, { "epoch": 0.0163140201700613, "step": 165, "train/sim_loss": 0.2265625 }, { "epoch": 0.0163140201700613, "step": 165, "train/total_loss": 0.3966258764266968 }, { "entropy": 8.167360305786133, "epoch": 0.016412893019576824, "mean_token_accuracy": 0.6848739385604858, "num_tokens": 869746.0, "step": 166, "train/ce_loss": 1.4542746543884277 }, { "epoch": 0.016412893019576824, "step": 166, "train/sim_loss": 0.23828125 }, { "epoch": 0.016412893019576824, "step": 166, "train/total_loss": 0.3837087154388428 }, { "entropy": 9.23829460144043, "epoch": 0.016511765869092348, "mean_token_accuracy": 0.6920473575592041, "num_tokens": 874724.0, "step": 167, "train/ce_loss": 0.07225502282381058 }, { "epoch": 0.016511765869092348, "step": 167, "train/sim_loss": 0.125 }, { "epoch": 0.016511765869092348, "step": 167, "train/total_loss": 0.13222549855709076 }, { "entropy": 8.727310180664062, "epoch": 0.01661063871860787, "mean_token_accuracy": 0.7001023292541504, "num_tokens": 880467.0, "step": 168, "train/ce_loss": 1.0050170421600342 }, { "epoch": 0.01661063871860787, "step": 168, "train/sim_loss": 0.16796875 }, { "epoch": 0.01661063871860787, "step": 168, "train/total_loss": 0.2684704661369324 }, { "entropy": 9.13823127746582, "epoch": 0.016709511568123392, "mean_token_accuracy": 0.7419354915618896, "num_tokens": 885447.0, "step": 169, "train/ce_loss": 0.07551628351211548 }, { "epoch": 0.016709511568123392, "step": 169, "train/sim_loss": 0.22265625 }, { "epoch": 0.016709511568123392, "step": 169, "train/total_loss": 0.2302078753709793 }, { "entropy": 8.848125457763672, "epoch": 0.016808384417638916, "mean_token_accuracy": 0.7138413786888123, "num_tokens": 890538.0, "step": 170, "train/ce_loss": 1.4403102397918701 }, { "epoch": 0.016808384417638916, "step": 170, "train/sim_loss": 0.1953125 }, { "epoch": 0.016808384417638916, "step": 170, "train/total_loss": 0.3393435478210449 }, { "entropy": 9.174699783325195, "epoch": 0.01690725726715444, "mean_token_accuracy": 0.7342256307601929, "num_tokens": 895528.0, "step": 171, "train/ce_loss": 1.0800552368164062 }, { "epoch": 0.01690725726715444, "step": 171, "train/sim_loss": 0.15234375 }, { "epoch": 0.01690725726715444, "step": 171, "train/total_loss": 0.2603492736816406 }, { "entropy": 8.827863693237305, "epoch": 0.017006130116669963, "mean_token_accuracy": 0.7016574740409851, "num_tokens": 900722.0, "step": 172, "train/ce_loss": 1.1682084798812866 }, { "epoch": 0.017006130116669963, "step": 172, "train/sim_loss": 0.12890625 }, { "epoch": 0.017006130116669963, "step": 172, "train/total_loss": 0.24572709202766418 }, { "entropy": 8.44694709777832, "epoch": 0.017105002966185487, "mean_token_accuracy": 0.6686217188835144, "num_tokens": 906192.0, "step": 173, "train/ce_loss": 1.4545676708221436 }, { "epoch": 0.017105002966185487, "step": 173, "train/sim_loss": 0.2265625 }, { "epoch": 0.017105002966185487, "step": 173, "train/total_loss": 0.37201929092407227 }, { "entropy": 9.067190170288086, "epoch": 0.017203875815701007, "mean_token_accuracy": 0.7195325493812561, "num_tokens": 911200.0, "step": 174, "train/ce_loss": 1.3222960233688354 }, { "epoch": 0.017203875815701007, "step": 174, "train/sim_loss": 0.21484375 }, { "epoch": 0.017203875815701007, "step": 174, "train/total_loss": 0.34707337617874146 }, { "entropy": 8.505998611450195, "epoch": 0.01730274866521653, "mean_token_accuracy": 0.7523029446601868, "num_tokens": 916678.0, "step": 175, "train/ce_loss": 0.8026020526885986 }, { "epoch": 0.01730274866521653, "step": 175, "train/sim_loss": 0.12109375 }, { "epoch": 0.01730274866521653, "step": 175, "train/total_loss": 0.20135396718978882 }, { "entropy": 8.75352954864502, "epoch": 0.017401621514732055, "mean_token_accuracy": 0.7118226885795593, "num_tokens": 921948.0, "step": 176, "train/ce_loss": 1.0283185243606567 }, { "epoch": 0.017401621514732055, "step": 176, "train/sim_loss": 0.1484375 }, { "epoch": 0.017401621514732055, "step": 176, "train/total_loss": 0.2512693405151367 }, { "entropy": 8.730976104736328, "epoch": 0.01750049436424758, "mean_token_accuracy": 0.727385401725769, "num_tokens": 927247.0, "step": 177, "train/ce_loss": 0.911525547504425 }, { "epoch": 0.01750049436424758, "step": 177, "train/sim_loss": 0.1171875 }, { "epoch": 0.01750049436424758, "step": 177, "train/total_loss": 0.20834004878997803 }, { "entropy": 8.64146614074707, "epoch": 0.017599367213763102, "mean_token_accuracy": 0.7541163563728333, "num_tokens": 932641.0, "step": 178, "train/ce_loss": 0.9116729497909546 }, { "epoch": 0.017599367213763102, "step": 178, "train/sim_loss": 0.1328125 }, { "epoch": 0.017599367213763102, "step": 178, "train/total_loss": 0.22397980093955994 }, { "entropy": 9.030397415161133, "epoch": 0.017698240063278622, "mean_token_accuracy": 0.7196261882781982, "num_tokens": 937755.0, "step": 179, "train/ce_loss": 0.7100560069084167 }, { "epoch": 0.017698240063278622, "step": 179, "train/sim_loss": 0.1484375 }, { "epoch": 0.017698240063278622, "step": 179, "train/total_loss": 0.21944311261177063 }, { "epoch": 0.017797112912794146, "grad_norm": 1.0556875467300415, "learning_rate": 9.958215892795334e-06, "loss": 0.2623, "step": 180 }, { "entropy": 9.004786491394043, "epoch": 0.017797112912794146, "mean_token_accuracy": 0.670976996421814, "num_tokens": 942893.0, "step": 180, "train/ce_loss": 1.569191575050354 }, { "epoch": 0.017797112912794146, "step": 180, "train/sim_loss": 0.16796875 }, { "epoch": 0.017797112912794146, "step": 180, "train/total_loss": 0.3248879313468933 }, { "entropy": 8.789143562316895, "epoch": 0.01789598576230967, "mean_token_accuracy": 0.7061281204223633, "num_tokens": 948108.0, "step": 181, "train/ce_loss": 0.8462908864021301 }, { "epoch": 0.01789598576230967, "step": 181, "train/sim_loss": 0.14453125 }, { "epoch": 0.01789598576230967, "step": 181, "train/total_loss": 0.229160338640213 }, { "entropy": 8.294668197631836, "epoch": 0.017994858611825194, "mean_token_accuracy": 0.7203980088233948, "num_tokens": 953603.0, "step": 182, "train/ce_loss": 1.0397672653198242 }, { "epoch": 0.017994858611825194, "step": 182, "train/sim_loss": 0.1328125 }, { "epoch": 0.017994858611825194, "step": 182, "train/total_loss": 0.23678922653198242 }, { "entropy": 8.335565567016602, "epoch": 0.018093731461340717, "mean_token_accuracy": 0.7104413509368896, "num_tokens": 959017.0, "step": 183, "train/ce_loss": 1.5085524320602417 }, { "epoch": 0.018093731461340717, "step": 183, "train/sim_loss": 0.2265625 }, { "epoch": 0.018093731461340717, "step": 183, "train/total_loss": 0.37741774320602417 }, { "entropy": 8.205455780029297, "epoch": 0.018192604310856238, "mean_token_accuracy": 0.7226027250289917, "num_tokens": 964411.0, "step": 184, "train/ce_loss": 0.7219122648239136 }, { "epoch": 0.018192604310856238, "step": 184, "train/sim_loss": 0.1953125 }, { "epoch": 0.018192604310856238, "step": 184, "train/total_loss": 0.2675037384033203 }, { "entropy": 9.04613971710205, "epoch": 0.01829147716037176, "mean_token_accuracy": 0.7063491940498352, "num_tokens": 969465.0, "step": 185, "train/ce_loss": 1.5215908288955688 }, { "epoch": 0.01829147716037176, "step": 185, "train/sim_loss": 0.17578125 }, { "epoch": 0.01829147716037176, "step": 185, "train/total_loss": 0.32794034481048584 }, { "entropy": 9.08860969543457, "epoch": 0.018390350009887285, "mean_token_accuracy": 0.7233333587646484, "num_tokens": 974538.0, "step": 186, "train/ce_loss": 2.49151611328125 }, { "epoch": 0.018390350009887285, "step": 186, "train/sim_loss": 0.20703125 }, { "epoch": 0.018390350009887285, "step": 186, "train/total_loss": 0.4561828672885895 }, { "entropy": 8.43522834777832, "epoch": 0.01848922285940281, "mean_token_accuracy": 0.692150890827179, "num_tokens": 980012.0, "step": 187, "train/ce_loss": 0.5604060292243958 }, { "epoch": 0.01848922285940281, "step": 187, "train/sim_loss": 0.13671875 }, { "epoch": 0.01848922285940281, "step": 187, "train/total_loss": 0.19275934994220734 }, { "entropy": 8.536243438720703, "epoch": 0.018588095708918333, "mean_token_accuracy": 0.761562168598175, "num_tokens": 985454.0, "step": 188, "train/ce_loss": 0.6313604712486267 }, { "epoch": 0.018588095708918333, "step": 188, "train/sim_loss": 0.10546875 }, { "epoch": 0.018588095708918333, "step": 188, "train/total_loss": 0.1686047911643982 }, { "entropy": 9.47385025024414, "epoch": 0.018686968558433853, "mean_token_accuracy": 0.7377398610115051, "num_tokens": 990496.0, "step": 189, "train/ce_loss": 0.0897272527217865 }, { "epoch": 0.018686968558433853, "step": 189, "train/sim_loss": 0.078125 }, { "epoch": 0.018686968558433853, "step": 189, "train/total_loss": 0.08709772676229477 }, { "entropy": 8.622787475585938, "epoch": 0.018785841407949377, "mean_token_accuracy": 0.7806817889213562, "num_tokens": 995895.0, "step": 190, "train/ce_loss": 0.710089385509491 }, { "epoch": 0.018785841407949377, "step": 190, "train/sim_loss": 0.0859375 }, { "epoch": 0.018785841407949377, "step": 190, "train/total_loss": 0.15694645047187805 }, { "entropy": 8.932117462158203, "epoch": 0.0188847142574649, "mean_token_accuracy": 0.7503876090049744, "num_tokens": 1000931.0, "step": 191, "train/ce_loss": 0.06429041922092438 }, { "epoch": 0.0188847142574649, "step": 191, "train/sim_loss": 0.1171875 }, { "epoch": 0.0188847142574649, "step": 191, "train/total_loss": 0.1236165389418602 }, { "entropy": 9.011377334594727, "epoch": 0.018983587106980424, "mean_token_accuracy": 0.7205674052238464, "num_tokens": 1006037.0, "step": 192, "train/ce_loss": 1.5213985443115234 }, { "epoch": 0.018983587106980424, "step": 192, "train/sim_loss": 0.0859375 }, { "epoch": 0.018983587106980424, "step": 192, "train/total_loss": 0.23807735741138458 }, { "entropy": 8.612946510314941, "epoch": 0.019082459956495944, "mean_token_accuracy": 0.6990496516227722, "num_tokens": 1011448.0, "step": 193, "train/ce_loss": 0.9318941831588745 }, { "epoch": 0.019082459956495944, "step": 193, "train/sim_loss": 0.1640625 }, { "epoch": 0.019082459956495944, "step": 193, "train/total_loss": 0.25725191831588745 }, { "entropy": 8.525861740112305, "epoch": 0.019181332806011468, "mean_token_accuracy": 0.7281553149223328, "num_tokens": 1016745.0, "step": 194, "train/ce_loss": 1.1830980777740479 }, { "epoch": 0.019181332806011468, "step": 194, "train/sim_loss": 0.19140625 }, { "epoch": 0.019181332806011468, "step": 194, "train/total_loss": 0.30971604585647583 }, { "entropy": 9.627721786499023, "epoch": 0.019280205655526992, "mean_token_accuracy": 0.7314410209655762, "num_tokens": 1021631.0, "step": 195, "train/ce_loss": 0.09438954293727875 }, { "epoch": 0.019280205655526992, "step": 195, "train/sim_loss": 0.19140625 }, { "epoch": 0.019280205655526992, "step": 195, "train/total_loss": 0.20084521174430847 }, { "entropy": 8.480622291564941, "epoch": 0.019379078505042516, "mean_token_accuracy": 0.7431694269180298, "num_tokens": 1027045.0, "step": 196, "train/ce_loss": 0.9766530990600586 }, { "epoch": 0.019379078505042516, "step": 196, "train/sim_loss": 0.07421875 }, { "epoch": 0.019379078505042516, "step": 196, "train/total_loss": 0.17188405990600586 }, { "entropy": 9.240240097045898, "epoch": 0.01947795135455804, "mean_token_accuracy": 0.7355371713638306, "num_tokens": 1032065.0, "step": 197, "train/ce_loss": 1.199245810508728 }, { "epoch": 0.01947795135455804, "step": 197, "train/sim_loss": 0.234375 }, { "epoch": 0.01947795135455804, "step": 197, "train/total_loss": 0.3542995750904083 }, { "entropy": 8.914320945739746, "epoch": 0.01957682420407356, "mean_token_accuracy": 0.7111111283302307, "num_tokens": 1037297.0, "step": 198, "train/ce_loss": 1.3729205131530762 }, { "epoch": 0.01957682420407356, "step": 198, "train/sim_loss": 0.21875 }, { "epoch": 0.01957682420407356, "step": 198, "train/total_loss": 0.3560420572757721 }, { "entropy": 8.601119041442871, "epoch": 0.019675697053589083, "mean_token_accuracy": 0.7517588138580322, "num_tokens": 1042771.0, "step": 199, "train/ce_loss": 0.7754992246627808 }, { "epoch": 0.019675697053589083, "step": 199, "train/sim_loss": 0.13671875 }, { "epoch": 0.019675697053589083, "step": 199, "train/total_loss": 0.21426868438720703 }, { "epoch": 0.019774569903104607, "grad_norm": 1.345518946647644, "learning_rate": 9.953271028037384e-06, "loss": 0.2608, "step": 200 }, { "entropy": 9.236515045166016, "epoch": 0.019774569903104607, "mean_token_accuracy": 0.7239263653755188, "num_tokens": 1047876.0, "step": 200, "train/ce_loss": 0.7935214638710022 }, { "epoch": 0.019774569903104607, "step": 200, "train/sim_loss": 0.24609375 }, { "epoch": 0.019774569903104607, "step": 200, "train/total_loss": 0.32544589042663574 }, { "entropy": 8.959417343139648, "epoch": 0.01987344275262013, "mean_token_accuracy": 0.7191011309623718, "num_tokens": 1053155.0, "step": 201, "train/ce_loss": 1.5126712322235107 }, { "epoch": 0.01987344275262013, "step": 201, "train/sim_loss": 0.12109375 }, { "epoch": 0.01987344275262013, "step": 201, "train/total_loss": 0.2723608613014221 }, { "entropy": 8.881634712219238, "epoch": 0.019972315602135655, "mean_token_accuracy": 0.728205144405365, "num_tokens": 1058418.0, "step": 202, "train/ce_loss": 0.977267861366272 }, { "epoch": 0.019972315602135655, "step": 202, "train/sim_loss": 0.16796875 }, { "epoch": 0.019972315602135655, "step": 202, "train/total_loss": 0.2656955420970917 }, { "entropy": 8.64078140258789, "epoch": 0.020071188451651175, "mean_token_accuracy": 0.6912899613380432, "num_tokens": 1063760.0, "step": 203, "train/ce_loss": 0.5148274302482605 }, { "epoch": 0.020071188451651175, "step": 203, "train/sim_loss": 0.0703125 }, { "epoch": 0.020071188451651175, "step": 203, "train/total_loss": 0.12179524451494217 }, { "entropy": 8.804824829101562, "epoch": 0.0201700613011667, "mean_token_accuracy": 0.7189542651176453, "num_tokens": 1069021.0, "step": 204, "train/ce_loss": 0.5810104608535767 }, { "epoch": 0.0201700613011667, "step": 204, "train/sim_loss": 0.13671875 }, { "epoch": 0.0201700613011667, "step": 204, "train/total_loss": 0.19481979310512543 }, { "entropy": 8.664105415344238, "epoch": 0.020268934150682223, "mean_token_accuracy": 0.7547568678855896, "num_tokens": 1074471.0, "step": 205, "train/ce_loss": 0.7599750757217407 }, { "epoch": 0.020268934150682223, "step": 205, "train/sim_loss": 0.1640625 }, { "epoch": 0.020268934150682223, "step": 205, "train/total_loss": 0.2400600016117096 }, { "entropy": 8.950141906738281, "epoch": 0.020367807000197746, "mean_token_accuracy": 0.7085253596305847, "num_tokens": 1079796.0, "step": 206, "train/ce_loss": 1.1631953716278076 }, { "epoch": 0.020367807000197746, "step": 206, "train/sim_loss": 0.19140625 }, { "epoch": 0.020367807000197746, "step": 206, "train/total_loss": 0.30772578716278076 }, { "entropy": 9.293176651000977, "epoch": 0.02046667984971327, "mean_token_accuracy": 0.6983333230018616, "num_tokens": 1084873.0, "step": 207, "train/ce_loss": 1.2679121494293213 }, { "epoch": 0.02046667984971327, "step": 207, "train/sim_loss": 0.125 }, { "epoch": 0.02046667984971327, "step": 207, "train/total_loss": 0.25179123878479004 }, { "entropy": 8.642351150512695, "epoch": 0.02056555269922879, "mean_token_accuracy": 0.6852207183837891, "num_tokens": 1090360.0, "step": 208, "train/ce_loss": 1.1404317617416382 }, { "epoch": 0.02056555269922879, "step": 208, "train/sim_loss": 0.12890625 }, { "epoch": 0.02056555269922879, "step": 208, "train/total_loss": 0.24294942617416382 }, { "entropy": 9.028209686279297, "epoch": 0.020664425548744314, "mean_token_accuracy": 0.7037037014961243, "num_tokens": 1095739.0, "step": 209, "train/ce_loss": 0.8197504281997681 }, { "epoch": 0.020664425548744314, "step": 209, "train/sim_loss": 0.125 }, { "epoch": 0.020664425548744314, "step": 209, "train/total_loss": 0.2069750428199768 }, { "entropy": 9.236760139465332, "epoch": 0.020763298398259838, "mean_token_accuracy": 0.736672043800354, "num_tokens": 1100775.0, "step": 210, "train/ce_loss": 1.0761586427688599 }, { "epoch": 0.020763298398259838, "step": 210, "train/sim_loss": 0.12890625 }, { "epoch": 0.020763298398259838, "step": 210, "train/total_loss": 0.2365221083164215 }, { "entropy": 9.078277587890625, "epoch": 0.02086217124777536, "mean_token_accuracy": 0.7010869383811951, "num_tokens": 1105989.0, "step": 211, "train/ce_loss": 1.85743248462677 }, { "epoch": 0.02086217124777536, "step": 211, "train/sim_loss": 0.15234375 }, { "epoch": 0.02086217124777536, "step": 211, "train/total_loss": 0.3380870223045349 }, { "entropy": 8.900236129760742, "epoch": 0.020961044097290885, "mean_token_accuracy": 0.7445997595787048, "num_tokens": 1111249.0, "step": 212, "train/ce_loss": 0.9222058653831482 }, { "epoch": 0.020961044097290885, "step": 212, "train/sim_loss": 0.1796875 }, { "epoch": 0.020961044097290885, "step": 212, "train/total_loss": 0.27190810441970825 }, { "entropy": 9.74795913696289, "epoch": 0.021059916946806406, "mean_token_accuracy": 0.730512261390686, "num_tokens": 1116279.0, "step": 213, "train/ce_loss": 1.8484524488449097 }, { "epoch": 0.021059916946806406, "step": 213, "train/sim_loss": 0.19140625 }, { "epoch": 0.021059916946806406, "step": 213, "train/total_loss": 0.3762515187263489 }, { "entropy": 9.32369613647461, "epoch": 0.02115878979632193, "mean_token_accuracy": 0.7382352948188782, "num_tokens": 1121425.0, "step": 214, "train/ce_loss": 1.1132593154907227 }, { "epoch": 0.02115878979632193, "step": 214, "train/sim_loss": 0.13671875 }, { "epoch": 0.02115878979632193, "step": 214, "train/total_loss": 0.2480446845293045 }, { "entropy": 9.323495864868164, "epoch": 0.021257662645837453, "mean_token_accuracy": 0.7352024912834167, "num_tokens": 1126563.0, "step": 215, "train/ce_loss": 0.9907536506652832 }, { "epoch": 0.021257662645837453, "step": 215, "train/sim_loss": 0.1875 }, { "epoch": 0.021257662645837453, "step": 215, "train/total_loss": 0.2865753769874573 }, { "entropy": 8.703104972839355, "epoch": 0.021356535495352977, "mean_token_accuracy": 0.7093185186386108, "num_tokens": 1131731.0, "step": 216, "train/ce_loss": 1.185186743736267 }, { "epoch": 0.021356535495352977, "step": 216, "train/sim_loss": 0.21875 }, { "epoch": 0.021356535495352977, "step": 216, "train/total_loss": 0.3372686803340912 }, { "entropy": 8.79998779296875, "epoch": 0.0214554083448685, "mean_token_accuracy": 0.7827004194259644, "num_tokens": 1137169.0, "step": 217, "train/ce_loss": 0.562030553817749 }, { "epoch": 0.0214554083448685, "step": 217, "train/sim_loss": 0.1328125 }, { "epoch": 0.0214554083448685, "step": 217, "train/total_loss": 0.18901555240154266 }, { "entropy": 9.627821922302246, "epoch": 0.02155428119438402, "mean_token_accuracy": 0.7794677019119263, "num_tokens": 1142098.0, "step": 218, "train/ce_loss": 1.4625898599624634 }, { "epoch": 0.02155428119438402, "step": 218, "train/sim_loss": 0.0703125 }, { "epoch": 0.02155428119438402, "step": 218, "train/total_loss": 0.21657149493694305 }, { "entropy": 9.095985412597656, "epoch": 0.021653154043899545, "mean_token_accuracy": 0.6560587286949158, "num_tokens": 1147354.0, "step": 219, "train/ce_loss": 1.4578096866607666 }, { "epoch": 0.021653154043899545, "step": 219, "train/sim_loss": 0.1953125 }, { "epoch": 0.021653154043899545, "step": 219, "train/total_loss": 0.3410934805870056 }, { "epoch": 0.02175202689341507, "grad_norm": 1.3434762954711914, "learning_rate": 9.948326163279435e-06, "loss": 0.2473, "step": 220 }, { "entropy": 9.08565616607666, "epoch": 0.02175202689341507, "mean_token_accuracy": 0.7113022208213806, "num_tokens": 1152640.0, "step": 220, "train/ce_loss": 0.6921712160110474 }, { "epoch": 0.02175202689341507, "step": 220, "train/sim_loss": 0.14453125 }, { "epoch": 0.02175202689341507, "step": 220, "train/total_loss": 0.21374836564064026 }, { "entropy": 10.296051979064941, "epoch": 0.021850899742930592, "mean_token_accuracy": 0.7894737124443054, "num_tokens": 1157164.0, "step": 221, "train/ce_loss": 2.8208775520324707 }, { "epoch": 0.021850899742930592, "step": 221, "train/sim_loss": 0.1484375 }, { "epoch": 0.021850899742930592, "step": 221, "train/total_loss": 0.4305252730846405 }, { "entropy": 9.4681396484375, "epoch": 0.021949772592446116, "mean_token_accuracy": 0.6847290396690369, "num_tokens": 1162232.0, "step": 222, "train/ce_loss": 1.3300937414169312 }, { "epoch": 0.021949772592446116, "step": 222, "train/sim_loss": 0.125 }, { "epoch": 0.021949772592446116, "step": 222, "train/total_loss": 0.2580093741416931 }, { "entropy": 8.884187698364258, "epoch": 0.022048645441961636, "mean_token_accuracy": 0.7592137455940247, "num_tokens": 1167498.0, "step": 223, "train/ce_loss": 0.6539856791496277 }, { "epoch": 0.022048645441961636, "step": 223, "train/sim_loss": 0.15625 }, { "epoch": 0.022048645441961636, "step": 223, "train/total_loss": 0.22164857387542725 }, { "entropy": 9.357826232910156, "epoch": 0.02214751829147716, "mean_token_accuracy": 0.7581395506858826, "num_tokens": 1172577.0, "step": 224, "train/ce_loss": 0.06450015306472778 }, { "epoch": 0.02214751829147716, "step": 224, "train/sim_loss": 0.11328125 }, { "epoch": 0.02214751829147716, "step": 224, "train/total_loss": 0.11973126232624054 }, { "entropy": 8.823542594909668, "epoch": 0.022246391140992684, "mean_token_accuracy": 0.7916666865348816, "num_tokens": 1177916.0, "step": 225, "train/ce_loss": 0.6573609113693237 }, { "epoch": 0.022246391140992684, "step": 225, "train/sim_loss": 0.1015625 }, { "epoch": 0.022246391140992684, "step": 225, "train/total_loss": 0.1672985851764679 }, { "entropy": 9.321389198303223, "epoch": 0.022345263990508207, "mean_token_accuracy": 0.7525179982185364, "num_tokens": 1183041.0, "step": 226, "train/ce_loss": 0.8095218539237976 }, { "epoch": 0.022345263990508207, "step": 226, "train/sim_loss": 0.109375 }, { "epoch": 0.022345263990508207, "step": 226, "train/total_loss": 0.19032719731330872 }, { "entropy": 8.588724136352539, "epoch": 0.02244413684002373, "mean_token_accuracy": 0.7323809266090393, "num_tokens": 1188541.0, "step": 227, "train/ce_loss": 1.3050811290740967 }, { "epoch": 0.02244413684002373, "step": 227, "train/sim_loss": 0.15234375 }, { "epoch": 0.02244413684002373, "step": 227, "train/total_loss": 0.2828518748283386 }, { "entropy": 9.121466636657715, "epoch": 0.02254300968953925, "mean_token_accuracy": 0.7220843434333801, "num_tokens": 1193847.0, "step": 228, "train/ce_loss": 0.7918413281440735 }, { "epoch": 0.02254300968953925, "step": 228, "train/sim_loss": 0.17578125 }, { "epoch": 0.02254300968953925, "step": 228, "train/total_loss": 0.2549653947353363 }, { "entropy": 9.202960968017578, "epoch": 0.022641882539054775, "mean_token_accuracy": 0.7212918400764465, "num_tokens": 1199108.0, "step": 229, "train/ce_loss": 0.5830985307693481 }, { "epoch": 0.022641882539054775, "step": 229, "train/sim_loss": 0.17578125 }, { "epoch": 0.022641882539054775, "step": 229, "train/total_loss": 0.23409110307693481 }, { "entropy": 9.505131721496582, "epoch": 0.0227407553885703, "mean_token_accuracy": 0.6988266110420227, "num_tokens": 1204451.0, "step": 230, "train/ce_loss": 1.8573004007339478 }, { "epoch": 0.0227407553885703, "step": 230, "train/sim_loss": 0.21484375 }, { "epoch": 0.0227407553885703, "step": 230, "train/total_loss": 0.4005737900733948 }, { "entropy": 8.627418518066406, "epoch": 0.022839628238085823, "mean_token_accuracy": 0.7078787684440613, "num_tokens": 1209740.0, "step": 231, "train/ce_loss": 1.0552197694778442 }, { "epoch": 0.022839628238085823, "step": 231, "train/sim_loss": 0.19921875 }, { "epoch": 0.022839628238085823, "step": 231, "train/total_loss": 0.3047407269477844 }, { "entropy": 9.916093826293945, "epoch": 0.022938501087601346, "mean_token_accuracy": 0.6979695558547974, "num_tokens": 1214531.0, "step": 232, "train/ce_loss": 3.3532495498657227 }, { "epoch": 0.022938501087601346, "step": 232, "train/sim_loss": 0.16015625 }, { "epoch": 0.022938501087601346, "step": 232, "train/total_loss": 0.4954812228679657 }, { "entropy": 8.920723915100098, "epoch": 0.023037373937116867, "mean_token_accuracy": 0.7028985619544983, "num_tokens": 1219879.0, "step": 233, "train/ce_loss": 1.1744595766067505 }, { "epoch": 0.023037373937116867, "step": 233, "train/sim_loss": 0.1953125 }, { "epoch": 0.023037373937116867, "step": 233, "train/total_loss": 0.3127584457397461 }, { "entropy": 8.755692481994629, "epoch": 0.02313624678663239, "mean_token_accuracy": 0.7393509149551392, "num_tokens": 1225367.0, "step": 234, "train/ce_loss": 0.7512011528015137 }, { "epoch": 0.02313624678663239, "step": 234, "train/sim_loss": 0.078125 }, { "epoch": 0.02313624678663239, "step": 234, "train/total_loss": 0.15324512124061584 }, { "entropy": 9.171567916870117, "epoch": 0.023235119636147914, "mean_token_accuracy": 0.7664429545402527, "num_tokens": 1230564.0, "step": 235, "train/ce_loss": 1.2915208339691162 }, { "epoch": 0.023235119636147914, "step": 235, "train/sim_loss": 0.11328125 }, { "epoch": 0.023235119636147914, "step": 235, "train/total_loss": 0.2424333393573761 }, { "entropy": 9.673821449279785, "epoch": 0.023333992485663438, "mean_token_accuracy": 0.7728055119514465, "num_tokens": 1235536.0, "step": 236, "train/ce_loss": 1.4527373313903809 }, { "epoch": 0.023333992485663438, "step": 236, "train/sim_loss": 0.07421875 }, { "epoch": 0.023333992485663438, "step": 236, "train/total_loss": 0.21949248015880585 }, { "entropy": 9.116235733032227, "epoch": 0.023432865335178958, "mean_token_accuracy": 0.7522816061973572, "num_tokens": 1240792.0, "step": 237, "train/ce_loss": 0.7418687343597412 }, { "epoch": 0.023432865335178958, "step": 237, "train/sim_loss": 0.125 }, { "epoch": 0.023432865335178958, "step": 237, "train/total_loss": 0.19918687641620636 }, { "entropy": 9.20254898071289, "epoch": 0.023531738184694482, "mean_token_accuracy": 0.7182235717773438, "num_tokens": 1245972.0, "step": 238, "train/ce_loss": 1.3997796773910522 }, { "epoch": 0.023531738184694482, "step": 238, "train/sim_loss": 0.15625 }, { "epoch": 0.023531738184694482, "step": 238, "train/total_loss": 0.29622799158096313 }, { "entropy": 9.339031219482422, "epoch": 0.023630611034210006, "mean_token_accuracy": 0.7014741897583008, "num_tokens": 1251225.0, "step": 239, "train/ce_loss": 1.2503479719161987 }, { "epoch": 0.023630611034210006, "step": 239, "train/sim_loss": 0.125 }, { "epoch": 0.023630611034210006, "step": 239, "train/total_loss": 0.25003480911254883 }, { "epoch": 0.02372948388372553, "grad_norm": 1.3856909275054932, "learning_rate": 9.943381298521487e-06, "loss": 0.2277, "step": 240 }, { "entropy": 9.820657730102539, "epoch": 0.02372948388372553, "mean_token_accuracy": 0.7542856931686401, "num_tokens": 1256159.0, "step": 240, "train/ce_loss": 0.5057271718978882 }, { "epoch": 0.02372948388372553, "step": 240, "train/sim_loss": 0.109375 }, { "epoch": 0.02372948388372553, "step": 240, "train/total_loss": 0.1599477231502533 }, { "entropy": 9.042003631591797, "epoch": 0.023828356733241053, "mean_token_accuracy": 0.6929824352264404, "num_tokens": 1261426.0, "step": 241, "train/ce_loss": 0.5546204447746277 }, { "epoch": 0.023828356733241053, "step": 241, "train/sim_loss": 0.1171875 }, { "epoch": 0.023828356733241053, "step": 241, "train/total_loss": 0.172649547457695 }, { "entropy": 9.184640884399414, "epoch": 0.023927229582756573, "mean_token_accuracy": 0.7013630867004395, "num_tokens": 1266638.0, "step": 242, "train/ce_loss": 1.0061686038970947 }, { "epoch": 0.023927229582756573, "step": 242, "train/sim_loss": 0.24609375 }, { "epoch": 0.023927229582756573, "step": 242, "train/total_loss": 0.3467106223106384 }, { "entropy": 8.989482879638672, "epoch": 0.024026102432272097, "mean_token_accuracy": 0.6966426968574524, "num_tokens": 1271922.0, "step": 243, "train/ce_loss": 1.0107966661453247 }, { "epoch": 0.024026102432272097, "step": 243, "train/sim_loss": 0.1484375 }, { "epoch": 0.024026102432272097, "step": 243, "train/total_loss": 0.24951717257499695 }, { "entropy": 8.77995491027832, "epoch": 0.02412497528178762, "mean_token_accuracy": 0.7386723160743713, "num_tokens": 1277360.0, "step": 244, "train/ce_loss": 1.490092158317566 }, { "epoch": 0.02412497528178762, "step": 244, "train/sim_loss": 0.171875 }, { "epoch": 0.02412497528178762, "step": 244, "train/total_loss": 0.32088422775268555 }, { "entropy": 9.096035957336426, "epoch": 0.024223848131303145, "mean_token_accuracy": 0.703529417514801, "num_tokens": 1282714.0, "step": 245, "train/ce_loss": 1.1417651176452637 }, { "epoch": 0.024223848131303145, "step": 245, "train/sim_loss": 0.16015625 }, { "epoch": 0.024223848131303145, "step": 245, "train/total_loss": 0.27433276176452637 }, { "entropy": 9.295907974243164, "epoch": 0.02432272098081867, "mean_token_accuracy": 0.7151702642440796, "num_tokens": 1287806.0, "step": 246, "train/ce_loss": 1.053331732749939 }, { "epoch": 0.02432272098081867, "step": 246, "train/sim_loss": 0.15234375 }, { "epoch": 0.02432272098081867, "step": 246, "train/total_loss": 0.2576769292354584 }, { "entropy": 8.644203186035156, "epoch": 0.02442159383033419, "mean_token_accuracy": 0.7180910110473633, "num_tokens": 1293268.0, "step": 247, "train/ce_loss": 1.44148588180542 }, { "epoch": 0.02442159383033419, "step": 247, "train/sim_loss": 0.171875 }, { "epoch": 0.02442159383033419, "step": 247, "train/total_loss": 0.316023588180542 }, { "entropy": 9.263618469238281, "epoch": 0.024520466679849712, "mean_token_accuracy": 0.7442159652709961, "num_tokens": 1298474.0, "step": 248, "train/ce_loss": 0.9490503072738647 }, { "epoch": 0.024520466679849712, "step": 248, "train/sim_loss": 0.1640625 }, { "epoch": 0.024520466679849712, "step": 248, "train/total_loss": 0.2589675188064575 }, { "entropy": 9.133621215820312, "epoch": 0.024619339529365236, "mean_token_accuracy": 0.6857825517654419, "num_tokens": 1303710.0, "step": 249, "train/ce_loss": 0.9590094685554504 }, { "epoch": 0.024619339529365236, "step": 249, "train/sim_loss": 0.1484375 }, { "epoch": 0.024619339529365236, "step": 249, "train/total_loss": 0.24433845281600952 }, { "entropy": 9.832183837890625, "epoch": 0.02471821237888076, "mean_token_accuracy": 0.7421875, "num_tokens": 1308639.0, "step": 250, "train/ce_loss": 1.9395623207092285 }, { "epoch": 0.02471821237888076, "step": 250, "train/sim_loss": 0.14453125 }, { "epoch": 0.02471821237888076, "step": 250, "train/total_loss": 0.33848750591278076 }, { "entropy": 9.327045440673828, "epoch": 0.024817085228396284, "mean_token_accuracy": 0.7238605618476868, "num_tokens": 1313843.0, "step": 251, "train/ce_loss": 1.5444014072418213 }, { "epoch": 0.024817085228396284, "step": 251, "train/sim_loss": 0.12109375 }, { "epoch": 0.024817085228396284, "step": 251, "train/total_loss": 0.27553391456604004 }, { "entropy": 8.991327285766602, "epoch": 0.024915958077911804, "mean_token_accuracy": 0.7205542922019958, "num_tokens": 1319197.0, "step": 252, "train/ce_loss": 1.2354319095611572 }, { "epoch": 0.024915958077911804, "step": 252, "train/sim_loss": 0.1640625 }, { "epoch": 0.024915958077911804, "step": 252, "train/total_loss": 0.2876057028770447 }, { "entropy": 8.753467559814453, "epoch": 0.025014830927427328, "mean_token_accuracy": 0.7426470518112183, "num_tokens": 1324679.0, "step": 253, "train/ce_loss": 0.651845395565033 }, { "epoch": 0.025014830927427328, "step": 253, "train/sim_loss": 0.16796875 }, { "epoch": 0.025014830927427328, "step": 253, "train/total_loss": 0.23315328359603882 }, { "entropy": 9.486014366149902, "epoch": 0.02511370377694285, "mean_token_accuracy": 0.6701337099075317, "num_tokens": 1329835.0, "step": 254, "train/ce_loss": 1.8440042734146118 }, { "epoch": 0.02511370377694285, "step": 254, "train/sim_loss": 0.1484375 }, { "epoch": 0.02511370377694285, "step": 254, "train/total_loss": 0.33283793926239014 }, { "entropy": 9.138690948486328, "epoch": 0.025212576626458375, "mean_token_accuracy": 0.7478787899017334, "num_tokens": 1335118.0, "step": 255, "train/ce_loss": 0.7411956787109375 }, { "epoch": 0.025212576626458375, "step": 255, "train/sim_loss": 0.1328125 }, { "epoch": 0.025212576626458375, "step": 255, "train/total_loss": 0.20693206787109375 }, { "entropy": 9.330408096313477, "epoch": 0.0253114494759739, "mean_token_accuracy": 0.7630208134651184, "num_tokens": 1340380.0, "step": 256, "train/ce_loss": 0.6438601613044739 }, { "epoch": 0.0253114494759739, "step": 256, "train/sim_loss": 0.12109375 }, { "epoch": 0.0253114494759739, "step": 256, "train/total_loss": 0.1854797601699829 }, { "entropy": 9.453132629394531, "epoch": 0.02541032232548942, "mean_token_accuracy": 0.6998368501663208, "num_tokens": 1345474.0, "step": 257, "train/ce_loss": 0.07052500545978546 }, { "epoch": 0.02541032232548942, "step": 257, "train/sim_loss": 0.109375 }, { "epoch": 0.02541032232548942, "step": 257, "train/total_loss": 0.11642750352621078 }, { "entropy": 8.803450584411621, "epoch": 0.025509195175004943, "mean_token_accuracy": 0.697926938533783, "num_tokens": 1350942.0, "step": 258, "train/ce_loss": 0.8056007027626038 }, { "epoch": 0.025509195175004943, "step": 258, "train/sim_loss": 0.2421875 }, { "epoch": 0.025509195175004943, "step": 258, "train/total_loss": 0.3227475881576538 }, { "entropy": 8.871196746826172, "epoch": 0.025608068024520467, "mean_token_accuracy": 0.7041800618171692, "num_tokens": 1356393.0, "step": 259, "train/ce_loss": 0.6629928350448608 }, { "epoch": 0.025608068024520467, "step": 259, "train/sim_loss": 0.1484375 }, { "epoch": 0.025608068024520467, "step": 259, "train/total_loss": 0.21473678946495056 }, { "epoch": 0.02570694087403599, "grad_norm": 1.321649193763733, "learning_rate": 9.938436433763537e-06, "loss": 0.2425, "step": 260 }, { "entropy": 9.480998992919922, "epoch": 0.02570694087403599, "mean_token_accuracy": 0.7395994067192078, "num_tokens": 1361529.0, "step": 260, "train/ce_loss": 1.1267880201339722 }, { "epoch": 0.02570694087403599, "step": 260, "train/sim_loss": 0.14453125 }, { "epoch": 0.02570694087403599, "step": 260, "train/total_loss": 0.25721004605293274 }, { "entropy": 9.206134796142578, "epoch": 0.025805813723551514, "mean_token_accuracy": 0.7177985906600952, "num_tokens": 1366857.0, "step": 261, "train/ce_loss": 0.786657989025116 }, { "epoch": 0.025805813723551514, "step": 261, "train/sim_loss": 0.109375 }, { "epoch": 0.025805813723551514, "step": 261, "train/total_loss": 0.18804079294204712 }, { "entropy": 8.90103816986084, "epoch": 0.025904686573067034, "mean_token_accuracy": 0.6851851940155029, "num_tokens": 1372246.0, "step": 262, "train/ce_loss": 1.7658196687698364 }, { "epoch": 0.025904686573067034, "step": 262, "train/sim_loss": 0.1875 }, { "epoch": 0.025904686573067034, "step": 262, "train/total_loss": 0.3640819787979126 }, { "entropy": 9.348938941955566, "epoch": 0.026003559422582558, "mean_token_accuracy": 0.6954612135887146, "num_tokens": 1377399.0, "step": 263, "train/ce_loss": 1.1223899126052856 }, { "epoch": 0.026003559422582558, "step": 263, "train/sim_loss": 0.140625 }, { "epoch": 0.026003559422582558, "step": 263, "train/total_loss": 0.2528640031814575 }, { "entropy": 9.85448169708252, "epoch": 0.026102432272098082, "mean_token_accuracy": 0.6844106316566467, "num_tokens": 1382339.0, "step": 264, "train/ce_loss": 2.0575814247131348 }, { "epoch": 0.026102432272098082, "step": 264, "train/sim_loss": 0.21875 }, { "epoch": 0.026102432272098082, "step": 264, "train/total_loss": 0.42450815439224243 }, { "entropy": 10.01374340057373, "epoch": 0.026201305121613606, "mean_token_accuracy": 0.7354369163513184, "num_tokens": 1387182.0, "step": 265, "train/ce_loss": 2.1004700660705566 }, { "epoch": 0.026201305121613606, "step": 265, "train/sim_loss": 0.1640625 }, { "epoch": 0.026201305121613606, "step": 265, "train/total_loss": 0.37410950660705566 }, { "entropy": 9.003646850585938, "epoch": 0.02630017797112913, "mean_token_accuracy": 0.6959183812141418, "num_tokens": 1392641.0, "step": 266, "train/ce_loss": 0.881372332572937 }, { "epoch": 0.02630017797112913, "step": 266, "train/sim_loss": 0.21875 }, { "epoch": 0.02630017797112913, "step": 266, "train/total_loss": 0.3068872392177582 }, { "entropy": 9.18246841430664, "epoch": 0.02639905082064465, "mean_token_accuracy": 0.7476525902748108, "num_tokens": 1397972.0, "step": 267, "train/ce_loss": 0.6705335378646851 }, { "epoch": 0.02639905082064465, "step": 267, "train/sim_loss": 0.16796875 }, { "epoch": 0.02639905082064465, "step": 267, "train/total_loss": 0.23502209782600403 }, { "entropy": 8.929038047790527, "epoch": 0.026497923670160173, "mean_token_accuracy": 0.7204058766365051, "num_tokens": 1403304.0, "step": 268, "train/ce_loss": 0.5589403510093689 }, { "epoch": 0.026497923670160173, "step": 268, "train/sim_loss": 0.14453125 }, { "epoch": 0.026497923670160173, "step": 268, "train/total_loss": 0.20042528212070465 }, { "entropy": 9.119290351867676, "epoch": 0.026596796519675697, "mean_token_accuracy": 0.7673377990722656, "num_tokens": 1408660.0, "step": 269, "train/ce_loss": 0.984940767288208 }, { "epoch": 0.026596796519675697, "step": 269, "train/sim_loss": 0.12890625 }, { "epoch": 0.026596796519675697, "step": 269, "train/total_loss": 0.22740033268928528 }, { "entropy": 9.511564254760742, "epoch": 0.02669566936919122, "mean_token_accuracy": 0.6966966986656189, "num_tokens": 1413763.0, "step": 270, "train/ce_loss": 1.857006311416626 }, { "epoch": 0.02669566936919122, "step": 270, "train/sim_loss": 0.140625 }, { "epoch": 0.02669566936919122, "step": 270, "train/total_loss": 0.3263256549835205 }, { "entropy": 8.517393112182617, "epoch": 0.026794542218706745, "mean_token_accuracy": 0.7054985761642456, "num_tokens": 1419345.0, "step": 271, "train/ce_loss": 1.1406935453414917 }, { "epoch": 0.026794542218706745, "step": 271, "train/sim_loss": 0.15234375 }, { "epoch": 0.026794542218706745, "step": 271, "train/total_loss": 0.2664130926132202 }, { "entropy": 8.833395004272461, "epoch": 0.026893415068222265, "mean_token_accuracy": 0.7211934328079224, "num_tokens": 1424814.0, "step": 272, "train/ce_loss": 1.1049021482467651 }, { "epoch": 0.026893415068222265, "step": 272, "train/sim_loss": 0.1484375 }, { "epoch": 0.026893415068222265, "step": 272, "train/total_loss": 0.25892770290374756 }, { "entropy": 9.114679336547852, "epoch": 0.02699228791773779, "mean_token_accuracy": 0.7144607901573181, "num_tokens": 1430064.0, "step": 273, "train/ce_loss": 0.4850656986236572 }, { "epoch": 0.02699228791773779, "step": 273, "train/sim_loss": 0.09375 }, { "epoch": 0.02699228791773779, "step": 273, "train/total_loss": 0.14225657284259796 }, { "entropy": 9.55802059173584, "epoch": 0.027091160767253313, "mean_token_accuracy": 0.7269790172576904, "num_tokens": 1435123.0, "step": 274, "train/ce_loss": 1.139512062072754 }, { "epoch": 0.027091160767253313, "step": 274, "train/sim_loss": 0.19140625 }, { "epoch": 0.027091160767253313, "step": 274, "train/total_loss": 0.3053574562072754 }, { "entropy": 8.908613204956055, "epoch": 0.027190033616768836, "mean_token_accuracy": 0.6693735718727112, "num_tokens": 1440477.0, "step": 275, "train/ce_loss": 1.4616297483444214 }, { "epoch": 0.027190033616768836, "step": 275, "train/sim_loss": 0.16015625 }, { "epoch": 0.027190033616768836, "step": 275, "train/total_loss": 0.3063192367553711 }, { "entropy": 9.507146835327148, "epoch": 0.02728890646628436, "mean_token_accuracy": 0.7745571732521057, "num_tokens": 1445599.0, "step": 276, "train/ce_loss": 0.06952886283397675 }, { "epoch": 0.02728890646628436, "step": 276, "train/sim_loss": 0.1171875 }, { "epoch": 0.02728890646628436, "step": 276, "train/total_loss": 0.12414038926362991 }, { "entropy": 10.23452377319336, "epoch": 0.02738777931579988, "mean_token_accuracy": 0.7180156707763672, "num_tokens": 1450376.0, "step": 277, "train/ce_loss": 1.0686619281768799 }, { "epoch": 0.02738777931579988, "step": 277, "train/sim_loss": 0.1015625 }, { "epoch": 0.02738777931579988, "step": 277, "train/total_loss": 0.20842869579792023 }, { "entropy": 8.921943664550781, "epoch": 0.027486652165315404, "mean_token_accuracy": 0.7830578684806824, "num_tokens": 1455813.0, "step": 278, "train/ce_loss": 0.6289975047111511 }, { "epoch": 0.027486652165315404, "step": 278, "train/sim_loss": 0.09375 }, { "epoch": 0.027486652165315404, "step": 278, "train/total_loss": 0.15664975345134735 }, { "entropy": 9.548613548278809, "epoch": 0.027585525014830928, "mean_token_accuracy": 0.6795827150344849, "num_tokens": 1460924.0, "step": 279, "train/ce_loss": 1.3225817680358887 }, { "epoch": 0.027585525014830928, "step": 279, "train/sim_loss": 0.1484375 }, { "epoch": 0.027585525014830928, "step": 279, "train/total_loss": 0.28069567680358887 }, { "epoch": 0.02768439786434645, "grad_norm": 2.2994251251220703, "learning_rate": 9.93349156900559e-06, "loss": 0.2291, "step": 280 }, { "entropy": 8.961726188659668, "epoch": 0.02768439786434645, "mean_token_accuracy": 0.7474972009658813, "num_tokens": 1466310.0, "step": 280, "train/ce_loss": 1.1064954996109009 }, { "epoch": 0.02768439786434645, "step": 280, "train/sim_loss": 0.13671875 }, { "epoch": 0.02768439786434645, "step": 280, "train/total_loss": 0.24736830592155457 }, { "entropy": 9.037384033203125, "epoch": 0.027783270713861972, "mean_token_accuracy": 0.7144444584846497, "num_tokens": 1471685.0, "step": 281, "train/ce_loss": 0.73092120885849 }, { "epoch": 0.027783270713861972, "step": 281, "train/sim_loss": 0.125 }, { "epoch": 0.027783270713861972, "step": 281, "train/total_loss": 0.19809213280677795 }, { "entropy": 9.636255264282227, "epoch": 0.027882143563377496, "mean_token_accuracy": 0.7265238761901855, "num_tokens": 1476760.0, "step": 282, "train/ce_loss": 1.2324522733688354 }, { "epoch": 0.027882143563377496, "step": 282, "train/sim_loss": 0.12890625 }, { "epoch": 0.027882143563377496, "step": 282, "train/total_loss": 0.2521514892578125 }, { "entropy": 9.781538009643555, "epoch": 0.02798101641289302, "mean_token_accuracy": 0.7630161643028259, "num_tokens": 1481688.0, "step": 283, "train/ce_loss": 0.08090229332447052 }, { "epoch": 0.02798101641289302, "step": 283, "train/sim_loss": 0.10546875 }, { "epoch": 0.02798101641289302, "step": 283, "train/total_loss": 0.11355897784233093 }, { "entropy": 9.071699142456055, "epoch": 0.028079889262408543, "mean_token_accuracy": 0.7766439914703369, "num_tokens": 1487047.0, "step": 284, "train/ce_loss": 0.8658326864242554 }, { "epoch": 0.028079889262408543, "step": 284, "train/sim_loss": 0.1640625 }, { "epoch": 0.028079889262408543, "step": 284, "train/total_loss": 0.2506457567214966 }, { "entropy": 9.438531875610352, "epoch": 0.028178762111924067, "mean_token_accuracy": 0.7694753408432007, "num_tokens": 1492121.0, "step": 285, "train/ce_loss": 1.3623826503753662 }, { "epoch": 0.028178762111924067, "step": 285, "train/sim_loss": 0.1015625 }, { "epoch": 0.028178762111924067, "step": 285, "train/total_loss": 0.23780076205730438 }, { "entropy": 9.435054779052734, "epoch": 0.028277634961439587, "mean_token_accuracy": 0.6922094225883484, "num_tokens": 1497370.0, "step": 286, "train/ce_loss": 1.057147741317749 }, { "epoch": 0.028277634961439587, "step": 286, "train/sim_loss": 0.13671875 }, { "epoch": 0.028277634961439587, "step": 286, "train/total_loss": 0.24243351817131042 }, { "entropy": 9.316326141357422, "epoch": 0.02837650781095511, "mean_token_accuracy": 0.7567954063415527, "num_tokens": 1502553.0, "step": 287, "train/ce_loss": 1.2863101959228516 }, { "epoch": 0.02837650781095511, "step": 287, "train/sim_loss": 0.1875 }, { "epoch": 0.02837650781095511, "step": 287, "train/total_loss": 0.31613102555274963 }, { "entropy": 10.227319717407227, "epoch": 0.028475380660470635, "mean_token_accuracy": 0.64402174949646, "num_tokens": 1507305.0, "step": 288, "train/ce_loss": 0.12142963707447052 }, { "epoch": 0.028475380660470635, "step": 288, "train/sim_loss": 0.078125 }, { "epoch": 0.028475380660470635, "step": 288, "train/total_loss": 0.09026796370744705 }, { "entropy": 9.190692901611328, "epoch": 0.02857425350998616, "mean_token_accuracy": 0.7074910998344421, "num_tokens": 1512615.0, "step": 289, "train/ce_loss": 1.271957278251648 }, { "epoch": 0.02857425350998616, "step": 289, "train/sim_loss": 0.16796875 }, { "epoch": 0.02857425350998616, "step": 289, "train/total_loss": 0.29516446590423584 }, { "entropy": 9.307730674743652, "epoch": 0.028673126359501682, "mean_token_accuracy": 0.7369077205657959, "num_tokens": 1517908.0, "step": 290, "train/ce_loss": 0.9792628288269043 }, { "epoch": 0.028673126359501682, "step": 290, "train/sim_loss": 0.0859375 }, { "epoch": 0.028673126359501682, "step": 290, "train/total_loss": 0.1838637888431549 }, { "entropy": 9.437780380249023, "epoch": 0.028771999209017202, "mean_token_accuracy": 0.7896341681480408, "num_tokens": 1523017.0, "step": 291, "train/ce_loss": 0.0677185133099556 }, { "epoch": 0.028771999209017202, "step": 291, "train/sim_loss": 0.046875 }, { "epoch": 0.028771999209017202, "step": 291, "train/total_loss": 0.05364685133099556 }, { "entropy": 9.36805534362793, "epoch": 0.028870872058532726, "mean_token_accuracy": 0.7188329100608826, "num_tokens": 1528212.0, "step": 292, "train/ce_loss": 1.1883916854858398 }, { "epoch": 0.028870872058532726, "step": 292, "train/sim_loss": 0.171875 }, { "epoch": 0.028870872058532726, "step": 292, "train/total_loss": 0.29071417450904846 }, { "entropy": 9.275108337402344, "epoch": 0.02896974490804825, "mean_token_accuracy": 0.7531328201293945, "num_tokens": 1533509.0, "step": 293, "train/ce_loss": 0.7196071743965149 }, { "epoch": 0.02896974490804825, "step": 293, "train/sim_loss": 0.16796875 }, { "epoch": 0.02896974490804825, "step": 293, "train/total_loss": 0.2399294674396515 }, { "entropy": 9.360564231872559, "epoch": 0.029068617757563774, "mean_token_accuracy": 0.7331606149673462, "num_tokens": 1538701.0, "step": 294, "train/ce_loss": 1.553326964378357 }, { "epoch": 0.029068617757563774, "step": 294, "train/sim_loss": 0.1015625 }, { "epoch": 0.029068617757563774, "step": 294, "train/total_loss": 0.25689518451690674 }, { "entropy": 9.396177291870117, "epoch": 0.029167490607079297, "mean_token_accuracy": 0.7080581188201904, "num_tokens": 1543900.0, "step": 295, "train/ce_loss": 1.2857218980789185 }, { "epoch": 0.029167490607079297, "step": 295, "train/sim_loss": 0.109375 }, { "epoch": 0.029167490607079297, "step": 295, "train/total_loss": 0.23794719576835632 }, { "entropy": 9.138364791870117, "epoch": 0.029266363456594818, "mean_token_accuracy": 0.7395397424697876, "num_tokens": 1549280.0, "step": 296, "train/ce_loss": 0.8193778991699219 }, { "epoch": 0.029266363456594818, "step": 296, "train/sim_loss": 0.10546875 }, { "epoch": 0.029266363456594818, "step": 296, "train/total_loss": 0.1874065399169922 }, { "entropy": 9.80689811706543, "epoch": 0.02936523630611034, "mean_token_accuracy": 0.7438016533851624, "num_tokens": 1554295.0, "step": 297, "train/ce_loss": 0.5103650689125061 }, { "epoch": 0.02936523630611034, "step": 297, "train/sim_loss": 0.08203125 }, { "epoch": 0.02936523630611034, "step": 297, "train/total_loss": 0.1330677568912506 }, { "entropy": 10.591114044189453, "epoch": 0.029464109155625865, "mean_token_accuracy": 0.6646341681480408, "num_tokens": 1558872.0, "step": 298, "train/ce_loss": 0.27430862188339233 }, { "epoch": 0.029464109155625865, "step": 298, "train/sim_loss": 0.140625 }, { "epoch": 0.029464109155625865, "step": 298, "train/total_loss": 0.16805586218833923 }, { "entropy": 9.821271896362305, "epoch": 0.02956298200514139, "mean_token_accuracy": 0.7472324967384338, "num_tokens": 1563847.0, "step": 299, "train/ce_loss": 1.1159749031066895 }, { "epoch": 0.02956298200514139, "step": 299, "train/sim_loss": 0.15234375 }, { "epoch": 0.02956298200514139, "step": 299, "train/total_loss": 0.26394122838974 }, { "epoch": 0.029661854854656913, "grad_norm": 1.9446110725402832, "learning_rate": 9.928546704247638e-06, "loss": 0.2196, "step": 300 }, { "entropy": 9.165465354919434, "epoch": 0.029661854854656913, "mean_token_accuracy": 0.7384615540504456, "num_tokens": 1569256.0, "step": 300, "train/ce_loss": 0.6590936779975891 }, { "epoch": 0.029661854854656913, "step": 300, "train/sim_loss": 0.15234375 }, { "epoch": 0.029661854854656913, "step": 300, "train/total_loss": 0.21825312077999115 }, { "entropy": 9.37040901184082, "epoch": 0.029760727704172433, "mean_token_accuracy": 0.6537467837333679, "num_tokens": 1574467.0, "step": 301, "train/ce_loss": 0.953637421131134 }, { "epoch": 0.029760727704172433, "step": 301, "train/sim_loss": 0.171875 }, { "epoch": 0.029760727704172433, "step": 301, "train/total_loss": 0.2672387361526489 }, { "entropy": 9.353124618530273, "epoch": 0.029859600553687957, "mean_token_accuracy": 0.7333333492279053, "num_tokens": 1579772.0, "step": 302, "train/ce_loss": 1.0786752700805664 }, { "epoch": 0.029859600553687957, "step": 302, "train/sim_loss": 0.140625 }, { "epoch": 0.029859600553687957, "step": 302, "train/total_loss": 0.2484925389289856 }, { "entropy": 9.137678146362305, "epoch": 0.02995847340320348, "mean_token_accuracy": 0.7180232405662537, "num_tokens": 1585265.0, "step": 303, "train/ce_loss": 0.6878090500831604 }, { "epoch": 0.02995847340320348, "step": 303, "train/sim_loss": 0.11328125 }, { "epoch": 0.02995847340320348, "step": 303, "train/total_loss": 0.18206214904785156 }, { "entropy": 9.260383605957031, "epoch": 0.030057346252719004, "mean_token_accuracy": 0.6810966730117798, "num_tokens": 1590428.0, "step": 304, "train/ce_loss": 0.5839409828186035 }, { "epoch": 0.030057346252719004, "step": 304, "train/sim_loss": 0.16015625 }, { "epoch": 0.030057346252719004, "step": 304, "train/total_loss": 0.21855035424232483 }, { "entropy": 9.679158210754395, "epoch": 0.030156219102234528, "mean_token_accuracy": 0.6967340707778931, "num_tokens": 1595508.0, "step": 305, "train/ce_loss": 1.1889206171035767 }, { "epoch": 0.030156219102234528, "step": 305, "train/sim_loss": 0.1640625 }, { "epoch": 0.030156219102234528, "step": 305, "train/total_loss": 0.2829545736312866 }, { "entropy": 9.837874412536621, "epoch": 0.030255091951750048, "mean_token_accuracy": 0.7153153419494629, "num_tokens": 1600467.0, "step": 306, "train/ce_loss": 1.511080026626587 }, { "epoch": 0.030255091951750048, "step": 306, "train/sim_loss": 0.14453125 }, { "epoch": 0.030255091951750048, "step": 306, "train/total_loss": 0.2956392765045166 }, { "entropy": 8.882853507995605, "epoch": 0.030353964801265572, "mean_token_accuracy": 0.7215346693992615, "num_tokens": 1605747.0, "step": 307, "train/ce_loss": 0.7038434743881226 }, { "epoch": 0.030353964801265572, "step": 307, "train/sim_loss": 0.06640625 }, { "epoch": 0.030353964801265572, "step": 307, "train/total_loss": 0.13679060339927673 }, { "entropy": 9.678632736206055, "epoch": 0.030452837650781096, "mean_token_accuracy": 0.7492163181304932, "num_tokens": 1610875.0, "step": 308, "train/ce_loss": 1.0676531791687012 }, { "epoch": 0.030452837650781096, "step": 308, "train/sim_loss": 0.1328125 }, { "epoch": 0.030452837650781096, "step": 308, "train/total_loss": 0.23957782983779907 }, { "entropy": 9.990983963012695, "epoch": 0.03055171050029662, "mean_token_accuracy": 0.7561521530151367, "num_tokens": 1615768.0, "step": 309, "train/ce_loss": 1.7758872509002686 }, { "epoch": 0.03055171050029662, "step": 309, "train/sim_loss": 0.12890625 }, { "epoch": 0.03055171050029662, "step": 309, "train/total_loss": 0.30649498105049133 }, { "entropy": 9.119295120239258, "epoch": 0.030650583349812143, "mean_token_accuracy": 0.7405515909194946, "num_tokens": 1621219.0, "step": 310, "train/ce_loss": 1.228122591972351 }, { "epoch": 0.030650583349812143, "step": 310, "train/sim_loss": 0.1171875 }, { "epoch": 0.030650583349812143, "step": 310, "train/total_loss": 0.23999977111816406 }, { "entropy": 9.466287612915039, "epoch": 0.030749456199327663, "mean_token_accuracy": 0.7510431408882141, "num_tokens": 1626430.0, "step": 311, "train/ce_loss": 1.2873427867889404 }, { "epoch": 0.030749456199327663, "step": 311, "train/sim_loss": 0.1484375 }, { "epoch": 0.030749456199327663, "step": 311, "train/total_loss": 0.277171790599823 }, { "entropy": 9.017744064331055, "epoch": 0.030848329048843187, "mean_token_accuracy": 0.7283422350883484, "num_tokens": 1631878.0, "step": 312, "train/ce_loss": 0.7938199639320374 }, { "epoch": 0.030848329048843187, "step": 312, "train/sim_loss": 0.12109375 }, { "epoch": 0.030848329048843187, "step": 312, "train/total_loss": 0.2004757523536682 }, { "entropy": 10.244565963745117, "epoch": 0.03094720189835871, "mean_token_accuracy": 0.7329843044281006, "num_tokens": 1636689.0, "step": 313, "train/ce_loss": 0.11315623670816422 }, { "epoch": 0.03094720189835871, "step": 313, "train/sim_loss": 0.125 }, { "epoch": 0.03094720189835871, "step": 313, "train/total_loss": 0.13631562888622284 }, { "entropy": 9.71835708618164, "epoch": 0.031046074747874235, "mean_token_accuracy": 0.655986487865448, "num_tokens": 1641719.0, "step": 314, "train/ce_loss": 1.4221258163452148 }, { "epoch": 0.031046074747874235, "step": 314, "train/sim_loss": 0.15234375 }, { "epoch": 0.031046074747874235, "step": 314, "train/total_loss": 0.29455631971359253 }, { "entropy": 9.370756149291992, "epoch": 0.03114494759738976, "mean_token_accuracy": 0.6858572959899902, "num_tokens": 1647010.0, "step": 315, "train/ce_loss": 1.1413757801055908 }, { "epoch": 0.03114494759738976, "step": 315, "train/sim_loss": 0.08203125 }, { "epoch": 0.03114494759738976, "step": 315, "train/total_loss": 0.19616883993148804 }, { "entropy": 9.630697250366211, "epoch": 0.03124382044690528, "mean_token_accuracy": 0.7165932655334473, "num_tokens": 1652125.0, "step": 316, "train/ce_loss": 1.0022145509719849 }, { "epoch": 0.03124382044690528, "step": 316, "train/sim_loss": 0.13671875 }, { "epoch": 0.03124382044690528, "step": 316, "train/total_loss": 0.2369402050971985 }, { "entropy": 9.33189868927002, "epoch": 0.0313426932964208, "mean_token_accuracy": 0.687915027141571, "num_tokens": 1657349.0, "step": 317, "train/ce_loss": 0.7280907034873962 }, { "epoch": 0.0313426932964208, "step": 317, "train/sim_loss": 0.09765625 }, { "epoch": 0.0313426932964208, "step": 317, "train/total_loss": 0.17046532034873962 }, { "entropy": 9.760059356689453, "epoch": 0.03144156614593632, "mean_token_accuracy": 0.734446108341217, "num_tokens": 1662421.0, "step": 318, "train/ce_loss": 1.6615796089172363 }, { "epoch": 0.03144156614593632, "step": 318, "train/sim_loss": 0.125 }, { "epoch": 0.03144156614593632, "step": 318, "train/total_loss": 0.29115796089172363 }, { "entropy": 9.35573673248291, "epoch": 0.03154043899545185, "mean_token_accuracy": 0.7798036336898804, "num_tokens": 1667672.0, "step": 319, "train/ce_loss": 0.9388341903686523 }, { "epoch": 0.03154043899545185, "step": 319, "train/sim_loss": 0.1875 }, { "epoch": 0.03154043899545185, "step": 319, "train/total_loss": 0.2813834249973297 }, { "epoch": 0.03163931184496737, "grad_norm": 1.9336750507354736, "learning_rate": 9.92360183948969e-06, "loss": 0.23, "step": 320 }, { "entropy": 8.928403854370117, "epoch": 0.03163931184496737, "mean_token_accuracy": 0.7151514887809753, "num_tokens": 1673111.0, "step": 320, "train/ce_loss": 1.0507612228393555 }, { "epoch": 0.03163931184496737, "step": 320, "train/sim_loss": 0.1171875 }, { "epoch": 0.03163931184496737, "step": 320, "train/total_loss": 0.2222636342048645 }, { "entropy": 9.394420623779297, "epoch": 0.0317381846944829, "mean_token_accuracy": 0.7555555701255798, "num_tokens": 1678316.0, "step": 321, "train/ce_loss": 0.5773379802703857 }, { "epoch": 0.0317381846944829, "step": 321, "train/sim_loss": 0.15625 }, { "epoch": 0.0317381846944829, "step": 321, "train/total_loss": 0.21398380398750305 }, { "entropy": 9.462547302246094, "epoch": 0.03183705754399842, "mean_token_accuracy": 0.7281420826911926, "num_tokens": 1683502.0, "step": 322, "train/ce_loss": 0.8007969260215759 }, { "epoch": 0.03183705754399842, "step": 322, "train/sim_loss": 0.0859375 }, { "epoch": 0.03183705754399842, "step": 322, "train/total_loss": 0.16601720452308655 }, { "entropy": 9.313180923461914, "epoch": 0.03193593039351394, "mean_token_accuracy": 0.6699629426002502, "num_tokens": 1688804.0, "step": 323, "train/ce_loss": 1.2433942556381226 }, { "epoch": 0.03193593039351394, "step": 323, "train/sim_loss": 0.17578125 }, { "epoch": 0.03193593039351394, "step": 323, "train/total_loss": 0.30012068152427673 }, { "entropy": 8.810771942138672, "epoch": 0.032034803243029465, "mean_token_accuracy": 0.6816443800926208, "num_tokens": 1694314.0, "step": 324, "train/ce_loss": 1.380662441253662 }, { "epoch": 0.032034803243029465, "step": 324, "train/sim_loss": 0.1953125 }, { "epoch": 0.032034803243029465, "step": 324, "train/total_loss": 0.33337873220443726 }, { "entropy": 9.738914489746094, "epoch": 0.032133676092544985, "mean_token_accuracy": 0.7730061411857605, "num_tokens": 1699329.0, "step": 325, "train/ce_loss": 0.8244234919548035 }, { "epoch": 0.032133676092544985, "step": 325, "train/sim_loss": 0.06640625 }, { "epoch": 0.032133676092544985, "step": 325, "train/total_loss": 0.14884859323501587 }, { "entropy": 9.07105827331543, "epoch": 0.03223254894206051, "mean_token_accuracy": 0.6907756924629211, "num_tokens": 1704797.0, "step": 326, "train/ce_loss": 0.7959145307540894 }, { "epoch": 0.03223254894206051, "step": 326, "train/sim_loss": 0.10546875 }, { "epoch": 0.03223254894206051, "step": 326, "train/total_loss": 0.18506020307540894 }, { "entropy": 9.063455581665039, "epoch": 0.03233142179157603, "mean_token_accuracy": 0.6761487722396851, "num_tokens": 1710183.0, "step": 327, "train/ce_loss": 1.2873786687850952 }, { "epoch": 0.03233142179157603, "step": 327, "train/sim_loss": 0.15625 }, { "epoch": 0.03233142179157603, "step": 327, "train/total_loss": 0.2849878668785095 }, { "entropy": 9.247129440307617, "epoch": 0.03243029464109155, "mean_token_accuracy": 0.678329586982727, "num_tokens": 1715575.0, "step": 328, "train/ce_loss": 0.8147888779640198 }, { "epoch": 0.03243029464109155, "step": 328, "train/sim_loss": 0.11328125 }, { "epoch": 0.03243029464109155, "step": 328, "train/total_loss": 0.19476014375686646 }, { "entropy": 9.43368911743164, "epoch": 0.03252916749060708, "mean_token_accuracy": 0.7616707682609558, "num_tokens": 1720850.0, "step": 329, "train/ce_loss": 1.0006930828094482 }, { "epoch": 0.03252916749060708, "step": 329, "train/sim_loss": 0.09765625 }, { "epoch": 0.03252916749060708, "step": 329, "train/total_loss": 0.1977255642414093 }, { "entropy": 9.383726119995117, "epoch": 0.0326280403401226, "mean_token_accuracy": 0.7228915691375732, "num_tokens": 1726093.0, "step": 330, "train/ce_loss": 0.9005025625228882 }, { "epoch": 0.0326280403401226, "step": 330, "train/sim_loss": 0.15234375 }, { "epoch": 0.0326280403401226, "step": 330, "train/total_loss": 0.24239400029182434 }, { "entropy": 9.186288833618164, "epoch": 0.03272691318963813, "mean_token_accuracy": 0.6726190447807312, "num_tokens": 1731378.0, "step": 331, "train/ce_loss": 0.8888669013977051 }, { "epoch": 0.03272691318963813, "step": 331, "train/sim_loss": 0.12109375 }, { "epoch": 0.03272691318963813, "step": 331, "train/total_loss": 0.20998044312000275 }, { "entropy": 9.234413146972656, "epoch": 0.03282578603915365, "mean_token_accuracy": 0.7278989553451538, "num_tokens": 1736731.0, "step": 332, "train/ce_loss": 1.1174771785736084 }, { "epoch": 0.03282578603915365, "step": 332, "train/sim_loss": 0.140625 }, { "epoch": 0.03282578603915365, "step": 332, "train/total_loss": 0.25237271189689636 }, { "entropy": 9.7683687210083, "epoch": 0.03292465888866917, "mean_token_accuracy": 0.761904776096344, "num_tokens": 1741814.0, "step": 333, "train/ce_loss": 0.810090959072113 }, { "epoch": 0.03292465888866917, "step": 333, "train/sim_loss": 0.109375 }, { "epoch": 0.03292465888866917, "step": 333, "train/total_loss": 0.19038408994674683 }, { "entropy": 9.496259689331055, "epoch": 0.033023531738184696, "mean_token_accuracy": 0.7402088642120361, "num_tokens": 1747017.0, "step": 334, "train/ce_loss": 1.5065089464187622 }, { "epoch": 0.033023531738184696, "step": 334, "train/sim_loss": 0.1171875 }, { "epoch": 0.033023531738184696, "step": 334, "train/total_loss": 0.26783841848373413 }, { "entropy": 9.342421531677246, "epoch": 0.033122404587700216, "mean_token_accuracy": 0.7607142925262451, "num_tokens": 1752372.0, "step": 335, "train/ce_loss": 1.0633383989334106 }, { "epoch": 0.033122404587700216, "step": 335, "train/sim_loss": 0.140625 }, { "epoch": 0.033122404587700216, "step": 335, "train/total_loss": 0.24695885181427002 }, { "entropy": 9.478039741516113, "epoch": 0.03322127743721574, "mean_token_accuracy": 0.7618438005447388, "num_tokens": 1757607.0, "step": 336, "train/ce_loss": 0.7672297358512878 }, { "epoch": 0.03322127743721574, "step": 336, "train/sim_loss": 0.11328125 }, { "epoch": 0.03322127743721574, "step": 336, "train/total_loss": 0.19000422954559326 }, { "entropy": 9.245959281921387, "epoch": 0.033320150286731264, "mean_token_accuracy": 0.7064732313156128, "num_tokens": 1763010.0, "step": 337, "train/ce_loss": 1.899410367012024 }, { "epoch": 0.033320150286731264, "step": 337, "train/sim_loss": 0.21875 }, { "epoch": 0.033320150286731264, "step": 337, "train/total_loss": 0.40869104862213135 }, { "entropy": 9.674976348876953, "epoch": 0.033419023136246784, "mean_token_accuracy": 0.7416918277740479, "num_tokens": 1768106.0, "step": 338, "train/ce_loss": 0.6345680356025696 }, { "epoch": 0.033419023136246784, "step": 338, "train/sim_loss": 0.0546875 }, { "epoch": 0.033419023136246784, "step": 338, "train/total_loss": 0.11814430356025696 }, { "entropy": 9.843158721923828, "epoch": 0.03351789598576231, "mean_token_accuracy": 0.7960000038146973, "num_tokens": 1773101.0, "step": 339, "train/ce_loss": 0.09235163033008575 }, { "epoch": 0.03351789598576231, "step": 339, "train/sim_loss": 0.12109375 }, { "epoch": 0.03351789598576231, "step": 339, "train/total_loss": 0.13032890856266022 }, { "epoch": 0.03361676883527783, "grad_norm": 1.593528151512146, "learning_rate": 9.918656974731741e-06, "loss": 0.2225, "step": 340 }, { "entropy": 9.499095916748047, "epoch": 0.03361676883527783, "mean_token_accuracy": 0.7245430946350098, "num_tokens": 1778328.0, "step": 340, "train/ce_loss": 1.1299872398376465 }, { "epoch": 0.03361676883527783, "step": 340, "train/sim_loss": 0.15625 }, { "epoch": 0.03361676883527783, "step": 340, "train/total_loss": 0.26924872398376465 }, { "entropy": 9.061765670776367, "epoch": 0.03371564168479336, "mean_token_accuracy": 0.7465968728065491, "num_tokens": 1783740.0, "step": 341, "train/ce_loss": 1.1679484844207764 }, { "epoch": 0.03371564168479336, "step": 341, "train/sim_loss": 0.15234375 }, { "epoch": 0.03371564168479336, "step": 341, "train/total_loss": 0.2691386044025421 }, { "entropy": 9.927787780761719, "epoch": 0.03381451453430888, "mean_token_accuracy": 0.7411764860153198, "num_tokens": 1788923.0, "step": 342, "train/ce_loss": 0.07725854218006134 }, { "epoch": 0.03381451453430888, "step": 342, "train/sim_loss": 0.08203125 }, { "epoch": 0.03381451453430888, "step": 342, "train/total_loss": 0.08975710719823837 }, { "entropy": 9.046907424926758, "epoch": 0.0339133873838244, "mean_token_accuracy": 0.7765957713127136, "num_tokens": 1794222.0, "step": 343, "train/ce_loss": 0.5280411243438721 }, { "epoch": 0.0339133873838244, "step": 343, "train/sim_loss": 0.140625 }, { "epoch": 0.0339133873838244, "step": 343, "train/total_loss": 0.1934291124343872 }, { "entropy": 9.82237434387207, "epoch": 0.034012260233339926, "mean_token_accuracy": 0.7479674816131592, "num_tokens": 1799263.0, "step": 344, "train/ce_loss": 0.07228964567184448 }, { "epoch": 0.034012260233339926, "step": 344, "train/sim_loss": 0.05859375 }, { "epoch": 0.034012260233339926, "step": 344, "train/total_loss": 0.06582271307706833 }, { "entropy": 9.202108383178711, "epoch": 0.03411113308285545, "mean_token_accuracy": 0.803748607635498, "num_tokens": 1804661.0, "step": 345, "train/ce_loss": 0.6875723004341125 }, { "epoch": 0.03411113308285545, "step": 345, "train/sim_loss": 0.06640625 }, { "epoch": 0.03411113308285545, "step": 345, "train/total_loss": 0.13516348600387573 }, { "entropy": 9.554802894592285, "epoch": 0.034210005932370974, "mean_token_accuracy": 0.7437407970428467, "num_tokens": 1809827.0, "step": 346, "train/ce_loss": 0.7660393714904785 }, { "epoch": 0.034210005932370974, "step": 346, "train/sim_loss": 0.12109375 }, { "epoch": 0.034210005932370974, "step": 346, "train/total_loss": 0.1976976990699768 }, { "entropy": 9.579947471618652, "epoch": 0.034308878781886494, "mean_token_accuracy": 0.6934749484062195, "num_tokens": 1814939.0, "step": 347, "train/ce_loss": 0.06933347135782242 }, { "epoch": 0.034308878781886494, "step": 347, "train/sim_loss": 0.1328125 }, { "epoch": 0.034308878781886494, "step": 347, "train/total_loss": 0.13974584639072418 }, { "entropy": 9.451644897460938, "epoch": 0.034407751631402014, "mean_token_accuracy": 0.6985769867897034, "num_tokens": 1820363.0, "step": 348, "train/ce_loss": 1.155254602432251 }, { "epoch": 0.034407751631402014, "step": 348, "train/sim_loss": 0.11328125 }, { "epoch": 0.034407751631402014, "step": 348, "train/total_loss": 0.22880670428276062 }, { "entropy": 10.010807991027832, "epoch": 0.03450662448091754, "mean_token_accuracy": 0.6879310607910156, "num_tokens": 1825338.0, "step": 349, "train/ce_loss": 0.0773419514298439 }, { "epoch": 0.03450662448091754, "step": 349, "train/sim_loss": 0.05078125 }, { "epoch": 0.03450662448091754, "step": 349, "train/total_loss": 0.05851544439792633 }, { "entropy": 9.17041015625, "epoch": 0.03460549733043306, "mean_token_accuracy": 0.7520184516906738, "num_tokens": 1830655.0, "step": 350, "train/ce_loss": 0.6814654469490051 }, { "epoch": 0.03460549733043306, "step": 350, "train/sim_loss": 0.06640625 }, { "epoch": 0.03460549733043306, "step": 350, "train/total_loss": 0.13455280661582947 }, { "entropy": 10.224859237670898, "epoch": 0.03470437017994859, "mean_token_accuracy": 0.7102137804031372, "num_tokens": 1835458.0, "step": 351, "train/ce_loss": 0.10597500205039978 }, { "epoch": 0.03470437017994859, "step": 351, "train/sim_loss": 0.046875 }, { "epoch": 0.03470437017994859, "step": 351, "train/total_loss": 0.05747250095009804 }, { "entropy": 9.771078109741211, "epoch": 0.03480324302946411, "mean_token_accuracy": 0.6955752372741699, "num_tokens": 1840454.0, "step": 352, "train/ce_loss": 0.07685268670320511 }, { "epoch": 0.03480324302946411, "step": 352, "train/sim_loss": 0.125 }, { "epoch": 0.03480324302946411, "step": 352, "train/total_loss": 0.13268527388572693 }, { "entropy": 9.915056228637695, "epoch": 0.03490211587897963, "mean_token_accuracy": 0.6846542954444885, "num_tokens": 1845477.0, "step": 353, "train/ce_loss": 0.07793489098548889 }, { "epoch": 0.03490211587897963, "step": 353, "train/sim_loss": 0.0859375 }, { "epoch": 0.03490211587897963, "step": 353, "train/total_loss": 0.09373098611831665 }, { "entropy": 9.970849990844727, "epoch": 0.03500098872849516, "mean_token_accuracy": 0.7813687920570374, "num_tokens": 1850429.0, "step": 354, "train/ce_loss": 0.08181675523519516 }, { "epoch": 0.03500098872849516, "step": 354, "train/sim_loss": 0.12890625 }, { "epoch": 0.03500098872849516, "step": 354, "train/total_loss": 0.13708792626857758 }, { "entropy": 9.199779510498047, "epoch": 0.03509986157801068, "mean_token_accuracy": 0.759100615978241, "num_tokens": 1855814.0, "step": 355, "train/ce_loss": 0.9021751284599304 }, { "epoch": 0.03509986157801068, "step": 355, "train/sim_loss": 0.08203125 }, { "epoch": 0.03509986157801068, "step": 355, "train/total_loss": 0.17224876582622528 }, { "entropy": 9.291690826416016, "epoch": 0.035198734427526204, "mean_token_accuracy": 0.709563136100769, "num_tokens": 1861150.0, "step": 356, "train/ce_loss": 0.7254281044006348 }, { "epoch": 0.035198734427526204, "step": 356, "train/sim_loss": 0.13671875 }, { "epoch": 0.035198734427526204, "step": 356, "train/total_loss": 0.20926156640052795 }, { "entropy": 9.447186470031738, "epoch": 0.035297607277041725, "mean_token_accuracy": 0.7272727489471436, "num_tokens": 1866398.0, "step": 357, "train/ce_loss": 0.6839994788169861 }, { "epoch": 0.035297607277041725, "step": 357, "train/sim_loss": 0.203125 }, { "epoch": 0.035297607277041725, "step": 357, "train/total_loss": 0.27152496576309204 }, { "entropy": 10.161275863647461, "epoch": 0.035396480126557245, "mean_token_accuracy": 0.7622222304344177, "num_tokens": 1871278.0, "step": 358, "train/ce_loss": 1.1349841356277466 }, { "epoch": 0.035396480126557245, "step": 358, "train/sim_loss": 0.1015625 }, { "epoch": 0.035396480126557245, "step": 358, "train/total_loss": 0.21506091952323914 }, { "entropy": 9.809469223022461, "epoch": 0.03549535297607277, "mean_token_accuracy": 0.6945337653160095, "num_tokens": 1876358.0, "step": 359, "train/ce_loss": 0.07336652278900146 }, { "epoch": 0.03549535297607277, "step": 359, "train/sim_loss": 0.0859375 }, { "epoch": 0.03549535297607277, "step": 359, "train/total_loss": 0.09327415376901627 }, { "epoch": 0.03559422582558829, "grad_norm": 1.2241110801696777, "learning_rate": 9.913712109973793e-06, "loss": 0.2122, "step": 360 }, { "entropy": 9.10953140258789, "epoch": 0.03559422582558829, "mean_token_accuracy": 0.7454128265380859, "num_tokens": 1881776.0, "step": 360, "train/ce_loss": 0.8973351120948792 }, { "epoch": 0.03559422582558829, "step": 360, "train/sim_loss": 0.19140625 }, { "epoch": 0.03559422582558829, "step": 360, "train/total_loss": 0.2811397612094879 }, { "entropy": 9.590354919433594, "epoch": 0.03569309867510382, "mean_token_accuracy": 0.807894766330719, "num_tokens": 1886981.0, "step": 361, "train/ce_loss": 0.05856641009449959 }, { "epoch": 0.03569309867510382, "step": 361, "train/sim_loss": 0.12109375 }, { "epoch": 0.03569309867510382, "step": 361, "train/total_loss": 0.12695039808750153 }, { "entropy": 9.504430770874023, "epoch": 0.03579197152461934, "mean_token_accuracy": 0.702570378780365, "num_tokens": 1892273.0, "step": 362, "train/ce_loss": 0.9794031381607056 }, { "epoch": 0.03579197152461934, "step": 362, "train/sim_loss": 0.1015625 }, { "epoch": 0.03579197152461934, "step": 362, "train/total_loss": 0.1995028257369995 }, { "entropy": 9.122315406799316, "epoch": 0.03589084437413486, "mean_token_accuracy": 0.7554240822792053, "num_tokens": 1897749.0, "step": 363, "train/ce_loss": 0.9510576128959656 }, { "epoch": 0.03589084437413486, "step": 363, "train/sim_loss": 0.0703125 }, { "epoch": 0.03589084437413486, "step": 363, "train/total_loss": 0.16541826725006104 }, { "entropy": 8.951866149902344, "epoch": 0.03598971722365039, "mean_token_accuracy": 0.7251461744308472, "num_tokens": 1903146.0, "step": 364, "train/ce_loss": 0.5546991229057312 }, { "epoch": 0.03598971722365039, "step": 364, "train/sim_loss": 0.0546875 }, { "epoch": 0.03598971722365039, "step": 364, "train/total_loss": 0.11015741527080536 }, { "entropy": 9.105632781982422, "epoch": 0.03608859007316591, "mean_token_accuracy": 0.7198622226715088, "num_tokens": 1908511.0, "step": 365, "train/ce_loss": 0.8597995042800903 }, { "epoch": 0.03608859007316591, "step": 365, "train/sim_loss": 0.09765625 }, { "epoch": 0.03608859007316591, "step": 365, "train/total_loss": 0.18363620340824127 }, { "entropy": 9.629191398620605, "epoch": 0.036187462922681435, "mean_token_accuracy": 0.703342616558075, "num_tokens": 1913848.0, "step": 366, "train/ce_loss": 1.2104308605194092 }, { "epoch": 0.036187462922681435, "step": 366, "train/sim_loss": 0.12109375 }, { "epoch": 0.036187462922681435, "step": 366, "train/total_loss": 0.24213683605194092 }, { "entropy": 9.564308166503906, "epoch": 0.036286335772196955, "mean_token_accuracy": 0.7109004855155945, "num_tokens": 1919134.0, "step": 367, "train/ce_loss": 1.0658434629440308 }, { "epoch": 0.036286335772196955, "step": 367, "train/sim_loss": 0.125 }, { "epoch": 0.036286335772196955, "step": 367, "train/total_loss": 0.2315843403339386 }, { "entropy": 9.375260353088379, "epoch": 0.036385208621712475, "mean_token_accuracy": 0.7547169923782349, "num_tokens": 1924431.0, "step": 368, "train/ce_loss": 1.1075860261917114 }, { "epoch": 0.036385208621712475, "step": 368, "train/sim_loss": 0.1171875 }, { "epoch": 0.036385208621712475, "step": 368, "train/total_loss": 0.22794610261917114 }, { "entropy": 9.342070579528809, "epoch": 0.036484081471228, "mean_token_accuracy": 0.6740740537643433, "num_tokens": 1929668.0, "step": 369, "train/ce_loss": 2.2320828437805176 }, { "epoch": 0.036484081471228, "step": 369, "train/sim_loss": 0.1953125 }, { "epoch": 0.036484081471228, "step": 369, "train/total_loss": 0.41852080821990967 }, { "entropy": 9.583789825439453, "epoch": 0.03658295432074352, "mean_token_accuracy": 0.71378093957901, "num_tokens": 1934657.0, "step": 370, "train/ce_loss": 1.2746697664260864 }, { "epoch": 0.03658295432074352, "step": 370, "train/sim_loss": 0.08203125 }, { "epoch": 0.03658295432074352, "step": 370, "train/total_loss": 0.20949822664260864 }, { "entropy": 9.580131530761719, "epoch": 0.03668182717025905, "mean_token_accuracy": 0.755215585231781, "num_tokens": 1939828.0, "step": 371, "train/ce_loss": 0.8176199793815613 }, { "epoch": 0.03668182717025905, "step": 371, "train/sim_loss": 0.171875 }, { "epoch": 0.03668182717025905, "step": 371, "train/total_loss": 0.25363701581954956 }, { "entropy": 9.380910873413086, "epoch": 0.03678070001977457, "mean_token_accuracy": 0.7086801528930664, "num_tokens": 1945141.0, "step": 372, "train/ce_loss": 1.2361494302749634 }, { "epoch": 0.03678070001977457, "step": 372, "train/sim_loss": 0.08203125 }, { "epoch": 0.03678070001977457, "step": 372, "train/total_loss": 0.20564618706703186 }, { "entropy": 9.45290756225586, "epoch": 0.03687957286929009, "mean_token_accuracy": 0.7642679810523987, "num_tokens": 1950405.0, "step": 373, "train/ce_loss": 0.949661135673523 }, { "epoch": 0.03687957286929009, "step": 373, "train/sim_loss": 0.125 }, { "epoch": 0.03687957286929009, "step": 373, "train/total_loss": 0.2199661135673523 }, { "entropy": 9.106277465820312, "epoch": 0.03697844571880562, "mean_token_accuracy": 0.7809057235717773, "num_tokens": 1955673.0, "step": 374, "train/ce_loss": 0.6012702584266663 }, { "epoch": 0.03697844571880562, "step": 374, "train/sim_loss": 0.16796875 }, { "epoch": 0.03697844571880562, "step": 374, "train/total_loss": 0.22809576988220215 }, { "entropy": 9.308792114257812, "epoch": 0.03707731856832114, "mean_token_accuracy": 0.6877990365028381, "num_tokens": 1960969.0, "step": 375, "train/ce_loss": 0.7121626734733582 }, { "epoch": 0.03707731856832114, "step": 375, "train/sim_loss": 0.1484375 }, { "epoch": 0.03707731856832114, "step": 375, "train/total_loss": 0.21965377032756805 }, { "entropy": 9.10552978515625, "epoch": 0.037176191417836665, "mean_token_accuracy": 0.7052631378173828, "num_tokens": 1966398.0, "step": 376, "train/ce_loss": 1.4431555271148682 }, { "epoch": 0.037176191417836665, "step": 376, "train/sim_loss": 0.19921875 }, { "epoch": 0.037176191417836665, "step": 376, "train/total_loss": 0.34353429079055786 }, { "entropy": 9.340739250183105, "epoch": 0.037275064267352186, "mean_token_accuracy": 0.7555012106895447, "num_tokens": 1971693.0, "step": 377, "train/ce_loss": 0.5882933735847473 }, { "epoch": 0.037275064267352186, "step": 377, "train/sim_loss": 0.078125 }, { "epoch": 0.037275064267352186, "step": 377, "train/total_loss": 0.13695433735847473 }, { "entropy": 10.635891914367676, "epoch": 0.037373937116867706, "mean_token_accuracy": 0.8177340030670166, "num_tokens": 1976284.0, "step": 378, "train/ce_loss": 0.2206471860408783 }, { "epoch": 0.037373937116867706, "step": 378, "train/sim_loss": 0.07421875 }, { "epoch": 0.037373937116867706, "step": 378, "train/total_loss": 0.09628346562385559 }, { "entropy": 9.537246704101562, "epoch": 0.03747280996638323, "mean_token_accuracy": 0.7893961668014526, "num_tokens": 1981422.0, "step": 379, "train/ce_loss": 1.5890189409255981 }, { "epoch": 0.03747280996638323, "step": 379, "train/sim_loss": 0.16015625 }, { "epoch": 0.03747280996638323, "step": 379, "train/total_loss": 0.3190581500530243 }, { "epoch": 0.03757168281589875, "grad_norm": 1.5449098348617554, "learning_rate": 9.908767245215844e-06, "loss": 0.2045, "step": 380 }, { "entropy": 9.678614616394043, "epoch": 0.03757168281589875, "mean_token_accuracy": 0.7061538696289062, "num_tokens": 1986517.0, "step": 380, "train/ce_loss": 1.311052680015564 }, { "epoch": 0.03757168281589875, "step": 380, "train/sim_loss": 0.12890625 }, { "epoch": 0.03757168281589875, "step": 380, "train/total_loss": 0.2600115239620209 }, { "entropy": 9.551870346069336, "epoch": 0.03767055566541428, "mean_token_accuracy": 0.7640449404716492, "num_tokens": 1991679.0, "step": 381, "train/ce_loss": 0.06471993774175644 }, { "epoch": 0.03767055566541428, "step": 381, "train/sim_loss": 0.08203125 }, { "epoch": 0.03767055566541428, "step": 381, "train/total_loss": 0.08850324153900146 }, { "entropy": 8.966535568237305, "epoch": 0.0377694285149298, "mean_token_accuracy": 0.7140204310417175, "num_tokens": 1997213.0, "step": 382, "train/ce_loss": 1.2641946077346802 }, { "epoch": 0.0377694285149298, "step": 382, "train/sim_loss": 0.11328125 }, { "epoch": 0.0377694285149298, "step": 382, "train/total_loss": 0.23970071971416473 }, { "entropy": 9.849260330200195, "epoch": 0.03786830136444532, "mean_token_accuracy": 0.796875, "num_tokens": 2002255.0, "step": 383, "train/ce_loss": 1.4398695230484009 }, { "epoch": 0.03786830136444532, "step": 383, "train/sim_loss": 0.1015625 }, { "epoch": 0.03786830136444532, "step": 383, "train/total_loss": 0.24554945528507233 }, { "entropy": 9.840927124023438, "epoch": 0.03796717421396085, "mean_token_accuracy": 0.6904761791229248, "num_tokens": 2007253.0, "step": 384, "train/ce_loss": 0.07275044918060303 }, { "epoch": 0.03796717421396085, "step": 384, "train/sim_loss": 0.13671875 }, { "epoch": 0.03796717421396085, "step": 384, "train/total_loss": 0.1439937949180603 }, { "entropy": 9.434771537780762, "epoch": 0.03806604706347637, "mean_token_accuracy": 0.7175843715667725, "num_tokens": 2012308.0, "step": 385, "train/ce_loss": 1.2704976797103882 }, { "epoch": 0.03806604706347637, "step": 385, "train/sim_loss": 0.109375 }, { "epoch": 0.03806604706347637, "step": 385, "train/total_loss": 0.2364247739315033 }, { "entropy": 9.346845626831055, "epoch": 0.03816491991299189, "mean_token_accuracy": 0.7320573925971985, "num_tokens": 2017561.0, "step": 386, "train/ce_loss": 0.6856738924980164 }, { "epoch": 0.03816491991299189, "step": 386, "train/sim_loss": 0.08203125 }, { "epoch": 0.03816491991299189, "step": 386, "train/total_loss": 0.1505986452102661 }, { "entropy": 9.976537704467773, "epoch": 0.038263792762507416, "mean_token_accuracy": 0.7317939400672913, "num_tokens": 2022530.0, "step": 387, "train/ce_loss": 0.07988831400871277 }, { "epoch": 0.038263792762507416, "step": 387, "train/sim_loss": 0.09375 }, { "epoch": 0.038263792762507416, "step": 387, "train/total_loss": 0.1017388328909874 }, { "entropy": 9.263082504272461, "epoch": 0.038362665612022936, "mean_token_accuracy": 0.7423728704452515, "num_tokens": 2027916.0, "step": 388, "train/ce_loss": 0.9694319367408752 }, { "epoch": 0.038362665612022936, "step": 388, "train/sim_loss": 0.0703125 }, { "epoch": 0.038362665612022936, "step": 388, "train/total_loss": 0.167255699634552 }, { "entropy": 9.486333847045898, "epoch": 0.038461538461538464, "mean_token_accuracy": 0.7420538067817688, "num_tokens": 2033210.0, "step": 389, "train/ce_loss": 1.5127630233764648 }, { "epoch": 0.038461538461538464, "step": 389, "train/sim_loss": 0.171875 }, { "epoch": 0.038461538461538464, "step": 389, "train/total_loss": 0.32315129041671753 }, { "entropy": 9.922418594360352, "epoch": 0.038560411311053984, "mean_token_accuracy": 0.7598039507865906, "num_tokens": 2038255.0, "step": 390, "train/ce_loss": 0.6838778853416443 }, { "epoch": 0.038560411311053984, "step": 390, "train/sim_loss": 0.12890625 }, { "epoch": 0.038560411311053984, "step": 390, "train/total_loss": 0.19729404151439667 }, { "entropy": 9.55936050415039, "epoch": 0.038659284160569504, "mean_token_accuracy": 0.7387005686759949, "num_tokens": 2043404.0, "step": 391, "train/ce_loss": 0.9636222720146179 }, { "epoch": 0.038659284160569504, "step": 391, "train/sim_loss": 0.09765625 }, { "epoch": 0.038659284160569504, "step": 391, "train/total_loss": 0.19401848316192627 }, { "entropy": 9.560068130493164, "epoch": 0.03875815701008503, "mean_token_accuracy": 0.712435245513916, "num_tokens": 2048633.0, "step": 392, "train/ce_loss": 0.05836481228470802 }, { "epoch": 0.03875815701008503, "step": 392, "train/sim_loss": 0.1015625 }, { "epoch": 0.03875815701008503, "step": 392, "train/total_loss": 0.10739897936582565 }, { "entropy": 9.025551795959473, "epoch": 0.03885702985960055, "mean_token_accuracy": 0.7123420834541321, "num_tokens": 2054126.0, "step": 393, "train/ce_loss": 0.6507374048233032 }, { "epoch": 0.03885702985960055, "step": 393, "train/sim_loss": 0.04296875 }, { "epoch": 0.03885702985960055, "step": 393, "train/total_loss": 0.10804249346256256 }, { "entropy": 9.996772766113281, "epoch": 0.03895590270911608, "mean_token_accuracy": 0.7022058963775635, "num_tokens": 2059108.0, "step": 394, "train/ce_loss": 2.8516266345977783 }, { "epoch": 0.03895590270911608, "step": 394, "train/sim_loss": 0.140625 }, { "epoch": 0.03895590270911608, "step": 394, "train/total_loss": 0.42578765749931335 }, { "entropy": 10.15201187133789, "epoch": 0.0390547755586316, "mean_token_accuracy": 0.7736263871192932, "num_tokens": 2063954.0, "step": 395, "train/ce_loss": 1.0598214864730835 }, { "epoch": 0.0390547755586316, "step": 395, "train/sim_loss": 0.171875 }, { "epoch": 0.0390547755586316, "step": 395, "train/total_loss": 0.2778571546077728 }, { "entropy": 9.360065460205078, "epoch": 0.03915364840814712, "mean_token_accuracy": 0.7875586748123169, "num_tokens": 2069258.0, "step": 396, "train/ce_loss": 1.0520858764648438 }, { "epoch": 0.03915364840814712, "step": 396, "train/sim_loss": 0.14453125 }, { "epoch": 0.03915364840814712, "step": 396, "train/total_loss": 0.2497398406267166 }, { "entropy": 9.63375473022461, "epoch": 0.03925252125766265, "mean_token_accuracy": 0.691717803478241, "num_tokens": 2074378.0, "step": 397, "train/ce_loss": 1.5342135429382324 }, { "epoch": 0.03925252125766265, "step": 397, "train/sim_loss": 0.17578125 }, { "epoch": 0.03925252125766265, "step": 397, "train/total_loss": 0.3292025923728943 }, { "entropy": 9.150320053100586, "epoch": 0.03935139410717817, "mean_token_accuracy": 0.6758767366409302, "num_tokens": 2079768.0, "step": 398, "train/ce_loss": 1.444730281829834 }, { "epoch": 0.03935139410717817, "step": 398, "train/sim_loss": 0.1015625 }, { "epoch": 0.03935139410717817, "step": 398, "train/total_loss": 0.24603553116321564 }, { "entropy": 9.808318138122559, "epoch": 0.039450266956693694, "mean_token_accuracy": 0.6682692170143127, "num_tokens": 2084825.0, "step": 399, "train/ce_loss": 1.5510262250900269 }, { "epoch": 0.039450266956693694, "step": 399, "train/sim_loss": 0.16015625 }, { "epoch": 0.039450266956693694, "step": 399, "train/total_loss": 0.31525886058807373 }, { "epoch": 0.039549139806209214, "grad_norm": 2.2871906757354736, "learning_rate": 9.903822380457894e-06, "loss": 0.212, "step": 400 }, { "entropy": 10.660465240478516, "epoch": 0.039549139806209214, "mean_token_accuracy": 0.7262569665908813, "num_tokens": 2089397.0, "step": 400, "train/ce_loss": 0.256531298160553 }, { "epoch": 0.039549139806209214, "step": 400, "train/sim_loss": 0.09375 }, { "epoch": 0.039549139806209214, "step": 400, "train/total_loss": 0.11940313130617142 }, { "entropy": 9.446050643920898, "epoch": 0.039648012655724735, "mean_token_accuracy": 0.6892856955528259, "num_tokens": 2094670.0, "step": 401, "train/ce_loss": 1.4382058382034302 }, { "epoch": 0.039648012655724735, "step": 401, "train/sim_loss": 0.1796875 }, { "epoch": 0.039648012655724735, "step": 401, "train/total_loss": 0.323508083820343 }, { "entropy": 9.751455307006836, "epoch": 0.03974688550524026, "mean_token_accuracy": 0.7054263353347778, "num_tokens": 2099743.0, "step": 402, "train/ce_loss": 0.8057286143302917 }, { "epoch": 0.03974688550524026, "step": 402, "train/sim_loss": 0.1328125 }, { "epoch": 0.03974688550524026, "step": 402, "train/total_loss": 0.21338537335395813 }, { "entropy": 9.942941665649414, "epoch": 0.03984575835475578, "mean_token_accuracy": 0.7459749579429626, "num_tokens": 2104887.0, "step": 403, "train/ce_loss": 1.0311578512191772 }, { "epoch": 0.03984575835475578, "step": 403, "train/sim_loss": 0.10546875 }, { "epoch": 0.03984575835475578, "step": 403, "train/total_loss": 0.20858454704284668 }, { "entropy": 10.0800199508667, "epoch": 0.03994463120427131, "mean_token_accuracy": 0.6900901198387146, "num_tokens": 2109903.0, "step": 404, "train/ce_loss": 1.4212541580200195 }, { "epoch": 0.03994463120427131, "step": 404, "train/sim_loss": 0.10546875 }, { "epoch": 0.03994463120427131, "step": 404, "train/total_loss": 0.24759416282176971 }, { "entropy": 9.820121765136719, "epoch": 0.04004350405378683, "mean_token_accuracy": 0.7098150849342346, "num_tokens": 2115001.0, "step": 405, "train/ce_loss": 0.9588977098464966 }, { "epoch": 0.04004350405378683, "step": 405, "train/sim_loss": 0.078125 }, { "epoch": 0.04004350405378683, "step": 405, "train/total_loss": 0.17401477694511414 }, { "entropy": 8.908750534057617, "epoch": 0.04014237690330235, "mean_token_accuracy": 0.7490909099578857, "num_tokens": 2120590.0, "step": 406, "train/ce_loss": 0.5333845615386963 }, { "epoch": 0.04014237690330235, "step": 406, "train/sim_loss": 0.1484375 }, { "epoch": 0.04014237690330235, "step": 406, "train/total_loss": 0.2017759531736374 }, { "entropy": 9.100845336914062, "epoch": 0.04024124975281788, "mean_token_accuracy": 0.7977142930030823, "num_tokens": 2125997.0, "step": 407, "train/ce_loss": 0.46464645862579346 }, { "epoch": 0.04024124975281788, "step": 407, "train/sim_loss": 0.0546875 }, { "epoch": 0.04024124975281788, "step": 407, "train/total_loss": 0.10115215182304382 }, { "entropy": 9.426050186157227, "epoch": 0.0403401226023334, "mean_token_accuracy": 0.7438867688179016, "num_tokens": 2131308.0, "step": 408, "train/ce_loss": 1.037602186203003 }, { "epoch": 0.0403401226023334, "step": 408, "train/sim_loss": 0.0703125 }, { "epoch": 0.0403401226023334, "step": 408, "train/total_loss": 0.17407271265983582 }, { "entropy": 9.679730415344238, "epoch": 0.040438995451848925, "mean_token_accuracy": 0.7153392434120178, "num_tokens": 2136448.0, "step": 409, "train/ce_loss": 0.06517762690782547 }, { "epoch": 0.040438995451848925, "step": 409, "train/sim_loss": 0.171875 }, { "epoch": 0.040438995451848925, "step": 409, "train/total_loss": 0.17839276790618896 }, { "entropy": 9.857072830200195, "epoch": 0.040537868301364445, "mean_token_accuracy": 0.7915966510772705, "num_tokens": 2141513.0, "step": 410, "train/ce_loss": 1.0456303358078003 }, { "epoch": 0.040537868301364445, "step": 410, "train/sim_loss": 0.16796875 }, { "epoch": 0.040537868301364445, "step": 410, "train/total_loss": 0.27253177762031555 }, { "entropy": 9.174946784973145, "epoch": 0.040636741150879965, "mean_token_accuracy": 0.7205422520637512, "num_tokens": 2146915.0, "step": 411, "train/ce_loss": 0.7723899483680725 }, { "epoch": 0.040636741150879965, "step": 411, "train/sim_loss": 0.11328125 }, { "epoch": 0.040636741150879965, "step": 411, "train/total_loss": 0.1905202567577362 }, { "entropy": 9.211700439453125, "epoch": 0.04073561400039549, "mean_token_accuracy": 0.7519181370735168, "num_tokens": 2152109.0, "step": 412, "train/ce_loss": 0.9920454621315002 }, { "epoch": 0.04073561400039549, "step": 412, "train/sim_loss": 0.13671875 }, { "epoch": 0.04073561400039549, "step": 412, "train/total_loss": 0.23592329025268555 }, { "entropy": 9.378646850585938, "epoch": 0.04083448684991101, "mean_token_accuracy": 0.7542856931686401, "num_tokens": 2157459.0, "step": 413, "train/ce_loss": 0.9726569652557373 }, { "epoch": 0.04083448684991101, "step": 413, "train/sim_loss": 0.1328125 }, { "epoch": 0.04083448684991101, "step": 413, "train/total_loss": 0.23007819056510925 }, { "entropy": 9.208342552185059, "epoch": 0.04093335969942654, "mean_token_accuracy": 0.6861538290977478, "num_tokens": 2162936.0, "step": 414, "train/ce_loss": 1.107186198234558 }, { "epoch": 0.04093335969942654, "step": 414, "train/sim_loss": 0.09375 }, { "epoch": 0.04093335969942654, "step": 414, "train/total_loss": 0.20446862280368805 }, { "entropy": 9.583135604858398, "epoch": 0.04103223254894206, "mean_token_accuracy": 0.740641713142395, "num_tokens": 2168070.0, "step": 415, "train/ce_loss": 0.6643601059913635 }, { "epoch": 0.04103223254894206, "step": 415, "train/sim_loss": 0.109375 }, { "epoch": 0.04103223254894206, "step": 415, "train/total_loss": 0.1758110225200653 }, { "entropy": 9.213274002075195, "epoch": 0.04113110539845758, "mean_token_accuracy": 0.6670157313346863, "num_tokens": 2173464.0, "step": 416, "train/ce_loss": 1.370169997215271 }, { "epoch": 0.04113110539845758, "step": 416, "train/sim_loss": 0.11328125 }, { "epoch": 0.04113110539845758, "step": 416, "train/total_loss": 0.25029826164245605 }, { "entropy": 9.854242324829102, "epoch": 0.04122997824797311, "mean_token_accuracy": 0.7210776805877686, "num_tokens": 2178511.0, "step": 417, "train/ce_loss": 1.0721811056137085 }, { "epoch": 0.04122997824797311, "step": 417, "train/sim_loss": 0.046875 }, { "epoch": 0.04122997824797311, "step": 417, "train/total_loss": 0.15409311652183533 }, { "entropy": 9.106668472290039, "epoch": 0.04132885109748863, "mean_token_accuracy": 0.773797333240509, "num_tokens": 2183928.0, "step": 418, "train/ce_loss": 0.7181837558746338 }, { "epoch": 0.04132885109748863, "step": 418, "train/sim_loss": 0.04296875 }, { "epoch": 0.04132885109748863, "step": 418, "train/total_loss": 0.11478712409734726 }, { "entropy": 9.570853233337402, "epoch": 0.041427723947004155, "mean_token_accuracy": 0.7247474789619446, "num_tokens": 2189194.0, "step": 419, "train/ce_loss": 1.313214898109436 }, { "epoch": 0.041427723947004155, "step": 419, "train/sim_loss": 0.109375 }, { "epoch": 0.041427723947004155, "step": 419, "train/total_loss": 0.2406964898109436 }, { "epoch": 0.041526596796519676, "grad_norm": 1.4583357572555542, "learning_rate": 9.898877515699947e-06, "loss": 0.2068, "step": 420 }, { "entropy": 9.421991348266602, "epoch": 0.041526596796519676, "mean_token_accuracy": 0.7029449343681335, "num_tokens": 2194472.0, "step": 420, "train/ce_loss": 1.083237886428833 }, { "epoch": 0.041526596796519676, "step": 420, "train/sim_loss": 0.09375 }, { "epoch": 0.041526596796519676, "step": 420, "train/total_loss": 0.20207378268241882 }, { "entropy": 9.213184356689453, "epoch": 0.041625469646035196, "mean_token_accuracy": 0.7299435138702393, "num_tokens": 2199854.0, "step": 421, "train/ce_loss": 0.7965179085731506 }, { "epoch": 0.041625469646035196, "step": 421, "train/sim_loss": 0.06640625 }, { "epoch": 0.041625469646035196, "step": 421, "train/total_loss": 0.14605805277824402 }, { "entropy": 9.282001495361328, "epoch": 0.04172434249555072, "mean_token_accuracy": 0.731225311756134, "num_tokens": 2205172.0, "step": 422, "train/ce_loss": 0.6847802996635437 }, { "epoch": 0.04172434249555072, "step": 422, "train/sim_loss": 0.13671875 }, { "epoch": 0.04172434249555072, "step": 422, "train/total_loss": 0.2051967829465866 }, { "entropy": 9.40710163116455, "epoch": 0.04182321534506624, "mean_token_accuracy": 0.7071688771247864, "num_tokens": 2210504.0, "step": 423, "train/ce_loss": 0.9189309477806091 }, { "epoch": 0.04182321534506624, "step": 423, "train/sim_loss": 0.109375 }, { "epoch": 0.04182321534506624, "step": 423, "train/total_loss": 0.20126810669898987 }, { "entropy": 9.208239555358887, "epoch": 0.04192208819458177, "mean_token_accuracy": 0.7650273442268372, "num_tokens": 2215960.0, "step": 424, "train/ce_loss": 0.6772084832191467 }, { "epoch": 0.04192208819458177, "step": 424, "train/sim_loss": 0.08984375 }, { "epoch": 0.04192208819458177, "step": 424, "train/total_loss": 0.15756461024284363 }, { "entropy": 9.339761734008789, "epoch": 0.04202096104409729, "mean_token_accuracy": 0.7011995911598206, "num_tokens": 2221354.0, "step": 425, "train/ce_loss": 1.2315471172332764 }, { "epoch": 0.04202096104409729, "step": 425, "train/sim_loss": 0.109375 }, { "epoch": 0.04202096104409729, "step": 425, "train/total_loss": 0.23252971470355988 }, { "entropy": 10.32594108581543, "epoch": 0.04211983389361281, "mean_token_accuracy": 0.6701570749282837, "num_tokens": 2226144.0, "step": 426, "train/ce_loss": 0.11662330478429794 }, { "epoch": 0.04211983389361281, "step": 426, "train/sim_loss": 0.0546875 }, { "epoch": 0.04211983389361281, "step": 426, "train/total_loss": 0.06634983420372009 }, { "entropy": 9.58592414855957, "epoch": 0.04221870674312834, "mean_token_accuracy": 0.6890410780906677, "num_tokens": 2231272.0, "step": 427, "train/ce_loss": 1.2044618129730225 }, { "epoch": 0.04221870674312834, "step": 427, "train/sim_loss": 0.1328125 }, { "epoch": 0.04221870674312834, "step": 427, "train/total_loss": 0.25325867533683777 }, { "entropy": 10.165045738220215, "epoch": 0.04231757959264386, "mean_token_accuracy": 0.6581395268440247, "num_tokens": 2236127.0, "step": 428, "train/ce_loss": 2.049638509750366 }, { "epoch": 0.04231757959264386, "step": 428, "train/sim_loss": 0.125 }, { "epoch": 0.04231757959264386, "step": 428, "train/total_loss": 0.3299638628959656 }, { "entropy": 9.492660522460938, "epoch": 0.042416452442159386, "mean_token_accuracy": 0.7185792326927185, "num_tokens": 2241314.0, "step": 429, "train/ce_loss": 0.8751475214958191 }, { "epoch": 0.042416452442159386, "step": 429, "train/sim_loss": 0.17578125 }, { "epoch": 0.042416452442159386, "step": 429, "train/total_loss": 0.2632960081100464 }, { "entropy": 9.028284072875977, "epoch": 0.042515325291674906, "mean_token_accuracy": 0.6537585258483887, "num_tokens": 2246691.0, "step": 430, "train/ce_loss": 1.0627728700637817 }, { "epoch": 0.042515325291674906, "step": 430, "train/sim_loss": 0.08984375 }, { "epoch": 0.042515325291674906, "step": 430, "train/total_loss": 0.19612103700637817 }, { "entropy": 9.469210624694824, "epoch": 0.042614198141190426, "mean_token_accuracy": 0.6758104562759399, "num_tokens": 2251923.0, "step": 431, "train/ce_loss": 0.8347735404968262 }, { "epoch": 0.042614198141190426, "step": 431, "train/sim_loss": 0.09375 }, { "epoch": 0.042614198141190426, "step": 431, "train/total_loss": 0.17722734808921814 }, { "entropy": 9.71932601928711, "epoch": 0.042713070990705954, "mean_token_accuracy": 0.7784615159034729, "num_tokens": 2257009.0, "step": 432, "train/ce_loss": 0.8408069014549255 }, { "epoch": 0.042713070990705954, "step": 432, "train/sim_loss": 0.125 }, { "epoch": 0.042713070990705954, "step": 432, "train/total_loss": 0.20908069610595703 }, { "entropy": 9.299928665161133, "epoch": 0.042811943840221474, "mean_token_accuracy": 0.7300771474838257, "num_tokens": 2262256.0, "step": 433, "train/ce_loss": 1.170479655265808 }, { "epoch": 0.042811943840221474, "step": 433, "train/sim_loss": 0.16796875 }, { "epoch": 0.042811943840221474, "step": 433, "train/total_loss": 0.2850167155265808 }, { "entropy": 9.70884895324707, "epoch": 0.042910816689737, "mean_token_accuracy": 0.7266775965690613, "num_tokens": 2267328.0, "step": 434, "train/ce_loss": 0.07351674884557724 }, { "epoch": 0.042910816689737, "step": 434, "train/sim_loss": 0.12890625 }, { "epoch": 0.042910816689737, "step": 434, "train/total_loss": 0.13625793159008026 }, { "entropy": 9.696551322937012, "epoch": 0.04300968953925252, "mean_token_accuracy": 0.7911184430122375, "num_tokens": 2272356.0, "step": 435, "train/ce_loss": 0.9318480491638184 }, { "epoch": 0.04300968953925252, "step": 435, "train/sim_loss": 0.15234375 }, { "epoch": 0.04300968953925252, "step": 435, "train/total_loss": 0.24552854895591736 }, { "entropy": 9.346677780151367, "epoch": 0.04310856238876804, "mean_token_accuracy": 0.7118644118309021, "num_tokens": 2277737.0, "step": 436, "train/ce_loss": 0.6921613216400146 }, { "epoch": 0.04310856238876804, "step": 436, "train/sim_loss": 0.1171875 }, { "epoch": 0.04310856238876804, "step": 436, "train/total_loss": 0.18640363216400146 }, { "entropy": 9.694978713989258, "epoch": 0.04320743523828357, "mean_token_accuracy": 0.673202633857727, "num_tokens": 2283078.0, "step": 437, "train/ce_loss": 0.05739998072385788 }, { "epoch": 0.04320743523828357, "step": 437, "train/sim_loss": 0.05078125 }, { "epoch": 0.04320743523828357, "step": 437, "train/total_loss": 0.05652124807238579 }, { "entropy": 9.755623817443848, "epoch": 0.04330630808779909, "mean_token_accuracy": 0.6567862629890442, "num_tokens": 2288169.0, "step": 438, "train/ce_loss": 3.244351625442505 }, { "epoch": 0.04330630808779909, "step": 438, "train/sim_loss": 0.125 }, { "epoch": 0.04330630808779909, "step": 438, "train/total_loss": 0.44943517446517944 }, { "entropy": 9.664872169494629, "epoch": 0.043405180937314616, "mean_token_accuracy": 0.7266187071800232, "num_tokens": 2293346.0, "step": 439, "train/ce_loss": 1.3087403774261475 }, { "epoch": 0.043405180937314616, "step": 439, "train/sim_loss": 0.078125 }, { "epoch": 0.043405180937314616, "step": 439, "train/total_loss": 0.20899903774261475 }, { "epoch": 0.04350405378683014, "grad_norm": 2.076592445373535, "learning_rate": 9.893932650941997e-06, "loss": 0.2254, "step": 440 }, { "entropy": 9.562398910522461, "epoch": 0.04350405378683014, "mean_token_accuracy": 0.7599999904632568, "num_tokens": 2298580.0, "step": 440, "train/ce_loss": 1.0011541843414307 }, { "epoch": 0.04350405378683014, "step": 440, "train/sim_loss": 0.15234375 }, { "epoch": 0.04350405378683014, "step": 440, "train/total_loss": 0.25245916843414307 }, { "entropy": 9.547796249389648, "epoch": 0.04360292663634566, "mean_token_accuracy": 0.7114177942276001, "num_tokens": 2303822.0, "step": 441, "train/ce_loss": 1.0273072719573975 }, { "epoch": 0.04360292663634566, "step": 441, "train/sim_loss": 0.07421875 }, { "epoch": 0.04360292663634566, "step": 441, "train/total_loss": 0.17694947123527527 }, { "entropy": 9.212635040283203, "epoch": 0.043701799485861184, "mean_token_accuracy": 0.6997663378715515, "num_tokens": 2309186.0, "step": 442, "train/ce_loss": 1.113999605178833 }, { "epoch": 0.043701799485861184, "step": 442, "train/sim_loss": 0.1328125 }, { "epoch": 0.043701799485861184, "step": 442, "train/total_loss": 0.24421246349811554 }, { "entropy": 9.891897201538086, "epoch": 0.043800672335376704, "mean_token_accuracy": 0.7316293716430664, "num_tokens": 2314244.0, "step": 443, "train/ce_loss": 1.3147135972976685 }, { "epoch": 0.043800672335376704, "step": 443, "train/sim_loss": 0.09765625 }, { "epoch": 0.043800672335376704, "step": 443, "train/total_loss": 0.22912761569023132 }, { "entropy": 9.439208984375, "epoch": 0.04389954518489223, "mean_token_accuracy": 0.6620603203773499, "num_tokens": 2319515.0, "step": 444, "train/ce_loss": 0.9787640571594238 }, { "epoch": 0.04389954518489223, "step": 444, "train/sim_loss": 0.09375 }, { "epoch": 0.04389954518489223, "step": 444, "train/total_loss": 0.1916263997554779 }, { "entropy": 9.741747856140137, "epoch": 0.04399841803440775, "mean_token_accuracy": 0.7788617610931396, "num_tokens": 2324573.0, "step": 445, "train/ce_loss": 0.9956402778625488 }, { "epoch": 0.04399841803440775, "step": 445, "train/sim_loss": 0.125 }, { "epoch": 0.04399841803440775, "step": 445, "train/total_loss": 0.22456403076648712 }, { "entropy": 9.297388076782227, "epoch": 0.04409729088392327, "mean_token_accuracy": 0.7089151740074158, "num_tokens": 2329978.0, "step": 446, "train/ce_loss": 0.643767774105072 }, { "epoch": 0.04409729088392327, "step": 446, "train/sim_loss": 0.078125 }, { "epoch": 0.04409729088392327, "step": 446, "train/total_loss": 0.14250177145004272 }, { "entropy": 9.491058349609375, "epoch": 0.0441961637334388, "mean_token_accuracy": 0.7719298005104065, "num_tokens": 2335112.0, "step": 447, "train/ce_loss": 1.470329999923706 }, { "epoch": 0.0441961637334388, "step": 447, "train/sim_loss": 0.140625 }, { "epoch": 0.0441961637334388, "step": 447, "train/total_loss": 0.2876580059528351 }, { "entropy": 9.639659881591797, "epoch": 0.04429503658295432, "mean_token_accuracy": 0.7009345889091492, "num_tokens": 2340175.0, "step": 448, "train/ce_loss": 0.07066857069730759 }, { "epoch": 0.04429503658295432, "step": 448, "train/sim_loss": 0.08203125 }, { "epoch": 0.04429503658295432, "step": 448, "train/total_loss": 0.08909811079502106 }, { "entropy": 9.312711715698242, "epoch": 0.04439390943246985, "mean_token_accuracy": 0.6821120977401733, "num_tokens": 2345527.0, "step": 449, "train/ce_loss": 1.3275420665740967 }, { "epoch": 0.04439390943246985, "step": 449, "train/sim_loss": 0.10546875 }, { "epoch": 0.04439390943246985, "step": 449, "train/total_loss": 0.23822295665740967 }, { "entropy": 9.274275779724121, "epoch": 0.04449278228198537, "mean_token_accuracy": 0.7433920502662659, "num_tokens": 2350889.0, "step": 450, "train/ce_loss": 0.801036536693573 }, { "epoch": 0.04449278228198537, "step": 450, "train/sim_loss": 0.0859375 }, { "epoch": 0.04449278228198537, "step": 450, "train/total_loss": 0.16604116559028625 }, { "entropy": 9.944259643554688, "epoch": 0.04459165513150089, "mean_token_accuracy": 0.7213114500045776, "num_tokens": 2355951.0, "step": 451, "train/ce_loss": 1.1396982669830322 }, { "epoch": 0.04459165513150089, "step": 451, "train/sim_loss": 0.15234375 }, { "epoch": 0.04459165513150089, "step": 451, "train/total_loss": 0.2663135826587677 }, { "entropy": 9.422792434692383, "epoch": 0.044690527981016415, "mean_token_accuracy": 0.7263888716697693, "num_tokens": 2361143.0, "step": 452, "train/ce_loss": 0.7414805889129639 }, { "epoch": 0.044690527981016415, "step": 452, "train/sim_loss": 0.16015625 }, { "epoch": 0.044690527981016415, "step": 452, "train/total_loss": 0.2343043088912964 }, { "entropy": 9.042624473571777, "epoch": 0.044789400830531935, "mean_token_accuracy": 0.6950430870056152, "num_tokens": 2366528.0, "step": 453, "train/ce_loss": 1.3365452289581299 }, { "epoch": 0.044789400830531935, "step": 453, "train/sim_loss": 0.1796875 }, { "epoch": 0.044789400830531935, "step": 453, "train/total_loss": 0.31334203481674194 }, { "entropy": 9.04693603515625, "epoch": 0.04488827368004746, "mean_token_accuracy": 0.6670415997505188, "num_tokens": 2371927.0, "step": 454, "train/ce_loss": 0.7594133019447327 }, { "epoch": 0.04488827368004746, "step": 454, "train/sim_loss": 0.15234375 }, { "epoch": 0.04488827368004746, "step": 454, "train/total_loss": 0.2282850742340088 }, { "entropy": 9.376440048217773, "epoch": 0.04498714652956298, "mean_token_accuracy": 0.667117714881897, "num_tokens": 2377192.0, "step": 455, "train/ce_loss": 0.8834444880485535 }, { "epoch": 0.04498714652956298, "step": 455, "train/sim_loss": 0.11328125 }, { "epoch": 0.04498714652956298, "step": 455, "train/total_loss": 0.20162570476531982 }, { "entropy": 9.437824249267578, "epoch": 0.0450860193790785, "mean_token_accuracy": 0.7165775299072266, "num_tokens": 2382435.0, "step": 456, "train/ce_loss": 0.956652820110321 }, { "epoch": 0.0450860193790785, "step": 456, "train/sim_loss": 0.19921875 }, { "epoch": 0.0450860193790785, "step": 456, "train/total_loss": 0.2948840260505676 }, { "entropy": 9.413591384887695, "epoch": 0.04518489222859403, "mean_token_accuracy": 0.7445520758628845, "num_tokens": 2387728.0, "step": 457, "train/ce_loss": 0.6375630497932434 }, { "epoch": 0.04518489222859403, "step": 457, "train/sim_loss": 0.0546875 }, { "epoch": 0.04518489222859403, "step": 457, "train/total_loss": 0.1184438094496727 }, { "entropy": 9.387490272521973, "epoch": 0.04528376507810955, "mean_token_accuracy": 0.7684478163719177, "num_tokens": 2392969.0, "step": 458, "train/ce_loss": 0.643700897693634 }, { "epoch": 0.04528376507810955, "step": 458, "train/sim_loss": 0.140625 }, { "epoch": 0.04528376507810955, "step": 458, "train/total_loss": 0.20499509572982788 }, { "entropy": 9.37730598449707, "epoch": 0.04538263792762508, "mean_token_accuracy": 0.7394366264343262, "num_tokens": 2398500.0, "step": 459, "train/ce_loss": 0.8843984007835388 }, { "epoch": 0.04538263792762508, "step": 459, "train/sim_loss": 0.12890625 }, { "epoch": 0.04538263792762508, "step": 459, "train/total_loss": 0.21734610199928284 }, { "epoch": 0.0454815107771406, "grad_norm": 1.35333251953125, "learning_rate": 9.88898778618405e-06, "loss": 0.2168, "step": 460 }, { "entropy": 9.22109317779541, "epoch": 0.0454815107771406, "mean_token_accuracy": 0.7326968908309937, "num_tokens": 2403843.0, "step": 460, "train/ce_loss": 1.2081056833267212 }, { "epoch": 0.0454815107771406, "step": 460, "train/sim_loss": 0.1015625 }, { "epoch": 0.0454815107771406, "step": 460, "train/total_loss": 0.22237306833267212 }, { "entropy": 9.541427612304688, "epoch": 0.04558038362665612, "mean_token_accuracy": 0.7227214574813843, "num_tokens": 2409094.0, "step": 461, "train/ce_loss": 1.2430551052093506 }, { "epoch": 0.04558038362665612, "step": 461, "train/sim_loss": 0.1640625 }, { "epoch": 0.04558038362665612, "step": 461, "train/total_loss": 0.28836801648139954 }, { "entropy": 8.860454559326172, "epoch": 0.045679256476171645, "mean_token_accuracy": 0.7073863744735718, "num_tokens": 2414702.0, "step": 462, "train/ce_loss": 1.036400318145752 }, { "epoch": 0.045679256476171645, "step": 462, "train/sim_loss": 0.125 }, { "epoch": 0.045679256476171645, "step": 462, "train/total_loss": 0.22864003479480743 }, { "entropy": 9.177661895751953, "epoch": 0.045778129325687165, "mean_token_accuracy": 0.7801822423934937, "num_tokens": 2420073.0, "step": 463, "train/ce_loss": 1.0977107286453247 }, { "epoch": 0.045778129325687165, "step": 463, "train/sim_loss": 0.12890625 }, { "epoch": 0.045778129325687165, "step": 463, "train/total_loss": 0.23867732286453247 }, { "entropy": 9.636924743652344, "epoch": 0.04587700217520269, "mean_token_accuracy": 0.7410179376602173, "num_tokens": 2425227.0, "step": 464, "train/ce_loss": 0.7735141515731812 }, { "epoch": 0.04587700217520269, "step": 464, "train/sim_loss": 0.12109375 }, { "epoch": 0.04587700217520269, "step": 464, "train/total_loss": 0.1984451711177826 }, { "entropy": 10.003236770629883, "epoch": 0.04597587502471821, "mean_token_accuracy": 0.7064220309257507, "num_tokens": 2430139.0, "step": 465, "train/ce_loss": 0.0809466615319252 }, { "epoch": 0.04597587502471821, "step": 465, "train/sim_loss": 0.05859375 }, { "epoch": 0.04597587502471821, "step": 465, "train/total_loss": 0.0666884183883667 }, { "entropy": 9.566146850585938, "epoch": 0.04607474787423373, "mean_token_accuracy": 0.7546699643135071, "num_tokens": 2435381.0, "step": 466, "train/ce_loss": 0.9219437837600708 }, { "epoch": 0.04607474787423373, "step": 466, "train/sim_loss": 0.13671875 }, { "epoch": 0.04607474787423373, "step": 466, "train/total_loss": 0.22891312837600708 }, { "entropy": 9.887555122375488, "epoch": 0.04617362072374926, "mean_token_accuracy": 0.801996648311615, "num_tokens": 2440437.0, "step": 467, "train/ce_loss": 0.07591411471366882 }, { "epoch": 0.04617362072374926, "step": 467, "train/sim_loss": 0.1875 }, { "epoch": 0.04617362072374926, "step": 467, "train/total_loss": 0.19509141147136688 }, { "entropy": 9.326565742492676, "epoch": 0.04627249357326478, "mean_token_accuracy": 0.7552836537361145, "num_tokens": 2445733.0, "step": 468, "train/ce_loss": 1.1807743310928345 }, { "epoch": 0.04627249357326478, "step": 468, "train/sim_loss": 0.109375 }, { "epoch": 0.04627249357326478, "step": 468, "train/total_loss": 0.22745242714881897 }, { "entropy": 9.14306640625, "epoch": 0.04637136642278031, "mean_token_accuracy": 0.7111356258392334, "num_tokens": 2451137.0, "step": 469, "train/ce_loss": 1.2142341136932373 }, { "epoch": 0.04637136642278031, "step": 469, "train/sim_loss": 0.17578125 }, { "epoch": 0.04637136642278031, "step": 469, "train/total_loss": 0.2972046732902527 }, { "entropy": 9.252565383911133, "epoch": 0.04647023927229583, "mean_token_accuracy": 0.6867052316665649, "num_tokens": 2456435.0, "step": 470, "train/ce_loss": 1.1048601865768433 }, { "epoch": 0.04647023927229583, "step": 470, "train/sim_loss": 0.1953125 }, { "epoch": 0.04647023927229583, "step": 470, "train/total_loss": 0.3057985305786133 }, { "entropy": 9.195484161376953, "epoch": 0.04656911212181135, "mean_token_accuracy": 0.7412935495376587, "num_tokens": 2461679.0, "step": 471, "train/ce_loss": 0.7700126767158508 }, { "epoch": 0.04656911212181135, "step": 471, "train/sim_loss": 0.1015625 }, { "epoch": 0.04656911212181135, "step": 471, "train/total_loss": 0.17856377363204956 }, { "entropy": 9.602058410644531, "epoch": 0.046667984971326876, "mean_token_accuracy": 0.7310252785682678, "num_tokens": 2467055.0, "step": 472, "train/ce_loss": 1.0718854665756226 }, { "epoch": 0.046667984971326876, "step": 472, "train/sim_loss": 0.1171875 }, { "epoch": 0.046667984971326876, "step": 472, "train/total_loss": 0.22437605261802673 }, { "entropy": 9.021293640136719, "epoch": 0.046766857820842396, "mean_token_accuracy": 0.7156549692153931, "num_tokens": 2472511.0, "step": 473, "train/ce_loss": 1.4495042562484741 }, { "epoch": 0.046766857820842396, "step": 473, "train/sim_loss": 0.14453125 }, { "epoch": 0.046766857820842396, "step": 473, "train/total_loss": 0.2894816994667053 }, { "entropy": 10.21648120880127, "epoch": 0.046865730670357916, "mean_token_accuracy": 0.729468584060669, "num_tokens": 2477402.0, "step": 474, "train/ce_loss": 0.11186335980892181 }, { "epoch": 0.046865730670357916, "step": 474, "train/sim_loss": 0.1015625 }, { "epoch": 0.046865730670357916, "step": 474, "train/total_loss": 0.11274883896112442 }, { "entropy": 10.212539672851562, "epoch": 0.046964603519873444, "mean_token_accuracy": 0.6848635077476501, "num_tokens": 2482244.0, "step": 475, "train/ce_loss": 2.5846259593963623 }, { "epoch": 0.046964603519873444, "step": 475, "train/sim_loss": 0.10546875 }, { "epoch": 0.046964603519873444, "step": 475, "train/total_loss": 0.3639313578605652 }, { "entropy": 9.680295944213867, "epoch": 0.047063476369388964, "mean_token_accuracy": 0.7712329030036926, "num_tokens": 2487419.0, "step": 476, "train/ce_loss": 1.2396368980407715 }, { "epoch": 0.047063476369388964, "step": 476, "train/sim_loss": 0.125 }, { "epoch": 0.047063476369388964, "step": 476, "train/total_loss": 0.24896368384361267 }, { "entropy": 9.602041244506836, "epoch": 0.04716234921890449, "mean_token_accuracy": 0.6979591846466064, "num_tokens": 2492585.0, "step": 477, "train/ce_loss": 1.0866364240646362 }, { "epoch": 0.04716234921890449, "step": 477, "train/sim_loss": 0.1328125 }, { "epoch": 0.04716234921890449, "step": 477, "train/total_loss": 0.2414761483669281 }, { "entropy": 9.360517501831055, "epoch": 0.04726122206842001, "mean_token_accuracy": 0.746760904788971, "num_tokens": 2497904.0, "step": 478, "train/ce_loss": 0.7504639029502869 }, { "epoch": 0.04726122206842001, "step": 478, "train/sim_loss": 0.0546875 }, { "epoch": 0.04726122206842001, "step": 478, "train/total_loss": 0.1297338902950287 }, { "entropy": 9.118058204650879, "epoch": 0.04736009491793553, "mean_token_accuracy": 0.7364264726638794, "num_tokens": 2503342.0, "step": 479, "train/ce_loss": 0.8444780707359314 }, { "epoch": 0.04736009491793553, "step": 479, "train/sim_loss": 0.1171875 }, { "epoch": 0.04736009491793553, "step": 479, "train/total_loss": 0.20163530111312866 }, { "epoch": 0.04745896776745106, "grad_norm": 1.105547308921814, "learning_rate": 9.8840429214261e-06, "loss": 0.2074, "step": 480 }, { "entropy": 9.979605674743652, "epoch": 0.04745896776745106, "mean_token_accuracy": 0.77173912525177, "num_tokens": 2508154.0, "step": 480, "train/ce_loss": 1.515071988105774 }, { "epoch": 0.04745896776745106, "step": 480, "train/sim_loss": 0.05078125 }, { "epoch": 0.04745896776745106, "step": 480, "train/total_loss": 0.2022884488105774 }, { "entropy": 9.231979370117188, "epoch": 0.04755784061696658, "mean_token_accuracy": 0.7763440608978271, "num_tokens": 2513512.0, "step": 481, "train/ce_loss": 0.8422334790229797 }, { "epoch": 0.04755784061696658, "step": 481, "train/sim_loss": 0.05078125 }, { "epoch": 0.04755784061696658, "step": 481, "train/total_loss": 0.13500460982322693 }, { "entropy": 10.054229736328125, "epoch": 0.047656713466482106, "mean_token_accuracy": 0.748106062412262, "num_tokens": 2518458.0, "step": 482, "train/ce_loss": 1.097081184387207 }, { "epoch": 0.047656713466482106, "step": 482, "train/sim_loss": 0.078125 }, { "epoch": 0.047656713466482106, "step": 482, "train/total_loss": 0.18783313035964966 }, { "entropy": 9.745567321777344, "epoch": 0.04775558631599763, "mean_token_accuracy": 0.6627907156944275, "num_tokens": 2523459.0, "step": 483, "train/ce_loss": 1.2584025859832764 }, { "epoch": 0.04775558631599763, "step": 483, "train/sim_loss": 0.0546875 }, { "epoch": 0.04775558631599763, "step": 483, "train/total_loss": 0.18052776157855988 }, { "entropy": 10.35242748260498, "epoch": 0.04785445916551315, "mean_token_accuracy": 0.7480719685554504, "num_tokens": 2528257.0, "step": 484, "train/ce_loss": 2.1718273162841797 }, { "epoch": 0.04785445916551315, "step": 484, "train/sim_loss": 0.09375 }, { "epoch": 0.04785445916551315, "step": 484, "train/total_loss": 0.3109327554702759 }, { "entropy": 9.667442321777344, "epoch": 0.047953332015028674, "mean_token_accuracy": 0.7165242433547974, "num_tokens": 2533434.0, "step": 485, "train/ce_loss": 1.0756824016571045 }, { "epoch": 0.047953332015028674, "step": 485, "train/sim_loss": 0.171875 }, { "epoch": 0.047953332015028674, "step": 485, "train/total_loss": 0.27944323420524597 }, { "entropy": 9.372124671936035, "epoch": 0.048052204864544194, "mean_token_accuracy": 0.744508683681488, "num_tokens": 2538761.0, "step": 486, "train/ce_loss": 1.0770900249481201 }, { "epoch": 0.048052204864544194, "step": 486, "train/sim_loss": 0.125 }, { "epoch": 0.048052204864544194, "step": 486, "train/total_loss": 0.23270900547504425 }, { "entropy": 9.491572380065918, "epoch": 0.04815107771405972, "mean_token_accuracy": 0.7078787684440613, "num_tokens": 2544017.0, "step": 487, "train/ce_loss": 0.5395680069923401 }, { "epoch": 0.04815107771405972, "step": 487, "train/sim_loss": 0.109375 }, { "epoch": 0.04815107771405972, "step": 487, "train/total_loss": 0.1633318066596985 }, { "entropy": 8.881023406982422, "epoch": 0.04824995056357524, "mean_token_accuracy": 0.680672287940979, "num_tokens": 2549587.0, "step": 488, "train/ce_loss": 1.331809163093567 }, { "epoch": 0.04824995056357524, "step": 488, "train/sim_loss": 0.12109375 }, { "epoch": 0.04824995056357524, "step": 488, "train/total_loss": 0.2542746663093567 }, { "entropy": 9.483545303344727, "epoch": 0.04834882341309076, "mean_token_accuracy": 0.7666666507720947, "num_tokens": 2554754.0, "step": 489, "train/ce_loss": 0.055041637271642685 }, { "epoch": 0.04834882341309076, "step": 489, "train/sim_loss": 0.125 }, { "epoch": 0.04834882341309076, "step": 489, "train/total_loss": 0.13050416111946106 }, { "entropy": 9.063933372497559, "epoch": 0.04844769626260629, "mean_token_accuracy": 0.6822529435157776, "num_tokens": 2560222.0, "step": 490, "train/ce_loss": 1.1069523096084595 }, { "epoch": 0.04844769626260629, "step": 490, "train/sim_loss": 0.1640625 }, { "epoch": 0.04844769626260629, "step": 490, "train/total_loss": 0.2747577428817749 }, { "entropy": 9.450166702270508, "epoch": 0.04854656911212181, "mean_token_accuracy": 0.7581775784492493, "num_tokens": 2565512.0, "step": 491, "train/ce_loss": 1.0669002532958984 }, { "epoch": 0.04854656911212181, "step": 491, "train/sim_loss": 0.08203125 }, { "epoch": 0.04854656911212181, "step": 491, "train/total_loss": 0.18872126936912537 }, { "entropy": 9.656312942504883, "epoch": 0.04864544196163734, "mean_token_accuracy": 0.6796992421150208, "num_tokens": 2570648.0, "step": 492, "train/ce_loss": 1.0691183805465698 }, { "epoch": 0.04864544196163734, "step": 492, "train/sim_loss": 0.1015625 }, { "epoch": 0.04864544196163734, "step": 492, "train/total_loss": 0.20847433805465698 }, { "entropy": 9.513638496398926, "epoch": 0.04874431481115286, "mean_token_accuracy": 0.7193675637245178, "num_tokens": 2575927.0, "step": 493, "train/ce_loss": 0.8813692331314087 }, { "epoch": 0.04874431481115286, "step": 493, "train/sim_loss": 0.12109375 }, { "epoch": 0.04874431481115286, "step": 493, "train/total_loss": 0.2092306762933731 }, { "entropy": 9.012145042419434, "epoch": 0.04884318766066838, "mean_token_accuracy": 0.7292340993881226, "num_tokens": 2581273.0, "step": 494, "train/ce_loss": 0.643973708152771 }, { "epoch": 0.04884318766066838, "step": 494, "train/sim_loss": 0.05859375 }, { "epoch": 0.04884318766066838, "step": 494, "train/total_loss": 0.12299112230539322 }, { "entropy": 9.455644607543945, "epoch": 0.048942060510183905, "mean_token_accuracy": 0.6707482933998108, "num_tokens": 2586482.0, "step": 495, "train/ce_loss": 1.2485941648483276 }, { "epoch": 0.048942060510183905, "step": 495, "train/sim_loss": 0.1171875 }, { "epoch": 0.048942060510183905, "step": 495, "train/total_loss": 0.24204692244529724 }, { "entropy": 9.610002517700195, "epoch": 0.049040933359699425, "mean_token_accuracy": 0.7510373592376709, "num_tokens": 2591696.0, "step": 496, "train/ce_loss": 0.8027147650718689 }, { "epoch": 0.049040933359699425, "step": 496, "train/sim_loss": 0.0546875 }, { "epoch": 0.049040933359699425, "step": 496, "train/total_loss": 0.13495898246765137 }, { "entropy": 10.199856758117676, "epoch": 0.04913980620921495, "mean_token_accuracy": 0.6554054021835327, "num_tokens": 2596533.0, "step": 497, "train/ce_loss": 0.09949162602424622 }, { "epoch": 0.04913980620921495, "step": 497, "train/sim_loss": 0.171875 }, { "epoch": 0.04913980620921495, "step": 497, "train/total_loss": 0.18182416260242462 }, { "entropy": 9.396942138671875, "epoch": 0.04923867905873047, "mean_token_accuracy": 0.6440251469612122, "num_tokens": 2601827.0, "step": 498, "train/ce_loss": 0.9401248097419739 }, { "epoch": 0.04923867905873047, "step": 498, "train/sim_loss": 0.08984375 }, { "epoch": 0.04923867905873047, "step": 498, "train/total_loss": 0.18385623395442963 }, { "entropy": 9.42705249786377, "epoch": 0.04933755190824599, "mean_token_accuracy": 0.6958277225494385, "num_tokens": 2607043.0, "step": 499, "train/ce_loss": 1.1366710662841797 }, { "epoch": 0.04933755190824599, "step": 499, "train/sim_loss": 0.0625 }, { "epoch": 0.04933755190824599, "step": 499, "train/total_loss": 0.1761671006679535 }, { "epoch": 0.04943642475776152, "grad_norm": 1.5242397785186768, "learning_rate": 9.87909805666815e-06, "loss": 0.2099, "step": 500 }, { "entropy": 9.39438247680664, "epoch": 0.04943642475776152, "mean_token_accuracy": 0.6682521104812622, "num_tokens": 2612307.0, "step": 500, "train/ce_loss": 1.0083014965057373 }, { "epoch": 0.04943642475776152, "step": 500, "train/sim_loss": 0.0859375 }, { "epoch": 0.04943642475776152, "step": 500, "train/total_loss": 0.18676765263080597 }, { "entropy": 9.212362289428711, "epoch": 0.04953529760727704, "mean_token_accuracy": 0.7669584155082703, "num_tokens": 2617734.0, "step": 501, "train/ce_loss": 0.5354273915290833 }, { "epoch": 0.04953529760727704, "step": 501, "train/sim_loss": 0.10546875 }, { "epoch": 0.04953529760727704, "step": 501, "train/total_loss": 0.15901148319244385 }, { "entropy": 9.426509857177734, "epoch": 0.04963417045679257, "mean_token_accuracy": 0.7060890197753906, "num_tokens": 2623043.0, "step": 502, "train/ce_loss": 2.2972195148468018 }, { "epoch": 0.04963417045679257, "step": 502, "train/sim_loss": 0.3046875 }, { "epoch": 0.04963417045679257, "step": 502, "train/total_loss": 0.5344094634056091 }, { "entropy": 9.411643981933594, "epoch": 0.04973304330630809, "mean_token_accuracy": 0.7471839785575867, "num_tokens": 2628337.0, "step": 503, "train/ce_loss": 0.5850355625152588 }, { "epoch": 0.04973304330630809, "step": 503, "train/sim_loss": 0.046875 }, { "epoch": 0.04973304330630809, "step": 503, "train/total_loss": 0.10537855327129364 }, { "entropy": 9.113931655883789, "epoch": 0.04983191615582361, "mean_token_accuracy": 0.748400866985321, "num_tokens": 2633762.0, "step": 504, "train/ce_loss": 0.8196927905082703 }, { "epoch": 0.04983191615582361, "step": 504, "train/sim_loss": 0.09765625 }, { "epoch": 0.04983191615582361, "step": 504, "train/total_loss": 0.17962554097175598 }, { "entropy": 9.683710098266602, "epoch": 0.049930789005339135, "mean_token_accuracy": 0.7558479309082031, "num_tokens": 2638905.0, "step": 505, "train/ce_loss": 1.0427159070968628 }, { "epoch": 0.049930789005339135, "step": 505, "train/sim_loss": 0.109375 }, { "epoch": 0.049930789005339135, "step": 505, "train/total_loss": 0.21364659070968628 }, { "entropy": 9.304154396057129, "epoch": 0.050029661854854655, "mean_token_accuracy": 0.7181817889213562, "num_tokens": 2644217.0, "step": 506, "train/ce_loss": 0.917572557926178 }, { "epoch": 0.050029661854854655, "step": 506, "train/sim_loss": 0.05859375 }, { "epoch": 0.050029661854854655, "step": 506, "train/total_loss": 0.15035101771354675 }, { "entropy": 9.906225204467773, "epoch": 0.05012853470437018, "mean_token_accuracy": 0.754749596118927, "num_tokens": 2649187.0, "step": 507, "train/ce_loss": 1.5441699028015137 }, { "epoch": 0.05012853470437018, "step": 507, "train/sim_loss": 0.16015625 }, { "epoch": 0.05012853470437018, "step": 507, "train/total_loss": 0.3145732283592224 }, { "entropy": 10.101706504821777, "epoch": 0.0502274075538857, "mean_token_accuracy": 0.7184466123580933, "num_tokens": 2653966.0, "step": 508, "train/ce_loss": 0.10800564289093018 }, { "epoch": 0.0502274075538857, "step": 508, "train/sim_loss": 0.0625 }, { "epoch": 0.0502274075538857, "step": 508, "train/total_loss": 0.0733005627989769 }, { "entropy": 9.47825813293457, "epoch": 0.05032628040340122, "mean_token_accuracy": 0.7415143847465515, "num_tokens": 2659143.0, "step": 509, "train/ce_loss": 0.05864207446575165 }, { "epoch": 0.05032628040340122, "step": 509, "train/sim_loss": 0.07421875 }, { "epoch": 0.05032628040340122, "step": 509, "train/total_loss": 0.0800829604268074 }, { "entropy": 9.604631423950195, "epoch": 0.05042515325291675, "mean_token_accuracy": 0.6968838572502136, "num_tokens": 2664325.0, "step": 510, "train/ce_loss": 0.48100969195365906 }, { "epoch": 0.05042515325291675, "step": 510, "train/sim_loss": 0.1015625 }, { "epoch": 0.05042515325291675, "step": 510, "train/total_loss": 0.14966347813606262 }, { "entropy": 8.994229316711426, "epoch": 0.05052402610243227, "mean_token_accuracy": 0.6837257146835327, "num_tokens": 2669802.0, "step": 511, "train/ce_loss": 1.3078932762145996 }, { "epoch": 0.05052402610243227, "step": 511, "train/sim_loss": 0.1875 }, { "epoch": 0.05052402610243227, "step": 511, "train/total_loss": 0.3182893395423889 }, { "entropy": 9.30910587310791, "epoch": 0.0506228989519478, "mean_token_accuracy": 0.68727707862854, "num_tokens": 2675131.0, "step": 512, "train/ce_loss": 1.0803829431533813 }, { "epoch": 0.0506228989519478, "step": 512, "train/sim_loss": 0.08203125 }, { "epoch": 0.0506228989519478, "step": 512, "train/total_loss": 0.1900695562362671 }, { "entropy": 9.34923267364502, "epoch": 0.05072177180146332, "mean_token_accuracy": 0.7068965435028076, "num_tokens": 2680306.0, "step": 513, "train/ce_loss": 0.8655760288238525 }, { "epoch": 0.05072177180146332, "step": 513, "train/sim_loss": 0.11328125 }, { "epoch": 0.05072177180146332, "step": 513, "train/total_loss": 0.19983884692192078 }, { "entropy": 9.359048843383789, "epoch": 0.05082064465097884, "mean_token_accuracy": 0.7277227640151978, "num_tokens": 2685562.0, "step": 514, "train/ce_loss": 0.9464130401611328 }, { "epoch": 0.05082064465097884, "step": 514, "train/sim_loss": 0.05078125 }, { "epoch": 0.05082064465097884, "step": 514, "train/total_loss": 0.1454225480556488 }, { "entropy": 9.247976303100586, "epoch": 0.050919517500494366, "mean_token_accuracy": 0.7032474875450134, "num_tokens": 2690926.0, "step": 515, "train/ce_loss": 0.6532353162765503 }, { "epoch": 0.050919517500494366, "step": 515, "train/sim_loss": 0.14453125 }, { "epoch": 0.050919517500494366, "step": 515, "train/total_loss": 0.20985478162765503 }, { "entropy": 9.181419372558594, "epoch": 0.051018390350009886, "mean_token_accuracy": 0.7311608791351318, "num_tokens": 2696367.0, "step": 516, "train/ce_loss": 1.1315739154815674 }, { "epoch": 0.051018390350009886, "step": 516, "train/sim_loss": 0.109375 }, { "epoch": 0.051018390350009886, "step": 516, "train/total_loss": 0.22253239154815674 }, { "entropy": 9.452371597290039, "epoch": 0.05111726319952541, "mean_token_accuracy": 0.8046242594718933, "num_tokens": 2701822.0, "step": 517, "train/ce_loss": 0.04842658340930939 }, { "epoch": 0.05111726319952541, "step": 517, "train/sim_loss": 0.109375 }, { "epoch": 0.05111726319952541, "step": 517, "train/total_loss": 0.11421766132116318 }, { "entropy": 9.96130084991455, "epoch": 0.05121613604904093, "mean_token_accuracy": 0.7293497323989868, "num_tokens": 2706837.0, "step": 518, "train/ce_loss": 1.3989778757095337 }, { "epoch": 0.05121613604904093, "step": 518, "train/sim_loss": 0.109375 }, { "epoch": 0.05121613604904093, "step": 518, "train/total_loss": 0.24927279353141785 }, { "entropy": 9.840981483459473, "epoch": 0.051315008898556454, "mean_token_accuracy": 0.7715654969215393, "num_tokens": 2711892.0, "step": 519, "train/ce_loss": 0.07052932679653168 }, { "epoch": 0.051315008898556454, "step": 519, "train/sim_loss": 0.125 }, { "epoch": 0.051315008898556454, "step": 519, "train/total_loss": 0.1320529282093048 }, { "epoch": 0.05141388174807198, "grad_norm": 1.1278705596923828, "learning_rate": 9.874153191910203e-06, "loss": 0.1989, "step": 520 }, { "entropy": 10.010367393493652, "epoch": 0.05141388174807198, "mean_token_accuracy": 0.695652186870575, "num_tokens": 2716867.0, "step": 520, "train/ce_loss": 1.4642590284347534 }, { "epoch": 0.05141388174807198, "step": 520, "train/sim_loss": 0.046875 }, { "epoch": 0.05141388174807198, "step": 520, "train/total_loss": 0.19330090284347534 }, { "entropy": 9.677255630493164, "epoch": 0.0515127545975875, "mean_token_accuracy": 0.7307132482528687, "num_tokens": 2722025.0, "step": 521, "train/ce_loss": 0.8722675442695618 }, { "epoch": 0.0515127545975875, "step": 521, "train/sim_loss": 0.16796875 }, { "epoch": 0.0515127545975875, "step": 521, "train/total_loss": 0.2551954984664917 }, { "entropy": 10.041577339172363, "epoch": 0.05161162744710303, "mean_token_accuracy": 0.7870370149612427, "num_tokens": 2726902.0, "step": 522, "train/ce_loss": 1.583846092224121 }, { "epoch": 0.05161162744710303, "step": 522, "train/sim_loss": 0.08984375 }, { "epoch": 0.05161162744710303, "step": 522, "train/total_loss": 0.24822835624217987 }, { "entropy": 9.485618591308594, "epoch": 0.05171050029661855, "mean_token_accuracy": 0.6848484873771667, "num_tokens": 2732201.0, "step": 523, "train/ce_loss": 1.1445075273513794 }, { "epoch": 0.05171050029661855, "step": 523, "train/sim_loss": 0.11328125 }, { "epoch": 0.05171050029661855, "step": 523, "train/total_loss": 0.22773200273513794 }, { "entropy": 9.58382797241211, "epoch": 0.05180937314613407, "mean_token_accuracy": 0.7578796744346619, "num_tokens": 2737328.0, "step": 524, "train/ce_loss": 0.06139937788248062 }, { "epoch": 0.05180937314613407, "step": 524, "train/sim_loss": 0.046875 }, { "epoch": 0.05180937314613407, "step": 524, "train/total_loss": 0.05301493778824806 }, { "entropy": 9.250560760498047, "epoch": 0.051908245995649596, "mean_token_accuracy": 0.6807563900947571, "num_tokens": 2742615.0, "step": 525, "train/ce_loss": 0.8151491284370422 }, { "epoch": 0.051908245995649596, "step": 525, "train/sim_loss": 0.0703125 }, { "epoch": 0.051908245995649596, "step": 525, "train/total_loss": 0.15182742476463318 }, { "entropy": 9.5101318359375, "epoch": 0.052007118845165116, "mean_token_accuracy": 0.7257484793663025, "num_tokens": 2748071.0, "step": 526, "train/ce_loss": 0.8393839597702026 }, { "epoch": 0.052007118845165116, "step": 526, "train/sim_loss": 0.07421875 }, { "epoch": 0.052007118845165116, "step": 526, "train/total_loss": 0.1581571400165558 }, { "entropy": 9.4252347946167, "epoch": 0.052105991694680644, "mean_token_accuracy": 0.743658185005188, "num_tokens": 2753307.0, "step": 527, "train/ce_loss": 0.6566883325576782 }, { "epoch": 0.052105991694680644, "step": 527, "train/sim_loss": 0.09765625 }, { "epoch": 0.052105991694680644, "step": 527, "train/total_loss": 0.16332508623600006 }, { "entropy": 9.265995979309082, "epoch": 0.052204864544196164, "mean_token_accuracy": 0.741360068321228, "num_tokens": 2758632.0, "step": 528, "train/ce_loss": 0.4899671971797943 }, { "epoch": 0.052204864544196164, "step": 528, "train/sim_loss": 0.1171875 }, { "epoch": 0.052204864544196164, "step": 528, "train/total_loss": 0.1661842167377472 }, { "entropy": 9.533434867858887, "epoch": 0.052303737393711684, "mean_token_accuracy": 0.6966145634651184, "num_tokens": 2763831.0, "step": 529, "train/ce_loss": 0.7207930684089661 }, { "epoch": 0.052303737393711684, "step": 529, "train/sim_loss": 0.171875 }, { "epoch": 0.052303737393711684, "step": 529, "train/total_loss": 0.24395430088043213 }, { "entropy": 8.904180526733398, "epoch": 0.05240261024322721, "mean_token_accuracy": 0.739051103591919, "num_tokens": 2769420.0, "step": 530, "train/ce_loss": 0.6011399030685425 }, { "epoch": 0.05240261024322721, "step": 530, "train/sim_loss": 0.046875 }, { "epoch": 0.05240261024322721, "step": 530, "train/total_loss": 0.10698899626731873 }, { "entropy": 9.722763061523438, "epoch": 0.05250148309274273, "mean_token_accuracy": 0.7542504072189331, "num_tokens": 2774496.0, "step": 531, "train/ce_loss": 1.466348648071289 }, { "epoch": 0.05250148309274273, "step": 531, "train/sim_loss": 0.08984375 }, { "epoch": 0.05250148309274273, "step": 531, "train/total_loss": 0.23647861182689667 }, { "entropy": 9.23657512664795, "epoch": 0.05260035594225826, "mean_token_accuracy": 0.7497048377990723, "num_tokens": 2779765.0, "step": 532, "train/ce_loss": 1.1639745235443115 }, { "epoch": 0.05260035594225826, "step": 532, "train/sim_loss": 0.109375 }, { "epoch": 0.05260035594225826, "step": 532, "train/total_loss": 0.2257724553346634 }, { "entropy": 9.56563663482666, "epoch": 0.05269922879177378, "mean_token_accuracy": 0.7853403091430664, "num_tokens": 2784999.0, "step": 533, "train/ce_loss": 0.7111084461212158 }, { "epoch": 0.05269922879177378, "step": 533, "train/sim_loss": 0.12890625 }, { "epoch": 0.05269922879177378, "step": 533, "train/total_loss": 0.20001709461212158 }, { "entropy": 9.463350296020508, "epoch": 0.0527981016412893, "mean_token_accuracy": 0.6979695558547974, "num_tokens": 2790274.0, "step": 534, "train/ce_loss": 0.8471155166625977 }, { "epoch": 0.0527981016412893, "step": 534, "train/sim_loss": 0.08203125 }, { "epoch": 0.0527981016412893, "step": 534, "train/total_loss": 0.16674280166625977 }, { "entropy": 10.123266220092773, "epoch": 0.05289697449080483, "mean_token_accuracy": 0.7316017150878906, "num_tokens": 2795156.0, "step": 535, "train/ce_loss": 1.0770008563995361 }, { "epoch": 0.05289697449080483, "step": 535, "train/sim_loss": 0.046875 }, { "epoch": 0.05289697449080483, "step": 535, "train/total_loss": 0.15457507967948914 }, { "entropy": 10.131725311279297, "epoch": 0.05299584734032035, "mean_token_accuracy": 0.7266514897346497, "num_tokens": 2800024.0, "step": 536, "train/ce_loss": 0.10147576034069061 }, { "epoch": 0.05299584734032035, "step": 536, "train/sim_loss": 0.09765625 }, { "epoch": 0.05299584734032035, "step": 536, "train/total_loss": 0.1078038290143013 }, { "entropy": 9.347770690917969, "epoch": 0.053094720189835874, "mean_token_accuracy": 0.6723237633705139, "num_tokens": 2805230.0, "step": 537, "train/ce_loss": 0.8107247948646545 }, { "epoch": 0.053094720189835874, "step": 537, "train/sim_loss": 0.1328125 }, { "epoch": 0.053094720189835874, "step": 537, "train/total_loss": 0.21388497948646545 }, { "entropy": 9.418624877929688, "epoch": 0.053193593039351394, "mean_token_accuracy": 0.7631224989891052, "num_tokens": 2810488.0, "step": 538, "train/ce_loss": 0.7745002508163452 }, { "epoch": 0.053193593039351394, "step": 538, "train/sim_loss": 0.13671875 }, { "epoch": 0.053193593039351394, "step": 538, "train/total_loss": 0.21416878700256348 }, { "entropy": 9.554218292236328, "epoch": 0.053292465888866915, "mean_token_accuracy": 0.719298243522644, "num_tokens": 2815640.0, "step": 539, "train/ce_loss": 1.0348252058029175 }, { "epoch": 0.053292465888866915, "step": 539, "train/sim_loss": 0.09765625 }, { "epoch": 0.053292465888866915, "step": 539, "train/total_loss": 0.20113876461982727 }, { "epoch": 0.05339133873838244, "grad_norm": 1.2608516216278076, "learning_rate": 9.869208327152253e-06, "loss": 0.1932, "step": 540 }, { "entropy": 9.439250946044922, "epoch": 0.05339133873838244, "mean_token_accuracy": 0.6946264505386353, "num_tokens": 2820919.0, "step": 540, "train/ce_loss": 1.2679036855697632 }, { "epoch": 0.05339133873838244, "step": 540, "train/sim_loss": 0.06640625 }, { "epoch": 0.05339133873838244, "step": 540, "train/total_loss": 0.1931966245174408 }, { "entropy": 9.167460441589355, "epoch": 0.05349021158789796, "mean_token_accuracy": 0.7379454970359802, "num_tokens": 2826540.0, "step": 541, "train/ce_loss": 0.576278030872345 }, { "epoch": 0.05349021158789796, "step": 541, "train/sim_loss": 0.1484375 }, { "epoch": 0.05349021158789796, "step": 541, "train/total_loss": 0.20606529712677002 }, { "entropy": 9.229609489440918, "epoch": 0.05358908443741349, "mean_token_accuracy": 0.805038332939148, "num_tokens": 2831934.0, "step": 542, "train/ce_loss": 0.5699486136436462 }, { "epoch": 0.05358908443741349, "step": 542, "train/sim_loss": 0.0546875 }, { "epoch": 0.05358908443741349, "step": 542, "train/total_loss": 0.11168236285448074 }, { "entropy": 8.880252838134766, "epoch": 0.05368795728692901, "mean_token_accuracy": 0.765531063079834, "num_tokens": 2837431.0, "step": 543, "train/ce_loss": 0.5533732175827026 }, { "epoch": 0.05368795728692901, "step": 543, "train/sim_loss": 0.0546875 }, { "epoch": 0.05368795728692901, "step": 543, "train/total_loss": 0.1100248247385025 }, { "entropy": 9.366069793701172, "epoch": 0.05378683013644453, "mean_token_accuracy": 0.6319176554679871, "num_tokens": 2842676.0, "step": 544, "train/ce_loss": 1.4185377359390259 }, { "epoch": 0.05378683013644453, "step": 544, "train/sim_loss": 0.09765625 }, { "epoch": 0.05378683013644453, "step": 544, "train/total_loss": 0.23951002955436707 }, { "entropy": 10.311344146728516, "epoch": 0.05388570298596006, "mean_token_accuracy": 0.7603305578231812, "num_tokens": 2847451.0, "step": 545, "train/ce_loss": 1.1562058925628662 }, { "epoch": 0.05388570298596006, "step": 545, "train/sim_loss": 0.265625 }, { "epoch": 0.05388570298596006, "step": 545, "train/total_loss": 0.38124558329582214 }, { "entropy": 9.588722229003906, "epoch": 0.05398457583547558, "mean_token_accuracy": 0.7671428322792053, "num_tokens": 2852698.0, "step": 546, "train/ce_loss": 0.7407620549201965 }, { "epoch": 0.05398457583547558, "step": 546, "train/sim_loss": 0.1171875 }, { "epoch": 0.05398457583547558, "step": 546, "train/total_loss": 0.19126370549201965 }, { "entropy": 9.780805587768555, "epoch": 0.054083448684991105, "mean_token_accuracy": 0.7216174006462097, "num_tokens": 2857813.0, "step": 547, "train/ce_loss": 0.6773979663848877 }, { "epoch": 0.054083448684991105, "step": 547, "train/sim_loss": 0.0546875 }, { "epoch": 0.054083448684991105, "step": 547, "train/total_loss": 0.12242729961872101 }, { "entropy": 9.329841613769531, "epoch": 0.054182321534506625, "mean_token_accuracy": 0.722453236579895, "num_tokens": 2863270.0, "step": 548, "train/ce_loss": 0.5865836143493652 }, { "epoch": 0.054182321534506625, "step": 548, "train/sim_loss": 0.09375 }, { "epoch": 0.054182321534506625, "step": 548, "train/total_loss": 0.15240836143493652 }, { "entropy": 9.572759628295898, "epoch": 0.054281194384022145, "mean_token_accuracy": 0.7383966445922852, "num_tokens": 2868525.0, "step": 549, "train/ce_loss": 1.0673495531082153 }, { "epoch": 0.054281194384022145, "step": 549, "train/sim_loss": 0.0625 }, { "epoch": 0.054281194384022145, "step": 549, "train/total_loss": 0.169234961271286 }, { "entropy": 9.241981506347656, "epoch": 0.05438006723353767, "mean_token_accuracy": 0.6978335380554199, "num_tokens": 2873852.0, "step": 550, "train/ce_loss": 0.8549608588218689 }, { "epoch": 0.05438006723353767, "step": 550, "train/sim_loss": 0.1015625 }, { "epoch": 0.05438006723353767, "step": 550, "train/total_loss": 0.18705859780311584 }, { "entropy": 9.983388900756836, "epoch": 0.05447894008305319, "mean_token_accuracy": 0.7796934843063354, "num_tokens": 2878819.0, "step": 551, "train/ce_loss": 0.6586815118789673 }, { "epoch": 0.05447894008305319, "step": 551, "train/sim_loss": 0.13671875 }, { "epoch": 0.05447894008305319, "step": 551, "train/total_loss": 0.20258690416812897 }, { "entropy": 9.14186954498291, "epoch": 0.05457781293256872, "mean_token_accuracy": 0.7487623691558838, "num_tokens": 2884128.0, "step": 552, "train/ce_loss": 0.9660906791687012 }, { "epoch": 0.05457781293256872, "step": 552, "train/sim_loss": 0.12109375 }, { "epoch": 0.05457781293256872, "step": 552, "train/total_loss": 0.21770282089710236 }, { "entropy": 10.010492324829102, "epoch": 0.05467668578208424, "mean_token_accuracy": 0.70652174949646, "num_tokens": 2889127.0, "step": 553, "train/ce_loss": 0.07915318757295609 }, { "epoch": 0.05467668578208424, "step": 553, "train/sim_loss": 0.08203125 }, { "epoch": 0.05467668578208424, "step": 553, "train/total_loss": 0.08994656801223755 }, { "entropy": 9.300575256347656, "epoch": 0.05477555863159976, "mean_token_accuracy": 0.7167630195617676, "num_tokens": 2894476.0, "step": 554, "train/ce_loss": 1.0042680501937866 }, { "epoch": 0.05477555863159976, "step": 554, "train/sim_loss": 0.078125 }, { "epoch": 0.05477555863159976, "step": 554, "train/total_loss": 0.1785518079996109 }, { "entropy": 9.623985290527344, "epoch": 0.05487443148111529, "mean_token_accuracy": 0.7852256894111633, "num_tokens": 2899852.0, "step": 555, "train/ce_loss": 1.194027304649353 }, { "epoch": 0.05487443148111529, "step": 555, "train/sim_loss": 0.19921875 }, { "epoch": 0.05487443148111529, "step": 555, "train/total_loss": 0.3186214864253998 }, { "entropy": 9.840734481811523, "epoch": 0.05497330433063081, "mean_token_accuracy": 0.7689822316169739, "num_tokens": 2904891.0, "step": 556, "train/ce_loss": 1.167907476425171 }, { "epoch": 0.05497330433063081, "step": 556, "train/sim_loss": 0.11328125 }, { "epoch": 0.05497330433063081, "step": 556, "train/total_loss": 0.2300719916820526 }, { "entropy": 9.579257011413574, "epoch": 0.055072177180146335, "mean_token_accuracy": 0.7628571391105652, "num_tokens": 2910033.0, "step": 557, "train/ce_loss": 0.9383928775787354 }, { "epoch": 0.055072177180146335, "step": 557, "train/sim_loss": 0.078125 }, { "epoch": 0.055072177180146335, "step": 557, "train/total_loss": 0.17196428775787354 }, { "entropy": 9.686721801757812, "epoch": 0.055171050029661856, "mean_token_accuracy": 0.7272727489471436, "num_tokens": 2915156.0, "step": 558, "train/ce_loss": 0.8214370608329773 }, { "epoch": 0.055171050029661856, "step": 558, "train/sim_loss": 0.1171875 }, { "epoch": 0.055171050029661856, "step": 558, "train/total_loss": 0.19933120906352997 }, { "entropy": 9.445137023925781, "epoch": 0.055269922879177376, "mean_token_accuracy": 0.7244501709938049, "num_tokens": 2920382.0, "step": 559, "train/ce_loss": 0.8776147961616516 }, { "epoch": 0.055269922879177376, "step": 559, "train/sim_loss": 0.12109375 }, { "epoch": 0.055269922879177376, "step": 559, "train/total_loss": 0.20885524153709412 }, { "epoch": 0.0553687957286929, "grad_norm": 1.3400676250457764, "learning_rate": 9.864263462394305e-06, "loss": 0.2003, "step": 560 }, { "entropy": 9.43281078338623, "epoch": 0.0553687957286929, "mean_token_accuracy": 0.7690140604972839, "num_tokens": 2925595.0, "step": 560, "train/ce_loss": 1.3493784666061401 }, { "epoch": 0.0553687957286929, "step": 560, "train/sim_loss": 0.12109375 }, { "epoch": 0.0553687957286929, "step": 560, "train/total_loss": 0.2560316026210785 }, { "entropy": 9.54813289642334, "epoch": 0.05546766857820842, "mean_token_accuracy": 0.7298091053962708, "num_tokens": 2930687.0, "step": 561, "train/ce_loss": 1.03697669506073 }, { "epoch": 0.05546766857820842, "step": 561, "train/sim_loss": 0.11328125 }, { "epoch": 0.05546766857820842, "step": 561, "train/total_loss": 0.21697892248630524 }, { "entropy": 9.249414443969727, "epoch": 0.055566541427723944, "mean_token_accuracy": 0.6990394592285156, "num_tokens": 2936080.0, "step": 562, "train/ce_loss": 1.2749476432800293 }, { "epoch": 0.055566541427723944, "step": 562, "train/sim_loss": 0.125 }, { "epoch": 0.055566541427723944, "step": 562, "train/total_loss": 0.252494752407074 }, { "entropy": 9.528446197509766, "epoch": 0.05566541427723947, "mean_token_accuracy": 0.7445721626281738, "num_tokens": 2941279.0, "step": 563, "train/ce_loss": 0.868265688419342 }, { "epoch": 0.05566541427723947, "step": 563, "train/sim_loss": 0.078125 }, { "epoch": 0.05566541427723947, "step": 563, "train/total_loss": 0.16495156288146973 }, { "entropy": 9.777507781982422, "epoch": 0.05576428712675499, "mean_token_accuracy": 0.7195325493812561, "num_tokens": 2946373.0, "step": 564, "train/ce_loss": 1.328007459640503 }, { "epoch": 0.05576428712675499, "step": 564, "train/sim_loss": 0.12109375 }, { "epoch": 0.05576428712675499, "step": 564, "train/total_loss": 0.25389450788497925 }, { "entropy": 9.187501907348633, "epoch": 0.05586315997627052, "mean_token_accuracy": 0.6438775658607483, "num_tokens": 2951840.0, "step": 565, "train/ce_loss": 1.264036774635315 }, { "epoch": 0.05586315997627052, "step": 565, "train/sim_loss": 0.1171875 }, { "epoch": 0.05586315997627052, "step": 565, "train/total_loss": 0.24359117448329926 }, { "entropy": 9.460844039916992, "epoch": 0.05596203282578604, "mean_token_accuracy": 0.7052767276763916, "num_tokens": 2957077.0, "step": 566, "train/ce_loss": 0.8064752221107483 }, { "epoch": 0.05596203282578604, "step": 566, "train/sim_loss": 0.12109375 }, { "epoch": 0.05596203282578604, "step": 566, "train/total_loss": 0.2017412781715393 }, { "entropy": 9.599678993225098, "epoch": 0.05606090567530156, "mean_token_accuracy": 0.8073529601097107, "num_tokens": 2962264.0, "step": 567, "train/ce_loss": 0.8236745595932007 }, { "epoch": 0.05606090567530156, "step": 567, "train/sim_loss": 0.09375 }, { "epoch": 0.05606090567530156, "step": 567, "train/total_loss": 0.1761174499988556 }, { "entropy": 10.204925537109375, "epoch": 0.056159778524817086, "mean_token_accuracy": 0.6845637559890747, "num_tokens": 2967118.0, "step": 568, "train/ce_loss": 0.09642869979143143 }, { "epoch": 0.056159778524817086, "step": 568, "train/sim_loss": 0.0859375 }, { "epoch": 0.056159778524817086, "step": 568, "train/total_loss": 0.09558036923408508 }, { "entropy": 9.058152198791504, "epoch": 0.056258651374332606, "mean_token_accuracy": 0.7917485237121582, "num_tokens": 2972615.0, "step": 569, "train/ce_loss": 0.4953159689903259 }, { "epoch": 0.056258651374332606, "step": 569, "train/sim_loss": 0.078125 }, { "epoch": 0.056258651374332606, "step": 569, "train/total_loss": 0.12765659391880035 }, { "entropy": 9.651445388793945, "epoch": 0.056357524223848134, "mean_token_accuracy": 0.6978021860122681, "num_tokens": 2977773.0, "step": 570, "train/ce_loss": 1.5411763191223145 }, { "epoch": 0.056357524223848134, "step": 570, "train/sim_loss": 0.1171875 }, { "epoch": 0.056357524223848134, "step": 570, "train/total_loss": 0.2713051438331604 }, { "entropy": 9.766046524047852, "epoch": 0.056456397073363654, "mean_token_accuracy": 0.7850467562675476, "num_tokens": 2982852.0, "step": 571, "train/ce_loss": 0.638634979724884 }, { "epoch": 0.056456397073363654, "step": 571, "train/sim_loss": 0.0859375 }, { "epoch": 0.056456397073363654, "step": 571, "train/total_loss": 0.14980100095272064 }, { "entropy": 9.760435104370117, "epoch": 0.056555269922879174, "mean_token_accuracy": 0.7678571343421936, "num_tokens": 2988004.0, "step": 572, "train/ce_loss": 0.059007056057453156 }, { "epoch": 0.056555269922879174, "step": 572, "train/sim_loss": 0.07421875 }, { "epoch": 0.056555269922879174, "step": 572, "train/total_loss": 0.08011945337057114 }, { "entropy": 8.902538299560547, "epoch": 0.0566541427723947, "mean_token_accuracy": 0.7229129672050476, "num_tokens": 2993614.0, "step": 573, "train/ce_loss": 0.6874699592590332 }, { "epoch": 0.0566541427723947, "step": 573, "train/sim_loss": 0.078125 }, { "epoch": 0.0566541427723947, "step": 573, "train/total_loss": 0.14687199890613556 }, { "entropy": 10.377233505249023, "epoch": 0.05675301562191022, "mean_token_accuracy": 0.7002881765365601, "num_tokens": 2998359.0, "step": 574, "train/ce_loss": 2.1008381843566895 }, { "epoch": 0.05675301562191022, "step": 574, "train/sim_loss": 0.09765625 }, { "epoch": 0.05675301562191022, "step": 574, "train/total_loss": 0.30774009227752686 }, { "entropy": 9.222179412841797, "epoch": 0.05685188847142575, "mean_token_accuracy": 0.6544342637062073, "num_tokens": 3003840.0, "step": 575, "train/ce_loss": 0.9479613304138184 }, { "epoch": 0.05685188847142575, "step": 575, "train/sim_loss": 0.1328125 }, { "epoch": 0.05685188847142575, "step": 575, "train/total_loss": 0.22760863602161407 }, { "entropy": 9.846975326538086, "epoch": 0.05695076132094127, "mean_token_accuracy": 0.7049742937088013, "num_tokens": 3008886.0, "step": 576, "train/ce_loss": 0.8759105205535889 }, { "epoch": 0.05695076132094127, "step": 576, "train/sim_loss": 0.09765625 }, { "epoch": 0.05695076132094127, "step": 576, "train/total_loss": 0.1852473020553589 }, { "entropy": 9.408296585083008, "epoch": 0.05704963417045679, "mean_token_accuracy": 0.7602040767669678, "num_tokens": 3014182.0, "step": 577, "train/ce_loss": 1.0684754848480225 }, { "epoch": 0.05704963417045679, "step": 577, "train/sim_loss": 0.06640625 }, { "epoch": 0.05704963417045679, "step": 577, "train/total_loss": 0.17325380444526672 }, { "entropy": 9.4006986618042, "epoch": 0.05714850701997232, "mean_token_accuracy": 0.7496976852416992, "num_tokens": 3019471.0, "step": 578, "train/ce_loss": 0.5649070739746094 }, { "epoch": 0.05714850701997232, "step": 578, "train/sim_loss": 0.08984375 }, { "epoch": 0.05714850701997232, "step": 578, "train/total_loss": 0.1463344544172287 }, { "entropy": 9.148704528808594, "epoch": 0.05724737986948784, "mean_token_accuracy": 0.6821345686912537, "num_tokens": 3024797.0, "step": 579, "train/ce_loss": 1.0922415256500244 }, { "epoch": 0.05724737986948784, "step": 579, "train/sim_loss": 0.11328125 }, { "epoch": 0.05724737986948784, "step": 579, "train/total_loss": 0.22250540554523468 }, { "epoch": 0.057346252719003364, "grad_norm": 1.7009624242782593, "learning_rate": 9.859318597636356e-06, "loss": 0.1971, "step": 580 }, { "entropy": 9.603752136230469, "epoch": 0.057346252719003364, "mean_token_accuracy": 0.7529761791229248, "num_tokens": 3029923.0, "step": 580, "train/ce_loss": 0.06308668851852417 }, { "epoch": 0.057346252719003364, "step": 580, "train/sim_loss": 0.07421875 }, { "epoch": 0.057346252719003364, "step": 580, "train/total_loss": 0.0805274173617363 }, { "entropy": 9.038854598999023, "epoch": 0.057445125568518884, "mean_token_accuracy": 0.7690557241439819, "num_tokens": 3035294.0, "step": 581, "train/ce_loss": 0.8573618531227112 }, { "epoch": 0.057445125568518884, "step": 581, "train/sim_loss": 0.09765625 }, { "epoch": 0.057445125568518884, "step": 581, "train/total_loss": 0.18339243531227112 }, { "entropy": 9.591069221496582, "epoch": 0.057543998418034405, "mean_token_accuracy": 0.7294429540634155, "num_tokens": 3040500.0, "step": 582, "train/ce_loss": 0.6902552247047424 }, { "epoch": 0.057543998418034405, "step": 582, "train/sim_loss": 0.1875 }, { "epoch": 0.057543998418034405, "step": 582, "train/total_loss": 0.25652551651000977 }, { "entropy": 9.707898139953613, "epoch": 0.05764287126754993, "mean_token_accuracy": 0.6551265120506287, "num_tokens": 3045672.0, "step": 583, "train/ce_loss": 0.05402826890349388 }, { "epoch": 0.05764287126754993, "step": 583, "train/sim_loss": 0.09375 }, { "epoch": 0.05764287126754993, "step": 583, "train/total_loss": 0.0991528257727623 }, { "entropy": 9.415910720825195, "epoch": 0.05774174411706545, "mean_token_accuracy": 0.7229064106941223, "num_tokens": 3050888.0, "step": 584, "train/ce_loss": 0.9903222918510437 }, { "epoch": 0.05774174411706545, "step": 584, "train/sim_loss": 0.05859375 }, { "epoch": 0.05774174411706545, "step": 584, "train/total_loss": 0.1576259732246399 }, { "entropy": 9.809968948364258, "epoch": 0.05784061696658098, "mean_token_accuracy": 0.7357414364814758, "num_tokens": 3055823.0, "step": 585, "train/ce_loss": 1.3496983051300049 }, { "epoch": 0.05784061696658098, "step": 585, "train/sim_loss": 0.10546875 }, { "epoch": 0.05784061696658098, "step": 585, "train/total_loss": 0.2404385805130005 }, { "entropy": 8.99539566040039, "epoch": 0.0579394898160965, "mean_token_accuracy": 0.8194444179534912, "num_tokens": 3061280.0, "step": 586, "train/ce_loss": 0.6846917271614075 }, { "epoch": 0.0579394898160965, "step": 586, "train/sim_loss": 0.0546875 }, { "epoch": 0.0579394898160965, "step": 586, "train/total_loss": 0.12315667420625687 }, { "entropy": 9.648222923278809, "epoch": 0.05803836266561202, "mean_token_accuracy": 0.6480938196182251, "num_tokens": 3066535.0, "step": 587, "train/ce_loss": 1.0551172494888306 }, { "epoch": 0.05803836266561202, "step": 587, "train/sim_loss": 0.14453125 }, { "epoch": 0.05803836266561202, "step": 587, "train/total_loss": 0.25004297494888306 }, { "entropy": 9.561786651611328, "epoch": 0.05813723551512755, "mean_token_accuracy": 0.7098844647407532, "num_tokens": 3071709.0, "step": 588, "train/ce_loss": 1.5621851682662964 }, { "epoch": 0.05813723551512755, "step": 588, "train/sim_loss": 0.08984375 }, { "epoch": 0.05813723551512755, "step": 588, "train/total_loss": 0.2460622638463974 }, { "entropy": 9.043785095214844, "epoch": 0.05823610836464307, "mean_token_accuracy": 0.7582089304924011, "num_tokens": 3077207.0, "step": 589, "train/ce_loss": 0.7210156321525574 }, { "epoch": 0.05823610836464307, "step": 589, "train/sim_loss": 0.03125 }, { "epoch": 0.05823610836464307, "step": 589, "train/total_loss": 0.10335156321525574 }, { "entropy": 9.510514259338379, "epoch": 0.058334981214158595, "mean_token_accuracy": 0.7247340679168701, "num_tokens": 3082407.0, "step": 590, "train/ce_loss": 1.260585904121399 }, { "epoch": 0.058334981214158595, "step": 590, "train/sim_loss": 0.12890625 }, { "epoch": 0.058334981214158595, "step": 590, "train/total_loss": 0.25496482849121094 }, { "entropy": 9.443502426147461, "epoch": 0.058433854063674115, "mean_token_accuracy": 0.7330729365348816, "num_tokens": 3087607.0, "step": 591, "train/ce_loss": 1.3981369733810425 }, { "epoch": 0.058433854063674115, "step": 591, "train/sim_loss": 0.1171875 }, { "epoch": 0.058433854063674115, "step": 591, "train/total_loss": 0.25700122117996216 }, { "entropy": 9.518030166625977, "epoch": 0.058532726913189635, "mean_token_accuracy": 0.7631579041481018, "num_tokens": 3092856.0, "step": 592, "train/ce_loss": 1.1735337972640991 }, { "epoch": 0.058532726913189635, "step": 592, "train/sim_loss": 0.0859375 }, { "epoch": 0.058532726913189635, "step": 592, "train/total_loss": 0.2032908797264099 }, { "entropy": 9.151599884033203, "epoch": 0.05863159976270516, "mean_token_accuracy": 0.7040951251983643, "num_tokens": 3098148.0, "step": 593, "train/ce_loss": 1.2240796089172363 }, { "epoch": 0.05863159976270516, "step": 593, "train/sim_loss": 0.1015625 }, { "epoch": 0.05863159976270516, "step": 593, "train/total_loss": 0.2239704728126526 }, { "entropy": 10.255637168884277, "epoch": 0.05873047261222068, "mean_token_accuracy": 0.6727688908576965, "num_tokens": 3103189.0, "step": 594, "train/ce_loss": 0.10033336281776428 }, { "epoch": 0.05873047261222068, "step": 594, "train/sim_loss": 0.1171875 }, { "epoch": 0.05873047261222068, "step": 594, "train/total_loss": 0.12722083926200867 }, { "entropy": 9.687835693359375, "epoch": 0.05882934546173621, "mean_token_accuracy": 0.760188102722168, "num_tokens": 3108264.0, "step": 595, "train/ce_loss": 0.7539442777633667 }, { "epoch": 0.05882934546173621, "step": 595, "train/sim_loss": 0.09375 }, { "epoch": 0.05882934546173621, "step": 595, "train/total_loss": 0.1691444218158722 }, { "entropy": 9.354427337646484, "epoch": 0.05892821831125173, "mean_token_accuracy": 0.6974595785140991, "num_tokens": 3113589.0, "step": 596, "train/ce_loss": 0.5949295163154602 }, { "epoch": 0.05892821831125173, "step": 596, "train/sim_loss": 0.08984375 }, { "epoch": 0.05892821831125173, "step": 596, "train/total_loss": 0.14933669567108154 }, { "entropy": 9.385342597961426, "epoch": 0.05902709116076725, "mean_token_accuracy": 0.7252747416496277, "num_tokens": 3118832.0, "step": 597, "train/ce_loss": 0.048359014093875885 }, { "epoch": 0.05902709116076725, "step": 597, "train/sim_loss": 0.125 }, { "epoch": 0.05902709116076725, "step": 597, "train/total_loss": 0.12983590364456177 }, { "entropy": 9.755470275878906, "epoch": 0.05912596401028278, "mean_token_accuracy": 0.6884498596191406, "num_tokens": 3123936.0, "step": 598, "train/ce_loss": 0.06409426033496857 }, { "epoch": 0.05912596401028278, "step": 598, "train/sim_loss": 0.0703125 }, { "epoch": 0.05912596401028278, "step": 598, "train/total_loss": 0.0767219290137291 }, { "entropy": 8.945388793945312, "epoch": 0.0592248368597983, "mean_token_accuracy": 0.726396918296814, "num_tokens": 3129554.0, "step": 599, "train/ce_loss": 0.8508357405662537 }, { "epoch": 0.0592248368597983, "step": 599, "train/sim_loss": 0.10546875 }, { "epoch": 0.0592248368597983, "step": 599, "train/total_loss": 0.19055232405662537 }, { "epoch": 0.059323709709313825, "grad_norm": 1.3134665489196777, "learning_rate": 9.854373732878406e-06, "loss": 0.1913, "step": 600 }, { "entropy": 9.647882461547852, "epoch": 0.059323709709313825, "mean_token_accuracy": 0.7123098373413086, "num_tokens": 3134698.0, "step": 600, "train/ce_loss": 0.7676689624786377 }, { "epoch": 0.059323709709313825, "step": 600, "train/sim_loss": 0.08203125 }, { "epoch": 0.059323709709313825, "step": 600, "train/total_loss": 0.15879815816879272 }, { "entropy": 9.90464973449707, "epoch": 0.059422582558829345, "mean_token_accuracy": 0.739130437374115, "num_tokens": 3139674.0, "step": 601, "train/ce_loss": 0.9781233668327332 }, { "epoch": 0.059422582558829345, "step": 601, "train/sim_loss": 0.125 }, { "epoch": 0.059422582558829345, "step": 601, "train/total_loss": 0.22281233966350555 }, { "entropy": 9.9501953125, "epoch": 0.059521455408344866, "mean_token_accuracy": 0.776442289352417, "num_tokens": 3144516.0, "step": 602, "train/ce_loss": 0.10536352545022964 }, { "epoch": 0.059521455408344866, "step": 602, "train/sim_loss": 0.05859375 }, { "epoch": 0.059521455408344866, "step": 602, "train/total_loss": 0.06913010030984879 }, { "entropy": 9.114591598510742, "epoch": 0.05962032825786039, "mean_token_accuracy": 0.7934537529945374, "num_tokens": 3149858.0, "step": 603, "train/ce_loss": 0.8752561807632446 }, { "epoch": 0.05962032825786039, "step": 603, "train/sim_loss": 0.1015625 }, { "epoch": 0.05962032825786039, "step": 603, "train/total_loss": 0.1890881210565567 }, { "entropy": 9.432883262634277, "epoch": 0.05971920110737591, "mean_token_accuracy": 0.7468706369400024, "num_tokens": 3154999.0, "step": 604, "train/ce_loss": 1.2306396961212158 }, { "epoch": 0.05971920110737591, "step": 604, "train/sim_loss": 0.08203125 }, { "epoch": 0.05971920110737591, "step": 604, "train/total_loss": 0.20509523153305054 }, { "entropy": 9.856866836547852, "epoch": 0.05981807395689144, "mean_token_accuracy": 0.6504347920417786, "num_tokens": 3160014.0, "step": 605, "train/ce_loss": 1.4992445707321167 }, { "epoch": 0.05981807395689144, "step": 605, "train/sim_loss": 0.125 }, { "epoch": 0.05981807395689144, "step": 605, "train/total_loss": 0.27492445707321167 }, { "entropy": 9.179306030273438, "epoch": 0.05991694680640696, "mean_token_accuracy": 0.7101293206214905, "num_tokens": 3165444.0, "step": 606, "train/ce_loss": 1.657849907875061 }, { "epoch": 0.05991694680640696, "step": 606, "train/sim_loss": 0.1328125 }, { "epoch": 0.05991694680640696, "step": 606, "train/total_loss": 0.298597514629364 }, { "entropy": 9.692015647888184, "epoch": 0.06001581965592248, "mean_token_accuracy": 0.7776243090629578, "num_tokens": 3170762.0, "step": 607, "train/ce_loss": 0.6523178815841675 }, { "epoch": 0.06001581965592248, "step": 607, "train/sim_loss": 0.078125 }, { "epoch": 0.06001581965592248, "step": 607, "train/total_loss": 0.1433568000793457 }, { "entropy": 9.208277702331543, "epoch": 0.06011469250543801, "mean_token_accuracy": 0.7749999761581421, "num_tokens": 3176133.0, "step": 608, "train/ce_loss": 0.6562087535858154 }, { "epoch": 0.06011469250543801, "step": 608, "train/sim_loss": 0.12890625 }, { "epoch": 0.06011469250543801, "step": 608, "train/total_loss": 0.19452711939811707 }, { "entropy": 9.781991958618164, "epoch": 0.06021356535495353, "mean_token_accuracy": 0.7298938035964966, "num_tokens": 3181214.0, "step": 609, "train/ce_loss": 0.7615407109260559 }, { "epoch": 0.06021356535495353, "step": 609, "train/sim_loss": 0.0625 }, { "epoch": 0.06021356535495353, "step": 609, "train/total_loss": 0.13865408301353455 }, { "entropy": 9.374441146850586, "epoch": 0.060312438204469056, "mean_token_accuracy": 0.7690476179122925, "num_tokens": 3186517.0, "step": 610, "train/ce_loss": 0.7123885154724121 }, { "epoch": 0.060312438204469056, "step": 610, "train/sim_loss": 0.109375 }, { "epoch": 0.060312438204469056, "step": 610, "train/total_loss": 0.18061384558677673 }, { "entropy": 9.316181182861328, "epoch": 0.060411311053984576, "mean_token_accuracy": 0.6707317233085632, "num_tokens": 3191845.0, "step": 611, "train/ce_loss": 0.6949711441993713 }, { "epoch": 0.060411311053984576, "step": 611, "train/sim_loss": 0.109375 }, { "epoch": 0.060411311053984576, "step": 611, "train/total_loss": 0.17887210845947266 }, { "entropy": 9.724642753601074, "epoch": 0.060510183903500096, "mean_token_accuracy": 0.759878396987915, "num_tokens": 3196946.0, "step": 612, "train/ce_loss": 0.5086504817008972 }, { "epoch": 0.060510183903500096, "step": 612, "train/sim_loss": 0.08203125 }, { "epoch": 0.060510183903500096, "step": 612, "train/total_loss": 0.1328963041305542 }, { "entropy": 9.029947280883789, "epoch": 0.060609056753015624, "mean_token_accuracy": 0.7461773753166199, "num_tokens": 3202457.0, "step": 613, "train/ce_loss": 0.4887118637561798 }, { "epoch": 0.060609056753015624, "step": 613, "train/sim_loss": 0.1328125 }, { "epoch": 0.060609056753015624, "step": 613, "train/total_loss": 0.18168368935585022 }, { "entropy": 9.9183931350708, "epoch": 0.060707929602531144, "mean_token_accuracy": 0.7015177011489868, "num_tokens": 3207503.0, "step": 614, "train/ce_loss": 1.7783939838409424 }, { "epoch": 0.060707929602531144, "step": 614, "train/sim_loss": 0.109375 }, { "epoch": 0.060707929602531144, "step": 614, "train/total_loss": 0.28721439838409424 }, { "entropy": 9.622869491577148, "epoch": 0.06080680245204667, "mean_token_accuracy": 0.7181409001350403, "num_tokens": 3212625.0, "step": 615, "train/ce_loss": 1.2805120944976807 }, { "epoch": 0.06080680245204667, "step": 615, "train/sim_loss": 0.10546875 }, { "epoch": 0.06080680245204667, "step": 615, "train/total_loss": 0.23351995646953583 }, { "entropy": 10.772820472717285, "epoch": 0.06090567530156219, "mean_token_accuracy": 0.7873563170433044, "num_tokens": 3217180.0, "step": 616, "train/ce_loss": 0.2533998489379883 }, { "epoch": 0.06090567530156219, "step": 616, "train/sim_loss": 0.06640625 }, { "epoch": 0.06090567530156219, "step": 616, "train/total_loss": 0.09174623340368271 }, { "entropy": 9.706140518188477, "epoch": 0.06100454815107771, "mean_token_accuracy": 0.7152974605560303, "num_tokens": 3222383.0, "step": 617, "train/ce_loss": 0.9342068433761597 }, { "epoch": 0.06100454815107771, "step": 617, "train/sim_loss": 0.11328125 }, { "epoch": 0.06100454815107771, "step": 617, "train/total_loss": 0.20670193433761597 }, { "entropy": 9.423571586608887, "epoch": 0.06110342100059324, "mean_token_accuracy": 0.6854742169380188, "num_tokens": 3227588.0, "step": 618, "train/ce_loss": 1.3230915069580078 }, { "epoch": 0.06110342100059324, "step": 618, "train/sim_loss": 0.109375 }, { "epoch": 0.06110342100059324, "step": 618, "train/total_loss": 0.24168415367603302 }, { "entropy": 9.546838760375977, "epoch": 0.06120229385010876, "mean_token_accuracy": 0.7903226017951965, "num_tokens": 3232774.0, "step": 619, "train/ce_loss": 0.7212303876876831 }, { "epoch": 0.06120229385010876, "step": 619, "train/sim_loss": 0.046875 }, { "epoch": 0.06120229385010876, "step": 619, "train/total_loss": 0.11899804323911667 }, { "epoch": 0.061301166699624286, "grad_norm": 1.0468522310256958, "learning_rate": 9.849428868120457e-06, "loss": 0.1921, "step": 620 }, { "entropy": 9.641387939453125, "epoch": 0.061301166699624286, "mean_token_accuracy": 0.7740525007247925, "num_tokens": 3237908.0, "step": 620, "train/ce_loss": 1.0195034742355347 }, { "epoch": 0.061301166699624286, "step": 620, "train/sim_loss": 0.046875 }, { "epoch": 0.061301166699624286, "step": 620, "train/total_loss": 0.14882534742355347 }, { "entropy": 9.968416213989258, "epoch": 0.06140003954913981, "mean_token_accuracy": 0.7218309640884399, "num_tokens": 3242863.0, "step": 621, "train/ce_loss": 1.2666391134262085 }, { "epoch": 0.06140003954913981, "step": 621, "train/sim_loss": 0.109375 }, { "epoch": 0.06140003954913981, "step": 621, "train/total_loss": 0.2360389083623886 }, { "entropy": 9.490911483764648, "epoch": 0.06149891239865533, "mean_token_accuracy": 0.6958277225494385, "num_tokens": 3248076.0, "step": 622, "train/ce_loss": 1.2347162961959839 }, { "epoch": 0.06149891239865533, "step": 622, "train/sim_loss": 0.125 }, { "epoch": 0.06149891239865533, "step": 622, "train/total_loss": 0.24847163259983063 }, { "entropy": 9.121641159057617, "epoch": 0.061597785248170854, "mean_token_accuracy": 0.7781609296798706, "num_tokens": 3253474.0, "step": 623, "train/ce_loss": 0.7475561499595642 }, { "epoch": 0.061597785248170854, "step": 623, "train/sim_loss": 0.1015625 }, { "epoch": 0.061597785248170854, "step": 623, "train/total_loss": 0.17631810903549194 }, { "entropy": 9.5985746383667, "epoch": 0.061696658097686374, "mean_token_accuracy": 0.7353723645210266, "num_tokens": 3258670.0, "step": 624, "train/ce_loss": 0.7435654997825623 }, { "epoch": 0.061696658097686374, "step": 624, "train/sim_loss": 0.0390625 }, { "epoch": 0.061696658097686374, "step": 624, "train/total_loss": 0.1134190484881401 }, { "entropy": 9.410194396972656, "epoch": 0.0617955309472019, "mean_token_accuracy": 0.7748690843582153, "num_tokens": 3263910.0, "step": 625, "train/ce_loss": 0.6336197257041931 }, { "epoch": 0.0617955309472019, "step": 625, "train/sim_loss": 0.12109375 }, { "epoch": 0.0617955309472019, "step": 625, "train/total_loss": 0.1844557225704193 }, { "entropy": 9.588516235351562, "epoch": 0.06189440379671742, "mean_token_accuracy": 0.7860139608383179, "num_tokens": 3269075.0, "step": 626, "train/ce_loss": 0.060945674777030945 }, { "epoch": 0.06189440379671742, "step": 626, "train/sim_loss": 0.05859375 }, { "epoch": 0.06189440379671742, "step": 626, "train/total_loss": 0.0646883174777031 }, { "entropy": 10.053567886352539, "epoch": 0.06199327664623294, "mean_token_accuracy": 0.7411273717880249, "num_tokens": 3273928.0, "step": 627, "train/ce_loss": 1.2324460744857788 }, { "epoch": 0.06199327664623294, "step": 627, "train/sim_loss": 0.10546875 }, { "epoch": 0.06199327664623294, "step": 627, "train/total_loss": 0.22871336340904236 }, { "entropy": 9.442203521728516, "epoch": 0.06209214949574847, "mean_token_accuracy": 0.8010075688362122, "num_tokens": 3279180.0, "step": 628, "train/ce_loss": 0.6030117273330688 }, { "epoch": 0.06209214949574847, "step": 628, "train/sim_loss": 0.078125 }, { "epoch": 0.06209214949574847, "step": 628, "train/total_loss": 0.13842616975307465 }, { "entropy": 10.123472213745117, "epoch": 0.06219102234526399, "mean_token_accuracy": 0.7330595254898071, "num_tokens": 3284069.0, "step": 629, "train/ce_loss": 0.07958053052425385 }, { "epoch": 0.06219102234526399, "step": 629, "train/sim_loss": 0.08984375 }, { "epoch": 0.06219102234526399, "step": 629, "train/total_loss": 0.0978018045425415 }, { "entropy": 9.632625579833984, "epoch": 0.06228989519477952, "mean_token_accuracy": 0.7446808218955994, "num_tokens": 3289163.0, "step": 630, "train/ce_loss": 0.661207377910614 }, { "epoch": 0.06228989519477952, "step": 630, "train/sim_loss": 0.0625 }, { "epoch": 0.06228989519477952, "step": 630, "train/total_loss": 0.12862074375152588 }, { "entropy": 9.37982177734375, "epoch": 0.06238876804429504, "mean_token_accuracy": 0.6502857208251953, "num_tokens": 3294500.0, "step": 631, "train/ce_loss": 1.2884637117385864 }, { "epoch": 0.06238876804429504, "step": 631, "train/sim_loss": 0.13671875 }, { "epoch": 0.06238876804429504, "step": 631, "train/total_loss": 0.2655651271343231 }, { "entropy": 10.025651931762695, "epoch": 0.06248764089381056, "mean_token_accuracy": 0.8018691539764404, "num_tokens": 3299447.0, "step": 632, "train/ce_loss": 0.0742001160979271 }, { "epoch": 0.06248764089381056, "step": 632, "train/sim_loss": 0.05078125 }, { "epoch": 0.06248764089381056, "step": 632, "train/total_loss": 0.05820126086473465 }, { "entropy": 9.40864372253418, "epoch": 0.06258651374332608, "mean_token_accuracy": 0.7171581983566284, "num_tokens": 3304648.0, "step": 633, "train/ce_loss": 0.8080118894577026 }, { "epoch": 0.06258651374332608, "step": 633, "train/sim_loss": 0.09375 }, { "epoch": 0.06258651374332608, "step": 633, "train/total_loss": 0.17455118894577026 }, { "entropy": 9.281893730163574, "epoch": 0.0626853865928416, "mean_token_accuracy": 0.688524603843689, "num_tokens": 3309914.0, "step": 634, "train/ce_loss": 1.0388758182525635 }, { "epoch": 0.0626853865928416, "step": 634, "train/sim_loss": 0.14453125 }, { "epoch": 0.0626853865928416, "step": 634, "train/total_loss": 0.24841883778572083 }, { "entropy": 9.273179054260254, "epoch": 0.06278425944235713, "mean_token_accuracy": 0.7076923251152039, "num_tokens": 3315246.0, "step": 635, "train/ce_loss": 1.4154484272003174 }, { "epoch": 0.06278425944235713, "step": 635, "train/sim_loss": 0.125 }, { "epoch": 0.06278425944235713, "step": 635, "train/total_loss": 0.2665448486804962 }, { "entropy": 9.427886962890625, "epoch": 0.06288313229187265, "mean_token_accuracy": 0.8057553768157959, "num_tokens": 3320563.0, "step": 636, "train/ce_loss": 0.04886097088456154 }, { "epoch": 0.06288313229187265, "step": 636, "train/sim_loss": 0.09375 }, { "epoch": 0.06288313229187265, "step": 636, "train/total_loss": 0.09863609820604324 }, { "entropy": 9.08419132232666, "epoch": 0.06298200514138817, "mean_token_accuracy": 0.7119438052177429, "num_tokens": 3325840.0, "step": 637, "train/ce_loss": 0.6452086567878723 }, { "epoch": 0.06298200514138817, "step": 637, "train/sim_loss": 0.078125 }, { "epoch": 0.06298200514138817, "step": 637, "train/total_loss": 0.14264586567878723 }, { "entropy": 9.62785816192627, "epoch": 0.0630808779909037, "mean_token_accuracy": 0.7223926186561584, "num_tokens": 3330918.0, "step": 638, "train/ce_loss": 0.8809434175491333 }, { "epoch": 0.0630808779909037, "step": 638, "train/sim_loss": 0.0625 }, { "epoch": 0.0630808779909037, "step": 638, "train/total_loss": 0.15059435367584229 }, { "entropy": 9.237200736999512, "epoch": 0.06317975084041923, "mean_token_accuracy": 0.6712734699249268, "num_tokens": 3336571.0, "step": 639, "train/ce_loss": 1.0680677890777588 }, { "epoch": 0.06317975084041923, "step": 639, "train/sim_loss": 0.09375 }, { "epoch": 0.06317975084041923, "step": 639, "train/total_loss": 0.20055678486824036 }, { "epoch": 0.06327862368993474, "grad_norm": 1.2951865196228027, "learning_rate": 9.844484003362509e-06, "loss": 0.1846, "step": 640 }, { "entropy": 9.697471618652344, "epoch": 0.06327862368993474, "mean_token_accuracy": 0.667117714881897, "num_tokens": 3341744.0, "step": 640, "train/ce_loss": 0.056046780198812485 }, { "epoch": 0.06327862368993474, "step": 640, "train/sim_loss": 0.109375 }, { "epoch": 0.06327862368993474, "step": 640, "train/total_loss": 0.11497967690229416 }, { "entropy": 9.747730255126953, "epoch": 0.06337749653945027, "mean_token_accuracy": 0.7149606347084045, "num_tokens": 3346815.0, "step": 641, "train/ce_loss": 0.9643349051475525 }, { "epoch": 0.06337749653945027, "step": 641, "train/sim_loss": 0.125 }, { "epoch": 0.06337749653945027, "step": 641, "train/total_loss": 0.22143349051475525 }, { "entropy": 9.916828155517578, "epoch": 0.0634763693889658, "mean_token_accuracy": 0.7209677696228027, "num_tokens": 3351870.0, "step": 642, "train/ce_loss": 1.0646116733551025 }, { "epoch": 0.0634763693889658, "step": 642, "train/sim_loss": 0.09765625 }, { "epoch": 0.0634763693889658, "step": 642, "train/total_loss": 0.20411741733551025 }, { "entropy": 9.16053581237793, "epoch": 0.06357524223848131, "mean_token_accuracy": 0.7201645970344543, "num_tokens": 3357323.0, "step": 643, "train/ce_loss": 1.1711736917495728 }, { "epoch": 0.06357524223848131, "step": 643, "train/sim_loss": 0.18359375 }, { "epoch": 0.06357524223848131, "step": 643, "train/total_loss": 0.30071112513542175 }, { "entropy": 9.730305671691895, "epoch": 0.06367411508799684, "mean_token_accuracy": 0.7410179376602173, "num_tokens": 3362499.0, "step": 644, "train/ce_loss": 0.6209876537322998 }, { "epoch": 0.06367411508799684, "step": 644, "train/sim_loss": 0.109375 }, { "epoch": 0.06367411508799684, "step": 644, "train/total_loss": 0.17147377133369446 }, { "entropy": 9.807441711425781, "epoch": 0.06377298793751236, "mean_token_accuracy": 0.7218155264854431, "num_tokens": 3367619.0, "step": 645, "train/ce_loss": 1.9182504415512085 }, { "epoch": 0.06377298793751236, "step": 645, "train/sim_loss": 0.12109375 }, { "epoch": 0.06377298793751236, "step": 645, "train/total_loss": 0.3129187822341919 }, { "entropy": 9.33283805847168, "epoch": 0.06387186078702788, "mean_token_accuracy": 0.7675804495811462, "num_tokens": 3372926.0, "step": 646, "train/ce_loss": 0.49201691150665283 }, { "epoch": 0.06387186078702788, "step": 646, "train/sim_loss": 0.0625 }, { "epoch": 0.06387186078702788, "step": 646, "train/total_loss": 0.11170169711112976 }, { "entropy": 9.105297088623047, "epoch": 0.0639707336365434, "mean_token_accuracy": 0.761529803276062, "num_tokens": 3378291.0, "step": 647, "train/ce_loss": 0.666023313999176 }, { "epoch": 0.0639707336365434, "step": 647, "train/sim_loss": 0.0546875 }, { "epoch": 0.0639707336365434, "step": 647, "train/total_loss": 0.12128983438014984 }, { "entropy": 9.271563529968262, "epoch": 0.06406960648605893, "mean_token_accuracy": 0.7352246046066284, "num_tokens": 3383646.0, "step": 648, "train/ce_loss": 0.9176062345504761 }, { "epoch": 0.06406960648605893, "step": 648, "train/sim_loss": 0.12109375 }, { "epoch": 0.06406960648605893, "step": 648, "train/total_loss": 0.21285438537597656 }, { "entropy": 9.301518440246582, "epoch": 0.06416847933557446, "mean_token_accuracy": 0.657549262046814, "num_tokens": 3389018.0, "step": 649, "train/ce_loss": 0.6130936145782471 }, { "epoch": 0.06416847933557446, "step": 649, "train/sim_loss": 0.125 }, { "epoch": 0.06416847933557446, "step": 649, "train/total_loss": 0.18630936741828918 }, { "entropy": 9.155862808227539, "epoch": 0.06426735218508997, "mean_token_accuracy": 0.7696139216423035, "num_tokens": 3394266.0, "step": 650, "train/ce_loss": 0.6263456344604492 }, { "epoch": 0.06426735218508997, "step": 650, "train/sim_loss": 0.09375 }, { "epoch": 0.06426735218508997, "step": 650, "train/total_loss": 0.15638455748558044 }, { "entropy": 9.07483196258545, "epoch": 0.0643662250346055, "mean_token_accuracy": 0.6795699000358582, "num_tokens": 3399684.0, "step": 651, "train/ce_loss": 0.9619243741035461 }, { "epoch": 0.0643662250346055, "step": 651, "train/sim_loss": 0.125 }, { "epoch": 0.0643662250346055, "step": 651, "train/total_loss": 0.22119244933128357 }, { "entropy": 10.023591995239258, "epoch": 0.06446509788412103, "mean_token_accuracy": 0.7412451505661011, "num_tokens": 3404623.0, "step": 652, "train/ce_loss": 0.08032934367656708 }, { "epoch": 0.06446509788412103, "step": 652, "train/sim_loss": 0.09375 }, { "epoch": 0.06446509788412103, "step": 652, "train/total_loss": 0.10178293287754059 }, { "entropy": 9.418949127197266, "epoch": 0.06456397073363654, "mean_token_accuracy": 0.7397820353507996, "num_tokens": 3409860.0, "step": 653, "train/ce_loss": 1.1483129262924194 }, { "epoch": 0.06456397073363654, "step": 653, "train/sim_loss": 0.109375 }, { "epoch": 0.06456397073363654, "step": 653, "train/total_loss": 0.22420629858970642 }, { "entropy": 9.003963470458984, "epoch": 0.06466284358315207, "mean_token_accuracy": 0.6903499364852905, "num_tokens": 3415334.0, "step": 654, "train/ce_loss": 1.5239121913909912 }, { "epoch": 0.06466284358315207, "step": 654, "train/sim_loss": 0.171875 }, { "epoch": 0.06466284358315207, "step": 654, "train/total_loss": 0.3242662250995636 }, { "entropy": 9.603049278259277, "epoch": 0.0647617164326676, "mean_token_accuracy": 0.7332382202148438, "num_tokens": 3420483.0, "step": 655, "train/ce_loss": 0.8936455845832825 }, { "epoch": 0.0647617164326676, "step": 655, "train/sim_loss": 0.05859375 }, { "epoch": 0.0647617164326676, "step": 655, "train/total_loss": 0.14795830845832825 }, { "entropy": 9.058694839477539, "epoch": 0.0648605892821831, "mean_token_accuracy": 0.7370558381080627, "num_tokens": 3425941.0, "step": 656, "train/ce_loss": 0.53897625207901 }, { "epoch": 0.0648605892821831, "step": 656, "train/sim_loss": 0.08203125 }, { "epoch": 0.0648605892821831, "step": 656, "train/total_loss": 0.13592886924743652 }, { "entropy": 9.526253700256348, "epoch": 0.06495946213169863, "mean_token_accuracy": 0.7169274687767029, "num_tokens": 3431138.0, "step": 657, "train/ce_loss": 1.0278583765029907 }, { "epoch": 0.06495946213169863, "step": 657, "train/sim_loss": 0.140625 }, { "epoch": 0.06495946213169863, "step": 657, "train/total_loss": 0.2434108406305313 }, { "entropy": 9.444064140319824, "epoch": 0.06505833498121416, "mean_token_accuracy": 0.7347875833511353, "num_tokens": 3436467.0, "step": 658, "train/ce_loss": 1.1032007932662964 }, { "epoch": 0.06505833498121416, "step": 658, "train/sim_loss": 0.0703125 }, { "epoch": 0.06505833498121416, "step": 658, "train/total_loss": 0.1806325912475586 }, { "entropy": 9.832465171813965, "epoch": 0.06515720783072969, "mean_token_accuracy": 0.6955223679542542, "num_tokens": 3441567.0, "step": 659, "train/ce_loss": 0.9809430241584778 }, { "epoch": 0.06515720783072969, "step": 659, "train/sim_loss": 0.125 }, { "epoch": 0.06515720783072969, "step": 659, "train/total_loss": 0.22309431433677673 }, { "epoch": 0.0652560806802452, "grad_norm": 1.191369891166687, "learning_rate": 9.83953913860456e-06, "loss": 0.193, "step": 660 }, { "entropy": 9.245681762695312, "epoch": 0.0652560806802452, "mean_token_accuracy": 0.7570194602012634, "num_tokens": 3446942.0, "step": 660, "train/ce_loss": 0.6208062171936035 }, { "epoch": 0.0652560806802452, "step": 660, "train/sim_loss": 0.07421875 }, { "epoch": 0.0652560806802452, "step": 660, "train/total_loss": 0.13629937171936035 }, { "entropy": 9.718344688415527, "epoch": 0.06535495352976073, "mean_token_accuracy": 0.6290801167488098, "num_tokens": 3452047.0, "step": 661, "train/ce_loss": 2.3280043601989746 }, { "epoch": 0.06535495352976073, "step": 661, "train/sim_loss": 0.14453125 }, { "epoch": 0.06535495352976073, "step": 661, "train/total_loss": 0.3773316740989685 }, { "entropy": 9.507204055786133, "epoch": 0.06545382637927626, "mean_token_accuracy": 0.7333333492279053, "num_tokens": 3457287.0, "step": 662, "train/ce_loss": 0.7894484996795654 }, { "epoch": 0.06545382637927626, "step": 662, "train/sim_loss": 0.1015625 }, { "epoch": 0.06545382637927626, "step": 662, "train/total_loss": 0.1805073618888855 }, { "entropy": 9.868836402893066, "epoch": 0.06555269922879177, "mean_token_accuracy": 0.7058823704719543, "num_tokens": 3462330.0, "step": 663, "train/ce_loss": 1.783589243888855 }, { "epoch": 0.06555269922879177, "step": 663, "train/sim_loss": 0.140625 }, { "epoch": 0.06555269922879177, "step": 663, "train/total_loss": 0.31898391246795654 }, { "entropy": 9.411312103271484, "epoch": 0.0656515720783073, "mean_token_accuracy": 0.7514863014221191, "num_tokens": 3467654.0, "step": 664, "train/ce_loss": 1.2834221124649048 }, { "epoch": 0.0656515720783073, "step": 664, "train/sim_loss": 0.1015625 }, { "epoch": 0.0656515720783073, "step": 664, "train/total_loss": 0.22990471124649048 }, { "entropy": 9.016120910644531, "epoch": 0.06575044492782282, "mean_token_accuracy": 0.7105262875556946, "num_tokens": 3473114.0, "step": 665, "train/ce_loss": 1.0795176029205322 }, { "epoch": 0.06575044492782282, "step": 665, "train/sim_loss": 0.1796875 }, { "epoch": 0.06575044492782282, "step": 665, "train/total_loss": 0.2876392602920532 }, { "entropy": 10.028396606445312, "epoch": 0.06584931777733834, "mean_token_accuracy": 0.8230912685394287, "num_tokens": 3478105.0, "step": 666, "train/ce_loss": 1.5367385149002075 }, { "epoch": 0.06584931777733834, "step": 666, "train/sim_loss": 0.1171875 }, { "epoch": 0.06584931777733834, "step": 666, "train/total_loss": 0.27086135745048523 }, { "entropy": 9.48210620880127, "epoch": 0.06594819062685386, "mean_token_accuracy": 0.7305825352668762, "num_tokens": 3483359.0, "step": 667, "train/ce_loss": 0.828326940536499 }, { "epoch": 0.06594819062685386, "step": 667, "train/sim_loss": 0.0625 }, { "epoch": 0.06594819062685386, "step": 667, "train/total_loss": 0.1453326940536499 }, { "entropy": 9.934640884399414, "epoch": 0.06604706347636939, "mean_token_accuracy": 0.7124394178390503, "num_tokens": 3488405.0, "step": 668, "train/ce_loss": 0.059258297085762024 }, { "epoch": 0.06604706347636939, "step": 668, "train/sim_loss": 0.09375 }, { "epoch": 0.06604706347636939, "step": 668, "train/total_loss": 0.09967582672834396 }, { "entropy": 10.445979118347168, "epoch": 0.06614593632588492, "mean_token_accuracy": 0.7656765580177307, "num_tokens": 3493089.0, "step": 669, "train/ce_loss": 0.12868449091911316 }, { "epoch": 0.06614593632588492, "step": 669, "train/sim_loss": 0.0390625 }, { "epoch": 0.06614593632588492, "step": 669, "train/total_loss": 0.051930949091911316 }, { "entropy": 9.246888160705566, "epoch": 0.06624480917540043, "mean_token_accuracy": 0.7436159253120422, "num_tokens": 3498540.0, "step": 670, "train/ce_loss": 0.9152721166610718 }, { "epoch": 0.06624480917540043, "step": 670, "train/sim_loss": 0.0703125 }, { "epoch": 0.06624480917540043, "step": 670, "train/total_loss": 0.16183972358703613 }, { "entropy": 10.375326156616211, "epoch": 0.06634368202491596, "mean_token_accuracy": 0.6973684430122375, "num_tokens": 3503331.0, "step": 671, "train/ce_loss": 2.0260884761810303 }, { "epoch": 0.06634368202491596, "step": 671, "train/sim_loss": 0.0703125 }, { "epoch": 0.06634368202491596, "step": 671, "train/total_loss": 0.2729213535785675 }, { "entropy": 10.073959350585938, "epoch": 0.06644255487443149, "mean_token_accuracy": 0.6776180863380432, "num_tokens": 3508288.0, "step": 672, "train/ce_loss": 1.6511471271514893 }, { "epoch": 0.06644255487443149, "step": 672, "train/sim_loss": 0.16796875 }, { "epoch": 0.06644255487443149, "step": 672, "train/total_loss": 0.33308345079421997 }, { "entropy": 9.359809875488281, "epoch": 0.066541427723947, "mean_token_accuracy": 0.7652173638343811, "num_tokens": 3513547.0, "step": 673, "train/ce_loss": 0.8948929905891418 }, { "epoch": 0.066541427723947, "step": 673, "train/sim_loss": 0.05859375 }, { "epoch": 0.066541427723947, "step": 673, "train/total_loss": 0.14808306097984314 }, { "entropy": 9.301137924194336, "epoch": 0.06664030057346253, "mean_token_accuracy": 0.709046483039856, "num_tokens": 3518803.0, "step": 674, "train/ce_loss": 1.2451282739639282 }, { "epoch": 0.06664030057346253, "step": 674, "train/sim_loss": 0.09375 }, { "epoch": 0.06664030057346253, "step": 674, "train/total_loss": 0.21826282143592834 }, { "entropy": 9.372105598449707, "epoch": 0.06673917342297805, "mean_token_accuracy": 0.6800920367240906, "num_tokens": 3524178.0, "step": 675, "train/ce_loss": 0.6253710389137268 }, { "epoch": 0.06673917342297805, "step": 675, "train/sim_loss": 0.11328125 }, { "epoch": 0.06673917342297805, "step": 675, "train/total_loss": 0.17581835389137268 }, { "entropy": 9.635863304138184, "epoch": 0.06683804627249357, "mean_token_accuracy": 0.7761394381523132, "num_tokens": 3529381.0, "step": 676, "train/ce_loss": 0.8285779356956482 }, { "epoch": 0.06683804627249357, "step": 676, "train/sim_loss": 0.11328125 }, { "epoch": 0.06683804627249357, "step": 676, "train/total_loss": 0.19613903760910034 }, { "entropy": 9.848868370056152, "epoch": 0.0669369191220091, "mean_token_accuracy": 0.764026403427124, "num_tokens": 3534381.0, "step": 677, "train/ce_loss": 1.0774803161621094 }, { "epoch": 0.0669369191220091, "step": 677, "train/sim_loss": 0.04296875 }, { "epoch": 0.0669369191220091, "step": 677, "train/total_loss": 0.15071678161621094 }, { "entropy": 9.583255767822266, "epoch": 0.06703579197152462, "mean_token_accuracy": 0.7356687784194946, "num_tokens": 3539473.0, "step": 678, "train/ce_loss": 0.6760376691818237 }, { "epoch": 0.06703579197152462, "step": 678, "train/sim_loss": 0.05859375 }, { "epoch": 0.06703579197152462, "step": 678, "train/total_loss": 0.12619751691818237 }, { "entropy": 10.006082534790039, "epoch": 0.06713466482104015, "mean_token_accuracy": 0.7246891856193542, "num_tokens": 3544441.0, "step": 679, "train/ce_loss": 0.05885869264602661 }, { "epoch": 0.06713466482104015, "step": 679, "train/sim_loss": 0.09765625 }, { "epoch": 0.06713466482104015, "step": 679, "train/total_loss": 0.10354211926460266 }, { "epoch": 0.06723353767055566, "grad_norm": 1.089155673980713, "learning_rate": 9.834594273846612e-06, "loss": 0.1867, "step": 680 }, { "entropy": 9.445066452026367, "epoch": 0.06723353767055566, "mean_token_accuracy": 0.7359356880187988, "num_tokens": 3549734.0, "step": 680, "train/ce_loss": 1.1421045064926147 }, { "epoch": 0.06723353767055566, "step": 680, "train/sim_loss": 0.109375 }, { "epoch": 0.06723353767055566, "step": 680, "train/total_loss": 0.22358545660972595 }, { "entropy": 9.163797378540039, "epoch": 0.06733241052007119, "mean_token_accuracy": 0.6962421536445618, "num_tokens": 3555145.0, "step": 681, "train/ce_loss": 0.4922032058238983 }, { "epoch": 0.06733241052007119, "step": 681, "train/sim_loss": 0.10546875 }, { "epoch": 0.06733241052007119, "step": 681, "train/total_loss": 0.15468907356262207 }, { "entropy": 9.931070327758789, "epoch": 0.06743128336958672, "mean_token_accuracy": 0.7162162065505981, "num_tokens": 3560215.0, "step": 682, "train/ce_loss": 2.0106382369995117 }, { "epoch": 0.06743128336958672, "step": 682, "train/sim_loss": 0.125 }, { "epoch": 0.06743128336958672, "step": 682, "train/total_loss": 0.3260638117790222 }, { "entropy": 9.990118980407715, "epoch": 0.06753015621910223, "mean_token_accuracy": 0.7467700242996216, "num_tokens": 3565072.0, "step": 683, "train/ce_loss": 1.3418598175048828 }, { "epoch": 0.06753015621910223, "step": 683, "train/sim_loss": 0.125 }, { "epoch": 0.06753015621910223, "step": 683, "train/total_loss": 0.2591859698295593 }, { "entropy": 9.619613647460938, "epoch": 0.06762902906861776, "mean_token_accuracy": 0.7405857443809509, "num_tokens": 3570234.0, "step": 684, "train/ce_loss": 1.0212657451629639 }, { "epoch": 0.06762902906861776, "step": 684, "train/sim_loss": 0.09765625 }, { "epoch": 0.06762902906861776, "step": 684, "train/total_loss": 0.1997828185558319 }, { "entropy": 9.682656288146973, "epoch": 0.06772790191813328, "mean_token_accuracy": 0.7384615540504456, "num_tokens": 3575280.0, "step": 685, "train/ce_loss": 1.205757737159729 }, { "epoch": 0.06772790191813328, "step": 685, "train/sim_loss": 0.078125 }, { "epoch": 0.06772790191813328, "step": 685, "train/total_loss": 0.19870078563690186 }, { "entropy": 9.31159782409668, "epoch": 0.0678267747676488, "mean_token_accuracy": 0.7185016870498657, "num_tokens": 3580607.0, "step": 686, "train/ce_loss": 0.6654388308525085 }, { "epoch": 0.0678267747676488, "step": 686, "train/sim_loss": 0.05859375 }, { "epoch": 0.0678267747676488, "step": 686, "train/total_loss": 0.12513762712478638 }, { "entropy": 9.789628982543945, "epoch": 0.06792564761716433, "mean_token_accuracy": 0.749627411365509, "num_tokens": 3585915.0, "step": 687, "train/ce_loss": 1.253875494003296 }, { "epoch": 0.06792564761716433, "step": 687, "train/sim_loss": 0.1171875 }, { "epoch": 0.06792564761716433, "step": 687, "train/total_loss": 0.2425750494003296 }, { "entropy": 9.748092651367188, "epoch": 0.06802452046667985, "mean_token_accuracy": 0.6965517401695251, "num_tokens": 3590926.0, "step": 688, "train/ce_loss": 1.903677225112915 }, { "epoch": 0.06802452046667985, "step": 688, "train/sim_loss": 0.19921875 }, { "epoch": 0.06802452046667985, "step": 688, "train/total_loss": 0.389586478471756 }, { "entropy": 9.030678749084473, "epoch": 0.06812339331619537, "mean_token_accuracy": 0.779033899307251, "num_tokens": 3596414.0, "step": 689, "train/ce_loss": 1.0212630033493042 }, { "epoch": 0.06812339331619537, "step": 689, "train/sim_loss": 0.12109375 }, { "epoch": 0.06812339331619537, "step": 689, "train/total_loss": 0.22322005033493042 }, { "entropy": 10.724943161010742, "epoch": 0.0682222661657109, "mean_token_accuracy": 0.8090452551841736, "num_tokens": 3600992.0, "step": 690, "train/ce_loss": 0.20112809538841248 }, { "epoch": 0.0682222661657109, "step": 690, "train/sim_loss": 0.0625 }, { "epoch": 0.0682222661657109, "step": 690, "train/total_loss": 0.08261281251907349 }, { "entropy": 9.349640846252441, "epoch": 0.06832113901522642, "mean_token_accuracy": 0.7215777039527893, "num_tokens": 3606369.0, "step": 691, "train/ce_loss": 1.2464035749435425 }, { "epoch": 0.06832113901522642, "step": 691, "train/sim_loss": 0.125 }, { "epoch": 0.06832113901522642, "step": 691, "train/total_loss": 0.2496403604745865 }, { "entropy": 9.738096237182617, "epoch": 0.06842001186474195, "mean_token_accuracy": 0.736923098564148, "num_tokens": 3611444.0, "step": 692, "train/ce_loss": 1.0789051055908203 }, { "epoch": 0.06842001186474195, "step": 692, "train/sim_loss": 0.06640625 }, { "epoch": 0.06842001186474195, "step": 692, "train/total_loss": 0.1742967665195465 }, { "entropy": 9.667228698730469, "epoch": 0.06851888471425746, "mean_token_accuracy": 0.724397599697113, "num_tokens": 3616530.0, "step": 693, "train/ce_loss": 1.6246271133422852 }, { "epoch": 0.06851888471425746, "step": 693, "train/sim_loss": 0.14453125 }, { "epoch": 0.06851888471425746, "step": 693, "train/total_loss": 0.3069939613342285 }, { "entropy": 9.364891052246094, "epoch": 0.06861775756377299, "mean_token_accuracy": 0.7209567427635193, "num_tokens": 3621889.0, "step": 694, "train/ce_loss": 1.0871267318725586 }, { "epoch": 0.06861775756377299, "step": 694, "train/sim_loss": 0.1328125 }, { "epoch": 0.06861775756377299, "step": 694, "train/total_loss": 0.24152517318725586 }, { "entropy": 10.386150360107422, "epoch": 0.06871663041328852, "mean_token_accuracy": 0.7190082669258118, "num_tokens": 3626639.0, "step": 695, "train/ce_loss": 0.08880459517240524 }, { "epoch": 0.06871663041328852, "step": 695, "train/sim_loss": 0.0390625 }, { "epoch": 0.06871663041328852, "step": 695, "train/total_loss": 0.047942958772182465 }, { "entropy": 9.494813919067383, "epoch": 0.06881550326280403, "mean_token_accuracy": 0.6580311059951782, "num_tokens": 3631873.0, "step": 696, "train/ce_loss": 0.921953558921814 }, { "epoch": 0.06881550326280403, "step": 696, "train/sim_loss": 0.078125 }, { "epoch": 0.06881550326280403, "step": 696, "train/total_loss": 0.17032036185264587 }, { "entropy": 9.156608581542969, "epoch": 0.06891437611231956, "mean_token_accuracy": 0.7486573457717896, "num_tokens": 3637255.0, "step": 697, "train/ce_loss": 0.3901670277118683 }, { "epoch": 0.06891437611231956, "step": 697, "train/sim_loss": 0.09375 }, { "epoch": 0.06891437611231956, "step": 697, "train/total_loss": 0.1327667087316513 }, { "entropy": 9.285764694213867, "epoch": 0.06901324896183508, "mean_token_accuracy": 0.6714628338813782, "num_tokens": 3642508.0, "step": 698, "train/ce_loss": 1.7411725521087646 }, { "epoch": 0.06901324896183508, "step": 698, "train/sim_loss": 0.1171875 }, { "epoch": 0.06901324896183508, "step": 698, "train/total_loss": 0.2913047671318054 }, { "entropy": 9.458796501159668, "epoch": 0.0691121218113506, "mean_token_accuracy": 0.7443181872367859, "num_tokens": 3647698.0, "step": 699, "train/ce_loss": 0.5074321627616882 }, { "epoch": 0.0691121218113506, "step": 699, "train/sim_loss": 0.07421875 }, { "epoch": 0.0691121218113506, "step": 699, "train/total_loss": 0.1249619722366333 }, { "epoch": 0.06921099466086612, "grad_norm": 1.3604364395141602, "learning_rate": 9.829649409088662e-06, "loss": 0.1957, "step": 700 }, { "entropy": 9.569518089294434, "epoch": 0.06921099466086612, "mean_token_accuracy": 0.737051784992218, "num_tokens": 3652914.0, "step": 700, "train/ce_loss": 0.03997810557484627 }, { "epoch": 0.06921099466086612, "step": 700, "train/sim_loss": 0.09765625 }, { "epoch": 0.06921099466086612, "step": 700, "train/total_loss": 0.1016540601849556 }, { "entropy": 9.300058364868164, "epoch": 0.06930986751038165, "mean_token_accuracy": 0.701545774936676, "num_tokens": 3658230.0, "step": 701, "train/ce_loss": 1.087414264678955 }, { "epoch": 0.06930986751038165, "step": 701, "train/sim_loss": 0.140625 }, { "epoch": 0.06930986751038165, "step": 701, "train/total_loss": 0.24936643242835999 }, { "entropy": 8.97523307800293, "epoch": 0.06940874035989718, "mean_token_accuracy": 0.7360979914665222, "num_tokens": 3663780.0, "step": 702, "train/ce_loss": 0.4771391451358795 }, { "epoch": 0.06940874035989718, "step": 702, "train/sim_loss": 0.05859375 }, { "epoch": 0.06940874035989718, "step": 702, "train/total_loss": 0.10630767047405243 }, { "entropy": 9.283151626586914, "epoch": 0.06950761320941269, "mean_token_accuracy": 0.7625418305397034, "num_tokens": 3669110.0, "step": 703, "train/ce_loss": 0.7032759189605713 }, { "epoch": 0.06950761320941269, "step": 703, "train/sim_loss": 0.046875 }, { "epoch": 0.06950761320941269, "step": 703, "train/total_loss": 0.11720259487628937 }, { "entropy": 9.970033645629883, "epoch": 0.06960648605892822, "mean_token_accuracy": 0.6790606379508972, "num_tokens": 3674050.0, "step": 704, "train/ce_loss": 1.2082698345184326 }, { "epoch": 0.06960648605892822, "step": 704, "train/sim_loss": 0.0859375 }, { "epoch": 0.06960648605892822, "step": 704, "train/total_loss": 0.20676448941230774 }, { "entropy": 10.034111022949219, "epoch": 0.06970535890844375, "mean_token_accuracy": 0.7245762944221497, "num_tokens": 3678912.0, "step": 705, "train/ce_loss": 0.08424589782953262 }, { "epoch": 0.06970535890844375, "step": 705, "train/sim_loss": 0.0859375 }, { "epoch": 0.06970535890844375, "step": 705, "train/total_loss": 0.09436208754777908 }, { "entropy": 10.111080169677734, "epoch": 0.06980423175795926, "mean_token_accuracy": 0.6854838728904724, "num_tokens": 3683843.0, "step": 706, "train/ce_loss": 1.994897484779358 }, { "epoch": 0.06980423175795926, "step": 706, "train/sim_loss": 0.1328125 }, { "epoch": 0.06980423175795926, "step": 706, "train/total_loss": 0.3323022723197937 }, { "entropy": 9.114567756652832, "epoch": 0.06990310460747479, "mean_token_accuracy": 0.7799385786056519, "num_tokens": 3689314.0, "step": 707, "train/ce_loss": 0.48830723762512207 }, { "epoch": 0.06990310460747479, "step": 707, "train/sim_loss": 0.04296875 }, { "epoch": 0.06990310460747479, "step": 707, "train/total_loss": 0.09179947525262833 }, { "entropy": 8.870497703552246, "epoch": 0.07000197745699031, "mean_token_accuracy": 0.7571288347244263, "num_tokens": 3694841.0, "step": 708, "train/ce_loss": 0.9047189950942993 }, { "epoch": 0.07000197745699031, "step": 708, "train/sim_loss": 0.03515625 }, { "epoch": 0.07000197745699031, "step": 708, "train/total_loss": 0.12562814354896545 }, { "entropy": 9.620412826538086, "epoch": 0.07010085030650583, "mean_token_accuracy": 0.7257575988769531, "num_tokens": 3700157.0, "step": 709, "train/ce_loss": 0.427399605512619 }, { "epoch": 0.07010085030650583, "step": 709, "train/sim_loss": 0.18359375 }, { "epoch": 0.07010085030650583, "step": 709, "train/total_loss": 0.22633370757102966 }, { "entropy": 9.372359275817871, "epoch": 0.07019972315602135, "mean_token_accuracy": 0.7134146094322205, "num_tokens": 3705452.0, "step": 710, "train/ce_loss": 0.5273782014846802 }, { "epoch": 0.07019972315602135, "step": 710, "train/sim_loss": 0.09375 }, { "epoch": 0.07019972315602135, "step": 710, "train/total_loss": 0.14648781716823578 }, { "entropy": 10.039669036865234, "epoch": 0.07029859600553688, "mean_token_accuracy": 0.7710084319114685, "num_tokens": 3710372.0, "step": 711, "train/ce_loss": 1.3590974807739258 }, { "epoch": 0.07029859600553688, "step": 711, "train/sim_loss": 0.0859375 }, { "epoch": 0.07029859600553688, "step": 711, "train/total_loss": 0.22184725105762482 }, { "entropy": 9.72056770324707, "epoch": 0.07039746885505241, "mean_token_accuracy": 0.7262658476829529, "num_tokens": 3715474.0, "step": 712, "train/ce_loss": 1.278663992881775 }, { "epoch": 0.07039746885505241, "step": 712, "train/sim_loss": 0.140625 }, { "epoch": 0.07039746885505241, "step": 712, "train/total_loss": 0.26849138736724854 }, { "entropy": 9.267029762268066, "epoch": 0.07049634170456792, "mean_token_accuracy": 0.7059509754180908, "num_tokens": 3720799.0, "step": 713, "train/ce_loss": 0.4272323548793793 }, { "epoch": 0.07049634170456792, "step": 713, "train/sim_loss": 0.0859375 }, { "epoch": 0.07049634170456792, "step": 713, "train/total_loss": 0.12866073846817017 }, { "entropy": 9.170409202575684, "epoch": 0.07059521455408345, "mean_token_accuracy": 0.7936508059501648, "num_tokens": 3726103.0, "step": 714, "train/ce_loss": 0.6024886965751648 }, { "epoch": 0.07059521455408345, "step": 714, "train/sim_loss": 0.07421875 }, { "epoch": 0.07059521455408345, "step": 714, "train/total_loss": 0.13446761667728424 }, { "entropy": 9.855649948120117, "epoch": 0.07069408740359898, "mean_token_accuracy": 0.6976743936538696, "num_tokens": 3731103.0, "step": 715, "train/ce_loss": 0.04739635810256004 }, { "epoch": 0.07069408740359898, "step": 715, "train/sim_loss": 0.0546875 }, { "epoch": 0.07069408740359898, "step": 715, "train/total_loss": 0.059427134692668915 }, { "entropy": 9.538551330566406, "epoch": 0.07079296025311449, "mean_token_accuracy": 0.7252010703086853, "num_tokens": 3736312.0, "step": 716, "train/ce_loss": 1.1417169570922852 }, { "epoch": 0.07079296025311449, "step": 716, "train/sim_loss": 0.11328125 }, { "epoch": 0.07079296025311449, "step": 716, "train/total_loss": 0.22745294868946075 }, { "entropy": 9.267305374145508, "epoch": 0.07089183310263002, "mean_token_accuracy": 0.7060086131095886, "num_tokens": 3741658.0, "step": 717, "train/ce_loss": 0.7700828313827515 }, { "epoch": 0.07089183310263002, "step": 717, "train/sim_loss": 0.09375 }, { "epoch": 0.07089183310263002, "step": 717, "train/total_loss": 0.17075827717781067 }, { "entropy": 9.660655975341797, "epoch": 0.07099070595214554, "mean_token_accuracy": 0.664505660533905, "num_tokens": 3746718.0, "step": 718, "train/ce_loss": 1.1156566143035889 }, { "epoch": 0.07099070595214554, "step": 718, "train/sim_loss": 0.08984375 }, { "epoch": 0.07099070595214554, "step": 718, "train/total_loss": 0.20140941441059113 }, { "entropy": 9.282903671264648, "epoch": 0.07108957880166106, "mean_token_accuracy": 0.7349665760993958, "num_tokens": 3752108.0, "step": 719, "train/ce_loss": 0.8221208453178406 }, { "epoch": 0.07108957880166106, "step": 719, "train/sim_loss": 0.109375 }, { "epoch": 0.07108957880166106, "step": 719, "train/total_loss": 0.19158709049224854 }, { "epoch": 0.07118845165117658, "grad_norm": 1.1553668975830078, "learning_rate": 9.824704544330713e-06, "loss": 0.1963, "step": 720 }, { "entropy": 9.120210647583008, "epoch": 0.07118845165117658, "mean_token_accuracy": 0.741631805896759, "num_tokens": 3757553.0, "step": 720, "train/ce_loss": 0.8454121351242065 }, { "epoch": 0.07118845165117658, "step": 720, "train/sim_loss": 0.1484375 }, { "epoch": 0.07118845165117658, "step": 720, "train/total_loss": 0.2329787164926529 }, { "entropy": 9.471561431884766, "epoch": 0.07128732450069211, "mean_token_accuracy": 0.7384823560714722, "num_tokens": 3762730.0, "step": 721, "train/ce_loss": 1.2541835308074951 }, { "epoch": 0.07128732450069211, "step": 721, "train/sim_loss": 0.13671875 }, { "epoch": 0.07128732450069211, "step": 721, "train/total_loss": 0.26213711500167847 }, { "entropy": 9.546029090881348, "epoch": 0.07138619735020764, "mean_token_accuracy": 0.7725321650505066, "num_tokens": 3767914.0, "step": 722, "train/ce_loss": 0.8785749077796936 }, { "epoch": 0.07138619735020764, "step": 722, "train/sim_loss": 0.05078125 }, { "epoch": 0.07138619735020764, "step": 722, "train/total_loss": 0.13863873481750488 }, { "entropy": 9.323598861694336, "epoch": 0.07148507019972315, "mean_token_accuracy": 0.7405345439910889, "num_tokens": 3773233.0, "step": 723, "train/ce_loss": 0.9451057314872742 }, { "epoch": 0.07148507019972315, "step": 723, "train/sim_loss": 0.1015625 }, { "epoch": 0.07148507019972315, "step": 723, "train/total_loss": 0.19607308506965637 }, { "entropy": 9.958362579345703, "epoch": 0.07158394304923868, "mean_token_accuracy": 0.7546531558036804, "num_tokens": 3778258.0, "step": 724, "train/ce_loss": 1.2022948265075684 }, { "epoch": 0.07158394304923868, "step": 724, "train/sim_loss": 0.06640625 }, { "epoch": 0.07158394304923868, "step": 724, "train/total_loss": 0.18663573265075684 }, { "entropy": 9.390419006347656, "epoch": 0.0716828158987542, "mean_token_accuracy": 0.7416563630104065, "num_tokens": 3783553.0, "step": 725, "train/ce_loss": 0.6417506337165833 }, { "epoch": 0.0716828158987542, "step": 725, "train/sim_loss": 0.09765625 }, { "epoch": 0.0716828158987542, "step": 725, "train/total_loss": 0.1618313193321228 }, { "entropy": 9.722644805908203, "epoch": 0.07178168874826972, "mean_token_accuracy": 0.7279411554336548, "num_tokens": 3788630.0, "step": 726, "train/ce_loss": 1.2358253002166748 }, { "epoch": 0.07178168874826972, "step": 726, "train/sim_loss": 0.109375 }, { "epoch": 0.07178168874826972, "step": 726, "train/total_loss": 0.23295754194259644 }, { "entropy": 10.481596946716309, "epoch": 0.07188056159778525, "mean_token_accuracy": 0.6655948758125305, "num_tokens": 3793333.0, "step": 727, "train/ce_loss": 0.035042211413383484 }, { "epoch": 0.07188056159778525, "step": 727, "train/sim_loss": 0.05078125 }, { "epoch": 0.07188056159778525, "step": 727, "train/total_loss": 0.05428547039628029 }, { "entropy": 9.238446235656738, "epoch": 0.07197943444730077, "mean_token_accuracy": 0.7754868268966675, "num_tokens": 3798691.0, "step": 728, "train/ce_loss": 0.4300273656845093 }, { "epoch": 0.07197943444730077, "step": 728, "train/sim_loss": 0.09765625 }, { "epoch": 0.07197943444730077, "step": 728, "train/total_loss": 0.14065898954868317 }, { "entropy": 9.407970428466797, "epoch": 0.07207830729681629, "mean_token_accuracy": 0.7565789222717285, "num_tokens": 3803934.0, "step": 729, "train/ce_loss": 0.6835651397705078 }, { "epoch": 0.07207830729681629, "step": 729, "train/sim_loss": 0.0625 }, { "epoch": 0.07207830729681629, "step": 729, "train/total_loss": 0.13085651397705078 }, { "entropy": 9.088936805725098, "epoch": 0.07217718014633182, "mean_token_accuracy": 0.708737850189209, "num_tokens": 3809399.0, "step": 730, "train/ce_loss": 0.6989524364471436 }, { "epoch": 0.07217718014633182, "step": 730, "train/sim_loss": 0.08203125 }, { "epoch": 0.07217718014633182, "step": 730, "train/total_loss": 0.15192648768424988 }, { "entropy": 9.598482131958008, "epoch": 0.07227605299584734, "mean_token_accuracy": 0.7364568114280701, "num_tokens": 3814590.0, "step": 731, "train/ce_loss": 1.3085780143737793 }, { "epoch": 0.07227605299584734, "step": 731, "train/sim_loss": 0.08984375 }, { "epoch": 0.07227605299584734, "step": 731, "train/total_loss": 0.22070156037807465 }, { "entropy": 10.219598770141602, "epoch": 0.07237492584536287, "mean_token_accuracy": 0.75, "num_tokens": 3819480.0, "step": 732, "train/ce_loss": 0.05476241931319237 }, { "epoch": 0.07237492584536287, "step": 732, "train/sim_loss": 0.07421875 }, { "epoch": 0.07237492584536287, "step": 732, "train/total_loss": 0.07969499379396439 }, { "entropy": 9.48927116394043, "epoch": 0.07247379869487838, "mean_token_accuracy": 0.7329114079475403, "num_tokens": 3824729.0, "step": 733, "train/ce_loss": 0.7749419212341309 }, { "epoch": 0.07247379869487838, "step": 733, "train/sim_loss": 0.08203125 }, { "epoch": 0.07247379869487838, "step": 733, "train/total_loss": 0.15952545404434204 }, { "entropy": 9.768869400024414, "epoch": 0.07257267154439391, "mean_token_accuracy": 0.6836734414100647, "num_tokens": 3829855.0, "step": 734, "train/ce_loss": 1.120697259902954 }, { "epoch": 0.07257267154439391, "step": 734, "train/sim_loss": 0.14453125 }, { "epoch": 0.07257267154439391, "step": 734, "train/total_loss": 0.2566009759902954 }, { "entropy": 9.667350769042969, "epoch": 0.07267154439390944, "mean_token_accuracy": 0.7994186282157898, "num_tokens": 3834992.0, "step": 735, "train/ce_loss": 0.8130565881729126 }, { "epoch": 0.07267154439390944, "step": 735, "train/sim_loss": 0.06640625 }, { "epoch": 0.07267154439390944, "step": 735, "train/total_loss": 0.14771190285682678 }, { "entropy": 9.413403511047363, "epoch": 0.07277041724342495, "mean_token_accuracy": 0.7240506410598755, "num_tokens": 3840270.0, "step": 736, "train/ce_loss": 0.9753805994987488 }, { "epoch": 0.07277041724342495, "step": 736, "train/sim_loss": 0.12109375 }, { "epoch": 0.07277041724342495, "step": 736, "train/total_loss": 0.2186318039894104 }, { "entropy": 10.649709701538086, "epoch": 0.07286929009294048, "mean_token_accuracy": 0.7428571581840515, "num_tokens": 3844835.0, "step": 737, "train/ce_loss": 0.0780835896730423 }, { "epoch": 0.07286929009294048, "step": 737, "train/sim_loss": 0.05078125 }, { "epoch": 0.07286929009294048, "step": 737, "train/total_loss": 0.05858960747718811 }, { "entropy": 9.241476058959961, "epoch": 0.072968162942456, "mean_token_accuracy": 0.7703016400337219, "num_tokens": 3850202.0, "step": 738, "train/ce_loss": 0.5357682704925537 }, { "epoch": 0.072968162942456, "step": 738, "train/sim_loss": 0.046875 }, { "epoch": 0.072968162942456, "step": 738, "train/total_loss": 0.10045182704925537 }, { "entropy": 10.102038383483887, "epoch": 0.07306703579197152, "mean_token_accuracy": 0.6833333373069763, "num_tokens": 3855122.0, "step": 739, "train/ce_loss": 0.037747181951999664 }, { "epoch": 0.07306703579197152, "step": 739, "train/sim_loss": 0.09375 }, { "epoch": 0.07306703579197152, "step": 739, "train/total_loss": 0.0975247174501419 }, { "epoch": 0.07316590864148705, "grad_norm": 1.4861098527908325, "learning_rate": 9.819759679572765e-06, "loss": 0.1844, "step": 740 }, { "entropy": 9.190587997436523, "epoch": 0.07316590864148705, "mean_token_accuracy": 0.6952695250511169, "num_tokens": 3860520.0, "step": 740, "train/ce_loss": 0.8424211740493774 }, { "epoch": 0.07316590864148705, "step": 740, "train/sim_loss": 0.08203125 }, { "epoch": 0.07316590864148705, "step": 740, "train/total_loss": 0.16627337038516998 }, { "entropy": 9.568659782409668, "epoch": 0.07326478149100257, "mean_token_accuracy": 0.7337278127670288, "num_tokens": 3865690.0, "step": 741, "train/ce_loss": 0.713638961315155 }, { "epoch": 0.07326478149100257, "step": 741, "train/sim_loss": 0.0859375 }, { "epoch": 0.07326478149100257, "step": 741, "train/total_loss": 0.1573013961315155 }, { "entropy": 9.289871215820312, "epoch": 0.0733636543405181, "mean_token_accuracy": 0.7718120813369751, "num_tokens": 3871187.0, "step": 742, "train/ce_loss": 0.6669222712516785 }, { "epoch": 0.0733636543405181, "step": 742, "train/sim_loss": 0.0703125 }, { "epoch": 0.0733636543405181, "step": 742, "train/total_loss": 0.13700473308563232 }, { "entropy": 9.741256713867188, "epoch": 0.07346252719003361, "mean_token_accuracy": 0.7789046764373779, "num_tokens": 3876114.0, "step": 743, "train/ce_loss": 0.697907567024231 }, { "epoch": 0.07346252719003361, "step": 743, "train/sim_loss": 0.0859375 }, { "epoch": 0.07346252719003361, "step": 743, "train/total_loss": 0.15572825074195862 }, { "entropy": 9.256278038024902, "epoch": 0.07356140003954914, "mean_token_accuracy": 0.8060781359672546, "num_tokens": 3881308.0, "step": 744, "train/ce_loss": 0.8655441403388977 }, { "epoch": 0.07356140003954914, "step": 744, "train/sim_loss": 0.06640625 }, { "epoch": 0.07356140003954914, "step": 744, "train/total_loss": 0.1529606580734253 }, { "entropy": 9.161439895629883, "epoch": 0.07366027288906467, "mean_token_accuracy": 0.7142857313156128, "num_tokens": 3886614.0, "step": 745, "train/ce_loss": 0.6780821084976196 }, { "epoch": 0.07366027288906467, "step": 745, "train/sim_loss": 0.13671875 }, { "epoch": 0.07366027288906467, "step": 745, "train/total_loss": 0.20452696084976196 }, { "entropy": 9.79833984375, "epoch": 0.07375914573858018, "mean_token_accuracy": 0.6719492673873901, "num_tokens": 3891750.0, "step": 746, "train/ce_loss": 0.010583448223769665 }, { "epoch": 0.07375914573858018, "step": 746, "train/sim_loss": 0.078125 }, { "epoch": 0.07375914573858018, "step": 746, "train/total_loss": 0.07918334752321243 }, { "entropy": 9.648619651794434, "epoch": 0.07385801858809571, "mean_token_accuracy": 0.7537190318107605, "num_tokens": 3896805.0, "step": 747, "train/ce_loss": 1.2625455856323242 }, { "epoch": 0.07385801858809571, "step": 747, "train/sim_loss": 0.10546875 }, { "epoch": 0.07385801858809571, "step": 747, "train/total_loss": 0.23172330856323242 }, { "entropy": 9.038060188293457, "epoch": 0.07395689143761124, "mean_token_accuracy": 0.7724867463111877, "num_tokens": 3902211.0, "step": 748, "train/ce_loss": 0.7919908761978149 }, { "epoch": 0.07395689143761124, "step": 748, "train/sim_loss": 0.09765625 }, { "epoch": 0.07395689143761124, "step": 748, "train/total_loss": 0.17685534060001373 }, { "entropy": 9.768566131591797, "epoch": 0.07405576428712675, "mean_token_accuracy": 0.6908283829689026, "num_tokens": 3907482.0, "step": 749, "train/ce_loss": 1.213486671447754 }, { "epoch": 0.07405576428712675, "step": 749, "train/sim_loss": 0.140625 }, { "epoch": 0.07405576428712675, "step": 749, "train/total_loss": 0.26197367906570435 }, { "entropy": 9.529090881347656, "epoch": 0.07415463713664228, "mean_token_accuracy": 0.769784152507782, "num_tokens": 3912423.0, "step": 750, "train/ce_loss": 1.1830240488052368 }, { "epoch": 0.07415463713664228, "step": 750, "train/sim_loss": 0.1171875 }, { "epoch": 0.07415463713664228, "step": 750, "train/total_loss": 0.23548990488052368 }, { "entropy": 9.040364265441895, "epoch": 0.0742535099861578, "mean_token_accuracy": 0.73758864402771, "num_tokens": 3917883.0, "step": 751, "train/ce_loss": 0.48614466190338135 }, { "epoch": 0.0742535099861578, "step": 751, "train/sim_loss": 0.09765625 }, { "epoch": 0.0742535099861578, "step": 751, "train/total_loss": 0.1462707221508026 }, { "entropy": 9.44776725769043, "epoch": 0.07435238283567333, "mean_token_accuracy": 0.6906565427780151, "num_tokens": 3923172.0, "step": 752, "train/ce_loss": 1.3117228746414185 }, { "epoch": 0.07435238283567333, "step": 752, "train/sim_loss": 0.12109375 }, { "epoch": 0.07435238283567333, "step": 752, "train/total_loss": 0.2522660493850708 }, { "entropy": 9.157966613769531, "epoch": 0.07445125568518884, "mean_token_accuracy": 0.7769230604171753, "num_tokens": 3928570.0, "step": 753, "train/ce_loss": 0.8286904096603394 }, { "epoch": 0.07445125568518884, "step": 753, "train/sim_loss": 0.15625 }, { "epoch": 0.07445125568518884, "step": 753, "train/total_loss": 0.2391190528869629 }, { "entropy": 9.417069435119629, "epoch": 0.07455012853470437, "mean_token_accuracy": 0.7154285907745361, "num_tokens": 3934052.0, "step": 754, "train/ce_loss": 1.1700571775436401 }, { "epoch": 0.07455012853470437, "step": 754, "train/sim_loss": 0.1796875 }, { "epoch": 0.07455012853470437, "step": 754, "train/total_loss": 0.29669320583343506 }, { "entropy": 9.932807922363281, "epoch": 0.0746490013842199, "mean_token_accuracy": 0.7793851494789124, "num_tokens": 3939025.0, "step": 755, "train/ce_loss": 0.0001756744022713974 }, { "epoch": 0.0746490013842199, "step": 755, "train/sim_loss": 0.05859375 }, { "epoch": 0.0746490013842199, "step": 755, "train/total_loss": 0.058611318469047546 }, { "entropy": 10.180121421813965, "epoch": 0.07474787423373541, "mean_token_accuracy": 0.769444465637207, "num_tokens": 3943785.0, "step": 756, "train/ce_loss": 2.040376901626587 }, { "epoch": 0.07474787423373541, "step": 756, "train/sim_loss": 0.10546875 }, { "epoch": 0.07474787423373541, "step": 756, "train/total_loss": 0.30950644612312317 }, { "entropy": 9.88981819152832, "epoch": 0.07484674708325094, "mean_token_accuracy": 0.7750439643859863, "num_tokens": 3948807.0, "step": 757, "train/ce_loss": 0.9959053993225098 }, { "epoch": 0.07484674708325094, "step": 757, "train/sim_loss": 0.1015625 }, { "epoch": 0.07484674708325094, "step": 757, "train/total_loss": 0.20115303993225098 }, { "entropy": 9.706342697143555, "epoch": 0.07494561993276647, "mean_token_accuracy": 0.6854838728904724, "num_tokens": 3953879.0, "step": 758, "train/ce_loss": 2.2091052532196045 }, { "epoch": 0.07494561993276647, "step": 758, "train/sim_loss": 0.15625 }, { "epoch": 0.07494561993276647, "step": 758, "train/total_loss": 0.37716054916381836 }, { "entropy": 10.303378105163574, "epoch": 0.07504449278228198, "mean_token_accuracy": 0.7054455280303955, "num_tokens": 3958686.0, "step": 759, "train/ce_loss": 0.00018329703016206622 }, { "epoch": 0.07504449278228198, "step": 759, "train/sim_loss": 0.09375 }, { "epoch": 0.07504449278228198, "step": 759, "train/total_loss": 0.09376832842826843 }, { "epoch": 0.0751433656317975, "grad_norm": 1.2485485076904297, "learning_rate": 9.814814814814815e-06, "loss": 0.1817, "step": 760 }, { "entropy": 9.339644432067871, "epoch": 0.0751433656317975, "mean_token_accuracy": 0.7108753323554993, "num_tokens": 3963948.0, "step": 760, "train/ce_loss": 0.7239080667495728 }, { "epoch": 0.0751433656317975, "step": 760, "train/sim_loss": 0.12890625 }, { "epoch": 0.0751433656317975, "step": 760, "train/total_loss": 0.20129705965518951 }, { "entropy": 9.127784729003906, "epoch": 0.07524223848131303, "mean_token_accuracy": 0.7497155666351318, "num_tokens": 3969361.0, "step": 761, "train/ce_loss": 0.8279957175254822 }, { "epoch": 0.07524223848131303, "step": 761, "train/sim_loss": 0.0390625 }, { "epoch": 0.07524223848131303, "step": 761, "train/total_loss": 0.12186207622289658 }, { "entropy": 10.072265625, "epoch": 0.07534111133082856, "mean_token_accuracy": 0.7269155383110046, "num_tokens": 3974483.0, "step": 762, "train/ce_loss": 1.78815495967865 }, { "epoch": 0.07534111133082856, "step": 762, "train/sim_loss": 0.14453125 }, { "epoch": 0.07534111133082856, "step": 762, "train/total_loss": 0.32334673404693604 }, { "entropy": 8.886787414550781, "epoch": 0.07543998418034407, "mean_token_accuracy": 0.7236328125, "num_tokens": 3980013.0, "step": 763, "train/ce_loss": 0.7672596573829651 }, { "epoch": 0.07543998418034407, "step": 763, "train/sim_loss": 0.02734375 }, { "epoch": 0.07543998418034407, "step": 763, "train/total_loss": 0.10406971722841263 }, { "entropy": 9.612545013427734, "epoch": 0.0755388570298596, "mean_token_accuracy": 0.7057291865348816, "num_tokens": 3985241.0, "step": 764, "train/ce_loss": 0.0003228633722756058 }, { "epoch": 0.0755388570298596, "step": 764, "train/sim_loss": 0.125 }, { "epoch": 0.0755388570298596, "step": 764, "train/total_loss": 0.12503229081630707 }, { "entropy": 10.012237548828125, "epoch": 0.07563772987937513, "mean_token_accuracy": 0.7092783451080322, "num_tokens": 3990048.0, "step": 765, "train/ce_loss": 0.00010308609489584342 }, { "epoch": 0.07563772987937513, "step": 765, "train/sim_loss": 0.078125 }, { "epoch": 0.07563772987937513, "step": 765, "train/total_loss": 0.07813531160354614 }, { "entropy": 10.73228645324707, "epoch": 0.07573660272889064, "mean_token_accuracy": 0.7272727489471436, "num_tokens": 3994606.0, "step": 766, "train/ce_loss": 3.5277905464172363 }, { "epoch": 0.07573660272889064, "step": 766, "train/sim_loss": 0.1171875 }, { "epoch": 0.07573660272889064, "step": 766, "train/total_loss": 0.4699665606021881 }, { "entropy": 10.159137725830078, "epoch": 0.07583547557840617, "mean_token_accuracy": 0.7402299046516418, "num_tokens": 3999438.0, "step": 767, "train/ce_loss": 0.00011631346569629386 }, { "epoch": 0.07583547557840617, "step": 767, "train/sim_loss": 0.0703125 }, { "epoch": 0.07583547557840617, "step": 767, "train/total_loss": 0.0703241303563118 }, { "entropy": 9.458081245422363, "epoch": 0.0759343484279217, "mean_token_accuracy": 0.7712082266807556, "num_tokens": 4004694.0, "step": 768, "train/ce_loss": 0.7753697633743286 }, { "epoch": 0.0759343484279217, "step": 768, "train/sim_loss": 0.03515625 }, { "epoch": 0.0759343484279217, "step": 768, "train/total_loss": 0.11269322782754898 }, { "entropy": 9.82442855834961, "epoch": 0.07603322127743721, "mean_token_accuracy": 0.7715231776237488, "num_tokens": 4009700.0, "step": 769, "train/ce_loss": 0.6574856042861938 }, { "epoch": 0.07603322127743721, "step": 769, "train/sim_loss": 0.078125 }, { "epoch": 0.07603322127743721, "step": 769, "train/total_loss": 0.14387357234954834 }, { "entropy": 9.83502197265625, "epoch": 0.07613209412695274, "mean_token_accuracy": 0.738831639289856, "num_tokens": 4014669.0, "step": 770, "train/ce_loss": 0.7562925219535828 }, { "epoch": 0.07613209412695274, "step": 770, "train/sim_loss": 0.0859375 }, { "epoch": 0.07613209412695274, "step": 770, "train/total_loss": 0.16156676411628723 }, { "entropy": 9.765600204467773, "epoch": 0.07623096697646826, "mean_token_accuracy": 0.737942099571228, "num_tokens": 4019740.0, "step": 771, "train/ce_loss": 6.540792674059048e-05 }, { "epoch": 0.07623096697646826, "step": 771, "train/sim_loss": 0.0859375 }, { "epoch": 0.07623096697646826, "step": 771, "train/total_loss": 0.0859440416097641 }, { "entropy": 9.84894847869873, "epoch": 0.07632983982598378, "mean_token_accuracy": 0.743922233581543, "num_tokens": 4024689.0, "step": 772, "train/ce_loss": 5.4590320360148326e-05 }, { "epoch": 0.07632983982598378, "step": 772, "train/sim_loss": 0.08203125 }, { "epoch": 0.07632983982598378, "step": 772, "train/total_loss": 0.08203671127557755 }, { "entropy": 9.404129981994629, "epoch": 0.0764287126754993, "mean_token_accuracy": 0.7545219659805298, "num_tokens": 4029914.0, "step": 773, "train/ce_loss": 0.7593308091163635 }, { "epoch": 0.0764287126754993, "step": 773, "train/sim_loss": 0.08984375 }, { "epoch": 0.0764287126754993, "step": 773, "train/total_loss": 0.1657768338918686 }, { "entropy": 10.176400184631348, "epoch": 0.07652758552501483, "mean_token_accuracy": 0.6865979433059692, "num_tokens": 4034779.0, "step": 774, "train/ce_loss": 7.022523641353473e-05 }, { "epoch": 0.07652758552501483, "step": 774, "train/sim_loss": 0.05078125 }, { "epoch": 0.07652758552501483, "step": 774, "train/total_loss": 0.0507882721722126 }, { "entropy": 9.250575065612793, "epoch": 0.07662645837453036, "mean_token_accuracy": 0.704395592212677, "num_tokens": 4040187.0, "step": 775, "train/ce_loss": 0.5897475481033325 }, { "epoch": 0.07662645837453036, "step": 775, "train/sim_loss": 0.07421875 }, { "epoch": 0.07662645837453036, "step": 775, "train/total_loss": 0.1331935077905655 }, { "entropy": 9.51695728302002, "epoch": 0.07672533122404587, "mean_token_accuracy": 0.7522255182266235, "num_tokens": 4045271.0, "step": 776, "train/ce_loss": 5.094786683912389e-05 }, { "epoch": 0.07672533122404587, "step": 776, "train/sim_loss": 0.1015625 }, { "epoch": 0.07672533122404587, "step": 776, "train/total_loss": 0.1015675961971283 }, { "entropy": 9.079865455627441, "epoch": 0.0768242040735614, "mean_token_accuracy": 0.7241014838218689, "num_tokens": 4050727.0, "step": 777, "train/ce_loss": 1.0284991264343262 }, { "epoch": 0.0768242040735614, "step": 777, "train/sim_loss": 0.125 }, { "epoch": 0.0768242040735614, "step": 777, "train/total_loss": 0.22784991562366486 }, { "entropy": 9.44433307647705, "epoch": 0.07692307692307693, "mean_token_accuracy": 0.7258262038230896, "num_tokens": 4056011.0, "step": 778, "train/ce_loss": 1.0514321327209473 }, { "epoch": 0.07692307692307693, "step": 778, "train/sim_loss": 0.09375 }, { "epoch": 0.07692307692307693, "step": 778, "train/total_loss": 0.1988932192325592 }, { "entropy": 9.88050651550293, "epoch": 0.07702194977259244, "mean_token_accuracy": 0.7445651888847351, "num_tokens": 4061016.0, "step": 779, "train/ce_loss": 1.3254461288452148 }, { "epoch": 0.07702194977259244, "step": 779, "train/sim_loss": 0.10546875 }, { "epoch": 0.07702194977259244, "step": 779, "train/total_loss": 0.2380133718252182 }, { "epoch": 0.07712082262210797, "grad_norm": 1.189727544784546, "learning_rate": 9.809869950056868e-06, "loss": 0.1784, "step": 780 }, { "entropy": 9.282702445983887, "epoch": 0.07712082262210797, "mean_token_accuracy": 0.7541191577911377, "num_tokens": 4066318.0, "step": 780, "train/ce_loss": 0.8596696257591248 }, { "epoch": 0.07712082262210797, "step": 780, "train/sim_loss": 0.1015625 }, { "epoch": 0.07712082262210797, "step": 780, "train/total_loss": 0.18752947449684143 }, { "entropy": 9.363134384155273, "epoch": 0.0772196954716235, "mean_token_accuracy": 0.7042440176010132, "num_tokens": 4071532.0, "step": 781, "train/ce_loss": 1.0713800191879272 }, { "epoch": 0.0772196954716235, "step": 781, "train/sim_loss": 0.15234375 }, { "epoch": 0.0772196954716235, "step": 781, "train/total_loss": 0.2594817578792572 }, { "entropy": 9.268877983093262, "epoch": 0.07731856832113901, "mean_token_accuracy": 0.7172653675079346, "num_tokens": 4076917.0, "step": 782, "train/ce_loss": 0.9807270765304565 }, { "epoch": 0.07731856832113901, "step": 782, "train/sim_loss": 0.08203125 }, { "epoch": 0.07731856832113901, "step": 782, "train/total_loss": 0.18010395765304565 }, { "entropy": 9.296899795532227, "epoch": 0.07741744117065454, "mean_token_accuracy": 0.7200435996055603, "num_tokens": 4082478.0, "step": 783, "train/ce_loss": 0.713777482509613 }, { "epoch": 0.07741744117065454, "step": 783, "train/sim_loss": 0.09765625 }, { "epoch": 0.07741744117065454, "step": 783, "train/total_loss": 0.16903400421142578 }, { "entropy": 9.328934669494629, "epoch": 0.07751631402017006, "mean_token_accuracy": 0.7242236137390137, "num_tokens": 4087763.0, "step": 784, "train/ce_loss": 0.4932112991809845 }, { "epoch": 0.07751631402017006, "step": 784, "train/sim_loss": 0.1328125 }, { "epoch": 0.07751631402017006, "step": 784, "train/total_loss": 0.18213362991809845 }, { "entropy": 9.931402206420898, "epoch": 0.07761518686968559, "mean_token_accuracy": 0.810234546661377, "num_tokens": 4092642.0, "step": 785, "train/ce_loss": 0.8886024951934814 }, { "epoch": 0.07761518686968559, "step": 785, "train/sim_loss": 0.0390625 }, { "epoch": 0.07761518686968559, "step": 785, "train/total_loss": 0.12792274355888367 }, { "entropy": 8.978738784790039, "epoch": 0.0777140597192011, "mean_token_accuracy": 0.7637795209884644, "num_tokens": 4098166.0, "step": 786, "train/ce_loss": 0.644202709197998 }, { "epoch": 0.0777140597192011, "step": 786, "train/sim_loss": 0.171875 }, { "epoch": 0.0777140597192011, "step": 786, "train/total_loss": 0.23629528284072876 }, { "entropy": 9.127129554748535, "epoch": 0.07781293256871663, "mean_token_accuracy": 0.7309812307357788, "num_tokens": 4103562.0, "step": 787, "train/ce_loss": 0.9794098138809204 }, { "epoch": 0.07781293256871663, "step": 787, "train/sim_loss": 0.109375 }, { "epoch": 0.07781293256871663, "step": 787, "train/total_loss": 0.20731598138809204 }, { "entropy": 8.971997261047363, "epoch": 0.07791180541823216, "mean_token_accuracy": 0.7138643264770508, "num_tokens": 4109055.0, "step": 788, "train/ce_loss": 1.256629467010498 }, { "epoch": 0.07791180541823216, "step": 788, "train/sim_loss": 0.0859375 }, { "epoch": 0.07791180541823216, "step": 788, "train/total_loss": 0.21160045266151428 }, { "entropy": 10.002260208129883, "epoch": 0.07801067826774767, "mean_token_accuracy": 0.7286527752876282, "num_tokens": 4113975.0, "step": 789, "train/ce_loss": 1.4703837633132935 }, { "epoch": 0.07801067826774767, "step": 789, "train/sim_loss": 0.1015625 }, { "epoch": 0.07801067826774767, "step": 789, "train/total_loss": 0.24860088527202606 }, { "entropy": 9.624162673950195, "epoch": 0.0781095511172632, "mean_token_accuracy": 0.6926286220550537, "num_tokens": 4119187.0, "step": 790, "train/ce_loss": 1.9462153911590576 }, { "epoch": 0.0781095511172632, "step": 790, "train/sim_loss": 0.1015625 }, { "epoch": 0.0781095511172632, "step": 790, "train/total_loss": 0.29618406295776367 }, { "entropy": 9.789203643798828, "epoch": 0.07820842396677873, "mean_token_accuracy": 0.6942771077156067, "num_tokens": 4124288.0, "step": 791, "train/ce_loss": 1.2474993467330933 }, { "epoch": 0.07820842396677873, "step": 791, "train/sim_loss": 0.08984375 }, { "epoch": 0.07820842396677873, "step": 791, "train/total_loss": 0.21459367871284485 }, { "entropy": 10.033029556274414, "epoch": 0.07830729681629424, "mean_token_accuracy": 0.7454175353050232, "num_tokens": 4129206.0, "step": 792, "train/ce_loss": 1.3075237274169922 }, { "epoch": 0.07830729681629424, "step": 792, "train/sim_loss": 0.078125 }, { "epoch": 0.07830729681629424, "step": 792, "train/total_loss": 0.20887736976146698 }, { "entropy": 9.54617691040039, "epoch": 0.07840616966580977, "mean_token_accuracy": 0.7549933195114136, "num_tokens": 4134381.0, "step": 793, "train/ce_loss": 0.8156450390815735 }, { "epoch": 0.07840616966580977, "step": 793, "train/sim_loss": 0.09765625 }, { "epoch": 0.07840616966580977, "step": 793, "train/total_loss": 0.1792207658290863 }, { "entropy": 9.540731430053711, "epoch": 0.0785050425153253, "mean_token_accuracy": 0.7285513281822205, "num_tokens": 4139579.0, "step": 794, "train/ce_loss": 0.6351312398910522 }, { "epoch": 0.0785050425153253, "step": 794, "train/sim_loss": 0.08984375 }, { "epoch": 0.0785050425153253, "step": 794, "train/total_loss": 0.1533568799495697 }, { "entropy": 9.890420913696289, "epoch": 0.07860391536484082, "mean_token_accuracy": 0.693493127822876, "num_tokens": 4144627.0, "step": 795, "train/ce_loss": 0.7012494206428528 }, { "epoch": 0.07860391536484082, "step": 795, "train/sim_loss": 0.08984375 }, { "epoch": 0.07860391536484082, "step": 795, "train/total_loss": 0.15996870398521423 }, { "entropy": 9.151998519897461, "epoch": 0.07870278821435633, "mean_token_accuracy": 0.7046688199043274, "num_tokens": 4150017.0, "step": 796, "train/ce_loss": 1.022414207458496 }, { "epoch": 0.07870278821435633, "step": 796, "train/sim_loss": 0.12109375 }, { "epoch": 0.07870278821435633, "step": 796, "train/total_loss": 0.2233351767063141 }, { "entropy": 10.323640823364258, "epoch": 0.07880166106387186, "mean_token_accuracy": 0.7662721872329712, "num_tokens": 4154737.0, "step": 797, "train/ce_loss": 7.12625915184617e-05 }, { "epoch": 0.07880166106387186, "step": 797, "train/sim_loss": 0.0390625 }, { "epoch": 0.07880166106387186, "step": 797, "train/total_loss": 0.03906962648034096 }, { "entropy": 9.252408981323242, "epoch": 0.07890053391338739, "mean_token_accuracy": 0.7404674291610718, "num_tokens": 4160031.0, "step": 798, "train/ce_loss": 0.7661677598953247 }, { "epoch": 0.07890053391338739, "step": 798, "train/sim_loss": 0.11328125 }, { "epoch": 0.07890053391338739, "step": 798, "train/total_loss": 0.1898980289697647 }, { "entropy": 9.693077087402344, "epoch": 0.0789994067629029, "mean_token_accuracy": 0.7832586169242859, "num_tokens": 4165172.0, "step": 799, "train/ce_loss": 0.8967200517654419 }, { "epoch": 0.0789994067629029, "step": 799, "train/sim_loss": 0.05859375 }, { "epoch": 0.0789994067629029, "step": 799, "train/total_loss": 0.1482657492160797 }, { "epoch": 0.07909827961241843, "grad_norm": 1.1841247081756592, "learning_rate": 9.804925085298918e-06, "loss": 0.1842, "step": 800 }, { "entropy": 9.122665405273438, "epoch": 0.07909827961241843, "mean_token_accuracy": 0.696703314781189, "num_tokens": 4170555.0, "step": 800, "train/ce_loss": 1.649169921875 }, { "epoch": 0.07909827961241843, "step": 800, "train/sim_loss": 0.12890625 }, { "epoch": 0.07909827961241843, "step": 800, "train/total_loss": 0.2938232421875 }, { "entropy": 9.88027572631836, "epoch": 0.07919715246193396, "mean_token_accuracy": 0.7062146663665771, "num_tokens": 4175529.0, "step": 801, "train/ce_loss": 1.0107178688049316 }, { "epoch": 0.07919715246193396, "step": 801, "train/sim_loss": 0.15234375 }, { "epoch": 0.07919715246193396, "step": 801, "train/total_loss": 0.2534155249595642 }, { "entropy": 9.497377395629883, "epoch": 0.07929602531144947, "mean_token_accuracy": 0.7316129207611084, "num_tokens": 4180700.0, "step": 802, "train/ce_loss": 1.0892964601516724 }, { "epoch": 0.07929602531144947, "step": 802, "train/sim_loss": 0.0703125 }, { "epoch": 0.07929602531144947, "step": 802, "train/total_loss": 0.17924214899539948 }, { "entropy": 9.278539657592773, "epoch": 0.079394898160965, "mean_token_accuracy": 0.7182390093803406, "num_tokens": 4185971.0, "step": 803, "train/ce_loss": 0.8930141925811768 }, { "epoch": 0.079394898160965, "step": 803, "train/sim_loss": 0.11328125 }, { "epoch": 0.079394898160965, "step": 803, "train/total_loss": 0.20258267223834991 }, { "entropy": 9.887369155883789, "epoch": 0.07949377101048052, "mean_token_accuracy": 0.71875, "num_tokens": 4190973.0, "step": 804, "train/ce_loss": 1.0374951362609863 }, { "epoch": 0.07949377101048052, "step": 804, "train/sim_loss": 0.08984375 }, { "epoch": 0.07949377101048052, "step": 804, "train/total_loss": 0.19359326362609863 }, { "entropy": 9.997756958007812, "epoch": 0.07959264385999605, "mean_token_accuracy": 0.7203791737556458, "num_tokens": 4195773.0, "step": 805, "train/ce_loss": 4.149144297116436e-05 }, { "epoch": 0.07959264385999605, "step": 805, "train/sim_loss": 0.10546875 }, { "epoch": 0.07959264385999605, "step": 805, "train/total_loss": 0.10547289997339249 }, { "entropy": 9.551786422729492, "epoch": 0.07969151670951156, "mean_token_accuracy": 0.6892109513282776, "num_tokens": 4200813.0, "step": 806, "train/ce_loss": 3.241149170207791e-05 }, { "epoch": 0.07969151670951156, "step": 806, "train/sim_loss": 0.1171875 }, { "epoch": 0.07969151670951156, "step": 806, "train/total_loss": 0.11719074100255966 }, { "entropy": 9.735570907592773, "epoch": 0.07979038955902709, "mean_token_accuracy": 0.7573872208595276, "num_tokens": 4205860.0, "step": 807, "train/ce_loss": 1.1239843368530273 }, { "epoch": 0.07979038955902709, "step": 807, "train/sim_loss": 0.12890625 }, { "epoch": 0.07979038955902709, "step": 807, "train/total_loss": 0.2413046956062317 }, { "entropy": 9.355799674987793, "epoch": 0.07988926240854262, "mean_token_accuracy": 0.748062014579773, "num_tokens": 4211070.0, "step": 808, "train/ce_loss": 1.0023411512374878 }, { "epoch": 0.07988926240854262, "step": 808, "train/sim_loss": 0.11328125 }, { "epoch": 0.07988926240854262, "step": 808, "train/total_loss": 0.21351537108421326 }, { "entropy": 9.87933349609375, "epoch": 0.07998813525805813, "mean_token_accuracy": 0.7006688714027405, "num_tokens": 4216082.0, "step": 809, "train/ce_loss": 1.4445483684539795 }, { "epoch": 0.07998813525805813, "step": 809, "train/sim_loss": 0.1015625 }, { "epoch": 0.07998813525805813, "step": 809, "train/total_loss": 0.24601733684539795 }, { "entropy": 9.267508506774902, "epoch": 0.08008700810757366, "mean_token_accuracy": 0.7016759514808655, "num_tokens": 4221494.0, "step": 810, "train/ce_loss": 1.6310838460922241 }, { "epoch": 0.08008700810757366, "step": 810, "train/sim_loss": 0.125 }, { "epoch": 0.08008700810757366, "step": 810, "train/total_loss": 0.2881084084510803 }, { "entropy": 10.011655807495117, "epoch": 0.08018588095708919, "mean_token_accuracy": 0.6947565674781799, "num_tokens": 4226468.0, "step": 811, "train/ce_loss": 2.444669246673584 }, { "epoch": 0.08018588095708919, "step": 811, "train/sim_loss": 0.1171875 }, { "epoch": 0.08018588095708919, "step": 811, "train/total_loss": 0.3616544306278229 }, { "entropy": 9.906841278076172, "epoch": 0.0802847538066047, "mean_token_accuracy": 0.6685288548469543, "num_tokens": 4231432.0, "step": 812, "train/ce_loss": 3.6316334444563836e-05 }, { "epoch": 0.0802847538066047, "step": 812, "train/sim_loss": 0.0703125 }, { "epoch": 0.0802847538066047, "step": 812, "train/total_loss": 0.0703161284327507 }, { "entropy": 9.732148170471191, "epoch": 0.08038362665612023, "mean_token_accuracy": 0.7801653146743774, "num_tokens": 4236501.0, "step": 813, "train/ce_loss": 0.6189934611320496 }, { "epoch": 0.08038362665612023, "step": 813, "train/sim_loss": 0.11328125 }, { "epoch": 0.08038362665612023, "step": 813, "train/total_loss": 0.1751805990934372 }, { "entropy": 9.342995643615723, "epoch": 0.08048249950563575, "mean_token_accuracy": 0.7265536785125732, "num_tokens": 4241872.0, "step": 814, "train/ce_loss": 1.6427977085113525 }, { "epoch": 0.08048249950563575, "step": 814, "train/sim_loss": 0.12109375 }, { "epoch": 0.08048249950563575, "step": 814, "train/total_loss": 0.2853735089302063 }, { "entropy": 9.426212310791016, "epoch": 0.08058137235515128, "mean_token_accuracy": 0.7403100728988647, "num_tokens": 4247155.0, "step": 815, "train/ce_loss": 0.9434316754341125 }, { "epoch": 0.08058137235515128, "step": 815, "train/sim_loss": 0.05078125 }, { "epoch": 0.08058137235515128, "step": 815, "train/total_loss": 0.1451244205236435 }, { "entropy": 9.224058151245117, "epoch": 0.0806802452046668, "mean_token_accuracy": 0.7601279020309448, "num_tokens": 4252553.0, "step": 816, "train/ce_loss": 0.40708041191101074 }, { "epoch": 0.0806802452046668, "step": 816, "train/sim_loss": 0.0546875 }, { "epoch": 0.0806802452046668, "step": 816, "train/total_loss": 0.0953955426812172 }, { "entropy": 9.849893569946289, "epoch": 0.08077911805418232, "mean_token_accuracy": 0.7182866334915161, "num_tokens": 4257586.0, "step": 817, "train/ce_loss": 0.7182490229606628 }, { "epoch": 0.08077911805418232, "step": 817, "train/sim_loss": 0.03125 }, { "epoch": 0.08077911805418232, "step": 817, "train/total_loss": 0.10307490080595016 }, { "entropy": 9.295900344848633, "epoch": 0.08087799090369785, "mean_token_accuracy": 0.7526754140853882, "num_tokens": 4262891.0, "step": 818, "train/ce_loss": 0.8068259954452515 }, { "epoch": 0.08087799090369785, "step": 818, "train/sim_loss": 0.0390625 }, { "epoch": 0.08087799090369785, "step": 818, "train/total_loss": 0.11974509805440903 }, { "entropy": 9.492517471313477, "epoch": 0.08097686375321336, "mean_token_accuracy": 0.6806282997131348, "num_tokens": 4268115.0, "step": 819, "train/ce_loss": 1.0423537492752075 }, { "epoch": 0.08097686375321336, "step": 819, "train/sim_loss": 0.08984375 }, { "epoch": 0.08097686375321336, "step": 819, "train/total_loss": 0.19407913088798523 }, { "epoch": 0.08107573660272889, "grad_norm": 1.2709230184555054, "learning_rate": 9.799980220540969e-06, "loss": 0.186, "step": 820 }, { "entropy": 9.600884437561035, "epoch": 0.08107573660272889, "mean_token_accuracy": 0.7112135291099548, "num_tokens": 4273215.0, "step": 820, "train/ce_loss": 0.8723232746124268 }, { "epoch": 0.08107573660272889, "step": 820, "train/sim_loss": 0.12890625 }, { "epoch": 0.08107573660272889, "step": 820, "train/total_loss": 0.2161385715007782 }, { "entropy": 10.040645599365234, "epoch": 0.08117460945224442, "mean_token_accuracy": 0.7112526297569275, "num_tokens": 4278091.0, "step": 821, "train/ce_loss": 1.208526372909546 }, { "epoch": 0.08117460945224442, "step": 821, "train/sim_loss": 0.09765625 }, { "epoch": 0.08117460945224442, "step": 821, "train/total_loss": 0.21850889921188354 }, { "entropy": 8.934160232543945, "epoch": 0.08127348230175993, "mean_token_accuracy": 0.7260825634002686, "num_tokens": 4283566.0, "step": 822, "train/ce_loss": 1.0187729597091675 }, { "epoch": 0.08127348230175993, "step": 822, "train/sim_loss": 0.125 }, { "epoch": 0.08127348230175993, "step": 822, "train/total_loss": 0.22687730193138123 }, { "entropy": 9.646869659423828, "epoch": 0.08137235515127546, "mean_token_accuracy": 0.7361563444137573, "num_tokens": 4288637.0, "step": 823, "train/ce_loss": 4.1637067624833435e-05 }, { "epoch": 0.08137235515127546, "step": 823, "train/sim_loss": 0.0390625 }, { "epoch": 0.08137235515127546, "step": 823, "train/total_loss": 0.03906666487455368 }, { "entropy": 9.507827758789062, "epoch": 0.08147122800079099, "mean_token_accuracy": 0.7921568751335144, "num_tokens": 4293902.0, "step": 824, "train/ce_loss": 0.918192446231842 }, { "epoch": 0.08147122800079099, "step": 824, "train/sim_loss": 0.09375 }, { "epoch": 0.08147122800079099, "step": 824, "train/total_loss": 0.18556925654411316 }, { "entropy": 9.711023330688477, "epoch": 0.08157010085030651, "mean_token_accuracy": 0.7573099136352539, "num_tokens": 4299053.0, "step": 825, "train/ce_loss": 0.00011551461648195982 }, { "epoch": 0.08157010085030651, "step": 825, "train/sim_loss": 0.1796875 }, { "epoch": 0.08157010085030651, "step": 825, "train/total_loss": 0.17969904839992523 }, { "entropy": 9.974448204040527, "epoch": 0.08166897369982203, "mean_token_accuracy": 0.748062014579773, "num_tokens": 4303992.0, "step": 826, "train/ce_loss": 1.3299791812896729 }, { "epoch": 0.08166897369982203, "step": 826, "train/sim_loss": 0.10546875 }, { "epoch": 0.08166897369982203, "step": 826, "train/total_loss": 0.23846666514873505 }, { "entropy": 9.31650161743164, "epoch": 0.08176784654933755, "mean_token_accuracy": 0.7368420958518982, "num_tokens": 4309328.0, "step": 827, "train/ce_loss": 0.5922544002532959 }, { "epoch": 0.08176784654933755, "step": 827, "train/sim_loss": 0.1640625 }, { "epoch": 0.08176784654933755, "step": 827, "train/total_loss": 0.2232879400253296 }, { "entropy": 9.702505111694336, "epoch": 0.08186671939885308, "mean_token_accuracy": 0.662756621837616, "num_tokens": 4314475.0, "step": 828, "train/ce_loss": 3.506695065880194e-05 }, { "epoch": 0.08186671939885308, "step": 828, "train/sim_loss": 0.0390625 }, { "epoch": 0.08186671939885308, "step": 828, "train/total_loss": 0.03906600549817085 }, { "entropy": 9.546751976013184, "epoch": 0.0819655922483686, "mean_token_accuracy": 0.733418345451355, "num_tokens": 4319698.0, "step": 829, "train/ce_loss": 1.594700813293457 }, { "epoch": 0.0819655922483686, "step": 829, "train/sim_loss": 0.1328125 }, { "epoch": 0.0819655922483686, "step": 829, "train/total_loss": 0.2922825813293457 }, { "entropy": 9.644353866577148, "epoch": 0.08206446509788412, "mean_token_accuracy": 0.752136766910553, "num_tokens": 4324752.0, "step": 830, "train/ce_loss": 0.859276294708252 }, { "epoch": 0.08206446509788412, "step": 830, "train/sim_loss": 0.0859375 }, { "epoch": 0.08206446509788412, "step": 830, "train/total_loss": 0.17186513543128967 }, { "entropy": 9.546089172363281, "epoch": 0.08216333794739965, "mean_token_accuracy": 0.7421875, "num_tokens": 4329984.0, "step": 831, "train/ce_loss": 0.4032405912876129 }, { "epoch": 0.08216333794739965, "step": 831, "train/sim_loss": 0.0703125 }, { "epoch": 0.08216333794739965, "step": 831, "train/total_loss": 0.11063656210899353 }, { "entropy": 10.189659118652344, "epoch": 0.08226221079691516, "mean_token_accuracy": 0.8125, "num_tokens": 4334821.0, "step": 832, "train/ce_loss": 5.903546480112709e-05 }, { "epoch": 0.08226221079691516, "step": 832, "train/sim_loss": 0.03125 }, { "epoch": 0.08226221079691516, "step": 832, "train/total_loss": 0.03125590458512306 }, { "entropy": 9.257970809936523, "epoch": 0.08236108364643069, "mean_token_accuracy": 0.7412140369415283, "num_tokens": 4340191.0, "step": 833, "train/ce_loss": 1.0270909070968628 }, { "epoch": 0.08236108364643069, "step": 833, "train/sim_loss": 0.10546875 }, { "epoch": 0.08236108364643069, "step": 833, "train/total_loss": 0.2081778347492218 }, { "entropy": 9.155115127563477, "epoch": 0.08245995649594622, "mean_token_accuracy": 0.7144288420677185, "num_tokens": 4345735.0, "step": 834, "train/ce_loss": 0.3554162085056305 }, { "epoch": 0.08245995649594622, "step": 834, "train/sim_loss": 0.09765625 }, { "epoch": 0.08245995649594622, "step": 834, "train/total_loss": 0.1331978738307953 }, { "entropy": 9.520933151245117, "epoch": 0.08255882934546174, "mean_token_accuracy": 0.6748299598693848, "num_tokens": 4350961.0, "step": 835, "train/ce_loss": 1.509456992149353 }, { "epoch": 0.08255882934546174, "step": 835, "train/sim_loss": 0.109375 }, { "epoch": 0.08255882934546174, "step": 835, "train/total_loss": 0.2603207230567932 }, { "entropy": 9.509539604187012, "epoch": 0.08265770219497726, "mean_token_accuracy": 0.7180616855621338, "num_tokens": 4356047.0, "step": 836, "train/ce_loss": 1.479731559753418 }, { "epoch": 0.08265770219497726, "step": 836, "train/sim_loss": 0.15234375 }, { "epoch": 0.08265770219497726, "step": 836, "train/total_loss": 0.3003169298171997 }, { "entropy": 9.277669906616211, "epoch": 0.08275657504449278, "mean_token_accuracy": 0.7243436574935913, "num_tokens": 4361380.0, "step": 837, "train/ce_loss": 0.9445345997810364 }, { "epoch": 0.08275657504449278, "step": 837, "train/sim_loss": 0.16015625 }, { "epoch": 0.08275657504449278, "step": 837, "train/total_loss": 0.25460970401763916 }, { "entropy": 9.328760147094727, "epoch": 0.08285544789400831, "mean_token_accuracy": 0.7488937973976135, "num_tokens": 4366759.0, "step": 838, "train/ce_loss": 0.5685387253761292 }, { "epoch": 0.08285544789400831, "step": 838, "train/sim_loss": 0.09765625 }, { "epoch": 0.08285544789400831, "step": 838, "train/total_loss": 0.15451012551784515 }, { "entropy": 9.904058456420898, "epoch": 0.08295432074352382, "mean_token_accuracy": 0.7512953281402588, "num_tokens": 4371773.0, "step": 839, "train/ce_loss": 1.4188343286514282 }, { "epoch": 0.08295432074352382, "step": 839, "train/sim_loss": 0.08984375 }, { "epoch": 0.08295432074352382, "step": 839, "train/total_loss": 0.23172718286514282 }, { "epoch": 0.08305319359303935, "grad_norm": 1.0367178916931152, "learning_rate": 9.79503535578302e-06, "loss": 0.1826, "step": 840 }, { "entropy": 9.342744827270508, "epoch": 0.08305319359303935, "mean_token_accuracy": 0.7129629850387573, "num_tokens": 4377104.0, "step": 840, "train/ce_loss": 0.6612659096717834 }, { "epoch": 0.08305319359303935, "step": 840, "train/sim_loss": 0.0703125 }, { "epoch": 0.08305319359303935, "step": 840, "train/total_loss": 0.13643908500671387 }, { "entropy": 9.328734397888184, "epoch": 0.08315206644255488, "mean_token_accuracy": 0.717674970626831, "num_tokens": 4382420.0, "step": 841, "train/ce_loss": 1.2937597036361694 }, { "epoch": 0.08315206644255488, "step": 841, "train/sim_loss": 0.1328125 }, { "epoch": 0.08315206644255488, "step": 841, "train/total_loss": 0.26218849420547485 }, { "entropy": 9.546821594238281, "epoch": 0.08325093929207039, "mean_token_accuracy": 0.691428542137146, "num_tokens": 4387593.0, "step": 842, "train/ce_loss": 0.7460771203041077 }, { "epoch": 0.08325093929207039, "step": 842, "train/sim_loss": 0.1015625 }, { "epoch": 0.08325093929207039, "step": 842, "train/total_loss": 0.176170215010643 }, { "entropy": 9.157147407531738, "epoch": 0.08334981214158592, "mean_token_accuracy": 0.7199147939682007, "num_tokens": 4393014.0, "step": 843, "train/ce_loss": 0.7931463122367859 }, { "epoch": 0.08334981214158592, "step": 843, "train/sim_loss": 0.09765625 }, { "epoch": 0.08334981214158592, "step": 843, "train/total_loss": 0.17697088420391083 }, { "entropy": 9.803672790527344, "epoch": 0.08344868499110145, "mean_token_accuracy": 0.6762917637825012, "num_tokens": 4398141.0, "step": 844, "train/ce_loss": 0.8283305764198303 }, { "epoch": 0.08344868499110145, "step": 844, "train/sim_loss": 0.1015625 }, { "epoch": 0.08344868499110145, "step": 844, "train/total_loss": 0.18439555168151855 }, { "entropy": 9.153200149536133, "epoch": 0.08354755784061697, "mean_token_accuracy": 0.6939426064491272, "num_tokens": 4403592.0, "step": 845, "train/ce_loss": 0.9548829197883606 }, { "epoch": 0.08354755784061697, "step": 845, "train/sim_loss": 0.1328125 }, { "epoch": 0.08354755784061697, "step": 845, "train/total_loss": 0.2283007949590683 }, { "entropy": 9.16877555847168, "epoch": 0.08364643069013249, "mean_token_accuracy": 0.6962785124778748, "num_tokens": 4408930.0, "step": 846, "train/ce_loss": 0.8261193633079529 }, { "epoch": 0.08364643069013249, "step": 846, "train/sim_loss": 0.08984375 }, { "epoch": 0.08364643069013249, "step": 846, "train/total_loss": 0.17245569825172424 }, { "entropy": 9.682516098022461, "epoch": 0.08374530353964801, "mean_token_accuracy": 0.7147335410118103, "num_tokens": 4413996.0, "step": 847, "train/ce_loss": 1.019052505493164 }, { "epoch": 0.08374530353964801, "step": 847, "train/sim_loss": 0.0703125 }, { "epoch": 0.08374530353964801, "step": 847, "train/total_loss": 0.17221775650978088 }, { "entropy": 9.821207046508789, "epoch": 0.08384417638916354, "mean_token_accuracy": 0.723747968673706, "num_tokens": 4419084.0, "step": 848, "train/ce_loss": 1.4414478540420532 }, { "epoch": 0.08384417638916354, "step": 848, "train/sim_loss": 0.14453125 }, { "epoch": 0.08384417638916354, "step": 848, "train/total_loss": 0.28867602348327637 }, { "entropy": 10.034945487976074, "epoch": 0.08394304923867905, "mean_token_accuracy": 0.7482993006706238, "num_tokens": 4423942.0, "step": 849, "train/ce_loss": 2.0451998710632324 }, { "epoch": 0.08394304923867905, "step": 849, "train/sim_loss": 0.09765625 }, { "epoch": 0.08394304923867905, "step": 849, "train/total_loss": 0.30217623710632324 }, { "entropy": 9.162328720092773, "epoch": 0.08404192208819458, "mean_token_accuracy": 0.7416148781776428, "num_tokens": 4429239.0, "step": 850, "train/ce_loss": 0.6268206834793091 }, { "epoch": 0.08404192208819458, "step": 850, "train/sim_loss": 0.0390625 }, { "epoch": 0.08404192208819458, "step": 850, "train/total_loss": 0.10174456983804703 }, { "entropy": 9.801444053649902, "epoch": 0.08414079493771011, "mean_token_accuracy": 0.7170172333717346, "num_tokens": 4434188.0, "step": 851, "train/ce_loss": 5.47383569937665e-05 }, { "epoch": 0.08414079493771011, "step": 851, "train/sim_loss": 0.0546875 }, { "epoch": 0.08414079493771011, "step": 851, "train/total_loss": 0.05469297245144844 }, { "entropy": 10.159244537353516, "epoch": 0.08423966778722562, "mean_token_accuracy": 0.7085019946098328, "num_tokens": 4439049.0, "step": 852, "train/ce_loss": 3.2078307413030416e-05 }, { "epoch": 0.08423966778722562, "step": 852, "train/sim_loss": 0.06640625 }, { "epoch": 0.08423966778722562, "step": 852, "train/total_loss": 0.06640946120023727 }, { "entropy": 10.302398681640625, "epoch": 0.08433854063674115, "mean_token_accuracy": 0.7385057210922241, "num_tokens": 4443823.0, "step": 853, "train/ce_loss": 4.880238338955678e-05 }, { "epoch": 0.08433854063674115, "step": 853, "train/sim_loss": 0.0703125 }, { "epoch": 0.08433854063674115, "step": 853, "train/total_loss": 0.07031738013029099 }, { "entropy": 9.40043830871582, "epoch": 0.08443741348625668, "mean_token_accuracy": 0.6945169568061829, "num_tokens": 4449051.0, "step": 854, "train/ce_loss": 1.1474387645721436 }, { "epoch": 0.08443741348625668, "step": 854, "train/sim_loss": 0.12109375 }, { "epoch": 0.08443741348625668, "step": 854, "train/total_loss": 0.2358376383781433 }, { "entropy": 9.591285705566406, "epoch": 0.0845362863357722, "mean_token_accuracy": 0.7363494634628296, "num_tokens": 4454147.0, "step": 855, "train/ce_loss": 0.8136089444160461 }, { "epoch": 0.0845362863357722, "step": 855, "train/sim_loss": 0.0625 }, { "epoch": 0.0845362863357722, "step": 855, "train/total_loss": 0.14386090636253357 }, { "entropy": 9.808528900146484, "epoch": 0.08463515918528772, "mean_token_accuracy": 0.7299145460128784, "num_tokens": 4459176.0, "step": 856, "train/ce_loss": 0.8623424768447876 }, { "epoch": 0.08463515918528772, "step": 856, "train/sim_loss": 0.08984375 }, { "epoch": 0.08463515918528772, "step": 856, "train/total_loss": 0.17607799172401428 }, { "entropy": 9.614444732666016, "epoch": 0.08473403203480324, "mean_token_accuracy": 0.7460992932319641, "num_tokens": 4464368.0, "step": 857, "train/ce_loss": 0.6678289771080017 }, { "epoch": 0.08473403203480324, "step": 857, "train/sim_loss": 0.05078125 }, { "epoch": 0.08473403203480324, "step": 857, "train/total_loss": 0.11756414920091629 }, { "entropy": 9.298678398132324, "epoch": 0.08483290488431877, "mean_token_accuracy": 0.7164179086685181, "num_tokens": 4469722.0, "step": 858, "train/ce_loss": 0.6989213228225708 }, { "epoch": 0.08483290488431877, "step": 858, "train/sim_loss": 0.046875 }, { "epoch": 0.08483290488431877, "step": 858, "train/total_loss": 0.11676713079214096 }, { "entropy": 9.135697364807129, "epoch": 0.08493177773383428, "mean_token_accuracy": 0.6835051774978638, "num_tokens": 4475159.0, "step": 859, "train/ce_loss": 1.6708698272705078 }, { "epoch": 0.08493177773383428, "step": 859, "train/sim_loss": 0.140625 }, { "epoch": 0.08493177773383428, "step": 859, "train/total_loss": 0.30771198868751526 }, { "epoch": 0.08503065058334981, "grad_norm": 1.4547375440597534, "learning_rate": 9.790090491025071e-06, "loss": 0.1914, "step": 860 }, { "entropy": 8.923101425170898, "epoch": 0.08503065058334981, "mean_token_accuracy": 0.6813910007476807, "num_tokens": 4480776.0, "step": 860, "train/ce_loss": 0.9627625942230225 }, { "epoch": 0.08503065058334981, "step": 860, "train/sim_loss": 0.1171875 }, { "epoch": 0.08503065058334981, "step": 860, "train/total_loss": 0.21346375346183777 }, { "entropy": 9.888015747070312, "epoch": 0.08512952343286534, "mean_token_accuracy": 0.6981450319290161, "num_tokens": 4485823.0, "step": 861, "train/ce_loss": 1.032806634902954 }, { "epoch": 0.08512952343286534, "step": 861, "train/sim_loss": 0.140625 }, { "epoch": 0.08512952343286534, "step": 861, "train/total_loss": 0.2439056634902954 }, { "entropy": 9.753445625305176, "epoch": 0.08522839628238085, "mean_token_accuracy": 0.70597243309021, "num_tokens": 4490898.0, "step": 862, "train/ce_loss": 2.412477731704712 }, { "epoch": 0.08522839628238085, "step": 862, "train/sim_loss": 0.1328125 }, { "epoch": 0.08522839628238085, "step": 862, "train/total_loss": 0.3740602731704712 }, { "entropy": 9.161497116088867, "epoch": 0.08532726913189638, "mean_token_accuracy": 0.7283422350883484, "num_tokens": 4496297.0, "step": 863, "train/ce_loss": 0.9021885395050049 }, { "epoch": 0.08532726913189638, "step": 863, "train/sim_loss": 0.0390625 }, { "epoch": 0.08532726913189638, "step": 863, "train/total_loss": 0.12928135693073273 }, { "entropy": 9.778253555297852, "epoch": 0.08542614198141191, "mean_token_accuracy": 0.710616409778595, "num_tokens": 4501313.0, "step": 864, "train/ce_loss": 0.8009510040283203 }, { "epoch": 0.08542614198141191, "step": 864, "train/sim_loss": 0.05859375 }, { "epoch": 0.08542614198141191, "step": 864, "train/total_loss": 0.138688862323761 }, { "entropy": 9.464872360229492, "epoch": 0.08552501483092742, "mean_token_accuracy": 0.7915531396865845, "num_tokens": 4506536.0, "step": 865, "train/ce_loss": 0.520185649394989 }, { "epoch": 0.08552501483092742, "step": 865, "train/sim_loss": 0.03125 }, { "epoch": 0.08552501483092742, "step": 865, "train/total_loss": 0.08326856791973114 }, { "entropy": 10.019142150878906, "epoch": 0.08562388768044295, "mean_token_accuracy": 0.7115043997764587, "num_tokens": 4511517.0, "step": 866, "train/ce_loss": 0.0004360276216175407 }, { "epoch": 0.08562388768044295, "step": 866, "train/sim_loss": 0.078125 }, { "epoch": 0.08562388768044295, "step": 866, "train/total_loss": 0.0781686007976532 }, { "entropy": 9.592554092407227, "epoch": 0.08572276052995847, "mean_token_accuracy": 0.6846985816955566, "num_tokens": 4516633.0, "step": 867, "train/ce_loss": 1.4119404554367065 }, { "epoch": 0.08572276052995847, "step": 867, "train/sim_loss": 0.08984375 }, { "epoch": 0.08572276052995847, "step": 867, "train/total_loss": 0.23103779554367065 }, { "entropy": 9.140426635742188, "epoch": 0.085821633379474, "mean_token_accuracy": 0.7066666483879089, "num_tokens": 4522097.0, "step": 868, "train/ce_loss": 0.7785761952400208 }, { "epoch": 0.085821633379474, "step": 868, "train/sim_loss": 0.0546875 }, { "epoch": 0.085821633379474, "step": 868, "train/total_loss": 0.1325451135635376 }, { "entropy": 9.77696418762207, "epoch": 0.08592050622898952, "mean_token_accuracy": 0.804958701133728, "num_tokens": 4527137.0, "step": 869, "train/ce_loss": 0.9853973388671875 }, { "epoch": 0.08592050622898952, "step": 869, "train/sim_loss": 0.125 }, { "epoch": 0.08592050622898952, "step": 869, "train/total_loss": 0.22353973984718323 }, { "entropy": 9.248411178588867, "epoch": 0.08601937907850504, "mean_token_accuracy": 0.7294981479644775, "num_tokens": 4532446.0, "step": 870, "train/ce_loss": 0.5703291296958923 }, { "epoch": 0.08601937907850504, "step": 870, "train/sim_loss": 0.0390625 }, { "epoch": 0.08601937907850504, "step": 870, "train/total_loss": 0.09609541296958923 }, { "entropy": 9.832139015197754, "epoch": 0.08611825192802057, "mean_token_accuracy": 0.7669421434402466, "num_tokens": 4537515.0, "step": 871, "train/ce_loss": 0.8696778416633606 }, { "epoch": 0.08611825192802057, "step": 871, "train/sim_loss": 0.0703125 }, { "epoch": 0.08611825192802057, "step": 871, "train/total_loss": 0.15728029608726501 }, { "entropy": 9.310486793518066, "epoch": 0.08621712477753608, "mean_token_accuracy": 0.6934306621551514, "num_tokens": 4542820.0, "step": 872, "train/ce_loss": 0.6153068542480469 }, { "epoch": 0.08621712477753608, "step": 872, "train/sim_loss": 0.1484375 }, { "epoch": 0.08621712477753608, "step": 872, "train/total_loss": 0.2099681794643402 }, { "entropy": 10.34494400024414, "epoch": 0.08631599762705161, "mean_token_accuracy": 0.8199999928474426, "num_tokens": 4547574.0, "step": 873, "train/ce_loss": 1.1924973726272583 }, { "epoch": 0.08631599762705161, "step": 873, "train/sim_loss": 0.13671875 }, { "epoch": 0.08631599762705161, "step": 873, "train/total_loss": 0.25596848130226135 }, { "entropy": 9.449071884155273, "epoch": 0.08641487047656714, "mean_token_accuracy": 0.7564259767532349, "num_tokens": 4552805.0, "step": 874, "train/ce_loss": 0.8073897957801819 }, { "epoch": 0.08641487047656714, "step": 874, "train/sim_loss": 0.109375 }, { "epoch": 0.08641487047656714, "step": 874, "train/total_loss": 0.19011399149894714 }, { "entropy": 9.491059303283691, "epoch": 0.08651374332608265, "mean_token_accuracy": 0.7123655676841736, "num_tokens": 4558024.0, "step": 875, "train/ce_loss": 1.0081188678741455 }, { "epoch": 0.08651374332608265, "step": 875, "train/sim_loss": 0.0859375 }, { "epoch": 0.08651374332608265, "step": 875, "train/total_loss": 0.1867493987083435 }, { "entropy": 9.761913299560547, "epoch": 0.08661261617559818, "mean_token_accuracy": 0.6963696479797363, "num_tokens": 4563031.0, "step": 876, "train/ce_loss": 1.437301754951477 }, { "epoch": 0.08661261617559818, "step": 876, "train/sim_loss": 0.109375 }, { "epoch": 0.08661261617559818, "step": 876, "train/total_loss": 0.25310516357421875 }, { "entropy": 9.131916046142578, "epoch": 0.0867114890251137, "mean_token_accuracy": 0.7195122241973877, "num_tokens": 4568486.0, "step": 877, "train/ce_loss": 0.7580614686012268 }, { "epoch": 0.0867114890251137, "step": 877, "train/sim_loss": 0.1171875 }, { "epoch": 0.0867114890251137, "step": 877, "train/total_loss": 0.1929936408996582 }, { "entropy": 9.216597557067871, "epoch": 0.08681036187462923, "mean_token_accuracy": 0.6851248741149902, "num_tokens": 4573853.0, "step": 878, "train/ce_loss": 1.3044555187225342 }, { "epoch": 0.08681036187462923, "step": 878, "train/sim_loss": 0.10546875 }, { "epoch": 0.08681036187462923, "step": 878, "train/total_loss": 0.23591430485248566 }, { "entropy": 9.29898452758789, "epoch": 0.08690923472414475, "mean_token_accuracy": 0.7606936693191528, "num_tokens": 4579205.0, "step": 879, "train/ce_loss": 0.6638868451118469 }, { "epoch": 0.08690923472414475, "step": 879, "train/sim_loss": 0.0859375 }, { "epoch": 0.08690923472414475, "step": 879, "train/total_loss": 0.15232619643211365 }, { "epoch": 0.08700810757366027, "grad_norm": 1.011390209197998, "learning_rate": 9.785145626267124e-06, "loss": 0.1816, "step": 880 }, { "entropy": 9.205062866210938, "epoch": 0.08700810757366027, "mean_token_accuracy": 0.7522421479225159, "num_tokens": 4584625.0, "step": 880, "train/ce_loss": 0.7033084630966187 }, { "epoch": 0.08700810757366027, "step": 880, "train/sim_loss": 0.03125 }, { "epoch": 0.08700810757366027, "step": 880, "train/total_loss": 0.10158085078001022 }, { "entropy": 9.204500198364258, "epoch": 0.0871069804231758, "mean_token_accuracy": 0.7185500860214233, "num_tokens": 4590024.0, "step": 881, "train/ce_loss": 1.1597356796264648 }, { "epoch": 0.0871069804231758, "step": 881, "train/sim_loss": 0.07421875 }, { "epoch": 0.0871069804231758, "step": 881, "train/total_loss": 0.190192312002182 }, { "entropy": 9.53515625, "epoch": 0.08720585327269131, "mean_token_accuracy": 0.6879194378852844, "num_tokens": 4595085.0, "step": 882, "train/ce_loss": 2.1317622661590576 }, { "epoch": 0.08720585327269131, "step": 882, "train/sim_loss": 0.08203125 }, { "epoch": 0.08720585327269131, "step": 882, "train/total_loss": 0.29520750045776367 }, { "entropy": 9.357887268066406, "epoch": 0.08730472612220684, "mean_token_accuracy": 0.7347418069839478, "num_tokens": 4600452.0, "step": 883, "train/ce_loss": 1.2176082134246826 }, { "epoch": 0.08730472612220684, "step": 883, "train/sim_loss": 0.1171875 }, { "epoch": 0.08730472612220684, "step": 883, "train/total_loss": 0.23894831538200378 }, { "entropy": 9.738540649414062, "epoch": 0.08740359897172237, "mean_token_accuracy": 0.7417762875556946, "num_tokens": 4605515.0, "step": 884, "train/ce_loss": 3.276742063462734e-05 }, { "epoch": 0.08740359897172237, "step": 884, "train/sim_loss": 0.04296875 }, { "epoch": 0.08740359897172237, "step": 884, "train/total_loss": 0.042972028255462646 }, { "entropy": 9.246567726135254, "epoch": 0.08750247182123788, "mean_token_accuracy": 0.681664764881134, "num_tokens": 4610849.0, "step": 885, "train/ce_loss": 1.3851035833358765 }, { "epoch": 0.08750247182123788, "step": 885, "train/sim_loss": 0.1484375 }, { "epoch": 0.08750247182123788, "step": 885, "train/total_loss": 0.2869478464126587 }, { "entropy": 9.525009155273438, "epoch": 0.08760134467075341, "mean_token_accuracy": 0.7420249581336975, "num_tokens": 4616043.0, "step": 886, "train/ce_loss": 0.9769114255905151 }, { "epoch": 0.08760134467075341, "step": 886, "train/sim_loss": 0.0859375 }, { "epoch": 0.08760134467075341, "step": 886, "train/total_loss": 0.183628648519516 }, { "entropy": 9.74372386932373, "epoch": 0.08770021752026894, "mean_token_accuracy": 0.7533556818962097, "num_tokens": 4621077.0, "step": 887, "train/ce_loss": 0.6396549940109253 }, { "epoch": 0.08770021752026894, "step": 887, "train/sim_loss": 0.0859375 }, { "epoch": 0.08770021752026894, "step": 887, "train/total_loss": 0.14990299940109253 }, { "entropy": 9.419316291809082, "epoch": 0.08779909036978446, "mean_token_accuracy": 0.767471432685852, "num_tokens": 4626323.0, "step": 888, "train/ce_loss": 0.7446501851081848 }, { "epoch": 0.08779909036978446, "step": 888, "train/sim_loss": 0.0390625 }, { "epoch": 0.08779909036978446, "step": 888, "train/total_loss": 0.11352752149105072 }, { "entropy": 9.304037094116211, "epoch": 0.08789796321929998, "mean_token_accuracy": 0.6585366129875183, "num_tokens": 4631525.0, "step": 889, "train/ce_loss": 2.263930320739746 }, { "epoch": 0.08789796321929998, "step": 889, "train/sim_loss": 0.109375 }, { "epoch": 0.08789796321929998, "step": 889, "train/total_loss": 0.33576804399490356 }, { "entropy": 9.804574966430664, "epoch": 0.0879968360688155, "mean_token_accuracy": 0.7204968929290771, "num_tokens": 4636601.0, "step": 890, "train/ce_loss": 1.3510684967041016 }, { "epoch": 0.0879968360688155, "step": 890, "train/sim_loss": 0.09765625 }, { "epoch": 0.0879968360688155, "step": 890, "train/total_loss": 0.23276309669017792 }, { "entropy": 9.326436042785645, "epoch": 0.08809570891833103, "mean_token_accuracy": 0.7505882382392883, "num_tokens": 4641903.0, "step": 891, "train/ce_loss": 0.6687228679656982 }, { "epoch": 0.08809570891833103, "step": 891, "train/sim_loss": 0.09375 }, { "epoch": 0.08809570891833103, "step": 891, "train/total_loss": 0.16062229871749878 }, { "entropy": 9.97100830078125, "epoch": 0.08819458176784654, "mean_token_accuracy": 0.6701940298080444, "num_tokens": 4646893.0, "step": 892, "train/ce_loss": 1.1901462078094482 }, { "epoch": 0.08819458176784654, "step": 892, "train/sim_loss": 0.11328125 }, { "epoch": 0.08819458176784654, "step": 892, "train/total_loss": 0.23229587078094482 }, { "entropy": 9.102214813232422, "epoch": 0.08829345461736207, "mean_token_accuracy": 0.7774358987808228, "num_tokens": 4652394.0, "step": 893, "train/ce_loss": 1.2304139137268066 }, { "epoch": 0.08829345461736207, "step": 893, "train/sim_loss": 0.12890625 }, { "epoch": 0.08829345461736207, "step": 893, "train/total_loss": 0.25194764137268066 }, { "entropy": 9.257131576538086, "epoch": 0.0883923274668776, "mean_token_accuracy": 0.7405345439910889, "num_tokens": 4657736.0, "step": 894, "train/ce_loss": 0.5056064128875732 }, { "epoch": 0.0883923274668776, "step": 894, "train/sim_loss": 0.046875 }, { "epoch": 0.0883923274668776, "step": 894, "train/total_loss": 0.09743563830852509 }, { "entropy": 9.271703720092773, "epoch": 0.08849120031639311, "mean_token_accuracy": 0.7413395047187805, "num_tokens": 4663017.0, "step": 895, "train/ce_loss": 0.7187294363975525 }, { "epoch": 0.08849120031639311, "step": 895, "train/sim_loss": 0.09375 }, { "epoch": 0.08849120031639311, "step": 895, "train/total_loss": 0.16562294960021973 }, { "entropy": 9.240007400512695, "epoch": 0.08859007316590864, "mean_token_accuracy": 0.7541766166687012, "num_tokens": 4668301.0, "step": 896, "train/ce_loss": 0.8012892007827759 }, { "epoch": 0.08859007316590864, "step": 896, "train/sim_loss": 0.10546875 }, { "epoch": 0.08859007316590864, "step": 896, "train/total_loss": 0.18559767305850983 }, { "entropy": 8.960734367370605, "epoch": 0.08868894601542417, "mean_token_accuracy": 0.6455331444740295, "num_tokens": 4673841.0, "step": 897, "train/ce_loss": 1.7212449312210083 }, { "epoch": 0.08868894601542417, "step": 897, "train/sim_loss": 0.09765625 }, { "epoch": 0.08868894601542417, "step": 897, "train/total_loss": 0.2697807550430298 }, { "entropy": 9.661792755126953, "epoch": 0.0887878188649397, "mean_token_accuracy": 0.7244318127632141, "num_tokens": 4678986.0, "step": 898, "train/ce_loss": 2.423352088953834e-05 }, { "epoch": 0.0887878188649397, "step": 898, "train/sim_loss": 0.0390625 }, { "epoch": 0.0887878188649397, "step": 898, "train/total_loss": 0.0390649251639843 }, { "entropy": 9.03544807434082, "epoch": 0.08888669171445521, "mean_token_accuracy": 0.7222787141799927, "num_tokens": 4684460.0, "step": 899, "train/ce_loss": 1.1360260248184204 }, { "epoch": 0.08888669171445521, "step": 899, "train/sim_loss": 0.14453125 }, { "epoch": 0.08888669171445521, "step": 899, "train/total_loss": 0.2581338584423065 }, { "epoch": 0.08898556456397073, "grad_norm": 1.515589714050293, "learning_rate": 9.780200761509172e-06, "loss": 0.1777, "step": 900 }, { "entropy": 9.385675430297852, "epoch": 0.08898556456397073, "mean_token_accuracy": 0.6979310512542725, "num_tokens": 4689651.0, "step": 900, "train/ce_loss": 0.9100170731544495 }, { "epoch": 0.08898556456397073, "step": 900, "train/sim_loss": 0.1484375 }, { "epoch": 0.08898556456397073, "step": 900, "train/total_loss": 0.2394392192363739 }, { "entropy": 9.832839965820312, "epoch": 0.08908443741348626, "mean_token_accuracy": 0.7122302055358887, "num_tokens": 4694651.0, "step": 901, "train/ce_loss": 1.3100913763046265 }, { "epoch": 0.08908443741348626, "step": 901, "train/sim_loss": 0.08203125 }, { "epoch": 0.08908443741348626, "step": 901, "train/total_loss": 0.21304039657115936 }, { "entropy": 9.541091918945312, "epoch": 0.08918331026300177, "mean_token_accuracy": 0.7403973340988159, "num_tokens": 4699878.0, "step": 902, "train/ce_loss": 0.7895435094833374 }, { "epoch": 0.08918331026300177, "step": 902, "train/sim_loss": 0.05078125 }, { "epoch": 0.08918331026300177, "step": 902, "train/total_loss": 0.12973560392856598 }, { "entropy": 9.55825424194336, "epoch": 0.0892821831125173, "mean_token_accuracy": 0.7576974630355835, "num_tokens": 4705071.0, "step": 903, "train/ce_loss": 0.7540633082389832 }, { "epoch": 0.0892821831125173, "step": 903, "train/sim_loss": 0.0390625 }, { "epoch": 0.0892821831125173, "step": 903, "train/total_loss": 0.11446883529424667 }, { "entropy": 9.702561378479004, "epoch": 0.08938105596203283, "mean_token_accuracy": 0.7054908275604248, "num_tokens": 4710161.0, "step": 904, "train/ce_loss": 6.917696737218648e-05 }, { "epoch": 0.08938105596203283, "step": 904, "train/sim_loss": 0.09765625 }, { "epoch": 0.08938105596203283, "step": 904, "train/total_loss": 0.09766316413879395 }, { "entropy": 9.32586669921875, "epoch": 0.08947992881154834, "mean_token_accuracy": 0.7013463973999023, "num_tokens": 4715421.0, "step": 905, "train/ce_loss": 0.6814620494842529 }, { "epoch": 0.08947992881154834, "step": 905, "train/sim_loss": 0.109375 }, { "epoch": 0.08947992881154834, "step": 905, "train/total_loss": 0.17752119898796082 }, { "entropy": 9.214298248291016, "epoch": 0.08957880166106387, "mean_token_accuracy": 0.7730569839477539, "num_tokens": 4720850.0, "step": 906, "train/ce_loss": 0.6316813826560974 }, { "epoch": 0.08957880166106387, "step": 906, "train/sim_loss": 0.0390625 }, { "epoch": 0.08957880166106387, "step": 906, "train/total_loss": 0.10223063826560974 }, { "entropy": 8.960771560668945, "epoch": 0.0896776745105794, "mean_token_accuracy": 0.804950475692749, "num_tokens": 4726345.0, "step": 907, "train/ce_loss": 0.5146143436431885 }, { "epoch": 0.0896776745105794, "step": 907, "train/sim_loss": 0.03125 }, { "epoch": 0.0896776745105794, "step": 907, "train/total_loss": 0.08271143585443497 }, { "entropy": 9.457979202270508, "epoch": 0.08977654736009492, "mean_token_accuracy": 0.7007672786712646, "num_tokens": 4731594.0, "step": 908, "train/ce_loss": 0.9663236737251282 }, { "epoch": 0.08977654736009492, "step": 908, "train/sim_loss": 0.0703125 }, { "epoch": 0.08977654736009492, "step": 908, "train/total_loss": 0.16694486141204834 }, { "entropy": 9.720237731933594, "epoch": 0.08987542020961044, "mean_token_accuracy": 0.6881405711174011, "num_tokens": 4736726.0, "step": 909, "train/ce_loss": 1.5250821113586426 }, { "epoch": 0.08987542020961044, "step": 909, "train/sim_loss": 0.1171875 }, { "epoch": 0.08987542020961044, "step": 909, "train/total_loss": 0.2696956992149353 }, { "entropy": 9.777441024780273, "epoch": 0.08997429305912596, "mean_token_accuracy": 0.718196451663971, "num_tokens": 4741758.0, "step": 910, "train/ce_loss": 1.7164819240570068 }, { "epoch": 0.08997429305912596, "step": 910, "train/sim_loss": 0.06640625 }, { "epoch": 0.08997429305912596, "step": 910, "train/total_loss": 0.23805443942546844 }, { "entropy": 10.040216445922852, "epoch": 0.09007316590864149, "mean_token_accuracy": 0.75, "num_tokens": 4746633.0, "step": 911, "train/ce_loss": 1.6431957483291626 }, { "epoch": 0.09007316590864149, "step": 911, "train/sim_loss": 0.09375 }, { "epoch": 0.09007316590864149, "step": 911, "train/total_loss": 0.25806957483291626 }, { "entropy": 9.284655570983887, "epoch": 0.090172038758157, "mean_token_accuracy": 0.7226791977882385, "num_tokens": 4751974.0, "step": 912, "train/ce_loss": 0.9069715142250061 }, { "epoch": 0.090172038758157, "step": 912, "train/sim_loss": 0.1015625 }, { "epoch": 0.090172038758157, "step": 912, "train/total_loss": 0.19225965440273285 }, { "entropy": 9.134580612182617, "epoch": 0.09027091160767253, "mean_token_accuracy": 0.8069105744361877, "num_tokens": 4757651.0, "step": 913, "train/ce_loss": 0.661210834980011 }, { "epoch": 0.09027091160767253, "step": 913, "train/sim_loss": 0.1328125 }, { "epoch": 0.09027091160767253, "step": 913, "train/total_loss": 0.19893358647823334 }, { "entropy": 9.497674942016602, "epoch": 0.09036978445718806, "mean_token_accuracy": 0.7619718313217163, "num_tokens": 4762807.0, "step": 914, "train/ce_loss": 0.8399479389190674 }, { "epoch": 0.09036978445718806, "step": 914, "train/sim_loss": 0.03515625 }, { "epoch": 0.09036978445718806, "step": 914, "train/total_loss": 0.1191510483622551 }, { "entropy": 9.850480079650879, "epoch": 0.09046865730670357, "mean_token_accuracy": 0.7324414849281311, "num_tokens": 4767810.0, "step": 915, "train/ce_loss": 2.1993157133692876e-05 }, { "epoch": 0.09046865730670357, "step": 915, "train/sim_loss": 0.03515625 }, { "epoch": 0.09046865730670357, "step": 915, "train/total_loss": 0.03515844792127609 }, { "entropy": 9.506845474243164, "epoch": 0.0905675301562191, "mean_token_accuracy": 0.7412223815917969, "num_tokens": 4773030.0, "step": 916, "train/ce_loss": 0.8922023773193359 }, { "epoch": 0.0905675301562191, "step": 916, "train/sim_loss": 0.08984375 }, { "epoch": 0.0905675301562191, "step": 916, "train/total_loss": 0.17906399071216583 }, { "entropy": 9.768218040466309, "epoch": 0.09066640300573463, "mean_token_accuracy": 0.7072418928146362, "num_tokens": 4778103.0, "step": 917, "train/ce_loss": 4.770288069266826e-05 }, { "epoch": 0.09066640300573463, "step": 917, "train/sim_loss": 0.046875 }, { "epoch": 0.09066640300573463, "step": 917, "train/total_loss": 0.04687977209687233 }, { "entropy": 10.2958345413208, "epoch": 0.09076527585525015, "mean_token_accuracy": 0.7989276051521301, "num_tokens": 4782833.0, "step": 918, "train/ce_loss": 1.2475789785385132 }, { "epoch": 0.09076527585525015, "step": 918, "train/sim_loss": 0.0390625 }, { "epoch": 0.09076527585525015, "step": 918, "train/total_loss": 0.16382040083408356 }, { "entropy": 9.392905235290527, "epoch": 0.09086414870476567, "mean_token_accuracy": 0.7146371603012085, "num_tokens": 4788116.0, "step": 919, "train/ce_loss": 1.3511189222335815 }, { "epoch": 0.09086414870476567, "step": 919, "train/sim_loss": 0.07421875 }, { "epoch": 0.09086414870476567, "step": 919, "train/total_loss": 0.20933064818382263 }, { "epoch": 0.0909630215542812, "grad_norm": 1.054306983947754, "learning_rate": 9.775255896751225e-06, "loss": 0.1746, "step": 920 }, { "entropy": 9.122296333312988, "epoch": 0.0909630215542812, "mean_token_accuracy": 0.6872385144233704, "num_tokens": 4793545.0, "step": 920, "train/ce_loss": 0.6360870599746704 }, { "epoch": 0.0909630215542812, "step": 920, "train/sim_loss": 0.11328125 }, { "epoch": 0.0909630215542812, "step": 920, "train/total_loss": 0.17688995599746704 }, { "entropy": 9.493011474609375, "epoch": 0.09106189440379672, "mean_token_accuracy": 0.6517857313156128, "num_tokens": 4798806.0, "step": 921, "train/ce_loss": 2.0138051695539616e-05 }, { "epoch": 0.09106189440379672, "step": 921, "train/sim_loss": 0.0703125 }, { "epoch": 0.09106189440379672, "step": 921, "train/total_loss": 0.07031451165676117 }, { "entropy": 9.315386772155762, "epoch": 0.09116076725331224, "mean_token_accuracy": 0.7726027369499207, "num_tokens": 4804026.0, "step": 922, "train/ce_loss": 0.5254396200180054 }, { "epoch": 0.09116076725331224, "step": 922, "train/sim_loss": 0.0390625 }, { "epoch": 0.09116076725331224, "step": 922, "train/total_loss": 0.09160646796226501 }, { "entropy": 9.016719818115234, "epoch": 0.09125964010282776, "mean_token_accuracy": 0.7427983283996582, "num_tokens": 4809561.0, "step": 923, "train/ce_loss": 0.8595583438873291 }, { "epoch": 0.09125964010282776, "step": 923, "train/sim_loss": 0.0703125 }, { "epoch": 0.09125964010282776, "step": 923, "train/total_loss": 0.15626832842826843 }, { "entropy": 9.173903465270996, "epoch": 0.09135851295234329, "mean_token_accuracy": 0.6830732226371765, "num_tokens": 4814882.0, "step": 924, "train/ce_loss": 0.7632126212120056 }, { "epoch": 0.09135851295234329, "step": 924, "train/sim_loss": 0.07421875 }, { "epoch": 0.09135851295234329, "step": 924, "train/total_loss": 0.15054002404212952 }, { "entropy": 9.216700553894043, "epoch": 0.0914573858018588, "mean_token_accuracy": 0.7312775254249573, "num_tokens": 4820252.0, "step": 925, "train/ce_loss": 0.8972272872924805 }, { "epoch": 0.0914573858018588, "step": 925, "train/sim_loss": 0.0625 }, { "epoch": 0.0914573858018588, "step": 925, "train/total_loss": 0.15222272276878357 }, { "entropy": 9.649072647094727, "epoch": 0.09155625865137433, "mean_token_accuracy": 0.7263888716697693, "num_tokens": 4825419.0, "step": 926, "train/ce_loss": 1.167543888092041 }, { "epoch": 0.09155625865137433, "step": 926, "train/sim_loss": 0.09765625 }, { "epoch": 0.09155625865137433, "step": 926, "train/total_loss": 0.21441063284873962 }, { "entropy": 9.430672645568848, "epoch": 0.09165513150088986, "mean_token_accuracy": 0.71875, "num_tokens": 4830615.0, "step": 927, "train/ce_loss": 0.7259710431098938 }, { "epoch": 0.09165513150088986, "step": 927, "train/sim_loss": 0.109375 }, { "epoch": 0.09165513150088986, "step": 927, "train/total_loss": 0.18197211623191833 }, { "entropy": 9.15649127960205, "epoch": 0.09175400435040539, "mean_token_accuracy": 0.6683831214904785, "num_tokens": 4836069.0, "step": 928, "train/ce_loss": 0.8065360188484192 }, { "epoch": 0.09175400435040539, "step": 928, "train/sim_loss": 0.125 }, { "epoch": 0.09175400435040539, "step": 928, "train/total_loss": 0.2056536078453064 }, { "entropy": 9.24024772644043, "epoch": 0.0918528771999209, "mean_token_accuracy": 0.7321212291717529, "num_tokens": 4841299.0, "step": 929, "train/ce_loss": 1.1482356786727905 }, { "epoch": 0.0918528771999209, "step": 929, "train/sim_loss": 0.09375 }, { "epoch": 0.0918528771999209, "step": 929, "train/total_loss": 0.208573579788208 }, { "entropy": 9.268930435180664, "epoch": 0.09195175004943643, "mean_token_accuracy": 0.7400721907615662, "num_tokens": 4846612.0, "step": 930, "train/ce_loss": 1.3190456628799438 }, { "epoch": 0.09195175004943643, "step": 930, "train/sim_loss": 0.15625 }, { "epoch": 0.09195175004943643, "step": 930, "train/total_loss": 0.28815457224845886 }, { "entropy": 9.457271575927734, "epoch": 0.09205062289895195, "mean_token_accuracy": 0.7328145503997803, "num_tokens": 4852035.0, "step": 931, "train/ce_loss": 0.5545473098754883 }, { "epoch": 0.09205062289895195, "step": 931, "train/sim_loss": 0.13671875 }, { "epoch": 0.09205062289895195, "step": 931, "train/total_loss": 0.19217348098754883 }, { "entropy": 9.846050262451172, "epoch": 0.09214949574846747, "mean_token_accuracy": 0.7230769395828247, "num_tokens": 4857039.0, "step": 932, "train/ce_loss": 0.896928608417511 }, { "epoch": 0.09214949574846747, "step": 932, "train/sim_loss": 0.109375 }, { "epoch": 0.09214949574846747, "step": 932, "train/total_loss": 0.1990678608417511 }, { "entropy": 9.475932121276855, "epoch": 0.092248368597983, "mean_token_accuracy": 0.6655791401863098, "num_tokens": 4862013.0, "step": 933, "train/ce_loss": 0.8814436197280884 }, { "epoch": 0.092248368597983, "step": 933, "train/sim_loss": 0.078125 }, { "epoch": 0.092248368597983, "step": 933, "train/total_loss": 0.16626936197280884 }, { "entropy": 9.894923210144043, "epoch": 0.09234724144749852, "mean_token_accuracy": 0.7560975551605225, "num_tokens": 4867047.0, "step": 934, "train/ce_loss": 1.5262436866760254 }, { "epoch": 0.09234724144749852, "step": 934, "train/sim_loss": 0.05078125 }, { "epoch": 0.09234724144749852, "step": 934, "train/total_loss": 0.20340561866760254 }, { "entropy": 9.473404884338379, "epoch": 0.09244611429701403, "mean_token_accuracy": 0.7365792989730835, "num_tokens": 4872304.0, "step": 935, "train/ce_loss": 0.7553489804267883 }, { "epoch": 0.09244611429701403, "step": 935, "train/sim_loss": 0.09765625 }, { "epoch": 0.09244611429701403, "step": 935, "train/total_loss": 0.1731911599636078 }, { "entropy": 8.571310043334961, "epoch": 0.09254498714652956, "mean_token_accuracy": 0.7900262475013733, "num_tokens": 4877960.0, "step": 936, "train/ce_loss": 0.400553435087204 }, { "epoch": 0.09254498714652956, "step": 936, "train/sim_loss": 0.1015625 }, { "epoch": 0.09254498714652956, "step": 936, "train/total_loss": 0.14161784946918488 }, { "entropy": 9.622425079345703, "epoch": 0.09264385999604509, "mean_token_accuracy": 0.7332361340522766, "num_tokens": 4883083.0, "step": 937, "train/ce_loss": 0.6467565894126892 }, { "epoch": 0.09264385999604509, "step": 937, "train/sim_loss": 0.08984375 }, { "epoch": 0.09264385999604509, "step": 937, "train/total_loss": 0.15451940894126892 }, { "entropy": 9.209803581237793, "epoch": 0.09274273284556062, "mean_token_accuracy": 0.7240990996360779, "num_tokens": 4888480.0, "step": 938, "train/ce_loss": 0.7532393336296082 }, { "epoch": 0.09274273284556062, "step": 938, "train/sim_loss": 0.140625 }, { "epoch": 0.09274273284556062, "step": 938, "train/total_loss": 0.2159489393234253 }, { "entropy": 9.478158950805664, "epoch": 0.09284160569507613, "mean_token_accuracy": 0.7897371649742126, "num_tokens": 4893692.0, "step": 939, "train/ce_loss": 0.7379792928695679 }, { "epoch": 0.09284160569507613, "step": 939, "train/sim_loss": 0.05078125 }, { "epoch": 0.09284160569507613, "step": 939, "train/total_loss": 0.12457918375730515 }, { "epoch": 0.09294047854459166, "grad_norm": 0.9931323528289795, "learning_rate": 9.770311031993277e-06, "loss": 0.1841, "step": 940 }, { "entropy": 9.09237289428711, "epoch": 0.09294047854459166, "mean_token_accuracy": 0.7732426524162292, "num_tokens": 4899037.0, "step": 940, "train/ce_loss": 0.5895564556121826 }, { "epoch": 0.09294047854459166, "step": 940, "train/sim_loss": 0.1015625 }, { "epoch": 0.09294047854459166, "step": 940, "train/total_loss": 0.16051813960075378 }, { "entropy": 10.08241081237793, "epoch": 0.09303935139410718, "mean_token_accuracy": 0.752293586730957, "num_tokens": 4903761.0, "step": 941, "train/ce_loss": 4.899001578451134e-05 }, { "epoch": 0.09303935139410718, "step": 941, "train/sim_loss": 0.078125 }, { "epoch": 0.09303935139410718, "step": 941, "train/total_loss": 0.07812990248203278 }, { "entropy": 9.072853088378906, "epoch": 0.0931382242436227, "mean_token_accuracy": 0.7931416034698486, "num_tokens": 4909129.0, "step": 942, "train/ce_loss": 0.58165043592453 }, { "epoch": 0.0931382242436227, "step": 942, "train/sim_loss": 0.04296875 }, { "epoch": 0.0931382242436227, "step": 942, "train/total_loss": 0.101133793592453 }, { "entropy": 9.140288352966309, "epoch": 0.09323709709313822, "mean_token_accuracy": 0.766978919506073, "num_tokens": 4914501.0, "step": 943, "train/ce_loss": 0.5601727962493896 }, { "epoch": 0.09323709709313822, "step": 943, "train/sim_loss": 0.03515625 }, { "epoch": 0.09323709709313822, "step": 943, "train/total_loss": 0.09117352962493896 }, { "entropy": 9.401217460632324, "epoch": 0.09333596994265375, "mean_token_accuracy": 0.7375796437263489, "num_tokens": 4919689.0, "step": 944, "train/ce_loss": 1.0126149654388428 }, { "epoch": 0.09333596994265375, "step": 944, "train/sim_loss": 0.02734375 }, { "epoch": 0.09333596994265375, "step": 944, "train/total_loss": 0.12860524654388428 }, { "entropy": 9.376395225524902, "epoch": 0.09343484279216926, "mean_token_accuracy": 0.7076537013053894, "num_tokens": 4924936.0, "step": 945, "train/ce_loss": 0.7645605206489563 }, { "epoch": 0.09343484279216926, "step": 945, "train/sim_loss": 0.08203125 }, { "epoch": 0.09343484279216926, "step": 945, "train/total_loss": 0.15848730504512787 }, { "entropy": 9.753252983093262, "epoch": 0.09353371564168479, "mean_token_accuracy": 0.7732656598091125, "num_tokens": 4930019.0, "step": 946, "train/ce_loss": 0.7650435566902161 }, { "epoch": 0.09353371564168479, "step": 946, "train/sim_loss": 0.0625 }, { "epoch": 0.09353371564168479, "step": 946, "train/total_loss": 0.13900434970855713 }, { "entropy": 9.455278396606445, "epoch": 0.09363258849120032, "mean_token_accuracy": 0.7410423159599304, "num_tokens": 4935086.0, "step": 947, "train/ce_loss": 1.4153460264205933 }, { "epoch": 0.09363258849120032, "step": 947, "train/sim_loss": 0.109375 }, { "epoch": 0.09363258849120032, "step": 947, "train/total_loss": 0.25090962648391724 }, { "entropy": 9.894110679626465, "epoch": 0.09373146134071583, "mean_token_accuracy": 0.7921478152275085, "num_tokens": 4939913.0, "step": 948, "train/ce_loss": 7.748230564175174e-05 }, { "epoch": 0.09373146134071583, "step": 948, "train/sim_loss": 0.08203125 }, { "epoch": 0.09373146134071583, "step": 948, "train/total_loss": 0.0820389986038208 }, { "entropy": 9.149972915649414, "epoch": 0.09383033419023136, "mean_token_accuracy": 0.6889804601669312, "num_tokens": 4945318.0, "step": 949, "train/ce_loss": 0.6303824186325073 }, { "epoch": 0.09383033419023136, "step": 949, "train/sim_loss": 0.07421875 }, { "epoch": 0.09383033419023136, "step": 949, "train/total_loss": 0.13725699484348297 }, { "entropy": 8.84632682800293, "epoch": 0.09392920703974689, "mean_token_accuracy": 0.7248826026916504, "num_tokens": 4950855.0, "step": 950, "train/ce_loss": 1.2966424226760864 }, { "epoch": 0.09392920703974689, "step": 950, "train/sim_loss": 0.08203125 }, { "epoch": 0.09392920703974689, "step": 950, "train/total_loss": 0.21169549226760864 }, { "entropy": 9.00080680847168, "epoch": 0.09402807988926241, "mean_token_accuracy": 0.7522211074829102, "num_tokens": 4956337.0, "step": 951, "train/ce_loss": 0.5790229439735413 }, { "epoch": 0.09402807988926241, "step": 951, "train/sim_loss": 0.04296875 }, { "epoch": 0.09402807988926241, "step": 951, "train/total_loss": 0.10087104141712189 }, { "entropy": 9.652454376220703, "epoch": 0.09412695273877793, "mean_token_accuracy": 0.7160120606422424, "num_tokens": 4961403.0, "step": 952, "train/ce_loss": 1.1361289024353027 }, { "epoch": 0.09412695273877793, "step": 952, "train/sim_loss": 0.08984375 }, { "epoch": 0.09412695273877793, "step": 952, "train/total_loss": 0.20345664024353027 }, { "entropy": 9.625324249267578, "epoch": 0.09422582558829345, "mean_token_accuracy": 0.7744565010070801, "num_tokens": 4966589.0, "step": 953, "train/ce_loss": 2.6591091227601282e-05 }, { "epoch": 0.09422582558829345, "step": 953, "train/sim_loss": 0.0703125 }, { "epoch": 0.09422582558829345, "step": 953, "train/total_loss": 0.0703151598572731 }, { "entropy": 9.048276901245117, "epoch": 0.09432469843780898, "mean_token_accuracy": 0.6781609058380127, "num_tokens": 4971948.0, "step": 954, "train/ce_loss": 1.3899551630020142 }, { "epoch": 0.09432469843780898, "step": 954, "train/sim_loss": 0.11328125 }, { "epoch": 0.09432469843780898, "step": 954, "train/total_loss": 0.25227677822113037 }, { "entropy": 9.31239128112793, "epoch": 0.0944235712873245, "mean_token_accuracy": 0.6825581192970276, "num_tokens": 4977237.0, "step": 955, "train/ce_loss": 0.6473762392997742 }, { "epoch": 0.0944235712873245, "step": 955, "train/sim_loss": 0.08203125 }, { "epoch": 0.0944235712873245, "step": 955, "train/total_loss": 0.14676886796951294 }, { "entropy": 8.887161254882812, "epoch": 0.09452244413684002, "mean_token_accuracy": 0.759829044342041, "num_tokens": 4982837.0, "step": 956, "train/ce_loss": 0.6316436529159546 }, { "epoch": 0.09452244413684002, "step": 956, "train/sim_loss": 0.06640625 }, { "epoch": 0.09452244413684002, "step": 956, "train/total_loss": 0.1295706182718277 }, { "entropy": 9.931779861450195, "epoch": 0.09462131698635555, "mean_token_accuracy": 0.7406014800071716, "num_tokens": 4987864.0, "step": 957, "train/ce_loss": 1.021917462348938 }, { "epoch": 0.09462131698635555, "step": 957, "train/sim_loss": 0.09375 }, { "epoch": 0.09462131698635555, "step": 957, "train/total_loss": 0.1959417462348938 }, { "entropy": 9.646172523498535, "epoch": 0.09472018983587106, "mean_token_accuracy": 0.7307132482528687, "num_tokens": 4992995.0, "step": 958, "train/ce_loss": 1.0254849195480347 }, { "epoch": 0.09472018983587106, "step": 958, "train/sim_loss": 0.1171875 }, { "epoch": 0.09472018983587106, "step": 958, "train/total_loss": 0.2197359949350357 }, { "entropy": 9.367281913757324, "epoch": 0.09481906268538659, "mean_token_accuracy": 0.6963824033737183, "num_tokens": 4998185.0, "step": 959, "train/ce_loss": 0.7240278720855713 }, { "epoch": 0.09481906268538659, "step": 959, "train/sim_loss": 0.08984375 }, { "epoch": 0.09481906268538659, "step": 959, "train/total_loss": 0.16224654018878937 }, { "epoch": 0.09491793553490212, "grad_norm": 1.1352251768112183, "learning_rate": 9.765366167235327e-06, "loss": 0.1763, "step": 960 }, { "entropy": 9.286064147949219, "epoch": 0.09491793553490212, "mean_token_accuracy": 0.7055492401123047, "num_tokens": 5003682.0, "step": 960, "train/ce_loss": 0.8170363903045654 }, { "epoch": 0.09491793553490212, "step": 960, "train/sim_loss": 0.1171875 }, { "epoch": 0.09491793553490212, "step": 960, "train/total_loss": 0.19889113306999207 }, { "entropy": 9.945854187011719, "epoch": 0.09501680838441764, "mean_token_accuracy": 0.7458677887916565, "num_tokens": 5008588.0, "step": 961, "train/ce_loss": 1.0800068378448486 }, { "epoch": 0.09501680838441764, "step": 961, "train/sim_loss": 0.0859375 }, { "epoch": 0.09501680838441764, "step": 961, "train/total_loss": 0.19393819570541382 }, { "entropy": 9.646007537841797, "epoch": 0.09511568123393316, "mean_token_accuracy": 0.7693602442741394, "num_tokens": 5013660.0, "step": 962, "train/ce_loss": 0.8580648303031921 }, { "epoch": 0.09511568123393316, "step": 962, "train/sim_loss": 0.03515625 }, { "epoch": 0.09511568123393316, "step": 962, "train/total_loss": 0.1209627315402031 }, { "entropy": 9.333677291870117, "epoch": 0.09521455408344869, "mean_token_accuracy": 0.7670251131057739, "num_tokens": 5018978.0, "step": 963, "train/ce_loss": 0.6943764686584473 }, { "epoch": 0.09521455408344869, "step": 963, "train/sim_loss": 0.1015625 }, { "epoch": 0.09521455408344869, "step": 963, "train/total_loss": 0.1710001528263092 }, { "entropy": 9.317039489746094, "epoch": 0.09531342693296421, "mean_token_accuracy": 0.7134146094322205, "num_tokens": 5024290.0, "step": 964, "train/ce_loss": 1.0419347286224365 }, { "epoch": 0.09531342693296421, "step": 964, "train/sim_loss": 0.1015625 }, { "epoch": 0.09531342693296421, "step": 964, "train/total_loss": 0.20575597882270813 }, { "entropy": 9.468748092651367, "epoch": 0.09541229978247973, "mean_token_accuracy": 0.7516425848007202, "num_tokens": 5029511.0, "step": 965, "train/ce_loss": 0.7559531927108765 }, { "epoch": 0.09541229978247973, "step": 965, "train/sim_loss": 0.05859375 }, { "epoch": 0.09541229978247973, "step": 965, "train/total_loss": 0.13418906927108765 }, { "entropy": 9.766124725341797, "epoch": 0.09551117263199525, "mean_token_accuracy": 0.7006269693374634, "num_tokens": 5034627.0, "step": 966, "train/ce_loss": 0.7853338122367859 }, { "epoch": 0.09551117263199525, "step": 966, "train/sim_loss": 0.109375 }, { "epoch": 0.09551117263199525, "step": 966, "train/total_loss": 0.1879083812236786 }, { "entropy": 10.407069206237793, "epoch": 0.09561004548151078, "mean_token_accuracy": 0.7586206793785095, "num_tokens": 5039340.0, "step": 967, "train/ce_loss": 4.891712887911126e-05 }, { "epoch": 0.09561004548151078, "step": 967, "train/sim_loss": 0.0390625 }, { "epoch": 0.09561004548151078, "step": 967, "train/total_loss": 0.03906739130616188 }, { "entropy": 10.262247085571289, "epoch": 0.0957089183310263, "mean_token_accuracy": 0.723557710647583, "num_tokens": 5044132.0, "step": 968, "train/ce_loss": 4.142152829444967e-05 }, { "epoch": 0.0957089183310263, "step": 968, "train/sim_loss": 0.03125 }, { "epoch": 0.0957089183310263, "step": 968, "train/total_loss": 0.03125414252281189 }, { "entropy": 9.122846603393555, "epoch": 0.09580779118054182, "mean_token_accuracy": 0.7097130417823792, "num_tokens": 5049493.0, "step": 969, "train/ce_loss": 1.2161741256713867 }, { "epoch": 0.09580779118054182, "step": 969, "train/sim_loss": 0.03515625 }, { "epoch": 0.09580779118054182, "step": 969, "train/total_loss": 0.1567736566066742 }, { "entropy": 9.219634056091309, "epoch": 0.09590666403005735, "mean_token_accuracy": 0.7389830350875854, "num_tokens": 5054859.0, "step": 970, "train/ce_loss": 0.8640303611755371 }, { "epoch": 0.09590666403005735, "step": 970, "train/sim_loss": 0.1171875 }, { "epoch": 0.09590666403005735, "step": 970, "train/total_loss": 0.2035905420780182 }, { "entropy": 9.576761245727539, "epoch": 0.09600553687957288, "mean_token_accuracy": 0.6657682061195374, "num_tokens": 5060061.0, "step": 971, "train/ce_loss": 2.4594476222991943 }, { "epoch": 0.09600553687957288, "step": 971, "train/sim_loss": 0.09375 }, { "epoch": 0.09600553687957288, "step": 971, "train/total_loss": 0.3396947681903839 }, { "entropy": 9.380084991455078, "epoch": 0.09610440972908839, "mean_token_accuracy": 0.7656458020210266, "num_tokens": 5065271.0, "step": 972, "train/ce_loss": 0.49439525604248047 }, { "epoch": 0.09610440972908839, "step": 972, "train/sim_loss": 0.1171875 }, { "epoch": 0.09610440972908839, "step": 972, "train/total_loss": 0.16662701964378357 }, { "entropy": 9.686589241027832, "epoch": 0.09620328257860392, "mean_token_accuracy": 0.6975609660148621, "num_tokens": 5070409.0, "step": 973, "train/ce_loss": 1.1760450601577759 }, { "epoch": 0.09620328257860392, "step": 973, "train/sim_loss": 0.125 }, { "epoch": 0.09620328257860392, "step": 973, "train/total_loss": 0.24260450899600983 }, { "entropy": 10.30485725402832, "epoch": 0.09630215542811944, "mean_token_accuracy": 0.7692307829856873, "num_tokens": 5075148.0, "step": 974, "train/ce_loss": 1.261991262435913 }, { "epoch": 0.09630215542811944, "step": 974, "train/sim_loss": 0.10546875 }, { "epoch": 0.09630215542811944, "step": 974, "train/total_loss": 0.2316678762435913 }, { "entropy": 10.129904747009277, "epoch": 0.09640102827763496, "mean_token_accuracy": 0.7991071343421936, "num_tokens": 5080046.0, "step": 975, "train/ce_loss": 3.663766983663663e-05 }, { "epoch": 0.09640102827763496, "step": 975, "train/sim_loss": 0.09765625 }, { "epoch": 0.09640102827763496, "step": 975, "train/total_loss": 0.09765991568565369 }, { "entropy": 9.284428596496582, "epoch": 0.09649990112715048, "mean_token_accuracy": 0.6944785118103027, "num_tokens": 5085291.0, "step": 976, "train/ce_loss": 0.760468065738678 }, { "epoch": 0.09649990112715048, "step": 976, "train/sim_loss": 0.08984375 }, { "epoch": 0.09649990112715048, "step": 976, "train/total_loss": 0.16589055955410004 }, { "entropy": 9.093676567077637, "epoch": 0.09659877397666601, "mean_token_accuracy": 0.7232635021209717, "num_tokens": 5090657.0, "step": 977, "train/ce_loss": 0.6065735220909119 }, { "epoch": 0.09659877397666601, "step": 977, "train/sim_loss": 0.046875 }, { "epoch": 0.09659877397666601, "step": 977, "train/total_loss": 0.10753235220909119 }, { "entropy": 10.156436920166016, "epoch": 0.09669764682618152, "mean_token_accuracy": 0.773955762386322, "num_tokens": 5095471.0, "step": 978, "train/ce_loss": 1.2206401824951172 }, { "epoch": 0.09669764682618152, "step": 978, "train/sim_loss": 0.046875 }, { "epoch": 0.09669764682618152, "step": 978, "train/total_loss": 0.1689390242099762 }, { "entropy": 9.224005699157715, "epoch": 0.09679651967569705, "mean_token_accuracy": 0.7690557241439819, "num_tokens": 5100782.0, "step": 979, "train/ce_loss": 0.9182709455490112 }, { "epoch": 0.09679651967569705, "step": 979, "train/sim_loss": 0.1015625 }, { "epoch": 0.09679651967569705, "step": 979, "train/total_loss": 0.19338959455490112 }, { "epoch": 0.09689539252521258, "grad_norm": 0.9375425577163696, "learning_rate": 9.760421302477378e-06, "loss": 0.1738, "step": 980 }, { "entropy": 9.039779663085938, "epoch": 0.09689539252521258, "mean_token_accuracy": 0.6974595785140991, "num_tokens": 5106046.0, "step": 980, "train/ce_loss": 0.6134800314903259 }, { "epoch": 0.09689539252521258, "step": 980, "train/sim_loss": 0.078125 }, { "epoch": 0.09689539252521258, "step": 980, "train/total_loss": 0.13947300612926483 }, { "entropy": 10.0238676071167, "epoch": 0.0969942653747281, "mean_token_accuracy": 0.7307692170143127, "num_tokens": 5110808.0, "step": 981, "train/ce_loss": 0.0001462309737689793 }, { "epoch": 0.0969942653747281, "step": 981, "train/sim_loss": 0.10546875 }, { "epoch": 0.0969942653747281, "step": 981, "train/total_loss": 0.10548337548971176 }, { "entropy": 9.56509780883789, "epoch": 0.09709313822424362, "mean_token_accuracy": 0.7315634489059448, "num_tokens": 5115944.0, "step": 982, "train/ce_loss": 1.0472911596298218 }, { "epoch": 0.09709313822424362, "step": 982, "train/sim_loss": 0.08203125 }, { "epoch": 0.09709313822424362, "step": 982, "train/total_loss": 0.18676036596298218 }, { "entropy": 9.660545349121094, "epoch": 0.09719201107375915, "mean_token_accuracy": 0.7556818127632141, "num_tokens": 5121091.0, "step": 983, "train/ce_loss": 0.528469979763031 }, { "epoch": 0.09719201107375915, "step": 983, "train/sim_loss": 0.046875 }, { "epoch": 0.09719201107375915, "step": 983, "train/total_loss": 0.0997219979763031 }, { "entropy": 9.2199125289917, "epoch": 0.09729088392327467, "mean_token_accuracy": 0.6720368266105652, "num_tokens": 5126456.0, "step": 984, "train/ce_loss": 1.1708029508590698 }, { "epoch": 0.09729088392327467, "step": 984, "train/sim_loss": 0.109375 }, { "epoch": 0.09729088392327467, "step": 984, "train/total_loss": 0.22645530104637146 }, { "entropy": 9.225263595581055, "epoch": 0.09738975677279019, "mean_token_accuracy": 0.7661470174789429, "num_tokens": 5131806.0, "step": 985, "train/ce_loss": 1.202368974685669 }, { "epoch": 0.09738975677279019, "step": 985, "train/sim_loss": 0.03515625 }, { "epoch": 0.09738975677279019, "step": 985, "train/total_loss": 0.15539315342903137 }, { "entropy": 9.521390914916992, "epoch": 0.09748862962230571, "mean_token_accuracy": 0.7075588703155518, "num_tokens": 5137059.0, "step": 986, "train/ce_loss": 0.992639422416687 }, { "epoch": 0.09748862962230571, "step": 986, "train/sim_loss": 0.046875 }, { "epoch": 0.09748862962230571, "step": 986, "train/total_loss": 0.14613893628120422 }, { "entropy": 8.910574913024902, "epoch": 0.09758750247182124, "mean_token_accuracy": 0.7683258056640625, "num_tokens": 5142658.0, "step": 987, "train/ce_loss": 0.9678569436073303 }, { "epoch": 0.09758750247182124, "step": 987, "train/sim_loss": 0.15625 }, { "epoch": 0.09758750247182124, "step": 987, "train/total_loss": 0.25303569436073303 }, { "entropy": 9.38957691192627, "epoch": 0.09768637532133675, "mean_token_accuracy": 0.770370364189148, "num_tokens": 5147938.0, "step": 988, "train/ce_loss": 0.846873939037323 }, { "epoch": 0.09768637532133675, "step": 988, "train/sim_loss": 0.11328125 }, { "epoch": 0.09768637532133675, "step": 988, "train/total_loss": 0.19796864688396454 }, { "entropy": 9.404655456542969, "epoch": 0.09778524817085228, "mean_token_accuracy": 0.7073760628700256, "num_tokens": 5153258.0, "step": 989, "train/ce_loss": 0.7749897241592407 }, { "epoch": 0.09778524817085228, "step": 989, "train/sim_loss": 0.11328125 }, { "epoch": 0.09778524817085228, "step": 989, "train/total_loss": 0.19078022241592407 }, { "entropy": 9.652052879333496, "epoch": 0.09788412102036781, "mean_token_accuracy": 0.700276255607605, "num_tokens": 5158422.0, "step": 990, "train/ce_loss": 1.1462301015853882 }, { "epoch": 0.09788412102036781, "step": 990, "train/sim_loss": 0.08203125 }, { "epoch": 0.09788412102036781, "step": 990, "train/total_loss": 0.19665426015853882 }, { "entropy": 9.799245834350586, "epoch": 0.09798299386988334, "mean_token_accuracy": 0.7184750437736511, "num_tokens": 5163525.0, "step": 991, "train/ce_loss": 0.8648521304130554 }, { "epoch": 0.09798299386988334, "step": 991, "train/sim_loss": 0.1484375 }, { "epoch": 0.09798299386988334, "step": 991, "train/total_loss": 0.23492270708084106 }, { "entropy": 9.31769847869873, "epoch": 0.09808186671939885, "mean_token_accuracy": 0.7049723863601685, "num_tokens": 5168924.0, "step": 992, "train/ce_loss": 0.7937401533126831 }, { "epoch": 0.09808186671939885, "step": 992, "train/sim_loss": 0.109375 }, { "epoch": 0.09808186671939885, "step": 992, "train/total_loss": 0.1887490153312683 }, { "entropy": 9.720190048217773, "epoch": 0.09818073956891438, "mean_token_accuracy": 0.7551724314689636, "num_tokens": 5173940.0, "step": 993, "train/ce_loss": 1.0298943519592285 }, { "epoch": 0.09818073956891438, "step": 993, "train/sim_loss": 0.1484375 }, { "epoch": 0.09818073956891438, "step": 993, "train/total_loss": 0.25142693519592285 }, { "entropy": 9.876079559326172, "epoch": 0.0982796124184299, "mean_token_accuracy": 0.7508650422096252, "num_tokens": 5178969.0, "step": 994, "train/ce_loss": 2.5448929591220804e-05 }, { "epoch": 0.0982796124184299, "step": 994, "train/sim_loss": 0.04296875 }, { "epoch": 0.0982796124184299, "step": 994, "train/total_loss": 0.04297129437327385 }, { "entropy": 10.213848114013672, "epoch": 0.09837848526794542, "mean_token_accuracy": 0.7462038993835449, "num_tokens": 5183853.0, "step": 995, "train/ce_loss": 2.6947966034640558e-05 }, { "epoch": 0.09837848526794542, "step": 995, "train/sim_loss": 0.03125 }, { "epoch": 0.09837848526794542, "step": 995, "train/total_loss": 0.03125269338488579 }, { "entropy": 9.504982948303223, "epoch": 0.09847735811746094, "mean_token_accuracy": 0.7118194103240967, "num_tokens": 5189055.0, "step": 996, "train/ce_loss": 1.0164376497268677 }, { "epoch": 0.09847735811746094, "step": 996, "train/sim_loss": 0.1796875 }, { "epoch": 0.09847735811746094, "step": 996, "train/total_loss": 0.28133127093315125 }, { "entropy": 9.319685935974121, "epoch": 0.09857623096697647, "mean_token_accuracy": 0.7459584474563599, "num_tokens": 5194385.0, "step": 997, "train/ce_loss": 0.9244051575660706 }, { "epoch": 0.09857623096697647, "step": 997, "train/sim_loss": 0.12890625 }, { "epoch": 0.09857623096697647, "step": 997, "train/total_loss": 0.22134676575660706 }, { "entropy": 9.0696439743042, "epoch": 0.09867510381649199, "mean_token_accuracy": 0.7338709831237793, "num_tokens": 5199780.0, "step": 998, "train/ce_loss": 0.7940992116928101 }, { "epoch": 0.09867510381649199, "step": 998, "train/sim_loss": 0.0703125 }, { "epoch": 0.09867510381649199, "step": 998, "train/total_loss": 0.14972242712974548 }, { "entropy": 9.231082916259766, "epoch": 0.09877397666600751, "mean_token_accuracy": 0.706695020198822, "num_tokens": 5205221.0, "step": 999, "train/ce_loss": 1.249048113822937 }, { "epoch": 0.09877397666600751, "step": 999, "train/sim_loss": 0.04296875 }, { "epoch": 0.09877397666600751, "step": 999, "train/total_loss": 0.1678735613822937 }, { "epoch": 0.09887284951552304, "grad_norm": 1.0724185705184937, "learning_rate": 9.755476437719428e-06, "loss": 0.1774, "step": 1000 }, { "entropy": 9.569454193115234, "epoch": 0.09887284951552304, "mean_token_accuracy": 0.7770618796348572, "num_tokens": 5210499.0, "step": 1000, "train/ce_loss": 1.22596275806427 }, { "epoch": 0.09887284951552304, "step": 1000, "train/sim_loss": 0.08984375 }, { "epoch": 0.09887284951552304, "step": 1000, "train/total_loss": 0.21244002878665924 }, { "entropy": 9.117839813232422, "epoch": 0.09897172236503857, "mean_token_accuracy": 0.7530747652053833, "num_tokens": 5215992.0, "step": 1001, "train/ce_loss": 0.6427945494651794 }, { "epoch": 0.09897172236503857, "step": 1001, "train/sim_loss": 0.08984375 }, { "epoch": 0.09897172236503857, "step": 1001, "train/total_loss": 0.1541232168674469 }, { "entropy": 9.556461334228516, "epoch": 0.09907059521455408, "mean_token_accuracy": 0.7525510191917419, "num_tokens": 5221190.0, "step": 1002, "train/ce_loss": 0.8305707573890686 }, { "epoch": 0.09907059521455408, "step": 1002, "train/sim_loss": 0.0859375 }, { "epoch": 0.09907059521455408, "step": 1002, "train/total_loss": 0.16899457573890686 }, { "entropy": 9.66070556640625, "epoch": 0.09916946806406961, "mean_token_accuracy": 0.6742081642150879, "num_tokens": 5226301.0, "step": 1003, "train/ce_loss": 1.040513515472412 }, { "epoch": 0.09916946806406961, "step": 1003, "train/sim_loss": 0.078125 }, { "epoch": 0.09916946806406961, "step": 1003, "train/total_loss": 0.1821763515472412 }, { "entropy": 9.522982597351074, "epoch": 0.09926834091358513, "mean_token_accuracy": 0.7161290049552917, "num_tokens": 5231531.0, "step": 1004, "train/ce_loss": 0.7382574677467346 }, { "epoch": 0.09926834091358513, "step": 1004, "train/sim_loss": 0.0390625 }, { "epoch": 0.09926834091358513, "step": 1004, "train/total_loss": 0.11288824677467346 }, { "entropy": 9.404748916625977, "epoch": 0.09936721376310065, "mean_token_accuracy": 0.737171471118927, "num_tokens": 5236786.0, "step": 1005, "train/ce_loss": 1.1236399412155151 }, { "epoch": 0.09936721376310065, "step": 1005, "train/sim_loss": 0.08984375 }, { "epoch": 0.09936721376310065, "step": 1005, "train/total_loss": 0.2022077441215515 }, { "entropy": 10.117021560668945, "epoch": 0.09946608661261618, "mean_token_accuracy": 0.738095223903656, "num_tokens": 5241623.0, "step": 1006, "train/ce_loss": 1.809425950050354 }, { "epoch": 0.09946608661261618, "step": 1006, "train/sim_loss": 0.1015625 }, { "epoch": 0.09946608661261618, "step": 1006, "train/total_loss": 0.2825050950050354 }, { "entropy": 9.110442161560059, "epoch": 0.0995649594621317, "mean_token_accuracy": 0.6974697709083557, "num_tokens": 5247022.0, "step": 1007, "train/ce_loss": 1.1598174571990967 }, { "epoch": 0.0995649594621317, "step": 1007, "train/sim_loss": 0.1015625 }, { "epoch": 0.0995649594621317, "step": 1007, "train/total_loss": 0.21754425764083862 }, { "entropy": 9.71247673034668, "epoch": 0.09966383231164722, "mean_token_accuracy": 0.7104136943817139, "num_tokens": 5252114.0, "step": 1008, "train/ce_loss": 0.6906753182411194 }, { "epoch": 0.09966383231164722, "step": 1008, "train/sim_loss": 0.0703125 }, { "epoch": 0.09966383231164722, "step": 1008, "train/total_loss": 0.13938003778457642 }, { "entropy": 9.66971492767334, "epoch": 0.09976270516116274, "mean_token_accuracy": 0.8030592799186707, "num_tokens": 5257107.0, "step": 1009, "train/ce_loss": 0.7509585618972778 }, { "epoch": 0.09976270516116274, "step": 1009, "train/sim_loss": 0.0546875 }, { "epoch": 0.09976270516116274, "step": 1009, "train/total_loss": 0.12978336215019226 }, { "entropy": 9.871513366699219, "epoch": 0.09986157801067827, "mean_token_accuracy": 0.7030201554298401, "num_tokens": 5262083.0, "step": 1010, "train/ce_loss": 1.4761762619018555 }, { "epoch": 0.09986157801067827, "step": 1010, "train/sim_loss": 0.13671875 }, { "epoch": 0.09986157801067827, "step": 1010, "train/total_loss": 0.2843363881111145 }, { "entropy": 9.108389854431152, "epoch": 0.0999604508601938, "mean_token_accuracy": 0.7257072329521179, "num_tokens": 5267393.0, "step": 1011, "train/ce_loss": 1.0095230340957642 }, { "epoch": 0.0999604508601938, "step": 1011, "train/sim_loss": 0.1484375 }, { "epoch": 0.0999604508601938, "step": 1011, "train/total_loss": 0.24938979744911194 }, { "entropy": 9.610630989074707, "epoch": 0.10005932370970931, "mean_token_accuracy": 0.7174515128135681, "num_tokens": 5272534.0, "step": 1012, "train/ce_loss": 0.6784527897834778 }, { "epoch": 0.10005932370970931, "step": 1012, "train/sim_loss": 0.0546875 }, { "epoch": 0.10005932370970931, "step": 1012, "train/total_loss": 0.12253277748823166 }, { "entropy": 8.754789352416992, "epoch": 0.10015819655922484, "mean_token_accuracy": 0.7428810596466064, "num_tokens": 5278206.0, "step": 1013, "train/ce_loss": 0.6056615114212036 }, { "epoch": 0.10015819655922484, "step": 1013, "train/sim_loss": 0.03515625 }, { "epoch": 0.10015819655922484, "step": 1013, "train/total_loss": 0.09572240710258484 }, { "entropy": 9.353209495544434, "epoch": 0.10025706940874037, "mean_token_accuracy": 0.7290886640548706, "num_tokens": 5283457.0, "step": 1014, "train/ce_loss": 0.7436414361000061 }, { "epoch": 0.10025706940874037, "step": 1014, "train/sim_loss": 0.09375 }, { "epoch": 0.10025706940874037, "step": 1014, "train/total_loss": 0.16811415553092957 }, { "entropy": 9.666563987731934, "epoch": 0.10035594225825588, "mean_token_accuracy": 0.7506702542304993, "num_tokens": 5288815.0, "step": 1015, "train/ce_loss": 0.5352444648742676 }, { "epoch": 0.10035594225825588, "step": 1015, "train/sim_loss": 0.03515625 }, { "epoch": 0.10035594225825588, "step": 1015, "train/total_loss": 0.088680699467659 }, { "entropy": 9.092962265014648, "epoch": 0.1004548151077714, "mean_token_accuracy": 0.6735324263572693, "num_tokens": 5294243.0, "step": 1016, "train/ce_loss": 0.8530191779136658 }, { "epoch": 0.1004548151077714, "step": 1016, "train/sim_loss": 0.0703125 }, { "epoch": 0.1004548151077714, "step": 1016, "train/total_loss": 0.15561442077159882 }, { "entropy": 9.992460250854492, "epoch": 0.10055368795728693, "mean_token_accuracy": 0.7484536170959473, "num_tokens": 5299175.0, "step": 1017, "train/ce_loss": 4.608002564054914e-05 }, { "epoch": 0.10055368795728693, "step": 1017, "train/sim_loss": 0.078125 }, { "epoch": 0.10055368795728693, "step": 1017, "train/total_loss": 0.0781296044588089 }, { "entropy": 9.113978385925293, "epoch": 0.10065256080680245, "mean_token_accuracy": 0.6854166388511658, "num_tokens": 5304671.0, "step": 1018, "train/ce_loss": 0.5623672604560852 }, { "epoch": 0.10065256080680245, "step": 1018, "train/sim_loss": 0.10546875 }, { "epoch": 0.10065256080680245, "step": 1018, "train/total_loss": 0.16170547902584076 }, { "entropy": 9.790916442871094, "epoch": 0.10075143365631797, "mean_token_accuracy": 0.7684210538864136, "num_tokens": 5309699.0, "step": 1019, "train/ce_loss": 1.3232040405273438 }, { "epoch": 0.10075143365631797, "step": 1019, "train/sim_loss": 0.12109375 }, { "epoch": 0.10075143365631797, "step": 1019, "train/total_loss": 0.2534141540527344 }, { "epoch": 0.1008503065058335, "grad_norm": 1.2360754013061523, "learning_rate": 9.75053157296148e-06, "loss": 0.1749, "step": 1020 }, { "entropy": 9.670308113098145, "epoch": 0.1008503065058335, "mean_token_accuracy": 0.6661631464958191, "num_tokens": 5314810.0, "step": 1020, "train/ce_loss": 1.7671234607696533 }, { "epoch": 0.1008503065058335, "step": 1020, "train/sim_loss": 0.10546875 }, { "epoch": 0.1008503065058335, "step": 1020, "train/total_loss": 0.2821810841560364 }, { "entropy": 9.031726837158203, "epoch": 0.10094917935534903, "mean_token_accuracy": 0.730975329875946, "num_tokens": 5320233.0, "step": 1021, "train/ce_loss": 0.9387472867965698 }, { "epoch": 0.10094917935534903, "step": 1021, "train/sim_loss": 0.0546875 }, { "epoch": 0.10094917935534903, "step": 1021, "train/total_loss": 0.1485622227191925 }, { "entropy": 9.516468048095703, "epoch": 0.10104805220486454, "mean_token_accuracy": 0.7382920384407043, "num_tokens": 5325369.0, "step": 1022, "train/ce_loss": 0.86199951171875 }, { "epoch": 0.10104805220486454, "step": 1022, "train/sim_loss": 0.078125 }, { "epoch": 0.10104805220486454, "step": 1022, "train/total_loss": 0.16432495415210724 }, { "entropy": 9.152571678161621, "epoch": 0.10114692505438007, "mean_token_accuracy": 0.7382628917694092, "num_tokens": 5330709.0, "step": 1023, "train/ce_loss": 0.9139989614486694 }, { "epoch": 0.10114692505438007, "step": 1023, "train/sim_loss": 0.0703125 }, { "epoch": 0.10114692505438007, "step": 1023, "train/total_loss": 0.1617124080657959 }, { "entropy": 9.668933868408203, "epoch": 0.1012457979038956, "mean_token_accuracy": 0.7255216836929321, "num_tokens": 5335785.0, "step": 1024, "train/ce_loss": 0.7937629818916321 }, { "epoch": 0.1012457979038956, "step": 1024, "train/sim_loss": 0.16015625 }, { "epoch": 0.1012457979038956, "step": 1024, "train/total_loss": 0.23953256011009216 }, { "entropy": 8.975940704345703, "epoch": 0.10134467075341111, "mean_token_accuracy": 0.7204641103744507, "num_tokens": 5341261.0, "step": 1025, "train/ce_loss": 1.1749017238616943 }, { "epoch": 0.10134467075341111, "step": 1025, "train/sim_loss": 0.19140625 }, { "epoch": 0.10134467075341111, "step": 1025, "train/total_loss": 0.30889642238616943 }, { "entropy": 9.504355430603027, "epoch": 0.10144354360292664, "mean_token_accuracy": 0.7365177273750305, "num_tokens": 5346381.0, "step": 1026, "train/ce_loss": 1.250915765762329 }, { "epoch": 0.10144354360292664, "step": 1026, "train/sim_loss": 0.06640625 }, { "epoch": 0.10144354360292664, "step": 1026, "train/total_loss": 0.1914978325366974 }, { "entropy": 9.745513916015625, "epoch": 0.10154241645244216, "mean_token_accuracy": 0.8098256587982178, "num_tokens": 5351478.0, "step": 1027, "train/ce_loss": 1.9117256670142524e-05 }, { "epoch": 0.10154241645244216, "step": 1027, "train/sim_loss": 0.07421875 }, { "epoch": 0.10154241645244216, "step": 1027, "train/total_loss": 0.07422066479921341 }, { "entropy": 9.939186096191406, "epoch": 0.10164128930195768, "mean_token_accuracy": 0.7660818696022034, "num_tokens": 5356403.0, "step": 1028, "train/ce_loss": 0.6355189681053162 }, { "epoch": 0.10164128930195768, "step": 1028, "train/sim_loss": 0.08984375 }, { "epoch": 0.10164128930195768, "step": 1028, "train/total_loss": 0.1533956527709961 }, { "entropy": 9.516606330871582, "epoch": 0.1017401621514732, "mean_token_accuracy": 0.7118881344795227, "num_tokens": 5361618.0, "step": 1029, "train/ce_loss": 0.7298515439033508 }, { "epoch": 0.1017401621514732, "step": 1029, "train/sim_loss": 0.07421875 }, { "epoch": 0.1017401621514732, "step": 1029, "train/total_loss": 0.14720390737056732 }, { "entropy": 9.831457138061523, "epoch": 0.10183903500098873, "mean_token_accuracy": 0.744966447353363, "num_tokens": 5366926.0, "step": 1030, "train/ce_loss": 2.8248850867385045e-05 }, { "epoch": 0.10183903500098873, "step": 1030, "train/sim_loss": 0.10546875 }, { "epoch": 0.10183903500098873, "step": 1030, "train/total_loss": 0.10547157377004623 }, { "entropy": 9.04391098022461, "epoch": 0.10193790785050424, "mean_token_accuracy": 0.7756202816963196, "num_tokens": 5372332.0, "step": 1031, "train/ce_loss": 0.3345872759819031 }, { "epoch": 0.10193790785050424, "step": 1031, "train/sim_loss": 0.02734375 }, { "epoch": 0.10193790785050424, "step": 1031, "train/total_loss": 0.06080247834324837 }, { "entropy": 9.163641929626465, "epoch": 0.10203678070001977, "mean_token_accuracy": 0.7380688190460205, "num_tokens": 5377746.0, "step": 1032, "train/ce_loss": 1.9390454326639883e-05 }, { "epoch": 0.10203678070001977, "step": 1032, "train/sim_loss": 0.0625 }, { "epoch": 0.10203678070001977, "step": 1032, "train/total_loss": 0.0625019371509552 }, { "entropy": 9.652329444885254, "epoch": 0.1021356535495353, "mean_token_accuracy": 0.7177541851997375, "num_tokens": 5382866.0, "step": 1033, "train/ce_loss": 1.420344352722168 }, { "epoch": 0.1021356535495353, "step": 1033, "train/sim_loss": 0.0703125 }, { "epoch": 0.1021356535495353, "step": 1033, "train/total_loss": 0.21234694123268127 }, { "entropy": 9.247446060180664, "epoch": 0.10223452639905083, "mean_token_accuracy": 0.7298919558525085, "num_tokens": 5388181.0, "step": 1034, "train/ce_loss": 1.1431541442871094 }, { "epoch": 0.10223452639905083, "step": 1034, "train/sim_loss": 0.05859375 }, { "epoch": 0.10223452639905083, "step": 1034, "train/total_loss": 0.17290917038917542 }, { "entropy": 9.286208152770996, "epoch": 0.10233339924856634, "mean_token_accuracy": 0.723514199256897, "num_tokens": 5393399.0, "step": 1035, "train/ce_loss": 0.8224537968635559 }, { "epoch": 0.10233339924856634, "step": 1035, "train/sim_loss": 0.0625 }, { "epoch": 0.10233339924856634, "step": 1035, "train/total_loss": 0.1447453796863556 }, { "entropy": 10.020112991333008, "epoch": 0.10243227209808187, "mean_token_accuracy": 0.773955762386322, "num_tokens": 5398283.0, "step": 1036, "train/ce_loss": 2.7731024601962417e-05 }, { "epoch": 0.10243227209808187, "step": 1036, "train/sim_loss": 0.078125 }, { "epoch": 0.10243227209808187, "step": 1036, "train/total_loss": 0.07812777161598206 }, { "entropy": 9.74041748046875, "epoch": 0.1025311449475974, "mean_token_accuracy": 0.7201907634735107, "num_tokens": 5403341.0, "step": 1037, "train/ce_loss": 1.832025191106368e-05 }, { "epoch": 0.1025311449475974, "step": 1037, "train/sim_loss": 0.03515625 }, { "epoch": 0.1025311449475974, "step": 1037, "train/total_loss": 0.03515808284282684 }, { "entropy": 9.702130317687988, "epoch": 0.10263001779711291, "mean_token_accuracy": 0.739469587802887, "num_tokens": 5408403.0, "step": 1038, "train/ce_loss": 0.9791857004165649 }, { "epoch": 0.10263001779711291, "step": 1038, "train/sim_loss": 0.05859375 }, { "epoch": 0.10263001779711291, "step": 1038, "train/total_loss": 0.1565123200416565 }, { "entropy": 9.729958534240723, "epoch": 0.10272889064662843, "mean_token_accuracy": 0.6488189101219177, "num_tokens": 5413443.0, "step": 1039, "train/ce_loss": 3.65582927770447e-05 }, { "epoch": 0.10272889064662843, "step": 1039, "train/sim_loss": 0.078125 }, { "epoch": 0.10272889064662843, "step": 1039, "train/total_loss": 0.07812865823507309 }, { "epoch": 0.10282776349614396, "grad_norm": 1.0242434740066528, "learning_rate": 9.745586708203531e-06, "loss": 0.1722, "step": 1040 }, { "entropy": 9.663080215454102, "epoch": 0.10282776349614396, "mean_token_accuracy": 0.6986532211303711, "num_tokens": 5418525.0, "step": 1040, "train/ce_loss": 3.0713326850673184e-05 }, { "epoch": 0.10282776349614396, "step": 1040, "train/sim_loss": 0.09765625 }, { "epoch": 0.10282776349614396, "step": 1040, "train/total_loss": 0.09765931963920593 }, { "entropy": 10.008130073547363, "epoch": 0.10292663634565948, "mean_token_accuracy": 0.7440347075462341, "num_tokens": 5423418.0, "step": 1041, "train/ce_loss": 1.2210838794708252 }, { "epoch": 0.10292663634565948, "step": 1041, "train/sim_loss": 0.08203125 }, { "epoch": 0.10292663634565948, "step": 1041, "train/total_loss": 0.20413964986801147 }, { "entropy": 9.635592460632324, "epoch": 0.103025509195175, "mean_token_accuracy": 0.6676342487335205, "num_tokens": 5428561.0, "step": 1042, "train/ce_loss": 1.4121594429016113 }, { "epoch": 0.103025509195175, "step": 1042, "train/sim_loss": 0.12109375 }, { "epoch": 0.103025509195175, "step": 1042, "train/total_loss": 0.2623097002506256 }, { "entropy": 9.016424179077148, "epoch": 0.10312438204469053, "mean_token_accuracy": 0.7136015295982361, "num_tokens": 5434047.0, "step": 1043, "train/ce_loss": 0.9875646233558655 }, { "epoch": 0.10312438204469053, "step": 1043, "train/sim_loss": 0.078125 }, { "epoch": 0.10312438204469053, "step": 1043, "train/total_loss": 0.17688146233558655 }, { "entropy": 9.06117057800293, "epoch": 0.10322325489420606, "mean_token_accuracy": 0.7363834381103516, "num_tokens": 5439435.0, "step": 1044, "train/ce_loss": 0.7383791208267212 }, { "epoch": 0.10322325489420606, "step": 1044, "train/sim_loss": 0.1015625 }, { "epoch": 0.10322325489420606, "step": 1044, "train/total_loss": 0.17540040612220764 }, { "entropy": 9.030341148376465, "epoch": 0.10332212774372157, "mean_token_accuracy": 0.711448609828949, "num_tokens": 5444805.0, "step": 1045, "train/ce_loss": 1.4875901937484741 }, { "epoch": 0.10332212774372157, "step": 1045, "train/sim_loss": 0.125 }, { "epoch": 0.10332212774372157, "step": 1045, "train/total_loss": 0.27375900745391846 }, { "entropy": 9.205885887145996, "epoch": 0.1034210005932371, "mean_token_accuracy": 0.7298136353492737, "num_tokens": 5450226.0, "step": 1046, "train/ce_loss": 0.5827401876449585 }, { "epoch": 0.1034210005932371, "step": 1046, "train/sim_loss": 0.05859375 }, { "epoch": 0.1034210005932371, "step": 1046, "train/total_loss": 0.11686776578426361 }, { "entropy": 9.355142593383789, "epoch": 0.10351987344275262, "mean_token_accuracy": 0.7164887189865112, "num_tokens": 5455576.0, "step": 1047, "train/ce_loss": 0.7967851758003235 }, { "epoch": 0.10351987344275262, "step": 1047, "train/sim_loss": 0.09765625 }, { "epoch": 0.10351987344275262, "step": 1047, "train/total_loss": 0.1773347705602646 }, { "entropy": 8.922216415405273, "epoch": 0.10361874629226814, "mean_token_accuracy": 0.7060241103172302, "num_tokens": 5460920.0, "step": 1048, "train/ce_loss": 0.6458163857460022 }, { "epoch": 0.10361874629226814, "step": 1048, "train/sim_loss": 0.09375 }, { "epoch": 0.10361874629226814, "step": 1048, "train/total_loss": 0.15833163261413574 }, { "entropy": 9.688494682312012, "epoch": 0.10371761914178367, "mean_token_accuracy": 0.7666666507720947, "num_tokens": 5465943.0, "step": 1049, "train/ce_loss": 3.978270251536742e-05 }, { "epoch": 0.10371761914178367, "step": 1049, "train/sim_loss": 0.08203125 }, { "epoch": 0.10371761914178367, "step": 1049, "train/total_loss": 0.08203522861003876 }, { "entropy": 9.07177734375, "epoch": 0.10381649199129919, "mean_token_accuracy": 0.7392638325691223, "num_tokens": 5471426.0, "step": 1050, "train/ce_loss": 0.9018810391426086 }, { "epoch": 0.10381649199129919, "step": 1050, "train/sim_loss": 0.1015625 }, { "epoch": 0.10381649199129919, "step": 1050, "train/total_loss": 0.19175061583518982 }, { "entropy": 9.343345642089844, "epoch": 0.1039153648408147, "mean_token_accuracy": 0.7020023465156555, "num_tokens": 5476722.0, "step": 1051, "train/ce_loss": 1.2637797594070435 }, { "epoch": 0.1039153648408147, "step": 1051, "train/sim_loss": 0.0546875 }, { "epoch": 0.1039153648408147, "step": 1051, "train/total_loss": 0.18106548488140106 }, { "entropy": 9.867258071899414, "epoch": 0.10401423769033023, "mean_token_accuracy": 0.7724252343177795, "num_tokens": 5481763.0, "step": 1052, "train/ce_loss": 6.280098750721663e-05 }, { "epoch": 0.10401423769033023, "step": 1052, "train/sim_loss": 0.07421875 }, { "epoch": 0.10401423769033023, "step": 1052, "train/total_loss": 0.0742250308394432 }, { "entropy": 9.537542343139648, "epoch": 0.10411311053984576, "mean_token_accuracy": 0.7412280440330505, "num_tokens": 5486868.0, "step": 1053, "train/ce_loss": 0.6791310906410217 }, { "epoch": 0.10411311053984576, "step": 1053, "train/sim_loss": 0.02734375 }, { "epoch": 0.10411311053984576, "step": 1053, "train/total_loss": 0.09525685757398605 }, { "entropy": 9.18380355834961, "epoch": 0.10421198338936129, "mean_token_accuracy": 0.7676130533218384, "num_tokens": 5492286.0, "step": 1054, "train/ce_loss": 0.6990429162979126 }, { "epoch": 0.10421198338936129, "step": 1054, "train/sim_loss": 0.0390625 }, { "epoch": 0.10421198338936129, "step": 1054, "train/total_loss": 0.10896679013967514 }, { "entropy": 9.26451301574707, "epoch": 0.1043108562388768, "mean_token_accuracy": 0.7121034264564514, "num_tokens": 5497607.0, "step": 1055, "train/ce_loss": 1.6029696464538574 }, { "epoch": 0.1043108562388768, "step": 1055, "train/sim_loss": 0.0546875 }, { "epoch": 0.1043108562388768, "step": 1055, "train/total_loss": 0.2149844616651535 }, { "entropy": 9.994356155395508, "epoch": 0.10440972908839233, "mean_token_accuracy": 0.7188118696212769, "num_tokens": 5502539.0, "step": 1056, "train/ce_loss": 0.8876862525939941 }, { "epoch": 0.10440972908839233, "step": 1056, "train/sim_loss": 0.0703125 }, { "epoch": 0.10440972908839233, "step": 1056, "train/total_loss": 0.1590811312198639 }, { "entropy": 9.06218147277832, "epoch": 0.10450860193790786, "mean_token_accuracy": 0.7263681888580322, "num_tokens": 5508019.0, "step": 1057, "train/ce_loss": 0.6542841196060181 }, { "epoch": 0.10450860193790786, "step": 1057, "train/sim_loss": 0.140625 }, { "epoch": 0.10450860193790786, "step": 1057, "train/total_loss": 0.20605340600013733 }, { "entropy": 9.794883728027344, "epoch": 0.10460747478742337, "mean_token_accuracy": 0.7626811861991882, "num_tokens": 5513039.0, "step": 1058, "train/ce_loss": 0.9645821452140808 }, { "epoch": 0.10460747478742337, "step": 1058, "train/sim_loss": 0.05078125 }, { "epoch": 0.10460747478742337, "step": 1058, "train/total_loss": 0.14723947644233704 }, { "entropy": 8.95814323425293, "epoch": 0.1047063476369389, "mean_token_accuracy": 0.7363238334655762, "num_tokens": 5518471.0, "step": 1059, "train/ce_loss": 0.8257557153701782 }, { "epoch": 0.1047063476369389, "step": 1059, "train/sim_loss": 0.125 }, { "epoch": 0.1047063476369389, "step": 1059, "train/total_loss": 0.20757557451725006 }, { "epoch": 0.10480522048645442, "grad_norm": 0.9177259206771851, "learning_rate": 9.740641843445583e-06, "loss": 0.1816, "step": 1060 }, { "entropy": 9.62043571472168, "epoch": 0.10480522048645442, "mean_token_accuracy": 0.8195956349372864, "num_tokens": 5523555.0, "step": 1060, "train/ce_loss": 0.784421443939209 }, { "epoch": 0.10480522048645442, "step": 1060, "train/sim_loss": 0.0390625 }, { "epoch": 0.10480522048645442, "step": 1060, "train/total_loss": 0.11750464886426926 }, { "entropy": 9.526849746704102, "epoch": 0.10490409333596994, "mean_token_accuracy": 0.7537190318107605, "num_tokens": 5528557.0, "step": 1061, "train/ce_loss": 1.3954969644546509 }, { "epoch": 0.10490409333596994, "step": 1061, "train/sim_loss": 0.09375 }, { "epoch": 0.10490409333596994, "step": 1061, "train/total_loss": 0.23329970240592957 }, { "entropy": 9.389505386352539, "epoch": 0.10500296618548546, "mean_token_accuracy": 0.7473053932189941, "num_tokens": 5533850.0, "step": 1062, "train/ce_loss": 0.5176913142204285 }, { "epoch": 0.10500296618548546, "step": 1062, "train/sim_loss": 0.07421875 }, { "epoch": 0.10500296618548546, "step": 1062, "train/total_loss": 0.12598788738250732 }, { "entropy": 9.29703140258789, "epoch": 0.10510183903500099, "mean_token_accuracy": 0.7314285635948181, "num_tokens": 5539182.0, "step": 1063, "train/ce_loss": 0.7778090238571167 }, { "epoch": 0.10510183903500099, "step": 1063, "train/sim_loss": 0.0859375 }, { "epoch": 0.10510183903500099, "step": 1063, "train/total_loss": 0.16371840238571167 }, { "entropy": 9.751190185546875, "epoch": 0.10520071188451652, "mean_token_accuracy": 0.708053708076477, "num_tokens": 5544175.0, "step": 1064, "train/ce_loss": 2.324305295944214 }, { "epoch": 0.10520071188451652, "step": 1064, "train/sim_loss": 0.09375 }, { "epoch": 0.10520071188451652, "step": 1064, "train/total_loss": 0.32618051767349243 }, { "entropy": 10.641530990600586, "epoch": 0.10529958473403203, "mean_token_accuracy": 0.7652581930160522, "num_tokens": 5548778.0, "step": 1065, "train/ce_loss": 0.00010860838665394112 }, { "epoch": 0.10529958473403203, "step": 1065, "train/sim_loss": 0.078125 }, { "epoch": 0.10529958473403203, "step": 1065, "train/total_loss": 0.07813586294651031 }, { "entropy": 9.428884506225586, "epoch": 0.10539845758354756, "mean_token_accuracy": 0.7327935099601746, "num_tokens": 5553960.0, "step": 1066, "train/ce_loss": 0.9732573628425598 }, { "epoch": 0.10539845758354756, "step": 1066, "train/sim_loss": 0.109375 }, { "epoch": 0.10539845758354756, "step": 1066, "train/total_loss": 0.20670074224472046 }, { "entropy": 9.512097358703613, "epoch": 0.10549733043306309, "mean_token_accuracy": 0.7585752010345459, "num_tokens": 5559205.0, "step": 1067, "train/ce_loss": 1.512286901473999 }, { "epoch": 0.10549733043306309, "step": 1067, "train/sim_loss": 0.09375 }, { "epoch": 0.10549733043306309, "step": 1067, "train/total_loss": 0.24497869610786438 }, { "entropy": 9.436140060424805, "epoch": 0.1055962032825786, "mean_token_accuracy": 0.7049382925033569, "num_tokens": 5564461.0, "step": 1068, "train/ce_loss": 0.7769739031791687 }, { "epoch": 0.1055962032825786, "step": 1068, "train/sim_loss": 0.0625 }, { "epoch": 0.1055962032825786, "step": 1068, "train/total_loss": 0.14019739627838135 }, { "entropy": 9.738252639770508, "epoch": 0.10569507613209413, "mean_token_accuracy": 0.716586172580719, "num_tokens": 5569521.0, "step": 1069, "train/ce_loss": 0.6802010536193848 }, { "epoch": 0.10569507613209413, "step": 1069, "train/sim_loss": 0.12109375 }, { "epoch": 0.10569507613209413, "step": 1069, "train/total_loss": 0.18911385536193848 }, { "entropy": 9.438884735107422, "epoch": 0.10579394898160965, "mean_token_accuracy": 0.6901004314422607, "num_tokens": 5574704.0, "step": 1070, "train/ce_loss": 1.1930865049362183 }, { "epoch": 0.10579394898160965, "step": 1070, "train/sim_loss": 0.171875 }, { "epoch": 0.10579394898160965, "step": 1070, "train/total_loss": 0.2911836504936218 }, { "entropy": 8.921512603759766, "epoch": 0.10589282183112517, "mean_token_accuracy": 0.6904761791229248, "num_tokens": 5580112.0, "step": 1071, "train/ce_loss": 0.835815966129303 }, { "epoch": 0.10589282183112517, "step": 1071, "train/sim_loss": 0.08203125 }, { "epoch": 0.10589282183112517, "step": 1071, "train/total_loss": 0.1656128466129303 }, { "entropy": 10.20355224609375, "epoch": 0.1059916946806407, "mean_token_accuracy": 0.6859122514724731, "num_tokens": 5584972.0, "step": 1072, "train/ce_loss": 3.075763743254356e-05 }, { "epoch": 0.1059916946806407, "step": 1072, "train/sim_loss": 0.078125 }, { "epoch": 0.1059916946806407, "step": 1072, "train/total_loss": 0.07812807708978653 }, { "entropy": 9.451912879943848, "epoch": 0.10609056753015622, "mean_token_accuracy": 0.7582260370254517, "num_tokens": 5590117.0, "step": 1073, "train/ce_loss": 0.6548001170158386 }, { "epoch": 0.10609056753015622, "step": 1073, "train/sim_loss": 0.05078125 }, { "epoch": 0.10609056753015622, "step": 1073, "train/total_loss": 0.11626126617193222 }, { "entropy": 10.318904876708984, "epoch": 0.10618944037967175, "mean_token_accuracy": 0.716292142868042, "num_tokens": 5594851.0, "step": 1074, "train/ce_loss": 2.4563345909118652 }, { "epoch": 0.10618944037967175, "step": 1074, "train/sim_loss": 0.171875 }, { "epoch": 0.10618944037967175, "step": 1074, "train/total_loss": 0.41750848293304443 }, { "entropy": 9.271596908569336, "epoch": 0.10628831322918726, "mean_token_accuracy": 0.7515225410461426, "num_tokens": 5600196.0, "step": 1075, "train/ce_loss": 0.5061623454093933 }, { "epoch": 0.10628831322918726, "step": 1075, "train/sim_loss": 0.09375 }, { "epoch": 0.10628831322918726, "step": 1075, "train/total_loss": 0.14436623454093933 }, { "entropy": 10.099292755126953, "epoch": 0.10638718607870279, "mean_token_accuracy": 0.7397260069847107, "num_tokens": 5605059.0, "step": 1076, "train/ce_loss": 1.4282630681991577 }, { "epoch": 0.10638718607870279, "step": 1076, "train/sim_loss": 0.0703125 }, { "epoch": 0.10638718607870279, "step": 1076, "train/total_loss": 0.21313880383968353 }, { "entropy": 9.397590637207031, "epoch": 0.10648605892821832, "mean_token_accuracy": 0.7211055159568787, "num_tokens": 5610301.0, "step": 1077, "train/ce_loss": 1.4666411876678467 }, { "epoch": 0.10648605892821832, "step": 1077, "train/sim_loss": 0.11328125 }, { "epoch": 0.10648605892821832, "step": 1077, "train/total_loss": 0.2599453926086426 }, { "entropy": 9.761058807373047, "epoch": 0.10658493177773383, "mean_token_accuracy": 0.8031495809555054, "num_tokens": 5615385.0, "step": 1078, "train/ce_loss": 2.565521572250873e-05 }, { "epoch": 0.10658493177773383, "step": 1078, "train/sim_loss": 0.0703125 }, { "epoch": 0.10658493177773383, "step": 1078, "train/total_loss": 0.07031506299972534 }, { "entropy": 9.661194801330566, "epoch": 0.10668380462724936, "mean_token_accuracy": 0.7828054428100586, "num_tokens": 5620426.0, "step": 1079, "train/ce_loss": 0.5483853816986084 }, { "epoch": 0.10668380462724936, "step": 1079, "train/sim_loss": 0.05859375 }, { "epoch": 0.10668380462724936, "step": 1079, "train/total_loss": 0.11343228816986084 }, { "epoch": 0.10678267747676488, "grad_norm": 0.8281675577163696, "learning_rate": 9.735696978687634e-06, "loss": 0.1665, "step": 1080 }, { "entropy": 9.907204627990723, "epoch": 0.10678267747676488, "mean_token_accuracy": 0.8071428537368774, "num_tokens": 5625303.0, "step": 1080, "train/ce_loss": 0.878250002861023 }, { "epoch": 0.10678267747676488, "step": 1080, "train/sim_loss": 0.0546875 }, { "epoch": 0.10678267747676488, "step": 1080, "train/total_loss": 0.1425125002861023 }, { "entropy": 9.570771217346191, "epoch": 0.1068815503262804, "mean_token_accuracy": 0.7329462766647339, "num_tokens": 5630459.0, "step": 1081, "train/ce_loss": 0.875616192817688 }, { "epoch": 0.1068815503262804, "step": 1081, "train/sim_loss": 0.09765625 }, { "epoch": 0.1068815503262804, "step": 1081, "train/total_loss": 0.18521787226200104 }, { "entropy": 9.296771049499512, "epoch": 0.10698042317579592, "mean_token_accuracy": 0.7612107396125793, "num_tokens": 5635751.0, "step": 1082, "train/ce_loss": 1.1663848161697388 }, { "epoch": 0.10698042317579592, "step": 1082, "train/sim_loss": 0.0859375 }, { "epoch": 0.10698042317579592, "step": 1082, "train/total_loss": 0.20257598161697388 }, { "entropy": 9.091501235961914, "epoch": 0.10707929602531145, "mean_token_accuracy": 0.7034631967544556, "num_tokens": 5641138.0, "step": 1083, "train/ce_loss": 1.280279278755188 }, { "epoch": 0.10707929602531145, "step": 1083, "train/sim_loss": 0.10546875 }, { "epoch": 0.10707929602531145, "step": 1083, "train/total_loss": 0.23349668085575104 }, { "entropy": 9.119293212890625, "epoch": 0.10717816887482698, "mean_token_accuracy": 0.7681970596313477, "num_tokens": 5646516.0, "step": 1084, "train/ce_loss": 0.7127261757850647 }, { "epoch": 0.10717816887482698, "step": 1084, "train/sim_loss": 0.07421875 }, { "epoch": 0.10717816887482698, "step": 1084, "train/total_loss": 0.145491361618042 }, { "entropy": 9.799027442932129, "epoch": 0.10727704172434249, "mean_token_accuracy": 0.6939102411270142, "num_tokens": 5651551.0, "step": 1085, "train/ce_loss": 1.602504014968872 }, { "epoch": 0.10727704172434249, "step": 1085, "train/sim_loss": 0.08984375 }, { "epoch": 0.10727704172434249, "step": 1085, "train/total_loss": 0.2500941753387451 }, { "entropy": 9.298020362854004, "epoch": 0.10737591457385802, "mean_token_accuracy": 0.6997166872024536, "num_tokens": 5656746.0, "step": 1086, "train/ce_loss": 1.2978819608688354 }, { "epoch": 0.10737591457385802, "step": 1086, "train/sim_loss": 0.08984375 }, { "epoch": 0.10737591457385802, "step": 1086, "train/total_loss": 0.21963195502758026 }, { "entropy": 9.827201843261719, "epoch": 0.10747478742337355, "mean_token_accuracy": 0.771019697189331, "num_tokens": 5661765.0, "step": 1087, "train/ce_loss": 0.9341949224472046 }, { "epoch": 0.10747478742337355, "step": 1087, "train/sim_loss": 0.0546875 }, { "epoch": 0.10747478742337355, "step": 1087, "train/total_loss": 0.14810699224472046 }, { "entropy": 9.570361137390137, "epoch": 0.10757366027288906, "mean_token_accuracy": 0.7847533822059631, "num_tokens": 5666913.0, "step": 1088, "train/ce_loss": 4.315694241086021e-05 }, { "epoch": 0.10757366027288906, "step": 1088, "train/sim_loss": 0.0859375 }, { "epoch": 0.10757366027288906, "step": 1088, "train/total_loss": 0.08594181388616562 }, { "entropy": 9.161052703857422, "epoch": 0.10767253312240459, "mean_token_accuracy": 0.7218044996261597, "num_tokens": 5672177.0, "step": 1089, "train/ce_loss": 0.7218360304832458 }, { "epoch": 0.10767253312240459, "step": 1089, "train/sim_loss": 0.0625 }, { "epoch": 0.10767253312240459, "step": 1089, "train/total_loss": 0.13468360900878906 }, { "entropy": 9.451432228088379, "epoch": 0.10777140597192011, "mean_token_accuracy": 0.6954177618026733, "num_tokens": 5677299.0, "step": 1090, "train/ce_loss": 0.6777337193489075 }, { "epoch": 0.10777140597192011, "step": 1090, "train/sim_loss": 0.0625 }, { "epoch": 0.10777140597192011, "step": 1090, "train/total_loss": 0.13027337193489075 }, { "entropy": 9.760773658752441, "epoch": 0.10787027882143563, "mean_token_accuracy": 0.746691882610321, "num_tokens": 5682235.0, "step": 1091, "train/ce_loss": 1.132237434387207 }, { "epoch": 0.10787027882143563, "step": 1091, "train/sim_loss": 0.078125 }, { "epoch": 0.10787027882143563, "step": 1091, "train/total_loss": 0.19134874641895294 }, { "entropy": 9.321340560913086, "epoch": 0.10796915167095116, "mean_token_accuracy": 0.7540760636329651, "num_tokens": 5687477.0, "step": 1092, "train/ce_loss": 0.7502046823501587 }, { "epoch": 0.10796915167095116, "step": 1092, "train/sim_loss": 0.10546875 }, { "epoch": 0.10796915167095116, "step": 1092, "train/total_loss": 0.1804892122745514 }, { "entropy": 9.607926368713379, "epoch": 0.10806802452046668, "mean_token_accuracy": 0.6939597129821777, "num_tokens": 5692664.0, "step": 1093, "train/ce_loss": 1.2573391199111938 }, { "epoch": 0.10806802452046668, "step": 1093, "train/sim_loss": 0.1171875 }, { "epoch": 0.10806802452046668, "step": 1093, "train/total_loss": 0.24292141199111938 }, { "entropy": 9.303705215454102, "epoch": 0.10816689736998221, "mean_token_accuracy": 0.7413554787635803, "num_tokens": 5697887.0, "step": 1094, "train/ce_loss": 0.9573739171028137 }, { "epoch": 0.10816689736998221, "step": 1094, "train/sim_loss": 0.0859375 }, { "epoch": 0.10816689736998221, "step": 1094, "train/total_loss": 0.18167489767074585 }, { "entropy": 9.43138313293457, "epoch": 0.10826577021949772, "mean_token_accuracy": 0.714102566242218, "num_tokens": 5703140.0, "step": 1095, "train/ce_loss": 0.4888365864753723 }, { "epoch": 0.10826577021949772, "step": 1095, "train/sim_loss": 0.05859375 }, { "epoch": 0.10826577021949772, "step": 1095, "train/total_loss": 0.10747741162776947 }, { "entropy": 9.34381103515625, "epoch": 0.10836464306901325, "mean_token_accuracy": 0.7085561752319336, "num_tokens": 5708319.0, "step": 1096, "train/ce_loss": 1.3514018064597622e-05 }, { "epoch": 0.10836464306901325, "step": 1096, "train/sim_loss": 0.109375 }, { "epoch": 0.10836464306901325, "step": 1096, "train/total_loss": 0.10937634855508804 }, { "entropy": 9.446690559387207, "epoch": 0.10846351591852878, "mean_token_accuracy": 0.7742817997932434, "num_tokens": 5713462.0, "step": 1097, "train/ce_loss": 3.5229713830631226e-05 }, { "epoch": 0.10846351591852878, "step": 1097, "train/sim_loss": 0.08984375 }, { "epoch": 0.10846351591852878, "step": 1097, "train/total_loss": 0.08984727412462234 }, { "entropy": 9.997285842895508, "epoch": 0.10856238876804429, "mean_token_accuracy": 0.8060606122016907, "num_tokens": 5718357.0, "step": 1098, "train/ce_loss": 2.3086209694156423e-05 }, { "epoch": 0.10856238876804429, "step": 1098, "train/sim_loss": 0.04296875 }, { "epoch": 0.10856238876804429, "step": 1098, "train/total_loss": 0.042971059679985046 }, { "entropy": 9.321104049682617, "epoch": 0.10866126161755982, "mean_token_accuracy": 0.6739409565925598, "num_tokens": 5723572.0, "step": 1099, "train/ce_loss": 1.665620038693305e-05 }, { "epoch": 0.10866126161755982, "step": 1099, "train/sim_loss": 0.08984375 }, { "epoch": 0.10866126161755982, "step": 1099, "train/total_loss": 0.08984541893005371 }, { "epoch": 0.10876013446707535, "grad_norm": 1.111937403678894, "learning_rate": 9.730752113929684e-06, "loss": 0.1699, "step": 1100 }, { "entropy": 8.92409896850586, "epoch": 0.10876013446707535, "mean_token_accuracy": 0.6962025165557861, "num_tokens": 5729117.0, "step": 1100, "train/ce_loss": 0.6997928619384766 }, { "epoch": 0.10876013446707535, "step": 1100, "train/sim_loss": 0.125 }, { "epoch": 0.10876013446707535, "step": 1100, "train/total_loss": 0.19497928023338318 }, { "entropy": 9.248737335205078, "epoch": 0.10885900731659086, "mean_token_accuracy": 0.7200474739074707, "num_tokens": 5734411.0, "step": 1101, "train/ce_loss": 1.9167568683624268 }, { "epoch": 0.10885900731659086, "step": 1101, "train/sim_loss": 0.140625 }, { "epoch": 0.10885900731659086, "step": 1101, "train/total_loss": 0.33230069279670715 }, { "entropy": 9.988884925842285, "epoch": 0.10895788016610639, "mean_token_accuracy": 0.698113203048706, "num_tokens": 5739384.0, "step": 1102, "train/ce_loss": 4.440023985807784e-05 }, { "epoch": 0.10895788016610639, "step": 1102, "train/sim_loss": 0.08984375 }, { "epoch": 0.10895788016610639, "step": 1102, "train/total_loss": 0.08984819054603577 }, { "entropy": 9.443476676940918, "epoch": 0.10905675301562191, "mean_token_accuracy": 0.7716216444969177, "num_tokens": 5744567.0, "step": 1103, "train/ce_loss": 5.284923463477753e-05 }, { "epoch": 0.10905675301562191, "step": 1103, "train/sim_loss": 0.10546875 }, { "epoch": 0.10905675301562191, "step": 1103, "train/total_loss": 0.10547403246164322 }, { "entropy": 9.307315826416016, "epoch": 0.10915562586513744, "mean_token_accuracy": 0.7219387888908386, "num_tokens": 5749839.0, "step": 1104, "train/ce_loss": 0.724831223487854 }, { "epoch": 0.10915562586513744, "step": 1104, "train/sim_loss": 0.06640625 }, { "epoch": 0.10915562586513744, "step": 1104, "train/total_loss": 0.1388893723487854 }, { "entropy": 9.767234802246094, "epoch": 0.10925449871465295, "mean_token_accuracy": 0.7443868517875671, "num_tokens": 5754879.0, "step": 1105, "train/ce_loss": 0.9078450202941895 }, { "epoch": 0.10925449871465295, "step": 1105, "train/sim_loss": 0.0859375 }, { "epoch": 0.10925449871465295, "step": 1105, "train/total_loss": 0.17672200500965118 }, { "entropy": 9.019737243652344, "epoch": 0.10935337156416848, "mean_token_accuracy": 0.7746341228485107, "num_tokens": 5760364.0, "step": 1106, "train/ce_loss": 0.6363298892974854 }, { "epoch": 0.10935337156416848, "step": 1106, "train/sim_loss": 0.0625 }, { "epoch": 0.10935337156416848, "step": 1106, "train/total_loss": 0.126132994890213 }, { "entropy": 9.645170211791992, "epoch": 0.10945224441368401, "mean_token_accuracy": 0.7149681448936462, "num_tokens": 5765419.0, "step": 1107, "train/ce_loss": 1.4607880115509033 }, { "epoch": 0.10945224441368401, "step": 1107, "train/sim_loss": 0.10546875 }, { "epoch": 0.10945224441368401, "step": 1107, "train/total_loss": 0.25154757499694824 }, { "entropy": 9.698060989379883, "epoch": 0.10955111726319952, "mean_token_accuracy": 0.6975036859512329, "num_tokens": 5770503.0, "step": 1108, "train/ce_loss": 1.2893515825271606 }, { "epoch": 0.10955111726319952, "step": 1108, "train/sim_loss": 0.12890625 }, { "epoch": 0.10955111726319952, "step": 1108, "train/total_loss": 0.25784140825271606 }, { "entropy": 9.17084789276123, "epoch": 0.10964999011271505, "mean_token_accuracy": 0.743534505367279, "num_tokens": 5775872.0, "step": 1109, "train/ce_loss": 0.7339786887168884 }, { "epoch": 0.10964999011271505, "step": 1109, "train/sim_loss": 0.09375 }, { "epoch": 0.10964999011271505, "step": 1109, "train/total_loss": 0.16714787483215332 }, { "entropy": 9.010650634765625, "epoch": 0.10974886296223058, "mean_token_accuracy": 0.777063250541687, "num_tokens": 5781283.0, "step": 1110, "train/ce_loss": 0.5257327556610107 }, { "epoch": 0.10974886296223058, "step": 1110, "train/sim_loss": 0.0703125 }, { "epoch": 0.10974886296223058, "step": 1110, "train/total_loss": 0.12288577854633331 }, { "entropy": 8.9091796875, "epoch": 0.10984773581174609, "mean_token_accuracy": 0.7399380803108215, "num_tokens": 5786765.0, "step": 1111, "train/ce_loss": 1.0092471837997437 }, { "epoch": 0.10984773581174609, "step": 1111, "train/sim_loss": 0.1328125 }, { "epoch": 0.10984773581174609, "step": 1111, "train/total_loss": 0.23373723030090332 }, { "entropy": 9.311386108398438, "epoch": 0.10994660866126162, "mean_token_accuracy": 0.7463087439537048, "num_tokens": 5792003.0, "step": 1112, "train/ce_loss": 1.1881719827651978 }, { "epoch": 0.10994660866126162, "step": 1112, "train/sim_loss": 0.1171875 }, { "epoch": 0.10994660866126162, "step": 1112, "train/total_loss": 0.23600471019744873 }, { "entropy": 9.483650207519531, "epoch": 0.11004548151077714, "mean_token_accuracy": 0.6721556782722473, "num_tokens": 5797096.0, "step": 1113, "train/ce_loss": 0.7234494090080261 }, { "epoch": 0.11004548151077714, "step": 1113, "train/sim_loss": 0.109375 }, { "epoch": 0.11004548151077714, "step": 1113, "train/total_loss": 0.18171994388103485 }, { "entropy": 9.235286712646484, "epoch": 0.11014435436029267, "mean_token_accuracy": 0.7658593058586121, "num_tokens": 5802446.0, "step": 1114, "train/ce_loss": 0.5792244672775269 }, { "epoch": 0.11014435436029267, "step": 1114, "train/sim_loss": 0.125 }, { "epoch": 0.11014435436029267, "step": 1114, "train/total_loss": 0.18292245268821716 }, { "entropy": 9.276355743408203, "epoch": 0.11024322720980818, "mean_token_accuracy": 0.7150062918663025, "num_tokens": 5807718.0, "step": 1115, "train/ce_loss": 0.9252979159355164 }, { "epoch": 0.11024322720980818, "step": 1115, "train/sim_loss": 0.07421875 }, { "epoch": 0.11024322720980818, "step": 1115, "train/total_loss": 0.1667485535144806 }, { "entropy": 9.39252758026123, "epoch": 0.11034210005932371, "mean_token_accuracy": 0.6945288777351379, "num_tokens": 5812826.0, "step": 1116, "train/ce_loss": 1.2116296291351318 }, { "epoch": 0.11034210005932371, "step": 1116, "train/sim_loss": 0.05078125 }, { "epoch": 0.11034210005932371, "step": 1116, "train/total_loss": 0.17194421589374542 }, { "entropy": 9.744169235229492, "epoch": 0.11044097290883924, "mean_token_accuracy": 0.7659574747085571, "num_tokens": 5817847.0, "step": 1117, "train/ce_loss": 0.00010859971371246502 }, { "epoch": 0.11044097290883924, "step": 1117, "train/sim_loss": 0.125 }, { "epoch": 0.11044097290883924, "step": 1117, "train/total_loss": 0.12501086294651031 }, { "entropy": 8.795273780822754, "epoch": 0.11053984575835475, "mean_token_accuracy": 0.7181664109230042, "num_tokens": 5823707.0, "step": 1118, "train/ce_loss": 1.1598750352859497 }, { "epoch": 0.11053984575835475, "step": 1118, "train/sim_loss": 0.08984375 }, { "epoch": 0.11053984575835475, "step": 1118, "train/total_loss": 0.20583125948905945 }, { "entropy": 9.927274703979492, "epoch": 0.11063871860787028, "mean_token_accuracy": 0.6340996026992798, "num_tokens": 5828644.0, "step": 1119, "train/ce_loss": 0.0007295234245248139 }, { "epoch": 0.11063871860787028, "step": 1119, "train/sim_loss": 0.06640625 }, { "epoch": 0.11063871860787028, "step": 1119, "train/total_loss": 0.06647919863462448 }, { "epoch": 0.1107375914573858, "grad_norm": 1.2310987710952759, "learning_rate": 9.725807249171736e-06, "loss": 0.1763, "step": 1120 }, { "entropy": 9.093071937561035, "epoch": 0.1107375914573858, "mean_token_accuracy": 0.7035794258117676, "num_tokens": 5834016.0, "step": 1120, "train/ce_loss": 0.984878420829773 }, { "epoch": 0.1107375914573858, "step": 1120, "train/sim_loss": 0.1640625 }, { "epoch": 0.1107375914573858, "step": 1120, "train/total_loss": 0.26255035400390625 }, { "entropy": 9.372610092163086, "epoch": 0.11083646430690132, "mean_token_accuracy": 0.767175555229187, "num_tokens": 5839414.0, "step": 1121, "train/ce_loss": 0.6642642617225647 }, { "epoch": 0.11083646430690132, "step": 1121, "train/sim_loss": 0.09375 }, { "epoch": 0.11083646430690132, "step": 1121, "train/total_loss": 0.16017642617225647 }, { "entropy": 9.518269538879395, "epoch": 0.11093533715641685, "mean_token_accuracy": 0.7764227390289307, "num_tokens": 5844801.0, "step": 1122, "train/ce_loss": 1.3010786771774292 }, { "epoch": 0.11093533715641685, "step": 1122, "train/sim_loss": 0.14453125 }, { "epoch": 0.11093533715641685, "step": 1122, "train/total_loss": 0.2746391296386719 }, { "entropy": 9.283157348632812, "epoch": 0.11103421000593237, "mean_token_accuracy": 0.728380024433136, "num_tokens": 5850116.0, "step": 1123, "train/ce_loss": 1.0420175790786743 }, { "epoch": 0.11103421000593237, "step": 1123, "train/sim_loss": 0.0859375 }, { "epoch": 0.11103421000593237, "step": 1123, "train/total_loss": 0.1901392638683319 }, { "entropy": 9.071952819824219, "epoch": 0.11113308285544789, "mean_token_accuracy": 0.7816537618637085, "num_tokens": 5855276.0, "step": 1124, "train/ce_loss": 0.6013221144676208 }, { "epoch": 0.11113308285544789, "step": 1124, "train/sim_loss": 0.05859375 }, { "epoch": 0.11113308285544789, "step": 1124, "train/total_loss": 0.1187259629368782 }, { "entropy": 9.901144981384277, "epoch": 0.11123195570496341, "mean_token_accuracy": 0.7330754399299622, "num_tokens": 5860228.0, "step": 1125, "train/ce_loss": 1.0996679067611694 }, { "epoch": 0.11123195570496341, "step": 1125, "train/sim_loss": 0.0546875 }, { "epoch": 0.11123195570496341, "step": 1125, "train/total_loss": 0.16465428471565247 }, { "entropy": 9.006317138671875, "epoch": 0.11133082855447894, "mean_token_accuracy": 0.7103825211524963, "num_tokens": 5865653.0, "step": 1126, "train/ce_loss": 0.7401002049446106 }, { "epoch": 0.11133082855447894, "step": 1126, "train/sim_loss": 0.109375 }, { "epoch": 0.11133082855447894, "step": 1126, "train/total_loss": 0.18338501453399658 }, { "entropy": 9.19222640991211, "epoch": 0.11142970140399447, "mean_token_accuracy": 0.6580976843833923, "num_tokens": 5870913.0, "step": 1127, "train/ce_loss": 4.0744573198026046e-05 }, { "epoch": 0.11142970140399447, "step": 1127, "train/sim_loss": 0.0859375 }, { "epoch": 0.11142970140399447, "step": 1127, "train/total_loss": 0.08594157546758652 }, { "entropy": 9.887141227722168, "epoch": 0.11152857425350998, "mean_token_accuracy": 0.7782427072525024, "num_tokens": 5875820.0, "step": 1128, "train/ce_loss": 1.148012638092041 }, { "epoch": 0.11152857425350998, "step": 1128, "train/sim_loss": 0.11328125 }, { "epoch": 0.11152857425350998, "step": 1128, "train/total_loss": 0.22808250784873962 }, { "entropy": 9.06885814666748, "epoch": 0.11162744710302551, "mean_token_accuracy": 0.6842105388641357, "num_tokens": 5881218.0, "step": 1129, "train/ce_loss": 1.2860099077224731 }, { "epoch": 0.11162744710302551, "step": 1129, "train/sim_loss": 0.09375 }, { "epoch": 0.11162744710302551, "step": 1129, "train/total_loss": 0.22235099971294403 }, { "entropy": 9.067543029785156, "epoch": 0.11172631995254104, "mean_token_accuracy": 0.7619631886482239, "num_tokens": 5886545.0, "step": 1130, "train/ce_loss": 0.9816094636917114 }, { "epoch": 0.11172631995254104, "step": 1130, "train/sim_loss": 0.03515625 }, { "epoch": 0.11172631995254104, "step": 1130, "train/total_loss": 0.13331720232963562 }, { "entropy": 9.337303161621094, "epoch": 0.11182519280205655, "mean_token_accuracy": 0.7229336500167847, "num_tokens": 5892028.0, "step": 1131, "train/ce_loss": 1.4003444910049438 }, { "epoch": 0.11182519280205655, "step": 1131, "train/sim_loss": 0.09375 }, { "epoch": 0.11182519280205655, "step": 1131, "train/total_loss": 0.23378445208072662 }, { "entropy": 9.111893653869629, "epoch": 0.11192406565157208, "mean_token_accuracy": 0.7122060656547546, "num_tokens": 5897437.0, "step": 1132, "train/ce_loss": 0.9091646671295166 }, { "epoch": 0.11192406565157208, "step": 1132, "train/sim_loss": 0.09765625 }, { "epoch": 0.11192406565157208, "step": 1132, "train/total_loss": 0.1885727196931839 }, { "entropy": 9.357961654663086, "epoch": 0.1120229385010876, "mean_token_accuracy": 0.7841823101043701, "num_tokens": 5902671.0, "step": 1133, "train/ce_loss": 1.0447713136672974 }, { "epoch": 0.1120229385010876, "step": 1133, "train/sim_loss": 0.0546875 }, { "epoch": 0.1120229385010876, "step": 1133, "train/total_loss": 0.1591646373271942 }, { "entropy": 9.133411407470703, "epoch": 0.11212181135060312, "mean_token_accuracy": 0.7156652212142944, "num_tokens": 5908128.0, "step": 1134, "train/ce_loss": 1.4204862117767334 }, { "epoch": 0.11212181135060312, "step": 1134, "train/sim_loss": 0.10546875 }, { "epoch": 0.11212181135060312, "step": 1134, "train/total_loss": 0.24751737713813782 }, { "entropy": 10.052787780761719, "epoch": 0.11222068420011864, "mean_token_accuracy": 0.7367256879806519, "num_tokens": 5912934.0, "step": 1135, "train/ce_loss": 2.2169891963130794e-05 }, { "epoch": 0.11222068420011864, "step": 1135, "train/sim_loss": 0.03125 }, { "epoch": 0.11222068420011864, "step": 1135, "train/total_loss": 0.031252216547727585 }, { "entropy": 8.959535598754883, "epoch": 0.11231955704963417, "mean_token_accuracy": 0.709452748298645, "num_tokens": 5918441.0, "step": 1136, "train/ce_loss": 0.8237015604972839 }, { "epoch": 0.11231955704963417, "step": 1136, "train/sim_loss": 0.05078125 }, { "epoch": 0.11231955704963417, "step": 1136, "train/total_loss": 0.13315141201019287 }, { "entropy": 9.312917709350586, "epoch": 0.1124184298991497, "mean_token_accuracy": 0.7187864780426025, "num_tokens": 5923747.0, "step": 1137, "train/ce_loss": 1.4374332427978516 }, { "epoch": 0.1124184298991497, "step": 1137, "train/sim_loss": 0.09375 }, { "epoch": 0.1124184298991497, "step": 1137, "train/total_loss": 0.23749332129955292 }, { "entropy": 9.914706230163574, "epoch": 0.11251730274866521, "mean_token_accuracy": 0.76106196641922, "num_tokens": 5928933.0, "step": 1138, "train/ce_loss": 5.610586595139466e-05 }, { "epoch": 0.11251730274866521, "step": 1138, "train/sim_loss": 0.0703125 }, { "epoch": 0.11251730274866521, "step": 1138, "train/total_loss": 0.07031811028718948 }, { "entropy": 9.102922439575195, "epoch": 0.11261617559818074, "mean_token_accuracy": 0.7156862616539001, "num_tokens": 5934184.0, "step": 1139, "train/ce_loss": 0.8545843362808228 }, { "epoch": 0.11261617559818074, "step": 1139, "train/sim_loss": 0.1640625 }, { "epoch": 0.11261617559818074, "step": 1139, "train/total_loss": 0.2495209276676178 }, { "epoch": 0.11271504844769627, "grad_norm": 0.9209204912185669, "learning_rate": 9.720862384413787e-06, "loss": 0.178, "step": 1140 }, { "entropy": 9.652783393859863, "epoch": 0.11271504844769627, "mean_token_accuracy": 0.7483333349227905, "num_tokens": 5939261.0, "step": 1140, "train/ce_loss": 0.00013310209033079445 }, { "epoch": 0.11271504844769627, "step": 1140, "train/sim_loss": 0.03125 }, { "epoch": 0.11271504844769627, "step": 1140, "train/total_loss": 0.031263310462236404 }, { "entropy": 10.0499849319458, "epoch": 0.11281392129721178, "mean_token_accuracy": 0.7789757251739502, "num_tokens": 5944050.0, "step": 1141, "train/ce_loss": 0.001149828196503222 }, { "epoch": 0.11281392129721178, "step": 1141, "train/sim_loss": 0.09765625 }, { "epoch": 0.11281392129721178, "step": 1141, "train/total_loss": 0.09777123481035233 }, { "entropy": 9.451024055480957, "epoch": 0.11291279414672731, "mean_token_accuracy": 0.746268630027771, "num_tokens": 5949252.0, "step": 1142, "train/ce_loss": 0.7472849488258362 }, { "epoch": 0.11291279414672731, "step": 1142, "train/sim_loss": 0.09375 }, { "epoch": 0.11291279414672731, "step": 1142, "train/total_loss": 0.16847848892211914 }, { "entropy": 9.157025337219238, "epoch": 0.11301166699624283, "mean_token_accuracy": 0.686956524848938, "num_tokens": 5954442.0, "step": 1143, "train/ce_loss": 1.2585369348526 }, { "epoch": 0.11301166699624283, "step": 1143, "train/sim_loss": 0.140625 }, { "epoch": 0.11301166699624283, "step": 1143, "train/total_loss": 0.2664787173271179 }, { "entropy": 8.705760955810547, "epoch": 0.11311053984575835, "mean_token_accuracy": 0.727192223072052, "num_tokens": 5960065.0, "step": 1144, "train/ce_loss": 0.6710728406906128 }, { "epoch": 0.11311053984575835, "step": 1144, "train/sim_loss": 0.078125 }, { "epoch": 0.11311053984575835, "step": 1144, "train/total_loss": 0.14523229002952576 }, { "entropy": 9.306068420410156, "epoch": 0.11320941269527388, "mean_token_accuracy": 0.7318500876426697, "num_tokens": 5965508.0, "step": 1145, "train/ce_loss": 1.1127384901046753 }, { "epoch": 0.11320941269527388, "step": 1145, "train/sim_loss": 0.1015625 }, { "epoch": 0.11320941269527388, "step": 1145, "train/total_loss": 0.212836354970932 }, { "entropy": 9.491476058959961, "epoch": 0.1133082855447894, "mean_token_accuracy": 0.7203947305679321, "num_tokens": 5970603.0, "step": 1146, "train/ce_loss": 1.5402119970531203e-05 }, { "epoch": 0.1133082855447894, "step": 1146, "train/sim_loss": 0.08203125 }, { "epoch": 0.1133082855447894, "step": 1146, "train/total_loss": 0.08203279227018356 }, { "entropy": 8.89062786102295, "epoch": 0.11340715839430493, "mean_token_accuracy": 0.6906552314758301, "num_tokens": 5976012.0, "step": 1147, "train/ce_loss": 1.2451964616775513 }, { "epoch": 0.11340715839430493, "step": 1147, "train/sim_loss": 0.109375 }, { "epoch": 0.11340715839430493, "step": 1147, "train/total_loss": 0.23389464616775513 }, { "entropy": 8.958955764770508, "epoch": 0.11350603124382044, "mean_token_accuracy": 0.746051013469696, "num_tokens": 5981384.0, "step": 1148, "train/ce_loss": 0.6065599918365479 }, { "epoch": 0.11350603124382044, "step": 1148, "train/sim_loss": 0.0859375 }, { "epoch": 0.11350603124382044, "step": 1148, "train/total_loss": 0.14659349620342255 }, { "entropy": 9.490360260009766, "epoch": 0.11360490409333597, "mean_token_accuracy": 0.802431583404541, "num_tokens": 5986525.0, "step": 1149, "train/ce_loss": 1.0267833471298218 }, { "epoch": 0.11360490409333597, "step": 1149, "train/sim_loss": 0.05078125 }, { "epoch": 0.11360490409333597, "step": 1149, "train/total_loss": 0.1534595787525177 }, { "entropy": 10.175813674926758, "epoch": 0.1137037769428515, "mean_token_accuracy": 0.7017543911933899, "num_tokens": 5991269.0, "step": 1150, "train/ce_loss": 1.901901364326477 }, { "epoch": 0.1137037769428515, "step": 1150, "train/sim_loss": 0.0703125 }, { "epoch": 0.1137037769428515, "step": 1150, "train/total_loss": 0.2605026364326477 }, { "entropy": 8.89851188659668, "epoch": 0.11380264979236701, "mean_token_accuracy": 0.7866028547286987, "num_tokens": 5996778.0, "step": 1151, "train/ce_loss": 0.6228799223899841 }, { "epoch": 0.11380264979236701, "step": 1151, "train/sim_loss": 0.078125 }, { "epoch": 0.11380264979236701, "step": 1151, "train/total_loss": 0.14041298627853394 }, { "entropy": 9.506275177001953, "epoch": 0.11390152264188254, "mean_token_accuracy": 0.776068389415741, "num_tokens": 6001841.0, "step": 1152, "train/ce_loss": 1.0933926105499268 }, { "epoch": 0.11390152264188254, "step": 1152, "train/sim_loss": 0.11328125 }, { "epoch": 0.11390152264188254, "step": 1152, "train/total_loss": 0.22262051701545715 }, { "entropy": 9.191484451293945, "epoch": 0.11400039549139807, "mean_token_accuracy": 0.762326180934906, "num_tokens": 6007095.0, "step": 1153, "train/ce_loss": 0.8622652292251587 }, { "epoch": 0.11400039549139807, "step": 1153, "train/sim_loss": 0.0625 }, { "epoch": 0.11400039549139807, "step": 1153, "train/total_loss": 0.14872652292251587 }, { "entropy": 9.810675621032715, "epoch": 0.11409926834091358, "mean_token_accuracy": 0.7382671236991882, "num_tokens": 6012059.0, "step": 1154, "train/ce_loss": 0.6489583253860474 }, { "epoch": 0.11409926834091358, "step": 1154, "train/sim_loss": 0.0703125 }, { "epoch": 0.11409926834091358, "step": 1154, "train/total_loss": 0.1352083384990692 }, { "entropy": 9.498499870300293, "epoch": 0.1141981411904291, "mean_token_accuracy": 0.7010723948478699, "num_tokens": 6017238.0, "step": 1155, "train/ce_loss": 1.6045225858688354 }, { "epoch": 0.1141981411904291, "step": 1155, "train/sim_loss": 0.1171875 }, { "epoch": 0.1141981411904291, "step": 1155, "train/total_loss": 0.2776397466659546 }, { "entropy": 9.012279510498047, "epoch": 0.11429701403994463, "mean_token_accuracy": 0.7267683744430542, "num_tokens": 6022452.0, "step": 1156, "train/ce_loss": 1.3016753196716309 }, { "epoch": 0.11429701403994463, "step": 1156, "train/sim_loss": 0.12109375 }, { "epoch": 0.11429701403994463, "step": 1156, "train/total_loss": 0.25126129388809204 }, { "entropy": 9.472189903259277, "epoch": 0.11439588688946016, "mean_token_accuracy": 0.7290779948234558, "num_tokens": 6027586.0, "step": 1157, "train/ce_loss": 1.8474470376968384 }, { "epoch": 0.11439588688946016, "step": 1157, "train/sim_loss": 0.10546875 }, { "epoch": 0.11439588688946016, "step": 1157, "train/total_loss": 0.2902134656906128 }, { "entropy": 9.285955429077148, "epoch": 0.11449475973897567, "mean_token_accuracy": 0.7551020383834839, "num_tokens": 6032785.0, "step": 1158, "train/ce_loss": 0.9319847226142883 }, { "epoch": 0.11449475973897567, "step": 1158, "train/sim_loss": 0.0703125 }, { "epoch": 0.11449475973897567, "step": 1158, "train/total_loss": 0.1635109782218933 }, { "entropy": 8.998316764831543, "epoch": 0.1145936325884912, "mean_token_accuracy": 0.7122736573219299, "num_tokens": 6038253.0, "step": 1159, "train/ce_loss": 0.7804404497146606 }, { "epoch": 0.1145936325884912, "step": 1159, "train/sim_loss": 0.078125 }, { "epoch": 0.1145936325884912, "step": 1159, "train/total_loss": 0.15616905689239502 }, { "epoch": 0.11469250543800673, "grad_norm": 1.0438960790634155, "learning_rate": 9.715917519655839e-06, "loss": 0.1692, "step": 1160 }, { "entropy": 9.792644500732422, "epoch": 0.11469250543800673, "mean_token_accuracy": 0.7573657035827637, "num_tokens": 6043287.0, "step": 1160, "train/ce_loss": 0.9914917945861816 }, { "epoch": 0.11469250543800673, "step": 1160, "train/sim_loss": 0.06640625 }, { "epoch": 0.11469250543800673, "step": 1160, "train/total_loss": 0.1655554324388504 }, { "entropy": 9.431596755981445, "epoch": 0.11479137828752224, "mean_token_accuracy": 0.6828829050064087, "num_tokens": 6048297.0, "step": 1161, "train/ce_loss": 1.0226948261260986 }, { "epoch": 0.11479137828752224, "step": 1161, "train/sim_loss": 0.11328125 }, { "epoch": 0.11479137828752224, "step": 1161, "train/total_loss": 0.2155507355928421 }, { "entropy": 9.102705001831055, "epoch": 0.11489025113703777, "mean_token_accuracy": 0.755630612373352, "num_tokens": 6053670.0, "step": 1162, "train/ce_loss": 0.7495688796043396 }, { "epoch": 0.11489025113703777, "step": 1162, "train/sim_loss": 0.12109375 }, { "epoch": 0.11489025113703777, "step": 1162, "train/total_loss": 0.19605064392089844 }, { "entropy": 9.561321258544922, "epoch": 0.1149891239865533, "mean_token_accuracy": 0.7164804339408875, "num_tokens": 6058813.0, "step": 1163, "train/ce_loss": 1.1004800398950465e-05 }, { "epoch": 0.1149891239865533, "step": 1163, "train/sim_loss": 0.07421875 }, { "epoch": 0.1149891239865533, "step": 1163, "train/total_loss": 0.07421985268592834 }, { "entropy": 9.675859451293945, "epoch": 0.11508799683606881, "mean_token_accuracy": 0.6913996338844299, "num_tokens": 6063837.0, "step": 1164, "train/ce_loss": 2.5309915145044215e-05 }, { "epoch": 0.11508799683606881, "step": 1164, "train/sim_loss": 0.0390625 }, { "epoch": 0.11508799683606881, "step": 1164, "train/total_loss": 0.039065029472112656 }, { "entropy": 9.744415283203125, "epoch": 0.11518686968558434, "mean_token_accuracy": 0.7025547623634338, "num_tokens": 6068853.0, "step": 1165, "train/ce_loss": 1.094001293182373 }, { "epoch": 0.11518686968558434, "step": 1165, "train/sim_loss": 0.0859375 }, { "epoch": 0.11518686968558434, "step": 1165, "train/total_loss": 0.19533762335777283 }, { "entropy": 9.051517486572266, "epoch": 0.11528574253509986, "mean_token_accuracy": 0.7436781525611877, "num_tokens": 6074213.0, "step": 1166, "train/ce_loss": 1.2844007015228271 }, { "epoch": 0.11528574253509986, "step": 1166, "train/sim_loss": 0.09765625 }, { "epoch": 0.11528574253509986, "step": 1166, "train/total_loss": 0.22609631717205048 }, { "entropy": 9.856742858886719, "epoch": 0.11538461538461539, "mean_token_accuracy": 0.6978998184204102, "num_tokens": 6079217.0, "step": 1167, "train/ce_loss": 1.1069480180740356 }, { "epoch": 0.11538461538461539, "step": 1167, "train/sim_loss": 0.0546875 }, { "epoch": 0.11538461538461539, "step": 1167, "train/total_loss": 0.1653822958469391 }, { "entropy": 9.239913940429688, "epoch": 0.1154834882341309, "mean_token_accuracy": 0.7493917346000671, "num_tokens": 6084490.0, "step": 1168, "train/ce_loss": 0.7585064768791199 }, { "epoch": 0.1154834882341309, "step": 1168, "train/sim_loss": 0.0859375 }, { "epoch": 0.1154834882341309, "step": 1168, "train/total_loss": 0.16178815066814423 }, { "entropy": 9.63561725616455, "epoch": 0.11558236108364643, "mean_token_accuracy": 0.7750759720802307, "num_tokens": 6089607.0, "step": 1169, "train/ce_loss": 1.0800330638885498 }, { "epoch": 0.11558236108364643, "step": 1169, "train/sim_loss": 0.08984375 }, { "epoch": 0.11558236108364643, "step": 1169, "train/total_loss": 0.19784706830978394 }, { "entropy": 9.188960075378418, "epoch": 0.11568123393316196, "mean_token_accuracy": 0.6948717832565308, "num_tokens": 6094847.0, "step": 1170, "train/ce_loss": 0.8472950458526611 }, { "epoch": 0.11568123393316196, "step": 1170, "train/sim_loss": 0.0625 }, { "epoch": 0.11568123393316196, "step": 1170, "train/total_loss": 0.14722950756549835 }, { "entropy": 9.626472473144531, "epoch": 0.11578010678267747, "mean_token_accuracy": 0.7612403035163879, "num_tokens": 6099914.0, "step": 1171, "train/ce_loss": 1.5983554124832153 }, { "epoch": 0.11578010678267747, "step": 1171, "train/sim_loss": 0.06640625 }, { "epoch": 0.11578010678267747, "step": 1171, "train/total_loss": 0.226241797208786 }, { "entropy": 10.03702163696289, "epoch": 0.115878979632193, "mean_token_accuracy": 0.7382550239562988, "num_tokens": 6104787.0, "step": 1172, "train/ce_loss": 1.2654229402542114 }, { "epoch": 0.115878979632193, "step": 1172, "train/sim_loss": 0.03515625 }, { "epoch": 0.115878979632193, "step": 1172, "train/total_loss": 0.16169854998588562 }, { "entropy": 9.217288970947266, "epoch": 0.11597785248170853, "mean_token_accuracy": 0.7721238732337952, "num_tokens": 6110145.0, "step": 1173, "train/ce_loss": 0.9713622331619263 }, { "epoch": 0.11597785248170853, "step": 1173, "train/sim_loss": 0.15234375 }, { "epoch": 0.11597785248170853, "step": 1173, "train/total_loss": 0.2494799792766571 }, { "entropy": 9.51988410949707, "epoch": 0.11607672533122404, "mean_token_accuracy": 0.6904109716415405, "num_tokens": 6115359.0, "step": 1174, "train/ce_loss": 1.0642578601837158 }, { "epoch": 0.11607672533122404, "step": 1174, "train/sim_loss": 0.125 }, { "epoch": 0.11607672533122404, "step": 1174, "train/total_loss": 0.23142579197883606 }, { "entropy": 9.422069549560547, "epoch": 0.11617559818073957, "mean_token_accuracy": 0.6796116232872009, "num_tokens": 6120522.0, "step": 1175, "train/ce_loss": 1.9794244766235352 }, { "epoch": 0.11617559818073957, "step": 1175, "train/sim_loss": 0.12109375 }, { "epoch": 0.11617559818073957, "step": 1175, "train/total_loss": 0.31903618574142456 }, { "entropy": 9.484894752502441, "epoch": 0.1162744710302551, "mean_token_accuracy": 0.6625683307647705, "num_tokens": 6125659.0, "step": 1176, "train/ce_loss": 1.1174323844898026e-05 }, { "epoch": 0.1162744710302551, "step": 1176, "train/sim_loss": 0.04296875 }, { "epoch": 0.1162744710302551, "step": 1176, "train/total_loss": 0.04296986758708954 }, { "entropy": 8.861796379089355, "epoch": 0.11637334387977062, "mean_token_accuracy": 0.7552238702774048, "num_tokens": 6131122.0, "step": 1177, "train/ce_loss": 1.1340677738189697 }, { "epoch": 0.11637334387977062, "step": 1177, "train/sim_loss": 0.10546875 }, { "epoch": 0.11637334387977062, "step": 1177, "train/total_loss": 0.21887552738189697 }, { "entropy": 9.71383285522461, "epoch": 0.11647221672928613, "mean_token_accuracy": 0.7714285850524902, "num_tokens": 6136176.0, "step": 1178, "train/ce_loss": 0.6980844736099243 }, { "epoch": 0.11647221672928613, "step": 1178, "train/sim_loss": 0.1015625 }, { "epoch": 0.11647221672928613, "step": 1178, "train/total_loss": 0.1713709533214569 }, { "entropy": 9.452953338623047, "epoch": 0.11657108957880166, "mean_token_accuracy": 0.7136871218681335, "num_tokens": 6141338.0, "step": 1179, "train/ce_loss": 0.9732190370559692 }, { "epoch": 0.11657108957880166, "step": 1179, "train/sim_loss": 0.06640625 }, { "epoch": 0.11657108957880166, "step": 1179, "train/total_loss": 0.16372814774513245 }, { "epoch": 0.11666996242831719, "grad_norm": 1.0579614639282227, "learning_rate": 9.71097265489789e-06, "loss": 0.1707, "step": 1180 }, { "entropy": 9.792144775390625, "epoch": 0.11666996242831719, "mean_token_accuracy": 0.6960950493812561, "num_tokens": 6146359.0, "step": 1180, "train/ce_loss": 1.1791037321090698 }, { "epoch": 0.11666996242831719, "step": 1180, "train/sim_loss": 0.109375 }, { "epoch": 0.11666996242831719, "step": 1180, "train/total_loss": 0.22728538513183594 }, { "entropy": 10.021678924560547, "epoch": 0.1167688352778327, "mean_token_accuracy": 0.7434554696083069, "num_tokens": 6151194.0, "step": 1181, "train/ce_loss": 1.1177552938461304 }, { "epoch": 0.1167688352778327, "step": 1181, "train/sim_loss": 0.12890625 }, { "epoch": 0.1167688352778327, "step": 1181, "train/total_loss": 0.24068178236484528 }, { "entropy": 9.493809700012207, "epoch": 0.11686770812734823, "mean_token_accuracy": 0.73051518201828, "num_tokens": 6156397.0, "step": 1182, "train/ce_loss": 1.126855731010437 }, { "epoch": 0.11686770812734823, "step": 1182, "train/sim_loss": 0.046875 }, { "epoch": 0.11686770812734823, "step": 1182, "train/total_loss": 0.15956057608127594 }, { "entropy": 9.727752685546875, "epoch": 0.11696658097686376, "mean_token_accuracy": 0.7101200819015503, "num_tokens": 6161441.0, "step": 1183, "train/ce_loss": 1.4100347757339478 }, { "epoch": 0.11696658097686376, "step": 1183, "train/sim_loss": 0.0546875 }, { "epoch": 0.11696658097686376, "step": 1183, "train/total_loss": 0.19569097459316254 }, { "entropy": 9.780705451965332, "epoch": 0.11706545382637927, "mean_token_accuracy": 0.739051103591919, "num_tokens": 6166437.0, "step": 1184, "train/ce_loss": 1.4163333177566528 }, { "epoch": 0.11706545382637927, "step": 1184, "train/sim_loss": 0.1484375 }, { "epoch": 0.11706545382637927, "step": 1184, "train/total_loss": 0.2900708317756653 }, { "entropy": 9.239494323730469, "epoch": 0.1171643266758948, "mean_token_accuracy": 0.7134831547737122, "num_tokens": 6171772.0, "step": 1185, "train/ce_loss": 0.5141210556030273 }, { "epoch": 0.1171643266758948, "step": 1185, "train/sim_loss": 0.07421875 }, { "epoch": 0.1171643266758948, "step": 1185, "train/total_loss": 0.12563085556030273 }, { "entropy": 9.675355911254883, "epoch": 0.11726319952541032, "mean_token_accuracy": 0.7637795209884644, "num_tokens": 6176786.0, "step": 1186, "train/ce_loss": 1.1922436952590942 }, { "epoch": 0.11726319952541032, "step": 1186, "train/sim_loss": 0.0703125 }, { "epoch": 0.11726319952541032, "step": 1186, "train/total_loss": 0.18953686952590942 }, { "entropy": 9.679882049560547, "epoch": 0.11736207237492585, "mean_token_accuracy": 0.751655638217926, "num_tokens": 6181865.0, "step": 1187, "train/ce_loss": 6.324046262307093e-05 }, { "epoch": 0.11736207237492585, "step": 1187, "train/sim_loss": 0.05078125 }, { "epoch": 0.11736207237492585, "step": 1187, "train/total_loss": 0.05078757554292679 }, { "entropy": 9.549026489257812, "epoch": 0.11746094522444137, "mean_token_accuracy": 0.6795030832290649, "num_tokens": 6187096.0, "step": 1188, "train/ce_loss": 2.21201753616333 }, { "epoch": 0.11746094522444137, "step": 1188, "train/sim_loss": 0.1484375 }, { "epoch": 0.11746094522444137, "step": 1188, "train/total_loss": 0.3696392774581909 }, { "entropy": 10.246437072753906, "epoch": 0.11755981807395689, "mean_token_accuracy": 0.7467948794364929, "num_tokens": 6191811.0, "step": 1189, "train/ce_loss": 2.0585265159606934 }, { "epoch": 0.11755981807395689, "step": 1189, "train/sim_loss": 0.05859375 }, { "epoch": 0.11755981807395689, "step": 1189, "train/total_loss": 0.2644464075565338 }, { "entropy": 9.535202026367188, "epoch": 0.11765869092347242, "mean_token_accuracy": 0.7592319250106812, "num_tokens": 6196954.0, "step": 1190, "train/ce_loss": 1.5350197553634644 }, { "epoch": 0.11765869092347242, "step": 1190, "train/sim_loss": 0.11328125 }, { "epoch": 0.11765869092347242, "step": 1190, "train/total_loss": 0.2667832374572754 }, { "entropy": 9.761550903320312, "epoch": 0.11775756377298793, "mean_token_accuracy": 0.6554770469665527, "num_tokens": 6201988.0, "step": 1191, "train/ce_loss": 1.2218689918518066 }, { "epoch": 0.11775756377298793, "step": 1191, "train/sim_loss": 0.125 }, { "epoch": 0.11775756377298793, "step": 1191, "train/total_loss": 0.24718689918518066 }, { "entropy": 9.070679664611816, "epoch": 0.11785643662250346, "mean_token_accuracy": 0.7737818956375122, "num_tokens": 6207322.0, "step": 1192, "train/ce_loss": 0.7580581903457642 }, { "epoch": 0.11785643662250346, "step": 1192, "train/sim_loss": 0.09765625 }, { "epoch": 0.11785643662250346, "step": 1192, "train/total_loss": 0.17346206307411194 }, { "entropy": 9.435079574584961, "epoch": 0.11795530947201899, "mean_token_accuracy": 0.7184873819351196, "num_tokens": 6212482.0, "step": 1193, "train/ce_loss": 3.4512537240516394e-05 }, { "epoch": 0.11795530947201899, "step": 1193, "train/sim_loss": 0.06640625 }, { "epoch": 0.11795530947201899, "step": 1193, "train/total_loss": 0.06640969961881638 }, { "entropy": 9.026063919067383, "epoch": 0.1180541823215345, "mean_token_accuracy": 0.7972167134284973, "num_tokens": 6217954.0, "step": 1194, "train/ce_loss": 0.8212894201278687 }, { "epoch": 0.1180541823215345, "step": 1194, "train/sim_loss": 0.0859375 }, { "epoch": 0.1180541823215345, "step": 1194, "train/total_loss": 0.16806644201278687 }, { "entropy": 9.829679489135742, "epoch": 0.11815305517105003, "mean_token_accuracy": 0.8259385824203491, "num_tokens": 6223014.0, "step": 1195, "train/ce_loss": 1.441159110981971e-05 }, { "epoch": 0.11815305517105003, "step": 1195, "train/sim_loss": 0.0859375 }, { "epoch": 0.11815305517105003, "step": 1195, "train/total_loss": 0.0859389379620552 }, { "entropy": 9.668957710266113, "epoch": 0.11825192802056556, "mean_token_accuracy": 0.7221324443817139, "num_tokens": 6228082.0, "step": 1196, "train/ce_loss": 1.3952945664641447e-05 }, { "epoch": 0.11825192802056556, "step": 1196, "train/sim_loss": 0.02734375 }, { "epoch": 0.11825192802056556, "step": 1196, "train/total_loss": 0.027345145121216774 }, { "entropy": 8.92799186706543, "epoch": 0.11835080087008108, "mean_token_accuracy": 0.7669094800949097, "num_tokens": 6233533.0, "step": 1197, "train/ce_loss": 0.5769115090370178 }, { "epoch": 0.11835080087008108, "step": 1197, "train/sim_loss": 0.08984375 }, { "epoch": 0.11835080087008108, "step": 1197, "train/total_loss": 0.14753490686416626 }, { "entropy": 9.498449325561523, "epoch": 0.1184496737195966, "mean_token_accuracy": 0.73221755027771, "num_tokens": 6238696.0, "step": 1198, "train/ce_loss": 1.7542729377746582 }, { "epoch": 0.1184496737195966, "step": 1198, "train/sim_loss": 0.1171875 }, { "epoch": 0.1184496737195966, "step": 1198, "train/total_loss": 0.29261481761932373 }, { "entropy": 8.95993423461914, "epoch": 0.11854854656911212, "mean_token_accuracy": 0.7315508127212524, "num_tokens": 6244122.0, "step": 1199, "train/ce_loss": 0.6192775964736938 }, { "epoch": 0.11854854656911212, "step": 1199, "train/sim_loss": 0.05859375 }, { "epoch": 0.11854854656911212, "step": 1199, "train/total_loss": 0.12052151560783386 }, { "epoch": 0.11864741941862765, "grad_norm": 1.1604726314544678, "learning_rate": 9.70602779013994e-06, "loss": 0.1688, "step": 1200 }, { "entropy": 9.959648132324219, "epoch": 0.11864741941862765, "mean_token_accuracy": 0.804347813129425, "num_tokens": 6249015.0, "step": 1200, "train/ce_loss": 1.8018967239186168e-05 }, { "epoch": 0.11864741941862765, "step": 1200, "train/sim_loss": 0.02734375 }, { "epoch": 0.11864741941862765, "step": 1200, "train/total_loss": 0.027345551177859306 }, { "entropy": 9.165637969970703, "epoch": 0.11874629226814316, "mean_token_accuracy": 0.7649667263031006, "num_tokens": 6254365.0, "step": 1201, "train/ce_loss": 0.9161604046821594 }, { "epoch": 0.11874629226814316, "step": 1201, "train/sim_loss": 0.109375 }, { "epoch": 0.11874629226814316, "step": 1201, "train/total_loss": 0.20099103450775146 }, { "entropy": 9.422985076904297, "epoch": 0.11884516511765869, "mean_token_accuracy": 0.7157894968986511, "num_tokens": 6259554.0, "step": 1202, "train/ce_loss": 0.6448574066162109 }, { "epoch": 0.11884516511765869, "step": 1202, "train/sim_loss": 0.03515625 }, { "epoch": 0.11884516511765869, "step": 1202, "train/total_loss": 0.09964199364185333 }, { "entropy": 9.062889099121094, "epoch": 0.11894403796717422, "mean_token_accuracy": 0.7551020383834839, "num_tokens": 6265015.0, "step": 1203, "train/ce_loss": 0.49379420280456543 }, { "epoch": 0.11894403796717422, "step": 1203, "train/sim_loss": 0.11328125 }, { "epoch": 0.11894403796717422, "step": 1203, "train/total_loss": 0.16266067326068878 }, { "entropy": 9.328768730163574, "epoch": 0.11904291081668973, "mean_token_accuracy": 0.7580437660217285, "num_tokens": 6270231.0, "step": 1204, "train/ce_loss": 0.7451881170272827 }, { "epoch": 0.11904291081668973, "step": 1204, "train/sim_loss": 0.06640625 }, { "epoch": 0.11904291081668973, "step": 1204, "train/total_loss": 0.1409250646829605 }, { "entropy": 9.521407127380371, "epoch": 0.11914178366620526, "mean_token_accuracy": 0.7482993006706238, "num_tokens": 6275434.0, "step": 1205, "train/ce_loss": 0.8505513072013855 }, { "epoch": 0.11914178366620526, "step": 1205, "train/sim_loss": 0.08984375 }, { "epoch": 0.11914178366620526, "step": 1205, "train/total_loss": 0.1748988926410675 }, { "entropy": 9.026713371276855, "epoch": 0.11924065651572079, "mean_token_accuracy": 0.7139507532119751, "num_tokens": 6280762.0, "step": 1206, "train/ce_loss": 0.9918038249015808 }, { "epoch": 0.11924065651572079, "step": 1206, "train/sim_loss": 0.1015625 }, { "epoch": 0.11924065651572079, "step": 1206, "train/total_loss": 0.20074288547039032 }, { "entropy": 9.690537452697754, "epoch": 0.1193395293652363, "mean_token_accuracy": 0.7435529828071594, "num_tokens": 6285845.0, "step": 1207, "train/ce_loss": 0.7968851327896118 }, { "epoch": 0.1193395293652363, "step": 1207, "train/sim_loss": 0.1015625 }, { "epoch": 0.1193395293652363, "step": 1207, "train/total_loss": 0.18125101923942566 }, { "entropy": 9.778779029846191, "epoch": 0.11943840221475183, "mean_token_accuracy": 0.7252964377403259, "num_tokens": 6290782.0, "step": 1208, "train/ce_loss": 1.856166958808899 }, { "epoch": 0.11943840221475183, "step": 1208, "train/sim_loss": 0.1328125 }, { "epoch": 0.11943840221475183, "step": 1208, "train/total_loss": 0.31842920184135437 }, { "entropy": 9.426246643066406, "epoch": 0.11953727506426735, "mean_token_accuracy": 0.7288590669631958, "num_tokens": 6295967.0, "step": 1209, "train/ce_loss": 0.974338948726654 }, { "epoch": 0.11953727506426735, "step": 1209, "train/sim_loss": 0.05078125 }, { "epoch": 0.11953727506426735, "step": 1209, "train/total_loss": 0.1482151448726654 }, { "entropy": 8.880840301513672, "epoch": 0.11963614791378288, "mean_token_accuracy": 0.6835051774978638, "num_tokens": 6301374.0, "step": 1210, "train/ce_loss": 1.1998740434646606 }, { "epoch": 0.11963614791378288, "step": 1210, "train/sim_loss": 0.171875 }, { "epoch": 0.11963614791378288, "step": 1210, "train/total_loss": 0.2918623983860016 }, { "entropy": 9.126518249511719, "epoch": 0.1197350207632984, "mean_token_accuracy": 0.7554535269737244, "num_tokens": 6306708.0, "step": 1211, "train/ce_loss": 0.4112582504749298 }, { "epoch": 0.1197350207632984, "step": 1211, "train/sim_loss": 0.0546875 }, { "epoch": 0.1197350207632984, "step": 1211, "train/total_loss": 0.0958133265376091 }, { "entropy": 9.21739387512207, "epoch": 0.11983389361281392, "mean_token_accuracy": 0.7209011316299438, "num_tokens": 6311950.0, "step": 1212, "train/ce_loss": 1.0548444986343384 }, { "epoch": 0.11983389361281392, "step": 1212, "train/sim_loss": 0.08984375 }, { "epoch": 0.11983389361281392, "step": 1212, "train/total_loss": 0.19532820582389832 }, { "entropy": 9.517651557922363, "epoch": 0.11993276646232945, "mean_token_accuracy": 0.7482900023460388, "num_tokens": 6317089.0, "step": 1213, "train/ce_loss": 0.5034375786781311 }, { "epoch": 0.11993276646232945, "step": 1213, "train/sim_loss": 0.02734375 }, { "epoch": 0.11993276646232945, "step": 1213, "train/total_loss": 0.07768750935792923 }, { "entropy": 9.730310440063477, "epoch": 0.12003163931184496, "mean_token_accuracy": 0.6894639730453491, "num_tokens": 6322081.0, "step": 1214, "train/ce_loss": 1.6999226808547974 }, { "epoch": 0.12003163931184496, "step": 1214, "train/sim_loss": 0.109375 }, { "epoch": 0.12003163931184496, "step": 1214, "train/total_loss": 0.27936726808547974 }, { "entropy": 9.209872245788574, "epoch": 0.12013051216136049, "mean_token_accuracy": 0.755750298500061, "num_tokens": 6327402.0, "step": 1215, "train/ce_loss": 1.0638724565505981 }, { "epoch": 0.12013051216136049, "step": 1215, "train/sim_loss": 0.203125 }, { "epoch": 0.12013051216136049, "step": 1215, "train/total_loss": 0.30951225757598877 }, { "entropy": 9.435253143310547, "epoch": 0.12022938501087602, "mean_token_accuracy": 0.6762028336524963, "num_tokens": 6332575.0, "step": 1216, "train/ce_loss": 0.6653417944908142 }, { "epoch": 0.12022938501087602, "step": 1216, "train/sim_loss": 0.109375 }, { "epoch": 0.12022938501087602, "step": 1216, "train/total_loss": 0.17590919137001038 }, { "entropy": 9.680652618408203, "epoch": 0.12032825786039153, "mean_token_accuracy": 0.7171052694320679, "num_tokens": 6337618.0, "step": 1217, "train/ce_loss": 0.9547468423843384 }, { "epoch": 0.12032825786039153, "step": 1217, "train/sim_loss": 0.08984375 }, { "epoch": 0.12032825786039153, "step": 1217, "train/total_loss": 0.18531844019889832 }, { "entropy": 9.75971508026123, "epoch": 0.12042713070990706, "mean_token_accuracy": 0.7676767706871033, "num_tokens": 6342641.0, "step": 1218, "train/ce_loss": 1.0770344734191895 }, { "epoch": 0.12042713070990706, "step": 1218, "train/sim_loss": 0.0625 }, { "epoch": 0.12042713070990706, "step": 1218, "train/total_loss": 0.17020344734191895 }, { "entropy": 9.716499328613281, "epoch": 0.12052600355942258, "mean_token_accuracy": 0.7244367599487305, "num_tokens": 6347644.0, "step": 1219, "train/ce_loss": 1.3945896625518799 }, { "epoch": 0.12052600355942258, "step": 1219, "train/sim_loss": 0.08203125 }, { "epoch": 0.12052600355942258, "step": 1219, "train/total_loss": 0.22149021923542023 }, { "epoch": 0.12062487640893811, "grad_norm": 1.1249451637268066, "learning_rate": 9.701082925381992e-06, "loss": 0.1742, "step": 1220 }, { "entropy": 9.388711929321289, "epoch": 0.12062487640893811, "mean_token_accuracy": 0.7427440881729126, "num_tokens": 6352873.0, "step": 1220, "train/ce_loss": 0.7889469265937805 }, { "epoch": 0.12062487640893811, "step": 1220, "train/sim_loss": 0.0859375 }, { "epoch": 0.12062487640893811, "step": 1220, "train/total_loss": 0.164832204580307 }, { "entropy": 9.422139167785645, "epoch": 0.12072374925845362, "mean_token_accuracy": 0.735897421836853, "num_tokens": 6358135.0, "step": 1221, "train/ce_loss": 1.3089293241500854 }, { "epoch": 0.12072374925845362, "step": 1221, "train/sim_loss": 0.08984375 }, { "epoch": 0.12072374925845362, "step": 1221, "train/total_loss": 0.22073668241500854 }, { "entropy": 9.557997703552246, "epoch": 0.12082262210796915, "mean_token_accuracy": 0.7147541046142578, "num_tokens": 6363158.0, "step": 1222, "train/ce_loss": 0.9907953143119812 }, { "epoch": 0.12082262210796915, "step": 1222, "train/sim_loss": 0.06640625 }, { "epoch": 0.12082262210796915, "step": 1222, "train/total_loss": 0.16548578441143036 }, { "entropy": 9.598645210266113, "epoch": 0.12092149495748468, "mean_token_accuracy": 0.7001394629478455, "num_tokens": 6368329.0, "step": 1223, "train/ce_loss": 1.5701709985733032 }, { "epoch": 0.12092149495748468, "step": 1223, "train/sim_loss": 0.125 }, { "epoch": 0.12092149495748468, "step": 1223, "train/total_loss": 0.2820171117782593 }, { "entropy": 9.731266021728516, "epoch": 0.12102036780700019, "mean_token_accuracy": 0.7379181981086731, "num_tokens": 6373324.0, "step": 1224, "train/ce_loss": 3.4697419323492795e-05 }, { "epoch": 0.12102036780700019, "step": 1224, "train/sim_loss": 0.07421875 }, { "epoch": 0.12102036780700019, "step": 1224, "train/total_loss": 0.07422222197055817 }, { "entropy": 9.88830280303955, "epoch": 0.12111924065651572, "mean_token_accuracy": 0.752964437007904, "num_tokens": 6378294.0, "step": 1225, "train/ce_loss": 0.966044008731842 }, { "epoch": 0.12111924065651572, "step": 1225, "train/sim_loss": 0.14453125 }, { "epoch": 0.12111924065651572, "step": 1225, "train/total_loss": 0.24113565683364868 }, { "entropy": 9.497847557067871, "epoch": 0.12121811350603125, "mean_token_accuracy": 0.7132768630981445, "num_tokens": 6383478.0, "step": 1226, "train/ce_loss": 1.0445523262023926 }, { "epoch": 0.12121811350603125, "step": 1226, "train/sim_loss": 0.08203125 }, { "epoch": 0.12121811350603125, "step": 1226, "train/total_loss": 0.18648648262023926 }, { "entropy": 9.349191665649414, "epoch": 0.12131698635554676, "mean_token_accuracy": 0.7230169177055359, "num_tokens": 6388691.0, "step": 1227, "train/ce_loss": 0.7283400893211365 }, { "epoch": 0.12131698635554676, "step": 1227, "train/sim_loss": 0.0625 }, { "epoch": 0.12131698635554676, "step": 1227, "train/total_loss": 0.13533401489257812 }, { "entropy": 9.422361373901367, "epoch": 0.12141585920506229, "mean_token_accuracy": 0.7373096346855164, "num_tokens": 6393930.0, "step": 1228, "train/ce_loss": 0.3774206340312958 }, { "epoch": 0.12141585920506229, "step": 1228, "train/sim_loss": 0.06640625 }, { "epoch": 0.12141585920506229, "step": 1228, "train/total_loss": 0.10414831340312958 }, { "entropy": 9.410026550292969, "epoch": 0.12151473205457781, "mean_token_accuracy": 0.7279693484306335, "num_tokens": 6399156.0, "step": 1229, "train/ce_loss": 0.9712676405906677 }, { "epoch": 0.12151473205457781, "step": 1229, "train/sim_loss": 0.05078125 }, { "epoch": 0.12151473205457781, "step": 1229, "train/total_loss": 0.147908017039299 }, { "entropy": 10.178994178771973, "epoch": 0.12161360490409334, "mean_token_accuracy": 0.7486910820007324, "num_tokens": 6403792.0, "step": 1230, "train/ce_loss": 3.394787549972534 }, { "epoch": 0.12161360490409334, "step": 1230, "train/sim_loss": 0.140625 }, { "epoch": 0.12161360490409334, "step": 1230, "train/total_loss": 0.4801037609577179 }, { "entropy": 10.003273010253906, "epoch": 0.12171247775360886, "mean_token_accuracy": 0.6084656119346619, "num_tokens": 6408747.0, "step": 1231, "train/ce_loss": 1.4144474334898405e-05 }, { "epoch": 0.12171247775360886, "step": 1231, "train/sim_loss": 0.0703125 }, { "epoch": 0.12171247775360886, "step": 1231, "train/total_loss": 0.07031391561031342 }, { "entropy": 9.176398277282715, "epoch": 0.12181135060312438, "mean_token_accuracy": 0.6727467775344849, "num_tokens": 6414145.0, "step": 1232, "train/ce_loss": 0.8553107976913452 }, { "epoch": 0.12181135060312438, "step": 1232, "train/sim_loss": 0.09375 }, { "epoch": 0.12181135060312438, "step": 1232, "train/total_loss": 0.179281085729599 }, { "entropy": 9.400400161743164, "epoch": 0.12191022345263991, "mean_token_accuracy": 0.739130437374115, "num_tokens": 6419402.0, "step": 1233, "train/ce_loss": 1.1326872110366821 }, { "epoch": 0.12191022345263991, "step": 1233, "train/sim_loss": 0.09375 }, { "epoch": 0.12191022345263991, "step": 1233, "train/total_loss": 0.20701873302459717 }, { "entropy": 9.089184761047363, "epoch": 0.12200909630215542, "mean_token_accuracy": 0.7258883118629456, "num_tokens": 6424840.0, "step": 1234, "train/ce_loss": 0.7860023379325867 }, { "epoch": 0.12200909630215542, "step": 1234, "train/sim_loss": 0.05078125 }, { "epoch": 0.12200909630215542, "step": 1234, "train/total_loss": 0.1293814778327942 }, { "entropy": 9.444795608520508, "epoch": 0.12210796915167095, "mean_token_accuracy": 0.75, "num_tokens": 6429992.0, "step": 1235, "train/ce_loss": 0.5777448415756226 }, { "epoch": 0.12210796915167095, "step": 1235, "train/sim_loss": 0.046875 }, { "epoch": 0.12210796915167095, "step": 1235, "train/total_loss": 0.10464948415756226 }, { "entropy": 8.957151412963867, "epoch": 0.12220684200118648, "mean_token_accuracy": 0.7469262480735779, "num_tokens": 6435485.0, "step": 1236, "train/ce_loss": 0.6073027849197388 }, { "epoch": 0.12220684200118648, "step": 1236, "train/sim_loss": 0.109375 }, { "epoch": 0.12220684200118648, "step": 1236, "train/total_loss": 0.17010527849197388 }, { "entropy": 9.437676429748535, "epoch": 0.12230571485070199, "mean_token_accuracy": 0.7027778029441833, "num_tokens": 6440692.0, "step": 1237, "train/ce_loss": 0.6752130389213562 }, { "epoch": 0.12230571485070199, "step": 1237, "train/sim_loss": 0.0859375 }, { "epoch": 0.12230571485070199, "step": 1237, "train/total_loss": 0.15345880389213562 }, { "entropy": 9.129096984863281, "epoch": 0.12240458770021752, "mean_token_accuracy": 0.6656050682067871, "num_tokens": 6446127.0, "step": 1238, "train/ce_loss": 0.9888354539871216 }, { "epoch": 0.12240458770021752, "step": 1238, "train/sim_loss": 0.0625 }, { "epoch": 0.12240458770021752, "step": 1238, "train/total_loss": 0.16138353943824768 }, { "entropy": 9.386200904846191, "epoch": 0.12250346054973305, "mean_token_accuracy": 0.7580437660217285, "num_tokens": 6451381.0, "step": 1239, "train/ce_loss": 0.8257952332496643 }, { "epoch": 0.12250346054973305, "step": 1239, "train/sim_loss": 0.0703125 }, { "epoch": 0.12250346054973305, "step": 1239, "train/total_loss": 0.15289202332496643 }, { "epoch": 0.12260233339924857, "grad_norm": 0.992129385471344, "learning_rate": 9.696138060624043e-06, "loss": 0.1771, "step": 1240 }, { "entropy": 9.329833984375, "epoch": 0.12260233339924857, "mean_token_accuracy": 0.7079953551292419, "num_tokens": 6456712.0, "step": 1240, "train/ce_loss": 0.3363990783691406 }, { "epoch": 0.12260233339924857, "step": 1240, "train/sim_loss": 0.0546875 }, { "epoch": 0.12260233339924857, "step": 1240, "train/total_loss": 0.08832740783691406 }, { "entropy": 9.286294937133789, "epoch": 0.12270120624876409, "mean_token_accuracy": 0.7363834381103516, "num_tokens": 6462330.0, "step": 1241, "train/ce_loss": 1.0246698366245255e-05 }, { "epoch": 0.12270120624876409, "step": 1241, "train/sim_loss": 0.078125 }, { "epoch": 0.12270120624876409, "step": 1241, "train/total_loss": 0.07812602818012238 }, { "entropy": 9.592647552490234, "epoch": 0.12280007909827961, "mean_token_accuracy": 0.7633228898048401, "num_tokens": 6467407.0, "step": 1242, "train/ce_loss": 1.1297849416732788 }, { "epoch": 0.12280007909827961, "step": 1242, "train/sim_loss": 0.0625 }, { "epoch": 0.12280007909827961, "step": 1242, "train/total_loss": 0.1754784882068634 }, { "entropy": 9.25661849975586, "epoch": 0.12289895194779514, "mean_token_accuracy": 0.7703788876533508, "num_tokens": 6472744.0, "step": 1243, "train/ce_loss": 0.5336284637451172 }, { "epoch": 0.12289895194779514, "step": 1243, "train/sim_loss": 0.09375 }, { "epoch": 0.12289895194779514, "step": 1243, "train/total_loss": 0.14711284637451172 }, { "entropy": 9.378896713256836, "epoch": 0.12299782479731065, "mean_token_accuracy": 0.6879240274429321, "num_tokens": 6477966.0, "step": 1244, "train/ce_loss": 1.2743122577667236 }, { "epoch": 0.12299782479731065, "step": 1244, "train/sim_loss": 0.05859375 }, { "epoch": 0.12299782479731065, "step": 1244, "train/total_loss": 0.1860249787569046 }, { "entropy": 9.449660301208496, "epoch": 0.12309669764682618, "mean_token_accuracy": 0.7572559118270874, "num_tokens": 6483175.0, "step": 1245, "train/ce_loss": 3.446074333623983e-05 }, { "epoch": 0.12309669764682618, "step": 1245, "train/sim_loss": 0.0859375 }, { "epoch": 0.12309669764682618, "step": 1245, "train/total_loss": 0.08594094961881638 }, { "entropy": 9.363601684570312, "epoch": 0.12319557049634171, "mean_token_accuracy": 0.6921212077140808, "num_tokens": 6488457.0, "step": 1246, "train/ce_loss": 0.9129509329795837 }, { "epoch": 0.12319557049634171, "step": 1246, "train/sim_loss": 0.109375 }, { "epoch": 0.12319557049634171, "step": 1246, "train/total_loss": 0.20067009329795837 }, { "entropy": 9.186436653137207, "epoch": 0.12329444334585722, "mean_token_accuracy": 0.7436463832855225, "num_tokens": 6493826.0, "step": 1247, "train/ce_loss": 0.9415988922119141 }, { "epoch": 0.12329444334585722, "step": 1247, "train/sim_loss": 0.0625 }, { "epoch": 0.12329444334585722, "step": 1247, "train/total_loss": 0.15665990114212036 }, { "entropy": 9.832391738891602, "epoch": 0.12339331619537275, "mean_token_accuracy": 0.7275862097740173, "num_tokens": 6498876.0, "step": 1248, "train/ce_loss": 0.860569417476654 }, { "epoch": 0.12339331619537275, "step": 1248, "train/sim_loss": 0.06640625 }, { "epoch": 0.12339331619537275, "step": 1248, "train/total_loss": 0.15246319770812988 }, { "entropy": 9.70506477355957, "epoch": 0.12349218904488828, "mean_token_accuracy": 0.6977152824401855, "num_tokens": 6503856.0, "step": 1249, "train/ce_loss": 2.863835652533453e-05 }, { "epoch": 0.12349218904488828, "step": 1249, "train/sim_loss": 0.0625 }, { "epoch": 0.12349218904488828, "step": 1249, "train/total_loss": 0.06250286102294922 }, { "entropy": 9.391181945800781, "epoch": 0.1235910618944038, "mean_token_accuracy": 0.7162346243858337, "num_tokens": 6509076.0, "step": 1250, "train/ce_loss": 0.6494923233985901 }, { "epoch": 0.1235910618944038, "step": 1250, "train/sim_loss": 0.05078125 }, { "epoch": 0.1235910618944038, "step": 1250, "train/total_loss": 0.11573048681020737 }, { "entropy": 9.535764694213867, "epoch": 0.12368993474391932, "mean_token_accuracy": 0.754408061504364, "num_tokens": 6514262.0, "step": 1251, "train/ce_loss": 1.2194236516952515 }, { "epoch": 0.12368993474391932, "step": 1251, "train/sim_loss": 0.05859375 }, { "epoch": 0.12368993474391932, "step": 1251, "train/total_loss": 0.18053612112998962 }, { "entropy": 9.383031845092773, "epoch": 0.12378880759343484, "mean_token_accuracy": 0.7243173122406006, "num_tokens": 6519477.0, "step": 1252, "train/ce_loss": 0.6060498952865601 }, { "epoch": 0.12378880759343484, "step": 1252, "train/sim_loss": 0.05078125 }, { "epoch": 0.12378880759343484, "step": 1252, "train/total_loss": 0.111386239528656 }, { "entropy": 10.231584548950195, "epoch": 0.12388768044295037, "mean_token_accuracy": 0.7390300035476685, "num_tokens": 6524383.0, "step": 1253, "train/ce_loss": 1.7479231357574463 }, { "epoch": 0.12388768044295037, "step": 1253, "train/sim_loss": 0.0703125 }, { "epoch": 0.12388768044295037, "step": 1253, "train/total_loss": 0.2451048195362091 }, { "entropy": 9.92870044708252, "epoch": 0.12398655329246588, "mean_token_accuracy": 0.7698541283607483, "num_tokens": 6529570.0, "step": 1254, "train/ce_loss": 1.1913306479982566e-05 }, { "epoch": 0.12398655329246588, "step": 1254, "train/sim_loss": 0.03125 }, { "epoch": 0.12398655329246588, "step": 1254, "train/total_loss": 0.03125119209289551 }, { "entropy": 9.180500984191895, "epoch": 0.12408542614198141, "mean_token_accuracy": 0.7398189902305603, "num_tokens": 6534953.0, "step": 1255, "train/ce_loss": 0.7377535700798035 }, { "epoch": 0.12408542614198141, "step": 1255, "train/sim_loss": 0.07421875 }, { "epoch": 0.12408542614198141, "step": 1255, "train/total_loss": 0.14799410104751587 }, { "entropy": 9.337590217590332, "epoch": 0.12418429899149694, "mean_token_accuracy": 0.7755101919174194, "num_tokens": 6540321.0, "step": 1256, "train/ce_loss": 0.681896448135376 }, { "epoch": 0.12418429899149694, "step": 1256, "train/sim_loss": 0.04296875 }, { "epoch": 0.12418429899149694, "step": 1256, "train/total_loss": 0.11115839332342148 }, { "entropy": 9.066247940063477, "epoch": 0.12428317184101245, "mean_token_accuracy": 0.723243236541748, "num_tokens": 6545708.0, "step": 1257, "train/ce_loss": 1.0786329507827759 }, { "epoch": 0.12428317184101245, "step": 1257, "train/sim_loss": 0.109375 }, { "epoch": 0.12428317184101245, "step": 1257, "train/total_loss": 0.21723830699920654 }, { "entropy": 9.692575454711914, "epoch": 0.12438204469052798, "mean_token_accuracy": 0.7036011219024658, "num_tokens": 6550874.0, "step": 1258, "train/ce_loss": 1.9783855676651 }, { "epoch": 0.12438204469052798, "step": 1258, "train/sim_loss": 0.09765625 }, { "epoch": 0.12438204469052798, "step": 1258, "train/total_loss": 0.29549479484558105 }, { "entropy": 9.475704193115234, "epoch": 0.1244809175400435, "mean_token_accuracy": 0.7932098507881165, "num_tokens": 6555977.0, "step": 1259, "train/ce_loss": 1.2450464963912964 }, { "epoch": 0.1244809175400435, "step": 1259, "train/sim_loss": 0.140625 }, { "epoch": 0.1244809175400435, "step": 1259, "train/total_loss": 0.2651296555995941 }, { "epoch": 0.12457979038955903, "grad_norm": 1.0474660396575928, "learning_rate": 9.691193195866095e-06, "loss": 0.1628, "step": 1260 }, { "entropy": 9.542675018310547, "epoch": 0.12457979038955903, "mean_token_accuracy": 0.7211155295372009, "num_tokens": 6561183.0, "step": 1260, "train/ce_loss": 1.0661858320236206 }, { "epoch": 0.12457979038955903, "step": 1260, "train/sim_loss": 0.06640625 }, { "epoch": 0.12457979038955903, "step": 1260, "train/total_loss": 0.17302483320236206 }, { "entropy": 9.597604751586914, "epoch": 0.12467866323907455, "mean_token_accuracy": 0.795918345451355, "num_tokens": 6566282.0, "step": 1261, "train/ce_loss": 0.525081217288971 }, { "epoch": 0.12467866323907455, "step": 1261, "train/sim_loss": 0.0234375 }, { "epoch": 0.12467866323907455, "step": 1261, "train/total_loss": 0.07594562321901321 }, { "entropy": 9.565014839172363, "epoch": 0.12477753608859007, "mean_token_accuracy": 0.7484471797943115, "num_tokens": 6571362.0, "step": 1262, "train/ce_loss": 0.7680662870407104 }, { "epoch": 0.12477753608859007, "step": 1262, "train/sim_loss": 0.0546875 }, { "epoch": 0.12477753608859007, "step": 1262, "train/total_loss": 0.13149413466453552 }, { "entropy": 10.078500747680664, "epoch": 0.1248764089381056, "mean_token_accuracy": 0.7957746386528015, "num_tokens": 6576215.0, "step": 1263, "train/ce_loss": 2.5891031327773817e-05 }, { "epoch": 0.1248764089381056, "step": 1263, "train/sim_loss": 0.12890625 }, { "epoch": 0.1248764089381056, "step": 1263, "train/total_loss": 0.12890884280204773 }, { "entropy": 9.09901237487793, "epoch": 0.12497528178762111, "mean_token_accuracy": 0.744027316570282, "num_tokens": 6581545.0, "step": 1264, "train/ce_loss": 0.8924403786659241 }, { "epoch": 0.12497528178762111, "step": 1264, "train/sim_loss": 0.06640625 }, { "epoch": 0.12497528178762111, "step": 1264, "train/total_loss": 0.1556502878665924 }, { "entropy": 9.241326332092285, "epoch": 0.12507415463713664, "mean_token_accuracy": 0.7755101919174194, "num_tokens": 6586889.0, "step": 1265, "train/ce_loss": 0.770359456539154 }, { "epoch": 0.12507415463713664, "step": 1265, "train/sim_loss": 0.0703125 }, { "epoch": 0.12507415463713664, "step": 1265, "train/total_loss": 0.14734844863414764 }, { "entropy": 9.286764144897461, "epoch": 0.12517302748665216, "mean_token_accuracy": 0.7348901033401489, "num_tokens": 6592091.0, "step": 1266, "train/ce_loss": 0.9730169177055359 }, { "epoch": 0.12517302748665216, "step": 1266, "train/sim_loss": 0.109375 }, { "epoch": 0.12517302748665216, "step": 1266, "train/total_loss": 0.2066766917705536 }, { "entropy": 9.571826934814453, "epoch": 0.1252719003361677, "mean_token_accuracy": 0.7157142758369446, "num_tokens": 6597224.0, "step": 1267, "train/ce_loss": 1.1601194143295288 }, { "epoch": 0.1252719003361677, "step": 1267, "train/sim_loss": 0.109375 }, { "epoch": 0.1252719003361677, "step": 1267, "train/total_loss": 0.22538694739341736 }, { "entropy": 9.823686599731445, "epoch": 0.1253707731856832, "mean_token_accuracy": 0.7427184581756592, "num_tokens": 6602268.0, "step": 1268, "train/ce_loss": 1.0998046398162842 }, { "epoch": 0.1253707731856832, "step": 1268, "train/sim_loss": 0.06640625 }, { "epoch": 0.1253707731856832, "step": 1268, "train/total_loss": 0.17638671398162842 }, { "entropy": 9.747212409973145, "epoch": 0.12546964603519872, "mean_token_accuracy": 0.7157360315322876, "num_tokens": 6607335.0, "step": 1269, "train/ce_loss": 1.0469977855682373 }, { "epoch": 0.12546964603519872, "step": 1269, "train/sim_loss": 0.05078125 }, { "epoch": 0.12546964603519872, "step": 1269, "train/total_loss": 0.15548104047775269 }, { "entropy": 9.508520126342773, "epoch": 0.12556851888471426, "mean_token_accuracy": 0.6960651278495789, "num_tokens": 6612522.0, "step": 1270, "train/ce_loss": 0.7423213124275208 }, { "epoch": 0.12556851888471426, "step": 1270, "train/sim_loss": 0.0625 }, { "epoch": 0.12556851888471426, "step": 1270, "train/total_loss": 0.13673213124275208 }, { "entropy": 10.032516479492188, "epoch": 0.12566739173422978, "mean_token_accuracy": 0.7343358397483826, "num_tokens": 6617352.0, "step": 1271, "train/ce_loss": 1.6255320310592651 }, { "epoch": 0.12566739173422978, "step": 1271, "train/sim_loss": 0.11328125 }, { "epoch": 0.12566739173422978, "step": 1271, "train/total_loss": 0.27583444118499756 }, { "entropy": 9.865862846374512, "epoch": 0.1257662645837453, "mean_token_accuracy": 0.7620751261711121, "num_tokens": 6622340.0, "step": 1272, "train/ce_loss": 1.6030211448669434 }, { "epoch": 0.1257662645837453, "step": 1272, "train/sim_loss": 0.08203125 }, { "epoch": 0.1257662645837453, "step": 1272, "train/total_loss": 0.24233336746692657 }, { "entropy": 9.0894775390625, "epoch": 0.12586513743326083, "mean_token_accuracy": 0.7485907673835754, "num_tokens": 6627710.0, "step": 1273, "train/ce_loss": 1.1820942163467407 }, { "epoch": 0.12586513743326083, "step": 1273, "train/sim_loss": 0.08203125 }, { "epoch": 0.12586513743326083, "step": 1273, "train/total_loss": 0.20024067163467407 }, { "entropy": 10.18376350402832, "epoch": 0.12596401028277635, "mean_token_accuracy": 0.7234042286872864, "num_tokens": 6632482.0, "step": 1274, "train/ce_loss": 1.2288492918014526 }, { "epoch": 0.12596401028277635, "step": 1274, "train/sim_loss": 0.09375 }, { "epoch": 0.12596401028277635, "step": 1274, "train/total_loss": 0.21663492918014526 }, { "entropy": 10.220690727233887, "epoch": 0.1260628831322919, "mean_token_accuracy": 0.6997318863868713, "num_tokens": 6637254.0, "step": 1275, "train/ce_loss": 2.4932305812835693 }, { "epoch": 0.1260628831322919, "step": 1275, "train/sim_loss": 0.140625 }, { "epoch": 0.1260628831322919, "step": 1275, "train/total_loss": 0.3899480700492859 }, { "entropy": 9.369550704956055, "epoch": 0.1261617559818074, "mean_token_accuracy": 0.7096336483955383, "num_tokens": 6642467.0, "step": 1276, "train/ce_loss": 0.7277787327766418 }, { "epoch": 0.1261617559818074, "step": 1276, "train/sim_loss": 0.1015625 }, { "epoch": 0.1261617559818074, "step": 1276, "train/total_loss": 0.1743403673171997 }, { "entropy": 9.829463958740234, "epoch": 0.1262606288313229, "mean_token_accuracy": 0.7222222089767456, "num_tokens": 6647512.0, "step": 1277, "train/ce_loss": 1.1789212226867676 }, { "epoch": 0.1262606288313229, "step": 1277, "train/sim_loss": 0.10546875 }, { "epoch": 0.1262606288313229, "step": 1277, "train/total_loss": 0.22336086630821228 }, { "entropy": 8.898569107055664, "epoch": 0.12635950168083845, "mean_token_accuracy": 0.7696228623390198, "num_tokens": 6652940.0, "step": 1278, "train/ce_loss": 0.7270646095275879 }, { "epoch": 0.12635950168083845, "step": 1278, "train/sim_loss": 0.03125 }, { "epoch": 0.12635950168083845, "step": 1278, "train/total_loss": 0.10395646095275879 }, { "entropy": 9.275156021118164, "epoch": 0.12645837453035397, "mean_token_accuracy": 0.716167688369751, "num_tokens": 6658269.0, "step": 1279, "train/ce_loss": 0.9162885546684265 }, { "epoch": 0.12645837453035397, "step": 1279, "train/sim_loss": 0.09375 }, { "epoch": 0.12645837453035397, "step": 1279, "train/total_loss": 0.18537884950637817 }, { "epoch": 0.12655724737986948, "grad_norm": 1.0265246629714966, "learning_rate": 9.686248331108144e-06, "loss": 0.1631, "step": 1280 }, { "entropy": 9.02204418182373, "epoch": 0.12655724737986948, "mean_token_accuracy": 0.738095223903656, "num_tokens": 6663676.0, "step": 1280, "train/ce_loss": 0.5852922797203064 }, { "epoch": 0.12655724737986948, "step": 1280, "train/sim_loss": 0.0390625 }, { "epoch": 0.12655724737986948, "step": 1280, "train/total_loss": 0.09759172797203064 }, { "entropy": 9.600400924682617, "epoch": 0.12665612022938502, "mean_token_accuracy": 0.7263843417167664, "num_tokens": 6668723.0, "step": 1281, "train/ce_loss": 1.3899685144424438 }, { "epoch": 0.12665612022938502, "step": 1281, "train/sim_loss": 0.08203125 }, { "epoch": 0.12665612022938502, "step": 1281, "train/total_loss": 0.22102810442447662 }, { "entropy": 9.386069297790527, "epoch": 0.12675499307890054, "mean_token_accuracy": 0.7247706651687622, "num_tokens": 6674135.0, "step": 1282, "train/ce_loss": 1.2490756511688232 }, { "epoch": 0.12675499307890054, "step": 1282, "train/sim_loss": 0.09765625 }, { "epoch": 0.12675499307890054, "step": 1282, "train/total_loss": 0.22256381809711456 }, { "entropy": 9.768744468688965, "epoch": 0.12685386592841605, "mean_token_accuracy": 0.7996453642845154, "num_tokens": 6679155.0, "step": 1283, "train/ce_loss": 1.2421215615177061e-05 }, { "epoch": 0.12685386592841605, "step": 1283, "train/sim_loss": 0.03515625 }, { "epoch": 0.12685386592841605, "step": 1283, "train/total_loss": 0.03515749052166939 }, { "entropy": 9.283811569213867, "epoch": 0.1269527387779316, "mean_token_accuracy": 0.691428542137146, "num_tokens": 6684510.0, "step": 1284, "train/ce_loss": 0.9716193079948425 }, { "epoch": 0.1269527387779316, "step": 1284, "train/sim_loss": 0.09375 }, { "epoch": 0.1269527387779316, "step": 1284, "train/total_loss": 0.1909119337797165 }, { "entropy": 9.552628517150879, "epoch": 0.1270516116274471, "mean_token_accuracy": 0.6743044257164001, "num_tokens": 6689557.0, "step": 1285, "train/ce_loss": 1.5597680807113647 }, { "epoch": 0.1270516116274471, "step": 1285, "train/sim_loss": 0.125 }, { "epoch": 0.1270516116274471, "step": 1285, "train/total_loss": 0.2809768319129944 }, { "entropy": 8.94034194946289, "epoch": 0.12715048447696262, "mean_token_accuracy": 0.802480936050415, "num_tokens": 6695068.0, "step": 1286, "train/ce_loss": 0.4175623655319214 }, { "epoch": 0.12715048447696262, "step": 1286, "train/sim_loss": 0.125 }, { "epoch": 0.12715048447696262, "step": 1286, "train/total_loss": 0.16675624251365662 }, { "entropy": 9.418731689453125, "epoch": 0.12724935732647816, "mean_token_accuracy": 0.7613104581832886, "num_tokens": 6700156.0, "step": 1287, "train/ce_loss": 0.9082545638084412 }, { "epoch": 0.12724935732647816, "step": 1287, "train/sim_loss": 0.0703125 }, { "epoch": 0.12724935732647816, "step": 1287, "train/total_loss": 0.16113796830177307 }, { "entropy": 10.113789558410645, "epoch": 0.12734823017599367, "mean_token_accuracy": 0.7019704580307007, "num_tokens": 6704954.0, "step": 1288, "train/ce_loss": 1.5278859791578725e-05 }, { "epoch": 0.12734823017599367, "step": 1288, "train/sim_loss": 0.02734375 }, { "epoch": 0.12734823017599367, "step": 1288, "train/total_loss": 0.02734527736902237 }, { "entropy": 9.118091583251953, "epoch": 0.12744710302550918, "mean_token_accuracy": 0.7314715385437012, "num_tokens": 6710522.0, "step": 1289, "train/ce_loss": 0.701236367225647 }, { "epoch": 0.12744710302550918, "step": 1289, "train/sim_loss": 0.11328125 }, { "epoch": 0.12744710302550918, "step": 1289, "train/total_loss": 0.18340489268302917 }, { "entropy": 9.750611305236816, "epoch": 0.12754597587502473, "mean_token_accuracy": 0.7197802066802979, "num_tokens": 6715529.0, "step": 1290, "train/ce_loss": 9.375158697366714e-06 }, { "epoch": 0.12754597587502473, "step": 1290, "train/sim_loss": 0.0859375 }, { "epoch": 0.12754597587502473, "step": 1290, "train/total_loss": 0.08593843877315521 }, { "entropy": 9.286053657531738, "epoch": 0.12764484872454024, "mean_token_accuracy": 0.7582821846008301, "num_tokens": 6720773.0, "step": 1291, "train/ce_loss": 0.7109894752502441 }, { "epoch": 0.12764484872454024, "step": 1291, "train/sim_loss": 0.11328125 }, { "epoch": 0.12764484872454024, "step": 1291, "train/total_loss": 0.1843802034854889 }, { "entropy": 8.948986053466797, "epoch": 0.12774372157405575, "mean_token_accuracy": 0.6772777438163757, "num_tokens": 6726136.0, "step": 1292, "train/ce_loss": 0.9387810230255127 }, { "epoch": 0.12774372157405575, "step": 1292, "train/sim_loss": 0.0546875 }, { "epoch": 0.12774372157405575, "step": 1292, "train/total_loss": 0.1485656052827835 }, { "entropy": 9.39590835571289, "epoch": 0.1278425944235713, "mean_token_accuracy": 0.7406855225563049, "num_tokens": 6731196.0, "step": 1293, "train/ce_loss": 0.7263284921646118 }, { "epoch": 0.1278425944235713, "step": 1293, "train/sim_loss": 0.09375 }, { "epoch": 0.1278425944235713, "step": 1293, "train/total_loss": 0.16638284921646118 }, { "entropy": 9.307497024536133, "epoch": 0.1279414672730868, "mean_token_accuracy": 0.6719160079956055, "num_tokens": 6736402.0, "step": 1294, "train/ce_loss": 0.8595036268234253 }, { "epoch": 0.1279414672730868, "step": 1294, "train/sim_loss": 0.109375 }, { "epoch": 0.1279414672730868, "step": 1294, "train/total_loss": 0.19532537460327148 }, { "entropy": 9.940542221069336, "epoch": 0.12804034012260232, "mean_token_accuracy": 0.7020785212516785, "num_tokens": 6741231.0, "step": 1295, "train/ce_loss": 2.441803216934204 }, { "epoch": 0.12804034012260232, "step": 1295, "train/sim_loss": 0.0859375 }, { "epoch": 0.12804034012260232, "step": 1295, "train/total_loss": 0.3301178216934204 }, { "entropy": 10.023080825805664, "epoch": 0.12813921297211786, "mean_token_accuracy": 0.6821561455726624, "num_tokens": 6746207.0, "step": 1296, "train/ce_loss": 2.109379529953003 }, { "epoch": 0.12813921297211786, "step": 1296, "train/sim_loss": 0.109375 }, { "epoch": 0.12813921297211786, "step": 1296, "train/total_loss": 0.3203129768371582 }, { "entropy": 8.976844787597656, "epoch": 0.12823808582163337, "mean_token_accuracy": 0.7243243455886841, "num_tokens": 6751446.0, "step": 1297, "train/ce_loss": 1.3865740299224854 }, { "epoch": 0.12823808582163337, "step": 1297, "train/sim_loss": 0.09375 }, { "epoch": 0.12823808582163337, "step": 1297, "train/total_loss": 0.23240740597248077 }, { "entropy": 9.06201171875, "epoch": 0.12833695867114892, "mean_token_accuracy": 0.7755308151245117, "num_tokens": 6757105.0, "step": 1298, "train/ce_loss": 0.8118742108345032 }, { "epoch": 0.12833695867114892, "step": 1298, "train/sim_loss": 0.0390625 }, { "epoch": 0.12833695867114892, "step": 1298, "train/total_loss": 0.1202499195933342 }, { "entropy": 9.676658630371094, "epoch": 0.12843583152066443, "mean_token_accuracy": 0.7454545497894287, "num_tokens": 6762103.0, "step": 1299, "train/ce_loss": 0.00011109438491985202 }, { "epoch": 0.12843583152066443, "step": 1299, "train/sim_loss": 0.0703125 }, { "epoch": 0.12843583152066443, "step": 1299, "train/total_loss": 0.07032360881567001 }, { "epoch": 0.12853470437017994, "grad_norm": 0.9925926327705383, "learning_rate": 9.681303466350196e-06, "loss": 0.1728, "step": 1300 }, { "entropy": 9.072019577026367, "epoch": 0.12853470437017994, "mean_token_accuracy": 0.7306163311004639, "num_tokens": 6767619.0, "step": 1300, "train/ce_loss": 0.3801075518131256 }, { "epoch": 0.12853470437017994, "step": 1300, "train/sim_loss": 0.05859375 }, { "epoch": 0.12853470437017994, "step": 1300, "train/total_loss": 0.09660451114177704 }, { "entropy": 9.6489839553833, "epoch": 0.12863357721969548, "mean_token_accuracy": 0.7655068039894104, "num_tokens": 6772693.0, "step": 1301, "train/ce_loss": 0.8618592619895935 }, { "epoch": 0.12863357721969548, "step": 1301, "train/sim_loss": 0.02734375 }, { "epoch": 0.12863357721969548, "step": 1301, "train/total_loss": 0.11352967470884323 }, { "entropy": 9.157691955566406, "epoch": 0.128732450069211, "mean_token_accuracy": 0.7541766166687012, "num_tokens": 6778026.0, "step": 1302, "train/ce_loss": 0.543510913848877 }, { "epoch": 0.128732450069211, "step": 1302, "train/sim_loss": 0.03515625 }, { "epoch": 0.128732450069211, "step": 1302, "train/total_loss": 0.0895073413848877 }, { "entropy": 9.758062362670898, "epoch": 0.1288313229187265, "mean_token_accuracy": 0.7643097639083862, "num_tokens": 6783068.0, "step": 1303, "train/ce_loss": 0.7941330671310425 }, { "epoch": 0.1288313229187265, "step": 1303, "train/sim_loss": 0.078125 }, { "epoch": 0.1288313229187265, "step": 1303, "train/total_loss": 0.1575383096933365 }, { "entropy": 8.827596664428711, "epoch": 0.12893019576824205, "mean_token_accuracy": 0.7171814441680908, "num_tokens": 6788595.0, "step": 1304, "train/ce_loss": 1.030279278755188 }, { "epoch": 0.12893019576824205, "step": 1304, "train/sim_loss": 0.06640625 }, { "epoch": 0.12893019576824205, "step": 1304, "train/total_loss": 0.16943418979644775 }, { "entropy": 9.598592758178711, "epoch": 0.12902906861775756, "mean_token_accuracy": 0.7517730593681335, "num_tokens": 6793607.0, "step": 1305, "train/ce_loss": 0.9198727607727051 }, { "epoch": 0.12902906861775756, "step": 1305, "train/sim_loss": 0.0859375 }, { "epoch": 0.12902906861775756, "step": 1305, "train/total_loss": 0.17792478203773499 }, { "entropy": 9.47142505645752, "epoch": 0.12912794146727308, "mean_token_accuracy": 0.811188817024231, "num_tokens": 6798780.0, "step": 1306, "train/ce_loss": 1.043941119860392e-05 }, { "epoch": 0.12912794146727308, "step": 1306, "train/sim_loss": 0.09375 }, { "epoch": 0.12912794146727308, "step": 1306, "train/total_loss": 0.09375104308128357 }, { "entropy": 9.123851776123047, "epoch": 0.12922681431678862, "mean_token_accuracy": 0.7857961058616638, "num_tokens": 6804138.0, "step": 1307, "train/ce_loss": 0.45554080605506897 }, { "epoch": 0.12922681431678862, "step": 1307, "train/sim_loss": 0.03125 }, { "epoch": 0.12922681431678862, "step": 1307, "train/total_loss": 0.07680408656597137 }, { "entropy": 9.088683128356934, "epoch": 0.12932568716630413, "mean_token_accuracy": 0.7415599822998047, "num_tokens": 6809477.0, "step": 1308, "train/ce_loss": 0.7919617891311646 }, { "epoch": 0.12932568716630413, "step": 1308, "train/sim_loss": 0.0546875 }, { "epoch": 0.12932568716630413, "step": 1308, "train/total_loss": 0.13388368487358093 }, { "entropy": 9.484408378601074, "epoch": 0.12942456001581965, "mean_token_accuracy": 0.7262997031211853, "num_tokens": 6814748.0, "step": 1309, "train/ce_loss": 4.9226622650166973e-05 }, { "epoch": 0.12942456001581965, "step": 1309, "train/sim_loss": 0.0546875 }, { "epoch": 0.12942456001581965, "step": 1309, "train/total_loss": 0.05469242110848427 }, { "entropy": 9.560070037841797, "epoch": 0.1295234328653352, "mean_token_accuracy": 0.765531063079834, "num_tokens": 6819718.0, "step": 1310, "train/ce_loss": 0.3943033814430237 }, { "epoch": 0.1295234328653352, "step": 1310, "train/sim_loss": 0.09375 }, { "epoch": 0.1295234328653352, "step": 1310, "train/total_loss": 0.13318033516407013 }, { "entropy": 9.553999900817871, "epoch": 0.1296223057148507, "mean_token_accuracy": 0.6898638606071472, "num_tokens": 6824837.0, "step": 1311, "train/ce_loss": 1.1553877592086792 }, { "epoch": 0.1296223057148507, "step": 1311, "train/sim_loss": 0.10546875 }, { "epoch": 0.1296223057148507, "step": 1311, "train/total_loss": 0.22100752592086792 }, { "entropy": 9.081045150756836, "epoch": 0.1297211785643662, "mean_token_accuracy": 0.7494456768035889, "num_tokens": 6830181.0, "step": 1312, "train/ce_loss": 0.9298657774925232 }, { "epoch": 0.1297211785643662, "step": 1312, "train/sim_loss": 0.1015625 }, { "epoch": 0.1297211785643662, "step": 1312, "train/total_loss": 0.1945490837097168 }, { "entropy": 9.104818344116211, "epoch": 0.12982005141388175, "mean_token_accuracy": 0.7246752977371216, "num_tokens": 6835437.0, "step": 1313, "train/ce_loss": 0.5295743346214294 }, { "epoch": 0.12982005141388175, "step": 1313, "train/sim_loss": 0.03515625 }, { "epoch": 0.12982005141388175, "step": 1313, "train/total_loss": 0.0881136804819107 }, { "entropy": 9.376556396484375, "epoch": 0.12991892426339727, "mean_token_accuracy": 0.7823129296302795, "num_tokens": 6840675.0, "step": 1314, "train/ce_loss": 2.5385288608958945e-05 }, { "epoch": 0.12991892426339727, "step": 1314, "train/sim_loss": 0.0703125 }, { "epoch": 0.12991892426339727, "step": 1314, "train/total_loss": 0.07031504064798355 }, { "entropy": 9.420551300048828, "epoch": 0.13001779711291278, "mean_token_accuracy": 0.7459893226623535, "num_tokens": 6845847.0, "step": 1315, "train/ce_loss": 1.6422370672225952 }, { "epoch": 0.13001779711291278, "step": 1315, "train/sim_loss": 0.08984375 }, { "epoch": 0.13001779711291278, "step": 1315, "train/total_loss": 0.25406748056411743 }, { "entropy": 9.569178581237793, "epoch": 0.13011666996242832, "mean_token_accuracy": 0.7777777910232544, "num_tokens": 6850885.0, "step": 1316, "train/ce_loss": 0.7575596570968628 }, { "epoch": 0.13011666996242832, "step": 1316, "train/sim_loss": 0.04296875 }, { "epoch": 0.13011666996242832, "step": 1316, "train/total_loss": 0.11872471868991852 }, { "entropy": 9.686807632446289, "epoch": 0.13021554281194384, "mean_token_accuracy": 0.6978297233581543, "num_tokens": 6855927.0, "step": 1317, "train/ce_loss": 1.4601283073425293 }, { "epoch": 0.13021554281194384, "step": 1317, "train/sim_loss": 0.09375 }, { "epoch": 0.13021554281194384, "step": 1317, "train/total_loss": 0.2397628277540207 }, { "entropy": 9.00992488861084, "epoch": 0.13031441566145938, "mean_token_accuracy": 0.6796690225601196, "num_tokens": 6861209.0, "step": 1318, "train/ce_loss": 0.777147650718689 }, { "epoch": 0.13031441566145938, "step": 1318, "train/sim_loss": 0.05859375 }, { "epoch": 0.13031441566145938, "step": 1318, "train/total_loss": 0.13630852103233337 }, { "entropy": 9.652292251586914, "epoch": 0.1304132885109749, "mean_token_accuracy": 0.7416520118713379, "num_tokens": 6866227.0, "step": 1319, "train/ce_loss": 1.0801567441376392e-05 }, { "epoch": 0.1304132885109749, "step": 1319, "train/sim_loss": 0.07421875 }, { "epoch": 0.1304132885109749, "step": 1319, "train/total_loss": 0.07421983033418655 }, { "epoch": 0.1305121613604904, "grad_norm": 1.0008471012115479, "learning_rate": 9.676358601592247e-06, "loss": 0.1571, "step": 1320 }, { "entropy": 9.465921401977539, "epoch": 0.1305121613604904, "mean_token_accuracy": 0.7111111283302307, "num_tokens": 6871414.0, "step": 1320, "train/ce_loss": 0.6895711421966553 }, { "epoch": 0.1305121613604904, "step": 1320, "train/sim_loss": 0.07421875 }, { "epoch": 0.1305121613604904, "step": 1320, "train/total_loss": 0.14317587018013 }, { "entropy": 10.085680961608887, "epoch": 0.13061103421000594, "mean_token_accuracy": 0.8009478449821472, "num_tokens": 6876185.0, "step": 1321, "train/ce_loss": 1.1681467294692993 }, { "epoch": 0.13061103421000594, "step": 1321, "train/sim_loss": 0.0625 }, { "epoch": 0.13061103421000594, "step": 1321, "train/total_loss": 0.17931467294692993 }, { "entropy": 9.769105911254883, "epoch": 0.13070990705952146, "mean_token_accuracy": 0.8288770318031311, "num_tokens": 6881137.0, "step": 1322, "train/ce_loss": 8.012760190467816e-06 }, { "epoch": 0.13070990705952146, "step": 1322, "train/sim_loss": 0.06640625 }, { "epoch": 0.13070990705952146, "step": 1322, "train/total_loss": 0.06640705466270447 }, { "entropy": 9.293441772460938, "epoch": 0.13080877990903697, "mean_token_accuracy": 0.7526316046714783, "num_tokens": 6886334.0, "step": 1323, "train/ce_loss": 1.331781029701233 }, { "epoch": 0.13080877990903697, "step": 1323, "train/sim_loss": 0.140625 }, { "epoch": 0.13080877990903697, "step": 1323, "train/total_loss": 0.27380311489105225 }, { "entropy": 9.82375717163086, "epoch": 0.1309076527585525, "mean_token_accuracy": 0.7560521364212036, "num_tokens": 6891483.0, "step": 1324, "train/ce_loss": 1.423461675643921 }, { "epoch": 0.1309076527585525, "step": 1324, "train/sim_loss": 0.1171875 }, { "epoch": 0.1309076527585525, "step": 1324, "train/total_loss": 0.25953367352485657 }, { "entropy": 9.339868545532227, "epoch": 0.13100652560806803, "mean_token_accuracy": 0.6946022510528564, "num_tokens": 6896649.0, "step": 1325, "train/ce_loss": 1.1555442810058594 }, { "epoch": 0.13100652560806803, "step": 1325, "train/sim_loss": 0.0625 }, { "epoch": 0.13100652560806803, "step": 1325, "train/total_loss": 0.17805442214012146 }, { "entropy": 8.74455738067627, "epoch": 0.13110539845758354, "mean_token_accuracy": 0.7673649191856384, "num_tokens": 6902037.0, "step": 1326, "train/ce_loss": 0.5528606176376343 }, { "epoch": 0.13110539845758354, "step": 1326, "train/sim_loss": 0.0625 }, { "epoch": 0.13110539845758354, "step": 1326, "train/total_loss": 0.11778606474399567 }, { "entropy": 9.457708358764648, "epoch": 0.13120427130709908, "mean_token_accuracy": 0.6380281448364258, "num_tokens": 6907205.0, "step": 1327, "train/ce_loss": 8.960471859609243e-06 }, { "epoch": 0.13120427130709908, "step": 1327, "train/sim_loss": 0.0234375 }, { "epoch": 0.13120427130709908, "step": 1327, "train/total_loss": 0.02343839593231678 }, { "entropy": 9.52875804901123, "epoch": 0.1313031441566146, "mean_token_accuracy": 0.639769434928894, "num_tokens": 6912340.0, "step": 1328, "train/ce_loss": 5.955170308880042e-06 }, { "epoch": 0.1313031441566146, "step": 1328, "train/sim_loss": 0.0625 }, { "epoch": 0.1313031441566146, "step": 1328, "train/total_loss": 0.06250059604644775 }, { "entropy": 9.210979461669922, "epoch": 0.1314020170061301, "mean_token_accuracy": 0.7533742189407349, "num_tokens": 6917620.0, "step": 1329, "train/ce_loss": 0.816834568977356 }, { "epoch": 0.1314020170061301, "step": 1329, "train/sim_loss": 0.0625 }, { "epoch": 0.1314020170061301, "step": 1329, "train/total_loss": 0.1441834568977356 }, { "entropy": 9.25547981262207, "epoch": 0.13150088985564565, "mean_token_accuracy": 0.7266921997070312, "num_tokens": 6922884.0, "step": 1330, "train/ce_loss": 0.3945234417915344 }, { "epoch": 0.13150088985564565, "step": 1330, "train/sim_loss": 0.06640625 }, { "epoch": 0.13150088985564565, "step": 1330, "train/total_loss": 0.10585859417915344 }, { "entropy": 9.18018913269043, "epoch": 0.13159976270516116, "mean_token_accuracy": 0.7842857241630554, "num_tokens": 6928075.0, "step": 1331, "train/ce_loss": 0.8674007058143616 }, { "epoch": 0.13159976270516116, "step": 1331, "train/sim_loss": 0.1171875 }, { "epoch": 0.13159976270516116, "step": 1331, "train/total_loss": 0.20392757654190063 }, { "entropy": 8.887187004089355, "epoch": 0.13169863555467667, "mean_token_accuracy": 0.6983373165130615, "num_tokens": 6933415.0, "step": 1332, "train/ce_loss": 0.5412377715110779 }, { "epoch": 0.13169863555467667, "step": 1332, "train/sim_loss": 0.1171875 }, { "epoch": 0.13169863555467667, "step": 1332, "train/total_loss": 0.17131127417087555 }, { "entropy": 8.93563461303711, "epoch": 0.13179750840419222, "mean_token_accuracy": 0.7395301461219788, "num_tokens": 6938869.0, "step": 1333, "train/ce_loss": 1.14861261844635 }, { "epoch": 0.13179750840419222, "step": 1333, "train/sim_loss": 0.125 }, { "epoch": 0.13179750840419222, "step": 1333, "train/total_loss": 0.23986126482486725 }, { "entropy": 8.794504165649414, "epoch": 0.13189638125370773, "mean_token_accuracy": 0.7954971790313721, "num_tokens": 6944435.0, "step": 1334, "train/ce_loss": 0.5525534749031067 }, { "epoch": 0.13189638125370773, "step": 1334, "train/sim_loss": 0.0390625 }, { "epoch": 0.13189638125370773, "step": 1334, "train/total_loss": 0.09431785345077515 }, { "entropy": 9.051572799682617, "epoch": 0.13199525410322324, "mean_token_accuracy": 0.6588888764381409, "num_tokens": 6949781.0, "step": 1335, "train/ce_loss": 0.9546046257019043 }, { "epoch": 0.13199525410322324, "step": 1335, "train/sim_loss": 0.09765625 }, { "epoch": 0.13199525410322324, "step": 1335, "train/total_loss": 0.19311672449111938 }, { "entropy": 9.176790237426758, "epoch": 0.13209412695273878, "mean_token_accuracy": 0.76953125, "num_tokens": 6955007.0, "step": 1336, "train/ce_loss": 0.7356476783752441 }, { "epoch": 0.13209412695273878, "step": 1336, "train/sim_loss": 0.1328125 }, { "epoch": 0.13209412695273878, "step": 1336, "train/total_loss": 0.20637726783752441 }, { "entropy": 9.438520431518555, "epoch": 0.1321929998022543, "mean_token_accuracy": 0.7435897588729858, "num_tokens": 6960066.0, "step": 1337, "train/ce_loss": 0.9659104943275452 }, { "epoch": 0.1321929998022543, "step": 1337, "train/sim_loss": 0.1171875 }, { "epoch": 0.1321929998022543, "step": 1337, "train/total_loss": 0.213778555393219 }, { "entropy": 9.292132377624512, "epoch": 0.13229187265176984, "mean_token_accuracy": 0.7315270900726318, "num_tokens": 6965292.0, "step": 1338, "train/ce_loss": 0.6084064245223999 }, { "epoch": 0.13229187265176984, "step": 1338, "train/sim_loss": 0.09765625 }, { "epoch": 0.13229187265176984, "step": 1338, "train/total_loss": 0.1584968864917755 }, { "entropy": 9.065678596496582, "epoch": 0.13239074550128535, "mean_token_accuracy": 0.7868303656578064, "num_tokens": 6970713.0, "step": 1339, "train/ce_loss": 0.6088027954101562 }, { "epoch": 0.13239074550128535, "step": 1339, "train/sim_loss": 0.0625 }, { "epoch": 0.13239074550128535, "step": 1339, "train/total_loss": 0.12338028103113174 }, { "epoch": 0.13248961835080086, "grad_norm": 0.9765375852584839, "learning_rate": 9.671413736834299e-06, "loss": 0.1662, "step": 1340 }, { "entropy": 9.935449600219727, "epoch": 0.13248961835080086, "mean_token_accuracy": 0.697265625, "num_tokens": 6975626.0, "step": 1340, "train/ce_loss": 2.081921100616455 }, { "epoch": 0.13248961835080086, "step": 1340, "train/sim_loss": 0.046875 }, { "epoch": 0.13248961835080086, "step": 1340, "train/total_loss": 0.2550671100616455 }, { "entropy": 10.374621391296387, "epoch": 0.1325884912003164, "mean_token_accuracy": 0.6761133670806885, "num_tokens": 6980238.0, "step": 1341, "train/ce_loss": 4.566015243530273 }, { "epoch": 0.1325884912003164, "step": 1341, "train/sim_loss": 0.09375 }, { "epoch": 0.1325884912003164, "step": 1341, "train/total_loss": 0.5503515005111694 }, { "entropy": 9.028810501098633, "epoch": 0.13268736404983192, "mean_token_accuracy": 0.70659339427948, "num_tokens": 6985616.0, "step": 1342, "train/ce_loss": 1.0756028890609741 }, { "epoch": 0.13268736404983192, "step": 1342, "train/sim_loss": 0.078125 }, { "epoch": 0.13268736404983192, "step": 1342, "train/total_loss": 0.18568529188632965 }, { "entropy": 9.860417366027832, "epoch": 0.13278623689934743, "mean_token_accuracy": 0.7653631567955017, "num_tokens": 6990572.0, "step": 1343, "train/ce_loss": 0.8157594203948975 }, { "epoch": 0.13278623689934743, "step": 1343, "train/sim_loss": 0.14453125 }, { "epoch": 0.13278623689934743, "step": 1343, "train/total_loss": 0.22610719501972198 }, { "entropy": 9.547462463378906, "epoch": 0.13288510974886297, "mean_token_accuracy": 0.7703081369400024, "num_tokens": 6995727.0, "step": 1344, "train/ce_loss": 8.18125863588648e-06 }, { "epoch": 0.13288510974886297, "step": 1344, "train/sim_loss": 0.0625 }, { "epoch": 0.13288510974886297, "step": 1344, "train/total_loss": 0.06250081956386566 }, { "entropy": 9.098556518554688, "epoch": 0.1329839825983785, "mean_token_accuracy": 0.7614781856536865, "num_tokens": 7001136.0, "step": 1345, "train/ce_loss": 0.6963076591491699 }, { "epoch": 0.1329839825983785, "step": 1345, "train/sim_loss": 0.02734375 }, { "epoch": 0.1329839825983785, "step": 1345, "train/total_loss": 0.09697451442480087 }, { "entropy": 9.58292007446289, "epoch": 0.133082855447894, "mean_token_accuracy": 0.7492354512214661, "num_tokens": 7006230.0, "step": 1346, "train/ce_loss": 1.217063546180725 }, { "epoch": 0.133082855447894, "step": 1346, "train/sim_loss": 0.10546875 }, { "epoch": 0.133082855447894, "step": 1346, "train/total_loss": 0.22717511653900146 }, { "entropy": 9.555110931396484, "epoch": 0.13318172829740954, "mean_token_accuracy": 0.714067280292511, "num_tokens": 7011333.0, "step": 1347, "train/ce_loss": 0.780555009841919 }, { "epoch": 0.13318172829740954, "step": 1347, "train/sim_loss": 0.0703125 }, { "epoch": 0.13318172829740954, "step": 1347, "train/total_loss": 0.1483680009841919 }, { "entropy": 9.313787460327148, "epoch": 0.13328060114692505, "mean_token_accuracy": 0.7480417490005493, "num_tokens": 7016588.0, "step": 1348, "train/ce_loss": 0.8471536040306091 }, { "epoch": 0.13328060114692505, "step": 1348, "train/sim_loss": 0.0859375 }, { "epoch": 0.13328060114692505, "step": 1348, "train/total_loss": 0.1706528663635254 }, { "entropy": 9.890573501586914, "epoch": 0.13337947399644057, "mean_token_accuracy": 0.7034631967544556, "num_tokens": 7021478.0, "step": 1349, "train/ce_loss": 1.1914722919464111 }, { "epoch": 0.13337947399644057, "step": 1349, "train/sim_loss": 0.0546875 }, { "epoch": 0.13337947399644057, "step": 1349, "train/total_loss": 0.17383474111557007 }, { "entropy": 9.21375846862793, "epoch": 0.1334783468459561, "mean_token_accuracy": 0.7373737096786499, "num_tokens": 7026726.0, "step": 1350, "train/ce_loss": 1.0719325542449951 }, { "epoch": 0.1334783468459561, "step": 1350, "train/sim_loss": 0.12109375 }, { "epoch": 0.1334783468459561, "step": 1350, "train/total_loss": 0.228287011384964 }, { "entropy": 9.115421295166016, "epoch": 0.13357721969547162, "mean_token_accuracy": 0.774193525314331, "num_tokens": 7031974.0, "step": 1351, "train/ce_loss": 0.8924320340156555 }, { "epoch": 0.13357721969547162, "step": 1351, "train/sim_loss": 0.09375 }, { "epoch": 0.13357721969547162, "step": 1351, "train/total_loss": 0.18299320340156555 }, { "entropy": 9.520133018493652, "epoch": 0.13367609254498714, "mean_token_accuracy": 0.7358490824699402, "num_tokens": 7037118.0, "step": 1352, "train/ce_loss": 2.67130380962044e-05 }, { "epoch": 0.13367609254498714, "step": 1352, "train/sim_loss": 0.09375 }, { "epoch": 0.13367609254498714, "step": 1352, "train/total_loss": 0.0937526747584343 }, { "entropy": 9.93098258972168, "epoch": 0.13377496539450268, "mean_token_accuracy": 0.7065637111663818, "num_tokens": 7042062.0, "step": 1353, "train/ce_loss": 1.890914427349344e-05 }, { "epoch": 0.13377496539450268, "step": 1353, "train/sim_loss": 0.12890625 }, { "epoch": 0.13377496539450268, "step": 1353, "train/total_loss": 0.12890814244747162 }, { "entropy": 9.51821517944336, "epoch": 0.1338738382440182, "mean_token_accuracy": 0.6343558430671692, "num_tokens": 7047333.0, "step": 1354, "train/ce_loss": 2.017146348953247 }, { "epoch": 0.1338738382440182, "step": 1354, "train/sim_loss": 0.05078125 }, { "epoch": 0.1338738382440182, "step": 1354, "train/total_loss": 0.2524958848953247 }, { "entropy": 9.332813262939453, "epoch": 0.1339727110935337, "mean_token_accuracy": 0.7398273944854736, "num_tokens": 7052611.0, "step": 1355, "train/ce_loss": 0.5288713574409485 }, { "epoch": 0.1339727110935337, "step": 1355, "train/sim_loss": 0.12109375 }, { "epoch": 0.1339727110935337, "step": 1355, "train/total_loss": 0.17398089170455933 }, { "entropy": 8.66998291015625, "epoch": 0.13407158394304924, "mean_token_accuracy": 0.7036363482475281, "num_tokens": 7058353.0, "step": 1356, "train/ce_loss": 1.4327878952026367 }, { "epoch": 0.13407158394304924, "step": 1356, "train/sim_loss": 0.1328125 }, { "epoch": 0.13407158394304924, "step": 1356, "train/total_loss": 0.2760912775993347 }, { "entropy": 9.798766136169434, "epoch": 0.13417045679256476, "mean_token_accuracy": 0.7202796936035156, "num_tokens": 7063369.0, "step": 1357, "train/ce_loss": 1.655919913901016e-05 }, { "epoch": 0.13417045679256476, "step": 1357, "train/sim_loss": 0.07421875 }, { "epoch": 0.13417045679256476, "step": 1357, "train/total_loss": 0.07422040402889252 }, { "entropy": 9.45732307434082, "epoch": 0.1342693296420803, "mean_token_accuracy": 0.6950617432594299, "num_tokens": 7068607.0, "step": 1358, "train/ce_loss": 0.6530421376228333 }, { "epoch": 0.1342693296420803, "step": 1358, "train/sim_loss": 0.1015625 }, { "epoch": 0.1342693296420803, "step": 1358, "train/total_loss": 0.1668667197227478 }, { "entropy": 9.124235153198242, "epoch": 0.1343682024915958, "mean_token_accuracy": 0.6947236061096191, "num_tokens": 7073816.0, "step": 1359, "train/ce_loss": 0.9237757325172424 }, { "epoch": 0.1343682024915958, "step": 1359, "train/sim_loss": 0.08203125 }, { "epoch": 0.1343682024915958, "step": 1359, "train/total_loss": 0.17440882325172424 }, { "epoch": 0.13446707534111133, "grad_norm": 1.1598478555679321, "learning_rate": 9.66646887207635e-06, "loss": 0.1733, "step": 1360 }, { "entropy": 9.222099304199219, "epoch": 0.13446707534111133, "mean_token_accuracy": 0.7375144958496094, "num_tokens": 7079093.0, "step": 1360, "train/ce_loss": 0.4310187101364136 }, { "epoch": 0.13446707534111133, "step": 1360, "train/sim_loss": 0.0625 }, { "epoch": 0.13446707534111133, "step": 1360, "train/total_loss": 0.10560187697410583 }, { "entropy": 9.048439979553223, "epoch": 0.13456594819062687, "mean_token_accuracy": 0.7167043089866638, "num_tokens": 7084466.0, "step": 1361, "train/ce_loss": 1.3650734424591064 }, { "epoch": 0.13456594819062687, "step": 1361, "train/sim_loss": 0.06640625 }, { "epoch": 0.13456594819062687, "step": 1361, "train/total_loss": 0.20291359722614288 }, { "entropy": 8.655082702636719, "epoch": 0.13466482104014238, "mean_token_accuracy": 0.7400379776954651, "num_tokens": 7089959.0, "step": 1362, "train/ce_loss": 1.21505868434906 }, { "epoch": 0.13466482104014238, "step": 1362, "train/sim_loss": 0.1015625 }, { "epoch": 0.13466482104014238, "step": 1362, "train/total_loss": 0.22306837141513824 }, { "entropy": 10.349322319030762, "epoch": 0.1347636938896579, "mean_token_accuracy": 0.7899686694145203, "num_tokens": 7094666.0, "step": 1363, "train/ce_loss": 2.0730125470436178e-05 }, { "epoch": 0.1347636938896579, "step": 1363, "train/sim_loss": 0.03125 }, { "epoch": 0.1347636938896579, "step": 1363, "train/total_loss": 0.031252071261405945 }, { "entropy": 9.29534912109375, "epoch": 0.13486256673917343, "mean_token_accuracy": 0.7682198286056519, "num_tokens": 7099994.0, "step": 1364, "train/ce_loss": 0.5866829752922058 }, { "epoch": 0.13486256673917343, "step": 1364, "train/sim_loss": 0.078125 }, { "epoch": 0.13486256673917343, "step": 1364, "train/total_loss": 0.13679330050945282 }, { "entropy": 9.404373168945312, "epoch": 0.13496143958868895, "mean_token_accuracy": 0.69986891746521, "num_tokens": 7105236.0, "step": 1365, "train/ce_loss": 0.7443293333053589 }, { "epoch": 0.13496143958868895, "step": 1365, "train/sim_loss": 0.125 }, { "epoch": 0.13496143958868895, "step": 1365, "train/total_loss": 0.19943293929100037 }, { "entropy": 8.976805686950684, "epoch": 0.13506031243820446, "mean_token_accuracy": 0.7087682485580444, "num_tokens": 7110755.0, "step": 1366, "train/ce_loss": 1.2494643926620483 }, { "epoch": 0.13506031243820446, "step": 1366, "train/sim_loss": 0.15625 }, { "epoch": 0.13506031243820446, "step": 1366, "train/total_loss": 0.2811964452266693 }, { "entropy": 10.0078125, "epoch": 0.13515918528772, "mean_token_accuracy": 0.7490636706352234, "num_tokens": 7115692.0, "step": 1367, "train/ce_loss": 1.1242824257351458e-05 }, { "epoch": 0.13515918528772, "step": 1367, "train/sim_loss": 0.02734375 }, { "epoch": 0.13515918528772, "step": 1367, "train/total_loss": 0.027344875037670135 }, { "entropy": 9.153022766113281, "epoch": 0.13525805813723552, "mean_token_accuracy": 0.7320799231529236, "num_tokens": 7121020.0, "step": 1368, "train/ce_loss": 0.8959968686103821 }, { "epoch": 0.13525805813723552, "step": 1368, "train/sim_loss": 0.08984375 }, { "epoch": 0.13525805813723552, "step": 1368, "train/total_loss": 0.17944344878196716 }, { "entropy": 9.92242431640625, "epoch": 0.13535693098675103, "mean_token_accuracy": 0.8330308794975281, "num_tokens": 7126028.0, "step": 1369, "train/ce_loss": 0.9085344672203064 }, { "epoch": 0.13535693098675103, "step": 1369, "train/sim_loss": 0.0859375 }, { "epoch": 0.13535693098675103, "step": 1369, "train/total_loss": 0.17679095268249512 }, { "entropy": 9.732063293457031, "epoch": 0.13545580383626657, "mean_token_accuracy": 0.7522388100624084, "num_tokens": 7131328.0, "step": 1370, "train/ce_loss": 9.229583156411536e-06 }, { "epoch": 0.13545580383626657, "step": 1370, "train/sim_loss": 0.03125 }, { "epoch": 0.13545580383626657, "step": 1370, "train/total_loss": 0.03125092387199402 }, { "entropy": 9.762304306030273, "epoch": 0.13555467668578208, "mean_token_accuracy": 0.6817447543144226, "num_tokens": 7136433.0, "step": 1371, "train/ce_loss": 1.118974208831787 }, { "epoch": 0.13555467668578208, "step": 1371, "train/sim_loss": 0.11328125 }, { "epoch": 0.13555467668578208, "step": 1371, "train/total_loss": 0.22517867386341095 }, { "entropy": 9.191245079040527, "epoch": 0.1356535495352976, "mean_token_accuracy": 0.7228608131408691, "num_tokens": 7141585.0, "step": 1372, "train/ce_loss": 0.38976824283599854 }, { "epoch": 0.1356535495352976, "step": 1372, "train/sim_loss": 0.09765625 }, { "epoch": 0.1356535495352976, "step": 1372, "train/total_loss": 0.13663306832313538 }, { "entropy": 9.613529205322266, "epoch": 0.13575242238481314, "mean_token_accuracy": 0.7711213231086731, "num_tokens": 7146681.0, "step": 1373, "train/ce_loss": 1.1483960151672363 }, { "epoch": 0.13575242238481314, "step": 1373, "train/sim_loss": 0.06640625 }, { "epoch": 0.13575242238481314, "step": 1373, "train/total_loss": 0.1812458634376526 }, { "entropy": 9.747818946838379, "epoch": 0.13585129523432865, "mean_token_accuracy": 0.7123287916183472, "num_tokens": 7151721.0, "step": 1374, "train/ce_loss": 1.276660680770874 }, { "epoch": 0.13585129523432865, "step": 1374, "train/sim_loss": 0.078125 }, { "epoch": 0.13585129523432865, "step": 1374, "train/total_loss": 0.20579107105731964 }, { "entropy": 9.456674575805664, "epoch": 0.13595016808384416, "mean_token_accuracy": 0.7160193920135498, "num_tokens": 7156995.0, "step": 1375, "train/ce_loss": 1.8477184772491455 }, { "epoch": 0.13595016808384416, "step": 1375, "train/sim_loss": 0.15625 }, { "epoch": 0.13595016808384416, "step": 1375, "train/total_loss": 0.3410218358039856 }, { "entropy": 9.060297012329102, "epoch": 0.1360490409333597, "mean_token_accuracy": 0.7053254246711731, "num_tokens": 7162351.0, "step": 1376, "train/ce_loss": 1.2942602634429932 }, { "epoch": 0.1360490409333597, "step": 1376, "train/sim_loss": 0.0859375 }, { "epoch": 0.1360490409333597, "step": 1376, "train/total_loss": 0.2153635323047638 }, { "entropy": 9.12033462524414, "epoch": 0.13614791378287522, "mean_token_accuracy": 0.6435294151306152, "num_tokens": 7167679.0, "step": 1377, "train/ce_loss": 0.8891092538833618 }, { "epoch": 0.13614791378287522, "step": 1377, "train/sim_loss": 0.09375 }, { "epoch": 0.13614791378287522, "step": 1377, "train/total_loss": 0.18266093730926514 }, { "entropy": 9.498316764831543, "epoch": 0.13624678663239073, "mean_token_accuracy": 0.739062488079071, "num_tokens": 7172731.0, "step": 1378, "train/ce_loss": 1.083294137060875e-05 }, { "epoch": 0.13624678663239073, "step": 1378, "train/sim_loss": 0.02734375 }, { "epoch": 0.13624678663239073, "step": 1378, "train/total_loss": 0.027344834059476852 }, { "entropy": 9.433088302612305, "epoch": 0.13634565948190627, "mean_token_accuracy": 0.6906779408454895, "num_tokens": 7177888.0, "step": 1379, "train/ce_loss": 1.207837462425232 }, { "epoch": 0.13634565948190627, "step": 1379, "train/sim_loss": 0.06640625 }, { "epoch": 0.13634565948190627, "step": 1379, "train/total_loss": 0.1871899962425232 }, { "epoch": 0.1364445323314218, "grad_norm": 1.1392931938171387, "learning_rate": 9.6615240073184e-06, "loss": 0.1708, "step": 1380 }, { "entropy": 9.162598609924316, "epoch": 0.1364445323314218, "mean_token_accuracy": 0.7412031888961792, "num_tokens": 7183249.0, "step": 1380, "train/ce_loss": 0.7054414749145508 }, { "epoch": 0.1364445323314218, "step": 1380, "train/sim_loss": 0.0390625 }, { "epoch": 0.1364445323314218, "step": 1380, "train/total_loss": 0.10960664600133896 }, { "entropy": 9.58977222442627, "epoch": 0.13654340518093733, "mean_token_accuracy": 0.7033112645149231, "num_tokens": 7188484.0, "step": 1381, "train/ce_loss": 1.0873607397079468 }, { "epoch": 0.13654340518093733, "step": 1381, "train/sim_loss": 0.0390625 }, { "epoch": 0.13654340518093733, "step": 1381, "train/total_loss": 0.1477985680103302 }, { "entropy": 9.628973007202148, "epoch": 0.13664227803045284, "mean_token_accuracy": 0.7557142972946167, "num_tokens": 7193633.0, "step": 1382, "train/ce_loss": 0.7194046378135681 }, { "epoch": 0.13664227803045284, "step": 1382, "train/sim_loss": 0.11328125 }, { "epoch": 0.13664227803045284, "step": 1382, "train/total_loss": 0.18522171676158905 }, { "entropy": 9.48279094696045, "epoch": 0.13674115087996835, "mean_token_accuracy": 0.6994134783744812, "num_tokens": 7198964.0, "step": 1383, "train/ce_loss": 0.981130838394165 }, { "epoch": 0.13674115087996835, "step": 1383, "train/sim_loss": 0.0859375 }, { "epoch": 0.13674115087996835, "step": 1383, "train/total_loss": 0.18405058979988098 }, { "entropy": 9.090730667114258, "epoch": 0.1368400237294839, "mean_token_accuracy": 0.7595744729042053, "num_tokens": 7204415.0, "step": 1384, "train/ce_loss": 1.2270708084106445 }, { "epoch": 0.1368400237294839, "step": 1384, "train/sim_loss": 0.0859375 }, { "epoch": 0.1368400237294839, "step": 1384, "train/total_loss": 0.2086445838212967 }, { "entropy": 9.309127807617188, "epoch": 0.1369388965789994, "mean_token_accuracy": 0.7635053992271423, "num_tokens": 7209716.0, "step": 1385, "train/ce_loss": 0.714293360710144 }, { "epoch": 0.1369388965789994, "step": 1385, "train/sim_loss": 0.0546875 }, { "epoch": 0.1369388965789994, "step": 1385, "train/total_loss": 0.12611684203147888 }, { "entropy": 9.814483642578125, "epoch": 0.13703776942851492, "mean_token_accuracy": 0.7638190984725952, "num_tokens": 7214776.0, "step": 1386, "train/ce_loss": 2.559537097113207e-05 }, { "epoch": 0.13703776942851492, "step": 1386, "train/sim_loss": 0.078125 }, { "epoch": 0.13703776942851492, "step": 1386, "train/total_loss": 0.07812756299972534 }, { "entropy": 10.152091979980469, "epoch": 0.13713664227803046, "mean_token_accuracy": 0.7139587998390198, "num_tokens": 7219571.0, "step": 1387, "train/ce_loss": 1.8109357357025146 }, { "epoch": 0.13713664227803046, "step": 1387, "train/sim_loss": 0.09765625 }, { "epoch": 0.13713664227803046, "step": 1387, "train/total_loss": 0.27874982357025146 }, { "entropy": 8.986875534057617, "epoch": 0.13723551512754598, "mean_token_accuracy": 0.7254273295402527, "num_tokens": 7224988.0, "step": 1388, "train/ce_loss": 1.1162502765655518 }, { "epoch": 0.13723551512754598, "step": 1388, "train/sim_loss": 0.1953125 }, { "epoch": 0.13723551512754598, "step": 1388, "train/total_loss": 0.3069375157356262 }, { "entropy": 9.793027877807617, "epoch": 0.1373343879770615, "mean_token_accuracy": 0.7576736807823181, "num_tokens": 7230030.0, "step": 1389, "train/ce_loss": 1.2323343753814697 }, { "epoch": 0.1373343879770615, "step": 1389, "train/sim_loss": 0.06640625 }, { "epoch": 0.1373343879770615, "step": 1389, "train/total_loss": 0.18963968753814697 }, { "entropy": 9.297113418579102, "epoch": 0.13743326082657703, "mean_token_accuracy": 0.6991368532180786, "num_tokens": 7235295.0, "step": 1390, "train/ce_loss": 1.4138586521148682 }, { "epoch": 0.13743326082657703, "step": 1390, "train/sim_loss": 0.1171875 }, { "epoch": 0.13743326082657703, "step": 1390, "train/total_loss": 0.25857335329055786 }, { "entropy": 9.281339645385742, "epoch": 0.13753213367609254, "mean_token_accuracy": 0.6515151262283325, "num_tokens": 7240653.0, "step": 1391, "train/ce_loss": 1.2768079042434692 }, { "epoch": 0.13753213367609254, "step": 1391, "train/sim_loss": 0.13671875 }, { "epoch": 0.13753213367609254, "step": 1391, "train/total_loss": 0.26439952850341797 }, { "entropy": 9.057063102722168, "epoch": 0.13763100652560806, "mean_token_accuracy": 0.755646824836731, "num_tokens": 7246146.0, "step": 1392, "train/ce_loss": 0.5638306140899658 }, { "epoch": 0.13763100652560806, "step": 1392, "train/sim_loss": 0.08984375 }, { "epoch": 0.13763100652560806, "step": 1392, "train/total_loss": 0.14622680842876434 }, { "entropy": 10.018474578857422, "epoch": 0.1377298793751236, "mean_token_accuracy": 0.6976743936538696, "num_tokens": 7251021.0, "step": 1393, "train/ce_loss": 2.264758348464966 }, { "epoch": 0.1377298793751236, "step": 1393, "train/sim_loss": 0.1171875 }, { "epoch": 0.1377298793751236, "step": 1393, "train/total_loss": 0.3436633348464966 }, { "entropy": 8.718015670776367, "epoch": 0.1378287522246391, "mean_token_accuracy": 0.6834763884544373, "num_tokens": 7256420.0, "step": 1394, "train/ce_loss": 1.1374813318252563 }, { "epoch": 0.1378287522246391, "step": 1394, "train/sim_loss": 0.08203125 }, { "epoch": 0.1378287522246391, "step": 1394, "train/total_loss": 0.19577938318252563 }, { "entropy": 10.238953590393066, "epoch": 0.13792762507415463, "mean_token_accuracy": 0.7974026203155518, "num_tokens": 7261187.0, "step": 1395, "train/ce_loss": 3.1131625291891396e-05 }, { "epoch": 0.13792762507415463, "step": 1395, "train/sim_loss": 0.0390625 }, { "epoch": 0.13792762507415463, "step": 1395, "train/total_loss": 0.039065614342689514 }, { "entropy": 9.5997896194458, "epoch": 0.13802649792367017, "mean_token_accuracy": 0.7196581363677979, "num_tokens": 7266220.0, "step": 1396, "train/ce_loss": 1.3521981239318848 }, { "epoch": 0.13802649792367017, "step": 1396, "train/sim_loss": 0.09375 }, { "epoch": 0.13802649792367017, "step": 1396, "train/total_loss": 0.22896981239318848 }, { "entropy": 9.595669746398926, "epoch": 0.13812537077318568, "mean_token_accuracy": 0.6781250238418579, "num_tokens": 7271470.0, "step": 1397, "train/ce_loss": 0.8560667037963867 }, { "epoch": 0.13812537077318568, "step": 1397, "train/sim_loss": 0.11328125 }, { "epoch": 0.13812537077318568, "step": 1397, "train/total_loss": 0.1988879144191742 }, { "entropy": 8.980318069458008, "epoch": 0.1382242436227012, "mean_token_accuracy": 0.7407024502754211, "num_tokens": 7276938.0, "step": 1398, "train/ce_loss": 0.6260985136032104 }, { "epoch": 0.1382242436227012, "step": 1398, "train/sim_loss": 0.10546875 }, { "epoch": 0.1382242436227012, "step": 1398, "train/total_loss": 0.16807860136032104 }, { "entropy": 9.427846908569336, "epoch": 0.13832311647221673, "mean_token_accuracy": 0.6952381134033203, "num_tokens": 7282047.0, "step": 1399, "train/ce_loss": 1.5995008945465088 }, { "epoch": 0.13832311647221673, "step": 1399, "train/sim_loss": 0.078125 }, { "epoch": 0.13832311647221673, "step": 1399, "train/total_loss": 0.23807509243488312 }, { "epoch": 0.13842198932173225, "grad_norm": 0.9312789440155029, "learning_rate": 9.656579142560452e-06, "loss": 0.1721, "step": 1400 }, { "entropy": 9.420281410217285, "epoch": 0.13842198932173225, "mean_token_accuracy": 0.7587600946426392, "num_tokens": 7287273.0, "step": 1400, "train/ce_loss": 0.911980390548706 }, { "epoch": 0.13842198932173225, "step": 1400, "train/sim_loss": 0.0859375 }, { "epoch": 0.13842198932173225, "step": 1400, "train/total_loss": 0.17713554203510284 }, { "entropy": 10.491708755493164, "epoch": 0.1385208621712478, "mean_token_accuracy": 0.764102578163147, "num_tokens": 7291863.0, "step": 1401, "train/ce_loss": 2.9230513973743655e-05 }, { "epoch": 0.1385208621712478, "step": 1401, "train/sim_loss": 0.0625 }, { "epoch": 0.1385208621712478, "step": 1401, "train/total_loss": 0.062502920627594 }, { "entropy": 9.89574146270752, "epoch": 0.1386197350207633, "mean_token_accuracy": 0.6787072420120239, "num_tokens": 7296824.0, "step": 1402, "train/ce_loss": 1.0397026538848877 }, { "epoch": 0.1386197350207633, "step": 1402, "train/sim_loss": 0.12109375 }, { "epoch": 0.1386197350207633, "step": 1402, "train/total_loss": 0.2250640094280243 }, { "entropy": 9.101736068725586, "epoch": 0.13871860787027882, "mean_token_accuracy": 0.6821191906929016, "num_tokens": 7302205.0, "step": 1403, "train/ce_loss": 0.3729458153247833 }, { "epoch": 0.13871860787027882, "step": 1403, "train/sim_loss": 0.046875 }, { "epoch": 0.13871860787027882, "step": 1403, "train/total_loss": 0.08416958153247833 }, { "entropy": 9.347423553466797, "epoch": 0.13881748071979436, "mean_token_accuracy": 0.6495097875595093, "num_tokens": 7307695.0, "step": 1404, "train/ce_loss": 1.291334867477417 }, { "epoch": 0.13881748071979436, "step": 1404, "train/sim_loss": 0.0625 }, { "epoch": 0.13881748071979436, "step": 1404, "train/total_loss": 0.19163349270820618 }, { "entropy": 9.36381721496582, "epoch": 0.13891635356930987, "mean_token_accuracy": 0.7665745615959167, "num_tokens": 7312906.0, "step": 1405, "train/ce_loss": 0.6868124008178711 }, { "epoch": 0.13891635356930987, "step": 1405, "train/sim_loss": 0.078125 }, { "epoch": 0.13891635356930987, "step": 1405, "train/total_loss": 0.1468062400817871 }, { "entropy": 9.406861305236816, "epoch": 0.13901522641882538, "mean_token_accuracy": 0.7622950673103333, "num_tokens": 7318039.0, "step": 1406, "train/ce_loss": 0.9608842730522156 }, { "epoch": 0.13901522641882538, "step": 1406, "train/sim_loss": 0.09375 }, { "epoch": 0.13901522641882538, "step": 1406, "train/total_loss": 0.1898384392261505 }, { "entropy": 9.218897819519043, "epoch": 0.13911409926834092, "mean_token_accuracy": 0.7496790885925293, "num_tokens": 7323261.0, "step": 1407, "train/ce_loss": 0.6161945462226868 }, { "epoch": 0.13911409926834092, "step": 1407, "train/sim_loss": 0.03125 }, { "epoch": 0.13911409926834092, "step": 1407, "train/total_loss": 0.09286946058273315 }, { "entropy": 9.789392471313477, "epoch": 0.13921297211785644, "mean_token_accuracy": 0.6774774789810181, "num_tokens": 7328238.0, "step": 1408, "train/ce_loss": 1.5993882417678833 }, { "epoch": 0.13921297211785644, "step": 1408, "train/sim_loss": 0.1171875 }, { "epoch": 0.13921297211785644, "step": 1408, "train/total_loss": 0.2771263122558594 }, { "entropy": 9.464317321777344, "epoch": 0.13931184496737195, "mean_token_accuracy": 0.6744186282157898, "num_tokens": 7333407.0, "step": 1409, "train/ce_loss": 7.5892635322816204e-06 }, { "epoch": 0.13931184496737195, "step": 1409, "train/sim_loss": 0.07421875 }, { "epoch": 0.13931184496737195, "step": 1409, "train/total_loss": 0.07421950995922089 }, { "entropy": 8.900279998779297, "epoch": 0.1394107178168875, "mean_token_accuracy": 0.7890382409095764, "num_tokens": 7338884.0, "step": 1410, "train/ce_loss": 0.5504341125488281 }, { "epoch": 0.1394107178168875, "step": 1410, "train/sim_loss": 0.03125 }, { "epoch": 0.1394107178168875, "step": 1410, "train/total_loss": 0.08629341423511505 }, { "entropy": 8.84177017211914, "epoch": 0.139509590666403, "mean_token_accuracy": 0.7534090876579285, "num_tokens": 7344229.0, "step": 1411, "train/ce_loss": 0.7043808698654175 }, { "epoch": 0.139509590666403, "step": 1411, "train/sim_loss": 0.01953125 }, { "epoch": 0.139509590666403, "step": 1411, "train/total_loss": 0.08996933698654175 }, { "entropy": 9.614435195922852, "epoch": 0.13960846351591852, "mean_token_accuracy": 0.6666666865348816, "num_tokens": 7349216.0, "step": 1412, "train/ce_loss": 2.0270464420318604 }, { "epoch": 0.13960846351591852, "step": 1412, "train/sim_loss": 0.08984375 }, { "epoch": 0.13960846351591852, "step": 1412, "train/total_loss": 0.29254841804504395 }, { "entropy": 9.185687065124512, "epoch": 0.13970733636543406, "mean_token_accuracy": 0.7352555990219116, "num_tokens": 7354461.0, "step": 1413, "train/ce_loss": 0.8546119928359985 }, { "epoch": 0.13970733636543406, "step": 1413, "train/sim_loss": 0.07421875 }, { "epoch": 0.13970733636543406, "step": 1413, "train/total_loss": 0.15967994928359985 }, { "entropy": 9.345466613769531, "epoch": 0.13980620921494957, "mean_token_accuracy": 0.7362499833106995, "num_tokens": 7359711.0, "step": 1414, "train/ce_loss": 1.5574983358383179 }, { "epoch": 0.13980620921494957, "step": 1414, "train/sim_loss": 0.0859375 }, { "epoch": 0.13980620921494957, "step": 1414, "train/total_loss": 0.2416873425245285 }, { "entropy": 8.867634773254395, "epoch": 0.13990508206446509, "mean_token_accuracy": 0.7376705408096313, "num_tokens": 7365166.0, "step": 1415, "train/ce_loss": 0.7297521829605103 }, { "epoch": 0.13990508206446509, "step": 1415, "train/sim_loss": 0.0390625 }, { "epoch": 0.13990508206446509, "step": 1415, "train/total_loss": 0.11203771829605103 }, { "entropy": 8.939706802368164, "epoch": 0.14000395491398063, "mean_token_accuracy": 0.6901565790176392, "num_tokens": 7370554.0, "step": 1416, "train/ce_loss": 1.0094400644302368 }, { "epoch": 0.14000395491398063, "step": 1416, "train/sim_loss": 0.1328125 }, { "epoch": 0.14000395491398063, "step": 1416, "train/total_loss": 0.23375651240348816 }, { "entropy": 9.093274116516113, "epoch": 0.14010282776349614, "mean_token_accuracy": 0.7566079497337341, "num_tokens": 7375943.0, "step": 1417, "train/ce_loss": 0.6850955486297607 }, { "epoch": 0.14010282776349614, "step": 1417, "train/sim_loss": 0.03515625 }, { "epoch": 0.14010282776349614, "step": 1417, "train/total_loss": 0.1036658063530922 }, { "entropy": 9.350330352783203, "epoch": 0.14020170061301165, "mean_token_accuracy": 0.7856155037879944, "num_tokens": 7381104.0, "step": 1418, "train/ce_loss": 0.8399333953857422 }, { "epoch": 0.14020170061301165, "step": 1418, "train/sim_loss": 0.046875 }, { "epoch": 0.14020170061301165, "step": 1418, "train/total_loss": 0.1308683454990387 }, { "entropy": 9.108048439025879, "epoch": 0.1403005734625272, "mean_token_accuracy": 0.7621809840202332, "num_tokens": 7386436.0, "step": 1419, "train/ce_loss": 0.876664936542511 }, { "epoch": 0.1403005734625272, "step": 1419, "train/sim_loss": 0.08203125 }, { "epoch": 0.1403005734625272, "step": 1419, "train/total_loss": 0.16969774663448334 }, { "epoch": 0.1403994463120427, "grad_norm": 0.8239012360572815, "learning_rate": 9.651634277802503e-06, "loss": 0.1676, "step": 1420 }, { "entropy": 9.418233871459961, "epoch": 0.1403994463120427, "mean_token_accuracy": 0.7874125838279724, "num_tokens": 7391582.0, "step": 1420, "train/ce_loss": 0.986320972442627 }, { "epoch": 0.1403994463120427, "step": 1420, "train/sim_loss": 0.0625 }, { "epoch": 0.1403994463120427, "step": 1420, "train/total_loss": 0.1611320972442627 }, { "entropy": 9.41115951538086, "epoch": 0.14049831916155825, "mean_token_accuracy": 0.7489986419677734, "num_tokens": 7396791.0, "step": 1421, "train/ce_loss": 1.4863344430923462 }, { "epoch": 0.14049831916155825, "step": 1421, "train/sim_loss": 0.0859375 }, { "epoch": 0.14049831916155825, "step": 1421, "train/total_loss": 0.2345709502696991 }, { "entropy": 9.614538192749023, "epoch": 0.14059719201107376, "mean_token_accuracy": 0.7886056900024414, "num_tokens": 7401889.0, "step": 1422, "train/ce_loss": 0.8483023643493652 }, { "epoch": 0.14059719201107376, "step": 1422, "train/sim_loss": 0.0625 }, { "epoch": 0.14059719201107376, "step": 1422, "train/total_loss": 0.14733023941516876 }, { "entropy": 9.525871276855469, "epoch": 0.14069606486058928, "mean_token_accuracy": 0.7485029697418213, "num_tokens": 7406993.0, "step": 1423, "train/ce_loss": 1.4722119569778442 }, { "epoch": 0.14069606486058928, "step": 1423, "train/sim_loss": 0.0625 }, { "epoch": 0.14069606486058928, "step": 1423, "train/total_loss": 0.20972119271755219 }, { "entropy": 9.20797061920166, "epoch": 0.14079493771010482, "mean_token_accuracy": 0.6947835683822632, "num_tokens": 7412350.0, "step": 1424, "train/ce_loss": 1.1180000305175781 }, { "epoch": 0.14079493771010482, "step": 1424, "train/sim_loss": 0.08984375 }, { "epoch": 0.14079493771010482, "step": 1424, "train/total_loss": 0.20164376497268677 }, { "entropy": 9.134281158447266, "epoch": 0.14089381055962033, "mean_token_accuracy": 0.747586190700531, "num_tokens": 7417620.0, "step": 1425, "train/ce_loss": 0.5653731226921082 }, { "epoch": 0.14089381055962033, "step": 1425, "train/sim_loss": 0.08203125 }, { "epoch": 0.14089381055962033, "step": 1425, "train/total_loss": 0.13856856524944305 }, { "entropy": 9.078322410583496, "epoch": 0.14099268340913584, "mean_token_accuracy": 0.7253086566925049, "num_tokens": 7423051.0, "step": 1426, "train/ce_loss": 1.2762891054153442 }, { "epoch": 0.14099268340913584, "step": 1426, "train/sim_loss": 0.078125 }, { "epoch": 0.14099268340913584, "step": 1426, "train/total_loss": 0.20575390756130219 }, { "entropy": 9.37423324584961, "epoch": 0.14109155625865138, "mean_token_accuracy": 0.6671575903892517, "num_tokens": 7428196.0, "step": 1427, "train/ce_loss": 1.6712976694107056 }, { "epoch": 0.14109155625865138, "step": 1427, "train/sim_loss": 0.12890625 }, { "epoch": 0.14109155625865138, "step": 1427, "train/total_loss": 0.2960360050201416 }, { "entropy": 9.619972229003906, "epoch": 0.1411904291081669, "mean_token_accuracy": 0.7265100479125977, "num_tokens": 7433246.0, "step": 1428, "train/ce_loss": 1.672378420829773 }, { "epoch": 0.1411904291081669, "step": 1428, "train/sim_loss": 0.09375 }, { "epoch": 0.1411904291081669, "step": 1428, "train/total_loss": 0.2609878480434418 }, { "entropy": 9.88757038116455, "epoch": 0.1412893019576824, "mean_token_accuracy": 0.7270992398262024, "num_tokens": 7438150.0, "step": 1429, "train/ce_loss": 0.7832049131393433 }, { "epoch": 0.1412893019576824, "step": 1429, "train/sim_loss": 0.0859375 }, { "epoch": 0.1412893019576824, "step": 1429, "train/total_loss": 0.16425800323486328 }, { "entropy": 9.301030158996582, "epoch": 0.14138817480719795, "mean_token_accuracy": 0.6925169825553894, "num_tokens": 7443337.0, "step": 1430, "train/ce_loss": 0.8136307001113892 }, { "epoch": 0.14138817480719795, "step": 1430, "train/sim_loss": 0.0625 }, { "epoch": 0.14138817480719795, "step": 1430, "train/total_loss": 0.14386308193206787 }, { "entropy": 8.811057090759277, "epoch": 0.14148704765671347, "mean_token_accuracy": 0.7604060769081116, "num_tokens": 7448858.0, "step": 1431, "train/ce_loss": 0.9372698664665222 }, { "epoch": 0.14148704765671347, "step": 1431, "train/sim_loss": 0.1328125 }, { "epoch": 0.14148704765671347, "step": 1431, "train/total_loss": 0.2265394926071167 }, { "entropy": 9.793838500976562, "epoch": 0.14158592050622898, "mean_token_accuracy": 0.7733089327812195, "num_tokens": 7453846.0, "step": 1432, "train/ce_loss": 1.3188660144805908 }, { "epoch": 0.14158592050622898, "step": 1432, "train/sim_loss": 0.125 }, { "epoch": 0.14158592050622898, "step": 1432, "train/total_loss": 0.2568866014480591 }, { "entropy": 8.781266212463379, "epoch": 0.14168479335574452, "mean_token_accuracy": 0.7399267554283142, "num_tokens": 7459427.0, "step": 1433, "train/ce_loss": 1.0480564832687378 }, { "epoch": 0.14168479335574452, "step": 1433, "train/sim_loss": 0.109375 }, { "epoch": 0.14168479335574452, "step": 1433, "train/total_loss": 0.21418064832687378 }, { "entropy": 9.048070907592773, "epoch": 0.14178366620526003, "mean_token_accuracy": 0.75, "num_tokens": 7464698.0, "step": 1434, "train/ce_loss": 0.6495194435119629 }, { "epoch": 0.14178366620526003, "step": 1434, "train/sim_loss": 0.046875 }, { "epoch": 0.14178366620526003, "step": 1434, "train/total_loss": 0.11182694882154465 }, { "entropy": 9.225584030151367, "epoch": 0.14188253905477555, "mean_token_accuracy": 0.7363515496253967, "num_tokens": 7469916.0, "step": 1435, "train/ce_loss": 0.8719775676727295 }, { "epoch": 0.14188253905477555, "step": 1435, "train/sim_loss": 0.08203125 }, { "epoch": 0.14188253905477555, "step": 1435, "train/total_loss": 0.16922900080680847 }, { "entropy": 9.48530101776123, "epoch": 0.1419814119042911, "mean_token_accuracy": 0.6973294019699097, "num_tokens": 7475084.0, "step": 1436, "train/ce_loss": 0.7331254482269287 }, { "epoch": 0.1419814119042911, "step": 1436, "train/sim_loss": 0.1015625 }, { "epoch": 0.1419814119042911, "step": 1436, "train/total_loss": 0.17487505078315735 }, { "entropy": 8.971145629882812, "epoch": 0.1420802847538066, "mean_token_accuracy": 0.7418181896209717, "num_tokens": 7480409.0, "step": 1437, "train/ce_loss": 1.2895128726959229 }, { "epoch": 0.1420802847538066, "step": 1437, "train/sim_loss": 0.09375 }, { "epoch": 0.1420802847538066, "step": 1437, "train/total_loss": 0.222701296210289 }, { "entropy": 10.205839157104492, "epoch": 0.14217915760332211, "mean_token_accuracy": 0.6894736886024475, "num_tokens": 7485142.0, "step": 1438, "train/ce_loss": 2.168229103088379 }, { "epoch": 0.14217915760332211, "step": 1438, "train/sim_loss": 0.08203125 }, { "epoch": 0.14217915760332211, "step": 1438, "train/total_loss": 0.29885417222976685 }, { "entropy": 8.872293472290039, "epoch": 0.14227803045283766, "mean_token_accuracy": 0.7761341333389282, "num_tokens": 7490696.0, "step": 1439, "train/ce_loss": 0.7610247731208801 }, { "epoch": 0.14227803045283766, "step": 1439, "train/sim_loss": 0.1171875 }, { "epoch": 0.14227803045283766, "step": 1439, "train/total_loss": 0.19328998029232025 }, { "epoch": 0.14237690330235317, "grad_norm": 0.9774222373962402, "learning_rate": 9.646689413044555e-06, "loss": 0.1714, "step": 1440 }, { "entropy": 9.629098892211914, "epoch": 0.14237690330235317, "mean_token_accuracy": 0.6889579892158508, "num_tokens": 7495780.0, "step": 1440, "train/ce_loss": 1.546508550643921 }, { "epoch": 0.14237690330235317, "step": 1440, "train/sim_loss": 0.0703125 }, { "epoch": 0.14237690330235317, "step": 1440, "train/total_loss": 0.22496335208415985 }, { "entropy": 8.989510536193848, "epoch": 0.1424757761518687, "mean_token_accuracy": 0.7749725580215454, "num_tokens": 7501192.0, "step": 1441, "train/ce_loss": 0.7729292511940002 }, { "epoch": 0.1424757761518687, "step": 1441, "train/sim_loss": 0.11328125 }, { "epoch": 0.1424757761518687, "step": 1441, "train/total_loss": 0.19057416915893555 }, { "entropy": 9.388324737548828, "epoch": 0.14257464900138422, "mean_token_accuracy": 0.7248322367668152, "num_tokens": 7506414.0, "step": 1442, "train/ce_loss": 0.4920440912246704 }, { "epoch": 0.14257464900138422, "step": 1442, "train/sim_loss": 0.0703125 }, { "epoch": 0.14257464900138422, "step": 1442, "train/total_loss": 0.11951690912246704 }, { "entropy": 9.656713485717773, "epoch": 0.14267352185089974, "mean_token_accuracy": 0.7351524829864502, "num_tokens": 7511502.0, "step": 1443, "train/ce_loss": 1.2703362703323364 }, { "epoch": 0.14267352185089974, "step": 1443, "train/sim_loss": 0.046875 }, { "epoch": 0.14267352185089974, "step": 1443, "train/total_loss": 0.17390863597393036 }, { "entropy": 9.42349624633789, "epoch": 0.14277239470041528, "mean_token_accuracy": 0.6608344316482544, "num_tokens": 7516696.0, "step": 1444, "train/ce_loss": 1.5800093412399292 }, { "epoch": 0.14277239470041528, "step": 1444, "train/sim_loss": 0.0703125 }, { "epoch": 0.14277239470041528, "step": 1444, "train/total_loss": 0.22831343114376068 }, { "entropy": 9.205739974975586, "epoch": 0.1428712675499308, "mean_token_accuracy": 0.7598944306373596, "num_tokens": 7521908.0, "step": 1445, "train/ce_loss": 0.8339932560920715 }, { "epoch": 0.1428712675499308, "step": 1445, "train/sim_loss": 0.05859375 }, { "epoch": 0.1428712675499308, "step": 1445, "train/total_loss": 0.14199307560920715 }, { "entropy": 9.594503402709961, "epoch": 0.1429701403994463, "mean_token_accuracy": 0.75, "num_tokens": 7526952.0, "step": 1446, "train/ce_loss": 1.2667876482009888 }, { "epoch": 0.1429701403994463, "step": 1446, "train/sim_loss": 0.0859375 }, { "epoch": 0.1429701403994463, "step": 1446, "train/total_loss": 0.21261626482009888 }, { "entropy": 9.727521896362305, "epoch": 0.14306901324896185, "mean_token_accuracy": 0.7665056586265564, "num_tokens": 7532050.0, "step": 1447, "train/ce_loss": 0.8516501784324646 }, { "epoch": 0.14306901324896185, "step": 1447, "train/sim_loss": 0.07421875 }, { "epoch": 0.14306901324896185, "step": 1447, "train/total_loss": 0.15938377380371094 }, { "entropy": 9.347183227539062, "epoch": 0.14316788609847736, "mean_token_accuracy": 0.7016248106956482, "num_tokens": 7537184.0, "step": 1448, "train/ce_loss": 1.6525059938430786 }, { "epoch": 0.14316788609847736, "step": 1448, "train/sim_loss": 0.0703125 }, { "epoch": 0.14316788609847736, "step": 1448, "train/total_loss": 0.23556309938430786 }, { "entropy": 8.938035011291504, "epoch": 0.14326675894799287, "mean_token_accuracy": 0.713178277015686, "num_tokens": 7542662.0, "step": 1449, "train/ce_loss": 1.2951596975326538 }, { "epoch": 0.14326675894799287, "step": 1449, "train/sim_loss": 0.1171875 }, { "epoch": 0.14326675894799287, "step": 1449, "train/total_loss": 0.24670347571372986 }, { "entropy": 9.225709915161133, "epoch": 0.1433656317975084, "mean_token_accuracy": 0.7651775479316711, "num_tokens": 7547966.0, "step": 1450, "train/ce_loss": 0.8416441082954407 }, { "epoch": 0.1433656317975084, "step": 1450, "train/sim_loss": 0.06640625 }, { "epoch": 0.1433656317975084, "step": 1450, "train/total_loss": 0.15057066082954407 }, { "entropy": 9.266736030578613, "epoch": 0.14346450464702393, "mean_token_accuracy": 0.7343065738677979, "num_tokens": 7553141.0, "step": 1451, "train/ce_loss": 1.0648012161254883 }, { "epoch": 0.14346450464702393, "step": 1451, "train/sim_loss": 0.078125 }, { "epoch": 0.14346450464702393, "step": 1451, "train/total_loss": 0.18460512161254883 }, { "entropy": 9.525014877319336, "epoch": 0.14356337749653944, "mean_token_accuracy": 0.7324749827384949, "num_tokens": 7558305.0, "step": 1452, "train/ce_loss": 1.7234762907028198 }, { "epoch": 0.14356337749653944, "step": 1452, "train/sim_loss": 0.1171875 }, { "epoch": 0.14356337749653944, "step": 1452, "train/total_loss": 0.28953513503074646 }, { "entropy": 8.922002792358398, "epoch": 0.14366225034605498, "mean_token_accuracy": 0.6986986994743347, "num_tokens": 7563774.0, "step": 1453, "train/ce_loss": 0.5577152967453003 }, { "epoch": 0.14366225034605498, "step": 1453, "train/sim_loss": 0.03515625 }, { "epoch": 0.14366225034605498, "step": 1453, "train/total_loss": 0.09092777967453003 }, { "entropy": 9.489364624023438, "epoch": 0.1437611231955705, "mean_token_accuracy": 0.7285714149475098, "num_tokens": 7568986.0, "step": 1454, "train/ce_loss": 0.6063663959503174 }, { "epoch": 0.1437611231955705, "step": 1454, "train/sim_loss": 0.0703125 }, { "epoch": 0.1437611231955705, "step": 1454, "train/total_loss": 0.13094913959503174 }, { "entropy": 9.168773651123047, "epoch": 0.143859996045086, "mean_token_accuracy": 0.7039238810539246, "num_tokens": 7574270.0, "step": 1455, "train/ce_loss": 0.8265439867973328 }, { "epoch": 0.143859996045086, "step": 1455, "train/sim_loss": 0.09765625 }, { "epoch": 0.143859996045086, "step": 1455, "train/total_loss": 0.18031065165996552 }, { "entropy": 9.695110321044922, "epoch": 0.14395886889460155, "mean_token_accuracy": 0.7261484265327454, "num_tokens": 7579265.0, "step": 1456, "train/ce_loss": 1.3975584806757979e-05 }, { "epoch": 0.14395886889460155, "step": 1456, "train/sim_loss": 0.0859375 }, { "epoch": 0.14395886889460155, "step": 1456, "train/total_loss": 0.08593890070915222 }, { "entropy": 9.658077239990234, "epoch": 0.14405774174411706, "mean_token_accuracy": 0.7463557124137878, "num_tokens": 7584372.0, "step": 1457, "train/ce_loss": 1.2585256099700928 }, { "epoch": 0.14405774174411706, "step": 1457, "train/sim_loss": 0.0390625 }, { "epoch": 0.14405774174411706, "step": 1457, "train/total_loss": 0.164915069937706 }, { "entropy": 9.691431045532227, "epoch": 0.14415661459363258, "mean_token_accuracy": 0.7751798629760742, "num_tokens": 7589333.0, "step": 1458, "train/ce_loss": 1.035921013681218e-05 }, { "epoch": 0.14415661459363258, "step": 1458, "train/sim_loss": 0.06640625 }, { "epoch": 0.14415661459363258, "step": 1458, "train/total_loss": 0.06640728563070297 }, { "entropy": 9.916348457336426, "epoch": 0.14425548744314812, "mean_token_accuracy": 0.7798742055892944, "num_tokens": 7594257.0, "step": 1459, "train/ce_loss": 1.742569088935852 }, { "epoch": 0.14425548744314812, "step": 1459, "train/sim_loss": 0.0859375 }, { "epoch": 0.14425548744314812, "step": 1459, "train/total_loss": 0.26019442081451416 }, { "epoch": 0.14435436029266363, "grad_norm": 1.05721914768219, "learning_rate": 9.641744548286605e-06, "loss": 0.1613, "step": 1460 }, { "entropy": 9.46556282043457, "epoch": 0.14435436029266363, "mean_token_accuracy": 0.7061469554901123, "num_tokens": 7599326.0, "step": 1460, "train/ce_loss": 1.0082299709320068 }, { "epoch": 0.14435436029266363, "step": 1460, "train/sim_loss": 0.078125 }, { "epoch": 0.14435436029266363, "step": 1460, "train/total_loss": 0.17894800007343292 }, { "entropy": 9.255624771118164, "epoch": 0.14445323314217914, "mean_token_accuracy": 0.6964064240455627, "num_tokens": 7604626.0, "step": 1461, "train/ce_loss": 0.9624316692352295 }, { "epoch": 0.14445323314217914, "step": 1461, "train/sim_loss": 0.1171875 }, { "epoch": 0.14445323314217914, "step": 1461, "train/total_loss": 0.21343067288398743 }, { "entropy": 9.89693546295166, "epoch": 0.14455210599169468, "mean_token_accuracy": 0.7595818638801575, "num_tokens": 7609670.0, "step": 1462, "train/ce_loss": 0.7699599862098694 }, { "epoch": 0.14455210599169468, "step": 1462, "train/sim_loss": 0.0546875 }, { "epoch": 0.14455210599169468, "step": 1462, "train/total_loss": 0.13168349862098694 }, { "entropy": 9.130101203918457, "epoch": 0.1446509788412102, "mean_token_accuracy": 0.7557003498077393, "num_tokens": 7615070.0, "step": 1463, "train/ce_loss": 0.7649668455123901 }, { "epoch": 0.1446509788412102, "step": 1463, "train/sim_loss": 0.0390625 }, { "epoch": 0.1446509788412102, "step": 1463, "train/total_loss": 0.1155591830611229 }, { "entropy": 9.673083305358887, "epoch": 0.14474985169072574, "mean_token_accuracy": 0.7098283767700195, "num_tokens": 7620116.0, "step": 1464, "train/ce_loss": 0.9854806661605835 }, { "epoch": 0.14474985169072574, "step": 1464, "train/sim_loss": 0.1015625 }, { "epoch": 0.14474985169072574, "step": 1464, "train/total_loss": 0.2001105695962906 }, { "entropy": 9.212682723999023, "epoch": 0.14484872454024125, "mean_token_accuracy": 0.6935840845108032, "num_tokens": 7625416.0, "step": 1465, "train/ce_loss": 1.523110270500183 }, { "epoch": 0.14484872454024125, "step": 1465, "train/sim_loss": 0.08984375 }, { "epoch": 0.14484872454024125, "step": 1465, "train/total_loss": 0.2421547770500183 }, { "entropy": 10.001008033752441, "epoch": 0.14494759738975677, "mean_token_accuracy": 0.7122557759284973, "num_tokens": 7630393.0, "step": 1466, "train/ce_loss": 1.0846751928329468 }, { "epoch": 0.14494759738975677, "step": 1466, "train/sim_loss": 0.0546875 }, { "epoch": 0.14494759738975677, "step": 1466, "train/total_loss": 0.16315501928329468 }, { "entropy": 9.443642616271973, "epoch": 0.1450464702392723, "mean_token_accuracy": 0.7260459065437317, "num_tokens": 7635565.0, "step": 1467, "train/ce_loss": 0.9722166657447815 }, { "epoch": 0.1450464702392723, "step": 1467, "train/sim_loss": 0.05078125 }, { "epoch": 0.1450464702392723, "step": 1467, "train/total_loss": 0.14800292253494263 }, { "entropy": 9.269857406616211, "epoch": 0.14514534308878782, "mean_token_accuracy": 0.6705756783485413, "num_tokens": 7640988.0, "step": 1468, "train/ce_loss": 1.412301778793335 }, { "epoch": 0.14514534308878782, "step": 1468, "train/sim_loss": 0.1171875 }, { "epoch": 0.14514534308878782, "step": 1468, "train/total_loss": 0.25841766595840454 }, { "entropy": 9.7373046875, "epoch": 0.14524421593830333, "mean_token_accuracy": 0.7007407546043396, "num_tokens": 7646091.0, "step": 1469, "train/ce_loss": 9.146291631623171e-06 }, { "epoch": 0.14524421593830333, "step": 1469, "train/sim_loss": 0.03515625 }, { "epoch": 0.14524421593830333, "step": 1469, "train/total_loss": 0.03515716642141342 }, { "entropy": 9.382098197937012, "epoch": 0.14534308878781887, "mean_token_accuracy": 0.7916167378425598, "num_tokens": 7651404.0, "step": 1470, "train/ce_loss": 1.072875738143921 }, { "epoch": 0.14534308878781887, "step": 1470, "train/sim_loss": 0.03515625 }, { "epoch": 0.14534308878781887, "step": 1470, "train/total_loss": 0.14244383573532104 }, { "entropy": 8.834354400634766, "epoch": 0.1454419616373344, "mean_token_accuracy": 0.6691973805427551, "num_tokens": 7656797.0, "step": 1471, "train/ce_loss": 1.2199304103851318 }, { "epoch": 0.1454419616373344, "step": 1471, "train/sim_loss": 0.109375 }, { "epoch": 0.1454419616373344, "step": 1471, "train/total_loss": 0.2313680350780487 }, { "entropy": 9.114487648010254, "epoch": 0.1455408344868499, "mean_token_accuracy": 0.718120813369751, "num_tokens": 7662221.0, "step": 1472, "train/ce_loss": 1.2842098474502563 }, { "epoch": 0.1455408344868499, "step": 1472, "train/sim_loss": 0.08203125 }, { "epoch": 0.1455408344868499, "step": 1472, "train/total_loss": 0.21045224368572235 }, { "entropy": 10.331319808959961, "epoch": 0.14563970733636544, "mean_token_accuracy": 0.7259474992752075, "num_tokens": 7666962.0, "step": 1473, "train/ce_loss": 1.2172787189483643 }, { "epoch": 0.14563970733636544, "step": 1473, "train/sim_loss": 0.0546875 }, { "epoch": 0.14563970733636544, "step": 1473, "train/total_loss": 0.17641538381576538 }, { "entropy": 10.037343978881836, "epoch": 0.14573858018588096, "mean_token_accuracy": 0.7982646226882935, "num_tokens": 7671868.0, "step": 1474, "train/ce_loss": 3.5985547583550215e-05 }, { "epoch": 0.14573858018588096, "step": 1474, "train/sim_loss": 0.078125 }, { "epoch": 0.14573858018588096, "step": 1474, "train/total_loss": 0.07812859863042831 }, { "entropy": 9.554939270019531, "epoch": 0.14583745303539647, "mean_token_accuracy": 0.7434402108192444, "num_tokens": 7676999.0, "step": 1475, "train/ce_loss": 0.491904079914093 }, { "epoch": 0.14583745303539647, "step": 1475, "train/sim_loss": 0.109375 }, { "epoch": 0.14583745303539647, "step": 1475, "train/total_loss": 0.15856540203094482 }, { "entropy": 9.385547637939453, "epoch": 0.145936325884912, "mean_token_accuracy": 0.7004889845848083, "num_tokens": 7682266.0, "step": 1476, "train/ce_loss": 1.4973679780960083 }, { "epoch": 0.145936325884912, "step": 1476, "train/sim_loss": 0.125 }, { "epoch": 0.145936325884912, "step": 1476, "train/total_loss": 0.27473682165145874 }, { "entropy": 9.379226684570312, "epoch": 0.14603519873442752, "mean_token_accuracy": 0.718358039855957, "num_tokens": 7687576.0, "step": 1477, "train/ce_loss": 0.6333465576171875 }, { "epoch": 0.14603519873442752, "step": 1477, "train/sim_loss": 0.0859375 }, { "epoch": 0.14603519873442752, "step": 1477, "train/total_loss": 0.149272158741951 }, { "entropy": 9.491981506347656, "epoch": 0.14613407158394304, "mean_token_accuracy": 0.7100591659545898, "num_tokens": 7692734.0, "step": 1478, "train/ce_loss": 0.7247401475906372 }, { "epoch": 0.14613407158394304, "step": 1478, "train/sim_loss": 0.07421875 }, { "epoch": 0.14613407158394304, "step": 1478, "train/total_loss": 0.14669276773929596 }, { "entropy": 9.006340026855469, "epoch": 0.14623294443345858, "mean_token_accuracy": 0.6806231737136841, "num_tokens": 7698217.0, "step": 1479, "train/ce_loss": 1.313188910484314 }, { "epoch": 0.14623294443345858, "step": 1479, "train/sim_loss": 0.06640625 }, { "epoch": 0.14623294443345858, "step": 1479, "train/total_loss": 0.19772514700889587 }, { "epoch": 0.1463318172829741, "grad_norm": 1.075100064277649, "learning_rate": 9.636799683528656e-06, "loss": 0.1685, "step": 1480 }, { "entropy": 9.058917999267578, "epoch": 0.1463318172829741, "mean_token_accuracy": 0.7405857443809509, "num_tokens": 7703678.0, "step": 1480, "train/ce_loss": 1.1823251247406006 }, { "epoch": 0.1463318172829741, "step": 1480, "train/sim_loss": 0.1953125 }, { "epoch": 0.1463318172829741, "step": 1480, "train/total_loss": 0.31354501843452454 }, { "entropy": 9.945625305175781, "epoch": 0.1464306901324896, "mean_token_accuracy": 0.701171875, "num_tokens": 7708647.0, "step": 1481, "train/ce_loss": 1.6655864715576172 }, { "epoch": 0.1464306901324896, "step": 1481, "train/sim_loss": 0.0703125 }, { "epoch": 0.1464306901324896, "step": 1481, "train/total_loss": 0.2368711531162262 }, { "entropy": 9.082256317138672, "epoch": 0.14652956298200515, "mean_token_accuracy": 0.7210718393325806, "num_tokens": 7713952.0, "step": 1482, "train/ce_loss": 1.3255983591079712 }, { "epoch": 0.14652956298200515, "step": 1482, "train/sim_loss": 0.0703125 }, { "epoch": 0.14652956298200515, "step": 1482, "train/total_loss": 0.20287233591079712 }, { "entropy": 10.126291275024414, "epoch": 0.14662843583152066, "mean_token_accuracy": 0.7613065242767334, "num_tokens": 7718771.0, "step": 1483, "train/ce_loss": 1.0498250722885132 }, { "epoch": 0.14662843583152066, "step": 1483, "train/sim_loss": 0.0546875 }, { "epoch": 0.14662843583152066, "step": 1483, "train/total_loss": 0.15967001020908356 }, { "entropy": 9.443145751953125, "epoch": 0.1467273086810362, "mean_token_accuracy": 0.7363751530647278, "num_tokens": 7724015.0, "step": 1484, "train/ce_loss": 1.3845174312591553 }, { "epoch": 0.1467273086810362, "step": 1484, "train/sim_loss": 0.1484375 }, { "epoch": 0.1467273086810362, "step": 1484, "train/total_loss": 0.2868892550468445 }, { "entropy": 10.079927444458008, "epoch": 0.1468261815305517, "mean_token_accuracy": 0.7978723645210266, "num_tokens": 7728918.0, "step": 1485, "train/ce_loss": 0.8031333088874817 }, { "epoch": 0.1468261815305517, "step": 1485, "train/sim_loss": 0.05859375 }, { "epoch": 0.1468261815305517, "step": 1485, "train/total_loss": 0.1389070749282837 }, { "entropy": 10.058080673217773, "epoch": 0.14692505438006723, "mean_token_accuracy": 0.7175398468971252, "num_tokens": 7733758.0, "step": 1486, "train/ce_loss": 3.7512934795813635e-05 }, { "epoch": 0.14692505438006723, "step": 1486, "train/sim_loss": 0.0859375 }, { "epoch": 0.14692505438006723, "step": 1486, "train/total_loss": 0.08594124764204025 }, { "entropy": 9.744989395141602, "epoch": 0.14702392722958277, "mean_token_accuracy": 0.7365930676460266, "num_tokens": 7738850.0, "step": 1487, "train/ce_loss": 1.0534642934799194 }, { "epoch": 0.14702392722958277, "step": 1487, "train/sim_loss": 0.07421875 }, { "epoch": 0.14702392722958277, "step": 1487, "train/total_loss": 0.1795651912689209 }, { "entropy": 9.28485107421875, "epoch": 0.14712280007909828, "mean_token_accuracy": 0.7431551218032837, "num_tokens": 7744070.0, "step": 1488, "train/ce_loss": 0.7180099487304688 }, { "epoch": 0.14712280007909828, "step": 1488, "train/sim_loss": 0.05078125 }, { "epoch": 0.14712280007909828, "step": 1488, "train/total_loss": 0.12258224934339523 }, { "entropy": 9.276777267456055, "epoch": 0.1472216729286138, "mean_token_accuracy": 0.7062423229217529, "num_tokens": 7749376.0, "step": 1489, "train/ce_loss": 0.9177853465080261 }, { "epoch": 0.1472216729286138, "step": 1489, "train/sim_loss": 0.0390625 }, { "epoch": 0.1472216729286138, "step": 1489, "train/total_loss": 0.13084104657173157 }, { "entropy": 9.664016723632812, "epoch": 0.14732054577812934, "mean_token_accuracy": 0.7136222720146179, "num_tokens": 7754478.0, "step": 1490, "train/ce_loss": 1.612519383430481 }, { "epoch": 0.14732054577812934, "step": 1490, "train/sim_loss": 0.1015625 }, { "epoch": 0.14732054577812934, "step": 1490, "train/total_loss": 0.262814462184906 }, { "entropy": 9.540077209472656, "epoch": 0.14741941862764485, "mean_token_accuracy": 0.7288401126861572, "num_tokens": 7759574.0, "step": 1491, "train/ce_loss": 1.006054401397705 }, { "epoch": 0.14741941862764485, "step": 1491, "train/sim_loss": 0.08984375 }, { "epoch": 0.14741941862764485, "step": 1491, "train/total_loss": 0.19044919312000275 }, { "entropy": 9.13851547241211, "epoch": 0.14751829147716036, "mean_token_accuracy": 0.7488636374473572, "num_tokens": 7764984.0, "step": 1492, "train/ce_loss": 0.8154755234718323 }, { "epoch": 0.14751829147716036, "step": 1492, "train/sim_loss": 0.078125 }, { "epoch": 0.14751829147716036, "step": 1492, "train/total_loss": 0.1596725583076477 }, { "entropy": 9.106375694274902, "epoch": 0.1476171643266759, "mean_token_accuracy": 0.7339848279953003, "num_tokens": 7770380.0, "step": 1493, "train/ce_loss": 0.41329845786094666 }, { "epoch": 0.1476171643266759, "step": 1493, "train/sim_loss": 0.0703125 }, { "epoch": 0.1476171643266759, "step": 1493, "train/total_loss": 0.11164234578609467 }, { "entropy": 9.136665344238281, "epoch": 0.14771603717619142, "mean_token_accuracy": 0.7275640964508057, "num_tokens": 7775789.0, "step": 1494, "train/ce_loss": 1.3060837984085083 }, { "epoch": 0.14771603717619142, "step": 1494, "train/sim_loss": 0.1015625 }, { "epoch": 0.14771603717619142, "step": 1494, "train/total_loss": 0.23217087984085083 }, { "entropy": 9.167590141296387, "epoch": 0.14781491002570693, "mean_token_accuracy": 0.7491926550865173, "num_tokens": 7781215.0, "step": 1495, "train/ce_loss": 0.5833946466445923 }, { "epoch": 0.14781491002570693, "step": 1495, "train/sim_loss": 0.0859375 }, { "epoch": 0.14781491002570693, "step": 1495, "train/total_loss": 0.144276961684227 }, { "entropy": 9.965858459472656, "epoch": 0.14791378287522247, "mean_token_accuracy": 0.7415329813957214, "num_tokens": 7786166.0, "step": 1496, "train/ce_loss": 1.4708634614944458 }, { "epoch": 0.14791378287522247, "step": 1496, "train/sim_loss": 0.05078125 }, { "epoch": 0.14791378287522247, "step": 1496, "train/total_loss": 0.19786760210990906 }, { "entropy": 9.608375549316406, "epoch": 0.14801265572473798, "mean_token_accuracy": 0.780415415763855, "num_tokens": 7791279.0, "step": 1497, "train/ce_loss": 0.7494889497756958 }, { "epoch": 0.14801265572473798, "step": 1497, "train/sim_loss": 0.04296875 }, { "epoch": 0.14801265572473798, "step": 1497, "train/total_loss": 0.11791764944791794 }, { "entropy": 9.619302749633789, "epoch": 0.1481115285742535, "mean_token_accuracy": 0.6865671873092651, "num_tokens": 7796438.0, "step": 1498, "train/ce_loss": 1.3919106721878052 }, { "epoch": 0.1481115285742535, "step": 1498, "train/sim_loss": 0.16015625 }, { "epoch": 0.1481115285742535, "step": 1498, "train/total_loss": 0.2993473410606384 }, { "entropy": 9.500656127929688, "epoch": 0.14821040142376904, "mean_token_accuracy": 0.7028796076774597, "num_tokens": 7801662.0, "step": 1499, "train/ce_loss": 1.4153218269348145 }, { "epoch": 0.14821040142376904, "step": 1499, "train/sim_loss": 0.125 }, { "epoch": 0.14821040142376904, "step": 1499, "train/total_loss": 0.26653218269348145 }, { "epoch": 0.14830927427328455, "grad_norm": 1.0314209461212158, "learning_rate": 9.631854818770708e-06, "loss": 0.1656, "step": 1500 }, { "entropy": 9.492480278015137, "epoch": 0.14830927427328455, "mean_token_accuracy": 0.6932599544525146, "num_tokens": 7806835.0, "step": 1500, "train/ce_loss": 1.1232261657714844 }, { "epoch": 0.14830927427328455, "step": 1500, "train/sim_loss": 0.05078125 }, { "epoch": 0.14830927427328455, "step": 1500, "train/total_loss": 0.1631038784980774 }, { "entropy": 9.169363021850586, "epoch": 0.14840814712280007, "mean_token_accuracy": 0.7156549692153931, "num_tokens": 7812264.0, "step": 1501, "train/ce_loss": 0.7301574349403381 }, { "epoch": 0.14840814712280007, "step": 1501, "train/sim_loss": 0.08984375 }, { "epoch": 0.14840814712280007, "step": 1501, "train/total_loss": 0.1628594994544983 }, { "entropy": 9.594934463500977, "epoch": 0.1485070199723156, "mean_token_accuracy": 0.6979332566261292, "num_tokens": 7817365.0, "step": 1502, "train/ce_loss": 2.1555140018463135 }, { "epoch": 0.1485070199723156, "step": 1502, "train/sim_loss": 0.13671875 }, { "epoch": 0.1485070199723156, "step": 1502, "train/total_loss": 0.3522701561450958 }, { "entropy": 9.353469848632812, "epoch": 0.14860589282183112, "mean_token_accuracy": 0.7362637519836426, "num_tokens": 7822630.0, "step": 1503, "train/ce_loss": 0.7238584756851196 }, { "epoch": 0.14860589282183112, "step": 1503, "train/sim_loss": 0.0234375 }, { "epoch": 0.14860589282183112, "step": 1503, "train/total_loss": 0.09582334756851196 }, { "entropy": 9.216747283935547, "epoch": 0.14870476567134666, "mean_token_accuracy": 0.7195402383804321, "num_tokens": 7827948.0, "step": 1504, "train/ce_loss": 0.7982300519943237 }, { "epoch": 0.14870476567134666, "step": 1504, "train/sim_loss": 0.03125 }, { "epoch": 0.14870476567134666, "step": 1504, "train/total_loss": 0.11107300966978073 }, { "entropy": 9.300481796264648, "epoch": 0.14880363852086217, "mean_token_accuracy": 0.7751938104629517, "num_tokens": 7833248.0, "step": 1505, "train/ce_loss": 1.175078272819519 }, { "epoch": 0.14880363852086217, "step": 1505, "train/sim_loss": 0.1015625 }, { "epoch": 0.14880363852086217, "step": 1505, "train/total_loss": 0.21907033026218414 }, { "entropy": 9.822269439697266, "epoch": 0.1489025113703777, "mean_token_accuracy": 0.7555555701255798, "num_tokens": 7838269.0, "step": 1506, "train/ce_loss": 1.6059404611587524 }, { "epoch": 0.1489025113703777, "step": 1506, "train/sim_loss": 0.1015625 }, { "epoch": 0.1489025113703777, "step": 1506, "train/total_loss": 0.26215654611587524 }, { "entropy": 9.485581398010254, "epoch": 0.14900138421989323, "mean_token_accuracy": 0.7326057553291321, "num_tokens": 7843453.0, "step": 1507, "train/ce_loss": 0.951874315738678 }, { "epoch": 0.14900138421989323, "step": 1507, "train/sim_loss": 0.0859375 }, { "epoch": 0.14900138421989323, "step": 1507, "train/total_loss": 0.18112492561340332 }, { "entropy": 9.385790824890137, "epoch": 0.14910025706940874, "mean_token_accuracy": 0.725824773311615, "num_tokens": 7848740.0, "step": 1508, "train/ce_loss": 1.8204835653305054 }, { "epoch": 0.14910025706940874, "step": 1508, "train/sim_loss": 0.1484375 }, { "epoch": 0.14910025706940874, "step": 1508, "train/total_loss": 0.33048588037490845 }, { "entropy": 9.386913299560547, "epoch": 0.14919912991892426, "mean_token_accuracy": 0.7376312017440796, "num_tokens": 7853875.0, "step": 1509, "train/ce_loss": 1.0938643217086792 }, { "epoch": 0.14919912991892426, "step": 1509, "train/sim_loss": 0.08984375 }, { "epoch": 0.14919912991892426, "step": 1509, "train/total_loss": 0.19923019409179688 }, { "entropy": 9.619256973266602, "epoch": 0.1492980027684398, "mean_token_accuracy": 0.7361878156661987, "num_tokens": 7859076.0, "step": 1510, "train/ce_loss": 0.8881609439849854 }, { "epoch": 0.1492980027684398, "step": 1510, "train/sim_loss": 0.07421875 }, { "epoch": 0.1492980027684398, "step": 1510, "train/total_loss": 0.1630348563194275 }, { "entropy": 10.16584587097168, "epoch": 0.1493968756179553, "mean_token_accuracy": 0.8153846263885498, "num_tokens": 7863919.0, "step": 1511, "train/ce_loss": 2.056551238638349e-05 }, { "epoch": 0.1493968756179553, "step": 1511, "train/sim_loss": 0.05859375 }, { "epoch": 0.1493968756179553, "step": 1511, "train/total_loss": 0.05859580636024475 }, { "entropy": 9.185079574584961, "epoch": 0.14949574846747082, "mean_token_accuracy": 0.7560647130012512, "num_tokens": 7869161.0, "step": 1512, "train/ce_loss": 0.828264594078064 }, { "epoch": 0.14949574846747082, "step": 1512, "train/sim_loss": 0.10546875 }, { "epoch": 0.14949574846747082, "step": 1512, "train/total_loss": 0.18829521536827087 }, { "entropy": 9.522941589355469, "epoch": 0.14959462131698636, "mean_token_accuracy": 0.7742424011230469, "num_tokens": 7874312.0, "step": 1513, "train/ce_loss": 0.9248256087303162 }, { "epoch": 0.14959462131698636, "step": 1513, "train/sim_loss": 0.08203125 }, { "epoch": 0.14959462131698636, "step": 1513, "train/total_loss": 0.1745138168334961 }, { "entropy": 9.77141284942627, "epoch": 0.14969349416650188, "mean_token_accuracy": 0.6744186282157898, "num_tokens": 7879340.0, "step": 1514, "train/ce_loss": 1.1118297576904297 }, { "epoch": 0.14969349416650188, "step": 1514, "train/sim_loss": 0.125 }, { "epoch": 0.14969349416650188, "step": 1514, "train/total_loss": 0.23618298768997192 }, { "entropy": 9.230764389038086, "epoch": 0.1497923670160174, "mean_token_accuracy": 0.691142201423645, "num_tokens": 7884685.0, "step": 1515, "train/ce_loss": 0.7974668741226196 }, { "epoch": 0.1497923670160174, "step": 1515, "train/sim_loss": 0.0546875 }, { "epoch": 0.1497923670160174, "step": 1515, "train/total_loss": 0.13443419337272644 }, { "entropy": 9.194846153259277, "epoch": 0.14989123986553293, "mean_token_accuracy": 0.7187893986701965, "num_tokens": 7889955.0, "step": 1516, "train/ce_loss": 0.6924360990524292 }, { "epoch": 0.14989123986553293, "step": 1516, "train/sim_loss": 0.0703125 }, { "epoch": 0.14989123986553293, "step": 1516, "train/total_loss": 0.13955610990524292 }, { "entropy": 9.383991241455078, "epoch": 0.14999011271504845, "mean_token_accuracy": 0.768324613571167, "num_tokens": 7895187.0, "step": 1517, "train/ce_loss": 0.7051076889038086 }, { "epoch": 0.14999011271504845, "step": 1517, "train/sim_loss": 0.07421875 }, { "epoch": 0.14999011271504845, "step": 1517, "train/total_loss": 0.14472952485084534 }, { "entropy": 9.848875999450684, "epoch": 0.15008898556456396, "mean_token_accuracy": 0.7439446449279785, "num_tokens": 7900211.0, "step": 1518, "train/ce_loss": 0.7502337098121643 }, { "epoch": 0.15008898556456396, "step": 1518, "train/sim_loss": 0.13671875 }, { "epoch": 0.15008898556456396, "step": 1518, "train/total_loss": 0.21174213290214539 }, { "entropy": 9.283403396606445, "epoch": 0.1501878584140795, "mean_token_accuracy": 0.7236841917037964, "num_tokens": 7905520.0, "step": 1519, "train/ce_loss": 0.9833032488822937 }, { "epoch": 0.1501878584140795, "step": 1519, "train/sim_loss": 0.08203125 }, { "epoch": 0.1501878584140795, "step": 1519, "train/total_loss": 0.1803615689277649 }, { "epoch": 0.150286731263595, "grad_norm": 1.0560803413391113, "learning_rate": 9.626909954012758e-06, "loss": 0.1666, "step": 1520 }, { "entropy": 9.216127395629883, "epoch": 0.150286731263595, "mean_token_accuracy": 0.7326343655586243, "num_tokens": 7910732.0, "step": 1520, "train/ce_loss": 0.8644230961799622 }, { "epoch": 0.150286731263595, "step": 1520, "train/sim_loss": 0.15234375 }, { "epoch": 0.150286731263595, "step": 1520, "train/total_loss": 0.23878607153892517 }, { "entropy": 9.040716171264648, "epoch": 0.15038560411311053, "mean_token_accuracy": 0.7021716833114624, "num_tokens": 7916187.0, "step": 1521, "train/ce_loss": 0.9593912363052368 }, { "epoch": 0.15038560411311053, "step": 1521, "train/sim_loss": 0.0703125 }, { "epoch": 0.15038560411311053, "step": 1521, "train/total_loss": 0.16625162959098816 }, { "entropy": 9.894933700561523, "epoch": 0.15048447696262607, "mean_token_accuracy": 0.7157676219940186, "num_tokens": 7921098.0, "step": 1522, "train/ce_loss": 1.4292757511138916 }, { "epoch": 0.15048447696262607, "step": 1522, "train/sim_loss": 0.125 }, { "epoch": 0.15048447696262607, "step": 1522, "train/total_loss": 0.2679275870323181 }, { "entropy": 9.750065803527832, "epoch": 0.15058334981214158, "mean_token_accuracy": 0.6848030090332031, "num_tokens": 7926100.0, "step": 1523, "train/ce_loss": 1.9158490896224976 }, { "epoch": 0.15058334981214158, "step": 1523, "train/sim_loss": 0.125 }, { "epoch": 0.15058334981214158, "step": 1523, "train/total_loss": 0.31658491492271423 }, { "entropy": 9.4520902633667, "epoch": 0.15068222266165712, "mean_token_accuracy": 0.7044585943222046, "num_tokens": 7931336.0, "step": 1524, "train/ce_loss": 0.9469541311264038 }, { "epoch": 0.15068222266165712, "step": 1524, "train/sim_loss": 0.078125 }, { "epoch": 0.15068222266165712, "step": 1524, "train/total_loss": 0.17282041907310486 }, { "entropy": 9.343910217285156, "epoch": 0.15078109551117264, "mean_token_accuracy": 0.7021546363830566, "num_tokens": 7936514.0, "step": 1525, "train/ce_loss": 1.0948412418365479 }, { "epoch": 0.15078109551117264, "step": 1525, "train/sim_loss": 0.078125 }, { "epoch": 0.15078109551117264, "step": 1525, "train/total_loss": 0.18760913610458374 }, { "entropy": 9.896944046020508, "epoch": 0.15087996836068815, "mean_token_accuracy": 0.7777777910232544, "num_tokens": 7941447.0, "step": 1526, "train/ce_loss": 0.83757483959198 }, { "epoch": 0.15087996836068815, "step": 1526, "train/sim_loss": 0.05859375 }, { "epoch": 0.15087996836068815, "step": 1526, "train/total_loss": 0.14235123991966248 }, { "entropy": 10.245054244995117, "epoch": 0.1509788412102037, "mean_token_accuracy": 0.7386363744735718, "num_tokens": 7946249.0, "step": 1527, "train/ce_loss": 1.0803058103192598e-05 }, { "epoch": 0.1509788412102037, "step": 1527, "train/sim_loss": 0.0390625 }, { "epoch": 0.1509788412102037, "step": 1527, "train/total_loss": 0.039063580334186554 }, { "entropy": 9.269997596740723, "epoch": 0.1510777140597192, "mean_token_accuracy": 0.6519208550453186, "num_tokens": 7951606.0, "step": 1528, "train/ce_loss": 0.794880747795105 }, { "epoch": 0.1510777140597192, "step": 1528, "train/sim_loss": 0.06640625 }, { "epoch": 0.1510777140597192, "step": 1528, "train/total_loss": 0.14589431881904602 }, { "entropy": 9.347448348999023, "epoch": 0.15117658690923472, "mean_token_accuracy": 0.7196382284164429, "num_tokens": 7956820.0, "step": 1529, "train/ce_loss": 0.5854206681251526 }, { "epoch": 0.15117658690923472, "step": 1529, "train/sim_loss": 0.06640625 }, { "epoch": 0.15117658690923472, "step": 1529, "train/total_loss": 0.12494832277297974 }, { "entropy": 9.150819778442383, "epoch": 0.15127545975875026, "mean_token_accuracy": 0.7061556577682495, "num_tokens": 7962135.0, "step": 1530, "train/ce_loss": 1.0358163118362427 }, { "epoch": 0.15127545975875026, "step": 1530, "train/sim_loss": 0.09375 }, { "epoch": 0.15127545975875026, "step": 1530, "train/total_loss": 0.19733163714408875 }, { "entropy": 9.922235488891602, "epoch": 0.15137433260826577, "mean_token_accuracy": 0.7366412281990051, "num_tokens": 7967078.0, "step": 1531, "train/ce_loss": 1.2564538717269897 }, { "epoch": 0.15137433260826577, "step": 1531, "train/sim_loss": 0.078125 }, { "epoch": 0.15137433260826577, "step": 1531, "train/total_loss": 0.20377038419246674 }, { "entropy": 8.938532829284668, "epoch": 0.15147320545778128, "mean_token_accuracy": 0.7037383317947388, "num_tokens": 7972631.0, "step": 1532, "train/ce_loss": 0.841424286365509 }, { "epoch": 0.15147320545778128, "step": 1532, "train/sim_loss": 0.08984375 }, { "epoch": 0.15147320545778128, "step": 1532, "train/total_loss": 0.17398618161678314 }, { "entropy": 9.842126846313477, "epoch": 0.15157207830729683, "mean_token_accuracy": 0.7336522936820984, "num_tokens": 7977858.0, "step": 1533, "train/ce_loss": 8.909573807613924e-06 }, { "epoch": 0.15157207830729683, "step": 1533, "train/sim_loss": 0.03125 }, { "epoch": 0.15157207830729683, "step": 1533, "train/total_loss": 0.03125089034438133 }, { "entropy": 9.684154510498047, "epoch": 0.15167095115681234, "mean_token_accuracy": 0.7536231875419617, "num_tokens": 7982805.0, "step": 1534, "train/ce_loss": 4.817128137801774e-05 }, { "epoch": 0.15167095115681234, "step": 1534, "train/sim_loss": 0.05859375 }, { "epoch": 0.15167095115681234, "step": 1534, "train/total_loss": 0.05859856680035591 }, { "entropy": 9.907486915588379, "epoch": 0.15176982400632785, "mean_token_accuracy": 0.7193675637245178, "num_tokens": 7987813.0, "step": 1535, "train/ce_loss": 1.0405404282209929e-05 }, { "epoch": 0.15176982400632785, "step": 1535, "train/sim_loss": 0.078125 }, { "epoch": 0.15176982400632785, "step": 1535, "train/total_loss": 0.07812604308128357 }, { "entropy": 9.681005477905273, "epoch": 0.1518686968558434, "mean_token_accuracy": 0.6660412549972534, "num_tokens": 7992807.0, "step": 1536, "train/ce_loss": 3.722548353835009e-05 }, { "epoch": 0.1518686968558434, "step": 1536, "train/sim_loss": 0.0703125 }, { "epoch": 0.1518686968558434, "step": 1536, "train/total_loss": 0.07031622529029846 }, { "entropy": 9.169857025146484, "epoch": 0.1519675697053589, "mean_token_accuracy": 0.7594936490058899, "num_tokens": 7998059.0, "step": 1537, "train/ce_loss": 0.584730863571167 }, { "epoch": 0.1519675697053589, "step": 1537, "train/sim_loss": 0.0625 }, { "epoch": 0.1519675697053589, "step": 1537, "train/total_loss": 0.12097308784723282 }, { "entropy": 9.898794174194336, "epoch": 0.15206644255487442, "mean_token_accuracy": 0.7634854912757874, "num_tokens": 8002938.0, "step": 1538, "train/ce_loss": 0.8556390404701233 }, { "epoch": 0.15206644255487442, "step": 1538, "train/sim_loss": 0.05078125 }, { "epoch": 0.15206644255487442, "step": 1538, "train/total_loss": 0.13634514808654785 }, { "entropy": 9.214766502380371, "epoch": 0.15216531540438996, "mean_token_accuracy": 0.7001166939735413, "num_tokens": 8008306.0, "step": 1539, "train/ce_loss": 7.306869520107284e-06 }, { "epoch": 0.15216531540438996, "step": 1539, "train/sim_loss": 0.046875 }, { "epoch": 0.15216531540438996, "step": 1539, "train/total_loss": 0.0468757301568985 }, { "epoch": 0.15226418825390547, "grad_norm": 1.021117091178894, "learning_rate": 9.62196508925481e-06, "loss": 0.1727, "step": 1540 }, { "entropy": 9.442438125610352, "epoch": 0.15226418825390547, "mean_token_accuracy": 0.752077579498291, "num_tokens": 8013523.0, "step": 1540, "train/ce_loss": 0.7084006667137146 }, { "epoch": 0.15226418825390547, "step": 1540, "train/sim_loss": 0.08984375 }, { "epoch": 0.15226418825390547, "step": 1540, "train/total_loss": 0.16068381071090698 }, { "entropy": 9.875503540039062, "epoch": 0.152363061103421, "mean_token_accuracy": 0.80694979429245, "num_tokens": 8018507.0, "step": 1541, "train/ce_loss": 8.61121679918142e-06 }, { "epoch": 0.152363061103421, "step": 1541, "train/sim_loss": 0.0859375 }, { "epoch": 0.152363061103421, "step": 1541, "train/total_loss": 0.08593836426734924 }, { "entropy": 9.098029136657715, "epoch": 0.15246193395293653, "mean_token_accuracy": 0.6898002028465271, "num_tokens": 8023943.0, "step": 1542, "train/ce_loss": 0.507659375667572 }, { "epoch": 0.15246193395293653, "step": 1542, "train/sim_loss": 0.09765625 }, { "epoch": 0.15246193395293653, "step": 1542, "train/total_loss": 0.14842218160629272 }, { "entropy": 9.00258731842041, "epoch": 0.15256080680245204, "mean_token_accuracy": 0.739393949508667, "num_tokens": 8029469.0, "step": 1543, "train/ce_loss": 0.696696937084198 }, { "epoch": 0.15256080680245204, "step": 1543, "train/sim_loss": 0.140625 }, { "epoch": 0.15256080680245204, "step": 1543, "train/total_loss": 0.2102946937084198 }, { "entropy": 9.308021545410156, "epoch": 0.15265967965196756, "mean_token_accuracy": 0.7641866207122803, "num_tokens": 8034739.0, "step": 1544, "train/ce_loss": 0.6892697811126709 }, { "epoch": 0.15265967965196756, "step": 1544, "train/sim_loss": 0.03125 }, { "epoch": 0.15265967965196756, "step": 1544, "train/total_loss": 0.10017698258161545 }, { "entropy": 9.620084762573242, "epoch": 0.1527585525014831, "mean_token_accuracy": 0.7102649211883545, "num_tokens": 8039836.0, "step": 1545, "train/ce_loss": 1.0270184247929137e-05 }, { "epoch": 0.1527585525014831, "step": 1545, "train/sim_loss": 0.09375 }, { "epoch": 0.1527585525014831, "step": 1545, "train/total_loss": 0.09375102818012238 }, { "entropy": 9.019035339355469, "epoch": 0.1528574253509986, "mean_token_accuracy": 0.688095211982727, "num_tokens": 8045133.0, "step": 1546, "train/ce_loss": 0.926342248916626 }, { "epoch": 0.1528574253509986, "step": 1546, "train/sim_loss": 0.05859375 }, { "epoch": 0.1528574253509986, "step": 1546, "train/total_loss": 0.15122798085212708 }, { "entropy": 10.257822036743164, "epoch": 0.15295629820051415, "mean_token_accuracy": 0.7606837749481201, "num_tokens": 8049916.0, "step": 1547, "train/ce_loss": 1.0292736291885376 }, { "epoch": 0.15295629820051415, "step": 1547, "train/sim_loss": 0.078125 }, { "epoch": 0.15295629820051415, "step": 1547, "train/total_loss": 0.18105235695838928 }, { "entropy": 9.112434387207031, "epoch": 0.15305517105002966, "mean_token_accuracy": 0.7713310718536377, "num_tokens": 8055255.0, "step": 1548, "train/ce_loss": 0.35855820775032043 }, { "epoch": 0.15305517105002966, "step": 1548, "train/sim_loss": 0.05078125 }, { "epoch": 0.15305517105002966, "step": 1548, "train/total_loss": 0.08663707226514816 }, { "entropy": 9.885303497314453, "epoch": 0.15315404389954518, "mean_token_accuracy": 0.7345890402793884, "num_tokens": 8060292.0, "step": 1549, "train/ce_loss": 1.0865520238876343 }, { "epoch": 0.15315404389954518, "step": 1549, "train/sim_loss": 0.09765625 }, { "epoch": 0.15315404389954518, "step": 1549, "train/total_loss": 0.20631146430969238 }, { "entropy": 9.570383071899414, "epoch": 0.15325291674906072, "mean_token_accuracy": 0.7030625939369202, "num_tokens": 8065463.0, "step": 1550, "train/ce_loss": 1.4615740776062012 }, { "epoch": 0.15325291674906072, "step": 1550, "train/sim_loss": 0.05078125 }, { "epoch": 0.15325291674906072, "step": 1550, "train/total_loss": 0.1969386637210846 }, { "entropy": 9.84042739868164, "epoch": 0.15335178959857623, "mean_token_accuracy": 0.7349823117256165, "num_tokens": 8070463.0, "step": 1551, "train/ce_loss": 1.0207172632217407 }, { "epoch": 0.15335178959857623, "step": 1551, "train/sim_loss": 0.0546875 }, { "epoch": 0.15335178959857623, "step": 1551, "train/total_loss": 0.15675923228263855 }, { "entropy": 9.249610900878906, "epoch": 0.15345066244809175, "mean_token_accuracy": 0.7213114500045776, "num_tokens": 8075756.0, "step": 1552, "train/ce_loss": 1.1852610111236572 }, { "epoch": 0.15345066244809175, "step": 1552, "train/sim_loss": 0.109375 }, { "epoch": 0.15345066244809175, "step": 1552, "train/total_loss": 0.22790110111236572 }, { "entropy": 9.230417251586914, "epoch": 0.1535495352976073, "mean_token_accuracy": 0.6813187003135681, "num_tokens": 8081026.0, "step": 1553, "train/ce_loss": 1.0361573696136475 }, { "epoch": 0.1535495352976073, "step": 1553, "train/sim_loss": 0.05078125 }, { "epoch": 0.1535495352976073, "step": 1553, "train/total_loss": 0.15439698100090027 }, { "entropy": 9.386066436767578, "epoch": 0.1536484081471228, "mean_token_accuracy": 0.7218710780143738, "num_tokens": 8086245.0, "step": 1554, "train/ce_loss": 0.7083413600921631 }, { "epoch": 0.1536484081471228, "step": 1554, "train/sim_loss": 0.07421875 }, { "epoch": 0.1536484081471228, "step": 1554, "train/total_loss": 0.14505288004875183 }, { "entropy": 9.255256652832031, "epoch": 0.1537472809966383, "mean_token_accuracy": 0.7347931861877441, "num_tokens": 8091542.0, "step": 1555, "train/ce_loss": 1.0050069093704224 }, { "epoch": 0.1537472809966383, "step": 1555, "train/sim_loss": 0.13671875 }, { "epoch": 0.1537472809966383, "step": 1555, "train/total_loss": 0.2372194528579712 }, { "entropy": 9.164133071899414, "epoch": 0.15384615384615385, "mean_token_accuracy": 0.6586695909500122, "num_tokens": 8096957.0, "step": 1556, "train/ce_loss": 2.3513078689575195 }, { "epoch": 0.15384615384615385, "step": 1556, "train/sim_loss": 0.1328125 }, { "epoch": 0.15384615384615385, "step": 1556, "train/total_loss": 0.36794328689575195 }, { "entropy": 9.656140327453613, "epoch": 0.15394502669566937, "mean_token_accuracy": 0.7245222926139832, "num_tokens": 8102034.0, "step": 1557, "train/ce_loss": 1.2057785987854004 }, { "epoch": 0.15394502669566937, "step": 1557, "train/sim_loss": 0.078125 }, { "epoch": 0.15394502669566937, "step": 1557, "train/total_loss": 0.198702871799469 }, { "entropy": 9.500467300415039, "epoch": 0.15404389954518488, "mean_token_accuracy": 0.7125827670097351, "num_tokens": 8107382.0, "step": 1558, "train/ce_loss": 0.7562654614448547 }, { "epoch": 0.15404389954518488, "step": 1558, "train/sim_loss": 0.05078125 }, { "epoch": 0.15404389954518488, "step": 1558, "train/total_loss": 0.12640780210494995 }, { "entropy": 8.847757339477539, "epoch": 0.15414277239470042, "mean_token_accuracy": 0.6982097029685974, "num_tokens": 8112637.0, "step": 1559, "train/ce_loss": 1.044164776802063 }, { "epoch": 0.15414277239470042, "step": 1559, "train/sim_loss": 0.078125 }, { "epoch": 0.15414277239470042, "step": 1559, "train/total_loss": 0.18254148960113525 }, { "epoch": 0.15424164524421594, "grad_norm": 1.083085060119629, "learning_rate": 9.617020224496861e-06, "loss": 0.1719, "step": 1560 }, { "entropy": 10.149479866027832, "epoch": 0.15424164524421594, "mean_token_accuracy": 0.7255370020866394, "num_tokens": 8117514.0, "step": 1560, "train/ce_loss": 2.61259210674325e-05 }, { "epoch": 0.15424164524421594, "step": 1560, "train/sim_loss": 0.0625 }, { "epoch": 0.15424164524421594, "step": 1560, "train/total_loss": 0.06250261515378952 }, { "entropy": 9.694318771362305, "epoch": 0.15434051809373145, "mean_token_accuracy": 0.6761363744735718, "num_tokens": 8122653.0, "step": 1561, "train/ce_loss": 1.5257987976074219 }, { "epoch": 0.15434051809373145, "step": 1561, "train/sim_loss": 0.04296875 }, { "epoch": 0.15434051809373145, "step": 1561, "train/total_loss": 0.1955486387014389 }, { "entropy": 9.275548934936523, "epoch": 0.154439390943247, "mean_token_accuracy": 0.6985645890235901, "num_tokens": 8127997.0, "step": 1562, "train/ce_loss": 1.181897759437561 }, { "epoch": 0.154439390943247, "step": 1562, "train/sim_loss": 0.11328125 }, { "epoch": 0.154439390943247, "step": 1562, "train/total_loss": 0.23147103190422058 }, { "entropy": 9.00159740447998, "epoch": 0.1545382637927625, "mean_token_accuracy": 0.7092896103858948, "num_tokens": 8133435.0, "step": 1563, "train/ce_loss": 1.2038424015045166 }, { "epoch": 0.1545382637927625, "step": 1563, "train/sim_loss": 0.0703125 }, { "epoch": 0.1545382637927625, "step": 1563, "train/total_loss": 0.19069674611091614 }, { "entropy": 9.292047500610352, "epoch": 0.15463713664227802, "mean_token_accuracy": 0.7600979208946228, "num_tokens": 8138693.0, "step": 1564, "train/ce_loss": 1.0820578336715698 }, { "epoch": 0.15463713664227802, "step": 1564, "train/sim_loss": 0.0546875 }, { "epoch": 0.15463713664227802, "step": 1564, "train/total_loss": 0.16289329528808594 }, { "entropy": 9.516674995422363, "epoch": 0.15473600949179356, "mean_token_accuracy": 0.7142857313156128, "num_tokens": 8143884.0, "step": 1565, "train/ce_loss": 0.7768955230712891 }, { "epoch": 0.15473600949179356, "step": 1565, "train/sim_loss": 0.0859375 }, { "epoch": 0.15473600949179356, "step": 1565, "train/total_loss": 0.16362705826759338 }, { "entropy": 9.774168968200684, "epoch": 0.15483488234130907, "mean_token_accuracy": 0.7204724550247192, "num_tokens": 8148876.0, "step": 1566, "train/ce_loss": 1.6146434545516968 }, { "epoch": 0.15483488234130907, "step": 1566, "train/sim_loss": 0.09375 }, { "epoch": 0.15483488234130907, "step": 1566, "train/total_loss": 0.2552143335342407 }, { "entropy": 9.159017562866211, "epoch": 0.1549337551908246, "mean_token_accuracy": 0.7898627519607544, "num_tokens": 8154290.0, "step": 1567, "train/ce_loss": 0.40811553597450256 }, { "epoch": 0.1549337551908246, "step": 1567, "train/sim_loss": 0.05078125 }, { "epoch": 0.1549337551908246, "step": 1567, "train/total_loss": 0.09159280359745026 }, { "entropy": 9.256782531738281, "epoch": 0.15503262804034013, "mean_token_accuracy": 0.6602316498756409, "num_tokens": 8159488.0, "step": 1568, "train/ce_loss": 1.8952783346176147 }, { "epoch": 0.15503262804034013, "step": 1568, "train/sim_loss": 0.1171875 }, { "epoch": 0.15503262804034013, "step": 1568, "train/total_loss": 0.30671533942222595 }, { "entropy": 9.015083312988281, "epoch": 0.15513150088985564, "mean_token_accuracy": 0.7527352571487427, "num_tokens": 8164885.0, "step": 1569, "train/ce_loss": 0.5627985000610352 }, { "epoch": 0.15513150088985564, "step": 1569, "train/sim_loss": 0.09765625 }, { "epoch": 0.15513150088985564, "step": 1569, "train/total_loss": 0.15393610298633575 }, { "entropy": 9.708174705505371, "epoch": 0.15523037373937118, "mean_token_accuracy": 0.7035398483276367, "num_tokens": 8170002.0, "step": 1570, "train/ce_loss": 8.378126949537545e-06 }, { "epoch": 0.15523037373937118, "step": 1570, "train/sim_loss": 0.0390625 }, { "epoch": 0.15523037373937118, "step": 1570, "train/total_loss": 0.039063338190317154 }, { "entropy": 10.102365493774414, "epoch": 0.1553292465888867, "mean_token_accuracy": 0.7489539980888367, "num_tokens": 8174859.0, "step": 1571, "train/ce_loss": 2.2206978797912598 }, { "epoch": 0.1553292465888867, "step": 1571, "train/sim_loss": 0.08984375 }, { "epoch": 0.1553292465888867, "step": 1571, "train/total_loss": 0.31191354990005493 }, { "entropy": 9.020883560180664, "epoch": 0.1554281194384022, "mean_token_accuracy": 0.7104121446609497, "num_tokens": 8180256.0, "step": 1572, "train/ce_loss": 0.5232083797454834 }, { "epoch": 0.1554281194384022, "step": 1572, "train/sim_loss": 0.046875 }, { "epoch": 0.1554281194384022, "step": 1572, "train/total_loss": 0.09919583797454834 }, { "entropy": 9.084095001220703, "epoch": 0.15552699228791775, "mean_token_accuracy": 0.8171926140785217, "num_tokens": 8185667.0, "step": 1573, "train/ce_loss": 0.41737455129623413 }, { "epoch": 0.15552699228791775, "step": 1573, "train/sim_loss": 0.03125 }, { "epoch": 0.15552699228791775, "step": 1573, "train/total_loss": 0.07298745214939117 }, { "entropy": 9.414447784423828, "epoch": 0.15562586513743326, "mean_token_accuracy": 0.7877551317214966, "num_tokens": 8190897.0, "step": 1574, "train/ce_loss": 0.43452367186546326 }, { "epoch": 0.15562586513743326, "step": 1574, "train/sim_loss": 0.0546875 }, { "epoch": 0.15562586513743326, "step": 1574, "train/total_loss": 0.09813986718654633 }, { "entropy": 9.340505599975586, "epoch": 0.15572473798694877, "mean_token_accuracy": 0.7110552787780762, "num_tokens": 8196141.0, "step": 1575, "train/ce_loss": 1.0214818716049194 }, { "epoch": 0.15572473798694877, "step": 1575, "train/sim_loss": 0.0390625 }, { "epoch": 0.15572473798694877, "step": 1575, "train/total_loss": 0.14121069014072418 }, { "entropy": 9.347097396850586, "epoch": 0.15582361083646432, "mean_token_accuracy": 0.7240506410598755, "num_tokens": 8201409.0, "step": 1576, "train/ce_loss": 0.595423698425293 }, { "epoch": 0.15582361083646432, "step": 1576, "train/sim_loss": 0.03515625 }, { "epoch": 0.15582361083646432, "step": 1576, "train/total_loss": 0.09469862282276154 }, { "entropy": 9.248504638671875, "epoch": 0.15592248368597983, "mean_token_accuracy": 0.7483588457107544, "num_tokens": 8206760.0, "step": 1577, "train/ce_loss": 0.6549820303916931 }, { "epoch": 0.15592248368597983, "step": 1577, "train/sim_loss": 0.0703125 }, { "epoch": 0.15592248368597983, "step": 1577, "train/total_loss": 0.1358107030391693 }, { "entropy": 9.31238079071045, "epoch": 0.15602135653549534, "mean_token_accuracy": 0.6877370476722717, "num_tokens": 8212045.0, "step": 1578, "train/ce_loss": 0.9893155097961426 }, { "epoch": 0.15602135653549534, "step": 1578, "train/sim_loss": 0.109375 }, { "epoch": 0.15602135653549534, "step": 1578, "train/total_loss": 0.20830655097961426 }, { "entropy": 9.541158676147461, "epoch": 0.15612022938501088, "mean_token_accuracy": 0.7480106353759766, "num_tokens": 8217402.0, "step": 1579, "train/ce_loss": 0.5257939100265503 }, { "epoch": 0.15612022938501088, "step": 1579, "train/sim_loss": 0.0859375 }, { "epoch": 0.15612022938501088, "step": 1579, "train/total_loss": 0.1385168880224228 }, { "epoch": 0.1562191022345264, "grad_norm": 0.8479238748550415, "learning_rate": 9.612075359738912e-06, "loss": 0.1645, "step": 1580 }, { "entropy": 8.832939147949219, "epoch": 0.1562191022345264, "mean_token_accuracy": 0.7540687322616577, "num_tokens": 8222972.0, "step": 1580, "train/ce_loss": 1.0235515832901 }, { "epoch": 0.1562191022345264, "step": 1580, "train/sim_loss": 0.1015625 }, { "epoch": 0.1562191022345264, "step": 1580, "train/total_loss": 0.20391765236854553 }, { "entropy": 9.819954872131348, "epoch": 0.1563179750840419, "mean_token_accuracy": 0.7840909361839294, "num_tokens": 8228118.0, "step": 1581, "train/ce_loss": 7.304859536816366e-06 }, { "epoch": 0.1563179750840419, "step": 1581, "train/sim_loss": 0.07421875 }, { "epoch": 0.1563179750840419, "step": 1581, "train/total_loss": 0.0742194801568985 }, { "entropy": 9.163996696472168, "epoch": 0.15641684793355745, "mean_token_accuracy": 0.7970521450042725, "num_tokens": 8233462.0, "step": 1582, "train/ce_loss": 1.022832020680653e-05 }, { "epoch": 0.15641684793355745, "step": 1582, "train/sim_loss": 0.0546875 }, { "epoch": 0.15641684793355745, "step": 1582, "train/total_loss": 0.05468852445483208 }, { "entropy": 9.698354721069336, "epoch": 0.15651572078307296, "mean_token_accuracy": 0.7314662337303162, "num_tokens": 8238506.0, "step": 1583, "train/ce_loss": 1.5570802133879624e-05 }, { "epoch": 0.15651572078307296, "step": 1583, "train/sim_loss": 0.05078125 }, { "epoch": 0.15651572078307296, "step": 1583, "train/total_loss": 0.05078280717134476 }, { "entropy": 9.24047565460205, "epoch": 0.15661459363258848, "mean_token_accuracy": 0.7287024855613708, "num_tokens": 8243759.0, "step": 1584, "train/ce_loss": 1.1748043298721313 }, { "epoch": 0.15661459363258848, "step": 1584, "train/sim_loss": 0.10546875 }, { "epoch": 0.15661459363258848, "step": 1584, "train/total_loss": 0.22294917702674866 }, { "entropy": 9.562520980834961, "epoch": 0.15671346648210402, "mean_token_accuracy": 0.7647058963775635, "num_tokens": 8248901.0, "step": 1585, "train/ce_loss": 0.8184059858322144 }, { "epoch": 0.15671346648210402, "step": 1585, "train/sim_loss": 0.12109375 }, { "epoch": 0.15671346648210402, "step": 1585, "train/total_loss": 0.2029343545436859 }, { "entropy": 9.282878875732422, "epoch": 0.15681233933161953, "mean_token_accuracy": 0.7263017296791077, "num_tokens": 8254067.0, "step": 1586, "train/ce_loss": 0.4955042600631714 }, { "epoch": 0.15681233933161953, "step": 1586, "train/sim_loss": 0.0859375 }, { "epoch": 0.15681233933161953, "step": 1586, "train/total_loss": 0.13548792898654938 }, { "entropy": 9.205556869506836, "epoch": 0.15691121218113507, "mean_token_accuracy": 0.7452085614204407, "num_tokens": 8259383.0, "step": 1587, "train/ce_loss": 1.2147839069366455 }, { "epoch": 0.15691121218113507, "step": 1587, "train/sim_loss": 0.03125 }, { "epoch": 0.15691121218113507, "step": 1587, "train/total_loss": 0.1527283936738968 }, { "entropy": 9.279157638549805, "epoch": 0.1570100850306506, "mean_token_accuracy": 0.7449275255203247, "num_tokens": 8264532.0, "step": 1588, "train/ce_loss": 0.5279999375343323 }, { "epoch": 0.1570100850306506, "step": 1588, "train/sim_loss": 0.07421875 }, { "epoch": 0.1570100850306506, "step": 1588, "train/total_loss": 0.1270187497138977 }, { "entropy": 9.94023609161377, "epoch": 0.1571089578801661, "mean_token_accuracy": 0.7112970948219299, "num_tokens": 8269447.0, "step": 1589, "train/ce_loss": 1.4001447198097594e-05 }, { "epoch": 0.1571089578801661, "step": 1589, "train/sim_loss": 0.03515625 }, { "epoch": 0.1571089578801661, "step": 1589, "train/total_loss": 0.03515765070915222 }, { "entropy": 9.349342346191406, "epoch": 0.15720783072968164, "mean_token_accuracy": 0.7328145503997803, "num_tokens": 8274737.0, "step": 1590, "train/ce_loss": 1.1686177253723145 }, { "epoch": 0.15720783072968164, "step": 1590, "train/sim_loss": 0.078125 }, { "epoch": 0.15720783072968164, "step": 1590, "train/total_loss": 0.19498677551746368 }, { "entropy": 9.330066680908203, "epoch": 0.15730670357919715, "mean_token_accuracy": 0.7281795740127563, "num_tokens": 8279977.0, "step": 1591, "train/ce_loss": 0.6450347304344177 }, { "epoch": 0.15730670357919715, "step": 1591, "train/sim_loss": 0.08984375 }, { "epoch": 0.15730670357919715, "step": 1591, "train/total_loss": 0.154347226023674 }, { "entropy": 9.549765586853027, "epoch": 0.15740557642871267, "mean_token_accuracy": 0.7203166484832764, "num_tokens": 8285156.0, "step": 1592, "train/ce_loss": 0.5984858274459839 }, { "epoch": 0.15740557642871267, "step": 1592, "train/sim_loss": 0.0546875 }, { "epoch": 0.15740557642871267, "step": 1592, "train/total_loss": 0.11453608423471451 }, { "entropy": 9.196601867675781, "epoch": 0.1575044492782282, "mean_token_accuracy": 0.8042269349098206, "num_tokens": 8290532.0, "step": 1593, "train/ce_loss": 0.6578105688095093 }, { "epoch": 0.1575044492782282, "step": 1593, "train/sim_loss": 0.10546875 }, { "epoch": 0.1575044492782282, "step": 1593, "train/total_loss": 0.17124980688095093 }, { "entropy": 8.923377990722656, "epoch": 0.15760332212774372, "mean_token_accuracy": 0.738231897354126, "num_tokens": 8295923.0, "step": 1594, "train/ce_loss": 0.9051287770271301 }, { "epoch": 0.15760332212774372, "step": 1594, "train/sim_loss": 0.0703125 }, { "epoch": 0.15760332212774372, "step": 1594, "train/total_loss": 0.16082537174224854 }, { "entropy": 10.551109313964844, "epoch": 0.15770219497725924, "mean_token_accuracy": 0.7696969509124756, "num_tokens": 8300451.0, "step": 1595, "train/ce_loss": 2.871335527743213e-05 }, { "epoch": 0.15770219497725924, "step": 1595, "train/sim_loss": 0.078125 }, { "epoch": 0.15770219497725924, "step": 1595, "train/total_loss": 0.07812786847352982 }, { "entropy": 9.211976051330566, "epoch": 0.15780106782677478, "mean_token_accuracy": 0.7874564528465271, "num_tokens": 8305782.0, "step": 1596, "train/ce_loss": 0.6386144757270813 }, { "epoch": 0.15780106782677478, "step": 1596, "train/sim_loss": 0.02734375 }, { "epoch": 0.15780106782677478, "step": 1596, "train/total_loss": 0.09120520204305649 }, { "entropy": 9.565742492675781, "epoch": 0.1578999406762903, "mean_token_accuracy": 0.7011308670043945, "num_tokens": 8311013.0, "step": 1597, "train/ce_loss": 1.6720161437988281 }, { "epoch": 0.1578999406762903, "step": 1597, "train/sim_loss": 0.08984375 }, { "epoch": 0.1578999406762903, "step": 1597, "train/total_loss": 0.2570453882217407 }, { "entropy": 8.96369743347168, "epoch": 0.1579988135258058, "mean_token_accuracy": 0.7186788320541382, "num_tokens": 8316396.0, "step": 1598, "train/ce_loss": 1.4170739650726318 }, { "epoch": 0.1579988135258058, "step": 1598, "train/sim_loss": 0.08203125 }, { "epoch": 0.1579988135258058, "step": 1598, "train/total_loss": 0.2237386554479599 }, { "entropy": 9.431184768676758, "epoch": 0.15809768637532134, "mean_token_accuracy": 0.6938202381134033, "num_tokens": 8321563.0, "step": 1599, "train/ce_loss": 0.6445886492729187 }, { "epoch": 0.15809768637532134, "step": 1599, "train/sim_loss": 0.08984375 }, { "epoch": 0.15809768637532134, "step": 1599, "train/total_loss": 0.15430262684822083 }, { "epoch": 0.15819655922483686, "grad_norm": 0.9533355832099915, "learning_rate": 9.607130494980962e-06, "loss": 0.1562, "step": 1600 }, { "entropy": 8.762441635131836, "epoch": 0.15819655922483686, "mean_token_accuracy": 0.700095534324646, "num_tokens": 8327160.0, "step": 1600, "train/ce_loss": 0.7110863327980042 }, { "epoch": 0.15819655922483686, "step": 1600, "train/sim_loss": 0.09375 }, { "epoch": 0.15819655922483686, "step": 1600, "train/total_loss": 0.1648586392402649 }, { "entropy": 9.795531272888184, "epoch": 0.15829543207435237, "mean_token_accuracy": 0.7343234419822693, "num_tokens": 8332145.0, "step": 1601, "train/ce_loss": 0.9776668548583984 }, { "epoch": 0.15829543207435237, "step": 1601, "train/sim_loss": 0.0703125 }, { "epoch": 0.15829543207435237, "step": 1601, "train/total_loss": 0.1680791974067688 }, { "entropy": 9.971131324768066, "epoch": 0.1583943049238679, "mean_token_accuracy": 0.7102272510528564, "num_tokens": 8337102.0, "step": 1602, "train/ce_loss": 8.836418601276819e-06 }, { "epoch": 0.1583943049238679, "step": 1602, "train/sim_loss": 0.07421875 }, { "epoch": 0.1583943049238679, "step": 1602, "train/total_loss": 0.07421963661909103 }, { "entropy": 9.375772476196289, "epoch": 0.15849317777338343, "mean_token_accuracy": 0.7432432174682617, "num_tokens": 8342371.0, "step": 1603, "train/ce_loss": 0.6301258206367493 }, { "epoch": 0.15849317777338343, "step": 1603, "train/sim_loss": 0.03125 }, { "epoch": 0.15849317777338343, "step": 1603, "train/total_loss": 0.09426258504390717 }, { "entropy": 9.804561614990234, "epoch": 0.15859205062289894, "mean_token_accuracy": 0.7110389471054077, "num_tokens": 8347419.0, "step": 1604, "train/ce_loss": 1.3388265371322632 }, { "epoch": 0.15859205062289894, "step": 1604, "train/sim_loss": 0.09375 }, { "epoch": 0.15859205062289894, "step": 1604, "train/total_loss": 0.22763265669345856 }, { "entropy": 9.409191131591797, "epoch": 0.15869092347241448, "mean_token_accuracy": 0.8163539171218872, "num_tokens": 8352638.0, "step": 1605, "train/ce_loss": 6.432980626414064e-06 }, { "epoch": 0.15869092347241448, "step": 1605, "train/sim_loss": 0.1015625 }, { "epoch": 0.15869092347241448, "step": 1605, "train/total_loss": 0.10156314074993134 }, { "entropy": 9.398286819458008, "epoch": 0.15878979632193, "mean_token_accuracy": 0.7496706247329712, "num_tokens": 8357890.0, "step": 1606, "train/ce_loss": 0.6533154845237732 }, { "epoch": 0.15878979632193, "step": 1606, "train/sim_loss": 0.1015625 }, { "epoch": 0.15878979632193, "step": 1606, "train/total_loss": 0.16689404845237732 }, { "entropy": 9.626225471496582, "epoch": 0.15888866917144553, "mean_token_accuracy": 0.7177914381027222, "num_tokens": 8362990.0, "step": 1607, "train/ce_loss": 0.9942994713783264 }, { "epoch": 0.15888866917144553, "step": 1607, "train/sim_loss": 0.06640625 }, { "epoch": 0.15888866917144553, "step": 1607, "train/total_loss": 0.16583620011806488 }, { "entropy": 9.111536026000977, "epoch": 0.15898754202096105, "mean_token_accuracy": 0.720652163028717, "num_tokens": 8368387.0, "step": 1608, "train/ce_loss": 0.5740724205970764 }, { "epoch": 0.15898754202096105, "step": 1608, "train/sim_loss": 0.03125 }, { "epoch": 0.15898754202096105, "step": 1608, "train/total_loss": 0.08865724503993988 }, { "entropy": 9.151995658874512, "epoch": 0.15908641487047656, "mean_token_accuracy": 0.7651006579399109, "num_tokens": 8373779.0, "step": 1609, "train/ce_loss": 0.744520366191864 }, { "epoch": 0.15908641487047656, "step": 1609, "train/sim_loss": 0.0703125 }, { "epoch": 0.15908641487047656, "step": 1609, "train/total_loss": 0.14476454257965088 }, { "entropy": 9.361624717712402, "epoch": 0.1591852877199921, "mean_token_accuracy": 0.7044673562049866, "num_tokens": 8379120.0, "step": 1610, "train/ce_loss": 0.8378784656524658 }, { "epoch": 0.1591852877199921, "step": 1610, "train/sim_loss": 0.0625 }, { "epoch": 0.1591852877199921, "step": 1610, "train/total_loss": 0.14628785848617554 }, { "entropy": 9.501016616821289, "epoch": 0.15928416056950762, "mean_token_accuracy": 0.6928374767303467, "num_tokens": 8384368.0, "step": 1611, "train/ce_loss": 1.7459763288497925 }, { "epoch": 0.15928416056950762, "step": 1611, "train/sim_loss": 0.0703125 }, { "epoch": 0.15928416056950762, "step": 1611, "train/total_loss": 0.2449101358652115 }, { "entropy": 9.5258207321167, "epoch": 0.15938303341902313, "mean_token_accuracy": 0.7316341996192932, "num_tokens": 8389514.0, "step": 1612, "train/ce_loss": 0.6329296231269836 }, { "epoch": 0.15938303341902313, "step": 1612, "train/sim_loss": 0.078125 }, { "epoch": 0.15938303341902313, "step": 1612, "train/total_loss": 0.1414179652929306 }, { "entropy": 9.500965118408203, "epoch": 0.15948190626853867, "mean_token_accuracy": 0.7055057883262634, "num_tokens": 8394754.0, "step": 1613, "train/ce_loss": 0.66053307056427 }, { "epoch": 0.15948190626853867, "step": 1613, "train/sim_loss": 0.05859375 }, { "epoch": 0.15948190626853867, "step": 1613, "train/total_loss": 0.12464705854654312 }, { "entropy": 9.78488540649414, "epoch": 0.15958077911805418, "mean_token_accuracy": 0.6838235259056091, "num_tokens": 8399725.0, "step": 1614, "train/ce_loss": 5.6860840231820475e-06 }, { "epoch": 0.15958077911805418, "step": 1614, "train/sim_loss": 0.0546875 }, { "epoch": 0.15958077911805418, "step": 1614, "train/total_loss": 0.054688069969415665 }, { "entropy": 9.601234436035156, "epoch": 0.1596796519675697, "mean_token_accuracy": 0.7455138564109802, "num_tokens": 8404824.0, "step": 1615, "train/ce_loss": 1.4592608213424683 }, { "epoch": 0.1596796519675697, "step": 1615, "train/sim_loss": 0.046875 }, { "epoch": 0.1596796519675697, "step": 1615, "train/total_loss": 0.1928010880947113 }, { "entropy": 9.11591911315918, "epoch": 0.15977852481708524, "mean_token_accuracy": 0.7567287683486938, "num_tokens": 8410279.0, "step": 1616, "train/ce_loss": 1.1196359395980835 }, { "epoch": 0.15977852481708524, "step": 1616, "train/sim_loss": 0.16796875 }, { "epoch": 0.15977852481708524, "step": 1616, "train/total_loss": 0.2799323499202728 }, { "entropy": 9.921876907348633, "epoch": 0.15987739766660075, "mean_token_accuracy": 0.7521968483924866, "num_tokens": 8415285.0, "step": 1617, "train/ce_loss": 0.8378710746765137 }, { "epoch": 0.15987739766660075, "step": 1617, "train/sim_loss": 0.05859375 }, { "epoch": 0.15987739766660075, "step": 1617, "train/total_loss": 0.14238086342811584 }, { "entropy": 9.652484893798828, "epoch": 0.15997627051611626, "mean_token_accuracy": 0.731054961681366, "num_tokens": 8420408.0, "step": 1618, "train/ce_loss": 1.6161900758743286 }, { "epoch": 0.15997627051611626, "step": 1618, "train/sim_loss": 0.078125 }, { "epoch": 0.15997627051611626, "step": 1618, "train/total_loss": 0.23974400758743286 }, { "entropy": 10.246427536010742, "epoch": 0.1600751433656318, "mean_token_accuracy": 0.7930174469947815, "num_tokens": 8425172.0, "step": 1619, "train/ce_loss": 1.2845847606658936 }, { "epoch": 0.1600751433656318, "step": 1619, "train/sim_loss": 0.02734375 }, { "epoch": 0.1600751433656318, "step": 1619, "train/total_loss": 0.15580223500728607 }, { "epoch": 0.16017401621514732, "grad_norm": 0.9123067259788513, "learning_rate": 9.602185630223014e-06, "loss": 0.1584, "step": 1620 }, { "entropy": 9.728799819946289, "epoch": 0.16017401621514732, "mean_token_accuracy": 0.7475728392601013, "num_tokens": 8430156.0, "step": 1620, "train/ce_loss": 1.1937299966812134 }, { "epoch": 0.16017401621514732, "step": 1620, "train/sim_loss": 0.04296875 }, { "epoch": 0.16017401621514732, "step": 1620, "train/total_loss": 0.16234174370765686 }, { "entropy": 8.924592971801758, "epoch": 0.16027288906466283, "mean_token_accuracy": 0.7698259353637695, "num_tokens": 8435650.0, "step": 1621, "train/ce_loss": 0.4493315517902374 }, { "epoch": 0.16027288906466283, "step": 1621, "train/sim_loss": 0.0390625 }, { "epoch": 0.16027288906466283, "step": 1621, "train/total_loss": 0.08399565517902374 }, { "entropy": 9.233839988708496, "epoch": 0.16037176191417837, "mean_token_accuracy": 0.7586206793785095, "num_tokens": 8440954.0, "step": 1622, "train/ce_loss": 0.7626397013664246 }, { "epoch": 0.16037176191417837, "step": 1622, "train/sim_loss": 0.12890625 }, { "epoch": 0.16037176191417837, "step": 1622, "train/total_loss": 0.20517021417617798 }, { "entropy": 8.83321762084961, "epoch": 0.1604706347636939, "mean_token_accuracy": 0.7466539144515991, "num_tokens": 8446556.0, "step": 1623, "train/ce_loss": 0.7785013318061829 }, { "epoch": 0.1604706347636939, "step": 1623, "train/sim_loss": 0.0546875 }, { "epoch": 0.1604706347636939, "step": 1623, "train/total_loss": 0.1325376331806183 }, { "entropy": 9.502970695495605, "epoch": 0.1605695076132094, "mean_token_accuracy": 0.7137452960014343, "num_tokens": 8451759.0, "step": 1624, "train/ce_loss": 5.685467840521596e-06 }, { "epoch": 0.1605695076132094, "step": 1624, "train/sim_loss": 0.03515625 }, { "epoch": 0.1605695076132094, "step": 1624, "train/total_loss": 0.035156819969415665 }, { "entropy": 9.826895713806152, "epoch": 0.16066838046272494, "mean_token_accuracy": 0.7473118305206299, "num_tokens": 8456739.0, "step": 1625, "train/ce_loss": 1.6536694765090942 }, { "epoch": 0.16066838046272494, "step": 1625, "train/sim_loss": 0.0859375 }, { "epoch": 0.16066838046272494, "step": 1625, "train/total_loss": 0.2513044476509094 }, { "entropy": 9.346691131591797, "epoch": 0.16076725331224045, "mean_token_accuracy": 0.6819338202476501, "num_tokens": 8461927.0, "step": 1626, "train/ce_loss": 1.2664635181427002 }, { "epoch": 0.16076725331224045, "step": 1626, "train/sim_loss": 0.12890625 }, { "epoch": 0.16076725331224045, "step": 1626, "train/total_loss": 0.25555258989334106 }, { "entropy": 9.530488014221191, "epoch": 0.160866126161756, "mean_token_accuracy": 0.7142857313156128, "num_tokens": 8467011.0, "step": 1627, "train/ce_loss": 1.0578060150146484 }, { "epoch": 0.160866126161756, "step": 1627, "train/sim_loss": 0.07421875 }, { "epoch": 0.160866126161756, "step": 1627, "train/total_loss": 0.17999935150146484 }, { "entropy": 8.888346672058105, "epoch": 0.1609649990112715, "mean_token_accuracy": 0.7515375018119812, "num_tokens": 8472313.0, "step": 1628, "train/ce_loss": 0.6371399164199829 }, { "epoch": 0.1609649990112715, "step": 1628, "train/sim_loss": 0.0859375 }, { "epoch": 0.1609649990112715, "step": 1628, "train/total_loss": 0.14965149760246277 }, { "entropy": 10.206025123596191, "epoch": 0.16106387186078702, "mean_token_accuracy": 0.7366071343421936, "num_tokens": 8477123.0, "step": 1629, "train/ce_loss": 2.246410846710205 }, { "epoch": 0.16106387186078702, "step": 1629, "train/sim_loss": 0.0859375 }, { "epoch": 0.16106387186078702, "step": 1629, "train/total_loss": 0.3105785846710205 }, { "entropy": 9.567769050598145, "epoch": 0.16116274471030256, "mean_token_accuracy": 0.6633093357086182, "num_tokens": 8482235.0, "step": 1630, "train/ce_loss": 1.9544039964675903 }, { "epoch": 0.16116274471030256, "step": 1630, "train/sim_loss": 0.15234375 }, { "epoch": 0.16116274471030256, "step": 1630, "train/total_loss": 0.347784161567688 }, { "entropy": 8.883554458618164, "epoch": 0.16126161755981808, "mean_token_accuracy": 0.750507116317749, "num_tokens": 8487778.0, "step": 1631, "train/ce_loss": 1.0531898736953735 }, { "epoch": 0.16126161755981808, "step": 1631, "train/sim_loss": 0.1171875 }, { "epoch": 0.16126161755981808, "step": 1631, "train/total_loss": 0.22250649333000183 }, { "entropy": 9.575586318969727, "epoch": 0.1613604904093336, "mean_token_accuracy": 0.7296072244644165, "num_tokens": 8492885.0, "step": 1632, "train/ce_loss": 0.48304834961891174 }, { "epoch": 0.1613604904093336, "step": 1632, "train/sim_loss": 0.08984375 }, { "epoch": 0.1613604904093336, "step": 1632, "train/total_loss": 0.13814859092235565 }, { "entropy": 9.59064769744873, "epoch": 0.16145936325884913, "mean_token_accuracy": 0.7426981925964355, "num_tokens": 8498052.0, "step": 1633, "train/ce_loss": 0.9048088192939758 }, { "epoch": 0.16145936325884913, "step": 1633, "train/sim_loss": 0.0859375 }, { "epoch": 0.16145936325884913, "step": 1633, "train/total_loss": 0.17641839385032654 }, { "entropy": 9.502883911132812, "epoch": 0.16155823610836464, "mean_token_accuracy": 0.8213802576065063, "num_tokens": 8503255.0, "step": 1634, "train/ce_loss": 0.4299742579460144 }, { "epoch": 0.16155823610836464, "step": 1634, "train/sim_loss": 0.03125 }, { "epoch": 0.16155823610836464, "step": 1634, "train/total_loss": 0.07424742728471756 }, { "entropy": 9.213842391967773, "epoch": 0.16165710895788016, "mean_token_accuracy": 0.730434775352478, "num_tokens": 8508630.0, "step": 1635, "train/ce_loss": 0.8116820454597473 }, { "epoch": 0.16165710895788016, "step": 1635, "train/sim_loss": 0.05859375 }, { "epoch": 0.16165710895788016, "step": 1635, "train/total_loss": 0.13976195454597473 }, { "entropy": 8.935359954833984, "epoch": 0.1617559818073957, "mean_token_accuracy": 0.7946336269378662, "num_tokens": 8514056.0, "step": 1636, "train/ce_loss": 0.49651533365249634 }, { "epoch": 0.1617559818073957, "step": 1636, "train/sim_loss": 0.06640625 }, { "epoch": 0.1617559818073957, "step": 1636, "train/total_loss": 0.11605778336524963 }, { "entropy": 9.745095252990723, "epoch": 0.1618548546569112, "mean_token_accuracy": 0.7855855822563171, "num_tokens": 8519048.0, "step": 1637, "train/ce_loss": 0.637229859828949 }, { "epoch": 0.1618548546569112, "step": 1637, "train/sim_loss": 0.046875 }, { "epoch": 0.1618548546569112, "step": 1637, "train/total_loss": 0.11059799045324326 }, { "entropy": 9.171215057373047, "epoch": 0.16195372750642673, "mean_token_accuracy": 0.7346465587615967, "num_tokens": 8524339.0, "step": 1638, "train/ce_loss": 0.8786928057670593 }, { "epoch": 0.16195372750642673, "step": 1638, "train/sim_loss": 0.03125 }, { "epoch": 0.16195372750642673, "step": 1638, "train/total_loss": 0.11911927908658981 }, { "entropy": 9.65593147277832, "epoch": 0.16205260035594227, "mean_token_accuracy": 0.7678855061531067, "num_tokens": 8529436.0, "step": 1639, "train/ce_loss": 0.5777314305305481 }, { "epoch": 0.16205260035594227, "step": 1639, "train/sim_loss": 0.046875 }, { "epoch": 0.16205260035594227, "step": 1639, "train/total_loss": 0.10464814305305481 }, { "epoch": 0.16215147320545778, "grad_norm": 1.0075432062149048, "learning_rate": 9.597240765465065e-06, "loss": 0.1601, "step": 1640 }, { "entropy": 9.951290130615234, "epoch": 0.16215147320545778, "mean_token_accuracy": 0.7233644723892212, "num_tokens": 8534411.0, "step": 1640, "train/ce_loss": 0.5973339676856995 }, { "epoch": 0.16215147320545778, "step": 1640, "train/sim_loss": 0.04296875 }, { "epoch": 0.16215147320545778, "step": 1640, "train/total_loss": 0.10270214825868607 }, { "entropy": 9.90272045135498, "epoch": 0.1622503460549733, "mean_token_accuracy": 0.7053942084312439, "num_tokens": 8539330.0, "step": 1641, "train/ce_loss": 8.89528018888086e-06 }, { "epoch": 0.1622503460549733, "step": 1641, "train/sim_loss": 0.0234375 }, { "epoch": 0.1622503460549733, "step": 1641, "train/total_loss": 0.023438390344381332 }, { "entropy": 9.613363265991211, "epoch": 0.16234921890448883, "mean_token_accuracy": 0.732064425945282, "num_tokens": 8544441.0, "step": 1642, "train/ce_loss": 6.30884505881113e-06 }, { "epoch": 0.16234921890448883, "step": 1642, "train/sim_loss": 0.08984375 }, { "epoch": 0.16234921890448883, "step": 1642, "train/total_loss": 0.08984438329935074 }, { "entropy": 9.551526069641113, "epoch": 0.16244809175400435, "mean_token_accuracy": 0.7136498689651489, "num_tokens": 8549611.0, "step": 1643, "train/ce_loss": 1.3973978757858276 }, { "epoch": 0.16244809175400435, "step": 1643, "train/sim_loss": 0.11328125 }, { "epoch": 0.16244809175400435, "step": 1643, "train/total_loss": 0.2530210614204407 }, { "entropy": 9.609851837158203, "epoch": 0.16254696460351986, "mean_token_accuracy": 0.7132667899131775, "num_tokens": 8554782.0, "step": 1644, "train/ce_loss": 0.5838186740875244 }, { "epoch": 0.16254696460351986, "step": 1644, "train/sim_loss": 0.08203125 }, { "epoch": 0.16254696460351986, "step": 1644, "train/total_loss": 0.14041312038898468 }, { "entropy": 9.33971118927002, "epoch": 0.1626458374530354, "mean_token_accuracy": 0.7272727489471436, "num_tokens": 8559990.0, "step": 1645, "train/ce_loss": 0.6293501257896423 }, { "epoch": 0.1626458374530354, "step": 1645, "train/sim_loss": 0.0859375 }, { "epoch": 0.1626458374530354, "step": 1645, "train/total_loss": 0.1488725244998932 }, { "entropy": 9.75534439086914, "epoch": 0.16274471030255092, "mean_token_accuracy": 0.7266436219215393, "num_tokens": 8564992.0, "step": 1646, "train/ce_loss": 6.342493634292623e-06 }, { "epoch": 0.16274471030255092, "step": 1646, "train/sim_loss": 0.07421875 }, { "epoch": 0.16274471030255092, "step": 1646, "train/total_loss": 0.07421938329935074 }, { "entropy": 9.056360244750977, "epoch": 0.16284358315206643, "mean_token_accuracy": 0.6840425729751587, "num_tokens": 8570449.0, "step": 1647, "train/ce_loss": 0.9854462146759033 }, { "epoch": 0.16284358315206643, "step": 1647, "train/sim_loss": 0.09765625 }, { "epoch": 0.16284358315206643, "step": 1647, "train/total_loss": 0.1962008774280548 }, { "entropy": 9.603727340698242, "epoch": 0.16294245600158197, "mean_token_accuracy": 0.7390071153640747, "num_tokens": 8575590.0, "step": 1648, "train/ce_loss": 1.2806551456451416 }, { "epoch": 0.16294245600158197, "step": 1648, "train/sim_loss": 0.078125 }, { "epoch": 0.16294245600158197, "step": 1648, "train/total_loss": 0.20619051158428192 }, { "entropy": 9.485952377319336, "epoch": 0.16304132885109748, "mean_token_accuracy": 0.6822840571403503, "num_tokens": 8580732.0, "step": 1649, "train/ce_loss": 1.824843406677246 }, { "epoch": 0.16304132885109748, "step": 1649, "train/sim_loss": 0.109375 }, { "epoch": 0.16304132885109748, "step": 1649, "train/total_loss": 0.29185932874679565 }, { "entropy": 9.710960388183594, "epoch": 0.16314020170061302, "mean_token_accuracy": 0.6900311708450317, "num_tokens": 8585824.0, "step": 1650, "train/ce_loss": 1.2377186976664234e-05 }, { "epoch": 0.16314020170061302, "step": 1650, "train/sim_loss": 0.046875 }, { "epoch": 0.16314020170061302, "step": 1650, "train/total_loss": 0.04687623679637909 }, { "entropy": 9.404132843017578, "epoch": 0.16323907455012854, "mean_token_accuracy": 0.7272727489471436, "num_tokens": 8591053.0, "step": 1651, "train/ce_loss": 1.1155272722244263 }, { "epoch": 0.16323907455012854, "step": 1651, "train/sim_loss": 0.0703125 }, { "epoch": 0.16323907455012854, "step": 1651, "train/total_loss": 0.18186523020267487 }, { "entropy": 9.23090648651123, "epoch": 0.16333794739964405, "mean_token_accuracy": 0.7109375, "num_tokens": 8596443.0, "step": 1652, "train/ce_loss": 0.6732193827629089 }, { "epoch": 0.16333794739964405, "step": 1652, "train/sim_loss": 0.046875 }, { "epoch": 0.16333794739964405, "step": 1652, "train/total_loss": 0.11419694125652313 }, { "entropy": 9.733444213867188, "epoch": 0.1634368202491596, "mean_token_accuracy": 0.6919431090354919, "num_tokens": 8601525.0, "step": 1653, "train/ce_loss": 1.4947330951690674 }, { "epoch": 0.1634368202491596, "step": 1653, "train/sim_loss": 0.1328125 }, { "epoch": 0.1634368202491596, "step": 1653, "train/total_loss": 0.28228580951690674 }, { "entropy": 9.211160659790039, "epoch": 0.1635356930986751, "mean_token_accuracy": 0.70138019323349, "num_tokens": 8606780.0, "step": 1654, "train/ce_loss": 0.9958420991897583 }, { "epoch": 0.1635356930986751, "step": 1654, "train/sim_loss": 0.08203125 }, { "epoch": 0.1635356930986751, "step": 1654, "train/total_loss": 0.18161547183990479 }, { "entropy": 9.393009185791016, "epoch": 0.16363456594819062, "mean_token_accuracy": 0.7454323768615723, "num_tokens": 8612040.0, "step": 1655, "train/ce_loss": 1.3290444612503052 }, { "epoch": 0.16363456594819062, "step": 1655, "train/sim_loss": 0.03515625 }, { "epoch": 0.16363456594819062, "step": 1655, "train/total_loss": 0.16806070506572723 }, { "entropy": 10.141335487365723, "epoch": 0.16373343879770616, "mean_token_accuracy": 0.7228915691375732, "num_tokens": 8616865.0, "step": 1656, "train/ce_loss": 2.1531684398651123 }, { "epoch": 0.16373343879770616, "step": 1656, "train/sim_loss": 0.0859375 }, { "epoch": 0.16373343879770616, "step": 1656, "train/total_loss": 0.3012543320655823 }, { "entropy": 9.644229888916016, "epoch": 0.16383231164722167, "mean_token_accuracy": 0.7107913494110107, "num_tokens": 8621975.0, "step": 1657, "train/ce_loss": 6.4753439801279455e-06 }, { "epoch": 0.16383231164722167, "step": 1657, "train/sim_loss": 0.07421875 }, { "epoch": 0.16383231164722167, "step": 1657, "train/total_loss": 0.07421939820051193 }, { "entropy": 9.311150550842285, "epoch": 0.1639311844967372, "mean_token_accuracy": 0.6637279391288757, "num_tokens": 8627271.0, "step": 1658, "train/ce_loss": 1.0205533504486084 }, { "epoch": 0.1639311844967372, "step": 1658, "train/sim_loss": 0.12109375 }, { "epoch": 0.1639311844967372, "step": 1658, "train/total_loss": 0.22314909100532532 }, { "entropy": 9.454523086547852, "epoch": 0.16403005734625273, "mean_token_accuracy": 0.720588207244873, "num_tokens": 8632532.0, "step": 1659, "train/ce_loss": 0.7508016228675842 }, { "epoch": 0.16403005734625273, "step": 1659, "train/sim_loss": 0.0703125 }, { "epoch": 0.16403005734625273, "step": 1659, "train/total_loss": 0.14539265632629395 }, { "epoch": 0.16412893019576824, "grad_norm": 0.9204435348510742, "learning_rate": 9.592295900707115e-06, "loss": 0.1754, "step": 1660 }, { "entropy": 9.35174560546875, "epoch": 0.16412893019576824, "mean_token_accuracy": 0.6658163070678711, "num_tokens": 8637783.0, "step": 1660, "train/ce_loss": 1.5418193340301514 }, { "epoch": 0.16412893019576824, "step": 1660, "train/sim_loss": 0.0625 }, { "epoch": 0.16412893019576824, "step": 1660, "train/total_loss": 0.21668194234371185 }, { "entropy": 8.99431037902832, "epoch": 0.16422780304528375, "mean_token_accuracy": 0.733195424079895, "num_tokens": 8643218.0, "step": 1661, "train/ce_loss": 1.2750509977340698 }, { "epoch": 0.16422780304528375, "step": 1661, "train/sim_loss": 0.1015625 }, { "epoch": 0.16422780304528375, "step": 1661, "train/total_loss": 0.2290676087141037 }, { "entropy": 9.160455703735352, "epoch": 0.1643266758947993, "mean_token_accuracy": 0.7895902395248413, "num_tokens": 8648639.0, "step": 1662, "train/ce_loss": 0.7102704644203186 }, { "epoch": 0.1643266758947993, "step": 1662, "train/sim_loss": 0.0625 }, { "epoch": 0.1643266758947993, "step": 1662, "train/total_loss": 0.13352704048156738 }, { "entropy": 9.707138061523438, "epoch": 0.1644255487443148, "mean_token_accuracy": 0.7761194109916687, "num_tokens": 8653687.0, "step": 1663, "train/ce_loss": 0.8248770833015442 }, { "epoch": 0.1644255487443148, "step": 1663, "train/sim_loss": 0.03515625 }, { "epoch": 0.1644255487443148, "step": 1663, "train/total_loss": 0.11764395982027054 }, { "entropy": 9.405868530273438, "epoch": 0.16452442159383032, "mean_token_accuracy": 0.7403726577758789, "num_tokens": 8658946.0, "step": 1664, "train/ce_loss": 0.4154486656188965 }, { "epoch": 0.16452442159383032, "step": 1664, "train/sim_loss": 0.02734375 }, { "epoch": 0.16452442159383032, "step": 1664, "train/total_loss": 0.06888861954212189 }, { "entropy": 9.920665740966797, "epoch": 0.16462329444334586, "mean_token_accuracy": 0.7779660820960999, "num_tokens": 8663950.0, "step": 1665, "train/ce_loss": 0.37819704413414 }, { "epoch": 0.16462329444334586, "step": 1665, "train/sim_loss": 0.05078125 }, { "epoch": 0.16462329444334586, "step": 1665, "train/total_loss": 0.08860095590353012 }, { "entropy": 9.274250030517578, "epoch": 0.16472216729286138, "mean_token_accuracy": 0.6619552373886108, "num_tokens": 8669240.0, "step": 1666, "train/ce_loss": 1.6018245220184326 }, { "epoch": 0.16472216729286138, "step": 1666, "train/sim_loss": 0.07421875 }, { "epoch": 0.16472216729286138, "step": 1666, "train/total_loss": 0.23440121114253998 }, { "entropy": 10.213384628295898, "epoch": 0.1648210401423769, "mean_token_accuracy": 0.6941580772399902, "num_tokens": 8673963.0, "step": 1667, "train/ce_loss": 0.00013563338143285364 }, { "epoch": 0.1648210401423769, "step": 1667, "train/sim_loss": 0.0625 }, { "epoch": 0.1648210401423769, "step": 1667, "train/total_loss": 0.0625135600566864 }, { "entropy": 9.858098983764648, "epoch": 0.16491991299189243, "mean_token_accuracy": 0.7063903212547302, "num_tokens": 8678967.0, "step": 1668, "train/ce_loss": 0.9753656387329102 }, { "epoch": 0.16491991299189243, "step": 1668, "train/sim_loss": 0.0625 }, { "epoch": 0.16491991299189243, "step": 1668, "train/total_loss": 0.16003656387329102 }, { "entropy": 9.863697052001953, "epoch": 0.16501878584140794, "mean_token_accuracy": 0.7641682028770447, "num_tokens": 8683950.0, "step": 1669, "train/ce_loss": 0.6916494965553284 }, { "epoch": 0.16501878584140794, "step": 1669, "train/sim_loss": 0.09765625 }, { "epoch": 0.16501878584140794, "step": 1669, "train/total_loss": 0.1668212115764618 }, { "entropy": 9.501869201660156, "epoch": 0.16511765869092349, "mean_token_accuracy": 0.7032679915428162, "num_tokens": 8689133.0, "step": 1670, "train/ce_loss": 0.8607410788536072 }, { "epoch": 0.16511765869092349, "step": 1670, "train/sim_loss": 0.06640625 }, { "epoch": 0.16511765869092349, "step": 1670, "train/total_loss": 0.1524803638458252 }, { "entropy": 9.70029067993164, "epoch": 0.165216531540439, "mean_token_accuracy": 0.7043847441673279, "num_tokens": 8694288.0, "step": 1671, "train/ce_loss": 4.156109298492083e-06 }, { "epoch": 0.165216531540439, "step": 1671, "train/sim_loss": 0.07421875 }, { "epoch": 0.165216531540439, "step": 1671, "train/total_loss": 0.07421916723251343 }, { "entropy": 9.59384536743164, "epoch": 0.1653154043899545, "mean_token_accuracy": 0.7766367197036743, "num_tokens": 8699651.0, "step": 1672, "train/ce_loss": 0.8478952050209045 }, { "epoch": 0.1653154043899545, "step": 1672, "train/sim_loss": 0.05859375 }, { "epoch": 0.1653154043899545, "step": 1672, "train/total_loss": 0.14338326454162598 }, { "entropy": 9.08733081817627, "epoch": 0.16541427723947005, "mean_token_accuracy": 0.7172414064407349, "num_tokens": 8704976.0, "step": 1673, "train/ce_loss": 1.1948295831680298 }, { "epoch": 0.16541427723947005, "step": 1673, "train/sim_loss": 0.0703125 }, { "epoch": 0.16541427723947005, "step": 1673, "train/total_loss": 0.18979546427726746 }, { "entropy": 9.397780418395996, "epoch": 0.16551315008898557, "mean_token_accuracy": 0.7293776869773865, "num_tokens": 8710158.0, "step": 1674, "train/ce_loss": 1.657582402229309 }, { "epoch": 0.16551315008898557, "step": 1674, "train/sim_loss": 0.10546875 }, { "epoch": 0.16551315008898557, "step": 1674, "train/total_loss": 0.27122700214385986 }, { "entropy": 8.993717193603516, "epoch": 0.16561202293850108, "mean_token_accuracy": 0.7628865838050842, "num_tokens": 8715600.0, "step": 1675, "train/ce_loss": 0.7835026979446411 }, { "epoch": 0.16561202293850108, "step": 1675, "train/sim_loss": 0.03125 }, { "epoch": 0.16561202293850108, "step": 1675, "train/total_loss": 0.10960026830434799 }, { "entropy": 9.839988708496094, "epoch": 0.16571089578801662, "mean_token_accuracy": 0.7283531427383423, "num_tokens": 8720639.0, "step": 1676, "train/ce_loss": 1.15177321434021 }, { "epoch": 0.16571089578801662, "step": 1676, "train/sim_loss": 0.109375 }, { "epoch": 0.16571089578801662, "step": 1676, "train/total_loss": 0.22455233335494995 }, { "entropy": 9.535063743591309, "epoch": 0.16580976863753213, "mean_token_accuracy": 0.7291960716247559, "num_tokens": 8725817.0, "step": 1677, "train/ce_loss": 0.6581219434738159 }, { "epoch": 0.16580976863753213, "step": 1677, "train/sim_loss": 0.03125 }, { "epoch": 0.16580976863753213, "step": 1677, "train/total_loss": 0.09706219285726547 }, { "entropy": 9.273845672607422, "epoch": 0.16590864148704765, "mean_token_accuracy": 0.7103694677352905, "num_tokens": 8731115.0, "step": 1678, "train/ce_loss": 0.6409934163093567 }, { "epoch": 0.16590864148704765, "step": 1678, "train/sim_loss": 0.1171875 }, { "epoch": 0.16590864148704765, "step": 1678, "train/total_loss": 0.18128684163093567 }, { "entropy": 9.664846420288086, "epoch": 0.1660075143365632, "mean_token_accuracy": 0.7144970297813416, "num_tokens": 8736445.0, "step": 1679, "train/ce_loss": 7.968543286551721e-06 }, { "epoch": 0.1660075143365632, "step": 1679, "train/sim_loss": 0.078125 }, { "epoch": 0.1660075143365632, "step": 1679, "train/total_loss": 0.07812579721212387 }, { "epoch": 0.1661063871860787, "grad_norm": 1.0694078207015991, "learning_rate": 9.587351035949168e-06, "loss": 0.1613, "step": 1680 }, { "entropy": 9.293163299560547, "epoch": 0.1661063871860787, "mean_token_accuracy": 0.6978609561920166, "num_tokens": 8741700.0, "step": 1680, "train/ce_loss": 0.6442439556121826 }, { "epoch": 0.1661063871860787, "step": 1680, "train/sim_loss": 0.0390625 }, { "epoch": 0.1661063871860787, "step": 1680, "train/total_loss": 0.10348689556121826 }, { "entropy": 9.476476669311523, "epoch": 0.16620526003559422, "mean_token_accuracy": 0.7588832378387451, "num_tokens": 8746903.0, "step": 1681, "train/ce_loss": 1.2075145244598389 }, { "epoch": 0.16620526003559422, "step": 1681, "train/sim_loss": 0.05859375 }, { "epoch": 0.16620526003559422, "step": 1681, "train/total_loss": 0.17934520542621613 }, { "entropy": 9.433666229248047, "epoch": 0.16630413288510976, "mean_token_accuracy": 0.748308539390564, "num_tokens": 8752118.0, "step": 1682, "train/ce_loss": 1.4417132139205933 }, { "epoch": 0.16630413288510976, "step": 1682, "train/sim_loss": 0.0859375 }, { "epoch": 0.16630413288510976, "step": 1682, "train/total_loss": 0.2301088273525238 }, { "entropy": 9.663443565368652, "epoch": 0.16640300573462527, "mean_token_accuracy": 0.7016248106956482, "num_tokens": 8757254.0, "step": 1683, "train/ce_loss": 9.290965863328893e-06 }, { "epoch": 0.16640300573462527, "step": 1683, "train/sim_loss": 0.08203125 }, { "epoch": 0.16640300573462527, "step": 1683, "train/total_loss": 0.08203218132257462 }, { "entropy": 9.118719100952148, "epoch": 0.16650187858414078, "mean_token_accuracy": 0.7209302186965942, "num_tokens": 8762648.0, "step": 1684, "train/ce_loss": 0.8776395916938782 }, { "epoch": 0.16650187858414078, "step": 1684, "train/sim_loss": 0.05078125 }, { "epoch": 0.16650187858414078, "step": 1684, "train/total_loss": 0.1385452151298523 }, { "entropy": 9.244359016418457, "epoch": 0.16660075143365632, "mean_token_accuracy": 0.7730496525764465, "num_tokens": 8767929.0, "step": 1685, "train/ce_loss": 0.8379502892494202 }, { "epoch": 0.16660075143365632, "step": 1685, "train/sim_loss": 0.05859375 }, { "epoch": 0.16660075143365632, "step": 1685, "train/total_loss": 0.14238879084587097 }, { "entropy": 9.807613372802734, "epoch": 0.16669962428317184, "mean_token_accuracy": 0.7221269011497498, "num_tokens": 8772948.0, "step": 1686, "train/ce_loss": 5.5832565521996e-06 }, { "epoch": 0.16669962428317184, "step": 1686, "train/sim_loss": 0.07421875 }, { "epoch": 0.16669962428317184, "step": 1686, "train/total_loss": 0.07421930879354477 }, { "entropy": 9.008600234985352, "epoch": 0.16679849713268735, "mean_token_accuracy": 0.7431102395057678, "num_tokens": 8778432.0, "step": 1687, "train/ce_loss": 0.8022541403770447 }, { "epoch": 0.16679849713268735, "step": 1687, "train/sim_loss": 0.03125 }, { "epoch": 0.16679849713268735, "step": 1687, "train/total_loss": 0.11147541552782059 }, { "entropy": 9.089427947998047, "epoch": 0.1668973699822029, "mean_token_accuracy": 0.7213114500045776, "num_tokens": 8783724.0, "step": 1688, "train/ce_loss": 0.683851420879364 }, { "epoch": 0.1668973699822029, "step": 1688, "train/sim_loss": 0.07421875 }, { "epoch": 0.1668973699822029, "step": 1688, "train/total_loss": 0.14260390400886536 }, { "entropy": 9.778931617736816, "epoch": 0.1669962428317184, "mean_token_accuracy": 0.7264957427978516, "num_tokens": 8788906.0, "step": 1689, "train/ce_loss": 1.454725742340088 }, { "epoch": 0.1669962428317184, "step": 1689, "train/sim_loss": 0.1015625 }, { "epoch": 0.1669962428317184, "step": 1689, "train/total_loss": 0.24703507125377655 }, { "entropy": 9.725312232971191, "epoch": 0.16709511568123395, "mean_token_accuracy": 0.7404129505157471, "num_tokens": 8794029.0, "step": 1690, "train/ce_loss": 1.193360686302185 }, { "epoch": 0.16709511568123395, "step": 1690, "train/sim_loss": 0.0703125 }, { "epoch": 0.16709511568123395, "step": 1690, "train/total_loss": 0.1896485686302185 }, { "entropy": 9.521604537963867, "epoch": 0.16719398853074946, "mean_token_accuracy": 0.7085714340209961, "num_tokens": 8799239.0, "step": 1691, "train/ce_loss": 4.235959295328939e-06 }, { "epoch": 0.16719398853074946, "step": 1691, "train/sim_loss": 0.05078125 }, { "epoch": 0.16719398853074946, "step": 1691, "train/total_loss": 0.050781674683094025 }, { "entropy": 9.517845153808594, "epoch": 0.16729286138026497, "mean_token_accuracy": 0.7022556662559509, "num_tokens": 8804320.0, "step": 1692, "train/ce_loss": 0.7737181782722473 }, { "epoch": 0.16729286138026497, "step": 1692, "train/sim_loss": 0.1328125 }, { "epoch": 0.16729286138026497, "step": 1692, "train/total_loss": 0.21018432080745697 }, { "entropy": 9.612174034118652, "epoch": 0.16739173422978051, "mean_token_accuracy": 0.7832061052322388, "num_tokens": 8809461.0, "step": 1693, "train/ce_loss": 4.751113465317758e-06 }, { "epoch": 0.16739173422978051, "step": 1693, "train/sim_loss": 0.1015625 }, { "epoch": 0.16739173422978051, "step": 1693, "train/total_loss": 0.1015629768371582 }, { "entropy": 9.11648178100586, "epoch": 0.16749060707929603, "mean_token_accuracy": 0.7461140155792236, "num_tokens": 8814839.0, "step": 1694, "train/ce_loss": 0.8760917782783508 }, { "epoch": 0.16749060707929603, "step": 1694, "train/sim_loss": 0.03515625 }, { "epoch": 0.16749060707929603, "step": 1694, "train/total_loss": 0.1227654293179512 }, { "entropy": 9.678754806518555, "epoch": 0.16758947992881154, "mean_token_accuracy": 0.6839762330055237, "num_tokens": 8819942.0, "step": 1695, "train/ce_loss": 1.5940691232681274 }, { "epoch": 0.16758947992881154, "step": 1695, "train/sim_loss": 0.078125 }, { "epoch": 0.16758947992881154, "step": 1695, "train/total_loss": 0.23753191530704498 }, { "entropy": 9.327474594116211, "epoch": 0.16768835277832708, "mean_token_accuracy": 0.725261926651001, "num_tokens": 8825243.0, "step": 1696, "train/ce_loss": 0.5194224119186401 }, { "epoch": 0.16768835277832708, "step": 1696, "train/sim_loss": 0.0625 }, { "epoch": 0.16768835277832708, "step": 1696, "train/total_loss": 0.11444224417209625 }, { "entropy": 9.477210998535156, "epoch": 0.1677872256278426, "mean_token_accuracy": 0.761255145072937, "num_tokens": 8830605.0, "step": 1697, "train/ce_loss": 0.9472794532775879 }, { "epoch": 0.1677872256278426, "step": 1697, "train/sim_loss": 0.0625 }, { "epoch": 0.1677872256278426, "step": 1697, "train/total_loss": 0.15722794830799103 }, { "entropy": 9.329252243041992, "epoch": 0.1678860984773581, "mean_token_accuracy": 0.7214111685752869, "num_tokens": 8835917.0, "step": 1698, "train/ce_loss": 0.852024495601654 }, { "epoch": 0.1678860984773581, "step": 1698, "train/sim_loss": 0.0859375 }, { "epoch": 0.1678860984773581, "step": 1698, "train/total_loss": 0.17113995552062988 }, { "entropy": 9.541632652282715, "epoch": 0.16798497132687365, "mean_token_accuracy": 0.781593382358551, "num_tokens": 8841115.0, "step": 1699, "train/ce_loss": 0.9113073945045471 }, { "epoch": 0.16798497132687365, "step": 1699, "train/sim_loss": 0.05078125 }, { "epoch": 0.16798497132687365, "step": 1699, "train/total_loss": 0.14191198348999023 }, { "epoch": 0.16808384417638916, "grad_norm": 0.8907069563865662, "learning_rate": 9.582406171191218e-06, "loss": 0.1606, "step": 1700 }, { "entropy": 9.1544189453125, "epoch": 0.16808384417638916, "mean_token_accuracy": 0.7335600852966309, "num_tokens": 8846482.0, "step": 1700, "train/ce_loss": 0.7185544967651367 }, { "epoch": 0.16808384417638916, "step": 1700, "train/sim_loss": 0.03515625 }, { "epoch": 0.16808384417638916, "step": 1700, "train/total_loss": 0.10701169818639755 }, { "entropy": 9.01425552368164, "epoch": 0.16818271702590468, "mean_token_accuracy": 0.6836434602737427, "num_tokens": 8851999.0, "step": 1701, "train/ce_loss": 0.7083297967910767 }, { "epoch": 0.16818271702590468, "step": 1701, "train/sim_loss": 0.0546875 }, { "epoch": 0.16818271702590468, "step": 1701, "train/total_loss": 0.1255204826593399 }, { "entropy": 9.019742965698242, "epoch": 0.16828158987542022, "mean_token_accuracy": 0.680232584476471, "num_tokens": 8857320.0, "step": 1702, "train/ce_loss": 0.9772090911865234 }, { "epoch": 0.16828158987542022, "step": 1702, "train/sim_loss": 0.09375 }, { "epoch": 0.16828158987542022, "step": 1702, "train/total_loss": 0.1914709210395813 }, { "entropy": 9.636129379272461, "epoch": 0.16838046272493573, "mean_token_accuracy": 0.7067238688468933, "num_tokens": 8862376.0, "step": 1703, "train/ce_loss": 9.42720907914918e-06 }, { "epoch": 0.16838046272493573, "step": 1703, "train/sim_loss": 0.06640625 }, { "epoch": 0.16838046272493573, "step": 1703, "train/total_loss": 0.06640719622373581 }, { "entropy": 10.004971504211426, "epoch": 0.16847933557445124, "mean_token_accuracy": 0.7386831045150757, "num_tokens": 8867303.0, "step": 1704, "train/ce_loss": 7.465568614861695e-06 }, { "epoch": 0.16847933557445124, "step": 1704, "train/sim_loss": 0.06640625 }, { "epoch": 0.16847933557445124, "step": 1704, "train/total_loss": 0.06640699505805969 }, { "entropy": 9.0205659866333, "epoch": 0.16857820842396679, "mean_token_accuracy": 0.7471042275428772, "num_tokens": 8872843.0, "step": 1705, "train/ce_loss": 0.6346682906150818 }, { "epoch": 0.16857820842396679, "step": 1705, "train/sim_loss": 0.09765625 }, { "epoch": 0.16857820842396679, "step": 1705, "train/total_loss": 0.16112308204174042 }, { "entropy": 9.970184326171875, "epoch": 0.1686770812734823, "mean_token_accuracy": 0.7250509262084961, "num_tokens": 8877748.0, "step": 1706, "train/ce_loss": 7.713006198173389e-06 }, { "epoch": 0.1686770812734823, "step": 1706, "train/sim_loss": 0.0703125 }, { "epoch": 0.1686770812734823, "step": 1706, "train/total_loss": 0.07031327486038208 }, { "entropy": 9.801458358764648, "epoch": 0.1687759541229978, "mean_token_accuracy": 0.7711039185523987, "num_tokens": 8882812.0, "step": 1707, "train/ce_loss": 1.2045811414718628 }, { "epoch": 0.1687759541229978, "step": 1707, "train/sim_loss": 0.1171875 }, { "epoch": 0.1687759541229978, "step": 1707, "train/total_loss": 0.23764562606811523 }, { "entropy": 10.350695610046387, "epoch": 0.16887482697251335, "mean_token_accuracy": 0.7046783566474915, "num_tokens": 8887543.0, "step": 1708, "train/ce_loss": 8.123223778966349e-06 }, { "epoch": 0.16887482697251335, "step": 1708, "train/sim_loss": 0.078125 }, { "epoch": 0.16887482697251335, "step": 1708, "train/total_loss": 0.07812581211328506 }, { "entropy": 9.536373138427734, "epoch": 0.16897369982202887, "mean_token_accuracy": 0.7261345982551575, "num_tokens": 8892672.0, "step": 1709, "train/ce_loss": 0.6556292772293091 }, { "epoch": 0.16897369982202887, "step": 1709, "train/sim_loss": 0.0625 }, { "epoch": 0.16897369982202887, "step": 1709, "train/total_loss": 0.12806293368339539 }, { "entropy": 9.55009651184082, "epoch": 0.1690725726715444, "mean_token_accuracy": 0.7368420958518982, "num_tokens": 8897775.0, "step": 1710, "train/ce_loss": 6.441311597882304e-06 }, { "epoch": 0.1690725726715444, "step": 1710, "train/sim_loss": 0.109375 }, { "epoch": 0.1690725726715444, "step": 1710, "train/total_loss": 0.10937564074993134 }, { "entropy": 10.291550636291504, "epoch": 0.16917144552105992, "mean_token_accuracy": 0.7945205569267273, "num_tokens": 8902499.0, "step": 1711, "train/ce_loss": 0.8966747522354126 }, { "epoch": 0.16917144552105992, "step": 1711, "train/sim_loss": 0.05859375 }, { "epoch": 0.16917144552105992, "step": 1711, "train/total_loss": 0.14826121926307678 }, { "entropy": 9.649927139282227, "epoch": 0.16927031837057543, "mean_token_accuracy": 0.7388632893562317, "num_tokens": 8907600.0, "step": 1712, "train/ce_loss": 1.3270708322525024 }, { "epoch": 0.16927031837057543, "step": 1712, "train/sim_loss": 0.08203125 }, { "epoch": 0.16927031837057543, "step": 1712, "train/total_loss": 0.21473833918571472 }, { "entropy": 9.199341773986816, "epoch": 0.16936919122009098, "mean_token_accuracy": 0.7558528184890747, "num_tokens": 8912983.0, "step": 1713, "train/ce_loss": 0.5694192051887512 }, { "epoch": 0.16936919122009098, "step": 1713, "train/sim_loss": 0.0703125 }, { "epoch": 0.16936919122009098, "step": 1713, "train/total_loss": 0.1272544264793396 }, { "entropy": 8.896648406982422, "epoch": 0.1694680640696065, "mean_token_accuracy": 0.7320930361747742, "num_tokens": 8918509.0, "step": 1714, "train/ce_loss": 0.6630674004554749 }, { "epoch": 0.1694680640696065, "step": 1714, "train/sim_loss": 0.078125 }, { "epoch": 0.1694680640696065, "step": 1714, "train/total_loss": 0.14443174004554749 }, { "entropy": 9.277908325195312, "epoch": 0.169566936919122, "mean_token_accuracy": 0.7257861495018005, "num_tokens": 8923778.0, "step": 1715, "train/ce_loss": 0.8705785870552063 }, { "epoch": 0.169566936919122, "step": 1715, "train/sim_loss": 0.04296875 }, { "epoch": 0.169566936919122, "step": 1715, "train/total_loss": 0.13002660870552063 }, { "entropy": 9.339280128479004, "epoch": 0.16966580976863754, "mean_token_accuracy": 0.7461340427398682, "num_tokens": 8928996.0, "step": 1716, "train/ce_loss": 0.7038314938545227 }, { "epoch": 0.16966580976863754, "step": 1716, "train/sim_loss": 0.0703125 }, { "epoch": 0.16966580976863754, "step": 1716, "train/total_loss": 0.14069566130638123 }, { "entropy": 9.661998748779297, "epoch": 0.16976468261815306, "mean_token_accuracy": 0.7586776614189148, "num_tokens": 8934047.0, "step": 1717, "train/ce_loss": 1.1090891361236572 }, { "epoch": 0.16976468261815306, "step": 1717, "train/sim_loss": 0.03125 }, { "epoch": 0.16976468261815306, "step": 1717, "train/total_loss": 0.14215892553329468 }, { "entropy": 9.382301330566406, "epoch": 0.16986355546766857, "mean_token_accuracy": 0.7525380849838257, "num_tokens": 8939288.0, "step": 1718, "train/ce_loss": 1.004603385925293 }, { "epoch": 0.16986355546766857, "step": 1718, "train/sim_loss": 0.08984375 }, { "epoch": 0.16986355546766857, "step": 1718, "train/total_loss": 0.19030410051345825 }, { "entropy": 9.96207046508789, "epoch": 0.1699624283171841, "mean_token_accuracy": 0.7532956600189209, "num_tokens": 8944267.0, "step": 1719, "train/ce_loss": 0.6833586692810059 }, { "epoch": 0.1699624283171841, "step": 1719, "train/sim_loss": 0.109375 }, { "epoch": 0.1699624283171841, "step": 1719, "train/total_loss": 0.1777108609676361 }, { "epoch": 0.17006130116669962, "grad_norm": 0.9704808592796326, "learning_rate": 9.57746130643327e-06, "loss": 0.1675, "step": 1720 }, { "entropy": 9.23289680480957, "epoch": 0.17006130116669962, "mean_token_accuracy": 0.7065088748931885, "num_tokens": 8949589.0, "step": 1720, "train/ce_loss": 0.6869282126426697 }, { "epoch": 0.17006130116669962, "step": 1720, "train/sim_loss": 0.06640625 }, { "epoch": 0.17006130116669962, "step": 1720, "train/total_loss": 0.13509908318519592 }, { "entropy": 8.945226669311523, "epoch": 0.17016017401621514, "mean_token_accuracy": 0.7189542651176453, "num_tokens": 8955139.0, "step": 1721, "train/ce_loss": 0.5507825016975403 }, { "epoch": 0.17016017401621514, "step": 1721, "train/sim_loss": 0.03515625 }, { "epoch": 0.17016017401621514, "step": 1721, "train/total_loss": 0.09023450314998627 }, { "entropy": 9.419705390930176, "epoch": 0.17025904686573068, "mean_token_accuracy": 0.7095046639442444, "num_tokens": 8960324.0, "step": 1722, "train/ce_loss": 0.48278287053108215 }, { "epoch": 0.17025904686573068, "step": 1722, "train/sim_loss": 0.06640625 }, { "epoch": 0.17025904686573068, "step": 1722, "train/total_loss": 0.11468453705310822 }, { "entropy": 9.383966445922852, "epoch": 0.1703579197152462, "mean_token_accuracy": 0.6917989253997803, "num_tokens": 8965505.0, "step": 1723, "train/ce_loss": 1.9925827980041504 }, { "epoch": 0.1703579197152462, "step": 1723, "train/sim_loss": 0.05859375 }, { "epoch": 0.1703579197152462, "step": 1723, "train/total_loss": 0.2578520178794861 }, { "entropy": 8.971576690673828, "epoch": 0.1704567925647617, "mean_token_accuracy": 0.7637362480163574, "num_tokens": 8970896.0, "step": 1724, "train/ce_loss": 0.7792263031005859 }, { "epoch": 0.1704567925647617, "step": 1724, "train/sim_loss": 0.07421875 }, { "epoch": 0.1704567925647617, "step": 1724, "train/total_loss": 0.15214139223098755 }, { "entropy": 9.419415473937988, "epoch": 0.17055566541427725, "mean_token_accuracy": 0.7274073958396912, "num_tokens": 8976002.0, "step": 1725, "train/ce_loss": 0.7176265716552734 }, { "epoch": 0.17055566541427725, "step": 1725, "train/sim_loss": 0.06640625 }, { "epoch": 0.17055566541427725, "step": 1725, "train/total_loss": 0.13816890120506287 }, { "entropy": 9.516657829284668, "epoch": 0.17065453826379276, "mean_token_accuracy": 0.714067280292511, "num_tokens": 8981144.0, "step": 1726, "train/ce_loss": 1.1410041224735323e-05 }, { "epoch": 0.17065453826379276, "step": 1726, "train/sim_loss": 0.05078125 }, { "epoch": 0.17065453826379276, "step": 1726, "train/total_loss": 0.05078238993883133 }, { "entropy": 9.524466514587402, "epoch": 0.17075341111330827, "mean_token_accuracy": 0.7178003191947937, "num_tokens": 8986258.0, "step": 1727, "train/ce_loss": 0.5584552884101868 }, { "epoch": 0.17075341111330827, "step": 1727, "train/sim_loss": 0.07421875 }, { "epoch": 0.17075341111330827, "step": 1727, "train/total_loss": 0.13006427884101868 }, { "entropy": 9.789735794067383, "epoch": 0.17085228396282381, "mean_token_accuracy": 0.7452667951583862, "num_tokens": 8991289.0, "step": 1728, "train/ce_loss": 1.0358099643781316e-05 }, { "epoch": 0.17085228396282381, "step": 1728, "train/sim_loss": 0.078125 }, { "epoch": 0.17085228396282381, "step": 1728, "train/total_loss": 0.07812603563070297 }, { "entropy": 9.533740997314453, "epoch": 0.17095115681233933, "mean_token_accuracy": 0.69532710313797, "num_tokens": 8996289.0, "step": 1729, "train/ce_loss": 1.4981647729873657 }, { "epoch": 0.17095115681233933, "step": 1729, "train/sim_loss": 0.125 }, { "epoch": 0.17095115681233933, "step": 1729, "train/total_loss": 0.27481648325920105 }, { "entropy": 8.76301383972168, "epoch": 0.17105002966185484, "mean_token_accuracy": 0.8076152205467224, "num_tokens": 9001811.0, "step": 1730, "train/ce_loss": 0.495195597410202 }, { "epoch": 0.17105002966185484, "step": 1730, "train/sim_loss": 0.03515625 }, { "epoch": 0.17105002966185484, "step": 1730, "train/total_loss": 0.08467581123113632 }, { "entropy": 9.538856506347656, "epoch": 0.17114890251137038, "mean_token_accuracy": 0.688693106174469, "num_tokens": 9006951.0, "step": 1731, "train/ce_loss": 1.0818617343902588 }, { "epoch": 0.17114890251137038, "step": 1731, "train/sim_loss": 0.07421875 }, { "epoch": 0.17114890251137038, "step": 1731, "train/total_loss": 0.18240493535995483 }, { "entropy": 9.177574157714844, "epoch": 0.1712477753608859, "mean_token_accuracy": 0.7191489338874817, "num_tokens": 9012359.0, "step": 1732, "train/ce_loss": 1.2056026458740234 }, { "epoch": 0.1712477753608859, "step": 1732, "train/sim_loss": 0.0859375 }, { "epoch": 0.1712477753608859, "step": 1732, "train/total_loss": 0.20649775862693787 }, { "entropy": 8.994552612304688, "epoch": 0.17134664821040144, "mean_token_accuracy": 0.7307692170143127, "num_tokens": 9017704.0, "step": 1733, "train/ce_loss": 1.107088327407837 }, { "epoch": 0.17134664821040144, "step": 1733, "train/sim_loss": 0.1015625 }, { "epoch": 0.17134664821040144, "step": 1733, "train/total_loss": 0.2122713327407837 }, { "entropy": 9.316804885864258, "epoch": 0.17144552105991695, "mean_token_accuracy": 0.8085365891456604, "num_tokens": 9022970.0, "step": 1734, "train/ce_loss": 0.4841051697731018 }, { "epoch": 0.17144552105991695, "step": 1734, "train/sim_loss": 0.03515625 }, { "epoch": 0.17144552105991695, "step": 1734, "train/total_loss": 0.08356676995754242 }, { "entropy": 9.439652442932129, "epoch": 0.17154439390943246, "mean_token_accuracy": 0.7350318431854248, "num_tokens": 9028160.0, "step": 1735, "train/ce_loss": 1.0343725681304932 }, { "epoch": 0.17154439390943246, "step": 1735, "train/sim_loss": 0.0546875 }, { "epoch": 0.17154439390943246, "step": 1735, "train/total_loss": 0.15812475979328156 }, { "entropy": 9.1663818359375, "epoch": 0.171643266758948, "mean_token_accuracy": 0.7737226486206055, "num_tokens": 9033519.0, "step": 1736, "train/ce_loss": 0.5285016894340515 }, { "epoch": 0.171643266758948, "step": 1736, "train/sim_loss": 0.0390625 }, { "epoch": 0.171643266758948, "step": 1736, "train/total_loss": 0.09191267192363739 }, { "entropy": 9.119280815124512, "epoch": 0.17174213960846352, "mean_token_accuracy": 0.7171609997749329, "num_tokens": 9038914.0, "step": 1737, "train/ce_loss": 0.5667960047721863 }, { "epoch": 0.17174213960846352, "step": 1737, "train/sim_loss": 0.078125 }, { "epoch": 0.17174213960846352, "step": 1737, "train/total_loss": 0.1348046064376831 }, { "entropy": 9.536650657653809, "epoch": 0.17184101245797903, "mean_token_accuracy": 0.7481805086135864, "num_tokens": 9044056.0, "step": 1738, "train/ce_loss": 0.6263763308525085 }, { "epoch": 0.17184101245797903, "step": 1738, "train/sim_loss": 0.07421875 }, { "epoch": 0.17184101245797903, "step": 1738, "train/total_loss": 0.13685637712478638 }, { "entropy": 9.257691383361816, "epoch": 0.17193988530749457, "mean_token_accuracy": 0.6525934934616089, "num_tokens": 9049393.0, "step": 1739, "train/ce_loss": 1.0220292806625366 }, { "epoch": 0.17193988530749457, "step": 1739, "train/sim_loss": 0.08984375 }, { "epoch": 0.17193988530749457, "step": 1739, "train/total_loss": 0.19204667210578918 }, { "epoch": 0.17203875815701009, "grad_norm": 1.1733081340789795, "learning_rate": 9.57251644167532e-06, "loss": 0.1643, "step": 1740 }, { "entropy": 9.278755187988281, "epoch": 0.17203875815701009, "mean_token_accuracy": 0.7256990671157837, "num_tokens": 9054661.0, "step": 1740, "train/ce_loss": 0.8057752251625061 }, { "epoch": 0.17203875815701009, "step": 1740, "train/sim_loss": 0.0703125 }, { "epoch": 0.17203875815701009, "step": 1740, "train/total_loss": 0.1508900225162506 }, { "entropy": 9.168914794921875, "epoch": 0.1721376310065256, "mean_token_accuracy": 0.6828729510307312, "num_tokens": 9059997.0, "step": 1741, "train/ce_loss": 0.3121006190776825 }, { "epoch": 0.1721376310065256, "step": 1741, "train/sim_loss": 0.0625 }, { "epoch": 0.1721376310065256, "step": 1741, "train/total_loss": 0.09371006488800049 }, { "entropy": 9.900165557861328, "epoch": 0.17223650385604114, "mean_token_accuracy": 0.7354166507720947, "num_tokens": 9064908.0, "step": 1742, "train/ce_loss": 0.9869362711906433 }, { "epoch": 0.17223650385604114, "step": 1742, "train/sim_loss": 0.07421875 }, { "epoch": 0.17223650385604114, "step": 1742, "train/total_loss": 0.1729123890399933 }, { "entropy": 9.285699844360352, "epoch": 0.17233537670555665, "mean_token_accuracy": 0.7460890412330627, "num_tokens": 9070163.0, "step": 1743, "train/ce_loss": 0.4222497344017029 }, { "epoch": 0.17233537670555665, "step": 1743, "train/sim_loss": 0.06640625 }, { "epoch": 0.17233537670555665, "step": 1743, "train/total_loss": 0.10863122344017029 }, { "entropy": 9.915960311889648, "epoch": 0.17243424955507217, "mean_token_accuracy": 0.7307001948356628, "num_tokens": 9075131.0, "step": 1744, "train/ce_loss": 1.5877418518066406 }, { "epoch": 0.17243424955507217, "step": 1744, "train/sim_loss": 0.07421875 }, { "epoch": 0.17243424955507217, "step": 1744, "train/total_loss": 0.23299293220043182 }, { "entropy": 8.995991706848145, "epoch": 0.1725331224045877, "mean_token_accuracy": 0.7079152464866638, "num_tokens": 9080499.0, "step": 1745, "train/ce_loss": 0.6653462052345276 }, { "epoch": 0.1725331224045877, "step": 1745, "train/sim_loss": 0.06640625 }, { "epoch": 0.1725331224045877, "step": 1745, "train/total_loss": 0.132940873503685 }, { "entropy": 9.508094787597656, "epoch": 0.17263199525410322, "mean_token_accuracy": 0.7595772743225098, "num_tokens": 9085891.0, "step": 1746, "train/ce_loss": 0.7341980338096619 }, { "epoch": 0.17263199525410322, "step": 1746, "train/sim_loss": 0.06640625 }, { "epoch": 0.17263199525410322, "step": 1746, "train/total_loss": 0.13982605934143066 }, { "entropy": 9.453865051269531, "epoch": 0.17273086810361873, "mean_token_accuracy": 0.7516005039215088, "num_tokens": 9091113.0, "step": 1747, "train/ce_loss": 0.8095971941947937 }, { "epoch": 0.17273086810361873, "step": 1747, "train/sim_loss": 0.1484375 }, { "epoch": 0.17273086810361873, "step": 1747, "train/total_loss": 0.2293972223997116 }, { "entropy": 9.227066040039062, "epoch": 0.17282974095313428, "mean_token_accuracy": 0.7286063432693481, "num_tokens": 9096424.0, "step": 1748, "train/ce_loss": 0.7434126734733582 }, { "epoch": 0.17282974095313428, "step": 1748, "train/sim_loss": 0.05078125 }, { "epoch": 0.17282974095313428, "step": 1748, "train/total_loss": 0.12512251734733582 }, { "entropy": 9.189695358276367, "epoch": 0.1729286138026498, "mean_token_accuracy": 0.7150837779045105, "num_tokens": 9101762.0, "step": 1749, "train/ce_loss": 0.9477271437644958 }, { "epoch": 0.1729286138026498, "step": 1749, "train/sim_loss": 0.10546875 }, { "epoch": 0.1729286138026498, "step": 1749, "train/total_loss": 0.20024147629737854 }, { "entropy": 8.743956565856934, "epoch": 0.1730274866521653, "mean_token_accuracy": 0.7459016442298889, "num_tokens": 9107245.0, "step": 1750, "train/ce_loss": 0.8299065828323364 }, { "epoch": 0.1730274866521653, "step": 1750, "train/sim_loss": 0.1015625 }, { "epoch": 0.1730274866521653, "step": 1750, "train/total_loss": 0.18455316126346588 }, { "entropy": 9.632036209106445, "epoch": 0.17312635950168084, "mean_token_accuracy": 0.7817638516426086, "num_tokens": 9112356.0, "step": 1751, "train/ce_loss": 1.2616857290267944 }, { "epoch": 0.17312635950168084, "step": 1751, "train/sim_loss": 0.0546875 }, { "epoch": 0.17312635950168084, "step": 1751, "train/total_loss": 0.18085607886314392 }, { "entropy": 9.808963775634766, "epoch": 0.17322523235119636, "mean_token_accuracy": 0.7113593816757202, "num_tokens": 9117345.0, "step": 1752, "train/ce_loss": 1.0092039108276367 }, { "epoch": 0.17322523235119636, "step": 1752, "train/sim_loss": 0.078125 }, { "epoch": 0.17322523235119636, "step": 1752, "train/total_loss": 0.1790453940629959 }, { "entropy": 9.167421340942383, "epoch": 0.1733241052007119, "mean_token_accuracy": 0.7189384698867798, "num_tokens": 9122644.0, "step": 1753, "train/ce_loss": 0.7508464455604553 }, { "epoch": 0.1733241052007119, "step": 1753, "train/sim_loss": 0.08203125 }, { "epoch": 0.1733241052007119, "step": 1753, "train/total_loss": 0.1571159064769745 }, { "entropy": 9.825424194335938, "epoch": 0.1734229780502274, "mean_token_accuracy": 0.7134831547737122, "num_tokens": 9127657.0, "step": 1754, "train/ce_loss": 0.8539113402366638 }, { "epoch": 0.1734229780502274, "step": 1754, "train/sim_loss": 0.11328125 }, { "epoch": 0.1734229780502274, "step": 1754, "train/total_loss": 0.19867238402366638 }, { "entropy": 9.104223251342773, "epoch": 0.17352185089974292, "mean_token_accuracy": 0.7300000190734863, "num_tokens": 9133087.0, "step": 1755, "train/ce_loss": 0.3209971487522125 }, { "epoch": 0.17352185089974292, "step": 1755, "train/sim_loss": 0.0859375 }, { "epoch": 0.17352185089974292, "step": 1755, "train/total_loss": 0.11803721636533737 }, { "entropy": 9.736286163330078, "epoch": 0.17362072374925847, "mean_token_accuracy": 0.7467741966247559, "num_tokens": 9138128.0, "step": 1756, "train/ce_loss": 1.6898686226340942e-05 }, { "epoch": 0.17362072374925847, "step": 1756, "train/sim_loss": 0.0703125 }, { "epoch": 0.17362072374925847, "step": 1756, "train/total_loss": 0.0703141912817955 }, { "entropy": 9.524857521057129, "epoch": 0.17371959659877398, "mean_token_accuracy": 0.7045840620994568, "num_tokens": 9143311.0, "step": 1757, "train/ce_loss": 1.2180140018463135 }, { "epoch": 0.17371959659877398, "step": 1757, "train/sim_loss": 0.08203125 }, { "epoch": 0.17371959659877398, "step": 1757, "train/total_loss": 0.20383265614509583 }, { "entropy": 9.574028968811035, "epoch": 0.1738184694482895, "mean_token_accuracy": 0.7369862794876099, "num_tokens": 9148469.0, "step": 1758, "train/ce_loss": 0.7238757610321045 }, { "epoch": 0.1738184694482895, "step": 1758, "train/sim_loss": 0.0859375 }, { "epoch": 0.1738184694482895, "step": 1758, "train/total_loss": 0.15832507610321045 }, { "entropy": 8.992545127868652, "epoch": 0.17391734229780503, "mean_token_accuracy": 0.7678571343421936, "num_tokens": 9153893.0, "step": 1759, "train/ce_loss": 0.360592246055603 }, { "epoch": 0.17391734229780503, "step": 1759, "train/sim_loss": 0.07421875 }, { "epoch": 0.17391734229780503, "step": 1759, "train/total_loss": 0.11027798056602478 }, { "epoch": 0.17401621514732055, "grad_norm": 0.8445634245872498, "learning_rate": 9.567571576917371e-06, "loss": 0.1678, "step": 1760 }, { "entropy": 9.652566909790039, "epoch": 0.17401621514732055, "mean_token_accuracy": 0.7388059496879578, "num_tokens": 9158991.0, "step": 1760, "train/ce_loss": 1.5193932056427002 }, { "epoch": 0.17401621514732055, "step": 1760, "train/sim_loss": 0.06640625 }, { "epoch": 0.17401621514732055, "step": 1760, "train/total_loss": 0.21834556758403778 }, { "entropy": 9.348615646362305, "epoch": 0.17411508799683606, "mean_token_accuracy": 0.7608982920646667, "num_tokens": 9164282.0, "step": 1761, "train/ce_loss": 0.7403160333633423 }, { "epoch": 0.17411508799683606, "step": 1761, "train/sim_loss": 0.06640625 }, { "epoch": 0.17411508799683606, "step": 1761, "train/total_loss": 0.14043785631656647 }, { "entropy": 9.238094329833984, "epoch": 0.1742139608463516, "mean_token_accuracy": 0.7394859790802002, "num_tokens": 9169601.0, "step": 1762, "train/ce_loss": 0.3605496287345886 }, { "epoch": 0.1742139608463516, "step": 1762, "train/sim_loss": 0.078125 }, { "epoch": 0.1742139608463516, "step": 1762, "train/total_loss": 0.11417996883392334 }, { "entropy": 9.508635520935059, "epoch": 0.17431283369586711, "mean_token_accuracy": 0.7448559403419495, "num_tokens": 9174958.0, "step": 1763, "train/ce_loss": 1.4740917682647705 }, { "epoch": 0.17431283369586711, "step": 1763, "train/sim_loss": 0.08203125 }, { "epoch": 0.17431283369586711, "step": 1763, "train/total_loss": 0.22944043576717377 }, { "entropy": 10.01222038269043, "epoch": 0.17441170654538263, "mean_token_accuracy": 0.7267080545425415, "num_tokens": 9179827.0, "step": 1764, "train/ce_loss": 1.3136488632881083e-05 }, { "epoch": 0.17441170654538263, "step": 1764, "train/sim_loss": 0.0234375 }, { "epoch": 0.17441170654538263, "step": 1764, "train/total_loss": 0.023438813164830208 }, { "entropy": 9.579922676086426, "epoch": 0.17451057939489817, "mean_token_accuracy": 0.7388167381286621, "num_tokens": 9184970.0, "step": 1765, "train/ce_loss": 1.531071424484253 }, { "epoch": 0.17451057939489817, "step": 1765, "train/sim_loss": 0.08984375 }, { "epoch": 0.17451057939489817, "step": 1765, "train/total_loss": 0.242950901389122 }, { "entropy": 9.957202911376953, "epoch": 0.17460945224441368, "mean_token_accuracy": 0.853210985660553, "num_tokens": 9189851.0, "step": 1766, "train/ce_loss": 1.0298511981964111 }, { "epoch": 0.17460945224441368, "step": 1766, "train/sim_loss": 0.046875 }, { "epoch": 0.17460945224441368, "step": 1766, "train/total_loss": 0.14986011385917664 }, { "entropy": 9.486515045166016, "epoch": 0.1747083250939292, "mean_token_accuracy": 0.728205144405365, "num_tokens": 9194877.0, "step": 1767, "train/ce_loss": 1.5970531702041626 }, { "epoch": 0.1747083250939292, "step": 1767, "train/sim_loss": 0.2578125 }, { "epoch": 0.1747083250939292, "step": 1767, "train/total_loss": 0.41751784086227417 }, { "entropy": 10.033461570739746, "epoch": 0.17480719794344474, "mean_token_accuracy": 0.7226890921592712, "num_tokens": 9199768.0, "step": 1768, "train/ce_loss": 6.694883632007986e-05 }, { "epoch": 0.17480719794344474, "step": 1768, "train/sim_loss": 0.0390625 }, { "epoch": 0.17480719794344474, "step": 1768, "train/total_loss": 0.039069194346666336 }, { "entropy": 9.921037673950195, "epoch": 0.17490607079296025, "mean_token_accuracy": 0.7867803573608398, "num_tokens": 9204667.0, "step": 1769, "train/ce_loss": 1.650039792060852 }, { "epoch": 0.17490607079296025, "step": 1769, "train/sim_loss": 0.078125 }, { "epoch": 0.17490607079296025, "step": 1769, "train/total_loss": 0.24312898516654968 }, { "entropy": 9.630839347839355, "epoch": 0.17500494364247576, "mean_token_accuracy": 0.7410423159599304, "num_tokens": 9209727.0, "step": 1770, "train/ce_loss": 0.8447070717811584 }, { "epoch": 0.17500494364247576, "step": 1770, "train/sim_loss": 0.0703125 }, { "epoch": 0.17500494364247576, "step": 1770, "train/total_loss": 0.1547832190990448 }, { "entropy": 9.445236206054688, "epoch": 0.1751038164919913, "mean_token_accuracy": 0.7431694269180298, "num_tokens": 9214955.0, "step": 1771, "train/ce_loss": 0.4419805109500885 }, { "epoch": 0.1751038164919913, "step": 1771, "train/sim_loss": 0.1171875 }, { "epoch": 0.1751038164919913, "step": 1771, "train/total_loss": 0.16138555109500885 }, { "entropy": 9.136075019836426, "epoch": 0.17520268934150682, "mean_token_accuracy": 0.6778378486633301, "num_tokens": 9220329.0, "step": 1772, "train/ce_loss": 0.6905080080032349 }, { "epoch": 0.17520268934150682, "step": 1772, "train/sim_loss": 0.05859375 }, { "epoch": 0.17520268934150682, "step": 1772, "train/total_loss": 0.12764455378055573 }, { "entropy": 9.195317268371582, "epoch": 0.17530156219102236, "mean_token_accuracy": 0.7616707682609558, "num_tokens": 9225623.0, "step": 1773, "train/ce_loss": 1.0541272163391113 }, { "epoch": 0.17530156219102236, "step": 1773, "train/sim_loss": 0.09765625 }, { "epoch": 0.17530156219102236, "step": 1773, "train/total_loss": 0.20306897163391113 }, { "entropy": 9.68917179107666, "epoch": 0.17540043504053787, "mean_token_accuracy": 0.7045454382896423, "num_tokens": 9230712.0, "step": 1774, "train/ce_loss": 1.472992181777954 }, { "epoch": 0.17540043504053787, "step": 1774, "train/sim_loss": 0.08984375 }, { "epoch": 0.17540043504053787, "step": 1774, "train/total_loss": 0.23714296519756317 }, { "entropy": 9.076818466186523, "epoch": 0.17549930789005339, "mean_token_accuracy": 0.6739811897277832, "num_tokens": 9236187.0, "step": 1775, "train/ce_loss": 0.6155195832252502 }, { "epoch": 0.17549930789005339, "step": 1775, "train/sim_loss": 0.09765625 }, { "epoch": 0.17549930789005339, "step": 1775, "train/total_loss": 0.15920820832252502 }, { "entropy": 9.201278686523438, "epoch": 0.17559818073956893, "mean_token_accuracy": 0.7463414669036865, "num_tokens": 9241467.0, "step": 1776, "train/ce_loss": 0.9912561178207397 }, { "epoch": 0.17559818073956893, "step": 1776, "train/sim_loss": 0.07421875 }, { "epoch": 0.17559818073956893, "step": 1776, "train/total_loss": 0.17334437370300293 }, { "entropy": 9.124858856201172, "epoch": 0.17569705358908444, "mean_token_accuracy": 0.682692289352417, "num_tokens": 9246847.0, "step": 1777, "train/ce_loss": 0.9765233993530273 }, { "epoch": 0.17569705358908444, "step": 1777, "train/sim_loss": 0.05078125 }, { "epoch": 0.17569705358908444, "step": 1777, "train/total_loss": 0.1484335958957672 }, { "entropy": 9.057942390441895, "epoch": 0.17579592643859995, "mean_token_accuracy": 0.7431289553642273, "num_tokens": 9252263.0, "step": 1778, "train/ce_loss": 0.8119320869445801 }, { "epoch": 0.17579592643859995, "step": 1778, "train/sim_loss": 0.11328125 }, { "epoch": 0.17579592643859995, "step": 1778, "train/total_loss": 0.194474458694458 }, { "entropy": 9.562312126159668, "epoch": 0.1758947992881155, "mean_token_accuracy": 0.818320631980896, "num_tokens": 9257360.0, "step": 1779, "train/ce_loss": 5.361784587876173e-06 }, { "epoch": 0.1758947992881155, "step": 1779, "train/sim_loss": 0.03125 }, { "epoch": 0.1758947992881155, "step": 1779, "train/total_loss": 0.03125053644180298 }, { "epoch": 0.175993672137631, "grad_norm": 0.7717829942703247, "learning_rate": 9.562626712159424e-06, "loss": 0.1539, "step": 1780 }, { "entropy": 9.121821403503418, "epoch": 0.175993672137631, "mean_token_accuracy": 0.7740740776062012, "num_tokens": 9262637.0, "step": 1780, "train/ce_loss": 0.759074330329895 }, { "epoch": 0.175993672137631, "step": 1780, "train/sim_loss": 0.12109375 }, { "epoch": 0.175993672137631, "step": 1780, "train/total_loss": 0.19700118899345398 }, { "entropy": 9.320338249206543, "epoch": 0.17609254498714652, "mean_token_accuracy": 0.744535505771637, "num_tokens": 9267864.0, "step": 1781, "train/ce_loss": 0.7638721466064453 }, { "epoch": 0.17609254498714652, "step": 1781, "train/sim_loss": 0.078125 }, { "epoch": 0.17609254498714652, "step": 1781, "train/total_loss": 0.1545122265815735 }, { "entropy": 8.921955108642578, "epoch": 0.17619141783666206, "mean_token_accuracy": 0.7149321436882019, "num_tokens": 9273168.0, "step": 1782, "train/ce_loss": 1.2899847030639648 }, { "epoch": 0.17619141783666206, "step": 1782, "train/sim_loss": 0.05859375 }, { "epoch": 0.17619141783666206, "step": 1782, "train/total_loss": 0.18759222328662872 }, { "entropy": 9.738253593444824, "epoch": 0.17629029068617758, "mean_token_accuracy": 0.7022653818130493, "num_tokens": 9278273.0, "step": 1783, "train/ce_loss": 1.3247605562210083 }, { "epoch": 0.17629029068617758, "step": 1783, "train/sim_loss": 0.109375 }, { "epoch": 0.17629029068617758, "step": 1783, "train/total_loss": 0.2418510615825653 }, { "entropy": 9.320428848266602, "epoch": 0.1763891635356931, "mean_token_accuracy": 0.734375, "num_tokens": 9283410.0, "step": 1784, "train/ce_loss": 1.5505481958389282 }, { "epoch": 0.1763891635356931, "step": 1784, "train/sim_loss": 0.08203125 }, { "epoch": 0.1763891635356931, "step": 1784, "train/total_loss": 0.23708607256412506 }, { "entropy": 8.998424530029297, "epoch": 0.17648803638520863, "mean_token_accuracy": 0.7615965604782104, "num_tokens": 9288826.0, "step": 1785, "train/ce_loss": 0.8054037094116211 }, { "epoch": 0.17648803638520863, "step": 1785, "train/sim_loss": 0.109375 }, { "epoch": 0.17648803638520863, "step": 1785, "train/total_loss": 0.18991537392139435 }, { "entropy": 9.724132537841797, "epoch": 0.17658690923472414, "mean_token_accuracy": 0.7370242476463318, "num_tokens": 9293868.0, "step": 1786, "train/ce_loss": 0.8708922863006592 }, { "epoch": 0.17658690923472414, "step": 1786, "train/sim_loss": 0.06640625 }, { "epoch": 0.17658690923472414, "step": 1786, "train/total_loss": 0.15349549055099487 }, { "entropy": 9.522594451904297, "epoch": 0.17668578208423966, "mean_token_accuracy": 0.7403225898742676, "num_tokens": 9298944.0, "step": 1787, "train/ce_loss": 1.2386717796325684 }, { "epoch": 0.17668578208423966, "step": 1787, "train/sim_loss": 0.0859375 }, { "epoch": 0.17668578208423966, "step": 1787, "train/total_loss": 0.2098046839237213 }, { "entropy": 10.408124923706055, "epoch": 0.1767846549337552, "mean_token_accuracy": 0.7104377150535583, "num_tokens": 9303661.0, "step": 1788, "train/ce_loss": 6.664123793598264e-05 }, { "epoch": 0.1767846549337552, "step": 1788, "train/sim_loss": 0.046875 }, { "epoch": 0.1767846549337552, "step": 1788, "train/total_loss": 0.04688166454434395 }, { "entropy": 9.13021183013916, "epoch": 0.1768835277832707, "mean_token_accuracy": 0.730526328086853, "num_tokens": 9309093.0, "step": 1789, "train/ce_loss": 0.8388895988464355 }, { "epoch": 0.1768835277832707, "step": 1789, "train/sim_loss": 0.05859375 }, { "epoch": 0.1768835277832707, "step": 1789, "train/total_loss": 0.1424827128648758 }, { "entropy": 9.56302261352539, "epoch": 0.17698240063278622, "mean_token_accuracy": 0.6944444179534912, "num_tokens": 9314152.0, "step": 1790, "train/ce_loss": 1.0453897714614868 }, { "epoch": 0.17698240063278622, "step": 1790, "train/sim_loss": 0.078125 }, { "epoch": 0.17698240063278622, "step": 1790, "train/total_loss": 0.18266397714614868 }, { "entropy": 9.247736930847168, "epoch": 0.17708127348230177, "mean_token_accuracy": 0.788557231426239, "num_tokens": 9319444.0, "step": 1791, "train/ce_loss": 0.49960947036743164 }, { "epoch": 0.17708127348230177, "step": 1791, "train/sim_loss": 0.09765625 }, { "epoch": 0.17708127348230177, "step": 1791, "train/total_loss": 0.1476171910762787 }, { "entropy": 9.873125076293945, "epoch": 0.17718014633181728, "mean_token_accuracy": 0.7311608791351318, "num_tokens": 9324346.0, "step": 1792, "train/ce_loss": 2.2994272708892822 }, { "epoch": 0.17718014633181728, "step": 1792, "train/sim_loss": 0.0625 }, { "epoch": 0.17718014633181728, "step": 1792, "train/total_loss": 0.2924427390098572 }, { "entropy": 9.160506248474121, "epoch": 0.17727901918133282, "mean_token_accuracy": 0.7244898080825806, "num_tokens": 9329680.0, "step": 1793, "train/ce_loss": 1.0658172369003296 }, { "epoch": 0.17727901918133282, "step": 1793, "train/sim_loss": 0.08984375 }, { "epoch": 0.17727901918133282, "step": 1793, "train/total_loss": 0.19642546772956848 }, { "entropy": 9.280560493469238, "epoch": 0.17737789203084833, "mean_token_accuracy": 0.7568270564079285, "num_tokens": 9334912.0, "step": 1794, "train/ce_loss": 1.158437728881836 }, { "epoch": 0.17737789203084833, "step": 1794, "train/sim_loss": 0.0703125 }, { "epoch": 0.17737789203084833, "step": 1794, "train/total_loss": 0.1861562728881836 }, { "entropy": 10.03182601928711, "epoch": 0.17747676488036385, "mean_token_accuracy": 0.7526881694793701, "num_tokens": 9339782.0, "step": 1795, "train/ce_loss": 2.1850147247314453 }, { "epoch": 0.17747676488036385, "step": 1795, "train/sim_loss": 0.0859375 }, { "epoch": 0.17747676488036385, "step": 1795, "train/total_loss": 0.304438978433609 }, { "entropy": 8.88985824584961, "epoch": 0.1775756377298794, "mean_token_accuracy": 0.7665244936943054, "num_tokens": 9345184.0, "step": 1796, "train/ce_loss": 0.5084155797958374 }, { "epoch": 0.1775756377298794, "step": 1796, "train/sim_loss": 0.06640625 }, { "epoch": 0.1775756377298794, "step": 1796, "train/total_loss": 0.1172478049993515 }, { "entropy": 8.852987289428711, "epoch": 0.1776745105793949, "mean_token_accuracy": 0.7342026233673096, "num_tokens": 9350669.0, "step": 1797, "train/ce_loss": 0.6396374106407166 }, { "epoch": 0.1776745105793949, "step": 1797, "train/sim_loss": 0.08984375 }, { "epoch": 0.1776745105793949, "step": 1797, "train/total_loss": 0.15380749106407166 }, { "entropy": 9.820240020751953, "epoch": 0.17777338342891041, "mean_token_accuracy": 0.765072762966156, "num_tokens": 9355647.0, "step": 1798, "train/ce_loss": 1.4581230878829956 }, { "epoch": 0.17777338342891041, "step": 1798, "train/sim_loss": 0.0703125 }, { "epoch": 0.17777338342891041, "step": 1798, "train/total_loss": 0.21612481772899628 }, { "entropy": 9.143946647644043, "epoch": 0.17787225627842596, "mean_token_accuracy": 0.6997518539428711, "num_tokens": 9360961.0, "step": 1799, "train/ce_loss": 1.3224116563796997 }, { "epoch": 0.17787225627842596, "step": 1799, "train/sim_loss": 0.0625 }, { "epoch": 0.17787225627842596, "step": 1799, "train/total_loss": 0.1947411745786667 }, { "epoch": 0.17797112912794147, "grad_norm": 1.0880522727966309, "learning_rate": 9.557681847401474e-06, "loss": 0.1681, "step": 1800 }, { "entropy": 9.875439643859863, "epoch": 0.17797112912794147, "mean_token_accuracy": 0.7449209690093994, "num_tokens": 9365872.0, "step": 1800, "train/ce_loss": 0.9147996306419373 }, { "epoch": 0.17797112912794147, "step": 1800, "train/sim_loss": 0.0546875 }, { "epoch": 0.17797112912794147, "step": 1800, "train/total_loss": 0.14616745710372925 }, { "entropy": 9.731260299682617, "epoch": 0.17807000197745698, "mean_token_accuracy": 0.7075098752975464, "num_tokens": 9370789.0, "step": 1801, "train/ce_loss": 1.002318024635315 }, { "epoch": 0.17807000197745698, "step": 1801, "train/sim_loss": 0.06640625 }, { "epoch": 0.17807000197745698, "step": 1801, "train/total_loss": 0.16663804650306702 }, { "entropy": 9.157102584838867, "epoch": 0.17816887482697252, "mean_token_accuracy": 0.7615384459495544, "num_tokens": 9376296.0, "step": 1802, "train/ce_loss": 0.8391835689544678 }, { "epoch": 0.17816887482697252, "step": 1802, "train/sim_loss": 0.06640625 }, { "epoch": 0.17816887482697252, "step": 1802, "train/total_loss": 0.15032461285591125 }, { "entropy": 9.377922058105469, "epoch": 0.17826774767648804, "mean_token_accuracy": 0.7012278437614441, "num_tokens": 9381493.0, "step": 1803, "train/ce_loss": 0.7130151987075806 }, { "epoch": 0.17826774767648804, "step": 1803, "train/sim_loss": 0.08203125 }, { "epoch": 0.17826774767648804, "step": 1803, "train/total_loss": 0.15333276987075806 }, { "entropy": 9.224993705749512, "epoch": 0.17836662052600355, "mean_token_accuracy": 0.7852028608322144, "num_tokens": 9386798.0, "step": 1804, "train/ce_loss": 0.8284642696380615 }, { "epoch": 0.17836662052600355, "step": 1804, "train/sim_loss": 0.02734375 }, { "epoch": 0.17836662052600355, "step": 1804, "train/total_loss": 0.11019017547369003 }, { "entropy": 9.024450302124023, "epoch": 0.1784654933755191, "mean_token_accuracy": 0.7416413426399231, "num_tokens": 9392253.0, "step": 1805, "train/ce_loss": 1.0536538362503052 }, { "epoch": 0.1784654933755191, "step": 1805, "train/sim_loss": 0.11328125 }, { "epoch": 0.1784654933755191, "step": 1805, "train/total_loss": 0.21864664554595947 }, { "entropy": 9.209526062011719, "epoch": 0.1785643662250346, "mean_token_accuracy": 0.7149425148963928, "num_tokens": 9397564.0, "step": 1806, "train/ce_loss": 0.672696590423584 }, { "epoch": 0.1785643662250346, "step": 1806, "train/sim_loss": 0.12109375 }, { "epoch": 0.1785643662250346, "step": 1806, "train/total_loss": 0.18836340308189392 }, { "entropy": 8.976957321166992, "epoch": 0.17866323907455012, "mean_token_accuracy": 0.7857911586761475, "num_tokens": 9402970.0, "step": 1807, "train/ce_loss": 0.602595329284668 }, { "epoch": 0.17866323907455012, "step": 1807, "train/sim_loss": 0.06640625 }, { "epoch": 0.17866323907455012, "step": 1807, "train/total_loss": 0.12666578590869904 }, { "entropy": 9.577447891235352, "epoch": 0.17876211192406566, "mean_token_accuracy": 0.7555555701255798, "num_tokens": 9408069.0, "step": 1808, "train/ce_loss": 0.6192771792411804 }, { "epoch": 0.17876211192406566, "step": 1808, "train/sim_loss": 0.08984375 }, { "epoch": 0.17876211192406566, "step": 1808, "train/total_loss": 0.15177147090435028 }, { "entropy": 9.357033729553223, "epoch": 0.17886098477358117, "mean_token_accuracy": 0.7422401905059814, "num_tokens": 9413239.0, "step": 1809, "train/ce_loss": 1.4079818725585938 }, { "epoch": 0.17886098477358117, "step": 1809, "train/sim_loss": 0.09375 }, { "epoch": 0.17886098477358117, "step": 1809, "train/total_loss": 0.2345481961965561 }, { "entropy": 9.14383316040039, "epoch": 0.17895985762309669, "mean_token_accuracy": 0.7847533822059631, "num_tokens": 9418600.0, "step": 1810, "train/ce_loss": 0.9785193204879761 }, { "epoch": 0.17895985762309669, "step": 1810, "train/sim_loss": 0.0859375 }, { "epoch": 0.17895985762309669, "step": 1810, "train/total_loss": 0.1837894320487976 }, { "entropy": 10.573735237121582, "epoch": 0.17905873047261223, "mean_token_accuracy": 0.6759776473045349, "num_tokens": 9423167.0, "step": 1811, "train/ce_loss": 4.147632122039795 }, { "epoch": 0.17905873047261223, "step": 1811, "train/sim_loss": 0.0859375 }, { "epoch": 0.17905873047261223, "step": 1811, "train/total_loss": 0.5007007122039795 }, { "entropy": 9.858811378479004, "epoch": 0.17915760332212774, "mean_token_accuracy": 0.7148080468177795, "num_tokens": 9428113.0, "step": 1812, "train/ce_loss": 1.2605161666870117 }, { "epoch": 0.17915760332212774, "step": 1812, "train/sim_loss": 0.0546875 }, { "epoch": 0.17915760332212774, "step": 1812, "train/total_loss": 0.1807391196489334 }, { "entropy": 8.92715835571289, "epoch": 0.17925647617164325, "mean_token_accuracy": 0.7004877924919128, "num_tokens": 9433602.0, "step": 1813, "train/ce_loss": 1.0510623455047607 }, { "epoch": 0.17925647617164325, "step": 1813, "train/sim_loss": 0.05078125 }, { "epoch": 0.17925647617164325, "step": 1813, "train/total_loss": 0.15588748455047607 }, { "entropy": 9.373226165771484, "epoch": 0.1793553490211588, "mean_token_accuracy": 0.7319711446762085, "num_tokens": 9439078.0, "step": 1814, "train/ce_loss": 1.2842234373092651 }, { "epoch": 0.1793553490211588, "step": 1814, "train/sim_loss": 0.10546875 }, { "epoch": 0.1793553490211588, "step": 1814, "train/total_loss": 0.233891099691391 }, { "entropy": 9.695732116699219, "epoch": 0.1794542218706743, "mean_token_accuracy": 0.7933884263038635, "num_tokens": 9444124.0, "step": 1815, "train/ce_loss": 5.596983555733459e-06 }, { "epoch": 0.1794542218706743, "step": 1815, "train/sim_loss": 0.03515625 }, { "epoch": 0.1794542218706743, "step": 1815, "train/total_loss": 0.03515680879354477 }, { "entropy": 9.347949981689453, "epoch": 0.17955309472018985, "mean_token_accuracy": 0.7309136390686035, "num_tokens": 9449371.0, "step": 1816, "train/ce_loss": 0.9072271585464478 }, { "epoch": 0.17955309472018985, "step": 1816, "train/sim_loss": 0.03125 }, { "epoch": 0.17955309472018985, "step": 1816, "train/total_loss": 0.1219727173447609 }, { "entropy": 9.286724090576172, "epoch": 0.17965196756970536, "mean_token_accuracy": 0.7020997405052185, "num_tokens": 9454597.0, "step": 1817, "train/ce_loss": 0.5547173023223877 }, { "epoch": 0.17965196756970536, "step": 1817, "train/sim_loss": 0.046875 }, { "epoch": 0.17965196756970536, "step": 1817, "train/total_loss": 0.10234673321247101 }, { "entropy": 9.670854568481445, "epoch": 0.17975084041922088, "mean_token_accuracy": 0.7434312105178833, "num_tokens": 9459663.0, "step": 1818, "train/ce_loss": 5.238787252892507e-06 }, { "epoch": 0.17975084041922088, "step": 1818, "train/sim_loss": 0.078125 }, { "epoch": 0.17975084041922088, "step": 1818, "train/total_loss": 0.07812552154064178 }, { "entropy": 9.288985252380371, "epoch": 0.17984971326873642, "mean_token_accuracy": 0.7222222089767456, "num_tokens": 9464980.0, "step": 1819, "train/ce_loss": 0.8383810520172119 }, { "epoch": 0.17984971326873642, "step": 1819, "train/sim_loss": 0.05078125 }, { "epoch": 0.17984971326873642, "step": 1819, "train/total_loss": 0.1346193552017212 }, { "epoch": 0.17994858611825193, "grad_norm": 0.9709348082542419, "learning_rate": 9.552736982643526e-06, "loss": 0.1551, "step": 1820 }, { "entropy": 9.821374893188477, "epoch": 0.17994858611825193, "mean_token_accuracy": 0.6834782361984253, "num_tokens": 9469973.0, "step": 1820, "train/ce_loss": 1.030767798423767 }, { "epoch": 0.17994858611825193, "step": 1820, "train/sim_loss": 0.08203125 }, { "epoch": 0.17994858611825193, "step": 1820, "train/total_loss": 0.1851080358028412 }, { "entropy": 9.43313217163086, "epoch": 0.18004745896776744, "mean_token_accuracy": 0.7410852909088135, "num_tokens": 9475082.0, "step": 1821, "train/ce_loss": 0.7259458899497986 }, { "epoch": 0.18004745896776744, "step": 1821, "train/sim_loss": 0.0546875 }, { "epoch": 0.18004745896776744, "step": 1821, "train/total_loss": 0.12728208303451538 }, { "entropy": 9.883010864257812, "epoch": 0.18014633181728298, "mean_token_accuracy": 0.7248576879501343, "num_tokens": 9479978.0, "step": 1822, "train/ce_loss": 6.774416760890745e-06 }, { "epoch": 0.18014633181728298, "step": 1822, "train/sim_loss": 0.02734375 }, { "epoch": 0.18014633181728298, "step": 1822, "train/total_loss": 0.02734442800283432 }, { "entropy": 9.508912086486816, "epoch": 0.1802452046667985, "mean_token_accuracy": 0.7350901365280151, "num_tokens": 9485097.0, "step": 1823, "train/ce_loss": 1.1825639009475708 }, { "epoch": 0.1802452046667985, "step": 1823, "train/sim_loss": 0.09375 }, { "epoch": 0.1802452046667985, "step": 1823, "train/total_loss": 0.21200639009475708 }, { "entropy": 9.023045539855957, "epoch": 0.180344077516314, "mean_token_accuracy": 0.7669801712036133, "num_tokens": 9490520.0, "step": 1824, "train/ce_loss": 1.0307142734527588 }, { "epoch": 0.180344077516314, "step": 1824, "train/sim_loss": 0.05859375 }, { "epoch": 0.180344077516314, "step": 1824, "train/total_loss": 0.1616651713848114 }, { "entropy": 9.16715145111084, "epoch": 0.18044295036582955, "mean_token_accuracy": 0.7367149591445923, "num_tokens": 9495871.0, "step": 1825, "train/ce_loss": 0.8486382961273193 }, { "epoch": 0.18044295036582955, "step": 1825, "train/sim_loss": 0.0703125 }, { "epoch": 0.18044295036582955, "step": 1825, "train/total_loss": 0.1551763415336609 }, { "entropy": 10.543107032775879, "epoch": 0.18054182321534507, "mean_token_accuracy": 0.7471264600753784, "num_tokens": 9500474.0, "step": 1826, "train/ce_loss": 5.42702000529971e-05 }, { "epoch": 0.18054182321534507, "step": 1826, "train/sim_loss": 0.06640625 }, { "epoch": 0.18054182321534507, "step": 1826, "train/total_loss": 0.06641167402267456 }, { "entropy": 9.587188720703125, "epoch": 0.18064069606486058, "mean_token_accuracy": 0.7706552743911743, "num_tokens": 9505620.0, "step": 1827, "train/ce_loss": 1.4216008186340332 }, { "epoch": 0.18064069606486058, "step": 1827, "train/sim_loss": 0.06640625 }, { "epoch": 0.18064069606486058, "step": 1827, "train/total_loss": 0.2085663378238678 }, { "entropy": 9.30483627319336, "epoch": 0.18073956891437612, "mean_token_accuracy": 0.7323943376541138, "num_tokens": 9510941.0, "step": 1828, "train/ce_loss": 0.8432378172874451 }, { "epoch": 0.18073956891437612, "step": 1828, "train/sim_loss": 0.07421875 }, { "epoch": 0.18073956891437612, "step": 1828, "train/total_loss": 0.15854254364967346 }, { "entropy": 9.974531173706055, "epoch": 0.18083844176389163, "mean_token_accuracy": 0.6699604988098145, "num_tokens": 9515857.0, "step": 1829, "train/ce_loss": 1.2670997381210327 }, { "epoch": 0.18083844176389163, "step": 1829, "train/sim_loss": 0.0859375 }, { "epoch": 0.18083844176389163, "step": 1829, "train/total_loss": 0.2126474827528 }, { "entropy": 9.296100616455078, "epoch": 0.18093731461340715, "mean_token_accuracy": 0.7124260067939758, "num_tokens": 9521156.0, "step": 1830, "train/ce_loss": 0.601858913898468 }, { "epoch": 0.18093731461340715, "step": 1830, "train/sim_loss": 0.0234375 }, { "epoch": 0.18093731461340715, "step": 1830, "train/total_loss": 0.08362339437007904 }, { "entropy": 9.4048433303833, "epoch": 0.1810361874629227, "mean_token_accuracy": 0.6895705461502075, "num_tokens": 9526351.0, "step": 1831, "train/ce_loss": 0.674392580986023 }, { "epoch": 0.1810361874629227, "step": 1831, "train/sim_loss": 0.09375 }, { "epoch": 0.1810361874629227, "step": 1831, "train/total_loss": 0.1611892580986023 }, { "entropy": 9.436763763427734, "epoch": 0.1811350603124382, "mean_token_accuracy": 0.7588152289390564, "num_tokens": 9531530.0, "step": 1832, "train/ce_loss": 0.8237096071243286 }, { "epoch": 0.1811350603124382, "step": 1832, "train/sim_loss": 0.078125 }, { "epoch": 0.1811350603124382, "step": 1832, "train/total_loss": 0.16049596667289734 }, { "entropy": 9.5198335647583, "epoch": 0.18123393316195371, "mean_token_accuracy": 0.7661623358726501, "num_tokens": 9536692.0, "step": 1833, "train/ce_loss": 1.0668498277664185 }, { "epoch": 0.18123393316195371, "step": 1833, "train/sim_loss": 0.0390625 }, { "epoch": 0.18123393316195371, "step": 1833, "train/total_loss": 0.14574748277664185 }, { "entropy": 8.950961112976074, "epoch": 0.18133280601146926, "mean_token_accuracy": 0.7164319157600403, "num_tokens": 9542387.0, "step": 1834, "train/ce_loss": 1.0130492448806763 }, { "epoch": 0.18133280601146926, "step": 1834, "train/sim_loss": 0.08203125 }, { "epoch": 0.18133280601146926, "step": 1834, "train/total_loss": 0.18333616852760315 }, { "entropy": 9.354697227478027, "epoch": 0.18143167886098477, "mean_token_accuracy": 0.7476635575294495, "num_tokens": 9547598.0, "step": 1835, "train/ce_loss": 0.7982195615768433 }, { "epoch": 0.18143167886098477, "step": 1835, "train/sim_loss": 0.05859375 }, { "epoch": 0.18143167886098477, "step": 1835, "train/total_loss": 0.13841570913791656 }, { "entropy": 9.004920959472656, "epoch": 0.1815305517105003, "mean_token_accuracy": 0.7362030744552612, "num_tokens": 9552969.0, "step": 1836, "train/ce_loss": 0.7738270163536072 }, { "epoch": 0.1815305517105003, "step": 1836, "train/sim_loss": 0.09375 }, { "epoch": 0.1815305517105003, "step": 1836, "train/total_loss": 0.17113271355628967 }, { "entropy": 8.88099193572998, "epoch": 0.18162942456001582, "mean_token_accuracy": 0.7259978652000427, "num_tokens": 9558426.0, "step": 1837, "train/ce_loss": 0.3200748562812805 }, { "epoch": 0.18162942456001582, "step": 1837, "train/sim_loss": 0.02734375 }, { "epoch": 0.18162942456001582, "step": 1837, "train/total_loss": 0.05935123562812805 }, { "entropy": 9.812850952148438, "epoch": 0.18172829740953134, "mean_token_accuracy": 0.7011070251464844, "num_tokens": 9563367.0, "step": 1838, "train/ce_loss": 0.746933102607727 }, { "epoch": 0.18172829740953134, "step": 1838, "train/sim_loss": 0.09765625 }, { "epoch": 0.18172829740953134, "step": 1838, "train/total_loss": 0.17234957218170166 }, { "entropy": 9.44894790649414, "epoch": 0.18182717025904688, "mean_token_accuracy": 0.7223684191703796, "num_tokens": 9568596.0, "step": 1839, "train/ce_loss": 1.4718496799468994 }, { "epoch": 0.18182717025904688, "step": 1839, "train/sim_loss": 0.125 }, { "epoch": 0.18182717025904688, "step": 1839, "train/total_loss": 0.27218496799468994 }, { "epoch": 0.1819260431085624, "grad_norm": 0.8998593091964722, "learning_rate": 9.547792117885577e-06, "loss": 0.1612, "step": 1840 }, { "entropy": 9.199764251708984, "epoch": 0.1819260431085624, "mean_token_accuracy": 0.7387606501579285, "num_tokens": 9573826.0, "step": 1840, "train/ce_loss": 0.780877947807312 }, { "epoch": 0.1819260431085624, "step": 1840, "train/sim_loss": 0.05078125 }, { "epoch": 0.1819260431085624, "step": 1840, "train/total_loss": 0.12886905670166016 }, { "entropy": 8.80765151977539, "epoch": 0.1820249159580779, "mean_token_accuracy": 0.7360248565673828, "num_tokens": 9579299.0, "step": 1841, "train/ce_loss": 0.8180506825447083 }, { "epoch": 0.1820249159580779, "step": 1841, "train/sim_loss": 0.09765625 }, { "epoch": 0.1820249159580779, "step": 1841, "train/total_loss": 0.17946133017539978 }, { "entropy": 9.316116333007812, "epoch": 0.18212378880759345, "mean_token_accuracy": 0.7462499737739563, "num_tokens": 9584548.0, "step": 1842, "train/ce_loss": 0.7248634099960327 }, { "epoch": 0.18212378880759345, "step": 1842, "train/sim_loss": 0.02734375 }, { "epoch": 0.18212378880759345, "step": 1842, "train/total_loss": 0.09983009099960327 }, { "entropy": 9.40951919555664, "epoch": 0.18222266165710896, "mean_token_accuracy": 0.7121211886405945, "num_tokens": 9589726.0, "step": 1843, "train/ce_loss": 0.7894455194473267 }, { "epoch": 0.18222266165710896, "step": 1843, "train/sim_loss": 0.08203125 }, { "epoch": 0.18222266165710896, "step": 1843, "train/total_loss": 0.16097581386566162 }, { "entropy": 9.293411254882812, "epoch": 0.18232153450662447, "mean_token_accuracy": 0.766749382019043, "num_tokens": 9595011.0, "step": 1844, "train/ce_loss": 1.7790314814192243e-05 }, { "epoch": 0.18232153450662447, "step": 1844, "train/sim_loss": 0.09375 }, { "epoch": 0.18232153450662447, "step": 1844, "train/total_loss": 0.09375178068876266 }, { "entropy": 9.97812271118164, "epoch": 0.18242040735614, "mean_token_accuracy": 0.7121211886405945, "num_tokens": 9599837.0, "step": 1845, "train/ce_loss": 1.2837271690368652 }, { "epoch": 0.18242040735614, "step": 1845, "train/sim_loss": 0.10546875 }, { "epoch": 0.18242040735614, "step": 1845, "train/total_loss": 0.23384146392345428 }, { "entropy": 9.744732856750488, "epoch": 0.18251928020565553, "mean_token_accuracy": 0.7655986547470093, "num_tokens": 9604874.0, "step": 1846, "train/ce_loss": 0.9753443598747253 }, { "epoch": 0.18251928020565553, "step": 1846, "train/sim_loss": 0.02734375 }, { "epoch": 0.18251928020565553, "step": 1846, "train/total_loss": 0.12487819045782089 }, { "entropy": 9.101228713989258, "epoch": 0.18261815305517104, "mean_token_accuracy": 0.7458006739616394, "num_tokens": 9610218.0, "step": 1847, "train/ce_loss": 0.9057663679122925 }, { "epoch": 0.18261815305517104, "step": 1847, "train/sim_loss": 0.1328125 }, { "epoch": 0.18261815305517104, "step": 1847, "train/total_loss": 0.2233891487121582 }, { "entropy": 9.602880477905273, "epoch": 0.18271702590468658, "mean_token_accuracy": 0.7296848893165588, "num_tokens": 9615236.0, "step": 1848, "train/ce_loss": 1.7166345119476318 }, { "epoch": 0.18271702590468658, "step": 1848, "train/sim_loss": 0.1328125 }, { "epoch": 0.18271702590468658, "step": 1848, "train/total_loss": 0.30447596311569214 }, { "entropy": 9.3284912109375, "epoch": 0.1828158987542021, "mean_token_accuracy": 0.7758318781852722, "num_tokens": 9620228.0, "step": 1849, "train/ce_loss": 0.9070501923561096 }, { "epoch": 0.1828158987542021, "step": 1849, "train/sim_loss": 0.08984375 }, { "epoch": 0.1828158987542021, "step": 1849, "train/total_loss": 0.1805487722158432 }, { "entropy": 9.956958770751953, "epoch": 0.1829147716037176, "mean_token_accuracy": 0.7658079862594604, "num_tokens": 9625117.0, "step": 1850, "train/ce_loss": 9.23292463994585e-06 }, { "epoch": 0.1829147716037176, "step": 1850, "train/sim_loss": 0.05859375 }, { "epoch": 0.1829147716037176, "step": 1850, "train/total_loss": 0.05859467387199402 }, { "entropy": 9.088489532470703, "epoch": 0.18301364445323315, "mean_token_accuracy": 0.7605321407318115, "num_tokens": 9630474.0, "step": 1851, "train/ce_loss": 1.0857402086257935 }, { "epoch": 0.18301364445323315, "step": 1851, "train/sim_loss": 0.1171875 }, { "epoch": 0.18301364445323315, "step": 1851, "train/total_loss": 0.2257615327835083 }, { "entropy": 8.86593246459961, "epoch": 0.18311251730274866, "mean_token_accuracy": 0.6904024481773376, "num_tokens": 9635916.0, "step": 1852, "train/ce_loss": 1.391951322555542 }, { "epoch": 0.18311251730274866, "step": 1852, "train/sim_loss": 0.0703125 }, { "epoch": 0.18311251730274866, "step": 1852, "train/total_loss": 0.20950762927532196 }, { "entropy": 9.429494857788086, "epoch": 0.18321139015226418, "mean_token_accuracy": 0.7647849321365356, "num_tokens": 9641109.0, "step": 1853, "train/ce_loss": 0.6080541014671326 }, { "epoch": 0.18321139015226418, "step": 1853, "train/sim_loss": 0.03515625 }, { "epoch": 0.18321139015226418, "step": 1853, "train/total_loss": 0.09596166014671326 }, { "entropy": 9.24432373046875, "epoch": 0.18331026300177972, "mean_token_accuracy": 0.7325268983840942, "num_tokens": 9646320.0, "step": 1854, "train/ce_loss": 0.9263020157814026 }, { "epoch": 0.18331026300177972, "step": 1854, "train/sim_loss": 0.01953125 }, { "epoch": 0.18331026300177972, "step": 1854, "train/total_loss": 0.11216145008802414 }, { "entropy": 9.437981605529785, "epoch": 0.18340913585129523, "mean_token_accuracy": 0.7396870851516724, "num_tokens": 9651493.0, "step": 1855, "train/ce_loss": 0.9242285490036011 }, { "epoch": 0.18340913585129523, "step": 1855, "train/sim_loss": 0.04296875 }, { "epoch": 0.18340913585129523, "step": 1855, "train/total_loss": 0.13539160788059235 }, { "entropy": 9.209920883178711, "epoch": 0.18350800870081077, "mean_token_accuracy": 0.7011643052101135, "num_tokens": 9656744.0, "step": 1856, "train/ce_loss": 1.042596459388733 }, { "epoch": 0.18350800870081077, "step": 1856, "train/sim_loss": 0.10546875 }, { "epoch": 0.18350800870081077, "step": 1856, "train/total_loss": 0.2097283899784088 }, { "entropy": 9.788610458374023, "epoch": 0.18360688155032628, "mean_token_accuracy": 0.7260788083076477, "num_tokens": 9661703.0, "step": 1857, "train/ce_loss": 6.188375664351042e-06 }, { "epoch": 0.18360688155032628, "step": 1857, "train/sim_loss": 0.04296875 }, { "epoch": 0.18360688155032628, "step": 1857, "train/total_loss": 0.042969368398189545 }, { "entropy": 9.401823043823242, "epoch": 0.1837057543998418, "mean_token_accuracy": 0.6855670213699341, "num_tokens": 9666917.0, "step": 1858, "train/ce_loss": 1.1027350425720215 }, { "epoch": 0.1837057543998418, "step": 1858, "train/sim_loss": 0.0625 }, { "epoch": 0.1837057543998418, "step": 1858, "train/total_loss": 0.17277351021766663 }, { "entropy": 9.327844619750977, "epoch": 0.18380462724935734, "mean_token_accuracy": 0.742514967918396, "num_tokens": 9672173.0, "step": 1859, "train/ce_loss": 0.74072265625 }, { "epoch": 0.18380462724935734, "step": 1859, "train/sim_loss": 0.0625 }, { "epoch": 0.18380462724935734, "step": 1859, "train/total_loss": 0.13657227158546448 }, { "epoch": 0.18390350009887285, "grad_norm": 0.8869383335113525, "learning_rate": 9.542847253127627e-06, "loss": 0.153, "step": 1860 }, { "entropy": 9.441791534423828, "epoch": 0.18390350009887285, "mean_token_accuracy": 0.7203728556632996, "num_tokens": 9677377.0, "step": 1860, "train/ce_loss": 1.3488295078277588 }, { "epoch": 0.18390350009887285, "step": 1860, "train/sim_loss": 0.0703125 }, { "epoch": 0.18390350009887285, "step": 1860, "train/total_loss": 0.20519545674324036 }, { "entropy": 8.826675415039062, "epoch": 0.18400237294838837, "mean_token_accuracy": 0.7460018992424011, "num_tokens": 9682960.0, "step": 1861, "train/ce_loss": 0.5895362496376038 }, { "epoch": 0.18400237294838837, "step": 1861, "train/sim_loss": 0.03125 }, { "epoch": 0.18400237294838837, "step": 1861, "train/total_loss": 0.09020362794399261 }, { "entropy": 9.426769256591797, "epoch": 0.1841012457979039, "mean_token_accuracy": 0.7626886367797852, "num_tokens": 9688133.0, "step": 1862, "train/ce_loss": 3.955638931074645e-06 }, { "epoch": 0.1841012457979039, "step": 1862, "train/sim_loss": 0.0859375 }, { "epoch": 0.1841012457979039, "step": 1862, "train/total_loss": 0.08593789488077164 }, { "entropy": 9.118680000305176, "epoch": 0.18420011864741942, "mean_token_accuracy": 0.7420091032981873, "num_tokens": 9693421.0, "step": 1863, "train/ce_loss": 0.7646657228469849 }, { "epoch": 0.18420011864741942, "step": 1863, "train/sim_loss": 0.05078125 }, { "epoch": 0.18420011864741942, "step": 1863, "train/total_loss": 0.12724782526493073 }, { "entropy": 10.062543869018555, "epoch": 0.18429899149693493, "mean_token_accuracy": 0.6973365545272827, "num_tokens": 9698256.0, "step": 1864, "train/ce_loss": 2.019545718212612e-05 }, { "epoch": 0.18429899149693493, "step": 1864, "train/sim_loss": 0.046875 }, { "epoch": 0.18429899149693493, "step": 1864, "train/total_loss": 0.046877019107341766 }, { "entropy": 9.804080963134766, "epoch": 0.18439786434645047, "mean_token_accuracy": 0.6678898930549622, "num_tokens": 9703264.0, "step": 1865, "train/ce_loss": 2.0530173778533936 }, { "epoch": 0.18439786434645047, "step": 1865, "train/sim_loss": 0.12890625 }, { "epoch": 0.18439786434645047, "step": 1865, "train/total_loss": 0.33420801162719727 }, { "entropy": 9.011037826538086, "epoch": 0.184496737195966, "mean_token_accuracy": 0.6892778873443604, "num_tokens": 9708695.0, "step": 1866, "train/ce_loss": 0.7095602750778198 }, { "epoch": 0.184496737195966, "step": 1866, "train/sim_loss": 0.0859375 }, { "epoch": 0.184496737195966, "step": 1866, "train/total_loss": 0.1568935215473175 }, { "entropy": 9.28196907043457, "epoch": 0.1845956100454815, "mean_token_accuracy": 0.7383177280426025, "num_tokens": 9713972.0, "step": 1867, "train/ce_loss": 0.9496920108795166 }, { "epoch": 0.1845956100454815, "step": 1867, "train/sim_loss": 0.06640625 }, { "epoch": 0.1845956100454815, "step": 1867, "train/total_loss": 0.16137546300888062 }, { "entropy": 9.117605209350586, "epoch": 0.18469448289499704, "mean_token_accuracy": 0.7533632516860962, "num_tokens": 9719362.0, "step": 1868, "train/ce_loss": 0.6435233354568481 }, { "epoch": 0.18469448289499704, "step": 1868, "train/sim_loss": 0.0625 }, { "epoch": 0.18469448289499704, "step": 1868, "train/total_loss": 0.12685233354568481 }, { "entropy": 9.366474151611328, "epoch": 0.18479335574451256, "mean_token_accuracy": 0.7259551882743835, "num_tokens": 9724575.0, "step": 1869, "train/ce_loss": 0.67132568359375 }, { "epoch": 0.18479335574451256, "step": 1869, "train/sim_loss": 0.05078125 }, { "epoch": 0.18479335574451256, "step": 1869, "train/total_loss": 0.11791381984949112 }, { "entropy": 9.35400390625, "epoch": 0.18489222859402807, "mean_token_accuracy": 0.7232037782669067, "num_tokens": 9729866.0, "step": 1870, "train/ce_loss": 0.6131327152252197 }, { "epoch": 0.18489222859402807, "step": 1870, "train/sim_loss": 0.05078125 }, { "epoch": 0.18489222859402807, "step": 1870, "train/total_loss": 0.11209452152252197 }, { "entropy": 10.150435447692871, "epoch": 0.1849911014435436, "mean_token_accuracy": 0.7472527623176575, "num_tokens": 9734623.0, "step": 1871, "train/ce_loss": 8.638548024464399e-06 }, { "epoch": 0.1849911014435436, "step": 1871, "train/sim_loss": 0.03125 }, { "epoch": 0.1849911014435436, "step": 1871, "train/total_loss": 0.03125086426734924 }, { "entropy": 9.388439178466797, "epoch": 0.18508997429305912, "mean_token_accuracy": 0.7086183428764343, "num_tokens": 9739807.0, "step": 1872, "train/ce_loss": 0.6699540019035339 }, { "epoch": 0.18508997429305912, "step": 1872, "train/sim_loss": 0.12890625 }, { "epoch": 0.18508997429305912, "step": 1872, "train/total_loss": 0.19590166211128235 }, { "entropy": 10.134531021118164, "epoch": 0.18518884714257464, "mean_token_accuracy": 0.6870229244232178, "num_tokens": 9744553.0, "step": 1873, "train/ce_loss": 1.5828860998153687 }, { "epoch": 0.18518884714257464, "step": 1873, "train/sim_loss": 0.04296875 }, { "epoch": 0.18518884714257464, "step": 1873, "train/total_loss": 0.2012573629617691 }, { "entropy": 9.382888793945312, "epoch": 0.18528771999209018, "mean_token_accuracy": 0.721784770488739, "num_tokens": 9749768.0, "step": 1874, "train/ce_loss": 0.5177397727966309 }, { "epoch": 0.18528771999209018, "step": 1874, "train/sim_loss": 0.1171875 }, { "epoch": 0.18528771999209018, "step": 1874, "train/total_loss": 0.16896148025989532 }, { "entropy": 9.096595764160156, "epoch": 0.1853865928416057, "mean_token_accuracy": 0.7314702272415161, "num_tokens": 9755067.0, "step": 1875, "train/ce_loss": 0.4712185263633728 }, { "epoch": 0.1853865928416057, "step": 1875, "train/sim_loss": 0.0234375 }, { "epoch": 0.1853865928416057, "step": 1875, "train/total_loss": 0.07055935263633728 }, { "entropy": 8.749658584594727, "epoch": 0.18548546569112123, "mean_token_accuracy": 0.6856856942176819, "num_tokens": 9760586.0, "step": 1876, "train/ce_loss": 0.7312849760055542 }, { "epoch": 0.18548546569112123, "step": 1876, "train/sim_loss": 0.09375 }, { "epoch": 0.18548546569112123, "step": 1876, "train/total_loss": 0.16687849164009094 }, { "entropy": 9.147841453552246, "epoch": 0.18558433854063675, "mean_token_accuracy": 0.7352941036224365, "num_tokens": 9765912.0, "step": 1877, "train/ce_loss": 1.8808138370513916 }, { "epoch": 0.18558433854063675, "step": 1877, "train/sim_loss": 0.09765625 }, { "epoch": 0.18558433854063675, "step": 1877, "train/total_loss": 0.28573763370513916 }, { "entropy": 9.344097137451172, "epoch": 0.18568321139015226, "mean_token_accuracy": 0.7443609237670898, "num_tokens": 9771137.0, "step": 1878, "train/ce_loss": 0.42486655712127686 }, { "epoch": 0.18568321139015226, "step": 1878, "train/sim_loss": 0.05859375 }, { "epoch": 0.18568321139015226, "step": 1878, "train/total_loss": 0.10108040273189545 }, { "entropy": 9.687973022460938, "epoch": 0.1857820842396678, "mean_token_accuracy": 0.7692307829856873, "num_tokens": 9776174.0, "step": 1879, "train/ce_loss": 1.415340542793274 }, { "epoch": 0.1857820842396678, "step": 1879, "train/sim_loss": 0.09375 }, { "epoch": 0.1857820842396678, "step": 1879, "train/total_loss": 0.23528406023979187 }, { "epoch": 0.1858809570891833, "grad_norm": 1.0253803730010986, "learning_rate": 9.53790238836968e-06, "loss": 0.1698, "step": 1880 }, { "entropy": 9.919747352600098, "epoch": 0.1858809570891833, "mean_token_accuracy": 0.7460629940032959, "num_tokens": 9781115.0, "step": 1880, "train/ce_loss": 2.0956978797912598 }, { "epoch": 0.1858809570891833, "step": 1880, "train/sim_loss": 0.0703125 }, { "epoch": 0.1858809570891833, "step": 1880, "train/total_loss": 0.2798823118209839 }, { "entropy": 9.35361385345459, "epoch": 0.18597982993869883, "mean_token_accuracy": 0.7058081030845642, "num_tokens": 9786382.0, "step": 1881, "train/ce_loss": 1.2247314453125 }, { "epoch": 0.18597982993869883, "step": 1881, "train/sim_loss": 0.1015625 }, { "epoch": 0.18597982993869883, "step": 1881, "train/total_loss": 0.22403565049171448 }, { "entropy": 9.506695747375488, "epoch": 0.18607870278821437, "mean_token_accuracy": 0.7617135047912598, "num_tokens": 9791569.0, "step": 1882, "train/ce_loss": 7.135338364605559e-06 }, { "epoch": 0.18607870278821437, "step": 1882, "train/sim_loss": 0.02734375 }, { "epoch": 0.18607870278821437, "step": 1882, "train/total_loss": 0.027344463393092155 }, { "entropy": 9.088245391845703, "epoch": 0.18617757563772988, "mean_token_accuracy": 0.7137891054153442, "num_tokens": 9796935.0, "step": 1883, "train/ce_loss": 1.3417773246765137 }, { "epoch": 0.18617757563772988, "step": 1883, "train/sim_loss": 0.09375 }, { "epoch": 0.18617757563772988, "step": 1883, "train/total_loss": 0.22792772948741913 }, { "entropy": 9.646706581115723, "epoch": 0.1862764484872454, "mean_token_accuracy": 0.7172312140464783, "num_tokens": 9801993.0, "step": 1884, "train/ce_loss": 1.3433440923690796 }, { "epoch": 0.1862764484872454, "step": 1884, "train/sim_loss": 0.0625 }, { "epoch": 0.1862764484872454, "step": 1884, "train/total_loss": 0.19683441519737244 }, { "entropy": 9.41108512878418, "epoch": 0.18637532133676094, "mean_token_accuracy": 0.7837116122245789, "num_tokens": 9807193.0, "step": 1885, "train/ce_loss": 0.5483527779579163 }, { "epoch": 0.18637532133676094, "step": 1885, "train/sim_loss": 0.03125 }, { "epoch": 0.18637532133676094, "step": 1885, "train/total_loss": 0.08608527481555939 }, { "entropy": 9.341239929199219, "epoch": 0.18647419418627645, "mean_token_accuracy": 0.7682403326034546, "num_tokens": 9812334.0, "step": 1886, "train/ce_loss": 0.5725024938583374 }, { "epoch": 0.18647419418627645, "step": 1886, "train/sim_loss": 0.1015625 }, { "epoch": 0.18647419418627645, "step": 1886, "train/total_loss": 0.1588127464056015 }, { "entropy": 9.302617073059082, "epoch": 0.18657306703579196, "mean_token_accuracy": 0.7543640732765198, "num_tokens": 9817596.0, "step": 1887, "train/ce_loss": 1.5095512866973877 }, { "epoch": 0.18657306703579196, "step": 1887, "train/sim_loss": 0.0625 }, { "epoch": 0.18657306703579196, "step": 1887, "train/total_loss": 0.21345512568950653 }, { "entropy": 9.235432624816895, "epoch": 0.1866719398853075, "mean_token_accuracy": 0.6889185309410095, "num_tokens": 9822802.0, "step": 1888, "train/ce_loss": 0.9855005145072937 }, { "epoch": 0.1866719398853075, "step": 1888, "train/sim_loss": 0.078125 }, { "epoch": 0.1866719398853075, "step": 1888, "train/total_loss": 0.17667505145072937 }, { "entropy": 9.256308555603027, "epoch": 0.18677081273482302, "mean_token_accuracy": 0.7429577708244324, "num_tokens": 9828116.0, "step": 1889, "train/ce_loss": 0.7325549721717834 }, { "epoch": 0.18677081273482302, "step": 1889, "train/sim_loss": 0.01953125 }, { "epoch": 0.18677081273482302, "step": 1889, "train/total_loss": 0.0927867516875267 }, { "entropy": 10.075715065002441, "epoch": 0.18686968558433853, "mean_token_accuracy": 0.7091836929321289, "num_tokens": 9832906.0, "step": 1890, "train/ce_loss": 1.0335999727249146 }, { "epoch": 0.18686968558433853, "step": 1890, "train/sim_loss": 0.07421875 }, { "epoch": 0.18686968558433853, "step": 1890, "train/total_loss": 0.17757874727249146 }, { "entropy": 9.484654426574707, "epoch": 0.18696855843385407, "mean_token_accuracy": 0.7492307424545288, "num_tokens": 9838044.0, "step": 1891, "train/ce_loss": 0.8732141852378845 }, { "epoch": 0.18696855843385407, "step": 1891, "train/sim_loss": 0.09375 }, { "epoch": 0.18696855843385407, "step": 1891, "train/total_loss": 0.1810714304447174 }, { "entropy": 9.349531173706055, "epoch": 0.18706743128336958, "mean_token_accuracy": 0.75157630443573, "num_tokens": 9843240.0, "step": 1892, "train/ce_loss": 0.8059046864509583 }, { "epoch": 0.18706743128336958, "step": 1892, "train/sim_loss": 0.09375 }, { "epoch": 0.18706743128336958, "step": 1892, "train/total_loss": 0.17434047162532806 }, { "entropy": 8.950783729553223, "epoch": 0.1871663041328851, "mean_token_accuracy": 0.7363343834877014, "num_tokens": 9848615.0, "step": 1893, "train/ce_loss": 0.7747853994369507 }, { "epoch": 0.1871663041328851, "step": 1893, "train/sim_loss": 0.03515625 }, { "epoch": 0.1871663041328851, "step": 1893, "train/total_loss": 0.11263479292392731 }, { "entropy": 9.945171356201172, "epoch": 0.18726517698240064, "mean_token_accuracy": 0.7347368597984314, "num_tokens": 9853517.0, "step": 1894, "train/ce_loss": 2.5057694074348547e-05 }, { "epoch": 0.18726517698240064, "step": 1894, "train/sim_loss": 0.109375 }, { "epoch": 0.18726517698240064, "step": 1894, "train/total_loss": 0.10937750339508057 }, { "entropy": 9.073461532592773, "epoch": 0.18736404983191615, "mean_token_accuracy": 0.7284234762191772, "num_tokens": 9858879.0, "step": 1895, "train/ce_loss": 1.130031943321228 }, { "epoch": 0.18736404983191615, "step": 1895, "train/sim_loss": 0.06640625 }, { "epoch": 0.18736404983191615, "step": 1895, "train/total_loss": 0.1794094443321228 }, { "entropy": 9.198670387268066, "epoch": 0.18746292268143167, "mean_token_accuracy": 0.7430406808853149, "num_tokens": 9864262.0, "step": 1896, "train/ce_loss": 0.4583074152469635 }, { "epoch": 0.18746292268143167, "step": 1896, "train/sim_loss": 0.03125 }, { "epoch": 0.18746292268143167, "step": 1896, "train/total_loss": 0.07708074152469635 }, { "entropy": 9.202181816101074, "epoch": 0.1875617955309472, "mean_token_accuracy": 0.6662763357162476, "num_tokens": 9869603.0, "step": 1897, "train/ce_loss": 1.7217258214950562 }, { "epoch": 0.1875617955309472, "step": 1897, "train/sim_loss": 0.09765625 }, { "epoch": 0.1875617955309472, "step": 1897, "train/total_loss": 0.2698288559913635 }, { "entropy": 9.915672302246094, "epoch": 0.18766066838046272, "mean_token_accuracy": 0.6188679337501526, "num_tokens": 9874546.0, "step": 1898, "train/ce_loss": 1.6661015251884237e-05 }, { "epoch": 0.18766066838046272, "step": 1898, "train/sim_loss": 0.06640625 }, { "epoch": 0.18766066838046272, "step": 1898, "train/total_loss": 0.06640791893005371 }, { "entropy": 10.016805648803711, "epoch": 0.18775954122997826, "mean_token_accuracy": 0.7373737096786499, "num_tokens": 9879365.0, "step": 1899, "train/ce_loss": 1.7677421569824219 }, { "epoch": 0.18775954122997826, "step": 1899, "train/sim_loss": 0.08203125 }, { "epoch": 0.18775954122997826, "step": 1899, "train/total_loss": 0.25880545377731323 }, { "epoch": 0.18785841407949377, "grad_norm": 1.1335080862045288, "learning_rate": 9.53295752361173e-06, "loss": 0.1604, "step": 1900 }, { "entropy": 9.707734107971191, "epoch": 0.18785841407949377, "mean_token_accuracy": 0.7730711102485657, "num_tokens": 9884433.0, "step": 1900, "train/ce_loss": 0.7513488531112671 }, { "epoch": 0.18785841407949377, "step": 1900, "train/sim_loss": 0.0625 }, { "epoch": 0.18785841407949377, "step": 1900, "train/total_loss": 0.13763488829135895 }, { "entropy": 9.360363006591797, "epoch": 0.1879572869290093, "mean_token_accuracy": 0.6983240246772766, "num_tokens": 9889590.0, "step": 1901, "train/ce_loss": 0.8884443640708923 }, { "epoch": 0.1879572869290093, "step": 1901, "train/sim_loss": 0.08203125 }, { "epoch": 0.1879572869290093, "step": 1901, "train/total_loss": 0.1708756983280182 }, { "entropy": 9.513907432556152, "epoch": 0.18805615977852483, "mean_token_accuracy": 0.7134052515029907, "num_tokens": 9894660.0, "step": 1902, "train/ce_loss": 1.128973364830017 }, { "epoch": 0.18805615977852483, "step": 1902, "train/sim_loss": 0.09375 }, { "epoch": 0.18805615977852483, "step": 1902, "train/total_loss": 0.2066473364830017 }, { "entropy": 9.177980422973633, "epoch": 0.18815503262804034, "mean_token_accuracy": 0.7287173867225647, "num_tokens": 9899966.0, "step": 1903, "train/ce_loss": 0.6958595514297485 }, { "epoch": 0.18815503262804034, "step": 1903, "train/sim_loss": 0.02734375 }, { "epoch": 0.18815503262804034, "step": 1903, "train/total_loss": 0.09692970663309097 }, { "entropy": 9.603252410888672, "epoch": 0.18825390547755586, "mean_token_accuracy": 0.7086092829704285, "num_tokens": 9905018.0, "step": 1904, "train/ce_loss": 1.0595810413360596 }, { "epoch": 0.18825390547755586, "step": 1904, "train/sim_loss": 0.17578125 }, { "epoch": 0.18825390547755586, "step": 1904, "train/total_loss": 0.28173935413360596 }, { "entropy": 9.92378044128418, "epoch": 0.1883527783270714, "mean_token_accuracy": 0.8280922174453735, "num_tokens": 9909953.0, "step": 1905, "train/ce_loss": 1.083510398864746 }, { "epoch": 0.1883527783270714, "step": 1905, "train/sim_loss": 0.046875 }, { "epoch": 0.1883527783270714, "step": 1905, "train/total_loss": 0.15522605180740356 }, { "entropy": 10.098663330078125, "epoch": 0.1884516511765869, "mean_token_accuracy": 0.6756032109260559, "num_tokens": 9914775.0, "step": 1906, "train/ce_loss": 2.1146256923675537 }, { "epoch": 0.1884516511765869, "step": 1906, "train/sim_loss": 0.1171875 }, { "epoch": 0.1884516511765869, "step": 1906, "train/total_loss": 0.3286500573158264 }, { "entropy": 9.783650398254395, "epoch": 0.18855052402610242, "mean_token_accuracy": 0.7022375464439392, "num_tokens": 9919760.0, "step": 1907, "train/ce_loss": 0.675777792930603 }, { "epoch": 0.18855052402610242, "step": 1907, "train/sim_loss": 0.0546875 }, { "epoch": 0.18855052402610242, "step": 1907, "train/total_loss": 0.1222652792930603 }, { "entropy": 9.392547607421875, "epoch": 0.18864939687561796, "mean_token_accuracy": 0.7565789222717285, "num_tokens": 9924979.0, "step": 1908, "train/ce_loss": 0.8832827806472778 }, { "epoch": 0.18864939687561796, "step": 1908, "train/sim_loss": 0.078125 }, { "epoch": 0.18864939687561796, "step": 1908, "train/total_loss": 0.1664532721042633 }, { "entropy": 10.167774200439453, "epoch": 0.18874826972513348, "mean_token_accuracy": 0.7160493731498718, "num_tokens": 9929806.0, "step": 1909, "train/ce_loss": 2.2639083862304688 }, { "epoch": 0.18874826972513348, "step": 1909, "train/sim_loss": 0.09375 }, { "epoch": 0.18874826972513348, "step": 1909, "train/total_loss": 0.3201408386230469 }, { "entropy": 9.179858207702637, "epoch": 0.188847142574649, "mean_token_accuracy": 0.6723940372467041, "num_tokens": 9935182.0, "step": 1910, "train/ce_loss": 0.9893306493759155 }, { "epoch": 0.188847142574649, "step": 1910, "train/sim_loss": 0.09375 }, { "epoch": 0.188847142574649, "step": 1910, "train/total_loss": 0.19268307089805603 }, { "entropy": 9.44682788848877, "epoch": 0.18894601542416453, "mean_token_accuracy": 0.7652284502983093, "num_tokens": 9940395.0, "step": 1911, "train/ce_loss": 0.8784785270690918 }, { "epoch": 0.18894601542416453, "step": 1911, "train/sim_loss": 0.0625 }, { "epoch": 0.18894601542416453, "step": 1911, "train/total_loss": 0.15034785866737366 }, { "entropy": 8.869483947753906, "epoch": 0.18904488827368005, "mean_token_accuracy": 0.7418073415756226, "num_tokens": 9945921.0, "step": 1912, "train/ce_loss": 0.8187457323074341 }, { "epoch": 0.18904488827368005, "step": 1912, "train/sim_loss": 0.02734375 }, { "epoch": 0.18904488827368005, "step": 1912, "train/total_loss": 0.10921832174062729 }, { "entropy": 9.935173034667969, "epoch": 0.18914376112319556, "mean_token_accuracy": 0.7488986849784851, "num_tokens": 9950794.0, "step": 1913, "train/ce_loss": 2.188868284225464 }, { "epoch": 0.18914376112319556, "step": 1913, "train/sim_loss": 0.09375 }, { "epoch": 0.18914376112319556, "step": 1913, "train/total_loss": 0.3126368522644043 }, { "entropy": 9.276559829711914, "epoch": 0.1892426339727111, "mean_token_accuracy": 0.7250324487686157, "num_tokens": 9956008.0, "step": 1914, "train/ce_loss": 0.7460759282112122 }, { "epoch": 0.1892426339727111, "step": 1914, "train/sim_loss": 0.08203125 }, { "epoch": 0.1892426339727111, "step": 1914, "train/total_loss": 0.15663884580135345 }, { "entropy": 9.76962661743164, "epoch": 0.1893415068222266, "mean_token_accuracy": 0.7380560040473938, "num_tokens": 9961048.0, "step": 1915, "train/ce_loss": 0.7595387101173401 }, { "epoch": 0.1893415068222266, "step": 1915, "train/sim_loss": 0.06640625 }, { "epoch": 0.1893415068222266, "step": 1915, "train/total_loss": 0.142360121011734 }, { "entropy": 9.06021785736084, "epoch": 0.18944037967174213, "mean_token_accuracy": 0.6996337175369263, "num_tokens": 9966328.0, "step": 1916, "train/ce_loss": 0.719662606716156 }, { "epoch": 0.18944037967174213, "step": 1916, "train/sim_loss": 0.0625 }, { "epoch": 0.18944037967174213, "step": 1916, "train/total_loss": 0.1344662606716156 }, { "entropy": 9.592153549194336, "epoch": 0.18953925252125767, "mean_token_accuracy": 0.7346278429031372, "num_tokens": 9971393.0, "step": 1917, "train/ce_loss": 1.203270435333252 }, { "epoch": 0.18953925252125767, "step": 1917, "train/sim_loss": 0.1015625 }, { "epoch": 0.18953925252125767, "step": 1917, "train/total_loss": 0.22188955545425415 }, { "entropy": 9.755895614624023, "epoch": 0.18963812537077318, "mean_token_accuracy": 0.807106614112854, "num_tokens": 9976375.0, "step": 1918, "train/ce_loss": 8.259370588348247e-06 }, { "epoch": 0.18963812537077318, "step": 1918, "train/sim_loss": 0.1015625 }, { "epoch": 0.18963812537077318, "step": 1918, "train/total_loss": 0.10156332701444626 }, { "entropy": 9.63601303100586, "epoch": 0.18973699822028872, "mean_token_accuracy": 0.7152875065803528, "num_tokens": 9981514.0, "step": 1919, "train/ce_loss": 0.933495283126831 }, { "epoch": 0.18973699822028872, "step": 1919, "train/sim_loss": 0.0234375 }, { "epoch": 0.18973699822028872, "step": 1919, "train/total_loss": 0.11678703129291534 }, { "epoch": 0.18983587106980424, "grad_norm": 0.9176076054573059, "learning_rate": 9.528012658853782e-06, "loss": 0.1639, "step": 1920 }, { "entropy": 9.022931098937988, "epoch": 0.18983587106980424, "mean_token_accuracy": 0.6468129754066467, "num_tokens": 9986990.0, "step": 1920, "train/ce_loss": 0.8590229749679565 }, { "epoch": 0.18983587106980424, "step": 1920, "train/sim_loss": 0.0703125 }, { "epoch": 0.18983587106980424, "step": 1920, "train/total_loss": 0.15621480345726013 }, { "entropy": 8.768945693969727, "epoch": 0.18993474391931975, "mean_token_accuracy": 0.673511266708374, "num_tokens": 9992442.0, "step": 1921, "train/ce_loss": 0.7011252045631409 }, { "epoch": 0.18993474391931975, "step": 1921, "train/sim_loss": 0.140625 }, { "epoch": 0.18993474391931975, "step": 1921, "train/total_loss": 0.21073752641677856 }, { "entropy": 9.983613014221191, "epoch": 0.1900336167688353, "mean_token_accuracy": 0.7366336584091187, "num_tokens": 9997384.0, "step": 1922, "train/ce_loss": 7.890875167504419e-06 }, { "epoch": 0.1900336167688353, "step": 1922, "train/sim_loss": 0.02734375 }, { "epoch": 0.1900336167688353, "step": 1922, "train/total_loss": 0.027344539761543274 }, { "entropy": 9.837148666381836, "epoch": 0.1901324896183508, "mean_token_accuracy": 0.6741154789924622, "num_tokens": 10002300.0, "step": 1923, "train/ce_loss": 1.8100438117980957 }, { "epoch": 0.1901324896183508, "step": 1923, "train/sim_loss": 0.078125 }, { "epoch": 0.1901324896183508, "step": 1923, "train/total_loss": 0.2591294050216675 }, { "entropy": 10.034281730651855, "epoch": 0.19023136246786632, "mean_token_accuracy": 0.7677165269851685, "num_tokens": 10007226.0, "step": 1924, "train/ce_loss": 5.1621764214360155e-06 }, { "epoch": 0.19023136246786632, "step": 1924, "train/sim_loss": 0.08203125 }, { "epoch": 0.19023136246786632, "step": 1924, "train/total_loss": 0.08203176409006119 }, { "entropy": 9.544149398803711, "epoch": 0.19033023531738186, "mean_token_accuracy": 0.6960408687591553, "num_tokens": 10012625.0, "step": 1925, "train/ce_loss": 1.2972491979599 }, { "epoch": 0.19033023531738186, "step": 1925, "train/sim_loss": 0.0859375 }, { "epoch": 0.19033023531738186, "step": 1925, "train/total_loss": 0.21566241979599 }, { "entropy": 9.560464859008789, "epoch": 0.19042910816689737, "mean_token_accuracy": 0.671897292137146, "num_tokens": 10017783.0, "step": 1926, "train/ce_loss": 0.7921290397644043 }, { "epoch": 0.19042910816689737, "step": 1926, "train/sim_loss": 0.078125 }, { "epoch": 0.19042910816689737, "step": 1926, "train/total_loss": 0.15733790397644043 }, { "entropy": 10.270628929138184, "epoch": 0.19052798101641288, "mean_token_accuracy": 0.7486486434936523, "num_tokens": 10022587.0, "step": 1927, "train/ce_loss": 1.5367345809936523 }, { "epoch": 0.19052798101641288, "step": 1927, "train/sim_loss": 0.1171875 }, { "epoch": 0.19052798101641288, "step": 1927, "train/total_loss": 0.2708609700202942 }, { "entropy": 9.204045295715332, "epoch": 0.19062685386592843, "mean_token_accuracy": 0.7177242636680603, "num_tokens": 10027979.0, "step": 1928, "train/ce_loss": 0.8198716044425964 }, { "epoch": 0.19062685386592843, "step": 1928, "train/sim_loss": 0.05859375 }, { "epoch": 0.19062685386592843, "step": 1928, "train/total_loss": 0.1405809223651886 }, { "entropy": 9.579994201660156, "epoch": 0.19072572671544394, "mean_token_accuracy": 0.7909215688705444, "num_tokens": 10033171.0, "step": 1929, "train/ce_loss": 0.8041414618492126 }, { "epoch": 0.19072572671544394, "step": 1929, "train/sim_loss": 0.0859375 }, { "epoch": 0.19072572671544394, "step": 1929, "train/total_loss": 0.16635164618492126 }, { "entropy": 10.178987503051758, "epoch": 0.19082459956495945, "mean_token_accuracy": 0.717277467250824, "num_tokens": 10037978.0, "step": 1930, "train/ce_loss": 1.842941164970398 }, { "epoch": 0.19082459956495945, "step": 1930, "train/sim_loss": 0.08984375 }, { "epoch": 0.19082459956495945, "step": 1930, "train/total_loss": 0.27413785457611084 }, { "entropy": 9.233572959899902, "epoch": 0.190923472414475, "mean_token_accuracy": 0.7350332736968994, "num_tokens": 10043339.0, "step": 1931, "train/ce_loss": 0.5585495233535767 }, { "epoch": 0.190923472414475, "step": 1931, "train/sim_loss": 0.0390625 }, { "epoch": 0.190923472414475, "step": 1931, "train/total_loss": 0.09491745382547379 }, { "entropy": 9.72354793548584, "epoch": 0.1910223452639905, "mean_token_accuracy": 0.7014681696891785, "num_tokens": 10048394.0, "step": 1932, "train/ce_loss": 1.6334415674209595 }, { "epoch": 0.1910223452639905, "step": 1932, "train/sim_loss": 0.21875 }, { "epoch": 0.1910223452639905, "step": 1932, "train/total_loss": 0.382094144821167 }, { "entropy": 9.566108703613281, "epoch": 0.19112121811350602, "mean_token_accuracy": 0.7293233275413513, "num_tokens": 10053524.0, "step": 1933, "train/ce_loss": 0.9320631623268127 }, { "epoch": 0.19112121811350602, "step": 1933, "train/sim_loss": 0.04296875 }, { "epoch": 0.19112121811350602, "step": 1933, "train/total_loss": 0.13617506623268127 }, { "entropy": 9.240278244018555, "epoch": 0.19122009096302156, "mean_token_accuracy": 0.719260036945343, "num_tokens": 10058882.0, "step": 1934, "train/ce_loss": 1.0645043849945068 }, { "epoch": 0.19122009096302156, "step": 1934, "train/sim_loss": 0.0703125 }, { "epoch": 0.19122009096302156, "step": 1934, "train/total_loss": 0.17676293849945068 }, { "entropy": 9.086050033569336, "epoch": 0.19131896381253707, "mean_token_accuracy": 0.7275574207305908, "num_tokens": 10064339.0, "step": 1935, "train/ce_loss": 1.5135074853897095 }, { "epoch": 0.19131896381253707, "step": 1935, "train/sim_loss": 0.09375 }, { "epoch": 0.19131896381253707, "step": 1935, "train/total_loss": 0.24510075151920319 }, { "entropy": 9.469228744506836, "epoch": 0.1914178366620526, "mean_token_accuracy": 0.7276478409767151, "num_tokens": 10069518.0, "step": 1936, "train/ce_loss": 0.5873953700065613 }, { "epoch": 0.1914178366620526, "step": 1936, "train/sim_loss": 0.05859375 }, { "epoch": 0.1914178366620526, "step": 1936, "train/total_loss": 0.1173332929611206 }, { "entropy": 9.303143501281738, "epoch": 0.19151670951156813, "mean_token_accuracy": 0.7047058939933777, "num_tokens": 10074828.0, "step": 1937, "train/ce_loss": 1.2339478731155396 }, { "epoch": 0.19151670951156813, "step": 1937, "train/sim_loss": 0.1015625 }, { "epoch": 0.19151670951156813, "step": 1937, "train/total_loss": 0.22495728731155396 }, { "entropy": 9.396015167236328, "epoch": 0.19161558236108364, "mean_token_accuracy": 0.6743002533912659, "num_tokens": 10080079.0, "step": 1938, "train/ce_loss": 1.1823872327804565 }, { "epoch": 0.19161558236108364, "step": 1938, "train/sim_loss": 0.05859375 }, { "epoch": 0.19161558236108364, "step": 1938, "train/total_loss": 0.17683246731758118 }, { "entropy": 9.46355152130127, "epoch": 0.19171445521059918, "mean_token_accuracy": 0.737062931060791, "num_tokens": 10085240.0, "step": 1939, "train/ce_loss": 0.6518946290016174 }, { "epoch": 0.19171445521059918, "step": 1939, "train/sim_loss": 0.0703125 }, { "epoch": 0.19171445521059918, "step": 1939, "train/total_loss": 0.13550196588039398 }, { "epoch": 0.1918133280601147, "grad_norm": 0.878887414932251, "learning_rate": 9.523067794095833e-06, "loss": 0.1717, "step": 1940 }, { "entropy": 8.839654922485352, "epoch": 0.1918133280601147, "mean_token_accuracy": 0.7251356244087219, "num_tokens": 10090860.0, "step": 1940, "train/ce_loss": 0.914640486240387 }, { "epoch": 0.1918133280601147, "step": 1940, "train/sim_loss": 0.09375 }, { "epoch": 0.1918133280601147, "step": 1940, "train/total_loss": 0.18521404266357422 }, { "entropy": 9.40005874633789, "epoch": 0.1919122009096302, "mean_token_accuracy": 0.6986469626426697, "num_tokens": 10096084.0, "step": 1941, "train/ce_loss": 1.039476752281189 }, { "epoch": 0.1919122009096302, "step": 1941, "train/sim_loss": 0.078125 }, { "epoch": 0.1919122009096302, "step": 1941, "train/total_loss": 0.18207266926765442 }, { "entropy": 9.450826644897461, "epoch": 0.19201107375914575, "mean_token_accuracy": 0.6740914583206177, "num_tokens": 10101355.0, "step": 1942, "train/ce_loss": 0.4420863091945648 }, { "epoch": 0.19201107375914575, "step": 1942, "train/sim_loss": 0.09765625 }, { "epoch": 0.19201107375914575, "step": 1942, "train/total_loss": 0.14186488091945648 }, { "entropy": 9.06132698059082, "epoch": 0.19210994660866126, "mean_token_accuracy": 0.7876213788986206, "num_tokens": 10106650.0, "step": 1943, "train/ce_loss": 0.6898800134658813 }, { "epoch": 0.19210994660866126, "step": 1943, "train/sim_loss": 0.03125 }, { "epoch": 0.19210994660866126, "step": 1943, "train/total_loss": 0.10023800283670425 }, { "entropy": 9.049436569213867, "epoch": 0.19220881945817678, "mean_token_accuracy": 0.7545090317726135, "num_tokens": 10112168.0, "step": 1944, "train/ce_loss": 0.8400173783302307 }, { "epoch": 0.19220881945817678, "step": 1944, "train/sim_loss": 0.08984375 }, { "epoch": 0.19220881945817678, "step": 1944, "train/total_loss": 0.17384549975395203 }, { "entropy": 9.39749813079834, "epoch": 0.19230769230769232, "mean_token_accuracy": 0.7249322533607483, "num_tokens": 10117366.0, "step": 1945, "train/ce_loss": 1.7692957044346258e-05 }, { "epoch": 0.19230769230769232, "step": 1945, "train/sim_loss": 0.09375 }, { "epoch": 0.19230769230769232, "step": 1945, "train/total_loss": 0.09375176578760147 }, { "entropy": 9.324932098388672, "epoch": 0.19240656515720783, "mean_token_accuracy": 0.7276536226272583, "num_tokens": 10122549.0, "step": 1946, "train/ce_loss": 0.9241024255752563 }, { "epoch": 0.19240656515720783, "step": 1946, "train/sim_loss": 0.09375 }, { "epoch": 0.19240656515720783, "step": 1946, "train/total_loss": 0.18616023659706116 }, { "entropy": 9.378195762634277, "epoch": 0.19250543800672335, "mean_token_accuracy": 0.7531734704971313, "num_tokens": 10127689.0, "step": 1947, "train/ce_loss": 1.0089960098266602 }, { "epoch": 0.19250543800672335, "step": 1947, "train/sim_loss": 0.01953125 }, { "epoch": 0.19250543800672335, "step": 1947, "train/total_loss": 0.1204308494925499 }, { "entropy": 8.946748733520508, "epoch": 0.1926043108562389, "mean_token_accuracy": 0.7372187972068787, "num_tokens": 10133186.0, "step": 1948, "train/ce_loss": 1.051811933517456 }, { "epoch": 0.1926043108562389, "step": 1948, "train/sim_loss": 0.09765625 }, { "epoch": 0.1926043108562389, "step": 1948, "train/total_loss": 0.20283743739128113 }, { "entropy": 9.343496322631836, "epoch": 0.1927031837057544, "mean_token_accuracy": 0.7685185074806213, "num_tokens": 10138462.0, "step": 1949, "train/ce_loss": 0.6328471302986145 }, { "epoch": 0.1927031837057544, "step": 1949, "train/sim_loss": 0.0546875 }, { "epoch": 0.1927031837057544, "step": 1949, "train/total_loss": 0.11797221750020981 }, { "entropy": 9.369144439697266, "epoch": 0.1928020565552699, "mean_token_accuracy": 0.7584415674209595, "num_tokens": 10143716.0, "step": 1950, "train/ce_loss": 0.37200042605400085 }, { "epoch": 0.1928020565552699, "step": 1950, "train/sim_loss": 0.0703125 }, { "epoch": 0.1928020565552699, "step": 1950, "train/total_loss": 0.10751254856586456 }, { "entropy": 9.393310546875, "epoch": 0.19290092940478545, "mean_token_accuracy": 0.7613940834999084, "num_tokens": 10148974.0, "step": 1951, "train/ce_loss": 0.6348753571510315 }, { "epoch": 0.19290092940478545, "step": 1951, "train/sim_loss": 0.078125 }, { "epoch": 0.19290092940478545, "step": 1951, "train/total_loss": 0.14161252975463867 }, { "entropy": 9.001161575317383, "epoch": 0.19299980225430097, "mean_token_accuracy": 0.7382550239562988, "num_tokens": 10154489.0, "step": 1952, "train/ce_loss": 0.3051251769065857 }, { "epoch": 0.19299980225430097, "step": 1952, "train/sim_loss": 0.07421875 }, { "epoch": 0.19299980225430097, "step": 1952, "train/total_loss": 0.10473126918077469 }, { "entropy": 8.901924133300781, "epoch": 0.19309867510381648, "mean_token_accuracy": 0.7837837934494019, "num_tokens": 10160000.0, "step": 1953, "train/ce_loss": 0.7139679193496704 }, { "epoch": 0.19309867510381648, "step": 1953, "train/sim_loss": 0.05859375 }, { "epoch": 0.19309867510381648, "step": 1953, "train/total_loss": 0.12999054789543152 }, { "entropy": 9.370849609375, "epoch": 0.19319754795333202, "mean_token_accuracy": 0.7202072739601135, "num_tokens": 10165235.0, "step": 1954, "train/ce_loss": 0.9253755211830139 }, { "epoch": 0.19319754795333202, "step": 1954, "train/sim_loss": 0.10546875 }, { "epoch": 0.19319754795333202, "step": 1954, "train/total_loss": 0.1980063021183014 }, { "entropy": 9.580575942993164, "epoch": 0.19329642080284754, "mean_token_accuracy": 0.677570104598999, "num_tokens": 10170374.0, "step": 1955, "train/ce_loss": 1.3786916732788086 }, { "epoch": 0.19329642080284754, "step": 1955, "train/sim_loss": 0.05859375 }, { "epoch": 0.19329642080284754, "step": 1955, "train/total_loss": 0.19646291434764862 }, { "entropy": 9.054275512695312, "epoch": 0.19339529365236305, "mean_token_accuracy": 0.7981651425361633, "num_tokens": 10175700.0, "step": 1956, "train/ce_loss": 0.5524548292160034 }, { "epoch": 0.19339529365236305, "step": 1956, "train/sim_loss": 0.02734375 }, { "epoch": 0.19339529365236305, "step": 1956, "train/total_loss": 0.08258923888206482 }, { "entropy": 9.024927139282227, "epoch": 0.1934941665018786, "mean_token_accuracy": 0.7222222089767456, "num_tokens": 10181162.0, "step": 1957, "train/ce_loss": 1.0508496761322021 }, { "epoch": 0.1934941665018786, "step": 1957, "train/sim_loss": 0.046875 }, { "epoch": 0.1934941665018786, "step": 1957, "train/total_loss": 0.15195997059345245 }, { "entropy": 9.6265869140625, "epoch": 0.1935930393513941, "mean_token_accuracy": 0.7394034266471863, "num_tokens": 10186273.0, "step": 1958, "train/ce_loss": 1.0031788349151611 }, { "epoch": 0.1935930393513941, "step": 1958, "train/sim_loss": 0.04296875 }, { "epoch": 0.1935930393513941, "step": 1958, "train/total_loss": 0.14328664541244507 }, { "entropy": 9.224421501159668, "epoch": 0.19369191220090964, "mean_token_accuracy": 0.6613076329231262, "num_tokens": 10191710.0, "step": 1959, "train/ce_loss": 1.8621876239776611 }, { "epoch": 0.19369191220090964, "step": 1959, "train/sim_loss": 0.09765625 }, { "epoch": 0.19369191220090964, "step": 1959, "train/total_loss": 0.2838750183582306 }, { "epoch": 0.19379078505042516, "grad_norm": 1.1886433362960815, "learning_rate": 9.518122929337883e-06, "loss": 0.159, "step": 1960 }, { "entropy": 9.636155128479004, "epoch": 0.19379078505042516, "mean_token_accuracy": 0.7268292903900146, "num_tokens": 10196723.0, "step": 1960, "train/ce_loss": 1.3479249477386475 }, { "epoch": 0.19379078505042516, "step": 1960, "train/sim_loss": 0.13671875 }, { "epoch": 0.19379078505042516, "step": 1960, "train/total_loss": 0.2715112566947937 }, { "entropy": 9.221063613891602, "epoch": 0.19388965789994067, "mean_token_accuracy": 0.7819383144378662, "num_tokens": 10202077.0, "step": 1961, "train/ce_loss": 0.7505377531051636 }, { "epoch": 0.19388965789994067, "step": 1961, "train/sim_loss": 0.0546875 }, { "epoch": 0.19388965789994067, "step": 1961, "train/total_loss": 0.12974128127098083 }, { "entropy": 9.390584945678711, "epoch": 0.1939885307494562, "mean_token_accuracy": 0.7436241507530212, "num_tokens": 10207246.0, "step": 1962, "train/ce_loss": 0.6959307789802551 }, { "epoch": 0.1939885307494562, "step": 1962, "train/sim_loss": 0.09375 }, { "epoch": 0.1939885307494562, "step": 1962, "train/total_loss": 0.16334307193756104 }, { "entropy": 9.111087799072266, "epoch": 0.19408740359897173, "mean_token_accuracy": 0.8062953948974609, "num_tokens": 10212568.0, "step": 1963, "train/ce_loss": 0.6606025695800781 }, { "epoch": 0.19408740359897173, "step": 1963, "train/sim_loss": 0.09765625 }, { "epoch": 0.19408740359897173, "step": 1963, "train/total_loss": 0.16371650993824005 }, { "entropy": 9.369743347167969, "epoch": 0.19418627644848724, "mean_token_accuracy": 0.7235984206199646, "num_tokens": 10217814.0, "step": 1964, "train/ce_loss": 0.7288702130317688 }, { "epoch": 0.19418627644848724, "step": 1964, "train/sim_loss": 0.10546875 }, { "epoch": 0.19418627644848724, "step": 1964, "train/total_loss": 0.17835578322410583 }, { "entropy": 10.039051055908203, "epoch": 0.19428514929800278, "mean_token_accuracy": 0.7597402334213257, "num_tokens": 10222727.0, "step": 1965, "train/ce_loss": 3.78113218175713e-05 }, { "epoch": 0.19428514929800278, "step": 1965, "train/sim_loss": 0.05859375 }, { "epoch": 0.19428514929800278, "step": 1965, "train/total_loss": 0.05859753116965294 }, { "entropy": 9.540410995483398, "epoch": 0.1943840221475183, "mean_token_accuracy": 0.6957746744155884, "num_tokens": 10227884.0, "step": 1966, "train/ce_loss": 1.4989935159683228 }, { "epoch": 0.1943840221475183, "step": 1966, "train/sim_loss": 0.1328125 }, { "epoch": 0.1943840221475183, "step": 1966, "train/total_loss": 0.28271186351776123 }, { "entropy": 9.632904052734375, "epoch": 0.1944828949970338, "mean_token_accuracy": 0.7837445735931396, "num_tokens": 10233008.0, "step": 1967, "train/ce_loss": 0.5353147983551025 }, { "epoch": 0.1944828949970338, "step": 1967, "train/sim_loss": 0.05078125 }, { "epoch": 0.1944828949970338, "step": 1967, "train/total_loss": 0.10431273281574249 }, { "entropy": 9.63956069946289, "epoch": 0.19458176784654935, "mean_token_accuracy": 0.7118644118309021, "num_tokens": 10238022.0, "step": 1968, "train/ce_loss": 8.986912689579185e-06 }, { "epoch": 0.19458176784654935, "step": 1968, "train/sim_loss": 0.0625 }, { "epoch": 0.19458176784654935, "step": 1968, "train/total_loss": 0.06250090152025223 }, { "entropy": 10.186599731445312, "epoch": 0.19468064069606486, "mean_token_accuracy": 0.6994949579238892, "num_tokens": 10242853.0, "step": 1969, "train/ce_loss": 9.860812497208826e-06 }, { "epoch": 0.19468064069606486, "step": 1969, "train/sim_loss": 0.02734375 }, { "epoch": 0.19468064069606486, "step": 1969, "train/total_loss": 0.027344735339283943 }, { "entropy": 9.200822830200195, "epoch": 0.19477951354558037, "mean_token_accuracy": 0.7674919366836548, "num_tokens": 10248207.0, "step": 1970, "train/ce_loss": 0.49040570855140686 }, { "epoch": 0.19477951354558037, "step": 1970, "train/sim_loss": 0.05859375 }, { "epoch": 0.19477951354558037, "step": 1970, "train/total_loss": 0.10763432085514069 }, { "entropy": 9.327460289001465, "epoch": 0.19487838639509591, "mean_token_accuracy": 0.7048054933547974, "num_tokens": 10253554.0, "step": 1971, "train/ce_loss": 1.5515893697738647 }, { "epoch": 0.19487838639509591, "step": 1971, "train/sim_loss": 0.07421875 }, { "epoch": 0.19487838639509591, "step": 1971, "train/total_loss": 0.22937768697738647 }, { "entropy": 9.314977645874023, "epoch": 0.19497725924461143, "mean_token_accuracy": 0.6891566514968872, "num_tokens": 10258854.0, "step": 1972, "train/ce_loss": 0.5195350646972656 }, { "epoch": 0.19497725924461143, "step": 1972, "train/sim_loss": 0.0546875 }, { "epoch": 0.19497725924461143, "step": 1972, "train/total_loss": 0.1066410094499588 }, { "entropy": 9.803762435913086, "epoch": 0.19507613209412694, "mean_token_accuracy": 0.7456446290016174, "num_tokens": 10263834.0, "step": 1973, "train/ce_loss": 1.242767333984375 }, { "epoch": 0.19507613209412694, "step": 1973, "train/sim_loss": 0.0703125 }, { "epoch": 0.19507613209412694, "step": 1973, "train/total_loss": 0.19458922743797302 }, { "entropy": 9.587112426757812, "epoch": 0.19517500494364248, "mean_token_accuracy": 0.7117117047309875, "num_tokens": 10269022.0, "step": 1974, "train/ce_loss": 1.0832068920135498 }, { "epoch": 0.19517500494364248, "step": 1974, "train/sim_loss": 0.04296875 }, { "epoch": 0.19517500494364248, "step": 1974, "train/total_loss": 0.1512894332408905 }, { "entropy": 9.864027976989746, "epoch": 0.195273877793158, "mean_token_accuracy": 0.7209705114364624, "num_tokens": 10273997.0, "step": 1975, "train/ce_loss": 1.0521684885025024 }, { "epoch": 0.195273877793158, "step": 1975, "train/sim_loss": 0.06640625 }, { "epoch": 0.195273877793158, "step": 1975, "train/total_loss": 0.1716231107711792 }, { "entropy": 9.501541137695312, "epoch": 0.1953727506426735, "mean_token_accuracy": 0.7556179761886597, "num_tokens": 10279164.0, "step": 1976, "train/ce_loss": 1.044768214225769 }, { "epoch": 0.1953727506426735, "step": 1976, "train/sim_loss": 0.08984375 }, { "epoch": 0.1953727506426735, "step": 1976, "train/total_loss": 0.19432057440280914 }, { "entropy": 9.759073257446289, "epoch": 0.19547162349218905, "mean_token_accuracy": 0.735433042049408, "num_tokens": 10284245.0, "step": 1977, "train/ce_loss": 1.3436446351988707e-05 }, { "epoch": 0.19547162349218905, "step": 1977, "train/sim_loss": 0.046875 }, { "epoch": 0.19547162349218905, "step": 1977, "train/total_loss": 0.046876344829797745 }, { "entropy": 9.271638870239258, "epoch": 0.19557049634170456, "mean_token_accuracy": 0.7273743152618408, "num_tokens": 10289591.0, "step": 1978, "train/ce_loss": 1.3765709400177002 }, { "epoch": 0.19557049634170456, "step": 1978, "train/sim_loss": 0.12890625 }, { "epoch": 0.19557049634170456, "step": 1978, "train/total_loss": 0.266563355922699 }, { "entropy": 10.238898277282715, "epoch": 0.19566936919122008, "mean_token_accuracy": 0.762273907661438, "num_tokens": 10294361.0, "step": 1979, "train/ce_loss": 8.265675205620937e-06 }, { "epoch": 0.19566936919122008, "step": 1979, "train/sim_loss": 0.02734375 }, { "epoch": 0.19566936919122008, "step": 1979, "train/total_loss": 0.02734457701444626 }, { "epoch": 0.19576824204073562, "grad_norm": 0.8895950317382812, "learning_rate": 9.513178064579934e-06, "loss": 0.1545, "step": 1980 }, { "entropy": 9.463583946228027, "epoch": 0.19576824204073562, "mean_token_accuracy": 0.707317054271698, "num_tokens": 10299584.0, "step": 1980, "train/ce_loss": 0.8408167362213135 }, { "epoch": 0.19576824204073562, "step": 1980, "train/sim_loss": 0.1015625 }, { "epoch": 0.19576824204073562, "step": 1980, "train/total_loss": 0.18564417958259583 }, { "entropy": 9.447678565979004, "epoch": 0.19586711489025113, "mean_token_accuracy": 0.6492537260055542, "num_tokens": 10304859.0, "step": 1981, "train/ce_loss": 3.376751010364387e-06 }, { "epoch": 0.19586711489025113, "step": 1981, "train/sim_loss": 0.05859375 }, { "epoch": 0.19586711489025113, "step": 1981, "train/total_loss": 0.05859408900141716 }, { "entropy": 9.249719619750977, "epoch": 0.19596598773976667, "mean_token_accuracy": 0.6746126413345337, "num_tokens": 10310173.0, "step": 1982, "train/ce_loss": 1.762305498123169 }, { "epoch": 0.19596598773976667, "step": 1982, "train/sim_loss": 0.1015625 }, { "epoch": 0.19596598773976667, "step": 1982, "train/total_loss": 0.2777930498123169 }, { "entropy": 9.396759033203125, "epoch": 0.19606486058928219, "mean_token_accuracy": 0.7430232763290405, "num_tokens": 10315525.0, "step": 1983, "train/ce_loss": 0.5767508745193481 }, { "epoch": 0.19606486058928219, "step": 1983, "train/sim_loss": 0.02734375 }, { "epoch": 0.19606486058928219, "step": 1983, "train/total_loss": 0.08501884341239929 }, { "entropy": 9.488676071166992, "epoch": 0.1961637334387977, "mean_token_accuracy": 0.8308921456336975, "num_tokens": 10320712.0, "step": 1984, "train/ce_loss": 0.8398823738098145 }, { "epoch": 0.1961637334387977, "step": 1984, "train/sim_loss": 0.03125 }, { "epoch": 0.1961637334387977, "step": 1984, "train/total_loss": 0.1152382418513298 }, { "entropy": 9.176420211791992, "epoch": 0.19626260628831324, "mean_token_accuracy": 0.7489361763000488, "num_tokens": 10326162.0, "step": 1985, "train/ce_loss": 0.7047630548477173 }, { "epoch": 0.19626260628831324, "step": 1985, "train/sim_loss": 0.046875 }, { "epoch": 0.19626260628831324, "step": 1985, "train/total_loss": 0.11735130846500397 }, { "entropy": 9.40720272064209, "epoch": 0.19636147913782875, "mean_token_accuracy": 0.7972292304039001, "num_tokens": 10331390.0, "step": 1986, "train/ce_loss": 0.6272144317626953 }, { "epoch": 0.19636147913782875, "step": 1986, "train/sim_loss": 0.0546875 }, { "epoch": 0.19636147913782875, "step": 1986, "train/total_loss": 0.11740894615650177 }, { "entropy": 9.483461380004883, "epoch": 0.19646035198734427, "mean_token_accuracy": 0.7078085541725159, "num_tokens": 10336778.0, "step": 1987, "train/ce_loss": 1.3223289251327515 }, { "epoch": 0.19646035198734427, "step": 1987, "train/sim_loss": 0.12890625 }, { "epoch": 0.19646035198734427, "step": 1987, "train/total_loss": 0.2611391544342041 }, { "entropy": 9.039703369140625, "epoch": 0.1965592248368598, "mean_token_accuracy": 0.8130000233650208, "num_tokens": 10342263.0, "step": 1988, "train/ce_loss": 0.38791629672050476 }, { "epoch": 0.1965592248368598, "step": 1988, "train/sim_loss": 0.0234375 }, { "epoch": 0.1965592248368598, "step": 1988, "train/total_loss": 0.062229130417108536 }, { "entropy": 10.164739608764648, "epoch": 0.19665809768637532, "mean_token_accuracy": 0.7388059496879578, "num_tokens": 10347104.0, "step": 1989, "train/ce_loss": 0.8840498328208923 }, { "epoch": 0.19665809768637532, "step": 1989, "train/sim_loss": 0.109375 }, { "epoch": 0.19665809768637532, "step": 1989, "train/total_loss": 0.19777998328208923 }, { "entropy": 9.35018539428711, "epoch": 0.19675697053589083, "mean_token_accuracy": 0.7886179089546204, "num_tokens": 10352411.0, "step": 1990, "train/ce_loss": 1.022213339805603 }, { "epoch": 0.19675697053589083, "step": 1990, "train/sim_loss": 0.078125 }, { "epoch": 0.19675697053589083, "step": 1990, "train/total_loss": 0.18034633994102478 }, { "entropy": 8.978738784790039, "epoch": 0.19685584338540638, "mean_token_accuracy": 0.728728711605072, "num_tokens": 10357881.0, "step": 1991, "train/ce_loss": 1.0197044610977173 }, { "epoch": 0.19685584338540638, "step": 1991, "train/sim_loss": 0.0703125 }, { "epoch": 0.19685584338540638, "step": 1991, "train/total_loss": 0.17228294909000397 }, { "entropy": 9.523167610168457, "epoch": 0.1969547162349219, "mean_token_accuracy": 0.7468531727790833, "num_tokens": 10363079.0, "step": 1992, "train/ce_loss": 0.4212130010128021 }, { "epoch": 0.1969547162349219, "step": 1992, "train/sim_loss": 0.02734375 }, { "epoch": 0.1969547162349219, "step": 1992, "train/total_loss": 0.06946505606174469 }, { "entropy": 9.488813400268555, "epoch": 0.1970535890844374, "mean_token_accuracy": 0.756035566329956, "num_tokens": 10368308.0, "step": 1993, "train/ce_loss": 0.8103978037834167 }, { "epoch": 0.1970535890844374, "step": 1993, "train/sim_loss": 0.12109375 }, { "epoch": 0.1970535890844374, "step": 1993, "train/total_loss": 0.20213353633880615 }, { "entropy": 9.04245376586914, "epoch": 0.19715246193395294, "mean_token_accuracy": 0.6892712712287903, "num_tokens": 10373763.0, "step": 1994, "train/ce_loss": 0.8689247965812683 }, { "epoch": 0.19715246193395294, "step": 1994, "train/sim_loss": 0.109375 }, { "epoch": 0.19715246193395294, "step": 1994, "train/total_loss": 0.1962674856185913 }, { "entropy": 9.254375457763672, "epoch": 0.19725133478346846, "mean_token_accuracy": 0.711442768573761, "num_tokens": 10379021.0, "step": 1995, "train/ce_loss": 1.226904034614563 }, { "epoch": 0.19725133478346846, "step": 1995, "train/sim_loss": 0.1015625 }, { "epoch": 0.19725133478346846, "step": 1995, "train/total_loss": 0.22425290942192078 }, { "entropy": 9.452052116394043, "epoch": 0.19735020763298397, "mean_token_accuracy": 0.676701545715332, "num_tokens": 10384240.0, "step": 1996, "train/ce_loss": 1.1110657453536987 }, { "epoch": 0.19735020763298397, "step": 1996, "train/sim_loss": 0.125 }, { "epoch": 0.19735020763298397, "step": 1996, "train/total_loss": 0.23610657453536987 }, { "entropy": 9.328840255737305, "epoch": 0.1974490804824995, "mean_token_accuracy": 0.7708095908164978, "num_tokens": 10389549.0, "step": 1997, "train/ce_loss": 0.5044722557067871 }, { "epoch": 0.1974490804824995, "step": 1997, "train/sim_loss": 0.06640625 }, { "epoch": 0.1974490804824995, "step": 1997, "train/total_loss": 0.11685347557067871 }, { "entropy": 9.00434684753418, "epoch": 0.19754795333201502, "mean_token_accuracy": 0.7303252816200256, "num_tokens": 10394939.0, "step": 1998, "train/ce_loss": 0.9697033166885376 }, { "epoch": 0.19754795333201502, "step": 1998, "train/sim_loss": 0.09375 }, { "epoch": 0.19754795333201502, "step": 1998, "train/total_loss": 0.190720334649086 }, { "entropy": 9.811996459960938, "epoch": 0.19764682618153054, "mean_token_accuracy": 0.6995447874069214, "num_tokens": 10399994.0, "step": 1999, "train/ce_loss": 1.406103253364563 }, { "epoch": 0.19764682618153054, "step": 1999, "train/sim_loss": 0.09765625 }, { "epoch": 0.19764682618153054, "step": 1999, "train/total_loss": 0.23826657235622406 }, { "epoch": 0.19774569903104608, "grad_norm": 0.9430355429649353, "learning_rate": 9.508233199821986e-06, "loss": 0.1605, "step": 2000 }, { "entropy": 9.675430297851562, "epoch": 0.19774569903104608, "mean_token_accuracy": 0.7976366281509399, "num_tokens": 10405114.0, "step": 2000, "train/ce_loss": 0.9603677988052368 }, { "epoch": 0.19774569903104608, "step": 2000, "train/sim_loss": 0.05859375 }, { "epoch": 0.19774569903104608, "step": 2000, "train/total_loss": 0.15463054180145264 }, { "entropy": 9.420642852783203, "epoch": 0.1978445718805616, "mean_token_accuracy": 0.7375178337097168, "num_tokens": 10410268.0, "step": 2001, "train/ce_loss": 0.8256149888038635 }, { "epoch": 0.1978445718805616, "step": 2001, "train/sim_loss": 0.06640625 }, { "epoch": 0.1978445718805616, "step": 2001, "train/total_loss": 0.14896774291992188 }, { "entropy": 9.433884620666504, "epoch": 0.19794344473007713, "mean_token_accuracy": 0.7377892136573792, "num_tokens": 10415502.0, "step": 2002, "train/ce_loss": 1.000722885131836 }, { "epoch": 0.19794344473007713, "step": 2002, "train/sim_loss": 0.11328125 }, { "epoch": 0.19794344473007713, "step": 2002, "train/total_loss": 0.21335354447364807 }, { "entropy": 9.553163528442383, "epoch": 0.19804231757959265, "mean_token_accuracy": 0.7405475974082947, "num_tokens": 10420738.0, "step": 2003, "train/ce_loss": 1.2696847915649414 }, { "epoch": 0.19804231757959265, "step": 2003, "train/sim_loss": 0.109375 }, { "epoch": 0.19804231757959265, "step": 2003, "train/total_loss": 0.23634348809719086 }, { "entropy": 9.052651405334473, "epoch": 0.19814119042910816, "mean_token_accuracy": 0.7702127695083618, "num_tokens": 10426204.0, "step": 2004, "train/ce_loss": 0.8379826545715332 }, { "epoch": 0.19814119042910816, "step": 2004, "train/sim_loss": 0.0546875 }, { "epoch": 0.19814119042910816, "step": 2004, "train/total_loss": 0.13848575949668884 }, { "entropy": 9.25126838684082, "epoch": 0.1982400632786237, "mean_token_accuracy": 0.774193525314331, "num_tokens": 10431436.0, "step": 2005, "train/ce_loss": 0.679263710975647 }, { "epoch": 0.1982400632786237, "step": 2005, "train/sim_loss": 0.08203125 }, { "epoch": 0.1982400632786237, "step": 2005, "train/total_loss": 0.14995762705802917 }, { "entropy": 9.719476699829102, "epoch": 0.19833893612813921, "mean_token_accuracy": 0.8030534386634827, "num_tokens": 10436726.0, "step": 2006, "train/ce_loss": 0.9897971749305725 }, { "epoch": 0.19833893612813921, "step": 2006, "train/sim_loss": 0.1015625 }, { "epoch": 0.19833893612813921, "step": 2006, "train/total_loss": 0.20054221153259277 }, { "entropy": 9.661178588867188, "epoch": 0.19843780897765473, "mean_token_accuracy": 0.7221373915672302, "num_tokens": 10441828.0, "step": 2007, "train/ce_loss": 1.348845362663269 }, { "epoch": 0.19843780897765473, "step": 2007, "train/sim_loss": 0.09375 }, { "epoch": 0.19843780897765473, "step": 2007, "train/total_loss": 0.2286345362663269 }, { "entropy": 9.856801986694336, "epoch": 0.19853668182717027, "mean_token_accuracy": 0.7596774101257324, "num_tokens": 10446866.0, "step": 2008, "train/ce_loss": 6.274824954743963e-06 }, { "epoch": 0.19853668182717027, "step": 2008, "train/sim_loss": 0.0234375 }, { "epoch": 0.19853668182717027, "step": 2008, "train/total_loss": 0.02343812771141529 }, { "entropy": 9.621440887451172, "epoch": 0.19863555467668578, "mean_token_accuracy": 0.7267355918884277, "num_tokens": 10452005.0, "step": 2009, "train/ce_loss": 1.4356766939163208 }, { "epoch": 0.19863555467668578, "step": 2009, "train/sim_loss": 0.11328125 }, { "epoch": 0.19863555467668578, "step": 2009, "train/total_loss": 0.25684893131256104 }, { "entropy": 8.885970115661621, "epoch": 0.1987344275262013, "mean_token_accuracy": 0.7532588243484497, "num_tokens": 10457556.0, "step": 2010, "train/ce_loss": 0.6419457793235779 }, { "epoch": 0.1987344275262013, "step": 2010, "train/sim_loss": 0.03125 }, { "epoch": 0.1987344275262013, "step": 2010, "train/total_loss": 0.09544458240270615 }, { "entropy": 9.021108627319336, "epoch": 0.19883330037571684, "mean_token_accuracy": 0.7590000033378601, "num_tokens": 10462993.0, "step": 2011, "train/ce_loss": 0.4544513523578644 }, { "epoch": 0.19883330037571684, "step": 2011, "train/sim_loss": 0.02734375 }, { "epoch": 0.19883330037571684, "step": 2011, "train/total_loss": 0.07278888672590256 }, { "entropy": 9.056339263916016, "epoch": 0.19893217322523235, "mean_token_accuracy": 0.7043294906616211, "num_tokens": 10468374.0, "step": 2012, "train/ce_loss": 0.8080570101737976 }, { "epoch": 0.19893217322523235, "step": 2012, "train/sim_loss": 0.046875 }, { "epoch": 0.19893217322523235, "step": 2012, "train/total_loss": 0.127680703997612 }, { "entropy": 10.461679458618164, "epoch": 0.19903104607474786, "mean_token_accuracy": 0.6085526347160339, "num_tokens": 10473075.0, "step": 2013, "train/ce_loss": 5.331155776977539 }, { "epoch": 0.19903104607474786, "step": 2013, "train/sim_loss": 0.09375 }, { "epoch": 0.19903104607474786, "step": 2013, "train/total_loss": 0.626865565776825 }, { "entropy": 9.206826210021973, "epoch": 0.1991299189242634, "mean_token_accuracy": 0.7319098711013794, "num_tokens": 10478410.0, "step": 2014, "train/ce_loss": 1.3367818593978882 }, { "epoch": 0.1991299189242634, "step": 2014, "train/sim_loss": 0.07421875 }, { "epoch": 0.1991299189242634, "step": 2014, "train/total_loss": 0.20789693295955658 }, { "entropy": 9.818578720092773, "epoch": 0.19922879177377892, "mean_token_accuracy": 0.7318255305290222, "num_tokens": 10483465.0, "step": 2015, "train/ce_loss": 1.089387387764873e-05 }, { "epoch": 0.19922879177377892, "step": 2015, "train/sim_loss": 0.0625 }, { "epoch": 0.19922879177377892, "step": 2015, "train/total_loss": 0.06250108778476715 }, { "entropy": 10.016085624694824, "epoch": 0.19932766462329443, "mean_token_accuracy": 0.7523629665374756, "num_tokens": 10488400.0, "step": 2016, "train/ce_loss": 6.818392648710869e-06 }, { "epoch": 0.19932766462329443, "step": 2016, "train/sim_loss": 0.03125 }, { "epoch": 0.19932766462329443, "step": 2016, "train/total_loss": 0.03125068172812462 }, { "entropy": 8.950575828552246, "epoch": 0.19942653747280997, "mean_token_accuracy": 0.7698113322257996, "num_tokens": 10493922.0, "step": 2017, "train/ce_loss": 0.6243674159049988 }, { "epoch": 0.19942653747280997, "step": 2017, "train/sim_loss": 0.05859375 }, { "epoch": 0.19942653747280997, "step": 2017, "train/total_loss": 0.12103049457073212 }, { "entropy": 10.060829162597656, "epoch": 0.19952541032232549, "mean_token_accuracy": 0.6961206793785095, "num_tokens": 10498835.0, "step": 2018, "train/ce_loss": 1.1689340681186877e-05 }, { "epoch": 0.19952541032232549, "step": 2018, "train/sim_loss": 0.04296875 }, { "epoch": 0.19952541032232549, "step": 2018, "train/total_loss": 0.04296991974115372 }, { "entropy": 10.04655933380127, "epoch": 0.199624283171841, "mean_token_accuracy": 0.7794871926307678, "num_tokens": 10503663.0, "step": 2019, "train/ce_loss": 1.3177484273910522 }, { "epoch": 0.199624283171841, "step": 2019, "train/sim_loss": 0.0234375 }, { "epoch": 0.199624283171841, "step": 2019, "train/total_loss": 0.15521234273910522 }, { "epoch": 0.19972315602135654, "grad_norm": 0.8853054642677307, "learning_rate": 9.503288335064036e-06, "loss": 0.1522, "step": 2020 }, { "entropy": 9.00905990600586, "epoch": 0.19972315602135654, "mean_token_accuracy": 0.7656404972076416, "num_tokens": 10509122.0, "step": 2020, "train/ce_loss": 0.7272489070892334 }, { "epoch": 0.19972315602135654, "step": 2020, "train/sim_loss": 0.0625 }, { "epoch": 0.19972315602135654, "step": 2020, "train/total_loss": 0.13522489368915558 }, { "entropy": 9.366575241088867, "epoch": 0.19982202887087205, "mean_token_accuracy": 0.7505720853805542, "num_tokens": 10514448.0, "step": 2021, "train/ce_loss": 0.6705920100212097 }, { "epoch": 0.19982202887087205, "step": 2021, "train/sim_loss": 0.05078125 }, { "epoch": 0.19982202887087205, "step": 2021, "train/total_loss": 0.11784045398235321 }, { "entropy": 8.827877044677734, "epoch": 0.1999209017203876, "mean_token_accuracy": 0.7924311757087708, "num_tokens": 10519766.0, "step": 2022, "train/ce_loss": 0.4559319317340851 }, { "epoch": 0.1999209017203876, "step": 2022, "train/sim_loss": 0.0234375 }, { "epoch": 0.1999209017203876, "step": 2022, "train/total_loss": 0.06903069466352463 }, { "entropy": 9.280956268310547, "epoch": 0.2000197745699031, "mean_token_accuracy": 0.7597883343696594, "num_tokens": 10525168.0, "step": 2023, "train/ce_loss": 0.746547520160675 }, { "epoch": 0.2000197745699031, "step": 2023, "train/sim_loss": 0.06640625 }, { "epoch": 0.2000197745699031, "step": 2023, "train/total_loss": 0.14106100797653198 }, { "entropy": 9.516263961791992, "epoch": 0.20011864741941862, "mean_token_accuracy": 0.703496515750885, "num_tokens": 10530333.0, "step": 2024, "train/ce_loss": 0.9688807725906372 }, { "epoch": 0.20011864741941862, "step": 2024, "train/sim_loss": 0.02734375 }, { "epoch": 0.20011864741941862, "step": 2024, "train/total_loss": 0.12423183023929596 }, { "entropy": 9.22756576538086, "epoch": 0.20021752026893416, "mean_token_accuracy": 0.7782909870147705, "num_tokens": 10535661.0, "step": 2025, "train/ce_loss": 0.6405593156814575 }, { "epoch": 0.20021752026893416, "step": 2025, "train/sim_loss": 0.0859375 }, { "epoch": 0.20021752026893416, "step": 2025, "train/total_loss": 0.149993434548378 }, { "entropy": 10.073745727539062, "epoch": 0.20031639311844968, "mean_token_accuracy": 0.7888198494911194, "num_tokens": 10540556.0, "step": 2026, "train/ce_loss": 7.1674990067549516e-06 }, { "epoch": 0.20031639311844968, "step": 2026, "train/sim_loss": 0.01953125 }, { "epoch": 0.20031639311844968, "step": 2026, "train/total_loss": 0.019531967118382454 }, { "entropy": 8.897648811340332, "epoch": 0.2004152659679652, "mean_token_accuracy": 0.7366803288459778, "num_tokens": 10546011.0, "step": 2027, "train/ce_loss": 0.645298421382904 }, { "epoch": 0.2004152659679652, "step": 2027, "train/sim_loss": 0.1015625 }, { "epoch": 0.2004152659679652, "step": 2027, "train/total_loss": 0.16609233617782593 }, { "entropy": 9.375082015991211, "epoch": 0.20051413881748073, "mean_token_accuracy": 0.7173637747764587, "num_tokens": 10551280.0, "step": 2028, "train/ce_loss": 0.9982043504714966 }, { "epoch": 0.20051413881748073, "step": 2028, "train/sim_loss": 0.078125 }, { "epoch": 0.20051413881748073, "step": 2028, "train/total_loss": 0.17794543504714966 }, { "entropy": 9.171791076660156, "epoch": 0.20061301166699624, "mean_token_accuracy": 0.6918465495109558, "num_tokens": 10556633.0, "step": 2029, "train/ce_loss": 0.8046217560768127 }, { "epoch": 0.20061301166699624, "step": 2029, "train/sim_loss": 0.1015625 }, { "epoch": 0.20061301166699624, "step": 2029, "train/total_loss": 0.18202468752861023 }, { "entropy": 9.75556755065918, "epoch": 0.20071188451651176, "mean_token_accuracy": 0.7542213797569275, "num_tokens": 10561578.0, "step": 2030, "train/ce_loss": 0.7943035364151001 }, { "epoch": 0.20071188451651176, "step": 2030, "train/sim_loss": 0.02734375 }, { "epoch": 0.20071188451651176, "step": 2030, "train/total_loss": 0.10677410662174225 }, { "entropy": 9.545900344848633, "epoch": 0.2008107573660273, "mean_token_accuracy": 0.7027438879013062, "num_tokens": 10566714.0, "step": 2031, "train/ce_loss": 1.2213687896728516 }, { "epoch": 0.2008107573660273, "step": 2031, "train/sim_loss": 0.1484375 }, { "epoch": 0.2008107573660273, "step": 2031, "train/total_loss": 0.2705743908882141 }, { "entropy": 9.558181762695312, "epoch": 0.2009096302155428, "mean_token_accuracy": 0.6989409923553467, "num_tokens": 10571794.0, "step": 2032, "train/ce_loss": 0.7340117692947388 }, { "epoch": 0.2009096302155428, "step": 2032, "train/sim_loss": 0.0703125 }, { "epoch": 0.2009096302155428, "step": 2032, "train/total_loss": 0.14371368288993835 }, { "entropy": 9.313700675964355, "epoch": 0.20100850306505832, "mean_token_accuracy": 0.7363530993461609, "num_tokens": 10577102.0, "step": 2033, "train/ce_loss": 1.2940865755081177 }, { "epoch": 0.20100850306505832, "step": 2033, "train/sim_loss": 0.08984375 }, { "epoch": 0.20100850306505832, "step": 2033, "train/total_loss": 0.21925240755081177 }, { "entropy": 9.426789283752441, "epoch": 0.20110737591457387, "mean_token_accuracy": 0.7166666388511658, "num_tokens": 10582423.0, "step": 2034, "train/ce_loss": 1.9577373266220093 }, { "epoch": 0.20110737591457387, "step": 2034, "train/sim_loss": 0.09765625 }, { "epoch": 0.20110737591457387, "step": 2034, "train/total_loss": 0.293429970741272 }, { "entropy": 9.317292213439941, "epoch": 0.20120624876408938, "mean_token_accuracy": 0.7283950448036194, "num_tokens": 10587736.0, "step": 2035, "train/ce_loss": 0.5608477592468262 }, { "epoch": 0.20120624876408938, "step": 2035, "train/sim_loss": 0.0390625 }, { "epoch": 0.20120624876408938, "step": 2035, "train/total_loss": 0.0951472818851471 }, { "entropy": 9.143550872802734, "epoch": 0.2013051216136049, "mean_token_accuracy": 0.6800433993339539, "num_tokens": 10593142.0, "step": 2036, "train/ce_loss": 0.742180585861206 }, { "epoch": 0.2013051216136049, "step": 2036, "train/sim_loss": 0.05078125 }, { "epoch": 0.2013051216136049, "step": 2036, "train/total_loss": 0.12499930709600449 }, { "entropy": 9.595016479492188, "epoch": 0.20140399446312043, "mean_token_accuracy": 0.7216216325759888, "num_tokens": 10598345.0, "step": 2037, "train/ce_loss": 1.1758081912994385 }, { "epoch": 0.20140399446312043, "step": 2037, "train/sim_loss": 0.09765625 }, { "epoch": 0.20140399446312043, "step": 2037, "train/total_loss": 0.2152370810508728 }, { "entropy": 9.325058937072754, "epoch": 0.20150286731263595, "mean_token_accuracy": 0.7465667724609375, "num_tokens": 10603613.0, "step": 2038, "train/ce_loss": 1.1411672830581665 }, { "epoch": 0.20150286731263595, "step": 2038, "train/sim_loss": 0.0859375 }, { "epoch": 0.20150286731263595, "step": 2038, "train/total_loss": 0.20005422830581665 }, { "entropy": 8.968401908874512, "epoch": 0.20160174016215146, "mean_token_accuracy": 0.7274436354637146, "num_tokens": 10609180.0, "step": 2039, "train/ce_loss": 1.000770092010498 }, { "epoch": 0.20160174016215146, "step": 2039, "train/sim_loss": 0.05078125 }, { "epoch": 0.20160174016215146, "step": 2039, "train/total_loss": 0.15085825324058533 }, { "epoch": 0.201700613011667, "grad_norm": 0.8397514820098877, "learning_rate": 9.498343470306089e-06, "loss": 0.1556, "step": 2040 }, { "entropy": 9.181343078613281, "epoch": 0.201700613011667, "mean_token_accuracy": 0.6988505721092224, "num_tokens": 10614524.0, "step": 2040, "train/ce_loss": 0.8533067107200623 }, { "epoch": 0.201700613011667, "step": 2040, "train/sim_loss": 0.0546875 }, { "epoch": 0.201700613011667, "step": 2040, "train/total_loss": 0.14001816511154175 }, { "entropy": 9.63900375366211, "epoch": 0.20179948586118251, "mean_token_accuracy": 0.7133758068084717, "num_tokens": 10619605.0, "step": 2041, "train/ce_loss": 1.1804910898208618 }, { "epoch": 0.20179948586118251, "step": 2041, "train/sim_loss": 0.14453125 }, { "epoch": 0.20179948586118251, "step": 2041, "train/total_loss": 0.26258036494255066 }, { "entropy": 9.486444473266602, "epoch": 0.20189835871069806, "mean_token_accuracy": 0.7389610409736633, "num_tokens": 10624824.0, "step": 2042, "train/ce_loss": 1.2508841753005981 }, { "epoch": 0.20189835871069806, "step": 2042, "train/sim_loss": 0.1015625 }, { "epoch": 0.20189835871069806, "step": 2042, "train/total_loss": 0.2266509234905243 }, { "entropy": 9.04334831237793, "epoch": 0.20199723156021357, "mean_token_accuracy": 0.7137404680252075, "num_tokens": 10630340.0, "step": 2043, "train/ce_loss": 0.6191092133522034 }, { "epoch": 0.20199723156021357, "step": 2043, "train/sim_loss": 0.06640625 }, { "epoch": 0.20199723156021357, "step": 2043, "train/total_loss": 0.12831717729568481 }, { "entropy": 9.424947738647461, "epoch": 0.20209610440972908, "mean_token_accuracy": 0.7550744414329529, "num_tokens": 10635549.0, "step": 2044, "train/ce_loss": 0.7709404230117798 }, { "epoch": 0.20209610440972908, "step": 2044, "train/sim_loss": 0.0546875 }, { "epoch": 0.20209610440972908, "step": 2044, "train/total_loss": 0.13178154826164246 }, { "entropy": 9.402710914611816, "epoch": 0.20219497725924462, "mean_token_accuracy": 0.6962233185768127, "num_tokens": 10640594.0, "step": 2045, "train/ce_loss": 1.000903844833374 }, { "epoch": 0.20219497725924462, "step": 2045, "train/sim_loss": 0.04296875 }, { "epoch": 0.20219497725924462, "step": 2045, "train/total_loss": 0.1430591344833374 }, { "entropy": 9.356023788452148, "epoch": 0.20229385010876014, "mean_token_accuracy": 0.6862980723381042, "num_tokens": 10645914.0, "step": 2046, "train/ce_loss": 0.7545071840286255 }, { "epoch": 0.20229385010876014, "step": 2046, "train/sim_loss": 0.0625 }, { "epoch": 0.20229385010876014, "step": 2046, "train/total_loss": 0.13795071840286255 }, { "entropy": 9.378089904785156, "epoch": 0.20239272295827565, "mean_token_accuracy": 0.7333333492279053, "num_tokens": 10651059.0, "step": 2047, "train/ce_loss": 0.6553515195846558 }, { "epoch": 0.20239272295827565, "step": 2047, "train/sim_loss": 0.02734375 }, { "epoch": 0.20239272295827565, "step": 2047, "train/total_loss": 0.09287890046834946 }, { "entropy": 9.854352951049805, "epoch": 0.2024915958077912, "mean_token_accuracy": 0.7586776614189148, "num_tokens": 10656119.0, "step": 2048, "train/ce_loss": 0.636549711227417 }, { "epoch": 0.2024915958077912, "step": 2048, "train/sim_loss": 0.05078125 }, { "epoch": 0.2024915958077912, "step": 2048, "train/total_loss": 0.11443622410297394 }, { "entropy": 9.793302536010742, "epoch": 0.2025904686573067, "mean_token_accuracy": 0.7410714030265808, "num_tokens": 10661123.0, "step": 2049, "train/ce_loss": 5.3264425332599785e-06 }, { "epoch": 0.2025904686573067, "step": 2049, "train/sim_loss": 0.0234375 }, { "epoch": 0.2025904686573067, "step": 2049, "train/total_loss": 0.02343803271651268 }, { "entropy": 9.3540678024292, "epoch": 0.20268934150682222, "mean_token_accuracy": 0.6831579208374023, "num_tokens": 10666513.0, "step": 2050, "train/ce_loss": 0.7889255285263062 }, { "epoch": 0.20268934150682222, "step": 2050, "train/sim_loss": 0.0546875 }, { "epoch": 0.20268934150682222, "step": 2050, "train/total_loss": 0.1335800588130951 }, { "entropy": 9.04610824584961, "epoch": 0.20278821435633776, "mean_token_accuracy": 0.7904656529426575, "num_tokens": 10671915.0, "step": 2051, "train/ce_loss": 0.40603914856910706 }, { "epoch": 0.20278821435633776, "step": 2051, "train/sim_loss": 0.03125 }, { "epoch": 0.20278821435633776, "step": 2051, "train/total_loss": 0.07185392081737518 }, { "entropy": 9.612567901611328, "epoch": 0.20288708720585327, "mean_token_accuracy": 0.7117241621017456, "num_tokens": 10677099.0, "step": 2052, "train/ce_loss": 1.4318925142288208 }, { "epoch": 0.20288708720585327, "step": 2052, "train/sim_loss": 0.07421875 }, { "epoch": 0.20288708720585327, "step": 2052, "train/total_loss": 0.21740800142288208 }, { "entropy": 9.450386047363281, "epoch": 0.20298596005536879, "mean_token_accuracy": 0.7205513715744019, "num_tokens": 10682354.0, "step": 2053, "train/ce_loss": 1.453116536140442 }, { "epoch": 0.20298596005536879, "step": 2053, "train/sim_loss": 0.07421875 }, { "epoch": 0.20298596005536879, "step": 2053, "train/total_loss": 0.2195304036140442 }, { "entropy": 9.776745796203613, "epoch": 0.20308483290488433, "mean_token_accuracy": 0.7428571581840515, "num_tokens": 10687412.0, "step": 2054, "train/ce_loss": 0.9126024842262268 }, { "epoch": 0.20308483290488433, "step": 2054, "train/sim_loss": 0.05859375 }, { "epoch": 0.20308483290488433, "step": 2054, "train/total_loss": 0.14985400438308716 }, { "entropy": 9.455513000488281, "epoch": 0.20318370575439984, "mean_token_accuracy": 0.7038626670837402, "num_tokens": 10692568.0, "step": 2055, "train/ce_loss": 1.1099965572357178 }, { "epoch": 0.20318370575439984, "step": 2055, "train/sim_loss": 0.1015625 }, { "epoch": 0.20318370575439984, "step": 2055, "train/total_loss": 0.21256215870380402 }, { "entropy": 9.406907081604004, "epoch": 0.20328257860391535, "mean_token_accuracy": 0.7060849666595459, "num_tokens": 10697869.0, "step": 2056, "train/ce_loss": 1.2631419897079468 }, { "epoch": 0.20328257860391535, "step": 2056, "train/sim_loss": 0.09765625 }, { "epoch": 0.20328257860391535, "step": 2056, "train/total_loss": 0.2239704579114914 }, { "entropy": 10.24700927734375, "epoch": 0.2033814514534309, "mean_token_accuracy": 0.7822784781455994, "num_tokens": 10702662.0, "step": 2057, "train/ce_loss": 7.433172413584543e-06 }, { "epoch": 0.2033814514534309, "step": 2057, "train/sim_loss": 0.03125 }, { "epoch": 0.2033814514534309, "step": 2057, "train/total_loss": 0.03125074505805969 }, { "entropy": 9.484310150146484, "epoch": 0.2034803243029464, "mean_token_accuracy": 0.7227455973625183, "num_tokens": 10707839.0, "step": 2058, "train/ce_loss": 0.6100543737411499 }, { "epoch": 0.2034803243029464, "step": 2058, "train/sim_loss": 0.0546875 }, { "epoch": 0.2034803243029464, "step": 2058, "train/total_loss": 0.11569294333457947 }, { "entropy": 8.70419692993164, "epoch": 0.20357919715246192, "mean_token_accuracy": 0.7291280031204224, "num_tokens": 10713432.0, "step": 2059, "train/ce_loss": 1.097187876701355 }, { "epoch": 0.20357919715246192, "step": 2059, "train/sim_loss": 0.05859375 }, { "epoch": 0.20357919715246192, "step": 2059, "train/total_loss": 0.16831254959106445 }, { "epoch": 0.20367807000197746, "grad_norm": 0.9894576668739319, "learning_rate": 9.493398605548139e-06, "loss": 0.1646, "step": 2060 }, { "entropy": 9.25218391418457, "epoch": 0.20367807000197746, "mean_token_accuracy": 0.786120593547821, "num_tokens": 10718952.0, "step": 2060, "train/ce_loss": 0.46557751297950745 }, { "epoch": 0.20367807000197746, "step": 2060, "train/sim_loss": 0.01953125 }, { "epoch": 0.20367807000197746, "step": 2060, "train/total_loss": 0.06608900427818298 }, { "entropy": 9.467554092407227, "epoch": 0.20377694285149298, "mean_token_accuracy": 0.7763819098472595, "num_tokens": 10724180.0, "step": 2061, "train/ce_loss": 1.1352170076861512e-05 }, { "epoch": 0.20377694285149298, "step": 2061, "train/sim_loss": 0.07421875 }, { "epoch": 0.20377694285149298, "step": 2061, "train/total_loss": 0.07421988248825073 }, { "entropy": 8.840513229370117, "epoch": 0.2038758157010085, "mean_token_accuracy": 0.7084745764732361, "num_tokens": 10729844.0, "step": 2062, "train/ce_loss": 0.8225208520889282 }, { "epoch": 0.2038758157010085, "step": 2062, "train/sim_loss": 0.09375 }, { "epoch": 0.2038758157010085, "step": 2062, "train/total_loss": 0.17600208520889282 }, { "entropy": 9.026546478271484, "epoch": 0.20397468855052403, "mean_token_accuracy": 0.7510729432106018, "num_tokens": 10735238.0, "step": 2063, "train/ce_loss": 0.48530179262161255 }, { "epoch": 0.20397468855052403, "step": 2063, "train/sim_loss": 0.03125 }, { "epoch": 0.20397468855052403, "step": 2063, "train/total_loss": 0.07978017628192902 }, { "entropy": 9.21430778503418, "epoch": 0.20407356140003954, "mean_token_accuracy": 0.6976439952850342, "num_tokens": 10740468.0, "step": 2064, "train/ce_loss": 1.1475073099136353 }, { "epoch": 0.20407356140003954, "step": 2064, "train/sim_loss": 0.078125 }, { "epoch": 0.20407356140003954, "step": 2064, "train/total_loss": 0.19287574291229248 }, { "entropy": 9.391798973083496, "epoch": 0.20417243424955508, "mean_token_accuracy": 0.7493734359741211, "num_tokens": 10745754.0, "step": 2065, "train/ce_loss": 0.6040437817573547 }, { "epoch": 0.20417243424955508, "step": 2065, "train/sim_loss": 0.06640625 }, { "epoch": 0.20417243424955508, "step": 2065, "train/total_loss": 0.12681062519550323 }, { "entropy": 9.232144355773926, "epoch": 0.2042713070990706, "mean_token_accuracy": 0.7660738825798035, "num_tokens": 10750975.0, "step": 2066, "train/ce_loss": 1.651016116142273 }, { "epoch": 0.2042713070990706, "step": 2066, "train/sim_loss": 0.046875 }, { "epoch": 0.2042713070990706, "step": 2066, "train/total_loss": 0.21197661757469177 }, { "entropy": 9.279935836791992, "epoch": 0.2043701799485861, "mean_token_accuracy": 0.6896162629127502, "num_tokens": 10756340.0, "step": 2067, "train/ce_loss": 1.0655268430709839 }, { "epoch": 0.2043701799485861, "step": 2067, "train/sim_loss": 0.1484375 }, { "epoch": 0.2043701799485861, "step": 2067, "train/total_loss": 0.25499019026756287 }, { "entropy": 9.068220138549805, "epoch": 0.20446905279810165, "mean_token_accuracy": 0.7304643392562866, "num_tokens": 10761649.0, "step": 2068, "train/ce_loss": 0.7303726077079773 }, { "epoch": 0.20446905279810165, "step": 2068, "train/sim_loss": 0.10546875 }, { "epoch": 0.20446905279810165, "step": 2068, "train/total_loss": 0.1785060167312622 }, { "entropy": 9.917533874511719, "epoch": 0.20456792564761717, "mean_token_accuracy": 0.7224137783050537, "num_tokens": 10766651.0, "step": 2069, "train/ce_loss": 1.1582452058792114 }, { "epoch": 0.20456792564761717, "step": 2069, "train/sim_loss": 0.08984375 }, { "epoch": 0.20456792564761717, "step": 2069, "train/total_loss": 0.20566827058792114 }, { "entropy": 9.242486953735352, "epoch": 0.20466679849713268, "mean_token_accuracy": 0.7773631811141968, "num_tokens": 10771977.0, "step": 2070, "train/ce_loss": 0.8605750799179077 }, { "epoch": 0.20466679849713268, "step": 2070, "train/sim_loss": 0.0859375 }, { "epoch": 0.20466679849713268, "step": 2070, "train/total_loss": 0.17199501395225525 }, { "entropy": 9.046150207519531, "epoch": 0.20476567134664822, "mean_token_accuracy": 0.7056995034217834, "num_tokens": 10777407.0, "step": 2071, "train/ce_loss": 0.9511678218841553 }, { "epoch": 0.20476567134664822, "step": 2071, "train/sim_loss": 0.12109375 }, { "epoch": 0.20476567134664822, "step": 2071, "train/total_loss": 0.21621054410934448 }, { "entropy": 9.479668617248535, "epoch": 0.20486454419616373, "mean_token_accuracy": 0.7462068796157837, "num_tokens": 10782603.0, "step": 2072, "train/ce_loss": 0.616319477558136 }, { "epoch": 0.20486454419616373, "step": 2072, "train/sim_loss": 0.0625 }, { "epoch": 0.20486454419616373, "step": 2072, "train/total_loss": 0.1241319477558136 }, { "entropy": 9.510650634765625, "epoch": 0.20496341704567925, "mean_token_accuracy": 0.7013513445854187, "num_tokens": 10787838.0, "step": 2073, "train/ce_loss": 0.8334776163101196 }, { "epoch": 0.20496341704567925, "step": 2073, "train/sim_loss": 0.08203125 }, { "epoch": 0.20496341704567925, "step": 2073, "train/total_loss": 0.16537901759147644 }, { "entropy": 9.949750900268555, "epoch": 0.2050622898951948, "mean_token_accuracy": 0.6925858855247498, "num_tokens": 10792781.0, "step": 2074, "train/ce_loss": 8.121015525830444e-06 }, { "epoch": 0.2050622898951948, "step": 2074, "train/sim_loss": 0.03515625 }, { "epoch": 0.2050622898951948, "step": 2074, "train/total_loss": 0.035157062113285065 }, { "entropy": 9.321216583251953, "epoch": 0.2051611627447103, "mean_token_accuracy": 0.7588306665420532, "num_tokens": 10798042.0, "step": 2075, "train/ce_loss": 0.9396710991859436 }, { "epoch": 0.2051611627447103, "step": 2075, "train/sim_loss": 0.05078125 }, { "epoch": 0.2051611627447103, "step": 2075, "train/total_loss": 0.14474835991859436 }, { "entropy": 9.646879196166992, "epoch": 0.20526003559422581, "mean_token_accuracy": 0.7349768877029419, "num_tokens": 10803123.0, "step": 2076, "train/ce_loss": 0.8319485187530518 }, { "epoch": 0.20526003559422581, "step": 2076, "train/sim_loss": 0.0703125 }, { "epoch": 0.20526003559422581, "step": 2076, "train/total_loss": 0.15350735187530518 }, { "entropy": 9.569722175598145, "epoch": 0.20535890844374136, "mean_token_accuracy": 0.762734591960907, "num_tokens": 10808297.0, "step": 2077, "train/ce_loss": 1.0843838453292847 }, { "epoch": 0.20535890844374136, "step": 2077, "train/sim_loss": 0.0234375 }, { "epoch": 0.20535890844374136, "step": 2077, "train/total_loss": 0.1318758875131607 }, { "entropy": 9.064342498779297, "epoch": 0.20545778129325687, "mean_token_accuracy": 0.7198581695556641, "num_tokens": 10813689.0, "step": 2078, "train/ce_loss": 0.8889843225479126 }, { "epoch": 0.20545778129325687, "step": 2078, "train/sim_loss": 0.0703125 }, { "epoch": 0.20545778129325687, "step": 2078, "train/total_loss": 0.1592109352350235 }, { "entropy": 9.698354721069336, "epoch": 0.20555665414277238, "mean_token_accuracy": 0.724473237991333, "num_tokens": 10818763.0, "step": 2079, "train/ce_loss": 1.1419720649719238 }, { "epoch": 0.20555665414277238, "step": 2079, "train/sim_loss": 0.1171875 }, { "epoch": 0.20555665414277238, "step": 2079, "train/total_loss": 0.23138470947742462 }, { "epoch": 0.20565552699228792, "grad_norm": 1.1899652481079102, "learning_rate": 9.48845374079019e-06, "loss": 0.1626, "step": 2080 }, { "entropy": 9.852563858032227, "epoch": 0.20565552699228792, "mean_token_accuracy": 0.7165775299072266, "num_tokens": 10823782.0, "step": 2080, "train/ce_loss": 1.4960167407989502 }, { "epoch": 0.20565552699228792, "step": 2080, "train/sim_loss": 0.0859375 }, { "epoch": 0.20565552699228792, "step": 2080, "train/total_loss": 0.23553918302059174 }, { "entropy": 9.01875114440918, "epoch": 0.20575439984180344, "mean_token_accuracy": 0.7856468558311462, "num_tokens": 10829311.0, "step": 2081, "train/ce_loss": 1.119086503982544 }, { "epoch": 0.20575439984180344, "step": 2081, "train/sim_loss": 0.09375 }, { "epoch": 0.20575439984180344, "step": 2081, "train/total_loss": 0.20565864443778992 }, { "entropy": 9.640281677246094, "epoch": 0.20585327269131895, "mean_token_accuracy": 0.7061403393745422, "num_tokens": 10834455.0, "step": 2082, "train/ce_loss": 2.283698797225952 }, { "epoch": 0.20585327269131895, "step": 2082, "train/sim_loss": 0.0625 }, { "epoch": 0.20585327269131895, "step": 2082, "train/total_loss": 0.29086989164352417 }, { "entropy": 9.904674530029297, "epoch": 0.2059521455408345, "mean_token_accuracy": 0.7439516186714172, "num_tokens": 10839380.0, "step": 2083, "train/ce_loss": 7.640025614819024e-06 }, { "epoch": 0.2059521455408345, "step": 2083, "train/sim_loss": 0.08984375 }, { "epoch": 0.2059521455408345, "step": 2083, "train/total_loss": 0.08984451740980148 }, { "entropy": 9.319259643554688, "epoch": 0.20605101839035, "mean_token_accuracy": 0.752662718296051, "num_tokens": 10844689.0, "step": 2084, "train/ce_loss": 0.8190650939941406 }, { "epoch": 0.20605101839035, "step": 2084, "train/sim_loss": 0.0625 }, { "epoch": 0.20605101839035, "step": 2084, "train/total_loss": 0.1444065123796463 }, { "entropy": 8.912906646728516, "epoch": 0.20614989123986555, "mean_token_accuracy": 0.7344045639038086, "num_tokens": 10850216.0, "step": 2085, "train/ce_loss": 1.335227131843567 }, { "epoch": 0.20614989123986555, "step": 2085, "train/sim_loss": 0.09375 }, { "epoch": 0.20614989123986555, "step": 2085, "train/total_loss": 0.22727271914482117 }, { "entropy": 9.269612312316895, "epoch": 0.20624876408938106, "mean_token_accuracy": 0.7739726305007935, "num_tokens": 10855563.0, "step": 2086, "train/ce_loss": 0.8642188906669617 }, { "epoch": 0.20624876408938106, "step": 2086, "train/sim_loss": 0.0625 }, { "epoch": 0.20624876408938106, "step": 2086, "train/total_loss": 0.1489218920469284 }, { "entropy": 9.489455223083496, "epoch": 0.20634763693889657, "mean_token_accuracy": 0.7437673211097717, "num_tokens": 10860733.0, "step": 2087, "train/ce_loss": 1.35664701461792 }, { "epoch": 0.20634763693889657, "step": 2087, "train/sim_loss": 0.078125 }, { "epoch": 0.20634763693889657, "step": 2087, "train/total_loss": 0.213789701461792 }, { "entropy": 9.293228149414062, "epoch": 0.2064465097884121, "mean_token_accuracy": 0.6814159154891968, "num_tokens": 10866067.0, "step": 2088, "train/ce_loss": 0.9681704640388489 }, { "epoch": 0.2064465097884121, "step": 2088, "train/sim_loss": 0.04296875 }, { "epoch": 0.2064465097884121, "step": 2088, "train/total_loss": 0.1397857964038849 }, { "entropy": 9.249713897705078, "epoch": 0.20654538263792763, "mean_token_accuracy": 0.660804033279419, "num_tokens": 10871350.0, "step": 2089, "train/ce_loss": 0.7813665270805359 }, { "epoch": 0.20654538263792763, "step": 2089, "train/sim_loss": 0.046875 }, { "epoch": 0.20654538263792763, "step": 2089, "train/total_loss": 0.1250116527080536 }, { "entropy": 10.592601776123047, "epoch": 0.20664425548744314, "mean_token_accuracy": 0.7488986849784851, "num_tokens": 10876001.0, "step": 2090, "train/ce_loss": 3.5592474887380376e-05 }, { "epoch": 0.20664425548744314, "step": 2090, "train/sim_loss": 0.09375 }, { "epoch": 0.20664425548744314, "step": 2090, "train/total_loss": 0.09375356137752533 }, { "entropy": 9.088579177856445, "epoch": 0.20674312833695868, "mean_token_accuracy": 0.8073298335075378, "num_tokens": 10881443.0, "step": 2091, "train/ce_loss": 0.6509912610054016 }, { "epoch": 0.20674312833695868, "step": 2091, "train/sim_loss": 0.0625 }, { "epoch": 0.20674312833695868, "step": 2091, "train/total_loss": 0.12759912014007568 }, { "entropy": 9.509904861450195, "epoch": 0.2068420011864742, "mean_token_accuracy": 0.7426356673240662, "num_tokens": 10886506.0, "step": 2092, "train/ce_loss": 1.523234486579895 }, { "epoch": 0.2068420011864742, "step": 2092, "train/sim_loss": 0.0625 }, { "epoch": 0.2068420011864742, "step": 2092, "train/total_loss": 0.21482345461845398 }, { "entropy": 9.662224769592285, "epoch": 0.2069408740359897, "mean_token_accuracy": 0.7342767119407654, "num_tokens": 10891595.0, "step": 2093, "train/ce_loss": 0.9919571876525879 }, { "epoch": 0.2069408740359897, "step": 2093, "train/sim_loss": 0.078125 }, { "epoch": 0.2069408740359897, "step": 2093, "train/total_loss": 0.1773207187652588 }, { "entropy": 10.016589164733887, "epoch": 0.20703974688550525, "mean_token_accuracy": 0.7780821919441223, "num_tokens": 10896358.0, "step": 2094, "train/ce_loss": 1.0874220132827759 }, { "epoch": 0.20703974688550525, "step": 2094, "train/sim_loss": 0.05859375 }, { "epoch": 0.20703974688550525, "step": 2094, "train/total_loss": 0.16733595728874207 }, { "entropy": 10.13791275024414, "epoch": 0.20713861973502076, "mean_token_accuracy": 0.713178277015686, "num_tokens": 10901188.0, "step": 2095, "train/ce_loss": 2.002369365072809e-05 }, { "epoch": 0.20713861973502076, "step": 2095, "train/sim_loss": 0.06640625 }, { "epoch": 0.20713861973502076, "step": 2095, "train/total_loss": 0.06640825420618057 }, { "entropy": 9.9132080078125, "epoch": 0.20723749258453628, "mean_token_accuracy": 0.7882599830627441, "num_tokens": 10906056.0, "step": 2096, "train/ce_loss": 1.241743803024292 }, { "epoch": 0.20723749258453628, "step": 2096, "train/sim_loss": 0.0546875 }, { "epoch": 0.20723749258453628, "step": 2096, "train/total_loss": 0.17886188626289368 }, { "entropy": 9.56640625, "epoch": 0.20733636543405182, "mean_token_accuracy": 0.7264705896377563, "num_tokens": 10911182.0, "step": 2097, "train/ce_loss": 1.0573230981826782 }, { "epoch": 0.20733636543405182, "step": 2097, "train/sim_loss": 0.09375 }, { "epoch": 0.20733636543405182, "step": 2097, "train/total_loss": 0.19948232173919678 }, { "entropy": 9.239940643310547, "epoch": 0.20743523828356733, "mean_token_accuracy": 0.7880299091339111, "num_tokens": 10916466.0, "step": 2098, "train/ce_loss": 0.7562162280082703 }, { "epoch": 0.20743523828356733, "step": 2098, "train/sim_loss": 0.09765625 }, { "epoch": 0.20743523828356733, "step": 2098, "train/total_loss": 0.17327788472175598 }, { "entropy": 9.75033187866211, "epoch": 0.20753411113308284, "mean_token_accuracy": 0.7783985137939453, "num_tokens": 10921439.0, "step": 2099, "train/ce_loss": 0.9728204011917114 }, { "epoch": 0.20753411113308284, "step": 2099, "train/sim_loss": 0.07421875 }, { "epoch": 0.20753411113308284, "step": 2099, "train/total_loss": 0.1715008020401001 }, { "epoch": 0.20763298398259838, "grad_norm": 0.9003250002861023, "learning_rate": 9.483508876032242e-06, "loss": 0.1486, "step": 2100 }, { "entropy": 9.183356285095215, "epoch": 0.20763298398259838, "mean_token_accuracy": 0.7854356169700623, "num_tokens": 10927015.0, "step": 2100, "train/ce_loss": 1.995689672185108e-05 }, { "epoch": 0.20763298398259838, "step": 2100, "train/sim_loss": 0.046875 }, { "epoch": 0.20763298398259838, "step": 2100, "train/total_loss": 0.046876996755599976 }, { "entropy": 9.04728889465332, "epoch": 0.2077318568321139, "mean_token_accuracy": 0.7083854675292969, "num_tokens": 10932288.0, "step": 2101, "train/ce_loss": 1.0550248622894287 }, { "epoch": 0.2077318568321139, "step": 2101, "train/sim_loss": 0.12890625 }, { "epoch": 0.2077318568321139, "step": 2101, "train/total_loss": 0.23440873622894287 }, { "entropy": 9.192512512207031, "epoch": 0.2078307296816294, "mean_token_accuracy": 0.7589189410209656, "num_tokens": 10937698.0, "step": 2102, "train/ce_loss": 0.8490728139877319 }, { "epoch": 0.2078307296816294, "step": 2102, "train/sim_loss": 0.078125 }, { "epoch": 0.2078307296816294, "step": 2102, "train/total_loss": 0.16303229331970215 }, { "entropy": 9.685310363769531, "epoch": 0.20792960253114495, "mean_token_accuracy": 0.699999988079071, "num_tokens": 10942748.0, "step": 2103, "train/ce_loss": 2.1654441356658936 }, { "epoch": 0.20792960253114495, "step": 2103, "train/sim_loss": 0.08203125 }, { "epoch": 0.20792960253114495, "step": 2103, "train/total_loss": 0.29857566952705383 }, { "entropy": 10.370462417602539, "epoch": 0.20802847538066047, "mean_token_accuracy": 0.732758641242981, "num_tokens": 10947501.0, "step": 2104, "train/ce_loss": 3.596203896449879e-05 }, { "epoch": 0.20802847538066047, "step": 2104, "train/sim_loss": 0.05859375 }, { "epoch": 0.20802847538066047, "step": 2104, "train/total_loss": 0.058597344905138016 }, { "entropy": 9.564261436462402, "epoch": 0.208127348230176, "mean_token_accuracy": 0.7226074934005737, "num_tokens": 10952852.0, "step": 2105, "train/ce_loss": 1.111342191696167 }, { "epoch": 0.208127348230176, "step": 2105, "train/sim_loss": 0.09375 }, { "epoch": 0.208127348230176, "step": 2105, "train/total_loss": 0.20488423109054565 }, { "entropy": 9.011837005615234, "epoch": 0.20822622107969152, "mean_token_accuracy": 0.6643495559692383, "num_tokens": 10958359.0, "step": 2106, "train/ce_loss": 1.959960699081421 }, { "epoch": 0.20822622107969152, "step": 2106, "train/sim_loss": 0.08203125 }, { "epoch": 0.20822622107969152, "step": 2106, "train/total_loss": 0.27802732586860657 }, { "entropy": 9.457014083862305, "epoch": 0.20832509392920703, "mean_token_accuracy": 0.7094801068305969, "num_tokens": 10963500.0, "step": 2107, "train/ce_loss": 0.552707850933075 }, { "epoch": 0.20832509392920703, "step": 2107, "train/sim_loss": 0.05859375 }, { "epoch": 0.20832509392920703, "step": 2107, "train/total_loss": 0.11386454105377197 }, { "entropy": 9.38405990600586, "epoch": 0.20842396677872257, "mean_token_accuracy": 0.7141134142875671, "num_tokens": 10968779.0, "step": 2108, "train/ce_loss": 1.2475124597549438 }, { "epoch": 0.20842396677872257, "step": 2108, "train/sim_loss": 0.046875 }, { "epoch": 0.20842396677872257, "step": 2108, "train/total_loss": 0.1716262400150299 }, { "entropy": 9.579483032226562, "epoch": 0.2085228396282381, "mean_token_accuracy": 0.7328858971595764, "num_tokens": 10973967.0, "step": 2109, "train/ce_loss": 1.4650330543518066 }, { "epoch": 0.2085228396282381, "step": 2109, "train/sim_loss": 0.09375 }, { "epoch": 0.2085228396282381, "step": 2109, "train/total_loss": 0.24025331437587738 }, { "entropy": 9.960945129394531, "epoch": 0.2086217124777536, "mean_token_accuracy": 0.789264440536499, "num_tokens": 10978893.0, "step": 2110, "train/ce_loss": 1.619686918274965e-05 }, { "epoch": 0.2086217124777536, "step": 2110, "train/sim_loss": 0.0546875 }, { "epoch": 0.2086217124777536, "step": 2110, "train/total_loss": 0.05468912050127983 }, { "entropy": 9.54365348815918, "epoch": 0.20872058532726914, "mean_token_accuracy": 0.723849356174469, "num_tokens": 10984029.0, "step": 2111, "train/ce_loss": 0.7713753581047058 }, { "epoch": 0.20872058532726914, "step": 2111, "train/sim_loss": 0.08203125 }, { "epoch": 0.20872058532726914, "step": 2111, "train/total_loss": 0.1591687798500061 }, { "entropy": 8.940006256103516, "epoch": 0.20881945817678466, "mean_token_accuracy": 0.7734752893447876, "num_tokens": 10989556.0, "step": 2112, "train/ce_loss": 0.7423337697982788 }, { "epoch": 0.20881945817678466, "step": 2112, "train/sim_loss": 0.0546875 }, { "epoch": 0.20881945817678466, "step": 2112, "train/total_loss": 0.12892088294029236 }, { "entropy": 9.339137077331543, "epoch": 0.20891833102630017, "mean_token_accuracy": 0.7015098929405212, "num_tokens": 10994905.0, "step": 2113, "train/ce_loss": 0.9357536435127258 }, { "epoch": 0.20891833102630017, "step": 2113, "train/sim_loss": 0.09765625 }, { "epoch": 0.20891833102630017, "step": 2113, "train/total_loss": 0.1912316083908081 }, { "entropy": 8.97794246673584, "epoch": 0.2090172038758157, "mean_token_accuracy": 0.7137647867202759, "num_tokens": 11000354.0, "step": 2114, "train/ce_loss": 0.583050012588501 }, { "epoch": 0.2090172038758157, "step": 2114, "train/sim_loss": 0.0546875 }, { "epoch": 0.2090172038758157, "step": 2114, "train/total_loss": 0.11299250274896622 }, { "entropy": 9.324902534484863, "epoch": 0.20911607672533122, "mean_token_accuracy": 0.7692307829856873, "num_tokens": 11005621.0, "step": 2115, "train/ce_loss": 0.7004877924919128 }, { "epoch": 0.20911607672533122, "step": 2115, "train/sim_loss": 0.0390625 }, { "epoch": 0.20911607672533122, "step": 2115, "train/total_loss": 0.10911127924919128 }, { "entropy": 10.162012100219727, "epoch": 0.20921494957484674, "mean_token_accuracy": 0.7238979339599609, "num_tokens": 11010461.0, "step": 2116, "train/ce_loss": 7.267015007528244e-06 }, { "epoch": 0.20921494957484674, "step": 2116, "train/sim_loss": 0.05859375 }, { "epoch": 0.20921494957484674, "step": 2116, "train/total_loss": 0.0585944764316082 }, { "entropy": 9.51352310180664, "epoch": 0.20931382242436228, "mean_token_accuracy": 0.6925795078277588, "num_tokens": 11015508.0, "step": 2117, "train/ce_loss": 1.5570176401524805e-05 }, { "epoch": 0.20931382242436228, "step": 2117, "train/sim_loss": 0.046875 }, { "epoch": 0.20931382242436228, "step": 2117, "train/total_loss": 0.04687655717134476 }, { "entropy": 9.183328628540039, "epoch": 0.2094126952738778, "mean_token_accuracy": 0.7293233275413513, "num_tokens": 11020821.0, "step": 2118, "train/ce_loss": 1.0247259140014648 }, { "epoch": 0.2094126952738778, "step": 2118, "train/sim_loss": 0.12109375 }, { "epoch": 0.2094126952738778, "step": 2118, "train/total_loss": 0.22356635332107544 }, { "entropy": 9.582716941833496, "epoch": 0.2095115681233933, "mean_token_accuracy": 0.7318840622901917, "num_tokens": 11025944.0, "step": 2119, "train/ce_loss": 1.099069595336914 }, { "epoch": 0.2095115681233933, "step": 2119, "train/sim_loss": 0.06640625 }, { "epoch": 0.2095115681233933, "step": 2119, "train/total_loss": 0.17631322145462036 }, { "epoch": 0.20961044097290885, "grad_norm": 0.7997581362724304, "learning_rate": 9.478564011274292e-06, "loss": 0.1587, "step": 2120 }, { "entropy": 9.7577486038208, "epoch": 0.20961044097290885, "mean_token_accuracy": 0.7158878445625305, "num_tokens": 11030972.0, "step": 2120, "train/ce_loss": 6.1127025219320785e-06 }, { "epoch": 0.20961044097290885, "step": 2120, "train/sim_loss": 0.06640625 }, { "epoch": 0.20961044097290885, "step": 2120, "train/total_loss": 0.06640686094760895 }, { "entropy": 9.57359504699707, "epoch": 0.20970931382242436, "mean_token_accuracy": 0.707446813583374, "num_tokens": 11036159.0, "step": 2121, "train/ce_loss": 0.7156136631965637 }, { "epoch": 0.20970931382242436, "step": 2121, "train/sim_loss": 0.0703125 }, { "epoch": 0.20970931382242436, "step": 2121, "train/total_loss": 0.14187386631965637 }, { "entropy": 9.341856002807617, "epoch": 0.20980818667193987, "mean_token_accuracy": 0.7126582264900208, "num_tokens": 11041429.0, "step": 2122, "train/ce_loss": 1.9096126556396484 }, { "epoch": 0.20980818667193987, "step": 2122, "train/sim_loss": 0.109375 }, { "epoch": 0.20980818667193987, "step": 2122, "train/total_loss": 0.3003362715244293 }, { "entropy": 9.650550842285156, "epoch": 0.2099070595214554, "mean_token_accuracy": 0.7735294103622437, "num_tokens": 11046545.0, "step": 2123, "train/ce_loss": 1.2696391344070435 }, { "epoch": 0.2099070595214554, "step": 2123, "train/sim_loss": 0.01953125 }, { "epoch": 0.2099070595214554, "step": 2123, "train/total_loss": 0.14649516344070435 }, { "entropy": 9.482165336608887, "epoch": 0.21000593237097093, "mean_token_accuracy": 0.7585185170173645, "num_tokens": 11051671.0, "step": 2124, "train/ce_loss": 0.9867421984672546 }, { "epoch": 0.21000593237097093, "step": 2124, "train/sim_loss": 0.0703125 }, { "epoch": 0.21000593237097093, "step": 2124, "train/total_loss": 0.1689867228269577 }, { "entropy": 9.264106750488281, "epoch": 0.21010480522048647, "mean_token_accuracy": 0.7266591787338257, "num_tokens": 11056995.0, "step": 2125, "train/ce_loss": 0.8551509976387024 }, { "epoch": 0.21010480522048647, "step": 2125, "train/sim_loss": 0.1328125 }, { "epoch": 0.21010480522048647, "step": 2125, "train/total_loss": 0.2183276116847992 }, { "entropy": 9.480263710021973, "epoch": 0.21020367807000198, "mean_token_accuracy": 0.6494413614273071, "num_tokens": 11062190.0, "step": 2126, "train/ce_loss": 1.0613958835601807 }, { "epoch": 0.21020367807000198, "step": 2126, "train/sim_loss": 0.078125 }, { "epoch": 0.21020367807000198, "step": 2126, "train/total_loss": 0.18426460027694702 }, { "entropy": 9.736364364624023, "epoch": 0.2103025509195175, "mean_token_accuracy": 0.7264705896377563, "num_tokens": 11067286.0, "step": 2127, "train/ce_loss": 6.552802005899139e-06 }, { "epoch": 0.2103025509195175, "step": 2127, "train/sim_loss": 0.109375 }, { "epoch": 0.2103025509195175, "step": 2127, "train/total_loss": 0.10937565565109253 }, { "entropy": 9.37752914428711, "epoch": 0.21040142376903304, "mean_token_accuracy": 0.7278401851654053, "num_tokens": 11072598.0, "step": 2128, "train/ce_loss": 1.0987099409103394 }, { "epoch": 0.21040142376903304, "step": 2128, "train/sim_loss": 0.0859375 }, { "epoch": 0.21040142376903304, "step": 2128, "train/total_loss": 0.1958085000514984 }, { "entropy": 9.320964813232422, "epoch": 0.21050029661854855, "mean_token_accuracy": 0.6972255706787109, "num_tokens": 11077856.0, "step": 2129, "train/ce_loss": 1.042314052581787 }, { "epoch": 0.21050029661854855, "step": 2129, "train/sim_loss": 0.0859375 }, { "epoch": 0.21050029661854855, "step": 2129, "train/total_loss": 0.19016891717910767 }, { "entropy": 9.59323787689209, "epoch": 0.21059916946806406, "mean_token_accuracy": 0.716312050819397, "num_tokens": 11083033.0, "step": 2130, "train/ce_loss": 1.0699758529663086 }, { "epoch": 0.21059916946806406, "step": 2130, "train/sim_loss": 0.10546875 }, { "epoch": 0.21059916946806406, "step": 2130, "train/total_loss": 0.21246632933616638 }, { "entropy": 9.647937774658203, "epoch": 0.2106980423175796, "mean_token_accuracy": 0.7286356687545776, "num_tokens": 11088142.0, "step": 2131, "train/ce_loss": 0.695247232913971 }, { "epoch": 0.2106980423175796, "step": 2131, "train/sim_loss": 0.046875 }, { "epoch": 0.2106980423175796, "step": 2131, "train/total_loss": 0.11639972776174545 }, { "entropy": 9.487251281738281, "epoch": 0.21079691516709512, "mean_token_accuracy": 0.6836734414100647, "num_tokens": 11093232.0, "step": 2132, "train/ce_loss": 1.0981719493865967 }, { "epoch": 0.21079691516709512, "step": 2132, "train/sim_loss": 0.0703125 }, { "epoch": 0.21079691516709512, "step": 2132, "train/total_loss": 0.18012970685958862 }, { "entropy": 8.924519538879395, "epoch": 0.21089578801661063, "mean_token_accuracy": 0.6953441500663757, "num_tokens": 11098710.0, "step": 2133, "train/ce_loss": 1.0247503519058228 }, { "epoch": 0.21089578801661063, "step": 2133, "train/sim_loss": 0.0859375 }, { "epoch": 0.21089578801661063, "step": 2133, "train/total_loss": 0.18841254711151123 }, { "entropy": 9.11449146270752, "epoch": 0.21099466086612617, "mean_token_accuracy": 0.7630979418754578, "num_tokens": 11104097.0, "step": 2134, "train/ce_loss": 0.796212375164032 }, { "epoch": 0.21099466086612617, "step": 2134, "train/sim_loss": 0.0390625 }, { "epoch": 0.21099466086612617, "step": 2134, "train/total_loss": 0.11868374049663544 }, { "entropy": 9.475465774536133, "epoch": 0.21109353371564168, "mean_token_accuracy": 0.7591036558151245, "num_tokens": 11109279.0, "step": 2135, "train/ce_loss": 0.7177180647850037 }, { "epoch": 0.21109353371564168, "step": 2135, "train/sim_loss": 0.06640625 }, { "epoch": 0.21109353371564168, "step": 2135, "train/total_loss": 0.1381780505180359 }, { "entropy": 9.492043495178223, "epoch": 0.2111924065651572, "mean_token_accuracy": 0.7643678188323975, "num_tokens": 11114417.0, "step": 2136, "train/ce_loss": 0.841325581073761 }, { "epoch": 0.2111924065651572, "step": 2136, "train/sim_loss": 0.078125 }, { "epoch": 0.2111924065651572, "step": 2136, "train/total_loss": 0.16225755214691162 }, { "entropy": 9.480161666870117, "epoch": 0.21129127941467274, "mean_token_accuracy": 0.7516087293624878, "num_tokens": 11119571.0, "step": 2137, "train/ce_loss": 0.8480008840560913 }, { "epoch": 0.21129127941467274, "step": 2137, "train/sim_loss": 0.06640625 }, { "epoch": 0.21129127941467274, "step": 2137, "train/total_loss": 0.1512063443660736 }, { "entropy": 9.458097457885742, "epoch": 0.21139015226418825, "mean_token_accuracy": 0.717208206653595, "num_tokens": 11124856.0, "step": 2138, "train/ce_loss": 1.1783355474472046 }, { "epoch": 0.21139015226418825, "step": 2138, "train/sim_loss": 0.0703125 }, { "epoch": 0.21139015226418825, "step": 2138, "train/total_loss": 0.18814605474472046 }, { "entropy": 9.53870964050293, "epoch": 0.21148902511370377, "mean_token_accuracy": 0.6192959547042847, "num_tokens": 11130074.0, "step": 2139, "train/ce_loss": 4.688951321440982e-06 }, { "epoch": 0.21148902511370377, "step": 2139, "train/sim_loss": 0.06640625 }, { "epoch": 0.21148902511370377, "step": 2139, "train/total_loss": 0.0664067193865776 }, { "epoch": 0.2115878979632193, "grad_norm": 0.9693803191184998, "learning_rate": 9.473619146516345e-06, "loss": 0.1694, "step": 2140 }, { "entropy": 10.037150382995605, "epoch": 0.2115878979632193, "mean_token_accuracy": 0.6808118224143982, "num_tokens": 11135043.0, "step": 2140, "train/ce_loss": 7.014597940724343e-06 }, { "epoch": 0.2115878979632193, "step": 2140, "train/sim_loss": 0.07421875 }, { "epoch": 0.2115878979632193, "step": 2140, "train/total_loss": 0.07421945035457611 }, { "entropy": 10.606910705566406, "epoch": 0.21168677081273482, "mean_token_accuracy": 0.7755101919174194, "num_tokens": 11139654.0, "step": 2141, "train/ce_loss": 3.7913382053375244 }, { "epoch": 0.21168677081273482, "step": 2141, "train/sim_loss": 0.06640625 }, { "epoch": 0.21168677081273482, "step": 2141, "train/total_loss": 0.44554007053375244 }, { "entropy": 9.145793914794922, "epoch": 0.21178564366225033, "mean_token_accuracy": 0.7507820725440979, "num_tokens": 11145031.0, "step": 2142, "train/ce_loss": 0.847519040107727 }, { "epoch": 0.21178564366225033, "step": 2142, "train/sim_loss": 0.09765625 }, { "epoch": 0.21178564366225033, "step": 2142, "train/total_loss": 0.1824081540107727 }, { "entropy": 9.520444869995117, "epoch": 0.21188451651176587, "mean_token_accuracy": 0.7269326448440552, "num_tokens": 11150242.0, "step": 2143, "train/ce_loss": 1.060303807258606 }, { "epoch": 0.21188451651176587, "step": 2143, "train/sim_loss": 0.078125 }, { "epoch": 0.21188451651176587, "step": 2143, "train/total_loss": 0.18415537476539612 }, { "entropy": 9.657861709594727, "epoch": 0.2119833893612814, "mean_token_accuracy": 0.7496063113212585, "num_tokens": 11155328.0, "step": 2144, "train/ce_loss": 1.1257550716400146 }, { "epoch": 0.2119833893612814, "step": 2144, "train/sim_loss": 0.06640625 }, { "epoch": 0.2119833893612814, "step": 2144, "train/total_loss": 0.178981751203537 }, { "entropy": 9.078840255737305, "epoch": 0.2120822622107969, "mean_token_accuracy": 0.7188796401023865, "num_tokens": 11160787.0, "step": 2145, "train/ce_loss": 1.0356298685073853 }, { "epoch": 0.2120822622107969, "step": 2145, "train/sim_loss": 0.08984375 }, { "epoch": 0.2120822622107969, "step": 2145, "train/total_loss": 0.19340673089027405 }, { "entropy": 9.330584526062012, "epoch": 0.21218113506031244, "mean_token_accuracy": 0.7790432572364807, "num_tokens": 11166100.0, "step": 2146, "train/ce_loss": 0.5168320536613464 }, { "epoch": 0.21218113506031244, "step": 2146, "train/sim_loss": 0.02734375 }, { "epoch": 0.21218113506031244, "step": 2146, "train/total_loss": 0.0790269523859024 }, { "entropy": 9.300143241882324, "epoch": 0.21228000790982796, "mean_token_accuracy": 0.7127937078475952, "num_tokens": 11171354.0, "step": 2147, "train/ce_loss": 0.5148841142654419 }, { "epoch": 0.21228000790982796, "step": 2147, "train/sim_loss": 0.07421875 }, { "epoch": 0.21228000790982796, "step": 2147, "train/total_loss": 0.12570716440677643 }, { "entropy": 9.406492233276367, "epoch": 0.2123788807593435, "mean_token_accuracy": 0.7316455841064453, "num_tokens": 11176646.0, "step": 2148, "train/ce_loss": 4.975815500074532e-06 }, { "epoch": 0.2123788807593435, "step": 2148, "train/sim_loss": 0.046875 }, { "epoch": 0.2123788807593435, "step": 2148, "train/total_loss": 0.046875499188899994 }, { "entropy": 9.134361267089844, "epoch": 0.212477753608859, "mean_token_accuracy": 0.7242562770843506, "num_tokens": 11182054.0, "step": 2149, "train/ce_loss": 0.6157044768333435 }, { "epoch": 0.212477753608859, "step": 2149, "train/sim_loss": 0.0703125 }, { "epoch": 0.212477753608859, "step": 2149, "train/total_loss": 0.1318829506635666 }, { "entropy": 9.376569747924805, "epoch": 0.21257662645837452, "mean_token_accuracy": 0.7273809313774109, "num_tokens": 11187347.0, "step": 2150, "train/ce_loss": 0.8534751534461975 }, { "epoch": 0.21257662645837452, "step": 2150, "train/sim_loss": 0.08984375 }, { "epoch": 0.21257662645837452, "step": 2150, "train/total_loss": 0.175191268324852 }, { "entropy": 8.996522903442383, "epoch": 0.21267549930789006, "mean_token_accuracy": 0.6850152611732483, "num_tokens": 11192806.0, "step": 2151, "train/ce_loss": 1.2981199026107788 }, { "epoch": 0.21267549930789006, "step": 2151, "train/sim_loss": 0.06640625 }, { "epoch": 0.21267549930789006, "step": 2151, "train/total_loss": 0.19621823728084564 }, { "entropy": 9.780609130859375, "epoch": 0.21277437215740558, "mean_token_accuracy": 0.7373417615890503, "num_tokens": 11197876.0, "step": 2152, "train/ce_loss": 1.066979169845581 }, { "epoch": 0.21277437215740558, "step": 2152, "train/sim_loss": 0.0546875 }, { "epoch": 0.21277437215740558, "step": 2152, "train/total_loss": 0.1613854169845581 }, { "entropy": 9.951211929321289, "epoch": 0.2128732450069211, "mean_token_accuracy": 0.7306337952613831, "num_tokens": 11202843.0, "step": 2153, "train/ce_loss": 0.8087383508682251 }, { "epoch": 0.2128732450069211, "step": 2153, "train/sim_loss": 0.09375 }, { "epoch": 0.2128732450069211, "step": 2153, "train/total_loss": 0.17462384700775146 }, { "entropy": 9.533098220825195, "epoch": 0.21297211785643663, "mean_token_accuracy": 0.7285318374633789, "num_tokens": 11207989.0, "step": 2154, "train/ce_loss": 1.0698336362838745 }, { "epoch": 0.21297211785643663, "step": 2154, "train/sim_loss": 0.08203125 }, { "epoch": 0.21297211785643663, "step": 2154, "train/total_loss": 0.18901461362838745 }, { "entropy": 9.042793273925781, "epoch": 0.21307099070595215, "mean_token_accuracy": 0.7118847370147705, "num_tokens": 11213272.0, "step": 2155, "train/ce_loss": 1.1877574920654297 }, { "epoch": 0.21307099070595215, "step": 2155, "train/sim_loss": 0.12890625 }, { "epoch": 0.21307099070595215, "step": 2155, "train/total_loss": 0.24768200516700745 }, { "entropy": 9.832735061645508, "epoch": 0.21316986355546766, "mean_token_accuracy": 0.69749516248703, "num_tokens": 11218417.0, "step": 2156, "train/ce_loss": 1.2040752172470093 }, { "epoch": 0.21316986355546766, "step": 2156, "train/sim_loss": 0.0546875 }, { "epoch": 0.21316986355546766, "step": 2156, "train/total_loss": 0.17509502172470093 }, { "entropy": 9.607536315917969, "epoch": 0.2132687364049832, "mean_token_accuracy": 0.7495826482772827, "num_tokens": 11223441.0, "step": 2157, "train/ce_loss": 1.0795719623565674 }, { "epoch": 0.2132687364049832, "step": 2157, "train/sim_loss": 0.1015625 }, { "epoch": 0.2132687364049832, "step": 2157, "train/total_loss": 0.20951969921588898 }, { "entropy": 9.695856094360352, "epoch": 0.2133676092544987, "mean_token_accuracy": 0.7353760600090027, "num_tokens": 11228621.0, "step": 2158, "train/ce_loss": 0.9282440543174744 }, { "epoch": 0.2133676092544987, "step": 2158, "train/sim_loss": 0.078125 }, { "epoch": 0.2133676092544987, "step": 2158, "train/total_loss": 0.17094939947128296 }, { "entropy": 9.801984786987305, "epoch": 0.21346648210401423, "mean_token_accuracy": 0.769784152507782, "num_tokens": 11233643.0, "step": 2159, "train/ce_loss": 0.6551988124847412 }, { "epoch": 0.21346648210401423, "step": 2159, "train/sim_loss": 0.06640625 }, { "epoch": 0.21346648210401423, "step": 2159, "train/total_loss": 0.13192613422870636 }, { "epoch": 0.21356535495352977, "grad_norm": 0.8683087229728699, "learning_rate": 9.468674281758395e-06, "loss": 0.1605, "step": 2160 }, { "entropy": 9.378292083740234, "epoch": 0.21356535495352977, "mean_token_accuracy": 0.6399999856948853, "num_tokens": 11238962.0, "step": 2160, "train/ce_loss": 1.1224546432495117 }, { "epoch": 0.21356535495352977, "step": 2160, "train/sim_loss": 0.04296875 }, { "epoch": 0.21356535495352977, "step": 2160, "train/total_loss": 0.15521422028541565 }, { "entropy": 9.72561264038086, "epoch": 0.21366422780304528, "mean_token_accuracy": 0.7355072498321533, "num_tokens": 11243918.0, "step": 2161, "train/ce_loss": 1.2311592102050781 }, { "epoch": 0.21366422780304528, "step": 2161, "train/sim_loss": 0.07421875 }, { "epoch": 0.21366422780304528, "step": 2161, "train/total_loss": 0.1973346769809723 }, { "entropy": 9.457310676574707, "epoch": 0.2137631006525608, "mean_token_accuracy": 0.7752043604850769, "num_tokens": 11249148.0, "step": 2162, "train/ce_loss": 0.3882007598876953 }, { "epoch": 0.2137631006525608, "step": 2162, "train/sim_loss": 0.078125 }, { "epoch": 0.2137631006525608, "step": 2162, "train/total_loss": 0.11694507300853729 }, { "entropy": 8.943735122680664, "epoch": 0.21386197350207634, "mean_token_accuracy": 0.7989473938941956, "num_tokens": 11254627.0, "step": 2163, "train/ce_loss": 0.644926130771637 }, { "epoch": 0.21386197350207634, "step": 2163, "train/sim_loss": 0.05859375 }, { "epoch": 0.21386197350207634, "step": 2163, "train/total_loss": 0.1230863630771637 }, { "entropy": 8.746071815490723, "epoch": 0.21396084635159185, "mean_token_accuracy": 0.754162609577179, "num_tokens": 11260360.0, "step": 2164, "train/ce_loss": 1.0563163757324219 }, { "epoch": 0.21396084635159185, "step": 2164, "train/sim_loss": 0.13671875 }, { "epoch": 0.21396084635159185, "step": 2164, "train/total_loss": 0.24235039949417114 }, { "entropy": 9.765542984008789, "epoch": 0.21405971920110736, "mean_token_accuracy": 0.7383177280426025, "num_tokens": 11265321.0, "step": 2165, "train/ce_loss": 0.630692720413208 }, { "epoch": 0.21405971920110736, "step": 2165, "train/sim_loss": 0.078125 }, { "epoch": 0.21405971920110736, "step": 2165, "train/total_loss": 0.14119428396224976 }, { "entropy": 9.095598220825195, "epoch": 0.2141585920506229, "mean_token_accuracy": 0.8049792647361755, "num_tokens": 11270720.0, "step": 2166, "train/ce_loss": 0.8454383611679077 }, { "epoch": 0.2141585920506229, "step": 2166, "train/sim_loss": 0.02734375 }, { "epoch": 0.2141585920506229, "step": 2166, "train/total_loss": 0.11188758909702301 }, { "entropy": 9.24032974243164, "epoch": 0.21425746490013842, "mean_token_accuracy": 0.68727707862854, "num_tokens": 11276037.0, "step": 2167, "train/ce_loss": 0.8193873167037964 }, { "epoch": 0.21425746490013842, "step": 2167, "train/sim_loss": 0.0625 }, { "epoch": 0.21425746490013842, "step": 2167, "train/total_loss": 0.1444387435913086 }, { "entropy": 9.336334228515625, "epoch": 0.21435633774965396, "mean_token_accuracy": 0.690157949924469, "num_tokens": 11281342.0, "step": 2168, "train/ce_loss": 1.3831806182861328 }, { "epoch": 0.21435633774965396, "step": 2168, "train/sim_loss": 0.078125 }, { "epoch": 0.21435633774965396, "step": 2168, "train/total_loss": 0.21644306182861328 }, { "entropy": 9.339741706848145, "epoch": 0.21445521059916947, "mean_token_accuracy": 0.6936339735984802, "num_tokens": 11286521.0, "step": 2169, "train/ce_loss": 2.5318486223113723e-05 }, { "epoch": 0.21445521059916947, "step": 2169, "train/sim_loss": 0.0546875 }, { "epoch": 0.21445521059916947, "step": 2169, "train/total_loss": 0.054690033197402954 }, { "entropy": 9.174461364746094, "epoch": 0.21455408344868498, "mean_token_accuracy": 0.7567886710166931, "num_tokens": 11291855.0, "step": 2170, "train/ce_loss": 0.6254871487617493 }, { "epoch": 0.21455408344868498, "step": 2170, "train/sim_loss": 0.0234375 }, { "epoch": 0.21455408344868498, "step": 2170, "train/total_loss": 0.08598621934652328 }, { "entropy": 9.489059448242188, "epoch": 0.21465295629820053, "mean_token_accuracy": 0.8178191781044006, "num_tokens": 11297026.0, "step": 2171, "train/ce_loss": 0.5916562080383301 }, { "epoch": 0.21465295629820053, "step": 2171, "train/sim_loss": 0.0625 }, { "epoch": 0.21465295629820053, "step": 2171, "train/total_loss": 0.12166562676429749 }, { "entropy": 9.058624267578125, "epoch": 0.21475182914771604, "mean_token_accuracy": 0.7308510541915894, "num_tokens": 11302421.0, "step": 2172, "train/ce_loss": 0.659095823764801 }, { "epoch": 0.21475182914771604, "step": 2172, "train/sim_loss": 0.0234375 }, { "epoch": 0.21475182914771604, "step": 2172, "train/total_loss": 0.08934708684682846 }, { "entropy": 9.979301452636719, "epoch": 0.21485070199723155, "mean_token_accuracy": 0.7422037124633789, "num_tokens": 11307316.0, "step": 2173, "train/ce_loss": 1.0199291706085205 }, { "epoch": 0.21485070199723155, "step": 2173, "train/sim_loss": 0.046875 }, { "epoch": 0.21485070199723155, "step": 2173, "train/total_loss": 0.1488679200410843 }, { "entropy": 10.074808120727539, "epoch": 0.2149495748467471, "mean_token_accuracy": 0.7215447425842285, "num_tokens": 11312359.0, "step": 2174, "train/ce_loss": 4.587761395669077e-06 }, { "epoch": 0.2149495748467471, "step": 2174, "train/sim_loss": 0.05859375 }, { "epoch": 0.2149495748467471, "step": 2174, "train/total_loss": 0.05859420821070671 }, { "entropy": 9.64946174621582, "epoch": 0.2150484476962626, "mean_token_accuracy": 0.6759868264198303, "num_tokens": 11317399.0, "step": 2175, "train/ce_loss": 1.8297922611236572 }, { "epoch": 0.2150484476962626, "step": 2175, "train/sim_loss": 0.08203125 }, { "epoch": 0.2150484476962626, "step": 2175, "train/total_loss": 0.2650104761123657 }, { "entropy": 10.183679580688477, "epoch": 0.21514732054577812, "mean_token_accuracy": 0.7074999809265137, "num_tokens": 11322203.0, "step": 2176, "train/ce_loss": 6.507055786642013e-06 }, { "epoch": 0.21514732054577812, "step": 2176, "train/sim_loss": 0.0234375 }, { "epoch": 0.21514732054577812, "step": 2176, "train/total_loss": 0.02343815006315708 }, { "entropy": 9.698988914489746, "epoch": 0.21524619339529366, "mean_token_accuracy": 0.7300319671630859, "num_tokens": 11327284.0, "step": 2177, "train/ce_loss": 5.713675363949733e-06 }, { "epoch": 0.21524619339529366, "step": 2177, "train/sim_loss": 0.078125 }, { "epoch": 0.21524619339529366, "step": 2177, "train/total_loss": 0.07812557369470596 }, { "entropy": 9.833122253417969, "epoch": 0.21534506624480917, "mean_token_accuracy": 0.7629233598709106, "num_tokens": 11332247.0, "step": 2178, "train/ce_loss": 3.8449916246463545e-06 }, { "epoch": 0.21534506624480917, "step": 2178, "train/sim_loss": 0.03125 }, { "epoch": 0.21534506624480917, "step": 2178, "train/total_loss": 0.03125038370490074 }, { "entropy": 9.060279846191406, "epoch": 0.2154439390943247, "mean_token_accuracy": 0.7414247989654541, "num_tokens": 11337479.0, "step": 2179, "train/ce_loss": 0.8337211608886719 }, { "epoch": 0.2154439390943247, "step": 2179, "train/sim_loss": 0.1171875 }, { "epoch": 0.2154439390943247, "step": 2179, "train/total_loss": 0.2005596160888672 }, { "epoch": 0.21554281194384023, "grad_norm": 0.9379047751426697, "learning_rate": 9.463729417000446e-06, "loss": 0.1581, "step": 2180 }, { "entropy": 9.990635871887207, "epoch": 0.21554281194384023, "mean_token_accuracy": 0.7019438147544861, "num_tokens": 11342383.0, "step": 2180, "train/ce_loss": 2.821929454803467 }, { "epoch": 0.21554281194384023, "step": 2180, "train/sim_loss": 0.05859375 }, { "epoch": 0.21554281194384023, "step": 2180, "train/total_loss": 0.3407866954803467 }, { "entropy": 9.707884788513184, "epoch": 0.21564168479335574, "mean_token_accuracy": 0.6892489194869995, "num_tokens": 11347519.0, "step": 2181, "train/ce_loss": 2.8116862722527003e-06 }, { "epoch": 0.21564168479335574, "step": 2181, "train/sim_loss": 0.08203125 }, { "epoch": 0.21564168479335574, "step": 2181, "train/total_loss": 0.08203153312206268 }, { "entropy": 9.095956802368164, "epoch": 0.21574055764287126, "mean_token_accuracy": 0.7733773589134216, "num_tokens": 11352908.0, "step": 2182, "train/ce_loss": 0.8760699033737183 }, { "epoch": 0.21574055764287126, "step": 2182, "train/sim_loss": 0.09375 }, { "epoch": 0.21574055764287126, "step": 2182, "train/total_loss": 0.1813569962978363 }, { "entropy": 9.073506355285645, "epoch": 0.2158394304923868, "mean_token_accuracy": 0.778372585773468, "num_tokens": 11358491.0, "step": 2183, "train/ce_loss": 0.340642511844635 }, { "epoch": 0.2158394304923868, "step": 2183, "train/sim_loss": 0.03125 }, { "epoch": 0.2158394304923868, "step": 2183, "train/total_loss": 0.06531424820423126 }, { "entropy": 9.381427764892578, "epoch": 0.2159383033419023, "mean_token_accuracy": 0.7582128643989563, "num_tokens": 11363764.0, "step": 2184, "train/ce_loss": 2.680990064618527e-06 }, { "epoch": 0.2159383033419023, "step": 2184, "train/sim_loss": 0.0625 }, { "epoch": 0.2159383033419023, "step": 2184, "train/total_loss": 0.06250026822090149 }, { "entropy": 9.285751342773438, "epoch": 0.21603717619141782, "mean_token_accuracy": 0.6935867071151733, "num_tokens": 11369103.0, "step": 2185, "train/ce_loss": 0.6994189620018005 }, { "epoch": 0.21603717619141782, "step": 2185, "train/sim_loss": 0.10546875 }, { "epoch": 0.21603717619141782, "step": 2185, "train/total_loss": 0.175410658121109 }, { "entropy": 9.424877166748047, "epoch": 0.21613604904093336, "mean_token_accuracy": 0.7536704540252686, "num_tokens": 11374159.0, "step": 2186, "train/ce_loss": 0.9362339973449707 }, { "epoch": 0.21613604904093336, "step": 2186, "train/sim_loss": 0.1015625 }, { "epoch": 0.21613604904093336, "step": 2186, "train/total_loss": 0.19518589973449707 }, { "entropy": 9.593955993652344, "epoch": 0.21623492189044888, "mean_token_accuracy": 0.7074927687644958, "num_tokens": 11379250.0, "step": 2187, "train/ce_loss": 2.730981577769853e-06 }, { "epoch": 0.21623492189044888, "step": 2187, "train/sim_loss": 0.0234375 }, { "epoch": 0.21623492189044888, "step": 2187, "train/total_loss": 0.023437773808836937 }, { "entropy": 9.340346336364746, "epoch": 0.21633379473996442, "mean_token_accuracy": 0.7113665342330933, "num_tokens": 11384501.0, "step": 2188, "train/ce_loss": 1.0747685432434082 }, { "epoch": 0.21633379473996442, "step": 2188, "train/sim_loss": 0.078125 }, { "epoch": 0.21633379473996442, "step": 2188, "train/total_loss": 0.1856018602848053 }, { "entropy": 9.467276573181152, "epoch": 0.21643266758947993, "mean_token_accuracy": 0.7388059496879578, "num_tokens": 11389935.0, "step": 2189, "train/ce_loss": 0.872168242931366 }, { "epoch": 0.21643266758947993, "step": 2189, "train/sim_loss": 0.08203125 }, { "epoch": 0.21643266758947993, "step": 2189, "train/total_loss": 0.1692480742931366 }, { "entropy": 9.654823303222656, "epoch": 0.21653154043899545, "mean_token_accuracy": 0.7686567306518555, "num_tokens": 11394899.0, "step": 2190, "train/ce_loss": 0.628430962562561 }, { "epoch": 0.21653154043899545, "step": 2190, "train/sim_loss": 0.03125 }, { "epoch": 0.21653154043899545, "step": 2190, "train/total_loss": 0.09409309923648834 }, { "entropy": 10.016326904296875, "epoch": 0.216630413288511, "mean_token_accuracy": 0.7416666746139526, "num_tokens": 11399780.0, "step": 2191, "train/ce_loss": 2.7722220420837402 }, { "epoch": 0.216630413288511, "step": 2191, "train/sim_loss": 0.109375 }, { "epoch": 0.216630413288511, "step": 2191, "train/total_loss": 0.386597216129303 }, { "entropy": 9.48812484741211, "epoch": 0.2167292861380265, "mean_token_accuracy": 0.6918518543243408, "num_tokens": 11404881.0, "step": 2192, "train/ce_loss": 2.0111641883850098 }, { "epoch": 0.2167292861380265, "step": 2192, "train/sim_loss": 0.0859375 }, { "epoch": 0.2167292861380265, "step": 2192, "train/total_loss": 0.2870539426803589 }, { "entropy": 8.953411102294922, "epoch": 0.216828158987542, "mean_token_accuracy": 0.7527749538421631, "num_tokens": 11410374.0, "step": 2193, "train/ce_loss": 0.8288949728012085 }, { "epoch": 0.216828158987542, "step": 2193, "train/sim_loss": 0.06640625 }, { "epoch": 0.216828158987542, "step": 2193, "train/total_loss": 0.14929574728012085 }, { "entropy": 9.059518814086914, "epoch": 0.21692703183705755, "mean_token_accuracy": 0.7093712687492371, "num_tokens": 11415688.0, "step": 2194, "train/ce_loss": 0.7937802076339722 }, { "epoch": 0.21692703183705755, "step": 2194, "train/sim_loss": 0.1484375 }, { "epoch": 0.21692703183705755, "step": 2194, "train/total_loss": 0.22781552374362946 }, { "entropy": 9.807968139648438, "epoch": 0.21702590468657307, "mean_token_accuracy": 0.7543859481811523, "num_tokens": 11420710.0, "step": 2195, "train/ce_loss": 1.0085327625274658 }, { "epoch": 0.21702590468657307, "step": 2195, "train/sim_loss": 0.05078125 }, { "epoch": 0.21702590468657307, "step": 2195, "train/total_loss": 0.15163452923297882 }, { "entropy": 9.237098693847656, "epoch": 0.21712477753608858, "mean_token_accuracy": 0.7193675637245178, "num_tokens": 11425935.0, "step": 2196, "train/ce_loss": 0.41018494963645935 }, { "epoch": 0.21712477753608858, "step": 2196, "train/sim_loss": 0.07421875 }, { "epoch": 0.21712477753608858, "step": 2196, "train/total_loss": 0.11523725092411041 }, { "entropy": 9.309840202331543, "epoch": 0.21722365038560412, "mean_token_accuracy": 0.6972891688346863, "num_tokens": 11431040.0, "step": 2197, "train/ce_loss": 1.0857893228530884 }, { "epoch": 0.21722365038560412, "step": 2197, "train/sim_loss": 0.06640625 }, { "epoch": 0.21722365038560412, "step": 2197, "train/total_loss": 0.17498518526554108 }, { "entropy": 9.340157508850098, "epoch": 0.21732252323511964, "mean_token_accuracy": 0.7953668236732483, "num_tokens": 11436233.0, "step": 2198, "train/ce_loss": 0.7481324672698975 }, { "epoch": 0.21732252323511964, "step": 2198, "train/sim_loss": 0.05859375 }, { "epoch": 0.21732252323511964, "step": 2198, "train/total_loss": 0.13340699672698975 }, { "entropy": 9.236837387084961, "epoch": 0.21742139608463515, "mean_token_accuracy": 0.7602339386940002, "num_tokens": 11441538.0, "step": 2199, "train/ce_loss": 0.4316747188568115 }, { "epoch": 0.21742139608463515, "step": 2199, "train/sim_loss": 0.0859375 }, { "epoch": 0.21742139608463515, "step": 2199, "train/total_loss": 0.12910497188568115 }, { "epoch": 0.2175202689341507, "grad_norm": 0.8440031409263611, "learning_rate": 9.458784552242498e-06, "loss": 0.1549, "step": 2200 }, { "entropy": 9.14334487915039, "epoch": 0.2175202689341507, "mean_token_accuracy": 0.7280248403549194, "num_tokens": 11446957.0, "step": 2200, "train/ce_loss": 0.8459938764572144 }, { "epoch": 0.2175202689341507, "step": 2200, "train/sim_loss": 0.07421875 }, { "epoch": 0.2175202689341507, "step": 2200, "train/total_loss": 0.15881814062595367 }, { "entropy": 9.404414176940918, "epoch": 0.2176191417836662, "mean_token_accuracy": 0.6830188632011414, "num_tokens": 11452210.0, "step": 2201, "train/ce_loss": 0.4240676462650299 }, { "epoch": 0.2176191417836662, "step": 2201, "train/sim_loss": 0.0546875 }, { "epoch": 0.2176191417836662, "step": 2201, "train/total_loss": 0.09709426760673523 }, { "entropy": 8.77522087097168, "epoch": 0.21771801463318172, "mean_token_accuracy": 0.751960813999176, "num_tokens": 11457701.0, "step": 2202, "train/ce_loss": 0.7309415936470032 }, { "epoch": 0.21771801463318172, "step": 2202, "train/sim_loss": 0.0234375 }, { "epoch": 0.21771801463318172, "step": 2202, "train/total_loss": 0.09653165936470032 }, { "entropy": 9.528435707092285, "epoch": 0.21781688748269726, "mean_token_accuracy": 0.7116212248802185, "num_tokens": 11462797.0, "step": 2203, "train/ce_loss": 1.2283047437667847 }, { "epoch": 0.21781688748269726, "step": 2203, "train/sim_loss": 0.09375 }, { "epoch": 0.21781688748269726, "step": 2203, "train/total_loss": 0.21658048033714294 }, { "entropy": 9.46872329711914, "epoch": 0.21791576033221277, "mean_token_accuracy": 0.6633416414260864, "num_tokens": 11468050.0, "step": 2204, "train/ce_loss": 0.693771481513977 }, { "epoch": 0.21791576033221277, "step": 2204, "train/sim_loss": 0.0234375 }, { "epoch": 0.21791576033221277, "step": 2204, "train/total_loss": 0.09281464666128159 }, { "entropy": 9.334493637084961, "epoch": 0.21801463318172828, "mean_token_accuracy": 0.6779448390007019, "num_tokens": 11473282.0, "step": 2205, "train/ce_loss": 0.7677831053733826 }, { "epoch": 0.21801463318172828, "step": 2205, "train/sim_loss": 0.0703125 }, { "epoch": 0.21801463318172828, "step": 2205, "train/total_loss": 0.1470908224582672 }, { "entropy": 10.113032341003418, "epoch": 0.21811350603124383, "mean_token_accuracy": 0.7927711009979248, "num_tokens": 11478115.0, "step": 2206, "train/ce_loss": 1.3351314919418655e-05 }, { "epoch": 0.21811350603124383, "step": 2206, "train/sim_loss": 0.04296875 }, { "epoch": 0.21811350603124383, "step": 2206, "train/total_loss": 0.04297008365392685 }, { "entropy": 8.985885620117188, "epoch": 0.21821237888075934, "mean_token_accuracy": 0.8191377520561218, "num_tokens": 11483524.0, "step": 2207, "train/ce_loss": 0.4432702958583832 }, { "epoch": 0.21821237888075934, "step": 2207, "train/sim_loss": 0.0234375 }, { "epoch": 0.21821237888075934, "step": 2207, "train/total_loss": 0.0677645355463028 }, { "entropy": 8.988540649414062, "epoch": 0.21831125173027488, "mean_token_accuracy": 0.7454954981803894, "num_tokens": 11488894.0, "step": 2208, "train/ce_loss": 0.8499881029129028 }, { "epoch": 0.21831125173027488, "step": 2208, "train/sim_loss": 0.046875 }, { "epoch": 0.21831125173027488, "step": 2208, "train/total_loss": 0.13187381625175476 }, { "entropy": 9.603333473205566, "epoch": 0.2184101245797904, "mean_token_accuracy": 0.694868266582489, "num_tokens": 11494043.0, "step": 2209, "train/ce_loss": 0.9686959981918335 }, { "epoch": 0.2184101245797904, "step": 2209, "train/sim_loss": 0.1015625 }, { "epoch": 0.2184101245797904, "step": 2209, "train/total_loss": 0.1984321027994156 }, { "entropy": 9.524908065795898, "epoch": 0.2185089974293059, "mean_token_accuracy": 0.699999988079071, "num_tokens": 11499168.0, "step": 2210, "train/ce_loss": 0.6224838495254517 }, { "epoch": 0.2185089974293059, "step": 2210, "train/sim_loss": 0.0625 }, { "epoch": 0.2185089974293059, "step": 2210, "train/total_loss": 0.12474838644266129 }, { "entropy": 9.155143737792969, "epoch": 0.21860787027882145, "mean_token_accuracy": 0.8020833134651184, "num_tokens": 11504503.0, "step": 2211, "train/ce_loss": 0.6591212749481201 }, { "epoch": 0.21860787027882145, "step": 2211, "train/sim_loss": 0.0234375 }, { "epoch": 0.21860787027882145, "step": 2211, "train/total_loss": 0.08934962749481201 }, { "entropy": 9.740113258361816, "epoch": 0.21870674312833696, "mean_token_accuracy": 0.7446808218955994, "num_tokens": 11509505.0, "step": 2212, "train/ce_loss": 1.76637864112854 }, { "epoch": 0.21870674312833696, "step": 2212, "train/sim_loss": 0.04296875 }, { "epoch": 0.21870674312833696, "step": 2212, "train/total_loss": 0.21960662305355072 }, { "entropy": 10.373140335083008, "epoch": 0.21880561597785247, "mean_token_accuracy": 0.7290970087051392, "num_tokens": 11514212.0, "step": 2213, "train/ce_loss": 7.3785072345344815e-06 }, { "epoch": 0.21880561597785247, "step": 2213, "train/sim_loss": 0.05859375 }, { "epoch": 0.21880561597785247, "step": 2213, "train/total_loss": 0.058594487607479095 }, { "entropy": 8.925299644470215, "epoch": 0.21890448882736802, "mean_token_accuracy": 0.7633745074272156, "num_tokens": 11519663.0, "step": 2214, "train/ce_loss": 0.9736812710762024 }, { "epoch": 0.21890448882736802, "step": 2214, "train/sim_loss": 0.125 }, { "epoch": 0.21890448882736802, "step": 2214, "train/total_loss": 0.22236812114715576 }, { "entropy": 9.144561767578125, "epoch": 0.21900336167688353, "mean_token_accuracy": 0.7680690288543701, "num_tokens": 11525043.0, "step": 2215, "train/ce_loss": 0.9562681317329407 }, { "epoch": 0.21900336167688353, "step": 2215, "train/sim_loss": 0.06640625 }, { "epoch": 0.21900336167688353, "step": 2215, "train/total_loss": 0.1620330661535263 }, { "entropy": 9.364012718200684, "epoch": 0.21910223452639904, "mean_token_accuracy": 0.6739690899848938, "num_tokens": 11530351.0, "step": 2216, "train/ce_loss": 1.0492823123931885 }, { "epoch": 0.21910223452639904, "step": 2216, "train/sim_loss": 0.078125 }, { "epoch": 0.21910223452639904, "step": 2216, "train/total_loss": 0.18305322527885437 }, { "entropy": 9.427347183227539, "epoch": 0.21920110737591458, "mean_token_accuracy": 0.7275280952453613, "num_tokens": 11535521.0, "step": 2217, "train/ce_loss": 4.278482265362982e-06 }, { "epoch": 0.21920110737591458, "step": 2217, "train/sim_loss": 0.07421875 }, { "epoch": 0.21920110737591458, "step": 2217, "train/total_loss": 0.07421917468309402 }, { "entropy": 9.088309288024902, "epoch": 0.2192999802254301, "mean_token_accuracy": 0.6828012466430664, "num_tokens": 11540934.0, "step": 2218, "train/ce_loss": 1.5599801540374756 }, { "epoch": 0.2192999802254301, "step": 2218, "train/sim_loss": 0.0859375 }, { "epoch": 0.2192999802254301, "step": 2218, "train/total_loss": 0.24193552136421204 }, { "entropy": 9.686138153076172, "epoch": 0.2193988530749456, "mean_token_accuracy": 0.7630252242088318, "num_tokens": 11545959.0, "step": 2219, "train/ce_loss": 2.031754970550537 }, { "epoch": 0.2193988530749456, "step": 2219, "train/sim_loss": 0.0859375 }, { "epoch": 0.2193988530749456, "step": 2219, "train/total_loss": 0.28911298513412476 }, { "epoch": 0.21949772592446115, "grad_norm": 0.9843518733978271, "learning_rate": 9.453839687484548e-06, "loss": 0.1545, "step": 2220 }, { "entropy": 9.309232711791992, "epoch": 0.21949772592446115, "mean_token_accuracy": 0.7461629509925842, "num_tokens": 11551312.0, "step": 2220, "train/ce_loss": 1.0132420063018799 }, { "epoch": 0.21949772592446115, "step": 2220, "train/sim_loss": 0.0625 }, { "epoch": 0.21949772592446115, "step": 2220, "train/total_loss": 0.163824200630188 }, { "entropy": 9.845184326171875, "epoch": 0.21959659877397666, "mean_token_accuracy": 0.6597077250480652, "num_tokens": 11556192.0, "step": 2221, "train/ce_loss": 1.2175583839416504 }, { "epoch": 0.21959659877397666, "step": 2221, "train/sim_loss": 0.0703125 }, { "epoch": 0.21959659877397666, "step": 2221, "train/total_loss": 0.19206833839416504 }, { "entropy": 9.62173080444336, "epoch": 0.21969547162349218, "mean_token_accuracy": 0.7169811129570007, "num_tokens": 11561289.0, "step": 2222, "train/ce_loss": 1.0519672632217407 }, { "epoch": 0.21969547162349218, "step": 2222, "train/sim_loss": 0.08203125 }, { "epoch": 0.21969547162349218, "step": 2222, "train/total_loss": 0.1872279793024063 }, { "entropy": 9.193510055541992, "epoch": 0.21979434447300772, "mean_token_accuracy": 0.7065337896347046, "num_tokens": 11566685.0, "step": 2223, "train/ce_loss": 1.1219364404678345 }, { "epoch": 0.21979434447300772, "step": 2223, "train/sim_loss": 0.09375 }, { "epoch": 0.21979434447300772, "step": 2223, "train/total_loss": 0.20594364404678345 }, { "entropy": 9.803560256958008, "epoch": 0.21989321732252323, "mean_token_accuracy": 0.7218543291091919, "num_tokens": 11571724.0, "step": 2224, "train/ce_loss": 1.250420331954956 }, { "epoch": 0.21989321732252323, "step": 2224, "train/sim_loss": 0.0703125 }, { "epoch": 0.21989321732252323, "step": 2224, "train/total_loss": 0.19535453617572784 }, { "entropy": 9.618412971496582, "epoch": 0.21999209017203875, "mean_token_accuracy": 0.7300613522529602, "num_tokens": 11576830.0, "step": 2225, "train/ce_loss": 1.8763059415505268e-05 }, { "epoch": 0.21999209017203875, "step": 2225, "train/sim_loss": 0.08984375 }, { "epoch": 0.21999209017203875, "step": 2225, "train/total_loss": 0.08984562754631042 }, { "entropy": 9.272396087646484, "epoch": 0.2200909630215543, "mean_token_accuracy": 0.6992574334144592, "num_tokens": 11582136.0, "step": 2226, "train/ce_loss": 0.5853594541549683 }, { "epoch": 0.2200909630215543, "step": 2226, "train/sim_loss": 0.0703125 }, { "epoch": 0.2200909630215543, "step": 2226, "train/total_loss": 0.12884844839572906 }, { "entropy": 9.071741104125977, "epoch": 0.2201898358710698, "mean_token_accuracy": 0.778969943523407, "num_tokens": 11587575.0, "step": 2227, "train/ce_loss": 0.5541576147079468 }, { "epoch": 0.2201898358710698, "step": 2227, "train/sim_loss": 0.03125 }, { "epoch": 0.2201898358710698, "step": 2227, "train/total_loss": 0.08666576445102692 }, { "entropy": 9.476241111755371, "epoch": 0.22028870872058534, "mean_token_accuracy": 0.70071941614151, "num_tokens": 11592776.0, "step": 2228, "train/ce_loss": 0.9042754173278809 }, { "epoch": 0.22028870872058534, "step": 2228, "train/sim_loss": 0.0703125 }, { "epoch": 0.22028870872058534, "step": 2228, "train/total_loss": 0.16074004769325256 }, { "entropy": 9.842856407165527, "epoch": 0.22038758157010085, "mean_token_accuracy": 0.7243697643280029, "num_tokens": 11597818.0, "step": 2229, "train/ce_loss": 4.030011496070074e-06 }, { "epoch": 0.22038758157010085, "step": 2229, "train/sim_loss": 0.0625 }, { "epoch": 0.22038758157010085, "step": 2229, "train/total_loss": 0.06250040233135223 }, { "entropy": 9.958674430847168, "epoch": 0.22048645441961637, "mean_token_accuracy": 0.7037037014961243, "num_tokens": 11602640.0, "step": 2230, "train/ce_loss": 6.45430336589925e-05 }, { "epoch": 0.22048645441961637, "step": 2230, "train/sim_loss": 0.04296875 }, { "epoch": 0.22048645441961637, "step": 2230, "train/total_loss": 0.042975205928087234 }, { "entropy": 8.876422882080078, "epoch": 0.2205853272691319, "mean_token_accuracy": 0.7611111402511597, "num_tokens": 11608034.0, "step": 2231, "train/ce_loss": 0.5030422806739807 }, { "epoch": 0.2205853272691319, "step": 2231, "train/sim_loss": 0.02734375 }, { "epoch": 0.2205853272691319, "step": 2231, "train/total_loss": 0.07764798402786255 }, { "entropy": 8.97739028930664, "epoch": 0.22068420011864742, "mean_token_accuracy": 0.7794561982154846, "num_tokens": 11613469.0, "step": 2232, "train/ce_loss": 0.6213583946228027 }, { "epoch": 0.22068420011864742, "step": 2232, "train/sim_loss": 0.02734375 }, { "epoch": 0.22068420011864742, "step": 2232, "train/total_loss": 0.08947959542274475 }, { "entropy": 9.215559005737305, "epoch": 0.22078307296816294, "mean_token_accuracy": 0.7322834730148315, "num_tokens": 11618844.0, "step": 2233, "train/ce_loss": 0.7574310898780823 }, { "epoch": 0.22078307296816294, "step": 2233, "train/sim_loss": 0.07421875 }, { "epoch": 0.22078307296816294, "step": 2233, "train/total_loss": 0.14996185898780823 }, { "entropy": 9.45633316040039, "epoch": 0.22088194581767848, "mean_token_accuracy": 0.7794729471206665, "num_tokens": 11624024.0, "step": 2234, "train/ce_loss": 0.6323754787445068 }, { "epoch": 0.22088194581767848, "step": 2234, "train/sim_loss": 0.02734375 }, { "epoch": 0.22088194581767848, "step": 2234, "train/total_loss": 0.09058129787445068 }, { "entropy": 9.645082473754883, "epoch": 0.220980818667194, "mean_token_accuracy": 0.7199312448501587, "num_tokens": 11629020.0, "step": 2235, "train/ce_loss": 0.8218998908996582 }, { "epoch": 0.220980818667194, "step": 2235, "train/sim_loss": 0.09375 }, { "epoch": 0.220980818667194, "step": 2235, "train/total_loss": 0.17593999207019806 }, { "entropy": 9.402721405029297, "epoch": 0.2210796915167095, "mean_token_accuracy": 0.6622516512870789, "num_tokens": 11634182.0, "step": 2236, "train/ce_loss": 0.9104689359664917 }, { "epoch": 0.2210796915167095, "step": 2236, "train/sim_loss": 0.08984375 }, { "epoch": 0.2210796915167095, "step": 2236, "train/total_loss": 0.18089064955711365 }, { "entropy": 9.542716979980469, "epoch": 0.22117856436622504, "mean_token_accuracy": 0.7368420958518982, "num_tokens": 11639348.0, "step": 2237, "train/ce_loss": 0.8644982576370239 }, { "epoch": 0.22117856436622504, "step": 2237, "train/sim_loss": 0.03515625 }, { "epoch": 0.22117856436622504, "step": 2237, "train/total_loss": 0.12160607427358627 }, { "entropy": 9.052597045898438, "epoch": 0.22127743721574056, "mean_token_accuracy": 0.7108141183853149, "num_tokens": 11644629.0, "step": 2238, "train/ce_loss": 0.8265287280082703 }, { "epoch": 0.22127743721574056, "step": 2238, "train/sim_loss": 0.06640625 }, { "epoch": 0.22127743721574056, "step": 2238, "train/total_loss": 0.14905911684036255 }, { "entropy": 8.985305786132812, "epoch": 0.22137631006525607, "mean_token_accuracy": 0.7151966094970703, "num_tokens": 11650015.0, "step": 2239, "train/ce_loss": 0.9002116918563843 }, { "epoch": 0.22137631006525607, "step": 2239, "train/sim_loss": 0.078125 }, { "epoch": 0.22137631006525607, "step": 2239, "train/total_loss": 0.16814616322517395 }, { "epoch": 0.2214751829147716, "grad_norm": 0.9024277925491333, "learning_rate": 9.4488948227266e-06, "loss": 0.1592, "step": 2240 }, { "entropy": 9.394206047058105, "epoch": 0.2214751829147716, "mean_token_accuracy": 0.6993548274040222, "num_tokens": 11655258.0, "step": 2240, "train/ce_loss": 0.9053270816802979 }, { "epoch": 0.2214751829147716, "step": 2240, "train/sim_loss": 0.09765625 }, { "epoch": 0.2214751829147716, "step": 2240, "train/total_loss": 0.18818897008895874 }, { "entropy": 9.000853538513184, "epoch": 0.22157405576428713, "mean_token_accuracy": 0.7426556944847107, "num_tokens": 11660591.0, "step": 2241, "train/ce_loss": 0.8556745648384094 }, { "epoch": 0.22157405576428713, "step": 2241, "train/sim_loss": 0.1015625 }, { "epoch": 0.22157405576428713, "step": 2241, "train/total_loss": 0.18712995946407318 }, { "entropy": 8.76073169708252, "epoch": 0.22167292861380264, "mean_token_accuracy": 0.6751313209533691, "num_tokens": 11666227.0, "step": 2242, "train/ce_loss": 0.46014127135276794 }, { "epoch": 0.22167292861380264, "step": 2242, "train/sim_loss": 0.03125 }, { "epoch": 0.22167292861380264, "step": 2242, "train/total_loss": 0.07726413011550903 }, { "entropy": 9.666487693786621, "epoch": 0.22177180146331818, "mean_token_accuracy": 0.7293233275413513, "num_tokens": 11671206.0, "step": 2243, "train/ce_loss": 1.7687726020812988 }, { "epoch": 0.22177180146331818, "step": 2243, "train/sim_loss": 0.06640625 }, { "epoch": 0.22177180146331818, "step": 2243, "train/total_loss": 0.24328351020812988 }, { "entropy": 9.189653396606445, "epoch": 0.2218706743128337, "mean_token_accuracy": 0.752077579498291, "num_tokens": 11676373.0, "step": 2244, "train/ce_loss": 0.7033227682113647 }, { "epoch": 0.2218706743128337, "step": 2244, "train/sim_loss": 0.05859375 }, { "epoch": 0.2218706743128337, "step": 2244, "train/total_loss": 0.12892603874206543 }, { "entropy": 9.874273300170898, "epoch": 0.2219695471623492, "mean_token_accuracy": 0.8305785059928894, "num_tokens": 11681287.0, "step": 2245, "train/ce_loss": 0.964332640171051 }, { "epoch": 0.2219695471623492, "step": 2245, "train/sim_loss": 0.078125 }, { "epoch": 0.2219695471623492, "step": 2245, "train/total_loss": 0.17455826699733734 }, { "entropy": 9.143362045288086, "epoch": 0.22206842001186475, "mean_token_accuracy": 0.7358943819999695, "num_tokens": 11686593.0, "step": 2246, "train/ce_loss": 0.8456717729568481 }, { "epoch": 0.22206842001186475, "step": 2246, "train/sim_loss": 0.046875 }, { "epoch": 0.22206842001186475, "step": 2246, "train/total_loss": 0.13144218921661377 }, { "entropy": 8.868471145629883, "epoch": 0.22216729286138026, "mean_token_accuracy": 0.7589852213859558, "num_tokens": 11691989.0, "step": 2247, "train/ce_loss": 0.5861417651176453 }, { "epoch": 0.22216729286138026, "step": 2247, "train/sim_loss": 0.03125 }, { "epoch": 0.22216729286138026, "step": 2247, "train/total_loss": 0.08986417949199677 }, { "entropy": 9.47134017944336, "epoch": 0.22226616571089577, "mean_token_accuracy": 0.7125172019004822, "num_tokens": 11697161.0, "step": 2248, "train/ce_loss": 0.8098688721656799 }, { "epoch": 0.22226616571089577, "step": 2248, "train/sim_loss": 0.08984375 }, { "epoch": 0.22226616571089577, "step": 2248, "train/total_loss": 0.170830637216568 }, { "entropy": 9.266434669494629, "epoch": 0.22236503856041132, "mean_token_accuracy": 0.7151898741722107, "num_tokens": 11702424.0, "step": 2249, "train/ce_loss": 0.3487287163734436 }, { "epoch": 0.22236503856041132, "step": 2249, "train/sim_loss": 0.05078125 }, { "epoch": 0.22236503856041132, "step": 2249, "train/total_loss": 0.0856541246175766 }, { "entropy": 8.874809265136719, "epoch": 0.22246391140992683, "mean_token_accuracy": 0.8147773146629333, "num_tokens": 11707903.0, "step": 2250, "train/ce_loss": 1.0702500343322754 }, { "epoch": 0.22246391140992683, "step": 2250, "train/sim_loss": 0.09375 }, { "epoch": 0.22246391140992683, "step": 2250, "train/total_loss": 0.20077499747276306 }, { "entropy": 9.210561752319336, "epoch": 0.22256278425944237, "mean_token_accuracy": 0.718826413154602, "num_tokens": 11713170.0, "step": 2251, "train/ce_loss": 1.1450979709625244 }, { "epoch": 0.22256278425944237, "step": 2251, "train/sim_loss": 0.03515625 }, { "epoch": 0.22256278425944237, "step": 2251, "train/total_loss": 0.14966604113578796 }, { "entropy": 9.203631401062012, "epoch": 0.22266165710895788, "mean_token_accuracy": 0.7115384340286255, "num_tokens": 11718495.0, "step": 2252, "train/ce_loss": 0.9820226430892944 }, { "epoch": 0.22266165710895788, "step": 2252, "train/sim_loss": 0.06640625 }, { "epoch": 0.22266165710895788, "step": 2252, "train/total_loss": 0.16460850834846497 }, { "entropy": 9.035469055175781, "epoch": 0.2227605299584734, "mean_token_accuracy": 0.6905737519264221, "num_tokens": 11723907.0, "step": 2253, "train/ce_loss": 0.7407964468002319 }, { "epoch": 0.2227605299584734, "step": 2253, "train/sim_loss": 0.078125 }, { "epoch": 0.2227605299584734, "step": 2253, "train/total_loss": 0.15220464766025543 }, { "entropy": 9.292431831359863, "epoch": 0.22285940280798894, "mean_token_accuracy": 0.7409090995788574, "num_tokens": 11729054.0, "step": 2254, "train/ce_loss": 6.4857699726417195e-06 }, { "epoch": 0.22285940280798894, "step": 2254, "train/sim_loss": 0.0390625 }, { "epoch": 0.22285940280798894, "step": 2254, "train/total_loss": 0.03906314820051193 }, { "entropy": 9.472380638122559, "epoch": 0.22295827565750445, "mean_token_accuracy": 0.7387140989303589, "num_tokens": 11734131.0, "step": 2255, "train/ce_loss": 1.2210959196090698 }, { "epoch": 0.22295827565750445, "step": 2255, "train/sim_loss": 0.0703125 }, { "epoch": 0.22295827565750445, "step": 2255, "train/total_loss": 0.19242209196090698 }, { "entropy": 8.898763656616211, "epoch": 0.22305714850701996, "mean_token_accuracy": 0.6935166716575623, "num_tokens": 11739640.0, "step": 2256, "train/ce_loss": 0.6935907006263733 }, { "epoch": 0.22305714850701996, "step": 2256, "train/sim_loss": 0.0546875 }, { "epoch": 0.22305714850701996, "step": 2256, "train/total_loss": 0.12404657155275345 }, { "entropy": 9.321701049804688, "epoch": 0.2231560213565355, "mean_token_accuracy": 0.7052631378173828, "num_tokens": 11744824.0, "step": 2257, "train/ce_loss": 0.7226592898368835 }, { "epoch": 0.2231560213565355, "step": 2257, "train/sim_loss": 0.0546875 }, { "epoch": 0.2231560213565355, "step": 2257, "train/total_loss": 0.12695342302322388 }, { "entropy": 9.250961303710938, "epoch": 0.22325489420605102, "mean_token_accuracy": 0.7439724206924438, "num_tokens": 11750153.0, "step": 2258, "train/ce_loss": 1.0045374631881714 }, { "epoch": 0.22325489420605102, "step": 2258, "train/sim_loss": 0.0703125 }, { "epoch": 0.22325489420605102, "step": 2258, "train/total_loss": 0.17076624929904938 }, { "entropy": 9.526915550231934, "epoch": 0.22335376705556653, "mean_token_accuracy": 0.7809917330741882, "num_tokens": 11755325.0, "step": 2259, "train/ce_loss": 1.122484564781189 }, { "epoch": 0.22335376705556653, "step": 2259, "train/sim_loss": 0.09375 }, { "epoch": 0.22335376705556653, "step": 2259, "train/total_loss": 0.20599845051765442 }, { "epoch": 0.22345263990508207, "grad_norm": 0.7781448364257812, "learning_rate": 9.44394995796865e-06, "loss": 0.1563, "step": 2260 }, { "entropy": 9.540910720825195, "epoch": 0.22345263990508207, "mean_token_accuracy": 0.6835616230964661, "num_tokens": 11760471.0, "step": 2260, "train/ce_loss": 1.0501680374145508 }, { "epoch": 0.22345263990508207, "step": 2260, "train/sim_loss": 0.08984375 }, { "epoch": 0.22345263990508207, "step": 2260, "train/total_loss": 0.1948605477809906 }, { "entropy": 9.239645004272461, "epoch": 0.2235515127545976, "mean_token_accuracy": 0.725261926651001, "num_tokens": 11765819.0, "step": 2261, "train/ce_loss": 1.27482271194458 }, { "epoch": 0.2235515127545976, "step": 2261, "train/sim_loss": 0.1171875 }, { "epoch": 0.2235515127545976, "step": 2261, "train/total_loss": 0.24466978013515472 }, { "entropy": 8.915700912475586, "epoch": 0.2236503856041131, "mean_token_accuracy": 0.7682570815086365, "num_tokens": 11771331.0, "step": 2262, "train/ce_loss": 0.6882832646369934 }, { "epoch": 0.2236503856041131, "step": 2262, "train/sim_loss": 0.03515625 }, { "epoch": 0.2236503856041131, "step": 2262, "train/total_loss": 0.10398457944393158 }, { "entropy": 9.431346893310547, "epoch": 0.22374925845362864, "mean_token_accuracy": 0.7694235444068909, "num_tokens": 11776578.0, "step": 2263, "train/ce_loss": 5.010083441447932e-06 }, { "epoch": 0.22374925845362864, "step": 2263, "train/sim_loss": 0.03125 }, { "epoch": 0.22374925845362864, "step": 2263, "train/total_loss": 0.031250499188899994 }, { "entropy": 9.091398239135742, "epoch": 0.22384813130314415, "mean_token_accuracy": 0.7158351540565491, "num_tokens": 11781960.0, "step": 2264, "train/ce_loss": 0.6672316193580627 }, { "epoch": 0.22384813130314415, "step": 2264, "train/sim_loss": 0.06640625 }, { "epoch": 0.22384813130314415, "step": 2264, "train/total_loss": 0.13312941789627075 }, { "entropy": 9.166069984436035, "epoch": 0.22394700415265967, "mean_token_accuracy": 0.7060241103172302, "num_tokens": 11787250.0, "step": 2265, "train/ce_loss": 1.1839373111724854 }, { "epoch": 0.22394700415265967, "step": 2265, "train/sim_loss": 0.046875 }, { "epoch": 0.22394700415265967, "step": 2265, "train/total_loss": 0.16526873409748077 }, { "entropy": 9.317614555358887, "epoch": 0.2240458770021752, "mean_token_accuracy": 0.7234042286872864, "num_tokens": 11792515.0, "step": 2266, "train/ce_loss": 1.562074065208435 }, { "epoch": 0.2240458770021752, "step": 2266, "train/sim_loss": 0.08984375 }, { "epoch": 0.2240458770021752, "step": 2266, "train/total_loss": 0.24605116248130798 }, { "entropy": 9.677188873291016, "epoch": 0.22414474985169072, "mean_token_accuracy": 0.704049825668335, "num_tokens": 11797539.0, "step": 2267, "train/ce_loss": 4.742025339510292e-06 }, { "epoch": 0.22414474985169072, "step": 2267, "train/sim_loss": 0.05078125 }, { "epoch": 0.22414474985169072, "step": 2267, "train/total_loss": 0.050781723111867905 }, { "entropy": 9.175324440002441, "epoch": 0.22424362270120624, "mean_token_accuracy": 0.7828004360198975, "num_tokens": 11802872.0, "step": 2268, "train/ce_loss": 0.6030412912368774 }, { "epoch": 0.22424362270120624, "step": 2268, "train/sim_loss": 0.0234375 }, { "epoch": 0.22424362270120624, "step": 2268, "train/total_loss": 0.08374163508415222 }, { "entropy": 9.462089538574219, "epoch": 0.22434249555072178, "mean_token_accuracy": 0.744516134262085, "num_tokens": 11808065.0, "step": 2269, "train/ce_loss": 0.4913155138492584 }, { "epoch": 0.22434249555072178, "step": 2269, "train/sim_loss": 0.078125 }, { "epoch": 0.22434249555072178, "step": 2269, "train/total_loss": 0.12725655734539032 }, { "entropy": 10.407243728637695, "epoch": 0.2244413684002373, "mean_token_accuracy": 0.7772276997566223, "num_tokens": 11812659.0, "step": 2270, "train/ce_loss": 2.864804628188722e-05 }, { "epoch": 0.2244413684002373, "step": 2270, "train/sim_loss": 0.0546875 }, { "epoch": 0.2244413684002373, "step": 2270, "train/total_loss": 0.05469036474823952 }, { "entropy": 9.114701271057129, "epoch": 0.22454024124975283, "mean_token_accuracy": 0.7374461889266968, "num_tokens": 11817856.0, "step": 2271, "train/ce_loss": 0.9672939777374268 }, { "epoch": 0.22454024124975283, "step": 2271, "train/sim_loss": 0.0546875 }, { "epoch": 0.22454024124975283, "step": 2271, "train/total_loss": 0.15141689777374268 }, { "entropy": 9.434842109680176, "epoch": 0.22463911409926834, "mean_token_accuracy": 0.7405914068222046, "num_tokens": 11823046.0, "step": 2272, "train/ce_loss": 0.6265137791633606 }, { "epoch": 0.22463911409926834, "step": 2272, "train/sim_loss": 0.015625 }, { "epoch": 0.22463911409926834, "step": 2272, "train/total_loss": 0.0782763808965683 }, { "entropy": 9.087112426757812, "epoch": 0.22473798694878386, "mean_token_accuracy": 0.7478684782981873, "num_tokens": 11828361.0, "step": 2273, "train/ce_loss": 1.1432479619979858 }, { "epoch": 0.22473798694878386, "step": 2273, "train/sim_loss": 0.04296875 }, { "epoch": 0.22473798694878386, "step": 2273, "train/total_loss": 0.15729355812072754 }, { "entropy": 9.735007286071777, "epoch": 0.2248368597982994, "mean_token_accuracy": 0.7039473652839661, "num_tokens": 11833438.0, "step": 2274, "train/ce_loss": 0.6995331645011902 }, { "epoch": 0.2248368597982994, "step": 2274, "train/sim_loss": 0.0625 }, { "epoch": 0.2248368597982994, "step": 2274, "train/total_loss": 0.1324533224105835 }, { "entropy": 9.384618759155273, "epoch": 0.2249357326478149, "mean_token_accuracy": 0.7997347712516785, "num_tokens": 11838646.0, "step": 2275, "train/ce_loss": 0.5165915489196777 }, { "epoch": 0.2249357326478149, "step": 2275, "train/sim_loss": 0.03125 }, { "epoch": 0.2249357326478149, "step": 2275, "train/total_loss": 0.08290915191173553 }, { "entropy": 8.890913963317871, "epoch": 0.22503460549733043, "mean_token_accuracy": 0.7223942279815674, "num_tokens": 11844105.0, "step": 2276, "train/ce_loss": 1.4923746585845947 }, { "epoch": 0.22503460549733043, "step": 2276, "train/sim_loss": 0.125 }, { "epoch": 0.22503460549733043, "step": 2276, "train/total_loss": 0.2742374539375305 }, { "entropy": 9.439136505126953, "epoch": 0.22513347834684597, "mean_token_accuracy": 0.7201645970344543, "num_tokens": 11849298.0, "step": 2277, "train/ce_loss": 1.2139506340026855 }, { "epoch": 0.22513347834684597, "step": 2277, "train/sim_loss": 0.0703125 }, { "epoch": 0.22513347834684597, "step": 2277, "train/total_loss": 0.1917075663805008 }, { "entropy": 9.488235473632812, "epoch": 0.22523235119636148, "mean_token_accuracy": 0.7834224700927734, "num_tokens": 11854542.0, "step": 2278, "train/ce_loss": 1.1686978340148926 }, { "epoch": 0.22523235119636148, "step": 2278, "train/sim_loss": 0.09765625 }, { "epoch": 0.22523235119636148, "step": 2278, "train/total_loss": 0.21452602744102478 }, { "entropy": 9.137245178222656, "epoch": 0.225331224045877, "mean_token_accuracy": 0.7211764454841614, "num_tokens": 11859830.0, "step": 2279, "train/ce_loss": 1.1765626668930054 }, { "epoch": 0.225331224045877, "step": 2279, "train/sim_loss": 0.08203125 }, { "epoch": 0.225331224045877, "step": 2279, "train/total_loss": 0.19968751072883606 }, { "epoch": 0.22543009689539253, "grad_norm": 0.8348625898361206, "learning_rate": 9.439005093210701e-06, "loss": 0.1487, "step": 2280 }, { "entropy": 9.51508617401123, "epoch": 0.22543009689539253, "mean_token_accuracy": 0.7322946190834045, "num_tokens": 11864989.0, "step": 2280, "train/ce_loss": 0.5634602308273315 }, { "epoch": 0.22543009689539253, "step": 2280, "train/sim_loss": 0.0390625 }, { "epoch": 0.22543009689539253, "step": 2280, "train/total_loss": 0.09540852904319763 }, { "entropy": 8.554391860961914, "epoch": 0.22552896974490805, "mean_token_accuracy": 0.748110830783844, "num_tokens": 11870654.0, "step": 2281, "train/ce_loss": 0.9140045642852783 }, { "epoch": 0.22552896974490805, "step": 2281, "train/sim_loss": 0.125 }, { "epoch": 0.22552896974490805, "step": 2281, "train/total_loss": 0.21640045940876007 }, { "entropy": 9.242164611816406, "epoch": 0.22562784259442356, "mean_token_accuracy": 0.7529411911964417, "num_tokens": 11875902.0, "step": 2282, "train/ce_loss": 0.6295803785324097 }, { "epoch": 0.22562784259442356, "step": 2282, "train/sim_loss": 0.0390625 }, { "epoch": 0.22562784259442356, "step": 2282, "train/total_loss": 0.10202053934335709 }, { "entropy": 9.475540161132812, "epoch": 0.2257267154439391, "mean_token_accuracy": 0.7105942964553833, "num_tokens": 11881137.0, "step": 2283, "train/ce_loss": 0.8041722774505615 }, { "epoch": 0.2257267154439391, "step": 2283, "train/sim_loss": 0.03125 }, { "epoch": 0.2257267154439391, "step": 2283, "train/total_loss": 0.11166723072528839 }, { "entropy": 9.342806816101074, "epoch": 0.22582558829345462, "mean_token_accuracy": 0.6896985173225403, "num_tokens": 11886401.0, "step": 2284, "train/ce_loss": 3.392033931959304e-06 }, { "epoch": 0.22582558829345462, "step": 2284, "train/sim_loss": 0.03125 }, { "epoch": 0.22582558829345462, "step": 2284, "train/total_loss": 0.03125033900141716 }, { "entropy": 10.024946212768555, "epoch": 0.22592446114297013, "mean_token_accuracy": 0.7042889595031738, "num_tokens": 11891316.0, "step": 2285, "train/ce_loss": 1.474079181207344e-05 }, { "epoch": 0.22592446114297013, "step": 2285, "train/sim_loss": 0.02734375 }, { "epoch": 0.22592446114297013, "step": 2285, "train/total_loss": 0.02734522335231304 }, { "entropy": 9.042903900146484, "epoch": 0.22602333399248567, "mean_token_accuracy": 0.686087965965271, "num_tokens": 11896656.0, "step": 2286, "train/ce_loss": 1.1429626941680908 }, { "epoch": 0.22602333399248567, "step": 2286, "train/sim_loss": 0.0625 }, { "epoch": 0.22602333399248567, "step": 2286, "train/total_loss": 0.17679627239704132 }, { "entropy": 9.304134368896484, "epoch": 0.22612220684200118, "mean_token_accuracy": 0.6961326003074646, "num_tokens": 11901859.0, "step": 2287, "train/ce_loss": 0.8572551012039185 }, { "epoch": 0.22612220684200118, "step": 2287, "train/sim_loss": 0.07421875 }, { "epoch": 0.22612220684200118, "step": 2287, "train/total_loss": 0.15994426608085632 }, { "entropy": 10.196354866027832, "epoch": 0.2262210796915167, "mean_token_accuracy": 0.6957831382751465, "num_tokens": 11906568.0, "step": 2288, "train/ce_loss": 2.1421995162963867 }, { "epoch": 0.2262210796915167, "step": 2288, "train/sim_loss": 0.0703125 }, { "epoch": 0.2262210796915167, "step": 2288, "train/total_loss": 0.28453245759010315 }, { "entropy": 9.624486923217773, "epoch": 0.22631995254103224, "mean_token_accuracy": 0.7454819083213806, "num_tokens": 11911684.0, "step": 2289, "train/ce_loss": 0.869027853012085 }, { "epoch": 0.22631995254103224, "step": 2289, "train/sim_loss": 0.0859375 }, { "epoch": 0.22631995254103224, "step": 2289, "train/total_loss": 0.17284029722213745 }, { "entropy": 9.850872039794922, "epoch": 0.22641882539054775, "mean_token_accuracy": 0.757785439491272, "num_tokens": 11916672.0, "step": 2290, "train/ce_loss": 0.4710646867752075 }, { "epoch": 0.22641882539054775, "step": 2290, "train/sim_loss": 0.07421875 }, { "epoch": 0.22641882539054775, "step": 2290, "train/total_loss": 0.12132522463798523 }, { "entropy": 9.050373077392578, "epoch": 0.2265176982400633, "mean_token_accuracy": 0.7090216279029846, "num_tokens": 11921931.0, "step": 2291, "train/ce_loss": 1.124954342842102 }, { "epoch": 0.2265176982400633, "step": 2291, "train/sim_loss": 0.078125 }, { "epoch": 0.2265176982400633, "step": 2291, "train/total_loss": 0.19062043726444244 }, { "entropy": 9.606853485107422, "epoch": 0.2266165710895788, "mean_token_accuracy": 0.6097561120986938, "num_tokens": 11926987.0, "step": 2292, "train/ce_loss": 1.8163892030715942 }, { "epoch": 0.2266165710895788, "step": 2292, "train/sim_loss": 0.0703125 }, { "epoch": 0.2266165710895788, "step": 2292, "train/total_loss": 0.2519514262676239 }, { "entropy": 9.96500015258789, "epoch": 0.22671544393909432, "mean_token_accuracy": 0.7556390762329102, "num_tokens": 11932069.0, "step": 2293, "train/ce_loss": 0.970014750957489 }, { "epoch": 0.22671544393909432, "step": 2293, "train/sim_loss": 0.0625 }, { "epoch": 0.22671544393909432, "step": 2293, "train/total_loss": 0.15950147807598114 }, { "entropy": 8.88833999633789, "epoch": 0.22681431678860986, "mean_token_accuracy": 0.7294994592666626, "num_tokens": 11937461.0, "step": 2294, "train/ce_loss": 0.9902238249778748 }, { "epoch": 0.22681431678860986, "step": 2294, "train/sim_loss": 0.09765625 }, { "epoch": 0.22681431678860986, "step": 2294, "train/total_loss": 0.19667863845825195 }, { "entropy": 9.839460372924805, "epoch": 0.22691318963812537, "mean_token_accuracy": 0.7652329802513123, "num_tokens": 11942415.0, "step": 2295, "train/ce_loss": 1.1105287075042725 }, { "epoch": 0.22691318963812537, "step": 2295, "train/sim_loss": 0.05859375 }, { "epoch": 0.22691318963812537, "step": 2295, "train/total_loss": 0.16964662075042725 }, { "entropy": 9.365549087524414, "epoch": 0.2270120624876409, "mean_token_accuracy": 0.7042253613471985, "num_tokens": 11947562.0, "step": 2296, "train/ce_loss": 0.7497787475585938 }, { "epoch": 0.2270120624876409, "step": 2296, "train/sim_loss": 0.0546875 }, { "epoch": 0.2270120624876409, "step": 2296, "train/total_loss": 0.12966537475585938 }, { "entropy": 9.545838356018066, "epoch": 0.22711093533715643, "mean_token_accuracy": 0.7606461048126221, "num_tokens": 11952663.0, "step": 2297, "train/ce_loss": 1.0381242036819458 }, { "epoch": 0.22711093533715643, "step": 2297, "train/sim_loss": 0.0625 }, { "epoch": 0.22711093533715643, "step": 2297, "train/total_loss": 0.16631242632865906 }, { "entropy": 9.693061828613281, "epoch": 0.22720980818667194, "mean_token_accuracy": 0.7562189102172852, "num_tokens": 11957898.0, "step": 2298, "train/ce_loss": 6.120082161942264e-06 }, { "epoch": 0.22720980818667194, "step": 2298, "train/sim_loss": 0.05859375 }, { "epoch": 0.22720980818667194, "step": 2298, "train/total_loss": 0.05859436094760895 }, { "entropy": 9.140596389770508, "epoch": 0.22730868103618745, "mean_token_accuracy": 0.6908893585205078, "num_tokens": 11963326.0, "step": 2299, "train/ce_loss": 1.0507303476333618 }, { "epoch": 0.22730868103618745, "step": 2299, "train/sim_loss": 0.0390625 }, { "epoch": 0.22730868103618745, "step": 2299, "train/total_loss": 0.14413553476333618 }, { "epoch": 0.227407553885703, "grad_norm": 1.2080398797988892, "learning_rate": 9.434060228452752e-06, "loss": 0.164, "step": 2300 }, { "entropy": 9.693990707397461, "epoch": 0.227407553885703, "mean_token_accuracy": 0.7839999794960022, "num_tokens": 11968449.0, "step": 2300, "train/ce_loss": 0.5579846501350403 }, { "epoch": 0.227407553885703, "step": 2300, "train/sim_loss": 0.0234375 }, { "epoch": 0.227407553885703, "step": 2300, "train/total_loss": 0.0792359709739685 }, { "entropy": 10.13557243347168, "epoch": 0.2275064267352185, "mean_token_accuracy": 0.7394067645072937, "num_tokens": 11973294.0, "step": 2301, "train/ce_loss": 2.46250581741333 }, { "epoch": 0.2275064267352185, "step": 2301, "train/sim_loss": 0.10546875 }, { "epoch": 0.2275064267352185, "step": 2301, "train/total_loss": 0.35171931982040405 }, { "entropy": 10.132203102111816, "epoch": 0.22760529958473402, "mean_token_accuracy": 0.7748344540596008, "num_tokens": 11978011.0, "step": 2302, "train/ce_loss": 5.034738205722533e-05 }, { "epoch": 0.22760529958473402, "step": 2302, "train/sim_loss": 0.078125 }, { "epoch": 0.22760529958473402, "step": 2302, "train/total_loss": 0.07813003659248352 }, { "entropy": 10.068281173706055, "epoch": 0.22770417243424956, "mean_token_accuracy": 0.755294144153595, "num_tokens": 11982872.0, "step": 2303, "train/ce_loss": 1.7586405277252197 }, { "epoch": 0.22770417243424956, "step": 2303, "train/sim_loss": 0.0703125 }, { "epoch": 0.22770417243424956, "step": 2303, "train/total_loss": 0.2461765557527542 }, { "entropy": 9.399394989013672, "epoch": 0.22780304528376508, "mean_token_accuracy": 0.748633861541748, "num_tokens": 11988082.0, "step": 2304, "train/ce_loss": 1.0429457426071167 }, { "epoch": 0.22780304528376508, "step": 2304, "train/sim_loss": 0.05078125 }, { "epoch": 0.22780304528376508, "step": 2304, "train/total_loss": 0.1550758183002472 }, { "entropy": 9.462631225585938, "epoch": 0.2279019181332806, "mean_token_accuracy": 0.7350901365280151, "num_tokens": 11993404.0, "step": 2305, "train/ce_loss": 0.5820447206497192 }, { "epoch": 0.2279019181332806, "step": 2305, "train/sim_loss": 0.06640625 }, { "epoch": 0.2279019181332806, "step": 2305, "train/total_loss": 0.12461072206497192 }, { "entropy": 9.83642864227295, "epoch": 0.22800079098279613, "mean_token_accuracy": 0.7275922894477844, "num_tokens": 11998424.0, "step": 2306, "train/ce_loss": 6.891273187648039e-06 }, { "epoch": 0.22800079098279613, "step": 2306, "train/sim_loss": 0.0546875 }, { "epoch": 0.22800079098279613, "step": 2306, "train/total_loss": 0.054688189178705215 }, { "entropy": 9.044208526611328, "epoch": 0.22809966383231164, "mean_token_accuracy": 0.7373737096786499, "num_tokens": 12003871.0, "step": 2307, "train/ce_loss": 0.6468204259872437 }, { "epoch": 0.22809966383231164, "step": 2307, "train/sim_loss": 0.01953125 }, { "epoch": 0.22809966383231164, "step": 2307, "train/total_loss": 0.08421329408884048 }, { "entropy": 9.22705078125, "epoch": 0.22819853668182716, "mean_token_accuracy": 0.7378048896789551, "num_tokens": 12009355.0, "step": 2308, "train/ce_loss": 0.7152095437049866 }, { "epoch": 0.22819853668182716, "step": 2308, "train/sim_loss": 0.125 }, { "epoch": 0.22819853668182716, "step": 2308, "train/total_loss": 0.19652095437049866 }, { "entropy": 9.639345169067383, "epoch": 0.2282974095313427, "mean_token_accuracy": 0.6328927874565125, "num_tokens": 12014423.0, "step": 2309, "train/ce_loss": 2.6829929993255064e-06 }, { "epoch": 0.2282974095313427, "step": 2309, "train/sim_loss": 0.02734375 }, { "epoch": 0.2282974095313427, "step": 2309, "train/total_loss": 0.02734401822090149 }, { "entropy": 9.361225128173828, "epoch": 0.2283962823808582, "mean_token_accuracy": 0.7243173122406006, "num_tokens": 12019641.0, "step": 2310, "train/ce_loss": 1.1629915237426758 }, { "epoch": 0.2283962823808582, "step": 2310, "train/sim_loss": 0.10546875 }, { "epoch": 0.2283962823808582, "step": 2310, "train/total_loss": 0.22176790237426758 }, { "entropy": 9.329537391662598, "epoch": 0.22849515523037375, "mean_token_accuracy": 0.7163398861885071, "num_tokens": 12024849.0, "step": 2311, "train/ce_loss": 1.7918075323104858 }, { "epoch": 0.22849515523037375, "step": 2311, "train/sim_loss": 0.078125 }, { "epoch": 0.22849515523037375, "step": 2311, "train/total_loss": 0.25730574131011963 }, { "entropy": 8.798558235168457, "epoch": 0.22859402807988927, "mean_token_accuracy": 0.7682177424430847, "num_tokens": 12030490.0, "step": 2312, "train/ce_loss": 0.7416049838066101 }, { "epoch": 0.22859402807988927, "step": 2312, "train/sim_loss": 0.10546875 }, { "epoch": 0.22859402807988927, "step": 2312, "train/total_loss": 0.17962925136089325 }, { "entropy": 9.236748695373535, "epoch": 0.22869290092940478, "mean_token_accuracy": 0.7161290049552917, "num_tokens": 12035879.0, "step": 2313, "train/ce_loss": 1.196192979812622 }, { "epoch": 0.22869290092940478, "step": 2313, "train/sim_loss": 0.0703125 }, { "epoch": 0.22869290092940478, "step": 2313, "train/total_loss": 0.18993180990219116 }, { "entropy": 9.573179244995117, "epoch": 0.22879177377892032, "mean_token_accuracy": 0.7083333134651184, "num_tokens": 12040982.0, "step": 2314, "train/ce_loss": 0.769433319568634 }, { "epoch": 0.22879177377892032, "step": 2314, "train/sim_loss": 0.09375 }, { "epoch": 0.22879177377892032, "step": 2314, "train/total_loss": 0.17069333791732788 }, { "entropy": 8.967704772949219, "epoch": 0.22889064662843583, "mean_token_accuracy": 0.7914980053901672, "num_tokens": 12046498.0, "step": 2315, "train/ce_loss": 0.49730974435806274 }, { "epoch": 0.22889064662843583, "step": 2315, "train/sim_loss": 0.04296875 }, { "epoch": 0.22889064662843583, "step": 2315, "train/total_loss": 0.09269972145557404 }, { "entropy": 9.235469818115234, "epoch": 0.22898951947795135, "mean_token_accuracy": 0.7600896954536438, "num_tokens": 12051874.0, "step": 2316, "train/ce_loss": 1.099819302558899 }, { "epoch": 0.22898951947795135, "step": 2316, "train/sim_loss": 0.01953125 }, { "epoch": 0.22898951947795135, "step": 2316, "train/total_loss": 0.12951317429542542 }, { "entropy": 9.946176528930664, "epoch": 0.2290883923274669, "mean_token_accuracy": 0.69305020570755, "num_tokens": 12056821.0, "step": 2317, "train/ce_loss": 1.7881754636764526 }, { "epoch": 0.2290883923274669, "step": 2317, "train/sim_loss": 0.1015625 }, { "epoch": 0.2290883923274669, "step": 2317, "train/total_loss": 0.2803800702095032 }, { "entropy": 9.353560447692871, "epoch": 0.2291872651769824, "mean_token_accuracy": 0.7390244007110596, "num_tokens": 12062127.0, "step": 2318, "train/ce_loss": 0.66374272108078 }, { "epoch": 0.2291872651769824, "step": 2318, "train/sim_loss": 0.078125 }, { "epoch": 0.2291872651769824, "step": 2318, "train/total_loss": 0.144499272108078 }, { "entropy": 9.604637145996094, "epoch": 0.22928613802649792, "mean_token_accuracy": 0.7388888597488403, "num_tokens": 12067442.0, "step": 2319, "train/ce_loss": 0.6843863725662231 }, { "epoch": 0.22928613802649792, "step": 2319, "train/sim_loss": 0.07421875 }, { "epoch": 0.22928613802649792, "step": 2319, "train/total_loss": 0.14265739917755127 }, { "epoch": 0.22938501087601346, "grad_norm": 1.1014442443847656, "learning_rate": 9.429115363694804e-06, "loss": 0.1555, "step": 2320 }, { "entropy": 8.732933044433594, "epoch": 0.22938501087601346, "mean_token_accuracy": 0.729629635810852, "num_tokens": 12073023.0, "step": 2320, "train/ce_loss": 0.61324542760849 }, { "epoch": 0.22938501087601346, "step": 2320, "train/sim_loss": 0.0390625 }, { "epoch": 0.22938501087601346, "step": 2320, "train/total_loss": 0.10038704425096512 }, { "entropy": 9.829085350036621, "epoch": 0.22948388372552897, "mean_token_accuracy": 0.803636372089386, "num_tokens": 12077972.0, "step": 2321, "train/ce_loss": 0.9266363382339478 }, { "epoch": 0.22948388372552897, "step": 2321, "train/sim_loss": 0.0625 }, { "epoch": 0.22948388372552897, "step": 2321, "train/total_loss": 0.15516364574432373 }, { "entropy": 9.217707633972168, "epoch": 0.22958275657504448, "mean_token_accuracy": 0.7194570302963257, "num_tokens": 12083345.0, "step": 2322, "train/ce_loss": 0.6711381673812866 }, { "epoch": 0.22958275657504448, "step": 2322, "train/sim_loss": 0.05078125 }, { "epoch": 0.22958275657504448, "step": 2322, "train/total_loss": 0.11789506673812866 }, { "entropy": 9.400322914123535, "epoch": 0.22968162942456002, "mean_token_accuracy": 0.7345844507217407, "num_tokens": 12088564.0, "step": 2323, "train/ce_loss": 0.5601037740707397 }, { "epoch": 0.22968162942456002, "step": 2323, "train/sim_loss": 0.0390625 }, { "epoch": 0.22968162942456002, "step": 2323, "train/total_loss": 0.09507288038730621 }, { "entropy": 9.304672241210938, "epoch": 0.22978050227407554, "mean_token_accuracy": 0.7310087084770203, "num_tokens": 12093878.0, "step": 2324, "train/ce_loss": 0.8067693710327148 }, { "epoch": 0.22978050227407554, "step": 2324, "train/sim_loss": 0.03515625 }, { "epoch": 0.22978050227407554, "step": 2324, "train/total_loss": 0.11583318561315536 }, { "entropy": 9.614727020263672, "epoch": 0.22987937512359105, "mean_token_accuracy": 0.7204142212867737, "num_tokens": 12099042.0, "step": 2325, "train/ce_loss": 3.208590214853757e-06 }, { "epoch": 0.22987937512359105, "step": 2325, "train/sim_loss": 0.046875 }, { "epoch": 0.22987937512359105, "step": 2325, "train/total_loss": 0.04687532037496567 }, { "entropy": 9.11031723022461, "epoch": 0.2299782479731066, "mean_token_accuracy": 0.7690631747245789, "num_tokens": 12104492.0, "step": 2326, "train/ce_loss": 0.7134276032447815 }, { "epoch": 0.2299782479731066, "step": 2326, "train/sim_loss": 0.04296875 }, { "epoch": 0.2299782479731066, "step": 2326, "train/total_loss": 0.11431150883436203 }, { "entropy": 9.151698112487793, "epoch": 0.2300771208226221, "mean_token_accuracy": 0.732300877571106, "num_tokens": 12109856.0, "step": 2327, "train/ce_loss": 1.6674649715423584 }, { "epoch": 0.2300771208226221, "step": 2327, "train/sim_loss": 0.078125 }, { "epoch": 0.2300771208226221, "step": 2327, "train/total_loss": 0.24487149715423584 }, { "entropy": 9.683753967285156, "epoch": 0.23017599367213762, "mean_token_accuracy": 0.7667785286903381, "num_tokens": 12114920.0, "step": 2328, "train/ce_loss": 1.7838212251663208 }, { "epoch": 0.23017599367213762, "step": 2328, "train/sim_loss": 0.1171875 }, { "epoch": 0.23017599367213762, "step": 2328, "train/total_loss": 0.29556962847709656 }, { "entropy": 9.871530532836914, "epoch": 0.23027486652165316, "mean_token_accuracy": 0.7833333611488342, "num_tokens": 12119981.0, "step": 2329, "train/ce_loss": 0.41187918186187744 }, { "epoch": 0.23027486652165316, "step": 2329, "train/sim_loss": 0.07421875 }, { "epoch": 0.23027486652165316, "step": 2329, "train/total_loss": 0.11540666967630386 }, { "entropy": 9.21509838104248, "epoch": 0.23037373937116867, "mean_token_accuracy": 0.7616875767707825, "num_tokens": 12125302.0, "step": 2330, "train/ce_loss": 1.1086196899414062 }, { "epoch": 0.23037373937116867, "step": 2330, "train/sim_loss": 0.0546875 }, { "epoch": 0.23037373937116867, "step": 2330, "train/total_loss": 0.16554947197437286 }, { "entropy": 9.648240089416504, "epoch": 0.2304726122206842, "mean_token_accuracy": 0.789207398891449, "num_tokens": 12130365.0, "step": 2331, "train/ce_loss": 0.7285244464874268 }, { "epoch": 0.2304726122206842, "step": 2331, "train/sim_loss": 0.0625 }, { "epoch": 0.2304726122206842, "step": 2331, "train/total_loss": 0.13535244762897491 }, { "entropy": 9.431110382080078, "epoch": 0.23057148507019973, "mean_token_accuracy": 0.7144790291786194, "num_tokens": 12135585.0, "step": 2332, "train/ce_loss": 1.6808106899261475 }, { "epoch": 0.23057148507019973, "step": 2332, "train/sim_loss": 0.109375 }, { "epoch": 0.23057148507019973, "step": 2332, "train/total_loss": 0.2774560749530792 }, { "entropy": 9.513031959533691, "epoch": 0.23067035791971524, "mean_token_accuracy": 0.7231788039207458, "num_tokens": 12140781.0, "step": 2333, "train/ce_loss": 1.0438014268875122 }, { "epoch": 0.23067035791971524, "step": 2333, "train/sim_loss": 0.06640625 }, { "epoch": 0.23067035791971524, "step": 2333, "train/total_loss": 0.17078639566898346 }, { "entropy": 9.623817443847656, "epoch": 0.23076923076923078, "mean_token_accuracy": 0.7717041969299316, "num_tokens": 12145849.0, "step": 2334, "train/ce_loss": 0.7471662163734436 }, { "epoch": 0.23076923076923078, "step": 2334, "train/sim_loss": 0.01953125 }, { "epoch": 0.23076923076923078, "step": 2334, "train/total_loss": 0.09424787014722824 }, { "entropy": 9.290124893188477, "epoch": 0.2308681036187463, "mean_token_accuracy": 0.6814371347427368, "num_tokens": 12151116.0, "step": 2335, "train/ce_loss": 1.317223072052002 }, { "epoch": 0.2308681036187463, "step": 2335, "train/sim_loss": 0.08984375 }, { "epoch": 0.2308681036187463, "step": 2335, "train/total_loss": 0.2215660661458969 }, { "entropy": 9.283143043518066, "epoch": 0.2309669764682618, "mean_token_accuracy": 0.7582159638404846, "num_tokens": 12156438.0, "step": 2336, "train/ce_loss": 0.47142666578292847 }, { "epoch": 0.2309669764682618, "step": 2336, "train/sim_loss": 0.02734375 }, { "epoch": 0.2309669764682618, "step": 2336, "train/total_loss": 0.07448641955852509 }, { "entropy": 8.935561180114746, "epoch": 0.23106584931777735, "mean_token_accuracy": 0.7375133037567139, "num_tokens": 12161874.0, "step": 2337, "train/ce_loss": 0.9972467422485352 }, { "epoch": 0.23106584931777735, "step": 2337, "train/sim_loss": 0.0859375 }, { "epoch": 0.23106584931777735, "step": 2337, "train/total_loss": 0.185662180185318 }, { "entropy": 9.580480575561523, "epoch": 0.23116472216729286, "mean_token_accuracy": 0.7230538725852966, "num_tokens": 12167025.0, "step": 2338, "train/ce_loss": 1.1938048601150513 }, { "epoch": 0.23116472216729286, "step": 2338, "train/sim_loss": 0.08203125 }, { "epoch": 0.23116472216729286, "step": 2338, "train/total_loss": 0.20141173899173737 }, { "entropy": 9.525504112243652, "epoch": 0.23126359501680838, "mean_token_accuracy": 0.7517531514167786, "num_tokens": 12172214.0, "step": 2339, "train/ce_loss": 0.7448764443397522 }, { "epoch": 0.23126359501680838, "step": 2339, "train/sim_loss": 0.03125 }, { "epoch": 0.23126359501680838, "step": 2339, "train/total_loss": 0.10573764890432358 }, { "epoch": 0.23136246786632392, "grad_norm": 0.8302991986274719, "learning_rate": 9.424170498936855e-06, "loss": 0.1451, "step": 2340 }, { "entropy": 8.966384887695312, "epoch": 0.23136246786632392, "mean_token_accuracy": 0.6930232644081116, "num_tokens": 12177567.0, "step": 2340, "train/ce_loss": 0.9528385400772095 }, { "epoch": 0.23136246786632392, "step": 2340, "train/sim_loss": 0.08984375 }, { "epoch": 0.23136246786632392, "step": 2340, "train/total_loss": 0.1851276159286499 }, { "entropy": 8.769596099853516, "epoch": 0.23146134071583943, "mean_token_accuracy": 0.8007380366325378, "num_tokens": 12183128.0, "step": 2341, "train/ce_loss": 0.5972678065299988 }, { "epoch": 0.23146134071583943, "step": 2341, "train/sim_loss": 0.1171875 }, { "epoch": 0.23146134071583943, "step": 2341, "train/total_loss": 0.1769142746925354 }, { "entropy": 8.95469856262207, "epoch": 0.23156021356535494, "mean_token_accuracy": 0.7057521939277649, "num_tokens": 12188499.0, "step": 2342, "train/ce_loss": 0.523081362247467 }, { "epoch": 0.23156021356535494, "step": 2342, "train/sim_loss": 0.05859375 }, { "epoch": 0.23156021356535494, "step": 2342, "train/total_loss": 0.11090189218521118 }, { "entropy": 9.6317138671875, "epoch": 0.23165908641487049, "mean_token_accuracy": 0.7047451734542847, "num_tokens": 12193481.0, "step": 2343, "train/ce_loss": 1.2870943546295166 }, { "epoch": 0.23165908641487049, "step": 2343, "train/sim_loss": 0.06640625 }, { "epoch": 0.23165908641487049, "step": 2343, "train/total_loss": 0.19511568546295166 }, { "entropy": 9.163106918334961, "epoch": 0.231757959264386, "mean_token_accuracy": 0.7786343693733215, "num_tokens": 12198817.0, "step": 2344, "train/ce_loss": 1.093612790107727 }, { "epoch": 0.231757959264386, "step": 2344, "train/sim_loss": 0.03515625 }, { "epoch": 0.231757959264386, "step": 2344, "train/total_loss": 0.14451754093170166 }, { "entropy": 8.905069351196289, "epoch": 0.2318568321139015, "mean_token_accuracy": 0.7400398254394531, "num_tokens": 12204316.0, "step": 2345, "train/ce_loss": 0.7283441424369812 }, { "epoch": 0.2318568321139015, "step": 2345, "train/sim_loss": 0.0703125 }, { "epoch": 0.2318568321139015, "step": 2345, "train/total_loss": 0.14314691722393036 }, { "entropy": 9.299427032470703, "epoch": 0.23195570496341705, "mean_token_accuracy": 0.7548138499259949, "num_tokens": 12209574.0, "step": 2346, "train/ce_loss": 0.859463632106781 }, { "epoch": 0.23195570496341705, "step": 2346, "train/sim_loss": 0.07421875 }, { "epoch": 0.23195570496341705, "step": 2346, "train/total_loss": 0.16016511619091034 }, { "entropy": 9.65144157409668, "epoch": 0.23205457781293257, "mean_token_accuracy": 0.70606529712677, "num_tokens": 12214672.0, "step": 2347, "train/ce_loss": 2.027458906173706 }, { "epoch": 0.23205457781293257, "step": 2347, "train/sim_loss": 0.078125 }, { "epoch": 0.23205457781293257, "step": 2347, "train/total_loss": 0.2808709144592285 }, { "entropy": 9.273920059204102, "epoch": 0.23215345066244808, "mean_token_accuracy": 0.6890243887901306, "num_tokens": 12219959.0, "step": 2348, "train/ce_loss": 0.7437017560005188 }, { "epoch": 0.23215345066244808, "step": 2348, "train/sim_loss": 0.05078125 }, { "epoch": 0.23215345066244808, "step": 2348, "train/total_loss": 0.12515142560005188 }, { "entropy": 9.235380172729492, "epoch": 0.23225232351196362, "mean_token_accuracy": 0.7119438052177429, "num_tokens": 12225223.0, "step": 2349, "train/ce_loss": 1.7639403343200684 }, { "epoch": 0.23225232351196362, "step": 2349, "train/sim_loss": 0.09765625 }, { "epoch": 0.23225232351196362, "step": 2349, "train/total_loss": 0.2740502953529358 }, { "entropy": 9.568511962890625, "epoch": 0.23235119636147913, "mean_token_accuracy": 0.7660208940505981, "num_tokens": 12230328.0, "step": 2350, "train/ce_loss": 0.7514338493347168 }, { "epoch": 0.23235119636147913, "step": 2350, "train/sim_loss": 0.09765625 }, { "epoch": 0.23235119636147913, "step": 2350, "train/total_loss": 0.17279964685440063 }, { "entropy": 9.128923416137695, "epoch": 0.23245006921099465, "mean_token_accuracy": 0.6614255905151367, "num_tokens": 12235765.0, "step": 2351, "train/ce_loss": 1.1933038234710693 }, { "epoch": 0.23245006921099465, "step": 2351, "train/sim_loss": 0.0859375 }, { "epoch": 0.23245006921099465, "step": 2351, "train/total_loss": 0.20526787638664246 }, { "entropy": 9.096158981323242, "epoch": 0.2325489420605102, "mean_token_accuracy": 0.7404994368553162, "num_tokens": 12241175.0, "step": 2352, "train/ce_loss": 1.2961671352386475 }, { "epoch": 0.2325489420605102, "step": 2352, "train/sim_loss": 0.05859375 }, { "epoch": 0.2325489420605102, "step": 2352, "train/total_loss": 0.18821047246456146 }, { "entropy": 9.5964994430542, "epoch": 0.2326478149100257, "mean_token_accuracy": 0.7549019455909729, "num_tokens": 12246203.0, "step": 2353, "train/ce_loss": 2.9441860078804893e-06 }, { "epoch": 0.2326478149100257, "step": 2353, "train/sim_loss": 0.0234375 }, { "epoch": 0.2326478149100257, "step": 2353, "train/total_loss": 0.02343779429793358 }, { "entropy": 9.449602127075195, "epoch": 0.23274668775954124, "mean_token_accuracy": 0.6873920559883118, "num_tokens": 12251233.0, "step": 2354, "train/ce_loss": 1.7445043325424194 }, { "epoch": 0.23274668775954124, "step": 2354, "train/sim_loss": 0.12109375 }, { "epoch": 0.23274668775954124, "step": 2354, "train/total_loss": 0.29554420709609985 }, { "entropy": 9.216875076293945, "epoch": 0.23284556060905676, "mean_token_accuracy": 0.7317351698875427, "num_tokens": 12256564.0, "step": 2355, "train/ce_loss": 0.9889079332351685 }, { "epoch": 0.23284556060905676, "step": 2355, "train/sim_loss": 0.078125 }, { "epoch": 0.23284556060905676, "step": 2355, "train/total_loss": 0.17701579630374908 }, { "entropy": 9.126455307006836, "epoch": 0.23294443345857227, "mean_token_accuracy": 0.6959620118141174, "num_tokens": 12261836.0, "step": 2356, "train/ce_loss": 0.8084344863891602 }, { "epoch": 0.23294443345857227, "step": 2356, "train/sim_loss": 0.078125 }, { "epoch": 0.23294443345857227, "step": 2356, "train/total_loss": 0.15896844863891602 }, { "entropy": 9.406908988952637, "epoch": 0.2330433063080878, "mean_token_accuracy": 0.7142857313156128, "num_tokens": 12267089.0, "step": 2357, "train/ce_loss": 0.7463719844818115 }, { "epoch": 0.2330433063080878, "step": 2357, "train/sim_loss": 0.0703125 }, { "epoch": 0.2330433063080878, "step": 2357, "train/total_loss": 0.14494970440864563 }, { "entropy": 10.010465621948242, "epoch": 0.23314217915760332, "mean_token_accuracy": 0.784518837928772, "num_tokens": 12272006.0, "step": 2358, "train/ce_loss": 1.2236218935868237e-05 }, { "epoch": 0.23314217915760332, "step": 2358, "train/sim_loss": 0.06640625 }, { "epoch": 0.23314217915760332, "step": 2358, "train/total_loss": 0.0664074718952179 }, { "entropy": 9.247428894042969, "epoch": 0.23324105200711884, "mean_token_accuracy": 0.7581453919410706, "num_tokens": 12277301.0, "step": 2359, "train/ce_loss": 1.2065562009811401 }, { "epoch": 0.23324105200711884, "step": 2359, "train/sim_loss": 0.0703125 }, { "epoch": 0.23324105200711884, "step": 2359, "train/total_loss": 0.1909681260585785 }, { "epoch": 0.23333992485663438, "grad_norm": 0.9280444383621216, "learning_rate": 9.419225634178905e-06, "loss": 0.1634, "step": 2360 }, { "entropy": 9.522346496582031, "epoch": 0.23333992485663438, "mean_token_accuracy": 0.7637444138526917, "num_tokens": 12282397.0, "step": 2360, "train/ce_loss": 0.9355092644691467 }, { "epoch": 0.23333992485663438, "step": 2360, "train/sim_loss": 0.08984375 }, { "epoch": 0.23333992485663438, "step": 2360, "train/total_loss": 0.1833946704864502 }, { "entropy": 9.466015815734863, "epoch": 0.2334387977061499, "mean_token_accuracy": 0.7586750984191895, "num_tokens": 12287449.0, "step": 2361, "train/ce_loss": 1.500874638557434 }, { "epoch": 0.2334387977061499, "step": 2361, "train/sim_loss": 0.08984375 }, { "epoch": 0.2334387977061499, "step": 2361, "train/total_loss": 0.23993121087551117 }, { "entropy": 9.620744705200195, "epoch": 0.2335376705556654, "mean_token_accuracy": 0.75789475440979, "num_tokens": 12292533.0, "step": 2362, "train/ce_loss": 0.8770025968551636 }, { "epoch": 0.2335376705556654, "step": 2362, "train/sim_loss": 0.015625 }, { "epoch": 0.2335376705556654, "step": 2362, "train/total_loss": 0.1033252626657486 }, { "entropy": 9.022543907165527, "epoch": 0.23363654340518095, "mean_token_accuracy": 0.7319062352180481, "num_tokens": 12297991.0, "step": 2363, "train/ce_loss": 0.6032585501670837 }, { "epoch": 0.23363654340518095, "step": 2363, "train/sim_loss": 0.0390625 }, { "epoch": 0.23363654340518095, "step": 2363, "train/total_loss": 0.09938836097717285 }, { "entropy": 9.33456802368164, "epoch": 0.23373541625469646, "mean_token_accuracy": 0.7348951697349548, "num_tokens": 12303223.0, "step": 2364, "train/ce_loss": 1.0315171480178833 }, { "epoch": 0.23373541625469646, "step": 2364, "train/sim_loss": 0.06640625 }, { "epoch": 0.23373541625469646, "step": 2364, "train/total_loss": 0.16955795884132385 }, { "entropy": 9.068136215209961, "epoch": 0.23383428910421197, "mean_token_accuracy": 0.7019438147544861, "num_tokens": 12308573.0, "step": 2365, "train/ce_loss": 0.833172082901001 }, { "epoch": 0.23383428910421197, "step": 2365, "train/sim_loss": 0.03125 }, { "epoch": 0.23383428910421197, "step": 2365, "train/total_loss": 0.11456721276044846 }, { "entropy": 9.199182510375977, "epoch": 0.23393316195372751, "mean_token_accuracy": 0.7469066381454468, "num_tokens": 12313984.0, "step": 2366, "train/ce_loss": 1.0053783655166626 }, { "epoch": 0.23393316195372751, "step": 2366, "train/sim_loss": 0.078125 }, { "epoch": 0.23393316195372751, "step": 2366, "train/total_loss": 0.17866283655166626 }, { "entropy": 9.322465896606445, "epoch": 0.23403203480324303, "mean_token_accuracy": 0.7112299203872681, "num_tokens": 12319359.0, "step": 2367, "train/ce_loss": 0.5418054461479187 }, { "epoch": 0.23403203480324303, "step": 2367, "train/sim_loss": 0.01953125 }, { "epoch": 0.23403203480324303, "step": 2367, "train/total_loss": 0.07371179759502411 }, { "entropy": 9.816730499267578, "epoch": 0.23413090765275854, "mean_token_accuracy": 0.716549277305603, "num_tokens": 12324391.0, "step": 2368, "train/ce_loss": 1.0450146198272705 }, { "epoch": 0.23413090765275854, "step": 2368, "train/sim_loss": 0.0625 }, { "epoch": 0.23413090765275854, "step": 2368, "train/total_loss": 0.16700145602226257 }, { "entropy": 8.824502944946289, "epoch": 0.23422978050227408, "mean_token_accuracy": 0.75, "num_tokens": 12329954.0, "step": 2369, "train/ce_loss": 0.7634261846542358 }, { "epoch": 0.23422978050227408, "step": 2369, "train/sim_loss": 0.0234375 }, { "epoch": 0.23422978050227408, "step": 2369, "train/total_loss": 0.0997801199555397 }, { "entropy": 9.87130355834961, "epoch": 0.2343286533517896, "mean_token_accuracy": 0.7819548845291138, "num_tokens": 12334935.0, "step": 2370, "train/ce_loss": 5.111191057949327e-06 }, { "epoch": 0.2343286533517896, "step": 2370, "train/sim_loss": 0.05078125 }, { "epoch": 0.2343286533517896, "step": 2370, "train/total_loss": 0.05078176036477089 }, { "entropy": 9.217727661132812, "epoch": 0.2344275262013051, "mean_token_accuracy": 0.6961394548416138, "num_tokens": 12340348.0, "step": 2371, "train/ce_loss": 0.7166080474853516 }, { "epoch": 0.2344275262013051, "step": 2371, "train/sim_loss": 0.1484375 }, { "epoch": 0.2344275262013051, "step": 2371, "train/total_loss": 0.2200983166694641 }, { "entropy": 9.32795238494873, "epoch": 0.23452639905082065, "mean_token_accuracy": 0.7483176589012146, "num_tokens": 12345528.0, "step": 2372, "train/ce_loss": 0.5925230383872986 }, { "epoch": 0.23452639905082065, "step": 2372, "train/sim_loss": 0.12890625 }, { "epoch": 0.23452639905082065, "step": 2372, "train/total_loss": 0.1881585568189621 }, { "entropy": 9.745134353637695, "epoch": 0.23462527190033616, "mean_token_accuracy": 0.7691029906272888, "num_tokens": 12350563.0, "step": 2373, "train/ce_loss": 0.8051249980926514 }, { "epoch": 0.23462527190033616, "step": 2373, "train/sim_loss": 0.06640625 }, { "epoch": 0.23462527190033616, "step": 2373, "train/total_loss": 0.14691874384880066 }, { "entropy": 9.267991065979004, "epoch": 0.2347241447498517, "mean_token_accuracy": 0.7484811544418335, "num_tokens": 12355901.0, "step": 2374, "train/ce_loss": 1.1690332889556885 }, { "epoch": 0.2347241447498517, "step": 2374, "train/sim_loss": 0.078125 }, { "epoch": 0.2347241447498517, "step": 2374, "train/total_loss": 0.19502833485603333 }, { "entropy": 9.697690963745117, "epoch": 0.23482301759936722, "mean_token_accuracy": 0.6909871101379395, "num_tokens": 12360971.0, "step": 2375, "train/ce_loss": 1.5387448072433472 }, { "epoch": 0.23482301759936722, "step": 2375, "train/sim_loss": 0.12890625 }, { "epoch": 0.23482301759936722, "step": 2375, "train/total_loss": 0.2827807366847992 }, { "entropy": 9.765154838562012, "epoch": 0.23492189044888273, "mean_token_accuracy": 0.7376725673675537, "num_tokens": 12365945.0, "step": 2376, "train/ce_loss": 4.265839379513636e-06 }, { "epoch": 0.23492189044888273, "step": 2376, "train/sim_loss": 0.03125 }, { "epoch": 0.23492189044888273, "step": 2376, "train/total_loss": 0.03125042840838432 }, { "entropy": 9.0798978805542, "epoch": 0.23502076329839827, "mean_token_accuracy": 0.7807229161262512, "num_tokens": 12371285.0, "step": 2377, "train/ce_loss": 0.35677269101142883 }, { "epoch": 0.23502076329839827, "step": 2377, "train/sim_loss": 0.0703125 }, { "epoch": 0.23502076329839827, "step": 2377, "train/total_loss": 0.10598976910114288 }, { "entropy": 9.423152923583984, "epoch": 0.23511963614791379, "mean_token_accuracy": 0.7773279547691345, "num_tokens": 12376466.0, "step": 2378, "train/ce_loss": 1.1971715688705444 }, { "epoch": 0.23511963614791379, "step": 2378, "train/sim_loss": 0.078125 }, { "epoch": 0.23511963614791379, "step": 2378, "train/total_loss": 0.19784215092658997 }, { "entropy": 9.05411148071289, "epoch": 0.2352185089974293, "mean_token_accuracy": 0.7235772609710693, "num_tokens": 12381819.0, "step": 2379, "train/ce_loss": 0.789045512676239 }, { "epoch": 0.2352185089974293, "step": 2379, "train/sim_loss": 0.03125 }, { "epoch": 0.2352185089974293, "step": 2379, "train/total_loss": 0.11015455424785614 }, { "epoch": 0.23531738184694484, "grad_norm": 0.8408113121986389, "learning_rate": 9.414280769420957e-06, "loss": 0.1538, "step": 2380 }, { "entropy": 9.860815048217773, "epoch": 0.23531738184694484, "mean_token_accuracy": 0.7065972089767456, "num_tokens": 12386833.0, "step": 2380, "train/ce_loss": 1.434122920036316 }, { "epoch": 0.23531738184694484, "step": 2380, "train/sim_loss": 0.08984375 }, { "epoch": 0.23531738184694484, "step": 2380, "train/total_loss": 0.2332560420036316 }, { "entropy": 9.214044570922852, "epoch": 0.23541625469646035, "mean_token_accuracy": 0.6609124541282654, "num_tokens": 12392296.0, "step": 2381, "train/ce_loss": 0.6754460334777832 }, { "epoch": 0.23541625469646035, "step": 2381, "train/sim_loss": 0.12109375 }, { "epoch": 0.23541625469646035, "step": 2381, "train/total_loss": 0.1886383593082428 }, { "entropy": 9.326742172241211, "epoch": 0.23551512754597587, "mean_token_accuracy": 0.7060931921005249, "num_tokens": 12397634.0, "step": 2382, "train/ce_loss": 0.9092620015144348 }, { "epoch": 0.23551512754597587, "step": 2382, "train/sim_loss": 0.1015625 }, { "epoch": 0.23551512754597587, "step": 2382, "train/total_loss": 0.19248870015144348 }, { "entropy": 9.355029106140137, "epoch": 0.2356140003954914, "mean_token_accuracy": 0.7112010717391968, "num_tokens": 12402868.0, "step": 2383, "train/ce_loss": 1.1074140071868896 }, { "epoch": 0.2356140003954914, "step": 2383, "train/sim_loss": 0.09765625 }, { "epoch": 0.2356140003954914, "step": 2383, "train/total_loss": 0.20839765667915344 }, { "entropy": 9.30242919921875, "epoch": 0.23571287324500692, "mean_token_accuracy": 0.7903030514717102, "num_tokens": 12408161.0, "step": 2384, "train/ce_loss": 0.5750543475151062 }, { "epoch": 0.23571287324500692, "step": 2384, "train/sim_loss": 0.07421875 }, { "epoch": 0.23571287324500692, "step": 2384, "train/total_loss": 0.13172417879104614 }, { "entropy": 8.88375186920166, "epoch": 0.23581174609452243, "mean_token_accuracy": 0.7622950673103333, "num_tokens": 12413596.0, "step": 2385, "train/ce_loss": 1.5440810918807983 }, { "epoch": 0.23581174609452243, "step": 2385, "train/sim_loss": 0.1015625 }, { "epoch": 0.23581174609452243, "step": 2385, "train/total_loss": 0.2559705972671509 }, { "entropy": 9.507736206054688, "epoch": 0.23591061894403798, "mean_token_accuracy": 0.691428542137146, "num_tokens": 12418754.0, "step": 2386, "train/ce_loss": 0.6624653935432434 }, { "epoch": 0.23591061894403798, "step": 2386, "train/sim_loss": 0.0859375 }, { "epoch": 0.23591061894403798, "step": 2386, "train/total_loss": 0.15218403935432434 }, { "entropy": 9.170475006103516, "epoch": 0.2360094917935535, "mean_token_accuracy": 0.7021276354789734, "num_tokens": 12423895.0, "step": 2387, "train/ce_loss": 7.41304620532901e-06 }, { "epoch": 0.2360094917935535, "step": 2387, "train/sim_loss": 0.0546875 }, { "epoch": 0.2360094917935535, "step": 2387, "train/total_loss": 0.054688241332769394 }, { "entropy": 9.166980743408203, "epoch": 0.236108364643069, "mean_token_accuracy": 0.711448609828949, "num_tokens": 12429216.0, "step": 2388, "train/ce_loss": 0.8316457867622375 }, { "epoch": 0.236108364643069, "step": 2388, "train/sim_loss": 0.07421875 }, { "epoch": 0.236108364643069, "step": 2388, "train/total_loss": 0.15738332271575928 }, { "entropy": 9.178657531738281, "epoch": 0.23620723749258454, "mean_token_accuracy": 0.7100130319595337, "num_tokens": 12434508.0, "step": 2389, "train/ce_loss": 0.9058764576911926 }, { "epoch": 0.23620723749258454, "step": 2389, "train/sim_loss": 0.06640625 }, { "epoch": 0.23620723749258454, "step": 2389, "train/total_loss": 0.15699389576911926 }, { "entropy": 8.597973823547363, "epoch": 0.23630611034210006, "mean_token_accuracy": 0.7439758777618408, "num_tokens": 12440061.0, "step": 2390, "train/ce_loss": 0.933378279209137 }, { "epoch": 0.23630611034210006, "step": 2390, "train/sim_loss": 0.1171875 }, { "epoch": 0.23630611034210006, "step": 2390, "train/total_loss": 0.21052533388137817 }, { "entropy": 9.485719680786133, "epoch": 0.23640498319161557, "mean_token_accuracy": 0.7674112915992737, "num_tokens": 12445292.0, "step": 2391, "train/ce_loss": 0.8362389206886292 }, { "epoch": 0.23640498319161557, "step": 2391, "train/sim_loss": 0.03125 }, { "epoch": 0.23640498319161557, "step": 2391, "train/total_loss": 0.11487389355897903 }, { "entropy": 9.044958114624023, "epoch": 0.2365038560411311, "mean_token_accuracy": 0.7096399664878845, "num_tokens": 12450637.0, "step": 2392, "train/ce_loss": 1.012323021888733 }, { "epoch": 0.2365038560411311, "step": 2392, "train/sim_loss": 0.1328125 }, { "epoch": 0.2365038560411311, "step": 2392, "train/total_loss": 0.23404480516910553 }, { "entropy": 9.472670555114746, "epoch": 0.23660272889064662, "mean_token_accuracy": 0.7845934629440308, "num_tokens": 12455778.0, "step": 2393, "train/ce_loss": 0.8923097848892212 }, { "epoch": 0.23660272889064662, "step": 2393, "train/sim_loss": 0.05078125 }, { "epoch": 0.23660272889064662, "step": 2393, "train/total_loss": 0.1400122344493866 }, { "entropy": 9.029642105102539, "epoch": 0.23670160174016217, "mean_token_accuracy": 0.7072368264198303, "num_tokens": 12461199.0, "step": 2394, "train/ce_loss": 1.3686507940292358 }, { "epoch": 0.23670160174016217, "step": 2394, "train/sim_loss": 0.06640625 }, { "epoch": 0.23670160174016217, "step": 2394, "train/total_loss": 0.20327132940292358 }, { "entropy": 9.2318754196167, "epoch": 0.23680047458967768, "mean_token_accuracy": 0.769132673740387, "num_tokens": 12466479.0, "step": 2395, "train/ce_loss": 2.7288724595564418e-06 }, { "epoch": 0.23680047458967768, "step": 2395, "train/sim_loss": 0.0546875 }, { "epoch": 0.23680047458967768, "step": 2395, "train/total_loss": 0.05468777194619179 }, { "entropy": 9.625486373901367, "epoch": 0.2368993474391932, "mean_token_accuracy": 0.7072368264198303, "num_tokens": 12471521.0, "step": 2396, "train/ce_loss": 1.7447322607040405 }, { "epoch": 0.2368993474391932, "step": 2396, "train/sim_loss": 0.078125 }, { "epoch": 0.2368993474391932, "step": 2396, "train/total_loss": 0.25259822607040405 }, { "entropy": 9.427966117858887, "epoch": 0.23699822028870873, "mean_token_accuracy": 0.7125340700149536, "num_tokens": 12476719.0, "step": 2397, "train/ce_loss": 1.1014299392700195 }, { "epoch": 0.23699822028870873, "step": 2397, "train/sim_loss": 0.04296875 }, { "epoch": 0.23699822028870873, "step": 2397, "train/total_loss": 0.1531117558479309 }, { "entropy": 9.326618194580078, "epoch": 0.23709709313822425, "mean_token_accuracy": 0.733564019203186, "num_tokens": 12482048.0, "step": 2398, "train/ce_loss": 0.9821603894233704 }, { "epoch": 0.23709709313822425, "step": 2398, "train/sim_loss": 0.05859375 }, { "epoch": 0.23709709313822425, "step": 2398, "train/total_loss": 0.15680979192256927 }, { "entropy": 9.529634475708008, "epoch": 0.23719596598773976, "mean_token_accuracy": 0.7684659361839294, "num_tokens": 12487137.0, "step": 2399, "train/ce_loss": 0.5558364391326904 }, { "epoch": 0.23719596598773976, "step": 2399, "train/sim_loss": 0.07421875 }, { "epoch": 0.23719596598773976, "step": 2399, "train/total_loss": 0.1298023909330368 }, { "epoch": 0.2372948388372553, "grad_norm": 0.8500503897666931, "learning_rate": 9.409335904663008e-06, "loss": 0.1632, "step": 2400 }, { "entropy": 9.319252014160156, "epoch": 0.2372948388372553, "mean_token_accuracy": 0.7546948194503784, "num_tokens": 12492440.0, "step": 2400, "train/ce_loss": 0.4326237738132477 }, { "epoch": 0.2372948388372553, "step": 2400, "train/sim_loss": 0.03125 }, { "epoch": 0.2372948388372553, "step": 2400, "train/total_loss": 0.07451237738132477 }, { "entropy": 9.431442260742188, "epoch": 0.23739371168677081, "mean_token_accuracy": 0.7329843044281006, "num_tokens": 12497660.0, "step": 2401, "train/ce_loss": 0.8422648906707764 }, { "epoch": 0.23739371168677081, "step": 2401, "train/sim_loss": 0.04296875 }, { "epoch": 0.23739371168677081, "step": 2401, "train/total_loss": 0.12719523906707764 }, { "entropy": 8.835033416748047, "epoch": 0.23749258453628633, "mean_token_accuracy": 0.737500011920929, "num_tokens": 12503149.0, "step": 2402, "train/ce_loss": 0.6837636828422546 }, { "epoch": 0.23749258453628633, "step": 2402, "train/sim_loss": 0.0859375 }, { "epoch": 0.23749258453628633, "step": 2402, "train/total_loss": 0.154313862323761 }, { "entropy": 9.53884220123291, "epoch": 0.23759145738580187, "mean_token_accuracy": 0.7151424288749695, "num_tokens": 12508235.0, "step": 2403, "train/ce_loss": 1.3083539009094238 }, { "epoch": 0.23759145738580187, "step": 2403, "train/sim_loss": 0.0859375 }, { "epoch": 0.23759145738580187, "step": 2403, "train/total_loss": 0.2167728990316391 }, { "entropy": 9.01998519897461, "epoch": 0.23769033023531738, "mean_token_accuracy": 0.7577962875366211, "num_tokens": 12513661.0, "step": 2404, "train/ce_loss": 0.9385044574737549 }, { "epoch": 0.23769033023531738, "step": 2404, "train/sim_loss": 0.1171875 }, { "epoch": 0.23769033023531738, "step": 2404, "train/total_loss": 0.21103794872760773 }, { "entropy": 9.81583309173584, "epoch": 0.2377892030848329, "mean_token_accuracy": 0.7698412537574768, "num_tokens": 12518719.0, "step": 2405, "train/ce_loss": 0.7289450764656067 }, { "epoch": 0.2377892030848329, "step": 2405, "train/sim_loss": 0.1171875 }, { "epoch": 0.2377892030848329, "step": 2405, "train/total_loss": 0.19008201360702515 }, { "entropy": 9.46564769744873, "epoch": 0.23788807593434844, "mean_token_accuracy": 0.7140804529190063, "num_tokens": 12523755.0, "step": 2406, "train/ce_loss": 3.154036221530987e-06 }, { "epoch": 0.23788807593434844, "step": 2406, "train/sim_loss": 0.0703125 }, { "epoch": 0.23788807593434844, "step": 2406, "train/total_loss": 0.07031281292438507 }, { "entropy": 9.094564437866211, "epoch": 0.23798694878386395, "mean_token_accuracy": 0.700964629650116, "num_tokens": 12529144.0, "step": 2407, "train/ce_loss": 0.8377649784088135 }, { "epoch": 0.23798694878386395, "step": 2407, "train/sim_loss": 0.03515625 }, { "epoch": 0.23798694878386395, "step": 2407, "train/total_loss": 0.11893274635076523 }, { "entropy": 9.922409057617188, "epoch": 0.23808582163337946, "mean_token_accuracy": 0.7257019281387329, "num_tokens": 12534083.0, "step": 2408, "train/ce_loss": 1.3888559341430664 }, { "epoch": 0.23808582163337946, "step": 2408, "train/sim_loss": 0.1015625 }, { "epoch": 0.23808582163337946, "step": 2408, "train/total_loss": 0.24044810235500336 }, { "entropy": 9.742548942565918, "epoch": 0.238184694482895, "mean_token_accuracy": 0.744425356388092, "num_tokens": 12539062.0, "step": 2409, "train/ce_loss": 1.2016829252243042 }, { "epoch": 0.238184694482895, "step": 2409, "train/sim_loss": 0.0625 }, { "epoch": 0.238184694482895, "step": 2409, "train/total_loss": 0.1826682984828949 }, { "entropy": 9.329795837402344, "epoch": 0.23828356733241052, "mean_token_accuracy": 0.7608142495155334, "num_tokens": 12544271.0, "step": 2410, "train/ce_loss": 1.061061978340149 }, { "epoch": 0.23828356733241052, "step": 2410, "train/sim_loss": 0.0390625 }, { "epoch": 0.23828356733241052, "step": 2410, "train/total_loss": 0.14516869187355042 }, { "entropy": 9.265291213989258, "epoch": 0.23838244018192603, "mean_token_accuracy": 0.7192118167877197, "num_tokens": 12549520.0, "step": 2411, "train/ce_loss": 1.0520355701446533 }, { "epoch": 0.23838244018192603, "step": 2411, "train/sim_loss": 0.078125 }, { "epoch": 0.23838244018192603, "step": 2411, "train/total_loss": 0.1833285689353943 }, { "entropy": 9.62429428100586, "epoch": 0.23848131303144157, "mean_token_accuracy": 0.7111111283302307, "num_tokens": 12554572.0, "step": 2412, "train/ce_loss": 0.5724148154258728 }, { "epoch": 0.23848131303144157, "step": 2412, "train/sim_loss": 0.0625 }, { "epoch": 0.23848131303144157, "step": 2412, "train/total_loss": 0.11974148452281952 }, { "entropy": 10.618011474609375, "epoch": 0.23858018588095709, "mean_token_accuracy": 0.7802197933197021, "num_tokens": 12559164.0, "step": 2413, "train/ce_loss": 1.9048376998398453e-05 }, { "epoch": 0.23858018588095709, "step": 2413, "train/sim_loss": 0.078125 }, { "epoch": 0.23858018588095709, "step": 2413, "train/total_loss": 0.07812690734863281 }, { "entropy": 9.363680839538574, "epoch": 0.2386790587304726, "mean_token_accuracy": 0.7350901365280151, "num_tokens": 12564374.0, "step": 2414, "train/ce_loss": 0.8084295392036438 }, { "epoch": 0.2386790587304726, "step": 2414, "train/sim_loss": 0.1171875 }, { "epoch": 0.2386790587304726, "step": 2414, "train/total_loss": 0.19803045690059662 }, { "entropy": 9.255803108215332, "epoch": 0.23877793157998814, "mean_token_accuracy": 0.7384230494499207, "num_tokens": 12569635.0, "step": 2415, "train/ce_loss": 0.6242900490760803 }, { "epoch": 0.23877793157998814, "step": 2415, "train/sim_loss": 0.06640625 }, { "epoch": 0.23877793157998814, "step": 2415, "train/total_loss": 0.1288352608680725 }, { "entropy": 9.360810279846191, "epoch": 0.23887680442950365, "mean_token_accuracy": 0.7057416439056396, "num_tokens": 12574909.0, "step": 2416, "train/ce_loss": 0.941565990447998 }, { "epoch": 0.23887680442950365, "step": 2416, "train/sim_loss": 0.046875 }, { "epoch": 0.23887680442950365, "step": 2416, "train/total_loss": 0.14103159308433533 }, { "entropy": 9.784135818481445, "epoch": 0.2389756772790192, "mean_token_accuracy": 0.709618866443634, "num_tokens": 12580069.0, "step": 2417, "train/ce_loss": 1.183141827583313 }, { "epoch": 0.2389756772790192, "step": 2417, "train/sim_loss": 0.078125 }, { "epoch": 0.2389756772790192, "step": 2417, "train/total_loss": 0.19643917679786682 }, { "entropy": 9.049604415893555, "epoch": 0.2390745501285347, "mean_token_accuracy": 0.7241379022598267, "num_tokens": 12585471.0, "step": 2418, "train/ce_loss": 1.1365526914596558 }, { "epoch": 0.2390745501285347, "step": 2418, "train/sim_loss": 0.0859375 }, { "epoch": 0.2390745501285347, "step": 2418, "train/total_loss": 0.19959276914596558 }, { "entropy": 9.176121711730957, "epoch": 0.23917342297805022, "mean_token_accuracy": 0.6867321729660034, "num_tokens": 12590754.0, "step": 2419, "train/ce_loss": 0.7100818753242493 }, { "epoch": 0.23917342297805022, "step": 2419, "train/sim_loss": 0.0859375 }, { "epoch": 0.23917342297805022, "step": 2419, "train/total_loss": 0.15694569051265717 }, { "epoch": 0.23927229582756576, "grad_norm": 1.0710947513580322, "learning_rate": 9.40439103990506e-06, "loss": 0.1545, "step": 2420 }, { "entropy": 9.725388526916504, "epoch": 0.23927229582756576, "mean_token_accuracy": 0.7470085620880127, "num_tokens": 12595783.0, "step": 2420, "train/ce_loss": 1.1202698945999146 }, { "epoch": 0.23927229582756576, "step": 2420, "train/sim_loss": 0.0703125 }, { "epoch": 0.23927229582756576, "step": 2420, "train/total_loss": 0.18233948945999146 }, { "entropy": 9.089994430541992, "epoch": 0.23937116867708128, "mean_token_accuracy": 0.7723214030265808, "num_tokens": 12601138.0, "step": 2421, "train/ce_loss": 0.7415687441825867 }, { "epoch": 0.23937116867708128, "step": 2421, "train/sim_loss": 0.03125 }, { "epoch": 0.23937116867708128, "step": 2421, "train/total_loss": 0.10540687292814255 }, { "entropy": 9.053451538085938, "epoch": 0.2394700415265968, "mean_token_accuracy": 0.6727052927017212, "num_tokens": 12606463.0, "step": 2422, "train/ce_loss": 0.5993922352790833 }, { "epoch": 0.2394700415265968, "step": 2422, "train/sim_loss": 0.0859375 }, { "epoch": 0.2394700415265968, "step": 2422, "train/total_loss": 0.1458767205476761 }, { "entropy": 9.50720500946045, "epoch": 0.23956891437611233, "mean_token_accuracy": 0.7133956551551819, "num_tokens": 12611511.0, "step": 2423, "train/ce_loss": 7.332210316235432e-06 }, { "epoch": 0.23956891437611233, "step": 2423, "train/sim_loss": 0.0625 }, { "epoch": 0.23956891437611233, "step": 2423, "train/total_loss": 0.0625007301568985 }, { "entropy": 9.788806915283203, "epoch": 0.23966778722562784, "mean_token_accuracy": 0.8044692873954773, "num_tokens": 12616454.0, "step": 2424, "train/ce_loss": 3.934250344173051e-06 }, { "epoch": 0.23966778722562784, "step": 2424, "train/sim_loss": 0.01953125 }, { "epoch": 0.23966778722562784, "step": 2424, "train/total_loss": 0.019531643018126488 }, { "entropy": 8.997499465942383, "epoch": 0.23976666007514336, "mean_token_accuracy": 0.7777777910232544, "num_tokens": 12621557.0, "step": 2425, "train/ce_loss": 0.48350977897644043 }, { "epoch": 0.23976666007514336, "step": 2425, "train/sim_loss": 0.0546875 }, { "epoch": 0.23976666007514336, "step": 2425, "train/total_loss": 0.1030384749174118 }, { "entropy": 9.196101188659668, "epoch": 0.2398655329246589, "mean_token_accuracy": 0.7543390989303589, "num_tokens": 12626810.0, "step": 2426, "train/ce_loss": 1.0639623403549194 }, { "epoch": 0.2398655329246589, "step": 2426, "train/sim_loss": 0.11328125 }, { "epoch": 0.2398655329246589, "step": 2426, "train/total_loss": 0.21967747807502747 }, { "entropy": 9.574728012084961, "epoch": 0.2399644057741744, "mean_token_accuracy": 0.7956656217575073, "num_tokens": 12631852.0, "step": 2427, "train/ce_loss": 1.0240403413772583 }, { "epoch": 0.2399644057741744, "step": 2427, "train/sim_loss": 0.09375 }, { "epoch": 0.2399644057741744, "step": 2427, "train/total_loss": 0.19615402817726135 }, { "entropy": 9.197060585021973, "epoch": 0.24006327862368992, "mean_token_accuracy": 0.7612359523773193, "num_tokens": 12636993.0, "step": 2428, "train/ce_loss": 8.116682693071198e-06 }, { "epoch": 0.24006327862368992, "step": 2428, "train/sim_loss": 0.05078125 }, { "epoch": 0.24006327862368992, "step": 2428, "train/total_loss": 0.050782062113285065 }, { "entropy": 9.274468421936035, "epoch": 0.24016215147320547, "mean_token_accuracy": 0.6811594367027283, "num_tokens": 12642066.0, "step": 2429, "train/ce_loss": 1.1202927827835083 }, { "epoch": 0.24016215147320547, "step": 2429, "train/sim_loss": 0.1015625 }, { "epoch": 0.24016215147320547, "step": 2429, "train/total_loss": 0.2135917842388153 }, { "entropy": 9.780946731567383, "epoch": 0.24026102432272098, "mean_token_accuracy": 0.7403509020805359, "num_tokens": 12647082.0, "step": 2430, "train/ce_loss": 1.2716385126113892 }, { "epoch": 0.24026102432272098, "step": 2430, "train/sim_loss": 0.08984375 }, { "epoch": 0.24026102432272098, "step": 2430, "train/total_loss": 0.2170076072216034 }, { "entropy": 9.411595344543457, "epoch": 0.2403598971722365, "mean_token_accuracy": 0.7183908224105835, "num_tokens": 12652265.0, "step": 2431, "train/ce_loss": 0.9056243896484375 }, { "epoch": 0.2403598971722365, "step": 2431, "train/sim_loss": 0.06640625 }, { "epoch": 0.2403598971722365, "step": 2431, "train/total_loss": 0.15696868300437927 }, { "entropy": 9.484373092651367, "epoch": 0.24045877002175203, "mean_token_accuracy": 0.7074626684188843, "num_tokens": 12657375.0, "step": 2432, "train/ce_loss": 3.2736211323936004e-06 }, { "epoch": 0.24045877002175203, "step": 2432, "train/sim_loss": 0.0625 }, { "epoch": 0.24045877002175203, "step": 2432, "train/total_loss": 0.06250032782554626 }, { "entropy": 9.2290620803833, "epoch": 0.24055764287126755, "mean_token_accuracy": 0.7011628150939941, "num_tokens": 12662704.0, "step": 2433, "train/ce_loss": 1.1158980131149292 }, { "epoch": 0.24055764287126755, "step": 2433, "train/sim_loss": 0.05859375 }, { "epoch": 0.24055764287126755, "step": 2433, "train/total_loss": 0.17018355429172516 }, { "entropy": 10.093072891235352, "epoch": 0.24065651572078306, "mean_token_accuracy": 0.7209302186965942, "num_tokens": 12667477.0, "step": 2434, "train/ce_loss": 1.2968782357347663e-05 }, { "epoch": 0.24065651572078306, "step": 2434, "train/sim_loss": 0.05859375 }, { "epoch": 0.24065651572078306, "step": 2434, "train/total_loss": 0.058595046401023865 }, { "entropy": 9.002099990844727, "epoch": 0.2407553885702986, "mean_token_accuracy": 0.7219361662864685, "num_tokens": 12672857.0, "step": 2435, "train/ce_loss": 0.6061140298843384 }, { "epoch": 0.2407553885702986, "step": 2435, "train/sim_loss": 0.03125 }, { "epoch": 0.2407553885702986, "step": 2435, "train/total_loss": 0.09186140447854996 }, { "entropy": 9.172452926635742, "epoch": 0.24085426141981411, "mean_token_accuracy": 0.7418224215507507, "num_tokens": 12678190.0, "step": 2436, "train/ce_loss": 0.33215388655662537 }, { "epoch": 0.24085426141981411, "step": 2436, "train/sim_loss": 0.0234375 }, { "epoch": 0.24085426141981411, "step": 2436, "train/total_loss": 0.05665288865566254 }, { "entropy": 8.969165802001953, "epoch": 0.24095313426932966, "mean_token_accuracy": 0.7849566340446472, "num_tokens": 12683684.0, "step": 2437, "train/ce_loss": 0.6042222380638123 }, { "epoch": 0.24095313426932966, "step": 2437, "train/sim_loss": 0.0625 }, { "epoch": 0.24095313426932966, "step": 2437, "train/total_loss": 0.12292222678661346 }, { "entropy": 9.200159072875977, "epoch": 0.24105200711884517, "mean_token_accuracy": 0.7333333492279053, "num_tokens": 12689004.0, "step": 2438, "train/ce_loss": 0.9200250506401062 }, { "epoch": 0.24105200711884517, "step": 2438, "train/sim_loss": 0.0546875 }, { "epoch": 0.24105200711884517, "step": 2438, "train/total_loss": 0.1466900110244751 }, { "entropy": 8.7935152053833, "epoch": 0.24115087996836068, "mean_token_accuracy": 0.7602339386940002, "num_tokens": 12694506.0, "step": 2439, "train/ce_loss": 0.32769227027893066 }, { "epoch": 0.24115087996836068, "step": 2439, "train/sim_loss": 0.0234375 }, { "epoch": 0.24115087996836068, "step": 2439, "train/total_loss": 0.056206729263067245 }, { "epoch": 0.24124975281787622, "grad_norm": 0.9195847511291504, "learning_rate": 9.39944617514711e-06, "loss": 0.1548, "step": 2440 }, { "entropy": 8.913129806518555, "epoch": 0.24124975281787622, "mean_token_accuracy": 0.7268041372299194, "num_tokens": 12699932.0, "step": 2440, "train/ce_loss": 0.6807551980018616 }, { "epoch": 0.24124975281787622, "step": 2440, "train/sim_loss": 0.125 }, { "epoch": 0.24124975281787622, "step": 2440, "train/total_loss": 0.1930755227804184 }, { "entropy": 9.239109992980957, "epoch": 0.24134862566739174, "mean_token_accuracy": 0.6910112500190735, "num_tokens": 12705303.0, "step": 2441, "train/ce_loss": 0.7849908471107483 }, { "epoch": 0.24134862566739174, "step": 2441, "train/sim_loss": 0.078125 }, { "epoch": 0.24134862566739174, "step": 2441, "train/total_loss": 0.15662407875061035 }, { "entropy": 9.928701400756836, "epoch": 0.24144749851690725, "mean_token_accuracy": 0.6928701996803284, "num_tokens": 12710276.0, "step": 2442, "train/ce_loss": 1.6926069259643555 }, { "epoch": 0.24144749851690725, "step": 2442, "train/sim_loss": 0.08203125 }, { "epoch": 0.24144749851690725, "step": 2442, "train/total_loss": 0.2512919306755066 }, { "entropy": 9.428268432617188, "epoch": 0.2415463713664228, "mean_token_accuracy": 0.7929577231407166, "num_tokens": 12715450.0, "step": 2443, "train/ce_loss": 7.356356945820153e-06 }, { "epoch": 0.2415463713664228, "step": 2443, "train/sim_loss": 0.05078125 }, { "epoch": 0.2415463713664228, "step": 2443, "train/total_loss": 0.0507819838821888 }, { "entropy": 9.439188957214355, "epoch": 0.2416452442159383, "mean_token_accuracy": 0.756369411945343, "num_tokens": 12720552.0, "step": 2444, "train/ce_loss": 0.8201816082000732 }, { "epoch": 0.2416452442159383, "step": 2444, "train/sim_loss": 0.0703125 }, { "epoch": 0.2416452442159383, "step": 2444, "train/total_loss": 0.1523306667804718 }, { "entropy": 9.659980773925781, "epoch": 0.24174411706545382, "mean_token_accuracy": 0.7495462894439697, "num_tokens": 12725493.0, "step": 2445, "train/ce_loss": 8.589844583184458e-06 }, { "epoch": 0.24174411706545382, "step": 2445, "train/sim_loss": 0.07421875 }, { "epoch": 0.24174411706545382, "step": 2445, "train/total_loss": 0.07421960681676865 }, { "entropy": 9.174163818359375, "epoch": 0.24184298991496936, "mean_token_accuracy": 0.7416107654571533, "num_tokens": 12731051.0, "step": 2446, "train/ce_loss": 0.842617928981781 }, { "epoch": 0.24184298991496936, "step": 2446, "train/sim_loss": 0.109375 }, { "epoch": 0.24184298991496936, "step": 2446, "train/total_loss": 0.19363680481910706 }, { "entropy": 9.033437728881836, "epoch": 0.24194186276448487, "mean_token_accuracy": 0.7300546169281006, "num_tokens": 12736473.0, "step": 2447, "train/ce_loss": 0.6673880815505981 }, { "epoch": 0.24194186276448487, "step": 2447, "train/sim_loss": 0.1484375 }, { "epoch": 0.24194186276448487, "step": 2447, "train/total_loss": 0.2151763141155243 }, { "entropy": 9.262384414672852, "epoch": 0.24204073561400039, "mean_token_accuracy": 0.661478579044342, "num_tokens": 12741709.0, "step": 2448, "train/ce_loss": 1.1020350456237793 }, { "epoch": 0.24204073561400039, "step": 2448, "train/sim_loss": 0.1484375 }, { "epoch": 0.24204073561400039, "step": 2448, "train/total_loss": 0.25864100456237793 }, { "entropy": 9.353168487548828, "epoch": 0.24213960846351593, "mean_token_accuracy": 0.743073046207428, "num_tokens": 12746961.0, "step": 2449, "train/ce_loss": 0.5089231133460999 }, { "epoch": 0.24213960846351593, "step": 2449, "train/sim_loss": 0.02734375 }, { "epoch": 0.24213960846351593, "step": 2449, "train/total_loss": 0.07823605835437775 }, { "entropy": 8.918380737304688, "epoch": 0.24223848131303144, "mean_token_accuracy": 0.7183098793029785, "num_tokens": 12752208.0, "step": 2450, "train/ce_loss": 0.6301059126853943 }, { "epoch": 0.24223848131303144, "step": 2450, "train/sim_loss": 0.0703125 }, { "epoch": 0.24223848131303144, "step": 2450, "train/total_loss": 0.13332310318946838 }, { "entropy": 9.42928695678711, "epoch": 0.24233735416254695, "mean_token_accuracy": 0.7073863744735718, "num_tokens": 12757351.0, "step": 2451, "train/ce_loss": 1.2831615209579468 }, { "epoch": 0.24233735416254695, "step": 2451, "train/sim_loss": 0.05078125 }, { "epoch": 0.24233735416254695, "step": 2451, "train/total_loss": 0.17909739911556244 }, { "entropy": 10.1730375289917, "epoch": 0.2424362270120625, "mean_token_accuracy": 0.7560975551605225, "num_tokens": 12762123.0, "step": 2452, "train/ce_loss": 6.707434295094572e-06 }, { "epoch": 0.2424362270120625, "step": 2452, "train/sim_loss": 0.02734375 }, { "epoch": 0.2424362270120625, "step": 2452, "train/total_loss": 0.027344420552253723 }, { "entropy": 9.21902847290039, "epoch": 0.242535099861578, "mean_token_accuracy": 0.7799353003501892, "num_tokens": 12767656.0, "step": 2453, "train/ce_loss": 0.953403651714325 }, { "epoch": 0.242535099861578, "step": 2453, "train/sim_loss": 0.0234375 }, { "epoch": 0.242535099861578, "step": 2453, "train/total_loss": 0.11877786368131638 }, { "entropy": 9.858123779296875, "epoch": 0.24263397271109352, "mean_token_accuracy": 0.7278761267662048, "num_tokens": 12772502.0, "step": 2454, "train/ce_loss": 2.1266531944274902 }, { "epoch": 0.24263397271109352, "step": 2454, "train/sim_loss": 0.09765625 }, { "epoch": 0.24263397271109352, "step": 2454, "train/total_loss": 0.310321569442749 }, { "entropy": 9.473504066467285, "epoch": 0.24273284556060906, "mean_token_accuracy": 0.7373887300491333, "num_tokens": 12777684.0, "step": 2455, "train/ce_loss": 1.3440054655075073 }, { "epoch": 0.24273284556060906, "step": 2455, "train/sim_loss": 0.11328125 }, { "epoch": 0.24273284556060906, "step": 2455, "train/total_loss": 0.24768179655075073 }, { "entropy": 9.814481735229492, "epoch": 0.24283171841012458, "mean_token_accuracy": 0.6863710880279541, "num_tokens": 12782738.0, "step": 2456, "train/ce_loss": 3.664860969365691e-06 }, { "epoch": 0.24283171841012458, "step": 2456, "train/sim_loss": 0.02734375 }, { "epoch": 0.24283171841012458, "step": 2456, "train/total_loss": 0.0273441169410944 }, { "entropy": 9.205331802368164, "epoch": 0.24293059125964012, "mean_token_accuracy": 0.7039626836776733, "num_tokens": 12788094.0, "step": 2457, "train/ce_loss": 0.9541040658950806 }, { "epoch": 0.24293059125964012, "step": 2457, "train/sim_loss": 0.10546875 }, { "epoch": 0.24293059125964012, "step": 2457, "train/total_loss": 0.20087915658950806 }, { "entropy": 8.715977668762207, "epoch": 0.24302946410915563, "mean_token_accuracy": 0.7298076748847961, "num_tokens": 12793628.0, "step": 2458, "train/ce_loss": 0.3996654152870178 }, { "epoch": 0.24302946410915563, "step": 2458, "train/sim_loss": 0.0234375 }, { "epoch": 0.24302946410915563, "step": 2458, "train/total_loss": 0.06340403854846954 }, { "entropy": 8.838790893554688, "epoch": 0.24312833695867114, "mean_token_accuracy": 0.711033284664154, "num_tokens": 12799246.0, "step": 2459, "train/ce_loss": 0.5818819403648376 }, { "epoch": 0.24312833695867114, "step": 2459, "train/sim_loss": 0.06640625 }, { "epoch": 0.24312833695867114, "step": 2459, "train/total_loss": 0.12459444999694824 }, { "epoch": 0.24322720980818668, "grad_norm": 0.944430410861969, "learning_rate": 9.394501310389161e-06, "loss": 0.1629, "step": 2460 }, { "entropy": 9.599706649780273, "epoch": 0.24322720980818668, "mean_token_accuracy": 0.7435455918312073, "num_tokens": 12804276.0, "step": 2460, "train/ce_loss": 1.0893281698226929 }, { "epoch": 0.24322720980818668, "step": 2460, "train/sim_loss": 0.0390625 }, { "epoch": 0.24322720980818668, "step": 2460, "train/total_loss": 0.14799532294273376 }, { "entropy": 9.560248374938965, "epoch": 0.2433260826577022, "mean_token_accuracy": 0.728787899017334, "num_tokens": 12809374.0, "step": 2461, "train/ce_loss": 0.5702160596847534 }, { "epoch": 0.2433260826577022, "step": 2461, "train/sim_loss": 0.06640625 }, { "epoch": 0.2433260826577022, "step": 2461, "train/total_loss": 0.1234278529882431 }, { "entropy": 9.090559959411621, "epoch": 0.2434249555072177, "mean_token_accuracy": 0.7119021415710449, "num_tokens": 12814732.0, "step": 2462, "train/ce_loss": 0.943033754825592 }, { "epoch": 0.2434249555072177, "step": 2462, "train/sim_loss": 0.1171875 }, { "epoch": 0.2434249555072177, "step": 2462, "train/total_loss": 0.21149086952209473 }, { "entropy": 9.076742172241211, "epoch": 0.24352382835673325, "mean_token_accuracy": 0.7723840475082397, "num_tokens": 12820124.0, "step": 2463, "train/ce_loss": 0.5626240968704224 }, { "epoch": 0.24352382835673325, "step": 2463, "train/sim_loss": 0.09375 }, { "epoch": 0.24352382835673325, "step": 2463, "train/total_loss": 0.15001240372657776 }, { "entropy": 8.93365478515625, "epoch": 0.24362270120624877, "mean_token_accuracy": 0.7019027471542358, "num_tokens": 12825515.0, "step": 2464, "train/ce_loss": 2.3353025913238525 }, { "epoch": 0.24362270120624877, "step": 2464, "train/sim_loss": 0.12890625 }, { "epoch": 0.24362270120624877, "step": 2464, "train/total_loss": 0.36243653297424316 }, { "entropy": 9.150115966796875, "epoch": 0.24372157405576428, "mean_token_accuracy": 0.7881944179534912, "num_tokens": 12830882.0, "step": 2465, "train/ce_loss": 0.7780004143714905 }, { "epoch": 0.24372157405576428, "step": 2465, "train/sim_loss": 0.09375 }, { "epoch": 0.24372157405576428, "step": 2465, "train/total_loss": 0.17155003547668457 }, { "entropy": 9.220715522766113, "epoch": 0.24382044690527982, "mean_token_accuracy": 0.7518518567085266, "num_tokens": 12836315.0, "step": 2466, "train/ce_loss": 0.7762071490287781 }, { "epoch": 0.24382044690527982, "step": 2466, "train/sim_loss": 0.05859375 }, { "epoch": 0.24382044690527982, "step": 2466, "train/total_loss": 0.1362144649028778 }, { "entropy": 9.594511985778809, "epoch": 0.24391931975479533, "mean_token_accuracy": 0.7945205569267273, "num_tokens": 12841377.0, "step": 2467, "train/ce_loss": 0.9444525241851807 }, { "epoch": 0.24391931975479533, "step": 2467, "train/sim_loss": 0.05078125 }, { "epoch": 0.24391931975479533, "step": 2467, "train/total_loss": 0.14522650837898254 }, { "entropy": 9.246139526367188, "epoch": 0.24401819260431085, "mean_token_accuracy": 0.7575757503509521, "num_tokens": 12846724.0, "step": 2468, "train/ce_loss": 0.9505940079689026 }, { "epoch": 0.24401819260431085, "step": 2468, "train/sim_loss": 0.10546875 }, { "epoch": 0.24401819260431085, "step": 2468, "train/total_loss": 0.20052814483642578 }, { "entropy": 9.038537979125977, "epoch": 0.2441170654538264, "mean_token_accuracy": 0.7349112629890442, "num_tokens": 12852019.0, "step": 2469, "train/ce_loss": 1.0029042959213257 }, { "epoch": 0.2441170654538264, "step": 2469, "train/sim_loss": 0.046875 }, { "epoch": 0.2441170654538264, "step": 2469, "train/total_loss": 0.1471654325723648 }, { "entropy": 10.116767883300781, "epoch": 0.2442159383033419, "mean_token_accuracy": 0.792682945728302, "num_tokens": 12856759.0, "step": 2470, "train/ce_loss": 0.8636196255683899 }, { "epoch": 0.2442159383033419, "step": 2470, "train/sim_loss": 0.06640625 }, { "epoch": 0.2442159383033419, "step": 2470, "train/total_loss": 0.15276822447776794 }, { "entropy": 9.359464645385742, "epoch": 0.24431481115285741, "mean_token_accuracy": 0.6978609561920166, "num_tokens": 12861969.0, "step": 2471, "train/ce_loss": 0.5816138982772827 }, { "epoch": 0.24431481115285741, "step": 2471, "train/sim_loss": 0.046875 }, { "epoch": 0.24431481115285741, "step": 2471, "train/total_loss": 0.10503639280796051 }, { "entropy": 8.996708869934082, "epoch": 0.24441368400237296, "mean_token_accuracy": 0.7271783947944641, "num_tokens": 12867383.0, "step": 2472, "train/ce_loss": 0.7601787447929382 }, { "epoch": 0.24441368400237296, "step": 2472, "train/sim_loss": 0.10546875 }, { "epoch": 0.24441368400237296, "step": 2472, "train/total_loss": 0.18148663640022278 }, { "entropy": 9.336339950561523, "epoch": 0.24451255685188847, "mean_token_accuracy": 0.7127272486686707, "num_tokens": 12872678.0, "step": 2473, "train/ce_loss": 0.5133230686187744 }, { "epoch": 0.24451255685188847, "step": 2473, "train/sim_loss": 0.02734375 }, { "epoch": 0.24451255685188847, "step": 2473, "train/total_loss": 0.07867605984210968 }, { "entropy": 9.151508331298828, "epoch": 0.24461142970140398, "mean_token_accuracy": 0.7403314709663391, "num_tokens": 12878006.0, "step": 2474, "train/ce_loss": 0.4109875559806824 }, { "epoch": 0.24461142970140398, "step": 2474, "train/sim_loss": 0.11328125 }, { "epoch": 0.24461142970140398, "step": 2474, "train/total_loss": 0.15438000857830048 }, { "entropy": 9.76147174835205, "epoch": 0.24471030255091952, "mean_token_accuracy": 0.748633861541748, "num_tokens": 12883008.0, "step": 2475, "train/ce_loss": 1.5969499349594116 }, { "epoch": 0.24471030255091952, "step": 2475, "train/sim_loss": 0.0546875 }, { "epoch": 0.24471030255091952, "step": 2475, "train/total_loss": 0.21438249945640564 }, { "entropy": 8.862588882446289, "epoch": 0.24480917540043504, "mean_token_accuracy": 0.7681007385253906, "num_tokens": 12888452.0, "step": 2476, "train/ce_loss": 0.4870178997516632 }, { "epoch": 0.24480917540043504, "step": 2476, "train/sim_loss": 0.0546875 }, { "epoch": 0.24480917540043504, "step": 2476, "train/total_loss": 0.10338929295539856 }, { "entropy": 9.651182174682617, "epoch": 0.24490804824995058, "mean_token_accuracy": 0.7303754091262817, "num_tokens": 12893440.0, "step": 2477, "train/ce_loss": 0.9295541644096375 }, { "epoch": 0.24490804824995058, "step": 2477, "train/sim_loss": 0.05078125 }, { "epoch": 0.24490804824995058, "step": 2477, "train/total_loss": 0.14373666048049927 }, { "entropy": 9.563039779663086, "epoch": 0.2450069210994661, "mean_token_accuracy": 0.6873977184295654, "num_tokens": 12898499.0, "step": 2478, "train/ce_loss": 3.324387989778188e-06 }, { "epoch": 0.2450069210994661, "step": 2478, "train/sim_loss": 0.03125 }, { "epoch": 0.2450069210994661, "step": 2478, "train/total_loss": 0.03125033155083656 }, { "entropy": 9.432746887207031, "epoch": 0.2451057939489816, "mean_token_accuracy": 0.6913043260574341, "num_tokens": 12903646.0, "step": 2479, "train/ce_loss": 5.7135166571242735e-06 }, { "epoch": 0.2451057939489816, "step": 2479, "train/sim_loss": 0.0703125 }, { "epoch": 0.2451057939489816, "step": 2479, "train/total_loss": 0.07031307369470596 }, { "epoch": 0.24520466679849715, "grad_norm": 0.927836000919342, "learning_rate": 9.389556445631213e-06, "loss": 0.1475, "step": 2480 }, { "entropy": 9.195361137390137, "epoch": 0.24520466679849715, "mean_token_accuracy": 0.7652068138122559, "num_tokens": 12908980.0, "step": 2480, "train/ce_loss": 0.8688988089561462 }, { "epoch": 0.24520466679849715, "step": 2480, "train/sim_loss": 0.0234375 }, { "epoch": 0.24520466679849715, "step": 2480, "train/total_loss": 0.11032738536596298 }, { "entropy": 9.80865478515625, "epoch": 0.24530353964801266, "mean_token_accuracy": 0.7228682041168213, "num_tokens": 12913917.0, "step": 2481, "train/ce_loss": 0.9114884734153748 }, { "epoch": 0.24530353964801266, "step": 2481, "train/sim_loss": 0.06640625 }, { "epoch": 0.24530353964801266, "step": 2481, "train/total_loss": 0.15755510330200195 }, { "entropy": 8.944324493408203, "epoch": 0.24540241249752817, "mean_token_accuracy": 0.7332035303115845, "num_tokens": 12919367.0, "step": 2482, "train/ce_loss": 0.8474249243736267 }, { "epoch": 0.24540241249752817, "step": 2482, "train/sim_loss": 0.15625 }, { "epoch": 0.24540241249752817, "step": 2482, "train/total_loss": 0.2409924864768982 }, { "entropy": 9.81283187866211, "epoch": 0.2455012853470437, "mean_token_accuracy": 0.717756986618042, "num_tokens": 12924512.0, "step": 2483, "train/ce_loss": 1.4316576719284058 }, { "epoch": 0.2455012853470437, "step": 2483, "train/sim_loss": 0.125 }, { "epoch": 0.2455012853470437, "step": 2483, "train/total_loss": 0.2681657671928406 }, { "entropy": 9.55518913269043, "epoch": 0.24560015819655923, "mean_token_accuracy": 0.695588231086731, "num_tokens": 12929645.0, "step": 2484, "train/ce_loss": 1.2374972105026245 }, { "epoch": 0.24560015819655923, "step": 2484, "train/sim_loss": 0.0703125 }, { "epoch": 0.24560015819655923, "step": 2484, "train/total_loss": 0.1940622329711914 }, { "entropy": 9.15027904510498, "epoch": 0.24569903104607474, "mean_token_accuracy": 0.7681818008422852, "num_tokens": 12935026.0, "step": 2485, "train/ce_loss": 1.2009178400039673 }, { "epoch": 0.24569903104607474, "step": 2485, "train/sim_loss": 0.08203125 }, { "epoch": 0.24569903104607474, "step": 2485, "train/total_loss": 0.20212304592132568 }, { "entropy": 9.405292510986328, "epoch": 0.24579790389559028, "mean_token_accuracy": 0.7732558250427246, "num_tokens": 12940185.0, "step": 2486, "train/ce_loss": 0.4908983111381531 }, { "epoch": 0.24579790389559028, "step": 2486, "train/sim_loss": 0.05078125 }, { "epoch": 0.24579790389559028, "step": 2486, "train/total_loss": 0.09987108409404755 }, { "entropy": 9.505070686340332, "epoch": 0.2458967767451058, "mean_token_accuracy": 0.7264000177383423, "num_tokens": 12945220.0, "step": 2487, "train/ce_loss": 1.4660943746566772 }, { "epoch": 0.2458967767451058, "step": 2487, "train/sim_loss": 0.125 }, { "epoch": 0.2458967767451058, "step": 2487, "train/total_loss": 0.27160942554473877 }, { "entropy": 9.689166069030762, "epoch": 0.2459956495946213, "mean_token_accuracy": 0.785263180732727, "num_tokens": 12950181.0, "step": 2488, "train/ce_loss": 1.524032473564148 }, { "epoch": 0.2459956495946213, "step": 2488, "train/sim_loss": 0.03515625 }, { "epoch": 0.2459956495946213, "step": 2488, "train/total_loss": 0.18755950033664703 }, { "entropy": 9.337489128112793, "epoch": 0.24609452244413685, "mean_token_accuracy": 0.7402597665786743, "num_tokens": 12955430.0, "step": 2489, "train/ce_loss": 1.362806797027588 }, { "epoch": 0.24609452244413685, "step": 2489, "train/sim_loss": 0.0703125 }, { "epoch": 0.24609452244413685, "step": 2489, "train/total_loss": 0.20659318566322327 }, { "entropy": 9.787436485290527, "epoch": 0.24619339529365236, "mean_token_accuracy": 0.75789475440979, "num_tokens": 12960340.0, "step": 2490, "train/ce_loss": 1.1043155193328857 }, { "epoch": 0.24619339529365236, "step": 2490, "train/sim_loss": 0.078125 }, { "epoch": 0.24619339529365236, "step": 2490, "train/total_loss": 0.18855655193328857 }, { "entropy": 9.608366012573242, "epoch": 0.24629226814316788, "mean_token_accuracy": 0.7488986849784851, "num_tokens": 12965472.0, "step": 2491, "train/ce_loss": 1.2403558492660522 }, { "epoch": 0.24629226814316788, "step": 2491, "train/sim_loss": 0.0703125 }, { "epoch": 0.24629226814316788, "step": 2491, "train/total_loss": 0.19434809684753418 }, { "entropy": 9.125654220581055, "epoch": 0.24639114099268342, "mean_token_accuracy": 0.7756613492965698, "num_tokens": 12970877.0, "step": 2492, "train/ce_loss": 0.8300853371620178 }, { "epoch": 0.24639114099268342, "step": 2492, "train/sim_loss": 0.078125 }, { "epoch": 0.24639114099268342, "step": 2492, "train/total_loss": 0.1611335277557373 }, { "entropy": 9.14429759979248, "epoch": 0.24649001384219893, "mean_token_accuracy": 0.7069351077079773, "num_tokens": 12976191.0, "step": 2493, "train/ce_loss": 0.6313163638114929 }, { "epoch": 0.24649001384219893, "step": 2493, "train/sim_loss": 0.0625 }, { "epoch": 0.24649001384219893, "step": 2493, "train/total_loss": 0.12563163042068481 }, { "entropy": 9.490312576293945, "epoch": 0.24658888669171444, "mean_token_accuracy": 0.7224606871604919, "num_tokens": 12981315.0, "step": 2494, "train/ce_loss": 1.229867696762085 }, { "epoch": 0.24658888669171444, "step": 2494, "train/sim_loss": 0.109375 }, { "epoch": 0.24658888669171444, "step": 2494, "train/total_loss": 0.23236176371574402 }, { "entropy": 9.757224082946777, "epoch": 0.24668775954122998, "mean_token_accuracy": 0.7421150207519531, "num_tokens": 12986288.0, "step": 2495, "train/ce_loss": 0.757111132144928 }, { "epoch": 0.24668775954122998, "step": 2495, "train/sim_loss": 0.02734375 }, { "epoch": 0.24668775954122998, "step": 2495, "train/total_loss": 0.10305486619472504 }, { "entropy": 9.02226448059082, "epoch": 0.2467866323907455, "mean_token_accuracy": 0.6985596418380737, "num_tokens": 12991727.0, "step": 2496, "train/ce_loss": 0.681783139705658 }, { "epoch": 0.2467866323907455, "step": 2496, "train/sim_loss": 0.0390625 }, { "epoch": 0.2467866323907455, "step": 2496, "train/total_loss": 0.10724081844091415 }, { "entropy": 9.69623851776123, "epoch": 0.246885505240261, "mean_token_accuracy": 0.7811993360519409, "num_tokens": 12996742.0, "step": 2497, "train/ce_loss": 0.8984249234199524 }, { "epoch": 0.246885505240261, "step": 2497, "train/sim_loss": 0.05859375 }, { "epoch": 0.246885505240261, "step": 2497, "train/total_loss": 0.14843624830245972 }, { "entropy": 9.031787872314453, "epoch": 0.24698437808977655, "mean_token_accuracy": 0.6890459656715393, "num_tokens": 13002119.0, "step": 2498, "train/ce_loss": 0.5838040709495544 }, { "epoch": 0.24698437808977655, "step": 2498, "train/sim_loss": 0.09765625 }, { "epoch": 0.24698437808977655, "step": 2498, "train/total_loss": 0.15603666007518768 }, { "entropy": 8.913003921508789, "epoch": 0.24708325093929207, "mean_token_accuracy": 0.7412280440330505, "num_tokens": 13007483.0, "step": 2499, "train/ce_loss": 1.175858736038208 }, { "epoch": 0.24708325093929207, "step": 2499, "train/sim_loss": 0.0546875 }, { "epoch": 0.24708325093929207, "step": 2499, "train/total_loss": 0.17227336764335632 }, { "epoch": 0.2471821237888076, "grad_norm": 0.7621631026268005, "learning_rate": 9.384611580873264e-06, "loss": 0.1552, "step": 2500 }, { "entropy": 8.792287826538086, "epoch": 0.2471821237888076, "mean_token_accuracy": 0.7477656602859497, "num_tokens": 13012953.0, "step": 2500, "train/ce_loss": 1.028020977973938 }, { "epoch": 0.2471821237888076, "step": 2500, "train/sim_loss": 0.0625 }, { "epoch": 0.2471821237888076, "step": 2500, "train/total_loss": 0.1653020977973938 }, { "entropy": 9.223542213439941, "epoch": 0.24728099663832312, "mean_token_accuracy": 0.7072847485542297, "num_tokens": 13018167.0, "step": 2501, "train/ce_loss": 0.8749057650566101 }, { "epoch": 0.24728099663832312, "step": 2501, "train/sim_loss": 0.0390625 }, { "epoch": 0.24728099663832312, "step": 2501, "train/total_loss": 0.12655308842658997 }, { "entropy": 9.162238121032715, "epoch": 0.24737986948783863, "mean_token_accuracy": 0.7095178961753845, "num_tokens": 13023442.0, "step": 2502, "train/ce_loss": 0.6095376014709473 }, { "epoch": 0.24737986948783863, "step": 2502, "train/sim_loss": 0.08203125 }, { "epoch": 0.24737986948783863, "step": 2502, "train/total_loss": 0.1429850161075592 }, { "entropy": 9.14134693145752, "epoch": 0.24747874233735417, "mean_token_accuracy": 0.7111111283302307, "num_tokens": 13028807.0, "step": 2503, "train/ce_loss": 0.769873857498169 }, { "epoch": 0.24747874233735417, "step": 2503, "train/sim_loss": 0.046875 }, { "epoch": 0.24747874233735417, "step": 2503, "train/total_loss": 0.1238623857498169 }, { "entropy": 9.429458618164062, "epoch": 0.2475776151868697, "mean_token_accuracy": 0.7438271641731262, "num_tokens": 13033910.0, "step": 2504, "train/ce_loss": 0.831933319568634 }, { "epoch": 0.2475776151868697, "step": 2504, "train/sim_loss": 0.0390625 }, { "epoch": 0.2475776151868697, "step": 2504, "train/total_loss": 0.1222558319568634 }, { "entropy": 9.209266662597656, "epoch": 0.2476764880363852, "mean_token_accuracy": 0.761083722114563, "num_tokens": 13039171.0, "step": 2505, "train/ce_loss": 0.9590550065040588 }, { "epoch": 0.2476764880363852, "step": 2505, "train/sim_loss": 0.08984375 }, { "epoch": 0.2476764880363852, "step": 2505, "train/total_loss": 0.18574926257133484 }, { "entropy": 10.064163208007812, "epoch": 0.24777536088590074, "mean_token_accuracy": 0.7623762488365173, "num_tokens": 13043995.0, "step": 2506, "train/ce_loss": 6.885237326059723e-06 }, { "epoch": 0.24777536088590074, "step": 2506, "train/sim_loss": 0.046875 }, { "epoch": 0.24777536088590074, "step": 2506, "train/total_loss": 0.046875689178705215 }, { "entropy": 9.656054496765137, "epoch": 0.24787423373541626, "mean_token_accuracy": 0.7292993664741516, "num_tokens": 13049047.0, "step": 2507, "train/ce_loss": 0.7390437722206116 }, { "epoch": 0.24787423373541626, "step": 2507, "train/sim_loss": 0.046875 }, { "epoch": 0.24787423373541626, "step": 2507, "train/total_loss": 0.1207793802022934 }, { "entropy": 10.271624565124512, "epoch": 0.24797310658493177, "mean_token_accuracy": 0.7892976403236389, "num_tokens": 13053750.0, "step": 2508, "train/ce_loss": 1.015043020248413 }, { "epoch": 0.24797310658493177, "step": 2508, "train/sim_loss": 0.0625 }, { "epoch": 0.24797310658493177, "step": 2508, "train/total_loss": 0.16400429606437683 }, { "entropy": 10.410440444946289, "epoch": 0.2480719794344473, "mean_token_accuracy": 0.7052238583564758, "num_tokens": 13058442.0, "step": 2509, "train/ce_loss": 3.608438730239868 }, { "epoch": 0.2480719794344473, "step": 2509, "train/sim_loss": 0.07421875 }, { "epoch": 0.2480719794344473, "step": 2509, "train/total_loss": 0.43506261706352234 }, { "entropy": 9.161474227905273, "epoch": 0.24817085228396282, "mean_token_accuracy": 0.7288776636123657, "num_tokens": 13063739.0, "step": 2510, "train/ce_loss": 1.0487580299377441 }, { "epoch": 0.24817085228396282, "step": 2510, "train/sim_loss": 0.05078125 }, { "epoch": 0.24817085228396282, "step": 2510, "train/total_loss": 0.15565705299377441 }, { "entropy": 10.024190902709961, "epoch": 0.24826972513347834, "mean_token_accuracy": 0.7843601703643799, "num_tokens": 13068589.0, "step": 2511, "train/ce_loss": 2.9918932341388427e-05 }, { "epoch": 0.24826972513347834, "step": 2511, "train/sim_loss": 0.0625 }, { "epoch": 0.24826972513347834, "step": 2511, "train/total_loss": 0.06250299513339996 }, { "entropy": 9.230805397033691, "epoch": 0.24836859798299388, "mean_token_accuracy": 0.7678160667419434, "num_tokens": 13073950.0, "step": 2512, "train/ce_loss": 0.6158214807510376 }, { "epoch": 0.24836859798299388, "step": 2512, "train/sim_loss": 0.05078125 }, { "epoch": 0.24836859798299388, "step": 2512, "train/total_loss": 0.11236339807510376 }, { "entropy": 9.491394996643066, "epoch": 0.2484674708325094, "mean_token_accuracy": 0.7425249218940735, "num_tokens": 13078989.0, "step": 2513, "train/ce_loss": 4.977840035280678e-06 }, { "epoch": 0.2484674708325094, "step": 2513, "train/sim_loss": 0.08203125 }, { "epoch": 0.2484674708325094, "step": 2513, "train/total_loss": 0.0820317491889 }, { "entropy": 9.507766723632812, "epoch": 0.2485663436820249, "mean_token_accuracy": 0.7412790656089783, "num_tokens": 13084140.0, "step": 2514, "train/ce_loss": 4.862940841121599e-06 }, { "epoch": 0.2485663436820249, "step": 2514, "train/sim_loss": 0.06640625 }, { "epoch": 0.2485663436820249, "step": 2514, "train/total_loss": 0.0664067342877388 }, { "entropy": 9.013659477233887, "epoch": 0.24866521653154045, "mean_token_accuracy": 0.7486842274665833, "num_tokens": 13089351.0, "step": 2515, "train/ce_loss": 0.6694341897964478 }, { "epoch": 0.24866521653154045, "step": 2515, "train/sim_loss": 0.09375 }, { "epoch": 0.24866521653154045, "step": 2515, "train/total_loss": 0.16069342195987701 }, { "entropy": 9.053864479064941, "epoch": 0.24876408938105596, "mean_token_accuracy": 0.8036322593688965, "num_tokens": 13094670.0, "step": 2516, "train/ce_loss": 0.5531548857688904 }, { "epoch": 0.24876408938105596, "step": 2516, "train/sim_loss": 0.01953125 }, { "epoch": 0.24876408938105596, "step": 2516, "train/total_loss": 0.07484674453735352 }, { "entropy": 9.353614807128906, "epoch": 0.24886296223057147, "mean_token_accuracy": 0.7054973840713501, "num_tokens": 13099916.0, "step": 2517, "train/ce_loss": 4.7901185098453425e-06 }, { "epoch": 0.24886296223057147, "step": 2517, "train/sim_loss": 0.0703125 }, { "epoch": 0.24886296223057147, "step": 2517, "train/total_loss": 0.0703129768371582 }, { "entropy": 9.010061264038086, "epoch": 0.248961835080087, "mean_token_accuracy": 0.7446569204330444, "num_tokens": 13105308.0, "step": 2518, "train/ce_loss": 0.6093876361846924 }, { "epoch": 0.248961835080087, "step": 2518, "train/sim_loss": 0.02734375 }, { "epoch": 0.248961835080087, "step": 2518, "train/total_loss": 0.088282510638237 }, { "entropy": 9.791950225830078, "epoch": 0.24906070792960253, "mean_token_accuracy": 0.7227926254272461, "num_tokens": 13110241.0, "step": 2519, "train/ce_loss": 1.5051255226135254 }, { "epoch": 0.24906070792960253, "step": 2519, "train/sim_loss": 0.0703125 }, { "epoch": 0.24906070792960253, "step": 2519, "train/total_loss": 0.22082506120204926 }, { "epoch": 0.24915958077911807, "grad_norm": 1.0966280698776245, "learning_rate": 9.379666716115316e-06, "loss": 0.1507, "step": 2520 }, { "entropy": 9.009889602661133, "epoch": 0.24915958077911807, "mean_token_accuracy": 0.7174638509750366, "num_tokens": 13115607.0, "step": 2520, "train/ce_loss": 0.9668665528297424 }, { "epoch": 0.24915958077911807, "step": 2520, "train/sim_loss": 0.13671875 }, { "epoch": 0.24915958077911807, "step": 2520, "train/total_loss": 0.23340541124343872 }, { "entropy": 9.36453628540039, "epoch": 0.24925845362863358, "mean_token_accuracy": 0.7572413682937622, "num_tokens": 13120805.0, "step": 2521, "train/ce_loss": 0.5397550463676453 }, { "epoch": 0.24925845362863358, "step": 2521, "train/sim_loss": 0.06640625 }, { "epoch": 0.24925845362863358, "step": 2521, "train/total_loss": 0.12038175761699677 }, { "entropy": 9.566434860229492, "epoch": 0.2493573264781491, "mean_token_accuracy": 0.693708598613739, "num_tokens": 13125908.0, "step": 2522, "train/ce_loss": 6.7832647800969426e-06 }, { "epoch": 0.2493573264781491, "step": 2522, "train/sim_loss": 0.0625 }, { "epoch": 0.2493573264781491, "step": 2522, "train/total_loss": 0.06250067800283432 }, { "entropy": 9.14031982421875, "epoch": 0.24945619932766464, "mean_token_accuracy": 0.7293233275413513, "num_tokens": 13131177.0, "step": 2523, "train/ce_loss": 0.9928783774375916 }, { "epoch": 0.24945619932766464, "step": 2523, "train/sim_loss": 0.046875 }, { "epoch": 0.24945619932766464, "step": 2523, "train/total_loss": 0.14616283774375916 }, { "entropy": 9.486601829528809, "epoch": 0.24955507217718015, "mean_token_accuracy": 0.72398841381073, "num_tokens": 13136337.0, "step": 2524, "train/ce_loss": 1.0188546180725098 }, { "epoch": 0.24955507217718015, "step": 2524, "train/sim_loss": 0.02734375 }, { "epoch": 0.24955507217718015, "step": 2524, "train/total_loss": 0.12922921776771545 }, { "entropy": 9.250629425048828, "epoch": 0.24965394502669566, "mean_token_accuracy": 0.6994082927703857, "num_tokens": 13141661.0, "step": 2525, "train/ce_loss": 0.5898566246032715 }, { "epoch": 0.24965394502669566, "step": 2525, "train/sim_loss": 0.08203125 }, { "epoch": 0.24965394502669566, "step": 2525, "train/total_loss": 0.1410169154405594 }, { "entropy": 8.768533706665039, "epoch": 0.2497528178762112, "mean_token_accuracy": 0.7286995649337769, "num_tokens": 13147085.0, "step": 2526, "train/ce_loss": 1.1483701467514038 }, { "epoch": 0.2497528178762112, "step": 2526, "train/sim_loss": 0.078125 }, { "epoch": 0.2497528178762112, "step": 2526, "train/total_loss": 0.19296202063560486 }, { "entropy": 9.395888328552246, "epoch": 0.24985169072572672, "mean_token_accuracy": 0.7089946866035461, "num_tokens": 13152328.0, "step": 2527, "train/ce_loss": 1.4271979331970215 }, { "epoch": 0.24985169072572672, "step": 2527, "train/sim_loss": 0.0546875 }, { "epoch": 0.24985169072572672, "step": 2527, "train/total_loss": 0.1974072903394699 }, { "entropy": 9.114011764526367, "epoch": 0.24995056357524223, "mean_token_accuracy": 0.7521929740905762, "num_tokens": 13157862.0, "step": 2528, "train/ce_loss": 0.9586581587791443 }, { "epoch": 0.24995056357524223, "step": 2528, "train/sim_loss": 0.11328125 }, { "epoch": 0.24995056357524223, "step": 2528, "train/total_loss": 0.20914706587791443 }, { "entropy": 10.094575881958008, "epoch": 0.25004943642475774, "mean_token_accuracy": 0.7950819730758667, "num_tokens": 13162524.0, "step": 2529, "train/ce_loss": 9.78577918431256e-06 }, { "epoch": 0.25004943642475774, "step": 2529, "train/sim_loss": 0.0546875 }, { "epoch": 0.25004943642475774, "step": 2529, "train/total_loss": 0.054688479751348495 }, { "entropy": 9.44721794128418, "epoch": 0.2501483092742733, "mean_token_accuracy": 0.6842923760414124, "num_tokens": 13167641.0, "step": 2530, "train/ce_loss": 0.9610504508018494 }, { "epoch": 0.2501483092742733, "step": 2530, "train/sim_loss": 0.0859375 }, { "epoch": 0.2501483092742733, "step": 2530, "train/total_loss": 0.18204253911972046 }, { "entropy": 8.911985397338867, "epoch": 0.2502471821237888, "mean_token_accuracy": 0.7421109676361084, "num_tokens": 13173074.0, "step": 2531, "train/ce_loss": 1.0923680067062378 }, { "epoch": 0.2502471821237888, "step": 2531, "train/sim_loss": 0.11328125 }, { "epoch": 0.2502471821237888, "step": 2531, "train/total_loss": 0.22251805663108826 }, { "entropy": 9.064131736755371, "epoch": 0.2503460549733043, "mean_token_accuracy": 0.7820823192596436, "num_tokens": 13178415.0, "step": 2532, "train/ce_loss": 0.7013729810714722 }, { "epoch": 0.2503460549733043, "step": 2532, "train/sim_loss": 0.09375 }, { "epoch": 0.2503460549733043, "step": 2532, "train/total_loss": 0.16388729214668274 }, { "entropy": 9.00645637512207, "epoch": 0.25044492782281985, "mean_token_accuracy": 0.6718266010284424, "num_tokens": 13183842.0, "step": 2533, "train/ce_loss": 0.8548325896263123 }, { "epoch": 0.25044492782281985, "step": 2533, "train/sim_loss": 0.01953125 }, { "epoch": 0.25044492782281985, "step": 2533, "train/total_loss": 0.10501451045274734 }, { "entropy": 8.99317741394043, "epoch": 0.2505438006723354, "mean_token_accuracy": 0.6922246217727661, "num_tokens": 13189247.0, "step": 2534, "train/ce_loss": 0.7916908860206604 }, { "epoch": 0.2505438006723354, "step": 2534, "train/sim_loss": 0.05078125 }, { "epoch": 0.2505438006723354, "step": 2534, "train/total_loss": 0.12995034456253052 }, { "entropy": 9.250624656677246, "epoch": 0.2506426735218509, "mean_token_accuracy": 0.744413435459137, "num_tokens": 13194438.0, "step": 2535, "train/ce_loss": 1.1858785152435303 }, { "epoch": 0.2506426735218509, "step": 2535, "train/sim_loss": 0.0859375 }, { "epoch": 0.2506426735218509, "step": 2535, "train/total_loss": 0.20452535152435303 }, { "entropy": 8.684514999389648, "epoch": 0.2507415463713664, "mean_token_accuracy": 0.7258979082107544, "num_tokens": 13199940.0, "step": 2536, "train/ce_loss": 0.5790791511535645 }, { "epoch": 0.2507415463713664, "step": 2536, "train/sim_loss": 0.05078125 }, { "epoch": 0.2507415463713664, "step": 2536, "train/total_loss": 0.10868916660547256 }, { "entropy": 9.351373672485352, "epoch": 0.25084041922088196, "mean_token_accuracy": 0.8149210810661316, "num_tokens": 13205120.0, "step": 2537, "train/ce_loss": 0.6165726184844971 }, { "epoch": 0.25084041922088196, "step": 2537, "train/sim_loss": 0.08984375 }, { "epoch": 0.25084041922088196, "step": 2537, "train/total_loss": 0.15150101482868195 }, { "entropy": 9.45691204071045, "epoch": 0.25093929207039745, "mean_token_accuracy": 0.7552674412727356, "num_tokens": 13210138.0, "step": 2538, "train/ce_loss": 1.2463139295578003 }, { "epoch": 0.25093929207039745, "step": 2538, "train/sim_loss": 0.05859375 }, { "epoch": 0.25093929207039745, "step": 2538, "train/total_loss": 0.18322515487670898 }, { "entropy": 8.860736846923828, "epoch": 0.251038164919913, "mean_token_accuracy": 0.7325102686882019, "num_tokens": 13215355.0, "step": 2539, "train/ce_loss": 0.6914465427398682 }, { "epoch": 0.251038164919913, "step": 2539, "train/sim_loss": 0.0703125 }, { "epoch": 0.251038164919913, "step": 2539, "train/total_loss": 0.13945716619491577 }, { "epoch": 0.25113703776942853, "grad_norm": 1.002090334892273, "learning_rate": 9.374721851357365e-06, "loss": 0.1571, "step": 2540 }, { "entropy": 9.910036087036133, "epoch": 0.25113703776942853, "mean_token_accuracy": 0.7692307829856873, "num_tokens": 13220275.0, "step": 2540, "train/ce_loss": 1.0769728422164917 }, { "epoch": 0.25113703776942853, "step": 2540, "train/sim_loss": 0.05078125 }, { "epoch": 0.25113703776942853, "step": 2540, "train/total_loss": 0.1584785282611847 }, { "entropy": 9.055376052856445, "epoch": 0.251235910618944, "mean_token_accuracy": 0.7429218292236328, "num_tokens": 13225675.0, "step": 2541, "train/ce_loss": 0.6915692687034607 }, { "epoch": 0.251235910618944, "step": 2541, "train/sim_loss": 0.01953125 }, { "epoch": 0.251235910618944, "step": 2541, "train/total_loss": 0.08868817985057831 }, { "entropy": 9.54286003112793, "epoch": 0.25133478346845955, "mean_token_accuracy": 0.7547425627708435, "num_tokens": 13231008.0, "step": 2542, "train/ce_loss": 6.5046788222389296e-06 }, { "epoch": 0.25133478346845955, "step": 2542, "train/sim_loss": 0.05078125 }, { "epoch": 0.25133478346845955, "step": 2542, "train/total_loss": 0.05078190192580223 }, { "entropy": 9.195383071899414, "epoch": 0.2514336563179751, "mean_token_accuracy": 0.701564371585846, "num_tokens": 13236334.0, "step": 2543, "train/ce_loss": 1.094359040260315 }, { "epoch": 0.2514336563179751, "step": 2543, "train/sim_loss": 0.109375 }, { "epoch": 0.2514336563179751, "step": 2543, "train/total_loss": 0.21881091594696045 }, { "entropy": 9.518705368041992, "epoch": 0.2515325291674906, "mean_token_accuracy": 0.7325581312179565, "num_tokens": 13241464.0, "step": 2544, "train/ce_loss": 0.729640543460846 }, { "epoch": 0.2515325291674906, "step": 2544, "train/sim_loss": 0.0625 }, { "epoch": 0.2515325291674906, "step": 2544, "train/total_loss": 0.13546405732631683 }, { "entropy": 9.59697151184082, "epoch": 0.2516314020170061, "mean_token_accuracy": 0.7433234453201294, "num_tokens": 13246605.0, "step": 2545, "train/ce_loss": 1.2695198059082031 }, { "epoch": 0.2516314020170061, "step": 2545, "train/sim_loss": 0.19140625 }, { "epoch": 0.2516314020170061, "step": 2545, "train/total_loss": 0.31835824251174927 }, { "entropy": 9.31287670135498, "epoch": 0.25173027486652166, "mean_token_accuracy": 0.6676300764083862, "num_tokens": 13251730.0, "step": 2546, "train/ce_loss": 1.5146877765655518 }, { "epoch": 0.25173027486652166, "step": 2546, "train/sim_loss": 0.09765625 }, { "epoch": 0.25173027486652166, "step": 2546, "train/total_loss": 0.24912503361701965 }, { "entropy": 9.147940635681152, "epoch": 0.25182914771603715, "mean_token_accuracy": 0.7267230749130249, "num_tokens": 13257016.0, "step": 2547, "train/ce_loss": 1.3301770687103271 }, { "epoch": 0.25182914771603715, "step": 2547, "train/sim_loss": 0.05078125 }, { "epoch": 0.25182914771603715, "step": 2547, "train/total_loss": 0.18379895389080048 }, { "entropy": 9.736891746520996, "epoch": 0.2519280205655527, "mean_token_accuracy": 0.7821576595306396, "num_tokens": 13261915.0, "step": 2548, "train/ce_loss": 1.6027624607086182 }, { "epoch": 0.2519280205655527, "step": 2548, "train/sim_loss": 0.09765625 }, { "epoch": 0.2519280205655527, "step": 2548, "train/total_loss": 0.25793248414993286 }, { "entropy": 9.215120315551758, "epoch": 0.25202689341506823, "mean_token_accuracy": 0.6797829270362854, "num_tokens": 13267133.0, "step": 2549, "train/ce_loss": 1.545188069343567 }, { "epoch": 0.25202689341506823, "step": 2549, "train/sim_loss": 0.125 }, { "epoch": 0.25202689341506823, "step": 2549, "train/total_loss": 0.27951881289482117 }, { "entropy": 9.126137733459473, "epoch": 0.2521257662645838, "mean_token_accuracy": 0.7073459625244141, "num_tokens": 13272442.0, "step": 2550, "train/ce_loss": 0.8889264464378357 }, { "epoch": 0.2521257662645838, "step": 2550, "train/sim_loss": 0.08203125 }, { "epoch": 0.2521257662645838, "step": 2550, "train/total_loss": 0.1709238886833191 }, { "entropy": 9.034530639648438, "epoch": 0.25222463911409926, "mean_token_accuracy": 0.7701271176338196, "num_tokens": 13277860.0, "step": 2551, "train/ce_loss": 1.3219208717346191 }, { "epoch": 0.25222463911409926, "step": 2551, "train/sim_loss": 0.0859375 }, { "epoch": 0.25222463911409926, "step": 2551, "train/total_loss": 0.21812959015369415 }, { "entropy": 9.521261215209961, "epoch": 0.2523235119636148, "mean_token_accuracy": 0.7627627849578857, "num_tokens": 13282954.0, "step": 2552, "train/ce_loss": 0.8999001383781433 }, { "epoch": 0.2523235119636148, "step": 2552, "train/sim_loss": 0.03125 }, { "epoch": 0.2523235119636148, "step": 2552, "train/total_loss": 0.12124001234769821 }, { "entropy": 9.218066215515137, "epoch": 0.25242238481313034, "mean_token_accuracy": 0.7368420958518982, "num_tokens": 13288302.0, "step": 2553, "train/ce_loss": 0.7164695858955383 }, { "epoch": 0.25242238481313034, "step": 2553, "train/sim_loss": 0.0546875 }, { "epoch": 0.25242238481313034, "step": 2553, "train/total_loss": 0.12633445858955383 }, { "entropy": 10.222471237182617, "epoch": 0.2525212576626458, "mean_token_accuracy": 0.7066974639892578, "num_tokens": 13293085.0, "step": 2554, "train/ce_loss": 1.2147566080093384 }, { "epoch": 0.2525212576626458, "step": 2554, "train/sim_loss": 0.05078125 }, { "epoch": 0.2525212576626458, "step": 2554, "train/total_loss": 0.17225691676139832 }, { "entropy": 9.82444953918457, "epoch": 0.25262013051216137, "mean_token_accuracy": 0.7295082211494446, "num_tokens": 13297993.0, "step": 2555, "train/ce_loss": 4.79925893159816e-06 }, { "epoch": 0.25262013051216137, "step": 2555, "train/sim_loss": 0.03125 }, { "epoch": 0.25262013051216137, "step": 2555, "train/total_loss": 0.0312504805624485 }, { "entropy": 9.691442489624023, "epoch": 0.2527190033616769, "mean_token_accuracy": 0.7115384340286255, "num_tokens": 13303070.0, "step": 2556, "train/ce_loss": 1.222868800163269 }, { "epoch": 0.2527190033616769, "step": 2556, "train/sim_loss": 0.140625 }, { "epoch": 0.2527190033616769, "step": 2556, "train/total_loss": 0.2629118859767914 }, { "entropy": 9.206101417541504, "epoch": 0.2528178762111924, "mean_token_accuracy": 0.7218863368034363, "num_tokens": 13308338.0, "step": 2557, "train/ce_loss": 0.688447117805481 }, { "epoch": 0.2528178762111924, "step": 2557, "train/sim_loss": 0.08984375 }, { "epoch": 0.2528178762111924, "step": 2557, "train/total_loss": 0.15868845582008362 }, { "entropy": 9.227208137512207, "epoch": 0.25291674906070793, "mean_token_accuracy": 0.7322468161582947, "num_tokens": 13313619.0, "step": 2558, "train/ce_loss": 0.6929495930671692 }, { "epoch": 0.25291674906070793, "step": 2558, "train/sim_loss": 0.05078125 }, { "epoch": 0.25291674906070793, "step": 2558, "train/total_loss": 0.12007620930671692 }, { "entropy": 9.633859634399414, "epoch": 0.2530156219102235, "mean_token_accuracy": 0.6998368501663208, "num_tokens": 13318682.0, "step": 2559, "train/ce_loss": 1.085984706878662 }, { "epoch": 0.2530156219102235, "step": 2559, "train/sim_loss": 0.078125 }, { "epoch": 0.2530156219102235, "step": 2559, "train/total_loss": 0.1867234706878662 }, { "epoch": 0.25311449475973896, "grad_norm": 1.0176974534988403, "learning_rate": 9.369776986599417e-06, "loss": 0.1588, "step": 2560 }, { "entropy": 8.93287467956543, "epoch": 0.25311449475973896, "mean_token_accuracy": 0.7111681699752808, "num_tokens": 13323908.0, "step": 2560, "train/ce_loss": 0.551044762134552 }, { "epoch": 0.25311449475973896, "step": 2560, "train/sim_loss": 0.1171875 }, { "epoch": 0.25311449475973896, "step": 2560, "train/total_loss": 0.17229197919368744 }, { "entropy": 9.74870491027832, "epoch": 0.2532133676092545, "mean_token_accuracy": 0.7206896543502808, "num_tokens": 13328845.0, "step": 2561, "train/ce_loss": 1.1745083332061768 }, { "epoch": 0.2532133676092545, "step": 2561, "train/sim_loss": 0.08203125 }, { "epoch": 0.2532133676092545, "step": 2561, "train/total_loss": 0.19948208332061768 }, { "entropy": 9.688440322875977, "epoch": 0.25331224045877004, "mean_token_accuracy": 0.7081910967826843, "num_tokens": 13333846.0, "step": 2562, "train/ce_loss": 1.4161006212234497 }, { "epoch": 0.25331224045877004, "step": 2562, "train/sim_loss": 0.0625 }, { "epoch": 0.25331224045877004, "step": 2562, "train/total_loss": 0.2041100710630417 }, { "entropy": 9.229511260986328, "epoch": 0.25341111330828553, "mean_token_accuracy": 0.749685525894165, "num_tokens": 13339123.0, "step": 2563, "train/ce_loss": 5.28743566974299e-06 }, { "epoch": 0.25341111330828553, "step": 2563, "train/sim_loss": 0.05859375 }, { "epoch": 0.25341111330828553, "step": 2563, "train/total_loss": 0.05859427899122238 }, { "entropy": 9.266935348510742, "epoch": 0.25350998615780107, "mean_token_accuracy": 0.7222222089767456, "num_tokens": 13344364.0, "step": 2564, "train/ce_loss": 0.6972231268882751 }, { "epoch": 0.25350998615780107, "step": 2564, "train/sim_loss": 0.046875 }, { "epoch": 0.25350998615780107, "step": 2564, "train/total_loss": 0.11659731715917587 }, { "entropy": 9.317707061767578, "epoch": 0.2536088590073166, "mean_token_accuracy": 0.6845549941062927, "num_tokens": 13349568.0, "step": 2565, "train/ce_loss": 1.2373405694961548 }, { "epoch": 0.2536088590073166, "step": 2565, "train/sim_loss": 0.0859375 }, { "epoch": 0.2536088590073166, "step": 2565, "train/total_loss": 0.20967155694961548 }, { "entropy": 9.177165985107422, "epoch": 0.2537077318568321, "mean_token_accuracy": 0.6903669834136963, "num_tokens": 13354883.0, "step": 2566, "train/ce_loss": 1.2569265365600586 }, { "epoch": 0.2537077318568321, "step": 2566, "train/sim_loss": 0.11328125 }, { "epoch": 0.2537077318568321, "step": 2566, "train/total_loss": 0.23897390067577362 }, { "entropy": 10.023475646972656, "epoch": 0.25380660470634764, "mean_token_accuracy": 0.7082067131996155, "num_tokens": 13359654.0, "step": 2567, "train/ce_loss": 3.0070043067098595e-05 }, { "epoch": 0.25380660470634764, "step": 2567, "train/sim_loss": 0.0546875 }, { "epoch": 0.25380660470634764, "step": 2567, "train/total_loss": 0.05469050630927086 }, { "entropy": 9.127727508544922, "epoch": 0.2539054775558632, "mean_token_accuracy": 0.7402135133743286, "num_tokens": 13364965.0, "step": 2568, "train/ce_loss": 1.4286116361618042 }, { "epoch": 0.2539054775558632, "step": 2568, "train/sim_loss": 0.125 }, { "epoch": 0.2539054775558632, "step": 2568, "train/total_loss": 0.26786118745803833 }, { "entropy": 9.038599967956543, "epoch": 0.25400435040537866, "mean_token_accuracy": 0.679358720779419, "num_tokens": 13370437.0, "step": 2569, "train/ce_loss": 1.4212325811386108 }, { "epoch": 0.25400435040537866, "step": 2569, "train/sim_loss": 0.12890625 }, { "epoch": 0.25400435040537866, "step": 2569, "train/total_loss": 0.271029531955719 }, { "entropy": 9.771385192871094, "epoch": 0.2541032232548942, "mean_token_accuracy": 0.7594433426856995, "num_tokens": 13375385.0, "step": 2570, "train/ce_loss": 0.8145433664321899 }, { "epoch": 0.2541032232548942, "step": 2570, "train/sim_loss": 0.08203125 }, { "epoch": 0.2541032232548942, "step": 2570, "train/total_loss": 0.163485586643219 }, { "entropy": 9.49769115447998, "epoch": 0.25420209610440975, "mean_token_accuracy": 0.7037037014961243, "num_tokens": 13380585.0, "step": 2571, "train/ce_loss": 1.4433395862579346 }, { "epoch": 0.25420209610440975, "step": 2571, "train/sim_loss": 0.125 }, { "epoch": 0.25420209610440975, "step": 2571, "train/total_loss": 0.26933395862579346 }, { "entropy": 9.775506973266602, "epoch": 0.25430096895392523, "mean_token_accuracy": 0.7543520331382751, "num_tokens": 13385558.0, "step": 2572, "train/ce_loss": 1.139591932296753 }, { "epoch": 0.25430096895392523, "step": 2572, "train/sim_loss": 0.0859375 }, { "epoch": 0.25430096895392523, "step": 2572, "train/total_loss": 0.1998966932296753 }, { "entropy": 9.066909790039062, "epoch": 0.2543998418034408, "mean_token_accuracy": 0.7022809386253357, "num_tokens": 13390861.0, "step": 2573, "train/ce_loss": 0.6508738398551941 }, { "epoch": 0.2543998418034408, "step": 2573, "train/sim_loss": 0.0390625 }, { "epoch": 0.2543998418034408, "step": 2573, "train/total_loss": 0.10414988547563553 }, { "entropy": 9.451150894165039, "epoch": 0.2544987146529563, "mean_token_accuracy": 0.7231404781341553, "num_tokens": 13396042.0, "step": 2574, "train/ce_loss": 3.7908125705143902e-06 }, { "epoch": 0.2544987146529563, "step": 2574, "train/sim_loss": 0.05078125 }, { "epoch": 0.2544987146529563, "step": 2574, "train/total_loss": 0.05078162997961044 }, { "entropy": 9.539295196533203, "epoch": 0.2545975875024718, "mean_token_accuracy": 0.7214815020561218, "num_tokens": 13401179.0, "step": 2575, "train/ce_loss": 0.5825445055961609 }, { "epoch": 0.2545975875024718, "step": 2575, "train/sim_loss": 0.0390625 }, { "epoch": 0.2545975875024718, "step": 2575, "train/total_loss": 0.09731695055961609 }, { "entropy": 8.767807006835938, "epoch": 0.25469646035198734, "mean_token_accuracy": 0.754923403263092, "num_tokens": 13406605.0, "step": 2576, "train/ce_loss": 1.4180355072021484 }, { "epoch": 0.25469646035198734, "step": 2576, "train/sim_loss": 0.08203125 }, { "epoch": 0.25469646035198734, "step": 2576, "train/total_loss": 0.2238347977399826 }, { "entropy": 9.604494094848633, "epoch": 0.2547953332015029, "mean_token_accuracy": 0.7901785969734192, "num_tokens": 13411903.0, "step": 2577, "train/ce_loss": 0.6346091032028198 }, { "epoch": 0.2547953332015029, "step": 2577, "train/sim_loss": 0.046875 }, { "epoch": 0.2547953332015029, "step": 2577, "train/total_loss": 0.11033590883016586 }, { "entropy": 9.478580474853516, "epoch": 0.25489420605101837, "mean_token_accuracy": 0.7314949035644531, "num_tokens": 13417023.0, "step": 2578, "train/ce_loss": 1.3560292720794678 }, { "epoch": 0.25489420605101837, "step": 2578, "train/sim_loss": 0.0859375 }, { "epoch": 0.25489420605101837, "step": 2578, "train/total_loss": 0.2215404361486435 }, { "entropy": 9.453705787658691, "epoch": 0.2549930789005339, "mean_token_accuracy": 0.6845729947090149, "num_tokens": 13422193.0, "step": 2579, "train/ce_loss": 1.7850563526153564 }, { "epoch": 0.2549930789005339, "step": 2579, "train/sim_loss": 0.12109375 }, { "epoch": 0.2549930789005339, "step": 2579, "train/total_loss": 0.29959940910339355 }, { "epoch": 0.25509195175004945, "grad_norm": 1.0193370580673218, "learning_rate": 9.364832121841468e-06, "loss": 0.171, "step": 2580 }, { "entropy": 9.64789867401123, "epoch": 0.25509195175004945, "mean_token_accuracy": 0.8156934380531311, "num_tokens": 13427205.0, "step": 2580, "train/ce_loss": 0.7694849371910095 }, { "epoch": 0.25509195175004945, "step": 2580, "train/sim_loss": 0.0234375 }, { "epoch": 0.25509195175004945, "step": 2580, "train/total_loss": 0.10038599371910095 }, { "entropy": 9.008233070373535, "epoch": 0.25519082459956494, "mean_token_accuracy": 0.764018714427948, "num_tokens": 13432544.0, "step": 2581, "train/ce_loss": 0.773780345916748 }, { "epoch": 0.25519082459956494, "step": 2581, "train/sim_loss": 0.06640625 }, { "epoch": 0.25519082459956494, "step": 2581, "train/total_loss": 0.1437842845916748 }, { "entropy": 9.066333770751953, "epoch": 0.2552896974490805, "mean_token_accuracy": 0.6932185292243958, "num_tokens": 13437941.0, "step": 2582, "train/ce_loss": 0.7666678428649902 }, { "epoch": 0.2552896974490805, "step": 2582, "train/sim_loss": 0.07421875 }, { "epoch": 0.2552896974490805, "step": 2582, "train/total_loss": 0.15088553726673126 }, { "entropy": 9.372958183288574, "epoch": 0.255388570298596, "mean_token_accuracy": 0.7142857313156128, "num_tokens": 13443142.0, "step": 2583, "train/ce_loss": 0.8054994344711304 }, { "epoch": 0.255388570298596, "step": 2583, "train/sim_loss": 0.0546875 }, { "epoch": 0.255388570298596, "step": 2583, "train/total_loss": 0.135237455368042 }, { "entropy": 9.219785690307617, "epoch": 0.2554874431481115, "mean_token_accuracy": 0.6632064580917358, "num_tokens": 13448477.0, "step": 2584, "train/ce_loss": 0.5881164073944092 }, { "epoch": 0.2554874431481115, "step": 2584, "train/sim_loss": 0.06640625 }, { "epoch": 0.2554874431481115, "step": 2584, "train/total_loss": 0.12521788477897644 }, { "entropy": 9.666365623474121, "epoch": 0.25558631599762704, "mean_token_accuracy": 0.700952410697937, "num_tokens": 13453434.0, "step": 2585, "train/ce_loss": 1.365299940109253 }, { "epoch": 0.25558631599762704, "step": 2585, "train/sim_loss": 0.0859375 }, { "epoch": 0.25558631599762704, "step": 2585, "train/total_loss": 0.22246749699115753 }, { "entropy": 8.801751136779785, "epoch": 0.2556851888471426, "mean_token_accuracy": 0.6927710771560669, "num_tokens": 13458880.0, "step": 2586, "train/ce_loss": 1.258221983909607 }, { "epoch": 0.2556851888471426, "step": 2586, "train/sim_loss": 0.0703125 }, { "epoch": 0.2556851888471426, "step": 2586, "train/total_loss": 0.19613470137119293 }, { "entropy": 9.132341384887695, "epoch": 0.25578406169665807, "mean_token_accuracy": 0.7421307563781738, "num_tokens": 13464184.0, "step": 2587, "train/ce_loss": 0.6278568506240845 }, { "epoch": 0.25578406169665807, "step": 2587, "train/sim_loss": 0.0625 }, { "epoch": 0.25578406169665807, "step": 2587, "train/total_loss": 0.12528568506240845 }, { "entropy": 9.190263748168945, "epoch": 0.2558829345461736, "mean_token_accuracy": 0.8046783804893494, "num_tokens": 13469516.0, "step": 2588, "train/ce_loss": 0.763489842414856 }, { "epoch": 0.2558829345461736, "step": 2588, "train/sim_loss": 0.01953125 }, { "epoch": 0.2558829345461736, "step": 2588, "train/total_loss": 0.09588023275136948 }, { "entropy": 9.274324417114258, "epoch": 0.25598180739568915, "mean_token_accuracy": 0.7122641801834106, "num_tokens": 13474770.0, "step": 2589, "train/ce_loss": 0.5519495010375977 }, { "epoch": 0.25598180739568915, "step": 2589, "train/sim_loss": 0.03125 }, { "epoch": 0.25598180739568915, "step": 2589, "train/total_loss": 0.08644495159387589 }, { "entropy": 9.52096176147461, "epoch": 0.25608068024520464, "mean_token_accuracy": 0.7340764403343201, "num_tokens": 13479838.0, "step": 2590, "train/ce_loss": 1.3775403499603271 }, { "epoch": 0.25608068024520464, "step": 2590, "train/sim_loss": 0.09375 }, { "epoch": 0.25608068024520464, "step": 2590, "train/total_loss": 0.23150403797626495 }, { "entropy": 9.556732177734375, "epoch": 0.2561795530947202, "mean_token_accuracy": 0.699999988079071, "num_tokens": 13484889.0, "step": 2591, "train/ce_loss": 1.050861120223999 }, { "epoch": 0.2561795530947202, "step": 2591, "train/sim_loss": 0.0625 }, { "epoch": 0.2561795530947202, "step": 2591, "train/total_loss": 0.16758611798286438 }, { "entropy": 9.318330764770508, "epoch": 0.2562784259442357, "mean_token_accuracy": 0.7173051238059998, "num_tokens": 13490131.0, "step": 2592, "train/ce_loss": 1.2866765260696411 }, { "epoch": 0.2562784259442357, "step": 2592, "train/sim_loss": 0.09375 }, { "epoch": 0.2562784259442357, "step": 2592, "train/total_loss": 0.2224176526069641 }, { "entropy": 9.17322826385498, "epoch": 0.25637729879375126, "mean_token_accuracy": 0.7382388710975647, "num_tokens": 13495374.0, "step": 2593, "train/ce_loss": 0.7236014604568481 }, { "epoch": 0.25637729879375126, "step": 2593, "train/sim_loss": 0.046875 }, { "epoch": 0.25637729879375126, "step": 2593, "train/total_loss": 0.11923515051603317 }, { "entropy": 9.930195808410645, "epoch": 0.25647617164326675, "mean_token_accuracy": 0.7596566677093506, "num_tokens": 13500283.0, "step": 2594, "train/ce_loss": 0.6788000464439392 }, { "epoch": 0.25647617164326675, "step": 2594, "train/sim_loss": 0.01953125 }, { "epoch": 0.25647617164326675, "step": 2594, "train/total_loss": 0.08741125464439392 }, { "entropy": 9.190201759338379, "epoch": 0.2565750444927823, "mean_token_accuracy": 0.7316455841064453, "num_tokens": 13505489.0, "step": 2595, "train/ce_loss": 0.7839797139167786 }, { "epoch": 0.2565750444927823, "step": 2595, "train/sim_loss": 0.078125 }, { "epoch": 0.2565750444927823, "step": 2595, "train/total_loss": 0.1565229743719101 }, { "entropy": 9.254472732543945, "epoch": 0.25667391734229783, "mean_token_accuracy": 0.7011494040489197, "num_tokens": 13510986.0, "step": 2596, "train/ce_loss": 1.567700982093811 }, { "epoch": 0.25667391734229783, "step": 2596, "train/sim_loss": 0.09375 }, { "epoch": 0.25667391734229783, "step": 2596, "train/total_loss": 0.25052011013031006 }, { "entropy": 9.898000717163086, "epoch": 0.2567727901918133, "mean_token_accuracy": 0.725450873374939, "num_tokens": 13515906.0, "step": 2597, "train/ce_loss": 1.308440089225769 }, { "epoch": 0.2567727901918133, "step": 2597, "train/sim_loss": 0.07421875 }, { "epoch": 0.2567727901918133, "step": 2597, "train/total_loss": 0.20506276190280914 }, { "entropy": 8.82431697845459, "epoch": 0.25687166304132886, "mean_token_accuracy": 0.7403957843780518, "num_tokens": 13521331.0, "step": 2598, "train/ce_loss": 0.4845927059650421 }, { "epoch": 0.25687166304132886, "step": 2598, "train/sim_loss": 0.0625 }, { "epoch": 0.25687166304132886, "step": 2598, "train/total_loss": 0.11095927655696869 }, { "entropy": 9.454943656921387, "epoch": 0.2569705358908444, "mean_token_accuracy": 0.791946291923523, "num_tokens": 13526553.0, "step": 2599, "train/ce_loss": 0.6428021192550659 }, { "epoch": 0.2569705358908444, "step": 2599, "train/sim_loss": 0.0234375 }, { "epoch": 0.2569705358908444, "step": 2599, "train/total_loss": 0.08771771192550659 }, { "epoch": 0.2570694087403599, "grad_norm": 0.6037888526916504, "learning_rate": 9.35988725708352e-06, "loss": 0.1501, "step": 2600 }, { "entropy": 9.420867919921875, "epoch": 0.2570694087403599, "mean_token_accuracy": 0.751358687877655, "num_tokens": 13531751.0, "step": 2600, "train/ce_loss": 0.953595757484436 }, { "epoch": 0.2570694087403599, "step": 2600, "train/sim_loss": 0.0859375 }, { "epoch": 0.2570694087403599, "step": 2600, "train/total_loss": 0.18129707872867584 }, { "entropy": 10.32927131652832, "epoch": 0.2571682815898754, "mean_token_accuracy": 0.7687074542045593, "num_tokens": 13536484.0, "step": 2601, "train/ce_loss": 1.3974251747131348 }, { "epoch": 0.2571682815898754, "step": 2601, "train/sim_loss": 0.08984375 }, { "epoch": 0.2571682815898754, "step": 2601, "train/total_loss": 0.22958627343177795 }, { "entropy": 9.080526351928711, "epoch": 0.25726715443939097, "mean_token_accuracy": 0.7560693621635437, "num_tokens": 13541822.0, "step": 2602, "train/ce_loss": 0.722567081451416 }, { "epoch": 0.25726715443939097, "step": 2602, "train/sim_loss": 0.046875 }, { "epoch": 0.25726715443939097, "step": 2602, "train/total_loss": 0.11913170665502548 }, { "entropy": 9.02514934539795, "epoch": 0.25736602728890645, "mean_token_accuracy": 0.7454545497894287, "num_tokens": 13547143.0, "step": 2603, "train/ce_loss": 0.533889889717102 }, { "epoch": 0.25736602728890645, "step": 2603, "train/sim_loss": 0.0859375 }, { "epoch": 0.25736602728890645, "step": 2603, "train/total_loss": 0.13932648301124573 }, { "entropy": 10.110696792602539, "epoch": 0.257464900138422, "mean_token_accuracy": 0.7905882596969604, "num_tokens": 13551999.0, "step": 2604, "train/ce_loss": 1.1310733556747437 }, { "epoch": 0.257464900138422, "step": 2604, "train/sim_loss": 0.08984375 }, { "epoch": 0.257464900138422, "step": 2604, "train/total_loss": 0.2029510885477066 }, { "entropy": 9.495134353637695, "epoch": 0.25756377298793753, "mean_token_accuracy": 0.7222222089767456, "num_tokens": 13556854.0, "step": 2605, "train/ce_loss": 2.322931686649099e-05 }, { "epoch": 0.25756377298793753, "step": 2605, "train/sim_loss": 0.05859375 }, { "epoch": 0.25756377298793753, "step": 2605, "train/total_loss": 0.05859607458114624 }, { "entropy": 9.316610336303711, "epoch": 0.257662645837453, "mean_token_accuracy": 0.8257668614387512, "num_tokens": 13562154.0, "step": 2606, "train/ce_loss": 0.5784516334533691 }, { "epoch": 0.257662645837453, "step": 2606, "train/sim_loss": 0.08203125 }, { "epoch": 0.257662645837453, "step": 2606, "train/total_loss": 0.13987641036510468 }, { "entropy": 8.904605865478516, "epoch": 0.25776151868696856, "mean_token_accuracy": 0.7675233483314514, "num_tokens": 13567489.0, "step": 2607, "train/ce_loss": 0.9174344539642334 }, { "epoch": 0.25776151868696856, "step": 2607, "train/sim_loss": 0.03515625 }, { "epoch": 0.25776151868696856, "step": 2607, "train/total_loss": 0.12689968943595886 }, { "entropy": 9.172128677368164, "epoch": 0.2578603915364841, "mean_token_accuracy": 0.738831639289856, "num_tokens": 13572838.0, "step": 2608, "train/ce_loss": 0.7097308039665222 }, { "epoch": 0.2578603915364841, "step": 2608, "train/sim_loss": 0.0703125 }, { "epoch": 0.2578603915364841, "step": 2608, "train/total_loss": 0.14128558337688446 }, { "entropy": 10.204506874084473, "epoch": 0.2579592643859996, "mean_token_accuracy": 0.7416020631790161, "num_tokens": 13577616.0, "step": 2609, "train/ce_loss": 1.2679545879364014 }, { "epoch": 0.2579592643859996, "step": 2609, "train/sim_loss": 0.046875 }, { "epoch": 0.2579592643859996, "step": 2609, "train/total_loss": 0.1736704558134079 }, { "entropy": 9.232139587402344, "epoch": 0.25805813723551513, "mean_token_accuracy": 0.7124842405319214, "num_tokens": 13582854.0, "step": 2610, "train/ce_loss": 1.4001924991607666 }, { "epoch": 0.25805813723551513, "step": 2610, "train/sim_loss": 0.046875 }, { "epoch": 0.25805813723551513, "step": 2610, "train/total_loss": 0.1868942528963089 }, { "entropy": 9.506097793579102, "epoch": 0.25815701008503067, "mean_token_accuracy": 0.7235772609710693, "num_tokens": 13587925.0, "step": 2611, "train/ce_loss": 8.264971256721765e-06 }, { "epoch": 0.25815701008503067, "step": 2611, "train/sim_loss": 0.0546875 }, { "epoch": 0.25815701008503067, "step": 2611, "train/total_loss": 0.05468832701444626 }, { "entropy": 9.140655517578125, "epoch": 0.25825588293454615, "mean_token_accuracy": 0.7585033774375916, "num_tokens": 13593277.0, "step": 2612, "train/ce_loss": 1.0882909297943115 }, { "epoch": 0.25825588293454615, "step": 2612, "train/sim_loss": 0.078125 }, { "epoch": 0.25825588293454615, "step": 2612, "train/total_loss": 0.1869540959596634 }, { "entropy": 9.401294708251953, "epoch": 0.2583547557840617, "mean_token_accuracy": 0.7689922451972961, "num_tokens": 13598355.0, "step": 2613, "train/ce_loss": 0.8922169208526611 }, { "epoch": 0.2583547557840617, "step": 2613, "train/sim_loss": 0.05859375 }, { "epoch": 0.2583547557840617, "step": 2613, "train/total_loss": 0.14781543612480164 }, { "entropy": 8.947813987731934, "epoch": 0.25845362863357724, "mean_token_accuracy": 0.7288659811019897, "num_tokens": 13603794.0, "step": 2614, "train/ce_loss": 0.91164630651474 }, { "epoch": 0.25845362863357724, "step": 2614, "train/sim_loss": 0.01953125 }, { "epoch": 0.25845362863357724, "step": 2614, "train/total_loss": 0.11069588363170624 }, { "entropy": 9.560693740844727, "epoch": 0.2585525014830927, "mean_token_accuracy": 0.7612208127975464, "num_tokens": 13608757.0, "step": 2615, "train/ce_loss": 0.5483984351158142 }, { "epoch": 0.2585525014830927, "step": 2615, "train/sim_loss": 0.1015625 }, { "epoch": 0.2585525014830927, "step": 2615, "train/total_loss": 0.1564023494720459 }, { "entropy": 9.200815200805664, "epoch": 0.25865137433260826, "mean_token_accuracy": 0.7626506090164185, "num_tokens": 13613996.0, "step": 2616, "train/ce_loss": 0.8643326759338379 }, { "epoch": 0.25865137433260826, "step": 2616, "train/sim_loss": 0.0546875 }, { "epoch": 0.25865137433260826, "step": 2616, "train/total_loss": 0.1411207616329193 }, { "entropy": 9.343541145324707, "epoch": 0.2587502471821238, "mean_token_accuracy": 0.7405247688293457, "num_tokens": 13619308.0, "step": 2617, "train/ce_loss": 1.2328028678894043 }, { "epoch": 0.2587502471821238, "step": 2617, "train/sim_loss": 0.07421875 }, { "epoch": 0.2587502471821238, "step": 2617, "train/total_loss": 0.19749903678894043 }, { "entropy": 9.184523582458496, "epoch": 0.2588491200316393, "mean_token_accuracy": 0.7261761426925659, "num_tokens": 13624625.0, "step": 2618, "train/ce_loss": 0.7512075901031494 }, { "epoch": 0.2588491200316393, "step": 2618, "train/sim_loss": 0.03515625 }, { "epoch": 0.2588491200316393, "step": 2618, "train/total_loss": 0.11027701199054718 }, { "entropy": 9.113435745239258, "epoch": 0.25894799288115483, "mean_token_accuracy": 0.7227723002433777, "num_tokens": 13630022.0, "step": 2619, "train/ce_loss": 0.7234528064727783 }, { "epoch": 0.25894799288115483, "step": 2619, "train/sim_loss": 0.0546875 }, { "epoch": 0.25894799288115483, "step": 2619, "train/total_loss": 0.1270327866077423 }, { "epoch": 0.2590468657306704, "grad_norm": 0.7478315234184265, "learning_rate": 9.35494239232557e-06, "loss": 0.1419, "step": 2620 }, { "entropy": 9.056140899658203, "epoch": 0.2590468657306704, "mean_token_accuracy": 0.7356979250907898, "num_tokens": 13635382.0, "step": 2620, "train/ce_loss": 0.4664541184902191 }, { "epoch": 0.2590468657306704, "step": 2620, "train/sim_loss": 0.0234375 }, { "epoch": 0.2590468657306704, "step": 2620, "train/total_loss": 0.07008291780948639 }, { "entropy": 9.462387084960938, "epoch": 0.25914573858018586, "mean_token_accuracy": 0.816500723361969, "num_tokens": 13640490.0, "step": 2621, "train/ce_loss": 2.0883476281596813e-06 }, { "epoch": 0.25914573858018586, "step": 2621, "train/sim_loss": 0.0390625 }, { "epoch": 0.25914573858018586, "step": 2621, "train/total_loss": 0.039062708616256714 }, { "entropy": 9.461859703063965, "epoch": 0.2592446114297014, "mean_token_accuracy": 0.7144906520843506, "num_tokens": 13645635.0, "step": 2622, "train/ce_loss": 1.0317198038101196 }, { "epoch": 0.2592446114297014, "step": 2622, "train/sim_loss": 0.109375 }, { "epoch": 0.2592446114297014, "step": 2622, "train/total_loss": 0.21254697442054749 }, { "entropy": 9.722253799438477, "epoch": 0.25934348427921694, "mean_token_accuracy": 0.7546584010124207, "num_tokens": 13650708.0, "step": 2623, "train/ce_loss": 0.48645493388175964 }, { "epoch": 0.25934348427921694, "step": 2623, "train/sim_loss": 0.05078125 }, { "epoch": 0.25934348427921694, "step": 2623, "train/total_loss": 0.0994267463684082 }, { "entropy": 9.369498252868652, "epoch": 0.2594423571287324, "mean_token_accuracy": 0.8027397394180298, "num_tokens": 13655927.0, "step": 2624, "train/ce_loss": 0.6022058129310608 }, { "epoch": 0.2594423571287324, "step": 2624, "train/sim_loss": 0.0703125 }, { "epoch": 0.2594423571287324, "step": 2624, "train/total_loss": 0.13053308427333832 }, { "entropy": 8.932147026062012, "epoch": 0.25954122997824797, "mean_token_accuracy": 0.7331118583679199, "num_tokens": 13661313.0, "step": 2625, "train/ce_loss": 0.9405847191810608 }, { "epoch": 0.25954122997824797, "step": 2625, "train/sim_loss": 0.03515625 }, { "epoch": 0.25954122997824797, "step": 2625, "train/total_loss": 0.12921473383903503 }, { "entropy": 9.141525268554688, "epoch": 0.2596401028277635, "mean_token_accuracy": 0.7060109376907349, "num_tokens": 13666720.0, "step": 2626, "train/ce_loss": 0.9899309873580933 }, { "epoch": 0.2596401028277635, "step": 2626, "train/sim_loss": 0.08203125 }, { "epoch": 0.2596401028277635, "step": 2626, "train/total_loss": 0.18102434277534485 }, { "entropy": 8.945329666137695, "epoch": 0.259738975677279, "mean_token_accuracy": 0.676986575126648, "num_tokens": 13672196.0, "step": 2627, "train/ce_loss": 0.8837153315544128 }, { "epoch": 0.259738975677279, "step": 2627, "train/sim_loss": 0.07421875 }, { "epoch": 0.259738975677279, "step": 2627, "train/total_loss": 0.16259029507637024 }, { "entropy": 9.45634651184082, "epoch": 0.25983784852679453, "mean_token_accuracy": 0.7279305458068848, "num_tokens": 13677492.0, "step": 2628, "train/ce_loss": 5.773954853793839e-06 }, { "epoch": 0.25983784852679453, "step": 2628, "train/sim_loss": 0.0546875 }, { "epoch": 0.25983784852679453, "step": 2628, "train/total_loss": 0.05468807741999626 }, { "entropy": 9.273452758789062, "epoch": 0.2599367213763101, "mean_token_accuracy": 0.7415204644203186, "num_tokens": 13683001.0, "step": 2629, "train/ce_loss": 0.9866542816162109 }, { "epoch": 0.2599367213763101, "step": 2629, "train/sim_loss": 0.109375 }, { "epoch": 0.2599367213763101, "step": 2629, "train/total_loss": 0.20804043114185333 }, { "entropy": 9.282662391662598, "epoch": 0.26003559422582556, "mean_token_accuracy": 0.8014616370201111, "num_tokens": 13688283.0, "step": 2630, "train/ce_loss": 0.8849129676818848 }, { "epoch": 0.26003559422582556, "step": 2630, "train/sim_loss": 0.0390625 }, { "epoch": 0.26003559422582556, "step": 2630, "train/total_loss": 0.127553790807724 }, { "entropy": 9.507521629333496, "epoch": 0.2601344670753411, "mean_token_accuracy": 0.6752265691757202, "num_tokens": 13693401.0, "step": 2631, "train/ce_loss": 1.5338138341903687 }, { "epoch": 0.2601344670753411, "step": 2631, "train/sim_loss": 0.078125 }, { "epoch": 0.2601344670753411, "step": 2631, "train/total_loss": 0.23150639235973358 }, { "entropy": 9.034505844116211, "epoch": 0.26023333992485664, "mean_token_accuracy": 0.7340425252914429, "num_tokens": 13698642.0, "step": 2632, "train/ce_loss": 0.5224541425704956 }, { "epoch": 0.26023333992485664, "step": 2632, "train/sim_loss": 0.03515625 }, { "epoch": 0.26023333992485664, "step": 2632, "train/total_loss": 0.08740166574716568 }, { "entropy": 9.128165245056152, "epoch": 0.2603322127743722, "mean_token_accuracy": 0.7700650691986084, "num_tokens": 13704167.0, "step": 2633, "train/ce_loss": 0.5268516540527344 }, { "epoch": 0.2603322127743722, "step": 2633, "train/sim_loss": 0.0703125 }, { "epoch": 0.2603322127743722, "step": 2633, "train/total_loss": 0.12299767136573792 }, { "entropy": 8.632181167602539, "epoch": 0.26043108562388767, "mean_token_accuracy": 0.6927223801612854, "num_tokens": 13709781.0, "step": 2634, "train/ce_loss": 0.9153441786766052 }, { "epoch": 0.26043108562388767, "step": 2634, "train/sim_loss": 0.0703125 }, { "epoch": 0.26043108562388767, "step": 2634, "train/total_loss": 0.16184692084789276 }, { "entropy": 9.012310028076172, "epoch": 0.2605299584734032, "mean_token_accuracy": 0.6920454502105713, "num_tokens": 13715082.0, "step": 2635, "train/ce_loss": 0.9514340758323669 }, { "epoch": 0.2605299584734032, "step": 2635, "train/sim_loss": 0.14453125 }, { "epoch": 0.2605299584734032, "step": 2635, "train/total_loss": 0.2396746575832367 }, { "entropy": 9.437811851501465, "epoch": 0.26062883132291875, "mean_token_accuracy": 0.7741456031799316, "num_tokens": 13720240.0, "step": 2636, "train/ce_loss": 0.8296997547149658 }, { "epoch": 0.26062883132291875, "step": 2636, "train/sim_loss": 0.0390625 }, { "epoch": 0.26062883132291875, "step": 2636, "train/total_loss": 0.12203247845172882 }, { "entropy": 8.832395553588867, "epoch": 0.26072770417243424, "mean_token_accuracy": 0.7161290049552917, "num_tokens": 13725769.0, "step": 2637, "train/ce_loss": 0.7955597639083862 }, { "epoch": 0.26072770417243424, "step": 2637, "train/sim_loss": 0.03515625 }, { "epoch": 0.26072770417243424, "step": 2637, "train/total_loss": 0.11471223086118698 }, { "entropy": 10.038055419921875, "epoch": 0.2608265770219498, "mean_token_accuracy": 0.7489270567893982, "num_tokens": 13730658.0, "step": 2638, "train/ce_loss": 1.0722737312316895 }, { "epoch": 0.2608265770219498, "step": 2638, "train/sim_loss": 0.06640625 }, { "epoch": 0.2608265770219498, "step": 2638, "train/total_loss": 0.1736336350440979 }, { "entropy": 9.274852752685547, "epoch": 0.2609254498714653, "mean_token_accuracy": 0.7162954211235046, "num_tokens": 13735971.0, "step": 2639, "train/ce_loss": 0.656727135181427 }, { "epoch": 0.2609254498714653, "step": 2639, "train/sim_loss": 0.0859375 }, { "epoch": 0.2609254498714653, "step": 2639, "train/total_loss": 0.15161022543907166 }, { "epoch": 0.2610243227209808, "grad_norm": 1.2802402973175049, "learning_rate": 9.34999752756762e-06, "loss": 0.1581, "step": 2640 }, { "entropy": 9.607377052307129, "epoch": 0.2610243227209808, "mean_token_accuracy": 0.6711111068725586, "num_tokens": 13741121.0, "step": 2640, "train/ce_loss": 0.7046462893486023 }, { "epoch": 0.2610243227209808, "step": 2640, "train/sim_loss": 0.07421875 }, { "epoch": 0.2610243227209808, "step": 2640, "train/total_loss": 0.14468339085578918 }, { "entropy": 9.388420104980469, "epoch": 0.26112319557049635, "mean_token_accuracy": 0.7647058963775635, "num_tokens": 13746335.0, "step": 2641, "train/ce_loss": 0.9969542622566223 }, { "epoch": 0.26112319557049635, "step": 2641, "train/sim_loss": 0.0546875 }, { "epoch": 0.26112319557049635, "step": 2641, "train/total_loss": 0.15438292920589447 }, { "entropy": 9.117688179016113, "epoch": 0.2612220684200119, "mean_token_accuracy": 0.7053669095039368, "num_tokens": 13751728.0, "step": 2642, "train/ce_loss": 1.0301885604858398 }, { "epoch": 0.2612220684200119, "step": 2642, "train/sim_loss": 0.04296875 }, { "epoch": 0.2612220684200119, "step": 2642, "train/total_loss": 0.1459876000881195 }, { "entropy": 9.035491943359375, "epoch": 0.2613209412695274, "mean_token_accuracy": 0.7678132653236389, "num_tokens": 13757016.0, "step": 2643, "train/ce_loss": 0.6681311130523682 }, { "epoch": 0.2613209412695274, "step": 2643, "train/sim_loss": 0.05859375 }, { "epoch": 0.2613209412695274, "step": 2643, "train/total_loss": 0.12540686130523682 }, { "entropy": 8.702983856201172, "epoch": 0.2614198141190429, "mean_token_accuracy": 0.7816091775894165, "num_tokens": 13762587.0, "step": 2644, "train/ce_loss": 0.4604906737804413 }, { "epoch": 0.2614198141190429, "step": 2644, "train/sim_loss": 0.06640625 }, { "epoch": 0.2614198141190429, "step": 2644, "train/total_loss": 0.1124553233385086 }, { "entropy": 9.319679260253906, "epoch": 0.26151868696855846, "mean_token_accuracy": 0.7560283541679382, "num_tokens": 13767732.0, "step": 2645, "train/ce_loss": 0.8887007832527161 }, { "epoch": 0.26151868696855846, "step": 2645, "train/sim_loss": 0.06640625 }, { "epoch": 0.26151868696855846, "step": 2645, "train/total_loss": 0.1552763283252716 }, { "entropy": 10.013471603393555, "epoch": 0.26161755981807394, "mean_token_accuracy": 0.7711111307144165, "num_tokens": 13772583.0, "step": 2646, "train/ce_loss": 1.5753374099731445 }, { "epoch": 0.26161755981807394, "step": 2646, "train/sim_loss": 0.04296875 }, { "epoch": 0.26161755981807394, "step": 2646, "train/total_loss": 0.20050249993801117 }, { "entropy": 9.408167839050293, "epoch": 0.2617164326675895, "mean_token_accuracy": 0.7364864945411682, "num_tokens": 13777763.0, "step": 2647, "train/ce_loss": 0.8203741908073425 }, { "epoch": 0.2617164326675895, "step": 2647, "train/sim_loss": 0.03125 }, { "epoch": 0.2617164326675895, "step": 2647, "train/total_loss": 0.11328741908073425 }, { "entropy": 8.955461502075195, "epoch": 0.261815305517105, "mean_token_accuracy": 0.7596899271011353, "num_tokens": 13783089.0, "step": 2648, "train/ce_loss": 0.9542549848556519 }, { "epoch": 0.261815305517105, "step": 2648, "train/sim_loss": 0.1171875 }, { "epoch": 0.261815305517105, "step": 2648, "train/total_loss": 0.21261300146579742 }, { "entropy": 9.848175048828125, "epoch": 0.2619141783666205, "mean_token_accuracy": 0.7568710446357727, "num_tokens": 13787967.0, "step": 2649, "train/ce_loss": 7.882207682996523e-06 }, { "epoch": 0.2619141783666205, "step": 2649, "train/sim_loss": 0.04296875 }, { "epoch": 0.2619141783666205, "step": 2649, "train/total_loss": 0.042969539761543274 }, { "entropy": 9.237049102783203, "epoch": 0.26201305121613605, "mean_token_accuracy": 0.7001153230667114, "num_tokens": 13793348.0, "step": 2650, "train/ce_loss": 1.8977290391921997 }, { "epoch": 0.26201305121613605, "step": 2650, "train/sim_loss": 0.10546875 }, { "epoch": 0.26201305121613605, "step": 2650, "train/total_loss": 0.29524165391921997 }, { "entropy": 9.357877731323242, "epoch": 0.2621119240656516, "mean_token_accuracy": 0.7086092829704285, "num_tokens": 13798438.0, "step": 2651, "train/ce_loss": 6.794214277761057e-05 }, { "epoch": 0.2621119240656516, "step": 2651, "train/sim_loss": 0.0703125 }, { "epoch": 0.2621119240656516, "step": 2651, "train/total_loss": 0.0703192949295044 }, { "entropy": 9.049224853515625, "epoch": 0.2622107969151671, "mean_token_accuracy": 0.702075719833374, "num_tokens": 13803753.0, "step": 2652, "train/ce_loss": 1.3432912826538086 }, { "epoch": 0.2622107969151671, "step": 2652, "train/sim_loss": 0.08203125 }, { "epoch": 0.2622107969151671, "step": 2652, "train/total_loss": 0.21636037528514862 }, { "entropy": 9.077301025390625, "epoch": 0.2623096697646826, "mean_token_accuracy": 0.7144444584846497, "num_tokens": 13809095.0, "step": 2653, "train/ce_loss": 0.6699445247650146 }, { "epoch": 0.2623096697646826, "step": 2653, "train/sim_loss": 0.0625 }, { "epoch": 0.2623096697646826, "step": 2653, "train/total_loss": 0.12949445843696594 }, { "entropy": 9.025361061096191, "epoch": 0.26240854261419816, "mean_token_accuracy": 0.7596566677093506, "num_tokens": 13814520.0, "step": 2654, "train/ce_loss": 0.5731825828552246 }, { "epoch": 0.26240854261419816, "step": 2654, "train/sim_loss": 0.08984375 }, { "epoch": 0.26240854261419816, "step": 2654, "train/total_loss": 0.14716200530529022 }, { "entropy": 9.267854690551758, "epoch": 0.26250741546371364, "mean_token_accuracy": 0.739130437374115, "num_tokens": 13819649.0, "step": 2655, "train/ce_loss": 0.726793646812439 }, { "epoch": 0.26250741546371364, "step": 2655, "train/sim_loss": 0.0703125 }, { "epoch": 0.26250741546371364, "step": 2655, "train/total_loss": 0.14299187064170837 }, { "entropy": 9.054587364196777, "epoch": 0.2626062883132292, "mean_token_accuracy": 0.7280513644218445, "num_tokens": 13825107.0, "step": 2656, "train/ce_loss": 0.9306489825248718 }, { "epoch": 0.2626062883132292, "step": 2656, "train/sim_loss": 0.09375 }, { "epoch": 0.2626062883132292, "step": 2656, "train/total_loss": 0.18681490421295166 }, { "entropy": 9.160943031311035, "epoch": 0.2627051611627447, "mean_token_accuracy": 0.7162162065505981, "num_tokens": 13830453.0, "step": 2657, "train/ce_loss": 1.320202112197876 }, { "epoch": 0.2627051611627447, "step": 2657, "train/sim_loss": 0.09375 }, { "epoch": 0.2627051611627447, "step": 2657, "train/total_loss": 0.22577022016048431 }, { "entropy": 9.296772003173828, "epoch": 0.2628040340122602, "mean_token_accuracy": 0.7425997257232666, "num_tokens": 13835670.0, "step": 2658, "train/ce_loss": 0.8563773036003113 }, { "epoch": 0.2628040340122602, "step": 2658, "train/sim_loss": 0.0625 }, { "epoch": 0.2628040340122602, "step": 2658, "train/total_loss": 0.14813773334026337 }, { "entropy": 9.201019287109375, "epoch": 0.26290290686177575, "mean_token_accuracy": 0.7122128009796143, "num_tokens": 13840980.0, "step": 2659, "train/ce_loss": 0.7281718254089355 }, { "epoch": 0.26290290686177575, "step": 2659, "train/sim_loss": 0.04296875 }, { "epoch": 0.26290290686177575, "step": 2659, "train/total_loss": 0.11578593403100967 }, { "epoch": 0.2630017797112913, "grad_norm": 0.7757517099380493, "learning_rate": 9.345052662809673e-06, "loss": 0.1477, "step": 2660 }, { "entropy": 9.477794647216797, "epoch": 0.2630017797112913, "mean_token_accuracy": 0.8177965879440308, "num_tokens": 13846147.0, "step": 2660, "train/ce_loss": 0.7558349967002869 }, { "epoch": 0.2630017797112913, "step": 2660, "train/sim_loss": 0.0859375 }, { "epoch": 0.2630017797112913, "step": 2660, "train/total_loss": 0.16152100265026093 }, { "entropy": 10.013696670532227, "epoch": 0.2631006525608068, "mean_token_accuracy": 0.681614339351654, "num_tokens": 13851006.0, "step": 2661, "train/ce_loss": 1.5152400732040405 }, { "epoch": 0.2631006525608068, "step": 2661, "train/sim_loss": 0.08984375 }, { "epoch": 0.2631006525608068, "step": 2661, "train/total_loss": 0.24136775732040405 }, { "entropy": 8.90575885772705, "epoch": 0.2631995254103223, "mean_token_accuracy": 0.7424242496490479, "num_tokens": 13856510.0, "step": 2662, "train/ce_loss": 0.9599835276603699 }, { "epoch": 0.2631995254103223, "step": 2662, "train/sim_loss": 0.08984375 }, { "epoch": 0.2631995254103223, "step": 2662, "train/total_loss": 0.1858420968055725 }, { "entropy": 9.146432876586914, "epoch": 0.26329839825983786, "mean_token_accuracy": 0.7649123072624207, "num_tokens": 13861865.0, "step": 2663, "train/ce_loss": 0.5537664890289307 }, { "epoch": 0.26329839825983786, "step": 2663, "train/sim_loss": 0.078125 }, { "epoch": 0.26329839825983786, "step": 2663, "train/total_loss": 0.13350164890289307 }, { "entropy": 9.466894149780273, "epoch": 0.26339727110935335, "mean_token_accuracy": 0.7438849210739136, "num_tokens": 13867002.0, "step": 2664, "train/ce_loss": 0.908247709274292 }, { "epoch": 0.26339727110935335, "step": 2664, "train/sim_loss": 0.03515625 }, { "epoch": 0.26339727110935335, "step": 2664, "train/total_loss": 0.12598103284835815 }, { "entropy": 9.13033676147461, "epoch": 0.2634961439588689, "mean_token_accuracy": 0.696450412273407, "num_tokens": 13872493.0, "step": 2665, "train/ce_loss": 0.4399969279766083 }, { "epoch": 0.2634961439588689, "step": 2665, "train/sim_loss": 0.078125 }, { "epoch": 0.2634961439588689, "step": 2665, "train/total_loss": 0.12212469428777695 }, { "entropy": 8.980000495910645, "epoch": 0.26359501680838443, "mean_token_accuracy": 0.672251284122467, "num_tokens": 13877922.0, "step": 2666, "train/ce_loss": 1.4724513292312622 }, { "epoch": 0.26359501680838443, "step": 2666, "train/sim_loss": 0.1171875 }, { "epoch": 0.26359501680838443, "step": 2666, "train/total_loss": 0.2644326388835907 }, { "entropy": 9.773246765136719, "epoch": 0.2636938896578999, "mean_token_accuracy": 0.7590579986572266, "num_tokens": 13882872.0, "step": 2667, "train/ce_loss": 0.9083855748176575 }, { "epoch": 0.2636938896578999, "step": 2667, "train/sim_loss": 0.05859375 }, { "epoch": 0.2636938896578999, "step": 2667, "train/total_loss": 0.14943230152130127 }, { "entropy": 9.056009292602539, "epoch": 0.26379276250741546, "mean_token_accuracy": 0.7304551005363464, "num_tokens": 13888200.0, "step": 2668, "train/ce_loss": 0.5513181090354919 }, { "epoch": 0.26379276250741546, "step": 2668, "train/sim_loss": 0.0859375 }, { "epoch": 0.26379276250741546, "step": 2668, "train/total_loss": 0.14106930792331696 }, { "entropy": 9.509899139404297, "epoch": 0.263891635356931, "mean_token_accuracy": 0.6895368695259094, "num_tokens": 13893238.0, "step": 2669, "train/ce_loss": 1.6937878131866455 }, { "epoch": 0.263891635356931, "step": 2669, "train/sim_loss": 0.09765625 }, { "epoch": 0.263891635356931, "step": 2669, "train/total_loss": 0.26703503727912903 }, { "entropy": 8.59216022491455, "epoch": 0.2639905082064465, "mean_token_accuracy": 0.7532597780227661, "num_tokens": 13898822.0, "step": 2670, "train/ce_loss": 0.6907157301902771 }, { "epoch": 0.2639905082064465, "step": 2670, "train/sim_loss": 0.140625 }, { "epoch": 0.2639905082064465, "step": 2670, "train/total_loss": 0.20969657599925995 }, { "entropy": 9.557438850402832, "epoch": 0.264089381055962, "mean_token_accuracy": 0.8209407925605774, "num_tokens": 13903907.0, "step": 2671, "train/ce_loss": 0.43957269191741943 }, { "epoch": 0.264089381055962, "step": 2671, "train/sim_loss": 0.01953125 }, { "epoch": 0.264089381055962, "step": 2671, "train/total_loss": 0.06348852068185806 }, { "entropy": 9.627422332763672, "epoch": 0.26418825390547757, "mean_token_accuracy": 0.7705479264259338, "num_tokens": 13908922.0, "step": 2672, "train/ce_loss": 2.151191234588623 }, { "epoch": 0.26418825390547757, "step": 2672, "train/sim_loss": 0.078125 }, { "epoch": 0.26418825390547757, "step": 2672, "train/total_loss": 0.2932441234588623 }, { "entropy": 9.203516006469727, "epoch": 0.26428712675499305, "mean_token_accuracy": 0.6953020095825195, "num_tokens": 13914151.0, "step": 2673, "train/ce_loss": 1.3459376096725464 }, { "epoch": 0.26428712675499305, "step": 2673, "train/sim_loss": 0.12109375 }, { "epoch": 0.26428712675499305, "step": 2673, "train/total_loss": 0.25568753480911255 }, { "entropy": 9.146098136901855, "epoch": 0.2643859996045086, "mean_token_accuracy": 0.7183908224105835, "num_tokens": 13919447.0, "step": 2674, "train/ce_loss": 0.566749632358551 }, { "epoch": 0.2643859996045086, "step": 2674, "train/sim_loss": 0.0390625 }, { "epoch": 0.2643859996045086, "step": 2674, "train/total_loss": 0.09573746472597122 }, { "entropy": 9.722884178161621, "epoch": 0.26448487245402413, "mean_token_accuracy": 0.713274359703064, "num_tokens": 13924391.0, "step": 2675, "train/ce_loss": 1.1495646238327026 }, { "epoch": 0.26448487245402413, "step": 2675, "train/sim_loss": 0.0625 }, { "epoch": 0.26448487245402413, "step": 2675, "train/total_loss": 0.17745646834373474 }, { "entropy": 9.050594329833984, "epoch": 0.2645837453035397, "mean_token_accuracy": 0.738269031047821, "num_tokens": 13929809.0, "step": 2676, "train/ce_loss": 1.4384698867797852 }, { "epoch": 0.2645837453035397, "step": 2676, "train/sim_loss": 0.12109375 }, { "epoch": 0.2645837453035397, "step": 2676, "train/total_loss": 0.2649407386779785 }, { "entropy": 9.252591133117676, "epoch": 0.26468261815305516, "mean_token_accuracy": 0.7371967434883118, "num_tokens": 13935041.0, "step": 2677, "train/ce_loss": 0.8613839149475098 }, { "epoch": 0.26468261815305516, "step": 2677, "train/sim_loss": 0.08203125 }, { "epoch": 0.26468261815305516, "step": 2677, "train/total_loss": 0.16816964745521545 }, { "entropy": 9.040063858032227, "epoch": 0.2647814910025707, "mean_token_accuracy": 0.7625133395195007, "num_tokens": 13940464.0, "step": 2678, "train/ce_loss": 0.3832489848136902 }, { "epoch": 0.2647814910025707, "step": 2678, "train/sim_loss": 0.07421875 }, { "epoch": 0.2647814910025707, "step": 2678, "train/total_loss": 0.11254364997148514 }, { "entropy": 8.749004364013672, "epoch": 0.26488036385208624, "mean_token_accuracy": 0.7801911234855652, "num_tokens": 13946111.0, "step": 2679, "train/ce_loss": 0.6618013381958008 }, { "epoch": 0.26488036385208624, "step": 2679, "train/sim_loss": 0.04296875 }, { "epoch": 0.26488036385208624, "step": 2679, "train/total_loss": 0.10914888232946396 }, { "epoch": 0.26497923670160173, "grad_norm": 0.7406871914863586, "learning_rate": 9.340107798051723e-06, "loss": 0.1539, "step": 2680 }, { "entropy": 9.133565902709961, "epoch": 0.26497923670160173, "mean_token_accuracy": 0.7111111283302307, "num_tokens": 13951535.0, "step": 2680, "train/ce_loss": 0.535231351852417 }, { "epoch": 0.26497923670160173, "step": 2680, "train/sim_loss": 0.06640625 }, { "epoch": 0.26497923670160173, "step": 2680, "train/total_loss": 0.11992938816547394 }, { "entropy": 9.485006332397461, "epoch": 0.26507810955111727, "mean_token_accuracy": 0.7763347625732422, "num_tokens": 13956685.0, "step": 2681, "train/ce_loss": 3.8054508877394255e-06 }, { "epoch": 0.26507810955111727, "step": 2681, "train/sim_loss": 0.0546875 }, { "epoch": 0.26507810955111727, "step": 2681, "train/total_loss": 0.05468787997961044 }, { "entropy": 8.994706153869629, "epoch": 0.2651769824006328, "mean_token_accuracy": 0.7014613747596741, "num_tokens": 13962305.0, "step": 2682, "train/ce_loss": 1.3398152589797974 }, { "epoch": 0.2651769824006328, "step": 2682, "train/sim_loss": 0.08984375 }, { "epoch": 0.2651769824006328, "step": 2682, "train/total_loss": 0.22382527589797974 }, { "entropy": 9.234704971313477, "epoch": 0.2652758552501483, "mean_token_accuracy": 0.7142857313156128, "num_tokens": 13967559.0, "step": 2683, "train/ce_loss": 0.8524354100227356 }, { "epoch": 0.2652758552501483, "step": 2683, "train/sim_loss": 0.0625 }, { "epoch": 0.2652758552501483, "step": 2683, "train/total_loss": 0.14774355292320251 }, { "entropy": 9.198078155517578, "epoch": 0.26537472809966384, "mean_token_accuracy": 0.666304349899292, "num_tokens": 13972909.0, "step": 2684, "train/ce_loss": 0.9787866473197937 }, { "epoch": 0.26537472809966384, "step": 2684, "train/sim_loss": 0.0546875 }, { "epoch": 0.26537472809966384, "step": 2684, "train/total_loss": 0.15256616473197937 }, { "entropy": 9.263998985290527, "epoch": 0.2654736009491794, "mean_token_accuracy": 0.7625570893287659, "num_tokens": 13978240.0, "step": 2685, "train/ce_loss": 0.8679942488670349 }, { "epoch": 0.2654736009491794, "step": 2685, "train/sim_loss": 0.01953125 }, { "epoch": 0.2654736009491794, "step": 2685, "train/total_loss": 0.10633067786693573 }, { "entropy": 9.46180534362793, "epoch": 0.26557247379869486, "mean_token_accuracy": 0.7883817553520203, "num_tokens": 13983368.0, "step": 2686, "train/ce_loss": 0.8221791982650757 }, { "epoch": 0.26557247379869486, "step": 2686, "train/sim_loss": 0.015625 }, { "epoch": 0.26557247379869486, "step": 2686, "train/total_loss": 0.09784292429685593 }, { "entropy": 9.256744384765625, "epoch": 0.2656713466482104, "mean_token_accuracy": 0.7591686844825745, "num_tokens": 13988655.0, "step": 2687, "train/ce_loss": 0.514398455619812 }, { "epoch": 0.2656713466482104, "step": 2687, "train/sim_loss": 0.05078125 }, { "epoch": 0.2656713466482104, "step": 2687, "train/total_loss": 0.10222110152244568 }, { "entropy": 9.252236366271973, "epoch": 0.26577021949772595, "mean_token_accuracy": 0.7030674815177917, "num_tokens": 13993934.0, "step": 2688, "train/ce_loss": 0.5606142282485962 }, { "epoch": 0.26577021949772595, "step": 2688, "train/sim_loss": 0.0234375 }, { "epoch": 0.26577021949772595, "step": 2688, "train/total_loss": 0.07949892431497574 }, { "entropy": 9.90015983581543, "epoch": 0.26586909234724143, "mean_token_accuracy": 0.7364531755447388, "num_tokens": 13998774.0, "step": 2689, "train/ce_loss": 1.4620862007141113 }, { "epoch": 0.26586909234724143, "step": 2689, "train/sim_loss": 0.0546875 }, { "epoch": 0.26586909234724143, "step": 2689, "train/total_loss": 0.20089612901210785 }, { "entropy": 9.599283218383789, "epoch": 0.265967965196757, "mean_token_accuracy": 0.74631267786026, "num_tokens": 14003925.0, "step": 2690, "train/ce_loss": 1.3352909263630863e-05 }, { "epoch": 0.265967965196757, "step": 2690, "train/sim_loss": 0.0390625 }, { "epoch": 0.265967965196757, "step": 2690, "train/total_loss": 0.03906383365392685 }, { "entropy": 9.03776741027832, "epoch": 0.2660668380462725, "mean_token_accuracy": 0.7853982448577881, "num_tokens": 14009313.0, "step": 2691, "train/ce_loss": 0.680451512336731 }, { "epoch": 0.2660668380462725, "step": 2691, "train/sim_loss": 0.11328125 }, { "epoch": 0.2660668380462725, "step": 2691, "train/total_loss": 0.18132640421390533 }, { "entropy": 9.915488243103027, "epoch": 0.266165710895788, "mean_token_accuracy": 0.7182447910308838, "num_tokens": 14014121.0, "step": 2692, "train/ce_loss": 1.067265272140503 }, { "epoch": 0.266165710895788, "step": 2692, "train/sim_loss": 0.0390625 }, { "epoch": 0.266165710895788, "step": 2692, "train/total_loss": 0.1457890272140503 }, { "entropy": 9.574028968811035, "epoch": 0.26626458374530354, "mean_token_accuracy": 0.7054597735404968, "num_tokens": 14019227.0, "step": 2693, "train/ce_loss": 1.486788005422568e-05 }, { "epoch": 0.26626458374530354, "step": 2693, "train/sim_loss": 0.046875 }, { "epoch": 0.26626458374530354, "step": 2693, "train/total_loss": 0.046876486390829086 }, { "entropy": 9.717619895935059, "epoch": 0.2663634565948191, "mean_token_accuracy": 0.7028571367263794, "num_tokens": 14024358.0, "step": 2694, "train/ce_loss": 0.3979763388633728 }, { "epoch": 0.2663634565948191, "step": 2694, "train/sim_loss": 0.078125 }, { "epoch": 0.2663634565948191, "step": 2694, "train/total_loss": 0.11792263388633728 }, { "entropy": 8.958935737609863, "epoch": 0.26646232944433457, "mean_token_accuracy": 0.7351408004760742, "num_tokens": 14029824.0, "step": 2695, "train/ce_loss": 0.9816790223121643 }, { "epoch": 0.26646232944433457, "step": 2695, "train/sim_loss": 0.06640625 }, { "epoch": 0.26646232944433457, "step": 2695, "train/total_loss": 0.16457414627075195 }, { "entropy": 9.341829299926758, "epoch": 0.2665612022938501, "mean_token_accuracy": 0.7549933195114136, "num_tokens": 14034991.0, "step": 2696, "train/ce_loss": 0.5137763023376465 }, { "epoch": 0.2665612022938501, "step": 2696, "train/sim_loss": 0.046875 }, { "epoch": 0.2665612022938501, "step": 2696, "train/total_loss": 0.09825263172388077 }, { "entropy": 9.283841133117676, "epoch": 0.26666007514336565, "mean_token_accuracy": 0.7165149450302124, "num_tokens": 14040233.0, "step": 2697, "train/ce_loss": 0.6727295517921448 }, { "epoch": 0.26666007514336565, "step": 2697, "train/sim_loss": 0.02734375 }, { "epoch": 0.26666007514336565, "step": 2697, "train/total_loss": 0.09461670368909836 }, { "entropy": 9.179637908935547, "epoch": 0.26675894799288113, "mean_token_accuracy": 0.7394285798072815, "num_tokens": 14045575.0, "step": 2698, "train/ce_loss": 1.1516114473342896 }, { "epoch": 0.26675894799288113, "step": 2698, "train/sim_loss": 0.12109375 }, { "epoch": 0.26675894799288113, "step": 2698, "train/total_loss": 0.23625490069389343 }, { "entropy": 9.415515899658203, "epoch": 0.2668578208423967, "mean_token_accuracy": 0.7286432385444641, "num_tokens": 14050817.0, "step": 2699, "train/ce_loss": 1.4189918041229248 }, { "epoch": 0.2668578208423967, "step": 2699, "train/sim_loss": 0.0390625 }, { "epoch": 0.2668578208423967, "step": 2699, "train/total_loss": 0.18096168339252472 }, { "epoch": 0.2669566936919122, "grad_norm": 0.730399489402771, "learning_rate": 9.335162933293776e-06, "loss": 0.1546, "step": 2700 }, { "entropy": 9.281803131103516, "epoch": 0.2669566936919122, "mean_token_accuracy": 0.7296954393386841, "num_tokens": 14056045.0, "step": 2700, "train/ce_loss": 0.4679917097091675 }, { "epoch": 0.2669566936919122, "step": 2700, "train/sim_loss": 0.04296875 }, { "epoch": 0.2669566936919122, "step": 2700, "train/total_loss": 0.08976791799068451 }, { "entropy": 9.70930004119873, "epoch": 0.2670555665414277, "mean_token_accuracy": 0.7396551966667175, "num_tokens": 14061058.0, "step": 2701, "train/ce_loss": 5.598945335805183e-06 }, { "epoch": 0.2670555665414277, "step": 2701, "train/sim_loss": 0.0625 }, { "epoch": 0.2670555665414277, "step": 2701, "train/total_loss": 0.06250055879354477 }, { "entropy": 9.451964378356934, "epoch": 0.26715443939094324, "mean_token_accuracy": 0.8056679964065552, "num_tokens": 14066259.0, "step": 2702, "train/ce_loss": 0.5921843647956848 }, { "epoch": 0.26715443939094324, "step": 2702, "train/sim_loss": 0.08203125 }, { "epoch": 0.26715443939094324, "step": 2702, "train/total_loss": 0.14124968647956848 }, { "entropy": 9.086771011352539, "epoch": 0.2672533122404588, "mean_token_accuracy": 0.6866515874862671, "num_tokens": 14071634.0, "step": 2703, "train/ce_loss": 0.9547269940376282 }, { "epoch": 0.2672533122404588, "step": 2703, "train/sim_loss": 0.04296875 }, { "epoch": 0.2672533122404588, "step": 2703, "train/total_loss": 0.13844144344329834 }, { "entropy": 9.125083923339844, "epoch": 0.26735218508997427, "mean_token_accuracy": 0.7378190159797668, "num_tokens": 14076987.0, "step": 2704, "train/ce_loss": 0.784442663192749 }, { "epoch": 0.26735218508997427, "step": 2704, "train/sim_loss": 0.12109375 }, { "epoch": 0.26735218508997427, "step": 2704, "train/total_loss": 0.19953802227973938 }, { "entropy": 9.073034286499023, "epoch": 0.2674510579394898, "mean_token_accuracy": 0.720588207244873, "num_tokens": 14082326.0, "step": 2705, "train/ce_loss": 0.6992771625518799 }, { "epoch": 0.2674510579394898, "step": 2705, "train/sim_loss": 0.0859375 }, { "epoch": 0.2674510579394898, "step": 2705, "train/total_loss": 0.15586522221565247 }, { "entropy": 9.043624877929688, "epoch": 0.26754993078900535, "mean_token_accuracy": 0.7110874056816101, "num_tokens": 14087772.0, "step": 2706, "train/ce_loss": 1.204443097114563 }, { "epoch": 0.26754993078900535, "step": 2706, "train/sim_loss": 0.0703125 }, { "epoch": 0.26754993078900535, "step": 2706, "train/total_loss": 0.19075681269168854 }, { "entropy": 9.483551025390625, "epoch": 0.26764880363852084, "mean_token_accuracy": 0.6895043849945068, "num_tokens": 14092881.0, "step": 2707, "train/ce_loss": 1.4192895889282227 }, { "epoch": 0.26764880363852084, "step": 2707, "train/sim_loss": 0.109375 }, { "epoch": 0.26764880363852084, "step": 2707, "train/total_loss": 0.2513039708137512 }, { "entropy": 9.470633506774902, "epoch": 0.2677476764880364, "mean_token_accuracy": 0.814479649066925, "num_tokens": 14098056.0, "step": 2708, "train/ce_loss": 0.9829438328742981 }, { "epoch": 0.2677476764880364, "step": 2708, "train/sim_loss": 0.0234375 }, { "epoch": 0.2677476764880364, "step": 2708, "train/total_loss": 0.12173188477754593 }, { "entropy": 9.47817611694336, "epoch": 0.2678465493375519, "mean_token_accuracy": 0.7224669456481934, "num_tokens": 14103210.0, "step": 2709, "train/ce_loss": 1.5246411561965942 }, { "epoch": 0.2678465493375519, "step": 2709, "train/sim_loss": 0.08984375 }, { "epoch": 0.2678465493375519, "step": 2709, "train/total_loss": 0.2423078715801239 }, { "entropy": 9.424848556518555, "epoch": 0.2679454221870674, "mean_token_accuracy": 0.7935578227043152, "num_tokens": 14108322.0, "step": 2710, "train/ce_loss": 0.4777304232120514 }, { "epoch": 0.2679454221870674, "step": 2710, "train/sim_loss": 0.02734375 }, { "epoch": 0.2679454221870674, "step": 2710, "train/total_loss": 0.07511679828166962 }, { "entropy": 9.044717788696289, "epoch": 0.26804429503658295, "mean_token_accuracy": 0.7251732349395752, "num_tokens": 14113629.0, "step": 2711, "train/ce_loss": 0.7718260884284973 }, { "epoch": 0.26804429503658295, "step": 2711, "train/sim_loss": 0.0703125 }, { "epoch": 0.26804429503658295, "step": 2711, "train/total_loss": 0.1474951207637787 }, { "entropy": 9.391767501831055, "epoch": 0.2681431678860985, "mean_token_accuracy": 0.7617079615592957, "num_tokens": 14118769.0, "step": 2712, "train/ce_loss": 0.5797140598297119 }, { "epoch": 0.2681431678860985, "step": 2712, "train/sim_loss": 0.0546875 }, { "epoch": 0.2681431678860985, "step": 2712, "train/total_loss": 0.11265890300273895 }, { "entropy": 8.919556617736816, "epoch": 0.268242040735614, "mean_token_accuracy": 0.7400932312011719, "num_tokens": 14124076.0, "step": 2713, "train/ce_loss": 0.4740203619003296 }, { "epoch": 0.268242040735614, "step": 2713, "train/sim_loss": 0.02734375 }, { "epoch": 0.268242040735614, "step": 2713, "train/total_loss": 0.0747457891702652 }, { "entropy": 9.423288345336914, "epoch": 0.2683409135851295, "mean_token_accuracy": 0.7017310261726379, "num_tokens": 14129215.0, "step": 2714, "train/ce_loss": 1.82626473903656 }, { "epoch": 0.2683409135851295, "step": 2714, "train/sim_loss": 0.05078125 }, { "epoch": 0.2683409135851295, "step": 2714, "train/total_loss": 0.23340772092342377 }, { "entropy": 8.878215789794922, "epoch": 0.26843978643464506, "mean_token_accuracy": 0.7340530157089233, "num_tokens": 14134701.0, "step": 2715, "train/ce_loss": 0.7186618447303772 }, { "epoch": 0.26843978643464506, "step": 2715, "train/sim_loss": 0.03515625 }, { "epoch": 0.26843978643464506, "step": 2715, "train/total_loss": 0.10702243447303772 }, { "entropy": 9.063881874084473, "epoch": 0.2685386592841606, "mean_token_accuracy": 0.7230955362319946, "num_tokens": 14140018.0, "step": 2716, "train/ce_loss": 0.4996785521507263 }, { "epoch": 0.2685386592841606, "step": 2716, "train/sim_loss": 0.07421875 }, { "epoch": 0.2685386592841606, "step": 2716, "train/total_loss": 0.12418660521507263 }, { "entropy": 8.891548156738281, "epoch": 0.2686375321336761, "mean_token_accuracy": 0.7863330245018005, "num_tokens": 14145505.0, "step": 2717, "train/ce_loss": 0.5857919454574585 }, { "epoch": 0.2686375321336761, "step": 2717, "train/sim_loss": 0.015625 }, { "epoch": 0.2686375321336761, "step": 2717, "train/total_loss": 0.07420419156551361 }, { "entropy": 9.50385856628418, "epoch": 0.2687364049831916, "mean_token_accuracy": 0.7188552021980286, "num_tokens": 14150523.0, "step": 2718, "train/ce_loss": 1.3414199352264404 }, { "epoch": 0.2687364049831916, "step": 2718, "train/sim_loss": 0.0546875 }, { "epoch": 0.2687364049831916, "step": 2718, "train/total_loss": 0.18882949650287628 }, { "entropy": 9.160788536071777, "epoch": 0.26883527783270716, "mean_token_accuracy": 0.668865442276001, "num_tokens": 14155753.0, "step": 2719, "train/ce_loss": 0.684292197227478 }, { "epoch": 0.26883527783270716, "step": 2719, "train/sim_loss": 0.0546875 }, { "epoch": 0.26883527783270716, "step": 2719, "train/total_loss": 0.12311672419309616 }, { "epoch": 0.26893415068222265, "grad_norm": 1.0595309734344482, "learning_rate": 9.330218068535826e-06, "loss": 0.1516, "step": 2720 }, { "entropy": 9.514728546142578, "epoch": 0.26893415068222265, "mean_token_accuracy": 0.7625201940536499, "num_tokens": 14160826.0, "step": 2720, "train/ce_loss": 0.7204391360282898 }, { "epoch": 0.26893415068222265, "step": 2720, "train/sim_loss": 0.0390625 }, { "epoch": 0.26893415068222265, "step": 2720, "train/total_loss": 0.11110641807317734 }, { "entropy": 8.682881355285645, "epoch": 0.2690330235317382, "mean_token_accuracy": 0.7777777910232544, "num_tokens": 14166345.0, "step": 2721, "train/ce_loss": 0.7576871514320374 }, { "epoch": 0.2690330235317382, "step": 2721, "train/sim_loss": 0.08984375 }, { "epoch": 0.2690330235317382, "step": 2721, "train/total_loss": 0.16561245918273926 }, { "entropy": 9.194377899169922, "epoch": 0.26913189638125373, "mean_token_accuracy": 0.7034813761711121, "num_tokens": 14171693.0, "step": 2722, "train/ce_loss": 0.3500674068927765 }, { "epoch": 0.26913189638125373, "step": 2722, "train/sim_loss": 0.05859375 }, { "epoch": 0.26913189638125373, "step": 2722, "train/total_loss": 0.09360049664974213 }, { "entropy": 9.035322189331055, "epoch": 0.2692307692307692, "mean_token_accuracy": 0.7163197994232178, "num_tokens": 14177012.0, "step": 2723, "train/ce_loss": 1.296778678894043 }, { "epoch": 0.2692307692307692, "step": 2723, "train/sim_loss": 0.04296875 }, { "epoch": 0.2692307692307692, "step": 2723, "train/total_loss": 0.172646626830101 }, { "entropy": 9.263710021972656, "epoch": 0.26932964208028476, "mean_token_accuracy": 0.7361282110214233, "num_tokens": 14182469.0, "step": 2724, "train/ce_loss": 0.6803064942359924 }, { "epoch": 0.26932964208028476, "step": 2724, "train/sim_loss": 0.0625 }, { "epoch": 0.26932964208028476, "step": 2724, "train/total_loss": 0.13053065538406372 }, { "entropy": 9.153655052185059, "epoch": 0.2694285149298003, "mean_token_accuracy": 0.7928571701049805, "num_tokens": 14187748.0, "step": 2725, "train/ce_loss": 0.6589941382408142 }, { "epoch": 0.2694285149298003, "step": 2725, "train/sim_loss": 0.0703125 }, { "epoch": 0.2694285149298003, "step": 2725, "train/total_loss": 0.13621191680431366 }, { "entropy": 9.351223945617676, "epoch": 0.2695273877793158, "mean_token_accuracy": 0.72265625, "num_tokens": 14192949.0, "step": 2726, "train/ce_loss": 0.6545261740684509 }, { "epoch": 0.2695273877793158, "step": 2726, "train/sim_loss": 0.06640625 }, { "epoch": 0.2695273877793158, "step": 2726, "train/total_loss": 0.13185887038707733 }, { "entropy": 9.045417785644531, "epoch": 0.2696262606288313, "mean_token_accuracy": 0.7489270567893982, "num_tokens": 14198387.0, "step": 2727, "train/ce_loss": 0.7137749791145325 }, { "epoch": 0.2696262606288313, "step": 2727, "train/sim_loss": 0.0703125 }, { "epoch": 0.2696262606288313, "step": 2727, "train/total_loss": 0.14169000089168549 }, { "entropy": 9.564960479736328, "epoch": 0.26972513347834687, "mean_token_accuracy": 0.7658959627151489, "num_tokens": 14203506.0, "step": 2728, "train/ce_loss": 4.330248884798493e-06 }, { "epoch": 0.26972513347834687, "step": 2728, "train/sim_loss": 0.02734375 }, { "epoch": 0.26972513347834687, "step": 2728, "train/total_loss": 0.02734418213367462 }, { "entropy": 8.947754859924316, "epoch": 0.26982400632786235, "mean_token_accuracy": 0.7384230494499207, "num_tokens": 14208789.0, "step": 2729, "train/ce_loss": 0.6324231624603271 }, { "epoch": 0.26982400632786235, "step": 2729, "train/sim_loss": 0.078125 }, { "epoch": 0.26982400632786235, "step": 2729, "train/total_loss": 0.14136731624603271 }, { "entropy": 9.119012832641602, "epoch": 0.2699228791773779, "mean_token_accuracy": 0.7366703152656555, "num_tokens": 14214085.0, "step": 2730, "train/ce_loss": 0.6751307845115662 }, { "epoch": 0.2699228791773779, "step": 2730, "train/sim_loss": 0.08984375 }, { "epoch": 0.2699228791773779, "step": 2730, "train/total_loss": 0.15735682845115662 }, { "entropy": 9.13355541229248, "epoch": 0.27002175202689344, "mean_token_accuracy": 0.7561837434768677, "num_tokens": 14219572.0, "step": 2731, "train/ce_loss": 0.6862268447875977 }, { "epoch": 0.27002175202689344, "step": 2731, "train/sim_loss": 0.0234375 }, { "epoch": 0.27002175202689344, "step": 2731, "train/total_loss": 0.09206018596887589 }, { "entropy": 9.203107833862305, "epoch": 0.2701206248764089, "mean_token_accuracy": 0.747474730014801, "num_tokens": 14224909.0, "step": 2732, "train/ce_loss": 0.658726155757904 }, { "epoch": 0.2701206248764089, "step": 2732, "train/sim_loss": 0.08203125 }, { "epoch": 0.2701206248764089, "step": 2732, "train/total_loss": 0.14790385961532593 }, { "entropy": 10.122865676879883, "epoch": 0.27021949772592446, "mean_token_accuracy": 0.6963788270950317, "num_tokens": 14229721.0, "step": 2733, "train/ce_loss": 2.6643052101135254 }, { "epoch": 0.27021949772592446, "step": 2733, "train/sim_loss": 0.09375 }, { "epoch": 0.27021949772592446, "step": 2733, "train/total_loss": 0.360180526971817 }, { "entropy": 8.99232006072998, "epoch": 0.27031837057544, "mean_token_accuracy": 0.7479674816131592, "num_tokens": 14235104.0, "step": 2734, "train/ce_loss": 1.247081995010376 }, { "epoch": 0.27031837057544, "step": 2734, "train/sim_loss": 0.046875 }, { "epoch": 0.27031837057544, "step": 2734, "train/total_loss": 0.17158320546150208 }, { "entropy": 9.292896270751953, "epoch": 0.2704172434249555, "mean_token_accuracy": 0.7465145587921143, "num_tokens": 14240571.0, "step": 2735, "train/ce_loss": 0.6882023811340332 }, { "epoch": 0.2704172434249555, "step": 2735, "train/sim_loss": 0.05078125 }, { "epoch": 0.2704172434249555, "step": 2735, "train/total_loss": 0.11960148811340332 }, { "entropy": 9.37419319152832, "epoch": 0.27051611627447103, "mean_token_accuracy": 0.7206632494926453, "num_tokens": 14245799.0, "step": 2736, "train/ce_loss": 0.7072527408599854 }, { "epoch": 0.27051611627447103, "step": 2736, "train/sim_loss": 0.05078125 }, { "epoch": 0.27051611627447103, "step": 2736, "train/total_loss": 0.12150652706623077 }, { "entropy": 8.763299942016602, "epoch": 0.27061498912398657, "mean_token_accuracy": 0.7679324746131897, "num_tokens": 14251248.0, "step": 2737, "train/ce_loss": 0.7712081074714661 }, { "epoch": 0.27061498912398657, "step": 2737, "train/sim_loss": 0.0703125 }, { "epoch": 0.27061498912398657, "step": 2737, "train/total_loss": 0.1474333107471466 }, { "entropy": 9.102038383483887, "epoch": 0.27071386197350206, "mean_token_accuracy": 0.7104895114898682, "num_tokens": 14256446.0, "step": 2738, "train/ce_loss": 1.6209111213684082 }, { "epoch": 0.27071386197350206, "step": 2738, "train/sim_loss": 0.265625 }, { "epoch": 0.27071386197350206, "step": 2738, "train/total_loss": 0.42771613597869873 }, { "entropy": 9.282876968383789, "epoch": 0.2708127348230176, "mean_token_accuracy": 0.755667507648468, "num_tokens": 14261693.0, "step": 2739, "train/ce_loss": 0.3576745390892029 }, { "epoch": 0.2708127348230176, "step": 2739, "train/sim_loss": 0.05859375 }, { "epoch": 0.2708127348230176, "step": 2739, "train/total_loss": 0.09436120092868805 }, { "epoch": 0.27091160767253314, "grad_norm": 0.8129703998565674, "learning_rate": 9.325273203777877e-06, "loss": 0.1538, "step": 2740 }, { "entropy": 9.425825119018555, "epoch": 0.27091160767253314, "mean_token_accuracy": 0.6977058053016663, "num_tokens": 14266945.0, "step": 2740, "train/ce_loss": 0.7067881226539612 }, { "epoch": 0.27091160767253314, "step": 2740, "train/sim_loss": 0.1015625 }, { "epoch": 0.27091160767253314, "step": 2740, "train/total_loss": 0.17224131524562836 }, { "entropy": 9.145968437194824, "epoch": 0.2710104805220486, "mean_token_accuracy": 0.765196681022644, "num_tokens": 14272207.0, "step": 2741, "train/ce_loss": 0.7855210304260254 }, { "epoch": 0.2710104805220486, "step": 2741, "train/sim_loss": 0.06640625 }, { "epoch": 0.2710104805220486, "step": 2741, "train/total_loss": 0.14495834708213806 }, { "entropy": 9.4079008102417, "epoch": 0.27110935337156417, "mean_token_accuracy": 0.8165760636329651, "num_tokens": 14277352.0, "step": 2742, "train/ce_loss": 0.7447280883789062 }, { "epoch": 0.27110935337156417, "step": 2742, "train/sim_loss": 0.0234375 }, { "epoch": 0.27110935337156417, "step": 2742, "train/total_loss": 0.0979103073477745 }, { "entropy": 8.978082656860352, "epoch": 0.2712082262210797, "mean_token_accuracy": 0.69852215051651, "num_tokens": 14282870.0, "step": 2743, "train/ce_loss": 0.867680549621582 }, { "epoch": 0.2712082262210797, "step": 2743, "train/sim_loss": 0.1015625 }, { "epoch": 0.2712082262210797, "step": 2743, "train/total_loss": 0.18833056092262268 }, { "entropy": 9.525739669799805, "epoch": 0.2713070990705952, "mean_token_accuracy": 0.7130177617073059, "num_tokens": 14288003.0, "step": 2744, "train/ce_loss": 1.0470430850982666 }, { "epoch": 0.2713070990705952, "step": 2744, "train/sim_loss": 0.09765625 }, { "epoch": 0.2713070990705952, "step": 2744, "train/total_loss": 0.20236057043075562 }, { "entropy": 9.163778305053711, "epoch": 0.27140597192011073, "mean_token_accuracy": 0.7372986078262329, "num_tokens": 14293289.0, "step": 2745, "train/ce_loss": 0.6977776288986206 }, { "epoch": 0.27140597192011073, "step": 2745, "train/sim_loss": 0.1171875 }, { "epoch": 0.27140597192011073, "step": 2745, "train/total_loss": 0.18696525692939758 }, { "entropy": 9.077678680419922, "epoch": 0.2715048447696263, "mean_token_accuracy": 0.7273838520050049, "num_tokens": 14298592.0, "step": 2746, "train/ce_loss": 6.585466962860664e-06 }, { "epoch": 0.2715048447696263, "step": 2746, "train/sim_loss": 0.04296875 }, { "epoch": 0.2715048447696263, "step": 2746, "train/total_loss": 0.04296940937638283 }, { "entropy": 9.930625915527344, "epoch": 0.27160371761914176, "mean_token_accuracy": 0.7298049926757812, "num_tokens": 14303340.0, "step": 2747, "train/ce_loss": 2.05059552192688 }, { "epoch": 0.27160371761914176, "step": 2747, "train/sim_loss": 0.046875 }, { "epoch": 0.27160371761914176, "step": 2747, "train/total_loss": 0.25193455815315247 }, { "entropy": 9.546127319335938, "epoch": 0.2717025904686573, "mean_token_accuracy": 0.7295690774917603, "num_tokens": 14308489.0, "step": 2748, "train/ce_loss": 1.3548938035964966 }, { "epoch": 0.2717025904686573, "step": 2748, "train/sim_loss": 0.078125 }, { "epoch": 0.2717025904686573, "step": 2748, "train/total_loss": 0.21361438930034637 }, { "entropy": 8.843116760253906, "epoch": 0.27180146331817284, "mean_token_accuracy": 0.7509881258010864, "num_tokens": 14313917.0, "step": 2749, "train/ce_loss": 1.0115149021148682 }, { "epoch": 0.27180146331817284, "step": 2749, "train/sim_loss": 0.11328125 }, { "epoch": 0.27180146331817284, "step": 2749, "train/total_loss": 0.2144327461719513 }, { "entropy": 9.490730285644531, "epoch": 0.27190033616768833, "mean_token_accuracy": 0.766853928565979, "num_tokens": 14319046.0, "step": 2750, "train/ce_loss": 3.2003395062929485e-06 }, { "epoch": 0.27190033616768833, "step": 2750, "train/sim_loss": 0.046875 }, { "epoch": 0.27190033616768833, "step": 2750, "train/total_loss": 0.04687532037496567 }, { "entropy": 8.729164123535156, "epoch": 0.27199920901720387, "mean_token_accuracy": 0.7644135355949402, "num_tokens": 14324571.0, "step": 2751, "train/ce_loss": 0.6920303702354431 }, { "epoch": 0.27199920901720387, "step": 2751, "train/sim_loss": 0.078125 }, { "epoch": 0.27199920901720387, "step": 2751, "train/total_loss": 0.14732804894447327 }, { "entropy": 9.034978866577148, "epoch": 0.2720980818667194, "mean_token_accuracy": 0.7425414323806763, "num_tokens": 14329945.0, "step": 2752, "train/ce_loss": 0.8318552374839783 }, { "epoch": 0.2720980818667194, "step": 2752, "train/sim_loss": 0.05859375 }, { "epoch": 0.2720980818667194, "step": 2752, "train/total_loss": 0.14177927374839783 }, { "entropy": 9.5916109085083, "epoch": 0.2721969547162349, "mean_token_accuracy": 0.7996768951416016, "num_tokens": 14335009.0, "step": 2753, "train/ce_loss": 4.757183887704741e-06 }, { "epoch": 0.2721969547162349, "step": 2753, "train/sim_loss": 0.0234375 }, { "epoch": 0.2721969547162349, "step": 2753, "train/total_loss": 0.023437974974513054 }, { "entropy": 9.023578643798828, "epoch": 0.27229582756575044, "mean_token_accuracy": 0.6979637742042542, "num_tokens": 14340352.0, "step": 2754, "train/ce_loss": 0.4947526454925537 }, { "epoch": 0.27229582756575044, "step": 2754, "train/sim_loss": 0.08203125 }, { "epoch": 0.27229582756575044, "step": 2754, "train/total_loss": 0.1315065175294876 }, { "entropy": 9.618267059326172, "epoch": 0.272394700415266, "mean_token_accuracy": 0.7423780560493469, "num_tokens": 14345391.0, "step": 2755, "train/ce_loss": 8.185864317056257e-06 }, { "epoch": 0.272394700415266, "step": 2755, "train/sim_loss": 0.0703125 }, { "epoch": 0.272394700415266, "step": 2755, "train/total_loss": 0.07031331956386566 }, { "entropy": 8.738838195800781, "epoch": 0.27249357326478146, "mean_token_accuracy": 0.7375964522361755, "num_tokens": 14350794.0, "step": 2756, "train/ce_loss": 0.6735537052154541 }, { "epoch": 0.27249357326478146, "step": 2756, "train/sim_loss": 0.06640625 }, { "epoch": 0.27249357326478146, "step": 2756, "train/total_loss": 0.13376161456108093 }, { "entropy": 9.224106788635254, "epoch": 0.272592446114297, "mean_token_accuracy": 0.7684346437454224, "num_tokens": 14356057.0, "step": 2757, "train/ce_loss": 0.9679180383682251 }, { "epoch": 0.272592446114297, "step": 2757, "train/sim_loss": 0.11328125 }, { "epoch": 0.272592446114297, "step": 2757, "train/total_loss": 0.2100730538368225 }, { "entropy": 9.142610549926758, "epoch": 0.27269131896381255, "mean_token_accuracy": 0.7483870983123779, "num_tokens": 14361301.0, "step": 2758, "train/ce_loss": 1.4557925462722778 }, { "epoch": 0.27269131896381255, "step": 2758, "train/sim_loss": 0.04296875 }, { "epoch": 0.27269131896381255, "step": 2758, "train/total_loss": 0.1885480135679245 }, { "entropy": 10.052937507629395, "epoch": 0.2727901918133281, "mean_token_accuracy": 0.7412935495376587, "num_tokens": 14366114.0, "step": 2759, "train/ce_loss": 1.7297953367233276 }, { "epoch": 0.2727901918133281, "step": 2759, "train/sim_loss": 0.08203125 }, { "epoch": 0.2727901918133281, "step": 2759, "train/total_loss": 0.25501078367233276 }, { "epoch": 0.2728890646628436, "grad_norm": 0.9541262984275818, "learning_rate": 9.320328339019929e-06, "loss": 0.1457, "step": 2760 }, { "entropy": 9.396015167236328, "epoch": 0.2728890646628436, "mean_token_accuracy": 0.6876675486564636, "num_tokens": 14371281.0, "step": 2760, "train/ce_loss": 1.7390131950378418 }, { "epoch": 0.2728890646628436, "step": 2760, "train/sim_loss": 0.08203125 }, { "epoch": 0.2728890646628436, "step": 2760, "train/total_loss": 0.2559325695037842 }, { "entropy": 9.222189903259277, "epoch": 0.2729879375123591, "mean_token_accuracy": 0.6744186282157898, "num_tokens": 14376507.0, "step": 2761, "train/ce_loss": 1.4619855880737305 }, { "epoch": 0.2729879375123591, "step": 2761, "train/sim_loss": 0.05859375 }, { "epoch": 0.2729879375123591, "step": 2761, "train/total_loss": 0.2047923058271408 }, { "entropy": 9.410306930541992, "epoch": 0.27308681036187465, "mean_token_accuracy": 0.7361769080162048, "num_tokens": 14381564.0, "step": 2762, "train/ce_loss": 0.8330072164535522 }, { "epoch": 0.27308681036187465, "step": 2762, "train/sim_loss": 0.05078125 }, { "epoch": 0.27308681036187465, "step": 2762, "train/total_loss": 0.13408197462558746 }, { "entropy": 9.200156211853027, "epoch": 0.27318568321139014, "mean_token_accuracy": 0.7051281929016113, "num_tokens": 14386819.0, "step": 2763, "train/ce_loss": 0.8308500647544861 }, { "epoch": 0.27318568321139014, "step": 2763, "train/sim_loss": 0.0703125 }, { "epoch": 0.27318568321139014, "step": 2763, "train/total_loss": 0.15339750051498413 }, { "entropy": 8.76707935333252, "epoch": 0.2732845560609057, "mean_token_accuracy": 0.70010906457901, "num_tokens": 14392131.0, "step": 2764, "train/ce_loss": 0.5953741669654846 }, { "epoch": 0.2732845560609057, "step": 2764, "train/sim_loss": 0.0390625 }, { "epoch": 0.2732845560609057, "step": 2764, "train/total_loss": 0.09859991818666458 }, { "entropy": 8.96510124206543, "epoch": 0.2733834289104212, "mean_token_accuracy": 0.7502774596214294, "num_tokens": 14397494.0, "step": 2765, "train/ce_loss": 0.9151800274848938 }, { "epoch": 0.2733834289104212, "step": 2765, "train/sim_loss": 0.0625 }, { "epoch": 0.2733834289104212, "step": 2765, "train/total_loss": 0.15401801466941833 }, { "entropy": 8.955915451049805, "epoch": 0.2734823017599367, "mean_token_accuracy": 0.6983805894851685, "num_tokens": 14402944.0, "step": 2766, "train/ce_loss": 0.6279106140136719 }, { "epoch": 0.2734823017599367, "step": 2766, "train/sim_loss": 0.05078125 }, { "epoch": 0.2734823017599367, "step": 2766, "train/total_loss": 0.11357231438159943 }, { "entropy": 9.095291137695312, "epoch": 0.27358117460945225, "mean_token_accuracy": 0.746198832988739, "num_tokens": 14408277.0, "step": 2767, "train/ce_loss": 0.9595216512680054 }, { "epoch": 0.27358117460945225, "step": 2767, "train/sim_loss": 0.09375 }, { "epoch": 0.27358117460945225, "step": 2767, "train/total_loss": 0.18970216810703278 }, { "entropy": 9.259521484375, "epoch": 0.2736800474589678, "mean_token_accuracy": 0.7518518567085266, "num_tokens": 14413564.0, "step": 2768, "train/ce_loss": 0.7817373871803284 }, { "epoch": 0.2736800474589678, "step": 2768, "train/sim_loss": 0.078125 }, { "epoch": 0.2736800474589678, "step": 2768, "train/total_loss": 0.15629874169826508 }, { "entropy": 9.800960540771484, "epoch": 0.2737789203084833, "mean_token_accuracy": 0.7534791231155396, "num_tokens": 14418497.0, "step": 2769, "train/ce_loss": 0.00027163056074641645 }, { "epoch": 0.2737789203084833, "step": 2769, "train/sim_loss": 0.046875 }, { "epoch": 0.2737789203084833, "step": 2769, "train/total_loss": 0.046902164816856384 }, { "entropy": 9.137109756469727, "epoch": 0.2738777931579988, "mean_token_accuracy": 0.7103128433227539, "num_tokens": 14423836.0, "step": 2770, "train/ce_loss": 1.4109325408935547 }, { "epoch": 0.2738777931579988, "step": 2770, "train/sim_loss": 0.1015625 }, { "epoch": 0.2738777931579988, "step": 2770, "train/total_loss": 0.24265575408935547 }, { "entropy": 9.510459899902344, "epoch": 0.27397666600751436, "mean_token_accuracy": 0.7751572132110596, "num_tokens": 14428938.0, "step": 2771, "train/ce_loss": 3.7481873732758686e-05 }, { "epoch": 0.27397666600751436, "step": 2771, "train/sim_loss": 0.0625 }, { "epoch": 0.27397666600751436, "step": 2771, "train/total_loss": 0.06250374764204025 }, { "entropy": 9.130108833312988, "epoch": 0.27407553885702984, "mean_token_accuracy": 0.7431629300117493, "num_tokens": 14434239.0, "step": 2772, "train/ce_loss": 0.3732340335845947 }, { "epoch": 0.27407553885702984, "step": 2772, "train/sim_loss": 0.02734375 }, { "epoch": 0.27407553885702984, "step": 2772, "train/total_loss": 0.06466715037822723 }, { "entropy": 9.484024047851562, "epoch": 0.2741744117065454, "mean_token_accuracy": 0.7289073467254639, "num_tokens": 14439399.0, "step": 2773, "train/ce_loss": 1.231729507446289 }, { "epoch": 0.2741744117065454, "step": 2773, "train/sim_loss": 0.07421875 }, { "epoch": 0.2741744117065454, "step": 2773, "train/total_loss": 0.19739170372486115 }, { "entropy": 9.430547714233398, "epoch": 0.2742732845560609, "mean_token_accuracy": 0.7442922592163086, "num_tokens": 14444573.0, "step": 2774, "train/ce_loss": 1.1785828064603265e-05 }, { "epoch": 0.2742732845560609, "step": 2774, "train/sim_loss": 0.109375 }, { "epoch": 0.2742732845560609, "step": 2774, "train/total_loss": 0.10937617719173431 }, { "entropy": 9.98629093170166, "epoch": 0.2743721574055764, "mean_token_accuracy": 0.7030162215232849, "num_tokens": 14449465.0, "step": 2775, "train/ce_loss": 8.80569132277742e-06 }, { "epoch": 0.2743721574055764, "step": 2775, "train/sim_loss": 0.05859375 }, { "epoch": 0.2743721574055764, "step": 2775, "train/total_loss": 0.05859462916851044 }, { "entropy": 9.538305282592773, "epoch": 0.27447103025509195, "mean_token_accuracy": 0.7018348574638367, "num_tokens": 14454580.0, "step": 2776, "train/ce_loss": 1.2678555250167847 }, { "epoch": 0.27447103025509195, "step": 2776, "train/sim_loss": 0.109375 }, { "epoch": 0.27447103025509195, "step": 2776, "train/total_loss": 0.23616056144237518 }, { "entropy": 10.267413139343262, "epoch": 0.2745699031046075, "mean_token_accuracy": 0.7951807379722595, "num_tokens": 14459267.0, "step": 2777, "train/ce_loss": 1.6592833995819092 }, { "epoch": 0.2745699031046075, "step": 2777, "train/sim_loss": 0.05078125 }, { "epoch": 0.2745699031046075, "step": 2777, "train/total_loss": 0.21670959889888763 }, { "entropy": 8.74622917175293, "epoch": 0.274668775954123, "mean_token_accuracy": 0.7353951930999756, "num_tokens": 14464615.0, "step": 2778, "train/ce_loss": 0.8360884189605713 }, { "epoch": 0.274668775954123, "step": 2778, "train/sim_loss": 0.078125 }, { "epoch": 0.274668775954123, "step": 2778, "train/total_loss": 0.16173383593559265 }, { "entropy": 8.989816665649414, "epoch": 0.2747676488036385, "mean_token_accuracy": 0.8011363744735718, "num_tokens": 14469986.0, "step": 2779, "train/ce_loss": 1.058032512664795 }, { "epoch": 0.2747676488036385, "step": 2779, "train/sim_loss": 0.046875 }, { "epoch": 0.2747676488036385, "step": 2779, "train/total_loss": 0.1526782512664795 }, { "epoch": 0.27486652165315406, "grad_norm": 0.7337502837181091, "learning_rate": 9.31538347426198e-06, "loss": 0.1501, "step": 2780 }, { "entropy": 9.15650749206543, "epoch": 0.27486652165315406, "mean_token_accuracy": 0.6983758807182312, "num_tokens": 14475357.0, "step": 2780, "train/ce_loss": 0.8407629728317261 }, { "epoch": 0.27486652165315406, "step": 2780, "train/sim_loss": 0.0703125 }, { "epoch": 0.27486652165315406, "step": 2780, "train/total_loss": 0.15438880026340485 }, { "entropy": 9.087913513183594, "epoch": 0.27496539450266955, "mean_token_accuracy": 0.7535714507102966, "num_tokens": 14480666.0, "step": 2781, "train/ce_loss": 0.8932794332504272 }, { "epoch": 0.27496539450266955, "step": 2781, "train/sim_loss": 0.08984375 }, { "epoch": 0.27496539450266955, "step": 2781, "train/total_loss": 0.17917169630527496 }, { "entropy": 9.604347229003906, "epoch": 0.2750642673521851, "mean_token_accuracy": 0.7685325145721436, "num_tokens": 14485781.0, "step": 2782, "train/ce_loss": 8.136759788612835e-06 }, { "epoch": 0.2750642673521851, "step": 2782, "train/sim_loss": 0.078125 }, { "epoch": 0.2750642673521851, "step": 2782, "train/total_loss": 0.07812581211328506 }, { "entropy": 8.990873336791992, "epoch": 0.27516314020170063, "mean_token_accuracy": 0.7177497744560242, "num_tokens": 14491339.0, "step": 2783, "train/ce_loss": 0.8948922157287598 }, { "epoch": 0.27516314020170063, "step": 2783, "train/sim_loss": 0.09375 }, { "epoch": 0.27516314020170063, "step": 2783, "train/total_loss": 0.18323922157287598 }, { "entropy": 9.077988624572754, "epoch": 0.2752620130512161, "mean_token_accuracy": 0.7343412637710571, "num_tokens": 14496756.0, "step": 2784, "train/ce_loss": 0.7399857044219971 }, { "epoch": 0.2752620130512161, "step": 2784, "train/sim_loss": 0.0234375 }, { "epoch": 0.2752620130512161, "step": 2784, "train/total_loss": 0.09743607044219971 }, { "entropy": 8.992471694946289, "epoch": 0.27536088590073166, "mean_token_accuracy": 0.7187817096710205, "num_tokens": 14502240.0, "step": 2785, "train/ce_loss": 1.4025267362594604 }, { "epoch": 0.27536088590073166, "step": 2785, "train/sim_loss": 0.0703125 }, { "epoch": 0.27536088590073166, "step": 2785, "train/total_loss": 0.21056517958641052 }, { "entropy": 9.678565979003906, "epoch": 0.2754597587502472, "mean_token_accuracy": 0.7094339728355408, "num_tokens": 14507181.0, "step": 2786, "train/ce_loss": 1.3096341717755422e-05 }, { "epoch": 0.2754597587502472, "step": 2786, "train/sim_loss": 0.07421875 }, { "epoch": 0.2754597587502472, "step": 2786, "train/total_loss": 0.07422006130218506 }, { "entropy": 9.417784690856934, "epoch": 0.2755586315997627, "mean_token_accuracy": 0.7039473652839661, "num_tokens": 14512373.0, "step": 2787, "train/ce_loss": 0.9774115085601807 }, { "epoch": 0.2755586315997627, "step": 2787, "train/sim_loss": 0.046875 }, { "epoch": 0.2755586315997627, "step": 2787, "train/total_loss": 0.14461615681648254 }, { "entropy": 9.363090515136719, "epoch": 0.2756575044492782, "mean_token_accuracy": 0.7112675905227661, "num_tokens": 14517538.0, "step": 2788, "train/ce_loss": 1.3934249877929688 }, { "epoch": 0.2756575044492782, "step": 2788, "train/sim_loss": 0.078125 }, { "epoch": 0.2756575044492782, "step": 2788, "train/total_loss": 0.2174675017595291 }, { "entropy": 9.457781791687012, "epoch": 0.27575637729879376, "mean_token_accuracy": 0.7311521768569946, "num_tokens": 14522649.0, "step": 2789, "train/ce_loss": 1.1051501035690308 }, { "epoch": 0.27575637729879376, "step": 2789, "train/sim_loss": 0.109375 }, { "epoch": 0.27575637729879376, "step": 2789, "train/total_loss": 0.21989001333713531 }, { "entropy": 9.891679763793945, "epoch": 0.27585525014830925, "mean_token_accuracy": 0.7549999952316284, "num_tokens": 14527455.0, "step": 2790, "train/ce_loss": 1.8008737564086914 }, { "epoch": 0.27585525014830925, "step": 2790, "train/sim_loss": 0.078125 }, { "epoch": 0.27585525014830925, "step": 2790, "train/total_loss": 0.2582123875617981 }, { "entropy": 9.288782119750977, "epoch": 0.2759541229978248, "mean_token_accuracy": 0.7717791199684143, "num_tokens": 14532762.0, "step": 2791, "train/ce_loss": 0.8080936074256897 }, { "epoch": 0.2759541229978248, "step": 2791, "train/sim_loss": 0.0625 }, { "epoch": 0.2759541229978248, "step": 2791, "train/total_loss": 0.1433093547821045 }, { "entropy": 9.532764434814453, "epoch": 0.27605299584734033, "mean_token_accuracy": 0.7211093902587891, "num_tokens": 14537852.0, "step": 2792, "train/ce_loss": 0.8462459444999695 }, { "epoch": 0.27605299584734033, "step": 2792, "train/sim_loss": 0.08203125 }, { "epoch": 0.27605299584734033, "step": 2792, "train/total_loss": 0.16665583848953247 }, { "entropy": 9.049671173095703, "epoch": 0.2761518686968558, "mean_token_accuracy": 0.8083961009979248, "num_tokens": 14543216.0, "step": 2793, "train/ce_loss": 0.6352055668830872 }, { "epoch": 0.2761518686968558, "step": 2793, "train/sim_loss": 0.03125 }, { "epoch": 0.2761518686968558, "step": 2793, "train/total_loss": 0.09477055817842484 }, { "entropy": 9.601463317871094, "epoch": 0.27625074154637136, "mean_token_accuracy": 0.7698675394058228, "num_tokens": 14548250.0, "step": 2794, "train/ce_loss": 0.5902164578437805 }, { "epoch": 0.27625074154637136, "step": 2794, "train/sim_loss": 0.05859375 }, { "epoch": 0.27625074154637136, "step": 2794, "train/total_loss": 0.11761540174484253 }, { "entropy": 9.143898963928223, "epoch": 0.2763496143958869, "mean_token_accuracy": 0.7332535982131958, "num_tokens": 14553551.0, "step": 2795, "train/ce_loss": 0.4416395425796509 }, { "epoch": 0.2763496143958869, "step": 2795, "train/sim_loss": 0.0703125 }, { "epoch": 0.2763496143958869, "step": 2795, "train/total_loss": 0.11447645723819733 }, { "entropy": 9.792064666748047, "epoch": 0.2764484872454024, "mean_token_accuracy": 0.7258319854736328, "num_tokens": 14558634.0, "step": 2796, "train/ce_loss": 0.9641293287277222 }, { "epoch": 0.2764484872454024, "step": 2796, "train/sim_loss": 0.09375 }, { "epoch": 0.2764484872454024, "step": 2796, "train/total_loss": 0.19016292691230774 }, { "entropy": 8.983678817749023, "epoch": 0.2765473600949179, "mean_token_accuracy": 0.7092511057853699, "num_tokens": 14564042.0, "step": 2797, "train/ce_loss": 0.49966195225715637 }, { "epoch": 0.2765473600949179, "step": 2797, "train/sim_loss": 0.07421875 }, { "epoch": 0.2765473600949179, "step": 2797, "train/total_loss": 0.12418495118618011 }, { "entropy": 9.377925872802734, "epoch": 0.27664623294443347, "mean_token_accuracy": 0.729194164276123, "num_tokens": 14569246.0, "step": 2798, "train/ce_loss": 0.8390363454818726 }, { "epoch": 0.27664623294443347, "step": 2798, "train/sim_loss": 0.0234375 }, { "epoch": 0.27664623294443347, "step": 2798, "train/total_loss": 0.10734113305807114 }, { "entropy": 9.556909561157227, "epoch": 0.276745105793949, "mean_token_accuracy": 0.7122302055358887, "num_tokens": 14574244.0, "step": 2799, "train/ce_loss": 5.166768460185267e-06 }, { "epoch": 0.276745105793949, "step": 2799, "train/sim_loss": 0.07421875 }, { "epoch": 0.276745105793949, "step": 2799, "train/total_loss": 0.07421926409006119 }, { "epoch": 0.2768439786434645, "grad_norm": 0.9261484742164612, "learning_rate": 9.310438609504032e-06, "loss": 0.1536, "step": 2800 }, { "entropy": 9.34654426574707, "epoch": 0.2768439786434645, "mean_token_accuracy": 0.7158034443855286, "num_tokens": 14579495.0, "step": 2800, "train/ce_loss": 0.40792229771614075 }, { "epoch": 0.2768439786434645, "step": 2800, "train/sim_loss": 0.0546875 }, { "epoch": 0.2768439786434645, "step": 2800, "train/total_loss": 0.09547972679138184 }, { "entropy": 9.522933959960938, "epoch": 0.27694285149298004, "mean_token_accuracy": 0.7470414042472839, "num_tokens": 14584561.0, "step": 2801, "train/ce_loss": 4.356124918558635e-06 }, { "epoch": 0.27694285149298004, "step": 2801, "train/sim_loss": 0.01953125 }, { "epoch": 0.27694285149298004, "step": 2801, "train/total_loss": 0.01953168585896492 }, { "entropy": 9.155094146728516, "epoch": 0.2770417243424956, "mean_token_accuracy": 0.7702227234840393, "num_tokens": 14589871.0, "step": 2802, "train/ce_loss": 0.8684111833572388 }, { "epoch": 0.2770417243424956, "step": 2802, "train/sim_loss": 0.078125 }, { "epoch": 0.2770417243424956, "step": 2802, "train/total_loss": 0.16496612131595612 }, { "entropy": 10.107953071594238, "epoch": 0.27714059719201106, "mean_token_accuracy": 0.722347617149353, "num_tokens": 14594700.0, "step": 2803, "train/ce_loss": 1.263716459274292 }, { "epoch": 0.27714059719201106, "step": 2803, "train/sim_loss": 0.02734375 }, { "epoch": 0.27714059719201106, "step": 2803, "train/total_loss": 0.15371540188789368 }, { "entropy": 9.887292861938477, "epoch": 0.2772394700415266, "mean_token_accuracy": 0.7801268696784973, "num_tokens": 14599591.0, "step": 2804, "train/ce_loss": 1.0284250492986757e-05 }, { "epoch": 0.2772394700415266, "step": 2804, "train/sim_loss": 0.05078125 }, { "epoch": 0.2772394700415266, "step": 2804, "train/total_loss": 0.050782278180122375 }, { "entropy": 9.335623741149902, "epoch": 0.27733834289104214, "mean_token_accuracy": 0.710089385509491, "num_tokens": 14604746.0, "step": 2805, "train/ce_loss": 0.9892579913139343 }, { "epoch": 0.27733834289104214, "step": 2805, "train/sim_loss": 0.06640625 }, { "epoch": 0.27733834289104214, "step": 2805, "train/total_loss": 0.16533204913139343 }, { "entropy": 9.14814567565918, "epoch": 0.27743721574055763, "mean_token_accuracy": 0.7142857313156128, "num_tokens": 14609955.0, "step": 2806, "train/ce_loss": 0.6708879470825195 }, { "epoch": 0.27743721574055763, "step": 2806, "train/sim_loss": 0.078125 }, { "epoch": 0.27743721574055763, "step": 2806, "train/total_loss": 0.1452137976884842 }, { "entropy": 8.987321853637695, "epoch": 0.27753608859007317, "mean_token_accuracy": 0.7251396775245667, "num_tokens": 14615308.0, "step": 2807, "train/ce_loss": 0.8159539699554443 }, { "epoch": 0.27753608859007317, "step": 2807, "train/sim_loss": 0.046875 }, { "epoch": 0.27753608859007317, "step": 2807, "train/total_loss": 0.12847039103507996 }, { "entropy": 9.317058563232422, "epoch": 0.2776349614395887, "mean_token_accuracy": 0.7585693001747131, "num_tokens": 14620460.0, "step": 2808, "train/ce_loss": 0.7746213674545288 }, { "epoch": 0.2776349614395887, "step": 2808, "train/sim_loss": 0.0703125 }, { "epoch": 0.2776349614395887, "step": 2808, "train/total_loss": 0.14777463674545288 }, { "entropy": 9.289009094238281, "epoch": 0.2777338342891042, "mean_token_accuracy": 0.806609570980072, "num_tokens": 14625717.0, "step": 2809, "train/ce_loss": 0.7532195448875427 }, { "epoch": 0.2777338342891042, "step": 2809, "train/sim_loss": 0.0234375 }, { "epoch": 0.2777338342891042, "step": 2809, "train/total_loss": 0.09875945746898651 }, { "entropy": 9.066307067871094, "epoch": 0.27783270713861974, "mean_token_accuracy": 0.7464008927345276, "num_tokens": 14631122.0, "step": 2810, "train/ce_loss": 1.13222336769104 }, { "epoch": 0.27783270713861974, "step": 2810, "train/sim_loss": 0.08203125 }, { "epoch": 0.27783270713861974, "step": 2810, "train/total_loss": 0.19525358080863953 }, { "entropy": 9.542865753173828, "epoch": 0.2779315799881353, "mean_token_accuracy": 0.6925514936447144, "num_tokens": 14636203.0, "step": 2811, "train/ce_loss": 1.0428379774093628 }, { "epoch": 0.2779315799881353, "step": 2811, "train/sim_loss": 0.046875 }, { "epoch": 0.2779315799881353, "step": 2811, "train/total_loss": 0.15115880966186523 }, { "entropy": 9.164630889892578, "epoch": 0.27803045283765077, "mean_token_accuracy": 0.7756410241127014, "num_tokens": 14641408.0, "step": 2812, "train/ce_loss": 0.8429121971130371 }, { "epoch": 0.27803045283765077, "step": 2812, "train/sim_loss": 0.0859375 }, { "epoch": 0.27803045283765077, "step": 2812, "train/total_loss": 0.1702287197113037 }, { "entropy": 9.204523086547852, "epoch": 0.2781293256871663, "mean_token_accuracy": 0.8227990865707397, "num_tokens": 14646764.0, "step": 2813, "train/ce_loss": 0.4245660901069641 }, { "epoch": 0.2781293256871663, "step": 2813, "train/sim_loss": 0.10546875 }, { "epoch": 0.2781293256871663, "step": 2813, "train/total_loss": 0.14792536199092865 }, { "entropy": 9.020892143249512, "epoch": 0.27822819853668185, "mean_token_accuracy": 0.7318652868270874, "num_tokens": 14651994.0, "step": 2814, "train/ce_loss": 0.6662297248840332 }, { "epoch": 0.27822819853668185, "step": 2814, "train/sim_loss": 0.05859375 }, { "epoch": 0.27822819853668185, "step": 2814, "train/total_loss": 0.12521672248840332 }, { "entropy": 9.035504341125488, "epoch": 0.27832707138619733, "mean_token_accuracy": 0.7324973940849304, "num_tokens": 14657409.0, "step": 2815, "train/ce_loss": 0.6135442852973938 }, { "epoch": 0.27832707138619733, "step": 2815, "train/sim_loss": 0.0859375 }, { "epoch": 0.27832707138619733, "step": 2815, "train/total_loss": 0.14729192852973938 }, { "entropy": 9.42744255065918, "epoch": 0.2784259442357129, "mean_token_accuracy": 0.7241379022598267, "num_tokens": 14662533.0, "step": 2816, "train/ce_loss": 1.1029224395751953 }, { "epoch": 0.2784259442357129, "step": 2816, "train/sim_loss": 0.078125 }, { "epoch": 0.2784259442357129, "step": 2816, "train/total_loss": 0.1884172558784485 }, { "entropy": 9.399081230163574, "epoch": 0.2785248170852284, "mean_token_accuracy": 0.7789017558097839, "num_tokens": 14667689.0, "step": 2817, "train/ce_loss": 0.41269248723983765 }, { "epoch": 0.2785248170852284, "step": 2817, "train/sim_loss": 0.05859375 }, { "epoch": 0.2785248170852284, "step": 2817, "train/total_loss": 0.09986300021409988 }, { "entropy": 9.163670539855957, "epoch": 0.2786236899347439, "mean_token_accuracy": 0.6848691701889038, "num_tokens": 14673011.0, "step": 2818, "train/ce_loss": 1.004690170288086 }, { "epoch": 0.2786236899347439, "step": 2818, "train/sim_loss": 0.03125 }, { "epoch": 0.2786236899347439, "step": 2818, "train/total_loss": 0.13171902298927307 }, { "entropy": 9.63107681274414, "epoch": 0.27872256278425944, "mean_token_accuracy": 0.709618866443634, "num_tokens": 14677986.0, "step": 2819, "train/ce_loss": 0.9162476062774658 }, { "epoch": 0.27872256278425944, "step": 2819, "train/sim_loss": 0.05859375 }, { "epoch": 0.27872256278425944, "step": 2819, "train/total_loss": 0.15021851658821106 }, { "epoch": 0.278821435633775, "grad_norm": 0.9040817618370056, "learning_rate": 9.305493744746082e-06, "loss": 0.1469, "step": 2820 }, { "entropy": 9.209228515625, "epoch": 0.278821435633775, "mean_token_accuracy": 0.7755681872367859, "num_tokens": 14683165.0, "step": 2820, "train/ce_loss": 1.6597121953964233 }, { "epoch": 0.278821435633775, "step": 2820, "train/sim_loss": 0.08203125 }, { "epoch": 0.278821435633775, "step": 2820, "train/total_loss": 0.24800246953964233 }, { "entropy": 9.346404075622559, "epoch": 0.27892030848329047, "mean_token_accuracy": 0.7366310358047485, "num_tokens": 14688348.0, "step": 2821, "train/ce_loss": 0.7856786251068115 }, { "epoch": 0.27892030848329047, "step": 2821, "train/sim_loss": 0.0703125 }, { "epoch": 0.27892030848329047, "step": 2821, "train/total_loss": 0.14888036251068115 }, { "entropy": 9.371283531188965, "epoch": 0.279019181332806, "mean_token_accuracy": 0.7604562640190125, "num_tokens": 14693626.0, "step": 2822, "train/ce_loss": 2.148430109024048 }, { "epoch": 0.279019181332806, "step": 2822, "train/sim_loss": 0.1328125 }, { "epoch": 0.279019181332806, "step": 2822, "train/total_loss": 0.3476555347442627 }, { "entropy": 9.872785568237305, "epoch": 0.27911805418232155, "mean_token_accuracy": 0.7659574747085571, "num_tokens": 14698524.0, "step": 2823, "train/ce_loss": 1.5956604480743408 }, { "epoch": 0.27911805418232155, "step": 2823, "train/sim_loss": 0.0859375 }, { "epoch": 0.27911805418232155, "step": 2823, "train/total_loss": 0.24550354480743408 }, { "entropy": 9.347860336303711, "epoch": 0.27921692703183704, "mean_token_accuracy": 0.6710700392723083, "num_tokens": 14703738.0, "step": 2824, "train/ce_loss": 3.157474793624715e-06 }, { "epoch": 0.27921692703183704, "step": 2824, "train/sim_loss": 0.06640625 }, { "epoch": 0.27921692703183704, "step": 2824, "train/total_loss": 0.06640656292438507 }, { "entropy": 9.093656539916992, "epoch": 0.2793157998813526, "mean_token_accuracy": 0.7427184581756592, "num_tokens": 14709041.0, "step": 2825, "train/ce_loss": 0.9866620302200317 }, { "epoch": 0.2793157998813526, "step": 2825, "train/sim_loss": 0.02734375 }, { "epoch": 0.2793157998813526, "step": 2825, "train/total_loss": 0.1260099560022354 }, { "entropy": 9.071894645690918, "epoch": 0.2794146727308681, "mean_token_accuracy": 0.6545040011405945, "num_tokens": 14714345.0, "step": 2826, "train/ce_loss": 1.7742172479629517 }, { "epoch": 0.2794146727308681, "step": 2826, "train/sim_loss": 0.0625 }, { "epoch": 0.2794146727308681, "step": 2826, "train/total_loss": 0.23992173373699188 }, { "entropy": 9.314325332641602, "epoch": 0.2795135455803836, "mean_token_accuracy": 0.707317054271698, "num_tokens": 14719580.0, "step": 2827, "train/ce_loss": 1.368817687034607 }, { "epoch": 0.2795135455803836, "step": 2827, "train/sim_loss": 0.0546875 }, { "epoch": 0.2795135455803836, "step": 2827, "train/total_loss": 0.1915692687034607 }, { "entropy": 9.146047592163086, "epoch": 0.27961241842989915, "mean_token_accuracy": 0.735897421836853, "num_tokens": 14724845.0, "step": 2828, "train/ce_loss": 0.8126648664474487 }, { "epoch": 0.27961241842989915, "step": 2828, "train/sim_loss": 0.078125 }, { "epoch": 0.27961241842989915, "step": 2828, "train/total_loss": 0.15939149260520935 }, { "entropy": 9.197863578796387, "epoch": 0.2797112912794147, "mean_token_accuracy": 0.7515375018119812, "num_tokens": 14730317.0, "step": 2829, "train/ce_loss": 0.8095092177391052 }, { "epoch": 0.2797112912794147, "step": 2829, "train/sim_loss": 0.1328125 }, { "epoch": 0.2797112912794147, "step": 2829, "train/total_loss": 0.21376341581344604 }, { "entropy": 9.530399322509766, "epoch": 0.27981016412893017, "mean_token_accuracy": 0.728787899017334, "num_tokens": 14735388.0, "step": 2830, "train/ce_loss": 1.2663441896438599 }, { "epoch": 0.27981016412893017, "step": 2830, "train/sim_loss": 0.01953125 }, { "epoch": 0.27981016412893017, "step": 2830, "train/total_loss": 0.146165668964386 }, { "entropy": 9.17813777923584, "epoch": 0.2799090369784457, "mean_token_accuracy": 0.7043189406394958, "num_tokens": 14740784.0, "step": 2831, "train/ce_loss": 0.6137787103652954 }, { "epoch": 0.2799090369784457, "step": 2831, "train/sim_loss": 0.03125 }, { "epoch": 0.2799090369784457, "step": 2831, "train/total_loss": 0.0926278680562973 }, { "entropy": 9.317256927490234, "epoch": 0.28000790982796125, "mean_token_accuracy": 0.7368420958518982, "num_tokens": 14745999.0, "step": 2832, "train/ce_loss": 0.6407901048660278 }, { "epoch": 0.28000790982796125, "step": 2832, "train/sim_loss": 0.05859375 }, { "epoch": 0.28000790982796125, "step": 2832, "train/total_loss": 0.12267275899648666 }, { "entropy": 9.02281379699707, "epoch": 0.28010678267747674, "mean_token_accuracy": 0.6962719559669495, "num_tokens": 14751364.0, "step": 2833, "train/ce_loss": 1.4975244998931885 }, { "epoch": 0.28010678267747674, "step": 2833, "train/sim_loss": 0.10546875 }, { "epoch": 0.28010678267747674, "step": 2833, "train/total_loss": 0.2552211880683899 }, { "entropy": 9.3422269821167, "epoch": 0.2802056555269923, "mean_token_accuracy": 0.7568627595901489, "num_tokens": 14756565.0, "step": 2834, "train/ce_loss": 0.9791196584701538 }, { "epoch": 0.2802056555269923, "step": 2834, "train/sim_loss": 0.0390625 }, { "epoch": 0.2802056555269923, "step": 2834, "train/total_loss": 0.13697446882724762 }, { "entropy": 9.067024230957031, "epoch": 0.2803045283765078, "mean_token_accuracy": 0.7177215218544006, "num_tokens": 14761813.0, "step": 2835, "train/ce_loss": 0.9501418471336365 }, { "epoch": 0.2803045283765078, "step": 2835, "train/sim_loss": 0.078125 }, { "epoch": 0.2803045283765078, "step": 2835, "train/total_loss": 0.17313918471336365 }, { "entropy": 9.283982276916504, "epoch": 0.2804034012260233, "mean_token_accuracy": 0.768831193447113, "num_tokens": 14767079.0, "step": 2836, "train/ce_loss": 0.3750511407852173 }, { "epoch": 0.2804034012260233, "step": 2836, "train/sim_loss": 0.0859375 }, { "epoch": 0.2804034012260233, "step": 2836, "train/total_loss": 0.1234426200389862 }, { "entropy": 9.057904243469238, "epoch": 0.28050227407553885, "mean_token_accuracy": 0.7276166677474976, "num_tokens": 14772364.0, "step": 2837, "train/ce_loss": 1.0986840724945068 }, { "epoch": 0.28050227407553885, "step": 2837, "train/sim_loss": 0.05859375 }, { "epoch": 0.28050227407553885, "step": 2837, "train/total_loss": 0.16846215724945068 }, { "entropy": 9.07217788696289, "epoch": 0.2806011469250544, "mean_token_accuracy": 0.751207709312439, "num_tokens": 14777663.0, "step": 2838, "train/ce_loss": 1.0109590291976929 }, { "epoch": 0.2806011469250544, "step": 2838, "train/sim_loss": 0.06640625 }, { "epoch": 0.2806011469250544, "step": 2838, "train/total_loss": 0.16750216484069824 }, { "entropy": 9.474416732788086, "epoch": 0.2807000197745699, "mean_token_accuracy": 0.6593245267868042, "num_tokens": 14782794.0, "step": 2839, "train/ce_loss": 1.2314399480819702 }, { "epoch": 0.2807000197745699, "step": 2839, "train/sim_loss": 0.12890625 }, { "epoch": 0.2807000197745699, "step": 2839, "train/total_loss": 0.2520502507686615 }, { "epoch": 0.2807988926240854, "grad_norm": 0.9944719672203064, "learning_rate": 9.300548879988133e-06, "loss": 0.1588, "step": 2840 }, { "entropy": 10.013803482055664, "epoch": 0.2807988926240854, "mean_token_accuracy": 0.6607669591903687, "num_tokens": 14787549.0, "step": 2840, "train/ce_loss": 9.427340046386234e-06 }, { "epoch": 0.2807988926240854, "step": 2840, "train/sim_loss": 0.03125 }, { "epoch": 0.2807988926240854, "step": 2840, "train/total_loss": 0.03125094249844551 }, { "entropy": 9.168352127075195, "epoch": 0.28089776547360096, "mean_token_accuracy": 0.7252090573310852, "num_tokens": 14792888.0, "step": 2841, "train/ce_loss": 1.0289748907089233 }, { "epoch": 0.28089776547360096, "step": 2841, "train/sim_loss": 0.08984375 }, { "epoch": 0.28089776547360096, "step": 2841, "train/total_loss": 0.1927412450313568 }, { "entropy": 9.441322326660156, "epoch": 0.2809966383231165, "mean_token_accuracy": 0.7796852588653564, "num_tokens": 14798033.0, "step": 2842, "train/ce_loss": 0.7183638215065002 }, { "epoch": 0.2809966383231165, "step": 2842, "train/sim_loss": 0.09765625 }, { "epoch": 0.2809966383231165, "step": 2842, "train/total_loss": 0.16949263215065002 }, { "entropy": 9.063495635986328, "epoch": 0.281095511172632, "mean_token_accuracy": 0.7673377990722656, "num_tokens": 14803361.0, "step": 2843, "train/ce_loss": 0.5977473258972168 }, { "epoch": 0.281095511172632, "step": 2843, "train/sim_loss": 0.05859375 }, { "epoch": 0.281095511172632, "step": 2843, "train/total_loss": 0.1183684840798378 }, { "entropy": 9.264204025268555, "epoch": 0.2811943840221475, "mean_token_accuracy": 0.6883604526519775, "num_tokens": 14808698.0, "step": 2844, "train/ce_loss": 4.171130967733916e-06 }, { "epoch": 0.2811943840221475, "step": 2844, "train/sim_loss": 0.0546875 }, { "epoch": 0.2811943840221475, "step": 2844, "train/total_loss": 0.05468791723251343 }, { "entropy": 9.403308868408203, "epoch": 0.28129325687166307, "mean_token_accuracy": 0.7954220175743103, "num_tokens": 14813867.0, "step": 2845, "train/ce_loss": 4.887909199169371e-06 }, { "epoch": 0.28129325687166307, "step": 2845, "train/sim_loss": 0.06640625 }, { "epoch": 0.28129325687166307, "step": 2845, "train/total_loss": 0.0664067417383194 }, { "entropy": 9.499643325805664, "epoch": 0.28139212972117855, "mean_token_accuracy": 0.7121211886405945, "num_tokens": 14818982.0, "step": 2846, "train/ce_loss": 1.0086324214935303 }, { "epoch": 0.28139212972117855, "step": 2846, "train/sim_loss": 0.03515625 }, { "epoch": 0.28139212972117855, "step": 2846, "train/total_loss": 0.1360194981098175 }, { "entropy": 9.10026741027832, "epoch": 0.2814910025706941, "mean_token_accuracy": 0.7444444298744202, "num_tokens": 14824310.0, "step": 2847, "train/ce_loss": 0.7505509257316589 }, { "epoch": 0.2814910025706941, "step": 2847, "train/sim_loss": 0.09765625 }, { "epoch": 0.2814910025706941, "step": 2847, "train/total_loss": 0.1727113425731659 }, { "entropy": 9.757712364196777, "epoch": 0.28158987542020963, "mean_token_accuracy": 0.7311643958091736, "num_tokens": 14829337.0, "step": 2848, "train/ce_loss": 4.63628703073482e-06 }, { "epoch": 0.28158987542020963, "step": 2848, "train/sim_loss": 0.03125 }, { "epoch": 0.28158987542020963, "step": 2848, "train/total_loss": 0.03125046193599701 }, { "entropy": 8.91356372833252, "epoch": 0.2816887482697251, "mean_token_accuracy": 0.7150395512580872, "num_tokens": 14834547.0, "step": 2849, "train/ce_loss": 0.7498126029968262 }, { "epoch": 0.2816887482697251, "step": 2849, "train/sim_loss": 0.05859375 }, { "epoch": 0.2816887482697251, "step": 2849, "train/total_loss": 0.13357502222061157 }, { "entropy": 9.169994354248047, "epoch": 0.28178762111924066, "mean_token_accuracy": 0.7188940048217773, "num_tokens": 14839629.0, "step": 2850, "train/ce_loss": 2.004420518875122 }, { "epoch": 0.28178762111924066, "step": 2850, "train/sim_loss": 0.08203125 }, { "epoch": 0.28178762111924066, "step": 2850, "train/total_loss": 0.2824733257293701 }, { "entropy": 10.078506469726562, "epoch": 0.2818864939687562, "mean_token_accuracy": 0.7661290168762207, "num_tokens": 14844407.0, "step": 2851, "train/ce_loss": 6.761013082723366e-06 }, { "epoch": 0.2818864939687562, "step": 2851, "train/sim_loss": 0.02734375 }, { "epoch": 0.2818864939687562, "step": 2851, "train/total_loss": 0.02734442614018917 }, { "entropy": 8.773055076599121, "epoch": 0.2819853668182717, "mean_token_accuracy": 0.7260416746139526, "num_tokens": 14849859.0, "step": 2852, "train/ce_loss": 0.8448234796524048 }, { "epoch": 0.2819853668182717, "step": 2852, "train/sim_loss": 0.05078125 }, { "epoch": 0.2819853668182717, "step": 2852, "train/total_loss": 0.135263592004776 }, { "entropy": 9.628518104553223, "epoch": 0.28208423966778723, "mean_token_accuracy": 0.7423934936523438, "num_tokens": 14854786.0, "step": 2853, "train/ce_loss": 8.594476639700588e-06 }, { "epoch": 0.28208423966778723, "step": 2853, "train/sim_loss": 0.09375 }, { "epoch": 0.28208423966778723, "step": 2853, "train/total_loss": 0.09375085681676865 }, { "entropy": 9.283641815185547, "epoch": 0.28218311251730277, "mean_token_accuracy": 0.7653478980064392, "num_tokens": 14859967.0, "step": 2854, "train/ce_loss": 0.9590861201286316 }, { "epoch": 0.28218311251730277, "step": 2854, "train/sim_loss": 0.0546875 }, { "epoch": 0.28218311251730277, "step": 2854, "train/total_loss": 0.15059611201286316 }, { "entropy": 9.685651779174805, "epoch": 0.28228198536681826, "mean_token_accuracy": 0.7138047218322754, "num_tokens": 14864979.0, "step": 2855, "train/ce_loss": 0.7458218932151794 }, { "epoch": 0.28228198536681826, "step": 2855, "train/sim_loss": 0.078125 }, { "epoch": 0.28228198536681826, "step": 2855, "train/total_loss": 0.15270718932151794 }, { "entropy": 9.327569961547852, "epoch": 0.2823808582163338, "mean_token_accuracy": 0.7463863492012024, "num_tokens": 14870208.0, "step": 2856, "train/ce_loss": 0.46743056178092957 }, { "epoch": 0.2823808582163338, "step": 2856, "train/sim_loss": 0.0234375 }, { "epoch": 0.2823808582163338, "step": 2856, "train/total_loss": 0.07018055766820908 }, { "entropy": 8.706644058227539, "epoch": 0.28247973106584934, "mean_token_accuracy": 0.7232142686843872, "num_tokens": 14875736.0, "step": 2857, "train/ce_loss": 0.7347967028617859 }, { "epoch": 0.28247973106584934, "step": 2857, "train/sim_loss": 0.0234375 }, { "epoch": 0.28247973106584934, "step": 2857, "train/total_loss": 0.09691717475652695 }, { "entropy": 9.690895080566406, "epoch": 0.2825786039153648, "mean_token_accuracy": 0.6859813332557678, "num_tokens": 14880704.0, "step": 2858, "train/ce_loss": 1.3040494918823242 }, { "epoch": 0.2825786039153648, "step": 2858, "train/sim_loss": 0.1015625 }, { "epoch": 0.2825786039153648, "step": 2858, "train/total_loss": 0.23196744918823242 }, { "entropy": 8.822907447814941, "epoch": 0.28267747676488036, "mean_token_accuracy": 0.7335243821144104, "num_tokens": 14886240.0, "step": 2859, "train/ce_loss": 0.7813608646392822 }, { "epoch": 0.28267747676488036, "step": 2859, "train/sim_loss": 0.0390625 }, { "epoch": 0.28267747676488036, "step": 2859, "train/total_loss": 0.11719858646392822 }, { "epoch": 0.2827763496143959, "grad_norm": 0.7708230018615723, "learning_rate": 9.295604015230185e-06, "loss": 0.1569, "step": 2860 }, { "entropy": 9.035205841064453, "epoch": 0.2827763496143959, "mean_token_accuracy": 0.7477638721466064, "num_tokens": 14891285.0, "step": 2860, "train/ce_loss": 0.9769201874732971 }, { "epoch": 0.2827763496143959, "step": 2860, "train/sim_loss": 0.05859375 }, { "epoch": 0.2827763496143959, "step": 2860, "train/total_loss": 0.15628576278686523 }, { "entropy": 8.971607208251953, "epoch": 0.2828752224639114, "mean_token_accuracy": 0.707446813583374, "num_tokens": 14896680.0, "step": 2861, "train/ce_loss": 0.7312490940093994 }, { "epoch": 0.2828752224639114, "step": 2861, "train/sim_loss": 0.06640625 }, { "epoch": 0.2828752224639114, "step": 2861, "train/total_loss": 0.13953116536140442 }, { "entropy": 9.324117660522461, "epoch": 0.28297409531342693, "mean_token_accuracy": 0.7440000176429749, "num_tokens": 14901911.0, "step": 2862, "train/ce_loss": 0.8002615571022034 }, { "epoch": 0.28297409531342693, "step": 2862, "train/sim_loss": 0.03125 }, { "epoch": 0.28297409531342693, "step": 2862, "train/total_loss": 0.11127615720033646 }, { "entropy": 9.020425796508789, "epoch": 0.2830729681629425, "mean_token_accuracy": 0.7620651125907898, "num_tokens": 14907299.0, "step": 2863, "train/ce_loss": 0.40052181482315063 }, { "epoch": 0.2830729681629425, "step": 2863, "train/sim_loss": 0.03125 }, { "epoch": 0.2830729681629425, "step": 2863, "train/total_loss": 0.07130218297243118 }, { "entropy": 9.081808090209961, "epoch": 0.28317184101245796, "mean_token_accuracy": 0.7048260569572449, "num_tokens": 14912676.0, "step": 2864, "train/ce_loss": 1.4557803869247437 }, { "epoch": 0.28317184101245796, "step": 2864, "train/sim_loss": 0.109375 }, { "epoch": 0.28317184101245796, "step": 2864, "train/total_loss": 0.2549530267715454 }, { "entropy": 9.343101501464844, "epoch": 0.2832707138619735, "mean_token_accuracy": 0.8108108043670654, "num_tokens": 14917810.0, "step": 2865, "train/ce_loss": 0.6384103298187256 }, { "epoch": 0.2832707138619735, "step": 2865, "train/sim_loss": 0.03125 }, { "epoch": 0.2832707138619735, "step": 2865, "train/total_loss": 0.09509103745222092 }, { "entropy": 9.158092498779297, "epoch": 0.28336958671148904, "mean_token_accuracy": 0.7088607549667358, "num_tokens": 14923164.0, "step": 2866, "train/ce_loss": 0.77399080991745 }, { "epoch": 0.28336958671148904, "step": 2866, "train/sim_loss": 0.08984375 }, { "epoch": 0.28336958671148904, "step": 2866, "train/total_loss": 0.16724282503128052 }, { "entropy": 9.385732650756836, "epoch": 0.2834684595610045, "mean_token_accuracy": 0.7286295890808105, "num_tokens": 14928346.0, "step": 2867, "train/ce_loss": 1.191542387008667 }, { "epoch": 0.2834684595610045, "step": 2867, "train/sim_loss": 0.06640625 }, { "epoch": 0.2834684595610045, "step": 2867, "train/total_loss": 0.18556049466133118 }, { "entropy": 8.915904998779297, "epoch": 0.28356733241052007, "mean_token_accuracy": 0.710208535194397, "num_tokens": 14933734.0, "step": 2868, "train/ce_loss": 1.2880035638809204 }, { "epoch": 0.28356733241052007, "step": 2868, "train/sim_loss": 0.0625 }, { "epoch": 0.28356733241052007, "step": 2868, "train/total_loss": 0.19130036234855652 }, { "entropy": 9.131593704223633, "epoch": 0.2836662052600356, "mean_token_accuracy": 0.7538631558418274, "num_tokens": 14939093.0, "step": 2869, "train/ce_loss": 0.5264643430709839 }, { "epoch": 0.2836662052600356, "step": 2869, "train/sim_loss": 0.0625 }, { "epoch": 0.2836662052600356, "step": 2869, "train/total_loss": 0.11514643579721451 }, { "entropy": 8.972114562988281, "epoch": 0.2837650781095511, "mean_token_accuracy": 0.7587336301803589, "num_tokens": 14944519.0, "step": 2870, "train/ce_loss": 0.5326451063156128 }, { "epoch": 0.2837650781095511, "step": 2870, "train/sim_loss": 0.0859375 }, { "epoch": 0.2837650781095511, "step": 2870, "train/total_loss": 0.13920201361179352 }, { "entropy": 9.17553424835205, "epoch": 0.28386395095906664, "mean_token_accuracy": 0.6920454502105713, "num_tokens": 14949813.0, "step": 2871, "train/ce_loss": 1.0072126388549805 }, { "epoch": 0.28386395095906664, "step": 2871, "train/sim_loss": 0.046875 }, { "epoch": 0.28386395095906664, "step": 2871, "train/total_loss": 0.14759626984596252 }, { "entropy": 8.945381164550781, "epoch": 0.2839628238085822, "mean_token_accuracy": 0.728249192237854, "num_tokens": 14955235.0, "step": 2872, "train/ce_loss": 0.6857571601867676 }, { "epoch": 0.2839628238085822, "step": 2872, "train/sim_loss": 0.0546875 }, { "epoch": 0.2839628238085822, "step": 2872, "train/total_loss": 0.12326321750879288 }, { "entropy": 8.911649703979492, "epoch": 0.28406169665809766, "mean_token_accuracy": 0.7469135522842407, "num_tokens": 14960550.0, "step": 2873, "train/ce_loss": 0.6256386041641235 }, { "epoch": 0.28406169665809766, "step": 2873, "train/sim_loss": 0.0625 }, { "epoch": 0.28406169665809766, "step": 2873, "train/total_loss": 0.12506386637687683 }, { "entropy": 9.24261474609375, "epoch": 0.2841605695076132, "mean_token_accuracy": 0.7627118825912476, "num_tokens": 14965777.0, "step": 2874, "train/ce_loss": 0.5531490445137024 }, { "epoch": 0.2841605695076132, "step": 2874, "train/sim_loss": 0.0390625 }, { "epoch": 0.2841605695076132, "step": 2874, "train/total_loss": 0.09437740594148636 }, { "entropy": 9.203690528869629, "epoch": 0.28425944235712874, "mean_token_accuracy": 0.7638888955116272, "num_tokens": 14970899.0, "step": 2875, "train/ce_loss": 1.4430269402510021e-05 }, { "epoch": 0.28425944235712874, "step": 2875, "train/sim_loss": 0.046875 }, { "epoch": 0.28425944235712874, "step": 2875, "train/total_loss": 0.046876441687345505 }, { "entropy": 8.744983673095703, "epoch": 0.28435831520664423, "mean_token_accuracy": 0.7970244288444519, "num_tokens": 14976265.0, "step": 2876, "train/ce_loss": 0.37529221177101135 }, { "epoch": 0.28435831520664423, "step": 2876, "train/sim_loss": 0.02734375 }, { "epoch": 0.28435831520664423, "step": 2876, "train/total_loss": 0.06487297266721725 }, { "entropy": 9.226240158081055, "epoch": 0.28445718805615977, "mean_token_accuracy": 0.7182044982910156, "num_tokens": 14981516.0, "step": 2877, "train/ce_loss": 2.5748761345312232e-06 }, { "epoch": 0.28445718805615977, "step": 2877, "train/sim_loss": 0.0625 }, { "epoch": 0.28445718805615977, "step": 2877, "train/total_loss": 0.06250026077032089 }, { "entropy": 8.894309043884277, "epoch": 0.2845560609056753, "mean_token_accuracy": 0.7683315873146057, "num_tokens": 14986940.0, "step": 2878, "train/ce_loss": 0.6498753428459167 }, { "epoch": 0.2845560609056753, "step": 2878, "train/sim_loss": 0.06640625 }, { "epoch": 0.2845560609056753, "step": 2878, "train/total_loss": 0.13139379024505615 }, { "entropy": 9.124774932861328, "epoch": 0.2846549337551908, "mean_token_accuracy": 0.738095223903656, "num_tokens": 14992231.0, "step": 2879, "train/ce_loss": 1.1597203016281128 }, { "epoch": 0.2846549337551908, "step": 2879, "train/sim_loss": 0.1328125 }, { "epoch": 0.2846549337551908, "step": 2879, "train/total_loss": 0.24878454208374023 }, { "epoch": 0.28475380660470634, "grad_norm": 0.9130677580833435, "learning_rate": 9.290659150472235e-06, "loss": 0.1444, "step": 2880 }, { "entropy": 9.95883560180664, "epoch": 0.28475380660470634, "mean_token_accuracy": 0.7068607211112976, "num_tokens": 14997134.0, "step": 2880, "train/ce_loss": 1.6533236503601074 }, { "epoch": 0.28475380660470634, "step": 2880, "train/sim_loss": 0.125 }, { "epoch": 0.28475380660470634, "step": 2880, "train/total_loss": 0.2903323769569397 }, { "entropy": 9.849489212036133, "epoch": 0.2848526794542219, "mean_token_accuracy": 0.7683615684509277, "num_tokens": 15002099.0, "step": 2881, "train/ce_loss": 1.461854338645935 }, { "epoch": 0.2848526794542219, "step": 2881, "train/sim_loss": 0.078125 }, { "epoch": 0.2848526794542219, "step": 2881, "train/total_loss": 0.22431044280529022 }, { "entropy": 9.121145248413086, "epoch": 0.2849515523037374, "mean_token_accuracy": 0.7242646813392639, "num_tokens": 15007399.0, "step": 2882, "train/ce_loss": 0.46917709708213806 }, { "epoch": 0.2849515523037374, "step": 2882, "train/sim_loss": 0.06640625 }, { "epoch": 0.2849515523037374, "step": 2882, "train/total_loss": 0.11332395672798157 }, { "entropy": 8.879717826843262, "epoch": 0.2850504251532529, "mean_token_accuracy": 0.7605459094047546, "num_tokens": 15012741.0, "step": 2883, "train/ce_loss": 0.7481593489646912 }, { "epoch": 0.2850504251532529, "step": 2883, "train/sim_loss": 0.109375 }, { "epoch": 0.2850504251532529, "step": 2883, "train/total_loss": 0.18419092893600464 }, { "entropy": 9.377570152282715, "epoch": 0.28514929800276845, "mean_token_accuracy": 0.7293035387992859, "num_tokens": 15017922.0, "step": 2884, "train/ce_loss": 1.36378014087677 }, { "epoch": 0.28514929800276845, "step": 2884, "train/sim_loss": 0.03515625 }, { "epoch": 0.28514929800276845, "step": 2884, "train/total_loss": 0.17153427004814148 }, { "entropy": 9.331884384155273, "epoch": 0.285248170852284, "mean_token_accuracy": 0.7418879270553589, "num_tokens": 15023068.0, "step": 2885, "train/ce_loss": 1.1111305866506882e-05 }, { "epoch": 0.285248170852284, "step": 2885, "train/sim_loss": 0.02734375 }, { "epoch": 0.285248170852284, "step": 2885, "train/total_loss": 0.02734486199915409 }, { "entropy": 8.684528350830078, "epoch": 0.2853470437017995, "mean_token_accuracy": 0.7504363059997559, "num_tokens": 15028713.0, "step": 2886, "train/ce_loss": 0.8095030784606934 }, { "epoch": 0.2853470437017995, "step": 2886, "train/sim_loss": 0.09375 }, { "epoch": 0.2853470437017995, "step": 2886, "train/total_loss": 0.1747003197669983 }, { "entropy": 9.024653434753418, "epoch": 0.285445916551315, "mean_token_accuracy": 0.7375296950340271, "num_tokens": 15034053.0, "step": 2887, "train/ce_loss": 1.1938775777816772 }, { "epoch": 0.285445916551315, "step": 2887, "train/sim_loss": 0.1171875 }, { "epoch": 0.285445916551315, "step": 2887, "train/total_loss": 0.23657526075839996 }, { "entropy": 8.832128524780273, "epoch": 0.28554478940083056, "mean_token_accuracy": 0.701508641242981, "num_tokens": 15039481.0, "step": 2888, "train/ce_loss": 1.0464129447937012 }, { "epoch": 0.28554478940083056, "step": 2888, "train/sim_loss": 0.10546875 }, { "epoch": 0.28554478940083056, "step": 2888, "train/total_loss": 0.21011003851890564 }, { "entropy": 9.106720924377441, "epoch": 0.28564366225034604, "mean_token_accuracy": 0.7845982313156128, "num_tokens": 15044874.0, "step": 2889, "train/ce_loss": 0.8580244183540344 }, { "epoch": 0.28564366225034604, "step": 2889, "train/sim_loss": 0.08203125 }, { "epoch": 0.28564366225034604, "step": 2889, "train/total_loss": 0.16783368587493896 }, { "entropy": 9.683690071105957, "epoch": 0.2857425350998616, "mean_token_accuracy": 0.7377049326896667, "num_tokens": 15049889.0, "step": 2890, "train/ce_loss": 0.7985544204711914 }, { "epoch": 0.2857425350998616, "step": 2890, "train/sim_loss": 0.05859375 }, { "epoch": 0.2857425350998616, "step": 2890, "train/total_loss": 0.13844919204711914 }, { "entropy": 9.650266647338867, "epoch": 0.2858414079493771, "mean_token_accuracy": 0.6741573214530945, "num_tokens": 15054946.0, "step": 2891, "train/ce_loss": 1.537845492362976 }, { "epoch": 0.2858414079493771, "step": 2891, "train/sim_loss": 0.10546875 }, { "epoch": 0.2858414079493771, "step": 2891, "train/total_loss": 0.2592533230781555 }, { "entropy": 9.020576477050781, "epoch": 0.2859402807988926, "mean_token_accuracy": 0.7421320080757141, "num_tokens": 15060366.0, "step": 2892, "train/ce_loss": 0.669991672039032 }, { "epoch": 0.2859402807988926, "step": 2892, "train/sim_loss": 0.02734375 }, { "epoch": 0.2859402807988926, "step": 2892, "train/total_loss": 0.0943429172039032 }, { "entropy": 8.795308113098145, "epoch": 0.28603915364840815, "mean_token_accuracy": 0.789875864982605, "num_tokens": 15065882.0, "step": 2893, "train/ce_loss": 0.2111099511384964 }, { "epoch": 0.28603915364840815, "step": 2893, "train/sim_loss": 0.0234375 }, { "epoch": 0.28603915364840815, "step": 2893, "train/total_loss": 0.04454849660396576 }, { "entropy": 9.871052742004395, "epoch": 0.2861380264979237, "mean_token_accuracy": 0.7721822261810303, "num_tokens": 15070716.0, "step": 2894, "train/ce_loss": 2.063713788986206 }, { "epoch": 0.2861380264979237, "step": 2894, "train/sim_loss": 0.0703125 }, { "epoch": 0.2861380264979237, "step": 2894, "train/total_loss": 0.27668386697769165 }, { "entropy": 9.305549621582031, "epoch": 0.2862368993474392, "mean_token_accuracy": 0.6906290054321289, "num_tokens": 15075962.0, "step": 2895, "train/ce_loss": 1.2572402954101562 }, { "epoch": 0.2862368993474392, "step": 2895, "train/sim_loss": 0.078125 }, { "epoch": 0.2862368993474392, "step": 2895, "train/total_loss": 0.20384903252124786 }, { "entropy": 9.614995002746582, "epoch": 0.2863357721969547, "mean_token_accuracy": 0.7966386675834656, "num_tokens": 15081032.0, "step": 2896, "train/ce_loss": 1.1156418323516846 }, { "epoch": 0.2863357721969547, "step": 2896, "train/sim_loss": 0.0546875 }, { "epoch": 0.2863357721969547, "step": 2896, "train/total_loss": 0.16625168919563293 }, { "entropy": 9.07950210571289, "epoch": 0.28643464504647026, "mean_token_accuracy": 0.7869023084640503, "num_tokens": 15086442.0, "step": 2897, "train/ce_loss": 0.8098610043525696 }, { "epoch": 0.28643464504647026, "step": 2897, "train/sim_loss": 0.08203125 }, { "epoch": 0.28643464504647026, "step": 2897, "train/total_loss": 0.1630173623561859 }, { "entropy": 9.426172256469727, "epoch": 0.28653351789598575, "mean_token_accuracy": 0.7753247022628784, "num_tokens": 15091634.0, "step": 2898, "train/ce_loss": 0.95197993516922 }, { "epoch": 0.28653351789598575, "step": 2898, "train/sim_loss": 0.04296875 }, { "epoch": 0.28653351789598575, "step": 2898, "train/total_loss": 0.13816675543785095 }, { "entropy": 9.329826354980469, "epoch": 0.2866323907455013, "mean_token_accuracy": 0.7532981634140015, "num_tokens": 15096781.0, "step": 2899, "train/ce_loss": 0.6716318130493164 }, { "epoch": 0.2866323907455013, "step": 2899, "train/sim_loss": 0.0703125 }, { "epoch": 0.2866323907455013, "step": 2899, "train/total_loss": 0.13747568428516388 }, { "epoch": 0.2867312635950168, "grad_norm": 0.7907260060310364, "learning_rate": 9.285714285714288e-06, "loss": 0.1448, "step": 2900 }, { "entropy": 9.610153198242188, "epoch": 0.2867312635950168, "mean_token_accuracy": 0.7203647494316101, "num_tokens": 15102227.0, "step": 2900, "train/ce_loss": 1.2978801727294922 }, { "epoch": 0.2867312635950168, "step": 2900, "train/sim_loss": 0.0859375 }, { "epoch": 0.2867312635950168, "step": 2900, "train/total_loss": 0.21572552621364594 }, { "entropy": 9.642109870910645, "epoch": 0.2868301364445323, "mean_token_accuracy": 0.745794415473938, "num_tokens": 15107202.0, "step": 2901, "train/ce_loss": 3.4009508453891613e-06 }, { "epoch": 0.2868301364445323, "step": 2901, "train/sim_loss": 0.04296875 }, { "epoch": 0.2868301364445323, "step": 2901, "train/total_loss": 0.04296908900141716 }, { "entropy": 9.003314971923828, "epoch": 0.28692900929404785, "mean_token_accuracy": 0.7480998635292053, "num_tokens": 15112616.0, "step": 2902, "train/ce_loss": 0.8984993696212769 }, { "epoch": 0.28692900929404785, "step": 2902, "train/sim_loss": 0.12109375 }, { "epoch": 0.28692900929404785, "step": 2902, "train/total_loss": 0.21094369888305664 }, { "entropy": 9.450581550598145, "epoch": 0.2870278821435634, "mean_token_accuracy": 0.6834094524383545, "num_tokens": 15117680.0, "step": 2903, "train/ce_loss": 1.4190950393676758 }, { "epoch": 0.2870278821435634, "step": 2903, "train/sim_loss": 0.09375 }, { "epoch": 0.2870278821435634, "step": 2903, "train/total_loss": 0.23565950989723206 }, { "entropy": 9.08843994140625, "epoch": 0.2871267549930789, "mean_token_accuracy": 0.6920199394226074, "num_tokens": 15122975.0, "step": 2904, "train/ce_loss": 1.9272888898849487 }, { "epoch": 0.2871267549930789, "step": 2904, "train/sim_loss": 0.07421875 }, { "epoch": 0.2871267549930789, "step": 2904, "train/total_loss": 0.2669476270675659 }, { "entropy": 9.10743522644043, "epoch": 0.2872256278425944, "mean_token_accuracy": 0.7439903616905212, "num_tokens": 15128284.0, "step": 2905, "train/ce_loss": 0.8795641660690308 }, { "epoch": 0.2872256278425944, "step": 2905, "train/sim_loss": 0.08984375 }, { "epoch": 0.2872256278425944, "step": 2905, "train/total_loss": 0.17780017852783203 }, { "entropy": 9.162273406982422, "epoch": 0.28732450069210996, "mean_token_accuracy": 0.7629009485244751, "num_tokens": 15133392.0, "step": 2906, "train/ce_loss": 0.5848463773727417 }, { "epoch": 0.28732450069210996, "step": 2906, "train/sim_loss": 0.04296875 }, { "epoch": 0.28732450069210996, "step": 2906, "train/total_loss": 0.10145339369773865 }, { "entropy": 9.031187057495117, "epoch": 0.28742337354162545, "mean_token_accuracy": 0.700796365737915, "num_tokens": 15138727.0, "step": 2907, "train/ce_loss": 0.8902770280838013 }, { "epoch": 0.28742337354162545, "step": 2907, "train/sim_loss": 0.0625 }, { "epoch": 0.28742337354162545, "step": 2907, "train/total_loss": 0.15152770280838013 }, { "entropy": 9.404449462890625, "epoch": 0.287522246391141, "mean_token_accuracy": 0.807479202747345, "num_tokens": 15143927.0, "step": 2908, "train/ce_loss": 0.7218267917633057 }, { "epoch": 0.287522246391141, "step": 2908, "train/sim_loss": 0.0234375 }, { "epoch": 0.287522246391141, "step": 2908, "train/total_loss": 0.09562017768621445 }, { "entropy": 9.481965065002441, "epoch": 0.28762111924065653, "mean_token_accuracy": 0.7522796392440796, "num_tokens": 15149001.0, "step": 2909, "train/ce_loss": 4.39267796537024e-06 }, { "epoch": 0.28762111924065653, "step": 2909, "train/sim_loss": 0.0390625 }, { "epoch": 0.28762111924065653, "step": 2909, "train/total_loss": 0.03906293958425522 }, { "entropy": 9.307743072509766, "epoch": 0.287719992090172, "mean_token_accuracy": 0.7609561681747437, "num_tokens": 15154226.0, "step": 2910, "train/ce_loss": 0.664038896560669 }, { "epoch": 0.287719992090172, "step": 2910, "train/sim_loss": 0.0390625 }, { "epoch": 0.287719992090172, "step": 2910, "train/total_loss": 0.10546638816595078 }, { "entropy": 9.571969985961914, "epoch": 0.28781886493968756, "mean_token_accuracy": 0.7578616142272949, "num_tokens": 15159339.0, "step": 2911, "train/ce_loss": 0.739711344242096 }, { "epoch": 0.28781886493968756, "step": 2911, "train/sim_loss": 0.08984375 }, { "epoch": 0.28781886493968756, "step": 2911, "train/total_loss": 0.16381488740444183 }, { "entropy": 9.01880931854248, "epoch": 0.2879177377892031, "mean_token_accuracy": 0.7094017267227173, "num_tokens": 15164629.0, "step": 2912, "train/ce_loss": 0.7169989943504333 }, { "epoch": 0.2879177377892031, "step": 2912, "train/sim_loss": 0.0625 }, { "epoch": 0.2879177377892031, "step": 2912, "train/total_loss": 0.13419990241527557 }, { "entropy": 8.842731475830078, "epoch": 0.2880166106387186, "mean_token_accuracy": 0.6873683929443359, "num_tokens": 15170096.0, "step": 2913, "train/ce_loss": 0.895346462726593 }, { "epoch": 0.2880166106387186, "step": 2913, "train/sim_loss": 0.1484375 }, { "epoch": 0.2880166106387186, "step": 2913, "train/total_loss": 0.23797214031219482 }, { "entropy": 9.337254524230957, "epoch": 0.2881154834882341, "mean_token_accuracy": 0.7176470756530762, "num_tokens": 15175243.0, "step": 2914, "train/ce_loss": 1.541421890258789 }, { "epoch": 0.2881154834882341, "step": 2914, "train/sim_loss": 0.078125 }, { "epoch": 0.2881154834882341, "step": 2914, "train/total_loss": 0.23226718604564667 }, { "entropy": 9.131725311279297, "epoch": 0.28821435633774967, "mean_token_accuracy": 0.715976357460022, "num_tokens": 15180551.0, "step": 2915, "train/ce_loss": 0.9469427466392517 }, { "epoch": 0.28821435633774967, "step": 2915, "train/sim_loss": 0.03515625 }, { "epoch": 0.28821435633774967, "step": 2915, "train/total_loss": 0.12985053658485413 }, { "entropy": 8.793785095214844, "epoch": 0.28831322918726515, "mean_token_accuracy": 0.7376705408096313, "num_tokens": 15185952.0, "step": 2916, "train/ce_loss": 0.4122583568096161 }, { "epoch": 0.28831322918726515, "step": 2916, "train/sim_loss": 0.015625 }, { "epoch": 0.28831322918726515, "step": 2916, "train/total_loss": 0.05685083568096161 }, { "entropy": 9.526094436645508, "epoch": 0.2884121020367807, "mean_token_accuracy": 0.7099999785423279, "num_tokens": 15191005.0, "step": 2917, "train/ce_loss": 1.3818445205688477 }, { "epoch": 0.2884121020367807, "step": 2917, "train/sim_loss": 0.08203125 }, { "epoch": 0.2884121020367807, "step": 2917, "train/total_loss": 0.22021570801734924 }, { "entropy": 9.113298416137695, "epoch": 0.28851097488629623, "mean_token_accuracy": 0.7255594730377197, "num_tokens": 15196322.0, "step": 2918, "train/ce_loss": 0.6022658348083496 }, { "epoch": 0.28851097488629623, "step": 2918, "train/sim_loss": 0.06640625 }, { "epoch": 0.28851097488629623, "step": 2918, "train/total_loss": 0.12663283944129944 }, { "entropy": 9.03260612487793, "epoch": 0.2886098477358117, "mean_token_accuracy": 0.7084826827049255, "num_tokens": 15201584.0, "step": 2919, "train/ce_loss": 1.2507596015930176 }, { "epoch": 0.2886098477358117, "step": 2919, "train/sim_loss": 0.1171875 }, { "epoch": 0.2886098477358117, "step": 2919, "train/total_loss": 0.24226346611976624 }, { "epoch": 0.28870872058532726, "grad_norm": 0.8603032231330872, "learning_rate": 9.280769420956338e-06, "loss": 0.1559, "step": 2920 }, { "entropy": 9.646169662475586, "epoch": 0.28870872058532726, "mean_token_accuracy": 0.71378093957901, "num_tokens": 15206600.0, "step": 2920, "train/ce_loss": 1.6396628618240356 }, { "epoch": 0.28870872058532726, "step": 2920, "train/sim_loss": 0.08984375 }, { "epoch": 0.28870872058532726, "step": 2920, "train/total_loss": 0.2538100481033325 }, { "entropy": 9.373332977294922, "epoch": 0.2888075934348428, "mean_token_accuracy": 0.7517630457878113, "num_tokens": 15211718.0, "step": 2921, "train/ce_loss": 1.1262335777282715 }, { "epoch": 0.2888075934348428, "step": 2921, "train/sim_loss": 0.05859375 }, { "epoch": 0.2888075934348428, "step": 2921, "train/total_loss": 0.17121711373329163 }, { "entropy": 9.027542114257812, "epoch": 0.2889064662843583, "mean_token_accuracy": 0.76949542760849, "num_tokens": 15217065.0, "step": 2922, "train/ce_loss": 1.0222415924072266 }, { "epoch": 0.2889064662843583, "step": 2922, "train/sim_loss": 0.0625 }, { "epoch": 0.2889064662843583, "step": 2922, "train/total_loss": 0.1647241711616516 }, { "entropy": 8.895853996276855, "epoch": 0.28900533913387383, "mean_token_accuracy": 0.7224880456924438, "num_tokens": 15222387.0, "step": 2923, "train/ce_loss": 0.9339638948440552 }, { "epoch": 0.28900533913387383, "step": 2923, "train/sim_loss": 0.0703125 }, { "epoch": 0.28900533913387383, "step": 2923, "train/total_loss": 0.16370889544487 }, { "entropy": 9.915444374084473, "epoch": 0.28910421198338937, "mean_token_accuracy": 0.7452229261398315, "num_tokens": 15227275.0, "step": 2924, "train/ce_loss": 1.3535692691802979 }, { "epoch": 0.28910421198338937, "step": 2924, "train/sim_loss": 0.07421875 }, { "epoch": 0.28910421198338937, "step": 2924, "train/total_loss": 0.20957568287849426 }, { "entropy": 9.574451446533203, "epoch": 0.2892030848329049, "mean_token_accuracy": 0.7394958138465881, "num_tokens": 15232294.0, "step": 2925, "train/ce_loss": 0.8031234741210938 }, { "epoch": 0.2892030848329049, "step": 2925, "train/sim_loss": 0.0625 }, { "epoch": 0.2892030848329049, "step": 2925, "train/total_loss": 0.1428123414516449 }, { "entropy": 9.17717170715332, "epoch": 0.2893019576824204, "mean_token_accuracy": 0.7391842007637024, "num_tokens": 15237572.0, "step": 2926, "train/ce_loss": 0.6628701686859131 }, { "epoch": 0.2893019576824204, "step": 2926, "train/sim_loss": 0.0390625 }, { "epoch": 0.2893019576824204, "step": 2926, "train/total_loss": 0.10534951835870743 }, { "entropy": 9.602263450622559, "epoch": 0.28940083053193594, "mean_token_accuracy": 0.7700170278549194, "num_tokens": 15242616.0, "step": 2927, "train/ce_loss": 1.0057820081710815 }, { "epoch": 0.28940083053193594, "step": 2927, "train/sim_loss": 0.1015625 }, { "epoch": 0.28940083053193594, "step": 2927, "train/total_loss": 0.2021407037973404 }, { "entropy": 9.086362838745117, "epoch": 0.2894997033814515, "mean_token_accuracy": 0.6846733689308167, "num_tokens": 15247803.0, "step": 2928, "train/ce_loss": 0.859923779964447 }, { "epoch": 0.2894997033814515, "step": 2928, "train/sim_loss": 0.078125 }, { "epoch": 0.2894997033814515, "step": 2928, "train/total_loss": 0.16411738097667694 }, { "entropy": 9.409735679626465, "epoch": 0.28959857623096696, "mean_token_accuracy": 0.7667140960693359, "num_tokens": 15252974.0, "step": 2929, "train/ce_loss": 0.8224440217018127 }, { "epoch": 0.28959857623096696, "step": 2929, "train/sim_loss": 0.02734375 }, { "epoch": 0.28959857623096696, "step": 2929, "train/total_loss": 0.1095881536602974 }, { "entropy": 9.117687225341797, "epoch": 0.2896974490804825, "mean_token_accuracy": 0.7347995042800903, "num_tokens": 15258217.0, "step": 2930, "train/ce_loss": 0.755094587802887 }, { "epoch": 0.2896974490804825, "step": 2930, "train/sim_loss": 0.04296875 }, { "epoch": 0.2896974490804825, "step": 2930, "train/total_loss": 0.1184782087802887 }, { "entropy": 9.611143112182617, "epoch": 0.28979632192999805, "mean_token_accuracy": 0.686274528503418, "num_tokens": 15263206.0, "step": 2931, "train/ce_loss": 0.620786190032959 }, { "epoch": 0.28979632192999805, "step": 2931, "train/sim_loss": 0.0859375 }, { "epoch": 0.28979632192999805, "step": 2931, "train/total_loss": 0.14801612496376038 }, { "entropy": 8.850191116333008, "epoch": 0.28989519477951353, "mean_token_accuracy": 0.7307692170143127, "num_tokens": 15268699.0, "step": 2932, "train/ce_loss": 0.6986292004585266 }, { "epoch": 0.28989519477951353, "step": 2932, "train/sim_loss": 0.02734375 }, { "epoch": 0.28989519477951353, "step": 2932, "train/total_loss": 0.09720667451620102 }, { "entropy": 9.25748348236084, "epoch": 0.2899940676290291, "mean_token_accuracy": 0.7631579041481018, "num_tokens": 15273784.0, "step": 2933, "train/ce_loss": 1.0593773126602173 }, { "epoch": 0.2899940676290291, "step": 2933, "train/sim_loss": 0.05859375 }, { "epoch": 0.2899940676290291, "step": 2933, "train/total_loss": 0.16453148424625397 }, { "entropy": 9.362494468688965, "epoch": 0.2900929404785446, "mean_token_accuracy": 0.7336010932922363, "num_tokens": 15278971.0, "step": 2934, "train/ce_loss": 0.6346144676208496 }, { "epoch": 0.2900929404785446, "step": 2934, "train/sim_loss": 0.078125 }, { "epoch": 0.2900929404785446, "step": 2934, "train/total_loss": 0.14158645272254944 }, { "entropy": 8.851987838745117, "epoch": 0.2901918133280601, "mean_token_accuracy": 0.7468220591545105, "num_tokens": 15284426.0, "step": 2935, "train/ce_loss": 0.7906885743141174 }, { "epoch": 0.2901918133280601, "step": 2935, "train/sim_loss": 0.04296875 }, { "epoch": 0.2901918133280601, "step": 2935, "train/total_loss": 0.1220376119017601 }, { "entropy": 9.00190544128418, "epoch": 0.29029068617757564, "mean_token_accuracy": 0.6767337918281555, "num_tokens": 15289749.0, "step": 2936, "train/ce_loss": 1.0030051469802856 }, { "epoch": 0.29029068617757564, "step": 2936, "train/sim_loss": 0.09375 }, { "epoch": 0.29029068617757564, "step": 2936, "train/total_loss": 0.19405052065849304 }, { "entropy": 8.663482666015625, "epoch": 0.2903895590270912, "mean_token_accuracy": 0.7568134069442749, "num_tokens": 15295187.0, "step": 2937, "train/ce_loss": 1.220179557800293 }, { "epoch": 0.2903895590270912, "step": 2937, "train/sim_loss": 0.05859375 }, { "epoch": 0.2903895590270912, "step": 2937, "train/total_loss": 0.18061169981956482 }, { "entropy": 9.416010856628418, "epoch": 0.29048843187660667, "mean_token_accuracy": 0.6734992861747742, "num_tokens": 15300387.0, "step": 2938, "train/ce_loss": 1.1120859384536743 }, { "epoch": 0.29048843187660667, "step": 2938, "train/sim_loss": 0.0546875 }, { "epoch": 0.29048843187660667, "step": 2938, "train/total_loss": 0.16589608788490295 }, { "entropy": 8.690788269042969, "epoch": 0.2905873047261222, "mean_token_accuracy": 0.7480403184890747, "num_tokens": 15305747.0, "step": 2939, "train/ce_loss": 0.9094756245613098 }, { "epoch": 0.2905873047261222, "step": 2939, "train/sim_loss": 0.03515625 }, { "epoch": 0.2905873047261222, "step": 2939, "train/total_loss": 0.12610381841659546 }, { "epoch": 0.29068617757563775, "grad_norm": 0.9364753365516663, "learning_rate": 9.275824556198389e-06, "loss": 0.1539, "step": 2940 }, { "entropy": 9.060997009277344, "epoch": 0.29068617757563775, "mean_token_accuracy": 0.7299363017082214, "num_tokens": 15310994.0, "step": 2940, "train/ce_loss": 0.6110745668411255 }, { "epoch": 0.29068617757563775, "step": 2940, "train/sim_loss": 0.10546875 }, { "epoch": 0.29068617757563775, "step": 2940, "train/total_loss": 0.16657620668411255 }, { "entropy": 9.382147789001465, "epoch": 0.29078505042515324, "mean_token_accuracy": 0.716617226600647, "num_tokens": 15316134.0, "step": 2941, "train/ce_loss": 1.2948169708251953 }, { "epoch": 0.29078505042515324, "step": 2941, "train/sim_loss": 0.0703125 }, { "epoch": 0.29078505042515324, "step": 2941, "train/total_loss": 0.199794203042984 }, { "entropy": 8.88481616973877, "epoch": 0.2908839232746688, "mean_token_accuracy": 0.6993464231491089, "num_tokens": 15321475.0, "step": 2942, "train/ce_loss": 0.6280501484870911 }, { "epoch": 0.2908839232746688, "step": 2942, "train/sim_loss": 0.05078125 }, { "epoch": 0.2908839232746688, "step": 2942, "train/total_loss": 0.11358626931905746 }, { "entropy": 9.055749893188477, "epoch": 0.2909827961241843, "mean_token_accuracy": 0.7310513257980347, "num_tokens": 15326761.0, "step": 2943, "train/ce_loss": 0.5801316499710083 }, { "epoch": 0.2909827961241843, "step": 2943, "train/sim_loss": 0.0546875 }, { "epoch": 0.2909827961241843, "step": 2943, "train/total_loss": 0.11270067095756531 }, { "entropy": 9.614754676818848, "epoch": 0.2910816689736998, "mean_token_accuracy": 0.6486486196517944, "num_tokens": 15331621.0, "step": 2944, "train/ce_loss": 3.4918501377105713 }, { "epoch": 0.2910816689736998, "step": 2944, "train/sim_loss": 0.1328125 }, { "epoch": 0.2910816689736998, "step": 2944, "train/total_loss": 0.4819975197315216 }, { "entropy": 9.08978271484375, "epoch": 0.29118054182321534, "mean_token_accuracy": 0.7785235047340393, "num_tokens": 15336991.0, "step": 2945, "train/ce_loss": 0.48373153805732727 }, { "epoch": 0.29118054182321534, "step": 2945, "train/sim_loss": 0.01953125 }, { "epoch": 0.29118054182321534, "step": 2945, "train/total_loss": 0.06790440529584885 }, { "entropy": 8.795520782470703, "epoch": 0.2912794146727309, "mean_token_accuracy": 0.7587511539459229, "num_tokens": 15342530.0, "step": 2946, "train/ce_loss": 0.6220370531082153 }, { "epoch": 0.2912794146727309, "step": 2946, "train/sim_loss": 0.0703125 }, { "epoch": 0.2912794146727309, "step": 2946, "train/total_loss": 0.13251620531082153 }, { "entropy": 9.074284553527832, "epoch": 0.29137828752224637, "mean_token_accuracy": 0.6840193867683411, "num_tokens": 15347811.0, "step": 2947, "train/ce_loss": 1.5848338603973389 }, { "epoch": 0.29137828752224637, "step": 2947, "train/sim_loss": 0.05859375 }, { "epoch": 0.29137828752224637, "step": 2947, "train/total_loss": 0.2170771360397339 }, { "entropy": 9.856287956237793, "epoch": 0.2914771603717619, "mean_token_accuracy": 0.7050691246986389, "num_tokens": 15352696.0, "step": 2948, "train/ce_loss": 0.8591023087501526 }, { "epoch": 0.2914771603717619, "step": 2948, "train/sim_loss": 0.046875 }, { "epoch": 0.2914771603717619, "step": 2948, "train/total_loss": 0.13278523087501526 }, { "entropy": 9.76569938659668, "epoch": 0.29157603322127745, "mean_token_accuracy": 0.7038834691047668, "num_tokens": 15357498.0, "step": 2949, "train/ce_loss": 2.4213409423828125 }, { "epoch": 0.29157603322127745, "step": 2949, "train/sim_loss": 0.1015625 }, { "epoch": 0.29157603322127745, "step": 2949, "train/total_loss": 0.34369659423828125 }, { "entropy": 9.00585651397705, "epoch": 0.29167490607079294, "mean_token_accuracy": 0.7393422722816467, "num_tokens": 15362774.0, "step": 2950, "train/ce_loss": 1.1268589496612549 }, { "epoch": 0.29167490607079294, "step": 2950, "train/sim_loss": 0.08203125 }, { "epoch": 0.29167490607079294, "step": 2950, "train/total_loss": 0.194717139005661 }, { "entropy": 9.489755630493164, "epoch": 0.2917737789203085, "mean_token_accuracy": 0.7415929436683655, "num_tokens": 15367806.0, "step": 2951, "train/ce_loss": 5.05991420141072e-06 }, { "epoch": 0.2917737789203085, "step": 2951, "train/sim_loss": 0.046875 }, { "epoch": 0.2917737789203085, "step": 2951, "train/total_loss": 0.04687550663948059 }, { "entropy": 8.950197219848633, "epoch": 0.291872651769824, "mean_token_accuracy": 0.7437020540237427, "num_tokens": 15373187.0, "step": 2952, "train/ce_loss": 0.34434279799461365 }, { "epoch": 0.291872651769824, "step": 2952, "train/sim_loss": 0.0703125 }, { "epoch": 0.291872651769824, "step": 2952, "train/total_loss": 0.10474678128957748 }, { "entropy": 9.185539245605469, "epoch": 0.2919715246193395, "mean_token_accuracy": 0.7609289884567261, "num_tokens": 15378396.0, "step": 2953, "train/ce_loss": 0.8392499089241028 }, { "epoch": 0.2919715246193395, "step": 2953, "train/sim_loss": 0.04296875 }, { "epoch": 0.2919715246193395, "step": 2953, "train/total_loss": 0.12689374387264252 }, { "entropy": 9.294303894042969, "epoch": 0.29207039746885505, "mean_token_accuracy": 0.7472527623176575, "num_tokens": 15383579.0, "step": 2954, "train/ce_loss": 1.0456651449203491 }, { "epoch": 0.29207039746885505, "step": 2954, "train/sim_loss": 0.02734375 }, { "epoch": 0.29207039746885505, "step": 2954, "train/total_loss": 0.1319102644920349 }, { "entropy": 9.182682037353516, "epoch": 0.2921692703183706, "mean_token_accuracy": 0.7450722455978394, "num_tokens": 15388851.0, "step": 2955, "train/ce_loss": 7.724240276729688e-05 }, { "epoch": 0.2921692703183706, "step": 2955, "train/sim_loss": 0.046875 }, { "epoch": 0.2921692703183706, "step": 2955, "train/total_loss": 0.04688272252678871 }, { "entropy": 9.01830005645752, "epoch": 0.2922681431678861, "mean_token_accuracy": 0.7248018383979797, "num_tokens": 15394197.0, "step": 2956, "train/ce_loss": 0.949018657207489 }, { "epoch": 0.2922681431678861, "step": 2956, "train/sim_loss": 0.109375 }, { "epoch": 0.2922681431678861, "step": 2956, "train/total_loss": 0.20427685976028442 }, { "entropy": 9.368885040283203, "epoch": 0.2923670160174016, "mean_token_accuracy": 0.7099023461341858, "num_tokens": 15399370.0, "step": 2957, "train/ce_loss": 1.0313037633895874 }, { "epoch": 0.2923670160174016, "step": 2957, "train/sim_loss": 0.046875 }, { "epoch": 0.2923670160174016, "step": 2957, "train/total_loss": 0.15000537037849426 }, { "entropy": 9.373347282409668, "epoch": 0.29246588886691716, "mean_token_accuracy": 0.704827606678009, "num_tokens": 15404561.0, "step": 2958, "train/ce_loss": 0.7711067795753479 }, { "epoch": 0.29246588886691716, "step": 2958, "train/sim_loss": 0.05078125 }, { "epoch": 0.29246588886691716, "step": 2958, "train/total_loss": 0.1278919279575348 }, { "entropy": 9.563098907470703, "epoch": 0.29256476171643264, "mean_token_accuracy": 0.7034220695495605, "num_tokens": 15409563.0, "step": 2959, "train/ce_loss": 0.7286401987075806 }, { "epoch": 0.29256476171643264, "step": 2959, "train/sim_loss": 0.0859375 }, { "epoch": 0.29256476171643264, "step": 2959, "train/total_loss": 0.15880152583122253 }, { "epoch": 0.2926636345659482, "grad_norm": 1.0534138679504395, "learning_rate": 9.270879691440439e-06, "loss": 0.1555, "step": 2960 }, { "entropy": 9.002914428710938, "epoch": 0.2926636345659482, "mean_token_accuracy": 0.7153284549713135, "num_tokens": 15414801.0, "step": 2960, "train/ce_loss": 0.8889736533164978 }, { "epoch": 0.2926636345659482, "step": 2960, "train/sim_loss": 0.046875 }, { "epoch": 0.2926636345659482, "step": 2960, "train/total_loss": 0.13577237725257874 }, { "entropy": 9.103862762451172, "epoch": 0.2927625074154637, "mean_token_accuracy": 0.7592829465866089, "num_tokens": 15420053.0, "step": 2961, "train/ce_loss": 0.4983813464641571 }, { "epoch": 0.2927625074154637, "step": 2961, "train/sim_loss": 0.0703125 }, { "epoch": 0.2927625074154637, "step": 2961, "train/total_loss": 0.12015064060688019 }, { "entropy": 9.445127487182617, "epoch": 0.2928613802649792, "mean_token_accuracy": 0.7324561476707458, "num_tokens": 15425156.0, "step": 2962, "train/ce_loss": 0.8944116830825806 }, { "epoch": 0.2928613802649792, "step": 2962, "train/sim_loss": 0.07421875 }, { "epoch": 0.2928613802649792, "step": 2962, "train/total_loss": 0.163659930229187 }, { "entropy": 9.726035118103027, "epoch": 0.29296025311449475, "mean_token_accuracy": 0.6761363744735718, "num_tokens": 15430123.0, "step": 2963, "train/ce_loss": 4.313818408263614e-06 }, { "epoch": 0.29296025311449475, "step": 2963, "train/sim_loss": 0.01953125 }, { "epoch": 0.29296025311449475, "step": 2963, "train/total_loss": 0.01953168213367462 }, { "entropy": 9.181549072265625, "epoch": 0.2930591259640103, "mean_token_accuracy": 0.7400768399238586, "num_tokens": 15435323.0, "step": 2964, "train/ce_loss": 0.5938209891319275 }, { "epoch": 0.2930591259640103, "step": 2964, "train/sim_loss": 0.04296875 }, { "epoch": 0.2930591259640103, "step": 2964, "train/total_loss": 0.10235084593296051 }, { "entropy": 9.563488006591797, "epoch": 0.29315799881352583, "mean_token_accuracy": 0.7197986841201782, "num_tokens": 15440341.0, "step": 2965, "train/ce_loss": 1.5780949592590332 }, { "epoch": 0.29315799881352583, "step": 2965, "train/sim_loss": 0.11328125 }, { "epoch": 0.29315799881352583, "step": 2965, "train/total_loss": 0.2710907459259033 }, { "entropy": 9.043390274047852, "epoch": 0.2932568716630413, "mean_token_accuracy": 0.7263948321342468, "num_tokens": 15445852.0, "step": 2966, "train/ce_loss": 5.926921858190326e-06 }, { "epoch": 0.2932568716630413, "step": 2966, "train/sim_loss": 0.04296875 }, { "epoch": 0.2932568716630413, "step": 2966, "train/total_loss": 0.042969342321157455 }, { "entropy": 9.691654205322266, "epoch": 0.29335574451255686, "mean_token_accuracy": 0.734375, "num_tokens": 15450772.0, "step": 2967, "train/ce_loss": 1.099898099899292 }, { "epoch": 0.29335574451255686, "step": 2967, "train/sim_loss": 0.08203125 }, { "epoch": 0.29335574451255686, "step": 2967, "train/total_loss": 0.19202107191085815 }, { "entropy": 9.398199081420898, "epoch": 0.2934546173620724, "mean_token_accuracy": 0.7475317120552063, "num_tokens": 15455899.0, "step": 2968, "train/ce_loss": 5.713059636036633e-06 }, { "epoch": 0.2934546173620724, "step": 2968, "train/sim_loss": 0.03125 }, { "epoch": 0.2934546173620724, "step": 2968, "train/total_loss": 0.031250569969415665 }, { "entropy": 9.10877799987793, "epoch": 0.2935534902115879, "mean_token_accuracy": 0.7523696422576904, "num_tokens": 15461222.0, "step": 2969, "train/ce_loss": 0.6074536442756653 }, { "epoch": 0.2935534902115879, "step": 2969, "train/sim_loss": 0.078125 }, { "epoch": 0.2935534902115879, "step": 2969, "train/total_loss": 0.13887035846710205 }, { "entropy": 8.610416412353516, "epoch": 0.2936523630611034, "mean_token_accuracy": 0.7567567825317383, "num_tokens": 15466487.0, "step": 2970, "train/ce_loss": 1.0411075353622437 }, { "epoch": 0.2936523630611034, "step": 2970, "train/sim_loss": 0.07421875 }, { "epoch": 0.2936523630611034, "step": 2970, "train/total_loss": 0.1783294975757599 }, { "entropy": 9.788820266723633, "epoch": 0.29375123591061897, "mean_token_accuracy": 0.7931034564971924, "num_tokens": 15471259.0, "step": 2971, "train/ce_loss": 1.6351121664047241 }, { "epoch": 0.29375123591061897, "step": 2971, "train/sim_loss": 0.0625 }, { "epoch": 0.29375123591061897, "step": 2971, "train/total_loss": 0.2260112166404724 }, { "entropy": 8.552578926086426, "epoch": 0.29385010876013445, "mean_token_accuracy": 0.717597484588623, "num_tokens": 15476722.0, "step": 2972, "train/ce_loss": 1.4389077425003052 }, { "epoch": 0.29385010876013445, "step": 2972, "train/sim_loss": 0.07421875 }, { "epoch": 0.29385010876013445, "step": 2972, "train/total_loss": 0.21810953319072723 }, { "entropy": 9.41911506652832, "epoch": 0.29394898160965, "mean_token_accuracy": 0.7859424948692322, "num_tokens": 15481802.0, "step": 2973, "train/ce_loss": 0.8379352688789368 }, { "epoch": 0.29394898160965, "step": 2973, "train/sim_loss": 0.03515625 }, { "epoch": 0.29394898160965, "step": 2973, "train/total_loss": 0.1189497783780098 }, { "entropy": 9.36585807800293, "epoch": 0.29404785445916554, "mean_token_accuracy": 0.7714748978614807, "num_tokens": 15486875.0, "step": 2974, "train/ce_loss": 1.2797170877456665 }, { "epoch": 0.29404785445916554, "step": 2974, "train/sim_loss": 0.0859375 }, { "epoch": 0.29404785445916554, "step": 2974, "train/total_loss": 0.21390920877456665 }, { "entropy": 8.743827819824219, "epoch": 0.294146727308681, "mean_token_accuracy": 0.7288317084312439, "num_tokens": 15492311.0, "step": 2975, "train/ce_loss": 0.7437622547149658 }, { "epoch": 0.294146727308681, "step": 2975, "train/sim_loss": 0.0546875 }, { "epoch": 0.294146727308681, "step": 2975, "train/total_loss": 0.12906372547149658 }, { "entropy": 9.55495834350586, "epoch": 0.29424560015819656, "mean_token_accuracy": 0.7990115284919739, "num_tokens": 15497382.0, "step": 2976, "train/ce_loss": 0.7216197848320007 }, { "epoch": 0.29424560015819656, "step": 2976, "train/sim_loss": 0.01953125 }, { "epoch": 0.29424560015819656, "step": 2976, "train/total_loss": 0.09169322997331619 }, { "entropy": 9.320695877075195, "epoch": 0.2943444730077121, "mean_token_accuracy": 0.6904177069664001, "num_tokens": 15502633.0, "step": 2977, "train/ce_loss": 1.0655757188796997 }, { "epoch": 0.2943444730077121, "step": 2977, "train/sim_loss": 0.1015625 }, { "epoch": 0.2943444730077121, "step": 2977, "train/total_loss": 0.20812007784843445 }, { "entropy": 9.47970199584961, "epoch": 0.2944433458572276, "mean_token_accuracy": 0.7394495606422424, "num_tokens": 15507596.0, "step": 2978, "train/ce_loss": 4.474019533518003e-06 }, { "epoch": 0.2944433458572276, "step": 2978, "train/sim_loss": 0.0625 }, { "epoch": 0.2944433458572276, "step": 2978, "train/total_loss": 0.06250044703483582 }, { "entropy": 9.713456153869629, "epoch": 0.29454221870674313, "mean_token_accuracy": 0.7357001900672913, "num_tokens": 15512496.0, "step": 2979, "train/ce_loss": 1.6623332500457764 }, { "epoch": 0.29454221870674313, "step": 2979, "train/sim_loss": 0.0703125 }, { "epoch": 0.29454221870674313, "step": 2979, "train/total_loss": 0.23654583096504211 }, { "epoch": 0.29464109155625867, "grad_norm": 0.9678965210914612, "learning_rate": 9.265934826682491e-06, "loss": 0.1494, "step": 2980 }, { "entropy": 9.160664558410645, "epoch": 0.29464109155625867, "mean_token_accuracy": 0.7824324369430542, "num_tokens": 15517731.0, "step": 2980, "train/ce_loss": 0.5237296223640442 }, { "epoch": 0.29464109155625867, "step": 2980, "train/sim_loss": 0.0546875 }, { "epoch": 0.29464109155625867, "step": 2980, "train/total_loss": 0.10706046223640442 }, { "entropy": 9.939748764038086, "epoch": 0.29473996440577416, "mean_token_accuracy": 0.7386363744735718, "num_tokens": 15522727.0, "step": 2981, "train/ce_loss": 2.172889471054077 }, { "epoch": 0.29473996440577416, "step": 2981, "train/sim_loss": 0.02734375 }, { "epoch": 0.29473996440577416, "step": 2981, "train/total_loss": 0.24463270604610443 }, { "entropy": 8.616790771484375, "epoch": 0.2948388372552897, "mean_token_accuracy": 0.7509191036224365, "num_tokens": 15528327.0, "step": 2982, "train/ce_loss": 1.0044151544570923 }, { "epoch": 0.2948388372552897, "step": 2982, "train/sim_loss": 0.109375 }, { "epoch": 0.2948388372552897, "step": 2982, "train/total_loss": 0.20981651544570923 }, { "entropy": 9.805368423461914, "epoch": 0.29493771010480524, "mean_token_accuracy": 0.680898904800415, "num_tokens": 15533212.0, "step": 2983, "train/ce_loss": 1.1280209037067834e-05 }, { "epoch": 0.29493771010480524, "step": 2983, "train/sim_loss": 0.0390625 }, { "epoch": 0.29493771010480524, "step": 2983, "train/total_loss": 0.039063628762960434 }, { "entropy": 9.743290901184082, "epoch": 0.2950365829543207, "mean_token_accuracy": 0.7201565504074097, "num_tokens": 15538143.0, "step": 2984, "train/ce_loss": 0.9904972910881042 }, { "epoch": 0.2950365829543207, "step": 2984, "train/sim_loss": 0.04296875 }, { "epoch": 0.2950365829543207, "step": 2984, "train/total_loss": 0.14201848208904266 }, { "entropy": 9.05086898803711, "epoch": 0.29513545580383627, "mean_token_accuracy": 0.746760904788971, "num_tokens": 15543493.0, "step": 2985, "train/ce_loss": 0.6934479475021362 }, { "epoch": 0.29513545580383627, "step": 2985, "train/sim_loss": 0.15625 }, { "epoch": 0.29513545580383627, "step": 2985, "train/total_loss": 0.22559478878974915 }, { "entropy": 9.10383415222168, "epoch": 0.2952343286533518, "mean_token_accuracy": 0.7514654397964478, "num_tokens": 15548867.0, "step": 2986, "train/ce_loss": 1.0702656507492065 }, { "epoch": 0.2952343286533518, "step": 2986, "train/sim_loss": 0.0703125 }, { "epoch": 0.2952343286533518, "step": 2986, "train/total_loss": 0.1773390769958496 }, { "entropy": 9.471105575561523, "epoch": 0.2953332015028673, "mean_token_accuracy": 0.7421758770942688, "num_tokens": 15553924.0, "step": 2987, "train/ce_loss": 0.9849151372909546 }, { "epoch": 0.2953332015028673, "step": 2987, "train/sim_loss": 0.05859375 }, { "epoch": 0.2953332015028673, "step": 2987, "train/total_loss": 0.15708526968955994 }, { "entropy": 9.303641319274902, "epoch": 0.29543207435238283, "mean_token_accuracy": 0.7553865909576416, "num_tokens": 15559154.0, "step": 2988, "train/ce_loss": 1.1377649307250977 }, { "epoch": 0.29543207435238283, "step": 2988, "train/sim_loss": 0.08984375 }, { "epoch": 0.29543207435238283, "step": 2988, "train/total_loss": 0.20362025499343872 }, { "entropy": 9.135931015014648, "epoch": 0.2955309472018984, "mean_token_accuracy": 0.7256637215614319, "num_tokens": 15564441.0, "step": 2989, "train/ce_loss": 0.7426436543464661 }, { "epoch": 0.2955309472018984, "step": 2989, "train/sim_loss": 0.08203125 }, { "epoch": 0.2955309472018984, "step": 2989, "train/total_loss": 0.15629562735557556 }, { "entropy": 9.353001594543457, "epoch": 0.29562982005141386, "mean_token_accuracy": 0.749576985836029, "num_tokens": 15569484.0, "step": 2990, "train/ce_loss": 1.0848103761672974 }, { "epoch": 0.29562982005141386, "step": 2990, "train/sim_loss": 0.09375 }, { "epoch": 0.29562982005141386, "step": 2990, "train/total_loss": 0.2022310495376587 }, { "entropy": 9.00560474395752, "epoch": 0.2957286929009294, "mean_token_accuracy": 0.7407878041267395, "num_tokens": 15574769.0, "step": 2991, "train/ce_loss": 0.5350890159606934 }, { "epoch": 0.2957286929009294, "step": 2991, "train/sim_loss": 0.0625 }, { "epoch": 0.2957286929009294, "step": 2991, "train/total_loss": 0.11600890755653381 }, { "entropy": 9.298500061035156, "epoch": 0.29582756575044494, "mean_token_accuracy": 0.71659916639328, "num_tokens": 15579900.0, "step": 2992, "train/ce_loss": 1.0368627309799194 }, { "epoch": 0.29582756575044494, "step": 2992, "train/sim_loss": 0.05078125 }, { "epoch": 0.29582756575044494, "step": 2992, "train/total_loss": 0.15446752309799194 }, { "entropy": 10.281341552734375, "epoch": 0.29592643859996043, "mean_token_accuracy": 0.6860068440437317, "num_tokens": 15584585.0, "step": 2993, "train/ce_loss": 1.00157221822883e-05 }, { "epoch": 0.29592643859996043, "step": 2993, "train/sim_loss": 0.0703125 }, { "epoch": 0.29592643859996043, "step": 2993, "train/total_loss": 0.07031349837779999 }, { "entropy": 8.819000244140625, "epoch": 0.29602531144947597, "mean_token_accuracy": 0.6941176652908325, "num_tokens": 15589974.0, "step": 2994, "train/ce_loss": 0.7955414056777954 }, { "epoch": 0.29602531144947597, "step": 2994, "train/sim_loss": 0.0390625 }, { "epoch": 0.29602531144947597, "step": 2994, "train/total_loss": 0.11861664056777954 }, { "entropy": 8.97614574432373, "epoch": 0.2961241842989915, "mean_token_accuracy": 0.6965174078941345, "num_tokens": 15595279.0, "step": 2995, "train/ce_loss": 0.32965347170829773 }, { "epoch": 0.2961241842989915, "step": 2995, "train/sim_loss": 0.01953125 }, { "epoch": 0.2961241842989915, "step": 2995, "train/total_loss": 0.05249659717082977 }, { "entropy": 9.65750503540039, "epoch": 0.296223057148507, "mean_token_accuracy": 0.739130437374115, "num_tokens": 15600178.0, "step": 2996, "train/ce_loss": 4.405275831231847e-06 }, { "epoch": 0.296223057148507, "step": 2996, "train/sim_loss": 0.01953125 }, { "epoch": 0.296223057148507, "step": 2996, "train/total_loss": 0.019531691446900368 }, { "entropy": 9.083763122558594, "epoch": 0.29632192999802254, "mean_token_accuracy": 0.7827102541923523, "num_tokens": 15605535.0, "step": 2997, "train/ce_loss": 0.523099958896637 }, { "epoch": 0.29632192999802254, "step": 2997, "train/sim_loss": 0.0625 }, { "epoch": 0.29632192999802254, "step": 2997, "train/total_loss": 0.11480999737977982 }, { "entropy": 9.692913055419922, "epoch": 0.2964208028475381, "mean_token_accuracy": 0.7287522554397583, "num_tokens": 15610637.0, "step": 2998, "train/ce_loss": 2.1075310707092285 }, { "epoch": 0.2964208028475381, "step": 2998, "train/sim_loss": 0.1328125 }, { "epoch": 0.2964208028475381, "step": 2998, "train/total_loss": 0.34356561303138733 }, { "entropy": 9.991308212280273, "epoch": 0.29651967569705356, "mean_token_accuracy": 0.6886075735092163, "num_tokens": 15615399.0, "step": 2999, "train/ce_loss": 5.973896350042196e-06 }, { "epoch": 0.29651967569705356, "step": 2999, "train/sim_loss": 0.0625 }, { "epoch": 0.29651967569705356, "step": 2999, "train/total_loss": 0.06250059604644775 }, { "epoch": 0.2966185485465691, "grad_norm": 0.9837698340415955, "learning_rate": 9.260989961924542e-06, "loss": 0.151, "step": 3000 }, { "entropy": 8.74220085144043, "epoch": 0.2966185485465691, "mean_token_accuracy": 0.7542277574539185, "num_tokens": 15620771.0, "step": 3000, "train/ce_loss": 1.4459656476974487 }, { "epoch": 0.2966185485465691, "step": 3000, "train/sim_loss": 0.0859375 }, { "epoch": 0.2966185485465691, "step": 3000, "train/total_loss": 0.23053406178951263 }, { "entropy": 9.76188850402832, "epoch": 0.29671742139608465, "mean_token_accuracy": 0.7573964595794678, "num_tokens": 15625897.0, "step": 3001, "train/ce_loss": 7.043448931653984e-06 }, { "epoch": 0.29671742139608465, "step": 3001, "train/sim_loss": 0.05859375 }, { "epoch": 0.29671742139608465, "step": 3001, "train/total_loss": 0.05859445407986641 }, { "entropy": 8.959300994873047, "epoch": 0.29681629424560013, "mean_token_accuracy": 0.6892911195755005, "num_tokens": 15631045.0, "step": 3002, "train/ce_loss": 1.4792759429838043e-05 }, { "epoch": 0.29681629424560013, "step": 3002, "train/sim_loss": 0.046875 }, { "epoch": 0.29681629424560013, "step": 3002, "train/total_loss": 0.04687647894024849 }, { "entropy": 8.936592102050781, "epoch": 0.2969151670951157, "mean_token_accuracy": 0.723192036151886, "num_tokens": 15636344.0, "step": 3003, "train/ce_loss": 0.9106271862983704 }, { "epoch": 0.2969151670951157, "step": 3003, "train/sim_loss": 0.03515625 }, { "epoch": 0.2969151670951157, "step": 3003, "train/total_loss": 0.1262189745903015 }, { "entropy": 9.652338027954102, "epoch": 0.2970140399446312, "mean_token_accuracy": 0.709193229675293, "num_tokens": 15641357.0, "step": 3004, "train/ce_loss": 0.7016077041625977 }, { "epoch": 0.2970140399446312, "step": 3004, "train/sim_loss": 0.0625 }, { "epoch": 0.2970140399446312, "step": 3004, "train/total_loss": 0.13266077637672424 }, { "entropy": 9.529338836669922, "epoch": 0.2971129127941467, "mean_token_accuracy": 0.7367576360702515, "num_tokens": 15646415.0, "step": 3005, "train/ce_loss": 4.004227321274811e-06 }, { "epoch": 0.2971129127941467, "step": 3005, "train/sim_loss": 0.078125 }, { "epoch": 0.2971129127941467, "step": 3005, "train/total_loss": 0.07812540233135223 }, { "entropy": 9.7982177734375, "epoch": 0.29721178564366224, "mean_token_accuracy": 0.6998341679573059, "num_tokens": 15651450.0, "step": 3006, "train/ce_loss": 1.2795770168304443 }, { "epoch": 0.29721178564366224, "step": 3006, "train/sim_loss": 0.046875 }, { "epoch": 0.29721178564366224, "step": 3006, "train/total_loss": 0.17483270168304443 }, { "entropy": 9.549211502075195, "epoch": 0.2973106584931778, "mean_token_accuracy": 0.7585585713386536, "num_tokens": 15656415.0, "step": 3007, "train/ce_loss": 1.0237561464309692 }, { "epoch": 0.2973106584931778, "step": 3007, "train/sim_loss": 0.046875 }, { "epoch": 0.2973106584931778, "step": 3007, "train/total_loss": 0.14925062656402588 }, { "entropy": 9.022342681884766, "epoch": 0.2974095313426933, "mean_token_accuracy": 0.7371967434883118, "num_tokens": 15661813.0, "step": 3008, "train/ce_loss": 0.7531060576438904 }, { "epoch": 0.2974095313426933, "step": 3008, "train/sim_loss": 0.109375 }, { "epoch": 0.2974095313426933, "step": 3008, "train/total_loss": 0.184685617685318 }, { "entropy": 9.182098388671875, "epoch": 0.2975084041922088, "mean_token_accuracy": 0.7335957884788513, "num_tokens": 15667055.0, "step": 3009, "train/ce_loss": 1.872267723083496 }, { "epoch": 0.2975084041922088, "step": 3009, "train/sim_loss": 0.0703125 }, { "epoch": 0.2975084041922088, "step": 3009, "train/total_loss": 0.2575392723083496 }, { "entropy": 9.481335639953613, "epoch": 0.29760727704172435, "mean_token_accuracy": 0.7445141077041626, "num_tokens": 15672150.0, "step": 3010, "train/ce_loss": 4.337508471508045e-06 }, { "epoch": 0.29760727704172435, "step": 3010, "train/sim_loss": 0.0546875 }, { "epoch": 0.29760727704172435, "step": 3010, "train/total_loss": 0.05468793213367462 }, { "entropy": 9.197444915771484, "epoch": 0.2977061498912399, "mean_token_accuracy": 0.7210718393325806, "num_tokens": 15677443.0, "step": 3011, "train/ce_loss": 1.0222855806350708 }, { "epoch": 0.2977061498912399, "step": 3011, "train/sim_loss": 0.109375 }, { "epoch": 0.2977061498912399, "step": 3011, "train/total_loss": 0.2116035521030426 }, { "entropy": 8.939278602600098, "epoch": 0.2978050227407554, "mean_token_accuracy": 0.7448036670684814, "num_tokens": 15682864.0, "step": 3012, "train/ce_loss": 0.4343223571777344 }, { "epoch": 0.2978050227407554, "step": 3012, "train/sim_loss": 0.08984375 }, { "epoch": 0.2978050227407554, "step": 3012, "train/total_loss": 0.13327598571777344 }, { "entropy": 9.049263000488281, "epoch": 0.2979038955902709, "mean_token_accuracy": 0.7024901509284973, "num_tokens": 15688110.0, "step": 3013, "train/ce_loss": 0.6559205651283264 }, { "epoch": 0.2979038955902709, "step": 3013, "train/sim_loss": 0.07421875 }, { "epoch": 0.2979038955902709, "step": 3013, "train/total_loss": 0.13981080055236816 }, { "entropy": 9.697021484375, "epoch": 0.29800276843978646, "mean_token_accuracy": 0.7936508059501648, "num_tokens": 15692993.0, "step": 3014, "train/ce_loss": 7.438169177476084e-06 }, { "epoch": 0.29800276843978646, "step": 3014, "train/sim_loss": 0.05078125 }, { "epoch": 0.29800276843978646, "step": 3014, "train/total_loss": 0.05078199505805969 }, { "entropy": 9.081084251403809, "epoch": 0.29810164128930194, "mean_token_accuracy": 0.7256532311439514, "num_tokens": 15698285.0, "step": 3015, "train/ce_loss": 1.2116591930389404 }, { "epoch": 0.29810164128930194, "step": 3015, "train/sim_loss": 0.109375 }, { "epoch": 0.29810164128930194, "step": 3015, "train/total_loss": 0.230540931224823 }, { "entropy": 9.241430282592773, "epoch": 0.2982005141388175, "mean_token_accuracy": 0.7848557829856873, "num_tokens": 15703559.0, "step": 3016, "train/ce_loss": 0.7737722396850586 }, { "epoch": 0.2982005141388175, "step": 3016, "train/sim_loss": 0.06640625 }, { "epoch": 0.2982005141388175, "step": 3016, "train/total_loss": 0.14378347992897034 }, { "entropy": 9.451658248901367, "epoch": 0.298299386988333, "mean_token_accuracy": 0.7496296167373657, "num_tokens": 15708647.0, "step": 3017, "train/ce_loss": 0.33514443039894104 }, { "epoch": 0.298299386988333, "step": 3017, "train/sim_loss": 0.046875 }, { "epoch": 0.298299386988333, "step": 3017, "train/total_loss": 0.08038944005966187 }, { "entropy": 9.318082809448242, "epoch": 0.2983982598378485, "mean_token_accuracy": 0.7401574850082397, "num_tokens": 15713885.0, "step": 3018, "train/ce_loss": 1.3564872741699219 }, { "epoch": 0.2983982598378485, "step": 3018, "train/sim_loss": 0.07421875 }, { "epoch": 0.2983982598378485, "step": 3018, "train/total_loss": 0.2098674774169922 }, { "entropy": 8.701761245727539, "epoch": 0.29849713268736405, "mean_token_accuracy": 0.7411225438117981, "num_tokens": 15719261.0, "step": 3019, "train/ce_loss": 1.5156086683273315 }, { "epoch": 0.29849713268736405, "step": 3019, "train/sim_loss": 0.046875 }, { "epoch": 0.29849713268736405, "step": 3019, "train/total_loss": 0.19843587279319763 }, { "epoch": 0.2985960055368796, "grad_norm": 0.8209056854248047, "learning_rate": 9.256045097166592e-06, "loss": 0.1495, "step": 3020 }, { "entropy": 8.67845344543457, "epoch": 0.2985960055368796, "mean_token_accuracy": 0.7316784858703613, "num_tokens": 15724599.0, "step": 3020, "train/ce_loss": 0.8231831192970276 }, { "epoch": 0.2985960055368796, "step": 3020, "train/sim_loss": 0.046875 }, { "epoch": 0.2985960055368796, "step": 3020, "train/total_loss": 0.12919330596923828 }, { "entropy": 9.154563903808594, "epoch": 0.2986948783863951, "mean_token_accuracy": 0.7467362880706787, "num_tokens": 15729836.0, "step": 3021, "train/ce_loss": 0.7567043900489807 }, { "epoch": 0.2986948783863951, "step": 3021, "train/sim_loss": 0.05078125 }, { "epoch": 0.2986948783863951, "step": 3021, "train/total_loss": 0.12645170092582703 }, { "entropy": 8.827190399169922, "epoch": 0.2987937512359106, "mean_token_accuracy": 0.7154762148857117, "num_tokens": 15735176.0, "step": 3022, "train/ce_loss": 0.410447359085083 }, { "epoch": 0.2987937512359106, "step": 3022, "train/sim_loss": 0.08984375 }, { "epoch": 0.2987937512359106, "step": 3022, "train/total_loss": 0.13088849186897278 }, { "entropy": 9.213386535644531, "epoch": 0.29889262408542616, "mean_token_accuracy": 0.7359412908554077, "num_tokens": 15740429.0, "step": 3023, "train/ce_loss": 0.957610011100769 }, { "epoch": 0.29889262408542616, "step": 3023, "train/sim_loss": 0.078125 }, { "epoch": 0.29889262408542616, "step": 3023, "train/total_loss": 0.1738860011100769 }, { "entropy": 9.915075302124023, "epoch": 0.29899149693494165, "mean_token_accuracy": 0.7661795616149902, "num_tokens": 15745358.0, "step": 3024, "train/ce_loss": 4.308172265155008e-06 }, { "epoch": 0.29899149693494165, "step": 3024, "train/sim_loss": 0.06640625 }, { "epoch": 0.29899149693494165, "step": 3024, "train/total_loss": 0.06640668213367462 }, { "entropy": 9.263346672058105, "epoch": 0.2990903697844572, "mean_token_accuracy": 0.6996245384216309, "num_tokens": 15750767.0, "step": 3025, "train/ce_loss": 1.3881711959838867 }, { "epoch": 0.2990903697844572, "step": 3025, "train/sim_loss": 0.12109375 }, { "epoch": 0.2990903697844572, "step": 3025, "train/total_loss": 0.2599108815193176 }, { "entropy": 9.625162124633789, "epoch": 0.29918924263397273, "mean_token_accuracy": 0.7014925479888916, "num_tokens": 15755797.0, "step": 3026, "train/ce_loss": 6.861389920231886e-06 }, { "epoch": 0.29918924263397273, "step": 3026, "train/sim_loss": 0.0546875 }, { "epoch": 0.29918924263397273, "step": 3026, "train/total_loss": 0.05468818545341492 }, { "entropy": 9.5260648727417, "epoch": 0.2992881154834882, "mean_token_accuracy": 0.7708674073219299, "num_tokens": 15761032.0, "step": 3027, "train/ce_loss": 0.8745700716972351 }, { "epoch": 0.2992881154834882, "step": 3027, "train/sim_loss": 0.06640625 }, { "epoch": 0.2992881154834882, "step": 3027, "train/total_loss": 0.15386325120925903 }, { "entropy": 10.308094024658203, "epoch": 0.29938698833300376, "mean_token_accuracy": 0.7317073345184326, "num_tokens": 15765748.0, "step": 3028, "train/ce_loss": 6.836529337306274e-06 }, { "epoch": 0.29938698833300376, "step": 3028, "train/sim_loss": 0.01953125 }, { "epoch": 0.29938698833300376, "step": 3028, "train/total_loss": 0.019531933590769768 }, { "entropy": 8.922569274902344, "epoch": 0.2994858611825193, "mean_token_accuracy": 0.7791342735290527, "num_tokens": 15771169.0, "step": 3029, "train/ce_loss": 0.5738962292671204 }, { "epoch": 0.2994858611825193, "step": 3029, "train/sim_loss": 0.08984375 }, { "epoch": 0.2994858611825193, "step": 3029, "train/total_loss": 0.14723336696624756 }, { "entropy": 10.167108535766602, "epoch": 0.2995847340320348, "mean_token_accuracy": 0.8157894611358643, "num_tokens": 15775953.0, "step": 3030, "train/ce_loss": 7.155690582294483e-06 }, { "epoch": 0.2995847340320348, "step": 3030, "train/sim_loss": 0.02734375 }, { "epoch": 0.2995847340320348, "step": 3030, "train/total_loss": 0.027344465255737305 }, { "entropy": 9.04146957397461, "epoch": 0.2996836068815503, "mean_token_accuracy": 0.7317073345184326, "num_tokens": 15781355.0, "step": 3031, "train/ce_loss": 0.710956335067749 }, { "epoch": 0.2996836068815503, "step": 3031, "train/sim_loss": 0.07421875 }, { "epoch": 0.2996836068815503, "step": 3031, "train/total_loss": 0.14531439542770386 }, { "entropy": 9.568553924560547, "epoch": 0.29978247973106587, "mean_token_accuracy": 0.8229665160179138, "num_tokens": 15786602.0, "step": 3032, "train/ce_loss": 1.3454652616928797e-05 }, { "epoch": 0.29978247973106587, "step": 3032, "train/sim_loss": 0.0546875 }, { "epoch": 0.29978247973106587, "step": 3032, "train/total_loss": 0.054688844829797745 }, { "entropy": 9.415324211120605, "epoch": 0.29988135258058135, "mean_token_accuracy": 0.8056337833404541, "num_tokens": 15791777.0, "step": 3033, "train/ce_loss": 0.6456983685493469 }, { "epoch": 0.29988135258058135, "step": 3033, "train/sim_loss": 0.03125 }, { "epoch": 0.29988135258058135, "step": 3033, "train/total_loss": 0.09581983834505081 }, { "entropy": 10.041857719421387, "epoch": 0.2999802254300969, "mean_token_accuracy": 0.7089337110519409, "num_tokens": 15796545.0, "step": 3034, "train/ce_loss": 1.1669424566207454e-05 }, { "epoch": 0.2999802254300969, "step": 3034, "train/sim_loss": 0.0546875 }, { "epoch": 0.2999802254300969, "step": 3034, "train/total_loss": 0.05468866601586342 }, { "entropy": 9.296567916870117, "epoch": 0.30007909827961243, "mean_token_accuracy": 0.6926751732826233, "num_tokens": 15801647.0, "step": 3035, "train/ce_loss": 3.612391992646735e-06 }, { "epoch": 0.30007909827961243, "step": 3035, "train/sim_loss": 0.07421875 }, { "epoch": 0.30007909827961243, "step": 3035, "train/total_loss": 0.07421910762786865 }, { "entropy": 9.452152252197266, "epoch": 0.3001779711291279, "mean_token_accuracy": 0.7451253533363342, "num_tokens": 15806994.0, "step": 3036, "train/ce_loss": 1.4053936004638672 }, { "epoch": 0.3001779711291279, "step": 3036, "train/sim_loss": 0.0859375 }, { "epoch": 0.3001779711291279, "step": 3036, "train/total_loss": 0.22647686302661896 }, { "entropy": 9.989208221435547, "epoch": 0.30027684397864346, "mean_token_accuracy": 0.7664670944213867, "num_tokens": 15811913.0, "step": 3037, "train/ce_loss": 4.4016423998982646e-06 }, { "epoch": 0.30027684397864346, "step": 3037, "train/sim_loss": 0.07421875 }, { "epoch": 0.30027684397864346, "step": 3037, "train/total_loss": 0.07421918958425522 }, { "entropy": 9.653634071350098, "epoch": 0.300375716828159, "mean_token_accuracy": 0.7589454054832458, "num_tokens": 15816888.0, "step": 3038, "train/ce_loss": 0.9903601408004761 }, { "epoch": 0.300375716828159, "step": 3038, "train/sim_loss": 0.0703125 }, { "epoch": 0.300375716828159, "step": 3038, "train/total_loss": 0.16934850811958313 }, { "entropy": 9.67619514465332, "epoch": 0.3004745896776745, "mean_token_accuracy": 0.771799623966217, "num_tokens": 15821864.0, "step": 3039, "train/ce_loss": 3.5057651075476315e-06 }, { "epoch": 0.3004745896776745, "step": 3039, "train/sim_loss": 0.015625 }, { "epoch": 0.3004745896776745, "step": 3039, "train/total_loss": 0.015625350177288055 }, { "epoch": 0.30057346252719, "grad_norm": 0.781501829624176, "learning_rate": 9.251100232408645e-06, "loss": 0.143, "step": 3040 }, { "entropy": 9.31530475616455, "epoch": 0.30057346252719, "mean_token_accuracy": 0.7032679915428162, "num_tokens": 15827127.0, "step": 3040, "train/ce_loss": 1.4572179317474365 }, { "epoch": 0.30057346252719, "step": 3040, "train/sim_loss": 0.08203125 }, { "epoch": 0.30057346252719, "step": 3040, "train/total_loss": 0.22775304317474365 }, { "entropy": 8.973918914794922, "epoch": 0.30067233537670557, "mean_token_accuracy": 0.7213459610939026, "num_tokens": 15832582.0, "step": 3041, "train/ce_loss": 0.9301877021789551 }, { "epoch": 0.30067233537670557, "step": 3041, "train/sim_loss": 0.0546875 }, { "epoch": 0.30067233537670557, "step": 3041, "train/total_loss": 0.1477062702178955 }, { "entropy": 9.18991756439209, "epoch": 0.30077120822622105, "mean_token_accuracy": 0.7188329100608826, "num_tokens": 15837739.0, "step": 3042, "train/ce_loss": 7.996571184776258e-06 }, { "epoch": 0.30077120822622105, "step": 3042, "train/sim_loss": 0.0625 }, { "epoch": 0.30077120822622105, "step": 3042, "train/total_loss": 0.06250079721212387 }, { "entropy": 10.266804695129395, "epoch": 0.3008700810757366, "mean_token_accuracy": 0.7838827967643738, "num_tokens": 15842382.0, "step": 3043, "train/ce_loss": 2.7267353534698486 }, { "epoch": 0.3008700810757366, "step": 3043, "train/sim_loss": 0.0859375 }, { "epoch": 0.3008700810757366, "step": 3043, "train/total_loss": 0.3586110472679138 }, { "entropy": 9.074047088623047, "epoch": 0.30096895392525214, "mean_token_accuracy": 0.7470725774765015, "num_tokens": 15847691.0, "step": 3044, "train/ce_loss": 0.5398385524749756 }, { "epoch": 0.30096895392525214, "step": 3044, "train/sim_loss": 0.05078125 }, { "epoch": 0.30096895392525214, "step": 3044, "train/total_loss": 0.10476510226726532 }, { "entropy": 9.435815811157227, "epoch": 0.3010678267747676, "mean_token_accuracy": 0.7376788258552551, "num_tokens": 15852747.0, "step": 3045, "train/ce_loss": 4.20673950429773e-06 }, { "epoch": 0.3010678267747676, "step": 3045, "train/sim_loss": 0.0390625 }, { "epoch": 0.3010678267747676, "step": 3045, "train/total_loss": 0.039062920957803726 }, { "entropy": 9.279495239257812, "epoch": 0.30116669962428316, "mean_token_accuracy": 0.7272727489471436, "num_tokens": 15857797.0, "step": 3046, "train/ce_loss": 1.2155992984771729 }, { "epoch": 0.30116669962428316, "step": 3046, "train/sim_loss": 0.10546875 }, { "epoch": 0.30116669962428316, "step": 3046, "train/total_loss": 0.22702868282794952 }, { "entropy": 9.83897876739502, "epoch": 0.3012655724737987, "mean_token_accuracy": 0.7052631378173828, "num_tokens": 15862659.0, "step": 3047, "train/ce_loss": 2.269005537033081 }, { "epoch": 0.3012655724737987, "step": 3047, "train/sim_loss": 0.0859375 }, { "epoch": 0.3012655724737987, "step": 3047, "train/total_loss": 0.312838077545166 }, { "entropy": 8.666114807128906, "epoch": 0.30136444532331425, "mean_token_accuracy": 0.7765362858772278, "num_tokens": 15868075.0, "step": 3048, "train/ce_loss": 0.27126842737197876 }, { "epoch": 0.30136444532331425, "step": 3048, "train/sim_loss": 0.0234375 }, { "epoch": 0.30136444532331425, "step": 3048, "train/total_loss": 0.05056434124708176 }, { "entropy": 9.339645385742188, "epoch": 0.30146331817282973, "mean_token_accuracy": 0.7465648651123047, "num_tokens": 15873217.0, "step": 3049, "train/ce_loss": 0.7426149249076843 }, { "epoch": 0.30146331817282973, "step": 3049, "train/sim_loss": 0.0625 }, { "epoch": 0.30146331817282973, "step": 3049, "train/total_loss": 0.13676148653030396 }, { "entropy": 9.430669784545898, "epoch": 0.30156219102234527, "mean_token_accuracy": 0.7079136967658997, "num_tokens": 15878324.0, "step": 3050, "train/ce_loss": 1.067453384399414 }, { "epoch": 0.30156219102234527, "step": 3050, "train/sim_loss": 0.08984375 }, { "epoch": 0.30156219102234527, "step": 3050, "train/total_loss": 0.19658908247947693 }, { "entropy": 9.262472152709961, "epoch": 0.3016610638718608, "mean_token_accuracy": 0.7813712954521179, "num_tokens": 15883535.0, "step": 3051, "train/ce_loss": 2.010103702545166 }, { "epoch": 0.3016610638718608, "step": 3051, "train/sim_loss": 0.109375 }, { "epoch": 0.3016610638718608, "step": 3051, "train/total_loss": 0.3103853762149811 }, { "entropy": 9.47055721282959, "epoch": 0.3017599367213763, "mean_token_accuracy": 0.7318435907363892, "num_tokens": 15888691.0, "step": 3052, "train/ce_loss": 1.2855441570281982 }, { "epoch": 0.3017599367213763, "step": 3052, "train/sim_loss": 0.03125 }, { "epoch": 0.3017599367213763, "step": 3052, "train/total_loss": 0.15980441868305206 }, { "entropy": 9.202533721923828, "epoch": 0.30185880957089184, "mean_token_accuracy": 0.774631917476654, "num_tokens": 15893983.0, "step": 3053, "train/ce_loss": 0.7063089609146118 }, { "epoch": 0.30185880957089184, "step": 3053, "train/sim_loss": 0.046875 }, { "epoch": 0.30185880957089184, "step": 3053, "train/total_loss": 0.11750590056180954 }, { "entropy": 10.076286315917969, "epoch": 0.3019576824204074, "mean_token_accuracy": 0.8044009804725647, "num_tokens": 15898767.0, "step": 3054, "train/ce_loss": 1.245094895362854 }, { "epoch": 0.3019576824204074, "step": 3054, "train/sim_loss": 0.0703125 }, { "epoch": 0.3019576824204074, "step": 3054, "train/total_loss": 0.19482198357582092 }, { "entropy": 9.487077713012695, "epoch": 0.30205655526992287, "mean_token_accuracy": 0.7615384459495544, "num_tokens": 15903852.0, "step": 3055, "train/ce_loss": 0.783889651298523 }, { "epoch": 0.30205655526992287, "step": 3055, "train/sim_loss": 0.0234375 }, { "epoch": 0.30205655526992287, "step": 3055, "train/total_loss": 0.10182646661996841 }, { "entropy": 8.927887916564941, "epoch": 0.3021554281194384, "mean_token_accuracy": 0.6800433993339539, "num_tokens": 15909207.0, "step": 3056, "train/ce_loss": 1.4836653470993042 }, { "epoch": 0.3021554281194384, "step": 3056, "train/sim_loss": 0.1328125 }, { "epoch": 0.3021554281194384, "step": 3056, "train/total_loss": 0.2811790406703949 }, { "entropy": 8.685641288757324, "epoch": 0.30225430096895395, "mean_token_accuracy": 0.7129999995231628, "num_tokens": 15914643.0, "step": 3057, "train/ce_loss": 0.6042128801345825 }, { "epoch": 0.30225430096895395, "step": 3057, "train/sim_loss": 0.078125 }, { "epoch": 0.30225430096895395, "step": 3057, "train/total_loss": 0.13854628801345825 }, { "entropy": 9.32431411743164, "epoch": 0.30235317381846943, "mean_token_accuracy": 0.7949852347373962, "num_tokens": 15919782.0, "step": 3058, "train/ce_loss": 1.0000278949737549 }, { "epoch": 0.30235317381846943, "step": 3058, "train/sim_loss": 0.0546875 }, { "epoch": 0.30235317381846943, "step": 3058, "train/total_loss": 0.15469029545783997 }, { "entropy": 9.148672103881836, "epoch": 0.302452046667985, "mean_token_accuracy": 0.7779204249382019, "num_tokens": 15925208.0, "step": 3059, "train/ce_loss": 0.49779924750328064 }, { "epoch": 0.302452046667985, "step": 3059, "train/sim_loss": 0.11328125 }, { "epoch": 0.302452046667985, "step": 3059, "train/total_loss": 0.16306117177009583 }, { "epoch": 0.3025509195175005, "grad_norm": 0.8780242204666138, "learning_rate": 9.246155367650695e-06, "loss": 0.1425, "step": 3060 }, { "entropy": 8.764741897583008, "epoch": 0.3025509195175005, "mean_token_accuracy": 0.6839577555656433, "num_tokens": 15930688.0, "step": 3060, "train/ce_loss": 1.202653408050537 }, { "epoch": 0.3025509195175005, "step": 3060, "train/sim_loss": 0.0703125 }, { "epoch": 0.3025509195175005, "step": 3060, "train/total_loss": 0.19057783484458923 }, { "entropy": 9.180252075195312, "epoch": 0.302649792367016, "mean_token_accuracy": 0.7259615659713745, "num_tokens": 15935943.0, "step": 3061, "train/ce_loss": 0.8565216064453125 }, { "epoch": 0.302649792367016, "step": 3061, "train/sim_loss": 0.04296875 }, { "epoch": 0.302649792367016, "step": 3061, "train/total_loss": 0.1286209225654602 }, { "entropy": 8.950722694396973, "epoch": 0.30274866521653154, "mean_token_accuracy": 0.6937377452850342, "num_tokens": 15941391.0, "step": 3062, "train/ce_loss": 1.1386116743087769 }, { "epoch": 0.30274866521653154, "step": 3062, "train/sim_loss": 0.046875 }, { "epoch": 0.30274866521653154, "step": 3062, "train/total_loss": 0.16073617339134216 }, { "entropy": 9.415882110595703, "epoch": 0.3028475380660471, "mean_token_accuracy": 0.763271152973175, "num_tokens": 15946498.0, "step": 3063, "train/ce_loss": 5.109886387799634e-06 }, { "epoch": 0.3028475380660471, "step": 3063, "train/sim_loss": 0.02734375 }, { "epoch": 0.3028475380660471, "step": 3063, "train/total_loss": 0.02734426036477089 }, { "entropy": 9.254497528076172, "epoch": 0.30294641091556257, "mean_token_accuracy": 0.7294469475746155, "num_tokens": 15951646.0, "step": 3064, "train/ce_loss": 0.635911762714386 }, { "epoch": 0.30294641091556257, "step": 3064, "train/sim_loss": 0.09765625 }, { "epoch": 0.30294641091556257, "step": 3064, "train/total_loss": 0.16124743223190308 }, { "entropy": 10.180620193481445, "epoch": 0.3030452837650781, "mean_token_accuracy": 0.7554945349693298, "num_tokens": 15956420.0, "step": 3065, "train/ce_loss": 1.0206608772277832 }, { "epoch": 0.3030452837650781, "step": 3065, "train/sim_loss": 0.1015625 }, { "epoch": 0.3030452837650781, "step": 3065, "train/total_loss": 0.20362859964370728 }, { "entropy": 9.169899940490723, "epoch": 0.30314415661459365, "mean_token_accuracy": 0.707317054271698, "num_tokens": 15961613.0, "step": 3066, "train/ce_loss": 1.2100774049758911 }, { "epoch": 0.30314415661459365, "step": 3066, "train/sim_loss": 0.05859375 }, { "epoch": 0.30314415661459365, "step": 3066, "train/total_loss": 0.1796014904975891 }, { "entropy": 9.298154830932617, "epoch": 0.30324302946410914, "mean_token_accuracy": 0.7533632516860962, "num_tokens": 15966738.0, "step": 3067, "train/ce_loss": 0.850917637348175 }, { "epoch": 0.30324302946410914, "step": 3067, "train/sim_loss": 0.10546875 }, { "epoch": 0.30324302946410914, "step": 3067, "train/total_loss": 0.19056051969528198 }, { "entropy": 9.30412483215332, "epoch": 0.3033419023136247, "mean_token_accuracy": 0.7869565486907959, "num_tokens": 15971864.0, "step": 3068, "train/ce_loss": 0.6976792216300964 }, { "epoch": 0.3033419023136247, "step": 3068, "train/sim_loss": 0.015625 }, { "epoch": 0.3033419023136247, "step": 3068, "train/total_loss": 0.08539292216300964 }, { "entropy": 9.682281494140625, "epoch": 0.3034407751631402, "mean_token_accuracy": 0.7980952262878418, "num_tokens": 15976825.0, "step": 3069, "train/ce_loss": 0.8077563643455505 }, { "epoch": 0.3034407751631402, "step": 3069, "train/sim_loss": 0.078125 }, { "epoch": 0.3034407751631402, "step": 3069, "train/total_loss": 0.158900648355484 }, { "entropy": 9.056129455566406, "epoch": 0.3035396480126557, "mean_token_accuracy": 0.800000011920929, "num_tokens": 15981974.0, "step": 3070, "train/ce_loss": 0.5365228056907654 }, { "epoch": 0.3035396480126557, "step": 3070, "train/sim_loss": 0.0703125 }, { "epoch": 0.3035396480126557, "step": 3070, "train/total_loss": 0.12396478652954102 }, { "entropy": 8.618220329284668, "epoch": 0.30363852086217125, "mean_token_accuracy": 0.7547547817230225, "num_tokens": 15987522.0, "step": 3071, "train/ce_loss": 0.6690239906311035 }, { "epoch": 0.30363852086217125, "step": 3071, "train/sim_loss": 0.08203125 }, { "epoch": 0.30363852086217125, "step": 3071, "train/total_loss": 0.14893364906311035 }, { "entropy": 9.58163833618164, "epoch": 0.3037373937116868, "mean_token_accuracy": 0.7361563444137573, "num_tokens": 15992550.0, "step": 3072, "train/ce_loss": 2.2256726879277267e-06 }, { "epoch": 0.3037373937116868, "step": 3072, "train/sim_loss": 0.01953125 }, { "epoch": 0.3037373937116868, "step": 3072, "train/total_loss": 0.01953147165477276 }, { "entropy": 9.08739185333252, "epoch": 0.3038362665612023, "mean_token_accuracy": 0.7316455841064453, "num_tokens": 15997828.0, "step": 3073, "train/ce_loss": 1.5055323839187622 }, { "epoch": 0.3038362665612023, "step": 3073, "train/sim_loss": 0.078125 }, { "epoch": 0.3038362665612023, "step": 3073, "train/total_loss": 0.22867824137210846 }, { "entropy": 8.78681755065918, "epoch": 0.3039351394107178, "mean_token_accuracy": 0.7214206457138062, "num_tokens": 16003209.0, "step": 3074, "train/ce_loss": 1.348905086517334 }, { "epoch": 0.3039351394107178, "step": 3074, "train/sim_loss": 0.1015625 }, { "epoch": 0.3039351394107178, "step": 3074, "train/total_loss": 0.23645301163196564 }, { "entropy": 9.146982192993164, "epoch": 0.30403401226023336, "mean_token_accuracy": 0.7832258343696594, "num_tokens": 16008418.0, "step": 3075, "train/ce_loss": 0.4269832372665405 }, { "epoch": 0.30403401226023336, "step": 3075, "train/sim_loss": 0.05859375 }, { "epoch": 0.30403401226023336, "step": 3075, "train/total_loss": 0.10129207372665405 }, { "entropy": 9.387057304382324, "epoch": 0.30413288510974884, "mean_token_accuracy": 0.728314220905304, "num_tokens": 16013486.0, "step": 3076, "train/ce_loss": 0.9136193990707397 }, { "epoch": 0.30413288510974884, "step": 3076, "train/sim_loss": 0.05078125 }, { "epoch": 0.30413288510974884, "step": 3076, "train/total_loss": 0.14214318990707397 }, { "entropy": 9.939265251159668, "epoch": 0.3042317579592644, "mean_token_accuracy": 0.6820276379585266, "num_tokens": 16018300.0, "step": 3077, "train/ce_loss": 3.037581443786621 }, { "epoch": 0.3042317579592644, "step": 3077, "train/sim_loss": 0.09765625 }, { "epoch": 0.3042317579592644, "step": 3077, "train/total_loss": 0.4014143943786621 }, { "entropy": 9.897211074829102, "epoch": 0.3043306308087799, "mean_token_accuracy": 0.7265822887420654, "num_tokens": 16023133.0, "step": 3078, "train/ce_loss": 1.5921827554702759 }, { "epoch": 0.3043306308087799, "step": 3078, "train/sim_loss": 0.12109375 }, { "epoch": 0.3043306308087799, "step": 3078, "train/total_loss": 0.28031203150749207 }, { "entropy": 9.130783081054688, "epoch": 0.3044295036582954, "mean_token_accuracy": 0.6957638263702393, "num_tokens": 16028389.0, "step": 3079, "train/ce_loss": 0.8683300614356995 }, { "epoch": 0.3044295036582954, "step": 3079, "train/sim_loss": 0.02734375 }, { "epoch": 0.3044295036582954, "step": 3079, "train/total_loss": 0.11417675763368607 }, { "epoch": 0.30452837650781095, "grad_norm": 0.9272714853286743, "learning_rate": 9.241210502892747e-06, "loss": 0.1476, "step": 3080 }, { "entropy": 9.466758728027344, "epoch": 0.30452837650781095, "mean_token_accuracy": 0.7138211131095886, "num_tokens": 16033449.0, "step": 3080, "train/ce_loss": 3.2655682389304275e-06 }, { "epoch": 0.30452837650781095, "step": 3080, "train/sim_loss": 0.0546875 }, { "epoch": 0.30452837650781095, "step": 3080, "train/total_loss": 0.054687827825546265 }, { "entropy": 8.598503112792969, "epoch": 0.3046272493573265, "mean_token_accuracy": 0.6991720199584961, "num_tokens": 16039052.0, "step": 3081, "train/ce_loss": 0.7666484713554382 }, { "epoch": 0.3046272493573265, "step": 3081, "train/sim_loss": 0.0546875 }, { "epoch": 0.3046272493573265, "step": 3081, "train/total_loss": 0.13135235011577606 }, { "entropy": 9.395477294921875, "epoch": 0.304726122206842, "mean_token_accuracy": 0.730434775352478, "num_tokens": 16044176.0, "step": 3082, "train/ce_loss": 0.900779128074646 }, { "epoch": 0.304726122206842, "step": 3082, "train/sim_loss": 0.0390625 }, { "epoch": 0.304726122206842, "step": 3082, "train/total_loss": 0.12914040684700012 }, { "entropy": 9.107585906982422, "epoch": 0.3048249950563575, "mean_token_accuracy": 0.7666666507720947, "num_tokens": 16049500.0, "step": 3083, "train/ce_loss": 0.46048715710639954 }, { "epoch": 0.3048249950563575, "step": 3083, "train/sim_loss": 0.05859375 }, { "epoch": 0.3048249950563575, "step": 3083, "train/total_loss": 0.10464246571063995 }, { "entropy": 9.299919128417969, "epoch": 0.30492386790587306, "mean_token_accuracy": 0.7155612111091614, "num_tokens": 16054733.0, "step": 3084, "train/ce_loss": 1.1572389602661133 }, { "epoch": 0.30492386790587306, "step": 3084, "train/sim_loss": 0.05078125 }, { "epoch": 0.30492386790587306, "step": 3084, "train/total_loss": 0.16650515794754028 }, { "entropy": 9.56911849975586, "epoch": 0.30502274075538854, "mean_token_accuracy": 0.7896440029144287, "num_tokens": 16059787.0, "step": 3085, "train/ce_loss": 8.066250302363187e-05 }, { "epoch": 0.30502274075538854, "step": 3085, "train/sim_loss": 0.03515625 }, { "epoch": 0.30502274075538854, "step": 3085, "train/total_loss": 0.03516431525349617 }, { "entropy": 9.23663330078125, "epoch": 0.3051216136049041, "mean_token_accuracy": 0.699312686920166, "num_tokens": 16064835.0, "step": 3086, "train/ce_loss": 1.2652978897094727 }, { "epoch": 0.3051216136049041, "step": 3086, "train/sim_loss": 0.0546875 }, { "epoch": 0.3051216136049041, "step": 3086, "train/total_loss": 0.18121729791164398 }, { "entropy": 9.628506660461426, "epoch": 0.3052204864544196, "mean_token_accuracy": 0.7040650248527527, "num_tokens": 16069860.0, "step": 3087, "train/ce_loss": 0.7562285661697388 }, { "epoch": 0.3052204864544196, "step": 3087, "train/sim_loss": 0.0625 }, { "epoch": 0.3052204864544196, "step": 3087, "train/total_loss": 0.13812285661697388 }, { "entropy": 9.595111846923828, "epoch": 0.3053193593039351, "mean_token_accuracy": 0.7810107469558716, "num_tokens": 16074935.0, "step": 3088, "train/ce_loss": 0.593614935874939 }, { "epoch": 0.3053193593039351, "step": 3088, "train/sim_loss": 0.015625 }, { "epoch": 0.3053193593039351, "step": 3088, "train/total_loss": 0.07498649507761002 }, { "entropy": 8.711251258850098, "epoch": 0.30541823215345065, "mean_token_accuracy": 0.7074527144432068, "num_tokens": 16080314.0, "step": 3089, "train/ce_loss": 1.2984533309936523 }, { "epoch": 0.30541823215345065, "step": 3089, "train/sim_loss": 0.09765625 }, { "epoch": 0.30541823215345065, "step": 3089, "train/total_loss": 0.22750158607959747 }, { "entropy": 9.0078763961792, "epoch": 0.3055171050029662, "mean_token_accuracy": 0.7614269852638245, "num_tokens": 16085713.0, "step": 3090, "train/ce_loss": 1.0311739444732666 }, { "epoch": 0.3055171050029662, "step": 3090, "train/sim_loss": 0.0703125 }, { "epoch": 0.3055171050029662, "step": 3090, "train/total_loss": 0.17342990636825562 }, { "entropy": 8.846348762512207, "epoch": 0.30561597785248173, "mean_token_accuracy": 0.7243852615356445, "num_tokens": 16091151.0, "step": 3091, "train/ce_loss": 0.6214485168457031 }, { "epoch": 0.30561597785248173, "step": 3091, "train/sim_loss": 0.03515625 }, { "epoch": 0.30561597785248173, "step": 3091, "train/total_loss": 0.09730110317468643 }, { "entropy": 9.06347942352295, "epoch": 0.3057148507019972, "mean_token_accuracy": 0.667037844657898, "num_tokens": 16096528.0, "step": 3092, "train/ce_loss": 0.662187933921814 }, { "epoch": 0.3057148507019972, "step": 3092, "train/sim_loss": 0.09375 }, { "epoch": 0.3057148507019972, "step": 3092, "train/total_loss": 0.1599687933921814 }, { "entropy": 9.287656784057617, "epoch": 0.30581372355151276, "mean_token_accuracy": 0.7780612111091614, "num_tokens": 16101772.0, "step": 3093, "train/ce_loss": 0.5878052711486816 }, { "epoch": 0.30581372355151276, "step": 3093, "train/sim_loss": 0.02734375 }, { "epoch": 0.30581372355151276, "step": 3093, "train/total_loss": 0.08612427860498428 }, { "entropy": 10.067656517028809, "epoch": 0.3059125964010283, "mean_token_accuracy": 0.6492146849632263, "num_tokens": 16106532.0, "step": 3094, "train/ce_loss": 2.511756181716919 }, { "epoch": 0.3059125964010283, "step": 3094, "train/sim_loss": 0.109375 }, { "epoch": 0.3059125964010283, "step": 3094, "train/total_loss": 0.3605506122112274 }, { "entropy": 8.915959358215332, "epoch": 0.3060114692505438, "mean_token_accuracy": 0.7392290234565735, "num_tokens": 16111942.0, "step": 3095, "train/ce_loss": 0.5130492448806763 }, { "epoch": 0.3060114692505438, "step": 3095, "train/sim_loss": 0.0546875 }, { "epoch": 0.3060114692505438, "step": 3095, "train/total_loss": 0.10599242150783539 }, { "entropy": 9.439672470092773, "epoch": 0.30611034210005933, "mean_token_accuracy": 0.7339622378349304, "num_tokens": 16116897.0, "step": 3096, "train/ce_loss": 4.731972694571596e-06 }, { "epoch": 0.30611034210005933, "step": 3096, "train/sim_loss": 0.02734375 }, { "epoch": 0.30611034210005933, "step": 3096, "train/total_loss": 0.027344223111867905 }, { "entropy": 9.428709983825684, "epoch": 0.30620921494957487, "mean_token_accuracy": 0.7720706462860107, "num_tokens": 16121998.0, "step": 3097, "train/ce_loss": 0.6409549117088318 }, { "epoch": 0.30620921494957487, "step": 3097, "train/sim_loss": 0.0234375 }, { "epoch": 0.30620921494957487, "step": 3097, "train/total_loss": 0.08753298968076706 }, { "entropy": 8.80047607421875, "epoch": 0.30630808779909036, "mean_token_accuracy": 0.6861878633499146, "num_tokens": 16127368.0, "step": 3098, "train/ce_loss": 1.2583460807800293 }, { "epoch": 0.30630808779909036, "step": 3098, "train/sim_loss": 0.0390625 }, { "epoch": 0.30630808779909036, "step": 3098, "train/total_loss": 0.1648971140384674 }, { "entropy": 9.62500286102295, "epoch": 0.3064069606486059, "mean_token_accuracy": 0.6607999801635742, "num_tokens": 16132408.0, "step": 3099, "train/ce_loss": 1.3447495698928833 }, { "epoch": 0.3064069606486059, "step": 3099, "train/sim_loss": 0.06640625 }, { "epoch": 0.3064069606486059, "step": 3099, "train/total_loss": 0.2008812129497528 }, { "epoch": 0.30650583349812144, "grad_norm": 0.9475986957550049, "learning_rate": 9.236265638134798e-06, "loss": 0.1548, "step": 3100 }, { "entropy": 8.950824737548828, "epoch": 0.30650583349812144, "mean_token_accuracy": 0.7482100129127502, "num_tokens": 16137687.0, "step": 3100, "train/ce_loss": 0.47615012526512146 }, { "epoch": 0.30650583349812144, "step": 3100, "train/sim_loss": 0.0703125 }, { "epoch": 0.30650583349812144, "step": 3100, "train/total_loss": 0.11792751401662827 }, { "entropy": 9.56070327758789, "epoch": 0.3066047063476369, "mean_token_accuracy": 0.7605633735656738, "num_tokens": 16142761.0, "step": 3101, "train/ce_loss": 3.0056301056902157e-06 }, { "epoch": 0.3066047063476369, "step": 3101, "train/sim_loss": 0.0234375 }, { "epoch": 0.3066047063476369, "step": 3101, "train/total_loss": 0.023437799885869026 }, { "entropy": 8.973735809326172, "epoch": 0.30670357919715246, "mean_token_accuracy": 0.7640320658683777, "num_tokens": 16148088.0, "step": 3102, "train/ce_loss": 0.608039915561676 }, { "epoch": 0.30670357919715246, "step": 3102, "train/sim_loss": 0.03515625 }, { "epoch": 0.30670357919715246, "step": 3102, "train/total_loss": 0.09596024453639984 }, { "entropy": 9.528715133666992, "epoch": 0.306802452046668, "mean_token_accuracy": 0.7862714529037476, "num_tokens": 16153141.0, "step": 3103, "train/ce_loss": 0.6295854449272156 }, { "epoch": 0.306802452046668, "step": 3103, "train/sim_loss": 0.09765625 }, { "epoch": 0.306802452046668, "step": 3103, "train/total_loss": 0.16061478853225708 }, { "entropy": 9.321884155273438, "epoch": 0.3069013248961835, "mean_token_accuracy": 0.8230769038200378, "num_tokens": 16158235.0, "step": 3104, "train/ce_loss": 0.7484152317047119 }, { "epoch": 0.3069013248961835, "step": 3104, "train/sim_loss": 0.03125 }, { "epoch": 0.3069013248961835, "step": 3104, "train/total_loss": 0.10609152168035507 }, { "entropy": 9.431598663330078, "epoch": 0.30700019774569903, "mean_token_accuracy": 0.7129032015800476, "num_tokens": 16163312.0, "step": 3105, "train/ce_loss": 0.6860657930374146 }, { "epoch": 0.30700019774569903, "step": 3105, "train/sim_loss": 0.046875 }, { "epoch": 0.30700019774569903, "step": 3105, "train/total_loss": 0.11548157781362534 }, { "entropy": 8.806082725524902, "epoch": 0.3070990705952146, "mean_token_accuracy": 0.7071651220321655, "num_tokens": 16168751.0, "step": 3106, "train/ce_loss": 1.0154913663864136 }, { "epoch": 0.3070990705952146, "step": 3106, "train/sim_loss": 0.05859375 }, { "epoch": 0.3070990705952146, "step": 3106, "train/total_loss": 0.1601428985595703 }, { "entropy": 9.546571731567383, "epoch": 0.30719794344473006, "mean_token_accuracy": 0.7418181896209717, "num_tokens": 16173742.0, "step": 3107, "train/ce_loss": 0.7134097218513489 }, { "epoch": 0.30719794344473006, "step": 3107, "train/sim_loss": 0.07421875 }, { "epoch": 0.30719794344473006, "step": 3107, "train/total_loss": 0.14555972814559937 }, { "entropy": 9.867581367492676, "epoch": 0.3072968162942456, "mean_token_accuracy": 0.7926267385482788, "num_tokens": 16178597.0, "step": 3108, "train/ce_loss": 6.592382760572946e-06 }, { "epoch": 0.3072968162942456, "step": 3108, "train/sim_loss": 0.03125 }, { "epoch": 0.3072968162942456, "step": 3108, "train/total_loss": 0.03125065937638283 }, { "entropy": 9.264135360717773, "epoch": 0.30739568914376114, "mean_token_accuracy": 0.6494565010070801, "num_tokens": 16183800.0, "step": 3109, "train/ce_loss": 1.8196673393249512 }, { "epoch": 0.30739568914376114, "step": 3109, "train/sim_loss": 0.1171875 }, { "epoch": 0.30739568914376114, "step": 3109, "train/total_loss": 0.29915422201156616 }, { "entropy": 9.16361141204834, "epoch": 0.3074945619932766, "mean_token_accuracy": 0.6535341739654541, "num_tokens": 16189111.0, "step": 3110, "train/ce_loss": 1.6297848224639893 }, { "epoch": 0.3074945619932766, "step": 3110, "train/sim_loss": 0.0703125 }, { "epoch": 0.3074945619932766, "step": 3110, "train/total_loss": 0.23329098522663116 }, { "entropy": 8.848861694335938, "epoch": 0.30759343484279217, "mean_token_accuracy": 0.7415611743927002, "num_tokens": 16194602.0, "step": 3111, "train/ce_loss": 1.1059902906417847 }, { "epoch": 0.30759343484279217, "step": 3111, "train/sim_loss": 0.08984375 }, { "epoch": 0.30759343484279217, "step": 3111, "train/total_loss": 0.20044279098510742 }, { "entropy": 9.377516746520996, "epoch": 0.3076923076923077, "mean_token_accuracy": 0.7333333492279053, "num_tokens": 16199684.0, "step": 3112, "train/ce_loss": 0.9643053412437439 }, { "epoch": 0.3076923076923077, "step": 3112, "train/sim_loss": 0.125 }, { "epoch": 0.3076923076923077, "step": 3112, "train/total_loss": 0.22143054008483887 }, { "entropy": 9.034387588500977, "epoch": 0.3077911805418232, "mean_token_accuracy": 0.759036123752594, "num_tokens": 16204989.0, "step": 3113, "train/ce_loss": 1.4106206893920898 }, { "epoch": 0.3077911805418232, "step": 3113, "train/sim_loss": 0.1640625 }, { "epoch": 0.3077911805418232, "step": 3113, "train/total_loss": 0.30512458086013794 }, { "entropy": 9.227992057800293, "epoch": 0.30789005339133874, "mean_token_accuracy": 0.7476635575294495, "num_tokens": 16210224.0, "step": 3114, "train/ce_loss": 1.1176414489746094 }, { "epoch": 0.30789005339133874, "step": 3114, "train/sim_loss": 0.0546875 }, { "epoch": 0.30789005339133874, "step": 3114, "train/total_loss": 0.16645164787769318 }, { "entropy": 9.282114028930664, "epoch": 0.3079889262408543, "mean_token_accuracy": 0.7710674405097961, "num_tokens": 16215366.0, "step": 3115, "train/ce_loss": 0.7169881463050842 }, { "epoch": 0.3079889262408543, "step": 3115, "train/sim_loss": 0.08203125 }, { "epoch": 0.3079889262408543, "step": 3115, "train/total_loss": 0.15373006463050842 }, { "entropy": 8.980093002319336, "epoch": 0.30808779909036976, "mean_token_accuracy": 0.6628895401954651, "num_tokens": 16220547.0, "step": 3116, "train/ce_loss": 1.4373340606689453 }, { "epoch": 0.30808779909036976, "step": 3116, "train/sim_loss": 0.11328125 }, { "epoch": 0.30808779909036976, "step": 3116, "train/total_loss": 0.257014662027359 }, { "entropy": 9.515814781188965, "epoch": 0.3081866719398853, "mean_token_accuracy": 0.7389240264892578, "num_tokens": 16225621.0, "step": 3117, "train/ce_loss": 3.717155323101906e-06 }, { "epoch": 0.3081866719398853, "step": 3117, "train/sim_loss": 0.125 }, { "epoch": 0.3081866719398853, "step": 3117, "train/total_loss": 0.12500037252902985 }, { "entropy": 9.999287605285645, "epoch": 0.30828554478940084, "mean_token_accuracy": 0.8034397959709167, "num_tokens": 16230406.0, "step": 3118, "train/ce_loss": 3.893018401868176e-06 }, { "epoch": 0.30828554478940084, "step": 3118, "train/sim_loss": 0.04296875 }, { "epoch": 0.30828554478940084, "step": 3118, "train/total_loss": 0.04296914115548134 }, { "entropy": 8.90539836883545, "epoch": 0.30838441763891633, "mean_token_accuracy": 0.7177508473396301, "num_tokens": 16235807.0, "step": 3119, "train/ce_loss": 0.8543577194213867 }, { "epoch": 0.30838441763891633, "step": 3119, "train/sim_loss": 0.1640625 }, { "epoch": 0.30838441763891633, "step": 3119, "train/total_loss": 0.24949827790260315 }, { "epoch": 0.30848329048843187, "grad_norm": 0.9575387239456177, "learning_rate": 9.231320773376848e-06, "loss": 0.1514, "step": 3120 }, { "entropy": 8.715691566467285, "epoch": 0.30848329048843187, "mean_token_accuracy": 0.7260406613349915, "num_tokens": 16241355.0, "step": 3120, "train/ce_loss": 0.4119970500469208 }, { "epoch": 0.30848329048843187, "step": 3120, "train/sim_loss": 0.015625 }, { "epoch": 0.30848329048843187, "step": 3120, "train/total_loss": 0.0568247064948082 }, { "entropy": 9.690220832824707, "epoch": 0.3085821633379474, "mean_token_accuracy": 0.6921606063842773, "num_tokens": 16246318.0, "step": 3121, "train/ce_loss": 1.8799567222595215 }, { "epoch": 0.3085821633379474, "step": 3121, "train/sim_loss": 0.05078125 }, { "epoch": 0.3085821633379474, "step": 3121, "train/total_loss": 0.23877692222595215 }, { "entropy": 9.361258506774902, "epoch": 0.3086810361874629, "mean_token_accuracy": 0.8055987358093262, "num_tokens": 16251400.0, "step": 3122, "train/ce_loss": 0.8268361687660217 }, { "epoch": 0.3086810361874629, "step": 3122, "train/sim_loss": 0.04296875 }, { "epoch": 0.3086810361874629, "step": 3122, "train/total_loss": 0.12565237283706665 }, { "entropy": 9.38145637512207, "epoch": 0.30877990903697844, "mean_token_accuracy": 0.6562905311584473, "num_tokens": 16256645.0, "step": 3123, "train/ce_loss": 2.5330162048339844 }, { "epoch": 0.30877990903697844, "step": 3123, "train/sim_loss": 0.15234375 }, { "epoch": 0.30877990903697844, "step": 3123, "train/total_loss": 0.40564537048339844 }, { "entropy": 9.34083366394043, "epoch": 0.308878781886494, "mean_token_accuracy": 0.7066051959991455, "num_tokens": 16261805.0, "step": 3124, "train/ce_loss": 1.233890414237976 }, { "epoch": 0.308878781886494, "step": 3124, "train/sim_loss": 0.09765625 }, { "epoch": 0.308878781886494, "step": 3124, "train/total_loss": 0.22104528546333313 }, { "entropy": 9.12912368774414, "epoch": 0.30897765473600947, "mean_token_accuracy": 0.7390244007110596, "num_tokens": 16267104.0, "step": 3125, "train/ce_loss": 0.8176206946372986 }, { "epoch": 0.30897765473600947, "step": 3125, "train/sim_loss": 0.0625 }, { "epoch": 0.30897765473600947, "step": 3125, "train/total_loss": 0.14426207542419434 }, { "entropy": 9.835182189941406, "epoch": 0.309076527585525, "mean_token_accuracy": 0.7484909296035767, "num_tokens": 16272039.0, "step": 3126, "train/ce_loss": 8.67279049998615e-06 }, { "epoch": 0.309076527585525, "step": 3126, "train/sim_loss": 0.03125 }, { "epoch": 0.309076527585525, "step": 3126, "train/total_loss": 0.03125086799263954 }, { "entropy": 10.046555519104004, "epoch": 0.30917540043504055, "mean_token_accuracy": 0.7424242496490479, "num_tokens": 16276819.0, "step": 3127, "train/ce_loss": 1.4793922901153564 }, { "epoch": 0.30917540043504055, "step": 3127, "train/sim_loss": 0.078125 }, { "epoch": 0.30917540043504055, "step": 3127, "train/total_loss": 0.22606423497200012 }, { "entropy": 8.99412727355957, "epoch": 0.30927427328455603, "mean_token_accuracy": 0.6790606379508972, "num_tokens": 16282326.0, "step": 3128, "train/ce_loss": 0.9208659529685974 }, { "epoch": 0.30927427328455603, "step": 3128, "train/sim_loss": 0.09375 }, { "epoch": 0.30927427328455603, "step": 3128, "train/total_loss": 0.18583659827709198 }, { "entropy": 10.206575393676758, "epoch": 0.3093731461340716, "mean_token_accuracy": 0.75314861536026, "num_tokens": 16287075.0, "step": 3129, "train/ce_loss": 1.8498486280441284 }, { "epoch": 0.3093731461340716, "step": 3129, "train/sim_loss": 0.08984375 }, { "epoch": 0.3093731461340716, "step": 3129, "train/total_loss": 0.27482861280441284 }, { "entropy": 9.601837158203125, "epoch": 0.3094720189835871, "mean_token_accuracy": 0.6882882714271545, "num_tokens": 16292083.0, "step": 3130, "train/ce_loss": 3.6322981031844392e-06 }, { "epoch": 0.3094720189835871, "step": 3130, "train/sim_loss": 0.02734375 }, { "epoch": 0.3094720189835871, "step": 3130, "train/total_loss": 0.0273441132158041 }, { "entropy": 9.262863159179688, "epoch": 0.30957089183310266, "mean_token_accuracy": 0.7254672646522522, "num_tokens": 16297395.0, "step": 3131, "train/ce_loss": 0.8118441104888916 }, { "epoch": 0.30957089183310266, "step": 3131, "train/sim_loss": 0.0546875 }, { "epoch": 0.30957089183310266, "step": 3131, "train/total_loss": 0.13587191700935364 }, { "entropy": 8.937446594238281, "epoch": 0.30966976468261814, "mean_token_accuracy": 0.7626903653144836, "num_tokens": 16302727.0, "step": 3132, "train/ce_loss": 0.764589786529541 }, { "epoch": 0.30966976468261814, "step": 3132, "train/sim_loss": 0.05859375 }, { "epoch": 0.30966976468261814, "step": 3132, "train/total_loss": 0.13505274057388306 }, { "entropy": 9.000936508178711, "epoch": 0.3097686375321337, "mean_token_accuracy": 0.707196056842804, "num_tokens": 16308051.0, "step": 3133, "train/ce_loss": 1.7300879955291748 }, { "epoch": 0.3097686375321337, "step": 3133, "train/sim_loss": 0.07421875 }, { "epoch": 0.3097686375321337, "step": 3133, "train/total_loss": 0.24722754955291748 }, { "entropy": 9.17640209197998, "epoch": 0.3098675103816492, "mean_token_accuracy": 0.7517814636230469, "num_tokens": 16313356.0, "step": 3134, "train/ce_loss": 0.6477929353713989 }, { "epoch": 0.3098675103816492, "step": 3134, "train/sim_loss": 0.02734375 }, { "epoch": 0.3098675103816492, "step": 3134, "train/total_loss": 0.09212304651737213 }, { "entropy": 8.919665336608887, "epoch": 0.3099663832311647, "mean_token_accuracy": 0.7678795456886292, "num_tokens": 16318630.0, "step": 3135, "train/ce_loss": 0.49486202001571655 }, { "epoch": 0.3099663832311647, "step": 3135, "train/sim_loss": 0.046875 }, { "epoch": 0.3099663832311647, "step": 3135, "train/total_loss": 0.0963612049818039 }, { "entropy": 9.14445686340332, "epoch": 0.31006525608068025, "mean_token_accuracy": 0.6593785881996155, "num_tokens": 16323970.0, "step": 3136, "train/ce_loss": 1.3321263790130615 }, { "epoch": 0.31006525608068025, "step": 3136, "train/sim_loss": 0.078125 }, { "epoch": 0.31006525608068025, "step": 3136, "train/total_loss": 0.2113376408815384 }, { "entropy": 8.572418212890625, "epoch": 0.3101641289301958, "mean_token_accuracy": 0.755156934261322, "num_tokens": 16329557.0, "step": 3137, "train/ce_loss": 0.6957716345787048 }, { "epoch": 0.3101641289301958, "step": 3137, "train/sim_loss": 0.01953125 }, { "epoch": 0.3101641289301958, "step": 3137, "train/total_loss": 0.0891084149479866 }, { "entropy": 10.049623489379883, "epoch": 0.3102630017797113, "mean_token_accuracy": 0.7021276354789734, "num_tokens": 16334419.0, "step": 3138, "train/ce_loss": 2.307546377182007 }, { "epoch": 0.3102630017797113, "step": 3138, "train/sim_loss": 0.0625 }, { "epoch": 0.3102630017797113, "step": 3138, "train/total_loss": 0.29325464367866516 }, { "entropy": 9.033727645874023, "epoch": 0.3103618746292268, "mean_token_accuracy": 0.7027027010917664, "num_tokens": 16339795.0, "step": 3139, "train/ce_loss": 0.7809948921203613 }, { "epoch": 0.3103618746292268, "step": 3139, "train/sim_loss": 0.0625 }, { "epoch": 0.3103618746292268, "step": 3139, "train/total_loss": 0.14059948921203613 }, { "epoch": 0.31046074747874236, "grad_norm": 0.7521636486053467, "learning_rate": 9.2263759086189e-06, "loss": 0.1506, "step": 3140 }, { "entropy": 9.636505126953125, "epoch": 0.31046074747874236, "mean_token_accuracy": 0.7461240291595459, "num_tokens": 16344756.0, "step": 3140, "train/ce_loss": 0.6385220289230347 }, { "epoch": 0.31046074747874236, "step": 3140, "train/sim_loss": 0.10546875 }, { "epoch": 0.31046074747874236, "step": 3140, "train/total_loss": 0.1693209558725357 }, { "entropy": 9.397228240966797, "epoch": 0.31055962032825785, "mean_token_accuracy": 0.7478134036064148, "num_tokens": 16349847.0, "step": 3141, "train/ce_loss": 1.0517762899398804 }, { "epoch": 0.31055962032825785, "step": 3141, "train/sim_loss": 0.0625 }, { "epoch": 0.31055962032825785, "step": 3141, "train/total_loss": 0.167677640914917 }, { "entropy": 9.273998260498047, "epoch": 0.3106584931777734, "mean_token_accuracy": 0.7318840622901917, "num_tokens": 16354945.0, "step": 3142, "train/ce_loss": 1.263601541519165 }, { "epoch": 0.3106584931777734, "step": 3142, "train/sim_loss": 0.1015625 }, { "epoch": 0.3106584931777734, "step": 3142, "train/total_loss": 0.22792266309261322 }, { "entropy": 9.283475875854492, "epoch": 0.31075736602728893, "mean_token_accuracy": 0.7634561061859131, "num_tokens": 16360131.0, "step": 3143, "train/ce_loss": 0.5632201433181763 }, { "epoch": 0.31075736602728893, "step": 3143, "train/sim_loss": 0.078125 }, { "epoch": 0.31075736602728893, "step": 3143, "train/total_loss": 0.13444700837135315 }, { "entropy": 9.175178527832031, "epoch": 0.3108562388768044, "mean_token_accuracy": 0.7473053932189941, "num_tokens": 16365359.0, "step": 3144, "train/ce_loss": 0.8894725441932678 }, { "epoch": 0.3108562388768044, "step": 3144, "train/sim_loss": 0.05078125 }, { "epoch": 0.3108562388768044, "step": 3144, "train/total_loss": 0.13972851634025574 }, { "entropy": 9.034858703613281, "epoch": 0.31095511172631995, "mean_token_accuracy": 0.7553443908691406, "num_tokens": 16370678.0, "step": 3145, "train/ce_loss": 0.49990347027778625 }, { "epoch": 0.31095511172631995, "step": 3145, "train/sim_loss": 0.0546875 }, { "epoch": 0.31095511172631995, "step": 3145, "train/total_loss": 0.10467784851789474 }, { "entropy": 9.623411178588867, "epoch": 0.3110539845758355, "mean_token_accuracy": 0.7321428656578064, "num_tokens": 16375657.0, "step": 3146, "train/ce_loss": 5.4201768762141e-06 }, { "epoch": 0.3110539845758355, "step": 3146, "train/sim_loss": 0.04296875 }, { "epoch": 0.3110539845758355, "step": 3146, "train/total_loss": 0.04296929016709328 }, { "entropy": 9.056873321533203, "epoch": 0.311152857425351, "mean_token_accuracy": 0.7586206793785095, "num_tokens": 16381050.0, "step": 3147, "train/ce_loss": 0.5513096451759338 }, { "epoch": 0.311152857425351, "step": 3147, "train/sim_loss": 0.05078125 }, { "epoch": 0.311152857425351, "step": 3147, "train/total_loss": 0.1059122160077095 }, { "entropy": 9.472996711730957, "epoch": 0.3112517302748665, "mean_token_accuracy": 0.7312977313995361, "num_tokens": 16386171.0, "step": 3148, "train/ce_loss": 5.088227680971613e-06 }, { "epoch": 0.3112517302748665, "step": 3148, "train/sim_loss": 0.14453125 }, { "epoch": 0.3112517302748665, "step": 3148, "train/total_loss": 0.1445317566394806 }, { "entropy": 9.353708267211914, "epoch": 0.31135060312438206, "mean_token_accuracy": 0.698727011680603, "num_tokens": 16391339.0, "step": 3149, "train/ce_loss": 1.2778841257095337 }, { "epoch": 0.31135060312438206, "step": 3149, "train/sim_loss": 0.09375 }, { "epoch": 0.31135060312438206, "step": 3149, "train/total_loss": 0.22153840959072113 }, { "entropy": 9.361830711364746, "epoch": 0.31144947597389755, "mean_token_accuracy": 0.6990740895271301, "num_tokens": 16396433.0, "step": 3150, "train/ce_loss": 1.6144299507141113 }, { "epoch": 0.31144947597389755, "step": 3150, "train/sim_loss": 0.125 }, { "epoch": 0.31144947597389755, "step": 3150, "train/total_loss": 0.28644299507141113 }, { "entropy": 9.052051544189453, "epoch": 0.3115483488234131, "mean_token_accuracy": 0.7449344396591187, "num_tokens": 16401689.0, "step": 3151, "train/ce_loss": 0.8088662624359131 }, { "epoch": 0.3115483488234131, "step": 3151, "train/sim_loss": 0.109375 }, { "epoch": 0.3115483488234131, "step": 3151, "train/total_loss": 0.1902616322040558 }, { "entropy": 9.249351501464844, "epoch": 0.31164722167292863, "mean_token_accuracy": 0.7616580128669739, "num_tokens": 16406969.0, "step": 3152, "train/ce_loss": 0.9997432231903076 }, { "epoch": 0.31164722167292863, "step": 3152, "train/sim_loss": 0.1015625 }, { "epoch": 0.31164722167292863, "step": 3152, "train/total_loss": 0.20153683423995972 }, { "entropy": 9.221841812133789, "epoch": 0.3117460945224441, "mean_token_accuracy": 0.6978609561920166, "num_tokens": 16412160.0, "step": 3153, "train/ce_loss": 1.0066462755203247 }, { "epoch": 0.3117460945224441, "step": 3153, "train/sim_loss": 0.0859375 }, { "epoch": 0.3117460945224441, "step": 3153, "train/total_loss": 0.1866021305322647 }, { "entropy": 9.281400680541992, "epoch": 0.31184496737195966, "mean_token_accuracy": 0.7664835453033447, "num_tokens": 16417330.0, "step": 3154, "train/ce_loss": 0.7727817296981812 }, { "epoch": 0.31184496737195966, "step": 3154, "train/sim_loss": 0.03125 }, { "epoch": 0.31184496737195966, "step": 3154, "train/total_loss": 0.10852817445993423 }, { "entropy": 8.738920211791992, "epoch": 0.3119438402214752, "mean_token_accuracy": 0.7442424297332764, "num_tokens": 16422602.0, "step": 3155, "train/ce_loss": 0.9020947217941284 }, { "epoch": 0.3119438402214752, "step": 3155, "train/sim_loss": 0.0625 }, { "epoch": 0.3119438402214752, "step": 3155, "train/total_loss": 0.1527094841003418 }, { "entropy": 9.108892440795898, "epoch": 0.3120427130709907, "mean_token_accuracy": 0.743682324886322, "num_tokens": 16427880.0, "step": 3156, "train/ce_loss": 0.610110342502594 }, { "epoch": 0.3120427130709907, "step": 3156, "train/sim_loss": 0.07421875 }, { "epoch": 0.3120427130709907, "step": 3156, "train/total_loss": 0.13522978127002716 }, { "entropy": 8.891284942626953, "epoch": 0.3121415859205062, "mean_token_accuracy": 0.7242798209190369, "num_tokens": 16433372.0, "step": 3157, "train/ce_loss": 0.8658456206321716 }, { "epoch": 0.3121415859205062, "step": 3157, "train/sim_loss": 0.0859375 }, { "epoch": 0.3121415859205062, "step": 3157, "train/total_loss": 0.17252206802368164 }, { "entropy": 9.185648918151855, "epoch": 0.31224045877002177, "mean_token_accuracy": 0.7341935634613037, "num_tokens": 16438572.0, "step": 3158, "train/ce_loss": 0.590491771697998 }, { "epoch": 0.31224045877002177, "step": 3158, "train/sim_loss": 0.03515625 }, { "epoch": 0.31224045877002177, "step": 3158, "train/total_loss": 0.09420542418956757 }, { "entropy": 9.610228538513184, "epoch": 0.31233933161953725, "mean_token_accuracy": 0.6815742254257202, "num_tokens": 16443682.0, "step": 3159, "train/ce_loss": 0.9321441054344177 }, { "epoch": 0.31233933161953725, "step": 3159, "train/sim_loss": 0.109375 }, { "epoch": 0.31233933161953725, "step": 3159, "train/total_loss": 0.20258942246437073 }, { "epoch": 0.3124382044690528, "grad_norm": 0.9434493780136108, "learning_rate": 9.221431043860951e-06, "loss": 0.1525, "step": 3160 }, { "entropy": 8.815746307373047, "epoch": 0.3124382044690528, "mean_token_accuracy": 0.7195301055908203, "num_tokens": 16448886.0, "step": 3160, "train/ce_loss": 0.8756749033927917 }, { "epoch": 0.3124382044690528, "step": 3160, "train/sim_loss": 0.08203125 }, { "epoch": 0.3124382044690528, "step": 3160, "train/total_loss": 0.1695987433195114 }, { "entropy": 9.118791580200195, "epoch": 0.31253707731856833, "mean_token_accuracy": 0.7247259616851807, "num_tokens": 16454171.0, "step": 3161, "train/ce_loss": 1.127454161643982 }, { "epoch": 0.31253707731856833, "step": 3161, "train/sim_loss": 0.0546875 }, { "epoch": 0.31253707731856833, "step": 3161, "train/total_loss": 0.16743291914463043 }, { "entropy": 9.115190505981445, "epoch": 0.3126359501680838, "mean_token_accuracy": 0.723122239112854, "num_tokens": 16459335.0, "step": 3162, "train/ce_loss": 1.120043158531189 }, { "epoch": 0.3126359501680838, "step": 3162, "train/sim_loss": 0.078125 }, { "epoch": 0.3126359501680838, "step": 3162, "train/total_loss": 0.19012930989265442 }, { "entropy": 9.974798202514648, "epoch": 0.31273482301759936, "mean_token_accuracy": 0.6629955768585205, "num_tokens": 16464216.0, "step": 3163, "train/ce_loss": 2.5433461666107178 }, { "epoch": 0.31273482301759936, "step": 3163, "train/sim_loss": 0.109375 }, { "epoch": 0.31273482301759936, "step": 3163, "train/total_loss": 0.36370962858200073 }, { "entropy": 9.074464797973633, "epoch": 0.3128336958671149, "mean_token_accuracy": 0.7586981058120728, "num_tokens": 16469597.0, "step": 3164, "train/ce_loss": 0.9268357157707214 }, { "epoch": 0.3128336958671149, "step": 3164, "train/sim_loss": 0.04296875 }, { "epoch": 0.3128336958671149, "step": 3164, "train/total_loss": 0.1356523334980011 }, { "entropy": 8.859774589538574, "epoch": 0.3129325687166304, "mean_token_accuracy": 0.7575107216835022, "num_tokens": 16475021.0, "step": 3165, "train/ce_loss": 0.7807568907737732 }, { "epoch": 0.3129325687166304, "step": 3165, "train/sim_loss": 0.05078125 }, { "epoch": 0.3129325687166304, "step": 3165, "train/total_loss": 0.12885694205760956 }, { "entropy": 9.364564895629883, "epoch": 0.31303144156614593, "mean_token_accuracy": 0.8024523258209229, "num_tokens": 16480188.0, "step": 3166, "train/ce_loss": 2.2346878267853754e-06 }, { "epoch": 0.31303144156614593, "step": 3166, "train/sim_loss": 0.01953125 }, { "epoch": 0.31303144156614593, "step": 3166, "train/total_loss": 0.019531473517417908 }, { "entropy": 9.303370475769043, "epoch": 0.31313031441566147, "mean_token_accuracy": 0.7154762148857117, "num_tokens": 16485415.0, "step": 3167, "train/ce_loss": 1.0272390842437744 }, { "epoch": 0.31313031441566147, "step": 3167, "train/sim_loss": 0.06640625 }, { "epoch": 0.31313031441566147, "step": 3167, "train/total_loss": 0.16913016140460968 }, { "entropy": 9.708673477172852, "epoch": 0.31322918726517696, "mean_token_accuracy": 0.7376146912574768, "num_tokens": 16490363.0, "step": 3168, "train/ce_loss": 0.5770002603530884 }, { "epoch": 0.31322918726517696, "step": 3168, "train/sim_loss": 0.0703125 }, { "epoch": 0.31322918726517696, "step": 3168, "train/total_loss": 0.1280125230550766 }, { "entropy": 9.506532669067383, "epoch": 0.3133280601146925, "mean_token_accuracy": 0.6943164467811584, "num_tokens": 16495481.0, "step": 3169, "train/ce_loss": 0.9109711647033691 }, { "epoch": 0.3133280601146925, "step": 3169, "train/sim_loss": 0.09765625 }, { "epoch": 0.3133280601146925, "step": 3169, "train/total_loss": 0.18875336647033691 }, { "entropy": 9.18989372253418, "epoch": 0.31342693296420804, "mean_token_accuracy": 0.7279999852180481, "num_tokens": 16500676.0, "step": 3170, "train/ce_loss": 0.7564029097557068 }, { "epoch": 0.31342693296420804, "step": 3170, "train/sim_loss": 0.078125 }, { "epoch": 0.31342693296420804, "step": 3170, "train/total_loss": 0.15376529097557068 }, { "entropy": 9.426542282104492, "epoch": 0.3135258058137236, "mean_token_accuracy": 0.7111716866493225, "num_tokens": 16505864.0, "step": 3171, "train/ce_loss": 0.4606419801712036 }, { "epoch": 0.3135258058137236, "step": 3171, "train/sim_loss": 0.0703125 }, { "epoch": 0.3135258058137236, "step": 3171, "train/total_loss": 0.11637669801712036 }, { "entropy": 9.041022300720215, "epoch": 0.31362467866323906, "mean_token_accuracy": 0.7745803594589233, "num_tokens": 16511167.0, "step": 3172, "train/ce_loss": 0.7756233215332031 }, { "epoch": 0.31362467866323906, "step": 3172, "train/sim_loss": 0.07421875 }, { "epoch": 0.31362467866323906, "step": 3172, "train/total_loss": 0.1517810821533203 }, { "entropy": 9.487861633300781, "epoch": 0.3137235515127546, "mean_token_accuracy": 0.7473867535591125, "num_tokens": 16516217.0, "step": 3173, "train/ce_loss": 1.0670230388641357 }, { "epoch": 0.3137235515127546, "step": 3173, "train/sim_loss": 0.046875 }, { "epoch": 0.3137235515127546, "step": 3173, "train/total_loss": 0.1535772979259491 }, { "entropy": 9.665082931518555, "epoch": 0.31382242436227015, "mean_token_accuracy": 0.7163904309272766, "num_tokens": 16521222.0, "step": 3174, "train/ce_loss": 1.644775390625 }, { "epoch": 0.31382242436227015, "step": 3174, "train/sim_loss": 0.1171875 }, { "epoch": 0.31382242436227015, "step": 3174, "train/total_loss": 0.28166502714157104 }, { "entropy": 8.908178329467773, "epoch": 0.31392129721178563, "mean_token_accuracy": 0.7110311985015869, "num_tokens": 16526518.0, "step": 3175, "train/ce_loss": 0.7185203433036804 }, { "epoch": 0.31392129721178563, "step": 3175, "train/sim_loss": 0.0234375 }, { "epoch": 0.31392129721178563, "step": 3175, "train/total_loss": 0.09528953582048416 }, { "entropy": 9.595062255859375, "epoch": 0.3140201700613012, "mean_token_accuracy": 0.6961538195610046, "num_tokens": 16531477.0, "step": 3176, "train/ce_loss": 0.9679924845695496 }, { "epoch": 0.3140201700613012, "step": 3176, "train/sim_loss": 0.03125 }, { "epoch": 0.3140201700613012, "step": 3176, "train/total_loss": 0.12804925441741943 }, { "entropy": 8.992053985595703, "epoch": 0.3141190429108167, "mean_token_accuracy": 0.7331838607788086, "num_tokens": 16536837.0, "step": 3177, "train/ce_loss": 0.8361531496047974 }, { "epoch": 0.3141190429108167, "step": 3177, "train/sim_loss": 0.08203125 }, { "epoch": 0.3141190429108167, "step": 3177, "train/total_loss": 0.16564656794071198 }, { "entropy": 9.319987297058105, "epoch": 0.3142179157603322, "mean_token_accuracy": 0.7346683144569397, "num_tokens": 16542118.0, "step": 3178, "train/ce_loss": 1.495343804359436 }, { "epoch": 0.3142179157603322, "step": 3178, "train/sim_loss": 0.0859375 }, { "epoch": 0.3142179157603322, "step": 3178, "train/total_loss": 0.23547188937664032 }, { "entropy": 9.595830917358398, "epoch": 0.31431678860984774, "mean_token_accuracy": 0.7257575988769531, "num_tokens": 16547215.0, "step": 3179, "train/ce_loss": 0.7838876247406006 }, { "epoch": 0.31431678860984774, "step": 3179, "train/sim_loss": 0.0703125 }, { "epoch": 0.31431678860984774, "step": 3179, "train/total_loss": 0.1487012654542923 }, { "epoch": 0.3144156614593633, "grad_norm": 0.7691072225570679, "learning_rate": 9.216486179103003e-06, "loss": 0.1531, "step": 3180 }, { "entropy": 9.449749946594238, "epoch": 0.3144156614593633, "mean_token_accuracy": 0.6988636255264282, "num_tokens": 16552342.0, "step": 3180, "train/ce_loss": 1.6050139665603638 }, { "epoch": 0.3144156614593633, "step": 3180, "train/sim_loss": 0.1171875 }, { "epoch": 0.3144156614593633, "step": 3180, "train/total_loss": 0.2776889204978943 }, { "entropy": 8.785375595092773, "epoch": 0.31451453430887877, "mean_token_accuracy": 0.7557436227798462, "num_tokens": 16557692.0, "step": 3181, "train/ce_loss": 1.0190057754516602 }, { "epoch": 0.31451453430887877, "step": 3181, "train/sim_loss": 0.0546875 }, { "epoch": 0.31451453430887877, "step": 3181, "train/total_loss": 0.15658807754516602 }, { "entropy": 10.074111938476562, "epoch": 0.3146134071583943, "mean_token_accuracy": 0.7526881694793701, "num_tokens": 16562523.0, "step": 3182, "train/ce_loss": 1.4351893663406372 }, { "epoch": 0.3146134071583943, "step": 3182, "train/sim_loss": 0.046875 }, { "epoch": 0.3146134071583943, "step": 3182, "train/total_loss": 0.19039393961429596 }, { "entropy": 9.068748474121094, "epoch": 0.31471228000790985, "mean_token_accuracy": 0.7479191422462463, "num_tokens": 16567865.0, "step": 3183, "train/ce_loss": 0.7203229665756226 }, { "epoch": 0.31471228000790985, "step": 3183, "train/sim_loss": 0.12890625 }, { "epoch": 0.31471228000790985, "step": 3183, "train/total_loss": 0.20093855261802673 }, { "entropy": 9.319864273071289, "epoch": 0.31481115285742534, "mean_token_accuracy": 0.7212205529212952, "num_tokens": 16573022.0, "step": 3184, "train/ce_loss": 0.5705752372741699 }, { "epoch": 0.31481115285742534, "step": 3184, "train/sim_loss": 0.06640625 }, { "epoch": 0.31481115285742534, "step": 3184, "train/total_loss": 0.12346377968788147 }, { "entropy": 8.951447486877441, "epoch": 0.3149100257069409, "mean_token_accuracy": 0.7768691778182983, "num_tokens": 16578319.0, "step": 3185, "train/ce_loss": 0.7091139554977417 }, { "epoch": 0.3149100257069409, "step": 3185, "train/sim_loss": 0.01953125 }, { "epoch": 0.3149100257069409, "step": 3185, "train/total_loss": 0.09044265002012253 }, { "entropy": 9.045388221740723, "epoch": 0.3150088985564564, "mean_token_accuracy": 0.6927710771560669, "num_tokens": 16583633.0, "step": 3186, "train/ce_loss": 0.8720293045043945 }, { "epoch": 0.3150088985564564, "step": 3186, "train/sim_loss": 0.11328125 }, { "epoch": 0.3150088985564564, "step": 3186, "train/total_loss": 0.20048418641090393 }, { "entropy": 8.728281021118164, "epoch": 0.3151077714059719, "mean_token_accuracy": 0.7602880597114563, "num_tokens": 16589125.0, "step": 3187, "train/ce_loss": 0.6601799726486206 }, { "epoch": 0.3151077714059719, "step": 3187, "train/sim_loss": 0.03125 }, { "epoch": 0.3151077714059719, "step": 3187, "train/total_loss": 0.0972680002450943 }, { "entropy": 9.076705932617188, "epoch": 0.31520664425548744, "mean_token_accuracy": 0.7634961605072021, "num_tokens": 16594360.0, "step": 3188, "train/ce_loss": 0.8506819605827332 }, { "epoch": 0.31520664425548744, "step": 3188, "train/sim_loss": 0.05859375 }, { "epoch": 0.31520664425548744, "step": 3188, "train/total_loss": 0.14366194605827332 }, { "entropy": 8.745386123657227, "epoch": 0.315305517105003, "mean_token_accuracy": 0.7200435996055603, "num_tokens": 16599729.0, "step": 3189, "train/ce_loss": 1.0710463523864746 }, { "epoch": 0.315305517105003, "step": 3189, "train/sim_loss": 0.078125 }, { "epoch": 0.315305517105003, "step": 3189, "train/total_loss": 0.18522962927818298 }, { "entropy": 9.436115264892578, "epoch": 0.31540438995451847, "mean_token_accuracy": 0.703125, "num_tokens": 16604919.0, "step": 3190, "train/ce_loss": 2.183468818664551 }, { "epoch": 0.31540438995451847, "step": 3190, "train/sim_loss": 0.10546875 }, { "epoch": 0.31540438995451847, "step": 3190, "train/total_loss": 0.32381564378738403 }, { "entropy": 9.276498794555664, "epoch": 0.315503262804034, "mean_token_accuracy": 0.7179487347602844, "num_tokens": 16610501.0, "step": 3191, "train/ce_loss": 0.4966878294944763 }, { "epoch": 0.315503262804034, "step": 3191, "train/sim_loss": 0.03125 }, { "epoch": 0.315503262804034, "step": 3191, "train/total_loss": 0.08091878890991211 }, { "entropy": 8.970152854919434, "epoch": 0.31560213565354955, "mean_token_accuracy": 0.6796785593032837, "num_tokens": 16615819.0, "step": 3192, "train/ce_loss": 0.5181173086166382 }, { "epoch": 0.31560213565354955, "step": 3192, "train/sim_loss": 0.0390625 }, { "epoch": 0.31560213565354955, "step": 3192, "train/total_loss": 0.09087423235177994 }, { "entropy": 9.527421951293945, "epoch": 0.31570100850306504, "mean_token_accuracy": 0.7154471278190613, "num_tokens": 16620892.0, "step": 3193, "train/ce_loss": 0.9545297026634216 }, { "epoch": 0.31570100850306504, "step": 3193, "train/sim_loss": 0.10546875 }, { "epoch": 0.31570100850306504, "step": 3193, "train/total_loss": 0.20092171430587769 }, { "entropy": 9.597978591918945, "epoch": 0.3157998813525806, "mean_token_accuracy": 0.7486534714698792, "num_tokens": 16625888.0, "step": 3194, "train/ce_loss": 0.6437567472457886 }, { "epoch": 0.3157998813525806, "step": 3194, "train/sim_loss": 0.05859375 }, { "epoch": 0.3157998813525806, "step": 3194, "train/total_loss": 0.12296942621469498 }, { "entropy": 8.84429931640625, "epoch": 0.3158987542020961, "mean_token_accuracy": 0.7665369510650635, "num_tokens": 16631432.0, "step": 3195, "train/ce_loss": 0.8552486896514893 }, { "epoch": 0.3158987542020961, "step": 3195, "train/sim_loss": 0.0625 }, { "epoch": 0.3158987542020961, "step": 3195, "train/total_loss": 0.14802487194538116 }, { "entropy": 9.271564483642578, "epoch": 0.3159976270516116, "mean_token_accuracy": 0.750952959060669, "num_tokens": 16636665.0, "step": 3196, "train/ce_loss": 0.6849063634872437 }, { "epoch": 0.3159976270516116, "step": 3196, "train/sim_loss": 0.078125 }, { "epoch": 0.3159976270516116, "step": 3196, "train/total_loss": 0.1466156393289566 }, { "entropy": 9.034075736999512, "epoch": 0.31609649990112715, "mean_token_accuracy": 0.7160919308662415, "num_tokens": 16642031.0, "step": 3197, "train/ce_loss": 0.8857916593551636 }, { "epoch": 0.31609649990112715, "step": 3197, "train/sim_loss": 0.05078125 }, { "epoch": 0.31609649990112715, "step": 3197, "train/total_loss": 0.1393604278564453 }, { "entropy": 9.200294494628906, "epoch": 0.3161953727506427, "mean_token_accuracy": 0.7126168012619019, "num_tokens": 16647359.0, "step": 3198, "train/ce_loss": 1.015448808670044 }, { "epoch": 0.3161953727506427, "step": 3198, "train/sim_loss": 0.046875 }, { "epoch": 0.3161953727506427, "step": 3198, "train/total_loss": 0.14841988682746887 }, { "entropy": 9.951388359069824, "epoch": 0.3162942456001582, "mean_token_accuracy": 0.7136563658714294, "num_tokens": 16652276.0, "step": 3199, "train/ce_loss": 1.0957013368606567 }, { "epoch": 0.3162942456001582, "step": 3199, "train/sim_loss": 0.04296875 }, { "epoch": 0.3162942456001582, "step": 3199, "train/total_loss": 0.15253889560699463 }, { "epoch": 0.3163931184496737, "grad_norm": 0.8756003379821777, "learning_rate": 9.211541314345054e-06, "loss": 0.1548, "step": 3200 }, { "entropy": 9.194119453430176, "epoch": 0.3163931184496737, "mean_token_accuracy": 0.7058823704719543, "num_tokens": 16657483.0, "step": 3200, "train/ce_loss": 0.7935322523117065 }, { "epoch": 0.3163931184496737, "step": 3200, "train/sim_loss": 0.046875 }, { "epoch": 0.3163931184496737, "step": 3200, "train/total_loss": 0.1262282282114029 }, { "entropy": 9.994163513183594, "epoch": 0.31649199129918926, "mean_token_accuracy": 0.756302535533905, "num_tokens": 16662287.0, "step": 3201, "train/ce_loss": 7.051830834825523e-06 }, { "epoch": 0.31649199129918926, "step": 3201, "train/sim_loss": 0.04296875 }, { "epoch": 0.31649199129918926, "step": 3201, "train/total_loss": 0.04296945407986641 }, { "entropy": 9.194380760192871, "epoch": 0.31659086414870474, "mean_token_accuracy": 0.7325870394706726, "num_tokens": 16667538.0, "step": 3202, "train/ce_loss": 1.6637096405029297 }, { "epoch": 0.31659086414870474, "step": 3202, "train/sim_loss": 0.08984375 }, { "epoch": 0.31659086414870474, "step": 3202, "train/total_loss": 0.2562147378921509 }, { "entropy": 9.233200073242188, "epoch": 0.3166897369982203, "mean_token_accuracy": 0.7308584451675415, "num_tokens": 16672857.0, "step": 3203, "train/ce_loss": 0.989677369594574 }, { "epoch": 0.3166897369982203, "step": 3203, "train/sim_loss": 0.08984375 }, { "epoch": 0.3166897369982203, "step": 3203, "train/total_loss": 0.18881148099899292 }, { "entropy": 8.693717956542969, "epoch": 0.3167886098477358, "mean_token_accuracy": 0.7471697926521301, "num_tokens": 16678392.0, "step": 3204, "train/ce_loss": 0.99169921875 }, { "epoch": 0.3167886098477358, "step": 3204, "train/sim_loss": 0.0859375 }, { "epoch": 0.3167886098477358, "step": 3204, "train/total_loss": 0.18510742485523224 }, { "entropy": 9.406003952026367, "epoch": 0.3168874826972513, "mean_token_accuracy": 0.7399463653564453, "num_tokens": 16683567.0, "step": 3205, "train/ce_loss": 1.0676230192184448 }, { "epoch": 0.3168874826972513, "step": 3205, "train/sim_loss": 0.06640625 }, { "epoch": 0.3168874826972513, "step": 3205, "train/total_loss": 0.17316855490207672 }, { "entropy": 9.885889053344727, "epoch": 0.31698635554676685, "mean_token_accuracy": 0.6584269404411316, "num_tokens": 16688440.0, "step": 3206, "train/ce_loss": 1.2701112031936646 }, { "epoch": 0.31698635554676685, "step": 3206, "train/sim_loss": 0.0390625 }, { "epoch": 0.31698635554676685, "step": 3206, "train/total_loss": 0.16607362031936646 }, { "entropy": 9.325813293457031, "epoch": 0.3170852283962824, "mean_token_accuracy": 0.7401477694511414, "num_tokens": 16693729.0, "step": 3207, "train/ce_loss": 1.0808814764022827 }, { "epoch": 0.3170852283962824, "step": 3207, "train/sim_loss": 0.046875 }, { "epoch": 0.3170852283962824, "step": 3207, "train/total_loss": 0.1549631506204605 }, { "entropy": 9.387101173400879, "epoch": 0.3171841012457979, "mean_token_accuracy": 0.7160000205039978, "num_tokens": 16698937.0, "step": 3208, "train/ce_loss": 0.9226993918418884 }, { "epoch": 0.3171841012457979, "step": 3208, "train/sim_loss": 0.08203125 }, { "epoch": 0.3171841012457979, "step": 3208, "train/total_loss": 0.17430119216442108 }, { "entropy": 8.81142520904541, "epoch": 0.3172829740953134, "mean_token_accuracy": 0.7158403992652893, "num_tokens": 16704208.0, "step": 3209, "train/ce_loss": 1.0430004596710205 }, { "epoch": 0.3172829740953134, "step": 3209, "train/sim_loss": 0.0546875 }, { "epoch": 0.3172829740953134, "step": 3209, "train/total_loss": 0.15898755192756653 }, { "entropy": 9.000001907348633, "epoch": 0.31738184694482896, "mean_token_accuracy": 0.7566702365875244, "num_tokens": 16709636.0, "step": 3210, "train/ce_loss": 0.6519407629966736 }, { "epoch": 0.31738184694482896, "step": 3210, "train/sim_loss": 0.03125 }, { "epoch": 0.31738184694482896, "step": 3210, "train/total_loss": 0.09644407778978348 }, { "entropy": 9.263021469116211, "epoch": 0.31748071979434445, "mean_token_accuracy": 0.756926953792572, "num_tokens": 16714887.0, "step": 3211, "train/ce_loss": 0.7199892401695251 }, { "epoch": 0.31748071979434445, "step": 3211, "train/sim_loss": 0.078125 }, { "epoch": 0.31748071979434445, "step": 3211, "train/total_loss": 0.15012392401695251 }, { "entropy": 8.933192253112793, "epoch": 0.31757959264386, "mean_token_accuracy": 0.769487738609314, "num_tokens": 16720282.0, "step": 3212, "train/ce_loss": 0.8922109603881836 }, { "epoch": 0.31757959264386, "step": 3212, "train/sim_loss": 0.02734375 }, { "epoch": 0.31757959264386, "step": 3212, "train/total_loss": 0.11656484752893448 }, { "entropy": 8.905010223388672, "epoch": 0.31767846549337553, "mean_token_accuracy": 0.7305524349212646, "num_tokens": 16725652.0, "step": 3213, "train/ce_loss": 0.9052978754043579 }, { "epoch": 0.31767846549337553, "step": 3213, "train/sim_loss": 0.0625 }, { "epoch": 0.31767846549337553, "step": 3213, "train/total_loss": 0.15302979946136475 }, { "entropy": 10.404428482055664, "epoch": 0.31777733834289107, "mean_token_accuracy": 0.8070175647735596, "num_tokens": 16730211.0, "step": 3214, "train/ce_loss": 0.00039690217818133533 }, { "epoch": 0.31777733834289107, "step": 3214, "train/sim_loss": 0.05078125 }, { "epoch": 0.31777733834289107, "step": 3214, "train/total_loss": 0.05082093924283981 }, { "entropy": 8.677364349365234, "epoch": 0.31787621119240655, "mean_token_accuracy": 0.7495238184928894, "num_tokens": 16735741.0, "step": 3215, "train/ce_loss": 1.1565896272659302 }, { "epoch": 0.31787621119240655, "step": 3215, "train/sim_loss": 0.0234375 }, { "epoch": 0.31787621119240655, "step": 3215, "train/total_loss": 0.1390964686870575 }, { "entropy": 9.153155326843262, "epoch": 0.3179750840419221, "mean_token_accuracy": 0.7590027451515198, "num_tokens": 16740934.0, "step": 3216, "train/ce_loss": 0.5160759091377258 }, { "epoch": 0.3179750840419221, "step": 3216, "train/sim_loss": 0.0546875 }, { "epoch": 0.3179750840419221, "step": 3216, "train/total_loss": 0.10629509389400482 }, { "entropy": 9.285148620605469, "epoch": 0.31807395689143764, "mean_token_accuracy": 0.7387499809265137, "num_tokens": 16746179.0, "step": 3217, "train/ce_loss": 1.7083173990249634 }, { "epoch": 0.31807395689143764, "step": 3217, "train/sim_loss": 0.07421875 }, { "epoch": 0.31807395689143764, "step": 3217, "train/total_loss": 0.24505048990249634 }, { "entropy": 10.631352424621582, "epoch": 0.3181728297409531, "mean_token_accuracy": 0.7166666388511658, "num_tokens": 16750712.0, "step": 3218, "train/ce_loss": 1.9401524696149863e-05 }, { "epoch": 0.3181728297409531, "step": 3218, "train/sim_loss": 0.0234375 }, { "epoch": 0.3181728297409531, "step": 3218, "train/total_loss": 0.0234394408762455 }, { "entropy": 10.037153244018555, "epoch": 0.31827170259046866, "mean_token_accuracy": 0.7359412908554077, "num_tokens": 16755485.0, "step": 3219, "train/ce_loss": 4.458036073629046e-06 }, { "epoch": 0.31827170259046866, "step": 3219, "train/sim_loss": 0.0234375 }, { "epoch": 0.31827170259046866, "step": 3219, "train/total_loss": 0.023437945172190666 }, { "epoch": 0.3183705754399842, "grad_norm": 0.8842136263847351, "learning_rate": 9.206596449587104e-06, "loss": 0.1428, "step": 3220 }, { "entropy": 9.433752059936523, "epoch": 0.3183705754399842, "mean_token_accuracy": 0.7534013390541077, "num_tokens": 16760520.0, "step": 3220, "train/ce_loss": 1.7352644205093384 }, { "epoch": 0.3183705754399842, "step": 3220, "train/sim_loss": 0.05859375 }, { "epoch": 0.3183705754399842, "step": 3220, "train/total_loss": 0.23212020099163055 }, { "entropy": 9.202911376953125, "epoch": 0.3184694482894997, "mean_token_accuracy": 0.7560975551605225, "num_tokens": 16765810.0, "step": 3221, "train/ce_loss": 0.5386577844619751 }, { "epoch": 0.3184694482894997, "step": 3221, "train/sim_loss": 0.02734375 }, { "epoch": 0.3184694482894997, "step": 3221, "train/total_loss": 0.08120952546596527 }, { "entropy": 9.050951957702637, "epoch": 0.31856832113901523, "mean_token_accuracy": 0.7635402679443359, "num_tokens": 16771075.0, "step": 3222, "train/ce_loss": 0.6109569668769836 }, { "epoch": 0.31856832113901523, "step": 3222, "train/sim_loss": 0.05078125 }, { "epoch": 0.31856832113901523, "step": 3222, "train/total_loss": 0.1118769496679306 }, { "entropy": 8.684123039245605, "epoch": 0.3186671939885308, "mean_token_accuracy": 0.7515257000923157, "num_tokens": 16776859.0, "step": 3223, "train/ce_loss": 0.7818444967269897 }, { "epoch": 0.3186671939885308, "step": 3223, "train/sim_loss": 0.05859375 }, { "epoch": 0.3186671939885308, "step": 3223, "train/total_loss": 0.13677820563316345 }, { "entropy": 9.767860412597656, "epoch": 0.31876606683804626, "mean_token_accuracy": 0.7011494040489197, "num_tokens": 16781818.0, "step": 3224, "train/ce_loss": 1.282180905342102 }, { "epoch": 0.31876606683804626, "step": 3224, "train/sim_loss": 0.0390625 }, { "epoch": 0.31876606683804626, "step": 3224, "train/total_loss": 0.16728059947490692 }, { "entropy": 9.266471862792969, "epoch": 0.3188649396875618, "mean_token_accuracy": 0.7120822668075562, "num_tokens": 16787079.0, "step": 3225, "train/ce_loss": 0.778668999671936 }, { "epoch": 0.3188649396875618, "step": 3225, "train/sim_loss": 0.0390625 }, { "epoch": 0.3188649396875618, "step": 3225, "train/total_loss": 0.11692940443754196 }, { "entropy": 9.97616958618164, "epoch": 0.31896381253707734, "mean_token_accuracy": 0.7937219738960266, "num_tokens": 16791900.0, "step": 3226, "train/ce_loss": 3.3997418995568296e-06 }, { "epoch": 0.31896381253707734, "step": 3226, "train/sim_loss": 0.0234375 }, { "epoch": 0.31896381253707734, "step": 3226, "train/total_loss": 0.02343784086406231 }, { "entropy": 9.26513957977295, "epoch": 0.3190626853865928, "mean_token_accuracy": 0.7091836929321289, "num_tokens": 16797154.0, "step": 3227, "train/ce_loss": 0.9920081496238708 }, { "epoch": 0.3190626853865928, "step": 3227, "train/sim_loss": 0.125 }, { "epoch": 0.3190626853865928, "step": 3227, "train/total_loss": 0.22420081496238708 }, { "entropy": 9.090368270874023, "epoch": 0.31916155823610837, "mean_token_accuracy": 0.7353723645210266, "num_tokens": 16802313.0, "step": 3228, "train/ce_loss": 1.317256212234497 }, { "epoch": 0.31916155823610837, "step": 3228, "train/sim_loss": 0.09375 }, { "epoch": 0.31916155823610837, "step": 3228, "train/total_loss": 0.22547562420368195 }, { "entropy": 10.012613296508789, "epoch": 0.3192604310856239, "mean_token_accuracy": 0.7511627674102783, "num_tokens": 16807139.0, "step": 3229, "train/ce_loss": 0.7629386186599731 }, { "epoch": 0.3192604310856239, "step": 3229, "train/sim_loss": 0.03515625 }, { "epoch": 0.3192604310856239, "step": 3229, "train/total_loss": 0.11145011335611343 }, { "entropy": 9.738869667053223, "epoch": 0.3193593039351394, "mean_token_accuracy": 0.7278911471366882, "num_tokens": 16812152.0, "step": 3230, "train/ce_loss": 4.822842583962483e-06 }, { "epoch": 0.3193593039351394, "step": 3230, "train/sim_loss": 0.06640625 }, { "epoch": 0.3193593039351394, "step": 3230, "train/total_loss": 0.0664067342877388 }, { "entropy": 9.756131172180176, "epoch": 0.31945817678465493, "mean_token_accuracy": 0.676300585269928, "num_tokens": 16817074.0, "step": 3231, "train/ce_loss": 4.654988060792675e-06 }, { "epoch": 0.31945817678465493, "step": 3231, "train/sim_loss": 0.0859375 }, { "epoch": 0.31945817678465493, "step": 3231, "train/total_loss": 0.08593796193599701 }, { "entropy": 9.703554153442383, "epoch": 0.3195570496341705, "mean_token_accuracy": 0.7551020383834839, "num_tokens": 16822074.0, "step": 3232, "train/ce_loss": 5.237433470028918e-06 }, { "epoch": 0.3195570496341705, "step": 3232, "train/sim_loss": 0.0546875 }, { "epoch": 0.3195570496341705, "step": 3232, "train/total_loss": 0.05468802526593208 }, { "entropy": 9.267679214477539, "epoch": 0.31965592248368596, "mean_token_accuracy": 0.8261421322822571, "num_tokens": 16827324.0, "step": 3233, "train/ce_loss": 0.4245811104774475 }, { "epoch": 0.31965592248368596, "step": 3233, "train/sim_loss": 0.0234375 }, { "epoch": 0.31965592248368596, "step": 3233, "train/total_loss": 0.06589561700820923 }, { "entropy": 8.957054138183594, "epoch": 0.3197547953332015, "mean_token_accuracy": 0.732300877571106, "num_tokens": 16832679.0, "step": 3234, "train/ce_loss": 0.6917397975921631 }, { "epoch": 0.3197547953332015, "step": 3234, "train/sim_loss": 0.08984375 }, { "epoch": 0.3197547953332015, "step": 3234, "train/total_loss": 0.15901774168014526 }, { "entropy": 9.423365592956543, "epoch": 0.31985366818271704, "mean_token_accuracy": 0.6760828495025635, "num_tokens": 16837632.0, "step": 3235, "train/ce_loss": 2.117717981338501 }, { "epoch": 0.31985366818271704, "step": 3235, "train/sim_loss": 0.12109375 }, { "epoch": 0.31985366818271704, "step": 3235, "train/total_loss": 0.33286553621292114 }, { "entropy": 9.442780494689941, "epoch": 0.31995254103223253, "mean_token_accuracy": 0.7576243877410889, "num_tokens": 16842769.0, "step": 3236, "train/ce_loss": 1.2201064825057983 }, { "epoch": 0.31995254103223253, "step": 3236, "train/sim_loss": 0.1171875 }, { "epoch": 0.31995254103223253, "step": 3236, "train/total_loss": 0.23919814825057983 }, { "entropy": 9.171239852905273, "epoch": 0.32005141388174807, "mean_token_accuracy": 0.7342105507850647, "num_tokens": 16848004.0, "step": 3237, "train/ce_loss": 0.7349594831466675 }, { "epoch": 0.32005141388174807, "step": 3237, "train/sim_loss": 0.03125 }, { "epoch": 0.32005141388174807, "step": 3237, "train/total_loss": 0.10474594682455063 }, { "entropy": 9.50808334350586, "epoch": 0.3201502867312636, "mean_token_accuracy": 0.7857142686843872, "num_tokens": 16853036.0, "step": 3238, "train/ce_loss": 0.8079267144203186 }, { "epoch": 0.3201502867312636, "step": 3238, "train/sim_loss": 0.046875 }, { "epoch": 0.3201502867312636, "step": 3238, "train/total_loss": 0.12766766548156738 }, { "entropy": 9.111824989318848, "epoch": 0.3202491595807791, "mean_token_accuracy": 0.7210965156555176, "num_tokens": 16858318.0, "step": 3239, "train/ce_loss": 0.6129790544509888 }, { "epoch": 0.3202491595807791, "step": 3239, "train/sim_loss": 0.01953125 }, { "epoch": 0.3202491595807791, "step": 3239, "train/total_loss": 0.08082915842533112 }, { "epoch": 0.32034803243029464, "grad_norm": 0.7715190052986145, "learning_rate": 9.201651584829155e-06, "loss": 0.1442, "step": 3240 }, { "entropy": 9.485016822814941, "epoch": 0.32034803243029464, "mean_token_accuracy": 0.8025078177452087, "num_tokens": 16863377.0, "step": 3240, "train/ce_loss": 3.0977219012129353e-06 }, { "epoch": 0.32034803243029464, "step": 3240, "train/sim_loss": 0.05859375 }, { "epoch": 0.32034803243029464, "step": 3240, "train/total_loss": 0.05859405919909477 }, { "entropy": 9.315765380859375, "epoch": 0.3204469052798102, "mean_token_accuracy": 0.7462121248245239, "num_tokens": 16868608.0, "step": 3241, "train/ce_loss": 0.7169209718704224 }, { "epoch": 0.3204469052798102, "step": 3241, "train/sim_loss": 0.03515625 }, { "epoch": 0.3204469052798102, "step": 3241, "train/total_loss": 0.1068483516573906 }, { "entropy": 9.324462890625, "epoch": 0.32054577812932566, "mean_token_accuracy": 0.7270408272743225, "num_tokens": 16873884.0, "step": 3242, "train/ce_loss": 1.1520270109176636 }, { "epoch": 0.32054577812932566, "step": 3242, "train/sim_loss": 0.12109375 }, { "epoch": 0.32054577812932566, "step": 3242, "train/total_loss": 0.23629644513130188 }, { "entropy": 8.780170440673828, "epoch": 0.3206446509788412, "mean_token_accuracy": 0.7125129103660583, "num_tokens": 16879468.0, "step": 3243, "train/ce_loss": 1.032793641090393 }, { "epoch": 0.3206446509788412, "step": 3243, "train/sim_loss": 0.08984375 }, { "epoch": 0.3206446509788412, "step": 3243, "train/total_loss": 0.19312311708927155 }, { "entropy": 8.811084747314453, "epoch": 0.32074352382835675, "mean_token_accuracy": 0.6864801645278931, "num_tokens": 16884823.0, "step": 3244, "train/ce_loss": 0.7729099988937378 }, { "epoch": 0.32074352382835675, "step": 3244, "train/sim_loss": 0.09375 }, { "epoch": 0.32074352382835675, "step": 3244, "train/total_loss": 0.17104101181030273 }, { "entropy": 8.963603019714355, "epoch": 0.32084239667787223, "mean_token_accuracy": 0.6889564394950867, "num_tokens": 16890253.0, "step": 3245, "train/ce_loss": 1.0300863981246948 }, { "epoch": 0.32084239667787223, "step": 3245, "train/sim_loss": 0.05078125 }, { "epoch": 0.32084239667787223, "step": 3245, "train/total_loss": 0.15378989279270172 }, { "entropy": 8.786639213562012, "epoch": 0.3209412695273878, "mean_token_accuracy": 0.7233644723892212, "num_tokens": 16895816.0, "step": 3246, "train/ce_loss": 0.6837566494941711 }, { "epoch": 0.3209412695273878, "step": 3246, "train/sim_loss": 0.0859375 }, { "epoch": 0.3209412695273878, "step": 3246, "train/total_loss": 0.15431317687034607 }, { "entropy": 8.63482666015625, "epoch": 0.3210401423769033, "mean_token_accuracy": 0.7578431367874146, "num_tokens": 16901363.0, "step": 3247, "train/ce_loss": 1.2165522575378418 }, { "epoch": 0.3210401423769033, "step": 3247, "train/sim_loss": 0.05078125 }, { "epoch": 0.3210401423769033, "step": 3247, "train/total_loss": 0.17243647575378418 }, { "entropy": 9.373018264770508, "epoch": 0.3211390152264188, "mean_token_accuracy": 0.7123696208000183, "num_tokens": 16906484.0, "step": 3248, "train/ce_loss": 2.632043106132187e-06 }, { "epoch": 0.3211390152264188, "step": 3248, "train/sim_loss": 0.05078125 }, { "epoch": 0.3211390152264188, "step": 3248, "train/total_loss": 0.05078151449561119 }, { "entropy": 8.7036771774292, "epoch": 0.32123788807593434, "mean_token_accuracy": 0.8161478638648987, "num_tokens": 16911995.0, "step": 3249, "train/ce_loss": 0.49719151854515076 }, { "epoch": 0.32123788807593434, "step": 3249, "train/sim_loss": 0.03125 }, { "epoch": 0.32123788807593434, "step": 3249, "train/total_loss": 0.08096915483474731 }, { "entropy": 9.056543350219727, "epoch": 0.3213367609254499, "mean_token_accuracy": 0.7183979749679565, "num_tokens": 16917262.0, "step": 3250, "train/ce_loss": 0.9369495511054993 }, { "epoch": 0.3213367609254499, "step": 3250, "train/sim_loss": 0.05859375 }, { "epoch": 0.3213367609254499, "step": 3250, "train/total_loss": 0.15228870511054993 }, { "entropy": 8.583107948303223, "epoch": 0.32143563377496537, "mean_token_accuracy": 0.6989351511001587, "num_tokens": 16922768.0, "step": 3251, "train/ce_loss": 0.5158079266548157 }, { "epoch": 0.32143563377496537, "step": 3251, "train/sim_loss": 0.05078125 }, { "epoch": 0.32143563377496537, "step": 3251, "train/total_loss": 0.10236204415559769 }, { "entropy": 9.377542495727539, "epoch": 0.3215345066244809, "mean_token_accuracy": 0.7456555962562561, "num_tokens": 16927804.0, "step": 3252, "train/ce_loss": 1.2434115409851074 }, { "epoch": 0.3215345066244809, "step": 3252, "train/sim_loss": 0.05859375 }, { "epoch": 0.3215345066244809, "step": 3252, "train/total_loss": 0.18293491005897522 }, { "entropy": 9.138757705688477, "epoch": 0.32163337947399645, "mean_token_accuracy": 0.7271557450294495, "num_tokens": 16933053.0, "step": 3253, "train/ce_loss": 0.7804774641990662 }, { "epoch": 0.32163337947399645, "step": 3253, "train/sim_loss": 0.0703125 }, { "epoch": 0.32163337947399645, "step": 3253, "train/total_loss": 0.1483602523803711 }, { "entropy": 9.449091911315918, "epoch": 0.321732252323512, "mean_token_accuracy": 0.7568093538284302, "num_tokens": 16938054.0, "step": 3254, "train/ce_loss": 0.8022903800010681 }, { "epoch": 0.321732252323512, "step": 3254, "train/sim_loss": 0.0703125 }, { "epoch": 0.321732252323512, "step": 3254, "train/total_loss": 0.1505415439605713 }, { "entropy": 9.11605453491211, "epoch": 0.3218311251730275, "mean_token_accuracy": 0.7584269642829895, "num_tokens": 16943467.0, "step": 3255, "train/ce_loss": 1.0913804769515991 }, { "epoch": 0.3218311251730275, "step": 3255, "train/sim_loss": 0.05078125 }, { "epoch": 0.3218311251730275, "step": 3255, "train/total_loss": 0.15991929173469543 }, { "entropy": 9.231228828430176, "epoch": 0.321929998022543, "mean_token_accuracy": 0.7351154088973999, "num_tokens": 16948885.0, "step": 3256, "train/ce_loss": 0.8510516285896301 }, { "epoch": 0.321929998022543, "step": 3256, "train/sim_loss": 0.0390625 }, { "epoch": 0.321929998022543, "step": 3256, "train/total_loss": 0.12416766583919525 }, { "entropy": 10.03917407989502, "epoch": 0.32202887087205856, "mean_token_accuracy": 0.7881773114204407, "num_tokens": 16953734.0, "step": 3257, "train/ce_loss": 4.570816599880345e-06 }, { "epoch": 0.32202887087205856, "step": 3257, "train/sim_loss": 0.04296875 }, { "epoch": 0.32202887087205856, "step": 3257, "train/total_loss": 0.04296920821070671 }, { "entropy": 9.223027229309082, "epoch": 0.32212774372157404, "mean_token_accuracy": 0.7317073345184326, "num_tokens": 16958776.0, "step": 3258, "train/ce_loss": 1.3286868333816528 }, { "epoch": 0.32212774372157404, "step": 3258, "train/sim_loss": 0.0703125 }, { "epoch": 0.32212774372157404, "step": 3258, "train/total_loss": 0.203181192278862 }, { "entropy": 9.222864151000977, "epoch": 0.3222266165710896, "mean_token_accuracy": 0.6962190270423889, "num_tokens": 16963943.0, "step": 3259, "train/ce_loss": 1.0266321897506714 }, { "epoch": 0.3222266165710896, "step": 3259, "train/sim_loss": 0.06640625 }, { "epoch": 0.3222266165710896, "step": 3259, "train/total_loss": 0.16906946897506714 }, { "epoch": 0.3223254894206051, "grad_norm": 1.0479072332382202, "learning_rate": 9.196706720071207e-06, "loss": 0.1523, "step": 3260 }, { "entropy": 8.734139442443848, "epoch": 0.3223254894206051, "mean_token_accuracy": 0.774685800075531, "num_tokens": 16969556.0, "step": 3260, "train/ce_loss": 0.42346420884132385 }, { "epoch": 0.3223254894206051, "step": 3260, "train/sim_loss": 0.0234375 }, { "epoch": 0.3223254894206051, "step": 3260, "train/total_loss": 0.06578391790390015 }, { "entropy": 8.73713493347168, "epoch": 0.3224243622701206, "mean_token_accuracy": 0.7389221787452698, "num_tokens": 16974904.0, "step": 3261, "train/ce_loss": 1.1096874475479126 }, { "epoch": 0.3224243622701206, "step": 3261, "train/sim_loss": 0.06640625 }, { "epoch": 0.3224243622701206, "step": 3261, "train/total_loss": 0.17737498879432678 }, { "entropy": 9.207422256469727, "epoch": 0.32252323511963615, "mean_token_accuracy": 0.7205188870429993, "num_tokens": 16980227.0, "step": 3262, "train/ce_loss": 0.8753836750984192 }, { "epoch": 0.32252323511963615, "step": 3262, "train/sim_loss": 0.09765625 }, { "epoch": 0.32252323511963615, "step": 3262, "train/total_loss": 0.18519461154937744 }, { "entropy": 10.055367469787598, "epoch": 0.3226221079691517, "mean_token_accuracy": 0.6889952421188354, "num_tokens": 16985060.0, "step": 3263, "train/ce_loss": 1.4159187078475952 }, { "epoch": 0.3226221079691517, "step": 3263, "train/sim_loss": 0.125 }, { "epoch": 0.3226221079691517, "step": 3263, "train/total_loss": 0.266591876745224 }, { "entropy": 9.026175498962402, "epoch": 0.3227209808186672, "mean_token_accuracy": 0.7289719581604004, "num_tokens": 16990358.0, "step": 3264, "train/ce_loss": 0.7006011605262756 }, { "epoch": 0.3227209808186672, "step": 3264, "train/sim_loss": 0.0390625 }, { "epoch": 0.3227209808186672, "step": 3264, "train/total_loss": 0.1091226190328598 }, { "entropy": 10.099632263183594, "epoch": 0.3228198536681827, "mean_token_accuracy": 0.77173912525177, "num_tokens": 16995104.0, "step": 3265, "train/ce_loss": 6.077885245758807e-06 }, { "epoch": 0.3228198536681827, "step": 3265, "train/sim_loss": 0.05078125 }, { "epoch": 0.3228198536681827, "step": 3265, "train/total_loss": 0.05078185722231865 }, { "entropy": 9.232105255126953, "epoch": 0.32291872651769826, "mean_token_accuracy": 0.724252462387085, "num_tokens": 17000159.0, "step": 3266, "train/ce_loss": 0.7505518198013306 }, { "epoch": 0.32291872651769826, "step": 3266, "train/sim_loss": 0.0390625 }, { "epoch": 0.32291872651769826, "step": 3266, "train/total_loss": 0.11411768198013306 }, { "entropy": 9.911155700683594, "epoch": 0.32301759936721375, "mean_token_accuracy": 0.7787056565284729, "num_tokens": 17005083.0, "step": 3267, "train/ce_loss": 1.1564180850982666 }, { "epoch": 0.32301759936721375, "step": 3267, "train/sim_loss": 0.03125 }, { "epoch": 0.32301759936721375, "step": 3267, "train/total_loss": 0.14689180254936218 }, { "entropy": 9.332996368408203, "epoch": 0.3231164722167293, "mean_token_accuracy": 0.7897648811340332, "num_tokens": 17010302.0, "step": 3268, "train/ce_loss": 0.8998430967330933 }, { "epoch": 0.3231164722167293, "step": 3268, "train/sim_loss": 0.07421875 }, { "epoch": 0.3231164722167293, "step": 3268, "train/total_loss": 0.16420306265354156 }, { "entropy": 9.12976360321045, "epoch": 0.32321534506624483, "mean_token_accuracy": 0.7894737124443054, "num_tokens": 17015635.0, "step": 3269, "train/ce_loss": 0.6157534122467041 }, { "epoch": 0.32321534506624483, "step": 3269, "train/sim_loss": 0.02734375 }, { "epoch": 0.32321534506624483, "step": 3269, "train/total_loss": 0.08891908824443817 }, { "entropy": 9.445563316345215, "epoch": 0.3233142179157603, "mean_token_accuracy": 0.7508772015571594, "num_tokens": 17020678.0, "step": 3270, "train/ce_loss": 0.9528619050979614 }, { "epoch": 0.3233142179157603, "step": 3270, "train/sim_loss": 0.05078125 }, { "epoch": 0.3233142179157603, "step": 3270, "train/total_loss": 0.14606744050979614 }, { "entropy": 9.506591796875, "epoch": 0.32341309076527586, "mean_token_accuracy": 0.7190812826156616, "num_tokens": 17025649.0, "step": 3271, "train/ce_loss": 2.4868256787158316e-06 }, { "epoch": 0.32341309076527586, "step": 3271, "train/sim_loss": 0.01953125 }, { "epoch": 0.32341309076527586, "step": 3271, "train/total_loss": 0.019531499594449997 }, { "entropy": 10.046039581298828, "epoch": 0.3235119636147914, "mean_token_accuracy": 0.7460317611694336, "num_tokens": 17030429.0, "step": 3272, "train/ce_loss": 3.1560873594571603e-06 }, { "epoch": 0.3235119636147914, "step": 3272, "train/sim_loss": 0.03125 }, { "epoch": 0.3235119636147914, "step": 3272, "train/total_loss": 0.03125031664967537 }, { "entropy": 9.029333114624023, "epoch": 0.3236108364643069, "mean_token_accuracy": 0.7631160616874695, "num_tokens": 17035544.0, "step": 3273, "train/ce_loss": 1.2163193225860596 }, { "epoch": 0.3236108364643069, "step": 3273, "train/sim_loss": 0.06640625 }, { "epoch": 0.3236108364643069, "step": 3273, "train/total_loss": 0.1880381852388382 }, { "entropy": 9.000850677490234, "epoch": 0.3237097093138224, "mean_token_accuracy": 0.7205128073692322, "num_tokens": 17040768.0, "step": 3274, "train/ce_loss": 1.0964419841766357 }, { "epoch": 0.3237097093138224, "step": 3274, "train/sim_loss": 0.05859375 }, { "epoch": 0.3237097093138224, "step": 3274, "train/total_loss": 0.16823795437812805 }, { "entropy": 9.481927871704102, "epoch": 0.32380858216333797, "mean_token_accuracy": 0.7527777552604675, "num_tokens": 17045945.0, "step": 3275, "train/ce_loss": 1.2007566690444946 }, { "epoch": 0.32380858216333797, "step": 3275, "train/sim_loss": 0.0546875 }, { "epoch": 0.32380858216333797, "step": 3275, "train/total_loss": 0.17476317286491394 }, { "entropy": 8.948715209960938, "epoch": 0.32390745501285345, "mean_token_accuracy": 0.7540322542190552, "num_tokens": 17051436.0, "step": 3276, "train/ce_loss": 0.9103338122367859 }, { "epoch": 0.32390745501285345, "step": 3276, "train/sim_loss": 0.0625 }, { "epoch": 0.32390745501285345, "step": 3276, "train/total_loss": 0.15353338420391083 }, { "entropy": 9.306329727172852, "epoch": 0.324006327862369, "mean_token_accuracy": 0.7360248565673828, "num_tokens": 17056539.0, "step": 3277, "train/ce_loss": 0.7820467948913574 }, { "epoch": 0.324006327862369, "step": 3277, "train/sim_loss": 0.09375 }, { "epoch": 0.324006327862369, "step": 3277, "train/total_loss": 0.1719546914100647 }, { "entropy": 8.889620780944824, "epoch": 0.32410520071188453, "mean_token_accuracy": 0.6985781788825989, "num_tokens": 17062255.0, "step": 3278, "train/ce_loss": 0.6463266611099243 }, { "epoch": 0.32410520071188453, "step": 3278, "train/sim_loss": 0.0546875 }, { "epoch": 0.32410520071188453, "step": 3278, "train/total_loss": 0.11932016909122467 }, { "entropy": 9.355274200439453, "epoch": 0.3242040735614, "mean_token_accuracy": 0.7539797425270081, "num_tokens": 17067389.0, "step": 3279, "train/ce_loss": 0.6823098659515381 }, { "epoch": 0.3242040735614, "step": 3279, "train/sim_loss": 0.03515625 }, { "epoch": 0.3242040735614, "step": 3279, "train/total_loss": 0.10338723659515381 }, { "epoch": 0.32430294641091556, "grad_norm": 0.8628454208374023, "learning_rate": 9.191761855313257e-06, "loss": 0.1487, "step": 3280 }, { "entropy": 9.686016082763672, "epoch": 0.32430294641091556, "mean_token_accuracy": 0.7093275785446167, "num_tokens": 17072308.0, "step": 3280, "train/ce_loss": 2.1156375408172607 }, { "epoch": 0.32430294641091556, "step": 3280, "train/sim_loss": 0.0703125 }, { "epoch": 0.32430294641091556, "step": 3280, "train/total_loss": 0.28187626600265503 }, { "entropy": 8.727543830871582, "epoch": 0.3244018192604311, "mean_token_accuracy": 0.834343433380127, "num_tokens": 17077830.0, "step": 3281, "train/ce_loss": 0.9242648482322693 }, { "epoch": 0.3244018192604311, "step": 3281, "train/sim_loss": 0.16796875 }, { "epoch": 0.3244018192604311, "step": 3281, "train/total_loss": 0.26039522886276245 }, { "entropy": 9.704157829284668, "epoch": 0.3245006921099466, "mean_token_accuracy": 0.777365505695343, "num_tokens": 17082804.0, "step": 3282, "train/ce_loss": 1.0163599252700806 }, { "epoch": 0.3245006921099466, "step": 3282, "train/sim_loss": 0.15625 }, { "epoch": 0.3245006921099466, "step": 3282, "train/total_loss": 0.25788599252700806 }, { "entropy": 9.161046981811523, "epoch": 0.32459956495946213, "mean_token_accuracy": 0.76106196641922, "num_tokens": 17088103.0, "step": 3283, "train/ce_loss": 0.3052811026573181 }, { "epoch": 0.32459956495946213, "step": 3283, "train/sim_loss": 0.01953125 }, { "epoch": 0.32459956495946213, "step": 3283, "train/total_loss": 0.05005936324596405 }, { "entropy": 9.814420700073242, "epoch": 0.32469843780897767, "mean_token_accuracy": 0.8077753782272339, "num_tokens": 17092978.0, "step": 3284, "train/ce_loss": 9.27206565393135e-06 }, { "epoch": 0.32469843780897767, "step": 3284, "train/sim_loss": 0.046875 }, { "epoch": 0.32469843780897767, "step": 3284, "train/total_loss": 0.04687592759728432 }, { "entropy": 8.795757293701172, "epoch": 0.32479731065849315, "mean_token_accuracy": 0.7015834450721741, "num_tokens": 17098316.0, "step": 3285, "train/ce_loss": 0.6898893713951111 }, { "epoch": 0.32479731065849315, "step": 3285, "train/sim_loss": 0.05078125 }, { "epoch": 0.32479731065849315, "step": 3285, "train/total_loss": 0.11977019160985947 }, { "entropy": 8.67201042175293, "epoch": 0.3248961835080087, "mean_token_accuracy": 0.7737603187561035, "num_tokens": 17103730.0, "step": 3286, "train/ce_loss": 0.6129236221313477 }, { "epoch": 0.3248961835080087, "step": 3286, "train/sim_loss": 0.02734375 }, { "epoch": 0.3248961835080087, "step": 3286, "train/total_loss": 0.088636115193367 }, { "entropy": 8.647956848144531, "epoch": 0.32499505635752424, "mean_token_accuracy": 0.7273631691932678, "num_tokens": 17109229.0, "step": 3287, "train/ce_loss": 0.9534184336662292 }, { "epoch": 0.32499505635752424, "step": 3287, "train/sim_loss": 0.09765625 }, { "epoch": 0.32499505635752424, "step": 3287, "train/total_loss": 0.19299809634685516 }, { "entropy": 9.613130569458008, "epoch": 0.3250939292070397, "mean_token_accuracy": 0.7248120307922363, "num_tokens": 17114307.0, "step": 3288, "train/ce_loss": 1.9667793367261766e-06 }, { "epoch": 0.3250939292070397, "step": 3288, "train/sim_loss": 0.01953125 }, { "epoch": 0.3250939292070397, "step": 3288, "train/total_loss": 0.01953144744038582 }, { "entropy": 9.082571029663086, "epoch": 0.32519280205655526, "mean_token_accuracy": 0.7468208074569702, "num_tokens": 17119641.0, "step": 3289, "train/ce_loss": 0.3608906865119934 }, { "epoch": 0.32519280205655526, "step": 3289, "train/sim_loss": 0.04296875 }, { "epoch": 0.32519280205655526, "step": 3289, "train/total_loss": 0.07905782014131546 }, { "entropy": 9.360298156738281, "epoch": 0.3252916749060708, "mean_token_accuracy": 0.7407407164573669, "num_tokens": 17124711.0, "step": 3290, "train/ce_loss": 0.6039575934410095 }, { "epoch": 0.3252916749060708, "step": 3290, "train/sim_loss": 0.0546875 }, { "epoch": 0.3252916749060708, "step": 3290, "train/total_loss": 0.11508326232433319 }, { "entropy": 9.86099624633789, "epoch": 0.3253905477555863, "mean_token_accuracy": 0.7488687634468079, "num_tokens": 17129567.0, "step": 3291, "train/ce_loss": 1.2142945528030396 }, { "epoch": 0.3253905477555863, "step": 3291, "train/sim_loss": 0.05859375 }, { "epoch": 0.3253905477555863, "step": 3291, "train/total_loss": 0.1800232082605362 }, { "entropy": 8.741572380065918, "epoch": 0.32548942060510183, "mean_token_accuracy": 0.7155172228813171, "num_tokens": 17135114.0, "step": 3292, "train/ce_loss": 0.803793728351593 }, { "epoch": 0.32548942060510183, "step": 3292, "train/sim_loss": 0.06640625 }, { "epoch": 0.32548942060510183, "step": 3292, "train/total_loss": 0.14678561687469482 }, { "entropy": 9.086403846740723, "epoch": 0.3255882934546174, "mean_token_accuracy": 0.796785295009613, "num_tokens": 17140425.0, "step": 3293, "train/ce_loss": 0.5832430720329285 }, { "epoch": 0.3255882934546174, "step": 3293, "train/sim_loss": 0.05078125 }, { "epoch": 0.3255882934546174, "step": 3293, "train/total_loss": 0.10910555720329285 }, { "entropy": 9.40902328491211, "epoch": 0.32568716630413286, "mean_token_accuracy": 0.8159999847412109, "num_tokens": 17145567.0, "step": 3294, "train/ce_loss": 1.3668820884049637e-06 }, { "epoch": 0.32568716630413286, "step": 3294, "train/sim_loss": 0.0234375 }, { "epoch": 0.32568716630413286, "step": 3294, "train/total_loss": 0.023437635973095894 }, { "entropy": 9.294602394104004, "epoch": 0.3257860391536484, "mean_token_accuracy": 0.7448275685310364, "num_tokens": 17150691.0, "step": 3295, "train/ce_loss": 0.8498972058296204 }, { "epoch": 0.3257860391536484, "step": 3295, "train/sim_loss": 0.0546875 }, { "epoch": 0.3257860391536484, "step": 3295, "train/total_loss": 0.1396772265434265 }, { "entropy": 9.750741958618164, "epoch": 0.32588491200316394, "mean_token_accuracy": 0.6804123520851135, "num_tokens": 17155490.0, "step": 3296, "train/ce_loss": 5.232382591202622e-06 }, { "epoch": 0.32588491200316394, "step": 3296, "train/sim_loss": 0.0234375 }, { "epoch": 0.32588491200316394, "step": 3296, "train/total_loss": 0.023438023403286934 }, { "entropy": 9.786356925964355, "epoch": 0.3259837848526795, "mean_token_accuracy": 0.7335701584815979, "num_tokens": 17160469.0, "step": 3297, "train/ce_loss": 1.3135344982147217 }, { "epoch": 0.3259837848526795, "step": 3297, "train/sim_loss": 0.0703125 }, { "epoch": 0.3259837848526795, "step": 3297, "train/total_loss": 0.2016659528017044 }, { "entropy": 8.91646957397461, "epoch": 0.32608265770219497, "mean_token_accuracy": 0.770691990852356, "num_tokens": 17165637.0, "step": 3298, "train/ce_loss": 0.9684990644454956 }, { "epoch": 0.32608265770219497, "step": 3298, "train/sim_loss": 0.12890625 }, { "epoch": 0.32608265770219497, "step": 3298, "train/total_loss": 0.22575616836547852 }, { "entropy": 8.74300479888916, "epoch": 0.3261815305517105, "mean_token_accuracy": 0.7272727489471436, "num_tokens": 17171049.0, "step": 3299, "train/ce_loss": 1.2194308042526245 }, { "epoch": 0.3261815305517105, "step": 3299, "train/sim_loss": 0.08203125 }, { "epoch": 0.3261815305517105, "step": 3299, "train/total_loss": 0.20397433638572693 }, { "epoch": 0.32628040340122605, "grad_norm": 0.7850518822669983, "learning_rate": 9.18681699055531e-06, "loss": 0.1409, "step": 3300 }, { "entropy": 8.887176513671875, "epoch": 0.32628040340122605, "mean_token_accuracy": 0.7696506381034851, "num_tokens": 17176416.0, "step": 3300, "train/ce_loss": 0.8500009775161743 }, { "epoch": 0.32628040340122605, "step": 3300, "train/sim_loss": 0.109375 }, { "epoch": 0.32628040340122605, "step": 3300, "train/total_loss": 0.19437509775161743 }, { "entropy": 9.179983139038086, "epoch": 0.32637927625074153, "mean_token_accuracy": 0.7182254195213318, "num_tokens": 17181676.0, "step": 3301, "train/ce_loss": 0.6478583216667175 }, { "epoch": 0.32637927625074153, "step": 3301, "train/sim_loss": 0.03125 }, { "epoch": 0.32637927625074153, "step": 3301, "train/total_loss": 0.09603583067655563 }, { "entropy": 8.650181770324707, "epoch": 0.3264781491002571, "mean_token_accuracy": 0.7620087265968323, "num_tokens": 17187096.0, "step": 3302, "train/ce_loss": 0.6787617802619934 }, { "epoch": 0.3264781491002571, "step": 3302, "train/sim_loss": 0.10546875 }, { "epoch": 0.3264781491002571, "step": 3302, "train/total_loss": 0.1733449399471283 }, { "entropy": 9.029155731201172, "epoch": 0.3265770219497726, "mean_token_accuracy": 0.7097142934799194, "num_tokens": 17192413.0, "step": 3303, "train/ce_loss": 0.9886357188224792 }, { "epoch": 0.3265770219497726, "step": 3303, "train/sim_loss": 0.0546875 }, { "epoch": 0.3265770219497726, "step": 3303, "train/total_loss": 0.15355107188224792 }, { "entropy": 8.551504135131836, "epoch": 0.3266758947992881, "mean_token_accuracy": 0.6666666865348816, "num_tokens": 17197919.0, "step": 3304, "train/ce_loss": 1.1812376976013184 }, { "epoch": 0.3266758947992881, "step": 3304, "train/sim_loss": 0.09375 }, { "epoch": 0.3266758947992881, "step": 3304, "train/total_loss": 0.21187376976013184 }, { "entropy": 9.83358383178711, "epoch": 0.32677476764880364, "mean_token_accuracy": 0.8218181729316711, "num_tokens": 17202870.0, "step": 3305, "train/ce_loss": 2.4499606752215186e-06 }, { "epoch": 0.32677476764880364, "step": 3305, "train/sim_loss": 0.015625 }, { "epoch": 0.32677476764880364, "step": 3305, "train/total_loss": 0.0156252458691597 }, { "entropy": 9.238372802734375, "epoch": 0.3268736404983192, "mean_token_accuracy": 0.7621145248413086, "num_tokens": 17208061.0, "step": 3306, "train/ce_loss": 3.0834571589366533e-06 }, { "epoch": 0.3268736404983192, "step": 3306, "train/sim_loss": 0.0859375 }, { "epoch": 0.3268736404983192, "step": 3306, "train/total_loss": 0.08593780547380447 }, { "entropy": 9.290925025939941, "epoch": 0.32697251334783467, "mean_token_accuracy": 0.7260677218437195, "num_tokens": 17213221.0, "step": 3307, "train/ce_loss": 1.2993706464767456 }, { "epoch": 0.32697251334783467, "step": 3307, "train/sim_loss": 0.109375 }, { "epoch": 0.32697251334783467, "step": 3307, "train/total_loss": 0.2393120676279068 }, { "entropy": 9.71102523803711, "epoch": 0.3270713861973502, "mean_token_accuracy": 0.7114093899726868, "num_tokens": 17218246.0, "step": 3308, "train/ce_loss": 1.9544768292689696e-06 }, { "epoch": 0.3270713861973502, "step": 3308, "train/sim_loss": 0.046875 }, { "epoch": 0.3270713861973502, "step": 3308, "train/total_loss": 0.04687519371509552 }, { "entropy": 9.074646949768066, "epoch": 0.32717025904686575, "mean_token_accuracy": 0.7472035884857178, "num_tokens": 17223649.0, "step": 3309, "train/ce_loss": 0.647305428981781 }, { "epoch": 0.32717025904686575, "step": 3309, "train/sim_loss": 0.0625 }, { "epoch": 0.32717025904686575, "step": 3309, "train/total_loss": 0.12723055481910706 }, { "entropy": 8.787233352661133, "epoch": 0.32726913189638124, "mean_token_accuracy": 0.7602397799491882, "num_tokens": 17229133.0, "step": 3310, "train/ce_loss": 0.5999415516853333 }, { "epoch": 0.32726913189638124, "step": 3310, "train/sim_loss": 0.0390625 }, { "epoch": 0.32726913189638124, "step": 3310, "train/total_loss": 0.0990566611289978 }, { "entropy": 8.729236602783203, "epoch": 0.3273680047458968, "mean_token_accuracy": 0.7316620349884033, "num_tokens": 17234668.0, "step": 3311, "train/ce_loss": 1.5345820188522339 }, { "epoch": 0.3273680047458968, "step": 3311, "train/sim_loss": 0.06640625 }, { "epoch": 0.3273680047458968, "step": 3311, "train/total_loss": 0.21986445784568787 }, { "entropy": 8.944244384765625, "epoch": 0.3274668775954123, "mean_token_accuracy": 0.7325194478034973, "num_tokens": 17240048.0, "step": 3312, "train/ce_loss": 0.7197909951210022 }, { "epoch": 0.3274668775954123, "step": 3312, "train/sim_loss": 0.03125 }, { "epoch": 0.3274668775954123, "step": 3312, "train/total_loss": 0.1032290980219841 }, { "entropy": 9.22994613647461, "epoch": 0.3275657504449278, "mean_token_accuracy": 0.7666666507720947, "num_tokens": 17245188.0, "step": 3313, "train/ce_loss": 0.8360373377799988 }, { "epoch": 0.3275657504449278, "step": 3313, "train/sim_loss": 0.0546875 }, { "epoch": 0.3275657504449278, "step": 3313, "train/total_loss": 0.13829123973846436 }, { "entropy": 9.057548522949219, "epoch": 0.32766462329444335, "mean_token_accuracy": 0.7311960458755493, "num_tokens": 17250437.0, "step": 3314, "train/ce_loss": 0.7472837567329407 }, { "epoch": 0.32766462329444335, "step": 3314, "train/sim_loss": 0.046875 }, { "epoch": 0.32766462329444335, "step": 3314, "train/total_loss": 0.12160337716341019 }, { "entropy": 9.754228591918945, "epoch": 0.3277634961439589, "mean_token_accuracy": 0.6967418789863586, "num_tokens": 17255304.0, "step": 3315, "train/ce_loss": 1.4389730495167896e-05 }, { "epoch": 0.3277634961439589, "step": 3315, "train/sim_loss": 0.046875 }, { "epoch": 0.3277634961439589, "step": 3315, "train/total_loss": 0.046876437962055206 }, { "entropy": 9.672286033630371, "epoch": 0.3278623689934744, "mean_token_accuracy": 0.7417218685150146, "num_tokens": 17260334.0, "step": 3316, "train/ce_loss": 1.4596654176712036 }, { "epoch": 0.3278623689934744, "step": 3316, "train/sim_loss": 0.1484375 }, { "epoch": 0.3278623689934744, "step": 3316, "train/total_loss": 0.2944040298461914 }, { "entropy": 8.372819900512695, "epoch": 0.3279612418429899, "mean_token_accuracy": 0.7288888692855835, "num_tokens": 17265775.0, "step": 3317, "train/ce_loss": 1.578393816947937 }, { "epoch": 0.3279612418429899, "step": 3317, "train/sim_loss": 0.1328125 }, { "epoch": 0.3279612418429899, "step": 3317, "train/total_loss": 0.2906518876552582 }, { "entropy": 9.448688507080078, "epoch": 0.32806011469250546, "mean_token_accuracy": 0.7260638475418091, "num_tokens": 17270928.0, "step": 3318, "train/ce_loss": 1.353439211845398 }, { "epoch": 0.32806011469250546, "step": 3318, "train/sim_loss": 0.09375 }, { "epoch": 0.32806011469250546, "step": 3318, "train/total_loss": 0.22909392416477203 }, { "entropy": 8.928474426269531, "epoch": 0.32815898754202094, "mean_token_accuracy": 0.6913319230079651, "num_tokens": 17276352.0, "step": 3319, "train/ce_loss": 0.34745627641677856 }, { "epoch": 0.32815898754202094, "step": 3319, "train/sim_loss": 0.05859375 }, { "epoch": 0.32815898754202094, "step": 3319, "train/total_loss": 0.09333938360214233 }, { "epoch": 0.3282578603915365, "grad_norm": 0.7752724885940552, "learning_rate": 9.18187212579736e-06, "loss": 0.1568, "step": 3320 }, { "entropy": 9.880072593688965, "epoch": 0.3282578603915365, "mean_token_accuracy": 0.7188405990600586, "num_tokens": 17281088.0, "step": 3320, "train/ce_loss": 4.62160005554324e-06 }, { "epoch": 0.3282578603915365, "step": 3320, "train/sim_loss": 0.05078125 }, { "epoch": 0.3282578603915365, "step": 3320, "train/total_loss": 0.05078171193599701 }, { "entropy": 9.501537322998047, "epoch": 0.328356733241052, "mean_token_accuracy": 0.7589820623397827, "num_tokens": 17286241.0, "step": 3321, "train/ce_loss": 1.1275746822357178 }, { "epoch": 0.328356733241052, "step": 3321, "train/sim_loss": 0.0625 }, { "epoch": 0.328356733241052, "step": 3321, "train/total_loss": 0.17525747418403625 }, { "entropy": 9.380807876586914, "epoch": 0.3284556060905675, "mean_token_accuracy": 0.8179271817207336, "num_tokens": 17291434.0, "step": 3322, "train/ce_loss": 0.5504809617996216 }, { "epoch": 0.3284556060905675, "step": 3322, "train/sim_loss": 0.0546875 }, { "epoch": 0.3284556060905675, "step": 3322, "train/total_loss": 0.10973559319972992 }, { "entropy": 9.613485336303711, "epoch": 0.32855447894008305, "mean_token_accuracy": 0.8215962648391724, "num_tokens": 17296491.0, "step": 3323, "train/ce_loss": 0.5445230603218079 }, { "epoch": 0.32855447894008305, "step": 3323, "train/sim_loss": 0.0234375 }, { "epoch": 0.32855447894008305, "step": 3323, "train/total_loss": 0.0778898075222969 }, { "entropy": 9.144057273864746, "epoch": 0.3286533517895986, "mean_token_accuracy": 0.747474730014801, "num_tokens": 17301778.0, "step": 3324, "train/ce_loss": 0.9967942833900452 }, { "epoch": 0.3286533517895986, "step": 3324, "train/sim_loss": 0.05078125 }, { "epoch": 0.3286533517895986, "step": 3324, "train/total_loss": 0.15046069025993347 }, { "entropy": 9.607072830200195, "epoch": 0.3287522246391141, "mean_token_accuracy": 0.7135325074195862, "num_tokens": 17306792.0, "step": 3325, "train/ce_loss": 3.519417987263296e-06 }, { "epoch": 0.3287522246391141, "step": 3325, "train/sim_loss": 0.0546875 }, { "epoch": 0.3287522246391141, "step": 3325, "train/total_loss": 0.054687850177288055 }, { "entropy": 9.660314559936523, "epoch": 0.3288510974886296, "mean_token_accuracy": 0.7192716002464294, "num_tokens": 17311899.0, "step": 3326, "train/ce_loss": 1.3766443729400635 }, { "epoch": 0.3288510974886296, "step": 3326, "train/sim_loss": 0.0625 }, { "epoch": 0.3288510974886296, "step": 3326, "train/total_loss": 0.20016443729400635 }, { "entropy": 9.089285850524902, "epoch": 0.32894997033814516, "mean_token_accuracy": 0.6810228824615479, "num_tokens": 17317099.0, "step": 3327, "train/ce_loss": 0.921363890171051 }, { "epoch": 0.32894997033814516, "step": 3327, "train/sim_loss": 0.05078125 }, { "epoch": 0.32894997033814516, "step": 3327, "train/total_loss": 0.14291763305664062 }, { "entropy": 9.025397300720215, "epoch": 0.32904884318766064, "mean_token_accuracy": 0.8428720235824585, "num_tokens": 17322534.0, "step": 3328, "train/ce_loss": 0.7051679491996765 }, { "epoch": 0.32904884318766064, "step": 3328, "train/sim_loss": 0.09375 }, { "epoch": 0.32904884318766064, "step": 3328, "train/total_loss": 0.16426679491996765 }, { "entropy": 9.427388191223145, "epoch": 0.3291477160371762, "mean_token_accuracy": 0.7158774137496948, "num_tokens": 17327690.0, "step": 3329, "train/ce_loss": 0.8867825269699097 }, { "epoch": 0.3291477160371762, "step": 3329, "train/sim_loss": 0.05078125 }, { "epoch": 0.3291477160371762, "step": 3329, "train/total_loss": 0.1394595056772232 }, { "entropy": 9.08204460144043, "epoch": 0.3292465888866917, "mean_token_accuracy": 0.7878412008285522, "num_tokens": 17332978.0, "step": 3330, "train/ce_loss": 6.495631623693043e-06 }, { "epoch": 0.3292465888866917, "step": 3330, "train/sim_loss": 0.046875 }, { "epoch": 0.3292465888866917, "step": 3330, "train/total_loss": 0.04687564820051193 }, { "entropy": 8.862564086914062, "epoch": 0.3293454617362072, "mean_token_accuracy": 0.7699293494224548, "num_tokens": 17338440.0, "step": 3331, "train/ce_loss": 0.8806781768798828 }, { "epoch": 0.3293454617362072, "step": 3331, "train/sim_loss": 0.07421875 }, { "epoch": 0.3293454617362072, "step": 3331, "train/total_loss": 0.16228657960891724 }, { "entropy": 8.995798110961914, "epoch": 0.32944433458572275, "mean_token_accuracy": 0.7753396034240723, "num_tokens": 17343931.0, "step": 3332, "train/ce_loss": 0.5379678606987 }, { "epoch": 0.32944433458572275, "step": 3332, "train/sim_loss": 0.06640625 }, { "epoch": 0.32944433458572275, "step": 3332, "train/total_loss": 0.12020303308963776 }, { "entropy": 9.290687561035156, "epoch": 0.3295432074352383, "mean_token_accuracy": 0.7165697813034058, "num_tokens": 17349064.0, "step": 3333, "train/ce_loss": 1.9717219856829615e-06 }, { "epoch": 0.3295432074352383, "step": 3333, "train/sim_loss": 0.046875 }, { "epoch": 0.3295432074352383, "step": 3333, "train/total_loss": 0.04687519744038582 }, { "entropy": 9.608075141906738, "epoch": 0.3296420802847538, "mean_token_accuracy": 0.80738365650177, "num_tokens": 17354153.0, "step": 3334, "train/ce_loss": 2.263906480948208e-06 }, { "epoch": 0.3296420802847538, "step": 3334, "train/sim_loss": 0.0390625 }, { "epoch": 0.3296420802847538, "step": 3334, "train/total_loss": 0.039062727242708206 }, { "entropy": 9.076807975769043, "epoch": 0.3297409531342693, "mean_token_accuracy": 0.7295742034912109, "num_tokens": 17359534.0, "step": 3335, "train/ce_loss": 0.7153067588806152 }, { "epoch": 0.3297409531342693, "step": 3335, "train/sim_loss": 0.07421875 }, { "epoch": 0.3297409531342693, "step": 3335, "train/total_loss": 0.14574941992759705 }, { "entropy": 9.463798522949219, "epoch": 0.32983982598378486, "mean_token_accuracy": 0.7430249452590942, "num_tokens": 17364666.0, "step": 3336, "train/ce_loss": 1.0005092008213978e-05 }, { "epoch": 0.32983982598378486, "step": 3336, "train/sim_loss": 0.0390625 }, { "epoch": 0.32983982598378486, "step": 3336, "train/total_loss": 0.039063502103090286 }, { "entropy": 9.013387680053711, "epoch": 0.3299386988333004, "mean_token_accuracy": 0.7323529124259949, "num_tokens": 17369975.0, "step": 3337, "train/ce_loss": 1.050665020942688 }, { "epoch": 0.3299386988333004, "step": 3337, "train/sim_loss": 0.0625 }, { "epoch": 0.3299386988333004, "step": 3337, "train/total_loss": 0.16756650805473328 }, { "entropy": 9.326025009155273, "epoch": 0.3300375716828159, "mean_token_accuracy": 0.7771428823471069, "num_tokens": 17375113.0, "step": 3338, "train/ce_loss": 0.417694628238678 }, { "epoch": 0.3300375716828159, "step": 3338, "train/sim_loss": 0.02734375 }, { "epoch": 0.3300375716828159, "step": 3338, "train/total_loss": 0.06911320984363556 }, { "entropy": 8.72673511505127, "epoch": 0.33013644453233143, "mean_token_accuracy": 0.7530120611190796, "num_tokens": 17380644.0, "step": 3339, "train/ce_loss": 0.4539792835712433 }, { "epoch": 0.33013644453233143, "step": 3339, "train/sim_loss": 0.0390625 }, { "epoch": 0.33013644453233143, "step": 3339, "train/total_loss": 0.08446042984724045 }, { "epoch": 0.33023531738184697, "grad_norm": 0.7518821954727173, "learning_rate": 9.17692726103941e-06, "loss": 0.1369, "step": 3340 }, { "entropy": 8.83997917175293, "epoch": 0.33023531738184697, "mean_token_accuracy": 0.7636761665344238, "num_tokens": 17386031.0, "step": 3340, "train/ce_loss": 0.7052160501480103 }, { "epoch": 0.33023531738184697, "step": 3340, "train/sim_loss": 0.08203125 }, { "epoch": 0.33023531738184697, "step": 3340, "train/total_loss": 0.15255285799503326 }, { "entropy": 8.709097862243652, "epoch": 0.33033419023136246, "mean_token_accuracy": 0.6950549483299255, "num_tokens": 17391586.0, "step": 3341, "train/ce_loss": 0.5241850018501282 }, { "epoch": 0.33033419023136246, "step": 3341, "train/sim_loss": 0.078125 }, { "epoch": 0.33033419023136246, "step": 3341, "train/total_loss": 0.13054350018501282 }, { "entropy": 9.87069034576416, "epoch": 0.330433063080878, "mean_token_accuracy": 0.8385744094848633, "num_tokens": 17396475.0, "step": 3342, "train/ce_loss": 2.3771021915308665e-06 }, { "epoch": 0.330433063080878, "step": 3342, "train/sim_loss": 0.0234375 }, { "epoch": 0.330433063080878, "step": 3342, "train/total_loss": 0.0234377384185791 }, { "entropy": 10.025890350341797, "epoch": 0.33053193593039354, "mean_token_accuracy": 0.751207709312439, "num_tokens": 17401317.0, "step": 3343, "train/ce_loss": 2.90927482637926e-06 }, { "epoch": 0.33053193593039354, "step": 3343, "train/sim_loss": 0.0625 }, { "epoch": 0.33053193593039354, "step": 3343, "train/total_loss": 0.06250029057264328 }, { "entropy": 9.361654281616211, "epoch": 0.330630808779909, "mean_token_accuracy": 0.7660484910011292, "num_tokens": 17406504.0, "step": 3344, "train/ce_loss": 0.613772988319397 }, { "epoch": 0.330630808779909, "step": 3344, "train/sim_loss": 0.015625 }, { "epoch": 0.330630808779909, "step": 3344, "train/total_loss": 0.07700230181217194 }, { "entropy": 9.017574310302734, "epoch": 0.33072968162942457, "mean_token_accuracy": 0.7683615684509277, "num_tokens": 17411911.0, "step": 3345, "train/ce_loss": 0.8716453313827515 }, { "epoch": 0.33072968162942457, "step": 3345, "train/sim_loss": 0.0703125 }, { "epoch": 0.33072968162942457, "step": 3345, "train/total_loss": 0.15747703611850739 }, { "entropy": 9.815315246582031, "epoch": 0.3308285544789401, "mean_token_accuracy": 0.6613636612892151, "num_tokens": 17416713.0, "step": 3346, "train/ce_loss": 9.277481694880407e-06 }, { "epoch": 0.3308285544789401, "step": 3346, "train/sim_loss": 0.1171875 }, { "epoch": 0.3308285544789401, "step": 3346, "train/total_loss": 0.11718843132257462 }, { "entropy": 9.554647445678711, "epoch": 0.3309274273284556, "mean_token_accuracy": 0.7161654233932495, "num_tokens": 17421720.0, "step": 3347, "train/ce_loss": 1.4893670082092285 }, { "epoch": 0.3309274273284556, "step": 3347, "train/sim_loss": 0.0859375 }, { "epoch": 0.3309274273284556, "step": 3347, "train/total_loss": 0.2348742038011551 }, { "entropy": 8.847419738769531, "epoch": 0.33102630017797113, "mean_token_accuracy": 0.7027601003646851, "num_tokens": 17427104.0, "step": 3348, "train/ce_loss": 1.1632755994796753 }, { "epoch": 0.33102630017797113, "step": 3348, "train/sim_loss": 0.08203125 }, { "epoch": 0.33102630017797113, "step": 3348, "train/total_loss": 0.19835880398750305 }, { "entropy": 9.26319694519043, "epoch": 0.3311251730274867, "mean_token_accuracy": 0.7883333563804626, "num_tokens": 17432137.0, "step": 3349, "train/ce_loss": 3.58709644388e-06 }, { "epoch": 0.3311251730274867, "step": 3349, "train/sim_loss": 0.05078125 }, { "epoch": 0.3311251730274867, "step": 3349, "train/total_loss": 0.05078160762786865 }, { "entropy": 8.61819076538086, "epoch": 0.33122404587700216, "mean_token_accuracy": 0.7494692206382751, "num_tokens": 17437573.0, "step": 3350, "train/ce_loss": 0.7018135786056519 }, { "epoch": 0.33122404587700216, "step": 3350, "train/sim_loss": 0.05859375 }, { "epoch": 0.33122404587700216, "step": 3350, "train/total_loss": 0.12877511978149414 }, { "entropy": 9.104698181152344, "epoch": 0.3313229187265177, "mean_token_accuracy": 0.7133917212486267, "num_tokens": 17442841.0, "step": 3351, "train/ce_loss": 1.2091833353042603 }, { "epoch": 0.3313229187265177, "step": 3351, "train/sim_loss": 0.19921875 }, { "epoch": 0.3313229187265177, "step": 3351, "train/total_loss": 0.320137083530426 }, { "entropy": 8.993057250976562, "epoch": 0.33142179157603324, "mean_token_accuracy": 0.7660256624221802, "num_tokens": 17448257.0, "step": 3352, "train/ce_loss": 0.6769647598266602 }, { "epoch": 0.33142179157603324, "step": 3352, "train/sim_loss": 0.06640625 }, { "epoch": 0.33142179157603324, "step": 3352, "train/total_loss": 0.1341027319431305 }, { "entropy": 9.338879585266113, "epoch": 0.33152066442554873, "mean_token_accuracy": 0.7210242748260498, "num_tokens": 17453449.0, "step": 3353, "train/ce_loss": 0.9190797209739685 }, { "epoch": 0.33152066442554873, "step": 3353, "train/sim_loss": 0.046875 }, { "epoch": 0.33152066442554873, "step": 3353, "train/total_loss": 0.13878297805786133 }, { "entropy": 9.517727851867676, "epoch": 0.33161953727506427, "mean_token_accuracy": 0.7896296381950378, "num_tokens": 17458526.0, "step": 3354, "train/ce_loss": 0.8352819085121155 }, { "epoch": 0.33161953727506427, "step": 3354, "train/sim_loss": 0.02734375 }, { "epoch": 0.33161953727506427, "step": 3354, "train/total_loss": 0.11087194085121155 }, { "entropy": 8.94295883178711, "epoch": 0.3317184101245798, "mean_token_accuracy": 0.723127007484436, "num_tokens": 17463947.0, "step": 3355, "train/ce_loss": 0.895768404006958 }, { "epoch": 0.3317184101245798, "step": 3355, "train/sim_loss": 0.05859375 }, { "epoch": 0.3317184101245798, "step": 3355, "train/total_loss": 0.1481705904006958 }, { "entropy": 9.064741134643555, "epoch": 0.3318172829740953, "mean_token_accuracy": 0.6784037351608276, "num_tokens": 17469284.0, "step": 3356, "train/ce_loss": 1.3284772634506226 }, { "epoch": 0.3318172829740953, "step": 3356, "train/sim_loss": 0.11328125 }, { "epoch": 0.3318172829740953, "step": 3356, "train/total_loss": 0.24612897634506226 }, { "entropy": 9.327483177185059, "epoch": 0.33191615582361084, "mean_token_accuracy": 0.8005390763282776, "num_tokens": 17474422.0, "step": 3357, "train/ce_loss": 0.6446126103401184 }, { "epoch": 0.33191615582361084, "step": 3357, "train/sim_loss": 0.0234375 }, { "epoch": 0.33191615582361084, "step": 3357, "train/total_loss": 0.08789876103401184 }, { "entropy": 8.652473449707031, "epoch": 0.3320150286731264, "mean_token_accuracy": 0.7570776343345642, "num_tokens": 17480060.0, "step": 3358, "train/ce_loss": 0.8517627716064453 }, { "epoch": 0.3320150286731264, "step": 3358, "train/sim_loss": 0.0546875 }, { "epoch": 0.3320150286731264, "step": 3358, "train/total_loss": 0.1398637890815735 }, { "entropy": 8.441993713378906, "epoch": 0.33211390152264186, "mean_token_accuracy": 0.7657308578491211, "num_tokens": 17485646.0, "step": 3359, "train/ce_loss": 0.44434165954589844 }, { "epoch": 0.33211390152264186, "step": 3359, "train/sim_loss": 0.0703125 }, { "epoch": 0.33211390152264186, "step": 3359, "train/total_loss": 0.11474666744470596 }, { "epoch": 0.3322127743721574, "grad_norm": 0.7457917332649231, "learning_rate": 9.171982396281463e-06, "loss": 0.1451, "step": 3360 }, { "entropy": 9.506464004516602, "epoch": 0.3322127743721574, "mean_token_accuracy": 0.722129762172699, "num_tokens": 17490669.0, "step": 3360, "train/ce_loss": 1.7210886478424072 }, { "epoch": 0.3322127743721574, "step": 3360, "train/sim_loss": 0.0546875 }, { "epoch": 0.3322127743721574, "step": 3360, "train/total_loss": 0.22679637372493744 }, { "entropy": 9.958076477050781, "epoch": 0.33231164722167295, "mean_token_accuracy": 0.7402597665786743, "num_tokens": 17495499.0, "step": 3361, "train/ce_loss": 2.4337151050567627 }, { "epoch": 0.33231164722167295, "step": 3361, "train/sim_loss": 0.05078125 }, { "epoch": 0.33231164722167295, "step": 3361, "train/total_loss": 0.29415276646614075 }, { "entropy": 9.22468090057373, "epoch": 0.33241052007118843, "mean_token_accuracy": 0.7092511057853699, "num_tokens": 17500605.0, "step": 3362, "train/ce_loss": 0.7624481320381165 }, { "epoch": 0.33241052007118843, "step": 3362, "train/sim_loss": 0.03125 }, { "epoch": 0.33241052007118843, "step": 3362, "train/total_loss": 0.10749481618404388 }, { "entropy": 9.560653686523438, "epoch": 0.33250939292070397, "mean_token_accuracy": 0.7523659467697144, "num_tokens": 17505674.0, "step": 3363, "train/ce_loss": 0.8970683217048645 }, { "epoch": 0.33250939292070397, "step": 3363, "train/sim_loss": 0.01953125 }, { "epoch": 0.33250939292070397, "step": 3363, "train/total_loss": 0.10923808068037033 }, { "entropy": 9.083115577697754, "epoch": 0.3326082657702195, "mean_token_accuracy": 0.7306843400001526, "num_tokens": 17511066.0, "step": 3364, "train/ce_loss": 0.8307674527168274 }, { "epoch": 0.3326082657702195, "step": 3364, "train/sim_loss": 0.06640625 }, { "epoch": 0.3326082657702195, "step": 3364, "train/total_loss": 0.14948299527168274 }, { "entropy": 9.192109107971191, "epoch": 0.332707138619735, "mean_token_accuracy": 0.7402912378311157, "num_tokens": 17516323.0, "step": 3365, "train/ce_loss": 0.43051204085350037 }, { "epoch": 0.332707138619735, "step": 3365, "train/sim_loss": 0.0234375 }, { "epoch": 0.332707138619735, "step": 3365, "train/total_loss": 0.06648870557546616 }, { "entropy": 9.800932884216309, "epoch": 0.33280601146925054, "mean_token_accuracy": 0.7151394486427307, "num_tokens": 17521236.0, "step": 3366, "train/ce_loss": 2.51992560151848e-06 }, { "epoch": 0.33280601146925054, "step": 3366, "train/sim_loss": 0.0234375 }, { "epoch": 0.33280601146925054, "step": 3366, "train/total_loss": 0.023437751457095146 }, { "entropy": 9.11934757232666, "epoch": 0.3329048843187661, "mean_token_accuracy": 0.7839080691337585, "num_tokens": 17526605.0, "step": 3367, "train/ce_loss": 1.0057429075241089 }, { "epoch": 0.3329048843187661, "step": 3367, "train/sim_loss": 0.0234375 }, { "epoch": 0.3329048843187661, "step": 3367, "train/total_loss": 0.12401179224252701 }, { "entropy": 9.207134246826172, "epoch": 0.33300375716828157, "mean_token_accuracy": 0.7644736766815186, "num_tokens": 17531820.0, "step": 3368, "train/ce_loss": 0.9864628911018372 }, { "epoch": 0.33300375716828157, "step": 3368, "train/sim_loss": 0.0625 }, { "epoch": 0.33300375716828157, "step": 3368, "train/total_loss": 0.16114628314971924 }, { "entropy": 9.180716514587402, "epoch": 0.3331026300177971, "mean_token_accuracy": 0.7611510753631592, "num_tokens": 17537022.0, "step": 3369, "train/ce_loss": 1.3197712898254395 }, { "epoch": 0.3331026300177971, "step": 3369, "train/sim_loss": 0.09375 }, { "epoch": 0.3331026300177971, "step": 3369, "train/total_loss": 0.2257271260023117 }, { "entropy": 10.049286842346191, "epoch": 0.33320150286731265, "mean_token_accuracy": 0.7659574747085571, "num_tokens": 17541755.0, "step": 3370, "train/ce_loss": 1.915870189666748 }, { "epoch": 0.33320150286731265, "step": 3370, "train/sim_loss": 0.0859375 }, { "epoch": 0.33320150286731265, "step": 3370, "train/total_loss": 0.27752453088760376 }, { "entropy": 9.705484390258789, "epoch": 0.33330037571682813, "mean_token_accuracy": 0.7814313173294067, "num_tokens": 17546717.0, "step": 3371, "train/ce_loss": 2.390544295849395e-06 }, { "epoch": 0.33330037571682813, "step": 3371, "train/sim_loss": 0.046875 }, { "epoch": 0.33330037571682813, "step": 3371, "train/total_loss": 0.0468752384185791 }, { "entropy": 9.473861694335938, "epoch": 0.3333992485663437, "mean_token_accuracy": 0.7057010531425476, "num_tokens": 17551795.0, "step": 3372, "train/ce_loss": 2.3074352741241455 }, { "epoch": 0.3333992485663437, "step": 3372, "train/sim_loss": 0.09375 }, { "epoch": 0.3333992485663437, "step": 3372, "train/total_loss": 0.32449352741241455 }, { "entropy": 9.567621231079102, "epoch": 0.3334981214158592, "mean_token_accuracy": 0.6865149140357971, "num_tokens": 17556808.0, "step": 3373, "train/ce_loss": 1.390802025794983 }, { "epoch": 0.3334981214158592, "step": 3373, "train/sim_loss": 0.0390625 }, { "epoch": 0.3334981214158592, "step": 3373, "train/total_loss": 0.178142711520195 }, { "entropy": 9.133459091186523, "epoch": 0.3335969942653747, "mean_token_accuracy": 0.7361878156661987, "num_tokens": 17562021.0, "step": 3374, "train/ce_loss": 2.002731434913585e-06 }, { "epoch": 0.3335969942653747, "step": 3374, "train/sim_loss": 0.046875 }, { "epoch": 0.3335969942653747, "step": 3374, "train/total_loss": 0.04687520116567612 }, { "entropy": 8.745182037353516, "epoch": 0.33369586711489024, "mean_token_accuracy": 0.7504743933677673, "num_tokens": 17567587.0, "step": 3375, "train/ce_loss": 1.0268542766571045 }, { "epoch": 0.33369586711489024, "step": 3375, "train/sim_loss": 0.06640625 }, { "epoch": 0.33369586711489024, "step": 3375, "train/total_loss": 0.16909167170524597 }, { "entropy": 8.663030624389648, "epoch": 0.3337947399644058, "mean_token_accuracy": 0.7615545988082886, "num_tokens": 17572966.0, "step": 3376, "train/ce_loss": 0.5454652905464172 }, { "epoch": 0.3337947399644058, "step": 3376, "train/sim_loss": 0.06640625 }, { "epoch": 0.3337947399644058, "step": 3376, "train/total_loss": 0.1209527850151062 }, { "entropy": 9.087337493896484, "epoch": 0.33389361281392127, "mean_token_accuracy": 0.7656427621841431, "num_tokens": 17578301.0, "step": 3377, "train/ce_loss": 0.6339848637580872 }, { "epoch": 0.33389361281392127, "step": 3377, "train/sim_loss": 0.0703125 }, { "epoch": 0.33389361281392127, "step": 3377, "train/total_loss": 0.13371098041534424 }, { "entropy": 9.56787395477295, "epoch": 0.3339924856634368, "mean_token_accuracy": 0.7808219194412231, "num_tokens": 17583251.0, "step": 3378, "train/ce_loss": 1.6264250461972551e-06 }, { "epoch": 0.3339924856634368, "step": 3378, "train/sim_loss": 0.046875 }, { "epoch": 0.3339924856634368, "step": 3378, "train/total_loss": 0.04687516391277313 }, { "entropy": 9.46949577331543, "epoch": 0.33409135851295235, "mean_token_accuracy": 0.7528571486473083, "num_tokens": 17588362.0, "step": 3379, "train/ce_loss": 1.3034005165100098 }, { "epoch": 0.33409135851295235, "step": 3379, "train/sim_loss": 0.05859375 }, { "epoch": 0.33409135851295235, "step": 3379, "train/total_loss": 0.18893380463123322 }, { "epoch": 0.3341902313624679, "grad_norm": 0.7638806104660034, "learning_rate": 9.167037531523513e-06, "loss": 0.1375, "step": 3380 }, { "entropy": 9.141291618347168, "epoch": 0.3341902313624679, "mean_token_accuracy": 0.740440309047699, "num_tokens": 17593694.0, "step": 3380, "train/ce_loss": 0.5845214128494263 }, { "epoch": 0.3341902313624679, "step": 3380, "train/sim_loss": 0.03125 }, { "epoch": 0.3341902313624679, "step": 3380, "train/total_loss": 0.08970214426517487 }, { "entropy": 9.934433937072754, "epoch": 0.3342891042119834, "mean_token_accuracy": 0.791208803653717, "num_tokens": 17598577.0, "step": 3381, "train/ce_loss": 2.761363248282578e-06 }, { "epoch": 0.3342891042119834, "step": 3381, "train/sim_loss": 0.046875 }, { "epoch": 0.3342891042119834, "step": 3381, "train/total_loss": 0.046875275671482086 }, { "entropy": 9.295759201049805, "epoch": 0.3343879770614989, "mean_token_accuracy": 0.7364864945411682, "num_tokens": 17603765.0, "step": 3382, "train/ce_loss": 0.5177283883094788 }, { "epoch": 0.3343879770614989, "step": 3382, "train/sim_loss": 0.046875 }, { "epoch": 0.3343879770614989, "step": 3382, "train/total_loss": 0.098647840321064 }, { "entropy": 9.589226722717285, "epoch": 0.33448684991101446, "mean_token_accuracy": 0.7853492498397827, "num_tokens": 17608778.0, "step": 3383, "train/ce_loss": 0.7475631237030029 }, { "epoch": 0.33448684991101446, "step": 3383, "train/sim_loss": 0.06640625 }, { "epoch": 0.33448684991101446, "step": 3383, "train/total_loss": 0.14116257429122925 }, { "entropy": 9.307208061218262, "epoch": 0.33458572276052995, "mean_token_accuracy": 0.7857142686843872, "num_tokens": 17613969.0, "step": 3384, "train/ce_loss": 1.0145411491394043 }, { "epoch": 0.33458572276052995, "step": 3384, "train/sim_loss": 0.046875 }, { "epoch": 0.33458572276052995, "step": 3384, "train/total_loss": 0.14832910895347595 }, { "entropy": 8.7385835647583, "epoch": 0.3346845956100455, "mean_token_accuracy": 0.8264462947845459, "num_tokens": 17619475.0, "step": 3385, "train/ce_loss": 0.6401126980781555 }, { "epoch": 0.3346845956100455, "step": 3385, "train/sim_loss": 0.02734375 }, { "epoch": 0.3346845956100455, "step": 3385, "train/total_loss": 0.09135501831769943 }, { "entropy": 9.472051620483398, "epoch": 0.33478346845956103, "mean_token_accuracy": 0.6609195470809937, "num_tokens": 17624580.0, "step": 3386, "train/ce_loss": 1.3405520915985107 }, { "epoch": 0.33478346845956103, "step": 3386, "train/sim_loss": 0.09765625 }, { "epoch": 0.33478346845956103, "step": 3386, "train/total_loss": 0.2317114621400833 }, { "entropy": 9.845640182495117, "epoch": 0.3348823413090765, "mean_token_accuracy": 0.7963917255401611, "num_tokens": 17629396.0, "step": 3387, "train/ce_loss": 3.4778004192048684e-06 }, { "epoch": 0.3348823413090765, "step": 3387, "train/sim_loss": 0.046875 }, { "epoch": 0.3348823413090765, "step": 3387, "train/total_loss": 0.04687534645199776 }, { "entropy": 9.08883285522461, "epoch": 0.33498121415859206, "mean_token_accuracy": 0.6870588064193726, "num_tokens": 17634731.0, "step": 3388, "train/ce_loss": 0.7789780497550964 }, { "epoch": 0.33498121415859206, "step": 3388, "train/sim_loss": 0.08203125 }, { "epoch": 0.33498121415859206, "step": 3388, "train/total_loss": 0.1599290668964386 }, { "entropy": 9.02785873413086, "epoch": 0.3350800870081076, "mean_token_accuracy": 0.765625, "num_tokens": 17640107.0, "step": 3389, "train/ce_loss": 0.5221772789955139 }, { "epoch": 0.3350800870081076, "step": 3389, "train/sim_loss": 0.0234375 }, { "epoch": 0.3350800870081076, "step": 3389, "train/total_loss": 0.07565522938966751 }, { "entropy": 9.07229995727539, "epoch": 0.3351789598576231, "mean_token_accuracy": 0.7211538553237915, "num_tokens": 17645437.0, "step": 3390, "train/ce_loss": 0.6404953598976135 }, { "epoch": 0.3351789598576231, "step": 3390, "train/sim_loss": 0.1015625 }, { "epoch": 0.3351789598576231, "step": 3390, "train/total_loss": 0.16561204195022583 }, { "entropy": 9.288196563720703, "epoch": 0.3352778327071386, "mean_token_accuracy": 0.694779098033905, "num_tokens": 17650690.0, "step": 3391, "train/ce_loss": 0.983015239238739 }, { "epoch": 0.3352778327071386, "step": 3391, "train/sim_loss": 0.1328125 }, { "epoch": 0.3352778327071386, "step": 3391, "train/total_loss": 0.23111402988433838 }, { "entropy": 10.127918243408203, "epoch": 0.33537670555665416, "mean_token_accuracy": 0.8166666626930237, "num_tokens": 17655411.0, "step": 3392, "train/ce_loss": 3.3920389341801638e-06 }, { "epoch": 0.33537670555665416, "step": 3392, "train/sim_loss": 0.05078125 }, { "epoch": 0.33537670555665416, "step": 3392, "train/total_loss": 0.05078158900141716 }, { "entropy": 9.554422378540039, "epoch": 0.33547557840616965, "mean_token_accuracy": 0.7822784781455994, "num_tokens": 17660218.0, "step": 3393, "train/ce_loss": 1.4625734090805054 }, { "epoch": 0.33547557840616965, "step": 3393, "train/sim_loss": 0.07421875 }, { "epoch": 0.33547557840616965, "step": 3393, "train/total_loss": 0.22047609090805054 }, { "entropy": 8.788867950439453, "epoch": 0.3355744512556852, "mean_token_accuracy": 0.75, "num_tokens": 17665591.0, "step": 3394, "train/ce_loss": 0.4802239239215851 }, { "epoch": 0.3355744512556852, "step": 3394, "train/sim_loss": 0.05859375 }, { "epoch": 0.3355744512556852, "step": 3394, "train/total_loss": 0.10661613941192627 }, { "entropy": 8.993380546569824, "epoch": 0.33567332410520073, "mean_token_accuracy": 0.7278761267662048, "num_tokens": 17670979.0, "step": 3395, "train/ce_loss": 0.7232365012168884 }, { "epoch": 0.33567332410520073, "step": 3395, "train/sim_loss": 0.05078125 }, { "epoch": 0.33567332410520073, "step": 3395, "train/total_loss": 0.12310490012168884 }, { "entropy": 9.209710121154785, "epoch": 0.3357721969547162, "mean_token_accuracy": 0.7336561679840088, "num_tokens": 17676220.0, "step": 3396, "train/ce_loss": 1.664810299873352 }, { "epoch": 0.3357721969547162, "step": 3396, "train/sim_loss": 0.0859375 }, { "epoch": 0.3357721969547162, "step": 3396, "train/total_loss": 0.25241851806640625 }, { "entropy": 9.402828216552734, "epoch": 0.33587106980423176, "mean_token_accuracy": 0.7558139562606812, "num_tokens": 17681440.0, "step": 3397, "train/ce_loss": 1.0300356149673462 }, { "epoch": 0.33587106980423176, "step": 3397, "train/sim_loss": 0.11328125 }, { "epoch": 0.33587106980423176, "step": 3397, "train/total_loss": 0.21628481149673462 }, { "entropy": 9.31285572052002, "epoch": 0.3359699426537473, "mean_token_accuracy": 0.8018741607666016, "num_tokens": 17686694.0, "step": 3398, "train/ce_loss": 0.5148482918739319 }, { "epoch": 0.3359699426537473, "step": 3398, "train/sim_loss": 0.03125 }, { "epoch": 0.3359699426537473, "step": 3398, "train/total_loss": 0.08273483067750931 }, { "entropy": 8.956930160522461, "epoch": 0.3360688155032628, "mean_token_accuracy": 0.7489224076271057, "num_tokens": 17692096.0, "step": 3399, "train/ce_loss": 0.6533750295639038 }, { "epoch": 0.3360688155032628, "step": 3399, "train/sim_loss": 0.07421875 }, { "epoch": 0.3360688155032628, "step": 3399, "train/total_loss": 0.13955625891685486 }, { "epoch": 0.3361676883527783, "grad_norm": 0.7196058630943298, "learning_rate": 9.162092666765566e-06, "loss": 0.1369, "step": 3400 }, { "entropy": 9.541491508483887, "epoch": 0.3361676883527783, "mean_token_accuracy": 0.7649842500686646, "num_tokens": 17697189.0, "step": 3400, "train/ce_loss": 1.5455391348950798e-06 }, { "epoch": 0.3361676883527783, "step": 3400, "train/sim_loss": 0.05078125 }, { "epoch": 0.3361676883527783, "step": 3400, "train/total_loss": 0.05078140273690224 }, { "entropy": 9.987796783447266, "epoch": 0.33626656120229387, "mean_token_accuracy": 0.7028688788414001, "num_tokens": 17702107.0, "step": 3401, "train/ce_loss": 2.4917209148406982 }, { "epoch": 0.33626656120229387, "step": 3401, "train/sim_loss": 0.0859375 }, { "epoch": 0.33626656120229387, "step": 3401, "train/total_loss": 0.3351095914840698 }, { "entropy": 9.113516807556152, "epoch": 0.33636543405180935, "mean_token_accuracy": 0.7106273770332336, "num_tokens": 17707393.0, "step": 3402, "train/ce_loss": 1.3251246213912964 }, { "epoch": 0.33636543405180935, "step": 3402, "train/sim_loss": 0.06640625 }, { "epoch": 0.33636543405180935, "step": 3402, "train/total_loss": 0.19891871511936188 }, { "entropy": 9.230640411376953, "epoch": 0.3364643069013249, "mean_token_accuracy": 0.7346405386924744, "num_tokens": 17712659.0, "step": 3403, "train/ce_loss": 0.8238956332206726 }, { "epoch": 0.3364643069013249, "step": 3403, "train/sim_loss": 0.0703125 }, { "epoch": 0.3364643069013249, "step": 3403, "train/total_loss": 0.15270206332206726 }, { "entropy": 8.905786514282227, "epoch": 0.33656317975084044, "mean_token_accuracy": 0.7210065722465515, "num_tokens": 17718082.0, "step": 3404, "train/ce_loss": 0.9621824622154236 }, { "epoch": 0.33656317975084044, "step": 3404, "train/sim_loss": 0.1171875 }, { "epoch": 0.33656317975084044, "step": 3404, "train/total_loss": 0.2134057581424713 }, { "entropy": 9.548670768737793, "epoch": 0.3366620526003559, "mean_token_accuracy": 0.7258347868919373, "num_tokens": 17723049.0, "step": 3405, "train/ce_loss": 1.2460538148880005 }, { "epoch": 0.3366620526003559, "step": 3405, "train/sim_loss": 0.08203125 }, { "epoch": 0.3366620526003559, "step": 3405, "train/total_loss": 0.20663663744926453 }, { "entropy": 9.482522010803223, "epoch": 0.33676092544987146, "mean_token_accuracy": 0.6987951993942261, "num_tokens": 17728115.0, "step": 3406, "train/ce_loss": 1.601701259613037 }, { "epoch": 0.33676092544987146, "step": 3406, "train/sim_loss": 0.0546875 }, { "epoch": 0.33676092544987146, "step": 3406, "train/total_loss": 0.21485762298107147 }, { "entropy": 8.978238105773926, "epoch": 0.336859798299387, "mean_token_accuracy": 0.7990430593490601, "num_tokens": 17733398.0, "step": 3407, "train/ce_loss": 0.6218247413635254 }, { "epoch": 0.336859798299387, "step": 3407, "train/sim_loss": 0.08984375 }, { "epoch": 0.336859798299387, "step": 3407, "train/total_loss": 0.1520262211561203 }, { "entropy": 9.083845138549805, "epoch": 0.3369586711489025, "mean_token_accuracy": 0.7783018946647644, "num_tokens": 17738756.0, "step": 3408, "train/ce_loss": 0.5001293420791626 }, { "epoch": 0.3369586711489025, "step": 3408, "train/sim_loss": 0.05859375 }, { "epoch": 0.3369586711489025, "step": 3408, "train/total_loss": 0.10860668122768402 }, { "entropy": 9.258260726928711, "epoch": 0.33705754399841803, "mean_token_accuracy": 0.7490909099578857, "num_tokens": 17743993.0, "step": 3409, "train/ce_loss": 1.1198776960372925 }, { "epoch": 0.33705754399841803, "step": 3409, "train/sim_loss": 0.07421875 }, { "epoch": 0.33705754399841803, "step": 3409, "train/total_loss": 0.18620651960372925 }, { "entropy": 8.85833740234375, "epoch": 0.33715641684793357, "mean_token_accuracy": 0.6942528486251831, "num_tokens": 17749358.0, "step": 3410, "train/ce_loss": 0.5631909370422363 }, { "epoch": 0.33715641684793357, "step": 3410, "train/sim_loss": 0.09375 }, { "epoch": 0.33715641684793357, "step": 3410, "train/total_loss": 0.15006908774375916 }, { "entropy": 9.585782051086426, "epoch": 0.33725528969744906, "mean_token_accuracy": 0.7329545617103577, "num_tokens": 17754258.0, "step": 3411, "train/ce_loss": 1.4850685596466064 }, { "epoch": 0.33725528969744906, "step": 3411, "train/sim_loss": 0.13671875 }, { "epoch": 0.33725528969744906, "step": 3411, "train/total_loss": 0.28522562980651855 }, { "entropy": 9.434508323669434, "epoch": 0.3373541625469646, "mean_token_accuracy": 0.7887538075447083, "num_tokens": 17759407.0, "step": 3412, "train/ce_loss": 0.9562662839889526 }, { "epoch": 0.3373541625469646, "step": 3412, "train/sim_loss": 0.078125 }, { "epoch": 0.3373541625469646, "step": 3412, "train/total_loss": 0.1737516224384308 }, { "entropy": 8.994601249694824, "epoch": 0.33745303539648014, "mean_token_accuracy": 0.7205720543861389, "num_tokens": 17764786.0, "step": 3413, "train/ce_loss": 0.573242723941803 }, { "epoch": 0.33745303539648014, "step": 3413, "train/sim_loss": 0.0234375 }, { "epoch": 0.33745303539648014, "step": 3413, "train/total_loss": 0.08076177537441254 }, { "entropy": 9.084918975830078, "epoch": 0.3375519082459956, "mean_token_accuracy": 0.7689486742019653, "num_tokens": 17770056.0, "step": 3414, "train/ce_loss": 0.9779831171035767 }, { "epoch": 0.3375519082459956, "step": 3414, "train/sim_loss": 0.08984375 }, { "epoch": 0.3375519082459956, "step": 3414, "train/total_loss": 0.18764206767082214 }, { "entropy": 8.784640312194824, "epoch": 0.33765078109551117, "mean_token_accuracy": 0.7269193530082703, "num_tokens": 17775607.0, "step": 3415, "train/ce_loss": 1.16817307472229 }, { "epoch": 0.33765078109551117, "step": 3415, "train/sim_loss": 0.07421875 }, { "epoch": 0.33765078109551117, "step": 3415, "train/total_loss": 0.19103606045246124 }, { "entropy": 9.306246757507324, "epoch": 0.3377496539450267, "mean_token_accuracy": 0.728923499584198, "num_tokens": 17780854.0, "step": 3416, "train/ce_loss": 1.4378396272659302 }, { "epoch": 0.3377496539450267, "step": 3416, "train/sim_loss": 0.1015625 }, { "epoch": 0.3377496539450267, "step": 3416, "train/total_loss": 0.24534647166728973 }, { "entropy": 9.247640609741211, "epoch": 0.3378485267945422, "mean_token_accuracy": 0.7110266089439392, "num_tokens": 17786050.0, "step": 3417, "train/ce_loss": 0.8138503432273865 }, { "epoch": 0.3378485267945422, "step": 3417, "train/sim_loss": 0.02734375 }, { "epoch": 0.3378485267945422, "step": 3417, "train/total_loss": 0.108728788793087 }, { "entropy": 9.1106595993042, "epoch": 0.33794739964405773, "mean_token_accuracy": 0.7891492247581482, "num_tokens": 17791337.0, "step": 3418, "train/ce_loss": 0.39447054266929626 }, { "epoch": 0.33794739964405773, "step": 3418, "train/sim_loss": 0.02734375 }, { "epoch": 0.33794739964405773, "step": 3418, "train/total_loss": 0.06679080426692963 }, { "entropy": 9.286377906799316, "epoch": 0.3380462724935733, "mean_token_accuracy": 0.75660640001297, "num_tokens": 17796494.0, "step": 3419, "train/ce_loss": 0.7106546759605408 }, { "epoch": 0.3380462724935733, "step": 3419, "train/sim_loss": 0.05078125 }, { "epoch": 0.3380462724935733, "step": 3419, "train/total_loss": 0.12184672057628632 }, { "epoch": 0.3381451453430888, "grad_norm": 0.9079579710960388, "learning_rate": 9.157147802007616e-06, "loss": 0.152, "step": 3420 }, { "entropy": 9.076547622680664, "epoch": 0.3381451453430888, "mean_token_accuracy": 0.7792068719863892, "num_tokens": 17801885.0, "step": 3420, "train/ce_loss": 0.3072172999382019 }, { "epoch": 0.3381451453430888, "step": 3420, "train/sim_loss": 0.0625 }, { "epoch": 0.3381451453430888, "step": 3420, "train/total_loss": 0.09322173148393631 }, { "entropy": 8.976940155029297, "epoch": 0.3382440181926043, "mean_token_accuracy": 0.7343251705169678, "num_tokens": 17807318.0, "step": 3421, "train/ce_loss": 0.8840020895004272 }, { "epoch": 0.3382440181926043, "step": 3421, "train/sim_loss": 0.078125 }, { "epoch": 0.3382440181926043, "step": 3421, "train/total_loss": 0.1665252149105072 }, { "entropy": 9.07284164428711, "epoch": 0.33834289104211984, "mean_token_accuracy": 0.7465277910232544, "num_tokens": 17812630.0, "step": 3422, "train/ce_loss": 1.0425264835357666 }, { "epoch": 0.33834289104211984, "step": 3422, "train/sim_loss": 0.046875 }, { "epoch": 0.33834289104211984, "step": 3422, "train/total_loss": 0.1511276513338089 }, { "entropy": 9.675848007202148, "epoch": 0.3384417638916354, "mean_token_accuracy": 0.7110389471054077, "num_tokens": 17817679.0, "step": 3423, "train/ce_loss": 1.2692245244979858 }, { "epoch": 0.3384417638916354, "step": 3423, "train/sim_loss": 0.0703125 }, { "epoch": 0.3384417638916354, "step": 3423, "train/total_loss": 0.19723495841026306 }, { "entropy": 9.075258255004883, "epoch": 0.33854063674115087, "mean_token_accuracy": 0.7951807379722595, "num_tokens": 17822976.0, "step": 3424, "train/ce_loss": 0.8383387327194214 }, { "epoch": 0.33854063674115087, "step": 3424, "train/sim_loss": 0.1015625 }, { "epoch": 0.33854063674115087, "step": 3424, "train/total_loss": 0.18539637327194214 }, { "entropy": 9.544485092163086, "epoch": 0.3386395095906664, "mean_token_accuracy": 0.7225913405418396, "num_tokens": 17828044.0, "step": 3425, "train/ce_loss": 1.075201153755188 }, { "epoch": 0.3386395095906664, "step": 3425, "train/sim_loss": 0.08984375 }, { "epoch": 0.3386395095906664, "step": 3425, "train/total_loss": 0.19736386835575104 }, { "entropy": 9.646242141723633, "epoch": 0.33873838244018195, "mean_token_accuracy": 0.7439862489700317, "num_tokens": 17833065.0, "step": 3426, "train/ce_loss": 0.8191995024681091 }, { "epoch": 0.33873838244018195, "step": 3426, "train/sim_loss": 0.0546875 }, { "epoch": 0.33873838244018195, "step": 3426, "train/total_loss": 0.13660745322704315 }, { "entropy": 9.409704208374023, "epoch": 0.33883725528969744, "mean_token_accuracy": 0.7482219338417053, "num_tokens": 17838195.0, "step": 3427, "train/ce_loss": 0.432102769613266 }, { "epoch": 0.33883725528969744, "step": 3427, "train/sim_loss": 0.03515625 }, { "epoch": 0.33883725528969744, "step": 3427, "train/total_loss": 0.07836653292179108 }, { "entropy": 9.23155403137207, "epoch": 0.338936128139213, "mean_token_accuracy": 0.7944663763046265, "num_tokens": 17843405.0, "step": 3428, "train/ce_loss": 0.49656328558921814 }, { "epoch": 0.338936128139213, "step": 3428, "train/sim_loss": 0.07421875 }, { "epoch": 0.338936128139213, "step": 3428, "train/total_loss": 0.12387508153915405 }, { "entropy": 9.279233932495117, "epoch": 0.3390350009887285, "mean_token_accuracy": 0.7304469347000122, "num_tokens": 17848578.0, "step": 3429, "train/ce_loss": 0.7106434106826782 }, { "epoch": 0.3390350009887285, "step": 3429, "train/sim_loss": 0.0390625 }, { "epoch": 0.3390350009887285, "step": 3429, "train/total_loss": 0.11012684553861618 }, { "entropy": 9.007104873657227, "epoch": 0.339133873838244, "mean_token_accuracy": 0.7529411911964417, "num_tokens": 17853896.0, "step": 3430, "train/ce_loss": 0.4403240382671356 }, { "epoch": 0.339133873838244, "step": 3430, "train/sim_loss": 0.02734375 }, { "epoch": 0.339133873838244, "step": 3430, "train/total_loss": 0.07137615978717804 }, { "entropy": 9.360210418701172, "epoch": 0.33923274668775955, "mean_token_accuracy": 0.6985173225402832, "num_tokens": 17858938.0, "step": 3431, "train/ce_loss": 5.252084520179778e-06 }, { "epoch": 0.33923274668775955, "step": 3431, "train/sim_loss": 0.046875 }, { "epoch": 0.33923274668775955, "step": 3431, "train/total_loss": 0.04687552526593208 }, { "entropy": 9.197908401489258, "epoch": 0.3393316195372751, "mean_token_accuracy": 0.7270194888114929, "num_tokens": 17864127.0, "step": 3432, "train/ce_loss": 0.9348120093345642 }, { "epoch": 0.3393316195372751, "step": 3432, "train/sim_loss": 0.078125 }, { "epoch": 0.3393316195372751, "step": 3432, "train/total_loss": 0.17160621285438538 }, { "entropy": 9.071158409118652, "epoch": 0.33943049238679057, "mean_token_accuracy": 0.6770708560943604, "num_tokens": 17869420.0, "step": 3433, "train/ce_loss": 1.0051276683807373 }, { "epoch": 0.33943049238679057, "step": 3433, "train/sim_loss": 0.05859375 }, { "epoch": 0.33943049238679057, "step": 3433, "train/total_loss": 0.1591065227985382 }, { "entropy": 9.667444229125977, "epoch": 0.3395293652363061, "mean_token_accuracy": 0.673758864402771, "num_tokens": 17874441.0, "step": 3434, "train/ce_loss": 1.7448782920837402 }, { "epoch": 0.3395293652363061, "step": 3434, "train/sim_loss": 0.125 }, { "epoch": 0.3395293652363061, "step": 3434, "train/total_loss": 0.299487829208374 }, { "entropy": 9.665374755859375, "epoch": 0.33962823808582165, "mean_token_accuracy": 0.7579832077026367, "num_tokens": 17879426.0, "step": 3435, "train/ce_loss": 4.009837084595347e-06 }, { "epoch": 0.33962823808582165, "step": 3435, "train/sim_loss": 0.0546875 }, { "epoch": 0.33962823808582165, "step": 3435, "train/total_loss": 0.054687902331352234 }, { "entropy": 9.500116348266602, "epoch": 0.33972711093533714, "mean_token_accuracy": 0.8029412031173706, "num_tokens": 17884537.0, "step": 3436, "train/ce_loss": 1.0814032554626465 }, { "epoch": 0.33972711093533714, "step": 3436, "train/sim_loss": 0.0859375 }, { "epoch": 0.33972711093533714, "step": 3436, "train/total_loss": 0.19407781958580017 }, { "entropy": 9.49143123626709, "epoch": 0.3398259837848527, "mean_token_accuracy": 0.7176128029823303, "num_tokens": 17889653.0, "step": 3437, "train/ce_loss": 1.7539986371994019 }, { "epoch": 0.3398259837848527, "step": 3437, "train/sim_loss": 0.09375 }, { "epoch": 0.3398259837848527, "step": 3437, "train/total_loss": 0.26914986968040466 }, { "entropy": 9.031991958618164, "epoch": 0.3399248566343682, "mean_token_accuracy": 0.746582567691803, "num_tokens": 17895113.0, "step": 3438, "train/ce_loss": 0.5026720762252808 }, { "epoch": 0.3399248566343682, "step": 3438, "train/sim_loss": 0.046875 }, { "epoch": 0.3399248566343682, "step": 3438, "train/total_loss": 0.09714220464229584 }, { "entropy": 9.607033729553223, "epoch": 0.3400237294838837, "mean_token_accuracy": 0.7119740843772888, "num_tokens": 17900170.0, "step": 3439, "train/ce_loss": 1.0218220949172974 }, { "epoch": 0.3400237294838837, "step": 3439, "train/sim_loss": 0.03125 }, { "epoch": 0.3400237294838837, "step": 3439, "train/total_loss": 0.13343220949172974 }, { "epoch": 0.34012260233339925, "grad_norm": 0.9295529127120972, "learning_rate": 9.152202937249667e-06, "loss": 0.1506, "step": 3440 }, { "entropy": 9.612879753112793, "epoch": 0.34012260233339925, "mean_token_accuracy": 0.7244094610214233, "num_tokens": 17905243.0, "step": 3440, "train/ce_loss": 1.4253815412521362 }, { "epoch": 0.34012260233339925, "step": 3440, "train/sim_loss": 0.0625 }, { "epoch": 0.34012260233339925, "step": 3440, "train/total_loss": 0.2050381600856781 }, { "entropy": 9.158390045166016, "epoch": 0.3402214751829148, "mean_token_accuracy": 0.7317365407943726, "num_tokens": 17910674.0, "step": 3441, "train/ce_loss": 0.8184359669685364 }, { "epoch": 0.3402214751829148, "step": 3441, "train/sim_loss": 0.05859375 }, { "epoch": 0.3402214751829148, "step": 3441, "train/total_loss": 0.14043734967708588 }, { "entropy": 9.40418815612793, "epoch": 0.3403203480324303, "mean_token_accuracy": 0.6985074877738953, "num_tokens": 17915825.0, "step": 3442, "train/ce_loss": 1.0082809925079346 }, { "epoch": 0.3403203480324303, "step": 3442, "train/sim_loss": 0.0625 }, { "epoch": 0.3403203480324303, "step": 3442, "train/total_loss": 0.1633281111717224 }, { "entropy": 8.87204360961914, "epoch": 0.3404192208819458, "mean_token_accuracy": 0.7135576009750366, "num_tokens": 17921358.0, "step": 3443, "train/ce_loss": 1.1382791996002197 }, { "epoch": 0.3404192208819458, "step": 3443, "train/sim_loss": 0.0859375 }, { "epoch": 0.3404192208819458, "step": 3443, "train/total_loss": 0.1997654139995575 }, { "entropy": 9.070274353027344, "epoch": 0.34051809373146136, "mean_token_accuracy": 0.7418546080589294, "num_tokens": 17926609.0, "step": 3444, "train/ce_loss": 1.0991380214691162 }, { "epoch": 0.34051809373146136, "step": 3444, "train/sim_loss": 0.109375 }, { "epoch": 0.34051809373146136, "step": 3444, "train/total_loss": 0.21928879618644714 }, { "entropy": 9.582058906555176, "epoch": 0.34061696658097684, "mean_token_accuracy": 0.7063903212547302, "num_tokens": 17931675.0, "step": 3445, "train/ce_loss": 1.2208040971017908e-05 }, { "epoch": 0.34061696658097684, "step": 3445, "train/sim_loss": 0.0859375 }, { "epoch": 0.34061696658097684, "step": 3445, "train/total_loss": 0.0859387218952179 }, { "entropy": 9.25374984741211, "epoch": 0.3407158394304924, "mean_token_accuracy": 0.7633987069129944, "num_tokens": 17936890.0, "step": 3446, "train/ce_loss": 0.5326510667800903 }, { "epoch": 0.3407158394304924, "step": 3446, "train/sim_loss": 0.09765625 }, { "epoch": 0.3407158394304924, "step": 3446, "train/total_loss": 0.15092135965824127 }, { "entropy": 9.824382781982422, "epoch": 0.3408147122800079, "mean_token_accuracy": 0.6777777671813965, "num_tokens": 17941772.0, "step": 3447, "train/ce_loss": 3.712680381795508e-06 }, { "epoch": 0.3408147122800079, "step": 3447, "train/sim_loss": 0.05859375 }, { "epoch": 0.3408147122800079, "step": 3447, "train/total_loss": 0.058594122529029846 }, { "entropy": 9.640596389770508, "epoch": 0.3409135851295234, "mean_token_accuracy": 0.7375415563583374, "num_tokens": 17946862.0, "step": 3448, "train/ce_loss": 0.8982908725738525 }, { "epoch": 0.3409135851295234, "step": 3448, "train/sim_loss": 0.05078125 }, { "epoch": 0.3409135851295234, "step": 3448, "train/total_loss": 0.14061033725738525 }, { "entropy": 8.825933456420898, "epoch": 0.34101245797903895, "mean_token_accuracy": 0.748314619064331, "num_tokens": 17952236.0, "step": 3449, "train/ce_loss": 0.6523842811584473 }, { "epoch": 0.34101245797903895, "step": 3449, "train/sim_loss": 0.0703125 }, { "epoch": 0.34101245797903895, "step": 3449, "train/total_loss": 0.13555093109607697 }, { "entropy": 9.274543762207031, "epoch": 0.3411113308285545, "mean_token_accuracy": 0.7023086547851562, "num_tokens": 17957479.0, "step": 3450, "train/ce_loss": 1.0586771965026855 }, { "epoch": 0.3411113308285545, "step": 3450, "train/sim_loss": 0.10546875 }, { "epoch": 0.3411113308285545, "step": 3450, "train/total_loss": 0.21133646368980408 }, { "entropy": 8.989279747009277, "epoch": 0.34121020367807, "mean_token_accuracy": 0.7483370304107666, "num_tokens": 17962874.0, "step": 3451, "train/ce_loss": 0.4404990077018738 }, { "epoch": 0.34121020367807, "step": 3451, "train/sim_loss": 0.02734375 }, { "epoch": 0.34121020367807, "step": 3451, "train/total_loss": 0.07139365375041962 }, { "entropy": 9.191324234008789, "epoch": 0.3413090765275855, "mean_token_accuracy": 0.6980440020561218, "num_tokens": 17968098.0, "step": 3452, "train/ce_loss": 1.2236419916152954 }, { "epoch": 0.3413090765275855, "step": 3452, "train/sim_loss": 0.0703125 }, { "epoch": 0.3413090765275855, "step": 3452, "train/total_loss": 0.19267669320106506 }, { "entropy": 9.291399002075195, "epoch": 0.34140794937710106, "mean_token_accuracy": 0.6732919216156006, "num_tokens": 17973394.0, "step": 3453, "train/ce_loss": 1.0472372196090873e-06 }, { "epoch": 0.34140794937710106, "step": 3453, "train/sim_loss": 0.0234375 }, { "epoch": 0.34140794937710106, "step": 3453, "train/total_loss": 0.023437604308128357 }, { "entropy": 9.780986785888672, "epoch": 0.34150682222661655, "mean_token_accuracy": 0.8097165822982788, "num_tokens": 17978311.0, "step": 3454, "train/ce_loss": 1.6872035264968872 }, { "epoch": 0.34150682222661655, "step": 3454, "train/sim_loss": 0.11328125 }, { "epoch": 0.34150682222661655, "step": 3454, "train/total_loss": 0.2820016145706177 }, { "entropy": 8.907554626464844, "epoch": 0.3416056950761321, "mean_token_accuracy": 0.7538779973983765, "num_tokens": 17983789.0, "step": 3455, "train/ce_loss": 0.6267127990722656 }, { "epoch": 0.3416056950761321, "step": 3455, "train/sim_loss": 0.01953125 }, { "epoch": 0.3416056950761321, "step": 3455, "train/total_loss": 0.08220253139734268 }, { "entropy": 9.012369155883789, "epoch": 0.34170456792564763, "mean_token_accuracy": 0.7421307563781738, "num_tokens": 17989126.0, "step": 3456, "train/ce_loss": 0.5928884744644165 }, { "epoch": 0.34170456792564763, "step": 3456, "train/sim_loss": 0.0390625 }, { "epoch": 0.34170456792564763, "step": 3456, "train/total_loss": 0.09835134446620941 }, { "entropy": 9.332815170288086, "epoch": 0.3418034407751631, "mean_token_accuracy": 0.7809523940086365, "num_tokens": 17994327.0, "step": 3457, "train/ce_loss": 0.9819399118423462 }, { "epoch": 0.3418034407751631, "step": 3457, "train/sim_loss": 0.03125 }, { "epoch": 0.3418034407751631, "step": 3457, "train/total_loss": 0.12944400310516357 }, { "entropy": 9.30552864074707, "epoch": 0.34190231362467866, "mean_token_accuracy": 0.7752613425254822, "num_tokens": 17999369.0, "step": 3458, "train/ce_loss": 1.0199953317642212 }, { "epoch": 0.34190231362467866, "step": 3458, "train/sim_loss": 0.03515625 }, { "epoch": 0.34190231362467866, "step": 3458, "train/total_loss": 0.13715578615665436 }, { "entropy": 8.830644607543945, "epoch": 0.3420011864741942, "mean_token_accuracy": 0.7116104960441589, "num_tokens": 18004662.0, "step": 3459, "train/ce_loss": 1.4183692932128906 }, { "epoch": 0.3420011864741942, "step": 3459, "train/sim_loss": 0.1484375 }, { "epoch": 0.3420011864741942, "step": 3459, "train/total_loss": 0.290274441242218 }, { "epoch": 0.3421000593237097, "grad_norm": 0.8935452103614807, "learning_rate": 9.147258072491719e-06, "loss": 0.1516, "step": 3460 }, { "entropy": 9.550859451293945, "epoch": 0.3421000593237097, "mean_token_accuracy": 0.6639871597290039, "num_tokens": 18009731.0, "step": 3460, "train/ce_loss": 1.6412410736083984 }, { "epoch": 0.3421000593237097, "step": 3460, "train/sim_loss": 0.04296875 }, { "epoch": 0.3421000593237097, "step": 3460, "train/total_loss": 0.20709286630153656 }, { "entropy": 9.644804000854492, "epoch": 0.3421989321732252, "mean_token_accuracy": 0.8066298365592957, "num_tokens": 18014723.0, "step": 3461, "train/ce_loss": 0.7327430844306946 }, { "epoch": 0.3421989321732252, "step": 3461, "train/sim_loss": 0.05078125 }, { "epoch": 0.3421989321732252, "step": 3461, "train/total_loss": 0.12405555695295334 }, { "entropy": 9.740586280822754, "epoch": 0.34229780502274076, "mean_token_accuracy": 0.7336152195930481, "num_tokens": 18019589.0, "step": 3462, "train/ce_loss": 1.472940444946289 }, { "epoch": 0.34229780502274076, "step": 3462, "train/sim_loss": 0.0546875 }, { "epoch": 0.34229780502274076, "step": 3462, "train/total_loss": 0.2019815444946289 }, { "entropy": 8.780726432800293, "epoch": 0.3423966778722563, "mean_token_accuracy": 0.6923901438713074, "num_tokens": 18025027.0, "step": 3463, "train/ce_loss": 1.0836405754089355 }, { "epoch": 0.3423966778722563, "step": 3463, "train/sim_loss": 0.0625 }, { "epoch": 0.3423966778722563, "step": 3463, "train/total_loss": 0.1708640605211258 }, { "entropy": 9.670867919921875, "epoch": 0.3424955507217718, "mean_token_accuracy": 0.7439446449279785, "num_tokens": 18030076.0, "step": 3464, "train/ce_loss": 0.7355344891548157 }, { "epoch": 0.3424955507217718, "step": 3464, "train/sim_loss": 0.04296875 }, { "epoch": 0.3424955507217718, "step": 3464, "train/total_loss": 0.11652220040559769 }, { "entropy": 9.505711555480957, "epoch": 0.34259442357128733, "mean_token_accuracy": 0.727142870426178, "num_tokens": 18035219.0, "step": 3465, "train/ce_loss": 0.724556565284729 }, { "epoch": 0.34259442357128733, "step": 3465, "train/sim_loss": 0.0703125 }, { "epoch": 0.34259442357128733, "step": 3465, "train/total_loss": 0.14276815950870514 }, { "entropy": 9.53701114654541, "epoch": 0.3426932964208029, "mean_token_accuracy": 0.7606298923492432, "num_tokens": 18040267.0, "step": 3466, "train/ce_loss": 0.8044400215148926 }, { "epoch": 0.3426932964208029, "step": 3466, "train/sim_loss": 0.046875 }, { "epoch": 0.3426932964208029, "step": 3466, "train/total_loss": 0.12731900811195374 }, { "entropy": 9.149065971374512, "epoch": 0.34279216927031836, "mean_token_accuracy": 0.7601390480995178, "num_tokens": 18045617.0, "step": 3467, "train/ce_loss": 7.459978405677248e-06 }, { "epoch": 0.34279216927031836, "step": 3467, "train/sim_loss": 0.0546875 }, { "epoch": 0.34279216927031836, "step": 3467, "train/total_loss": 0.05468824505805969 }, { "entropy": 10.015134811401367, "epoch": 0.3428910421198339, "mean_token_accuracy": 0.699999988079071, "num_tokens": 18050470.0, "step": 3468, "train/ce_loss": 3.0289363861083984 }, { "epoch": 0.3428910421198339, "step": 3468, "train/sim_loss": 0.09375 }, { "epoch": 0.3428910421198339, "step": 3468, "train/total_loss": 0.39664363861083984 }, { "entropy": 9.345232963562012, "epoch": 0.34298991496934944, "mean_token_accuracy": 0.726047933101654, "num_tokens": 18055605.0, "step": 3469, "train/ce_loss": 0.5725339651107788 }, { "epoch": 0.34298991496934944, "step": 3469, "train/sim_loss": 0.03515625 }, { "epoch": 0.34298991496934944, "step": 3469, "train/total_loss": 0.092409648001194 }, { "entropy": 9.524428367614746, "epoch": 0.3430887878188649, "mean_token_accuracy": 0.7312703728675842, "num_tokens": 18060629.0, "step": 3470, "train/ce_loss": 0.5799019932746887 }, { "epoch": 0.3430887878188649, "step": 3470, "train/sim_loss": 0.046875 }, { "epoch": 0.3430887878188649, "step": 3470, "train/total_loss": 0.10486520081758499 }, { "entropy": 9.27010726928711, "epoch": 0.34318766066838047, "mean_token_accuracy": 0.7620751261711121, "num_tokens": 18065629.0, "step": 3471, "train/ce_loss": 0.9724286198616028 }, { "epoch": 0.34318766066838047, "step": 3471, "train/sim_loss": 0.046875 }, { "epoch": 0.34318766066838047, "step": 3471, "train/total_loss": 0.14411786198616028 }, { "entropy": 8.874292373657227, "epoch": 0.343286533517896, "mean_token_accuracy": 0.7732426524162292, "num_tokens": 18070997.0, "step": 3472, "train/ce_loss": 0.6216502785682678 }, { "epoch": 0.343286533517896, "step": 3472, "train/sim_loss": 0.02734375 }, { "epoch": 0.343286533517896, "step": 3472, "train/total_loss": 0.0895087793469429 }, { "entropy": 8.968184471130371, "epoch": 0.3433854063674115, "mean_token_accuracy": 0.6972677707672119, "num_tokens": 18076395.0, "step": 3473, "train/ce_loss": 0.8104959726333618 }, { "epoch": 0.3433854063674115, "step": 3473, "train/sim_loss": 0.09375 }, { "epoch": 0.3433854063674115, "step": 3473, "train/total_loss": 0.1747995913028717 }, { "entropy": 9.562467575073242, "epoch": 0.34348427921692704, "mean_token_accuracy": 0.7288428544998169, "num_tokens": 18081397.0, "step": 3474, "train/ce_loss": 8.978898222267162e-06 }, { "epoch": 0.34348427921692704, "step": 3474, "train/sim_loss": 0.02734375 }, { "epoch": 0.34348427921692704, "step": 3474, "train/total_loss": 0.02734464779496193 }, { "entropy": 9.386184692382812, "epoch": 0.3435831520664426, "mean_token_accuracy": 0.7627840638160706, "num_tokens": 18086573.0, "step": 3475, "train/ce_loss": 2.367150500504067e-06 }, { "epoch": 0.3435831520664426, "step": 3475, "train/sim_loss": 0.0390625 }, { "epoch": 0.3435831520664426, "step": 3475, "train/total_loss": 0.0390627384185791 }, { "entropy": 9.22478199005127, "epoch": 0.34368202491595806, "mean_token_accuracy": 0.7738814949989319, "num_tokens": 18091858.0, "step": 3476, "train/ce_loss": 0.6053165793418884 }, { "epoch": 0.34368202491595806, "step": 3476, "train/sim_loss": 0.0390625 }, { "epoch": 0.34368202491595806, "step": 3476, "train/total_loss": 0.09959416091442108 }, { "entropy": 8.968910217285156, "epoch": 0.3437808977654736, "mean_token_accuracy": 0.7267637252807617, "num_tokens": 18097212.0, "step": 3477, "train/ce_loss": 1.425128698348999 }, { "epoch": 0.3437808977654736, "step": 3477, "train/sim_loss": 0.11328125 }, { "epoch": 0.3437808977654736, "step": 3477, "train/total_loss": 0.25579410791397095 }, { "entropy": 9.667119026184082, "epoch": 0.34387977061498914, "mean_token_accuracy": 0.7336769700050354, "num_tokens": 18102192.0, "step": 3478, "train/ce_loss": 0.7826544642448425 }, { "epoch": 0.34387977061498914, "step": 3478, "train/sim_loss": 0.05859375 }, { "epoch": 0.34387977061498914, "step": 3478, "train/total_loss": 0.1368592083454132 }, { "entropy": 9.635202407836914, "epoch": 0.34397864346450463, "mean_token_accuracy": 0.6964285969734192, "num_tokens": 18107051.0, "step": 3479, "train/ce_loss": 4.103856554138474e-06 }, { "epoch": 0.34397864346450463, "step": 3479, "train/sim_loss": 0.04296875 }, { "epoch": 0.34397864346450463, "step": 3479, "train/total_loss": 0.04296915978193283 }, { "epoch": 0.34407751631402017, "grad_norm": 1.0079654455184937, "learning_rate": 9.14231320773377e-06, "loss": 0.141, "step": 3480 }, { "entropy": 8.853361129760742, "epoch": 0.34407751631402017, "mean_token_accuracy": 0.7193158864974976, "num_tokens": 18112485.0, "step": 3480, "train/ce_loss": 0.8331007957458496 }, { "epoch": 0.34407751631402017, "step": 3480, "train/sim_loss": 0.07421875 }, { "epoch": 0.34407751631402017, "step": 3480, "train/total_loss": 0.1575288325548172 }, { "entropy": 9.428400039672852, "epoch": 0.3441763891635357, "mean_token_accuracy": 0.6616848111152649, "num_tokens": 18117660.0, "step": 3481, "train/ce_loss": 2.4557323455810547 }, { "epoch": 0.3441763891635357, "step": 3481, "train/sim_loss": 0.10546875 }, { "epoch": 0.3441763891635357, "step": 3481, "train/total_loss": 0.3510419726371765 }, { "entropy": 8.761262893676758, "epoch": 0.3442752620130512, "mean_token_accuracy": 0.7782857418060303, "num_tokens": 18123011.0, "step": 3482, "train/ce_loss": 0.6788957715034485 }, { "epoch": 0.3442752620130512, "step": 3482, "train/sim_loss": 0.0546875 }, { "epoch": 0.3442752620130512, "step": 3482, "train/total_loss": 0.12257707864046097 }, { "entropy": 10.270698547363281, "epoch": 0.34437413486256674, "mean_token_accuracy": 0.6736111044883728, "num_tokens": 18127668.0, "step": 3483, "train/ce_loss": 4.814478415937629e-06 }, { "epoch": 0.34437413486256674, "step": 3483, "train/sim_loss": 0.01953125 }, { "epoch": 0.34437413486256674, "step": 3483, "train/total_loss": 0.0195317305624485 }, { "entropy": 9.341055870056152, "epoch": 0.3444730077120823, "mean_token_accuracy": 0.6681286692619324, "num_tokens": 18132822.0, "step": 3484, "train/ce_loss": 1.518288254737854 }, { "epoch": 0.3444730077120823, "step": 3484, "train/sim_loss": 0.08984375 }, { "epoch": 0.3444730077120823, "step": 3484, "train/total_loss": 0.2416725754737854 }, { "entropy": 9.43017578125, "epoch": 0.34457188056159777, "mean_token_accuracy": 0.7330508232116699, "num_tokens": 18138000.0, "step": 3485, "train/ce_loss": 0.5761935114860535 }, { "epoch": 0.34457188056159777, "step": 3485, "train/sim_loss": 0.046875 }, { "epoch": 0.34457188056159777, "step": 3485, "train/total_loss": 0.10449434816837311 }, { "entropy": 8.839698791503906, "epoch": 0.3446707534111133, "mean_token_accuracy": 0.7234927415847778, "num_tokens": 18143459.0, "step": 3486, "train/ce_loss": 0.9702885746955872 }, { "epoch": 0.3446707534111133, "step": 3486, "train/sim_loss": 0.03515625 }, { "epoch": 0.3446707534111133, "step": 3486, "train/total_loss": 0.13218510150909424 }, { "entropy": 9.350061416625977, "epoch": 0.34476962626062885, "mean_token_accuracy": 0.7810107469558716, "num_tokens": 18148562.0, "step": 3487, "train/ce_loss": 1.4726591871294659e-05 }, { "epoch": 0.34476962626062885, "step": 3487, "train/sim_loss": 0.06640625 }, { "epoch": 0.34476962626062885, "step": 3487, "train/total_loss": 0.06640772521495819 }, { "entropy": 9.24201774597168, "epoch": 0.34486849911014433, "mean_token_accuracy": 0.6737288236618042, "num_tokens": 18153731.0, "step": 3488, "train/ce_loss": 1.0004758834838867 }, { "epoch": 0.34486849911014433, "step": 3488, "train/sim_loss": 0.0625 }, { "epoch": 0.34486849911014433, "step": 3488, "train/total_loss": 0.16254758834838867 }, { "entropy": 9.01718521118164, "epoch": 0.3449673719596599, "mean_token_accuracy": 0.6993710398674011, "num_tokens": 18159006.0, "step": 3489, "train/ce_loss": 0.4811428189277649 }, { "epoch": 0.3449673719596599, "step": 3489, "train/sim_loss": 0.0546875 }, { "epoch": 0.3449673719596599, "step": 3489, "train/total_loss": 0.10280178487300873 }, { "entropy": 9.156839370727539, "epoch": 0.3450662448091754, "mean_token_accuracy": 0.7376623153686523, "num_tokens": 18164259.0, "step": 3490, "train/ce_loss": 1.078993797302246 }, { "epoch": 0.3450662448091754, "step": 3490, "train/sim_loss": 0.0625 }, { "epoch": 0.3450662448091754, "step": 3490, "train/total_loss": 0.17039938271045685 }, { "entropy": 9.458637237548828, "epoch": 0.3451651176586909, "mean_token_accuracy": 0.7687296271324158, "num_tokens": 18169353.0, "step": 3491, "train/ce_loss": 1.1278468370437622 }, { "epoch": 0.3451651176586909, "step": 3491, "train/sim_loss": 0.0625 }, { "epoch": 0.3451651176586909, "step": 3491, "train/total_loss": 0.17528468370437622 }, { "entropy": 9.081297874450684, "epoch": 0.34526399050820644, "mean_token_accuracy": 0.7879133224487305, "num_tokens": 18174714.0, "step": 3492, "train/ce_loss": 0.8648220300674438 }, { "epoch": 0.34526399050820644, "step": 3492, "train/sim_loss": 0.0703125 }, { "epoch": 0.34526399050820644, "step": 3492, "train/total_loss": 0.1567946970462799 }, { "entropy": 8.773237228393555, "epoch": 0.345362863357722, "mean_token_accuracy": 0.7347908616065979, "num_tokens": 18180212.0, "step": 3493, "train/ce_loss": 1.1458483934402466 }, { "epoch": 0.345362863357722, "step": 3493, "train/sim_loss": 0.03125 }, { "epoch": 0.345362863357722, "step": 3493, "train/total_loss": 0.14583483338356018 }, { "entropy": 8.975872039794922, "epoch": 0.34546173620723747, "mean_token_accuracy": 0.7310252785682678, "num_tokens": 18185412.0, "step": 3494, "train/ce_loss": 0.642943799495697 }, { "epoch": 0.34546173620723747, "step": 3494, "train/sim_loss": 0.04296875 }, { "epoch": 0.34546173620723747, "step": 3494, "train/total_loss": 0.10726313292980194 }, { "entropy": 8.932552337646484, "epoch": 0.345560609056753, "mean_token_accuracy": 0.7300613522529602, "num_tokens": 18190755.0, "step": 3495, "train/ce_loss": 1.3272416591644287 }, { "epoch": 0.345560609056753, "step": 3495, "train/sim_loss": 0.05078125 }, { "epoch": 0.345560609056753, "step": 3495, "train/total_loss": 0.18350541591644287 }, { "entropy": 9.683507919311523, "epoch": 0.34565948190626855, "mean_token_accuracy": 0.7188678979873657, "num_tokens": 18195738.0, "step": 3496, "train/ce_loss": 0.8042916655540466 }, { "epoch": 0.34565948190626855, "step": 3496, "train/sim_loss": 0.015625 }, { "epoch": 0.34565948190626855, "step": 3496, "train/total_loss": 0.09605416655540466 }, { "entropy": 8.802057266235352, "epoch": 0.34575835475578404, "mean_token_accuracy": 0.7775306105613708, "num_tokens": 18201125.0, "step": 3497, "train/ce_loss": 0.6206459403038025 }, { "epoch": 0.34575835475578404, "step": 3497, "train/sim_loss": 0.02734375 }, { "epoch": 0.34575835475578404, "step": 3497, "train/total_loss": 0.08940834552049637 }, { "entropy": 9.39529037475586, "epoch": 0.3458572276052996, "mean_token_accuracy": 0.7643835544586182, "num_tokens": 18206320.0, "step": 3498, "train/ce_loss": 0.6989363431930542 }, { "epoch": 0.3458572276052996, "step": 3498, "train/sim_loss": 0.09375 }, { "epoch": 0.3458572276052996, "step": 3498, "train/total_loss": 0.16364362835884094 }, { "entropy": 9.509500503540039, "epoch": 0.3459561004548151, "mean_token_accuracy": 0.777414083480835, "num_tokens": 18211365.0, "step": 3499, "train/ce_loss": 1.8198801399194053e-06 }, { "epoch": 0.3459561004548151, "step": 3499, "train/sim_loss": 0.0390625 }, { "epoch": 0.3459561004548151, "step": 3499, "train/total_loss": 0.039062682539224625 }, { "epoch": 0.3460549733043306, "grad_norm": 0.6896134614944458, "learning_rate": 9.137368342975821e-06, "loss": 0.1465, "step": 3500 }, { "entropy": 9.481149673461914, "epoch": 0.3460549733043306, "mean_token_accuracy": 0.720812201499939, "num_tokens": 18216378.0, "step": 3500, "train/ce_loss": 1.0319294929504395 }, { "epoch": 0.3460549733043306, "step": 3500, "train/sim_loss": 0.05859375 }, { "epoch": 0.3460549733043306, "step": 3500, "train/total_loss": 0.16178670525550842 }, { "entropy": 9.102712631225586, "epoch": 0.34615384615384615, "mean_token_accuracy": 0.7146596908569336, "num_tokens": 18221548.0, "step": 3501, "train/ce_loss": 1.060309886932373 }, { "epoch": 0.34615384615384615, "step": 3501, "train/sim_loss": 0.07421875 }, { "epoch": 0.34615384615384615, "step": 3501, "train/total_loss": 0.18024975061416626 }, { "entropy": 9.14307975769043, "epoch": 0.3462527190033617, "mean_token_accuracy": 0.7654808759689331, "num_tokens": 18226792.0, "step": 3502, "train/ce_loss": 0.6257838010787964 }, { "epoch": 0.3462527190033617, "step": 3502, "train/sim_loss": 0.0390625 }, { "epoch": 0.3462527190033617, "step": 3502, "train/total_loss": 0.10164088010787964 }, { "entropy": 9.397144317626953, "epoch": 0.3463515918528772, "mean_token_accuracy": 0.6901615262031555, "num_tokens": 18231912.0, "step": 3503, "train/ce_loss": 2.106091187670245e-06 }, { "epoch": 0.3463515918528772, "step": 3503, "train/sim_loss": 0.08203125 }, { "epoch": 0.3463515918528772, "step": 3503, "train/total_loss": 0.08203145861625671 }, { "entropy": 8.943714141845703, "epoch": 0.3464504647023927, "mean_token_accuracy": 0.6891566514968872, "num_tokens": 18237203.0, "step": 3504, "train/ce_loss": 1.4671072959899902 }, { "epoch": 0.3464504647023927, "step": 3504, "train/sim_loss": 0.05859375 }, { "epoch": 0.3464504647023927, "step": 3504, "train/total_loss": 0.20530448853969574 }, { "entropy": 9.217236518859863, "epoch": 0.34654933755190825, "mean_token_accuracy": 0.7440944910049438, "num_tokens": 18242389.0, "step": 3505, "train/ce_loss": 0.7367294430732727 }, { "epoch": 0.34654933755190825, "step": 3505, "train/sim_loss": 0.0234375 }, { "epoch": 0.34654933755190825, "step": 3505, "train/total_loss": 0.09711044281721115 }, { "entropy": 8.967019081115723, "epoch": 0.3466482104014238, "mean_token_accuracy": 0.6997929811477661, "num_tokens": 18247769.0, "step": 3506, "train/ce_loss": 0.7110524773597717 }, { "epoch": 0.3466482104014238, "step": 3506, "train/sim_loss": 0.0390625 }, { "epoch": 0.3466482104014238, "step": 3506, "train/total_loss": 0.11016774922609329 }, { "entropy": 9.190967559814453, "epoch": 0.3467470832509393, "mean_token_accuracy": 0.6906710267066956, "num_tokens": 18252815.0, "step": 3507, "train/ce_loss": 1.5314048528671265 }, { "epoch": 0.3467470832509393, "step": 3507, "train/sim_loss": 0.125 }, { "epoch": 0.3467470832509393, "step": 3507, "train/total_loss": 0.27814048528671265 }, { "entropy": 8.84999942779541, "epoch": 0.3468459561004548, "mean_token_accuracy": 0.6917900443077087, "num_tokens": 18258039.0, "step": 3508, "train/ce_loss": 0.9880190491676331 }, { "epoch": 0.3468459561004548, "step": 3508, "train/sim_loss": 0.04296875 }, { "epoch": 0.3468459561004548, "step": 3508, "train/total_loss": 0.14177066087722778 }, { "entropy": 9.307038307189941, "epoch": 0.34694482894997036, "mean_token_accuracy": 0.6992366313934326, "num_tokens": 18263149.0, "step": 3509, "train/ce_loss": 1.0231679677963257 }, { "epoch": 0.34694482894997036, "step": 3509, "train/sim_loss": 0.06640625 }, { "epoch": 0.34694482894997036, "step": 3509, "train/total_loss": 0.16872304677963257 }, { "entropy": 9.371397018432617, "epoch": 0.34704370179948585, "mean_token_accuracy": 0.687589168548584, "num_tokens": 18268301.0, "step": 3510, "train/ce_loss": 4.765811354445759e-06 }, { "epoch": 0.34704370179948585, "step": 3510, "train/sim_loss": 0.04296875 }, { "epoch": 0.34704370179948585, "step": 3510, "train/total_loss": 0.0429692268371582 }, { "entropy": 8.887444496154785, "epoch": 0.3471425746490014, "mean_token_accuracy": 0.8310502171516418, "num_tokens": 18273666.0, "step": 3511, "train/ce_loss": 0.693034291267395 }, { "epoch": 0.3471425746490014, "step": 3511, "train/sim_loss": 0.0859375 }, { "epoch": 0.3471425746490014, "step": 3511, "train/total_loss": 0.15524092316627502 }, { "entropy": 9.004448890686035, "epoch": 0.34724144749851693, "mean_token_accuracy": 0.6916201114654541, "num_tokens": 18278990.0, "step": 3512, "train/ce_loss": 0.901794970035553 }, { "epoch": 0.34724144749851693, "step": 3512, "train/sim_loss": 0.0546875 }, { "epoch": 0.34724144749851693, "step": 3512, "train/total_loss": 0.14486700296401978 }, { "entropy": 8.916728019714355, "epoch": 0.3473403203480324, "mean_token_accuracy": 0.7067669034004211, "num_tokens": 18284412.0, "step": 3513, "train/ce_loss": 0.8450969457626343 }, { "epoch": 0.3473403203480324, "step": 3513, "train/sim_loss": 0.046875 }, { "epoch": 0.3473403203480324, "step": 3513, "train/total_loss": 0.1313847005367279 }, { "entropy": 9.046451568603516, "epoch": 0.34743919319754796, "mean_token_accuracy": 0.7526754140853882, "num_tokens": 18289752.0, "step": 3514, "train/ce_loss": 0.6713233590126038 }, { "epoch": 0.34743919319754796, "step": 3514, "train/sim_loss": 0.0546875 }, { "epoch": 0.34743919319754796, "step": 3514, "train/total_loss": 0.12181983888149261 }, { "entropy": 9.24777603149414, "epoch": 0.3475380660470635, "mean_token_accuracy": 0.7366771101951599, "num_tokens": 18294871.0, "step": 3515, "train/ce_loss": 1.0542774200439453 }, { "epoch": 0.3475380660470635, "step": 3515, "train/sim_loss": 0.0546875 }, { "epoch": 0.3475380660470635, "step": 3515, "train/total_loss": 0.16011524200439453 }, { "entropy": 9.478700637817383, "epoch": 0.347636938896579, "mean_token_accuracy": 0.7149532437324524, "num_tokens": 18299954.0, "step": 3516, "train/ce_loss": 1.3608589172363281 }, { "epoch": 0.347636938896579, "step": 3516, "train/sim_loss": 0.046875 }, { "epoch": 0.347636938896579, "step": 3516, "train/total_loss": 0.1829608976840973 }, { "entropy": 8.81755256652832, "epoch": 0.3477358117460945, "mean_token_accuracy": 0.6959064602851868, "num_tokens": 18305277.0, "step": 3517, "train/ce_loss": 0.6051265001296997 }, { "epoch": 0.3477358117460945, "step": 3517, "train/sim_loss": 0.0625 }, { "epoch": 0.3477358117460945, "step": 3517, "train/total_loss": 0.12301264703273773 }, { "entropy": 8.472495079040527, "epoch": 0.34783468459561007, "mean_token_accuracy": 0.7226970791816711, "num_tokens": 18310916.0, "step": 3518, "train/ce_loss": 0.5649114847183228 }, { "epoch": 0.34783468459561007, "step": 3518, "train/sim_loss": 0.03515625 }, { "epoch": 0.34783468459561007, "step": 3518, "train/total_loss": 0.09164740145206451 }, { "entropy": 9.438545227050781, "epoch": 0.34793355744512555, "mean_token_accuracy": 0.7453504800796509, "num_tokens": 18316089.0, "step": 3519, "train/ce_loss": 0.6985089182853699 }, { "epoch": 0.34793355744512555, "step": 3519, "train/sim_loss": 0.0234375 }, { "epoch": 0.34793355744512555, "step": 3519, "train/total_loss": 0.09328839182853699 }, { "epoch": 0.3480324302946411, "grad_norm": 0.8204121589660645, "learning_rate": 9.132423478217872e-06, "loss": 0.1556, "step": 3520 }, { "entropy": 9.595911026000977, "epoch": 0.3480324302946411, "mean_token_accuracy": 0.7762237787246704, "num_tokens": 18321068.0, "step": 3520, "train/ce_loss": 1.5138328990360606e-06 }, { "epoch": 0.3480324302946411, "step": 3520, "train/sim_loss": 0.015625 }, { "epoch": 0.3480324302946411, "step": 3520, "train/total_loss": 0.015625150874257088 }, { "entropy": 9.212435722351074, "epoch": 0.34813130314415663, "mean_token_accuracy": 0.7172236442565918, "num_tokens": 18326301.0, "step": 3521, "train/ce_loss": 0.6783554553985596 }, { "epoch": 0.34813130314415663, "step": 3521, "train/sim_loss": 0.046875 }, { "epoch": 0.34813130314415663, "step": 3521, "train/total_loss": 0.11471054702997208 }, { "entropy": 9.33420467376709, "epoch": 0.3482301759936721, "mean_token_accuracy": 0.7138554453849792, "num_tokens": 18331470.0, "step": 3522, "train/ce_loss": 1.7616900205612183 }, { "epoch": 0.3482301759936721, "step": 3522, "train/sim_loss": 0.140625 }, { "epoch": 0.3482301759936721, "step": 3522, "train/total_loss": 0.3167940080165863 }, { "entropy": 9.230966567993164, "epoch": 0.34832904884318766, "mean_token_accuracy": 0.7760563492774963, "num_tokens": 18336621.0, "step": 3523, "train/ce_loss": 0.7635095715522766 }, { "epoch": 0.34832904884318766, "step": 3523, "train/sim_loss": 0.0703125 }, { "epoch": 0.34832904884318766, "step": 3523, "train/total_loss": 0.14666345715522766 }, { "entropy": 9.205339431762695, "epoch": 0.3484279216927032, "mean_token_accuracy": 0.6947852969169617, "num_tokens": 18341745.0, "step": 3524, "train/ce_loss": 0.9602331519126892 }, { "epoch": 0.3484279216927032, "step": 3524, "train/sim_loss": 0.0703125 }, { "epoch": 0.3484279216927032, "step": 3524, "train/total_loss": 0.1663358211517334 }, { "entropy": 8.775196075439453, "epoch": 0.3485267945422187, "mean_token_accuracy": 0.7293318510055542, "num_tokens": 18347080.0, "step": 3525, "train/ce_loss": 0.6082144379615784 }, { "epoch": 0.3485267945422187, "step": 3525, "train/sim_loss": 0.046875 }, { "epoch": 0.3485267945422187, "step": 3525, "train/total_loss": 0.10769644379615784 }, { "entropy": 9.457630157470703, "epoch": 0.34862566739173423, "mean_token_accuracy": 0.7979274392127991, "num_tokens": 18352084.0, "step": 3526, "train/ce_loss": 0.6494618058204651 }, { "epoch": 0.34862566739173423, "step": 3526, "train/sim_loss": 0.02734375 }, { "epoch": 0.34862566739173423, "step": 3526, "train/total_loss": 0.09228993207216263 }, { "entropy": 9.636478424072266, "epoch": 0.34872454024124977, "mean_token_accuracy": 0.7107023596763611, "num_tokens": 18357101.0, "step": 3527, "train/ce_loss": 1.1041392087936401 }, { "epoch": 0.34872454024124977, "step": 3527, "train/sim_loss": 0.04296875 }, { "epoch": 0.34872454024124977, "step": 3527, "train/total_loss": 0.15338267385959625 }, { "entropy": 9.457572937011719, "epoch": 0.34882341309076526, "mean_token_accuracy": 0.7376811504364014, "num_tokens": 18362231.0, "step": 3528, "train/ce_loss": 1.1171162128448486 }, { "epoch": 0.34882341309076526, "step": 3528, "train/sim_loss": 0.0546875 }, { "epoch": 0.34882341309076526, "step": 3528, "train/total_loss": 0.16639912128448486 }, { "entropy": 9.173391342163086, "epoch": 0.3489222859402808, "mean_token_accuracy": 0.6962785124778748, "num_tokens": 18367491.0, "step": 3529, "train/ce_loss": 0.6444375514984131 }, { "epoch": 0.3489222859402808, "step": 3529, "train/sim_loss": 0.03125 }, { "epoch": 0.3489222859402808, "step": 3529, "train/total_loss": 0.09569375962018967 }, { "entropy": 9.359664916992188, "epoch": 0.34902115878979634, "mean_token_accuracy": 0.779552698135376, "num_tokens": 18372553.0, "step": 3530, "train/ce_loss": 0.6456083655357361 }, { "epoch": 0.34902115878979634, "step": 3530, "train/sim_loss": 0.03515625 }, { "epoch": 0.34902115878979634, "step": 3530, "train/total_loss": 0.09971708804368973 }, { "entropy": 9.433663368225098, "epoch": 0.3491200316393118, "mean_token_accuracy": 0.7432217001914978, "num_tokens": 18377618.0, "step": 3531, "train/ce_loss": 9.10674953047419e-06 }, { "epoch": 0.3491200316393118, "step": 3531, "train/sim_loss": 0.0859375 }, { "epoch": 0.3491200316393118, "step": 3531, "train/total_loss": 0.08593840897083282 }, { "entropy": 9.148578643798828, "epoch": 0.34921890448882736, "mean_token_accuracy": 0.7959427237510681, "num_tokens": 18382937.0, "step": 3532, "train/ce_loss": 0.6363712549209595 }, { "epoch": 0.34921890448882736, "step": 3532, "train/sim_loss": 0.0234375 }, { "epoch": 0.34921890448882736, "step": 3532, "train/total_loss": 0.0870746299624443 }, { "entropy": 8.85528564453125, "epoch": 0.3493177773383429, "mean_token_accuracy": 0.7173038125038147, "num_tokens": 18388528.0, "step": 3533, "train/ce_loss": 1.2859928607940674 }, { "epoch": 0.3493177773383429, "step": 3533, "train/sim_loss": 0.0859375 }, { "epoch": 0.3493177773383429, "step": 3533, "train/total_loss": 0.21453678607940674 }, { "entropy": 9.364201545715332, "epoch": 0.3494166501878584, "mean_token_accuracy": 0.7604976892471313, "num_tokens": 18393649.0, "step": 3534, "train/ce_loss": 0.7129225134849548 }, { "epoch": 0.3494166501878584, "step": 3534, "train/sim_loss": 0.05078125 }, { "epoch": 0.3494166501878584, "step": 3534, "train/total_loss": 0.12207350134849548 }, { "entropy": 9.06539535522461, "epoch": 0.34951552303737393, "mean_token_accuracy": 0.7291169166564941, "num_tokens": 18398963.0, "step": 3535, "train/ce_loss": 0.6554270386695862 }, { "epoch": 0.34951552303737393, "step": 3535, "train/sim_loss": 0.0546875 }, { "epoch": 0.34951552303737393, "step": 3535, "train/total_loss": 0.12023020535707474 }, { "entropy": 9.511955261230469, "epoch": 0.3496143958868895, "mean_token_accuracy": 0.7662771344184875, "num_tokens": 18404061.0, "step": 3536, "train/ce_loss": 0.5374231338500977 }, { "epoch": 0.3496143958868895, "step": 3536, "train/sim_loss": 0.0625 }, { "epoch": 0.3496143958868895, "step": 3536, "train/total_loss": 0.11624231934547424 }, { "entropy": 8.863699913024902, "epoch": 0.34971326873640496, "mean_token_accuracy": 0.8055271506309509, "num_tokens": 18409487.0, "step": 3537, "train/ce_loss": 0.40641874074935913 }, { "epoch": 0.34971326873640496, "step": 3537, "train/sim_loss": 0.03515625 }, { "epoch": 0.34971326873640496, "step": 3537, "train/total_loss": 0.07579812407493591 }, { "entropy": 8.911162376403809, "epoch": 0.3498121415859205, "mean_token_accuracy": 0.7116279006004333, "num_tokens": 18414808.0, "step": 3538, "train/ce_loss": 1.2232792377471924 }, { "epoch": 0.3498121415859205, "step": 3538, "train/sim_loss": 0.0703125 }, { "epoch": 0.3498121415859205, "step": 3538, "train/total_loss": 0.19264042377471924 }, { "entropy": 9.118431091308594, "epoch": 0.34991101443543604, "mean_token_accuracy": 0.7496932744979858, "num_tokens": 18420151.0, "step": 3539, "train/ce_loss": 1.101403832435608 }, { "epoch": 0.34991101443543604, "step": 3539, "train/sim_loss": 0.08203125 }, { "epoch": 0.34991101443543604, "step": 3539, "train/total_loss": 0.1921716332435608 }, { "epoch": 0.3500098872849515, "grad_norm": 0.7718408703804016, "learning_rate": 9.127478613459922e-06, "loss": 0.1463, "step": 3540 }, { "entropy": 9.095006942749023, "epoch": 0.3500098872849515, "mean_token_accuracy": 0.7156334519386292, "num_tokens": 18425384.0, "step": 3540, "train/ce_loss": 1.2889493703842163 }, { "epoch": 0.3500098872849515, "step": 3540, "train/sim_loss": 0.0703125 }, { "epoch": 0.3500098872849515, "step": 3540, "train/total_loss": 0.19920744001865387 }, { "entropy": 9.312450408935547, "epoch": 0.35010876013446707, "mean_token_accuracy": 0.7492997050285339, "num_tokens": 18430574.0, "step": 3541, "train/ce_loss": 1.968036940525053e-06 }, { "epoch": 0.35010876013446707, "step": 3541, "train/sim_loss": 0.078125 }, { "epoch": 0.35010876013446707, "step": 3541, "train/total_loss": 0.07812519371509552 }, { "entropy": 8.899011611938477, "epoch": 0.3502076329839826, "mean_token_accuracy": 0.6872928142547607, "num_tokens": 18435941.0, "step": 3542, "train/ce_loss": 0.9003424644470215 }, { "epoch": 0.3502076329839826, "step": 3542, "train/sim_loss": 0.09375 }, { "epoch": 0.3502076329839826, "step": 3542, "train/total_loss": 0.18378424644470215 }, { "entropy": 9.032732009887695, "epoch": 0.3503065058334981, "mean_token_accuracy": 0.7408804893493652, "num_tokens": 18441250.0, "step": 3543, "train/ce_loss": 0.49240124225616455 }, { "epoch": 0.3503065058334981, "step": 3543, "train/sim_loss": 0.05859375 }, { "epoch": 0.3503065058334981, "step": 3543, "train/total_loss": 0.1078338772058487 }, { "entropy": 9.10744857788086, "epoch": 0.35040537868301364, "mean_token_accuracy": 0.7295690774917603, "num_tokens": 18446369.0, "step": 3544, "train/ce_loss": 0.6408175230026245 }, { "epoch": 0.35040537868301364, "step": 3544, "train/sim_loss": 0.05859375 }, { "epoch": 0.35040537868301364, "step": 3544, "train/total_loss": 0.12267550081014633 }, { "entropy": 8.559624671936035, "epoch": 0.3505042515325292, "mean_token_accuracy": 0.7497507333755493, "num_tokens": 18451885.0, "step": 3545, "train/ce_loss": 1.0121359825134277 }, { "epoch": 0.3505042515325292, "step": 3545, "train/sim_loss": 0.09765625 }, { "epoch": 0.3505042515325292, "step": 3545, "train/total_loss": 0.19886985421180725 }, { "entropy": 8.960552215576172, "epoch": 0.3506031243820447, "mean_token_accuracy": 0.7813853025436401, "num_tokens": 18457299.0, "step": 3546, "train/ce_loss": 0.47213152050971985 }, { "epoch": 0.3506031243820447, "step": 3546, "train/sim_loss": 0.09765625 }, { "epoch": 0.3506031243820447, "step": 3546, "train/total_loss": 0.14486940205097198 }, { "entropy": 9.17786979675293, "epoch": 0.3507019972315602, "mean_token_accuracy": 0.7319316864013672, "num_tokens": 18462511.0, "step": 3547, "train/ce_loss": 0.768038809299469 }, { "epoch": 0.3507019972315602, "step": 3547, "train/sim_loss": 0.0546875 }, { "epoch": 0.3507019972315602, "step": 3547, "train/total_loss": 0.13149139285087585 }, { "entropy": 9.23261833190918, "epoch": 0.35080087008107574, "mean_token_accuracy": 0.7530201077461243, "num_tokens": 18467667.0, "step": 3548, "train/ce_loss": 1.1861225366592407 }, { "epoch": 0.35080087008107574, "step": 3548, "train/sim_loss": 0.07421875 }, { "epoch": 0.35080087008107574, "step": 3548, "train/total_loss": 0.19283100962638855 }, { "entropy": 9.488293647766113, "epoch": 0.3508997429305913, "mean_token_accuracy": 0.7153846025466919, "num_tokens": 18472733.0, "step": 3549, "train/ce_loss": 0.6497202515602112 }, { "epoch": 0.3508997429305913, "step": 3549, "train/sim_loss": 0.05078125 }, { "epoch": 0.3508997429305913, "step": 3549, "train/total_loss": 0.11575327813625336 }, { "entropy": 8.950626373291016, "epoch": 0.35099861578010677, "mean_token_accuracy": 0.7555555701255798, "num_tokens": 18478118.0, "step": 3550, "train/ce_loss": 0.7787980437278748 }, { "epoch": 0.35099861578010677, "step": 3550, "train/sim_loss": 0.0703125 }, { "epoch": 0.35099861578010677, "step": 3550, "train/total_loss": 0.14819231629371643 }, { "entropy": 9.40908432006836, "epoch": 0.3510974886296223, "mean_token_accuracy": 0.6710963249206543, "num_tokens": 18483194.0, "step": 3551, "train/ce_loss": 1.643475890159607 }, { "epoch": 0.3510974886296223, "step": 3551, "train/sim_loss": 0.05078125 }, { "epoch": 0.3510974886296223, "step": 3551, "train/total_loss": 0.2151288390159607 }, { "entropy": 9.369651794433594, "epoch": 0.35119636147913785, "mean_token_accuracy": 0.8061674237251282, "num_tokens": 18488319.0, "step": 3552, "train/ce_loss": 1.0746724605560303 }, { "epoch": 0.35119636147913785, "step": 3552, "train/sim_loss": 0.08984375 }, { "epoch": 0.35119636147913785, "step": 3552, "train/total_loss": 0.19731099903583527 }, { "entropy": 9.777578353881836, "epoch": 0.35129523432865334, "mean_token_accuracy": 0.705234169960022, "num_tokens": 18493039.0, "step": 3553, "train/ce_loss": 4.909882136416854e-06 }, { "epoch": 0.35129523432865334, "step": 3553, "train/sim_loss": 0.06640625 }, { "epoch": 0.35129523432865334, "step": 3553, "train/total_loss": 0.0664067417383194 }, { "entropy": 8.643465042114258, "epoch": 0.3513941071781689, "mean_token_accuracy": 0.7199074029922485, "num_tokens": 18498387.0, "step": 3554, "train/ce_loss": 0.8254873752593994 }, { "epoch": 0.3513941071781689, "step": 3554, "train/sim_loss": 0.078125 }, { "epoch": 0.3513941071781689, "step": 3554, "train/total_loss": 0.16067373752593994 }, { "entropy": 9.082072257995605, "epoch": 0.3514929800276844, "mean_token_accuracy": 0.7390804886817932, "num_tokens": 18503715.0, "step": 3555, "train/ce_loss": 0.5262618660926819 }, { "epoch": 0.3514929800276844, "step": 3555, "train/sim_loss": 0.0625 }, { "epoch": 0.3514929800276844, "step": 3555, "train/total_loss": 0.11512619256973267 }, { "entropy": 9.256733894348145, "epoch": 0.3515918528771999, "mean_token_accuracy": 0.7245178818702698, "num_tokens": 18508925.0, "step": 3556, "train/ce_loss": 0.7370480895042419 }, { "epoch": 0.3515918528771999, "step": 3556, "train/sim_loss": 0.07421875 }, { "epoch": 0.3515918528771999, "step": 3556, "train/total_loss": 0.1479235589504242 }, { "entropy": 9.786665916442871, "epoch": 0.35169072572671545, "mean_token_accuracy": 0.7542017102241516, "num_tokens": 18513840.0, "step": 3557, "train/ce_loss": 3.821523478109157e-06 }, { "epoch": 0.35169072572671545, "step": 3557, "train/sim_loss": 0.05078125 }, { "epoch": 0.35169072572671545, "step": 3557, "train/total_loss": 0.05078163370490074 }, { "entropy": 10.097780227661133, "epoch": 0.351789598576231, "mean_token_accuracy": 0.7157190442085266, "num_tokens": 18518562.0, "step": 3558, "train/ce_loss": 1.937991976737976 }, { "epoch": 0.351789598576231, "step": 3558, "train/sim_loss": 0.05859375 }, { "epoch": 0.351789598576231, "step": 3558, "train/total_loss": 0.2523929476737976 }, { "entropy": 9.427690505981445, "epoch": 0.3518884714257465, "mean_token_accuracy": 0.7645429372787476, "num_tokens": 18523769.0, "step": 3559, "train/ce_loss": 0.6540150046348572 }, { "epoch": 0.3518884714257465, "step": 3559, "train/sim_loss": 0.08984375 }, { "epoch": 0.3518884714257465, "step": 3559, "train/total_loss": 0.15524524450302124 }, { "epoch": 0.351987344275262, "grad_norm": 0.7682899236679077, "learning_rate": 9.122533748701975e-06, "loss": 0.1492, "step": 3560 }, { "entropy": 9.192558288574219, "epoch": 0.351987344275262, "mean_token_accuracy": 0.7062663435935974, "num_tokens": 18528996.0, "step": 3560, "train/ce_loss": 0.8725659251213074 }, { "epoch": 0.351987344275262, "step": 3560, "train/sim_loss": 0.09765625 }, { "epoch": 0.351987344275262, "step": 3560, "train/total_loss": 0.18491284549236298 }, { "entropy": 10.469324111938477, "epoch": 0.35208621712477756, "mean_token_accuracy": 1.0, "num_tokens": 18533420.0, "step": 3561, "train/ce_loss": 9.297148790210485e-05 }, { "epoch": 0.35208621712477756, "step": 3561, "train/sim_loss": 0.04296875 }, { "epoch": 0.35208621712477756, "step": 3561, "train/total_loss": 0.04297804832458496 }, { "entropy": 9.145017623901367, "epoch": 0.35218508997429304, "mean_token_accuracy": 0.7345911860466003, "num_tokens": 18538685.0, "step": 3562, "train/ce_loss": 1.5550450086593628 }, { "epoch": 0.35218508997429304, "step": 3562, "train/sim_loss": 0.09375 }, { "epoch": 0.35218508997429304, "step": 3562, "train/total_loss": 0.249254509806633 }, { "entropy": 8.81939697265625, "epoch": 0.3522839628238086, "mean_token_accuracy": 0.701694905757904, "num_tokens": 18544032.0, "step": 3563, "train/ce_loss": 1.2731009721755981 }, { "epoch": 0.3522839628238086, "step": 3563, "train/sim_loss": 0.06640625 }, { "epoch": 0.3522839628238086, "step": 3563, "train/total_loss": 0.19371634721755981 }, { "entropy": 8.905570983886719, "epoch": 0.3523828356733241, "mean_token_accuracy": 0.7117318511009216, "num_tokens": 18549406.0, "step": 3564, "train/ce_loss": 0.6244997978210449 }, { "epoch": 0.3523828356733241, "step": 3564, "train/sim_loss": 0.078125 }, { "epoch": 0.3523828356733241, "step": 3564, "train/total_loss": 0.14057497680187225 }, { "entropy": 8.975542068481445, "epoch": 0.3524817085228396, "mean_token_accuracy": 0.7894737124443054, "num_tokens": 18554792.0, "step": 3565, "train/ce_loss": 0.7156611084938049 }, { "epoch": 0.3524817085228396, "step": 3565, "train/sim_loss": 0.07421875 }, { "epoch": 0.3524817085228396, "step": 3565, "train/total_loss": 0.14578485488891602 }, { "entropy": 9.697669982910156, "epoch": 0.35258058137235515, "mean_token_accuracy": 0.7649006843566895, "num_tokens": 18559778.0, "step": 3566, "train/ce_loss": 1.360660433769226 }, { "epoch": 0.35258058137235515, "step": 3566, "train/sim_loss": 0.046875 }, { "epoch": 0.35258058137235515, "step": 3566, "train/total_loss": 0.18294104933738708 }, { "entropy": 8.912150382995605, "epoch": 0.3526794542218707, "mean_token_accuracy": 0.7199124693870544, "num_tokens": 18565133.0, "step": 3567, "train/ce_loss": 1.1734925508499146 }, { "epoch": 0.3526794542218707, "step": 3567, "train/sim_loss": 0.0546875 }, { "epoch": 0.3526794542218707, "step": 3567, "train/total_loss": 0.1720367670059204 }, { "entropy": 9.849849700927734, "epoch": 0.3527783270713862, "mean_token_accuracy": 0.7318681478500366, "num_tokens": 18570001.0, "step": 3568, "train/ce_loss": 1.5497937056352384e-05 }, { "epoch": 0.3527783270713862, "step": 3568, "train/sim_loss": 0.01953125 }, { "epoch": 0.3527783270713862, "step": 3568, "train/total_loss": 0.01953279972076416 }, { "entropy": 9.48647689819336, "epoch": 0.3528771999209017, "mean_token_accuracy": 0.7436708807945251, "num_tokens": 18575056.0, "step": 3569, "train/ce_loss": 1.4351483583450317 }, { "epoch": 0.3528771999209017, "step": 3569, "train/sim_loss": 0.0546875 }, { "epoch": 0.3528771999209017, "step": 3569, "train/total_loss": 0.19820234179496765 }, { "entropy": 8.485700607299805, "epoch": 0.35297607277041726, "mean_token_accuracy": 0.7702991366386414, "num_tokens": 18580543.0, "step": 3570, "train/ce_loss": 0.6253058910369873 }, { "epoch": 0.35297607277041726, "step": 3570, "train/sim_loss": 0.02734375 }, { "epoch": 0.35297607277041726, "step": 3570, "train/total_loss": 0.08987434208393097 }, { "entropy": 9.469551086425781, "epoch": 0.35307494561993275, "mean_token_accuracy": 0.7402234673500061, "num_tokens": 18585719.0, "step": 3571, "train/ce_loss": 1.6994985116980388e-06 }, { "epoch": 0.35307494561993275, "step": 3571, "train/sim_loss": 0.08984375 }, { "epoch": 0.35307494561993275, "step": 3571, "train/total_loss": 0.08984392136335373 }, { "entropy": 9.067724227905273, "epoch": 0.3531738184694483, "mean_token_accuracy": 0.7933579087257385, "num_tokens": 18591174.0, "step": 3572, "train/ce_loss": 0.7888551354408264 }, { "epoch": 0.3531738184694483, "step": 3572, "train/sim_loss": 0.0703125 }, { "epoch": 0.3531738184694483, "step": 3572, "train/total_loss": 0.1491980254650116 }, { "entropy": 9.695735931396484, "epoch": 0.3532726913189638, "mean_token_accuracy": 0.7185500860214233, "num_tokens": 18596056.0, "step": 3573, "train/ce_loss": 1.1860848665237427 }, { "epoch": 0.3532726913189638, "step": 3573, "train/sim_loss": 0.05078125 }, { "epoch": 0.3532726913189638, "step": 3573, "train/total_loss": 0.1693897396326065 }, { "entropy": 9.05880355834961, "epoch": 0.3533715641684793, "mean_token_accuracy": 0.7036625742912292, "num_tokens": 18601450.0, "step": 3574, "train/ce_loss": 1.1042697429656982 }, { "epoch": 0.3533715641684793, "step": 3574, "train/sim_loss": 0.14453125 }, { "epoch": 0.3533715641684793, "step": 3574, "train/total_loss": 0.25495821237564087 }, { "entropy": 9.551956176757812, "epoch": 0.35347043701799485, "mean_token_accuracy": 0.7982906103134155, "num_tokens": 18606441.0, "step": 3575, "train/ce_loss": 2.0116110590606695e-06 }, { "epoch": 0.35347043701799485, "step": 3575, "train/sim_loss": 0.06640625 }, { "epoch": 0.35347043701799485, "step": 3575, "train/total_loss": 0.06640645116567612 }, { "entropy": 9.449283599853516, "epoch": 0.3535693098675104, "mean_token_accuracy": 0.7388059496879578, "num_tokens": 18611435.0, "step": 3576, "train/ce_loss": 1.50284743309021 }, { "epoch": 0.3535693098675104, "step": 3576, "train/sim_loss": 0.10546875 }, { "epoch": 0.3535693098675104, "step": 3576, "train/total_loss": 0.2557535171508789 }, { "entropy": 9.170859336853027, "epoch": 0.3536681827170259, "mean_token_accuracy": 0.8545994162559509, "num_tokens": 18616592.0, "step": 3577, "train/ce_loss": 0.6139150261878967 }, { "epoch": 0.3536681827170259, "step": 3577, "train/sim_loss": 0.0546875 }, { "epoch": 0.3536681827170259, "step": 3577, "train/total_loss": 0.11607900261878967 }, { "entropy": 9.522819519042969, "epoch": 0.3537670555665414, "mean_token_accuracy": 0.7272727489471436, "num_tokens": 18621748.0, "step": 3578, "train/ce_loss": 1.217820644378662 }, { "epoch": 0.3537670555665414, "step": 3578, "train/sim_loss": 0.046875 }, { "epoch": 0.3537670555665414, "step": 3578, "train/total_loss": 0.1686570644378662 }, { "entropy": 8.783790588378906, "epoch": 0.35386592841605696, "mean_token_accuracy": 0.7079002261161804, "num_tokens": 18627189.0, "step": 3579, "train/ce_loss": 0.8037675023078918 }, { "epoch": 0.35386592841605696, "step": 3579, "train/sim_loss": 0.046875 }, { "epoch": 0.35386592841605696, "step": 3579, "train/total_loss": 0.1272517442703247 }, { "epoch": 0.35396480126557245, "grad_norm": 0.8489325642585754, "learning_rate": 9.117588883944025e-06, "loss": 0.14, "step": 3580 }, { "entropy": 9.09628677368164, "epoch": 0.35396480126557245, "mean_token_accuracy": 0.7522624731063843, "num_tokens": 18632524.0, "step": 3580, "train/ce_loss": 0.4255582392215729 }, { "epoch": 0.35396480126557245, "step": 3580, "train/sim_loss": 0.0625 }, { "epoch": 0.35396480126557245, "step": 3580, "train/total_loss": 0.10505582392215729 }, { "entropy": 9.367462158203125, "epoch": 0.354063674115088, "mean_token_accuracy": 0.8267831206321716, "num_tokens": 18637674.0, "step": 3581, "train/ce_loss": 0.4969341456890106 }, { "epoch": 0.354063674115088, "step": 3581, "train/sim_loss": 0.0390625 }, { "epoch": 0.354063674115088, "step": 3581, "train/total_loss": 0.08875592052936554 }, { "entropy": 9.681544303894043, "epoch": 0.35416254696460353, "mean_token_accuracy": 0.7083333134651184, "num_tokens": 18642660.0, "step": 3582, "train/ce_loss": 0.9497576951980591 }, { "epoch": 0.35416254696460353, "step": 3582, "train/sim_loss": 0.04296875 }, { "epoch": 0.35416254696460353, "step": 3582, "train/total_loss": 0.1379445195198059 }, { "entropy": 9.453405380249023, "epoch": 0.354261419814119, "mean_token_accuracy": 0.6682927012443542, "num_tokens": 18647712.0, "step": 3583, "train/ce_loss": 1.421942442902946e-06 }, { "epoch": 0.354261419814119, "step": 3583, "train/sim_loss": 0.015625 }, { "epoch": 0.354261419814119, "step": 3583, "train/total_loss": 0.01562514156103134 }, { "entropy": 9.411760330200195, "epoch": 0.35436029266363456, "mean_token_accuracy": 0.7788732647895813, "num_tokens": 18652869.0, "step": 3584, "train/ce_loss": 0.6261138319969177 }, { "epoch": 0.35436029266363456, "step": 3584, "train/sim_loss": 0.0234375 }, { "epoch": 0.35436029266363456, "step": 3584, "train/total_loss": 0.08604888617992401 }, { "entropy": 8.78024959564209, "epoch": 0.3544591655131501, "mean_token_accuracy": 0.8067581653594971, "num_tokens": 18658316.0, "step": 3585, "train/ce_loss": 0.5842998027801514 }, { "epoch": 0.3544591655131501, "step": 3585, "train/sim_loss": 0.03125 }, { "epoch": 0.3544591655131501, "step": 3585, "train/total_loss": 0.08967998623847961 }, { "entropy": 9.422521591186523, "epoch": 0.35455803836266564, "mean_token_accuracy": 0.791208803653717, "num_tokens": 18663258.0, "step": 3586, "train/ce_loss": 4.0228596844826825e-06 }, { "epoch": 0.35455803836266564, "step": 3586, "train/sim_loss": 0.06640625 }, { "epoch": 0.35455803836266564, "step": 3586, "train/total_loss": 0.06640665233135223 }, { "entropy": 9.755395889282227, "epoch": 0.3546569112121811, "mean_token_accuracy": 0.741847813129425, "num_tokens": 18668243.0, "step": 3587, "train/ce_loss": 1.243598222732544 }, { "epoch": 0.3546569112121811, "step": 3587, "train/sim_loss": 0.11328125 }, { "epoch": 0.3546569112121811, "step": 3587, "train/total_loss": 0.23764106631278992 }, { "entropy": 10.178962707519531, "epoch": 0.35475578406169667, "mean_token_accuracy": 0.7535014152526855, "num_tokens": 18673022.0, "step": 3588, "train/ce_loss": 1.7247364521026611 }, { "epoch": 0.35475578406169667, "step": 3588, "train/sim_loss": 0.078125 }, { "epoch": 0.35475578406169667, "step": 3588, "train/total_loss": 0.250598669052124 }, { "entropy": 9.204524993896484, "epoch": 0.3548546569112122, "mean_token_accuracy": 0.7333333492279053, "num_tokens": 18678305.0, "step": 3589, "train/ce_loss": 1.0631451606750488 }, { "epoch": 0.3548546569112122, "step": 3589, "train/sim_loss": 0.0546875 }, { "epoch": 0.3548546569112122, "step": 3589, "train/total_loss": 0.1610020101070404 }, { "entropy": 8.949295043945312, "epoch": 0.3549535297607277, "mean_token_accuracy": 0.7013888955116272, "num_tokens": 18683571.0, "step": 3590, "train/ce_loss": 1.1846740245819092 }, { "epoch": 0.3549535297607277, "step": 3590, "train/sim_loss": 0.0390625 }, { "epoch": 0.3549535297607277, "step": 3590, "train/total_loss": 0.15752990543842316 }, { "entropy": 9.124601364135742, "epoch": 0.35505240261024323, "mean_token_accuracy": 0.7449344396591187, "num_tokens": 18688918.0, "step": 3591, "train/ce_loss": 0.7621549963951111 }, { "epoch": 0.35505240261024323, "step": 3591, "train/sim_loss": 0.03515625 }, { "epoch": 0.35505240261024323, "step": 3591, "train/total_loss": 0.11137174814939499 }, { "entropy": 9.815519332885742, "epoch": 0.3551512754597588, "mean_token_accuracy": 0.7307692170143127, "num_tokens": 18693812.0, "step": 3592, "train/ce_loss": 1.170204758644104 }, { "epoch": 0.3551512754597588, "step": 3592, "train/sim_loss": 0.05859375 }, { "epoch": 0.3551512754597588, "step": 3592, "train/total_loss": 0.17561423778533936 }, { "entropy": 8.898412704467773, "epoch": 0.35525014830927426, "mean_token_accuracy": 0.7038251161575317, "num_tokens": 18699240.0, "step": 3593, "train/ce_loss": 0.7210386395454407 }, { "epoch": 0.35525014830927426, "step": 3593, "train/sim_loss": 0.109375 }, { "epoch": 0.35525014830927426, "step": 3593, "train/total_loss": 0.1814788579940796 }, { "entropy": 8.81280517578125, "epoch": 0.3553490211587898, "mean_token_accuracy": 0.7233368754386902, "num_tokens": 18704639.0, "step": 3594, "train/ce_loss": 0.8128060102462769 }, { "epoch": 0.3553490211587898, "step": 3594, "train/sim_loss": 0.078125 }, { "epoch": 0.3553490211587898, "step": 3594, "train/total_loss": 0.15940560400485992 }, { "entropy": 10.054403305053711, "epoch": 0.35544789400830534, "mean_token_accuracy": 0.7058823704719543, "num_tokens": 18709449.0, "step": 3595, "train/ce_loss": 5.0441635721654166e-06 }, { "epoch": 0.35544789400830534, "step": 3595, "train/sim_loss": 0.03125 }, { "epoch": 0.35544789400830534, "step": 3595, "train/total_loss": 0.03125050291419029 }, { "entropy": 9.578506469726562, "epoch": 0.35554676685782083, "mean_token_accuracy": 0.7337662577629089, "num_tokens": 18714481.0, "step": 3596, "train/ce_loss": 1.2608743906021118 }, { "epoch": 0.35554676685782083, "step": 3596, "train/sim_loss": 0.03515625 }, { "epoch": 0.35554676685782083, "step": 3596, "train/total_loss": 0.16124369204044342 }, { "entropy": 9.35792064666748, "epoch": 0.35564563970733637, "mean_token_accuracy": 0.7272727489471436, "num_tokens": 18719708.0, "step": 3597, "train/ce_loss": 1.0480810403823853 }, { "epoch": 0.35564563970733637, "step": 3597, "train/sim_loss": 0.0625 }, { "epoch": 0.35564563970733637, "step": 3597, "train/total_loss": 0.16730810701847076 }, { "entropy": 8.493367195129395, "epoch": 0.3557445125568519, "mean_token_accuracy": 0.7280898690223694, "num_tokens": 18725065.0, "step": 3598, "train/ce_loss": 1.273380994796753 }, { "epoch": 0.3557445125568519, "step": 3598, "train/sim_loss": 0.07421875 }, { "epoch": 0.3557445125568519, "step": 3598, "train/total_loss": 0.20155684649944305 }, { "entropy": 9.373510360717773, "epoch": 0.3558433854063674, "mean_token_accuracy": 0.7911646366119385, "num_tokens": 18730248.0, "step": 3599, "train/ce_loss": 0.8422386646270752 }, { "epoch": 0.3558433854063674, "step": 3599, "train/sim_loss": 0.0390625 }, { "epoch": 0.3558433854063674, "step": 3599, "train/total_loss": 0.12328636646270752 }, { "epoch": 0.35594225825588294, "grad_norm": 0.6824468374252319, "learning_rate": 9.112644019186077e-06, "loss": 0.141, "step": 3600 }, { "entropy": 9.461212158203125, "epoch": 0.35594225825588294, "mean_token_accuracy": 0.8100889921188354, "num_tokens": 18735348.0, "step": 3600, "train/ce_loss": 0.6437177062034607 }, { "epoch": 0.35594225825588294, "step": 3600, "train/sim_loss": 0.03515625 }, { "epoch": 0.35594225825588294, "step": 3600, "train/total_loss": 0.09952802211046219 }, { "entropy": 8.997678756713867, "epoch": 0.3560411311053985, "mean_token_accuracy": 0.7733473181724548, "num_tokens": 18740765.0, "step": 3601, "train/ce_loss": 0.7001959085464478 }, { "epoch": 0.3560411311053985, "step": 3601, "train/sim_loss": 0.015625 }, { "epoch": 0.3560411311053985, "step": 3601, "train/total_loss": 0.08564459532499313 }, { "entropy": 9.560249328613281, "epoch": 0.35614000395491396, "mean_token_accuracy": 0.7255520224571228, "num_tokens": 18745855.0, "step": 3602, "train/ce_loss": 0.8443648815155029 }, { "epoch": 0.35614000395491396, "step": 3602, "train/sim_loss": 0.046875 }, { "epoch": 0.35614000395491396, "step": 3602, "train/total_loss": 0.13131149113178253 }, { "entropy": 9.580425262451172, "epoch": 0.3562388768044295, "mean_token_accuracy": 0.6866764426231384, "num_tokens": 18750999.0, "step": 3603, "train/ce_loss": 1.130759596824646 }, { "epoch": 0.3562388768044295, "step": 3603, "train/sim_loss": 0.078125 }, { "epoch": 0.3562388768044295, "step": 3603, "train/total_loss": 0.19120097160339355 }, { "entropy": 9.033404350280762, "epoch": 0.35633774965394505, "mean_token_accuracy": 0.7333333492279053, "num_tokens": 18756297.0, "step": 3604, "train/ce_loss": 0.8131483793258667 }, { "epoch": 0.35633774965394505, "step": 3604, "train/sim_loss": 0.0625 }, { "epoch": 0.35633774965394505, "step": 3604, "train/total_loss": 0.1438148319721222 }, { "entropy": 9.32512378692627, "epoch": 0.35643662250346053, "mean_token_accuracy": 0.7956204414367676, "num_tokens": 18761380.0, "step": 3605, "train/ce_loss": 0.7058120369911194 }, { "epoch": 0.35643662250346053, "step": 3605, "train/sim_loss": 0.01953125 }, { "epoch": 0.35643662250346053, "step": 3605, "train/total_loss": 0.09011245518922806 }, { "entropy": 9.25851821899414, "epoch": 0.3565354953529761, "mean_token_accuracy": 0.7393689751625061, "num_tokens": 18766558.0, "step": 3606, "train/ce_loss": 0.5824702382087708 }, { "epoch": 0.3565354953529761, "step": 3606, "train/sim_loss": 0.0859375 }, { "epoch": 0.3565354953529761, "step": 3606, "train/total_loss": 0.14418452978134155 }, { "entropy": 9.170404434204102, "epoch": 0.3566343682024916, "mean_token_accuracy": 0.7585784196853638, "num_tokens": 18771844.0, "step": 3607, "train/ce_loss": 0.38632792234420776 }, { "epoch": 0.3566343682024916, "step": 3607, "train/sim_loss": 0.046875 }, { "epoch": 0.3566343682024916, "step": 3607, "train/total_loss": 0.08550779521465302 }, { "entropy": 9.091513633728027, "epoch": 0.3567332410520071, "mean_token_accuracy": 0.7013630867004395, "num_tokens": 18777148.0, "step": 3608, "train/ce_loss": 1.1107176542282104 }, { "epoch": 0.3567332410520071, "step": 3608, "train/sim_loss": 0.11328125 }, { "epoch": 0.3567332410520071, "step": 3608, "train/total_loss": 0.22435301542282104 }, { "entropy": 8.946985244750977, "epoch": 0.35683211390152264, "mean_token_accuracy": 0.71875, "num_tokens": 18782443.0, "step": 3609, "train/ce_loss": 1.255902647972107 }, { "epoch": 0.35683211390152264, "step": 3609, "train/sim_loss": 0.0703125 }, { "epoch": 0.35683211390152264, "step": 3609, "train/total_loss": 0.1959027647972107 }, { "entropy": 8.945352554321289, "epoch": 0.3569309867510382, "mean_token_accuracy": 0.7400419116020203, "num_tokens": 18787880.0, "step": 3610, "train/ce_loss": 0.8099817037582397 }, { "epoch": 0.3569309867510382, "step": 3610, "train/sim_loss": 0.1171875 }, { "epoch": 0.3569309867510382, "step": 3610, "train/total_loss": 0.19818568229675293 }, { "entropy": 9.248785972595215, "epoch": 0.35702985960055367, "mean_token_accuracy": 0.7289256453514099, "num_tokens": 18792973.0, "step": 3611, "train/ce_loss": 1.0842132568359375 }, { "epoch": 0.35702985960055367, "step": 3611, "train/sim_loss": 0.09765625 }, { "epoch": 0.35702985960055367, "step": 3611, "train/total_loss": 0.20607757568359375 }, { "entropy": 9.346733093261719, "epoch": 0.3571287324500692, "mean_token_accuracy": 0.7813299298286438, "num_tokens": 18798198.0, "step": 3612, "train/ce_loss": 0.5885779857635498 }, { "epoch": 0.3571287324500692, "step": 3612, "train/sim_loss": 0.0546875 }, { "epoch": 0.3571287324500692, "step": 3612, "train/total_loss": 0.11354529857635498 }, { "entropy": 9.392707824707031, "epoch": 0.35722760529958475, "mean_token_accuracy": 0.75789475440979, "num_tokens": 18803418.0, "step": 3613, "train/ce_loss": 0.734527587890625 }, { "epoch": 0.35722760529958475, "step": 3613, "train/sim_loss": 0.12890625 }, { "epoch": 0.35722760529958475, "step": 3613, "train/total_loss": 0.20235902070999146 }, { "entropy": 9.530153274536133, "epoch": 0.35732647814910024, "mean_token_accuracy": 0.6506550312042236, "num_tokens": 18808536.0, "step": 3614, "train/ce_loss": 2.2162020206451416 }, { "epoch": 0.35732647814910024, "step": 3614, "train/sim_loss": 0.1171875 }, { "epoch": 0.35732647814910024, "step": 3614, "train/total_loss": 0.33880770206451416 }, { "entropy": 8.940558433532715, "epoch": 0.3574253509986158, "mean_token_accuracy": 0.7585470080375671, "num_tokens": 18813971.0, "step": 3615, "train/ce_loss": 1.023219108581543 }, { "epoch": 0.3574253509986158, "step": 3615, "train/sim_loss": 0.1328125 }, { "epoch": 0.3574253509986158, "step": 3615, "train/total_loss": 0.23513442277908325 }, { "entropy": 9.895393371582031, "epoch": 0.3575242238481313, "mean_token_accuracy": 0.7389557957649231, "num_tokens": 18818888.0, "step": 3616, "train/ce_loss": 0.701510488986969 }, { "epoch": 0.3575242238481313, "step": 3616, "train/sim_loss": 0.078125 }, { "epoch": 0.3575242238481313, "step": 3616, "train/total_loss": 0.14827606081962585 }, { "entropy": 8.8057861328125, "epoch": 0.3576230966976468, "mean_token_accuracy": 0.7281845808029175, "num_tokens": 18824365.0, "step": 3617, "train/ce_loss": 0.2709960341453552 }, { "epoch": 0.3576230966976468, "step": 3617, "train/sim_loss": 0.0234375 }, { "epoch": 0.3576230966976468, "step": 3617, "train/total_loss": 0.0505371019244194 }, { "entropy": 8.984687805175781, "epoch": 0.35772196954716234, "mean_token_accuracy": 0.7944622039794922, "num_tokens": 18829811.0, "step": 3618, "train/ce_loss": 0.5673206448554993 }, { "epoch": 0.35772196954716234, "step": 3618, "train/sim_loss": 0.0234375 }, { "epoch": 0.35772196954716234, "step": 3618, "train/total_loss": 0.08016956597566605 }, { "entropy": 9.221881866455078, "epoch": 0.3578208423966779, "mean_token_accuracy": 0.771556556224823, "num_tokens": 18835097.0, "step": 3619, "train/ce_loss": 0.7672033309936523 }, { "epoch": 0.3578208423966779, "step": 3619, "train/sim_loss": 0.046875 }, { "epoch": 0.3578208423966779, "step": 3619, "train/total_loss": 0.12359533458948135 }, { "epoch": 0.35791971524619337, "grad_norm": 0.5600435733795166, "learning_rate": 9.107699154428126e-06, "loss": 0.1408, "step": 3620 }, { "entropy": 9.152168273925781, "epoch": 0.35791971524619337, "mean_token_accuracy": 0.732824444770813, "num_tokens": 18840361.0, "step": 3620, "train/ce_loss": 0.8965581655502319 }, { "epoch": 0.35791971524619337, "step": 3620, "train/sim_loss": 0.05859375 }, { "epoch": 0.35791971524619337, "step": 3620, "train/total_loss": 0.1482495665550232 }, { "entropy": 8.964424133300781, "epoch": 0.3580185880957089, "mean_token_accuracy": 0.6876190304756165, "num_tokens": 18845887.0, "step": 3621, "train/ce_loss": 0.6420177817344666 }, { "epoch": 0.3580185880957089, "step": 3621, "train/sim_loss": 0.10546875 }, { "epoch": 0.3580185880957089, "step": 3621, "train/total_loss": 0.16967052221298218 }, { "entropy": 9.759793281555176, "epoch": 0.35811746094522445, "mean_token_accuracy": 0.7165217399597168, "num_tokens": 18850933.0, "step": 3622, "train/ce_loss": 1.5218240022659302 }, { "epoch": 0.35811746094522445, "step": 3622, "train/sim_loss": 0.06640625 }, { "epoch": 0.35811746094522445, "step": 3622, "train/total_loss": 0.21858865022659302 }, { "entropy": 9.021388053894043, "epoch": 0.35821633379473994, "mean_token_accuracy": 0.699999988079071, "num_tokens": 18856409.0, "step": 3623, "train/ce_loss": 0.7790200710296631 }, { "epoch": 0.35821633379473994, "step": 3623, "train/sim_loss": 0.05859375 }, { "epoch": 0.35821633379473994, "step": 3623, "train/total_loss": 0.13649576902389526 }, { "entropy": 9.041007041931152, "epoch": 0.3583152066442555, "mean_token_accuracy": 0.7039312124252319, "num_tokens": 18861669.0, "step": 3624, "train/ce_loss": 0.6578070521354675 }, { "epoch": 0.3583152066442555, "step": 3624, "train/sim_loss": 0.07421875 }, { "epoch": 0.3583152066442555, "step": 3624, "train/total_loss": 0.13999944925308228 }, { "entropy": 8.717838287353516, "epoch": 0.358414079493771, "mean_token_accuracy": 0.7262672781944275, "num_tokens": 18867235.0, "step": 3625, "train/ce_loss": 1.4119700193405151 }, { "epoch": 0.358414079493771, "step": 3625, "train/sim_loss": 0.08984375 }, { "epoch": 0.358414079493771, "step": 3625, "train/total_loss": 0.23104076087474823 }, { "entropy": 9.458995819091797, "epoch": 0.3585129523432865, "mean_token_accuracy": 0.7584269642829895, "num_tokens": 18872254.0, "step": 3626, "train/ce_loss": 0.7267706394195557 }, { "epoch": 0.3585129523432865, "step": 3626, "train/sim_loss": 0.0625 }, { "epoch": 0.3585129523432865, "step": 3626, "train/total_loss": 0.13517707586288452 }, { "entropy": 8.873926162719727, "epoch": 0.35861182519280205, "mean_token_accuracy": 0.7026143670082092, "num_tokens": 18877645.0, "step": 3627, "train/ce_loss": 0.7436491250991821 }, { "epoch": 0.35861182519280205, "step": 3627, "train/sim_loss": 0.09375 }, { "epoch": 0.35861182519280205, "step": 3627, "train/total_loss": 0.16811491549015045 }, { "entropy": 8.854890823364258, "epoch": 0.3587106980423176, "mean_token_accuracy": 0.8172757625579834, "num_tokens": 18883029.0, "step": 3628, "train/ce_loss": 0.6892380714416504 }, { "epoch": 0.3587106980423176, "step": 3628, "train/sim_loss": 0.0703125 }, { "epoch": 0.3587106980423176, "step": 3628, "train/total_loss": 0.13923630118370056 }, { "entropy": 8.848349571228027, "epoch": 0.35880957089183313, "mean_token_accuracy": 0.7848837375640869, "num_tokens": 18888473.0, "step": 3629, "train/ce_loss": 0.6006161570549011 }, { "epoch": 0.35880957089183313, "step": 3629, "train/sim_loss": 0.0390625 }, { "epoch": 0.35880957089183313, "step": 3629, "train/total_loss": 0.09912411868572235 }, { "entropy": 9.404335021972656, "epoch": 0.3589084437413486, "mean_token_accuracy": 0.6823362112045288, "num_tokens": 18893603.0, "step": 3630, "train/ce_loss": 1.101285696029663 }, { "epoch": 0.3589084437413486, "step": 3630, "train/sim_loss": 0.05859375 }, { "epoch": 0.3589084437413486, "step": 3630, "train/total_loss": 0.16872233152389526 }, { "entropy": 9.75284481048584, "epoch": 0.35900731659086416, "mean_token_accuracy": 0.7788844704627991, "num_tokens": 18898517.0, "step": 3631, "train/ce_loss": 8.16061128716683e-06 }, { "epoch": 0.35900731659086416, "step": 3631, "train/sim_loss": 0.0390625 }, { "epoch": 0.35900731659086416, "step": 3631, "train/total_loss": 0.03906331583857536 }, { "entropy": 9.497733116149902, "epoch": 0.3591061894403797, "mean_token_accuracy": 0.7729323506355286, "num_tokens": 18903681.0, "step": 3632, "train/ce_loss": 2.0427783056220505e-06 }, { "epoch": 0.3591061894403797, "step": 3632, "train/sim_loss": 0.04296875 }, { "epoch": 0.3591061894403797, "step": 3632, "train/total_loss": 0.042968954890966415 }, { "entropy": 8.947818756103516, "epoch": 0.3592050622898952, "mean_token_accuracy": 0.7249224185943604, "num_tokens": 18909102.0, "step": 3633, "train/ce_loss": 1.2241902351379395 }, { "epoch": 0.3592050622898952, "step": 3633, "train/sim_loss": 0.11328125 }, { "epoch": 0.3592050622898952, "step": 3633, "train/total_loss": 0.23570027947425842 }, { "entropy": 9.941341400146484, "epoch": 0.3593039351394107, "mean_token_accuracy": 0.6866484880447388, "num_tokens": 18913868.0, "step": 3634, "train/ce_loss": 2.229797601699829 }, { "epoch": 0.3593039351394107, "step": 3634, "train/sim_loss": 0.0703125 }, { "epoch": 0.3593039351394107, "step": 3634, "train/total_loss": 0.2932922840118408 }, { "entropy": 8.790292739868164, "epoch": 0.35940280798892627, "mean_token_accuracy": 0.6800422668457031, "num_tokens": 18919251.0, "step": 3635, "train/ce_loss": 0.8259347677230835 }, { "epoch": 0.35940280798892627, "step": 3635, "train/sim_loss": 0.0390625 }, { "epoch": 0.35940280798892627, "step": 3635, "train/total_loss": 0.12165597826242447 }, { "entropy": 9.229532241821289, "epoch": 0.35950168083844175, "mean_token_accuracy": 0.7695364356040955, "num_tokens": 18924461.0, "step": 3636, "train/ce_loss": 0.5599935054779053 }, { "epoch": 0.35950168083844175, "step": 3636, "train/sim_loss": 0.05859375 }, { "epoch": 0.35950168083844175, "step": 3636, "train/total_loss": 0.11459310352802277 }, { "entropy": 9.133469581604004, "epoch": 0.3596005536879573, "mean_token_accuracy": 0.7037037014961243, "num_tokens": 18929719.0, "step": 3637, "train/ce_loss": 0.480461448431015 }, { "epoch": 0.3596005536879573, "step": 3637, "train/sim_loss": 0.03125 }, { "epoch": 0.3596005536879573, "step": 3637, "train/total_loss": 0.07929614186286926 }, { "entropy": 9.364177703857422, "epoch": 0.35969942653747283, "mean_token_accuracy": 0.738095223903656, "num_tokens": 18934832.0, "step": 3638, "train/ce_loss": 0.6780454516410828 }, { "epoch": 0.35969942653747283, "step": 3638, "train/sim_loss": 0.05078125 }, { "epoch": 0.35969942653747283, "step": 3638, "train/total_loss": 0.11858579516410828 }, { "entropy": 9.644359588623047, "epoch": 0.3597982993869883, "mean_token_accuracy": 0.8057692050933838, "num_tokens": 18939785.0, "step": 3639, "train/ce_loss": 2.528952109059901e-06 }, { "epoch": 0.3597982993869883, "step": 3639, "train/sim_loss": 0.05078125 }, { "epoch": 0.3597982993869883, "step": 3639, "train/total_loss": 0.050781503319740295 }, { "epoch": 0.35989717223650386, "grad_norm": 0.7416434288024902, "learning_rate": 9.102754289670178e-06, "loss": 0.1487, "step": 3640 }, { "entropy": 10.403352737426758, "epoch": 0.35989717223650386, "mean_token_accuracy": 0.7214611768722534, "num_tokens": 18944388.0, "step": 3640, "train/ce_loss": 3.3780710697174072 }, { "epoch": 0.35989717223650386, "step": 3640, "train/sim_loss": 0.0546875 }, { "epoch": 0.35989717223650386, "step": 3640, "train/total_loss": 0.3924946188926697 }, { "entropy": 9.390605926513672, "epoch": 0.3599960450860194, "mean_token_accuracy": 0.7394468784332275, "num_tokens": 18949553.0, "step": 3641, "train/ce_loss": 3.4617044093465665e-06 }, { "epoch": 0.3599960450860194, "step": 3641, "train/sim_loss": 0.0625 }, { "epoch": 0.3599960450860194, "step": 3641, "train/total_loss": 0.06250034272670746 }, { "entropy": 9.734277725219727, "epoch": 0.3600949179355349, "mean_token_accuracy": 0.8051689863204956, "num_tokens": 18954459.0, "step": 3642, "train/ce_loss": 1.4679824113845825 }, { "epoch": 0.3600949179355349, "step": 3642, "train/sim_loss": 0.01953125 }, { "epoch": 0.3600949179355349, "step": 3642, "train/total_loss": 0.166329488158226 }, { "entropy": 9.015350341796875, "epoch": 0.3601937907850504, "mean_token_accuracy": 0.7357142567634583, "num_tokens": 18959821.0, "step": 3643, "train/ce_loss": 1.2550936937332153 }, { "epoch": 0.3601937907850504, "step": 3643, "train/sim_loss": 0.09375 }, { "epoch": 0.3601937907850504, "step": 3643, "train/total_loss": 0.2192593663930893 }, { "entropy": 9.254035949707031, "epoch": 0.36029266363456597, "mean_token_accuracy": 0.7264276146888733, "num_tokens": 18965046.0, "step": 3644, "train/ce_loss": 3.5190134894946823e-06 }, { "epoch": 0.36029266363456597, "step": 3644, "train/sim_loss": 0.0546875 }, { "epoch": 0.36029266363456597, "step": 3644, "train/total_loss": 0.054687850177288055 }, { "entropy": 9.217218399047852, "epoch": 0.36039153648408145, "mean_token_accuracy": 0.704635739326477, "num_tokens": 18970310.0, "step": 3645, "train/ce_loss": 0.49777185916900635 }, { "epoch": 0.36039153648408145, "step": 3645, "train/sim_loss": 0.0703125 }, { "epoch": 0.36039153648408145, "step": 3645, "train/total_loss": 0.12008968740701675 }, { "entropy": 9.125036239624023, "epoch": 0.360490409333597, "mean_token_accuracy": 0.7408906817436218, "num_tokens": 18975548.0, "step": 3646, "train/ce_loss": 1.0844553709030151 }, { "epoch": 0.360490409333597, "step": 3646, "train/sim_loss": 0.046875 }, { "epoch": 0.360490409333597, "step": 3646, "train/total_loss": 0.15532054007053375 }, { "entropy": 9.504810333251953, "epoch": 0.36058928218311254, "mean_token_accuracy": 0.6901172399520874, "num_tokens": 18980597.0, "step": 3647, "train/ce_loss": 2.1324833596736426e-06 }, { "epoch": 0.36058928218311254, "step": 3647, "train/sim_loss": 0.0546875 }, { "epoch": 0.36058928218311254, "step": 3647, "train/total_loss": 0.05468771234154701 }, { "entropy": 9.096872329711914, "epoch": 0.360688155032628, "mean_token_accuracy": 0.7618497014045715, "num_tokens": 18985912.0, "step": 3648, "train/ce_loss": 1.5423396462210803e-06 }, { "epoch": 0.360688155032628, "step": 3648, "train/sim_loss": 0.0625 }, { "epoch": 0.360688155032628, "step": 3648, "train/total_loss": 0.06250015646219254 }, { "entropy": 9.308656692504883, "epoch": 0.36078702788214356, "mean_token_accuracy": 0.7789633870124817, "num_tokens": 18990979.0, "step": 3649, "train/ce_loss": 0.8892550468444824 }, { "epoch": 0.36078702788214356, "step": 3649, "train/sim_loss": 0.046875 }, { "epoch": 0.36078702788214356, "step": 3649, "train/total_loss": 0.13580051064491272 }, { "entropy": 9.134403228759766, "epoch": 0.3608859007316591, "mean_token_accuracy": 0.7423638701438904, "num_tokens": 18996217.0, "step": 3650, "train/ce_loss": 0.9975591897964478 }, { "epoch": 0.3608859007316591, "step": 3650, "train/sim_loss": 0.0390625 }, { "epoch": 0.3608859007316591, "step": 3650, "train/total_loss": 0.1388184130191803 }, { "entropy": 9.531702995300293, "epoch": 0.3609847735811746, "mean_token_accuracy": 0.7331136465072632, "num_tokens": 19001236.0, "step": 3651, "train/ce_loss": 1.3827279806137085 }, { "epoch": 0.3609847735811746, "step": 3651, "train/sim_loss": 0.07421875 }, { "epoch": 0.3609847735811746, "step": 3651, "train/total_loss": 0.21249155700206757 }, { "entropy": 8.824071884155273, "epoch": 0.36108364643069013, "mean_token_accuracy": 0.7599999904632568, "num_tokens": 19006597.0, "step": 3652, "train/ce_loss": 0.6945485472679138 }, { "epoch": 0.36108364643069013, "step": 3652, "train/sim_loss": 0.02734375 }, { "epoch": 0.36108364643069013, "step": 3652, "train/total_loss": 0.0967986062169075 }, { "entropy": 9.394201278686523, "epoch": 0.36118251928020567, "mean_token_accuracy": 0.7309185266494751, "num_tokens": 19011785.0, "step": 3653, "train/ce_loss": 1.196912407875061 }, { "epoch": 0.36118251928020567, "step": 3653, "train/sim_loss": 0.078125 }, { "epoch": 0.36118251928020567, "step": 3653, "train/total_loss": 0.19781625270843506 }, { "entropy": 9.342180252075195, "epoch": 0.36128139212972116, "mean_token_accuracy": 0.7170329689979553, "num_tokens": 19016984.0, "step": 3654, "train/ce_loss": 0.7816472053527832 }, { "epoch": 0.36128139212972116, "step": 3654, "train/sim_loss": 0.06640625 }, { "epoch": 0.36128139212972116, "step": 3654, "train/total_loss": 0.1445709764957428 }, { "entropy": 9.088046073913574, "epoch": 0.3613802649792367, "mean_token_accuracy": 0.7180451154708862, "num_tokens": 19022215.0, "step": 3655, "train/ce_loss": 1.2801862955093384 }, { "epoch": 0.3613802649792367, "step": 3655, "train/sim_loss": 0.07421875 }, { "epoch": 0.3613802649792367, "step": 3655, "train/total_loss": 0.20223738253116608 }, { "entropy": 8.835012435913086, "epoch": 0.36147913782875224, "mean_token_accuracy": 0.7701525092124939, "num_tokens": 19027634.0, "step": 3656, "train/ce_loss": 0.7795095443725586 }, { "epoch": 0.36147913782875224, "step": 3656, "train/sim_loss": 0.06640625 }, { "epoch": 0.36147913782875224, "step": 3656, "train/total_loss": 0.14435720443725586 }, { "entropy": 8.987771987915039, "epoch": 0.3615780106782677, "mean_token_accuracy": 0.7279322743415833, "num_tokens": 19032920.0, "step": 3657, "train/ce_loss": 0.6514043211936951 }, { "epoch": 0.3615780106782677, "step": 3657, "train/sim_loss": 0.046875 }, { "epoch": 0.3615780106782677, "step": 3657, "train/total_loss": 0.11201543360948563 }, { "entropy": 9.48392391204834, "epoch": 0.36167688352778327, "mean_token_accuracy": 0.7801302671432495, "num_tokens": 19037973.0, "step": 3658, "train/ce_loss": 0.8745675683021545 }, { "epoch": 0.36167688352778327, "step": 3658, "train/sim_loss": 0.109375 }, { "epoch": 0.36167688352778327, "step": 3658, "train/total_loss": 0.19683176279067993 }, { "entropy": 8.97184944152832, "epoch": 0.3617757563772988, "mean_token_accuracy": 0.7978494763374329, "num_tokens": 19043372.0, "step": 3659, "train/ce_loss": 0.7526065111160278 }, { "epoch": 0.3617757563772988, "step": 3659, "train/sim_loss": 0.03125 }, { "epoch": 0.3617757563772988, "step": 3659, "train/total_loss": 0.10651065409183502 }, { "epoch": 0.3618746292268143, "grad_norm": 0.7123227715492249, "learning_rate": 9.097809424912229e-06, "loss": 0.1448, "step": 3660 }, { "entropy": 9.43770694732666, "epoch": 0.3618746292268143, "mean_token_accuracy": 0.7228915691375732, "num_tokens": 19048447.0, "step": 3660, "train/ce_loss": 1.235249400138855 }, { "epoch": 0.3618746292268143, "step": 3660, "train/sim_loss": 0.0859375 }, { "epoch": 0.3618746292268143, "step": 3660, "train/total_loss": 0.20946243405342102 }, { "entropy": 9.074600219726562, "epoch": 0.36197350207632983, "mean_token_accuracy": 0.7693333625793457, "num_tokens": 19053679.0, "step": 3661, "train/ce_loss": 0.6530929207801819 }, { "epoch": 0.36197350207632983, "step": 3661, "train/sim_loss": 0.08203125 }, { "epoch": 0.36197350207632983, "step": 3661, "train/total_loss": 0.1473405361175537 }, { "entropy": 8.966264724731445, "epoch": 0.3620723749258454, "mean_token_accuracy": 0.765116274356842, "num_tokens": 19059013.0, "step": 3662, "train/ce_loss": 0.5833165645599365 }, { "epoch": 0.3620723749258454, "step": 3662, "train/sim_loss": 0.02734375 }, { "epoch": 0.3620723749258454, "step": 3662, "train/total_loss": 0.08567540347576141 }, { "entropy": 9.553458213806152, "epoch": 0.36217124777536086, "mean_token_accuracy": 0.7155476808547974, "num_tokens": 19064032.0, "step": 3663, "train/ce_loss": 5.074959517514799e-06 }, { "epoch": 0.36217124777536086, "step": 3663, "train/sim_loss": 0.0625 }, { "epoch": 0.36217124777536086, "step": 3663, "train/total_loss": 0.06250050663948059 }, { "entropy": 9.122953414916992, "epoch": 0.3622701206248764, "mean_token_accuracy": 0.7018140554428101, "num_tokens": 19069381.0, "step": 3664, "train/ce_loss": 0.9899625778198242 }, { "epoch": 0.3622701206248764, "step": 3664, "train/sim_loss": 0.07421875 }, { "epoch": 0.3622701206248764, "step": 3664, "train/total_loss": 0.17321500182151794 }, { "entropy": 9.707911491394043, "epoch": 0.36236899347439194, "mean_token_accuracy": 0.7546468377113342, "num_tokens": 19074348.0, "step": 3665, "train/ce_loss": 1.7823099369707052e-06 }, { "epoch": 0.36236899347439194, "step": 3665, "train/sim_loss": 0.05859375 }, { "epoch": 0.36236899347439194, "step": 3665, "train/total_loss": 0.058593928813934326 }, { "entropy": 9.818957328796387, "epoch": 0.36246786632390743, "mean_token_accuracy": 0.8381502628326416, "num_tokens": 19079290.0, "step": 3666, "train/ce_loss": 3.033597977264435e-06 }, { "epoch": 0.36246786632390743, "step": 3666, "train/sim_loss": 0.07421875 }, { "epoch": 0.36246786632390743, "step": 3666, "train/total_loss": 0.07421905547380447 }, { "entropy": 8.935047149658203, "epoch": 0.36256673917342297, "mean_token_accuracy": 0.7860026955604553, "num_tokens": 19084590.0, "step": 3667, "train/ce_loss": 1.4201887097442523e-06 }, { "epoch": 0.36256673917342297, "step": 3667, "train/sim_loss": 0.07421875 }, { "epoch": 0.36256673917342297, "step": 3667, "train/total_loss": 0.07421889156103134 }, { "entropy": 9.333765029907227, "epoch": 0.3626656120229385, "mean_token_accuracy": 0.7039999961853027, "num_tokens": 19089664.0, "step": 3668, "train/ce_loss": 0.8184553980827332 }, { "epoch": 0.3626656120229385, "step": 3668, "train/sim_loss": 0.05859375 }, { "epoch": 0.3626656120229385, "step": 3668, "train/total_loss": 0.14043930172920227 }, { "entropy": 8.790042877197266, "epoch": 0.36276448487245405, "mean_token_accuracy": 0.7546584010124207, "num_tokens": 19095120.0, "step": 3669, "train/ce_loss": 0.6159719228744507 }, { "epoch": 0.36276448487245405, "step": 3669, "train/sim_loss": 0.015625 }, { "epoch": 0.36276448487245405, "step": 3669, "train/total_loss": 0.07722219824790955 }, { "entropy": 9.340641021728516, "epoch": 0.36286335772196954, "mean_token_accuracy": 0.7985714077949524, "num_tokens": 19100293.0, "step": 3670, "train/ce_loss": 0.8443805575370789 }, { "epoch": 0.36286335772196954, "step": 3670, "train/sim_loss": 0.04296875 }, { "epoch": 0.36286335772196954, "step": 3670, "train/total_loss": 0.12740680575370789 }, { "entropy": 9.315393447875977, "epoch": 0.3629622305714851, "mean_token_accuracy": 0.7367773652076721, "num_tokens": 19105574.0, "step": 3671, "train/ce_loss": 1.1162911653518677 }, { "epoch": 0.3629622305714851, "step": 3671, "train/sim_loss": 0.0625 }, { "epoch": 0.3629622305714851, "step": 3671, "train/total_loss": 0.17412912845611572 }, { "entropy": 9.151384353637695, "epoch": 0.3630611034210006, "mean_token_accuracy": 0.7649208307266235, "num_tokens": 19110853.0, "step": 3672, "train/ce_loss": 0.6073848009109497 }, { "epoch": 0.3630611034210006, "step": 3672, "train/sim_loss": 0.078125 }, { "epoch": 0.3630611034210006, "step": 3672, "train/total_loss": 0.1388634741306305 }, { "entropy": 9.221534729003906, "epoch": 0.3631599762705161, "mean_token_accuracy": 0.7123473286628723, "num_tokens": 19116059.0, "step": 3673, "train/ce_loss": 1.1618515253067017 }, { "epoch": 0.3631599762705161, "step": 3673, "train/sim_loss": 0.125 }, { "epoch": 0.3631599762705161, "step": 3673, "train/total_loss": 0.24118515849113464 }, { "entropy": 9.611105918884277, "epoch": 0.36325884912003165, "mean_token_accuracy": 0.7008032202720642, "num_tokens": 19120998.0, "step": 3674, "train/ce_loss": 1.4367402791976929 }, { "epoch": 0.36325884912003165, "step": 3674, "train/sim_loss": 0.0703125 }, { "epoch": 0.36325884912003165, "step": 3674, "train/total_loss": 0.21398653090000153 }, { "entropy": 9.03463077545166, "epoch": 0.3633577219695472, "mean_token_accuracy": 0.7253270149230957, "num_tokens": 19126289.0, "step": 3675, "train/ce_loss": 0.9281492233276367 }, { "epoch": 0.3633577219695472, "step": 3675, "train/sim_loss": 0.07421875 }, { "epoch": 0.3633577219695472, "step": 3675, "train/total_loss": 0.16703367233276367 }, { "entropy": 9.68355655670166, "epoch": 0.3634565948190627, "mean_token_accuracy": 0.6813559532165527, "num_tokens": 19131350.0, "step": 3676, "train/ce_loss": 1.1487598419189453 }, { "epoch": 0.3634565948190627, "step": 3676, "train/sim_loss": 0.0859375 }, { "epoch": 0.3634565948190627, "step": 3676, "train/total_loss": 0.20081348717212677 }, { "entropy": 8.875577926635742, "epoch": 0.3635554676685782, "mean_token_accuracy": 0.7400000095367432, "num_tokens": 19136685.0, "step": 3677, "train/ce_loss": 1.3505109548568726 }, { "epoch": 0.3635554676685782, "step": 3677, "train/sim_loss": 0.07421875 }, { "epoch": 0.3635554676685782, "step": 3677, "train/total_loss": 0.20926985144615173 }, { "entropy": 9.280202865600586, "epoch": 0.36365434051809375, "mean_token_accuracy": 0.7714987993240356, "num_tokens": 19141958.0, "step": 3678, "train/ce_loss": 1.2463619709014893 }, { "epoch": 0.36365434051809375, "step": 3678, "train/sim_loss": 0.08203125 }, { "epoch": 0.36365434051809375, "step": 3678, "train/total_loss": 0.2066674530506134 }, { "entropy": 9.189447402954102, "epoch": 0.36375321336760924, "mean_token_accuracy": 0.73557049036026, "num_tokens": 19147179.0, "step": 3679, "train/ce_loss": 0.4812721312046051 }, { "epoch": 0.36375321336760924, "step": 3679, "train/sim_loss": 0.015625 }, { "epoch": 0.36375321336760924, "step": 3679, "train/total_loss": 0.06375221908092499 }, { "epoch": 0.3638520862171248, "grad_norm": 0.7537246346473694, "learning_rate": 9.092864560154281e-06, "loss": 0.1415, "step": 3680 }, { "entropy": 8.884530067443848, "epoch": 0.3638520862171248, "mean_token_accuracy": 0.7246963381767273, "num_tokens": 19152817.0, "step": 3680, "train/ce_loss": 0.4940720796585083 }, { "epoch": 0.3638520862171248, "step": 3680, "train/sim_loss": 0.0625 }, { "epoch": 0.3638520862171248, "step": 3680, "train/total_loss": 0.11190721392631531 }, { "entropy": 9.291428565979004, "epoch": 0.3639509590666403, "mean_token_accuracy": 0.728715717792511, "num_tokens": 19157923.0, "step": 3681, "train/ce_loss": 1.2066850662231445 }, { "epoch": 0.3639509590666403, "step": 3681, "train/sim_loss": 0.03515625 }, { "epoch": 0.3639509590666403, "step": 3681, "train/total_loss": 0.15582475066184998 }, { "entropy": 9.171480178833008, "epoch": 0.3640498319161558, "mean_token_accuracy": 0.7663981318473816, "num_tokens": 19163236.0, "step": 3682, "train/ce_loss": 1.0046883821487427 }, { "epoch": 0.3640498319161558, "step": 3682, "train/sim_loss": 0.02734375 }, { "epoch": 0.3640498319161558, "step": 3682, "train/total_loss": 0.12781259417533875 }, { "entropy": 9.120622634887695, "epoch": 0.36414870476567135, "mean_token_accuracy": 0.7536814212799072, "num_tokens": 19168476.0, "step": 3683, "train/ce_loss": 0.875571608543396 }, { "epoch": 0.36414870476567135, "step": 3683, "train/sim_loss": 0.078125 }, { "epoch": 0.36414870476567135, "step": 3683, "train/total_loss": 0.16568216681480408 }, { "entropy": 8.828895568847656, "epoch": 0.3642475776151869, "mean_token_accuracy": 0.7705286741256714, "num_tokens": 19173865.0, "step": 3684, "train/ce_loss": 0.8268486857414246 }, { "epoch": 0.3642475776151869, "step": 3684, "train/sim_loss": 0.0703125 }, { "epoch": 0.3642475776151869, "step": 3684, "train/total_loss": 0.15299737453460693 }, { "entropy": 9.62015438079834, "epoch": 0.3643464504647024, "mean_token_accuracy": 0.7137096524238586, "num_tokens": 19178787.0, "step": 3685, "train/ce_loss": 0.9937043786048889 }, { "epoch": 0.3643464504647024, "step": 3685, "train/sim_loss": 0.0546875 }, { "epoch": 0.3643464504647024, "step": 3685, "train/total_loss": 0.15405794978141785 }, { "entropy": 9.672927856445312, "epoch": 0.3644453233142179, "mean_token_accuracy": 0.7844203114509583, "num_tokens": 19183738.0, "step": 3686, "train/ce_loss": 1.1332205533981323 }, { "epoch": 0.3644453233142179, "step": 3686, "train/sim_loss": 0.11328125 }, { "epoch": 0.3644453233142179, "step": 3686, "train/total_loss": 0.22660329937934875 }, { "entropy": 8.847968101501465, "epoch": 0.36454419616373346, "mean_token_accuracy": 0.7869177460670471, "num_tokens": 19189270.0, "step": 3687, "train/ce_loss": 0.6084604859352112 }, { "epoch": 0.36454419616373346, "step": 3687, "train/sim_loss": 0.0390625 }, { "epoch": 0.36454419616373346, "step": 3687, "train/total_loss": 0.09990854561328888 }, { "entropy": 8.820528984069824, "epoch": 0.36464306901324894, "mean_token_accuracy": 0.7956273555755615, "num_tokens": 19194795.0, "step": 3688, "train/ce_loss": 0.5975689888000488 }, { "epoch": 0.36464306901324894, "step": 3688, "train/sim_loss": 0.06640625 }, { "epoch": 0.36464306901324894, "step": 3688, "train/total_loss": 0.12616315484046936 }, { "entropy": 9.499797821044922, "epoch": 0.3647419418627645, "mean_token_accuracy": 0.7402032017707825, "num_tokens": 19199912.0, "step": 3689, "train/ce_loss": 1.0911719799041748 }, { "epoch": 0.3647419418627645, "step": 3689, "train/sim_loss": 0.140625 }, { "epoch": 0.3647419418627645, "step": 3689, "train/total_loss": 0.24974220991134644 }, { "entropy": 9.30494499206543, "epoch": 0.36484081471228, "mean_token_accuracy": 0.7166866660118103, "num_tokens": 19205211.0, "step": 3690, "train/ce_loss": 0.6577334403991699 }, { "epoch": 0.36484081471228, "step": 3690, "train/sim_loss": 0.0703125 }, { "epoch": 0.36484081471228, "step": 3690, "train/total_loss": 0.13608583807945251 }, { "entropy": 8.974608421325684, "epoch": 0.3649396875617955, "mean_token_accuracy": 0.7792887091636658, "num_tokens": 19210653.0, "step": 3691, "train/ce_loss": 0.9531930685043335 }, { "epoch": 0.3649396875617955, "step": 3691, "train/sim_loss": 0.13671875 }, { "epoch": 0.3649396875617955, "step": 3691, "train/total_loss": 0.23203805088996887 }, { "entropy": 9.016339302062988, "epoch": 0.36503856041131105, "mean_token_accuracy": 0.7889273166656494, "num_tokens": 19215949.0, "step": 3692, "train/ce_loss": 0.6592164635658264 }, { "epoch": 0.36503856041131105, "step": 3692, "train/sim_loss": 0.0390625 }, { "epoch": 0.36503856041131105, "step": 3692, "train/total_loss": 0.10498414933681488 }, { "entropy": 8.699670791625977, "epoch": 0.3651374332608266, "mean_token_accuracy": 0.7269681692123413, "num_tokens": 19221613.0, "step": 3693, "train/ce_loss": 0.4130028486251831 }, { "epoch": 0.3651374332608266, "step": 3693, "train/sim_loss": 0.0234375 }, { "epoch": 0.3651374332608266, "step": 3693, "train/total_loss": 0.06473778188228607 }, { "entropy": 9.287849426269531, "epoch": 0.3652363061103421, "mean_token_accuracy": 0.7048345804214478, "num_tokens": 19226835.0, "step": 3694, "train/ce_loss": 1.6899060010910034 }, { "epoch": 0.3652363061103421, "step": 3694, "train/sim_loss": 0.0546875 }, { "epoch": 0.3652363061103421, "step": 3694, "train/total_loss": 0.2236780971288681 }, { "entropy": 8.74072551727295, "epoch": 0.3653351789598576, "mean_token_accuracy": 0.7337883710861206, "num_tokens": 19232182.0, "step": 3695, "train/ce_loss": 0.9204556941986084 }, { "epoch": 0.3653351789598576, "step": 3695, "train/sim_loss": 0.03515625 }, { "epoch": 0.3653351789598576, "step": 3695, "train/total_loss": 0.12720182538032532 }, { "entropy": 9.471048355102539, "epoch": 0.36543405180937316, "mean_token_accuracy": 0.7438162565231323, "num_tokens": 19237276.0, "step": 3696, "train/ce_loss": 0.9572367668151855 }, { "epoch": 0.36543405180937316, "step": 3696, "train/sim_loss": 0.05859375 }, { "epoch": 0.36543405180937316, "step": 3696, "train/total_loss": 0.1543174386024475 }, { "entropy": 9.895501136779785, "epoch": 0.36553292465888865, "mean_token_accuracy": 0.7692307829856873, "num_tokens": 19242142.0, "step": 3697, "train/ce_loss": 0.5988715291023254 }, { "epoch": 0.36553292465888865, "step": 3697, "train/sim_loss": 0.1015625 }, { "epoch": 0.36553292465888865, "step": 3697, "train/total_loss": 0.16144965589046478 }, { "entropy": 9.960525512695312, "epoch": 0.3656317975084042, "mean_token_accuracy": 0.6813953518867493, "num_tokens": 19246972.0, "step": 3698, "train/ce_loss": 1.8187681436538696 }, { "epoch": 0.3656317975084042, "step": 3698, "train/sim_loss": 0.05859375 }, { "epoch": 0.3656317975084042, "step": 3698, "train/total_loss": 0.24047057330608368 }, { "entropy": 8.915639877319336, "epoch": 0.36573067035791973, "mean_token_accuracy": 0.7660332322120667, "num_tokens": 19252320.0, "step": 3699, "train/ce_loss": 0.39387181401252747 }, { "epoch": 0.36573067035791973, "step": 3699, "train/sim_loss": 0.0234375 }, { "epoch": 0.36573067035791973, "step": 3699, "train/total_loss": 0.06282468140125275 }, { "epoch": 0.3658295432074352, "grad_norm": 0.7704654335975647, "learning_rate": 9.087919695396332e-06, "loss": 0.141, "step": 3700 }, { "entropy": 9.320602416992188, "epoch": 0.3658295432074352, "mean_token_accuracy": 0.7112299203872681, "num_tokens": 19257515.0, "step": 3700, "train/ce_loss": 0.9579900503158569 }, { "epoch": 0.3658295432074352, "step": 3700, "train/sim_loss": 0.0625 }, { "epoch": 0.3658295432074352, "step": 3700, "train/total_loss": 0.15829899907112122 }, { "entropy": 9.27962875366211, "epoch": 0.36592841605695076, "mean_token_accuracy": 0.7882187962532043, "num_tokens": 19262693.0, "step": 3701, "train/ce_loss": 0.8543885946273804 }, { "epoch": 0.36592841605695076, "step": 3701, "train/sim_loss": 0.0390625 }, { "epoch": 0.36592841605695076, "step": 3701, "train/total_loss": 0.12450136244297028 }, { "entropy": 9.562347412109375, "epoch": 0.3660272889064663, "mean_token_accuracy": 0.7322970628738403, "num_tokens": 19267699.0, "step": 3702, "train/ce_loss": 1.6200648546218872 }, { "epoch": 0.3660272889064663, "step": 3702, "train/sim_loss": 0.05859375 }, { "epoch": 0.3660272889064663, "step": 3702, "train/total_loss": 0.22060023248195648 }, { "entropy": 9.190065383911133, "epoch": 0.3661261617559818, "mean_token_accuracy": 0.7539936304092407, "num_tokens": 19272757.0, "step": 3703, "train/ce_loss": 0.5445184111595154 }, { "epoch": 0.3661261617559818, "step": 3703, "train/sim_loss": 0.0546875 }, { "epoch": 0.3661261617559818, "step": 3703, "train/total_loss": 0.1091393381357193 }, { "entropy": 9.860889434814453, "epoch": 0.3662250346054973, "mean_token_accuracy": 0.8409090638160706, "num_tokens": 19277586.0, "step": 3704, "train/ce_loss": 1.1505147218704224 }, { "epoch": 0.3662250346054973, "step": 3704, "train/sim_loss": 0.02734375 }, { "epoch": 0.3662250346054973, "step": 3704, "train/total_loss": 0.1423952281475067 }, { "entropy": 9.17490291595459, "epoch": 0.36632390745501286, "mean_token_accuracy": 0.7366504669189453, "num_tokens": 19283053.0, "step": 3705, "train/ce_loss": 0.5870340466499329 }, { "epoch": 0.36632390745501286, "step": 3705, "train/sim_loss": 0.03515625 }, { "epoch": 0.36632390745501286, "step": 3705, "train/total_loss": 0.09385965764522552 }, { "entropy": 8.471872329711914, "epoch": 0.36642278030452835, "mean_token_accuracy": 0.6974874138832092, "num_tokens": 19288551.0, "step": 3706, "train/ce_loss": 0.6062349677085876 }, { "epoch": 0.36642278030452835, "step": 3706, "train/sim_loss": 0.0859375 }, { "epoch": 0.36642278030452835, "step": 3706, "train/total_loss": 0.14656099677085876 }, { "entropy": 8.921075820922852, "epoch": 0.3665216531540439, "mean_token_accuracy": 0.7591792941093445, "num_tokens": 19293922.0, "step": 3707, "train/ce_loss": 0.659113883972168 }, { "epoch": 0.3665216531540439, "step": 3707, "train/sim_loss": 0.0234375 }, { "epoch": 0.3665216531540439, "step": 3707, "train/total_loss": 0.08934888988733292 }, { "entropy": 9.633601188659668, "epoch": 0.36662052600355943, "mean_token_accuracy": 0.7111486196517944, "num_tokens": 19298925.0, "step": 3708, "train/ce_loss": 1.052996039390564 }, { "epoch": 0.36662052600355943, "step": 3708, "train/sim_loss": 0.0390625 }, { "epoch": 0.36662052600355943, "step": 3708, "train/total_loss": 0.14436210691928864 }, { "entropy": 8.578943252563477, "epoch": 0.3667193988530749, "mean_token_accuracy": 0.7411988377571106, "num_tokens": 19304451.0, "step": 3709, "train/ce_loss": 1.1131523847579956 }, { "epoch": 0.3667193988530749, "step": 3709, "train/sim_loss": 0.140625 }, { "epoch": 0.3667193988530749, "step": 3709, "train/total_loss": 0.2519402503967285 }, { "entropy": 9.549026489257812, "epoch": 0.36681827170259046, "mean_token_accuracy": 0.7397769689559937, "num_tokens": 19309414.0, "step": 3710, "train/ce_loss": 1.154641032218933 }, { "epoch": 0.36681827170259046, "step": 3710, "train/sim_loss": 0.05078125 }, { "epoch": 0.36681827170259046, "step": 3710, "train/total_loss": 0.16624535620212555 }, { "entropy": 9.582513809204102, "epoch": 0.366917144552106, "mean_token_accuracy": 0.757785439491272, "num_tokens": 19314484.0, "step": 3711, "train/ce_loss": 0.8000929951667786 }, { "epoch": 0.366917144552106, "step": 3711, "train/sim_loss": 0.0546875 }, { "epoch": 0.366917144552106, "step": 3711, "train/total_loss": 0.1346968114376068 }, { "entropy": 9.365667343139648, "epoch": 0.36701601740162154, "mean_token_accuracy": 0.7093185186386108, "num_tokens": 19319616.0, "step": 3712, "train/ce_loss": 0.8726776242256165 }, { "epoch": 0.36701601740162154, "step": 3712, "train/sim_loss": 0.046875 }, { "epoch": 0.36701601740162154, "step": 3712, "train/total_loss": 0.13414275646209717 }, { "entropy": 8.654797554016113, "epoch": 0.367114890251137, "mean_token_accuracy": 0.7637088894844055, "num_tokens": 19325105.0, "step": 3713, "train/ce_loss": 0.8119332194328308 }, { "epoch": 0.367114890251137, "step": 3713, "train/sim_loss": 0.046875 }, { "epoch": 0.367114890251137, "step": 3713, "train/total_loss": 0.12806832790374756 }, { "entropy": 9.25880241394043, "epoch": 0.36721376310065257, "mean_token_accuracy": 0.7027741074562073, "num_tokens": 19330350.0, "step": 3714, "train/ce_loss": 1.0818811655044556 }, { "epoch": 0.36721376310065257, "step": 3714, "train/sim_loss": 0.03515625 }, { "epoch": 0.36721376310065257, "step": 3714, "train/total_loss": 0.14334437251091003 }, { "entropy": 9.742050170898438, "epoch": 0.3673126359501681, "mean_token_accuracy": 0.7057416439056396, "num_tokens": 19335185.0, "step": 3715, "train/ce_loss": 1.0356760867580306e-05 }, { "epoch": 0.3673126359501681, "step": 3715, "train/sim_loss": 0.03515625 }, { "epoch": 0.3673126359501681, "step": 3715, "train/total_loss": 0.03515728563070297 }, { "entropy": 9.491357803344727, "epoch": 0.3674115087996836, "mean_token_accuracy": 0.8006535768508911, "num_tokens": 19340254.0, "step": 3716, "train/ce_loss": 0.784355878829956 }, { "epoch": 0.3674115087996836, "step": 3716, "train/sim_loss": 0.0390625 }, { "epoch": 0.3674115087996836, "step": 3716, "train/total_loss": 0.11749809235334396 }, { "entropy": 9.089653968811035, "epoch": 0.36751038164919914, "mean_token_accuracy": 0.7354037165641785, "num_tokens": 19345497.0, "step": 3717, "train/ce_loss": 0.49602407217025757 }, { "epoch": 0.36751038164919914, "step": 3717, "train/sim_loss": 0.07421875 }, { "epoch": 0.36751038164919914, "step": 3717, "train/total_loss": 0.12382115423679352 }, { "entropy": 9.133569717407227, "epoch": 0.3676092544987147, "mean_token_accuracy": 0.7141134142875671, "num_tokens": 19350770.0, "step": 3718, "train/ce_loss": 1.0562591552734375 }, { "epoch": 0.3676092544987147, "step": 3718, "train/sim_loss": 0.0703125 }, { "epoch": 0.3676092544987147, "step": 3718, "train/total_loss": 0.1759384274482727 }, { "entropy": 9.450826644897461, "epoch": 0.36770812734823016, "mean_token_accuracy": 0.7879282236099243, "num_tokens": 19355863.0, "step": 3719, "train/ce_loss": 7.788343282300048e-06 }, { "epoch": 0.36770812734823016, "step": 3719, "train/sim_loss": 0.06640625 }, { "epoch": 0.36770812734823016, "step": 3719, "train/total_loss": 0.06640703231096268 }, { "epoch": 0.3678070001977457, "grad_norm": 0.6311770677566528, "learning_rate": 9.082974830638382e-06, "loss": 0.1422, "step": 3720 }, { "entropy": 8.871721267700195, "epoch": 0.3678070001977457, "mean_token_accuracy": 0.7399380803108215, "num_tokens": 19361335.0, "step": 3720, "train/ce_loss": 0.5923374891281128 }, { "epoch": 0.3678070001977457, "step": 3720, "train/sim_loss": 0.046875 }, { "epoch": 0.3678070001977457, "step": 3720, "train/total_loss": 0.10610875487327576 }, { "entropy": 9.379371643066406, "epoch": 0.36790587304726124, "mean_token_accuracy": 0.7760252356529236, "num_tokens": 19366365.0, "step": 3721, "train/ce_loss": 3.045043285965221e-06 }, { "epoch": 0.36790587304726124, "step": 3721, "train/sim_loss": 0.05859375 }, { "epoch": 0.36790587304726124, "step": 3721, "train/total_loss": 0.058594055473804474 }, { "entropy": 8.701794624328613, "epoch": 0.36800474589677673, "mean_token_accuracy": 0.7163197994232178, "num_tokens": 19371750.0, "step": 3722, "train/ce_loss": 1.0332472324371338 }, { "epoch": 0.36800474589677673, "step": 3722, "train/sim_loss": 0.03125 }, { "epoch": 0.36800474589677673, "step": 3722, "train/total_loss": 0.13457472622394562 }, { "entropy": 9.15493392944336, "epoch": 0.36810361874629227, "mean_token_accuracy": 0.7419354915618896, "num_tokens": 19377018.0, "step": 3723, "train/ce_loss": 0.6793240308761597 }, { "epoch": 0.36810361874629227, "step": 3723, "train/sim_loss": 0.015625 }, { "epoch": 0.36810361874629227, "step": 3723, "train/total_loss": 0.08355740457773209 }, { "entropy": 9.095837593078613, "epoch": 0.3682024915958078, "mean_token_accuracy": 0.7668463587760925, "num_tokens": 19382280.0, "step": 3724, "train/ce_loss": 3.6239430301066022e-06 }, { "epoch": 0.3682024915958078, "step": 3724, "train/sim_loss": 0.0625 }, { "epoch": 0.3682024915958078, "step": 3724, "train/total_loss": 0.06250036507844925 }, { "entropy": 9.607484817504883, "epoch": 0.3683013644453233, "mean_token_accuracy": 0.7284403443336487, "num_tokens": 19387237.0, "step": 3725, "train/ce_loss": 1.377042531967163 }, { "epoch": 0.3683013644453233, "step": 3725, "train/sim_loss": 0.0546875 }, { "epoch": 0.3683013644453233, "step": 3725, "train/total_loss": 0.1923917531967163 }, { "entropy": 9.639066696166992, "epoch": 0.36840023729483884, "mean_token_accuracy": 0.686274528503418, "num_tokens": 19392196.0, "step": 3726, "train/ce_loss": 4.535922016657423e-06 }, { "epoch": 0.36840023729483884, "step": 3726, "train/sim_loss": 0.05859375 }, { "epoch": 0.36840023729483884, "step": 3726, "train/total_loss": 0.05859420448541641 }, { "entropy": 8.723943710327148, "epoch": 0.3684991101443544, "mean_token_accuracy": 0.7697993516921997, "num_tokens": 19397686.0, "step": 3727, "train/ce_loss": 0.8708702325820923 }, { "epoch": 0.3684991101443544, "step": 3727, "train/sim_loss": 0.05859375 }, { "epoch": 0.3684991101443544, "step": 3727, "train/total_loss": 0.14568078517913818 }, { "entropy": 9.060258865356445, "epoch": 0.36859798299386987, "mean_token_accuracy": 0.7626112699508667, "num_tokens": 19402807.0, "step": 3728, "train/ce_loss": 1.1802632808685303 }, { "epoch": 0.36859798299386987, "step": 3728, "train/sim_loss": 0.0625 }, { "epoch": 0.36859798299386987, "step": 3728, "train/total_loss": 0.18052633106708527 }, { "entropy": 9.597804069519043, "epoch": 0.3686968558433854, "mean_token_accuracy": 0.7648351788520813, "num_tokens": 19407668.0, "step": 3729, "train/ce_loss": 1.4211921691894531 }, { "epoch": 0.3686968558433854, "step": 3729, "train/sim_loss": 0.04296875 }, { "epoch": 0.3686968558433854, "step": 3729, "train/total_loss": 0.18508796393871307 }, { "entropy": 9.590475082397461, "epoch": 0.36879572869290095, "mean_token_accuracy": 0.7209677696228027, "num_tokens": 19412752.0, "step": 3730, "train/ce_loss": 0.8765764832496643 }, { "epoch": 0.36879572869290095, "step": 3730, "train/sim_loss": 0.03125 }, { "epoch": 0.36879572869290095, "step": 3730, "train/total_loss": 0.11890765279531479 }, { "entropy": 9.332487106323242, "epoch": 0.36889460154241643, "mean_token_accuracy": 0.8088012337684631, "num_tokens": 19417831.0, "step": 3731, "train/ce_loss": 6.978231340326602e-06 }, { "epoch": 0.36889460154241643, "step": 3731, "train/sim_loss": 0.05078125 }, { "epoch": 0.36889460154241643, "step": 3731, "train/total_loss": 0.05078194662928581 }, { "entropy": 9.907342910766602, "epoch": 0.368993474391932, "mean_token_accuracy": 0.7770270109176636, "num_tokens": 19422694.0, "step": 3732, "train/ce_loss": 1.3829725980758667 }, { "epoch": 0.368993474391932, "step": 3732, "train/sim_loss": 0.03125 }, { "epoch": 0.368993474391932, "step": 3732, "train/total_loss": 0.16954725980758667 }, { "entropy": 8.675329208374023, "epoch": 0.3690923472414475, "mean_token_accuracy": 0.769011378288269, "num_tokens": 19428250.0, "step": 3733, "train/ce_loss": 1.1538902521133423 }, { "epoch": 0.3690923472414475, "step": 3733, "train/sim_loss": 0.06640625 }, { "epoch": 0.3690923472414475, "step": 3733, "train/total_loss": 0.18179526925086975 }, { "entropy": 8.950559616088867, "epoch": 0.369191220090963, "mean_token_accuracy": 0.733031690120697, "num_tokens": 19433631.0, "step": 3734, "train/ce_loss": 1.0515861511230469 }, { "epoch": 0.369191220090963, "step": 3734, "train/sim_loss": 0.046875 }, { "epoch": 0.369191220090963, "step": 3734, "train/total_loss": 0.15203362703323364 }, { "entropy": 8.948648452758789, "epoch": 0.36929009294047854, "mean_token_accuracy": 0.7572254538536072, "num_tokens": 19438924.0, "step": 3735, "train/ce_loss": 0.8543544411659241 }, { "epoch": 0.36929009294047854, "step": 3735, "train/sim_loss": 0.0859375 }, { "epoch": 0.36929009294047854, "step": 3735, "train/total_loss": 0.17137295007705688 }, { "entropy": 9.056068420410156, "epoch": 0.3693889657899941, "mean_token_accuracy": 0.7157894968986511, "num_tokens": 19444153.0, "step": 3736, "train/ce_loss": 0.5833434462547302 }, { "epoch": 0.3693889657899941, "step": 3736, "train/sim_loss": 0.0703125 }, { "epoch": 0.3693889657899941, "step": 3736, "train/total_loss": 0.1286468505859375 }, { "entropy": 9.409212112426758, "epoch": 0.36948783863950957, "mean_token_accuracy": 0.739130437374115, "num_tokens": 19449169.0, "step": 3737, "train/ce_loss": 1.196805715560913 }, { "epoch": 0.36948783863950957, "step": 3737, "train/sim_loss": 0.05859375 }, { "epoch": 0.36948783863950957, "step": 3737, "train/total_loss": 0.17827433347702026 }, { "entropy": 9.145063400268555, "epoch": 0.3695867114890251, "mean_token_accuracy": 0.7054263353347778, "num_tokens": 19454369.0, "step": 3738, "train/ce_loss": 0.6760798096656799 }, { "epoch": 0.3695867114890251, "step": 3738, "train/sim_loss": 0.0703125 }, { "epoch": 0.3695867114890251, "step": 3738, "train/total_loss": 0.13792048394680023 }, { "entropy": 8.771507263183594, "epoch": 0.36968558433854065, "mean_token_accuracy": 0.7543054223060608, "num_tokens": 19459770.0, "step": 3739, "train/ce_loss": 1.1303457021713257 }, { "epoch": 0.36968558433854065, "step": 3739, "train/sim_loss": 0.0546875 }, { "epoch": 0.36968558433854065, "step": 3739, "train/total_loss": 0.16772207617759705 }, { "epoch": 0.36978445718805614, "grad_norm": 0.6994771361351013, "learning_rate": 9.078029965880434e-06, "loss": 0.1386, "step": 3740 }, { "entropy": 9.018630027770996, "epoch": 0.36978445718805614, "mean_token_accuracy": 0.7392900586128235, "num_tokens": 19465066.0, "step": 3740, "train/ce_loss": 0.9878615736961365 }, { "epoch": 0.36978445718805614, "step": 3740, "train/sim_loss": 0.06640625 }, { "epoch": 0.36978445718805614, "step": 3740, "train/total_loss": 0.1651924103498459 }, { "entropy": 9.008108139038086, "epoch": 0.3698833300375717, "mean_token_accuracy": 0.7908979058265686, "num_tokens": 19470352.0, "step": 3741, "train/ce_loss": 0.7387509346008301 }, { "epoch": 0.3698833300375717, "step": 3741, "train/sim_loss": 0.0234375 }, { "epoch": 0.3698833300375717, "step": 3741, "train/total_loss": 0.09731259196996689 }, { "entropy": 9.422561645507812, "epoch": 0.3699822028870872, "mean_token_accuracy": 0.7617260813713074, "num_tokens": 19475356.0, "step": 3742, "train/ce_loss": 2.9977215945109492e-06 }, { "epoch": 0.3699822028870872, "step": 3742, "train/sim_loss": 0.0625 }, { "epoch": 0.3699822028870872, "step": 3742, "train/total_loss": 0.06250029802322388 }, { "entropy": 9.204683303833008, "epoch": 0.3700810757366027, "mean_token_accuracy": 0.7597222328186035, "num_tokens": 19480550.0, "step": 3743, "train/ce_loss": 3.021031261596363e-06 }, { "epoch": 0.3700810757366027, "step": 3743, "train/sim_loss": 0.07421875 }, { "epoch": 0.3700810757366027, "step": 3743, "train/total_loss": 0.07421905547380447 }, { "entropy": 9.255865097045898, "epoch": 0.37017994858611825, "mean_token_accuracy": 0.7313432693481445, "num_tokens": 19485791.0, "step": 3744, "train/ce_loss": 1.0754189491271973 }, { "epoch": 0.37017994858611825, "step": 3744, "train/sim_loss": 0.07421875 }, { "epoch": 0.37017994858611825, "step": 3744, "train/total_loss": 0.18176063895225525 }, { "entropy": 8.607366561889648, "epoch": 0.3702788214356338, "mean_token_accuracy": 0.7419354915618896, "num_tokens": 19491292.0, "step": 3745, "train/ce_loss": 0.5163165926933289 }, { "epoch": 0.3702788214356338, "step": 3745, "train/sim_loss": 0.0859375 }, { "epoch": 0.3702788214356338, "step": 3745, "train/total_loss": 0.13756915926933289 }, { "entropy": 9.31511116027832, "epoch": 0.3703776942851493, "mean_token_accuracy": 0.7166947722434998, "num_tokens": 19496345.0, "step": 3746, "train/ce_loss": 2.6260327103955206e-06 }, { "epoch": 0.3703776942851493, "step": 3746, "train/sim_loss": 0.03125 }, { "epoch": 0.3703776942851493, "step": 3746, "train/total_loss": 0.03125026077032089 }, { "entropy": 8.843770980834961, "epoch": 0.3704765671346648, "mean_token_accuracy": 0.7981220483779907, "num_tokens": 19501675.0, "step": 3747, "train/ce_loss": 0.416376531124115 }, { "epoch": 0.3704765671346648, "step": 3747, "train/sim_loss": 0.06640625 }, { "epoch": 0.3704765671346648, "step": 3747, "train/total_loss": 0.10804390907287598 }, { "entropy": 8.891765594482422, "epoch": 0.37057543998418035, "mean_token_accuracy": 0.6957123279571533, "num_tokens": 19506866.0, "step": 3748, "train/ce_loss": 1.1417073011398315 }, { "epoch": 0.37057543998418035, "step": 3748, "train/sim_loss": 0.1015625 }, { "epoch": 0.37057543998418035, "step": 3748, "train/total_loss": 0.21573323011398315 }, { "entropy": 8.899946212768555, "epoch": 0.37067431283369584, "mean_token_accuracy": 0.7761732935905457, "num_tokens": 19512168.0, "step": 3749, "train/ce_loss": 0.7011145949363708 }, { "epoch": 0.37067431283369584, "step": 3749, "train/sim_loss": 0.05078125 }, { "epoch": 0.37067431283369584, "step": 3749, "train/total_loss": 0.1208927109837532 }, { "entropy": 8.65135383605957, "epoch": 0.3707731856832114, "mean_token_accuracy": 0.7608225345611572, "num_tokens": 19517536.0, "step": 3750, "train/ce_loss": 0.9012789130210876 }, { "epoch": 0.3707731856832114, "step": 3750, "train/sim_loss": 0.06640625 }, { "epoch": 0.3707731856832114, "step": 3750, "train/total_loss": 0.1565341353416443 }, { "entropy": 8.757684707641602, "epoch": 0.3708720585327269, "mean_token_accuracy": 0.6804009079933167, "num_tokens": 19522891.0, "step": 3751, "train/ce_loss": 0.7830634117126465 }, { "epoch": 0.3708720585327269, "step": 3751, "train/sim_loss": 0.04296875 }, { "epoch": 0.3708720585327269, "step": 3751, "train/total_loss": 0.12127508968114853 }, { "entropy": 8.842325210571289, "epoch": 0.37097093138224246, "mean_token_accuracy": 0.7467455863952637, "num_tokens": 19528197.0, "step": 3752, "train/ce_loss": 0.8398581743240356 }, { "epoch": 0.37097093138224246, "step": 3752, "train/sim_loss": 0.08984375 }, { "epoch": 0.37097093138224246, "step": 3752, "train/total_loss": 0.1738295704126358 }, { "entropy": 9.236410140991211, "epoch": 0.37106980423175795, "mean_token_accuracy": 0.8135592937469482, "num_tokens": 19533245.0, "step": 3753, "train/ce_loss": 0.872316837310791 }, { "epoch": 0.37106980423175795, "step": 3753, "train/sim_loss": 0.05078125 }, { "epoch": 0.37106980423175795, "step": 3753, "train/total_loss": 0.13801294565200806 }, { "entropy": 8.678380966186523, "epoch": 0.3711686770812735, "mean_token_accuracy": 0.7435367107391357, "num_tokens": 19538721.0, "step": 3754, "train/ce_loss": 1.0098522901535034 }, { "epoch": 0.3711686770812735, "step": 3754, "train/sim_loss": 0.0859375 }, { "epoch": 0.3711686770812735, "step": 3754, "train/total_loss": 0.18692272901535034 }, { "entropy": 9.058197975158691, "epoch": 0.37126754993078903, "mean_token_accuracy": 0.7084870934486389, "num_tokens": 19544029.0, "step": 3755, "train/ce_loss": 1.0240709781646729 }, { "epoch": 0.37126754993078903, "step": 3755, "train/sim_loss": 0.05078125 }, { "epoch": 0.37126754993078903, "step": 3755, "train/total_loss": 0.15318834781646729 }, { "entropy": 9.473255157470703, "epoch": 0.3713664227803045, "mean_token_accuracy": 0.7191780805587769, "num_tokens": 19549017.0, "step": 3756, "train/ce_loss": 1.2590043544769287 }, { "epoch": 0.3713664227803045, "step": 3756, "train/sim_loss": 0.0625 }, { "epoch": 0.3713664227803045, "step": 3756, "train/total_loss": 0.18840043246746063 }, { "entropy": 9.128877639770508, "epoch": 0.37146529562982006, "mean_token_accuracy": 0.7311139702796936, "num_tokens": 19554241.0, "step": 3757, "train/ce_loss": 0.7337197661399841 }, { "epoch": 0.37146529562982006, "step": 3757, "train/sim_loss": 0.05859375 }, { "epoch": 0.37146529562982006, "step": 3757, "train/total_loss": 0.1319657266139984 }, { "entropy": 9.343884468078613, "epoch": 0.3715641684793356, "mean_token_accuracy": 0.7074722051620483, "num_tokens": 19559295.0, "step": 3758, "train/ce_loss": 1.0605524778366089 }, { "epoch": 0.3715641684793356, "step": 3758, "train/sim_loss": 0.08203125 }, { "epoch": 0.3715641684793356, "step": 3758, "train/total_loss": 0.18808650970458984 }, { "entropy": 9.300300598144531, "epoch": 0.3716630413288511, "mean_token_accuracy": 0.7159686088562012, "num_tokens": 19564515.0, "step": 3759, "train/ce_loss": 2.6648469884094084e-06 }, { "epoch": 0.3716630413288511, "step": 3759, "train/sim_loss": 0.0390625 }, { "epoch": 0.3716630413288511, "step": 3759, "train/total_loss": 0.03906276822090149 }, { "epoch": 0.3717619141783666, "grad_norm": 0.9004268646240234, "learning_rate": 9.073085101122485e-06, "loss": 0.1443, "step": 3760 }, { "entropy": 9.299562454223633, "epoch": 0.3717619141783666, "mean_token_accuracy": 0.7503268122673035, "num_tokens": 19569737.0, "step": 3760, "train/ce_loss": 1.3224796056747437 }, { "epoch": 0.3717619141783666, "step": 3760, "train/sim_loss": 0.0703125 }, { "epoch": 0.3717619141783666, "step": 3760, "train/total_loss": 0.20256046950817108 }, { "entropy": 8.562389373779297, "epoch": 0.37186078702788217, "mean_token_accuracy": 0.732083797454834, "num_tokens": 19575171.0, "step": 3761, "train/ce_loss": 0.6876528263092041 }, { "epoch": 0.37186078702788217, "step": 3761, "train/sim_loss": 0.015625 }, { "epoch": 0.37186078702788217, "step": 3761, "train/total_loss": 0.08439028263092041 }, { "entropy": 9.407758712768555, "epoch": 0.37195965987739765, "mean_token_accuracy": 0.7322946190834045, "num_tokens": 19580292.0, "step": 3762, "train/ce_loss": 0.7906780242919922 }, { "epoch": 0.37195965987739765, "step": 3762, "train/sim_loss": 0.01171875 }, { "epoch": 0.37195965987739765, "step": 3762, "train/total_loss": 0.09078655391931534 }, { "entropy": 9.819665908813477, "epoch": 0.3720585327269132, "mean_token_accuracy": 0.7414187788963318, "num_tokens": 19585112.0, "step": 3763, "train/ce_loss": 1.981115698814392 }, { "epoch": 0.3720585327269132, "step": 3763, "train/sim_loss": 0.07421875 }, { "epoch": 0.3720585327269132, "step": 3763, "train/total_loss": 0.2723303437232971 }, { "entropy": 8.616861343383789, "epoch": 0.37215740557642873, "mean_token_accuracy": 0.726685106754303, "num_tokens": 19590774.0, "step": 3764, "train/ce_loss": 1.3146535158157349 }, { "epoch": 0.37215740557642873, "step": 3764, "train/sim_loss": 0.06640625 }, { "epoch": 0.37215740557642873, "step": 3764, "train/total_loss": 0.1978716105222702 }, { "entropy": 9.059759140014648, "epoch": 0.3722562784259442, "mean_token_accuracy": 0.7260115742683411, "num_tokens": 19596094.0, "step": 3765, "train/ce_loss": 0.7476335167884827 }, { "epoch": 0.3722562784259442, "step": 3765, "train/sim_loss": 0.08984375 }, { "epoch": 0.3722562784259442, "step": 3765, "train/total_loss": 0.16460710763931274 }, { "entropy": 9.937812805175781, "epoch": 0.37235515127545976, "mean_token_accuracy": 0.7248677015304565, "num_tokens": 19600862.0, "step": 3766, "train/ce_loss": 2.5392666884727078e-06 }, { "epoch": 0.37235515127545976, "step": 3766, "train/sim_loss": 0.02734375 }, { "epoch": 0.37235515127545976, "step": 3766, "train/total_loss": 0.027344003319740295 }, { "entropy": 9.089120864868164, "epoch": 0.3724540241249753, "mean_token_accuracy": 0.7475177049636841, "num_tokens": 19606050.0, "step": 3767, "train/ce_loss": 1.9539186268957565e-06 }, { "epoch": 0.3724540241249753, "step": 3767, "train/sim_loss": 0.0859375 }, { "epoch": 0.3724540241249753, "step": 3767, "train/total_loss": 0.08593769371509552 }, { "entropy": 8.93012809753418, "epoch": 0.3725528969744908, "mean_token_accuracy": 0.7232635021209717, "num_tokens": 19611421.0, "step": 3768, "train/ce_loss": 0.8828412890434265 }, { "epoch": 0.3725528969744908, "step": 3768, "train/sim_loss": 0.12109375 }, { "epoch": 0.3725528969744908, "step": 3768, "train/total_loss": 0.20937788486480713 }, { "entropy": 8.709407806396484, "epoch": 0.37265176982400633, "mean_token_accuracy": 0.7229524850845337, "num_tokens": 19616908.0, "step": 3769, "train/ce_loss": 0.7235414981842041 }, { "epoch": 0.37265176982400633, "step": 3769, "train/sim_loss": 0.05078125 }, { "epoch": 0.37265176982400633, "step": 3769, "train/total_loss": 0.12313540279865265 }, { "entropy": 8.594377517700195, "epoch": 0.37275064267352187, "mean_token_accuracy": 0.7008032202720642, "num_tokens": 19622364.0, "step": 3770, "train/ce_loss": 0.5381051898002625 }, { "epoch": 0.37275064267352187, "step": 3770, "train/sim_loss": 0.02734375 }, { "epoch": 0.37275064267352187, "step": 3770, "train/total_loss": 0.08115427196025848 }, { "entropy": 8.811583518981934, "epoch": 0.37284951552303736, "mean_token_accuracy": 0.7488636374473572, "num_tokens": 19627722.0, "step": 3771, "train/ce_loss": 1.0054184198379517 }, { "epoch": 0.37284951552303736, "step": 3771, "train/sim_loss": 0.046875 }, { "epoch": 0.37284951552303736, "step": 3771, "train/total_loss": 0.1474168449640274 }, { "entropy": 8.790473937988281, "epoch": 0.3729483883725529, "mean_token_accuracy": 0.7562437653541565, "num_tokens": 19633180.0, "step": 3772, "train/ce_loss": 0.5944498181343079 }, { "epoch": 0.3729483883725529, "step": 3772, "train/sim_loss": 0.0859375 }, { "epoch": 0.3729483883725529, "step": 3772, "train/total_loss": 0.14538247883319855 }, { "entropy": 10.097342491149902, "epoch": 0.37304726122206844, "mean_token_accuracy": 0.7429718971252441, "num_tokens": 19637830.0, "step": 3773, "train/ce_loss": 7.948490747367032e-06 }, { "epoch": 0.37304726122206844, "step": 3773, "train/sim_loss": 0.046875 }, { "epoch": 0.37304726122206844, "step": 3773, "train/total_loss": 0.04687579348683357 }, { "entropy": 9.866277694702148, "epoch": 0.3731461340715839, "mean_token_accuracy": 0.7611607313156128, "num_tokens": 19642660.0, "step": 3774, "train/ce_loss": 2.121217903550132e-06 }, { "epoch": 0.3731461340715839, "step": 3774, "train/sim_loss": 0.01953125 }, { "epoch": 0.3731461340715839, "step": 3774, "train/total_loss": 0.019531462341547012 }, { "entropy": 8.923385620117188, "epoch": 0.37324500692109946, "mean_token_accuracy": 0.7015834450721741, "num_tokens": 19647911.0, "step": 3775, "train/ce_loss": 0.6671419143676758 }, { "epoch": 0.37324500692109946, "step": 3775, "train/sim_loss": 0.0859375 }, { "epoch": 0.37324500692109946, "step": 3775, "train/total_loss": 0.15265169739723206 }, { "entropy": 9.472021102905273, "epoch": 0.373343879770615, "mean_token_accuracy": 0.7642405033111572, "num_tokens": 19652943.0, "step": 3776, "train/ce_loss": 2.8804249723179964e-06 }, { "epoch": 0.373343879770615, "step": 3776, "train/sim_loss": 0.05078125 }, { "epoch": 0.373343879770615, "step": 3776, "train/total_loss": 0.05078153684735298 }, { "entropy": 8.827619552612305, "epoch": 0.3734427526201305, "mean_token_accuracy": 0.6972375512123108, "num_tokens": 19658312.0, "step": 3777, "train/ce_loss": 1.2203423976898193 }, { "epoch": 0.3734427526201305, "step": 3777, "train/sim_loss": 0.05078125 }, { "epoch": 0.3734427526201305, "step": 3777, "train/total_loss": 0.1728155016899109 }, { "entropy": 9.172332763671875, "epoch": 0.37354162546964603, "mean_token_accuracy": 0.7862694263458252, "num_tokens": 19663559.0, "step": 3778, "train/ce_loss": 0.989366888999939 }, { "epoch": 0.37354162546964603, "step": 3778, "train/sim_loss": 0.08203125 }, { "epoch": 0.37354162546964603, "step": 3778, "train/total_loss": 0.18096794188022614 }, { "entropy": 8.897441864013672, "epoch": 0.3736404983191616, "mean_token_accuracy": 0.7415485382080078, "num_tokens": 19668924.0, "step": 3779, "train/ce_loss": 0.6184482574462891 }, { "epoch": 0.3736404983191616, "step": 3779, "train/sim_loss": 0.0234375 }, { "epoch": 0.3736404983191616, "step": 3779, "train/total_loss": 0.0852823257446289 }, { "epoch": 0.37373937116867706, "grad_norm": 0.7398377060890198, "learning_rate": 9.068140236364537e-06, "loss": 0.1467, "step": 3780 }, { "entropy": 8.876144409179688, "epoch": 0.37373937116867706, "mean_token_accuracy": 0.7533936500549316, "num_tokens": 19674200.0, "step": 3780, "train/ce_loss": 0.6192217469215393 }, { "epoch": 0.37373937116867706, "step": 3780, "train/sim_loss": 0.05859375 }, { "epoch": 0.37373937116867706, "step": 3780, "train/total_loss": 0.12051592767238617 }, { "entropy": 9.752557754516602, "epoch": 0.3738382440181926, "mean_token_accuracy": 0.7612524628639221, "num_tokens": 19679089.0, "step": 3781, "train/ce_loss": 1.2465006113052368 }, { "epoch": 0.3738382440181926, "step": 3781, "train/sim_loss": 0.0546875 }, { "epoch": 0.3738382440181926, "step": 3781, "train/total_loss": 0.17933756113052368 }, { "entropy": 8.792906761169434, "epoch": 0.37393711686770814, "mean_token_accuracy": 0.734468936920166, "num_tokens": 19684562.0, "step": 3782, "train/ce_loss": 0.5887306928634644 }, { "epoch": 0.37393711686770814, "step": 3782, "train/sim_loss": 0.0234375 }, { "epoch": 0.37393711686770814, "step": 3782, "train/total_loss": 0.08231057226657867 }, { "entropy": 9.0285005569458, "epoch": 0.3740359897172236, "mean_token_accuracy": 0.7572916746139526, "num_tokens": 19689998.0, "step": 3783, "train/ce_loss": 1.0398918390274048 }, { "epoch": 0.3740359897172236, "step": 3783, "train/sim_loss": 0.0703125 }, { "epoch": 0.3740359897172236, "step": 3783, "train/total_loss": 0.17430168390274048 }, { "entropy": 8.794363021850586, "epoch": 0.37413486256673917, "mean_token_accuracy": 0.7221134901046753, "num_tokens": 19695471.0, "step": 3784, "train/ce_loss": 0.9877570271492004 }, { "epoch": 0.37413486256673917, "step": 3784, "train/sim_loss": 0.0859375 }, { "epoch": 0.37413486256673917, "step": 3784, "train/total_loss": 0.184713214635849 }, { "entropy": 9.161565780639648, "epoch": 0.3742337354162547, "mean_token_accuracy": 0.6902777552604675, "num_tokens": 19700659.0, "step": 3785, "train/ce_loss": 0.7769063115119934 }, { "epoch": 0.3742337354162547, "step": 3785, "train/sim_loss": 0.07421875 }, { "epoch": 0.3742337354162547, "step": 3785, "train/total_loss": 0.15190938115119934 }, { "entropy": 9.485732078552246, "epoch": 0.3743326082657702, "mean_token_accuracy": 0.8172231912612915, "num_tokens": 19705675.0, "step": 3786, "train/ce_loss": 3.4305571716686245e-06 }, { "epoch": 0.3743326082657702, "step": 3786, "train/sim_loss": 0.06640625 }, { "epoch": 0.3743326082657702, "step": 3786, "train/total_loss": 0.06640659272670746 }, { "entropy": 9.806770324707031, "epoch": 0.37443148111528574, "mean_token_accuracy": 0.6660447716712952, "num_tokens": 19710646.0, "step": 3787, "train/ce_loss": 1.9555909633636475 }, { "epoch": 0.37443148111528574, "step": 3787, "train/sim_loss": 0.0625 }, { "epoch": 0.37443148111528574, "step": 3787, "train/total_loss": 0.2580590844154358 }, { "entropy": 9.27357006072998, "epoch": 0.3745303539648013, "mean_token_accuracy": 0.7506361603736877, "num_tokens": 19715895.0, "step": 3788, "train/ce_loss": 0.4121975004673004 }, { "epoch": 0.3745303539648013, "step": 3788, "train/sim_loss": 0.05078125 }, { "epoch": 0.3745303539648013, "step": 3788, "train/total_loss": 0.09200100600719452 }, { "entropy": 8.91637897491455, "epoch": 0.37462922681431676, "mean_token_accuracy": 0.8262910842895508, "num_tokens": 19721253.0, "step": 3789, "train/ce_loss": 0.73322993516922 }, { "epoch": 0.37462922681431676, "step": 3789, "train/sim_loss": 0.06640625 }, { "epoch": 0.37462922681431676, "step": 3789, "train/total_loss": 0.13972924649715424 }, { "entropy": 9.351829528808594, "epoch": 0.3747280996638323, "mean_token_accuracy": 0.722520112991333, "num_tokens": 19726465.0, "step": 3790, "train/ce_loss": 1.9493904801493045e-06 }, { "epoch": 0.3747280996638323, "step": 3790, "train/sim_loss": 0.04296875 }, { "epoch": 0.3747280996638323, "step": 3790, "train/total_loss": 0.04296894371509552 }, { "entropy": 9.415502548217773, "epoch": 0.37482697251334784, "mean_token_accuracy": 0.7718120813369751, "num_tokens": 19731496.0, "step": 3791, "train/ce_loss": 0.9651876091957092 }, { "epoch": 0.37482697251334784, "step": 3791, "train/sim_loss": 0.0625 }, { "epoch": 0.37482697251334784, "step": 3791, "train/total_loss": 0.15901875495910645 }, { "entropy": 9.059098243713379, "epoch": 0.37492584536286333, "mean_token_accuracy": 0.7314578294754028, "num_tokens": 19736796.0, "step": 3792, "train/ce_loss": 0.6716099381446838 }, { "epoch": 0.37492584536286333, "step": 3792, "train/sim_loss": 0.05078125 }, { "epoch": 0.37492584536286333, "step": 3792, "train/total_loss": 0.11794224381446838 }, { "entropy": 9.588038444519043, "epoch": 0.37502471821237887, "mean_token_accuracy": 0.7354085445404053, "num_tokens": 19741696.0, "step": 3793, "train/ce_loss": 1.068302035331726 }, { "epoch": 0.37502471821237887, "step": 3793, "train/sim_loss": 0.0390625 }, { "epoch": 0.37502471821237887, "step": 3793, "train/total_loss": 0.14589270949363708 }, { "entropy": 8.875151634216309, "epoch": 0.3751235910618944, "mean_token_accuracy": 0.703201949596405, "num_tokens": 19747127.0, "step": 3794, "train/ce_loss": 0.8356339931488037 }, { "epoch": 0.3751235910618944, "step": 3794, "train/sim_loss": 0.109375 }, { "epoch": 0.3751235910618944, "step": 3794, "train/total_loss": 0.1929384022951126 }, { "entropy": 9.004408836364746, "epoch": 0.37522246391140995, "mean_token_accuracy": 0.6998950839042664, "num_tokens": 19752553.0, "step": 3795, "train/ce_loss": 0.7078590989112854 }, { "epoch": 0.37522246391140995, "step": 3795, "train/sim_loss": 0.0390625 }, { "epoch": 0.37522246391140995, "step": 3795, "train/total_loss": 0.10984840989112854 }, { "entropy": 8.763167381286621, "epoch": 0.37532133676092544, "mean_token_accuracy": 0.7606635093688965, "num_tokens": 19757833.0, "step": 3796, "train/ce_loss": 0.671595573425293 }, { "epoch": 0.37532133676092544, "step": 3796, "train/sim_loss": 0.03515625 }, { "epoch": 0.37532133676092544, "step": 3796, "train/total_loss": 0.10231580585241318 }, { "entropy": 8.933568954467773, "epoch": 0.375420209610441, "mean_token_accuracy": 0.801369845867157, "num_tokens": 19763197.0, "step": 3797, "train/ce_loss": 0.5379621386528015 }, { "epoch": 0.375420209610441, "step": 3797, "train/sim_loss": 0.05859375 }, { "epoch": 0.375420209610441, "step": 3797, "train/total_loss": 0.11238996684551239 }, { "entropy": 9.240084648132324, "epoch": 0.3755190824599565, "mean_token_accuracy": 0.7153945565223694, "num_tokens": 19768420.0, "step": 3798, "train/ce_loss": 0.882220447063446 }, { "epoch": 0.3755190824599565, "step": 3798, "train/sim_loss": 0.05078125 }, { "epoch": 0.3755190824599565, "step": 3798, "train/total_loss": 0.13900330662727356 }, { "entropy": 9.321863174438477, "epoch": 0.375617955309472, "mean_token_accuracy": 0.7720994353294373, "num_tokens": 19773583.0, "step": 3799, "train/ce_loss": 1.9414674170548096e-06 }, { "epoch": 0.375617955309472, "step": 3799, "train/sim_loss": 0.0546875 }, { "epoch": 0.375617955309472, "step": 3799, "train/total_loss": 0.05468769371509552 }, { "epoch": 0.37571682815898755, "grad_norm": 0.704355001449585, "learning_rate": 9.063195371606588e-06, "loss": 0.1405, "step": 3800 }, { "entropy": 9.065167427062988, "epoch": 0.37571682815898755, "mean_token_accuracy": 0.7735849022865295, "num_tokens": 19778870.0, "step": 3800, "train/ce_loss": 0.713546872138977 }, { "epoch": 0.37571682815898755, "step": 3800, "train/sim_loss": 0.0390625 }, { "epoch": 0.37571682815898755, "step": 3800, "train/total_loss": 0.1104171872138977 }, { "entropy": 9.010021209716797, "epoch": 0.3758157010085031, "mean_token_accuracy": 0.7234273552894592, "num_tokens": 19784226.0, "step": 3801, "train/ce_loss": 0.9238497614860535 }, { "epoch": 0.3758157010085031, "step": 3801, "train/sim_loss": 0.02734375 }, { "epoch": 0.3758157010085031, "step": 3801, "train/total_loss": 0.11972872912883759 }, { "entropy": 9.002856254577637, "epoch": 0.3759145738580186, "mean_token_accuracy": 0.7265822887420654, "num_tokens": 19789545.0, "step": 3802, "train/ce_loss": 1.0768572092056274 }, { "epoch": 0.3759145738580186, "step": 3802, "train/sim_loss": 0.1015625 }, { "epoch": 0.3759145738580186, "step": 3802, "train/total_loss": 0.20924821496009827 }, { "entropy": 9.009138107299805, "epoch": 0.3760134467075341, "mean_token_accuracy": 0.7669441103935242, "num_tokens": 19794881.0, "step": 3803, "train/ce_loss": 1.5670578479766846 }, { "epoch": 0.3760134467075341, "step": 3803, "train/sim_loss": 0.109375 }, { "epoch": 0.3760134467075341, "step": 3803, "train/total_loss": 0.2660807967185974 }, { "entropy": 9.229785919189453, "epoch": 0.37611231955704966, "mean_token_accuracy": 0.7591036558151245, "num_tokens": 19800032.0, "step": 3804, "train/ce_loss": 0.8593621850013733 }, { "epoch": 0.37611231955704966, "step": 3804, "train/sim_loss": 0.0625 }, { "epoch": 0.37611231955704966, "step": 3804, "train/total_loss": 0.14843621850013733 }, { "entropy": 9.098108291625977, "epoch": 0.37621119240656514, "mean_token_accuracy": 0.7685834765434265, "num_tokens": 19805187.0, "step": 3805, "train/ce_loss": 0.8727970719337463 }, { "epoch": 0.37621119240656514, "step": 3805, "train/sim_loss": 0.05078125 }, { "epoch": 0.37621119240656514, "step": 3805, "train/total_loss": 0.13806095719337463 }, { "entropy": 9.53138542175293, "epoch": 0.3763100652560807, "mean_token_accuracy": 0.7435455918312073, "num_tokens": 19810177.0, "step": 3806, "train/ce_loss": 2.8374862670898438 }, { "epoch": 0.3763100652560807, "step": 3806, "train/sim_loss": 0.06640625 }, { "epoch": 0.3763100652560807, "step": 3806, "train/total_loss": 0.3501548767089844 }, { "entropy": 9.33613109588623, "epoch": 0.3764089381055962, "mean_token_accuracy": 0.7612156271934509, "num_tokens": 19815315.0, "step": 3807, "train/ce_loss": 0.5548412799835205 }, { "epoch": 0.3764089381055962, "step": 3807, "train/sim_loss": 0.0546875 }, { "epoch": 0.3764089381055962, "step": 3807, "train/total_loss": 0.11017163097858429 }, { "entropy": 9.431605339050293, "epoch": 0.3765078109551117, "mean_token_accuracy": 0.7742782235145569, "num_tokens": 19820492.0, "step": 3808, "train/ce_loss": 0.8345779776573181 }, { "epoch": 0.3765078109551117, "step": 3808, "train/sim_loss": 0.01953125 }, { "epoch": 0.3765078109551117, "step": 3808, "train/total_loss": 0.10298904776573181 }, { "entropy": 8.796649932861328, "epoch": 0.37660668380462725, "mean_token_accuracy": 0.7225501537322998, "num_tokens": 19825756.0, "step": 3809, "train/ce_loss": 1.249238133430481 }, { "epoch": 0.37660668380462725, "step": 3809, "train/sim_loss": 0.10546875 }, { "epoch": 0.37660668380462725, "step": 3809, "train/total_loss": 0.23039257526397705 }, { "entropy": 9.011679649353027, "epoch": 0.3767055566541428, "mean_token_accuracy": 0.7263033390045166, "num_tokens": 19831070.0, "step": 3810, "train/ce_loss": 0.7903326153755188 }, { "epoch": 0.3767055566541428, "step": 3810, "train/sim_loss": 0.04296875 }, { "epoch": 0.3767055566541428, "step": 3810, "train/total_loss": 0.122002013027668 }, { "entropy": 9.251346588134766, "epoch": 0.3768044295036583, "mean_token_accuracy": 0.7066166996955872, "num_tokens": 19836485.0, "step": 3811, "train/ce_loss": 0.6743360757827759 }, { "epoch": 0.3768044295036583, "step": 3811, "train/sim_loss": 0.046875 }, { "epoch": 0.3768044295036583, "step": 3811, "train/total_loss": 0.11430861055850983 }, { "entropy": 9.102466583251953, "epoch": 0.3769033023531738, "mean_token_accuracy": 0.7443708777427673, "num_tokens": 19841875.0, "step": 3812, "train/ce_loss": 1.1818798780441284 }, { "epoch": 0.3769033023531738, "step": 3812, "train/sim_loss": 0.109375 }, { "epoch": 0.3769033023531738, "step": 3812, "train/total_loss": 0.22756299376487732 }, { "entropy": 9.400089263916016, "epoch": 0.37700217520268936, "mean_token_accuracy": 0.7442528605461121, "num_tokens": 19847000.0, "step": 3813, "train/ce_loss": 1.1718734502792358 }, { "epoch": 0.37700217520268936, "step": 3813, "train/sim_loss": 0.05078125 }, { "epoch": 0.37700217520268936, "step": 3813, "train/total_loss": 0.16796860098838806 }, { "entropy": 9.248943328857422, "epoch": 0.37710104805220485, "mean_token_accuracy": 0.7553957104682922, "num_tokens": 19852179.0, "step": 3814, "train/ce_loss": 0.893125057220459 }, { "epoch": 0.37710104805220485, "step": 3814, "train/sim_loss": 0.05859375 }, { "epoch": 0.37710104805220485, "step": 3814, "train/total_loss": 0.14790625870227814 }, { "entropy": 9.229852676391602, "epoch": 0.3771999209017204, "mean_token_accuracy": 0.8393632173538208, "num_tokens": 19857393.0, "step": 3815, "train/ce_loss": 0.6361907124519348 }, { "epoch": 0.3771999209017204, "step": 3815, "train/sim_loss": 0.0703125 }, { "epoch": 0.3771999209017204, "step": 3815, "train/total_loss": 0.13393157720565796 }, { "entropy": 9.12453556060791, "epoch": 0.37729879375123593, "mean_token_accuracy": 0.7630208134651184, "num_tokens": 19862651.0, "step": 3816, "train/ce_loss": 0.6658760905265808 }, { "epoch": 0.37729879375123593, "step": 3816, "train/sim_loss": 0.046875 }, { "epoch": 0.37729879375123593, "step": 3816, "train/total_loss": 0.11346261203289032 }, { "entropy": 8.986717224121094, "epoch": 0.3773976666007514, "mean_token_accuracy": 0.7001209259033203, "num_tokens": 19867960.0, "step": 3817, "train/ce_loss": 0.6120560169219971 }, { "epoch": 0.3773976666007514, "step": 3817, "train/sim_loss": 0.07421875 }, { "epoch": 0.3773976666007514, "step": 3817, "train/total_loss": 0.13542434573173523 }, { "entropy": 9.174938201904297, "epoch": 0.37749653945026695, "mean_token_accuracy": 0.7614678740501404, "num_tokens": 19873224.0, "step": 3818, "train/ce_loss": 0.6425613164901733 }, { "epoch": 0.37749653945026695, "step": 3818, "train/sim_loss": 0.0234375 }, { "epoch": 0.37749653945026695, "step": 3818, "train/total_loss": 0.08769363164901733 }, { "entropy": 8.792425155639648, "epoch": 0.3775954122997825, "mean_token_accuracy": 0.6988416910171509, "num_tokens": 19878472.0, "step": 3819, "train/ce_loss": 1.3050808906555176 }, { "epoch": 0.3775954122997825, "step": 3819, "train/sim_loss": 0.05078125 }, { "epoch": 0.3775954122997825, "step": 3819, "train/total_loss": 0.18128934502601624 }, { "epoch": 0.377694285149298, "grad_norm": 0.8487709760665894, "learning_rate": 9.058250506848638e-06, "loss": 0.1432, "step": 3820 }, { "entropy": 9.246591567993164, "epoch": 0.377694285149298, "mean_token_accuracy": 0.7724795937538147, "num_tokens": 19883687.0, "step": 3820, "train/ce_loss": 1.2820429801940918 }, { "epoch": 0.377694285149298, "step": 3820, "train/sim_loss": 0.09765625 }, { "epoch": 0.377694285149298, "step": 3820, "train/total_loss": 0.22586055099964142 }, { "entropy": 9.413658142089844, "epoch": 0.3777931579988135, "mean_token_accuracy": 0.7532281279563904, "num_tokens": 19888821.0, "step": 3821, "train/ce_loss": 0.7351582050323486 }, { "epoch": 0.3777931579988135, "step": 3821, "train/sim_loss": 0.0625 }, { "epoch": 0.3777931579988135, "step": 3821, "train/total_loss": 0.13601583242416382 }, { "entropy": 8.993918418884277, "epoch": 0.37789203084832906, "mean_token_accuracy": 0.7847642302513123, "num_tokens": 19894132.0, "step": 3822, "train/ce_loss": 0.4223306179046631 }, { "epoch": 0.37789203084832906, "step": 3822, "train/sim_loss": 0.0390625 }, { "epoch": 0.37789203084832906, "step": 3822, "train/total_loss": 0.08129556477069855 }, { "entropy": 9.318099021911621, "epoch": 0.37799090369784455, "mean_token_accuracy": 0.7189348936080933, "num_tokens": 19899215.0, "step": 3823, "train/ce_loss": 1.242138182533381e-06 }, { "epoch": 0.37799090369784455, "step": 3823, "train/sim_loss": 0.03125 }, { "epoch": 0.37799090369784455, "step": 3823, "train/total_loss": 0.03125012293457985 }, { "entropy": 9.346855163574219, "epoch": 0.3780897765473601, "mean_token_accuracy": 0.6827676296234131, "num_tokens": 19904443.0, "step": 3824, "train/ce_loss": 0.6489213705062866 }, { "epoch": 0.3780897765473601, "step": 3824, "train/sim_loss": 0.0546875 }, { "epoch": 0.3780897765473601, "step": 3824, "train/total_loss": 0.11957963556051254 }, { "entropy": 9.698902130126953, "epoch": 0.37818864939687563, "mean_token_accuracy": 0.7413793206214905, "num_tokens": 19909412.0, "step": 3825, "train/ce_loss": 0.7541821599006653 }, { "epoch": 0.37818864939687563, "step": 3825, "train/sim_loss": 0.03125 }, { "epoch": 0.37818864939687563, "step": 3825, "train/total_loss": 0.10666821897029877 }, { "entropy": 9.310688972473145, "epoch": 0.3782875222463911, "mean_token_accuracy": 0.737300455570221, "num_tokens": 19914583.0, "step": 3826, "train/ce_loss": 0.7176569700241089 }, { "epoch": 0.3782875222463911, "step": 3826, "train/sim_loss": 0.046875 }, { "epoch": 0.3782875222463911, "step": 3826, "train/total_loss": 0.11864069849252701 }, { "entropy": 9.054895401000977, "epoch": 0.37838639509590666, "mean_token_accuracy": 0.7268232107162476, "num_tokens": 19919852.0, "step": 3827, "train/ce_loss": 0.7547178864479065 }, { "epoch": 0.37838639509590666, "step": 3827, "train/sim_loss": 0.04296875 }, { "epoch": 0.37838639509590666, "step": 3827, "train/total_loss": 0.11844053864479065 }, { "entropy": 9.694982528686523, "epoch": 0.3784852679454222, "mean_token_accuracy": 0.7440147399902344, "num_tokens": 19924806.0, "step": 3828, "train/ce_loss": 2.6799655188369798e-06 }, { "epoch": 0.3784852679454222, "step": 3828, "train/sim_loss": 0.0625 }, { "epoch": 0.3784852679454222, "step": 3828, "train/total_loss": 0.06250026822090149 }, { "entropy": 8.97214126586914, "epoch": 0.3785841407949377, "mean_token_accuracy": 0.7381443381309509, "num_tokens": 19930262.0, "step": 3829, "train/ce_loss": 0.3500278890132904 }, { "epoch": 0.3785841407949377, "step": 3829, "train/sim_loss": 0.171875 }, { "epoch": 0.3785841407949377, "step": 3829, "train/total_loss": 0.20687779784202576 }, { "entropy": 9.158282279968262, "epoch": 0.3786830136444532, "mean_token_accuracy": 0.7079545259475708, "num_tokens": 19935623.0, "step": 3830, "train/ce_loss": 0.7968263626098633 }, { "epoch": 0.3786830136444532, "step": 3830, "train/sim_loss": 0.0546875 }, { "epoch": 0.3786830136444532, "step": 3830, "train/total_loss": 0.13437014818191528 }, { "entropy": 8.646978378295898, "epoch": 0.37878188649396877, "mean_token_accuracy": 0.7291462421417236, "num_tokens": 19941082.0, "step": 3831, "train/ce_loss": 0.9026403427124023 }, { "epoch": 0.37878188649396877, "step": 3831, "train/sim_loss": 0.05078125 }, { "epoch": 0.37878188649396877, "step": 3831, "train/total_loss": 0.14104528725147247 }, { "entropy": 9.50467300415039, "epoch": 0.37888075934348425, "mean_token_accuracy": 0.7625786066055298, "num_tokens": 19946125.0, "step": 3832, "train/ce_loss": 4.323581379139796e-06 }, { "epoch": 0.37888075934348425, "step": 3832, "train/sim_loss": 0.02734375 }, { "epoch": 0.37888075934348425, "step": 3832, "train/total_loss": 0.02734418213367462 }, { "entropy": 9.258719444274902, "epoch": 0.3789796321929998, "mean_token_accuracy": 0.7279999852180481, "num_tokens": 19951367.0, "step": 3833, "train/ce_loss": 1.208894968032837 }, { "epoch": 0.3789796321929998, "step": 3833, "train/sim_loss": 0.06640625 }, { "epoch": 0.3789796321929998, "step": 3833, "train/total_loss": 0.18729574978351593 }, { "entropy": 8.93228530883789, "epoch": 0.37907850504251533, "mean_token_accuracy": 0.6701164245605469, "num_tokens": 19956618.0, "step": 3834, "train/ce_loss": 1.5145224332809448 }, { "epoch": 0.37907850504251533, "step": 3834, "train/sim_loss": 0.0859375 }, { "epoch": 0.37907850504251533, "step": 3834, "train/total_loss": 0.23738974332809448 }, { "entropy": 8.946246147155762, "epoch": 0.3791773778920309, "mean_token_accuracy": 0.7346711158752441, "num_tokens": 19962033.0, "step": 3835, "train/ce_loss": 0.5565978288650513 }, { "epoch": 0.3791773778920309, "step": 3835, "train/sim_loss": 0.05859375 }, { "epoch": 0.3791773778920309, "step": 3835, "train/total_loss": 0.11425353586673737 }, { "entropy": 9.340824127197266, "epoch": 0.37927625074154636, "mean_token_accuracy": 0.757446825504303, "num_tokens": 19967175.0, "step": 3836, "train/ce_loss": 1.0683367252349854 }, { "epoch": 0.37927625074154636, "step": 3836, "train/sim_loss": 0.1171875 }, { "epoch": 0.37927625074154636, "step": 3836, "train/total_loss": 0.22402116656303406 }, { "entropy": 8.507728576660156, "epoch": 0.3793751235910619, "mean_token_accuracy": 0.782608687877655, "num_tokens": 19972781.0, "step": 3837, "train/ce_loss": 0.6930010914802551 }, { "epoch": 0.3793751235910619, "step": 3837, "train/sim_loss": 0.109375 }, { "epoch": 0.3793751235910619, "step": 3837, "train/total_loss": 0.17867511510849 }, { "entropy": 8.375957489013672, "epoch": 0.37947399644057744, "mean_token_accuracy": 0.7422680258750916, "num_tokens": 19978170.0, "step": 3838, "train/ce_loss": 0.8468574285507202 }, { "epoch": 0.37947399644057744, "step": 3838, "train/sim_loss": 0.05078125 }, { "epoch": 0.37947399644057744, "step": 3838, "train/total_loss": 0.13546699285507202 }, { "entropy": 9.039892196655273, "epoch": 0.37957286929009293, "mean_token_accuracy": 0.7541401386260986, "num_tokens": 19983454.0, "step": 3839, "train/ce_loss": 0.5882134437561035 }, { "epoch": 0.37957286929009293, "step": 3839, "train/sim_loss": 0.04296875 }, { "epoch": 0.37957286929009293, "step": 3839, "train/total_loss": 0.10179010033607483 }, { "epoch": 0.37967174213960847, "grad_norm": 0.7944733500480652, "learning_rate": 9.05330564209069e-06, "loss": 0.1476, "step": 3840 }, { "entropy": 9.035563468933105, "epoch": 0.37967174213960847, "mean_token_accuracy": 0.6805251836776733, "num_tokens": 19988803.0, "step": 3840, "train/ce_loss": 1.2471295595169067 }, { "epoch": 0.37967174213960847, "step": 3840, "train/sim_loss": 0.08203125 }, { "epoch": 0.37967174213960847, "step": 3840, "train/total_loss": 0.2067442089319229 }, { "entropy": 8.570268630981445, "epoch": 0.379770614989124, "mean_token_accuracy": 0.7588832378387451, "num_tokens": 19994460.0, "step": 3841, "train/ce_loss": 0.7722548246383667 }, { "epoch": 0.379770614989124, "step": 3841, "train/sim_loss": 0.02734375 }, { "epoch": 0.379770614989124, "step": 3841, "train/total_loss": 0.10456923395395279 }, { "entropy": 9.15269660949707, "epoch": 0.3798694878386395, "mean_token_accuracy": 0.7444589138031006, "num_tokens": 19999705.0, "step": 3842, "train/ce_loss": 1.028852105140686 }, { "epoch": 0.3798694878386395, "step": 3842, "train/sim_loss": 0.05859375 }, { "epoch": 0.3798694878386395, "step": 3842, "train/total_loss": 0.16147896647453308 }, { "entropy": 8.799553871154785, "epoch": 0.37996836068815504, "mean_token_accuracy": 0.8022598624229431, "num_tokens": 20005092.0, "step": 3843, "train/ce_loss": 0.8331433534622192 }, { "epoch": 0.37996836068815504, "step": 3843, "train/sim_loss": 0.06640625 }, { "epoch": 0.37996836068815504, "step": 3843, "train/total_loss": 0.14972057938575745 }, { "entropy": 9.060023307800293, "epoch": 0.3800672335376706, "mean_token_accuracy": 0.7175140976905823, "num_tokens": 20010417.0, "step": 3844, "train/ce_loss": 1.1553999185562134 }, { "epoch": 0.3800672335376706, "step": 3844, "train/sim_loss": 0.05859375 }, { "epoch": 0.3800672335376706, "step": 3844, "train/total_loss": 0.17413374781608582 }, { "entropy": 9.151344299316406, "epoch": 0.38016610638718606, "mean_token_accuracy": 0.778124988079071, "num_tokens": 20015500.0, "step": 3845, "train/ce_loss": 0.6391857266426086 }, { "epoch": 0.38016610638718606, "step": 3845, "train/sim_loss": 0.05859375 }, { "epoch": 0.38016610638718606, "step": 3845, "train/total_loss": 0.1225123256444931 }, { "entropy": 9.005912780761719, "epoch": 0.3802649792367016, "mean_token_accuracy": 0.7820343375205994, "num_tokens": 20020706.0, "step": 3846, "train/ce_loss": 0.8451929092407227 }, { "epoch": 0.3802649792367016, "step": 3846, "train/sim_loss": 0.06640625 }, { "epoch": 0.3802649792367016, "step": 3846, "train/total_loss": 0.15092554688453674 }, { "entropy": 8.730301856994629, "epoch": 0.38036385208621715, "mean_token_accuracy": 0.7464115023612976, "num_tokens": 20026203.0, "step": 3847, "train/ce_loss": 0.6203240752220154 }, { "epoch": 0.38036385208621715, "step": 3847, "train/sim_loss": 0.02734375 }, { "epoch": 0.38036385208621715, "step": 3847, "train/total_loss": 0.08937615901231766 }, { "entropy": 8.813580513000488, "epoch": 0.38046272493573263, "mean_token_accuracy": 0.7479091882705688, "num_tokens": 20031517.0, "step": 3848, "train/ce_loss": 1.0179005861282349 }, { "epoch": 0.38046272493573263, "step": 3848, "train/sim_loss": 0.0859375 }, { "epoch": 0.38046272493573263, "step": 3848, "train/total_loss": 0.18772757053375244 }, { "entropy": 8.879956245422363, "epoch": 0.3805615977852482, "mean_token_accuracy": 0.7799999713897705, "num_tokens": 20036851.0, "step": 3849, "train/ce_loss": 0.49752455949783325 }, { "epoch": 0.3805615977852482, "step": 3849, "train/sim_loss": 0.0390625 }, { "epoch": 0.3805615977852482, "step": 3849, "train/total_loss": 0.08881495893001556 }, { "entropy": 8.920181274414062, "epoch": 0.3806604706347637, "mean_token_accuracy": 0.7253446578979492, "num_tokens": 20042271.0, "step": 3850, "train/ce_loss": 0.4901108145713806 }, { "epoch": 0.3806604706347637, "step": 3850, "train/sim_loss": 0.01953125 }, { "epoch": 0.3806604706347637, "step": 3850, "train/total_loss": 0.06854233145713806 }, { "entropy": 9.092105865478516, "epoch": 0.3807593434842792, "mean_token_accuracy": 0.7521613836288452, "num_tokens": 20047454.0, "step": 3851, "train/ce_loss": 0.42295077443122864 }, { "epoch": 0.3807593434842792, "step": 3851, "train/sim_loss": 0.046875 }, { "epoch": 0.3807593434842792, "step": 3851, "train/total_loss": 0.08917008340358734 }, { "entropy": 9.74991226196289, "epoch": 0.38085821633379474, "mean_token_accuracy": 0.727078914642334, "num_tokens": 20052354.0, "step": 3852, "train/ce_loss": 5.97653524891939e-06 }, { "epoch": 0.38085821633379474, "step": 3852, "train/sim_loss": 0.09375 }, { "epoch": 0.38085821633379474, "step": 3852, "train/total_loss": 0.09375059604644775 }, { "entropy": 9.296963691711426, "epoch": 0.3809570891833103, "mean_token_accuracy": 0.7491856813430786, "num_tokens": 20057444.0, "step": 3853, "train/ce_loss": 1.38630211353302 }, { "epoch": 0.3809570891833103, "step": 3853, "train/sim_loss": 0.08203125 }, { "epoch": 0.3809570891833103, "step": 3853, "train/total_loss": 0.220661461353302 }, { "entropy": 8.965834617614746, "epoch": 0.38105596203282577, "mean_token_accuracy": 0.7508690357208252, "num_tokens": 20062746.0, "step": 3854, "train/ce_loss": 1.291054368019104 }, { "epoch": 0.38105596203282577, "step": 3854, "train/sim_loss": 0.0546875 }, { "epoch": 0.38105596203282577, "step": 3854, "train/total_loss": 0.18379293382167816 }, { "entropy": 8.771659851074219, "epoch": 0.3811548348823413, "mean_token_accuracy": 0.7436399459838867, "num_tokens": 20068285.0, "step": 3855, "train/ce_loss": 1.536427617073059 }, { "epoch": 0.3811548348823413, "step": 3855, "train/sim_loss": 0.1484375 }, { "epoch": 0.3811548348823413, "step": 3855, "train/total_loss": 0.30208027362823486 }, { "entropy": 10.270421981811523, "epoch": 0.38125370773185685, "mean_token_accuracy": 0.6927083134651184, "num_tokens": 20072909.0, "step": 3856, "train/ce_loss": 3.842646837234497 }, { "epoch": 0.38125370773185685, "step": 3856, "train/sim_loss": 0.06640625 }, { "epoch": 0.38125370773185685, "step": 3856, "train/total_loss": 0.45067092776298523 }, { "entropy": 9.520597457885742, "epoch": 0.38135258058137234, "mean_token_accuracy": 0.6839728951454163, "num_tokens": 20077756.0, "step": 3857, "train/ce_loss": 2.0288615226745605 }, { "epoch": 0.38135258058137234, "step": 3857, "train/sim_loss": 0.046875 }, { "epoch": 0.38135258058137234, "step": 3857, "train/total_loss": 0.24976114928722382 }, { "entropy": 8.86798095703125, "epoch": 0.3814514534308879, "mean_token_accuracy": 0.8033707737922668, "num_tokens": 20083112.0, "step": 3858, "train/ce_loss": 0.5570780038833618 }, { "epoch": 0.3814514534308879, "step": 3858, "train/sim_loss": 0.02734375 }, { "epoch": 0.3814514534308879, "step": 3858, "train/total_loss": 0.08305154740810394 }, { "entropy": 8.96121883392334, "epoch": 0.3815503262804034, "mean_token_accuracy": 0.7189819812774658, "num_tokens": 20088544.0, "step": 3859, "train/ce_loss": 0.33686110377311707 }, { "epoch": 0.3815503262804034, "step": 3859, "train/sim_loss": 0.05078125 }, { "epoch": 0.3815503262804034, "step": 3859, "train/total_loss": 0.08446736633777618 }, { "epoch": 0.3816491991299189, "grad_norm": 0.899243175983429, "learning_rate": 9.04836077733274e-06, "loss": 0.141, "step": 3860 }, { "entropy": 9.109146118164062, "epoch": 0.3816491991299189, "mean_token_accuracy": 0.7835820913314819, "num_tokens": 20093784.0, "step": 3860, "train/ce_loss": 0.7749420404434204 }, { "epoch": 0.3816491991299189, "step": 3860, "train/sim_loss": 0.07421875 }, { "epoch": 0.3816491991299189, "step": 3860, "train/total_loss": 0.15171295404434204 }, { "entropy": 9.232582092285156, "epoch": 0.38174807197943444, "mean_token_accuracy": 0.7138314843177795, "num_tokens": 20098878.0, "step": 3861, "train/ce_loss": 1.1303707361221313 }, { "epoch": 0.38174807197943444, "step": 3861, "train/sim_loss": 0.0625 }, { "epoch": 0.38174807197943444, "step": 3861, "train/total_loss": 0.1755370795726776 }, { "entropy": 8.834117889404297, "epoch": 0.38184694482895, "mean_token_accuracy": 0.7311111092567444, "num_tokens": 20104249.0, "step": 3862, "train/ce_loss": 0.5333002805709839 }, { "epoch": 0.38184694482895, "step": 3862, "train/sim_loss": 0.06640625 }, { "epoch": 0.38184694482895, "step": 3862, "train/total_loss": 0.11973628401756287 }, { "entropy": 9.02542495727539, "epoch": 0.38194581767846547, "mean_token_accuracy": 0.7144444584846497, "num_tokens": 20109640.0, "step": 3863, "train/ce_loss": 1.5636701583862305 }, { "epoch": 0.38194581767846547, "step": 3863, "train/sim_loss": 0.1015625 }, { "epoch": 0.38194581767846547, "step": 3863, "train/total_loss": 0.2579295039176941 }, { "entropy": 8.74275016784668, "epoch": 0.382044690527981, "mean_token_accuracy": 0.7470308542251587, "num_tokens": 20114926.0, "step": 3864, "train/ce_loss": 1.2879526615142822 }, { "epoch": 0.382044690527981, "step": 3864, "train/sim_loss": 0.0703125 }, { "epoch": 0.382044690527981, "step": 3864, "train/total_loss": 0.19910776615142822 }, { "entropy": 8.74221420288086, "epoch": 0.38214356337749655, "mean_token_accuracy": 0.7284700870513916, "num_tokens": 20120403.0, "step": 3865, "train/ce_loss": 1.0889049768447876 }, { "epoch": 0.38214356337749655, "step": 3865, "train/sim_loss": 0.109375 }, { "epoch": 0.38214356337749655, "step": 3865, "train/total_loss": 0.21826550364494324 }, { "entropy": 8.940792083740234, "epoch": 0.38224243622701204, "mean_token_accuracy": 0.745591938495636, "num_tokens": 20125626.0, "step": 3866, "train/ce_loss": 0.9817897081375122 }, { "epoch": 0.38224243622701204, "step": 3866, "train/sim_loss": 0.07421875 }, { "epoch": 0.38224243622701204, "step": 3866, "train/total_loss": 0.17239773273468018 }, { "entropy": 8.570219039916992, "epoch": 0.3823413090765276, "mean_token_accuracy": 0.6934911012649536, "num_tokens": 20130975.0, "step": 3867, "train/ce_loss": 0.5346583127975464 }, { "epoch": 0.3823413090765276, "step": 3867, "train/sim_loss": 0.05859375 }, { "epoch": 0.3823413090765276, "step": 3867, "train/total_loss": 0.1120595782995224 }, { "entropy": 9.852130889892578, "epoch": 0.3824401819260431, "mean_token_accuracy": 0.7345309257507324, "num_tokens": 20136069.0, "step": 3868, "train/ce_loss": 2.2908136543264845e-06 }, { "epoch": 0.3824401819260431, "step": 3868, "train/sim_loss": 0.0390625 }, { "epoch": 0.3824401819260431, "step": 3868, "train/total_loss": 0.039062727242708206 }, { "entropy": 8.938485145568848, "epoch": 0.3825390547755586, "mean_token_accuracy": 0.7442660331726074, "num_tokens": 20141461.0, "step": 3869, "train/ce_loss": 0.779180109500885 }, { "epoch": 0.3825390547755586, "step": 3869, "train/sim_loss": 0.078125 }, { "epoch": 0.3825390547755586, "step": 3869, "train/total_loss": 0.15604302287101746 }, { "entropy": 8.845773696899414, "epoch": 0.38263792762507415, "mean_token_accuracy": 0.7202796936035156, "num_tokens": 20146942.0, "step": 3870, "train/ce_loss": 0.7213982343673706 }, { "epoch": 0.38263792762507415, "step": 3870, "train/sim_loss": 0.0390625 }, { "epoch": 0.38263792762507415, "step": 3870, "train/total_loss": 0.11120232194662094 }, { "entropy": 9.102130889892578, "epoch": 0.3827368004745897, "mean_token_accuracy": 0.6877990365028381, "num_tokens": 20152244.0, "step": 3871, "train/ce_loss": 1.3690425157546997 }, { "epoch": 0.3827368004745897, "step": 3871, "train/sim_loss": 0.11328125 }, { "epoch": 0.3827368004745897, "step": 3871, "train/total_loss": 0.250185489654541 }, { "entropy": 9.01791000366211, "epoch": 0.3828356733241052, "mean_token_accuracy": 0.7416666746139526, "num_tokens": 20157552.0, "step": 3872, "train/ce_loss": 1.1151341199874878 }, { "epoch": 0.3828356733241052, "step": 3872, "train/sim_loss": 0.046875 }, { "epoch": 0.3828356733241052, "step": 3872, "train/total_loss": 0.1583884060382843 }, { "entropy": 9.082117080688477, "epoch": 0.3829345461736207, "mean_token_accuracy": 0.6948198080062866, "num_tokens": 20162918.0, "step": 3873, "train/ce_loss": 0.9492505192756653 }, { "epoch": 0.3829345461736207, "step": 3873, "train/sim_loss": 0.10546875 }, { "epoch": 0.3829345461736207, "step": 3873, "train/total_loss": 0.20039379596710205 }, { "entropy": 9.00233268737793, "epoch": 0.38303341902313626, "mean_token_accuracy": 0.7436241507530212, "num_tokens": 20168095.0, "step": 3874, "train/ce_loss": 0.8107348680496216 }, { "epoch": 0.38303341902313626, "step": 3874, "train/sim_loss": 0.04296875 }, { "epoch": 0.38303341902313626, "step": 3874, "train/total_loss": 0.12404223531484604 }, { "entropy": 8.733619689941406, "epoch": 0.38313229187265174, "mean_token_accuracy": 0.7614269852638245, "num_tokens": 20173430.0, "step": 3875, "train/ce_loss": 0.821361780166626 }, { "epoch": 0.38313229187265174, "step": 3875, "train/sim_loss": 0.05859375 }, { "epoch": 0.38313229187265174, "step": 3875, "train/total_loss": 0.14072993397712708 }, { "entropy": 9.802820205688477, "epoch": 0.3832311647221673, "mean_token_accuracy": 0.6971279382705688, "num_tokens": 20178219.0, "step": 3876, "train/ce_loss": 5.660176157107344e-06 }, { "epoch": 0.3832311647221673, "step": 3876, "train/sim_loss": 0.0703125 }, { "epoch": 0.3832311647221673, "step": 3876, "train/total_loss": 0.07031306624412537 }, { "entropy": 10.102174758911133, "epoch": 0.3833300375716828, "mean_token_accuracy": 0.7095709443092346, "num_tokens": 20182941.0, "step": 3877, "train/ce_loss": 7.492818440368865e-06 }, { "epoch": 0.3833300375716828, "step": 3877, "train/sim_loss": 0.05859375 }, { "epoch": 0.3833300375716828, "step": 3877, "train/total_loss": 0.05859449878334999 }, { "entropy": 9.372560501098633, "epoch": 0.38342891042119837, "mean_token_accuracy": 0.7775768637657166, "num_tokens": 20187923.0, "step": 3878, "train/ce_loss": 0.7272844910621643 }, { "epoch": 0.38342891042119837, "step": 3878, "train/sim_loss": 0.1171875 }, { "epoch": 0.38342891042119837, "step": 3878, "train/total_loss": 0.1899159550666809 }, { "entropy": 8.713886260986328, "epoch": 0.38352778327071385, "mean_token_accuracy": 0.692307710647583, "num_tokens": 20193302.0, "step": 3879, "train/ce_loss": 1.1886383295059204 }, { "epoch": 0.38352778327071385, "step": 3879, "train/sim_loss": 0.1875 }, { "epoch": 0.38352778327071385, "step": 3879, "train/total_loss": 0.3063638210296631 }, { "epoch": 0.3836266561202294, "grad_norm": 1.0428014993667603, "learning_rate": 9.043415912574793e-06, "loss": 0.1569, "step": 3880 }, { "entropy": 9.826498031616211, "epoch": 0.3836266561202294, "mean_token_accuracy": 0.7492063641548157, "num_tokens": 20198195.0, "step": 3880, "train/ce_loss": 2.52473219006788e-05 }, { "epoch": 0.3836266561202294, "step": 3880, "train/sim_loss": 0.046875 }, { "epoch": 0.3836266561202294, "step": 3880, "train/total_loss": 0.04687752574682236 }, { "entropy": 8.948516845703125, "epoch": 0.38372552896974493, "mean_token_accuracy": 0.7144607901573181, "num_tokens": 20203491.0, "step": 3881, "train/ce_loss": 1.069422960281372 }, { "epoch": 0.38372552896974493, "step": 3881, "train/sim_loss": 0.0703125 }, { "epoch": 0.38372552896974493, "step": 3881, "train/total_loss": 0.1772547960281372 }, { "entropy": 9.142729759216309, "epoch": 0.3838244018192604, "mean_token_accuracy": 0.8066860437393188, "num_tokens": 20208652.0, "step": 3882, "train/ce_loss": 2.052043100775336e-06 }, { "epoch": 0.3838244018192604, "step": 3882, "train/sim_loss": 0.07421875 }, { "epoch": 0.3838244018192604, "step": 3882, "train/total_loss": 0.07421895861625671 }, { "entropy": 8.838113784790039, "epoch": 0.38392327466877596, "mean_token_accuracy": 0.7088477611541748, "num_tokens": 20214129.0, "step": 3883, "train/ce_loss": 0.6217888593673706 }, { "epoch": 0.38392327466877596, "step": 3883, "train/sim_loss": 0.08984375 }, { "epoch": 0.38392327466877596, "step": 3883, "train/total_loss": 0.15202262997627258 }, { "entropy": 8.645570755004883, "epoch": 0.3840221475182915, "mean_token_accuracy": 0.7366212010383606, "num_tokens": 20219603.0, "step": 3884, "train/ce_loss": 0.6932132840156555 }, { "epoch": 0.3840221475182915, "step": 3884, "train/sim_loss": 0.01953125 }, { "epoch": 0.3840221475182915, "step": 3884, "train/total_loss": 0.08885257691144943 }, { "entropy": 8.908561706542969, "epoch": 0.384121020367807, "mean_token_accuracy": 0.7578378319740295, "num_tokens": 20225007.0, "step": 3885, "train/ce_loss": 0.4770452082157135 }, { "epoch": 0.384121020367807, "step": 3885, "train/sim_loss": 0.0234375 }, { "epoch": 0.384121020367807, "step": 3885, "train/total_loss": 0.07114201784133911 }, { "entropy": 9.26992130279541, "epoch": 0.38421989321732253, "mean_token_accuracy": 0.7266010046005249, "num_tokens": 20230242.0, "step": 3886, "train/ce_loss": 2.017613724092371e-06 }, { "epoch": 0.38421989321732253, "step": 3886, "train/sim_loss": 0.0234375 }, { "epoch": 0.38421989321732253, "step": 3886, "train/total_loss": 0.023437701165676117 }, { "entropy": 8.833425521850586, "epoch": 0.38431876606683807, "mean_token_accuracy": 0.7277969717979431, "num_tokens": 20235624.0, "step": 3887, "train/ce_loss": 0.8311039805412292 }, { "epoch": 0.38431876606683807, "step": 3887, "train/sim_loss": 0.06640625 }, { "epoch": 0.38431876606683807, "step": 3887, "train/total_loss": 0.14951664209365845 }, { "entropy": 9.60293197631836, "epoch": 0.38441763891635355, "mean_token_accuracy": 0.7222222089767456, "num_tokens": 20240654.0, "step": 3888, "train/ce_loss": 1.3117893104208633e-05 }, { "epoch": 0.38441763891635355, "step": 3888, "train/sim_loss": 0.07421875 }, { "epoch": 0.38441763891635355, "step": 3888, "train/total_loss": 0.07422006130218506 }, { "entropy": 9.436559677124023, "epoch": 0.3845165117658691, "mean_token_accuracy": 0.7728285193443298, "num_tokens": 20245585.0, "step": 3889, "train/ce_loss": 0.940013587474823 }, { "epoch": 0.3845165117658691, "step": 3889, "train/sim_loss": 0.0625 }, { "epoch": 0.3845165117658691, "step": 3889, "train/total_loss": 0.15650135278701782 }, { "entropy": 9.050984382629395, "epoch": 0.38461538461538464, "mean_token_accuracy": 0.7281213402748108, "num_tokens": 20250916.0, "step": 3890, "train/ce_loss": 0.9562475085258484 }, { "epoch": 0.38461538461538464, "step": 3890, "train/sim_loss": 0.05859375 }, { "epoch": 0.38461538461538464, "step": 3890, "train/total_loss": 0.15421849489212036 }, { "entropy": 9.18235969543457, "epoch": 0.3847142574649001, "mean_token_accuracy": 0.7667560577392578, "num_tokens": 20256155.0, "step": 3891, "train/ce_loss": 1.0980581045150757 }, { "epoch": 0.3847142574649001, "step": 3891, "train/sim_loss": 0.0703125 }, { "epoch": 0.3847142574649001, "step": 3891, "train/total_loss": 0.18011832237243652 }, { "entropy": 8.994253158569336, "epoch": 0.38481313031441566, "mean_token_accuracy": 0.7229336500167847, "num_tokens": 20261514.0, "step": 3892, "train/ce_loss": 6.4584337451378815e-06 }, { "epoch": 0.38481313031441566, "step": 3892, "train/sim_loss": 0.046875 }, { "epoch": 0.38481313031441566, "step": 3892, "train/total_loss": 0.046875644475221634 }, { "entropy": 10.120834350585938, "epoch": 0.3849120031639312, "mean_token_accuracy": 0.8395061492919922, "num_tokens": 20266243.0, "step": 3893, "train/ce_loss": 1.5382053852081299 }, { "epoch": 0.3849120031639312, "step": 3893, "train/sim_loss": 0.05078125 }, { "epoch": 0.3849120031639312, "step": 3893, "train/total_loss": 0.20460179448127747 }, { "entropy": 9.669536590576172, "epoch": 0.3850108760134467, "mean_token_accuracy": 0.7058823704719543, "num_tokens": 20271159.0, "step": 3894, "train/ce_loss": 1.501869559288025 }, { "epoch": 0.3850108760134467, "step": 3894, "train/sim_loss": 0.07421875 }, { "epoch": 0.3850108760134467, "step": 3894, "train/total_loss": 0.2244057059288025 }, { "entropy": 9.655606269836426, "epoch": 0.38510974886296223, "mean_token_accuracy": 0.7670156955718994, "num_tokens": 20275968.0, "step": 3895, "train/ce_loss": 3.86687361242366e-06 }, { "epoch": 0.38510974886296223, "step": 3895, "train/sim_loss": 0.0546875 }, { "epoch": 0.38510974886296223, "step": 3895, "train/total_loss": 0.05468788743019104 }, { "entropy": 8.863473892211914, "epoch": 0.3852086217124778, "mean_token_accuracy": 0.7544731497764587, "num_tokens": 20281499.0, "step": 3896, "train/ce_loss": 0.7062889337539673 }, { "epoch": 0.3852086217124778, "step": 3896, "train/sim_loss": 0.046875 }, { "epoch": 0.3852086217124778, "step": 3896, "train/total_loss": 0.11750389635562897 }, { "entropy": 8.718485832214355, "epoch": 0.38530749456199326, "mean_token_accuracy": 0.7341317534446716, "num_tokens": 20286796.0, "step": 3897, "train/ce_loss": 0.5802757740020752 }, { "epoch": 0.38530749456199326, "step": 3897, "train/sim_loss": 0.0234375 }, { "epoch": 0.38530749456199326, "step": 3897, "train/total_loss": 0.08146508038043976 }, { "entropy": 9.35566520690918, "epoch": 0.3854063674115088, "mean_token_accuracy": 0.7277701497077942, "num_tokens": 20291948.0, "step": 3898, "train/ce_loss": 0.38343045115470886 }, { "epoch": 0.3854063674115088, "step": 3898, "train/sim_loss": 0.0234375 }, { "epoch": 0.3854063674115088, "step": 3898, "train/total_loss": 0.061780545860528946 }, { "entropy": 9.213844299316406, "epoch": 0.38550524026102434, "mean_token_accuracy": 0.7834645509719849, "num_tokens": 20297178.0, "step": 3899, "train/ce_loss": 1.0819836854934692 }, { "epoch": 0.38550524026102434, "step": 3899, "train/sim_loss": 0.03125 }, { "epoch": 0.38550524026102434, "step": 3899, "train/total_loss": 0.1394483745098114 }, { "epoch": 0.3856041131105398, "grad_norm": 0.8254612684249878, "learning_rate": 9.038471047816842e-06, "loss": 0.1416, "step": 3900 }, { "entropy": 8.882987976074219, "epoch": 0.3856041131105398, "mean_token_accuracy": 0.7002341747283936, "num_tokens": 20302591.0, "step": 3900, "train/ce_loss": 0.5525063276290894 }, { "epoch": 0.3856041131105398, "step": 3900, "train/sim_loss": 0.0859375 }, { "epoch": 0.3856041131105398, "step": 3900, "train/total_loss": 0.1411881297826767 }, { "entropy": 9.601035118103027, "epoch": 0.38570298596005537, "mean_token_accuracy": 0.6929982304573059, "num_tokens": 20307595.0, "step": 3901, "train/ce_loss": 1.6465243101119995 }, { "epoch": 0.38570298596005537, "step": 3901, "train/sim_loss": 0.1015625 }, { "epoch": 0.38570298596005537, "step": 3901, "train/total_loss": 0.26621493697166443 }, { "entropy": 9.398444175720215, "epoch": 0.3858018588095709, "mean_token_accuracy": 0.7452531456947327, "num_tokens": 20312693.0, "step": 3902, "train/ce_loss": 1.2758805751800537 }, { "epoch": 0.3858018588095709, "step": 3902, "train/sim_loss": 0.0390625 }, { "epoch": 0.3858018588095709, "step": 3902, "train/total_loss": 0.16665056347846985 }, { "entropy": 9.34980297088623, "epoch": 0.3859007316590864, "mean_token_accuracy": 0.7067371010780334, "num_tokens": 20317920.0, "step": 3903, "train/ce_loss": 0.8974692225456238 }, { "epoch": 0.3859007316590864, "step": 3903, "train/sim_loss": 0.09765625 }, { "epoch": 0.3859007316590864, "step": 3903, "train/total_loss": 0.18740317225456238 }, { "entropy": 9.25191879272461, "epoch": 0.38599960450860193, "mean_token_accuracy": 0.8173785209655762, "num_tokens": 20323057.0, "step": 3904, "train/ce_loss": 0.8279136419296265 }, { "epoch": 0.38599960450860193, "step": 3904, "train/sim_loss": 0.046875 }, { "epoch": 0.38599960450860193, "step": 3904, "train/total_loss": 0.12966635823249817 }, { "entropy": 9.476541519165039, "epoch": 0.3860984773581175, "mean_token_accuracy": 0.7361563444137573, "num_tokens": 20328091.0, "step": 3905, "train/ce_loss": 0.7952406406402588 }, { "epoch": 0.3860984773581175, "step": 3905, "train/sim_loss": 0.0546875 }, { "epoch": 0.3860984773581175, "step": 3905, "train/total_loss": 0.13421157002449036 }, { "entropy": 9.087745666503906, "epoch": 0.38619735020763296, "mean_token_accuracy": 0.754601240158081, "num_tokens": 20333386.0, "step": 3906, "train/ce_loss": 0.7630630135536194 }, { "epoch": 0.38619735020763296, "step": 3906, "train/sim_loss": 0.0234375 }, { "epoch": 0.38619735020763296, "step": 3906, "train/total_loss": 0.0997438058257103 }, { "entropy": 9.5733642578125, "epoch": 0.3862962230571485, "mean_token_accuracy": 0.7789473533630371, "num_tokens": 20338391.0, "step": 3907, "train/ce_loss": 4.043435637868242e-06 }, { "epoch": 0.3862962230571485, "step": 3907, "train/sim_loss": 0.046875 }, { "epoch": 0.3862962230571485, "step": 3907, "train/total_loss": 0.04687540605664253 }, { "entropy": 8.956235885620117, "epoch": 0.38639509590666404, "mean_token_accuracy": 0.7097862958908081, "num_tokens": 20343760.0, "step": 3908, "train/ce_loss": 1.3375918865203857 }, { "epoch": 0.38639509590666404, "step": 3908, "train/sim_loss": 0.1796875 }, { "epoch": 0.38639509590666404, "step": 3908, "train/total_loss": 0.31344670057296753 }, { "entropy": 9.310017585754395, "epoch": 0.38649396875617953, "mean_token_accuracy": 0.7263157963752747, "num_tokens": 20348858.0, "step": 3909, "train/ce_loss": 1.3528118133544922 }, { "epoch": 0.38649396875617953, "step": 3909, "train/sim_loss": 0.11328125 }, { "epoch": 0.38649396875617953, "step": 3909, "train/total_loss": 0.24856244027614594 }, { "entropy": 9.043222427368164, "epoch": 0.38659284160569507, "mean_token_accuracy": 0.7064934968948364, "num_tokens": 20354094.0, "step": 3910, "train/ce_loss": 0.9024978280067444 }, { "epoch": 0.38659284160569507, "step": 3910, "train/sim_loss": 0.05859375 }, { "epoch": 0.38659284160569507, "step": 3910, "train/total_loss": 0.14884352684020996 }, { "entropy": 8.958993911743164, "epoch": 0.3866917144552106, "mean_token_accuracy": 0.7493887543678284, "num_tokens": 20359382.0, "step": 3911, "train/ce_loss": 0.911043107509613 }, { "epoch": 0.3866917144552106, "step": 3911, "train/sim_loss": 0.0703125 }, { "epoch": 0.3866917144552106, "step": 3911, "train/total_loss": 0.16141681373119354 }, { "entropy": 8.577729225158691, "epoch": 0.3867905873047261, "mean_token_accuracy": 0.6969146728515625, "num_tokens": 20365011.0, "step": 3912, "train/ce_loss": 1.1239310503005981 }, { "epoch": 0.3867905873047261, "step": 3912, "train/sim_loss": 0.09375 }, { "epoch": 0.3867905873047261, "step": 3912, "train/total_loss": 0.2061431109905243 }, { "entropy": 9.092487335205078, "epoch": 0.38688946015424164, "mean_token_accuracy": 0.7576923370361328, "num_tokens": 20370270.0, "step": 3913, "train/ce_loss": 1.0013328790664673 }, { "epoch": 0.38688946015424164, "step": 3913, "train/sim_loss": 0.0390625 }, { "epoch": 0.38688946015424164, "step": 3913, "train/total_loss": 0.13919579982757568 }, { "entropy": 9.505253791809082, "epoch": 0.3869883330037572, "mean_token_accuracy": 0.6710097789764404, "num_tokens": 20375311.0, "step": 3914, "train/ce_loss": 0.8630321621894836 }, { "epoch": 0.3869883330037572, "step": 3914, "train/sim_loss": 0.03125 }, { "epoch": 0.3869883330037572, "step": 3914, "train/total_loss": 0.1175532191991806 }, { "entropy": 10.00143051147461, "epoch": 0.38708720585327266, "mean_token_accuracy": 0.7777777910232544, "num_tokens": 20380117.0, "step": 3915, "train/ce_loss": 1.1587531566619873 }, { "epoch": 0.38708720585327266, "step": 3915, "train/sim_loss": 0.02734375 }, { "epoch": 0.38708720585327266, "step": 3915, "train/total_loss": 0.14321906864643097 }, { "entropy": 9.14274787902832, "epoch": 0.3871860787027882, "mean_token_accuracy": 0.6725888252258301, "num_tokens": 20385412.0, "step": 3916, "train/ce_loss": 0.8936051726341248 }, { "epoch": 0.3871860787027882, "step": 3916, "train/sim_loss": 0.1171875 }, { "epoch": 0.3871860787027882, "step": 3916, "train/total_loss": 0.20654802024364471 }, { "entropy": 9.054058074951172, "epoch": 0.38728495155230375, "mean_token_accuracy": 0.7832929491996765, "num_tokens": 20390771.0, "step": 3917, "train/ce_loss": 0.706035315990448 }, { "epoch": 0.38728495155230375, "step": 3917, "train/sim_loss": 0.0390625 }, { "epoch": 0.38728495155230375, "step": 3917, "train/total_loss": 0.10966603457927704 }, { "entropy": 9.652740478515625, "epoch": 0.3873838244018193, "mean_token_accuracy": 0.7368420958518982, "num_tokens": 20395784.0, "step": 3918, "train/ce_loss": 1.7945903539657593 }, { "epoch": 0.3873838244018193, "step": 3918, "train/sim_loss": 0.1015625 }, { "epoch": 0.3873838244018193, "step": 3918, "train/total_loss": 0.2810215353965759 }, { "entropy": 9.720868110656738, "epoch": 0.3874826972513348, "mean_token_accuracy": 0.7803030014038086, "num_tokens": 20400784.0, "step": 3919, "train/ce_loss": 0.9807897806167603 }, { "epoch": 0.3874826972513348, "step": 3919, "train/sim_loss": 0.09375 }, { "epoch": 0.3874826972513348, "step": 3919, "train/total_loss": 0.19182898104190826 }, { "epoch": 0.3875815701008503, "grad_norm": 0.7641163468360901, "learning_rate": 9.033526183058894e-06, "loss": 0.1508, "step": 3920 }, { "entropy": 9.394466400146484, "epoch": 0.3875815701008503, "mean_token_accuracy": 0.7298049926757812, "num_tokens": 20405958.0, "step": 3920, "train/ce_loss": 2.060258150100708 }, { "epoch": 0.3875815701008503, "step": 3920, "train/sim_loss": 0.0703125 }, { "epoch": 0.3875815701008503, "step": 3920, "train/total_loss": 0.2763383388519287 }, { "entropy": 9.030954360961914, "epoch": 0.38768044295036586, "mean_token_accuracy": 0.7218543291091919, "num_tokens": 20411205.0, "step": 3921, "train/ce_loss": 1.163877248764038 }, { "epoch": 0.38768044295036586, "step": 3921, "train/sim_loss": 0.078125 }, { "epoch": 0.38768044295036586, "step": 3921, "train/total_loss": 0.1945127248764038 }, { "entropy": 9.603231430053711, "epoch": 0.38777931579988134, "mean_token_accuracy": 0.7215999960899353, "num_tokens": 20416287.0, "step": 3922, "train/ce_loss": 3.7166912534303265e-06 }, { "epoch": 0.38777931579988134, "step": 3922, "train/sim_loss": 0.0625 }, { "epoch": 0.38777931579988134, "step": 3922, "train/total_loss": 0.06250037252902985 }, { "entropy": 9.065607070922852, "epoch": 0.3878781886493969, "mean_token_accuracy": 0.7531865835189819, "num_tokens": 20421601.0, "step": 3923, "train/ce_loss": 0.4875900149345398 }, { "epoch": 0.3878781886493969, "step": 3923, "train/sim_loss": 0.08984375 }, { "epoch": 0.3878781886493969, "step": 3923, "train/total_loss": 0.13860274851322174 }, { "entropy": 9.044381141662598, "epoch": 0.3879770614989124, "mean_token_accuracy": 0.7794285416603088, "num_tokens": 20426943.0, "step": 3924, "train/ce_loss": 0.6263932585716248 }, { "epoch": 0.3879770614989124, "step": 3924, "train/sim_loss": 0.07421875 }, { "epoch": 0.3879770614989124, "step": 3924, "train/total_loss": 0.13685807585716248 }, { "entropy": 8.900918006896973, "epoch": 0.3880759343484279, "mean_token_accuracy": 0.72365802526474, "num_tokens": 20432472.0, "step": 3925, "train/ce_loss": 0.7355908751487732 }, { "epoch": 0.3880759343484279, "step": 3925, "train/sim_loss": 0.09765625 }, { "epoch": 0.3880759343484279, "step": 3925, "train/total_loss": 0.17121534049510956 }, { "entropy": 9.017250061035156, "epoch": 0.38817480719794345, "mean_token_accuracy": 0.7524971961975098, "num_tokens": 20437837.0, "step": 3926, "train/ce_loss": 0.7789159417152405 }, { "epoch": 0.38817480719794345, "step": 3926, "train/sim_loss": 0.0703125 }, { "epoch": 0.38817480719794345, "step": 3926, "train/total_loss": 0.14820408821105957 }, { "entropy": 8.924264907836914, "epoch": 0.388273680047459, "mean_token_accuracy": 0.7144444584846497, "num_tokens": 20443220.0, "step": 3927, "train/ce_loss": 1.333925724029541 }, { "epoch": 0.388273680047459, "step": 3927, "train/sim_loss": 0.08203125 }, { "epoch": 0.388273680047459, "step": 3927, "train/total_loss": 0.2154238224029541 }, { "entropy": 9.6730375289917, "epoch": 0.3883725528969745, "mean_token_accuracy": 0.7755101919174194, "num_tokens": 20448240.0, "step": 3928, "train/ce_loss": 3.7251754747558152e-06 }, { "epoch": 0.3883725528969745, "step": 3928, "train/sim_loss": 0.046875 }, { "epoch": 0.3883725528969745, "step": 3928, "train/total_loss": 0.046875372529029846 }, { "entropy": 8.855838775634766, "epoch": 0.38847142574649, "mean_token_accuracy": 0.7216721773147583, "num_tokens": 20453696.0, "step": 3929, "train/ce_loss": 1.4190943241119385 }, { "epoch": 0.38847142574649, "step": 3929, "train/sim_loss": 0.08203125 }, { "epoch": 0.38847142574649, "step": 3929, "train/total_loss": 0.2239406853914261 }, { "entropy": 9.049814224243164, "epoch": 0.38857029859600556, "mean_token_accuracy": 0.740656852722168, "num_tokens": 20459047.0, "step": 3930, "train/ce_loss": 0.8216625452041626 }, { "epoch": 0.38857029859600556, "step": 3930, "train/sim_loss": 0.078125 }, { "epoch": 0.38857029859600556, "step": 3930, "train/total_loss": 0.16029125452041626 }, { "entropy": 9.331798553466797, "epoch": 0.38866917144552104, "mean_token_accuracy": 0.7557544708251953, "num_tokens": 20464285.0, "step": 3931, "train/ce_loss": 0.8100857734680176 }, { "epoch": 0.38866917144552104, "step": 3931, "train/sim_loss": 0.03125 }, { "epoch": 0.38866917144552104, "step": 3931, "train/total_loss": 0.11225857585668564 }, { "entropy": 10.082314491271973, "epoch": 0.3887680442950366, "mean_token_accuracy": 0.7195122241973877, "num_tokens": 20469067.0, "step": 3932, "train/ce_loss": 9.16772842174396e-06 }, { "epoch": 0.3887680442950366, "step": 3932, "train/sim_loss": 0.015625 }, { "epoch": 0.3887680442950366, "step": 3932, "train/total_loss": 0.01562591642141342 }, { "entropy": 8.685100555419922, "epoch": 0.3888669171445521, "mean_token_accuracy": 0.7468030452728271, "num_tokens": 20474377.0, "step": 3933, "train/ce_loss": 0.8407461643218994 }, { "epoch": 0.3888669171445521, "step": 3933, "train/sim_loss": 0.05859375 }, { "epoch": 0.3888669171445521, "step": 3933, "train/total_loss": 0.14266836643218994 }, { "entropy": 9.024192810058594, "epoch": 0.3889657899940676, "mean_token_accuracy": 0.7377398610115051, "num_tokens": 20479779.0, "step": 3934, "train/ce_loss": 0.5858728289604187 }, { "epoch": 0.3889657899940676, "step": 3934, "train/sim_loss": 0.015625 }, { "epoch": 0.3889657899940676, "step": 3934, "train/total_loss": 0.07421228289604187 }, { "entropy": 9.15198040008545, "epoch": 0.38906466284358315, "mean_token_accuracy": 0.7543604373931885, "num_tokens": 20484968.0, "step": 3935, "train/ce_loss": 1.2744925022125244 }, { "epoch": 0.38906466284358315, "step": 3935, "train/sim_loss": 0.09375 }, { "epoch": 0.38906466284358315, "step": 3935, "train/total_loss": 0.22119925916194916 }, { "entropy": 9.710990905761719, "epoch": 0.3891635356930987, "mean_token_accuracy": 0.683501660823822, "num_tokens": 20489983.0, "step": 3936, "train/ce_loss": 1.713331937789917 }, { "epoch": 0.3891635356930987, "step": 3936, "train/sim_loss": 0.10546875 }, { "epoch": 0.3891635356930987, "step": 3936, "train/total_loss": 0.2768019437789917 }, { "entropy": 9.299409866333008, "epoch": 0.3892624085426142, "mean_token_accuracy": 0.760869562625885, "num_tokens": 20495206.0, "step": 3937, "train/ce_loss": 0.7358940839767456 }, { "epoch": 0.3892624085426142, "step": 3937, "train/sim_loss": 0.03125 }, { "epoch": 0.3892624085426142, "step": 3937, "train/total_loss": 0.10483940690755844 }, { "entropy": 9.159974098205566, "epoch": 0.3893612813921297, "mean_token_accuracy": 0.780379056930542, "num_tokens": 20500729.0, "step": 3938, "train/ce_loss": 0.6641556620597839 }, { "epoch": 0.3893612813921297, "step": 3938, "train/sim_loss": 0.01953125 }, { "epoch": 0.3893612813921297, "step": 3938, "train/total_loss": 0.08594682067632675 }, { "entropy": 9.078100204467773, "epoch": 0.38946015424164526, "mean_token_accuracy": 0.7342073917388916, "num_tokens": 20506056.0, "step": 3939, "train/ce_loss": 1.0068473815917969 }, { "epoch": 0.38946015424164526, "step": 3939, "train/sim_loss": 0.08984375 }, { "epoch": 0.38946015424164526, "step": 3939, "train/total_loss": 0.1905284821987152 }, { "epoch": 0.38955902709116075, "grad_norm": 0.7789677381515503, "learning_rate": 9.028581318300944e-06, "loss": 0.146, "step": 3940 }, { "entropy": 9.419309616088867, "epoch": 0.38955902709116075, "mean_token_accuracy": 0.7227866649627686, "num_tokens": 20511234.0, "step": 3940, "train/ce_loss": 1.1209956407546997 }, { "epoch": 0.38955902709116075, "step": 3940, "train/sim_loss": 0.05859375 }, { "epoch": 0.38955902709116075, "step": 3940, "train/total_loss": 0.1706933081150055 }, { "entropy": 9.43798828125, "epoch": 0.3896578999406763, "mean_token_accuracy": 0.7090163826942444, "num_tokens": 20516380.0, "step": 3941, "train/ce_loss": 1.5165958404541016 }, { "epoch": 0.3896578999406763, "step": 3941, "train/sim_loss": 0.1015625 }, { "epoch": 0.3896578999406763, "step": 3941, "train/total_loss": 0.25322210788726807 }, { "entropy": 9.000892639160156, "epoch": 0.38975677279019183, "mean_token_accuracy": 0.7237762212753296, "num_tokens": 20521719.0, "step": 3942, "train/ce_loss": 0.4196535050868988 }, { "epoch": 0.38975677279019183, "step": 3942, "train/sim_loss": 0.0234375 }, { "epoch": 0.38975677279019183, "step": 3942, "train/total_loss": 0.06540285050868988 }, { "entropy": 8.770895004272461, "epoch": 0.3898556456397073, "mean_token_accuracy": 0.7023809552192688, "num_tokens": 20527120.0, "step": 3943, "train/ce_loss": 0.820130467414856 }, { "epoch": 0.3898556456397073, "step": 3943, "train/sim_loss": 0.0625 }, { "epoch": 0.3898556456397073, "step": 3943, "train/total_loss": 0.14451304078102112 }, { "entropy": 9.20180606842041, "epoch": 0.38995451848922286, "mean_token_accuracy": 0.746051013469696, "num_tokens": 20532579.0, "step": 3944, "train/ce_loss": 0.7872023582458496 }, { "epoch": 0.38995451848922286, "step": 3944, "train/sim_loss": 0.0625 }, { "epoch": 0.38995451848922286, "step": 3944, "train/total_loss": 0.14122024178504944 }, { "entropy": 9.434796333312988, "epoch": 0.3900533913387384, "mean_token_accuracy": 0.7356828451156616, "num_tokens": 20537628.0, "step": 3945, "train/ce_loss": 1.1515980958938599 }, { "epoch": 0.3900533913387384, "step": 3945, "train/sim_loss": 0.0546875 }, { "epoch": 0.3900533913387384, "step": 3945, "train/total_loss": 0.169847309589386 }, { "entropy": 9.186416625976562, "epoch": 0.3901522641882539, "mean_token_accuracy": 0.7122692465782166, "num_tokens": 20542993.0, "step": 3946, "train/ce_loss": 0.8618674874305725 }, { "epoch": 0.3901522641882539, "step": 3946, "train/sim_loss": 0.046875 }, { "epoch": 0.3901522641882539, "step": 3946, "train/total_loss": 0.1330617517232895 }, { "entropy": 9.214897155761719, "epoch": 0.3902511370377694, "mean_token_accuracy": 0.7219387888908386, "num_tokens": 20548235.0, "step": 3947, "train/ce_loss": 1.2426503896713257 }, { "epoch": 0.3902511370377694, "step": 3947, "train/sim_loss": 0.09765625 }, { "epoch": 0.3902511370377694, "step": 3947, "train/total_loss": 0.22192129492759705 }, { "entropy": 10.047253608703613, "epoch": 0.39035000988728497, "mean_token_accuracy": 0.7130681872367859, "num_tokens": 20553022.0, "step": 3948, "train/ce_loss": 1.644702434539795 }, { "epoch": 0.39035000988728497, "step": 3948, "train/sim_loss": 0.05078125 }, { "epoch": 0.39035000988728497, "step": 3948, "train/total_loss": 0.21525149047374725 }, { "entropy": 9.5345458984375, "epoch": 0.39044888273680045, "mean_token_accuracy": 0.7266082167625427, "num_tokens": 20558172.0, "step": 3949, "train/ce_loss": 1.3004337549209595 }, { "epoch": 0.39044888273680045, "step": 3949, "train/sim_loss": 0.10546875 }, { "epoch": 0.39044888273680045, "step": 3949, "train/total_loss": 0.2355121225118637 }, { "entropy": 8.942769050598145, "epoch": 0.390547755586316, "mean_token_accuracy": 0.7530364394187927, "num_tokens": 20563603.0, "step": 3950, "train/ce_loss": 0.6398596167564392 }, { "epoch": 0.390547755586316, "step": 3950, "train/sim_loss": 0.046875 }, { "epoch": 0.390547755586316, "step": 3950, "train/total_loss": 0.11086096614599228 }, { "entropy": 10.712668418884277, "epoch": 0.39064662843583153, "mean_token_accuracy": 1.0, "num_tokens": 20568005.0, "step": 3951, "train/ce_loss": 0.00010620328976074234 }, { "epoch": 0.39064662843583153, "step": 3951, "train/sim_loss": 0.04296875 }, { "epoch": 0.39064662843583153, "step": 3951, "train/total_loss": 0.042979370802640915 }, { "entropy": 9.488534927368164, "epoch": 0.390745501285347, "mean_token_accuracy": 0.738095223903656, "num_tokens": 20573047.0, "step": 3952, "train/ce_loss": 0.8325533270835876 }, { "epoch": 0.390745501285347, "step": 3952, "train/sim_loss": 0.07421875 }, { "epoch": 0.390745501285347, "step": 3952, "train/total_loss": 0.157474085688591 }, { "entropy": 9.086153030395508, "epoch": 0.39084437413486256, "mean_token_accuracy": 0.6866059899330139, "num_tokens": 20578328.0, "step": 3953, "train/ce_loss": 0.8276225924491882 }, { "epoch": 0.39084437413486256, "step": 3953, "train/sim_loss": 0.078125 }, { "epoch": 0.39084437413486256, "step": 3953, "train/total_loss": 0.16088727116584778 }, { "entropy": 9.038970947265625, "epoch": 0.3909432469843781, "mean_token_accuracy": 0.7560175061225891, "num_tokens": 20583712.0, "step": 3954, "train/ce_loss": 0.6037119626998901 }, { "epoch": 0.3909432469843781, "step": 3954, "train/sim_loss": 0.01953125 }, { "epoch": 0.3909432469843781, "step": 3954, "train/total_loss": 0.07990244776010513 }, { "entropy": 9.257122039794922, "epoch": 0.3910421198338936, "mean_token_accuracy": 0.7065693140029907, "num_tokens": 20588855.0, "step": 3955, "train/ce_loss": 0.9117102026939392 }, { "epoch": 0.3910421198338936, "step": 3955, "train/sim_loss": 0.046875 }, { "epoch": 0.3910421198338936, "step": 3955, "train/total_loss": 0.1380460262298584 }, { "entropy": 9.421026229858398, "epoch": 0.3911409926834091, "mean_token_accuracy": 0.7773167490959167, "num_tokens": 20594046.0, "step": 3956, "train/ce_loss": 0.7747334241867065 }, { "epoch": 0.3911409926834091, "step": 3956, "train/sim_loss": 0.03515625 }, { "epoch": 0.3911409926834091, "step": 3956, "train/total_loss": 0.11262959241867065 }, { "entropy": 9.129478454589844, "epoch": 0.39123986553292467, "mean_token_accuracy": 0.776566743850708, "num_tokens": 20599254.0, "step": 3957, "train/ce_loss": 0.5620206594467163 }, { "epoch": 0.39123986553292467, "step": 3957, "train/sim_loss": 0.0234375 }, { "epoch": 0.39123986553292467, "step": 3957, "train/total_loss": 0.07963956892490387 }, { "entropy": 9.739778518676758, "epoch": 0.39133873838244015, "mean_token_accuracy": 0.8050000071525574, "num_tokens": 20604094.0, "step": 3958, "train/ce_loss": 1.341671109199524 }, { "epoch": 0.39133873838244015, "step": 3958, "train/sim_loss": 0.046875 }, { "epoch": 0.39133873838244015, "step": 3958, "train/total_loss": 0.1810421198606491 }, { "entropy": 9.148651123046875, "epoch": 0.3914376112319557, "mean_token_accuracy": 0.7365119457244873, "num_tokens": 20609350.0, "step": 3959, "train/ce_loss": 1.4228763580322266 }, { "epoch": 0.3914376112319557, "step": 3959, "train/sim_loss": 0.078125 }, { "epoch": 0.3914376112319557, "step": 3959, "train/total_loss": 0.22041264176368713 }, { "epoch": 0.39153648408147124, "grad_norm": 0.7743656635284424, "learning_rate": 9.023636453542997e-06, "loss": 0.1417, "step": 3960 }, { "entropy": 8.91891860961914, "epoch": 0.39153648408147124, "mean_token_accuracy": 0.7766081690788269, "num_tokens": 20614672.0, "step": 3960, "train/ce_loss": 0.6227753162384033 }, { "epoch": 0.39153648408147124, "step": 3960, "train/sim_loss": 0.05078125 }, { "epoch": 0.39153648408147124, "step": 3960, "train/total_loss": 0.11305878311395645 }, { "entropy": 8.867339134216309, "epoch": 0.3916353569309868, "mean_token_accuracy": 0.7745591998100281, "num_tokens": 20619968.0, "step": 3961, "train/ce_loss": 0.700613260269165 }, { "epoch": 0.3916353569309868, "step": 3961, "train/sim_loss": 0.015625 }, { "epoch": 0.3916353569309868, "step": 3961, "train/total_loss": 0.0856863260269165 }, { "entropy": 9.311765670776367, "epoch": 0.39173422978050226, "mean_token_accuracy": 0.7412095665931702, "num_tokens": 20625139.0, "step": 3962, "train/ce_loss": 1.589390516281128 }, { "epoch": 0.39173422978050226, "step": 3962, "train/sim_loss": 0.0625 }, { "epoch": 0.39173422978050226, "step": 3962, "train/total_loss": 0.22143904864788055 }, { "entropy": 9.160322189331055, "epoch": 0.3918331026300178, "mean_token_accuracy": 0.7806122303009033, "num_tokens": 20630350.0, "step": 3963, "train/ce_loss": 0.8469388484954834 }, { "epoch": 0.3918331026300178, "step": 3963, "train/sim_loss": 0.0546875 }, { "epoch": 0.3918331026300178, "step": 3963, "train/total_loss": 0.13938137888908386 }, { "entropy": 9.489648818969727, "epoch": 0.39193197547953335, "mean_token_accuracy": 0.7718750238418579, "num_tokens": 20635472.0, "step": 3964, "train/ce_loss": 2.640196498759906e-06 }, { "epoch": 0.39193197547953335, "step": 3964, "train/sim_loss": 0.0234375 }, { "epoch": 0.39193197547953335, "step": 3964, "train/total_loss": 0.02343776449561119 }, { "entropy": 9.338415145874023, "epoch": 0.39203084832904883, "mean_token_accuracy": 0.7424483299255371, "num_tokens": 20640570.0, "step": 3965, "train/ce_loss": 0.6520880460739136 }, { "epoch": 0.39203084832904883, "step": 3965, "train/sim_loss": 0.0625 }, { "epoch": 0.39203084832904883, "step": 3965, "train/total_loss": 0.1277088075876236 }, { "entropy": 9.578125, "epoch": 0.39212972117856437, "mean_token_accuracy": 0.7858347296714783, "num_tokens": 20645601.0, "step": 3966, "train/ce_loss": 1.149623155593872 }, { "epoch": 0.39212972117856437, "step": 3966, "train/sim_loss": 0.015625 }, { "epoch": 0.39212972117856437, "step": 3966, "train/total_loss": 0.13058730959892273 }, { "entropy": 9.139913558959961, "epoch": 0.3922285940280799, "mean_token_accuracy": 0.7357051968574524, "num_tokens": 20650852.0, "step": 3967, "train/ce_loss": 0.43606361746788025 }, { "epoch": 0.3922285940280799, "step": 3967, "train/sim_loss": 0.02734375 }, { "epoch": 0.3922285940280799, "step": 3967, "train/total_loss": 0.07095011323690414 }, { "entropy": 9.946247100830078, "epoch": 0.3923274668775954, "mean_token_accuracy": 0.7689969539642334, "num_tokens": 20655632.0, "step": 3968, "train/ce_loss": 1.7134802341461182 }, { "epoch": 0.3923274668775954, "step": 3968, "train/sim_loss": 0.046875 }, { "epoch": 0.3923274668775954, "step": 3968, "train/total_loss": 0.21822302043437958 }, { "entropy": 9.227418899536133, "epoch": 0.39242633972711094, "mean_token_accuracy": 0.7559171319007874, "num_tokens": 20660746.0, "step": 3969, "train/ce_loss": 0.6671918034553528 }, { "epoch": 0.39242633972711094, "step": 3969, "train/sim_loss": 0.04296875 }, { "epoch": 0.39242633972711094, "step": 3969, "train/total_loss": 0.1096879318356514 }, { "entropy": 8.94714069366455, "epoch": 0.3925252125766265, "mean_token_accuracy": 0.7716436386108398, "num_tokens": 20666020.0, "step": 3970, "train/ce_loss": 0.8409286141395569 }, { "epoch": 0.3925252125766265, "step": 3970, "train/sim_loss": 0.05859375 }, { "epoch": 0.3925252125766265, "step": 3970, "train/total_loss": 0.1426866054534912 }, { "entropy": 8.83590030670166, "epoch": 0.39262408542614197, "mean_token_accuracy": 0.680272102355957, "num_tokens": 20671340.0, "step": 3971, "train/ce_loss": 0.7781625390052795 }, { "epoch": 0.39262408542614197, "step": 3971, "train/sim_loss": 0.0625 }, { "epoch": 0.39262408542614197, "step": 3971, "train/total_loss": 0.14031624794006348 }, { "entropy": 9.116752624511719, "epoch": 0.3927229582756575, "mean_token_accuracy": 0.7691256999969482, "num_tokens": 20676527.0, "step": 3972, "train/ce_loss": 0.5245115160942078 }, { "epoch": 0.3927229582756575, "step": 3972, "train/sim_loss": 0.03125 }, { "epoch": 0.3927229582756575, "step": 3972, "train/total_loss": 0.08370114862918854 }, { "entropy": 9.668756484985352, "epoch": 0.39282183112517305, "mean_token_accuracy": 0.7322970628738403, "num_tokens": 20681526.0, "step": 3973, "train/ce_loss": 1.531264066696167 }, { "epoch": 0.39282183112517305, "step": 3973, "train/sim_loss": 0.06640625 }, { "epoch": 0.39282183112517305, "step": 3973, "train/total_loss": 0.21953265368938446 }, { "entropy": 8.670350074768066, "epoch": 0.39292070397468853, "mean_token_accuracy": 0.6647331714630127, "num_tokens": 20686872.0, "step": 3974, "train/ce_loss": 1.3226938247680664 }, { "epoch": 0.39292070397468853, "step": 3974, "train/sim_loss": 0.05078125 }, { "epoch": 0.39292070397468853, "step": 3974, "train/total_loss": 0.18305063247680664 }, { "entropy": 9.280973434448242, "epoch": 0.3930195768242041, "mean_token_accuracy": 0.7009569406509399, "num_tokens": 20692188.0, "step": 3975, "train/ce_loss": 1.4136747121810913 }, { "epoch": 0.3930195768242041, "step": 3975, "train/sim_loss": 0.08203125 }, { "epoch": 0.3930195768242041, "step": 3975, "train/total_loss": 0.22339873015880585 }, { "entropy": 9.94870376586914, "epoch": 0.3931184496737196, "mean_token_accuracy": 0.7412399053573608, "num_tokens": 20696937.0, "step": 3976, "train/ce_loss": 1.8449875116348267 }, { "epoch": 0.3931184496737196, "step": 3976, "train/sim_loss": 0.0546875 }, { "epoch": 0.3931184496737196, "step": 3976, "train/total_loss": 0.23918625712394714 }, { "entropy": 10.018001556396484, "epoch": 0.3932173225232351, "mean_token_accuracy": 0.7804877758026123, "num_tokens": 20701772.0, "step": 3977, "train/ce_loss": 6.123746970843058e-06 }, { "epoch": 0.3932173225232351, "step": 3977, "train/sim_loss": 0.05859375 }, { "epoch": 0.3932173225232351, "step": 3977, "train/total_loss": 0.05859436094760895 }, { "entropy": 8.99118423461914, "epoch": 0.39331619537275064, "mean_token_accuracy": 0.761904776096344, "num_tokens": 20707067.0, "step": 3978, "train/ce_loss": 1.0223777294158936 }, { "epoch": 0.39331619537275064, "step": 3978, "train/sim_loss": 0.04296875 }, { "epoch": 0.39331619537275064, "step": 3978, "train/total_loss": 0.1452065259218216 }, { "entropy": 9.36185073852539, "epoch": 0.3934150682222662, "mean_token_accuracy": 0.7319728136062622, "num_tokens": 20712434.0, "step": 3979, "train/ce_loss": 0.9951471090316772 }, { "epoch": 0.3934150682222662, "step": 3979, "train/sim_loss": 0.06640625 }, { "epoch": 0.3934150682222662, "step": 3979, "train/total_loss": 0.16592097282409668 }, { "epoch": 0.39351394107178167, "grad_norm": 0.7715674638748169, "learning_rate": 9.018691588785047e-06, "loss": 0.1368, "step": 3980 }, { "entropy": 8.942005157470703, "epoch": 0.39351394107178167, "mean_token_accuracy": 0.757777750492096, "num_tokens": 20717817.0, "step": 3980, "train/ce_loss": 0.8228210210800171 }, { "epoch": 0.39351394107178167, "step": 3980, "train/sim_loss": 0.09765625 }, { "epoch": 0.39351394107178167, "step": 3980, "train/total_loss": 0.17993834614753723 }, { "entropy": 9.453428268432617, "epoch": 0.3936128139212972, "mean_token_accuracy": 0.7148817777633667, "num_tokens": 20723013.0, "step": 3981, "train/ce_loss": 0.5984261631965637 }, { "epoch": 0.3936128139212972, "step": 3981, "train/sim_loss": 0.02734375 }, { "epoch": 0.3936128139212972, "step": 3981, "train/total_loss": 0.08718636631965637 }, { "entropy": 8.952157974243164, "epoch": 0.39371168677081275, "mean_token_accuracy": 0.7518796920776367, "num_tokens": 20728459.0, "step": 3982, "train/ce_loss": 0.4479600787162781 }, { "epoch": 0.39371168677081275, "step": 3982, "train/sim_loss": 0.0546875 }, { "epoch": 0.39371168677081275, "step": 3982, "train/total_loss": 0.09948350489139557 }, { "entropy": 9.203887939453125, "epoch": 0.39381055962032824, "mean_token_accuracy": 0.7377423048019409, "num_tokens": 20733804.0, "step": 3983, "train/ce_loss": 0.4607117474079132 }, { "epoch": 0.39381055962032824, "step": 3983, "train/sim_loss": 0.0625 }, { "epoch": 0.39381055962032824, "step": 3983, "train/total_loss": 0.10857117176055908 }, { "entropy": 8.882675170898438, "epoch": 0.3939094324698438, "mean_token_accuracy": 0.7382857203483582, "num_tokens": 20739260.0, "step": 3984, "train/ce_loss": 1.2117985486984253 }, { "epoch": 0.3939094324698438, "step": 3984, "train/sim_loss": 0.078125 }, { "epoch": 0.3939094324698438, "step": 3984, "train/total_loss": 0.19930484890937805 }, { "entropy": 8.734657287597656, "epoch": 0.3940083053193593, "mean_token_accuracy": 0.7133758068084717, "num_tokens": 20744666.0, "step": 3985, "train/ce_loss": 0.7336761355400085 }, { "epoch": 0.3940083053193593, "step": 3985, "train/sim_loss": 0.078125 }, { "epoch": 0.3940083053193593, "step": 3985, "train/total_loss": 0.1514926254749298 }, { "entropy": 9.270834922790527, "epoch": 0.3941071781688748, "mean_token_accuracy": 0.7160161733627319, "num_tokens": 20749855.0, "step": 3986, "train/ce_loss": 0.663804292678833 }, { "epoch": 0.3941071781688748, "step": 3986, "train/sim_loss": 0.04296875 }, { "epoch": 0.3941071781688748, "step": 3986, "train/total_loss": 0.10934918373823166 }, { "entropy": 8.473923683166504, "epoch": 0.39420605101839035, "mean_token_accuracy": 0.7654075622558594, "num_tokens": 20755331.0, "step": 3987, "train/ce_loss": 0.7835092544555664 }, { "epoch": 0.39420605101839035, "step": 3987, "train/sim_loss": 0.05859375 }, { "epoch": 0.39420605101839035, "step": 3987, "train/total_loss": 0.13694468140602112 }, { "entropy": 9.105308532714844, "epoch": 0.3943049238679059, "mean_token_accuracy": 0.6714456677436829, "num_tokens": 20760625.0, "step": 3988, "train/ce_loss": 0.9587419033050537 }, { "epoch": 0.3943049238679059, "step": 3988, "train/sim_loss": 0.046875 }, { "epoch": 0.3943049238679059, "step": 3988, "train/total_loss": 0.14274919033050537 }, { "entropy": 9.182525634765625, "epoch": 0.3944037967174214, "mean_token_accuracy": 0.7326478362083435, "num_tokens": 20765824.0, "step": 3989, "train/ce_loss": 0.9583200216293335 }, { "epoch": 0.3944037967174214, "step": 3989, "train/sim_loss": 0.08203125 }, { "epoch": 0.3944037967174214, "step": 3989, "train/total_loss": 0.1778632551431656 }, { "entropy": 9.696810722351074, "epoch": 0.3945026695669369, "mean_token_accuracy": 0.7580340504646301, "num_tokens": 20770796.0, "step": 3990, "train/ce_loss": 1.025506615638733 }, { "epoch": 0.3945026695669369, "step": 3990, "train/sim_loss": 0.078125 }, { "epoch": 0.3945026695669369, "step": 3990, "train/total_loss": 0.1806756556034088 }, { "entropy": 9.201448440551758, "epoch": 0.39460154241645246, "mean_token_accuracy": 0.7397260069847107, "num_tokens": 20775989.0, "step": 3991, "train/ce_loss": 0.9222592115402222 }, { "epoch": 0.39460154241645246, "step": 3991, "train/sim_loss": 0.04296875 }, { "epoch": 0.39460154241645246, "step": 3991, "train/total_loss": 0.13519467413425446 }, { "entropy": 8.77785587310791, "epoch": 0.39470041526596794, "mean_token_accuracy": 0.7600411772727966, "num_tokens": 20781468.0, "step": 3992, "train/ce_loss": 0.945110559463501 }, { "epoch": 0.39470041526596794, "step": 3992, "train/sim_loss": 0.1015625 }, { "epoch": 0.39470041526596794, "step": 3992, "train/total_loss": 0.19607356190681458 }, { "entropy": 9.376846313476562, "epoch": 0.3947992881154835, "mean_token_accuracy": 0.6996123790740967, "num_tokens": 20786429.0, "step": 3993, "train/ce_loss": 1.9209712743759155 }, { "epoch": 0.3947992881154835, "step": 3993, "train/sim_loss": 0.10546875 }, { "epoch": 0.3947992881154835, "step": 3993, "train/total_loss": 0.29756587743759155 }, { "entropy": 9.21438217163086, "epoch": 0.394898160964999, "mean_token_accuracy": 0.673374593257904, "num_tokens": 20791554.0, "step": 3994, "train/ce_loss": 5.2261252676544245e-06 }, { "epoch": 0.394898160964999, "step": 3994, "train/sim_loss": 0.0390625 }, { "epoch": 0.394898160964999, "step": 3994, "train/total_loss": 0.039063021540641785 }, { "entropy": 9.785654067993164, "epoch": 0.3949970338145145, "mean_token_accuracy": 0.7127882838249207, "num_tokens": 20796429.0, "step": 3995, "train/ce_loss": 1.522355079650879 }, { "epoch": 0.3949970338145145, "step": 3995, "train/sim_loss": 0.0390625 }, { "epoch": 0.3949970338145145, "step": 3995, "train/total_loss": 0.1912980079650879 }, { "entropy": 9.108304977416992, "epoch": 0.39509590666403005, "mean_token_accuracy": 0.7588739395141602, "num_tokens": 20801696.0, "step": 3996, "train/ce_loss": 0.9614579081535339 }, { "epoch": 0.39509590666403005, "step": 3996, "train/sim_loss": 0.09375 }, { "epoch": 0.39509590666403005, "step": 3996, "train/total_loss": 0.18989579379558563 }, { "entropy": 9.596675872802734, "epoch": 0.3951947795135456, "mean_token_accuracy": 0.7289562225341797, "num_tokens": 20806701.0, "step": 3997, "train/ce_loss": 1.8334210380999139e-06 }, { "epoch": 0.3951947795135456, "step": 3997, "train/sim_loss": 0.06640625 }, { "epoch": 0.3951947795135456, "step": 3997, "train/total_loss": 0.06640643626451492 }, { "entropy": 8.852028846740723, "epoch": 0.3952936523630611, "mean_token_accuracy": 0.748344361782074, "num_tokens": 20812123.0, "step": 3998, "train/ce_loss": 0.32659050822257996 }, { "epoch": 0.3952936523630611, "step": 3998, "train/sim_loss": 0.0546875 }, { "epoch": 0.3952936523630611, "step": 3998, "train/total_loss": 0.08734655380249023 }, { "entropy": 9.218839645385742, "epoch": 0.3953925252125766, "mean_token_accuracy": 0.7477477192878723, "num_tokens": 20817347.0, "step": 3999, "train/ce_loss": 0.91793292760849 }, { "epoch": 0.3953925252125766, "step": 3999, "train/sim_loss": 0.05859375 }, { "epoch": 0.3953925252125766, "step": 3999, "train/total_loss": 0.15038704872131348 }, { "epoch": 0.39549139806209216, "grad_norm": 0.7181991934776306, "learning_rate": 9.013746724027098e-06, "loss": 0.155, "step": 4000 }, { "entropy": 9.046833038330078, "epoch": 0.39549139806209216, "mean_token_accuracy": 0.7191435694694519, "num_tokens": 5203.0, "step": 4000, "train/ce_loss": 0.9499548077583313 }, { "epoch": 0.39549139806209216, "step": 4000, "train/sim_loss": 0.0703125 }, { "epoch": 0.39549139806209216, "step": 4000, "train/total_loss": 0.16530798375606537 }, { "entropy": 8.831292152404785, "epoch": 0.3955902709116077, "mean_token_accuracy": 0.7470775842666626, "num_tokens": 10644.0, "step": 4001, "train/ce_loss": 1.0550512075424194 }, { "epoch": 0.3955902709116077, "step": 4001, "train/sim_loss": 0.08984375 }, { "epoch": 0.3955902709116077, "step": 4001, "train/total_loss": 0.19534887373447418 }, { "entropy": 9.527122497558594, "epoch": 0.3956891437611232, "mean_token_accuracy": 0.7454545497894287, "num_tokens": 15582.0, "step": 4002, "train/ce_loss": 1.7859098306871601e-06 }, { "epoch": 0.3956891437611232, "step": 4002, "train/sim_loss": 0.0546875 }, { "epoch": 0.3956891437611232, "step": 4002, "train/total_loss": 0.054687678813934326 }, { "entropy": 8.842239379882812, "epoch": 0.3957880166106387, "mean_token_accuracy": 0.7266436219215393, "num_tokens": 20895.0, "step": 4003, "train/ce_loss": 0.4752326011657715 }, { "epoch": 0.3957880166106387, "step": 4003, "train/sim_loss": 0.0546875 }, { "epoch": 0.3957880166106387, "step": 4003, "train/total_loss": 0.10221076011657715 }, { "entropy": 9.415111541748047, "epoch": 0.39588688946015427, "mean_token_accuracy": 0.72398841381073, "num_tokens": 26048.0, "step": 4004, "train/ce_loss": 0.8236679434776306 }, { "epoch": 0.39588688946015427, "step": 4004, "train/sim_loss": 0.05078125 }, { "epoch": 0.39588688946015427, "step": 4004, "train/total_loss": 0.13314804434776306 }, { "entropy": 9.289140701293945, "epoch": 0.39598576230966975, "mean_token_accuracy": 0.7312775254249573, "num_tokens": 31206.0, "step": 4005, "train/ce_loss": 0.7567775249481201 }, { "epoch": 0.39598576230966975, "step": 4005, "train/sim_loss": 0.078125 }, { "epoch": 0.39598576230966975, "step": 4005, "train/total_loss": 0.153802752494812 }, { "entropy": 8.88846492767334, "epoch": 0.3960846351591853, "mean_token_accuracy": 0.74210524559021, "num_tokens": 36670.0, "step": 4006, "train/ce_loss": 0.7667742967605591 }, { "epoch": 0.3960846351591853, "step": 4006, "train/sim_loss": 0.08984375 }, { "epoch": 0.3960846351591853, "step": 4006, "train/total_loss": 0.16652119159698486 }, { "entropy": 9.517263412475586, "epoch": 0.39618350800870084, "mean_token_accuracy": 0.757328987121582, "num_tokens": 41714.0, "step": 4007, "train/ce_loss": 1.5702300970588112e-06 }, { "epoch": 0.39618350800870084, "step": 4007, "train/sim_loss": 0.046875 }, { "epoch": 0.39618350800870084, "step": 4007, "train/total_loss": 0.046875156462192535 }, { "entropy": 9.251596450805664, "epoch": 0.3962823808582163, "mean_token_accuracy": 0.7770082950592041, "num_tokens": 46944.0, "step": 4008, "train/ce_loss": 1.188407301902771 }, { "epoch": 0.3962823808582163, "step": 4008, "train/sim_loss": 0.0859375 }, { "epoch": 0.3962823808582163, "step": 4008, "train/total_loss": 0.20477822422981262 }, { "entropy": 8.82939624786377, "epoch": 0.39638125370773186, "mean_token_accuracy": 0.7306122183799744, "num_tokens": 52395.0, "step": 4009, "train/ce_loss": 0.7744306325912476 }, { "epoch": 0.39638125370773186, "step": 4009, "train/sim_loss": 0.078125 }, { "epoch": 0.39638125370773186, "step": 4009, "train/total_loss": 0.15556806325912476 }, { "entropy": 9.621936798095703, "epoch": 0.3964801265572474, "mean_token_accuracy": 0.6951026916503906, "num_tokens": 57442.0, "step": 4010, "train/ce_loss": 1.723623514175415 }, { "epoch": 0.3964801265572474, "step": 4010, "train/sim_loss": 0.13671875 }, { "epoch": 0.3964801265572474, "step": 4010, "train/total_loss": 0.309081107378006 }, { "entropy": 9.005047798156738, "epoch": 0.3965789994067629, "mean_token_accuracy": 0.7599093914031982, "num_tokens": 62814.0, "step": 4011, "train/ce_loss": 0.543965756893158 }, { "epoch": 0.3965789994067629, "step": 4011, "train/sim_loss": 0.0390625 }, { "epoch": 0.3965789994067629, "step": 4011, "train/total_loss": 0.09345907717943192 }, { "entropy": 9.017001152038574, "epoch": 0.39667787225627843, "mean_token_accuracy": 0.7417452931404114, "num_tokens": 68119.0, "step": 4012, "train/ce_loss": 0.8745249509811401 }, { "epoch": 0.39667787225627843, "step": 4012, "train/sim_loss": 0.0546875 }, { "epoch": 0.39667787225627843, "step": 4012, "train/total_loss": 0.1421400010585785 }, { "entropy": 9.530156135559082, "epoch": 0.39677674510579397, "mean_token_accuracy": 0.7612403035163879, "num_tokens": 73200.0, "step": 4013, "train/ce_loss": 1.0481228828430176 }, { "epoch": 0.39677674510579397, "step": 4013, "train/sim_loss": 0.046875 }, { "epoch": 0.39677674510579397, "step": 4013, "train/total_loss": 0.15168729424476624 }, { "entropy": 8.660186767578125, "epoch": 0.39687561795530946, "mean_token_accuracy": 0.7231578826904297, "num_tokens": 78622.0, "step": 4014, "train/ce_loss": 1.055025339126587 }, { "epoch": 0.39687561795530946, "step": 4014, "train/sim_loss": 0.04296875 }, { "epoch": 0.39687561795530946, "step": 4014, "train/total_loss": 0.14847129583358765 }, { "entropy": 9.057394027709961, "epoch": 0.396974490804825, "mean_token_accuracy": 0.7364621162414551, "num_tokens": 83914.0, "step": 4015, "train/ce_loss": 0.8721376657485962 }, { "epoch": 0.396974490804825, "step": 4015, "train/sim_loss": 0.0546875 }, { "epoch": 0.396974490804825, "step": 4015, "train/total_loss": 0.14190126955509186 }, { "entropy": 9.577649116516113, "epoch": 0.39707336365434054, "mean_token_accuracy": 0.6562032699584961, "num_tokens": 89023.0, "step": 4016, "train/ce_loss": 2.242205482616555e-06 }, { "epoch": 0.39707336365434054, "step": 4016, "train/sim_loss": 0.03515625 }, { "epoch": 0.39707336365434054, "step": 4016, "train/total_loss": 0.03515647351741791 }, { "entropy": 9.50831413269043, "epoch": 0.397172236503856, "mean_token_accuracy": 0.6845238208770752, "num_tokens": 94210.0, "step": 4017, "train/ce_loss": 1.4990187883377075 }, { "epoch": 0.397172236503856, "step": 4017, "train/sim_loss": 0.046875 }, { "epoch": 0.397172236503856, "step": 4017, "train/total_loss": 0.196776881814003 }, { "entropy": 9.12798023223877, "epoch": 0.39727110935337157, "mean_token_accuracy": 0.6982543468475342, "num_tokens": 99452.0, "step": 4018, "train/ce_loss": 1.048298716545105 }, { "epoch": 0.39727110935337157, "step": 4018, "train/sim_loss": 0.078125 }, { "epoch": 0.39727110935337157, "step": 4018, "train/total_loss": 0.18295487761497498 }, { "entropy": 8.695804595947266, "epoch": 0.3973699822028871, "mean_token_accuracy": 0.7866419553756714, "num_tokens": 105007.0, "step": 4019, "train/ce_loss": 0.5710273385047913 }, { "epoch": 0.3973699822028871, "step": 4019, "train/sim_loss": 0.1171875 }, { "epoch": 0.3973699822028871, "step": 4019, "train/total_loss": 0.1742902398109436 }, { "epoch": 0.3974688550524026, "grad_norm": 0.6420843005180359, "learning_rate": 9.00880185926915e-06, "loss": 0.1499, "step": 4020 }, { "entropy": 9.110542297363281, "epoch": 0.3974688550524026, "mean_token_accuracy": 0.738386332988739, "num_tokens": 110297.0, "step": 4020, "train/ce_loss": 0.6451484560966492 }, { "epoch": 0.3974688550524026, "step": 4020, "train/sim_loss": 0.0703125 }, { "epoch": 0.3974688550524026, "step": 4020, "train/total_loss": 0.13482734560966492 }, { "entropy": 9.310352325439453, "epoch": 0.39756772790191813, "mean_token_accuracy": 0.7536423802375793, "num_tokens": 115507.0, "step": 4021, "train/ce_loss": 0.8712981343269348 }, { "epoch": 0.39756772790191813, "step": 4021, "train/sim_loss": 0.03125 }, { "epoch": 0.39756772790191813, "step": 4021, "train/total_loss": 0.11837981641292572 }, { "entropy": 8.997873306274414, "epoch": 0.3976666007514337, "mean_token_accuracy": 0.7185821533203125, "num_tokens": 120905.0, "step": 4022, "train/ce_loss": 0.6323196291923523 }, { "epoch": 0.3976666007514337, "step": 4022, "train/sim_loss": 0.046875 }, { "epoch": 0.3976666007514337, "step": 4022, "train/total_loss": 0.11010696738958359 }, { "entropy": 8.847343444824219, "epoch": 0.39776547360094916, "mean_token_accuracy": 0.7548022866249084, "num_tokens": 126255.0, "step": 4023, "train/ce_loss": 0.6254032850265503 }, { "epoch": 0.39776547360094916, "step": 4023, "train/sim_loss": 0.08203125 }, { "epoch": 0.39776547360094916, "step": 4023, "train/total_loss": 0.14457157254219055 }, { "entropy": 9.098240852355957, "epoch": 0.3978643464504647, "mean_token_accuracy": 0.7392900586128235, "num_tokens": 131579.0, "step": 4024, "train/ce_loss": 0.5944740772247314 }, { "epoch": 0.3978643464504647, "step": 4024, "train/sim_loss": 0.0234375 }, { "epoch": 0.3978643464504647, "step": 4024, "train/total_loss": 0.08288490772247314 }, { "entropy": 8.962140083312988, "epoch": 0.39796321929998024, "mean_token_accuracy": 0.6808510422706604, "num_tokens": 136828.0, "step": 4025, "train/ce_loss": 0.6933006048202515 }, { "epoch": 0.39796321929998024, "step": 4025, "train/sim_loss": 0.078125 }, { "epoch": 0.39796321929998024, "step": 4025, "train/total_loss": 0.14745506644248962 }, { "entropy": 9.307699203491211, "epoch": 0.3980620921494957, "mean_token_accuracy": 0.6658163070678711, "num_tokens": 142239.0, "step": 4026, "train/ce_loss": 2.1176609992980957 }, { "epoch": 0.3980620921494957, "step": 4026, "train/sim_loss": 0.07421875 }, { "epoch": 0.3980620921494957, "step": 4026, "train/total_loss": 0.2859848737716675 }, { "entropy": 9.146602630615234, "epoch": 0.39816096499901127, "mean_token_accuracy": 0.7664429545402527, "num_tokens": 147485.0, "step": 4027, "train/ce_loss": 0.6791132092475891 }, { "epoch": 0.39816096499901127, "step": 4027, "train/sim_loss": 0.05859375 }, { "epoch": 0.39816096499901127, "step": 4027, "train/total_loss": 0.1265050768852234 }, { "entropy": 8.8102388381958, "epoch": 0.3982598378485268, "mean_token_accuracy": 0.7497593760490417, "num_tokens": 152976.0, "step": 4028, "train/ce_loss": 0.8654963374137878 }, { "epoch": 0.3982598378485268, "step": 4028, "train/sim_loss": 0.0859375 }, { "epoch": 0.3982598378485268, "step": 4028, "train/total_loss": 0.17248713970184326 }, { "entropy": 8.74563217163086, "epoch": 0.3983587106980423, "mean_token_accuracy": 0.6796380281448364, "num_tokens": 158604.0, "step": 4029, "train/ce_loss": 1.4658328294754028 }, { "epoch": 0.3983587106980423, "step": 4029, "train/sim_loss": 0.0703125 }, { "epoch": 0.3983587106980423, "step": 4029, "train/total_loss": 0.21689578890800476 }, { "entropy": 10.073447227478027, "epoch": 0.39845758354755784, "mean_token_accuracy": 0.7485380172729492, "num_tokens": 163393.0, "step": 4030, "train/ce_loss": 2.0892558097839355 }, { "epoch": 0.39845758354755784, "step": 4030, "train/sim_loss": 0.02734375 }, { "epoch": 0.39845758354755784, "step": 4030, "train/total_loss": 0.23626933991909027 }, { "entropy": 9.204407691955566, "epoch": 0.3985564563970734, "mean_token_accuracy": 0.710303008556366, "num_tokens": 168700.0, "step": 4031, "train/ce_loss": 1.2037955522537231 }, { "epoch": 0.3985564563970734, "step": 4031, "train/sim_loss": 0.11328125 }, { "epoch": 0.3985564563970734, "step": 4031, "train/total_loss": 0.23366081714630127 }, { "entropy": 9.018072128295898, "epoch": 0.39865532924658886, "mean_token_accuracy": 0.7207637429237366, "num_tokens": 174018.0, "step": 4032, "train/ce_loss": 0.6557134389877319 }, { "epoch": 0.39865532924658886, "step": 4032, "train/sim_loss": 0.03515625 }, { "epoch": 0.39865532924658886, "step": 4032, "train/total_loss": 0.10072759538888931 }, { "entropy": 9.474685668945312, "epoch": 0.3987542020961044, "mean_token_accuracy": 0.7649006843566895, "num_tokens": 179124.0, "step": 4033, "train/ce_loss": 1.5377615690231323 }, { "epoch": 0.3987542020961044, "step": 4033, "train/sim_loss": 0.0859375 }, { "epoch": 0.3987542020961044, "step": 4033, "train/total_loss": 0.239713653922081 }, { "entropy": 9.250163078308105, "epoch": 0.39885307494561995, "mean_token_accuracy": 0.7286713123321533, "num_tokens": 184274.0, "step": 4034, "train/ce_loss": 0.9188748598098755 }, { "epoch": 0.39885307494561995, "step": 4034, "train/sim_loss": 0.04296875 }, { "epoch": 0.39885307494561995, "step": 4034, "train/total_loss": 0.1348562389612198 }, { "entropy": 10.157920837402344, "epoch": 0.39895194779513543, "mean_token_accuracy": 0.7976539731025696, "num_tokens": 189047.0, "step": 4035, "train/ce_loss": 3.7278227864590008e-06 }, { "epoch": 0.39895194779513543, "step": 4035, "train/sim_loss": 0.0546875 }, { "epoch": 0.39895194779513543, "step": 4035, "train/total_loss": 0.054687872529029846 }, { "entropy": 8.8870849609375, "epoch": 0.39905082064465097, "mean_token_accuracy": 0.7160633206367493, "num_tokens": 194408.0, "step": 4036, "train/ce_loss": 0.7803953289985657 }, { "epoch": 0.39905082064465097, "step": 4036, "train/sim_loss": 0.07421875 }, { "epoch": 0.39905082064465097, "step": 4036, "train/total_loss": 0.1522582769393921 }, { "entropy": 8.751253128051758, "epoch": 0.3991496934941665, "mean_token_accuracy": 0.7391742467880249, "num_tokens": 199845.0, "step": 4037, "train/ce_loss": 0.8194625973701477 }, { "epoch": 0.3991496934941665, "step": 4037, "train/sim_loss": 0.046875 }, { "epoch": 0.3991496934941665, "step": 4037, "train/total_loss": 0.1288212537765503 }, { "entropy": 8.811112403869629, "epoch": 0.399248566343682, "mean_token_accuracy": 0.7383720874786377, "num_tokens": 205327.0, "step": 4038, "train/ce_loss": 0.7337889671325684 }, { "epoch": 0.399248566343682, "step": 4038, "train/sim_loss": 0.0546875 }, { "epoch": 0.399248566343682, "step": 4038, "train/total_loss": 0.12806639075279236 }, { "entropy": 9.679691314697266, "epoch": 0.39934743919319754, "mean_token_accuracy": 0.6666666865348816, "num_tokens": 210204.0, "step": 4039, "train/ce_loss": 1.6742430943850195e-06 }, { "epoch": 0.39934743919319754, "step": 4039, "train/sim_loss": 0.0234375 }, { "epoch": 0.39934743919319754, "step": 4039, "train/total_loss": 0.02343766763806343 }, { "epoch": 0.3994463120427131, "grad_norm": 0.9456307291984558, "learning_rate": 9.0038569945112e-06, "loss": 0.15, "step": 4040 }, { "entropy": 9.541059494018555, "epoch": 0.3994463120427131, "mean_token_accuracy": 0.7266054749488831, "num_tokens": 215164.0, "step": 4040, "train/ce_loss": 3.007882595062256 }, { "epoch": 0.3994463120427131, "step": 4040, "train/sim_loss": 0.10546875 }, { "epoch": 0.3994463120427131, "step": 4040, "train/total_loss": 0.4062570035457611 }, { "entropy": 9.531622886657715, "epoch": 0.39954518489222857, "mean_token_accuracy": 0.698113203048706, "num_tokens": 220296.0, "step": 4041, "train/ce_loss": 1.396550487697823e-06 }, { "epoch": 0.39954518489222857, "step": 4041, "train/sim_loss": 0.03125 }, { "epoch": 0.39954518489222857, "step": 4041, "train/total_loss": 0.03125013783574104 }, { "entropy": 9.091503143310547, "epoch": 0.3996440577417441, "mean_token_accuracy": 0.8155940771102905, "num_tokens": 225589.0, "step": 4042, "train/ce_loss": 0.5756384134292603 }, { "epoch": 0.3996440577417441, "step": 4042, "train/sim_loss": 0.0703125 }, { "epoch": 0.3996440577417441, "step": 4042, "train/total_loss": 0.12787634134292603 }, { "entropy": 8.81747817993164, "epoch": 0.39974293059125965, "mean_token_accuracy": 0.7452547550201416, "num_tokens": 231007.0, "step": 4043, "train/ce_loss": 0.8199257254600525 }, { "epoch": 0.39974293059125965, "step": 4043, "train/sim_loss": 0.05859375 }, { "epoch": 0.39974293059125965, "step": 4043, "train/total_loss": 0.14058631658554077 }, { "entropy": 9.08637523651123, "epoch": 0.3998418034407752, "mean_token_accuracy": 0.7206266522407532, "num_tokens": 236225.0, "step": 4044, "train/ce_loss": 1.1336801052093506 }, { "epoch": 0.3998418034407752, "step": 4044, "train/sim_loss": 0.03125 }, { "epoch": 0.3998418034407752, "step": 4044, "train/total_loss": 0.14461800456047058 }, { "entropy": 9.368659019470215, "epoch": 0.3999406762902907, "mean_token_accuracy": 0.7040441036224365, "num_tokens": 241239.0, "step": 4045, "train/ce_loss": 1.5637998580932617 }, { "epoch": 0.3999406762902907, "step": 4045, "train/sim_loss": 0.046875 }, { "epoch": 0.3999406762902907, "step": 4045, "train/total_loss": 0.20325498282909393 }, { "entropy": 9.326437950134277, "epoch": 0.4000395491398062, "mean_token_accuracy": 0.7401960492134094, "num_tokens": 246301.0, "step": 4046, "train/ce_loss": 0.9135624766349792 }, { "epoch": 0.4000395491398062, "step": 4046, "train/sim_loss": 0.06640625 }, { "epoch": 0.4000395491398062, "step": 4046, "train/total_loss": 0.15776249766349792 }, { "entropy": 8.795669555664062, "epoch": 0.40013842198932176, "mean_token_accuracy": 0.7771428823471069, "num_tokens": 251820.0, "step": 4047, "train/ce_loss": 0.8271172046661377 }, { "epoch": 0.40013842198932176, "step": 4047, "train/sim_loss": 0.05078125 }, { "epoch": 0.40013842198932176, "step": 4047, "train/total_loss": 0.13349297642707825 }, { "entropy": 8.837505340576172, "epoch": 0.40023729483883724, "mean_token_accuracy": 0.7815040946006775, "num_tokens": 257247.0, "step": 4048, "train/ce_loss": 0.5455641746520996 }, { "epoch": 0.40023729483883724, "step": 4048, "train/sim_loss": 0.0625 }, { "epoch": 0.40023729483883724, "step": 4048, "train/total_loss": 0.11705641448497772 }, { "entropy": 9.698233604431152, "epoch": 0.4003361676883528, "mean_token_accuracy": 0.670040488243103, "num_tokens": 262179.0, "step": 4049, "train/ce_loss": 2.601757287979126 }, { "epoch": 0.4003361676883528, "step": 4049, "train/sim_loss": 0.078125 }, { "epoch": 0.4003361676883528, "step": 4049, "train/total_loss": 0.3383007347583771 }, { "entropy": 9.253774642944336, "epoch": 0.4004350405378683, "mean_token_accuracy": 0.7510259747505188, "num_tokens": 267369.0, "step": 4050, "train/ce_loss": 0.7595701813697815 }, { "epoch": 0.4004350405378683, "step": 4050, "train/sim_loss": 0.03125 }, { "epoch": 0.4004350405378683, "step": 4050, "train/total_loss": 0.10720702260732651 }, { "entropy": 9.428849220275879, "epoch": 0.4005339133873838, "mean_token_accuracy": 0.7894002795219421, "num_tokens": 272478.0, "step": 4051, "train/ce_loss": 0.5480824112892151 }, { "epoch": 0.4005339133873838, "step": 4051, "train/sim_loss": 0.01953125 }, { "epoch": 0.4005339133873838, "step": 4051, "train/total_loss": 0.07433949410915375 }, { "entropy": 8.907392501831055, "epoch": 0.40063278623689935, "mean_token_accuracy": 0.7654054164886475, "num_tokens": 277944.0, "step": 4052, "train/ce_loss": 0.570549488067627 }, { "epoch": 0.40063278623689935, "step": 4052, "train/sim_loss": 0.04296875 }, { "epoch": 0.40063278623689935, "step": 4052, "train/total_loss": 0.10002370178699493 }, { "entropy": 8.90610408782959, "epoch": 0.4007316590864149, "mean_token_accuracy": 0.7560483813285828, "num_tokens": 283426.0, "step": 4053, "train/ce_loss": 0.9984250068664551 }, { "epoch": 0.4007316590864149, "step": 4053, "train/sim_loss": 0.07421875 }, { "epoch": 0.4007316590864149, "step": 4053, "train/total_loss": 0.17406125366687775 }, { "entropy": 9.252974510192871, "epoch": 0.4008305319359304, "mean_token_accuracy": 0.7737127542495728, "num_tokens": 288633.0, "step": 4054, "train/ce_loss": 0.7401827573776245 }, { "epoch": 0.4008305319359304, "step": 4054, "train/sim_loss": 0.0390625 }, { "epoch": 0.4008305319359304, "step": 4054, "train/total_loss": 0.11308077722787857 }, { "entropy": 9.408609390258789, "epoch": 0.4009294047854459, "mean_token_accuracy": 0.7308781743049622, "num_tokens": 293964.0, "step": 4055, "train/ce_loss": 1.6208299398422241 }, { "epoch": 0.4009294047854459, "step": 4055, "train/sim_loss": 0.0859375 }, { "epoch": 0.4009294047854459, "step": 4055, "train/total_loss": 0.2480204999446869 }, { "entropy": 9.414863586425781, "epoch": 0.40102827763496146, "mean_token_accuracy": 0.7586206793785095, "num_tokens": 298968.0, "step": 4056, "train/ce_loss": 1.4367305993800983e-06 }, { "epoch": 0.40102827763496146, "step": 4056, "train/sim_loss": 0.03125 }, { "epoch": 0.40102827763496146, "step": 4056, "train/total_loss": 0.03125014528632164 }, { "entropy": 9.196757316589355, "epoch": 0.40112715048447695, "mean_token_accuracy": 0.7459893226623535, "num_tokens": 304217.0, "step": 4057, "train/ce_loss": 0.9032488465309143 }, { "epoch": 0.40112715048447695, "step": 4057, "train/sim_loss": 0.0859375 }, { "epoch": 0.40112715048447695, "step": 4057, "train/total_loss": 0.17626237869262695 }, { "entropy": 9.397751808166504, "epoch": 0.4012260233339925, "mean_token_accuracy": 0.7250755429267883, "num_tokens": 309337.0, "step": 4058, "train/ce_loss": 1.0142661333084106 }, { "epoch": 0.4012260233339925, "step": 4058, "train/sim_loss": 0.0546875 }, { "epoch": 0.4012260233339925, "step": 4058, "train/total_loss": 0.1561141163110733 }, { "entropy": 9.22425651550293, "epoch": 0.40132489618350803, "mean_token_accuracy": 0.7647768259048462, "num_tokens": 314625.0, "step": 4059, "train/ce_loss": 0.8428784012794495 }, { "epoch": 0.40132489618350803, "step": 4059, "train/sim_loss": 0.0703125 }, { "epoch": 0.40132489618350803, "step": 4059, "train/total_loss": 0.1546003520488739 }, { "epoch": 0.4014237690330235, "grad_norm": 0.7428350448608398, "learning_rate": 8.998912129753253e-06, "loss": 0.1401, "step": 4060 }, { "entropy": 9.935905456542969, "epoch": 0.4014237690330235, "mean_token_accuracy": 0.7200000286102295, "num_tokens": 319406.0, "step": 4060, "train/ce_loss": 1.5060312747955322 }, { "epoch": 0.4014237690330235, "step": 4060, "train/sim_loss": 0.0859375 }, { "epoch": 0.4014237690330235, "step": 4060, "train/total_loss": 0.23654063045978546 }, { "entropy": 9.732525825500488, "epoch": 0.40152264188253906, "mean_token_accuracy": 0.7864583134651184, "num_tokens": 324443.0, "step": 4061, "train/ce_loss": 1.0543136596679688 }, { "epoch": 0.40152264188253906, "step": 4061, "train/sim_loss": 0.125 }, { "epoch": 0.40152264188253906, "step": 4061, "train/total_loss": 0.23043137788772583 }, { "entropy": 8.742220878601074, "epoch": 0.4016215147320546, "mean_token_accuracy": 0.7975584864616394, "num_tokens": 329879.0, "step": 4062, "train/ce_loss": 0.620628297328949 }, { "epoch": 0.4016215147320546, "step": 4062, "train/sim_loss": 0.03125 }, { "epoch": 0.4016215147320546, "step": 4062, "train/total_loss": 0.0933128297328949 }, { "entropy": 9.26221752166748, "epoch": 0.4017203875815701, "mean_token_accuracy": 0.7550251483917236, "num_tokens": 335157.0, "step": 4063, "train/ce_loss": 0.947535514831543 }, { "epoch": 0.4017203875815701, "step": 4063, "train/sim_loss": 0.0625 }, { "epoch": 0.4017203875815701, "step": 4063, "train/total_loss": 0.15725356340408325 }, { "entropy": 9.272913932800293, "epoch": 0.4018192604310856, "mean_token_accuracy": 0.7151898741722107, "num_tokens": 340404.0, "step": 4064, "train/ce_loss": 1.4104467630386353 }, { "epoch": 0.4018192604310856, "step": 4064, "train/sim_loss": 0.06640625 }, { "epoch": 0.4018192604310856, "step": 4064, "train/total_loss": 0.20745092630386353 }, { "entropy": 9.822046279907227, "epoch": 0.40191813328060116, "mean_token_accuracy": 0.7368420958518982, "num_tokens": 345124.0, "step": 4065, "train/ce_loss": 1.7242532968521118 }, { "epoch": 0.40191813328060116, "step": 4065, "train/sim_loss": 0.046875 }, { "epoch": 0.40191813328060116, "step": 4065, "train/total_loss": 0.21930032968521118 }, { "entropy": 8.932167053222656, "epoch": 0.40201700613011665, "mean_token_accuracy": 0.7044100165367126, "num_tokens": 350486.0, "step": 4066, "train/ce_loss": 0.748856782913208 }, { "epoch": 0.40201700613011665, "step": 4066, "train/sim_loss": 0.06640625 }, { "epoch": 0.40201700613011665, "step": 4066, "train/total_loss": 0.14129193127155304 }, { "entropy": 9.134580612182617, "epoch": 0.4021158789796322, "mean_token_accuracy": 0.7713178396224976, "num_tokens": 355764.0, "step": 4067, "train/ce_loss": 0.8877795934677124 }, { "epoch": 0.4021158789796322, "step": 4067, "train/sim_loss": 0.08203125 }, { "epoch": 0.4021158789796322, "step": 4067, "train/total_loss": 0.17080920934677124 }, { "entropy": 9.582756042480469, "epoch": 0.40221475182914773, "mean_token_accuracy": 0.6673228144645691, "num_tokens": 360713.0, "step": 4068, "train/ce_loss": 0.9494542479515076 }, { "epoch": 0.40221475182914773, "step": 4068, "train/sim_loss": 0.05078125 }, { "epoch": 0.40221475182914773, "step": 4068, "train/total_loss": 0.14572668075561523 }, { "entropy": 9.266700744628906, "epoch": 0.4023136246786632, "mean_token_accuracy": 0.7130434513092041, "num_tokens": 365887.0, "step": 4069, "train/ce_loss": 0.8378681540489197 }, { "epoch": 0.4023136246786632, "step": 4069, "train/sim_loss": 0.0625 }, { "epoch": 0.4023136246786632, "step": 4069, "train/total_loss": 0.14628681540489197 }, { "entropy": 9.064632415771484, "epoch": 0.40241249752817876, "mean_token_accuracy": 0.740645170211792, "num_tokens": 371197.0, "step": 4070, "train/ce_loss": 0.9820173978805542 }, { "epoch": 0.40241249752817876, "step": 4070, "train/sim_loss": 0.05859375 }, { "epoch": 0.40241249752817876, "step": 4070, "train/total_loss": 0.15679550170898438 }, { "entropy": 9.865793228149414, "epoch": 0.4025113703776943, "mean_token_accuracy": 0.7670885920524597, "num_tokens": 375991.0, "step": 4071, "train/ce_loss": 1.585090160369873 }, { "epoch": 0.4025113703776943, "step": 4071, "train/sim_loss": 0.03125 }, { "epoch": 0.4025113703776943, "step": 4071, "train/total_loss": 0.1897590160369873 }, { "entropy": 9.119791030883789, "epoch": 0.4026102432272098, "mean_token_accuracy": 0.7205387353897095, "num_tokens": 381359.0, "step": 4072, "train/ce_loss": 1.0067178010940552 }, { "epoch": 0.4026102432272098, "step": 4072, "train/sim_loss": 0.11328125 }, { "epoch": 0.4026102432272098, "step": 4072, "train/total_loss": 0.21395303308963776 }, { "entropy": 9.519779205322266, "epoch": 0.4027091160767253, "mean_token_accuracy": 0.692307710647583, "num_tokens": 386436.0, "step": 4073, "train/ce_loss": 1.344114707535482e-06 }, { "epoch": 0.4027091160767253, "step": 4073, "train/sim_loss": 0.0390625 }, { "epoch": 0.4027091160767253, "step": 4073, "train/total_loss": 0.039062634110450745 }, { "entropy": 8.639932632446289, "epoch": 0.40280798892624087, "mean_token_accuracy": 0.7018572688102722, "num_tokens": 391914.0, "step": 4074, "train/ce_loss": 1.4248647689819336 }, { "epoch": 0.40280798892624087, "step": 4074, "train/sim_loss": 0.09765625 }, { "epoch": 0.40280798892624087, "step": 4074, "train/total_loss": 0.24014273285865784 }, { "entropy": 9.420963287353516, "epoch": 0.40290686177575635, "mean_token_accuracy": 0.7201017737388611, "num_tokens": 397177.0, "step": 4075, "train/ce_loss": 1.538402557343943e-06 }, { "epoch": 0.40290686177575635, "step": 4075, "train/sim_loss": 0.0234375 }, { "epoch": 0.40290686177575635, "step": 4075, "train/total_loss": 0.023437654599547386 }, { "entropy": 8.777081489562988, "epoch": 0.4030057346252719, "mean_token_accuracy": 0.7351778745651245, "num_tokens": 402650.0, "step": 4076, "train/ce_loss": 0.6327884793281555 }, { "epoch": 0.4030057346252719, "step": 4076, "train/sim_loss": 0.03125 }, { "epoch": 0.4030057346252719, "step": 4076, "train/total_loss": 0.09452884644269943 }, { "entropy": 9.25904369354248, "epoch": 0.40310460747478744, "mean_token_accuracy": 0.7472826242446899, "num_tokens": 407881.0, "step": 4077, "train/ce_loss": 0.6763638257980347 }, { "epoch": 0.40310460747478744, "step": 4077, "train/sim_loss": 0.03515625 }, { "epoch": 0.40310460747478744, "step": 4077, "train/total_loss": 0.1027926355600357 }, { "entropy": 9.023961067199707, "epoch": 0.4032034803243029, "mean_token_accuracy": 0.723247230052948, "num_tokens": 413130.0, "step": 4078, "train/ce_loss": 1.1932096481323242 }, { "epoch": 0.4032034803243029, "step": 4078, "train/sim_loss": 0.1171875 }, { "epoch": 0.4032034803243029, "step": 4078, "train/total_loss": 0.23650845885276794 }, { "entropy": 9.099632263183594, "epoch": 0.40330235317381846, "mean_token_accuracy": 0.7875416874885559, "num_tokens": 418475.0, "step": 4079, "train/ce_loss": 0.6685423851013184 }, { "epoch": 0.40330235317381846, "step": 4079, "train/sim_loss": 0.0625 }, { "epoch": 0.40330235317381846, "step": 4079, "train/total_loss": 0.12935423851013184 }, { "epoch": 0.403401226023334, "grad_norm": 0.6031910181045532, "learning_rate": 8.993967264995303e-06, "loss": 0.1484, "step": 4080 }, { "entropy": 9.174674987792969, "epoch": 0.403401226023334, "mean_token_accuracy": 0.8140770196914673, "num_tokens": 423713.0, "step": 4080, "train/ce_loss": 0.45924171805381775 }, { "epoch": 0.403401226023334, "step": 4080, "train/sim_loss": 0.02734375 }, { "epoch": 0.403401226023334, "step": 4080, "train/total_loss": 0.07326792180538177 }, { "entropy": 8.881291389465332, "epoch": 0.4035000988728495, "mean_token_accuracy": 0.713567852973938, "num_tokens": 428986.0, "step": 4081, "train/ce_loss": 0.8884828686714172 }, { "epoch": 0.4035000988728495, "step": 4081, "train/sim_loss": 0.0546875 }, { "epoch": 0.4035000988728495, "step": 4081, "train/total_loss": 0.1435357928276062 }, { "entropy": 9.463319778442383, "epoch": 0.40359897172236503, "mean_token_accuracy": 0.749576985836029, "num_tokens": 434015.0, "step": 4082, "train/ce_loss": 0.9069718718528748 }, { "epoch": 0.40359897172236503, "step": 4082, "train/sim_loss": 0.03515625 }, { "epoch": 0.40359897172236503, "step": 4082, "train/total_loss": 0.12585344910621643 }, { "entropy": 9.164590835571289, "epoch": 0.40369784457188057, "mean_token_accuracy": 0.7601390480995178, "num_tokens": 439342.0, "step": 4083, "train/ce_loss": 0.6753251552581787 }, { "epoch": 0.40369784457188057, "step": 4083, "train/sim_loss": 0.05859375 }, { "epoch": 0.40369784457188057, "step": 4083, "train/total_loss": 0.1261262595653534 }, { "entropy": 8.680729866027832, "epoch": 0.4037967174213961, "mean_token_accuracy": 0.7681007385253906, "num_tokens": 444758.0, "step": 4084, "train/ce_loss": 0.6548745036125183 }, { "epoch": 0.4037967174213961, "step": 4084, "train/sim_loss": 0.0546875 }, { "epoch": 0.4037967174213961, "step": 4084, "train/total_loss": 0.12017495185136795 }, { "entropy": 9.053796768188477, "epoch": 0.4038955902709116, "mean_token_accuracy": 0.7254237532615662, "num_tokens": 450083.0, "step": 4085, "train/ce_loss": 0.6766396164894104 }, { "epoch": 0.4038955902709116, "step": 4085, "train/sim_loss": 0.08203125 }, { "epoch": 0.4038955902709116, "step": 4085, "train/total_loss": 0.14969521760940552 }, { "entropy": 8.84349250793457, "epoch": 0.40399446312042714, "mean_token_accuracy": 0.7057613134384155, "num_tokens": 455495.0, "step": 4086, "train/ce_loss": 1.072960376739502 }, { "epoch": 0.40399446312042714, "step": 4086, "train/sim_loss": 0.046875 }, { "epoch": 0.40399446312042714, "step": 4086, "train/total_loss": 0.15417104959487915 }, { "entropy": 9.179718017578125, "epoch": 0.4040933359699427, "mean_token_accuracy": 0.7775148153305054, "num_tokens": 460781.0, "step": 4087, "train/ce_loss": 0.4510173797607422 }, { "epoch": 0.4040933359699427, "step": 4087, "train/sim_loss": 0.0234375 }, { "epoch": 0.4040933359699427, "step": 4087, "train/total_loss": 0.06853923946619034 }, { "entropy": 9.581786155700684, "epoch": 0.40419220881945817, "mean_token_accuracy": 0.7359550595283508, "num_tokens": 465717.0, "step": 4088, "train/ce_loss": 1.939404455697513e-06 }, { "epoch": 0.40419220881945817, "step": 4088, "train/sim_loss": 0.0390625 }, { "epoch": 0.40419220881945817, "step": 4088, "train/total_loss": 0.03906269371509552 }, { "entropy": 8.916521072387695, "epoch": 0.4042910816689737, "mean_token_accuracy": 0.7421320080757141, "num_tokens": 471190.0, "step": 4089, "train/ce_loss": 0.8648796081542969 }, { "epoch": 0.4042910816689737, "step": 4089, "train/sim_loss": 0.0625 }, { "epoch": 0.4042910816689737, "step": 4089, "train/total_loss": 0.14898796379566193 }, { "entropy": 8.816831588745117, "epoch": 0.40438995451848925, "mean_token_accuracy": 0.7719836235046387, "num_tokens": 476652.0, "step": 4090, "train/ce_loss": 0.5155811905860901 }, { "epoch": 0.40438995451848925, "step": 4090, "train/sim_loss": 0.015625 }, { "epoch": 0.40438995451848925, "step": 4090, "train/total_loss": 0.06718312203884125 }, { "entropy": 8.626619338989258, "epoch": 0.40448882736800473, "mean_token_accuracy": 0.7412280440330505, "num_tokens": 482049.0, "step": 4091, "train/ce_loss": 0.579270601272583 }, { "epoch": 0.40448882736800473, "step": 4091, "train/sim_loss": 0.0234375 }, { "epoch": 0.40448882736800473, "step": 4091, "train/total_loss": 0.08136455714702606 }, { "entropy": 9.28750228881836, "epoch": 0.4045877002175203, "mean_token_accuracy": 0.66847825050354, "num_tokens": 487210.0, "step": 4092, "train/ce_loss": 1.484215658820176e-06 }, { "epoch": 0.4045877002175203, "step": 4092, "train/sim_loss": 0.06640625 }, { "epoch": 0.4045877002175203, "step": 4092, "train/total_loss": 0.06640639901161194 }, { "entropy": 9.270790100097656, "epoch": 0.4046865730670358, "mean_token_accuracy": 0.7216066718101501, "num_tokens": 492397.0, "step": 4093, "train/ce_loss": 0.3460865616798401 }, { "epoch": 0.4046865730670358, "step": 4093, "train/sim_loss": 0.0625 }, { "epoch": 0.4046865730670358, "step": 4093, "train/total_loss": 0.09710866212844849 }, { "entropy": 10.077104568481445, "epoch": 0.4047854459165513, "mean_token_accuracy": 0.739534854888916, "num_tokens": 497040.0, "step": 4094, "train/ce_loss": 6.597715582756791e-06 }, { "epoch": 0.4047854459165513, "step": 4094, "train/sim_loss": 0.06640625 }, { "epoch": 0.4047854459165513, "step": 4094, "train/total_loss": 0.06640691310167313 }, { "entropy": 9.576848030090332, "epoch": 0.40488431876606684, "mean_token_accuracy": 0.7211538553237915, "num_tokens": 502022.0, "step": 4095, "train/ce_loss": 1.0170562267303467 }, { "epoch": 0.40488431876606684, "step": 4095, "train/sim_loss": 0.0859375 }, { "epoch": 0.40488431876606684, "step": 4095, "train/total_loss": 0.1876431256532669 }, { "entropy": 9.736879348754883, "epoch": 0.4049831916155824, "mean_token_accuracy": 0.7448630332946777, "num_tokens": 507008.0, "step": 4096, "train/ce_loss": 1.0227447748184204 }, { "epoch": 0.4049831916155824, "step": 4096, "train/sim_loss": 0.10546875 }, { "epoch": 0.4049831916155824, "step": 4096, "train/total_loss": 0.20774322748184204 }, { "entropy": 9.458332061767578, "epoch": 0.40508206446509787, "mean_token_accuracy": 0.7558320164680481, "num_tokens": 512081.0, "step": 4097, "train/ce_loss": 0.6700068116188049 }, { "epoch": 0.40508206446509787, "step": 4097, "train/sim_loss": 0.03125 }, { "epoch": 0.40508206446509787, "step": 4097, "train/total_loss": 0.09825067967176437 }, { "entropy": 9.689407348632812, "epoch": 0.4051809373146134, "mean_token_accuracy": 0.7162471413612366, "num_tokens": 516974.0, "step": 4098, "train/ce_loss": 2.7539579150470672e-06 }, { "epoch": 0.4051809373146134, "step": 4098, "train/sim_loss": 0.0703125 }, { "epoch": 0.4051809373146134, "step": 4098, "train/total_loss": 0.07031277567148209 }, { "entropy": 9.511401176452637, "epoch": 0.40527981016412895, "mean_token_accuracy": 0.7224264740943909, "num_tokens": 521918.0, "step": 4099, "train/ce_loss": 0.7716624140739441 }, { "epoch": 0.40527981016412895, "step": 4099, "train/sim_loss": 0.0546875 }, { "epoch": 0.40527981016412895, "step": 4099, "train/total_loss": 0.13185374438762665 }, { "epoch": 0.40537868301364444, "grad_norm": 0.7709155082702637, "learning_rate": 8.989022400237354e-06, "loss": 0.1442, "step": 4100 }, { "entropy": 8.770706176757812, "epoch": 0.40537868301364444, "mean_token_accuracy": 0.7266880869865417, "num_tokens": 527264.0, "step": 4100, "train/ce_loss": 1.0688512325286865 }, { "epoch": 0.40537868301364444, "step": 4100, "train/sim_loss": 0.0546875 }, { "epoch": 0.40537868301364444, "step": 4100, "train/total_loss": 0.1615726351737976 }, { "entropy": 9.146575927734375, "epoch": 0.40547755586316, "mean_token_accuracy": 0.747863233089447, "num_tokens": 532406.0, "step": 4101, "train/ce_loss": 0.6212337613105774 }, { "epoch": 0.40547755586316, "step": 4101, "train/sim_loss": 0.02734375 }, { "epoch": 0.40547755586316, "step": 4101, "train/total_loss": 0.0894671231508255 }, { "entropy": 8.769676208496094, "epoch": 0.4055764287126755, "mean_token_accuracy": 0.7096070051193237, "num_tokens": 537843.0, "step": 4102, "train/ce_loss": 1.2739291191101074 }, { "epoch": 0.4055764287126755, "step": 4102, "train/sim_loss": 0.09375 }, { "epoch": 0.4055764287126755, "step": 4102, "train/total_loss": 0.22114291787147522 }, { "entropy": 9.421545028686523, "epoch": 0.405675301562191, "mean_token_accuracy": 0.761695921421051, "num_tokens": 542962.0, "step": 4103, "train/ce_loss": 3.845986611850094e-06 }, { "epoch": 0.405675301562191, "step": 4103, "train/sim_loss": 0.05078125 }, { "epoch": 0.405675301562191, "step": 4103, "train/total_loss": 0.05078163370490074 }, { "entropy": 9.3009033203125, "epoch": 0.40577417441170655, "mean_token_accuracy": 0.7651195526123047, "num_tokens": 548130.0, "step": 4104, "train/ce_loss": 0.5734964609146118 }, { "epoch": 0.40577417441170655, "step": 4104, "train/sim_loss": 0.06640625 }, { "epoch": 0.40577417441170655, "step": 4104, "train/total_loss": 0.12375590205192566 }, { "entropy": 9.662715911865234, "epoch": 0.4058730472612221, "mean_token_accuracy": 0.7347294688224792, "num_tokens": 553113.0, "step": 4105, "train/ce_loss": 0.7485767006874084 }, { "epoch": 0.4058730472612221, "step": 4105, "train/sim_loss": 0.0546875 }, { "epoch": 0.4058730472612221, "step": 4105, "train/total_loss": 0.1295451819896698 }, { "entropy": 8.815567970275879, "epoch": 0.40597192011073757, "mean_token_accuracy": 0.711275041103363, "num_tokens": 558553.0, "step": 4106, "train/ce_loss": 0.6791121363639832 }, { "epoch": 0.40597192011073757, "step": 4106, "train/sim_loss": 0.046875 }, { "epoch": 0.40597192011073757, "step": 4106, "train/total_loss": 0.11478621512651443 }, { "entropy": 9.781185150146484, "epoch": 0.4060707929602531, "mean_token_accuracy": 0.7198953032493591, "num_tokens": 563349.0, "step": 4107, "train/ce_loss": 4.859100954490714e-06 }, { "epoch": 0.4060707929602531, "step": 4107, "train/sim_loss": 0.06640625 }, { "epoch": 0.4060707929602531, "step": 4107, "train/total_loss": 0.0664067342877388 }, { "entropy": 9.460572242736816, "epoch": 0.40616966580976865, "mean_token_accuracy": 0.7015625238418579, "num_tokens": 568465.0, "step": 4108, "train/ce_loss": 7.071306754369289e-06 }, { "epoch": 0.40616966580976865, "step": 4108, "train/sim_loss": 0.0390625 }, { "epoch": 0.40616966580976865, "step": 4108, "train/total_loss": 0.03906320780515671 }, { "entropy": 9.23255729675293, "epoch": 0.40626853865928414, "mean_token_accuracy": 0.769336998462677, "num_tokens": 573626.0, "step": 4109, "train/ce_loss": 0.6542064547538757 }, { "epoch": 0.40626853865928414, "step": 4109, "train/sim_loss": 0.04296875 }, { "epoch": 0.40626853865928414, "step": 4109, "train/total_loss": 0.10838939994573593 }, { "entropy": 9.546173095703125, "epoch": 0.4063674115087997, "mean_token_accuracy": 0.7324841022491455, "num_tokens": 578639.0, "step": 4110, "train/ce_loss": 0.6791893243789673 }, { "epoch": 0.4063674115087997, "step": 4110, "train/sim_loss": 0.0625 }, { "epoch": 0.4063674115087997, "step": 4110, "train/total_loss": 0.13041892647743225 }, { "entropy": 9.005331993103027, "epoch": 0.4064662843583152, "mean_token_accuracy": 0.7317380309104919, "num_tokens": 583958.0, "step": 4111, "train/ce_loss": 0.8003168106079102 }, { "epoch": 0.4064662843583152, "step": 4111, "train/sim_loss": 0.046875 }, { "epoch": 0.4064662843583152, "step": 4111, "train/total_loss": 0.12690669298171997 }, { "entropy": 9.159075736999512, "epoch": 0.4065651572078307, "mean_token_accuracy": 0.7363966107368469, "num_tokens": 589231.0, "step": 4112, "train/ce_loss": 0.8423686027526855 }, { "epoch": 0.4065651572078307, "step": 4112, "train/sim_loss": 0.04296875 }, { "epoch": 0.4065651572078307, "step": 4112, "train/total_loss": 0.12720561027526855 }, { "entropy": 8.986069679260254, "epoch": 0.40666403005734625, "mean_token_accuracy": 0.7165071964263916, "num_tokens": 594554.0, "step": 4113, "train/ce_loss": 0.9497794508934021 }, { "epoch": 0.40666403005734625, "step": 4113, "train/sim_loss": 0.0859375 }, { "epoch": 0.40666403005734625, "step": 4113, "train/total_loss": 0.1809154450893402 }, { "entropy": 8.952409744262695, "epoch": 0.4067629029068618, "mean_token_accuracy": 0.7755101919174194, "num_tokens": 599955.0, "step": 4114, "train/ce_loss": 0.6269693970680237 }, { "epoch": 0.4067629029068618, "step": 4114, "train/sim_loss": 0.01953125 }, { "epoch": 0.4067629029068618, "step": 4114, "train/total_loss": 0.08222819119691849 }, { "entropy": 8.910983085632324, "epoch": 0.4068617757563773, "mean_token_accuracy": 0.7552447319030762, "num_tokens": 605377.0, "step": 4115, "train/ce_loss": 0.5977582931518555 }, { "epoch": 0.4068617757563773, "step": 4115, "train/sim_loss": 0.12109375 }, { "epoch": 0.4068617757563773, "step": 4115, "train/total_loss": 0.18086957931518555 }, { "entropy": 9.15473747253418, "epoch": 0.4069606486058928, "mean_token_accuracy": 0.7110552787780762, "num_tokens": 610645.0, "step": 4116, "train/ce_loss": 8.161274308804423e-06 }, { "epoch": 0.4069606486058928, "step": 4116, "train/sim_loss": 0.06640625 }, { "epoch": 0.4069606486058928, "step": 4116, "train/total_loss": 0.06640706956386566 }, { "entropy": 8.895200729370117, "epoch": 0.40705952145540836, "mean_token_accuracy": 0.7756041288375854, "num_tokens": 615997.0, "step": 4117, "train/ce_loss": 0.7694007754325867 }, { "epoch": 0.40705952145540836, "step": 4117, "train/sim_loss": 0.0625 }, { "epoch": 0.40705952145540836, "step": 4117, "train/total_loss": 0.13944008946418762 }, { "entropy": 9.071516990661621, "epoch": 0.40715839430492384, "mean_token_accuracy": 0.718471348285675, "num_tokens": 621268.0, "step": 4118, "train/ce_loss": 1.1130056381225586 }, { "epoch": 0.40715839430492384, "step": 4118, "train/sim_loss": 0.03125 }, { "epoch": 0.40715839430492384, "step": 4118, "train/total_loss": 0.14255055785179138 }, { "entropy": 8.723745346069336, "epoch": 0.4072572671544394, "mean_token_accuracy": 0.7781955003738403, "num_tokens": 626806.0, "step": 4119, "train/ce_loss": 0.3095889985561371 }, { "epoch": 0.4072572671544394, "step": 4119, "train/sim_loss": 0.01953125 }, { "epoch": 0.4072572671544394, "step": 4119, "train/total_loss": 0.05049014836549759 }, { "epoch": 0.4073561400039549, "grad_norm": 0.59541255235672, "learning_rate": 8.984077535479406e-06, "loss": 0.1382, "step": 4120 }, { "entropy": 10.090121269226074, "epoch": 0.4073561400039549, "mean_token_accuracy": 0.7755610942840576, "num_tokens": 631611.0, "step": 4120, "train/ce_loss": 1.8787650333251804e-06 }, { "epoch": 0.4073561400039549, "step": 4120, "train/sim_loss": 0.01953125 }, { "epoch": 0.4073561400039549, "step": 4120, "train/total_loss": 0.019531438127160072 }, { "entropy": 8.96327018737793, "epoch": 0.4074550128534704, "mean_token_accuracy": 0.6701461672782898, "num_tokens": 637197.0, "step": 4121, "train/ce_loss": 0.7925607562065125 }, { "epoch": 0.4074550128534704, "step": 4121, "train/sim_loss": 0.04296875 }, { "epoch": 0.4074550128534704, "step": 4121, "train/total_loss": 0.1222248300909996 }, { "entropy": 10.416955947875977, "epoch": 0.40755388570298595, "mean_token_accuracy": 0.7746478915214539, "num_tokens": 641789.0, "step": 4122, "train/ce_loss": 4.615934358298546e-06 }, { "epoch": 0.40755388570298595, "step": 4122, "train/sim_loss": 0.015625 }, { "epoch": 0.40755388570298595, "step": 4122, "train/total_loss": 0.01562546193599701 }, { "entropy": 8.998491287231445, "epoch": 0.4076527585525015, "mean_token_accuracy": 0.7345013618469238, "num_tokens": 646980.0, "step": 4123, "train/ce_loss": 1.7812578678131104 }, { "epoch": 0.4076527585525015, "step": 4123, "train/sim_loss": 0.078125 }, { "epoch": 0.4076527585525015, "step": 4123, "train/total_loss": 0.25625079870224 }, { "entropy": 9.302095413208008, "epoch": 0.407751631402017, "mean_token_accuracy": 0.6926407217979431, "num_tokens": 652290.0, "step": 4124, "train/ce_loss": 0.5359441637992859 }, { "epoch": 0.407751631402017, "step": 4124, "train/sim_loss": 0.05078125 }, { "epoch": 0.407751631402017, "step": 4124, "train/total_loss": 0.10437566787004471 }, { "entropy": 8.885763168334961, "epoch": 0.4078505042515325, "mean_token_accuracy": 0.7222222089767456, "num_tokens": 657511.0, "step": 4125, "train/ce_loss": 0.9504994750022888 }, { "epoch": 0.4078505042515325, "step": 4125, "train/sim_loss": 0.0546875 }, { "epoch": 0.4078505042515325, "step": 4125, "train/total_loss": 0.14973744750022888 }, { "entropy": 8.990278244018555, "epoch": 0.40794937710104806, "mean_token_accuracy": 0.7949336767196655, "num_tokens": 662855.0, "step": 4126, "train/ce_loss": 0.8072158098220825 }, { "epoch": 0.40794937710104806, "step": 4126, "train/sim_loss": 0.06640625 }, { "epoch": 0.40794937710104806, "step": 4126, "train/total_loss": 0.14712783694267273 }, { "entropy": 8.927145004272461, "epoch": 0.4080482499505636, "mean_token_accuracy": 0.7793939113616943, "num_tokens": 668150.0, "step": 4127, "train/ce_loss": 0.6687441468238831 }, { "epoch": 0.4080482499505636, "step": 4127, "train/sim_loss": 0.12109375 }, { "epoch": 0.4080482499505636, "step": 4127, "train/total_loss": 0.1879681646823883 }, { "entropy": 9.023634910583496, "epoch": 0.4081471228000791, "mean_token_accuracy": 0.7296416759490967, "num_tokens": 673574.0, "step": 4128, "train/ce_loss": 0.825593888759613 }, { "epoch": 0.4081471228000791, "step": 4128, "train/sim_loss": 0.07421875 }, { "epoch": 0.4081471228000791, "step": 4128, "train/total_loss": 0.15677814185619354 }, { "entropy": 9.965190887451172, "epoch": 0.40824599564959463, "mean_token_accuracy": 0.7906976938247681, "num_tokens": 678333.0, "step": 4129, "train/ce_loss": 1.6968340873718262 }, { "epoch": 0.40824599564959463, "step": 4129, "train/sim_loss": 0.03515625 }, { "epoch": 0.40824599564959463, "step": 4129, "train/total_loss": 0.20483966171741486 }, { "entropy": 9.249101638793945, "epoch": 0.40834486849911017, "mean_token_accuracy": 0.7264437675476074, "num_tokens": 683440.0, "step": 4130, "train/ce_loss": 1.220321536064148 }, { "epoch": 0.40834486849911017, "step": 4130, "train/sim_loss": 0.08984375 }, { "epoch": 0.40834486849911017, "step": 4130, "train/total_loss": 0.21187591552734375 }, { "entropy": 9.138092041015625, "epoch": 0.40844374134862566, "mean_token_accuracy": 0.7407407164573669, "num_tokens": 688917.0, "step": 4131, "train/ce_loss": 0.9167888760566711 }, { "epoch": 0.40844374134862566, "step": 4131, "train/sim_loss": 0.0703125 }, { "epoch": 0.40844374134862566, "step": 4131, "train/total_loss": 0.16199138760566711 }, { "entropy": 9.164538383483887, "epoch": 0.4085426141981412, "mean_token_accuracy": 0.7535853981971741, "num_tokens": 694046.0, "step": 4132, "train/ce_loss": 0.8670970797538757 }, { "epoch": 0.4085426141981412, "step": 4132, "train/sim_loss": 0.046875 }, { "epoch": 0.4085426141981412, "step": 4132, "train/total_loss": 0.13358470797538757 }, { "entropy": 8.78746223449707, "epoch": 0.40864148704765674, "mean_token_accuracy": 0.6848341226577759, "num_tokens": 699336.0, "step": 4133, "train/ce_loss": 0.9098950624465942 }, { "epoch": 0.40864148704765674, "step": 4133, "train/sim_loss": 0.1171875 }, { "epoch": 0.40864148704765674, "step": 4133, "train/total_loss": 0.20817700028419495 }, { "entropy": 10.089083671569824, "epoch": 0.4087403598971722, "mean_token_accuracy": 0.7387387156486511, "num_tokens": 703964.0, "step": 4134, "train/ce_loss": 8.100925697362982e-06 }, { "epoch": 0.4087403598971722, "step": 4134, "train/sim_loss": 0.02734375 }, { "epoch": 0.4087403598971722, "step": 4134, "train/total_loss": 0.027344560250639915 }, { "entropy": 9.525264739990234, "epoch": 0.40883923274668776, "mean_token_accuracy": 0.7131537199020386, "num_tokens": 709024.0, "step": 4135, "train/ce_loss": 0.7740110754966736 }, { "epoch": 0.40883923274668776, "step": 4135, "train/sim_loss": 0.03125 }, { "epoch": 0.40883923274668776, "step": 4135, "train/total_loss": 0.10865110903978348 }, { "entropy": 8.619873046875, "epoch": 0.4089381055962033, "mean_token_accuracy": 0.740170955657959, "num_tokens": 714657.0, "step": 4136, "train/ce_loss": 0.383798211812973 }, { "epoch": 0.4089381055962033, "step": 4136, "train/sim_loss": 0.05859375 }, { "epoch": 0.4089381055962033, "step": 4136, "train/total_loss": 0.09697356820106506 }, { "entropy": 9.893526077270508, "epoch": 0.4090369784457188, "mean_token_accuracy": 0.7224576473236084, "num_tokens": 719549.0, "step": 4137, "train/ce_loss": 2.5385968685150146 }, { "epoch": 0.4090369784457188, "step": 4137, "train/sim_loss": 0.0625 }, { "epoch": 0.4090369784457188, "step": 4137, "train/total_loss": 0.3163596987724304 }, { "entropy": 9.709188461303711, "epoch": 0.40913585129523433, "mean_token_accuracy": 0.7655038833618164, "num_tokens": 724521.0, "step": 4138, "train/ce_loss": 2.8100243980588857e-06 }, { "epoch": 0.40913585129523433, "step": 4138, "train/sim_loss": 0.05859375 }, { "epoch": 0.40913585129523433, "step": 4138, "train/total_loss": 0.058594029396772385 }, { "entropy": 9.046256065368652, "epoch": 0.4092347241447499, "mean_token_accuracy": 0.7681970596313477, "num_tokens": 729901.0, "step": 4139, "train/ce_loss": 0.5904944539070129 }, { "epoch": 0.4092347241447499, "step": 4139, "train/sim_loss": 0.05078125 }, { "epoch": 0.4092347241447499, "step": 4139, "train/total_loss": 0.10983069241046906 }, { "epoch": 0.40933359699426536, "grad_norm": 0.652369499206543, "learning_rate": 8.979132670721456e-06, "loss": 0.1384, "step": 4140 }, { "entropy": 9.250444412231445, "epoch": 0.40933359699426536, "mean_token_accuracy": 0.7360594868659973, "num_tokens": 735138.0, "step": 4140, "train/ce_loss": 1.768407940864563 }, { "epoch": 0.40933359699426536, "step": 4140, "train/sim_loss": 0.05078125 }, { "epoch": 0.40933359699426536, "step": 4140, "train/total_loss": 0.22762204706668854 }, { "entropy": 8.794022560119629, "epoch": 0.4094324698437809, "mean_token_accuracy": 0.703157901763916, "num_tokens": 740592.0, "step": 4141, "train/ce_loss": 1.0213520526885986 }, { "epoch": 0.4094324698437809, "step": 4141, "train/sim_loss": 0.11328125 }, { "epoch": 0.4094324698437809, "step": 4141, "train/total_loss": 0.21541646122932434 }, { "entropy": 9.040531158447266, "epoch": 0.40953134269329644, "mean_token_accuracy": 0.7473822236061096, "num_tokens": 745841.0, "step": 4142, "train/ce_loss": 0.9278818368911743 }, { "epoch": 0.40953134269329644, "step": 4142, "train/sim_loss": 0.05859375 }, { "epoch": 0.40953134269329644, "step": 4142, "train/total_loss": 0.1513819396495819 }, { "entropy": 9.186511039733887, "epoch": 0.4096302155428119, "mean_token_accuracy": 0.7512500286102295, "num_tokens": 751087.0, "step": 4143, "train/ce_loss": 1.1020587180610164e-06 }, { "epoch": 0.4096302155428119, "step": 4143, "train/sim_loss": 0.03515625 }, { "epoch": 0.4096302155428119, "step": 4143, "train/total_loss": 0.035156361758708954 }, { "entropy": 8.611774444580078, "epoch": 0.40972908839232747, "mean_token_accuracy": 0.7169615030288696, "num_tokens": 756512.0, "step": 4144, "train/ce_loss": 1.0824406147003174 }, { "epoch": 0.40972908839232747, "step": 4144, "train/sim_loss": 0.05859375 }, { "epoch": 0.40972908839232747, "step": 4144, "train/total_loss": 0.16683781147003174 }, { "entropy": 9.649202346801758, "epoch": 0.409827961241843, "mean_token_accuracy": 0.7317460179328918, "num_tokens": 761563.0, "step": 4145, "train/ce_loss": 1.1951160430908203 }, { "epoch": 0.409827961241843, "step": 4145, "train/sim_loss": 0.0625 }, { "epoch": 0.409827961241843, "step": 4145, "train/total_loss": 0.18201160430908203 }, { "entropy": 8.656286239624023, "epoch": 0.4099268340913585, "mean_token_accuracy": 0.7485822439193726, "num_tokens": 767104.0, "step": 4146, "train/ce_loss": 0.3673864006996155 }, { "epoch": 0.4099268340913585, "step": 4146, "train/sim_loss": 0.0546875 }, { "epoch": 0.4099268340913585, "step": 4146, "train/total_loss": 0.09142614156007767 }, { "entropy": 9.47769546508789, "epoch": 0.41002570694087404, "mean_token_accuracy": 0.7534013390541077, "num_tokens": 772128.0, "step": 4147, "train/ce_loss": 2.0078127818123903e-06 }, { "epoch": 0.41002570694087404, "step": 4147, "train/sim_loss": 0.05859375 }, { "epoch": 0.41002570694087404, "step": 4147, "train/total_loss": 0.05859395116567612 }, { "entropy": 9.3026123046875, "epoch": 0.4101245797903896, "mean_token_accuracy": 0.7651821970939636, "num_tokens": 777339.0, "step": 4148, "train/ce_loss": 0.8742901682853699 }, { "epoch": 0.4101245797903896, "step": 4148, "train/sim_loss": 0.05078125 }, { "epoch": 0.4101245797903896, "step": 4148, "train/total_loss": 0.138210266828537 }, { "entropy": 8.528619766235352, "epoch": 0.41022345263990506, "mean_token_accuracy": 0.8164300322532654, "num_tokens": 782830.0, "step": 4149, "train/ce_loss": 0.5077932476997375 }, { "epoch": 0.41022345263990506, "step": 4149, "train/sim_loss": 0.01953125 }, { "epoch": 0.41022345263990506, "step": 4149, "train/total_loss": 0.070310577750206 }, { "entropy": 9.738550186157227, "epoch": 0.4103223254894206, "mean_token_accuracy": 0.8369781374931335, "num_tokens": 787749.0, "step": 4150, "train/ce_loss": 2.5044425910891732e-06 }, { "epoch": 0.4103223254894206, "step": 4150, "train/sim_loss": 0.0546875 }, { "epoch": 0.4103223254894206, "step": 4150, "train/total_loss": 0.05468774959445 }, { "entropy": 9.343286514282227, "epoch": 0.41042119833893614, "mean_token_accuracy": 0.7023977637290955, "num_tokens": 792922.0, "step": 4151, "train/ce_loss": 0.5836455225944519 }, { "epoch": 0.41042119833893614, "step": 4151, "train/sim_loss": 0.0625 }, { "epoch": 0.41042119833893614, "step": 4151, "train/total_loss": 0.12086455523967743 }, { "entropy": 9.220907211303711, "epoch": 0.41052007118845163, "mean_token_accuracy": 0.7028713822364807, "num_tokens": 798218.0, "step": 4152, "train/ce_loss": 1.197948932647705 }, { "epoch": 0.41052007118845163, "step": 4152, "train/sim_loss": 0.09375 }, { "epoch": 0.41052007118845163, "step": 4152, "train/total_loss": 0.21354490518569946 }, { "entropy": 9.451131820678711, "epoch": 0.41061894403796717, "mean_token_accuracy": 0.7160493731498718, "num_tokens": 803372.0, "step": 4153, "train/ce_loss": 0.8334396481513977 }, { "epoch": 0.41061894403796717, "step": 4153, "train/sim_loss": 0.109375 }, { "epoch": 0.41061894403796717, "step": 4153, "train/total_loss": 0.192718967795372 }, { "entropy": 9.613548278808594, "epoch": 0.4107178168874827, "mean_token_accuracy": 0.7343173623085022, "num_tokens": 808338.0, "step": 4154, "train/ce_loss": 1.4410486221313477 }, { "epoch": 0.4107178168874827, "step": 4154, "train/sim_loss": 0.05859375 }, { "epoch": 0.4107178168874827, "step": 4154, "train/total_loss": 0.20269861817359924 }, { "entropy": 9.733526229858398, "epoch": 0.4108166897369982, "mean_token_accuracy": 0.7347368597984314, "num_tokens": 813253.0, "step": 4155, "train/ce_loss": 1.0818486213684082 }, { "epoch": 0.4108166897369982, "step": 4155, "train/sim_loss": 0.046875 }, { "epoch": 0.4108166897369982, "step": 4155, "train/total_loss": 0.15505987405776978 }, { "entropy": 9.090243339538574, "epoch": 0.41091556258651374, "mean_token_accuracy": 0.6897767186164856, "num_tokens": 818527.0, "step": 4156, "train/ce_loss": 1.2623934745788574 }, { "epoch": 0.41091556258651374, "step": 4156, "train/sim_loss": 0.140625 }, { "epoch": 0.41091556258651374, "step": 4156, "train/total_loss": 0.2668643593788147 }, { "entropy": 8.927118301391602, "epoch": 0.4110144354360293, "mean_token_accuracy": 0.6821621656417847, "num_tokens": 823946.0, "step": 4157, "train/ce_loss": 1.2227681875228882 }, { "epoch": 0.4110144354360293, "step": 4157, "train/sim_loss": 0.1015625 }, { "epoch": 0.4110144354360293, "step": 4157, "train/total_loss": 0.22383931279182434 }, { "entropy": 9.305601119995117, "epoch": 0.41111330828554477, "mean_token_accuracy": 0.7320703864097595, "num_tokens": 829164.0, "step": 4158, "train/ce_loss": 0.6768001914024353 }, { "epoch": 0.41111330828554477, "step": 4158, "train/sim_loss": 0.0546875 }, { "epoch": 0.41111330828554477, "step": 4158, "train/total_loss": 0.12236752361059189 }, { "entropy": 9.253849983215332, "epoch": 0.4112121811350603, "mean_token_accuracy": 0.7406483888626099, "num_tokens": 834422.0, "step": 4159, "train/ce_loss": 1.0460083484649658 }, { "epoch": 0.4112121811350603, "step": 4159, "train/sim_loss": 0.109375 }, { "epoch": 0.4112121811350603, "step": 4159, "train/total_loss": 0.21397584676742554 }, { "epoch": 0.41131105398457585, "grad_norm": 0.7883924245834351, "learning_rate": 8.974187805963509e-06, "loss": 0.1431, "step": 4160 }, { "entropy": 9.040374755859375, "epoch": 0.41131105398457585, "mean_token_accuracy": 0.7023153305053711, "num_tokens": 839765.0, "step": 4160, "train/ce_loss": 1.358425498008728 }, { "epoch": 0.41131105398457585, "step": 4160, "train/sim_loss": 0.0546875 }, { "epoch": 0.41131105398457585, "step": 4160, "train/total_loss": 0.19053004682064056 }, { "entropy": 9.092089653015137, "epoch": 0.41140992683409133, "mean_token_accuracy": 0.7114177942276001, "num_tokens": 845004.0, "step": 4161, "train/ce_loss": 0.8708246350288391 }, { "epoch": 0.41140992683409133, "step": 4161, "train/sim_loss": 0.06640625 }, { "epoch": 0.41140992683409133, "step": 4161, "train/total_loss": 0.15348872542381287 }, { "entropy": 8.921030044555664, "epoch": 0.4115087996836069, "mean_token_accuracy": 0.7410617470741272, "num_tokens": 850422.0, "step": 4162, "train/ce_loss": 0.5317978262901306 }, { "epoch": 0.4115087996836069, "step": 4162, "train/sim_loss": 0.0625 }, { "epoch": 0.4115087996836069, "step": 4162, "train/total_loss": 0.1156797856092453 }, { "entropy": 8.912398338317871, "epoch": 0.4116076725331224, "mean_token_accuracy": 0.7205284833908081, "num_tokens": 855846.0, "step": 4163, "train/ce_loss": 1.3807965517044067 }, { "epoch": 0.4116076725331224, "step": 4163, "train/sim_loss": 0.046875 }, { "epoch": 0.4116076725331224, "step": 4163, "train/total_loss": 0.1849546581506729 }, { "entropy": 9.397573471069336, "epoch": 0.4117065453826379, "mean_token_accuracy": 0.7416918277740479, "num_tokens": 860972.0, "step": 4164, "train/ce_loss": 0.8983836770057678 }, { "epoch": 0.4117065453826379, "step": 4164, "train/sim_loss": 0.0703125 }, { "epoch": 0.4117065453826379, "step": 4164, "train/total_loss": 0.16015087068080902 }, { "entropy": 8.739873886108398, "epoch": 0.41180541823215344, "mean_token_accuracy": 0.7679324746131897, "num_tokens": 866399.0, "step": 4165, "train/ce_loss": 0.801369309425354 }, { "epoch": 0.41180541823215344, "step": 4165, "train/sim_loss": 0.02734375 }, { "epoch": 0.41180541823215344, "step": 4165, "train/total_loss": 0.10748068243265152 }, { "entropy": 9.21189022064209, "epoch": 0.411904291081669, "mean_token_accuracy": 0.7422552704811096, "num_tokens": 871674.0, "step": 4166, "train/ce_loss": 0.5884183645248413 }, { "epoch": 0.411904291081669, "step": 4166, "train/sim_loss": 0.0625 }, { "epoch": 0.411904291081669, "step": 4166, "train/total_loss": 0.12134183943271637 }, { "entropy": 9.931614875793457, "epoch": 0.4120031639311845, "mean_token_accuracy": 0.738095223903656, "num_tokens": 876489.0, "step": 4167, "train/ce_loss": 1.0798530578613281 }, { "epoch": 0.4120031639311845, "step": 4167, "train/sim_loss": 0.05078125 }, { "epoch": 0.4120031639311845, "step": 4167, "train/total_loss": 0.15876656770706177 }, { "entropy": 9.207361221313477, "epoch": 0.4121020367807, "mean_token_accuracy": 0.7567164301872253, "num_tokens": 881592.0, "step": 4168, "train/ce_loss": 2.0488355403358582e-06 }, { "epoch": 0.4121020367807, "step": 4168, "train/sim_loss": 0.046875 }, { "epoch": 0.4121020367807, "step": 4168, "train/total_loss": 0.046875204890966415 }, { "entropy": 8.989534378051758, "epoch": 0.41220090963021555, "mean_token_accuracy": 0.7125550508499146, "num_tokens": 886980.0, "step": 4169, "train/ce_loss": 0.9053176045417786 }, { "epoch": 0.41220090963021555, "step": 4169, "train/sim_loss": 0.0390625 }, { "epoch": 0.41220090963021555, "step": 4169, "train/total_loss": 0.12959426641464233 }, { "entropy": 9.171548843383789, "epoch": 0.4122997824797311, "mean_token_accuracy": 0.7469879388809204, "num_tokens": 892232.0, "step": 4170, "train/ce_loss": 0.6478603482246399 }, { "epoch": 0.4122997824797311, "step": 4170, "train/sim_loss": 0.02734375 }, { "epoch": 0.4122997824797311, "step": 4170, "train/total_loss": 0.09212978929281235 }, { "entropy": 8.943717956542969, "epoch": 0.4123986553292466, "mean_token_accuracy": 0.7210065722465515, "num_tokens": 897634.0, "step": 4171, "train/ce_loss": 1.3900470733642578 }, { "epoch": 0.4123986553292466, "step": 4171, "train/sim_loss": 0.09765625 }, { "epoch": 0.4123986553292466, "step": 4171, "train/total_loss": 0.23666095733642578 }, { "entropy": 8.863798141479492, "epoch": 0.4124975281787621, "mean_token_accuracy": 0.7757575511932373, "num_tokens": 903065.0, "step": 4172, "train/ce_loss": 1.0701133012771606 }, { "epoch": 0.4124975281787621, "step": 4172, "train/sim_loss": 0.06640625 }, { "epoch": 0.4124975281787621, "step": 4172, "train/total_loss": 0.1734175831079483 }, { "entropy": 9.983399391174316, "epoch": 0.41259640102827766, "mean_token_accuracy": 0.751366138458252, "num_tokens": 907822.0, "step": 4173, "train/ce_loss": 2.0375791791593656e-06 }, { "epoch": 0.41259640102827766, "step": 4173, "train/sim_loss": 0.0234375 }, { "epoch": 0.41259640102827766, "step": 4173, "train/total_loss": 0.023437703028321266 }, { "entropy": 9.877622604370117, "epoch": 0.41269527387779315, "mean_token_accuracy": 0.7553191781044006, "num_tokens": 912435.0, "step": 4174, "train/ce_loss": 6.041376764187589e-06 }, { "epoch": 0.41269527387779315, "step": 4174, "train/sim_loss": 0.03515625 }, { "epoch": 0.41269527387779315, "step": 4174, "train/total_loss": 0.03515685349702835 }, { "entropy": 9.246767044067383, "epoch": 0.4127941467273087, "mean_token_accuracy": 0.6970803141593933, "num_tokens": 917902.0, "step": 4175, "train/ce_loss": 0.69617760181427 }, { "epoch": 0.4127941467273087, "step": 4175, "train/sim_loss": 0.05078125 }, { "epoch": 0.4127941467273087, "step": 4175, "train/total_loss": 0.12039901316165924 }, { "entropy": 9.241673469543457, "epoch": 0.4128930195768242, "mean_token_accuracy": 0.7374005317687988, "num_tokens": 923107.0, "step": 4176, "train/ce_loss": 0.7027705907821655 }, { "epoch": 0.4128930195768242, "step": 4176, "train/sim_loss": 0.08203125 }, { "epoch": 0.4128930195768242, "step": 4176, "train/total_loss": 0.15230831503868103 }, { "entropy": 9.122090339660645, "epoch": 0.4129918924263397, "mean_token_accuracy": 0.7830423712730408, "num_tokens": 928396.0, "step": 4177, "train/ce_loss": 0.9528135657310486 }, { "epoch": 0.4129918924263397, "step": 4177, "train/sim_loss": 0.046875 }, { "epoch": 0.4129918924263397, "step": 4177, "train/total_loss": 0.14215636253356934 }, { "entropy": 9.369904518127441, "epoch": 0.41309076527585525, "mean_token_accuracy": 0.8083735704421997, "num_tokens": 933458.0, "step": 4178, "train/ce_loss": 0.718553900718689 }, { "epoch": 0.41309076527585525, "step": 4178, "train/sim_loss": 0.08984375 }, { "epoch": 0.41309076527585525, "step": 4178, "train/total_loss": 0.16169914603233337 }, { "entropy": 9.105664253234863, "epoch": 0.4131896381253708, "mean_token_accuracy": 0.7049180269241333, "num_tokens": 938746.0, "step": 4179, "train/ce_loss": 1.1738409996032715 }, { "epoch": 0.4131896381253708, "step": 4179, "train/sim_loss": 0.07421875 }, { "epoch": 0.4131896381253708, "step": 4179, "train/total_loss": 0.19160285592079163 }, { "epoch": 0.4132885109748863, "grad_norm": 0.8040392994880676, "learning_rate": 8.969242941205559e-06, "loss": 0.1405, "step": 4180 }, { "entropy": 8.808820724487305, "epoch": 0.4132885109748863, "mean_token_accuracy": 0.6976987719535828, "num_tokens": 944186.0, "step": 4180, "train/ce_loss": 1.2142090797424316 }, { "epoch": 0.4132885109748863, "step": 4180, "train/sim_loss": 0.0546875 }, { "epoch": 0.4132885109748863, "step": 4180, "train/total_loss": 0.17610841989517212 }, { "entropy": 8.927715301513672, "epoch": 0.4133873838244018, "mean_token_accuracy": 0.7618510127067566, "num_tokens": 949529.0, "step": 4181, "train/ce_loss": 0.5040786266326904 }, { "epoch": 0.4133873838244018, "step": 4181, "train/sim_loss": 0.02734375 }, { "epoch": 0.4133873838244018, "step": 4181, "train/total_loss": 0.07775161415338516 }, { "entropy": 9.648799896240234, "epoch": 0.41348625667391736, "mean_token_accuracy": 0.7495256066322327, "num_tokens": 954500.0, "step": 4182, "train/ce_loss": 0.7559933066368103 }, { "epoch": 0.41348625667391736, "step": 4182, "train/sim_loss": 0.02734375 }, { "epoch": 0.41348625667391736, "step": 4182, "train/total_loss": 0.10294308513402939 }, { "entropy": 9.496209144592285, "epoch": 0.41358512952343285, "mean_token_accuracy": 0.7256198525428772, "num_tokens": 959543.0, "step": 4183, "train/ce_loss": 1.2499913282226771e-06 }, { "epoch": 0.41358512952343285, "step": 4183, "train/sim_loss": 0.02734375 }, { "epoch": 0.41358512952343285, "step": 4183, "train/total_loss": 0.027343874797225 }, { "entropy": 8.89802360534668, "epoch": 0.4136840023729484, "mean_token_accuracy": 0.7200461030006409, "num_tokens": 964881.0, "step": 4184, "train/ce_loss": 0.7619472742080688 }, { "epoch": 0.4136840023729484, "step": 4184, "train/sim_loss": 0.05859375 }, { "epoch": 0.4136840023729484, "step": 4184, "train/total_loss": 0.13478848338127136 }, { "entropy": 9.495051383972168, "epoch": 0.41378287522246393, "mean_token_accuracy": 0.6853002309799194, "num_tokens": 969783.0, "step": 4185, "train/ce_loss": 5.939029506407678e-06 }, { "epoch": 0.41378287522246393, "step": 4185, "train/sim_loss": 0.0703125 }, { "epoch": 0.41378287522246393, "step": 4185, "train/total_loss": 0.07031309604644775 }, { "entropy": 9.305354118347168, "epoch": 0.4138817480719794, "mean_token_accuracy": 0.7168758511543274, "num_tokens": 974986.0, "step": 4186, "train/ce_loss": 0.5816423892974854 }, { "epoch": 0.4138817480719794, "step": 4186, "train/sim_loss": 0.078125 }, { "epoch": 0.4138817480719794, "step": 4186, "train/total_loss": 0.13628923892974854 }, { "entropy": 9.138479232788086, "epoch": 0.41398062092149496, "mean_token_accuracy": 0.6998770236968994, "num_tokens": 980283.0, "step": 4187, "train/ce_loss": 1.1875509023666382 }, { "epoch": 0.41398062092149496, "step": 4187, "train/sim_loss": 0.0234375 }, { "epoch": 0.41398062092149496, "step": 4187, "train/total_loss": 0.14219260215759277 }, { "entropy": 8.918331146240234, "epoch": 0.4140794937710105, "mean_token_accuracy": 0.7854785323143005, "num_tokens": 985695.0, "step": 4188, "train/ce_loss": 0.6920308470726013 }, { "epoch": 0.4140794937710105, "step": 4188, "train/sim_loss": 0.0625 }, { "epoch": 0.4140794937710105, "step": 4188, "train/total_loss": 0.13170307874679565 }, { "entropy": 9.225728988647461, "epoch": 0.414178366620526, "mean_token_accuracy": 0.7077131271362305, "num_tokens": 990919.0, "step": 4189, "train/ce_loss": 1.3891641401642119e-06 }, { "epoch": 0.414178366620526, "step": 4189, "train/sim_loss": 0.0546875 }, { "epoch": 0.414178366620526, "step": 4189, "train/total_loss": 0.05468763783574104 }, { "entropy": 9.02457332611084, "epoch": 0.4142772394700415, "mean_token_accuracy": 0.762566864490509, "num_tokens": 996528.0, "step": 4190, "train/ce_loss": 0.4660285413265228 }, { "epoch": 0.4142772394700415, "step": 4190, "train/sim_loss": 0.078125 }, { "epoch": 0.4142772394700415, "step": 4190, "train/total_loss": 0.12472786009311676 }, { "entropy": 8.8408784866333, "epoch": 0.41437611231955707, "mean_token_accuracy": 0.7253599166870117, "num_tokens": 1001897.0, "step": 4191, "train/ce_loss": 0.8709747791290283 }, { "epoch": 0.41437611231955707, "step": 4191, "train/sim_loss": 0.0703125 }, { "epoch": 0.41437611231955707, "step": 4191, "train/total_loss": 0.15740998089313507 }, { "entropy": 9.4285888671875, "epoch": 0.41447498516907255, "mean_token_accuracy": 0.773809552192688, "num_tokens": 1007005.0, "step": 4192, "train/ce_loss": 0.7056515216827393 }, { "epoch": 0.41447498516907255, "step": 4192, "train/sim_loss": 0.09375 }, { "epoch": 0.41447498516907255, "step": 4192, "train/total_loss": 0.16431516408920288 }, { "entropy": 9.37752628326416, "epoch": 0.4145738580185881, "mean_token_accuracy": 0.7282127141952515, "num_tokens": 1012202.0, "step": 4193, "train/ce_loss": 0.6693522930145264 }, { "epoch": 0.4145738580185881, "step": 4193, "train/sim_loss": 0.15625 }, { "epoch": 0.4145738580185881, "step": 4193, "train/total_loss": 0.2231852412223816 }, { "entropy": 9.460760116577148, "epoch": 0.41467273086810363, "mean_token_accuracy": 0.7652892470359802, "num_tokens": 1017277.0, "step": 4194, "train/ce_loss": 1.9088233709335327 }, { "epoch": 0.41467273086810363, "step": 4194, "train/sim_loss": 0.12890625 }, { "epoch": 0.41467273086810363, "step": 4194, "train/total_loss": 0.3197885751724243 }, { "entropy": 9.202156066894531, "epoch": 0.4147716037176191, "mean_token_accuracy": 0.7761836647987366, "num_tokens": 1022458.0, "step": 4195, "train/ce_loss": 0.7725761532783508 }, { "epoch": 0.4147716037176191, "step": 4195, "train/sim_loss": 0.0234375 }, { "epoch": 0.4147716037176191, "step": 4195, "train/total_loss": 0.10069511830806732 }, { "entropy": 9.696306228637695, "epoch": 0.41487047656713466, "mean_token_accuracy": 0.789383590221405, "num_tokens": 1027490.0, "step": 4196, "train/ce_loss": 3.918094080290757e-06 }, { "epoch": 0.41487047656713466, "step": 4196, "train/sim_loss": 0.05859375 }, { "epoch": 0.41487047656713466, "step": 4196, "train/total_loss": 0.05859414115548134 }, { "entropy": 8.785720825195312, "epoch": 0.4149693494166502, "mean_token_accuracy": 0.7217742204666138, "num_tokens": 1033026.0, "step": 4197, "train/ce_loss": 1.0272657871246338 }, { "epoch": 0.4149693494166502, "step": 4197, "train/sim_loss": 0.04296875 }, { "epoch": 0.4149693494166502, "step": 4197, "train/total_loss": 0.14569532871246338 }, { "entropy": 9.278945922851562, "epoch": 0.4150682222661657, "mean_token_accuracy": 0.7827869057655334, "num_tokens": 1038239.0, "step": 4198, "train/ce_loss": 0.6755519509315491 }, { "epoch": 0.4150682222661657, "step": 4198, "train/sim_loss": 0.0234375 }, { "epoch": 0.4150682222661657, "step": 4198, "train/total_loss": 0.09099269658327103 }, { "entropy": 9.39266586303711, "epoch": 0.41516709511568123, "mean_token_accuracy": 0.7037037014961243, "num_tokens": 1043308.0, "step": 4199, "train/ce_loss": 2.017041879298631e-06 }, { "epoch": 0.41516709511568123, "step": 4199, "train/sim_loss": 0.0625 }, { "epoch": 0.41516709511568123, "step": 4199, "train/total_loss": 0.06250020116567612 }, { "epoch": 0.41526596796519677, "grad_norm": 0.8774731755256653, "learning_rate": 8.96429807644761e-06, "loss": 0.147, "step": 4200 }, { "entropy": 8.4367094039917, "epoch": 0.41526596796519677, "mean_token_accuracy": 0.7406989932060242, "num_tokens": 1048683.0, "step": 4200, "train/ce_loss": 1.0317161083221436 }, { "epoch": 0.41526596796519677, "step": 4200, "train/sim_loss": 0.078125 }, { "epoch": 0.41526596796519677, "step": 4200, "train/total_loss": 0.18129661679267883 }, { "entropy": 9.368021011352539, "epoch": 0.41536484081471226, "mean_token_accuracy": 0.699999988079071, "num_tokens": 1053755.0, "step": 4201, "train/ce_loss": 2.232920451206155e-06 }, { "epoch": 0.41536484081471226, "step": 4201, "train/sim_loss": 0.03515625 }, { "epoch": 0.41536484081471226, "step": 4201, "train/total_loss": 0.03515647351741791 }, { "entropy": 10.30126953125, "epoch": 0.4154637136642278, "mean_token_accuracy": 0.8042704463005066, "num_tokens": 1058581.0, "step": 4202, "train/ce_loss": 1.648161768913269 }, { "epoch": 0.4154637136642278, "step": 4202, "train/sim_loss": 0.109375 }, { "epoch": 0.4154637136642278, "step": 4202, "train/total_loss": 0.2741912007331848 }, { "entropy": 8.870027542114258, "epoch": 0.41556258651374334, "mean_token_accuracy": 0.7354685664176941, "num_tokens": 1063856.0, "step": 4203, "train/ce_loss": 1.175770878791809 }, { "epoch": 0.41556258651374334, "step": 4203, "train/sim_loss": 0.03515625 }, { "epoch": 0.41556258651374334, "step": 4203, "train/total_loss": 0.15273334085941315 }, { "entropy": 9.552102088928223, "epoch": 0.4156614593632588, "mean_token_accuracy": 0.7178502678871155, "num_tokens": 1068804.0, "step": 4204, "train/ce_loss": 1.5738786458969116 }, { "epoch": 0.4156614593632588, "step": 4204, "train/sim_loss": 0.10546875 }, { "epoch": 0.4156614593632588, "step": 4204, "train/total_loss": 0.2628566026687622 }, { "entropy": 9.039390563964844, "epoch": 0.41576033221277436, "mean_token_accuracy": 0.7024221420288086, "num_tokens": 1074141.0, "step": 4205, "train/ce_loss": 0.701492190361023 }, { "epoch": 0.41576033221277436, "step": 4205, "train/sim_loss": 0.03515625 }, { "epoch": 0.41576033221277436, "step": 4205, "train/total_loss": 0.10530547052621841 }, { "entropy": 9.621667861938477, "epoch": 0.4158592050622899, "mean_token_accuracy": 0.7717041969299316, "num_tokens": 1079197.0, "step": 4206, "train/ce_loss": 0.9569666981697083 }, { "epoch": 0.4158592050622899, "step": 4206, "train/sim_loss": 0.05078125 }, { "epoch": 0.4158592050622899, "step": 4206, "train/total_loss": 0.14647792279720306 }, { "entropy": 9.241792678833008, "epoch": 0.4159580779118054, "mean_token_accuracy": 0.6965811848640442, "num_tokens": 1084368.0, "step": 4207, "train/ce_loss": 1.1052935123443604 }, { "epoch": 0.4159580779118054, "step": 4207, "train/sim_loss": 0.0859375 }, { "epoch": 0.4159580779118054, "step": 4207, "train/total_loss": 0.196466863155365 }, { "entropy": 8.973245620727539, "epoch": 0.41605695076132093, "mean_token_accuracy": 0.6955017447471619, "num_tokens": 1089641.0, "step": 4208, "train/ce_loss": 0.8473589420318604 }, { "epoch": 0.41605695076132093, "step": 4208, "train/sim_loss": 0.16015625 }, { "epoch": 0.41605695076132093, "step": 4208, "train/total_loss": 0.2448921501636505 }, { "entropy": 8.825902938842773, "epoch": 0.4161558236108365, "mean_token_accuracy": 0.7387387156486511, "num_tokens": 1095097.0, "step": 4209, "train/ce_loss": 0.43971484899520874 }, { "epoch": 0.4161558236108365, "step": 4209, "train/sim_loss": 0.0546875 }, { "epoch": 0.4161558236108365, "step": 4209, "train/total_loss": 0.098658986389637 }, { "entropy": 9.571261405944824, "epoch": 0.416254696460352, "mean_token_accuracy": 0.675000011920929, "num_tokens": 1100225.0, "step": 4210, "train/ce_loss": 1.4352977275848389 }, { "epoch": 0.416254696460352, "step": 4210, "train/sim_loss": 0.09765625 }, { "epoch": 0.416254696460352, "step": 4210, "train/total_loss": 0.2411860227584839 }, { "entropy": 8.91400146484375, "epoch": 0.4163535693098675, "mean_token_accuracy": 0.7109634280204773, "num_tokens": 1105585.0, "step": 4211, "train/ce_loss": 0.828546404838562 }, { "epoch": 0.4163535693098675, "step": 4211, "train/sim_loss": 0.046875 }, { "epoch": 0.4163535693098675, "step": 4211, "train/total_loss": 0.12972964346408844 }, { "entropy": 9.401948928833008, "epoch": 0.41645244215938304, "mean_token_accuracy": 0.7531055808067322, "num_tokens": 1110588.0, "step": 4212, "train/ce_loss": 1.419521689414978 }, { "epoch": 0.41645244215938304, "step": 4212, "train/sim_loss": 0.09375 }, { "epoch": 0.41645244215938304, "step": 4212, "train/total_loss": 0.23570217192173004 }, { "entropy": 9.189818382263184, "epoch": 0.4165513150088986, "mean_token_accuracy": 0.7932816743850708, "num_tokens": 1115833.0, "step": 4213, "train/ce_loss": 0.5189548134803772 }, { "epoch": 0.4165513150088986, "step": 4213, "train/sim_loss": 0.0234375 }, { "epoch": 0.4165513150088986, "step": 4213, "train/total_loss": 0.07533298432826996 }, { "entropy": 8.877925872802734, "epoch": 0.41665018785841407, "mean_token_accuracy": 0.703398585319519, "num_tokens": 1121296.0, "step": 4214, "train/ce_loss": 0.8614786267280579 }, { "epoch": 0.41665018785841407, "step": 4214, "train/sim_loss": 0.1015625 }, { "epoch": 0.41665018785841407, "step": 4214, "train/total_loss": 0.18771037459373474 }, { "entropy": 9.34935474395752, "epoch": 0.4167490607079296, "mean_token_accuracy": 0.7023643851280212, "num_tokens": 1126472.0, "step": 4215, "train/ce_loss": 0.5411409139633179 }, { "epoch": 0.4167490607079296, "step": 4215, "train/sim_loss": 0.0625 }, { "epoch": 0.4167490607079296, "step": 4215, "train/total_loss": 0.11661408841609955 }, { "entropy": 9.215014457702637, "epoch": 0.41684793355744515, "mean_token_accuracy": 0.6658536791801453, "num_tokens": 1131774.0, "step": 4216, "train/ce_loss": 1.4825798273086548 }, { "epoch": 0.41684793355744515, "step": 4216, "train/sim_loss": 0.0703125 }, { "epoch": 0.41684793355744515, "step": 4216, "train/total_loss": 0.21857048571109772 }, { "entropy": 9.464516639709473, "epoch": 0.41694680640696064, "mean_token_accuracy": 0.8243451714515686, "num_tokens": 1136869.0, "step": 4217, "train/ce_loss": 1.5847562053750153e-06 }, { "epoch": 0.41694680640696064, "step": 4217, "train/sim_loss": 0.02734375 }, { "epoch": 0.41694680640696064, "step": 4217, "train/total_loss": 0.027343908324837685 }, { "entropy": 9.304022789001465, "epoch": 0.4170456792564762, "mean_token_accuracy": 0.747787594795227, "num_tokens": 1142053.0, "step": 4218, "train/ce_loss": 1.9297674498375272e-06 }, { "epoch": 0.4170456792564762, "step": 4218, "train/sim_loss": 0.05078125 }, { "epoch": 0.4170456792564762, "step": 4218, "train/total_loss": 0.05078144371509552 }, { "entropy": 9.988414764404297, "epoch": 0.4171445521059917, "mean_token_accuracy": 0.7318681478500366, "num_tokens": 1146940.0, "step": 4219, "train/ce_loss": 5.972405233478639e-06 }, { "epoch": 0.4171445521059917, "step": 4219, "train/sim_loss": 0.046875 }, { "epoch": 0.4171445521059917, "step": 4219, "train/total_loss": 0.046875596046447754 }, { "epoch": 0.4172434249555072, "grad_norm": 0.9290772080421448, "learning_rate": 8.95935321168966e-06, "loss": 0.1533, "step": 4220 }, { "entropy": 8.99875259399414, "epoch": 0.4172434249555072, "mean_token_accuracy": 0.7621302008628845, "num_tokens": 1152240.0, "step": 4220, "train/ce_loss": 0.849389910697937 }, { "epoch": 0.4172434249555072, "step": 4220, "train/sim_loss": 0.0234375 }, { "epoch": 0.4172434249555072, "step": 4220, "train/total_loss": 0.10837649554014206 }, { "entropy": 9.093700408935547, "epoch": 0.41734229780502274, "mean_token_accuracy": 0.7468982338905334, "num_tokens": 1157422.0, "step": 4221, "train/ce_loss": 0.7816839218139648 }, { "epoch": 0.41734229780502274, "step": 4221, "train/sim_loss": 0.0625 }, { "epoch": 0.41734229780502274, "step": 4221, "train/total_loss": 0.14066839218139648 }, { "entropy": 9.681419372558594, "epoch": 0.4174411706545383, "mean_token_accuracy": 0.7091836929321289, "num_tokens": 1162433.0, "step": 4222, "train/ce_loss": 1.0733975172042847 }, { "epoch": 0.4174411706545383, "step": 4222, "train/sim_loss": 0.0390625 }, { "epoch": 0.4174411706545383, "step": 4222, "train/total_loss": 0.1464022547006607 }, { "entropy": 8.821057319641113, "epoch": 0.41754004350405377, "mean_token_accuracy": 0.7184684872627258, "num_tokens": 1167752.0, "step": 4223, "train/ce_loss": 0.6913005113601685 }, { "epoch": 0.41754004350405377, "step": 4223, "train/sim_loss": 0.03515625 }, { "epoch": 0.41754004350405377, "step": 4223, "train/total_loss": 0.1042863056063652 }, { "entropy": 9.29666519165039, "epoch": 0.4176389163535693, "mean_token_accuracy": 0.7994186282157898, "num_tokens": 1172913.0, "step": 4224, "train/ce_loss": 0.7657065391540527 }, { "epoch": 0.4176389163535693, "step": 4224, "train/sim_loss": 0.03515625 }, { "epoch": 0.4176389163535693, "step": 4224, "train/total_loss": 0.11172690242528915 }, { "entropy": 9.670294761657715, "epoch": 0.41773778920308485, "mean_token_accuracy": 0.7474226951599121, "num_tokens": 1177937.0, "step": 4225, "train/ce_loss": 0.6599021553993225 }, { "epoch": 0.41773778920308485, "step": 4225, "train/sim_loss": 0.05859375 }, { "epoch": 0.41773778920308485, "step": 4225, "train/total_loss": 0.12458396703004837 }, { "entropy": 8.872920989990234, "epoch": 0.41783666205260034, "mean_token_accuracy": 0.7144653797149658, "num_tokens": 1183207.0, "step": 4226, "train/ce_loss": 1.1690150499343872 }, { "epoch": 0.41783666205260034, "step": 4226, "train/sim_loss": 0.06640625 }, { "epoch": 0.41783666205260034, "step": 4226, "train/total_loss": 0.18330776691436768 }, { "entropy": 8.915066719055176, "epoch": 0.4179355349021159, "mean_token_accuracy": 0.7074999809265137, "num_tokens": 1188463.0, "step": 4227, "train/ce_loss": 0.9992348551750183 }, { "epoch": 0.4179355349021159, "step": 4227, "train/sim_loss": 0.0390625 }, { "epoch": 0.4179355349021159, "step": 4227, "train/total_loss": 0.1389859914779663 }, { "entropy": 10.26605224609375, "epoch": 0.4180344077516314, "mean_token_accuracy": 0.6666666865348816, "num_tokens": 1193198.0, "step": 4228, "train/ce_loss": 1.8083083629608154 }, { "epoch": 0.4180344077516314, "step": 4228, "train/sim_loss": 0.05859375 }, { "epoch": 0.4180344077516314, "step": 4228, "train/total_loss": 0.23942458629608154 }, { "entropy": 9.408763885498047, "epoch": 0.4181332806011469, "mean_token_accuracy": 0.7004950642585754, "num_tokens": 1198058.0, "step": 4229, "train/ce_loss": 2.6393215656280518 }, { "epoch": 0.4181332806011469, "step": 4229, "train/sim_loss": 0.09375 }, { "epoch": 0.4181332806011469, "step": 4229, "train/total_loss": 0.35768216848373413 }, { "entropy": 9.29364013671875, "epoch": 0.41823215345066245, "mean_token_accuracy": 0.7573632597923279, "num_tokens": 1203223.0, "step": 4230, "train/ce_loss": 1.110211730003357 }, { "epoch": 0.41823215345066245, "step": 4230, "train/sim_loss": 0.02734375 }, { "epoch": 0.41823215345066245, "step": 4230, "train/total_loss": 0.13836492598056793 }, { "entropy": 9.558351516723633, "epoch": 0.418331026300178, "mean_token_accuracy": 0.7590163946151733, "num_tokens": 1208246.0, "step": 4231, "train/ce_loss": 1.1962000131607056 }, { "epoch": 0.418331026300178, "step": 4231, "train/sim_loss": 0.12890625 }, { "epoch": 0.418331026300178, "step": 4231, "train/total_loss": 0.24852624535560608 }, { "entropy": 9.735548973083496, "epoch": 0.4184298991496935, "mean_token_accuracy": 0.73046875, "num_tokens": 1213179.0, "step": 4232, "train/ce_loss": 0.875798761844635 }, { "epoch": 0.4184298991496935, "step": 4232, "train/sim_loss": 0.1171875 }, { "epoch": 0.4184298991496935, "step": 4232, "train/total_loss": 0.2047673761844635 }, { "entropy": 9.036125183105469, "epoch": 0.418528771999209, "mean_token_accuracy": 0.7426210045814514, "num_tokens": 1218482.0, "step": 4233, "train/ce_loss": 0.6533321142196655 }, { "epoch": 0.418528771999209, "step": 4233, "train/sim_loss": 0.0390625 }, { "epoch": 0.418528771999209, "step": 4233, "train/total_loss": 0.10439570993185043 }, { "entropy": 9.350015640258789, "epoch": 0.41862764484872456, "mean_token_accuracy": 0.824999988079071, "num_tokens": 1223655.0, "step": 4234, "train/ce_loss": 0.5197864174842834 }, { "epoch": 0.41862764484872456, "step": 4234, "train/sim_loss": 0.04296875 }, { "epoch": 0.41862764484872456, "step": 4234, "train/total_loss": 0.09494739770889282 }, { "entropy": 8.948705673217773, "epoch": 0.41872651769824004, "mean_token_accuracy": 0.7506426572799683, "num_tokens": 1228878.0, "step": 4235, "train/ce_loss": 0.4958691895008087 }, { "epoch": 0.41872651769824004, "step": 4235, "train/sim_loss": 0.03125 }, { "epoch": 0.41872651769824004, "step": 4235, "train/total_loss": 0.08083692193031311 }, { "entropy": 9.025856018066406, "epoch": 0.4188253905477556, "mean_token_accuracy": 0.6833713054656982, "num_tokens": 1234207.0, "step": 4236, "train/ce_loss": 1.5610315799713135 }, { "epoch": 0.4188253905477556, "step": 4236, "train/sim_loss": 0.078125 }, { "epoch": 0.4188253905477556, "step": 4236, "train/total_loss": 0.23422816395759583 }, { "entropy": 8.754049301147461, "epoch": 0.4189242633972711, "mean_token_accuracy": 0.7425025701522827, "num_tokens": 1239659.0, "step": 4237, "train/ce_loss": 0.5289074182510376 }, { "epoch": 0.4189242633972711, "step": 4237, "train/sim_loss": 0.03125 }, { "epoch": 0.4189242633972711, "step": 4237, "train/total_loss": 0.08414074778556824 }, { "entropy": 9.412731170654297, "epoch": 0.4190231362467866, "mean_token_accuracy": 0.7376543283462524, "num_tokens": 1244737.0, "step": 4238, "train/ce_loss": 1.6346156597137451 }, { "epoch": 0.4190231362467866, "step": 4238, "train/sim_loss": 0.0703125 }, { "epoch": 0.4190231362467866, "step": 4238, "train/total_loss": 0.2337740659713745 }, { "entropy": 9.039575576782227, "epoch": 0.41912200909630215, "mean_token_accuracy": 0.7407878041267395, "num_tokens": 1250042.0, "step": 4239, "train/ce_loss": 0.9413285255432129 }, { "epoch": 0.41912200909630215, "step": 4239, "train/sim_loss": 0.08984375 }, { "epoch": 0.41912200909630215, "step": 4239, "train/total_loss": 0.18397660553455353 }, { "epoch": 0.4192208819458177, "grad_norm": 0.8705694079399109, "learning_rate": 8.954408346931712e-06, "loss": 0.144, "step": 4240 }, { "entropy": 9.725851058959961, "epoch": 0.4192208819458177, "mean_token_accuracy": 0.7514340281486511, "num_tokens": 1254948.0, "step": 4240, "train/ce_loss": 1.937035083770752 }, { "epoch": 0.4192208819458177, "step": 4240, "train/sim_loss": 0.05859375 }, { "epoch": 0.4192208819458177, "step": 4240, "train/total_loss": 0.2522972822189331 }, { "entropy": 9.782069206237793, "epoch": 0.4193197547953332, "mean_token_accuracy": 0.6924999952316284, "num_tokens": 1259763.0, "step": 4241, "train/ce_loss": 1.8429124355316162 }, { "epoch": 0.4193197547953332, "step": 4241, "train/sim_loss": 0.11328125 }, { "epoch": 0.4193197547953332, "step": 4241, "train/total_loss": 0.2975724935531616 }, { "entropy": 9.174646377563477, "epoch": 0.4194186276448487, "mean_token_accuracy": 0.6773761510848999, "num_tokens": 1264942.0, "step": 4242, "train/ce_loss": 1.1125339269638062 }, { "epoch": 0.4194186276448487, "step": 4242, "train/sim_loss": 0.0859375 }, { "epoch": 0.4194186276448487, "step": 4242, "train/total_loss": 0.19719089567661285 }, { "entropy": 9.241769790649414, "epoch": 0.41951750049436426, "mean_token_accuracy": 0.774193525314331, "num_tokens": 1270001.0, "step": 4243, "train/ce_loss": 1.6668464013491757e-06 }, { "epoch": 0.41951750049436426, "step": 4243, "train/sim_loss": 0.0625 }, { "epoch": 0.41951750049436426, "step": 4243, "train/total_loss": 0.06250016391277313 }, { "entropy": 8.818593978881836, "epoch": 0.41961637334387974, "mean_token_accuracy": 0.7326139211654663, "num_tokens": 1275281.0, "step": 4244, "train/ce_loss": 0.7373816967010498 }, { "epoch": 0.41961637334387974, "step": 4244, "train/sim_loss": 0.09765625 }, { "epoch": 0.41961637334387974, "step": 4244, "train/total_loss": 0.17139442265033722 }, { "entropy": 8.850120544433594, "epoch": 0.4197152461933953, "mean_token_accuracy": 0.7569988965988159, "num_tokens": 1280574.0, "step": 4245, "train/ce_loss": 0.9278842806816101 }, { "epoch": 0.4197152461933953, "step": 4245, "train/sim_loss": 0.04296875 }, { "epoch": 0.4197152461933953, "step": 4245, "train/total_loss": 0.135757178068161 }, { "entropy": 8.878185272216797, "epoch": 0.4198141190429108, "mean_token_accuracy": 0.7516411542892456, "num_tokens": 1285915.0, "step": 4246, "train/ce_loss": 0.8967689871788025 }, { "epoch": 0.4198141190429108, "step": 4246, "train/sim_loss": 0.0625 }, { "epoch": 0.4198141190429108, "step": 4246, "train/total_loss": 0.1521769016981125 }, { "entropy": 8.870865821838379, "epoch": 0.4199129918924263, "mean_token_accuracy": 0.6959064602851868, "num_tokens": 1291257.0, "step": 4247, "train/ce_loss": 0.5926907658576965 }, { "epoch": 0.4199129918924263, "step": 4247, "train/sim_loss": 0.04296875 }, { "epoch": 0.4199129918924263, "step": 4247, "train/total_loss": 0.10223782807588577 }, { "entropy": 9.483327865600586, "epoch": 0.42001186474194185, "mean_token_accuracy": 0.8003802299499512, "num_tokens": 1296220.0, "step": 4248, "train/ce_loss": 0.8804767727851868 }, { "epoch": 0.42001186474194185, "step": 4248, "train/sim_loss": 0.0234375 }, { "epoch": 0.42001186474194185, "step": 4248, "train/total_loss": 0.11148517578840256 }, { "entropy": 9.329161643981934, "epoch": 0.4201107375914574, "mean_token_accuracy": 0.7123551964759827, "num_tokens": 1301156.0, "step": 4249, "train/ce_loss": 1.829318642616272 }, { "epoch": 0.4201107375914574, "step": 4249, "train/sim_loss": 0.05859375 }, { "epoch": 0.4201107375914574, "step": 4249, "train/total_loss": 0.24152562022209167 }, { "entropy": 9.395343780517578, "epoch": 0.42020961044097294, "mean_token_accuracy": 0.7155025601387024, "num_tokens": 1306213.0, "step": 4250, "train/ce_loss": 1.8414853811264038 }, { "epoch": 0.42020961044097294, "step": 4250, "train/sim_loss": 0.07421875 }, { "epoch": 0.42020961044097294, "step": 4250, "train/total_loss": 0.25836730003356934 }, { "entropy": 9.611597061157227, "epoch": 0.4203084832904884, "mean_token_accuracy": 0.8132635354995728, "num_tokens": 1311164.0, "step": 4251, "train/ce_loss": 0.9971811175346375 }, { "epoch": 0.4203084832904884, "step": 4251, "train/sim_loss": 0.015625 }, { "epoch": 0.4203084832904884, "step": 4251, "train/total_loss": 0.1153431162238121 }, { "entropy": 9.493104934692383, "epoch": 0.42040735614000396, "mean_token_accuracy": 0.7350000143051147, "num_tokens": 1316167.0, "step": 4252, "train/ce_loss": 0.7681896686553955 }, { "epoch": 0.42040735614000396, "step": 4252, "train/sim_loss": 0.10546875 }, { "epoch": 0.42040735614000396, "step": 4252, "train/total_loss": 0.18228772282600403 }, { "entropy": 9.07140827178955, "epoch": 0.4205062289895195, "mean_token_accuracy": 0.7261641025543213, "num_tokens": 1321550.0, "step": 4253, "train/ce_loss": 0.9427207112312317 }, { "epoch": 0.4205062289895195, "step": 4253, "train/sim_loss": 0.046875 }, { "epoch": 0.4205062289895195, "step": 4253, "train/total_loss": 0.14114707708358765 }, { "entropy": 8.669411659240723, "epoch": 0.420605101839035, "mean_token_accuracy": 0.7895287871360779, "num_tokens": 1326977.0, "step": 4254, "train/ce_loss": 0.7670964002609253 }, { "epoch": 0.420605101839035, "step": 4254, "train/sim_loss": 0.02734375 }, { "epoch": 0.420605101839035, "step": 4254, "train/total_loss": 0.10405339300632477 }, { "entropy": 9.015830993652344, "epoch": 0.42070397468855053, "mean_token_accuracy": 0.7553443908691406, "num_tokens": 1332264.0, "step": 4255, "train/ce_loss": 0.7133545279502869 }, { "epoch": 0.42070397468855053, "step": 4255, "train/sim_loss": 0.0390625 }, { "epoch": 0.42070397468855053, "step": 4255, "train/total_loss": 0.11039795726537704 }, { "entropy": 9.15105152130127, "epoch": 0.42080284753806607, "mean_token_accuracy": 0.7324749827384949, "num_tokens": 1337405.0, "step": 4256, "train/ce_loss": 1.7517278365630773e-06 }, { "epoch": 0.42080284753806607, "step": 4256, "train/sim_loss": 0.0546875 }, { "epoch": 0.42080284753806607, "step": 4256, "train/total_loss": 0.05468767508864403 }, { "entropy": 9.284518241882324, "epoch": 0.42090172038758156, "mean_token_accuracy": 0.717783510684967, "num_tokens": 1342615.0, "step": 4257, "train/ce_loss": 0.5699039101600647 }, { "epoch": 0.42090172038758156, "step": 4257, "train/sim_loss": 0.02734375 }, { "epoch": 0.42090172038758156, "step": 4257, "train/total_loss": 0.08433414250612259 }, { "entropy": 8.994129180908203, "epoch": 0.4210005932370971, "mean_token_accuracy": 0.7823129296302795, "num_tokens": 1347959.0, "step": 4258, "train/ce_loss": 0.3339727818965912 }, { "epoch": 0.4210005932370971, "step": 4258, "train/sim_loss": 0.12109375 }, { "epoch": 0.4210005932370971, "step": 4258, "train/total_loss": 0.15449103713035583 }, { "entropy": 9.126960754394531, "epoch": 0.42109946608661264, "mean_token_accuracy": 0.7601476311683655, "num_tokens": 1353254.0, "step": 4259, "train/ce_loss": 0.7362488508224487 }, { "epoch": 0.42109946608661264, "step": 4259, "train/sim_loss": 0.0625 }, { "epoch": 0.42109946608661264, "step": 4259, "train/total_loss": 0.1361248791217804 }, { "epoch": 0.4211983389361281, "grad_norm": 0.6947392225265503, "learning_rate": 8.949463482173763e-06, "loss": 0.1431, "step": 4260 }, { "entropy": 9.342792510986328, "epoch": 0.4211983389361281, "mean_token_accuracy": 0.82343989610672, "num_tokens": 1358361.0, "step": 4260, "train/ce_loss": 0.482095330953598 }, { "epoch": 0.4211983389361281, "step": 4260, "train/sim_loss": 0.078125 }, { "epoch": 0.4211983389361281, "step": 4260, "train/total_loss": 0.1263345330953598 }, { "entropy": 9.324569702148438, "epoch": 0.42129721178564367, "mean_token_accuracy": 0.7637231349945068, "num_tokens": 1363211.0, "step": 4261, "train/ce_loss": 1.7761139869689941 }, { "epoch": 0.42129721178564367, "step": 4261, "train/sim_loss": 0.109375 }, { "epoch": 0.42129721178564367, "step": 4261, "train/total_loss": 0.28698641061782837 }, { "entropy": 9.17307186126709, "epoch": 0.4213960846351592, "mean_token_accuracy": 0.7001338601112366, "num_tokens": 1368428.0, "step": 4262, "train/ce_loss": 0.8786683082580566 }, { "epoch": 0.4213960846351592, "step": 4262, "train/sim_loss": 0.0546875 }, { "epoch": 0.4213960846351592, "step": 4262, "train/total_loss": 0.14255434274673462 }, { "entropy": 9.052959442138672, "epoch": 0.4214949574846747, "mean_token_accuracy": 0.7310426831245422, "num_tokens": 1373711.0, "step": 4263, "train/ce_loss": 0.8060519695281982 }, { "epoch": 0.4214949574846747, "step": 4263, "train/sim_loss": 0.0546875 }, { "epoch": 0.4214949574846747, "step": 4263, "train/total_loss": 0.13529270887374878 }, { "entropy": 9.800721168518066, "epoch": 0.42159383033419023, "mean_token_accuracy": 0.7330595254898071, "num_tokens": 1378592.0, "step": 4264, "train/ce_loss": 1.6116140386657207e-06 }, { "epoch": 0.42159383033419023, "step": 4264, "train/sim_loss": 0.0234375 }, { "epoch": 0.42159383033419023, "step": 4264, "train/total_loss": 0.023437662050127983 }, { "entropy": 8.876489639282227, "epoch": 0.4216927031837058, "mean_token_accuracy": 0.7310606241226196, "num_tokens": 1383858.0, "step": 4265, "train/ce_loss": 0.5489982962608337 }, { "epoch": 0.4216927031837058, "step": 4265, "train/sim_loss": 0.0390625 }, { "epoch": 0.4216927031837058, "step": 4265, "train/total_loss": 0.09396232664585114 }, { "entropy": 9.334985733032227, "epoch": 0.42179157603322126, "mean_token_accuracy": 0.746347963809967, "num_tokens": 1389034.0, "step": 4266, "train/ce_loss": 1.1328575055813417e-06 }, { "epoch": 0.42179157603322126, "step": 4266, "train/sim_loss": 0.0234375 }, { "epoch": 0.42179157603322126, "step": 4266, "train/total_loss": 0.023437613621354103 }, { "entropy": 9.425538063049316, "epoch": 0.4218904488827368, "mean_token_accuracy": 0.6842105388641357, "num_tokens": 1394115.0, "step": 4267, "train/ce_loss": 2.066435172309866e-06 }, { "epoch": 0.4218904488827368, "step": 4267, "train/sim_loss": 0.015625 }, { "epoch": 0.4218904488827368, "step": 4267, "train/total_loss": 0.015625206753611565 }, { "entropy": 9.141765594482422, "epoch": 0.42198932173225234, "mean_token_accuracy": 0.7256410121917725, "num_tokens": 1399353.0, "step": 4268, "train/ce_loss": 0.8137962222099304 }, { "epoch": 0.42198932173225234, "step": 4268, "train/sim_loss": 0.078125 }, { "epoch": 0.42198932173225234, "step": 4268, "train/total_loss": 0.15950462222099304 }, { "entropy": 9.34766960144043, "epoch": 0.42208819458176783, "mean_token_accuracy": 0.7127329111099243, "num_tokens": 1404438.0, "step": 4269, "train/ce_loss": 1.6777497648945427e-06 }, { "epoch": 0.42208819458176783, "step": 4269, "train/sim_loss": 0.01953125 }, { "epoch": 0.42208819458176783, "step": 4269, "train/total_loss": 0.01953141763806343 }, { "entropy": 9.749281883239746, "epoch": 0.42218706743128337, "mean_token_accuracy": 0.7399576902389526, "num_tokens": 1409340.0, "step": 4270, "train/ce_loss": 0.8929030299186707 }, { "epoch": 0.42218706743128337, "step": 4270, "train/sim_loss": 0.046875 }, { "epoch": 0.42218706743128337, "step": 4270, "train/total_loss": 0.1361653059720993 }, { "entropy": 9.200545310974121, "epoch": 0.4222859402807989, "mean_token_accuracy": 0.7155067324638367, "num_tokens": 1414669.0, "step": 4271, "train/ce_loss": 0.6485669016838074 }, { "epoch": 0.4222859402807989, "step": 4271, "train/sim_loss": 0.046875 }, { "epoch": 0.4222859402807989, "step": 4271, "train/total_loss": 0.11173169314861298 }, { "entropy": 8.720863342285156, "epoch": 0.4223848131303144, "mean_token_accuracy": 0.6928879022598267, "num_tokens": 1420107.0, "step": 4272, "train/ce_loss": 1.576992154121399 }, { "epoch": 0.4223848131303144, "step": 4272, "train/sim_loss": 0.1171875 }, { "epoch": 0.4223848131303144, "step": 4272, "train/total_loss": 0.27488672733306885 }, { "entropy": 10.158475875854492, "epoch": 0.42248368597982994, "mean_token_accuracy": 0.7209302186965942, "num_tokens": 1424809.0, "step": 4273, "train/ce_loss": 3.0050621262489585e-06 }, { "epoch": 0.42248368597982994, "step": 4273, "train/sim_loss": 0.04296875 }, { "epoch": 0.42248368597982994, "step": 4273, "train/total_loss": 0.042969051748514175 }, { "entropy": 9.288434982299805, "epoch": 0.4225825588293455, "mean_token_accuracy": 0.7050359845161438, "num_tokens": 1429909.0, "step": 4274, "train/ce_loss": 0.8266807794570923 }, { "epoch": 0.4225825588293455, "step": 4274, "train/sim_loss": 0.1171875 }, { "epoch": 0.4225825588293455, "step": 4274, "train/total_loss": 0.19985558092594147 }, { "entropy": 9.331501007080078, "epoch": 0.42268143167886096, "mean_token_accuracy": 0.7471751570701599, "num_tokens": 1435102.0, "step": 4275, "train/ce_loss": 1.366287112236023 }, { "epoch": 0.42268143167886096, "step": 4275, "train/sim_loss": 0.078125 }, { "epoch": 0.42268143167886096, "step": 4275, "train/total_loss": 0.21475371718406677 }, { "entropy": 9.115610122680664, "epoch": 0.4227803045283765, "mean_token_accuracy": 0.6718562841415405, "num_tokens": 1440426.0, "step": 4276, "train/ce_loss": 0.6482602953910828 }, { "epoch": 0.4227803045283765, "step": 4276, "train/sim_loss": 0.0703125 }, { "epoch": 0.4227803045283765, "step": 4276, "train/total_loss": 0.13513854146003723 }, { "entropy": 8.894174575805664, "epoch": 0.42287917737789205, "mean_token_accuracy": 0.710010290145874, "num_tokens": 1445880.0, "step": 4277, "train/ce_loss": 0.5530392527580261 }, { "epoch": 0.42287917737789205, "step": 4277, "train/sim_loss": 0.0859375 }, { "epoch": 0.42287917737789205, "step": 4277, "train/total_loss": 0.1412414312362671 }, { "entropy": 8.725154876708984, "epoch": 0.42297805022740753, "mean_token_accuracy": 0.75208580493927, "num_tokens": 1451202.0, "step": 4278, "train/ce_loss": 0.7852307558059692 }, { "epoch": 0.42297805022740753, "step": 4278, "train/sim_loss": 0.02734375 }, { "epoch": 0.42297805022740753, "step": 4278, "train/total_loss": 0.10586682707071304 }, { "entropy": 8.779287338256836, "epoch": 0.4230769230769231, "mean_token_accuracy": 0.7813440561294556, "num_tokens": 1456646.0, "step": 4279, "train/ce_loss": 0.8656463623046875 }, { "epoch": 0.4230769230769231, "step": 4279, "train/sim_loss": 0.02734375 }, { "epoch": 0.4230769230769231, "step": 4279, "train/total_loss": 0.11390838772058487 }, { "epoch": 0.4231757959264386, "grad_norm": 0.6826988458633423, "learning_rate": 8.944518617415815e-06, "loss": 0.1495, "step": 4280 }, { "entropy": 8.758885383605957, "epoch": 0.4231757959264386, "mean_token_accuracy": 0.7528089880943298, "num_tokens": 1462266.0, "step": 4280, "train/ce_loss": 0.8109167814254761 }, { "epoch": 0.4231757959264386, "step": 4280, "train/sim_loss": 0.1015625 }, { "epoch": 0.4231757959264386, "step": 4280, "train/total_loss": 0.18265417218208313 }, { "entropy": 9.55448055267334, "epoch": 0.4232746687759541, "mean_token_accuracy": 0.7217805981636047, "num_tokens": 1467293.0, "step": 4281, "train/ce_loss": 2.5310926048405236e-06 }, { "epoch": 0.4232746687759541, "step": 4281, "train/sim_loss": 0.06640625 }, { "epoch": 0.4232746687759541, "step": 4281, "train/total_loss": 0.0664065033197403 }, { "entropy": 9.334671974182129, "epoch": 0.42337354162546964, "mean_token_accuracy": 0.7144754528999329, "num_tokens": 1472462.0, "step": 4282, "train/ce_loss": 3.7898300888628e-06 }, { "epoch": 0.42337354162546964, "step": 4282, "train/sim_loss": 0.046875 }, { "epoch": 0.42337354162546964, "step": 4282, "train/total_loss": 0.04687537997961044 }, { "entropy": 9.370052337646484, "epoch": 0.4234724144749852, "mean_token_accuracy": 0.8143274784088135, "num_tokens": 1477587.0, "step": 4283, "train/ce_loss": 0.45702382922172546 }, { "epoch": 0.4234724144749852, "step": 4283, "train/sim_loss": 0.03515625 }, { "epoch": 0.4234724144749852, "step": 4283, "train/total_loss": 0.08085863292217255 }, { "entropy": 9.209254264831543, "epoch": 0.42357128732450067, "mean_token_accuracy": 0.7461928725242615, "num_tokens": 1482840.0, "step": 4284, "train/ce_loss": 0.7366754412651062 }, { "epoch": 0.42357128732450067, "step": 4284, "train/sim_loss": 0.0859375 }, { "epoch": 0.42357128732450067, "step": 4284, "train/total_loss": 0.15960505604743958 }, { "entropy": 9.531827926635742, "epoch": 0.4236701601740162, "mean_token_accuracy": 0.7326732873916626, "num_tokens": 1487818.0, "step": 4285, "train/ce_loss": 2.4422627120657125e-06 }, { "epoch": 0.4236701601740162, "step": 4285, "train/sim_loss": 0.0625 }, { "epoch": 0.4236701601740162, "step": 4285, "train/total_loss": 0.0625002458691597 }, { "entropy": 8.964927673339844, "epoch": 0.42376903302353175, "mean_token_accuracy": 0.7400000095367432, "num_tokens": 1493137.0, "step": 4286, "train/ce_loss": 1.0670584440231323 }, { "epoch": 0.42376903302353175, "step": 4286, "train/sim_loss": 0.07421875 }, { "epoch": 0.42376903302353175, "step": 4286, "train/total_loss": 0.18092459440231323 }, { "entropy": 9.768739700317383, "epoch": 0.42386790587304723, "mean_token_accuracy": 0.6830986142158508, "num_tokens": 1497995.0, "step": 4287, "train/ce_loss": 2.5790059566497803 }, { "epoch": 0.42386790587304723, "step": 4287, "train/sim_loss": 0.1015625 }, { "epoch": 0.42386790587304723, "step": 4287, "train/total_loss": 0.359463095664978 }, { "entropy": 8.72038459777832, "epoch": 0.4239667787225628, "mean_token_accuracy": 0.7435653209686279, "num_tokens": 1503577.0, "step": 4288, "train/ce_loss": 0.43054938316345215 }, { "epoch": 0.4239667787225628, "step": 4288, "train/sim_loss": 0.01953125 }, { "epoch": 0.4239667787225628, "step": 4288, "train/total_loss": 0.06258618831634521 }, { "entropy": 9.362419128417969, "epoch": 0.4240656515720783, "mean_token_accuracy": 0.7729323506355286, "num_tokens": 1508689.0, "step": 4289, "train/ce_loss": 1.0027906894683838 }, { "epoch": 0.4240656515720783, "step": 4289, "train/sim_loss": 0.0703125 }, { "epoch": 0.4240656515720783, "step": 4289, "train/total_loss": 0.1705915629863739 }, { "entropy": 9.029733657836914, "epoch": 0.4241645244215938, "mean_token_accuracy": 0.6946848034858704, "num_tokens": 1513942.0, "step": 4290, "train/ce_loss": 1.0345737934112549 }, { "epoch": 0.4241645244215938, "step": 4290, "train/sim_loss": 0.09375 }, { "epoch": 0.4241645244215938, "step": 4290, "train/total_loss": 0.19720739126205444 }, { "entropy": 9.854422569274902, "epoch": 0.42426339727110934, "mean_token_accuracy": 0.6627358198165894, "num_tokens": 1518783.0, "step": 4291, "train/ce_loss": 2.7461735498945927e-06 }, { "epoch": 0.42426339727110934, "step": 4291, "train/sim_loss": 0.0625 }, { "epoch": 0.42426339727110934, "step": 4291, "train/total_loss": 0.06250027567148209 }, { "entropy": 8.991606712341309, "epoch": 0.4243622701206249, "mean_token_accuracy": 0.758368194103241, "num_tokens": 1524212.0, "step": 4292, "train/ce_loss": 0.4276218116283417 }, { "epoch": 0.4243622701206249, "step": 4292, "train/sim_loss": 0.01953125 }, { "epoch": 0.4243622701206249, "step": 4292, "train/total_loss": 0.06229343265295029 }, { "entropy": 9.449363708496094, "epoch": 0.4244611429701404, "mean_token_accuracy": 0.791540801525116, "num_tokens": 1529334.0, "step": 4293, "train/ce_loss": 0.5087490677833557 }, { "epoch": 0.4244611429701404, "step": 4293, "train/sim_loss": 0.015625 }, { "epoch": 0.4244611429701404, "step": 4293, "train/total_loss": 0.06649990379810333 }, { "entropy": 8.642279624938965, "epoch": 0.4245600158196559, "mean_token_accuracy": 0.7688171863555908, "num_tokens": 1534765.0, "step": 4294, "train/ce_loss": 0.7547642588615417 }, { "epoch": 0.4245600158196559, "step": 4294, "train/sim_loss": 0.078125 }, { "epoch": 0.4245600158196559, "step": 4294, "train/total_loss": 0.15360143780708313 }, { "entropy": 9.176563262939453, "epoch": 0.42465888866917145, "mean_token_accuracy": 0.7251908183097839, "num_tokens": 1539943.0, "step": 4295, "train/ce_loss": 1.0541415214538574 }, { "epoch": 0.42465888866917145, "step": 4295, "train/sim_loss": 0.046875 }, { "epoch": 0.42465888866917145, "step": 4295, "train/total_loss": 0.15228915214538574 }, { "entropy": 9.577062606811523, "epoch": 0.424757761518687, "mean_token_accuracy": 0.7086092829704285, "num_tokens": 1545004.0, "step": 4296, "train/ce_loss": 1.3417185544967651 }, { "epoch": 0.424757761518687, "step": 4296, "train/sim_loss": 0.05078125 }, { "epoch": 0.424757761518687, "step": 4296, "train/total_loss": 0.18495310842990875 }, { "entropy": 9.091691970825195, "epoch": 0.4248566343682025, "mean_token_accuracy": 0.7349665760993958, "num_tokens": 1550347.0, "step": 4297, "train/ce_loss": 1.865136742591858 }, { "epoch": 0.4248566343682025, "step": 4297, "train/sim_loss": 0.05859375 }, { "epoch": 0.4248566343682025, "step": 4297, "train/total_loss": 0.24510742723941803 }, { "entropy": 8.988584518432617, "epoch": 0.424955507217718, "mean_token_accuracy": 0.7348777055740356, "num_tokens": 1555663.0, "step": 4298, "train/ce_loss": 1.3776904344558716 }, { "epoch": 0.424955507217718, "step": 4298, "train/sim_loss": 0.0546875 }, { "epoch": 0.424955507217718, "step": 4298, "train/total_loss": 0.19245654344558716 }, { "entropy": 9.33218002319336, "epoch": 0.42505438006723356, "mean_token_accuracy": 0.6875, "num_tokens": 1560946.0, "step": 4299, "train/ce_loss": 0.9154267907142639 }, { "epoch": 0.42505438006723356, "step": 4299, "train/sim_loss": 0.1171875 }, { "epoch": 0.42505438006723356, "step": 4299, "train/total_loss": 0.20873019099235535 }, { "epoch": 0.42515325291674905, "grad_norm": 0.8996347188949585, "learning_rate": 8.939573752657866e-06, "loss": 0.1505, "step": 4300 }, { "entropy": 9.435907363891602, "epoch": 0.42515325291674905, "mean_token_accuracy": 0.7455782294273376, "num_tokens": 1566058.0, "step": 4300, "train/ce_loss": 0.8440230488777161 }, { "epoch": 0.42515325291674905, "step": 4300, "train/sim_loss": 0.0390625 }, { "epoch": 0.42515325291674905, "step": 4300, "train/total_loss": 0.12346480786800385 }, { "entropy": 9.667113304138184, "epoch": 0.4252521257662646, "mean_token_accuracy": 0.7288732528686523, "num_tokens": 1571051.0, "step": 4301, "train/ce_loss": 1.0589016675949097 }, { "epoch": 0.4252521257662646, "step": 4301, "train/sim_loss": 0.06640625 }, { "epoch": 0.4252521257662646, "step": 4301, "train/total_loss": 0.1722964197397232 }, { "entropy": 8.825034141540527, "epoch": 0.42535099861578013, "mean_token_accuracy": 0.8170731663703918, "num_tokens": 1576471.0, "step": 4302, "train/ce_loss": 0.45814794301986694 }, { "epoch": 0.42535099861578013, "step": 4302, "train/sim_loss": 0.02734375 }, { "epoch": 0.42535099861578013, "step": 4302, "train/total_loss": 0.07315854728221893 }, { "entropy": 8.938833236694336, "epoch": 0.4254498714652956, "mean_token_accuracy": 0.6905537247657776, "num_tokens": 1581861.0, "step": 4303, "train/ce_loss": 1.396524429321289 }, { "epoch": 0.4254498714652956, "step": 4303, "train/sim_loss": 0.07421875 }, { "epoch": 0.4254498714652956, "step": 4303, "train/total_loss": 0.21387119591236115 }, { "entropy": 9.038439750671387, "epoch": 0.42554874431481116, "mean_token_accuracy": 0.7139713764190674, "num_tokens": 1587255.0, "step": 4304, "train/ce_loss": 0.8129435777664185 }, { "epoch": 0.42554874431481116, "step": 4304, "train/sim_loss": 0.05859375 }, { "epoch": 0.42554874431481116, "step": 4304, "train/total_loss": 0.13988810777664185 }, { "entropy": 9.519832611083984, "epoch": 0.4256476171643267, "mean_token_accuracy": 0.7729195952415466, "num_tokens": 1592395.0, "step": 4305, "train/ce_loss": 0.9152183532714844 }, { "epoch": 0.4256476171643267, "step": 4305, "train/sim_loss": 0.0625 }, { "epoch": 0.4256476171643267, "step": 4305, "train/total_loss": 0.15402182936668396 }, { "entropy": 9.419305801391602, "epoch": 0.4257464900138422, "mean_token_accuracy": 0.7150837779045105, "num_tokens": 1597364.0, "step": 4306, "train/ce_loss": 2.3006148239801405e-06 }, { "epoch": 0.4257464900138422, "step": 4306, "train/sim_loss": 0.0234375 }, { "epoch": 0.4257464900138422, "step": 4306, "train/total_loss": 0.023437730967998505 }, { "entropy": 9.322014808654785, "epoch": 0.4258453628633577, "mean_token_accuracy": 0.7663185596466064, "num_tokens": 1602577.0, "step": 4307, "train/ce_loss": 0.5652867555618286 }, { "epoch": 0.4258453628633577, "step": 4307, "train/sim_loss": 0.05078125 }, { "epoch": 0.4258453628633577, "step": 4307, "train/total_loss": 0.10730992257595062 }, { "entropy": 9.617816925048828, "epoch": 0.42594423571287326, "mean_token_accuracy": 0.7290909290313721, "num_tokens": 1607532.0, "step": 4308, "train/ce_loss": 0.9149979948997498 }, { "epoch": 0.42594423571287326, "step": 4308, "train/sim_loss": 0.0703125 }, { "epoch": 0.42594423571287326, "step": 4308, "train/total_loss": 0.16181230545043945 }, { "entropy": 9.620207786560059, "epoch": 0.42604310856238875, "mean_token_accuracy": 0.7269681692123413, "num_tokens": 1612599.0, "step": 4309, "train/ce_loss": 1.7339175939559937 }, { "epoch": 0.42604310856238875, "step": 4309, "train/sim_loss": 0.06640625 }, { "epoch": 0.42604310856238875, "step": 4309, "train/total_loss": 0.23979800939559937 }, { "entropy": 9.4932222366333, "epoch": 0.4261419814119043, "mean_token_accuracy": 0.7774524092674255, "num_tokens": 1617730.0, "step": 4310, "train/ce_loss": 0.7773568630218506 }, { "epoch": 0.4261419814119043, "step": 4310, "train/sim_loss": 0.046875 }, { "epoch": 0.4261419814119043, "step": 4310, "train/total_loss": 0.12461068481206894 }, { "entropy": 9.982711791992188, "epoch": 0.42624085426141983, "mean_token_accuracy": 0.6958763003349304, "num_tokens": 1622529.0, "step": 4311, "train/ce_loss": 2.5695910453796387 }, { "epoch": 0.42624085426141983, "step": 4311, "train/sim_loss": 0.046875 }, { "epoch": 0.42624085426141983, "step": 4311, "train/total_loss": 0.30383411049842834 }, { "entropy": 9.458898544311523, "epoch": 0.4263397271109353, "mean_token_accuracy": 0.766153872013092, "num_tokens": 1627604.0, "step": 4312, "train/ce_loss": 4.623173936124658e-06 }, { "epoch": 0.4263397271109353, "step": 4312, "train/sim_loss": 0.0625 }, { "epoch": 0.4263397271109353, "step": 4312, "train/total_loss": 0.06250046193599701 }, { "entropy": 8.879953384399414, "epoch": 0.42643859996045086, "mean_token_accuracy": 0.7508342862129211, "num_tokens": 1632977.0, "step": 4313, "train/ce_loss": 0.8125248551368713 }, { "epoch": 0.42643859996045086, "step": 4313, "train/sim_loss": 0.0625 }, { "epoch": 0.42643859996045086, "step": 4313, "train/total_loss": 0.14375248551368713 }, { "entropy": 9.249433517456055, "epoch": 0.4265374728099664, "mean_token_accuracy": 0.7860082387924194, "num_tokens": 1638216.0, "step": 4314, "train/ce_loss": 0.9679844379425049 }, { "epoch": 0.4265374728099664, "step": 4314, "train/sim_loss": 0.04296875 }, { "epoch": 0.4265374728099664, "step": 4314, "train/total_loss": 0.13976719975471497 }, { "entropy": 9.489304542541504, "epoch": 0.4266363456594819, "mean_token_accuracy": 0.7419354915618896, "num_tokens": 1643212.0, "step": 4315, "train/ce_loss": 2.237540456917486e-06 }, { "epoch": 0.4266363456594819, "step": 4315, "train/sim_loss": 0.0390625 }, { "epoch": 0.4266363456594819, "step": 4315, "train/total_loss": 0.03906272351741791 }, { "entropy": 9.41923713684082, "epoch": 0.4267352185089974, "mean_token_accuracy": 0.7376000285148621, "num_tokens": 1648301.0, "step": 4316, "train/ce_loss": 3.542704234860139e-06 }, { "epoch": 0.4267352185089974, "step": 4316, "train/sim_loss": 0.046875 }, { "epoch": 0.4267352185089974, "step": 4316, "train/total_loss": 0.046875353902578354 }, { "entropy": 9.066547393798828, "epoch": 0.42683409135851297, "mean_token_accuracy": 0.7229050397872925, "num_tokens": 1653821.0, "step": 4317, "train/ce_loss": 0.4514893889427185 }, { "epoch": 0.42683409135851297, "step": 4317, "train/sim_loss": 0.10546875 }, { "epoch": 0.42683409135851297, "step": 4317, "train/total_loss": 0.15061768889427185 }, { "entropy": 9.121603965759277, "epoch": 0.42693296420802845, "mean_token_accuracy": 0.698019802570343, "num_tokens": 1659091.0, "step": 4318, "train/ce_loss": 1.3111400604248047 }, { "epoch": 0.42693296420802845, "step": 4318, "train/sim_loss": 0.07421875 }, { "epoch": 0.42693296420802845, "step": 4318, "train/total_loss": 0.20533275604248047 }, { "entropy": 9.072944641113281, "epoch": 0.427031837057544, "mean_token_accuracy": 0.6868250370025635, "num_tokens": 1664538.0, "step": 4319, "train/ce_loss": 0.5692331194877625 }, { "epoch": 0.427031837057544, "step": 4319, "train/sim_loss": 0.07421875 }, { "epoch": 0.427031837057544, "step": 4319, "train/total_loss": 0.13114206492900848 }, { "epoch": 0.42713070990705954, "grad_norm": 0.729607880115509, "learning_rate": 8.934628887899916e-06, "loss": 0.1417, "step": 4320 }, { "entropy": 9.600298881530762, "epoch": 0.42713070990705954, "mean_token_accuracy": 0.7055920958518982, "num_tokens": 1669550.0, "step": 4320, "train/ce_loss": 1.7043551206588745 }, { "epoch": 0.42713070990705954, "step": 4320, "train/sim_loss": 0.0625 }, { "epoch": 0.42713070990705954, "step": 4320, "train/total_loss": 0.23293551802635193 }, { "entropy": 8.856857299804688, "epoch": 0.427229582756575, "mean_token_accuracy": 0.7200000286102295, "num_tokens": 1674829.0, "step": 4321, "train/ce_loss": 0.8603862524032593 }, { "epoch": 0.427229582756575, "step": 4321, "train/sim_loss": 0.046875 }, { "epoch": 0.427229582756575, "step": 4321, "train/total_loss": 0.13291361927986145 }, { "entropy": 9.070219039916992, "epoch": 0.42732845560609056, "mean_token_accuracy": 0.7744966149330139, "num_tokens": 1680009.0, "step": 4322, "train/ce_loss": 0.5782650709152222 }, { "epoch": 0.42732845560609056, "step": 4322, "train/sim_loss": 0.046875 }, { "epoch": 0.42732845560609056, "step": 4322, "train/total_loss": 0.10470150411128998 }, { "entropy": 8.989773750305176, "epoch": 0.4274273284556061, "mean_token_accuracy": 0.7233532667160034, "num_tokens": 1685396.0, "step": 4323, "train/ce_loss": 0.9252065420150757 }, { "epoch": 0.4274273284556061, "step": 4323, "train/sim_loss": 0.0546875 }, { "epoch": 0.4274273284556061, "step": 4323, "train/total_loss": 0.14720815420150757 }, { "entropy": 9.072076797485352, "epoch": 0.4275262013051216, "mean_token_accuracy": 0.7448200583457947, "num_tokens": 1690761.0, "step": 4324, "train/ce_loss": 0.8619343638420105 }, { "epoch": 0.4275262013051216, "step": 4324, "train/sim_loss": 0.1015625 }, { "epoch": 0.4275262013051216, "step": 4324, "train/total_loss": 0.18775594234466553 }, { "entropy": 9.002479553222656, "epoch": 0.42762507415463713, "mean_token_accuracy": 0.7133890986442566, "num_tokens": 1696123.0, "step": 4325, "train/ce_loss": 1.1929075717926025 }, { "epoch": 0.42762507415463713, "step": 4325, "train/sim_loss": 0.04296875 }, { "epoch": 0.42762507415463713, "step": 4325, "train/total_loss": 0.1622595191001892 }, { "entropy": 8.67890739440918, "epoch": 0.42772394700415267, "mean_token_accuracy": 0.800000011920929, "num_tokens": 1701618.0, "step": 4326, "train/ce_loss": 0.34899601340293884 }, { "epoch": 0.42772394700415267, "step": 4326, "train/sim_loss": 0.05859375 }, { "epoch": 0.42772394700415267, "step": 4326, "train/total_loss": 0.09349335730075836 }, { "entropy": 9.073480606079102, "epoch": 0.42782281985366816, "mean_token_accuracy": 0.7301775217056274, "num_tokens": 1706933.0, "step": 4327, "train/ce_loss": 0.7541035413742065 }, { "epoch": 0.42782281985366816, "step": 4327, "train/sim_loss": 0.0390625 }, { "epoch": 0.42782281985366816, "step": 4327, "train/total_loss": 0.11447285860776901 }, { "entropy": 8.725275039672852, "epoch": 0.4279216927031837, "mean_token_accuracy": 0.7545367479324341, "num_tokens": 1712425.0, "step": 4328, "train/ce_loss": 0.6070301532745361 }, { "epoch": 0.4279216927031837, "step": 4328, "train/sim_loss": 0.01953125 }, { "epoch": 0.4279216927031837, "step": 4328, "train/total_loss": 0.08023426681756973 }, { "entropy": 9.50638198852539, "epoch": 0.42802056555269924, "mean_token_accuracy": 0.7130559682846069, "num_tokens": 1717579.0, "step": 4329, "train/ce_loss": 3.69126155419508e-06 }, { "epoch": 0.42802056555269924, "step": 4329, "train/sim_loss": 0.05859375 }, { "epoch": 0.42802056555269924, "step": 4329, "train/total_loss": 0.05859411880373955 }, { "entropy": 9.952825546264648, "epoch": 0.4281194384022147, "mean_token_accuracy": 0.7051281929016113, "num_tokens": 1722420.0, "step": 4330, "train/ce_loss": 3.110465968347853e-06 }, { "epoch": 0.4281194384022147, "step": 4330, "train/sim_loss": 0.02734375 }, { "epoch": 0.4281194384022147, "step": 4330, "train/total_loss": 0.02734406106173992 }, { "entropy": 8.836921691894531, "epoch": 0.42821831125173027, "mean_token_accuracy": 0.7042410969734192, "num_tokens": 1727834.0, "step": 4331, "train/ce_loss": 1.8216817378997803 }, { "epoch": 0.42821831125173027, "step": 4331, "train/sim_loss": 0.078125 }, { "epoch": 0.42821831125173027, "step": 4331, "train/total_loss": 0.260293185710907 }, { "entropy": 9.085926055908203, "epoch": 0.4283171841012458, "mean_token_accuracy": 0.7299363017082214, "num_tokens": 1733113.0, "step": 4332, "train/ce_loss": 0.8461929559707642 }, { "epoch": 0.4283171841012458, "step": 4332, "train/sim_loss": 0.078125 }, { "epoch": 0.4283171841012458, "step": 4332, "train/total_loss": 0.16274429857730865 }, { "entropy": 9.921957015991211, "epoch": 0.42841605695076135, "mean_token_accuracy": 0.7659574747085571, "num_tokens": 1737805.0, "step": 4333, "train/ce_loss": 1.589762806892395 }, { "epoch": 0.42841605695076135, "step": 4333, "train/sim_loss": 0.078125 }, { "epoch": 0.42841605695076135, "step": 4333, "train/total_loss": 0.23710128664970398 }, { "entropy": 9.177725791931152, "epoch": 0.42851492980027683, "mean_token_accuracy": 0.7027707695960999, "num_tokens": 1743085.0, "step": 4334, "train/ce_loss": 1.3322070837020874 }, { "epoch": 0.42851492980027683, "step": 4334, "train/sim_loss": 0.05078125 }, { "epoch": 0.42851492980027683, "step": 4334, "train/total_loss": 0.18400196731090546 }, { "entropy": 8.909058570861816, "epoch": 0.4286138026497924, "mean_token_accuracy": 0.7259668707847595, "num_tokens": 1748419.0, "step": 4335, "train/ce_loss": 0.4872305989265442 }, { "epoch": 0.4286138026497924, "step": 4335, "train/sim_loss": 0.01953125 }, { "epoch": 0.4286138026497924, "step": 4335, "train/total_loss": 0.06825430691242218 }, { "entropy": 8.963470458984375, "epoch": 0.4287126754993079, "mean_token_accuracy": 0.7875136733055115, "num_tokens": 1753772.0, "step": 4336, "train/ce_loss": 0.7437989711761475 }, { "epoch": 0.4287126754993079, "step": 4336, "train/sim_loss": 0.11328125 }, { "epoch": 0.4287126754993079, "step": 4336, "train/total_loss": 0.18766114115715027 }, { "entropy": 9.545357704162598, "epoch": 0.4288115483488234, "mean_token_accuracy": 0.6684684753417969, "num_tokens": 1758747.0, "step": 4337, "train/ce_loss": 4.981597157893702e-06 }, { "epoch": 0.4288115483488234, "step": 4337, "train/sim_loss": 0.08203125 }, { "epoch": 0.4288115483488234, "step": 4337, "train/total_loss": 0.0820317491889 }, { "entropy": 8.90612506866455, "epoch": 0.42891042119833894, "mean_token_accuracy": 0.7734042406082153, "num_tokens": 1764338.0, "step": 4338, "train/ce_loss": 0.468054860830307 }, { "epoch": 0.42891042119833894, "step": 4338, "train/sim_loss": 0.08984375 }, { "epoch": 0.42891042119833894, "step": 4338, "train/total_loss": 0.1366492360830307 }, { "entropy": 9.132699966430664, "epoch": 0.4290092940478545, "mean_token_accuracy": 0.6747159361839294, "num_tokens": 1769519.0, "step": 4339, "train/ce_loss": 0.6216713190078735 }, { "epoch": 0.4290092940478545, "step": 4339, "train/sim_loss": 0.0703125 }, { "epoch": 0.4290092940478545, "step": 4339, "train/total_loss": 0.13247963786125183 }, { "epoch": 0.42910816689736997, "grad_norm": 0.904602587223053, "learning_rate": 8.929684023141968e-06, "loss": 0.1537, "step": 4340 }, { "entropy": 8.995580673217773, "epoch": 0.42910816689736997, "mean_token_accuracy": 0.6687631011009216, "num_tokens": 1774949.0, "step": 4340, "train/ce_loss": 0.8476549983024597 }, { "epoch": 0.42910816689736997, "step": 4340, "train/sim_loss": 0.0390625 }, { "epoch": 0.42910816689736997, "step": 4340, "train/total_loss": 0.12382800132036209 }, { "entropy": 8.944741249084473, "epoch": 0.4292070397468855, "mean_token_accuracy": 0.707975447177887, "num_tokens": 1780237.0, "step": 4341, "train/ce_loss": 1.2588831186294556 }, { "epoch": 0.4292070397468855, "step": 4341, "train/sim_loss": 0.08203125 }, { "epoch": 0.4292070397468855, "step": 4341, "train/total_loss": 0.20791956782341003 }, { "entropy": 9.220817565917969, "epoch": 0.42930591259640105, "mean_token_accuracy": 0.6861979365348816, "num_tokens": 1785491.0, "step": 4342, "train/ce_loss": 1.0993037223815918 }, { "epoch": 0.42930591259640105, "step": 4342, "train/sim_loss": 0.05078125 }, { "epoch": 0.42930591259640105, "step": 4342, "train/total_loss": 0.1607116162776947 }, { "entropy": 9.002422332763672, "epoch": 0.42940478544591654, "mean_token_accuracy": 0.7678571343421936, "num_tokens": 1790793.0, "step": 4343, "train/ce_loss": 0.8925429582595825 }, { "epoch": 0.42940478544591654, "step": 4343, "train/sim_loss": 0.0546875 }, { "epoch": 0.42940478544591654, "step": 4343, "train/total_loss": 0.14394178986549377 }, { "entropy": 8.725269317626953, "epoch": 0.4295036582954321, "mean_token_accuracy": 0.7561797499656677, "num_tokens": 1796128.0, "step": 4344, "train/ce_loss": 0.9660356044769287 }, { "epoch": 0.4295036582954321, "step": 4344, "train/sim_loss": 0.06640625 }, { "epoch": 0.4295036582954321, "step": 4344, "train/total_loss": 0.16300982236862183 }, { "entropy": 8.95657730102539, "epoch": 0.4296025311449476, "mean_token_accuracy": 0.7015834450721741, "num_tokens": 1801405.0, "step": 4345, "train/ce_loss": 0.7507035136222839 }, { "epoch": 0.4296025311449476, "step": 4345, "train/sim_loss": 0.1015625 }, { "epoch": 0.4296025311449476, "step": 4345, "train/total_loss": 0.1766328513622284 }, { "entropy": 9.237102508544922, "epoch": 0.4297014039944631, "mean_token_accuracy": 0.6799007654190063, "num_tokens": 1806678.0, "step": 4346, "train/ce_loss": 1.0277491807937622 }, { "epoch": 0.4297014039944631, "step": 4346, "train/sim_loss": 0.0625 }, { "epoch": 0.4297014039944631, "step": 4346, "train/total_loss": 0.16527491807937622 }, { "entropy": 9.291812896728516, "epoch": 0.42980027684397865, "mean_token_accuracy": 0.6936115026473999, "num_tokens": 1811933.0, "step": 4347, "train/ce_loss": 0.6227318048477173 }, { "epoch": 0.42980027684397865, "step": 4347, "train/sim_loss": 0.04296875 }, { "epoch": 0.42980027684397865, "step": 4347, "train/total_loss": 0.10524193197488785 }, { "entropy": 8.870292663574219, "epoch": 0.4298991496934942, "mean_token_accuracy": 0.7926315665245056, "num_tokens": 1817358.0, "step": 4348, "train/ce_loss": 0.9479617476463318 }, { "epoch": 0.4298991496934942, "step": 4348, "train/sim_loss": 0.0625 }, { "epoch": 0.4298991496934942, "step": 4348, "train/total_loss": 0.15729618072509766 }, { "entropy": 9.563199996948242, "epoch": 0.4299980225430097, "mean_token_accuracy": 0.7112902998924255, "num_tokens": 1822386.0, "step": 4349, "train/ce_loss": 1.2201199531555176 }, { "epoch": 0.4299980225430097, "step": 4349, "train/sim_loss": 0.08984375 }, { "epoch": 0.4299980225430097, "step": 4349, "train/total_loss": 0.21185573935508728 }, { "entropy": 9.326417922973633, "epoch": 0.4300968953925252, "mean_token_accuracy": 0.6901595592498779, "num_tokens": 1827622.0, "step": 4350, "train/ce_loss": 0.9021326303482056 }, { "epoch": 0.4300968953925252, "step": 4350, "train/sim_loss": 0.05078125 }, { "epoch": 0.4300968953925252, "step": 4350, "train/total_loss": 0.14099451899528503 }, { "entropy": 9.50046443939209, "epoch": 0.43019576824204075, "mean_token_accuracy": 0.7844112515449524, "num_tokens": 1832669.0, "step": 4351, "train/ce_loss": 0.6640459895133972 }, { "epoch": 0.43019576824204075, "step": 4351, "train/sim_loss": 0.015625 }, { "epoch": 0.43019576824204075, "step": 4351, "train/total_loss": 0.08202960342168808 }, { "entropy": 8.944676399230957, "epoch": 0.43029464109155624, "mean_token_accuracy": 0.7688098549842834, "num_tokens": 1837863.0, "step": 4352, "train/ce_loss": 0.4522591531276703 }, { "epoch": 0.43029464109155624, "step": 4352, "train/sim_loss": 0.08984375 }, { "epoch": 0.43029464109155624, "step": 4352, "train/total_loss": 0.13506966829299927 }, { "entropy": 9.45470905303955, "epoch": 0.4303935139410718, "mean_token_accuracy": 0.8481012582778931, "num_tokens": 1842920.0, "step": 4353, "train/ce_loss": 1.6522606074431678e-06 }, { "epoch": 0.4303935139410718, "step": 4353, "train/sim_loss": 0.05859375 }, { "epoch": 0.4303935139410718, "step": 4353, "train/total_loss": 0.05859391391277313 }, { "entropy": 9.531600952148438, "epoch": 0.4304923867905873, "mean_token_accuracy": 0.7777777910232544, "num_tokens": 1847968.0, "step": 4354, "train/ce_loss": 0.3012424409389496 }, { "epoch": 0.4304923867905873, "step": 4354, "train/sim_loss": 0.0859375 }, { "epoch": 0.4304923867905873, "step": 4354, "train/total_loss": 0.1160617470741272 }, { "entropy": 9.445377349853516, "epoch": 0.4305912596401028, "mean_token_accuracy": 0.7200621962547302, "num_tokens": 1853092.0, "step": 4355, "train/ce_loss": 1.6494358777999878 }, { "epoch": 0.4305912596401028, "step": 4355, "train/sim_loss": 0.0703125 }, { "epoch": 0.4305912596401028, "step": 4355, "train/total_loss": 0.23525609076023102 }, { "entropy": 8.41752815246582, "epoch": 0.43069013248961835, "mean_token_accuracy": 0.7389340400695801, "num_tokens": 1858729.0, "step": 4356, "train/ce_loss": 1.3742927312850952 }, { "epoch": 0.43069013248961835, "step": 4356, "train/sim_loss": 0.05859375 }, { "epoch": 0.43069013248961835, "step": 4356, "train/total_loss": 0.19602303206920624 }, { "entropy": 9.577810287475586, "epoch": 0.4307890053391339, "mean_token_accuracy": 0.7698541283607483, "num_tokens": 1863928.0, "step": 4357, "train/ce_loss": 0.918196439743042 }, { "epoch": 0.4307890053391339, "step": 4357, "train/sim_loss": 0.09375 }, { "epoch": 0.4307890053391339, "step": 4357, "train/total_loss": 0.1855696439743042 }, { "entropy": 8.904155731201172, "epoch": 0.4308878781886494, "mean_token_accuracy": 0.7665505409240723, "num_tokens": 1869201.0, "step": 4358, "train/ce_loss": 0.5582960844039917 }, { "epoch": 0.4308878781886494, "step": 4358, "train/sim_loss": 0.0546875 }, { "epoch": 0.4308878781886494, "step": 4358, "train/total_loss": 0.11051711440086365 }, { "entropy": 9.43018627166748, "epoch": 0.4309867510381649, "mean_token_accuracy": 0.7713884711265564, "num_tokens": 1874546.0, "step": 4359, "train/ce_loss": 1.0824960470199585 }, { "epoch": 0.4309867510381649, "step": 4359, "train/sim_loss": 0.12890625 }, { "epoch": 0.4309867510381649, "step": 4359, "train/total_loss": 0.23715585470199585 }, { "epoch": 0.43108562388768046, "grad_norm": 0.731275737285614, "learning_rate": 8.924739158384019e-06, "loss": 0.1463, "step": 4360 }, { "entropy": 9.604022026062012, "epoch": 0.43108562388768046, "mean_token_accuracy": 0.7116564512252808, "num_tokens": 1879636.0, "step": 4360, "train/ce_loss": 1.1926167011260986 }, { "epoch": 0.43108562388768046, "step": 4360, "train/sim_loss": 0.11328125 }, { "epoch": 0.43108562388768046, "step": 4360, "train/total_loss": 0.23254293203353882 }, { "entropy": 8.92547607421875, "epoch": 0.43118449673719594, "mean_token_accuracy": 0.7513157725334167, "num_tokens": 1884836.0, "step": 4361, "train/ce_loss": 0.5982657670974731 }, { "epoch": 0.43118449673719594, "step": 4361, "train/sim_loss": 0.078125 }, { "epoch": 0.43118449673719594, "step": 4361, "train/total_loss": 0.1379515826702118 }, { "entropy": 8.663128852844238, "epoch": 0.4312833695867115, "mean_token_accuracy": 0.7497414946556091, "num_tokens": 1890228.0, "step": 4362, "train/ce_loss": 1.0565632581710815 }, { "epoch": 0.4312833695867115, "step": 4362, "train/sim_loss": 0.078125 }, { "epoch": 0.4312833695867115, "step": 4362, "train/total_loss": 0.18378132581710815 }, { "entropy": 8.881734848022461, "epoch": 0.431382242436227, "mean_token_accuracy": 0.7706422209739685, "num_tokens": 1895571.0, "step": 4363, "train/ce_loss": 0.6953521966934204 }, { "epoch": 0.431382242436227, "step": 4363, "train/sim_loss": 0.03515625 }, { "epoch": 0.431382242436227, "step": 4363, "train/total_loss": 0.10469146817922592 }, { "entropy": 8.88752555847168, "epoch": 0.4314811152857425, "mean_token_accuracy": 0.7144457101821899, "num_tokens": 1900906.0, "step": 4364, "train/ce_loss": 0.595391571521759 }, { "epoch": 0.4314811152857425, "step": 4364, "train/sim_loss": 0.0234375 }, { "epoch": 0.4314811152857425, "step": 4364, "train/total_loss": 0.08297665417194366 }, { "entropy": 8.735705375671387, "epoch": 0.43157998813525805, "mean_token_accuracy": 0.7157360315322876, "num_tokens": 1906313.0, "step": 4365, "train/ce_loss": 1.6949741840362549 }, { "epoch": 0.43157998813525805, "step": 4365, "train/sim_loss": 0.15234375 }, { "epoch": 0.43157998813525805, "step": 4365, "train/total_loss": 0.32184118032455444 }, { "entropy": 9.029319763183594, "epoch": 0.4316788609847736, "mean_token_accuracy": 0.6994818449020386, "num_tokens": 1911645.0, "step": 4366, "train/ce_loss": 1.2209709882736206 }, { "epoch": 0.4316788609847736, "step": 4366, "train/sim_loss": 0.15625 }, { "epoch": 0.4316788609847736, "step": 4366, "train/total_loss": 0.27834710478782654 }, { "entropy": 9.10180377960205, "epoch": 0.4317777338342891, "mean_token_accuracy": 0.7441540360450745, "num_tokens": 1916849.0, "step": 4367, "train/ce_loss": 0.9811787009239197 }, { "epoch": 0.4317777338342891, "step": 4367, "train/sim_loss": 0.07421875 }, { "epoch": 0.4317777338342891, "step": 4367, "train/total_loss": 0.1723366230726242 }, { "entropy": 9.039645195007324, "epoch": 0.4318766066838046, "mean_token_accuracy": 0.6883604526519775, "num_tokens": 1922098.0, "step": 4368, "train/ce_loss": 0.7841721177101135 }, { "epoch": 0.4318766066838046, "step": 4368, "train/sim_loss": 0.03125 }, { "epoch": 0.4318766066838046, "step": 4368, "train/total_loss": 0.10966721177101135 }, { "entropy": 8.731348037719727, "epoch": 0.43197547953332016, "mean_token_accuracy": 0.7702381014823914, "num_tokens": 1927427.0, "step": 4369, "train/ce_loss": 0.7687444686889648 }, { "epoch": 0.43197547953332016, "step": 4369, "train/sim_loss": 0.04296875 }, { "epoch": 0.43197547953332016, "step": 4369, "train/total_loss": 0.11984319984912872 }, { "entropy": 8.865472793579102, "epoch": 0.43207435238283565, "mean_token_accuracy": 0.7451971769332886, "num_tokens": 1932875.0, "step": 4370, "train/ce_loss": 0.7522867321968079 }, { "epoch": 0.43207435238283565, "step": 4370, "train/sim_loss": 0.01953125 }, { "epoch": 0.43207435238283565, "step": 4370, "train/total_loss": 0.09475992619991302 }, { "entropy": 9.40871810913086, "epoch": 0.4321732252323512, "mean_token_accuracy": 0.6855828166007996, "num_tokens": 1937990.0, "step": 4371, "train/ce_loss": 1.3217101097106934 }, { "epoch": 0.4321732252323512, "step": 4371, "train/sim_loss": 0.05859375 }, { "epoch": 0.4321732252323512, "step": 4371, "train/total_loss": 0.19076476991176605 }, { "entropy": 8.581138610839844, "epoch": 0.43227209808186673, "mean_token_accuracy": 0.764762818813324, "num_tokens": 1943478.0, "step": 4372, "train/ce_loss": 0.5864698886871338 }, { "epoch": 0.43227209808186673, "step": 4372, "train/sim_loss": 0.0234375 }, { "epoch": 0.43227209808186673, "step": 4372, "train/total_loss": 0.08208449184894562 }, { "entropy": 9.11349105834961, "epoch": 0.4323709709313822, "mean_token_accuracy": 0.753947377204895, "num_tokens": 1948706.0, "step": 4373, "train/ce_loss": 0.831530749797821 }, { "epoch": 0.4323709709313822, "step": 4373, "train/sim_loss": 0.078125 }, { "epoch": 0.4323709709313822, "step": 4373, "train/total_loss": 0.16127806901931763 }, { "entropy": 9.241002082824707, "epoch": 0.43246984378089776, "mean_token_accuracy": 0.7766624689102173, "num_tokens": 1953911.0, "step": 4374, "train/ce_loss": 2.589736368463491e-06 }, { "epoch": 0.43246984378089776, "step": 4374, "train/sim_loss": 0.02734375 }, { "epoch": 0.43246984378089776, "step": 4374, "train/total_loss": 0.027344008907675743 }, { "entropy": 9.518906593322754, "epoch": 0.4325687166304133, "mean_token_accuracy": 0.7632450461387634, "num_tokens": 1958934.0, "step": 4375, "train/ce_loss": 1.8838533163070679 }, { "epoch": 0.4325687166304133, "step": 4375, "train/sim_loss": 0.05859375 }, { "epoch": 0.4325687166304133, "step": 4375, "train/total_loss": 0.24697908759117126 }, { "entropy": 8.939252853393555, "epoch": 0.43266758947992884, "mean_token_accuracy": 0.7582128643989563, "num_tokens": 1964186.0, "step": 4376, "train/ce_loss": 0.8972609043121338 }, { "epoch": 0.43266758947992884, "step": 4376, "train/sim_loss": 0.05078125 }, { "epoch": 0.43266758947992884, "step": 4376, "train/total_loss": 0.14050734043121338 }, { "entropy": 9.001517295837402, "epoch": 0.4327664623294443, "mean_token_accuracy": 0.7444987893104553, "num_tokens": 1969436.0, "step": 4377, "train/ce_loss": 0.8296143412590027 }, { "epoch": 0.4327664623294443, "step": 4377, "train/sim_loss": 0.03125 }, { "epoch": 0.4327664623294443, "step": 4377, "train/total_loss": 0.11421143263578415 }, { "entropy": 9.012580871582031, "epoch": 0.43286533517895986, "mean_token_accuracy": 0.7726218104362488, "num_tokens": 1974738.0, "step": 4378, "train/ce_loss": 0.8610218167304993 }, { "epoch": 0.43286533517895986, "step": 4378, "train/sim_loss": 0.015625 }, { "epoch": 0.43286533517895986, "step": 4378, "train/total_loss": 0.10172718018293381 }, { "entropy": 9.393769264221191, "epoch": 0.4329642080284754, "mean_token_accuracy": 0.7551928758621216, "num_tokens": 1979917.0, "step": 4379, "train/ce_loss": 0.5917772054672241 }, { "epoch": 0.4329642080284754, "step": 4379, "train/sim_loss": 0.03125 }, { "epoch": 0.4329642080284754, "step": 4379, "train/total_loss": 0.09042772650718689 }, { "epoch": 0.4330630808779909, "grad_norm": 0.8429343700408936, "learning_rate": 8.91979429362607e-06, "loss": 0.1395, "step": 4380 }, { "entropy": 9.748490333557129, "epoch": 0.4330630808779909, "mean_token_accuracy": 0.8401727676391602, "num_tokens": 1984804.0, "step": 4380, "train/ce_loss": 0.794469952583313 }, { "epoch": 0.4330630808779909, "step": 4380, "train/sim_loss": 0.0234375 }, { "epoch": 0.4330630808779909, "step": 4380, "train/total_loss": 0.10288449376821518 }, { "entropy": 8.892608642578125, "epoch": 0.43316195372750643, "mean_token_accuracy": 0.7439824938774109, "num_tokens": 1990143.0, "step": 4381, "train/ce_loss": 0.8625227808952332 }, { "epoch": 0.43316195372750643, "step": 4381, "train/sim_loss": 0.015625 }, { "epoch": 0.43316195372750643, "step": 4381, "train/total_loss": 0.10187727957963943 }, { "entropy": 9.310935974121094, "epoch": 0.433260826577022, "mean_token_accuracy": 0.7307692170143127, "num_tokens": 1995295.0, "step": 4382, "train/ce_loss": 1.0909391641616821 }, { "epoch": 0.433260826577022, "step": 4382, "train/sim_loss": 0.0390625 }, { "epoch": 0.433260826577022, "step": 4382, "train/total_loss": 0.14815641939640045 }, { "entropy": 8.98115062713623, "epoch": 0.43335969942653746, "mean_token_accuracy": 0.7350427508354187, "num_tokens": 2000609.0, "step": 4383, "train/ce_loss": 1.1102403402328491 }, { "epoch": 0.43335969942653746, "step": 4383, "train/sim_loss": 0.03515625 }, { "epoch": 0.43335969942653746, "step": 4383, "train/total_loss": 0.14618028700351715 }, { "entropy": 8.781379699707031, "epoch": 0.433458572276053, "mean_token_accuracy": 0.7820796370506287, "num_tokens": 2006028.0, "step": 4384, "train/ce_loss": 0.9370577931404114 }, { "epoch": 0.433458572276053, "step": 4384, "train/sim_loss": 0.015625 }, { "epoch": 0.433458572276053, "step": 4384, "train/total_loss": 0.10933078080415726 }, { "entropy": 9.149648666381836, "epoch": 0.43355744512556854, "mean_token_accuracy": 0.7650063633918762, "num_tokens": 2011440.0, "step": 4385, "train/ce_loss": 0.5976223945617676 }, { "epoch": 0.43355744512556854, "step": 4385, "train/sim_loss": 0.02734375 }, { "epoch": 0.43355744512556854, "step": 4385, "train/total_loss": 0.08710598945617676 }, { "entropy": 9.48122501373291, "epoch": 0.433656317975084, "mean_token_accuracy": 0.687821626663208, "num_tokens": 2016434.0, "step": 4386, "train/ce_loss": 1.4672801853521378e-06 }, { "epoch": 0.433656317975084, "step": 4386, "train/sim_loss": 0.0234375 }, { "epoch": 0.433656317975084, "step": 4386, "train/total_loss": 0.02343764714896679 }, { "entropy": 9.54377555847168, "epoch": 0.43375519082459957, "mean_token_accuracy": 0.7569444179534912, "num_tokens": 2021561.0, "step": 4387, "train/ce_loss": 0.7149176597595215 }, { "epoch": 0.43375519082459957, "step": 4387, "train/sim_loss": 0.01953125 }, { "epoch": 0.43375519082459957, "step": 4387, "train/total_loss": 0.0910230204463005 }, { "entropy": 9.069219589233398, "epoch": 0.4338540636741151, "mean_token_accuracy": 0.7426108121871948, "num_tokens": 2026994.0, "step": 4388, "train/ce_loss": 0.4522751569747925 }, { "epoch": 0.4338540636741151, "step": 4388, "train/sim_loss": 0.05078125 }, { "epoch": 0.4338540636741151, "step": 4388, "train/total_loss": 0.09600876271724701 }, { "entropy": 8.795243263244629, "epoch": 0.4339529365236306, "mean_token_accuracy": 0.7527812123298645, "num_tokens": 2032231.0, "step": 4389, "train/ce_loss": 0.7872169613838196 }, { "epoch": 0.4339529365236306, "step": 4389, "train/sim_loss": 0.06640625 }, { "epoch": 0.4339529365236306, "step": 4389, "train/total_loss": 0.14512795209884644 }, { "entropy": 9.059467315673828, "epoch": 0.43405180937314614, "mean_token_accuracy": 0.7508571147918701, "num_tokens": 2037591.0, "step": 4390, "train/ce_loss": 0.9338961243629456 }, { "epoch": 0.43405180937314614, "step": 4390, "train/sim_loss": 0.03125 }, { "epoch": 0.43405180937314614, "step": 4390, "train/total_loss": 0.1246396154165268 }, { "entropy": 9.725197792053223, "epoch": 0.4341506822226617, "mean_token_accuracy": 0.7208872437477112, "num_tokens": 2042575.0, "step": 4391, "train/ce_loss": 1.528663992881775 }, { "epoch": 0.4341506822226617, "step": 4391, "train/sim_loss": 0.0625 }, { "epoch": 0.4341506822226617, "step": 4391, "train/total_loss": 0.2153664082288742 }, { "entropy": 9.387313842773438, "epoch": 0.43424955507217716, "mean_token_accuracy": 0.7525773048400879, "num_tokens": 2047701.0, "step": 4392, "train/ce_loss": 0.5898157954216003 }, { "epoch": 0.43424955507217716, "step": 4392, "train/sim_loss": 0.0390625 }, { "epoch": 0.43424955507217716, "step": 4392, "train/total_loss": 0.09804408252239227 }, { "entropy": 8.884873390197754, "epoch": 0.4343484279216927, "mean_token_accuracy": 0.7536231875419617, "num_tokens": 2053147.0, "step": 4393, "train/ce_loss": 0.7696712613105774 }, { "epoch": 0.4343484279216927, "step": 4393, "train/sim_loss": 0.04296875 }, { "epoch": 0.4343484279216927, "step": 4393, "train/total_loss": 0.11993587762117386 }, { "entropy": 8.939346313476562, "epoch": 0.43444730077120824, "mean_token_accuracy": 0.7422360181808472, "num_tokens": 2058579.0, "step": 4394, "train/ce_loss": 0.7860175371170044 }, { "epoch": 0.43444730077120824, "step": 4394, "train/sim_loss": 0.07421875 }, { "epoch": 0.43444730077120824, "step": 4394, "train/total_loss": 0.15282049775123596 }, { "entropy": 9.010202407836914, "epoch": 0.43454617362072373, "mean_token_accuracy": 0.6784840822219849, "num_tokens": 2063801.0, "step": 4395, "train/ce_loss": 2.32151460647583 }, { "epoch": 0.43454617362072373, "step": 4395, "train/sim_loss": 0.07421875 }, { "epoch": 0.43454617362072373, "step": 4395, "train/total_loss": 0.30637019872665405 }, { "entropy": 9.382925033569336, "epoch": 0.43464504647023927, "mean_token_accuracy": 0.7042062282562256, "num_tokens": 2069058.0, "step": 4396, "train/ce_loss": 1.4940139055252075 }, { "epoch": 0.43464504647023927, "step": 4396, "train/sim_loss": 0.12109375 }, { "epoch": 0.43464504647023927, "step": 4396, "train/total_loss": 0.27049514651298523 }, { "entropy": 9.745540618896484, "epoch": 0.4347439193197548, "mean_token_accuracy": 0.7615230679512024, "num_tokens": 2074018.0, "step": 4397, "train/ce_loss": 2.7070050236943644e-06 }, { "epoch": 0.4347439193197548, "step": 4397, "train/sim_loss": 0.01953125 }, { "epoch": 0.4347439193197548, "step": 4397, "train/total_loss": 0.01953152008354664 }, { "entropy": 8.980493545532227, "epoch": 0.4348427921692703, "mean_token_accuracy": 0.7279236316680908, "num_tokens": 2079314.0, "step": 4398, "train/ce_loss": 1.029262900352478 }, { "epoch": 0.4348427921692703, "step": 4398, "train/sim_loss": 0.03125 }, { "epoch": 0.4348427921692703, "step": 4398, "train/total_loss": 0.13417628407478333 }, { "entropy": 9.474332809448242, "epoch": 0.43494166501878584, "mean_token_accuracy": 0.7854729890823364, "num_tokens": 2084509.0, "step": 4399, "train/ce_loss": 1.1530356407165527 }, { "epoch": 0.43494166501878584, "step": 4399, "train/sim_loss": 0.08203125 }, { "epoch": 0.43494166501878584, "step": 4399, "train/total_loss": 0.19733482599258423 }, { "epoch": 0.4350405378683014, "grad_norm": 0.7104471921920776, "learning_rate": 8.914849428868121e-06, "loss": 0.1403, "step": 4400 }, { "entropy": 9.791597366333008, "epoch": 0.4350405378683014, "mean_token_accuracy": 0.8131313323974609, "num_tokens": 2089348.0, "step": 4400, "train/ce_loss": 1.4423400163650513 }, { "epoch": 0.4350405378683014, "step": 4400, "train/sim_loss": 0.0390625 }, { "epoch": 0.4350405378683014, "step": 4400, "train/total_loss": 0.18329650163650513 }, { "entropy": 8.359624862670898, "epoch": 0.43513941071781687, "mean_token_accuracy": 0.7226890921592712, "num_tokens": 2094914.0, "step": 4401, "train/ce_loss": 1.043874740600586 }, { "epoch": 0.43513941071781687, "step": 4401, "train/sim_loss": 0.0625 }, { "epoch": 0.43513941071781687, "step": 4401, "train/total_loss": 0.16688747704029083 }, { "entropy": 9.358190536499023, "epoch": 0.4352382835673324, "mean_token_accuracy": 0.6929460763931274, "num_tokens": 2100115.0, "step": 4402, "train/ce_loss": 2.138016700744629 }, { "epoch": 0.4352382835673324, "step": 4402, "train/sim_loss": 0.0859375 }, { "epoch": 0.4352382835673324, "step": 4402, "train/total_loss": 0.29973918199539185 }, { "entropy": 9.750505447387695, "epoch": 0.43533715641684795, "mean_token_accuracy": 0.7675438523292542, "num_tokens": 2104988.0, "step": 4403, "train/ce_loss": 0.8936744928359985 }, { "epoch": 0.43533715641684795, "step": 4403, "train/sim_loss": 0.046875 }, { "epoch": 0.43533715641684795, "step": 4403, "train/total_loss": 0.13624244928359985 }, { "entropy": 9.501214027404785, "epoch": 0.43543602926636343, "mean_token_accuracy": 0.7021604776382446, "num_tokens": 2110080.0, "step": 4404, "train/ce_loss": 1.308893084526062 }, { "epoch": 0.43543602926636343, "step": 4404, "train/sim_loss": 0.03125 }, { "epoch": 0.43543602926636343, "step": 4404, "train/total_loss": 0.16213931143283844 }, { "entropy": 9.20964241027832, "epoch": 0.435534902115879, "mean_token_accuracy": 0.7559171319007874, "num_tokens": 2115387.0, "step": 4405, "train/ce_loss": 2.1970881789457053e-06 }, { "epoch": 0.435534902115879, "step": 4405, "train/sim_loss": 0.03515625 }, { "epoch": 0.435534902115879, "step": 4405, "train/total_loss": 0.03515646979212761 }, { "entropy": 8.804841995239258, "epoch": 0.4356337749653945, "mean_token_accuracy": 0.7403433322906494, "num_tokens": 2120795.0, "step": 4406, "train/ce_loss": 0.4965910017490387 }, { "epoch": 0.4356337749653945, "step": 4406, "train/sim_loss": 0.0625 }, { "epoch": 0.4356337749653945, "step": 4406, "train/total_loss": 0.11215910315513611 }, { "entropy": 8.861150741577148, "epoch": 0.43573264781491, "mean_token_accuracy": 0.7491785287857056, "num_tokens": 2126175.0, "step": 4407, "train/ce_loss": 0.5222152471542358 }, { "epoch": 0.43573264781491, "step": 4407, "train/sim_loss": 0.03125 }, { "epoch": 0.43573264781491, "step": 4407, "train/total_loss": 0.08347152173519135 }, { "entropy": 9.679935455322266, "epoch": 0.43583152066442554, "mean_token_accuracy": 0.6915887594223022, "num_tokens": 2131153.0, "step": 4408, "train/ce_loss": 1.8620322942733765 }, { "epoch": 0.43583152066442554, "step": 4408, "train/sim_loss": 0.09375 }, { "epoch": 0.43583152066442554, "step": 4408, "train/total_loss": 0.2799532413482666 }, { "entropy": 9.366382598876953, "epoch": 0.4359303935139411, "mean_token_accuracy": 0.7406876683235168, "num_tokens": 2136351.0, "step": 4409, "train/ce_loss": 0.778279721736908 }, { "epoch": 0.4359303935139411, "step": 4409, "train/sim_loss": 0.07421875 }, { "epoch": 0.4359303935139411, "step": 4409, "train/total_loss": 0.15204672515392303 }, { "entropy": 9.035079956054688, "epoch": 0.43602926636345657, "mean_token_accuracy": 0.7981545329093933, "num_tokens": 2141681.0, "step": 4410, "train/ce_loss": 0.4029959738254547 }, { "epoch": 0.43602926636345657, "step": 4410, "train/sim_loss": 0.04296875 }, { "epoch": 0.43602926636345657, "step": 4410, "train/total_loss": 0.08326834440231323 }, { "entropy": 9.631559371948242, "epoch": 0.4361281392129721, "mean_token_accuracy": 0.7532894611358643, "num_tokens": 2146726.0, "step": 4411, "train/ce_loss": 0.7673465609550476 }, { "epoch": 0.4361281392129721, "step": 4411, "train/sim_loss": 0.05078125 }, { "epoch": 0.4361281392129721, "step": 4411, "train/total_loss": 0.12751591205596924 }, { "entropy": 9.441352844238281, "epoch": 0.43622701206248765, "mean_token_accuracy": 0.7333333492279053, "num_tokens": 2151798.0, "step": 4412, "train/ce_loss": 1.9639170169830322 }, { "epoch": 0.43622701206248765, "step": 4412, "train/sim_loss": 0.0390625 }, { "epoch": 0.43622701206248765, "step": 4412, "train/total_loss": 0.23545420169830322 }, { "entropy": 8.656509399414062, "epoch": 0.43632588491200314, "mean_token_accuracy": 0.7327365875244141, "num_tokens": 2157047.0, "step": 4413, "train/ce_loss": 0.7206571698188782 }, { "epoch": 0.43632588491200314, "step": 4413, "train/sim_loss": 0.0703125 }, { "epoch": 0.43632588491200314, "step": 4413, "train/total_loss": 0.14237821102142334 }, { "entropy": 9.049860000610352, "epoch": 0.4364247577615187, "mean_token_accuracy": 0.7032418847084045, "num_tokens": 2162305.0, "step": 4414, "train/ce_loss": 0.868624210357666 }, { "epoch": 0.4364247577615187, "step": 4414, "train/sim_loss": 0.015625 }, { "epoch": 0.4364247577615187, "step": 4414, "train/total_loss": 0.10248742252588272 }, { "entropy": 9.341044425964355, "epoch": 0.4365236306110342, "mean_token_accuracy": 0.7268128395080566, "num_tokens": 2167355.0, "step": 4415, "train/ce_loss": 1.2310768365859985 }, { "epoch": 0.4365236306110342, "step": 4415, "train/sim_loss": 0.05859375 }, { "epoch": 0.4365236306110342, "step": 4415, "train/total_loss": 0.1817014366388321 }, { "entropy": 8.874947547912598, "epoch": 0.43662250346054976, "mean_token_accuracy": 0.7199017405509949, "num_tokens": 2172702.0, "step": 4416, "train/ce_loss": 0.9515064358711243 }, { "epoch": 0.43662250346054976, "step": 4416, "train/sim_loss": 0.04296875 }, { "epoch": 0.43662250346054976, "step": 4416, "train/total_loss": 0.1381193995475769 }, { "entropy": 8.804052352905273, "epoch": 0.43672137631006525, "mean_token_accuracy": 0.732833981513977, "num_tokens": 2178037.0, "step": 4417, "train/ce_loss": 1.0769851207733154 }, { "epoch": 0.43672137631006525, "step": 4417, "train/sim_loss": 0.109375 }, { "epoch": 0.43672137631006525, "step": 4417, "train/total_loss": 0.21707351505756378 }, { "entropy": 9.064543724060059, "epoch": 0.4368202491595808, "mean_token_accuracy": 0.6856435537338257, "num_tokens": 2183303.0, "step": 4418, "train/ce_loss": 0.9741151928901672 }, { "epoch": 0.4368202491595808, "step": 4418, "train/sim_loss": 0.08984375 }, { "epoch": 0.4368202491595808, "step": 4418, "train/total_loss": 0.18725526332855225 }, { "entropy": 9.179658889770508, "epoch": 0.43691912200909633, "mean_token_accuracy": 0.7299168705940247, "num_tokens": 2188446.0, "step": 4419, "train/ce_loss": 0.6126939058303833 }, { "epoch": 0.43691912200909633, "step": 4419, "train/sim_loss": 0.0546875 }, { "epoch": 0.43691912200909633, "step": 4419, "train/total_loss": 0.11595688760280609 }, { "epoch": 0.4370179948586118, "grad_norm": 0.7463781833648682, "learning_rate": 8.909904564110172e-06, "loss": 0.1475, "step": 4420 }, { "entropy": 9.411497116088867, "epoch": 0.4370179948586118, "mean_token_accuracy": 0.7161290049552917, "num_tokens": 2193535.0, "step": 4420, "train/ce_loss": 1.4859209060668945 }, { "epoch": 0.4370179948586118, "step": 4420, "train/sim_loss": 0.0546875 }, { "epoch": 0.4370179948586118, "step": 4420, "train/total_loss": 0.20327959954738617 }, { "entropy": 9.652908325195312, "epoch": 0.43711686770812735, "mean_token_accuracy": 0.7307692170143127, "num_tokens": 2198343.0, "step": 4421, "train/ce_loss": 1.3125678300857544 }, { "epoch": 0.43711686770812735, "step": 4421, "train/sim_loss": 0.0703125 }, { "epoch": 0.43711686770812735, "step": 4421, "train/total_loss": 0.20156928896903992 }, { "entropy": 9.79127311706543, "epoch": 0.4372157405576429, "mean_token_accuracy": 0.7169373631477356, "num_tokens": 2203158.0, "step": 4422, "train/ce_loss": 3.1071267127990723 }, { "epoch": 0.4372157405576429, "step": 4422, "train/sim_loss": 0.08203125 }, { "epoch": 0.4372157405576429, "step": 4422, "train/total_loss": 0.39274391531944275 }, { "entropy": 9.302753448486328, "epoch": 0.4373146134071584, "mean_token_accuracy": 0.7614424228668213, "num_tokens": 2208297.0, "step": 4423, "train/ce_loss": 1.1405718326568604 }, { "epoch": 0.4373146134071584, "step": 4423, "train/sim_loss": 0.08984375 }, { "epoch": 0.4373146134071584, "step": 4423, "train/total_loss": 0.20390093326568604 }, { "entropy": 8.907490730285645, "epoch": 0.4374134862566739, "mean_token_accuracy": 0.7476525902748108, "num_tokens": 2213654.0, "step": 4424, "train/ce_loss": 0.684638500213623 }, { "epoch": 0.4374134862566739, "step": 4424, "train/sim_loss": 0.0625 }, { "epoch": 0.4374134862566739, "step": 4424, "train/total_loss": 0.13096386194229126 }, { "entropy": 8.990495681762695, "epoch": 0.43751235910618946, "mean_token_accuracy": 0.7110214829444885, "num_tokens": 2218858.0, "step": 4425, "train/ce_loss": 7.0683554440620355e-06 }, { "epoch": 0.43751235910618946, "step": 4425, "train/sim_loss": 0.0546875 }, { "epoch": 0.43751235910618946, "step": 4425, "train/total_loss": 0.05468820780515671 }, { "entropy": 9.02608585357666, "epoch": 0.43761123195570495, "mean_token_accuracy": 0.7302483320236206, "num_tokens": 2224232.0, "step": 4426, "train/ce_loss": 0.8047318458557129 }, { "epoch": 0.43761123195570495, "step": 4426, "train/sim_loss": 0.046875 }, { "epoch": 0.43761123195570495, "step": 4426, "train/total_loss": 0.1273481845855713 }, { "entropy": 8.982162475585938, "epoch": 0.4377101048052205, "mean_token_accuracy": 0.7677419185638428, "num_tokens": 2229540.0, "step": 4427, "train/ce_loss": 0.6013153791427612 }, { "epoch": 0.4377101048052205, "step": 4427, "train/sim_loss": 0.03125 }, { "epoch": 0.4377101048052205, "step": 4427, "train/total_loss": 0.09138153493404388 }, { "entropy": 9.857841491699219, "epoch": 0.43780897765473603, "mean_token_accuracy": 0.7625330090522766, "num_tokens": 2234303.0, "step": 4428, "train/ce_loss": 1.9261402485426515e-05 }, { "epoch": 0.43780897765473603, "step": 4428, "train/sim_loss": 0.03515625 }, { "epoch": 0.43780897765473603, "step": 4428, "train/total_loss": 0.035158175975084305 }, { "entropy": 9.193056106567383, "epoch": 0.4379078505042515, "mean_token_accuracy": 0.7736132144927979, "num_tokens": 2239384.0, "step": 4429, "train/ce_loss": 1.0085554122924805 }, { "epoch": 0.4379078505042515, "step": 4429, "train/sim_loss": 0.0859375 }, { "epoch": 0.4379078505042515, "step": 4429, "train/total_loss": 0.18679304420948029 }, { "entropy": 10.10301399230957, "epoch": 0.43800672335376706, "mean_token_accuracy": 0.75390625, "num_tokens": 2244042.0, "step": 4430, "train/ce_loss": 1.0564612239249982e-05 }, { "epoch": 0.43800672335376706, "step": 4430, "train/sim_loss": 0.046875 }, { "epoch": 0.43800672335376706, "step": 4430, "train/total_loss": 0.04687605798244476 }, { "entropy": 8.985715866088867, "epoch": 0.4381055962032826, "mean_token_accuracy": 0.7023977637290955, "num_tokens": 2249240.0, "step": 4431, "train/ce_loss": 1.4215041399002075 }, { "epoch": 0.4381055962032826, "step": 4431, "train/sim_loss": 0.07421875 }, { "epoch": 0.4381055962032826, "step": 4431, "train/total_loss": 0.216369166970253 }, { "entropy": 9.483869552612305, "epoch": 0.4382044690527981, "mean_token_accuracy": 0.7144948840141296, "num_tokens": 2254372.0, "step": 4432, "train/ce_loss": 0.6930326223373413 }, { "epoch": 0.4382044690527981, "step": 4432, "train/sim_loss": 0.08203125 }, { "epoch": 0.4382044690527981, "step": 4432, "train/total_loss": 0.15133452415466309 }, { "entropy": 8.84086799621582, "epoch": 0.4383033419023136, "mean_token_accuracy": 0.7124518752098083, "num_tokens": 2259648.0, "step": 4433, "train/ce_loss": 0.6114106774330139 }, { "epoch": 0.4383033419023136, "step": 4433, "train/sim_loss": 0.08984375 }, { "epoch": 0.4383033419023136, "step": 4433, "train/total_loss": 0.15098482370376587 }, { "entropy": 9.003669738769531, "epoch": 0.43840221475182917, "mean_token_accuracy": 0.7843137383460999, "num_tokens": 2265037.0, "step": 4434, "train/ce_loss": 0.8309410214424133 }, { "epoch": 0.43840221475182917, "step": 4434, "train/sim_loss": 0.0546875 }, { "epoch": 0.43840221475182917, "step": 4434, "train/total_loss": 0.13778160512447357 }, { "entropy": 8.685771942138672, "epoch": 0.43850108760134465, "mean_token_accuracy": 0.7375954389572144, "num_tokens": 2270553.0, "step": 4435, "train/ce_loss": 1.0269848108291626 }, { "epoch": 0.43850108760134465, "step": 4435, "train/sim_loss": 0.05078125 }, { "epoch": 0.43850108760134465, "step": 4435, "train/total_loss": 0.15347972512245178 }, { "entropy": 9.298337936401367, "epoch": 0.4385999604508602, "mean_token_accuracy": 0.6993288397789001, "num_tokens": 2275770.0, "step": 4436, "train/ce_loss": 1.4781383275985718 }, { "epoch": 0.4385999604508602, "step": 4436, "train/sim_loss": 0.02734375 }, { "epoch": 0.4385999604508602, "step": 4436, "train/total_loss": 0.1751575917005539 }, { "entropy": 8.53411865234375, "epoch": 0.43869883330037573, "mean_token_accuracy": 0.7323818206787109, "num_tokens": 2281335.0, "step": 4437, "train/ce_loss": 0.9833084940910339 }, { "epoch": 0.43869883330037573, "step": 4437, "train/sim_loss": 0.078125 }, { "epoch": 0.43869883330037573, "step": 4437, "train/total_loss": 0.17645585536956787 }, { "entropy": 9.266082763671875, "epoch": 0.4387977061498912, "mean_token_accuracy": 0.7793493866920471, "num_tokens": 2286490.0, "step": 4438, "train/ce_loss": 0.9420968890190125 }, { "epoch": 0.4387977061498912, "step": 4438, "train/sim_loss": 0.078125 }, { "epoch": 0.4387977061498912, "step": 4438, "train/total_loss": 0.1723347008228302 }, { "entropy": 9.278593063354492, "epoch": 0.43889657899940676, "mean_token_accuracy": 0.743658185005188, "num_tokens": 2291650.0, "step": 4439, "train/ce_loss": 0.7231143712997437 }, { "epoch": 0.43889657899940676, "step": 4439, "train/sim_loss": 0.08203125 }, { "epoch": 0.43889657899940676, "step": 4439, "train/total_loss": 0.1543426811695099 }, { "epoch": 0.4389954518489223, "grad_norm": 0.769729495048523, "learning_rate": 8.904959699352224e-06, "loss": 0.1442, "step": 4440 }, { "entropy": 9.450221061706543, "epoch": 0.4389954518489223, "mean_token_accuracy": 0.7487091422080994, "num_tokens": 2296647.0, "step": 4440, "train/ce_loss": 0.8850860595703125 }, { "epoch": 0.4389954518489223, "step": 4440, "train/sim_loss": 0.03515625 }, { "epoch": 0.4389954518489223, "step": 4440, "train/total_loss": 0.12366485595703125 }, { "entropy": 9.613666534423828, "epoch": 0.4390943246984378, "mean_token_accuracy": 0.692307710647583, "num_tokens": 2301592.0, "step": 4441, "train/ce_loss": 2.9959271614643512e-06 }, { "epoch": 0.4390943246984378, "step": 4441, "train/sim_loss": 0.0703125 }, { "epoch": 0.4390943246984378, "step": 4441, "train/total_loss": 0.07031279802322388 }, { "entropy": 9.27306079864502, "epoch": 0.43919319754795333, "mean_token_accuracy": 0.7120211124420166, "num_tokens": 2306781.0, "step": 4442, "train/ce_loss": 2.1641550064086914 }, { "epoch": 0.43919319754795333, "step": 4442, "train/sim_loss": 0.0859375 }, { "epoch": 0.43919319754795333, "step": 4442, "train/total_loss": 0.30235302448272705 }, { "entropy": 8.658539772033691, "epoch": 0.43929207039746887, "mean_token_accuracy": 0.7223427295684814, "num_tokens": 2312149.0, "step": 4443, "train/ce_loss": 1.2649974822998047 }, { "epoch": 0.43929207039746887, "step": 4443, "train/sim_loss": 0.06640625 }, { "epoch": 0.43929207039746887, "step": 4443, "train/total_loss": 0.19290600717067719 }, { "entropy": 9.389749526977539, "epoch": 0.43939094324698436, "mean_token_accuracy": 0.7435897588729858, "num_tokens": 2317269.0, "step": 4444, "train/ce_loss": 0.3333715796470642 }, { "epoch": 0.43939094324698436, "step": 4444, "train/sim_loss": 0.0625 }, { "epoch": 0.43939094324698436, "step": 4444, "train/total_loss": 0.09583716094493866 }, { "entropy": 9.218347549438477, "epoch": 0.4394898160964999, "mean_token_accuracy": 0.6879084706306458, "num_tokens": 2322329.0, "step": 4445, "train/ce_loss": 1.192376971244812 }, { "epoch": 0.4394898160964999, "step": 4445, "train/sim_loss": 0.08984375 }, { "epoch": 0.4394898160964999, "step": 4445, "train/total_loss": 0.20908144116401672 }, { "entropy": 9.563655853271484, "epoch": 0.43958868894601544, "mean_token_accuracy": 0.664893627166748, "num_tokens": 2327339.0, "step": 4446, "train/ce_loss": 0.9157935380935669 }, { "epoch": 0.43958868894601544, "step": 4446, "train/sim_loss": 0.05078125 }, { "epoch": 0.43958868894601544, "step": 4446, "train/total_loss": 0.1423605978488922 }, { "entropy": 9.286184310913086, "epoch": 0.4396875617955309, "mean_token_accuracy": 0.7344322204589844, "num_tokens": 2332353.0, "step": 4447, "train/ce_loss": 1.0957647562026978 }, { "epoch": 0.4396875617955309, "step": 4447, "train/sim_loss": 0.0703125 }, { "epoch": 0.4396875617955309, "step": 4447, "train/total_loss": 0.17988897860050201 }, { "entropy": 9.33708381652832, "epoch": 0.43978643464504646, "mean_token_accuracy": 0.7566037774085999, "num_tokens": 2337337.0, "step": 4448, "train/ce_loss": 3.0838871225569164e-06 }, { "epoch": 0.43978643464504646, "step": 4448, "train/sim_loss": 0.046875 }, { "epoch": 0.43978643464504646, "step": 4448, "train/total_loss": 0.04687530919909477 }, { "entropy": 9.299051284790039, "epoch": 0.439885307494562, "mean_token_accuracy": 0.7994467616081238, "num_tokens": 2342497.0, "step": 4449, "train/ce_loss": 5.771404630650068e-06 }, { "epoch": 0.439885307494562, "step": 4449, "train/sim_loss": 0.05859375 }, { "epoch": 0.439885307494562, "step": 4449, "train/total_loss": 0.05859432741999626 }, { "entropy": 9.380624771118164, "epoch": 0.4399841803440775, "mean_token_accuracy": 0.7617765665054321, "num_tokens": 2347718.0, "step": 4450, "train/ce_loss": 0.9750670790672302 }, { "epoch": 0.4399841803440775, "step": 4450, "train/sim_loss": 0.078125 }, { "epoch": 0.4399841803440775, "step": 4450, "train/total_loss": 0.17563170194625854 }, { "entropy": 8.716615676879883, "epoch": 0.44008305319359303, "mean_token_accuracy": 0.7680981755256653, "num_tokens": 2353076.0, "step": 4451, "train/ce_loss": 1.348904013633728 }, { "epoch": 0.44008305319359303, "step": 4451, "train/sim_loss": 0.09375 }, { "epoch": 0.44008305319359303, "step": 4451, "train/total_loss": 0.22864040732383728 }, { "entropy": 9.949045181274414, "epoch": 0.4401819260431086, "mean_token_accuracy": 0.7323601245880127, "num_tokens": 2357858.0, "step": 4452, "train/ce_loss": 1.8708515167236328 }, { "epoch": 0.4401819260431086, "step": 4452, "train/sim_loss": 0.0546875 }, { "epoch": 0.4401819260431086, "step": 4452, "train/total_loss": 0.24177265167236328 }, { "entropy": 9.444894790649414, "epoch": 0.44028079889262406, "mean_token_accuracy": 0.7267080545425415, "num_tokens": 2362948.0, "step": 4453, "train/ce_loss": 1.2766855955123901 }, { "epoch": 0.44028079889262406, "step": 4453, "train/sim_loss": 0.02734375 }, { "epoch": 0.44028079889262406, "step": 4453, "train/total_loss": 0.155012309551239 }, { "entropy": 10.00452995300293, "epoch": 0.4403796717421396, "mean_token_accuracy": 0.7047353982925415, "num_tokens": 2367711.0, "step": 4454, "train/ce_loss": 5.856952611793531e-06 }, { "epoch": 0.4403796717421396, "step": 4454, "train/sim_loss": 0.05078125 }, { "epoch": 0.4403796717421396, "step": 4454, "train/total_loss": 0.05078183487057686 }, { "entropy": 8.824409484863281, "epoch": 0.44047854459165514, "mean_token_accuracy": 0.7411764860153198, "num_tokens": 2373109.0, "step": 4455, "train/ce_loss": 0.769832193851471 }, { "epoch": 0.44047854459165514, "step": 4455, "train/sim_loss": 0.03515625 }, { "epoch": 0.44047854459165514, "step": 4455, "train/total_loss": 0.11213947087526321 }, { "entropy": 8.866456985473633, "epoch": 0.4405774174411707, "mean_token_accuracy": 0.716803789138794, "num_tokens": 2378478.0, "step": 4456, "train/ce_loss": 1.0894641876220703 }, { "epoch": 0.4405774174411707, "step": 4456, "train/sim_loss": 0.0546875 }, { "epoch": 0.4405774174411707, "step": 4456, "train/total_loss": 0.16363391280174255 }, { "entropy": 9.864400863647461, "epoch": 0.44067629029068617, "mean_token_accuracy": 0.7126213312149048, "num_tokens": 2383367.0, "step": 4457, "train/ce_loss": 1.4397059679031372 }, { "epoch": 0.44067629029068617, "step": 4457, "train/sim_loss": 0.0859375 }, { "epoch": 0.44067629029068617, "step": 4457, "train/total_loss": 0.22990809381008148 }, { "entropy": 8.804418563842773, "epoch": 0.4407751631402017, "mean_token_accuracy": 0.7957219481468201, "num_tokens": 2388794.0, "step": 4458, "train/ce_loss": 0.3089240491390228 }, { "epoch": 0.4407751631402017, "step": 4458, "train/sim_loss": 0.01171875 }, { "epoch": 0.4407751631402017, "step": 4458, "train/total_loss": 0.04261115565896034 }, { "entropy": 8.95989990234375, "epoch": 0.44087403598971725, "mean_token_accuracy": 0.7293144464492798, "num_tokens": 2394072.0, "step": 4459, "train/ce_loss": 1.2242991924285889 }, { "epoch": 0.44087403598971725, "step": 4459, "train/sim_loss": 0.1015625 }, { "epoch": 0.44087403598971725, "step": 4459, "train/total_loss": 0.22399242222309113 }, { "epoch": 0.44097290883923274, "grad_norm": 0.662868082523346, "learning_rate": 8.900014834594275e-06, "loss": 0.1468, "step": 4460 }, { "entropy": 9.7796049118042, "epoch": 0.44097290883923274, "mean_token_accuracy": 0.6968504190444946, "num_tokens": 2399007.0, "step": 4460, "train/ce_loss": 2.228492498397827 }, { "epoch": 0.44097290883923274, "step": 4460, "train/sim_loss": 0.0625 }, { "epoch": 0.44097290883923274, "step": 4460, "train/total_loss": 0.2853492498397827 }, { "entropy": 9.543054580688477, "epoch": 0.4410717816887483, "mean_token_accuracy": 0.8024263381958008, "num_tokens": 2404063.0, "step": 4461, "train/ce_loss": 0.8705273866653442 }, { "epoch": 0.4410717816887483, "step": 4461, "train/sim_loss": 0.05078125 }, { "epoch": 0.4410717816887483, "step": 4461, "train/total_loss": 0.13783398270606995 }, { "entropy": 9.04907512664795, "epoch": 0.4411706545382638, "mean_token_accuracy": 0.7178649306297302, "num_tokens": 2409424.0, "step": 4462, "train/ce_loss": 0.821657121181488 }, { "epoch": 0.4411706545382638, "step": 4462, "train/sim_loss": 0.05859375 }, { "epoch": 0.4411706545382638, "step": 4462, "train/total_loss": 0.14075946807861328 }, { "entropy": 8.752744674682617, "epoch": 0.4412695273877793, "mean_token_accuracy": 0.7126303315162659, "num_tokens": 2414826.0, "step": 4463, "train/ce_loss": 1.4678646326065063 }, { "epoch": 0.4412695273877793, "step": 4463, "train/sim_loss": 0.05859375 }, { "epoch": 0.4412695273877793, "step": 4463, "train/total_loss": 0.20538021624088287 }, { "entropy": 9.18133544921875, "epoch": 0.44136840023729484, "mean_token_accuracy": 0.7166416645050049, "num_tokens": 2420018.0, "step": 4464, "train/ce_loss": 0.4874807596206665 }, { "epoch": 0.44136840023729484, "step": 4464, "train/sim_loss": 0.05859375 }, { "epoch": 0.44136840023729484, "step": 4464, "train/total_loss": 0.10734182596206665 }, { "entropy": 10.004258155822754, "epoch": 0.4414672730868104, "mean_token_accuracy": 0.7440476417541504, "num_tokens": 2424788.0, "step": 4465, "train/ce_loss": 1.915676474571228 }, { "epoch": 0.4414672730868104, "step": 4465, "train/sim_loss": 0.08984375 }, { "epoch": 0.4414672730868104, "step": 4465, "train/total_loss": 0.28141140937805176 }, { "entropy": 10.212564468383789, "epoch": 0.44156614593632587, "mean_token_accuracy": 0.7654867172241211, "num_tokens": 2429405.0, "step": 4466, "train/ce_loss": 5.95388155488763e-06 }, { "epoch": 0.44156614593632587, "step": 4466, "train/sim_loss": 0.046875 }, { "epoch": 0.44156614593632587, "step": 4466, "train/total_loss": 0.046875596046447754 }, { "entropy": 8.973442077636719, "epoch": 0.4416650187858414, "mean_token_accuracy": 0.763832688331604, "num_tokens": 2434591.0, "step": 4467, "train/ce_loss": 0.42459407448768616 }, { "epoch": 0.4416650187858414, "step": 4467, "train/sim_loss": 0.03125 }, { "epoch": 0.4416650187858414, "step": 4467, "train/total_loss": 0.0737094134092331 }, { "entropy": 8.980101585388184, "epoch": 0.44176389163535695, "mean_token_accuracy": 0.7095671892166138, "num_tokens": 2439939.0, "step": 4468, "train/ce_loss": 1.2070839405059814 }, { "epoch": 0.44176389163535695, "step": 4468, "train/sim_loss": 0.0234375 }, { "epoch": 0.44176389163535695, "step": 4468, "train/total_loss": 0.1441459059715271 }, { "entropy": 9.259428977966309, "epoch": 0.44186276448487244, "mean_token_accuracy": 0.7447090148925781, "num_tokens": 2445089.0, "step": 4469, "train/ce_loss": 1.2775579690933228 }, { "epoch": 0.44186276448487244, "step": 4469, "train/sim_loss": 0.046875 }, { "epoch": 0.44186276448487244, "step": 4469, "train/total_loss": 0.174630805850029 }, { "entropy": 9.76385498046875, "epoch": 0.441961637334388, "mean_token_accuracy": 0.766590416431427, "num_tokens": 2449918.0, "step": 4470, "train/ce_loss": 2.6808668280864367e-06 }, { "epoch": 0.441961637334388, "step": 4470, "train/sim_loss": 0.0625 }, { "epoch": 0.441961637334388, "step": 4470, "train/total_loss": 0.06250026822090149 }, { "entropy": 9.008390426635742, "epoch": 0.4420605101839035, "mean_token_accuracy": 0.8142250776290894, "num_tokens": 2455348.0, "step": 4471, "train/ce_loss": 0.43461742997169495 }, { "epoch": 0.4420605101839035, "step": 4471, "train/sim_loss": 0.0234375 }, { "epoch": 0.4420605101839035, "step": 4471, "train/total_loss": 0.06689924001693726 }, { "entropy": 9.000526428222656, "epoch": 0.442159383033419, "mean_token_accuracy": 0.712435245513916, "num_tokens": 2460570.0, "step": 4472, "train/ce_loss": 0.7038297653198242 }, { "epoch": 0.442159383033419, "step": 4472, "train/sim_loss": 0.03515625 }, { "epoch": 0.442159383033419, "step": 4472, "train/total_loss": 0.1055392250418663 }, { "entropy": 8.812034606933594, "epoch": 0.44225825588293455, "mean_token_accuracy": 0.7597330212593079, "num_tokens": 2465975.0, "step": 4473, "train/ce_loss": 0.5284525156021118 }, { "epoch": 0.44225825588293455, "step": 4473, "train/sim_loss": 0.0234375 }, { "epoch": 0.44225825588293455, "step": 4473, "train/total_loss": 0.07628275454044342 }, { "entropy": 9.443574905395508, "epoch": 0.4423571287324501, "mean_token_accuracy": 0.7342767119407654, "num_tokens": 2471047.0, "step": 4474, "train/ce_loss": 3.8014932215446606e-06 }, { "epoch": 0.4423571287324501, "step": 4474, "train/sim_loss": 0.0703125 }, { "epoch": 0.4423571287324501, "step": 4474, "train/total_loss": 0.07031287997961044 }, { "entropy": 9.411504745483398, "epoch": 0.4424560015819656, "mean_token_accuracy": 0.7269303202629089, "num_tokens": 2476012.0, "step": 4475, "train/ce_loss": 0.9485357999801636 }, { "epoch": 0.4424560015819656, "step": 4475, "train/sim_loss": 0.08984375 }, { "epoch": 0.4424560015819656, "step": 4475, "train/total_loss": 0.18469732999801636 }, { "entropy": 9.112486839294434, "epoch": 0.4425548744314811, "mean_token_accuracy": 0.7319201827049255, "num_tokens": 2481280.0, "step": 4476, "train/ce_loss": 0.4585082232952118 }, { "epoch": 0.4425548744314811, "step": 4476, "train/sim_loss": 0.04296875 }, { "epoch": 0.4425548744314811, "step": 4476, "train/total_loss": 0.08881957828998566 }, { "entropy": 9.011128425598145, "epoch": 0.44265374728099666, "mean_token_accuracy": 0.7423887848854065, "num_tokens": 2486582.0, "step": 4477, "train/ce_loss": 0.679169774055481 }, { "epoch": 0.44265374728099666, "step": 4477, "train/sim_loss": 0.1015625 }, { "epoch": 0.44265374728099666, "step": 4477, "train/total_loss": 0.16947948932647705 }, { "entropy": 8.964334487915039, "epoch": 0.44275262013051214, "mean_token_accuracy": 0.6931949257850647, "num_tokens": 2491911.0, "step": 4478, "train/ce_loss": 0.6913797855377197 }, { "epoch": 0.44275262013051214, "step": 4478, "train/sim_loss": 0.046875 }, { "epoch": 0.44275262013051214, "step": 4478, "train/total_loss": 0.11601298302412033 }, { "entropy": 9.322576522827148, "epoch": 0.4428514929800277, "mean_token_accuracy": 0.7546897530555725, "num_tokens": 2497061.0, "step": 4479, "train/ce_loss": 0.8054964542388916 }, { "epoch": 0.4428514929800277, "step": 4479, "train/sim_loss": 0.0546875 }, { "epoch": 0.4428514929800277, "step": 4479, "train/total_loss": 0.13523715734481812 }, { "epoch": 0.4429503658295432, "grad_norm": 0.699345052242279, "learning_rate": 8.895069969836325e-06, "loss": 0.1442, "step": 4480 }, { "entropy": 9.255762100219727, "epoch": 0.4429503658295432, "mean_token_accuracy": 0.7518796920776367, "num_tokens": 2502221.0, "step": 4480, "train/ce_loss": 0.8112313747406006 }, { "epoch": 0.4429503658295432, "step": 4480, "train/sim_loss": 0.01953125 }, { "epoch": 0.4429503658295432, "step": 4480, "train/total_loss": 0.10065438598394394 }, { "entropy": 9.210655212402344, "epoch": 0.4430492386790587, "mean_token_accuracy": 0.7523584961891174, "num_tokens": 2507517.0, "step": 4481, "train/ce_loss": 0.8111066818237305 }, { "epoch": 0.4430492386790587, "step": 4481, "train/sim_loss": 0.046875 }, { "epoch": 0.4430492386790587, "step": 4481, "train/total_loss": 0.12798567116260529 }, { "entropy": 8.875221252441406, "epoch": 0.44314811152857425, "mean_token_accuracy": 0.7130852341651917, "num_tokens": 2512856.0, "step": 4482, "train/ce_loss": 1.2064754962921143 }, { "epoch": 0.44314811152857425, "step": 4482, "train/sim_loss": 0.03125 }, { "epoch": 0.44314811152857425, "step": 4482, "train/total_loss": 0.15189754962921143 }, { "entropy": 9.424863815307617, "epoch": 0.4432469843780898, "mean_token_accuracy": 0.7870778441429138, "num_tokens": 2518023.0, "step": 4483, "train/ce_loss": 1.2678290605545044 }, { "epoch": 0.4432469843780898, "step": 4483, "train/sim_loss": 0.1015625 }, { "epoch": 0.4432469843780898, "step": 4483, "train/total_loss": 0.22834540903568268 }, { "entropy": 9.196966171264648, "epoch": 0.4433458572276053, "mean_token_accuracy": 0.736775815486908, "num_tokens": 2523289.0, "step": 4484, "train/ce_loss": 1.2598520517349243 }, { "epoch": 0.4433458572276053, "step": 4484, "train/sim_loss": 0.05078125 }, { "epoch": 0.4433458572276053, "step": 4484, "train/total_loss": 0.17676645517349243 }, { "entropy": 8.98042106628418, "epoch": 0.4434447300771208, "mean_token_accuracy": 0.7055960893630981, "num_tokens": 2528596.0, "step": 4485, "train/ce_loss": 0.34039002656936646 }, { "epoch": 0.4434447300771208, "step": 4485, "train/sim_loss": 0.0234375 }, { "epoch": 0.4434447300771208, "step": 4485, "train/total_loss": 0.057476501911878586 }, { "entropy": 9.068885803222656, "epoch": 0.44354360292663636, "mean_token_accuracy": 0.7556390762329102, "num_tokens": 2533928.0, "step": 4486, "train/ce_loss": 0.46264269948005676 }, { "epoch": 0.44354360292663636, "step": 4486, "train/sim_loss": 0.05859375 }, { "epoch": 0.44354360292663636, "step": 4486, "train/total_loss": 0.10485802590847015 }, { "entropy": 8.65289306640625, "epoch": 0.44364247577615185, "mean_token_accuracy": 0.7263875603675842, "num_tokens": 2539488.0, "step": 4487, "train/ce_loss": 0.7177829146385193 }, { "epoch": 0.44364247577615185, "step": 4487, "train/sim_loss": 0.1015625 }, { "epoch": 0.44364247577615185, "step": 4487, "train/total_loss": 0.1733407974243164 }, { "entropy": 8.838767051696777, "epoch": 0.4437413486256674, "mean_token_accuracy": 0.7434210777282715, "num_tokens": 2545037.0, "step": 4488, "train/ce_loss": 0.638717770576477 }, { "epoch": 0.4437413486256674, "step": 4488, "train/sim_loss": 0.05859375 }, { "epoch": 0.4437413486256674, "step": 4488, "train/total_loss": 0.12246552854776382 }, { "entropy": 9.20651912689209, "epoch": 0.44384022147518293, "mean_token_accuracy": 0.7527011036872864, "num_tokens": 2550320.0, "step": 4489, "train/ce_loss": 0.790439248085022 }, { "epoch": 0.44384022147518293, "step": 4489, "train/sim_loss": 0.0859375 }, { "epoch": 0.44384022147518293, "step": 4489, "train/total_loss": 0.1649814248085022 }, { "entropy": 9.543380737304688, "epoch": 0.4439390943246984, "mean_token_accuracy": 0.699999988079071, "num_tokens": 2555328.0, "step": 4490, "train/ce_loss": 1.331195592880249 }, { "epoch": 0.4439390943246984, "step": 4490, "train/sim_loss": 0.08203125 }, { "epoch": 0.4439390943246984, "step": 4490, "train/total_loss": 0.21515081822872162 }, { "entropy": 9.06105899810791, "epoch": 0.44403796717421395, "mean_token_accuracy": 0.7285180687904358, "num_tokens": 2560606.0, "step": 4491, "train/ce_loss": 1.2990325689315796 }, { "epoch": 0.44403796717421395, "step": 4491, "train/sim_loss": 0.08984375 }, { "epoch": 0.44403796717421395, "step": 4491, "train/total_loss": 0.21974700689315796 }, { "entropy": 9.204246520996094, "epoch": 0.4441368400237295, "mean_token_accuracy": 0.7277628183364868, "num_tokens": 2565842.0, "step": 4492, "train/ce_loss": 0.6739773154258728 }, { "epoch": 0.4441368400237295, "step": 4492, "train/sim_loss": 0.046875 }, { "epoch": 0.4441368400237295, "step": 4492, "train/total_loss": 0.11427273601293564 }, { "entropy": 9.881002426147461, "epoch": 0.444235712873245, "mean_token_accuracy": 0.8098039031028748, "num_tokens": 2570756.0, "step": 4493, "train/ce_loss": 1.4940541177566047e-06 }, { "epoch": 0.444235712873245, "step": 4493, "train/sim_loss": 0.0234375 }, { "epoch": 0.444235712873245, "step": 4493, "train/total_loss": 0.02343764901161194 }, { "entropy": 9.324443817138672, "epoch": 0.4443345857227605, "mean_token_accuracy": 0.7636363506317139, "num_tokens": 2575777.0, "step": 4494, "train/ce_loss": 1.4793231457588263e-06 }, { "epoch": 0.4443345857227605, "step": 4494, "train/sim_loss": 0.07421875 }, { "epoch": 0.4443345857227605, "step": 4494, "train/total_loss": 0.07421889901161194 }, { "entropy": 9.813060760498047, "epoch": 0.44443345857227606, "mean_token_accuracy": 0.7376344203948975, "num_tokens": 2580655.0, "step": 4495, "train/ce_loss": 1.8447369711793726e-06 }, { "epoch": 0.44443345857227606, "step": 4495, "train/sim_loss": 0.03125 }, { "epoch": 0.44443345857227606, "step": 4495, "train/total_loss": 0.03125018626451492 }, { "entropy": 9.63151741027832, "epoch": 0.44453233142179155, "mean_token_accuracy": 0.7180384993553162, "num_tokens": 2585676.0, "step": 4496, "train/ce_loss": 1.6704481840133667 }, { "epoch": 0.44453233142179155, "step": 4496, "train/sim_loss": 0.1484375 }, { "epoch": 0.44453233142179155, "step": 4496, "train/total_loss": 0.31548231840133667 }, { "entropy": 9.124244689941406, "epoch": 0.4446312042713071, "mean_token_accuracy": 0.7172236442565918, "num_tokens": 2590930.0, "step": 4497, "train/ce_loss": 1.4694288969039917 }, { "epoch": 0.4446312042713071, "step": 4497, "train/sim_loss": 0.0625 }, { "epoch": 0.4446312042713071, "step": 4497, "train/total_loss": 0.2094428986310959 }, { "entropy": 9.301881790161133, "epoch": 0.44473007712082263, "mean_token_accuracy": 0.7427386045455933, "num_tokens": 2596133.0, "step": 4498, "train/ce_loss": 0.8499370813369751 }, { "epoch": 0.44473007712082263, "step": 4498, "train/sim_loss": 0.07421875 }, { "epoch": 0.44473007712082263, "step": 4498, "train/total_loss": 0.15921247005462646 }, { "entropy": 9.577167510986328, "epoch": 0.44482894997033817, "mean_token_accuracy": 0.7610294222831726, "num_tokens": 2601100.0, "step": 4499, "train/ce_loss": 1.0237698554992676 }, { "epoch": 0.44482894997033817, "step": 4499, "train/sim_loss": 0.09375 }, { "epoch": 0.44482894997033817, "step": 4499, "train/total_loss": 0.1961269974708557 }, { "epoch": 0.44492782281985366, "grad_norm": 0.7579295039176941, "learning_rate": 8.890125105078377e-06, "loss": 0.1471, "step": 4500 }, { "entropy": 8.780494689941406, "epoch": 0.44492782281985366, "mean_token_accuracy": 0.7605911493301392, "num_tokens": 2606616.0, "step": 4500, "train/ce_loss": 0.8086697459220886 }, { "epoch": 0.44492782281985366, "step": 4500, "train/sim_loss": 0.09375 }, { "epoch": 0.44492782281985366, "step": 4500, "train/total_loss": 0.1746169775724411 }, { "entropy": 9.408340454101562, "epoch": 0.4450266956693692, "mean_token_accuracy": 0.7112010717391968, "num_tokens": 2611793.0, "step": 4501, "train/ce_loss": 1.1653521060943604 }, { "epoch": 0.4450266956693692, "step": 4501, "train/sim_loss": 0.1015625 }, { "epoch": 0.4450266956693692, "step": 4501, "train/total_loss": 0.2180977165699005 }, { "entropy": 8.905416488647461, "epoch": 0.44512556851888474, "mean_token_accuracy": 0.692307710647583, "num_tokens": 2616968.0, "step": 4502, "train/ce_loss": 0.7832385897636414 }, { "epoch": 0.44512556851888474, "step": 4502, "train/sim_loss": 0.0546875 }, { "epoch": 0.44512556851888474, "step": 4502, "train/total_loss": 0.1330113708972931 }, { "entropy": 8.959816932678223, "epoch": 0.4452244413684002, "mean_token_accuracy": 0.720200777053833, "num_tokens": 2622259.0, "step": 4503, "train/ce_loss": 0.5568627715110779 }, { "epoch": 0.4452244413684002, "step": 4503, "train/sim_loss": 0.06640625 }, { "epoch": 0.4452244413684002, "step": 4503, "train/total_loss": 0.12209253013134003 }, { "entropy": 9.612812042236328, "epoch": 0.44532331421791577, "mean_token_accuracy": 0.7937062978744507, "num_tokens": 2627241.0, "step": 4504, "train/ce_loss": 0.8046440482139587 }, { "epoch": 0.44532331421791577, "step": 4504, "train/sim_loss": 0.03515625 }, { "epoch": 0.44532331421791577, "step": 4504, "train/total_loss": 0.11562065780162811 }, { "entropy": 9.064831733703613, "epoch": 0.4454221870674313, "mean_token_accuracy": 0.7515006065368652, "num_tokens": 2632580.0, "step": 4505, "train/ce_loss": 1.301162600517273 }, { "epoch": 0.4454221870674313, "step": 4505, "train/sim_loss": 0.06640625 }, { "epoch": 0.4454221870674313, "step": 4505, "train/total_loss": 0.196522518992424 }, { "entropy": 9.75990104675293, "epoch": 0.4455210599169468, "mean_token_accuracy": 0.7781690359115601, "num_tokens": 2637573.0, "step": 4506, "train/ce_loss": 1.1523325443267822 }, { "epoch": 0.4455210599169468, "step": 4506, "train/sim_loss": 0.0703125 }, { "epoch": 0.4455210599169468, "step": 4506, "train/total_loss": 0.18554575741291046 }, { "entropy": 9.029922485351562, "epoch": 0.44561993276646233, "mean_token_accuracy": 0.7578125, "num_tokens": 2642930.0, "step": 4507, "train/ce_loss": 0.8819659352302551 }, { "epoch": 0.44561993276646233, "step": 4507, "train/sim_loss": 0.06640625 }, { "epoch": 0.44561993276646233, "step": 4507, "train/total_loss": 0.15460285544395447 }, { "entropy": 8.70202922821045, "epoch": 0.4457188056159779, "mean_token_accuracy": 0.7034883499145508, "num_tokens": 2648300.0, "step": 4508, "train/ce_loss": 0.7986609935760498 }, { "epoch": 0.4457188056159779, "step": 4508, "train/sim_loss": 0.1484375 }, { "epoch": 0.4457188056159779, "step": 4508, "train/total_loss": 0.22830361127853394 }, { "entropy": 9.651217460632324, "epoch": 0.44581767846549336, "mean_token_accuracy": 0.7282230257987976, "num_tokens": 2653330.0, "step": 4509, "train/ce_loss": 1.7078146934509277 }, { "epoch": 0.44581767846549336, "step": 4509, "train/sim_loss": 0.12109375 }, { "epoch": 0.44581767846549336, "step": 4509, "train/total_loss": 0.2918752431869507 }, { "entropy": 9.287551879882812, "epoch": 0.4459165513150089, "mean_token_accuracy": 0.7611026167869568, "num_tokens": 2658628.0, "step": 4510, "train/ce_loss": 1.1713571548461914 }, { "epoch": 0.4459165513150089, "step": 4510, "train/sim_loss": 0.11328125 }, { "epoch": 0.4459165513150089, "step": 4510, "train/total_loss": 0.23041696846485138 }, { "entropy": 8.904544830322266, "epoch": 0.44601542416452444, "mean_token_accuracy": 0.8007850646972656, "num_tokens": 2664116.0, "step": 4511, "train/ce_loss": 0.8100970387458801 }, { "epoch": 0.44601542416452444, "step": 4511, "train/sim_loss": 0.0703125 }, { "epoch": 0.44601542416452444, "step": 4511, "train/total_loss": 0.15132221579551697 }, { "entropy": 8.725772857666016, "epoch": 0.44611429701403993, "mean_token_accuracy": 0.7802197933197021, "num_tokens": 2669594.0, "step": 4512, "train/ce_loss": 0.774977445602417 }, { "epoch": 0.44611429701403993, "step": 4512, "train/sim_loss": 0.1171875 }, { "epoch": 0.44611429701403993, "step": 4512, "train/total_loss": 0.19468525052070618 }, { "entropy": 9.246919631958008, "epoch": 0.44621316986355547, "mean_token_accuracy": 0.7734877467155457, "num_tokens": 2674776.0, "step": 4513, "train/ce_loss": 0.3518202304840088 }, { "epoch": 0.44621316986355547, "step": 4513, "train/sim_loss": 0.08984375 }, { "epoch": 0.44621316986355547, "step": 4513, "train/total_loss": 0.12502577900886536 }, { "entropy": 9.12303352355957, "epoch": 0.446312042713071, "mean_token_accuracy": 0.6736842393875122, "num_tokens": 2679914.0, "step": 4514, "train/ce_loss": 1.3278347253799438 }, { "epoch": 0.446312042713071, "step": 4514, "train/sim_loss": 0.05078125 }, { "epoch": 0.446312042713071, "step": 4514, "train/total_loss": 0.18356472253799438 }, { "entropy": 9.493539810180664, "epoch": 0.4464109155625865, "mean_token_accuracy": 0.7622504830360413, "num_tokens": 2684898.0, "step": 4515, "train/ce_loss": 0.47448375821113586 }, { "epoch": 0.4464109155625865, "step": 4515, "train/sim_loss": 0.02734375 }, { "epoch": 0.4464109155625865, "step": 4515, "train/total_loss": 0.07479213178157806 }, { "entropy": 8.801179885864258, "epoch": 0.44650978841210204, "mean_token_accuracy": 0.7257732152938843, "num_tokens": 2690354.0, "step": 4516, "train/ce_loss": 0.6246917843818665 }, { "epoch": 0.44650978841210204, "step": 4516, "train/sim_loss": 0.05859375 }, { "epoch": 0.44650978841210204, "step": 4516, "train/total_loss": 0.12106293439865112 }, { "entropy": 9.418305397033691, "epoch": 0.4466086612616176, "mean_token_accuracy": 0.7054010033607483, "num_tokens": 2695422.0, "step": 4517, "train/ce_loss": 1.2904903888702393 }, { "epoch": 0.4466086612616176, "step": 4517, "train/sim_loss": 0.109375 }, { "epoch": 0.4466086612616176, "step": 4517, "train/total_loss": 0.23842404782772064 }, { "entropy": 9.153996467590332, "epoch": 0.44670753411113306, "mean_token_accuracy": 0.6802973747253418, "num_tokens": 2700763.0, "step": 4518, "train/ce_loss": 1.680827021598816 }, { "epoch": 0.44670753411113306, "step": 4518, "train/sim_loss": 0.09375 }, { "epoch": 0.44670753411113306, "step": 4518, "train/total_loss": 0.26183271408081055 }, { "entropy": 8.895081520080566, "epoch": 0.4468064069606486, "mean_token_accuracy": 0.7473903894424438, "num_tokens": 2706165.0, "step": 4519, "train/ce_loss": 1.1039037704467773 }, { "epoch": 0.4468064069606486, "step": 4519, "train/sim_loss": 0.07421875 }, { "epoch": 0.4468064069606486, "step": 4519, "train/total_loss": 0.18460913002490997 }, { "epoch": 0.44690527981016415, "grad_norm": 0.7490431070327759, "learning_rate": 8.885180240320428e-06, "loss": 0.1494, "step": 4520 }, { "entropy": 9.420427322387695, "epoch": 0.44690527981016415, "mean_token_accuracy": 0.7320675253868103, "num_tokens": 2711037.0, "step": 4520, "train/ce_loss": 0.6447953581809998 }, { "epoch": 0.44690527981016415, "step": 4520, "train/sim_loss": 0.04296875 }, { "epoch": 0.44690527981016415, "step": 4520, "train/total_loss": 0.1074482873082161 }, { "entropy": 9.204364776611328, "epoch": 0.44700415265967963, "mean_token_accuracy": 0.734455943107605, "num_tokens": 2716309.0, "step": 4521, "train/ce_loss": 1.1876529455184937 }, { "epoch": 0.44700415265967963, "step": 4521, "train/sim_loss": 0.0859375 }, { "epoch": 0.44700415265967963, "step": 4521, "train/total_loss": 0.20470279455184937 }, { "entropy": 9.333433151245117, "epoch": 0.4471030255091952, "mean_token_accuracy": 0.7256515622138977, "num_tokens": 2721504.0, "step": 4522, "train/ce_loss": 1.3652865886688232 }, { "epoch": 0.4471030255091952, "step": 4522, "train/sim_loss": 0.08203125 }, { "epoch": 0.4471030255091952, "step": 4522, "train/total_loss": 0.21855990588665009 }, { "entropy": 8.984024047851562, "epoch": 0.4472018983587107, "mean_token_accuracy": 0.7811484336853027, "num_tokens": 2726904.0, "step": 4523, "train/ce_loss": 0.9568396210670471 }, { "epoch": 0.4472018983587107, "step": 4523, "train/sim_loss": 0.0234375 }, { "epoch": 0.4472018983587107, "step": 4523, "train/total_loss": 0.11912146210670471 }, { "entropy": 9.023128509521484, "epoch": 0.4473007712082262, "mean_token_accuracy": 0.7204030156135559, "num_tokens": 2732127.0, "step": 4524, "train/ce_loss": 0.5748513340950012 }, { "epoch": 0.4473007712082262, "step": 4524, "train/sim_loss": 0.0234375 }, { "epoch": 0.4473007712082262, "step": 4524, "train/total_loss": 0.08092263340950012 }, { "entropy": 9.544071197509766, "epoch": 0.44739964405774174, "mean_token_accuracy": 0.7534013390541077, "num_tokens": 2737155.0, "step": 4525, "train/ce_loss": 0.6580440998077393 }, { "epoch": 0.44739964405774174, "step": 4525, "train/sim_loss": 0.04296875 }, { "epoch": 0.44739964405774174, "step": 4525, "train/total_loss": 0.10877316445112228 }, { "entropy": 9.339006423950195, "epoch": 0.4474985169072573, "mean_token_accuracy": 0.7637906670570374, "num_tokens": 2742304.0, "step": 4526, "train/ce_loss": 0.7049911618232727 }, { "epoch": 0.4474985169072573, "step": 4526, "train/sim_loss": 0.01953125 }, { "epoch": 0.4474985169072573, "step": 4526, "train/total_loss": 0.09003036469221115 }, { "entropy": 9.086111068725586, "epoch": 0.44759738975677277, "mean_token_accuracy": 0.7605294585227966, "num_tokens": 2747559.0, "step": 4527, "train/ce_loss": 0.6927676796913147 }, { "epoch": 0.44759738975677277, "step": 4527, "train/sim_loss": 0.0703125 }, { "epoch": 0.44759738975677277, "step": 4527, "train/total_loss": 0.13958927989006042 }, { "entropy": 9.318374633789062, "epoch": 0.4476962626062883, "mean_token_accuracy": 0.7939777970314026, "num_tokens": 2752659.0, "step": 4528, "train/ce_loss": 1.3353852033615112 }, { "epoch": 0.4476962626062883, "step": 4528, "train/sim_loss": 0.109375 }, { "epoch": 0.4476962626062883, "step": 4528, "train/total_loss": 0.24291352927684784 }, { "entropy": 9.387085914611816, "epoch": 0.44779513545580385, "mean_token_accuracy": 0.7215999960899353, "num_tokens": 2757745.0, "step": 4529, "train/ce_loss": 1.371522307395935 }, { "epoch": 0.44779513545580385, "step": 4529, "train/sim_loss": 0.05859375 }, { "epoch": 0.44779513545580385, "step": 4529, "train/total_loss": 0.19574598968029022 }, { "entropy": 8.896390914916992, "epoch": 0.44789400830531934, "mean_token_accuracy": 0.764374315738678, "num_tokens": 2763071.0, "step": 4530, "train/ce_loss": 0.7218606472015381 }, { "epoch": 0.44789400830531934, "step": 4530, "train/sim_loss": 0.03515625 }, { "epoch": 0.44789400830531934, "step": 4530, "train/total_loss": 0.10734231770038605 }, { "entropy": 9.415645599365234, "epoch": 0.4479928811548349, "mean_token_accuracy": 0.759530782699585, "num_tokens": 2768197.0, "step": 4531, "train/ce_loss": 1.0284645668434678e-06 }, { "epoch": 0.4479928811548349, "step": 4531, "train/sim_loss": 0.0234375 }, { "epoch": 0.4479928811548349, "step": 4531, "train/total_loss": 0.023437602445483208 }, { "entropy": 9.233901023864746, "epoch": 0.4480917540043504, "mean_token_accuracy": 0.6820428371429443, "num_tokens": 2773273.0, "step": 4532, "train/ce_loss": 1.1894171237945557 }, { "epoch": 0.4480917540043504, "step": 4532, "train/sim_loss": 0.04296875 }, { "epoch": 0.4480917540043504, "step": 4532, "train/total_loss": 0.16191047430038452 }, { "entropy": 8.682958602905273, "epoch": 0.4481906268538659, "mean_token_accuracy": 0.7135416865348816, "num_tokens": 2778736.0, "step": 4533, "train/ce_loss": 1.315225601196289 }, { "epoch": 0.4481906268538659, "step": 4533, "train/sim_loss": 0.05078125 }, { "epoch": 0.4481906268538659, "step": 4533, "train/total_loss": 0.18230381608009338 }, { "entropy": 9.534332275390625, "epoch": 0.44828949970338144, "mean_token_accuracy": 0.7224409580230713, "num_tokens": 2783689.0, "step": 4534, "train/ce_loss": 1.1777727603912354 }, { "epoch": 0.44828949970338144, "step": 4534, "train/sim_loss": 0.0390625 }, { "epoch": 0.44828949970338144, "step": 4534, "train/total_loss": 0.1568397879600525 }, { "entropy": 8.914642333984375, "epoch": 0.448388372552897, "mean_token_accuracy": 0.7865030765533447, "num_tokens": 2788972.0, "step": 4535, "train/ce_loss": 0.7574781775474548 }, { "epoch": 0.448388372552897, "step": 4535, "train/sim_loss": 0.05859375 }, { "epoch": 0.448388372552897, "step": 4535, "train/total_loss": 0.13434156775474548 }, { "entropy": 9.556293487548828, "epoch": 0.44848724540241247, "mean_token_accuracy": 0.7428571581840515, "num_tokens": 2793841.0, "step": 4536, "train/ce_loss": 2.064066171646118 }, { "epoch": 0.44848724540241247, "step": 4536, "train/sim_loss": 0.08203125 }, { "epoch": 0.44848724540241247, "step": 4536, "train/total_loss": 0.2884378731250763 }, { "entropy": 9.276677131652832, "epoch": 0.448586118251928, "mean_token_accuracy": 0.7107309699058533, "num_tokens": 2798925.0, "step": 4537, "train/ce_loss": 0.9554111361503601 }, { "epoch": 0.448586118251928, "step": 4537, "train/sim_loss": 0.05859375 }, { "epoch": 0.448586118251928, "step": 4537, "train/total_loss": 0.1541348695755005 }, { "entropy": 8.949641227722168, "epoch": 0.44868499110144355, "mean_token_accuracy": 0.752136766910553, "num_tokens": 2804230.0, "step": 4538, "train/ce_loss": 1.0398943424224854 }, { "epoch": 0.44868499110144355, "step": 4538, "train/sim_loss": 0.06640625 }, { "epoch": 0.44868499110144355, "step": 4538, "train/total_loss": 0.17039568722248077 }, { "entropy": 9.061538696289062, "epoch": 0.4487838639509591, "mean_token_accuracy": 0.7519999742507935, "num_tokens": 2809537.0, "step": 4539, "train/ce_loss": 0.9472593069076538 }, { "epoch": 0.4487838639509591, "step": 4539, "train/sim_loss": 0.0546875 }, { "epoch": 0.4487838639509591, "step": 4539, "train/total_loss": 0.14941343665122986 }, { "epoch": 0.4488827368004746, "grad_norm": 0.688106894493103, "learning_rate": 8.88023537556248e-06, "loss": 0.1371, "step": 4540 }, { "entropy": 8.731950759887695, "epoch": 0.4488827368004746, "mean_token_accuracy": 0.7028824687004089, "num_tokens": 2814906.0, "step": 4540, "train/ce_loss": 0.5285282135009766 }, { "epoch": 0.4488827368004746, "step": 4540, "train/sim_loss": 0.0234375 }, { "epoch": 0.4488827368004746, "step": 4540, "train/total_loss": 0.0762903243303299 }, { "entropy": 9.08930492401123, "epoch": 0.4489816096499901, "mean_token_accuracy": 0.789002537727356, "num_tokens": 2820117.0, "step": 4541, "train/ce_loss": 1.020498275756836 }, { "epoch": 0.4489816096499901, "step": 4541, "train/sim_loss": 0.05859375 }, { "epoch": 0.4489816096499901, "step": 4541, "train/total_loss": 0.1606435775756836 }, { "entropy": 9.241561889648438, "epoch": 0.44908048249950566, "mean_token_accuracy": 0.7077562212944031, "num_tokens": 2825421.0, "step": 4542, "train/ce_loss": 0.7609159350395203 }, { "epoch": 0.44908048249950566, "step": 4542, "train/sim_loss": 0.06640625 }, { "epoch": 0.44908048249950566, "step": 4542, "train/total_loss": 0.14249783754348755 }, { "entropy": 9.402660369873047, "epoch": 0.44917935534902115, "mean_token_accuracy": 0.7670068144798279, "num_tokens": 2830467.0, "step": 4543, "train/ce_loss": 0.6022214889526367 }, { "epoch": 0.44917935534902115, "step": 4543, "train/sim_loss": 0.05078125 }, { "epoch": 0.44917935534902115, "step": 4543, "train/total_loss": 0.11100339889526367 }, { "entropy": 9.114690780639648, "epoch": 0.4492782281985367, "mean_token_accuracy": 0.7185929417610168, "num_tokens": 2835649.0, "step": 4544, "train/ce_loss": 0.9506949186325073 }, { "epoch": 0.4492782281985367, "step": 4544, "train/sim_loss": 0.08203125 }, { "epoch": 0.4492782281985367, "step": 4544, "train/total_loss": 0.1771007478237152 }, { "entropy": 9.531662940979004, "epoch": 0.44937710104805223, "mean_token_accuracy": 0.7565084099769592, "num_tokens": 2841024.0, "step": 4545, "train/ce_loss": 1.1496831178665161 }, { "epoch": 0.44937710104805223, "step": 4545, "train/sim_loss": 0.078125 }, { "epoch": 0.44937710104805223, "step": 4545, "train/total_loss": 0.19309331476688385 }, { "entropy": 8.738716125488281, "epoch": 0.4494759738975677, "mean_token_accuracy": 0.7577497363090515, "num_tokens": 2846354.0, "step": 4546, "train/ce_loss": 0.7805805802345276 }, { "epoch": 0.4494759738975677, "step": 4546, "train/sim_loss": 0.078125 }, { "epoch": 0.4494759738975677, "step": 4546, "train/total_loss": 0.15618306398391724 }, { "entropy": 8.934268951416016, "epoch": 0.44957484674708326, "mean_token_accuracy": 0.7142857313156128, "num_tokens": 2851684.0, "step": 4547, "train/ce_loss": 0.7433626055717468 }, { "epoch": 0.44957484674708326, "step": 4547, "train/sim_loss": 0.05078125 }, { "epoch": 0.44957484674708326, "step": 4547, "train/total_loss": 0.12511751055717468 }, { "entropy": 8.42918586730957, "epoch": 0.4496737195965988, "mean_token_accuracy": 0.748024582862854, "num_tokens": 2857267.0, "step": 4548, "train/ce_loss": 0.5866866111755371 }, { "epoch": 0.4496737195965988, "step": 4548, "train/sim_loss": 0.01953125 }, { "epoch": 0.4496737195965988, "step": 4548, "train/total_loss": 0.07819990813732147 }, { "entropy": 9.045125961303711, "epoch": 0.4497725924461143, "mean_token_accuracy": 0.7625298500061035, "num_tokens": 2862571.0, "step": 4549, "train/ce_loss": 0.5434145927429199 }, { "epoch": 0.4497725924461143, "step": 4549, "train/sim_loss": 0.06640625 }, { "epoch": 0.4497725924461143, "step": 4549, "train/total_loss": 0.12074771523475647 }, { "entropy": 9.409574508666992, "epoch": 0.4498714652956298, "mean_token_accuracy": 0.7969798445701599, "num_tokens": 2867581.0, "step": 4550, "train/ce_loss": 3.5709465464606183e-06 }, { "epoch": 0.4498714652956298, "step": 4550, "train/sim_loss": 0.0390625 }, { "epoch": 0.4498714652956298, "step": 4550, "train/total_loss": 0.03906285762786865 }, { "entropy": 9.27652359008789, "epoch": 0.44997033814514537, "mean_token_accuracy": 0.6906946301460266, "num_tokens": 2872818.0, "step": 4551, "train/ce_loss": 0.7897971868515015 }, { "epoch": 0.44997033814514537, "step": 4551, "train/sim_loss": 0.078125 }, { "epoch": 0.44997033814514537, "step": 4551, "train/total_loss": 0.1571047306060791 }, { "entropy": 8.805782318115234, "epoch": 0.45006921099466085, "mean_token_accuracy": 0.795258641242981, "num_tokens": 2878284.0, "step": 4552, "train/ce_loss": 0.5840756893157959 }, { "epoch": 0.45006921099466085, "step": 4552, "train/sim_loss": 0.0390625 }, { "epoch": 0.45006921099466085, "step": 4552, "train/total_loss": 0.09747007489204407 }, { "entropy": 8.947723388671875, "epoch": 0.4501680838441764, "mean_token_accuracy": 0.7066817879676819, "num_tokens": 2883651.0, "step": 4553, "train/ce_loss": 0.7233148217201233 }, { "epoch": 0.4501680838441764, "step": 4553, "train/sim_loss": 0.07421875 }, { "epoch": 0.4501680838441764, "step": 4553, "train/total_loss": 0.1465502381324768 }, { "entropy": 8.907604217529297, "epoch": 0.45026695669369193, "mean_token_accuracy": 0.7900485396385193, "num_tokens": 2888999.0, "step": 4554, "train/ce_loss": 0.7272635698318481 }, { "epoch": 0.45026695669369193, "step": 4554, "train/sim_loss": 0.0390625 }, { "epoch": 0.45026695669369193, "step": 4554, "train/total_loss": 0.11178886145353317 }, { "entropy": 9.367353439331055, "epoch": 0.4503658295432074, "mean_token_accuracy": 0.7003610134124756, "num_tokens": 2894023.0, "step": 4555, "train/ce_loss": 0.8232207894325256 }, { "epoch": 0.4503658295432074, "step": 4555, "train/sim_loss": 0.0546875 }, { "epoch": 0.4503658295432074, "step": 4555, "train/total_loss": 0.13700959086418152 }, { "entropy": 9.503122329711914, "epoch": 0.45046470239272296, "mean_token_accuracy": 0.7219662070274353, "num_tokens": 2899121.0, "step": 4556, "train/ce_loss": 1.519469141960144 }, { "epoch": 0.45046470239272296, "step": 4556, "train/sim_loss": 0.09765625 }, { "epoch": 0.45046470239272296, "step": 4556, "train/total_loss": 0.24960316717624664 }, { "entropy": 8.840147018432617, "epoch": 0.4505635752422385, "mean_token_accuracy": 0.7513397932052612, "num_tokens": 2904536.0, "step": 4557, "train/ce_loss": 0.4257037341594696 }, { "epoch": 0.4505635752422385, "step": 4557, "train/sim_loss": 0.0234375 }, { "epoch": 0.4505635752422385, "step": 4557, "train/total_loss": 0.06600787490606308 }, { "entropy": 8.88971996307373, "epoch": 0.450662448091754, "mean_token_accuracy": 0.7338618636131287, "num_tokens": 2909872.0, "step": 4558, "train/ce_loss": 0.4039987623691559 }, { "epoch": 0.450662448091754, "step": 4558, "train/sim_loss": 0.015625 }, { "epoch": 0.450662448091754, "step": 4558, "train/total_loss": 0.05602487549185753 }, { "entropy": 8.467979431152344, "epoch": 0.4507613209412695, "mean_token_accuracy": 0.7732341885566711, "num_tokens": 2915440.0, "step": 4559, "train/ce_loss": 0.8622865080833435 }, { "epoch": 0.4507613209412695, "step": 4559, "train/sim_loss": 0.02734375 }, { "epoch": 0.4507613209412695, "step": 4559, "train/total_loss": 0.11357240378856659 }, { "epoch": 0.45086019379078507, "grad_norm": 0.6074954867362976, "learning_rate": 8.87529051080453e-06, "loss": 0.1409, "step": 4560 }, { "entropy": 8.623495101928711, "epoch": 0.45086019379078507, "mean_token_accuracy": 0.7909091114997864, "num_tokens": 2921052.0, "step": 4560, "train/ce_loss": 0.5332843661308289 }, { "epoch": 0.45086019379078507, "step": 4560, "train/sim_loss": 0.01953125 }, { "epoch": 0.45086019379078507, "step": 4560, "train/total_loss": 0.07285968959331512 }, { "entropy": 9.29909896850586, "epoch": 0.45095906664030055, "mean_token_accuracy": 0.7981510162353516, "num_tokens": 2926179.0, "step": 4561, "train/ce_loss": 2.328898744963226e-06 }, { "epoch": 0.45095906664030055, "step": 4561, "train/sim_loss": 0.02734375 }, { "epoch": 0.45095906664030055, "step": 4561, "train/total_loss": 0.027343982830643654 }, { "entropy": 9.259490013122559, "epoch": 0.4510579394898161, "mean_token_accuracy": 0.6875, "num_tokens": 2931325.0, "step": 4562, "train/ce_loss": 0.7132740616798401 }, { "epoch": 0.4510579394898161, "step": 4562, "train/sim_loss": 0.06640625 }, { "epoch": 0.4510579394898161, "step": 4562, "train/total_loss": 0.13773366808891296 }, { "entropy": 9.044837951660156, "epoch": 0.45115681233933164, "mean_token_accuracy": 0.7777777910232544, "num_tokens": 2936648.0, "step": 4563, "train/ce_loss": 0.6716727018356323 }, { "epoch": 0.45115681233933164, "step": 4563, "train/sim_loss": 0.078125 }, { "epoch": 0.45115681233933164, "step": 4563, "train/total_loss": 0.1452922821044922 }, { "entropy": 9.083380699157715, "epoch": 0.4512556851888471, "mean_token_accuracy": 0.6751207709312439, "num_tokens": 2941976.0, "step": 4564, "train/ce_loss": 1.7078438997268677 }, { "epoch": 0.4512556851888471, "step": 4564, "train/sim_loss": 0.03125 }, { "epoch": 0.4512556851888471, "step": 4564, "train/total_loss": 0.20203439891338348 }, { "entropy": 9.328924179077148, "epoch": 0.45135455803836266, "mean_token_accuracy": 0.675302267074585, "num_tokens": 2946985.0, "step": 4565, "train/ce_loss": 2.5967685360228643e-05 }, { "epoch": 0.45135455803836266, "step": 4565, "train/sim_loss": 0.02734375 }, { "epoch": 0.45135455803836266, "step": 4565, "train/total_loss": 0.027346346527338028 }, { "entropy": 8.78829574584961, "epoch": 0.4514534308878782, "mean_token_accuracy": 0.7212249040603638, "num_tokens": 2952418.0, "step": 4566, "train/ce_loss": 0.8941664099693298 }, { "epoch": 0.4514534308878782, "step": 4566, "train/sim_loss": 0.0625 }, { "epoch": 0.4514534308878782, "step": 4566, "train/total_loss": 0.15191665291786194 }, { "entropy": 9.640069961547852, "epoch": 0.4515523037373937, "mean_token_accuracy": 0.7910714149475098, "num_tokens": 2957380.0, "step": 4567, "train/ce_loss": 6.438088348659221e-06 }, { "epoch": 0.4515523037373937, "step": 4567, "train/sim_loss": 0.046875 }, { "epoch": 0.4515523037373937, "step": 4567, "train/total_loss": 0.046875644475221634 }, { "entropy": 8.637200355529785, "epoch": 0.45165117658690923, "mean_token_accuracy": 0.7096296548843384, "num_tokens": 2962547.0, "step": 4568, "train/ce_loss": 1.6558645963668823 }, { "epoch": 0.45165117658690923, "step": 4568, "train/sim_loss": 0.0859375 }, { "epoch": 0.45165117658690923, "step": 4568, "train/total_loss": 0.2515239715576172 }, { "entropy": 9.362472534179688, "epoch": 0.45175004943642477, "mean_token_accuracy": 0.7953125238418579, "num_tokens": 2967679.0, "step": 4569, "train/ce_loss": 1.5980865555320634e-06 }, { "epoch": 0.45175004943642477, "step": 4569, "train/sim_loss": 0.0625 }, { "epoch": 0.45175004943642477, "step": 4569, "train/total_loss": 0.06250015646219254 }, { "entropy": 9.715399742126465, "epoch": 0.45184892228594026, "mean_token_accuracy": 0.7405303120613098, "num_tokens": 2972687.0, "step": 4570, "train/ce_loss": 1.0586374998092651 }, { "epoch": 0.45184892228594026, "step": 4570, "train/sim_loss": 0.09375 }, { "epoch": 0.45184892228594026, "step": 4570, "train/total_loss": 0.1996137499809265 }, { "entropy": 8.944488525390625, "epoch": 0.4519477951354558, "mean_token_accuracy": 0.7268722653388977, "num_tokens": 2978053.0, "step": 4571, "train/ce_loss": 0.7541816234588623 }, { "epoch": 0.4519477951354558, "step": 4571, "train/sim_loss": 0.046875 }, { "epoch": 0.4519477951354558, "step": 4571, "train/total_loss": 0.12229316681623459 }, { "entropy": 8.754767417907715, "epoch": 0.45204666798497134, "mean_token_accuracy": 0.6900212168693542, "num_tokens": 2983438.0, "step": 4572, "train/ce_loss": 1.0171220302581787 }, { "epoch": 0.45204666798497134, "step": 4572, "train/sim_loss": 0.08203125 }, { "epoch": 0.45204666798497134, "step": 4572, "train/total_loss": 0.1837434470653534 }, { "entropy": 9.43647575378418, "epoch": 0.4521455408344868, "mean_token_accuracy": 0.7944079041481018, "num_tokens": 2988504.0, "step": 4573, "train/ce_loss": 2.7032151592720766e-06 }, { "epoch": 0.4521455408344868, "step": 4573, "train/sim_loss": 0.05078125 }, { "epoch": 0.4521455408344868, "step": 4573, "train/total_loss": 0.05078152194619179 }, { "entropy": 8.957502365112305, "epoch": 0.45224441368400237, "mean_token_accuracy": 0.7759815454483032, "num_tokens": 2993820.0, "step": 4574, "train/ce_loss": 0.5547528862953186 }, { "epoch": 0.45224441368400237, "step": 4574, "train/sim_loss": 0.03125 }, { "epoch": 0.45224441368400237, "step": 4574, "train/total_loss": 0.08672529458999634 }, { "entropy": 9.467229843139648, "epoch": 0.4523432865335179, "mean_token_accuracy": 0.7510373592376709, "num_tokens": 2998736.0, "step": 4575, "train/ce_loss": 1.649519443511963 }, { "epoch": 0.4523432865335179, "step": 4575, "train/sim_loss": 0.140625 }, { "epoch": 0.4523432865335179, "step": 4575, "train/total_loss": 0.30557695031166077 }, { "entropy": 8.758031845092773, "epoch": 0.4524421593830334, "mean_token_accuracy": 0.7468827962875366, "num_tokens": 3004071.0, "step": 4576, "train/ce_loss": 0.7991833686828613 }, { "epoch": 0.4524421593830334, "step": 4576, "train/sim_loss": 0.046875 }, { "epoch": 0.4524421593830334, "step": 4576, "train/total_loss": 0.12679333984851837 }, { "entropy": 9.06222915649414, "epoch": 0.45254103223254893, "mean_token_accuracy": 0.7110438942909241, "num_tokens": 3009153.0, "step": 4577, "train/ce_loss": 0.9158294796943665 }, { "epoch": 0.45254103223254893, "step": 4577, "train/sim_loss": 0.02734375 }, { "epoch": 0.45254103223254893, "step": 4577, "train/total_loss": 0.11892669647932053 }, { "entropy": 8.862366676330566, "epoch": 0.4526399050820645, "mean_token_accuracy": 0.7243852615356445, "num_tokens": 3014601.0, "step": 4578, "train/ce_loss": 1.1242470741271973 }, { "epoch": 0.4526399050820645, "step": 4578, "train/sim_loss": 0.0859375 }, { "epoch": 0.4526399050820645, "step": 4578, "train/total_loss": 0.19836220145225525 }, { "entropy": 9.116423606872559, "epoch": 0.45273877793157996, "mean_token_accuracy": 0.7236255407333374, "num_tokens": 3019671.0, "step": 4579, "train/ce_loss": 0.6466188430786133 }, { "epoch": 0.45273877793157996, "step": 4579, "train/sim_loss": 0.0390625 }, { "epoch": 0.45273877793157996, "step": 4579, "train/total_loss": 0.10372438281774521 }, { "epoch": 0.4528376507810955, "grad_norm": 0.7198318839073181, "learning_rate": 8.870345646046581e-06, "loss": 0.1398, "step": 4580 }, { "entropy": 8.525514602661133, "epoch": 0.4528376507810955, "mean_token_accuracy": 0.7980072498321533, "num_tokens": 3025328.0, "step": 4580, "train/ce_loss": 0.5483117699623108 }, { "epoch": 0.4528376507810955, "step": 4580, "train/sim_loss": 0.09375 }, { "epoch": 0.4528376507810955, "step": 4580, "train/total_loss": 0.14858117699623108 }, { "entropy": 9.093343734741211, "epoch": 0.45293652363061104, "mean_token_accuracy": 0.7529722452163696, "num_tokens": 3030532.0, "step": 4581, "train/ce_loss": 1.0639290809631348 }, { "epoch": 0.45293652363061104, "step": 4581, "train/sim_loss": 0.03515625 }, { "epoch": 0.45293652363061104, "step": 4581, "train/total_loss": 0.14154917001724243 }, { "entropy": 9.027142524719238, "epoch": 0.4530353964801266, "mean_token_accuracy": 0.7376294732093811, "num_tokens": 3035887.0, "step": 4582, "train/ce_loss": 0.9235092997550964 }, { "epoch": 0.4530353964801266, "step": 4582, "train/sim_loss": 0.0546875 }, { "epoch": 0.4530353964801266, "step": 4582, "train/total_loss": 0.14703842997550964 }, { "entropy": 9.52957534790039, "epoch": 0.45313426932964207, "mean_token_accuracy": 0.7444444298744202, "num_tokens": 3040888.0, "step": 4583, "train/ce_loss": 1.0812699794769287 }, { "epoch": 0.45313426932964207, "step": 4583, "train/sim_loss": 0.04296875 }, { "epoch": 0.45313426932964207, "step": 4583, "train/total_loss": 0.15109574794769287 }, { "entropy": 8.962906837463379, "epoch": 0.4532331421791576, "mean_token_accuracy": 0.7603796124458313, "num_tokens": 3046130.0, "step": 4584, "train/ce_loss": 0.6397481560707092 }, { "epoch": 0.4532331421791576, "step": 4584, "train/sim_loss": 0.03125 }, { "epoch": 0.4532331421791576, "step": 4584, "train/total_loss": 0.09522482007741928 }, { "entropy": 8.768574714660645, "epoch": 0.45333201502867315, "mean_token_accuracy": 0.7129135727882385, "num_tokens": 3051537.0, "step": 4585, "train/ce_loss": 0.5469233989715576 }, { "epoch": 0.45333201502867315, "step": 4585, "train/sim_loss": 0.05859375 }, { "epoch": 0.45333201502867315, "step": 4585, "train/total_loss": 0.113286092877388 }, { "entropy": 8.842185974121094, "epoch": 0.45343088787818864, "mean_token_accuracy": 0.7220982313156128, "num_tokens": 3056886.0, "step": 4586, "train/ce_loss": 0.767518937587738 }, { "epoch": 0.45343088787818864, "step": 4586, "train/sim_loss": 0.0390625 }, { "epoch": 0.45343088787818864, "step": 4586, "train/total_loss": 0.11581439524888992 }, { "entropy": 9.495777130126953, "epoch": 0.4535297607277042, "mean_token_accuracy": 0.752525269985199, "num_tokens": 3061907.0, "step": 4587, "train/ce_loss": 1.1620025634765625 }, { "epoch": 0.4535297607277042, "step": 4587, "train/sim_loss": 0.1328125 }, { "epoch": 0.4535297607277042, "step": 4587, "train/total_loss": 0.2490127682685852 }, { "entropy": 8.66620922088623, "epoch": 0.4536286335772197, "mean_token_accuracy": 0.7509652376174927, "num_tokens": 3067414.0, "step": 4588, "train/ce_loss": 0.8208426237106323 }, { "epoch": 0.4536286335772197, "step": 4588, "train/sim_loss": 0.0390625 }, { "epoch": 0.4536286335772197, "step": 4588, "train/total_loss": 0.12114676088094711 }, { "entropy": 9.191309928894043, "epoch": 0.4537275064267352, "mean_token_accuracy": 0.7652173638343811, "num_tokens": 3072543.0, "step": 4589, "train/ce_loss": 0.4373510777950287 }, { "epoch": 0.4537275064267352, "step": 4589, "train/sim_loss": 0.0234375 }, { "epoch": 0.4537275064267352, "step": 4589, "train/total_loss": 0.06717260926961899 }, { "entropy": 9.141670227050781, "epoch": 0.45382637927625075, "mean_token_accuracy": 0.7683615684509277, "num_tokens": 3077693.0, "step": 4590, "train/ce_loss": 0.6883249878883362 }, { "epoch": 0.45382637927625075, "step": 4590, "train/sim_loss": 0.09375 }, { "epoch": 0.45382637927625075, "step": 4590, "train/total_loss": 0.16258250176906586 }, { "entropy": 8.69253158569336, "epoch": 0.4539252521257663, "mean_token_accuracy": 0.7345225811004639, "num_tokens": 3083064.0, "step": 4591, "train/ce_loss": 1.4125851392745972 }, { "epoch": 0.4539252521257663, "step": 4591, "train/sim_loss": 0.0703125 }, { "epoch": 0.4539252521257663, "step": 4591, "train/total_loss": 0.21157102286815643 }, { "entropy": 8.656013488769531, "epoch": 0.4540241249752818, "mean_token_accuracy": 0.7139664888381958, "num_tokens": 3088403.0, "step": 4592, "train/ce_loss": 0.8411061763763428 }, { "epoch": 0.4540241249752818, "step": 4592, "train/sim_loss": 0.0546875 }, { "epoch": 0.4540241249752818, "step": 4592, "train/total_loss": 0.13879811763763428 }, { "entropy": 9.005350112915039, "epoch": 0.4541229978247973, "mean_token_accuracy": 0.7493857741355896, "num_tokens": 3093682.0, "step": 4593, "train/ce_loss": 0.5997017621994019 }, { "epoch": 0.4541229978247973, "step": 4593, "train/sim_loss": 0.0234375 }, { "epoch": 0.4541229978247973, "step": 4593, "train/total_loss": 0.0834076777100563 }, { "entropy": 8.391681671142578, "epoch": 0.45422187067431286, "mean_token_accuracy": 0.7315497994422913, "num_tokens": 3099261.0, "step": 4594, "train/ce_loss": 1.0334621667861938 }, { "epoch": 0.45422187067431286, "step": 4594, "train/sim_loss": 0.04296875 }, { "epoch": 0.45422187067431286, "step": 4594, "train/total_loss": 0.14631497859954834 }, { "entropy": 9.18332290649414, "epoch": 0.45432074352382834, "mean_token_accuracy": 0.752043604850769, "num_tokens": 3104615.0, "step": 4595, "train/ce_loss": 0.8786539435386658 }, { "epoch": 0.45432074352382834, "step": 4595, "train/sim_loss": 0.11328125 }, { "epoch": 0.45432074352382834, "step": 4595, "train/total_loss": 0.20114664733409882 }, { "entropy": 8.647392272949219, "epoch": 0.4544196163733439, "mean_token_accuracy": 0.7243186831474304, "num_tokens": 3110035.0, "step": 4596, "train/ce_loss": 1.0830390453338623 }, { "epoch": 0.4544196163733439, "step": 4596, "train/sim_loss": 0.08984375 }, { "epoch": 0.4544196163733439, "step": 4596, "train/total_loss": 0.19814765453338623 }, { "entropy": 9.091204643249512, "epoch": 0.4545184892228594, "mean_token_accuracy": 0.7458279728889465, "num_tokens": 3115283.0, "step": 4597, "train/ce_loss": 1.1098209619522095 }, { "epoch": 0.4545184892228594, "step": 4597, "train/sim_loss": 0.08984375 }, { "epoch": 0.4545184892228594, "step": 4597, "train/total_loss": 0.20082584023475647 }, { "entropy": 8.805851936340332, "epoch": 0.4546173620723749, "mean_token_accuracy": 0.6880615949630737, "num_tokens": 3120521.0, "step": 4598, "train/ce_loss": 1.021937608718872 }, { "epoch": 0.4546173620723749, "step": 4598, "train/sim_loss": 0.09375 }, { "epoch": 0.4546173620723749, "step": 4598, "train/total_loss": 0.19594377279281616 }, { "entropy": 8.611743927001953, "epoch": 0.45471623492189045, "mean_token_accuracy": 0.7456575632095337, "num_tokens": 3125841.0, "step": 4599, "train/ce_loss": 0.5259397029876709 }, { "epoch": 0.45471623492189045, "step": 4599, "train/sim_loss": 0.04296875 }, { "epoch": 0.45471623492189045, "step": 4599, "train/total_loss": 0.09556272625923157 }, { "epoch": 0.454815107771406, "grad_norm": 0.6952683925628662, "learning_rate": 8.865400781288632e-06, "loss": 0.1411, "step": 4600 }, { "entropy": 9.587837219238281, "epoch": 0.454815107771406, "mean_token_accuracy": 0.8041958212852478, "num_tokens": 3130691.0, "step": 4600, "train/ce_loss": 2.384063691351912e-06 }, { "epoch": 0.454815107771406, "step": 4600, "train/sim_loss": 0.046875 }, { "epoch": 0.454815107771406, "step": 4600, "train/total_loss": 0.0468752384185791 }, { "entropy": 8.851499557495117, "epoch": 0.4549139806209215, "mean_token_accuracy": 0.7266355156898499, "num_tokens": 3136026.0, "step": 4601, "train/ce_loss": 1.0402942895889282 }, { "epoch": 0.4549139806209215, "step": 4601, "train/sim_loss": 0.0625 }, { "epoch": 0.4549139806209215, "step": 4601, "train/total_loss": 0.16652943193912506 }, { "entropy": 9.041091918945312, "epoch": 0.455012853470437, "mean_token_accuracy": 0.6972602605819702, "num_tokens": 3141205.0, "step": 4602, "train/ce_loss": 1.1488741636276245 }, { "epoch": 0.455012853470437, "step": 4602, "train/sim_loss": 0.08984375 }, { "epoch": 0.455012853470437, "step": 4602, "train/total_loss": 0.20473116636276245 }, { "entropy": 9.126296997070312, "epoch": 0.45511172631995256, "mean_token_accuracy": 0.7178423404693604, "num_tokens": 3146394.0, "step": 4603, "train/ce_loss": 1.7863327264785767 }, { "epoch": 0.45511172631995256, "step": 4603, "train/sim_loss": 0.0859375 }, { "epoch": 0.45511172631995256, "step": 4603, "train/total_loss": 0.26457077264785767 }, { "entropy": 9.33647632598877, "epoch": 0.45521059916946804, "mean_token_accuracy": 0.7474302649497986, "num_tokens": 3151505.0, "step": 4604, "train/ce_loss": 1.4108805656433105 }, { "epoch": 0.45521059916946804, "step": 4604, "train/sim_loss": 0.0546875 }, { "epoch": 0.45521059916946804, "step": 4604, "train/total_loss": 0.19577555358409882 }, { "entropy": 8.726951599121094, "epoch": 0.4553094720189836, "mean_token_accuracy": 0.6741154789924622, "num_tokens": 3157061.0, "step": 4605, "train/ce_loss": 1.2111716270446777 }, { "epoch": 0.4553094720189836, "step": 4605, "train/sim_loss": 0.09765625 }, { "epoch": 0.4553094720189836, "step": 4605, "train/total_loss": 0.21877342462539673 }, { "entropy": 8.776931762695312, "epoch": 0.4554083448684991, "mean_token_accuracy": 0.782608687877655, "num_tokens": 3162345.0, "step": 4606, "train/ce_loss": 1.0162129402160645 }, { "epoch": 0.4554083448684991, "step": 4606, "train/sim_loss": 0.0625 }, { "epoch": 0.4554083448684991, "step": 4606, "train/total_loss": 0.16412129998207092 }, { "entropy": 9.911093711853027, "epoch": 0.4555072177180146, "mean_token_accuracy": 0.6822157502174377, "num_tokens": 3167103.0, "step": 4607, "train/ce_loss": 3.440540075302124 }, { "epoch": 0.4555072177180146, "step": 4607, "train/sim_loss": 0.05859375 }, { "epoch": 0.4555072177180146, "step": 4607, "train/total_loss": 0.4026477634906769 }, { "entropy": 9.38325023651123, "epoch": 0.45560609056753015, "mean_token_accuracy": 0.77920001745224, "num_tokens": 3172183.0, "step": 4608, "train/ce_loss": 1.2304880619049072 }, { "epoch": 0.45560609056753015, "step": 4608, "train/sim_loss": 0.05859375 }, { "epoch": 0.45560609056753015, "step": 4608, "train/total_loss": 0.1816425621509552 }, { "entropy": 8.965509414672852, "epoch": 0.4557049634170457, "mean_token_accuracy": 0.7760358452796936, "num_tokens": 3177528.0, "step": 4609, "train/ce_loss": 0.6849386692047119 }, { "epoch": 0.4557049634170457, "step": 4609, "train/sim_loss": 0.1015625 }, { "epoch": 0.4557049634170457, "step": 4609, "train/total_loss": 0.17005637288093567 }, { "entropy": 9.186532020568848, "epoch": 0.4558038362665612, "mean_token_accuracy": 0.7201017737388611, "num_tokens": 3182878.0, "step": 4610, "train/ce_loss": 0.9957976341247559 }, { "epoch": 0.4558038362665612, "step": 4610, "train/sim_loss": 0.078125 }, { "epoch": 0.4558038362665612, "step": 4610, "train/total_loss": 0.17770476639270782 }, { "entropy": 9.407671928405762, "epoch": 0.4559027091160767, "mean_token_accuracy": 0.6867284178733826, "num_tokens": 3187978.0, "step": 4611, "train/ce_loss": 1.1797021627426147 }, { "epoch": 0.4559027091160767, "step": 4611, "train/sim_loss": 0.0625 }, { "epoch": 0.4559027091160767, "step": 4611, "train/total_loss": 0.18047022819519043 }, { "entropy": 9.029430389404297, "epoch": 0.45600158196559226, "mean_token_accuracy": 0.7273809313774109, "num_tokens": 3193290.0, "step": 4612, "train/ce_loss": 0.9973271489143372 }, { "epoch": 0.45600158196559226, "step": 4612, "train/sim_loss": 0.04296875 }, { "epoch": 0.45600158196559226, "step": 4612, "train/total_loss": 0.14270147681236267 }, { "entropy": 9.043212890625, "epoch": 0.45610045481510775, "mean_token_accuracy": 0.762666642665863, "num_tokens": 3198428.0, "step": 4613, "train/ce_loss": 0.5948737263679504 }, { "epoch": 0.45610045481510775, "step": 4613, "train/sim_loss": 0.05078125 }, { "epoch": 0.45610045481510775, "step": 4613, "train/total_loss": 0.11026862263679504 }, { "entropy": 8.697711944580078, "epoch": 0.4561993276646233, "mean_token_accuracy": 0.7046413421630859, "num_tokens": 3203786.0, "step": 4614, "train/ce_loss": 0.8781384229660034 }, { "epoch": 0.4561993276646233, "step": 4614, "train/sim_loss": 0.03515625 }, { "epoch": 0.4561993276646233, "step": 4614, "train/total_loss": 0.1229700967669487 }, { "entropy": 8.974030494689941, "epoch": 0.45629820051413883, "mean_token_accuracy": 0.7385542392730713, "num_tokens": 3209084.0, "step": 4615, "train/ce_loss": 0.943402886390686 }, { "epoch": 0.45629820051413883, "step": 4615, "train/sim_loss": 0.05078125 }, { "epoch": 0.45629820051413883, "step": 4615, "train/total_loss": 0.14512154459953308 }, { "entropy": 9.349186897277832, "epoch": 0.4563970733636543, "mean_token_accuracy": 0.7431610822677612, "num_tokens": 3214154.0, "step": 4616, "train/ce_loss": 2.604249402793357e-06 }, { "epoch": 0.4563970733636543, "step": 4616, "train/sim_loss": 0.0625 }, { "epoch": 0.4563970733636543, "step": 4616, "train/total_loss": 0.06250026077032089 }, { "entropy": 8.613601684570312, "epoch": 0.45649594621316986, "mean_token_accuracy": 0.7882797718048096, "num_tokens": 3219722.0, "step": 4617, "train/ce_loss": 1.5095481872558594 }, { "epoch": 0.45649594621316986, "step": 4617, "train/sim_loss": 0.0703125 }, { "epoch": 0.45649594621316986, "step": 4617, "train/total_loss": 0.22126732766628265 }, { "entropy": 8.749364852905273, "epoch": 0.4565948190626854, "mean_token_accuracy": 0.7271317839622498, "num_tokens": 3224794.0, "step": 4618, "train/ce_loss": 1.9744768451346317e-06 }, { "epoch": 0.4565948190626854, "step": 4618, "train/sim_loss": 0.046875 }, { "epoch": 0.4565948190626854, "step": 4618, "train/total_loss": 0.04687519744038582 }, { "entropy": 9.825479507446289, "epoch": 0.4566936919122009, "mean_token_accuracy": 0.7286821603775024, "num_tokens": 3229581.0, "step": 4619, "train/ce_loss": 1.1511327028274536 }, { "epoch": 0.4566936919122009, "step": 4619, "train/sim_loss": 0.015625 }, { "epoch": 0.4566936919122009, "step": 4619, "train/total_loss": 0.1307382732629776 }, { "epoch": 0.4567925647617164, "grad_norm": 0.8595569729804993, "learning_rate": 8.860455916530684e-06, "loss": 0.1447, "step": 4620 }, { "entropy": 9.360201835632324, "epoch": 0.4567925647617164, "mean_token_accuracy": 0.7547826170921326, "num_tokens": 3234576.0, "step": 4620, "train/ce_loss": 1.0287901163101196 }, { "epoch": 0.4567925647617164, "step": 4620, "train/sim_loss": 0.0546875 }, { "epoch": 0.4567925647617164, "step": 4620, "train/total_loss": 0.15756651759147644 }, { "entropy": 9.104970932006836, "epoch": 0.45689143761123197, "mean_token_accuracy": 0.7006993293762207, "num_tokens": 3239726.0, "step": 4621, "train/ce_loss": 0.5973265171051025 }, { "epoch": 0.45689143761123197, "step": 4621, "train/sim_loss": 0.03125 }, { "epoch": 0.45689143761123197, "step": 4621, "train/total_loss": 0.09098265320062637 }, { "entropy": 8.684115409851074, "epoch": 0.4569903104607475, "mean_token_accuracy": 0.7793522477149963, "num_tokens": 3245193.0, "step": 4622, "train/ce_loss": 0.4985904097557068 }, { "epoch": 0.4569903104607475, "step": 4622, "train/sim_loss": 0.046875 }, { "epoch": 0.4569903104607475, "step": 4622, "train/total_loss": 0.09673404693603516 }, { "entropy": 9.446758270263672, "epoch": 0.457089183310263, "mean_token_accuracy": 0.7712305188179016, "num_tokens": 3250230.0, "step": 4623, "train/ce_loss": 1.3273210525512695 }, { "epoch": 0.457089183310263, "step": 4623, "train/sim_loss": 0.0390625 }, { "epoch": 0.457089183310263, "step": 4623, "train/total_loss": 0.1717946082353592 }, { "entropy": 9.79252815246582, "epoch": 0.45718805615977853, "mean_token_accuracy": 0.7248157262802124, "num_tokens": 3255021.0, "step": 4624, "train/ce_loss": 5.8113864724873565e-06 }, { "epoch": 0.45718805615977853, "step": 4624, "train/sim_loss": 0.05078125 }, { "epoch": 0.45718805615977853, "step": 4624, "train/total_loss": 0.05078183114528656 }, { "entropy": 9.614484786987305, "epoch": 0.4572869290092941, "mean_token_accuracy": 0.7348993420600891, "num_tokens": 3260106.0, "step": 4625, "train/ce_loss": 1.4386826753616333 }, { "epoch": 0.4572869290092941, "step": 4625, "train/sim_loss": 0.07421875 }, { "epoch": 0.4572869290092941, "step": 4625, "train/total_loss": 0.21808701753616333 }, { "entropy": 9.694557189941406, "epoch": 0.45738580185880956, "mean_token_accuracy": 0.7306967973709106, "num_tokens": 3265061.0, "step": 4626, "train/ce_loss": 4.065531811647816e-06 }, { "epoch": 0.45738580185880956, "step": 4626, "train/sim_loss": 0.0390625 }, { "epoch": 0.45738580185880956, "step": 4626, "train/total_loss": 0.03906290605664253 }, { "entropy": 8.791769027709961, "epoch": 0.4574846747083251, "mean_token_accuracy": 0.7226074934005737, "num_tokens": 3270298.0, "step": 4627, "train/ce_loss": 1.35926353931427 }, { "epoch": 0.4574846747083251, "step": 4627, "train/sim_loss": 0.0859375 }, { "epoch": 0.4574846747083251, "step": 4627, "train/total_loss": 0.22186385095119476 }, { "entropy": 9.312559127807617, "epoch": 0.45758354755784064, "mean_token_accuracy": 0.7418879270553589, "num_tokens": 3275443.0, "step": 4628, "train/ce_loss": 0.955916702747345 }, { "epoch": 0.45758354755784064, "step": 4628, "train/sim_loss": 0.0546875 }, { "epoch": 0.45758354755784064, "step": 4628, "train/total_loss": 0.15027916431427002 }, { "entropy": 8.64016056060791, "epoch": 0.4576824204073561, "mean_token_accuracy": 0.7103717923164368, "num_tokens": 3280924.0, "step": 4629, "train/ce_loss": 1.3245102167129517 }, { "epoch": 0.4576824204073561, "step": 4629, "train/sim_loss": 0.08984375 }, { "epoch": 0.4576824204073561, "step": 4629, "train/total_loss": 0.22229477763175964 }, { "entropy": 8.87697982788086, "epoch": 0.45778129325687167, "mean_token_accuracy": 0.698090672492981, "num_tokens": 3286223.0, "step": 4630, "train/ce_loss": 0.564035177230835 }, { "epoch": 0.45778129325687167, "step": 4630, "train/sim_loss": 0.06640625 }, { "epoch": 0.45778129325687167, "step": 4630, "train/total_loss": 0.1228097677230835 }, { "entropy": 9.272031784057617, "epoch": 0.4578801661063872, "mean_token_accuracy": 0.647826075553894, "num_tokens": 3291343.0, "step": 4631, "train/ce_loss": 1.9904353618621826 }, { "epoch": 0.4578801661063872, "step": 4631, "train/sim_loss": 0.125 }, { "epoch": 0.4578801661063872, "step": 4631, "train/total_loss": 0.32404354214668274 }, { "entropy": 8.951705932617188, "epoch": 0.4579790389559027, "mean_token_accuracy": 0.7830626368522644, "num_tokens": 3296658.0, "step": 4632, "train/ce_loss": 0.6698645949363708 }, { "epoch": 0.4579790389559027, "step": 4632, "train/sim_loss": 0.02734375 }, { "epoch": 0.4579790389559027, "step": 4632, "train/total_loss": 0.09433021396398544 }, { "entropy": 8.547548294067383, "epoch": 0.45807791180541824, "mean_token_accuracy": 0.7302573323249817, "num_tokens": 3302243.0, "step": 4633, "train/ce_loss": 1.1055241823196411 }, { "epoch": 0.45807791180541824, "step": 4633, "train/sim_loss": 0.06640625 }, { "epoch": 0.45807791180541824, "step": 4633, "train/total_loss": 0.17695868015289307 }, { "entropy": 9.031946182250977, "epoch": 0.4581767846549338, "mean_token_accuracy": 0.6810126304626465, "num_tokens": 3307521.0, "step": 4634, "train/ce_loss": 0.7403306365013123 }, { "epoch": 0.4581767846549338, "step": 4634, "train/sim_loss": 0.0625 }, { "epoch": 0.4581767846549338, "step": 4634, "train/total_loss": 0.13653306663036346 }, { "entropy": 9.08590316772461, "epoch": 0.45827565750444926, "mean_token_accuracy": 0.7239353656768799, "num_tokens": 3312651.0, "step": 4635, "train/ce_loss": 1.0999616384506226 }, { "epoch": 0.45827565750444926, "step": 4635, "train/sim_loss": 0.02734375 }, { "epoch": 0.45827565750444926, "step": 4635, "train/total_loss": 0.13733991980552673 }, { "entropy": 8.861583709716797, "epoch": 0.4583745303539648, "mean_token_accuracy": 0.760221004486084, "num_tokens": 3318036.0, "step": 4636, "train/ce_loss": 0.7783376574516296 }, { "epoch": 0.4583745303539648, "step": 4636, "train/sim_loss": 0.046875 }, { "epoch": 0.4583745303539648, "step": 4636, "train/total_loss": 0.12470876425504684 }, { "entropy": 9.740151405334473, "epoch": 0.45847340320348035, "mean_token_accuracy": 0.6293245553970337, "num_tokens": 3323038.0, "step": 4637, "train/ce_loss": 1.4067628383636475 }, { "epoch": 0.45847340320348035, "step": 4637, "train/sim_loss": 0.046875 }, { "epoch": 0.45847340320348035, "step": 4637, "train/total_loss": 0.18755128979682922 }, { "entropy": 9.749921798706055, "epoch": 0.45857227605299583, "mean_token_accuracy": 0.7439613342285156, "num_tokens": 3327889.0, "step": 4638, "train/ce_loss": 2.1111593468958745e-06 }, { "epoch": 0.45857227605299583, "step": 4638, "train/sim_loss": 0.01953125 }, { "epoch": 0.45857227605299583, "step": 4638, "train/total_loss": 0.019531460478901863 }, { "entropy": 9.055946350097656, "epoch": 0.45867114890251137, "mean_token_accuracy": 0.678518533706665, "num_tokens": 3333090.0, "step": 4639, "train/ce_loss": 0.7745179533958435 }, { "epoch": 0.45867114890251137, "step": 4639, "train/sim_loss": 0.06640625 }, { "epoch": 0.45867114890251137, "step": 4639, "train/total_loss": 0.14385804533958435 }, { "epoch": 0.4587700217520269, "grad_norm": 0.9158977270126343, "learning_rate": 8.855511051772734e-06, "loss": 0.1441, "step": 4640 }, { "entropy": 9.711057662963867, "epoch": 0.4587700217520269, "mean_token_accuracy": 0.6635338068008423, "num_tokens": 3338025.0, "step": 4640, "train/ce_loss": 1.2220512628555298 }, { "epoch": 0.4587700217520269, "step": 4640, "train/sim_loss": 0.0546875 }, { "epoch": 0.4587700217520269, "step": 4640, "train/total_loss": 0.17689263820648193 }, { "entropy": 9.27509880065918, "epoch": 0.4588688946015424, "mean_token_accuracy": 0.7221324443817139, "num_tokens": 3343035.0, "step": 4641, "train/ce_loss": 0.8792186975479126 }, { "epoch": 0.4588688946015424, "step": 4641, "train/sim_loss": 0.046875 }, { "epoch": 0.4588688946015424, "step": 4641, "train/total_loss": 0.1347968727350235 }, { "entropy": 8.594276428222656, "epoch": 0.45896776745105794, "mean_token_accuracy": 0.703568160533905, "num_tokens": 3348620.0, "step": 4642, "train/ce_loss": 1.3725320100784302 }, { "epoch": 0.45896776745105794, "step": 4642, "train/sim_loss": 0.1484375 }, { "epoch": 0.45896776745105794, "step": 4642, "train/total_loss": 0.2856907248497009 }, { "entropy": 9.057815551757812, "epoch": 0.4590666403005735, "mean_token_accuracy": 0.7582547068595886, "num_tokens": 3353899.0, "step": 4643, "train/ce_loss": 0.656229555606842 }, { "epoch": 0.4590666403005735, "step": 4643, "train/sim_loss": 0.08203125 }, { "epoch": 0.4590666403005735, "step": 4643, "train/total_loss": 0.1476542055606842 }, { "entropy": 9.26467227935791, "epoch": 0.45916551315008897, "mean_token_accuracy": 0.763610303401947, "num_tokens": 3359044.0, "step": 4644, "train/ce_loss": 0.8465744256973267 }, { "epoch": 0.45916551315008897, "step": 4644, "train/sim_loss": 0.06640625 }, { "epoch": 0.45916551315008897, "step": 4644, "train/total_loss": 0.1510636955499649 }, { "entropy": 9.152750015258789, "epoch": 0.4592643859996045, "mean_token_accuracy": 0.7753530144691467, "num_tokens": 3364215.0, "step": 4645, "train/ce_loss": 0.9924351572990417 }, { "epoch": 0.4592643859996045, "step": 4645, "train/sim_loss": 0.05078125 }, { "epoch": 0.4592643859996045, "step": 4645, "train/total_loss": 0.15002477169036865 }, { "entropy": 9.042369842529297, "epoch": 0.45936325884912005, "mean_token_accuracy": 0.7220588326454163, "num_tokens": 3369432.0, "step": 4646, "train/ce_loss": 2.187622547149658 }, { "epoch": 0.45936325884912005, "step": 4646, "train/sim_loss": 0.0546875 }, { "epoch": 0.45936325884912005, "step": 4646, "train/total_loss": 0.27344977855682373 }, { "entropy": 8.96348762512207, "epoch": 0.45946213169863553, "mean_token_accuracy": 0.7243852615356445, "num_tokens": 3374877.0, "step": 4647, "train/ce_loss": 0.5222508311271667 }, { "epoch": 0.45946213169863553, "step": 4647, "train/sim_loss": 0.0390625 }, { "epoch": 0.45946213169863553, "step": 4647, "train/total_loss": 0.09128758311271667 }, { "entropy": 8.688159942626953, "epoch": 0.4595610045481511, "mean_token_accuracy": 0.8160237669944763, "num_tokens": 3380348.0, "step": 4648, "train/ce_loss": 0.5302615761756897 }, { "epoch": 0.4595610045481511, "step": 4648, "train/sim_loss": 0.04296875 }, { "epoch": 0.4595610045481511, "step": 4648, "train/total_loss": 0.09599490463733673 }, { "entropy": 9.019515991210938, "epoch": 0.4596598773976666, "mean_token_accuracy": 0.759096622467041, "num_tokens": 3385587.0, "step": 4649, "train/ce_loss": 0.7856865525245667 }, { "epoch": 0.4596598773976666, "step": 4649, "train/sim_loss": 0.02734375 }, { "epoch": 0.4596598773976666, "step": 4649, "train/total_loss": 0.10591240972280502 }, { "entropy": 9.325714111328125, "epoch": 0.4597587502471821, "mean_token_accuracy": 0.7252747416496277, "num_tokens": 3390662.0, "step": 4650, "train/ce_loss": 1.121762990951538 }, { "epoch": 0.4597587502471821, "step": 4650, "train/sim_loss": 0.04296875 }, { "epoch": 0.4597587502471821, "step": 4650, "train/total_loss": 0.1551450490951538 }, { "entropy": 9.195514678955078, "epoch": 0.45985762309669764, "mean_token_accuracy": 0.718137264251709, "num_tokens": 3396107.0, "step": 4651, "train/ce_loss": 1.328965425491333 }, { "epoch": 0.45985762309669764, "step": 4651, "train/sim_loss": 0.06640625 }, { "epoch": 0.45985762309669764, "step": 4651, "train/total_loss": 0.1993027925491333 }, { "entropy": 9.12911605834961, "epoch": 0.4599564959462132, "mean_token_accuracy": 0.7340686321258545, "num_tokens": 3401403.0, "step": 4652, "train/ce_loss": 0.9378371238708496 }, { "epoch": 0.4599564959462132, "step": 4652, "train/sim_loss": 0.0703125 }, { "epoch": 0.4599564959462132, "step": 4652, "train/total_loss": 0.16409620642662048 }, { "entropy": 9.175090789794922, "epoch": 0.46005536879572867, "mean_token_accuracy": 0.8226857781410217, "num_tokens": 3406640.0, "step": 4653, "train/ce_loss": 1.1295832109681214e-06 }, { "epoch": 0.46005536879572867, "step": 4653, "train/sim_loss": 0.0234375 }, { "epoch": 0.46005536879572867, "step": 4653, "train/total_loss": 0.023437613621354103 }, { "entropy": 9.279150009155273, "epoch": 0.4601542416452442, "mean_token_accuracy": 0.7282758355140686, "num_tokens": 3411757.0, "step": 4654, "train/ce_loss": 1.823083758354187 }, { "epoch": 0.4601542416452442, "step": 4654, "train/sim_loss": 0.125 }, { "epoch": 0.4601542416452442, "step": 4654, "train/total_loss": 0.3073083758354187 }, { "entropy": 8.993045806884766, "epoch": 0.46025311449475975, "mean_token_accuracy": 0.7315855026245117, "num_tokens": 3417034.0, "step": 4655, "train/ce_loss": 0.4706941246986389 }, { "epoch": 0.46025311449475975, "step": 4655, "train/sim_loss": 0.1015625 }, { "epoch": 0.46025311449475975, "step": 4655, "train/total_loss": 0.14863191545009613 }, { "entropy": 9.490364074707031, "epoch": 0.46035198734427524, "mean_token_accuracy": 0.7389830350875854, "num_tokens": 3422013.0, "step": 4656, "train/ce_loss": 1.3873450756072998 }, { "epoch": 0.46035198734427524, "step": 4656, "train/sim_loss": 0.02734375 }, { "epoch": 0.46035198734427524, "step": 4656, "train/total_loss": 0.16607825458049774 }, { "entropy": 9.437137603759766, "epoch": 0.4604508601937908, "mean_token_accuracy": 0.739635169506073, "num_tokens": 3427080.0, "step": 4657, "train/ce_loss": 2.3839854748075595e-06 }, { "epoch": 0.4604508601937908, "step": 4657, "train/sim_loss": 0.03515625 }, { "epoch": 0.4604508601937908, "step": 4657, "train/total_loss": 0.0351564884185791 }, { "entropy": 9.324195861816406, "epoch": 0.4605497330433063, "mean_token_accuracy": 0.7063252925872803, "num_tokens": 3432225.0, "step": 4658, "train/ce_loss": 6.33997342447401e-06 }, { "epoch": 0.4605497330433063, "step": 4658, "train/sim_loss": 0.04296875 }, { "epoch": 0.4605497330433063, "step": 4658, "train/total_loss": 0.04296938329935074 }, { "entropy": 9.167675971984863, "epoch": 0.4606486058928218, "mean_token_accuracy": 0.6957186460494995, "num_tokens": 3437359.0, "step": 4659, "train/ce_loss": 6.890105851198314e-06 }, { "epoch": 0.4606486058928218, "step": 4659, "train/sim_loss": 0.0625 }, { "epoch": 0.4606486058928218, "step": 4659, "train/total_loss": 0.06250068545341492 }, { "epoch": 0.46074747874233735, "grad_norm": 0.9255831241607666, "learning_rate": 8.850566187014787e-06, "loss": 0.1386, "step": 4660 }, { "entropy": 8.594034194946289, "epoch": 0.46074747874233735, "mean_token_accuracy": 0.7196562886238098, "num_tokens": 3442765.0, "step": 4660, "train/ce_loss": 0.7374122738838196 }, { "epoch": 0.46074747874233735, "step": 4660, "train/sim_loss": 0.0546875 }, { "epoch": 0.46074747874233735, "step": 4660, "train/total_loss": 0.12842872738838196 }, { "entropy": 9.10422134399414, "epoch": 0.4608463515918529, "mean_token_accuracy": 0.7849604487419128, "num_tokens": 3447949.0, "step": 4661, "train/ce_loss": 0.9659251570701599 }, { "epoch": 0.4608463515918529, "step": 4661, "train/sim_loss": 0.0859375 }, { "epoch": 0.4608463515918529, "step": 4661, "train/total_loss": 0.182530015707016 }, { "entropy": 9.422708511352539, "epoch": 0.4609452244413684, "mean_token_accuracy": 0.7255244851112366, "num_tokens": 3452957.0, "step": 4662, "train/ce_loss": 0.9071168899536133 }, { "epoch": 0.4609452244413684, "step": 4662, "train/sim_loss": 0.03515625 }, { "epoch": 0.4609452244413684, "step": 4662, "train/total_loss": 0.12586793303489685 }, { "entropy": 8.967698097229004, "epoch": 0.4610440972908839, "mean_token_accuracy": 0.6709601879119873, "num_tokens": 3458249.0, "step": 4663, "train/ce_loss": 1.326856017112732 }, { "epoch": 0.4610440972908839, "step": 4663, "train/sim_loss": 0.0546875 }, { "epoch": 0.4610440972908839, "step": 4663, "train/total_loss": 0.1873731017112732 }, { "entropy": 9.37660026550293, "epoch": 0.46114297014039946, "mean_token_accuracy": 0.7287630438804626, "num_tokens": 3463275.0, "step": 4664, "train/ce_loss": 1.7267590237679542e-06 }, { "epoch": 0.46114297014039946, "step": 4664, "train/sim_loss": 0.02734375 }, { "epoch": 0.46114297014039946, "step": 4664, "train/total_loss": 0.02734392322599888 }, { "entropy": 9.615408897399902, "epoch": 0.461241842989915, "mean_token_accuracy": 0.6708860993385315, "num_tokens": 3468158.0, "step": 4665, "train/ce_loss": 2.5690736770629883 }, { "epoch": 0.461241842989915, "step": 4665, "train/sim_loss": 0.03125 }, { "epoch": 0.461241842989915, "step": 4665, "train/total_loss": 0.2881573736667633 }, { "entropy": 8.768453598022461, "epoch": 0.4613407158394305, "mean_token_accuracy": 0.7710437774658203, "num_tokens": 3473519.0, "step": 4666, "train/ce_loss": 1.3710461854934692 }, { "epoch": 0.4613407158394305, "step": 4666, "train/sim_loss": 0.078125 }, { "epoch": 0.4613407158394305, "step": 4666, "train/total_loss": 0.21522961556911469 }, { "entropy": 9.292464256286621, "epoch": 0.461439588688946, "mean_token_accuracy": 0.762536883354187, "num_tokens": 3478663.0, "step": 4667, "train/ce_loss": 1.1530660390853882 }, { "epoch": 0.461439588688946, "step": 4667, "train/sim_loss": 0.05078125 }, { "epoch": 0.461439588688946, "step": 4667, "train/total_loss": 0.16608786582946777 }, { "entropy": 8.926875114440918, "epoch": 0.46153846153846156, "mean_token_accuracy": 0.7527114748954773, "num_tokens": 3484023.0, "step": 4668, "train/ce_loss": 0.3579964339733124 }, { "epoch": 0.46153846153846156, "step": 4668, "train/sim_loss": 0.0625 }, { "epoch": 0.46153846153846156, "step": 4668, "train/total_loss": 0.09829964488744736 }, { "entropy": 8.580286979675293, "epoch": 0.46163733438797705, "mean_token_accuracy": 0.7695202231407166, "num_tokens": 3489564.0, "step": 4669, "train/ce_loss": 0.6817471981048584 }, { "epoch": 0.46163733438797705, "step": 4669, "train/sim_loss": 0.03515625 }, { "epoch": 0.46163733438797705, "step": 4669, "train/total_loss": 0.10333096981048584 }, { "entropy": 9.17463493347168, "epoch": 0.4617362072374926, "mean_token_accuracy": 0.7695364356040955, "num_tokens": 3494709.0, "step": 4670, "train/ce_loss": 0.9873186945915222 }, { "epoch": 0.4617362072374926, "step": 4670, "train/sim_loss": 0.03125 }, { "epoch": 0.4617362072374926, "step": 4670, "train/total_loss": 0.1299818754196167 }, { "entropy": 9.358234405517578, "epoch": 0.46183508008700813, "mean_token_accuracy": 0.7796102166175842, "num_tokens": 3499961.0, "step": 4671, "train/ce_loss": 1.2731976509094238 }, { "epoch": 0.46183508008700813, "step": 4671, "train/sim_loss": 0.09375 }, { "epoch": 0.46183508008700813, "step": 4671, "train/total_loss": 0.22106976807117462 }, { "entropy": 9.134790420532227, "epoch": 0.4619339529365236, "mean_token_accuracy": 0.7370967864990234, "num_tokens": 3505003.0, "step": 4672, "train/ce_loss": 1.8665708921616897e-05 }, { "epoch": 0.4619339529365236, "step": 4672, "train/sim_loss": 0.09375 }, { "epoch": 0.4619339529365236, "step": 4672, "train/total_loss": 0.09375187009572983 }, { "entropy": 8.932821273803711, "epoch": 0.46203282578603916, "mean_token_accuracy": 0.7169811129570007, "num_tokens": 3510269.0, "step": 4673, "train/ce_loss": 2.768101921901689e-06 }, { "epoch": 0.46203282578603916, "step": 4673, "train/sim_loss": 0.06640625 }, { "epoch": 0.46203282578603916, "step": 4673, "train/total_loss": 0.06640652567148209 }, { "entropy": 9.077056884765625, "epoch": 0.4621316986355547, "mean_token_accuracy": 0.7108433842658997, "num_tokens": 3515392.0, "step": 4674, "train/ce_loss": 1.7194793224334717 }, { "epoch": 0.4621316986355547, "step": 4674, "train/sim_loss": 0.109375 }, { "epoch": 0.4621316986355547, "step": 4674, "train/total_loss": 0.2813229560852051 }, { "entropy": 8.934788703918457, "epoch": 0.4622305714850702, "mean_token_accuracy": 0.6678487062454224, "num_tokens": 3520759.0, "step": 4675, "train/ce_loss": 1.6158264875411987 }, { "epoch": 0.4622305714850702, "step": 4675, "train/sim_loss": 0.0703125 }, { "epoch": 0.4622305714850702, "step": 4675, "train/total_loss": 0.23189514875411987 }, { "entropy": 9.20814037322998, "epoch": 0.4623294443345857, "mean_token_accuracy": 0.7157434225082397, "num_tokens": 3525791.0, "step": 4676, "train/ce_loss": 0.8203097581863403 }, { "epoch": 0.4623294443345857, "step": 4676, "train/sim_loss": 0.05859375 }, { "epoch": 0.4623294443345857, "step": 4676, "train/total_loss": 0.1406247317790985 }, { "entropy": 8.74182415008545, "epoch": 0.46242831718410127, "mean_token_accuracy": 0.7483588457107544, "num_tokens": 3531185.0, "step": 4677, "train/ce_loss": 0.6870768666267395 }, { "epoch": 0.46242831718410127, "step": 4677, "train/sim_loss": 0.046875 }, { "epoch": 0.46242831718410127, "step": 4677, "train/total_loss": 0.11558268964290619 }, { "entropy": 8.711596488952637, "epoch": 0.46252719003361675, "mean_token_accuracy": 0.7013274431228638, "num_tokens": 3536573.0, "step": 4678, "train/ce_loss": 1.0243852138519287 }, { "epoch": 0.46252719003361675, "step": 4678, "train/sim_loss": 0.04296875 }, { "epoch": 0.46252719003361675, "step": 4678, "train/total_loss": 0.1454072743654251 }, { "entropy": 9.993766784667969, "epoch": 0.4626260628831323, "mean_token_accuracy": 0.8398268222808838, "num_tokens": 3541209.0, "step": 4679, "train/ce_loss": 1.9226688146591187 }, { "epoch": 0.4626260628831323, "step": 4679, "train/sim_loss": 0.046875 }, { "epoch": 0.4626260628831323, "step": 4679, "train/total_loss": 0.23914188146591187 }, { "epoch": 0.46272493573264784, "grad_norm": 0.8097333312034607, "learning_rate": 8.845621322256837e-06, "loss": 0.1412, "step": 4680 }, { "entropy": 9.061151504516602, "epoch": 0.46272493573264784, "mean_token_accuracy": 0.6990423798561096, "num_tokens": 3546400.0, "step": 4680, "train/ce_loss": 1.1291780471801758 }, { "epoch": 0.46272493573264784, "step": 4680, "train/sim_loss": 0.0546875 }, { "epoch": 0.46272493573264784, "step": 4680, "train/total_loss": 0.16760531067848206 }, { "entropy": 9.135835647583008, "epoch": 0.4628238085821633, "mean_token_accuracy": 0.8241758346557617, "num_tokens": 3551608.0, "step": 4681, "train/ce_loss": 0.6338585019111633 }, { "epoch": 0.4628238085821633, "step": 4681, "train/sim_loss": 0.01953125 }, { "epoch": 0.4628238085821633, "step": 4681, "train/total_loss": 0.08291710168123245 }, { "entropy": 9.02741813659668, "epoch": 0.46292268143167886, "mean_token_accuracy": 0.7194805145263672, "num_tokens": 3556816.0, "step": 4682, "train/ce_loss": 0.8726161122322083 }, { "epoch": 0.46292268143167886, "step": 4682, "train/sim_loss": 0.03125 }, { "epoch": 0.46292268143167886, "step": 4682, "train/total_loss": 0.1185116097331047 }, { "entropy": 9.087461471557617, "epoch": 0.4630215542811944, "mean_token_accuracy": 0.7063291072845459, "num_tokens": 3562086.0, "step": 4683, "train/ce_loss": 0.5230680704116821 }, { "epoch": 0.4630215542811944, "step": 4683, "train/sim_loss": 0.1015625 }, { "epoch": 0.4630215542811944, "step": 4683, "train/total_loss": 0.15386930108070374 }, { "entropy": 9.792854309082031, "epoch": 0.4631204271307099, "mean_token_accuracy": 0.6943128108978271, "num_tokens": 3566934.0, "step": 4684, "train/ce_loss": 1.461733102798462 }, { "epoch": 0.4631204271307099, "step": 4684, "train/sim_loss": 0.08203125 }, { "epoch": 0.4631204271307099, "step": 4684, "train/total_loss": 0.22820456326007843 }, { "entropy": 9.342538833618164, "epoch": 0.46321929998022543, "mean_token_accuracy": 0.7240896224975586, "num_tokens": 3572019.0, "step": 4685, "train/ce_loss": 1.6679964065551758 }, { "epoch": 0.46321929998022543, "step": 4685, "train/sim_loss": 0.125 }, { "epoch": 0.46321929998022543, "step": 4685, "train/total_loss": 0.2917996644973755 }, { "entropy": 9.116839408874512, "epoch": 0.46331817282974097, "mean_token_accuracy": 0.7314285635948181, "num_tokens": 3577174.0, "step": 4686, "train/ce_loss": 2.474521807016572e-06 }, { "epoch": 0.46331817282974097, "step": 4686, "train/sim_loss": 0.03125 }, { "epoch": 0.46331817282974097, "step": 4686, "train/total_loss": 0.0312502458691597 }, { "entropy": 8.821943283081055, "epoch": 0.46341704567925646, "mean_token_accuracy": 0.6961583495140076, "num_tokens": 3582491.0, "step": 4687, "train/ce_loss": 0.4736084043979645 }, { "epoch": 0.46341704567925646, "step": 4687, "train/sim_loss": 0.046875 }, { "epoch": 0.46341704567925646, "step": 4687, "train/total_loss": 0.09423583745956421 }, { "entropy": 9.354621887207031, "epoch": 0.463515918528772, "mean_token_accuracy": 0.6916524767875671, "num_tokens": 3587542.0, "step": 4688, "train/ce_loss": 0.9794625639915466 }, { "epoch": 0.463515918528772, "step": 4688, "train/sim_loss": 0.0390625 }, { "epoch": 0.463515918528772, "step": 4688, "train/total_loss": 0.13700875639915466 }, { "entropy": 8.913888931274414, "epoch": 0.46361479137828754, "mean_token_accuracy": 0.7779056429862976, "num_tokens": 3592914.0, "step": 4689, "train/ce_loss": 0.6756730079650879 }, { "epoch": 0.46361479137828754, "step": 4689, "train/sim_loss": 0.06640625 }, { "epoch": 0.46361479137828754, "step": 4689, "train/total_loss": 0.13397355377674103 }, { "entropy": 8.87656021118164, "epoch": 0.463713664227803, "mean_token_accuracy": 0.6682986617088318, "num_tokens": 3598233.0, "step": 4690, "train/ce_loss": 0.47903019189834595 }, { "epoch": 0.463713664227803, "step": 4690, "train/sim_loss": 0.046875 }, { "epoch": 0.463713664227803, "step": 4690, "train/total_loss": 0.09477801620960236 }, { "entropy": 8.997529983520508, "epoch": 0.46381253707731857, "mean_token_accuracy": 0.7174940705299377, "num_tokens": 3603499.0, "step": 4691, "train/ce_loss": 0.7582955360412598 }, { "epoch": 0.46381253707731857, "step": 4691, "train/sim_loss": 0.046875 }, { "epoch": 0.46381253707731857, "step": 4691, "train/total_loss": 0.12270455807447433 }, { "entropy": 8.872171401977539, "epoch": 0.4639114099268341, "mean_token_accuracy": 0.7293689250946045, "num_tokens": 3608845.0, "step": 4692, "train/ce_loss": 0.7552477121353149 }, { "epoch": 0.4639114099268341, "step": 4692, "train/sim_loss": 0.046875 }, { "epoch": 0.4639114099268341, "step": 4692, "train/total_loss": 0.12239976972341537 }, { "entropy": 9.48823356628418, "epoch": 0.4640102827763496, "mean_token_accuracy": 0.681208074092865, "num_tokens": 3613840.0, "step": 4693, "train/ce_loss": 0.6615340709686279 }, { "epoch": 0.4640102827763496, "step": 4693, "train/sim_loss": 0.05859375 }, { "epoch": 0.4640102827763496, "step": 4693, "train/total_loss": 0.12474715709686279 }, { "entropy": 8.860837936401367, "epoch": 0.46410915562586513, "mean_token_accuracy": 0.7738748788833618, "num_tokens": 3619213.0, "step": 4694, "train/ce_loss": 0.5516573786735535 }, { "epoch": 0.46410915562586513, "step": 4694, "train/sim_loss": 0.0234375 }, { "epoch": 0.46410915562586513, "step": 4694, "train/total_loss": 0.07860323786735535 }, { "entropy": 8.766907691955566, "epoch": 0.4642080284753807, "mean_token_accuracy": 0.7964988946914673, "num_tokens": 3624624.0, "step": 4695, "train/ce_loss": 0.9928861856460571 }, { "epoch": 0.4642080284753807, "step": 4695, "train/sim_loss": 0.0390625 }, { "epoch": 0.4642080284753807, "step": 4695, "train/total_loss": 0.13835111260414124 }, { "entropy": 8.777790069580078, "epoch": 0.46430690132489616, "mean_token_accuracy": 0.7126436829566956, "num_tokens": 3630097.0, "step": 4696, "train/ce_loss": 1.4684127569198608 }, { "epoch": 0.46430690132489616, "step": 4696, "train/sim_loss": 0.08203125 }, { "epoch": 0.46430690132489616, "step": 4696, "train/total_loss": 0.22887252271175385 }, { "entropy": 9.39077377319336, "epoch": 0.4644057741744117, "mean_token_accuracy": 0.7222222089767456, "num_tokens": 3635247.0, "step": 4697, "train/ce_loss": 1.6382369995117188 }, { "epoch": 0.4644057741744117, "step": 4697, "train/sim_loss": 0.0703125 }, { "epoch": 0.4644057741744117, "step": 4697, "train/total_loss": 0.2341362088918686 }, { "entropy": 9.113056182861328, "epoch": 0.46450464702392724, "mean_token_accuracy": 0.7508532404899597, "num_tokens": 3640724.0, "step": 4698, "train/ce_loss": 0.7351180911064148 }, { "epoch": 0.46450464702392724, "step": 4698, "train/sim_loss": 0.109375 }, { "epoch": 0.46450464702392724, "step": 4698, "train/total_loss": 0.18288680911064148 }, { "entropy": 9.436710357666016, "epoch": 0.4646035198734427, "mean_token_accuracy": 0.7511811256408691, "num_tokens": 3645922.0, "step": 4699, "train/ce_loss": 0.9424399733543396 }, { "epoch": 0.4646035198734427, "step": 4699, "train/sim_loss": 0.03515625 }, { "epoch": 0.4646035198734427, "step": 4699, "train/total_loss": 0.12940025329589844 }, { "epoch": 0.46470239272295827, "grad_norm": 0.6447728276252747, "learning_rate": 8.840676457498888e-06, "loss": 0.1453, "step": 4700 }, { "entropy": 9.457176208496094, "epoch": 0.46470239272295827, "mean_token_accuracy": 0.672913134098053, "num_tokens": 3650937.0, "step": 4700, "train/ce_loss": 0.9132030010223389 }, { "epoch": 0.46470239272295827, "step": 4700, "train/sim_loss": 0.0703125 }, { "epoch": 0.46470239272295827, "step": 4700, "train/total_loss": 0.16163280606269836 }, { "entropy": 9.087905883789062, "epoch": 0.4648012655724738, "mean_token_accuracy": 0.7549574971199036, "num_tokens": 3656119.0, "step": 4701, "train/ce_loss": 1.379743218421936 }, { "epoch": 0.4648012655724738, "step": 4701, "train/sim_loss": 0.04296875 }, { "epoch": 0.4648012655724738, "step": 4701, "train/total_loss": 0.1809430718421936 }, { "entropy": 8.700032234191895, "epoch": 0.4649001384219893, "mean_token_accuracy": 0.736785352230072, "num_tokens": 3661523.0, "step": 4702, "train/ce_loss": 0.7521803975105286 }, { "epoch": 0.4649001384219893, "step": 4702, "train/sim_loss": 0.03515625 }, { "epoch": 0.4649001384219893, "step": 4702, "train/total_loss": 0.11037429422140121 }, { "entropy": 9.367010116577148, "epoch": 0.46499901127150484, "mean_token_accuracy": 0.7763779759407043, "num_tokens": 3666600.0, "step": 4703, "train/ce_loss": 0.8053690791130066 }, { "epoch": 0.46499901127150484, "step": 4703, "train/sim_loss": 0.01953125 }, { "epoch": 0.46499901127150484, "step": 4703, "train/total_loss": 0.10006815940141678 }, { "entropy": 9.050535202026367, "epoch": 0.4650978841210204, "mean_token_accuracy": 0.7229219079017639, "num_tokens": 3671840.0, "step": 4704, "train/ce_loss": 1.0198999643325806 }, { "epoch": 0.4650978841210204, "step": 4704, "train/sim_loss": 0.0859375 }, { "epoch": 0.4650978841210204, "step": 4704, "train/total_loss": 0.1879274994134903 }, { "entropy": 9.247036933898926, "epoch": 0.4651967569705359, "mean_token_accuracy": 0.7175572514533997, "num_tokens": 3676818.0, "step": 4705, "train/ce_loss": 0.8245893120765686 }, { "epoch": 0.4651967569705359, "step": 4705, "train/sim_loss": 0.02734375 }, { "epoch": 0.4651967569705359, "step": 4705, "train/total_loss": 0.10980268567800522 }, { "entropy": 9.679970741271973, "epoch": 0.4652956298200514, "mean_token_accuracy": 0.7616387605667114, "num_tokens": 3681838.0, "step": 4706, "train/ce_loss": 1.5281262903954484e-06 }, { "epoch": 0.4652956298200514, "step": 4706, "train/sim_loss": 0.0390625 }, { "epoch": 0.4652956298200514, "step": 4706, "train/total_loss": 0.03906265273690224 }, { "entropy": 8.594926834106445, "epoch": 0.46539450266956695, "mean_token_accuracy": 0.7148981690406799, "num_tokens": 3687284.0, "step": 4707, "train/ce_loss": 1.0162886381149292 }, { "epoch": 0.46539450266956695, "step": 4707, "train/sim_loss": 0.02734375 }, { "epoch": 0.46539450266956695, "step": 4707, "train/total_loss": 0.1289726197719574 }, { "entropy": 9.453357696533203, "epoch": 0.4654933755190825, "mean_token_accuracy": 0.7584459185600281, "num_tokens": 3692380.0, "step": 4708, "train/ce_loss": 0.9438441395759583 }, { "epoch": 0.4654933755190825, "step": 4708, "train/sim_loss": 0.05859375 }, { "epoch": 0.4654933755190825, "step": 4708, "train/total_loss": 0.15297816693782806 }, { "entropy": 8.775794982910156, "epoch": 0.46559224836859797, "mean_token_accuracy": 0.7706635594367981, "num_tokens": 3697690.0, "step": 4709, "train/ce_loss": 0.7100697755813599 }, { "epoch": 0.46559224836859797, "step": 4709, "train/sim_loss": 0.0234375 }, { "epoch": 0.46559224836859797, "step": 4709, "train/total_loss": 0.09444447606801987 }, { "entropy": 8.7677583694458, "epoch": 0.4656911212181135, "mean_token_accuracy": 0.7585825324058533, "num_tokens": 3703064.0, "step": 4710, "train/ce_loss": 0.8214384317398071 }, { "epoch": 0.4656911212181135, "step": 4710, "train/sim_loss": 0.03515625 }, { "epoch": 0.4656911212181135, "step": 4710, "train/total_loss": 0.11730009317398071 }, { "entropy": 8.944156646728516, "epoch": 0.46578999406762905, "mean_token_accuracy": 0.7311960458755493, "num_tokens": 3708316.0, "step": 4711, "train/ce_loss": 0.7888990640640259 }, { "epoch": 0.46578999406762905, "step": 4711, "train/sim_loss": 0.0859375 }, { "epoch": 0.46578999406762905, "step": 4711, "train/total_loss": 0.1648274064064026 }, { "entropy": 9.67165756225586, "epoch": 0.46588886691714454, "mean_token_accuracy": 0.7676767706871033, "num_tokens": 3713239.0, "step": 4712, "train/ce_loss": 4.874921614828054e-06 }, { "epoch": 0.46588886691714454, "step": 4712, "train/sim_loss": 0.0625 }, { "epoch": 0.46588886691714454, "step": 4712, "train/total_loss": 0.0625004842877388 }, { "entropy": 9.428511619567871, "epoch": 0.4659877397666601, "mean_token_accuracy": 0.7713841199874878, "num_tokens": 3718263.0, "step": 4713, "train/ce_loss": 0.6457332372665405 }, { "epoch": 0.4659877397666601, "step": 4713, "train/sim_loss": 0.04296875 }, { "epoch": 0.4659877397666601, "step": 4713, "train/total_loss": 0.10754207521677017 }, { "entropy": 9.480663299560547, "epoch": 0.4660866126161756, "mean_token_accuracy": 0.7697368264198303, "num_tokens": 3723331.0, "step": 4714, "train/ce_loss": 0.9293239712715149 }, { "epoch": 0.4660866126161756, "step": 4714, "train/sim_loss": 0.0234375 }, { "epoch": 0.4660866126161756, "step": 4714, "train/total_loss": 0.11636989563703537 }, { "entropy": 9.312999725341797, "epoch": 0.4661854854656911, "mean_token_accuracy": 0.7088791728019714, "num_tokens": 3728472.0, "step": 4715, "train/ce_loss": 2.715044502110686e-06 }, { "epoch": 0.4661854854656911, "step": 4715, "train/sim_loss": 0.0703125 }, { "epoch": 0.4661854854656911, "step": 4715, "train/total_loss": 0.07031276822090149 }, { "entropy": 9.983501434326172, "epoch": 0.46628435831520665, "mean_token_accuracy": 0.7093595862388611, "num_tokens": 3733317.0, "step": 4716, "train/ce_loss": 1.5866858120716643e-06 }, { "epoch": 0.46628435831520665, "step": 4716, "train/sim_loss": 0.01953125 }, { "epoch": 0.46628435831520665, "step": 4716, "train/total_loss": 0.019531408324837685 }, { "entropy": 8.879308700561523, "epoch": 0.4663832311647222, "mean_token_accuracy": 0.7541370987892151, "num_tokens": 3738656.0, "step": 4717, "train/ce_loss": 1.1307368278503418 }, { "epoch": 0.4663832311647222, "step": 4717, "train/sim_loss": 0.09765625 }, { "epoch": 0.4663832311647222, "step": 4717, "train/total_loss": 0.2107299268245697 }, { "entropy": 9.347103118896484, "epoch": 0.4664821040142377, "mean_token_accuracy": 0.7087827324867249, "num_tokens": 3743732.0, "step": 4718, "train/ce_loss": 0.7969828844070435 }, { "epoch": 0.4664821040142377, "step": 4718, "train/sim_loss": 0.07421875 }, { "epoch": 0.4664821040142377, "step": 4718, "train/total_loss": 0.15391704440116882 }, { "entropy": 8.976900100708008, "epoch": 0.4665809768637532, "mean_token_accuracy": 0.7645536661148071, "num_tokens": 3749005.0, "step": 4719, "train/ce_loss": 0.543332576751709 }, { "epoch": 0.4665809768637532, "step": 4719, "train/sim_loss": 0.04296875 }, { "epoch": 0.4665809768637532, "step": 4719, "train/total_loss": 0.09730200469493866 }, { "epoch": 0.46667984971326876, "grad_norm": 0.6814486384391785, "learning_rate": 8.83573159274094e-06, "loss": 0.1401, "step": 4720 }, { "entropy": 9.519277572631836, "epoch": 0.46667984971326876, "mean_token_accuracy": 0.6865203976631165, "num_tokens": 3754073.0, "step": 4720, "train/ce_loss": 1.6626695394515991 }, { "epoch": 0.46667984971326876, "step": 4720, "train/sim_loss": 0.0625 }, { "epoch": 0.46667984971326876, "step": 4720, "train/total_loss": 0.22876696288585663 }, { "entropy": 9.098243713378906, "epoch": 0.46677872256278424, "mean_token_accuracy": 0.7195571660995483, "num_tokens": 3759381.0, "step": 4721, "train/ce_loss": 0.9952742457389832 }, { "epoch": 0.46677872256278424, "step": 4721, "train/sim_loss": 0.0625 }, { "epoch": 0.46677872256278424, "step": 4721, "train/total_loss": 0.16202741861343384 }, { "entropy": 8.878311157226562, "epoch": 0.4668775954122998, "mean_token_accuracy": 0.709172248840332, "num_tokens": 3764737.0, "step": 4722, "train/ce_loss": 0.7218380570411682 }, { "epoch": 0.4668775954122998, "step": 4722, "train/sim_loss": 0.0859375 }, { "epoch": 0.4668775954122998, "step": 4722, "train/total_loss": 0.15812131762504578 }, { "entropy": 9.033000946044922, "epoch": 0.4669764682618153, "mean_token_accuracy": 0.7345678806304932, "num_tokens": 3770020.0, "step": 4723, "train/ce_loss": 0.5463669300079346 }, { "epoch": 0.4669764682618153, "step": 4723, "train/sim_loss": 0.078125 }, { "epoch": 0.4669764682618153, "step": 4723, "train/total_loss": 0.13276168704032898 }, { "entropy": 9.114192962646484, "epoch": 0.4670753411113308, "mean_token_accuracy": 0.8257575631141663, "num_tokens": 3775272.0, "step": 4724, "train/ce_loss": 0.45774757862091064 }, { "epoch": 0.4670753411113308, "step": 4724, "train/sim_loss": 0.015625 }, { "epoch": 0.4670753411113308, "step": 4724, "train/total_loss": 0.061399757862091064 }, { "entropy": 9.174012184143066, "epoch": 0.46717421396084635, "mean_token_accuracy": 0.7595474123954773, "num_tokens": 3780403.0, "step": 4725, "train/ce_loss": 0.662388265132904 }, { "epoch": 0.46717421396084635, "step": 4725, "train/sim_loss": 0.03125 }, { "epoch": 0.46717421396084635, "step": 4725, "train/total_loss": 0.09748882800340652 }, { "entropy": 9.906539916992188, "epoch": 0.4672730868103619, "mean_token_accuracy": 0.8241758346557617, "num_tokens": 3785262.0, "step": 4726, "train/ce_loss": 0.9337340593338013 }, { "epoch": 0.4672730868103619, "step": 4726, "train/sim_loss": 0.015625 }, { "epoch": 0.4672730868103619, "step": 4726, "train/total_loss": 0.10899841040372849 }, { "entropy": 8.694759368896484, "epoch": 0.4673719596598774, "mean_token_accuracy": 0.7390710115432739, "num_tokens": 3790497.0, "step": 4727, "train/ce_loss": 0.44753503799438477 }, { "epoch": 0.4673719596598774, "step": 4727, "train/sim_loss": 0.0390625 }, { "epoch": 0.4673719596598774, "step": 4727, "train/total_loss": 0.08381600677967072 }, { "entropy": 9.153018951416016, "epoch": 0.4674708325093929, "mean_token_accuracy": 0.7115384340286255, "num_tokens": 3795913.0, "step": 4728, "train/ce_loss": 1.1834046840667725 }, { "epoch": 0.4674708325093929, "step": 4728, "train/sim_loss": 0.03125 }, { "epoch": 0.4674708325093929, "step": 4728, "train/total_loss": 0.14959046244621277 }, { "entropy": 9.155204772949219, "epoch": 0.46756970535890846, "mean_token_accuracy": 0.7116912603378296, "num_tokens": 3801267.0, "step": 4729, "train/ce_loss": 0.9349053502082825 }, { "epoch": 0.46756970535890846, "step": 4729, "train/sim_loss": 0.0546875 }, { "epoch": 0.46756970535890846, "step": 4729, "train/total_loss": 0.14817804098129272 }, { "entropy": 9.545472145080566, "epoch": 0.46766857820842395, "mean_token_accuracy": 0.7176259160041809, "num_tokens": 3806325.0, "step": 4730, "train/ce_loss": 1.6894537111511454e-06 }, { "epoch": 0.46766857820842395, "step": 4730, "train/sim_loss": 0.05078125 }, { "epoch": 0.46766857820842395, "step": 4730, "train/total_loss": 0.05078141763806343 }, { "entropy": 9.513253211975098, "epoch": 0.4677674510579395, "mean_token_accuracy": 0.7858508825302124, "num_tokens": 3811283.0, "step": 4731, "train/ce_loss": 7.440812169079436e-06 }, { "epoch": 0.4677674510579395, "step": 4731, "train/sim_loss": 0.03515625 }, { "epoch": 0.4677674510579395, "step": 4731, "train/total_loss": 0.03515699505805969 }, { "entropy": 8.996169090270996, "epoch": 0.46786632390745503, "mean_token_accuracy": 0.7416020631790161, "num_tokens": 3816490.0, "step": 4732, "train/ce_loss": 1.2266058921813965 }, { "epoch": 0.46786632390745503, "step": 4732, "train/sim_loss": 0.08203125 }, { "epoch": 0.46786632390745503, "step": 4732, "train/total_loss": 0.2046918421983719 }, { "entropy": 9.178853988647461, "epoch": 0.4679651967569705, "mean_token_accuracy": 0.6740237474441528, "num_tokens": 3821569.0, "step": 4733, "train/ce_loss": 1.269809603691101 }, { "epoch": 0.4679651967569705, "step": 4733, "train/sim_loss": 0.0546875 }, { "epoch": 0.4679651967569705, "step": 4733, "train/total_loss": 0.1816684603691101 }, { "entropy": 9.312187194824219, "epoch": 0.46806406960648606, "mean_token_accuracy": 0.7680000066757202, "num_tokens": 3826623.0, "step": 4734, "train/ce_loss": 3.813394187091035e-06 }, { "epoch": 0.46806406960648606, "step": 4734, "train/sim_loss": 0.05078125 }, { "epoch": 0.46806406960648606, "step": 4734, "train/total_loss": 0.05078162997961044 }, { "entropy": 9.21867847442627, "epoch": 0.4681629424560016, "mean_token_accuracy": 0.7842031121253967, "num_tokens": 3831784.0, "step": 4735, "train/ce_loss": 0.38276079297065735 }, { "epoch": 0.4681629424560016, "step": 4735, "train/sim_loss": 0.078125 }, { "epoch": 0.4681629424560016, "step": 4735, "train/total_loss": 0.1164010763168335 }, { "entropy": 9.087265014648438, "epoch": 0.4682618153055171, "mean_token_accuracy": 0.7552631497383118, "num_tokens": 3836996.0, "step": 4736, "train/ce_loss": 0.8710118532180786 }, { "epoch": 0.4682618153055171, "step": 4736, "train/sim_loss": 0.10546875 }, { "epoch": 0.4682618153055171, "step": 4736, "train/total_loss": 0.19256994128227234 }, { "entropy": 8.832172393798828, "epoch": 0.4683606881550326, "mean_token_accuracy": 0.7178571224212646, "num_tokens": 3842416.0, "step": 4737, "train/ce_loss": 1.8286237716674805 }, { "epoch": 0.4683606881550326, "step": 4737, "train/sim_loss": 0.06640625 }, { "epoch": 0.4683606881550326, "step": 4737, "train/total_loss": 0.24926863610744476 }, { "entropy": 8.908565521240234, "epoch": 0.46845956100454816, "mean_token_accuracy": 0.7598522305488586, "num_tokens": 3847732.0, "step": 4738, "train/ce_loss": 0.5367914438247681 }, { "epoch": 0.46845956100454816, "step": 4738, "train/sim_loss": 0.125 }, { "epoch": 0.46845956100454816, "step": 4738, "train/total_loss": 0.17867913842201233 }, { "entropy": 9.52511215209961, "epoch": 0.46855843385406365, "mean_token_accuracy": 0.7535934448242188, "num_tokens": 3852654.0, "step": 4739, "train/ce_loss": 0.764208197593689 }, { "epoch": 0.46855843385406365, "step": 4739, "train/sim_loss": 0.078125 }, { "epoch": 0.46855843385406365, "step": 4739, "train/total_loss": 0.15454581379890442 }, { "epoch": 0.4686573067035792, "grad_norm": 0.761462390422821, "learning_rate": 8.83078672798299e-06, "loss": 0.1419, "step": 4740 }, { "entropy": 8.753978729248047, "epoch": 0.4686573067035792, "mean_token_accuracy": 0.7377398610115051, "num_tokens": 3858050.0, "step": 4740, "train/ce_loss": 0.5637397766113281 }, { "epoch": 0.4686573067035792, "step": 4740, "train/sim_loss": 0.05078125 }, { "epoch": 0.4686573067035792, "step": 4740, "train/total_loss": 0.10715523362159729 }, { "entropy": 9.044390678405762, "epoch": 0.46875617955309473, "mean_token_accuracy": 0.7644171714782715, "num_tokens": 3863329.0, "step": 4741, "train/ce_loss": 0.9783340096473694 }, { "epoch": 0.46875617955309473, "step": 4741, "train/sim_loss": 0.05859375 }, { "epoch": 0.46875617955309473, "step": 4741, "train/total_loss": 0.15642714500427246 }, { "entropy": 9.420166015625, "epoch": 0.4688550524026102, "mean_token_accuracy": 0.7177033424377441, "num_tokens": 3868419.0, "step": 4742, "train/ce_loss": 1.453926682472229 }, { "epoch": 0.4688550524026102, "step": 4742, "train/sim_loss": 0.12109375 }, { "epoch": 0.4688550524026102, "step": 4742, "train/total_loss": 0.26648640632629395 }, { "entropy": 9.0964994430542, "epoch": 0.46895392525212576, "mean_token_accuracy": 0.7308743000030518, "num_tokens": 3873570.0, "step": 4743, "train/ce_loss": 1.4857808992019272e-06 }, { "epoch": 0.46895392525212576, "step": 4743, "train/sim_loss": 0.05078125 }, { "epoch": 0.46895392525212576, "step": 4743, "train/total_loss": 0.05078139901161194 }, { "entropy": 9.663649559020996, "epoch": 0.4690527981016413, "mean_token_accuracy": 0.7560553550720215, "num_tokens": 3878602.0, "step": 4744, "train/ce_loss": 1.3290947675704956 }, { "epoch": 0.4690527981016413, "step": 4744, "train/sim_loss": 0.05859375 }, { "epoch": 0.4690527981016413, "step": 4744, "train/total_loss": 0.19150322675704956 }, { "entropy": 9.284122467041016, "epoch": 0.4691516709511568, "mean_token_accuracy": 0.671159029006958, "num_tokens": 3883815.0, "step": 4745, "train/ce_loss": 1.277754783630371 }, { "epoch": 0.4691516709511568, "step": 4745, "train/sim_loss": 0.06640625 }, { "epoch": 0.4691516709511568, "step": 4745, "train/total_loss": 0.19418172538280487 }, { "entropy": 9.04500961303711, "epoch": 0.4692505438006723, "mean_token_accuracy": 0.7883211970329285, "num_tokens": 3889091.0, "step": 4746, "train/ce_loss": 0.38756975531578064 }, { "epoch": 0.4692505438006723, "step": 4746, "train/sim_loss": 0.0234375 }, { "epoch": 0.4692505438006723, "step": 4746, "train/total_loss": 0.06219447776675224 }, { "entropy": 9.679605484008789, "epoch": 0.46934941665018787, "mean_token_accuracy": 0.7313974499702454, "num_tokens": 3894034.0, "step": 4747, "train/ce_loss": 0.7483991384506226 }, { "epoch": 0.46934941665018787, "step": 4747, "train/sim_loss": 0.08984375 }, { "epoch": 0.46934941665018787, "step": 4747, "train/total_loss": 0.16468366980552673 }, { "entropy": 9.190399169921875, "epoch": 0.4694482894997034, "mean_token_accuracy": 0.7104247212409973, "num_tokens": 3899261.0, "step": 4748, "train/ce_loss": 1.2331979274749756 }, { "epoch": 0.4694482894997034, "step": 4748, "train/sim_loss": 0.0546875 }, { "epoch": 0.4694482894997034, "step": 4748, "train/total_loss": 0.1780073046684265 }, { "entropy": 9.05646800994873, "epoch": 0.4695471623492189, "mean_token_accuracy": 0.6671069860458374, "num_tokens": 3904435.0, "step": 4749, "train/ce_loss": 1.1537171602249146 }, { "epoch": 0.4695471623492189, "step": 4749, "train/sim_loss": 0.06640625 }, { "epoch": 0.4695471623492189, "step": 4749, "train/total_loss": 0.1817779690027237 }, { "entropy": 9.588630676269531, "epoch": 0.46964603519873444, "mean_token_accuracy": 0.73046875, "num_tokens": 3909387.0, "step": 4750, "train/ce_loss": 0.6952922344207764 }, { "epoch": 0.46964603519873444, "step": 4750, "train/sim_loss": 0.0390625 }, { "epoch": 0.46964603519873444, "step": 4750, "train/total_loss": 0.108591727912426 }, { "entropy": 9.769367218017578, "epoch": 0.46974490804825, "mean_token_accuracy": 0.6651480793952942, "num_tokens": 3914229.0, "step": 4751, "train/ce_loss": 1.5452396869659424 }, { "epoch": 0.46974490804825, "step": 4751, "train/sim_loss": 0.078125 }, { "epoch": 0.46974490804825, "step": 4751, "train/total_loss": 0.23264896869659424 }, { "entropy": 9.275885581970215, "epoch": 0.46984378089776546, "mean_token_accuracy": 0.7324137687683105, "num_tokens": 3919407.0, "step": 4752, "train/ce_loss": 2.936223154392792e-06 }, { "epoch": 0.46984378089776546, "step": 4752, "train/sim_loss": 0.05078125 }, { "epoch": 0.46984378089776546, "step": 4752, "train/total_loss": 0.05078154429793358 }, { "entropy": 10.161367416381836, "epoch": 0.469942653747281, "mean_token_accuracy": 0.7241379022598267, "num_tokens": 3924120.0, "step": 4753, "train/ce_loss": 4.123912731301971e-06 }, { "epoch": 0.469942653747281, "step": 4753, "train/sim_loss": 0.03515625 }, { "epoch": 0.469942653747281, "step": 4753, "train/total_loss": 0.03515666350722313 }, { "entropy": 9.288694381713867, "epoch": 0.47004152659679654, "mean_token_accuracy": 0.729903519153595, "num_tokens": 3929184.0, "step": 4754, "train/ce_loss": 1.5101879853318678e-06 }, { "epoch": 0.47004152659679654, "step": 4754, "train/sim_loss": 0.01953125 }, { "epoch": 0.47004152659679654, "step": 4754, "train/total_loss": 0.019531400874257088 }, { "entropy": 8.630434036254883, "epoch": 0.47014039944631203, "mean_token_accuracy": 0.7230046987533569, "num_tokens": 3934624.0, "step": 4755, "train/ce_loss": 0.6190818548202515 }, { "epoch": 0.47014039944631203, "step": 4755, "train/sim_loss": 0.04296875 }, { "epoch": 0.47014039944631203, "step": 4755, "train/total_loss": 0.10487693548202515 }, { "entropy": 9.039437294006348, "epoch": 0.47023927229582757, "mean_token_accuracy": 0.7242743968963623, "num_tokens": 3939846.0, "step": 4756, "train/ce_loss": 0.5676820874214172 }, { "epoch": 0.47023927229582757, "step": 4756, "train/sim_loss": 0.03515625 }, { "epoch": 0.47023927229582757, "step": 4756, "train/total_loss": 0.09192445874214172 }, { "entropy": 9.227334022521973, "epoch": 0.4703381451453431, "mean_token_accuracy": 0.7220744490623474, "num_tokens": 3945039.0, "step": 4757, "train/ce_loss": 0.6822482943534851 }, { "epoch": 0.4703381451453431, "step": 4757, "train/sim_loss": 0.07421875 }, { "epoch": 0.4703381451453431, "step": 4757, "train/total_loss": 0.14244358241558075 }, { "entropy": 8.718744277954102, "epoch": 0.4704370179948586, "mean_token_accuracy": 0.708149790763855, "num_tokens": 3950437.0, "step": 4758, "train/ce_loss": 0.8345645070075989 }, { "epoch": 0.4704370179948586, "step": 4758, "train/sim_loss": 0.0234375 }, { "epoch": 0.4704370179948586, "step": 4758, "train/total_loss": 0.10689394921064377 }, { "entropy": 9.396023750305176, "epoch": 0.47053589084437414, "mean_token_accuracy": 0.7441497445106506, "num_tokens": 3955485.0, "step": 4759, "train/ce_loss": 1.5894752740859985 }, { "epoch": 0.47053589084437414, "step": 4759, "train/sim_loss": 0.07421875 }, { "epoch": 0.47053589084437414, "step": 4759, "train/total_loss": 0.23316627740859985 }, { "epoch": 0.4706347636938897, "grad_norm": 0.7747063040733337, "learning_rate": 8.825841863225042e-06, "loss": 0.1533, "step": 4760 }, { "entropy": 9.248870849609375, "epoch": 0.4706347636938897, "mean_token_accuracy": 0.6353523135185242, "num_tokens": 3960740.0, "step": 4760, "train/ce_loss": 1.3720840570385917e-06 }, { "epoch": 0.4706347636938897, "step": 4760, "train/sim_loss": 0.03125 }, { "epoch": 0.4706347636938897, "step": 4760, "train/total_loss": 0.03125013783574104 }, { "entropy": 9.110275268554688, "epoch": 0.47073363654340517, "mean_token_accuracy": 0.7091194987297058, "num_tokens": 3965831.0, "step": 4761, "train/ce_loss": 1.8087035417556763 }, { "epoch": 0.47073363654340517, "step": 4761, "train/sim_loss": 0.0859375 }, { "epoch": 0.47073363654340517, "step": 4761, "train/total_loss": 0.2668078541755676 }, { "entropy": 9.457289695739746, "epoch": 0.4708325093929207, "mean_token_accuracy": 0.7428115010261536, "num_tokens": 3970849.0, "step": 4762, "train/ce_loss": 0.5878061652183533 }, { "epoch": 0.4708325093929207, "step": 4762, "train/sim_loss": 0.07421875 }, { "epoch": 0.4708325093929207, "step": 4762, "train/total_loss": 0.13299936056137085 }, { "entropy": 9.001456260681152, "epoch": 0.47093138224243625, "mean_token_accuracy": 0.7251613140106201, "num_tokens": 3976068.0, "step": 4763, "train/ce_loss": 0.7660927772521973 }, { "epoch": 0.47093138224243625, "step": 4763, "train/sim_loss": 0.0234375 }, { "epoch": 0.47093138224243625, "step": 4763, "train/total_loss": 0.10004677623510361 }, { "entropy": 9.152788162231445, "epoch": 0.47103025509195173, "mean_token_accuracy": 0.6974790096282959, "num_tokens": 3981326.0, "step": 4764, "train/ce_loss": 1.1601786613464355 }, { "epoch": 0.47103025509195173, "step": 4764, "train/sim_loss": 0.0546875 }, { "epoch": 0.47103025509195173, "step": 4764, "train/total_loss": 0.1707053780555725 }, { "entropy": 9.557083129882812, "epoch": 0.4711291279414673, "mean_token_accuracy": 0.7285068035125732, "num_tokens": 3986385.0, "step": 4765, "train/ce_loss": 1.3284010887145996 }, { "epoch": 0.4711291279414673, "step": 4765, "train/sim_loss": 0.078125 }, { "epoch": 0.4711291279414673, "step": 4765, "train/total_loss": 0.2109651118516922 }, { "entropy": 9.038352966308594, "epoch": 0.4712280007909828, "mean_token_accuracy": 0.7330827116966248, "num_tokens": 3991665.0, "step": 4766, "train/ce_loss": 0.9846564531326294 }, { "epoch": 0.4712280007909828, "step": 4766, "train/sim_loss": 0.1015625 }, { "epoch": 0.4712280007909828, "step": 4766, "train/total_loss": 0.20002815127372742 }, { "entropy": 9.46235466003418, "epoch": 0.4713268736404983, "mean_token_accuracy": 0.6992248296737671, "num_tokens": 3996722.0, "step": 4767, "train/ce_loss": 1.7705824375152588 }, { "epoch": 0.4713268736404983, "step": 4767, "train/sim_loss": 0.109375 }, { "epoch": 0.4713268736404983, "step": 4767, "train/total_loss": 0.28643324971199036 }, { "entropy": 9.06019115447998, "epoch": 0.47142574649001384, "mean_token_accuracy": 0.743107795715332, "num_tokens": 4002040.0, "step": 4768, "train/ce_loss": 0.8644979000091553 }, { "epoch": 0.47142574649001384, "step": 4768, "train/sim_loss": 0.046875 }, { "epoch": 0.47142574649001384, "step": 4768, "train/total_loss": 0.13332480192184448 }, { "entropy": 8.78596305847168, "epoch": 0.4715246193395294, "mean_token_accuracy": 0.7808510661125183, "num_tokens": 4007471.0, "step": 4769, "train/ce_loss": 0.38614627718925476 }, { "epoch": 0.4715246193395294, "step": 4769, "train/sim_loss": 0.0234375 }, { "epoch": 0.4715246193395294, "step": 4769, "train/total_loss": 0.062052126973867416 }, { "entropy": 9.571614265441895, "epoch": 0.47162349218904487, "mean_token_accuracy": 0.7563636302947998, "num_tokens": 4012473.0, "step": 4770, "train/ce_loss": 0.6596222519874573 }, { "epoch": 0.47162349218904487, "step": 4770, "train/sim_loss": 0.0859375 }, { "epoch": 0.47162349218904487, "step": 4770, "train/total_loss": 0.15189972519874573 }, { "entropy": 9.231966972351074, "epoch": 0.4717223650385604, "mean_token_accuracy": 0.7604617476463318, "num_tokens": 4017597.0, "step": 4771, "train/ce_loss": 9.691642844700254e-06 }, { "epoch": 0.4717223650385604, "step": 4771, "train/sim_loss": 0.03125 }, { "epoch": 0.4717223650385604, "step": 4771, "train/total_loss": 0.0312509685754776 }, { "entropy": 8.980993270874023, "epoch": 0.47182123788807595, "mean_token_accuracy": 0.698924720287323, "num_tokens": 4022916.0, "step": 4772, "train/ce_loss": 0.8316338062286377 }, { "epoch": 0.47182123788807595, "step": 4772, "train/sim_loss": 0.04296875 }, { "epoch": 0.47182123788807595, "step": 4772, "train/total_loss": 0.12613213062286377 }, { "entropy": 8.69944953918457, "epoch": 0.47192011073759144, "mean_token_accuracy": 0.7211538553237915, "num_tokens": 4028488.0, "step": 4773, "train/ce_loss": 1.073030710220337 }, { "epoch": 0.47192011073759144, "step": 4773, "train/sim_loss": 0.05859375 }, { "epoch": 0.47192011073759144, "step": 4773, "train/total_loss": 0.16589683294296265 }, { "entropy": 9.96009635925293, "epoch": 0.472018983587107, "mean_token_accuracy": 0.7681564092636108, "num_tokens": 4033253.0, "step": 4774, "train/ce_loss": 2.6304535367671633e-06 }, { "epoch": 0.472018983587107, "step": 4774, "train/sim_loss": 0.0390625 }, { "epoch": 0.472018983587107, "step": 4774, "train/total_loss": 0.03906276449561119 }, { "entropy": 10.047544479370117, "epoch": 0.4721178564366225, "mean_token_accuracy": 0.7305699586868286, "num_tokens": 4038059.0, "step": 4775, "train/ce_loss": 3.5401205877860775e-06 }, { "epoch": 0.4721178564366225, "step": 4775, "train/sim_loss": 0.0703125 }, { "epoch": 0.4721178564366225, "step": 4775, "train/total_loss": 0.07031285762786865 }, { "entropy": 9.181392669677734, "epoch": 0.472216729286138, "mean_token_accuracy": 0.6970720887184143, "num_tokens": 4043580.0, "step": 4776, "train/ce_loss": 0.6841987371444702 }, { "epoch": 0.472216729286138, "step": 4776, "train/sim_loss": 0.078125 }, { "epoch": 0.472216729286138, "step": 4776, "train/total_loss": 0.14654487371444702 }, { "entropy": 8.766311645507812, "epoch": 0.47231560213565355, "mean_token_accuracy": 0.8162650465965271, "num_tokens": 4049084.0, "step": 4777, "train/ce_loss": 0.48067957162857056 }, { "epoch": 0.47231560213565355, "step": 4777, "train/sim_loss": 0.08203125 }, { "epoch": 0.47231560213565355, "step": 4777, "train/total_loss": 0.13009920716285706 }, { "entropy": 9.338191032409668, "epoch": 0.4724144749851691, "mean_token_accuracy": 0.7837370038032532, "num_tokens": 4054198.0, "step": 4778, "train/ce_loss": 0.6879231929779053 }, { "epoch": 0.4724144749851691, "step": 4778, "train/sim_loss": 0.0546875 }, { "epoch": 0.4724144749851691, "step": 4778, "train/total_loss": 0.12347982078790665 }, { "entropy": 8.6314697265625, "epoch": 0.47251334783468457, "mean_token_accuracy": 0.7299270033836365, "num_tokens": 4059783.0, "step": 4779, "train/ce_loss": 0.6439928412437439 }, { "epoch": 0.47251334783468457, "step": 4779, "train/sim_loss": 0.0625 }, { "epoch": 0.47251334783468457, "step": 4779, "train/total_loss": 0.12689928710460663 }, { "epoch": 0.4726122206842001, "grad_norm": 0.6961796283721924, "learning_rate": 8.820896998467093e-06, "loss": 0.1489, "step": 4780 }, { "entropy": 9.529073715209961, "epoch": 0.4726122206842001, "mean_token_accuracy": 0.7566909790039062, "num_tokens": 4064637.0, "step": 4780, "train/ce_loss": 1.0975415706634521 }, { "epoch": 0.4726122206842001, "step": 4780, "train/sim_loss": 0.109375 }, { "epoch": 0.4726122206842001, "step": 4780, "train/total_loss": 0.21912916004657745 }, { "entropy": 9.070890426635742, "epoch": 0.47271109353371565, "mean_token_accuracy": 0.6548004150390625, "num_tokens": 4070038.0, "step": 4781, "train/ce_loss": 1.7903450727462769 }, { "epoch": 0.47271109353371565, "step": 4781, "train/sim_loss": 0.08984375 }, { "epoch": 0.47271109353371565, "step": 4781, "train/total_loss": 0.2688782811164856 }, { "entropy": 8.831971168518066, "epoch": 0.47280996638323114, "mean_token_accuracy": 0.7632135152816772, "num_tokens": 4075474.0, "step": 4782, "train/ce_loss": 0.7210497260093689 }, { "epoch": 0.47280996638323114, "step": 4782, "train/sim_loss": 0.140625 }, { "epoch": 0.47280996638323114, "step": 4782, "train/total_loss": 0.21272997558116913 }, { "entropy": 9.82503604888916, "epoch": 0.4729088392327467, "mean_token_accuracy": 0.7314049601554871, "num_tokens": 4080392.0, "step": 4783, "train/ce_loss": 1.004560112953186 }, { "epoch": 0.4729088392327467, "step": 4783, "train/sim_loss": 0.0625 }, { "epoch": 0.4729088392327467, "step": 4783, "train/total_loss": 0.16295601427555084 }, { "entropy": 10.069318771362305, "epoch": 0.4730077120822622, "mean_token_accuracy": 0.7079207897186279, "num_tokens": 4085156.0, "step": 4784, "train/ce_loss": 2.222496747970581 }, { "epoch": 0.4730077120822622, "step": 4784, "train/sim_loss": 0.0390625 }, { "epoch": 0.4730077120822622, "step": 4784, "train/total_loss": 0.26131218671798706 }, { "entropy": 8.968809127807617, "epoch": 0.4731065849317777, "mean_token_accuracy": 0.7136611938476562, "num_tokens": 4090538.0, "step": 4785, "train/ce_loss": 1.2665410041809082 }, { "epoch": 0.4731065849317777, "step": 4785, "train/sim_loss": 0.03125 }, { "epoch": 0.4731065849317777, "step": 4785, "train/total_loss": 0.15790410339832306 }, { "entropy": 9.298772811889648, "epoch": 0.47320545778129325, "mean_token_accuracy": 0.7463768124580383, "num_tokens": 4095662.0, "step": 4786, "train/ce_loss": 1.8103519678115845 }, { "epoch": 0.47320545778129325, "step": 4786, "train/sim_loss": 0.07421875 }, { "epoch": 0.47320545778129325, "step": 4786, "train/total_loss": 0.25525397062301636 }, { "entropy": 9.402922630310059, "epoch": 0.4733043306308088, "mean_token_accuracy": 0.6640746593475342, "num_tokens": 4100759.0, "step": 4787, "train/ce_loss": 2.30370831489563 }, { "epoch": 0.4733043306308088, "step": 4787, "train/sim_loss": 0.0703125 }, { "epoch": 0.4733043306308088, "step": 4787, "train/total_loss": 0.30068331956863403 }, { "entropy": 9.477928161621094, "epoch": 0.47340320348032433, "mean_token_accuracy": 0.7245509028434753, "num_tokens": 4105864.0, "step": 4788, "train/ce_loss": 0.534600019454956 }, { "epoch": 0.47340320348032433, "step": 4788, "train/sim_loss": 0.03125 }, { "epoch": 0.47340320348032433, "step": 4788, "train/total_loss": 0.0847100019454956 }, { "entropy": 9.502606391906738, "epoch": 0.4735020763298398, "mean_token_accuracy": 0.7243243455886841, "num_tokens": 4110876.0, "step": 4789, "train/ce_loss": 5.169010819372488e-06 }, { "epoch": 0.4735020763298398, "step": 4789, "train/sim_loss": 0.03125 }, { "epoch": 0.4735020763298398, "step": 4789, "train/total_loss": 0.031250517815351486 }, { "entropy": 9.144216537475586, "epoch": 0.47360094917935536, "mean_token_accuracy": 0.7207943797111511, "num_tokens": 4116224.0, "step": 4790, "train/ce_loss": 0.9613037109375 }, { "epoch": 0.47360094917935536, "step": 4790, "train/sim_loss": 0.12890625 }, { "epoch": 0.47360094917935536, "step": 4790, "train/total_loss": 0.22503662109375 }, { "entropy": 9.424751281738281, "epoch": 0.4736998220288709, "mean_token_accuracy": 0.7784090638160706, "num_tokens": 4121385.0, "step": 4791, "train/ce_loss": 0.6650147438049316 }, { "epoch": 0.4736998220288709, "step": 4791, "train/sim_loss": 0.0234375 }, { "epoch": 0.4736998220288709, "step": 4791, "train/total_loss": 0.08993897587060928 }, { "entropy": 9.131213188171387, "epoch": 0.4737986948783864, "mean_token_accuracy": 0.7556080222129822, "num_tokens": 4126648.0, "step": 4792, "train/ce_loss": 0.741764485836029 }, { "epoch": 0.4737986948783864, "step": 4792, "train/sim_loss": 0.06640625 }, { "epoch": 0.4737986948783864, "step": 4792, "train/total_loss": 0.14058271050453186 }, { "entropy": 9.20728874206543, "epoch": 0.4738975677279019, "mean_token_accuracy": 0.7727891206741333, "num_tokens": 4131876.0, "step": 4793, "train/ce_loss": 0.5385979413986206 }, { "epoch": 0.4738975677279019, "step": 4793, "train/sim_loss": 0.0390625 }, { "epoch": 0.4738975677279019, "step": 4793, "train/total_loss": 0.09292230010032654 }, { "entropy": 9.474047660827637, "epoch": 0.47399644057741747, "mean_token_accuracy": 0.7229102253913879, "num_tokens": 4136981.0, "step": 4794, "train/ce_loss": 1.6584529876708984 }, { "epoch": 0.47399644057741747, "step": 4794, "train/sim_loss": 0.0625 }, { "epoch": 0.47399644057741747, "step": 4794, "train/total_loss": 0.22834530472755432 }, { "entropy": 9.055792808532715, "epoch": 0.47409531342693295, "mean_token_accuracy": 0.7245657444000244, "num_tokens": 4142283.0, "step": 4795, "train/ce_loss": 1.4621721506118774 }, { "epoch": 0.47409531342693295, "step": 4795, "train/sim_loss": 0.04296875 }, { "epoch": 0.47409531342693295, "step": 4795, "train/total_loss": 0.1891859620809555 }, { "entropy": 9.447093963623047, "epoch": 0.4741941862764485, "mean_token_accuracy": 0.7011685967445374, "num_tokens": 4147299.0, "step": 4796, "train/ce_loss": 1.4766745567321777 }, { "epoch": 0.4741941862764485, "step": 4796, "train/sim_loss": 0.10546875 }, { "epoch": 0.4741941862764485, "step": 4796, "train/total_loss": 0.25313621759414673 }, { "entropy": 9.53965950012207, "epoch": 0.47429305912596403, "mean_token_accuracy": 0.7479131817817688, "num_tokens": 4152289.0, "step": 4797, "train/ce_loss": 0.9872329831123352 }, { "epoch": 0.47429305912596403, "step": 4797, "train/sim_loss": 0.0625 }, { "epoch": 0.47429305912596403, "step": 4797, "train/total_loss": 0.16122329235076904 }, { "entropy": 9.425228118896484, "epoch": 0.4743919319754795, "mean_token_accuracy": 0.8389512896537781, "num_tokens": 4157244.0, "step": 4798, "train/ce_loss": 1.3840967416763306 }, { "epoch": 0.4743919319754795, "step": 4798, "train/sim_loss": 0.05859375 }, { "epoch": 0.4743919319754795, "step": 4798, "train/total_loss": 0.19700342416763306 }, { "entropy": 9.233570098876953, "epoch": 0.47449080482499506, "mean_token_accuracy": 0.7161290049552917, "num_tokens": 4162349.0, "step": 4799, "train/ce_loss": 0.9661892652511597 }, { "epoch": 0.47449080482499506, "step": 4799, "train/sim_loss": 0.0703125 }, { "epoch": 0.47449080482499506, "step": 4799, "train/total_loss": 0.1669314205646515 }, { "epoch": 0.4745896776745106, "grad_norm": 0.8365682363510132, "learning_rate": 8.815952133709143e-06, "loss": 0.1458, "step": 4800 }, { "entropy": 9.029359817504883, "epoch": 0.4745896776745106, "mean_token_accuracy": 0.7110582590103149, "num_tokens": 4167644.0, "step": 4800, "train/ce_loss": 0.8061527609825134 }, { "epoch": 0.4745896776745106, "step": 4800, "train/sim_loss": 0.0390625 }, { "epoch": 0.4745896776745106, "step": 4800, "train/total_loss": 0.11967777460813522 }, { "entropy": 9.284017562866211, "epoch": 0.4746885505240261, "mean_token_accuracy": 0.7239999771118164, "num_tokens": 4172992.0, "step": 4801, "train/ce_loss": 1.5423961877822876 }, { "epoch": 0.4746885505240261, "step": 4801, "train/sim_loss": 0.078125 }, { "epoch": 0.4746885505240261, "step": 4801, "train/total_loss": 0.23236462473869324 }, { "entropy": 9.442846298217773, "epoch": 0.47478742337354163, "mean_token_accuracy": 0.7652302980422974, "num_tokens": 4178067.0, "step": 4802, "train/ce_loss": 0.841571033000946 }, { "epoch": 0.47478742337354163, "step": 4802, "train/sim_loss": 0.0859375 }, { "epoch": 0.47478742337354163, "step": 4802, "train/total_loss": 0.17009460926055908 }, { "entropy": 9.568414688110352, "epoch": 0.47488629622305717, "mean_token_accuracy": 0.76936936378479, "num_tokens": 4183034.0, "step": 4803, "train/ce_loss": 0.863416850566864 }, { "epoch": 0.47488629622305717, "step": 4803, "train/sim_loss": 0.04296875 }, { "epoch": 0.47488629622305717, "step": 4803, "train/total_loss": 0.12931042909622192 }, { "entropy": 9.1487455368042, "epoch": 0.47498516907257265, "mean_token_accuracy": 0.7169559597969055, "num_tokens": 4188328.0, "step": 4804, "train/ce_loss": 1.3763349056243896 }, { "epoch": 0.47498516907257265, "step": 4804, "train/sim_loss": 0.0390625 }, { "epoch": 0.47498516907257265, "step": 4804, "train/total_loss": 0.17669598758220673 }, { "entropy": 9.035324096679688, "epoch": 0.4750840419220882, "mean_token_accuracy": 0.6836086511611938, "num_tokens": 4193629.0, "step": 4805, "train/ce_loss": 1.2081576585769653 }, { "epoch": 0.4750840419220882, "step": 4805, "train/sim_loss": 0.05078125 }, { "epoch": 0.4750840419220882, "step": 4805, "train/total_loss": 0.17159701883792877 }, { "entropy": 9.174358367919922, "epoch": 0.47518291477160374, "mean_token_accuracy": 0.7875317931175232, "num_tokens": 4198844.0, "step": 4806, "train/ce_loss": 0.580572783946991 }, { "epoch": 0.47518291477160374, "step": 4806, "train/sim_loss": 0.0546875 }, { "epoch": 0.47518291477160374, "step": 4806, "train/total_loss": 0.1127447783946991 }, { "entropy": 8.938118934631348, "epoch": 0.4752817876211192, "mean_token_accuracy": 0.7669903039932251, "num_tokens": 4204115.0, "step": 4807, "train/ce_loss": 0.3910205662250519 }, { "epoch": 0.4752817876211192, "step": 4807, "train/sim_loss": 0.0625 }, { "epoch": 0.4752817876211192, "step": 4807, "train/total_loss": 0.10160206258296967 }, { "entropy": 9.694165229797363, "epoch": 0.47538066047063476, "mean_token_accuracy": 0.7274549007415771, "num_tokens": 4209030.0, "step": 4808, "train/ce_loss": 1.0041385889053345 }, { "epoch": 0.47538066047063476, "step": 4808, "train/sim_loss": 0.09375 }, { "epoch": 0.47538066047063476, "step": 4808, "train/total_loss": 0.19416385889053345 }, { "entropy": 8.995406150817871, "epoch": 0.4754795333201503, "mean_token_accuracy": 0.7344827651977539, "num_tokens": 4214411.0, "step": 4809, "train/ce_loss": 0.7896780371665955 }, { "epoch": 0.4754795333201503, "step": 4809, "train/sim_loss": 0.02734375 }, { "epoch": 0.4754795333201503, "step": 4809, "train/total_loss": 0.10631155222654343 }, { "entropy": 8.896883964538574, "epoch": 0.4755784061696658, "mean_token_accuracy": 0.7447236180305481, "num_tokens": 4220024.0, "step": 4810, "train/ce_loss": 0.6986748576164246 }, { "epoch": 0.4755784061696658, "step": 4810, "train/sim_loss": 0.015625 }, { "epoch": 0.4755784061696658, "step": 4810, "train/total_loss": 0.08549248427152634 }, { "entropy": 9.272575378417969, "epoch": 0.47567727901918133, "mean_token_accuracy": 0.7357910871505737, "num_tokens": 4225167.0, "step": 4811, "train/ce_loss": 1.4026121561983018e-06 }, { "epoch": 0.47567727901918133, "step": 4811, "train/sim_loss": 0.046875 }, { "epoch": 0.47567727901918133, "step": 4811, "train/total_loss": 0.04687514156103134 }, { "entropy": 9.968191146850586, "epoch": 0.4757761518686969, "mean_token_accuracy": 0.7743902206420898, "num_tokens": 4229949.0, "step": 4812, "train/ce_loss": 6.063038654247066e-06 }, { "epoch": 0.4757761518686969, "step": 4812, "train/sim_loss": 0.046875 }, { "epoch": 0.4757761518686969, "step": 4812, "train/total_loss": 0.04687560722231865 }, { "entropy": 9.479616165161133, "epoch": 0.47587502471821236, "mean_token_accuracy": 0.7243402004241943, "num_tokens": 4235067.0, "step": 4813, "train/ce_loss": 1.4710017442703247 }, { "epoch": 0.47587502471821236, "step": 4813, "train/sim_loss": 0.0703125 }, { "epoch": 0.47587502471821236, "step": 4813, "train/total_loss": 0.21741268038749695 }, { "entropy": 9.045944213867188, "epoch": 0.4759738975677279, "mean_token_accuracy": 0.7221006751060486, "num_tokens": 4240480.0, "step": 4814, "train/ce_loss": 1.0600768327713013 }, { "epoch": 0.4759738975677279, "step": 4814, "train/sim_loss": 0.05078125 }, { "epoch": 0.4759738975677279, "step": 4814, "train/total_loss": 0.15678894519805908 }, { "entropy": 8.691741943359375, "epoch": 0.47607277041724344, "mean_token_accuracy": 0.7612565159797668, "num_tokens": 4245936.0, "step": 4815, "train/ce_loss": 0.5213047862052917 }, { "epoch": 0.47607277041724344, "step": 4815, "train/sim_loss": 0.0390625 }, { "epoch": 0.47607277041724344, "step": 4815, "train/total_loss": 0.09119297564029694 }, { "entropy": 9.274696350097656, "epoch": 0.4761716432667589, "mean_token_accuracy": 0.7519747018814087, "num_tokens": 4251038.0, "step": 4816, "train/ce_loss": 0.9562940001487732 }, { "epoch": 0.4761716432667589, "step": 4816, "train/sim_loss": 0.0703125 }, { "epoch": 0.4761716432667589, "step": 4816, "train/total_loss": 0.16594189405441284 }, { "entropy": 9.602999687194824, "epoch": 0.47627051611627447, "mean_token_accuracy": 0.7448747158050537, "num_tokens": 4255925.0, "step": 4817, "train/ce_loss": 1.7758719650373678e-06 }, { "epoch": 0.47627051611627447, "step": 4817, "train/sim_loss": 0.05078125 }, { "epoch": 0.47627051611627447, "step": 4817, "train/total_loss": 0.050781428813934326 }, { "entropy": 8.861591339111328, "epoch": 0.47636938896579, "mean_token_accuracy": 0.7080820202827454, "num_tokens": 4261243.0, "step": 4818, "train/ce_loss": 1.1278631687164307 }, { "epoch": 0.47636938896579, "step": 4818, "train/sim_loss": 0.10546875 }, { "epoch": 0.47636938896579, "step": 4818, "train/total_loss": 0.21825507283210754 }, { "entropy": 9.071257591247559, "epoch": 0.4764682618153055, "mean_token_accuracy": 0.7853535413742065, "num_tokens": 4266473.0, "step": 4819, "train/ce_loss": 0.601017415523529 }, { "epoch": 0.4764682618153055, "step": 4819, "train/sim_loss": 0.015625 }, { "epoch": 0.4764682618153055, "step": 4819, "train/total_loss": 0.07572674751281738 }, { "epoch": 0.47656713466482103, "grad_norm": 0.683932900428772, "learning_rate": 8.811007268951196e-06, "loss": 0.1375, "step": 4820 }, { "entropy": 8.734341621398926, "epoch": 0.47656713466482103, "mean_token_accuracy": 0.7373225092887878, "num_tokens": 4272090.0, "step": 4820, "train/ce_loss": 1.2590872049331665 }, { "epoch": 0.47656713466482103, "step": 4820, "train/sim_loss": 0.109375 }, { "epoch": 0.47656713466482103, "step": 4820, "train/total_loss": 0.2352837175130844 }, { "entropy": 9.463361740112305, "epoch": 0.4766660075143366, "mean_token_accuracy": 0.7535545229911804, "num_tokens": 4277158.0, "step": 4821, "train/ce_loss": 0.7862908244132996 }, { "epoch": 0.4766660075143366, "step": 4821, "train/sim_loss": 0.04296875 }, { "epoch": 0.4766660075143366, "step": 4821, "train/total_loss": 0.12159783393144608 }, { "entropy": 8.814202308654785, "epoch": 0.47676488036385206, "mean_token_accuracy": 0.7385892271995544, "num_tokens": 4282650.0, "step": 4822, "train/ce_loss": 1.111302375793457 }, { "epoch": 0.47676488036385206, "step": 4822, "train/sim_loss": 0.0859375 }, { "epoch": 0.47676488036385206, "step": 4822, "train/total_loss": 0.1970677375793457 }, { "entropy": 8.99859619140625, "epoch": 0.4768637532133676, "mean_token_accuracy": 0.7559366822242737, "num_tokens": 4287895.0, "step": 4823, "train/ce_loss": 0.7384012937545776 }, { "epoch": 0.4768637532133676, "step": 4823, "train/sim_loss": 0.078125 }, { "epoch": 0.4768637532133676, "step": 4823, "train/total_loss": 0.15196514129638672 }, { "entropy": 8.797042846679688, "epoch": 0.47696262606288314, "mean_token_accuracy": 0.7273743152618408, "num_tokens": 4293266.0, "step": 4824, "train/ce_loss": 0.8045260310173035 }, { "epoch": 0.47696262606288314, "step": 4824, "train/sim_loss": 0.078125 }, { "epoch": 0.47696262606288314, "step": 4824, "train/total_loss": 0.15857760608196259 }, { "entropy": 9.423627853393555, "epoch": 0.47706149891239863, "mean_token_accuracy": 0.7994056344032288, "num_tokens": 4298359.0, "step": 4825, "train/ce_loss": 0.8552690744400024 }, { "epoch": 0.47706149891239863, "step": 4825, "train/sim_loss": 0.046875 }, { "epoch": 0.47706149891239863, "step": 4825, "train/total_loss": 0.13240191340446472 }, { "entropy": 9.24445629119873, "epoch": 0.47716037176191417, "mean_token_accuracy": 0.7112902998924255, "num_tokens": 4303382.0, "step": 4826, "train/ce_loss": 1.6132769584655762 }, { "epoch": 0.47716037176191417, "step": 4826, "train/sim_loss": 0.078125 }, { "epoch": 0.47716037176191417, "step": 4826, "train/total_loss": 0.23945270478725433 }, { "entropy": 9.483026504516602, "epoch": 0.4772592446114297, "mean_token_accuracy": 0.7545605301856995, "num_tokens": 4308358.0, "step": 4827, "train/ce_loss": 0.869056224822998 }, { "epoch": 0.4772592446114297, "step": 4827, "train/sim_loss": 0.0390625 }, { "epoch": 0.4772592446114297, "step": 4827, "train/total_loss": 0.12596812844276428 }, { "entropy": 9.725470542907715, "epoch": 0.4773581174609452, "mean_token_accuracy": 0.7925636172294617, "num_tokens": 4313474.0, "step": 4828, "train/ce_loss": 1.4528536796569824 }, { "epoch": 0.4773581174609452, "step": 4828, "train/sim_loss": 0.07421875 }, { "epoch": 0.4773581174609452, "step": 4828, "train/total_loss": 0.21950411796569824 }, { "entropy": 9.19894027709961, "epoch": 0.47745699031046074, "mean_token_accuracy": 0.6939040422439575, "num_tokens": 4318687.0, "step": 4829, "train/ce_loss": 1.0451105833053589 }, { "epoch": 0.47745699031046074, "step": 4829, "train/sim_loss": 0.05859375 }, { "epoch": 0.47745699031046074, "step": 4829, "train/total_loss": 0.1631048023700714 }, { "entropy": 9.40733814239502, "epoch": 0.4775558631599763, "mean_token_accuracy": 0.7928994297981262, "num_tokens": 4323836.0, "step": 4830, "train/ce_loss": 5.014096586819505e-06 }, { "epoch": 0.4775558631599763, "step": 4830, "train/sim_loss": 0.05078125 }, { "epoch": 0.4775558631599763, "step": 4830, "train/total_loss": 0.05078175291419029 }, { "entropy": 9.185754776000977, "epoch": 0.4776547360094918, "mean_token_accuracy": 0.7546418905258179, "num_tokens": 4329014.0, "step": 4831, "train/ce_loss": 0.9354567527770996 }, { "epoch": 0.4776547360094918, "step": 4831, "train/sim_loss": 0.046875 }, { "epoch": 0.4776547360094918, "step": 4831, "train/total_loss": 0.14042067527770996 }, { "entropy": 9.714949607849121, "epoch": 0.4777536088590073, "mean_token_accuracy": 0.7931034564971924, "num_tokens": 4333916.0, "step": 4832, "train/ce_loss": 1.6440129280090332 }, { "epoch": 0.4777536088590073, "step": 4832, "train/sim_loss": 0.07421875 }, { "epoch": 0.4777536088590073, "step": 4832, "train/total_loss": 0.23862004280090332 }, { "entropy": 9.325277328491211, "epoch": 0.47785248170852285, "mean_token_accuracy": 0.7177321910858154, "num_tokens": 4339216.0, "step": 4833, "train/ce_loss": 1.1986669505859027e-06 }, { "epoch": 0.47785248170852285, "step": 4833, "train/sim_loss": 0.0390625 }, { "epoch": 0.47785248170852285, "step": 4833, "train/total_loss": 0.03906261920928955 }, { "entropy": 9.78929328918457, "epoch": 0.4779513545580384, "mean_token_accuracy": 0.8075221180915833, "num_tokens": 4344103.0, "step": 4834, "train/ce_loss": 1.0145269632339478 }, { "epoch": 0.4779513545580384, "step": 4834, "train/sim_loss": 0.0546875 }, { "epoch": 0.4779513545580384, "step": 4834, "train/total_loss": 0.15614020824432373 }, { "entropy": 9.039548873901367, "epoch": 0.4780502274075539, "mean_token_accuracy": 0.730681836605072, "num_tokens": 4349480.0, "step": 4835, "train/ce_loss": 0.5511088371276855 }, { "epoch": 0.4780502274075539, "step": 4835, "train/sim_loss": 0.05859375 }, { "epoch": 0.4780502274075539, "step": 4835, "train/total_loss": 0.1137046366930008 }, { "entropy": 8.990793228149414, "epoch": 0.4781491002570694, "mean_token_accuracy": 0.6390804648399353, "num_tokens": 4354945.0, "step": 4836, "train/ce_loss": 1.185433030128479 }, { "epoch": 0.4781491002570694, "step": 4836, "train/sim_loss": 0.0625 }, { "epoch": 0.4781491002570694, "step": 4836, "train/total_loss": 0.18104329705238342 }, { "entropy": 9.270225524902344, "epoch": 0.47824797310658496, "mean_token_accuracy": 0.7189542651176453, "num_tokens": 4360128.0, "step": 4837, "train/ce_loss": 0.6311092376708984 }, { "epoch": 0.47824797310658496, "step": 4837, "train/sim_loss": 0.0625 }, { "epoch": 0.47824797310658496, "step": 4837, "train/total_loss": 0.12561091780662537 }, { "entropy": 9.547813415527344, "epoch": 0.47834684595610044, "mean_token_accuracy": 0.8039867281913757, "num_tokens": 4365148.0, "step": 4838, "train/ce_loss": 0.6666879653930664 }, { "epoch": 0.47834684595610044, "step": 4838, "train/sim_loss": 0.0390625 }, { "epoch": 0.47834684595610044, "step": 4838, "train/total_loss": 0.105731301009655 }, { "entropy": 9.311174392700195, "epoch": 0.478445718805616, "mean_token_accuracy": 0.7417582273483276, "num_tokens": 4370360.0, "step": 4839, "train/ce_loss": 0.6862412691116333 }, { "epoch": 0.478445718805616, "step": 4839, "train/sim_loss": 0.09375 }, { "epoch": 0.478445718805616, "step": 4839, "train/total_loss": 0.16237413883209229 }, { "epoch": 0.4785445916551315, "grad_norm": 0.6970511674880981, "learning_rate": 8.806062404193246e-06, "loss": 0.1414, "step": 4840 }, { "entropy": 9.550680160522461, "epoch": 0.4785445916551315, "mean_token_accuracy": 0.7301587462425232, "num_tokens": 4375402.0, "step": 4840, "train/ce_loss": 0.864875316619873 }, { "epoch": 0.4785445916551315, "step": 4840, "train/sim_loss": 0.078125 }, { "epoch": 0.4785445916551315, "step": 4840, "train/total_loss": 0.1646125316619873 }, { "entropy": 9.34185791015625, "epoch": 0.478643464504647, "mean_token_accuracy": 0.6724637746810913, "num_tokens": 4380581.0, "step": 4841, "train/ce_loss": 1.8873491287231445 }, { "epoch": 0.478643464504647, "step": 4841, "train/sim_loss": 0.140625 }, { "epoch": 0.478643464504647, "step": 4841, "train/total_loss": 0.32935991883277893 }, { "entropy": 9.676137924194336, "epoch": 0.47874233735416255, "mean_token_accuracy": 0.784380316734314, "num_tokens": 4385628.0, "step": 4842, "train/ce_loss": 3.004107156812097e-06 }, { "epoch": 0.47874233735416255, "step": 4842, "train/sim_loss": 0.078125 }, { "epoch": 0.47874233735416255, "step": 4842, "train/total_loss": 0.07812529802322388 }, { "entropy": 9.124101638793945, "epoch": 0.4788412102036781, "mean_token_accuracy": 0.8434210419654846, "num_tokens": 4390852.0, "step": 4843, "train/ce_loss": 0.5420211553573608 }, { "epoch": 0.4788412102036781, "step": 4843, "train/sim_loss": 0.01953125 }, { "epoch": 0.4788412102036781, "step": 4843, "train/total_loss": 0.0737333670258522 }, { "entropy": 9.000953674316406, "epoch": 0.4789400830531936, "mean_token_accuracy": 0.7689393758773804, "num_tokens": 4396117.0, "step": 4844, "train/ce_loss": 0.6013636589050293 }, { "epoch": 0.4789400830531936, "step": 4844, "train/sim_loss": 0.109375 }, { "epoch": 0.4789400830531936, "step": 4844, "train/total_loss": 0.1695113629102707 }, { "entropy": 8.986391067504883, "epoch": 0.4790389559027091, "mean_token_accuracy": 0.7383784055709839, "num_tokens": 4401542.0, "step": 4845, "train/ce_loss": 1.416501760482788 }, { "epoch": 0.4790389559027091, "step": 4845, "train/sim_loss": 0.109375 }, { "epoch": 0.4790389559027091, "step": 4845, "train/total_loss": 0.2510251998901367 }, { "entropy": 9.099742889404297, "epoch": 0.47913782875222466, "mean_token_accuracy": 0.7825000286102295, "num_tokens": 4406841.0, "step": 4846, "train/ce_loss": 1.2701870203018188 }, { "epoch": 0.47913782875222466, "step": 4846, "train/sim_loss": 0.078125 }, { "epoch": 0.47913782875222466, "step": 4846, "train/total_loss": 0.20514370501041412 }, { "entropy": 8.608301162719727, "epoch": 0.47923670160174014, "mean_token_accuracy": 0.7108571529388428, "num_tokens": 4412193.0, "step": 4847, "train/ce_loss": 0.48382073640823364 }, { "epoch": 0.47923670160174014, "step": 4847, "train/sim_loss": 0.109375 }, { "epoch": 0.47923670160174014, "step": 4847, "train/total_loss": 0.15775707364082336 }, { "entropy": 8.367998123168945, "epoch": 0.4793355744512557, "mean_token_accuracy": 0.6746666431427002, "num_tokens": 4417897.0, "step": 4848, "train/ce_loss": 1.1643277406692505 }, { "epoch": 0.4793355744512557, "step": 4848, "train/sim_loss": 0.17578125 }, { "epoch": 0.4793355744512557, "step": 4848, "train/total_loss": 0.292214035987854 }, { "entropy": 8.996297836303711, "epoch": 0.4794344473007712, "mean_token_accuracy": 0.7831021547317505, "num_tokens": 4423119.0, "step": 4849, "train/ce_loss": 0.7570226788520813 }, { "epoch": 0.4794344473007712, "step": 4849, "train/sim_loss": 0.04296875 }, { "epoch": 0.4794344473007712, "step": 4849, "train/total_loss": 0.11867102235555649 }, { "entropy": 9.125381469726562, "epoch": 0.4795333201502867, "mean_token_accuracy": 0.7447090148925781, "num_tokens": 4428353.0, "step": 4850, "train/ce_loss": 0.8565332293510437 }, { "epoch": 0.4795333201502867, "step": 4850, "train/sim_loss": 0.08203125 }, { "epoch": 0.4795333201502867, "step": 4850, "train/total_loss": 0.16768458485603333 }, { "entropy": 9.312715530395508, "epoch": 0.47963219299980225, "mean_token_accuracy": 0.7510204315185547, "num_tokens": 4433506.0, "step": 4851, "train/ce_loss": 1.4466944932937622 }, { "epoch": 0.47963219299980225, "step": 4851, "train/sim_loss": 0.09375 }, { "epoch": 0.47963219299980225, "step": 4851, "train/total_loss": 0.23841945827007294 }, { "entropy": 9.121928215026855, "epoch": 0.4797310658493178, "mean_token_accuracy": 0.7729918360710144, "num_tokens": 4438799.0, "step": 4852, "train/ce_loss": 0.7193275094032288 }, { "epoch": 0.4797310658493178, "step": 4852, "train/sim_loss": 0.06640625 }, { "epoch": 0.4797310658493178, "step": 4852, "train/total_loss": 0.13833901286125183 }, { "entropy": 9.010169982910156, "epoch": 0.4798299386988333, "mean_token_accuracy": 0.7578418850898743, "num_tokens": 4444061.0, "step": 4853, "train/ce_loss": 0.5989353656768799 }, { "epoch": 0.4798299386988333, "step": 4853, "train/sim_loss": 0.05859375 }, { "epoch": 0.4798299386988333, "step": 4853, "train/total_loss": 0.11848728358745575 }, { "entropy": 8.95584487915039, "epoch": 0.4799288115483488, "mean_token_accuracy": 0.8355827927589417, "num_tokens": 4449334.0, "step": 4854, "train/ce_loss": 0.47817134857177734 }, { "epoch": 0.4799288115483488, "step": 4854, "train/sim_loss": 0.03125 }, { "epoch": 0.4799288115483488, "step": 4854, "train/total_loss": 0.07906714081764221 }, { "entropy": 9.436650276184082, "epoch": 0.48002768439786436, "mean_token_accuracy": 0.7937008142471313, "num_tokens": 4454463.0, "step": 4855, "train/ce_loss": 0.5432444214820862 }, { "epoch": 0.48002768439786436, "step": 4855, "train/sim_loss": 0.046875 }, { "epoch": 0.48002768439786436, "step": 4855, "train/total_loss": 0.1011994481086731 }, { "entropy": 8.903467178344727, "epoch": 0.48012655724737985, "mean_token_accuracy": 0.7973856329917908, "num_tokens": 4459866.0, "step": 4856, "train/ce_loss": 0.42568978667259216 }, { "epoch": 0.48012655724737985, "step": 4856, "train/sim_loss": 0.0390625 }, { "epoch": 0.48012655724737985, "step": 4856, "train/total_loss": 0.08163148164749146 }, { "entropy": 8.95402717590332, "epoch": 0.4802254300968954, "mean_token_accuracy": 0.6666666865348816, "num_tokens": 4465094.0, "step": 4857, "train/ce_loss": 0.8045952320098877 }, { "epoch": 0.4802254300968954, "step": 4857, "train/sim_loss": 0.08203125 }, { "epoch": 0.4802254300968954, "step": 4857, "train/total_loss": 0.16249078512191772 }, { "entropy": 8.827604293823242, "epoch": 0.48032430294641093, "mean_token_accuracy": 0.791374146938324, "num_tokens": 4470541.0, "step": 4858, "train/ce_loss": 0.48000288009643555 }, { "epoch": 0.48032430294641093, "step": 4858, "train/sim_loss": 0.0234375 }, { "epoch": 0.48032430294641093, "step": 4858, "train/total_loss": 0.0714377909898758 }, { "entropy": 9.142309188842773, "epoch": 0.4804231757959264, "mean_token_accuracy": 0.7300435304641724, "num_tokens": 4475717.0, "step": 4859, "train/ce_loss": 1.0548945665359497 }, { "epoch": 0.4804231757959264, "step": 4859, "train/sim_loss": 0.0859375 }, { "epoch": 0.4804231757959264, "step": 4859, "train/total_loss": 0.19142696261405945 }, { "epoch": 0.48052204864544196, "grad_norm": 0.7656567096710205, "learning_rate": 8.801117539435298e-06, "loss": 0.1385, "step": 4860 }, { "entropy": 8.858229637145996, "epoch": 0.48052204864544196, "mean_token_accuracy": 0.767756462097168, "num_tokens": 4481076.0, "step": 4860, "train/ce_loss": 0.43485215306282043 }, { "epoch": 0.48052204864544196, "step": 4860, "train/sim_loss": 0.03515625 }, { "epoch": 0.48052204864544196, "step": 4860, "train/total_loss": 0.07864146679639816 }, { "entropy": 9.248090744018555, "epoch": 0.4806209214949575, "mean_token_accuracy": 0.7274096608161926, "num_tokens": 4486161.0, "step": 4861, "train/ce_loss": 1.2672001123428345 }, { "epoch": 0.4806209214949575, "step": 4861, "train/sim_loss": 0.06640625 }, { "epoch": 0.4806209214949575, "step": 4861, "train/total_loss": 0.19312626123428345 }, { "entropy": 10.162076950073242, "epoch": 0.480719794344473, "mean_token_accuracy": 0.75, "num_tokens": 4490733.0, "step": 4862, "train/ce_loss": 5.364162461773958e-06 }, { "epoch": 0.480719794344473, "step": 4862, "train/sim_loss": 0.03125 }, { "epoch": 0.480719794344473, "step": 4862, "train/total_loss": 0.03125053644180298 }, { "entropy": 8.947798728942871, "epoch": 0.4808186671939885, "mean_token_accuracy": 0.7198660969734192, "num_tokens": 4496133.0, "step": 4863, "train/ce_loss": 1.493488073348999 }, { "epoch": 0.4808186671939885, "step": 4863, "train/sim_loss": 0.06640625 }, { "epoch": 0.4808186671939885, "step": 4863, "train/total_loss": 0.21575506031513214 }, { "entropy": 9.071576118469238, "epoch": 0.48091754004350407, "mean_token_accuracy": 0.7739899158477783, "num_tokens": 4501368.0, "step": 4864, "train/ce_loss": 0.396329402923584 }, { "epoch": 0.48091754004350407, "step": 4864, "train/sim_loss": 0.15625 }, { "epoch": 0.48091754004350407, "step": 4864, "train/total_loss": 0.19588294625282288 }, { "entropy": 8.7305908203125, "epoch": 0.48101641289301955, "mean_token_accuracy": 0.7126545906066895, "num_tokens": 4506935.0, "step": 4865, "train/ce_loss": 1.3752968311309814 }, { "epoch": 0.48101641289301955, "step": 4865, "train/sim_loss": 0.07421875 }, { "epoch": 0.48101641289301955, "step": 4865, "train/total_loss": 0.21174843609333038 }, { "entropy": 9.435068130493164, "epoch": 0.4811152857425351, "mean_token_accuracy": 0.7475728392601013, "num_tokens": 4511938.0, "step": 4866, "train/ce_loss": 0.795413613319397 }, { "epoch": 0.4811152857425351, "step": 4866, "train/sim_loss": 0.0859375 }, { "epoch": 0.4811152857425351, "step": 4866, "train/total_loss": 0.16547885537147522 }, { "entropy": 8.641799926757812, "epoch": 0.48121415859205063, "mean_token_accuracy": 0.7578268647193909, "num_tokens": 4517491.0, "step": 4867, "train/ce_loss": 0.7943580746650696 }, { "epoch": 0.48121415859205063, "step": 4867, "train/sim_loss": 0.0625 }, { "epoch": 0.48121415859205063, "step": 4867, "train/total_loss": 0.1419358104467392 }, { "entropy": 9.303564071655273, "epoch": 0.4813130314415661, "mean_token_accuracy": 0.7377567291259766, "num_tokens": 4522600.0, "step": 4868, "train/ce_loss": 3.0106859867373714e-06 }, { "epoch": 0.4813130314415661, "step": 4868, "train/sim_loss": 0.046875 }, { "epoch": 0.4813130314415661, "step": 4868, "train/total_loss": 0.046875301748514175 }, { "entropy": 8.76177978515625, "epoch": 0.48141190429108166, "mean_token_accuracy": 0.7208791375160217, "num_tokens": 4527967.0, "step": 4869, "train/ce_loss": 0.673498272895813 }, { "epoch": 0.48141190429108166, "step": 4869, "train/sim_loss": 0.0546875 }, { "epoch": 0.48141190429108166, "step": 4869, "train/total_loss": 0.12203732877969742 }, { "entropy": 8.733413696289062, "epoch": 0.4815107771405972, "mean_token_accuracy": 0.7470588088035583, "num_tokens": 4533451.0, "step": 4870, "train/ce_loss": 0.949754536151886 }, { "epoch": 0.4815107771405972, "step": 4870, "train/sim_loss": 0.07421875 }, { "epoch": 0.4815107771405972, "step": 4870, "train/total_loss": 0.16919420659542084 }, { "entropy": 8.815168380737305, "epoch": 0.48160964999011274, "mean_token_accuracy": 0.7336841821670532, "num_tokens": 4538863.0, "step": 4871, "train/ce_loss": 0.7179884314537048 }, { "epoch": 0.48160964999011274, "step": 4871, "train/sim_loss": 0.0390625 }, { "epoch": 0.48160964999011274, "step": 4871, "train/total_loss": 0.11086134612560272 }, { "entropy": 8.572389602661133, "epoch": 0.48170852283962823, "mean_token_accuracy": 0.6777777671813965, "num_tokens": 4544282.0, "step": 4872, "train/ce_loss": 1.2481721639633179 }, { "epoch": 0.48170852283962823, "step": 4872, "train/sim_loss": 0.11328125 }, { "epoch": 0.48170852283962823, "step": 4872, "train/total_loss": 0.23809847235679626 }, { "entropy": 8.487792015075684, "epoch": 0.48180739568914377, "mean_token_accuracy": 0.6997244954109192, "num_tokens": 4549887.0, "step": 4873, "train/ce_loss": 1.1593464612960815 }, { "epoch": 0.48180739568914377, "step": 4873, "train/sim_loss": 0.0546875 }, { "epoch": 0.48180739568914377, "step": 4873, "train/total_loss": 0.17062214016914368 }, { "entropy": 8.484737396240234, "epoch": 0.4819062685386593, "mean_token_accuracy": 0.7349081635475159, "num_tokens": 4555535.0, "step": 4874, "train/ce_loss": 0.9921157360076904 }, { "epoch": 0.4819062685386593, "step": 4874, "train/sim_loss": 0.0546875 }, { "epoch": 0.4819062685386593, "step": 4874, "train/total_loss": 0.15389907360076904 }, { "entropy": 9.836902618408203, "epoch": 0.4820051413881748, "mean_token_accuracy": 0.7613365054130554, "num_tokens": 4560363.0, "step": 4875, "train/ce_loss": 1.13887619972229 }, { "epoch": 0.4820051413881748, "step": 4875, "train/sim_loss": 0.05859375 }, { "epoch": 0.4820051413881748, "step": 4875, "train/total_loss": 0.17248137295246124 }, { "entropy": 9.677080154418945, "epoch": 0.48210401423769034, "mean_token_accuracy": 0.7024070024490356, "num_tokens": 4565259.0, "step": 4876, "train/ce_loss": 1.0439310244692024e-05 }, { "epoch": 0.48210401423769034, "step": 4876, "train/sim_loss": 0.0625 }, { "epoch": 0.48210401423769034, "step": 4876, "train/total_loss": 0.06250104308128357 }, { "entropy": 8.960569381713867, "epoch": 0.4822028870872059, "mean_token_accuracy": 0.7482993006706238, "num_tokens": 4570583.0, "step": 4877, "train/ce_loss": 0.8461055755615234 }, { "epoch": 0.4822028870872059, "step": 4877, "train/sim_loss": 0.046875 }, { "epoch": 0.4822028870872059, "step": 4877, "train/total_loss": 0.13148555159568787 }, { "entropy": 9.241521835327148, "epoch": 0.48230175993672136, "mean_token_accuracy": 0.6995581984519958, "num_tokens": 4575710.0, "step": 4878, "train/ce_loss": 0.7538856863975525 }, { "epoch": 0.48230175993672136, "step": 4878, "train/sim_loss": 0.04296875 }, { "epoch": 0.48230175993672136, "step": 4878, "train/total_loss": 0.11835732311010361 }, { "entropy": 8.968240737915039, "epoch": 0.4824006327862369, "mean_token_accuracy": 0.7649208307266235, "num_tokens": 4580976.0, "step": 4879, "train/ce_loss": 0.5129885077476501 }, { "epoch": 0.4824006327862369, "step": 4879, "train/sim_loss": 0.02734375 }, { "epoch": 0.4824006327862369, "step": 4879, "train/total_loss": 0.07864260673522949 }, { "epoch": 0.48249950563575245, "grad_norm": 0.7346249222755432, "learning_rate": 8.796172674677347e-06, "loss": 0.1431, "step": 4880 }, { "entropy": 9.064022064208984, "epoch": 0.48249950563575245, "mean_token_accuracy": 0.7308228611946106, "num_tokens": 4586152.0, "step": 4880, "train/ce_loss": 2.089444706143695e-06 }, { "epoch": 0.48249950563575245, "step": 4880, "train/sim_loss": 0.04296875 }, { "epoch": 0.48249950563575245, "step": 4880, "train/total_loss": 0.042968958616256714 }, { "entropy": 9.454401016235352, "epoch": 0.48259837848526793, "mean_token_accuracy": 0.6894824504852295, "num_tokens": 4591169.0, "step": 4881, "train/ce_loss": 0.8874126672744751 }, { "epoch": 0.48259837848526793, "step": 4881, "train/sim_loss": 0.046875 }, { "epoch": 0.48259837848526793, "step": 4881, "train/total_loss": 0.135616272687912 }, { "entropy": 9.163633346557617, "epoch": 0.4826972513347835, "mean_token_accuracy": 0.7462887763977051, "num_tokens": 4596377.0, "step": 4882, "train/ce_loss": 1.0831094980239868 }, { "epoch": 0.4826972513347835, "step": 4882, "train/sim_loss": 0.109375 }, { "epoch": 0.4826972513347835, "step": 4882, "train/total_loss": 0.21768595278263092 }, { "entropy": 9.48274040222168, "epoch": 0.482796124184299, "mean_token_accuracy": 0.7407407164573669, "num_tokens": 4601453.0, "step": 4883, "train/ce_loss": 0.6060320734977722 }, { "epoch": 0.482796124184299, "step": 4883, "train/sim_loss": 0.0234375 }, { "epoch": 0.482796124184299, "step": 4883, "train/total_loss": 0.08404070883989334 }, { "entropy": 8.87118148803711, "epoch": 0.4828949970338145, "mean_token_accuracy": 0.738990306854248, "num_tokens": 4606906.0, "step": 4884, "train/ce_loss": 0.6846893429756165 }, { "epoch": 0.4828949970338145, "step": 4884, "train/sim_loss": 0.03515625 }, { "epoch": 0.4828949970338145, "step": 4884, "train/total_loss": 0.10362518578767776 }, { "entropy": 9.23701000213623, "epoch": 0.48299386988333004, "mean_token_accuracy": 0.7627118825912476, "num_tokens": 4612078.0, "step": 4885, "train/ce_loss": 0.6392655372619629 }, { "epoch": 0.48299386988333004, "step": 4885, "train/sim_loss": 0.03125 }, { "epoch": 0.48299386988333004, "step": 4885, "train/total_loss": 0.09517655521631241 }, { "entropy": 9.396322250366211, "epoch": 0.4830927427328456, "mean_token_accuracy": 0.726190447807312, "num_tokens": 4617226.0, "step": 4886, "train/ce_loss": 2.1839709281921387 }, { "epoch": 0.4830927427328456, "step": 4886, "train/sim_loss": 0.11328125 }, { "epoch": 0.4830927427328456, "step": 4886, "train/total_loss": 0.3316783308982849 }, { "entropy": 8.550373077392578, "epoch": 0.48319161558236107, "mean_token_accuracy": 0.7473683953285217, "num_tokens": 4622645.0, "step": 4887, "train/ce_loss": 1.0314589738845825 }, { "epoch": 0.48319161558236107, "step": 4887, "train/sim_loss": 0.0625 }, { "epoch": 0.48319161558236107, "step": 4887, "train/total_loss": 0.16564589738845825 }, { "entropy": 9.357982635498047, "epoch": 0.4832904884318766, "mean_token_accuracy": 0.7330447435379028, "num_tokens": 4627808.0, "step": 4888, "train/ce_loss": 0.5194808840751648 }, { "epoch": 0.4832904884318766, "step": 4888, "train/sim_loss": 0.06640625 }, { "epoch": 0.4832904884318766, "step": 4888, "train/total_loss": 0.11835433542728424 }, { "entropy": 9.263805389404297, "epoch": 0.48338936128139215, "mean_token_accuracy": 0.7716763019561768, "num_tokens": 4632900.0, "step": 4889, "train/ce_loss": 0.3726261556148529 }, { "epoch": 0.48338936128139215, "step": 4889, "train/sim_loss": 0.0390625 }, { "epoch": 0.48338936128139215, "step": 4889, "train/total_loss": 0.07632511854171753 }, { "entropy": 9.518759727478027, "epoch": 0.48348823413090763, "mean_token_accuracy": 0.7730956077575684, "num_tokens": 4637976.0, "step": 4890, "train/ce_loss": 0.6279394030570984 }, { "epoch": 0.48348823413090763, "step": 4890, "train/sim_loss": 0.015625 }, { "epoch": 0.48348823413090763, "step": 4890, "train/total_loss": 0.07841894030570984 }, { "entropy": 8.828695297241211, "epoch": 0.4835871069804232, "mean_token_accuracy": 0.6888889074325562, "num_tokens": 4643377.0, "step": 4891, "train/ce_loss": 0.5468465089797974 }, { "epoch": 0.4835871069804232, "step": 4891, "train/sim_loss": 0.05078125 }, { "epoch": 0.4835871069804232, "step": 4891, "train/total_loss": 0.10546590387821198 }, { "entropy": 8.950201034545898, "epoch": 0.4836859798299387, "mean_token_accuracy": 0.7175368070602417, "num_tokens": 4648600.0, "step": 4892, "train/ce_loss": 1.4159440994262695 }, { "epoch": 0.4836859798299387, "step": 4892, "train/sim_loss": 0.09765625 }, { "epoch": 0.4836859798299387, "step": 4892, "train/total_loss": 0.23925065994262695 }, { "entropy": 9.095492362976074, "epoch": 0.4837848526794542, "mean_token_accuracy": 0.7314148545265198, "num_tokens": 4653960.0, "step": 4893, "train/ce_loss": 1.207137942314148 }, { "epoch": 0.4837848526794542, "step": 4893, "train/sim_loss": 0.08203125 }, { "epoch": 0.4837848526794542, "step": 4893, "train/total_loss": 0.20274505019187927 }, { "entropy": 8.578197479248047, "epoch": 0.48388372552896974, "mean_token_accuracy": 0.7013100385665894, "num_tokens": 4659591.0, "step": 4894, "train/ce_loss": 1.2564072608947754 }, { "epoch": 0.48388372552896974, "step": 4894, "train/sim_loss": 0.046875 }, { "epoch": 0.48388372552896974, "step": 4894, "train/total_loss": 0.17251573503017426 }, { "entropy": 9.875054359436035, "epoch": 0.4839825983784853, "mean_token_accuracy": 0.7128099203109741, "num_tokens": 4664503.0, "step": 4895, "train/ce_loss": 1.451451063156128 }, { "epoch": 0.4839825983784853, "step": 4895, "train/sim_loss": 0.0390625 }, { "epoch": 0.4839825983784853, "step": 4895, "train/total_loss": 0.18420760333538055 }, { "entropy": 8.972516059875488, "epoch": 0.48408147122800077, "mean_token_accuracy": 0.732758641242981, "num_tokens": 4669931.0, "step": 4896, "train/ce_loss": 0.4388304054737091 }, { "epoch": 0.48408147122800077, "step": 4896, "train/sim_loss": 0.08984375 }, { "epoch": 0.48408147122800077, "step": 4896, "train/total_loss": 0.1337267905473709 }, { "entropy": 9.007209777832031, "epoch": 0.4841803440775163, "mean_token_accuracy": 0.7642015218734741, "num_tokens": 4675331.0, "step": 4897, "train/ce_loss": 0.7995807528495789 }, { "epoch": 0.4841803440775163, "step": 4897, "train/sim_loss": 0.06640625 }, { "epoch": 0.4841803440775163, "step": 4897, "train/total_loss": 0.14636433124542236 }, { "entropy": 8.663717269897461, "epoch": 0.48427921692703185, "mean_token_accuracy": 0.7679924368858337, "num_tokens": 4680850.0, "step": 4898, "train/ce_loss": 0.3658985495567322 }, { "epoch": 0.48427921692703185, "step": 4898, "train/sim_loss": 0.02734375 }, { "epoch": 0.48427921692703185, "step": 4898, "train/total_loss": 0.0639336109161377 }, { "entropy": 9.340051651000977, "epoch": 0.48437808977654734, "mean_token_accuracy": 0.7296918630599976, "num_tokens": 4686050.0, "step": 4899, "train/ce_loss": 0.7124799489974976 }, { "epoch": 0.48437808977654734, "step": 4899, "train/sim_loss": 0.04296875 }, { "epoch": 0.48437808977654734, "step": 4899, "train/total_loss": 0.11421674489974976 }, { "epoch": 0.4844769626260629, "grad_norm": 0.7022703289985657, "learning_rate": 8.7912278099194e-06, "loss": 0.1444, "step": 4900 }, { "entropy": 9.204004287719727, "epoch": 0.4844769626260629, "mean_token_accuracy": 0.7441558241844177, "num_tokens": 4691321.0, "step": 4900, "train/ce_loss": 0.453791081905365 }, { "epoch": 0.4844769626260629, "step": 4900, "train/sim_loss": 0.01953125 }, { "epoch": 0.4844769626260629, "step": 4900, "train/total_loss": 0.06491035968065262 }, { "entropy": 8.931779861450195, "epoch": 0.4845758354755784, "mean_token_accuracy": 0.7771618366241455, "num_tokens": 4696679.0, "step": 4901, "train/ce_loss": 0.5937322974205017 }, { "epoch": 0.4845758354755784, "step": 4901, "train/sim_loss": 0.015625 }, { "epoch": 0.4845758354755784, "step": 4901, "train/total_loss": 0.07499822974205017 }, { "entropy": 9.884674072265625, "epoch": 0.4846747083250939, "mean_token_accuracy": 0.710106372833252, "num_tokens": 4701478.0, "step": 4902, "train/ce_loss": 2.7379010134609416e-06 }, { "epoch": 0.4846747083250939, "step": 4902, "train/sim_loss": 0.0703125 }, { "epoch": 0.4846747083250939, "step": 4902, "train/total_loss": 0.07031277567148209 }, { "entropy": 8.596389770507812, "epoch": 0.48477358117460945, "mean_token_accuracy": 0.7246804237365723, "num_tokens": 4707014.0, "step": 4903, "train/ce_loss": 0.7283797264099121 }, { "epoch": 0.48477358117460945, "step": 4903, "train/sim_loss": 0.17578125 }, { "epoch": 0.48477358117460945, "step": 4903, "train/total_loss": 0.2486192286014557 }, { "entropy": 8.942275047302246, "epoch": 0.484872454024125, "mean_token_accuracy": 0.7307236194610596, "num_tokens": 4712354.0, "step": 4904, "train/ce_loss": 0.9904487133026123 }, { "epoch": 0.484872454024125, "step": 4904, "train/sim_loss": 0.1171875 }, { "epoch": 0.484872454024125, "step": 4904, "train/total_loss": 0.21623237431049347 }, { "entropy": 9.129148483276367, "epoch": 0.4849713268736405, "mean_token_accuracy": 0.709392249584198, "num_tokens": 4717708.0, "step": 4905, "train/ce_loss": 9.562788818584522e-07 }, { "epoch": 0.4849713268736405, "step": 4905, "train/sim_loss": 0.01171875 }, { "epoch": 0.4849713268736405, "step": 4905, "train/total_loss": 0.011718845926225185 }, { "entropy": 9.028470993041992, "epoch": 0.485070199723156, "mean_token_accuracy": 0.7608951926231384, "num_tokens": 4723023.0, "step": 4906, "train/ce_loss": 0.7763880491256714 }, { "epoch": 0.485070199723156, "step": 4906, "train/sim_loss": 0.0625 }, { "epoch": 0.485070199723156, "step": 4906, "train/total_loss": 0.14013880491256714 }, { "entropy": 8.938543319702148, "epoch": 0.48516907257267156, "mean_token_accuracy": 0.7885652780532837, "num_tokens": 4728351.0, "step": 4907, "train/ce_loss": 0.7680575847625732 }, { "epoch": 0.48516907257267156, "step": 4907, "train/sim_loss": 0.0859375 }, { "epoch": 0.48516907257267156, "step": 4907, "train/total_loss": 0.16274327039718628 }, { "entropy": 8.846967697143555, "epoch": 0.48526794542218704, "mean_token_accuracy": 0.7796178460121155, "num_tokens": 4733658.0, "step": 4908, "train/ce_loss": 0.6358514428138733 }, { "epoch": 0.48526794542218704, "step": 4908, "train/sim_loss": 0.0234375 }, { "epoch": 0.48526794542218704, "step": 4908, "train/total_loss": 0.08702264726161957 }, { "entropy": 8.666057586669922, "epoch": 0.4853668182717026, "mean_token_accuracy": 0.7359050512313843, "num_tokens": 4739205.0, "step": 4909, "train/ce_loss": 0.7128942608833313 }, { "epoch": 0.4853668182717026, "step": 4909, "train/sim_loss": 0.0390625 }, { "epoch": 0.4853668182717026, "step": 4909, "train/total_loss": 0.11035192757844925 }, { "entropy": 8.993949890136719, "epoch": 0.4854656911212181, "mean_token_accuracy": 0.7426390647888184, "num_tokens": 4744604.0, "step": 4910, "train/ce_loss": 0.5839024186134338 }, { "epoch": 0.4854656911212181, "step": 4910, "train/sim_loss": 0.046875 }, { "epoch": 0.4854656911212181, "step": 4910, "train/total_loss": 0.10526524484157562 }, { "entropy": 9.8870849609375, "epoch": 0.4855645639707336, "mean_token_accuracy": 0.7641196250915527, "num_tokens": 4749306.0, "step": 4911, "train/ce_loss": 1.7026309967041016 }, { "epoch": 0.4855645639707336, "step": 4911, "train/sim_loss": 0.06640625 }, { "epoch": 0.4855645639707336, "step": 4911, "train/total_loss": 0.23666934669017792 }, { "entropy": 9.434379577636719, "epoch": 0.48566343682024915, "mean_token_accuracy": 0.8050000071525574, "num_tokens": 4754367.0, "step": 4912, "train/ce_loss": 0.8110925555229187 }, { "epoch": 0.48566343682024915, "step": 4912, "train/sim_loss": 0.01953125 }, { "epoch": 0.48566343682024915, "step": 4912, "train/total_loss": 0.10064050555229187 }, { "entropy": 9.033245086669922, "epoch": 0.4857623096697647, "mean_token_accuracy": 0.7487499713897705, "num_tokens": 4759630.0, "step": 4913, "train/ce_loss": 0.7126909494400024 }, { "epoch": 0.4857623096697647, "step": 4913, "train/sim_loss": 0.0625 }, { "epoch": 0.4857623096697647, "step": 4913, "train/total_loss": 0.13376909494400024 }, { "entropy": 9.069683074951172, "epoch": 0.48586118251928023, "mean_token_accuracy": 0.7380627393722534, "num_tokens": 4764817.0, "step": 4914, "train/ce_loss": 0.6253024339675903 }, { "epoch": 0.48586118251928023, "step": 4914, "train/sim_loss": 0.0234375 }, { "epoch": 0.48586118251928023, "step": 4914, "train/total_loss": 0.08596774190664291 }, { "entropy": 9.71072769165039, "epoch": 0.4859600553687957, "mean_token_accuracy": 0.7920792102813721, "num_tokens": 4769611.0, "step": 4915, "train/ce_loss": 1.6909226179122925 }, { "epoch": 0.4859600553687957, "step": 4915, "train/sim_loss": 0.05078125 }, { "epoch": 0.4859600553687957, "step": 4915, "train/total_loss": 0.21987351775169373 }, { "entropy": 9.659954071044922, "epoch": 0.48605892821831126, "mean_token_accuracy": 0.7441314458847046, "num_tokens": 4774478.0, "step": 4916, "train/ce_loss": 1.0239133189315908e-05 }, { "epoch": 0.48605892821831126, "step": 4916, "train/sim_loss": 0.0625 }, { "epoch": 0.48605892821831126, "step": 4916, "train/total_loss": 0.06250102072954178 }, { "entropy": 9.636788368225098, "epoch": 0.4861578010678268, "mean_token_accuracy": 0.7456979155540466, "num_tokens": 4779410.0, "step": 4917, "train/ce_loss": 0.8914626836776733 }, { "epoch": 0.4861578010678268, "step": 4917, "train/sim_loss": 0.05859375 }, { "epoch": 0.4861578010678268, "step": 4917, "train/total_loss": 0.14774002134799957 }, { "entropy": 9.167765617370605, "epoch": 0.4862566739173423, "mean_token_accuracy": 0.7288359999656677, "num_tokens": 4784536.0, "step": 4918, "train/ce_loss": 0.6476504802703857 }, { "epoch": 0.4862566739173423, "step": 4918, "train/sim_loss": 0.0546875 }, { "epoch": 0.4862566739173423, "step": 4918, "train/total_loss": 0.11945255100727081 }, { "entropy": 9.297834396362305, "epoch": 0.4863555467668578, "mean_token_accuracy": 0.74301677942276, "num_tokens": 4789682.0, "step": 4919, "train/ce_loss": 1.2960455417633057 }, { "epoch": 0.4863555467668578, "step": 4919, "train/sim_loss": 0.05078125 }, { "epoch": 0.4863555467668578, "step": 4919, "train/total_loss": 0.18038581311702728 }, { "epoch": 0.48645441961637337, "grad_norm": 0.7376166582107544, "learning_rate": 8.78628294516145e-06, "loss": 0.1446, "step": 4920 }, { "entropy": 9.638121604919434, "epoch": 0.48645441961637337, "mean_token_accuracy": 0.7441016435623169, "num_tokens": 4794733.0, "step": 4920, "train/ce_loss": 0.8663396239280701 }, { "epoch": 0.48645441961637337, "step": 4920, "train/sim_loss": 0.05859375 }, { "epoch": 0.48645441961637337, "step": 4920, "train/total_loss": 0.14522771537303925 }, { "entropy": 8.860578536987305, "epoch": 0.48655329246588885, "mean_token_accuracy": 0.7983014583587646, "num_tokens": 4800140.0, "step": 4921, "train/ce_loss": 0.8453391194343567 }, { "epoch": 0.48655329246588885, "step": 4921, "train/sim_loss": 0.046875 }, { "epoch": 0.48655329246588885, "step": 4921, "train/total_loss": 0.1314089149236679 }, { "entropy": 8.960800170898438, "epoch": 0.4866521653154044, "mean_token_accuracy": 0.7310647368431091, "num_tokens": 4805546.0, "step": 4922, "train/ce_loss": 0.9981918334960938 }, { "epoch": 0.4866521653154044, "step": 4922, "train/sim_loss": 0.06640625 }, { "epoch": 0.4866521653154044, "step": 4922, "train/total_loss": 0.16622543334960938 }, { "entropy": 9.311447143554688, "epoch": 0.48675103816491994, "mean_token_accuracy": 0.7135134935379028, "num_tokens": 4810700.0, "step": 4923, "train/ce_loss": 1.1179834604263306 }, { "epoch": 0.48675103816491994, "step": 4923, "train/sim_loss": 0.03125 }, { "epoch": 0.48675103816491994, "step": 4923, "train/total_loss": 0.14304834604263306 }, { "entropy": 9.43608283996582, "epoch": 0.4868499110144354, "mean_token_accuracy": 0.7465887069702148, "num_tokens": 4815708.0, "step": 4924, "train/ce_loss": 0.8741804957389832 }, { "epoch": 0.4868499110144354, "step": 4924, "train/sim_loss": 0.078125 }, { "epoch": 0.4868499110144354, "step": 4924, "train/total_loss": 0.16554304957389832 }, { "entropy": 9.55827522277832, "epoch": 0.48694878386395096, "mean_token_accuracy": 0.7261029481887817, "num_tokens": 4820691.0, "step": 4925, "train/ce_loss": 1.1498408317565918 }, { "epoch": 0.48694878386395096, "step": 4925, "train/sim_loss": 0.0625 }, { "epoch": 0.48694878386395096, "step": 4925, "train/total_loss": 0.17748409509658813 }, { "entropy": 9.66183853149414, "epoch": 0.4870476567134665, "mean_token_accuracy": 0.7651331424713135, "num_tokens": 4825551.0, "step": 4926, "train/ce_loss": 3.2787506825115997e-06 }, { "epoch": 0.4870476567134665, "step": 4926, "train/sim_loss": 0.046875 }, { "epoch": 0.4870476567134665, "step": 4926, "train/total_loss": 0.046875327825546265 }, { "entropy": 9.215739250183105, "epoch": 0.487146529562982, "mean_token_accuracy": 0.7790697813034058, "num_tokens": 4830744.0, "step": 4927, "train/ce_loss": 0.6452703475952148 }, { "epoch": 0.487146529562982, "step": 4927, "train/sim_loss": 0.0390625 }, { "epoch": 0.487146529562982, "step": 4927, "train/total_loss": 0.10358953475952148 }, { "entropy": 8.913700103759766, "epoch": 0.48724540241249753, "mean_token_accuracy": 0.7453183531761169, "num_tokens": 4836066.0, "step": 4928, "train/ce_loss": 1.0108767747879028 }, { "epoch": 0.48724540241249753, "step": 4928, "train/sim_loss": 0.10546875 }, { "epoch": 0.48724540241249753, "step": 4928, "train/total_loss": 0.20655643939971924 }, { "entropy": 9.374261856079102, "epoch": 0.48734427526201307, "mean_token_accuracy": 0.7281690239906311, "num_tokens": 4841163.0, "step": 4929, "train/ce_loss": 1.19328773021698 }, { "epoch": 0.48734427526201307, "step": 4929, "train/sim_loss": 0.0546875 }, { "epoch": 0.48734427526201307, "step": 4929, "train/total_loss": 0.17401626706123352 }, { "entropy": 9.023553848266602, "epoch": 0.48744314811152856, "mean_token_accuracy": 0.7415143847465515, "num_tokens": 4846455.0, "step": 4930, "train/ce_loss": 1.073759913444519 }, { "epoch": 0.48744314811152856, "step": 4930, "train/sim_loss": 0.03125 }, { "epoch": 0.48744314811152856, "step": 4930, "train/total_loss": 0.13862599432468414 }, { "entropy": 9.224691390991211, "epoch": 0.4875420209610441, "mean_token_accuracy": 0.6523736715316772, "num_tokens": 4851542.0, "step": 4931, "train/ce_loss": 1.0722086429595947 }, { "epoch": 0.4875420209610441, "step": 4931, "train/sim_loss": 0.05859375 }, { "epoch": 0.4875420209610441, "step": 4931, "train/total_loss": 0.165814608335495 }, { "entropy": 9.236860275268555, "epoch": 0.48764089381055964, "mean_token_accuracy": 0.7120822668075562, "num_tokens": 4856762.0, "step": 4932, "train/ce_loss": 0.8498473167419434 }, { "epoch": 0.48764089381055964, "step": 4932, "train/sim_loss": 0.0859375 }, { "epoch": 0.48764089381055964, "step": 4932, "train/total_loss": 0.17092223465442657 }, { "entropy": 8.873945236206055, "epoch": 0.4877397666600751, "mean_token_accuracy": 0.7639344334602356, "num_tokens": 4862133.0, "step": 4933, "train/ce_loss": 0.6059170365333557 }, { "epoch": 0.4877397666600751, "step": 4933, "train/sim_loss": 0.0390625 }, { "epoch": 0.4877397666600751, "step": 4933, "train/total_loss": 0.09965420514345169 }, { "entropy": 9.093036651611328, "epoch": 0.48783863950959067, "mean_token_accuracy": 0.6778350472450256, "num_tokens": 4867340.0, "step": 4934, "train/ce_loss": 1.148424871644238e-06 }, { "epoch": 0.48783863950959067, "step": 4934, "train/sim_loss": 0.078125 }, { "epoch": 0.48783863950959067, "step": 4934, "train/total_loss": 0.07812511175870895 }, { "entropy": 9.153524398803711, "epoch": 0.4879375123591062, "mean_token_accuracy": 0.7430025339126587, "num_tokens": 4872608.0, "step": 4935, "train/ce_loss": 1.041776418685913 }, { "epoch": 0.4879375123591062, "step": 4935, "train/sim_loss": 0.125 }, { "epoch": 0.4879375123591062, "step": 4935, "train/total_loss": 0.22917765378952026 }, { "entropy": 8.765460014343262, "epoch": 0.4880363852086217, "mean_token_accuracy": 0.71875, "num_tokens": 4878047.0, "step": 4936, "train/ce_loss": 0.9821575880050659 }, { "epoch": 0.4880363852086217, "step": 4936, "train/sim_loss": 0.05859375 }, { "epoch": 0.4880363852086217, "step": 4936, "train/total_loss": 0.1568095088005066 }, { "entropy": 9.62529182434082, "epoch": 0.48813525805813723, "mean_token_accuracy": 0.715859055519104, "num_tokens": 4882896.0, "step": 4937, "train/ce_loss": 2.728528897932847e-06 }, { "epoch": 0.48813525805813723, "step": 4937, "train/sim_loss": 0.0546875 }, { "epoch": 0.48813525805813723, "step": 4937, "train/total_loss": 0.05468777194619179 }, { "entropy": 9.226823806762695, "epoch": 0.4882341309076528, "mean_token_accuracy": 0.7292225360870361, "num_tokens": 4888112.0, "step": 4938, "train/ce_loss": 0.8293290138244629 }, { "epoch": 0.4882341309076528, "step": 4938, "train/sim_loss": 0.0546875 }, { "epoch": 0.4882341309076528, "step": 4938, "train/total_loss": 0.13762040436267853 }, { "entropy": 8.93838119506836, "epoch": 0.48833300375716826, "mean_token_accuracy": 0.7661574482917786, "num_tokens": 4893505.0, "step": 4939, "train/ce_loss": 0.6429872512817383 }, { "epoch": 0.48833300375716826, "step": 4939, "train/sim_loss": 0.046875 }, { "epoch": 0.48833300375716826, "step": 4939, "train/total_loss": 0.11117372661828995 }, { "epoch": 0.4884318766066838, "grad_norm": 0.7874643206596375, "learning_rate": 8.781338080403502e-06, "loss": 0.1473, "step": 4940 }, { "entropy": 8.490642547607422, "epoch": 0.4884318766066838, "mean_token_accuracy": 0.7120419144630432, "num_tokens": 4898952.0, "step": 4940, "train/ce_loss": 1.5694804191589355 }, { "epoch": 0.4884318766066838, "step": 4940, "train/sim_loss": 0.06640625 }, { "epoch": 0.4884318766066838, "step": 4940, "train/total_loss": 0.2233542948961258 }, { "entropy": 9.060588836669922, "epoch": 0.48853074945619934, "mean_token_accuracy": 0.7146596908569336, "num_tokens": 4904169.0, "step": 4941, "train/ce_loss": 0.8909388184547424 }, { "epoch": 0.48853074945619934, "step": 4941, "train/sim_loss": 0.0703125 }, { "epoch": 0.48853074945619934, "step": 4941, "train/total_loss": 0.1594063937664032 }, { "entropy": 9.662264823913574, "epoch": 0.48862962230571483, "mean_token_accuracy": 0.75314861536026, "num_tokens": 4908980.0, "step": 4942, "train/ce_loss": 1.9760382175445557 }, { "epoch": 0.48862962230571483, "step": 4942, "train/sim_loss": 0.0390625 }, { "epoch": 0.48862962230571483, "step": 4942, "train/total_loss": 0.23666632175445557 }, { "entropy": 9.120369911193848, "epoch": 0.48872849515523037, "mean_token_accuracy": 0.7997010350227356, "num_tokens": 4914156.0, "step": 4943, "train/ce_loss": 2.164554189221235e-06 }, { "epoch": 0.48872849515523037, "step": 4943, "train/sim_loss": 0.078125 }, { "epoch": 0.48872849515523037, "step": 4943, "train/total_loss": 0.07812521606683731 }, { "entropy": 10.052497863769531, "epoch": 0.4888273680047459, "mean_token_accuracy": 0.7403100728988647, "num_tokens": 4918831.0, "step": 4944, "train/ce_loss": 1.1864396583405323e-05 }, { "epoch": 0.4888273680047459, "step": 4944, "train/sim_loss": 0.05078125 }, { "epoch": 0.4888273680047459, "step": 4944, "train/total_loss": 0.05078243464231491 }, { "entropy": 8.989214897155762, "epoch": 0.4889262408542614, "mean_token_accuracy": 0.7435265183448792, "num_tokens": 4924090.0, "step": 4945, "train/ce_loss": 0.7429519295692444 }, { "epoch": 0.4889262408542614, "step": 4945, "train/sim_loss": 0.01171875 }, { "epoch": 0.4889262408542614, "step": 4945, "train/total_loss": 0.08601394295692444 }, { "entropy": 8.845163345336914, "epoch": 0.48902511370377694, "mean_token_accuracy": 0.7583603262901306, "num_tokens": 4929551.0, "step": 4946, "train/ce_loss": 0.6005305051803589 }, { "epoch": 0.48902511370377694, "step": 4946, "train/sim_loss": 0.03515625 }, { "epoch": 0.48902511370377694, "step": 4946, "train/total_loss": 0.09520930051803589 }, { "entropy": 9.364870071411133, "epoch": 0.4891239865532925, "mean_token_accuracy": 0.7204116582870483, "num_tokens": 4934529.0, "step": 4947, "train/ce_loss": 1.5267508029937744 }, { "epoch": 0.4891239865532925, "step": 4947, "train/sim_loss": 0.06640625 }, { "epoch": 0.4891239865532925, "step": 4947, "train/total_loss": 0.2190813273191452 }, { "entropy": 9.116228103637695, "epoch": 0.48922285940280796, "mean_token_accuracy": 0.7240437269210815, "num_tokens": 4939707.0, "step": 4948, "train/ce_loss": 0.4577290713787079 }, { "epoch": 0.48922285940280796, "step": 4948, "train/sim_loss": 0.0390625 }, { "epoch": 0.48922285940280796, "step": 4948, "train/total_loss": 0.08483541011810303 }, { "entropy": 9.415693283081055, "epoch": 0.4893217322523235, "mean_token_accuracy": 0.7996794581413269, "num_tokens": 4944829.0, "step": 4949, "train/ce_loss": 1.471177339553833 }, { "epoch": 0.4893217322523235, "step": 4949, "train/sim_loss": 0.0703125 }, { "epoch": 0.4893217322523235, "step": 4949, "train/total_loss": 0.2174302339553833 }, { "entropy": 9.266605377197266, "epoch": 0.48942060510183905, "mean_token_accuracy": 0.7365661859512329, "num_tokens": 4949989.0, "step": 4950, "train/ce_loss": 0.6825771927833557 }, { "epoch": 0.48942060510183905, "step": 4950, "train/sim_loss": 0.0703125 }, { "epoch": 0.48942060510183905, "step": 4950, "train/total_loss": 0.13857021927833557 }, { "entropy": 9.919319152832031, "epoch": 0.48951947795135453, "mean_token_accuracy": 0.76115483045578, "num_tokens": 4954809.0, "step": 4951, "train/ce_loss": 2.257694177387748e-06 }, { "epoch": 0.48951947795135453, "step": 4951, "train/sim_loss": 0.0234375 }, { "epoch": 0.48951947795135453, "step": 4951, "train/total_loss": 0.023437725380063057 }, { "entropy": 9.214216232299805, "epoch": 0.4896183508008701, "mean_token_accuracy": 0.7194968461990356, "num_tokens": 4960064.0, "step": 4952, "train/ce_loss": 1.0852882862091064 }, { "epoch": 0.4896183508008701, "step": 4952, "train/sim_loss": 0.0390625 }, { "epoch": 0.4896183508008701, "step": 4952, "train/total_loss": 0.14759132266044617 }, { "entropy": 9.708972930908203, "epoch": 0.4897172236503856, "mean_token_accuracy": 0.7222222089767456, "num_tokens": 4964884.0, "step": 4953, "train/ce_loss": 1.529525252408348e-05 }, { "epoch": 0.4897172236503856, "step": 4953, "train/sim_loss": 0.0234375 }, { "epoch": 0.4897172236503856, "step": 4953, "train/total_loss": 0.02343902923166752 }, { "entropy": 9.27254867553711, "epoch": 0.48981609649990115, "mean_token_accuracy": 0.7869565486907959, "num_tokens": 4970010.0, "step": 4954, "train/ce_loss": 1.4466725587844849 }, { "epoch": 0.48981609649990115, "step": 4954, "train/sim_loss": 0.0546875 }, { "epoch": 0.48981609649990115, "step": 4954, "train/total_loss": 0.19935475289821625 }, { "entropy": 8.7774019241333, "epoch": 0.48991496934941664, "mean_token_accuracy": 0.7177264094352722, "num_tokens": 4975524.0, "step": 4955, "train/ce_loss": 1.0846054553985596 }, { "epoch": 0.48991496934941664, "step": 4955, "train/sim_loss": 0.05078125 }, { "epoch": 0.48991496934941664, "step": 4955, "train/total_loss": 0.15924179553985596 }, { "entropy": 9.477668762207031, "epoch": 0.4900138421989322, "mean_token_accuracy": 0.7560073733329773, "num_tokens": 4980501.0, "step": 4956, "train/ce_loss": 1.5517706871032715 }, { "epoch": 0.4900138421989322, "step": 4956, "train/sim_loss": 0.046875 }, { "epoch": 0.4900138421989322, "step": 4956, "train/total_loss": 0.2020520716905594 }, { "entropy": 9.712724685668945, "epoch": 0.4901127150484477, "mean_token_accuracy": 0.7151514887809753, "num_tokens": 4985440.0, "step": 4957, "train/ce_loss": 1.111464262008667 }, { "epoch": 0.4901127150484477, "step": 4957, "train/sim_loss": 0.046875 }, { "epoch": 0.4901127150484477, "step": 4957, "train/total_loss": 0.15802142024040222 }, { "entropy": 8.931743621826172, "epoch": 0.4902115878979632, "mean_token_accuracy": 0.7174638509750366, "num_tokens": 4990855.0, "step": 4958, "train/ce_loss": 0.7936546206474304 }, { "epoch": 0.4902115878979632, "step": 4958, "train/sim_loss": 0.0625 }, { "epoch": 0.4902115878979632, "step": 4958, "train/total_loss": 0.14186546206474304 }, { "entropy": 8.859094619750977, "epoch": 0.49031046074747875, "mean_token_accuracy": 0.7553072571754456, "num_tokens": 4996280.0, "step": 4959, "train/ce_loss": 0.5950724482536316 }, { "epoch": 0.49031046074747875, "step": 4959, "train/sim_loss": 0.046875 }, { "epoch": 0.49031046074747875, "step": 4959, "train/total_loss": 0.10638225078582764 }, { "epoch": 0.4904093335969943, "grad_norm": 0.6664409637451172, "learning_rate": 8.776393215645553e-06, "loss": 0.1418, "step": 4960 }, { "entropy": 10.08526611328125, "epoch": 0.4904093335969943, "mean_token_accuracy": 0.732087254524231, "num_tokens": 5000962.0, "step": 4960, "train/ce_loss": 0.8471253514289856 }, { "epoch": 0.4904093335969943, "step": 4960, "train/sim_loss": 0.05859375 }, { "epoch": 0.4904093335969943, "step": 4960, "train/total_loss": 0.14330628514289856 }, { "entropy": 9.833306312561035, "epoch": 0.4905082064465098, "mean_token_accuracy": 0.7071239948272705, "num_tokens": 5005775.0, "step": 4961, "train/ce_loss": 2.1891096366744023e-06 }, { "epoch": 0.4905082064465098, "step": 4961, "train/sim_loss": 0.03125 }, { "epoch": 0.4905082064465098, "step": 4961, "train/total_loss": 0.03125021979212761 }, { "entropy": 9.284728050231934, "epoch": 0.4906070792960253, "mean_token_accuracy": 0.7516778707504272, "num_tokens": 5010963.0, "step": 4962, "train/ce_loss": 0.6591690182685852 }, { "epoch": 0.4906070792960253, "step": 4962, "train/sim_loss": 0.04296875 }, { "epoch": 0.4906070792960253, "step": 4962, "train/total_loss": 0.10888565331697464 }, { "entropy": 8.825736045837402, "epoch": 0.49070595214554086, "mean_token_accuracy": 0.7436181902885437, "num_tokens": 5016355.0, "step": 4963, "train/ce_loss": 0.7082291841506958 }, { "epoch": 0.49070595214554086, "step": 4963, "train/sim_loss": 0.0234375 }, { "epoch": 0.49070595214554086, "step": 4963, "train/total_loss": 0.09426041692495346 }, { "entropy": 9.79039478302002, "epoch": 0.49080482499505634, "mean_token_accuracy": 0.7394366264343262, "num_tokens": 5021170.0, "step": 4964, "train/ce_loss": 2.0843520164489746 }, { "epoch": 0.49080482499505634, "step": 4964, "train/sim_loss": 0.0703125 }, { "epoch": 0.49080482499505634, "step": 4964, "train/total_loss": 0.27874770760536194 }, { "entropy": 9.001119613647461, "epoch": 0.4909036978445719, "mean_token_accuracy": 0.7306175827980042, "num_tokens": 5026413.0, "step": 4965, "train/ce_loss": 0.8822122812271118 }, { "epoch": 0.4909036978445719, "step": 4965, "train/sim_loss": 0.078125 }, { "epoch": 0.4909036978445719, "step": 4965, "train/total_loss": 0.1663462221622467 }, { "entropy": 9.187026977539062, "epoch": 0.4910025706940874, "mean_token_accuracy": 0.7484737634658813, "num_tokens": 5031678.0, "step": 4966, "train/ce_loss": 0.9261685013771057 }, { "epoch": 0.4910025706940874, "step": 4966, "train/sim_loss": 0.0625 }, { "epoch": 0.4910025706940874, "step": 4966, "train/total_loss": 0.15511685609817505 }, { "entropy": 9.472979545593262, "epoch": 0.4911014435436029, "mean_token_accuracy": 0.6653944253921509, "num_tokens": 5036938.0, "step": 4967, "train/ce_loss": 2.3817548751831055 }, { "epoch": 0.4911014435436029, "step": 4967, "train/sim_loss": 0.171875 }, { "epoch": 0.4911014435436029, "step": 4967, "train/total_loss": 0.41005051136016846 }, { "entropy": 8.875371932983398, "epoch": 0.49120031639311845, "mean_token_accuracy": 0.7755857110023499, "num_tokens": 5042166.0, "step": 4968, "train/ce_loss": 8.783146768109873e-06 }, { "epoch": 0.49120031639311845, "step": 4968, "train/sim_loss": 0.0390625 }, { "epoch": 0.49120031639311845, "step": 4968, "train/total_loss": 0.03906337916851044 }, { "entropy": 9.897775650024414, "epoch": 0.491299189242634, "mean_token_accuracy": 0.7455621361732483, "num_tokens": 5047056.0, "step": 4969, "train/ce_loss": 0.8315859436988831 }, { "epoch": 0.491299189242634, "step": 4969, "train/sim_loss": 0.05078125 }, { "epoch": 0.491299189242634, "step": 4969, "train/total_loss": 0.13393984735012054 }, { "entropy": 9.3680419921875, "epoch": 0.4913980620921495, "mean_token_accuracy": 0.725874125957489, "num_tokens": 5052190.0, "step": 4970, "train/ce_loss": 1.4078030586242676 }, { "epoch": 0.4913980620921495, "step": 4970, "train/sim_loss": 0.05078125 }, { "epoch": 0.4913980620921495, "step": 4970, "train/total_loss": 0.19156156480312347 }, { "entropy": 9.263813018798828, "epoch": 0.491496934941665, "mean_token_accuracy": 0.6920152306556702, "num_tokens": 5057409.0, "step": 4971, "train/ce_loss": 1.0340186236135196e-06 }, { "epoch": 0.491496934941665, "step": 4971, "train/sim_loss": 0.0234375 }, { "epoch": 0.491496934941665, "step": 4971, "train/total_loss": 0.023437604308128357 }, { "entropy": 9.463968276977539, "epoch": 0.49159580779118056, "mean_token_accuracy": 0.7051724195480347, "num_tokens": 5062430.0, "step": 4972, "train/ce_loss": 2.8040617507940624e-06 }, { "epoch": 0.49159580779118056, "step": 4972, "train/sim_loss": 0.0390625 }, { "epoch": 0.49159580779118056, "step": 4972, "train/total_loss": 0.039062779396772385 }, { "entropy": 9.55792236328125, "epoch": 0.49169468064069605, "mean_token_accuracy": 0.675302267074585, "num_tokens": 5067455.0, "step": 4973, "train/ce_loss": 1.6470224863951444e-06 }, { "epoch": 0.49169468064069605, "step": 4973, "train/sim_loss": 0.05078125 }, { "epoch": 0.49169468064069605, "step": 4973, "train/total_loss": 0.05078141391277313 }, { "entropy": 9.557991027832031, "epoch": 0.4917935534902116, "mean_token_accuracy": 0.7660256624221802, "num_tokens": 5072514.0, "step": 4974, "train/ce_loss": 2.7783050882135285e-06 }, { "epoch": 0.4917935534902116, "step": 4974, "train/sim_loss": 0.05078125 }, { "epoch": 0.4917935534902116, "step": 4974, "train/total_loss": 0.050781529396772385 }, { "entropy": 9.60000228881836, "epoch": 0.49189242633972713, "mean_token_accuracy": 0.7406143546104431, "num_tokens": 5077508.0, "step": 4975, "train/ce_loss": 1.644429403313552e-06 }, { "epoch": 0.49189242633972713, "step": 4975, "train/sim_loss": 0.05078125 }, { "epoch": 0.49189242633972713, "step": 4975, "train/total_loss": 0.05078141391277313 }, { "entropy": 9.294367790222168, "epoch": 0.4919912991892426, "mean_token_accuracy": 0.7238689661026001, "num_tokens": 5082613.0, "step": 4976, "train/ce_loss": 1.3646399974822998 }, { "epoch": 0.4919912991892426, "step": 4976, "train/sim_loss": 0.0234375 }, { "epoch": 0.4919912991892426, "step": 4976, "train/total_loss": 0.15990149974822998 }, { "entropy": 9.06892204284668, "epoch": 0.49209017203875816, "mean_token_accuracy": 0.7161290049552917, "num_tokens": 5087988.0, "step": 4977, "train/ce_loss": 1.189186930656433 }, { "epoch": 0.49209017203875816, "step": 4977, "train/sim_loss": 0.10546875 }, { "epoch": 0.49209017203875816, "step": 4977, "train/total_loss": 0.22438743710517883 }, { "entropy": 9.453010559082031, "epoch": 0.4921890448882737, "mean_token_accuracy": 0.7511811256408691, "num_tokens": 5093055.0, "step": 4978, "train/ce_loss": 1.4672602415084839 }, { "epoch": 0.4921890448882737, "step": 4978, "train/sim_loss": 0.078125 }, { "epoch": 0.4921890448882737, "step": 4978, "train/total_loss": 0.22485102713108063 }, { "entropy": 8.757181167602539, "epoch": 0.4922879177377892, "mean_token_accuracy": 0.7450593113899231, "num_tokens": 5098498.0, "step": 4979, "train/ce_loss": 0.9394068121910095 }, { "epoch": 0.4922879177377892, "step": 4979, "train/sim_loss": 0.0703125 }, { "epoch": 0.4922879177377892, "step": 4979, "train/total_loss": 0.16425317525863647 }, { "epoch": 0.4923867905873047, "grad_norm": 0.6681188941001892, "learning_rate": 8.771448350887603e-06, "loss": 0.1469, "step": 4980 }, { "entropy": 9.40768051147461, "epoch": 0.4923867905873047, "mean_token_accuracy": 0.7666666507720947, "num_tokens": 5103610.0, "step": 4980, "train/ce_loss": 0.6188808083534241 }, { "epoch": 0.4923867905873047, "step": 4980, "train/sim_loss": 0.05859375 }, { "epoch": 0.4923867905873047, "step": 4980, "train/total_loss": 0.12048183381557465 }, { "entropy": 8.912360191345215, "epoch": 0.49248566343682026, "mean_token_accuracy": 0.7740046977996826, "num_tokens": 5108923.0, "step": 4981, "train/ce_loss": 1.0425912141799927 }, { "epoch": 0.49248566343682026, "step": 4981, "train/sim_loss": 0.05078125 }, { "epoch": 0.49248566343682026, "step": 4981, "train/total_loss": 0.15504038333892822 }, { "entropy": 10.315068244934082, "epoch": 0.49258453628633575, "mean_token_accuracy": 0.7553191781044006, "num_tokens": 5113504.0, "step": 4982, "train/ce_loss": 1.1097929927927908e-05 }, { "epoch": 0.49258453628633575, "step": 4982, "train/sim_loss": 0.03515625 }, { "epoch": 0.49258453628633575, "step": 4982, "train/total_loss": 0.03515736013650894 }, { "entropy": 9.888456344604492, "epoch": 0.4926834091358513, "mean_token_accuracy": 0.7823529243469238, "num_tokens": 5118268.0, "step": 4983, "train/ce_loss": 0.65955650806427 }, { "epoch": 0.4926834091358513, "step": 4983, "train/sim_loss": 0.03125 }, { "epoch": 0.4926834091358513, "step": 4983, "train/total_loss": 0.09720565378665924 }, { "entropy": 9.141186714172363, "epoch": 0.49278228198536683, "mean_token_accuracy": 0.737051784992218, "num_tokens": 5123492.0, "step": 4984, "train/ce_loss": 0.6550917625427246 }, { "epoch": 0.49278228198536683, "step": 4984, "train/sim_loss": 0.0625 }, { "epoch": 0.49278228198536683, "step": 4984, "train/total_loss": 0.12800917029380798 }, { "entropy": 9.001733779907227, "epoch": 0.4928811548348823, "mean_token_accuracy": 0.7353951930999756, "num_tokens": 5128838.0, "step": 4985, "train/ce_loss": 0.6808602213859558 }, { "epoch": 0.4928811548348823, "step": 4985, "train/sim_loss": 0.046875 }, { "epoch": 0.4928811548348823, "step": 4985, "train/total_loss": 0.11496102064847946 }, { "entropy": 9.17730712890625, "epoch": 0.49298002768439786, "mean_token_accuracy": 0.6987951993942261, "num_tokens": 5134064.0, "step": 4986, "train/ce_loss": 1.2225298881530762 }, { "epoch": 0.49298002768439786, "step": 4986, "train/sim_loss": 0.09375 }, { "epoch": 0.49298002768439786, "step": 4986, "train/total_loss": 0.21600300073623657 }, { "entropy": 9.292339324951172, "epoch": 0.4930789005339134, "mean_token_accuracy": 0.7302452325820923, "num_tokens": 5139220.0, "step": 4987, "train/ce_loss": 1.093064308166504 }, { "epoch": 0.4930789005339134, "step": 4987, "train/sim_loss": 0.046875 }, { "epoch": 0.4930789005339134, "step": 4987, "train/total_loss": 0.1561814248561859 }, { "entropy": 9.520135879516602, "epoch": 0.4931777733834289, "mean_token_accuracy": 0.6859259009361267, "num_tokens": 5144375.0, "step": 4988, "train/ce_loss": 0.881460428237915 }, { "epoch": 0.4931777733834289, "step": 4988, "train/sim_loss": 0.0859375 }, { "epoch": 0.4931777733834289, "step": 4988, "train/total_loss": 0.17408354580402374 }, { "entropy": 9.749824523925781, "epoch": 0.4932766462329444, "mean_token_accuracy": 0.75262051820755, "num_tokens": 5149261.0, "step": 4989, "train/ce_loss": 3.8940765989536885e-06 }, { "epoch": 0.4932766462329444, "step": 4989, "train/sim_loss": 0.0625 }, { "epoch": 0.4932766462329444, "step": 4989, "train/total_loss": 0.06250038743019104 }, { "entropy": 8.942789077758789, "epoch": 0.49337551908245997, "mean_token_accuracy": 0.7583429217338562, "num_tokens": 5154595.0, "step": 4990, "train/ce_loss": 0.8096892237663269 }, { "epoch": 0.49337551908245997, "step": 4990, "train/sim_loss": 0.05859375 }, { "epoch": 0.49337551908245997, "step": 4990, "train/total_loss": 0.1395626664161682 }, { "entropy": 9.447273254394531, "epoch": 0.49347439193197545, "mean_token_accuracy": 0.7243067026138306, "num_tokens": 5159656.0, "step": 4991, "train/ce_loss": 0.8232438564300537 }, { "epoch": 0.49347439193197545, "step": 4991, "train/sim_loss": 0.01953125 }, { "epoch": 0.49347439193197545, "step": 4991, "train/total_loss": 0.10185563564300537 }, { "entropy": 9.18798542022705, "epoch": 0.493573264781491, "mean_token_accuracy": 0.7008872032165527, "num_tokens": 5165099.0, "step": 4992, "train/ce_loss": 1.2660822868347168 }, { "epoch": 0.493573264781491, "step": 4992, "train/sim_loss": 0.15625 }, { "epoch": 0.493573264781491, "step": 4992, "train/total_loss": 0.2828582525253296 }, { "entropy": 8.92338752746582, "epoch": 0.49367213763100654, "mean_token_accuracy": 0.7684674859046936, "num_tokens": 5170491.0, "step": 4993, "train/ce_loss": 0.6353029608726501 }, { "epoch": 0.49367213763100654, "step": 4993, "train/sim_loss": 0.015625 }, { "epoch": 0.49367213763100654, "step": 4993, "train/total_loss": 0.07915529608726501 }, { "entropy": 8.678300857543945, "epoch": 0.493771010480522, "mean_token_accuracy": 0.7744680643081665, "num_tokens": 5175913.0, "step": 4994, "train/ce_loss": 0.8314459323883057 }, { "epoch": 0.493771010480522, "step": 4994, "train/sim_loss": 0.04296875 }, { "epoch": 0.493771010480522, "step": 4994, "train/total_loss": 0.12611335515975952 }, { "entropy": 8.942366600036621, "epoch": 0.49386988333003756, "mean_token_accuracy": 0.7802874445915222, "num_tokens": 5181510.0, "step": 4995, "train/ce_loss": 0.6299203038215637 }, { "epoch": 0.49386988333003756, "step": 4995, "train/sim_loss": 0.06640625 }, { "epoch": 0.49386988333003756, "step": 4995, "train/total_loss": 0.12939828634262085 }, { "entropy": 9.119218826293945, "epoch": 0.4939687561795531, "mean_token_accuracy": 0.72826087474823, "num_tokens": 5186811.0, "step": 4996, "train/ce_loss": 0.5254350304603577 }, { "epoch": 0.4939687561795531, "step": 4996, "train/sim_loss": 0.0234375 }, { "epoch": 0.4939687561795531, "step": 4996, "train/total_loss": 0.075981006026268 }, { "entropy": 9.664339065551758, "epoch": 0.49406762902906864, "mean_token_accuracy": 0.7256944179534912, "num_tokens": 5191835.0, "step": 4997, "train/ce_loss": 0.9205860495567322 }, { "epoch": 0.49406762902906864, "step": 4997, "train/sim_loss": 0.046875 }, { "epoch": 0.49406762902906864, "step": 4997, "train/total_loss": 0.13893359899520874 }, { "entropy": 8.853754043579102, "epoch": 0.49416650187858413, "mean_token_accuracy": 0.7513691186904907, "num_tokens": 5197280.0, "step": 4998, "train/ce_loss": 0.4986410140991211 }, { "epoch": 0.49416650187858413, "step": 4998, "train/sim_loss": 0.08203125 }, { "epoch": 0.49416650187858413, "step": 4998, "train/total_loss": 0.13189534842967987 }, { "entropy": 9.307087898254395, "epoch": 0.49426537472809967, "mean_token_accuracy": 0.6903669834136963, "num_tokens": 5202112.0, "step": 4999, "train/ce_loss": 2.536309242248535 }, { "epoch": 0.49426537472809967, "step": 4999, "train/sim_loss": 0.0625 }, { "epoch": 0.49426537472809967, "step": 4999, "train/total_loss": 0.31613093614578247 }, { "epoch": 0.4943642475776152, "grad_norm": 0.978468656539917, "learning_rate": 8.766503486129655e-06, "loss": 0.1417, "step": 5000 }, { "entropy": 9.330076217651367, "epoch": 0.4943642475776152, "mean_token_accuracy": 0.7440559267997742, "num_tokens": 5207271.0, "step": 5000, "train/ce_loss": 1.2702337503433228 }, { "epoch": 0.4943642475776152, "step": 5000, "train/sim_loss": 0.12890625 }, { "epoch": 0.4943642475776152, "step": 5000, "train/total_loss": 0.2559296488761902 }, { "entropy": 8.971585273742676, "epoch": 0.4944631204271307, "mean_token_accuracy": 0.7370203137397766, "num_tokens": 5212763.0, "step": 5001, "train/ce_loss": 0.84300297498703 }, { "epoch": 0.4944631204271307, "step": 5001, "train/sim_loss": 0.078125 }, { "epoch": 0.4944631204271307, "step": 5001, "train/total_loss": 0.16242530941963196 }, { "entropy": 8.718502044677734, "epoch": 0.49456199327664624, "mean_token_accuracy": 0.7404162287712097, "num_tokens": 5218152.0, "step": 5002, "train/ce_loss": 1.145225167274475 }, { "epoch": 0.49456199327664624, "step": 5002, "train/sim_loss": 0.09375 }, { "epoch": 0.49456199327664624, "step": 5002, "train/total_loss": 0.2082725167274475 }, { "entropy": 9.640294075012207, "epoch": 0.4946608661261618, "mean_token_accuracy": 0.7985865473747253, "num_tokens": 5223156.0, "step": 5003, "train/ce_loss": 0.8569324612617493 }, { "epoch": 0.4946608661261618, "step": 5003, "train/sim_loss": 0.04296875 }, { "epoch": 0.4946608661261618, "step": 5003, "train/total_loss": 0.12866199016571045 }, { "entropy": 9.10590934753418, "epoch": 0.49475973897567727, "mean_token_accuracy": 0.6654135584831238, "num_tokens": 5228396.0, "step": 5004, "train/ce_loss": 2.4376211058552144e-06 }, { "epoch": 0.49475973897567727, "step": 5004, "train/sim_loss": 0.046875 }, { "epoch": 0.49475973897567727, "step": 5004, "train/total_loss": 0.0468752421438694 }, { "entropy": 8.985590934753418, "epoch": 0.4948586118251928, "mean_token_accuracy": 0.724602222442627, "num_tokens": 5233636.0, "step": 5005, "train/ce_loss": 0.7069535255432129 }, { "epoch": 0.4948586118251928, "step": 5005, "train/sim_loss": 0.046875 }, { "epoch": 0.4948586118251928, "step": 5005, "train/total_loss": 0.11757035553455353 }, { "entropy": 9.321735382080078, "epoch": 0.49495748467470835, "mean_token_accuracy": 0.7353760600090027, "num_tokens": 5238815.0, "step": 5006, "train/ce_loss": 0.682883083820343 }, { "epoch": 0.49495748467470835, "step": 5006, "train/sim_loss": 0.0625 }, { "epoch": 0.49495748467470835, "step": 5006, "train/total_loss": 0.13078831136226654 }, { "entropy": 9.03097152709961, "epoch": 0.49505635752422383, "mean_token_accuracy": 0.6895734667778015, "num_tokens": 5244130.0, "step": 5007, "train/ce_loss": 1.4150376319885254 }, { "epoch": 0.49505635752422383, "step": 5007, "train/sim_loss": 0.04296875 }, { "epoch": 0.49505635752422383, "step": 5007, "train/total_loss": 0.18447251617908478 }, { "entropy": 9.243738174438477, "epoch": 0.4951552303737394, "mean_token_accuracy": 0.82201087474823, "num_tokens": 5249280.0, "step": 5008, "train/ce_loss": 0.3235234320163727 }, { "epoch": 0.4951552303737394, "step": 5008, "train/sim_loss": 0.02734375 }, { "epoch": 0.4951552303737394, "step": 5008, "train/total_loss": 0.05969609320163727 }, { "entropy": 9.48786735534668, "epoch": 0.4952541032232549, "mean_token_accuracy": 0.7474600672721863, "num_tokens": 5254421.0, "step": 5009, "train/ce_loss": 0.7793648838996887 }, { "epoch": 0.4952541032232549, "step": 5009, "train/sim_loss": 0.109375 }, { "epoch": 0.4952541032232549, "step": 5009, "train/total_loss": 0.18731150031089783 }, { "entropy": 9.293548583984375, "epoch": 0.4953529760727704, "mean_token_accuracy": 0.647606372833252, "num_tokens": 5259564.0, "step": 5010, "train/ce_loss": 8.262034612016578e-07 }, { "epoch": 0.4953529760727704, "step": 5010, "train/sim_loss": 0.0234375 }, { "epoch": 0.4953529760727704, "step": 5010, "train/total_loss": 0.023437581956386566 }, { "entropy": 8.798707962036133, "epoch": 0.49545184892228594, "mean_token_accuracy": 0.7243852615356445, "num_tokens": 5265004.0, "step": 5011, "train/ce_loss": 0.8004510402679443 }, { "epoch": 0.49545184892228594, "step": 5011, "train/sim_loss": 0.0703125 }, { "epoch": 0.49545184892228594, "step": 5011, "train/total_loss": 0.15035760402679443 }, { "entropy": 8.94283676147461, "epoch": 0.4955507217718015, "mean_token_accuracy": 0.7059496641159058, "num_tokens": 5270370.0, "step": 5012, "train/ce_loss": 0.8203482031822205 }, { "epoch": 0.4955507217718015, "step": 5012, "train/sim_loss": 0.09765625 }, { "epoch": 0.4955507217718015, "step": 5012, "train/total_loss": 0.17969107627868652 }, { "entropy": 9.285341262817383, "epoch": 0.49564959462131697, "mean_token_accuracy": 0.7302799224853516, "num_tokens": 5275594.0, "step": 5013, "train/ce_loss": 0.4494679272174835 }, { "epoch": 0.49564959462131697, "step": 5013, "train/sim_loss": 0.07421875 }, { "epoch": 0.49564959462131697, "step": 5013, "train/total_loss": 0.11916553974151611 }, { "entropy": 8.98812198638916, "epoch": 0.4957484674708325, "mean_token_accuracy": 0.7211764454841614, "num_tokens": 5280922.0, "step": 5014, "train/ce_loss": 0.7902920246124268 }, { "epoch": 0.4957484674708325, "step": 5014, "train/sim_loss": 0.03125 }, { "epoch": 0.4957484674708325, "step": 5014, "train/total_loss": 0.11027920246124268 }, { "entropy": 8.832348823547363, "epoch": 0.49584734032034805, "mean_token_accuracy": 0.7766179442405701, "num_tokens": 5286345.0, "step": 5015, "train/ce_loss": 0.5260584354400635 }, { "epoch": 0.49584734032034805, "step": 5015, "train/sim_loss": 0.01953125 }, { "epoch": 0.49584734032034805, "step": 5015, "train/total_loss": 0.07213709503412247 }, { "entropy": 9.457469940185547, "epoch": 0.49594621316986354, "mean_token_accuracy": 0.7450330853462219, "num_tokens": 5291401.0, "step": 5016, "train/ce_loss": 0.8888313174247742 }, { "epoch": 0.49594621316986354, "step": 5016, "train/sim_loss": 0.09765625 }, { "epoch": 0.49594621316986354, "step": 5016, "train/total_loss": 0.18653938174247742 }, { "entropy": 8.61086368560791, "epoch": 0.4960450860193791, "mean_token_accuracy": 0.7685185074806213, "num_tokens": 5296942.0, "step": 5017, "train/ce_loss": 0.8579637408256531 }, { "epoch": 0.4960450860193791, "step": 5017, "train/sim_loss": 0.0625 }, { "epoch": 0.4960450860193791, "step": 5017, "train/total_loss": 0.14829638600349426 }, { "entropy": 8.979242324829102, "epoch": 0.4961439588688946, "mean_token_accuracy": 0.7670329809188843, "num_tokens": 5302309.0, "step": 5018, "train/ce_loss": 0.7281983494758606 }, { "epoch": 0.4961439588688946, "step": 5018, "train/sim_loss": 0.05078125 }, { "epoch": 0.4961439588688946, "step": 5018, "train/total_loss": 0.12360108643770218 }, { "entropy": 8.694494247436523, "epoch": 0.4962428317184101, "mean_token_accuracy": 0.76382976770401, "num_tokens": 5307742.0, "step": 5019, "train/ce_loss": 0.7149852514266968 }, { "epoch": 0.4962428317184101, "step": 5019, "train/sim_loss": 0.0234375 }, { "epoch": 0.4962428317184101, "step": 5019, "train/total_loss": 0.09493602812290192 }, { "epoch": 0.49634170456792565, "grad_norm": 0.708148181438446, "learning_rate": 8.761558621371706e-06, "loss": 0.1442, "step": 5020 }, { "entropy": 9.002897262573242, "epoch": 0.49634170456792565, "mean_token_accuracy": 0.7137203216552734, "num_tokens": 5312990.0, "step": 5020, "train/ce_loss": 0.8256528377532959 }, { "epoch": 0.49634170456792565, "step": 5020, "train/sim_loss": 0.05078125 }, { "epoch": 0.49634170456792565, "step": 5020, "train/total_loss": 0.1333465278148651 }, { "entropy": 8.83686351776123, "epoch": 0.4964405774174412, "mean_token_accuracy": 0.7586981058120728, "num_tokens": 5318355.0, "step": 5021, "train/ce_loss": 0.8046140670776367 }, { "epoch": 0.4964405774174412, "step": 5021, "train/sim_loss": 0.04296875 }, { "epoch": 0.4964405774174412, "step": 5021, "train/total_loss": 0.12343015521764755 }, { "entropy": 9.446584701538086, "epoch": 0.4965394502669567, "mean_token_accuracy": 0.7009803652763367, "num_tokens": 5323403.0, "step": 5022, "train/ce_loss": 3.4185343338322127e-06 }, { "epoch": 0.4965394502669567, "step": 5022, "train/sim_loss": 0.03125 }, { "epoch": 0.4965394502669567, "step": 5022, "train/total_loss": 0.03125034272670746 }, { "entropy": 9.473800659179688, "epoch": 0.4966383231164722, "mean_token_accuracy": 0.754687488079071, "num_tokens": 5328506.0, "step": 5023, "train/ce_loss": 1.2287180423736572 }, { "epoch": 0.4966383231164722, "step": 5023, "train/sim_loss": 0.07421875 }, { "epoch": 0.4966383231164722, "step": 5023, "train/total_loss": 0.19709056615829468 }, { "entropy": 8.936258316040039, "epoch": 0.49673719596598775, "mean_token_accuracy": 0.7270588278770447, "num_tokens": 5333952.0, "step": 5024, "train/ce_loss": 1.431602120399475 }, { "epoch": 0.49673719596598775, "step": 5024, "train/sim_loss": 0.10546875 }, { "epoch": 0.49673719596598775, "step": 5024, "train/total_loss": 0.24862895905971527 }, { "entropy": 8.994499206542969, "epoch": 0.49683606881550324, "mean_token_accuracy": 0.7741203308105469, "num_tokens": 5339275.0, "step": 5025, "train/ce_loss": 0.5269079208374023 }, { "epoch": 0.49683606881550324, "step": 5025, "train/sim_loss": 0.01171875 }, { "epoch": 0.49683606881550324, "step": 5025, "train/total_loss": 0.064409539103508 }, { "entropy": 8.922952651977539, "epoch": 0.4969349416650188, "mean_token_accuracy": 0.7412333488464355, "num_tokens": 5344477.0, "step": 5026, "train/ce_loss": 0.9101450443267822 }, { "epoch": 0.4969349416650188, "step": 5026, "train/sim_loss": 0.0390625 }, { "epoch": 0.4969349416650188, "step": 5026, "train/total_loss": 0.13007700443267822 }, { "entropy": 8.915870666503906, "epoch": 0.4970338145145343, "mean_token_accuracy": 0.756394624710083, "num_tokens": 5349750.0, "step": 5027, "train/ce_loss": 0.9538937211036682 }, { "epoch": 0.4970338145145343, "step": 5027, "train/sim_loss": 0.0859375 }, { "epoch": 0.4970338145145343, "step": 5027, "train/total_loss": 0.18132686614990234 }, { "entropy": 8.895801544189453, "epoch": 0.4971326873640498, "mean_token_accuracy": 0.7528795599937439, "num_tokens": 5355175.0, "step": 5028, "train/ce_loss": 0.5815815329551697 }, { "epoch": 0.4971326873640498, "step": 5028, "train/sim_loss": 0.0546875 }, { "epoch": 0.4971326873640498, "step": 5028, "train/total_loss": 0.11284565925598145 }, { "entropy": 8.86722469329834, "epoch": 0.49723156021356535, "mean_token_accuracy": 0.7579185366630554, "num_tokens": 5360546.0, "step": 5029, "train/ce_loss": 0.5811032652854919 }, { "epoch": 0.49723156021356535, "step": 5029, "train/sim_loss": 0.0546875 }, { "epoch": 0.49723156021356535, "step": 5029, "train/total_loss": 0.1127978265285492 }, { "entropy": 9.113153457641602, "epoch": 0.4973304330630809, "mean_token_accuracy": 0.7327001094818115, "num_tokens": 5365752.0, "step": 5030, "train/ce_loss": 0.8052600026130676 }, { "epoch": 0.4973304330630809, "step": 5030, "train/sim_loss": 0.0546875 }, { "epoch": 0.4973304330630809, "step": 5030, "train/total_loss": 0.13521349430084229 }, { "entropy": 9.06174087524414, "epoch": 0.4974293059125964, "mean_token_accuracy": 0.7424072027206421, "num_tokens": 5371100.0, "step": 5031, "train/ce_loss": 0.5513002872467041 }, { "epoch": 0.4974293059125964, "step": 5031, "train/sim_loss": 0.02734375 }, { "epoch": 0.4974293059125964, "step": 5031, "train/total_loss": 0.08247378468513489 }, { "entropy": 9.803543090820312, "epoch": 0.4975281787621119, "mean_token_accuracy": 0.7523364424705505, "num_tokens": 5375937.0, "step": 5032, "train/ce_loss": 3.624947339631035e-06 }, { "epoch": 0.4975281787621119, "step": 5032, "train/sim_loss": 0.0234375 }, { "epoch": 0.4975281787621119, "step": 5032, "train/total_loss": 0.0234378632158041 }, { "entropy": 8.803306579589844, "epoch": 0.49762705161162746, "mean_token_accuracy": 0.7314629554748535, "num_tokens": 5381452.0, "step": 5033, "train/ce_loss": 0.923446774482727 }, { "epoch": 0.49762705161162746, "step": 5033, "train/sim_loss": 0.10546875 }, { "epoch": 0.49762705161162746, "step": 5033, "train/total_loss": 0.19781342148780823 }, { "entropy": 9.446455001831055, "epoch": 0.49772592446114294, "mean_token_accuracy": 0.7828371524810791, "num_tokens": 5386481.0, "step": 5034, "train/ce_loss": 1.076431393623352 }, { "epoch": 0.49772592446114294, "step": 5034, "train/sim_loss": 0.03125 }, { "epoch": 0.49772592446114294, "step": 5034, "train/total_loss": 0.13889314234256744 }, { "entropy": 9.671392440795898, "epoch": 0.4978247973106585, "mean_token_accuracy": 0.8042105436325073, "num_tokens": 5391399.0, "step": 5035, "train/ce_loss": 0.9019742012023926 }, { "epoch": 0.4978247973106585, "step": 5035, "train/sim_loss": 0.03125 }, { "epoch": 0.4978247973106585, "step": 5035, "train/total_loss": 0.12144742161035538 }, { "entropy": 8.512903213500977, "epoch": 0.497923670160174, "mean_token_accuracy": 0.7120291590690613, "num_tokens": 5396742.0, "step": 5036, "train/ce_loss": 1.0357922315597534 }, { "epoch": 0.497923670160174, "step": 5036, "train/sim_loss": 0.03125 }, { "epoch": 0.497923670160174, "step": 5036, "train/total_loss": 0.13482922315597534 }, { "entropy": 9.497211456298828, "epoch": 0.49802254300968957, "mean_token_accuracy": 0.7265501022338867, "num_tokens": 5401805.0, "step": 5037, "train/ce_loss": 0.9477840065956116 }, { "epoch": 0.49802254300968957, "step": 5037, "train/sim_loss": 0.06640625 }, { "epoch": 0.49802254300968957, "step": 5037, "train/total_loss": 0.1611846536397934 }, { "entropy": 9.765968322753906, "epoch": 0.49812141585920505, "mean_token_accuracy": 0.745920717716217, "num_tokens": 5406604.0, "step": 5038, "train/ce_loss": 3.671036438390729e-06 }, { "epoch": 0.49812141585920505, "step": 5038, "train/sim_loss": 0.03125 }, { "epoch": 0.49812141585920505, "step": 5038, "train/total_loss": 0.03125036880373955 }, { "entropy": 8.735824584960938, "epoch": 0.4982202887087206, "mean_token_accuracy": 0.7946635484695435, "num_tokens": 5411940.0, "step": 5039, "train/ce_loss": 0.4332225024700165 }, { "epoch": 0.4982202887087206, "step": 5039, "train/sim_loss": 0.02734375 }, { "epoch": 0.4982202887087206, "step": 5039, "train/total_loss": 0.07066600024700165 }, { "epoch": 0.49831916155823613, "grad_norm": 0.6704682111740112, "learning_rate": 8.756613756613758e-06, "loss": 0.1323, "step": 5040 }, { "entropy": 9.419351577758789, "epoch": 0.49831916155823613, "mean_token_accuracy": 0.702786386013031, "num_tokens": 5417029.0, "step": 5040, "train/ce_loss": 1.07545006275177 }, { "epoch": 0.49831916155823613, "step": 5040, "train/sim_loss": 0.0546875 }, { "epoch": 0.49831916155823613, "step": 5040, "train/total_loss": 0.16223251819610596 }, { "entropy": 9.154134750366211, "epoch": 0.4984180344077516, "mean_token_accuracy": 0.7070844769477844, "num_tokens": 5422191.0, "step": 5041, "train/ce_loss": 0.9596898555755615 }, { "epoch": 0.4984180344077516, "step": 5041, "train/sim_loss": 0.0546875 }, { "epoch": 0.4984180344077516, "step": 5041, "train/total_loss": 0.15065649151802063 }, { "entropy": 8.867704391479492, "epoch": 0.49851690725726716, "mean_token_accuracy": 0.7425968050956726, "num_tokens": 5427569.0, "step": 5042, "train/ce_loss": 1.0225627422332764 }, { "epoch": 0.49851690725726716, "step": 5042, "train/sim_loss": 0.08984375 }, { "epoch": 0.49851690725726716, "step": 5042, "train/total_loss": 0.19210001826286316 }, { "entropy": 8.442657470703125, "epoch": 0.4986157801067827, "mean_token_accuracy": 0.7573149800300598, "num_tokens": 5433229.0, "step": 5043, "train/ce_loss": 0.5535759329795837 }, { "epoch": 0.4986157801067827, "step": 5043, "train/sim_loss": 0.02734375 }, { "epoch": 0.4986157801067827, "step": 5043, "train/total_loss": 0.08270134031772614 }, { "entropy": 8.905779838562012, "epoch": 0.4987146529562982, "mean_token_accuracy": 0.6979293823242188, "num_tokens": 5438570.0, "step": 5044, "train/ce_loss": 1.03871750831604 }, { "epoch": 0.4987146529562982, "step": 5044, "train/sim_loss": 0.0390625 }, { "epoch": 0.4987146529562982, "step": 5044, "train/total_loss": 0.14293426275253296 }, { "entropy": 8.809632301330566, "epoch": 0.49881352580581373, "mean_token_accuracy": 0.7377220392227173, "num_tokens": 5443984.0, "step": 5045, "train/ce_loss": 0.9378743767738342 }, { "epoch": 0.49881352580581373, "step": 5045, "train/sim_loss": 0.06640625 }, { "epoch": 0.49881352580581373, "step": 5045, "train/total_loss": 0.16019368171691895 }, { "entropy": 9.212902069091797, "epoch": 0.49891239865532927, "mean_token_accuracy": 0.7313432693481445, "num_tokens": 5449114.0, "step": 5046, "train/ce_loss": 0.8752282857894897 }, { "epoch": 0.49891239865532927, "step": 5046, "train/sim_loss": 0.12109375 }, { "epoch": 0.49891239865532927, "step": 5046, "train/total_loss": 0.20861658453941345 }, { "entropy": 9.310138702392578, "epoch": 0.49901127150484476, "mean_token_accuracy": 0.7603833675384521, "num_tokens": 5454223.0, "step": 5047, "train/ce_loss": 0.8113348484039307 }, { "epoch": 0.49901127150484476, "step": 5047, "train/sim_loss": 0.03515625 }, { "epoch": 0.49901127150484476, "step": 5047, "train/total_loss": 0.11628973484039307 }, { "entropy": 8.914920806884766, "epoch": 0.4991101443543603, "mean_token_accuracy": 0.6916950941085815, "num_tokens": 5459572.0, "step": 5048, "train/ce_loss": 0.7768692970275879 }, { "epoch": 0.4991101443543603, "step": 5048, "train/sim_loss": 0.0703125 }, { "epoch": 0.4991101443543603, "step": 5048, "train/total_loss": 0.14799943566322327 }, { "entropy": 9.293927192687988, "epoch": 0.49920901720387584, "mean_token_accuracy": 0.7607913613319397, "num_tokens": 5464588.0, "step": 5049, "train/ce_loss": 0.628460705280304 }, { "epoch": 0.49920901720387584, "step": 5049, "train/sim_loss": 0.078125 }, { "epoch": 0.49920901720387584, "step": 5049, "train/total_loss": 0.14097106456756592 }, { "entropy": 8.986357688903809, "epoch": 0.4993078900533913, "mean_token_accuracy": 0.7927711009979248, "num_tokens": 5469900.0, "step": 5050, "train/ce_loss": 0.8069027662277222 }, { "epoch": 0.4993078900533913, "step": 5050, "train/sim_loss": 0.08984375 }, { "epoch": 0.4993078900533913, "step": 5050, "train/total_loss": 0.17053402960300446 }, { "entropy": 8.951379776000977, "epoch": 0.49940676290290686, "mean_token_accuracy": 0.7124999761581421, "num_tokens": 5475275.0, "step": 5051, "train/ce_loss": 1.2251209020614624 }, { "epoch": 0.49940676290290686, "step": 5051, "train/sim_loss": 0.03515625 }, { "epoch": 0.49940676290290686, "step": 5051, "train/total_loss": 0.1576683521270752 }, { "entropy": 8.93557071685791, "epoch": 0.4995056357524224, "mean_token_accuracy": 0.7454545497894287, "num_tokens": 5480505.0, "step": 5052, "train/ce_loss": 0.6807805895805359 }, { "epoch": 0.4995056357524224, "step": 5052, "train/sim_loss": 0.02734375 }, { "epoch": 0.4995056357524224, "step": 5052, "train/total_loss": 0.09542181342840195 }, { "entropy": 8.674575805664062, "epoch": 0.4996045086019379, "mean_token_accuracy": 0.7017892599105835, "num_tokens": 5485975.0, "step": 5053, "train/ce_loss": 0.5238415598869324 }, { "epoch": 0.4996045086019379, "step": 5053, "train/sim_loss": 0.0546875 }, { "epoch": 0.4996045086019379, "step": 5053, "train/total_loss": 0.107071653008461 }, { "entropy": 9.003825187683105, "epoch": 0.49970338145145343, "mean_token_accuracy": 0.7698323726654053, "num_tokens": 5491353.0, "step": 5054, "train/ce_loss": 0.5558789372444153 }, { "epoch": 0.49970338145145343, "step": 5054, "train/sim_loss": 0.046875 }, { "epoch": 0.49970338145145343, "step": 5054, "train/total_loss": 0.10246289521455765 }, { "entropy": 9.294960021972656, "epoch": 0.499802254300969, "mean_token_accuracy": 0.6948148012161255, "num_tokens": 5496551.0, "step": 5055, "train/ce_loss": 1.2571916580200195 }, { "epoch": 0.499802254300969, "step": 5055, "train/sim_loss": 0.0703125 }, { "epoch": 0.499802254300969, "step": 5055, "train/total_loss": 0.19603167474269867 }, { "entropy": 8.980306625366211, "epoch": 0.49990112715048446, "mean_token_accuracy": 0.7334109544754028, "num_tokens": 5501879.0, "step": 5056, "train/ce_loss": 0.9166693091392517 }, { "epoch": 0.49990112715048446, "step": 5056, "train/sim_loss": 0.0546875 }, { "epoch": 0.49990112715048446, "step": 5056, "train/total_loss": 0.14635443687438965 }, { "entropy": 8.993450164794922, "epoch": 0.5, "mean_token_accuracy": 0.7285714149475098, "num_tokens": 5507152.0, "step": 5057, "train/ce_loss": 1.3876967430114746 }, { "epoch": 0.5, "step": 5057, "train/sim_loss": 0.0625 }, { "epoch": 0.5, "step": 5057, "train/total_loss": 0.20126967132091522 }, { "entropy": 9.219903945922852, "epoch": 0.5000988728495155, "mean_token_accuracy": 0.7168141603469849, "num_tokens": 5512277.0, "step": 5058, "train/ce_loss": 0.6238693594932556 }, { "epoch": 0.5000988728495155, "step": 5058, "train/sim_loss": 0.0390625 }, { "epoch": 0.5000988728495155, "step": 5058, "train/total_loss": 0.10144943743944168 }, { "entropy": 8.974245071411133, "epoch": 0.5001977456990311, "mean_token_accuracy": 0.7571234703063965, "num_tokens": 5517552.0, "step": 5059, "train/ce_loss": 1.0427740812301636 }, { "epoch": 0.5001977456990311, "step": 5059, "train/sim_loss": 0.0625 }, { "epoch": 0.5001977456990311, "step": 5059, "train/total_loss": 0.16677740216255188 }, { "epoch": 0.5002966185485466, "grad_norm": 0.7067264914512634, "learning_rate": 8.751668891855809e-06, "loss": 0.148, "step": 5060 }, { "entropy": 9.101495742797852, "epoch": 0.5002966185485466, "mean_token_accuracy": 0.7624831199645996, "num_tokens": 5522769.0, "step": 5060, "train/ce_loss": 0.7306716442108154 }, { "epoch": 0.5002966185485466, "step": 5060, "train/sim_loss": 0.0625 }, { "epoch": 0.5002966185485466, "step": 5060, "train/total_loss": 0.13556715846061707 }, { "entropy": 9.395573616027832, "epoch": 0.500395491398062, "mean_token_accuracy": 0.7682119011878967, "num_tokens": 5527923.0, "step": 5061, "train/ce_loss": 1.0372035503387451 }, { "epoch": 0.500395491398062, "step": 5061, "train/sim_loss": 0.0234375 }, { "epoch": 0.500395491398062, "step": 5061, "train/total_loss": 0.12715786695480347 }, { "entropy": 8.775077819824219, "epoch": 0.5004943642475776, "mean_token_accuracy": 0.7476922869682312, "num_tokens": 5533379.0, "step": 5062, "train/ce_loss": 0.8463733196258545 }, { "epoch": 0.5004943642475776, "step": 5062, "train/sim_loss": 0.08203125 }, { "epoch": 0.5004943642475776, "step": 5062, "train/total_loss": 0.1666685938835144 }, { "entropy": 9.09598159790039, "epoch": 0.5005932370970931, "mean_token_accuracy": 0.7230273485183716, "num_tokens": 5538438.0, "step": 5063, "train/ce_loss": 1.003156304359436 }, { "epoch": 0.5005932370970931, "step": 5063, "train/sim_loss": 0.0546875 }, { "epoch": 0.5005932370970931, "step": 5063, "train/total_loss": 0.1550031304359436 }, { "entropy": 9.751112937927246, "epoch": 0.5006921099466086, "mean_token_accuracy": 0.7576419115066528, "num_tokens": 5543300.0, "step": 5064, "train/ce_loss": 1.6366160480174585e-06 }, { "epoch": 0.5006921099466086, "step": 5064, "train/sim_loss": 0.01953125 }, { "epoch": 0.5006921099466086, "step": 5064, "train/total_loss": 0.019531413912773132 }, { "entropy": 9.934198379516602, "epoch": 0.5007909827961242, "mean_token_accuracy": 0.7223719954490662, "num_tokens": 5548099.0, "step": 5065, "train/ce_loss": 1.7864806522993604e-06 }, { "epoch": 0.5007909827961242, "step": 5065, "train/sim_loss": 0.0625 }, { "epoch": 0.5007909827961242, "step": 5065, "train/total_loss": 0.06250017881393433 }, { "entropy": 9.23078727722168, "epoch": 0.5008898556456397, "mean_token_accuracy": 0.695067286491394, "num_tokens": 5553218.0, "step": 5066, "train/ce_loss": 2.113192067554337e-06 }, { "epoch": 0.5008898556456397, "step": 5066, "train/sim_loss": 0.0546875 }, { "epoch": 0.5008898556456397, "step": 5066, "train/total_loss": 0.05468771234154701 }, { "entropy": 8.907807350158691, "epoch": 0.5009887284951552, "mean_token_accuracy": 0.7356828451156616, "num_tokens": 5558378.0, "step": 5067, "train/ce_loss": 3.088776111326297e-06 }, { "epoch": 0.5009887284951552, "step": 5067, "train/sim_loss": 0.03125 }, { "epoch": 0.5009887284951552, "step": 5067, "train/total_loss": 0.03125030919909477 }, { "entropy": 8.918107986450195, "epoch": 0.5010876013446708, "mean_token_accuracy": 0.7873620986938477, "num_tokens": 5563990.0, "step": 5068, "train/ce_loss": 0.8910037279129028 }, { "epoch": 0.5010876013446708, "step": 5068, "train/sim_loss": 0.06640625 }, { "epoch": 0.5010876013446708, "step": 5068, "train/total_loss": 0.15550662577152252 }, { "entropy": 9.20634937286377, "epoch": 0.5011864741941863, "mean_token_accuracy": 0.8256275057792664, "num_tokens": 5569206.0, "step": 5069, "train/ce_loss": 1.3271516081658774e-06 }, { "epoch": 0.5011864741941863, "step": 5069, "train/sim_loss": 0.08203125 }, { "epoch": 0.5011864741941863, "step": 5069, "train/total_loss": 0.08203138411045074 }, { "entropy": 9.636856079101562, "epoch": 0.5012853470437018, "mean_token_accuracy": 0.8415637612342834, "num_tokens": 5574152.0, "step": 5070, "train/ce_loss": 2.1191829091549153e-06 }, { "epoch": 0.5012853470437018, "step": 5070, "train/sim_loss": 0.0390625 }, { "epoch": 0.5012853470437018, "step": 5070, "train/total_loss": 0.03906271234154701 }, { "entropy": 9.437459945678711, "epoch": 0.5013842198932174, "mean_token_accuracy": 0.7422680258750916, "num_tokens": 5579077.0, "step": 5071, "train/ce_loss": 1.5218898852253915e-06 }, { "epoch": 0.5013842198932174, "step": 5071, "train/sim_loss": 0.04296875 }, { "epoch": 0.5013842198932174, "step": 5071, "train/total_loss": 0.04296890273690224 }, { "entropy": 8.63110637664795, "epoch": 0.5014830927427328, "mean_token_accuracy": 0.7207123041152954, "num_tokens": 5584571.0, "step": 5072, "train/ce_loss": 1.1009140014648438 }, { "epoch": 0.5014830927427328, "step": 5072, "train/sim_loss": 0.1015625 }, { "epoch": 0.5014830927427328, "step": 5072, "train/total_loss": 0.2116539031267166 }, { "entropy": 8.963809967041016, "epoch": 0.5015819655922483, "mean_token_accuracy": 0.7305764555931091, "num_tokens": 5589899.0, "step": 5073, "train/ce_loss": 0.9008622169494629 }, { "epoch": 0.5015819655922483, "step": 5073, "train/sim_loss": 0.08203125 }, { "epoch": 0.5015819655922483, "step": 5073, "train/total_loss": 0.1721174716949463 }, { "entropy": 8.869132041931152, "epoch": 0.5016808384417639, "mean_token_accuracy": 0.7274800539016724, "num_tokens": 5595264.0, "step": 5074, "train/ce_loss": 0.7225847244262695 }, { "epoch": 0.5016808384417639, "step": 5074, "train/sim_loss": 0.0234375 }, { "epoch": 0.5016808384417639, "step": 5074, "train/total_loss": 0.09569597244262695 }, { "entropy": 9.230053901672363, "epoch": 0.5017797112912794, "mean_token_accuracy": 0.7642045617103577, "num_tokens": 5600426.0, "step": 5075, "train/ce_loss": 1.804285034268105e-06 }, { "epoch": 0.5017797112912794, "step": 5075, "train/sim_loss": 0.08203125 }, { "epoch": 0.5017797112912794, "step": 5075, "train/total_loss": 0.08203142881393433 }, { "entropy": 8.775399208068848, "epoch": 0.5018785841407949, "mean_token_accuracy": 0.7524038553237915, "num_tokens": 5605725.0, "step": 5076, "train/ce_loss": 0.5871142745018005 }, { "epoch": 0.5018785841407949, "step": 5076, "train/sim_loss": 0.04296875 }, { "epoch": 0.5018785841407949, "step": 5076, "train/total_loss": 0.10168017446994781 }, { "entropy": 9.455656051635742, "epoch": 0.5019774569903105, "mean_token_accuracy": 0.7743785977363586, "num_tokens": 5610706.0, "step": 5077, "train/ce_loss": 1.207939863204956 }, { "epoch": 0.5019774569903105, "step": 5077, "train/sim_loss": 0.04296875 }, { "epoch": 0.5019774569903105, "step": 5077, "train/total_loss": 0.16376274824142456 }, { "entropy": 8.894990921020508, "epoch": 0.502076329839826, "mean_token_accuracy": 0.6934023499488831, "num_tokens": 5615949.0, "step": 5078, "train/ce_loss": 0.9794240593910217 }, { "epoch": 0.502076329839826, "step": 5078, "train/sim_loss": 0.1015625 }, { "epoch": 0.502076329839826, "step": 5078, "train/total_loss": 0.19950491189956665 }, { "entropy": 8.97636604309082, "epoch": 0.5021752026893415, "mean_token_accuracy": 0.6861042380332947, "num_tokens": 5621186.0, "step": 5079, "train/ce_loss": 1.6270090341567993 }, { "epoch": 0.5021752026893415, "step": 5079, "train/sim_loss": 0.05078125 }, { "epoch": 0.5021752026893415, "step": 5079, "train/total_loss": 0.21348215639591217 }, { "epoch": 0.5022740755388571, "grad_norm": 0.8323706984519958, "learning_rate": 8.746724027097859e-06, "loss": 0.1363, "step": 5080 }, { "entropy": 9.041561126708984, "epoch": 0.5022740755388571, "mean_token_accuracy": 0.6832579374313354, "num_tokens": 5626592.0, "step": 5080, "train/ce_loss": 0.9357867240905762 }, { "epoch": 0.5022740755388571, "step": 5080, "train/sim_loss": 0.05859375 }, { "epoch": 0.5022740755388571, "step": 5080, "train/total_loss": 0.15217241644859314 }, { "entropy": 9.809600830078125, "epoch": 0.5023729483883725, "mean_token_accuracy": 0.6909871101379395, "num_tokens": 5631449.0, "step": 5081, "train/ce_loss": 2.0013485482195392e-06 }, { "epoch": 0.5023729483883725, "step": 5081, "train/sim_loss": 0.05078125 }, { "epoch": 0.5023729483883725, "step": 5081, "train/total_loss": 0.05078145116567612 }, { "entropy": 9.92713737487793, "epoch": 0.502471821237888, "mean_token_accuracy": 0.6908315420150757, "num_tokens": 5636336.0, "step": 5082, "train/ce_loss": 1.3149290084838867 }, { "epoch": 0.502471821237888, "step": 5082, "train/sim_loss": 0.03125 }, { "epoch": 0.502471821237888, "step": 5082, "train/total_loss": 0.16274289786815643 }, { "entropy": 8.473834037780762, "epoch": 0.5025706940874036, "mean_token_accuracy": 0.7618147730827332, "num_tokens": 5641881.0, "step": 5083, "train/ce_loss": 0.5337604284286499 }, { "epoch": 0.5025706940874036, "step": 5083, "train/sim_loss": 0.0546875 }, { "epoch": 0.5025706940874036, "step": 5083, "train/total_loss": 0.10806354880332947 }, { "entropy": 8.596364974975586, "epoch": 0.5026695669369191, "mean_token_accuracy": 0.738070011138916, "num_tokens": 5647324.0, "step": 5084, "train/ce_loss": 0.8980840444564819 }, { "epoch": 0.5026695669369191, "step": 5084, "train/sim_loss": 0.0390625 }, { "epoch": 0.5026695669369191, "step": 5084, "train/total_loss": 0.1288709044456482 }, { "entropy": 9.061573028564453, "epoch": 0.5027684397864346, "mean_token_accuracy": 0.772020697593689, "num_tokens": 5652538.0, "step": 5085, "train/ce_loss": 0.4668557643890381 }, { "epoch": 0.5027684397864346, "step": 5085, "train/sim_loss": 0.05078125 }, { "epoch": 0.5027684397864346, "step": 5085, "train/total_loss": 0.09746682643890381 }, { "entropy": 9.521434783935547, "epoch": 0.5028673126359502, "mean_token_accuracy": 0.761168360710144, "num_tokens": 5657519.0, "step": 5086, "train/ce_loss": 1.008385419845581 }, { "epoch": 0.5028673126359502, "step": 5086, "train/sim_loss": 0.0703125 }, { "epoch": 0.5028673126359502, "step": 5086, "train/total_loss": 0.1711510419845581 }, { "entropy": 9.393722534179688, "epoch": 0.5029661854854657, "mean_token_accuracy": 0.7013698816299438, "num_tokens": 5662607.0, "step": 5087, "train/ce_loss": 0.8183227777481079 }, { "epoch": 0.5029661854854657, "step": 5087, "train/sim_loss": 0.05859375 }, { "epoch": 0.5029661854854657, "step": 5087, "train/total_loss": 0.14042603969573975 }, { "entropy": 9.471672058105469, "epoch": 0.5030650583349812, "mean_token_accuracy": 0.7374045848846436, "num_tokens": 5667655.0, "step": 5088, "train/ce_loss": 1.2775394916534424 }, { "epoch": 0.5030650583349812, "step": 5088, "train/sim_loss": 0.046875 }, { "epoch": 0.5030650583349812, "step": 5088, "train/total_loss": 0.17462895810604095 }, { "entropy": 8.942428588867188, "epoch": 0.5031639311844968, "mean_token_accuracy": 0.7096773982048035, "num_tokens": 5673086.0, "step": 5089, "train/ce_loss": 0.5040650367736816 }, { "epoch": 0.5031639311844968, "step": 5089, "train/sim_loss": 0.08203125 }, { "epoch": 0.5031639311844968, "step": 5089, "train/total_loss": 0.13243775069713593 }, { "entropy": 9.104717254638672, "epoch": 0.5032628040340122, "mean_token_accuracy": 0.7839999794960022, "num_tokens": 5678425.0, "step": 5090, "train/ce_loss": 0.7472172379493713 }, { "epoch": 0.5032628040340122, "step": 5090, "train/sim_loss": 0.03125 }, { "epoch": 0.5032628040340122, "step": 5090, "train/total_loss": 0.10597172379493713 }, { "entropy": 8.89947509765625, "epoch": 0.5033616768835277, "mean_token_accuracy": 0.7226697206497192, "num_tokens": 5683794.0, "step": 5091, "train/ce_loss": 1.0725902318954468 }, { "epoch": 0.5033616768835277, "step": 5091, "train/sim_loss": 0.0625 }, { "epoch": 0.5033616768835277, "step": 5091, "train/total_loss": 0.16975903511047363 }, { "entropy": 9.385111808776855, "epoch": 0.5034605497330433, "mean_token_accuracy": 0.7701492309570312, "num_tokens": 5688932.0, "step": 5092, "train/ce_loss": 0.46031317114830017 }, { "epoch": 0.5034605497330433, "step": 5092, "train/sim_loss": 0.03125 }, { "epoch": 0.5034605497330433, "step": 5092, "train/total_loss": 0.07728131860494614 }, { "entropy": 9.301116943359375, "epoch": 0.5035594225825588, "mean_token_accuracy": 0.7307132482528687, "num_tokens": 5694057.0, "step": 5093, "train/ce_loss": 0.8710409998893738 }, { "epoch": 0.5035594225825588, "step": 5093, "train/sim_loss": 0.05078125 }, { "epoch": 0.5035594225825588, "step": 5093, "train/total_loss": 0.13788536190986633 }, { "entropy": 9.771774291992188, "epoch": 0.5036582954320743, "mean_token_accuracy": 0.752598762512207, "num_tokens": 5698972.0, "step": 5094, "train/ce_loss": 1.4751821756362915 }, { "epoch": 0.5036582954320743, "step": 5094, "train/sim_loss": 0.03515625 }, { "epoch": 0.5036582954320743, "step": 5094, "train/total_loss": 0.18267446756362915 }, { "entropy": 9.59885311126709, "epoch": 0.5037571682815899, "mean_token_accuracy": 0.8051947951316833, "num_tokens": 5704009.0, "step": 5095, "train/ce_loss": 1.8043161844616407e-06 }, { "epoch": 0.5037571682815899, "step": 5095, "train/sim_loss": 0.04296875 }, { "epoch": 0.5037571682815899, "step": 5095, "train/total_loss": 0.042968928813934326 }, { "entropy": 9.618340492248535, "epoch": 0.5038560411311054, "mean_token_accuracy": 0.7326202988624573, "num_tokens": 5708996.0, "step": 5096, "train/ce_loss": 1.0726532764238073e-06 }, { "epoch": 0.5038560411311054, "step": 5096, "train/sim_loss": 0.0234375 }, { "epoch": 0.5038560411311054, "step": 5096, "train/total_loss": 0.023437608033418655 }, { "entropy": 8.889970779418945, "epoch": 0.5039549139806209, "mean_token_accuracy": 0.7784877419471741, "num_tokens": 5714431.0, "step": 5097, "train/ce_loss": 1.165142297744751 }, { "epoch": 0.5039549139806209, "step": 5097, "train/sim_loss": 0.09765625 }, { "epoch": 0.5039549139806209, "step": 5097, "train/total_loss": 0.21417048573493958 }, { "entropy": 9.063730239868164, "epoch": 0.5040537868301365, "mean_token_accuracy": 0.6886792182922363, "num_tokens": 5719672.0, "step": 5098, "train/ce_loss": 1.0833317041397095 }, { "epoch": 0.5040537868301365, "step": 5098, "train/sim_loss": 0.03125 }, { "epoch": 0.5040537868301365, "step": 5098, "train/total_loss": 0.13958317041397095 }, { "entropy": 9.895206451416016, "epoch": 0.504152659679652, "mean_token_accuracy": 0.7579908967018127, "num_tokens": 5724532.0, "step": 5099, "train/ce_loss": 1.2715253829956055 }, { "epoch": 0.504152659679652, "step": 5099, "train/sim_loss": 0.140625 }, { "epoch": 0.504152659679652, "step": 5099, "train/total_loss": 0.26777756214141846 }, { "epoch": 0.5042515325291675, "grad_norm": 0.8440897464752197, "learning_rate": 8.741779162339911e-06, "loss": 0.136, "step": 5100 }, { "entropy": 9.225937843322754, "epoch": 0.5042515325291675, "mean_token_accuracy": 0.7293333411216736, "num_tokens": 5729720.0, "step": 5100, "train/ce_loss": 0.8220096230506897 }, { "epoch": 0.5042515325291675, "step": 5100, "train/sim_loss": 0.0703125 }, { "epoch": 0.5042515325291675, "step": 5100, "train/total_loss": 0.15251347422599792 }, { "entropy": 9.440406799316406, "epoch": 0.504350405378683, "mean_token_accuracy": 0.7401032447814941, "num_tokens": 5734798.0, "step": 5101, "train/ce_loss": 1.4222238063812256 }, { "epoch": 0.504350405378683, "step": 5101, "train/sim_loss": 0.0859375 }, { "epoch": 0.504350405378683, "step": 5101, "train/total_loss": 0.22815988957881927 }, { "entropy": 9.28703498840332, "epoch": 0.5044492782281985, "mean_token_accuracy": 0.7796609997749329, "num_tokens": 5739901.0, "step": 5102, "train/ce_loss": 2.876155122066848e-06 }, { "epoch": 0.5044492782281985, "step": 5102, "train/sim_loss": 0.0546875 }, { "epoch": 0.5044492782281985, "step": 5102, "train/total_loss": 0.05468778684735298 }, { "entropy": 8.814054489135742, "epoch": 0.5045481510777141, "mean_token_accuracy": 0.7667638659477234, "num_tokens": 5745440.0, "step": 5103, "train/ce_loss": 0.5586997866630554 }, { "epoch": 0.5045481510777141, "step": 5103, "train/sim_loss": 0.01953125 }, { "epoch": 0.5045481510777141, "step": 5103, "train/total_loss": 0.07540123164653778 }, { "entropy": 9.398179054260254, "epoch": 0.5046470239272296, "mean_token_accuracy": 0.7651515007019043, "num_tokens": 5750815.0, "step": 5104, "train/ce_loss": 1.1095727682113647 }, { "epoch": 0.5046470239272296, "step": 5104, "train/sim_loss": 0.07421875 }, { "epoch": 0.5046470239272296, "step": 5104, "train/total_loss": 0.1851760298013687 }, { "entropy": 9.207182884216309, "epoch": 0.5047458967767451, "mean_token_accuracy": 0.7735334038734436, "num_tokens": 5756036.0, "step": 5105, "train/ce_loss": 0.7678409814834595 }, { "epoch": 0.5047458967767451, "step": 5105, "train/sim_loss": 0.02734375 }, { "epoch": 0.5047458967767451, "step": 5105, "train/total_loss": 0.10412784665822983 }, { "entropy": 9.221953392028809, "epoch": 0.5048447696262607, "mean_token_accuracy": 0.7303370833396912, "num_tokens": 5761183.0, "step": 5106, "train/ce_loss": 1.2139334678649902 }, { "epoch": 0.5048447696262607, "step": 5106, "train/sim_loss": 0.0546875 }, { "epoch": 0.5048447696262607, "step": 5106, "train/total_loss": 0.1760808527469635 }, { "entropy": 8.855124473571777, "epoch": 0.5049436424757762, "mean_token_accuracy": 0.7108306884765625, "num_tokens": 5766601.0, "step": 5107, "train/ce_loss": 1.0811680555343628 }, { "epoch": 0.5049436424757762, "step": 5107, "train/sim_loss": 0.0546875 }, { "epoch": 0.5049436424757762, "step": 5107, "train/total_loss": 0.16280430555343628 }, { "entropy": 8.905277252197266, "epoch": 0.5050425153252917, "mean_token_accuracy": 0.7352085113525391, "num_tokens": 5772022.0, "step": 5108, "train/ce_loss": 0.4442138969898224 }, { "epoch": 0.5050425153252917, "step": 5108, "train/sim_loss": 0.046875 }, { "epoch": 0.5050425153252917, "step": 5108, "train/total_loss": 0.09129638969898224 }, { "entropy": 9.026247024536133, "epoch": 0.5051413881748072, "mean_token_accuracy": 0.7465224266052246, "num_tokens": 5777074.0, "step": 5109, "train/ce_loss": 1.2243313789367676 }, { "epoch": 0.5051413881748072, "step": 5109, "train/sim_loss": 0.078125 }, { "epoch": 0.5051413881748072, "step": 5109, "train/total_loss": 0.200558140873909 }, { "entropy": 8.925554275512695, "epoch": 0.5052402610243227, "mean_token_accuracy": 0.7467144727706909, "num_tokens": 5782392.0, "step": 5110, "train/ce_loss": 0.7173311114311218 }, { "epoch": 0.5052402610243227, "step": 5110, "train/sim_loss": 0.04296875 }, { "epoch": 0.5052402610243227, "step": 5110, "train/total_loss": 0.11470185965299606 }, { "entropy": 9.078937530517578, "epoch": 0.5053391338738382, "mean_token_accuracy": 0.7120000123977661, "num_tokens": 5787672.0, "step": 5111, "train/ce_loss": 1.3853713274002075 }, { "epoch": 0.5053391338738382, "step": 5111, "train/sim_loss": 0.0625 }, { "epoch": 0.5053391338738382, "step": 5111, "train/total_loss": 0.20103713870048523 }, { "entropy": 9.134469985961914, "epoch": 0.5054380067233538, "mean_token_accuracy": 0.7195122241973877, "num_tokens": 5792963.0, "step": 5112, "train/ce_loss": 0.4597097933292389 }, { "epoch": 0.5054380067233538, "step": 5112, "train/sim_loss": 0.046875 }, { "epoch": 0.5054380067233538, "step": 5112, "train/total_loss": 0.09284597635269165 }, { "entropy": 8.931024551391602, "epoch": 0.5055368795728693, "mean_token_accuracy": 0.7422459721565247, "num_tokens": 5798547.0, "step": 5113, "train/ce_loss": 1.419727087020874 }, { "epoch": 0.5055368795728693, "step": 5113, "train/sim_loss": 0.10546875 }, { "epoch": 0.5055368795728693, "step": 5113, "train/total_loss": 0.24744145572185516 }, { "entropy": 8.888985633850098, "epoch": 0.5056357524223848, "mean_token_accuracy": 0.7433217167854309, "num_tokens": 5803878.0, "step": 5114, "train/ce_loss": 0.7526900768280029 }, { "epoch": 0.5056357524223848, "step": 5114, "train/sim_loss": 0.1015625 }, { "epoch": 0.5056357524223848, "step": 5114, "train/total_loss": 0.17683151364326477 }, { "entropy": 8.805142402648926, "epoch": 0.5057346252719004, "mean_token_accuracy": 0.7789815664291382, "num_tokens": 5809279.0, "step": 5115, "train/ce_loss": 0.848304808139801 }, { "epoch": 0.5057346252719004, "step": 5115, "train/sim_loss": 0.05078125 }, { "epoch": 0.5057346252719004, "step": 5115, "train/total_loss": 0.13561174273490906 }, { "entropy": 9.011064529418945, "epoch": 0.5058334981214159, "mean_token_accuracy": 0.7626146674156189, "num_tokens": 5814627.0, "step": 5116, "train/ce_loss": 0.5869022607803345 }, { "epoch": 0.5058334981214159, "step": 5116, "train/sim_loss": 0.0625 }, { "epoch": 0.5058334981214159, "step": 5116, "train/total_loss": 0.12119022756814957 }, { "entropy": 9.190732955932617, "epoch": 0.5059323709709314, "mean_token_accuracy": 0.7644628286361694, "num_tokens": 5819870.0, "step": 5117, "train/ce_loss": 1.3621413472719723e-06 }, { "epoch": 0.5059323709709314, "step": 5117, "train/sim_loss": 0.03515625 }, { "epoch": 0.5059323709709314, "step": 5117, "train/total_loss": 0.03515638783574104 }, { "entropy": 9.612802505493164, "epoch": 0.506031243820447, "mean_token_accuracy": 0.7743902206420898, "num_tokens": 5824818.0, "step": 5118, "train/ce_loss": 1.2309508323669434 }, { "epoch": 0.506031243820447, "step": 5118, "train/sim_loss": 0.03125 }, { "epoch": 0.506031243820447, "step": 5118, "train/total_loss": 0.1543450951576233 }, { "entropy": 8.730239868164062, "epoch": 0.5061301166699624, "mean_token_accuracy": 0.7402597665786743, "num_tokens": 5830222.0, "step": 5119, "train/ce_loss": 1.0233420133590698 }, { "epoch": 0.5061301166699624, "step": 5119, "train/sim_loss": 0.0859375 }, { "epoch": 0.5061301166699624, "step": 5119, "train/total_loss": 0.18827170133590698 }, { "epoch": 0.5062289895194779, "grad_norm": 0.7438586354255676, "learning_rate": 8.736834297581962e-06, "loss": 0.1341, "step": 5120 }, { "entropy": 9.60583209991455, "epoch": 0.5062289895194779, "mean_token_accuracy": 0.7167530059814453, "num_tokens": 5835240.0, "step": 5120, "train/ce_loss": 0.6151160001754761 }, { "epoch": 0.5062289895194779, "step": 5120, "train/sim_loss": 0.0390625 }, { "epoch": 0.5062289895194779, "step": 5120, "train/total_loss": 0.10057410597801208 }, { "entropy": 9.251870155334473, "epoch": 0.5063278623689935, "mean_token_accuracy": 0.7232267260551453, "num_tokens": 5840406.0, "step": 5121, "train/ce_loss": 1.176452087747748e-06 }, { "epoch": 0.5063278623689935, "step": 5121, "train/sim_loss": 0.05078125 }, { "epoch": 0.5063278623689935, "step": 5121, "train/total_loss": 0.05078136920928955 }, { "entropy": 8.833154678344727, "epoch": 0.506426735218509, "mean_token_accuracy": 0.7489919066429138, "num_tokens": 5845853.0, "step": 5122, "train/ce_loss": 0.9704189896583557 }, { "epoch": 0.506426735218509, "step": 5122, "train/sim_loss": 0.078125 }, { "epoch": 0.506426735218509, "step": 5122, "train/total_loss": 0.17516690492630005 }, { "entropy": 8.725252151489258, "epoch": 0.5065256080680245, "mean_token_accuracy": 0.7568534016609192, "num_tokens": 5851194.0, "step": 5123, "train/ce_loss": 1.1533256769180298 }, { "epoch": 0.5065256080680245, "step": 5123, "train/sim_loss": 0.09375 }, { "epoch": 0.5065256080680245, "step": 5123, "train/total_loss": 0.20908257365226746 }, { "entropy": 8.7609224319458, "epoch": 0.5066244809175401, "mean_token_accuracy": 0.7646474838256836, "num_tokens": 5856675.0, "step": 5124, "train/ce_loss": 1.1014946699142456 }, { "epoch": 0.5066244809175401, "step": 5124, "train/sim_loss": 0.03515625 }, { "epoch": 0.5066244809175401, "step": 5124, "train/total_loss": 0.14530572295188904 }, { "entropy": 8.80714225769043, "epoch": 0.5067233537670556, "mean_token_accuracy": 0.7259439826011658, "num_tokens": 5861970.0, "step": 5125, "train/ce_loss": 0.9872626662254333 }, { "epoch": 0.5067233537670556, "step": 5125, "train/sim_loss": 0.046875 }, { "epoch": 0.5067233537670556, "step": 5125, "train/total_loss": 0.1456012725830078 }, { "entropy": 9.590246200561523, "epoch": 0.5068222266165711, "mean_token_accuracy": 0.7184000015258789, "num_tokens": 5867050.0, "step": 5126, "train/ce_loss": 1.390322208404541 }, { "epoch": 0.5068222266165711, "step": 5126, "train/sim_loss": 0.06640625 }, { "epoch": 0.5068222266165711, "step": 5126, "train/total_loss": 0.20543847978115082 }, { "entropy": 9.42148208618164, "epoch": 0.5069210994660867, "mean_token_accuracy": 0.7205169796943665, "num_tokens": 5872079.0, "step": 5127, "train/ce_loss": 1.5720489025115967 }, { "epoch": 0.5069210994660867, "step": 5127, "train/sim_loss": 0.05859375 }, { "epoch": 0.5069210994660867, "step": 5127, "train/total_loss": 0.21579864621162415 }, { "entropy": 9.098977088928223, "epoch": 0.5070199723156021, "mean_token_accuracy": 0.7375504970550537, "num_tokens": 5877255.0, "step": 5128, "train/ce_loss": 0.5713717937469482 }, { "epoch": 0.5070199723156021, "step": 5128, "train/sim_loss": 0.046875 }, { "epoch": 0.5070199723156021, "step": 5128, "train/total_loss": 0.10401217639446259 }, { "entropy": 8.864845275878906, "epoch": 0.5071188451651176, "mean_token_accuracy": 0.790673553943634, "num_tokens": 5882717.0, "step": 5129, "train/ce_loss": 0.46934014558792114 }, { "epoch": 0.5071188451651176, "step": 5129, "train/sim_loss": 0.03125 }, { "epoch": 0.5071188451651176, "step": 5129, "train/total_loss": 0.07818401604890823 }, { "entropy": 9.221084594726562, "epoch": 0.5072177180146332, "mean_token_accuracy": 0.7324561476707458, "num_tokens": 5887871.0, "step": 5130, "train/ce_loss": 1.0543817281723022 }, { "epoch": 0.5072177180146332, "step": 5130, "train/sim_loss": 0.04296875 }, { "epoch": 0.5072177180146332, "step": 5130, "train/total_loss": 0.14840692281723022 }, { "entropy": 9.405488967895508, "epoch": 0.5073165908641487, "mean_token_accuracy": 0.7632450461387634, "num_tokens": 5892911.0, "step": 5131, "train/ce_loss": 0.8800402283668518 }, { "epoch": 0.5073165908641487, "step": 5131, "train/sim_loss": 0.06640625 }, { "epoch": 0.5073165908641487, "step": 5131, "train/total_loss": 0.15441027283668518 }, { "entropy": 8.841413497924805, "epoch": 0.5074154637136642, "mean_token_accuracy": 0.7932535409927368, "num_tokens": 5898280.0, "step": 5132, "train/ce_loss": 0.5812978744506836 }, { "epoch": 0.5074154637136642, "step": 5132, "train/sim_loss": 0.04296875 }, { "epoch": 0.5074154637136642, "step": 5132, "train/total_loss": 0.10109853744506836 }, { "entropy": 9.707983016967773, "epoch": 0.5075143365631798, "mean_token_accuracy": 0.78899085521698, "num_tokens": 5903242.0, "step": 5133, "train/ce_loss": 0.8480736613273621 }, { "epoch": 0.5075143365631798, "step": 5133, "train/sim_loss": 0.03515625 }, { "epoch": 0.5075143365631798, "step": 5133, "train/total_loss": 0.1199636161327362 }, { "entropy": 8.927094459533691, "epoch": 0.5076132094126953, "mean_token_accuracy": 0.6925795078277588, "num_tokens": 5908548.0, "step": 5134, "train/ce_loss": 1.1896045207977295 }, { "epoch": 0.5076132094126953, "step": 5134, "train/sim_loss": 0.1171875 }, { "epoch": 0.5076132094126953, "step": 5134, "train/total_loss": 0.2361479550600052 }, { "entropy": 9.103012084960938, "epoch": 0.5077120822622108, "mean_token_accuracy": 0.7661388516426086, "num_tokens": 5913852.0, "step": 5135, "train/ce_loss": 0.8272993564605713 }, { "epoch": 0.5077120822622108, "step": 5135, "train/sim_loss": 0.05859375 }, { "epoch": 0.5077120822622108, "step": 5135, "train/total_loss": 0.14132368564605713 }, { "entropy": 9.943624496459961, "epoch": 0.5078109551117264, "mean_token_accuracy": 0.695035457611084, "num_tokens": 5918679.0, "step": 5136, "train/ce_loss": 2.4764817680988926e-06 }, { "epoch": 0.5078109551117264, "step": 5136, "train/sim_loss": 0.05078125 }, { "epoch": 0.5078109551117264, "step": 5136, "train/total_loss": 0.0507814958691597 }, { "entropy": 9.284172058105469, "epoch": 0.5079098279612418, "mean_token_accuracy": 0.7684729099273682, "num_tokens": 5923756.0, "step": 5137, "train/ce_loss": 1.1524968147277832 }, { "epoch": 0.5079098279612418, "step": 5137, "train/sim_loss": 0.02734375 }, { "epoch": 0.5079098279612418, "step": 5137, "train/total_loss": 0.14259344339370728 }, { "entropy": 9.191142082214355, "epoch": 0.5080087008107573, "mean_token_accuracy": 0.7150635123252869, "num_tokens": 5928742.0, "step": 5138, "train/ce_loss": 0.8419607877731323 }, { "epoch": 0.5080087008107573, "step": 5138, "train/sim_loss": 0.0546875 }, { "epoch": 0.5080087008107573, "step": 5138, "train/total_loss": 0.1388835906982422 }, { "entropy": 9.891863822937012, "epoch": 0.5081075736602729, "mean_token_accuracy": 0.8214285969734192, "num_tokens": 5933591.0, "step": 5139, "train/ce_loss": 2.745048732322175e-06 }, { "epoch": 0.5081075736602729, "step": 5139, "train/sim_loss": 0.0234375 }, { "epoch": 0.5081075736602729, "step": 5139, "train/total_loss": 0.023437773808836937 }, { "epoch": 0.5082064465097884, "grad_norm": 0.8406258225440979, "learning_rate": 8.731889432824014e-06, "loss": 0.1341, "step": 5140 }, { "entropy": 9.00814151763916, "epoch": 0.5082064465097884, "mean_token_accuracy": 0.7981545329093933, "num_tokens": 5938939.0, "step": 5140, "train/ce_loss": 0.4677835702896118 }, { "epoch": 0.5082064465097884, "step": 5140, "train/sim_loss": 0.01953125 }, { "epoch": 0.5082064465097884, "step": 5140, "train/total_loss": 0.0663096085190773 }, { "entropy": 9.357422828674316, "epoch": 0.5083053193593039, "mean_token_accuracy": 0.716911792755127, "num_tokens": 5943938.0, "step": 5141, "train/ce_loss": 0.9411023259162903 }, { "epoch": 0.5083053193593039, "step": 5141, "train/sim_loss": 0.0546875 }, { "epoch": 0.5083053193593039, "step": 5141, "train/total_loss": 0.14879773557186127 }, { "entropy": 9.110038757324219, "epoch": 0.5084041922088195, "mean_token_accuracy": 0.7844611406326294, "num_tokens": 5949220.0, "step": 5142, "train/ce_loss": 0.8312951326370239 }, { "epoch": 0.5084041922088195, "step": 5142, "train/sim_loss": 0.0625 }, { "epoch": 0.5084041922088195, "step": 5142, "train/total_loss": 0.14562952518463135 }, { "entropy": 9.562170028686523, "epoch": 0.508503065058335, "mean_token_accuracy": 0.7077465057373047, "num_tokens": 5954258.0, "step": 5143, "train/ce_loss": 2.580452701295144e-06 }, { "epoch": 0.508503065058335, "step": 5143, "train/sim_loss": 0.0859375 }, { "epoch": 0.508503065058335, "step": 5143, "train/total_loss": 0.08593776077032089 }, { "entropy": 9.183984756469727, "epoch": 0.5086019379078505, "mean_token_accuracy": 0.7325000166893005, "num_tokens": 5959454.0, "step": 5144, "train/ce_loss": 0.8328563570976257 }, { "epoch": 0.5086019379078505, "step": 5144, "train/sim_loss": 0.0625 }, { "epoch": 0.5086019379078505, "step": 5144, "train/total_loss": 0.1457856297492981 }, { "entropy": 9.396947860717773, "epoch": 0.5087008107573661, "mean_token_accuracy": 0.7349768877029419, "num_tokens": 5964502.0, "step": 5145, "train/ce_loss": 1.4666695594787598 }, { "epoch": 0.5087008107573661, "step": 5145, "train/sim_loss": 0.046875 }, { "epoch": 0.5087008107573661, "step": 5145, "train/total_loss": 0.19354195892810822 }, { "entropy": 9.139756202697754, "epoch": 0.5087996836068815, "mean_token_accuracy": 0.7391874194145203, "num_tokens": 5969739.0, "step": 5146, "train/ce_loss": 0.9517946839332581 }, { "epoch": 0.5087996836068815, "step": 5146, "train/sim_loss": 0.08984375 }, { "epoch": 0.5087996836068815, "step": 5146, "train/total_loss": 0.1850232183933258 }, { "entropy": 8.929136276245117, "epoch": 0.508898556456397, "mean_token_accuracy": 0.7115117907524109, "num_tokens": 5974947.0, "step": 5147, "train/ce_loss": 0.612743616104126 }, { "epoch": 0.508898556456397, "step": 5147, "train/sim_loss": 0.05078125 }, { "epoch": 0.508898556456397, "step": 5147, "train/total_loss": 0.11205561459064484 }, { "entropy": 9.182073593139648, "epoch": 0.5089974293059126, "mean_token_accuracy": 0.692307710647583, "num_tokens": 5980120.0, "step": 5148, "train/ce_loss": 0.8845396041870117 }, { "epoch": 0.5089974293059126, "step": 5148, "train/sim_loss": 0.06640625 }, { "epoch": 0.5089974293059126, "step": 5148, "train/total_loss": 0.1548602133989334 }, { "entropy": 9.10268783569336, "epoch": 0.5090963021554281, "mean_token_accuracy": 0.698952853679657, "num_tokens": 5985355.0, "step": 5149, "train/ce_loss": 0.7536455988883972 }, { "epoch": 0.5090963021554281, "step": 5149, "train/sim_loss": 0.04296875 }, { "epoch": 0.5090963021554281, "step": 5149, "train/total_loss": 0.11833330988883972 }, { "entropy": 9.613290786743164, "epoch": 0.5091951750049436, "mean_token_accuracy": 0.7426470518112183, "num_tokens": 5990114.0, "step": 5150, "train/ce_loss": 2.3226072788238525 }, { "epoch": 0.5091951750049436, "step": 5150, "train/sim_loss": 0.0546875 }, { "epoch": 0.5091951750049436, "step": 5150, "train/total_loss": 0.28694823384284973 }, { "entropy": 9.525121688842773, "epoch": 0.5092940478544592, "mean_token_accuracy": 0.7599999904632568, "num_tokens": 5995171.0, "step": 5151, "train/ce_loss": 1.2991708517074585 }, { "epoch": 0.5092940478544592, "step": 5151, "train/sim_loss": 0.07421875 }, { "epoch": 0.5092940478544592, "step": 5151, "train/total_loss": 0.20413583517074585 }, { "entropy": 8.902762413024902, "epoch": 0.5093929207039747, "mean_token_accuracy": 0.7444589138031006, "num_tokens": 6000442.0, "step": 5152, "train/ce_loss": 1.308106541633606 }, { "epoch": 0.5093929207039747, "step": 5152, "train/sim_loss": 0.07421875 }, { "epoch": 0.5093929207039747, "step": 5152, "train/total_loss": 0.2050294131040573 }, { "entropy": 9.54294204711914, "epoch": 0.5094917935534902, "mean_token_accuracy": 0.7561779022216797, "num_tokens": 6005433.0, "step": 5153, "train/ce_loss": 0.9077314138412476 }, { "epoch": 0.5094917935534902, "step": 5153, "train/sim_loss": 0.078125 }, { "epoch": 0.5094917935534902, "step": 5153, "train/total_loss": 0.16889813542366028 }, { "entropy": 9.558610916137695, "epoch": 0.5095906664030058, "mean_token_accuracy": 0.7542662024497986, "num_tokens": 6010451.0, "step": 5154, "train/ce_loss": 0.6818474531173706 }, { "epoch": 0.5095906664030058, "step": 5154, "train/sim_loss": 0.0546875 }, { "epoch": 0.5095906664030058, "step": 5154, "train/total_loss": 0.1228722482919693 }, { "entropy": 9.01298713684082, "epoch": 0.5096895392525213, "mean_token_accuracy": 0.7458563446998596, "num_tokens": 6015843.0, "step": 5155, "train/ce_loss": 1.1367805004119873 }, { "epoch": 0.5096895392525213, "step": 5155, "train/sim_loss": 0.1015625 }, { "epoch": 0.5096895392525213, "step": 5155, "train/total_loss": 0.21524055302143097 }, { "entropy": 8.672857284545898, "epoch": 0.5097884121020367, "mean_token_accuracy": 0.7292870879173279, "num_tokens": 6021363.0, "step": 5156, "train/ce_loss": 1.090519905090332 }, { "epoch": 0.5097884121020367, "step": 5156, "train/sim_loss": 0.078125 }, { "epoch": 0.5097884121020367, "step": 5156, "train/total_loss": 0.18717700242996216 }, { "entropy": 9.39059829711914, "epoch": 0.5098872849515523, "mean_token_accuracy": 0.7779456377029419, "num_tokens": 6026521.0, "step": 5157, "train/ce_loss": 0.7553310990333557 }, { "epoch": 0.5098872849515523, "step": 5157, "train/sim_loss": 0.05078125 }, { "epoch": 0.5098872849515523, "step": 5157, "train/total_loss": 0.12631437182426453 }, { "entropy": 9.064903259277344, "epoch": 0.5099861578010678, "mean_token_accuracy": 0.7238442897796631, "num_tokens": 6031797.0, "step": 5158, "train/ce_loss": 1.1853233575820923 }, { "epoch": 0.5099861578010678, "step": 5158, "train/sim_loss": 0.11328125 }, { "epoch": 0.5099861578010678, "step": 5158, "train/total_loss": 0.23181357979774475 }, { "entropy": 9.55142593383789, "epoch": 0.5100850306505833, "mean_token_accuracy": 0.7651122808456421, "num_tokens": 6036802.0, "step": 5159, "train/ce_loss": 0.8801589608192444 }, { "epoch": 0.5100850306505833, "step": 5159, "train/sim_loss": 0.07421875 }, { "epoch": 0.5100850306505833, "step": 5159, "train/total_loss": 0.16223464906215668 }, { "epoch": 0.5101839035000989, "grad_norm": 0.7660681009292603, "learning_rate": 8.726944568066063e-06, "loss": 0.1418, "step": 5160 }, { "entropy": 8.865266799926758, "epoch": 0.5101839035000989, "mean_token_accuracy": 0.7838745713233948, "num_tokens": 6042172.0, "step": 5160, "train/ce_loss": 0.6156168580055237 }, { "epoch": 0.5101839035000989, "step": 5160, "train/sim_loss": 0.03125 }, { "epoch": 0.5101839035000989, "step": 5160, "train/total_loss": 0.0928116887807846 }, { "entropy": 9.807842254638672, "epoch": 0.5102827763496144, "mean_token_accuracy": 0.7559633255004883, "num_tokens": 6047103.0, "step": 5161, "train/ce_loss": 1.5274415016174316 }, { "epoch": 0.5102827763496144, "step": 5161, "train/sim_loss": 0.0703125 }, { "epoch": 0.5102827763496144, "step": 5161, "train/total_loss": 0.22305665910243988 }, { "entropy": 8.848272323608398, "epoch": 0.5103816491991299, "mean_token_accuracy": 0.7283511161804199, "num_tokens": 6052342.0, "step": 5162, "train/ce_loss": 0.5998438596725464 }, { "epoch": 0.5103816491991299, "step": 5162, "train/sim_loss": 0.0703125 }, { "epoch": 0.5103816491991299, "step": 5162, "train/total_loss": 0.13029688596725464 }, { "entropy": 9.544805526733398, "epoch": 0.5104805220486455, "mean_token_accuracy": 0.6873747706413269, "num_tokens": 6057300.0, "step": 5163, "train/ce_loss": 1.176200032234192 }, { "epoch": 0.5104805220486455, "step": 5163, "train/sim_loss": 0.0546875 }, { "epoch": 0.5104805220486455, "step": 5163, "train/total_loss": 0.17230750620365143 }, { "entropy": 9.44500732421875, "epoch": 0.510579394898161, "mean_token_accuracy": 0.7471264600753784, "num_tokens": 6062382.0, "step": 5164, "train/ce_loss": 0.8489091992378235 }, { "epoch": 0.510579394898161, "step": 5164, "train/sim_loss": 0.015625 }, { "epoch": 0.510579394898161, "step": 5164, "train/total_loss": 0.1005159243941307 }, { "entropy": 9.15986442565918, "epoch": 0.5106782677476764, "mean_token_accuracy": 0.715068519115448, "num_tokens": 6067603.0, "step": 5165, "train/ce_loss": 0.9437806606292725 }, { "epoch": 0.5106782677476764, "step": 5165, "train/sim_loss": 0.03515625 }, { "epoch": 0.5106782677476764, "step": 5165, "train/total_loss": 0.12953431904315948 }, { "entropy": 9.356000900268555, "epoch": 0.510777140597192, "mean_token_accuracy": 0.8156862854957581, "num_tokens": 6072833.0, "step": 5166, "train/ce_loss": 0.4159621596336365 }, { "epoch": 0.510777140597192, "step": 5166, "train/sim_loss": 0.015625 }, { "epoch": 0.510777140597192, "step": 5166, "train/total_loss": 0.05722121521830559 }, { "entropy": 8.838285446166992, "epoch": 0.5108760134467075, "mean_token_accuracy": 0.7408313155174255, "num_tokens": 6078100.0, "step": 5167, "train/ce_loss": 0.7657850980758667 }, { "epoch": 0.5108760134467075, "step": 5167, "train/sim_loss": 0.03515625 }, { "epoch": 0.5108760134467075, "step": 5167, "train/total_loss": 0.11173476278781891 }, { "entropy": 9.36933708190918, "epoch": 0.510974886296223, "mean_token_accuracy": 0.7630813717842102, "num_tokens": 6083249.0, "step": 5168, "train/ce_loss": 1.2221713066101074 }, { "epoch": 0.510974886296223, "step": 5168, "train/sim_loss": 0.08984375 }, { "epoch": 0.510974886296223, "step": 5168, "train/total_loss": 0.21206088364124298 }, { "entropy": 9.319719314575195, "epoch": 0.5110737591457386, "mean_token_accuracy": 0.7013372778892517, "num_tokens": 6088357.0, "step": 5169, "train/ce_loss": 9.366481776851288e-07 }, { "epoch": 0.5110737591457386, "step": 5169, "train/sim_loss": 0.01953125 }, { "epoch": 0.5110737591457386, "step": 5169, "train/total_loss": 0.01953134313225746 }, { "entropy": 9.192684173583984, "epoch": 0.5111726319952541, "mean_token_accuracy": 0.7037037014961243, "num_tokens": 6093557.0, "step": 5170, "train/ce_loss": 0.6855148077011108 }, { "epoch": 0.5111726319952541, "step": 5170, "train/sim_loss": 0.04296875 }, { "epoch": 0.5111726319952541, "step": 5170, "train/total_loss": 0.11152023077011108 }, { "entropy": 8.690927505493164, "epoch": 0.5112715048447696, "mean_token_accuracy": 0.7698113322257996, "num_tokens": 6099088.0, "step": 5171, "train/ce_loss": 0.9583162069320679 }, { "epoch": 0.5112715048447696, "step": 5171, "train/sim_loss": 0.09765625 }, { "epoch": 0.5112715048447696, "step": 5171, "train/total_loss": 0.19348788261413574 }, { "entropy": 9.323888778686523, "epoch": 0.5113703776942852, "mean_token_accuracy": 0.7824859023094177, "num_tokens": 6104239.0, "step": 5172, "train/ce_loss": 0.878183126449585 }, { "epoch": 0.5113703776942852, "step": 5172, "train/sim_loss": 0.0234375 }, { "epoch": 0.5113703776942852, "step": 5172, "train/total_loss": 0.11125581711530685 }, { "entropy": 9.09682559967041, "epoch": 0.5114692505438007, "mean_token_accuracy": 0.7463235259056091, "num_tokens": 6109557.0, "step": 5173, "train/ce_loss": 0.5591075420379639 }, { "epoch": 0.5114692505438007, "step": 5173, "train/sim_loss": 0.05859375 }, { "epoch": 0.5114692505438007, "step": 5173, "train/total_loss": 0.11450450122356415 }, { "entropy": 9.161510467529297, "epoch": 0.5115681233933161, "mean_token_accuracy": 0.7394958138465881, "num_tokens": 6114768.0, "step": 5174, "train/ce_loss": 0.5803232192993164 }, { "epoch": 0.5115681233933161, "step": 5174, "train/sim_loss": 0.0859375 }, { "epoch": 0.5115681233933161, "step": 5174, "train/total_loss": 0.1439698189496994 }, { "entropy": 9.101654052734375, "epoch": 0.5116669962428317, "mean_token_accuracy": 0.7426108121871948, "num_tokens": 6120001.0, "step": 5175, "train/ce_loss": 1.0444610118865967 }, { "epoch": 0.5116669962428317, "step": 5175, "train/sim_loss": 0.05859375 }, { "epoch": 0.5116669962428317, "step": 5175, "train/total_loss": 0.16303986310958862 }, { "entropy": 8.89405632019043, "epoch": 0.5117658690923472, "mean_token_accuracy": 0.7247706651687622, "num_tokens": 6125273.0, "step": 5176, "train/ce_loss": 0.5537621974945068 }, { "epoch": 0.5117658690923472, "step": 5176, "train/sim_loss": 0.05859375 }, { "epoch": 0.5117658690923472, "step": 5176, "train/total_loss": 0.11396996676921844 }, { "entropy": 9.195863723754883, "epoch": 0.5118647419418627, "mean_token_accuracy": 0.7087666988372803, "num_tokens": 6130373.0, "step": 5177, "train/ce_loss": 0.8780604600906372 }, { "epoch": 0.5118647419418627, "step": 5177, "train/sim_loss": 0.046875 }, { "epoch": 0.5118647419418627, "step": 5177, "train/total_loss": 0.13468104600906372 }, { "entropy": 8.717813491821289, "epoch": 0.5119636147913783, "mean_token_accuracy": 0.7383177280426025, "num_tokens": 6135845.0, "step": 5178, "train/ce_loss": 1.014733910560608 }, { "epoch": 0.5119636147913783, "step": 5178, "train/sim_loss": 0.09375 }, { "epoch": 0.5119636147913783, "step": 5178, "train/total_loss": 0.1952233910560608 }, { "entropy": 9.682064056396484, "epoch": 0.5120624876408938, "mean_token_accuracy": 0.6962843537330627, "num_tokens": 6140897.0, "step": 5179, "train/ce_loss": 2.111259698867798 }, { "epoch": 0.5120624876408938, "step": 5179, "train/sim_loss": 0.078125 }, { "epoch": 0.5120624876408938, "step": 5179, "train/total_loss": 0.2892509698867798 }, { "epoch": 0.5121613604904093, "grad_norm": 0.7663262486457825, "learning_rate": 8.721999703308115e-06, "loss": 0.1382, "step": 5180 }, { "entropy": 9.39062213897705, "epoch": 0.5121613604904093, "mean_token_accuracy": 0.7942073345184326, "num_tokens": 6146025.0, "step": 5180, "train/ce_loss": 2.2146377887111157e-06 }, { "epoch": 0.5121613604904093, "step": 5180, "train/sim_loss": 0.0390625 }, { "epoch": 0.5121613604904093, "step": 5180, "train/total_loss": 0.03906271979212761 }, { "entropy": 9.252467155456543, "epoch": 0.5122602333399249, "mean_token_accuracy": 0.7272727489471436, "num_tokens": 6151153.0, "step": 5181, "train/ce_loss": 1.0725518465042114 }, { "epoch": 0.5122602333399249, "step": 5181, "train/sim_loss": 0.08203125 }, { "epoch": 0.5122602333399249, "step": 5181, "train/total_loss": 0.18928644061088562 }, { "entropy": 9.536457061767578, "epoch": 0.5123591061894404, "mean_token_accuracy": 0.7960526347160339, "num_tokens": 6156175.0, "step": 5182, "train/ce_loss": 0.8920789957046509 }, { "epoch": 0.5123591061894404, "step": 5182, "train/sim_loss": 0.02734375 }, { "epoch": 0.5123591061894404, "step": 5182, "train/total_loss": 0.11655165255069733 }, { "entropy": 9.23127555847168, "epoch": 0.512457979038956, "mean_token_accuracy": 0.743697464466095, "num_tokens": 6161354.0, "step": 5183, "train/ce_loss": 1.3253401517868042 }, { "epoch": 0.512457979038956, "step": 5183, "train/sim_loss": 0.0625 }, { "epoch": 0.512457979038956, "step": 5183, "train/total_loss": 0.19503401219844818 }, { "entropy": 8.898605346679688, "epoch": 0.5125568518884714, "mean_token_accuracy": 0.7256944179534912, "num_tokens": 6166738.0, "step": 5184, "train/ce_loss": 0.4349287450313568 }, { "epoch": 0.5125568518884714, "step": 5184, "train/sim_loss": 0.0234375 }, { "epoch": 0.5125568518884714, "step": 5184, "train/total_loss": 0.0669303759932518 }, { "entropy": 8.950806617736816, "epoch": 0.5126557247379869, "mean_token_accuracy": 0.7852272987365723, "num_tokens": 6172117.0, "step": 5185, "train/ce_loss": 0.8759341239929199 }, { "epoch": 0.5126557247379869, "step": 5185, "train/sim_loss": 0.05859375 }, { "epoch": 0.5126557247379869, "step": 5185, "train/total_loss": 0.14618715643882751 }, { "entropy": 8.852334022521973, "epoch": 0.5127545975875025, "mean_token_accuracy": 0.7770069241523743, "num_tokens": 6177593.0, "step": 5186, "train/ce_loss": 0.664239764213562 }, { "epoch": 0.5127545975875025, "step": 5186, "train/sim_loss": 0.015625 }, { "epoch": 0.5127545975875025, "step": 5186, "train/total_loss": 0.08204897493124008 }, { "entropy": 9.912579536437988, "epoch": 0.512853470437018, "mean_token_accuracy": 0.6643192768096924, "num_tokens": 6182401.0, "step": 5187, "train/ce_loss": 1.1493152379989624 }, { "epoch": 0.512853470437018, "step": 5187, "train/sim_loss": 0.0625 }, { "epoch": 0.512853470437018, "step": 5187, "train/total_loss": 0.17743152379989624 }, { "entropy": 9.319049835205078, "epoch": 0.5129523432865335, "mean_token_accuracy": 0.7262872457504272, "num_tokens": 6187575.0, "step": 5188, "train/ce_loss": 1.1227772235870361 }, { "epoch": 0.5129523432865335, "step": 5188, "train/sim_loss": 0.0546875 }, { "epoch": 0.5129523432865335, "step": 5188, "train/total_loss": 0.16696521639823914 }, { "entropy": 9.464544296264648, "epoch": 0.5130512161360491, "mean_token_accuracy": 0.7881844639778137, "num_tokens": 6192667.0, "step": 5189, "train/ce_loss": 1.174917459487915 }, { "epoch": 0.5130512161360491, "step": 5189, "train/sim_loss": 0.01953125 }, { "epoch": 0.5130512161360491, "step": 5189, "train/total_loss": 0.13702300190925598 }, { "entropy": 9.355015754699707, "epoch": 0.5131500889855646, "mean_token_accuracy": 0.7685714364051819, "num_tokens": 6197860.0, "step": 5190, "train/ce_loss": 0.5055925846099854 }, { "epoch": 0.5131500889855646, "step": 5190, "train/sim_loss": 0.02734375 }, { "epoch": 0.5131500889855646, "step": 5190, "train/total_loss": 0.07790300995111465 }, { "entropy": 9.103748321533203, "epoch": 0.5132489618350801, "mean_token_accuracy": 0.7351225018501282, "num_tokens": 6203222.0, "step": 5191, "train/ce_loss": 0.31338703632354736 }, { "epoch": 0.5132489618350801, "step": 5191, "train/sim_loss": 0.01953125 }, { "epoch": 0.5132489618350801, "step": 5191, "train/total_loss": 0.05086995288729668 }, { "entropy": 9.010952949523926, "epoch": 0.5133478346845957, "mean_token_accuracy": 0.7311557531356812, "num_tokens": 6208479.0, "step": 5192, "train/ce_loss": 0.680903434753418 }, { "epoch": 0.5133478346845957, "step": 5192, "train/sim_loss": 0.05859375 }, { "epoch": 0.5133478346845957, "step": 5192, "train/total_loss": 0.12668409943580627 }, { "entropy": 8.905805587768555, "epoch": 0.5134467075341111, "mean_token_accuracy": 0.7019562721252441, "num_tokens": 6213812.0, "step": 5193, "train/ce_loss": 0.8126767873764038 }, { "epoch": 0.5134467075341111, "step": 5193, "train/sim_loss": 0.08203125 }, { "epoch": 0.5134467075341111, "step": 5193, "train/total_loss": 0.16329893469810486 }, { "entropy": 9.774221420288086, "epoch": 0.5135455803836266, "mean_token_accuracy": 0.7419962286949158, "num_tokens": 6218729.0, "step": 5194, "train/ce_loss": 1.7258882962778443e-06 }, { "epoch": 0.5135455803836266, "step": 5194, "train/sim_loss": 0.01953125 }, { "epoch": 0.5135455803836266, "step": 5194, "train/total_loss": 0.01953142322599888 }, { "entropy": 10.152777671813965, "epoch": 0.5136444532331422, "mean_token_accuracy": 0.7647058963775635, "num_tokens": 6223498.0, "step": 5195, "train/ce_loss": 1.5547560453414917 }, { "epoch": 0.5136444532331422, "step": 5195, "train/sim_loss": 0.03515625 }, { "epoch": 0.5136444532331422, "step": 5195, "train/total_loss": 0.19063185155391693 }, { "entropy": 8.596878051757812, "epoch": 0.5137433260826577, "mean_token_accuracy": 0.7590909004211426, "num_tokens": 6229028.0, "step": 5196, "train/ce_loss": 0.8892180323600769 }, { "epoch": 0.5137433260826577, "step": 5196, "train/sim_loss": 0.05078125 }, { "epoch": 0.5137433260826577, "step": 5196, "train/total_loss": 0.13970306515693665 }, { "entropy": 8.842941284179688, "epoch": 0.5138421989321732, "mean_token_accuracy": 0.796798050403595, "num_tokens": 6234258.0, "step": 5197, "train/ce_loss": 0.36208295822143555 }, { "epoch": 0.5138421989321732, "step": 5197, "train/sim_loss": 0.1328125 }, { "epoch": 0.5138421989321732, "step": 5197, "train/total_loss": 0.16902080178260803 }, { "entropy": 9.162590026855469, "epoch": 0.5139410717816888, "mean_token_accuracy": 0.7354085445404053, "num_tokens": 6239484.0, "step": 5198, "train/ce_loss": 1.4314345121383667 }, { "epoch": 0.5139410717816888, "step": 5198, "train/sim_loss": 0.078125 }, { "epoch": 0.5139410717816888, "step": 5198, "train/total_loss": 0.2212684601545334 }, { "entropy": 9.713400840759277, "epoch": 0.5140399446312043, "mean_token_accuracy": 0.7263339161872864, "num_tokens": 6244500.0, "step": 5199, "train/ce_loss": 0.7679638862609863 }, { "epoch": 0.5140399446312043, "step": 5199, "train/sim_loss": 0.03125 }, { "epoch": 0.5140399446312043, "step": 5199, "train/total_loss": 0.10804639011621475 }, { "epoch": 0.5141388174807198, "grad_norm": 0.7476158142089844, "learning_rate": 8.717054838550165e-06, "loss": 0.1343, "step": 5200 }, { "entropy": 9.169939041137695, "epoch": 0.5141388174807198, "mean_token_accuracy": 0.7317396998405457, "num_tokens": 6249713.0, "step": 5200, "train/ce_loss": 0.6050461530685425 }, { "epoch": 0.5141388174807198, "step": 5200, "train/sim_loss": 0.0625 }, { "epoch": 0.5141388174807198, "step": 5200, "train/total_loss": 0.12300461530685425 }, { "entropy": 9.089229583740234, "epoch": 0.5142376903302354, "mean_token_accuracy": 0.7239263653755188, "num_tokens": 6255006.0, "step": 5201, "train/ce_loss": 0.9613548517227173 }, { "epoch": 0.5142376903302354, "step": 5201, "train/sim_loss": 0.1015625 }, { "epoch": 0.5142376903302354, "step": 5201, "train/total_loss": 0.19769799709320068 }, { "entropy": 9.296426773071289, "epoch": 0.5143365631797508, "mean_token_accuracy": 0.713004469871521, "num_tokens": 6260124.0, "step": 5202, "train/ce_loss": 0.9375792145729065 }, { "epoch": 0.5143365631797508, "step": 5202, "train/sim_loss": 0.04296875 }, { "epoch": 0.5143365631797508, "step": 5202, "train/total_loss": 0.13672667741775513 }, { "entropy": 8.995756149291992, "epoch": 0.5144354360292663, "mean_token_accuracy": 0.7040572762489319, "num_tokens": 6265506.0, "step": 5203, "train/ce_loss": 1.0593247413635254 }, { "epoch": 0.5144354360292663, "step": 5203, "train/sim_loss": 0.07421875 }, { "epoch": 0.5144354360292663, "step": 5203, "train/total_loss": 0.18015122413635254 }, { "entropy": 9.792580604553223, "epoch": 0.5145343088787819, "mean_token_accuracy": 0.7164556980133057, "num_tokens": 6270339.0, "step": 5204, "train/ce_loss": 1.2554758787155151 }, { "epoch": 0.5145343088787819, "step": 5204, "train/sim_loss": 0.03125 }, { "epoch": 0.5145343088787819, "step": 5204, "train/total_loss": 0.1567975878715515 }, { "entropy": 9.043886184692383, "epoch": 0.5146331817282974, "mean_token_accuracy": 0.7170263528823853, "num_tokens": 6275629.0, "step": 5205, "train/ce_loss": 1.027775764465332 }, { "epoch": 0.5146331817282974, "step": 5205, "train/sim_loss": 0.0859375 }, { "epoch": 0.5146331817282974, "step": 5205, "train/total_loss": 0.18871507048606873 }, { "entropy": 8.943634033203125, "epoch": 0.5147320545778129, "mean_token_accuracy": 0.7334109544754028, "num_tokens": 6280937.0, "step": 5206, "train/ce_loss": 1.068730115890503 }, { "epoch": 0.5147320545778129, "step": 5206, "train/sim_loss": 0.0703125 }, { "epoch": 0.5147320545778129, "step": 5206, "train/total_loss": 0.17718550562858582 }, { "entropy": 9.01152229309082, "epoch": 0.5148309274273285, "mean_token_accuracy": 0.7050997614860535, "num_tokens": 6286307.0, "step": 5207, "train/ce_loss": 0.5275875329971313 }, { "epoch": 0.5148309274273285, "step": 5207, "train/sim_loss": 0.08203125 }, { "epoch": 0.5148309274273285, "step": 5207, "train/total_loss": 0.13479000329971313 }, { "entropy": 9.549314498901367, "epoch": 0.514929800276844, "mean_token_accuracy": 0.7138508558273315, "num_tokens": 6291365.0, "step": 5208, "train/ce_loss": 1.2059669494628906 }, { "epoch": 0.514929800276844, "step": 5208, "train/sim_loss": 0.02734375 }, { "epoch": 0.514929800276844, "step": 5208, "train/total_loss": 0.14794045686721802 }, { "entropy": 9.489936828613281, "epoch": 0.5150286731263595, "mean_token_accuracy": 0.7491748929023743, "num_tokens": 6296472.0, "step": 5209, "train/ce_loss": 1.0462239980697632 }, { "epoch": 0.5150286731263595, "step": 5209, "train/sim_loss": 0.0703125 }, { "epoch": 0.5150286731263595, "step": 5209, "train/total_loss": 0.17493489384651184 }, { "entropy": 8.975013732910156, "epoch": 0.5151275459758751, "mean_token_accuracy": 0.7044392228126526, "num_tokens": 6301820.0, "step": 5210, "train/ce_loss": 0.8023355007171631 }, { "epoch": 0.5151275459758751, "step": 5210, "train/sim_loss": 0.0546875 }, { "epoch": 0.5151275459758751, "step": 5210, "train/total_loss": 0.13492104411125183 }, { "entropy": 8.907506942749023, "epoch": 0.5152264188253906, "mean_token_accuracy": 0.7075055241584778, "num_tokens": 6307203.0, "step": 5211, "train/ce_loss": 0.6670815348625183 }, { "epoch": 0.5152264188253906, "step": 5211, "train/sim_loss": 0.04296875 }, { "epoch": 0.5152264188253906, "step": 5211, "train/total_loss": 0.10967690497636795 }, { "entropy": 8.65451431274414, "epoch": 0.515325291674906, "mean_token_accuracy": 0.7407054305076599, "num_tokens": 6312751.0, "step": 5212, "train/ce_loss": 0.5702404379844666 }, { "epoch": 0.515325291674906, "step": 5212, "train/sim_loss": 0.01953125 }, { "epoch": 0.515325291674906, "step": 5212, "train/total_loss": 0.0765552967786789 }, { "entropy": 9.231220245361328, "epoch": 0.5154241645244216, "mean_token_accuracy": 0.7912687659263611, "num_tokens": 6317975.0, "step": 5213, "train/ce_loss": 0.46404504776000977 }, { "epoch": 0.5154241645244216, "step": 5213, "train/sim_loss": 0.10546875 }, { "epoch": 0.5154241645244216, "step": 5213, "train/total_loss": 0.15187326073646545 }, { "entropy": 9.298812866210938, "epoch": 0.5155230373739371, "mean_token_accuracy": 0.75, "num_tokens": 6323085.0, "step": 5214, "train/ce_loss": 1.2744684219360352 }, { "epoch": 0.5155230373739371, "step": 5214, "train/sim_loss": 0.05859375 }, { "epoch": 0.5155230373739371, "step": 5214, "train/total_loss": 0.18604059517383575 }, { "entropy": 8.690983772277832, "epoch": 0.5156219102234526, "mean_token_accuracy": 0.7608453631401062, "num_tokens": 6328435.0, "step": 5215, "train/ce_loss": 0.4523555636405945 }, { "epoch": 0.5156219102234526, "step": 5215, "train/sim_loss": 0.08984375 }, { "epoch": 0.5156219102234526, "step": 5215, "train/total_loss": 0.1350793093442917 }, { "entropy": 9.051522254943848, "epoch": 0.5157207830729682, "mean_token_accuracy": 0.8401322960853577, "num_tokens": 6333767.0, "step": 5216, "train/ce_loss": 0.3867724537849426 }, { "epoch": 0.5157207830729682, "step": 5216, "train/sim_loss": 0.01953125 }, { "epoch": 0.5157207830729682, "step": 5216, "train/total_loss": 0.05820849537849426 }, { "entropy": 9.134048461914062, "epoch": 0.5158196559224837, "mean_token_accuracy": 0.7589403986930847, "num_tokens": 6338941.0, "step": 5217, "train/ce_loss": 0.5398648381233215 }, { "epoch": 0.5158196559224837, "step": 5217, "train/sim_loss": 0.0390625 }, { "epoch": 0.5158196559224837, "step": 5217, "train/total_loss": 0.09304898977279663 }, { "entropy": 9.101371765136719, "epoch": 0.5159185287719992, "mean_token_accuracy": 0.7251700758934021, "num_tokens": 6344129.0, "step": 5218, "train/ce_loss": 1.1684240102767944 }, { "epoch": 0.5159185287719992, "step": 5218, "train/sim_loss": 0.09765625 }, { "epoch": 0.5159185287719992, "step": 5218, "train/total_loss": 0.21449865400791168 }, { "entropy": 9.00767707824707, "epoch": 0.5160174016215148, "mean_token_accuracy": 0.7233532667160034, "num_tokens": 6349427.0, "step": 5219, "train/ce_loss": 1.1756572723388672 }, { "epoch": 0.5160174016215148, "step": 5219, "train/sim_loss": 0.05859375 }, { "epoch": 0.5160174016215148, "step": 5219, "train/total_loss": 0.17615947127342224 }, { "epoch": 0.5161162744710303, "grad_norm": 0.8571836352348328, "learning_rate": 8.712109973792218e-06, "loss": 0.1413, "step": 5220 }, { "entropy": 9.020292282104492, "epoch": 0.5161162744710303, "mean_token_accuracy": 0.745743453502655, "num_tokens": 6354758.0, "step": 5220, "train/ce_loss": 1.1463453769683838 }, { "epoch": 0.5161162744710303, "step": 5220, "train/sim_loss": 0.078125 }, { "epoch": 0.5161162744710303, "step": 5220, "train/total_loss": 0.19275954365730286 }, { "entropy": 9.523673057556152, "epoch": 0.5162151473205457, "mean_token_accuracy": 0.7344537973403931, "num_tokens": 6359794.0, "step": 5221, "train/ce_loss": 1.286370038986206 }, { "epoch": 0.5162151473205457, "step": 5221, "train/sim_loss": 0.046875 }, { "epoch": 0.5162151473205457, "step": 5221, "train/total_loss": 0.17551200091838837 }, { "entropy": 9.838347434997559, "epoch": 0.5163140201700613, "mean_token_accuracy": 0.6756151914596558, "num_tokens": 6364656.0, "step": 5222, "train/ce_loss": 2.0478384494781494 }, { "epoch": 0.5163140201700613, "step": 5222, "train/sim_loss": 0.08203125 }, { "epoch": 0.5163140201700613, "step": 5222, "train/total_loss": 0.2868151068687439 }, { "entropy": 8.906373977661133, "epoch": 0.5164128930195768, "mean_token_accuracy": 0.7170658707618713, "num_tokens": 6369716.0, "step": 5223, "train/ce_loss": 0.5118294954299927 }, { "epoch": 0.5164128930195768, "step": 5223, "train/sim_loss": 0.0625 }, { "epoch": 0.5164128930195768, "step": 5223, "train/total_loss": 0.11368295550346375 }, { "entropy": 9.602432250976562, "epoch": 0.5165117658690923, "mean_token_accuracy": 0.7116104960441589, "num_tokens": 6374679.0, "step": 5224, "train/ce_loss": 2.8455499432311626e-06 }, { "epoch": 0.5165117658690923, "step": 5224, "train/sim_loss": 0.046875 }, { "epoch": 0.5165117658690923, "step": 5224, "train/total_loss": 0.04687528312206268 }, { "entropy": 9.013307571411133, "epoch": 0.5166106387186079, "mean_token_accuracy": 0.7280488014221191, "num_tokens": 6379944.0, "step": 5225, "train/ce_loss": 0.9952422380447388 }, { "epoch": 0.5166106387186079, "step": 5225, "train/sim_loss": 0.06640625 }, { "epoch": 0.5166106387186079, "step": 5225, "train/total_loss": 0.16593047976493835 }, { "entropy": 10.098270416259766, "epoch": 0.5167095115681234, "mean_token_accuracy": 0.7131367325782776, "num_tokens": 6384717.0, "step": 5226, "train/ce_loss": 1.511696457862854 }, { "epoch": 0.5167095115681234, "step": 5226, "train/sim_loss": 0.14453125 }, { "epoch": 0.5167095115681234, "step": 5226, "train/total_loss": 0.29570090770721436 }, { "entropy": 9.121529579162598, "epoch": 0.5168083844176389, "mean_token_accuracy": 0.6698337197303772, "num_tokens": 6389984.0, "step": 5227, "train/ce_loss": 1.220581293106079 }, { "epoch": 0.5168083844176389, "step": 5227, "train/sim_loss": 0.05859375 }, { "epoch": 0.5168083844176389, "step": 5227, "train/total_loss": 0.18065187335014343 }, { "entropy": 9.042387008666992, "epoch": 0.5169072572671545, "mean_token_accuracy": 0.7225501537322998, "num_tokens": 6395307.0, "step": 5228, "train/ce_loss": 1.320287823677063 }, { "epoch": 0.5169072572671545, "step": 5228, "train/sim_loss": 0.06640625 }, { "epoch": 0.5169072572671545, "step": 5228, "train/total_loss": 0.19843503832817078 }, { "entropy": 9.300015449523926, "epoch": 0.51700613011667, "mean_token_accuracy": 0.7410072088241577, "num_tokens": 6400469.0, "step": 5229, "train/ce_loss": 0.9012818336486816 }, { "epoch": 0.51700613011667, "step": 5229, "train/sim_loss": 0.04296875 }, { "epoch": 0.51700613011667, "step": 5229, "train/total_loss": 0.13309693336486816 }, { "entropy": 8.792694091796875, "epoch": 0.5171050029661854, "mean_token_accuracy": 0.7126193046569824, "num_tokens": 6405925.0, "step": 5230, "train/ce_loss": 0.6694502830505371 }, { "epoch": 0.5171050029661854, "step": 5230, "train/sim_loss": 0.03515625 }, { "epoch": 0.5171050029661854, "step": 5230, "train/total_loss": 0.10210128128528595 }, { "entropy": 9.179935455322266, "epoch": 0.517203875815701, "mean_token_accuracy": 0.7749077677726746, "num_tokens": 6411195.0, "step": 5231, "train/ce_loss": 0.8501054048538208 }, { "epoch": 0.517203875815701, "step": 5231, "train/sim_loss": 0.08203125 }, { "epoch": 0.517203875815701, "step": 5231, "train/total_loss": 0.16704179346561432 }, { "entropy": 9.284443855285645, "epoch": 0.5173027486652165, "mean_token_accuracy": 0.6839762330055237, "num_tokens": 6416325.0, "step": 5232, "train/ce_loss": 4.317288585298229e-06 }, { "epoch": 0.5173027486652165, "step": 5232, "train/sim_loss": 0.03125 }, { "epoch": 0.5173027486652165, "step": 5232, "train/total_loss": 0.03125043213367462 }, { "entropy": 9.127395629882812, "epoch": 0.517401621514732, "mean_token_accuracy": 0.6614457964897156, "num_tokens": 6421602.0, "step": 5233, "train/ce_loss": 0.7827679514884949 }, { "epoch": 0.517401621514732, "step": 5233, "train/sim_loss": 0.09375 }, { "epoch": 0.517401621514732, "step": 5233, "train/total_loss": 0.17202679812908173 }, { "entropy": 8.866366386413574, "epoch": 0.5175004943642476, "mean_token_accuracy": 0.7063007950782776, "num_tokens": 6427049.0, "step": 5234, "train/ce_loss": 1.2991992235183716 }, { "epoch": 0.5175004943642476, "step": 5234, "train/sim_loss": 0.1171875 }, { "epoch": 0.5175004943642476, "step": 5234, "train/total_loss": 0.24710743129253387 }, { "entropy": 9.905988693237305, "epoch": 0.5175993672137631, "mean_token_accuracy": 0.8165374398231506, "num_tokens": 6431811.0, "step": 5235, "train/ce_loss": 7.886806088208687e-06 }, { "epoch": 0.5175993672137631, "step": 5235, "train/sim_loss": 0.05078125 }, { "epoch": 0.5175993672137631, "step": 5235, "train/total_loss": 0.050782039761543274 }, { "entropy": 8.81964111328125, "epoch": 0.5176982400632786, "mean_token_accuracy": 0.732833981513977, "num_tokens": 6437136.0, "step": 5236, "train/ce_loss": 0.5881039500236511 }, { "epoch": 0.5176982400632786, "step": 5236, "train/sim_loss": 0.05859375 }, { "epoch": 0.5176982400632786, "step": 5236, "train/total_loss": 0.11740414798259735 }, { "entropy": 9.448318481445312, "epoch": 0.5177971129127942, "mean_token_accuracy": 0.7641509175300598, "num_tokens": 6442205.0, "step": 5237, "train/ce_loss": 1.2880504131317139 }, { "epoch": 0.5177971129127942, "step": 5237, "train/sim_loss": 0.09375 }, { "epoch": 0.5177971129127942, "step": 5237, "train/total_loss": 0.2225550413131714 }, { "entropy": 9.1721830368042, "epoch": 0.5178959857623097, "mean_token_accuracy": 0.7208480834960938, "num_tokens": 6447550.0, "step": 5238, "train/ce_loss": 1.1573699712753296 }, { "epoch": 0.5178959857623097, "step": 5238, "train/sim_loss": 0.109375 }, { "epoch": 0.5178959857623097, "step": 5238, "train/total_loss": 0.22511199116706848 }, { "entropy": 9.189175605773926, "epoch": 0.5179948586118251, "mean_token_accuracy": 0.7442747950553894, "num_tokens": 6452801.0, "step": 5239, "train/ce_loss": 0.9737340807914734 }, { "epoch": 0.5179948586118251, "step": 5239, "train/sim_loss": 0.0234375 }, { "epoch": 0.5179948586118251, "step": 5239, "train/total_loss": 0.12081091105937958 }, { "epoch": 0.5180937314613407, "grad_norm": 0.6664369702339172, "learning_rate": 8.707165109034268e-06, "loss": 0.1519, "step": 5240 }, { "entropy": 8.873210906982422, "epoch": 0.5180937314613407, "mean_token_accuracy": 0.7202441692352295, "num_tokens": 6458195.0, "step": 5240, "train/ce_loss": 0.8848466277122498 }, { "epoch": 0.5180937314613407, "step": 5240, "train/sim_loss": 0.03125 }, { "epoch": 0.5180937314613407, "step": 5240, "train/total_loss": 0.11973466724157333 }, { "entropy": 9.209671974182129, "epoch": 0.5181926043108562, "mean_token_accuracy": 0.7430093288421631, "num_tokens": 6463421.0, "step": 5241, "train/ce_loss": 0.3739120066165924 }, { "epoch": 0.5181926043108562, "step": 5241, "train/sim_loss": 0.1171875 }, { "epoch": 0.5181926043108562, "step": 5241, "train/total_loss": 0.15457870066165924 }, { "entropy": 8.89012336730957, "epoch": 0.5182914771603717, "mean_token_accuracy": 0.746666669845581, "num_tokens": 6468589.0, "step": 5242, "train/ce_loss": 0.6734580397605896 }, { "epoch": 0.5182914771603717, "step": 5242, "train/sim_loss": 0.08203125 }, { "epoch": 0.5182914771603717, "step": 5242, "train/total_loss": 0.14937704801559448 }, { "entropy": 9.542342185974121, "epoch": 0.5183903500098873, "mean_token_accuracy": 0.7629513144493103, "num_tokens": 6473656.0, "step": 5243, "train/ce_loss": 1.0343583822250366 }, { "epoch": 0.5183903500098873, "step": 5243, "train/sim_loss": 0.04296875 }, { "epoch": 0.5183903500098873, "step": 5243, "train/total_loss": 0.14640459418296814 }, { "entropy": 9.919364929199219, "epoch": 0.5184892228594028, "mean_token_accuracy": 0.740359902381897, "num_tokens": 6478456.0, "step": 5244, "train/ce_loss": 1.3556137084960938 }, { "epoch": 0.5184892228594028, "step": 5244, "train/sim_loss": 0.0390625 }, { "epoch": 0.5184892228594028, "step": 5244, "train/total_loss": 0.17462387681007385 }, { "entropy": 9.06728458404541, "epoch": 0.5185880957089183, "mean_token_accuracy": 0.7038043737411499, "num_tokens": 6483656.0, "step": 5245, "train/ce_loss": 0.7342151403427124 }, { "epoch": 0.5185880957089183, "step": 5245, "train/sim_loss": 0.0859375 }, { "epoch": 0.5185880957089183, "step": 5245, "train/total_loss": 0.15935900807380676 }, { "entropy": 9.586074829101562, "epoch": 0.5186869685584339, "mean_token_accuracy": 0.7088607549667358, "num_tokens": 6488459.0, "step": 5246, "train/ce_loss": 3.950816790165845e-06 }, { "epoch": 0.5186869685584339, "step": 5246, "train/sim_loss": 0.05859375 }, { "epoch": 0.5186869685584339, "step": 5246, "train/total_loss": 0.05859414488077164 }, { "entropy": 8.880306243896484, "epoch": 0.5187858414079494, "mean_token_accuracy": 0.751870334148407, "num_tokens": 6493690.0, "step": 5247, "train/ce_loss": 0.5954692363739014 }, { "epoch": 0.5187858414079494, "step": 5247, "train/sim_loss": 0.02734375 }, { "epoch": 0.5187858414079494, "step": 5247, "train/total_loss": 0.08689067512750626 }, { "entropy": 9.145792007446289, "epoch": 0.5188847142574649, "mean_token_accuracy": 0.7543624043464661, "num_tokens": 6498911.0, "step": 5248, "train/ce_loss": 1.5778733491897583 }, { "epoch": 0.5188847142574649, "step": 5248, "train/sim_loss": 0.109375 }, { "epoch": 0.5188847142574649, "step": 5248, "train/total_loss": 0.2671623229980469 }, { "entropy": 9.40733528137207, "epoch": 0.5189835871069804, "mean_token_accuracy": 0.716946005821228, "num_tokens": 6503915.0, "step": 5249, "train/ce_loss": 1.213287353515625 }, { "epoch": 0.5189835871069804, "step": 5249, "train/sim_loss": 0.03125 }, { "epoch": 0.5189835871069804, "step": 5249, "train/total_loss": 0.15257874131202698 }, { "entropy": 8.57093620300293, "epoch": 0.5190824599564959, "mean_token_accuracy": 0.7903845906257629, "num_tokens": 6509667.0, "step": 5250, "train/ce_loss": 0.6045466065406799 }, { "epoch": 0.5190824599564959, "step": 5250, "train/sim_loss": 0.0390625 }, { "epoch": 0.5190824599564959, "step": 5250, "train/total_loss": 0.09951716661453247 }, { "entropy": 9.538387298583984, "epoch": 0.5191813328060114, "mean_token_accuracy": 0.7171717286109924, "num_tokens": 6514623.0, "step": 5251, "train/ce_loss": 1.3213231563568115 }, { "epoch": 0.5191813328060114, "step": 5251, "train/sim_loss": 0.0546875 }, { "epoch": 0.5191813328060114, "step": 5251, "train/total_loss": 0.18681982159614563 }, { "entropy": 9.209657669067383, "epoch": 0.519280205655527, "mean_token_accuracy": 0.7266187071800232, "num_tokens": 6519770.0, "step": 5252, "train/ce_loss": 0.8175265789031982 }, { "epoch": 0.519280205655527, "step": 5252, "train/sim_loss": 0.046875 }, { "epoch": 0.519280205655527, "step": 5252, "train/total_loss": 0.12862765789031982 }, { "entropy": 9.48554801940918, "epoch": 0.5193790785050425, "mean_token_accuracy": 0.6927176117897034, "num_tokens": 6524802.0, "step": 5253, "train/ce_loss": 1.8125488758087158 }, { "epoch": 0.5193790785050425, "step": 5253, "train/sim_loss": 0.07421875 }, { "epoch": 0.5193790785050425, "step": 5253, "train/total_loss": 0.25547364354133606 }, { "entropy": 9.817337036132812, "epoch": 0.519477951354558, "mean_token_accuracy": 0.7514563202857971, "num_tokens": 6529724.0, "step": 5254, "train/ce_loss": 0.7927056550979614 }, { "epoch": 0.519477951354558, "step": 5254, "train/sim_loss": 0.0625 }, { "epoch": 0.519477951354558, "step": 5254, "train/total_loss": 0.14177057147026062 }, { "entropy": 9.053857803344727, "epoch": 0.5195768242040736, "mean_token_accuracy": 0.7128146290779114, "num_tokens": 6535045.0, "step": 5255, "train/ce_loss": 0.7244611978530884 }, { "epoch": 0.5195768242040736, "step": 5255, "train/sim_loss": 0.0546875 }, { "epoch": 0.5195768242040736, "step": 5255, "train/total_loss": 0.12713362276554108 }, { "entropy": 9.557379722595215, "epoch": 0.5196756970535891, "mean_token_accuracy": 0.7753743529319763, "num_tokens": 6540094.0, "step": 5256, "train/ce_loss": 1.0860475301742554 }, { "epoch": 0.5196756970535891, "step": 5256, "train/sim_loss": 0.015625 }, { "epoch": 0.5196756970535891, "step": 5256, "train/total_loss": 0.12422975152730942 }, { "entropy": 9.26852798461914, "epoch": 0.5197745699031046, "mean_token_accuracy": 0.7614213228225708, "num_tokens": 6545321.0, "step": 5257, "train/ce_loss": 0.38823726773262024 }, { "epoch": 0.5197745699031046, "step": 5257, "train/sim_loss": 0.01953125 }, { "epoch": 0.5197745699031046, "step": 5257, "train/total_loss": 0.058354977518320084 }, { "entropy": 9.255789756774902, "epoch": 0.5198734427526202, "mean_token_accuracy": 0.6846965551376343, "num_tokens": 6550532.0, "step": 5258, "train/ce_loss": 1.7871288061141968 }, { "epoch": 0.5198734427526202, "step": 5258, "train/sim_loss": 0.109375 }, { "epoch": 0.5198734427526202, "step": 5258, "train/total_loss": 0.2880879044532776 }, { "entropy": 9.217986106872559, "epoch": 0.5199723156021356, "mean_token_accuracy": 0.6962864995002747, "num_tokens": 6555745.0, "step": 5259, "train/ce_loss": 0.5425359010696411 }, { "epoch": 0.5199723156021356, "step": 5259, "train/sim_loss": 0.0625 }, { "epoch": 0.5199723156021356, "step": 5259, "train/total_loss": 0.11675359308719635 }, { "epoch": 0.5200711884516511, "grad_norm": 0.7438312768936157, "learning_rate": 8.702220244276319e-06, "loss": 0.1409, "step": 5260 }, { "entropy": 9.3577880859375, "epoch": 0.5200711884516511, "mean_token_accuracy": 0.7811271548271179, "num_tokens": 6560923.0, "step": 5260, "train/ce_loss": 1.415432848261844e-06 }, { "epoch": 0.5200711884516511, "step": 5260, "train/sim_loss": 0.0625 }, { "epoch": 0.5200711884516511, "step": 5260, "train/total_loss": 0.06250014156103134 }, { "entropy": 9.415997505187988, "epoch": 0.5201700613011667, "mean_token_accuracy": 0.7555555701255798, "num_tokens": 6565925.0, "step": 5261, "train/ce_loss": 2.781298690024414e-06 }, { "epoch": 0.5201700613011667, "step": 5261, "train/sim_loss": 0.0625 }, { "epoch": 0.5201700613011667, "step": 5261, "train/total_loss": 0.06250027567148209 }, { "entropy": 8.994209289550781, "epoch": 0.5202689341506822, "mean_token_accuracy": 0.7211660146713257, "num_tokens": 6571218.0, "step": 5262, "train/ce_loss": 1.2079790830612183 }, { "epoch": 0.5202689341506822, "step": 5262, "train/sim_loss": 0.109375 }, { "epoch": 0.5202689341506822, "step": 5262, "train/total_loss": 0.23017290234565735 }, { "entropy": 9.183069229125977, "epoch": 0.5203678070001977, "mean_token_accuracy": 0.7012448310852051, "num_tokens": 6576396.0, "step": 5263, "train/ce_loss": 0.9855268001556396 }, { "epoch": 0.5203678070001977, "step": 5263, "train/sim_loss": 0.03515625 }, { "epoch": 0.5203678070001977, "step": 5263, "train/total_loss": 0.1337089240550995 }, { "entropy": 9.500748634338379, "epoch": 0.5204666798497133, "mean_token_accuracy": 0.6936936974525452, "num_tokens": 6581366.0, "step": 5264, "train/ce_loss": 3.6748667753272457e-06 }, { "epoch": 0.5204666798497133, "step": 5264, "train/sim_loss": 0.01953125 }, { "epoch": 0.5204666798497133, "step": 5264, "train/total_loss": 0.0195316169410944 }, { "entropy": 9.311800956726074, "epoch": 0.5205655526992288, "mean_token_accuracy": 0.7102922201156616, "num_tokens": 6586606.0, "step": 5265, "train/ce_loss": 1.0904486179351807 }, { "epoch": 0.5205655526992288, "step": 5265, "train/sim_loss": 0.0859375 }, { "epoch": 0.5205655526992288, "step": 5265, "train/total_loss": 0.1949823647737503 }, { "entropy": 9.069988250732422, "epoch": 0.5206644255487444, "mean_token_accuracy": 0.71775221824646, "num_tokens": 6591843.0, "step": 5266, "train/ce_loss": 0.8036020994186401 }, { "epoch": 0.5206644255487444, "step": 5266, "train/sim_loss": 0.0625 }, { "epoch": 0.5206644255487444, "step": 5266, "train/total_loss": 0.14286020398139954 }, { "entropy": 9.300836563110352, "epoch": 0.5207632983982599, "mean_token_accuracy": 0.7404958605766296, "num_tokens": 6596956.0, "step": 5267, "train/ce_loss": 0.7432721257209778 }, { "epoch": 0.5207632983982599, "step": 5267, "train/sim_loss": 0.0390625 }, { "epoch": 0.5207632983982599, "step": 5267, "train/total_loss": 0.11338971555233002 }, { "entropy": 8.987645149230957, "epoch": 0.5208621712477753, "mean_token_accuracy": 0.7181687951087952, "num_tokens": 6602135.0, "step": 5268, "train/ce_loss": 1.1967494487762451 }, { "epoch": 0.5208621712477753, "step": 5268, "train/sim_loss": 0.07421875 }, { "epoch": 0.5208621712477753, "step": 5268, "train/total_loss": 0.193893700838089 }, { "entropy": 9.507927894592285, "epoch": 0.5209610440972909, "mean_token_accuracy": 0.6975308656692505, "num_tokens": 6607210.0, "step": 5269, "train/ce_loss": 1.2455633878707886 }, { "epoch": 0.5209610440972909, "step": 5269, "train/sim_loss": 0.08984375 }, { "epoch": 0.5209610440972909, "step": 5269, "train/total_loss": 0.21440008282661438 }, { "entropy": 9.510259628295898, "epoch": 0.5210599169468064, "mean_token_accuracy": 0.6652047038078308, "num_tokens": 6612311.0, "step": 5270, "train/ce_loss": 1.3466548919677734 }, { "epoch": 0.5210599169468064, "step": 5270, "train/sim_loss": 0.0703125 }, { "epoch": 0.5210599169468064, "step": 5270, "train/total_loss": 0.20497798919677734 }, { "entropy": 9.148332595825195, "epoch": 0.5211587897963219, "mean_token_accuracy": 0.7044943571090698, "num_tokens": 6617811.0, "step": 5271, "train/ce_loss": 0.7625870108604431 }, { "epoch": 0.5211587897963219, "step": 5271, "train/sim_loss": 0.05859375 }, { "epoch": 0.5211587897963219, "step": 5271, "train/total_loss": 0.13485245406627655 }, { "entropy": 9.175765037536621, "epoch": 0.5212576626458375, "mean_token_accuracy": 0.7534818649291992, "num_tokens": 6623000.0, "step": 5272, "train/ce_loss": 0.9953817129135132 }, { "epoch": 0.5212576626458375, "step": 5272, "train/sim_loss": 0.0625 }, { "epoch": 0.5212576626458375, "step": 5272, "train/total_loss": 0.1620381772518158 }, { "entropy": 9.28423023223877, "epoch": 0.521356535495353, "mean_token_accuracy": 0.8075187802314758, "num_tokens": 6628153.0, "step": 5273, "train/ce_loss": 0.4667331874370575 }, { "epoch": 0.521356535495353, "step": 5273, "train/sim_loss": 0.04296875 }, { "epoch": 0.521356535495353, "step": 5273, "train/total_loss": 0.08964207023382187 }, { "entropy": 8.493019104003906, "epoch": 0.5214554083448685, "mean_token_accuracy": 0.6848204135894775, "num_tokens": 6633475.0, "step": 5274, "train/ce_loss": 0.7903342843055725 }, { "epoch": 0.5214554083448685, "step": 5274, "train/sim_loss": 0.0546875 }, { "epoch": 0.5214554083448685, "step": 5274, "train/total_loss": 0.13372093439102173 }, { "entropy": 9.316062927246094, "epoch": 0.5215542811943841, "mean_token_accuracy": 0.7250803709030151, "num_tokens": 6638602.0, "step": 5275, "train/ce_loss": 2.0475013116083574e-06 }, { "epoch": 0.5215542811943841, "step": 5275, "train/sim_loss": 0.03515625 }, { "epoch": 0.5215542811943841, "step": 5275, "train/total_loss": 0.035156454890966415 }, { "entropy": 9.227267265319824, "epoch": 0.5216531540438996, "mean_token_accuracy": 0.7750309109687805, "num_tokens": 6643871.0, "step": 5276, "train/ce_loss": 0.4963589310646057 }, { "epoch": 0.5216531540438996, "step": 5276, "train/sim_loss": 0.02734375 }, { "epoch": 0.5216531540438996, "step": 5276, "train/total_loss": 0.07697964459657669 }, { "entropy": 8.601722717285156, "epoch": 0.521752026893415, "mean_token_accuracy": 0.743984580039978, "num_tokens": 6649451.0, "step": 5277, "train/ce_loss": 0.956932544708252 }, { "epoch": 0.521752026893415, "step": 5277, "train/sim_loss": 0.05078125 }, { "epoch": 0.521752026893415, "step": 5277, "train/total_loss": 0.14647451043128967 }, { "entropy": 9.529146194458008, "epoch": 0.5218508997429306, "mean_token_accuracy": 0.7466410994529724, "num_tokens": 6654425.0, "step": 5278, "train/ce_loss": 0.8007546663284302 }, { "epoch": 0.5218508997429306, "step": 5278, "train/sim_loss": 0.04296875 }, { "epoch": 0.5218508997429306, "step": 5278, "train/total_loss": 0.1230442151427269 }, { "entropy": 9.57694149017334, "epoch": 0.5219497725924461, "mean_token_accuracy": 0.7283333539962769, "num_tokens": 6659479.0, "step": 5279, "train/ce_loss": 0.7926003932952881 }, { "epoch": 0.5219497725924461, "step": 5279, "train/sim_loss": 0.078125 }, { "epoch": 0.5219497725924461, "step": 5279, "train/total_loss": 0.15738505125045776 }, { "epoch": 0.5220486454419616, "grad_norm": 0.7354840636253357, "learning_rate": 8.697275379518371e-06, "loss": 0.1431, "step": 5280 }, { "entropy": 9.120805740356445, "epoch": 0.5220486454419616, "mean_token_accuracy": 0.720963180065155, "num_tokens": 6664626.0, "step": 5280, "train/ce_loss": 1.787015776244516e-06 }, { "epoch": 0.5220486454419616, "step": 5280, "train/sim_loss": 0.04296875 }, { "epoch": 0.5220486454419616, "step": 5280, "train/total_loss": 0.042968928813934326 }, { "entropy": 9.510129928588867, "epoch": 0.5221475182914772, "mean_token_accuracy": 0.7185184955596924, "num_tokens": 6669642.0, "step": 5281, "train/ce_loss": 0.986981213092804 }, { "epoch": 0.5221475182914772, "step": 5281, "train/sim_loss": 0.06640625 }, { "epoch": 0.5221475182914772, "step": 5281, "train/total_loss": 0.16510437428951263 }, { "entropy": 9.158186912536621, "epoch": 0.5222463911409927, "mean_token_accuracy": 0.7033132314682007, "num_tokens": 6674784.0, "step": 5282, "train/ce_loss": 2.32814621925354 }, { "epoch": 0.5222463911409927, "step": 5282, "train/sim_loss": 0.0703125 }, { "epoch": 0.5222463911409927, "step": 5282, "train/total_loss": 0.30312711000442505 }, { "entropy": 9.896245956420898, "epoch": 0.5223452639905082, "mean_token_accuracy": 0.7933070659637451, "num_tokens": 6679664.0, "step": 5283, "train/ce_loss": 1.2164676945758401e-06 }, { "epoch": 0.5223452639905082, "step": 5283, "train/sim_loss": 0.015625 }, { "epoch": 0.5223452639905082, "step": 5283, "train/total_loss": 0.0156251210719347 }, { "entropy": 9.684252738952637, "epoch": 0.5224441368400238, "mean_token_accuracy": 0.7310344576835632, "num_tokens": 6684510.0, "step": 5284, "train/ce_loss": 2.1813316345214844 }, { "epoch": 0.5224441368400238, "step": 5284, "train/sim_loss": 0.0859375 }, { "epoch": 0.5224441368400238, "step": 5284, "train/total_loss": 0.3040706515312195 }, { "entropy": 9.001659393310547, "epoch": 0.5225430096895393, "mean_token_accuracy": 0.7677119374275208, "num_tokens": 6689853.0, "step": 5285, "train/ce_loss": 0.3454212248325348 }, { "epoch": 0.5225430096895393, "step": 5285, "train/sim_loss": 0.0390625 }, { "epoch": 0.5225430096895393, "step": 5285, "train/total_loss": 0.07360462844371796 }, { "entropy": 8.513310432434082, "epoch": 0.5226418825390547, "mean_token_accuracy": 0.7743403315544128, "num_tokens": 6695444.0, "step": 5286, "train/ce_loss": 0.8925740122795105 }, { "epoch": 0.5226418825390547, "step": 5286, "train/sim_loss": 0.08203125 }, { "epoch": 0.5226418825390547, "step": 5286, "train/total_loss": 0.1712886542081833 }, { "entropy": 8.87952995300293, "epoch": 0.5227407553885703, "mean_token_accuracy": 0.7326120734214783, "num_tokens": 6700523.0, "step": 5287, "train/ce_loss": 9.726073585625272e-06 }, { "epoch": 0.5227407553885703, "step": 5287, "train/sim_loss": 0.04296875 }, { "epoch": 0.5227407553885703, "step": 5287, "train/total_loss": 0.0429697223007679 }, { "entropy": 9.098947525024414, "epoch": 0.5228396282380858, "mean_token_accuracy": 0.773964524269104, "num_tokens": 6705833.0, "step": 5288, "train/ce_loss": 0.6194140911102295 }, { "epoch": 0.5228396282380858, "step": 5288, "train/sim_loss": 0.0546875 }, { "epoch": 0.5228396282380858, "step": 5288, "train/total_loss": 0.11662891507148743 }, { "entropy": 9.876415252685547, "epoch": 0.5229385010876013, "mean_token_accuracy": 0.7610389590263367, "num_tokens": 6710620.0, "step": 5289, "train/ce_loss": 1.7455840110778809 }, { "epoch": 0.5229385010876013, "step": 5289, "train/sim_loss": 0.09765625 }, { "epoch": 0.5229385010876013, "step": 5289, "train/total_loss": 0.2722146511077881 }, { "entropy": 9.425487518310547, "epoch": 0.5230373739371169, "mean_token_accuracy": 0.769599974155426, "num_tokens": 6715668.0, "step": 5290, "train/ce_loss": 8.170946443897265e-07 }, { "epoch": 0.5230373739371169, "step": 5290, "train/sim_loss": 0.01953125 }, { "epoch": 0.5230373739371169, "step": 5290, "train/total_loss": 0.019531331956386566 }, { "entropy": 8.816116333007812, "epoch": 0.5231362467866324, "mean_token_accuracy": 0.7628541588783264, "num_tokens": 6721053.0, "step": 5291, "train/ce_loss": 1.1168615818023682 }, { "epoch": 0.5231362467866324, "step": 5291, "train/sim_loss": 0.0859375 }, { "epoch": 0.5231362467866324, "step": 5291, "train/total_loss": 0.19762367010116577 }, { "entropy": 9.165191650390625, "epoch": 0.5232351196361479, "mean_token_accuracy": 0.7084367275238037, "num_tokens": 6726314.0, "step": 5292, "train/ce_loss": 0.8726534247398376 }, { "epoch": 0.5232351196361479, "step": 5292, "train/sim_loss": 0.0546875 }, { "epoch": 0.5232351196361479, "step": 5292, "train/total_loss": 0.14195284247398376 }, { "entropy": 9.813233375549316, "epoch": 0.5233339924856635, "mean_token_accuracy": 0.7095343470573425, "num_tokens": 6731179.0, "step": 5293, "train/ce_loss": 5.149100161361275e-06 }, { "epoch": 0.5233339924856635, "step": 5293, "train/sim_loss": 0.0625 }, { "epoch": 0.5233339924856635, "step": 5293, "train/total_loss": 0.06250051409006119 }, { "entropy": 9.578452110290527, "epoch": 0.523432865335179, "mean_token_accuracy": 0.6940966248512268, "num_tokens": 6736110.0, "step": 5294, "train/ce_loss": 8.76888805123599e-07 }, { "epoch": 0.523432865335179, "step": 5294, "train/sim_loss": 0.015625 }, { "epoch": 0.523432865335179, "step": 5294, "train/total_loss": 0.015625087544322014 }, { "entropy": 9.136296272277832, "epoch": 0.5235317381846945, "mean_token_accuracy": 0.728923499584198, "num_tokens": 6741354.0, "step": 5295, "train/ce_loss": 0.9489161968231201 }, { "epoch": 0.5235317381846945, "step": 5295, "train/sim_loss": 0.0703125 }, { "epoch": 0.5235317381846945, "step": 5295, "train/total_loss": 0.16520412266254425 }, { "entropy": 8.934793472290039, "epoch": 0.52363061103421, "mean_token_accuracy": 0.7049808502197266, "num_tokens": 6746643.0, "step": 5296, "train/ce_loss": 1.0172444581985474 }, { "epoch": 0.52363061103421, "step": 5296, "train/sim_loss": 0.08203125 }, { "epoch": 0.52363061103421, "step": 5296, "train/total_loss": 0.18375569581985474 }, { "entropy": 8.824613571166992, "epoch": 0.5237294838837255, "mean_token_accuracy": 0.7030302882194519, "num_tokens": 6752077.0, "step": 5297, "train/ce_loss": 1.4362319707870483 }, { "epoch": 0.5237294838837255, "step": 5297, "train/sim_loss": 0.1015625 }, { "epoch": 0.5237294838837255, "step": 5297, "train/total_loss": 0.2451857030391693 }, { "entropy": 8.722902297973633, "epoch": 0.523828356733241, "mean_token_accuracy": 0.732421875, "num_tokens": 6757588.0, "step": 5298, "train/ce_loss": 0.6351065039634705 }, { "epoch": 0.523828356733241, "step": 5298, "train/sim_loss": 0.05078125 }, { "epoch": 0.523828356733241, "step": 5298, "train/total_loss": 0.11429189890623093 }, { "entropy": 9.172910690307617, "epoch": 0.5239272295827566, "mean_token_accuracy": 0.732899010181427, "num_tokens": 6762677.0, "step": 5299, "train/ce_loss": 1.0685198307037354 }, { "epoch": 0.5239272295827566, "step": 5299, "train/sim_loss": 0.05859375 }, { "epoch": 0.5239272295827566, "step": 5299, "train/total_loss": 0.1654457449913025 }, { "epoch": 0.5240261024322721, "grad_norm": 0.7753241062164307, "learning_rate": 8.692330514760421e-06, "loss": 0.142, "step": 5300 }, { "entropy": 9.261425018310547, "epoch": 0.5240261024322721, "mean_token_accuracy": 0.7984732985496521, "num_tokens": 6767778.0, "step": 5300, "train/ce_loss": 0.6743336319923401 }, { "epoch": 0.5240261024322721, "step": 5300, "train/sim_loss": 0.0390625 }, { "epoch": 0.5240261024322721, "step": 5300, "train/total_loss": 0.10649586468935013 }, { "entropy": 8.882976531982422, "epoch": 0.5241249752817876, "mean_token_accuracy": 0.7522624731063843, "num_tokens": 6773118.0, "step": 5301, "train/ce_loss": 0.9140323996543884 }, { "epoch": 0.5241249752817876, "step": 5301, "train/sim_loss": 0.0390625 }, { "epoch": 0.5241249752817876, "step": 5301, "train/total_loss": 0.13046574592590332 }, { "entropy": 8.891812324523926, "epoch": 0.5242238481313032, "mean_token_accuracy": 0.7790178656578064, "num_tokens": 6778453.0, "step": 5302, "train/ce_loss": 0.5919695496559143 }, { "epoch": 0.5242238481313032, "step": 5302, "train/sim_loss": 0.0234375 }, { "epoch": 0.5242238481313032, "step": 5302, "train/total_loss": 0.08263445645570755 }, { "entropy": 9.686471939086914, "epoch": 0.5243227209808187, "mean_token_accuracy": 0.7914572954177856, "num_tokens": 6783293.0, "step": 5303, "train/ce_loss": 3.1267711619875627e-06 }, { "epoch": 0.5243227209808187, "step": 5303, "train/sim_loss": 0.0625 }, { "epoch": 0.5243227209808187, "step": 5303, "train/total_loss": 0.06250031292438507 }, { "entropy": 10.26594066619873, "epoch": 0.5244215938303342, "mean_token_accuracy": 0.7287449240684509, "num_tokens": 6787955.0, "step": 5304, "train/ce_loss": 1.0271916835336015e-05 }, { "epoch": 0.5244215938303342, "step": 5304, "train/sim_loss": 0.0390625 }, { "epoch": 0.5244215938303342, "step": 5304, "train/total_loss": 0.039063528180122375 }, { "entropy": 9.083431243896484, "epoch": 0.5245204666798498, "mean_token_accuracy": 0.7402746081352234, "num_tokens": 6793496.0, "step": 5305, "train/ce_loss": 1.131496548652649 }, { "epoch": 0.5245204666798498, "step": 5305, "train/sim_loss": 0.0703125 }, { "epoch": 0.5245204666798498, "step": 5305, "train/total_loss": 0.18346215784549713 }, { "entropy": 8.841108322143555, "epoch": 0.5246193395293652, "mean_token_accuracy": 0.6474164128303528, "num_tokens": 6798999.0, "step": 5306, "train/ce_loss": 1.169258952140808 }, { "epoch": 0.5246193395293652, "step": 5306, "train/sim_loss": 0.09375 }, { "epoch": 0.5246193395293652, "step": 5306, "train/total_loss": 0.2106758952140808 }, { "entropy": 9.429069519042969, "epoch": 0.5247182123788807, "mean_token_accuracy": 0.7676281929016113, "num_tokens": 6804040.0, "step": 5307, "train/ce_loss": 1.165016531944275 }, { "epoch": 0.5247182123788807, "step": 5307, "train/sim_loss": 0.0234375 }, { "epoch": 0.5247182123788807, "step": 5307, "train/total_loss": 0.13993915915489197 }, { "entropy": 9.255001068115234, "epoch": 0.5248170852283963, "mean_token_accuracy": 0.7620320916175842, "num_tokens": 6809250.0, "step": 5308, "train/ce_loss": 0.6999289989471436 }, { "epoch": 0.5248170852283963, "step": 5308, "train/sim_loss": 0.08203125 }, { "epoch": 0.5248170852283963, "step": 5308, "train/total_loss": 0.15202414989471436 }, { "entropy": 9.225317001342773, "epoch": 0.5249159580779118, "mean_token_accuracy": 0.6915760636329651, "num_tokens": 6814455.0, "step": 5309, "train/ce_loss": 1.4170299768447876 }, { "epoch": 0.5249159580779118, "step": 5309, "train/sim_loss": 0.06640625 }, { "epoch": 0.5249159580779118, "step": 5309, "train/total_loss": 0.20810924470424652 }, { "entropy": 9.13276481628418, "epoch": 0.5250148309274273, "mean_token_accuracy": 0.7012345790863037, "num_tokens": 6819739.0, "step": 5310, "train/ce_loss": 0.6636089086532593 }, { "epoch": 0.5250148309274273, "step": 5310, "train/sim_loss": 0.0546875 }, { "epoch": 0.5250148309274273, "step": 5310, "train/total_loss": 0.12104839086532593 }, { "entropy": 9.337963104248047, "epoch": 0.5251137037769429, "mean_token_accuracy": 0.7988422513008118, "num_tokens": 6824894.0, "step": 5311, "train/ce_loss": 0.8194974064826965 }, { "epoch": 0.5251137037769429, "step": 5311, "train/sim_loss": 0.0546875 }, { "epoch": 0.5251137037769429, "step": 5311, "train/total_loss": 0.13663724064826965 }, { "entropy": 9.612142562866211, "epoch": 0.5252125766264584, "mean_token_accuracy": 0.7628083229064941, "num_tokens": 6829864.0, "step": 5312, "train/ce_loss": 1.1538187265396118 }, { "epoch": 0.5252125766264584, "step": 5312, "train/sim_loss": 0.0390625 }, { "epoch": 0.5252125766264584, "step": 5312, "train/total_loss": 0.1544443666934967 }, { "entropy": 9.37071704864502, "epoch": 0.5253114494759739, "mean_token_accuracy": 0.7274011373519897, "num_tokens": 6835021.0, "step": 5313, "train/ce_loss": 4.946698027197272e-06 }, { "epoch": 0.5253114494759739, "step": 5313, "train/sim_loss": 0.02734375 }, { "epoch": 0.5253114494759739, "step": 5313, "train/total_loss": 0.027344245463609695 }, { "entropy": 8.565313339233398, "epoch": 0.5254103223254895, "mean_token_accuracy": 0.7023661136627197, "num_tokens": 6840341.0, "step": 5314, "train/ce_loss": 0.6946756839752197 }, { "epoch": 0.5254103223254895, "step": 5314, "train/sim_loss": 0.0390625 }, { "epoch": 0.5254103223254895, "step": 5314, "train/total_loss": 0.10853006690740585 }, { "entropy": 9.333951950073242, "epoch": 0.5255091951750049, "mean_token_accuracy": 0.7058823704719543, "num_tokens": 6845504.0, "step": 5315, "train/ce_loss": 1.2145005464553833 }, { "epoch": 0.5255091951750049, "step": 5315, "train/sim_loss": 0.078125 }, { "epoch": 0.5255091951750049, "step": 5315, "train/total_loss": 0.19957506656646729 }, { "entropy": 9.258594512939453, "epoch": 0.5256080680245204, "mean_token_accuracy": 0.723308265209198, "num_tokens": 6850575.0, "step": 5316, "train/ce_loss": 1.0137473344802856 }, { "epoch": 0.5256080680245204, "step": 5316, "train/sim_loss": 0.05078125 }, { "epoch": 0.5256080680245204, "step": 5316, "train/total_loss": 0.15215599536895752 }, { "entropy": 9.147095680236816, "epoch": 0.525706940874036, "mean_token_accuracy": 0.7363861203193665, "num_tokens": 6855829.0, "step": 5317, "train/ce_loss": 5.505980880116113e-07 }, { "epoch": 0.525706940874036, "step": 5317, "train/sim_loss": 0.01953125 }, { "epoch": 0.525706940874036, "step": 5317, "train/total_loss": 0.019531305879354477 }, { "entropy": 9.644766807556152, "epoch": 0.5258058137235515, "mean_token_accuracy": 0.7862903475761414, "num_tokens": 6860726.0, "step": 5318, "train/ce_loss": 1.0257605254082591e-06 }, { "epoch": 0.5258058137235515, "step": 5318, "train/sim_loss": 0.03125 }, { "epoch": 0.5258058137235515, "step": 5318, "train/total_loss": 0.03125010430812836 }, { "entropy": 9.015385627746582, "epoch": 0.525904686573067, "mean_token_accuracy": 0.6856464743614197, "num_tokens": 6866068.0, "step": 5319, "train/ce_loss": 1.1722711324691772 }, { "epoch": 0.525904686573067, "step": 5319, "train/sim_loss": 0.08203125 }, { "epoch": 0.525904686573067, "step": 5319, "train/total_loss": 0.19925835728645325 }, { "epoch": 0.5260035594225826, "grad_norm": 0.7373986840248108, "learning_rate": 8.687385650002474e-06, "loss": 0.1406, "step": 5320 }, { "entropy": 9.100988388061523, "epoch": 0.5260035594225826, "mean_token_accuracy": 0.7322946190834045, "num_tokens": 6871259.0, "step": 5320, "train/ce_loss": 0.6726405620574951 }, { "epoch": 0.5260035594225826, "step": 5320, "train/sim_loss": 0.03515625 }, { "epoch": 0.5260035594225826, "step": 5320, "train/total_loss": 0.10242030769586563 }, { "entropy": 9.619433403015137, "epoch": 0.5261024322720981, "mean_token_accuracy": 0.7317647337913513, "num_tokens": 6876083.0, "step": 5321, "train/ce_loss": 2.4636622129037278e-06 }, { "epoch": 0.5261024322720981, "step": 5321, "train/sim_loss": 0.03125 }, { "epoch": 0.5261024322720981, "step": 5321, "train/total_loss": 0.0312502458691597 }, { "entropy": 9.355940818786621, "epoch": 0.5262013051216136, "mean_token_accuracy": 0.7342342138290405, "num_tokens": 6881193.0, "step": 5322, "train/ce_loss": 0.9472965002059937 }, { "epoch": 0.5262013051216136, "step": 5322, "train/sim_loss": 0.1015625 }, { "epoch": 0.5262013051216136, "step": 5322, "train/total_loss": 0.19629216194152832 }, { "entropy": 9.124074935913086, "epoch": 0.5263001779711292, "mean_token_accuracy": 0.7120141386985779, "num_tokens": 6886202.0, "step": 5323, "train/ce_loss": 2.04034381567908e-06 }, { "epoch": 0.5263001779711292, "step": 5323, "train/sim_loss": 0.03515625 }, { "epoch": 0.5263001779711292, "step": 5323, "train/total_loss": 0.035156454890966415 }, { "entropy": 9.902753829956055, "epoch": 0.5263990508206446, "mean_token_accuracy": 0.6672897338867188, "num_tokens": 6891114.0, "step": 5324, "train/ce_loss": 9.634625257604057e-07 }, { "epoch": 0.5263990508206446, "step": 5324, "train/sim_loss": 0.01953125 }, { "epoch": 0.5263990508206446, "step": 5324, "train/total_loss": 0.01953134685754776 }, { "entropy": 9.061259269714355, "epoch": 0.5264979236701601, "mean_token_accuracy": 0.729393482208252, "num_tokens": 6896169.0, "step": 5325, "train/ce_loss": 1.6091874837875366 }, { "epoch": 0.5264979236701601, "step": 5325, "train/sim_loss": 0.0625 }, { "epoch": 0.5264979236701601, "step": 5325, "train/total_loss": 0.22341875731945038 }, { "entropy": 8.957914352416992, "epoch": 0.5265967965196757, "mean_token_accuracy": 0.7336394786834717, "num_tokens": 6901464.0, "step": 5326, "train/ce_loss": 0.5018377304077148 }, { "epoch": 0.5265967965196757, "step": 5326, "train/sim_loss": 0.0859375 }, { "epoch": 0.5265967965196757, "step": 5326, "train/total_loss": 0.13612127304077148 }, { "entropy": 8.64019775390625, "epoch": 0.5266956693691912, "mean_token_accuracy": 0.6845564246177673, "num_tokens": 6907038.0, "step": 5327, "train/ce_loss": 0.9129147529602051 }, { "epoch": 0.5266956693691912, "step": 5327, "train/sim_loss": 0.046875 }, { "epoch": 0.5266956693691912, "step": 5327, "train/total_loss": 0.13816648721694946 }, { "entropy": 9.07165241241455, "epoch": 0.5267945422187067, "mean_token_accuracy": 0.7884841561317444, "num_tokens": 6912332.0, "step": 5328, "train/ce_loss": 0.5771158337593079 }, { "epoch": 0.5267945422187067, "step": 5328, "train/sim_loss": 0.0625 }, { "epoch": 0.5267945422187067, "step": 5328, "train/total_loss": 0.12021158635616302 }, { "entropy": 9.02506160736084, "epoch": 0.5268934150682223, "mean_token_accuracy": 0.6881496906280518, "num_tokens": 6917741.0, "step": 5329, "train/ce_loss": 1.7925119400024414 }, { "epoch": 0.5268934150682223, "step": 5329, "train/sim_loss": 0.09765625 }, { "epoch": 0.5268934150682223, "step": 5329, "train/total_loss": 0.27690744400024414 }, { "entropy": 9.574190139770508, "epoch": 0.5269922879177378, "mean_token_accuracy": 0.7754442691802979, "num_tokens": 6922979.0, "step": 5330, "train/ce_loss": 2.2391729999071686e-06 }, { "epoch": 0.5269922879177378, "step": 5330, "train/sim_loss": 0.05859375 }, { "epoch": 0.5269922879177378, "step": 5330, "train/total_loss": 0.05859397351741791 }, { "entropy": 8.93344783782959, "epoch": 0.5270911607672533, "mean_token_accuracy": 0.769138753414154, "num_tokens": 6928305.0, "step": 5331, "train/ce_loss": 0.5848641395568848 }, { "epoch": 0.5270911607672533, "step": 5331, "train/sim_loss": 0.0625 }, { "epoch": 0.5270911607672533, "step": 5331, "train/total_loss": 0.12098641693592072 }, { "entropy": 8.794695854187012, "epoch": 0.5271900336167689, "mean_token_accuracy": 0.7119628190994263, "num_tokens": 6933642.0, "step": 5332, "train/ce_loss": 0.5715578198432922 }, { "epoch": 0.5271900336167689, "step": 5332, "train/sim_loss": 0.05859375 }, { "epoch": 0.5271900336167689, "step": 5332, "train/total_loss": 0.1157495379447937 }, { "entropy": 9.645194053649902, "epoch": 0.5272889064662843, "mean_token_accuracy": 0.7186897993087769, "num_tokens": 6938782.0, "step": 5333, "train/ce_loss": 1.0152158438359038e-06 }, { "epoch": 0.5272889064662843, "step": 5333, "train/sim_loss": 0.015625 }, { "epoch": 0.5272889064662843, "step": 5333, "train/total_loss": 0.015625102445483208 }, { "entropy": 9.530113220214844, "epoch": 0.5273877793157998, "mean_token_accuracy": 0.752212405204773, "num_tokens": 6943810.0, "step": 5334, "train/ce_loss": 0.7390918731689453 }, { "epoch": 0.5273877793157998, "step": 5334, "train/sim_loss": 0.0546875 }, { "epoch": 0.5273877793157998, "step": 5334, "train/total_loss": 0.128596693277359 }, { "entropy": 8.728327751159668, "epoch": 0.5274866521653154, "mean_token_accuracy": 0.7289617657661438, "num_tokens": 6949243.0, "step": 5335, "train/ce_loss": 1.2972229719161987 }, { "epoch": 0.5274866521653154, "step": 5335, "train/sim_loss": 0.0546875 }, { "epoch": 0.5274866521653154, "step": 5335, "train/total_loss": 0.18440979719161987 }, { "entropy": 8.749822616577148, "epoch": 0.5275855250148309, "mean_token_accuracy": 0.7312961220741272, "num_tokens": 6954625.0, "step": 5336, "train/ce_loss": 0.8289675116539001 }, { "epoch": 0.5275855250148309, "step": 5336, "train/sim_loss": 0.0546875 }, { "epoch": 0.5275855250148309, "step": 5336, "train/total_loss": 0.13758425414562225 }, { "entropy": 9.186654090881348, "epoch": 0.5276843978643464, "mean_token_accuracy": 0.7318932414054871, "num_tokens": 6959816.0, "step": 5337, "train/ce_loss": 0.9297966361045837 }, { "epoch": 0.5276843978643464, "step": 5337, "train/sim_loss": 0.0390625 }, { "epoch": 0.5276843978643464, "step": 5337, "train/total_loss": 0.13204216957092285 }, { "entropy": 9.81143856048584, "epoch": 0.527783270713862, "mean_token_accuracy": 0.692307710647583, "num_tokens": 6964574.0, "step": 5338, "train/ce_loss": 2.8371430289553246e-06 }, { "epoch": 0.527783270713862, "step": 5338, "train/sim_loss": 0.0390625 }, { "epoch": 0.527783270713862, "step": 5338, "train/total_loss": 0.03906278312206268 }, { "entropy": 9.589973449707031, "epoch": 0.5278821435633775, "mean_token_accuracy": 0.7482394576072693, "num_tokens": 6969565.0, "step": 5339, "train/ce_loss": 1.5491751432418823 }, { "epoch": 0.5278821435633775, "step": 5339, "train/sim_loss": 0.078125 }, { "epoch": 0.5278821435633775, "step": 5339, "train/total_loss": 0.23304252326488495 }, { "epoch": 0.527981016412893, "grad_norm": 0.6893176436424255, "learning_rate": 8.682440785244524e-06, "loss": 0.1464, "step": 5340 }, { "entropy": 9.429092407226562, "epoch": 0.527981016412893, "mean_token_accuracy": 0.7631999850273132, "num_tokens": 6974619.0, "step": 5340, "train/ce_loss": 1.4807854890823364 }, { "epoch": 0.527981016412893, "step": 5340, "train/sim_loss": 0.0625 }, { "epoch": 0.527981016412893, "step": 5340, "train/total_loss": 0.2105785459280014 }, { "entropy": 8.987339973449707, "epoch": 0.5280798892624086, "mean_token_accuracy": 0.7665505409240723, "num_tokens": 6979942.0, "step": 5341, "train/ce_loss": 0.435159295797348 }, { "epoch": 0.5280798892624086, "step": 5341, "train/sim_loss": 0.02734375 }, { "epoch": 0.5280798892624086, "step": 5341, "train/total_loss": 0.07085968554019928 }, { "entropy": 9.35151481628418, "epoch": 0.528178762111924, "mean_token_accuracy": 0.756533682346344, "num_tokens": 6985119.0, "step": 5342, "train/ce_loss": 1.3392177820205688 }, { "epoch": 0.528178762111924, "step": 5342, "train/sim_loss": 0.0703125 }, { "epoch": 0.528178762111924, "step": 5342, "train/total_loss": 0.2042342871427536 }, { "entropy": 8.658845901489258, "epoch": 0.5282776349614395, "mean_token_accuracy": 0.7789585590362549, "num_tokens": 6990565.0, "step": 5343, "train/ce_loss": 0.3492826521396637 }, { "epoch": 0.5282776349614395, "step": 5343, "train/sim_loss": 0.0390625 }, { "epoch": 0.5282776349614395, "step": 5343, "train/total_loss": 0.07399076223373413 }, { "entropy": 8.939874649047852, "epoch": 0.5283765078109551, "mean_token_accuracy": 0.7493734359741211, "num_tokens": 6995821.0, "step": 5344, "train/ce_loss": 0.746372640132904 }, { "epoch": 0.5283765078109551, "step": 5344, "train/sim_loss": 0.10546875 }, { "epoch": 0.5283765078109551, "step": 5344, "train/total_loss": 0.1801060140132904 }, { "entropy": 8.87545108795166, "epoch": 0.5284753806604706, "mean_token_accuracy": 0.7189781069755554, "num_tokens": 7001091.0, "step": 5345, "train/ce_loss": 1.1034655570983887 }, { "epoch": 0.5284753806604706, "step": 5345, "train/sim_loss": 0.0390625 }, { "epoch": 0.5284753806604706, "step": 5345, "train/total_loss": 0.14940905570983887 }, { "entropy": 9.050783157348633, "epoch": 0.5285742535099861, "mean_token_accuracy": 0.7047619223594666, "num_tokens": 7006373.0, "step": 5346, "train/ce_loss": 0.8942811489105225 }, { "epoch": 0.5285742535099861, "step": 5346, "train/sim_loss": 0.06640625 }, { "epoch": 0.5285742535099861, "step": 5346, "train/total_loss": 0.1558343768119812 }, { "entropy": 8.830314636230469, "epoch": 0.5286731263595017, "mean_token_accuracy": 0.744508683681488, "num_tokens": 7011710.0, "step": 5347, "train/ce_loss": 0.5430724620819092 }, { "epoch": 0.5286731263595017, "step": 5347, "train/sim_loss": 0.0703125 }, { "epoch": 0.5286731263595017, "step": 5347, "train/total_loss": 0.1246197521686554 }, { "entropy": 9.55591106414795, "epoch": 0.5287719992090172, "mean_token_accuracy": 0.7323688864707947, "num_tokens": 7016726.0, "step": 5348, "train/ce_loss": 1.0442057847976685 }, { "epoch": 0.5287719992090172, "step": 5348, "train/sim_loss": 0.078125 }, { "epoch": 0.5287719992090172, "step": 5348, "train/total_loss": 0.18254557251930237 }, { "entropy": 9.241381645202637, "epoch": 0.5288708720585328, "mean_token_accuracy": 0.7426035404205322, "num_tokens": 7021787.0, "step": 5349, "train/ce_loss": 1.3119507684677956e-06 }, { "epoch": 0.5288708720585328, "step": 5349, "train/sim_loss": 0.078125 }, { "epoch": 0.5288708720585328, "step": 5349, "train/total_loss": 0.07812513411045074 }, { "entropy": 8.90439224243164, "epoch": 0.5289697449080483, "mean_token_accuracy": 0.7367021441459656, "num_tokens": 7027049.0, "step": 5350, "train/ce_loss": 0.9882033467292786 }, { "epoch": 0.5289697449080483, "step": 5350, "train/sim_loss": 0.0546875 }, { "epoch": 0.5289697449080483, "step": 5350, "train/total_loss": 0.15350782871246338 }, { "entropy": 9.076597213745117, "epoch": 0.5290686177575638, "mean_token_accuracy": 0.7213695645332336, "num_tokens": 7032379.0, "step": 5351, "train/ce_loss": 0.6717884540557861 }, { "epoch": 0.5290686177575638, "step": 5351, "train/sim_loss": 0.03515625 }, { "epoch": 0.5290686177575638, "step": 5351, "train/total_loss": 0.10233509540557861 }, { "entropy": 9.232297897338867, "epoch": 0.5291674906070793, "mean_token_accuracy": 0.73221755027771, "num_tokens": 7037574.0, "step": 5352, "train/ce_loss": 0.601751983165741 }, { "epoch": 0.5291674906070793, "step": 5352, "train/sim_loss": 0.0390625 }, { "epoch": 0.5291674906070793, "step": 5352, "train/total_loss": 0.09923769533634186 }, { "entropy": 9.587291717529297, "epoch": 0.5292663634565948, "mean_token_accuracy": 0.7495527863502502, "num_tokens": 7042573.0, "step": 5353, "train/ce_loss": 1.2889903783798218 }, { "epoch": 0.5292663634565948, "step": 5353, "train/sim_loss": 0.078125 }, { "epoch": 0.5292663634565948, "step": 5353, "train/total_loss": 0.20702403783798218 }, { "entropy": 9.119341850280762, "epoch": 0.5293652363061103, "mean_token_accuracy": 0.8156911730766296, "num_tokens": 7047852.0, "step": 5354, "train/ce_loss": 0.966998279094696 }, { "epoch": 0.5293652363061103, "step": 5354, "train/sim_loss": 0.078125 }, { "epoch": 0.5293652363061103, "step": 5354, "train/total_loss": 0.17482483386993408 }, { "entropy": 8.981361389160156, "epoch": 0.5294641091556259, "mean_token_accuracy": 0.7072625756263733, "num_tokens": 7053239.0, "step": 5355, "train/ce_loss": 1.3036599159240723 }, { "epoch": 0.5294641091556259, "step": 5355, "train/sim_loss": 0.0625 }, { "epoch": 0.5294641091556259, "step": 5355, "train/total_loss": 0.1928659975528717 }, { "entropy": 9.078542709350586, "epoch": 0.5295629820051414, "mean_token_accuracy": 0.7303493618965149, "num_tokens": 7058558.0, "step": 5356, "train/ce_loss": 1.2763357162475586 }, { "epoch": 0.5295629820051414, "step": 5356, "train/sim_loss": 0.0546875 }, { "epoch": 0.5295629820051414, "step": 5356, "train/total_loss": 0.18232107162475586 }, { "entropy": 9.39804458618164, "epoch": 0.5296618548546569, "mean_token_accuracy": 0.7639344334602356, "num_tokens": 7063629.0, "step": 5357, "train/ce_loss": 1.943985807884019e-06 }, { "epoch": 0.5296618548546569, "step": 5357, "train/sim_loss": 0.0625 }, { "epoch": 0.5296618548546569, "step": 5357, "train/total_loss": 0.06250019371509552 }, { "entropy": 9.844568252563477, "epoch": 0.5297607277041725, "mean_token_accuracy": 0.7413395047187805, "num_tokens": 7068489.0, "step": 5358, "train/ce_loss": 1.7062684297561646 }, { "epoch": 0.5297607277041725, "step": 5358, "train/sim_loss": 0.1015625 }, { "epoch": 0.5297607277041725, "step": 5358, "train/total_loss": 0.27218934893608093 }, { "entropy": 10.298467636108398, "epoch": 0.529859600553688, "mean_token_accuracy": 0.7388888597488403, "num_tokens": 7073075.0, "step": 5359, "train/ce_loss": 9.308042535849381e-06 }, { "epoch": 0.529859600553688, "step": 5359, "train/sim_loss": 0.046875 }, { "epoch": 0.529859600553688, "step": 5359, "train/total_loss": 0.046875931322574615 }, { "epoch": 0.5299584734032035, "grad_norm": 1.147903323173523, "learning_rate": 8.677495920486575e-06, "loss": 0.1397, "step": 5360 }, { "entropy": 9.023246765136719, "epoch": 0.5299584734032035, "mean_token_accuracy": 0.7118881344795227, "num_tokens": 7078321.0, "step": 5360, "train/ce_loss": 0.8227757811546326 }, { "epoch": 0.5299584734032035, "step": 5360, "train/sim_loss": 0.14453125 }, { "epoch": 0.5299584734032035, "step": 5360, "train/total_loss": 0.2268088310956955 }, { "entropy": 8.935810089111328, "epoch": 0.530057346252719, "mean_token_accuracy": 0.7245370149612427, "num_tokens": 7083701.0, "step": 5361, "train/ce_loss": 0.690895140171051 }, { "epoch": 0.530057346252719, "step": 5361, "train/sim_loss": 0.07421875 }, { "epoch": 0.530057346252719, "step": 5361, "train/total_loss": 0.14330826699733734 }, { "entropy": 9.249588012695312, "epoch": 0.5301562191022345, "mean_token_accuracy": 0.6653465628623962, "num_tokens": 7088584.0, "step": 5362, "train/ce_loss": 3.4195520584034966e-06 }, { "epoch": 0.5301562191022345, "step": 5362, "train/sim_loss": 0.0390625 }, { "epoch": 0.5301562191022345, "step": 5362, "train/total_loss": 0.03906284272670746 }, { "entropy": 9.306418418884277, "epoch": 0.53025509195175, "mean_token_accuracy": 0.73893803358078, "num_tokens": 7093689.0, "step": 5363, "train/ce_loss": 1.1926301717758179 }, { "epoch": 0.53025509195175, "step": 5363, "train/sim_loss": 0.03515625 }, { "epoch": 0.53025509195175, "step": 5363, "train/total_loss": 0.15441927313804626 }, { "entropy": 9.027421951293945, "epoch": 0.5303539648012656, "mean_token_accuracy": 0.7839721441268921, "num_tokens": 7099034.0, "step": 5364, "train/ce_loss": 0.5805795788764954 }, { "epoch": 0.5303539648012656, "step": 5364, "train/sim_loss": 0.05078125 }, { "epoch": 0.5303539648012656, "step": 5364, "train/total_loss": 0.10883921384811401 }, { "entropy": 9.983185768127441, "epoch": 0.5304528376507811, "mean_token_accuracy": 0.6724137663841248, "num_tokens": 7103776.0, "step": 5365, "train/ce_loss": 1.7935803953150753e-06 }, { "epoch": 0.5304528376507811, "step": 5365, "train/sim_loss": 0.015625 }, { "epoch": 0.5304528376507811, "step": 5365, "train/total_loss": 0.015625178813934326 }, { "entropy": 9.114487648010254, "epoch": 0.5305517105002966, "mean_token_accuracy": 0.7476882338523865, "num_tokens": 7109028.0, "step": 5366, "train/ce_loss": 1.0102288722991943 }, { "epoch": 0.5305517105002966, "step": 5366, "train/sim_loss": 0.0390625 }, { "epoch": 0.5305517105002966, "step": 5366, "train/total_loss": 0.1400853991508484 }, { "entropy": 9.042924880981445, "epoch": 0.5306505833498122, "mean_token_accuracy": 0.7706935405731201, "num_tokens": 7114372.0, "step": 5367, "train/ce_loss": 0.9071125388145447 }, { "epoch": 0.5306505833498122, "step": 5367, "train/sim_loss": 0.0625 }, { "epoch": 0.5306505833498122, "step": 5367, "train/total_loss": 0.15321126580238342 }, { "entropy": 8.998058319091797, "epoch": 0.5307494561993277, "mean_token_accuracy": 0.7087979912757874, "num_tokens": 7119683.0, "step": 5368, "train/ce_loss": 1.2738776206970215 }, { "epoch": 0.5307494561993277, "step": 5368, "train/sim_loss": 0.046875 }, { "epoch": 0.5307494561993277, "step": 5368, "train/total_loss": 0.17426276206970215 }, { "entropy": 9.279787063598633, "epoch": 0.5308483290488432, "mean_token_accuracy": 0.767169177532196, "num_tokens": 7124710.0, "step": 5369, "train/ce_loss": 0.9067485332489014 }, { "epoch": 0.5308483290488432, "step": 5369, "train/sim_loss": 0.01953125 }, { "epoch": 0.5308483290488432, "step": 5369, "train/total_loss": 0.11020610481500626 }, { "entropy": 9.459781646728516, "epoch": 0.5309472018983588, "mean_token_accuracy": 0.7230046987533569, "num_tokens": 7129771.0, "step": 5370, "train/ce_loss": 1.6413638591766357 }, { "epoch": 0.5309472018983588, "step": 5370, "train/sim_loss": 0.03515625 }, { "epoch": 0.5309472018983588, "step": 5370, "train/total_loss": 0.1992926448583603 }, { "entropy": 9.348337173461914, "epoch": 0.5310460747478742, "mean_token_accuracy": 0.7108238935470581, "num_tokens": 7134836.0, "step": 5371, "train/ce_loss": 1.2921637296676636 }, { "epoch": 0.5310460747478742, "step": 5371, "train/sim_loss": 0.05078125 }, { "epoch": 0.5310460747478742, "step": 5371, "train/total_loss": 0.17999762296676636 }, { "entropy": 8.956254005432129, "epoch": 0.5311449475973897, "mean_token_accuracy": 0.7180910110473633, "num_tokens": 7140224.0, "step": 5372, "train/ce_loss": 1.1430314779281616 }, { "epoch": 0.5311449475973897, "step": 5372, "train/sim_loss": 0.12890625 }, { "epoch": 0.5311449475973897, "step": 5372, "train/total_loss": 0.24320939183235168 }, { "entropy": 9.061437606811523, "epoch": 0.5312438204469053, "mean_token_accuracy": 0.7108886241912842, "num_tokens": 7145494.0, "step": 5373, "train/ce_loss": 1.20707106590271 }, { "epoch": 0.5312438204469053, "step": 5373, "train/sim_loss": 0.03125 }, { "epoch": 0.5312438204469053, "step": 5373, "train/total_loss": 0.15195710957050323 }, { "entropy": 9.329811096191406, "epoch": 0.5313426932964208, "mean_token_accuracy": 0.7623066306114197, "num_tokens": 7150651.0, "step": 5374, "train/ce_loss": 1.5130629539489746 }, { "epoch": 0.5313426932964208, "step": 5374, "train/sim_loss": 0.09765625 }, { "epoch": 0.5313426932964208, "step": 5374, "train/total_loss": 0.24896255135536194 }, { "entropy": 9.067573547363281, "epoch": 0.5314415661459363, "mean_token_accuracy": 0.7314211130142212, "num_tokens": 7155870.0, "step": 5375, "train/ce_loss": 9.660790283305687e-07 }, { "epoch": 0.5314415661459363, "step": 5375, "train/sim_loss": 0.046875 }, { "epoch": 0.5314415661459363, "step": 5375, "train/total_loss": 0.04687509685754776 }, { "entropy": 9.270325660705566, "epoch": 0.5315404389954519, "mean_token_accuracy": 0.7004104256629944, "num_tokens": 7161085.0, "step": 5376, "train/ce_loss": 0.6302452087402344 }, { "epoch": 0.5315404389954519, "step": 5376, "train/sim_loss": 0.0390625 }, { "epoch": 0.5315404389954519, "step": 5376, "train/total_loss": 0.10208702087402344 }, { "entropy": 9.019856452941895, "epoch": 0.5316393118449674, "mean_token_accuracy": 0.7462871074676514, "num_tokens": 7166398.0, "step": 5377, "train/ce_loss": 1.1178202629089355 }, { "epoch": 0.5316393118449674, "step": 5377, "train/sim_loss": 0.08203125 }, { "epoch": 0.5316393118449674, "step": 5377, "train/total_loss": 0.1938132792711258 }, { "entropy": 9.057889938354492, "epoch": 0.5317381846944829, "mean_token_accuracy": 0.7750611305236816, "num_tokens": 7171683.0, "step": 5378, "train/ce_loss": 1.0019845962524414 }, { "epoch": 0.5317381846944829, "step": 5378, "train/sim_loss": 0.10546875 }, { "epoch": 0.5317381846944829, "step": 5378, "train/total_loss": 0.20566721260547638 }, { "entropy": 9.298063278198242, "epoch": 0.5318370575439985, "mean_token_accuracy": 0.7192254662513733, "num_tokens": 7177019.0, "step": 5379, "train/ce_loss": 0.8760280013084412 }, { "epoch": 0.5318370575439985, "step": 5379, "train/sim_loss": 0.1015625 }, { "epoch": 0.5318370575439985, "step": 5379, "train/total_loss": 0.18916529417037964 }, { "epoch": 0.531935930393514, "grad_norm": 0.729664146900177, "learning_rate": 8.672551055728627e-06, "loss": 0.1479, "step": 5380 }, { "entropy": 8.049644470214844, "epoch": 0.531935930393514, "mean_token_accuracy": 0.6997219920158386, "num_tokens": 7182594.0, "step": 5380, "train/ce_loss": 0.6018348336219788 }, { "epoch": 0.531935930393514, "step": 5380, "train/sim_loss": 0.0625 }, { "epoch": 0.531935930393514, "step": 5380, "train/total_loss": 0.12268348038196564 }, { "entropy": 8.955841064453125, "epoch": 0.5320348032430294, "mean_token_accuracy": 0.698060929775238, "num_tokens": 7187783.0, "step": 5381, "train/ce_loss": 0.8994425535202026 }, { "epoch": 0.5320348032430294, "step": 5381, "train/sim_loss": 0.05859375 }, { "epoch": 0.5320348032430294, "step": 5381, "train/total_loss": 0.1485380083322525 }, { "entropy": 9.328859329223633, "epoch": 0.532133676092545, "mean_token_accuracy": 0.7279151678085327, "num_tokens": 7192787.0, "step": 5382, "train/ce_loss": 1.0890889167785645 }, { "epoch": 0.532133676092545, "step": 5382, "train/sim_loss": 0.03515625 }, { "epoch": 0.532133676092545, "step": 5382, "train/total_loss": 0.14406514167785645 }, { "entropy": 9.188087463378906, "epoch": 0.5322325489420605, "mean_token_accuracy": 0.6950182318687439, "num_tokens": 7198064.0, "step": 5383, "train/ce_loss": 1.312862753868103 }, { "epoch": 0.5322325489420605, "step": 5383, "train/sim_loss": 0.05078125 }, { "epoch": 0.5322325489420605, "step": 5383, "train/total_loss": 0.18206752836704254 }, { "entropy": 8.715286254882812, "epoch": 0.532331421791576, "mean_token_accuracy": 0.7806072235107422, "num_tokens": 7203502.0, "step": 5384, "train/ce_loss": 0.5634164214134216 }, { "epoch": 0.532331421791576, "step": 5384, "train/sim_loss": 0.02734375 }, { "epoch": 0.532331421791576, "step": 5384, "train/total_loss": 0.08368539810180664 }, { "entropy": 9.236735343933105, "epoch": 0.5324302946410916, "mean_token_accuracy": 0.6630434989929199, "num_tokens": 7208754.0, "step": 5385, "train/ce_loss": 0.845870316028595 }, { "epoch": 0.5324302946410916, "step": 5385, "train/sim_loss": 0.05078125 }, { "epoch": 0.5324302946410916, "step": 5385, "train/total_loss": 0.13536828756332397 }, { "entropy": 9.177253723144531, "epoch": 0.5325291674906071, "mean_token_accuracy": 0.7102342844009399, "num_tokens": 7214044.0, "step": 5386, "train/ce_loss": 0.7868425250053406 }, { "epoch": 0.5325291674906071, "step": 5386, "train/sim_loss": 0.046875 }, { "epoch": 0.5325291674906071, "step": 5386, "train/total_loss": 0.1255592554807663 }, { "entropy": 9.212908744812012, "epoch": 0.5326280403401226, "mean_token_accuracy": 0.7291960716247559, "num_tokens": 7219250.0, "step": 5387, "train/ce_loss": 0.8648903965950012 }, { "epoch": 0.5326280403401226, "step": 5387, "train/sim_loss": 0.05859375 }, { "epoch": 0.5326280403401226, "step": 5387, "train/total_loss": 0.14508280158042908 }, { "entropy": 9.522449493408203, "epoch": 0.5327269131896382, "mean_token_accuracy": 0.6977124214172363, "num_tokens": 7224314.0, "step": 5388, "train/ce_loss": 7.962746622069972e-07 }, { "epoch": 0.5327269131896382, "step": 5388, "train/sim_loss": 0.02734375 }, { "epoch": 0.5327269131896382, "step": 5388, "train/total_loss": 0.027343830093741417 }, { "entropy": 8.765491485595703, "epoch": 0.5328257860391536, "mean_token_accuracy": 0.7887005805969238, "num_tokens": 7229679.0, "step": 5389, "train/ce_loss": 0.7940881848335266 }, { "epoch": 0.5328257860391536, "step": 5389, "train/sim_loss": 0.0703125 }, { "epoch": 0.5328257860391536, "step": 5389, "train/total_loss": 0.14972132444381714 }, { "entropy": 8.809341430664062, "epoch": 0.5329246588886691, "mean_token_accuracy": 0.7212249040603638, "num_tokens": 7235097.0, "step": 5390, "train/ce_loss": 0.7956955432891846 }, { "epoch": 0.5329246588886691, "step": 5390, "train/sim_loss": 0.06640625 }, { "epoch": 0.5329246588886691, "step": 5390, "train/total_loss": 0.14597579836845398 }, { "entropy": 9.053311347961426, "epoch": 0.5330235317381847, "mean_token_accuracy": 0.782608687877655, "num_tokens": 7240432.0, "step": 5391, "train/ce_loss": 0.7339907884597778 }, { "epoch": 0.5330235317381847, "step": 5391, "train/sim_loss": 0.05078125 }, { "epoch": 0.5330235317381847, "step": 5391, "train/total_loss": 0.12418033182621002 }, { "entropy": 8.960689544677734, "epoch": 0.5331224045877002, "mean_token_accuracy": 0.8105975389480591, "num_tokens": 7245778.0, "step": 5392, "train/ce_loss": 0.5632300972938538 }, { "epoch": 0.5331224045877002, "step": 5392, "train/sim_loss": 0.0546875 }, { "epoch": 0.5331224045877002, "step": 5392, "train/total_loss": 0.11101050674915314 }, { "entropy": 8.973613739013672, "epoch": 0.5332212774372157, "mean_token_accuracy": 0.7176079750061035, "num_tokens": 7251181.0, "step": 5393, "train/ce_loss": 0.6810163259506226 }, { "epoch": 0.5332212774372157, "step": 5393, "train/sim_loss": 0.0234375 }, { "epoch": 0.5332212774372157, "step": 5393, "train/total_loss": 0.09153913706541061 }, { "entropy": 10.398794174194336, "epoch": 0.5333201502867313, "mean_token_accuracy": 0.6315789222717285, "num_tokens": 7255809.0, "step": 5394, "train/ce_loss": 1.6151298041222617e-06 }, { "epoch": 0.5333201502867313, "step": 5394, "train/sim_loss": 0.0234375 }, { "epoch": 0.5333201502867313, "step": 5394, "train/total_loss": 0.023437662050127983 }, { "entropy": 9.192532539367676, "epoch": 0.5334190231362468, "mean_token_accuracy": 0.7105262875556946, "num_tokens": 7261046.0, "step": 5395, "train/ce_loss": 0.6318694353103638 }, { "epoch": 0.5334190231362468, "step": 5395, "train/sim_loss": 0.125 }, { "epoch": 0.5334190231362468, "step": 5395, "train/total_loss": 0.18818694353103638 }, { "entropy": 8.94017505645752, "epoch": 0.5335178959857623, "mean_token_accuracy": 0.7543859481811523, "num_tokens": 7266242.0, "step": 5396, "train/ce_loss": 0.6685409545898438 }, { "epoch": 0.5335178959857623, "step": 5396, "train/sim_loss": 0.046875 }, { "epoch": 0.5335178959857623, "step": 5396, "train/total_loss": 0.1137290969491005 }, { "entropy": 9.322994232177734, "epoch": 0.5336167688352779, "mean_token_accuracy": 0.7636022567749023, "num_tokens": 7271206.0, "step": 5397, "train/ce_loss": 0.7755517363548279 }, { "epoch": 0.5336167688352779, "step": 5397, "train/sim_loss": 0.02734375 }, { "epoch": 0.5336167688352779, "step": 5397, "train/total_loss": 0.10489892214536667 }, { "entropy": 9.49734878540039, "epoch": 0.5337156416847934, "mean_token_accuracy": 0.7454873919487, "num_tokens": 7276177.0, "step": 5398, "train/ce_loss": 1.4217755794525146 }, { "epoch": 0.5337156416847934, "step": 5398, "train/sim_loss": 0.05078125 }, { "epoch": 0.5337156416847934, "step": 5398, "train/total_loss": 0.19295881688594818 }, { "entropy": 9.841611862182617, "epoch": 0.5338145145343088, "mean_token_accuracy": 0.7167919874191284, "num_tokens": 7280977.0, "step": 5399, "train/ce_loss": 1.1508462876008707e-06 }, { "epoch": 0.5338145145343088, "step": 5399, "train/sim_loss": 0.01171875 }, { "epoch": 0.5338145145343088, "step": 5399, "train/total_loss": 0.011718865483999252 }, { "epoch": 0.5339133873838244, "grad_norm": 1.0195847749710083, "learning_rate": 8.667606190970677e-06, "loss": 0.1459, "step": 5400 }, { "entropy": 9.270522117614746, "epoch": 0.5339133873838244, "mean_token_accuracy": 0.7745901346206665, "num_tokens": 7286157.0, "step": 5400, "train/ce_loss": 0.9633910655975342 }, { "epoch": 0.5339133873838244, "step": 5400, "train/sim_loss": 0.0546875 }, { "epoch": 0.5339133873838244, "step": 5400, "train/total_loss": 0.15102660655975342 }, { "entropy": 8.760650634765625, "epoch": 0.5340122602333399, "mean_token_accuracy": 0.7218309640884399, "num_tokens": 7291532.0, "step": 5401, "train/ce_loss": 1.5196077823638916 }, { "epoch": 0.5340122602333399, "step": 5401, "train/sim_loss": 0.046875 }, { "epoch": 0.5340122602333399, "step": 5401, "train/total_loss": 0.19883577525615692 }, { "entropy": 9.711226463317871, "epoch": 0.5341111330828554, "mean_token_accuracy": 0.747474730014801, "num_tokens": 7296457.0, "step": 5402, "train/ce_loss": 2.4552828108426183e-06 }, { "epoch": 0.5341111330828554, "step": 5402, "train/sim_loss": 0.04296875 }, { "epoch": 0.5341111330828554, "step": 5402, "train/total_loss": 0.0429689958691597 }, { "entropy": 10.111250877380371, "epoch": 0.534210005932371, "mean_token_accuracy": 0.71074378490448, "num_tokens": 7301196.0, "step": 5403, "train/ce_loss": 2.5295214653015137 }, { "epoch": 0.534210005932371, "step": 5403, "train/sim_loss": 0.06640625 }, { "epoch": 0.534210005932371, "step": 5403, "train/total_loss": 0.3193584084510803 }, { "entropy": 9.483728408813477, "epoch": 0.5343088787818865, "mean_token_accuracy": 0.7246835231781006, "num_tokens": 7306240.0, "step": 5404, "train/ce_loss": 1.2337623834609985 }, { "epoch": 0.5343088787818865, "step": 5404, "train/sim_loss": 0.05078125 }, { "epoch": 0.5343088787818865, "step": 5404, "train/total_loss": 0.1741575002670288 }, { "entropy": 9.128599166870117, "epoch": 0.534407751631402, "mean_token_accuracy": 0.75, "num_tokens": 7311356.0, "step": 5405, "train/ce_loss": 0.9270554780960083 }, { "epoch": 0.534407751631402, "step": 5405, "train/sim_loss": 0.0703125 }, { "epoch": 0.534407751631402, "step": 5405, "train/total_loss": 0.16301804780960083 }, { "entropy": 9.280820846557617, "epoch": 0.5345066244809176, "mean_token_accuracy": 0.7132459878921509, "num_tokens": 7316523.0, "step": 5406, "train/ce_loss": 1.3654454946517944 }, { "epoch": 0.5345066244809176, "step": 5406, "train/sim_loss": 0.0546875 }, { "epoch": 0.5345066244809176, "step": 5406, "train/total_loss": 0.19123205542564392 }, { "entropy": 8.945045471191406, "epoch": 0.534605497330433, "mean_token_accuracy": 0.7852193713188171, "num_tokens": 7321865.0, "step": 5407, "train/ce_loss": 0.6675707101821899 }, { "epoch": 0.534605497330433, "step": 5407, "train/sim_loss": 0.078125 }, { "epoch": 0.534605497330433, "step": 5407, "train/total_loss": 0.14488208293914795 }, { "entropy": 9.157123565673828, "epoch": 0.5347043701799485, "mean_token_accuracy": 0.6819338202476501, "num_tokens": 7327096.0, "step": 5408, "train/ce_loss": 1.4747229215572588e-06 }, { "epoch": 0.5347043701799485, "step": 5408, "train/sim_loss": 0.06640625 }, { "epoch": 0.5347043701799485, "step": 5408, "train/total_loss": 0.06640639901161194 }, { "entropy": 9.102767944335938, "epoch": 0.5348032430294641, "mean_token_accuracy": 0.7913950681686401, "num_tokens": 7332254.0, "step": 5409, "train/ce_loss": 0.6586277484893799 }, { "epoch": 0.5348032430294641, "step": 5409, "train/sim_loss": 0.046875 }, { "epoch": 0.5348032430294641, "step": 5409, "train/total_loss": 0.11273777484893799 }, { "entropy": 8.996744155883789, "epoch": 0.5349021158789796, "mean_token_accuracy": 0.7413173913955688, "num_tokens": 7337577.0, "step": 5410, "train/ce_loss": 0.6847343444824219 }, { "epoch": 0.5349021158789796, "step": 5410, "train/sim_loss": 0.08984375 }, { "epoch": 0.5349021158789796, "step": 5410, "train/total_loss": 0.1583171784877777 }, { "entropy": 8.977319717407227, "epoch": 0.5350009887284951, "mean_token_accuracy": 0.7600922584533691, "num_tokens": 7342971.0, "step": 5411, "train/ce_loss": 0.6952391266822815 }, { "epoch": 0.5350009887284951, "step": 5411, "train/sim_loss": 0.0234375 }, { "epoch": 0.5350009887284951, "step": 5411, "train/total_loss": 0.09296141564846039 }, { "entropy": 8.610834121704102, "epoch": 0.5350998615780107, "mean_token_accuracy": 0.7481080889701843, "num_tokens": 7348356.0, "step": 5412, "train/ce_loss": 0.9105154871940613 }, { "epoch": 0.5350998615780107, "step": 5412, "train/sim_loss": 0.02734375 }, { "epoch": 0.5350998615780107, "step": 5412, "train/total_loss": 0.11839529871940613 }, { "entropy": 9.360595703125, "epoch": 0.5351987344275262, "mean_token_accuracy": 0.767103374004364, "num_tokens": 7353507.0, "step": 5413, "train/ce_loss": 1.1175366640090942 }, { "epoch": 0.5351987344275262, "step": 5413, "train/sim_loss": 0.0390625 }, { "epoch": 0.5351987344275262, "step": 5413, "train/total_loss": 0.1508161723613739 }, { "entropy": 9.405769348144531, "epoch": 0.5352976072770417, "mean_token_accuracy": 0.7421602606773376, "num_tokens": 7358524.0, "step": 5414, "train/ce_loss": 0.7853061556816101 }, { "epoch": 0.5352976072770417, "step": 5414, "train/sim_loss": 0.03125 }, { "epoch": 0.5352976072770417, "step": 5414, "train/total_loss": 0.10978061705827713 }, { "entropy": 8.846992492675781, "epoch": 0.5353964801265573, "mean_token_accuracy": 0.7455882430076599, "num_tokens": 7363711.0, "step": 5415, "train/ce_loss": 1.6724973917007446 }, { "epoch": 0.5353964801265573, "step": 5415, "train/sim_loss": 0.0234375 }, { "epoch": 0.5353964801265573, "step": 5415, "train/total_loss": 0.19068723917007446 }, { "entropy": 8.983139991760254, "epoch": 0.5354953529760728, "mean_token_accuracy": 0.7798658013343811, "num_tokens": 7368920.0, "step": 5416, "train/ce_loss": 1.1571681852728943e-06 }, { "epoch": 0.5354953529760728, "step": 5416, "train/sim_loss": 0.05078125 }, { "epoch": 0.5354953529760728, "step": 5416, "train/total_loss": 0.05078136548399925 }, { "entropy": 9.146753311157227, "epoch": 0.5355942258255882, "mean_token_accuracy": 0.7410072088241577, "num_tokens": 7374226.0, "step": 5417, "train/ce_loss": 0.9910473227500916 }, { "epoch": 0.5355942258255882, "step": 5417, "train/sim_loss": 0.08203125 }, { "epoch": 0.5355942258255882, "step": 5417, "train/total_loss": 0.18113598227500916 }, { "entropy": 8.94820499420166, "epoch": 0.5356930986751038, "mean_token_accuracy": 0.7194163799285889, "num_tokens": 7379585.0, "step": 5418, "train/ce_loss": 0.9936345219612122 }, { "epoch": 0.5356930986751038, "step": 5418, "train/sim_loss": 0.03515625 }, { "epoch": 0.5356930986751038, "step": 5418, "train/total_loss": 0.13451969623565674 }, { "entropy": 9.583169937133789, "epoch": 0.5357919715246193, "mean_token_accuracy": 0.803108811378479, "num_tokens": 7384551.0, "step": 5419, "train/ce_loss": 0.7731736302375793 }, { "epoch": 0.5357919715246193, "step": 5419, "train/sim_loss": 0.0546875 }, { "epoch": 0.5357919715246193, "step": 5419, "train/total_loss": 0.13200485706329346 }, { "epoch": 0.5358908443741348, "grad_norm": 0.5803881287574768, "learning_rate": 8.66266132621273e-06, "loss": 0.1316, "step": 5420 }, { "entropy": 8.793757438659668, "epoch": 0.5358908443741348, "mean_token_accuracy": 0.758368194103241, "num_tokens": 7390020.0, "step": 5420, "train/ce_loss": 0.31864622235298157 }, { "epoch": 0.5358908443741348, "step": 5420, "train/sim_loss": 0.0703125 }, { "epoch": 0.5358908443741348, "step": 5420, "train/total_loss": 0.10217712819576263 }, { "entropy": 9.193429946899414, "epoch": 0.5359897172236504, "mean_token_accuracy": 0.7450722455978394, "num_tokens": 7395276.0, "step": 5421, "train/ce_loss": 0.626777172088623 }, { "epoch": 0.5359897172236504, "step": 5421, "train/sim_loss": 0.0546875 }, { "epoch": 0.5359897172236504, "step": 5421, "train/total_loss": 0.11736521869897842 }, { "entropy": 9.44847583770752, "epoch": 0.5360885900731659, "mean_token_accuracy": 0.7686212658882141, "num_tokens": 7400331.0, "step": 5422, "train/ce_loss": 1.4848267255729297e-06 }, { "epoch": 0.5360885900731659, "step": 5422, "train/sim_loss": 0.05078125 }, { "epoch": 0.5360885900731659, "step": 5422, "train/total_loss": 0.05078139901161194 }, { "entropy": 9.161100387573242, "epoch": 0.5361874629226814, "mean_token_accuracy": 0.6960784196853638, "num_tokens": 7405450.0, "step": 5423, "train/ce_loss": 6.965454986129771e-07 }, { "epoch": 0.5361874629226814, "step": 5423, "train/sim_loss": 0.01953125 }, { "epoch": 0.5361874629226814, "step": 5423, "train/total_loss": 0.01953131891787052 }, { "entropy": 8.943927764892578, "epoch": 0.536286335772197, "mean_token_accuracy": 0.7231638431549072, "num_tokens": 7410778.0, "step": 5424, "train/ce_loss": 0.6913469433784485 }, { "epoch": 0.536286335772197, "step": 5424, "train/sim_loss": 0.04296875 }, { "epoch": 0.536286335772197, "step": 5424, "train/total_loss": 0.11210344731807709 }, { "entropy": 9.350214958190918, "epoch": 0.5363852086217125, "mean_token_accuracy": 0.6972860097885132, "num_tokens": 7415717.0, "step": 5425, "train/ce_loss": 1.726108166621998e-06 }, { "epoch": 0.5363852086217125, "step": 5425, "train/sim_loss": 0.0390625 }, { "epoch": 0.5363852086217125, "step": 5425, "train/total_loss": 0.03906267136335373 }, { "entropy": 8.813923835754395, "epoch": 0.536484081471228, "mean_token_accuracy": 0.7530864477157593, "num_tokens": 7421082.0, "step": 5426, "train/ce_loss": 0.9239147901535034 }, { "epoch": 0.536484081471228, "step": 5426, "train/sim_loss": 0.0546875 }, { "epoch": 0.536484081471228, "step": 5426, "train/total_loss": 0.1470789909362793 }, { "entropy": 8.823338508605957, "epoch": 0.5365829543207435, "mean_token_accuracy": 0.7474518418312073, "num_tokens": 7426419.0, "step": 5427, "train/ce_loss": 0.8750184774398804 }, { "epoch": 0.5365829543207435, "step": 5427, "train/sim_loss": 0.08203125 }, { "epoch": 0.5365829543207435, "step": 5427, "train/total_loss": 0.16953310370445251 }, { "entropy": 9.309595108032227, "epoch": 0.536681827170259, "mean_token_accuracy": 0.7337559461593628, "num_tokens": 7431482.0, "step": 5428, "train/ce_loss": 0.5525712370872498 }, { "epoch": 0.536681827170259, "step": 5428, "train/sim_loss": 0.0859375 }, { "epoch": 0.536681827170259, "step": 5428, "train/total_loss": 0.14119462668895721 }, { "entropy": 8.547271728515625, "epoch": 0.5367807000197745, "mean_token_accuracy": 0.7235932946205139, "num_tokens": 7437169.0, "step": 5429, "train/ce_loss": 0.5903127789497375 }, { "epoch": 0.5367807000197745, "step": 5429, "train/sim_loss": 0.171875 }, { "epoch": 0.5367807000197745, "step": 5429, "train/total_loss": 0.23090627789497375 }, { "entropy": 8.891470909118652, "epoch": 0.5368795728692901, "mean_token_accuracy": 0.7647702693939209, "num_tokens": 7442568.0, "step": 5430, "train/ce_loss": 0.40449684858322144 }, { "epoch": 0.5368795728692901, "step": 5430, "train/sim_loss": 0.0234375 }, { "epoch": 0.5368795728692901, "step": 5430, "train/total_loss": 0.06388718634843826 }, { "entropy": 8.927657127380371, "epoch": 0.5369784457188056, "mean_token_accuracy": 0.7164790034294128, "num_tokens": 7447986.0, "step": 5431, "train/ce_loss": 1.090582013130188 }, { "epoch": 0.5369784457188056, "step": 5431, "train/sim_loss": 0.05859375 }, { "epoch": 0.5369784457188056, "step": 5431, "train/total_loss": 0.1676519513130188 }, { "entropy": 9.070655822753906, "epoch": 0.5370773185683212, "mean_token_accuracy": 0.7209653258323669, "num_tokens": 7453096.0, "step": 5432, "train/ce_loss": 1.8256173133850098 }, { "epoch": 0.5370773185683212, "step": 5432, "train/sim_loss": 0.07421875 }, { "epoch": 0.5370773185683212, "step": 5432, "train/total_loss": 0.2567805051803589 }, { "entropy": 9.026044845581055, "epoch": 0.5371761914178367, "mean_token_accuracy": 0.7105590105056763, "num_tokens": 7458387.0, "step": 5433, "train/ce_loss": 0.8001567721366882 }, { "epoch": 0.5371761914178367, "step": 5433, "train/sim_loss": 0.03125 }, { "epoch": 0.5371761914178367, "step": 5433, "train/total_loss": 0.11126568168401718 }, { "entropy": 9.867725372314453, "epoch": 0.5372750642673522, "mean_token_accuracy": 0.7709359526634216, "num_tokens": 7463151.0, "step": 5434, "train/ce_loss": 3.672146021926892e-06 }, { "epoch": 0.5372750642673522, "step": 5434, "train/sim_loss": 0.0546875 }, { "epoch": 0.5372750642673522, "step": 5434, "train/total_loss": 0.05468786880373955 }, { "entropy": 9.443593978881836, "epoch": 0.5373739371168678, "mean_token_accuracy": 0.7107023596763611, "num_tokens": 7468198.0, "step": 5435, "train/ce_loss": 1.6333290338516235 }, { "epoch": 0.5373739371168678, "step": 5435, "train/sim_loss": 0.0859375 }, { "epoch": 0.5373739371168678, "step": 5435, "train/total_loss": 0.24927040934562683 }, { "entropy": 8.999428749084473, "epoch": 0.5374728099663832, "mean_token_accuracy": 0.8150510191917419, "num_tokens": 7473460.0, "step": 5436, "train/ce_loss": 0.6773163676261902 }, { "epoch": 0.5374728099663832, "step": 5436, "train/sim_loss": 0.04296875 }, { "epoch": 0.5374728099663832, "step": 5436, "train/total_loss": 0.11070039123296738 }, { "entropy": 9.100971221923828, "epoch": 0.5375716828158987, "mean_token_accuracy": 0.7493606209754944, "num_tokens": 7478680.0, "step": 5437, "train/ce_loss": 0.798964262008667 }, { "epoch": 0.5375716828158987, "step": 5437, "train/sim_loss": 0.03515625 }, { "epoch": 0.5375716828158987, "step": 5437, "train/total_loss": 0.11505267769098282 }, { "entropy": 9.791324615478516, "epoch": 0.5376705556654143, "mean_token_accuracy": 0.767241358757019, "num_tokens": 7483522.0, "step": 5438, "train/ce_loss": 1.0660107135772705 }, { "epoch": 0.5376705556654143, "step": 5438, "train/sim_loss": 0.0625 }, { "epoch": 0.5376705556654143, "step": 5438, "train/total_loss": 0.1691010743379593 }, { "entropy": 9.7655611038208, "epoch": 0.5377694285149298, "mean_token_accuracy": 0.7222222089767456, "num_tokens": 7488401.0, "step": 5439, "train/ce_loss": 0.7153486013412476 }, { "epoch": 0.5377694285149298, "step": 5439, "train/sim_loss": 0.05859375 }, { "epoch": 0.5377694285149298, "step": 5439, "train/total_loss": 0.1301286220550537 }, { "epoch": 0.5378683013644453, "grad_norm": 0.9586021900177002, "learning_rate": 8.65771646145478e-06, "loss": 0.1486, "step": 5440 }, { "entropy": 8.779112815856934, "epoch": 0.5378683013644453, "mean_token_accuracy": 0.7090336084365845, "num_tokens": 7493878.0, "step": 5440, "train/ce_loss": 1.2352663278579712 }, { "epoch": 0.5378683013644453, "step": 5440, "train/sim_loss": 0.09765625 }, { "epoch": 0.5378683013644453, "step": 5440, "train/total_loss": 0.22118288278579712 }, { "entropy": 9.081262588500977, "epoch": 0.5379671742139609, "mean_token_accuracy": 0.7427241206169128, "num_tokens": 7499203.0, "step": 5441, "train/ce_loss": 0.5701096653938293 }, { "epoch": 0.5379671742139609, "step": 5441, "train/sim_loss": 0.07421875 }, { "epoch": 0.5379671742139609, "step": 5441, "train/total_loss": 0.1312297135591507 }, { "entropy": 9.301429748535156, "epoch": 0.5380660470634764, "mean_token_accuracy": 0.8223140239715576, "num_tokens": 7504379.0, "step": 5442, "train/ce_loss": 0.6167430281639099 }, { "epoch": 0.5380660470634764, "step": 5442, "train/sim_loss": 0.01953125 }, { "epoch": 0.5380660470634764, "step": 5442, "train/total_loss": 0.08120555430650711 }, { "entropy": 8.818253517150879, "epoch": 0.5381649199129919, "mean_token_accuracy": 0.7458704113960266, "num_tokens": 7509628.0, "step": 5443, "train/ce_loss": 1.226873755455017 }, { "epoch": 0.5381649199129919, "step": 5443, "train/sim_loss": 0.0390625 }, { "epoch": 0.5381649199129919, "step": 5443, "train/total_loss": 0.16174986958503723 }, { "entropy": 8.970584869384766, "epoch": 0.5382637927625075, "mean_token_accuracy": 0.7167070508003235, "num_tokens": 7515002.0, "step": 5444, "train/ce_loss": 0.552111029624939 }, { "epoch": 0.5382637927625075, "step": 5444, "train/sim_loss": 0.02734375 }, { "epoch": 0.5382637927625075, "step": 5444, "train/total_loss": 0.08255485445261002 }, { "entropy": 10.215428352355957, "epoch": 0.538362665612023, "mean_token_accuracy": 0.6946107745170593, "num_tokens": 7519579.0, "step": 5445, "train/ce_loss": 6.035854312358424e-06 }, { "epoch": 0.538362665612023, "step": 5445, "train/sim_loss": 0.046875 }, { "epoch": 0.538362665612023, "step": 5445, "train/total_loss": 0.04687560349702835 }, { "entropy": 9.030383110046387, "epoch": 0.5384615384615384, "mean_token_accuracy": 0.7371794581413269, "num_tokens": 7524659.0, "step": 5446, "train/ce_loss": 1.2019760333714657e-06 }, { "epoch": 0.5384615384615384, "step": 5446, "train/sim_loss": 0.0546875 }, { "epoch": 0.5384615384615384, "step": 5446, "train/total_loss": 0.05468761920928955 }, { "entropy": 9.103740692138672, "epoch": 0.538560411311054, "mean_token_accuracy": 0.7551282048225403, "num_tokens": 7529886.0, "step": 5447, "train/ce_loss": 0.6153397560119629 }, { "epoch": 0.538560411311054, "step": 5447, "train/sim_loss": 0.046875 }, { "epoch": 0.538560411311054, "step": 5447, "train/total_loss": 0.10840897262096405 }, { "entropy": 8.652833938598633, "epoch": 0.5386592841605695, "mean_token_accuracy": 0.7880377769470215, "num_tokens": 7535394.0, "step": 5448, "train/ce_loss": 0.5609210729598999 }, { "epoch": 0.5386592841605695, "step": 5448, "train/sim_loss": 0.03125 }, { "epoch": 0.5386592841605695, "step": 5448, "train/total_loss": 0.08734211325645447 }, { "entropy": 9.259515762329102, "epoch": 0.538758157010085, "mean_token_accuracy": 0.751288652420044, "num_tokens": 7540611.0, "step": 5449, "train/ce_loss": 1.5452930927276611 }, { "epoch": 0.538758157010085, "step": 5449, "train/sim_loss": 0.0859375 }, { "epoch": 0.538758157010085, "step": 5449, "train/total_loss": 0.24046681821346283 }, { "entropy": 8.676910400390625, "epoch": 0.5388570298596006, "mean_token_accuracy": 0.740818440914154, "num_tokens": 7545979.0, "step": 5450, "train/ce_loss": 1.0719397068023682 }, { "epoch": 0.5388570298596006, "step": 5450, "train/sim_loss": 0.05859375 }, { "epoch": 0.5388570298596006, "step": 5450, "train/total_loss": 0.1657877266407013 }, { "entropy": 8.894207000732422, "epoch": 0.5389559027091161, "mean_token_accuracy": 0.7266272306442261, "num_tokens": 7551300.0, "step": 5451, "train/ce_loss": 0.7713016271591187 }, { "epoch": 0.5389559027091161, "step": 5451, "train/sim_loss": 0.04296875 }, { "epoch": 0.5389559027091161, "step": 5451, "train/total_loss": 0.12009891122579575 }, { "entropy": 9.597565650939941, "epoch": 0.5390547755586316, "mean_token_accuracy": 0.7495291829109192, "num_tokens": 7556265.0, "step": 5452, "train/ce_loss": 0.6753806471824646 }, { "epoch": 0.5390547755586316, "step": 5452, "train/sim_loss": 0.078125 }, { "epoch": 0.5390547755586316, "step": 5452, "train/total_loss": 0.1456630676984787 }, { "entropy": 9.219755172729492, "epoch": 0.5391536484081472, "mean_token_accuracy": 0.7556818127632141, "num_tokens": 7561428.0, "step": 5453, "train/ce_loss": 1.0690592527389526 }, { "epoch": 0.5391536484081472, "step": 5453, "train/sim_loss": 0.0546875 }, { "epoch": 0.5391536484081472, "step": 5453, "train/total_loss": 0.16159343719482422 }, { "entropy": 8.821651458740234, "epoch": 0.5392525212576627, "mean_token_accuracy": 0.7505399584770203, "num_tokens": 7566844.0, "step": 5454, "train/ce_loss": 0.6399568915367126 }, { "epoch": 0.5392525212576627, "step": 5454, "train/sim_loss": 0.0234375 }, { "epoch": 0.5392525212576627, "step": 5454, "train/total_loss": 0.08743318915367126 }, { "entropy": 9.432109832763672, "epoch": 0.5393513941071781, "mean_token_accuracy": 0.730215847492218, "num_tokens": 7571766.0, "step": 5455, "train/ce_loss": 1.1695923805236816 }, { "epoch": 0.5393513941071781, "step": 5455, "train/sim_loss": 0.0625 }, { "epoch": 0.5393513941071781, "step": 5455, "train/total_loss": 0.17945924401283264 }, { "entropy": 8.712631225585938, "epoch": 0.5394502669566937, "mean_token_accuracy": 0.7405140995979309, "num_tokens": 7577428.0, "step": 5456, "train/ce_loss": 0.5700088143348694 }, { "epoch": 0.5394502669566937, "step": 5456, "train/sim_loss": 0.1171875 }, { "epoch": 0.5394502669566937, "step": 5456, "train/total_loss": 0.17418837547302246 }, { "entropy": 9.612346649169922, "epoch": 0.5395491398062092, "mean_token_accuracy": 0.7223300933837891, "num_tokens": 7582393.0, "step": 5457, "train/ce_loss": 1.0203193596680649e-06 }, { "epoch": 0.5395491398062092, "step": 5457, "train/sim_loss": 0.01953125 }, { "epoch": 0.5395491398062092, "step": 5457, "train/total_loss": 0.019531352445483208 }, { "entropy": 8.942792892456055, "epoch": 0.5396480126557247, "mean_token_accuracy": 0.6880733966827393, "num_tokens": 7587664.0, "step": 5458, "train/ce_loss": 1.8375366926193237 }, { "epoch": 0.5396480126557247, "step": 5458, "train/sim_loss": 0.1015625 }, { "epoch": 0.5396480126557247, "step": 5458, "train/total_loss": 0.2853161692619324 }, { "entropy": 9.137945175170898, "epoch": 0.5397468855052403, "mean_token_accuracy": 0.7172932624816895, "num_tokens": 7592807.0, "step": 5459, "train/ce_loss": 1.0247173309326172 }, { "epoch": 0.5397468855052403, "step": 5459, "train/sim_loss": 0.0234375 }, { "epoch": 0.5397468855052403, "step": 5459, "train/total_loss": 0.1259092390537262 }, { "epoch": 0.5398457583547558, "grad_norm": 0.7609133124351501, "learning_rate": 8.65277159669683e-06, "loss": 0.1393, "step": 5460 }, { "entropy": 9.026578903198242, "epoch": 0.5398457583547558, "mean_token_accuracy": 0.6619718074798584, "num_tokens": 7598038.0, "step": 5460, "train/ce_loss": 1.3797636029266869e-06 }, { "epoch": 0.5398457583547558, "step": 5460, "train/sim_loss": 0.03125 }, { "epoch": 0.5398457583547558, "step": 5460, "train/total_loss": 0.03125013783574104 }, { "entropy": 8.902261734008789, "epoch": 0.5399446312042713, "mean_token_accuracy": 0.7545661926269531, "num_tokens": 7603388.0, "step": 5461, "train/ce_loss": 0.5844436287879944 }, { "epoch": 0.5399446312042713, "step": 5461, "train/sim_loss": 0.015625 }, { "epoch": 0.5399446312042713, "step": 5461, "train/total_loss": 0.07406936585903168 }, { "entropy": 9.208501815795898, "epoch": 0.5400435040537869, "mean_token_accuracy": 0.6607142686843872, "num_tokens": 7608472.0, "step": 5462, "train/ce_loss": 1.7073651552200317 }, { "epoch": 0.5400435040537869, "step": 5462, "train/sim_loss": 0.0625 }, { "epoch": 0.5400435040537869, "step": 5462, "train/total_loss": 0.23323652148246765 }, { "entropy": 9.232449531555176, "epoch": 0.5401423769033024, "mean_token_accuracy": 0.7136929631233215, "num_tokens": 7613602.0, "step": 5463, "train/ce_loss": 1.0019513368606567 }, { "epoch": 0.5401423769033024, "step": 5463, "train/sim_loss": 0.0703125 }, { "epoch": 0.5401423769033024, "step": 5463, "train/total_loss": 0.17050763964653015 }, { "entropy": 9.604730606079102, "epoch": 0.5402412497528178, "mean_token_accuracy": 0.7513914704322815, "num_tokens": 7618566.0, "step": 5464, "train/ce_loss": 0.5866101384162903 }, { "epoch": 0.5402412497528178, "step": 5464, "train/sim_loss": 0.07421875 }, { "epoch": 0.5402412497528178, "step": 5464, "train/total_loss": 0.13287976384162903 }, { "entropy": 9.419134140014648, "epoch": 0.5403401226023334, "mean_token_accuracy": 0.778388261795044, "num_tokens": 7623545.0, "step": 5465, "train/ce_loss": 1.16068696975708 }, { "epoch": 0.5403401226023334, "step": 5465, "train/sim_loss": 0.01953125 }, { "epoch": 0.5403401226023334, "step": 5465, "train/total_loss": 0.13559994101524353 }, { "entropy": 9.015172958374023, "epoch": 0.5404389954518489, "mean_token_accuracy": 0.7227949500083923, "num_tokens": 7629071.0, "step": 5466, "train/ce_loss": 1.0439870357513428 }, { "epoch": 0.5404389954518489, "step": 5466, "train/sim_loss": 0.0546875 }, { "epoch": 0.5404389954518489, "step": 5466, "train/total_loss": 0.1590861976146698 }, { "entropy": 9.223745346069336, "epoch": 0.5405378683013644, "mean_token_accuracy": 0.7883755564689636, "num_tokens": 7634255.0, "step": 5467, "train/ce_loss": 0.5337609052658081 }, { "epoch": 0.5405378683013644, "step": 5467, "train/sim_loss": 0.0390625 }, { "epoch": 0.5405378683013644, "step": 5467, "train/total_loss": 0.09243859350681305 }, { "entropy": 9.87230110168457, "epoch": 0.54063674115088, "mean_token_accuracy": 0.6977329850196838, "num_tokens": 7639037.0, "step": 5468, "train/ce_loss": 2.147436816812842e-06 }, { "epoch": 0.54063674115088, "step": 5468, "train/sim_loss": 0.03515625 }, { "epoch": 0.54063674115088, "step": 5468, "train/total_loss": 0.03515646606683731 }, { "entropy": 9.589224815368652, "epoch": 0.5407356140003955, "mean_token_accuracy": 0.7479507923126221, "num_tokens": 7643967.0, "step": 5469, "train/ce_loss": 1.1842515732496395e-06 }, { "epoch": 0.5407356140003955, "step": 5469, "train/sim_loss": 0.02734375 }, { "epoch": 0.5407356140003955, "step": 5469, "train/total_loss": 0.02734386920928955 }, { "entropy": 9.101972579956055, "epoch": 0.540834486849911, "mean_token_accuracy": 0.7326589822769165, "num_tokens": 7649081.0, "step": 5470, "train/ce_loss": 1.2851073741912842 }, { "epoch": 0.540834486849911, "step": 5470, "train/sim_loss": 0.05859375 }, { "epoch": 0.540834486849911, "step": 5470, "train/total_loss": 0.1871044933795929 }, { "entropy": 9.178415298461914, "epoch": 0.5409333596994266, "mean_token_accuracy": 0.7444608807563782, "num_tokens": 7654207.0, "step": 5471, "train/ce_loss": 0.9581825137138367 }, { "epoch": 0.5409333596994266, "step": 5471, "train/sim_loss": 0.02734375 }, { "epoch": 0.5409333596994266, "step": 5471, "train/total_loss": 0.12316200137138367 }, { "entropy": 8.825922966003418, "epoch": 0.5410322325489421, "mean_token_accuracy": 0.7145946025848389, "num_tokens": 7659633.0, "step": 5472, "train/ce_loss": 1.07634437084198 }, { "epoch": 0.5410322325489421, "step": 5472, "train/sim_loss": 0.08203125 }, { "epoch": 0.5410322325489421, "step": 5472, "train/total_loss": 0.18966569006443024 }, { "entropy": 9.041023254394531, "epoch": 0.5411311053984575, "mean_token_accuracy": 0.801001250743866, "num_tokens": 7664920.0, "step": 5473, "train/ce_loss": 0.6893028020858765 }, { "epoch": 0.5411311053984575, "step": 5473, "train/sim_loss": 0.015625 }, { "epoch": 0.5411311053984575, "step": 5473, "train/total_loss": 0.08455528318881989 }, { "entropy": 9.221052169799805, "epoch": 0.5412299782479731, "mean_token_accuracy": 0.7625330090522766, "num_tokens": 7670148.0, "step": 5474, "train/ce_loss": 0.7779694199562073 }, { "epoch": 0.5412299782479731, "step": 5474, "train/sim_loss": 0.01953125 }, { "epoch": 0.5412299782479731, "step": 5474, "train/total_loss": 0.09732819348573685 }, { "entropy": 8.973783493041992, "epoch": 0.5413288510974886, "mean_token_accuracy": 0.7426981925964355, "num_tokens": 7675305.0, "step": 5475, "train/ce_loss": 0.9248960018157959 }, { "epoch": 0.5413288510974886, "step": 5475, "train/sim_loss": 0.0625 }, { "epoch": 0.5413288510974886, "step": 5475, "train/total_loss": 0.1549896001815796 }, { "entropy": 8.819339752197266, "epoch": 0.5414277239470041, "mean_token_accuracy": 0.754478394985199, "num_tokens": 7680705.0, "step": 5476, "train/ce_loss": 0.8168224692344666 }, { "epoch": 0.5414277239470041, "step": 5476, "train/sim_loss": 0.03125 }, { "epoch": 0.5414277239470041, "step": 5476, "train/total_loss": 0.1129322499036789 }, { "entropy": 9.046710968017578, "epoch": 0.5415265967965197, "mean_token_accuracy": 0.7265353202819824, "num_tokens": 7686073.0, "step": 5477, "train/ce_loss": 0.7705683708190918 }, { "epoch": 0.5415265967965197, "step": 5477, "train/sim_loss": 0.109375 }, { "epoch": 0.5415265967965197, "step": 5477, "train/total_loss": 0.18643184006214142 }, { "entropy": 9.227375030517578, "epoch": 0.5416254696460352, "mean_token_accuracy": 0.7328858971595764, "num_tokens": 7691261.0, "step": 5478, "train/ce_loss": 9.011659471980238e-07 }, { "epoch": 0.5416254696460352, "step": 5478, "train/sim_loss": 0.015625 }, { "epoch": 0.5416254696460352, "step": 5478, "train/total_loss": 0.015625089406967163 }, { "entropy": 8.970365524291992, "epoch": 0.5417243424955507, "mean_token_accuracy": 0.747178316116333, "num_tokens": 7696551.0, "step": 5479, "train/ce_loss": 0.7276411652565002 }, { "epoch": 0.5417243424955507, "step": 5479, "train/sim_loss": 0.0859375 }, { "epoch": 0.5417243424955507, "step": 5479, "train/total_loss": 0.15870162844657898 }, { "epoch": 0.5418232153450663, "grad_norm": 0.612647533416748, "learning_rate": 8.647826731938883e-06, "loss": 0.1365, "step": 5480 }, { "entropy": 9.256315231323242, "epoch": 0.5418232153450663, "mean_token_accuracy": 0.6985583305358887, "num_tokens": 7701738.0, "step": 5480, "train/ce_loss": 0.77440345287323 }, { "epoch": 0.5418232153450663, "step": 5480, "train/sim_loss": 0.0390625 }, { "epoch": 0.5418232153450663, "step": 5480, "train/total_loss": 0.11650284379720688 }, { "entropy": 9.61116886138916, "epoch": 0.5419220881945818, "mean_token_accuracy": 0.774946928024292, "num_tokens": 7706607.0, "step": 5481, "train/ce_loss": 2.2646183879260207e-06 }, { "epoch": 0.5419220881945818, "step": 5481, "train/sim_loss": 0.0390625 }, { "epoch": 0.5419220881945818, "step": 5481, "train/total_loss": 0.039062727242708206 }, { "entropy": 9.658862113952637, "epoch": 0.5420209610440972, "mean_token_accuracy": 0.8372092843055725, "num_tokens": 7711598.0, "step": 5482, "train/ce_loss": 2.0505208340182435e-06 }, { "epoch": 0.5420209610440972, "step": 5482, "train/sim_loss": 0.01171875 }, { "epoch": 0.5420209610440972, "step": 5482, "train/total_loss": 0.011718954890966415 }, { "entropy": 9.218994140625, "epoch": 0.5421198338936128, "mean_token_accuracy": 0.7635869383811951, "num_tokens": 7716768.0, "step": 5483, "train/ce_loss": 1.0125045776367188 }, { "epoch": 0.5421198338936128, "step": 5483, "train/sim_loss": 0.1171875 }, { "epoch": 0.5421198338936128, "step": 5483, "train/total_loss": 0.21843796968460083 }, { "entropy": 9.483529090881348, "epoch": 0.5422187067431283, "mean_token_accuracy": 0.7338345646858215, "num_tokens": 7721835.0, "step": 5484, "train/ce_loss": 0.6087969541549683 }, { "epoch": 0.5422187067431283, "step": 5484, "train/sim_loss": 0.06640625 }, { "epoch": 0.5422187067431283, "step": 5484, "train/total_loss": 0.1272859424352646 }, { "entropy": 9.390131950378418, "epoch": 0.5423175795926438, "mean_token_accuracy": 0.6901840567588806, "num_tokens": 7726922.0, "step": 5485, "train/ce_loss": 1.0740700960159302 }, { "epoch": 0.5423175795926438, "step": 5485, "train/sim_loss": 0.05078125 }, { "epoch": 0.5423175795926438, "step": 5485, "train/total_loss": 0.15818825364112854 }, { "entropy": 9.398968696594238, "epoch": 0.5424164524421594, "mean_token_accuracy": 0.8065395355224609, "num_tokens": 7732264.0, "step": 5486, "train/ce_loss": 2.0825088995479746e-06 }, { "epoch": 0.5424164524421594, "step": 5486, "train/sim_loss": 0.0625 }, { "epoch": 0.5424164524421594, "step": 5486, "train/total_loss": 0.06250020861625671 }, { "entropy": 8.707170486450195, "epoch": 0.5425153252916749, "mean_token_accuracy": 0.7239958643913269, "num_tokens": 7737670.0, "step": 5487, "train/ce_loss": 0.9993120431900024 }, { "epoch": 0.5425153252916749, "step": 5487, "train/sim_loss": 0.0625 }, { "epoch": 0.5425153252916749, "step": 5487, "train/total_loss": 0.16243121027946472 }, { "entropy": 9.389225006103516, "epoch": 0.5426141981411904, "mean_token_accuracy": 0.816216230392456, "num_tokens": 7742673.0, "step": 5488, "train/ce_loss": 0.7702173590660095 }, { "epoch": 0.5426141981411904, "step": 5488, "train/sim_loss": 0.01953125 }, { "epoch": 0.5426141981411904, "step": 5488, "train/total_loss": 0.09655299037694931 }, { "entropy": 8.623525619506836, "epoch": 0.542713070990706, "mean_token_accuracy": 0.7491961121559143, "num_tokens": 7748137.0, "step": 5489, "train/ce_loss": 0.44330134987831116 }, { "epoch": 0.542713070990706, "step": 5489, "train/sim_loss": 0.0625 }, { "epoch": 0.542713070990706, "step": 5489, "train/total_loss": 0.10683013498783112 }, { "entropy": 9.485376358032227, "epoch": 0.5428119438402215, "mean_token_accuracy": 0.7637271285057068, "num_tokens": 7753158.0, "step": 5490, "train/ce_loss": 1.3455654652716476e-06 }, { "epoch": 0.5428119438402215, "step": 5490, "train/sim_loss": 0.03125 }, { "epoch": 0.5428119438402215, "step": 5490, "train/total_loss": 0.031250134110450745 }, { "entropy": 8.845754623413086, "epoch": 0.542910816689737, "mean_token_accuracy": 0.7085533142089844, "num_tokens": 7758749.0, "step": 5491, "train/ce_loss": 0.8674387335777283 }, { "epoch": 0.542910816689737, "step": 5491, "train/sim_loss": 0.04296875 }, { "epoch": 0.542910816689737, "step": 5491, "train/total_loss": 0.12971262633800507 }, { "entropy": 9.163055419921875, "epoch": 0.5430096895392525, "mean_token_accuracy": 0.6792699098587036, "num_tokens": 7764038.0, "step": 5492, "train/ce_loss": 0.6201307773590088 }, { "epoch": 0.5430096895392525, "step": 5492, "train/sim_loss": 0.0625 }, { "epoch": 0.5430096895392525, "step": 5492, "train/total_loss": 0.12451307475566864 }, { "entropy": 9.007641792297363, "epoch": 0.543108562388768, "mean_token_accuracy": 0.7056737542152405, "num_tokens": 7769504.0, "step": 5493, "train/ce_loss": 0.7358147501945496 }, { "epoch": 0.543108562388768, "step": 5493, "train/sim_loss": 0.03515625 }, { "epoch": 0.543108562388768, "step": 5493, "train/total_loss": 0.10873772948980331 }, { "entropy": 9.138314247131348, "epoch": 0.5432074352382835, "mean_token_accuracy": 0.8242424130439758, "num_tokens": 7774588.0, "step": 5494, "train/ce_loss": 1.0121489140146878e-06 }, { "epoch": 0.5432074352382835, "step": 5494, "train/sim_loss": 0.05859375 }, { "epoch": 0.5432074352382835, "step": 5494, "train/total_loss": 0.05859385058283806 }, { "entropy": 9.288567543029785, "epoch": 0.5433063080877991, "mean_token_accuracy": 0.7085889577865601, "num_tokens": 7779660.0, "step": 5495, "train/ce_loss": 2.142662879123236e-06 }, { "epoch": 0.5433063080877991, "step": 5495, "train/sim_loss": 0.0546875 }, { "epoch": 0.5433063080877991, "step": 5495, "train/total_loss": 0.05468771606683731 }, { "entropy": 8.77684211730957, "epoch": 0.5434051809373146, "mean_token_accuracy": 0.8189300298690796, "num_tokens": 7785143.0, "step": 5496, "train/ce_loss": 0.680293083190918 }, { "epoch": 0.5434051809373146, "step": 5496, "train/sim_loss": 0.046875 }, { "epoch": 0.5434051809373146, "step": 5496, "train/total_loss": 0.11490430682897568 }, { "entropy": 8.529176712036133, "epoch": 0.5435040537868301, "mean_token_accuracy": 0.7483370304107666, "num_tokens": 7790530.0, "step": 5497, "train/ce_loss": 0.8081743717193604 }, { "epoch": 0.5435040537868301, "step": 5497, "train/sim_loss": 0.05078125 }, { "epoch": 0.5435040537868301, "step": 5497, "train/total_loss": 0.13159868121147156 }, { "entropy": 9.397544860839844, "epoch": 0.5436029266363457, "mean_token_accuracy": 0.7481698393821716, "num_tokens": 7795632.0, "step": 5498, "train/ce_loss": 1.8115889588443679e-06 }, { "epoch": 0.5436029266363457, "step": 5498, "train/sim_loss": 0.03515625 }, { "epoch": 0.5436029266363457, "step": 5498, "train/total_loss": 0.035156432539224625 }, { "entropy": 9.139359474182129, "epoch": 0.5437017994858612, "mean_token_accuracy": 0.7417027354240417, "num_tokens": 7800751.0, "step": 5499, "train/ce_loss": 9.818531907512806e-07 }, { "epoch": 0.5437017994858612, "step": 5499, "train/sim_loss": 0.078125 }, { "epoch": 0.5437017994858612, "step": 5499, "train/total_loss": 0.07812509685754776 }, { "epoch": 0.5438006723353767, "grad_norm": 0.7031065225601196, "learning_rate": 8.642881867180933e-06, "loss": 0.1337, "step": 5500 }, { "entropy": 9.415504455566406, "epoch": 0.5438006723353767, "mean_token_accuracy": 0.7769110798835754, "num_tokens": 7805849.0, "step": 5500, "train/ce_loss": 1.227493405342102 }, { "epoch": 0.5438006723353767, "step": 5500, "train/sim_loss": 0.04296875 }, { "epoch": 0.5438006723353767, "step": 5500, "train/total_loss": 0.16571809351444244 }, { "entropy": 9.082967758178711, "epoch": 0.5438995451848923, "mean_token_accuracy": 0.7361809015274048, "num_tokens": 7811094.0, "step": 5501, "train/ce_loss": 0.47862014174461365 }, { "epoch": 0.5438995451848923, "step": 5501, "train/sim_loss": 0.03125 }, { "epoch": 0.5438995451848923, "step": 5501, "train/total_loss": 0.07911201566457748 }, { "entropy": 9.015972137451172, "epoch": 0.5439984180344077, "mean_token_accuracy": 0.7757731676101685, "num_tokens": 7816367.0, "step": 5502, "train/ce_loss": 0.6524413824081421 }, { "epoch": 0.5439984180344077, "step": 5502, "train/sim_loss": 0.03515625 }, { "epoch": 0.5439984180344077, "step": 5502, "train/total_loss": 0.10040038824081421 }, { "entropy": 8.995569229125977, "epoch": 0.5440972908839232, "mean_token_accuracy": 0.6952941417694092, "num_tokens": 7821677.0, "step": 5503, "train/ce_loss": 1.2559822835100931e-06 }, { "epoch": 0.5440972908839232, "step": 5503, "train/sim_loss": 0.0625 }, { "epoch": 0.5440972908839232, "step": 5503, "train/total_loss": 0.06250012665987015 }, { "entropy": 8.838412284851074, "epoch": 0.5441961637334388, "mean_token_accuracy": 0.7453415989875793, "num_tokens": 7826935.0, "step": 5504, "train/ce_loss": 1.2809569835662842 }, { "epoch": 0.5441961637334388, "step": 5504, "train/sim_loss": 0.06640625 }, { "epoch": 0.5441961637334388, "step": 5504, "train/total_loss": 0.19450195133686066 }, { "entropy": 10.003866195678711, "epoch": 0.5442950365829543, "mean_token_accuracy": 0.8797953724861145, "num_tokens": 7831694.0, "step": 5505, "train/ce_loss": 1.4319297075271606 }, { "epoch": 0.5442950365829543, "step": 5505, "train/sim_loss": 0.0234375 }, { "epoch": 0.5442950365829543, "step": 5505, "train/total_loss": 0.16663047671318054 }, { "entropy": 9.064350128173828, "epoch": 0.5443939094324698, "mean_token_accuracy": 0.7067484855651855, "num_tokens": 7836989.0, "step": 5506, "train/ce_loss": 1.2671979665756226 }, { "epoch": 0.5443939094324698, "step": 5506, "train/sim_loss": 0.03125 }, { "epoch": 0.5443939094324698, "step": 5506, "train/total_loss": 0.15796980261802673 }, { "entropy": 9.026290893554688, "epoch": 0.5444927822819854, "mean_token_accuracy": 0.7358490824699402, "num_tokens": 7842327.0, "step": 5507, "train/ce_loss": 1.0443733930587769 }, { "epoch": 0.5444927822819854, "step": 5507, "train/sim_loss": 0.05859375 }, { "epoch": 0.5444927822819854, "step": 5507, "train/total_loss": 0.16303110122680664 }, { "entropy": 9.018562316894531, "epoch": 0.5445916551315009, "mean_token_accuracy": 0.6709007024765015, "num_tokens": 7847639.0, "step": 5508, "train/ce_loss": 0.595085084438324 }, { "epoch": 0.5445916551315009, "step": 5508, "train/sim_loss": 0.03515625 }, { "epoch": 0.5445916551315009, "step": 5508, "train/total_loss": 0.09466475993394852 }, { "entropy": 9.13880729675293, "epoch": 0.5446905279810164, "mean_token_accuracy": 0.700507640838623, "num_tokens": 7852881.0, "step": 5509, "train/ce_loss": 0.6465105414390564 }, { "epoch": 0.5446905279810164, "step": 5509, "train/sim_loss": 0.03515625 }, { "epoch": 0.5446905279810164, "step": 5509, "train/total_loss": 0.09980730712413788 }, { "entropy": 9.208677291870117, "epoch": 0.544789400830532, "mean_token_accuracy": 0.7856257557868958, "num_tokens": 7858340.0, "step": 5510, "train/ce_loss": 0.6704494953155518 }, { "epoch": 0.544789400830532, "step": 5510, "train/sim_loss": 0.015625 }, { "epoch": 0.544789400830532, "step": 5510, "train/total_loss": 0.0826699510216713 }, { "entropy": 9.106315612792969, "epoch": 0.5448882736800474, "mean_token_accuracy": 0.7473404407501221, "num_tokens": 7863577.0, "step": 5511, "train/ce_loss": 0.7330856323242188 }, { "epoch": 0.5448882736800474, "step": 5511, "train/sim_loss": 0.046875 }, { "epoch": 0.5448882736800474, "step": 5511, "train/total_loss": 0.120183564722538 }, { "entropy": 9.01171875, "epoch": 0.5449871465295629, "mean_token_accuracy": 0.7030812501907349, "num_tokens": 7868762.0, "step": 5512, "train/ce_loss": 1.0409945249557495 }, { "epoch": 0.5449871465295629, "step": 5512, "train/sim_loss": 0.03125 }, { "epoch": 0.5449871465295629, "step": 5512, "train/total_loss": 0.13534945249557495 }, { "entropy": 9.56721305847168, "epoch": 0.5450860193790785, "mean_token_accuracy": 0.7206572890281677, "num_tokens": 7873649.0, "step": 5513, "train/ce_loss": 3.4404743018967565e-06 }, { "epoch": 0.5450860193790785, "step": 5513, "train/sim_loss": 0.06640625 }, { "epoch": 0.5450860193790785, "step": 5513, "train/total_loss": 0.06640659272670746 }, { "entropy": 9.214876174926758, "epoch": 0.545184892228594, "mean_token_accuracy": 0.7772151827812195, "num_tokens": 7878876.0, "step": 5514, "train/ce_loss": 0.36950212717056274 }, { "epoch": 0.545184892228594, "step": 5514, "train/sim_loss": 0.01953125 }, { "epoch": 0.545184892228594, "step": 5514, "train/total_loss": 0.056481461971998215 }, { "entropy": 8.57907485961914, "epoch": 0.5452837650781096, "mean_token_accuracy": 0.7252985835075378, "num_tokens": 7884243.0, "step": 5515, "train/ce_loss": 0.5983100533485413 }, { "epoch": 0.5452837650781096, "step": 5515, "train/sim_loss": 0.0625 }, { "epoch": 0.5452837650781096, "step": 5515, "train/total_loss": 0.12233100831508636 }, { "entropy": 9.1329345703125, "epoch": 0.5453826379276251, "mean_token_accuracy": 0.7596899271011353, "num_tokens": 7889492.0, "step": 5516, "train/ce_loss": 0.3740823566913605 }, { "epoch": 0.5453826379276251, "step": 5516, "train/sim_loss": 0.0859375 }, { "epoch": 0.5453826379276251, "step": 5516, "train/total_loss": 0.12334573268890381 }, { "entropy": 9.813523292541504, "epoch": 0.5454815107771406, "mean_token_accuracy": 0.7532467246055603, "num_tokens": 7894327.0, "step": 5517, "train/ce_loss": 3.1217628020385746e-06 }, { "epoch": 0.5454815107771406, "step": 5517, "train/sim_loss": 0.0859375 }, { "epoch": 0.5454815107771406, "step": 5517, "train/total_loss": 0.08593781292438507 }, { "entropy": 9.10891342163086, "epoch": 0.5455803836266562, "mean_token_accuracy": 0.7839335203170776, "num_tokens": 7899509.0, "step": 5518, "train/ce_loss": 1.0317003726959229 }, { "epoch": 0.5455803836266562, "step": 5518, "train/sim_loss": 0.01953125 }, { "epoch": 0.5455803836266562, "step": 5518, "train/total_loss": 0.12270128726959229 }, { "entropy": 8.824468612670898, "epoch": 0.5456792564761717, "mean_token_accuracy": 0.7766203880310059, "num_tokens": 7904810.0, "step": 5519, "train/ce_loss": 0.4866676330566406 }, { "epoch": 0.5456792564761717, "step": 5519, "train/sim_loss": 0.04296875 }, { "epoch": 0.5456792564761717, "step": 5519, "train/total_loss": 0.09163551032543182 }, { "epoch": 0.5457781293256871, "grad_norm": 0.563425600528717, "learning_rate": 8.637937002422985e-06, "loss": 0.133, "step": 5520 }, { "entropy": 9.161569595336914, "epoch": 0.5457781293256871, "mean_token_accuracy": 0.7279596924781799, "num_tokens": 7910043.0, "step": 5520, "train/ce_loss": 5.308772301759745e-07 }, { "epoch": 0.5457781293256871, "step": 5520, "train/sim_loss": 0.0234375 }, { "epoch": 0.5457781293256871, "step": 5520, "train/total_loss": 0.023437554016709328 }, { "entropy": 8.903478622436523, "epoch": 0.5458770021752027, "mean_token_accuracy": 0.7416563630104065, "num_tokens": 7915287.0, "step": 5521, "train/ce_loss": 1.7618522644042969 }, { "epoch": 0.5458770021752027, "step": 5521, "train/sim_loss": 0.0859375 }, { "epoch": 0.5458770021752027, "step": 5521, "train/total_loss": 0.2621227502822876 }, { "entropy": 9.213225364685059, "epoch": 0.5459758750247182, "mean_token_accuracy": 0.7211394309997559, "num_tokens": 7920387.0, "step": 5522, "train/ce_loss": 1.2949116230010986 }, { "epoch": 0.5459758750247182, "step": 5522, "train/sim_loss": 0.0703125 }, { "epoch": 0.5459758750247182, "step": 5522, "train/total_loss": 0.1998036652803421 }, { "entropy": 9.422765731811523, "epoch": 0.5460747478742337, "mean_token_accuracy": 0.7446457743644714, "num_tokens": 7925440.0, "step": 5523, "train/ce_loss": 5.0349794946669135e-06 }, { "epoch": 0.5460747478742337, "step": 5523, "train/sim_loss": 0.04296875 }, { "epoch": 0.5460747478742337, "step": 5523, "train/total_loss": 0.04296925291419029 }, { "entropy": 10.037985801696777, "epoch": 0.5461736207237493, "mean_token_accuracy": 0.7230320572853088, "num_tokens": 7930181.0, "step": 5524, "train/ce_loss": 1.330657958984375 }, { "epoch": 0.5461736207237493, "step": 5524, "train/sim_loss": 0.0546875 }, { "epoch": 0.5461736207237493, "step": 5524, "train/total_loss": 0.18775330483913422 }, { "entropy": 9.685091018676758, "epoch": 0.5462724935732648, "mean_token_accuracy": 0.6944971680641174, "num_tokens": 7935138.0, "step": 5525, "train/ce_loss": 1.6756025615904946e-06 }, { "epoch": 0.5462724935732648, "step": 5525, "train/sim_loss": 0.0703125 }, { "epoch": 0.5462724935732648, "step": 5525, "train/total_loss": 0.07031266391277313 }, { "entropy": 9.215286254882812, "epoch": 0.5463713664227803, "mean_token_accuracy": 0.7022398114204407, "num_tokens": 7940341.0, "step": 5526, "train/ce_loss": 1.028321623802185 }, { "epoch": 0.5463713664227803, "step": 5526, "train/sim_loss": 0.06640625 }, { "epoch": 0.5463713664227803, "step": 5526, "train/total_loss": 0.16923841834068298 }, { "entropy": 8.892183303833008, "epoch": 0.5464702392722959, "mean_token_accuracy": 0.7200000286102295, "num_tokens": 7945663.0, "step": 5527, "train/ce_loss": 0.6684654951095581 }, { "epoch": 0.5464702392722959, "step": 5527, "train/sim_loss": 0.0390625 }, { "epoch": 0.5464702392722959, "step": 5527, "train/total_loss": 0.10590904951095581 }, { "entropy": 8.921552658081055, "epoch": 0.5465691121218114, "mean_token_accuracy": 0.743139386177063, "num_tokens": 7951040.0, "step": 5528, "train/ce_loss": 0.8332463502883911 }, { "epoch": 0.5465691121218114, "step": 5528, "train/sim_loss": 0.08984375 }, { "epoch": 0.5465691121218114, "step": 5528, "train/total_loss": 0.1731683909893036 }, { "entropy": 9.126616477966309, "epoch": 0.5466679849713268, "mean_token_accuracy": 0.6770833134651184, "num_tokens": 7956156.0, "step": 5529, "train/ce_loss": 1.0017226934432983 }, { "epoch": 0.5466679849713268, "step": 5529, "train/sim_loss": 0.08984375 }, { "epoch": 0.5466679849713268, "step": 5529, "train/total_loss": 0.1900160312652588 }, { "entropy": 8.91090202331543, "epoch": 0.5467668578208424, "mean_token_accuracy": 0.7716346383094788, "num_tokens": 7961411.0, "step": 5530, "train/ce_loss": 0.8838936686515808 }, { "epoch": 0.5467668578208424, "step": 5530, "train/sim_loss": 0.046875 }, { "epoch": 0.5467668578208424, "step": 5530, "train/total_loss": 0.13526436686515808 }, { "entropy": 8.90369987487793, "epoch": 0.5468657306703579, "mean_token_accuracy": 0.710918128490448, "num_tokens": 7966651.0, "step": 5531, "train/ce_loss": 1.5553159713745117 }, { "epoch": 0.5468657306703579, "step": 5531, "train/sim_loss": 0.08203125 }, { "epoch": 0.5468657306703579, "step": 5531, "train/total_loss": 0.2375628501176834 }, { "entropy": 9.05916690826416, "epoch": 0.5469646035198734, "mean_token_accuracy": 0.7317073345184326, "num_tokens": 7972028.0, "step": 5532, "train/ce_loss": 0.7818222045898438 }, { "epoch": 0.5469646035198734, "step": 5532, "train/sim_loss": 0.0234375 }, { "epoch": 0.5469646035198734, "step": 5532, "train/total_loss": 0.10161972045898438 }, { "entropy": 8.867383003234863, "epoch": 0.547063476369389, "mean_token_accuracy": 0.7449947595596313, "num_tokens": 7977475.0, "step": 5533, "train/ce_loss": 1.104671597480774 }, { "epoch": 0.547063476369389, "step": 5533, "train/sim_loss": 0.078125 }, { "epoch": 0.547063476369389, "step": 5533, "train/total_loss": 0.18859216570854187 }, { "entropy": 9.046788215637207, "epoch": 0.5471623492189045, "mean_token_accuracy": 0.7216783165931702, "num_tokens": 7982661.0, "step": 5534, "train/ce_loss": 1.2839608192443848 }, { "epoch": 0.5471623492189045, "step": 5534, "train/sim_loss": 0.06640625 }, { "epoch": 0.5471623492189045, "step": 5534, "train/total_loss": 0.19480232894420624 }, { "entropy": 9.152159690856934, "epoch": 0.54726122206842, "mean_token_accuracy": 0.70257967710495, "num_tokens": 7987819.0, "step": 5535, "train/ce_loss": 1.0716601610183716 }, { "epoch": 0.54726122206842, "step": 5535, "train/sim_loss": 0.04296875 }, { "epoch": 0.54726122206842, "step": 5535, "train/total_loss": 0.15013477206230164 }, { "entropy": 9.02014446258545, "epoch": 0.5473600949179356, "mean_token_accuracy": 0.7812879681587219, "num_tokens": 7993097.0, "step": 5536, "train/ce_loss": 0.5995551943778992 }, { "epoch": 0.5473600949179356, "step": 5536, "train/sim_loss": 0.01953125 }, { "epoch": 0.5473600949179356, "step": 5536, "train/total_loss": 0.07948677241802216 }, { "entropy": 8.631769180297852, "epoch": 0.5474589677674511, "mean_token_accuracy": 0.7827273011207581, "num_tokens": 7998676.0, "step": 5537, "train/ce_loss": 0.4905291199684143 }, { "epoch": 0.5474589677674511, "step": 5537, "train/sim_loss": 0.02734375 }, { "epoch": 0.5474589677674511, "step": 5537, "train/total_loss": 0.07639665901660919 }, { "entropy": 8.562850952148438, "epoch": 0.5475578406169666, "mean_token_accuracy": 0.7535884976387024, "num_tokens": 8003958.0, "step": 5538, "train/ce_loss": 0.5495491623878479 }, { "epoch": 0.5475578406169666, "step": 5538, "train/sim_loss": 0.0625 }, { "epoch": 0.5475578406169666, "step": 5538, "train/total_loss": 0.11745491623878479 }, { "entropy": 9.187483787536621, "epoch": 0.5476567134664821, "mean_token_accuracy": 0.7204610705375671, "num_tokens": 8009118.0, "step": 5539, "train/ce_loss": 1.957935182872461e-06 }, { "epoch": 0.5476567134664821, "step": 5539, "train/sim_loss": 0.0390625 }, { "epoch": 0.5476567134664821, "step": 5539, "train/total_loss": 0.03906269744038582 }, { "epoch": 0.5477555863159976, "grad_norm": 0.8513239026069641, "learning_rate": 8.632992137665036e-06, "loss": 0.1451, "step": 5540 }, { "entropy": 9.163148880004883, "epoch": 0.5477555863159976, "mean_token_accuracy": 0.7201540470123291, "num_tokens": 8014486.0, "step": 5540, "train/ce_loss": 1.322680115699768 }, { "epoch": 0.5477555863159976, "step": 5540, "train/sim_loss": 0.13671875 }, { "epoch": 0.5477555863159976, "step": 5540, "train/total_loss": 0.2689867615699768 }, { "entropy": 9.031539916992188, "epoch": 0.5478544591655131, "mean_token_accuracy": 0.7678160667419434, "num_tokens": 8019808.0, "step": 5541, "train/ce_loss": 0.5397918224334717 }, { "epoch": 0.5478544591655131, "step": 5541, "train/sim_loss": 0.04296875 }, { "epoch": 0.5478544591655131, "step": 5541, "train/total_loss": 0.09694793820381165 }, { "entropy": 9.817032814025879, "epoch": 0.5479533320150287, "mean_token_accuracy": 0.7709251046180725, "num_tokens": 8024639.0, "step": 5542, "train/ce_loss": 1.1984164714813232 }, { "epoch": 0.5479533320150287, "step": 5542, "train/sim_loss": 0.05078125 }, { "epoch": 0.5479533320150287, "step": 5542, "train/total_loss": 0.17062290012836456 }, { "entropy": 8.763422012329102, "epoch": 0.5480522048645442, "mean_token_accuracy": 0.7785087823867798, "num_tokens": 8030041.0, "step": 5543, "train/ce_loss": 0.48883992433547974 }, { "epoch": 0.5480522048645442, "step": 5543, "train/sim_loss": 0.01953125 }, { "epoch": 0.5480522048645442, "step": 5543, "train/total_loss": 0.06841523945331573 }, { "entropy": 8.496919631958008, "epoch": 0.5481510777140597, "mean_token_accuracy": 0.736580491065979, "num_tokens": 8035507.0, "step": 5544, "train/ce_loss": 1.1177259683609009 }, { "epoch": 0.5481510777140597, "step": 5544, "train/sim_loss": 0.09375 }, { "epoch": 0.5481510777140597, "step": 5544, "train/total_loss": 0.2055225968360901 }, { "entropy": 9.677846908569336, "epoch": 0.5482499505635753, "mean_token_accuracy": 0.7397849559783936, "num_tokens": 8040397.0, "step": 5545, "train/ce_loss": 2.08055212169711e-06 }, { "epoch": 0.5482499505635753, "step": 5545, "train/sim_loss": 0.0390625 }, { "epoch": 0.5482499505635753, "step": 5545, "train/total_loss": 0.039062708616256714 }, { "entropy": 9.176383972167969, "epoch": 0.5483488234130908, "mean_token_accuracy": 0.7653791308403015, "num_tokens": 8045550.0, "step": 5546, "train/ce_loss": 0.9093326926231384 }, { "epoch": 0.5483488234130908, "step": 5546, "train/sim_loss": 0.046875 }, { "epoch": 0.5483488234130908, "step": 5546, "train/total_loss": 0.13780826330184937 }, { "entropy": 9.060911178588867, "epoch": 0.5484476962626063, "mean_token_accuracy": 0.7210348844528198, "num_tokens": 8050908.0, "step": 5547, "train/ce_loss": 0.5934411883354187 }, { "epoch": 0.5484476962626063, "step": 5547, "train/sim_loss": 0.078125 }, { "epoch": 0.5484476962626063, "step": 5547, "train/total_loss": 0.1374691128730774 }, { "entropy": 8.97642993927002, "epoch": 0.5485465691121219, "mean_token_accuracy": 0.7463414669036865, "num_tokens": 8056164.0, "step": 5548, "train/ce_loss": 0.7455832362174988 }, { "epoch": 0.5485465691121219, "step": 5548, "train/sim_loss": 0.09765625 }, { "epoch": 0.5485465691121219, "step": 5548, "train/total_loss": 0.1722145676612854 }, { "entropy": 8.86227798461914, "epoch": 0.5486454419616373, "mean_token_accuracy": 0.8031784892082214, "num_tokens": 8061437.0, "step": 5549, "train/ce_loss": 0.524359941482544 }, { "epoch": 0.5486454419616373, "step": 5549, "train/sim_loss": 0.01171875 }, { "epoch": 0.5486454419616373, "step": 5549, "train/total_loss": 0.0641547441482544 }, { "entropy": 9.484848976135254, "epoch": 0.5487443148111528, "mean_token_accuracy": 0.709618866443634, "num_tokens": 8066449.0, "step": 5550, "train/ce_loss": 1.797608733177185 }, { "epoch": 0.5487443148111528, "step": 5550, "train/sim_loss": 0.05078125 }, { "epoch": 0.5487443148111528, "step": 5550, "train/total_loss": 0.2305421233177185 }, { "entropy": 9.054540634155273, "epoch": 0.5488431876606684, "mean_token_accuracy": 0.7254902124404907, "num_tokens": 8071599.0, "step": 5551, "train/ce_loss": 0.9352777004241943 }, { "epoch": 0.5488431876606684, "step": 5551, "train/sim_loss": 0.0390625 }, { "epoch": 0.5488431876606684, "step": 5551, "train/total_loss": 0.13259026408195496 }, { "entropy": 8.990945816040039, "epoch": 0.5489420605101839, "mean_token_accuracy": 0.7782805562019348, "num_tokens": 8076884.0, "step": 5552, "train/ce_loss": 0.7061032652854919 }, { "epoch": 0.5489420605101839, "step": 5552, "train/sim_loss": 0.02734375 }, { "epoch": 0.5489420605101839, "step": 5552, "train/total_loss": 0.09795407950878143 }, { "entropy": 8.656270027160645, "epoch": 0.5490409333596994, "mean_token_accuracy": 0.7148148417472839, "num_tokens": 8082111.0, "step": 5553, "train/ce_loss": 0.8703933358192444 }, { "epoch": 0.5490409333596994, "step": 5553, "train/sim_loss": 0.0546875 }, { "epoch": 0.5490409333596994, "step": 5553, "train/total_loss": 0.14172683656215668 }, { "entropy": 9.270323753356934, "epoch": 0.549139806209215, "mean_token_accuracy": 0.7340720295906067, "num_tokens": 8087280.0, "step": 5554, "train/ce_loss": 0.9211452603340149 }, { "epoch": 0.549139806209215, "step": 5554, "train/sim_loss": 0.04296875 }, { "epoch": 0.549139806209215, "step": 5554, "train/total_loss": 0.13508328795433044 }, { "entropy": 9.821088790893555, "epoch": 0.5492386790587305, "mean_token_accuracy": 0.7368420958518982, "num_tokens": 8092123.0, "step": 5555, "train/ce_loss": 2.184018135070801 }, { "epoch": 0.5492386790587305, "step": 5555, "train/sim_loss": 0.0546875 }, { "epoch": 0.5492386790587305, "step": 5555, "train/total_loss": 0.27308931946754456 }, { "entropy": 9.16021728515625, "epoch": 0.549337551908246, "mean_token_accuracy": 0.681556224822998, "num_tokens": 8097253.0, "step": 5556, "train/ce_loss": 0.7490461468696594 }, { "epoch": 0.549337551908246, "step": 5556, "train/sim_loss": 0.0390625 }, { "epoch": 0.549337551908246, "step": 5556, "train/total_loss": 0.11396711319684982 }, { "entropy": 9.095970153808594, "epoch": 0.5494364247577616, "mean_token_accuracy": 0.7917241454124451, "num_tokens": 8102413.0, "step": 5557, "train/ce_loss": 0.5837256908416748 }, { "epoch": 0.5494364247577616, "step": 5557, "train/sim_loss": 0.03515625 }, { "epoch": 0.5494364247577616, "step": 5557, "train/total_loss": 0.09352882206439972 }, { "entropy": 9.32501220703125, "epoch": 0.549535297607277, "mean_token_accuracy": 0.7417103052139282, "num_tokens": 8107402.0, "step": 5558, "train/ce_loss": 3.2791404009913094e-06 }, { "epoch": 0.549535297607277, "step": 5558, "train/sim_loss": 0.0546875 }, { "epoch": 0.549535297607277, "step": 5558, "train/total_loss": 0.054687827825546265 }, { "entropy": 8.988893508911133, "epoch": 0.5496341704567925, "mean_token_accuracy": 0.7346241474151611, "num_tokens": 8112744.0, "step": 5559, "train/ce_loss": 0.9331421852111816 }, { "epoch": 0.5496341704567925, "step": 5559, "train/sim_loss": 0.05078125 }, { "epoch": 0.5496341704567925, "step": 5559, "train/total_loss": 0.14409548044204712 }, { "epoch": 0.5497330433063081, "grad_norm": 0.7400034070014954, "learning_rate": 8.628047272907086e-06, "loss": 0.138, "step": 5560 }, { "entropy": 9.708757400512695, "epoch": 0.5497330433063081, "mean_token_accuracy": 0.7265306115150452, "num_tokens": 8117594.0, "step": 5560, "train/ce_loss": 8.806768505564833e-07 }, { "epoch": 0.5497330433063081, "step": 5560, "train/sim_loss": 0.01171875 }, { "epoch": 0.5497330433063081, "step": 5560, "train/total_loss": 0.011718838475644588 }, { "entropy": 9.42172622680664, "epoch": 0.5498319161558236, "mean_token_accuracy": 0.731249988079071, "num_tokens": 8122664.0, "step": 5561, "train/ce_loss": 1.261497139930725 }, { "epoch": 0.5498319161558236, "step": 5561, "train/sim_loss": 0.05078125 }, { "epoch": 0.5498319161558236, "step": 5561, "train/total_loss": 0.1769309639930725 }, { "entropy": 9.3308744430542, "epoch": 0.5499307890053391, "mean_token_accuracy": 0.6599063873291016, "num_tokens": 8127792.0, "step": 5562, "train/ce_loss": 1.3462740182876587 }, { "epoch": 0.5499307890053391, "step": 5562, "train/sim_loss": 0.07421875 }, { "epoch": 0.5499307890053391, "step": 5562, "train/total_loss": 0.20884615182876587 }, { "entropy": 8.472457885742188, "epoch": 0.5500296618548547, "mean_token_accuracy": 0.7492983937263489, "num_tokens": 8133348.0, "step": 5563, "train/ce_loss": 0.9793670177459717 }, { "epoch": 0.5500296618548547, "step": 5563, "train/sim_loss": 0.0546875 }, { "epoch": 0.5500296618548547, "step": 5563, "train/total_loss": 0.1526242047548294 }, { "entropy": 8.841888427734375, "epoch": 0.5501285347043702, "mean_token_accuracy": 0.6864516139030457, "num_tokens": 8138586.0, "step": 5564, "train/ce_loss": 1.2656185626983643 }, { "epoch": 0.5501285347043702, "step": 5564, "train/sim_loss": 0.046875 }, { "epoch": 0.5501285347043702, "step": 5564, "train/total_loss": 0.17343686521053314 }, { "entropy": 9.570108413696289, "epoch": 0.5502274075538857, "mean_token_accuracy": 0.7607476711273193, "num_tokens": 8143549.0, "step": 5565, "train/ce_loss": 1.570382096360845e-06 }, { "epoch": 0.5502274075538857, "step": 5565, "train/sim_loss": 0.05859375 }, { "epoch": 0.5502274075538857, "step": 5565, "train/total_loss": 0.058593906462192535 }, { "entropy": 9.095592498779297, "epoch": 0.5503262804034013, "mean_token_accuracy": 0.7247838377952576, "num_tokens": 8148730.0, "step": 5566, "train/ce_loss": 0.39141520857810974 }, { "epoch": 0.5503262804034013, "step": 5566, "train/sim_loss": 0.0390625 }, { "epoch": 0.5503262804034013, "step": 5566, "train/total_loss": 0.07820402085781097 }, { "entropy": 9.185461044311523, "epoch": 0.5504251532529167, "mean_token_accuracy": 0.7051926255226135, "num_tokens": 8153783.0, "step": 5567, "train/ce_loss": 0.9862339496612549 }, { "epoch": 0.5504251532529167, "step": 5567, "train/sim_loss": 0.06640625 }, { "epoch": 0.5504251532529167, "step": 5567, "train/total_loss": 0.1650296449661255 }, { "entropy": 9.859312057495117, "epoch": 0.5505240261024322, "mean_token_accuracy": 0.7260273694992065, "num_tokens": 8158558.0, "step": 5568, "train/ce_loss": 1.0583114089968149e-05 }, { "epoch": 0.5505240261024322, "step": 5568, "train/sim_loss": 0.04296875 }, { "epoch": 0.5505240261024322, "step": 5568, "train/total_loss": 0.04296980798244476 }, { "entropy": 9.152839660644531, "epoch": 0.5506228989519478, "mean_token_accuracy": 0.7062069177627563, "num_tokens": 8163728.0, "step": 5569, "train/ce_loss": 4.242161821821355e-07 }, { "epoch": 0.5506228989519478, "step": 5569, "train/sim_loss": 0.02734375 }, { "epoch": 0.5506228989519478, "step": 5569, "train/total_loss": 0.027343792840838432 }, { "entropy": 8.736828804016113, "epoch": 0.5507217718014633, "mean_token_accuracy": 0.7492323517799377, "num_tokens": 8169180.0, "step": 5570, "train/ce_loss": 0.9878089427947998 }, { "epoch": 0.5507217718014633, "step": 5570, "train/sim_loss": 0.0625 }, { "epoch": 0.5507217718014633, "step": 5570, "train/total_loss": 0.16128090023994446 }, { "entropy": 9.42058277130127, "epoch": 0.5508206446509788, "mean_token_accuracy": 0.7604562640190125, "num_tokens": 8174198.0, "step": 5571, "train/ce_loss": 0.9325904846191406 }, { "epoch": 0.5508206446509788, "step": 5571, "train/sim_loss": 0.0703125 }, { "epoch": 0.5508206446509788, "step": 5571, "train/total_loss": 0.1635715514421463 }, { "entropy": 8.800806045532227, "epoch": 0.5509195175004944, "mean_token_accuracy": 0.7384792566299438, "num_tokens": 8179559.0, "step": 5572, "train/ce_loss": 1.2294607162475586 }, { "epoch": 0.5509195175004944, "step": 5572, "train/sim_loss": 0.078125 }, { "epoch": 0.5509195175004944, "step": 5572, "train/total_loss": 0.20107108354568481 }, { "entropy": 8.983238220214844, "epoch": 0.5510183903500099, "mean_token_accuracy": 0.7275822758674622, "num_tokens": 8184923.0, "step": 5573, "train/ce_loss": 0.8919557332992554 }, { "epoch": 0.5510183903500099, "step": 5573, "train/sim_loss": 0.05078125 }, { "epoch": 0.5510183903500099, "step": 5573, "train/total_loss": 0.13997682929039001 }, { "entropy": 9.278079986572266, "epoch": 0.5511172631995254, "mean_token_accuracy": 0.7447405457496643, "num_tokens": 8190221.0, "step": 5574, "train/ce_loss": 1.7935465166374343e-06 }, { "epoch": 0.5511172631995254, "step": 5574, "train/sim_loss": 0.0703125 }, { "epoch": 0.5511172631995254, "step": 5574, "train/total_loss": 0.07031267881393433 }, { "entropy": 9.665382385253906, "epoch": 0.551216136049041, "mean_token_accuracy": 0.8256704807281494, "num_tokens": 8195140.0, "step": 5575, "train/ce_loss": 8.778425808486645e-07 }, { "epoch": 0.551216136049041, "step": 5575, "train/sim_loss": 0.01171875 }, { "epoch": 0.551216136049041, "step": 5575, "train/total_loss": 0.011718837544322014 }, { "entropy": 8.645299911499023, "epoch": 0.5513150088985564, "mean_token_accuracy": 0.7113187909126282, "num_tokens": 8200579.0, "step": 5576, "train/ce_loss": 1.155021071434021 }, { "epoch": 0.5513150088985564, "step": 5576, "train/sim_loss": 0.0546875 }, { "epoch": 0.5513150088985564, "step": 5576, "train/total_loss": 0.17018961906433105 }, { "entropy": 8.342084884643555, "epoch": 0.5514138817480719, "mean_token_accuracy": 0.8221845626831055, "num_tokens": 8206263.0, "step": 5577, "train/ce_loss": 0.45889559388160706 }, { "epoch": 0.5514138817480719, "step": 5577, "train/sim_loss": 0.0234375 }, { "epoch": 0.5514138817480719, "step": 5577, "train/total_loss": 0.06932705640792847 }, { "entropy": 9.861316680908203, "epoch": 0.5515127545975875, "mean_token_accuracy": 0.6985294222831726, "num_tokens": 8211083.0, "step": 5578, "train/ce_loss": 1.9402579069137573 }, { "epoch": 0.5515127545975875, "step": 5578, "train/sim_loss": 0.0546875 }, { "epoch": 0.5515127545975875, "step": 5578, "train/total_loss": 0.24871329963207245 }, { "entropy": 8.840457916259766, "epoch": 0.551611627447103, "mean_token_accuracy": 0.7086419463157654, "num_tokens": 8216389.0, "step": 5579, "train/ce_loss": 1.1512106657028198 }, { "epoch": 0.551611627447103, "step": 5579, "train/sim_loss": 0.1015625 }, { "epoch": 0.551611627447103, "step": 5579, "train/total_loss": 0.21668356657028198 }, { "epoch": 0.5517105002966185, "grad_norm": 0.9113156795501709, "learning_rate": 8.623102408149137e-06, "loss": 0.1391, "step": 5580 }, { "entropy": 8.88958740234375, "epoch": 0.5517105002966185, "mean_token_accuracy": 0.707317054271698, "num_tokens": 8221609.0, "step": 5580, "train/ce_loss": 1.4289519786834717 }, { "epoch": 0.5517105002966185, "step": 5580, "train/sim_loss": 0.03515625 }, { "epoch": 0.5517105002966185, "step": 5580, "train/total_loss": 0.17805145680904388 }, { "entropy": 8.861713409423828, "epoch": 0.5518093731461341, "mean_token_accuracy": 0.6754478216171265, "num_tokens": 8227034.0, "step": 5581, "train/ce_loss": 1.0220152139663696 }, { "epoch": 0.5518093731461341, "step": 5581, "train/sim_loss": 0.171875 }, { "epoch": 0.5518093731461341, "step": 5581, "train/total_loss": 0.27407652139663696 }, { "entropy": 9.000300407409668, "epoch": 0.5519082459956496, "mean_token_accuracy": 0.750952959060669, "num_tokens": 8232332.0, "step": 5582, "train/ce_loss": 0.7720044255256653 }, { "epoch": 0.5519082459956496, "step": 5582, "train/sim_loss": 0.0703125 }, { "epoch": 0.5519082459956496, "step": 5582, "train/total_loss": 0.14751294255256653 }, { "entropy": 8.809768676757812, "epoch": 0.5520071188451651, "mean_token_accuracy": 0.6777777671813965, "num_tokens": 8237716.0, "step": 5583, "train/ce_loss": 0.761299729347229 }, { "epoch": 0.5520071188451651, "step": 5583, "train/sim_loss": 0.0546875 }, { "epoch": 0.5520071188451651, "step": 5583, "train/total_loss": 0.1308174729347229 }, { "entropy": 9.893903732299805, "epoch": 0.5521059916946807, "mean_token_accuracy": 0.8021390438079834, "num_tokens": 8242522.0, "step": 5584, "train/ce_loss": 1.2303241874178639e-06 }, { "epoch": 0.5521059916946807, "step": 5584, "train/sim_loss": 0.0234375 }, { "epoch": 0.5521059916946807, "step": 5584, "train/total_loss": 0.02343762293457985 }, { "entropy": 8.786431312561035, "epoch": 0.5522048645441962, "mean_token_accuracy": 0.6747252941131592, "num_tokens": 8247842.0, "step": 5585, "train/ce_loss": 1.6742024421691895 }, { "epoch": 0.5522048645441962, "step": 5585, "train/sim_loss": 0.0390625 }, { "epoch": 0.5522048645441962, "step": 5585, "train/total_loss": 0.20648275315761566 }, { "entropy": 9.696439743041992, "epoch": 0.5523037373937116, "mean_token_accuracy": 0.8199446201324463, "num_tokens": 8252630.0, "step": 5586, "train/ce_loss": 7.355477464443538e-06 }, { "epoch": 0.5523037373937116, "step": 5586, "train/sim_loss": 0.0703125 }, { "epoch": 0.5523037373937116, "step": 5586, "train/total_loss": 0.0703132376074791 }, { "entropy": 8.938692092895508, "epoch": 0.5524026102432272, "mean_token_accuracy": 0.7350993156433105, "num_tokens": 8258009.0, "step": 5587, "train/ce_loss": 1.184370517730713 }, { "epoch": 0.5524026102432272, "step": 5587, "train/sim_loss": 0.07421875 }, { "epoch": 0.5524026102432272, "step": 5587, "train/total_loss": 0.1926558017730713 }, { "entropy": 8.986148834228516, "epoch": 0.5525014830927427, "mean_token_accuracy": 0.732375979423523, "num_tokens": 8263220.0, "step": 5588, "train/ce_loss": 1.660227656364441 }, { "epoch": 0.5525014830927427, "step": 5588, "train/sim_loss": 0.0703125 }, { "epoch": 0.5525014830927427, "step": 5588, "train/total_loss": 0.23633526265621185 }, { "entropy": 9.082693099975586, "epoch": 0.5526003559422582, "mean_token_accuracy": 0.6867924332618713, "num_tokens": 8268527.0, "step": 5589, "train/ce_loss": 0.9311652183532715 }, { "epoch": 0.5526003559422582, "step": 5589, "train/sim_loss": 0.1171875 }, { "epoch": 0.5526003559422582, "step": 5589, "train/total_loss": 0.21030402183532715 }, { "entropy": 8.685200691223145, "epoch": 0.5526992287917738, "mean_token_accuracy": 0.7639344334602356, "num_tokens": 8273918.0, "step": 5590, "train/ce_loss": 0.8735048770904541 }, { "epoch": 0.5526992287917738, "step": 5590, "train/sim_loss": 0.05859375 }, { "epoch": 0.5526992287917738, "step": 5590, "train/total_loss": 0.1459442377090454 }, { "entropy": 8.771509170532227, "epoch": 0.5527981016412893, "mean_token_accuracy": 0.7356828451156616, "num_tokens": 8279213.0, "step": 5591, "train/ce_loss": 1.3483002185821533 }, { "epoch": 0.5527981016412893, "step": 5591, "train/sim_loss": 0.0625 }, { "epoch": 0.5527981016412893, "step": 5591, "train/total_loss": 0.1973300278186798 }, { "entropy": 9.093210220336914, "epoch": 0.5528969744908048, "mean_token_accuracy": 0.7544827461242676, "num_tokens": 8284376.0, "step": 5592, "train/ce_loss": 0.5866910815238953 }, { "epoch": 0.5528969744908048, "step": 5592, "train/sim_loss": 0.0546875 }, { "epoch": 0.5528969744908048, "step": 5592, "train/total_loss": 0.11335660517215729 }, { "entropy": 8.567116737365723, "epoch": 0.5529958473403204, "mean_token_accuracy": 0.7652958631515503, "num_tokens": 8289849.0, "step": 5593, "train/ce_loss": 0.9828934073448181 }, { "epoch": 0.5529958473403204, "step": 5593, "train/sim_loss": 0.0625 }, { "epoch": 0.5529958473403204, "step": 5593, "train/total_loss": 0.1607893407344818 }, { "entropy": 8.970977783203125, "epoch": 0.5530947201898359, "mean_token_accuracy": 0.7647058963775635, "num_tokens": 8295138.0, "step": 5594, "train/ce_loss": 1.2613295316696167 }, { "epoch": 0.5530947201898359, "step": 5594, "train/sim_loss": 0.04296875 }, { "epoch": 0.5530947201898359, "step": 5594, "train/total_loss": 0.16910170018672943 }, { "entropy": 8.619521141052246, "epoch": 0.5531935930393513, "mean_token_accuracy": 0.7246192693710327, "num_tokens": 8300383.0, "step": 5595, "train/ce_loss": 0.591346263885498 }, { "epoch": 0.5531935930393513, "step": 5595, "train/sim_loss": 0.09375 }, { "epoch": 0.5531935930393513, "step": 5595, "train/total_loss": 0.15288463234901428 }, { "entropy": 8.945405006408691, "epoch": 0.5532924658888669, "mean_token_accuracy": 0.730555534362793, "num_tokens": 8305601.0, "step": 5596, "train/ce_loss": 1.1461552381515503 }, { "epoch": 0.5532924658888669, "step": 5596, "train/sim_loss": 0.0703125 }, { "epoch": 0.5532924658888669, "step": 5596, "train/total_loss": 0.1849280297756195 }, { "entropy": 8.84506607055664, "epoch": 0.5533913387383824, "mean_token_accuracy": 0.7098265886306763, "num_tokens": 8310946.0, "step": 5597, "train/ce_loss": 0.8909079432487488 }, { "epoch": 0.5533913387383824, "step": 5597, "train/sim_loss": 0.0390625 }, { "epoch": 0.5533913387383824, "step": 5597, "train/total_loss": 0.12815329432487488 }, { "entropy": 8.550594329833984, "epoch": 0.553490211587898, "mean_token_accuracy": 0.7557471394538879, "num_tokens": 8316458.0, "step": 5598, "train/ce_loss": 0.9047664403915405 }, { "epoch": 0.553490211587898, "step": 5598, "train/sim_loss": 0.08203125 }, { "epoch": 0.553490211587898, "step": 5598, "train/total_loss": 0.1725078970193863 }, { "entropy": 9.361294746398926, "epoch": 0.5535890844374135, "mean_token_accuracy": 0.7869822382926941, "num_tokens": 8321599.0, "step": 5599, "train/ce_loss": 0.7582494020462036 }, { "epoch": 0.5535890844374135, "step": 5599, "train/sim_loss": 0.0625 }, { "epoch": 0.5535890844374135, "step": 5599, "train/total_loss": 0.13832494616508484 }, { "epoch": 0.553687957286929, "grad_norm": 0.6769205331802368, "learning_rate": 8.61815754339119e-06, "loss": 0.1456, "step": 5600 }, { "entropy": 9.160236358642578, "epoch": 0.553687957286929, "mean_token_accuracy": 0.7388535141944885, "num_tokens": 8326844.0, "step": 5600, "train/ce_loss": 1.309563572249317e-06 }, { "epoch": 0.553687957286929, "step": 5600, "train/sim_loss": 0.05859375 }, { "epoch": 0.553687957286929, "step": 5600, "train/total_loss": 0.058593880385160446 }, { "entropy": 8.7738618850708, "epoch": 0.5537868301364446, "mean_token_accuracy": 0.773099422454834, "num_tokens": 8332131.0, "step": 5601, "train/ce_loss": 0.9834937453269958 }, { "epoch": 0.5537868301364446, "step": 5601, "train/sim_loss": 0.0703125 }, { "epoch": 0.5537868301364446, "step": 5601, "train/total_loss": 0.16866187751293182 }, { "entropy": 9.020515441894531, "epoch": 0.5538857029859601, "mean_token_accuracy": 0.7438308000564575, "num_tokens": 8337447.0, "step": 5602, "train/ce_loss": 0.47414088249206543 }, { "epoch": 0.5538857029859601, "step": 5602, "train/sim_loss": 0.0625 }, { "epoch": 0.5538857029859601, "step": 5602, "train/total_loss": 0.10991409420967102 }, { "entropy": 8.730883598327637, "epoch": 0.5539845758354756, "mean_token_accuracy": 0.75, "num_tokens": 8342565.0, "step": 5603, "train/ce_loss": 0.5624790191650391 }, { "epoch": 0.5539845758354756, "step": 5603, "train/sim_loss": 0.0390625 }, { "epoch": 0.5539845758354756, "step": 5603, "train/total_loss": 0.09531040489673615 }, { "entropy": 8.995072364807129, "epoch": 0.5540834486849912, "mean_token_accuracy": 0.7682020664215088, "num_tokens": 8347687.0, "step": 5604, "train/ce_loss": 6.226739060366526e-06 }, { "epoch": 0.5540834486849912, "step": 5604, "train/sim_loss": 0.0625 }, { "epoch": 0.5540834486849912, "step": 5604, "train/total_loss": 0.06250062584877014 }, { "entropy": 9.090400695800781, "epoch": 0.5541823215345066, "mean_token_accuracy": 0.697017252445221, "num_tokens": 8352752.0, "step": 5605, "train/ce_loss": 1.042167067527771 }, { "epoch": 0.5541823215345066, "step": 5605, "train/sim_loss": 0.09375 }, { "epoch": 0.5541823215345066, "step": 5605, "train/total_loss": 0.19796670973300934 }, { "entropy": 8.763729095458984, "epoch": 0.5542811943840221, "mean_token_accuracy": 0.7481162548065186, "num_tokens": 8358183.0, "step": 5606, "train/ce_loss": 0.9994056224822998 }, { "epoch": 0.5542811943840221, "step": 5606, "train/sim_loss": 0.05859375 }, { "epoch": 0.5542811943840221, "step": 5606, "train/total_loss": 0.15853431820869446 }, { "entropy": 8.979375839233398, "epoch": 0.5543800672335377, "mean_token_accuracy": 0.7763158082962036, "num_tokens": 8363442.0, "step": 5607, "train/ce_loss": 0.508840799331665 }, { "epoch": 0.5543800672335377, "step": 5607, "train/sim_loss": 0.02734375 }, { "epoch": 0.5543800672335377, "step": 5607, "train/total_loss": 0.07822783291339874 }, { "entropy": 9.339065551757812, "epoch": 0.5544789400830532, "mean_token_accuracy": 0.7257142663002014, "num_tokens": 8368436.0, "step": 5608, "train/ce_loss": 0.7017092108726501 }, { "epoch": 0.5544789400830532, "step": 5608, "train/sim_loss": 0.0390625 }, { "epoch": 0.5544789400830532, "step": 5608, "train/total_loss": 0.10923342406749725 }, { "entropy": 8.672113418579102, "epoch": 0.5545778129325687, "mean_token_accuracy": 0.7023977637290955, "num_tokens": 8373616.0, "step": 5609, "train/ce_loss": 1.250449538230896 }, { "epoch": 0.5545778129325687, "step": 5609, "train/sim_loss": 0.046875 }, { "epoch": 0.5545778129325687, "step": 5609, "train/total_loss": 0.17191995680332184 }, { "entropy": 8.980615615844727, "epoch": 0.5546766857820843, "mean_token_accuracy": 0.6982455849647522, "num_tokens": 8378942.0, "step": 5610, "train/ce_loss": 1.1419041156768799 }, { "epoch": 0.5546766857820843, "step": 5610, "train/sim_loss": 0.09765625 }, { "epoch": 0.5546766857820843, "step": 5610, "train/total_loss": 0.21184666454792023 }, { "entropy": 8.500741004943848, "epoch": 0.5547755586315998, "mean_token_accuracy": 0.7479423880577087, "num_tokens": 8384412.0, "step": 5611, "train/ce_loss": 0.6746719479560852 }, { "epoch": 0.5547755586315998, "step": 5611, "train/sim_loss": 0.015625 }, { "epoch": 0.5547755586315998, "step": 5611, "train/total_loss": 0.08309219777584076 }, { "entropy": 8.913674354553223, "epoch": 0.5548744314811153, "mean_token_accuracy": 0.8062111735343933, "num_tokens": 8389686.0, "step": 5612, "train/ce_loss": 0.6028916239738464 }, { "epoch": 0.5548744314811153, "step": 5612, "train/sim_loss": 0.0703125 }, { "epoch": 0.5548744314811153, "step": 5612, "train/total_loss": 0.1306016594171524 }, { "entropy": 9.013097763061523, "epoch": 0.5549733043306309, "mean_token_accuracy": 0.7185697555541992, "num_tokens": 8395083.0, "step": 5613, "train/ce_loss": 1.010940432548523 }, { "epoch": 0.5549733043306309, "step": 5613, "train/sim_loss": 0.0546875 }, { "epoch": 0.5549733043306309, "step": 5613, "train/total_loss": 0.15578153729438782 }, { "entropy": 9.437446594238281, "epoch": 0.5550721771801463, "mean_token_accuracy": 0.7008403539657593, "num_tokens": 8400156.0, "step": 5614, "train/ce_loss": 9.103745810534747e-07 }, { "epoch": 0.5550721771801463, "step": 5614, "train/sim_loss": 0.05859375 }, { "epoch": 0.5550721771801463, "step": 5614, "train/total_loss": 0.05859383940696716 }, { "entropy": 9.192410469055176, "epoch": 0.5551710500296618, "mean_token_accuracy": 0.7471410632133484, "num_tokens": 8405442.0, "step": 5615, "train/ce_loss": 0.9829228520393372 }, { "epoch": 0.5551710500296618, "step": 5615, "train/sim_loss": 0.046875 }, { "epoch": 0.5551710500296618, "step": 5615, "train/total_loss": 0.1451672911643982 }, { "entropy": 8.884618759155273, "epoch": 0.5552699228791774, "mean_token_accuracy": 0.7849944233894348, "num_tokens": 8410775.0, "step": 5616, "train/ce_loss": 0.7053366899490356 }, { "epoch": 0.5552699228791774, "step": 5616, "train/sim_loss": 0.046875 }, { "epoch": 0.5552699228791774, "step": 5616, "train/total_loss": 0.11740867048501968 }, { "entropy": 9.2008695602417, "epoch": 0.5553687957286929, "mean_token_accuracy": 0.7620967626571655, "num_tokens": 8415928.0, "step": 5617, "train/ce_loss": 1.3845499753952026 }, { "epoch": 0.5553687957286929, "step": 5617, "train/sim_loss": 0.05859375 }, { "epoch": 0.5553687957286929, "step": 5617, "train/total_loss": 0.19704875349998474 }, { "entropy": 8.806562423706055, "epoch": 0.5554676685782084, "mean_token_accuracy": 0.7621621489524841, "num_tokens": 8421297.0, "step": 5618, "train/ce_loss": 0.6530529856681824 }, { "epoch": 0.5554676685782084, "step": 5618, "train/sim_loss": 0.0625 }, { "epoch": 0.5554676685782084, "step": 5618, "train/total_loss": 0.12780529260635376 }, { "entropy": 9.03904914855957, "epoch": 0.555566541427724, "mean_token_accuracy": 0.6881720423698425, "num_tokens": 8426593.0, "step": 5619, "train/ce_loss": 0.9342097043991089 }, { "epoch": 0.555566541427724, "step": 5619, "train/sim_loss": 0.03515625 }, { "epoch": 0.555566541427724, "step": 5619, "train/total_loss": 0.12857723236083984 }, { "epoch": 0.5556654142772395, "grad_norm": 0.8386627435684204, "learning_rate": 8.61321267863324e-06, "loss": 0.1416, "step": 5620 }, { "entropy": 9.006744384765625, "epoch": 0.5556654142772395, "mean_token_accuracy": 0.7448856830596924, "num_tokens": 8431952.0, "step": 5620, "train/ce_loss": 0.7422356605529785 }, { "epoch": 0.5556654142772395, "step": 5620, "train/sim_loss": 0.06640625 }, { "epoch": 0.5556654142772395, "step": 5620, "train/total_loss": 0.1406298279762268 }, { "entropy": 9.266400337219238, "epoch": 0.555764287126755, "mean_token_accuracy": 0.7294750809669495, "num_tokens": 8437140.0, "step": 5621, "train/ce_loss": 0.8088457584381104 }, { "epoch": 0.555764287126755, "step": 5621, "train/sim_loss": 0.046875 }, { "epoch": 0.555764287126755, "step": 5621, "train/total_loss": 0.12775957584381104 }, { "entropy": 9.065841674804688, "epoch": 0.5558631599762706, "mean_token_accuracy": 0.7953431606292725, "num_tokens": 8442452.0, "step": 5622, "train/ce_loss": 0.9971874356269836 }, { "epoch": 0.5558631599762706, "step": 5622, "train/sim_loss": 0.06640625 }, { "epoch": 0.5558631599762706, "step": 5622, "train/total_loss": 0.16612499952316284 }, { "entropy": 9.286073684692383, "epoch": 0.555962032825786, "mean_token_accuracy": 0.7595356702804565, "num_tokens": 8447535.0, "step": 5623, "train/ce_loss": 1.1132365465164185 }, { "epoch": 0.555962032825786, "step": 5623, "train/sim_loss": 0.05078125 }, { "epoch": 0.555962032825786, "step": 5623, "train/total_loss": 0.16210490465164185 }, { "entropy": 9.020340919494629, "epoch": 0.5560609056753015, "mean_token_accuracy": 0.7164705991744995, "num_tokens": 8452899.0, "step": 5624, "train/ce_loss": 1.5640804767608643 }, { "epoch": 0.5560609056753015, "step": 5624, "train/sim_loss": 0.046875 }, { "epoch": 0.5560609056753015, "step": 5624, "train/total_loss": 0.20328305661678314 }, { "entropy": 8.8846435546875, "epoch": 0.5561597785248171, "mean_token_accuracy": 0.7750582695007324, "num_tokens": 8458215.0, "step": 5625, "train/ce_loss": 0.4915861487388611 }, { "epoch": 0.5561597785248171, "step": 5625, "train/sim_loss": 0.01953125 }, { "epoch": 0.5561597785248171, "step": 5625, "train/total_loss": 0.06868986785411835 }, { "entropy": 9.303529739379883, "epoch": 0.5562586513743326, "mean_token_accuracy": 0.7274052500724792, "num_tokens": 8463356.0, "step": 5626, "train/ce_loss": 9.550376489642076e-07 }, { "epoch": 0.5562586513743326, "step": 5626, "train/sim_loss": 0.03125 }, { "epoch": 0.5562586513743326, "step": 5626, "train/total_loss": 0.03125009685754776 }, { "entropy": 9.091670989990234, "epoch": 0.5563575242238481, "mean_token_accuracy": 0.7559523582458496, "num_tokens": 8468465.0, "step": 5627, "train/ce_loss": 1.0190938711166382 }, { "epoch": 0.5563575242238481, "step": 5627, "train/sim_loss": 0.0703125 }, { "epoch": 0.5563575242238481, "step": 5627, "train/total_loss": 0.17222189903259277 }, { "entropy": 9.180963516235352, "epoch": 0.5564563970733637, "mean_token_accuracy": 0.7305389046669006, "num_tokens": 8473808.0, "step": 5628, "train/ce_loss": 1.656849980354309 }, { "epoch": 0.5564563970733637, "step": 5628, "train/sim_loss": 0.0546875 }, { "epoch": 0.5564563970733637, "step": 5628, "train/total_loss": 0.2203724980354309 }, { "entropy": 9.32286262512207, "epoch": 0.5565552699228792, "mean_token_accuracy": 0.7824143171310425, "num_tokens": 8478937.0, "step": 5629, "train/ce_loss": 1.4140739494905574e-06 }, { "epoch": 0.5565552699228792, "step": 5629, "train/sim_loss": 0.0546875 }, { "epoch": 0.5565552699228792, "step": 5629, "train/total_loss": 0.05468764156103134 }, { "entropy": 9.391812324523926, "epoch": 0.5566541427723947, "mean_token_accuracy": 0.7224805951118469, "num_tokens": 8484065.0, "step": 5630, "train/ce_loss": 1.5405515796373948e-06 }, { "epoch": 0.5566541427723947, "step": 5630, "train/sim_loss": 0.0390625 }, { "epoch": 0.5566541427723947, "step": 5630, "train/total_loss": 0.03906265273690224 }, { "entropy": 8.987224578857422, "epoch": 0.5567530156219103, "mean_token_accuracy": 0.7538280487060547, "num_tokens": 8489403.0, "step": 5631, "train/ce_loss": 0.8874568939208984 }, { "epoch": 0.5567530156219103, "step": 5631, "train/sim_loss": 0.0546875 }, { "epoch": 0.5567530156219103, "step": 5631, "train/total_loss": 0.14343318343162537 }, { "entropy": 8.805734634399414, "epoch": 0.5568518884714257, "mean_token_accuracy": 0.7350332736968994, "num_tokens": 8494812.0, "step": 5632, "train/ce_loss": 0.5978017449378967 }, { "epoch": 0.5568518884714257, "step": 5632, "train/sim_loss": 0.02734375 }, { "epoch": 0.5568518884714257, "step": 5632, "train/total_loss": 0.08712393045425415 }, { "entropy": 9.129465103149414, "epoch": 0.5569507613209412, "mean_token_accuracy": 0.7413554787635803, "num_tokens": 8500012.0, "step": 5633, "train/ce_loss": 1.4004480838775635 }, { "epoch": 0.5569507613209412, "step": 5633, "train/sim_loss": 0.0859375 }, { "epoch": 0.5569507613209412, "step": 5633, "train/total_loss": 0.22598230838775635 }, { "entropy": 9.708211898803711, "epoch": 0.5570496341704568, "mean_token_accuracy": 0.7262693047523499, "num_tokens": 8504885.0, "step": 5634, "train/ce_loss": 2.0937819480895996 }, { "epoch": 0.5570496341704568, "step": 5634, "train/sim_loss": 0.0625 }, { "epoch": 0.5570496341704568, "step": 5634, "train/total_loss": 0.271878182888031 }, { "entropy": 8.60002326965332, "epoch": 0.5571485070199723, "mean_token_accuracy": 0.7036669850349426, "num_tokens": 8510350.0, "step": 5635, "train/ce_loss": 0.8146705031394958 }, { "epoch": 0.5571485070199723, "step": 5635, "train/sim_loss": 0.04296875 }, { "epoch": 0.5571485070199723, "step": 5635, "train/total_loss": 0.12443580478429794 }, { "entropy": 9.08309268951416, "epoch": 0.5572473798694878, "mean_token_accuracy": 0.7432065010070801, "num_tokens": 8515556.0, "step": 5636, "train/ce_loss": 0.4288504719734192 }, { "epoch": 0.5572473798694878, "step": 5636, "train/sim_loss": 0.05078125 }, { "epoch": 0.5572473798694878, "step": 5636, "train/total_loss": 0.09366630017757416 }, { "entropy": 8.686668395996094, "epoch": 0.5573462527190034, "mean_token_accuracy": 0.7473806738853455, "num_tokens": 8520870.0, "step": 5637, "train/ce_loss": 0.8942636847496033 }, { "epoch": 0.5573462527190034, "step": 5637, "train/sim_loss": 0.05078125 }, { "epoch": 0.5573462527190034, "step": 5637, "train/total_loss": 0.14020761847496033 }, { "entropy": 9.009510040283203, "epoch": 0.5574451255685189, "mean_token_accuracy": 0.7184594869613647, "num_tokens": 8526107.0, "step": 5638, "train/ce_loss": 0.5916759967803955 }, { "epoch": 0.5574451255685189, "step": 5638, "train/sim_loss": 0.0625 }, { "epoch": 0.5574451255685189, "step": 5638, "train/total_loss": 0.12166760116815567 }, { "entropy": 9.53514289855957, "epoch": 0.5575439984180344, "mean_token_accuracy": 0.7396551966667175, "num_tokens": 8531106.0, "step": 5639, "train/ce_loss": 1.0941106081008911 }, { "epoch": 0.5575439984180344, "step": 5639, "train/sim_loss": 0.109375 }, { "epoch": 0.5575439984180344, "step": 5639, "train/total_loss": 0.2187860608100891 }, { "epoch": 0.55764287126755, "grad_norm": 0.7694854140281677, "learning_rate": 8.608267813875292e-06, "loss": 0.14, "step": 5640 }, { "entropy": 9.507564544677734, "epoch": 0.55764287126755, "mean_token_accuracy": 0.7487520575523376, "num_tokens": 8536317.0, "step": 5640, "train/ce_loss": 0.8715117573738098 }, { "epoch": 0.55764287126755, "step": 5640, "train/sim_loss": 0.10546875 }, { "epoch": 0.55764287126755, "step": 5640, "train/total_loss": 0.1926199197769165 }, { "entropy": 8.974124908447266, "epoch": 0.5577417441170655, "mean_token_accuracy": 0.7629213333129883, "num_tokens": 8541712.0, "step": 5641, "train/ce_loss": 0.6484330892562866 }, { "epoch": 0.5577417441170655, "step": 5641, "train/sim_loss": 0.03515625 }, { "epoch": 0.5577417441170655, "step": 5641, "train/total_loss": 0.0999995619058609 }, { "entropy": 8.264684677124023, "epoch": 0.5578406169665809, "mean_token_accuracy": 0.7664377093315125, "num_tokens": 8547168.0, "step": 5642, "train/ce_loss": 0.7456291913986206 }, { "epoch": 0.5578406169665809, "step": 5642, "train/sim_loss": 0.046875 }, { "epoch": 0.5578406169665809, "step": 5642, "train/total_loss": 0.1214379221200943 }, { "entropy": 8.468953132629395, "epoch": 0.5579394898160965, "mean_token_accuracy": 0.735609769821167, "num_tokens": 8552677.0, "step": 5643, "train/ce_loss": 0.7391076683998108 }, { "epoch": 0.5579394898160965, "step": 5643, "train/sim_loss": 0.06640625 }, { "epoch": 0.5579394898160965, "step": 5643, "train/total_loss": 0.14031702280044556 }, { "entropy": 8.90363597869873, "epoch": 0.558038362665612, "mean_token_accuracy": 0.7963855266571045, "num_tokens": 8557937.0, "step": 5644, "train/ce_loss": 0.4510335922241211 }, { "epoch": 0.558038362665612, "step": 5644, "train/sim_loss": 0.03515625 }, { "epoch": 0.558038362665612, "step": 5644, "train/total_loss": 0.08025960624217987 }, { "entropy": 8.877609252929688, "epoch": 0.5581372355151275, "mean_token_accuracy": 0.7982359528541565, "num_tokens": 8563385.0, "step": 5645, "train/ce_loss": 0.7714386582374573 }, { "epoch": 0.5581372355151275, "step": 5645, "train/sim_loss": 0.10546875 }, { "epoch": 0.5581372355151275, "step": 5645, "train/total_loss": 0.18261262774467468 }, { "entropy": 8.755309104919434, "epoch": 0.5582361083646431, "mean_token_accuracy": 0.748110830783844, "num_tokens": 8568666.0, "step": 5646, "train/ce_loss": 0.8779288530349731 }, { "epoch": 0.5582361083646431, "step": 5646, "train/sim_loss": 0.0625 }, { "epoch": 0.5582361083646431, "step": 5646, "train/total_loss": 0.15029288828372955 }, { "entropy": 9.328277587890625, "epoch": 0.5583349812141586, "mean_token_accuracy": 0.7593880295753479, "num_tokens": 8573972.0, "step": 5647, "train/ce_loss": 8.177023573807674e-07 }, { "epoch": 0.5583349812141586, "step": 5647, "train/sim_loss": 0.03125 }, { "epoch": 0.5583349812141586, "step": 5647, "train/total_loss": 0.031250081956386566 }, { "entropy": 8.753901481628418, "epoch": 0.5584338540636741, "mean_token_accuracy": 0.7020000219345093, "num_tokens": 8579475.0, "step": 5648, "train/ce_loss": 0.7852165699005127 }, { "epoch": 0.5584338540636741, "step": 5648, "train/sim_loss": 0.09765625 }, { "epoch": 0.5584338540636741, "step": 5648, "train/total_loss": 0.17617791891098022 }, { "entropy": 9.581437110900879, "epoch": 0.5585327269131897, "mean_token_accuracy": 0.7967742085456848, "num_tokens": 8584686.0, "step": 5649, "train/ce_loss": 0.6837697625160217 }, { "epoch": 0.5585327269131897, "step": 5649, "train/sim_loss": 0.01953125 }, { "epoch": 0.5585327269131897, "step": 5649, "train/total_loss": 0.08790823072195053 }, { "entropy": 9.493171691894531, "epoch": 0.5586315997627052, "mean_token_accuracy": 0.6952381134033203, "num_tokens": 8589667.0, "step": 5650, "train/ce_loss": 1.4867403507232666 }, { "epoch": 0.5586315997627052, "step": 5650, "train/sim_loss": 0.0625 }, { "epoch": 0.5586315997627052, "step": 5650, "train/total_loss": 0.21117404103279114 }, { "entropy": 8.709896087646484, "epoch": 0.5587304726122206, "mean_token_accuracy": 0.7894117832183838, "num_tokens": 8595022.0, "step": 5651, "train/ce_loss": 0.9483495950698853 }, { "epoch": 0.5587304726122206, "step": 5651, "train/sim_loss": 0.0390625 }, { "epoch": 0.5587304726122206, "step": 5651, "train/total_loss": 0.13389745354652405 }, { "entropy": 9.296415328979492, "epoch": 0.5588293454617362, "mean_token_accuracy": 0.7744107842445374, "num_tokens": 8600050.0, "step": 5652, "train/ce_loss": 1.5259263363986975e-06 }, { "epoch": 0.5588293454617362, "step": 5652, "train/sim_loss": 0.02734375 }, { "epoch": 0.5588293454617362, "step": 5652, "train/total_loss": 0.027343902736902237 }, { "entropy": 9.139948844909668, "epoch": 0.5589282183112517, "mean_token_accuracy": 0.7699346542358398, "num_tokens": 8605289.0, "step": 5653, "train/ce_loss": 0.6834624409675598 }, { "epoch": 0.5589282183112517, "step": 5653, "train/sim_loss": 0.0234375 }, { "epoch": 0.5589282183112517, "step": 5653, "train/total_loss": 0.09178374707698822 }, { "entropy": 9.215531349182129, "epoch": 0.5590270911607672, "mean_token_accuracy": 0.8176583647727966, "num_tokens": 8610231.0, "step": 5654, "train/ce_loss": 1.2036710977554321 }, { "epoch": 0.5590270911607672, "step": 5654, "train/sim_loss": 0.08203125 }, { "epoch": 0.5590270911607672, "step": 5654, "train/total_loss": 0.2023983597755432 }, { "entropy": 9.8184232711792, "epoch": 0.5591259640102828, "mean_token_accuracy": 0.7412280440330505, "num_tokens": 8615130.0, "step": 5655, "train/ce_loss": 1.816309350033407e-06 }, { "epoch": 0.5591259640102828, "step": 5655, "train/sim_loss": 0.06640625 }, { "epoch": 0.5591259640102828, "step": 5655, "train/total_loss": 0.06640642881393433 }, { "entropy": 8.991920471191406, "epoch": 0.5592248368597983, "mean_token_accuracy": 0.7032085657119751, "num_tokens": 8620348.0, "step": 5656, "train/ce_loss": 0.9642362594604492 }, { "epoch": 0.5592248368597983, "step": 5656, "train/sim_loss": 0.11328125 }, { "epoch": 0.5592248368597983, "step": 5656, "train/total_loss": 0.20970487594604492 }, { "entropy": 8.72332763671875, "epoch": 0.5593237097093138, "mean_token_accuracy": 0.7709251046180725, "num_tokens": 8625746.0, "step": 5657, "train/ce_loss": 0.8530164361000061 }, { "epoch": 0.5593237097093138, "step": 5657, "train/sim_loss": 0.0390625 }, { "epoch": 0.5593237097093138, "step": 5657, "train/total_loss": 0.12436414510011673 }, { "entropy": 9.364713668823242, "epoch": 0.5594225825588294, "mean_token_accuracy": 0.7829457521438599, "num_tokens": 8630747.0, "step": 5658, "train/ce_loss": 1.0951863527297974 }, { "epoch": 0.5594225825588294, "step": 5658, "train/sim_loss": 0.07421875 }, { "epoch": 0.5594225825588294, "step": 5658, "train/total_loss": 0.1837373971939087 }, { "entropy": 8.714923858642578, "epoch": 0.5595214554083449, "mean_token_accuracy": 0.7194412350654602, "num_tokens": 8636109.0, "step": 5659, "train/ce_loss": 0.4874354898929596 }, { "epoch": 0.5595214554083449, "step": 5659, "train/sim_loss": 0.046875 }, { "epoch": 0.5595214554083449, "step": 5659, "train/total_loss": 0.09561854600906372 }, { "epoch": 0.5596203282578603, "grad_norm": 0.7099284529685974, "learning_rate": 8.603322949117342e-06, "loss": 0.1325, "step": 5660 }, { "entropy": 8.914743423461914, "epoch": 0.5596203282578603, "mean_token_accuracy": 0.7547169923782349, "num_tokens": 8641401.0, "step": 5660, "train/ce_loss": 0.7595081329345703 }, { "epoch": 0.5596203282578603, "step": 5660, "train/sim_loss": 0.07421875 }, { "epoch": 0.5596203282578603, "step": 5660, "train/total_loss": 0.15016956627368927 }, { "entropy": 9.22370433807373, "epoch": 0.5597192011073759, "mean_token_accuracy": 0.7557603716850281, "num_tokens": 8646495.0, "step": 5661, "train/ce_loss": 1.8618015928950626e-06 }, { "epoch": 0.5597192011073759, "step": 5661, "train/sim_loss": 0.02734375 }, { "epoch": 0.5597192011073759, "step": 5661, "train/total_loss": 0.027343936264514923 }, { "entropy": 8.675178527832031, "epoch": 0.5598180739568914, "mean_token_accuracy": 0.711904764175415, "num_tokens": 8651791.0, "step": 5662, "train/ce_loss": 1.1464154720306396 }, { "epoch": 0.5598180739568914, "step": 5662, "train/sim_loss": 0.12890625 }, { "epoch": 0.5598180739568914, "step": 5662, "train/total_loss": 0.24354779720306396 }, { "entropy": 8.901063919067383, "epoch": 0.5599169468064069, "mean_token_accuracy": 0.7570694088935852, "num_tokens": 8657026.0, "step": 5663, "train/ce_loss": 0.45017698407173157 }, { "epoch": 0.5599169468064069, "step": 5663, "train/sim_loss": 0.0546875 }, { "epoch": 0.5599169468064069, "step": 5663, "train/total_loss": 0.09970520436763763 }, { "entropy": 9.59109115600586, "epoch": 0.5600158196559225, "mean_token_accuracy": 0.7450658082962036, "num_tokens": 8662195.0, "step": 5664, "train/ce_loss": 1.289162278175354 }, { "epoch": 0.5600158196559225, "step": 5664, "train/sim_loss": 0.078125 }, { "epoch": 0.5600158196559225, "step": 5664, "train/total_loss": 0.20704123377799988 }, { "entropy": 9.324518203735352, "epoch": 0.560114692505438, "mean_token_accuracy": 0.7675111889839172, "num_tokens": 8667321.0, "step": 5665, "train/ce_loss": 0.3714592456817627 }, { "epoch": 0.560114692505438, "step": 5665, "train/sim_loss": 0.04296875 }, { "epoch": 0.560114692505438, "step": 5665, "train/total_loss": 0.08011467754840851 }, { "entropy": 8.930580139160156, "epoch": 0.5602135653549535, "mean_token_accuracy": 0.7133758068084717, "num_tokens": 8672583.0, "step": 5666, "train/ce_loss": 0.8320925831794739 }, { "epoch": 0.5602135653549535, "step": 5666, "train/sim_loss": 0.03125 }, { "epoch": 0.5602135653549535, "step": 5666, "train/total_loss": 0.11445926129817963 }, { "entropy": 9.13973331451416, "epoch": 0.5603124382044691, "mean_token_accuracy": 0.7386519908905029, "num_tokens": 8677824.0, "step": 5667, "train/ce_loss": 0.716332197189331 }, { "epoch": 0.5603124382044691, "step": 5667, "train/sim_loss": 0.05859375 }, { "epoch": 0.5603124382044691, "step": 5667, "train/total_loss": 0.1302269697189331 }, { "entropy": 9.639233589172363, "epoch": 0.5604113110539846, "mean_token_accuracy": 0.7438691854476929, "num_tokens": 8682613.0, "step": 5668, "train/ce_loss": 1.7468125820159912 }, { "epoch": 0.5604113110539846, "step": 5668, "train/sim_loss": 0.0390625 }, { "epoch": 0.5604113110539846, "step": 5668, "train/total_loss": 0.21374376118183136 }, { "entropy": 9.132619857788086, "epoch": 0.5605101839035, "mean_token_accuracy": 0.7081760764122009, "num_tokens": 8687793.0, "step": 5669, "train/ce_loss": 0.6844165921211243 }, { "epoch": 0.5605101839035, "step": 5669, "train/sim_loss": 0.0390625 }, { "epoch": 0.5605101839035, "step": 5669, "train/total_loss": 0.10750415921211243 }, { "entropy": 9.330621719360352, "epoch": 0.5606090567530156, "mean_token_accuracy": 0.7121661901473999, "num_tokens": 8692919.0, "step": 5670, "train/ce_loss": 3.438371777519933e-06 }, { "epoch": 0.5606090567530156, "step": 5670, "train/sim_loss": 0.046875 }, { "epoch": 0.5606090567530156, "step": 5670, "train/total_loss": 0.04687534272670746 }, { "entropy": 8.55720329284668, "epoch": 0.5607079296025311, "mean_token_accuracy": 0.7661691308021545, "num_tokens": 8698438.0, "step": 5671, "train/ce_loss": 0.4348050355911255 }, { "epoch": 0.5607079296025311, "step": 5671, "train/sim_loss": 0.0234375 }, { "epoch": 0.5607079296025311, "step": 5671, "train/total_loss": 0.06691800057888031 }, { "entropy": 8.937002182006836, "epoch": 0.5608068024520466, "mean_token_accuracy": 0.7334801554679871, "num_tokens": 8703803.0, "step": 5672, "train/ce_loss": 0.9411271810531616 }, { "epoch": 0.5608068024520466, "step": 5672, "train/sim_loss": 0.05078125 }, { "epoch": 0.5608068024520466, "step": 5672, "train/total_loss": 0.14489397406578064 }, { "entropy": 8.997447967529297, "epoch": 0.5609056753015622, "mean_token_accuracy": 0.7618438005447388, "num_tokens": 8709092.0, "step": 5673, "train/ce_loss": 0.7748585939407349 }, { "epoch": 0.5609056753015622, "step": 5673, "train/sim_loss": 0.03515625 }, { "epoch": 0.5609056753015622, "step": 5673, "train/total_loss": 0.11264210939407349 }, { "entropy": 9.029390335083008, "epoch": 0.5610045481510777, "mean_token_accuracy": 0.7461809515953064, "num_tokens": 8714419.0, "step": 5674, "train/ce_loss": 1.474387526512146 }, { "epoch": 0.5610045481510777, "step": 5674, "train/sim_loss": 0.0546875 }, { "epoch": 0.5610045481510777, "step": 5674, "train/total_loss": 0.20212624967098236 }, { "entropy": 8.8839750289917, "epoch": 0.5611034210005932, "mean_token_accuracy": 0.7093167901039124, "num_tokens": 8719669.0, "step": 5675, "train/ce_loss": 0.7157660722732544 }, { "epoch": 0.5611034210005932, "step": 5675, "train/sim_loss": 0.0390625 }, { "epoch": 0.5611034210005932, "step": 5675, "train/total_loss": 0.11063911020755768 }, { "entropy": 8.97752571105957, "epoch": 0.5612022938501088, "mean_token_accuracy": 0.7981651425361633, "num_tokens": 8724910.0, "step": 5676, "train/ce_loss": 0.7324572205543518 }, { "epoch": 0.5612022938501088, "step": 5676, "train/sim_loss": 0.0625 }, { "epoch": 0.5612022938501088, "step": 5676, "train/total_loss": 0.13574573397636414 }, { "entropy": 9.560317039489746, "epoch": 0.5613011666996243, "mean_token_accuracy": 0.810606062412262, "num_tokens": 8729871.0, "step": 5677, "train/ce_loss": 0.867394745349884 }, { "epoch": 0.5613011666996243, "step": 5677, "train/sim_loss": 0.0234375 }, { "epoch": 0.5613011666996243, "step": 5677, "train/total_loss": 0.11017697304487228 }, { "entropy": 9.277361869812012, "epoch": 0.5614000395491398, "mean_token_accuracy": 0.7605396509170532, "num_tokens": 8734895.0, "step": 5678, "train/ce_loss": 0.9489947557449341 }, { "epoch": 0.5614000395491398, "step": 5678, "train/sim_loss": 0.05078125 }, { "epoch": 0.5614000395491398, "step": 5678, "train/total_loss": 0.1456807255744934 }, { "entropy": 8.804372787475586, "epoch": 0.5614989123986553, "mean_token_accuracy": 0.7472160458564758, "num_tokens": 8740298.0, "step": 5679, "train/ce_loss": 1.4822356700897217 }, { "epoch": 0.5614989123986553, "step": 5679, "train/sim_loss": 0.11328125 }, { "epoch": 0.5614989123986553, "step": 5679, "train/total_loss": 0.2615048289299011 }, { "epoch": 0.5615977852481708, "grad_norm": 0.7704647183418274, "learning_rate": 8.598378084359393e-06, "loss": 0.1372, "step": 5680 }, { "entropy": 8.915640830993652, "epoch": 0.5615977852481708, "mean_token_accuracy": 0.7661574482917786, "num_tokens": 8745618.0, "step": 5680, "train/ce_loss": 0.8491262793540955 }, { "epoch": 0.5615977852481708, "step": 5680, "train/sim_loss": 0.015625 }, { "epoch": 0.5615977852481708, "step": 5680, "train/total_loss": 0.10053762793540955 }, { "entropy": 9.401718139648438, "epoch": 0.5616966580976864, "mean_token_accuracy": 0.7157894968986511, "num_tokens": 8750598.0, "step": 5681, "train/ce_loss": 2.3388354778289795 }, { "epoch": 0.5616966580976864, "step": 5681, "train/sim_loss": 0.0859375 }, { "epoch": 0.5616966580976864, "step": 5681, "train/total_loss": 0.3198210597038269 }, { "entropy": 8.893880844116211, "epoch": 0.5617955309472019, "mean_token_accuracy": 0.7247706651687622, "num_tokens": 8755811.0, "step": 5682, "train/ce_loss": 0.6774603724479675 }, { "epoch": 0.5617955309472019, "step": 5682, "train/sim_loss": 0.05078125 }, { "epoch": 0.5617955309472019, "step": 5682, "train/total_loss": 0.11852728575468063 }, { "entropy": 9.121014595031738, "epoch": 0.5618944037967174, "mean_token_accuracy": 0.792682945728302, "num_tokens": 8760992.0, "step": 5683, "train/ce_loss": 0.852364718914032 }, { "epoch": 0.5618944037967174, "step": 5683, "train/sim_loss": 0.1171875 }, { "epoch": 0.5618944037967174, "step": 5683, "train/total_loss": 0.20242397487163544 }, { "entropy": 9.24246597290039, "epoch": 0.561993276646233, "mean_token_accuracy": 0.7210526466369629, "num_tokens": 8765999.0, "step": 5684, "train/ce_loss": 0.7150706648826599 }, { "epoch": 0.561993276646233, "step": 5684, "train/sim_loss": 0.0234375 }, { "epoch": 0.561993276646233, "step": 5684, "train/total_loss": 0.09494456648826599 }, { "entropy": 9.18613338470459, "epoch": 0.5620921494957485, "mean_token_accuracy": 0.6899350881576538, "num_tokens": 8771036.0, "step": 5685, "train/ce_loss": 5.070203883406066e-07 }, { "epoch": 0.5620921494957485, "step": 5685, "train/sim_loss": 0.01953125 }, { "epoch": 0.5620921494957485, "step": 5685, "train/total_loss": 0.01953130029141903 }, { "entropy": 8.672883987426758, "epoch": 0.562191022345264, "mean_token_accuracy": 0.7551440596580505, "num_tokens": 8776466.0, "step": 5686, "train/ce_loss": 0.8789169192314148 }, { "epoch": 0.562191022345264, "step": 5686, "train/sim_loss": 0.1015625 }, { "epoch": 0.562191022345264, "step": 5686, "train/total_loss": 0.18945419788360596 }, { "entropy": 9.249173164367676, "epoch": 0.5622898951947796, "mean_token_accuracy": 0.7518796920776367, "num_tokens": 8781440.0, "step": 5687, "train/ce_loss": 0.6717962026596069 }, { "epoch": 0.5622898951947796, "step": 5687, "train/sim_loss": 0.0390625 }, { "epoch": 0.5622898951947796, "step": 5687, "train/total_loss": 0.1062421202659607 }, { "entropy": 9.638214111328125, "epoch": 0.562388768044295, "mean_token_accuracy": 0.7749999761581421, "num_tokens": 8786281.0, "step": 5688, "train/ce_loss": 1.647937297821045 }, { "epoch": 0.562388768044295, "step": 5688, "train/sim_loss": 0.046875 }, { "epoch": 0.562388768044295, "step": 5688, "train/total_loss": 0.2116687297821045 }, { "entropy": 8.669907569885254, "epoch": 0.5624876408938105, "mean_token_accuracy": 0.7421109676361084, "num_tokens": 8791594.0, "step": 5689, "train/ce_loss": 0.7021977305412292 }, { "epoch": 0.5624876408938105, "step": 5689, "train/sim_loss": 0.05859375 }, { "epoch": 0.5624876408938105, "step": 5689, "train/total_loss": 0.12881353497505188 }, { "entropy": 8.93558406829834, "epoch": 0.5625865137433261, "mean_token_accuracy": 0.7482876777648926, "num_tokens": 8796644.0, "step": 5690, "train/ce_loss": 1.319148063659668 }, { "epoch": 0.5625865137433261, "step": 5690, "train/sim_loss": 0.05859375 }, { "epoch": 0.5625865137433261, "step": 5690, "train/total_loss": 0.19050855934619904 }, { "entropy": 8.610952377319336, "epoch": 0.5626853865928416, "mean_token_accuracy": 0.7578475475311279, "num_tokens": 8802089.0, "step": 5691, "train/ce_loss": 0.718697190284729 }, { "epoch": 0.5626853865928416, "step": 5691, "train/sim_loss": 0.0703125 }, { "epoch": 0.5626853865928416, "step": 5691, "train/total_loss": 0.14218223094940186 }, { "entropy": 9.175633430480957, "epoch": 0.5627842594423571, "mean_token_accuracy": 0.7532656192779541, "num_tokens": 8807383.0, "step": 5692, "train/ce_loss": 0.9090287089347839 }, { "epoch": 0.5627842594423571, "step": 5692, "train/sim_loss": 0.04296875 }, { "epoch": 0.5627842594423571, "step": 5692, "train/total_loss": 0.13387161493301392 }, { "entropy": 8.94670295715332, "epoch": 0.5628831322918727, "mean_token_accuracy": 0.793379008769989, "num_tokens": 8812742.0, "step": 5693, "train/ce_loss": 0.7289281487464905 }, { "epoch": 0.5628831322918727, "step": 5693, "train/sim_loss": 0.015625 }, { "epoch": 0.5628831322918727, "step": 5693, "train/total_loss": 0.08851781487464905 }, { "entropy": 8.990468978881836, "epoch": 0.5629820051413882, "mean_token_accuracy": 0.791293203830719, "num_tokens": 8818001.0, "step": 5694, "train/ce_loss": 0.6125853657722473 }, { "epoch": 0.5629820051413882, "step": 5694, "train/sim_loss": 0.046875 }, { "epoch": 0.5629820051413882, "step": 5694, "train/total_loss": 0.10813353955745697 }, { "entropy": 8.891982078552246, "epoch": 0.5630808779909037, "mean_token_accuracy": 0.7771493196487427, "num_tokens": 8823232.0, "step": 5695, "train/ce_loss": 0.6236464381217957 }, { "epoch": 0.5630808779909037, "step": 5695, "train/sim_loss": 0.0234375 }, { "epoch": 0.5630808779909037, "step": 5695, "train/total_loss": 0.08580214530229568 }, { "entropy": 9.578967094421387, "epoch": 0.5631797508404193, "mean_token_accuracy": 0.7374100685119629, "num_tokens": 8828219.0, "step": 5696, "train/ce_loss": 1.7185809610964498e-06 }, { "epoch": 0.5631797508404193, "step": 5696, "train/sim_loss": 0.06640625 }, { "epoch": 0.5631797508404193, "step": 5696, "train/total_loss": 0.06640642136335373 }, { "entropy": 8.8345947265625, "epoch": 0.5632786236899348, "mean_token_accuracy": 0.7513455152511597, "num_tokens": 8833644.0, "step": 5697, "train/ce_loss": 1.2025315761566162 }, { "epoch": 0.5632786236899348, "step": 5697, "train/sim_loss": 0.109375 }, { "epoch": 0.5632786236899348, "step": 5697, "train/total_loss": 0.22962816059589386 }, { "entropy": 9.412832260131836, "epoch": 0.5633774965394502, "mean_token_accuracy": 0.7787742614746094, "num_tokens": 8838743.0, "step": 5698, "train/ce_loss": 1.0013915300369263 }, { "epoch": 0.5633774965394502, "step": 5698, "train/sim_loss": 0.046875 }, { "epoch": 0.5633774965394502, "step": 5698, "train/total_loss": 0.14701415598392487 }, { "entropy": 9.53570556640625, "epoch": 0.5634763693889658, "mean_token_accuracy": 0.7682333588600159, "num_tokens": 8843736.0, "step": 5699, "train/ce_loss": 2.7366942958906293e-05 }, { "epoch": 0.5634763693889658, "step": 5699, "train/sim_loss": 0.0234375 }, { "epoch": 0.5634763693889658, "step": 5699, "train/total_loss": 0.02344023622572422 }, { "epoch": 0.5635752422384813, "grad_norm": 0.6368483901023865, "learning_rate": 8.593433219601445e-06, "loss": 0.129, "step": 5700 }, { "entropy": 8.884232521057129, "epoch": 0.5635752422384813, "mean_token_accuracy": 0.7505567669868469, "num_tokens": 8849050.0, "step": 5700, "train/ce_loss": 0.9633924961090088 }, { "epoch": 0.5635752422384813, "step": 5700, "train/sim_loss": 0.0546875 }, { "epoch": 0.5635752422384813, "step": 5700, "train/total_loss": 0.15102675557136536 }, { "entropy": 8.801679611206055, "epoch": 0.5636741150879968, "mean_token_accuracy": 0.733485221862793, "num_tokens": 8854369.0, "step": 5701, "train/ce_loss": 1.0334956645965576 }, { "epoch": 0.5636741150879968, "step": 5701, "train/sim_loss": 0.078125 }, { "epoch": 0.5636741150879968, "step": 5701, "train/total_loss": 0.18147456645965576 }, { "entropy": 8.707024574279785, "epoch": 0.5637729879375124, "mean_token_accuracy": 0.7114093899726868, "num_tokens": 8859908.0, "step": 5702, "train/ce_loss": 0.5167938470840454 }, { "epoch": 0.5637729879375124, "step": 5702, "train/sim_loss": 0.015625 }, { "epoch": 0.5637729879375124, "step": 5702, "train/total_loss": 0.06730438768863678 }, { "entropy": 9.086158752441406, "epoch": 0.5638718607870279, "mean_token_accuracy": 0.6867779493331909, "num_tokens": 8865165.0, "step": 5703, "train/ce_loss": 1.7837257385253906 }, { "epoch": 0.5638718607870279, "step": 5703, "train/sim_loss": 0.06640625 }, { "epoch": 0.5638718607870279, "step": 5703, "train/total_loss": 0.2447788268327713 }, { "entropy": 9.026750564575195, "epoch": 0.5639707336365434, "mean_token_accuracy": 0.6993630528450012, "num_tokens": 8870416.0, "step": 5704, "train/ce_loss": 0.8165999054908752 }, { "epoch": 0.5639707336365434, "step": 5704, "train/sim_loss": 0.03125 }, { "epoch": 0.5639707336365434, "step": 5704, "train/total_loss": 0.11290999501943588 }, { "entropy": 9.089848518371582, "epoch": 0.564069606486059, "mean_token_accuracy": 0.7402777671813965, "num_tokens": 8875584.0, "step": 5705, "train/ce_loss": 4.269746114005102e-06 }, { "epoch": 0.564069606486059, "step": 5705, "train/sim_loss": 0.0390625 }, { "epoch": 0.564069606486059, "step": 5705, "train/total_loss": 0.03906292840838432 }, { "entropy": 9.04459285736084, "epoch": 0.5641684793355745, "mean_token_accuracy": 0.739847719669342, "num_tokens": 8880899.0, "step": 5706, "train/ce_loss": 0.8907245993614197 }, { "epoch": 0.5641684793355745, "step": 5706, "train/sim_loss": 0.0703125 }, { "epoch": 0.5641684793355745, "step": 5706, "train/total_loss": 0.15938496589660645 }, { "entropy": 8.861656188964844, "epoch": 0.5642673521850899, "mean_token_accuracy": 0.65625, "num_tokens": 8886216.0, "step": 5707, "train/ce_loss": 1.6117205619812012 }, { "epoch": 0.5642673521850899, "step": 5707, "train/sim_loss": 0.09375 }, { "epoch": 0.5642673521850899, "step": 5707, "train/total_loss": 0.2549220621585846 }, { "entropy": 9.780906677246094, "epoch": 0.5643662250346055, "mean_token_accuracy": 0.7160493731498718, "num_tokens": 8891058.0, "step": 5708, "train/ce_loss": 2.286335984535981e-06 }, { "epoch": 0.5643662250346055, "step": 5708, "train/sim_loss": 0.0390625 }, { "epoch": 0.5643662250346055, "step": 5708, "train/total_loss": 0.039062727242708206 }, { "entropy": 9.029703140258789, "epoch": 0.564465097884121, "mean_token_accuracy": 0.7704517841339111, "num_tokens": 8896294.0, "step": 5709, "train/ce_loss": 0.8209226727485657 }, { "epoch": 0.564465097884121, "step": 5709, "train/sim_loss": 0.0546875 }, { "epoch": 0.564465097884121, "step": 5709, "train/total_loss": 0.1367797702550888 }, { "entropy": 8.482695579528809, "epoch": 0.5645639707336365, "mean_token_accuracy": 0.7508772015571594, "num_tokens": 8901869.0, "step": 5710, "train/ce_loss": 1.0111846923828125 }, { "epoch": 0.5645639707336365, "step": 5710, "train/sim_loss": 0.05078125 }, { "epoch": 0.5645639707336365, "step": 5710, "train/total_loss": 0.15189972519874573 }, { "entropy": 9.391302108764648, "epoch": 0.5646628435831521, "mean_token_accuracy": 0.7514880895614624, "num_tokens": 8906958.0, "step": 5711, "train/ce_loss": 0.8454544544219971 }, { "epoch": 0.5646628435831521, "step": 5711, "train/sim_loss": 0.02734375 }, { "epoch": 0.5646628435831521, "step": 5711, "train/total_loss": 0.11188919842243195 }, { "entropy": 9.042204856872559, "epoch": 0.5647617164326676, "mean_token_accuracy": 0.7222222089767456, "num_tokens": 8912267.0, "step": 5712, "train/ce_loss": 0.9595696926116943 }, { "epoch": 0.5647617164326676, "step": 5712, "train/sim_loss": 0.06640625 }, { "epoch": 0.5647617164326676, "step": 5712, "train/total_loss": 0.1623632311820984 }, { "entropy": 9.324300765991211, "epoch": 0.5648605892821831, "mean_token_accuracy": 0.7474892139434814, "num_tokens": 8917395.0, "step": 5713, "train/ce_loss": 0.9969939589500427 }, { "epoch": 0.5648605892821831, "step": 5713, "train/sim_loss": 0.0625 }, { "epoch": 0.5648605892821831, "step": 5713, "train/total_loss": 0.16219940781593323 }, { "entropy": 8.86493968963623, "epoch": 0.5649594621316987, "mean_token_accuracy": 0.6859956383705139, "num_tokens": 8922773.0, "step": 5714, "train/ce_loss": 0.708761990070343 }, { "epoch": 0.5649594621316987, "step": 5714, "train/sim_loss": 0.05859375 }, { "epoch": 0.5649594621316987, "step": 5714, "train/total_loss": 0.12946996092796326 }, { "entropy": 9.991195678710938, "epoch": 0.5650583349812142, "mean_token_accuracy": 0.739130437374115, "num_tokens": 8927471.0, "step": 5715, "train/ce_loss": 1.235878348350525 }, { "epoch": 0.5650583349812142, "step": 5715, "train/sim_loss": 0.05078125 }, { "epoch": 0.5650583349812142, "step": 5715, "train/total_loss": 0.17436909675598145 }, { "entropy": 8.891902923583984, "epoch": 0.5651572078307296, "mean_token_accuracy": 0.7307262420654297, "num_tokens": 8932795.0, "step": 5716, "train/ce_loss": 0.42581817507743835 }, { "epoch": 0.5651572078307296, "step": 5716, "train/sim_loss": 0.0234375 }, { "epoch": 0.5651572078307296, "step": 5716, "train/total_loss": 0.06601931899785995 }, { "entropy": 9.35824966430664, "epoch": 0.5652560806802452, "mean_token_accuracy": 0.7387820482254028, "num_tokens": 8937883.0, "step": 5717, "train/ce_loss": 1.1495380401611328 }, { "epoch": 0.5652560806802452, "step": 5717, "train/sim_loss": 0.0703125 }, { "epoch": 0.5652560806802452, "step": 5717, "train/total_loss": 0.18526631593704224 }, { "entropy": 9.609567642211914, "epoch": 0.5653549535297607, "mean_token_accuracy": 0.7029288411140442, "num_tokens": 8942819.0, "step": 5718, "train/ce_loss": 1.4963005696699838e-06 }, { "epoch": 0.5653549535297607, "step": 5718, "train/sim_loss": 0.03515625 }, { "epoch": 0.5653549535297607, "step": 5718, "train/total_loss": 0.03515639901161194 }, { "entropy": 9.406478881835938, "epoch": 0.5654538263792762, "mean_token_accuracy": 0.6909937858581543, "num_tokens": 8947942.0, "step": 5719, "train/ce_loss": 1.1935173915844643e-06 }, { "epoch": 0.5654538263792762, "step": 5719, "train/sim_loss": 0.0625 }, { "epoch": 0.5654538263792762, "step": 5719, "train/total_loss": 0.06250011920928955 }, { "epoch": 0.5655526992287918, "grad_norm": 0.7832507491111755, "learning_rate": 8.588488354843496e-06, "loss": 0.1492, "step": 5720 }, { "entropy": 8.786664009094238, "epoch": 0.5655526992287918, "mean_token_accuracy": 0.7355035543441772, "num_tokens": 8953406.0, "step": 5720, "train/ce_loss": 1.0731121301651 }, { "epoch": 0.5655526992287918, "step": 5720, "train/sim_loss": 0.046875 }, { "epoch": 0.5655526992287918, "step": 5720, "train/total_loss": 0.1541862189769745 }, { "entropy": 9.160669326782227, "epoch": 0.5656515720783073, "mean_token_accuracy": 0.673202633857727, "num_tokens": 8958626.0, "step": 5721, "train/ce_loss": 0.8698038458824158 }, { "epoch": 0.5656515720783073, "step": 5721, "train/sim_loss": 0.0703125 }, { "epoch": 0.5656515720783073, "step": 5721, "train/total_loss": 0.15729288756847382 }, { "entropy": 9.497705459594727, "epoch": 0.5657504449278228, "mean_token_accuracy": 0.7447154521942139, "num_tokens": 8963702.0, "step": 5722, "train/ce_loss": 0.8238604068756104 }, { "epoch": 0.5657504449278228, "step": 5722, "train/sim_loss": 0.046875 }, { "epoch": 0.5657504449278228, "step": 5722, "train/total_loss": 0.1292610466480255 }, { "entropy": 8.987220764160156, "epoch": 0.5658493177773384, "mean_token_accuracy": 0.7350835204124451, "num_tokens": 8968973.0, "step": 5723, "train/ce_loss": 0.6659753918647766 }, { "epoch": 0.5658493177773384, "step": 5723, "train/sim_loss": 0.0546875 }, { "epoch": 0.5658493177773384, "step": 5723, "train/total_loss": 0.12128504365682602 }, { "entropy": 8.96373176574707, "epoch": 0.5659481906268539, "mean_token_accuracy": 0.7341317534446716, "num_tokens": 8974256.0, "step": 5724, "train/ce_loss": 0.8970287442207336 }, { "epoch": 0.5659481906268539, "step": 5724, "train/sim_loss": 0.0703125 }, { "epoch": 0.5659481906268539, "step": 5724, "train/total_loss": 0.16001537442207336 }, { "entropy": 8.677887916564941, "epoch": 0.5660470634763693, "mean_token_accuracy": 0.7993630766868591, "num_tokens": 8979680.0, "step": 5725, "train/ce_loss": 0.8367934226989746 }, { "epoch": 0.5660470634763693, "step": 5725, "train/sim_loss": 0.04296875 }, { "epoch": 0.5660470634763693, "step": 5725, "train/total_loss": 0.12664809823036194 }, { "entropy": 9.57216739654541, "epoch": 0.566145936325885, "mean_token_accuracy": 0.7227926254272461, "num_tokens": 8984589.0, "step": 5726, "train/ce_loss": 8.030754543142393e-06 }, { "epoch": 0.566145936325885, "step": 5726, "train/sim_loss": 0.0703125 }, { "epoch": 0.566145936325885, "step": 5726, "train/total_loss": 0.07031330466270447 }, { "entropy": 9.588593482971191, "epoch": 0.5662448091754004, "mean_token_accuracy": 0.7297297120094299, "num_tokens": 8989477.0, "step": 5727, "train/ce_loss": 4.534702839009697e-06 }, { "epoch": 0.5662448091754004, "step": 5727, "train/sim_loss": 0.0546875 }, { "epoch": 0.5662448091754004, "step": 5727, "train/total_loss": 0.05468795448541641 }, { "entropy": 8.820411682128906, "epoch": 0.5663436820249159, "mean_token_accuracy": 0.7184615135192871, "num_tokens": 8994590.0, "step": 5728, "train/ce_loss": 1.2334914207458496 }, { "epoch": 0.5663436820249159, "step": 5728, "train/sim_loss": 0.046875 }, { "epoch": 0.5663436820249159, "step": 5728, "train/total_loss": 0.1702241450548172 }, { "entropy": 8.630937576293945, "epoch": 0.5664425548744315, "mean_token_accuracy": 0.785977840423584, "num_tokens": 9000125.0, "step": 5729, "train/ce_loss": 0.677941083908081 }, { "epoch": 0.5664425548744315, "step": 5729, "train/sim_loss": 0.01171875 }, { "epoch": 0.5664425548744315, "step": 5729, "train/total_loss": 0.07951285690069199 }, { "entropy": 9.175809860229492, "epoch": 0.566541427723947, "mean_token_accuracy": 0.7271448373794556, "num_tokens": 9005298.0, "step": 5730, "train/ce_loss": 1.6741763353347778 }, { "epoch": 0.566541427723947, "step": 5730, "train/sim_loss": 0.0625 }, { "epoch": 0.566541427723947, "step": 5730, "train/total_loss": 0.22991763055324554 }, { "entropy": 8.94044017791748, "epoch": 0.5666403005734625, "mean_token_accuracy": 0.7077087759971619, "num_tokens": 9010654.0, "step": 5731, "train/ce_loss": 0.7423639297485352 }, { "epoch": 0.5666403005734625, "step": 5731, "train/sim_loss": 0.03515625 }, { "epoch": 0.5666403005734625, "step": 5731, "train/total_loss": 0.10939264297485352 }, { "entropy": 8.686186790466309, "epoch": 0.5667391734229781, "mean_token_accuracy": 0.737535297870636, "num_tokens": 9016243.0, "step": 5732, "train/ce_loss": 0.42112496495246887 }, { "epoch": 0.5667391734229781, "step": 5732, "train/sim_loss": 0.01953125 }, { "epoch": 0.5667391734229781, "step": 5732, "train/total_loss": 0.06164374575018883 }, { "entropy": 9.956960678100586, "epoch": 0.5668380462724936, "mean_token_accuracy": 0.6780487895011902, "num_tokens": 9021067.0, "step": 5733, "train/ce_loss": 1.3502155979949748e-06 }, { "epoch": 0.5668380462724936, "step": 5733, "train/sim_loss": 0.05078125 }, { "epoch": 0.5668380462724936, "step": 5733, "train/total_loss": 0.050781384110450745 }, { "entropy": 8.977663040161133, "epoch": 0.566936919122009, "mean_token_accuracy": 0.8058968186378479, "num_tokens": 9026333.0, "step": 5734, "train/ce_loss": 0.6080503463745117 }, { "epoch": 0.566936919122009, "step": 5734, "train/sim_loss": 0.05859375 }, { "epoch": 0.566936919122009, "step": 5734, "train/total_loss": 0.11939878761768341 }, { "entropy": 9.010187149047852, "epoch": 0.5670357919715246, "mean_token_accuracy": 0.7435610294342041, "num_tokens": 9031660.0, "step": 5735, "train/ce_loss": 0.48231545090675354 }, { "epoch": 0.5670357919715246, "step": 5735, "train/sim_loss": 0.03515625 }, { "epoch": 0.5670357919715246, "step": 5735, "train/total_loss": 0.08338779211044312 }, { "entropy": 9.17400074005127, "epoch": 0.5671346648210401, "mean_token_accuracy": 0.7989130616188049, "num_tokens": 9036913.0, "step": 5736, "train/ce_loss": 0.842216968536377 }, { "epoch": 0.5671346648210401, "step": 5736, "train/sim_loss": 0.10546875 }, { "epoch": 0.5671346648210401, "step": 5736, "train/total_loss": 0.18969044089317322 }, { "entropy": 8.912348747253418, "epoch": 0.5672335376705556, "mean_token_accuracy": 0.7251700758934021, "num_tokens": 9042115.0, "step": 5737, "train/ce_loss": 0.7594404220581055 }, { "epoch": 0.5672335376705556, "step": 5737, "train/sim_loss": 0.1015625 }, { "epoch": 0.5672335376705556, "step": 5737, "train/total_loss": 0.17750653624534607 }, { "entropy": 8.93150520324707, "epoch": 0.5673324105200712, "mean_token_accuracy": 0.7063106894493103, "num_tokens": 9047416.0, "step": 5738, "train/ce_loss": 1.0738160610198975 }, { "epoch": 0.5673324105200712, "step": 5738, "train/sim_loss": 0.11328125 }, { "epoch": 0.5673324105200712, "step": 5738, "train/total_loss": 0.22066286206245422 }, { "entropy": 9.037921905517578, "epoch": 0.5674312833695867, "mean_token_accuracy": 0.714677631855011, "num_tokens": 9052630.0, "step": 5739, "train/ce_loss": 0.9769010543823242 }, { "epoch": 0.5674312833695867, "step": 5739, "train/sim_loss": 0.03125 }, { "epoch": 0.5674312833695867, "step": 5739, "train/total_loss": 0.12894010543823242 }, { "epoch": 0.5675301562191022, "grad_norm": 0.79930579662323, "learning_rate": 8.583543490085546e-06, "loss": 0.1415, "step": 5740 }, { "entropy": 8.723766326904297, "epoch": 0.5675301562191022, "mean_token_accuracy": 0.6941308975219727, "num_tokens": 9057993.0, "step": 5740, "train/ce_loss": 1.2526596784591675 }, { "epoch": 0.5675301562191022, "step": 5740, "train/sim_loss": 0.11328125 }, { "epoch": 0.5675301562191022, "step": 5740, "train/total_loss": 0.238547220826149 }, { "entropy": 9.474671363830566, "epoch": 0.5676290290686178, "mean_token_accuracy": 0.7772194147109985, "num_tokens": 9063069.0, "step": 5741, "train/ce_loss": 1.3601796808870859e-06 }, { "epoch": 0.5676290290686178, "step": 5741, "train/sim_loss": 0.08203125 }, { "epoch": 0.5676290290686178, "step": 5741, "train/total_loss": 0.08203138411045074 }, { "entropy": 8.784934043884277, "epoch": 0.5677279019181333, "mean_token_accuracy": 0.8124330043792725, "num_tokens": 9068503.0, "step": 5742, "train/ce_loss": 0.6312729716300964 }, { "epoch": 0.5677279019181333, "step": 5742, "train/sim_loss": 0.046875 }, { "epoch": 0.5677279019181333, "step": 5742, "train/total_loss": 0.110002301633358 }, { "entropy": 9.346965789794922, "epoch": 0.5678267747676488, "mean_token_accuracy": 0.792151153087616, "num_tokens": 9073622.0, "step": 5743, "train/ce_loss": 4.456435931388114e-07 }, { "epoch": 0.5678267747676488, "step": 5743, "train/sim_loss": 0.02734375 }, { "epoch": 0.5678267747676488, "step": 5743, "train/total_loss": 0.02734379470348358 }, { "entropy": 8.720890045166016, "epoch": 0.5679256476171644, "mean_token_accuracy": 0.7175324559211731, "num_tokens": 9079060.0, "step": 5744, "train/ce_loss": 0.8249804377555847 }, { "epoch": 0.5679256476171644, "step": 5744, "train/sim_loss": 0.0546875 }, { "epoch": 0.5679256476171644, "step": 5744, "train/total_loss": 0.13718554377555847 }, { "entropy": 9.119203567504883, "epoch": 0.5680245204666798, "mean_token_accuracy": 0.7270668148994446, "num_tokens": 9084379.0, "step": 5745, "train/ce_loss": 0.8372083902359009 }, { "epoch": 0.5680245204666798, "step": 5745, "train/sim_loss": 0.0546875 }, { "epoch": 0.5680245204666798, "step": 5745, "train/total_loss": 0.1384083330631256 }, { "entropy": 8.878551483154297, "epoch": 0.5681233933161953, "mean_token_accuracy": 0.7617411017417908, "num_tokens": 9089678.0, "step": 5746, "train/ce_loss": 0.8750964403152466 }, { "epoch": 0.5681233933161953, "step": 5746, "train/sim_loss": 0.06640625 }, { "epoch": 0.5681233933161953, "step": 5746, "train/total_loss": 0.1539158970117569 }, { "entropy": 8.994913101196289, "epoch": 0.5682222661657109, "mean_token_accuracy": 0.7730496525764465, "num_tokens": 9095020.0, "step": 5747, "train/ce_loss": 1.2646600008010864 }, { "epoch": 0.5682222661657109, "step": 5747, "train/sim_loss": 0.06640625 }, { "epoch": 0.5682222661657109, "step": 5747, "train/total_loss": 0.19287225604057312 }, { "entropy": 9.466012954711914, "epoch": 0.5683211390152264, "mean_token_accuracy": 0.7690762877464294, "num_tokens": 9099930.0, "step": 5748, "train/ce_loss": 8.090804044513789e-07 }, { "epoch": 0.5683211390152264, "step": 5748, "train/sim_loss": 0.0234375 }, { "epoch": 0.5683211390152264, "step": 5748, "train/total_loss": 0.023437580093741417 }, { "entropy": 9.00436782836914, "epoch": 0.5684200118647419, "mean_token_accuracy": 0.7311435341835022, "num_tokens": 9105172.0, "step": 5749, "train/ce_loss": 0.501306414604187 }, { "epoch": 0.5684200118647419, "step": 5749, "train/sim_loss": 0.0546875 }, { "epoch": 0.5684200118647419, "step": 5749, "train/total_loss": 0.10481814295053482 }, { "entropy": 8.950907707214355, "epoch": 0.5685188847142575, "mean_token_accuracy": 0.7416020631790161, "num_tokens": 9110386.0, "step": 5750, "train/ce_loss": 0.7944943308830261 }, { "epoch": 0.5685188847142575, "step": 5750, "train/sim_loss": 0.05078125 }, { "epoch": 0.5685188847142575, "step": 5750, "train/total_loss": 0.13023069500923157 }, { "entropy": 9.473054885864258, "epoch": 0.568617757563773, "mean_token_accuracy": 0.6756272315979004, "num_tokens": 9115382.0, "step": 5751, "train/ce_loss": 1.7320858205494005e-06 }, { "epoch": 0.568617757563773, "step": 5751, "train/sim_loss": 0.046875 }, { "epoch": 0.568617757563773, "step": 5751, "train/total_loss": 0.04687517136335373 }, { "entropy": 8.896703720092773, "epoch": 0.5687166304132885, "mean_token_accuracy": 0.7521058917045593, "num_tokens": 9120872.0, "step": 5752, "train/ce_loss": 0.5250265002250671 }, { "epoch": 0.5687166304132885, "step": 5752, "train/sim_loss": 0.0390625 }, { "epoch": 0.5687166304132885, "step": 5752, "train/total_loss": 0.09156514704227448 }, { "entropy": 8.722356796264648, "epoch": 0.5688155032628041, "mean_token_accuracy": 0.7077244520187378, "num_tokens": 9126317.0, "step": 5753, "train/ce_loss": 0.8879641890525818 }, { "epoch": 0.5688155032628041, "step": 5753, "train/sim_loss": 0.1015625 }, { "epoch": 0.5688155032628041, "step": 5753, "train/total_loss": 0.19035892188549042 }, { "entropy": 9.647493362426758, "epoch": 0.5689143761123195, "mean_token_accuracy": 0.6870415806770325, "num_tokens": 9131139.0, "step": 5754, "train/ce_loss": 1.3407045571511844e-06 }, { "epoch": 0.5689143761123195, "step": 5754, "train/sim_loss": 0.0546875 }, { "epoch": 0.5689143761123195, "step": 5754, "train/total_loss": 0.054687634110450745 }, { "entropy": 9.187198638916016, "epoch": 0.569013248961835, "mean_token_accuracy": 0.7637194991111755, "num_tokens": 9136225.0, "step": 5755, "train/ce_loss": 0.8071531057357788 }, { "epoch": 0.569013248961835, "step": 5755, "train/sim_loss": 0.0390625 }, { "epoch": 0.569013248961835, "step": 5755, "train/total_loss": 0.11977781355381012 }, { "entropy": 8.6917724609375, "epoch": 0.5691121218113506, "mean_token_accuracy": 0.7009063363075256, "num_tokens": 9141759.0, "step": 5756, "train/ce_loss": 0.8537380695343018 }, { "epoch": 0.5691121218113506, "step": 5756, "train/sim_loss": 0.046875 }, { "epoch": 0.5691121218113506, "step": 5756, "train/total_loss": 0.13224881887435913 }, { "entropy": 8.852621078491211, "epoch": 0.5692109946608661, "mean_token_accuracy": 0.6953316926956177, "num_tokens": 9147044.0, "step": 5757, "train/ce_loss": 1.4994142055511475 }, { "epoch": 0.5692109946608661, "step": 5757, "train/sim_loss": 0.0390625 }, { "epoch": 0.5692109946608661, "step": 5757, "train/total_loss": 0.18900392949581146 }, { "entropy": 9.199543952941895, "epoch": 0.5693098675103816, "mean_token_accuracy": 0.7749999761581421, "num_tokens": 9152071.0, "step": 5758, "train/ce_loss": 0.7222161889076233 }, { "epoch": 0.5693098675103816, "step": 5758, "train/sim_loss": 0.02734375 }, { "epoch": 0.5693098675103816, "step": 5758, "train/total_loss": 0.09956537187099457 }, { "entropy": 8.63757038116455, "epoch": 0.5694087403598972, "mean_token_accuracy": 0.7416666746139526, "num_tokens": 9157491.0, "step": 5759, "train/ce_loss": 0.6470235586166382 }, { "epoch": 0.5694087403598972, "step": 5759, "train/sim_loss": 0.04296875 }, { "epoch": 0.5694087403598972, "step": 5759, "train/total_loss": 0.1076711043715477 }, { "epoch": 0.5695076132094127, "grad_norm": 0.621943473815918, "learning_rate": 8.578598625327598e-06, "loss": 0.1418, "step": 5760 }, { "entropy": 9.794168472290039, "epoch": 0.5695076132094127, "mean_token_accuracy": 0.7098445892333984, "num_tokens": 9162299.0, "step": 5760, "train/ce_loss": 1.574930191040039 }, { "epoch": 0.5695076132094127, "step": 5760, "train/sim_loss": 0.0859375 }, { "epoch": 0.5695076132094127, "step": 5760, "train/total_loss": 0.24343052506446838 }, { "entropy": 9.196226119995117, "epoch": 0.5696064860589282, "mean_token_accuracy": 0.7082152962684631, "num_tokens": 9167435.0, "step": 5761, "train/ce_loss": 1.043987512588501 }, { "epoch": 0.5696064860589282, "step": 5761, "train/sim_loss": 0.01953125 }, { "epoch": 0.5696064860589282, "step": 5761, "train/total_loss": 0.12392999976873398 }, { "entropy": 9.358050346374512, "epoch": 0.5697053589084438, "mean_token_accuracy": 0.8081841468811035, "num_tokens": 9172263.0, "step": 5762, "train/ce_loss": 2.002396968237008e-06 }, { "epoch": 0.5697053589084438, "step": 5762, "train/sim_loss": 0.03515625 }, { "epoch": 0.5697053589084438, "step": 5762, "train/total_loss": 0.03515645116567612 }, { "entropy": 9.067380905151367, "epoch": 0.5698042317579592, "mean_token_accuracy": 0.7419354915618896, "num_tokens": 9177485.0, "step": 5763, "train/ce_loss": 1.0077073574066162 }, { "epoch": 0.5698042317579592, "step": 5763, "train/sim_loss": 0.0625 }, { "epoch": 0.5698042317579592, "step": 5763, "train/total_loss": 0.1632707417011261 }, { "entropy": 9.170974731445312, "epoch": 0.5699031046074748, "mean_token_accuracy": 0.7475035786628723, "num_tokens": 9182640.0, "step": 5764, "train/ce_loss": 0.7894394993782043 }, { "epoch": 0.5699031046074748, "step": 5764, "train/sim_loss": 0.0625 }, { "epoch": 0.5699031046074748, "step": 5764, "train/total_loss": 0.14144395291805267 }, { "entropy": 8.615402221679688, "epoch": 0.5700019774569903, "mean_token_accuracy": 0.7594339847564697, "num_tokens": 9187950.0, "step": 5765, "train/ce_loss": 1.069886326789856 }, { "epoch": 0.5700019774569903, "step": 5765, "train/sim_loss": 0.078125 }, { "epoch": 0.5700019774569903, "step": 5765, "train/total_loss": 0.18511363863945007 }, { "entropy": 9.6326904296875, "epoch": 0.5701008503065058, "mean_token_accuracy": 0.7012302279472351, "num_tokens": 9192954.0, "step": 5766, "train/ce_loss": 1.0751601848824066e-06 }, { "epoch": 0.5701008503065058, "step": 5766, "train/sim_loss": 0.0234375 }, { "epoch": 0.5701008503065058, "step": 5766, "train/total_loss": 0.023437608033418655 }, { "entropy": 8.751548767089844, "epoch": 0.5701997231560214, "mean_token_accuracy": 0.770531415939331, "num_tokens": 9198273.0, "step": 5767, "train/ce_loss": 0.7971166372299194 }, { "epoch": 0.5701997231560214, "step": 5767, "train/sim_loss": 0.0390625 }, { "epoch": 0.5701997231560214, "step": 5767, "train/total_loss": 0.1187741681933403 }, { "entropy": 9.144685745239258, "epoch": 0.5702985960055369, "mean_token_accuracy": 0.7083333134651184, "num_tokens": 9203449.0, "step": 5768, "train/ce_loss": 1.262489914894104 }, { "epoch": 0.5702985960055369, "step": 5768, "train/sim_loss": 0.078125 }, { "epoch": 0.5702985960055369, "step": 5768, "train/total_loss": 0.20437400043010712 }, { "entropy": 9.466706275939941, "epoch": 0.5703974688550524, "mean_token_accuracy": 0.6808118224143982, "num_tokens": 9208382.0, "step": 5769, "train/ce_loss": 1.224777340888977 }, { "epoch": 0.5703974688550524, "step": 5769, "train/sim_loss": 0.05859375 }, { "epoch": 0.5703974688550524, "step": 5769, "train/total_loss": 0.18107149004936218 }, { "entropy": 8.870538711547852, "epoch": 0.570496341704568, "mean_token_accuracy": 0.7292954325675964, "num_tokens": 9213684.0, "step": 5770, "train/ce_loss": 0.5599069595336914 }, { "epoch": 0.570496341704568, "step": 5770, "train/sim_loss": 0.01171875 }, { "epoch": 0.570496341704568, "step": 5770, "train/total_loss": 0.06770944595336914 }, { "entropy": 9.689857482910156, "epoch": 0.5705952145540835, "mean_token_accuracy": 0.8302752375602722, "num_tokens": 9218571.0, "step": 5771, "train/ce_loss": 1.0707685947418213 }, { "epoch": 0.5705952145540835, "step": 5771, "train/sim_loss": 0.0625 }, { "epoch": 0.5705952145540835, "step": 5771, "train/total_loss": 0.16957685351371765 }, { "entropy": 8.794843673706055, "epoch": 0.570694087403599, "mean_token_accuracy": 0.7710084319114685, "num_tokens": 9223957.0, "step": 5772, "train/ce_loss": 1.0351547002792358 }, { "epoch": 0.570694087403599, "step": 5772, "train/sim_loss": 0.0625 }, { "epoch": 0.570694087403599, "step": 5772, "train/total_loss": 0.16601547598838806 }, { "entropy": 9.70067024230957, "epoch": 0.5707929602531145, "mean_token_accuracy": 0.7928388714790344, "num_tokens": 9228788.0, "step": 5773, "train/ce_loss": 1.7402708530426025 }, { "epoch": 0.5707929602531145, "step": 5773, "train/sim_loss": 0.07421875 }, { "epoch": 0.5707929602531145, "step": 5773, "train/total_loss": 0.24824583530426025 }, { "entropy": 9.039346694946289, "epoch": 0.57089183310263, "mean_token_accuracy": 0.7471697926521301, "num_tokens": 9234018.0, "step": 5774, "train/ce_loss": 0.7928961515426636 }, { "epoch": 0.57089183310263, "step": 5774, "train/sim_loss": 0.01953125 }, { "epoch": 0.57089183310263, "step": 5774, "train/total_loss": 0.09882086515426636 }, { "entropy": 9.668989181518555, "epoch": 0.5709907059521455, "mean_token_accuracy": 0.7594339847564697, "num_tokens": 9238894.0, "step": 5775, "train/ce_loss": 3.404971039344673e-06 }, { "epoch": 0.5709907059521455, "step": 5775, "train/sim_loss": 0.046875 }, { "epoch": 0.5709907059521455, "step": 5775, "train/total_loss": 0.04687533900141716 }, { "entropy": 9.012740135192871, "epoch": 0.5710895788016611, "mean_token_accuracy": 0.6688227653503418, "num_tokens": 9244148.0, "step": 5776, "train/ce_loss": 1.3640942573547363 }, { "epoch": 0.5710895788016611, "step": 5776, "train/sim_loss": 0.06640625 }, { "epoch": 0.5710895788016611, "step": 5776, "train/total_loss": 0.2028156816959381 }, { "entropy": 9.636526107788086, "epoch": 0.5711884516511766, "mean_token_accuracy": 0.7274678349494934, "num_tokens": 9249080.0, "step": 5777, "train/ce_loss": 3.842779278784292e-06 }, { "epoch": 0.5711884516511766, "step": 5777, "train/sim_loss": 0.0546875 }, { "epoch": 0.5711884516511766, "step": 5777, "train/total_loss": 0.05468788370490074 }, { "entropy": 9.943527221679688, "epoch": 0.5712873245006921, "mean_token_accuracy": 0.7570093274116516, "num_tokens": 9253821.0, "step": 5778, "train/ce_loss": 1.4900537729263306 }, { "epoch": 0.5712873245006921, "step": 5778, "train/sim_loss": 0.03515625 }, { "epoch": 0.5712873245006921, "step": 5778, "train/total_loss": 0.18416163325309753 }, { "entropy": 9.047167778015137, "epoch": 0.5713861973502077, "mean_token_accuracy": 0.759096622467041, "num_tokens": 9259251.0, "step": 5779, "train/ce_loss": 0.7313524484634399 }, { "epoch": 0.5713861973502077, "step": 5779, "train/sim_loss": 0.0234375 }, { "epoch": 0.5713861973502077, "step": 5779, "train/total_loss": 0.09657274931669235 }, { "epoch": 0.5714850701997232, "grad_norm": 0.7609543800354004, "learning_rate": 8.573653760569649e-06, "loss": 0.1392, "step": 5780 }, { "entropy": 9.630558013916016, "epoch": 0.5714850701997232, "mean_token_accuracy": 0.7282850742340088, "num_tokens": 9264079.0, "step": 5780, "train/ce_loss": 9.653415418142686e-07 }, { "epoch": 0.5714850701997232, "step": 5780, "train/sim_loss": 0.0234375 }, { "epoch": 0.5714850701997232, "step": 5780, "train/total_loss": 0.02343759685754776 }, { "entropy": 9.017946243286133, "epoch": 0.5715839430492387, "mean_token_accuracy": 0.7664740085601807, "num_tokens": 9269406.0, "step": 5781, "train/ce_loss": 0.6722099781036377 }, { "epoch": 0.5715839430492387, "step": 5781, "train/sim_loss": 0.01953125 }, { "epoch": 0.5715839430492387, "step": 5781, "train/total_loss": 0.08675225079059601 }, { "entropy": 9.25473403930664, "epoch": 0.5716828158987542, "mean_token_accuracy": 0.8168557286262512, "num_tokens": 9274471.0, "step": 5782, "train/ce_loss": 0.4532519280910492 }, { "epoch": 0.5716828158987542, "step": 5782, "train/sim_loss": 0.015625 }, { "epoch": 0.5716828158987542, "step": 5782, "train/total_loss": 0.06095019355416298 }, { "entropy": 9.218679428100586, "epoch": 0.5717816887482697, "mean_token_accuracy": 0.7699999809265137, "num_tokens": 9279683.0, "step": 5783, "train/ce_loss": 0.7935691475868225 }, { "epoch": 0.5717816887482697, "step": 5783, "train/sim_loss": 0.0390625 }, { "epoch": 0.5717816887482697, "step": 5783, "train/total_loss": 0.11841941624879837 }, { "entropy": 9.08060073852539, "epoch": 0.5718805615977852, "mean_token_accuracy": 0.7420634627342224, "num_tokens": 9284892.0, "step": 5784, "train/ce_loss": 0.7454713582992554 }, { "epoch": 0.5718805615977852, "step": 5784, "train/sim_loss": 0.09765625 }, { "epoch": 0.5718805615977852, "step": 5784, "train/total_loss": 0.17220339179039001 }, { "entropy": 8.969976425170898, "epoch": 0.5719794344473008, "mean_token_accuracy": 0.7424058318138123, "num_tokens": 9290175.0, "step": 5785, "train/ce_loss": 1.1526658535003662 }, { "epoch": 0.5719794344473008, "step": 5785, "train/sim_loss": 0.03125 }, { "epoch": 0.5719794344473008, "step": 5785, "train/total_loss": 0.1465165913105011 }, { "entropy": 8.666062355041504, "epoch": 0.5720783072968163, "mean_token_accuracy": 0.7295373678207397, "num_tokens": 9295493.0, "step": 5786, "train/ce_loss": 1.085310935974121 }, { "epoch": 0.5720783072968163, "step": 5786, "train/sim_loss": 0.05859375 }, { "epoch": 0.5720783072968163, "step": 5786, "train/total_loss": 0.16712483763694763 }, { "entropy": 9.163665771484375, "epoch": 0.5721771801463318, "mean_token_accuracy": 0.7680140733718872, "num_tokens": 9300502.0, "step": 5787, "train/ce_loss": 0.8502943515777588 }, { "epoch": 0.5721771801463318, "step": 5787, "train/sim_loss": 0.07421875 }, { "epoch": 0.5721771801463318, "step": 5787, "train/total_loss": 0.15924818813800812 }, { "entropy": 9.09556770324707, "epoch": 0.5722760529958474, "mean_token_accuracy": 0.8070175647735596, "num_tokens": 9305699.0, "step": 5788, "train/ce_loss": 0.9993642568588257 }, { "epoch": 0.5722760529958474, "step": 5788, "train/sim_loss": 0.0625 }, { "epoch": 0.5722760529958474, "step": 5788, "train/total_loss": 0.16243642568588257 }, { "entropy": 8.845906257629395, "epoch": 0.5723749258453629, "mean_token_accuracy": 0.7489451766014099, "num_tokens": 9311136.0, "step": 5789, "train/ce_loss": 0.7303217053413391 }, { "epoch": 0.5723749258453629, "step": 5789, "train/sim_loss": 0.09375 }, { "epoch": 0.5723749258453629, "step": 5789, "train/total_loss": 0.1667821705341339 }, { "entropy": 9.882627487182617, "epoch": 0.5724737986948784, "mean_token_accuracy": 0.8067227005958557, "num_tokens": 9315885.0, "step": 5790, "train/ce_loss": 1.4576836824417114 }, { "epoch": 0.5724737986948784, "step": 5790, "train/sim_loss": 0.05859375 }, { "epoch": 0.5724737986948784, "step": 5790, "train/total_loss": 0.20436212420463562 }, { "entropy": 9.175952911376953, "epoch": 0.572572671544394, "mean_token_accuracy": 0.800582230091095, "num_tokens": 9321042.0, "step": 5791, "train/ce_loss": 0.7040167450904846 }, { "epoch": 0.572572671544394, "step": 5791, "train/sim_loss": 0.03515625 }, { "epoch": 0.572572671544394, "step": 5791, "train/total_loss": 0.10555792599916458 }, { "entropy": 8.598827362060547, "epoch": 0.5726715443939094, "mean_token_accuracy": 0.8154696226119995, "num_tokens": 9326428.0, "step": 5792, "train/ce_loss": 0.6725344061851501 }, { "epoch": 0.5726715443939094, "step": 5792, "train/sim_loss": 0.07421875 }, { "epoch": 0.5726715443939094, "step": 5792, "train/total_loss": 0.14147219061851501 }, { "entropy": 9.333423614501953, "epoch": 0.5727704172434249, "mean_token_accuracy": 0.7334630489349365, "num_tokens": 9331390.0, "step": 5793, "train/ce_loss": 1.299883484840393 }, { "epoch": 0.5727704172434249, "step": 5793, "train/sim_loss": 0.06640625 }, { "epoch": 0.5727704172434249, "step": 5793, "train/total_loss": 0.19639460742473602 }, { "entropy": 8.871832847595215, "epoch": 0.5728692900929405, "mean_token_accuracy": 0.7634854912757874, "num_tokens": 9336780.0, "step": 5794, "train/ce_loss": 1.0562324523925781 }, { "epoch": 0.5728692900929405, "step": 5794, "train/sim_loss": 0.10546875 }, { "epoch": 0.5728692900929405, "step": 5794, "train/total_loss": 0.2110919952392578 }, { "entropy": 8.454917907714844, "epoch": 0.572968162942456, "mean_token_accuracy": 0.7221584320068359, "num_tokens": 9342120.0, "step": 5795, "train/ce_loss": 0.7007420659065247 }, { "epoch": 0.572968162942456, "step": 5795, "train/sim_loss": 0.05859375 }, { "epoch": 0.572968162942456, "step": 5795, "train/total_loss": 0.128667950630188 }, { "entropy": 8.864130020141602, "epoch": 0.5730670357919715, "mean_token_accuracy": 0.7142857313156128, "num_tokens": 9347303.0, "step": 5796, "train/ce_loss": 1.1294949054718018 }, { "epoch": 0.5730670357919715, "step": 5796, "train/sim_loss": 0.0625 }, { "epoch": 0.5730670357919715, "step": 5796, "train/total_loss": 0.17544949054718018 }, { "entropy": 8.857627868652344, "epoch": 0.5731659086414871, "mean_token_accuracy": 0.7629629373550415, "num_tokens": 9352662.0, "step": 5797, "train/ce_loss": 0.7051044702529907 }, { "epoch": 0.5731659086414871, "step": 5797, "train/sim_loss": 0.078125 }, { "epoch": 0.5731659086414871, "step": 5797, "train/total_loss": 0.14863544702529907 }, { "entropy": 9.339004516601562, "epoch": 0.5732647814910026, "mean_token_accuracy": 0.7263843417167664, "num_tokens": 9357745.0, "step": 5798, "train/ce_loss": 1.1820474863052368 }, { "epoch": 0.5732647814910026, "step": 5798, "train/sim_loss": 0.03125 }, { "epoch": 0.5732647814910026, "step": 5798, "train/total_loss": 0.1494547426700592 }, { "entropy": 9.302085876464844, "epoch": 0.5733636543405181, "mean_token_accuracy": 0.723809540271759, "num_tokens": 9362844.0, "step": 5799, "train/ce_loss": 0.4738314151763916 }, { "epoch": 0.5733636543405181, "step": 5799, "train/sim_loss": 0.0546875 }, { "epoch": 0.5733636543405181, "step": 5799, "train/total_loss": 0.1020706444978714 }, { "epoch": 0.5734625271900337, "grad_norm": 0.695505678653717, "learning_rate": 8.568708895811701e-06, "loss": 0.1252, "step": 5800 }, { "entropy": 9.141944885253906, "epoch": 0.5734625271900337, "mean_token_accuracy": 0.7878788113594055, "num_tokens": 9367985.0, "step": 5800, "train/ce_loss": 0.6440024375915527 }, { "epoch": 0.5734625271900337, "step": 5800, "train/sim_loss": 0.08203125 }, { "epoch": 0.5734625271900337, "step": 5800, "train/total_loss": 0.14643150568008423 }, { "entropy": 8.993462562561035, "epoch": 0.5735614000395491, "mean_token_accuracy": 0.7708830833435059, "num_tokens": 9373301.0, "step": 5801, "train/ce_loss": 0.6326388716697693 }, { "epoch": 0.5735614000395491, "step": 5801, "train/sim_loss": 0.08203125 }, { "epoch": 0.5735614000395491, "step": 5801, "train/total_loss": 0.1452951431274414 }, { "entropy": 9.004293441772461, "epoch": 0.5736602728890646, "mean_token_accuracy": 0.6816431283950806, "num_tokens": 9378565.0, "step": 5802, "train/ce_loss": 0.40664011240005493 }, { "epoch": 0.5736602728890646, "step": 5802, "train/sim_loss": 0.046875 }, { "epoch": 0.5736602728890646, "step": 5802, "train/total_loss": 0.08753901720046997 }, { "entropy": 9.30048942565918, "epoch": 0.5737591457385802, "mean_token_accuracy": 0.7446103096008301, "num_tokens": 9383657.0, "step": 5803, "train/ce_loss": 2.19503613152483e-06 }, { "epoch": 0.5737591457385802, "step": 5803, "train/sim_loss": 0.0390625 }, { "epoch": 0.5737591457385802, "step": 5803, "train/total_loss": 0.03906271979212761 }, { "entropy": 8.672707557678223, "epoch": 0.5738580185880957, "mean_token_accuracy": 0.7139852643013, "num_tokens": 9389126.0, "step": 5804, "train/ce_loss": 0.5802134275436401 }, { "epoch": 0.5738580185880957, "step": 5804, "train/sim_loss": 0.05078125 }, { "epoch": 0.5738580185880957, "step": 5804, "train/total_loss": 0.10880259424448013 }, { "entropy": 8.985300064086914, "epoch": 0.5739568914376112, "mean_token_accuracy": 0.7011904716491699, "num_tokens": 9394402.0, "step": 5805, "train/ce_loss": 0.8810886740684509 }, { "epoch": 0.5739568914376112, "step": 5805, "train/sim_loss": 0.0859375 }, { "epoch": 0.5739568914376112, "step": 5805, "train/total_loss": 0.1740463674068451 }, { "entropy": 8.950839042663574, "epoch": 0.5740557642871268, "mean_token_accuracy": 0.7651098966598511, "num_tokens": 9399648.0, "step": 5806, "train/ce_loss": 0.8348586559295654 }, { "epoch": 0.5740557642871268, "step": 5806, "train/sim_loss": 0.078125 }, { "epoch": 0.5740557642871268, "step": 5806, "train/total_loss": 0.16161087155342102 }, { "entropy": 9.569741249084473, "epoch": 0.5741546371366423, "mean_token_accuracy": 0.7185929417610168, "num_tokens": 9404471.0, "step": 5807, "train/ce_loss": 1.4021042585372925 }, { "epoch": 0.5741546371366423, "step": 5807, "train/sim_loss": 0.0625 }, { "epoch": 0.5741546371366423, "step": 5807, "train/total_loss": 0.20271043479442596 }, { "entropy": 8.872579574584961, "epoch": 0.5742535099861578, "mean_token_accuracy": 0.7598608136177063, "num_tokens": 9409830.0, "step": 5808, "train/ce_loss": 1.0803982019424438 }, { "epoch": 0.5742535099861578, "step": 5808, "train/sim_loss": 0.1015625 }, { "epoch": 0.5742535099861578, "step": 5808, "train/total_loss": 0.20960232615470886 }, { "entropy": 9.010385513305664, "epoch": 0.5743523828356734, "mean_token_accuracy": 0.7640750408172607, "num_tokens": 9415072.0, "step": 5809, "train/ce_loss": 0.39146870374679565 }, { "epoch": 0.5743523828356734, "step": 5809, "train/sim_loss": 0.05078125 }, { "epoch": 0.5743523828356734, "step": 5809, "train/total_loss": 0.08992812037467957 }, { "entropy": 9.353796005249023, "epoch": 0.5744512556851888, "mean_token_accuracy": 0.7794612646102905, "num_tokens": 9420094.0, "step": 5810, "train/ce_loss": 0.9817899465560913 }, { "epoch": 0.5744512556851888, "step": 5810, "train/sim_loss": 0.05078125 }, { "epoch": 0.5744512556851888, "step": 5810, "train/total_loss": 0.14896024763584137 }, { "entropy": 8.967628479003906, "epoch": 0.5745501285347043, "mean_token_accuracy": 0.7174825072288513, "num_tokens": 9425254.0, "step": 5811, "train/ce_loss": 1.0944974422454834 }, { "epoch": 0.5745501285347043, "step": 5811, "train/sim_loss": 0.0625 }, { "epoch": 0.5745501285347043, "step": 5811, "train/total_loss": 0.17194974422454834 }, { "entropy": 9.016559600830078, "epoch": 0.5746490013842199, "mean_token_accuracy": 0.808041512966156, "num_tokens": 9430450.0, "step": 5812, "train/ce_loss": 1.0918627977371216 }, { "epoch": 0.5746490013842199, "step": 5812, "train/sim_loss": 0.078125 }, { "epoch": 0.5746490013842199, "step": 5812, "train/total_loss": 0.1873112916946411 }, { "entropy": 8.808783531188965, "epoch": 0.5747478742337354, "mean_token_accuracy": 0.7251908183097839, "num_tokens": 9436021.0, "step": 5813, "train/ce_loss": 0.7345530986785889 }, { "epoch": 0.5747478742337354, "step": 5813, "train/sim_loss": 0.11328125 }, { "epoch": 0.5747478742337354, "step": 5813, "train/total_loss": 0.1867365539073944 }, { "entropy": 9.164575576782227, "epoch": 0.5748467470832509, "mean_token_accuracy": 0.7396121621131897, "num_tokens": 9441244.0, "step": 5814, "train/ce_loss": 1.2660578489303589 }, { "epoch": 0.5748467470832509, "step": 5814, "train/sim_loss": 0.06640625 }, { "epoch": 0.5748467470832509, "step": 5814, "train/total_loss": 0.1930120438337326 }, { "entropy": 8.717116355895996, "epoch": 0.5749456199327665, "mean_token_accuracy": 0.7172839641571045, "num_tokens": 9446579.0, "step": 5815, "train/ce_loss": 0.9173863530158997 }, { "epoch": 0.5749456199327665, "step": 5815, "train/sim_loss": 0.1171875 }, { "epoch": 0.5749456199327665, "step": 5815, "train/total_loss": 0.20892614126205444 }, { "entropy": 8.604945182800293, "epoch": 0.575044492782282, "mean_token_accuracy": 0.7659157514572144, "num_tokens": 9452069.0, "step": 5816, "train/ce_loss": 1.1520651578903198 }, { "epoch": 0.575044492782282, "step": 5816, "train/sim_loss": 0.06640625 }, { "epoch": 0.575044492782282, "step": 5816, "train/total_loss": 0.1816127598285675 }, { "entropy": 8.624781608581543, "epoch": 0.5751433656317975, "mean_token_accuracy": 0.705450713634491, "num_tokens": 9457675.0, "step": 5817, "train/ce_loss": 1.019660234451294 }, { "epoch": 0.5751433656317975, "step": 5817, "train/sim_loss": 0.0703125 }, { "epoch": 0.5751433656317975, "step": 5817, "train/total_loss": 0.1722785234451294 }, { "entropy": 8.754276275634766, "epoch": 0.5752422384813131, "mean_token_accuracy": 0.7320442199707031, "num_tokens": 9462870.0, "step": 5818, "train/ce_loss": 0.7815767526626587 }, { "epoch": 0.5752422384813131, "step": 5818, "train/sim_loss": 0.05859375 }, { "epoch": 0.5752422384813131, "step": 5818, "train/total_loss": 0.1367514282464981 }, { "entropy": 9.46786117553711, "epoch": 0.5753411113308285, "mean_token_accuracy": 0.7357512712478638, "num_tokens": 9467855.0, "step": 5819, "train/ce_loss": 0.8608783483505249 }, { "epoch": 0.5753411113308285, "step": 5819, "train/sim_loss": 0.04296875 }, { "epoch": 0.5753411113308285, "step": 5819, "train/total_loss": 0.12905658781528473 }, { "epoch": 0.575439984180344, "grad_norm": 0.7835647463798523, "learning_rate": 8.563764031053752e-06, "loss": 0.1469, "step": 5820 }, { "entropy": 8.44367790222168, "epoch": 0.575439984180344, "mean_token_accuracy": 0.6938547492027283, "num_tokens": 9473237.0, "step": 5820, "train/ce_loss": 0.3752153515815735 }, { "epoch": 0.575439984180344, "step": 5820, "train/sim_loss": 0.03125 }, { "epoch": 0.575439984180344, "step": 5820, "train/total_loss": 0.06877154111862183 }, { "entropy": 9.075521469116211, "epoch": 0.5755388570298596, "mean_token_accuracy": 0.6924198269844055, "num_tokens": 9478365.0, "step": 5821, "train/ce_loss": 1.483864426612854 }, { "epoch": 0.5755388570298596, "step": 5821, "train/sim_loss": 0.08984375 }, { "epoch": 0.5755388570298596, "step": 5821, "train/total_loss": 0.23823019862174988 }, { "entropy": 8.797341346740723, "epoch": 0.5756377298793751, "mean_token_accuracy": 0.7902023196220398, "num_tokens": 9483787.0, "step": 5822, "train/ce_loss": 0.4620964527130127 }, { "epoch": 0.5756377298793751, "step": 5822, "train/sim_loss": 0.0390625 }, { "epoch": 0.5756377298793751, "step": 5822, "train/total_loss": 0.08527214825153351 }, { "entropy": 9.989152908325195, "epoch": 0.5757366027288906, "mean_token_accuracy": 0.7836065292358398, "num_tokens": 9488520.0, "step": 5823, "train/ce_loss": 0.9272093772888184 }, { "epoch": 0.5757366027288906, "step": 5823, "train/sim_loss": 0.046875 }, { "epoch": 0.5757366027288906, "step": 5823, "train/total_loss": 0.13959594070911407 }, { "entropy": 9.14828109741211, "epoch": 0.5758354755784062, "mean_token_accuracy": 0.7337367534637451, "num_tokens": 9493639.0, "step": 5824, "train/ce_loss": 1.4593510627746582 }, { "epoch": 0.5758354755784062, "step": 5824, "train/sim_loss": 0.07421875 }, { "epoch": 0.5758354755784062, "step": 5824, "train/total_loss": 0.22015385329723358 }, { "entropy": 9.220829010009766, "epoch": 0.5759343484279217, "mean_token_accuracy": 0.7876505851745605, "num_tokens": 9498773.0, "step": 5825, "train/ce_loss": 0.6129733920097351 }, { "epoch": 0.5759343484279217, "step": 5825, "train/sim_loss": 0.01953125 }, { "epoch": 0.5759343484279217, "step": 5825, "train/total_loss": 0.08082859218120575 }, { "entropy": 9.097583770751953, "epoch": 0.5760332212774372, "mean_token_accuracy": 0.7509778141975403, "num_tokens": 9504007.0, "step": 5826, "train/ce_loss": 0.567491888999939 }, { "epoch": 0.5760332212774372, "step": 5826, "train/sim_loss": 0.04296875 }, { "epoch": 0.5760332212774372, "step": 5826, "train/total_loss": 0.09971794486045837 }, { "entropy": 9.62552261352539, "epoch": 0.5761320941269528, "mean_token_accuracy": 0.7119341492652893, "num_tokens": 9508872.0, "step": 5827, "train/ce_loss": 1.5119009049158194e-06 }, { "epoch": 0.5761320941269528, "step": 5827, "train/sim_loss": 0.01953125 }, { "epoch": 0.5761320941269528, "step": 5827, "train/total_loss": 0.019531400874257088 }, { "entropy": 9.689656257629395, "epoch": 0.5762309669764683, "mean_token_accuracy": 0.756157636642456, "num_tokens": 9513696.0, "step": 5828, "train/ce_loss": 3.948211087845266e-06 }, { "epoch": 0.5762309669764683, "step": 5828, "train/sim_loss": 0.0859375 }, { "epoch": 0.5762309669764683, "step": 5828, "train/total_loss": 0.08593789488077164 }, { "entropy": 9.26791763305664, "epoch": 0.5763298398259837, "mean_token_accuracy": 0.6971608996391296, "num_tokens": 9518737.0, "step": 5829, "train/ce_loss": 2.2869637632538797e-06 }, { "epoch": 0.5763298398259837, "step": 5829, "train/sim_loss": 0.0625 }, { "epoch": 0.5763298398259837, "step": 5829, "train/total_loss": 0.0625002309679985 }, { "entropy": 9.628270149230957, "epoch": 0.5764287126754993, "mean_token_accuracy": 0.7145833373069763, "num_tokens": 9523842.0, "step": 5830, "train/ce_loss": 1.0487631559371948 }, { "epoch": 0.5764287126754993, "step": 5830, "train/sim_loss": 0.05859375 }, { "epoch": 0.5764287126754993, "step": 5830, "train/total_loss": 0.163470059633255 }, { "entropy": 9.06503677368164, "epoch": 0.5765275855250148, "mean_token_accuracy": 0.7355769276618958, "num_tokens": 9528929.0, "step": 5831, "train/ce_loss": 0.6332504749298096 }, { "epoch": 0.5765275855250148, "step": 5831, "train/sim_loss": 0.046875 }, { "epoch": 0.5765275855250148, "step": 5831, "train/total_loss": 0.11020004749298096 }, { "entropy": 8.997516632080078, "epoch": 0.5766264583745303, "mean_token_accuracy": 0.7780821919441223, "num_tokens": 9534134.0, "step": 5832, "train/ce_loss": 0.7698352336883545 }, { "epoch": 0.5766264583745303, "step": 5832, "train/sim_loss": 0.0859375 }, { "epoch": 0.5766264583745303, "step": 5832, "train/total_loss": 0.1629210263490677 }, { "entropy": 8.555920600891113, "epoch": 0.5767253312240459, "mean_token_accuracy": 0.761800229549408, "num_tokens": 9539559.0, "step": 5833, "train/ce_loss": 0.46046629548072815 }, { "epoch": 0.5767253312240459, "step": 5833, "train/sim_loss": 0.0546875 }, { "epoch": 0.5767253312240459, "step": 5833, "train/total_loss": 0.10073412954807281 }, { "entropy": 8.977663040161133, "epoch": 0.5768242040735614, "mean_token_accuracy": 0.7265258431434631, "num_tokens": 9544888.0, "step": 5834, "train/ce_loss": 0.5896692276000977 }, { "epoch": 0.5768242040735614, "step": 5834, "train/sim_loss": 0.03515625 }, { "epoch": 0.5768242040735614, "step": 5834, "train/total_loss": 0.09412316977977753 }, { "entropy": 8.919035911560059, "epoch": 0.5769230769230769, "mean_token_accuracy": 0.7393548488616943, "num_tokens": 9550121.0, "step": 5835, "train/ce_loss": 1.3863357305526733 }, { "epoch": 0.5769230769230769, "step": 5835, "train/sim_loss": 0.08203125 }, { "epoch": 0.5769230769230769, "step": 5835, "train/total_loss": 0.2206648290157318 }, { "entropy": 9.460243225097656, "epoch": 0.5770219497725925, "mean_token_accuracy": 0.6178571581840515, "num_tokens": 9555092.0, "step": 5836, "train/ce_loss": 1.983798623085022 }, { "epoch": 0.5770219497725925, "step": 5836, "train/sim_loss": 0.1015625 }, { "epoch": 0.5770219497725925, "step": 5836, "train/total_loss": 0.29994237422943115 }, { "entropy": 8.946954727172852, "epoch": 0.577120822622108, "mean_token_accuracy": 0.7691197395324707, "num_tokens": 9560222.0, "step": 5837, "train/ce_loss": 1.1302258968353271 }, { "epoch": 0.577120822622108, "step": 5837, "train/sim_loss": 0.05859375 }, { "epoch": 0.577120822622108, "step": 5837, "train/total_loss": 0.1716163456439972 }, { "entropy": 8.636548042297363, "epoch": 0.5772196954716234, "mean_token_accuracy": 0.7297979593276978, "num_tokens": 9565480.0, "step": 5838, "train/ce_loss": 0.8710353374481201 }, { "epoch": 0.5772196954716234, "step": 5838, "train/sim_loss": 0.0703125 }, { "epoch": 0.5772196954716234, "step": 5838, "train/total_loss": 0.15741604566574097 }, { "entropy": 8.944669723510742, "epoch": 0.577318568321139, "mean_token_accuracy": 0.7527624368667603, "num_tokens": 9570631.0, "step": 5839, "train/ce_loss": 1.3659089803695679 }, { "epoch": 0.577318568321139, "step": 5839, "train/sim_loss": 0.07421875 }, { "epoch": 0.577318568321139, "step": 5839, "train/total_loss": 0.2108096480369568 }, { "epoch": 0.5774174411706545, "grad_norm": 0.6676998734474182, "learning_rate": 8.558819166295802e-06, "loss": 0.1389, "step": 5840 }, { "entropy": 8.932061195373535, "epoch": 0.5774174411706545, "mean_token_accuracy": 0.7333333492279053, "num_tokens": 9575973.0, "step": 5840, "train/ce_loss": 1.2954726219177246 }, { "epoch": 0.5774174411706545, "step": 5840, "train/sim_loss": 0.0625 }, { "epoch": 0.5774174411706545, "step": 5840, "train/total_loss": 0.19204726815223694 }, { "entropy": 8.650957107543945, "epoch": 0.57751631402017, "mean_token_accuracy": 0.7390829920768738, "num_tokens": 9581377.0, "step": 5841, "train/ce_loss": 1.229686975479126 }, { "epoch": 0.57751631402017, "step": 5841, "train/sim_loss": 0.0390625 }, { "epoch": 0.57751631402017, "step": 5841, "train/total_loss": 0.16203120350837708 }, { "entropy": 9.259391784667969, "epoch": 0.5776151868696856, "mean_token_accuracy": 0.7354497313499451, "num_tokens": 9586347.0, "step": 5842, "train/ce_loss": 1.0366559028625488 }, { "epoch": 0.5776151868696856, "step": 5842, "train/sim_loss": 0.015625 }, { "epoch": 0.5776151868696856, "step": 5842, "train/total_loss": 0.11929059028625488 }, { "entropy": 9.26965618133545, "epoch": 0.5777140597192011, "mean_token_accuracy": 0.6484641432762146, "num_tokens": 9591429.0, "step": 5843, "train/ce_loss": 1.5358511209487915 }, { "epoch": 0.5777140597192011, "step": 5843, "train/sim_loss": 0.0703125 }, { "epoch": 0.5777140597192011, "step": 5843, "train/total_loss": 0.22389762103557587 }, { "entropy": 8.18938159942627, "epoch": 0.5778129325687166, "mean_token_accuracy": 0.7324301600456238, "num_tokens": 9597107.0, "step": 5844, "train/ce_loss": 0.6811568737030029 }, { "epoch": 0.5778129325687166, "step": 5844, "train/sim_loss": 0.0390625 }, { "epoch": 0.5778129325687166, "step": 5844, "train/total_loss": 0.10717818886041641 }, { "entropy": 9.126882553100586, "epoch": 0.5779118054182322, "mean_token_accuracy": 0.7203728556632996, "num_tokens": 9602312.0, "step": 5845, "train/ce_loss": 0.791706919670105 }, { "epoch": 0.5779118054182322, "step": 5845, "train/sim_loss": 0.03125 }, { "epoch": 0.5779118054182322, "step": 5845, "train/total_loss": 0.11042069643735886 }, { "entropy": 8.713577270507812, "epoch": 0.5780106782677477, "mean_token_accuracy": 0.7227822542190552, "num_tokens": 9607747.0, "step": 5846, "train/ce_loss": 0.7399176359176636 }, { "epoch": 0.5780106782677477, "step": 5846, "train/sim_loss": 0.04296875 }, { "epoch": 0.5780106782677477, "step": 5846, "train/total_loss": 0.11696051806211472 }, { "entropy": 9.557413101196289, "epoch": 0.5781095511172633, "mean_token_accuracy": 0.7734940052032471, "num_tokens": 9612553.0, "step": 5847, "train/ce_loss": 1.399521827697754 }, { "epoch": 0.5781095511172633, "step": 5847, "train/sim_loss": 0.05859375 }, { "epoch": 0.5781095511172633, "step": 5847, "train/total_loss": 0.1985459327697754 }, { "entropy": 9.565277099609375, "epoch": 0.5782084239667787, "mean_token_accuracy": 0.6928251385688782, "num_tokens": 9617413.0, "step": 5848, "train/ce_loss": 0.7634159922599792 }, { "epoch": 0.5782084239667787, "step": 5848, "train/sim_loss": 0.015625 }, { "epoch": 0.5782084239667787, "step": 5848, "train/total_loss": 0.09196659922599792 }, { "entropy": 8.671602249145508, "epoch": 0.5783072968162942, "mean_token_accuracy": 0.6481876373291016, "num_tokens": 9622817.0, "step": 5849, "train/ce_loss": 1.070180058479309 }, { "epoch": 0.5783072968162942, "step": 5849, "train/sim_loss": 0.125 }, { "epoch": 0.5783072968162942, "step": 5849, "train/total_loss": 0.23201800882816315 }, { "entropy": 9.2224760055542, "epoch": 0.5784061696658098, "mean_token_accuracy": 0.7203007340431213, "num_tokens": 9627905.0, "step": 5850, "train/ce_loss": 1.6534047517779982e-06 }, { "epoch": 0.5784061696658098, "step": 5850, "train/sim_loss": 0.0390625 }, { "epoch": 0.5784061696658098, "step": 5850, "train/total_loss": 0.03906266391277313 }, { "entropy": 8.414937973022461, "epoch": 0.5785050425153253, "mean_token_accuracy": 0.7024128437042236, "num_tokens": 9633457.0, "step": 5851, "train/ce_loss": 1.6479556560516357 }, { "epoch": 0.5785050425153253, "step": 5851, "train/sim_loss": 0.05078125 }, { "epoch": 0.5785050425153253, "step": 5851, "train/total_loss": 0.21557681262493134 }, { "entropy": 8.665975570678711, "epoch": 0.5786039153648408, "mean_token_accuracy": 0.6990291476249695, "num_tokens": 9638836.0, "step": 5852, "train/ce_loss": 1.2352977991104126 }, { "epoch": 0.5786039153648408, "step": 5852, "train/sim_loss": 0.06640625 }, { "epoch": 0.5786039153648408, "step": 5852, "train/total_loss": 0.18993604183197021 }, { "entropy": 8.959815979003906, "epoch": 0.5787027882143564, "mean_token_accuracy": 0.7120419144630432, "num_tokens": 9644063.0, "step": 5853, "train/ce_loss": 1.505104899406433 }, { "epoch": 0.5787027882143564, "step": 5853, "train/sim_loss": 0.05859375 }, { "epoch": 0.5787027882143564, "step": 5853, "train/total_loss": 0.2091042399406433 }, { "entropy": 9.069355010986328, "epoch": 0.5788016610638719, "mean_token_accuracy": 0.7569974660873413, "num_tokens": 9649366.0, "step": 5854, "train/ce_loss": 0.9077438712120056 }, { "epoch": 0.5788016610638719, "step": 5854, "train/sim_loss": 0.0859375 }, { "epoch": 0.5788016610638719, "step": 5854, "train/total_loss": 0.17671188712120056 }, { "entropy": 9.358163833618164, "epoch": 0.5789005339133874, "mean_token_accuracy": 0.7469135522842407, "num_tokens": 9654286.0, "step": 5855, "train/ce_loss": 1.1978092193603516 }, { "epoch": 0.5789005339133874, "step": 5855, "train/sim_loss": 0.02734375 }, { "epoch": 0.5789005339133874, "step": 5855, "train/total_loss": 0.14712467789649963 }, { "entropy": 9.114377975463867, "epoch": 0.578999406762903, "mean_token_accuracy": 0.727129340171814, "num_tokens": 9659334.0, "step": 5856, "train/ce_loss": 8.964946687228803e-07 }, { "epoch": 0.578999406762903, "step": 5856, "train/sim_loss": 0.04296875 }, { "epoch": 0.578999406762903, "step": 5856, "train/total_loss": 0.04296883940696716 }, { "entropy": 8.770225524902344, "epoch": 0.5790982796124184, "mean_token_accuracy": 0.7616707682609558, "num_tokens": 9664628.0, "step": 5857, "train/ce_loss": 0.5496033430099487 }, { "epoch": 0.5790982796124184, "step": 5857, "train/sim_loss": 0.0234375 }, { "epoch": 0.5790982796124184, "step": 5857, "train/total_loss": 0.07839784026145935 }, { "entropy": 8.775310516357422, "epoch": 0.5791971524619339, "mean_token_accuracy": 0.793379008769989, "num_tokens": 9669980.0, "step": 5858, "train/ce_loss": 0.48683834075927734 }, { "epoch": 0.5791971524619339, "step": 5858, "train/sim_loss": 0.0234375 }, { "epoch": 0.5791971524619339, "step": 5858, "train/total_loss": 0.07212133705615997 }, { "entropy": 8.731611251831055, "epoch": 0.5792960253114495, "mean_token_accuracy": 0.7424083948135376, "num_tokens": 9675405.0, "step": 5859, "train/ce_loss": 0.6361439228057861 }, { "epoch": 0.5792960253114495, "step": 5859, "train/sim_loss": 0.08203125 }, { "epoch": 0.5792960253114495, "step": 5859, "train/total_loss": 0.1456456482410431 }, { "epoch": 0.579394898160965, "grad_norm": 0.7735334634780884, "learning_rate": 8.553874301537853e-06, "loss": 0.1499, "step": 5860 }, { "entropy": 9.669195175170898, "epoch": 0.579394898160965, "mean_token_accuracy": 0.7727272510528564, "num_tokens": 9680187.0, "step": 5860, "train/ce_loss": 1.6363650560379028 }, { "epoch": 0.579394898160965, "step": 5860, "train/sim_loss": 0.0546875 }, { "epoch": 0.579394898160965, "step": 5860, "train/total_loss": 0.21832400560379028 }, { "entropy": 8.610940933227539, "epoch": 0.5794937710104805, "mean_token_accuracy": 0.7306079864501953, "num_tokens": 9685589.0, "step": 5861, "train/ce_loss": 0.8293500542640686 }, { "epoch": 0.5794937710104805, "step": 5861, "train/sim_loss": 0.0546875 }, { "epoch": 0.5794937710104805, "step": 5861, "train/total_loss": 0.13762250542640686 }, { "entropy": 9.000852584838867, "epoch": 0.5795926438599961, "mean_token_accuracy": 0.7827715277671814, "num_tokens": 9690833.0, "step": 5862, "train/ce_loss": 0.8612764477729797 }, { "epoch": 0.5795926438599961, "step": 5862, "train/sim_loss": 0.05078125 }, { "epoch": 0.5795926438599961, "step": 5862, "train/total_loss": 0.1369088888168335 }, { "entropy": 8.683488845825195, "epoch": 0.5796915167095116, "mean_token_accuracy": 0.7246654033660889, "num_tokens": 9696357.0, "step": 5863, "train/ce_loss": 1.3177578449249268 }, { "epoch": 0.5796915167095116, "step": 5863, "train/sim_loss": 0.06640625 }, { "epoch": 0.5796915167095116, "step": 5863, "train/total_loss": 0.19818203151226044 }, { "entropy": 8.988357543945312, "epoch": 0.5797903895590271, "mean_token_accuracy": 0.7926470637321472, "num_tokens": 9701515.0, "step": 5864, "train/ce_loss": 0.8962416648864746 }, { "epoch": 0.5797903895590271, "step": 5864, "train/sim_loss": 0.03125 }, { "epoch": 0.5797903895590271, "step": 5864, "train/total_loss": 0.12087416648864746 }, { "entropy": 8.894834518432617, "epoch": 0.5798892624085427, "mean_token_accuracy": 0.7848605513572693, "num_tokens": 9706752.0, "step": 5865, "train/ce_loss": 0.5083255171775818 }, { "epoch": 0.5798892624085427, "step": 5865, "train/sim_loss": 0.046875 }, { "epoch": 0.5798892624085427, "step": 5865, "train/total_loss": 0.09770755469799042 }, { "entropy": 9.450010299682617, "epoch": 0.5799881352580581, "mean_token_accuracy": 0.7307060956954956, "num_tokens": 9711791.0, "step": 5866, "train/ce_loss": 1.6645127516312641e-06 }, { "epoch": 0.5799881352580581, "step": 5866, "train/sim_loss": 0.0390625 }, { "epoch": 0.5799881352580581, "step": 5866, "train/total_loss": 0.03906266763806343 }, { "entropy": 9.376605033874512, "epoch": 0.5800870081075736, "mean_token_accuracy": 0.7685950398445129, "num_tokens": 9716736.0, "step": 5867, "train/ce_loss": 0.6073023676872253 }, { "epoch": 0.5800870081075736, "step": 5867, "train/sim_loss": 0.03125 }, { "epoch": 0.5800870081075736, "step": 5867, "train/total_loss": 0.0919802337884903 }, { "entropy": 8.476332664489746, "epoch": 0.5801858809570892, "mean_token_accuracy": 0.7014613747596741, "num_tokens": 9722183.0, "step": 5868, "train/ce_loss": 1.0599383115768433 }, { "epoch": 0.5801858809570892, "step": 5868, "train/sim_loss": 0.06640625 }, { "epoch": 0.5801858809570892, "step": 5868, "train/total_loss": 0.1724000871181488 }, { "entropy": 9.360960006713867, "epoch": 0.5802847538066047, "mean_token_accuracy": 0.734133780002594, "num_tokens": 9727209.0, "step": 5869, "train/ce_loss": 1.5586105585098267 }, { "epoch": 0.5802847538066047, "step": 5869, "train/sim_loss": 0.078125 }, { "epoch": 0.5802847538066047, "step": 5869, "train/total_loss": 0.23398606479167938 }, { "entropy": 9.488181114196777, "epoch": 0.5803836266561202, "mean_token_accuracy": 0.695049524307251, "num_tokens": 9732150.0, "step": 5870, "train/ce_loss": 1.8666412415768718e-06 }, { "epoch": 0.5803836266561202, "step": 5870, "train/sim_loss": 0.04296875 }, { "epoch": 0.5803836266561202, "step": 5870, "train/total_loss": 0.04296893626451492 }, { "entropy": 9.39095687866211, "epoch": 0.5804824995056358, "mean_token_accuracy": 0.7685664892196655, "num_tokens": 9737124.0, "step": 5871, "train/ce_loss": 1.294190764427185 }, { "epoch": 0.5804824995056358, "step": 5871, "train/sim_loss": 0.078125 }, { "epoch": 0.5804824995056358, "step": 5871, "train/total_loss": 0.20754407346248627 }, { "entropy": 8.90446662902832, "epoch": 0.5805813723551513, "mean_token_accuracy": 0.7427123188972473, "num_tokens": 9742313.0, "step": 5872, "train/ce_loss": 1.3353374004364014 }, { "epoch": 0.5805813723551513, "step": 5872, "train/sim_loss": 0.046875 }, { "epoch": 0.5805813723551513, "step": 5872, "train/total_loss": 0.18040874600410461 }, { "entropy": 8.926846504211426, "epoch": 0.5806802452046668, "mean_token_accuracy": 0.7269503474235535, "num_tokens": 9747598.0, "step": 5873, "train/ce_loss": 0.8143326640129089 }, { "epoch": 0.5806802452046668, "step": 5873, "train/sim_loss": 0.05859375 }, { "epoch": 0.5806802452046668, "step": 5873, "train/total_loss": 0.1400270164012909 }, { "entropy": 9.394083976745605, "epoch": 0.5807791180541824, "mean_token_accuracy": 0.7317487001419067, "num_tokens": 9752631.0, "step": 5874, "train/ce_loss": 1.077366828918457 }, { "epoch": 0.5807791180541824, "step": 5874, "train/sim_loss": 0.10546875 }, { "epoch": 0.5807791180541824, "step": 5874, "train/total_loss": 0.21320542693138123 }, { "entropy": 9.731376647949219, "epoch": 0.5808779909036978, "mean_token_accuracy": 0.7923627495765686, "num_tokens": 9757435.0, "step": 5875, "train/ce_loss": 1.2870942782683414e-06 }, { "epoch": 0.5808779909036978, "step": 5875, "train/sim_loss": 0.015625 }, { "epoch": 0.5808779909036978, "step": 5875, "train/total_loss": 0.015625128522515297 }, { "entropy": 9.461321830749512, "epoch": 0.5809768637532133, "mean_token_accuracy": 0.7592592835426331, "num_tokens": 9762418.0, "step": 5876, "train/ce_loss": 0.8081197142601013 }, { "epoch": 0.5809768637532133, "step": 5876, "train/sim_loss": 0.04296875 }, { "epoch": 0.5809768637532133, "step": 5876, "train/total_loss": 0.12378071993589401 }, { "entropy": 8.56025505065918, "epoch": 0.5810757366027289, "mean_token_accuracy": 0.746666669845581, "num_tokens": 9767992.0, "step": 5877, "train/ce_loss": 0.9183955788612366 }, { "epoch": 0.5810757366027289, "step": 5877, "train/sim_loss": 0.06640625 }, { "epoch": 0.5810757366027289, "step": 5877, "train/total_loss": 0.15824580192565918 }, { "entropy": 8.989036560058594, "epoch": 0.5811746094522444, "mean_token_accuracy": 0.7418967485427856, "num_tokens": 9773297.0, "step": 5878, "train/ce_loss": 1.1786950826644897 }, { "epoch": 0.5811746094522444, "step": 5878, "train/sim_loss": 0.08984375 }, { "epoch": 0.5811746094522444, "step": 5878, "train/total_loss": 0.2077132612466812 }, { "entropy": 9.000946998596191, "epoch": 0.5812734823017599, "mean_token_accuracy": 0.7270029783248901, "num_tokens": 9778400.0, "step": 5879, "train/ce_loss": 1.591559648513794 }, { "epoch": 0.5812734823017599, "step": 5879, "train/sim_loss": 0.0390625 }, { "epoch": 0.5812734823017599, "step": 5879, "train/total_loss": 0.1982184648513794 }, { "epoch": 0.5813723551512755, "grad_norm": 0.7643924951553345, "learning_rate": 8.548929436779905e-06, "loss": 0.134, "step": 5880 }, { "entropy": 8.803194046020508, "epoch": 0.5813723551512755, "mean_token_accuracy": 0.743888258934021, "num_tokens": 9783757.0, "step": 5880, "train/ce_loss": 1.131672978401184 }, { "epoch": 0.5813723551512755, "step": 5880, "train/sim_loss": 0.046875 }, { "epoch": 0.5813723551512755, "step": 5880, "train/total_loss": 0.16004230082035065 }, { "entropy": 9.034435272216797, "epoch": 0.581471228000791, "mean_token_accuracy": 0.7707006335258484, "num_tokens": 9788827.0, "step": 5881, "train/ce_loss": 0.5767542123794556 }, { "epoch": 0.581471228000791, "step": 5881, "train/sim_loss": 0.05078125 }, { "epoch": 0.581471228000791, "step": 5881, "train/total_loss": 0.10845667123794556 }, { "entropy": 8.828594207763672, "epoch": 0.5815701008503065, "mean_token_accuracy": 0.681034505367279, "num_tokens": 9794306.0, "step": 5882, "train/ce_loss": 1.3235604763031006 }, { "epoch": 0.5815701008503065, "step": 5882, "train/sim_loss": 0.03125 }, { "epoch": 0.5815701008503065, "step": 5882, "train/total_loss": 0.16360604763031006 }, { "entropy": 9.103014945983887, "epoch": 0.5816689736998221, "mean_token_accuracy": 0.7027778029441833, "num_tokens": 9799447.0, "step": 5883, "train/ce_loss": 1.4092310667037964 }, { "epoch": 0.5816689736998221, "step": 5883, "train/sim_loss": 0.046875 }, { "epoch": 0.5816689736998221, "step": 5883, "train/total_loss": 0.18779811263084412 }, { "entropy": 8.762706756591797, "epoch": 0.5817678465493376, "mean_token_accuracy": 0.7728776335716248, "num_tokens": 9804852.0, "step": 5884, "train/ce_loss": 0.5362897515296936 }, { "epoch": 0.5817678465493376, "step": 5884, "train/sim_loss": 0.02734375 }, { "epoch": 0.5817678465493376, "step": 5884, "train/total_loss": 0.08097273111343384 }, { "entropy": 8.828049659729004, "epoch": 0.581866719398853, "mean_token_accuracy": 0.7747858166694641, "num_tokens": 9810111.0, "step": 5885, "train/ce_loss": 0.6966593861579895 }, { "epoch": 0.581866719398853, "step": 5885, "train/sim_loss": 0.06640625 }, { "epoch": 0.581866719398853, "step": 5885, "train/total_loss": 0.13607218861579895 }, { "entropy": 8.948248863220215, "epoch": 0.5819655922483686, "mean_token_accuracy": 0.70126873254776, "num_tokens": 9815483.0, "step": 5886, "train/ce_loss": 0.7913849949836731 }, { "epoch": 0.5819655922483686, "step": 5886, "train/sim_loss": 0.02734375 }, { "epoch": 0.5819655922483686, "step": 5886, "train/total_loss": 0.10648225247859955 }, { "entropy": 9.371553421020508, "epoch": 0.5820644650978841, "mean_token_accuracy": 0.679411768913269, "num_tokens": 9820572.0, "step": 5887, "train/ce_loss": 1.750806632117019e-06 }, { "epoch": 0.5820644650978841, "step": 5887, "train/sim_loss": 0.0234375 }, { "epoch": 0.5820644650978841, "step": 5887, "train/total_loss": 0.023437675088644028 }, { "entropy": 9.152580261230469, "epoch": 0.5821633379473996, "mean_token_accuracy": 0.7692307829856873, "num_tokens": 9825757.0, "step": 5888, "train/ce_loss": 0.8407130837440491 }, { "epoch": 0.5821633379473996, "step": 5888, "train/sim_loss": 0.0546875 }, { "epoch": 0.5821633379473996, "step": 5888, "train/total_loss": 0.1387588083744049 }, { "entropy": 8.81597900390625, "epoch": 0.5822622107969152, "mean_token_accuracy": 0.7447335720062256, "num_tokens": 9831034.0, "step": 5889, "train/ce_loss": 1.1196482181549072 }, { "epoch": 0.5822622107969152, "step": 5889, "train/sim_loss": 0.07421875 }, { "epoch": 0.5822622107969152, "step": 5889, "train/total_loss": 0.18618357181549072 }, { "entropy": 9.071273803710938, "epoch": 0.5823610836464307, "mean_token_accuracy": 0.787162184715271, "num_tokens": 9836069.0, "step": 5890, "train/ce_loss": 0.8394007086753845 }, { "epoch": 0.5823610836464307, "step": 5890, "train/sim_loss": 0.03515625 }, { "epoch": 0.5823610836464307, "step": 5890, "train/total_loss": 0.11909632384777069 }, { "entropy": 8.615422248840332, "epoch": 0.5824599564959462, "mean_token_accuracy": 0.7299492359161377, "num_tokens": 9841517.0, "step": 5891, "train/ce_loss": 0.6269214153289795 }, { "epoch": 0.5824599564959462, "step": 5891, "train/sim_loss": 0.0234375 }, { "epoch": 0.5824599564959462, "step": 5891, "train/total_loss": 0.08612964302301407 }, { "entropy": 8.989612579345703, "epoch": 0.5825588293454618, "mean_token_accuracy": 0.7749999761581421, "num_tokens": 9846669.0, "step": 5892, "train/ce_loss": 0.9876495599746704 }, { "epoch": 0.5825588293454618, "step": 5892, "train/sim_loss": 0.0390625 }, { "epoch": 0.5825588293454618, "step": 5892, "train/total_loss": 0.13782745599746704 }, { "entropy": 8.770111083984375, "epoch": 0.5826577021949773, "mean_token_accuracy": 0.7146017551422119, "num_tokens": 9852008.0, "step": 5893, "train/ce_loss": 0.8659339547157288 }, { "epoch": 0.5826577021949773, "step": 5893, "train/sim_loss": 0.12890625 }, { "epoch": 0.5826577021949773, "step": 5893, "train/total_loss": 0.2154996395111084 }, { "entropy": 8.816354751586914, "epoch": 0.5827565750444927, "mean_token_accuracy": 0.7404305934906006, "num_tokens": 9857316.0, "step": 5894, "train/ce_loss": 0.8825889229774475 }, { "epoch": 0.5827565750444927, "step": 5894, "train/sim_loss": 0.0625 }, { "epoch": 0.5827565750444927, "step": 5894, "train/total_loss": 0.15075889229774475 }, { "entropy": 8.934374809265137, "epoch": 0.5828554478940083, "mean_token_accuracy": 0.7435232996940613, "num_tokens": 9862594.0, "step": 5895, "train/ce_loss": 0.8922495245933533 }, { "epoch": 0.5828554478940083, "step": 5895, "train/sim_loss": 0.109375 }, { "epoch": 0.5828554478940083, "step": 5895, "train/total_loss": 0.19859996438026428 }, { "entropy": 8.723108291625977, "epoch": 0.5829543207435238, "mean_token_accuracy": 0.7158836722373962, "num_tokens": 9867987.0, "step": 5896, "train/ce_loss": 1.0666035413742065 }, { "epoch": 0.5829543207435238, "step": 5896, "train/sim_loss": 0.1328125 }, { "epoch": 0.5829543207435238, "step": 5896, "train/total_loss": 0.2394728660583496 }, { "entropy": 8.87197494506836, "epoch": 0.5830531935930393, "mean_token_accuracy": 0.720441997051239, "num_tokens": 9873373.0, "step": 5897, "train/ce_loss": 0.532087504863739 }, { "epoch": 0.5830531935930393, "step": 5897, "train/sim_loss": 0.0234375 }, { "epoch": 0.5830531935930393, "step": 5897, "train/total_loss": 0.07664625346660614 }, { "entropy": 8.925093650817871, "epoch": 0.5831520664425549, "mean_token_accuracy": 0.7520958185195923, "num_tokens": 9878614.0, "step": 5898, "train/ce_loss": 1.1608963012695312 }, { "epoch": 0.5831520664425549, "step": 5898, "train/sim_loss": 0.0546875 }, { "epoch": 0.5831520664425549, "step": 5898, "train/total_loss": 0.17077714204788208 }, { "entropy": 8.685224533081055, "epoch": 0.5832509392920704, "mean_token_accuracy": 0.7431610822677612, "num_tokens": 9883770.0, "step": 5899, "train/ce_loss": 1.1857091188430786 }, { "epoch": 0.5832509392920704, "step": 5899, "train/sim_loss": 0.046875 }, { "epoch": 0.5832509392920704, "step": 5899, "train/total_loss": 0.16544592380523682 }, { "epoch": 0.5833498121415859, "grad_norm": 0.8614020347595215, "learning_rate": 8.543984572021955e-06, "loss": 0.1414, "step": 5900 }, { "entropy": 8.918161392211914, "epoch": 0.5833498121415859, "mean_token_accuracy": 0.7391930818557739, "num_tokens": 9889001.0, "step": 5900, "train/ce_loss": 0.8995566368103027 }, { "epoch": 0.5833498121415859, "step": 5900, "train/sim_loss": 0.0546875 }, { "epoch": 0.5833498121415859, "step": 5900, "train/total_loss": 0.1446431577205658 }, { "entropy": 9.084014892578125, "epoch": 0.5834486849911015, "mean_token_accuracy": 0.7184594869613647, "num_tokens": 9894231.0, "step": 5901, "train/ce_loss": 1.231453537940979 }, { "epoch": 0.5834486849911015, "step": 5901, "train/sim_loss": 0.03125 }, { "epoch": 0.5834486849911015, "step": 5901, "train/total_loss": 0.15439535677433014 }, { "entropy": 8.931279182434082, "epoch": 0.583547557840617, "mean_token_accuracy": 0.6940749883651733, "num_tokens": 9899487.0, "step": 5902, "train/ce_loss": 0.7057415246963501 }, { "epoch": 0.583547557840617, "step": 5902, "train/sim_loss": 0.05078125 }, { "epoch": 0.583547557840617, "step": 5902, "train/total_loss": 0.12135540693998337 }, { "entropy": 8.705514907836914, "epoch": 0.5836464306901324, "mean_token_accuracy": 0.787994921207428, "num_tokens": 9904759.0, "step": 5903, "train/ce_loss": 0.4530998170375824 }, { "epoch": 0.5836464306901324, "step": 5903, "train/sim_loss": 0.0234375 }, { "epoch": 0.5836464306901324, "step": 5903, "train/total_loss": 0.06874748319387436 }, { "entropy": 9.598268508911133, "epoch": 0.583745303539648, "mean_token_accuracy": 0.8318385481834412, "num_tokens": 9909639.0, "step": 5904, "train/ce_loss": 9.429682563677488e-07 }, { "epoch": 0.583745303539648, "step": 5904, "train/sim_loss": 0.01171875 }, { "epoch": 0.583745303539648, "step": 5904, "train/total_loss": 0.011718844063580036 }, { "entropy": 10.027769088745117, "epoch": 0.5838441763891635, "mean_token_accuracy": 0.8108108043670654, "num_tokens": 9914188.0, "step": 5905, "train/ce_loss": 2.373533248901367 }, { "epoch": 0.5838441763891635, "step": 5905, "train/sim_loss": 0.03125 }, { "epoch": 0.5838441763891635, "step": 5905, "train/total_loss": 0.2686033248901367 }, { "entropy": 9.258383750915527, "epoch": 0.583943049238679, "mean_token_accuracy": 0.7267950773239136, "num_tokens": 9919172.0, "step": 5906, "train/ce_loss": 7.632613119312737e-07 }, { "epoch": 0.583943049238679, "step": 5906, "train/sim_loss": 0.0234375 }, { "epoch": 0.583943049238679, "step": 5906, "train/total_loss": 0.02343757636845112 }, { "entropy": 9.364818572998047, "epoch": 0.5840419220881946, "mean_token_accuracy": 0.7480559945106506, "num_tokens": 9924213.0, "step": 5907, "train/ce_loss": 0.9966214299201965 }, { "epoch": 0.5840419220881946, "step": 5907, "train/sim_loss": 0.046875 }, { "epoch": 0.5840419220881946, "step": 5907, "train/total_loss": 0.1465371549129486 }, { "entropy": 9.138063430786133, "epoch": 0.5841407949377101, "mean_token_accuracy": 0.7645466923713684, "num_tokens": 9929419.0, "step": 5908, "train/ce_loss": 0.7238141298294067 }, { "epoch": 0.5841407949377101, "step": 5908, "train/sim_loss": 0.04296875 }, { "epoch": 0.5841407949377101, "step": 5908, "train/total_loss": 0.1153501644730568 }, { "entropy": 8.901884078979492, "epoch": 0.5842396677872256, "mean_token_accuracy": 0.7815231084823608, "num_tokens": 9934694.0, "step": 5909, "train/ce_loss": 0.6918627619743347 }, { "epoch": 0.5842396677872256, "step": 5909, "train/sim_loss": 0.07421875 }, { "epoch": 0.5842396677872256, "step": 5909, "train/total_loss": 0.143405020236969 }, { "entropy": 8.765707015991211, "epoch": 0.5843385406367412, "mean_token_accuracy": 0.7502837777137756, "num_tokens": 9940041.0, "step": 5910, "train/ce_loss": 0.906067430973053 }, { "epoch": 0.5843385406367412, "step": 5910, "train/sim_loss": 0.109375 }, { "epoch": 0.5843385406367412, "step": 5910, "train/total_loss": 0.19998174905776978 }, { "entropy": 8.78848648071289, "epoch": 0.5844374134862567, "mean_token_accuracy": 0.746302604675293, "num_tokens": 9945462.0, "step": 5911, "train/ce_loss": 1.031307578086853 }, { "epoch": 0.5844374134862567, "step": 5911, "train/sim_loss": 0.06640625 }, { "epoch": 0.5844374134862567, "step": 5911, "train/total_loss": 0.1695370078086853 }, { "entropy": 8.435439109802246, "epoch": 0.5845362863357721, "mean_token_accuracy": 0.7630661725997925, "num_tokens": 9950858.0, "step": 5912, "train/ce_loss": 0.4197191894054413 }, { "epoch": 0.5845362863357721, "step": 5912, "train/sim_loss": 0.06640625 }, { "epoch": 0.5845362863357721, "step": 5912, "train/total_loss": 0.10837817192077637 }, { "entropy": 8.845929145812988, "epoch": 0.5846351591852877, "mean_token_accuracy": 0.7647753953933716, "num_tokens": 9956161.0, "step": 5913, "train/ce_loss": 1.0874028205871582 }, { "epoch": 0.5846351591852877, "step": 5913, "train/sim_loss": 0.046875 }, { "epoch": 0.5846351591852877, "step": 5913, "train/total_loss": 0.15561528503894806 }, { "entropy": 9.359431266784668, "epoch": 0.5847340320348032, "mean_token_accuracy": 0.7215384840965271, "num_tokens": 9961182.0, "step": 5914, "train/ce_loss": 0.7800878286361694 }, { "epoch": 0.5847340320348032, "step": 5914, "train/sim_loss": 0.03125 }, { "epoch": 0.5847340320348032, "step": 5914, "train/total_loss": 0.10925878584384918 }, { "entropy": 8.976252555847168, "epoch": 0.5848329048843187, "mean_token_accuracy": 0.7380607724189758, "num_tokens": 9966365.0, "step": 5915, "train/ce_loss": 0.6915847063064575 }, { "epoch": 0.5848329048843187, "step": 5915, "train/sim_loss": 0.05078125 }, { "epoch": 0.5848329048843187, "step": 5915, "train/total_loss": 0.11993972212076187 }, { "entropy": 9.099713325500488, "epoch": 0.5849317777338343, "mean_token_accuracy": 0.7078014016151428, "num_tokens": 9971523.0, "step": 5916, "train/ce_loss": 1.966060835911776e-06 }, { "epoch": 0.5849317777338343, "step": 5916, "train/sim_loss": 0.0546875 }, { "epoch": 0.5849317777338343, "step": 5916, "train/total_loss": 0.05468769744038582 }, { "entropy": 9.637526512145996, "epoch": 0.5850306505833498, "mean_token_accuracy": 0.7004504799842834, "num_tokens": 9976436.0, "step": 5917, "train/ce_loss": 1.4811153050686698e-05 }, { "epoch": 0.5850306505833498, "step": 5917, "train/sim_loss": 0.0703125 }, { "epoch": 0.5850306505833498, "step": 5917, "train/total_loss": 0.07031398266553879 }, { "entropy": 9.061832427978516, "epoch": 0.5851295234328653, "mean_token_accuracy": 0.7558620572090149, "num_tokens": 9981625.0, "step": 5918, "train/ce_loss": 0.8422671556472778 }, { "epoch": 0.5851295234328653, "step": 5918, "train/sim_loss": 0.04296875 }, { "epoch": 0.5851295234328653, "step": 5918, "train/total_loss": 0.12719547748565674 }, { "entropy": 9.495265007019043, "epoch": 0.5852283962823809, "mean_token_accuracy": 0.7879924774169922, "num_tokens": 9986595.0, "step": 5919, "train/ce_loss": 1.5633907318115234 }, { "epoch": 0.5852283962823809, "step": 5919, "train/sim_loss": 0.0234375 }, { "epoch": 0.5852283962823809, "step": 5919, "train/total_loss": 0.17977657914161682 }, { "epoch": 0.5853272691318964, "grad_norm": 0.660368025302887, "learning_rate": 8.539039707264008e-06, "loss": 0.1305, "step": 5920 }, { "entropy": 8.801708221435547, "epoch": 0.5853272691318964, "mean_token_accuracy": 0.779411792755127, "num_tokens": 9992008.0, "step": 5920, "train/ce_loss": 0.7756808996200562 }, { "epoch": 0.5853272691318964, "step": 5920, "train/sim_loss": 0.06640625 }, { "epoch": 0.5853272691318964, "step": 5920, "train/total_loss": 0.14397433400154114 }, { "entropy": 9.236620903015137, "epoch": 0.5854261419814119, "mean_token_accuracy": 0.767123281955719, "num_tokens": 9997165.0, "step": 5921, "train/ce_loss": 0.737661600112915 }, { "epoch": 0.5854261419814119, "step": 5921, "train/sim_loss": 0.06640625 }, { "epoch": 0.5854261419814119, "step": 5921, "train/total_loss": 0.14017242193222046 }, { "entropy": 8.805265426635742, "epoch": 0.5855250148309274, "mean_token_accuracy": 0.7797872424125671, "num_tokens": 10002562.0, "step": 5922, "train/ce_loss": 1.2521300315856934 }, { "epoch": 0.5855250148309274, "step": 5922, "train/sim_loss": 0.078125 }, { "epoch": 0.5855250148309274, "step": 5922, "train/total_loss": 0.20333801209926605 }, { "entropy": 9.093343734741211, "epoch": 0.5856238876804429, "mean_token_accuracy": 0.7347242832183838, "num_tokens": 10007870.0, "step": 5923, "train/ce_loss": 1.0051777362823486 }, { "epoch": 0.5856238876804429, "step": 5923, "train/sim_loss": 0.046875 }, { "epoch": 0.5856238876804429, "step": 5923, "train/total_loss": 0.14739277958869934 }, { "entropy": 9.039249420166016, "epoch": 0.5857227605299584, "mean_token_accuracy": 0.6990678906440735, "num_tokens": 10013038.0, "step": 5924, "train/ce_loss": 1.0364798307418823 }, { "epoch": 0.5857227605299584, "step": 5924, "train/sim_loss": 0.05078125 }, { "epoch": 0.5857227605299584, "step": 5924, "train/total_loss": 0.15442922711372375 }, { "entropy": 8.729142189025879, "epoch": 0.585821633379474, "mean_token_accuracy": 0.824940025806427, "num_tokens": 10018383.0, "step": 5925, "train/ce_loss": 0.5556420087814331 }, { "epoch": 0.585821633379474, "step": 5925, "train/sim_loss": 0.01953125 }, { "epoch": 0.585821633379474, "step": 5925, "train/total_loss": 0.07509545236825943 }, { "entropy": 9.028743743896484, "epoch": 0.5859205062289895, "mean_token_accuracy": 0.7197723984718323, "num_tokens": 10023580.0, "step": 5926, "train/ce_loss": 0.7546852231025696 }, { "epoch": 0.5859205062289895, "step": 5926, "train/sim_loss": 0.0546875 }, { "epoch": 0.5859205062289895, "step": 5926, "train/total_loss": 0.1301560252904892 }, { "entropy": 9.160871505737305, "epoch": 0.586019379078505, "mean_token_accuracy": 0.8131386637687683, "num_tokens": 10028723.0, "step": 5927, "train/ce_loss": 0.00012569209502544254 }, { "epoch": 0.586019379078505, "step": 5927, "train/sim_loss": 0.0703125 }, { "epoch": 0.586019379078505, "step": 5927, "train/total_loss": 0.07032506912946701 }, { "entropy": 9.165404319763184, "epoch": 0.5861182519280206, "mean_token_accuracy": 0.7296898365020752, "num_tokens": 10033886.0, "step": 5928, "train/ce_loss": 1.1629759073257446 }, { "epoch": 0.5861182519280206, "step": 5928, "train/sim_loss": 0.0703125 }, { "epoch": 0.5861182519280206, "step": 5928, "train/total_loss": 0.18661010265350342 }, { "entropy": 9.300680160522461, "epoch": 0.5862171247775361, "mean_token_accuracy": 0.7137930989265442, "num_tokens": 10038959.0, "step": 5929, "train/ce_loss": 1.3878321647644043 }, { "epoch": 0.5862171247775361, "step": 5929, "train/sim_loss": 0.03125 }, { "epoch": 0.5862171247775361, "step": 5929, "train/total_loss": 0.17003321647644043 }, { "entropy": 8.649507522583008, "epoch": 0.5863159976270517, "mean_token_accuracy": 0.834645688533783, "num_tokens": 10044446.0, "step": 5930, "train/ce_loss": 0.8138261437416077 }, { "epoch": 0.5863159976270517, "step": 5930, "train/sim_loss": 0.0703125 }, { "epoch": 0.5863159976270517, "step": 5930, "train/total_loss": 0.151695117354393 }, { "entropy": 9.599285125732422, "epoch": 0.5864148704765672, "mean_token_accuracy": 0.7827869057655334, "num_tokens": 10049321.0, "step": 5931, "train/ce_loss": 1.53306245803833 }, { "epoch": 0.5864148704765672, "step": 5931, "train/sim_loss": 0.07421875 }, { "epoch": 0.5864148704765672, "step": 5931, "train/total_loss": 0.227524995803833 }, { "entropy": 9.229707717895508, "epoch": 0.5865137433260826, "mean_token_accuracy": 0.7536231875419617, "num_tokens": 10054310.0, "step": 5932, "train/ce_loss": 0.8070537447929382 }, { "epoch": 0.5865137433260826, "step": 5932, "train/sim_loss": 0.046875 }, { "epoch": 0.5865137433260826, "step": 5932, "train/total_loss": 0.12758037447929382 }, { "entropy": 8.597070693969727, "epoch": 0.5866126161755982, "mean_token_accuracy": 0.7008547186851501, "num_tokens": 10059742.0, "step": 5933, "train/ce_loss": 0.6685133576393127 }, { "epoch": 0.5866126161755982, "step": 5933, "train/sim_loss": 0.04296875 }, { "epoch": 0.5866126161755982, "step": 5933, "train/total_loss": 0.10982009023427963 }, { "entropy": 9.315040588378906, "epoch": 0.5867114890251137, "mean_token_accuracy": 0.7763158082962036, "num_tokens": 10064796.0, "step": 5934, "train/ce_loss": 0.8337532877922058 }, { "epoch": 0.5867114890251137, "step": 5934, "train/sim_loss": 0.01953125 }, { "epoch": 0.5867114890251137, "step": 5934, "train/total_loss": 0.10290657728910446 }, { "entropy": 9.534631729125977, "epoch": 0.5868103618746292, "mean_token_accuracy": 0.6329787373542786, "num_tokens": 10069802.0, "step": 5935, "train/ce_loss": 0.7574530839920044 }, { "epoch": 0.5868103618746292, "step": 5935, "train/sim_loss": 0.046875 }, { "epoch": 0.5868103618746292, "step": 5935, "train/total_loss": 0.12262030690908432 }, { "entropy": 8.960814476013184, "epoch": 0.5869092347241448, "mean_token_accuracy": 0.7309644818305969, "num_tokens": 10075081.0, "step": 5936, "train/ce_loss": 0.8639675378799438 }, { "epoch": 0.5869092347241448, "step": 5936, "train/sim_loss": 0.06640625 }, { "epoch": 0.5869092347241448, "step": 5936, "train/total_loss": 0.15280300378799438 }, { "entropy": 9.429279327392578, "epoch": 0.5870081075736603, "mean_token_accuracy": 0.7545126080513, "num_tokens": 10080048.0, "step": 5937, "train/ce_loss": 0.908481240272522 }, { "epoch": 0.5870081075736603, "step": 5937, "train/sim_loss": 0.05078125 }, { "epoch": 0.5870081075736603, "step": 5937, "train/total_loss": 0.14162936806678772 }, { "entropy": 9.159167289733887, "epoch": 0.5871069804231758, "mean_token_accuracy": 0.7885802388191223, "num_tokens": 10085161.0, "step": 5938, "train/ce_loss": 0.7795634269714355 }, { "epoch": 0.5871069804231758, "step": 5938, "train/sim_loss": 0.01953125 }, { "epoch": 0.5871069804231758, "step": 5938, "train/total_loss": 0.09748759120702744 }, { "entropy": 8.759031295776367, "epoch": 0.5872058532726914, "mean_token_accuracy": 0.7425742745399475, "num_tokens": 10090646.0, "step": 5939, "train/ce_loss": 0.8374418020248413 }, { "epoch": 0.5872058532726914, "step": 5939, "train/sim_loss": 0.09375 }, { "epoch": 0.5872058532726914, "step": 5939, "train/total_loss": 0.17749418318271637 }, { "epoch": 0.5873047261222069, "grad_norm": 0.7194293737411499, "learning_rate": 8.534094842506058e-06, "loss": 0.1329, "step": 5940 }, { "entropy": 8.76194953918457, "epoch": 0.5873047261222069, "mean_token_accuracy": 0.6855733394622803, "num_tokens": 10095911.0, "step": 5940, "train/ce_loss": 0.8222272396087646 }, { "epoch": 0.5873047261222069, "step": 5940, "train/sim_loss": 0.06640625 }, { "epoch": 0.5873047261222069, "step": 5940, "train/total_loss": 0.14862897992134094 }, { "entropy": 8.507280349731445, "epoch": 0.5874035989717223, "mean_token_accuracy": 0.7427293062210083, "num_tokens": 10101265.0, "step": 5941, "train/ce_loss": 1.1149815320968628 }, { "epoch": 0.5874035989717223, "step": 5941, "train/sim_loss": 0.03515625 }, { "epoch": 0.5874035989717223, "step": 5941, "train/total_loss": 0.1466543972492218 }, { "entropy": 9.517021179199219, "epoch": 0.5875024718212379, "mean_token_accuracy": 0.7517730593681335, "num_tokens": 10106253.0, "step": 5942, "train/ce_loss": 1.2174076573501225e-06 }, { "epoch": 0.5875024718212379, "step": 5942, "train/sim_loss": 0.015625 }, { "epoch": 0.5875024718212379, "step": 5942, "train/total_loss": 0.0156251210719347 }, { "entropy": 8.679609298706055, "epoch": 0.5876013446707534, "mean_token_accuracy": 0.7927400469779968, "num_tokens": 10111574.0, "step": 5943, "train/ce_loss": 0.6443299651145935 }, { "epoch": 0.5876013446707534, "step": 5943, "train/sim_loss": 0.12890625 }, { "epoch": 0.5876013446707534, "step": 5943, "train/total_loss": 0.1933392584323883 }, { "entropy": 9.07916259765625, "epoch": 0.5877002175202689, "mean_token_accuracy": 0.7250945568084717, "num_tokens": 10116808.0, "step": 5944, "train/ce_loss": 0.5388218760490417 }, { "epoch": 0.5877002175202689, "step": 5944, "train/sim_loss": 0.0390625 }, { "epoch": 0.5877002175202689, "step": 5944, "train/total_loss": 0.0929446890950203 }, { "entropy": 8.75620174407959, "epoch": 0.5877990903697845, "mean_token_accuracy": 0.7218863368034363, "num_tokens": 10122108.0, "step": 5945, "train/ce_loss": 0.9487734436988831 }, { "epoch": 0.5877990903697845, "step": 5945, "train/sim_loss": 0.04296875 }, { "epoch": 0.5877990903697845, "step": 5945, "train/total_loss": 0.13784609735012054 }, { "entropy": 9.748626708984375, "epoch": 0.5878979632193, "mean_token_accuracy": 0.6862170100212097, "num_tokens": 10126858.0, "step": 5946, "train/ce_loss": 2.1903035640716553 }, { "epoch": 0.5878979632193, "step": 5946, "train/sim_loss": 0.13671875 }, { "epoch": 0.5878979632193, "step": 5946, "train/total_loss": 0.35574913024902344 }, { "entropy": 8.954833984375, "epoch": 0.5879968360688155, "mean_token_accuracy": 0.7099125385284424, "num_tokens": 10131995.0, "step": 5947, "train/ce_loss": 1.7198902368545532 }, { "epoch": 0.5879968360688155, "step": 5947, "train/sim_loss": 0.0234375 }, { "epoch": 0.5879968360688155, "step": 5947, "train/total_loss": 0.19542652368545532 }, { "entropy": 9.191041946411133, "epoch": 0.5880957089183311, "mean_token_accuracy": 0.6752827167510986, "num_tokens": 10137079.0, "step": 5948, "train/ce_loss": 1.7161764844786376e-06 }, { "epoch": 0.5880957089183311, "step": 5948, "train/sim_loss": 0.03515625 }, { "epoch": 0.5880957089183311, "step": 5948, "train/total_loss": 0.03515642136335373 }, { "entropy": 8.782689094543457, "epoch": 0.5881945817678466, "mean_token_accuracy": 0.7277432680130005, "num_tokens": 10142694.0, "step": 5949, "train/ce_loss": 0.9652318954467773 }, { "epoch": 0.5881945817678466, "step": 5949, "train/sim_loss": 0.078125 }, { "epoch": 0.5881945817678466, "step": 5949, "train/total_loss": 0.1746481955051422 }, { "entropy": 8.81184196472168, "epoch": 0.588293454617362, "mean_token_accuracy": 0.7011111378669739, "num_tokens": 10148015.0, "step": 5950, "train/ce_loss": 1.3074750900268555 }, { "epoch": 0.588293454617362, "step": 5950, "train/sim_loss": 0.0390625 }, { "epoch": 0.588293454617362, "step": 5950, "train/total_loss": 0.16981001198291779 }, { "entropy": 8.900465965270996, "epoch": 0.5883923274668776, "mean_token_accuracy": 0.7444314360618591, "num_tokens": 10153377.0, "step": 5951, "train/ce_loss": 0.5374799370765686 }, { "epoch": 0.5883923274668776, "step": 5951, "train/sim_loss": 0.078125 }, { "epoch": 0.5883923274668776, "step": 5951, "train/total_loss": 0.1318729966878891 }, { "entropy": 9.416954040527344, "epoch": 0.5884912003163931, "mean_token_accuracy": 0.7321131229400635, "num_tokens": 10158414.0, "step": 5952, "train/ce_loss": 1.132142186164856 }, { "epoch": 0.5884912003163931, "step": 5952, "train/sim_loss": 0.0546875 }, { "epoch": 0.5884912003163931, "step": 5952, "train/total_loss": 0.16790172457695007 }, { "entropy": 9.676469802856445, "epoch": 0.5885900731659086, "mean_token_accuracy": 0.8164557218551636, "num_tokens": 10163165.0, "step": 5953, "train/ce_loss": 1.314548134803772 }, { "epoch": 0.5885900731659086, "step": 5953, "train/sim_loss": 0.078125 }, { "epoch": 0.5885900731659086, "step": 5953, "train/total_loss": 0.20957981050014496 }, { "entropy": 8.674261093139648, "epoch": 0.5886889460154242, "mean_token_accuracy": 0.6891133785247803, "num_tokens": 10168631.0, "step": 5954, "train/ce_loss": 0.7551501989364624 }, { "epoch": 0.5886889460154242, "step": 5954, "train/sim_loss": 0.05859375 }, { "epoch": 0.5886889460154242, "step": 5954, "train/total_loss": 0.1341087818145752 }, { "entropy": 9.235870361328125, "epoch": 0.5887878188649397, "mean_token_accuracy": 0.7565485239028931, "num_tokens": 10173749.0, "step": 5955, "train/ce_loss": 1.3685823678970337 }, { "epoch": 0.5887878188649397, "step": 5955, "train/sim_loss": 0.0703125 }, { "epoch": 0.5887878188649397, "step": 5955, "train/total_loss": 0.2071707397699356 }, { "entropy": 9.224539756774902, "epoch": 0.5888866917144552, "mean_token_accuracy": 0.6733601093292236, "num_tokens": 10178962.0, "step": 5956, "train/ce_loss": 1.0337599515914917 }, { "epoch": 0.5888866917144552, "step": 5956, "train/sim_loss": 0.05859375 }, { "epoch": 0.5888866917144552, "step": 5956, "train/total_loss": 0.16196975111961365 }, { "entropy": 9.61436653137207, "epoch": 0.5889855645639708, "mean_token_accuracy": 0.7746478915214539, "num_tokens": 10183818.0, "step": 5957, "train/ce_loss": 0.8637999892234802 }, { "epoch": 0.5889855645639708, "step": 5957, "train/sim_loss": 0.03515625 }, { "epoch": 0.5889855645639708, "step": 5957, "train/total_loss": 0.1215362474322319 }, { "entropy": 9.203855514526367, "epoch": 0.5890844374134863, "mean_token_accuracy": 0.7799696326255798, "num_tokens": 10188971.0, "step": 5958, "train/ce_loss": 1.0645607709884644 }, { "epoch": 0.5890844374134863, "step": 5958, "train/sim_loss": 0.0546875 }, { "epoch": 0.5890844374134863, "step": 5958, "train/total_loss": 0.16114357113838196 }, { "entropy": 9.107339859008789, "epoch": 0.5891833102630017, "mean_token_accuracy": 0.7337837815284729, "num_tokens": 10194205.0, "step": 5959, "train/ce_loss": 1.004172444343567 }, { "epoch": 0.5891833102630017, "step": 5959, "train/sim_loss": 0.05859375 }, { "epoch": 0.5891833102630017, "step": 5959, "train/total_loss": 0.15901100635528564 }, { "epoch": 0.5892821831125173, "grad_norm": 0.751305103302002, "learning_rate": 8.529149977748109e-06, "loss": 0.1461, "step": 5960 }, { "entropy": 8.705224990844727, "epoch": 0.5892821831125173, "mean_token_accuracy": 0.6809881925582886, "num_tokens": 10199614.0, "step": 5960, "train/ce_loss": 1.2652076482772827 }, { "epoch": 0.5892821831125173, "step": 5960, "train/sim_loss": 0.05078125 }, { "epoch": 0.5892821831125173, "step": 5960, "train/total_loss": 0.1773020178079605 }, { "entropy": 9.094744682312012, "epoch": 0.5893810559620328, "mean_token_accuracy": 0.6996148824691772, "num_tokens": 10204796.0, "step": 5961, "train/ce_loss": 0.6137571334838867 }, { "epoch": 0.5893810559620328, "step": 5961, "train/sim_loss": 0.0234375 }, { "epoch": 0.5893810559620328, "step": 5961, "train/total_loss": 0.08481321483850479 }, { "entropy": 9.080042839050293, "epoch": 0.5894799288115483, "mean_token_accuracy": 0.7582278251647949, "num_tokens": 10210082.0, "step": 5962, "train/ce_loss": 0.7311226725578308 }, { "epoch": 0.5894799288115483, "step": 5962, "train/sim_loss": 0.01953125 }, { "epoch": 0.5894799288115483, "step": 5962, "train/total_loss": 0.09264352172613144 }, { "entropy": 9.18613052368164, "epoch": 0.5895788016610639, "mean_token_accuracy": 0.7206133008003235, "num_tokens": 10215123.0, "step": 5963, "train/ce_loss": 1.1980276107788086 }, { "epoch": 0.5895788016610639, "step": 5963, "train/sim_loss": 0.09375 }, { "epoch": 0.5895788016610639, "step": 5963, "train/total_loss": 0.21355277299880981 }, { "entropy": 8.924026489257812, "epoch": 0.5896776745105794, "mean_token_accuracy": 0.6829574108123779, "num_tokens": 10220411.0, "step": 5964, "train/ce_loss": 1.4706043004989624 }, { "epoch": 0.5896776745105794, "step": 5964, "train/sim_loss": 0.09765625 }, { "epoch": 0.5896776745105794, "step": 5964, "train/total_loss": 0.24471668899059296 }, { "entropy": 9.242369651794434, "epoch": 0.5897765473600949, "mean_token_accuracy": 0.8139534592628479, "num_tokens": 10225578.0, "step": 5965, "train/ce_loss": 0.7793514728546143 }, { "epoch": 0.5897765473600949, "step": 5965, "train/sim_loss": 0.0625 }, { "epoch": 0.5897765473600949, "step": 5965, "train/total_loss": 0.14043515920639038 }, { "entropy": 8.867958068847656, "epoch": 0.5898754202096105, "mean_token_accuracy": 0.7586206793785095, "num_tokens": 10230797.0, "step": 5966, "train/ce_loss": 1.9965153932571411 }, { "epoch": 0.5898754202096105, "step": 5966, "train/sim_loss": 0.1171875 }, { "epoch": 0.5898754202096105, "step": 5966, "train/total_loss": 0.3168390393257141 }, { "entropy": 8.562590599060059, "epoch": 0.589974293059126, "mean_token_accuracy": 0.712990939617157, "num_tokens": 10236311.0, "step": 5967, "train/ce_loss": 0.9384071826934814 }, { "epoch": 0.589974293059126, "step": 5967, "train/sim_loss": 0.078125 }, { "epoch": 0.589974293059126, "step": 5967, "train/total_loss": 0.17196571826934814 }, { "entropy": 9.240837097167969, "epoch": 0.5900731659086415, "mean_token_accuracy": 0.6978852152824402, "num_tokens": 10241585.0, "step": 5968, "train/ce_loss": 0.8842914700508118 }, { "epoch": 0.5900731659086415, "step": 5968, "train/sim_loss": 0.09375 }, { "epoch": 0.5900731659086415, "step": 5968, "train/total_loss": 0.18217915296554565 }, { "entropy": 8.418539047241211, "epoch": 0.590172038758157, "mean_token_accuracy": 0.7446808218955994, "num_tokens": 10247084.0, "step": 5969, "train/ce_loss": 0.591464638710022 }, { "epoch": 0.590172038758157, "step": 5969, "train/sim_loss": 0.03125 }, { "epoch": 0.590172038758157, "step": 5969, "train/total_loss": 0.0903964638710022 }, { "entropy": 9.232823371887207, "epoch": 0.5902709116076725, "mean_token_accuracy": 0.7107195258140564, "num_tokens": 10252373.0, "step": 5970, "train/ce_loss": 1.2386114597320557 }, { "epoch": 0.5902709116076725, "step": 5970, "train/sim_loss": 0.0625 }, { "epoch": 0.5902709116076725, "step": 5970, "train/total_loss": 0.1863611489534378 }, { "entropy": 9.246217727661133, "epoch": 0.590369784457188, "mean_token_accuracy": 0.7820324301719666, "num_tokens": 10257523.0, "step": 5971, "train/ce_loss": 0.5982750654220581 }, { "epoch": 0.590369784457188, "step": 5971, "train/sim_loss": 0.03515625 }, { "epoch": 0.590369784457188, "step": 5971, "train/total_loss": 0.09498375654220581 }, { "entropy": 8.83492660522461, "epoch": 0.5904686573067036, "mean_token_accuracy": 0.7675489187240601, "num_tokens": 10262878.0, "step": 5972, "train/ce_loss": 1.4663736820220947 }, { "epoch": 0.5904686573067036, "step": 5972, "train/sim_loss": 0.015625 }, { "epoch": 0.5904686573067036, "step": 5972, "train/total_loss": 0.16226236522197723 }, { "entropy": 9.27004623413086, "epoch": 0.5905675301562191, "mean_token_accuracy": 0.7492997050285339, "num_tokens": 10268046.0, "step": 5973, "train/ce_loss": 0.9438756704330444 }, { "epoch": 0.5905675301562191, "step": 5973, "train/sim_loss": 0.01953125 }, { "epoch": 0.5905675301562191, "step": 5973, "train/total_loss": 0.11391881853342056 }, { "entropy": 8.810379028320312, "epoch": 0.5906664030057346, "mean_token_accuracy": 0.7188940048217773, "num_tokens": 10273367.0, "step": 5974, "train/ce_loss": 0.8861865997314453 }, { "epoch": 0.5906664030057346, "step": 5974, "train/sim_loss": 0.03515625 }, { "epoch": 0.5906664030057346, "step": 5974, "train/total_loss": 0.12377490848302841 }, { "entropy": 9.149025917053223, "epoch": 0.5907652758552502, "mean_token_accuracy": 0.7546916604042053, "num_tokens": 10278541.0, "step": 5975, "train/ce_loss": 0.8311251997947693 }, { "epoch": 0.5907652758552502, "step": 5975, "train/sim_loss": 0.0859375 }, { "epoch": 0.5907652758552502, "step": 5975, "train/total_loss": 0.16905002295970917 }, { "entropy": 8.849111557006836, "epoch": 0.5908641487047657, "mean_token_accuracy": 0.7751091718673706, "num_tokens": 10283950.0, "step": 5976, "train/ce_loss": 0.901870608329773 }, { "epoch": 0.5908641487047657, "step": 5976, "train/sim_loss": 0.05859375 }, { "epoch": 0.5908641487047657, "step": 5976, "train/total_loss": 0.14878082275390625 }, { "entropy": 8.77665901184082, "epoch": 0.5909630215542812, "mean_token_accuracy": 0.6936842203140259, "num_tokens": 10289340.0, "step": 5977, "train/ce_loss": 1.39908766746521 }, { "epoch": 0.5909630215542812, "step": 5977, "train/sim_loss": 0.08984375 }, { "epoch": 0.5909630215542812, "step": 5977, "train/total_loss": 0.2297525256872177 }, { "entropy": 9.108182907104492, "epoch": 0.5910618944037968, "mean_token_accuracy": 0.8383084535598755, "num_tokens": 10294535.0, "step": 5978, "train/ce_loss": 0.6628880500793457 }, { "epoch": 0.5910618944037968, "step": 5978, "train/sim_loss": 0.01953125 }, { "epoch": 0.5910618944037968, "step": 5978, "train/total_loss": 0.08582005649805069 }, { "entropy": 8.767389297485352, "epoch": 0.5911607672533122, "mean_token_accuracy": 0.7164339423179626, "num_tokens": 10299907.0, "step": 5979, "train/ce_loss": 0.8791292309761047 }, { "epoch": 0.5911607672533122, "step": 5979, "train/sim_loss": 0.046875 }, { "epoch": 0.5911607672533122, "step": 5979, "train/total_loss": 0.134787917137146 }, { "epoch": 0.5912596401028277, "grad_norm": 0.6115033626556396, "learning_rate": 8.52420511299016e-06, "loss": 0.1422, "step": 5980 }, { "entropy": 9.157818794250488, "epoch": 0.5912596401028277, "mean_token_accuracy": 0.7220670580863953, "num_tokens": 10305099.0, "step": 5980, "train/ce_loss": 1.5501344203948975 }, { "epoch": 0.5912596401028277, "step": 5980, "train/sim_loss": 0.0625 }, { "epoch": 0.5912596401028277, "step": 5980, "train/total_loss": 0.21751344203948975 }, { "entropy": 9.237415313720703, "epoch": 0.5913585129523433, "mean_token_accuracy": 0.7444794774055481, "num_tokens": 10310359.0, "step": 5981, "train/ce_loss": 0.811318576335907 }, { "epoch": 0.5913585129523433, "step": 5981, "train/sim_loss": 0.12109375 }, { "epoch": 0.5913585129523433, "step": 5981, "train/total_loss": 0.20222561061382294 }, { "entropy": 9.38630485534668, "epoch": 0.5914573858018588, "mean_token_accuracy": 0.7755681872367859, "num_tokens": 10315509.0, "step": 5982, "train/ce_loss": 0.9271148443222046 }, { "epoch": 0.5914573858018588, "step": 5982, "train/sim_loss": 0.11328125 }, { "epoch": 0.5914573858018588, "step": 5982, "train/total_loss": 0.20599272847175598 }, { "entropy": 8.738609313964844, "epoch": 0.5915562586513743, "mean_token_accuracy": 0.7485648393630981, "num_tokens": 10320852.0, "step": 5983, "train/ce_loss": 0.9920079112052917 }, { "epoch": 0.5915562586513743, "step": 5983, "train/sim_loss": 0.0625 }, { "epoch": 0.5915562586513743, "step": 5983, "train/total_loss": 0.1617007851600647 }, { "entropy": 9.557707786560059, "epoch": 0.5916551315008899, "mean_token_accuracy": 0.7597955465316772, "num_tokens": 10325898.0, "step": 5984, "train/ce_loss": 0.8455591201782227 }, { "epoch": 0.5916551315008899, "step": 5984, "train/sim_loss": 0.05078125 }, { "epoch": 0.5916551315008899, "step": 5984, "train/total_loss": 0.13533717393875122 }, { "entropy": 8.817047119140625, "epoch": 0.5917540043504054, "mean_token_accuracy": 0.7829099297523499, "num_tokens": 10331184.0, "step": 5985, "train/ce_loss": 0.9472475647926331 }, { "epoch": 0.5917540043504054, "step": 5985, "train/sim_loss": 0.0546875 }, { "epoch": 0.5917540043504054, "step": 5985, "train/total_loss": 0.14941225945949554 }, { "entropy": 9.066792488098145, "epoch": 0.5918528771999209, "mean_token_accuracy": 0.7258297204971313, "num_tokens": 10336377.0, "step": 5986, "train/ce_loss": 3.119315124422428e-06 }, { "epoch": 0.5918528771999209, "step": 5986, "train/sim_loss": 0.05859375 }, { "epoch": 0.5918528771999209, "step": 5986, "train/total_loss": 0.05859406292438507 }, { "entropy": 8.724923133850098, "epoch": 0.5919517500494365, "mean_token_accuracy": 0.7150654792785645, "num_tokens": 10341818.0, "step": 5987, "train/ce_loss": 0.7353115081787109 }, { "epoch": 0.5919517500494365, "step": 5987, "train/sim_loss": 0.0859375 }, { "epoch": 0.5919517500494365, "step": 5987, "train/total_loss": 0.1594686508178711 }, { "entropy": 9.618450164794922, "epoch": 0.5920506228989519, "mean_token_accuracy": 0.7329649925231934, "num_tokens": 10346779.0, "step": 5988, "train/ce_loss": 2.397352933883667 }, { "epoch": 0.5920506228989519, "step": 5988, "train/sim_loss": 0.03515625 }, { "epoch": 0.5920506228989519, "step": 5988, "train/total_loss": 0.27489155530929565 }, { "entropy": 8.573394775390625, "epoch": 0.5921494957484674, "mean_token_accuracy": 0.7003710865974426, "num_tokens": 10352328.0, "step": 5989, "train/ce_loss": 0.8444566130638123 }, { "epoch": 0.5921494957484674, "step": 5989, "train/sim_loss": 0.0390625 }, { "epoch": 0.5921494957484674, "step": 5989, "train/total_loss": 0.12350816279649734 }, { "entropy": 8.723489761352539, "epoch": 0.592248368597983, "mean_token_accuracy": 0.7549824118614197, "num_tokens": 10357679.0, "step": 5990, "train/ce_loss": 0.5021554231643677 }, { "epoch": 0.592248368597983, "step": 5990, "train/sim_loss": 0.0390625 }, { "epoch": 0.592248368597983, "step": 5990, "train/total_loss": 0.08927804231643677 }, { "entropy": 9.586736679077148, "epoch": 0.5923472414474985, "mean_token_accuracy": 0.6982142925262451, "num_tokens": 10362687.0, "step": 5991, "train/ce_loss": 1.5734952967250138e-06 }, { "epoch": 0.5923472414474985, "step": 5991, "train/sim_loss": 0.0625 }, { "epoch": 0.5923472414474985, "step": 5991, "train/total_loss": 0.06250015646219254 }, { "entropy": 9.379465103149414, "epoch": 0.592446114297014, "mean_token_accuracy": 0.6677471399307251, "num_tokens": 10367729.0, "step": 5992, "train/ce_loss": 1.6429595947265625 }, { "epoch": 0.592446114297014, "step": 5992, "train/sim_loss": 0.046875 }, { "epoch": 0.592446114297014, "step": 5992, "train/total_loss": 0.211170956492424 }, { "entropy": 9.429786682128906, "epoch": 0.5925449871465296, "mean_token_accuracy": 0.7732864618301392, "num_tokens": 10372714.0, "step": 5993, "train/ce_loss": 0.835451066493988 }, { "epoch": 0.5925449871465296, "step": 5993, "train/sim_loss": 0.078125 }, { "epoch": 0.5925449871465296, "step": 5993, "train/total_loss": 0.16167011857032776 }, { "entropy": 9.059162139892578, "epoch": 0.5926438599960451, "mean_token_accuracy": 0.6840620636940002, "num_tokens": 10377885.0, "step": 5994, "train/ce_loss": 4.421832727530273e-06 }, { "epoch": 0.5926438599960451, "step": 5994, "train/sim_loss": 0.046875 }, { "epoch": 0.5926438599960451, "step": 5994, "train/total_loss": 0.04687544330954552 }, { "entropy": 9.235883712768555, "epoch": 0.5927427328455606, "mean_token_accuracy": 0.7774648070335388, "num_tokens": 10383039.0, "step": 5995, "train/ce_loss": 0.6720249056816101 }, { "epoch": 0.5927427328455606, "step": 5995, "train/sim_loss": 0.13671875 }, { "epoch": 0.5927427328455606, "step": 5995, "train/total_loss": 0.20392124354839325 }, { "entropy": 9.269105911254883, "epoch": 0.5928416056950762, "mean_token_accuracy": 0.7384615540504456, "num_tokens": 10388308.0, "step": 5996, "train/ce_loss": 1.1394755840301514 }, { "epoch": 0.5928416056950762, "step": 5996, "train/sim_loss": 0.06640625 }, { "epoch": 0.5928416056950762, "step": 5996, "train/total_loss": 0.1803538203239441 }, { "entropy": 9.188507080078125, "epoch": 0.5929404785445916, "mean_token_accuracy": 0.7356475591659546, "num_tokens": 10393529.0, "step": 5997, "train/ce_loss": 0.7828561663627625 }, { "epoch": 0.5929404785445916, "step": 5997, "train/sim_loss": 0.046875 }, { "epoch": 0.5929404785445916, "step": 5997, "train/total_loss": 0.12516061961650848 }, { "entropy": 8.738287925720215, "epoch": 0.5930393513941071, "mean_token_accuracy": 0.7173333168029785, "num_tokens": 10398722.0, "step": 5998, "train/ce_loss": 0.8791782259941101 }, { "epoch": 0.5930393513941071, "step": 5998, "train/sim_loss": 0.078125 }, { "epoch": 0.5930393513941071, "step": 5998, "train/total_loss": 0.16604283452033997 }, { "entropy": 9.403421401977539, "epoch": 0.5931382242436227, "mean_token_accuracy": 0.6938775777816772, "num_tokens": 10403738.0, "step": 5999, "train/ce_loss": 1.8218058347702026 }, { "epoch": 0.5931382242436227, "step": 5999, "train/sim_loss": 0.0703125 }, { "epoch": 0.5931382242436227, "step": 5999, "train/total_loss": 0.25249308347702026 }, { "epoch": 0.5932370970931382, "grad_norm": 0.835832953453064, "learning_rate": 8.519260248232211e-06, "loss": 0.1488, "step": 6000 }, { "entropy": 9.12228775024414, "epoch": 0.5932370970931382, "mean_token_accuracy": 0.7197368144989014, "num_tokens": 10408978.0, "step": 6000, "train/ce_loss": 0.6845834851264954 }, { "epoch": 0.5932370970931382, "step": 6000, "train/sim_loss": 0.08203125 }, { "epoch": 0.5932370970931382, "step": 6000, "train/total_loss": 0.15048959851264954 }, { "entropy": 8.558881759643555, "epoch": 0.5933359699426537, "mean_token_accuracy": 0.7311272025108337, "num_tokens": 10414416.0, "step": 6001, "train/ce_loss": 0.7443356513977051 }, { "epoch": 0.5933359699426537, "step": 6001, "train/sim_loss": 0.05078125 }, { "epoch": 0.5933359699426537, "step": 6001, "train/total_loss": 0.1252148151397705 }, { "entropy": 8.505014419555664, "epoch": 0.5934348427921693, "mean_token_accuracy": 0.7942631244659424, "num_tokens": 10419925.0, "step": 6002, "train/ce_loss": 0.6493032574653625 }, { "epoch": 0.5934348427921693, "step": 6002, "train/sim_loss": 0.03515625 }, { "epoch": 0.5934348427921693, "step": 6002, "train/total_loss": 0.10008657723665237 }, { "entropy": 9.107410430908203, "epoch": 0.5935337156416848, "mean_token_accuracy": 0.7581967115402222, "num_tokens": 10425124.0, "step": 6003, "train/ce_loss": 0.5387682914733887 }, { "epoch": 0.5935337156416848, "step": 6003, "train/sim_loss": 0.046875 }, { "epoch": 0.5935337156416848, "step": 6003, "train/total_loss": 0.1007518321275711 }, { "entropy": 9.057453155517578, "epoch": 0.5936325884912003, "mean_token_accuracy": 0.7148760557174683, "num_tokens": 10430300.0, "step": 6004, "train/ce_loss": 1.7691458463668823 }, { "epoch": 0.5936325884912003, "step": 6004, "train/sim_loss": 0.04296875 }, { "epoch": 0.5936325884912003, "step": 6004, "train/total_loss": 0.21988333761692047 }, { "entropy": 8.704456329345703, "epoch": 0.5937314613407159, "mean_token_accuracy": 0.7312373518943787, "num_tokens": 10435742.0, "step": 6005, "train/ce_loss": 0.41926902532577515 }, { "epoch": 0.5937314613407159, "step": 6005, "train/sim_loss": 0.078125 }, { "epoch": 0.5937314613407159, "step": 6005, "train/total_loss": 0.12005190551280975 }, { "entropy": 8.764145851135254, "epoch": 0.5938303341902313, "mean_token_accuracy": 0.7616060376167297, "num_tokens": 10441015.0, "step": 6006, "train/ce_loss": 1.517891764640808 }, { "epoch": 0.5938303341902313, "step": 6006, "train/sim_loss": 0.05078125 }, { "epoch": 0.5938303341902313, "step": 6006, "train/total_loss": 0.20257042348384857 }, { "entropy": 8.942367553710938, "epoch": 0.5939292070397468, "mean_token_accuracy": 0.7817142605781555, "num_tokens": 10446363.0, "step": 6007, "train/ce_loss": 0.6082258820533752 }, { "epoch": 0.5939292070397468, "step": 6007, "train/sim_loss": 0.1015625 }, { "epoch": 0.5939292070397468, "step": 6007, "train/total_loss": 0.16238509118556976 }, { "entropy": 9.576613426208496, "epoch": 0.5940280798892624, "mean_token_accuracy": 0.7442922592163086, "num_tokens": 10451277.0, "step": 6008, "train/ce_loss": 2.372051540078246e-06 }, { "epoch": 0.5940280798892624, "step": 6008, "train/sim_loss": 0.02734375 }, { "epoch": 0.5940280798892624, "step": 6008, "train/total_loss": 0.027343986555933952 }, { "entropy": 9.160564422607422, "epoch": 0.5941269527387779, "mean_token_accuracy": 0.6874135732650757, "num_tokens": 10456436.0, "step": 6009, "train/ce_loss": 0.8028679490089417 }, { "epoch": 0.5941269527387779, "step": 6009, "train/sim_loss": 0.08203125 }, { "epoch": 0.5941269527387779, "step": 6009, "train/total_loss": 0.16231805086135864 }, { "entropy": 9.258647918701172, "epoch": 0.5942258255882934, "mean_token_accuracy": 0.7239263653755188, "num_tokens": 10461502.0, "step": 6010, "train/ce_loss": 2.2997858195594745e-06 }, { "epoch": 0.5942258255882934, "step": 6010, "train/sim_loss": 0.05078125 }, { "epoch": 0.5942258255882934, "step": 6010, "train/total_loss": 0.050781480967998505 }, { "entropy": 8.781116485595703, "epoch": 0.594324698437809, "mean_token_accuracy": 0.7196819186210632, "num_tokens": 10466969.0, "step": 6011, "train/ce_loss": 0.5423060059547424 }, { "epoch": 0.594324698437809, "step": 6011, "train/sim_loss": 0.02734375 }, { "epoch": 0.594324698437809, "step": 6011, "train/total_loss": 0.08157435059547424 }, { "entropy": 9.20504379272461, "epoch": 0.5944235712873245, "mean_token_accuracy": 0.7269841432571411, "num_tokens": 10472046.0, "step": 6012, "train/ce_loss": 1.0770103244794882e-06 }, { "epoch": 0.5944235712873245, "step": 6012, "train/sim_loss": 0.03125 }, { "epoch": 0.5944235712873245, "step": 6012, "train/total_loss": 0.031250108033418655 }, { "entropy": 9.047160148620605, "epoch": 0.5945224441368401, "mean_token_accuracy": 0.7447698712348938, "num_tokens": 10477235.0, "step": 6013, "train/ce_loss": 0.8327636122703552 }, { "epoch": 0.5945224441368401, "step": 6013, "train/sim_loss": 0.0234375 }, { "epoch": 0.5945224441368401, "step": 6013, "train/total_loss": 0.10671386122703552 }, { "entropy": 9.16783332824707, "epoch": 0.5946213169863556, "mean_token_accuracy": 0.7651515007019043, "num_tokens": 10482350.0, "step": 6014, "train/ce_loss": 0.9572628140449524 }, { "epoch": 0.5946213169863556, "step": 6014, "train/sim_loss": 0.08203125 }, { "epoch": 0.5946213169863556, "step": 6014, "train/total_loss": 0.17775753140449524 }, { "entropy": 9.01803970336914, "epoch": 0.594720189835871, "mean_token_accuracy": 0.7959442138671875, "num_tokens": 10487629.0, "step": 6015, "train/ce_loss": 0.5711748003959656 }, { "epoch": 0.594720189835871, "step": 6015, "train/sim_loss": 0.0546875 }, { "epoch": 0.594720189835871, "step": 6015, "train/total_loss": 0.11180497705936432 }, { "entropy": 9.087404251098633, "epoch": 0.5948190626853866, "mean_token_accuracy": 0.7900000214576721, "num_tokens": 10492839.0, "step": 6016, "train/ce_loss": 0.7471591234207153 }, { "epoch": 0.5948190626853866, "step": 6016, "train/sim_loss": 0.0390625 }, { "epoch": 0.5948190626853866, "step": 6016, "train/total_loss": 0.11377841234207153 }, { "entropy": 8.938117980957031, "epoch": 0.5949179355349021, "mean_token_accuracy": 0.7794316411018372, "num_tokens": 10498024.0, "step": 6017, "train/ce_loss": 0.7339110970497131 }, { "epoch": 0.5949179355349021, "step": 6017, "train/sim_loss": 0.05078125 }, { "epoch": 0.5949179355349021, "step": 6017, "train/total_loss": 0.12417235970497131 }, { "entropy": 8.622573852539062, "epoch": 0.5950168083844176, "mean_token_accuracy": 0.7354211807250977, "num_tokens": 10503454.0, "step": 6018, "train/ce_loss": 1.3949031829833984 }, { "epoch": 0.5950168083844176, "step": 6018, "train/sim_loss": 0.06640625 }, { "epoch": 0.5950168083844176, "step": 6018, "train/total_loss": 0.20589657127857208 }, { "entropy": 8.990918159484863, "epoch": 0.5951156812339332, "mean_token_accuracy": 0.735336184501648, "num_tokens": 10508657.0, "step": 6019, "train/ce_loss": 0.8849911093711853 }, { "epoch": 0.5951156812339332, "step": 6019, "train/sim_loss": 0.0546875 }, { "epoch": 0.5951156812339332, "step": 6019, "train/total_loss": 0.14318661391735077 }, { "epoch": 0.5952145540834487, "grad_norm": 0.8095764517784119, "learning_rate": 8.514315383474263e-06, "loss": 0.1375, "step": 6020 }, { "entropy": 8.447221755981445, "epoch": 0.5952145540834487, "mean_token_accuracy": 0.7607361674308777, "num_tokens": 10514128.0, "step": 6020, "train/ce_loss": 1.076474905014038 }, { "epoch": 0.5952145540834487, "step": 6020, "train/sim_loss": 0.046875 }, { "epoch": 0.5952145540834487, "step": 6020, "train/total_loss": 0.15452249348163605 }, { "entropy": 9.175122261047363, "epoch": 0.5953134269329642, "mean_token_accuracy": 0.7266982793807983, "num_tokens": 10519221.0, "step": 6021, "train/ce_loss": 1.7625705003738403 }, { "epoch": 0.5953134269329642, "step": 6021, "train/sim_loss": 0.09375 }, { "epoch": 0.5953134269329642, "step": 6021, "train/total_loss": 0.27000707387924194 }, { "entropy": 9.325368881225586, "epoch": 0.5954122997824798, "mean_token_accuracy": 0.7176870703697205, "num_tokens": 10524291.0, "step": 6022, "train/ce_loss": 0.9657346606254578 }, { "epoch": 0.5954122997824798, "step": 6022, "train/sim_loss": 0.0546875 }, { "epoch": 0.5954122997824798, "step": 6022, "train/total_loss": 0.15126097202301025 }, { "entropy": 8.491939544677734, "epoch": 0.5955111726319953, "mean_token_accuracy": 0.7690721750259399, "num_tokens": 10529793.0, "step": 6023, "train/ce_loss": 0.39177945256233215 }, { "epoch": 0.5955111726319953, "step": 6023, "train/sim_loss": 0.03125 }, { "epoch": 0.5955111726319953, "step": 6023, "train/total_loss": 0.07042794674634933 }, { "entropy": 9.656220436096191, "epoch": 0.5956100454815108, "mean_token_accuracy": 0.733031690120697, "num_tokens": 10534646.0, "step": 6024, "train/ce_loss": 1.2732746601104736 }, { "epoch": 0.5956100454815108, "step": 6024, "train/sim_loss": 0.04296875 }, { "epoch": 0.5956100454815108, "step": 6024, "train/total_loss": 0.17029622197151184 }, { "entropy": 9.38523006439209, "epoch": 0.5957089183310263, "mean_token_accuracy": 0.6588419675827026, "num_tokens": 10539704.0, "step": 6025, "train/ce_loss": 1.8997355699539185 }, { "epoch": 0.5957089183310263, "step": 6025, "train/sim_loss": 0.0546875 }, { "epoch": 0.5957089183310263, "step": 6025, "train/total_loss": 0.24466106295585632 }, { "entropy": 8.721146583557129, "epoch": 0.5958077911805418, "mean_token_accuracy": 0.7536800503730774, "num_tokens": 10545233.0, "step": 6026, "train/ce_loss": 0.39012497663497925 }, { "epoch": 0.5958077911805418, "step": 6026, "train/sim_loss": 0.01953125 }, { "epoch": 0.5958077911805418, "step": 6026, "train/total_loss": 0.058543749153614044 }, { "entropy": 9.056755065917969, "epoch": 0.5959066640300573, "mean_token_accuracy": 0.7568305730819702, "num_tokens": 10550422.0, "step": 6027, "train/ce_loss": 0.6360731720924377 }, { "epoch": 0.5959066640300573, "step": 6027, "train/sim_loss": 0.0234375 }, { "epoch": 0.5959066640300573, "step": 6027, "train/total_loss": 0.08704482018947601 }, { "entropy": 8.49323844909668, "epoch": 0.5960055368795729, "mean_token_accuracy": 0.6995798349380493, "num_tokens": 10555866.0, "step": 6028, "train/ce_loss": 1.7057461738586426 }, { "epoch": 0.5960055368795729, "step": 6028, "train/sim_loss": 0.0390625 }, { "epoch": 0.5960055368795729, "step": 6028, "train/total_loss": 0.2096371203660965 }, { "entropy": 9.338155746459961, "epoch": 0.5961044097290884, "mean_token_accuracy": 0.7523961663246155, "num_tokens": 10560934.0, "step": 6029, "train/ce_loss": 0.5146098136901855 }, { "epoch": 0.5961044097290884, "step": 6029, "train/sim_loss": 0.0625 }, { "epoch": 0.5961044097290884, "step": 6029, "train/total_loss": 0.11396098136901855 }, { "entropy": 9.117780685424805, "epoch": 0.5962032825786039, "mean_token_accuracy": 0.7988826632499695, "num_tokens": 10566128.0, "step": 6030, "train/ce_loss": 0.658829927444458 }, { "epoch": 0.5962032825786039, "step": 6030, "train/sim_loss": 0.01953125 }, { "epoch": 0.5962032825786039, "step": 6030, "train/total_loss": 0.08541424572467804 }, { "entropy": 8.720762252807617, "epoch": 0.5963021554281195, "mean_token_accuracy": 0.7643391489982605, "num_tokens": 10571325.0, "step": 6031, "train/ce_loss": 1.2273279428482056 }, { "epoch": 0.5963021554281195, "step": 6031, "train/sim_loss": 0.05859375 }, { "epoch": 0.5963021554281195, "step": 6031, "train/total_loss": 0.18132653832435608 }, { "entropy": 8.94140625, "epoch": 0.596401028277635, "mean_token_accuracy": 0.8042269349098206, "num_tokens": 10576682.0, "step": 6032, "train/ce_loss": 0.35526126623153687 }, { "epoch": 0.596401028277635, "step": 6032, "train/sim_loss": 0.015625 }, { "epoch": 0.596401028277635, "step": 6032, "train/total_loss": 0.051151126623153687 }, { "entropy": 9.405384063720703, "epoch": 0.5964999011271505, "mean_token_accuracy": 0.7786116600036621, "num_tokens": 10581685.0, "step": 6033, "train/ce_loss": 0.8011324405670166 }, { "epoch": 0.5964999011271505, "step": 6033, "train/sim_loss": 0.03125 }, { "epoch": 0.5964999011271505, "step": 6033, "train/total_loss": 0.1113632470369339 }, { "entropy": 9.004520416259766, "epoch": 0.596598773976666, "mean_token_accuracy": 0.7183908224105835, "num_tokens": 10586975.0, "step": 6034, "train/ce_loss": 2.220550775527954 }, { "epoch": 0.596598773976666, "step": 6034, "train/sim_loss": 0.0625 }, { "epoch": 0.596598773976666, "step": 6034, "train/total_loss": 0.2845550775527954 }, { "entropy": 9.192344665527344, "epoch": 0.5966976468261815, "mean_token_accuracy": 0.6920821070671082, "num_tokens": 10592166.0, "step": 6035, "train/ce_loss": 0.9314236640930176 }, { "epoch": 0.5966976468261815, "step": 6035, "train/sim_loss": 0.07421875 }, { "epoch": 0.5966976468261815, "step": 6035, "train/total_loss": 0.16736111044883728 }, { "entropy": 8.852960586547852, "epoch": 0.596796519675697, "mean_token_accuracy": 0.7294981479644775, "num_tokens": 10597426.0, "step": 6036, "train/ce_loss": 0.7451646327972412 }, { "epoch": 0.596796519675697, "step": 6036, "train/sim_loss": 0.03125 }, { "epoch": 0.596796519675697, "step": 6036, "train/total_loss": 0.10576646775007248 }, { "entropy": 8.74105453491211, "epoch": 0.5968953925252126, "mean_token_accuracy": 0.6511024832725525, "num_tokens": 10602694.0, "step": 6037, "train/ce_loss": 1.219413161277771 }, { "epoch": 0.5968953925252126, "step": 6037, "train/sim_loss": 0.1015625 }, { "epoch": 0.5968953925252126, "step": 6037, "train/total_loss": 0.22350382804870605 }, { "entropy": 8.915050506591797, "epoch": 0.5969942653747281, "mean_token_accuracy": 0.7262569665908813, "num_tokens": 10607843.0, "step": 6038, "train/ce_loss": 0.4067918658256531 }, { "epoch": 0.5969942653747281, "step": 6038, "train/sim_loss": 0.05859375 }, { "epoch": 0.5969942653747281, "step": 6038, "train/total_loss": 0.09927293658256531 }, { "entropy": 9.184982299804688, "epoch": 0.5970931382242436, "mean_token_accuracy": 0.7351852059364319, "num_tokens": 10612812.0, "step": 6039, "train/ce_loss": 0.997426450252533 }, { "epoch": 0.5970931382242436, "step": 6039, "train/sim_loss": 0.05859375 }, { "epoch": 0.5970931382242436, "step": 6039, "train/total_loss": 0.15833640098571777 }, { "epoch": 0.5971920110737592, "grad_norm": 0.8017409443855286, "learning_rate": 8.509370518716314e-06, "loss": 0.1375, "step": 6040 }, { "entropy": 8.831388473510742, "epoch": 0.5971920110737592, "mean_token_accuracy": 0.7482837438583374, "num_tokens": 10618167.0, "step": 6040, "train/ce_loss": 1.6475485153932823e-06 }, { "epoch": 0.5971920110737592, "step": 6040, "train/sim_loss": 0.0390625 }, { "epoch": 0.5971920110737592, "step": 6040, "train/total_loss": 0.03906266391277313 }, { "entropy": 9.149188995361328, "epoch": 0.5972908839232747, "mean_token_accuracy": 0.7090619802474976, "num_tokens": 10623212.0, "step": 6041, "train/ce_loss": 2.4567166292399634e-06 }, { "epoch": 0.5972908839232747, "step": 6041, "train/sim_loss": 0.03515625 }, { "epoch": 0.5972908839232747, "step": 6041, "train/total_loss": 0.0351564958691597 }, { "entropy": 9.088918685913086, "epoch": 0.5973897567727902, "mean_token_accuracy": 0.7598314881324768, "num_tokens": 10628336.0, "step": 6042, "train/ce_loss": 1.2687687873840332 }, { "epoch": 0.5973897567727902, "step": 6042, "train/sim_loss": 0.046875 }, { "epoch": 0.5973897567727902, "step": 6042, "train/total_loss": 0.17375187575817108 }, { "entropy": 9.040409088134766, "epoch": 0.5974886296223058, "mean_token_accuracy": 0.7434841990470886, "num_tokens": 10633439.0, "step": 6043, "train/ce_loss": 1.3814735412597656 }, { "epoch": 0.5974886296223058, "step": 6043, "train/sim_loss": 0.0625 }, { "epoch": 0.5974886296223058, "step": 6043, "train/total_loss": 0.20064735412597656 }, { "entropy": 9.518924713134766, "epoch": 0.5975875024718212, "mean_token_accuracy": 0.7703180313110352, "num_tokens": 10638424.0, "step": 6044, "train/ce_loss": 1.0740244388580322 }, { "epoch": 0.5975875024718212, "step": 6044, "train/sim_loss": 0.06640625 }, { "epoch": 0.5975875024718212, "step": 6044, "train/total_loss": 0.17380869388580322 }, { "entropy": 8.724679946899414, "epoch": 0.5976863753213367, "mean_token_accuracy": 0.731452465057373, "num_tokens": 10643886.0, "step": 6045, "train/ce_loss": 0.7668578028678894 }, { "epoch": 0.5976863753213367, "step": 6045, "train/sim_loss": 0.08203125 }, { "epoch": 0.5976863753213367, "step": 6045, "train/total_loss": 0.15871703624725342 }, { "entropy": 9.082891464233398, "epoch": 0.5977852481708523, "mean_token_accuracy": 0.6972602605819702, "num_tokens": 10648984.0, "step": 6046, "train/ce_loss": 2.7422502171248198e-06 }, { "epoch": 0.5977852481708523, "step": 6046, "train/sim_loss": 0.046875 }, { "epoch": 0.5977852481708523, "step": 6046, "train/total_loss": 0.046875275671482086 }, { "entropy": 9.151057243347168, "epoch": 0.5978841210203678, "mean_token_accuracy": 0.832647442817688, "num_tokens": 10654140.0, "step": 6047, "train/ce_loss": 5.389683792600408e-07 }, { "epoch": 0.5978841210203678, "step": 6047, "train/sim_loss": 0.01953125 }, { "epoch": 0.5978841210203678, "step": 6047, "train/total_loss": 0.019531304016709328 }, { "entropy": 9.38824462890625, "epoch": 0.5979829938698833, "mean_token_accuracy": 0.7116736769676208, "num_tokens": 10659254.0, "step": 6048, "train/ce_loss": 0.9343109130859375 }, { "epoch": 0.5979829938698833, "step": 6048, "train/sim_loss": 0.01953125 }, { "epoch": 0.5979829938698833, "step": 6048, "train/total_loss": 0.11296234279870987 }, { "entropy": 9.886924743652344, "epoch": 0.5980818667193989, "mean_token_accuracy": 0.6893203854560852, "num_tokens": 10664054.0, "step": 6049, "train/ce_loss": 1.4633288383483887 }, { "epoch": 0.5980818667193989, "step": 6049, "train/sim_loss": 0.06640625 }, { "epoch": 0.5980818667193989, "step": 6049, "train/total_loss": 0.21273913979530334 }, { "entropy": 9.636835098266602, "epoch": 0.5981807395689144, "mean_token_accuracy": 0.7022900581359863, "num_tokens": 10668863.0, "step": 6050, "train/ce_loss": 2.3065342903137207 }, { "epoch": 0.5981807395689144, "step": 6050, "train/sim_loss": 0.109375 }, { "epoch": 0.5981807395689144, "step": 6050, "train/total_loss": 0.34002843499183655 }, { "entropy": 8.877485275268555, "epoch": 0.5982796124184299, "mean_token_accuracy": 0.8215129971504211, "num_tokens": 10674161.0, "step": 6051, "train/ce_loss": 0.936775803565979 }, { "epoch": 0.5982796124184299, "step": 6051, "train/sim_loss": 0.0234375 }, { "epoch": 0.5982796124184299, "step": 6051, "train/total_loss": 0.1171150803565979 }, { "entropy": 9.124277114868164, "epoch": 0.5983784852679455, "mean_token_accuracy": 0.7508590817451477, "num_tokens": 10679211.0, "step": 6052, "train/ce_loss": 3.786348315770738e-06 }, { "epoch": 0.5983784852679455, "step": 6052, "train/sim_loss": 0.03515625 }, { "epoch": 0.5983784852679455, "step": 6052, "train/total_loss": 0.03515662997961044 }, { "entropy": 9.106124877929688, "epoch": 0.598477358117461, "mean_token_accuracy": 0.7250945568084717, "num_tokens": 10684475.0, "step": 6053, "train/ce_loss": 0.4788050949573517 }, { "epoch": 0.598477358117461, "step": 6053, "train/sim_loss": 0.03515625 }, { "epoch": 0.598477358117461, "step": 6053, "train/total_loss": 0.08303676545619965 }, { "entropy": 8.839000701904297, "epoch": 0.5985762309669764, "mean_token_accuracy": 0.7988505959510803, "num_tokens": 10689626.0, "step": 6054, "train/ce_loss": 2.159317546102102e-06 }, { "epoch": 0.5985762309669764, "step": 6054, "train/sim_loss": 0.0390625 }, { "epoch": 0.5985762309669764, "step": 6054, "train/total_loss": 0.03906271606683731 }, { "entropy": 9.599370956420898, "epoch": 0.598675103816492, "mean_token_accuracy": 0.6694214940071106, "num_tokens": 10694559.0, "step": 6055, "train/ce_loss": 1.485620941821253e-06 }, { "epoch": 0.598675103816492, "step": 6055, "train/sim_loss": 0.04296875 }, { "epoch": 0.598675103816492, "step": 6055, "train/total_loss": 0.04296889901161194 }, { "entropy": 9.249372482299805, "epoch": 0.5987739766660075, "mean_token_accuracy": 0.7612179517745972, "num_tokens": 10699634.0, "step": 6056, "train/ce_loss": 1.3209097385406494 }, { "epoch": 0.5987739766660075, "step": 6056, "train/sim_loss": 0.05859375 }, { "epoch": 0.5987739766660075, "step": 6056, "train/total_loss": 0.1906847208738327 }, { "entropy": 9.658489227294922, "epoch": 0.598872849515523, "mean_token_accuracy": 0.7472727298736572, "num_tokens": 10704629.0, "step": 6057, "train/ce_loss": 0.8913617730140686 }, { "epoch": 0.598872849515523, "step": 6057, "train/sim_loss": 0.078125 }, { "epoch": 0.598872849515523, "step": 6057, "train/total_loss": 0.16726118326187134 }, { "entropy": 9.664912223815918, "epoch": 0.5989717223650386, "mean_token_accuracy": 0.8282442688941956, "num_tokens": 10709560.0, "step": 6058, "train/ce_loss": 9.336384323432867e-07 }, { "epoch": 0.5989717223650386, "step": 6058, "train/sim_loss": 0.01953125 }, { "epoch": 0.5989717223650386, "step": 6058, "train/total_loss": 0.01953134313225746 }, { "entropy": 8.653030395507812, "epoch": 0.5990705952145541, "mean_token_accuracy": 0.7394514679908752, "num_tokens": 10714994.0, "step": 6059, "train/ce_loss": 0.7872945666313171 }, { "epoch": 0.5990705952145541, "step": 6059, "train/sim_loss": 0.0546875 }, { "epoch": 0.5990705952145541, "step": 6059, "train/total_loss": 0.13341695070266724 }, { "epoch": 0.5991694680640696, "grad_norm": 0.7073284983634949, "learning_rate": 8.504425653958364e-06, "loss": 0.1326, "step": 6060 }, { "entropy": 9.837320327758789, "epoch": 0.5991694680640696, "mean_token_accuracy": 0.6827794313430786, "num_tokens": 10719723.0, "step": 6060, "train/ce_loss": 1.9000295400619507 }, { "epoch": 0.5991694680640696, "step": 6060, "train/sim_loss": 0.0703125 }, { "epoch": 0.5991694680640696, "step": 6060, "train/total_loss": 0.260315477848053 }, { "entropy": 9.144196510314941, "epoch": 0.5992683409135852, "mean_token_accuracy": 0.7172897458076477, "num_tokens": 10724995.0, "step": 6061, "train/ce_loss": 5.91021830587124e-07 }, { "epoch": 0.5992683409135852, "step": 6061, "train/sim_loss": 0.015625 }, { "epoch": 0.5992683409135852, "step": 6061, "train/total_loss": 0.015625059604644775 }, { "entropy": 9.109617233276367, "epoch": 0.5993672137631006, "mean_token_accuracy": 0.7466843724250793, "num_tokens": 10730213.0, "step": 6062, "train/ce_loss": 0.5911141037940979 }, { "epoch": 0.5993672137631006, "step": 6062, "train/sim_loss": 0.015625 }, { "epoch": 0.5993672137631006, "step": 6062, "train/total_loss": 0.07473641633987427 }, { "entropy": 8.925853729248047, "epoch": 0.5994660866126161, "mean_token_accuracy": 0.6997663378715515, "num_tokens": 10735582.0, "step": 6063, "train/ce_loss": 1.1986355781555176 }, { "epoch": 0.5994660866126161, "step": 6063, "train/sim_loss": 0.11328125 }, { "epoch": 0.5994660866126161, "step": 6063, "train/total_loss": 0.2331448197364807 }, { "entropy": 9.185383796691895, "epoch": 0.5995649594621317, "mean_token_accuracy": 0.7010869383811951, "num_tokens": 10740831.0, "step": 6064, "train/ce_loss": 0.6725782155990601 }, { "epoch": 0.5995649594621317, "step": 6064, "train/sim_loss": 0.03515625 }, { "epoch": 0.5995649594621317, "step": 6064, "train/total_loss": 0.102414071559906 }, { "entropy": 9.35740852355957, "epoch": 0.5996638323116472, "mean_token_accuracy": 0.7684563994407654, "num_tokens": 10745859.0, "step": 6065, "train/ce_loss": 0.8081445097923279 }, { "epoch": 0.5996638323116472, "step": 6065, "train/sim_loss": 0.07421875 }, { "epoch": 0.5996638323116472, "step": 6065, "train/total_loss": 0.1550332009792328 }, { "entropy": 8.718416213989258, "epoch": 0.5997627051611627, "mean_token_accuracy": 0.7359490990638733, "num_tokens": 10751274.0, "step": 6066, "train/ce_loss": 0.8912520408630371 }, { "epoch": 0.5997627051611627, "step": 6066, "train/sim_loss": 0.0859375 }, { "epoch": 0.5997627051611627, "step": 6066, "train/total_loss": 0.17506271600723267 }, { "entropy": 9.426103591918945, "epoch": 0.5998615780106783, "mean_token_accuracy": 0.8238636255264282, "num_tokens": 10756195.0, "step": 6067, "train/ce_loss": 0.6718910932540894 }, { "epoch": 0.5998615780106783, "step": 6067, "train/sim_loss": 0.0234375 }, { "epoch": 0.5998615780106783, "step": 6067, "train/total_loss": 0.09062661230564117 }, { "entropy": 8.437259674072266, "epoch": 0.5999604508601938, "mean_token_accuracy": 0.7057521939277649, "num_tokens": 10761553.0, "step": 6068, "train/ce_loss": 1.0020898580551147 }, { "epoch": 0.5999604508601938, "step": 6068, "train/sim_loss": 0.1171875 }, { "epoch": 0.5999604508601938, "step": 6068, "train/total_loss": 0.21739649772644043 }, { "entropy": 9.396541595458984, "epoch": 0.6000593237097093, "mean_token_accuracy": 0.7037617564201355, "num_tokens": 10766650.0, "step": 6069, "train/ce_loss": 6.18084868619917e-07 }, { "epoch": 0.6000593237097093, "step": 6069, "train/sim_loss": 0.03515625 }, { "epoch": 0.6000593237097093, "step": 6069, "train/total_loss": 0.035156313329935074 }, { "entropy": 9.112813949584961, "epoch": 0.6001581965592249, "mean_token_accuracy": 0.744911789894104, "num_tokens": 10771817.0, "step": 6070, "train/ce_loss": 0.3863801956176758 }, { "epoch": 0.6001581965592249, "step": 6070, "train/sim_loss": 0.02734375 }, { "epoch": 0.6001581965592249, "step": 6070, "train/total_loss": 0.06598177552223206 }, { "entropy": 8.795565605163574, "epoch": 0.6002570694087404, "mean_token_accuracy": 0.7511110901832581, "num_tokens": 10777164.0, "step": 6071, "train/ce_loss": 0.993456244468689 }, { "epoch": 0.6002570694087404, "step": 6071, "train/sim_loss": 0.07421875 }, { "epoch": 0.6002570694087404, "step": 6071, "train/total_loss": 0.1735643744468689 }, { "entropy": 9.250688552856445, "epoch": 0.6003559422582558, "mean_token_accuracy": 0.7766830921173096, "num_tokens": 10782202.0, "step": 6072, "train/ce_loss": 1.302918553352356 }, { "epoch": 0.6003559422582558, "step": 6072, "train/sim_loss": 0.05078125 }, { "epoch": 0.6003559422582558, "step": 6072, "train/total_loss": 0.1810731142759323 }, { "entropy": 8.888847351074219, "epoch": 0.6004548151077714, "mean_token_accuracy": 0.7934272289276123, "num_tokens": 10787539.0, "step": 6073, "train/ce_loss": 0.9093997478485107 }, { "epoch": 0.6004548151077714, "step": 6073, "train/sim_loss": 0.02734375 }, { "epoch": 0.6004548151077714, "step": 6073, "train/total_loss": 0.1182837262749672 }, { "entropy": 8.762311935424805, "epoch": 0.6005536879572869, "mean_token_accuracy": 0.6850321292877197, "num_tokens": 10793142.0, "step": 6074, "train/ce_loss": 0.8352211117744446 }, { "epoch": 0.6005536879572869, "step": 6074, "train/sim_loss": 0.0546875 }, { "epoch": 0.6005536879572869, "step": 6074, "train/total_loss": 0.13820961117744446 }, { "entropy": 9.677602767944336, "epoch": 0.6006525608068024, "mean_token_accuracy": 0.7514285445213318, "num_tokens": 10797971.0, "step": 6075, "train/ce_loss": 4.10594248023699e-06 }, { "epoch": 0.6006525608068024, "step": 6075, "train/sim_loss": 0.03125 }, { "epoch": 0.6006525608068024, "step": 6075, "train/total_loss": 0.03125040978193283 }, { "entropy": 9.393272399902344, "epoch": 0.600751433656318, "mean_token_accuracy": 0.7361111044883728, "num_tokens": 10803038.0, "step": 6076, "train/ce_loss": 1.02798593044281 }, { "epoch": 0.600751433656318, "step": 6076, "train/sim_loss": 0.12109375 }, { "epoch": 0.600751433656318, "step": 6076, "train/total_loss": 0.22389234602451324 }, { "entropy": 9.127182006835938, "epoch": 0.6008503065058335, "mean_token_accuracy": 0.6969696879386902, "num_tokens": 10808164.0, "step": 6077, "train/ce_loss": 1.790145993232727 }, { "epoch": 0.6008503065058335, "step": 6077, "train/sim_loss": 0.0546875 }, { "epoch": 0.6008503065058335, "step": 6077, "train/total_loss": 0.23370210826396942 }, { "entropy": 8.687410354614258, "epoch": 0.600949179355349, "mean_token_accuracy": 0.743849515914917, "num_tokens": 10813354.0, "step": 6078, "train/ce_loss": 0.8146626353263855 }, { "epoch": 0.600949179355349, "step": 6078, "train/sim_loss": 0.0859375 }, { "epoch": 0.600949179355349, "step": 6078, "train/total_loss": 0.16740375757217407 }, { "entropy": 9.673949241638184, "epoch": 0.6010480522048646, "mean_token_accuracy": 0.7717121839523315, "num_tokens": 10818247.0, "step": 6079, "train/ce_loss": 1.099326252937317 }, { "epoch": 0.6010480522048646, "step": 6079, "train/sim_loss": 0.06640625 }, { "epoch": 0.6010480522048646, "step": 6079, "train/total_loss": 0.17633888125419617 }, { "epoch": 0.60114692505438, "grad_norm": 0.8666896820068359, "learning_rate": 8.499480789200417e-06, "loss": 0.1416, "step": 6080 }, { "entropy": 8.867826461791992, "epoch": 0.60114692505438, "mean_token_accuracy": 0.7034883499145508, "num_tokens": 10823618.0, "step": 6080, "train/ce_loss": 0.535771369934082 }, { "epoch": 0.60114692505438, "step": 6080, "train/sim_loss": 0.07421875 }, { "epoch": 0.60114692505438, "step": 6080, "train/total_loss": 0.12779588997364044 }, { "entropy": 8.964005470275879, "epoch": 0.6012457979038955, "mean_token_accuracy": 0.7682619690895081, "num_tokens": 10828970.0, "step": 6081, "train/ce_loss": 0.8758470416069031 }, { "epoch": 0.6012457979038955, "step": 6081, "train/sim_loss": 0.078125 }, { "epoch": 0.6012457979038955, "step": 6081, "train/total_loss": 0.1657097041606903 }, { "entropy": 9.141044616699219, "epoch": 0.6013446707534111, "mean_token_accuracy": 0.7322485446929932, "num_tokens": 10834050.0, "step": 6082, "train/ce_loss": 1.4010944366455078 }, { "epoch": 0.6013446707534111, "step": 6082, "train/sim_loss": 0.0625 }, { "epoch": 0.6013446707534111, "step": 6082, "train/total_loss": 0.20260944962501526 }, { "entropy": 8.897449493408203, "epoch": 0.6014435436029266, "mean_token_accuracy": 0.7319587469100952, "num_tokens": 10839401.0, "step": 6083, "train/ce_loss": 0.9998795390129089 }, { "epoch": 0.6014435436029266, "step": 6083, "train/sim_loss": 0.0390625 }, { "epoch": 0.6014435436029266, "step": 6083, "train/total_loss": 0.1390504539012909 }, { "entropy": 8.964387893676758, "epoch": 0.6015424164524421, "mean_token_accuracy": 0.7765432000160217, "num_tokens": 10844696.0, "step": 6084, "train/ce_loss": 0.6983901858329773 }, { "epoch": 0.6015424164524421, "step": 6084, "train/sim_loss": 0.05078125 }, { "epoch": 0.6015424164524421, "step": 6084, "train/total_loss": 0.12062027305364609 }, { "entropy": 9.072749137878418, "epoch": 0.6016412893019577, "mean_token_accuracy": 0.707257091999054, "num_tokens": 10849973.0, "step": 6085, "train/ce_loss": 1.4259077310562134 }, { "epoch": 0.6016412893019577, "step": 6085, "train/sim_loss": 0.15625 }, { "epoch": 0.6016412893019577, "step": 6085, "train/total_loss": 0.2988407611846924 }, { "entropy": 8.899450302124023, "epoch": 0.6017401621514732, "mean_token_accuracy": 0.7363834381103516, "num_tokens": 10855356.0, "step": 6086, "train/ce_loss": 8.118449841276743e-07 }, { "epoch": 0.6017401621514732, "step": 6086, "train/sim_loss": 0.046875 }, { "epoch": 0.6017401621514732, "step": 6086, "train/total_loss": 0.046875081956386566 }, { "entropy": 8.617920875549316, "epoch": 0.6018390350009887, "mean_token_accuracy": 0.8015102744102478, "num_tokens": 10860763.0, "step": 6087, "train/ce_loss": 0.8570935726165771 }, { "epoch": 0.6018390350009887, "step": 6087, "train/sim_loss": 0.09375 }, { "epoch": 0.6018390350009887, "step": 6087, "train/total_loss": 0.1794593632221222 }, { "entropy": 9.354616165161133, "epoch": 0.6019379078505043, "mean_token_accuracy": 0.7318611741065979, "num_tokens": 10865843.0, "step": 6088, "train/ce_loss": 0.7066981792449951 }, { "epoch": 0.6019379078505043, "step": 6088, "train/sim_loss": 0.0625 }, { "epoch": 0.6019379078505043, "step": 6088, "train/total_loss": 0.13316982984542847 }, { "entropy": 9.205799102783203, "epoch": 0.6020367807000198, "mean_token_accuracy": 0.7127516865730286, "num_tokens": 10871069.0, "step": 6089, "train/ce_loss": 1.0004031658172607 }, { "epoch": 0.6020367807000198, "step": 6089, "train/sim_loss": 0.0625 }, { "epoch": 0.6020367807000198, "step": 6089, "train/total_loss": 0.16254031658172607 }, { "entropy": 9.151839256286621, "epoch": 0.6021356535495352, "mean_token_accuracy": 0.76579350233078, "num_tokens": 10876100.0, "step": 6090, "train/ce_loss": 0.9818342924118042 }, { "epoch": 0.6021356535495352, "step": 6090, "train/sim_loss": 0.11328125 }, { "epoch": 0.6021356535495352, "step": 6090, "train/total_loss": 0.21146467328071594 }, { "entropy": 8.641899108886719, "epoch": 0.6022345263990508, "mean_token_accuracy": 0.7617647051811218, "num_tokens": 10881611.0, "step": 6091, "train/ce_loss": 0.9895736575126648 }, { "epoch": 0.6022345263990508, "step": 6091, "train/sim_loss": 0.07421875 }, { "epoch": 0.6022345263990508, "step": 6091, "train/total_loss": 0.173176109790802 }, { "entropy": 9.105466842651367, "epoch": 0.6023333992485663, "mean_token_accuracy": 0.7323037385940552, "num_tokens": 10886756.0, "step": 6092, "train/ce_loss": 6.099965048633749e-07 }, { "epoch": 0.6023333992485663, "step": 6092, "train/sim_loss": 0.0234375 }, { "epoch": 0.6023333992485663, "step": 6092, "train/total_loss": 0.023437561467289925 }, { "entropy": 9.237039566040039, "epoch": 0.6024322720980818, "mean_token_accuracy": 0.8252426981925964, "num_tokens": 10891950.0, "step": 6093, "train/ce_loss": 1.0687254667282104 }, { "epoch": 0.6024322720980818, "step": 6093, "train/sim_loss": 0.0390625 }, { "epoch": 0.6024322720980818, "step": 6093, "train/total_loss": 0.14593505859375 }, { "entropy": 9.210630416870117, "epoch": 0.6025311449475974, "mean_token_accuracy": 0.7210599780082703, "num_tokens": 10897113.0, "step": 6094, "train/ce_loss": 1.0980263948440552 }, { "epoch": 0.6025311449475974, "step": 6094, "train/sim_loss": 0.05078125 }, { "epoch": 0.6025311449475974, "step": 6094, "train/total_loss": 0.16058388352394104 }, { "entropy": 10.069806098937988, "epoch": 0.6026300177971129, "mean_token_accuracy": 0.7323232293128967, "num_tokens": 10901655.0, "step": 6095, "train/ce_loss": 7.5997854764864314e-06 }, { "epoch": 0.6026300177971129, "step": 6095, "train/sim_loss": 0.08203125 }, { "epoch": 0.6026300177971129, "step": 6095, "train/total_loss": 0.08203200995922089 }, { "entropy": 9.082328796386719, "epoch": 0.6027288906466285, "mean_token_accuracy": 0.7697121500968933, "num_tokens": 10906921.0, "step": 6096, "train/ce_loss": 0.46874570846557617 }, { "epoch": 0.6027288906466285, "step": 6096, "train/sim_loss": 0.0390625 }, { "epoch": 0.6027288906466285, "step": 6096, "train/total_loss": 0.08593706786632538 }, { "entropy": 9.460693359375, "epoch": 0.602827763496144, "mean_token_accuracy": 0.7108209133148193, "num_tokens": 10911897.0, "step": 6097, "train/ce_loss": 2.3511090603278717e-06 }, { "epoch": 0.602827763496144, "step": 6097, "train/sim_loss": 0.04296875 }, { "epoch": 0.602827763496144, "step": 6097, "train/total_loss": 0.0429689846932888 }, { "entropy": 9.391380310058594, "epoch": 0.6029266363456595, "mean_token_accuracy": 0.772357702255249, "num_tokens": 10916950.0, "step": 6098, "train/ce_loss": 1.1386464834213257 }, { "epoch": 0.6029266363456595, "step": 6098, "train/sim_loss": 0.0859375 }, { "epoch": 0.6029266363456595, "step": 6098, "train/total_loss": 0.19980216026306152 }, { "entropy": 9.208715438842773, "epoch": 0.6030255091951751, "mean_token_accuracy": 0.7718023061752319, "num_tokens": 10922149.0, "step": 6099, "train/ce_loss": 1.0111721167049836e-06 }, { "epoch": 0.6030255091951751, "step": 6099, "train/sim_loss": 0.0546875 }, { "epoch": 0.6030255091951751, "step": 6099, "train/total_loss": 0.05468760058283806 }, { "epoch": 0.6031243820446905, "grad_norm": 0.7141965627670288, "learning_rate": 8.494535924442467e-06, "loss": 0.1357, "step": 6100 }, { "entropy": 8.944578170776367, "epoch": 0.6031243820446905, "mean_token_accuracy": 0.6878364086151123, "num_tokens": 10927696.0, "step": 6100, "train/ce_loss": 0.508734941482544 }, { "epoch": 0.6031243820446905, "step": 6100, "train/sim_loss": 0.0390625 }, { "epoch": 0.6031243820446905, "step": 6100, "train/total_loss": 0.08993599563837051 }, { "entropy": 9.312324523925781, "epoch": 0.603223254894206, "mean_token_accuracy": 0.7329192757606506, "num_tokens": 10932760.0, "step": 6101, "train/ce_loss": 1.2184512615203857 }, { "epoch": 0.603223254894206, "step": 6101, "train/sim_loss": 0.078125 }, { "epoch": 0.603223254894206, "step": 6101, "train/total_loss": 0.19997012615203857 }, { "entropy": 9.61783218383789, "epoch": 0.6033221277437216, "mean_token_accuracy": 0.7670156955718994, "num_tokens": 10937608.0, "step": 6102, "train/ce_loss": 1.2987922430038452 }, { "epoch": 0.6033221277437216, "step": 6102, "train/sim_loss": 0.0546875 }, { "epoch": 0.6033221277437216, "step": 6102, "train/total_loss": 0.18456672132015228 }, { "entropy": 8.94619369506836, "epoch": 0.6034210005932371, "mean_token_accuracy": 0.7599039673805237, "num_tokens": 10942895.0, "step": 6103, "train/ce_loss": 0.8675341606140137 }, { "epoch": 0.6034210005932371, "step": 6103, "train/sim_loss": 0.0625 }, { "epoch": 0.6034210005932371, "step": 6103, "train/total_loss": 0.14925342798233032 }, { "entropy": 9.454904556274414, "epoch": 0.6035198734427526, "mean_token_accuracy": 0.771266520023346, "num_tokens": 10947834.0, "step": 6104, "train/ce_loss": 0.5550150871276855 }, { "epoch": 0.6035198734427526, "step": 6104, "train/sim_loss": 0.01953125 }, { "epoch": 0.6035198734427526, "step": 6104, "train/total_loss": 0.07503275573253632 }, { "entropy": 8.96200942993164, "epoch": 0.6036187462922682, "mean_token_accuracy": 0.7113526463508606, "num_tokens": 10953158.0, "step": 6105, "train/ce_loss": 1.275610327720642 }, { "epoch": 0.6036187462922682, "step": 6105, "train/sim_loss": 0.0625 }, { "epoch": 0.6036187462922682, "step": 6105, "train/total_loss": 0.1900610327720642 }, { "entropy": 8.748249053955078, "epoch": 0.6037176191417837, "mean_token_accuracy": 0.7409909963607788, "num_tokens": 10958513.0, "step": 6106, "train/ce_loss": 0.9752168655395508 }, { "epoch": 0.6037176191417837, "step": 6106, "train/sim_loss": 0.0546875 }, { "epoch": 0.6037176191417837, "step": 6106, "train/total_loss": 0.15220919251441956 }, { "entropy": 9.620986938476562, "epoch": 0.6038164919912992, "mean_token_accuracy": 0.6916058659553528, "num_tokens": 10963525.0, "step": 6107, "train/ce_loss": 1.447430968284607 }, { "epoch": 0.6038164919912992, "step": 6107, "train/sim_loss": 0.0859375 }, { "epoch": 0.6038164919912992, "step": 6107, "train/total_loss": 0.23068059980869293 }, { "entropy": 8.569491386413574, "epoch": 0.6039153648408148, "mean_token_accuracy": 0.7436241507530212, "num_tokens": 10968760.0, "step": 6108, "train/ce_loss": 1.0208266973495483 }, { "epoch": 0.6039153648408148, "step": 6108, "train/sim_loss": 0.0390625 }, { "epoch": 0.6039153648408148, "step": 6108, "train/total_loss": 0.14114516973495483 }, { "entropy": 8.876388549804688, "epoch": 0.6040142376903302, "mean_token_accuracy": 0.7449495196342468, "num_tokens": 10974002.0, "step": 6109, "train/ce_loss": 0.6652390956878662 }, { "epoch": 0.6040142376903302, "step": 6109, "train/sim_loss": 0.04296875 }, { "epoch": 0.6040142376903302, "step": 6109, "train/total_loss": 0.10949265956878662 }, { "entropy": 8.956596374511719, "epoch": 0.6041131105398457, "mean_token_accuracy": 0.8051947951316833, "num_tokens": 10979256.0, "step": 6110, "train/ce_loss": 2.024814193646307e-06 }, { "epoch": 0.6041131105398457, "step": 6110, "train/sim_loss": 0.0546875 }, { "epoch": 0.6041131105398457, "step": 6110, "train/total_loss": 0.05468770116567612 }, { "entropy": 8.775054931640625, "epoch": 0.6042119833893613, "mean_token_accuracy": 0.7146198749542236, "num_tokens": 10984617.0, "step": 6111, "train/ce_loss": 1.138087272644043 }, { "epoch": 0.6042119833893613, "step": 6111, "train/sim_loss": 0.08203125 }, { "epoch": 0.6042119833893613, "step": 6111, "train/total_loss": 0.19583997130393982 }, { "entropy": 8.835660934448242, "epoch": 0.6043108562388768, "mean_token_accuracy": 0.6846330165863037, "num_tokens": 10989989.0, "step": 6112, "train/ce_loss": 0.7771117687225342 }, { "epoch": 0.6043108562388768, "step": 6112, "train/sim_loss": 0.078125 }, { "epoch": 0.6043108562388768, "step": 6112, "train/total_loss": 0.15583617985248566 }, { "entropy": 9.560922622680664, "epoch": 0.6044097290883923, "mean_token_accuracy": 0.7399617433547974, "num_tokens": 10994934.0, "step": 6113, "train/ce_loss": 2.5262149847549153e-06 }, { "epoch": 0.6044097290883923, "step": 6113, "train/sim_loss": 0.078125 }, { "epoch": 0.6044097290883923, "step": 6113, "train/total_loss": 0.0781252533197403 }, { "entropy": 9.276796340942383, "epoch": 0.6045086019379079, "mean_token_accuracy": 0.7439544796943665, "num_tokens": 11000044.0, "step": 6114, "train/ce_loss": 1.1325781345367432 }, { "epoch": 0.6045086019379079, "step": 6114, "train/sim_loss": 0.16015625 }, { "epoch": 0.6045086019379079, "step": 6114, "train/total_loss": 0.27341407537460327 }, { "entropy": 9.39217758178711, "epoch": 0.6046074747874234, "mean_token_accuracy": 0.7068965435028076, "num_tokens": 11005089.0, "step": 6115, "train/ce_loss": 1.7699488807920716e-06 }, { "epoch": 0.6046074747874234, "step": 6115, "train/sim_loss": 0.03515625 }, { "epoch": 0.6046074747874234, "step": 6115, "train/total_loss": 0.035156428813934326 }, { "entropy": 8.850866317749023, "epoch": 0.6047063476369389, "mean_token_accuracy": 0.7711442708969116, "num_tokens": 11010548.0, "step": 6116, "train/ce_loss": 0.9158958792686462 }, { "epoch": 0.6047063476369389, "step": 6116, "train/sim_loss": 0.03515625 }, { "epoch": 0.6047063476369389, "step": 6116, "train/total_loss": 0.12674584984779358 }, { "entropy": 8.85025691986084, "epoch": 0.6048052204864545, "mean_token_accuracy": 0.7938144207000732, "num_tokens": 11015930.0, "step": 6117, "train/ce_loss": 0.8996186852455139 }, { "epoch": 0.6048052204864545, "step": 6117, "train/sim_loss": 0.046875 }, { "epoch": 0.6048052204864545, "step": 6117, "train/total_loss": 0.13683687150478363 }, { "entropy": 8.638938903808594, "epoch": 0.60490409333597, "mean_token_accuracy": 0.6982182860374451, "num_tokens": 11021276.0, "step": 6118, "train/ce_loss": 0.8769782781600952 }, { "epoch": 0.60490409333597, "step": 6118, "train/sim_loss": 0.046875 }, { "epoch": 0.60490409333597, "step": 6118, "train/total_loss": 0.134572833776474 }, { "entropy": 9.259170532226562, "epoch": 0.6050029661854854, "mean_token_accuracy": 0.7200000286102295, "num_tokens": 11026382.0, "step": 6119, "train/ce_loss": 0.7559342980384827 }, { "epoch": 0.6050029661854854, "step": 6119, "train/sim_loss": 0.04296875 }, { "epoch": 0.6050029661854854, "step": 6119, "train/total_loss": 0.11856218427419662 }, { "epoch": 0.605101839035001, "grad_norm": 0.718431830406189, "learning_rate": 8.48959105968452e-06, "loss": 0.1397, "step": 6120 }, { "entropy": 9.40379524230957, "epoch": 0.605101839035001, "mean_token_accuracy": 0.6692667603492737, "num_tokens": 11031435.0, "step": 6120, "train/ce_loss": 1.3104554414749146 }, { "epoch": 0.605101839035001, "step": 6120, "train/sim_loss": 0.0625 }, { "epoch": 0.605101839035001, "step": 6120, "train/total_loss": 0.19354555010795593 }, { "entropy": 9.068265914916992, "epoch": 0.6052007118845165, "mean_token_accuracy": 0.7567114233970642, "num_tokens": 11036463.0, "step": 6121, "train/ce_loss": 0.6880939602851868 }, { "epoch": 0.6052007118845165, "step": 6121, "train/sim_loss": 0.0234375 }, { "epoch": 0.6052007118845165, "step": 6121, "train/total_loss": 0.0922468975186348 }, { "entropy": 8.987520217895508, "epoch": 0.605299584734032, "mean_token_accuracy": 0.7322834730148315, "num_tokens": 11041702.0, "step": 6122, "train/ce_loss": 0.7852272391319275 }, { "epoch": 0.605299584734032, "step": 6122, "train/sim_loss": 0.0703125 }, { "epoch": 0.605299584734032, "step": 6122, "train/total_loss": 0.148835226893425 }, { "entropy": 8.811985969543457, "epoch": 0.6053984575835476, "mean_token_accuracy": 0.783643901348114, "num_tokens": 11047111.0, "step": 6123, "train/ce_loss": 0.3789895176887512 }, { "epoch": 0.6053984575835476, "step": 6123, "train/sim_loss": 0.01953125 }, { "epoch": 0.6053984575835476, "step": 6123, "train/total_loss": 0.0574302040040493 }, { "entropy": 9.542729377746582, "epoch": 0.6054973304330631, "mean_token_accuracy": 0.7797979712486267, "num_tokens": 11052093.0, "step": 6124, "train/ce_loss": 0.9592763185501099 }, { "epoch": 0.6054973304330631, "step": 6124, "train/sim_loss": 0.07421875 }, { "epoch": 0.6054973304330631, "step": 6124, "train/total_loss": 0.1701463758945465 }, { "entropy": 8.710733413696289, "epoch": 0.6055962032825786, "mean_token_accuracy": 0.7696477174758911, "num_tokens": 11057289.0, "step": 6125, "train/ce_loss": 0.8214151263237 }, { "epoch": 0.6055962032825786, "step": 6125, "train/sim_loss": 0.03515625 }, { "epoch": 0.6055962032825786, "step": 6125, "train/total_loss": 0.11729776114225388 }, { "entropy": 9.01161003112793, "epoch": 0.6056950761320942, "mean_token_accuracy": 0.7325443625450134, "num_tokens": 11062602.0, "step": 6126, "train/ce_loss": 1.3645176887512207 }, { "epoch": 0.6056950761320942, "step": 6126, "train/sim_loss": 0.0859375 }, { "epoch": 0.6056950761320942, "step": 6126, "train/total_loss": 0.22238926589488983 }, { "entropy": 8.993681907653809, "epoch": 0.6057939489816097, "mean_token_accuracy": 0.7953431606292725, "num_tokens": 11067867.0, "step": 6127, "train/ce_loss": 1.6927732531257789e-06 }, { "epoch": 0.6057939489816097, "step": 6127, "train/sim_loss": 0.0859375 }, { "epoch": 0.6057939489816097, "step": 6127, "train/total_loss": 0.08593767136335373 }, { "entropy": 9.10866928100586, "epoch": 0.6058928218311251, "mean_token_accuracy": 0.7489878535270691, "num_tokens": 11073058.0, "step": 6128, "train/ce_loss": 0.9587780237197876 }, { "epoch": 0.6058928218311251, "step": 6128, "train/sim_loss": 0.078125 }, { "epoch": 0.6058928218311251, "step": 6128, "train/total_loss": 0.17400279641151428 }, { "entropy": 9.751190185546875, "epoch": 0.6059916946806407, "mean_token_accuracy": 0.75, "num_tokens": 11077883.0, "step": 6129, "train/ce_loss": 4.459578576643253e-06 }, { "epoch": 0.6059916946806407, "step": 6129, "train/sim_loss": 0.046875 }, { "epoch": 0.6059916946806407, "step": 6129, "train/total_loss": 0.046875447034835815 }, { "entropy": 9.06528091430664, "epoch": 0.6060905675301562, "mean_token_accuracy": 0.7226277589797974, "num_tokens": 11082982.0, "step": 6130, "train/ce_loss": 0.836621105670929 }, { "epoch": 0.6060905675301562, "step": 6130, "train/sim_loss": 0.05859375 }, { "epoch": 0.6060905675301562, "step": 6130, "train/total_loss": 0.14225587248802185 }, { "entropy": 8.767593383789062, "epoch": 0.6061894403796717, "mean_token_accuracy": 0.7079038023948669, "num_tokens": 11088295.0, "step": 6131, "train/ce_loss": 1.151301622390747 }, { "epoch": 0.6061894403796717, "step": 6131, "train/sim_loss": 0.0390625 }, { "epoch": 0.6061894403796717, "step": 6131, "train/total_loss": 0.15419265627861023 }, { "entropy": 8.503793716430664, "epoch": 0.6062883132291873, "mean_token_accuracy": 0.7227227091789246, "num_tokens": 11093754.0, "step": 6132, "train/ce_loss": 0.7511727809906006 }, { "epoch": 0.6062883132291873, "step": 6132, "train/sim_loss": 0.046875 }, { "epoch": 0.6062883132291873, "step": 6132, "train/total_loss": 0.12199228256940842 }, { "entropy": 9.331968307495117, "epoch": 0.6063871860787028, "mean_token_accuracy": 0.778294563293457, "num_tokens": 11098842.0, "step": 6133, "train/ce_loss": 0.6059866547584534 }, { "epoch": 0.6063871860787028, "step": 6133, "train/sim_loss": 0.0703125 }, { "epoch": 0.6063871860787028, "step": 6133, "train/total_loss": 0.13091117143630981 }, { "entropy": 9.549592971801758, "epoch": 0.6064860589282183, "mean_token_accuracy": 0.6774716377258301, "num_tokens": 11104053.0, "step": 6134, "train/ce_loss": 2.494525194168091 }, { "epoch": 0.6064860589282183, "step": 6134, "train/sim_loss": 0.07421875 }, { "epoch": 0.6064860589282183, "step": 6134, "train/total_loss": 0.32367128133773804 }, { "entropy": 9.277434349060059, "epoch": 0.6065849317777339, "mean_token_accuracy": 0.7071651220321655, "num_tokens": 11109204.0, "step": 6135, "train/ce_loss": 1.1674458980560303 }, { "epoch": 0.6065849317777339, "step": 6135, "train/sim_loss": 0.05859375 }, { "epoch": 0.6065849317777339, "step": 6135, "train/total_loss": 0.17533834278583527 }, { "entropy": 9.13475227355957, "epoch": 0.6066838046272494, "mean_token_accuracy": 0.7972972989082336, "num_tokens": 11114530.0, "step": 6136, "train/ce_loss": 0.8006073832511902 }, { "epoch": 0.6066838046272494, "step": 6136, "train/sim_loss": 0.0546875 }, { "epoch": 0.6066838046272494, "step": 6136, "train/total_loss": 0.13474825024604797 }, { "entropy": 9.748363494873047, "epoch": 0.6067826774767648, "mean_token_accuracy": 0.6660377383232117, "num_tokens": 11119636.0, "step": 6137, "train/ce_loss": 2.632155179977417 }, { "epoch": 0.6067826774767648, "step": 6137, "train/sim_loss": 0.09765625 }, { "epoch": 0.6067826774767648, "step": 6137, "train/total_loss": 0.3608717620372772 }, { "entropy": 9.102104187011719, "epoch": 0.6068815503262804, "mean_token_accuracy": 0.8008241653442383, "num_tokens": 11124881.0, "step": 6138, "train/ce_loss": 0.5030428767204285 }, { "epoch": 0.6068815503262804, "step": 6138, "train/sim_loss": 0.05078125 }, { "epoch": 0.6068815503262804, "step": 6138, "train/total_loss": 0.10108554363250732 }, { "entropy": 9.025053024291992, "epoch": 0.6069804231757959, "mean_token_accuracy": 0.7277108430862427, "num_tokens": 11130172.0, "step": 6139, "train/ce_loss": 0.48416703939437866 }, { "epoch": 0.6069804231757959, "step": 6139, "train/sim_loss": 0.0390625 }, { "epoch": 0.6069804231757959, "step": 6139, "train/total_loss": 0.08747920393943787 }, { "epoch": 0.6070792960253114, "grad_norm": 0.6249412298202515, "learning_rate": 8.48464619492657e-06, "loss": 0.1389, "step": 6140 }, { "entropy": 8.812685012817383, "epoch": 0.6070792960253114, "mean_token_accuracy": 0.681922197341919, "num_tokens": 11135547.0, "step": 6140, "train/ce_loss": 1.019753336906433 }, { "epoch": 0.6070792960253114, "step": 6140, "train/sim_loss": 0.078125 }, { "epoch": 0.6070792960253114, "step": 6140, "train/total_loss": 0.18010033667087555 }, { "entropy": 8.79395866394043, "epoch": 0.607178168874827, "mean_token_accuracy": 0.7002262473106384, "num_tokens": 11140890.0, "step": 6141, "train/ce_loss": 1.1063992977142334 }, { "epoch": 0.607178168874827, "step": 6141, "train/sim_loss": 0.11328125 }, { "epoch": 0.607178168874827, "step": 6141, "train/total_loss": 0.22392117977142334 }, { "entropy": 9.144994735717773, "epoch": 0.6072770417243425, "mean_token_accuracy": 0.7426981925964355, "num_tokens": 11146055.0, "step": 6142, "train/ce_loss": 0.8788449764251709 }, { "epoch": 0.6072770417243425, "step": 6142, "train/sim_loss": 0.046875 }, { "epoch": 0.6072770417243425, "step": 6142, "train/total_loss": 0.13475950062274933 }, { "entropy": 8.9202880859375, "epoch": 0.607375914573858, "mean_token_accuracy": 0.7448186278343201, "num_tokens": 11151298.0, "step": 6143, "train/ce_loss": 0.9303768873214722 }, { "epoch": 0.607375914573858, "step": 6143, "train/sim_loss": 0.0625 }, { "epoch": 0.607375914573858, "step": 6143, "train/total_loss": 0.1555376946926117 }, { "entropy": 9.32742691040039, "epoch": 0.6074747874233736, "mean_token_accuracy": 0.6774193644523621, "num_tokens": 11156392.0, "step": 6144, "train/ce_loss": 2.0724892616271973 }, { "epoch": 0.6074747874233736, "step": 6144, "train/sim_loss": 0.10546875 }, { "epoch": 0.6074747874233736, "step": 6144, "train/total_loss": 0.3127176761627197 }, { "entropy": 9.137593269348145, "epoch": 0.6075736602728891, "mean_token_accuracy": 0.7714646458625793, "num_tokens": 11161622.0, "step": 6145, "train/ce_loss": 0.6917032599449158 }, { "epoch": 0.6075736602728891, "step": 6145, "train/sim_loss": 0.0390625 }, { "epoch": 0.6075736602728891, "step": 6145, "train/total_loss": 0.10823282599449158 }, { "entropy": 9.317638397216797, "epoch": 0.6076725331224045, "mean_token_accuracy": 0.7710674405097961, "num_tokens": 11166792.0, "step": 6146, "train/ce_loss": 0.8067945837974548 }, { "epoch": 0.6076725331224045, "step": 6146, "train/sim_loss": 0.03125 }, { "epoch": 0.6076725331224045, "step": 6146, "train/total_loss": 0.11192946135997772 }, { "entropy": 9.131617546081543, "epoch": 0.6077714059719201, "mean_token_accuracy": 0.752173900604248, "num_tokens": 11171923.0, "step": 6147, "train/ce_loss": 1.4311491250991821 }, { "epoch": 0.6077714059719201, "step": 6147, "train/sim_loss": 0.078125 }, { "epoch": 0.6077714059719201, "step": 6147, "train/total_loss": 0.22123990952968597 }, { "entropy": 8.97225570678711, "epoch": 0.6078702788214356, "mean_token_accuracy": 0.7412513494491577, "num_tokens": 11177457.0, "step": 6148, "train/ce_loss": 0.9629305005073547 }, { "epoch": 0.6078702788214356, "step": 6148, "train/sim_loss": 0.04296875 }, { "epoch": 0.6078702788214356, "step": 6148, "train/total_loss": 0.13926181197166443 }, { "entropy": 8.673298835754395, "epoch": 0.6079691516709511, "mean_token_accuracy": 0.7277227640151978, "num_tokens": 11182857.0, "step": 6149, "train/ce_loss": 0.9916688799858093 }, { "epoch": 0.6079691516709511, "step": 6149, "train/sim_loss": 0.046875 }, { "epoch": 0.6079691516709511, "step": 6149, "train/total_loss": 0.1460418999195099 }, { "entropy": 8.975536346435547, "epoch": 0.6080680245204667, "mean_token_accuracy": 0.7418086528778076, "num_tokens": 11188098.0, "step": 6150, "train/ce_loss": 1.0984766483306885 }, { "epoch": 0.6080680245204667, "step": 6150, "train/sim_loss": 0.0390625 }, { "epoch": 0.6080680245204667, "step": 6150, "train/total_loss": 0.14891016483306885 }, { "entropy": 9.112666130065918, "epoch": 0.6081668973699822, "mean_token_accuracy": 0.6454917788505554, "num_tokens": 11193008.0, "step": 6151, "train/ce_loss": 2.2162926197052 }, { "epoch": 0.6081668973699822, "step": 6151, "train/sim_loss": 0.1171875 }, { "epoch": 0.6081668973699822, "step": 6151, "train/total_loss": 0.33881676197052 }, { "entropy": 8.722833633422852, "epoch": 0.6082657702194977, "mean_token_accuracy": 0.6876310110092163, "num_tokens": 11198484.0, "step": 6152, "train/ce_loss": 1.057938575744629 }, { "epoch": 0.6082657702194977, "step": 6152, "train/sim_loss": 0.0546875 }, { "epoch": 0.6082657702194977, "step": 6152, "train/total_loss": 0.16048136353492737 }, { "entropy": 8.93104076385498, "epoch": 0.6083646430690133, "mean_token_accuracy": 0.760401725769043, "num_tokens": 11203652.0, "step": 6153, "train/ce_loss": 0.45881423354148865 }, { "epoch": 0.6083646430690133, "step": 6153, "train/sim_loss": 0.04296875 }, { "epoch": 0.6083646430690133, "step": 6153, "train/total_loss": 0.08885017037391663 }, { "entropy": 9.29659652709961, "epoch": 0.6084635159185288, "mean_token_accuracy": 0.6714060306549072, "num_tokens": 11208667.0, "step": 6154, "train/ce_loss": 1.8182228803634644 }, { "epoch": 0.6084635159185288, "step": 6154, "train/sim_loss": 0.1171875 }, { "epoch": 0.6084635159185288, "step": 6154, "train/total_loss": 0.2990097999572754 }, { "entropy": 8.691879272460938, "epoch": 0.6085623887680442, "mean_token_accuracy": 0.7736966609954834, "num_tokens": 11213986.0, "step": 6155, "train/ce_loss": 0.9295175671577454 }, { "epoch": 0.6085623887680442, "step": 6155, "train/sim_loss": 0.09765625 }, { "epoch": 0.6085623887680442, "step": 6155, "train/total_loss": 0.19060800969600677 }, { "entropy": 9.033947944641113, "epoch": 0.6086612616175598, "mean_token_accuracy": 0.7052767276763916, "num_tokens": 11219280.0, "step": 6156, "train/ce_loss": 0.701568067073822 }, { "epoch": 0.6086612616175598, "step": 6156, "train/sim_loss": 0.0546875 }, { "epoch": 0.6086612616175598, "step": 6156, "train/total_loss": 0.12484430521726608 }, { "entropy": 9.38157844543457, "epoch": 0.6087601344670753, "mean_token_accuracy": 0.7710145115852356, "num_tokens": 11224413.0, "step": 6157, "train/ce_loss": 4.388226102491899e-07 }, { "epoch": 0.6087601344670753, "step": 6157, "train/sim_loss": 0.015625 }, { "epoch": 0.6087601344670753, "step": 6157, "train/total_loss": 0.01562504470348358 }, { "entropy": 8.847856521606445, "epoch": 0.6088590073165908, "mean_token_accuracy": 0.7508896589279175, "num_tokens": 11229683.0, "step": 6158, "train/ce_loss": 1.0196502208709717 }, { "epoch": 0.6088590073165908, "step": 6158, "train/sim_loss": 0.03515625 }, { "epoch": 0.6088590073165908, "step": 6158, "train/total_loss": 0.1371212750673294 }, { "entropy": 8.70765495300293, "epoch": 0.6089578801661064, "mean_token_accuracy": 0.7148891091346741, "num_tokens": 11235072.0, "step": 6159, "train/ce_loss": 1.02029550075531 }, { "epoch": 0.6089578801661064, "step": 6159, "train/sim_loss": 0.0625 }, { "epoch": 0.6089578801661064, "step": 6159, "train/total_loss": 0.16452956199645996 }, { "epoch": 0.6090567530156219, "grad_norm": 0.6889066100120544, "learning_rate": 8.47970133016862e-06, "loss": 0.1488, "step": 6160 }, { "entropy": 9.873184204101562, "epoch": 0.6090567530156219, "mean_token_accuracy": 0.7747524976730347, "num_tokens": 11239868.0, "step": 6160, "train/ce_loss": 8.155694217748533e-07 }, { "epoch": 0.6090567530156219, "step": 6160, "train/sim_loss": 0.015625 }, { "epoch": 0.6090567530156219, "step": 6160, "train/total_loss": 0.015625081956386566 }, { "entropy": 9.006669998168945, "epoch": 0.6091556258651374, "mean_token_accuracy": 0.707379162311554, "num_tokens": 11245147.0, "step": 6161, "train/ce_loss": 1.3221015930175781 }, { "epoch": 0.6091556258651374, "step": 6161, "train/sim_loss": 0.04296875 }, { "epoch": 0.6091556258651374, "step": 6161, "train/total_loss": 0.1751789152622223 }, { "entropy": 8.94556713104248, "epoch": 0.609254498714653, "mean_token_accuracy": 0.724252462387085, "num_tokens": 11250504.0, "step": 6162, "train/ce_loss": 0.4792410135269165 }, { "epoch": 0.609254498714653, "step": 6162, "train/sim_loss": 0.07421875 }, { "epoch": 0.609254498714653, "step": 6162, "train/total_loss": 0.12214285135269165 }, { "entropy": 8.423084259033203, "epoch": 0.6093533715641685, "mean_token_accuracy": 0.678260862827301, "num_tokens": 11255785.0, "step": 6163, "train/ce_loss": 1.0342999696731567 }, { "epoch": 0.6093533715641685, "step": 6163, "train/sim_loss": 0.046875 }, { "epoch": 0.6093533715641685, "step": 6163, "train/total_loss": 0.15030500292778015 }, { "entropy": 9.02614974975586, "epoch": 0.609452244413684, "mean_token_accuracy": 0.7075471878051758, "num_tokens": 11260945.0, "step": 6164, "train/ce_loss": 1.1955517530441284 }, { "epoch": 0.609452244413684, "step": 6164, "train/sim_loss": 0.05859375 }, { "epoch": 0.609452244413684, "step": 6164, "train/total_loss": 0.17814892530441284 }, { "entropy": 9.500265121459961, "epoch": 0.6095511172631995, "mean_token_accuracy": 0.7566909790039062, "num_tokens": 11265750.0, "step": 6165, "train/ce_loss": 0.8175063729286194 }, { "epoch": 0.6095511172631995, "step": 6165, "train/sim_loss": 0.078125 }, { "epoch": 0.6095511172631995, "step": 6165, "train/total_loss": 0.15987563133239746 }, { "entropy": 9.123942375183105, "epoch": 0.609649990112715, "mean_token_accuracy": 0.7383177280426025, "num_tokens": 11270842.0, "step": 6166, "train/ce_loss": 0.5500314831733704 }, { "epoch": 0.609649990112715, "step": 6166, "train/sim_loss": 0.0390625 }, { "epoch": 0.609649990112715, "step": 6166, "train/total_loss": 0.09406565129756927 }, { "entropy": 8.76285457611084, "epoch": 0.6097488629622305, "mean_token_accuracy": 0.7166469693183899, "num_tokens": 11276114.0, "step": 6167, "train/ce_loss": 0.7664913535118103 }, { "epoch": 0.6097488629622305, "step": 6167, "train/sim_loss": 0.03125 }, { "epoch": 0.6097488629622305, "step": 6167, "train/total_loss": 0.10789913684129715 }, { "entropy": 8.750094413757324, "epoch": 0.6098477358117461, "mean_token_accuracy": 0.7453488111495972, "num_tokens": 11281455.0, "step": 6168, "train/ce_loss": 0.5771999359130859 }, { "epoch": 0.6098477358117461, "step": 6168, "train/sim_loss": 0.02734375 }, { "epoch": 0.6098477358117461, "step": 6168, "train/total_loss": 0.08506374061107635 }, { "entropy": 9.110280990600586, "epoch": 0.6099466086612616, "mean_token_accuracy": 0.7319587469100952, "num_tokens": 11286631.0, "step": 6169, "train/ce_loss": 1.0423439741134644 }, { "epoch": 0.6099466086612616, "step": 6169, "train/sim_loss": 0.046875 }, { "epoch": 0.6099466086612616, "step": 6169, "train/total_loss": 0.15110939741134644 }, { "entropy": 8.88326644897461, "epoch": 0.6100454815107771, "mean_token_accuracy": 0.723127007484436, "num_tokens": 11292023.0, "step": 6170, "train/ce_loss": 0.7747926712036133 }, { "epoch": 0.6100454815107771, "step": 6170, "train/sim_loss": 0.03125 }, { "epoch": 0.6100454815107771, "step": 6170, "train/total_loss": 0.10872926563024521 }, { "entropy": 9.328957557678223, "epoch": 0.6101443543602927, "mean_token_accuracy": 0.699312686920166, "num_tokens": 11297033.0, "step": 6171, "train/ce_loss": 1.3747528555541066e-06 }, { "epoch": 0.6101443543602927, "step": 6171, "train/sim_loss": 0.0625 }, { "epoch": 0.6101443543602927, "step": 6171, "train/total_loss": 0.06250013411045074 }, { "entropy": 9.24807357788086, "epoch": 0.6102432272098082, "mean_token_accuracy": 0.7193877696990967, "num_tokens": 11302105.0, "step": 6172, "train/ce_loss": 0.7044182419776917 }, { "epoch": 0.6102432272098082, "step": 6172, "train/sim_loss": 0.0625 }, { "epoch": 0.6102432272098082, "step": 6172, "train/total_loss": 0.1329418271780014 }, { "entropy": 8.854923248291016, "epoch": 0.6103421000593237, "mean_token_accuracy": 0.6898016929626465, "num_tokens": 11307296.0, "step": 6173, "train/ce_loss": 0.6190934777259827 }, { "epoch": 0.6103421000593237, "step": 6173, "train/sim_loss": 0.0390625 }, { "epoch": 0.6103421000593237, "step": 6173, "train/total_loss": 0.10097184777259827 }, { "entropy": 8.727234840393066, "epoch": 0.6104409729088393, "mean_token_accuracy": 0.7434988021850586, "num_tokens": 11312632.0, "step": 6174, "train/ce_loss": 0.9979443550109863 }, { "epoch": 0.6104409729088393, "step": 6174, "train/sim_loss": 0.05078125 }, { "epoch": 0.6104409729088393, "step": 6174, "train/total_loss": 0.1505756974220276 }, { "entropy": 10.119205474853516, "epoch": 0.6105398457583547, "mean_token_accuracy": 0.8416422009468079, "num_tokens": 11317340.0, "step": 6175, "train/ce_loss": 2.1631919935316546e-06 }, { "epoch": 0.6105398457583547, "step": 6175, "train/sim_loss": 0.0234375 }, { "epoch": 0.6105398457583547, "step": 6175, "train/total_loss": 0.02343771606683731 }, { "entropy": 8.866151809692383, "epoch": 0.6106387186078702, "mean_token_accuracy": 0.7371244430541992, "num_tokens": 11322773.0, "step": 6176, "train/ce_loss": 0.47456467151641846 }, { "epoch": 0.6106387186078702, "step": 6176, "train/sim_loss": 0.015625 }, { "epoch": 0.6106387186078702, "step": 6176, "train/total_loss": 0.06308147311210632 }, { "entropy": 9.114995002746582, "epoch": 0.6107375914573858, "mean_token_accuracy": 0.741605818271637, "num_tokens": 11327909.0, "step": 6177, "train/ce_loss": 0.40683189034461975 }, { "epoch": 0.6107375914573858, "step": 6177, "train/sim_loss": 0.02734375 }, { "epoch": 0.6107375914573858, "step": 6177, "train/total_loss": 0.06802694499492645 }, { "entropy": 8.819618225097656, "epoch": 0.6108364643069013, "mean_token_accuracy": 0.7382199168205261, "num_tokens": 11333130.0, "step": 6178, "train/ce_loss": 0.6157546043395996 }, { "epoch": 0.6108364643069013, "step": 6178, "train/sim_loss": 0.015625 }, { "epoch": 0.6108364643069013, "step": 6178, "train/total_loss": 0.07720045745372772 }, { "entropy": 8.793582916259766, "epoch": 0.6109353371564169, "mean_token_accuracy": 0.7600446343421936, "num_tokens": 11338471.0, "step": 6179, "train/ce_loss": 0.6357402801513672 }, { "epoch": 0.6109353371564169, "step": 6179, "train/sim_loss": 0.0703125 }, { "epoch": 0.6109353371564169, "step": 6179, "train/total_loss": 0.13388653099536896 }, { "epoch": 0.6110342100059324, "grad_norm": 0.6334864497184753, "learning_rate": 8.474756465410673e-06, "loss": 0.137, "step": 6180 }, { "entropy": 8.904747009277344, "epoch": 0.6110342100059324, "mean_token_accuracy": 0.7613122463226318, "num_tokens": 11343840.0, "step": 6180, "train/ce_loss": 0.6107622981071472 }, { "epoch": 0.6110342100059324, "step": 6180, "train/sim_loss": 0.0703125 }, { "epoch": 0.6110342100059324, "step": 6180, "train/total_loss": 0.13138872385025024 }, { "entropy": 8.89372444152832, "epoch": 0.6111330828554479, "mean_token_accuracy": 0.8065241575241089, "num_tokens": 11349062.0, "step": 6181, "train/ce_loss": 0.8261597752571106 }, { "epoch": 0.6111330828554479, "step": 6181, "train/sim_loss": 0.0625 }, { "epoch": 0.6111330828554479, "step": 6181, "train/total_loss": 0.14511597156524658 }, { "entropy": 8.715995788574219, "epoch": 0.6112319557049635, "mean_token_accuracy": 0.6978508234024048, "num_tokens": 11354371.0, "step": 6182, "train/ce_loss": 0.6819457411766052 }, { "epoch": 0.6112319557049635, "step": 6182, "train/sim_loss": 0.02734375 }, { "epoch": 0.6112319557049635, "step": 6182, "train/total_loss": 0.09553832560777664 }, { "entropy": 9.236974716186523, "epoch": 0.611330828554479, "mean_token_accuracy": 0.7191600799560547, "num_tokens": 11359717.0, "step": 6183, "train/ce_loss": 2.3581983441545162e-06 }, { "epoch": 0.611330828554479, "step": 6183, "train/sim_loss": 0.0859375 }, { "epoch": 0.611330828554479, "step": 6183, "train/total_loss": 0.0859377384185791 }, { "entropy": 8.74993896484375, "epoch": 0.6114297014039944, "mean_token_accuracy": 0.7932900190353394, "num_tokens": 11365105.0, "step": 6184, "train/ce_loss": 0.482496052980423 }, { "epoch": 0.6114297014039944, "step": 6184, "train/sim_loss": 0.0234375 }, { "epoch": 0.6114297014039944, "step": 6184, "train/total_loss": 0.07168710231781006 }, { "entropy": 9.187471389770508, "epoch": 0.61152857425351, "mean_token_accuracy": 0.7275494933128357, "num_tokens": 11370198.0, "step": 6185, "train/ce_loss": 1.1196050643920898 }, { "epoch": 0.61152857425351, "step": 6185, "train/sim_loss": 0.05078125 }, { "epoch": 0.61152857425351, "step": 6185, "train/total_loss": 0.1627417504787445 }, { "entropy": 9.1384859085083, "epoch": 0.6116274471030255, "mean_token_accuracy": 0.7666068077087402, "num_tokens": 11375240.0, "step": 6186, "train/ce_loss": 0.7825351357460022 }, { "epoch": 0.6116274471030255, "step": 6186, "train/sim_loss": 0.07421875 }, { "epoch": 0.6116274471030255, "step": 6186, "train/total_loss": 0.15247225761413574 }, { "entropy": 8.774885177612305, "epoch": 0.611726319952541, "mean_token_accuracy": 0.7636786699295044, "num_tokens": 11380570.0, "step": 6187, "train/ce_loss": 0.5543928146362305 }, { "epoch": 0.611726319952541, "step": 6187, "train/sim_loss": 0.0546875 }, { "epoch": 0.611726319952541, "step": 6187, "train/total_loss": 0.11012677848339081 }, { "entropy": 8.961973190307617, "epoch": 0.6118251928020566, "mean_token_accuracy": 0.7341935634613037, "num_tokens": 11385743.0, "step": 6188, "train/ce_loss": 1.0541927814483643 }, { "epoch": 0.6118251928020566, "step": 6188, "train/sim_loss": 0.07421875 }, { "epoch": 0.6118251928020566, "step": 6188, "train/total_loss": 0.17963802814483643 }, { "entropy": 9.709959030151367, "epoch": 0.6119240656515721, "mean_token_accuracy": 0.7247058749198914, "num_tokens": 11390595.0, "step": 6189, "train/ce_loss": 1.1530605554580688 }, { "epoch": 0.6119240656515721, "step": 6189, "train/sim_loss": 0.07421875 }, { "epoch": 0.6119240656515721, "step": 6189, "train/total_loss": 0.1895247995853424 }, { "entropy": 8.58003044128418, "epoch": 0.6120229385010876, "mean_token_accuracy": 0.7289271950721741, "num_tokens": 11396092.0, "step": 6190, "train/ce_loss": 0.7498189806938171 }, { "epoch": 0.6120229385010876, "step": 6190, "train/sim_loss": 0.0546875 }, { "epoch": 0.6120229385010876, "step": 6190, "train/total_loss": 0.1296693980693817 }, { "entropy": 9.221874237060547, "epoch": 0.6121218113506032, "mean_token_accuracy": 0.7179487347602844, "num_tokens": 11401173.0, "step": 6191, "train/ce_loss": 1.188353180885315 }, { "epoch": 0.6121218113506032, "step": 6191, "train/sim_loss": 0.0703125 }, { "epoch": 0.6121218113506032, "step": 6191, "train/total_loss": 0.18914783000946045 }, { "entropy": 9.272602081298828, "epoch": 0.6122206842001187, "mean_token_accuracy": 0.7354409098625183, "num_tokens": 11406245.0, "step": 6192, "train/ce_loss": 1.5668947526137345e-06 }, { "epoch": 0.6122206842001187, "step": 6192, "train/sim_loss": 0.046875 }, { "epoch": 0.6122206842001187, "step": 6192, "train/total_loss": 0.046875156462192535 }, { "entropy": 8.841050148010254, "epoch": 0.6123195570496341, "mean_token_accuracy": 0.724304735660553, "num_tokens": 11411568.0, "step": 6193, "train/ce_loss": 0.6666838526725769 }, { "epoch": 0.6123195570496341, "step": 6193, "train/sim_loss": 0.0859375 }, { "epoch": 0.6123195570496341, "step": 6193, "train/total_loss": 0.15260589122772217 }, { "entropy": 9.335506439208984, "epoch": 0.6124184298991497, "mean_token_accuracy": 0.7161290049552917, "num_tokens": 11416482.0, "step": 6194, "train/ce_loss": 1.1841667890548706 }, { "epoch": 0.6124184298991497, "step": 6194, "train/sim_loss": 0.0546875 }, { "epoch": 0.6124184298991497, "step": 6194, "train/total_loss": 0.1731041818857193 }, { "entropy": 8.814682006835938, "epoch": 0.6125173027486652, "mean_token_accuracy": 0.7270269989967346, "num_tokens": 11421743.0, "step": 6195, "train/ce_loss": 1.20308256149292 }, { "epoch": 0.6125173027486652, "step": 6195, "train/sim_loss": 0.09765625 }, { "epoch": 0.6125173027486652, "step": 6195, "train/total_loss": 0.21796450018882751 }, { "entropy": 9.396188735961914, "epoch": 0.6126161755981807, "mean_token_accuracy": 0.7756314873695374, "num_tokens": 11426869.0, "step": 6196, "train/ce_loss": 0.9634108543395996 }, { "epoch": 0.6126161755981807, "step": 6196, "train/sim_loss": 0.08203125 }, { "epoch": 0.6126161755981807, "step": 6196, "train/total_loss": 0.1783723384141922 }, { "entropy": 8.846820831298828, "epoch": 0.6127150484476963, "mean_token_accuracy": 0.7310252785682678, "num_tokens": 11432067.0, "step": 6197, "train/ce_loss": 1.1840417385101318 }, { "epoch": 0.6127150484476963, "step": 6197, "train/sim_loss": 0.05078125 }, { "epoch": 0.6127150484476963, "step": 6197, "train/total_loss": 0.16918542981147766 }, { "entropy": 8.889070510864258, "epoch": 0.6128139212972118, "mean_token_accuracy": 0.7651006579399109, "num_tokens": 11437402.0, "step": 6198, "train/ce_loss": 0.6077854633331299 }, { "epoch": 0.6128139212972118, "step": 6198, "train/sim_loss": 0.03125 }, { "epoch": 0.6128139212972118, "step": 6198, "train/total_loss": 0.09202854335308075 }, { "entropy": 8.941230773925781, "epoch": 0.6129127941467273, "mean_token_accuracy": 0.771136999130249, "num_tokens": 11442586.0, "step": 6199, "train/ce_loss": 0.756876528263092 }, { "epoch": 0.6129127941467273, "step": 6199, "train/sim_loss": 0.0625 }, { "epoch": 0.6129127941467273, "step": 6199, "train/total_loss": 0.13818764686584473 }, { "epoch": 0.6130116669962429, "grad_norm": 0.8004269599914551, "learning_rate": 8.469811600652723e-06, "loss": 0.1392, "step": 6200 }, { "entropy": 9.248784065246582, "epoch": 0.6130116669962429, "mean_token_accuracy": 0.6237244606018066, "num_tokens": 11447810.0, "step": 6200, "train/ce_loss": 0.616258978843689 }, { "epoch": 0.6130116669962429, "step": 6200, "train/sim_loss": 0.06640625 }, { "epoch": 0.6130116669962429, "step": 6200, "train/total_loss": 0.1280321478843689 }, { "entropy": 8.512928009033203, "epoch": 0.6131105398457584, "mean_token_accuracy": 0.7426120042800903, "num_tokens": 11453340.0, "step": 6201, "train/ce_loss": 0.7950108647346497 }, { "epoch": 0.6131105398457584, "step": 6201, "train/sim_loss": 0.0234375 }, { "epoch": 0.6131105398457584, "step": 6201, "train/total_loss": 0.10293858498334885 }, { "entropy": 9.26449966430664, "epoch": 0.6132094126952738, "mean_token_accuracy": 0.6986899375915527, "num_tokens": 11458402.0, "step": 6202, "train/ce_loss": 1.1567436456680298 }, { "epoch": 0.6132094126952738, "step": 6202, "train/sim_loss": 0.04296875 }, { "epoch": 0.6132094126952738, "step": 6202, "train/total_loss": 0.15864312648773193 }, { "entropy": 8.734781265258789, "epoch": 0.6133082855447894, "mean_token_accuracy": 0.7141104340553284, "num_tokens": 11463654.0, "step": 6203, "train/ce_loss": 0.7052172422409058 }, { "epoch": 0.6133082855447894, "step": 6203, "train/sim_loss": 0.0546875 }, { "epoch": 0.6133082855447894, "step": 6203, "train/total_loss": 0.12520922720432281 }, { "entropy": 8.867867469787598, "epoch": 0.6134071583943049, "mean_token_accuracy": 0.7478890419006348, "num_tokens": 11468962.0, "step": 6204, "train/ce_loss": 0.589647114276886 }, { "epoch": 0.6134071583943049, "step": 6204, "train/sim_loss": 0.03515625 }, { "epoch": 0.6134071583943049, "step": 6204, "train/total_loss": 0.09412096440792084 }, { "entropy": 8.96460247039795, "epoch": 0.6135060312438204, "mean_token_accuracy": 0.7923875451087952, "num_tokens": 11474277.0, "step": 6205, "train/ce_loss": 0.5550974607467651 }, { "epoch": 0.6135060312438204, "step": 6205, "train/sim_loss": 0.02734375 }, { "epoch": 0.6135060312438204, "step": 6205, "train/total_loss": 0.08285349607467651 }, { "entropy": 9.070470809936523, "epoch": 0.613604904093336, "mean_token_accuracy": 0.7423398494720459, "num_tokens": 11479483.0, "step": 6206, "train/ce_loss": 1.1447519063949585 }, { "epoch": 0.613604904093336, "step": 6206, "train/sim_loss": 0.0390625 }, { "epoch": 0.613604904093336, "step": 6206, "train/total_loss": 0.15353769063949585 }, { "entropy": 9.126496315002441, "epoch": 0.6137037769428515, "mean_token_accuracy": 0.7736318111419678, "num_tokens": 11484742.0, "step": 6207, "train/ce_loss": 0.7532952427864075 }, { "epoch": 0.6137037769428515, "step": 6207, "train/sim_loss": 0.03125 }, { "epoch": 0.6137037769428515, "step": 6207, "train/total_loss": 0.10657952725887299 }, { "entropy": 8.78580093383789, "epoch": 0.613802649792367, "mean_token_accuracy": 0.7870563864707947, "num_tokens": 11490166.0, "step": 6208, "train/ce_loss": 0.644048810005188 }, { "epoch": 0.613802649792367, "step": 6208, "train/sim_loss": 0.078125 }, { "epoch": 0.613802649792367, "step": 6208, "train/total_loss": 0.14252987504005432 }, { "entropy": 9.546608924865723, "epoch": 0.6139015226418826, "mean_token_accuracy": 0.6877133250236511, "num_tokens": 11495191.0, "step": 6209, "train/ce_loss": 1.6582438945770264 }, { "epoch": 0.6139015226418826, "step": 6209, "train/sim_loss": 0.09765625 }, { "epoch": 0.6139015226418826, "step": 6209, "train/total_loss": 0.26348066329956055 }, { "entropy": 8.778793334960938, "epoch": 0.6140003954913981, "mean_token_accuracy": 0.7088273763656616, "num_tokens": 11500432.0, "step": 6210, "train/ce_loss": 1.4049540758132935 }, { "epoch": 0.6140003954913981, "step": 6210, "train/sim_loss": 0.07421875 }, { "epoch": 0.6140003954913981, "step": 6210, "train/total_loss": 0.2147141546010971 }, { "entropy": 8.937134742736816, "epoch": 0.6140992683409136, "mean_token_accuracy": 0.7649402618408203, "num_tokens": 11505654.0, "step": 6211, "train/ce_loss": 0.6439159512519836 }, { "epoch": 0.6140992683409136, "step": 6211, "train/sim_loss": 0.05078125 }, { "epoch": 0.6140992683409136, "step": 6211, "train/total_loss": 0.1151728481054306 }, { "entropy": 9.340566635131836, "epoch": 0.6141981411904291, "mean_token_accuracy": 0.7306451797485352, "num_tokens": 11510704.0, "step": 6212, "train/ce_loss": 0.6049285531044006 }, { "epoch": 0.6141981411904291, "step": 6212, "train/sim_loss": 0.0546875 }, { "epoch": 0.6141981411904291, "step": 6212, "train/total_loss": 0.1151803582906723 }, { "entropy": 8.783777236938477, "epoch": 0.6142970140399446, "mean_token_accuracy": 0.7322677373886108, "num_tokens": 11516233.0, "step": 6213, "train/ce_loss": 0.45691490173339844 }, { "epoch": 0.6142970140399446, "step": 6213, "train/sim_loss": 0.0625 }, { "epoch": 0.6142970140399446, "step": 6213, "train/total_loss": 0.10819149017333984 }, { "entropy": 9.364067077636719, "epoch": 0.6143958868894601, "mean_token_accuracy": 0.6932849287986755, "num_tokens": 11521220.0, "step": 6214, "train/ce_loss": 1.3702287673950195 }, { "epoch": 0.6143958868894601, "step": 6214, "train/sim_loss": 0.05859375 }, { "epoch": 0.6143958868894601, "step": 6214, "train/total_loss": 0.19561663269996643 }, { "entropy": 9.151406288146973, "epoch": 0.6144947597389757, "mean_token_accuracy": 0.7899860739707947, "num_tokens": 11526383.0, "step": 6215, "train/ce_loss": 0.522618293762207 }, { "epoch": 0.6144947597389757, "step": 6215, "train/sim_loss": 0.01953125 }, { "epoch": 0.6144947597389757, "step": 6215, "train/total_loss": 0.0717930793762207 }, { "entropy": 8.809883117675781, "epoch": 0.6145936325884912, "mean_token_accuracy": 0.7736625671386719, "num_tokens": 11531831.0, "step": 6216, "train/ce_loss": 0.34042075276374817 }, { "epoch": 0.6145936325884912, "step": 6216, "train/sim_loss": 0.015625 }, { "epoch": 0.6145936325884912, "step": 6216, "train/total_loss": 0.04966707527637482 }, { "entropy": 9.537630081176758, "epoch": 0.6146925054380067, "mean_token_accuracy": 0.8206785321235657, "num_tokens": 11536843.0, "step": 6217, "train/ce_loss": 0.9631327986717224 }, { "epoch": 0.6146925054380067, "step": 6217, "train/sim_loss": 0.0234375 }, { "epoch": 0.6146925054380067, "step": 6217, "train/total_loss": 0.11975078284740448 }, { "entropy": 9.146952629089355, "epoch": 0.6147913782875223, "mean_token_accuracy": 0.7847328186035156, "num_tokens": 11541948.0, "step": 6218, "train/ce_loss": 0.7013130784034729 }, { "epoch": 0.6147913782875223, "step": 6218, "train/sim_loss": 0.109375 }, { "epoch": 0.6147913782875223, "step": 6218, "train/total_loss": 0.1795063018798828 }, { "entropy": 8.299921035766602, "epoch": 0.6148902511370378, "mean_token_accuracy": 0.7516198754310608, "num_tokens": 11547343.0, "step": 6219, "train/ce_loss": 0.756189227104187 }, { "epoch": 0.6148902511370378, "step": 6219, "train/sim_loss": 0.05859375 }, { "epoch": 0.6148902511370378, "step": 6219, "train/total_loss": 0.1342126727104187 }, { "epoch": 0.6149891239865533, "grad_norm": 0.6907902956008911, "learning_rate": 8.464866735894775e-06, "loss": 0.1332, "step": 6220 }, { "entropy": 9.126428604125977, "epoch": 0.6149891239865533, "mean_token_accuracy": 0.7591836452484131, "num_tokens": 11552520.0, "step": 6220, "train/ce_loss": 1.4551453590393066 }, { "epoch": 0.6149891239865533, "step": 6220, "train/sim_loss": 0.0625 }, { "epoch": 0.6149891239865533, "step": 6220, "train/total_loss": 0.20801453292369843 }, { "entropy": 9.308960914611816, "epoch": 0.6150879968360689, "mean_token_accuracy": 0.7242105007171631, "num_tokens": 11557445.0, "step": 6221, "train/ce_loss": 1.0523173809051514 }, { "epoch": 0.6150879968360689, "step": 6221, "train/sim_loss": 0.046875 }, { "epoch": 0.6150879968360689, "step": 6221, "train/total_loss": 0.15210673213005066 }, { "entropy": 9.870338439941406, "epoch": 0.6151868696855843, "mean_token_accuracy": 0.780927836894989, "num_tokens": 11562238.0, "step": 6222, "train/ce_loss": 1.603911280632019 }, { "epoch": 0.6151868696855843, "step": 6222, "train/sim_loss": 0.08203125 }, { "epoch": 0.6151868696855843, "step": 6222, "train/total_loss": 0.24242238700389862 }, { "entropy": 8.37800121307373, "epoch": 0.6152857425350998, "mean_token_accuracy": 0.731225311756134, "num_tokens": 11567710.0, "step": 6223, "train/ce_loss": 0.9325500130653381 }, { "epoch": 0.6152857425350998, "step": 6223, "train/sim_loss": 0.046875 }, { "epoch": 0.6152857425350998, "step": 6223, "train/total_loss": 0.14013001322746277 }, { "entropy": 9.188399314880371, "epoch": 0.6153846153846154, "mean_token_accuracy": 0.7624223828315735, "num_tokens": 11572785.0, "step": 6224, "train/ce_loss": 9.269812153434032e-07 }, { "epoch": 0.6153846153846154, "step": 6224, "train/sim_loss": 0.0703125 }, { "epoch": 0.6153846153846154, "step": 6224, "train/total_loss": 0.07031258940696716 }, { "entropy": 9.501968383789062, "epoch": 0.6154834882341309, "mean_token_accuracy": 0.776627242565155, "num_tokens": 11577929.0, "step": 6225, "train/ce_loss": 0.9787071943283081 }, { "epoch": 0.6154834882341309, "step": 6225, "train/sim_loss": 0.0234375 }, { "epoch": 0.6154834882341309, "step": 6225, "train/total_loss": 0.12130822241306305 }, { "entropy": 8.997016906738281, "epoch": 0.6155823610836464, "mean_token_accuracy": 0.7281323671340942, "num_tokens": 11583168.0, "step": 6226, "train/ce_loss": 1.01045823097229 }, { "epoch": 0.6155823610836464, "step": 6226, "train/sim_loss": 0.03125 }, { "epoch": 0.6155823610836464, "step": 6226, "train/total_loss": 0.13229581713676453 }, { "entropy": 8.857669830322266, "epoch": 0.615681233933162, "mean_token_accuracy": 0.7765042781829834, "num_tokens": 11588293.0, "step": 6227, "train/ce_loss": 1.1743441820144653 }, { "epoch": 0.615681233933162, "step": 6227, "train/sim_loss": 0.03515625 }, { "epoch": 0.615681233933162, "step": 6227, "train/total_loss": 0.15259066224098206 }, { "entropy": 8.81214714050293, "epoch": 0.6157801067826775, "mean_token_accuracy": 0.7611940503120422, "num_tokens": 11593576.0, "step": 6228, "train/ce_loss": 0.9884230494499207 }, { "epoch": 0.6157801067826775, "step": 6228, "train/sim_loss": 0.02734375 }, { "epoch": 0.6157801067826775, "step": 6228, "train/total_loss": 0.1261860579252243 }, { "entropy": 8.675318717956543, "epoch": 0.615878979632193, "mean_token_accuracy": 0.7145969271659851, "num_tokens": 11598959.0, "step": 6229, "train/ce_loss": 0.6410172581672668 }, { "epoch": 0.615878979632193, "step": 6229, "train/sim_loss": 0.078125 }, { "epoch": 0.615878979632193, "step": 6229, "train/total_loss": 0.14222672581672668 }, { "entropy": 8.503095626831055, "epoch": 0.6159778524817086, "mean_token_accuracy": 0.7460484504699707, "num_tokens": 11604406.0, "step": 6230, "train/ce_loss": 0.8307203650474548 }, { "epoch": 0.6159778524817086, "step": 6230, "train/sim_loss": 0.09375 }, { "epoch": 0.6159778524817086, "step": 6230, "train/total_loss": 0.17682203650474548 }, { "entropy": 8.74378776550293, "epoch": 0.616076725331224, "mean_token_accuracy": 0.774193525314331, "num_tokens": 11609913.0, "step": 6231, "train/ce_loss": 0.58668053150177 }, { "epoch": 0.616076725331224, "step": 6231, "train/sim_loss": 0.015625 }, { "epoch": 0.616076725331224, "step": 6231, "train/total_loss": 0.07429305464029312 }, { "entropy": 8.621826171875, "epoch": 0.6161755981807395, "mean_token_accuracy": 0.7532728910446167, "num_tokens": 11615550.0, "step": 6232, "train/ce_loss": 0.9261634945869446 }, { "epoch": 0.6161755981807395, "step": 6232, "train/sim_loss": 0.07421875 }, { "epoch": 0.6161755981807395, "step": 6232, "train/total_loss": 0.16683509945869446 }, { "entropy": 9.242183685302734, "epoch": 0.6162744710302551, "mean_token_accuracy": 0.7163233160972595, "num_tokens": 11620645.0, "step": 6233, "train/ce_loss": 1.4938048124313354 }, { "epoch": 0.6162744710302551, "step": 6233, "train/sim_loss": 0.078125 }, { "epoch": 0.6162744710302551, "step": 6233, "train/total_loss": 0.22750549018383026 }, { "entropy": 9.009716033935547, "epoch": 0.6163733438797706, "mean_token_accuracy": 0.7153284549713135, "num_tokens": 11625815.0, "step": 6234, "train/ce_loss": 7.141983360270387e-07 }, { "epoch": 0.6163733438797706, "step": 6234, "train/sim_loss": 0.078125 }, { "epoch": 0.6163733438797706, "step": 6234, "train/total_loss": 0.07812507450580597 }, { "entropy": 9.606576919555664, "epoch": 0.6164722167292861, "mean_token_accuracy": 0.7597172856330872, "num_tokens": 11630837.0, "step": 6235, "train/ce_loss": 0.7407086491584778 }, { "epoch": 0.6164722167292861, "step": 6235, "train/sim_loss": 0.046875 }, { "epoch": 0.6164722167292861, "step": 6235, "train/total_loss": 0.12094586342573166 }, { "entropy": 9.218091011047363, "epoch": 0.6165710895788017, "mean_token_accuracy": 0.7296848893165588, "num_tokens": 11635860.0, "step": 6236, "train/ce_loss": 0.9628185629844666 }, { "epoch": 0.6165710895788017, "step": 6236, "train/sim_loss": 0.07421875 }, { "epoch": 0.6165710895788017, "step": 6236, "train/total_loss": 0.17050060629844666 }, { "entropy": 10.20110034942627, "epoch": 0.6166699624283172, "mean_token_accuracy": 0.757446825504303, "num_tokens": 11640456.0, "step": 6237, "train/ce_loss": 2.8913418645970523e-06 }, { "epoch": 0.6166699624283172, "step": 6237, "train/sim_loss": 0.0234375 }, { "epoch": 0.6166699624283172, "step": 6237, "train/total_loss": 0.02343778870999813 }, { "entropy": 8.834396362304688, "epoch": 0.6167688352778327, "mean_token_accuracy": 0.7523584961891174, "num_tokens": 11645773.0, "step": 6238, "train/ce_loss": 1.054317831993103 }, { "epoch": 0.6167688352778327, "step": 6238, "train/sim_loss": 0.04296875 }, { "epoch": 0.6167688352778327, "step": 6238, "train/total_loss": 0.14840054512023926 }, { "entropy": 9.45189094543457, "epoch": 0.6168677081273483, "mean_token_accuracy": 0.7443946003913879, "num_tokens": 11650648.0, "step": 6239, "train/ce_loss": 1.6271706044790335e-06 }, { "epoch": 0.6168677081273483, "step": 6239, "train/sim_loss": 0.04296875 }, { "epoch": 0.6168677081273483, "step": 6239, "train/total_loss": 0.04296891391277313 }, { "epoch": 0.6169665809768637, "grad_norm": 0.835460901260376, "learning_rate": 8.459921871136824e-06, "loss": 0.1377, "step": 6240 }, { "entropy": 8.653219223022461, "epoch": 0.6169665809768637, "mean_token_accuracy": 0.7077363729476929, "num_tokens": 11656180.0, "step": 6240, "train/ce_loss": 1.6680773496627808 }, { "epoch": 0.6169665809768637, "step": 6240, "train/sim_loss": 0.0703125 }, { "epoch": 0.6169665809768637, "step": 6240, "train/total_loss": 0.23712024092674255 }, { "entropy": 9.151749610900879, "epoch": 0.6170654538263792, "mean_token_accuracy": 0.7945945858955383, "num_tokens": 11661209.0, "step": 6241, "train/ce_loss": 0.5691468119621277 }, { "epoch": 0.6170654538263792, "step": 6241, "train/sim_loss": 0.05859375 }, { "epoch": 0.6170654538263792, "step": 6241, "train/total_loss": 0.11550843715667725 }, { "entropy": 8.956352233886719, "epoch": 0.6171643266758948, "mean_token_accuracy": 0.6747967600822449, "num_tokens": 11666563.0, "step": 6242, "train/ce_loss": 0.6857337355613708 }, { "epoch": 0.6171643266758948, "step": 6242, "train/sim_loss": 0.0234375 }, { "epoch": 0.6171643266758948, "step": 6242, "train/total_loss": 0.09201087802648544 }, { "entropy": 8.7849702835083, "epoch": 0.6172631995254103, "mean_token_accuracy": 0.7371638417243958, "num_tokens": 11671848.0, "step": 6243, "train/ce_loss": 0.596751868724823 }, { "epoch": 0.6172631995254103, "step": 6243, "train/sim_loss": 0.03515625 }, { "epoch": 0.6172631995254103, "step": 6243, "train/total_loss": 0.0948314368724823 }, { "entropy": 9.008289337158203, "epoch": 0.6173620723749258, "mean_token_accuracy": 0.8005865216255188, "num_tokens": 11676988.0, "step": 6244, "train/ce_loss": 0.6968074440956116 }, { "epoch": 0.6173620723749258, "step": 6244, "train/sim_loss": 0.03125 }, { "epoch": 0.6173620723749258, "step": 6244, "train/total_loss": 0.10093074291944504 }, { "entropy": 9.25759220123291, "epoch": 0.6174609452244414, "mean_token_accuracy": 0.7796609997749329, "num_tokens": 11682104.0, "step": 6245, "train/ce_loss": 1.2047686576843262 }, { "epoch": 0.6174609452244414, "step": 6245, "train/sim_loss": 0.10546875 }, { "epoch": 0.6174609452244414, "step": 6245, "train/total_loss": 0.2259456217288971 }, { "entropy": 8.841020584106445, "epoch": 0.6175598180739569, "mean_token_accuracy": 0.7383592128753662, "num_tokens": 11687445.0, "step": 6246, "train/ce_loss": 0.7534462809562683 }, { "epoch": 0.6175598180739569, "step": 6246, "train/sim_loss": 0.046875 }, { "epoch": 0.6175598180739569, "step": 6246, "train/total_loss": 0.12221962958574295 }, { "entropy": 8.788390159606934, "epoch": 0.6176586909234724, "mean_token_accuracy": 0.7828418016433716, "num_tokens": 11692736.0, "step": 6247, "train/ce_loss": 0.6744476556777954 }, { "epoch": 0.6176586909234724, "step": 6247, "train/sim_loss": 0.0234375 }, { "epoch": 0.6176586909234724, "step": 6247, "train/total_loss": 0.09088226407766342 }, { "entropy": 9.2723388671875, "epoch": 0.617757563772988, "mean_token_accuracy": 0.7493036389350891, "num_tokens": 11697864.0, "step": 6248, "train/ce_loss": 0.7439592480659485 }, { "epoch": 0.617757563772988, "step": 6248, "train/sim_loss": 0.0703125 }, { "epoch": 0.617757563772988, "step": 6248, "train/total_loss": 0.14470842480659485 }, { "entropy": 9.078840255737305, "epoch": 0.6178564366225034, "mean_token_accuracy": 0.6967560052871704, "num_tokens": 11703003.0, "step": 6249, "train/ce_loss": 1.248230218887329 }, { "epoch": 0.6178564366225034, "step": 6249, "train/sim_loss": 0.06640625 }, { "epoch": 0.6178564366225034, "step": 6249, "train/total_loss": 0.19122928380966187 }, { "entropy": 9.284374237060547, "epoch": 0.6179553094720189, "mean_token_accuracy": 0.6666666865348816, "num_tokens": 11708102.0, "step": 6250, "train/ce_loss": 1.8740671873092651 }, { "epoch": 0.6179553094720189, "step": 6250, "train/sim_loss": 0.0859375 }, { "epoch": 0.6179553094720189, "step": 6250, "train/total_loss": 0.2733442187309265 }, { "entropy": 9.415840148925781, "epoch": 0.6180541823215345, "mean_token_accuracy": 0.7890772223472595, "num_tokens": 11713093.0, "step": 6251, "train/ce_loss": 1.5481805801391602 }, { "epoch": 0.6180541823215345, "step": 6251, "train/sim_loss": 0.0703125 }, { "epoch": 0.6180541823215345, "step": 6251, "train/total_loss": 0.22513055801391602 }, { "entropy": 8.931975364685059, "epoch": 0.61815305517105, "mean_token_accuracy": 0.7244284152984619, "num_tokens": 11718370.0, "step": 6252, "train/ce_loss": 0.5441824793815613 }, { "epoch": 0.61815305517105, "step": 6252, "train/sim_loss": 0.05078125 }, { "epoch": 0.61815305517105, "step": 6252, "train/total_loss": 0.10519950091838837 }, { "entropy": 9.544601440429688, "epoch": 0.6182519280205655, "mean_token_accuracy": 0.7572559118270874, "num_tokens": 11723170.0, "step": 6253, "train/ce_loss": 2.360188545935671e-06 }, { "epoch": 0.6182519280205655, "step": 6253, "train/sim_loss": 0.0546875 }, { "epoch": 0.6182519280205655, "step": 6253, "train/total_loss": 0.0546877346932888 }, { "entropy": 8.967365264892578, "epoch": 0.6183508008700811, "mean_token_accuracy": 0.7658142447471619, "num_tokens": 11728380.0, "step": 6254, "train/ce_loss": 0.2919480502605438 }, { "epoch": 0.6183508008700811, "step": 6254, "train/sim_loss": 0.02734375 }, { "epoch": 0.6183508008700811, "step": 6254, "train/total_loss": 0.05653855577111244 }, { "entropy": 9.081618309020996, "epoch": 0.6184496737195966, "mean_token_accuracy": 0.795484721660614, "num_tokens": 11733605.0, "step": 6255, "train/ce_loss": 1.271119253942743e-05 }, { "epoch": 0.6184496737195966, "step": 6255, "train/sim_loss": 0.06640625 }, { "epoch": 0.6184496737195966, "step": 6255, "train/total_loss": 0.06640752404928207 }, { "entropy": 8.944194793701172, "epoch": 0.6185485465691121, "mean_token_accuracy": 0.7186261415481567, "num_tokens": 11738795.0, "step": 6256, "train/ce_loss": 0.8012778162956238 }, { "epoch": 0.6185485465691121, "step": 6256, "train/sim_loss": 0.04296875 }, { "epoch": 0.6185485465691121, "step": 6256, "train/total_loss": 0.1230965331196785 }, { "entropy": 8.654861450195312, "epoch": 0.6186474194186277, "mean_token_accuracy": 0.7975663542747498, "num_tokens": 11744109.0, "step": 6257, "train/ce_loss": 0.4617496728897095 }, { "epoch": 0.6186474194186277, "step": 6257, "train/sim_loss": 0.04296875 }, { "epoch": 0.6186474194186277, "step": 6257, "train/total_loss": 0.08914372324943542 }, { "entropy": 9.075235366821289, "epoch": 0.6187462922681431, "mean_token_accuracy": 0.7780784964561462, "num_tokens": 11749328.0, "step": 6258, "train/ce_loss": 0.7430073022842407 }, { "epoch": 0.6187462922681431, "step": 6258, "train/sim_loss": 0.0546875 }, { "epoch": 0.6187462922681431, "step": 6258, "train/total_loss": 0.12898823618888855 }, { "entropy": 8.558425903320312, "epoch": 0.6188451651176586, "mean_token_accuracy": 0.7475622892379761, "num_tokens": 11754749.0, "step": 6259, "train/ce_loss": 0.8147485256195068 }, { "epoch": 0.6188451651176586, "step": 6259, "train/sim_loss": 0.08984375 }, { "epoch": 0.6188451651176586, "step": 6259, "train/total_loss": 0.17131860554218292 }, { "epoch": 0.6189440379671742, "grad_norm": 0.6895498633384705, "learning_rate": 8.454977006378876e-06, "loss": 0.1334, "step": 6260 }, { "entropy": 8.543374061584473, "epoch": 0.6189440379671742, "mean_token_accuracy": 0.7225490212440491, "num_tokens": 11760286.0, "step": 6260, "train/ce_loss": 1.124602198600769 }, { "epoch": 0.6189440379671742, "step": 6260, "train/sim_loss": 0.08203125 }, { "epoch": 0.6189440379671742, "step": 6260, "train/total_loss": 0.19449147582054138 }, { "entropy": 9.300775527954102, "epoch": 0.6190429108166897, "mean_token_accuracy": 0.7272727489471436, "num_tokens": 11765489.0, "step": 6261, "train/ce_loss": 0.8462640643119812 }, { "epoch": 0.6190429108166897, "step": 6261, "train/sim_loss": 0.03515625 }, { "epoch": 0.6190429108166897, "step": 6261, "train/total_loss": 0.11978265643119812 }, { "entropy": 8.763044357299805, "epoch": 0.6191417836662053, "mean_token_accuracy": 0.7466216087341309, "num_tokens": 11770834.0, "step": 6262, "train/ce_loss": 0.8813503980636597 }, { "epoch": 0.6191417836662053, "step": 6262, "train/sim_loss": 0.0546875 }, { "epoch": 0.6191417836662053, "step": 6262, "train/total_loss": 0.1428225338459015 }, { "entropy": 8.738752365112305, "epoch": 0.6192406565157208, "mean_token_accuracy": 0.7152406573295593, "num_tokens": 11775999.0, "step": 6263, "train/ce_loss": 0.6721773147583008 }, { "epoch": 0.6192406565157208, "step": 6263, "train/sim_loss": 0.04296875 }, { "epoch": 0.6192406565157208, "step": 6263, "train/total_loss": 0.11018647998571396 }, { "entropy": 8.763019561767578, "epoch": 0.6193395293652363, "mean_token_accuracy": 0.6934097409248352, "num_tokens": 11781148.0, "step": 6264, "train/ce_loss": 1.057170033454895 }, { "epoch": 0.6193395293652363, "step": 6264, "train/sim_loss": 0.03125 }, { "epoch": 0.6193395293652363, "step": 6264, "train/total_loss": 0.1369670033454895 }, { "entropy": 8.852022171020508, "epoch": 0.6194384022147519, "mean_token_accuracy": 0.7561880946159363, "num_tokens": 11786434.0, "step": 6265, "train/ce_loss": 0.650295615196228 }, { "epoch": 0.6194384022147519, "step": 6265, "train/sim_loss": 0.01953125 }, { "epoch": 0.6194384022147519, "step": 6265, "train/total_loss": 0.0845608115196228 }, { "entropy": 8.8131685256958, "epoch": 0.6195372750642674, "mean_token_accuracy": 0.7301587462425232, "num_tokens": 11791750.0, "step": 6266, "train/ce_loss": 1.1719716787338257 }, { "epoch": 0.6195372750642674, "step": 6266, "train/sim_loss": 0.046875 }, { "epoch": 0.6195372750642674, "step": 6266, "train/total_loss": 0.1640721708536148 }, { "entropy": 9.048507690429688, "epoch": 0.6196361479137829, "mean_token_accuracy": 0.75, "num_tokens": 11797015.0, "step": 6267, "train/ce_loss": 0.5786001086235046 }, { "epoch": 0.6196361479137829, "step": 6267, "train/sim_loss": 0.078125 }, { "epoch": 0.6196361479137829, "step": 6267, "train/total_loss": 0.13598501682281494 }, { "entropy": 9.642312049865723, "epoch": 0.6197350207632984, "mean_token_accuracy": 0.7819253206253052, "num_tokens": 11801960.0, "step": 6268, "train/ce_loss": 0.6379991769790649 }, { "epoch": 0.6197350207632984, "step": 6268, "train/sim_loss": 0.01953125 }, { "epoch": 0.6197350207632984, "step": 6268, "train/total_loss": 0.0833311676979065 }, { "entropy": 9.178560256958008, "epoch": 0.6198338936128139, "mean_token_accuracy": 0.7324159145355225, "num_tokens": 11807065.0, "step": 6269, "train/ce_loss": 8.792806625024241e-07 }, { "epoch": 0.6198338936128139, "step": 6269, "train/sim_loss": 0.03125 }, { "epoch": 0.6198338936128139, "step": 6269, "train/total_loss": 0.03125008940696716 }, { "entropy": 8.404592514038086, "epoch": 0.6199327664623294, "mean_token_accuracy": 0.7387914061546326, "num_tokens": 11812520.0, "step": 6270, "train/ce_loss": 0.7952864170074463 }, { "epoch": 0.6199327664623294, "step": 6270, "train/sim_loss": 0.0234375 }, { "epoch": 0.6199327664623294, "step": 6270, "train/total_loss": 0.10296614468097687 }, { "entropy": 8.78640365600586, "epoch": 0.620031639311845, "mean_token_accuracy": 0.7868852615356445, "num_tokens": 11817911.0, "step": 6271, "train/ce_loss": 0.78782057762146 }, { "epoch": 0.620031639311845, "step": 6271, "train/sim_loss": 0.05078125 }, { "epoch": 0.620031639311845, "step": 6271, "train/total_loss": 0.12956330180168152 }, { "entropy": 8.49759292602539, "epoch": 0.6201305121613605, "mean_token_accuracy": 0.7430703639984131, "num_tokens": 11823337.0, "step": 6272, "train/ce_loss": 0.6456509232521057 }, { "epoch": 0.6201305121613605, "step": 6272, "train/sim_loss": 0.02734375 }, { "epoch": 0.6201305121613605, "step": 6272, "train/total_loss": 0.09190884232521057 }, { "entropy": 9.274723052978516, "epoch": 0.620229385010876, "mean_token_accuracy": 0.7439758777618408, "num_tokens": 11828369.0, "step": 6273, "train/ce_loss": 0.9972986578941345 }, { "epoch": 0.620229385010876, "step": 6273, "train/sim_loss": 0.046875 }, { "epoch": 0.620229385010876, "step": 6273, "train/total_loss": 0.14660486578941345 }, { "entropy": 8.786995887756348, "epoch": 0.6203282578603916, "mean_token_accuracy": 0.73380446434021, "num_tokens": 11833698.0, "step": 6274, "train/ce_loss": 1.1607381105422974 }, { "epoch": 0.6203282578603916, "step": 6274, "train/sim_loss": 0.06640625 }, { "epoch": 0.6203282578603916, "step": 6274, "train/total_loss": 0.1824800670146942 }, { "entropy": 8.677114486694336, "epoch": 0.6204271307099071, "mean_token_accuracy": 0.698074996471405, "num_tokens": 11839113.0, "step": 6275, "train/ce_loss": 0.9493786692619324 }, { "epoch": 0.6204271307099071, "step": 6275, "train/sim_loss": 0.0234375 }, { "epoch": 0.6204271307099071, "step": 6275, "train/total_loss": 0.11837536841630936 }, { "entropy": 8.763256072998047, "epoch": 0.6205260035594226, "mean_token_accuracy": 0.7807737588882446, "num_tokens": 11844481.0, "step": 6276, "train/ce_loss": 0.3372906446456909 }, { "epoch": 0.6205260035594226, "step": 6276, "train/sim_loss": 0.015625 }, { "epoch": 0.6205260035594226, "step": 6276, "train/total_loss": 0.04935406520962715 }, { "entropy": 8.933353424072266, "epoch": 0.6206248764089382, "mean_token_accuracy": 0.733742356300354, "num_tokens": 11849706.0, "step": 6277, "train/ce_loss": 0.6353260278701782 }, { "epoch": 0.6206248764089382, "step": 6277, "train/sim_loss": 0.078125 }, { "epoch": 0.6206248764089382, "step": 6277, "train/total_loss": 0.14165760576725006 }, { "entropy": 8.695978164672852, "epoch": 0.6207237492584536, "mean_token_accuracy": 0.7405900359153748, "num_tokens": 11855180.0, "step": 6278, "train/ce_loss": 0.6998786926269531 }, { "epoch": 0.6207237492584536, "step": 6278, "train/sim_loss": 0.0703125 }, { "epoch": 0.6207237492584536, "step": 6278, "train/total_loss": 0.14030036330223083 }, { "entropy": 9.205509185791016, "epoch": 0.6208226221079691, "mean_token_accuracy": 0.7695418000221252, "num_tokens": 11860317.0, "step": 6279, "train/ce_loss": 4.41487060243162e-07 }, { "epoch": 0.6208226221079691, "step": 6279, "train/sim_loss": 0.01953125 }, { "epoch": 0.6208226221079691, "step": 6279, "train/total_loss": 0.01953129470348358 }, { "epoch": 0.6209214949574847, "grad_norm": 0.5874653458595276, "learning_rate": 8.450032141620927e-06, "loss": 0.1324, "step": 6280 }, { "entropy": 9.59320068359375, "epoch": 0.6209214949574847, "mean_token_accuracy": 0.7029703259468079, "num_tokens": 11865216.0, "step": 6280, "train/ce_loss": 9.11168342554447e-07 }, { "epoch": 0.6209214949574847, "step": 6280, "train/sim_loss": 0.03515625 }, { "epoch": 0.6209214949574847, "step": 6280, "train/total_loss": 0.03515633940696716 }, { "entropy": 9.023270606994629, "epoch": 0.6210203678070002, "mean_token_accuracy": 0.686274528503418, "num_tokens": 11870455.0, "step": 6281, "train/ce_loss": 0.9922044277191162 }, { "epoch": 0.6210203678070002, "step": 6281, "train/sim_loss": 0.08984375 }, { "epoch": 0.6210203678070002, "step": 6281, "train/total_loss": 0.18906420469284058 }, { "entropy": 9.550705909729004, "epoch": 0.6211192406565157, "mean_token_accuracy": 0.7443609237670898, "num_tokens": 11875282.0, "step": 6282, "train/ce_loss": 2.086510903609451e-06 }, { "epoch": 0.6211192406565157, "step": 6282, "train/sim_loss": 0.0390625 }, { "epoch": 0.6211192406565157, "step": 6282, "train/total_loss": 0.039062708616256714 }, { "entropy": 9.219742774963379, "epoch": 0.6212181135060313, "mean_token_accuracy": 0.7130434513092041, "num_tokens": 11880299.0, "step": 6283, "train/ce_loss": 1.163111686706543 }, { "epoch": 0.6212181135060313, "step": 6283, "train/sim_loss": 0.0390625 }, { "epoch": 0.6212181135060313, "step": 6283, "train/total_loss": 0.15537366271018982 }, { "entropy": 9.4982271194458, "epoch": 0.6213169863555468, "mean_token_accuracy": 0.701646089553833, "num_tokens": 11885192.0, "step": 6284, "train/ce_loss": 2.250600814819336 }, { "epoch": 0.6213169863555468, "step": 6284, "train/sim_loss": 0.09375 }, { "epoch": 0.6213169863555468, "step": 6284, "train/total_loss": 0.3188101053237915 }, { "entropy": 9.323200225830078, "epoch": 0.6214158592050623, "mean_token_accuracy": 0.8364197611808777, "num_tokens": 11890271.0, "step": 6285, "train/ce_loss": 0.8418030142784119 }, { "epoch": 0.6214158592050623, "step": 6285, "train/sim_loss": 0.0390625 }, { "epoch": 0.6214158592050623, "step": 6285, "train/total_loss": 0.1232428029179573 }, { "entropy": 8.596650123596191, "epoch": 0.6215147320545779, "mean_token_accuracy": 0.7080745100975037, "num_tokens": 11895712.0, "step": 6286, "train/ce_loss": 1.0133122205734253 }, { "epoch": 0.6215147320545779, "step": 6286, "train/sim_loss": 0.05859375 }, { "epoch": 0.6215147320545779, "step": 6286, "train/total_loss": 0.15992498397827148 }, { "entropy": 8.95394515991211, "epoch": 0.6216136049040933, "mean_token_accuracy": 0.7142857313156128, "num_tokens": 11901026.0, "step": 6287, "train/ce_loss": 1.2561620473861694 }, { "epoch": 0.6216136049040933, "step": 6287, "train/sim_loss": 0.09375 }, { "epoch": 0.6216136049040933, "step": 6287, "train/total_loss": 0.21936620771884918 }, { "entropy": 9.9725341796875, "epoch": 0.6217124777536088, "mean_token_accuracy": 0.8008849620819092, "num_tokens": 11905646.0, "step": 6288, "train/ce_loss": 2.3084328174591064 }, { "epoch": 0.6217124777536088, "step": 6288, "train/sim_loss": 0.046875 }, { "epoch": 0.6217124777536088, "step": 6288, "train/total_loss": 0.27771830558776855 }, { "entropy": 8.914336204528809, "epoch": 0.6218113506031244, "mean_token_accuracy": 0.7150062918663025, "num_tokens": 11910907.0, "step": 6289, "train/ce_loss": 1.248734712600708 }, { "epoch": 0.6218113506031244, "step": 6289, "train/sim_loss": 0.0625 }, { "epoch": 0.6218113506031244, "step": 6289, "train/total_loss": 0.18737347424030304 }, { "entropy": 8.865591049194336, "epoch": 0.6219102234526399, "mean_token_accuracy": 0.6875712871551514, "num_tokens": 11916245.0, "step": 6290, "train/ce_loss": 1.488316297531128 }, { "epoch": 0.6219102234526399, "step": 6290, "train/sim_loss": 0.03125 }, { "epoch": 0.6219102234526399, "step": 6290, "train/total_loss": 0.18008163571357727 }, { "entropy": 9.334177017211914, "epoch": 0.6220090963021554, "mean_token_accuracy": 0.7226890921592712, "num_tokens": 11921281.0, "step": 6291, "train/ce_loss": 1.2111235857009888 }, { "epoch": 0.6220090963021554, "step": 6291, "train/sim_loss": 0.046875 }, { "epoch": 0.6220090963021554, "step": 6291, "train/total_loss": 0.16798736155033112 }, { "entropy": 8.802948951721191, "epoch": 0.622107969151671, "mean_token_accuracy": 0.7705128192901611, "num_tokens": 11926530.0, "step": 6292, "train/ce_loss": 0.5217919945716858 }, { "epoch": 0.622107969151671, "step": 6292, "train/sim_loss": 0.0625 }, { "epoch": 0.622107969151671, "step": 6292, "train/total_loss": 0.11467920243740082 }, { "entropy": 9.367215156555176, "epoch": 0.6222068420011865, "mean_token_accuracy": 0.6818181872367859, "num_tokens": 11931571.0, "step": 6293, "train/ce_loss": 1.4454087018966675 }, { "epoch": 0.6222068420011865, "step": 6293, "train/sim_loss": 0.06640625 }, { "epoch": 0.6222068420011865, "step": 6293, "train/total_loss": 0.21094712615013123 }, { "entropy": 8.841418266296387, "epoch": 0.622305714850702, "mean_token_accuracy": 0.7589802742004395, "num_tokens": 11937241.0, "step": 6294, "train/ce_loss": 0.7581228613853455 }, { "epoch": 0.622305714850702, "step": 6294, "train/sim_loss": 0.0625 }, { "epoch": 0.622305714850702, "step": 6294, "train/total_loss": 0.13831228017807007 }, { "entropy": 8.810118675231934, "epoch": 0.6224045877002176, "mean_token_accuracy": 0.7293986678123474, "num_tokens": 11942602.0, "step": 6295, "train/ce_loss": 0.8448886275291443 }, { "epoch": 0.6224045877002176, "step": 6295, "train/sim_loss": 0.03125 }, { "epoch": 0.6224045877002176, "step": 6295, "train/total_loss": 0.11573886126279831 }, { "entropy": 8.876449584960938, "epoch": 0.622503460549733, "mean_token_accuracy": 0.7777777910232544, "num_tokens": 11947897.0, "step": 6296, "train/ce_loss": 0.6751101613044739 }, { "epoch": 0.622503460549733, "step": 6296, "train/sim_loss": 0.03125 }, { "epoch": 0.622503460549733, "step": 6296, "train/total_loss": 0.09876101464033127 }, { "entropy": 9.05276107788086, "epoch": 0.6226023333992485, "mean_token_accuracy": 0.7639751434326172, "num_tokens": 11953155.0, "step": 6297, "train/ce_loss": 0.8308619260787964 }, { "epoch": 0.6226023333992485, "step": 6297, "train/sim_loss": 0.08984375 }, { "epoch": 0.6226023333992485, "step": 6297, "train/total_loss": 0.17292994260787964 }, { "entropy": 9.669578552246094, "epoch": 0.6227012062487641, "mean_token_accuracy": 0.7356321811676025, "num_tokens": 11957943.0, "step": 6298, "train/ce_loss": 1.627619981765747 }, { "epoch": 0.6227012062487641, "step": 6298, "train/sim_loss": 0.06640625 }, { "epoch": 0.6227012062487641, "step": 6298, "train/total_loss": 0.22916825115680695 }, { "entropy": 8.85727596282959, "epoch": 0.6228000790982796, "mean_token_accuracy": 0.7109066843986511, "num_tokens": 11963151.0, "step": 6299, "train/ce_loss": 1.0590969324111938 }, { "epoch": 0.6228000790982796, "step": 6299, "train/sim_loss": 0.03515625 }, { "epoch": 0.6228000790982796, "step": 6299, "train/total_loss": 0.14106595516204834 }, { "epoch": 0.6228989519477951, "grad_norm": 0.6998578906059265, "learning_rate": 8.445087276862979e-06, "loss": 0.1341, "step": 6300 }, { "entropy": 9.62222671508789, "epoch": 0.6228989519477951, "mean_token_accuracy": 0.7442307472229004, "num_tokens": 11968136.0, "step": 6300, "train/ce_loss": 0.5273857116699219 }, { "epoch": 0.6228989519477951, "step": 6300, "train/sim_loss": 0.0390625 }, { "epoch": 0.6228989519477951, "step": 6300, "train/total_loss": 0.09180107712745667 }, { "entropy": 9.020427703857422, "epoch": 0.6229978247973107, "mean_token_accuracy": 0.7334167957305908, "num_tokens": 11973427.0, "step": 6301, "train/ce_loss": 0.9961581230163574 }, { "epoch": 0.6229978247973107, "step": 6301, "train/sim_loss": 0.09375 }, { "epoch": 0.6229978247973107, "step": 6301, "train/total_loss": 0.19336581230163574 }, { "entropy": 9.166910171508789, "epoch": 0.6230966976468262, "mean_token_accuracy": 0.7630137205123901, "num_tokens": 11978577.0, "step": 6302, "train/ce_loss": 0.8610888123512268 }, { "epoch": 0.6230966976468262, "step": 6302, "train/sim_loss": 0.1015625 }, { "epoch": 0.6230966976468262, "step": 6302, "train/total_loss": 0.18767139315605164 }, { "entropy": 8.74166488647461, "epoch": 0.6231955704963417, "mean_token_accuracy": 0.7590497732162476, "num_tokens": 11983959.0, "step": 6303, "train/ce_loss": 0.8055565357208252 }, { "epoch": 0.6231955704963417, "step": 6303, "train/sim_loss": 0.04296875 }, { "epoch": 0.6231955704963417, "step": 6303, "train/total_loss": 0.12352440506219864 }, { "entropy": 9.161663055419922, "epoch": 0.6232944433458573, "mean_token_accuracy": 0.8209876418113708, "num_tokens": 11989073.0, "step": 6304, "train/ce_loss": 4.1683088056743145e-06 }, { "epoch": 0.6232944433458573, "step": 6304, "train/sim_loss": 0.0234375 }, { "epoch": 0.6232944433458573, "step": 6304, "train/total_loss": 0.023437917232513428 }, { "entropy": 8.7100830078125, "epoch": 0.6233933161953727, "mean_token_accuracy": 0.7294994592666626, "num_tokens": 11994481.0, "step": 6305, "train/ce_loss": 1.027742624282837 }, { "epoch": 0.6233933161953727, "step": 6305, "train/sim_loss": 0.0546875 }, { "epoch": 0.6233933161953727, "step": 6305, "train/total_loss": 0.1574617624282837 }, { "entropy": 8.905563354492188, "epoch": 0.6234921890448882, "mean_token_accuracy": 0.728672981262207, "num_tokens": 11999823.0, "step": 6306, "train/ce_loss": 1.1297070980072021 }, { "epoch": 0.6234921890448882, "step": 6306, "train/sim_loss": 0.078125 }, { "epoch": 0.6234921890448882, "step": 6306, "train/total_loss": 0.19109570980072021 }, { "entropy": 8.815194129943848, "epoch": 0.6235910618944038, "mean_token_accuracy": 0.7109634280204773, "num_tokens": 12005220.0, "step": 6307, "train/ce_loss": 1.033612608909607 }, { "epoch": 0.6235910618944038, "step": 6307, "train/sim_loss": 0.078125 }, { "epoch": 0.6235910618944038, "step": 6307, "train/total_loss": 0.18148626387119293 }, { "entropy": 9.242063522338867, "epoch": 0.6236899347439193, "mean_token_accuracy": 0.7588757276535034, "num_tokens": 12010285.0, "step": 6308, "train/ce_loss": 1.2583582247316372e-06 }, { "epoch": 0.6236899347439193, "step": 6308, "train/sim_loss": 0.02734375 }, { "epoch": 0.6236899347439193, "step": 6308, "train/total_loss": 0.027343876659870148 }, { "entropy": 8.727019309997559, "epoch": 0.6237888075934348, "mean_token_accuracy": 0.7207637429237366, "num_tokens": 12015584.0, "step": 6309, "train/ce_loss": 0.8390503525733948 }, { "epoch": 0.6237888075934348, "step": 6309, "train/sim_loss": 0.12109375 }, { "epoch": 0.6237888075934348, "step": 6309, "train/total_loss": 0.20499879121780396 }, { "entropy": 8.89903450012207, "epoch": 0.6238876804429504, "mean_token_accuracy": 0.7966963052749634, "num_tokens": 12020875.0, "step": 6310, "train/ce_loss": 0.5423491597175598 }, { "epoch": 0.6238876804429504, "step": 6310, "train/sim_loss": 0.13671875 }, { "epoch": 0.6238876804429504, "step": 6310, "train/total_loss": 0.19095367193222046 }, { "entropy": 9.328275680541992, "epoch": 0.6239865532924659, "mean_token_accuracy": 0.7196030020713806, "num_tokens": 12025714.0, "step": 6311, "train/ce_loss": 2.41526198387146 }, { "epoch": 0.6239865532924659, "step": 6311, "train/sim_loss": 0.0546875 }, { "epoch": 0.6239865532924659, "step": 6311, "train/total_loss": 0.29621368646621704 }, { "entropy": 9.257919311523438, "epoch": 0.6240854261419814, "mean_token_accuracy": 0.730659008026123, "num_tokens": 12030887.0, "step": 6312, "train/ce_loss": 0.5913631319999695 }, { "epoch": 0.6240854261419814, "step": 6312, "train/sim_loss": 0.08203125 }, { "epoch": 0.6240854261419814, "step": 6312, "train/total_loss": 0.1411675661802292 }, { "entropy": 8.812848091125488, "epoch": 0.624184298991497, "mean_token_accuracy": 0.7369697093963623, "num_tokens": 12036388.0, "step": 6313, "train/ce_loss": 0.6528817415237427 }, { "epoch": 0.624184298991497, "step": 6313, "train/sim_loss": 0.05859375 }, { "epoch": 0.624184298991497, "step": 6313, "train/total_loss": 0.12388192862272263 }, { "entropy": 8.925192832946777, "epoch": 0.6242831718410125, "mean_token_accuracy": 0.7349260449409485, "num_tokens": 12041744.0, "step": 6314, "train/ce_loss": 0.751113772392273 }, { "epoch": 0.6242831718410125, "step": 6314, "train/sim_loss": 0.05078125 }, { "epoch": 0.6242831718410125, "step": 6314, "train/total_loss": 0.12589263916015625 }, { "entropy": 9.45772933959961, "epoch": 0.6243820446905279, "mean_token_accuracy": 0.7693575024604797, "num_tokens": 12046786.0, "step": 6315, "train/ce_loss": 0.7212726473808289 }, { "epoch": 0.6243820446905279, "step": 6315, "train/sim_loss": 0.03125 }, { "epoch": 0.6243820446905279, "step": 6315, "train/total_loss": 0.10337726771831512 }, { "entropy": 9.179908752441406, "epoch": 0.6244809175400435, "mean_token_accuracy": 0.7732793688774109, "num_tokens": 12051926.0, "step": 6316, "train/ce_loss": 0.7527188062667847 }, { "epoch": 0.6244809175400435, "step": 6316, "train/sim_loss": 0.03125 }, { "epoch": 0.6244809175400435, "step": 6316, "train/total_loss": 0.10652188211679459 }, { "entropy": 8.725812911987305, "epoch": 0.624579790389559, "mean_token_accuracy": 0.6998950839042664, "num_tokens": 12057407.0, "step": 6317, "train/ce_loss": 1.3678371906280518 }, { "epoch": 0.624579790389559, "step": 6317, "train/sim_loss": 0.01953125 }, { "epoch": 0.624579790389559, "step": 6317, "train/total_loss": 0.15631496906280518 }, { "entropy": 8.594905853271484, "epoch": 0.6246786632390745, "mean_token_accuracy": 0.7718191146850586, "num_tokens": 12062862.0, "step": 6318, "train/ce_loss": 0.7105671763420105 }, { "epoch": 0.6246786632390745, "step": 6318, "train/sim_loss": 0.046875 }, { "epoch": 0.6246786632390745, "step": 6318, "train/total_loss": 0.11793171614408493 }, { "entropy": 8.683724403381348, "epoch": 0.6247775360885901, "mean_token_accuracy": 0.8104794025421143, "num_tokens": 12068199.0, "step": 6319, "train/ce_loss": 0.5049375891685486 }, { "epoch": 0.6247775360885901, "step": 6319, "train/sim_loss": 0.02734375 }, { "epoch": 0.6247775360885901, "step": 6319, "train/total_loss": 0.0778375118970871 }, { "epoch": 0.6248764089381056, "grad_norm": 0.549186110496521, "learning_rate": 8.44014241210503e-06, "loss": 0.1383, "step": 6320 }, { "entropy": 9.419797897338867, "epoch": 0.6248764089381056, "mean_token_accuracy": 0.6722129583358765, "num_tokens": 12073177.0, "step": 6320, "train/ce_loss": 1.4812781810760498 }, { "epoch": 0.6248764089381056, "step": 6320, "train/sim_loss": 0.078125 }, { "epoch": 0.6248764089381056, "step": 6320, "train/total_loss": 0.22625282406806946 }, { "entropy": 9.006487846374512, "epoch": 0.6249752817876211, "mean_token_accuracy": 0.7139561772346497, "num_tokens": 12078506.0, "step": 6321, "train/ce_loss": 1.4044777154922485 }, { "epoch": 0.6249752817876211, "step": 6321, "train/sim_loss": 0.0390625 }, { "epoch": 0.6249752817876211, "step": 6321, "train/total_loss": 0.17951028048992157 }, { "entropy": 8.784879684448242, "epoch": 0.6250741546371367, "mean_token_accuracy": 0.7167947292327881, "num_tokens": 12083920.0, "step": 6322, "train/ce_loss": 0.6876106858253479 }, { "epoch": 0.6250741546371367, "step": 6322, "train/sim_loss": 0.12109375 }, { "epoch": 0.6250741546371367, "step": 6322, "train/total_loss": 0.18985483050346375 }, { "entropy": 9.060832977294922, "epoch": 0.6251730274866522, "mean_token_accuracy": 0.7107232213020325, "num_tokens": 12089167.0, "step": 6323, "train/ce_loss": 0.8286123871803284 }, { "epoch": 0.6251730274866522, "step": 6323, "train/sim_loss": 0.0625 }, { "epoch": 0.6251730274866522, "step": 6323, "train/total_loss": 0.14536124467849731 }, { "entropy": 9.480755805969238, "epoch": 0.6252719003361676, "mean_token_accuracy": 0.7591623067855835, "num_tokens": 12093963.0, "step": 6324, "train/ce_loss": 1.2280468940734863 }, { "epoch": 0.6252719003361676, "step": 6324, "train/sim_loss": 0.06640625 }, { "epoch": 0.6252719003361676, "step": 6324, "train/total_loss": 0.1892109513282776 }, { "entropy": 9.252645492553711, "epoch": 0.6253707731856832, "mean_token_accuracy": 0.6762226223945618, "num_tokens": 12099018.0, "step": 6325, "train/ce_loss": 1.6568008661270142 }, { "epoch": 0.6253707731856832, "step": 6325, "train/sim_loss": 0.05859375 }, { "epoch": 0.6253707731856832, "step": 6325, "train/total_loss": 0.22427384555339813 }, { "entropy": 9.285510063171387, "epoch": 0.6254696460351987, "mean_token_accuracy": 0.7503828406333923, "num_tokens": 12104055.0, "step": 6326, "train/ce_loss": 0.9159083366394043 }, { "epoch": 0.6254696460351987, "step": 6326, "train/sim_loss": 0.125 }, { "epoch": 0.6254696460351987, "step": 6326, "train/total_loss": 0.21659083664417267 }, { "entropy": 9.442464828491211, "epoch": 0.6255685188847142, "mean_token_accuracy": 0.7549019455909729, "num_tokens": 12109112.0, "step": 6327, "train/ce_loss": 1.071745753288269 }, { "epoch": 0.6255685188847142, "step": 6327, "train/sim_loss": 0.04296875 }, { "epoch": 0.6255685188847142, "step": 6327, "train/total_loss": 0.1501433253288269 }, { "entropy": 8.657928466796875, "epoch": 0.6256673917342298, "mean_token_accuracy": 0.7690762877464294, "num_tokens": 12114601.0, "step": 6328, "train/ce_loss": 0.8865790367126465 }, { "epoch": 0.6256673917342298, "step": 6328, "train/sim_loss": 0.02734375 }, { "epoch": 0.6256673917342298, "step": 6328, "train/total_loss": 0.116001658141613 }, { "entropy": 9.665094375610352, "epoch": 0.6257662645837453, "mean_token_accuracy": 0.7170731425285339, "num_tokens": 12119404.0, "step": 6329, "train/ce_loss": 1.880294919013977 }, { "epoch": 0.6257662645837453, "step": 6329, "train/sim_loss": 0.046875 }, { "epoch": 0.6257662645837453, "step": 6329, "train/total_loss": 0.23490449786186218 }, { "entropy": 9.09188461303711, "epoch": 0.6258651374332608, "mean_token_accuracy": 0.7213656306266785, "num_tokens": 12124754.0, "step": 6330, "train/ce_loss": 1.0878103971481323 }, { "epoch": 0.6258651374332608, "step": 6330, "train/sim_loss": 0.046875 }, { "epoch": 0.6258651374332608, "step": 6330, "train/total_loss": 0.15565603971481323 }, { "entropy": 8.86585807800293, "epoch": 0.6259640102827764, "mean_token_accuracy": 0.6994949579238892, "num_tokens": 12130026.0, "step": 6331, "train/ce_loss": 0.7202993631362915 }, { "epoch": 0.6259640102827764, "step": 6331, "train/sim_loss": 0.078125 }, { "epoch": 0.6259640102827764, "step": 6331, "train/total_loss": 0.1501549482345581 }, { "entropy": 9.033773422241211, "epoch": 0.6260628831322919, "mean_token_accuracy": 0.800000011920929, "num_tokens": 12135253.0, "step": 6332, "train/ce_loss": 1.0049701586467563e-06 }, { "epoch": 0.6260628831322919, "step": 6332, "train/sim_loss": 0.06640625 }, { "epoch": 0.6260628831322919, "step": 6332, "train/total_loss": 0.06640634685754776 }, { "entropy": 8.9903564453125, "epoch": 0.6261617559818073, "mean_token_accuracy": 0.7041420340538025, "num_tokens": 12140546.0, "step": 6333, "train/ce_loss": 1.2116358280181885 }, { "epoch": 0.6261617559818073, "step": 6333, "train/sim_loss": 0.05078125 }, { "epoch": 0.6261617559818073, "step": 6333, "train/total_loss": 0.17194482684135437 }, { "entropy": 8.666067123413086, "epoch": 0.6262606288313229, "mean_token_accuracy": 0.7283422350883484, "num_tokens": 12145945.0, "step": 6334, "train/ce_loss": 0.9988052248954773 }, { "epoch": 0.6262606288313229, "step": 6334, "train/sim_loss": 0.03515625 }, { "epoch": 0.6262606288313229, "step": 6334, "train/total_loss": 0.13503676652908325 }, { "entropy": 8.50477409362793, "epoch": 0.6263595016808384, "mean_token_accuracy": 0.7868852615356445, "num_tokens": 12151558.0, "step": 6335, "train/ce_loss": 0.24824056029319763 }, { "epoch": 0.6263595016808384, "step": 6335, "train/sim_loss": 0.0234375 }, { "epoch": 0.6263595016808384, "step": 6335, "train/total_loss": 0.04826155677437782 }, { "entropy": 8.916728973388672, "epoch": 0.6264583745303539, "mean_token_accuracy": 0.7371134161949158, "num_tokens": 12156812.0, "step": 6336, "train/ce_loss": 0.47676023840904236 }, { "epoch": 0.6264583745303539, "step": 6336, "train/sim_loss": 0.05078125 }, { "epoch": 0.6264583745303539, "step": 6336, "train/total_loss": 0.09845727682113647 }, { "entropy": 8.71766471862793, "epoch": 0.6265572473798695, "mean_token_accuracy": 0.749417245388031, "num_tokens": 12162113.0, "step": 6337, "train/ce_loss": 0.6502630114555359 }, { "epoch": 0.6265572473798695, "step": 6337, "train/sim_loss": 0.0703125 }, { "epoch": 0.6265572473798695, "step": 6337, "train/total_loss": 0.13533881306648254 }, { "entropy": 8.337970733642578, "epoch": 0.626656120229385, "mean_token_accuracy": 0.7336448431015015, "num_tokens": 12167685.0, "step": 6338, "train/ce_loss": 0.7648563981056213 }, { "epoch": 0.626656120229385, "step": 6338, "train/sim_loss": 0.015625 }, { "epoch": 0.626656120229385, "step": 6338, "train/total_loss": 0.09211064130067825 }, { "entropy": 9.744911193847656, "epoch": 0.6267549930789005, "mean_token_accuracy": 0.7819905281066895, "num_tokens": 12172556.0, "step": 6339, "train/ce_loss": 2.3105878881324315e-06 }, { "epoch": 0.6267549930789005, "step": 6339, "train/sim_loss": 0.046875 }, { "epoch": 0.6267549930789005, "step": 6339, "train/total_loss": 0.046875230967998505 }, { "epoch": 0.6268538659284161, "grad_norm": 0.67138671875, "learning_rate": 8.43519754734708e-06, "loss": 0.1386, "step": 6340 }, { "entropy": 8.503151893615723, "epoch": 0.6268538659284161, "mean_token_accuracy": 0.7273972630500793, "num_tokens": 12177778.0, "step": 6340, "train/ce_loss": 0.588840126991272 }, { "epoch": 0.6268538659284161, "step": 6340, "train/sim_loss": 0.08984375 }, { "epoch": 0.6268538659284161, "step": 6340, "train/total_loss": 0.14872775971889496 }, { "entropy": 9.482362747192383, "epoch": 0.6269527387779316, "mean_token_accuracy": 0.779552698135376, "num_tokens": 12182813.0, "step": 6341, "train/ce_loss": 0.9476832151412964 }, { "epoch": 0.6269527387779316, "step": 6341, "train/sim_loss": 0.0625 }, { "epoch": 0.6269527387779316, "step": 6341, "train/total_loss": 0.15726831555366516 }, { "entropy": 9.053950309753418, "epoch": 0.6270516116274472, "mean_token_accuracy": 0.7777777910232544, "num_tokens": 12188013.0, "step": 6342, "train/ce_loss": 0.9743828773498535 }, { "epoch": 0.6270516116274472, "step": 6342, "train/sim_loss": 0.0546875 }, { "epoch": 0.6270516116274472, "step": 6342, "train/total_loss": 0.1521257907152176 }, { "entropy": 9.118183135986328, "epoch": 0.6271504844769626, "mean_token_accuracy": 0.7409972548484802, "num_tokens": 12193201.0, "step": 6343, "train/ce_loss": 0.8215731978416443 }, { "epoch": 0.6271504844769626, "step": 6343, "train/sim_loss": 0.1015625 }, { "epoch": 0.6271504844769626, "step": 6343, "train/total_loss": 0.18371981382369995 }, { "entropy": 9.800601959228516, "epoch": 0.6272493573264781, "mean_token_accuracy": 0.7318007946014404, "num_tokens": 12198142.0, "step": 6344, "train/ce_loss": 1.7427619695663452 }, { "epoch": 0.6272493573264781, "step": 6344, "train/sim_loss": 0.06640625 }, { "epoch": 0.6272493573264781, "step": 6344, "train/total_loss": 0.240682452917099 }, { "entropy": 9.138225555419922, "epoch": 0.6273482301759937, "mean_token_accuracy": 0.7191176414489746, "num_tokens": 12203236.0, "step": 6345, "train/ce_loss": 1.5853768587112427 }, { "epoch": 0.6273482301759937, "step": 6345, "train/sim_loss": 0.03125 }, { "epoch": 0.6273482301759937, "step": 6345, "train/total_loss": 0.18978768587112427 }, { "entropy": 8.585261344909668, "epoch": 0.6274471030255092, "mean_token_accuracy": 0.7087967395782471, "num_tokens": 12208698.0, "step": 6346, "train/ce_loss": 0.6669883131980896 }, { "epoch": 0.6274471030255092, "step": 6346, "train/sim_loss": 0.05859375 }, { "epoch": 0.6274471030255092, "step": 6346, "train/total_loss": 0.1252925843000412 }, { "entropy": 8.82328987121582, "epoch": 0.6275459758750247, "mean_token_accuracy": 0.7107329964637756, "num_tokens": 12213926.0, "step": 6347, "train/ce_loss": 0.7975558042526245 }, { "epoch": 0.6275459758750247, "step": 6347, "train/sim_loss": 0.0703125 }, { "epoch": 0.6275459758750247, "step": 6347, "train/total_loss": 0.15006807446479797 }, { "entropy": 8.962874412536621, "epoch": 0.6276448487245403, "mean_token_accuracy": 0.7805164456367493, "num_tokens": 12219260.0, "step": 6348, "train/ce_loss": 0.5232795476913452 }, { "epoch": 0.6276448487245403, "step": 6348, "train/sim_loss": 0.0234375 }, { "epoch": 0.6276448487245403, "step": 6348, "train/total_loss": 0.075765460729599 }, { "entropy": 9.269477844238281, "epoch": 0.6277437215740558, "mean_token_accuracy": 0.7456647157669067, "num_tokens": 12224416.0, "step": 6349, "train/ce_loss": 1.0365246534347534 }, { "epoch": 0.6277437215740558, "step": 6349, "train/sim_loss": 0.08984375 }, { "epoch": 0.6277437215740558, "step": 6349, "train/total_loss": 0.1934962272644043 }, { "entropy": 8.638045310974121, "epoch": 0.6278425944235713, "mean_token_accuracy": 0.7481323480606079, "num_tokens": 12229876.0, "step": 6350, "train/ce_loss": 0.6363872289657593 }, { "epoch": 0.6278425944235713, "step": 6350, "train/sim_loss": 0.02734375 }, { "epoch": 0.6278425944235713, "step": 6350, "train/total_loss": 0.09098247438669205 }, { "entropy": 9.32752799987793, "epoch": 0.6279414672730869, "mean_token_accuracy": 0.7577807903289795, "num_tokens": 12235072.0, "step": 6351, "train/ce_loss": 0.5960943698883057 }, { "epoch": 0.6279414672730869, "step": 6351, "train/sim_loss": 0.015625 }, { "epoch": 0.6279414672730869, "step": 6351, "train/total_loss": 0.07523444294929504 }, { "entropy": 8.9277925491333, "epoch": 0.6280403401226023, "mean_token_accuracy": 0.7244501709938049, "num_tokens": 12240321.0, "step": 6352, "train/ce_loss": 0.8829100728034973 }, { "epoch": 0.6280403401226023, "step": 6352, "train/sim_loss": 0.04296875 }, { "epoch": 0.6280403401226023, "step": 6352, "train/total_loss": 0.1312597692012787 }, { "entropy": 9.443146705627441, "epoch": 0.6281392129721178, "mean_token_accuracy": 0.7212317585945129, "num_tokens": 12245356.0, "step": 6353, "train/ce_loss": 1.0199133157730103 }, { "epoch": 0.6281392129721178, "step": 6353, "train/sim_loss": 0.0625 }, { "epoch": 0.6281392129721178, "step": 6353, "train/total_loss": 0.16449132561683655 }, { "entropy": 9.374839782714844, "epoch": 0.6282380858216334, "mean_token_accuracy": 0.7766666412353516, "num_tokens": 12250374.0, "step": 6354, "train/ce_loss": 0.8880688548088074 }, { "epoch": 0.6282380858216334, "step": 6354, "train/sim_loss": 0.05078125 }, { "epoch": 0.6282380858216334, "step": 6354, "train/total_loss": 0.1395881474018097 }, { "entropy": 8.867729187011719, "epoch": 0.6283369586711489, "mean_token_accuracy": 0.732119619846344, "num_tokens": 12255585.0, "step": 6355, "train/ce_loss": 1.7234163284301758 }, { "epoch": 0.6283369586711489, "step": 6355, "train/sim_loss": 0.078125 }, { "epoch": 0.6283369586711489, "step": 6355, "train/total_loss": 0.25046664476394653 }, { "entropy": 9.388571739196777, "epoch": 0.6284358315206644, "mean_token_accuracy": 0.7986577153205872, "num_tokens": 12260623.0, "step": 6356, "train/ce_loss": 0.6396639943122864 }, { "epoch": 0.6284358315206644, "step": 6356, "train/sim_loss": 0.05859375 }, { "epoch": 0.6284358315206644, "step": 6356, "train/total_loss": 0.12256015092134476 }, { "entropy": 8.816532135009766, "epoch": 0.62853470437018, "mean_token_accuracy": 0.8092672228813171, "num_tokens": 12266060.0, "step": 6357, "train/ce_loss": 0.7067915201187134 }, { "epoch": 0.62853470437018, "step": 6357, "train/sim_loss": 0.0546875 }, { "epoch": 0.62853470437018, "step": 6357, "train/total_loss": 0.12536665797233582 }, { "entropy": 8.95190715789795, "epoch": 0.6286335772196955, "mean_token_accuracy": 0.7230320572853088, "num_tokens": 12271182.0, "step": 6358, "train/ce_loss": 0.9889823794364929 }, { "epoch": 0.6286335772196955, "step": 6358, "train/sim_loss": 0.06640625 }, { "epoch": 0.6286335772196955, "step": 6358, "train/total_loss": 0.16530448198318481 }, { "entropy": 8.838409423828125, "epoch": 0.628732450069211, "mean_token_accuracy": 0.7487437129020691, "num_tokens": 12276478.0, "step": 6359, "train/ce_loss": 0.9863818883895874 }, { "epoch": 0.628732450069211, "step": 6359, "train/sim_loss": 0.0546875 }, { "epoch": 0.628732450069211, "step": 6359, "train/total_loss": 0.15332569181919098 }, { "epoch": 0.6288313229187266, "grad_norm": 0.8530322313308716, "learning_rate": 8.430252682589132e-06, "loss": 0.1424, "step": 6360 }, { "entropy": 9.095026016235352, "epoch": 0.6288313229187266, "mean_token_accuracy": 0.74609375, "num_tokens": 12281716.0, "step": 6360, "train/ce_loss": 1.0583113431930542 }, { "epoch": 0.6288313229187266, "step": 6360, "train/sim_loss": 0.05078125 }, { "epoch": 0.6288313229187266, "step": 6360, "train/total_loss": 0.15661239624023438 }, { "entropy": 9.41407585144043, "epoch": 0.628930195768242, "mean_token_accuracy": 0.7508305907249451, "num_tokens": 12286747.0, "step": 6361, "train/ce_loss": 1.2370884418487549 }, { "epoch": 0.628930195768242, "step": 6361, "train/sim_loss": 0.046875 }, { "epoch": 0.628930195768242, "step": 6361, "train/total_loss": 0.1705838441848755 }, { "entropy": 9.389739036560059, "epoch": 0.6290290686177575, "mean_token_accuracy": 0.715925395488739, "num_tokens": 12291911.0, "step": 6362, "train/ce_loss": 1.855309247970581 }, { "epoch": 0.6290290686177575, "step": 6362, "train/sim_loss": 0.08203125 }, { "epoch": 0.6290290686177575, "step": 6362, "train/total_loss": 0.2675621807575226 }, { "entropy": 8.758782386779785, "epoch": 0.6291279414672731, "mean_token_accuracy": 0.7784290909767151, "num_tokens": 12297251.0, "step": 6363, "train/ce_loss": 1.0631740093231201 }, { "epoch": 0.6291279414672731, "step": 6363, "train/sim_loss": 0.0625 }, { "epoch": 0.6291279414672731, "step": 6363, "train/total_loss": 0.168817400932312 }, { "entropy": 9.42778491973877, "epoch": 0.6292268143167886, "mean_token_accuracy": 0.7454545497894287, "num_tokens": 12302070.0, "step": 6364, "train/ce_loss": 3.4973986657860223e-06 }, { "epoch": 0.6292268143167886, "step": 6364, "train/sim_loss": 0.05078125 }, { "epoch": 0.6292268143167886, "step": 6364, "train/total_loss": 0.050781600177288055 }, { "entropy": 9.25421142578125, "epoch": 0.6293256871663041, "mean_token_accuracy": 0.7796852588653564, "num_tokens": 12307195.0, "step": 6365, "train/ce_loss": 0.7625394463539124 }, { "epoch": 0.6293256871663041, "step": 6365, "train/sim_loss": 0.05078125 }, { "epoch": 0.6293256871663041, "step": 6365, "train/total_loss": 0.1270352005958557 }, { "entropy": 9.317729949951172, "epoch": 0.6294245600158197, "mean_token_accuracy": 0.737730085849762, "num_tokens": 12312297.0, "step": 6366, "train/ce_loss": 1.4506595134735107 }, { "epoch": 0.6294245600158197, "step": 6366, "train/sim_loss": 0.0546875 }, { "epoch": 0.6294245600158197, "step": 6366, "train/total_loss": 0.19975344836711884 }, { "entropy": 9.388843536376953, "epoch": 0.6295234328653352, "mean_token_accuracy": 0.7343096137046814, "num_tokens": 12317217.0, "step": 6367, "train/ce_loss": 4.443568286660593e-06 }, { "epoch": 0.6295234328653352, "step": 6367, "train/sim_loss": 0.04296875 }, { "epoch": 0.6295234328653352, "step": 6367, "train/total_loss": 0.04296919330954552 }, { "entropy": 9.684952735900879, "epoch": 0.6296223057148507, "mean_token_accuracy": 0.6673684120178223, "num_tokens": 12322082.0, "step": 6368, "train/ce_loss": 2.226114511489868 }, { "epoch": 0.6296223057148507, "step": 6368, "train/sim_loss": 0.0703125 }, { "epoch": 0.6296223057148507, "step": 6368, "train/total_loss": 0.2929239571094513 }, { "entropy": 9.337738037109375, "epoch": 0.6297211785643663, "mean_token_accuracy": 0.7398753762245178, "num_tokens": 12327158.0, "step": 6369, "train/ce_loss": 1.3865617513656616 }, { "epoch": 0.6297211785643663, "step": 6369, "train/sim_loss": 0.06640625 }, { "epoch": 0.6297211785643663, "step": 6369, "train/total_loss": 0.20506243407726288 }, { "entropy": 9.073951721191406, "epoch": 0.6298200514138818, "mean_token_accuracy": 0.7574578523635864, "num_tokens": 12332418.0, "step": 6370, "train/ce_loss": 0.981586217880249 }, { "epoch": 0.6298200514138818, "step": 6370, "train/sim_loss": 0.06640625 }, { "epoch": 0.6298200514138818, "step": 6370, "train/total_loss": 0.16456487774848938 }, { "entropy": 9.461869239807129, "epoch": 0.6299189242633972, "mean_token_accuracy": 0.7388888597488403, "num_tokens": 12337395.0, "step": 6371, "train/ce_loss": 1.225242018699646 }, { "epoch": 0.6299189242633972, "step": 6371, "train/sim_loss": 0.0859375 }, { "epoch": 0.6299189242633972, "step": 6371, "train/total_loss": 0.2084617018699646 }, { "entropy": 8.798147201538086, "epoch": 0.6300177971129128, "mean_token_accuracy": 0.7759162187576294, "num_tokens": 12342868.0, "step": 6372, "train/ce_loss": 0.590668797492981 }, { "epoch": 0.6300177971129128, "step": 6372, "train/sim_loss": 0.01953125 }, { "epoch": 0.6300177971129128, "step": 6372, "train/total_loss": 0.07859812676906586 }, { "entropy": 8.785633087158203, "epoch": 0.6301166699624283, "mean_token_accuracy": 0.7649824023246765, "num_tokens": 12348262.0, "step": 6373, "train/ce_loss": 0.49392861127853394 }, { "epoch": 0.6301166699624283, "step": 6373, "train/sim_loss": 0.05859375 }, { "epoch": 0.6301166699624283, "step": 6373, "train/total_loss": 0.10798661410808563 }, { "entropy": 9.257098197937012, "epoch": 0.6302155428119438, "mean_token_accuracy": 0.7217742204666138, "num_tokens": 12353458.0, "step": 6374, "train/ce_loss": 1.1889922618865967 }, { "epoch": 0.6302155428119438, "step": 6374, "train/sim_loss": 0.0859375 }, { "epoch": 0.6302155428119438, "step": 6374, "train/total_loss": 0.20483672618865967 }, { "entropy": 8.840173721313477, "epoch": 0.6303144156614594, "mean_token_accuracy": 0.7063491940498352, "num_tokens": 12358819.0, "step": 6375, "train/ce_loss": 1.1618380546569824 }, { "epoch": 0.6303144156614594, "step": 6375, "train/sim_loss": 0.0859375 }, { "epoch": 0.6303144156614594, "step": 6375, "train/total_loss": 0.2021213173866272 }, { "entropy": 9.291227340698242, "epoch": 0.6304132885109749, "mean_token_accuracy": 0.761049747467041, "num_tokens": 12363973.0, "step": 6376, "train/ce_loss": 1.1883305311203003 }, { "epoch": 0.6304132885109749, "step": 6376, "train/sim_loss": 0.0859375 }, { "epoch": 0.6304132885109749, "step": 6376, "train/total_loss": 0.20477056503295898 }, { "entropy": 8.358701705932617, "epoch": 0.6305121613604904, "mean_token_accuracy": 0.7315130829811096, "num_tokens": 12369348.0, "step": 6377, "train/ce_loss": 0.6801207661628723 }, { "epoch": 0.6305121613604904, "step": 6377, "train/sim_loss": 0.0625 }, { "epoch": 0.6305121613604904, "step": 6377, "train/total_loss": 0.1305120885372162 }, { "entropy": 8.610776901245117, "epoch": 0.630611034210006, "mean_token_accuracy": 0.728105902671814, "num_tokens": 12374810.0, "step": 6378, "train/ce_loss": 1.374704122543335 }, { "epoch": 0.630611034210006, "step": 6378, "train/sim_loss": 0.09375 }, { "epoch": 0.630611034210006, "step": 6378, "train/total_loss": 0.23122040927410126 }, { "entropy": 8.675466537475586, "epoch": 0.6307099070595215, "mean_token_accuracy": 0.7681159377098083, "num_tokens": 12380439.0, "step": 6379, "train/ce_loss": 1.0569945573806763 }, { "epoch": 0.6307099070595215, "step": 6379, "train/sim_loss": 0.0859375 }, { "epoch": 0.6307099070595215, "step": 6379, "train/total_loss": 0.19163694977760315 }, { "epoch": 0.6308087799090369, "grad_norm": 0.5189346671104431, "learning_rate": 8.425307817831183e-06, "loss": 0.1383, "step": 6380 }, { "entropy": 8.942543029785156, "epoch": 0.6308087799090369, "mean_token_accuracy": 0.7570093274116516, "num_tokens": 12385839.0, "step": 6380, "train/ce_loss": 1.4659301042556763 }, { "epoch": 0.6308087799090369, "step": 6380, "train/sim_loss": 0.06640625 }, { "epoch": 0.6308087799090369, "step": 6380, "train/total_loss": 0.21299926936626434 }, { "entropy": 8.634477615356445, "epoch": 0.6309076527585525, "mean_token_accuracy": 0.7297979593276978, "num_tokens": 12391104.0, "step": 6381, "train/ce_loss": 0.9466655254364014 }, { "epoch": 0.6309076527585525, "step": 6381, "train/sim_loss": 0.0625 }, { "epoch": 0.6309076527585525, "step": 6381, "train/total_loss": 0.15716655552387238 }, { "entropy": 9.339715957641602, "epoch": 0.631006525608068, "mean_token_accuracy": 0.6768377423286438, "num_tokens": 12396287.0, "step": 6382, "train/ce_loss": 1.5789258480072021 }, { "epoch": 0.631006525608068, "step": 6382, "train/sim_loss": 0.07421875 }, { "epoch": 0.631006525608068, "step": 6382, "train/total_loss": 0.23211133480072021 }, { "entropy": 8.758424758911133, "epoch": 0.6311053984575835, "mean_token_accuracy": 0.6982492208480835, "num_tokens": 12401773.0, "step": 6383, "train/ce_loss": 0.5240266919136047 }, { "epoch": 0.6311053984575835, "step": 6383, "train/sim_loss": 0.046875 }, { "epoch": 0.6311053984575835, "step": 6383, "train/total_loss": 0.09927767515182495 }, { "entropy": 8.72703742980957, "epoch": 0.6312042713070991, "mean_token_accuracy": 0.7908878326416016, "num_tokens": 12407133.0, "step": 6384, "train/ce_loss": 0.3531772792339325 }, { "epoch": 0.6312042713070991, "step": 6384, "train/sim_loss": 0.0234375 }, { "epoch": 0.6312042713070991, "step": 6384, "train/total_loss": 0.05875523015856743 }, { "entropy": 9.161531448364258, "epoch": 0.6313031441566146, "mean_token_accuracy": 0.7280831933021545, "num_tokens": 12412248.0, "step": 6385, "train/ce_loss": 0.7096350789070129 }, { "epoch": 0.6313031441566146, "step": 6385, "train/sim_loss": 0.0546875 }, { "epoch": 0.6313031441566146, "step": 6385, "train/total_loss": 0.12565100193023682 }, { "entropy": 9.38093376159668, "epoch": 0.6314020170061301, "mean_token_accuracy": 0.7904929518699646, "num_tokens": 12417271.0, "step": 6386, "train/ce_loss": 1.4010456652613357e-06 }, { "epoch": 0.6314020170061301, "step": 6386, "train/sim_loss": 0.046875 }, { "epoch": 0.6314020170061301, "step": 6386, "train/total_loss": 0.04687514156103134 }, { "entropy": 9.508302688598633, "epoch": 0.6315008898556457, "mean_token_accuracy": 0.7487603425979614, "num_tokens": 12422339.0, "step": 6387, "train/ce_loss": 1.1079400777816772 }, { "epoch": 0.6315008898556457, "step": 6387, "train/sim_loss": 0.0625 }, { "epoch": 0.6315008898556457, "step": 6387, "train/total_loss": 0.17329400777816772 }, { "entropy": 9.090853691101074, "epoch": 0.6315997627051612, "mean_token_accuracy": 0.7806913256645203, "num_tokens": 12427658.0, "step": 6388, "train/ce_loss": 1.1928500498470385e-06 }, { "epoch": 0.6315997627051612, "step": 6388, "train/sim_loss": 0.046875 }, { "epoch": 0.6315997627051612, "step": 6388, "train/total_loss": 0.04687511920928955 }, { "entropy": 9.914019584655762, "epoch": 0.6316986355546766, "mean_token_accuracy": 0.7288135886192322, "num_tokens": 12432343.0, "step": 6389, "train/ce_loss": 1.8008298873901367 }, { "epoch": 0.6316986355546766, "step": 6389, "train/sim_loss": 0.0859375 }, { "epoch": 0.6316986355546766, "step": 6389, "train/total_loss": 0.2660204768180847 }, { "entropy": 9.699853897094727, "epoch": 0.6317975084041922, "mean_token_accuracy": 0.8121951222419739, "num_tokens": 12437172.0, "step": 6390, "train/ce_loss": 1.049293875694275 }, { "epoch": 0.6317975084041922, "step": 6390, "train/sim_loss": 0.015625 }, { "epoch": 0.6317975084041922, "step": 6390, "train/total_loss": 0.12055438756942749 }, { "entropy": 9.337457656860352, "epoch": 0.6318963812537077, "mean_token_accuracy": 0.6853766441345215, "num_tokens": 12442307.0, "step": 6391, "train/ce_loss": 1.1118484735488892 }, { "epoch": 0.6318963812537077, "step": 6391, "train/sim_loss": 0.0703125 }, { "epoch": 0.6318963812537077, "step": 6391, "train/total_loss": 0.18149735033512115 }, { "entropy": 9.267208099365234, "epoch": 0.6319952541032232, "mean_token_accuracy": 0.7442622780799866, "num_tokens": 12447362.0, "step": 6392, "train/ce_loss": 1.1730303764343262 }, { "epoch": 0.6319952541032232, "step": 6392, "train/sim_loss": 0.078125 }, { "epoch": 0.6319952541032232, "step": 6392, "train/total_loss": 0.1954280436038971 }, { "entropy": 8.493053436279297, "epoch": 0.6320941269527388, "mean_token_accuracy": 0.8197908401489258, "num_tokens": 12453064.0, "step": 6393, "train/ce_loss": 0.5338155627250671 }, { "epoch": 0.6320941269527388, "step": 6393, "train/sim_loss": 0.02734375 }, { "epoch": 0.6320941269527388, "step": 6393, "train/total_loss": 0.08072531223297119 }, { "entropy": 8.964149475097656, "epoch": 0.6321929998022543, "mean_token_accuracy": 0.692396342754364, "num_tokens": 12458397.0, "step": 6394, "train/ce_loss": 0.34347906708717346 }, { "epoch": 0.6321929998022543, "step": 6394, "train/sim_loss": 0.03125 }, { "epoch": 0.6321929998022543, "step": 6394, "train/total_loss": 0.06559790670871735 }, { "entropy": 8.968805313110352, "epoch": 0.6322918726517698, "mean_token_accuracy": 0.7429577708244324, "num_tokens": 12463717.0, "step": 6395, "train/ce_loss": 1.5259385108947754 }, { "epoch": 0.6322918726517698, "step": 6395, "train/sim_loss": 0.06640625 }, { "epoch": 0.6322918726517698, "step": 6395, "train/total_loss": 0.21900010108947754 }, { "entropy": 9.250955581665039, "epoch": 0.6323907455012854, "mean_token_accuracy": 0.661556601524353, "num_tokens": 12469048.0, "step": 6396, "train/ce_loss": 0.9907170534133911 }, { "epoch": 0.6323907455012854, "step": 6396, "train/sim_loss": 0.04296875 }, { "epoch": 0.6323907455012854, "step": 6396, "train/total_loss": 0.1420404613018036 }, { "entropy": 9.317476272583008, "epoch": 0.6324896183508009, "mean_token_accuracy": 0.7774193286895752, "num_tokens": 12474121.0, "step": 6397, "train/ce_loss": 1.778739147084707e-06 }, { "epoch": 0.6324896183508009, "step": 6397, "train/sim_loss": 0.04296875 }, { "epoch": 0.6324896183508009, "step": 6397, "train/total_loss": 0.042968928813934326 }, { "entropy": 9.001506805419922, "epoch": 0.6325884912003163, "mean_token_accuracy": 0.7237196564674377, "num_tokens": 12479371.0, "step": 6398, "train/ce_loss": 0.8848857879638672 }, { "epoch": 0.6325884912003163, "step": 6398, "train/sim_loss": 0.0859375 }, { "epoch": 0.6325884912003163, "step": 6398, "train/total_loss": 0.17442607879638672 }, { "entropy": 9.355804443359375, "epoch": 0.632687364049832, "mean_token_accuracy": 0.7774086594581604, "num_tokens": 12484437.0, "step": 6399, "train/ce_loss": 0.8253676295280457 }, { "epoch": 0.632687364049832, "step": 6399, "train/sim_loss": 0.0234375 }, { "epoch": 0.632687364049832, "step": 6399, "train/total_loss": 0.10597426444292068 }, { "epoch": 0.6327862368993474, "grad_norm": 0.7332557439804077, "learning_rate": 8.420362953073235e-06, "loss": 0.1347, "step": 6400 }, { "entropy": 8.898918151855469, "epoch": 0.6327862368993474, "mean_token_accuracy": 0.7534246444702148, "num_tokens": 12489843.0, "step": 6400, "train/ce_loss": 1.0735923051834106 }, { "epoch": 0.6327862368993474, "step": 6400, "train/sim_loss": 0.05859375 }, { "epoch": 0.6327862368993474, "step": 6400, "train/total_loss": 0.16595298051834106 }, { "entropy": 8.987698554992676, "epoch": 0.6328851097488629, "mean_token_accuracy": 0.732824444770813, "num_tokens": 12495149.0, "step": 6401, "train/ce_loss": 1.0781718492507935 }, { "epoch": 0.6328851097488629, "step": 6401, "train/sim_loss": 0.0859375 }, { "epoch": 0.6328851097488629, "step": 6401, "train/total_loss": 0.19375468790531158 }, { "entropy": 9.062227249145508, "epoch": 0.6329839825983785, "mean_token_accuracy": 0.7556008100509644, "num_tokens": 12500041.0, "step": 6402, "train/ce_loss": 1.5401432165162987e-06 }, { "epoch": 0.6329839825983785, "step": 6402, "train/sim_loss": 0.03515625 }, { "epoch": 0.6329839825983785, "step": 6402, "train/total_loss": 0.03515640273690224 }, { "entropy": 9.318317413330078, "epoch": 0.633082855447894, "mean_token_accuracy": 0.6694560647010803, "num_tokens": 12505169.0, "step": 6403, "train/ce_loss": 4.1232300418414525e-07 }, { "epoch": 0.633082855447894, "step": 6403, "train/sim_loss": 0.01171875 }, { "epoch": 0.633082855447894, "step": 6403, "train/total_loss": 0.011718790978193283 }, { "entropy": 8.892905235290527, "epoch": 0.6331817282974095, "mean_token_accuracy": 0.8035503029823303, "num_tokens": 12510449.0, "step": 6404, "train/ce_loss": 0.6142004132270813 }, { "epoch": 0.6331817282974095, "step": 6404, "train/sim_loss": 0.0703125 }, { "epoch": 0.6331817282974095, "step": 6404, "train/total_loss": 0.1317325383424759 }, { "entropy": 8.429710388183594, "epoch": 0.6332806011469251, "mean_token_accuracy": 0.7497621178627014, "num_tokens": 12515875.0, "step": 6405, "train/ce_loss": 0.46298351883888245 }, { "epoch": 0.6332806011469251, "step": 6405, "train/sim_loss": 0.05078125 }, { "epoch": 0.6332806011469251, "step": 6405, "train/total_loss": 0.09707960486412048 }, { "entropy": 9.63508129119873, "epoch": 0.6333794739964406, "mean_token_accuracy": 0.6765676736831665, "num_tokens": 12520900.0, "step": 6406, "train/ce_loss": 1.3361752033233643 }, { "epoch": 0.6333794739964406, "step": 6406, "train/sim_loss": 0.09375 }, { "epoch": 0.6333794739964406, "step": 6406, "train/total_loss": 0.22736752033233643 }, { "entropy": 8.758066177368164, "epoch": 0.633478346845956, "mean_token_accuracy": 0.7649572491645813, "num_tokens": 12526352.0, "step": 6407, "train/ce_loss": 0.8186273574829102 }, { "epoch": 0.633478346845956, "step": 6407, "train/sim_loss": 0.08984375 }, { "epoch": 0.633478346845956, "step": 6407, "train/total_loss": 0.17170649766921997 }, { "entropy": 9.324136734008789, "epoch": 0.6335772196954716, "mean_token_accuracy": 0.7474600672721863, "num_tokens": 12531502.0, "step": 6408, "train/ce_loss": 7.128099923647824e-07 }, { "epoch": 0.6335772196954716, "step": 6408, "train/sim_loss": 0.02734375 }, { "epoch": 0.6335772196954716, "step": 6408, "train/total_loss": 0.02734382078051567 }, { "entropy": 9.120576858520508, "epoch": 0.6336760925449871, "mean_token_accuracy": 0.7368420958518982, "num_tokens": 12536623.0, "step": 6409, "train/ce_loss": 1.019844651222229 }, { "epoch": 0.6336760925449871, "step": 6409, "train/sim_loss": 0.0546875 }, { "epoch": 0.6336760925449871, "step": 6409, "train/total_loss": 0.15667197108268738 }, { "entropy": 9.586650848388672, "epoch": 0.6337749653945026, "mean_token_accuracy": 0.7752212285995483, "num_tokens": 12541606.0, "step": 6410, "train/ce_loss": 0.8787671327590942 }, { "epoch": 0.6337749653945026, "step": 6410, "train/sim_loss": 0.0234375 }, { "epoch": 0.6337749653945026, "step": 6410, "train/total_loss": 0.11131421476602554 }, { "entropy": 9.47934341430664, "epoch": 0.6338738382440182, "mean_token_accuracy": 0.7429149746894836, "num_tokens": 12546526.0, "step": 6411, "train/ce_loss": 1.7796356678009033 }, { "epoch": 0.6338738382440182, "step": 6411, "train/sim_loss": 0.1015625 }, { "epoch": 0.6338738382440182, "step": 6411, "train/total_loss": 0.2795260548591614 }, { "entropy": 9.546884536743164, "epoch": 0.6339727110935337, "mean_token_accuracy": 0.7328000068664551, "num_tokens": 12551552.0, "step": 6412, "train/ce_loss": 1.4202111959457397 }, { "epoch": 0.6339727110935337, "step": 6412, "train/sim_loss": 0.0234375 }, { "epoch": 0.6339727110935337, "step": 6412, "train/total_loss": 0.16545861959457397 }, { "entropy": 9.052999496459961, "epoch": 0.6340715839430492, "mean_token_accuracy": 0.7513020634651184, "num_tokens": 12556784.0, "step": 6413, "train/ce_loss": 1.0188919305801392 }, { "epoch": 0.6340715839430492, "step": 6413, "train/sim_loss": 0.07421875 }, { "epoch": 0.6340715839430492, "step": 6413, "train/total_loss": 0.17610794305801392 }, { "entropy": 9.16131591796875, "epoch": 0.6341704567925648, "mean_token_accuracy": 0.7312312126159668, "num_tokens": 12561880.0, "step": 6414, "train/ce_loss": 1.1434515714645386 }, { "epoch": 0.6341704567925648, "step": 6414, "train/sim_loss": 0.03125 }, { "epoch": 0.6341704567925648, "step": 6414, "train/total_loss": 0.14559516310691833 }, { "entropy": 8.769760131835938, "epoch": 0.6342693296420803, "mean_token_accuracy": 0.740818440914154, "num_tokens": 12567279.0, "step": 6415, "train/ce_loss": 0.8456613421440125 }, { "epoch": 0.6342693296420803, "step": 6415, "train/sim_loss": 0.0625 }, { "epoch": 0.6342693296420803, "step": 6415, "train/total_loss": 0.1470661461353302 }, { "entropy": 8.74283218383789, "epoch": 0.6343682024915958, "mean_token_accuracy": 0.6560170650482178, "num_tokens": 12572704.0, "step": 6416, "train/ce_loss": 0.8355754613876343 }, { "epoch": 0.6343682024915958, "step": 6416, "train/sim_loss": 0.04296875 }, { "epoch": 0.6343682024915958, "step": 6416, "train/total_loss": 0.12652629613876343 }, { "entropy": 8.902482986450195, "epoch": 0.6344670753411114, "mean_token_accuracy": 0.7208765745162964, "num_tokens": 12578091.0, "step": 6417, "train/ce_loss": 1.4132318496704102 }, { "epoch": 0.6344670753411114, "step": 6417, "train/sim_loss": 0.046875 }, { "epoch": 0.6344670753411114, "step": 6417, "train/total_loss": 0.18819819390773773 }, { "entropy": 9.10424518585205, "epoch": 0.6345659481906268, "mean_token_accuracy": 0.7798408269882202, "num_tokens": 12583317.0, "step": 6418, "train/ce_loss": 2.373131792410277e-06 }, { "epoch": 0.6345659481906268, "step": 6418, "train/sim_loss": 0.05859375 }, { "epoch": 0.6345659481906268, "step": 6418, "train/total_loss": 0.0585939884185791 }, { "entropy": 8.467370986938477, "epoch": 0.6346648210401423, "mean_token_accuracy": 0.7738446593284607, "num_tokens": 12588812.0, "step": 6419, "train/ce_loss": 0.7450070977210999 }, { "epoch": 0.6346648210401423, "step": 6419, "train/sim_loss": 0.04296875 }, { "epoch": 0.6346648210401423, "step": 6419, "train/total_loss": 0.11746945977210999 }, { "epoch": 0.6347636938896579, "grad_norm": 0.7006480097770691, "learning_rate": 8.415418088315285e-06, "loss": 0.1424, "step": 6420 }, { "entropy": 8.973926544189453, "epoch": 0.6347636938896579, "mean_token_accuracy": 0.7991746664047241, "num_tokens": 12594006.0, "step": 6420, "train/ce_loss": 0.6745920777320862 }, { "epoch": 0.6347636938896579, "step": 6420, "train/sim_loss": 0.03515625 }, { "epoch": 0.6347636938896579, "step": 6420, "train/total_loss": 0.10261546075344086 }, { "entropy": 10.236392974853516, "epoch": 0.6348625667391734, "mean_token_accuracy": 0.6958333253860474, "num_tokens": 12598612.0, "step": 6421, "train/ce_loss": 3.722998826560797e-06 }, { "epoch": 0.6348625667391734, "step": 6421, "train/sim_loss": 0.0859375 }, { "epoch": 0.6348625667391734, "step": 6421, "train/total_loss": 0.08593787252902985 }, { "entropy": 9.714025497436523, "epoch": 0.6349614395886889, "mean_token_accuracy": 0.7431906461715698, "num_tokens": 12603525.0, "step": 6422, "train/ce_loss": 0.7071661353111267 }, { "epoch": 0.6349614395886889, "step": 6422, "train/sim_loss": 0.046875 }, { "epoch": 0.6349614395886889, "step": 6422, "train/total_loss": 0.11759161204099655 }, { "entropy": 8.769353866577148, "epoch": 0.6350603124382045, "mean_token_accuracy": 0.7455242872238159, "num_tokens": 12608775.0, "step": 6423, "train/ce_loss": 0.9130332469940186 }, { "epoch": 0.6350603124382045, "step": 6423, "train/sim_loss": 0.046875 }, { "epoch": 0.6350603124382045, "step": 6423, "train/total_loss": 0.13817831873893738 }, { "entropy": 9.194217681884766, "epoch": 0.63515918528772, "mean_token_accuracy": 0.8032069802284241, "num_tokens": 12613870.0, "step": 6424, "train/ce_loss": 0.7883514165878296 }, { "epoch": 0.63515918528772, "step": 6424, "train/sim_loss": 0.015625 }, { "epoch": 0.63515918528772, "step": 6424, "train/total_loss": 0.0944601446390152 }, { "entropy": 8.776252746582031, "epoch": 0.6352580581372356, "mean_token_accuracy": 0.7296379804611206, "num_tokens": 12619275.0, "step": 6425, "train/ce_loss": 0.7394523024559021 }, { "epoch": 0.6352580581372356, "step": 6425, "train/sim_loss": 0.0625 }, { "epoch": 0.6352580581372356, "step": 6425, "train/total_loss": 0.13644522428512573 }, { "entropy": 8.840118408203125, "epoch": 0.6353569309867511, "mean_token_accuracy": 0.6878109574317932, "num_tokens": 12624502.0, "step": 6426, "train/ce_loss": 1.818405270576477 }, { "epoch": 0.6353569309867511, "step": 6426, "train/sim_loss": 0.0625 }, { "epoch": 0.6353569309867511, "step": 6426, "train/total_loss": 0.24434052407741547 }, { "entropy": 8.930981636047363, "epoch": 0.6354558038362665, "mean_token_accuracy": 0.7754654884338379, "num_tokens": 12629847.0, "step": 6427, "train/ce_loss": 0.5735181570053101 }, { "epoch": 0.6354558038362665, "step": 6427, "train/sim_loss": 0.0234375 }, { "epoch": 0.6354558038362665, "step": 6427, "train/total_loss": 0.08078931272029877 }, { "entropy": 9.04982852935791, "epoch": 0.6355546766857821, "mean_token_accuracy": 0.6829971075057983, "num_tokens": 12635037.0, "step": 6428, "train/ce_loss": 0.8141086101531982 }, { "epoch": 0.6355546766857821, "step": 6428, "train/sim_loss": 0.0703125 }, { "epoch": 0.6355546766857821, "step": 6428, "train/total_loss": 0.15172335505485535 }, { "entropy": 8.66606330871582, "epoch": 0.6356535495352976, "mean_token_accuracy": 0.7685631513595581, "num_tokens": 12640547.0, "step": 6429, "train/ce_loss": 0.5794277787208557 }, { "epoch": 0.6356535495352976, "step": 6429, "train/sim_loss": 0.02734375 }, { "epoch": 0.6356535495352976, "step": 6429, "train/total_loss": 0.08528652787208557 }, { "entropy": 9.186049461364746, "epoch": 0.6357524223848131, "mean_token_accuracy": 0.7824561595916748, "num_tokens": 12645563.0, "step": 6430, "train/ce_loss": 0.9713578224182129 }, { "epoch": 0.6357524223848131, "step": 6430, "train/sim_loss": 0.05078125 }, { "epoch": 0.6357524223848131, "step": 6430, "train/total_loss": 0.1479170322418213 }, { "entropy": 9.148721694946289, "epoch": 0.6358512952343287, "mean_token_accuracy": 0.7407894730567932, "num_tokens": 12650779.0, "step": 6431, "train/ce_loss": 1.3405790328979492 }, { "epoch": 0.6358512952343287, "step": 6431, "train/sim_loss": 0.02734375 }, { "epoch": 0.6358512952343287, "step": 6431, "train/total_loss": 0.1614016592502594 }, { "entropy": 8.77899169921875, "epoch": 0.6359501680838442, "mean_token_accuracy": 0.7346465587615967, "num_tokens": 12656145.0, "step": 6432, "train/ce_loss": 0.7334739565849304 }, { "epoch": 0.6359501680838442, "step": 6432, "train/sim_loss": 0.04296875 }, { "epoch": 0.6359501680838442, "step": 6432, "train/total_loss": 0.11631614714860916 }, { "entropy": 9.036822319030762, "epoch": 0.6360490409333597, "mean_token_accuracy": 0.7083870768547058, "num_tokens": 12661438.0, "step": 6433, "train/ce_loss": 0.9512785077095032 }, { "epoch": 0.6360490409333597, "step": 6433, "train/sim_loss": 0.08984375 }, { "epoch": 0.6360490409333597, "step": 6433, "train/total_loss": 0.18497160077095032 }, { "entropy": 9.052738189697266, "epoch": 0.6361479137828753, "mean_token_accuracy": 0.7056604027748108, "num_tokens": 12666695.0, "step": 6434, "train/ce_loss": 1.518971562385559 }, { "epoch": 0.6361479137828753, "step": 6434, "train/sim_loss": 0.05859375 }, { "epoch": 0.6361479137828753, "step": 6434, "train/total_loss": 0.21049091219902039 }, { "entropy": 9.298844337463379, "epoch": 0.6362467866323908, "mean_token_accuracy": 0.7064846158027649, "num_tokens": 12671751.0, "step": 6435, "train/ce_loss": 1.7070856301870663e-06 }, { "epoch": 0.6362467866323908, "step": 6435, "train/sim_loss": 0.0703125 }, { "epoch": 0.6362467866323908, "step": 6435, "train/total_loss": 0.07031267136335373 }, { "entropy": 8.464192390441895, "epoch": 0.6363456594819062, "mean_token_accuracy": 0.7317351698875427, "num_tokens": 12677078.0, "step": 6436, "train/ce_loss": 0.853071928024292 }, { "epoch": 0.6363456594819062, "step": 6436, "train/sim_loss": 0.0234375 }, { "epoch": 0.6363456594819062, "step": 6436, "train/total_loss": 0.10874469578266144 }, { "entropy": 9.107930183410645, "epoch": 0.6364445323314218, "mean_token_accuracy": 0.7402032017707825, "num_tokens": 12682158.0, "step": 6437, "train/ce_loss": 0.8887556791305542 }, { "epoch": 0.6364445323314218, "step": 6437, "train/sim_loss": 0.03125 }, { "epoch": 0.6364445323314218, "step": 6437, "train/total_loss": 0.12012556940317154 }, { "entropy": 8.584230422973633, "epoch": 0.6365434051809373, "mean_token_accuracy": 0.7628541588783264, "num_tokens": 12687593.0, "step": 6438, "train/ce_loss": 0.7757463455200195 }, { "epoch": 0.6365434051809373, "step": 6438, "train/sim_loss": 0.0390625 }, { "epoch": 0.6365434051809373, "step": 6438, "train/total_loss": 0.11663713306188583 }, { "entropy": 9.337575912475586, "epoch": 0.6366422780304528, "mean_token_accuracy": 0.7447916865348816, "num_tokens": 12692596.0, "step": 6439, "train/ce_loss": 1.4264662265777588 }, { "epoch": 0.6366422780304528, "step": 6439, "train/sim_loss": 0.04296875 }, { "epoch": 0.6366422780304528, "step": 6439, "train/total_loss": 0.18561537563800812 }, { "epoch": 0.6367411508799684, "grad_norm": 0.7183775305747986, "learning_rate": 8.410473223557336e-06, "loss": 0.1363, "step": 6440 }, { "entropy": 10.048894882202148, "epoch": 0.6367411508799684, "mean_token_accuracy": 0.6693877577781677, "num_tokens": 12697211.0, "step": 6440, "train/ce_loss": 2.288748646606109e-06 }, { "epoch": 0.6367411508799684, "step": 6440, "train/sim_loss": 0.0234375 }, { "epoch": 0.6367411508799684, "step": 6440, "train/total_loss": 0.023437729105353355 }, { "entropy": 8.808197021484375, "epoch": 0.6368400237294839, "mean_token_accuracy": 0.7426966428756714, "num_tokens": 12702526.0, "step": 6441, "train/ce_loss": 1.0223617553710938 }, { "epoch": 0.6368400237294839, "step": 6441, "train/sim_loss": 0.04296875 }, { "epoch": 0.6368400237294839, "step": 6441, "train/total_loss": 0.14520493149757385 }, { "entropy": 8.888721466064453, "epoch": 0.6369388965789994, "mean_token_accuracy": 0.757656455039978, "num_tokens": 12707726.0, "step": 6442, "train/ce_loss": 1.2178153991699219 }, { "epoch": 0.6369388965789994, "step": 6442, "train/sim_loss": 0.0859375 }, { "epoch": 0.6369388965789994, "step": 6442, "train/total_loss": 0.20771904289722443 }, { "entropy": 9.222532272338867, "epoch": 0.637037769428515, "mean_token_accuracy": 0.7290909290313721, "num_tokens": 12712704.0, "step": 6443, "train/ce_loss": 1.4290187358856201 }, { "epoch": 0.637037769428515, "step": 6443, "train/sim_loss": 0.0703125 }, { "epoch": 0.637037769428515, "step": 6443, "train/total_loss": 0.21321438252925873 }, { "entropy": 9.138294219970703, "epoch": 0.6371366422780305, "mean_token_accuracy": 0.7117263674736023, "num_tokens": 12717775.0, "step": 6444, "train/ce_loss": 9.109377856475476e-07 }, { "epoch": 0.6371366422780305, "step": 6444, "train/sim_loss": 0.03515625 }, { "epoch": 0.6371366422780305, "step": 6444, "train/total_loss": 0.03515633940696716 }, { "entropy": 9.019227027893066, "epoch": 0.637235515127546, "mean_token_accuracy": 0.7219387888908386, "num_tokens": 12723013.0, "step": 6445, "train/ce_loss": 0.775183379650116 }, { "epoch": 0.637235515127546, "step": 6445, "train/sim_loss": 0.09765625 }, { "epoch": 0.637235515127546, "step": 6445, "train/total_loss": 0.17517459392547607 }, { "entropy": 8.739466667175293, "epoch": 0.6373343879770615, "mean_token_accuracy": 0.776992917060852, "num_tokens": 12728471.0, "step": 6446, "train/ce_loss": 0.9948819875717163 }, { "epoch": 0.6373343879770615, "step": 6446, "train/sim_loss": 0.07421875 }, { "epoch": 0.6373343879770615, "step": 6446, "train/total_loss": 0.17370694875717163 }, { "entropy": 9.282242774963379, "epoch": 0.637433260826577, "mean_token_accuracy": 0.7416173815727234, "num_tokens": 12733394.0, "step": 6447, "train/ce_loss": 1.1608867645263672 }, { "epoch": 0.637433260826577, "step": 6447, "train/sim_loss": 0.0390625 }, { "epoch": 0.637433260826577, "step": 6447, "train/total_loss": 0.15515118837356567 }, { "entropy": 9.284667015075684, "epoch": 0.6375321336760925, "mean_token_accuracy": 0.7423999905586243, "num_tokens": 12738428.0, "step": 6448, "train/ce_loss": 0.6794313192367554 }, { "epoch": 0.6375321336760925, "step": 6448, "train/sim_loss": 0.0546875 }, { "epoch": 0.6375321336760925, "step": 6448, "train/total_loss": 0.12263063341379166 }, { "entropy": 8.700347900390625, "epoch": 0.6376310065256081, "mean_token_accuracy": 0.773950457572937, "num_tokens": 12743853.0, "step": 6449, "train/ce_loss": 0.7626098990440369 }, { "epoch": 0.6376310065256081, "step": 6449, "train/sim_loss": 0.09765625 }, { "epoch": 0.6376310065256081, "step": 6449, "train/total_loss": 0.1739172339439392 }, { "entropy": 8.891304016113281, "epoch": 0.6377298793751236, "mean_token_accuracy": 0.7259615659713745, "num_tokens": 12749120.0, "step": 6450, "train/ce_loss": 0.8201631307601929 }, { "epoch": 0.6377298793751236, "step": 6450, "train/sim_loss": 0.0625 }, { "epoch": 0.6377298793751236, "step": 6450, "train/total_loss": 0.14451631903648376 }, { "entropy": 9.129953384399414, "epoch": 0.6378287522246391, "mean_token_accuracy": 0.7582873106002808, "num_tokens": 12754400.0, "step": 6451, "train/ce_loss": 9.450710081182478e-07 }, { "epoch": 0.6378287522246391, "step": 6451, "train/sim_loss": 0.05859375 }, { "epoch": 0.6378287522246391, "step": 6451, "train/total_loss": 0.05859384313225746 }, { "entropy": 9.122050285339355, "epoch": 0.6379276250741547, "mean_token_accuracy": 0.759365975856781, "num_tokens": 12759548.0, "step": 6452, "train/ce_loss": 1.265630841255188 }, { "epoch": 0.6379276250741547, "step": 6452, "train/sim_loss": 0.03125 }, { "epoch": 0.6379276250741547, "step": 6452, "train/total_loss": 0.15781308710575104 }, { "entropy": 8.692692756652832, "epoch": 0.6380264979236702, "mean_token_accuracy": 0.695652186870575, "num_tokens": 12765035.0, "step": 6453, "train/ce_loss": 1.3813843727111816 }, { "epoch": 0.6380264979236702, "step": 6453, "train/sim_loss": 0.05859375 }, { "epoch": 0.6380264979236702, "step": 6453, "train/total_loss": 0.19673219323158264 }, { "entropy": 9.10655689239502, "epoch": 0.6381253707731857, "mean_token_accuracy": 0.7168508172035217, "num_tokens": 12770169.0, "step": 6454, "train/ce_loss": 1.2932935953140259 }, { "epoch": 0.6381253707731857, "step": 6454, "train/sim_loss": 0.07421875 }, { "epoch": 0.6381253707731857, "step": 6454, "train/total_loss": 0.2035481184720993 }, { "entropy": 9.16287612915039, "epoch": 0.6382242436227012, "mean_token_accuracy": 0.7213114500045776, "num_tokens": 12775363.0, "step": 6455, "train/ce_loss": 0.8710024356842041 }, { "epoch": 0.6382242436227012, "step": 6455, "train/sim_loss": 0.08203125 }, { "epoch": 0.6382242436227012, "step": 6455, "train/total_loss": 0.16913148760795593 }, { "entropy": 8.762480735778809, "epoch": 0.6383231164722167, "mean_token_accuracy": 0.7190082669258118, "num_tokens": 12780679.0, "step": 6456, "train/ce_loss": 1.3025904893875122 }, { "epoch": 0.6383231164722167, "step": 6456, "train/sim_loss": 0.07421875 }, { "epoch": 0.6383231164722167, "step": 6456, "train/total_loss": 0.20447780191898346 }, { "entropy": 8.96703815460205, "epoch": 0.6384219893217322, "mean_token_accuracy": 0.7301790118217468, "num_tokens": 12785913.0, "step": 6457, "train/ce_loss": 0.31815260648727417 }, { "epoch": 0.6384219893217322, "step": 6457, "train/sim_loss": 0.0546875 }, { "epoch": 0.6384219893217322, "step": 6457, "train/total_loss": 0.08650276064872742 }, { "entropy": 8.742262840270996, "epoch": 0.6385208621712478, "mean_token_accuracy": 0.7894201278686523, "num_tokens": 12791331.0, "step": 6458, "train/ce_loss": 0.7575759291648865 }, { "epoch": 0.6385208621712478, "step": 6458, "train/sim_loss": 0.04296875 }, { "epoch": 0.6385208621712478, "step": 6458, "train/total_loss": 0.11872634291648865 }, { "entropy": 9.20828628540039, "epoch": 0.6386197350207633, "mean_token_accuracy": 0.7433751821517944, "num_tokens": 12796509.0, "step": 6459, "train/ce_loss": 0.7584218382835388 }, { "epoch": 0.6386197350207633, "step": 6459, "train/sim_loss": 0.06640625 }, { "epoch": 0.6386197350207633, "step": 6459, "train/total_loss": 0.14224843680858612 }, { "epoch": 0.6387186078702788, "grad_norm": 0.6138080358505249, "learning_rate": 8.405528358799388e-06, "loss": 0.1387, "step": 6460 }, { "entropy": 8.801475524902344, "epoch": 0.6387186078702788, "mean_token_accuracy": 0.7757009267807007, "num_tokens": 12801787.0, "step": 6460, "train/ce_loss": 0.7655203938484192 }, { "epoch": 0.6387186078702788, "step": 6460, "train/sim_loss": 0.04296875 }, { "epoch": 0.6387186078702788, "step": 6460, "train/total_loss": 0.11952079087495804 }, { "entropy": 9.13096809387207, "epoch": 0.6388174807197944, "mean_token_accuracy": 0.7729393243789673, "num_tokens": 12806895.0, "step": 6461, "train/ce_loss": 0.6062048077583313 }, { "epoch": 0.6388174807197944, "step": 6461, "train/sim_loss": 0.09375 }, { "epoch": 0.6388174807197944, "step": 6461, "train/total_loss": 0.1543704867362976 }, { "entropy": 9.962745666503906, "epoch": 0.6389163535693099, "mean_token_accuracy": 0.7582417726516724, "num_tokens": 12811478.0, "step": 6462, "train/ce_loss": 3.825109888566658e-05 }, { "epoch": 0.6389163535693099, "step": 6462, "train/sim_loss": 0.02734375 }, { "epoch": 0.6389163535693099, "step": 6462, "train/total_loss": 0.02734757587313652 }, { "entropy": 9.132343292236328, "epoch": 0.6390152264188254, "mean_token_accuracy": 0.7365438938140869, "num_tokens": 12816662.0, "step": 6463, "train/ce_loss": 1.3348407745361328 }, { "epoch": 0.6390152264188254, "step": 6463, "train/sim_loss": 0.078125 }, { "epoch": 0.6390152264188254, "step": 6463, "train/total_loss": 0.21160908043384552 }, { "entropy": 9.118932723999023, "epoch": 0.639114099268341, "mean_token_accuracy": 0.7469586133956909, "num_tokens": 12821922.0, "step": 6464, "train/ce_loss": 1.0696030855178833 }, { "epoch": 0.639114099268341, "step": 6464, "train/sim_loss": 0.078125 }, { "epoch": 0.639114099268341, "step": 6464, "train/total_loss": 0.18508531153202057 }, { "entropy": 8.88484001159668, "epoch": 0.6392129721178564, "mean_token_accuracy": 0.7077844142913818, "num_tokens": 12827176.0, "step": 6465, "train/ce_loss": 0.5346351265907288 }, { "epoch": 0.6392129721178564, "step": 6465, "train/sim_loss": 0.08203125 }, { "epoch": 0.6392129721178564, "step": 6465, "train/total_loss": 0.13549476861953735 }, { "entropy": 9.236489295959473, "epoch": 0.6393118449673719, "mean_token_accuracy": 0.7913562059402466, "num_tokens": 12832291.0, "step": 6466, "train/ce_loss": 0.8808301091194153 }, { "epoch": 0.6393118449673719, "step": 6466, "train/sim_loss": 0.046875 }, { "epoch": 0.6393118449673719, "step": 6466, "train/total_loss": 0.13495801389217377 }, { "entropy": 8.801457405090332, "epoch": 0.6394107178168875, "mean_token_accuracy": 0.7459839582443237, "num_tokens": 12837734.0, "step": 6467, "train/ce_loss": 0.6924058794975281 }, { "epoch": 0.6394107178168875, "step": 6467, "train/sim_loss": 0.0390625 }, { "epoch": 0.6394107178168875, "step": 6467, "train/total_loss": 0.10830309242010117 }, { "entropy": 8.825029373168945, "epoch": 0.639509590666403, "mean_token_accuracy": 0.724252462387085, "num_tokens": 12843146.0, "step": 6468, "train/ce_loss": 9.435718766326318e-07 }, { "epoch": 0.639509590666403, "step": 6468, "train/sim_loss": 0.03125 }, { "epoch": 0.639509590666403, "step": 6468, "train/total_loss": 0.03125009313225746 }, { "entropy": 8.679088592529297, "epoch": 0.6396084635159185, "mean_token_accuracy": 0.6833667159080505, "num_tokens": 12848727.0, "step": 6469, "train/ce_loss": 0.9084466695785522 }, { "epoch": 0.6396084635159185, "step": 6469, "train/sim_loss": 0.06640625 }, { "epoch": 0.6396084635159185, "step": 6469, "train/total_loss": 0.15725091099739075 }, { "entropy": 9.108712196350098, "epoch": 0.6397073363654341, "mean_token_accuracy": 0.7720403075218201, "num_tokens": 12853991.0, "step": 6470, "train/ce_loss": 0.39327272772789 }, { "epoch": 0.6397073363654341, "step": 6470, "train/sim_loss": 0.01953125 }, { "epoch": 0.6397073363654341, "step": 6470, "train/total_loss": 0.05885852500796318 }, { "entropy": 8.573925971984863, "epoch": 0.6398062092149496, "mean_token_accuracy": 0.7548240423202515, "num_tokens": 12859513.0, "step": 6471, "train/ce_loss": 1.0028175115585327 }, { "epoch": 0.6398062092149496, "step": 6471, "train/sim_loss": 0.0390625 }, { "epoch": 0.6398062092149496, "step": 6471, "train/total_loss": 0.1393442451953888 }, { "entropy": 9.224263191223145, "epoch": 0.6399050820644651, "mean_token_accuracy": 0.758400022983551, "num_tokens": 12864631.0, "step": 6472, "train/ce_loss": 0.7310265898704529 }, { "epoch": 0.6399050820644651, "step": 6472, "train/sim_loss": 0.03515625 }, { "epoch": 0.6399050820644651, "step": 6472, "train/total_loss": 0.10825891047716141 }, { "entropy": 9.070579528808594, "epoch": 0.6400039549139807, "mean_token_accuracy": 0.7385203838348389, "num_tokens": 12869839.0, "step": 6473, "train/ce_loss": 1.044979214668274 }, { "epoch": 0.6400039549139807, "step": 6473, "train/sim_loss": 0.0390625 }, { "epoch": 0.6400039549139807, "step": 6473, "train/total_loss": 0.14356042444705963 }, { "entropy": 8.707610130310059, "epoch": 0.6401028277634961, "mean_token_accuracy": 0.7283549904823303, "num_tokens": 12875225.0, "step": 6474, "train/ce_loss": 0.8016955256462097 }, { "epoch": 0.6401028277634961, "step": 6474, "train/sim_loss": 0.078125 }, { "epoch": 0.6401028277634961, "step": 6474, "train/total_loss": 0.15829455852508545 }, { "entropy": 9.587264060974121, "epoch": 0.6402017006130116, "mean_token_accuracy": 0.7548746466636658, "num_tokens": 12880039.0, "step": 6475, "train/ce_loss": 2.041363813987118e-06 }, { "epoch": 0.6402017006130116, "step": 6475, "train/sim_loss": 0.03515625 }, { "epoch": 0.6402017006130116, "step": 6475, "train/total_loss": 0.035156454890966415 }, { "entropy": 9.856155395507812, "epoch": 0.6403005734625272, "mean_token_accuracy": 0.7080745100975037, "num_tokens": 12884958.0, "step": 6476, "train/ce_loss": 0.956764817237854 }, { "epoch": 0.6403005734625272, "step": 6476, "train/sim_loss": 0.06640625 }, { "epoch": 0.6403005734625272, "step": 6476, "train/total_loss": 0.1620827317237854 }, { "entropy": 9.147016525268555, "epoch": 0.6403994463120427, "mean_token_accuracy": 0.722482442855835, "num_tokens": 12890305.0, "step": 6477, "train/ce_loss": 0.8601493239402771 }, { "epoch": 0.6403994463120427, "step": 6477, "train/sim_loss": 0.078125 }, { "epoch": 0.6403994463120427, "step": 6477, "train/total_loss": 0.16413992643356323 }, { "entropy": 9.434209823608398, "epoch": 0.6404983191615582, "mean_token_accuracy": 0.7606679201126099, "num_tokens": 12895298.0, "step": 6478, "train/ce_loss": 0.7040955424308777 }, { "epoch": 0.6404983191615582, "step": 6478, "train/sim_loss": 0.03515625 }, { "epoch": 0.6404983191615582, "step": 6478, "train/total_loss": 0.10556580871343613 }, { "entropy": 8.699009895324707, "epoch": 0.6405971920110738, "mean_token_accuracy": 0.7092288136482239, "num_tokens": 12900625.0, "step": 6479, "train/ce_loss": 1.285062313079834 }, { "epoch": 0.6405971920110738, "step": 6479, "train/sim_loss": 0.0546875 }, { "epoch": 0.6405971920110738, "step": 6479, "train/total_loss": 0.18319372832775116 }, { "epoch": 0.6406960648605893, "grad_norm": 0.7683160901069641, "learning_rate": 8.400583494041439e-06, "loss": 0.1412, "step": 6480 }, { "entropy": 9.59773063659668, "epoch": 0.6406960648605893, "mean_token_accuracy": 0.724252462387085, "num_tokens": 12905647.0, "step": 6480, "train/ce_loss": 0.8121297955513 }, { "epoch": 0.6406960648605893, "step": 6480, "train/sim_loss": 0.0234375 }, { "epoch": 0.6406960648605893, "step": 6480, "train/total_loss": 0.10465048253536224 }, { "entropy": 8.975669860839844, "epoch": 0.6407949377101048, "mean_token_accuracy": 0.7036144733428955, "num_tokens": 12910915.0, "step": 6481, "train/ce_loss": 1.2544474601745605 }, { "epoch": 0.6407949377101048, "step": 6481, "train/sim_loss": 0.109375 }, { "epoch": 0.6407949377101048, "step": 6481, "train/total_loss": 0.23481975495815277 }, { "entropy": 9.157267570495605, "epoch": 0.6408938105596204, "mean_token_accuracy": 0.730708658695221, "num_tokens": 12915981.0, "step": 6482, "train/ce_loss": 1.0704869031906128 }, { "epoch": 0.6408938105596204, "step": 6482, "train/sim_loss": 0.0703125 }, { "epoch": 0.6408938105596204, "step": 6482, "train/total_loss": 0.17736119031906128 }, { "entropy": 9.217813491821289, "epoch": 0.6409926834091358, "mean_token_accuracy": 0.7742382287979126, "num_tokens": 12921133.0, "step": 6483, "train/ce_loss": 0.6621120572090149 }, { "epoch": 0.6409926834091358, "step": 6483, "train/sim_loss": 0.0390625 }, { "epoch": 0.6409926834091358, "step": 6483, "train/total_loss": 0.10527370870113373 }, { "entropy": 9.174644470214844, "epoch": 0.6410915562586513, "mean_token_accuracy": 0.7536231875419617, "num_tokens": 12926156.0, "step": 6484, "train/ce_loss": 0.8040371537208557 }, { "epoch": 0.6410915562586513, "step": 6484, "train/sim_loss": 0.03515625 }, { "epoch": 0.6410915562586513, "step": 6484, "train/total_loss": 0.11555996537208557 }, { "entropy": 9.635343551635742, "epoch": 0.6411904291081669, "mean_token_accuracy": 0.7782177925109863, "num_tokens": 12931054.0, "step": 6485, "train/ce_loss": 9.885932286124444e-07 }, { "epoch": 0.6411904291081669, "step": 6485, "train/sim_loss": 0.01953125 }, { "epoch": 0.6411904291081669, "step": 6485, "train/total_loss": 0.01953134872019291 }, { "entropy": 8.949685096740723, "epoch": 0.6412893019576824, "mean_token_accuracy": 0.7885952591896057, "num_tokens": 12936237.0, "step": 6486, "train/ce_loss": 0.8853443264961243 }, { "epoch": 0.6412893019576824, "step": 6486, "train/sim_loss": 0.07421875 }, { "epoch": 0.6412893019576824, "step": 6486, "train/total_loss": 0.16275319457054138 }, { "entropy": 8.84429931640625, "epoch": 0.6413881748071979, "mean_token_accuracy": 0.7311320900917053, "num_tokens": 12941576.0, "step": 6487, "train/ce_loss": 1.2528860569000244 }, { "epoch": 0.6413881748071979, "step": 6487, "train/sim_loss": 0.05078125 }, { "epoch": 0.6413881748071979, "step": 6487, "train/total_loss": 0.17606985569000244 }, { "entropy": 9.086837768554688, "epoch": 0.6414870476567135, "mean_token_accuracy": 0.7229064106941223, "num_tokens": 12946900.0, "step": 6488, "train/ce_loss": 0.6264514923095703 }, { "epoch": 0.6414870476567135, "step": 6488, "train/sim_loss": 0.0625 }, { "epoch": 0.6414870476567135, "step": 6488, "train/total_loss": 0.12514515221118927 }, { "entropy": 9.181316375732422, "epoch": 0.641585920506229, "mean_token_accuracy": 0.6954612135887146, "num_tokens": 12952055.0, "step": 6489, "train/ce_loss": 1.2988181114196777 }, { "epoch": 0.641585920506229, "step": 6489, "train/sim_loss": 0.01953125 }, { "epoch": 0.641585920506229, "step": 6489, "train/total_loss": 0.1494130641222 }, { "entropy": 9.338220596313477, "epoch": 0.6416847933557445, "mean_token_accuracy": 0.7082683444023132, "num_tokens": 12957049.0, "step": 6490, "train/ce_loss": 1.2148973941802979 }, { "epoch": 0.6416847933557445, "step": 6490, "train/sim_loss": 0.05078125 }, { "epoch": 0.6416847933557445, "step": 6490, "train/total_loss": 0.1722709834575653 }, { "entropy": 9.005218505859375, "epoch": 0.6417836662052601, "mean_token_accuracy": 0.7276028990745544, "num_tokens": 12962344.0, "step": 6491, "train/ce_loss": 0.8991209268569946 }, { "epoch": 0.6417836662052601, "step": 6491, "train/sim_loss": 0.0546875 }, { "epoch": 0.6417836662052601, "step": 6491, "train/total_loss": 0.14459958672523499 }, { "entropy": 8.843526840209961, "epoch": 0.6418825390547755, "mean_token_accuracy": 0.7493403553962708, "num_tokens": 12967605.0, "step": 6492, "train/ce_loss": 1.0386401414871216 }, { "epoch": 0.6418825390547755, "step": 6492, "train/sim_loss": 0.0546875 }, { "epoch": 0.6418825390547755, "step": 6492, "train/total_loss": 0.15855151414871216 }, { "entropy": 9.558185577392578, "epoch": 0.641981411904291, "mean_token_accuracy": 0.7782177925109863, "num_tokens": 12972513.0, "step": 6493, "train/ce_loss": 2.0642099380493164 }, { "epoch": 0.641981411904291, "step": 6493, "train/sim_loss": 0.08203125 }, { "epoch": 0.641981411904291, "step": 6493, "train/total_loss": 0.28845226764678955 }, { "entropy": 8.830387115478516, "epoch": 0.6420802847538066, "mean_token_accuracy": 0.7817258834838867, "num_tokens": 12977975.0, "step": 6494, "train/ce_loss": 0.7758707404136658 }, { "epoch": 0.6420802847538066, "step": 6494, "train/sim_loss": 0.02734375 }, { "epoch": 0.6420802847538066, "step": 6494, "train/total_loss": 0.1049308255314827 }, { "entropy": 9.699666023254395, "epoch": 0.6421791576033221, "mean_token_accuracy": 0.7637795209884644, "num_tokens": 12982785.0, "step": 6495, "train/ce_loss": 0.19067265093326569 }, { "epoch": 0.6421791576033221, "step": 6495, "train/sim_loss": 0.05078125 }, { "epoch": 0.6421791576033221, "step": 6495, "train/total_loss": 0.06984851509332657 }, { "entropy": 8.90860366821289, "epoch": 0.6422780304528376, "mean_token_accuracy": 0.7694090604782104, "num_tokens": 12988131.0, "step": 6496, "train/ce_loss": 0.4023219347000122 }, { "epoch": 0.6422780304528376, "step": 6496, "train/sim_loss": 0.03515625 }, { "epoch": 0.6422780304528376, "step": 6496, "train/total_loss": 0.07538844645023346 }, { "entropy": 8.920339584350586, "epoch": 0.6423769033023532, "mean_token_accuracy": 0.7369077205657959, "num_tokens": 12993367.0, "step": 6497, "train/ce_loss": 0.7874730229377747 }, { "epoch": 0.6423769033023532, "step": 6497, "train/sim_loss": 0.06640625 }, { "epoch": 0.6423769033023532, "step": 6497, "train/total_loss": 0.14515355229377747 }, { "entropy": 9.975255966186523, "epoch": 0.6424757761518687, "mean_token_accuracy": 0.7456647157669067, "num_tokens": 12998140.0, "step": 6498, "train/ce_loss": 0.11420520395040512 }, { "epoch": 0.6424757761518687, "step": 6498, "train/sim_loss": 0.0625 }, { "epoch": 0.6424757761518687, "step": 6498, "train/total_loss": 0.07392051815986633 }, { "entropy": 9.006590843200684, "epoch": 0.6425746490013842, "mean_token_accuracy": 0.6763224005699158, "num_tokens": 13003335.0, "step": 6499, "train/ce_loss": 0.0605606734752655 }, { "epoch": 0.6425746490013842, "step": 6499, "train/sim_loss": 0.0234375 }, { "epoch": 0.6425746490013842, "step": 6499, "train/total_loss": 0.02949356660246849 }, { "epoch": 0.6426735218508998, "grad_norm": 0.769160270690918, "learning_rate": 8.395638629283491e-06, "loss": 0.1434, "step": 6500 }, { "entropy": 9.092881202697754, "epoch": 0.6426735218508998, "mean_token_accuracy": 0.7147335410118103, "num_tokens": 13008415.0, "step": 6500, "train/ce_loss": 0.1218390017747879 }, { "epoch": 0.6426735218508998, "step": 6500, "train/sim_loss": 0.0625 }, { "epoch": 0.6426735218508998, "step": 6500, "train/total_loss": 0.07468389719724655 }, { "entropy": 9.512760162353516, "epoch": 0.6427723947004153, "mean_token_accuracy": 0.7199282050132751, "num_tokens": 13013385.0, "step": 6501, "train/ce_loss": 0.9416092038154602 }, { "epoch": 0.6427723947004153, "step": 6501, "train/sim_loss": 0.05859375 }, { "epoch": 0.6427723947004153, "step": 6501, "train/total_loss": 0.15275466442108154 }, { "entropy": 9.541532516479492, "epoch": 0.6428712675499307, "mean_token_accuracy": 0.722347617149353, "num_tokens": 13018275.0, "step": 6502, "train/ce_loss": 0.13504788279533386 }, { "epoch": 0.6428712675499307, "step": 6502, "train/sim_loss": 0.07421875 }, { "epoch": 0.6428712675499307, "step": 6502, "train/total_loss": 0.08772353827953339 }, { "entropy": 9.089735984802246, "epoch": 0.6429701403994463, "mean_token_accuracy": 0.7394495606422424, "num_tokens": 13023238.0, "step": 6503, "train/ce_loss": 0.05251012742519379 }, { "epoch": 0.6429701403994463, "step": 6503, "train/sim_loss": 0.0234375 }, { "epoch": 0.6429701403994463, "step": 6503, "train/total_loss": 0.02868851274251938 }, { "entropy": 9.593996047973633, "epoch": 0.6430690132489618, "mean_token_accuracy": 0.7515789270401001, "num_tokens": 13028148.0, "step": 6504, "train/ce_loss": 0.05433094874024391 }, { "epoch": 0.6430690132489618, "step": 6504, "train/sim_loss": 0.015625 }, { "epoch": 0.6430690132489618, "step": 6504, "train/total_loss": 0.02105809561908245 }, { "entropy": 9.474388122558594, "epoch": 0.6431678860984773, "mean_token_accuracy": 0.7316561937332153, "num_tokens": 13033014.0, "step": 6505, "train/ce_loss": 1.4684100151062012 }, { "epoch": 0.6431678860984773, "step": 6505, "train/sim_loss": 0.08984375 }, { "epoch": 0.6431678860984773, "step": 6505, "train/total_loss": 0.23668475449085236 }, { "entropy": 9.409302711486816, "epoch": 0.6432667589479929, "mean_token_accuracy": 0.7596490979194641, "num_tokens": 13038017.0, "step": 6506, "train/ce_loss": 1.0788112878799438 }, { "epoch": 0.6432667589479929, "step": 6506, "train/sim_loss": 0.03125 }, { "epoch": 0.6432667589479929, "step": 6506, "train/total_loss": 0.13913112878799438 }, { "entropy": 9.000192642211914, "epoch": 0.6433656317975084, "mean_token_accuracy": 0.7247956395149231, "num_tokens": 13043239.0, "step": 6507, "train/ce_loss": 0.7980868816375732 }, { "epoch": 0.6433656317975084, "step": 6507, "train/sim_loss": 0.08203125 }, { "epoch": 0.6433656317975084, "step": 6507, "train/total_loss": 0.16183993220329285 }, { "entropy": 9.12574577331543, "epoch": 0.643464504647024, "mean_token_accuracy": 0.7312101721763611, "num_tokens": 13048482.0, "step": 6508, "train/ce_loss": 0.012120846658945084 }, { "epoch": 0.643464504647024, "step": 6508, "train/sim_loss": 0.015625 }, { "epoch": 0.643464504647024, "step": 6508, "train/total_loss": 0.01683708466589451 }, { "entropy": 8.820637702941895, "epoch": 0.6435633774965395, "mean_token_accuracy": 0.7223684191703796, "num_tokens": 13053694.0, "step": 6509, "train/ce_loss": 0.47832316160202026 }, { "epoch": 0.6435633774965395, "step": 6509, "train/sim_loss": 0.078125 }, { "epoch": 0.6435633774965395, "step": 6509, "train/total_loss": 0.12595731019973755 }, { "entropy": 8.384625434875488, "epoch": 0.643662250346055, "mean_token_accuracy": 0.732891857624054, "num_tokens": 13059088.0, "step": 6510, "train/ce_loss": 0.9610019326210022 }, { "epoch": 0.643662250346055, "step": 6510, "train/sim_loss": 0.0625 }, { "epoch": 0.643662250346055, "step": 6510, "train/total_loss": 0.15860019624233246 }, { "entropy": 9.12200927734375, "epoch": 0.6437611231955706, "mean_token_accuracy": 0.7317554354667664, "num_tokens": 13064041.0, "step": 6511, "train/ce_loss": 1.9280868768692017 }, { "epoch": 0.6437611231955706, "step": 6511, "train/sim_loss": 0.05078125 }, { "epoch": 0.6437611231955706, "step": 6511, "train/total_loss": 0.24358993768692017 }, { "entropy": 8.986196517944336, "epoch": 0.643859996045086, "mean_token_accuracy": 0.8145454525947571, "num_tokens": 13069315.0, "step": 6512, "train/ce_loss": 8.834888285491616e-05 }, { "epoch": 0.643859996045086, "step": 6512, "train/sim_loss": 0.015625 }, { "epoch": 0.643859996045086, "step": 6512, "train/total_loss": 0.015633834525942802 }, { "entropy": 9.146592140197754, "epoch": 0.6439588688946015, "mean_token_accuracy": 0.7810107469558716, "num_tokens": 13074388.0, "step": 6513, "train/ce_loss": 1.715042233467102 }, { "epoch": 0.6439588688946015, "step": 6513, "train/sim_loss": 0.0625 }, { "epoch": 0.6439588688946015, "step": 6513, "train/total_loss": 0.23400422930717468 }, { "entropy": 8.818115234375, "epoch": 0.6440577417441171, "mean_token_accuracy": 0.7394958138465881, "num_tokens": 13079425.0, "step": 6514, "train/ce_loss": 1.3264988660812378 }, { "epoch": 0.6440577417441171, "step": 6514, "train/sim_loss": 0.0703125 }, { "epoch": 0.6440577417441171, "step": 6514, "train/total_loss": 0.20296238362789154 }, { "entropy": 9.274868965148926, "epoch": 0.6441566145936326, "mean_token_accuracy": 0.703832745552063, "num_tokens": 13084444.0, "step": 6515, "train/ce_loss": 1.360609769821167 }, { "epoch": 0.6441566145936326, "step": 6515, "train/sim_loss": 0.08203125 }, { "epoch": 0.6441566145936326, "step": 6515, "train/total_loss": 0.21809223294258118 }, { "entropy": 9.169843673706055, "epoch": 0.6442554874431481, "mean_token_accuracy": 0.717423141002655, "num_tokens": 13089704.0, "step": 6516, "train/ce_loss": 1.4866522178635933e-05 }, { "epoch": 0.6442554874431481, "step": 6516, "train/sim_loss": 0.0390625 }, { "epoch": 0.6442554874431481, "step": 6516, "train/total_loss": 0.039063986390829086 }, { "entropy": 8.721879005432129, "epoch": 0.6443543602926637, "mean_token_accuracy": 0.7563510537147522, "num_tokens": 13095174.0, "step": 6517, "train/ce_loss": 1.2111377716064453 }, { "epoch": 0.6443543602926637, "step": 6517, "train/sim_loss": 0.046875 }, { "epoch": 0.6443543602926637, "step": 6517, "train/total_loss": 0.16798877716064453 }, { "entropy": 8.488058090209961, "epoch": 0.6444532331421792, "mean_token_accuracy": 0.7695473432540894, "num_tokens": 13100639.0, "step": 6518, "train/ce_loss": 0.6677665710449219 }, { "epoch": 0.6444532331421792, "step": 6518, "train/sim_loss": 0.01953125 }, { "epoch": 0.6444532331421792, "step": 6518, "train/total_loss": 0.08630790561437607 }, { "entropy": 9.38595199584961, "epoch": 0.6445521059916947, "mean_token_accuracy": 0.709756076335907, "num_tokens": 13105459.0, "step": 6519, "train/ce_loss": 1.2585313320159912 }, { "epoch": 0.6445521059916947, "step": 6519, "train/sim_loss": 0.05078125 }, { "epoch": 0.6445521059916947, "step": 6519, "train/total_loss": 0.17663438618183136 }, { "epoch": 0.6446509788412103, "grad_norm": 0.8845223784446716, "learning_rate": 8.39069376452554e-06, "loss": 0.1413, "step": 6520 }, { "entropy": 8.890151023864746, "epoch": 0.6446509788412103, "mean_token_accuracy": 0.7569974660873413, "num_tokens": 13110715.0, "step": 6520, "train/ce_loss": 0.5002956390380859 }, { "epoch": 0.6446509788412103, "step": 6520, "train/sim_loss": 0.046875 }, { "epoch": 0.6446509788412103, "step": 6520, "train/total_loss": 0.09690456092357635 }, { "entropy": 9.051708221435547, "epoch": 0.6447498516907257, "mean_token_accuracy": 0.7416666746139526, "num_tokens": 13115781.0, "step": 6521, "train/ce_loss": 1.3550637959269807e-05 }, { "epoch": 0.6447498516907257, "step": 6521, "train/sim_loss": 0.09375 }, { "epoch": 0.6447498516907257, "step": 6521, "train/total_loss": 0.09375135600566864 }, { "entropy": 8.863210678100586, "epoch": 0.6448487245402412, "mean_token_accuracy": 0.8128272294998169, "num_tokens": 13121016.0, "step": 6522, "train/ce_loss": 0.3710916340351105 }, { "epoch": 0.6448487245402412, "step": 6522, "train/sim_loss": 0.03125 }, { "epoch": 0.6448487245402412, "step": 6522, "train/total_loss": 0.06835916638374329 }, { "entropy": 8.455900192260742, "epoch": 0.6449475973897568, "mean_token_accuracy": 0.7523809671401978, "num_tokens": 13126344.0, "step": 6523, "train/ce_loss": 1.0311092138290405 }, { "epoch": 0.6449475973897568, "step": 6523, "train/sim_loss": 0.09765625 }, { "epoch": 0.6449475973897568, "step": 6523, "train/total_loss": 0.2007671743631363 }, { "entropy": 8.69780445098877, "epoch": 0.6450464702392723, "mean_token_accuracy": 0.7423469424247742, "num_tokens": 13131605.0, "step": 6524, "train/ce_loss": 1.0256714820861816 }, { "epoch": 0.6450464702392723, "step": 6524, "train/sim_loss": 0.0625 }, { "epoch": 0.6450464702392723, "step": 6524, "train/total_loss": 0.1650671511888504 }, { "entropy": 9.350138664245605, "epoch": 0.6451453430887878, "mean_token_accuracy": 0.718654453754425, "num_tokens": 13136683.0, "step": 6525, "train/ce_loss": 1.7895563840866089 }, { "epoch": 0.6451453430887878, "step": 6525, "train/sim_loss": 0.0703125 }, { "epoch": 0.6451453430887878, "step": 6525, "train/total_loss": 0.24926814436912537 }, { "entropy": 9.344786643981934, "epoch": 0.6452442159383034, "mean_token_accuracy": 0.6934046149253845, "num_tokens": 13141698.0, "step": 6526, "train/ce_loss": 1.5403414964675903 }, { "epoch": 0.6452442159383034, "step": 6526, "train/sim_loss": 0.06640625 }, { "epoch": 0.6452442159383034, "step": 6526, "train/total_loss": 0.22044040262699127 }, { "entropy": 9.177766799926758, "epoch": 0.6453430887878189, "mean_token_accuracy": 0.8082901835441589, "num_tokens": 13146733.0, "step": 6527, "train/ce_loss": 6.455883067246759e-06 }, { "epoch": 0.6453430887878189, "step": 6527, "train/sim_loss": 0.015625 }, { "epoch": 0.6453430887878189, "step": 6527, "train/total_loss": 0.015625646337866783 }, { "entropy": 9.098952293395996, "epoch": 0.6454419616373344, "mean_token_accuracy": 0.7235772609710693, "num_tokens": 13151751.0, "step": 6528, "train/ce_loss": 4.270254066796042e-06 }, { "epoch": 0.6454419616373344, "step": 6528, "train/sim_loss": 0.06640625 }, { "epoch": 0.6454419616373344, "step": 6528, "train/total_loss": 0.06640667468309402 }, { "entropy": 8.696406364440918, "epoch": 0.64554083448685, "mean_token_accuracy": 0.7783251404762268, "num_tokens": 13157039.0, "step": 6529, "train/ce_loss": 0.7242198586463928 }, { "epoch": 0.64554083448685, "step": 6529, "train/sim_loss": 0.0546875 }, { "epoch": 0.64554083448685, "step": 6529, "train/total_loss": 0.12710949778556824 }, { "entropy": 8.717599868774414, "epoch": 0.6456397073363654, "mean_token_accuracy": 0.7016215920448303, "num_tokens": 13162397.0, "step": 6530, "train/ce_loss": 0.6972009539604187 }, { "epoch": 0.6456397073363654, "step": 6530, "train/sim_loss": 0.0234375 }, { "epoch": 0.6456397073363654, "step": 6530, "train/total_loss": 0.09315759688615799 }, { "entropy": 8.455299377441406, "epoch": 0.6457385801858809, "mean_token_accuracy": 0.7663461565971375, "num_tokens": 13167951.0, "step": 6531, "train/ce_loss": 0.9205726981163025 }, { "epoch": 0.6457385801858809, "step": 6531, "train/sim_loss": 0.0625 }, { "epoch": 0.6457385801858809, "step": 6531, "train/total_loss": 0.1545572727918625 }, { "entropy": 8.538186073303223, "epoch": 0.6458374530353965, "mean_token_accuracy": 0.7246073484420776, "num_tokens": 13173394.0, "step": 6532, "train/ce_loss": 1.1803375482559204 }, { "epoch": 0.6458374530353965, "step": 6532, "train/sim_loss": 0.0625 }, { "epoch": 0.6458374530353965, "step": 6532, "train/total_loss": 0.180533766746521 }, { "entropy": 8.439651489257812, "epoch": 0.645936325884912, "mean_token_accuracy": 0.7225647568702698, "num_tokens": 13178673.0, "step": 6533, "train/ce_loss": 0.6351827383041382 }, { "epoch": 0.645936325884912, "step": 6533, "train/sim_loss": 0.078125 }, { "epoch": 0.645936325884912, "step": 6533, "train/total_loss": 0.14164328575134277 }, { "entropy": 8.515107154846191, "epoch": 0.6460351987344275, "mean_token_accuracy": 0.7172839641571045, "num_tokens": 13183946.0, "step": 6534, "train/ce_loss": 0.9283249974250793 }, { "epoch": 0.6460351987344275, "step": 6534, "train/sim_loss": 0.05859375 }, { "epoch": 0.6460351987344275, "step": 6534, "train/total_loss": 0.1514262557029724 }, { "entropy": 9.123884201049805, "epoch": 0.6461340715839431, "mean_token_accuracy": 0.6688311696052551, "num_tokens": 13189038.0, "step": 6535, "train/ce_loss": 1.2555750608444214 }, { "epoch": 0.6461340715839431, "step": 6535, "train/sim_loss": 0.046875 }, { "epoch": 0.6461340715839431, "step": 6535, "train/total_loss": 0.17243251204490662 }, { "entropy": 8.950576782226562, "epoch": 0.6462329444334586, "mean_token_accuracy": 0.7177305221557617, "num_tokens": 13194225.0, "step": 6536, "train/ce_loss": 1.3022525310516357 }, { "epoch": 0.6462329444334586, "step": 6536, "train/sim_loss": 0.0859375 }, { "epoch": 0.6462329444334586, "step": 6536, "train/total_loss": 0.2161627560853958 }, { "entropy": 8.180878639221191, "epoch": 0.6463318172829741, "mean_token_accuracy": 0.790450930595398, "num_tokens": 13199834.0, "step": 6537, "train/ce_loss": 0.5404284000396729 }, { "epoch": 0.6463318172829741, "step": 6537, "train/sim_loss": 0.015625 }, { "epoch": 0.6463318172829741, "step": 6537, "train/total_loss": 0.06966784596443176 }, { "entropy": 8.464290618896484, "epoch": 0.6464306901324897, "mean_token_accuracy": 0.756424605846405, "num_tokens": 13205184.0, "step": 6538, "train/ce_loss": 1.1266599893569946 }, { "epoch": 0.6464306901324897, "step": 6538, "train/sim_loss": 0.0859375 }, { "epoch": 0.6464306901324897, "step": 6538, "train/total_loss": 0.19860351085662842 }, { "entropy": 9.445873260498047, "epoch": 0.6465295629820051, "mean_token_accuracy": 0.8164557218551636, "num_tokens": 13210030.0, "step": 6539, "train/ce_loss": 1.4183518886566162 }, { "epoch": 0.6465295629820051, "step": 6539, "train/sim_loss": 0.01953125 }, { "epoch": 0.6465295629820051, "step": 6539, "train/total_loss": 0.16136644780635834 }, { "epoch": 0.6466284358315206, "grad_norm": 0.6027600765228271, "learning_rate": 8.385748899767592e-06, "loss": 0.1377, "step": 6540 }, { "entropy": 8.796388626098633, "epoch": 0.6466284358315206, "mean_token_accuracy": 0.7493261694908142, "num_tokens": 13215144.0, "step": 6540, "train/ce_loss": 1.2235296964645386 }, { "epoch": 0.6466284358315206, "step": 6540, "train/sim_loss": 0.078125 }, { "epoch": 0.6466284358315206, "step": 6540, "train/total_loss": 0.2004779726266861 }, { "entropy": 9.158843994140625, "epoch": 0.6467273086810362, "mean_token_accuracy": 0.7662835121154785, "num_tokens": 13220099.0, "step": 6541, "train/ce_loss": 0.7735692262649536 }, { "epoch": 0.6467273086810362, "step": 6541, "train/sim_loss": 0.0859375 }, { "epoch": 0.6467273086810362, "step": 6541, "train/total_loss": 0.16329443454742432 }, { "entropy": 8.371902465820312, "epoch": 0.6468261815305517, "mean_token_accuracy": 0.6707921028137207, "num_tokens": 13225390.0, "step": 6542, "train/ce_loss": 0.6213967204093933 }, { "epoch": 0.6468261815305517, "step": 6542, "train/sim_loss": 0.0625 }, { "epoch": 0.6468261815305517, "step": 6542, "train/total_loss": 0.12463967502117157 }, { "entropy": 9.107194900512695, "epoch": 0.6469250543800672, "mean_token_accuracy": 0.7919161915779114, "num_tokens": 13230499.0, "step": 6543, "train/ce_loss": 1.2945265769958496 }, { "epoch": 0.6469250543800672, "step": 6543, "train/sim_loss": 0.0859375 }, { "epoch": 0.6469250543800672, "step": 6543, "train/total_loss": 0.2153901606798172 }, { "entropy": 8.621914863586426, "epoch": 0.6470239272295828, "mean_token_accuracy": 0.7265536785125732, "num_tokens": 13235845.0, "step": 6544, "train/ce_loss": 0.813605546951294 }, { "epoch": 0.6470239272295828, "step": 6544, "train/sim_loss": 0.03125 }, { "epoch": 0.6470239272295828, "step": 6544, "train/total_loss": 0.11261055618524551 }, { "entropy": 8.217103958129883, "epoch": 0.6471228000790983, "mean_token_accuracy": 0.6928229928016663, "num_tokens": 13241403.0, "step": 6545, "train/ce_loss": 1.2843297719955444 }, { "epoch": 0.6471228000790983, "step": 6545, "train/sim_loss": 0.078125 }, { "epoch": 0.6471228000790983, "step": 6545, "train/total_loss": 0.2065579742193222 }, { "entropy": 8.447120666503906, "epoch": 0.6472216729286138, "mean_token_accuracy": 0.8052356243133545, "num_tokens": 13246812.0, "step": 6546, "train/ce_loss": 0.758453905582428 }, { "epoch": 0.6472216729286138, "step": 6546, "train/sim_loss": 0.06640625 }, { "epoch": 0.6472216729286138, "step": 6546, "train/total_loss": 0.1422516405582428 }, { "entropy": 8.966032981872559, "epoch": 0.6473205457781294, "mean_token_accuracy": 0.6984333992004395, "num_tokens": 13252015.0, "step": 6547, "train/ce_loss": 0.8154184818267822 }, { "epoch": 0.6473205457781294, "step": 6547, "train/sim_loss": 0.0546875 }, { "epoch": 0.6473205457781294, "step": 6547, "train/total_loss": 0.13622935116291046 }, { "entropy": 8.51400375366211, "epoch": 0.6474194186276448, "mean_token_accuracy": 0.7518636584281921, "num_tokens": 13257630.0, "step": 6548, "train/ce_loss": 0.7422522306442261 }, { "epoch": 0.6474194186276448, "step": 6548, "train/sim_loss": 0.08984375 }, { "epoch": 0.6474194186276448, "step": 6548, "train/total_loss": 0.16406896710395813 }, { "entropy": 9.053725242614746, "epoch": 0.6475182914771603, "mean_token_accuracy": 0.7325383424758911, "num_tokens": 13262667.0, "step": 6549, "train/ce_loss": 0.7162719964981079 }, { "epoch": 0.6475182914771603, "step": 6549, "train/sim_loss": 0.0390625 }, { "epoch": 0.6475182914771603, "step": 6549, "train/total_loss": 0.11068969964981079 }, { "entropy": 8.683499336242676, "epoch": 0.6476171643266759, "mean_token_accuracy": 0.8201342225074768, "num_tokens": 13267911.0, "step": 6550, "train/ce_loss": 0.8026498556137085 }, { "epoch": 0.6476171643266759, "step": 6550, "train/sim_loss": 0.046875 }, { "epoch": 0.6476171643266759, "step": 6550, "train/total_loss": 0.12713998556137085 }, { "entropy": 8.909549713134766, "epoch": 0.6477160371761914, "mean_token_accuracy": 0.8012422323226929, "num_tokens": 13273067.0, "step": 6551, "train/ce_loss": 0.8117330074310303 }, { "epoch": 0.6477160371761914, "step": 6551, "train/sim_loss": 0.015625 }, { "epoch": 0.6477160371761914, "step": 6551, "train/total_loss": 0.09679830074310303 }, { "entropy": 8.517176628112793, "epoch": 0.6478149100257069, "mean_token_accuracy": 0.7692307829856873, "num_tokens": 13278501.0, "step": 6552, "train/ce_loss": 0.9131290316581726 }, { "epoch": 0.6478149100257069, "step": 6552, "train/sim_loss": 0.140625 }, { "epoch": 0.6478149100257069, "step": 6552, "train/total_loss": 0.23193791508674622 }, { "entropy": 8.51297378540039, "epoch": 0.6479137828752225, "mean_token_accuracy": 0.7438370585441589, "num_tokens": 13284024.0, "step": 6553, "train/ce_loss": 0.7460567355155945 }, { "epoch": 0.6479137828752225, "step": 6553, "train/sim_loss": 0.0546875 }, { "epoch": 0.6479137828752225, "step": 6553, "train/total_loss": 0.12929317355155945 }, { "entropy": 8.837274551391602, "epoch": 0.648012655724738, "mean_token_accuracy": 0.7346938848495483, "num_tokens": 13289135.0, "step": 6554, "train/ce_loss": 1.1686605215072632 }, { "epoch": 0.648012655724738, "step": 6554, "train/sim_loss": 0.0703125 }, { "epoch": 0.648012655724738, "step": 6554, "train/total_loss": 0.18717855215072632 }, { "entropy": 8.470785140991211, "epoch": 0.6481115285742535, "mean_token_accuracy": 0.7346723079681396, "num_tokens": 13294540.0, "step": 6555, "train/ce_loss": 1.2083443403244019 }, { "epoch": 0.6481115285742535, "step": 6555, "train/sim_loss": 0.0390625 }, { "epoch": 0.6481115285742535, "step": 6555, "train/total_loss": 0.15989693999290466 }, { "entropy": 8.662359237670898, "epoch": 0.6482104014237691, "mean_token_accuracy": 0.6998841166496277, "num_tokens": 13299849.0, "step": 6556, "train/ce_loss": 0.5744127035140991 }, { "epoch": 0.6482104014237691, "step": 6556, "train/sim_loss": 0.0625 }, { "epoch": 0.6482104014237691, "step": 6556, "train/total_loss": 0.11994127184152603 }, { "entropy": 8.591453552246094, "epoch": 0.6483092742732846, "mean_token_accuracy": 0.7582417726516724, "num_tokens": 13305145.0, "step": 6557, "train/ce_loss": 0.6765773892402649 }, { "epoch": 0.6483092742732846, "step": 6557, "train/sim_loss": 0.03125 }, { "epoch": 0.6483092742732846, "step": 6557, "train/total_loss": 0.09890773892402649 }, { "entropy": 8.676153182983398, "epoch": 0.6484081471228, "mean_token_accuracy": 0.7421965599060059, "num_tokens": 13310466.0, "step": 6558, "train/ce_loss": 0.8314873576164246 }, { "epoch": 0.6484081471228, "step": 6558, "train/sim_loss": 0.0625 }, { "epoch": 0.6484081471228, "step": 6558, "train/total_loss": 0.1456487476825714 }, { "entropy": 9.11447525024414, "epoch": 0.6485070199723156, "mean_token_accuracy": 0.7424749135971069, "num_tokens": 13315514.0, "step": 6559, "train/ce_loss": 2.266011279061786e-06 }, { "epoch": 0.6485070199723156, "step": 6559, "train/sim_loss": 0.03125 }, { "epoch": 0.6485070199723156, "step": 6559, "train/total_loss": 0.031250227242708206 }, { "epoch": 0.6486058928218311, "grad_norm": 0.6868183016777039, "learning_rate": 8.380804035009642e-06, "loss": 0.1334, "step": 6560 }, { "entropy": 9.356553077697754, "epoch": 0.6486058928218311, "mean_token_accuracy": 0.747706413269043, "num_tokens": 13320406.0, "step": 6560, "train/ce_loss": 1.664231300354004 }, { "epoch": 0.6486058928218311, "step": 6560, "train/sim_loss": 0.06640625 }, { "epoch": 0.6486058928218311, "step": 6560, "train/total_loss": 0.23282937705516815 }, { "entropy": 9.0035400390625, "epoch": 0.6487047656713466, "mean_token_accuracy": 0.7130919098854065, "num_tokens": 13325561.0, "step": 6561, "train/ce_loss": 1.0942925214767456 }, { "epoch": 0.6487047656713466, "step": 6561, "train/sim_loss": 0.03125 }, { "epoch": 0.6487047656713466, "step": 6561, "train/total_loss": 0.1406792551279068 }, { "entropy": 8.760725021362305, "epoch": 0.6488036385208622, "mean_token_accuracy": 0.7328671216964722, "num_tokens": 13330740.0, "step": 6562, "train/ce_loss": 1.063631296157837 }, { "epoch": 0.6488036385208622, "step": 6562, "train/sim_loss": 0.08203125 }, { "epoch": 0.6488036385208622, "step": 6562, "train/total_loss": 0.18839438259601593 }, { "entropy": 8.661602020263672, "epoch": 0.6489025113703777, "mean_token_accuracy": 0.7060975432395935, "num_tokens": 13336050.0, "step": 6563, "train/ce_loss": 1.7457458972930908 }, { "epoch": 0.6489025113703777, "step": 6563, "train/sim_loss": 0.04296875 }, { "epoch": 0.6489025113703777, "step": 6563, "train/total_loss": 0.2175433486700058 }, { "entropy": 8.36601448059082, "epoch": 0.6490013842198932, "mean_token_accuracy": 0.6859323978424072, "num_tokens": 13341495.0, "step": 6564, "train/ce_loss": 0.8401528596878052 }, { "epoch": 0.6490013842198932, "step": 6564, "train/sim_loss": 0.078125 }, { "epoch": 0.6490013842198932, "step": 6564, "train/total_loss": 0.16214028000831604 }, { "entropy": 8.529932022094727, "epoch": 0.6491002570694088, "mean_token_accuracy": 0.7721354365348816, "num_tokens": 13346721.0, "step": 6565, "train/ce_loss": 0.5375370383262634 }, { "epoch": 0.6491002570694088, "step": 6565, "train/sim_loss": 0.0546875 }, { "epoch": 0.6491002570694088, "step": 6565, "train/total_loss": 0.10844120383262634 }, { "entropy": 8.754827499389648, "epoch": 0.6491991299189243, "mean_token_accuracy": 0.6712141633033752, "num_tokens": 13351907.0, "step": 6566, "train/ce_loss": 1.1823458671569824 }, { "epoch": 0.6491991299189243, "step": 6566, "train/sim_loss": 0.0625 }, { "epoch": 0.6491991299189243, "step": 6566, "train/total_loss": 0.18073458969593048 }, { "entropy": 9.000566482543945, "epoch": 0.6492980027684397, "mean_token_accuracy": 0.7286096215248108, "num_tokens": 13357104.0, "step": 6567, "train/ce_loss": 0.52986079454422 }, { "epoch": 0.6492980027684397, "step": 6567, "train/sim_loss": 0.05078125 }, { "epoch": 0.6492980027684397, "step": 6567, "train/total_loss": 0.10376733541488647 }, { "entropy": 8.475728034973145, "epoch": 0.6493968756179553, "mean_token_accuracy": 0.7357001900672913, "num_tokens": 13362637.0, "step": 6568, "train/ce_loss": 0.8425010442733765 }, { "epoch": 0.6493968756179553, "step": 6568, "train/sim_loss": 0.0859375 }, { "epoch": 0.6493968756179553, "step": 6568, "train/total_loss": 0.17018760740756989 }, { "entropy": 8.903318405151367, "epoch": 0.6494957484674708, "mean_token_accuracy": 0.7919161915779114, "num_tokens": 13367755.0, "step": 6569, "train/ce_loss": 1.1492321618788992e-06 }, { "epoch": 0.6494957484674708, "step": 6569, "train/sim_loss": 0.03125 }, { "epoch": 0.6494957484674708, "step": 6569, "train/total_loss": 0.03125011548399925 }, { "entropy": 8.71034049987793, "epoch": 0.6495946213169863, "mean_token_accuracy": 0.8113924264907837, "num_tokens": 13373034.0, "step": 6570, "train/ce_loss": 0.5691157579421997 }, { "epoch": 0.6495946213169863, "step": 6570, "train/sim_loss": 0.03125 }, { "epoch": 0.6495946213169863, "step": 6570, "train/total_loss": 0.08816157281398773 }, { "entropy": 9.267087936401367, "epoch": 0.6496934941665019, "mean_token_accuracy": 0.7537091970443726, "num_tokens": 13377814.0, "step": 6571, "train/ce_loss": 5.840172434545821e-06 }, { "epoch": 0.6496934941665019, "step": 6571, "train/sim_loss": 0.0234375 }, { "epoch": 0.6496934941665019, "step": 6571, "train/total_loss": 0.02343808487057686 }, { "entropy": 8.709084510803223, "epoch": 0.6497923670160174, "mean_token_accuracy": 0.7563804984092712, "num_tokens": 13383181.0, "step": 6572, "train/ce_loss": 0.5205847024917603 }, { "epoch": 0.6497923670160174, "step": 6572, "train/sim_loss": 0.03515625 }, { "epoch": 0.6497923670160174, "step": 6572, "train/total_loss": 0.08721472322940826 }, { "entropy": 8.620328903198242, "epoch": 0.6498912398655329, "mean_token_accuracy": 0.6743383407592773, "num_tokens": 13388552.0, "step": 6573, "train/ce_loss": 1.2111564874649048 }, { "epoch": 0.6498912398655329, "step": 6573, "train/sim_loss": 0.0703125 }, { "epoch": 0.6498912398655329, "step": 6573, "train/total_loss": 0.19142815470695496 }, { "entropy": 8.471712112426758, "epoch": 0.6499901127150485, "mean_token_accuracy": 0.6955530047416687, "num_tokens": 13393887.0, "step": 6574, "train/ce_loss": 0.9428759813308716 }, { "epoch": 0.6499901127150485, "step": 6574, "train/sim_loss": 0.06640625 }, { "epoch": 0.6499901127150485, "step": 6574, "train/total_loss": 0.16069385409355164 }, { "entropy": 8.578201293945312, "epoch": 0.650088985564564, "mean_token_accuracy": 0.7132075428962708, "num_tokens": 13399164.0, "step": 6575, "train/ce_loss": 0.6358181834220886 }, { "epoch": 0.650088985564564, "step": 6575, "train/sim_loss": 0.08203125 }, { "epoch": 0.650088985564564, "step": 6575, "train/total_loss": 0.14561307430267334 }, { "entropy": 8.559331893920898, "epoch": 0.6501878584140794, "mean_token_accuracy": 0.7372340559959412, "num_tokens": 13404543.0, "step": 6576, "train/ce_loss": 1.044264793395996 }, { "epoch": 0.6501878584140794, "step": 6576, "train/sim_loss": 0.04296875 }, { "epoch": 0.6501878584140794, "step": 6576, "train/total_loss": 0.14739522337913513 }, { "entropy": 8.661439895629883, "epoch": 0.650286731263595, "mean_token_accuracy": 0.7629629373550415, "num_tokens": 13409931.0, "step": 6577, "train/ce_loss": 0.8685452342033386 }, { "epoch": 0.650286731263595, "step": 6577, "train/sim_loss": 0.0546875 }, { "epoch": 0.650286731263595, "step": 6577, "train/total_loss": 0.14154201745986938 }, { "entropy": 8.24040699005127, "epoch": 0.6503856041131105, "mean_token_accuracy": 0.7872582674026489, "num_tokens": 13415295.0, "step": 6578, "train/ce_loss": 0.36787208914756775 }, { "epoch": 0.6503856041131105, "step": 6578, "train/sim_loss": 0.03125 }, { "epoch": 0.6503856041131105, "step": 6578, "train/total_loss": 0.06803721189498901 }, { "entropy": 8.668405532836914, "epoch": 0.650484476962626, "mean_token_accuracy": 0.7052767276763916, "num_tokens": 13420605.0, "step": 6579, "train/ce_loss": 1.0733392238616943 }, { "epoch": 0.650484476962626, "step": 6579, "train/sim_loss": 0.04296875 }, { "epoch": 0.650484476962626, "step": 6579, "train/total_loss": 0.1503026783466339 }, { "epoch": 0.6505833498121416, "grad_norm": 0.765305757522583, "learning_rate": 8.375859170251695e-06, "loss": 0.1413, "step": 6580 }, { "entropy": 8.656425476074219, "epoch": 0.6505833498121416, "mean_token_accuracy": 0.7190876603126526, "num_tokens": 13425946.0, "step": 6580, "train/ce_loss": 0.8743836283683777 }, { "epoch": 0.6505833498121416, "step": 6580, "train/sim_loss": 0.0703125 }, { "epoch": 0.6505833498121416, "step": 6580, "train/total_loss": 0.15775087475776672 }, { "entropy": 8.495885848999023, "epoch": 0.6506822226616571, "mean_token_accuracy": 0.7882927060127258, "num_tokens": 13431438.0, "step": 6581, "train/ce_loss": 0.6086604595184326 }, { "epoch": 0.6506822226616571, "step": 6581, "train/sim_loss": 0.07421875 }, { "epoch": 0.6506822226616571, "step": 6581, "train/total_loss": 0.13508479297161102 }, { "entropy": 9.197243690490723, "epoch": 0.6507810955111726, "mean_token_accuracy": 0.7380136847496033, "num_tokens": 13436441.0, "step": 6582, "train/ce_loss": 0.9340671896934509 }, { "epoch": 0.6507810955111726, "step": 6582, "train/sim_loss": 0.04296875 }, { "epoch": 0.6507810955111726, "step": 6582, "train/total_loss": 0.13637547194957733 }, { "entropy": 8.552262306213379, "epoch": 0.6508799683606882, "mean_token_accuracy": 0.7147766351699829, "num_tokens": 13441800.0, "step": 6583, "train/ce_loss": 1.2501468658447266 }, { "epoch": 0.6508799683606882, "step": 6583, "train/sim_loss": 0.046875 }, { "epoch": 0.6508799683606882, "step": 6583, "train/total_loss": 0.17188969254493713 }, { "entropy": 8.310001373291016, "epoch": 0.6509788412102037, "mean_token_accuracy": 0.7433722019195557, "num_tokens": 13447242.0, "step": 6584, "train/ce_loss": 0.8037084937095642 }, { "epoch": 0.6509788412102037, "step": 6584, "train/sim_loss": 0.046875 }, { "epoch": 0.6509788412102037, "step": 6584, "train/total_loss": 0.12724584341049194 }, { "entropy": 8.924371719360352, "epoch": 0.6510777140597191, "mean_token_accuracy": 0.6680107712745667, "num_tokens": 13452410.0, "step": 6585, "train/ce_loss": 1.5992698669433594 }, { "epoch": 0.6510777140597191, "step": 6585, "train/sim_loss": 0.0546875 }, { "epoch": 0.6510777140597191, "step": 6585, "train/total_loss": 0.21461449563503265 }, { "entropy": 8.38753890991211, "epoch": 0.6511765869092347, "mean_token_accuracy": 0.7152103781700134, "num_tokens": 13457842.0, "step": 6586, "train/ce_loss": 0.8172120451927185 }, { "epoch": 0.6511765869092347, "step": 6586, "train/sim_loss": 0.04296875 }, { "epoch": 0.6511765869092347, "step": 6586, "train/total_loss": 0.12468995898962021 }, { "entropy": 8.405166625976562, "epoch": 0.6512754597587502, "mean_token_accuracy": 0.687637984752655, "num_tokens": 13463232.0, "step": 6587, "train/ce_loss": 0.8654608130455017 }, { "epoch": 0.6512754597587502, "step": 6587, "train/sim_loss": 0.09765625 }, { "epoch": 0.6512754597587502, "step": 6587, "train/total_loss": 0.18420234322547913 }, { "entropy": 8.861724853515625, "epoch": 0.6513743326082657, "mean_token_accuracy": 0.7378516793251038, "num_tokens": 13468498.0, "step": 6588, "train/ce_loss": 0.8934001326560974 }, { "epoch": 0.6513743326082657, "step": 6588, "train/sim_loss": 0.0625 }, { "epoch": 0.6513743326082657, "step": 6588, "train/total_loss": 0.15184001624584198 }, { "entropy": 8.825806617736816, "epoch": 0.6514732054577813, "mean_token_accuracy": 0.7155388593673706, "num_tokens": 13473776.0, "step": 6589, "train/ce_loss": 0.9997677803039551 }, { "epoch": 0.6514732054577813, "step": 6589, "train/sim_loss": 0.08984375 }, { "epoch": 0.6514732054577813, "step": 6589, "train/total_loss": 0.1898205280303955 }, { "entropy": 8.642440795898438, "epoch": 0.6515720783072968, "mean_token_accuracy": 0.7457212805747986, "num_tokens": 13479071.0, "step": 6590, "train/ce_loss": 0.771452784538269 }, { "epoch": 0.6515720783072968, "step": 6590, "train/sim_loss": 0.078125 }, { "epoch": 0.6515720783072968, "step": 6590, "train/total_loss": 0.1552702784538269 }, { "entropy": 8.950058937072754, "epoch": 0.6516709511568124, "mean_token_accuracy": 0.7306064963340759, "num_tokens": 13484300.0, "step": 6591, "train/ce_loss": 1.9471641280688345e-05 }, { "epoch": 0.6516709511568124, "step": 6591, "train/sim_loss": 0.05078125 }, { "epoch": 0.6516709511568124, "step": 6591, "train/total_loss": 0.050783198326826096 }, { "entropy": 9.265976905822754, "epoch": 0.6517698240063279, "mean_token_accuracy": 0.7322404384613037, "num_tokens": 13489275.0, "step": 6592, "train/ce_loss": 1.37646484375 }, { "epoch": 0.6517698240063279, "step": 6592, "train/sim_loss": 0.078125 }, { "epoch": 0.6517698240063279, "step": 6592, "train/total_loss": 0.21577148139476776 }, { "entropy": 9.448829650878906, "epoch": 0.6518686968558434, "mean_token_accuracy": 0.7547974586486816, "num_tokens": 13494129.0, "step": 6593, "train/ce_loss": 1.212557077407837 }, { "epoch": 0.6518686968558434, "step": 6593, "train/sim_loss": 0.03515625 }, { "epoch": 0.6518686968558434, "step": 6593, "train/total_loss": 0.15641196072101593 }, { "entropy": 9.146718978881836, "epoch": 0.651967569705359, "mean_token_accuracy": 0.7548291087150574, "num_tokens": 13499291.0, "step": 6594, "train/ce_loss": 0.8624992966651917 }, { "epoch": 0.651967569705359, "step": 6594, "train/sim_loss": 0.015625 }, { "epoch": 0.651967569705359, "step": 6594, "train/total_loss": 0.1018749326467514 }, { "entropy": 8.7354154586792, "epoch": 0.6520664425548744, "mean_token_accuracy": 0.7263157963752747, "num_tokens": 13504541.0, "step": 6595, "train/ce_loss": 0.8909603953361511 }, { "epoch": 0.6520664425548744, "step": 6595, "train/sim_loss": 0.02734375 }, { "epoch": 0.6520664425548744, "step": 6595, "train/total_loss": 0.11643978953361511 }, { "entropy": 9.494924545288086, "epoch": 0.6521653154043899, "mean_token_accuracy": 0.7104557752609253, "num_tokens": 13509364.0, "step": 6596, "train/ce_loss": 3.237225246266462e-05 }, { "epoch": 0.6521653154043899, "step": 6596, "train/sim_loss": 0.046875 }, { "epoch": 0.6521653154043899, "step": 6596, "train/total_loss": 0.04687823727726936 }, { "entropy": 8.990909576416016, "epoch": 0.6522641882539055, "mean_token_accuracy": 0.7382671236991882, "num_tokens": 13514338.0, "step": 6597, "train/ce_loss": 5.00813303005998e-06 }, { "epoch": 0.6522641882539055, "step": 6597, "train/sim_loss": 0.05078125 }, { "epoch": 0.6522641882539055, "step": 6597, "train/total_loss": 0.050781749188899994 }, { "entropy": 9.145400047302246, "epoch": 0.652363061103421, "mean_token_accuracy": 0.7071547508239746, "num_tokens": 13519384.0, "step": 6598, "train/ce_loss": 0.9238376617431641 }, { "epoch": 0.652363061103421, "step": 6598, "train/sim_loss": 0.03515625 }, { "epoch": 0.652363061103421, "step": 6598, "train/total_loss": 0.12754002213478088 }, { "entropy": 8.485532760620117, "epoch": 0.6524619339529365, "mean_token_accuracy": 0.8067520260810852, "num_tokens": 13524707.0, "step": 6599, "train/ce_loss": 0.9799271821975708 }, { "epoch": 0.6524619339529365, "step": 6599, "train/sim_loss": 0.0546875 }, { "epoch": 0.6524619339529365, "step": 6599, "train/total_loss": 0.15268021821975708 }, { "epoch": 0.6525608068024521, "grad_norm": 0.6757383942604065, "learning_rate": 8.370914305493745e-06, "loss": 0.143, "step": 6600 }, { "entropy": 8.593257904052734, "epoch": 0.6525608068024521, "mean_token_accuracy": 0.7207586765289307, "num_tokens": 13530165.0, "step": 6600, "train/ce_loss": 2.1290764808654785 }, { "epoch": 0.6525608068024521, "step": 6600, "train/sim_loss": 0.07421875 }, { "epoch": 0.6525608068024521, "step": 6600, "train/total_loss": 0.28712642192840576 }, { "entropy": 8.746946334838867, "epoch": 0.6526596796519676, "mean_token_accuracy": 0.7280187606811523, "num_tokens": 13535521.0, "step": 6601, "train/ce_loss": 0.9030402302742004 }, { "epoch": 0.6526596796519676, "step": 6601, "train/sim_loss": 0.03125 }, { "epoch": 0.6526596796519676, "step": 6601, "train/total_loss": 0.12155402451753616 }, { "entropy": 8.988773345947266, "epoch": 0.6527585525014831, "mean_token_accuracy": 0.7064343094825745, "num_tokens": 13540744.0, "step": 6602, "train/ce_loss": 0.9543810486793518 }, { "epoch": 0.6527585525014831, "step": 6602, "train/sim_loss": 0.0625 }, { "epoch": 0.6527585525014831, "step": 6602, "train/total_loss": 0.15793810784816742 }, { "entropy": 9.077032089233398, "epoch": 0.6528574253509987, "mean_token_accuracy": 0.7015113234519958, "num_tokens": 13546016.0, "step": 6603, "train/ce_loss": 0.969390332698822 }, { "epoch": 0.6528574253509987, "step": 6603, "train/sim_loss": 0.0625 }, { "epoch": 0.6528574253509987, "step": 6603, "train/total_loss": 0.15943902730941772 }, { "entropy": 8.732562065124512, "epoch": 0.6529562982005142, "mean_token_accuracy": 0.7128927707672119, "num_tokens": 13551411.0, "step": 6604, "train/ce_loss": 1.147282600402832 }, { "epoch": 0.6529562982005142, "step": 6604, "train/sim_loss": 0.0625 }, { "epoch": 0.6529562982005142, "step": 6604, "train/total_loss": 0.17722827196121216 }, { "entropy": 8.944042205810547, "epoch": 0.6530551710500296, "mean_token_accuracy": 0.7874214053153992, "num_tokens": 13556656.0, "step": 6605, "train/ce_loss": 0.3569697141647339 }, { "epoch": 0.6530551710500296, "step": 6605, "train/sim_loss": 0.01953125 }, { "epoch": 0.6530551710500296, "step": 6605, "train/total_loss": 0.05522822216153145 }, { "entropy": 9.391716957092285, "epoch": 0.6531540438995452, "mean_token_accuracy": 0.6883116960525513, "num_tokens": 13561482.0, "step": 6606, "train/ce_loss": 9.041209705173969e-05 }, { "epoch": 0.6531540438995452, "step": 6606, "train/sim_loss": 0.046875 }, { "epoch": 0.6531540438995452, "step": 6606, "train/total_loss": 0.04688404127955437 }, { "entropy": 8.47945785522461, "epoch": 0.6532529167490607, "mean_token_accuracy": 0.7885228395462036, "num_tokens": 13566904.0, "step": 6607, "train/ce_loss": 0.5496372580528259 }, { "epoch": 0.6532529167490607, "step": 6607, "train/sim_loss": 0.046875 }, { "epoch": 0.6532529167490607, "step": 6607, "train/total_loss": 0.10183872282505035 }, { "entropy": 8.949787139892578, "epoch": 0.6533517895985762, "mean_token_accuracy": 0.6934523582458496, "num_tokens": 13572036.0, "step": 6608, "train/ce_loss": 0.8607617616653442 }, { "epoch": 0.6533517895985762, "step": 6608, "train/sim_loss": 0.0546875 }, { "epoch": 0.6533517895985762, "step": 6608, "train/total_loss": 0.14076367020606995 }, { "entropy": 8.818286895751953, "epoch": 0.6534506624480918, "mean_token_accuracy": 0.6873508095741272, "num_tokens": 13577314.0, "step": 6609, "train/ce_loss": 0.4437519609928131 }, { "epoch": 0.6534506624480918, "step": 6609, "train/sim_loss": 0.02734375 }, { "epoch": 0.6534506624480918, "step": 6609, "train/total_loss": 0.07171894609928131 }, { "entropy": 8.649569511413574, "epoch": 0.6535495352976073, "mean_token_accuracy": 0.7800776362419128, "num_tokens": 13582591.0, "step": 6610, "train/ce_loss": 0.5533372163772583 }, { "epoch": 0.6535495352976073, "step": 6610, "train/sim_loss": 0.0390625 }, { "epoch": 0.6535495352976073, "step": 6610, "train/total_loss": 0.09439621865749359 }, { "entropy": 8.685169219970703, "epoch": 0.6536484081471228, "mean_token_accuracy": 0.7140974998474121, "num_tokens": 13587857.0, "step": 6611, "train/ce_loss": 0.3878539204597473 }, { "epoch": 0.6536484081471228, "step": 6611, "train/sim_loss": 0.046875 }, { "epoch": 0.6536484081471228, "step": 6611, "train/total_loss": 0.08566039800643921 }, { "entropy": 8.785577774047852, "epoch": 0.6537472809966384, "mean_token_accuracy": 0.7546099424362183, "num_tokens": 13593052.0, "step": 6612, "train/ce_loss": 0.5305328369140625 }, { "epoch": 0.6537472809966384, "step": 6612, "train/sim_loss": 0.05859375 }, { "epoch": 0.6537472809966384, "step": 6612, "train/total_loss": 0.11164703965187073 }, { "entropy": 9.1278076171875, "epoch": 0.6538461538461539, "mean_token_accuracy": 0.7575187683105469, "num_tokens": 13598061.0, "step": 6613, "train/ce_loss": 0.8386176824569702 }, { "epoch": 0.6538461538461539, "step": 6613, "train/sim_loss": 0.0625 }, { "epoch": 0.6538461538461539, "step": 6613, "train/total_loss": 0.14636176824569702 }, { "entropy": 9.5991792678833, "epoch": 0.6539450266956693, "mean_token_accuracy": 0.7457143068313599, "num_tokens": 13602844.0, "step": 6614, "train/ce_loss": 6.30193535471335e-05 }, { "epoch": 0.6539450266956693, "step": 6614, "train/sim_loss": 0.03515625 }, { "epoch": 0.6539450266956693, "step": 6614, "train/total_loss": 0.035162553191185 }, { "entropy": 8.444253921508789, "epoch": 0.6540438995451849, "mean_token_accuracy": 0.7926221489906311, "num_tokens": 13608378.0, "step": 6615, "train/ce_loss": 0.5664340257644653 }, { "epoch": 0.6540438995451849, "step": 6615, "train/sim_loss": 0.01953125 }, { "epoch": 0.6540438995451849, "step": 6615, "train/total_loss": 0.07617465406656265 }, { "entropy": 8.906087875366211, "epoch": 0.6541427723947004, "mean_token_accuracy": 0.7394366264343262, "num_tokens": 13613689.0, "step": 6616, "train/ce_loss": 1.4852736285320134e-06 }, { "epoch": 0.6541427723947004, "step": 6616, "train/sim_loss": 0.02734375 }, { "epoch": 0.6541427723947004, "step": 6616, "train/total_loss": 0.02734389901161194 }, { "entropy": 9.077184677124023, "epoch": 0.6542416452442159, "mean_token_accuracy": 0.7466266751289368, "num_tokens": 13618816.0, "step": 6617, "train/ce_loss": 1.4459218978881836 }, { "epoch": 0.6542416452442159, "step": 6617, "train/sim_loss": 0.12109375 }, { "epoch": 0.6542416452442159, "step": 6617, "train/total_loss": 0.26568594574928284 }, { "entropy": 9.16096305847168, "epoch": 0.6543405180937315, "mean_token_accuracy": 0.7279279232025146, "num_tokens": 13623874.0, "step": 6618, "train/ce_loss": 1.1337878277117852e-05 }, { "epoch": 0.6543405180937315, "step": 6618, "train/sim_loss": 0.05859375 }, { "epoch": 0.6543405180937315, "step": 6618, "train/total_loss": 0.05859488248825073 }, { "entropy": 9.032449722290039, "epoch": 0.654439390943247, "mean_token_accuracy": 0.7461773753166199, "num_tokens": 13628992.0, "step": 6619, "train/ce_loss": 1.344244122505188 }, { "epoch": 0.654439390943247, "step": 6619, "train/sim_loss": 0.11328125 }, { "epoch": 0.654439390943247, "step": 6619, "train/total_loss": 0.24770566821098328 }, { "epoch": 0.6545382637927625, "grad_norm": 0.7162269353866577, "learning_rate": 8.365969440735796e-06, "loss": 0.1417, "step": 6620 }, { "entropy": 8.860625267028809, "epoch": 0.6545382637927625, "mean_token_accuracy": 0.7525906562805176, "num_tokens": 13634159.0, "step": 6620, "train/ce_loss": 0.3801497220993042 }, { "epoch": 0.6545382637927625, "step": 6620, "train/sim_loss": 0.03125 }, { "epoch": 0.6545382637927625, "step": 6620, "train/total_loss": 0.0692649781703949 }, { "entropy": 8.876182556152344, "epoch": 0.6546371366422781, "mean_token_accuracy": 0.688249409198761, "num_tokens": 13639487.0, "step": 6621, "train/ce_loss": 0.8826169967651367 }, { "epoch": 0.6546371366422781, "step": 6621, "train/sim_loss": 0.05859375 }, { "epoch": 0.6546371366422781, "step": 6621, "train/total_loss": 0.1468554437160492 }, { "entropy": 9.362860679626465, "epoch": 0.6547360094917936, "mean_token_accuracy": 0.7879858613014221, "num_tokens": 13644483.0, "step": 6622, "train/ce_loss": 3.3644362247287063e-06 }, { "epoch": 0.6547360094917936, "step": 6622, "train/sim_loss": 0.05078125 }, { "epoch": 0.6547360094917936, "step": 6622, "train/total_loss": 0.05078158527612686 }, { "entropy": 8.86358642578125, "epoch": 0.654834882341309, "mean_token_accuracy": 0.7381545901298523, "num_tokens": 13649743.0, "step": 6623, "train/ce_loss": 0.5250521302223206 }, { "epoch": 0.654834882341309, "step": 6623, "train/sim_loss": 0.0390625 }, { "epoch": 0.654834882341309, "step": 6623, "train/total_loss": 0.09156771004199982 }, { "entropy": 8.727371215820312, "epoch": 0.6549337551908246, "mean_token_accuracy": 0.7717265486717224, "num_tokens": 13655058.0, "step": 6624, "train/ce_loss": 0.3160201609134674 }, { "epoch": 0.6549337551908246, "step": 6624, "train/sim_loss": 0.0390625 }, { "epoch": 0.6549337551908246, "step": 6624, "train/total_loss": 0.07066451758146286 }, { "entropy": 8.269651412963867, "epoch": 0.6550326280403401, "mean_token_accuracy": 0.7158300876617432, "num_tokens": 13660802.0, "step": 6625, "train/ce_loss": 0.7030523419380188 }, { "epoch": 0.6550326280403401, "step": 6625, "train/sim_loss": 0.0234375 }, { "epoch": 0.6550326280403401, "step": 6625, "train/total_loss": 0.093742735683918 }, { "entropy": 9.112279891967773, "epoch": 0.6551315008898556, "mean_token_accuracy": 0.7423312664031982, "num_tokens": 13665900.0, "step": 6626, "train/ce_loss": 1.0576657056808472 }, { "epoch": 0.6551315008898556, "step": 6626, "train/sim_loss": 0.05859375 }, { "epoch": 0.6551315008898556, "step": 6626, "train/total_loss": 0.16436031460762024 }, { "entropy": 8.331881523132324, "epoch": 0.6552303737393712, "mean_token_accuracy": 0.7643442749977112, "num_tokens": 13671380.0, "step": 6627, "train/ce_loss": 0.5838476419448853 }, { "epoch": 0.6552303737393712, "step": 6627, "train/sim_loss": 0.01953125 }, { "epoch": 0.6552303737393712, "step": 6627, "train/total_loss": 0.07791601121425629 }, { "entropy": 8.773751258850098, "epoch": 0.6553292465888867, "mean_token_accuracy": 0.7782964110374451, "num_tokens": 13676686.0, "step": 6628, "train/ce_loss": 0.771364152431488 }, { "epoch": 0.6553292465888867, "step": 6628, "train/sim_loss": 0.015625 }, { "epoch": 0.6553292465888867, "step": 6628, "train/total_loss": 0.09276141971349716 }, { "entropy": 9.157133102416992, "epoch": 0.6554281194384022, "mean_token_accuracy": 0.7109737396240234, "num_tokens": 13681782.0, "step": 6629, "train/ce_loss": 1.915988802909851 }, { "epoch": 0.6554281194384022, "step": 6629, "train/sim_loss": 0.06640625 }, { "epoch": 0.6554281194384022, "step": 6629, "train/total_loss": 0.25800514221191406 }, { "entropy": 9.058282852172852, "epoch": 0.6555269922879178, "mean_token_accuracy": 0.7451612949371338, "num_tokens": 13686850.0, "step": 6630, "train/ce_loss": 0.9509789943695068 }, { "epoch": 0.6555269922879178, "step": 6630, "train/sim_loss": 0.05078125 }, { "epoch": 0.6555269922879178, "step": 6630, "train/total_loss": 0.14587914943695068 }, { "entropy": 9.235963821411133, "epoch": 0.6556258651374333, "mean_token_accuracy": 0.7064220309257507, "num_tokens": 13691920.0, "step": 6631, "train/ce_loss": 1.46619713306427 }, { "epoch": 0.6556258651374333, "step": 6631, "train/sim_loss": 0.0546875 }, { "epoch": 0.6556258651374333, "step": 6631, "train/total_loss": 0.20130722224712372 }, { "entropy": 9.007339477539062, "epoch": 0.6557247379869487, "mean_token_accuracy": 0.7758620977401733, "num_tokens": 13697016.0, "step": 6632, "train/ce_loss": 0.962904691696167 }, { "epoch": 0.6557247379869487, "step": 6632, "train/sim_loss": 0.015625 }, { "epoch": 0.6557247379869487, "step": 6632, "train/total_loss": 0.1119154691696167 }, { "entropy": 8.575750350952148, "epoch": 0.6558236108364643, "mean_token_accuracy": 0.7450593113899231, "num_tokens": 13702497.0, "step": 6633, "train/ce_loss": 0.6679680347442627 }, { "epoch": 0.6558236108364643, "step": 6633, "train/sim_loss": 0.015625 }, { "epoch": 0.6558236108364643, "step": 6633, "train/total_loss": 0.08242180198431015 }, { "entropy": 8.739509582519531, "epoch": 0.6559224836859798, "mean_token_accuracy": 0.6725490093231201, "num_tokens": 13707967.0, "step": 6634, "train/ce_loss": 1.4301599264144897 }, { "epoch": 0.6559224836859798, "step": 6634, "train/sim_loss": 0.10546875 }, { "epoch": 0.6559224836859798, "step": 6634, "train/total_loss": 0.2484847456216812 }, { "entropy": 8.879558563232422, "epoch": 0.6560213565354953, "mean_token_accuracy": 0.7448275685310364, "num_tokens": 13713279.0, "step": 6635, "train/ce_loss": 1.0661180019378662 }, { "epoch": 0.6560213565354953, "step": 6635, "train/sim_loss": 0.0546875 }, { "epoch": 0.6560213565354953, "step": 6635, "train/total_loss": 0.16129930317401886 }, { "entropy": 8.58578872680664, "epoch": 0.6561202293850109, "mean_token_accuracy": 0.7660878300666809, "num_tokens": 13718709.0, "step": 6636, "train/ce_loss": 0.4772024154663086 }, { "epoch": 0.6561202293850109, "step": 6636, "train/sim_loss": 0.046875 }, { "epoch": 0.6561202293850109, "step": 6636, "train/total_loss": 0.09459523856639862 }, { "entropy": 8.64152717590332, "epoch": 0.6562191022345264, "mean_token_accuracy": 0.7270916104316711, "num_tokens": 13724209.0, "step": 6637, "train/ce_loss": 0.7879598736763 }, { "epoch": 0.6562191022345264, "step": 6637, "train/sim_loss": 0.11328125 }, { "epoch": 0.6562191022345264, "step": 6637, "train/total_loss": 0.19207724928855896 }, { "entropy": 8.790847778320312, "epoch": 0.6563179750840419, "mean_token_accuracy": 0.755359411239624, "num_tokens": 13729482.0, "step": 6638, "train/ce_loss": 0.8471806049346924 }, { "epoch": 0.6563179750840419, "step": 6638, "train/sim_loss": 0.0546875 }, { "epoch": 0.6563179750840419, "step": 6638, "train/total_loss": 0.13940556347370148 }, { "entropy": 8.453397750854492, "epoch": 0.6564168479335575, "mean_token_accuracy": 0.8231083750724792, "num_tokens": 13734927.0, "step": 6639, "train/ce_loss": 0.5777555704116821 }, { "epoch": 0.6564168479335575, "step": 6639, "train/sim_loss": 0.015625 }, { "epoch": 0.6564168479335575, "step": 6639, "train/total_loss": 0.07340055704116821 }, { "epoch": 0.656515720783073, "grad_norm": 0.5281113386154175, "learning_rate": 8.361024575977848e-06, "loss": 0.1352, "step": 6640 }, { "entropy": 8.344795227050781, "epoch": 0.656515720783073, "mean_token_accuracy": 0.7142857313156128, "num_tokens": 13740538.0, "step": 6640, "train/ce_loss": 1.1282984018325806 }, { "epoch": 0.656515720783073, "step": 6640, "train/sim_loss": 0.07421875 }, { "epoch": 0.656515720783073, "step": 6640, "train/total_loss": 0.18704858422279358 }, { "entropy": 9.365694046020508, "epoch": 0.6566145936325885, "mean_token_accuracy": 0.7252336740493774, "num_tokens": 13745421.0, "step": 6641, "train/ce_loss": 2.0379106998443604 }, { "epoch": 0.6566145936325885, "step": 6641, "train/sim_loss": 0.046875 }, { "epoch": 0.6566145936325885, "step": 6641, "train/total_loss": 0.250666081905365 }, { "entropy": 9.762799263000488, "epoch": 0.656713466482104, "mean_token_accuracy": 0.688034176826477, "num_tokens": 13750082.0, "step": 6642, "train/ce_loss": 4.048784255981445 }, { "epoch": 0.656713466482104, "step": 6642, "train/sim_loss": 0.03125 }, { "epoch": 0.656713466482104, "step": 6642, "train/total_loss": 0.4361284375190735 }, { "entropy": 8.665780067443848, "epoch": 0.6568123393316195, "mean_token_accuracy": 0.7496463656425476, "num_tokens": 13755510.0, "step": 6643, "train/ce_loss": 0.9824205040931702 }, { "epoch": 0.6568123393316195, "step": 6643, "train/sim_loss": 0.08984375 }, { "epoch": 0.6568123393316195, "step": 6643, "train/total_loss": 0.18808579444885254 }, { "entropy": 8.95378589630127, "epoch": 0.656911212181135, "mean_token_accuracy": 0.737864077091217, "num_tokens": 13760950.0, "step": 6644, "train/ce_loss": 0.618170440196991 }, { "epoch": 0.656911212181135, "step": 6644, "train/sim_loss": 0.0390625 }, { "epoch": 0.656911212181135, "step": 6644, "train/total_loss": 0.10087954998016357 }, { "entropy": 8.267267227172852, "epoch": 0.6570100850306506, "mean_token_accuracy": 0.6891766786575317, "num_tokens": 13766521.0, "step": 6645, "train/ce_loss": 0.5283001065254211 }, { "epoch": 0.6570100850306506, "step": 6645, "train/sim_loss": 0.04296875 }, { "epoch": 0.6570100850306506, "step": 6645, "train/total_loss": 0.09579876065254211 }, { "entropy": 8.782012939453125, "epoch": 0.6571089578801661, "mean_token_accuracy": 0.7252090573310852, "num_tokens": 13771810.0, "step": 6646, "train/ce_loss": 0.9844553470611572 }, { "epoch": 0.6571089578801661, "step": 6646, "train/sim_loss": 0.06640625 }, { "epoch": 0.6571089578801661, "step": 6646, "train/total_loss": 0.16485178470611572 }, { "entropy": 9.146493911743164, "epoch": 0.6572078307296816, "mean_token_accuracy": 0.7063252925872803, "num_tokens": 13776957.0, "step": 6647, "train/ce_loss": 0.8593490719795227 }, { "epoch": 0.6572078307296816, "step": 6647, "train/sim_loss": 0.1875 }, { "epoch": 0.6572078307296816, "step": 6647, "train/total_loss": 0.27343490719795227 }, { "entropy": 8.893928527832031, "epoch": 0.6573067035791972, "mean_token_accuracy": 0.7760097980499268, "num_tokens": 13782278.0, "step": 6648, "train/ce_loss": 1.0574133396148682 }, { "epoch": 0.6573067035791972, "step": 6648, "train/sim_loss": 0.0546875 }, { "epoch": 0.6573067035791972, "step": 6648, "train/total_loss": 0.16042883694171906 }, { "entropy": 8.998648643493652, "epoch": 0.6574055764287127, "mean_token_accuracy": 0.7454545497894287, "num_tokens": 13787487.0, "step": 6649, "train/ce_loss": 0.7707734107971191 }, { "epoch": 0.6574055764287127, "step": 6649, "train/sim_loss": 0.0703125 }, { "epoch": 0.6574055764287127, "step": 6649, "train/total_loss": 0.14738984405994415 }, { "entropy": 8.66889762878418, "epoch": 0.6575044492782282, "mean_token_accuracy": 0.8360995650291443, "num_tokens": 13792893.0, "step": 6650, "train/ce_loss": 0.8410592675209045 }, { "epoch": 0.6575044492782282, "step": 6650, "train/sim_loss": 0.0390625 }, { "epoch": 0.6575044492782282, "step": 6650, "train/total_loss": 0.12316843122243881 }, { "entropy": 8.6030912399292, "epoch": 0.6576033221277438, "mean_token_accuracy": 0.7995283007621765, "num_tokens": 13798209.0, "step": 6651, "train/ce_loss": 0.335406094789505 }, { "epoch": 0.6576033221277438, "step": 6651, "train/sim_loss": 0.01953125 }, { "epoch": 0.6576033221277438, "step": 6651, "train/total_loss": 0.05307186022400856 }, { "entropy": 8.725292205810547, "epoch": 0.6577021949772592, "mean_token_accuracy": 0.7394285798072815, "num_tokens": 13803603.0, "step": 6652, "train/ce_loss": 0.9964840412139893 }, { "epoch": 0.6577021949772592, "step": 6652, "train/sim_loss": 0.0703125 }, { "epoch": 0.6577021949772592, "step": 6652, "train/total_loss": 0.16996091604232788 }, { "entropy": 8.928272247314453, "epoch": 0.6578010678267747, "mean_token_accuracy": 0.7513889074325562, "num_tokens": 13808781.0, "step": 6653, "train/ce_loss": 0.7644780278205872 }, { "epoch": 0.6578010678267747, "step": 6653, "train/sim_loss": 0.09375 }, { "epoch": 0.6578010678267747, "step": 6653, "train/total_loss": 0.17019781470298767 }, { "entropy": 8.922335624694824, "epoch": 0.6578999406762903, "mean_token_accuracy": 0.7458279728889465, "num_tokens": 13814004.0, "step": 6654, "train/ce_loss": 0.4588564932346344 }, { "epoch": 0.6578999406762903, "step": 6654, "train/sim_loss": 0.04296875 }, { "epoch": 0.6578999406762903, "step": 6654, "train/total_loss": 0.08885440230369568 }, { "entropy": 9.43021011352539, "epoch": 0.6579988135258058, "mean_token_accuracy": 0.66847825050354, "num_tokens": 13818946.0, "step": 6655, "train/ce_loss": 2.87549187305558e-06 }, { "epoch": 0.6579988135258058, "step": 6655, "train/sim_loss": 0.01953125 }, { "epoch": 0.6579988135258058, "step": 6655, "train/total_loss": 0.01953153684735298 }, { "entropy": 8.475024223327637, "epoch": 0.6580976863753213, "mean_token_accuracy": 0.7195571660995483, "num_tokens": 13824213.0, "step": 6656, "train/ce_loss": 0.807493269443512 }, { "epoch": 0.6580976863753213, "step": 6656, "train/sim_loss": 0.03125 }, { "epoch": 0.6580976863753213, "step": 6656, "train/total_loss": 0.11199932545423508 }, { "entropy": 8.546957015991211, "epoch": 0.6581965592248369, "mean_token_accuracy": 0.7383177280426025, "num_tokens": 13829565.0, "step": 6657, "train/ce_loss": 0.5284896492958069 }, { "epoch": 0.6581965592248369, "step": 6657, "train/sim_loss": 0.05859375 }, { "epoch": 0.6581965592248369, "step": 6657, "train/total_loss": 0.11144271492958069 }, { "entropy": 9.003338813781738, "epoch": 0.6582954320743524, "mean_token_accuracy": 0.7419354915618896, "num_tokens": 13834659.0, "step": 6658, "train/ce_loss": 1.4813705682754517 }, { "epoch": 0.6582954320743524, "step": 6658, "train/sim_loss": 0.10546875 }, { "epoch": 0.6582954320743524, "step": 6658, "train/total_loss": 0.25360581278800964 }, { "entropy": 9.55681037902832, "epoch": 0.6583943049238679, "mean_token_accuracy": 0.6799163222312927, "num_tokens": 13839510.0, "step": 6659, "train/ce_loss": 1.2218444347381592 }, { "epoch": 0.6583943049238679, "step": 6659, "train/sim_loss": 0.04296875 }, { "epoch": 0.6583943049238679, "step": 6659, "train/total_loss": 0.16515320539474487 }, { "epoch": 0.6584931777733835, "grad_norm": 0.8593719601631165, "learning_rate": 8.356079711219898e-06, "loss": 0.1409, "step": 6660 }, { "entropy": 8.629777908325195, "epoch": 0.6584931777733835, "mean_token_accuracy": 0.7558494210243225, "num_tokens": 13844922.0, "step": 6660, "train/ce_loss": 0.8216527700424194 }, { "epoch": 0.6584931777733835, "step": 6660, "train/sim_loss": 0.03125 }, { "epoch": 0.6584931777733835, "step": 6660, "train/total_loss": 0.11341527849435806 }, { "entropy": 9.151214599609375, "epoch": 0.6585920506228989, "mean_token_accuracy": 0.7333333492279053, "num_tokens": 13850144.0, "step": 6661, "train/ce_loss": 0.5413286685943604 }, { "epoch": 0.6585920506228989, "step": 6661, "train/sim_loss": 0.04296875 }, { "epoch": 0.6585920506228989, "step": 6661, "train/total_loss": 0.0971016138792038 }, { "entropy": 8.66889762878418, "epoch": 0.6586909234724144, "mean_token_accuracy": 0.7402452826499939, "num_tokens": 13855484.0, "step": 6662, "train/ce_loss": 0.7611150741577148 }, { "epoch": 0.6586909234724144, "step": 6662, "train/sim_loss": 0.0234375 }, { "epoch": 0.6586909234724144, "step": 6662, "train/total_loss": 0.09954901039600372 }, { "entropy": 8.713974952697754, "epoch": 0.65878979632193, "mean_token_accuracy": 0.7663934230804443, "num_tokens": 13860949.0, "step": 6663, "train/ce_loss": 0.49784982204437256 }, { "epoch": 0.65878979632193, "step": 6663, "train/sim_loss": 0.07421875 }, { "epoch": 0.65878979632193, "step": 6663, "train/total_loss": 0.12400373816490173 }, { "entropy": 9.036233901977539, "epoch": 0.6588886691714455, "mean_token_accuracy": 0.7224025726318359, "num_tokens": 13866025.0, "step": 6664, "train/ce_loss": 3.2771895348560065e-06 }, { "epoch": 0.6588886691714455, "step": 6664, "train/sim_loss": 0.08203125 }, { "epoch": 0.6588886691714455, "step": 6664, "train/total_loss": 0.08203157782554626 }, { "entropy": 9.548892974853516, "epoch": 0.658987542020961, "mean_token_accuracy": 0.7231759428977966, "num_tokens": 13870876.0, "step": 6665, "train/ce_loss": 1.8042532246909104e-05 }, { "epoch": 0.658987542020961, "step": 6665, "train/sim_loss": 0.0234375 }, { "epoch": 0.658987542020961, "step": 6665, "train/total_loss": 0.023439304903149605 }, { "entropy": 9.140039443969727, "epoch": 0.6590864148704766, "mean_token_accuracy": 0.7146739363670349, "num_tokens": 13876055.0, "step": 6666, "train/ce_loss": 1.681594729423523 }, { "epoch": 0.6590864148704766, "step": 6666, "train/sim_loss": 0.10546875 }, { "epoch": 0.6590864148704766, "step": 6666, "train/total_loss": 0.27362823486328125 }, { "entropy": 8.506306648254395, "epoch": 0.6591852877199921, "mean_token_accuracy": 0.752525269985199, "num_tokens": 13881475.0, "step": 6667, "train/ce_loss": 0.8816389441490173 }, { "epoch": 0.6591852877199921, "step": 6667, "train/sim_loss": 0.0546875 }, { "epoch": 0.6591852877199921, "step": 6667, "train/total_loss": 0.14285139739513397 }, { "entropy": 9.267065048217773, "epoch": 0.6592841605695076, "mean_token_accuracy": 0.7409090995788574, "num_tokens": 13886542.0, "step": 6668, "train/ce_loss": 1.1963961124420166 }, { "epoch": 0.6592841605695076, "step": 6668, "train/sim_loss": 0.0546875 }, { "epoch": 0.6592841605695076, "step": 6668, "train/total_loss": 0.17432710528373718 }, { "entropy": 8.78718090057373, "epoch": 0.6593830334190232, "mean_token_accuracy": 0.790281355381012, "num_tokens": 13891755.0, "step": 6669, "train/ce_loss": 0.8355273604393005 }, { "epoch": 0.6593830334190232, "step": 6669, "train/sim_loss": 0.05078125 }, { "epoch": 0.6593830334190232, "step": 6669, "train/total_loss": 0.134333997964859 }, { "entropy": 8.821840286254883, "epoch": 0.6594819062685386, "mean_token_accuracy": 0.7264770269393921, "num_tokens": 13897144.0, "step": 6670, "train/ce_loss": 0.5888134241104126 }, { "epoch": 0.6594819062685386, "step": 6670, "train/sim_loss": 0.078125 }, { "epoch": 0.6594819062685386, "step": 6670, "train/total_loss": 0.13700634241104126 }, { "entropy": 8.783811569213867, "epoch": 0.6595807791180541, "mean_token_accuracy": 0.6647531390190125, "num_tokens": 13902488.0, "step": 6671, "train/ce_loss": 1.449289083480835 }, { "epoch": 0.6595807791180541, "step": 6671, "train/sim_loss": 0.078125 }, { "epoch": 0.6595807791180541, "step": 6671, "train/total_loss": 0.2230539172887802 }, { "entropy": 9.072467803955078, "epoch": 0.6596796519675697, "mean_token_accuracy": 0.7473958134651184, "num_tokens": 13907714.0, "step": 6672, "train/ce_loss": 0.7859037518501282 }, { "epoch": 0.6596796519675697, "step": 6672, "train/sim_loss": 0.046875 }, { "epoch": 0.6596796519675697, "step": 6672, "train/total_loss": 0.12546537816524506 }, { "entropy": 8.862828254699707, "epoch": 0.6597785248170852, "mean_token_accuracy": 0.7766749262809753, "num_tokens": 13913005.0, "step": 6673, "train/ce_loss": 0.4724079370498657 }, { "epoch": 0.6597785248170852, "step": 6673, "train/sim_loss": 0.0625 }, { "epoch": 0.6597785248170852, "step": 6673, "train/total_loss": 0.10974079370498657 }, { "entropy": 9.026915550231934, "epoch": 0.6598773976666008, "mean_token_accuracy": 0.7480559945106506, "num_tokens": 13918089.0, "step": 6674, "train/ce_loss": 1.6545954942703247 }, { "epoch": 0.6598773976666008, "step": 6674, "train/sim_loss": 0.07421875 }, { "epoch": 0.6598773976666008, "step": 6674, "train/total_loss": 0.2396783083677292 }, { "entropy": 8.767784118652344, "epoch": 0.6599762705161163, "mean_token_accuracy": 0.6919642686843872, "num_tokens": 13923458.0, "step": 6675, "train/ce_loss": 0.8346720933914185 }, { "epoch": 0.6599762705161163, "step": 6675, "train/sim_loss": 0.11328125 }, { "epoch": 0.6599762705161163, "step": 6675, "train/total_loss": 0.19674846529960632 }, { "entropy": 8.52509593963623, "epoch": 0.6600751433656318, "mean_token_accuracy": 0.7439320683479309, "num_tokens": 13928834.0, "step": 6676, "train/ce_loss": 0.4277784526348114 }, { "epoch": 0.6600751433656318, "step": 6676, "train/sim_loss": 0.0234375 }, { "epoch": 0.6600751433656318, "step": 6676, "train/total_loss": 0.06621535122394562 }, { "entropy": 8.762564659118652, "epoch": 0.6601740162151474, "mean_token_accuracy": 0.6973094344139099, "num_tokens": 13934187.0, "step": 6677, "train/ce_loss": 0.502404510974884 }, { "epoch": 0.6601740162151474, "step": 6677, "train/sim_loss": 0.0546875 }, { "epoch": 0.6601740162151474, "step": 6677, "train/total_loss": 0.10492795705795288 }, { "entropy": 8.436891555786133, "epoch": 0.6602728890646629, "mean_token_accuracy": 0.7667103409767151, "num_tokens": 13939522.0, "step": 6678, "train/ce_loss": 0.5824763178825378 }, { "epoch": 0.6602728890646629, "step": 6678, "train/sim_loss": 0.0390625 }, { "epoch": 0.6602728890646629, "step": 6678, "train/total_loss": 0.0973101332783699 }, { "entropy": 8.870096206665039, "epoch": 0.6603717619141783, "mean_token_accuracy": 0.7449101805686951, "num_tokens": 13944801.0, "step": 6679, "train/ce_loss": 0.6867674589157104 }, { "epoch": 0.6603717619141783, "step": 6679, "train/sim_loss": 0.046875 }, { "epoch": 0.6603717619141783, "step": 6679, "train/total_loss": 0.11555174738168716 }, { "epoch": 0.6604706347636939, "grad_norm": 0.6254637837409973, "learning_rate": 8.35113484646195e-06, "loss": 0.1439, "step": 6680 }, { "entropy": 8.857760429382324, "epoch": 0.6604706347636939, "mean_token_accuracy": 0.7624633312225342, "num_tokens": 13949960.0, "step": 6680, "train/ce_loss": 0.8578407168388367 }, { "epoch": 0.6604706347636939, "step": 6680, "train/sim_loss": 0.0390625 }, { "epoch": 0.6604706347636939, "step": 6680, "train/total_loss": 0.12484657019376755 }, { "entropy": 9.655352592468262, "epoch": 0.6605695076132094, "mean_token_accuracy": 0.75, "num_tokens": 13954767.0, "step": 6681, "train/ce_loss": 3.15669763040205e-06 }, { "epoch": 0.6605695076132094, "step": 6681, "train/sim_loss": 0.015625 }, { "epoch": 0.6605695076132094, "step": 6681, "train/total_loss": 0.01562531478703022 }, { "entropy": 8.700724601745605, "epoch": 0.6606683804627249, "mean_token_accuracy": 0.7291440963745117, "num_tokens": 13960159.0, "step": 6682, "train/ce_loss": 0.2854388654232025 }, { "epoch": 0.6606683804627249, "step": 6682, "train/sim_loss": 0.01953125 }, { "epoch": 0.6606683804627249, "step": 6682, "train/total_loss": 0.04807513952255249 }, { "entropy": 8.941474914550781, "epoch": 0.6607672533122405, "mean_token_accuracy": 0.7733674645423889, "num_tokens": 13965383.0, "step": 6683, "train/ce_loss": 0.7744351625442505 }, { "epoch": 0.6607672533122405, "step": 6683, "train/sim_loss": 0.0234375 }, { "epoch": 0.6607672533122405, "step": 6683, "train/total_loss": 0.10088101774454117 }, { "entropy": 8.106334686279297, "epoch": 0.660866126161756, "mean_token_accuracy": 0.7571174502372742, "num_tokens": 13971010.0, "step": 6684, "train/ce_loss": 0.8612960577011108 }, { "epoch": 0.660866126161756, "step": 6684, "train/sim_loss": 0.06640625 }, { "epoch": 0.660866126161756, "step": 6684, "train/total_loss": 0.15253585577011108 }, { "entropy": 8.876348495483398, "epoch": 0.6609649990112715, "mean_token_accuracy": 0.689830482006073, "num_tokens": 13976062.0, "step": 6685, "train/ce_loss": 0.8690265417098999 }, { "epoch": 0.6609649990112715, "step": 6685, "train/sim_loss": 0.06640625 }, { "epoch": 0.6609649990112715, "step": 6685, "train/total_loss": 0.1533088982105255 }, { "entropy": 8.80894660949707, "epoch": 0.6610638718607871, "mean_token_accuracy": 0.7735602259635925, "num_tokens": 13981283.0, "step": 6686, "train/ce_loss": 0.5743243098258972 }, { "epoch": 0.6610638718607871, "step": 6686, "train/sim_loss": 0.01953125 }, { "epoch": 0.6610638718607871, "step": 6686, "train/total_loss": 0.07696367800235748 }, { "entropy": 9.32143783569336, "epoch": 0.6611627447103026, "mean_token_accuracy": 0.745233952999115, "num_tokens": 13986289.0, "step": 6687, "train/ce_loss": 4.4183077989146113e-05 }, { "epoch": 0.6611627447103026, "step": 6687, "train/sim_loss": 0.0390625 }, { "epoch": 0.6611627447103026, "step": 6687, "train/total_loss": 0.039066918194293976 }, { "entropy": 9.051506042480469, "epoch": 0.661261617559818, "mean_token_accuracy": 0.7278911471366882, "num_tokens": 13991496.0, "step": 6688, "train/ce_loss": 0.7195901870727539 }, { "epoch": 0.661261617559818, "step": 6688, "train/sim_loss": 0.0390625 }, { "epoch": 0.661261617559818, "step": 6688, "train/total_loss": 0.11102151870727539 }, { "entropy": 8.631498336791992, "epoch": 0.6613604904093336, "mean_token_accuracy": 0.7300683259963989, "num_tokens": 13996889.0, "step": 6689, "train/ce_loss": 0.8472098112106323 }, { "epoch": 0.6613604904093336, "step": 6689, "train/sim_loss": 0.078125 }, { "epoch": 0.6613604904093336, "step": 6689, "train/total_loss": 0.16284598410129547 }, { "entropy": 8.36520767211914, "epoch": 0.6614593632588491, "mean_token_accuracy": 0.7360097169876099, "num_tokens": 14002188.0, "step": 6690, "train/ce_loss": 1.0956363677978516 }, { "epoch": 0.6614593632588491, "step": 6690, "train/sim_loss": 0.0625 }, { "epoch": 0.6614593632588491, "step": 6690, "train/total_loss": 0.1720636487007141 }, { "entropy": 8.532737731933594, "epoch": 0.6615582361083646, "mean_token_accuracy": 0.7497527003288269, "num_tokens": 14007707.0, "step": 6691, "train/ce_loss": 0.7537171840667725 }, { "epoch": 0.6615582361083646, "step": 6691, "train/sim_loss": 0.02734375 }, { "epoch": 0.6615582361083646, "step": 6691, "train/total_loss": 0.10271546989679337 }, { "entropy": 9.208215713500977, "epoch": 0.6616571089578802, "mean_token_accuracy": 0.6937212944030762, "num_tokens": 14012788.0, "step": 6692, "train/ce_loss": 0.7378474473953247 }, { "epoch": 0.6616571089578802, "step": 6692, "train/sim_loss": 0.0390625 }, { "epoch": 0.6616571089578802, "step": 6692, "train/total_loss": 0.11284724622964859 }, { "entropy": 9.050117492675781, "epoch": 0.6617559818073957, "mean_token_accuracy": 0.6995447874069214, "num_tokens": 14017857.0, "step": 6693, "train/ce_loss": 1.3012582063674927 }, { "epoch": 0.6617559818073957, "step": 6693, "train/sim_loss": 0.09375 }, { "epoch": 0.6617559818073957, "step": 6693, "train/total_loss": 0.22387582063674927 }, { "entropy": 9.351299285888672, "epoch": 0.6618548546569112, "mean_token_accuracy": 0.6732673048973083, "num_tokens": 14022792.0, "step": 6694, "train/ce_loss": 2.2628509998321533 }, { "epoch": 0.6618548546569112, "step": 6694, "train/sim_loss": 0.01953125 }, { "epoch": 0.6618548546569112, "step": 6694, "train/total_loss": 0.24581634998321533 }, { "entropy": 8.946569442749023, "epoch": 0.6619537275064268, "mean_token_accuracy": 0.7586206793785095, "num_tokens": 14028030.0, "step": 6695, "train/ce_loss": 0.7077171206474304 }, { "epoch": 0.6619537275064268, "step": 6695, "train/sim_loss": 0.046875 }, { "epoch": 0.6619537275064268, "step": 6695, "train/total_loss": 0.1176467165350914 }, { "entropy": 8.753110885620117, "epoch": 0.6620526003559423, "mean_token_accuracy": 0.7614796161651611, "num_tokens": 14033286.0, "step": 6696, "train/ce_loss": 0.7196674942970276 }, { "epoch": 0.6620526003559423, "step": 6696, "train/sim_loss": 0.0546875 }, { "epoch": 0.6620526003559423, "step": 6696, "train/total_loss": 0.126654252409935 }, { "entropy": 8.630273818969727, "epoch": 0.6621514732054578, "mean_token_accuracy": 0.7274969220161438, "num_tokens": 14038602.0, "step": 6697, "train/ce_loss": 0.9384482502937317 }, { "epoch": 0.6621514732054578, "step": 6697, "train/sim_loss": 0.0390625 }, { "epoch": 0.6621514732054578, "step": 6697, "train/total_loss": 0.13290733098983765 }, { "entropy": 8.813023567199707, "epoch": 0.6622503460549733, "mean_token_accuracy": 0.7220259308815002, "num_tokens": 14043908.0, "step": 6698, "train/ce_loss": 0.8345155119895935 }, { "epoch": 0.6622503460549733, "step": 6698, "train/sim_loss": 0.078125 }, { "epoch": 0.6622503460549733, "step": 6698, "train/total_loss": 0.1615765541791916 }, { "entropy": 9.198127746582031, "epoch": 0.6623492189044888, "mean_token_accuracy": 0.7068645358085632, "num_tokens": 14048906.0, "step": 6699, "train/ce_loss": 0.8945678472518921 }, { "epoch": 0.6623492189044888, "step": 6699, "train/sim_loss": 0.01953125 }, { "epoch": 0.6623492189044888, "step": 6699, "train/total_loss": 0.10898803919553757 }, { "epoch": 0.6624480917540043, "grad_norm": 0.8893702626228333, "learning_rate": 8.346189981704001e-06, "loss": 0.141, "step": 6700 }, { "entropy": 8.877273559570312, "epoch": 0.6624480917540043, "mean_token_accuracy": 0.7578814625740051, "num_tokens": 14054187.0, "step": 6700, "train/ce_loss": 0.5039969682693481 }, { "epoch": 0.6624480917540043, "step": 6700, "train/sim_loss": 0.01953125 }, { "epoch": 0.6624480917540043, "step": 6700, "train/total_loss": 0.06993094831705093 }, { "entropy": 8.706487655639648, "epoch": 0.6625469646035199, "mean_token_accuracy": 0.7451456189155579, "num_tokens": 14059441.0, "step": 6701, "train/ce_loss": 0.6316986083984375 }, { "epoch": 0.6625469646035199, "step": 6701, "train/sim_loss": 0.03515625 }, { "epoch": 0.6625469646035199, "step": 6701, "train/total_loss": 0.09832610934972763 }, { "entropy": 9.48685359954834, "epoch": 0.6626458374530354, "mean_token_accuracy": 0.7756563425064087, "num_tokens": 14064301.0, "step": 6702, "train/ce_loss": 1.4424176216125488 }, { "epoch": 0.6626458374530354, "step": 6702, "train/sim_loss": 0.04296875 }, { "epoch": 0.6626458374530354, "step": 6702, "train/total_loss": 0.18721051514148712 }, { "entropy": 8.722457885742188, "epoch": 0.6627447103025509, "mean_token_accuracy": 0.6818675398826599, "num_tokens": 14069707.0, "step": 6703, "train/ce_loss": 1.3143783807754517 }, { "epoch": 0.6627447103025509, "step": 6703, "train/sim_loss": 0.05078125 }, { "epoch": 0.6627447103025509, "step": 6703, "train/total_loss": 0.18221908807754517 }, { "entropy": 8.672255516052246, "epoch": 0.6628435831520665, "mean_token_accuracy": 0.7947434186935425, "num_tokens": 14074947.0, "step": 6704, "train/ce_loss": 0.7217705845832825 }, { "epoch": 0.6628435831520665, "step": 6704, "train/sim_loss": 0.02734375 }, { "epoch": 0.6628435831520665, "step": 6704, "train/total_loss": 0.09952080994844437 }, { "entropy": 8.845043182373047, "epoch": 0.662942456001582, "mean_token_accuracy": 0.7313797473907471, "num_tokens": 14080276.0, "step": 6705, "train/ce_loss": 1.2335104942321777 }, { "epoch": 0.662942456001582, "step": 6705, "train/sim_loss": 0.07421875 }, { "epoch": 0.662942456001582, "step": 6705, "train/total_loss": 0.19756980240345 }, { "entropy": 8.619894027709961, "epoch": 0.6630413288510975, "mean_token_accuracy": 0.7848244905471802, "num_tokens": 14085779.0, "step": 6706, "train/ce_loss": 0.6108676195144653 }, { "epoch": 0.6630413288510975, "step": 6706, "train/sim_loss": 0.046875 }, { "epoch": 0.6630413288510975, "step": 6706, "train/total_loss": 0.1079617589712143 }, { "entropy": 8.909207344055176, "epoch": 0.663140201700613, "mean_token_accuracy": 0.7596899271011353, "num_tokens": 14090955.0, "step": 6707, "train/ce_loss": 1.4759429693222046 }, { "epoch": 0.663140201700613, "step": 6707, "train/sim_loss": 0.05078125 }, { "epoch": 0.663140201700613, "step": 6707, "train/total_loss": 0.19837555289268494 }, { "entropy": 9.152202606201172, "epoch": 0.6632390745501285, "mean_token_accuracy": 0.7245901823043823, "num_tokens": 14096014.0, "step": 6708, "train/ce_loss": 0.8172332048416138 }, { "epoch": 0.6632390745501285, "step": 6708, "train/sim_loss": 0.0390625 }, { "epoch": 0.6632390745501285, "step": 6708, "train/total_loss": 0.12078582495450974 }, { "entropy": 9.569263458251953, "epoch": 0.663337947399644, "mean_token_accuracy": 0.7236467003822327, "num_tokens": 14100751.0, "step": 6709, "train/ce_loss": 1.4401479959487915 }, { "epoch": 0.663337947399644, "step": 6709, "train/sim_loss": 0.078125 }, { "epoch": 0.663337947399644, "step": 6709, "train/total_loss": 0.22213980555534363 }, { "entropy": 9.026773452758789, "epoch": 0.6634368202491596, "mean_token_accuracy": 0.7635983228683472, "num_tokens": 14105697.0, "step": 6710, "train/ce_loss": 2.6175093807978556e-06 }, { "epoch": 0.6634368202491596, "step": 6710, "train/sim_loss": 0.05859375 }, { "epoch": 0.6634368202491596, "step": 6710, "train/total_loss": 0.05859401077032089 }, { "entropy": 8.975640296936035, "epoch": 0.6635356930986751, "mean_token_accuracy": 0.6936488151550293, "num_tokens": 14111118.0, "step": 6711, "train/ce_loss": 0.9944825172424316 }, { "epoch": 0.6635356930986751, "step": 6711, "train/sim_loss": 0.046875 }, { "epoch": 0.6635356930986751, "step": 6711, "train/total_loss": 0.14632326364517212 }, { "entropy": 8.736495971679688, "epoch": 0.6636345659481906, "mean_token_accuracy": 0.7727839946746826, "num_tokens": 14116383.0, "step": 6712, "train/ce_loss": 0.6491132974624634 }, { "epoch": 0.6636345659481906, "step": 6712, "train/sim_loss": 0.046875 }, { "epoch": 0.6636345659481906, "step": 6712, "train/total_loss": 0.11178632825613022 }, { "entropy": 9.282264709472656, "epoch": 0.6637334387977062, "mean_token_accuracy": 0.7543859481811523, "num_tokens": 14121401.0, "step": 6713, "train/ce_loss": 0.7997955679893494 }, { "epoch": 0.6637334387977062, "step": 6713, "train/sim_loss": 0.046875 }, { "epoch": 0.6637334387977062, "step": 6713, "train/total_loss": 0.1268545687198639 }, { "entropy": 9.13487434387207, "epoch": 0.6638323116472217, "mean_token_accuracy": 0.7386363744735718, "num_tokens": 14126447.0, "step": 6714, "train/ce_loss": 1.0122233629226685 }, { "epoch": 0.6638323116472217, "step": 6714, "train/sim_loss": 0.046875 }, { "epoch": 0.6638323116472217, "step": 6714, "train/total_loss": 0.14809733629226685 }, { "entropy": 8.932573318481445, "epoch": 0.6639311844967372, "mean_token_accuracy": 0.770039439201355, "num_tokens": 14131682.0, "step": 6715, "train/ce_loss": 0.47455939650535583 }, { "epoch": 0.6639311844967372, "step": 6715, "train/sim_loss": 0.0546875 }, { "epoch": 0.6639311844967372, "step": 6715, "train/total_loss": 0.10214343667030334 }, { "entropy": 8.547382354736328, "epoch": 0.6640300573462528, "mean_token_accuracy": 0.7574094533920288, "num_tokens": 14137106.0, "step": 6716, "train/ce_loss": 0.6185110211372375 }, { "epoch": 0.6640300573462528, "step": 6716, "train/sim_loss": 0.0234375 }, { "epoch": 0.6640300573462528, "step": 6716, "train/total_loss": 0.08528859913349152 }, { "entropy": 8.829623222351074, "epoch": 0.6641289301957682, "mean_token_accuracy": 0.7074742317199707, "num_tokens": 14142346.0, "step": 6717, "train/ce_loss": 0.6980037689208984 }, { "epoch": 0.6641289301957682, "step": 6717, "train/sim_loss": 0.046875 }, { "epoch": 0.6641289301957682, "step": 6717, "train/total_loss": 0.11667537689208984 }, { "entropy": 8.666450500488281, "epoch": 0.6642278030452837, "mean_token_accuracy": 0.7428198456764221, "num_tokens": 14147590.0, "step": 6718, "train/ce_loss": 0.786668598651886 }, { "epoch": 0.6642278030452837, "step": 6718, "train/sim_loss": 0.0703125 }, { "epoch": 0.6642278030452837, "step": 6718, "train/total_loss": 0.14897936582565308 }, { "entropy": 9.03660774230957, "epoch": 0.6643266758947993, "mean_token_accuracy": 0.6939314007759094, "num_tokens": 14152833.0, "step": 6719, "train/ce_loss": 7.933153028716333e-06 }, { "epoch": 0.6643266758947993, "step": 6719, "train/sim_loss": 0.0625 }, { "epoch": 0.6643266758947993, "step": 6719, "train/total_loss": 0.06250078976154327 }, { "epoch": 0.6644255487443148, "grad_norm": 0.8787350058555603, "learning_rate": 8.341245116946052e-06, "loss": 0.1347, "step": 6720 }, { "entropy": 8.631905555725098, "epoch": 0.6644255487443148, "mean_token_accuracy": 0.7658303380012512, "num_tokens": 14158167.0, "step": 6720, "train/ce_loss": 0.6893506050109863 }, { "epoch": 0.6644255487443148, "step": 6720, "train/sim_loss": 0.05859375 }, { "epoch": 0.6644255487443148, "step": 6720, "train/total_loss": 0.1275288164615631 }, { "entropy": 9.196617126464844, "epoch": 0.6645244215938303, "mean_token_accuracy": 0.7628865838050842, "num_tokens": 14163183.0, "step": 6721, "train/ce_loss": 1.015169620513916 }, { "epoch": 0.6645244215938303, "step": 6721, "train/sim_loss": 0.046875 }, { "epoch": 0.6645244215938303, "step": 6721, "train/total_loss": 0.1483919620513916 }, { "entropy": 8.924755096435547, "epoch": 0.6646232944433459, "mean_token_accuracy": 0.7825520634651184, "num_tokens": 14168390.0, "step": 6722, "train/ce_loss": 0.9925330281257629 }, { "epoch": 0.6646232944433459, "step": 6722, "train/sim_loss": 0.01953125 }, { "epoch": 0.6646232944433459, "step": 6722, "train/total_loss": 0.11878455430269241 }, { "entropy": 8.297525405883789, "epoch": 0.6647221672928614, "mean_token_accuracy": 0.7615176439285278, "num_tokens": 14173964.0, "step": 6723, "train/ce_loss": 0.5884344577789307 }, { "epoch": 0.6647221672928614, "step": 6723, "train/sim_loss": 0.015625 }, { "epoch": 0.6647221672928614, "step": 6723, "train/total_loss": 0.0744684487581253 }, { "entropy": 9.536665916442871, "epoch": 0.6648210401423769, "mean_token_accuracy": 0.7642105221748352, "num_tokens": 14178859.0, "step": 6724, "train/ce_loss": 1.7731702327728271 }, { "epoch": 0.6648210401423769, "step": 6724, "train/sim_loss": 0.0703125 }, { "epoch": 0.6648210401423769, "step": 6724, "train/total_loss": 0.24762952327728271 }, { "entropy": 9.07353401184082, "epoch": 0.6649199129918925, "mean_token_accuracy": 0.7966616153717041, "num_tokens": 14183943.0, "step": 6725, "train/ce_loss": 0.5030363202095032 }, { "epoch": 0.6649199129918925, "step": 6725, "train/sim_loss": 0.04296875 }, { "epoch": 0.6649199129918925, "step": 6725, "train/total_loss": 0.0932723879814148 }, { "entropy": 8.971988677978516, "epoch": 0.6650187858414079, "mean_token_accuracy": 0.7762619256973267, "num_tokens": 14189111.0, "step": 6726, "train/ce_loss": 0.7912498712539673 }, { "epoch": 0.6650187858414079, "step": 6726, "train/sim_loss": 0.02734375 }, { "epoch": 0.6650187858414079, "step": 6726, "train/total_loss": 0.10646873712539673 }, { "entropy": 9.1195068359375, "epoch": 0.6651176586909234, "mean_token_accuracy": 0.7333333492279053, "num_tokens": 14194174.0, "step": 6727, "train/ce_loss": 5.986967153148726e-06 }, { "epoch": 0.6651176586909234, "step": 6727, "train/sim_loss": 0.0390625 }, { "epoch": 0.6651176586909234, "step": 6727, "train/total_loss": 0.03906309977173805 }, { "entropy": 8.535524368286133, "epoch": 0.665216531540439, "mean_token_accuracy": 0.7098265886306763, "num_tokens": 14199506.0, "step": 6728, "train/ce_loss": 0.8491111397743225 }, { "epoch": 0.665216531540439, "step": 6728, "train/sim_loss": 0.08203125 }, { "epoch": 0.665216531540439, "step": 6728, "train/total_loss": 0.16694235801696777 }, { "entropy": 8.829087257385254, "epoch": 0.6653154043899545, "mean_token_accuracy": 0.7119078040122986, "num_tokens": 14204739.0, "step": 6729, "train/ce_loss": 1.2258275747299194 }, { "epoch": 0.6653154043899545, "step": 6729, "train/sim_loss": 0.046875 }, { "epoch": 0.6653154043899545, "step": 6729, "train/total_loss": 0.16945776343345642 }, { "entropy": 9.08847427368164, "epoch": 0.66541427723947, "mean_token_accuracy": 0.7658119797706604, "num_tokens": 14209774.0, "step": 6730, "train/ce_loss": 0.6165642142295837 }, { "epoch": 0.66541427723947, "step": 6730, "train/sim_loss": 0.015625 }, { "epoch": 0.66541427723947, "step": 6730, "train/total_loss": 0.0772814229130745 }, { "entropy": 8.732650756835938, "epoch": 0.6655131500889856, "mean_token_accuracy": 0.7557715773582458, "num_tokens": 14215036.0, "step": 6731, "train/ce_loss": 0.8895928859710693 }, { "epoch": 0.6655131500889856, "step": 6731, "train/sim_loss": 0.04296875 }, { "epoch": 0.6655131500889856, "step": 6731, "train/total_loss": 0.13192804157733917 }, { "entropy": 9.088386535644531, "epoch": 0.6656120229385011, "mean_token_accuracy": 0.757785439491272, "num_tokens": 14220092.0, "step": 6732, "train/ce_loss": 1.3937000403529964e-05 }, { "epoch": 0.6656120229385011, "step": 6732, "train/sim_loss": 0.0546875 }, { "epoch": 0.6656120229385011, "step": 6732, "train/total_loss": 0.054688893258571625 }, { "entropy": 8.419425964355469, "epoch": 0.6657108957880166, "mean_token_accuracy": 0.8412538170814514, "num_tokens": 14225602.0, "step": 6733, "train/ce_loss": 0.497548907995224 }, { "epoch": 0.6657108957880166, "step": 6733, "train/sim_loss": 0.1015625 }, { "epoch": 0.6657108957880166, "step": 6733, "train/total_loss": 0.15131738781929016 }, { "entropy": 9.298429489135742, "epoch": 0.6658097686375322, "mean_token_accuracy": 0.75, "num_tokens": 14230566.0, "step": 6734, "train/ce_loss": 1.1033003829652444e-05 }, { "epoch": 0.6658097686375322, "step": 6734, "train/sim_loss": 0.07421875 }, { "epoch": 0.6658097686375322, "step": 6734, "train/total_loss": 0.07421985268592834 }, { "entropy": 9.47636890411377, "epoch": 0.6659086414870476, "mean_token_accuracy": 0.7843137383460999, "num_tokens": 14235495.0, "step": 6735, "train/ce_loss": 0.6626639366149902 }, { "epoch": 0.6659086414870476, "step": 6735, "train/sim_loss": 0.0546875 }, { "epoch": 0.6659086414870476, "step": 6735, "train/total_loss": 0.12095389515161514 }, { "entropy": 8.497119903564453, "epoch": 0.6660075143365631, "mean_token_accuracy": 0.781361997127533, "num_tokens": 14240819.0, "step": 6736, "train/ce_loss": 0.9271575808525085 }, { "epoch": 0.6660075143365631, "step": 6736, "train/sim_loss": 0.0546875 }, { "epoch": 0.6660075143365631, "step": 6736, "train/total_loss": 0.1474032700061798 }, { "entropy": 8.962126731872559, "epoch": 0.6661063871860787, "mean_token_accuracy": 0.754358172416687, "num_tokens": 14245889.0, "step": 6737, "train/ce_loss": 4.2335354919487145e-06 }, { "epoch": 0.6661063871860787, "step": 6737, "train/sim_loss": 0.0234375 }, { "epoch": 0.6661063871860787, "step": 6737, "train/total_loss": 0.023437922820448875 }, { "entropy": 8.75960922241211, "epoch": 0.6662052600355942, "mean_token_accuracy": 0.7316076159477234, "num_tokens": 14251090.0, "step": 6738, "train/ce_loss": 0.414532333612442 }, { "epoch": 0.6662052600355942, "step": 6738, "train/sim_loss": 0.0234375 }, { "epoch": 0.6662052600355942, "step": 6738, "train/total_loss": 0.06489073485136032 }, { "entropy": 9.062548637390137, "epoch": 0.6663041328851097, "mean_token_accuracy": 0.6617050170898438, "num_tokens": 14256276.0, "step": 6739, "train/ce_loss": 0.9967235922813416 }, { "epoch": 0.6663041328851097, "step": 6739, "train/sim_loss": 0.04296875 }, { "epoch": 0.6663041328851097, "step": 6739, "train/total_loss": 0.1426411122083664 }, { "epoch": 0.6664030057346253, "grad_norm": 0.7395882606506348, "learning_rate": 8.336300252188104e-06, "loss": 0.1269, "step": 6740 }, { "entropy": 8.70880126953125, "epoch": 0.6664030057346253, "mean_token_accuracy": 0.7319004535675049, "num_tokens": 14261661.0, "step": 6740, "train/ce_loss": 0.8813509941101074 }, { "epoch": 0.6664030057346253, "step": 6740, "train/sim_loss": 0.04296875 }, { "epoch": 0.6664030057346253, "step": 6740, "train/total_loss": 0.13110384345054626 }, { "entropy": 8.729297637939453, "epoch": 0.6665018785841408, "mean_token_accuracy": 0.8273615837097168, "num_tokens": 14267084.0, "step": 6741, "train/ce_loss": 0.31738346815109253 }, { "epoch": 0.6665018785841408, "step": 6741, "train/sim_loss": 0.0234375 }, { "epoch": 0.6665018785841408, "step": 6741, "train/total_loss": 0.05517584830522537 }, { "entropy": 9.303884506225586, "epoch": 0.6666007514336563, "mean_token_accuracy": 0.721875011920929, "num_tokens": 14272200.0, "step": 6742, "train/ce_loss": 2.1535837650299072 }, { "epoch": 0.6666007514336563, "step": 6742, "train/sim_loss": 0.13671875 }, { "epoch": 0.6666007514336563, "step": 6742, "train/total_loss": 0.3520771265029907 }, { "entropy": 9.582070350646973, "epoch": 0.6666996242831719, "mean_token_accuracy": 0.7639999985694885, "num_tokens": 14277111.0, "step": 6743, "train/ce_loss": 0.5690571069717407 }, { "epoch": 0.6666996242831719, "step": 6743, "train/sim_loss": 0.03125 }, { "epoch": 0.6666996242831719, "step": 6743, "train/total_loss": 0.08815571665763855 }, { "entropy": 8.569799423217773, "epoch": 0.6667984971326874, "mean_token_accuracy": 0.6907216310501099, "num_tokens": 14282536.0, "step": 6744, "train/ce_loss": 0.659394383430481 }, { "epoch": 0.6667984971326874, "step": 6744, "train/sim_loss": 0.0546875 }, { "epoch": 0.6667984971326874, "step": 6744, "train/total_loss": 0.12062694132328033 }, { "entropy": 9.198576927185059, "epoch": 0.6668973699822028, "mean_token_accuracy": 0.6841155290603638, "num_tokens": 14287538.0, "step": 6745, "train/ce_loss": 0.8136149644851685 }, { "epoch": 0.6668973699822028, "step": 6745, "train/sim_loss": 0.04296875 }, { "epoch": 0.6668973699822028, "step": 6745, "train/total_loss": 0.12433024495840073 }, { "entropy": 9.104917526245117, "epoch": 0.6669962428317184, "mean_token_accuracy": 0.7232415676116943, "num_tokens": 14292623.0, "step": 6746, "train/ce_loss": 1.055641531944275 }, { "epoch": 0.6669962428317184, "step": 6746, "train/sim_loss": 0.046875 }, { "epoch": 0.6669962428317184, "step": 6746, "train/total_loss": 0.152439147233963 }, { "entropy": 8.609912872314453, "epoch": 0.6670951156812339, "mean_token_accuracy": 0.7218468189239502, "num_tokens": 14298041.0, "step": 6747, "train/ce_loss": 0.8968522548675537 }, { "epoch": 0.6670951156812339, "step": 6747, "train/sim_loss": 0.11328125 }, { "epoch": 0.6670951156812339, "step": 6747, "train/total_loss": 0.20296648144721985 }, { "entropy": 9.488840103149414, "epoch": 0.6671939885307494, "mean_token_accuracy": 0.6616702079772949, "num_tokens": 14302963.0, "step": 6748, "train/ce_loss": 3.5787792205810547 }, { "epoch": 0.6671939885307494, "step": 6748, "train/sim_loss": 0.0703125 }, { "epoch": 0.6671939885307494, "step": 6748, "train/total_loss": 0.4281904399394989 }, { "entropy": 8.588305473327637, "epoch": 0.667292861380265, "mean_token_accuracy": 0.7162446975708008, "num_tokens": 14308357.0, "step": 6749, "train/ce_loss": 0.5696796178817749 }, { "epoch": 0.667292861380265, "step": 6749, "train/sim_loss": 0.046875 }, { "epoch": 0.667292861380265, "step": 6749, "train/total_loss": 0.10384295880794525 }, { "entropy": 9.04929256439209, "epoch": 0.6673917342297805, "mean_token_accuracy": 0.687158465385437, "num_tokens": 14313550.0, "step": 6750, "train/ce_loss": 1.2263870239257812 }, { "epoch": 0.6673917342297805, "step": 6750, "train/sim_loss": 0.07421875 }, { "epoch": 0.6673917342297805, "step": 6750, "train/total_loss": 0.19685745239257812 }, { "entropy": 8.779260635375977, "epoch": 0.667490607079296, "mean_token_accuracy": 0.7692307829856873, "num_tokens": 14318931.0, "step": 6751, "train/ce_loss": 0.47107505798339844 }, { "epoch": 0.667490607079296, "step": 6751, "train/sim_loss": 0.125 }, { "epoch": 0.667490607079296, "step": 6751, "train/total_loss": 0.1721075028181076 }, { "entropy": 8.935807228088379, "epoch": 0.6675894799288116, "mean_token_accuracy": 0.7761989235877991, "num_tokens": 14323949.0, "step": 6752, "train/ce_loss": 1.0147687196731567 }, { "epoch": 0.6675894799288116, "step": 6752, "train/sim_loss": 0.03515625 }, { "epoch": 0.6675894799288116, "step": 6752, "train/total_loss": 0.13663312792778015 }, { "entropy": 8.626876831054688, "epoch": 0.667688352778327, "mean_token_accuracy": 0.7016574740409851, "num_tokens": 14329367.0, "step": 6753, "train/ce_loss": 1.08138906955719 }, { "epoch": 0.667688352778327, "step": 6753, "train/sim_loss": 0.08984375 }, { "epoch": 0.667688352778327, "step": 6753, "train/total_loss": 0.19798266887664795 }, { "entropy": 8.819351196289062, "epoch": 0.6677872256278425, "mean_token_accuracy": 0.7192053198814392, "num_tokens": 14334578.0, "step": 6754, "train/ce_loss": 2.0306320948293433e-05 }, { "epoch": 0.6677872256278425, "step": 6754, "train/sim_loss": 0.0234375 }, { "epoch": 0.6677872256278425, "step": 6754, "train/total_loss": 0.023439530283212662 }, { "entropy": 8.370071411132812, "epoch": 0.6678860984773581, "mean_token_accuracy": 0.7449078559875488, "num_tokens": 14340035.0, "step": 6755, "train/ce_loss": 0.7535091042518616 }, { "epoch": 0.6678860984773581, "step": 6755, "train/sim_loss": 0.0546875 }, { "epoch": 0.6678860984773581, "step": 6755, "train/total_loss": 0.13003841042518616 }, { "entropy": 8.58914566040039, "epoch": 0.6679849713268736, "mean_token_accuracy": 0.7662061452865601, "num_tokens": 14345496.0, "step": 6756, "train/ce_loss": 1.0949351787567139 }, { "epoch": 0.6679849713268736, "step": 6756, "train/sim_loss": 0.0703125 }, { "epoch": 0.6679849713268736, "step": 6756, "train/total_loss": 0.17980602383613586 }, { "entropy": 9.329448699951172, "epoch": 0.6680838441763892, "mean_token_accuracy": 0.7410358786582947, "num_tokens": 14350425.0, "step": 6757, "train/ce_loss": 0.9834825992584229 }, { "epoch": 0.6680838441763892, "step": 6757, "train/sim_loss": 0.03515625 }, { "epoch": 0.6680838441763892, "step": 6757, "train/total_loss": 0.13350450992584229 }, { "entropy": 9.280977249145508, "epoch": 0.6681827170259047, "mean_token_accuracy": 0.7773787975311279, "num_tokens": 14355429.0, "step": 6758, "train/ce_loss": 1.32757568359375 }, { "epoch": 0.6681827170259047, "step": 6758, "train/sim_loss": 0.046875 }, { "epoch": 0.6681827170259047, "step": 6758, "train/total_loss": 0.17963257431983948 }, { "entropy": 8.651885032653809, "epoch": 0.6682815898754202, "mean_token_accuracy": 0.7680995464324951, "num_tokens": 14360828.0, "step": 6759, "train/ce_loss": 1.0034481287002563 }, { "epoch": 0.6682815898754202, "step": 6759, "train/sim_loss": 0.05078125 }, { "epoch": 0.6682815898754202, "step": 6759, "train/total_loss": 0.15112605690956116 }, { "epoch": 0.6683804627249358, "grad_norm": 0.6059948205947876, "learning_rate": 8.331355387430154e-06, "loss": 0.146, "step": 6760 }, { "entropy": 8.796469688415527, "epoch": 0.6683804627249358, "mean_token_accuracy": 0.7750906944274902, "num_tokens": 14366073.0, "step": 6760, "train/ce_loss": 0.7929439544677734 }, { "epoch": 0.6683804627249358, "step": 6760, "train/sim_loss": 0.02734375 }, { "epoch": 0.6683804627249358, "step": 6760, "train/total_loss": 0.10663814842700958 }, { "entropy": 9.272623062133789, "epoch": 0.6684793355744513, "mean_token_accuracy": 0.7857142686843872, "num_tokens": 14371091.0, "step": 6761, "train/ce_loss": 0.6518434286117554 }, { "epoch": 0.6684793355744513, "step": 6761, "train/sim_loss": 0.05859375 }, { "epoch": 0.6684793355744513, "step": 6761, "train/total_loss": 0.1237780973315239 }, { "entropy": 9.03161907196045, "epoch": 0.6685782084239668, "mean_token_accuracy": 0.6916548609733582, "num_tokens": 14376243.0, "step": 6762, "train/ce_loss": 1.4648661613464355 }, { "epoch": 0.6685782084239668, "step": 6762, "train/sim_loss": 0.0859375 }, { "epoch": 0.6685782084239668, "step": 6762, "train/total_loss": 0.23242412507534027 }, { "entropy": 8.697053909301758, "epoch": 0.6686770812734824, "mean_token_accuracy": 0.7248520851135254, "num_tokens": 14381295.0, "step": 6763, "train/ce_loss": 1.4075952768325806 }, { "epoch": 0.6686770812734824, "step": 6763, "train/sim_loss": 0.04296875 }, { "epoch": 0.6686770812734824, "step": 6763, "train/total_loss": 0.18372827768325806 }, { "entropy": 9.094472885131836, "epoch": 0.6687759541229978, "mean_token_accuracy": 0.7451274394989014, "num_tokens": 14386423.0, "step": 6764, "train/ce_loss": 6.359361123031704e-06 }, { "epoch": 0.6687759541229978, "step": 6764, "train/sim_loss": 0.03515625 }, { "epoch": 0.6687759541229978, "step": 6764, "train/total_loss": 0.03515688702464104 }, { "entropy": 9.747419357299805, "epoch": 0.6688748269725133, "mean_token_accuracy": 0.7487179636955261, "num_tokens": 14391225.0, "step": 6765, "train/ce_loss": 7.125037427613279e-06 }, { "epoch": 0.6688748269725133, "step": 6765, "train/sim_loss": 0.0625 }, { "epoch": 0.6688748269725133, "step": 6765, "train/total_loss": 0.0625007152557373 }, { "entropy": 8.462725639343262, "epoch": 0.6689736998220289, "mean_token_accuracy": 0.7445887327194214, "num_tokens": 14396633.0, "step": 6766, "train/ce_loss": 0.8765015602111816 }, { "epoch": 0.6689736998220289, "step": 6766, "train/sim_loss": 0.1015625 }, { "epoch": 0.6689736998220289, "step": 6766, "train/total_loss": 0.1892126500606537 }, { "entropy": 8.475957870483398, "epoch": 0.6690725726715444, "mean_token_accuracy": 0.743682324886322, "num_tokens": 14401977.0, "step": 6767, "train/ce_loss": 0.8576280474662781 }, { "epoch": 0.6690725726715444, "step": 6767, "train/sim_loss": 0.02734375 }, { "epoch": 0.6690725726715444, "step": 6767, "train/total_loss": 0.11310655623674393 }, { "entropy": 9.403925895690918, "epoch": 0.6691714455210599, "mean_token_accuracy": 0.7103717923164368, "num_tokens": 14406898.0, "step": 6768, "train/ce_loss": 2.383197069168091 }, { "epoch": 0.6691714455210599, "step": 6768, "train/sim_loss": 0.04296875 }, { "epoch": 0.6691714455210599, "step": 6768, "train/total_loss": 0.2812884449958801 }, { "entropy": 8.560701370239258, "epoch": 0.6692703183705755, "mean_token_accuracy": 0.7568756937980652, "num_tokens": 14412318.0, "step": 6769, "train/ce_loss": 0.4928951561450958 }, { "epoch": 0.6692703183705755, "step": 6769, "train/sim_loss": 0.015625 }, { "epoch": 0.6692703183705755, "step": 6769, "train/total_loss": 0.0649145171046257 }, { "entropy": 9.07827377319336, "epoch": 0.669369191220091, "mean_token_accuracy": 0.7397260069847107, "num_tokens": 14417363.0, "step": 6770, "train/ce_loss": 0.8147823214530945 }, { "epoch": 0.669369191220091, "step": 6770, "train/sim_loss": 0.03125 }, { "epoch": 0.669369191220091, "step": 6770, "train/total_loss": 0.11272823065519333 }, { "entropy": 8.791725158691406, "epoch": 0.6694680640696065, "mean_token_accuracy": 0.699881374835968, "num_tokens": 14422677.0, "step": 6771, "train/ce_loss": 1.2302576303482056 }, { "epoch": 0.6694680640696065, "step": 6771, "train/sim_loss": 0.05078125 }, { "epoch": 0.6694680640696065, "step": 6771, "train/total_loss": 0.1738070249557495 }, { "entropy": 9.144021034240723, "epoch": 0.6695669369191221, "mean_token_accuracy": 0.7154605388641357, "num_tokens": 14427707.0, "step": 6772, "train/ce_loss": 0.8733604550361633 }, { "epoch": 0.6695669369191221, "step": 6772, "train/sim_loss": 0.05859375 }, { "epoch": 0.6695669369191221, "step": 6772, "train/total_loss": 0.14592979848384857 }, { "entropy": 9.025733947753906, "epoch": 0.6696658097686375, "mean_token_accuracy": 0.7476038336753845, "num_tokens": 14432767.0, "step": 6773, "train/ce_loss": 0.9594969153404236 }, { "epoch": 0.6696658097686375, "step": 6773, "train/sim_loss": 0.06640625 }, { "epoch": 0.6696658097686375, "step": 6773, "train/total_loss": 0.1623559445142746 }, { "entropy": 9.640586853027344, "epoch": 0.669764682618153, "mean_token_accuracy": 0.6961451172828674, "num_tokens": 14437596.0, "step": 6774, "train/ce_loss": 1.403988242149353 }, { "epoch": 0.669764682618153, "step": 6774, "train/sim_loss": 0.0546875 }, { "epoch": 0.669764682618153, "step": 6774, "train/total_loss": 0.19508633017539978 }, { "entropy": 9.733133316040039, "epoch": 0.6698635554676686, "mean_token_accuracy": 0.7267904281616211, "num_tokens": 14442353.0, "step": 6775, "train/ce_loss": 2.2351179122924805 }, { "epoch": 0.6698635554676686, "step": 6775, "train/sim_loss": 0.1171875 }, { "epoch": 0.6698635554676686, "step": 6775, "train/total_loss": 0.34069931507110596 }, { "entropy": 9.393617630004883, "epoch": 0.6699624283171841, "mean_token_accuracy": 0.689130425453186, "num_tokens": 14447230.0, "step": 6776, "train/ce_loss": 6.242476956686005e-06 }, { "epoch": 0.6699624283171841, "step": 6776, "train/sim_loss": 0.0546875 }, { "epoch": 0.6699624283171841, "step": 6776, "train/total_loss": 0.05468812584877014 }, { "entropy": 8.555450439453125, "epoch": 0.6700613011666996, "mean_token_accuracy": 0.7341463565826416, "num_tokens": 14452488.0, "step": 6777, "train/ce_loss": 0.9241055846214294 }, { "epoch": 0.6700613011666996, "step": 6777, "train/sim_loss": 0.0546875 }, { "epoch": 0.6700613011666996, "step": 6777, "train/total_loss": 0.14709806442260742 }, { "entropy": 8.696893692016602, "epoch": 0.6701601740162152, "mean_token_accuracy": 0.7157464027404785, "num_tokens": 14457945.0, "step": 6778, "train/ce_loss": 0.891961932182312 }, { "epoch": 0.6701601740162152, "step": 6778, "train/sim_loss": 0.0546875 }, { "epoch": 0.6701601740162152, "step": 6778, "train/total_loss": 0.14388370513916016 }, { "entropy": 8.653599739074707, "epoch": 0.6702590468657307, "mean_token_accuracy": 0.6867470145225525, "num_tokens": 14463368.0, "step": 6779, "train/ce_loss": 0.9441421627998352 }, { "epoch": 0.6702590468657307, "step": 6779, "train/sim_loss": 0.08203125 }, { "epoch": 0.6702590468657307, "step": 6779, "train/total_loss": 0.17644546926021576 }, { "epoch": 0.6703579197152462, "grad_norm": 0.7936645150184631, "learning_rate": 8.326410522672206e-06, "loss": 0.1447, "step": 6780 }, { "entropy": 8.938943862915039, "epoch": 0.6703579197152462, "mean_token_accuracy": 0.7451984882354736, "num_tokens": 14468595.0, "step": 6780, "train/ce_loss": 0.7573527693748474 }, { "epoch": 0.6703579197152462, "step": 6780, "train/sim_loss": 0.03515625 }, { "epoch": 0.6703579197152462, "step": 6780, "train/total_loss": 0.11089152842760086 }, { "entropy": 8.220525741577148, "epoch": 0.6704567925647618, "mean_token_accuracy": 0.7124413251876831, "num_tokens": 14473916.0, "step": 6781, "train/ce_loss": 0.3497462272644043 }, { "epoch": 0.6704567925647618, "step": 6781, "train/sim_loss": 0.05078125 }, { "epoch": 0.6704567925647618, "step": 6781, "train/total_loss": 0.08575586974620819 }, { "entropy": 8.594995498657227, "epoch": 0.6705556654142772, "mean_token_accuracy": 0.7618534564971924, "num_tokens": 14479332.0, "step": 6782, "train/ce_loss": 0.8362644910812378 }, { "epoch": 0.6705556654142772, "step": 6782, "train/sim_loss": 0.0546875 }, { "epoch": 0.6705556654142772, "step": 6782, "train/total_loss": 0.13831394910812378 }, { "entropy": 8.570377349853516, "epoch": 0.6706545382637927, "mean_token_accuracy": 0.7267876267433167, "num_tokens": 14484733.0, "step": 6783, "train/ce_loss": 1.20476233959198 }, { "epoch": 0.6706545382637927, "step": 6783, "train/sim_loss": 0.06640625 }, { "epoch": 0.6706545382637927, "step": 6783, "train/total_loss": 0.18688249588012695 }, { "entropy": 8.916112899780273, "epoch": 0.6707534111133083, "mean_token_accuracy": 0.7323369383811951, "num_tokens": 14489945.0, "step": 6784, "train/ce_loss": 0.5036496520042419 }, { "epoch": 0.6707534111133083, "step": 6784, "train/sim_loss": 0.05078125 }, { "epoch": 0.6707534111133083, "step": 6784, "train/total_loss": 0.10114622116088867 }, { "entropy": 8.686356544494629, "epoch": 0.6708522839628238, "mean_token_accuracy": 0.7149999737739563, "num_tokens": 14495205.0, "step": 6785, "train/ce_loss": 0.35887444019317627 }, { "epoch": 0.6708522839628238, "step": 6785, "train/sim_loss": 0.04296875 }, { "epoch": 0.6708522839628238, "step": 6785, "train/total_loss": 0.0788561999797821 }, { "entropy": 9.628941535949707, "epoch": 0.6709511568123393, "mean_token_accuracy": 0.7025641202926636, "num_tokens": 14500025.0, "step": 6786, "train/ce_loss": 7.966723387653474e-06 }, { "epoch": 0.6709511568123393, "step": 6786, "train/sim_loss": 0.05859375 }, { "epoch": 0.6709511568123393, "step": 6786, "train/total_loss": 0.05859454721212387 }, { "entropy": 8.674129486083984, "epoch": 0.6710500296618549, "mean_token_accuracy": 0.7329341173171997, "num_tokens": 14505339.0, "step": 6787, "train/ce_loss": 0.7985280752182007 }, { "epoch": 0.6710500296618549, "step": 6787, "train/sim_loss": 0.08203125 }, { "epoch": 0.6710500296618549, "step": 6787, "train/total_loss": 0.16188406944274902 }, { "entropy": 8.701408386230469, "epoch": 0.6711489025113704, "mean_token_accuracy": 0.7905982732772827, "num_tokens": 14510482.0, "step": 6788, "train/ce_loss": 0.5930907130241394 }, { "epoch": 0.6711489025113704, "step": 6788, "train/sim_loss": 0.0546875 }, { "epoch": 0.6711489025113704, "step": 6788, "train/total_loss": 0.11399657279253006 }, { "entropy": 9.007550239562988, "epoch": 0.6712477753608859, "mean_token_accuracy": 0.7002801299095154, "num_tokens": 14515652.0, "step": 6789, "train/ce_loss": 4.562507001537597e-06 }, { "epoch": 0.6712477753608859, "step": 6789, "train/sim_loss": 0.046875 }, { "epoch": 0.6712477753608859, "step": 6789, "train/total_loss": 0.04687545448541641 }, { "entropy": 9.074921607971191, "epoch": 0.6713466482104015, "mean_token_accuracy": 0.733846127986908, "num_tokens": 14520731.0, "step": 6790, "train/ce_loss": 1.7275176048278809 }, { "epoch": 0.6713466482104015, "step": 6790, "train/sim_loss": 0.06640625 }, { "epoch": 0.6713466482104015, "step": 6790, "train/total_loss": 0.2391580194234848 }, { "entropy": 8.621129035949707, "epoch": 0.671445521059917, "mean_token_accuracy": 0.7228145003318787, "num_tokens": 14526142.0, "step": 6791, "train/ce_loss": 0.8564296960830688 }, { "epoch": 0.671445521059917, "step": 6791, "train/sim_loss": 0.0546875 }, { "epoch": 0.671445521059917, "step": 6791, "train/total_loss": 0.1403304636478424 }, { "entropy": 9.489485740661621, "epoch": 0.6715443939094324, "mean_token_accuracy": 0.7197580933570862, "num_tokens": 14531241.0, "step": 6792, "train/ce_loss": 0.8726058602333069 }, { "epoch": 0.6715443939094324, "step": 6792, "train/sim_loss": 0.109375 }, { "epoch": 0.6715443939094324, "step": 6792, "train/total_loss": 0.19663558900356293 }, { "entropy": 8.867376327514648, "epoch": 0.671643266758948, "mean_token_accuracy": 0.7956621050834656, "num_tokens": 14536649.0, "step": 6793, "train/ce_loss": 0.6209725737571716 }, { "epoch": 0.671643266758948, "step": 6793, "train/sim_loss": 0.02734375 }, { "epoch": 0.671643266758948, "step": 6793, "train/total_loss": 0.08944100886583328 }, { "entropy": 8.421364784240723, "epoch": 0.6717421396084635, "mean_token_accuracy": 0.7338618636131287, "num_tokens": 14542010.0, "step": 6794, "train/ce_loss": 0.8923219442367554 }, { "epoch": 0.6717421396084635, "step": 6794, "train/sim_loss": 0.046875 }, { "epoch": 0.6717421396084635, "step": 6794, "train/total_loss": 0.1361072063446045 }, { "entropy": 9.235627174377441, "epoch": 0.671841012457979, "mean_token_accuracy": 0.7441471815109253, "num_tokens": 14547029.0, "step": 6795, "train/ce_loss": 5.730522843805375e-06 }, { "epoch": 0.671841012457979, "step": 6795, "train/sim_loss": 0.0703125 }, { "epoch": 0.671841012457979, "step": 6795, "train/total_loss": 0.07031307369470596 }, { "entropy": 8.904651641845703, "epoch": 0.6719398853074946, "mean_token_accuracy": 0.6785714030265808, "num_tokens": 14552398.0, "step": 6796, "train/ce_loss": 0.8459588885307312 }, { "epoch": 0.6719398853074946, "step": 6796, "train/sim_loss": 0.140625 }, { "epoch": 0.6719398853074946, "step": 6796, "train/total_loss": 0.22522088885307312 }, { "entropy": 9.159074783325195, "epoch": 0.6720387581570101, "mean_token_accuracy": 0.7045100927352905, "num_tokens": 14557506.0, "step": 6797, "train/ce_loss": 1.0695310831069946 }, { "epoch": 0.6720387581570101, "step": 6797, "train/sim_loss": 0.07421875 }, { "epoch": 0.6720387581570101, "step": 6797, "train/total_loss": 0.18117186427116394 }, { "entropy": 8.746875762939453, "epoch": 0.6721376310065256, "mean_token_accuracy": 0.737726092338562, "num_tokens": 14562784.0, "step": 6798, "train/ce_loss": 0.9978029131889343 }, { "epoch": 0.6721376310065256, "step": 6798, "train/sim_loss": 0.05078125 }, { "epoch": 0.6721376310065256, "step": 6798, "train/total_loss": 0.15056154131889343 }, { "entropy": 8.332484245300293, "epoch": 0.6722365038560412, "mean_token_accuracy": 0.7210215926170349, "num_tokens": 14568293.0, "step": 6799, "train/ce_loss": 0.6993699073791504 }, { "epoch": 0.6722365038560412, "step": 6799, "train/sim_loss": 0.0546875 }, { "epoch": 0.6722365038560412, "step": 6799, "train/total_loss": 0.12462449073791504 }, { "epoch": 0.6723353767055567, "grad_norm": 0.6852823495864868, "learning_rate": 8.321465657914257e-06, "loss": 0.1423, "step": 6800 }, { "entropy": 8.945253372192383, "epoch": 0.6723353767055567, "mean_token_accuracy": 0.7063882350921631, "num_tokens": 14573574.0, "step": 6800, "train/ce_loss": 1.25028657913208 }, { "epoch": 0.6723353767055567, "step": 6800, "train/sim_loss": 0.04296875 }, { "epoch": 0.6723353767055567, "step": 6800, "train/total_loss": 0.16799740493297577 }, { "entropy": 9.010234832763672, "epoch": 0.6724342495550721, "mean_token_accuracy": 0.7916666865348816, "num_tokens": 14578724.0, "step": 6801, "train/ce_loss": 0.6684384346008301 }, { "epoch": 0.6724342495550721, "step": 6801, "train/sim_loss": 0.03515625 }, { "epoch": 0.6724342495550721, "step": 6801, "train/total_loss": 0.10200009495019913 }, { "entropy": 9.172411918640137, "epoch": 0.6725331224045877, "mean_token_accuracy": 0.7311643958091736, "num_tokens": 14583733.0, "step": 6802, "train/ce_loss": 5.024392521590926e-06 }, { "epoch": 0.6725331224045877, "step": 6802, "train/sim_loss": 0.02734375 }, { "epoch": 0.6725331224045877, "step": 6802, "train/total_loss": 0.027344252914190292 }, { "entropy": 8.602190017700195, "epoch": 0.6726319952541032, "mean_token_accuracy": 0.7981330156326294, "num_tokens": 14589126.0, "step": 6803, "train/ce_loss": 0.8179534077644348 }, { "epoch": 0.6726319952541032, "step": 6803, "train/sim_loss": 0.05859375 }, { "epoch": 0.6726319952541032, "step": 6803, "train/total_loss": 0.140389084815979 }, { "entropy": 8.877935409545898, "epoch": 0.6727308681036187, "mean_token_accuracy": 0.7146666646003723, "num_tokens": 14594385.0, "step": 6804, "train/ce_loss": 1.2639179229736328 }, { "epoch": 0.6727308681036187, "step": 6804, "train/sim_loss": 0.1015625 }, { "epoch": 0.6727308681036187, "step": 6804, "train/total_loss": 0.22795429825782776 }, { "entropy": 8.61834716796875, "epoch": 0.6728297409531343, "mean_token_accuracy": 0.7352246046066284, "num_tokens": 14599790.0, "step": 6805, "train/ce_loss": 0.7303071618080139 }, { "epoch": 0.6728297409531343, "step": 6805, "train/sim_loss": 0.07421875 }, { "epoch": 0.6728297409531343, "step": 6805, "train/total_loss": 0.14724946022033691 }, { "entropy": 8.895477294921875, "epoch": 0.6729286138026498, "mean_token_accuracy": 0.8366477489471436, "num_tokens": 14604943.0, "step": 6806, "train/ce_loss": 0.6171497702598572 }, { "epoch": 0.6729286138026498, "step": 6806, "train/sim_loss": 0.0625 }, { "epoch": 0.6729286138026498, "step": 6806, "train/total_loss": 0.12421497702598572 }, { "entropy": 8.971907615661621, "epoch": 0.6730274866521653, "mean_token_accuracy": 0.7116212248802185, "num_tokens": 14610104.0, "step": 6807, "train/ce_loss": 1.6657736523484346e-06 }, { "epoch": 0.6730274866521653, "step": 6807, "train/sim_loss": 0.0546875 }, { "epoch": 0.6730274866521653, "step": 6807, "train/total_loss": 0.05468766763806343 }, { "entropy": 8.885343551635742, "epoch": 0.6731263595016809, "mean_token_accuracy": 0.7256515622138977, "num_tokens": 14615334.0, "step": 6808, "train/ce_loss": 0.7519450783729553 }, { "epoch": 0.6731263595016809, "step": 6808, "train/sim_loss": 0.078125 }, { "epoch": 0.6731263595016809, "step": 6808, "train/total_loss": 0.15331950783729553 }, { "entropy": 8.883771896362305, "epoch": 0.6732252323511964, "mean_token_accuracy": 0.7266355156898499, "num_tokens": 14620641.0, "step": 6809, "train/ce_loss": 0.9417589902877808 }, { "epoch": 0.6732252323511964, "step": 6809, "train/sim_loss": 0.0546875 }, { "epoch": 0.6732252323511964, "step": 6809, "train/total_loss": 0.14886340498924255 }, { "entropy": 9.174736022949219, "epoch": 0.6733241052007118, "mean_token_accuracy": 0.7337837815284729, "num_tokens": 14625817.0, "step": 6810, "train/ce_loss": 0.47125375270843506 }, { "epoch": 0.6733241052007118, "step": 6810, "train/sim_loss": 0.0625 }, { "epoch": 0.6733241052007118, "step": 6810, "train/total_loss": 0.10962537676095963 }, { "entropy": 8.47231674194336, "epoch": 0.6734229780502274, "mean_token_accuracy": 0.7186098694801331, "num_tokens": 14631156.0, "step": 6811, "train/ce_loss": 1.0583274364471436 }, { "epoch": 0.6734229780502274, "step": 6811, "train/sim_loss": 0.04296875 }, { "epoch": 0.6734229780502274, "step": 6811, "train/total_loss": 0.1488015055656433 }, { "entropy": 8.644718170166016, "epoch": 0.6735218508997429, "mean_token_accuracy": 0.7537961006164551, "num_tokens": 14636570.0, "step": 6812, "train/ce_loss": 0.8156536221504211 }, { "epoch": 0.6735218508997429, "step": 6812, "train/sim_loss": 0.01953125 }, { "epoch": 0.6735218508997429, "step": 6812, "train/total_loss": 0.10109661519527435 }, { "entropy": 9.330245971679688, "epoch": 0.6736207237492584, "mean_token_accuracy": 0.7583333253860474, "num_tokens": 14641591.0, "step": 6813, "train/ce_loss": 1.5970101356506348 }, { "epoch": 0.6736207237492584, "step": 6813, "train/sim_loss": 0.02734375 }, { "epoch": 0.6736207237492584, "step": 6813, "train/total_loss": 0.18704476952552795 }, { "entropy": 8.915796279907227, "epoch": 0.673719596598774, "mean_token_accuracy": 0.6687578558921814, "num_tokens": 14646856.0, "step": 6814, "train/ce_loss": 0.5850891470909119 }, { "epoch": 0.673719596598774, "step": 6814, "train/sim_loss": 0.08203125 }, { "epoch": 0.673719596598774, "step": 6814, "train/total_loss": 0.14054016768932343 }, { "entropy": 8.369819641113281, "epoch": 0.6738184694482895, "mean_token_accuracy": 0.7742214798927307, "num_tokens": 14652496.0, "step": 6815, "train/ce_loss": 0.9914873838424683 }, { "epoch": 0.6738184694482895, "step": 6815, "train/sim_loss": 0.05078125 }, { "epoch": 0.6738184694482895, "step": 6815, "train/total_loss": 0.14993000030517578 }, { "entropy": 8.665181159973145, "epoch": 0.673917342297805, "mean_token_accuracy": 0.7642857432365417, "num_tokens": 14657829.0, "step": 6816, "train/ce_loss": 1.0319336652755737 }, { "epoch": 0.673917342297805, "step": 6816, "train/sim_loss": 0.06640625 }, { "epoch": 0.673917342297805, "step": 6816, "train/total_loss": 0.16959962248802185 }, { "entropy": 8.193489074707031, "epoch": 0.6740162151473206, "mean_token_accuracy": 0.7354085445404053, "num_tokens": 14663450.0, "step": 6817, "train/ce_loss": 1.1664539575576782 }, { "epoch": 0.6740162151473206, "step": 6817, "train/sim_loss": 0.09765625 }, { "epoch": 0.6740162151473206, "step": 6817, "train/total_loss": 0.21430164575576782 }, { "entropy": 8.494543075561523, "epoch": 0.6741150879968361, "mean_token_accuracy": 0.7384230494499207, "num_tokens": 14668760.0, "step": 6818, "train/ce_loss": 0.7610848546028137 }, { "epoch": 0.6741150879968361, "step": 6818, "train/sim_loss": 0.0703125 }, { "epoch": 0.6741150879968361, "step": 6818, "train/total_loss": 0.14642098546028137 }, { "entropy": 9.040728569030762, "epoch": 0.6742139608463515, "mean_token_accuracy": 0.7747092843055725, "num_tokens": 14673904.0, "step": 6819, "train/ce_loss": 0.5640541911125183 }, { "epoch": 0.6742139608463515, "step": 6819, "train/sim_loss": 0.046875 }, { "epoch": 0.6742139608463515, "step": 6819, "train/total_loss": 0.10328042507171631 }, { "epoch": 0.6743128336958671, "grad_norm": 0.661296546459198, "learning_rate": 8.316520793156307e-06, "loss": 0.1371, "step": 6820 }, { "entropy": 9.017964363098145, "epoch": 0.6743128336958671, "mean_token_accuracy": 0.7128129601478577, "num_tokens": 14679041.0, "step": 6820, "train/ce_loss": 0.4595761299133301 }, { "epoch": 0.6743128336958671, "step": 6820, "train/sim_loss": 0.09375 }, { "epoch": 0.6743128336958671, "step": 6820, "train/total_loss": 0.13970761001110077 }, { "entropy": 9.40074348449707, "epoch": 0.6744117065453826, "mean_token_accuracy": 0.7176684737205505, "num_tokens": 14683965.0, "step": 6821, "train/ce_loss": 4.357888883532723e-06 }, { "epoch": 0.6744117065453826, "step": 6821, "train/sim_loss": 0.0234375 }, { "epoch": 0.6744117065453826, "step": 6821, "train/total_loss": 0.02343793585896492 }, { "entropy": 8.930868148803711, "epoch": 0.6745105793948981, "mean_token_accuracy": 0.7690058350563049, "num_tokens": 14689138.0, "step": 6822, "train/ce_loss": 3.26072949974332e-06 }, { "epoch": 0.6745105793948981, "step": 6822, "train/sim_loss": 0.0390625 }, { "epoch": 0.6745105793948981, "step": 6822, "train/total_loss": 0.039062827825546265 }, { "entropy": 8.487780570983887, "epoch": 0.6746094522444137, "mean_token_accuracy": 0.7112582921981812, "num_tokens": 14694351.0, "step": 6823, "train/ce_loss": 1.1790664196014404 }, { "epoch": 0.6746094522444137, "step": 6823, "train/sim_loss": 0.046875 }, { "epoch": 0.6746094522444137, "step": 6823, "train/total_loss": 0.16478164494037628 }, { "entropy": 8.819021224975586, "epoch": 0.6747083250939292, "mean_token_accuracy": 0.7446556091308594, "num_tokens": 14699733.0, "step": 6824, "train/ce_loss": 0.4874028265476227 }, { "epoch": 0.6747083250939292, "step": 6824, "train/sim_loss": 0.0625 }, { "epoch": 0.6747083250939292, "step": 6824, "train/total_loss": 0.11124028265476227 }, { "entropy": 8.674089431762695, "epoch": 0.6748071979434447, "mean_token_accuracy": 0.7688171863555908, "num_tokens": 14704947.0, "step": 6825, "train/ce_loss": 0.40436989068984985 }, { "epoch": 0.6748071979434447, "step": 6825, "train/sim_loss": 0.0625 }, { "epoch": 0.6748071979434447, "step": 6825, "train/total_loss": 0.1029369905591011 }, { "entropy": 9.120749473571777, "epoch": 0.6749060707929603, "mean_token_accuracy": 0.7311320900917053, "num_tokens": 14709988.0, "step": 6826, "train/ce_loss": 1.4153249263763428 }, { "epoch": 0.6749060707929603, "step": 6826, "train/sim_loss": 0.05078125 }, { "epoch": 0.6749060707929603, "step": 6826, "train/total_loss": 0.19231374561786652 }, { "entropy": 8.94398307800293, "epoch": 0.6750049436424758, "mean_token_accuracy": 0.7023959755897522, "num_tokens": 14715260.0, "step": 6827, "train/ce_loss": 0.6066616773605347 }, { "epoch": 0.6750049436424758, "step": 6827, "train/sim_loss": 0.0390625 }, { "epoch": 0.6750049436424758, "step": 6827, "train/total_loss": 0.09972867369651794 }, { "entropy": 8.429058074951172, "epoch": 0.6751038164919912, "mean_token_accuracy": 0.7477295398712158, "num_tokens": 14720683.0, "step": 6828, "train/ce_loss": 1.1365456581115723 }, { "epoch": 0.6751038164919912, "step": 6828, "train/sim_loss": 0.12890625 }, { "epoch": 0.6751038164919912, "step": 6828, "train/total_loss": 0.24256081879138947 }, { "entropy": 8.282581329345703, "epoch": 0.6752026893415068, "mean_token_accuracy": 0.7324913740158081, "num_tokens": 14726027.0, "step": 6829, "train/ce_loss": 1.027675747871399 }, { "epoch": 0.6752026893415068, "step": 6829, "train/sim_loss": 0.08984375 }, { "epoch": 0.6752026893415068, "step": 6829, "train/total_loss": 0.19261133670806885 }, { "entropy": 8.828989028930664, "epoch": 0.6753015621910223, "mean_token_accuracy": 0.7523696422576904, "num_tokens": 14731516.0, "step": 6830, "train/ce_loss": 0.8268953561782837 }, { "epoch": 0.6753015621910223, "step": 6830, "train/sim_loss": 0.0703125 }, { "epoch": 0.6753015621910223, "step": 6830, "train/total_loss": 0.1530020385980606 }, { "entropy": 9.216299057006836, "epoch": 0.6754004350405378, "mean_token_accuracy": 0.8033794164657593, "num_tokens": 14736587.0, "step": 6831, "train/ce_loss": 0.6765879988670349 }, { "epoch": 0.6754004350405378, "step": 6831, "train/sim_loss": 0.0859375 }, { "epoch": 0.6754004350405378, "step": 6831, "train/total_loss": 0.15359631180763245 }, { "entropy": 9.358110427856445, "epoch": 0.6754993078900534, "mean_token_accuracy": 0.719298243522644, "num_tokens": 14741464.0, "step": 6832, "train/ce_loss": 2.968577064166311e-06 }, { "epoch": 0.6754993078900534, "step": 6832, "train/sim_loss": 0.04296875 }, { "epoch": 0.6754993078900534, "step": 6832, "train/total_loss": 0.04296904802322388 }, { "entropy": 9.096577644348145, "epoch": 0.6755981807395689, "mean_token_accuracy": 0.7557522058486938, "num_tokens": 14746443.0, "step": 6833, "train/ce_loss": 1.0856497287750244 }, { "epoch": 0.6755981807395689, "step": 6833, "train/sim_loss": 0.046875 }, { "epoch": 0.6755981807395689, "step": 6833, "train/total_loss": 0.15543997287750244 }, { "entropy": 8.473155975341797, "epoch": 0.6756970535890844, "mean_token_accuracy": 0.7213459610939026, "num_tokens": 14751898.0, "step": 6834, "train/ce_loss": 1.297159194946289 }, { "epoch": 0.6756970535890844, "step": 6834, "train/sim_loss": 0.06640625 }, { "epoch": 0.6756970535890844, "step": 6834, "train/total_loss": 0.1961221694946289 }, { "entropy": 9.074554443359375, "epoch": 0.6757959264386, "mean_token_accuracy": 0.7377938628196716, "num_tokens": 14756840.0, "step": 6835, "train/ce_loss": 3.831556114164414e-06 }, { "epoch": 0.6757959264386, "step": 6835, "train/sim_loss": 0.0390625 }, { "epoch": 0.6757959264386, "step": 6835, "train/total_loss": 0.03906288370490074 }, { "entropy": 8.668545722961426, "epoch": 0.6758947992881155, "mean_token_accuracy": 0.7134703397750854, "num_tokens": 14762208.0, "step": 6836, "train/ce_loss": 0.8337442874908447 }, { "epoch": 0.6758947992881155, "step": 6836, "train/sim_loss": 0.0625 }, { "epoch": 0.6758947992881155, "step": 6836, "train/total_loss": 0.14587444067001343 }, { "entropy": 8.646568298339844, "epoch": 0.675993672137631, "mean_token_accuracy": 0.7409909963607788, "num_tokens": 14767590.0, "step": 6837, "train/ce_loss": 0.5781188011169434 }, { "epoch": 0.675993672137631, "step": 6837, "train/sim_loss": 0.046875 }, { "epoch": 0.675993672137631, "step": 6837, "train/total_loss": 0.10468688607215881 }, { "entropy": 8.695724487304688, "epoch": 0.6760925449871465, "mean_token_accuracy": 0.7177321910858154, "num_tokens": 14772971.0, "step": 6838, "train/ce_loss": 0.9211334586143494 }, { "epoch": 0.6760925449871465, "step": 6838, "train/sim_loss": 0.04296875 }, { "epoch": 0.6760925449871465, "step": 6838, "train/total_loss": 0.13508209586143494 }, { "entropy": 8.353857040405273, "epoch": 0.676191417836662, "mean_token_accuracy": 0.7373448014259338, "num_tokens": 14778468.0, "step": 6839, "train/ce_loss": 1.2006250619888306 }, { "epoch": 0.676191417836662, "step": 6839, "train/sim_loss": 0.05859375 }, { "epoch": 0.676191417836662, "step": 6839, "train/total_loss": 0.17865625023841858 }, { "epoch": 0.6762902906861776, "grad_norm": 0.6604064702987671, "learning_rate": 8.311575928398358e-06, "loss": 0.1354, "step": 6840 }, { "entropy": 8.77322006225586, "epoch": 0.6762902906861776, "mean_token_accuracy": 0.8018134832382202, "num_tokens": 14783842.0, "step": 6840, "train/ce_loss": 1.3195806741714478 }, { "epoch": 0.6762902906861776, "step": 6840, "train/sim_loss": 0.0625 }, { "epoch": 0.6762902906861776, "step": 6840, "train/total_loss": 0.19445806741714478 }, { "entropy": 9.559065818786621, "epoch": 0.6763891635356931, "mean_token_accuracy": 0.7227488160133362, "num_tokens": 14788688.0, "step": 6841, "train/ce_loss": 0.7800194621086121 }, { "epoch": 0.6763891635356931, "step": 6841, "train/sim_loss": 0.046875 }, { "epoch": 0.6763891635356931, "step": 6841, "train/total_loss": 0.1248769462108612 }, { "entropy": 9.368246078491211, "epoch": 0.6764880363852086, "mean_token_accuracy": 0.7395498156547546, "num_tokens": 14793765.0, "step": 6842, "train/ce_loss": 1.6094815731048584 }, { "epoch": 0.6764880363852086, "step": 6842, "train/sim_loss": 0.0625 }, { "epoch": 0.6764880363852086, "step": 6842, "train/total_loss": 0.22344815731048584 }, { "entropy": 8.946149826049805, "epoch": 0.6765869092347242, "mean_token_accuracy": 0.7457886934280396, "num_tokens": 14798847.0, "step": 6843, "train/ce_loss": 0.8735904693603516 }, { "epoch": 0.6765869092347242, "step": 6843, "train/sim_loss": 0.0625 }, { "epoch": 0.6765869092347242, "step": 6843, "train/total_loss": 0.14985904097557068 }, { "entropy": 9.335906982421875, "epoch": 0.6766857820842397, "mean_token_accuracy": 0.7568027377128601, "num_tokens": 14803898.0, "step": 6844, "train/ce_loss": 4.403523234941531e-06 }, { "epoch": 0.6766857820842397, "step": 6844, "train/sim_loss": 0.0234375 }, { "epoch": 0.6766857820842397, "step": 6844, "train/total_loss": 0.02343793958425522 }, { "entropy": 8.541854858398438, "epoch": 0.6767846549337552, "mean_token_accuracy": 0.6717724204063416, "num_tokens": 14809218.0, "step": 6845, "train/ce_loss": 0.6349949240684509 }, { "epoch": 0.6767846549337552, "step": 6845, "train/sim_loss": 0.04296875 }, { "epoch": 0.6767846549337552, "step": 6845, "train/total_loss": 0.10646824538707733 }, { "entropy": 8.886573791503906, "epoch": 0.6768835277832708, "mean_token_accuracy": 0.7605633735656738, "num_tokens": 14814368.0, "step": 6846, "train/ce_loss": 1.423653244972229 }, { "epoch": 0.6768835277832708, "step": 6846, "train/sim_loss": 0.1328125 }, { "epoch": 0.6768835277832708, "step": 6846, "train/total_loss": 0.27517783641815186 }, { "entropy": 9.543976783752441, "epoch": 0.6769824006327863, "mean_token_accuracy": 0.7192575335502625, "num_tokens": 14819205.0, "step": 6847, "train/ce_loss": 9.700500413600821e-06 }, { "epoch": 0.6769824006327863, "step": 6847, "train/sim_loss": 0.05859375 }, { "epoch": 0.6769824006327863, "step": 6847, "train/total_loss": 0.0585947185754776 }, { "entropy": 8.52188491821289, "epoch": 0.6770812734823017, "mean_token_accuracy": 0.7841945290565491, "num_tokens": 14824616.0, "step": 6848, "train/ce_loss": 1.2499181032180786 }, { "epoch": 0.6770812734823017, "step": 6848, "train/sim_loss": 0.0703125 }, { "epoch": 0.6770812734823017, "step": 6848, "train/total_loss": 0.19530430436134338 }, { "entropy": 9.283021926879883, "epoch": 0.6771801463318173, "mean_token_accuracy": 0.7885714173316956, "num_tokens": 14829594.0, "step": 6849, "train/ce_loss": 0.9894952178001404 }, { "epoch": 0.6771801463318173, "step": 6849, "train/sim_loss": 0.08203125 }, { "epoch": 0.6771801463318173, "step": 6849, "train/total_loss": 0.18098077178001404 }, { "entropy": 8.678661346435547, "epoch": 0.6772790191813328, "mean_token_accuracy": 0.7326086759567261, "num_tokens": 14834961.0, "step": 6850, "train/ce_loss": 0.7352795004844666 }, { "epoch": 0.6772790191813328, "step": 6850, "train/sim_loss": 0.0234375 }, { "epoch": 0.6772790191813328, "step": 6850, "train/total_loss": 0.09696545451879501 }, { "entropy": 9.54398250579834, "epoch": 0.6773778920308483, "mean_token_accuracy": 0.7697674632072449, "num_tokens": 14839716.0, "step": 6851, "train/ce_loss": 3.1513832254859153e-06 }, { "epoch": 0.6773778920308483, "step": 6851, "train/sim_loss": 0.046875 }, { "epoch": 0.6773778920308483, "step": 6851, "train/total_loss": 0.04687531664967537 }, { "entropy": 8.723682403564453, "epoch": 0.6774767648803639, "mean_token_accuracy": 0.7455012798309326, "num_tokens": 14844916.0, "step": 6852, "train/ce_loss": 0.7275230288505554 }, { "epoch": 0.6774767648803639, "step": 6852, "train/sim_loss": 0.06640625 }, { "epoch": 0.6774767648803639, "step": 6852, "train/total_loss": 0.13915854692459106 }, { "entropy": 8.776834487915039, "epoch": 0.6775756377298794, "mean_token_accuracy": 0.7715404629707336, "num_tokens": 14850185.0, "step": 6853, "train/ce_loss": 0.847054123878479 }, { "epoch": 0.6775756377298794, "step": 6853, "train/sim_loss": 0.07421875 }, { "epoch": 0.6775756377298794, "step": 6853, "train/total_loss": 0.1589241623878479 }, { "entropy": 9.03692626953125, "epoch": 0.6776745105793949, "mean_token_accuracy": 0.7419354915618896, "num_tokens": 14855253.0, "step": 6854, "train/ce_loss": 5.175889782549348e-06 }, { "epoch": 0.6776745105793949, "step": 6854, "train/sim_loss": 0.0546875 }, { "epoch": 0.6776745105793949, "step": 6854, "train/total_loss": 0.054688017815351486 }, { "entropy": 9.113094329833984, "epoch": 0.6777733834289105, "mean_token_accuracy": 0.7496522665023804, "num_tokens": 14860449.0, "step": 6855, "train/ce_loss": 6.646732799708843e-06 }, { "epoch": 0.6777733834289105, "step": 6855, "train/sim_loss": 0.0859375 }, { "epoch": 0.6777733834289105, "step": 6855, "train/total_loss": 0.08593816310167313 }, { "entropy": 8.981744766235352, "epoch": 0.677872256278426, "mean_token_accuracy": 0.7337748408317566, "num_tokens": 14865638.0, "step": 6856, "train/ce_loss": 1.427980661392212 }, { "epoch": 0.677872256278426, "step": 6856, "train/sim_loss": 0.0703125 }, { "epoch": 0.677872256278426, "step": 6856, "train/total_loss": 0.2131105661392212 }, { "entropy": 8.741118431091309, "epoch": 0.6779711291279414, "mean_token_accuracy": 0.7225950956344604, "num_tokens": 14870977.0, "step": 6857, "train/ce_loss": 0.7294755578041077 }, { "epoch": 0.6779711291279414, "step": 6857, "train/sim_loss": 0.07421875 }, { "epoch": 0.6779711291279414, "step": 6857, "train/total_loss": 0.14716631174087524 }, { "entropy": 8.609580039978027, "epoch": 0.678070001977457, "mean_token_accuracy": 0.7299578189849854, "num_tokens": 14876388.0, "step": 6858, "train/ce_loss": 0.8117855787277222 }, { "epoch": 0.678070001977457, "step": 6858, "train/sim_loss": 0.05859375 }, { "epoch": 0.678070001977457, "step": 6858, "train/total_loss": 0.13977231085300446 }, { "entropy": 8.76502799987793, "epoch": 0.6781688748269725, "mean_token_accuracy": 0.6953846216201782, "num_tokens": 14881842.0, "step": 6859, "train/ce_loss": 0.9069038033485413 }, { "epoch": 0.6781688748269725, "step": 6859, "train/sim_loss": 0.08984375 }, { "epoch": 0.6781688748269725, "step": 6859, "train/total_loss": 0.18053412437438965 }, { "epoch": 0.678267747676488, "grad_norm": 0.7154859304428101, "learning_rate": 8.30663106364041e-06, "loss": 0.1401, "step": 6860 }, { "entropy": 9.234578132629395, "epoch": 0.678267747676488, "mean_token_accuracy": 0.7308319807052612, "num_tokens": 14886916.0, "step": 6860, "train/ce_loss": 1.1456135511398315 }, { "epoch": 0.678267747676488, "step": 6860, "train/sim_loss": 0.11328125 }, { "epoch": 0.678267747676488, "step": 6860, "train/total_loss": 0.22784259915351868 }, { "entropy": 9.708675384521484, "epoch": 0.6783666205260036, "mean_token_accuracy": 0.6778523325920105, "num_tokens": 14891772.0, "step": 6861, "train/ce_loss": 2.165543556213379 }, { "epoch": 0.6783666205260036, "step": 6861, "train/sim_loss": 0.109375 }, { "epoch": 0.6783666205260036, "step": 6861, "train/total_loss": 0.32592934370040894 }, { "entropy": 8.549935340881348, "epoch": 0.6784654933755191, "mean_token_accuracy": 0.7832335233688354, "num_tokens": 14897086.0, "step": 6862, "train/ce_loss": 0.6527609825134277 }, { "epoch": 0.6784654933755191, "step": 6862, "train/sim_loss": 0.0390625 }, { "epoch": 0.6784654933755191, "step": 6862, "train/total_loss": 0.10433860123157501 }, { "entropy": 9.58963394165039, "epoch": 0.6785643662250346, "mean_token_accuracy": 0.8432835936546326, "num_tokens": 14901904.0, "step": 6863, "train/ce_loss": 5.530352154892171e-06 }, { "epoch": 0.6785643662250346, "step": 6863, "train/sim_loss": 0.0390625 }, { "epoch": 0.6785643662250346, "step": 6863, "train/total_loss": 0.03906305134296417 }, { "entropy": 9.278046607971191, "epoch": 0.6786632390745502, "mean_token_accuracy": 0.7939698696136475, "num_tokens": 14906965.0, "step": 6864, "train/ce_loss": 1.0491366386413574 }, { "epoch": 0.6786632390745502, "step": 6864, "train/sim_loss": 0.0703125 }, { "epoch": 0.6786632390745502, "step": 6864, "train/total_loss": 0.17522616684436798 }, { "entropy": 8.817192077636719, "epoch": 0.6787621119240657, "mean_token_accuracy": 0.7254408001899719, "num_tokens": 14912234.0, "step": 6865, "train/ce_loss": 0.6184538006782532 }, { "epoch": 0.6787621119240657, "step": 6865, "train/sim_loss": 0.046875 }, { "epoch": 0.6787621119240657, "step": 6865, "train/total_loss": 0.10872037708759308 }, { "entropy": 8.771540641784668, "epoch": 0.6788609847735811, "mean_token_accuracy": 0.6845729947090149, "num_tokens": 14917446.0, "step": 6866, "train/ce_loss": 2.3287134170532227 }, { "epoch": 0.6788609847735811, "step": 6866, "train/sim_loss": 0.04296875 }, { "epoch": 0.6788609847735811, "step": 6866, "train/total_loss": 0.2758401036262512 }, { "entropy": 8.880404472351074, "epoch": 0.6789598576230967, "mean_token_accuracy": 0.768757700920105, "num_tokens": 14922702.0, "step": 6867, "train/ce_loss": 0.3743933141231537 }, { "epoch": 0.6789598576230967, "step": 6867, "train/sim_loss": 0.046875 }, { "epoch": 0.6789598576230967, "step": 6867, "train/total_loss": 0.08431433141231537 }, { "entropy": 8.87601375579834, "epoch": 0.6790587304726122, "mean_token_accuracy": 0.7733989953994751, "num_tokens": 14928006.0, "step": 6868, "train/ce_loss": 0.48566004633903503 }, { "epoch": 0.6790587304726122, "step": 6868, "train/sim_loss": 0.015625 }, { "epoch": 0.6790587304726122, "step": 6868, "train/total_loss": 0.06419100612401962 }, { "entropy": 8.96407413482666, "epoch": 0.6791576033221277, "mean_token_accuracy": 0.7106825113296509, "num_tokens": 14933135.0, "step": 6869, "train/ce_loss": 1.0607415437698364 }, { "epoch": 0.6791576033221277, "step": 6869, "train/sim_loss": 0.04296875 }, { "epoch": 0.6791576033221277, "step": 6869, "train/total_loss": 0.14904290437698364 }, { "entropy": 9.414806365966797, "epoch": 0.6792564761716433, "mean_token_accuracy": 0.7077175974845886, "num_tokens": 14938181.0, "step": 6870, "train/ce_loss": 1.7140169143676758 }, { "epoch": 0.6792564761716433, "step": 6870, "train/sim_loss": 0.08984375 }, { "epoch": 0.6792564761716433, "step": 6870, "train/total_loss": 0.2612454295158386 }, { "entropy": 8.594289779663086, "epoch": 0.6793553490211588, "mean_token_accuracy": 0.7311475276947021, "num_tokens": 14943598.0, "step": 6871, "train/ce_loss": 1.0748900175094604 }, { "epoch": 0.6793553490211588, "step": 6871, "train/sim_loss": 0.02734375 }, { "epoch": 0.6793553490211588, "step": 6871, "train/total_loss": 0.13483275473117828 }, { "entropy": 9.227981567382812, "epoch": 0.6794542218706743, "mean_token_accuracy": 0.6536585092544556, "num_tokens": 14948691.0, "step": 6872, "train/ce_loss": 1.4994652701716404e-06 }, { "epoch": 0.6794542218706743, "step": 6872, "train/sim_loss": 0.03515625 }, { "epoch": 0.6794542218706743, "step": 6872, "train/total_loss": 0.03515639901161194 }, { "entropy": 8.879727363586426, "epoch": 0.6795530947201899, "mean_token_accuracy": 0.7410714030265808, "num_tokens": 14953970.0, "step": 6873, "train/ce_loss": 0.49233755469322205 }, { "epoch": 0.6795530947201899, "step": 6873, "train/sim_loss": 0.03515625 }, { "epoch": 0.6795530947201899, "step": 6873, "train/total_loss": 0.08439000695943832 }, { "entropy": 8.354992866516113, "epoch": 0.6796519675697054, "mean_token_accuracy": 0.7238709926605225, "num_tokens": 14959246.0, "step": 6874, "train/ce_loss": 1.4569038152694702 }, { "epoch": 0.6796519675697054, "step": 6874, "train/sim_loss": 0.109375 }, { "epoch": 0.6796519675697054, "step": 6874, "train/total_loss": 0.255065381526947 }, { "entropy": 8.69202995300293, "epoch": 0.6797508404192208, "mean_token_accuracy": 0.643468976020813, "num_tokens": 14964645.0, "step": 6875, "train/ce_loss": 1.3519893884658813 }, { "epoch": 0.6797508404192208, "step": 6875, "train/sim_loss": 0.09375 }, { "epoch": 0.6797508404192208, "step": 6875, "train/total_loss": 0.2289489358663559 }, { "entropy": 8.685219764709473, "epoch": 0.6798497132687364, "mean_token_accuracy": 0.8020954728126526, "num_tokens": 14969947.0, "step": 6876, "train/ce_loss": 0.3879144787788391 }, { "epoch": 0.6798497132687364, "step": 6876, "train/sim_loss": 0.046875 }, { "epoch": 0.6798497132687364, "step": 6876, "train/total_loss": 0.08566644787788391 }, { "entropy": 8.884944915771484, "epoch": 0.6799485861182519, "mean_token_accuracy": 0.7363494634628296, "num_tokens": 14975010.0, "step": 6877, "train/ce_loss": 1.5219731330871582 }, { "epoch": 0.6799485861182519, "step": 6877, "train/sim_loss": 0.078125 }, { "epoch": 0.6799485861182519, "step": 6877, "train/total_loss": 0.23032231628894806 }, { "entropy": 9.34930419921875, "epoch": 0.6800474589677674, "mean_token_accuracy": 0.7857142686843872, "num_tokens": 14980055.0, "step": 6878, "train/ce_loss": 6.238361947907833e-06 }, { "epoch": 0.6800474589677674, "step": 6878, "train/sim_loss": 0.1171875 }, { "epoch": 0.6800474589677674, "step": 6878, "train/total_loss": 0.11718812584877014 }, { "entropy": 8.546012878417969, "epoch": 0.680146331817283, "mean_token_accuracy": 0.6915477514266968, "num_tokens": 14985474.0, "step": 6879, "train/ce_loss": 1.1794265508651733 }, { "epoch": 0.680146331817283, "step": 6879, "train/sim_loss": 0.05078125 }, { "epoch": 0.680146331817283, "step": 6879, "train/total_loss": 0.1687239110469818 }, { "epoch": 0.6802452046667985, "grad_norm": 0.7139967083930969, "learning_rate": 8.30168619888246e-06, "loss": 0.1433, "step": 6880 }, { "entropy": 8.262923240661621, "epoch": 0.6802452046667985, "mean_token_accuracy": 0.7223360538482666, "num_tokens": 14990985.0, "step": 6880, "train/ce_loss": 0.9992165565490723 }, { "epoch": 0.6802452046667985, "step": 6880, "train/sim_loss": 0.05859375 }, { "epoch": 0.6802452046667985, "step": 6880, "train/total_loss": 0.15851540863513947 }, { "entropy": 8.59264087677002, "epoch": 0.680344077516314, "mean_token_accuracy": 0.7144362330436707, "num_tokens": 14996403.0, "step": 6881, "train/ce_loss": 0.867514967918396 }, { "epoch": 0.680344077516314, "step": 6881, "train/sim_loss": 0.03515625 }, { "epoch": 0.680344077516314, "step": 6881, "train/total_loss": 0.12190774828195572 }, { "entropy": 8.539525032043457, "epoch": 0.6804429503658296, "mean_token_accuracy": 0.7626774907112122, "num_tokens": 15001873.0, "step": 6882, "train/ce_loss": 0.5471914410591125 }, { "epoch": 0.6804429503658296, "step": 6882, "train/sim_loss": 0.01953125 }, { "epoch": 0.6804429503658296, "step": 6882, "train/total_loss": 0.07425040006637573 }, { "entropy": 8.81664752960205, "epoch": 0.6805418232153451, "mean_token_accuracy": 0.7230576276779175, "num_tokens": 15007133.0, "step": 6883, "train/ce_loss": 0.8308492302894592 }, { "epoch": 0.6805418232153451, "step": 6883, "train/sim_loss": 0.08984375 }, { "epoch": 0.6805418232153451, "step": 6883, "train/total_loss": 0.17292867600917816 }, { "entropy": 8.927459716796875, "epoch": 0.6806406960648606, "mean_token_accuracy": 0.7009803652763367, "num_tokens": 15012392.0, "step": 6884, "train/ce_loss": 0.5564685463905334 }, { "epoch": 0.6806406960648606, "step": 6884, "train/sim_loss": 0.05859375 }, { "epoch": 0.6806406960648606, "step": 6884, "train/total_loss": 0.1142406016588211 }, { "entropy": 8.939496994018555, "epoch": 0.6807395689143761, "mean_token_accuracy": 0.8058551549911499, "num_tokens": 15017479.0, "step": 6885, "train/ce_loss": 0.8924233317375183 }, { "epoch": 0.6807395689143761, "step": 6885, "train/sim_loss": 0.06640625 }, { "epoch": 0.6807395689143761, "step": 6885, "train/total_loss": 0.1556485891342163 }, { "entropy": 8.974817276000977, "epoch": 0.6808384417638916, "mean_token_accuracy": 0.7827337980270386, "num_tokens": 15022605.0, "step": 6886, "train/ce_loss": 1.5032484043331351e-05 }, { "epoch": 0.6808384417638916, "step": 6886, "train/sim_loss": 0.04296875 }, { "epoch": 0.6808384417638916, "step": 6886, "train/total_loss": 0.04297025501728058 }, { "entropy": 9.356979370117188, "epoch": 0.6809373146134071, "mean_token_accuracy": 0.7393483519554138, "num_tokens": 15027462.0, "step": 6887, "train/ce_loss": 5.043874807597604e-06 }, { "epoch": 0.6809373146134071, "step": 6887, "train/sim_loss": 0.07421875 }, { "epoch": 0.6809373146134071, "step": 6887, "train/total_loss": 0.07421925663948059 }, { "entropy": 8.717205047607422, "epoch": 0.6810361874629227, "mean_token_accuracy": 0.7674999833106995, "num_tokens": 15032714.0, "step": 6888, "train/ce_loss": 0.8920142650604248 }, { "epoch": 0.6810361874629227, "step": 6888, "train/sim_loss": 0.05078125 }, { "epoch": 0.6810361874629227, "step": 6888, "train/total_loss": 0.139982670545578 }, { "entropy": 9.096343994140625, "epoch": 0.6811350603124382, "mean_token_accuracy": 0.7919555902481079, "num_tokens": 15037871.0, "step": 6889, "train/ce_loss": 1.001779556274414 }, { "epoch": 0.6811350603124382, "step": 6889, "train/sim_loss": 0.07421875 }, { "epoch": 0.6811350603124382, "step": 6889, "train/total_loss": 0.17439670860767365 }, { "entropy": 8.990443229675293, "epoch": 0.6812339331619537, "mean_token_accuracy": 0.7565789222717285, "num_tokens": 15043033.0, "step": 6890, "train/ce_loss": 1.350875973701477 }, { "epoch": 0.6812339331619537, "step": 6890, "train/sim_loss": 0.0625 }, { "epoch": 0.6812339331619537, "step": 6890, "train/total_loss": 0.19758759438991547 }, { "entropy": 9.394641876220703, "epoch": 0.6813328060114693, "mean_token_accuracy": 0.8042226433753967, "num_tokens": 15048019.0, "step": 6891, "train/ce_loss": 0.8413051962852478 }, { "epoch": 0.6813328060114693, "step": 6891, "train/sim_loss": 0.0625 }, { "epoch": 0.6813328060114693, "step": 6891, "train/total_loss": 0.14663052558898926 }, { "entropy": 8.438233375549316, "epoch": 0.6814316788609848, "mean_token_accuracy": 0.7343283295631409, "num_tokens": 15053504.0, "step": 6892, "train/ce_loss": 0.39659619331359863 }, { "epoch": 0.6814316788609848, "step": 6892, "train/sim_loss": 0.0234375 }, { "epoch": 0.6814316788609848, "step": 6892, "train/total_loss": 0.06309711933135986 }, { "entropy": 8.297380447387695, "epoch": 0.6815305517105003, "mean_token_accuracy": 0.7411873936653137, "num_tokens": 15059003.0, "step": 6893, "train/ce_loss": 0.9297574758529663 }, { "epoch": 0.6815305517105003, "step": 6893, "train/sim_loss": 0.05078125 }, { "epoch": 0.6815305517105003, "step": 6893, "train/total_loss": 0.14375700056552887 }, { "entropy": 9.346956253051758, "epoch": 0.6816294245600159, "mean_token_accuracy": 0.7327731251716614, "num_tokens": 15064046.0, "step": 6894, "train/ce_loss": 0.6615893840789795 }, { "epoch": 0.6816294245600159, "step": 6894, "train/sim_loss": 0.046875 }, { "epoch": 0.6816294245600159, "step": 6894, "train/total_loss": 0.11303394287824631 }, { "entropy": 8.94331169128418, "epoch": 0.6817282974095313, "mean_token_accuracy": 0.7450722455978394, "num_tokens": 15069294.0, "step": 6895, "train/ce_loss": 0.8234104514122009 }, { "epoch": 0.6817282974095313, "step": 6895, "train/sim_loss": 0.1015625 }, { "epoch": 0.6817282974095313, "step": 6895, "train/total_loss": 0.1839035451412201 }, { "entropy": 8.835611343383789, "epoch": 0.6818271702590468, "mean_token_accuracy": 0.7842261791229248, "num_tokens": 15074427.0, "step": 6896, "train/ce_loss": 0.6595916748046875 }, { "epoch": 0.6818271702590468, "step": 6896, "train/sim_loss": 0.01953125 }, { "epoch": 0.6818271702590468, "step": 6896, "train/total_loss": 0.08549042046070099 }, { "entropy": 9.11497974395752, "epoch": 0.6819260431085624, "mean_token_accuracy": 0.7549406886100769, "num_tokens": 15079369.0, "step": 6897, "train/ce_loss": 0.7551848292350769 }, { "epoch": 0.6819260431085624, "step": 6897, "train/sim_loss": 0.03125 }, { "epoch": 0.6819260431085624, "step": 6897, "train/total_loss": 0.10676848143339157 }, { "entropy": 8.623621940612793, "epoch": 0.6820249159580779, "mean_token_accuracy": 0.7431507110595703, "num_tokens": 15084689.0, "step": 6898, "train/ce_loss": 0.6764382719993591 }, { "epoch": 0.6820249159580779, "step": 6898, "train/sim_loss": 0.01953125 }, { "epoch": 0.6820249159580779, "step": 6898, "train/total_loss": 0.08717507869005203 }, { "entropy": 8.41831111907959, "epoch": 0.6821237888075934, "mean_token_accuracy": 0.7379958033561707, "num_tokens": 15090114.0, "step": 6899, "train/ce_loss": 0.6716561913490295 }, { "epoch": 0.6821237888075934, "step": 6899, "train/sim_loss": 0.046875 }, { "epoch": 0.6821237888075934, "step": 6899, "train/total_loss": 0.11404062062501907 }, { "epoch": 0.682222661657109, "grad_norm": 0.6683655381202698, "learning_rate": 8.296741334124513e-06, "loss": 0.1307, "step": 6900 }, { "entropy": 8.801922798156738, "epoch": 0.682222661657109, "mean_token_accuracy": 0.7189384698867798, "num_tokens": 15095403.0, "step": 6900, "train/ce_loss": 0.71654212474823 }, { "epoch": 0.682222661657109, "step": 6900, "train/sim_loss": 0.01953125 }, { "epoch": 0.682222661657109, "step": 6900, "train/total_loss": 0.09118546545505524 }, { "entropy": 8.632822036743164, "epoch": 0.6823215345066245, "mean_token_accuracy": 0.8171206116676331, "num_tokens": 15100588.0, "step": 6901, "train/ce_loss": 0.5940370559692383 }, { "epoch": 0.6823215345066245, "step": 6901, "train/sim_loss": 0.0859375 }, { "epoch": 0.6823215345066245, "step": 6901, "train/total_loss": 0.1453412026166916 }, { "entropy": 8.848738670349121, "epoch": 0.68242040735614, "mean_token_accuracy": 0.6931540369987488, "num_tokens": 15105866.0, "step": 6902, "train/ce_loss": 0.3797646164894104 }, { "epoch": 0.68242040735614, "step": 6902, "train/sim_loss": 0.03515625 }, { "epoch": 0.68242040735614, "step": 6902, "train/total_loss": 0.0731327086687088 }, { "entropy": 8.843427658081055, "epoch": 0.6825192802056556, "mean_token_accuracy": 0.7657067775726318, "num_tokens": 15111116.0, "step": 6903, "train/ce_loss": 1.0031598806381226 }, { "epoch": 0.6825192802056556, "step": 6903, "train/sim_loss": 0.05859375 }, { "epoch": 0.6825192802056556, "step": 6903, "train/total_loss": 0.15890973806381226 }, { "entropy": 8.921822547912598, "epoch": 0.682618153055171, "mean_token_accuracy": 0.6976743936538696, "num_tokens": 15116235.0, "step": 6904, "train/ce_loss": 1.6693660020828247 }, { "epoch": 0.682618153055171, "step": 6904, "train/sim_loss": 0.05859375 }, { "epoch": 0.682618153055171, "step": 6904, "train/total_loss": 0.22553035616874695 }, { "entropy": 9.12919807434082, "epoch": 0.6827170259046865, "mean_token_accuracy": 0.7301255464553833, "num_tokens": 15121201.0, "step": 6905, "train/ce_loss": 2.707676410675049 }, { "epoch": 0.6827170259046865, "step": 6905, "train/sim_loss": 0.0703125 }, { "epoch": 0.6827170259046865, "step": 6905, "train/total_loss": 0.3410801589488983 }, { "entropy": 8.956316947937012, "epoch": 0.6828158987542021, "mean_token_accuracy": 0.7127799987792969, "num_tokens": 15126365.0, "step": 6906, "train/ce_loss": 1.3406143188476562 }, { "epoch": 0.6828158987542021, "step": 6906, "train/sim_loss": 0.046875 }, { "epoch": 0.6828158987542021, "step": 6906, "train/total_loss": 0.18093644082546234 }, { "entropy": 9.401857376098633, "epoch": 0.6829147716037176, "mean_token_accuracy": 0.7478448152542114, "num_tokens": 15131228.0, "step": 6907, "train/ce_loss": 1.821577279770281e-05 }, { "epoch": 0.6829147716037176, "step": 6907, "train/sim_loss": 0.0390625 }, { "epoch": 0.6829147716037176, "step": 6907, "train/total_loss": 0.03906432166695595 }, { "entropy": 9.001886367797852, "epoch": 0.6830136444532331, "mean_token_accuracy": 0.7763496041297913, "num_tokens": 15136434.0, "step": 6908, "train/ce_loss": 2.1235814529063646e-06 }, { "epoch": 0.6830136444532331, "step": 6908, "train/sim_loss": 0.05078125 }, { "epoch": 0.6830136444532331, "step": 6908, "train/total_loss": 0.05078146234154701 }, { "entropy": 8.753898620605469, "epoch": 0.6831125173027487, "mean_token_accuracy": 0.743030309677124, "num_tokens": 15141705.0, "step": 6909, "train/ce_loss": 0.5286942720413208 }, { "epoch": 0.6831125173027487, "step": 6909, "train/sim_loss": 0.0234375 }, { "epoch": 0.6831125173027487, "step": 6909, "train/total_loss": 0.07630692422389984 }, { "entropy": 9.249065399169922, "epoch": 0.6832113901522642, "mean_token_accuracy": 0.7991543412208557, "num_tokens": 15146602.0, "step": 6910, "train/ce_loss": 9.989611498895101e-06 }, { "epoch": 0.6832113901522642, "step": 6910, "train/sim_loss": 0.03125 }, { "epoch": 0.6832113901522642, "step": 6910, "train/total_loss": 0.03125099837779999 }, { "entropy": 9.217145919799805, "epoch": 0.6833102630017797, "mean_token_accuracy": 0.7015151381492615, "num_tokens": 15151695.0, "step": 6911, "train/ce_loss": 1.6542013883590698 }, { "epoch": 0.6833102630017797, "step": 6911, "train/sim_loss": 0.03125 }, { "epoch": 0.6833102630017797, "step": 6911, "train/total_loss": 0.19667014479637146 }, { "entropy": 8.901766777038574, "epoch": 0.6834091358512953, "mean_token_accuracy": 0.7896774411201477, "num_tokens": 15157100.0, "step": 6912, "train/ce_loss": 0.6305601596832275 }, { "epoch": 0.6834091358512953, "step": 6912, "train/sim_loss": 0.0546875 }, { "epoch": 0.6834091358512953, "step": 6912, "train/total_loss": 0.11774351447820663 }, { "entropy": 8.767526626586914, "epoch": 0.6835080087008107, "mean_token_accuracy": 0.7545564770698547, "num_tokens": 15162419.0, "step": 6913, "train/ce_loss": 0.6704851984977722 }, { "epoch": 0.6835080087008107, "step": 6913, "train/sim_loss": 0.0546875 }, { "epoch": 0.6835080087008107, "step": 6913, "train/total_loss": 0.12173601984977722 }, { "entropy": 8.950695037841797, "epoch": 0.6836068815503262, "mean_token_accuracy": 0.7896138429641724, "num_tokens": 15167594.0, "step": 6914, "train/ce_loss": 0.7056196331977844 }, { "epoch": 0.6836068815503262, "step": 6914, "train/sim_loss": 0.0390625 }, { "epoch": 0.6836068815503262, "step": 6914, "train/total_loss": 0.1096244677901268 }, { "entropy": 8.735855102539062, "epoch": 0.6837057543998418, "mean_token_accuracy": 0.7021013498306274, "num_tokens": 15172778.0, "step": 6915, "train/ce_loss": 1.1340030431747437 }, { "epoch": 0.6837057543998418, "step": 6915, "train/sim_loss": 0.07421875 }, { "epoch": 0.6837057543998418, "step": 6915, "train/total_loss": 0.18761906027793884 }, { "entropy": 9.025125503540039, "epoch": 0.6838046272493573, "mean_token_accuracy": 0.7478134036064148, "num_tokens": 15177895.0, "step": 6916, "train/ce_loss": 1.2451808452606201 }, { "epoch": 0.6838046272493573, "step": 6916, "train/sim_loss": 0.05859375 }, { "epoch": 0.6838046272493573, "step": 6916, "train/total_loss": 0.18311184644699097 }, { "entropy": 9.271974563598633, "epoch": 0.6839035000988728, "mean_token_accuracy": 0.7707641124725342, "num_tokens": 15182925.0, "step": 6917, "train/ce_loss": 0.7653732299804688 }, { "epoch": 0.6839035000988728, "step": 6917, "train/sim_loss": 0.05078125 }, { "epoch": 0.6839035000988728, "step": 6917, "train/total_loss": 0.1273185759782791 }, { "entropy": 9.092538833618164, "epoch": 0.6840023729483884, "mean_token_accuracy": 0.7117552161216736, "num_tokens": 15187949.0, "step": 6918, "train/ce_loss": 2.0715394839498913e-06 }, { "epoch": 0.6840023729483884, "step": 6918, "train/sim_loss": 0.03515625 }, { "epoch": 0.6840023729483884, "step": 6918, "train/total_loss": 0.035156458616256714 }, { "entropy": 8.716814994812012, "epoch": 0.6841012457979039, "mean_token_accuracy": 0.7954545617103577, "num_tokens": 15193317.0, "step": 6919, "train/ce_loss": 0.5797393321990967 }, { "epoch": 0.6841012457979039, "step": 6919, "train/sim_loss": 0.06640625 }, { "epoch": 0.6841012457979039, "step": 6919, "train/total_loss": 0.1243801862001419 }, { "epoch": 0.6842001186474194, "grad_norm": 0.5404684543609619, "learning_rate": 8.291796469366563e-06, "loss": 0.1353, "step": 6920 }, { "entropy": 9.481237411499023, "epoch": 0.6842001186474194, "mean_token_accuracy": 0.7302325367927551, "num_tokens": 15198166.0, "step": 6920, "train/ce_loss": 1.2219781875610352 }, { "epoch": 0.6842001186474194, "step": 6920, "train/sim_loss": 0.08203125 }, { "epoch": 0.6842001186474194, "step": 6920, "train/total_loss": 0.20422907173633575 }, { "entropy": 9.129714965820312, "epoch": 0.684298991496935, "mean_token_accuracy": 0.7300613522529602, "num_tokens": 15203304.0, "step": 6921, "train/ce_loss": 4.9342497732141055e-06 }, { "epoch": 0.684298991496935, "step": 6921, "train/sim_loss": 0.03125 }, { "epoch": 0.684298991496935, "step": 6921, "train/total_loss": 0.0312504917383194 }, { "entropy": 8.877391815185547, "epoch": 0.6843978643464504, "mean_token_accuracy": 0.727148711681366, "num_tokens": 15208470.0, "step": 6922, "train/ce_loss": 0.9647364020347595 }, { "epoch": 0.6843978643464504, "step": 6922, "train/sim_loss": 0.08203125 }, { "epoch": 0.6843978643464504, "step": 6922, "train/total_loss": 0.17850488424301147 }, { "entropy": 9.041803359985352, "epoch": 0.684496737195966, "mean_token_accuracy": 0.7361769080162048, "num_tokens": 15213600.0, "step": 6923, "train/ce_loss": 0.6440777778625488 }, { "epoch": 0.684496737195966, "step": 6923, "train/sim_loss": 0.0625 }, { "epoch": 0.684496737195966, "step": 6923, "train/total_loss": 0.12690778076648712 }, { "entropy": 8.622148513793945, "epoch": 0.6845956100454815, "mean_token_accuracy": 0.7507886290550232, "num_tokens": 15218999.0, "step": 6924, "train/ce_loss": 0.5754372477531433 }, { "epoch": 0.6845956100454815, "step": 6924, "train/sim_loss": 0.01953125 }, { "epoch": 0.6845956100454815, "step": 6924, "train/total_loss": 0.07707497477531433 }, { "entropy": 8.552651405334473, "epoch": 0.684694482894997, "mean_token_accuracy": 0.7321226000785828, "num_tokens": 15224346.0, "step": 6925, "train/ce_loss": 0.9482977390289307 }, { "epoch": 0.684694482894997, "step": 6925, "train/sim_loss": 0.03515625 }, { "epoch": 0.684694482894997, "step": 6925, "train/total_loss": 0.1299860179424286 }, { "entropy": 8.672773361206055, "epoch": 0.6847933557445126, "mean_token_accuracy": 0.7335600852966309, "num_tokens": 15229690.0, "step": 6926, "train/ce_loss": 0.8891081213951111 }, { "epoch": 0.6847933557445126, "step": 6926, "train/sim_loss": 0.0703125 }, { "epoch": 0.6847933557445126, "step": 6926, "train/total_loss": 0.15922331809997559 }, { "entropy": 9.116071701049805, "epoch": 0.6848922285940281, "mean_token_accuracy": 0.7018927335739136, "num_tokens": 15234745.0, "step": 6927, "train/ce_loss": 1.9434056282043457 }, { "epoch": 0.6848922285940281, "step": 6927, "train/sim_loss": 0.0546875 }, { "epoch": 0.6848922285940281, "step": 6927, "train/total_loss": 0.2490280717611313 }, { "entropy": 8.96804428100586, "epoch": 0.6849911014435436, "mean_token_accuracy": 0.6753424406051636, "num_tokens": 15239930.0, "step": 6928, "train/ce_loss": 1.9519343376159668 }, { "epoch": 0.6849911014435436, "step": 6928, "train/sim_loss": 0.0625 }, { "epoch": 0.6849911014435436, "step": 6928, "train/total_loss": 0.25769343972206116 }, { "entropy": 9.231451034545898, "epoch": 0.6850899742930592, "mean_token_accuracy": 0.70216304063797, "num_tokens": 15244839.0, "step": 6929, "train/ce_loss": 1.6754367351531982 }, { "epoch": 0.6850899742930592, "step": 6929, "train/sim_loss": 0.05859375 }, { "epoch": 0.6850899742930592, "step": 6929, "train/total_loss": 0.2261374294757843 }, { "entropy": 8.91139030456543, "epoch": 0.6851888471425747, "mean_token_accuracy": 0.7185473442077637, "num_tokens": 15250010.0, "step": 6930, "train/ce_loss": 2.334519194846507e-06 }, { "epoch": 0.6851888471425747, "step": 6930, "train/sim_loss": 0.015625 }, { "epoch": 0.6851888471425747, "step": 6930, "train/total_loss": 0.015625232830643654 }, { "entropy": 8.973052978515625, "epoch": 0.6852877199920901, "mean_token_accuracy": 0.7477242946624756, "num_tokens": 15255302.0, "step": 6931, "train/ce_loss": 0.5091356039047241 }, { "epoch": 0.6852877199920901, "step": 6931, "train/sim_loss": 0.0625 }, { "epoch": 0.6852877199920901, "step": 6931, "train/total_loss": 0.11341355741024017 }, { "entropy": 8.891918182373047, "epoch": 0.6853865928416057, "mean_token_accuracy": 0.7184466123580933, "num_tokens": 15260457.0, "step": 6932, "train/ce_loss": 0.8679772615432739 }, { "epoch": 0.6853865928416057, "step": 6932, "train/sim_loss": 0.0703125 }, { "epoch": 0.6853865928416057, "step": 6932, "train/total_loss": 0.15711022913455963 }, { "entropy": 9.115850448608398, "epoch": 0.6854854656911212, "mean_token_accuracy": 0.675000011920929, "num_tokens": 15265470.0, "step": 6933, "train/ce_loss": 2.707383632659912 }, { "epoch": 0.6854854656911212, "step": 6933, "train/sim_loss": 0.16796875 }, { "epoch": 0.6854854656911212, "step": 6933, "train/total_loss": 0.4387071132659912 }, { "entropy": 8.753941535949707, "epoch": 0.6855843385406367, "mean_token_accuracy": 0.7397727370262146, "num_tokens": 15270798.0, "step": 6934, "train/ce_loss": 0.8424127697944641 }, { "epoch": 0.6855843385406367, "step": 6934, "train/sim_loss": 0.04296875 }, { "epoch": 0.6855843385406367, "step": 6934, "train/total_loss": 0.12721002101898193 }, { "entropy": 8.697265625, "epoch": 0.6856832113901523, "mean_token_accuracy": 0.6972677707672119, "num_tokens": 15276176.0, "step": 6935, "train/ce_loss": 0.8581960797309875 }, { "epoch": 0.6856832113901523, "step": 6935, "train/sim_loss": 0.03125 }, { "epoch": 0.6856832113901523, "step": 6935, "train/total_loss": 0.11706960946321487 }, { "entropy": 9.301244735717773, "epoch": 0.6857820842396678, "mean_token_accuracy": 0.8045454621315002, "num_tokens": 15281233.0, "step": 6936, "train/ce_loss": 0.6674541234970093 }, { "epoch": 0.6857820842396678, "step": 6936, "train/sim_loss": 0.08984375 }, { "epoch": 0.6857820842396678, "step": 6936, "train/total_loss": 0.15658916532993317 }, { "entropy": 9.335037231445312, "epoch": 0.6858809570891833, "mean_token_accuracy": 0.7811158895492554, "num_tokens": 15286143.0, "step": 6937, "train/ce_loss": 2.9457798973453464e-06 }, { "epoch": 0.6858809570891833, "step": 6937, "train/sim_loss": 0.046875 }, { "epoch": 0.6858809570891833, "step": 6937, "train/total_loss": 0.04687529429793358 }, { "entropy": 9.344409942626953, "epoch": 0.6859798299386989, "mean_token_accuracy": 0.751724123954773, "num_tokens": 15291190.0, "step": 6938, "train/ce_loss": 1.1120717525482178 }, { "epoch": 0.6859798299386989, "step": 6938, "train/sim_loss": 0.08203125 }, { "epoch": 0.6859798299386989, "step": 6938, "train/total_loss": 0.19323843717575073 }, { "entropy": 8.513089179992676, "epoch": 0.6860787027882144, "mean_token_accuracy": 0.702531635761261, "num_tokens": 15296660.0, "step": 6939, "train/ce_loss": 0.7875324487686157 }, { "epoch": 0.6860787027882144, "step": 6939, "train/sim_loss": 0.04296875 }, { "epoch": 0.6860787027882144, "step": 6939, "train/total_loss": 0.12172199785709381 }, { "epoch": 0.6861775756377299, "grad_norm": 0.817496657371521, "learning_rate": 8.286851604608614e-06, "loss": 0.1428, "step": 6940 }, { "entropy": 8.629898071289062, "epoch": 0.6861775756377299, "mean_token_accuracy": 0.7637028098106384, "num_tokens": 15301930.0, "step": 6940, "train/ce_loss": 1.129717230796814 }, { "epoch": 0.6861775756377299, "step": 6940, "train/sim_loss": 0.02734375 }, { "epoch": 0.6861775756377299, "step": 6940, "train/total_loss": 0.1403154730796814 }, { "entropy": 8.15090560913086, "epoch": 0.6862764484872454, "mean_token_accuracy": 0.7578058838844299, "num_tokens": 15307624.0, "step": 6941, "train/ce_loss": 0.6505463719367981 }, { "epoch": 0.6862764484872454, "step": 6941, "train/sim_loss": 0.015625 }, { "epoch": 0.6862764484872454, "step": 6941, "train/total_loss": 0.08067964017391205 }, { "entropy": 8.585698127746582, "epoch": 0.6863753213367609, "mean_token_accuracy": 0.7148289084434509, "num_tokens": 15313145.0, "step": 6942, "train/ce_loss": 0.7779737114906311 }, { "epoch": 0.6863753213367609, "step": 6942, "train/sim_loss": 0.0390625 }, { "epoch": 0.6863753213367609, "step": 6942, "train/total_loss": 0.11685987561941147 }, { "entropy": 8.508909225463867, "epoch": 0.6864741941862764, "mean_token_accuracy": 0.7624728679656982, "num_tokens": 15318585.0, "step": 6943, "train/ce_loss": 0.967738687992096 }, { "epoch": 0.6864741941862764, "step": 6943, "train/sim_loss": 0.1484375 }, { "epoch": 0.6864741941862764, "step": 6943, "train/total_loss": 0.24521136283874512 }, { "entropy": 8.761722564697266, "epoch": 0.686573067035792, "mean_token_accuracy": 0.7230955362319946, "num_tokens": 15323892.0, "step": 6944, "train/ce_loss": 0.7384951114654541 }, { "epoch": 0.686573067035792, "step": 6944, "train/sim_loss": 0.03125 }, { "epoch": 0.686573067035792, "step": 6944, "train/total_loss": 0.10509951412677765 }, { "entropy": 8.465885162353516, "epoch": 0.6866719398853075, "mean_token_accuracy": 0.7373637557029724, "num_tokens": 15329350.0, "step": 6945, "train/ce_loss": 0.5195974111557007 }, { "epoch": 0.6866719398853075, "step": 6945, "train/sim_loss": 0.01953125 }, { "epoch": 0.6866719398853075, "step": 6945, "train/total_loss": 0.07149098813533783 }, { "entropy": 8.980575561523438, "epoch": 0.686770812734823, "mean_token_accuracy": 0.804964542388916, "num_tokens": 15334364.0, "step": 6946, "train/ce_loss": 0.8190268278121948 }, { "epoch": 0.686770812734823, "step": 6946, "train/sim_loss": 0.06640625 }, { "epoch": 0.686770812734823, "step": 6946, "train/total_loss": 0.14830893278121948 }, { "entropy": 8.666390419006348, "epoch": 0.6868696855843386, "mean_token_accuracy": 0.7522580623626709, "num_tokens": 15339643.0, "step": 6947, "train/ce_loss": 1.0155224800109863 }, { "epoch": 0.6868696855843386, "step": 6947, "train/sim_loss": 0.05078125 }, { "epoch": 0.6868696855843386, "step": 6947, "train/total_loss": 0.15233349800109863 }, { "entropy": 8.394342422485352, "epoch": 0.6869685584338541, "mean_token_accuracy": 0.7468926310539246, "num_tokens": 15345031.0, "step": 6948, "train/ce_loss": 0.4751631021499634 }, { "epoch": 0.6869685584338541, "step": 6948, "train/sim_loss": 0.0859375 }, { "epoch": 0.6869685584338541, "step": 6948, "train/total_loss": 0.13345381617546082 }, { "entropy": 8.674410820007324, "epoch": 0.6870674312833696, "mean_token_accuracy": 0.7519466280937195, "num_tokens": 15350339.0, "step": 6949, "train/ce_loss": 0.7661105394363403 }, { "epoch": 0.6870674312833696, "step": 6949, "train/sim_loss": 0.05859375 }, { "epoch": 0.6870674312833696, "step": 6949, "train/total_loss": 0.13520480692386627 }, { "entropy": 8.750062942504883, "epoch": 0.6871663041328852, "mean_token_accuracy": 0.7936893105506897, "num_tokens": 15355609.0, "step": 6950, "train/ce_loss": 0.45663222670555115 }, { "epoch": 0.6871663041328852, "step": 6950, "train/sim_loss": 0.04296875 }, { "epoch": 0.6871663041328852, "step": 6950, "train/total_loss": 0.08863197267055511 }, { "entropy": 8.613815307617188, "epoch": 0.6872651769824006, "mean_token_accuracy": 0.7497291564941406, "num_tokens": 15361043.0, "step": 6951, "train/ce_loss": 0.8819339871406555 }, { "epoch": 0.6872651769824006, "step": 6951, "train/sim_loss": 0.078125 }, { "epoch": 0.6872651769824006, "step": 6951, "train/total_loss": 0.1663184016942978 }, { "entropy": 8.531137466430664, "epoch": 0.6873640498319161, "mean_token_accuracy": 0.7067415714263916, "num_tokens": 15366352.0, "step": 6952, "train/ce_loss": 0.9505997896194458 }, { "epoch": 0.6873640498319161, "step": 6952, "train/sim_loss": 0.0390625 }, { "epoch": 0.6873640498319161, "step": 6952, "train/total_loss": 0.13412249088287354 }, { "entropy": 8.574007034301758, "epoch": 0.6874629226814317, "mean_token_accuracy": 0.7746614813804626, "num_tokens": 15372090.0, "step": 6953, "train/ce_loss": 0.6122711300849915 }, { "epoch": 0.6874629226814317, "step": 6953, "train/sim_loss": 0.0703125 }, { "epoch": 0.6874629226814317, "step": 6953, "train/total_loss": 0.13153961300849915 }, { "entropy": 9.415061950683594, "epoch": 0.6875617955309472, "mean_token_accuracy": 0.7694117426872253, "num_tokens": 15377107.0, "step": 6954, "train/ce_loss": 3.785308081205585e-06 }, { "epoch": 0.6875617955309472, "step": 6954, "train/sim_loss": 0.04296875 }, { "epoch": 0.6875617955309472, "step": 6954, "train/total_loss": 0.04296912997961044 }, { "entropy": 9.409917831420898, "epoch": 0.6876606683804627, "mean_token_accuracy": 0.7870967984199524, "num_tokens": 15381964.0, "step": 6955, "train/ce_loss": 3.528351726345136e-06 }, { "epoch": 0.6876606683804627, "step": 6955, "train/sim_loss": 0.04296875 }, { "epoch": 0.6876606683804627, "step": 6955, "train/total_loss": 0.042969103902578354 }, { "entropy": 9.558332443237305, "epoch": 0.6877595412299783, "mean_token_accuracy": 0.703797459602356, "num_tokens": 15386747.0, "step": 6956, "train/ce_loss": 2.1576952934265137 }, { "epoch": 0.6877595412299783, "step": 6956, "train/sim_loss": 0.03515625 }, { "epoch": 0.6877595412299783, "step": 6956, "train/total_loss": 0.25092577934265137 }, { "entropy": 8.30762767791748, "epoch": 0.6878584140794938, "mean_token_accuracy": 0.7465091347694397, "num_tokens": 15392211.0, "step": 6957, "train/ce_loss": 0.5005062222480774 }, { "epoch": 0.6878584140794938, "step": 6957, "train/sim_loss": 0.109375 }, { "epoch": 0.6878584140794938, "step": 6957, "train/total_loss": 0.15942561626434326 }, { "entropy": 8.810013771057129, "epoch": 0.6879572869290093, "mean_token_accuracy": 0.7215189933776855, "num_tokens": 15397392.0, "step": 6958, "train/ce_loss": 1.6074625253677368 }, { "epoch": 0.6879572869290093, "step": 6958, "train/sim_loss": 0.05078125 }, { "epoch": 0.6879572869290093, "step": 6958, "train/total_loss": 0.2115275114774704 }, { "entropy": 8.816883087158203, "epoch": 0.6880561597785249, "mean_token_accuracy": 0.8195187449455261, "num_tokens": 15402706.0, "step": 6959, "train/ce_loss": 0.6443217992782593 }, { "epoch": 0.6880561597785249, "step": 6959, "train/sim_loss": 0.015625 }, { "epoch": 0.6880561597785249, "step": 6959, "train/total_loss": 0.08005718141794205 }, { "epoch": 0.6881550326280403, "grad_norm": 0.6103768944740295, "learning_rate": 8.281906739850666e-06, "loss": 0.1311, "step": 6960 }, { "entropy": 9.008502960205078, "epoch": 0.6881550326280403, "mean_token_accuracy": 0.709269642829895, "num_tokens": 15407864.0, "step": 6960, "train/ce_loss": 0.8353558778762817 }, { "epoch": 0.6881550326280403, "step": 6960, "train/sim_loss": 0.0234375 }, { "epoch": 0.6881550326280403, "step": 6960, "train/total_loss": 0.1069730892777443 }, { "entropy": 9.035163879394531, "epoch": 0.6882539054775558, "mean_token_accuracy": 0.7142857313156128, "num_tokens": 15412753.0, "step": 6961, "train/ce_loss": 3.2206226023845375e-06 }, { "epoch": 0.6882539054775558, "step": 6961, "train/sim_loss": 0.04296875 }, { "epoch": 0.6882539054775558, "step": 6961, "train/total_loss": 0.04296907037496567 }, { "entropy": 9.355962753295898, "epoch": 0.6883527783270714, "mean_token_accuracy": 0.742222249507904, "num_tokens": 15417653.0, "step": 6962, "train/ce_loss": 3.1448560093849665e-06 }, { "epoch": 0.6883527783270714, "step": 6962, "train/sim_loss": 0.03515625 }, { "epoch": 0.6883527783270714, "step": 6962, "train/total_loss": 0.03515656292438507 }, { "entropy": 9.3391752243042, "epoch": 0.6884516511765869, "mean_token_accuracy": 0.6926229596138, "num_tokens": 15422612.0, "step": 6963, "train/ce_loss": 2.5738637447357178 }, { "epoch": 0.6884516511765869, "step": 6963, "train/sim_loss": 0.06640625 }, { "epoch": 0.6884516511765869, "step": 6963, "train/total_loss": 0.32379263639450073 }, { "entropy": 9.015137672424316, "epoch": 0.6885505240261024, "mean_token_accuracy": 0.7619718313217163, "num_tokens": 15427819.0, "step": 6964, "train/ce_loss": 2.477358066244051e-06 }, { "epoch": 0.6885505240261024, "step": 6964, "train/sim_loss": 0.01953125 }, { "epoch": 0.6885505240261024, "step": 6964, "train/total_loss": 0.019531497731804848 }, { "entropy": 8.540361404418945, "epoch": 0.688649396875618, "mean_token_accuracy": 0.7202295660972595, "num_tokens": 15432999.0, "step": 6965, "train/ce_loss": 1.135185718536377 }, { "epoch": 0.688649396875618, "step": 6965, "train/sim_loss": 0.03125 }, { "epoch": 0.688649396875618, "step": 6965, "train/total_loss": 0.14476856589317322 }, { "entropy": 8.603002548217773, "epoch": 0.6887482697251335, "mean_token_accuracy": 0.7921653985977173, "num_tokens": 15438421.0, "step": 6966, "train/ce_loss": 0.5192272067070007 }, { "epoch": 0.6887482697251335, "step": 6966, "train/sim_loss": 0.0859375 }, { "epoch": 0.6887482697251335, "step": 6966, "train/total_loss": 0.1378602236509323 }, { "entropy": 8.734756469726562, "epoch": 0.688847142574649, "mean_token_accuracy": 0.7278645634651184, "num_tokens": 15443616.0, "step": 6967, "train/ce_loss": 0.6984837651252747 }, { "epoch": 0.688847142574649, "step": 6967, "train/sim_loss": 0.04296875 }, { "epoch": 0.688847142574649, "step": 6967, "train/total_loss": 0.11281713098287582 }, { "entropy": 8.821735382080078, "epoch": 0.6889460154241646, "mean_token_accuracy": 0.7677664756774902, "num_tokens": 15448912.0, "step": 6968, "train/ce_loss": 0.8653164505958557 }, { "epoch": 0.6889460154241646, "step": 6968, "train/sim_loss": 0.08203125 }, { "epoch": 0.6889460154241646, "step": 6968, "train/total_loss": 0.1685628890991211 }, { "entropy": 8.957123756408691, "epoch": 0.68904488827368, "mean_token_accuracy": 0.6975903511047363, "num_tokens": 15454188.0, "step": 6969, "train/ce_loss": 3.484945636955672e-06 }, { "epoch": 0.68904488827368, "step": 6969, "train/sim_loss": 0.08203125 }, { "epoch": 0.68904488827368, "step": 6969, "train/total_loss": 0.08203160017728806 }, { "entropy": 8.231283187866211, "epoch": 0.6891437611231955, "mean_token_accuracy": 0.7591313123703003, "num_tokens": 15459651.0, "step": 6970, "train/ce_loss": 0.5685218572616577 }, { "epoch": 0.6891437611231955, "step": 6970, "train/sim_loss": 0.015625 }, { "epoch": 0.6891437611231955, "step": 6970, "train/total_loss": 0.07247719168663025 }, { "entropy": 8.5908203125, "epoch": 0.6892426339727111, "mean_token_accuracy": 0.7651006579399109, "num_tokens": 15465033.0, "step": 6971, "train/ce_loss": 1.4566595554351807 }, { "epoch": 0.6892426339727111, "step": 6971, "train/sim_loss": 0.0703125 }, { "epoch": 0.6892426339727111, "step": 6971, "train/total_loss": 0.2159784585237503 }, { "entropy": 8.771675109863281, "epoch": 0.6893415068222266, "mean_token_accuracy": 0.7152230739593506, "num_tokens": 15470295.0, "step": 6972, "train/ce_loss": 0.8336161375045776 }, { "epoch": 0.6893415068222266, "step": 6972, "train/sim_loss": 0.0234375 }, { "epoch": 0.6893415068222266, "step": 6972, "train/total_loss": 0.10679911822080612 }, { "entropy": 9.065987586975098, "epoch": 0.6894403796717421, "mean_token_accuracy": 0.7348178029060364, "num_tokens": 15475175.0, "step": 6973, "train/ce_loss": 1.7554562091827393 }, { "epoch": 0.6894403796717421, "step": 6973, "train/sim_loss": 0.06640625 }, { "epoch": 0.6894403796717421, "step": 6973, "train/total_loss": 0.2419518679380417 }, { "entropy": 8.591651916503906, "epoch": 0.6895392525212577, "mean_token_accuracy": 0.7198124527931213, "num_tokens": 15480486.0, "step": 6974, "train/ce_loss": 0.9288971424102783 }, { "epoch": 0.6895392525212577, "step": 6974, "train/sim_loss": 0.1015625 }, { "epoch": 0.6895392525212577, "step": 6974, "train/total_loss": 0.1944522261619568 }, { "entropy": 8.860366821289062, "epoch": 0.6896381253707732, "mean_token_accuracy": 0.6860730648040771, "num_tokens": 15486018.0, "step": 6975, "train/ce_loss": 1.2888222932815552 }, { "epoch": 0.6896381253707732, "step": 6975, "train/sim_loss": 0.12890625 }, { "epoch": 0.6896381253707732, "step": 6975, "train/total_loss": 0.2577884793281555 }, { "entropy": 9.116162300109863, "epoch": 0.6897369982202887, "mean_token_accuracy": 0.7212543487548828, "num_tokens": 15491020.0, "step": 6976, "train/ce_loss": 1.1505359411239624 }, { "epoch": 0.6897369982202887, "step": 6976, "train/sim_loss": 0.03515625 }, { "epoch": 0.6897369982202887, "step": 6976, "train/total_loss": 0.15020984411239624 }, { "entropy": 8.815119743347168, "epoch": 0.6898358710698043, "mean_token_accuracy": 0.781862735748291, "num_tokens": 15496330.0, "step": 6977, "train/ce_loss": 0.4676681160926819 }, { "epoch": 0.6898358710698043, "step": 6977, "train/sim_loss": 0.015625 }, { "epoch": 0.6898358710698043, "step": 6977, "train/total_loss": 0.06239181384444237 }, { "entropy": 8.83273696899414, "epoch": 0.6899347439193197, "mean_token_accuracy": 0.7268408536911011, "num_tokens": 15501593.0, "step": 6978, "train/ce_loss": 1.1178123950958252 }, { "epoch": 0.6899347439193197, "step": 6978, "train/sim_loss": 0.0625 }, { "epoch": 0.6899347439193197, "step": 6978, "train/total_loss": 0.17428123950958252 }, { "entropy": 8.750473022460938, "epoch": 0.6900336167688352, "mean_token_accuracy": 0.7281213402748108, "num_tokens": 15506965.0, "step": 6979, "train/ce_loss": 0.5950695276260376 }, { "epoch": 0.6900336167688352, "step": 6979, "train/sim_loss": 0.04296875 }, { "epoch": 0.6900336167688352, "step": 6979, "train/total_loss": 0.10247570276260376 }, { "epoch": 0.6901324896183508, "grad_norm": 0.6545323133468628, "learning_rate": 8.276961875092717e-06, "loss": 0.1403, "step": 6980 }, { "entropy": 8.452592849731445, "epoch": 0.6901324896183508, "mean_token_accuracy": 0.7842170000076294, "num_tokens": 15512283.0, "step": 6980, "train/ce_loss": 0.5714089274406433 }, { "epoch": 0.6901324896183508, "step": 6980, "train/sim_loss": 0.05859375 }, { "epoch": 0.6901324896183508, "step": 6980, "train/total_loss": 0.11573464423418045 }, { "entropy": 8.647806167602539, "epoch": 0.6902313624678663, "mean_token_accuracy": 0.7115628719329834, "num_tokens": 15517574.0, "step": 6981, "train/ce_loss": 0.7578251361846924 }, { "epoch": 0.6902313624678663, "step": 6981, "train/sim_loss": 0.05859375 }, { "epoch": 0.6902313624678663, "step": 6981, "train/total_loss": 0.13437625765800476 }, { "entropy": 8.560383796691895, "epoch": 0.6903302353173818, "mean_token_accuracy": 0.7291220426559448, "num_tokens": 15522949.0, "step": 6982, "train/ce_loss": 0.8393318057060242 }, { "epoch": 0.6903302353173818, "step": 6982, "train/sim_loss": 0.0390625 }, { "epoch": 0.6903302353173818, "step": 6982, "train/total_loss": 0.12299568206071854 }, { "entropy": 9.18415355682373, "epoch": 0.6904291081668974, "mean_token_accuracy": 0.7688266038894653, "num_tokens": 15527967.0, "step": 6983, "train/ce_loss": 3.3392479963367805e-06 }, { "epoch": 0.6904291081668974, "step": 6983, "train/sim_loss": 0.015625 }, { "epoch": 0.6904291081668974, "step": 6983, "train/total_loss": 0.015625333413481712 }, { "entropy": 8.457925796508789, "epoch": 0.6905279810164129, "mean_token_accuracy": 0.7348242998123169, "num_tokens": 15533315.0, "step": 6984, "train/ce_loss": 0.8995658159255981 }, { "epoch": 0.6905279810164129, "step": 6984, "train/sim_loss": 0.015625 }, { "epoch": 0.6905279810164129, "step": 6984, "train/total_loss": 0.10558158159255981 }, { "entropy": 8.641708374023438, "epoch": 0.6906268538659284, "mean_token_accuracy": 0.8094576001167297, "num_tokens": 15538496.0, "step": 6985, "train/ce_loss": 0.9776160717010498 }, { "epoch": 0.6906268538659284, "step": 6985, "train/sim_loss": 0.05078125 }, { "epoch": 0.6906268538659284, "step": 6985, "train/total_loss": 0.1485428512096405 }, { "entropy": 9.258674621582031, "epoch": 0.690725726715444, "mean_token_accuracy": 0.69786536693573, "num_tokens": 15543563.0, "step": 6986, "train/ce_loss": 2.1139408090675715e-06 }, { "epoch": 0.690725726715444, "step": 6986, "train/sim_loss": 0.015625 }, { "epoch": 0.690725726715444, "step": 6986, "train/total_loss": 0.015625210478901863 }, { "entropy": 9.310544967651367, "epoch": 0.6908245995649595, "mean_token_accuracy": 0.6595237851142883, "num_tokens": 15548412.0, "step": 6987, "train/ce_loss": 3.0720837116241455 }, { "epoch": 0.6908245995649595, "step": 6987, "train/sim_loss": 0.0546875 }, { "epoch": 0.6908245995649595, "step": 6987, "train/total_loss": 0.361895889043808 }, { "entropy": 8.935279846191406, "epoch": 0.6909234724144749, "mean_token_accuracy": 0.7120419144630432, "num_tokens": 15553773.0, "step": 6988, "train/ce_loss": 1.2668113708496094 }, { "epoch": 0.6909234724144749, "step": 6988, "train/sim_loss": 0.10546875 }, { "epoch": 0.6909234724144749, "step": 6988, "train/total_loss": 0.2321498841047287 }, { "entropy": 8.701950073242188, "epoch": 0.6910223452639905, "mean_token_accuracy": 0.6670190095901489, "num_tokens": 15559163.0, "step": 6989, "train/ce_loss": 0.9159974455833435 }, { "epoch": 0.6910223452639905, "step": 6989, "train/sim_loss": 0.078125 }, { "epoch": 0.6910223452639905, "step": 6989, "train/total_loss": 0.1697247475385666 }, { "entropy": 8.946187973022461, "epoch": 0.691121218113506, "mean_token_accuracy": 0.7379518151283264, "num_tokens": 15564289.0, "step": 6990, "train/ce_loss": 1.1444780826568604 }, { "epoch": 0.691121218113506, "step": 6990, "train/sim_loss": 0.06640625 }, { "epoch": 0.691121218113506, "step": 6990, "train/total_loss": 0.18085405230522156 }, { "entropy": 9.32418441772461, "epoch": 0.6912200909630215, "mean_token_accuracy": 0.7272727489471436, "num_tokens": 15569168.0, "step": 6991, "train/ce_loss": 2.120774030685425 }, { "epoch": 0.6912200909630215, "step": 6991, "train/sim_loss": 0.09375 }, { "epoch": 0.6912200909630215, "step": 6991, "train/total_loss": 0.30582740902900696 }, { "entropy": 8.605212211608887, "epoch": 0.6913189638125371, "mean_token_accuracy": 0.7746650576591492, "num_tokens": 15574463.0, "step": 6992, "train/ce_loss": 0.7690951824188232 }, { "epoch": 0.6913189638125371, "step": 6992, "train/sim_loss": 0.0859375 }, { "epoch": 0.6913189638125371, "step": 6992, "train/total_loss": 0.16284701228141785 }, { "entropy": 8.168206214904785, "epoch": 0.6914178366620526, "mean_token_accuracy": 0.7494226098060608, "num_tokens": 15579807.0, "step": 6993, "train/ce_loss": 0.6665575504302979 }, { "epoch": 0.6914178366620526, "step": 6993, "train/sim_loss": 0.03125 }, { "epoch": 0.6914178366620526, "step": 6993, "train/total_loss": 0.09790575504302979 }, { "entropy": 9.690252304077148, "epoch": 0.6915167095115681, "mean_token_accuracy": 0.7416666746139526, "num_tokens": 15584586.0, "step": 6994, "train/ce_loss": 2.0575296878814697 }, { "epoch": 0.6915167095115681, "step": 6994, "train/sim_loss": 0.078125 }, { "epoch": 0.6915167095115681, "step": 6994, "train/total_loss": 0.283877968788147 }, { "entropy": 8.993197441101074, "epoch": 0.6916155823610837, "mean_token_accuracy": 0.746268630027771, "num_tokens": 15589573.0, "step": 6995, "train/ce_loss": 0.8488063812255859 }, { "epoch": 0.6916155823610837, "step": 6995, "train/sim_loss": 0.08203125 }, { "epoch": 0.6916155823610837, "step": 6995, "train/total_loss": 0.16691190004348755 }, { "entropy": 9.218578338623047, "epoch": 0.6917144552105992, "mean_token_accuracy": 0.727707028388977, "num_tokens": 15594633.0, "step": 6996, "train/ce_loss": 1.031732201576233 }, { "epoch": 0.6917144552105992, "step": 6996, "train/sim_loss": 0.04296875 }, { "epoch": 0.6917144552105992, "step": 6996, "train/total_loss": 0.14614197611808777 }, { "entropy": 8.436715126037598, "epoch": 0.6918133280601146, "mean_token_accuracy": 0.7288801670074463, "num_tokens": 15600086.0, "step": 6997, "train/ce_loss": 0.8681232929229736 }, { "epoch": 0.6918133280601146, "step": 6997, "train/sim_loss": 0.04296875 }, { "epoch": 0.6918133280601146, "step": 6997, "train/total_loss": 0.1297810822725296 }, { "entropy": 8.853265762329102, "epoch": 0.6919122009096302, "mean_token_accuracy": 0.7205438017845154, "num_tokens": 15605201.0, "step": 6998, "train/ce_loss": 1.191149353981018 }, { "epoch": 0.6919122009096302, "step": 6998, "train/sim_loss": 0.0546875 }, { "epoch": 0.6919122009096302, "step": 6998, "train/total_loss": 0.1738024353981018 }, { "entropy": 8.89388656616211, "epoch": 0.6920110737591457, "mean_token_accuracy": 0.7541229128837585, "num_tokens": 15610312.0, "step": 6999, "train/ce_loss": 1.0421494245529175 }, { "epoch": 0.6920110737591457, "step": 6999, "train/sim_loss": 0.08203125 }, { "epoch": 0.6920110737591457, "step": 6999, "train/total_loss": 0.18624618649482727 }, { "epoch": 0.6921099466086612, "grad_norm": 0.7092252373695374, "learning_rate": 8.272017010334769e-06, "loss": 0.1383, "step": 7000 }, { "entropy": 9.542508125305176, "epoch": 0.6921099466086612, "mean_token_accuracy": 0.7443609237670898, "num_tokens": 15615112.0, "step": 7000, "train/ce_loss": 5.2732480071426835e-06 }, { "epoch": 0.6921099466086612, "step": 7000, "train/sim_loss": 0.046875 }, { "epoch": 0.6921099466086612, "step": 7000, "train/total_loss": 0.04687552899122238 }, { "entropy": 9.186712265014648, "epoch": 0.6922088194581768, "mean_token_accuracy": 0.7618243098258972, "num_tokens": 15620157.0, "step": 7001, "train/ce_loss": 0.7200567722320557 }, { "epoch": 0.6922088194581768, "step": 7001, "train/sim_loss": 0.0234375 }, { "epoch": 0.6922088194581768, "step": 7001, "train/total_loss": 0.09544318169355392 }, { "entropy": 9.247856140136719, "epoch": 0.6923076923076923, "mean_token_accuracy": 0.7417103052139282, "num_tokens": 15625177.0, "step": 7002, "train/ce_loss": 1.290831446647644 }, { "epoch": 0.6923076923076923, "step": 7002, "train/sim_loss": 0.05078125 }, { "epoch": 0.6923076923076923, "step": 7002, "train/total_loss": 0.17986439168453217 }, { "entropy": 8.667369842529297, "epoch": 0.6924065651572078, "mean_token_accuracy": 0.7571251392364502, "num_tokens": 15630470.0, "step": 7003, "train/ce_loss": 0.7783504724502563 }, { "epoch": 0.6924065651572078, "step": 7003, "train/sim_loss": 0.01953125 }, { "epoch": 0.6924065651572078, "step": 7003, "train/total_loss": 0.09736629575490952 }, { "entropy": 8.759040832519531, "epoch": 0.6925054380067234, "mean_token_accuracy": 0.7424441576004028, "num_tokens": 15635698.0, "step": 7004, "train/ce_loss": 0.7566330432891846 }, { "epoch": 0.6925054380067234, "step": 7004, "train/sim_loss": 0.0625 }, { "epoch": 0.6925054380067234, "step": 7004, "train/total_loss": 0.13816329836845398 }, { "entropy": 8.970389366149902, "epoch": 0.6926043108562389, "mean_token_accuracy": 0.7370689511299133, "num_tokens": 15640832.0, "step": 7005, "train/ce_loss": 1.5509412288665771 }, { "epoch": 0.6926043108562389, "step": 7005, "train/sim_loss": 0.046875 }, { "epoch": 0.6926043108562389, "step": 7005, "train/total_loss": 0.20196913182735443 }, { "entropy": 9.440950393676758, "epoch": 0.6927031837057545, "mean_token_accuracy": 0.7175572514533997, "num_tokens": 15645719.0, "step": 7006, "train/ce_loss": 5.76427519263234e-05 }, { "epoch": 0.6927031837057545, "step": 7006, "train/sim_loss": 0.0234375 }, { "epoch": 0.6927031837057545, "step": 7006, "train/total_loss": 0.02344326488673687 }, { "entropy": 8.93044662475586, "epoch": 0.6928020565552699, "mean_token_accuracy": 0.7400000095367432, "num_tokens": 15651044.0, "step": 7007, "train/ce_loss": 2.412987214484019e-06 }, { "epoch": 0.6928020565552699, "step": 7007, "train/sim_loss": 0.01953125 }, { "epoch": 0.6928020565552699, "step": 7007, "train/total_loss": 0.0195314921438694 }, { "entropy": 8.99925422668457, "epoch": 0.6929009294047854, "mean_token_accuracy": 0.7074742317199707, "num_tokens": 15656305.0, "step": 7008, "train/ce_loss": 1.2787649631500244 }, { "epoch": 0.6929009294047854, "step": 7008, "train/sim_loss": 0.05078125 }, { "epoch": 0.6929009294047854, "step": 7008, "train/total_loss": 0.17865775525569916 }, { "entropy": 9.029271125793457, "epoch": 0.692999802254301, "mean_token_accuracy": 0.81717449426651, "num_tokens": 15661522.0, "step": 7009, "train/ce_loss": 0.8705811500549316 }, { "epoch": 0.692999802254301, "step": 7009, "train/sim_loss": 0.046875 }, { "epoch": 0.692999802254301, "step": 7009, "train/total_loss": 0.13393312692642212 }, { "entropy": 8.652618408203125, "epoch": 0.6930986751038165, "mean_token_accuracy": 0.6761229038238525, "num_tokens": 15666832.0, "step": 7010, "train/ce_loss": 0.34817835688591003 }, { "epoch": 0.6930986751038165, "step": 7010, "train/sim_loss": 0.046875 }, { "epoch": 0.6930986751038165, "step": 7010, "train/total_loss": 0.08169283717870712 }, { "entropy": 8.780120849609375, "epoch": 0.693197547953332, "mean_token_accuracy": 0.7528344392776489, "num_tokens": 15672164.0, "step": 7011, "train/ce_loss": 0.5572643876075745 }, { "epoch": 0.693197547953332, "step": 7011, "train/sim_loss": 0.078125 }, { "epoch": 0.693197547953332, "step": 7011, "train/total_loss": 0.13385143876075745 }, { "entropy": 8.186322212219238, "epoch": 0.6932964208028476, "mean_token_accuracy": 0.6894736886024475, "num_tokens": 15677614.0, "step": 7012, "train/ce_loss": 1.051735281944275 }, { "epoch": 0.6932964208028476, "step": 7012, "train/sim_loss": 0.07421875 }, { "epoch": 0.6932964208028476, "step": 7012, "train/total_loss": 0.1793922781944275 }, { "entropy": 8.736364364624023, "epoch": 0.6933952936523631, "mean_token_accuracy": 0.734375, "num_tokens": 15682890.0, "step": 7013, "train/ce_loss": 1.169240117073059 }, { "epoch": 0.6933952936523631, "step": 7013, "train/sim_loss": 0.02734375 }, { "epoch": 0.6933952936523631, "step": 7013, "train/total_loss": 0.14426776766777039 }, { "entropy": 8.883325576782227, "epoch": 0.6934941665018786, "mean_token_accuracy": 0.7451523542404175, "num_tokens": 15688037.0, "step": 7014, "train/ce_loss": 0.5323801040649414 }, { "epoch": 0.6934941665018786, "step": 7014, "train/sim_loss": 0.03515625 }, { "epoch": 0.6934941665018786, "step": 7014, "train/total_loss": 0.08839426189661026 }, { "entropy": 8.72563648223877, "epoch": 0.6935930393513942, "mean_token_accuracy": 0.7043189406394958, "num_tokens": 15693454.0, "step": 7015, "train/ce_loss": 0.7103878259658813 }, { "epoch": 0.6935930393513942, "step": 7015, "train/sim_loss": 0.05078125 }, { "epoch": 0.6935930393513942, "step": 7015, "train/total_loss": 0.12182003259658813 }, { "entropy": 9.101256370544434, "epoch": 0.6936919122009096, "mean_token_accuracy": 0.7774342894554138, "num_tokens": 15698527.0, "step": 7016, "train/ce_loss": 0.7961004972457886 }, { "epoch": 0.6936919122009096, "step": 7016, "train/sim_loss": 0.01953125 }, { "epoch": 0.6936919122009096, "step": 7016, "train/total_loss": 0.09914129972457886 }, { "entropy": 9.542841911315918, "epoch": 0.6937907850504251, "mean_token_accuracy": 0.790123462677002, "num_tokens": 15703310.0, "step": 7017, "train/ce_loss": 1.63310968875885 }, { "epoch": 0.6937907850504251, "step": 7017, "train/sim_loss": 0.046875 }, { "epoch": 0.6937907850504251, "step": 7017, "train/total_loss": 0.2101859748363495 }, { "entropy": 9.061891555786133, "epoch": 0.6938896578999407, "mean_token_accuracy": 0.673559844493866, "num_tokens": 15708451.0, "step": 7018, "train/ce_loss": 1.4210647344589233 }, { "epoch": 0.6938896578999407, "step": 7018, "train/sim_loss": 0.046875 }, { "epoch": 0.6938896578999407, "step": 7018, "train/total_loss": 0.18898147344589233 }, { "entropy": 8.897470474243164, "epoch": 0.6939885307494562, "mean_token_accuracy": 0.7648686170578003, "num_tokens": 15713599.0, "step": 7019, "train/ce_loss": 0.8759534955024719 }, { "epoch": 0.6939885307494562, "step": 7019, "train/sim_loss": 0.0625 }, { "epoch": 0.6939885307494562, "step": 7019, "train/total_loss": 0.15009534358978271 }, { "epoch": 0.6940874035989717, "grad_norm": 0.6430907845497131, "learning_rate": 8.26707214557682e-06, "loss": 0.1372, "step": 7020 }, { "entropy": 9.013862609863281, "epoch": 0.6940874035989717, "mean_token_accuracy": 0.7945619225502014, "num_tokens": 15718791.0, "step": 7020, "train/ce_loss": 0.6018125414848328 }, { "epoch": 0.6940874035989717, "step": 7020, "train/sim_loss": 0.04296875 }, { "epoch": 0.6940874035989717, "step": 7020, "train/total_loss": 0.10315001010894775 }, { "entropy": 9.094813346862793, "epoch": 0.6941862764484873, "mean_token_accuracy": 0.7145214676856995, "num_tokens": 15723836.0, "step": 7021, "train/ce_loss": 1.3392627239227295 }, { "epoch": 0.6941862764484873, "step": 7021, "train/sim_loss": 0.0625 }, { "epoch": 0.6941862764484873, "step": 7021, "train/total_loss": 0.19642627239227295 }, { "entropy": 9.047096252441406, "epoch": 0.6942851492980028, "mean_token_accuracy": 0.7503949403762817, "num_tokens": 15729062.0, "step": 7022, "train/ce_loss": 0.79969322681427 }, { "epoch": 0.6942851492980028, "step": 7022, "train/sim_loss": 0.12890625 }, { "epoch": 0.6942851492980028, "step": 7022, "train/total_loss": 0.20887556672096252 }, { "entropy": 9.305543899536133, "epoch": 0.6943840221475183, "mean_token_accuracy": 0.7069536447525024, "num_tokens": 15734089.0, "step": 7023, "train/ce_loss": 1.635424017906189 }, { "epoch": 0.6943840221475183, "step": 7023, "train/sim_loss": 0.0390625 }, { "epoch": 0.6943840221475183, "step": 7023, "train/total_loss": 0.20260490477085114 }, { "entropy": 8.614456176757812, "epoch": 0.6944828949970339, "mean_token_accuracy": 0.6590662598609924, "num_tokens": 15739476.0, "step": 7024, "train/ce_loss": 1.0922640562057495 }, { "epoch": 0.6944828949970339, "step": 7024, "train/sim_loss": 0.109375 }, { "epoch": 0.6944828949970339, "step": 7024, "train/total_loss": 0.21860140562057495 }, { "entropy": 9.667367935180664, "epoch": 0.6945817678465493, "mean_token_accuracy": 0.7654028534889221, "num_tokens": 15744277.0, "step": 7025, "train/ce_loss": 1.4774781465530396 }, { "epoch": 0.6945817678465493, "step": 7025, "train/sim_loss": 0.0234375 }, { "epoch": 0.6945817678465493, "step": 7025, "train/total_loss": 0.17118531465530396 }, { "entropy": 9.445836067199707, "epoch": 0.6946806406960648, "mean_token_accuracy": 0.764976978302002, "num_tokens": 15749147.0, "step": 7026, "train/ce_loss": 0.7500922083854675 }, { "epoch": 0.6946806406960648, "step": 7026, "train/sim_loss": 0.01953125 }, { "epoch": 0.6946806406960648, "step": 7026, "train/total_loss": 0.09454046934843063 }, { "entropy": 8.696281433105469, "epoch": 0.6947795135455804, "mean_token_accuracy": 0.6740331649780273, "num_tokens": 15754495.0, "step": 7027, "train/ce_loss": 1.7816426753997803 }, { "epoch": 0.6947795135455804, "step": 7027, "train/sim_loss": 0.1015625 }, { "epoch": 0.6947795135455804, "step": 7027, "train/total_loss": 0.2797267735004425 }, { "entropy": 8.684164047241211, "epoch": 0.6948783863950959, "mean_token_accuracy": 0.7372781038284302, "num_tokens": 15759817.0, "step": 7028, "train/ce_loss": 0.6079049706459045 }, { "epoch": 0.6948783863950959, "step": 7028, "train/sim_loss": 0.0234375 }, { "epoch": 0.6948783863950959, "step": 7028, "train/total_loss": 0.08422799408435822 }, { "entropy": 9.313911437988281, "epoch": 0.6949772592446114, "mean_token_accuracy": 0.750629723072052, "num_tokens": 15764658.0, "step": 7029, "train/ce_loss": 1.0196951627731323 }, { "epoch": 0.6949772592446114, "step": 7029, "train/sim_loss": 0.01171875 }, { "epoch": 0.6949772592446114, "step": 7029, "train/total_loss": 0.11368826776742935 }, { "entropy": 9.19039535522461, "epoch": 0.695076132094127, "mean_token_accuracy": 0.7332268357276917, "num_tokens": 15769758.0, "step": 7030, "train/ce_loss": 0.7941244840621948 }, { "epoch": 0.695076132094127, "step": 7030, "train/sim_loss": 0.0234375 }, { "epoch": 0.695076132094127, "step": 7030, "train/total_loss": 0.10284995287656784 }, { "entropy": 8.487091064453125, "epoch": 0.6951750049436425, "mean_token_accuracy": 0.7599545121192932, "num_tokens": 15775114.0, "step": 7031, "train/ce_loss": 1.244439721107483 }, { "epoch": 0.6951750049436425, "step": 7031, "train/sim_loss": 0.0703125 }, { "epoch": 0.6951750049436425, "step": 7031, "train/total_loss": 0.19475647807121277 }, { "entropy": 8.887763023376465, "epoch": 0.695273877793158, "mean_token_accuracy": 0.7837837934494019, "num_tokens": 15780342.0, "step": 7032, "train/ce_loss": 0.9753705263137817 }, { "epoch": 0.695273877793158, "step": 7032, "train/sim_loss": 0.109375 }, { "epoch": 0.695273877793158, "step": 7032, "train/total_loss": 0.2069120556116104 }, { "entropy": 8.522140502929688, "epoch": 0.6953727506426736, "mean_token_accuracy": 0.7234273552894592, "num_tokens": 15785701.0, "step": 7033, "train/ce_loss": 1.1009597778320312 }, { "epoch": 0.6953727506426736, "step": 7033, "train/sim_loss": 0.05078125 }, { "epoch": 0.6953727506426736, "step": 7033, "train/total_loss": 0.16087722778320312 }, { "entropy": 8.658458709716797, "epoch": 0.695471623492189, "mean_token_accuracy": 0.8036999106407166, "num_tokens": 15791293.0, "step": 7034, "train/ce_loss": 0.6290245056152344 }, { "epoch": 0.695471623492189, "step": 7034, "train/sim_loss": 0.0625 }, { "epoch": 0.695471623492189, "step": 7034, "train/total_loss": 0.12540245056152344 }, { "entropy": 9.100859642028809, "epoch": 0.6955704963417045, "mean_token_accuracy": 0.7009202241897583, "num_tokens": 15796324.0, "step": 7035, "train/ce_loss": 2.258258973597549e-06 }, { "epoch": 0.6955704963417045, "step": 7035, "train/sim_loss": 0.04296875 }, { "epoch": 0.6955704963417045, "step": 7035, "train/total_loss": 0.042968977242708206 }, { "entropy": 8.710677146911621, "epoch": 0.6956693691912201, "mean_token_accuracy": 0.7430051565170288, "num_tokens": 15801770.0, "step": 7036, "train/ce_loss": 0.6250700950622559 }, { "epoch": 0.6956693691912201, "step": 7036, "train/sim_loss": 0.0390625 }, { "epoch": 0.6956693691912201, "step": 7036, "train/total_loss": 0.1015695109963417 }, { "entropy": 9.15246868133545, "epoch": 0.6957682420407356, "mean_token_accuracy": 0.7771317958831787, "num_tokens": 15806704.0, "step": 7037, "train/ce_loss": 1.804560661315918 }, { "epoch": 0.6957682420407356, "step": 7037, "train/sim_loss": 0.08203125 }, { "epoch": 0.6957682420407356, "step": 7037, "train/total_loss": 0.2624873220920563 }, { "entropy": 8.971076011657715, "epoch": 0.6958671148902511, "mean_token_accuracy": 0.7671394944190979, "num_tokens": 15812041.0, "step": 7038, "train/ce_loss": 1.947181317518698e-06 }, { "epoch": 0.6958671148902511, "step": 7038, "train/sim_loss": 0.05078125 }, { "epoch": 0.6958671148902511, "step": 7038, "train/total_loss": 0.05078144371509552 }, { "entropy": 8.714999198913574, "epoch": 0.6959659877397667, "mean_token_accuracy": 0.7703225612640381, "num_tokens": 15817481.0, "step": 7039, "train/ce_loss": 0.8191965222358704 }, { "epoch": 0.6959659877397667, "step": 7039, "train/sim_loss": 0.0625 }, { "epoch": 0.6959659877397667, "step": 7039, "train/total_loss": 0.14441965520381927 }, { "epoch": 0.6960648605892822, "grad_norm": 0.6332975625991821, "learning_rate": 8.26212728081887e-06, "loss": 0.1369, "step": 7040 }, { "entropy": 8.384191513061523, "epoch": 0.6960648605892822, "mean_token_accuracy": 0.7045454382896423, "num_tokens": 15822919.0, "step": 7040, "train/ce_loss": 0.9338988065719604 }, { "epoch": 0.6960648605892822, "step": 7040, "train/sim_loss": 0.10546875 }, { "epoch": 0.6960648605892822, "step": 7040, "train/total_loss": 0.19885863363742828 }, { "entropy": 8.649618148803711, "epoch": 0.6961637334387977, "mean_token_accuracy": 0.7686403393745422, "num_tokens": 15828269.0, "step": 7041, "train/ce_loss": 0.6279864311218262 }, { "epoch": 0.6961637334387977, "step": 7041, "train/sim_loss": 0.0234375 }, { "epoch": 0.6961637334387977, "step": 7041, "train/total_loss": 0.0862361416220665 }, { "entropy": 8.825262069702148, "epoch": 0.6962626062883133, "mean_token_accuracy": 0.7746913433074951, "num_tokens": 15833417.0, "step": 7042, "train/ce_loss": 0.810846209526062 }, { "epoch": 0.6962626062883133, "step": 7042, "train/sim_loss": 0.01953125 }, { "epoch": 0.6962626062883133, "step": 7042, "train/total_loss": 0.10061587393283844 }, { "entropy": 8.882431983947754, "epoch": 0.6963614791378288, "mean_token_accuracy": 0.7299168705940247, "num_tokens": 15838598.0, "step": 7043, "train/ce_loss": 1.1015716791152954 }, { "epoch": 0.6963614791378288, "step": 7043, "train/sim_loss": 0.0546875 }, { "epoch": 0.6963614791378288, "step": 7043, "train/total_loss": 0.16484466195106506 }, { "entropy": 8.79694938659668, "epoch": 0.6964603519873442, "mean_token_accuracy": 0.7814726829528809, "num_tokens": 15843913.0, "step": 7044, "train/ce_loss": 0.45228341221809387 }, { "epoch": 0.6964603519873442, "step": 7044, "train/sim_loss": 0.01953125 }, { "epoch": 0.6964603519873442, "step": 7044, "train/total_loss": 0.06475959718227386 }, { "entropy": 8.557976722717285, "epoch": 0.6965592248368598, "mean_token_accuracy": 0.7971863746643066, "num_tokens": 15849218.0, "step": 7045, "train/ce_loss": 0.6079275608062744 }, { "epoch": 0.6965592248368598, "step": 7045, "train/sim_loss": 0.05859375 }, { "epoch": 0.6965592248368598, "step": 7045, "train/total_loss": 0.11938650906085968 }, { "entropy": 8.67821216583252, "epoch": 0.6966580976863753, "mean_token_accuracy": 0.7239868640899658, "num_tokens": 15854649.0, "step": 7046, "train/ce_loss": 0.6561583876609802 }, { "epoch": 0.6966580976863753, "step": 7046, "train/sim_loss": 0.02734375 }, { "epoch": 0.6966580976863753, "step": 7046, "train/total_loss": 0.09295959025621414 }, { "entropy": 8.828751564025879, "epoch": 0.6967569705358908, "mean_token_accuracy": 0.7404255270957947, "num_tokens": 15859791.0, "step": 7047, "train/ce_loss": 4.0537565837439615e-06 }, { "epoch": 0.6967569705358908, "step": 7047, "train/sim_loss": 0.0546875 }, { "epoch": 0.6967569705358908, "step": 7047, "train/total_loss": 0.05468790605664253 }, { "entropy": 8.84826946258545, "epoch": 0.6968558433854064, "mean_token_accuracy": 0.7540983557701111, "num_tokens": 15864923.0, "step": 7048, "train/ce_loss": 1.7639780708123e-05 }, { "epoch": 0.6968558433854064, "step": 7048, "train/sim_loss": 0.0859375 }, { "epoch": 0.6968558433854064, "step": 7048, "train/total_loss": 0.08593926578760147 }, { "entropy": 8.52353286743164, "epoch": 0.6969547162349219, "mean_token_accuracy": 0.8147714138031006, "num_tokens": 15870225.0, "step": 7049, "train/ce_loss": 0.5272038578987122 }, { "epoch": 0.6969547162349219, "step": 7049, "train/sim_loss": 0.01953125 }, { "epoch": 0.6969547162349219, "step": 7049, "train/total_loss": 0.07225163280963898 }, { "entropy": 8.890176773071289, "epoch": 0.6970535890844374, "mean_token_accuracy": 0.748851478099823, "num_tokens": 15875310.0, "step": 7050, "train/ce_loss": 1.0641462802886963 }, { "epoch": 0.6970535890844374, "step": 7050, "train/sim_loss": 0.0546875 }, { "epoch": 0.6970535890844374, "step": 7050, "train/total_loss": 0.16110213100910187 }, { "entropy": 8.700510025024414, "epoch": 0.697152461933953, "mean_token_accuracy": 0.7427577972412109, "num_tokens": 15880645.0, "step": 7051, "train/ce_loss": 1.2868669033050537 }, { "epoch": 0.697152461933953, "step": 7051, "train/sim_loss": 0.05078125 }, { "epoch": 0.697152461933953, "step": 7051, "train/total_loss": 0.17946794629096985 }, { "entropy": 9.045878410339355, "epoch": 0.6972513347834685, "mean_token_accuracy": 0.7814815044403076, "num_tokens": 15885580.0, "step": 7052, "train/ce_loss": 1.1209051609039307 }, { "epoch": 0.6972513347834685, "step": 7052, "train/sim_loss": 0.0390625 }, { "epoch": 0.6972513347834685, "step": 7052, "train/total_loss": 0.15115302801132202 }, { "entropy": 9.706929206848145, "epoch": 0.6973502076329839, "mean_token_accuracy": 0.7095709443092346, "num_tokens": 15890263.0, "step": 7053, "train/ce_loss": 1.4180285930633545 }, { "epoch": 0.6973502076329839, "step": 7053, "train/sim_loss": 0.04296875 }, { "epoch": 0.6973502076329839, "step": 7053, "train/total_loss": 0.1847716122865677 }, { "entropy": 8.205760955810547, "epoch": 0.6974490804824995, "mean_token_accuracy": 0.7109295129776001, "num_tokens": 15895745.0, "step": 7054, "train/ce_loss": 1.4638735055923462 }, { "epoch": 0.6974490804824995, "step": 7054, "train/sim_loss": 0.05859375 }, { "epoch": 0.6974490804824995, "step": 7054, "train/total_loss": 0.20498110353946686 }, { "entropy": 8.700922012329102, "epoch": 0.697547953332015, "mean_token_accuracy": 0.6952381134033203, "num_tokens": 15900979.0, "step": 7055, "train/ce_loss": 1.427583932876587 }, { "epoch": 0.697547953332015, "step": 7055, "train/sim_loss": 0.0625 }, { "epoch": 0.697547953332015, "step": 7055, "train/total_loss": 0.20525839924812317 }, { "entropy": 9.129181861877441, "epoch": 0.6976468261815305, "mean_token_accuracy": 0.699999988079071, "num_tokens": 15906045.0, "step": 7056, "train/ce_loss": 1.8817038536071777 }, { "epoch": 0.6976468261815305, "step": 7056, "train/sim_loss": 0.04296875 }, { "epoch": 0.6976468261815305, "step": 7056, "train/total_loss": 0.23113913834095 }, { "entropy": 8.812071800231934, "epoch": 0.6977456990310461, "mean_token_accuracy": 0.7074999809265137, "num_tokens": 15911278.0, "step": 7057, "train/ce_loss": 1.5500264167785645 }, { "epoch": 0.6977456990310461, "step": 7057, "train/sim_loss": 0.046875 }, { "epoch": 0.6977456990310461, "step": 7057, "train/total_loss": 0.2018776386976242 }, { "entropy": 8.717105865478516, "epoch": 0.6978445718805616, "mean_token_accuracy": 0.7850356101989746, "num_tokens": 15916581.0, "step": 7058, "train/ce_loss": 1.033908724784851 }, { "epoch": 0.6978445718805616, "step": 7058, "train/sim_loss": 0.0390625 }, { "epoch": 0.6978445718805616, "step": 7058, "train/total_loss": 0.1424533724784851 }, { "entropy": 8.572164535522461, "epoch": 0.6979434447300771, "mean_token_accuracy": 0.731517493724823, "num_tokens": 15922098.0, "step": 7059, "train/ce_loss": 1.026472568511963 }, { "epoch": 0.6979434447300771, "step": 7059, "train/sim_loss": 0.0625 }, { "epoch": 0.6979434447300771, "step": 7059, "train/total_loss": 0.16514725983142853 }, { "epoch": 0.6980423175795927, "grad_norm": 0.5985382199287415, "learning_rate": 8.257182416060922e-06, "loss": 0.1269, "step": 7060 }, { "entropy": 8.374982833862305, "epoch": 0.6980423175795927, "mean_token_accuracy": 0.7595682144165039, "num_tokens": 15927602.0, "step": 7060, "train/ce_loss": 0.7092851996421814 }, { "epoch": 0.6980423175795927, "step": 7060, "train/sim_loss": 0.05078125 }, { "epoch": 0.6980423175795927, "step": 7060, "train/total_loss": 0.12170977145433426 }, { "entropy": 8.375333786010742, "epoch": 0.6981411904291082, "mean_token_accuracy": 0.7082917094230652, "num_tokens": 15933067.0, "step": 7061, "train/ce_loss": 1.6360722780227661 }, { "epoch": 0.6981411904291082, "step": 7061, "train/sim_loss": 0.109375 }, { "epoch": 0.6981411904291082, "step": 7061, "train/total_loss": 0.27298223972320557 }, { "entropy": 9.378819465637207, "epoch": 0.6982400632786236, "mean_token_accuracy": 0.7847357988357544, "num_tokens": 15938013.0, "step": 7062, "train/ce_loss": 0.7179803252220154 }, { "epoch": 0.6982400632786236, "step": 7062, "train/sim_loss": 0.01953125 }, { "epoch": 0.6982400632786236, "step": 7062, "train/total_loss": 0.09132928401231766 }, { "entropy": 9.801604270935059, "epoch": 0.6983389361281392, "mean_token_accuracy": 0.7347826361656189, "num_tokens": 15942619.0, "step": 7063, "train/ce_loss": 6.8908921093679965e-06 }, { "epoch": 0.6983389361281392, "step": 7063, "train/sim_loss": 0.0390625 }, { "epoch": 0.6983389361281392, "step": 7063, "train/total_loss": 0.039063189178705215 }, { "entropy": 8.975824356079102, "epoch": 0.6984378089776547, "mean_token_accuracy": 0.7554980516433716, "num_tokens": 15947894.0, "step": 7064, "train/ce_loss": 0.6070204377174377 }, { "epoch": 0.6984378089776547, "step": 7064, "train/sim_loss": 0.09375 }, { "epoch": 0.6984378089776547, "step": 7064, "train/total_loss": 0.15445204079151154 }, { "entropy": 8.47336196899414, "epoch": 0.6985366818271702, "mean_token_accuracy": 0.7288135886192322, "num_tokens": 15953294.0, "step": 7065, "train/ce_loss": 0.7014027833938599 }, { "epoch": 0.6985366818271702, "step": 7065, "train/sim_loss": 0.015625 }, { "epoch": 0.6985366818271702, "step": 7065, "train/total_loss": 0.0857652798295021 }, { "entropy": 9.08005142211914, "epoch": 0.6986355546766858, "mean_token_accuracy": 0.7300509214401245, "num_tokens": 15958338.0, "step": 7066, "train/ce_loss": 0.7763445973396301 }, { "epoch": 0.6986355546766858, "step": 7066, "train/sim_loss": 0.11328125 }, { "epoch": 0.6986355546766858, "step": 7066, "train/total_loss": 0.19091570377349854 }, { "entropy": 8.79062271118164, "epoch": 0.6987344275262013, "mean_token_accuracy": 0.7575392127037048, "num_tokens": 15963659.0, "step": 7067, "train/ce_loss": 0.7277435660362244 }, { "epoch": 0.6987344275262013, "step": 7067, "train/sim_loss": 0.0234375 }, { "epoch": 0.6987344275262013, "step": 7067, "train/total_loss": 0.09621185809373856 }, { "entropy": 9.04910659790039, "epoch": 0.6988333003757168, "mean_token_accuracy": 0.6647887229919434, "num_tokens": 15968820.0, "step": 7068, "train/ce_loss": 1.0253169536590576 }, { "epoch": 0.6988333003757168, "step": 7068, "train/sim_loss": 0.05859375 }, { "epoch": 0.6988333003757168, "step": 7068, "train/total_loss": 0.16112545132637024 }, { "entropy": 9.114204406738281, "epoch": 0.6989321732252324, "mean_token_accuracy": 0.8054298758506775, "num_tokens": 15973904.0, "step": 7069, "train/ce_loss": 2.230848394901841e-06 }, { "epoch": 0.6989321732252324, "step": 7069, "train/sim_loss": 0.01953125 }, { "epoch": 0.6989321732252324, "step": 7069, "train/total_loss": 0.019531473517417908 }, { "entropy": 8.6248779296875, "epoch": 0.6990310460747479, "mean_token_accuracy": 0.781737208366394, "num_tokens": 15979295.0, "step": 7070, "train/ce_loss": 0.9150219559669495 }, { "epoch": 0.6990310460747479, "step": 7070, "train/sim_loss": 0.02734375 }, { "epoch": 0.6990310460747479, "step": 7070, "train/total_loss": 0.11884594708681107 }, { "entropy": 9.219822883605957, "epoch": 0.6991299189242633, "mean_token_accuracy": 0.7191780805587769, "num_tokens": 15984285.0, "step": 7071, "train/ce_loss": 4.236349923303351e-06 }, { "epoch": 0.6991299189242633, "step": 7071, "train/sim_loss": 0.05078125 }, { "epoch": 0.6991299189242633, "step": 7071, "train/total_loss": 0.050781674683094025 }, { "entropy": 8.964593887329102, "epoch": 0.699228791773779, "mean_token_accuracy": 0.8073654174804688, "num_tokens": 15989435.0, "step": 7072, "train/ce_loss": 0.8848397731781006 }, { "epoch": 0.699228791773779, "step": 7072, "train/sim_loss": 0.03125 }, { "epoch": 0.699228791773779, "step": 7072, "train/total_loss": 0.11973398178815842 }, { "entropy": 8.831852912902832, "epoch": 0.6993276646232944, "mean_token_accuracy": 0.7979942560195923, "num_tokens": 15994636.0, "step": 7073, "train/ce_loss": 0.48496463894844055 }, { "epoch": 0.6993276646232944, "step": 7073, "train/sim_loss": 0.015625 }, { "epoch": 0.6993276646232944, "step": 7073, "train/total_loss": 0.06412146985530853 }, { "entropy": 8.867591857910156, "epoch": 0.6994265374728099, "mean_token_accuracy": 0.7411095499992371, "num_tokens": 15999758.0, "step": 7074, "train/ce_loss": 0.6474717855453491 }, { "epoch": 0.6994265374728099, "step": 7074, "train/sim_loss": 0.0234375 }, { "epoch": 0.6994265374728099, "step": 7074, "train/total_loss": 0.08818467706441879 }, { "entropy": 8.66320514678955, "epoch": 0.6995254103223255, "mean_token_accuracy": 0.7608951926231384, "num_tokens": 16005014.0, "step": 7075, "train/ce_loss": 1.0257648229599 }, { "epoch": 0.6995254103223255, "step": 7075, "train/sim_loss": 0.078125 }, { "epoch": 0.6995254103223255, "step": 7075, "train/total_loss": 0.18070149421691895 }, { "entropy": 9.197005271911621, "epoch": 0.699624283171841, "mean_token_accuracy": 0.6825174689292908, "num_tokens": 16010200.0, "step": 7076, "train/ce_loss": 0.7518903017044067 }, { "epoch": 0.699624283171841, "step": 7076, "train/sim_loss": 0.0703125 }, { "epoch": 0.699624283171841, "step": 7076, "train/total_loss": 0.1455015242099762 }, { "entropy": 8.708137512207031, "epoch": 0.6997231560213565, "mean_token_accuracy": 0.778064489364624, "num_tokens": 16015397.0, "step": 7077, "train/ce_loss": 1.0956438779830933 }, { "epoch": 0.6997231560213565, "step": 7077, "train/sim_loss": 0.046875 }, { "epoch": 0.6997231560213565, "step": 7077, "train/total_loss": 0.1564393937587738 }, { "entropy": 8.628875732421875, "epoch": 0.6998220288708721, "mean_token_accuracy": 0.7569866180419922, "num_tokens": 16020699.0, "step": 7078, "train/ce_loss": 0.5898162126541138 }, { "epoch": 0.6998220288708721, "step": 7078, "train/sim_loss": 0.0390625 }, { "epoch": 0.6998220288708721, "step": 7078, "train/total_loss": 0.09804412722587585 }, { "entropy": 8.9287691116333, "epoch": 0.6999209017203876, "mean_token_accuracy": 0.7357512712478638, "num_tokens": 16025927.0, "step": 7079, "train/ce_loss": 0.9454010725021362 }, { "epoch": 0.6999209017203876, "step": 7079, "train/sim_loss": 0.0859375 }, { "epoch": 0.6999209017203876, "step": 7079, "train/total_loss": 0.18047761917114258 }, { "epoch": 0.700019774569903, "grad_norm": 0.6242483258247375, "learning_rate": 8.252237551302973e-06, "loss": 0.1294, "step": 7080 }, { "entropy": 8.967345237731934, "epoch": 0.700019774569903, "mean_token_accuracy": 0.7030848264694214, "num_tokens": 16031203.0, "step": 7080, "train/ce_loss": 0.6736631989479065 }, { "epoch": 0.700019774569903, "step": 7080, "train/sim_loss": 0.0234375 }, { "epoch": 0.700019774569903, "step": 7080, "train/total_loss": 0.09080382436513901 }, { "entropy": 8.525558471679688, "epoch": 0.7001186474194186, "mean_token_accuracy": 0.7220930457115173, "num_tokens": 16036539.0, "step": 7081, "train/ce_loss": 1.2200530767440796 }, { "epoch": 0.7001186474194186, "step": 7081, "train/sim_loss": 0.0390625 }, { "epoch": 0.7001186474194186, "step": 7081, "train/total_loss": 0.16106781363487244 }, { "entropy": 9.250432014465332, "epoch": 0.7002175202689341, "mean_token_accuracy": 0.743879497051239, "num_tokens": 16041435.0, "step": 7082, "train/ce_loss": 1.3173713684082031 }, { "epoch": 0.7002175202689341, "step": 7082, "train/sim_loss": 0.03125 }, { "epoch": 0.7002175202689341, "step": 7082, "train/total_loss": 0.1629871428012848 }, { "entropy": 8.72872257232666, "epoch": 0.7003163931184496, "mean_token_accuracy": 0.74609375, "num_tokens": 16046697.0, "step": 7083, "train/ce_loss": 4.460508080228465e-06 }, { "epoch": 0.7003163931184496, "step": 7083, "train/sim_loss": 0.05078125 }, { "epoch": 0.7003163931184496, "step": 7083, "train/total_loss": 0.050781697034835815 }, { "entropy": 8.655278205871582, "epoch": 0.7004152659679652, "mean_token_accuracy": 0.7363515496253967, "num_tokens": 16051930.0, "step": 7084, "train/ce_loss": 0.8728628754615784 }, { "epoch": 0.7004152659679652, "step": 7084, "train/sim_loss": 0.04296875 }, { "epoch": 0.7004152659679652, "step": 7084, "train/total_loss": 0.13025504350662231 }, { "entropy": 8.845071792602539, "epoch": 0.7005141388174807, "mean_token_accuracy": 0.7242857217788696, "num_tokens": 16057115.0, "step": 7085, "train/ce_loss": 0.8364428877830505 }, { "epoch": 0.7005141388174807, "step": 7085, "train/sim_loss": 0.05078125 }, { "epoch": 0.7005141388174807, "step": 7085, "train/total_loss": 0.134425550699234 }, { "entropy": 8.940481185913086, "epoch": 0.7006130116669962, "mean_token_accuracy": 0.7395973205566406, "num_tokens": 16062287.0, "step": 7086, "train/ce_loss": 1.0155926942825317 }, { "epoch": 0.7006130116669962, "step": 7086, "train/sim_loss": 0.03515625 }, { "epoch": 0.7006130116669962, "step": 7086, "train/total_loss": 0.13671553134918213 }, { "entropy": 8.607136726379395, "epoch": 0.7007118845165118, "mean_token_accuracy": 0.698285698890686, "num_tokens": 16067616.0, "step": 7087, "train/ce_loss": 0.9275574684143066 }, { "epoch": 0.7007118845165118, "step": 7087, "train/sim_loss": 0.04296875 }, { "epoch": 0.7007118845165118, "step": 7087, "train/total_loss": 0.1357244998216629 }, { "entropy": 8.59605598449707, "epoch": 0.7008107573660273, "mean_token_accuracy": 0.7386478185653687, "num_tokens": 16073144.0, "step": 7088, "train/ce_loss": 0.5148741602897644 }, { "epoch": 0.7008107573660273, "step": 7088, "train/sim_loss": 0.04296875 }, { "epoch": 0.7008107573660273, "step": 7088, "train/total_loss": 0.09445616602897644 }, { "entropy": 8.678980827331543, "epoch": 0.7009096302155429, "mean_token_accuracy": 0.7221621870994568, "num_tokens": 16078536.0, "step": 7089, "train/ce_loss": 1.5042873620986938 }, { "epoch": 0.7009096302155429, "step": 7089, "train/sim_loss": 0.04296875 }, { "epoch": 0.7009096302155429, "step": 7089, "train/total_loss": 0.19339749217033386 }, { "entropy": 8.27509880065918, "epoch": 0.7010085030650584, "mean_token_accuracy": 0.747553825378418, "num_tokens": 16084061.0, "step": 7090, "train/ce_loss": 1.1672831773757935 }, { "epoch": 0.7010085030650584, "step": 7090, "train/sim_loss": 0.07421875 }, { "epoch": 0.7010085030650584, "step": 7090, "train/total_loss": 0.19094707071781158 }, { "entropy": 8.718148231506348, "epoch": 0.7011073759145738, "mean_token_accuracy": 0.748062014579773, "num_tokens": 16089291.0, "step": 7091, "train/ce_loss": 1.0415087938308716 }, { "epoch": 0.7011073759145738, "step": 7091, "train/sim_loss": 0.078125 }, { "epoch": 0.7011073759145738, "step": 7091, "train/total_loss": 0.1822758913040161 }, { "entropy": 8.659074783325195, "epoch": 0.7012062487640894, "mean_token_accuracy": 0.7489823698997498, "num_tokens": 16094534.0, "step": 7092, "train/ce_loss": 0.36011967062950134 }, { "epoch": 0.7012062487640894, "step": 7092, "train/sim_loss": 0.02734375 }, { "epoch": 0.7012062487640894, "step": 7092, "train/total_loss": 0.0633557140827179 }, { "entropy": 9.10936164855957, "epoch": 0.7013051216136049, "mean_token_accuracy": 0.7325383424758911, "num_tokens": 16099542.0, "step": 7093, "train/ce_loss": 1.0369967222213745 }, { "epoch": 0.7013051216136049, "step": 7093, "train/sim_loss": 0.02734375 }, { "epoch": 0.7013051216136049, "step": 7093, "train/total_loss": 0.1310434341430664 }, { "entropy": 8.790109634399414, "epoch": 0.7014039944631204, "mean_token_accuracy": 0.7553310990333557, "num_tokens": 16104956.0, "step": 7094, "train/ce_loss": 0.7927346229553223 }, { "epoch": 0.7014039944631204, "step": 7094, "train/sim_loss": 0.0859375 }, { "epoch": 0.7014039944631204, "step": 7094, "train/total_loss": 0.16521096229553223 }, { "entropy": 8.922645568847656, "epoch": 0.701502867312636, "mean_token_accuracy": 0.7646198868751526, "num_tokens": 16110125.0, "step": 7095, "train/ce_loss": 0.5583441257476807 }, { "epoch": 0.701502867312636, "step": 7095, "train/sim_loss": 0.0390625 }, { "epoch": 0.701502867312636, "step": 7095, "train/total_loss": 0.09489691257476807 }, { "entropy": 9.202375411987305, "epoch": 0.7016017401621515, "mean_token_accuracy": 0.7791519165039062, "num_tokens": 16115083.0, "step": 7096, "train/ce_loss": 1.1116911172866821 }, { "epoch": 0.7016017401621515, "step": 7096, "train/sim_loss": 0.04296875 }, { "epoch": 0.7016017401621515, "step": 7096, "train/total_loss": 0.15413786470890045 }, { "entropy": 9.530303955078125, "epoch": 0.701700613011667, "mean_token_accuracy": 0.7134387493133545, "num_tokens": 16119981.0, "step": 7097, "train/ce_loss": 2.1672567527275532e-05 }, { "epoch": 0.701700613011667, "step": 7097, "train/sim_loss": 0.0546875 }, { "epoch": 0.701700613011667, "step": 7097, "train/total_loss": 0.054689668118953705 }, { "entropy": 9.085861206054688, "epoch": 0.7017994858611826, "mean_token_accuracy": 0.7403314709663391, "num_tokens": 16124953.0, "step": 7098, "train/ce_loss": 4.617771992343478e-06 }, { "epoch": 0.7017994858611826, "step": 7098, "train/sim_loss": 0.05078125 }, { "epoch": 0.7017994858611826, "step": 7098, "train/total_loss": 0.05078171193599701 }, { "entropy": 8.683131217956543, "epoch": 0.7018983587106981, "mean_token_accuracy": 0.7448512315750122, "num_tokens": 16130276.0, "step": 7099, "train/ce_loss": 1.2270652055740356 }, { "epoch": 0.7018983587106981, "step": 7099, "train/sim_loss": 0.10546875 }, { "epoch": 0.7018983587106981, "step": 7099, "train/total_loss": 0.22817528247833252 }, { "epoch": 0.7019972315602135, "grad_norm": 0.6079980134963989, "learning_rate": 8.247292686545023e-06, "loss": 0.1378, "step": 7100 }, { "entropy": 9.169426918029785, "epoch": 0.7019972315602135, "mean_token_accuracy": 0.7962675094604492, "num_tokens": 16135351.0, "step": 7100, "train/ce_loss": 0.6476722359657288 }, { "epoch": 0.7019972315602135, "step": 7100, "train/sim_loss": 0.05078125 }, { "epoch": 0.7019972315602135, "step": 7100, "train/total_loss": 0.11554847657680511 }, { "entropy": 8.599437713623047, "epoch": 0.7020961044097291, "mean_token_accuracy": 0.695652186870575, "num_tokens": 16140657.0, "step": 7101, "train/ce_loss": 1.5919551849365234 }, { "epoch": 0.7020961044097291, "step": 7101, "train/sim_loss": 0.0546875 }, { "epoch": 0.7020961044097291, "step": 7101, "train/total_loss": 0.21388302743434906 }, { "entropy": 8.517477989196777, "epoch": 0.7021949772592446, "mean_token_accuracy": 0.7347368597984314, "num_tokens": 16146108.0, "step": 7102, "train/ce_loss": 0.6423676609992981 }, { "epoch": 0.7021949772592446, "step": 7102, "train/sim_loss": 0.05078125 }, { "epoch": 0.7021949772592446, "step": 7102, "train/total_loss": 0.11501801759004593 }, { "entropy": 8.598403930664062, "epoch": 0.7022938501087601, "mean_token_accuracy": 0.6909765005111694, "num_tokens": 16151414.0, "step": 7103, "train/ce_loss": 0.7955908179283142 }, { "epoch": 0.7022938501087601, "step": 7103, "train/sim_loss": 0.06640625 }, { "epoch": 0.7022938501087601, "step": 7103, "train/total_loss": 0.1459653377532959 }, { "entropy": 9.130369186401367, "epoch": 0.7023927229582757, "mean_token_accuracy": 0.7560975551605225, "num_tokens": 16156419.0, "step": 7104, "train/ce_loss": 0.9296554327011108 }, { "epoch": 0.7023927229582757, "step": 7104, "train/sim_loss": 0.01171875 }, { "epoch": 0.7023927229582757, "step": 7104, "train/total_loss": 0.10468429327011108 }, { "entropy": 8.827374458312988, "epoch": 0.7024915958077912, "mean_token_accuracy": 0.7533875107765198, "num_tokens": 16161646.0, "step": 7105, "train/ce_loss": 0.5184999108314514 }, { "epoch": 0.7024915958077912, "step": 7105, "train/sim_loss": 0.02734375 }, { "epoch": 0.7024915958077912, "step": 7105, "train/total_loss": 0.07919374108314514 }, { "entropy": 8.807904243469238, "epoch": 0.7025904686573067, "mean_token_accuracy": 0.7344444394111633, "num_tokens": 16166945.0, "step": 7106, "train/ce_loss": 0.5310591459274292 }, { "epoch": 0.7025904686573067, "step": 7106, "train/sim_loss": 0.0625 }, { "epoch": 0.7025904686573067, "step": 7106, "train/total_loss": 0.1156059205532074 }, { "entropy": 9.410507202148438, "epoch": 0.7026893415068223, "mean_token_accuracy": 0.748971164226532, "num_tokens": 16171838.0, "step": 7107, "train/ce_loss": 3.303274752397556e-06 }, { "epoch": 0.7026893415068223, "step": 7107, "train/sim_loss": 0.046875 }, { "epoch": 0.7026893415068223, "step": 7107, "train/total_loss": 0.04687533155083656 }, { "entropy": 9.01647663116455, "epoch": 0.7027882143563378, "mean_token_accuracy": 0.7506082653999329, "num_tokens": 16177259.0, "step": 7108, "train/ce_loss": 6.459321866714163e-06 }, { "epoch": 0.7027882143563378, "step": 7108, "train/sim_loss": 0.0234375 }, { "epoch": 0.7027882143563378, "step": 7108, "train/total_loss": 0.023438146337866783 }, { "entropy": 8.497390747070312, "epoch": 0.7028870872058532, "mean_token_accuracy": 0.747583270072937, "num_tokens": 16182649.0, "step": 7109, "train/ce_loss": 0.850794792175293 }, { "epoch": 0.7028870872058532, "step": 7109, "train/sim_loss": 0.046875 }, { "epoch": 0.7028870872058532, "step": 7109, "train/total_loss": 0.13195449113845825 }, { "entropy": 8.878206253051758, "epoch": 0.7029859600553688, "mean_token_accuracy": 0.6658415794372559, "num_tokens": 16187907.0, "step": 7110, "train/ce_loss": 0.9725397825241089 }, { "epoch": 0.7029859600553688, "step": 7110, "train/sim_loss": 0.12890625 }, { "epoch": 0.7029859600553688, "step": 7110, "train/total_loss": 0.2261602282524109 }, { "entropy": 8.743565559387207, "epoch": 0.7030848329048843, "mean_token_accuracy": 0.6086448431015015, "num_tokens": 16193219.0, "step": 7111, "train/ce_loss": 0.97925865650177 }, { "epoch": 0.7030848329048843, "step": 7111, "train/sim_loss": 0.09375 }, { "epoch": 0.7030848329048843, "step": 7111, "train/total_loss": 0.19167587161064148 }, { "entropy": 8.753701210021973, "epoch": 0.7031837057543998, "mean_token_accuracy": 0.7647058963775635, "num_tokens": 16198592.0, "step": 7112, "train/ce_loss": 0.5746150612831116 }, { "epoch": 0.7031837057543998, "step": 7112, "train/sim_loss": 0.0234375 }, { "epoch": 0.7031837057543998, "step": 7112, "train/total_loss": 0.08089900761842728 }, { "entropy": 8.577747344970703, "epoch": 0.7032825786039154, "mean_token_accuracy": 0.7411225438117981, "num_tokens": 16203972.0, "step": 7113, "train/ce_loss": 1.0217781066894531 }, { "epoch": 0.7032825786039154, "step": 7113, "train/sim_loss": 0.0625 }, { "epoch": 0.7032825786039154, "step": 7113, "train/total_loss": 0.16467781364917755 }, { "entropy": 8.97984504699707, "epoch": 0.7033814514534309, "mean_token_accuracy": 0.7781690359115601, "num_tokens": 16209017.0, "step": 7114, "train/ce_loss": 0.6582441926002502 }, { "epoch": 0.7033814514534309, "step": 7114, "train/sim_loss": 0.08984375 }, { "epoch": 0.7033814514534309, "step": 7114, "train/total_loss": 0.15566816926002502 }, { "entropy": 8.744083404541016, "epoch": 0.7034803243029464, "mean_token_accuracy": 0.7279411554336548, "num_tokens": 16214460.0, "step": 7115, "train/ce_loss": 1.2731103897094727 }, { "epoch": 0.7034803243029464, "step": 7115, "train/sim_loss": 0.046875 }, { "epoch": 0.7034803243029464, "step": 7115, "train/total_loss": 0.17418603599071503 }, { "entropy": 8.947891235351562, "epoch": 0.703579197152462, "mean_token_accuracy": 0.7562408447265625, "num_tokens": 16219600.0, "step": 7116, "train/ce_loss": 0.5274356603622437 }, { "epoch": 0.703579197152462, "step": 7116, "train/sim_loss": 0.05078125 }, { "epoch": 0.703579197152462, "step": 7116, "train/total_loss": 0.1035248190164566 }, { "entropy": 8.943184852600098, "epoch": 0.7036780700019775, "mean_token_accuracy": 0.6681922078132629, "num_tokens": 16224930.0, "step": 7117, "train/ce_loss": 1.4683958292007446 }, { "epoch": 0.7036780700019775, "step": 7117, "train/sim_loss": 0.0625 }, { "epoch": 0.7036780700019775, "step": 7117, "train/total_loss": 0.20933958888053894 }, { "entropy": 9.058347702026367, "epoch": 0.703776942851493, "mean_token_accuracy": 0.7681607604026794, "num_tokens": 16229995.0, "step": 7118, "train/ce_loss": 0.9557143449783325 }, { "epoch": 0.703776942851493, "step": 7118, "train/sim_loss": 0.02734375 }, { "epoch": 0.703776942851493, "step": 7118, "train/total_loss": 0.12291518598794937 }, { "entropy": 8.758286476135254, "epoch": 0.7038758157010085, "mean_token_accuracy": 0.7527272701263428, "num_tokens": 16235290.0, "step": 7119, "train/ce_loss": 1.1078318357467651 }, { "epoch": 0.7038758157010085, "step": 7119, "train/sim_loss": 0.05859375 }, { "epoch": 0.7038758157010085, "step": 7119, "train/total_loss": 0.169376939535141 }, { "epoch": 0.703974688550524, "grad_norm": 0.5794971585273743, "learning_rate": 8.242347821787075e-06, "loss": 0.1395, "step": 7120 }, { "entropy": 8.493532180786133, "epoch": 0.703974688550524, "mean_token_accuracy": 0.758169949054718, "num_tokens": 16240724.0, "step": 7120, "train/ce_loss": 0.9743674397468567 }, { "epoch": 0.703974688550524, "step": 7120, "train/sim_loss": 0.0390625 }, { "epoch": 0.703974688550524, "step": 7120, "train/total_loss": 0.13649925589561462 }, { "entropy": 8.739421844482422, "epoch": 0.7040735614000395, "mean_token_accuracy": 0.7146371603012085, "num_tokens": 16245995.0, "step": 7121, "train/ce_loss": 0.9262082576751709 }, { "epoch": 0.7040735614000395, "step": 7121, "train/sim_loss": 0.03125 }, { "epoch": 0.7040735614000395, "step": 7121, "train/total_loss": 0.12387082725763321 }, { "entropy": 8.64365005493164, "epoch": 0.7041724342495551, "mean_token_accuracy": 0.7761557102203369, "num_tokens": 16251308.0, "step": 7122, "train/ce_loss": 0.5316778421401978 }, { "epoch": 0.7041724342495551, "step": 7122, "train/sim_loss": 0.04296875 }, { "epoch": 0.7041724342495551, "step": 7122, "train/total_loss": 0.09613654017448425 }, { "entropy": 9.161237716674805, "epoch": 0.7042713070990706, "mean_token_accuracy": 0.7161290049552917, "num_tokens": 16256355.0, "step": 7123, "train/ce_loss": 1.7950961589813232 }, { "epoch": 0.7042713070990706, "step": 7123, "train/sim_loss": 0.10546875 }, { "epoch": 0.7042713070990706, "step": 7123, "train/total_loss": 0.28497838973999023 }, { "entropy": 8.801056861877441, "epoch": 0.7043701799485861, "mean_token_accuracy": 0.7650200128555298, "num_tokens": 16261530.0, "step": 7124, "train/ce_loss": 0.7621773481369019 }, { "epoch": 0.7043701799485861, "step": 7124, "train/sim_loss": 0.04296875 }, { "epoch": 0.7043701799485861, "step": 7124, "train/total_loss": 0.11918648332357407 }, { "entropy": 8.350522994995117, "epoch": 0.7044690527981017, "mean_token_accuracy": 0.6791236996650696, "num_tokens": 16266768.0, "step": 7125, "train/ce_loss": 1.3533482551574707 }, { "epoch": 0.7044690527981017, "step": 7125, "train/sim_loss": 0.08984375 }, { "epoch": 0.7044690527981017, "step": 7125, "train/total_loss": 0.2251785844564438 }, { "entropy": 8.447582244873047, "epoch": 0.7045679256476172, "mean_token_accuracy": 0.7481662631034851, "num_tokens": 16272084.0, "step": 7126, "train/ce_loss": 0.9055770635604858 }, { "epoch": 0.7045679256476172, "step": 7126, "train/sim_loss": 0.09765625 }, { "epoch": 0.7045679256476172, "step": 7126, "train/total_loss": 0.18821395933628082 }, { "entropy": 9.425670623779297, "epoch": 0.7046667984971327, "mean_token_accuracy": 0.7310924530029297, "num_tokens": 16276860.0, "step": 7127, "train/ce_loss": 7.22577397027635e-06 }, { "epoch": 0.7046667984971327, "step": 7127, "train/sim_loss": 0.0390625 }, { "epoch": 0.7046667984971327, "step": 7127, "train/total_loss": 0.0390632227063179 }, { "entropy": 8.652997970581055, "epoch": 0.7047656713466482, "mean_token_accuracy": 0.7416201233863831, "num_tokens": 16282076.0, "step": 7128, "train/ce_loss": 1.3002984523773193 }, { "epoch": 0.7047656713466482, "step": 7128, "train/sim_loss": 0.109375 }, { "epoch": 0.7047656713466482, "step": 7128, "train/total_loss": 0.2394048422574997 }, { "entropy": 8.795637130737305, "epoch": 0.7048645441961637, "mean_token_accuracy": 0.7727839946746826, "num_tokens": 16287340.0, "step": 7129, "train/ce_loss": 1.3004807233810425 }, { "epoch": 0.7048645441961637, "step": 7129, "train/sim_loss": 0.04296875 }, { "epoch": 0.7048645441961637, "step": 7129, "train/total_loss": 0.17301683127880096 }, { "entropy": 9.394787788391113, "epoch": 0.7049634170456792, "mean_token_accuracy": 0.7645630836486816, "num_tokens": 16292204.0, "step": 7130, "train/ce_loss": 1.435694694519043 }, { "epoch": 0.7049634170456792, "step": 7130, "train/sim_loss": 0.06640625 }, { "epoch": 0.7049634170456792, "step": 7130, "train/total_loss": 0.2099757194519043 }, { "entropy": 9.610006332397461, "epoch": 0.7050622898951948, "mean_token_accuracy": 0.7081544995307922, "num_tokens": 16297116.0, "step": 7131, "train/ce_loss": 1.6964994529189426e-06 }, { "epoch": 0.7050622898951948, "step": 7131, "train/sim_loss": 0.06640625 }, { "epoch": 0.7050622898951948, "step": 7131, "train/total_loss": 0.06640642136335373 }, { "entropy": 8.901695251464844, "epoch": 0.7051611627447103, "mean_token_accuracy": 0.6830891966819763, "num_tokens": 16302446.0, "step": 7132, "train/ce_loss": 4.321337655710522e-06 }, { "epoch": 0.7051611627447103, "step": 7132, "train/sim_loss": 0.05078125 }, { "epoch": 0.7051611627447103, "step": 7132, "train/total_loss": 0.05078168213367462 }, { "entropy": 8.928903579711914, "epoch": 0.7052600355942258, "mean_token_accuracy": 0.7589158415794373, "num_tokens": 16307597.0, "step": 7133, "train/ce_loss": 0.7011563777923584 }, { "epoch": 0.7052600355942258, "step": 7133, "train/sim_loss": 0.05078125 }, { "epoch": 0.7052600355942258, "step": 7133, "train/total_loss": 0.12089689075946808 }, { "entropy": 8.908486366271973, "epoch": 0.7053589084437414, "mean_token_accuracy": 0.7802907824516296, "num_tokens": 16312610.0, "step": 7134, "train/ce_loss": 1.5629417475793161e-06 }, { "epoch": 0.7053589084437414, "step": 7134, "train/sim_loss": 0.0234375 }, { "epoch": 0.7053589084437414, "step": 7134, "train/total_loss": 0.023437656462192535 }, { "entropy": 8.977813720703125, "epoch": 0.7054577812932569, "mean_token_accuracy": 0.7375504970550537, "num_tokens": 16317834.0, "step": 7135, "train/ce_loss": 1.288329005241394 }, { "epoch": 0.7054577812932569, "step": 7135, "train/sim_loss": 0.0625 }, { "epoch": 0.7054577812932569, "step": 7135, "train/total_loss": 0.19133290648460388 }, { "entropy": 8.279001235961914, "epoch": 0.7055566541427724, "mean_token_accuracy": 0.7865055203437805, "num_tokens": 16323305.0, "step": 7136, "train/ce_loss": 0.8539165258407593 }, { "epoch": 0.7055566541427724, "step": 7136, "train/sim_loss": 0.02734375 }, { "epoch": 0.7055566541427724, "step": 7136, "train/total_loss": 0.11273540556430817 }, { "entropy": 9.084806442260742, "epoch": 0.705655526992288, "mean_token_accuracy": 0.7059925198554993, "num_tokens": 16328261.0, "step": 7137, "train/ce_loss": 1.0540443658828735 }, { "epoch": 0.705655526992288, "step": 7137, "train/sim_loss": 0.046875 }, { "epoch": 0.705655526992288, "step": 7137, "train/total_loss": 0.15227943658828735 }, { "entropy": 8.779060363769531, "epoch": 0.7057543998418034, "mean_token_accuracy": 0.7353308200836182, "num_tokens": 16333537.0, "step": 7138, "train/ce_loss": 0.8341813683509827 }, { "epoch": 0.7057543998418034, "step": 7138, "train/sim_loss": 0.05859375 }, { "epoch": 0.7057543998418034, "step": 7138, "train/total_loss": 0.1420118808746338 }, { "entropy": 8.768844604492188, "epoch": 0.7058532726913189, "mean_token_accuracy": 0.7597402334213257, "num_tokens": 16338772.0, "step": 7139, "train/ce_loss": 0.41492050886154175 }, { "epoch": 0.7058532726913189, "step": 7139, "train/sim_loss": 0.0546875 }, { "epoch": 0.7058532726913189, "step": 7139, "train/total_loss": 0.0961795523762703 }, { "epoch": 0.7059521455408345, "grad_norm": 0.652470052242279, "learning_rate": 8.237402957029126e-06, "loss": 0.133, "step": 7140 }, { "entropy": 8.763189315795898, "epoch": 0.7059521455408345, "mean_token_accuracy": 0.7318007946014404, "num_tokens": 16344046.0, "step": 7140, "train/ce_loss": 1.0437805652618408 }, { "epoch": 0.7059521455408345, "step": 7140, "train/sim_loss": 0.07421875 }, { "epoch": 0.7059521455408345, "step": 7140, "train/total_loss": 0.17859680950641632 }, { "entropy": 8.776391983032227, "epoch": 0.70605101839035, "mean_token_accuracy": 0.746051013469696, "num_tokens": 16349338.0, "step": 7141, "train/ce_loss": 0.7831575274467468 }, { "epoch": 0.70605101839035, "step": 7141, "train/sim_loss": 0.05078125 }, { "epoch": 0.70605101839035, "step": 7141, "train/total_loss": 0.12909701466560364 }, { "entropy": 8.896232604980469, "epoch": 0.7061498912398655, "mean_token_accuracy": 0.736775815486908, "num_tokens": 16354619.0, "step": 7142, "train/ce_loss": 0.939310610294342 }, { "epoch": 0.7061498912398655, "step": 7142, "train/sim_loss": 0.0625 }, { "epoch": 0.7061498912398655, "step": 7142, "train/total_loss": 0.15643106400966644 }, { "entropy": 8.795890808105469, "epoch": 0.7062487640893811, "mean_token_accuracy": 0.7345678806304932, "num_tokens": 16360063.0, "step": 7143, "train/ce_loss": 0.6977701783180237 }, { "epoch": 0.7062487640893811, "step": 7143, "train/sim_loss": 0.05078125 }, { "epoch": 0.7062487640893811, "step": 7143, "train/total_loss": 0.12055826932191849 }, { "entropy": 8.57271957397461, "epoch": 0.7063476369388966, "mean_token_accuracy": 0.7506082653999329, "num_tokens": 16365378.0, "step": 7144, "train/ce_loss": 0.5306150317192078 }, { "epoch": 0.7063476369388966, "step": 7144, "train/sim_loss": 0.02734375 }, { "epoch": 0.7063476369388966, "step": 7144, "train/total_loss": 0.08040525019168854 }, { "entropy": 9.348947525024414, "epoch": 0.7064465097884121, "mean_token_accuracy": 0.7150635123252869, "num_tokens": 16370323.0, "step": 7145, "train/ce_loss": 1.9032906293869019 }, { "epoch": 0.7064465097884121, "step": 7145, "train/sim_loss": 0.03125 }, { "epoch": 0.7064465097884121, "step": 7145, "train/total_loss": 0.22157905995845795 }, { "entropy": 8.477828025817871, "epoch": 0.7065453826379277, "mean_token_accuracy": 0.7294238805770874, "num_tokens": 16375744.0, "step": 7146, "train/ce_loss": 0.8453947901725769 }, { "epoch": 0.7065453826379277, "step": 7146, "train/sim_loss": 0.0703125 }, { "epoch": 0.7065453826379277, "step": 7146, "train/total_loss": 0.1548519730567932 }, { "entropy": 8.79666805267334, "epoch": 0.7066442554874431, "mean_token_accuracy": 0.7016128897666931, "num_tokens": 16381072.0, "step": 7147, "train/ce_loss": 1.7105617189372424e-06 }, { "epoch": 0.7066442554874431, "step": 7147, "train/sim_loss": 0.0234375 }, { "epoch": 0.7066442554874431, "step": 7147, "train/total_loss": 0.02343767136335373 }, { "entropy": 8.350767135620117, "epoch": 0.7067431283369586, "mean_token_accuracy": 0.7127450704574585, "num_tokens": 16386528.0, "step": 7148, "train/ce_loss": 1.3043437004089355 }, { "epoch": 0.7067431283369586, "step": 7148, "train/sim_loss": 0.0546875 }, { "epoch": 0.7067431283369586, "step": 7148, "train/total_loss": 0.18512187898159027 }, { "entropy": 8.232901573181152, "epoch": 0.7068420011864742, "mean_token_accuracy": 0.7346723079681396, "num_tokens": 16391963.0, "step": 7149, "train/ce_loss": 1.0024423599243164 }, { "epoch": 0.7068420011864742, "step": 7149, "train/sim_loss": 0.0703125 }, { "epoch": 0.7068420011864742, "step": 7149, "train/total_loss": 0.17055673897266388 }, { "entropy": 8.889684677124023, "epoch": 0.7069408740359897, "mean_token_accuracy": 0.7333333492279053, "num_tokens": 16397043.0, "step": 7150, "train/ce_loss": 0.8943809866905212 }, { "epoch": 0.7069408740359897, "step": 7150, "train/sim_loss": 0.03125 }, { "epoch": 0.7069408740359897, "step": 7150, "train/total_loss": 0.12068810313940048 }, { "entropy": 8.396048545837402, "epoch": 0.7070397468855052, "mean_token_accuracy": 0.8077325224876404, "num_tokens": 16402546.0, "step": 7151, "train/ce_loss": 0.43090003728866577 }, { "epoch": 0.7070397468855052, "step": 7151, "train/sim_loss": 0.0234375 }, { "epoch": 0.7070397468855052, "step": 7151, "train/total_loss": 0.06652750074863434 }, { "entropy": 8.451786994934082, "epoch": 0.7071386197350208, "mean_token_accuracy": 0.7062146663665771, "num_tokens": 16407882.0, "step": 7152, "train/ce_loss": 0.7465851306915283 }, { "epoch": 0.7071386197350208, "step": 7152, "train/sim_loss": 0.05078125 }, { "epoch": 0.7071386197350208, "step": 7152, "train/total_loss": 0.12543976306915283 }, { "entropy": 8.876398086547852, "epoch": 0.7072374925845363, "mean_token_accuracy": 0.7353433966636658, "num_tokens": 16412986.0, "step": 7153, "train/ce_loss": 0.7852999567985535 }, { "epoch": 0.7072374925845363, "step": 7153, "train/sim_loss": 0.0390625 }, { "epoch": 0.7072374925845363, "step": 7153, "train/total_loss": 0.11759249866008759 }, { "entropy": 8.655569076538086, "epoch": 0.7073363654340518, "mean_token_accuracy": 0.7562130093574524, "num_tokens": 16418289.0, "step": 7154, "train/ce_loss": 0.7815631031990051 }, { "epoch": 0.7073363654340518, "step": 7154, "train/sim_loss": 0.078125 }, { "epoch": 0.7073363654340518, "step": 7154, "train/total_loss": 0.15628132224082947 }, { "entropy": 8.82831859588623, "epoch": 0.7074352382835674, "mean_token_accuracy": 0.7213930487632751, "num_tokens": 16423545.0, "step": 7155, "train/ce_loss": 0.7690247297286987 }, { "epoch": 0.7074352382835674, "step": 7155, "train/sim_loss": 0.05078125 }, { "epoch": 0.7074352382835674, "step": 7155, "train/total_loss": 0.12768372893333435 }, { "entropy": 8.179976463317871, "epoch": 0.7075341111330828, "mean_token_accuracy": 0.6920821070671082, "num_tokens": 16429042.0, "step": 7156, "train/ce_loss": 0.8388369679450989 }, { "epoch": 0.7075341111330828, "step": 7156, "train/sim_loss": 0.078125 }, { "epoch": 0.7075341111330828, "step": 7156, "train/total_loss": 0.16200870275497437 }, { "entropy": 9.279035568237305, "epoch": 0.7076329839825983, "mean_token_accuracy": 0.6789883375167847, "num_tokens": 16434141.0, "step": 7157, "train/ce_loss": 1.4697717428207397 }, { "epoch": 0.7076329839825983, "step": 7157, "train/sim_loss": 0.0703125 }, { "epoch": 0.7076329839825983, "step": 7157, "train/total_loss": 0.21728967130184174 }, { "entropy": 8.525737762451172, "epoch": 0.7077318568321139, "mean_token_accuracy": 0.720652163028717, "num_tokens": 16439476.0, "step": 7158, "train/ce_loss": 0.44447314739227295 }, { "epoch": 0.7077318568321139, "step": 7158, "train/sim_loss": 0.0390625 }, { "epoch": 0.7077318568321139, "step": 7158, "train/total_loss": 0.08350981771945953 }, { "entropy": 8.48501968383789, "epoch": 0.7078307296816294, "mean_token_accuracy": 0.7268195152282715, "num_tokens": 16444987.0, "step": 7159, "train/ce_loss": 1.2266879081726074 }, { "epoch": 0.7078307296816294, "step": 7159, "train/sim_loss": 0.078125 }, { "epoch": 0.7078307296816294, "step": 7159, "train/total_loss": 0.2007938027381897 }, { "epoch": 0.7079296025311449, "grad_norm": 0.7253368496894836, "learning_rate": 8.232458092271178e-06, "loss": 0.1443, "step": 7160 }, { "entropy": 8.727168083190918, "epoch": 0.7079296025311449, "mean_token_accuracy": 0.7009102702140808, "num_tokens": 16450221.0, "step": 7160, "train/ce_loss": 0.8683127164840698 }, { "epoch": 0.7079296025311449, "step": 7160, "train/sim_loss": 0.09765625 }, { "epoch": 0.7079296025311449, "step": 7160, "train/total_loss": 0.18448752164840698 }, { "entropy": 8.723089218139648, "epoch": 0.7080284753806605, "mean_token_accuracy": 0.7420118451118469, "num_tokens": 16455509.0, "step": 7161, "train/ce_loss": 0.4825913906097412 }, { "epoch": 0.7080284753806605, "step": 7161, "train/sim_loss": 0.05078125 }, { "epoch": 0.7080284753806605, "step": 7161, "train/total_loss": 0.09904038906097412 }, { "entropy": 8.638715744018555, "epoch": 0.708127348230176, "mean_token_accuracy": 0.7156177163124084, "num_tokens": 16460895.0, "step": 7162, "train/ce_loss": 0.9495337605476379 }, { "epoch": 0.708127348230176, "step": 7162, "train/sim_loss": 0.08203125 }, { "epoch": 0.708127348230176, "step": 7162, "train/total_loss": 0.17698463797569275 }, { "entropy": 8.273295402526855, "epoch": 0.7082262210796915, "mean_token_accuracy": 0.7514318227767944, "num_tokens": 16466245.0, "step": 7163, "train/ce_loss": 1.0944613218307495 }, { "epoch": 0.7082262210796915, "step": 7163, "train/sim_loss": 0.06640625 }, { "epoch": 0.7082262210796915, "step": 7163, "train/total_loss": 0.17585238814353943 }, { "entropy": 8.572425842285156, "epoch": 0.7083250939292071, "mean_token_accuracy": 0.7860310673713684, "num_tokens": 16471615.0, "step": 7164, "train/ce_loss": 0.6559258699417114 }, { "epoch": 0.7083250939292071, "step": 7164, "train/sim_loss": 0.01171875 }, { "epoch": 0.7083250939292071, "step": 7164, "train/total_loss": 0.07731133699417114 }, { "entropy": 8.556367874145508, "epoch": 0.7084239667787225, "mean_token_accuracy": 0.7476732134819031, "num_tokens": 16477086.0, "step": 7165, "train/ce_loss": 0.7664522528648376 }, { "epoch": 0.7084239667787225, "step": 7165, "train/sim_loss": 0.0625 }, { "epoch": 0.7084239667787225, "step": 7165, "train/total_loss": 0.13914522528648376 }, { "entropy": 8.366483688354492, "epoch": 0.708522839628238, "mean_token_accuracy": 0.7300000190734863, "num_tokens": 16482415.0, "step": 7166, "train/ce_loss": 1.4523415565490723 }, { "epoch": 0.708522839628238, "step": 7166, "train/sim_loss": 0.03515625 }, { "epoch": 0.708522839628238, "step": 7166, "train/total_loss": 0.180390402674675 }, { "entropy": 8.758155822753906, "epoch": 0.7086217124777536, "mean_token_accuracy": 0.7078651785850525, "num_tokens": 16487429.0, "step": 7167, "train/ce_loss": 1.5139796733856201 }, { "epoch": 0.7086217124777536, "step": 7167, "train/sim_loss": 0.0859375 }, { "epoch": 0.7086217124777536, "step": 7167, "train/total_loss": 0.2373354732990265 }, { "entropy": 8.730156898498535, "epoch": 0.7087205853272691, "mean_token_accuracy": 0.7630619406700134, "num_tokens": 16492721.0, "step": 7168, "train/ce_loss": 0.6302322745323181 }, { "epoch": 0.7087205853272691, "step": 7168, "train/sim_loss": 0.1015625 }, { "epoch": 0.7087205853272691, "step": 7168, "train/total_loss": 0.16458573937416077 }, { "entropy": 8.71129322052002, "epoch": 0.7088194581767846, "mean_token_accuracy": 0.6772428750991821, "num_tokens": 16498044.0, "step": 7169, "train/ce_loss": 1.1904356479644775 }, { "epoch": 0.7088194581767846, "step": 7169, "train/sim_loss": 0.046875 }, { "epoch": 0.7088194581767846, "step": 7169, "train/total_loss": 0.16591855883598328 }, { "entropy": 8.920693397521973, "epoch": 0.7089183310263002, "mean_token_accuracy": 0.7358229756355286, "num_tokens": 16503273.0, "step": 7170, "train/ce_loss": 1.2857366800308228 }, { "epoch": 0.7089183310263002, "step": 7170, "train/sim_loss": 0.08984375 }, { "epoch": 0.7089183310263002, "step": 7170, "train/total_loss": 0.21841742098331451 }, { "entropy": 9.01522445678711, "epoch": 0.7090172038758157, "mean_token_accuracy": 0.7606111764907837, "num_tokens": 16508300.0, "step": 7171, "train/ce_loss": 0.4783221483230591 }, { "epoch": 0.7090172038758157, "step": 7171, "train/sim_loss": 0.07421875 }, { "epoch": 0.7090172038758157, "step": 7171, "train/total_loss": 0.12205097079277039 }, { "entropy": 8.480701446533203, "epoch": 0.7091160767253313, "mean_token_accuracy": 0.7057633996009827, "num_tokens": 16513825.0, "step": 7172, "train/ce_loss": 0.7974771857261658 }, { "epoch": 0.7091160767253313, "step": 7172, "train/sim_loss": 0.03125 }, { "epoch": 0.7091160767253313, "step": 7172, "train/total_loss": 0.11099772155284882 }, { "entropy": 8.477746963500977, "epoch": 0.7092149495748468, "mean_token_accuracy": 0.7269545197486877, "num_tokens": 16519141.0, "step": 7173, "train/ce_loss": 0.4708613455295563 }, { "epoch": 0.7092149495748468, "step": 7173, "train/sim_loss": 0.02734375 }, { "epoch": 0.7092149495748468, "step": 7173, "train/total_loss": 0.07442988455295563 }, { "entropy": 8.391982078552246, "epoch": 0.7093138224243623, "mean_token_accuracy": 0.6977459192276001, "num_tokens": 16524613.0, "step": 7174, "train/ce_loss": 1.1097391843795776 }, { "epoch": 0.7093138224243623, "step": 7174, "train/sim_loss": 0.05078125 }, { "epoch": 0.7093138224243623, "step": 7174, "train/total_loss": 0.16175517439842224 }, { "entropy": 9.193445205688477, "epoch": 0.7094126952738778, "mean_token_accuracy": 0.7680412530899048, "num_tokens": 16529469.0, "step": 7175, "train/ce_loss": 1.7883644104003906 }, { "epoch": 0.7094126952738778, "step": 7175, "train/sim_loss": 0.05078125 }, { "epoch": 0.7094126952738778, "step": 7175, "train/total_loss": 0.22961769998073578 }, { "entropy": 9.193803787231445, "epoch": 0.7095115681233933, "mean_token_accuracy": 0.8093883395195007, "num_tokens": 16534618.0, "step": 7176, "train/ce_loss": 0.8265652656555176 }, { "epoch": 0.7095115681233933, "step": 7176, "train/sim_loss": 0.06640625 }, { "epoch": 0.7095115681233933, "step": 7176, "train/total_loss": 0.14906278252601624 }, { "entropy": 8.67904281616211, "epoch": 0.7096104409729088, "mean_token_accuracy": 0.73209547996521, "num_tokens": 16539769.0, "step": 7177, "train/ce_loss": 0.670390784740448 }, { "epoch": 0.7096104409729088, "step": 7177, "train/sim_loss": 0.0625 }, { "epoch": 0.7096104409729088, "step": 7177, "train/total_loss": 0.12953907251358032 }, { "entropy": 9.012173652648926, "epoch": 0.7097093138224244, "mean_token_accuracy": 0.7704447507858276, "num_tokens": 16544894.0, "step": 7178, "train/ce_loss": 0.8676934838294983 }, { "epoch": 0.7097093138224244, "step": 7178, "train/sim_loss": 0.0859375 }, { "epoch": 0.7097093138224244, "step": 7178, "train/total_loss": 0.17270684242248535 }, { "entropy": 8.632959365844727, "epoch": 0.7098081866719399, "mean_token_accuracy": 0.7363238334655762, "num_tokens": 16550413.0, "step": 7179, "train/ce_loss": 1.1974496841430664 }, { "epoch": 0.7098081866719399, "step": 7179, "train/sim_loss": 0.1015625 }, { "epoch": 0.7098081866719399, "step": 7179, "train/total_loss": 0.22130747139453888 }, { "epoch": 0.7099070595214554, "grad_norm": 0.6061992645263672, "learning_rate": 8.227513227513229e-06, "loss": 0.1407, "step": 7180 }, { "entropy": 8.753702163696289, "epoch": 0.7099070595214554, "mean_token_accuracy": 0.7413366436958313, "num_tokens": 16555710.0, "step": 7180, "train/ce_loss": 1.0147998332977295 }, { "epoch": 0.7099070595214554, "step": 7180, "train/sim_loss": 0.10546875 }, { "epoch": 0.7099070595214554, "step": 7180, "train/total_loss": 0.20694872736930847 }, { "entropy": 9.063572883605957, "epoch": 0.710005932370971, "mean_token_accuracy": 0.7465437650680542, "num_tokens": 16560745.0, "step": 7181, "train/ce_loss": 0.8497380018234253 }, { "epoch": 0.710005932370971, "step": 7181, "train/sim_loss": 0.1015625 }, { "epoch": 0.710005932370971, "step": 7181, "train/total_loss": 0.18653631210327148 }, { "entropy": 9.492816925048828, "epoch": 0.7101048052204865, "mean_token_accuracy": 0.6989796161651611, "num_tokens": 16565562.0, "step": 7182, "train/ce_loss": 1.6922687292099 }, { "epoch": 0.7101048052204865, "step": 7182, "train/sim_loss": 0.0234375 }, { "epoch": 0.7101048052204865, "step": 7182, "train/total_loss": 0.19266436994075775 }, { "entropy": 8.737098693847656, "epoch": 0.710203678070002, "mean_token_accuracy": 0.7596795558929443, "num_tokens": 16570949.0, "step": 7183, "train/ce_loss": 0.9820594191551208 }, { "epoch": 0.710203678070002, "step": 7183, "train/sim_loss": 0.109375 }, { "epoch": 0.710203678070002, "step": 7183, "train/total_loss": 0.20758095383644104 }, { "entropy": 9.075728416442871, "epoch": 0.7103025509195176, "mean_token_accuracy": 0.7591836452484131, "num_tokens": 16575852.0, "step": 7184, "train/ce_loss": 3.5117921015626052e-06 }, { "epoch": 0.7103025509195176, "step": 7184, "train/sim_loss": 0.04296875 }, { "epoch": 0.7103025509195176, "step": 7184, "train/total_loss": 0.042969100177288055 }, { "entropy": 8.564920425415039, "epoch": 0.710401423769033, "mean_token_accuracy": 0.7653301954269409, "num_tokens": 16581185.0, "step": 7185, "train/ce_loss": 0.5567665696144104 }, { "epoch": 0.710401423769033, "step": 7185, "train/sim_loss": 0.0390625 }, { "epoch": 0.710401423769033, "step": 7185, "train/total_loss": 0.0947391539812088 }, { "entropy": 8.391427993774414, "epoch": 0.7105002966185485, "mean_token_accuracy": 0.7432170510292053, "num_tokens": 16586717.0, "step": 7186, "train/ce_loss": 0.5869799852371216 }, { "epoch": 0.7105002966185485, "step": 7186, "train/sim_loss": 0.08203125 }, { "epoch": 0.7105002966185485, "step": 7186, "train/total_loss": 0.14072924852371216 }, { "entropy": 8.852867126464844, "epoch": 0.7105991694680641, "mean_token_accuracy": 0.739570140838623, "num_tokens": 16591969.0, "step": 7187, "train/ce_loss": 1.23922598361969 }, { "epoch": 0.7105991694680641, "step": 7187, "train/sim_loss": 0.078125 }, { "epoch": 0.7105991694680641, "step": 7187, "train/total_loss": 0.20204760134220123 }, { "entropy": 9.074911117553711, "epoch": 0.7106980423175796, "mean_token_accuracy": 0.6957831382751465, "num_tokens": 16597053.0, "step": 7188, "train/ce_loss": 1.0194846391677856 }, { "epoch": 0.7106980423175796, "step": 7188, "train/sim_loss": 0.02734375 }, { "epoch": 0.7106980423175796, "step": 7188, "train/total_loss": 0.12929221987724304 }, { "entropy": 9.232538223266602, "epoch": 0.7107969151670951, "mean_token_accuracy": 0.761904776096344, "num_tokens": 16601888.0, "step": 7189, "train/ce_loss": 2.124577522277832 }, { "epoch": 0.7107969151670951, "step": 7189, "train/sim_loss": 0.15625 }, { "epoch": 0.7107969151670951, "step": 7189, "train/total_loss": 0.3687077760696411 }, { "entropy": 8.746088981628418, "epoch": 0.7108957880166107, "mean_token_accuracy": 0.6945031881332397, "num_tokens": 16607285.0, "step": 7190, "train/ce_loss": 1.0544683933258057 }, { "epoch": 0.7108957880166107, "step": 7190, "train/sim_loss": 0.03125 }, { "epoch": 0.7108957880166107, "step": 7190, "train/total_loss": 0.13669684529304504 }, { "entropy": 9.125240325927734, "epoch": 0.7109946608661262, "mean_token_accuracy": 0.8063872456550598, "num_tokens": 16612238.0, "step": 7191, "train/ce_loss": 2.3707571017439477e-06 }, { "epoch": 0.7109946608661262, "step": 7191, "train/sim_loss": 0.078125 }, { "epoch": 0.7109946608661262, "step": 7191, "train/total_loss": 0.0781252384185791 }, { "entropy": 8.543449401855469, "epoch": 0.7110935337156417, "mean_token_accuracy": 0.7433832287788391, "num_tokens": 16617597.0, "step": 7192, "train/ce_loss": 1.0866926908493042 }, { "epoch": 0.7110935337156417, "step": 7192, "train/sim_loss": 0.05078125 }, { "epoch": 0.7110935337156417, "step": 7192, "train/total_loss": 0.15945053100585938 }, { "entropy": 9.435101509094238, "epoch": 0.7111924065651573, "mean_token_accuracy": 0.6426734924316406, "num_tokens": 16622413.0, "step": 7193, "train/ce_loss": 8.009789780771825e-06 }, { "epoch": 0.7111924065651573, "step": 7193, "train/sim_loss": 0.0390625 }, { "epoch": 0.7111924065651573, "step": 7193, "train/total_loss": 0.03906330093741417 }, { "entropy": 8.519792556762695, "epoch": 0.7112912794146727, "mean_token_accuracy": 0.7894088625907898, "num_tokens": 16627750.0, "step": 7194, "train/ce_loss": 0.3221474289894104 }, { "epoch": 0.7112912794146727, "step": 7194, "train/sim_loss": 0.015625 }, { "epoch": 0.7112912794146727, "step": 7194, "train/total_loss": 0.04783974215388298 }, { "entropy": 8.549459457397461, "epoch": 0.7113901522641882, "mean_token_accuracy": 0.6720741391181946, "num_tokens": 16633098.0, "step": 7195, "train/ce_loss": 1.1505359411239624 }, { "epoch": 0.7113901522641882, "step": 7195, "train/sim_loss": 0.05078125 }, { "epoch": 0.7113901522641882, "step": 7195, "train/total_loss": 0.16583484411239624 }, { "entropy": 8.939401626586914, "epoch": 0.7114890251137038, "mean_token_accuracy": 0.7618343234062195, "num_tokens": 16638198.0, "step": 7196, "train/ce_loss": 1.1567680835723877 }, { "epoch": 0.7114890251137038, "step": 7196, "train/sim_loss": 0.07421875 }, { "epoch": 0.7114890251137038, "step": 7196, "train/total_loss": 0.18989557027816772 }, { "entropy": 8.785555839538574, "epoch": 0.7115878979632193, "mean_token_accuracy": 0.807212233543396, "num_tokens": 16643387.0, "step": 7197, "train/ce_loss": 1.0497312545776367 }, { "epoch": 0.7115878979632193, "step": 7197, "train/sim_loss": 0.0703125 }, { "epoch": 0.7115878979632193, "step": 7197, "train/total_loss": 0.17528563737869263 }, { "entropy": 8.629180908203125, "epoch": 0.7116867708127348, "mean_token_accuracy": 0.7587769031524658, "num_tokens": 16648693.0, "step": 7198, "train/ce_loss": 0.8789370656013489 }, { "epoch": 0.7116867708127348, "step": 7198, "train/sim_loss": 0.0703125 }, { "epoch": 0.7116867708127348, "step": 7198, "train/total_loss": 0.15820620954036713 }, { "entropy": 9.374748229980469, "epoch": 0.7117856436622504, "mean_token_accuracy": 0.7395833134651184, "num_tokens": 16653588.0, "step": 7199, "train/ce_loss": 3.264047563789063e-06 }, { "epoch": 0.7117856436622504, "step": 7199, "train/sim_loss": 0.046875 }, { "epoch": 0.7117856436622504, "step": 7199, "train/total_loss": 0.046875327825546265 }, { "epoch": 0.7118845165117659, "grad_norm": 0.7894151210784912, "learning_rate": 8.222568362755279e-06, "loss": 0.1379, "step": 7200 }, { "entropy": 9.173515319824219, "epoch": 0.7118845165117659, "mean_token_accuracy": 0.75, "num_tokens": 16658565.0, "step": 7200, "train/ce_loss": 3.8327752918121405e-06 }, { "epoch": 0.7118845165117659, "step": 7200, "train/sim_loss": 0.0390625 }, { "epoch": 0.7118845165117659, "step": 7200, "train/total_loss": 0.03906288370490074 }, { "entropy": 8.6094970703125, "epoch": 0.7119833893612814, "mean_token_accuracy": 0.6755725145339966, "num_tokens": 16663803.0, "step": 7201, "train/ce_loss": 3.767704765778035e-05 }, { "epoch": 0.7119833893612814, "step": 7201, "train/sim_loss": 0.03125 }, { "epoch": 0.7119833893612814, "step": 7201, "train/total_loss": 0.031253766268491745 }, { "entropy": 8.67281436920166, "epoch": 0.712082262210797, "mean_token_accuracy": 0.7536889910697937, "num_tokens": 16669137.0, "step": 7202, "train/ce_loss": 0.8251853585243225 }, { "epoch": 0.712082262210797, "step": 7202, "train/sim_loss": 0.109375 }, { "epoch": 0.712082262210797, "step": 7202, "train/total_loss": 0.1918935477733612 }, { "entropy": 8.776318550109863, "epoch": 0.7121811350603124, "mean_token_accuracy": 0.759856641292572, "num_tokens": 16674566.0, "step": 7203, "train/ce_loss": 0.6415863037109375 }, { "epoch": 0.7121811350603124, "step": 7203, "train/sim_loss": 0.09765625 }, { "epoch": 0.7121811350603124, "step": 7203, "train/total_loss": 0.161814883351326 }, { "entropy": 8.146249771118164, "epoch": 0.7122800079098279, "mean_token_accuracy": 0.7675840854644775, "num_tokens": 16680003.0, "step": 7204, "train/ce_loss": 0.5550179481506348 }, { "epoch": 0.7122800079098279, "step": 7204, "train/sim_loss": 0.0703125 }, { "epoch": 0.7122800079098279, "step": 7204, "train/total_loss": 0.125814288854599 }, { "entropy": 8.744900703430176, "epoch": 0.7123788807593435, "mean_token_accuracy": 0.727173924446106, "num_tokens": 16685414.0, "step": 7205, "train/ce_loss": 0.7248499393463135 }, { "epoch": 0.7123788807593435, "step": 7205, "train/sim_loss": 0.04296875 }, { "epoch": 0.7123788807593435, "step": 7205, "train/total_loss": 0.11545374244451523 }, { "entropy": 8.859378814697266, "epoch": 0.712477753608859, "mean_token_accuracy": 0.75, "num_tokens": 16690588.0, "step": 7206, "train/ce_loss": 0.5503898859024048 }, { "epoch": 0.712477753608859, "step": 7206, "train/sim_loss": 0.0234375 }, { "epoch": 0.712477753608859, "step": 7206, "train/total_loss": 0.07847648859024048 }, { "entropy": 9.277009963989258, "epoch": 0.7125766264583745, "mean_token_accuracy": 0.7433264851570129, "num_tokens": 16695516.0, "step": 7207, "train/ce_loss": 1.363459825515747 }, { "epoch": 0.7125766264583745, "step": 7207, "train/sim_loss": 0.0625 }, { "epoch": 0.7125766264583745, "step": 7207, "train/total_loss": 0.1988459825515747 }, { "entropy": 8.88760757446289, "epoch": 0.7126754993078901, "mean_token_accuracy": 0.7215026021003723, "num_tokens": 16700755.0, "step": 7208, "train/ce_loss": 0.5441802144050598 }, { "epoch": 0.7126754993078901, "step": 7208, "train/sim_loss": 0.03125 }, { "epoch": 0.7126754993078901, "step": 7208, "train/total_loss": 0.08566802740097046 }, { "entropy": 8.83308219909668, "epoch": 0.7127743721574056, "mean_token_accuracy": 0.751329779624939, "num_tokens": 16705945.0, "step": 7209, "train/ce_loss": 0.4552195072174072 }, { "epoch": 0.7127743721574056, "step": 7209, "train/sim_loss": 0.05859375 }, { "epoch": 0.7127743721574056, "step": 7209, "train/total_loss": 0.10411570221185684 }, { "entropy": 8.83424186706543, "epoch": 0.7128732450069211, "mean_token_accuracy": 0.8017789125442505, "num_tokens": 16711160.0, "step": 7210, "train/ce_loss": 0.930652916431427 }, { "epoch": 0.7128732450069211, "step": 7210, "train/sim_loss": 0.03125 }, { "epoch": 0.7128732450069211, "step": 7210, "train/total_loss": 0.1243152916431427 }, { "entropy": 8.777759552001953, "epoch": 0.7129721178564367, "mean_token_accuracy": 0.7257204055786133, "num_tokens": 16716523.0, "step": 7211, "train/ce_loss": 0.6090152859687805 }, { "epoch": 0.7129721178564367, "step": 7211, "train/sim_loss": 0.0234375 }, { "epoch": 0.7129721178564367, "step": 7211, "train/total_loss": 0.08433903008699417 }, { "entropy": 8.86879825592041, "epoch": 0.7130709907059521, "mean_token_accuracy": 0.7698630094528198, "num_tokens": 16721891.0, "step": 7212, "train/ce_loss": 0.30563464760780334 }, { "epoch": 0.7130709907059521, "step": 7212, "train/sim_loss": 0.01953125 }, { "epoch": 0.7130709907059521, "step": 7212, "train/total_loss": 0.050094716250896454 }, { "entropy": 9.277276992797852, "epoch": 0.7131698635554676, "mean_token_accuracy": 0.7074379920959473, "num_tokens": 16727058.0, "step": 7213, "train/ce_loss": 1.1419485807418823 }, { "epoch": 0.7131698635554676, "step": 7213, "train/sim_loss": 0.08203125 }, { "epoch": 0.7131698635554676, "step": 7213, "train/total_loss": 0.1962261199951172 }, { "entropy": 8.50288200378418, "epoch": 0.7132687364049832, "mean_token_accuracy": 0.7432712316513062, "num_tokens": 16732460.0, "step": 7214, "train/ce_loss": 0.844182550907135 }, { "epoch": 0.7132687364049832, "step": 7214, "train/sim_loss": 0.0703125 }, { "epoch": 0.7132687364049832, "step": 7214, "train/total_loss": 0.15473076701164246 }, { "entropy": 8.25242805480957, "epoch": 0.7133676092544987, "mean_token_accuracy": 0.8155699968338013, "num_tokens": 16738011.0, "step": 7215, "train/ce_loss": 0.46100690960884094 }, { "epoch": 0.7133676092544987, "step": 7215, "train/sim_loss": 0.01171875 }, { "epoch": 0.7133676092544987, "step": 7215, "train/total_loss": 0.057819440960884094 }, { "entropy": 8.90095329284668, "epoch": 0.7134664821040142, "mean_token_accuracy": 0.7601199150085449, "num_tokens": 16743169.0, "step": 7216, "train/ce_loss": 1.0977258682250977 }, { "epoch": 0.7134664821040142, "step": 7216, "train/sim_loss": 0.03125 }, { "epoch": 0.7134664821040142, "step": 7216, "train/total_loss": 0.14102259278297424 }, { "entropy": 8.77444076538086, "epoch": 0.7135653549535298, "mean_token_accuracy": 0.6976456046104431, "num_tokens": 16748427.0, "step": 7217, "train/ce_loss": 1.2158559560775757 }, { "epoch": 0.7135653549535298, "step": 7217, "train/sim_loss": 0.0859375 }, { "epoch": 0.7135653549535298, "step": 7217, "train/total_loss": 0.20752310752868652 }, { "entropy": 8.828907012939453, "epoch": 0.7136642278030453, "mean_token_accuracy": 0.7189265489578247, "num_tokens": 16753620.0, "step": 7218, "train/ce_loss": 1.1605870723724365 }, { "epoch": 0.7136642278030453, "step": 7218, "train/sim_loss": 0.0625 }, { "epoch": 0.7136642278030453, "step": 7218, "train/total_loss": 0.17855870723724365 }, { "entropy": 9.13592529296875, "epoch": 0.7137631006525608, "mean_token_accuracy": 0.7054794430732727, "num_tokens": 16758634.0, "step": 7219, "train/ce_loss": 1.2255994081497192 }, { "epoch": 0.7137631006525608, "step": 7219, "train/sim_loss": 0.078125 }, { "epoch": 0.7137631006525608, "step": 7219, "train/total_loss": 0.20068493485450745 }, { "epoch": 0.7138619735020764, "grad_norm": 0.8070234656333923, "learning_rate": 8.21762349799733e-06, "loss": 0.1408, "step": 7220 }, { "entropy": 8.727354049682617, "epoch": 0.7138619735020764, "mean_token_accuracy": 0.6777523159980774, "num_tokens": 16763997.0, "step": 7220, "train/ce_loss": 0.7659044861793518 }, { "epoch": 0.7138619735020764, "step": 7220, "train/sim_loss": 0.046875 }, { "epoch": 0.7138619735020764, "step": 7220, "train/total_loss": 0.12346544861793518 }, { "entropy": 8.890021324157715, "epoch": 0.7139608463515918, "mean_token_accuracy": 0.767912745475769, "num_tokens": 16769055.0, "step": 7221, "train/ce_loss": 0.47009187936782837 }, { "epoch": 0.7139608463515918, "step": 7221, "train/sim_loss": 0.0390625 }, { "epoch": 0.7139608463515918, "step": 7221, "train/total_loss": 0.0860716849565506 }, { "entropy": 8.97850513458252, "epoch": 0.7140597192011073, "mean_token_accuracy": 0.6974110007286072, "num_tokens": 16774132.0, "step": 7222, "train/ce_loss": 1.646481905481778e-06 }, { "epoch": 0.7140597192011073, "step": 7222, "train/sim_loss": 0.07421875 }, { "epoch": 0.7140597192011073, "step": 7222, "train/total_loss": 0.07421891391277313 }, { "entropy": 8.553703308105469, "epoch": 0.7141585920506229, "mean_token_accuracy": 0.714970052242279, "num_tokens": 16779472.0, "step": 7223, "train/ce_loss": 0.48592522740364075 }, { "epoch": 0.7141585920506229, "step": 7223, "train/sim_loss": 0.03125 }, { "epoch": 0.7141585920506229, "step": 7223, "train/total_loss": 0.07984252274036407 }, { "entropy": 9.40013313293457, "epoch": 0.7142574649001384, "mean_token_accuracy": 0.8220140337944031, "num_tokens": 16784296.0, "step": 7224, "train/ce_loss": 1.2307887077331543 }, { "epoch": 0.7142574649001384, "step": 7224, "train/sim_loss": 0.0234375 }, { "epoch": 0.7142574649001384, "step": 7224, "train/total_loss": 0.14651638269424438 }, { "entropy": 9.15133285522461, "epoch": 0.7143563377496539, "mean_token_accuracy": 0.7839506268501282, "num_tokens": 16789368.0, "step": 7225, "train/ce_loss": 0.8047212958335876 }, { "epoch": 0.7143563377496539, "step": 7225, "train/sim_loss": 0.0234375 }, { "epoch": 0.7143563377496539, "step": 7225, "train/total_loss": 0.10390963405370712 }, { "entropy": 8.94349193572998, "epoch": 0.7144552105991695, "mean_token_accuracy": 0.7742424011230469, "num_tokens": 16794518.0, "step": 7226, "train/ce_loss": 1.8928044482890982e-06 }, { "epoch": 0.7144552105991695, "step": 7226, "train/sim_loss": 0.05859375 }, { "epoch": 0.7144552105991695, "step": 7226, "train/total_loss": 0.05859393998980522 }, { "entropy": 9.260454177856445, "epoch": 0.714554083448685, "mean_token_accuracy": 0.7296360731124878, "num_tokens": 16799497.0, "step": 7227, "train/ce_loss": 2.46664899350435e-06 }, { "epoch": 0.714554083448685, "step": 7227, "train/sim_loss": 0.01953125 }, { "epoch": 0.714554083448685, "step": 7227, "train/total_loss": 0.0195314958691597 }, { "entropy": 8.678625106811523, "epoch": 0.7146529562982005, "mean_token_accuracy": 0.7012820243835449, "num_tokens": 16804724.0, "step": 7228, "train/ce_loss": 1.9900438785552979 }, { "epoch": 0.7146529562982005, "step": 7228, "train/sim_loss": 0.078125 }, { "epoch": 0.7146529562982005, "step": 7228, "train/total_loss": 0.2771294116973877 }, { "entropy": 8.781715393066406, "epoch": 0.7147518291477161, "mean_token_accuracy": 0.748251736164093, "num_tokens": 16809933.0, "step": 7229, "train/ce_loss": 1.2562072277069092 }, { "epoch": 0.7147518291477161, "step": 7229, "train/sim_loss": 0.0625 }, { "epoch": 0.7147518291477161, "step": 7229, "train/total_loss": 0.18812072277069092 }, { "entropy": 8.724804878234863, "epoch": 0.7148507019972316, "mean_token_accuracy": 0.7318652868270874, "num_tokens": 16815153.0, "step": 7230, "train/ce_loss": 0.6815114617347717 }, { "epoch": 0.7148507019972316, "step": 7230, "train/sim_loss": 0.05078125 }, { "epoch": 0.7148507019972316, "step": 7230, "train/total_loss": 0.11893239617347717 }, { "entropy": 8.732213973999023, "epoch": 0.714949574846747, "mean_token_accuracy": 0.6647264361381531, "num_tokens": 16820635.0, "step": 7231, "train/ce_loss": 1.2498207092285156 }, { "epoch": 0.714949574846747, "step": 7231, "train/sim_loss": 0.0390625 }, { "epoch": 0.714949574846747, "step": 7231, "train/total_loss": 0.1640445739030838 }, { "entropy": 8.821627616882324, "epoch": 0.7150484476962626, "mean_token_accuracy": 0.6881851553916931, "num_tokens": 16825913.0, "step": 7232, "train/ce_loss": 0.9049714803695679 }, { "epoch": 0.7150484476962626, "step": 7232, "train/sim_loss": 0.0625 }, { "epoch": 0.7150484476962626, "step": 7232, "train/total_loss": 0.15299715101718903 }, { "entropy": 9.065165519714355, "epoch": 0.7151473205457781, "mean_token_accuracy": 0.7344992160797119, "num_tokens": 16831015.0, "step": 7233, "train/ce_loss": 0.6679177284240723 }, { "epoch": 0.7151473205457781, "step": 7233, "train/sim_loss": 0.046875 }, { "epoch": 0.7151473205457781, "step": 7233, "train/total_loss": 0.11366677284240723 }, { "entropy": 9.262226104736328, "epoch": 0.7152461933952936, "mean_token_accuracy": 0.7128027677536011, "num_tokens": 16836047.0, "step": 7234, "train/ce_loss": 1.1703163385391235 }, { "epoch": 0.7152461933952936, "step": 7234, "train/sim_loss": 0.08203125 }, { "epoch": 0.7152461933952936, "step": 7234, "train/total_loss": 0.19906288385391235 }, { "entropy": 8.538326263427734, "epoch": 0.7153450662448092, "mean_token_accuracy": 0.6952841877937317, "num_tokens": 16841310.0, "step": 7235, "train/ce_loss": 0.514238178730011 }, { "epoch": 0.7153450662448092, "step": 7235, "train/sim_loss": 0.046875 }, { "epoch": 0.7153450662448092, "step": 7235, "train/total_loss": 0.0982988178730011 }, { "entropy": 8.30468463897705, "epoch": 0.7154439390943247, "mean_token_accuracy": 0.7168743014335632, "num_tokens": 16846712.0, "step": 7236, "train/ce_loss": 1.3380621671676636 }, { "epoch": 0.7154439390943247, "step": 7236, "train/sim_loss": 0.05859375 }, { "epoch": 0.7154439390943247, "step": 7236, "train/total_loss": 0.19239996373653412 }, { "entropy": 8.470191955566406, "epoch": 0.7155428119438402, "mean_token_accuracy": 0.7670772671699524, "num_tokens": 16852113.0, "step": 7237, "train/ce_loss": 0.958454430103302 }, { "epoch": 0.7155428119438402, "step": 7237, "train/sim_loss": 0.08984375 }, { "epoch": 0.7155428119438402, "step": 7237, "train/total_loss": 0.18568919599056244 }, { "entropy": 9.10818099975586, "epoch": 0.7156416847933558, "mean_token_accuracy": 0.7554585337638855, "num_tokens": 16857213.0, "step": 7238, "train/ce_loss": 0.8512943983078003 }, { "epoch": 0.7156416847933558, "step": 7238, "train/sim_loss": 0.046875 }, { "epoch": 0.7156416847933558, "step": 7238, "train/total_loss": 0.13200443983078003 }, { "entropy": 8.787921905517578, "epoch": 0.7157405576428713, "mean_token_accuracy": 0.7117117047309875, "num_tokens": 16862428.0, "step": 7239, "train/ce_loss": 0.7441038489341736 }, { "epoch": 0.7157405576428713, "step": 7239, "train/sim_loss": 0.046875 }, { "epoch": 0.7157405576428713, "step": 7239, "train/total_loss": 0.12128538638353348 }, { "epoch": 0.7158394304923867, "grad_norm": 0.7717053890228271, "learning_rate": 8.212678633239382e-06, "loss": 0.1401, "step": 7240 }, { "entropy": 8.294328689575195, "epoch": 0.7158394304923867, "mean_token_accuracy": 0.7929901480674744, "num_tokens": 16867827.0, "step": 7240, "train/ce_loss": 0.516572892665863 }, { "epoch": 0.7158394304923867, "step": 7240, "train/sim_loss": 0.0625 }, { "epoch": 0.7158394304923867, "step": 7240, "train/total_loss": 0.1141572892665863 }, { "entropy": 8.661676406860352, "epoch": 0.7159383033419023, "mean_token_accuracy": 0.6924939751625061, "num_tokens": 16873311.0, "step": 7241, "train/ce_loss": 1.3083157539367676 }, { "epoch": 0.7159383033419023, "step": 7241, "train/sim_loss": 0.0859375 }, { "epoch": 0.7159383033419023, "step": 7241, "train/total_loss": 0.21676908433437347 }, { "entropy": 9.006423950195312, "epoch": 0.7160371761914178, "mean_token_accuracy": 0.7820737957954407, "num_tokens": 16878342.0, "step": 7242, "train/ce_loss": 1.0763325691223145 }, { "epoch": 0.7160371761914178, "step": 7242, "train/sim_loss": 0.05859375 }, { "epoch": 0.7160371761914178, "step": 7242, "train/total_loss": 0.16622701287269592 }, { "entropy": 8.977418899536133, "epoch": 0.7161360490409333, "mean_token_accuracy": 0.7130681872367859, "num_tokens": 16883553.0, "step": 7243, "train/ce_loss": 1.2317839860916138 }, { "epoch": 0.7161360490409333, "step": 7243, "train/sim_loss": 0.1015625 }, { "epoch": 0.7161360490409333, "step": 7243, "train/total_loss": 0.2247408926486969 }, { "entropy": 8.649625778198242, "epoch": 0.7162349218904489, "mean_token_accuracy": 0.7537747025489807, "num_tokens": 16888914.0, "step": 7244, "train/ce_loss": 0.5398925542831421 }, { "epoch": 0.7162349218904489, "step": 7244, "train/sim_loss": 0.015625 }, { "epoch": 0.7162349218904489, "step": 7244, "train/total_loss": 0.06961426138877869 }, { "entropy": 8.799467086791992, "epoch": 0.7163337947399644, "mean_token_accuracy": 0.6811594367027283, "num_tokens": 16894264.0, "step": 7245, "train/ce_loss": 1.9662058353424072 }, { "epoch": 0.7163337947399644, "step": 7245, "train/sim_loss": 0.078125 }, { "epoch": 0.7163337947399644, "step": 7245, "train/total_loss": 0.2747455835342407 }, { "entropy": 8.755645751953125, "epoch": 0.7164326675894799, "mean_token_accuracy": 0.7402299046516418, "num_tokens": 16899628.0, "step": 7246, "train/ce_loss": 1.0392379760742188 }, { "epoch": 0.7164326675894799, "step": 7246, "train/sim_loss": 0.09375 }, { "epoch": 0.7164326675894799, "step": 7246, "train/total_loss": 0.19767379760742188 }, { "entropy": 9.167597770690918, "epoch": 0.7165315404389955, "mean_token_accuracy": 0.7532228231430054, "num_tokens": 16904611.0, "step": 7247, "train/ce_loss": 1.9028851738767116e-06 }, { "epoch": 0.7165315404389955, "step": 7247, "train/sim_loss": 0.08203125 }, { "epoch": 0.7165315404389955, "step": 7247, "train/total_loss": 0.08203144371509552 }, { "entropy": 9.476949691772461, "epoch": 0.716630413288511, "mean_token_accuracy": 0.7322275042533875, "num_tokens": 16909444.0, "step": 7248, "train/ce_loss": 6.365969511534786e-06 }, { "epoch": 0.716630413288511, "step": 7248, "train/sim_loss": 0.046875 }, { "epoch": 0.716630413288511, "step": 7248, "train/total_loss": 0.04687563702464104 }, { "entropy": 9.324455261230469, "epoch": 0.7167292861380264, "mean_token_accuracy": 0.8163716793060303, "num_tokens": 16914355.0, "step": 7249, "train/ce_loss": 1.9346645785844885e-05 }, { "epoch": 0.7167292861380264, "step": 7249, "train/sim_loss": 0.0625 }, { "epoch": 0.7167292861380264, "step": 7249, "train/total_loss": 0.0625019371509552 }, { "entropy": 8.972284317016602, "epoch": 0.716828158987542, "mean_token_accuracy": 0.746081531047821, "num_tokens": 16919452.0, "step": 7250, "train/ce_loss": 0.9190874695777893 }, { "epoch": 0.716828158987542, "step": 7250, "train/sim_loss": 0.04296875 }, { "epoch": 0.716828158987542, "step": 7250, "train/total_loss": 0.1348775029182434 }, { "entropy": 8.439289093017578, "epoch": 0.7169270318370575, "mean_token_accuracy": 0.735052764415741, "num_tokens": 16924781.0, "step": 7251, "train/ce_loss": 1.137518048286438 }, { "epoch": 0.7169270318370575, "step": 7251, "train/sim_loss": 0.08984375 }, { "epoch": 0.7169270318370575, "step": 7251, "train/total_loss": 0.20359554886817932 }, { "entropy": 8.556591033935547, "epoch": 0.717025904686573, "mean_token_accuracy": 0.709227442741394, "num_tokens": 16930205.0, "step": 7252, "train/ce_loss": 1.2083420753479004 }, { "epoch": 0.717025904686573, "step": 7252, "train/sim_loss": 0.0703125 }, { "epoch": 0.717025904686573, "step": 7252, "train/total_loss": 0.19114670157432556 }, { "entropy": 8.435426712036133, "epoch": 0.7171247775360886, "mean_token_accuracy": 0.7184079885482788, "num_tokens": 16935732.0, "step": 7253, "train/ce_loss": 0.6173237562179565 }, { "epoch": 0.7171247775360886, "step": 7253, "train/sim_loss": 0.078125 }, { "epoch": 0.7171247775360886, "step": 7253, "train/total_loss": 0.13985738158226013 }, { "entropy": 8.789794921875, "epoch": 0.7172236503856041, "mean_token_accuracy": 0.710659921169281, "num_tokens": 16940861.0, "step": 7254, "train/ce_loss": 1.0131810903549194 }, { "epoch": 0.7172236503856041, "step": 7254, "train/sim_loss": 0.05859375 }, { "epoch": 0.7172236503856041, "step": 7254, "train/total_loss": 0.1599118709564209 }, { "entropy": 8.52991008758545, "epoch": 0.7173225232351197, "mean_token_accuracy": 0.7250000238418579, "num_tokens": 16946073.0, "step": 7255, "train/ce_loss": 1.1255245208740234 }, { "epoch": 0.7173225232351197, "step": 7255, "train/sim_loss": 0.04296875 }, { "epoch": 0.7173225232351197, "step": 7255, "train/total_loss": 0.1555212140083313 }, { "entropy": 8.874298095703125, "epoch": 0.7174213960846352, "mean_token_accuracy": 0.7180851101875305, "num_tokens": 16951307.0, "step": 7256, "train/ce_loss": 1.0606045722961426 }, { "epoch": 0.7174213960846352, "step": 7256, "train/sim_loss": 0.05859375 }, { "epoch": 0.7174213960846352, "step": 7256, "train/total_loss": 0.1646542102098465 }, { "entropy": 8.793537139892578, "epoch": 0.7175202689341507, "mean_token_accuracy": 0.7648725509643555, "num_tokens": 16956482.0, "step": 7257, "train/ce_loss": 1.007021188735962 }, { "epoch": 0.7175202689341507, "step": 7257, "train/sim_loss": 0.04296875 }, { "epoch": 0.7175202689341507, "step": 7257, "train/total_loss": 0.14367087185382843 }, { "entropy": 8.745311737060547, "epoch": 0.7176191417836663, "mean_token_accuracy": 0.7070600390434265, "num_tokens": 16961909.0, "step": 7258, "train/ce_loss": 0.41854217648506165 }, { "epoch": 0.7176191417836663, "step": 7258, "train/sim_loss": 0.02734375 }, { "epoch": 0.7176191417836663, "step": 7258, "train/total_loss": 0.06919796764850616 }, { "entropy": 9.132467269897461, "epoch": 0.7177180146331817, "mean_token_accuracy": 0.7160278558731079, "num_tokens": 16966932.0, "step": 7259, "train/ce_loss": 2.1726157665252686 }, { "epoch": 0.7177180146331817, "step": 7259, "train/sim_loss": 0.07421875 }, { "epoch": 0.7177180146331817, "step": 7259, "train/total_loss": 0.29148033261299133 }, { "epoch": 0.7178168874826972, "grad_norm": 0.8078387379646301, "learning_rate": 8.207733768481432e-06, "loss": 0.1427, "step": 7260 }, { "entropy": 8.826803207397461, "epoch": 0.7178168874826972, "mean_token_accuracy": 0.707732617855072, "num_tokens": 16972171.0, "step": 7260, "train/ce_loss": 0.8902512788772583 }, { "epoch": 0.7178168874826972, "step": 7260, "train/sim_loss": 0.0625 }, { "epoch": 0.7178168874826972, "step": 7260, "train/total_loss": 0.15152513980865479 }, { "entropy": 8.603378295898438, "epoch": 0.7179157603322128, "mean_token_accuracy": 0.7553443908691406, "num_tokens": 16977437.0, "step": 7261, "train/ce_loss": 0.762908399105072 }, { "epoch": 0.7179157603322128, "step": 7261, "train/sim_loss": 0.04296875 }, { "epoch": 0.7179157603322128, "step": 7261, "train/total_loss": 0.11925958842039108 }, { "entropy": 9.200379371643066, "epoch": 0.7180146331817283, "mean_token_accuracy": 0.7368420958518982, "num_tokens": 16982388.0, "step": 7262, "train/ce_loss": 0.7837957143783569 }, { "epoch": 0.7180146331817283, "step": 7262, "train/sim_loss": 0.03515625 }, { "epoch": 0.7180146331817283, "step": 7262, "train/total_loss": 0.1135358214378357 }, { "entropy": 9.373811721801758, "epoch": 0.7181135060312438, "mean_token_accuracy": 0.7566137313842773, "num_tokens": 16987174.0, "step": 7263, "train/ce_loss": 3.704700475282152e-06 }, { "epoch": 0.7181135060312438, "step": 7263, "train/sim_loss": 0.01953125 }, { "epoch": 0.7181135060312438, "step": 7263, "train/total_loss": 0.019531620666384697 }, { "entropy": 8.62483024597168, "epoch": 0.7182123788807594, "mean_token_accuracy": 0.6995661854743958, "num_tokens": 16992550.0, "step": 7264, "train/ce_loss": 1.0524500608444214 }, { "epoch": 0.7182123788807594, "step": 7264, "train/sim_loss": 0.0546875 }, { "epoch": 0.7182123788807594, "step": 7264, "train/total_loss": 0.15993250906467438 }, { "entropy": 8.923933029174805, "epoch": 0.7183112517302749, "mean_token_accuracy": 0.7289433479309082, "num_tokens": 16997680.0, "step": 7265, "train/ce_loss": 0.675235390663147 }, { "epoch": 0.7183112517302749, "step": 7265, "train/sim_loss": 0.03515625 }, { "epoch": 0.7183112517302749, "step": 7265, "train/total_loss": 0.1026797890663147 }, { "entropy": 8.639825820922852, "epoch": 0.7184101245797904, "mean_token_accuracy": 0.735195517539978, "num_tokens": 17003217.0, "step": 7266, "train/ce_loss": 1.0886646509170532 }, { "epoch": 0.7184101245797904, "step": 7266, "train/sim_loss": 0.078125 }, { "epoch": 0.7184101245797904, "step": 7266, "train/total_loss": 0.18699146807193756 }, { "entropy": 8.842220306396484, "epoch": 0.718508997429306, "mean_token_accuracy": 0.7212121486663818, "num_tokens": 17008323.0, "step": 7267, "train/ce_loss": 1.6363429722332512e-06 }, { "epoch": 0.718508997429306, "step": 7267, "train/sim_loss": 0.03515625 }, { "epoch": 0.718508997429306, "step": 7267, "train/total_loss": 0.03515641391277313 }, { "entropy": 8.581207275390625, "epoch": 0.7186078702788214, "mean_token_accuracy": 0.7721046209335327, "num_tokens": 17013564.0, "step": 7268, "train/ce_loss": 0.8590487837791443 }, { "epoch": 0.7186078702788214, "step": 7268, "train/sim_loss": 0.0546875 }, { "epoch": 0.7186078702788214, "step": 7268, "train/total_loss": 0.14059238135814667 }, { "entropy": 8.644612312316895, "epoch": 0.7187067431283369, "mean_token_accuracy": 0.801672637462616, "num_tokens": 17018854.0, "step": 7269, "train/ce_loss": 0.5526071786880493 }, { "epoch": 0.7187067431283369, "step": 7269, "train/sim_loss": 0.046875 }, { "epoch": 0.7187067431283369, "step": 7269, "train/total_loss": 0.10213571786880493 }, { "entropy": 8.574300765991211, "epoch": 0.7188056159778525, "mean_token_accuracy": 0.6941431760787964, "num_tokens": 17024202.0, "step": 7270, "train/ce_loss": 0.7288708686828613 }, { "epoch": 0.7188056159778525, "step": 7270, "train/sim_loss": 0.07421875 }, { "epoch": 0.7188056159778525, "step": 7270, "train/total_loss": 0.1471058428287506 }, { "entropy": 9.051513671875, "epoch": 0.718904488827368, "mean_token_accuracy": 0.6909385323524475, "num_tokens": 17029259.0, "step": 7271, "train/ce_loss": 1.2106337547302246 }, { "epoch": 0.718904488827368, "step": 7271, "train/sim_loss": 0.02734375 }, { "epoch": 0.718904488827368, "step": 7271, "train/total_loss": 0.14840713143348694 }, { "entropy": 9.47685718536377, "epoch": 0.7190033616768835, "mean_token_accuracy": 0.7555066347122192, "num_tokens": 17034070.0, "step": 7272, "train/ce_loss": 1.5516616106033325 }, { "epoch": 0.7190033616768835, "step": 7272, "train/sim_loss": 0.046875 }, { "epoch": 0.7190033616768835, "step": 7272, "train/total_loss": 0.2020411640405655 }, { "entropy": 8.486614227294922, "epoch": 0.7191022345263991, "mean_token_accuracy": 0.8106796145439148, "num_tokens": 17039398.0, "step": 7273, "train/ce_loss": 0.6014370322227478 }, { "epoch": 0.7191022345263991, "step": 7273, "train/sim_loss": 0.0234375 }, { "epoch": 0.7191022345263991, "step": 7273, "train/total_loss": 0.08358120918273926 }, { "entropy": 8.953771591186523, "epoch": 0.7192011073759146, "mean_token_accuracy": 0.73884516954422, "num_tokens": 17044604.0, "step": 7274, "train/ce_loss": 1.0438170433044434 }, { "epoch": 0.7192011073759146, "step": 7274, "train/sim_loss": 0.01953125 }, { "epoch": 0.7192011073759146, "step": 7274, "train/total_loss": 0.12391295284032822 }, { "entropy": 8.621545791625977, "epoch": 0.7192999802254301, "mean_token_accuracy": 0.7087979912757874, "num_tokens": 17049927.0, "step": 7275, "train/ce_loss": 1.3095701932907104 }, { "epoch": 0.7192999802254301, "step": 7275, "train/sim_loss": 0.07421875 }, { "epoch": 0.7192999802254301, "step": 7275, "train/total_loss": 0.20517577230930328 }, { "entropy": 8.50368881225586, "epoch": 0.7193988530749457, "mean_token_accuracy": 0.7884427309036255, "num_tokens": 17055383.0, "step": 7276, "train/ce_loss": 0.5625127553939819 }, { "epoch": 0.7193988530749457, "step": 7276, "train/sim_loss": 0.078125 }, { "epoch": 0.7193988530749457, "step": 7276, "train/total_loss": 0.13437627255916595 }, { "entropy": 8.556852340698242, "epoch": 0.7194977259244612, "mean_token_accuracy": 0.7288557291030884, "num_tokens": 17060665.0, "step": 7277, "train/ce_loss": 0.8305554986000061 }, { "epoch": 0.7194977259244612, "step": 7277, "train/sim_loss": 0.0546875 }, { "epoch": 0.7194977259244612, "step": 7277, "train/total_loss": 0.1377430558204651 }, { "entropy": 8.74986743927002, "epoch": 0.7195965987739766, "mean_token_accuracy": 0.7279999852180481, "num_tokens": 17065820.0, "step": 7278, "train/ce_loss": 0.9675143361091614 }, { "epoch": 0.7195965987739766, "step": 7278, "train/sim_loss": 0.04296875 }, { "epoch": 0.7195965987739766, "step": 7278, "train/total_loss": 0.13972018659114838 }, { "entropy": 9.176345825195312, "epoch": 0.7196954716234922, "mean_token_accuracy": 0.7654028534889221, "num_tokens": 17070658.0, "step": 7279, "train/ce_loss": 9.67250616668025e-06 }, { "epoch": 0.7196954716234922, "step": 7279, "train/sim_loss": 0.04296875 }, { "epoch": 0.7196954716234922, "step": 7279, "train/total_loss": 0.0429697185754776 }, { "epoch": 0.7197943444730077, "grad_norm": 0.7864909172058105, "learning_rate": 8.202788903723484e-06, "loss": 0.135, "step": 7280 }, { "entropy": 8.759176254272461, "epoch": 0.7197943444730077, "mean_token_accuracy": 0.7441860437393188, "num_tokens": 17075801.0, "step": 7280, "train/ce_loss": 1.1582366228103638 }, { "epoch": 0.7197943444730077, "step": 7280, "train/sim_loss": 0.06640625 }, { "epoch": 0.7197943444730077, "step": 7280, "train/total_loss": 0.1822299063205719 }, { "entropy": 8.721479415893555, "epoch": 0.7198932173225232, "mean_token_accuracy": 0.8014440536499023, "num_tokens": 17081088.0, "step": 7281, "train/ce_loss": 0.3420823812484741 }, { "epoch": 0.7198932173225232, "step": 7281, "train/sim_loss": 0.046875 }, { "epoch": 0.7198932173225232, "step": 7281, "train/total_loss": 0.08108323812484741 }, { "entropy": 8.305360794067383, "epoch": 0.7199920901720388, "mean_token_accuracy": 0.7468030452728271, "num_tokens": 17086360.0, "step": 7282, "train/ce_loss": 1.2415152788162231 }, { "epoch": 0.7199920901720388, "step": 7282, "train/sim_loss": 0.0703125 }, { "epoch": 0.7199920901720388, "step": 7282, "train/total_loss": 0.19446402788162231 }, { "entropy": 8.77110481262207, "epoch": 0.7200909630215543, "mean_token_accuracy": 0.7856173515319824, "num_tokens": 17091557.0, "step": 7283, "train/ce_loss": 1.6788532093414688e-06 }, { "epoch": 0.7200909630215543, "step": 7283, "train/sim_loss": 0.03125 }, { "epoch": 0.7200909630215543, "step": 7283, "train/total_loss": 0.03125016763806343 }, { "entropy": 8.664463996887207, "epoch": 0.7201898358710698, "mean_token_accuracy": 0.7898550629615784, "num_tokens": 17096810.0, "step": 7284, "train/ce_loss": 0.7395745515823364 }, { "epoch": 0.7201898358710698, "step": 7284, "train/sim_loss": 0.01953125 }, { "epoch": 0.7201898358710698, "step": 7284, "train/total_loss": 0.09348870813846588 }, { "entropy": 8.494865417480469, "epoch": 0.7202887087205854, "mean_token_accuracy": 0.7291428446769714, "num_tokens": 17102316.0, "step": 7285, "train/ce_loss": 1.0033003091812134 }, { "epoch": 0.7202887087205854, "step": 7285, "train/sim_loss": 0.09765625 }, { "epoch": 0.7202887087205854, "step": 7285, "train/total_loss": 0.19798627495765686 }, { "entropy": 8.049318313598633, "epoch": 0.7203875815701009, "mean_token_accuracy": 0.7228571176528931, "num_tokens": 17107833.0, "step": 7286, "train/ce_loss": 0.5071789026260376 }, { "epoch": 0.7203875815701009, "step": 7286, "train/sim_loss": 0.05859375 }, { "epoch": 0.7203875815701009, "step": 7286, "train/total_loss": 0.10931164026260376 }, { "entropy": 8.946247100830078, "epoch": 0.7204864544196163, "mean_token_accuracy": 0.7163531184196472, "num_tokens": 17113141.0, "step": 7287, "train/ce_loss": 1.2641355991363525 }, { "epoch": 0.7204864544196163, "step": 7287, "train/sim_loss": 0.0625 }, { "epoch": 0.7204864544196163, "step": 7287, "train/total_loss": 0.18891356885433197 }, { "entropy": 8.590167045593262, "epoch": 0.7205853272691319, "mean_token_accuracy": 0.7274826765060425, "num_tokens": 17118500.0, "step": 7288, "train/ce_loss": 0.917041003704071 }, { "epoch": 0.7205853272691319, "step": 7288, "train/sim_loss": 0.0859375 }, { "epoch": 0.7205853272691319, "step": 7288, "train/total_loss": 0.1776416003704071 }, { "entropy": 8.500893592834473, "epoch": 0.7206842001186474, "mean_token_accuracy": 0.7496740818023682, "num_tokens": 17123731.0, "step": 7289, "train/ce_loss": 0.7513278126716614 }, { "epoch": 0.7206842001186474, "step": 7289, "train/sim_loss": 0.1484375 }, { "epoch": 0.7206842001186474, "step": 7289, "train/total_loss": 0.22357028722763062 }, { "entropy": 8.950748443603516, "epoch": 0.7207830729681629, "mean_token_accuracy": 0.7232796549797058, "num_tokens": 17128851.0, "step": 7290, "train/ce_loss": 1.2981815338134766 }, { "epoch": 0.7207830729681629, "step": 7290, "train/sim_loss": 0.1015625 }, { "epoch": 0.7207830729681629, "step": 7290, "train/total_loss": 0.2313806563615799 }, { "entropy": 8.955303192138672, "epoch": 0.7208819458176785, "mean_token_accuracy": 0.747706413269043, "num_tokens": 17133935.0, "step": 7291, "train/ce_loss": 0.7134157419204712 }, { "epoch": 0.7208819458176785, "step": 7291, "train/sim_loss": 0.0703125 }, { "epoch": 0.7208819458176785, "step": 7291, "train/total_loss": 0.14165407419204712 }, { "entropy": 9.262459754943848, "epoch": 0.720980818667194, "mean_token_accuracy": 0.7288888692855835, "num_tokens": 17138528.0, "step": 7292, "train/ce_loss": 3.491542884148657e-05 }, { "epoch": 0.720980818667194, "step": 7292, "train/sim_loss": 0.078125 }, { "epoch": 0.720980818667194, "step": 7292, "train/total_loss": 0.07812849432229996 }, { "entropy": 8.342453002929688, "epoch": 0.7210796915167095, "mean_token_accuracy": 0.6983184814453125, "num_tokens": 17143950.0, "step": 7293, "train/ce_loss": 0.5059213042259216 }, { "epoch": 0.7210796915167095, "step": 7293, "train/sim_loss": 0.03125 }, { "epoch": 0.7210796915167095, "step": 7293, "train/total_loss": 0.08184213191270828 }, { "entropy": 8.34473991394043, "epoch": 0.7211785643662251, "mean_token_accuracy": 0.7457447052001953, "num_tokens": 17149397.0, "step": 7294, "train/ce_loss": 1.351395606994629 }, { "epoch": 0.7211785643662251, "step": 7294, "train/sim_loss": 0.07421875 }, { "epoch": 0.7211785643662251, "step": 7294, "train/total_loss": 0.2093583196401596 }, { "entropy": 8.92292308807373, "epoch": 0.7212774372157406, "mean_token_accuracy": 0.7750343084335327, "num_tokens": 17154570.0, "step": 7295, "train/ce_loss": 0.6525868773460388 }, { "epoch": 0.7212774372157406, "step": 7295, "train/sim_loss": 0.05078125 }, { "epoch": 0.7212774372157406, "step": 7295, "train/total_loss": 0.11603993922472 }, { "entropy": 8.965505599975586, "epoch": 0.721376310065256, "mean_token_accuracy": 0.7849116921424866, "num_tokens": 17159649.0, "step": 7296, "train/ce_loss": 1.060815691947937 }, { "epoch": 0.721376310065256, "step": 7296, "train/sim_loss": 0.05078125 }, { "epoch": 0.721376310065256, "step": 7296, "train/total_loss": 0.15686282515525818 }, { "entropy": 8.488422393798828, "epoch": 0.7214751829147716, "mean_token_accuracy": 0.707975447177887, "num_tokens": 17164944.0, "step": 7297, "train/ce_loss": 0.5313419699668884 }, { "epoch": 0.7214751829147716, "step": 7297, "train/sim_loss": 0.01953125 }, { "epoch": 0.7214751829147716, "step": 7297, "train/total_loss": 0.07266545295715332 }, { "entropy": 8.473865509033203, "epoch": 0.7215740557642871, "mean_token_accuracy": 0.7348642945289612, "num_tokens": 17170337.0, "step": 7298, "train/ce_loss": 0.6764245629310608 }, { "epoch": 0.7215740557642871, "step": 7298, "train/sim_loss": 0.05078125 }, { "epoch": 0.7215740557642871, "step": 7298, "train/total_loss": 0.1184237077832222 }, { "entropy": 8.944363594055176, "epoch": 0.7216729286138026, "mean_token_accuracy": 0.75, "num_tokens": 17175397.0, "step": 7299, "train/ce_loss": 4.032572178402916e-06 }, { "epoch": 0.7216729286138026, "step": 7299, "train/sim_loss": 0.0390625 }, { "epoch": 0.7216729286138026, "step": 7299, "train/total_loss": 0.039062902331352234 }, { "epoch": 0.7217718014633182, "grad_norm": 0.6683271527290344, "learning_rate": 8.197844038965535e-06, "loss": 0.1385, "step": 7300 }, { "entropy": 8.849038124084473, "epoch": 0.7217718014633182, "mean_token_accuracy": 0.7812929749488831, "num_tokens": 17180597.0, "step": 7300, "train/ce_loss": 0.8105028867721558 }, { "epoch": 0.7217718014633182, "step": 7300, "train/sim_loss": 0.05859375 }, { "epoch": 0.7217718014633182, "step": 7300, "train/total_loss": 0.13964404165744781 }, { "entropy": 8.331449508666992, "epoch": 0.7218706743128337, "mean_token_accuracy": 0.7185500860214233, "num_tokens": 17186034.0, "step": 7301, "train/ce_loss": 0.6758619546890259 }, { "epoch": 0.7218706743128337, "step": 7301, "train/sim_loss": 0.05859375 }, { "epoch": 0.7218706743128337, "step": 7301, "train/total_loss": 0.12617994844913483 }, { "entropy": 9.267677307128906, "epoch": 0.7219695471623492, "mean_token_accuracy": 0.6851485371589661, "num_tokens": 17190994.0, "step": 7302, "train/ce_loss": 1.6048048734664917 }, { "epoch": 0.7219695471623492, "step": 7302, "train/sim_loss": 0.05859375 }, { "epoch": 0.7219695471623492, "step": 7302, "train/total_loss": 0.21907423436641693 }, { "entropy": 8.935868263244629, "epoch": 0.7220684200118648, "mean_token_accuracy": 0.7830769419670105, "num_tokens": 17196129.0, "step": 7303, "train/ce_loss": 2.780159775284119e-06 }, { "epoch": 0.7220684200118648, "step": 7303, "train/sim_loss": 0.0390625 }, { "epoch": 0.7220684200118648, "step": 7303, "train/total_loss": 0.039062779396772385 }, { "entropy": 8.983610153198242, "epoch": 0.7221672928613803, "mean_token_accuracy": 0.7756873965263367, "num_tokens": 17201279.0, "step": 7304, "train/ce_loss": 0.7668209671974182 }, { "epoch": 0.7221672928613803, "step": 7304, "train/sim_loss": 0.0703125 }, { "epoch": 0.7221672928613803, "step": 7304, "train/total_loss": 0.14699459075927734 }, { "entropy": 8.59881591796875, "epoch": 0.7222661657108957, "mean_token_accuracy": 0.7738232016563416, "num_tokens": 17206657.0, "step": 7305, "train/ce_loss": 1.3534693717956543 }, { "epoch": 0.7222661657108957, "step": 7305, "train/sim_loss": 0.0546875 }, { "epoch": 0.7222661657108957, "step": 7305, "train/total_loss": 0.1900344341993332 }, { "entropy": 8.371246337890625, "epoch": 0.7223650385604113, "mean_token_accuracy": 0.6880530714988708, "num_tokens": 17212033.0, "step": 7306, "train/ce_loss": 1.5280489921569824 }, { "epoch": 0.7223650385604113, "step": 7306, "train/sim_loss": 0.109375 }, { "epoch": 0.7223650385604113, "step": 7306, "train/total_loss": 0.2621799111366272 }, { "entropy": 8.451948165893555, "epoch": 0.7224639114099268, "mean_token_accuracy": 0.7366737723350525, "num_tokens": 17217434.0, "step": 7307, "train/ce_loss": 0.8553075790405273 }, { "epoch": 0.7224639114099268, "step": 7307, "train/sim_loss": 0.0859375 }, { "epoch": 0.7224639114099268, "step": 7307, "train/total_loss": 0.17146825790405273 }, { "entropy": 8.971637725830078, "epoch": 0.7225627842594423, "mean_token_accuracy": 0.7372488379478455, "num_tokens": 17222565.0, "step": 7308, "train/ce_loss": 1.1359513998031616 }, { "epoch": 0.7225627842594423, "step": 7308, "train/sim_loss": 0.046875 }, { "epoch": 0.7225627842594423, "step": 7308, "train/total_loss": 0.1604701429605484 }, { "entropy": 9.254190444946289, "epoch": 0.7226616571089579, "mean_token_accuracy": 0.7188612222671509, "num_tokens": 17227609.0, "step": 7309, "train/ce_loss": 1.335377812385559 }, { "epoch": 0.7226616571089579, "step": 7309, "train/sim_loss": 0.0546875 }, { "epoch": 0.7226616571089579, "step": 7309, "train/total_loss": 0.18822528421878815 }, { "entropy": 8.650724411010742, "epoch": 0.7227605299584734, "mean_token_accuracy": 0.7820025086402893, "num_tokens": 17232854.0, "step": 7310, "train/ce_loss": 0.7430127263069153 }, { "epoch": 0.7227605299584734, "step": 7310, "train/sim_loss": 0.0546875 }, { "epoch": 0.7227605299584734, "step": 7310, "train/total_loss": 0.12898877263069153 }, { "entropy": 8.497949600219727, "epoch": 0.7228594028079889, "mean_token_accuracy": 0.6936936974525452, "num_tokens": 17238307.0, "step": 7311, "train/ce_loss": 0.9521242380142212 }, { "epoch": 0.7228594028079889, "step": 7311, "train/sim_loss": 0.04296875 }, { "epoch": 0.7228594028079889, "step": 7311, "train/total_loss": 0.1381811797618866 }, { "entropy": 9.520938873291016, "epoch": 0.7229582756575045, "mean_token_accuracy": 0.7518072128295898, "num_tokens": 17243117.0, "step": 7312, "train/ce_loss": 1.2061069011688232 }, { "epoch": 0.7229582756575045, "step": 7312, "train/sim_loss": 0.046875 }, { "epoch": 0.7229582756575045, "step": 7312, "train/total_loss": 0.16748568415641785 }, { "entropy": 8.96164321899414, "epoch": 0.72305714850702, "mean_token_accuracy": 0.7125645279884338, "num_tokens": 17248119.0, "step": 7313, "train/ce_loss": 1.5244228839874268 }, { "epoch": 0.72305714850702, "step": 7313, "train/sim_loss": 0.0546875 }, { "epoch": 0.72305714850702, "step": 7313, "train/total_loss": 0.20712979137897491 }, { "entropy": 8.680285453796387, "epoch": 0.7231560213565355, "mean_token_accuracy": 0.7047146558761597, "num_tokens": 17253399.0, "step": 7314, "train/ce_loss": 2.0357666015625 }, { "epoch": 0.7231560213565355, "step": 7314, "train/sim_loss": 0.0625 }, { "epoch": 0.7231560213565355, "step": 7314, "train/total_loss": 0.2660766839981079 }, { "entropy": 8.502523422241211, "epoch": 0.723254894206051, "mean_token_accuracy": 0.7521008253097534, "num_tokens": 17258839.0, "step": 7315, "train/ce_loss": 1.0317186117172241 }, { "epoch": 0.723254894206051, "step": 7315, "train/sim_loss": 0.046875 }, { "epoch": 0.723254894206051, "step": 7315, "train/total_loss": 0.15004685521125793 }, { "entropy": 8.51579475402832, "epoch": 0.7233537670555665, "mean_token_accuracy": 0.7277904152870178, "num_tokens": 17264243.0, "step": 7316, "train/ce_loss": 0.7120223641395569 }, { "epoch": 0.7233537670555665, "step": 7316, "train/sim_loss": 0.08984375 }, { "epoch": 0.7233537670555665, "step": 7316, "train/total_loss": 0.16104599833488464 }, { "entropy": 8.494817733764648, "epoch": 0.723452639905082, "mean_token_accuracy": 0.7609195113182068, "num_tokens": 17269687.0, "step": 7317, "train/ce_loss": 0.6494558453559875 }, { "epoch": 0.723452639905082, "step": 7317, "train/sim_loss": 0.09765625 }, { "epoch": 0.723452639905082, "step": 7317, "train/total_loss": 0.16260182857513428 }, { "entropy": 8.299484252929688, "epoch": 0.7235515127545976, "mean_token_accuracy": 0.7377210259437561, "num_tokens": 17275225.0, "step": 7318, "train/ce_loss": 1.2110645771026611 }, { "epoch": 0.7235515127545976, "step": 7318, "train/sim_loss": 0.05859375 }, { "epoch": 0.7235515127545976, "step": 7318, "train/total_loss": 0.17970021069049835 }, { "entropy": 9.056045532226562, "epoch": 0.7236503856041131, "mean_token_accuracy": 0.759358286857605, "num_tokens": 17280252.0, "step": 7319, "train/ce_loss": 3.6766282391909044e-06 }, { "epoch": 0.7236503856041131, "step": 7319, "train/sim_loss": 0.03125 }, { "epoch": 0.7236503856041131, "step": 7319, "train/total_loss": 0.03125036880373955 }, { "epoch": 0.7237492584536286, "grad_norm": 0.7412518262863159, "learning_rate": 8.192899174207585e-06, "loss": 0.1361, "step": 7320 }, { "entropy": 8.961812973022461, "epoch": 0.7237492584536286, "mean_token_accuracy": 0.7068965435028076, "num_tokens": 17285276.0, "step": 7320, "train/ce_loss": 1.3280572891235352 }, { "epoch": 0.7237492584536286, "step": 7320, "train/sim_loss": 0.0703125 }, { "epoch": 0.7237492584536286, "step": 7320, "train/total_loss": 0.203118234872818 }, { "entropy": 8.624178886413574, "epoch": 0.7238481313031442, "mean_token_accuracy": 0.7888888716697693, "num_tokens": 17290638.0, "step": 7321, "train/ce_loss": 0.7367948293685913 }, { "epoch": 0.7238481313031442, "step": 7321, "train/sim_loss": 0.015625 }, { "epoch": 0.7238481313031442, "step": 7321, "train/total_loss": 0.08930448442697525 }, { "entropy": 8.59969711303711, "epoch": 0.7239470041526597, "mean_token_accuracy": 0.7271605134010315, "num_tokens": 17295983.0, "step": 7322, "train/ce_loss": 0.4919517934322357 }, { "epoch": 0.7239470041526597, "step": 7322, "train/sim_loss": 0.03125 }, { "epoch": 0.7239470041526597, "step": 7322, "train/total_loss": 0.08044518530368805 }, { "entropy": 9.034954071044922, "epoch": 0.7240458770021752, "mean_token_accuracy": 0.716911792755127, "num_tokens": 17300973.0, "step": 7323, "train/ce_loss": 0.9119350910186768 }, { "epoch": 0.7240458770021752, "step": 7323, "train/sim_loss": 0.1015625 }, { "epoch": 0.7240458770021752, "step": 7323, "train/total_loss": 0.19275601208209991 }, { "entropy": 8.415181159973145, "epoch": 0.7241447498516907, "mean_token_accuracy": 0.7527233362197876, "num_tokens": 17306318.0, "step": 7324, "train/ce_loss": 1.0075483322143555 }, { "epoch": 0.7241447498516907, "step": 7324, "train/sim_loss": 0.0625 }, { "epoch": 0.7241447498516907, "step": 7324, "train/total_loss": 0.16325482726097107 }, { "entropy": 8.788187980651855, "epoch": 0.7242436227012062, "mean_token_accuracy": 0.7599999904632568, "num_tokens": 17311610.0, "step": 7325, "train/ce_loss": 0.5460984706878662 }, { "epoch": 0.7242436227012062, "step": 7325, "train/sim_loss": 0.05078125 }, { "epoch": 0.7242436227012062, "step": 7325, "train/total_loss": 0.10539110004901886 }, { "entropy": 9.16337776184082, "epoch": 0.7243424955507217, "mean_token_accuracy": 0.7110186815261841, "num_tokens": 17316551.0, "step": 7326, "train/ce_loss": 1.2475454807281494 }, { "epoch": 0.7243424955507217, "step": 7326, "train/sim_loss": 0.0625 }, { "epoch": 0.7243424955507217, "step": 7326, "train/total_loss": 0.18725454807281494 }, { "entropy": 8.655111312866211, "epoch": 0.7244413684002373, "mean_token_accuracy": 0.7433751821517944, "num_tokens": 17321755.0, "step": 7327, "train/ce_loss": 1.4795926809310913 }, { "epoch": 0.7244413684002373, "step": 7327, "train/sim_loss": 0.0625 }, { "epoch": 0.7244413684002373, "step": 7327, "train/total_loss": 0.21045927703380585 }, { "entropy": 9.506175994873047, "epoch": 0.7245402412497528, "mean_token_accuracy": 0.6496163606643677, "num_tokens": 17326533.0, "step": 7328, "train/ce_loss": 1.2679378986358643 }, { "epoch": 0.7245402412497528, "step": 7328, "train/sim_loss": 0.04296875 }, { "epoch": 0.7245402412497528, "step": 7328, "train/total_loss": 0.1697625368833542 }, { "entropy": 8.775362014770508, "epoch": 0.7246391140992683, "mean_token_accuracy": 0.731218695640564, "num_tokens": 17331595.0, "step": 7329, "train/ce_loss": 4.083109615748981e-06 }, { "epoch": 0.7246391140992683, "step": 7329, "train/sim_loss": 0.02734375 }, { "epoch": 0.7246391140992683, "step": 7329, "train/total_loss": 0.02734415791928768 }, { "entropy": 8.553563117980957, "epoch": 0.7247379869487839, "mean_token_accuracy": 0.7808598875999451, "num_tokens": 17336795.0, "step": 7330, "train/ce_loss": 1.1927872896194458 }, { "epoch": 0.7247379869487839, "step": 7330, "train/sim_loss": 0.0546875 }, { "epoch": 0.7247379869487839, "step": 7330, "train/total_loss": 0.17396622896194458 }, { "entropy": 8.947246551513672, "epoch": 0.7248368597982994, "mean_token_accuracy": 0.6691957712173462, "num_tokens": 17341876.0, "step": 7331, "train/ce_loss": 9.290523053095967e-07 }, { "epoch": 0.7248368597982994, "step": 7331, "train/sim_loss": 0.03515625 }, { "epoch": 0.7248368597982994, "step": 7331, "train/total_loss": 0.03515634313225746 }, { "entropy": 8.291561126708984, "epoch": 0.7249357326478149, "mean_token_accuracy": 0.7085152864456177, "num_tokens": 17347287.0, "step": 7332, "train/ce_loss": 0.6597549915313721 }, { "epoch": 0.7249357326478149, "step": 7332, "train/sim_loss": 0.0859375 }, { "epoch": 0.7249357326478149, "step": 7332, "train/total_loss": 0.15191300213336945 }, { "entropy": 8.774089813232422, "epoch": 0.7250346054973305, "mean_token_accuracy": 0.7362204790115356, "num_tokens": 17352474.0, "step": 7333, "train/ce_loss": 0.8822577595710754 }, { "epoch": 0.7250346054973305, "step": 7333, "train/sim_loss": 0.08984375 }, { "epoch": 0.7250346054973305, "step": 7333, "train/total_loss": 0.17806953191757202 }, { "entropy": 9.008880615234375, "epoch": 0.7251334783468459, "mean_token_accuracy": 0.7876105904579163, "num_tokens": 17357450.0, "step": 7334, "train/ce_loss": 1.1267569065093994 }, { "epoch": 0.7251334783468459, "step": 7334, "train/sim_loss": 0.0625 }, { "epoch": 0.7251334783468459, "step": 7334, "train/total_loss": 0.17517569661140442 }, { "entropy": 8.721375465393066, "epoch": 0.7252323511963614, "mean_token_accuracy": 0.7621878981590271, "num_tokens": 17362752.0, "step": 7335, "train/ce_loss": 0.8404841423034668 }, { "epoch": 0.7252323511963614, "step": 7335, "train/sim_loss": 0.06640625 }, { "epoch": 0.7252323511963614, "step": 7335, "train/total_loss": 0.15045467019081116 }, { "entropy": 8.608890533447266, "epoch": 0.725331224045877, "mean_token_accuracy": 0.7049180269241333, "num_tokens": 17368205.0, "step": 7336, "train/ce_loss": 0.607792317867279 }, { "epoch": 0.725331224045877, "step": 7336, "train/sim_loss": 0.05078125 }, { "epoch": 0.725331224045877, "step": 7336, "train/total_loss": 0.11156047880649567 }, { "entropy": 8.911270141601562, "epoch": 0.7254300968953925, "mean_token_accuracy": 0.8027523159980774, "num_tokens": 17373265.0, "step": 7337, "train/ce_loss": 0.5394410490989685 }, { "epoch": 0.7254300968953925, "step": 7337, "train/sim_loss": 0.0390625 }, { "epoch": 0.7254300968953925, "step": 7337, "train/total_loss": 0.09300661087036133 }, { "entropy": 9.362253189086914, "epoch": 0.7255289697449081, "mean_token_accuracy": 0.8140589594841003, "num_tokens": 17378321.0, "step": 7338, "train/ce_loss": 2.5051738248293987e-06 }, { "epoch": 0.7255289697449081, "step": 7338, "train/sim_loss": 0.0234375 }, { "epoch": 0.7255289697449081, "step": 7338, "train/total_loss": 0.023437749594449997 }, { "entropy": 8.757553100585938, "epoch": 0.7256278425944236, "mean_token_accuracy": 0.6928281188011169, "num_tokens": 17383680.0, "step": 7339, "train/ce_loss": 1.2112091779708862 }, { "epoch": 0.7256278425944236, "step": 7339, "train/sim_loss": 0.06640625 }, { "epoch": 0.7256278425944236, "step": 7339, "train/total_loss": 0.18752717971801758 }, { "epoch": 0.7257267154439391, "grad_norm": 0.7014778852462769, "learning_rate": 8.187954309449638e-06, "loss": 0.1341, "step": 7340 }, { "entropy": 9.233248710632324, "epoch": 0.7257267154439391, "mean_token_accuracy": 0.723809540271759, "num_tokens": 17388614.0, "step": 7340, "train/ce_loss": 1.8752094507217407 }, { "epoch": 0.7257267154439391, "step": 7340, "train/sim_loss": 0.08203125 }, { "epoch": 0.7257267154439391, "step": 7340, "train/total_loss": 0.26955220103263855 }, { "entropy": 9.112678527832031, "epoch": 0.7258255882934547, "mean_token_accuracy": 0.7578397393226624, "num_tokens": 17393630.0, "step": 7341, "train/ce_loss": 1.0484298467636108 }, { "epoch": 0.7258255882934547, "step": 7341, "train/sim_loss": 0.0390625 }, { "epoch": 0.7258255882934547, "step": 7341, "train/total_loss": 0.14390549063682556 }, { "entropy": 9.32995319366455, "epoch": 0.7259244611429702, "mean_token_accuracy": 0.657706081867218, "num_tokens": 17398628.0, "step": 7342, "train/ce_loss": 1.2173696756362915 }, { "epoch": 0.7259244611429702, "step": 7342, "train/sim_loss": 0.04296875 }, { "epoch": 0.7259244611429702, "step": 7342, "train/total_loss": 0.16470572352409363 }, { "entropy": 8.545857429504395, "epoch": 0.7260233339924856, "mean_token_accuracy": 0.813034176826477, "num_tokens": 17404007.0, "step": 7343, "train/ce_loss": 0.4299650490283966 }, { "epoch": 0.7260233339924856, "step": 7343, "train/sim_loss": 0.015625 }, { "epoch": 0.7260233339924856, "step": 7343, "train/total_loss": 0.05862150713801384 }, { "entropy": 8.629613876342773, "epoch": 0.7261222068420012, "mean_token_accuracy": 0.7901785969734192, "num_tokens": 17409341.0, "step": 7344, "train/ce_loss": 0.8379325866699219 }, { "epoch": 0.7261222068420012, "step": 7344, "train/sim_loss": 0.0546875 }, { "epoch": 0.7261222068420012, "step": 7344, "train/total_loss": 0.1384807527065277 }, { "entropy": 8.712474822998047, "epoch": 0.7262210796915167, "mean_token_accuracy": 0.7073474526405334, "num_tokens": 17414572.0, "step": 7345, "train/ce_loss": 1.0608352422714233 }, { "epoch": 0.7262210796915167, "step": 7345, "train/sim_loss": 0.0703125 }, { "epoch": 0.7262210796915167, "step": 7345, "train/total_loss": 0.17639602720737457 }, { "entropy": 9.008268356323242, "epoch": 0.7263199525410322, "mean_token_accuracy": 0.7863247990608215, "num_tokens": 17419675.0, "step": 7346, "train/ce_loss": 0.894185483455658 }, { "epoch": 0.7263199525410322, "step": 7346, "train/sim_loss": 0.03125 }, { "epoch": 0.7263199525410322, "step": 7346, "train/total_loss": 0.12066855281591415 }, { "entropy": 8.653308868408203, "epoch": 0.7264188253905478, "mean_token_accuracy": 0.7242990732192993, "num_tokens": 17424975.0, "step": 7347, "train/ce_loss": 0.6597151756286621 }, { "epoch": 0.7264188253905478, "step": 7347, "train/sim_loss": 0.015625 }, { "epoch": 0.7264188253905478, "step": 7347, "train/total_loss": 0.08159651607275009 }, { "entropy": 8.451759338378906, "epoch": 0.7265176982400633, "mean_token_accuracy": 0.7743830680847168, "num_tokens": 17430523.0, "step": 7348, "train/ce_loss": 0.42508605122566223 }, { "epoch": 0.7265176982400633, "step": 7348, "train/sim_loss": 0.0390625 }, { "epoch": 0.7265176982400633, "step": 7348, "train/total_loss": 0.08157110214233398 }, { "entropy": 8.61266803741455, "epoch": 0.7266165710895788, "mean_token_accuracy": 0.7376623153686523, "num_tokens": 17435977.0, "step": 7349, "train/ce_loss": 1.0716931819915771 }, { "epoch": 0.7266165710895788, "step": 7349, "train/sim_loss": 0.02734375 }, { "epoch": 0.7266165710895788, "step": 7349, "train/total_loss": 0.13451308012008667 }, { "entropy": 8.098893165588379, "epoch": 0.7267154439390944, "mean_token_accuracy": 0.7740992903709412, "num_tokens": 17441465.0, "step": 7350, "train/ce_loss": 0.5998178720474243 }, { "epoch": 0.7267154439390944, "step": 7350, "train/sim_loss": 0.046875 }, { "epoch": 0.7267154439390944, "step": 7350, "train/total_loss": 0.10685679316520691 }, { "entropy": 9.199495315551758, "epoch": 0.7268143167886099, "mean_token_accuracy": 0.7086882591247559, "num_tokens": 17446501.0, "step": 7351, "train/ce_loss": 0.7616429924964905 }, { "epoch": 0.7268143167886099, "step": 7351, "train/sim_loss": 0.03125 }, { "epoch": 0.7268143167886099, "step": 7351, "train/total_loss": 0.10741429775953293 }, { "entropy": 9.160688400268555, "epoch": 0.7269131896381253, "mean_token_accuracy": 0.7421602606773376, "num_tokens": 17451525.0, "step": 7352, "train/ce_loss": 0.6872962713241577 }, { "epoch": 0.7269131896381253, "step": 7352, "train/sim_loss": 0.01171875 }, { "epoch": 0.7269131896381253, "step": 7352, "train/total_loss": 0.08044838160276413 }, { "entropy": 8.840906143188477, "epoch": 0.7270120624876409, "mean_token_accuracy": 0.8368263244628906, "num_tokens": 17456701.0, "step": 7353, "train/ce_loss": 0.7152250409126282 }, { "epoch": 0.7270120624876409, "step": 7353, "train/sim_loss": 0.09765625 }, { "epoch": 0.7270120624876409, "step": 7353, "train/total_loss": 0.16917875409126282 }, { "entropy": 8.859922409057617, "epoch": 0.7271109353371564, "mean_token_accuracy": 0.7874464988708496, "num_tokens": 17461877.0, "step": 7354, "train/ce_loss": 0.7153387069702148 }, { "epoch": 0.7271109353371564, "step": 7354, "train/sim_loss": 0.08203125 }, { "epoch": 0.7271109353371564, "step": 7354, "train/total_loss": 0.15356512367725372 }, { "entropy": 8.345499038696289, "epoch": 0.7272098081866719, "mean_token_accuracy": 0.7487636208534241, "num_tokens": 17467412.0, "step": 7355, "train/ce_loss": 0.3188781440258026 }, { "epoch": 0.7272098081866719, "step": 7355, "train/sim_loss": 0.0234375 }, { "epoch": 0.7272098081866719, "step": 7355, "train/total_loss": 0.05532531440258026 }, { "entropy": 8.838354110717773, "epoch": 0.7273086810361875, "mean_token_accuracy": 0.7466443181037903, "num_tokens": 17472610.0, "step": 7356, "train/ce_loss": 1.1068003177642822 }, { "epoch": 0.7273086810361875, "step": 7356, "train/sim_loss": 0.09765625 }, { "epoch": 0.7273086810361875, "step": 7356, "train/total_loss": 0.20833629369735718 }, { "entropy": 8.740463256835938, "epoch": 0.727407553885703, "mean_token_accuracy": 0.8121212124824524, "num_tokens": 17477790.0, "step": 7357, "train/ce_loss": 0.8163862824440002 }, { "epoch": 0.727407553885703, "step": 7357, "train/sim_loss": 0.0390625 }, { "epoch": 0.727407553885703, "step": 7357, "train/total_loss": 0.1207011267542839 }, { "entropy": 8.466141700744629, "epoch": 0.7275064267352185, "mean_token_accuracy": 0.7346513867378235, "num_tokens": 17483220.0, "step": 7358, "train/ce_loss": 0.9694086313247681 }, { "epoch": 0.7275064267352185, "step": 7358, "train/sim_loss": 0.0390625 }, { "epoch": 0.7275064267352185, "step": 7358, "train/total_loss": 0.13600337505340576 }, { "entropy": 9.09575080871582, "epoch": 0.7276052995847341, "mean_token_accuracy": 0.7530487775802612, "num_tokens": 17488329.0, "step": 7359, "train/ce_loss": 2.8264503271202557e-06 }, { "epoch": 0.7276052995847341, "step": 7359, "train/sim_loss": 0.02734375 }, { "epoch": 0.7276052995847341, "step": 7359, "train/total_loss": 0.027344033122062683 }, { "epoch": 0.7277041724342496, "grad_norm": 0.715436577796936, "learning_rate": 8.183009444691688e-06, "loss": 0.1286, "step": 7360 }, { "entropy": 8.299179077148438, "epoch": 0.7277041724342496, "mean_token_accuracy": 0.7663366198539734, "num_tokens": 17493841.0, "step": 7360, "train/ce_loss": 0.8519513010978699 }, { "epoch": 0.7277041724342496, "step": 7360, "train/sim_loss": 0.0390625 }, { "epoch": 0.7277041724342496, "step": 7360, "train/total_loss": 0.1242576315999031 }, { "entropy": 8.913398742675781, "epoch": 0.727803045283765, "mean_token_accuracy": 0.7658402323722839, "num_tokens": 17499035.0, "step": 7361, "train/ce_loss": 0.6264690160751343 }, { "epoch": 0.727803045283765, "step": 7361, "train/sim_loss": 0.02734375 }, { "epoch": 0.727803045283765, "step": 7361, "train/total_loss": 0.08999065309762955 }, { "entropy": 9.48381233215332, "epoch": 0.7279019181332806, "mean_token_accuracy": 0.7548746466636658, "num_tokens": 17503785.0, "step": 7362, "train/ce_loss": 1.407195031788433e-05 }, { "epoch": 0.7279019181332806, "step": 7362, "train/sim_loss": 0.0234375 }, { "epoch": 0.7279019181332806, "step": 7362, "train/total_loss": 0.02343890629708767 }, { "entropy": 9.410112380981445, "epoch": 0.7280007909827961, "mean_token_accuracy": 0.6206185817718506, "num_tokens": 17508709.0, "step": 7363, "train/ce_loss": 3.2126429232448572e-06 }, { "epoch": 0.7280007909827961, "step": 7363, "train/sim_loss": 0.0234375 }, { "epoch": 0.7280007909827961, "step": 7363, "train/total_loss": 0.023437820374965668 }, { "entropy": 8.998542785644531, "epoch": 0.7280996638323116, "mean_token_accuracy": 0.7198879718780518, "num_tokens": 17513839.0, "step": 7364, "train/ce_loss": 1.8226782083511353 }, { "epoch": 0.7280996638323116, "step": 7364, "train/sim_loss": 0.0390625 }, { "epoch": 0.7280996638323116, "step": 7364, "train/total_loss": 0.22133032977581024 }, { "entropy": 8.419729232788086, "epoch": 0.7281985366818272, "mean_token_accuracy": 0.7397260069847107, "num_tokens": 17519340.0, "step": 7365, "train/ce_loss": 0.5257212519645691 }, { "epoch": 0.7281985366818272, "step": 7365, "train/sim_loss": 0.015625 }, { "epoch": 0.7281985366818272, "step": 7365, "train/total_loss": 0.06819713115692139 }, { "entropy": 8.369890213012695, "epoch": 0.7282974095313427, "mean_token_accuracy": 0.7153284549713135, "num_tokens": 17524796.0, "step": 7366, "train/ce_loss": 1.0927273035049438 }, { "epoch": 0.7282974095313427, "step": 7366, "train/sim_loss": 0.0390625 }, { "epoch": 0.7282974095313427, "step": 7366, "train/total_loss": 0.14833523333072662 }, { "entropy": 9.245950698852539, "epoch": 0.7283962823808582, "mean_token_accuracy": 0.7298187613487244, "num_tokens": 17530022.0, "step": 7367, "train/ce_loss": 0.9309850335121155 }, { "epoch": 0.7283962823808582, "step": 7367, "train/sim_loss": 0.05078125 }, { "epoch": 0.7283962823808582, "step": 7367, "train/total_loss": 0.1438797563314438 }, { "entropy": 9.127169609069824, "epoch": 0.7284951552303738, "mean_token_accuracy": 0.7284172773361206, "num_tokens": 17535072.0, "step": 7368, "train/ce_loss": 1.8420739706925815e-06 }, { "epoch": 0.7284951552303738, "step": 7368, "train/sim_loss": 0.03125 }, { "epoch": 0.7284951552303738, "step": 7368, "train/total_loss": 0.031250182539224625 }, { "entropy": 8.681436538696289, "epoch": 0.7285940280798893, "mean_token_accuracy": 0.7388613820075989, "num_tokens": 17540395.0, "step": 7369, "train/ce_loss": 1.1861965656280518 }, { "epoch": 0.7285940280798893, "step": 7369, "train/sim_loss": 0.046875 }, { "epoch": 0.7285940280798893, "step": 7369, "train/total_loss": 0.1654946506023407 }, { "entropy": 9.279239654541016, "epoch": 0.7286929009294048, "mean_token_accuracy": 0.7154639363288879, "num_tokens": 17545286.0, "step": 7370, "train/ce_loss": 1.6929576531765633e-06 }, { "epoch": 0.7286929009294048, "step": 7370, "train/sim_loss": 0.01953125 }, { "epoch": 0.7286929009294048, "step": 7370, "train/total_loss": 0.01953141950070858 }, { "entropy": 8.704438209533691, "epoch": 0.7287917737789203, "mean_token_accuracy": 0.7281323671340942, "num_tokens": 17550780.0, "step": 7371, "train/ce_loss": 0.9354274272918701 }, { "epoch": 0.7287917737789203, "step": 7371, "train/sim_loss": 0.08984375 }, { "epoch": 0.7287917737789203, "step": 7371, "train/total_loss": 0.18338650465011597 }, { "entropy": 9.061699867248535, "epoch": 0.7288906466284358, "mean_token_accuracy": 0.7403684854507446, "num_tokens": 17555835.0, "step": 7372, "train/ce_loss": 0.9406352043151855 }, { "epoch": 0.7288906466284358, "step": 7372, "train/sim_loss": 0.015625 }, { "epoch": 0.7288906466284358, "step": 7372, "train/total_loss": 0.10968852043151855 }, { "entropy": 8.256986618041992, "epoch": 0.7289895194779513, "mean_token_accuracy": 0.8081587553024292, "num_tokens": 17561260.0, "step": 7373, "train/ce_loss": 0.6097618937492371 }, { "epoch": 0.7289895194779513, "step": 7373, "train/sim_loss": 0.06640625 }, { "epoch": 0.7289895194779513, "step": 7373, "train/total_loss": 0.12738244235515594 }, { "entropy": 9.185291290283203, "epoch": 0.7290883923274669, "mean_token_accuracy": 0.7113970518112183, "num_tokens": 17566203.0, "step": 7374, "train/ce_loss": 1.5345923900604248 }, { "epoch": 0.7290883923274669, "step": 7374, "train/sim_loss": 0.0625 }, { "epoch": 0.7290883923274669, "step": 7374, "train/total_loss": 0.21595923602581024 }, { "entropy": 8.327561378479004, "epoch": 0.7291872651769824, "mean_token_accuracy": 0.7195817232131958, "num_tokens": 17571715.0, "step": 7375, "train/ce_loss": 0.7341464161872864 }, { "epoch": 0.7291872651769824, "step": 7375, "train/sim_loss": 0.015625 }, { "epoch": 0.7291872651769824, "step": 7375, "train/total_loss": 0.089039646089077 }, { "entropy": 9.330865859985352, "epoch": 0.7292861380264979, "mean_token_accuracy": 0.6661211252212524, "num_tokens": 17576749.0, "step": 7376, "train/ce_loss": 1.2426577806472778 }, { "epoch": 0.7292861380264979, "step": 7376, "train/sim_loss": 0.0859375 }, { "epoch": 0.7292861380264979, "step": 7376, "train/total_loss": 0.21020328998565674 }, { "entropy": 8.291744232177734, "epoch": 0.7293850108760135, "mean_token_accuracy": 0.7259752750396729, "num_tokens": 17582272.0, "step": 7377, "train/ce_loss": 0.8136835098266602 }, { "epoch": 0.7293850108760135, "step": 7377, "train/sim_loss": 0.0859375 }, { "epoch": 0.7293850108760135, "step": 7377, "train/total_loss": 0.1673058569431305 }, { "entropy": 9.165674209594727, "epoch": 0.729483883725529, "mean_token_accuracy": 0.7454844117164612, "num_tokens": 17587321.0, "step": 7378, "train/ce_loss": 1.0950554609298706 }, { "epoch": 0.729483883725529, "step": 7378, "train/sim_loss": 0.08984375 }, { "epoch": 0.729483883725529, "step": 7378, "train/total_loss": 0.1993492990732193 }, { "entropy": 9.01347541809082, "epoch": 0.7295827565750445, "mean_token_accuracy": 0.6778916716575623, "num_tokens": 17592451.0, "step": 7379, "train/ce_loss": 1.272599458694458 }, { "epoch": 0.7295827565750445, "step": 7379, "train/sim_loss": 0.05859375 }, { "epoch": 0.7295827565750445, "step": 7379, "train/total_loss": 0.18585370481014252 }, { "epoch": 0.72968162942456, "grad_norm": 0.7320528030395508, "learning_rate": 8.17806457993374e-06, "loss": 0.1383, "step": 7380 }, { "entropy": 9.084978103637695, "epoch": 0.72968162942456, "mean_token_accuracy": 0.8264462947845459, "num_tokens": 17597583.0, "step": 7380, "train/ce_loss": 0.6765223145484924 }, { "epoch": 0.72968162942456, "step": 7380, "train/sim_loss": 0.0390625 }, { "epoch": 0.72968162942456, "step": 7380, "train/total_loss": 0.10671473294496536 }, { "entropy": 9.180556297302246, "epoch": 0.7297805022740755, "mean_token_accuracy": 0.711033284664154, "num_tokens": 17602516.0, "step": 7381, "train/ce_loss": 1.0049020051956177 }, { "epoch": 0.7297805022740755, "step": 7381, "train/sim_loss": 0.05078125 }, { "epoch": 0.7297805022740755, "step": 7381, "train/total_loss": 0.15127146244049072 }, { "entropy": 8.788009643554688, "epoch": 0.729879375123591, "mean_token_accuracy": 0.692307710647583, "num_tokens": 17607628.0, "step": 7382, "train/ce_loss": 1.089476227760315 }, { "epoch": 0.729879375123591, "step": 7382, "train/sim_loss": 0.07421875 }, { "epoch": 0.729879375123591, "step": 7382, "train/total_loss": 0.18316638469696045 }, { "entropy": 8.832174301147461, "epoch": 0.7299782479731066, "mean_token_accuracy": 0.791023850440979, "num_tokens": 17612691.0, "step": 7383, "train/ce_loss": 0.3496512472629547 }, { "epoch": 0.7299782479731066, "step": 7383, "train/sim_loss": 0.03515625 }, { "epoch": 0.7299782479731066, "step": 7383, "train/total_loss": 0.07012137770652771 }, { "entropy": 8.969341278076172, "epoch": 0.7300771208226221, "mean_token_accuracy": 0.757656455039978, "num_tokens": 17617886.0, "step": 7384, "train/ce_loss": 1.3367009162902832 }, { "epoch": 0.7300771208226221, "step": 7384, "train/sim_loss": 0.0625 }, { "epoch": 0.7300771208226221, "step": 7384, "train/total_loss": 0.19617009162902832 }, { "entropy": 9.211132049560547, "epoch": 0.7301759936721376, "mean_token_accuracy": 0.7120315432548523, "num_tokens": 17622788.0, "step": 7385, "train/ce_loss": 1.2451320886611938 }, { "epoch": 0.7301759936721376, "step": 7385, "train/sim_loss": 0.0546875 }, { "epoch": 0.7301759936721376, "step": 7385, "train/total_loss": 0.17920070886611938 }, { "entropy": 8.61956787109375, "epoch": 0.7302748665216532, "mean_token_accuracy": 0.7086614370346069, "num_tokens": 17627988.0, "step": 7386, "train/ce_loss": 1.3923940658569336 }, { "epoch": 0.7302748665216532, "step": 7386, "train/sim_loss": 0.08203125 }, { "epoch": 0.7302748665216532, "step": 7386, "train/total_loss": 0.22127066552639008 }, { "entropy": 8.674093246459961, "epoch": 0.7303737393711687, "mean_token_accuracy": 0.7386934757232666, "num_tokens": 17633417.0, "step": 7387, "train/ce_loss": 0.6746419668197632 }, { "epoch": 0.7303737393711687, "step": 7387, "train/sim_loss": 0.08984375 }, { "epoch": 0.7303737393711687, "step": 7387, "train/total_loss": 0.1573079526424408 }, { "entropy": 8.782123565673828, "epoch": 0.7304726122206842, "mean_token_accuracy": 0.6891891956329346, "num_tokens": 17638583.0, "step": 7388, "train/ce_loss": 1.1684306859970093 }, { "epoch": 0.7304726122206842, "step": 7388, "train/sim_loss": 0.04296875 }, { "epoch": 0.7304726122206842, "step": 7388, "train/total_loss": 0.1598118245601654 }, { "entropy": 8.279651641845703, "epoch": 0.7305714850701998, "mean_token_accuracy": 0.7507629990577698, "num_tokens": 17644035.0, "step": 7389, "train/ce_loss": 0.5885629653930664 }, { "epoch": 0.7305714850701998, "step": 7389, "train/sim_loss": 0.03515625 }, { "epoch": 0.7305714850701998, "step": 7389, "train/total_loss": 0.0940125435590744 }, { "entropy": 8.533639907836914, "epoch": 0.7306703579197152, "mean_token_accuracy": 0.6977567672729492, "num_tokens": 17649324.0, "step": 7390, "train/ce_loss": 1.4970142841339111 }, { "epoch": 0.7306703579197152, "step": 7390, "train/sim_loss": 0.03125 }, { "epoch": 0.7306703579197152, "step": 7390, "train/total_loss": 0.18095143139362335 }, { "entropy": 8.730866432189941, "epoch": 0.7307692307692307, "mean_token_accuracy": 0.78311687707901, "num_tokens": 17654555.0, "step": 7391, "train/ce_loss": 0.7643266916275024 }, { "epoch": 0.7307692307692307, "step": 7391, "train/sim_loss": 0.05078125 }, { "epoch": 0.7307692307692307, "step": 7391, "train/total_loss": 0.12721392512321472 }, { "entropy": 8.391874313354492, "epoch": 0.7308681036187463, "mean_token_accuracy": 0.7694672346115112, "num_tokens": 17660014.0, "step": 7392, "train/ce_loss": 0.7122501134872437 }, { "epoch": 0.7308681036187463, "step": 7392, "train/sim_loss": 0.01953125 }, { "epoch": 0.7308681036187463, "step": 7392, "train/total_loss": 0.09075625985860825 }, { "entropy": 8.80854320526123, "epoch": 0.7309669764682618, "mean_token_accuracy": 0.7176287174224854, "num_tokens": 17665096.0, "step": 7393, "train/ce_loss": 2.03812837600708 }, { "epoch": 0.7309669764682618, "step": 7393, "train/sim_loss": 0.0859375 }, { "epoch": 0.7309669764682618, "step": 7393, "train/total_loss": 0.289750337600708 }, { "entropy": 8.550348281860352, "epoch": 0.7310658493177773, "mean_token_accuracy": 0.718826413154602, "num_tokens": 17670382.0, "step": 7394, "train/ce_loss": 0.45230749249458313 }, { "epoch": 0.7310658493177773, "step": 7394, "train/sim_loss": 0.046875 }, { "epoch": 0.7310658493177773, "step": 7394, "train/total_loss": 0.09210574626922607 }, { "entropy": 8.48996353149414, "epoch": 0.7311647221672929, "mean_token_accuracy": 0.7989795804023743, "num_tokens": 17675835.0, "step": 7395, "train/ce_loss": 0.4476233720779419 }, { "epoch": 0.7311647221672929, "step": 7395, "train/sim_loss": 0.02734375 }, { "epoch": 0.7311647221672929, "step": 7395, "train/total_loss": 0.07210609316825867 }, { "entropy": 8.452526092529297, "epoch": 0.7312635950168084, "mean_token_accuracy": 0.7200854420661926, "num_tokens": 17681280.0, "step": 7396, "train/ce_loss": 0.8974115252494812 }, { "epoch": 0.7312635950168084, "step": 7396, "train/sim_loss": 0.0703125 }, { "epoch": 0.7312635950168084, "step": 7396, "train/total_loss": 0.16005365550518036 }, { "entropy": 8.42809009552002, "epoch": 0.7313624678663239, "mean_token_accuracy": 0.7561235427856445, "num_tokens": 17686692.0, "step": 7397, "train/ce_loss": 0.931861162185669 }, { "epoch": 0.7313624678663239, "step": 7397, "train/sim_loss": 0.08203125 }, { "epoch": 0.7313624678663239, "step": 7397, "train/total_loss": 0.17521736025810242 }, { "entropy": 8.542633056640625, "epoch": 0.7314613407158395, "mean_token_accuracy": 0.746835470199585, "num_tokens": 17691887.0, "step": 7398, "train/ce_loss": 1.00189208984375 }, { "epoch": 0.7314613407158395, "step": 7398, "train/sim_loss": 0.05078125 }, { "epoch": 0.7314613407158395, "step": 7398, "train/total_loss": 0.150970458984375 }, { "entropy": 8.98475170135498, "epoch": 0.7315602135653549, "mean_token_accuracy": 0.7868338823318481, "num_tokens": 17696949.0, "step": 7399, "train/ce_loss": 3.0302765026135603e-06 }, { "epoch": 0.7315602135653549, "step": 7399, "train/sim_loss": 0.01953125 }, { "epoch": 0.7315602135653549, "step": 7399, "train/total_loss": 0.019531553611159325 }, { "epoch": 0.7316590864148704, "grad_norm": 0.5891684889793396, "learning_rate": 8.173119715175791e-06, "loss": 0.1299, "step": 7400 }, { "entropy": 8.926944732666016, "epoch": 0.7316590864148704, "mean_token_accuracy": 0.711240291595459, "num_tokens": 17701897.0, "step": 7400, "train/ce_loss": 1.4225362539291382 }, { "epoch": 0.7316590864148704, "step": 7400, "train/sim_loss": 0.06640625 }, { "epoch": 0.7316590864148704, "step": 7400, "train/total_loss": 0.20865987241268158 }, { "entropy": 8.862375259399414, "epoch": 0.731757959264386, "mean_token_accuracy": 0.7735247015953064, "num_tokens": 17707019.0, "step": 7401, "train/ce_loss": 0.6153345704078674 }, { "epoch": 0.731757959264386, "step": 7401, "train/sim_loss": 0.09375 }, { "epoch": 0.731757959264386, "step": 7401, "train/total_loss": 0.15528345108032227 }, { "entropy": 8.799311637878418, "epoch": 0.7318568321139015, "mean_token_accuracy": 0.6855955719947815, "num_tokens": 17712364.0, "step": 7402, "train/ce_loss": 1.2858757972717285 }, { "epoch": 0.7318568321139015, "step": 7402, "train/sim_loss": 0.11328125 }, { "epoch": 0.7318568321139015, "step": 7402, "train/total_loss": 0.24186883866786957 }, { "entropy": 8.513522148132324, "epoch": 0.731955704963417, "mean_token_accuracy": 0.7071239948272705, "num_tokens": 17717553.0, "step": 7403, "train/ce_loss": 0.8545153737068176 }, { "epoch": 0.731955704963417, "step": 7403, "train/sim_loss": 0.0234375 }, { "epoch": 0.731955704963417, "step": 7403, "train/total_loss": 0.10888903588056564 }, { "entropy": 8.651004791259766, "epoch": 0.7320545778129326, "mean_token_accuracy": 0.8285714387893677, "num_tokens": 17722810.0, "step": 7404, "train/ce_loss": 0.6369662880897522 }, { "epoch": 0.7320545778129326, "step": 7404, "train/sim_loss": 0.015625 }, { "epoch": 0.7320545778129326, "step": 7404, "train/total_loss": 0.07932163029909134 }, { "entropy": 8.672464370727539, "epoch": 0.7321534506624481, "mean_token_accuracy": 0.736774206161499, "num_tokens": 17728047.0, "step": 7405, "train/ce_loss": 0.9165966510772705 }, { "epoch": 0.7321534506624481, "step": 7405, "train/sim_loss": 0.0234375 }, { "epoch": 0.7321534506624481, "step": 7405, "train/total_loss": 0.11509716510772705 }, { "entropy": 8.93012523651123, "epoch": 0.7322523235119636, "mean_token_accuracy": 0.6988950371742249, "num_tokens": 17733210.0, "step": 7406, "train/ce_loss": 0.8794297575950623 }, { "epoch": 0.7322523235119636, "step": 7406, "train/sim_loss": 0.06640625 }, { "epoch": 0.7322523235119636, "step": 7406, "train/total_loss": 0.15434923768043518 }, { "entropy": 8.988134384155273, "epoch": 0.7323511963614792, "mean_token_accuracy": 0.6939501762390137, "num_tokens": 17738243.0, "step": 7407, "train/ce_loss": 1.4761024713516235 }, { "epoch": 0.7323511963614792, "step": 7407, "train/sim_loss": 0.07421875 }, { "epoch": 0.7323511963614792, "step": 7407, "train/total_loss": 0.22182899713516235 }, { "entropy": 8.872218132019043, "epoch": 0.7324500692109946, "mean_token_accuracy": 0.715179979801178, "num_tokens": 17743510.0, "step": 7408, "train/ce_loss": 1.2351499795913696 }, { "epoch": 0.7324500692109946, "step": 7408, "train/sim_loss": 0.08203125 }, { "epoch": 0.7324500692109946, "step": 7408, "train/total_loss": 0.20554625988006592 }, { "entropy": 8.71223258972168, "epoch": 0.7325489420605101, "mean_token_accuracy": 0.7410179376602173, "num_tokens": 17748656.0, "step": 7409, "train/ce_loss": 0.929199755191803 }, { "epoch": 0.7325489420605101, "step": 7409, "train/sim_loss": 0.0234375 }, { "epoch": 0.7325489420605101, "step": 7409, "train/total_loss": 0.1163574755191803 }, { "entropy": 8.58144760131836, "epoch": 0.7326478149100257, "mean_token_accuracy": 0.7557160258293152, "num_tokens": 17753983.0, "step": 7410, "train/ce_loss": 0.9028260111808777 }, { "epoch": 0.7326478149100257, "step": 7410, "train/sim_loss": 0.0546875 }, { "epoch": 0.7326478149100257, "step": 7410, "train/total_loss": 0.14497010409832 }, { "entropy": 8.340339660644531, "epoch": 0.7327466877595412, "mean_token_accuracy": 0.7973421812057495, "num_tokens": 17759376.0, "step": 7411, "train/ce_loss": 0.46473783254623413 }, { "epoch": 0.7327466877595412, "step": 7411, "train/sim_loss": 0.02734375 }, { "epoch": 0.7327466877595412, "step": 7411, "train/total_loss": 0.07381753623485565 }, { "entropy": 8.653460502624512, "epoch": 0.7328455606090567, "mean_token_accuracy": 0.6598424911499023, "num_tokens": 17764447.0, "step": 7412, "train/ce_loss": 2.751971483230591 }, { "epoch": 0.7328455606090567, "step": 7412, "train/sim_loss": 0.05859375 }, { "epoch": 0.7328455606090567, "step": 7412, "train/total_loss": 0.3337908983230591 }, { "entropy": 8.45848274230957, "epoch": 0.7329444334585723, "mean_token_accuracy": 0.7096070051193237, "num_tokens": 17769851.0, "step": 7413, "train/ce_loss": 0.9773561358451843 }, { "epoch": 0.7329444334585723, "step": 7413, "train/sim_loss": 0.09375 }, { "epoch": 0.7329444334585723, "step": 7413, "train/total_loss": 0.19148561358451843 }, { "entropy": 8.733375549316406, "epoch": 0.7330433063080878, "mean_token_accuracy": 0.8272109031677246, "num_tokens": 17775073.0, "step": 7414, "train/ce_loss": 0.7675660252571106 }, { "epoch": 0.7330433063080878, "step": 7414, "train/sim_loss": 0.015625 }, { "epoch": 0.7330433063080878, "step": 7414, "train/total_loss": 0.09238160401582718 }, { "entropy": 8.829120635986328, "epoch": 0.7331421791576033, "mean_token_accuracy": 0.7294617295265198, "num_tokens": 17780221.0, "step": 7415, "train/ce_loss": 0.580898106098175 }, { "epoch": 0.7331421791576033, "step": 7415, "train/sim_loss": 0.0625 }, { "epoch": 0.7331421791576033, "step": 7415, "train/total_loss": 0.12058980762958527 }, { "entropy": 8.948328018188477, "epoch": 0.7332410520071189, "mean_token_accuracy": 0.7348703145980835, "num_tokens": 17785351.0, "step": 7416, "train/ce_loss": 1.471127986907959 }, { "epoch": 0.7332410520071189, "step": 7416, "train/sim_loss": 0.03125 }, { "epoch": 0.7332410520071189, "step": 7416, "train/total_loss": 0.17836280167102814 }, { "entropy": 9.177257537841797, "epoch": 0.7333399248566344, "mean_token_accuracy": 0.7938596606254578, "num_tokens": 17790211.0, "step": 7417, "train/ce_loss": 3.0804694688413292e-06 }, { "epoch": 0.7333399248566344, "step": 7417, "train/sim_loss": 0.0546875 }, { "epoch": 0.7333399248566344, "step": 7417, "train/total_loss": 0.05468780919909477 }, { "entropy": 8.93316650390625, "epoch": 0.7334387977061498, "mean_token_accuracy": 0.6968085169792175, "num_tokens": 17795436.0, "step": 7418, "train/ce_loss": 0.7184810042381287 }, { "epoch": 0.7334387977061498, "step": 7418, "train/sim_loss": 0.01953125 }, { "epoch": 0.7334387977061498, "step": 7418, "train/total_loss": 0.09137935191392899 }, { "entropy": 8.684490203857422, "epoch": 0.7335376705556654, "mean_token_accuracy": 0.7452229261398315, "num_tokens": 17800717.0, "step": 7419, "train/ce_loss": 0.981926679611206 }, { "epoch": 0.7335376705556654, "step": 7419, "train/sim_loss": 0.03515625 }, { "epoch": 0.7335376705556654, "step": 7419, "train/total_loss": 0.13334891200065613 }, { "epoch": 0.7336365434051809, "grad_norm": 0.6303120851516724, "learning_rate": 8.168174850417841e-06, "loss": 0.1366, "step": 7420 }, { "entropy": 8.528745651245117, "epoch": 0.7336365434051809, "mean_token_accuracy": 0.7373613119125366, "num_tokens": 17806026.0, "step": 7420, "train/ce_loss": 1.3812599182128906 }, { "epoch": 0.7336365434051809, "step": 7420, "train/sim_loss": 0.12109375 }, { "epoch": 0.7336365434051809, "step": 7420, "train/total_loss": 0.259219765663147 }, { "entropy": 8.751842498779297, "epoch": 0.7337354162546965, "mean_token_accuracy": 0.8198433518409729, "num_tokens": 17811277.0, "step": 7421, "train/ce_loss": 0.6977095007896423 }, { "epoch": 0.7337354162546965, "step": 7421, "train/sim_loss": 0.0625 }, { "epoch": 0.7337354162546965, "step": 7421, "train/total_loss": 0.1322709619998932 }, { "entropy": 8.914030075073242, "epoch": 0.733834289104212, "mean_token_accuracy": 0.7413073778152466, "num_tokens": 17816436.0, "step": 7422, "train/ce_loss": 0.6798083186149597 }, { "epoch": 0.733834289104212, "step": 7422, "train/sim_loss": 0.08203125 }, { "epoch": 0.733834289104212, "step": 7422, "train/total_loss": 0.1500120759010315 }, { "entropy": 8.907278060913086, "epoch": 0.7339331619537275, "mean_token_accuracy": 0.7296072244644165, "num_tokens": 17821531.0, "step": 7423, "train/ce_loss": 1.2015002965927124 }, { "epoch": 0.7339331619537275, "step": 7423, "train/sim_loss": 0.03515625 }, { "epoch": 0.7339331619537275, "step": 7423, "train/total_loss": 0.15530627965927124 }, { "entropy": 8.852396011352539, "epoch": 0.7340320348032431, "mean_token_accuracy": 0.7387606501579285, "num_tokens": 17826820.0, "step": 7424, "train/ce_loss": 0.818252444267273 }, { "epoch": 0.7340320348032431, "step": 7424, "train/sim_loss": 0.08984375 }, { "epoch": 0.7340320348032431, "step": 7424, "train/total_loss": 0.17166900634765625 }, { "entropy": 8.934038162231445, "epoch": 0.7341309076527586, "mean_token_accuracy": 0.6975398063659668, "num_tokens": 17831949.0, "step": 7425, "train/ce_loss": 2.117769718170166 }, { "epoch": 0.7341309076527586, "step": 7425, "train/sim_loss": 0.1171875 }, { "epoch": 0.7341309076527586, "step": 7425, "train/total_loss": 0.3289644718170166 }, { "entropy": 8.69120979309082, "epoch": 0.734229780502274, "mean_token_accuracy": 0.7587500214576721, "num_tokens": 17837188.0, "step": 7426, "train/ce_loss": 0.7669753432273865 }, { "epoch": 0.734229780502274, "step": 7426, "train/sim_loss": 0.04296875 }, { "epoch": 0.734229780502274, "step": 7426, "train/total_loss": 0.11966628581285477 }, { "entropy": 8.386311531066895, "epoch": 0.7343286533517897, "mean_token_accuracy": 0.7668508291244507, "num_tokens": 17842613.0, "step": 7427, "train/ce_loss": 0.4831239581108093 }, { "epoch": 0.7343286533517897, "step": 7427, "train/sim_loss": 0.015625 }, { "epoch": 0.7343286533517897, "step": 7427, "train/total_loss": 0.06393739581108093 }, { "entropy": 8.612841606140137, "epoch": 0.7344275262013051, "mean_token_accuracy": 0.7655259966850281, "num_tokens": 17847857.0, "step": 7428, "train/ce_loss": 0.6544772982597351 }, { "epoch": 0.7344275262013051, "step": 7428, "train/sim_loss": 0.05078125 }, { "epoch": 0.7344275262013051, "step": 7428, "train/total_loss": 0.11622898280620575 }, { "entropy": 8.549814224243164, "epoch": 0.7345263990508206, "mean_token_accuracy": 0.7464324831962585, "num_tokens": 17853239.0, "step": 7429, "train/ce_loss": 0.5706326365470886 }, { "epoch": 0.7345263990508206, "step": 7429, "train/sim_loss": 0.02734375 }, { "epoch": 0.7345263990508206, "step": 7429, "train/total_loss": 0.0844070166349411 }, { "entropy": 8.911842346191406, "epoch": 0.7346252719003362, "mean_token_accuracy": 0.7272727489471436, "num_tokens": 17858466.0, "step": 7430, "train/ce_loss": 0.8218401670455933 }, { "epoch": 0.7346252719003362, "step": 7430, "train/sim_loss": 0.046875 }, { "epoch": 0.7346252719003362, "step": 7430, "train/total_loss": 0.12905901670455933 }, { "entropy": 8.468302726745605, "epoch": 0.7347241447498517, "mean_token_accuracy": 0.7347826361656189, "num_tokens": 17863842.0, "step": 7431, "train/ce_loss": 0.739936113357544 }, { "epoch": 0.7347241447498517, "step": 7431, "train/sim_loss": 0.06640625 }, { "epoch": 0.7347241447498517, "step": 7431, "train/total_loss": 0.14039987325668335 }, { "entropy": 8.911083221435547, "epoch": 0.7348230175993672, "mean_token_accuracy": 0.741428554058075, "num_tokens": 17868967.0, "step": 7432, "train/ce_loss": 1.0359693765640259 }, { "epoch": 0.7348230175993672, "step": 7432, "train/sim_loss": 0.08203125 }, { "epoch": 0.7348230175993672, "step": 7432, "train/total_loss": 0.18562819063663483 }, { "entropy": 8.585333824157715, "epoch": 0.7349218904488828, "mean_token_accuracy": 0.7360946536064148, "num_tokens": 17874280.0, "step": 7433, "train/ce_loss": 0.9600635170936584 }, { "epoch": 0.7349218904488828, "step": 7433, "train/sim_loss": 0.0859375 }, { "epoch": 0.7349218904488828, "step": 7433, "train/total_loss": 0.1819438636302948 }, { "entropy": 8.341687202453613, "epoch": 0.7350207632983983, "mean_token_accuracy": 0.7622789740562439, "num_tokens": 17879742.0, "step": 7434, "train/ce_loss": 0.5410976409912109 }, { "epoch": 0.7350207632983983, "step": 7434, "train/sim_loss": 0.04296875 }, { "epoch": 0.7350207632983983, "step": 7434, "train/total_loss": 0.09707851707935333 }, { "entropy": 8.966690063476562, "epoch": 0.7351196361479138, "mean_token_accuracy": 0.7216890454292297, "num_tokens": 17884712.0, "step": 7435, "train/ce_loss": 1.1198730135220103e-05 }, { "epoch": 0.7351196361479138, "step": 7435, "train/sim_loss": 0.05859375 }, { "epoch": 0.7351196361479138, "step": 7435, "train/total_loss": 0.05859487131237984 }, { "entropy": 9.215240478515625, "epoch": 0.7352185089974294, "mean_token_accuracy": 0.7508361339569092, "num_tokens": 17889779.0, "step": 7436, "train/ce_loss": 2.024491550400853e-06 }, { "epoch": 0.7352185089974294, "step": 7436, "train/sim_loss": 0.0625 }, { "epoch": 0.7352185089974294, "step": 7436, "train/total_loss": 0.06250020116567612 }, { "entropy": 8.692525863647461, "epoch": 0.7353173818469448, "mean_token_accuracy": 0.7033374309539795, "num_tokens": 17895091.0, "step": 7437, "train/ce_loss": 1.055518627166748 }, { "epoch": 0.7353173818469448, "step": 7437, "train/sim_loss": 0.0546875 }, { "epoch": 0.7353173818469448, "step": 7437, "train/total_loss": 0.16023936867713928 }, { "entropy": 8.578010559082031, "epoch": 0.7354162546964603, "mean_token_accuracy": 0.7098501324653625, "num_tokens": 17900483.0, "step": 7438, "train/ce_loss": 0.8790786266326904 }, { "epoch": 0.7354162546964603, "step": 7438, "train/sim_loss": 0.046875 }, { "epoch": 0.7354162546964603, "step": 7438, "train/total_loss": 0.13478286564350128 }, { "entropy": 8.591001510620117, "epoch": 0.7355151275459759, "mean_token_accuracy": 0.7846952080726624, "num_tokens": 17905716.0, "step": 7439, "train/ce_loss": 1.0676848888397217 }, { "epoch": 0.7355151275459759, "step": 7439, "train/sim_loss": 0.09765625 }, { "epoch": 0.7355151275459759, "step": 7439, "train/total_loss": 0.20442473888397217 }, { "epoch": 0.7356140003954914, "grad_norm": 0.6831417083740234, "learning_rate": 8.163229985659894e-06, "loss": 0.1384, "step": 7440 }, { "entropy": 8.783339500427246, "epoch": 0.7356140003954914, "mean_token_accuracy": 0.6866666674613953, "num_tokens": 17910920.0, "step": 7440, "train/ce_loss": 1.181344747543335 }, { "epoch": 0.7356140003954914, "step": 7440, "train/sim_loss": 0.05078125 }, { "epoch": 0.7356140003954914, "step": 7440, "train/total_loss": 0.16891571879386902 }, { "entropy": 8.621808052062988, "epoch": 0.7357128732450069, "mean_token_accuracy": 0.747474730014801, "num_tokens": 17916289.0, "step": 7441, "train/ce_loss": 1.483172059059143 }, { "epoch": 0.7357128732450069, "step": 7441, "train/sim_loss": 0.046875 }, { "epoch": 0.7357128732450069, "step": 7441, "train/total_loss": 0.19519220292568207 }, { "entropy": 8.434520721435547, "epoch": 0.7358117460945225, "mean_token_accuracy": 0.740618109703064, "num_tokens": 17921665.0, "step": 7442, "train/ce_loss": 0.37312033772468567 }, { "epoch": 0.7358117460945225, "step": 7442, "train/sim_loss": 0.0390625 }, { "epoch": 0.7358117460945225, "step": 7442, "train/total_loss": 0.07637453079223633 }, { "entropy": 8.742439270019531, "epoch": 0.735910618944038, "mean_token_accuracy": 0.7351274490356445, "num_tokens": 17926775.0, "step": 7443, "train/ce_loss": 1.4379651546478271 }, { "epoch": 0.735910618944038, "step": 7443, "train/sim_loss": 0.140625 }, { "epoch": 0.735910618944038, "step": 7443, "train/total_loss": 0.28442150354385376 }, { "entropy": 8.910379409790039, "epoch": 0.7360094917935535, "mean_token_accuracy": 0.75, "num_tokens": 17931831.0, "step": 7444, "train/ce_loss": 2.8590995952981757e-06 }, { "epoch": 0.7360094917935535, "step": 7444, "train/sim_loss": 0.0234375 }, { "epoch": 0.7360094917935535, "step": 7444, "train/total_loss": 0.023437784984707832 }, { "entropy": 8.449346542358398, "epoch": 0.7361083646430691, "mean_token_accuracy": 0.7605473399162292, "num_tokens": 17937201.0, "step": 7445, "train/ce_loss": 0.5927489399909973 }, { "epoch": 0.7361083646430691, "step": 7445, "train/sim_loss": 0.06640625 }, { "epoch": 0.7361083646430691, "step": 7445, "train/total_loss": 0.12568114697933197 }, { "entropy": 8.496419906616211, "epoch": 0.7362072374925845, "mean_token_accuracy": 0.7832699418067932, "num_tokens": 17942474.0, "step": 7446, "train/ce_loss": 0.5286972522735596 }, { "epoch": 0.7362072374925845, "step": 7446, "train/sim_loss": 0.04296875 }, { "epoch": 0.7362072374925845, "step": 7446, "train/total_loss": 0.09583847224712372 }, { "entropy": 9.497404098510742, "epoch": 0.7363061103421, "mean_token_accuracy": 0.7071583271026611, "num_tokens": 17947345.0, "step": 7447, "train/ce_loss": 1.1761665344238281 }, { "epoch": 0.7363061103421, "step": 7447, "train/sim_loss": 0.01953125 }, { "epoch": 0.7363061103421, "step": 7447, "train/total_loss": 0.1371479034423828 }, { "entropy": 8.603475570678711, "epoch": 0.7364049831916156, "mean_token_accuracy": 0.7022988796234131, "num_tokens": 17952687.0, "step": 7448, "train/ce_loss": 0.9062698483467102 }, { "epoch": 0.7364049831916156, "step": 7448, "train/sim_loss": 0.02734375 }, { "epoch": 0.7364049831916156, "step": 7448, "train/total_loss": 0.11797073483467102 }, { "entropy": 9.307031631469727, "epoch": 0.7365038560411311, "mean_token_accuracy": 0.7286324501037598, "num_tokens": 17957579.0, "step": 7449, "train/ce_loss": 1.983763337135315 }, { "epoch": 0.7365038560411311, "step": 7449, "train/sim_loss": 0.07421875 }, { "epoch": 0.7365038560411311, "step": 7449, "train/total_loss": 0.2725951075553894 }, { "entropy": 8.819190979003906, "epoch": 0.7366027288906466, "mean_token_accuracy": 0.7057926654815674, "num_tokens": 17962639.0, "step": 7450, "train/ce_loss": 1.0233535766601562 }, { "epoch": 0.7366027288906466, "step": 7450, "train/sim_loss": 0.07421875 }, { "epoch": 0.7366027288906466, "step": 7450, "train/total_loss": 0.1765541136264801 }, { "entropy": 8.637828826904297, "epoch": 0.7367016017401622, "mean_token_accuracy": 0.7119901180267334, "num_tokens": 17967934.0, "step": 7451, "train/ce_loss": 1.9882410764694214 }, { "epoch": 0.7367016017401622, "step": 7451, "train/sim_loss": 0.05078125 }, { "epoch": 0.7367016017401622, "step": 7451, "train/total_loss": 0.24960535764694214 }, { "entropy": 8.599782943725586, "epoch": 0.7368004745896777, "mean_token_accuracy": 0.7238442897796631, "num_tokens": 17973243.0, "step": 7452, "train/ce_loss": 0.9183986186981201 }, { "epoch": 0.7368004745896777, "step": 7452, "train/sim_loss": 0.0859375 }, { "epoch": 0.7368004745896777, "step": 7452, "train/total_loss": 0.17777736485004425 }, { "entropy": 8.703730583190918, "epoch": 0.7368993474391932, "mean_token_accuracy": 0.7442660331726074, "num_tokens": 17978597.0, "step": 7453, "train/ce_loss": 0.710777759552002 }, { "epoch": 0.7368993474391932, "step": 7453, "train/sim_loss": 0.09765625 }, { "epoch": 0.7368993474391932, "step": 7453, "train/total_loss": 0.16873402893543243 }, { "entropy": 8.349955558776855, "epoch": 0.7369982202887088, "mean_token_accuracy": 0.7752193212509155, "num_tokens": 17983918.0, "step": 7454, "train/ce_loss": 0.5250358581542969 }, { "epoch": 0.7369982202887088, "step": 7454, "train/sim_loss": 0.04296875 }, { "epoch": 0.7369982202887088, "step": 7454, "train/total_loss": 0.09547233581542969 }, { "entropy": 9.179643630981445, "epoch": 0.7370970931382242, "mean_token_accuracy": 0.7523629665374756, "num_tokens": 17988831.0, "step": 7455, "train/ce_loss": 1.054730772972107 }, { "epoch": 0.7370970931382242, "step": 7455, "train/sim_loss": 0.0234375 }, { "epoch": 0.7370970931382242, "step": 7455, "train/total_loss": 0.12891057133674622 }, { "entropy": 8.195453643798828, "epoch": 0.7371959659877397, "mean_token_accuracy": 0.7772685885429382, "num_tokens": 17994422.0, "step": 7456, "train/ce_loss": 0.589515209197998 }, { "epoch": 0.7371959659877397, "step": 7456, "train/sim_loss": 0.0625 }, { "epoch": 0.7371959659877397, "step": 7456, "train/total_loss": 0.12145152688026428 }, { "entropy": 8.321382522583008, "epoch": 0.7372948388372553, "mean_token_accuracy": 0.7400000095367432, "num_tokens": 17999746.0, "step": 7457, "train/ce_loss": 1.3899191617965698 }, { "epoch": 0.7372948388372553, "step": 7457, "train/sim_loss": 0.08203125 }, { "epoch": 0.7372948388372553, "step": 7457, "train/total_loss": 0.22102317214012146 }, { "entropy": 8.427468299865723, "epoch": 0.7373937116867708, "mean_token_accuracy": 0.7523609399795532, "num_tokens": 18005183.0, "step": 7458, "train/ce_loss": 0.47106844186782837 }, { "epoch": 0.7373937116867708, "step": 7458, "train/sim_loss": 0.015625 }, { "epoch": 0.7373937116867708, "step": 7458, "train/total_loss": 0.06273184716701508 }, { "entropy": 8.995831489562988, "epoch": 0.7374925845362863, "mean_token_accuracy": 0.686821699142456, "num_tokens": 18010282.0, "step": 7459, "train/ce_loss": 1.6602399349212646 }, { "epoch": 0.7374925845362863, "step": 7459, "train/sim_loss": 0.140625 }, { "epoch": 0.7374925845362863, "step": 7459, "train/total_loss": 0.30664899945259094 }, { "epoch": 0.7375914573858019, "grad_norm": 0.7434929609298706, "learning_rate": 8.158285120901944e-06, "loss": 0.1457, "step": 7460 }, { "entropy": 8.676469802856445, "epoch": 0.7375914573858019, "mean_token_accuracy": 0.7600595951080322, "num_tokens": 18015401.0, "step": 7460, "train/ce_loss": 0.8739643096923828 }, { "epoch": 0.7375914573858019, "step": 7460, "train/sim_loss": 0.0546875 }, { "epoch": 0.7375914573858019, "step": 7460, "train/total_loss": 0.14208394289016724 }, { "entropy": 8.663125038146973, "epoch": 0.7376903302353174, "mean_token_accuracy": 0.7645390033721924, "num_tokens": 18020579.0, "step": 7461, "train/ce_loss": 1.3094218969345093 }, { "epoch": 0.7376903302353174, "step": 7461, "train/sim_loss": 0.0390625 }, { "epoch": 0.7376903302353174, "step": 7461, "train/total_loss": 0.1700046956539154 }, { "entropy": 8.195062637329102, "epoch": 0.7377892030848329, "mean_token_accuracy": 0.7237623929977417, "num_tokens": 18026087.0, "step": 7462, "train/ce_loss": 1.2017582654953003 }, { "epoch": 0.7377892030848329, "step": 7462, "train/sim_loss": 0.0390625 }, { "epoch": 0.7377892030848329, "step": 7462, "train/total_loss": 0.15923833847045898 }, { "entropy": 8.615524291992188, "epoch": 0.7378880759343485, "mean_token_accuracy": 0.7533265352249146, "num_tokens": 18031543.0, "step": 7463, "train/ce_loss": 0.5794716477394104 }, { "epoch": 0.7378880759343485, "step": 7463, "train/sim_loss": 0.03515625 }, { "epoch": 0.7378880759343485, "step": 7463, "train/total_loss": 0.09310341626405716 }, { "entropy": 8.52651309967041, "epoch": 0.737986948783864, "mean_token_accuracy": 0.7455782294273376, "num_tokens": 18036708.0, "step": 7464, "train/ce_loss": 0.650346040725708 }, { "epoch": 0.737986948783864, "step": 7464, "train/sim_loss": 0.03515625 }, { "epoch": 0.737986948783864, "step": 7464, "train/total_loss": 0.10019085556268692 }, { "entropy": 9.093423843383789, "epoch": 0.7380858216333794, "mean_token_accuracy": 0.6904761791229248, "num_tokens": 18041777.0, "step": 7465, "train/ce_loss": 0.5290768146514893 }, { "epoch": 0.7380858216333794, "step": 7465, "train/sim_loss": 0.03515625 }, { "epoch": 0.7380858216333794, "step": 7465, "train/total_loss": 0.08806393295526505 }, { "entropy": 8.408145904541016, "epoch": 0.738184694482895, "mean_token_accuracy": 0.7427341341972351, "num_tokens": 18047248.0, "step": 7466, "train/ce_loss": 1.373179316520691 }, { "epoch": 0.738184694482895, "step": 7466, "train/sim_loss": 0.12890625 }, { "epoch": 0.738184694482895, "step": 7466, "train/total_loss": 0.266224205493927 }, { "entropy": 9.112894058227539, "epoch": 0.7382835673324105, "mean_token_accuracy": 0.7475149035453796, "num_tokens": 18052163.0, "step": 7467, "train/ce_loss": 1.8712252378463745 }, { "epoch": 0.7382835673324105, "step": 7467, "train/sim_loss": 0.05078125 }, { "epoch": 0.7382835673324105, "step": 7467, "train/total_loss": 0.23790377378463745 }, { "entropy": 8.533920288085938, "epoch": 0.738382440181926, "mean_token_accuracy": 0.7494252920150757, "num_tokens": 18057560.0, "step": 7468, "train/ce_loss": 0.5045980215072632 }, { "epoch": 0.738382440181926, "step": 7468, "train/sim_loss": 0.08984375 }, { "epoch": 0.738382440181926, "step": 7468, "train/total_loss": 0.14030355215072632 }, { "entropy": 8.695446968078613, "epoch": 0.7384813130314416, "mean_token_accuracy": 0.7431421279907227, "num_tokens": 18062915.0, "step": 7469, "train/ce_loss": 1.0216988325119019 }, { "epoch": 0.7384813130314416, "step": 7469, "train/sim_loss": 0.07421875 }, { "epoch": 0.7384813130314416, "step": 7469, "train/total_loss": 0.17638863623142242 }, { "entropy": 8.989627838134766, "epoch": 0.7385801858809571, "mean_token_accuracy": 0.7777777910232544, "num_tokens": 18068143.0, "step": 7470, "train/ce_loss": 1.1923831701278687 }, { "epoch": 0.7385801858809571, "step": 7470, "train/sim_loss": 0.07421875 }, { "epoch": 0.7385801858809571, "step": 7470, "train/total_loss": 0.19345706701278687 }, { "entropy": 9.072559356689453, "epoch": 0.7386790587304726, "mean_token_accuracy": 0.7285463809967041, "num_tokens": 18073159.0, "step": 7471, "train/ce_loss": 4.317763796279905e-06 }, { "epoch": 0.7386790587304726, "step": 7471, "train/sim_loss": 0.04296875 }, { "epoch": 0.7386790587304726, "step": 7471, "train/total_loss": 0.04296918213367462 }, { "entropy": 8.951403617858887, "epoch": 0.7387779315799882, "mean_token_accuracy": 0.7418397665023804, "num_tokens": 18078321.0, "step": 7472, "train/ce_loss": 1.0193686485290527 }, { "epoch": 0.7387779315799882, "step": 7472, "train/sim_loss": 0.078125 }, { "epoch": 0.7387779315799882, "step": 7472, "train/total_loss": 0.18006187677383423 }, { "entropy": 8.52254867553711, "epoch": 0.7388768044295037, "mean_token_accuracy": 0.7313883304595947, "num_tokens": 18083782.0, "step": 7473, "train/ce_loss": 0.5721186399459839 }, { "epoch": 0.7388768044295037, "step": 7473, "train/sim_loss": 0.02734375 }, { "epoch": 0.7388768044295037, "step": 7473, "train/total_loss": 0.08455561101436615 }, { "entropy": 8.624273300170898, "epoch": 0.7389756772790191, "mean_token_accuracy": 0.7298387289047241, "num_tokens": 18088950.0, "step": 7474, "train/ce_loss": 0.4974902868270874 }, { "epoch": 0.7389756772790191, "step": 7474, "train/sim_loss": 0.0390625 }, { "epoch": 0.7389756772790191, "step": 7474, "train/total_loss": 0.08881153166294098 }, { "entropy": 8.725934982299805, "epoch": 0.7390745501285347, "mean_token_accuracy": 0.7341115474700928, "num_tokens": 18094363.0, "step": 7475, "train/ce_loss": 0.9648977518081665 }, { "epoch": 0.7390745501285347, "step": 7475, "train/sim_loss": 0.078125 }, { "epoch": 0.7390745501285347, "step": 7475, "train/total_loss": 0.1746147871017456 }, { "entropy": 8.515832901000977, "epoch": 0.7391734229780502, "mean_token_accuracy": 0.75, "num_tokens": 18099660.0, "step": 7476, "train/ce_loss": 0.5893865823745728 }, { "epoch": 0.7391734229780502, "step": 7476, "train/sim_loss": 0.05859375 }, { "epoch": 0.7391734229780502, "step": 7476, "train/total_loss": 0.1175324097275734 }, { "entropy": 8.935949325561523, "epoch": 0.7392722958275657, "mean_token_accuracy": 0.7350427508354187, "num_tokens": 18104844.0, "step": 7477, "train/ce_loss": 1.5920840041871998e-06 }, { "epoch": 0.7392722958275657, "step": 7477, "train/sim_loss": 0.03125 }, { "epoch": 0.7392722958275657, "step": 7477, "train/total_loss": 0.031250160187482834 }, { "entropy": 8.676297187805176, "epoch": 0.7393711686770813, "mean_token_accuracy": 0.7168674468994141, "num_tokens": 18109939.0, "step": 7478, "train/ce_loss": 1.316830039024353 }, { "epoch": 0.7393711686770813, "step": 7478, "train/sim_loss": 0.0625 }, { "epoch": 0.7393711686770813, "step": 7478, "train/total_loss": 0.19418300688266754 }, { "entropy": 8.472021102905273, "epoch": 0.7394700415265968, "mean_token_accuracy": 0.7288135886192322, "num_tokens": 18115328.0, "step": 7479, "train/ce_loss": 0.778196394443512 }, { "epoch": 0.7394700415265968, "step": 7479, "train/sim_loss": 0.0234375 }, { "epoch": 0.7394700415265968, "step": 7479, "train/total_loss": 0.10125713795423508 }, { "epoch": 0.7395689143761123, "grad_norm": 0.6186052560806274, "learning_rate": 8.153340256143996e-06, "loss": 0.1388, "step": 7480 }, { "entropy": 8.667085647583008, "epoch": 0.7395689143761123, "mean_token_accuracy": 0.7706093192100525, "num_tokens": 18120639.0, "step": 7480, "train/ce_loss": 0.9406066536903381 }, { "epoch": 0.7395689143761123, "step": 7480, "train/sim_loss": 0.0546875 }, { "epoch": 0.7395689143761123, "step": 7480, "train/total_loss": 0.14874815940856934 }, { "entropy": 8.820192337036133, "epoch": 0.7396677872256279, "mean_token_accuracy": 0.7028493881225586, "num_tokens": 18125825.0, "step": 7481, "train/ce_loss": 1.157432507170597e-06 }, { "epoch": 0.7396677872256279, "step": 7481, "train/sim_loss": 0.0390625 }, { "epoch": 0.7396677872256279, "step": 7481, "train/total_loss": 0.03906261548399925 }, { "entropy": 8.352201461791992, "epoch": 0.7397666600751434, "mean_token_accuracy": 0.7907253503799438, "num_tokens": 18131117.0, "step": 7482, "train/ce_loss": 0.5861807465553284 }, { "epoch": 0.7397666600751434, "step": 7482, "train/sim_loss": 0.03125 }, { "epoch": 0.7397666600751434, "step": 7482, "train/total_loss": 0.08986807614564896 }, { "entropy": 8.777387619018555, "epoch": 0.7398655329246588, "mean_token_accuracy": 0.7474489808082581, "num_tokens": 18136348.0, "step": 7483, "train/ce_loss": 0.7887819409370422 }, { "epoch": 0.7398655329246588, "step": 7483, "train/sim_loss": 0.04296875 }, { "epoch": 0.7398655329246588, "step": 7483, "train/total_loss": 0.12184694409370422 }, { "entropy": 8.668893814086914, "epoch": 0.7399644057741744, "mean_token_accuracy": 0.7419354915618896, "num_tokens": 18141641.0, "step": 7484, "train/ce_loss": 0.732936680316925 }, { "epoch": 0.7399644057741744, "step": 7484, "train/sim_loss": 0.0703125 }, { "epoch": 0.7399644057741744, "step": 7484, "train/total_loss": 0.14360617101192474 }, { "entropy": 8.698103904724121, "epoch": 0.7400632786236899, "mean_token_accuracy": 0.7807351350784302, "num_tokens": 18146859.0, "step": 7485, "train/ce_loss": 0.6472761631011963 }, { "epoch": 0.7400632786236899, "step": 7485, "train/sim_loss": 0.015625 }, { "epoch": 0.7400632786236899, "step": 7485, "train/total_loss": 0.08035261929035187 }, { "entropy": 8.32318115234375, "epoch": 0.7401621514732054, "mean_token_accuracy": 0.7098692059516907, "num_tokens": 18152188.0, "step": 7486, "train/ce_loss": 0.5486804842948914 }, { "epoch": 0.7401621514732054, "step": 7486, "train/sim_loss": 0.0546875 }, { "epoch": 0.7401621514732054, "step": 7486, "train/total_loss": 0.10955554991960526 }, { "entropy": 8.622991561889648, "epoch": 0.740261024322721, "mean_token_accuracy": 0.7683772444725037, "num_tokens": 18157390.0, "step": 7487, "train/ce_loss": 0.8550941348075867 }, { "epoch": 0.740261024322721, "step": 7487, "train/sim_loss": 0.0625 }, { "epoch": 0.740261024322721, "step": 7487, "train/total_loss": 0.14800941944122314 }, { "entropy": 9.3809814453125, "epoch": 0.7403598971722365, "mean_token_accuracy": 0.7139784693717957, "num_tokens": 18162235.0, "step": 7488, "train/ce_loss": 5.4315864872478414e-06 }, { "epoch": 0.7403598971722365, "step": 7488, "train/sim_loss": 0.015625 }, { "epoch": 0.7403598971722365, "step": 7488, "train/total_loss": 0.015625543892383575 }, { "entropy": 8.999673843383789, "epoch": 0.740458770021752, "mean_token_accuracy": 0.8227091431617737, "num_tokens": 18167216.0, "step": 7489, "train/ce_loss": 3.1229728847392835e-06 }, { "epoch": 0.740458770021752, "step": 7489, "train/sim_loss": 0.04296875 }, { "epoch": 0.740458770021752, "step": 7489, "train/total_loss": 0.04296906292438507 }, { "entropy": 9.476018905639648, "epoch": 0.7405576428712676, "mean_token_accuracy": 0.7777777910232544, "num_tokens": 18172017.0, "step": 7490, "train/ce_loss": 1.7611380815505981 }, { "epoch": 0.7405576428712676, "step": 7490, "train/sim_loss": 0.0625 }, { "epoch": 0.7405576428712676, "step": 7490, "train/total_loss": 0.2386138141155243 }, { "entropy": 8.836692810058594, "epoch": 0.7406565157207831, "mean_token_accuracy": 0.7779291272163391, "num_tokens": 18177190.0, "step": 7491, "train/ce_loss": 9.796309541343362e-07 }, { "epoch": 0.7406565157207831, "step": 7491, "train/sim_loss": 0.0234375 }, { "epoch": 0.7406565157207831, "step": 7491, "train/total_loss": 0.02343759872019291 }, { "entropy": 8.639360427856445, "epoch": 0.7407553885702985, "mean_token_accuracy": 0.798751950263977, "num_tokens": 18182267.0, "step": 7492, "train/ce_loss": 0.6451224088668823 }, { "epoch": 0.7407553885702985, "step": 7492, "train/sim_loss": 0.0234375 }, { "epoch": 0.7407553885702985, "step": 7492, "train/total_loss": 0.08794974535703659 }, { "entropy": 9.31312370300293, "epoch": 0.7408542614198141, "mean_token_accuracy": 0.8333333134651184, "num_tokens": 18187189.0, "step": 7493, "train/ce_loss": 1.679405613685958e-06 }, { "epoch": 0.7408542614198141, "step": 7493, "train/sim_loss": 0.0234375 }, { "epoch": 0.7408542614198141, "step": 7493, "train/total_loss": 0.02343766763806343 }, { "entropy": 8.655989646911621, "epoch": 0.7409531342693296, "mean_token_accuracy": 0.7229729890823364, "num_tokens": 18192370.0, "step": 7494, "train/ce_loss": 0.5090431571006775 }, { "epoch": 0.7409531342693296, "step": 7494, "train/sim_loss": 0.046875 }, { "epoch": 0.7409531342693296, "step": 7494, "train/total_loss": 0.09777931869029999 }, { "entropy": 8.696633338928223, "epoch": 0.7410520071188451, "mean_token_accuracy": 0.7646276354789734, "num_tokens": 18197603.0, "step": 7495, "train/ce_loss": 0.6268374919891357 }, { "epoch": 0.7410520071188451, "step": 7495, "train/sim_loss": 0.01953125 }, { "epoch": 0.7410520071188451, "step": 7495, "train/total_loss": 0.08221500366926193 }, { "entropy": 8.735063552856445, "epoch": 0.7411508799683607, "mean_token_accuracy": 0.7355263233184814, "num_tokens": 18202861.0, "step": 7496, "train/ce_loss": 1.2398719787597656 }, { "epoch": 0.7411508799683607, "step": 7496, "train/sim_loss": 0.03125 }, { "epoch": 0.7411508799683607, "step": 7496, "train/total_loss": 0.15523719787597656 }, { "entropy": 8.043102264404297, "epoch": 0.7412497528178762, "mean_token_accuracy": 0.6976320743560791, "num_tokens": 18208422.0, "step": 7497, "train/ce_loss": 0.8832078576087952 }, { "epoch": 0.7412497528178762, "step": 7497, "train/sim_loss": 0.078125 }, { "epoch": 0.7412497528178762, "step": 7497, "train/total_loss": 0.166445791721344 }, { "entropy": 8.297633171081543, "epoch": 0.7413486256673917, "mean_token_accuracy": 0.770567774772644, "num_tokens": 18213775.0, "step": 7498, "train/ce_loss": 0.9094583988189697 }, { "epoch": 0.7413486256673917, "step": 7498, "train/sim_loss": 0.05078125 }, { "epoch": 0.7413486256673917, "step": 7498, "train/total_loss": 0.14172708988189697 }, { "entropy": 8.361133575439453, "epoch": 0.7414474985169073, "mean_token_accuracy": 0.7287405729293823, "num_tokens": 18219158.0, "step": 7499, "train/ce_loss": 1.0043796300888062 }, { "epoch": 0.7414474985169073, "step": 7499, "train/sim_loss": 0.05859375 }, { "epoch": 0.7414474985169073, "step": 7499, "train/total_loss": 0.1590317189693451 }, { "epoch": 0.7415463713664228, "grad_norm": 0.6593493819236755, "learning_rate": 8.148395391386045e-06, "loss": 0.1274, "step": 7500 }, { "entropy": 8.695594787597656, "epoch": 0.7415463713664228, "mean_token_accuracy": 0.7369165420532227, "num_tokens": 18224300.0, "step": 7500, "train/ce_loss": 0.4475291073322296 }, { "epoch": 0.7415463713664228, "step": 7500, "train/sim_loss": 0.08203125 }, { "epoch": 0.7415463713664228, "step": 7500, "train/total_loss": 0.12678416073322296 }, { "entropy": 8.78538703918457, "epoch": 0.7416452442159382, "mean_token_accuracy": 0.711796224117279, "num_tokens": 18229483.0, "step": 7501, "train/ce_loss": 2.1044628620147705 }, { "epoch": 0.7416452442159382, "step": 7501, "train/sim_loss": 0.046875 }, { "epoch": 0.7416452442159382, "step": 7501, "train/total_loss": 0.257321298122406 }, { "entropy": 8.628006935119629, "epoch": 0.7417441170654538, "mean_token_accuracy": 0.7457386255264282, "num_tokens": 18234634.0, "step": 7502, "train/ce_loss": 0.9490795135498047 }, { "epoch": 0.7417441170654538, "step": 7502, "train/sim_loss": 0.03125 }, { "epoch": 0.7417441170654538, "step": 7502, "train/total_loss": 0.1261579543352127 }, { "entropy": 8.950165748596191, "epoch": 0.7418429899149693, "mean_token_accuracy": 0.7557471394538879, "num_tokens": 18239778.0, "step": 7503, "train/ce_loss": 1.7057231664657593 }, { "epoch": 0.7418429899149693, "step": 7503, "train/sim_loss": 0.05859375 }, { "epoch": 0.7418429899149693, "step": 7503, "train/total_loss": 0.22916607558727264 }, { "entropy": 8.387922286987305, "epoch": 0.7419418627644849, "mean_token_accuracy": 0.739130437374115, "num_tokens": 18245170.0, "step": 7504, "train/ce_loss": 0.7675881385803223 }, { "epoch": 0.7419418627644849, "step": 7504, "train/sim_loss": 0.046875 }, { "epoch": 0.7419418627644849, "step": 7504, "train/total_loss": 0.12363381683826447 }, { "entropy": 8.603219985961914, "epoch": 0.7420407356140004, "mean_token_accuracy": 0.6928961873054504, "num_tokens": 18250566.0, "step": 7505, "train/ce_loss": 1.4200619459152222 }, { "epoch": 0.7420407356140004, "step": 7505, "train/sim_loss": 0.08203125 }, { "epoch": 0.7420407356140004, "step": 7505, "train/total_loss": 0.22403745353221893 }, { "entropy": 8.27988052368164, "epoch": 0.7421396084635159, "mean_token_accuracy": 0.7318918704986572, "num_tokens": 18255985.0, "step": 7506, "train/ce_loss": 0.6595481038093567 }, { "epoch": 0.7421396084635159, "step": 7506, "train/sim_loss": 0.01953125 }, { "epoch": 0.7421396084635159, "step": 7506, "train/total_loss": 0.08548606187105179 }, { "entropy": 8.62784194946289, "epoch": 0.7422384813130315, "mean_token_accuracy": 0.7718501687049866, "num_tokens": 18261498.0, "step": 7507, "train/ce_loss": 1.0468218326568604 }, { "epoch": 0.7422384813130315, "step": 7507, "train/sim_loss": 0.1484375 }, { "epoch": 0.7422384813130315, "step": 7507, "train/total_loss": 0.25311967730522156 }, { "entropy": 8.539094924926758, "epoch": 0.742337354162547, "mean_token_accuracy": 0.7296416759490967, "num_tokens": 18266566.0, "step": 7508, "train/ce_loss": 2.49420668296807e-06 }, { "epoch": 0.742337354162547, "step": 7508, "train/sim_loss": 0.03515625 }, { "epoch": 0.742337354162547, "step": 7508, "train/total_loss": 0.03515649959445 }, { "entropy": 9.223367691040039, "epoch": 0.7424362270120625, "mean_token_accuracy": 0.7969052195549011, "num_tokens": 18271480.0, "step": 7509, "train/ce_loss": 1.2301596403121948 }, { "epoch": 0.7424362270120625, "step": 7509, "train/sim_loss": 0.0703125 }, { "epoch": 0.7424362270120625, "step": 7509, "train/total_loss": 0.19332846999168396 }, { "entropy": 9.014839172363281, "epoch": 0.7425350998615781, "mean_token_accuracy": 0.7388724088668823, "num_tokens": 18276636.0, "step": 7510, "train/ce_loss": 0.7081224322319031 }, { "epoch": 0.7425350998615781, "step": 7510, "train/sim_loss": 0.03125 }, { "epoch": 0.7425350998615781, "step": 7510, "train/total_loss": 0.10206224769353867 }, { "entropy": 9.475015640258789, "epoch": 0.7426339727110935, "mean_token_accuracy": 0.7409470677375793, "num_tokens": 18281409.0, "step": 7511, "train/ce_loss": 4.576662377075991e-06 }, { "epoch": 0.7426339727110935, "step": 7511, "train/sim_loss": 0.046875 }, { "epoch": 0.7426339727110935, "step": 7511, "train/total_loss": 0.04687545821070671 }, { "entropy": 8.920570373535156, "epoch": 0.742732845560609, "mean_token_accuracy": 0.7527910470962524, "num_tokens": 18286444.0, "step": 7512, "train/ce_loss": 6.4199302869383246e-06 }, { "epoch": 0.742732845560609, "step": 7512, "train/sim_loss": 0.02734375 }, { "epoch": 0.742732845560609, "step": 7512, "train/total_loss": 0.027344392612576485 }, { "entropy": 8.559412956237793, "epoch": 0.7428317184101246, "mean_token_accuracy": 0.8051391839981079, "num_tokens": 18292050.0, "step": 7513, "train/ce_loss": 0.3435446619987488 }, { "epoch": 0.7428317184101246, "step": 7513, "train/sim_loss": 0.05078125 }, { "epoch": 0.7428317184101246, "step": 7513, "train/total_loss": 0.08513571321964264 }, { "entropy": 9.073625564575195, "epoch": 0.7429305912596401, "mean_token_accuracy": 0.7381294965744019, "num_tokens": 18297199.0, "step": 7514, "train/ce_loss": 1.1100710253231227e-06 }, { "epoch": 0.7429305912596401, "step": 7514, "train/sim_loss": 0.015625 }, { "epoch": 0.7429305912596401, "step": 7514, "train/total_loss": 0.015625111758708954 }, { "entropy": 8.76772689819336, "epoch": 0.7430294641091556, "mean_token_accuracy": 0.7468879818916321, "num_tokens": 18302398.0, "step": 7515, "train/ce_loss": 1.4463998079299927 }, { "epoch": 0.7430294641091556, "step": 7515, "train/sim_loss": 0.05859375 }, { "epoch": 0.7430294641091556, "step": 7515, "train/total_loss": 0.2032337337732315 }, { "entropy": 8.771977424621582, "epoch": 0.7431283369586712, "mean_token_accuracy": 0.7443820238113403, "num_tokens": 18307595.0, "step": 7516, "train/ce_loss": 0.7369849681854248 }, { "epoch": 0.7431283369586712, "step": 7516, "train/sim_loss": 0.01953125 }, { "epoch": 0.7431283369586712, "step": 7516, "train/total_loss": 0.0932297483086586 }, { "entropy": 8.773170471191406, "epoch": 0.7432272098081867, "mean_token_accuracy": 0.7547169923782349, "num_tokens": 18312728.0, "step": 7517, "train/ce_loss": 0.5180942416191101 }, { "epoch": 0.7432272098081867, "step": 7517, "train/sim_loss": 0.0234375 }, { "epoch": 0.7432272098081867, "step": 7517, "train/total_loss": 0.07524693012237549 }, { "entropy": 8.073772430419922, "epoch": 0.7433260826577022, "mean_token_accuracy": 0.748633861541748, "num_tokens": 18318329.0, "step": 7518, "train/ce_loss": 1.114916443824768 }, { "epoch": 0.7433260826577022, "step": 7518, "train/sim_loss": 0.01953125 }, { "epoch": 0.7433260826577022, "step": 7518, "train/total_loss": 0.13102290034294128 }, { "entropy": 8.676217079162598, "epoch": 0.7434249555072178, "mean_token_accuracy": 0.7458333373069763, "num_tokens": 18323542.0, "step": 7519, "train/ce_loss": 0.2892405688762665 }, { "epoch": 0.7434249555072178, "step": 7519, "train/sim_loss": 0.03125 }, { "epoch": 0.7434249555072178, "step": 7519, "train/total_loss": 0.06017405539751053 }, { "epoch": 0.7435238283567333, "grad_norm": 0.657289981842041, "learning_rate": 8.143450526628097e-06, "loss": 0.1319, "step": 7520 }, { "entropy": 8.515302658081055, "epoch": 0.7435238283567333, "mean_token_accuracy": 0.7568420767784119, "num_tokens": 18328982.0, "step": 7520, "train/ce_loss": 0.7898205518722534 }, { "epoch": 0.7435238283567333, "step": 7520, "train/sim_loss": 0.04296875 }, { "epoch": 0.7435238283567333, "step": 7520, "train/total_loss": 0.12195080518722534 }, { "entropy": 8.461028099060059, "epoch": 0.7436227012062487, "mean_token_accuracy": 0.7594537734985352, "num_tokens": 18334450.0, "step": 7521, "train/ce_loss": 0.7316333055496216 }, { "epoch": 0.7436227012062487, "step": 7521, "train/sim_loss": 0.0234375 }, { "epoch": 0.7436227012062487, "step": 7521, "train/total_loss": 0.09660083055496216 }, { "entropy": 8.769364356994629, "epoch": 0.7437215740557643, "mean_token_accuracy": 0.7709720134735107, "num_tokens": 18339689.0, "step": 7522, "train/ce_loss": 0.9942920207977295 }, { "epoch": 0.7437215740557643, "step": 7522, "train/sim_loss": 0.09765625 }, { "epoch": 0.7437215740557643, "step": 7522, "train/total_loss": 0.1970854550600052 }, { "entropy": 8.664652824401855, "epoch": 0.7438204469052798, "mean_token_accuracy": 0.6864801645278931, "num_tokens": 18345008.0, "step": 7523, "train/ce_loss": 0.9823068976402283 }, { "epoch": 0.7438204469052798, "step": 7523, "train/sim_loss": 0.0625 }, { "epoch": 0.7438204469052798, "step": 7523, "train/total_loss": 0.16073068976402283 }, { "entropy": 8.796646118164062, "epoch": 0.7439193197547953, "mean_token_accuracy": 0.7348203063011169, "num_tokens": 18350282.0, "step": 7524, "train/ce_loss": 0.9658528566360474 }, { "epoch": 0.7439193197547953, "step": 7524, "train/sim_loss": 0.0625 }, { "epoch": 0.7439193197547953, "step": 7524, "train/total_loss": 0.15908528864383698 }, { "entropy": 8.759809494018555, "epoch": 0.7440181926043109, "mean_token_accuracy": 0.7451253533363342, "num_tokens": 18355493.0, "step": 7525, "train/ce_loss": 0.9836140871047974 }, { "epoch": 0.7440181926043109, "step": 7525, "train/sim_loss": 0.02734375 }, { "epoch": 0.7440181926043109, "step": 7525, "train/total_loss": 0.12570515275001526 }, { "entropy": 9.079217910766602, "epoch": 0.7441170654538264, "mean_token_accuracy": 0.7358803749084473, "num_tokens": 18360503.0, "step": 7526, "train/ce_loss": 1.0410563945770264 }, { "epoch": 0.7441170654538264, "step": 7526, "train/sim_loss": 0.0625 }, { "epoch": 0.7441170654538264, "step": 7526, "train/total_loss": 0.1666056513786316 }, { "entropy": 8.668304443359375, "epoch": 0.7442159383033419, "mean_token_accuracy": 0.7101865410804749, "num_tokens": 18365645.0, "step": 7527, "train/ce_loss": 1.1230249404907227 }, { "epoch": 0.7442159383033419, "step": 7527, "train/sim_loss": 0.07421875 }, { "epoch": 0.7442159383033419, "step": 7527, "train/total_loss": 0.1865212470293045 }, { "entropy": 8.49547004699707, "epoch": 0.7443148111528575, "mean_token_accuracy": 0.7277167439460754, "num_tokens": 18371001.0, "step": 7528, "train/ce_loss": 1.0620167255401611 }, { "epoch": 0.7443148111528575, "step": 7528, "train/sim_loss": 0.01953125 }, { "epoch": 0.7443148111528575, "step": 7528, "train/total_loss": 0.1257329285144806 }, { "entropy": 8.6134033203125, "epoch": 0.744413684002373, "mean_token_accuracy": 0.7319819927215576, "num_tokens": 18376333.0, "step": 7529, "train/ce_loss": 0.4151674509048462 }, { "epoch": 0.744413684002373, "step": 7529, "train/sim_loss": 0.046875 }, { "epoch": 0.744413684002373, "step": 7529, "train/total_loss": 0.0883917510509491 }, { "entropy": 8.853605270385742, "epoch": 0.7445125568518884, "mean_token_accuracy": 0.6931297779083252, "num_tokens": 18381429.0, "step": 7530, "train/ce_loss": 1.5182098150253296 }, { "epoch": 0.7445125568518884, "step": 7530, "train/sim_loss": 0.03515625 }, { "epoch": 0.7445125568518884, "step": 7530, "train/total_loss": 0.18697723746299744 }, { "entropy": 9.093571662902832, "epoch": 0.744611429701404, "mean_token_accuracy": 0.76382976770401, "num_tokens": 18386300.0, "step": 7531, "train/ce_loss": 1.4884306192398071 }, { "epoch": 0.744611429701404, "step": 7531, "train/sim_loss": 0.0546875 }, { "epoch": 0.744611429701404, "step": 7531, "train/total_loss": 0.20353056490421295 }, { "entropy": 8.799421310424805, "epoch": 0.7447103025509195, "mean_token_accuracy": 0.7830045819282532, "num_tokens": 18391446.0, "step": 7532, "train/ce_loss": 0.8592729568481445 }, { "epoch": 0.7447103025509195, "step": 7532, "train/sim_loss": 0.0390625 }, { "epoch": 0.7447103025509195, "step": 7532, "train/total_loss": 0.12498980015516281 }, { "entropy": 9.006401062011719, "epoch": 0.744809175400435, "mean_token_accuracy": 0.7160714268684387, "num_tokens": 18396445.0, "step": 7533, "train/ce_loss": 0.8229637145996094 }, { "epoch": 0.744809175400435, "step": 7533, "train/sim_loss": 0.0546875 }, { "epoch": 0.744809175400435, "step": 7533, "train/total_loss": 0.13698387145996094 }, { "entropy": 8.820967674255371, "epoch": 0.7449080482499506, "mean_token_accuracy": 0.742214560508728, "num_tokens": 18401464.0, "step": 7534, "train/ce_loss": 1.3239574432373047 }, { "epoch": 0.7449080482499506, "step": 7534, "train/sim_loss": 0.16015625 }, { "epoch": 0.7449080482499506, "step": 7534, "train/total_loss": 0.29255199432373047 }, { "entropy": 8.689075469970703, "epoch": 0.7450069210994661, "mean_token_accuracy": 0.7237308025360107, "num_tokens": 18406801.0, "step": 7535, "train/ce_loss": 0.7591511011123657 }, { "epoch": 0.7450069210994661, "step": 7535, "train/sim_loss": 0.0859375 }, { "epoch": 0.7450069210994661, "step": 7535, "train/total_loss": 0.1618526130914688 }, { "entropy": 9.02638053894043, "epoch": 0.7451057939489816, "mean_token_accuracy": 0.6994134783744812, "num_tokens": 18411902.0, "step": 7536, "train/ce_loss": 0.9574239253997803 }, { "epoch": 0.7451057939489816, "step": 7536, "train/sim_loss": 0.0546875 }, { "epoch": 0.7451057939489816, "step": 7536, "train/total_loss": 0.15042990446090698 }, { "entropy": 8.805293083190918, "epoch": 0.7452046667984972, "mean_token_accuracy": 0.763005793094635, "num_tokens": 18417236.0, "step": 7537, "train/ce_loss": 0.5509923696517944 }, { "epoch": 0.7452046667984972, "step": 7537, "train/sim_loss": 0.015625 }, { "epoch": 0.7452046667984972, "step": 7537, "train/total_loss": 0.0707242339849472 }, { "entropy": 8.43828010559082, "epoch": 0.7453035396480127, "mean_token_accuracy": 0.7420675754547119, "num_tokens": 18422720.0, "step": 7538, "train/ce_loss": 0.5752097368240356 }, { "epoch": 0.7453035396480127, "step": 7538, "train/sim_loss": 0.015625 }, { "epoch": 0.7453035396480127, "step": 7538, "train/total_loss": 0.07314597070217133 }, { "entropy": 8.483229637145996, "epoch": 0.7454024124975281, "mean_token_accuracy": 0.814213216304779, "num_tokens": 18428155.0, "step": 7539, "train/ce_loss": 0.6831315755844116 }, { "epoch": 0.7454024124975281, "step": 7539, "train/sim_loss": 0.01171875 }, { "epoch": 0.7454024124975281, "step": 7539, "train/total_loss": 0.08003190904855728 }, { "epoch": 0.7455012853470437, "grad_norm": 0.5534847974777222, "learning_rate": 8.138505661870148e-06, "loss": 0.1358, "step": 7540 }, { "entropy": 8.548550605773926, "epoch": 0.7455012853470437, "mean_token_accuracy": 0.7884828448295593, "num_tokens": 18433584.0, "step": 7540, "train/ce_loss": 1.0471758842468262 }, { "epoch": 0.7455012853470437, "step": 7540, "train/sim_loss": 0.0234375 }, { "epoch": 0.7455012853470437, "step": 7540, "train/total_loss": 0.12815508246421814 }, { "entropy": 8.895271301269531, "epoch": 0.7456001581965592, "mean_token_accuracy": 0.6723459959030151, "num_tokens": 18438833.0, "step": 7541, "train/ce_loss": 2.0626556873321533 }, { "epoch": 0.7456001581965592, "step": 7541, "train/sim_loss": 0.12109375 }, { "epoch": 0.7456001581965592, "step": 7541, "train/total_loss": 0.32735931873321533 }, { "entropy": 8.48635196685791, "epoch": 0.7456990310460747, "mean_token_accuracy": 0.7440100908279419, "num_tokens": 18444086.0, "step": 7542, "train/ce_loss": 1.2792021036148071 }, { "epoch": 0.7456990310460747, "step": 7542, "train/sim_loss": 0.05078125 }, { "epoch": 0.7456990310460747, "step": 7542, "train/total_loss": 0.1787014603614807 }, { "entropy": 9.208368301391602, "epoch": 0.7457979038955903, "mean_token_accuracy": 0.740667998790741, "num_tokens": 18449036.0, "step": 7543, "train/ce_loss": 1.4675124883651733 }, { "epoch": 0.7457979038955903, "step": 7543, "train/sim_loss": 0.0546875 }, { "epoch": 0.7457979038955903, "step": 7543, "train/total_loss": 0.2014387547969818 }, { "entropy": 8.466072082519531, "epoch": 0.7458967767451058, "mean_token_accuracy": 0.7203290462493896, "num_tokens": 18454544.0, "step": 7544, "train/ce_loss": 1.236045241355896 }, { "epoch": 0.7458967767451058, "step": 7544, "train/sim_loss": 0.05859375 }, { "epoch": 0.7458967767451058, "step": 7544, "train/total_loss": 0.18219828605651855 }, { "entropy": 8.675228118896484, "epoch": 0.7459956495946213, "mean_token_accuracy": 0.7056451439857483, "num_tokens": 18459762.0, "step": 7545, "train/ce_loss": 1.0474457740783691 }, { "epoch": 0.7459956495946213, "step": 7545, "train/sim_loss": 0.05859375 }, { "epoch": 0.7459956495946213, "step": 7545, "train/total_loss": 0.1633383333683014 }, { "entropy": 8.85736083984375, "epoch": 0.7460945224441369, "mean_token_accuracy": 0.7022106647491455, "num_tokens": 18465006.0, "step": 7546, "train/ce_loss": 0.8944984078407288 }, { "epoch": 0.7460945224441369, "step": 7546, "train/sim_loss": 0.0859375 }, { "epoch": 0.7460945224441369, "step": 7546, "train/total_loss": 0.17538735270500183 }, { "entropy": 9.092981338500977, "epoch": 0.7461933952936524, "mean_token_accuracy": 0.690559446811676, "num_tokens": 18470058.0, "step": 7547, "train/ce_loss": 1.8033366203308105 }, { "epoch": 0.7461933952936524, "step": 7547, "train/sim_loss": 0.0703125 }, { "epoch": 0.7461933952936524, "step": 7547, "train/total_loss": 0.25064617395401 }, { "entropy": 8.797235488891602, "epoch": 0.7462922681431678, "mean_token_accuracy": 0.7824427485466003, "num_tokens": 18475304.0, "step": 7548, "train/ce_loss": 0.7003121972084045 }, { "epoch": 0.7462922681431678, "step": 7548, "train/sim_loss": 0.08984375 }, { "epoch": 0.7462922681431678, "step": 7548, "train/total_loss": 0.15987497568130493 }, { "entropy": 8.513991355895996, "epoch": 0.7463911409926834, "mean_token_accuracy": 0.7707838416099548, "num_tokens": 18480603.0, "step": 7549, "train/ce_loss": 0.6448712944984436 }, { "epoch": 0.7463911409926834, "step": 7549, "train/sim_loss": 0.05078125 }, { "epoch": 0.7463911409926834, "step": 7549, "train/total_loss": 0.11526837944984436 }, { "entropy": 8.103286743164062, "epoch": 0.7464900138421989, "mean_token_accuracy": 0.7144185900688171, "num_tokens": 18486163.0, "step": 7550, "train/ce_loss": 0.8816394805908203 }, { "epoch": 0.7464900138421989, "step": 7550, "train/sim_loss": 0.06640625 }, { "epoch": 0.7464900138421989, "step": 7550, "train/total_loss": 0.15457019209861755 }, { "entropy": 8.91938591003418, "epoch": 0.7465888866917144, "mean_token_accuracy": 0.7802768349647522, "num_tokens": 18491183.0, "step": 7551, "train/ce_loss": 1.2602639198303223 }, { "epoch": 0.7465888866917144, "step": 7551, "train/sim_loss": 0.07421875 }, { "epoch": 0.7465888866917144, "step": 7551, "train/total_loss": 0.20024514198303223 }, { "entropy": 9.720104217529297, "epoch": 0.74668775954123, "mean_token_accuracy": 0.7919074892997742, "num_tokens": 18495760.0, "step": 7552, "train/ce_loss": 1.2329982382652815e-05 }, { "epoch": 0.74668775954123, "step": 7552, "train/sim_loss": 0.04296875 }, { "epoch": 0.74668775954123, "step": 7552, "train/total_loss": 0.04296998307108879 }, { "entropy": 8.548629760742188, "epoch": 0.7467866323907455, "mean_token_accuracy": 0.7142857313156128, "num_tokens": 18500989.0, "step": 7553, "train/ce_loss": 0.7419317364692688 }, { "epoch": 0.7467866323907455, "step": 7553, "train/sim_loss": 0.04296875 }, { "epoch": 0.7467866323907455, "step": 7553, "train/total_loss": 0.11716192215681076 }, { "entropy": 8.859134674072266, "epoch": 0.746885505240261, "mean_token_accuracy": 0.7392686605453491, "num_tokens": 18506091.0, "step": 7554, "train/ce_loss": 3.27682710121735e-06 }, { "epoch": 0.746885505240261, "step": 7554, "train/sim_loss": 0.04296875 }, { "epoch": 0.746885505240261, "step": 7554, "train/total_loss": 0.042969077825546265 }, { "entropy": 8.791265487670898, "epoch": 0.7469843780897766, "mean_token_accuracy": 0.7300944924354553, "num_tokens": 18511294.0, "step": 7555, "train/ce_loss": 0.7456115484237671 }, { "epoch": 0.7469843780897766, "step": 7555, "train/sim_loss": 0.05859375 }, { "epoch": 0.7469843780897766, "step": 7555, "train/total_loss": 0.13315489888191223 }, { "entropy": 8.044285774230957, "epoch": 0.7470832509392921, "mean_token_accuracy": 0.7624565362930298, "num_tokens": 18516656.0, "step": 7556, "train/ce_loss": 0.5414119362831116 }, { "epoch": 0.7470832509392921, "step": 7556, "train/sim_loss": 0.13671875 }, { "epoch": 0.7470832509392921, "step": 7556, "train/total_loss": 0.19085994362831116 }, { "entropy": 9.396928787231445, "epoch": 0.7471821237888076, "mean_token_accuracy": 0.7343096137046814, "num_tokens": 18521562.0, "step": 7557, "train/ce_loss": 1.8854216250474565e-05 }, { "epoch": 0.7471821237888076, "step": 7557, "train/sim_loss": 0.03125 }, { "epoch": 0.7471821237888076, "step": 7557, "train/total_loss": 0.03125188499689102 }, { "entropy": 8.831316947937012, "epoch": 0.7472809966383231, "mean_token_accuracy": 0.7898550629615784, "num_tokens": 18526662.0, "step": 7558, "train/ce_loss": 1.0223673582077026 }, { "epoch": 0.7472809966383231, "step": 7558, "train/sim_loss": 0.03125 }, { "epoch": 0.7472809966383231, "step": 7558, "train/total_loss": 0.13348674774169922 }, { "entropy": 8.615570068359375, "epoch": 0.7473798694878386, "mean_token_accuracy": 0.8077889680862427, "num_tokens": 18531932.0, "step": 7559, "train/ce_loss": 0.5703041553497314 }, { "epoch": 0.7473798694878386, "step": 7559, "train/sim_loss": 0.0546875 }, { "epoch": 0.7473798694878386, "step": 7559, "train/total_loss": 0.11171791702508926 }, { "epoch": 0.7474787423373541, "grad_norm": 0.6037012934684753, "learning_rate": 8.1335607971122e-06, "loss": 0.1366, "step": 7560 }, { "entropy": 8.575265884399414, "epoch": 0.7474787423373541, "mean_token_accuracy": 0.697621762752533, "num_tokens": 18537320.0, "step": 7560, "train/ce_loss": 1.0120177268981934 }, { "epoch": 0.7474787423373541, "step": 7560, "train/sim_loss": 0.078125 }, { "epoch": 0.7474787423373541, "step": 7560, "train/total_loss": 0.17932677268981934 }, { "entropy": 9.304473876953125, "epoch": 0.7475776151868697, "mean_token_accuracy": 0.7781955003738403, "num_tokens": 18542236.0, "step": 7561, "train/ce_loss": 2.2549270397576038e-06 }, { "epoch": 0.7475776151868697, "step": 7561, "train/sim_loss": 0.01953125 }, { "epoch": 0.7475776151868697, "step": 7561, "train/total_loss": 0.019531475380063057 }, { "entropy": 8.95811653137207, "epoch": 0.7476764880363852, "mean_token_accuracy": 0.7650793790817261, "num_tokens": 18547337.0, "step": 7562, "train/ce_loss": 0.82717365026474 }, { "epoch": 0.7476764880363852, "step": 7562, "train/sim_loss": 0.078125 }, { "epoch": 0.7476764880363852, "step": 7562, "train/total_loss": 0.16084235906600952 }, { "entropy": 8.916593551635742, "epoch": 0.7477753608859007, "mean_token_accuracy": 0.7220670580863953, "num_tokens": 18552464.0, "step": 7563, "train/ce_loss": 1.4096484184265137 }, { "epoch": 0.7477753608859007, "step": 7563, "train/sim_loss": 0.03515625 }, { "epoch": 0.7477753608859007, "step": 7563, "train/total_loss": 0.17612110078334808 }, { "entropy": 8.862186431884766, "epoch": 0.7478742337354163, "mean_token_accuracy": 0.7115716934204102, "num_tokens": 18557431.0, "step": 7564, "train/ce_loss": 1.0325582027435303 }, { "epoch": 0.7478742337354163, "step": 7564, "train/sim_loss": 0.0390625 }, { "epoch": 0.7478742337354163, "step": 7564, "train/total_loss": 0.14231832325458527 }, { "entropy": 8.625448226928711, "epoch": 0.7479731065849318, "mean_token_accuracy": 0.7784430980682373, "num_tokens": 18562730.0, "step": 7565, "train/ce_loss": 0.6577219367027283 }, { "epoch": 0.7479731065849318, "step": 7565, "train/sim_loss": 0.125 }, { "epoch": 0.7479731065849318, "step": 7565, "train/total_loss": 0.19077220559120178 }, { "entropy": 8.983163833618164, "epoch": 0.7480719794344473, "mean_token_accuracy": 0.6866952776908875, "num_tokens": 18567875.0, "step": 7566, "train/ce_loss": 0.9836313724517822 }, { "epoch": 0.7480719794344473, "step": 7566, "train/sim_loss": 0.09375 }, { "epoch": 0.7480719794344473, "step": 7566, "train/total_loss": 0.19211313128471375 }, { "entropy": 9.013215065002441, "epoch": 0.7481708522839629, "mean_token_accuracy": 0.733137845993042, "num_tokens": 18572949.0, "step": 7567, "train/ce_loss": 1.7395390272140503 }, { "epoch": 0.7481708522839629, "step": 7567, "train/sim_loss": 0.05859375 }, { "epoch": 0.7481708522839629, "step": 7567, "train/total_loss": 0.23254765570163727 }, { "entropy": 8.51715087890625, "epoch": 0.7482697251334783, "mean_token_accuracy": 0.7427577972412109, "num_tokens": 18578264.0, "step": 7568, "train/ce_loss": 0.794825553894043 }, { "epoch": 0.7482697251334783, "step": 7568, "train/sim_loss": 0.05078125 }, { "epoch": 0.7482697251334783, "step": 7568, "train/total_loss": 0.1302638053894043 }, { "entropy": 8.655542373657227, "epoch": 0.7483685979829938, "mean_token_accuracy": 0.7131428718566895, "num_tokens": 18583553.0, "step": 7569, "train/ce_loss": 0.9008631706237793 }, { "epoch": 0.7483685979829938, "step": 7569, "train/sim_loss": 0.0625 }, { "epoch": 0.7483685979829938, "step": 7569, "train/total_loss": 0.15258631110191345 }, { "entropy": 8.622589111328125, "epoch": 0.7484674708325094, "mean_token_accuracy": 0.746532142162323, "num_tokens": 18588816.0, "step": 7570, "train/ce_loss": 0.6112365126609802 }, { "epoch": 0.7484674708325094, "step": 7570, "train/sim_loss": 0.12109375 }, { "epoch": 0.7484674708325094, "step": 7570, "train/total_loss": 0.18221740424633026 }, { "entropy": 8.559322357177734, "epoch": 0.7485663436820249, "mean_token_accuracy": 0.7550111413002014, "num_tokens": 18594348.0, "step": 7571, "train/ce_loss": 0.8936256170272827 }, { "epoch": 0.7485663436820249, "step": 7571, "train/sim_loss": 0.0625 }, { "epoch": 0.7485663436820249, "step": 7571, "train/total_loss": 0.15186256170272827 }, { "entropy": 8.6543550491333, "epoch": 0.7486652165315404, "mean_token_accuracy": 0.6327043771743774, "num_tokens": 18599599.0, "step": 7572, "train/ce_loss": 1.2990891933441162 }, { "epoch": 0.7486652165315404, "step": 7572, "train/sim_loss": 0.11328125 }, { "epoch": 0.7486652165315404, "step": 7572, "train/total_loss": 0.24319016933441162 }, { "entropy": 8.520683288574219, "epoch": 0.748764089381056, "mean_token_accuracy": 0.7318500876426697, "num_tokens": 18604890.0, "step": 7573, "train/ce_loss": 0.5030612349510193 }, { "epoch": 0.748764089381056, "step": 7573, "train/sim_loss": 0.05859375 }, { "epoch": 0.748764089381056, "step": 7573, "train/total_loss": 0.10889987647533417 }, { "entropy": 8.476627349853516, "epoch": 0.7488629622305715, "mean_token_accuracy": 0.683783769607544, "num_tokens": 18610104.0, "step": 7574, "train/ce_loss": 1.7495384216308594 }, { "epoch": 0.7488629622305715, "step": 7574, "train/sim_loss": 0.11328125 }, { "epoch": 0.7488629622305715, "step": 7574, "train/total_loss": 0.2882350981235504 }, { "entropy": 8.270191192626953, "epoch": 0.748961835080087, "mean_token_accuracy": 0.6940000057220459, "num_tokens": 18615569.0, "step": 7575, "train/ce_loss": 0.976158082485199 }, { "epoch": 0.748961835080087, "step": 7575, "train/sim_loss": 0.0859375 }, { "epoch": 0.748961835080087, "step": 7575, "train/total_loss": 0.1835533082485199 }, { "entropy": 7.990413665771484, "epoch": 0.7490607079296026, "mean_token_accuracy": 0.763129711151123, "num_tokens": 18620977.0, "step": 7576, "train/ce_loss": 0.5641739368438721 }, { "epoch": 0.7490607079296026, "step": 7576, "train/sim_loss": 0.0625 }, { "epoch": 0.7490607079296026, "step": 7576, "train/total_loss": 0.11891739070415497 }, { "entropy": 8.844701766967773, "epoch": 0.749159580779118, "mean_token_accuracy": 0.6914498209953308, "num_tokens": 18625855.0, "step": 7577, "train/ce_loss": 0.5731430053710938 }, { "epoch": 0.749159580779118, "step": 7577, "train/sim_loss": 0.03125 }, { "epoch": 0.749159580779118, "step": 7577, "train/total_loss": 0.08856430649757385 }, { "entropy": 9.015901565551758, "epoch": 0.7492584536286335, "mean_token_accuracy": 0.7592892050743103, "num_tokens": 18630950.0, "step": 7578, "train/ce_loss": 0.7445675134658813 }, { "epoch": 0.7492584536286335, "step": 7578, "train/sim_loss": 0.046875 }, { "epoch": 0.7492584536286335, "step": 7578, "train/total_loss": 0.12133175134658813 }, { "entropy": 8.592124938964844, "epoch": 0.7493573264781491, "mean_token_accuracy": 0.7462483048439026, "num_tokens": 18636177.0, "step": 7579, "train/ce_loss": 0.8068994283676147 }, { "epoch": 0.7493573264781491, "step": 7579, "train/sim_loss": 0.05078125 }, { "epoch": 0.7493573264781491, "step": 7579, "train/total_loss": 0.131471186876297 }, { "epoch": 0.7494561993276646, "grad_norm": 0.6941366195678711, "learning_rate": 8.12861593235425e-06, "loss": 0.1481, "step": 7580 }, { "entropy": 8.709858894348145, "epoch": 0.7494561993276646, "mean_token_accuracy": 0.7689969539642334, "num_tokens": 18641324.0, "step": 7580, "train/ce_loss": 0.4913293123245239 }, { "epoch": 0.7494561993276646, "step": 7580, "train/sim_loss": 0.05078125 }, { "epoch": 0.7494561993276646, "step": 7580, "train/total_loss": 0.09991417825222015 }, { "entropy": 8.855411529541016, "epoch": 0.7495550721771801, "mean_token_accuracy": 0.7503234148025513, "num_tokens": 18646557.0, "step": 7581, "train/ce_loss": 1.4032633304595947 }, { "epoch": 0.7495550721771801, "step": 7581, "train/sim_loss": 0.10546875 }, { "epoch": 0.7495550721771801, "step": 7581, "train/total_loss": 0.2457950860261917 }, { "entropy": 8.342367172241211, "epoch": 0.7496539450266957, "mean_token_accuracy": 0.7551020383834839, "num_tokens": 18651977.0, "step": 7582, "train/ce_loss": 0.5137928128242493 }, { "epoch": 0.7496539450266957, "step": 7582, "train/sim_loss": 0.046875 }, { "epoch": 0.7496539450266957, "step": 7582, "train/total_loss": 0.09825427830219269 }, { "entropy": 8.741762161254883, "epoch": 0.7497528178762112, "mean_token_accuracy": 0.7442424297332764, "num_tokens": 18657241.0, "step": 7583, "train/ce_loss": 1.6014753327908693e-06 }, { "epoch": 0.7497528178762112, "step": 7583, "train/sim_loss": 0.015625 }, { "epoch": 0.7497528178762112, "step": 7583, "train/total_loss": 0.015625160187482834 }, { "entropy": 8.494527816772461, "epoch": 0.7498516907257267, "mean_token_accuracy": 0.7305936217308044, "num_tokens": 18662545.0, "step": 7584, "train/ce_loss": 0.8228175640106201 }, { "epoch": 0.7498516907257267, "step": 7584, "train/sim_loss": 0.05859375 }, { "epoch": 0.7498516907257267, "step": 7584, "train/total_loss": 0.14087551832199097 }, { "entropy": 8.599166870117188, "epoch": 0.7499505635752423, "mean_token_accuracy": 0.7303988933563232, "num_tokens": 18667757.0, "step": 7585, "train/ce_loss": 0.6843902468681335 }, { "epoch": 0.7499505635752423, "step": 7585, "train/sim_loss": 0.0546875 }, { "epoch": 0.7499505635752423, "step": 7585, "train/total_loss": 0.12312652915716171 }, { "entropy": 8.907880783081055, "epoch": 0.7500494364247577, "mean_token_accuracy": 0.7748031616210938, "num_tokens": 18672716.0, "step": 7586, "train/ce_loss": 4.061507752339821e-06 }, { "epoch": 0.7500494364247577, "step": 7586, "train/sim_loss": 0.0234375 }, { "epoch": 0.7500494364247577, "step": 7586, "train/total_loss": 0.023437906056642532 }, { "entropy": 8.357831954956055, "epoch": 0.7501483092742733, "mean_token_accuracy": 0.688720166683197, "num_tokens": 18678023.0, "step": 7587, "train/ce_loss": 1.2316768169403076 }, { "epoch": 0.7501483092742733, "step": 7587, "train/sim_loss": 0.0390625 }, { "epoch": 0.7501483092742733, "step": 7587, "train/total_loss": 0.16223019361495972 }, { "entropy": 8.60870361328125, "epoch": 0.7502471821237888, "mean_token_accuracy": 0.7943166494369507, "num_tokens": 18683248.0, "step": 7588, "train/ce_loss": 0.751041829586029 }, { "epoch": 0.7502471821237888, "step": 7588, "train/sim_loss": 0.0625 }, { "epoch": 0.7502471821237888, "step": 7588, "train/total_loss": 0.13760417699813843 }, { "entropy": 8.520055770874023, "epoch": 0.7503460549733043, "mean_token_accuracy": 0.7567901015281677, "num_tokens": 18688529.0, "step": 7589, "train/ce_loss": 0.8611323833465576 }, { "epoch": 0.7503460549733043, "step": 7589, "train/sim_loss": 0.046875 }, { "epoch": 0.7503460549733043, "step": 7589, "train/total_loss": 0.13298824429512024 }, { "entropy": 8.39891242980957, "epoch": 0.7504449278228199, "mean_token_accuracy": 0.7409700751304626, "num_tokens": 18693959.0, "step": 7590, "train/ce_loss": 0.603983998298645 }, { "epoch": 0.7504449278228199, "step": 7590, "train/sim_loss": 0.0390625 }, { "epoch": 0.7504449278228199, "step": 7590, "train/total_loss": 0.0994608998298645 }, { "entropy": 8.5487699508667, "epoch": 0.7505438006723354, "mean_token_accuracy": 0.79368656873703, "num_tokens": 18699309.0, "step": 7591, "train/ce_loss": 0.9199407696723938 }, { "epoch": 0.7505438006723354, "step": 7591, "train/sim_loss": 0.05859375 }, { "epoch": 0.7505438006723354, "step": 7591, "train/total_loss": 0.15058782696723938 }, { "entropy": 8.455150604248047, "epoch": 0.7506426735218509, "mean_token_accuracy": 0.8018540143966675, "num_tokens": 18704662.0, "step": 7592, "train/ce_loss": 0.48203450441360474 }, { "epoch": 0.7506426735218509, "step": 7592, "train/sim_loss": 0.0546875 }, { "epoch": 0.7506426735218509, "step": 7592, "train/total_loss": 0.10289095342159271 }, { "entropy": 8.719837188720703, "epoch": 0.7507415463713665, "mean_token_accuracy": 0.7514705657958984, "num_tokens": 18709771.0, "step": 7593, "train/ce_loss": 1.3186227083206177 }, { "epoch": 0.7507415463713665, "step": 7593, "train/sim_loss": 0.05078125 }, { "epoch": 0.7507415463713665, "step": 7593, "train/total_loss": 0.18264351785182953 }, { "entropy": 8.980011940002441, "epoch": 0.750840419220882, "mean_token_accuracy": 0.7803278565406799, "num_tokens": 18714836.0, "step": 7594, "train/ce_loss": 0.9642717242240906 }, { "epoch": 0.750840419220882, "step": 7594, "train/sim_loss": 0.03515625 }, { "epoch": 0.750840419220882, "step": 7594, "train/total_loss": 0.13158342242240906 }, { "entropy": 8.246875762939453, "epoch": 0.7509392920703974, "mean_token_accuracy": 0.6725025773048401, "num_tokens": 18720288.0, "step": 7595, "train/ce_loss": 0.7636457085609436 }, { "epoch": 0.7509392920703974, "step": 7595, "train/sim_loss": 0.05859375 }, { "epoch": 0.7509392920703974, "step": 7595, "train/total_loss": 0.13495832681655884 }, { "entropy": 8.382661819458008, "epoch": 0.751038164919913, "mean_token_accuracy": 0.7502527832984924, "num_tokens": 18725702.0, "step": 7596, "train/ce_loss": 1.2867059707641602 }, { "epoch": 0.751038164919913, "step": 7596, "train/sim_loss": 0.08984375 }, { "epoch": 0.751038164919913, "step": 7596, "train/total_loss": 0.2185143530368805 }, { "entropy": 9.54419231414795, "epoch": 0.7511370377694285, "mean_token_accuracy": 0.7117347121238708, "num_tokens": 18730487.0, "step": 7597, "train/ce_loss": 1.4519308805465698 }, { "epoch": 0.7511370377694285, "step": 7597, "train/sim_loss": 0.04296875 }, { "epoch": 0.7511370377694285, "step": 7597, "train/total_loss": 0.18816183507442474 }, { "entropy": 8.40174674987793, "epoch": 0.751235910618944, "mean_token_accuracy": 0.7516778707504272, "num_tokens": 18735741.0, "step": 7598, "train/ce_loss": 0.98127681016922 }, { "epoch": 0.751235910618944, "step": 7598, "train/sim_loss": 0.0625 }, { "epoch": 0.751235910618944, "step": 7598, "train/total_loss": 0.16062769293785095 }, { "entropy": 8.947654724121094, "epoch": 0.7513347834684596, "mean_token_accuracy": 0.7711864113807678, "num_tokens": 18740836.0, "step": 7599, "train/ce_loss": 1.3571666479110718 }, { "epoch": 0.7513347834684596, "step": 7599, "train/sim_loss": 0.01953125 }, { "epoch": 0.7513347834684596, "step": 7599, "train/total_loss": 0.15524791181087494 }, { "epoch": 0.7514336563179751, "grad_norm": 0.5190445184707642, "learning_rate": 8.123671067596301e-06, "loss": 0.127, "step": 7600 }, { "entropy": 8.397933006286621, "epoch": 0.7514336563179751, "mean_token_accuracy": 0.8217922449111938, "num_tokens": 18746269.0, "step": 7600, "train/ce_loss": 0.34773579239845276 }, { "epoch": 0.7514336563179751, "step": 7600, "train/sim_loss": 0.01953125 }, { "epoch": 0.7514336563179751, "step": 7600, "train/total_loss": 0.054304830729961395 }, { "entropy": 8.61817741394043, "epoch": 0.7515325291674906, "mean_token_accuracy": 0.7633495330810547, "num_tokens": 18751603.0, "step": 7601, "train/ce_loss": 0.6802176833152771 }, { "epoch": 0.7515325291674906, "step": 7601, "train/sim_loss": 0.0234375 }, { "epoch": 0.7515325291674906, "step": 7601, "train/total_loss": 0.09145926684141159 }, { "entropy": 9.059032440185547, "epoch": 0.7516314020170062, "mean_token_accuracy": 0.7538726329803467, "num_tokens": 18756648.0, "step": 7602, "train/ce_loss": 0.9057416319847107 }, { "epoch": 0.7516314020170062, "step": 7602, "train/sim_loss": 0.078125 }, { "epoch": 0.7516314020170062, "step": 7602, "train/total_loss": 0.16869917511940002 }, { "entropy": 9.00408935546875, "epoch": 0.7517302748665217, "mean_token_accuracy": 0.6993569135665894, "num_tokens": 18761939.0, "step": 7603, "train/ce_loss": 0.6494431495666504 }, { "epoch": 0.7517302748665217, "step": 7603, "train/sim_loss": 0.0390625 }, { "epoch": 0.7517302748665217, "step": 7603, "train/total_loss": 0.1040068194270134 }, { "entropy": 9.162939071655273, "epoch": 0.7518291477160371, "mean_token_accuracy": 0.7204301357269287, "num_tokens": 18766923.0, "step": 7604, "train/ce_loss": 1.313880205154419 }, { "epoch": 0.7518291477160371, "step": 7604, "train/sim_loss": 0.0390625 }, { "epoch": 0.7518291477160371, "step": 7604, "train/total_loss": 0.17045052349567413 }, { "entropy": 8.271470069885254, "epoch": 0.7519280205655527, "mean_token_accuracy": 0.777990460395813, "num_tokens": 18772439.0, "step": 7605, "train/ce_loss": 0.4249177575111389 }, { "epoch": 0.7519280205655527, "step": 7605, "train/sim_loss": 0.02734375 }, { "epoch": 0.7519280205655527, "step": 7605, "train/total_loss": 0.06983552873134613 }, { "entropy": 8.742707252502441, "epoch": 0.7520268934150682, "mean_token_accuracy": 0.7394468784332275, "num_tokens": 18777605.0, "step": 7606, "train/ce_loss": 0.6090832948684692 }, { "epoch": 0.7520268934150682, "step": 7606, "train/sim_loss": 0.03515625 }, { "epoch": 0.7520268934150682, "step": 7606, "train/total_loss": 0.09606458246707916 }, { "entropy": 8.429786682128906, "epoch": 0.7521257662645837, "mean_token_accuracy": 0.7768508791923523, "num_tokens": 18782997.0, "step": 7607, "train/ce_loss": 0.7540313601493835 }, { "epoch": 0.7521257662645837, "step": 7607, "train/sim_loss": 0.0234375 }, { "epoch": 0.7521257662645837, "step": 7607, "train/total_loss": 0.0988406389951706 }, { "entropy": 8.465465545654297, "epoch": 0.7522246391140993, "mean_token_accuracy": 0.727918803691864, "num_tokens": 18788440.0, "step": 7608, "train/ce_loss": 0.9121677875518799 }, { "epoch": 0.7522246391140993, "step": 7608, "train/sim_loss": 0.09375 }, { "epoch": 0.7522246391140993, "step": 7608, "train/total_loss": 0.1849667727947235 }, { "entropy": 8.796218872070312, "epoch": 0.7523235119636148, "mean_token_accuracy": 0.7910271286964417, "num_tokens": 18793748.0, "step": 7609, "train/ce_loss": 0.5593013167381287 }, { "epoch": 0.7523235119636148, "step": 7609, "train/sim_loss": 0.0859375 }, { "epoch": 0.7523235119636148, "step": 7609, "train/total_loss": 0.14186763763427734 }, { "entropy": 8.514887809753418, "epoch": 0.7524223848131303, "mean_token_accuracy": 0.7386091351509094, "num_tokens": 18799074.0, "step": 7610, "train/ce_loss": 1.0005974769592285 }, { "epoch": 0.7524223848131303, "step": 7610, "train/sim_loss": 0.0703125 }, { "epoch": 0.7524223848131303, "step": 7610, "train/total_loss": 0.17037224769592285 }, { "entropy": 8.344120025634766, "epoch": 0.7525212576626459, "mean_token_accuracy": 0.7127312421798706, "num_tokens": 18804445.0, "step": 7611, "train/ce_loss": 1.3133176565170288 }, { "epoch": 0.7525212576626459, "step": 7611, "train/sim_loss": 0.08984375 }, { "epoch": 0.7525212576626459, "step": 7611, "train/total_loss": 0.22117552161216736 }, { "entropy": 8.522525787353516, "epoch": 0.7526201305121614, "mean_token_accuracy": 0.7158836722373962, "num_tokens": 18809776.0, "step": 7612, "train/ce_loss": 1.3812496662139893 }, { "epoch": 0.7526201305121614, "step": 7612, "train/sim_loss": 0.0546875 }, { "epoch": 0.7526201305121614, "step": 7612, "train/total_loss": 0.1928124725818634 }, { "entropy": 8.93623161315918, "epoch": 0.7527190033616769, "mean_token_accuracy": 0.7377278804779053, "num_tokens": 18814946.0, "step": 7613, "train/ce_loss": 1.4814845323562622 }, { "epoch": 0.7527190033616769, "step": 7613, "train/sim_loss": 0.08203125 }, { "epoch": 0.7527190033616769, "step": 7613, "train/total_loss": 0.23017971217632294 }, { "entropy": 9.092214584350586, "epoch": 0.7528178762111924, "mean_token_accuracy": 0.7739999890327454, "num_tokens": 18819872.0, "step": 7614, "train/ce_loss": 1.1034413576126099 }, { "epoch": 0.7528178762111924, "step": 7614, "train/sim_loss": 0.05859375 }, { "epoch": 0.7528178762111924, "step": 7614, "train/total_loss": 0.16893789172172546 }, { "entropy": 9.46321964263916, "epoch": 0.7529167490607079, "mean_token_accuracy": 0.7542372941970825, "num_tokens": 18824640.0, "step": 7615, "train/ce_loss": 1.9673778297146782e-05 }, { "epoch": 0.7529167490607079, "step": 7615, "train/sim_loss": 0.0625 }, { "epoch": 0.7529167490607079, "step": 7615, "train/total_loss": 0.06250196695327759 }, { "entropy": 8.586030960083008, "epoch": 0.7530156219102234, "mean_token_accuracy": 0.7251114249229431, "num_tokens": 18829748.0, "step": 7616, "train/ce_loss": 1.3748188018798828 }, { "epoch": 0.7530156219102234, "step": 7616, "train/sim_loss": 0.0390625 }, { "epoch": 0.7530156219102234, "step": 7616, "train/total_loss": 0.17654438316822052 }, { "entropy": 8.770456314086914, "epoch": 0.753114494759739, "mean_token_accuracy": 0.7137546539306641, "num_tokens": 18834980.0, "step": 7617, "train/ce_loss": 1.3555060625076294 }, { "epoch": 0.753114494759739, "step": 7617, "train/sim_loss": 0.0546875 }, { "epoch": 0.753114494759739, "step": 7617, "train/total_loss": 0.1902381032705307 }, { "entropy": 8.679096221923828, "epoch": 0.7532133676092545, "mean_token_accuracy": 0.7106325626373291, "num_tokens": 18840183.0, "step": 7618, "train/ce_loss": 0.9374057054519653 }, { "epoch": 0.7532133676092545, "step": 7618, "train/sim_loss": 0.0703125 }, { "epoch": 0.7532133676092545, "step": 7618, "train/total_loss": 0.1640530824661255 }, { "entropy": 8.480308532714844, "epoch": 0.75331224045877, "mean_token_accuracy": 0.785495400428772, "num_tokens": 18845601.0, "step": 7619, "train/ce_loss": 0.5295276641845703 }, { "epoch": 0.75331224045877, "step": 7619, "train/sim_loss": 0.046875 }, { "epoch": 0.75331224045877, "step": 7619, "train/total_loss": 0.09982776641845703 }, { "epoch": 0.7534111133082856, "grad_norm": 0.5392074584960938, "learning_rate": 8.118726202838353e-06, "loss": 0.1327, "step": 7620 }, { "entropy": 8.986780166625977, "epoch": 0.7534111133082856, "mean_token_accuracy": 0.7759259343147278, "num_tokens": 18850607.0, "step": 7620, "train/ce_loss": 1.2527285814285278 }, { "epoch": 0.7534111133082856, "step": 7620, "train/sim_loss": 0.05078125 }, { "epoch": 0.7534111133082856, "step": 7620, "train/total_loss": 0.17605410516262054 }, { "entropy": 8.76120376586914, "epoch": 0.7535099861578011, "mean_token_accuracy": 0.7209677696228027, "num_tokens": 18855644.0, "step": 7621, "train/ce_loss": 0.6488677263259888 }, { "epoch": 0.7535099861578011, "step": 7621, "train/sim_loss": 0.0390625 }, { "epoch": 0.7535099861578011, "step": 7621, "train/total_loss": 0.10394927114248276 }, { "entropy": 8.42733383178711, "epoch": 0.7536088590073166, "mean_token_accuracy": 0.73204106092453, "num_tokens": 18861003.0, "step": 7622, "train/ce_loss": 0.6457306146621704 }, { "epoch": 0.7536088590073166, "step": 7622, "train/sim_loss": 0.02734375 }, { "epoch": 0.7536088590073166, "step": 7622, "train/total_loss": 0.09191681444644928 }, { "entropy": 9.14065170288086, "epoch": 0.7537077318568322, "mean_token_accuracy": 0.7386363744735718, "num_tokens": 18866022.0, "step": 7623, "train/ce_loss": 1.131330966949463 }, { "epoch": 0.7537077318568322, "step": 7623, "train/sim_loss": 0.03515625 }, { "epoch": 0.7537077318568322, "step": 7623, "train/total_loss": 0.14828935265541077 }, { "entropy": 8.719907760620117, "epoch": 0.7538066047063476, "mean_token_accuracy": 0.725806474685669, "num_tokens": 18871251.0, "step": 7624, "train/ce_loss": 0.8957783579826355 }, { "epoch": 0.7538066047063476, "step": 7624, "train/sim_loss": 0.046875 }, { "epoch": 0.7538066047063476, "step": 7624, "train/total_loss": 0.1364528387784958 }, { "entropy": 8.517269134521484, "epoch": 0.7539054775558631, "mean_token_accuracy": 0.8009478449821472, "num_tokens": 18876563.0, "step": 7625, "train/ce_loss": 0.5261335372924805 }, { "epoch": 0.7539054775558631, "step": 7625, "train/sim_loss": 0.01953125 }, { "epoch": 0.7539054775558631, "step": 7625, "train/total_loss": 0.07214460521936417 }, { "entropy": 9.236428260803223, "epoch": 0.7540043504053787, "mean_token_accuracy": 0.8019230961799622, "num_tokens": 18881525.0, "step": 7626, "train/ce_loss": 1.5224268436431885 }, { "epoch": 0.7540043504053787, "step": 7626, "train/sim_loss": 0.0625 }, { "epoch": 0.7540043504053787, "step": 7626, "train/total_loss": 0.21474269032478333 }, { "entropy": 8.618507385253906, "epoch": 0.7541032232548942, "mean_token_accuracy": 0.7896138429641724, "num_tokens": 18886766.0, "step": 7627, "train/ce_loss": 0.7982921600341797 }, { "epoch": 0.7541032232548942, "step": 7627, "train/sim_loss": 0.0390625 }, { "epoch": 0.7541032232548942, "step": 7627, "train/total_loss": 0.11889171600341797 }, { "entropy": 8.485706329345703, "epoch": 0.7542020961044097, "mean_token_accuracy": 0.7259439826011658, "num_tokens": 18892020.0, "step": 7628, "train/ce_loss": 0.44417646527290344 }, { "epoch": 0.7542020961044097, "step": 7628, "train/sim_loss": 0.0625 }, { "epoch": 0.7542020961044097, "step": 7628, "train/total_loss": 0.10691764950752258 }, { "entropy": 8.981820106506348, "epoch": 0.7543009689539253, "mean_token_accuracy": 0.8368263244628906, "num_tokens": 18897145.0, "step": 7629, "train/ce_loss": 1.4848389582766686e-05 }, { "epoch": 0.7543009689539253, "step": 7629, "train/sim_loss": 0.03515625 }, { "epoch": 0.7543009689539253, "step": 7629, "train/total_loss": 0.035157736390829086 }, { "entropy": 8.820948600769043, "epoch": 0.7543998418034408, "mean_token_accuracy": 0.7624161243438721, "num_tokens": 18902319.0, "step": 7630, "train/ce_loss": 1.2992298603057861 }, { "epoch": 0.7543998418034408, "step": 7630, "train/sim_loss": 0.0859375 }, { "epoch": 0.7543998418034408, "step": 7630, "train/total_loss": 0.2158604860305786 }, { "entropy": 8.083815574645996, "epoch": 0.7544987146529563, "mean_token_accuracy": 0.7002996802330017, "num_tokens": 18907893.0, "step": 7631, "train/ce_loss": 0.42063286900520325 }, { "epoch": 0.7544987146529563, "step": 7631, "train/sim_loss": 0.046875 }, { "epoch": 0.7544987146529563, "step": 7631, "train/total_loss": 0.08893828839063644 }, { "entropy": 8.371919631958008, "epoch": 0.7545975875024719, "mean_token_accuracy": 0.7872105836868286, "num_tokens": 18913288.0, "step": 7632, "train/ce_loss": 0.41169917583465576 }, { "epoch": 0.7545975875024719, "step": 7632, "train/sim_loss": 0.015625 }, { "epoch": 0.7545975875024719, "step": 7632, "train/total_loss": 0.056794919073581696 }, { "entropy": 8.150374412536621, "epoch": 0.7546964603519873, "mean_token_accuracy": 0.7114846110343933, "num_tokens": 18919016.0, "step": 7633, "train/ce_loss": 0.9580610394477844 }, { "epoch": 0.7546964603519873, "step": 7633, "train/sim_loss": 0.05078125 }, { "epoch": 0.7546964603519873, "step": 7633, "train/total_loss": 0.14658735692501068 }, { "entropy": 8.695228576660156, "epoch": 0.7547953332015028, "mean_token_accuracy": 0.7293233275413513, "num_tokens": 18924318.0, "step": 7634, "train/ce_loss": 0.9011909365653992 }, { "epoch": 0.7547953332015028, "step": 7634, "train/sim_loss": 0.10546875 }, { "epoch": 0.7547953332015028, "step": 7634, "train/total_loss": 0.19558784365653992 }, { "entropy": 8.712812423706055, "epoch": 0.7548942060510184, "mean_token_accuracy": 0.7779237627983093, "num_tokens": 18929530.0, "step": 7635, "train/ce_loss": 0.6864867210388184 }, { "epoch": 0.7548942060510184, "step": 7635, "train/sim_loss": 0.015625 }, { "epoch": 0.7548942060510184, "step": 7635, "train/total_loss": 0.08427367359399796 }, { "entropy": 8.558393478393555, "epoch": 0.7549930789005339, "mean_token_accuracy": 0.7250280380249023, "num_tokens": 18934873.0, "step": 7636, "train/ce_loss": 0.49897968769073486 }, { "epoch": 0.7549930789005339, "step": 7636, "train/sim_loss": 0.05859375 }, { "epoch": 0.7549930789005339, "step": 7636, "train/total_loss": 0.10849171876907349 }, { "entropy": 9.561351776123047, "epoch": 0.7550919517500494, "mean_token_accuracy": 0.7879580855369568, "num_tokens": 18939694.0, "step": 7637, "train/ce_loss": 4.886520400759764e-06 }, { "epoch": 0.7550919517500494, "step": 7637, "train/sim_loss": 0.03515625 }, { "epoch": 0.7550919517500494, "step": 7637, "train/total_loss": 0.0351567380130291 }, { "entropy": 9.224124908447266, "epoch": 0.755190824599565, "mean_token_accuracy": 0.7718120813369751, "num_tokens": 18944557.0, "step": 7638, "train/ce_loss": 1.5212565660476685 }, { "epoch": 0.755190824599565, "step": 7638, "train/sim_loss": 0.0625 }, { "epoch": 0.755190824599565, "step": 7638, "train/total_loss": 0.21462565660476685 }, { "entropy": 8.878040313720703, "epoch": 0.7552896974490805, "mean_token_accuracy": 0.7241379022598267, "num_tokens": 18949594.0, "step": 7639, "train/ce_loss": 1.5049303770065308 }, { "epoch": 0.7552896974490805, "step": 7639, "train/sim_loss": 0.1328125 }, { "epoch": 0.7552896974490805, "step": 7639, "train/total_loss": 0.2833055257797241 }, { "epoch": 0.755388570298596, "grad_norm": 0.750734269618988, "learning_rate": 8.113781338080404e-06, "loss": 0.1272, "step": 7640 }, { "entropy": 9.231822967529297, "epoch": 0.755388570298596, "mean_token_accuracy": 0.7209653258323669, "num_tokens": 18954725.0, "step": 7640, "train/ce_loss": 0.9756774306297302 }, { "epoch": 0.755388570298596, "step": 7640, "train/sim_loss": 0.0546875 }, { "epoch": 0.755388570298596, "step": 7640, "train/total_loss": 0.15225523710250854 }, { "entropy": 8.80136489868164, "epoch": 0.7554874431481116, "mean_token_accuracy": 0.8112947940826416, "num_tokens": 18959893.0, "step": 7641, "train/ce_loss": 0.6539521813392639 }, { "epoch": 0.7554874431481116, "step": 7641, "train/sim_loss": 0.046875 }, { "epoch": 0.7554874431481116, "step": 7641, "train/total_loss": 0.11227022111415863 }, { "entropy": 8.258249282836914, "epoch": 0.755586315997627, "mean_token_accuracy": 0.7637571096420288, "num_tokens": 18965483.0, "step": 7642, "train/ce_loss": 0.41775640845298767 }, { "epoch": 0.755586315997627, "step": 7642, "train/sim_loss": 0.0234375 }, { "epoch": 0.755586315997627, "step": 7642, "train/total_loss": 0.065213143825531 }, { "entropy": 8.918859481811523, "epoch": 0.7556851888471425, "mean_token_accuracy": 0.6951219439506531, "num_tokens": 18970580.0, "step": 7643, "train/ce_loss": 1.2000300884246826 }, { "epoch": 0.7556851888471425, "step": 7643, "train/sim_loss": 0.0078125 }, { "epoch": 0.7556851888471425, "step": 7643, "train/total_loss": 0.12781551480293274 }, { "entropy": 8.945146560668945, "epoch": 0.7557840616966581, "mean_token_accuracy": 0.7384615540504456, "num_tokens": 18975692.0, "step": 7644, "train/ce_loss": 0.9329941868782043 }, { "epoch": 0.7557840616966581, "step": 7644, "train/sim_loss": 0.0390625 }, { "epoch": 0.7557840616966581, "step": 7644, "train/total_loss": 0.13236191868782043 }, { "entropy": 8.990866661071777, "epoch": 0.7558829345461736, "mean_token_accuracy": 0.7046783566474915, "num_tokens": 18980792.0, "step": 7645, "train/ce_loss": 0.6352463960647583 }, { "epoch": 0.7558829345461736, "step": 7645, "train/sim_loss": 0.03515625 }, { "epoch": 0.7558829345461736, "step": 7645, "train/total_loss": 0.09868089109659195 }, { "entropy": 8.912968635559082, "epoch": 0.7559818073956891, "mean_token_accuracy": 0.7124277353286743, "num_tokens": 18985920.0, "step": 7646, "train/ce_loss": 1.3944169282913208 }, { "epoch": 0.7559818073956891, "step": 7646, "train/sim_loss": 0.0546875 }, { "epoch": 0.7559818073956891, "step": 7646, "train/total_loss": 0.19412919878959656 }, { "entropy": 8.82077693939209, "epoch": 0.7560806802452047, "mean_token_accuracy": 0.7394822239875793, "num_tokens": 18990998.0, "step": 7647, "train/ce_loss": 1.771406888961792 }, { "epoch": 0.7560806802452047, "step": 7647, "train/sim_loss": 0.09765625 }, { "epoch": 0.7560806802452047, "step": 7647, "train/total_loss": 0.2747969627380371 }, { "entropy": 8.739612579345703, "epoch": 0.7561795530947202, "mean_token_accuracy": 0.7270340919494629, "num_tokens": 18996379.0, "step": 7648, "train/ce_loss": 1.1098839044570923 }, { "epoch": 0.7561795530947202, "step": 7648, "train/sim_loss": 0.05859375 }, { "epoch": 0.7561795530947202, "step": 7648, "train/total_loss": 0.16958214342594147 }, { "entropy": 8.741659164428711, "epoch": 0.7562784259442357, "mean_token_accuracy": 0.7295454740524292, "num_tokens": 19001690.0, "step": 7649, "train/ce_loss": 0.9927825927734375 }, { "epoch": 0.7562784259442357, "step": 7649, "train/sim_loss": 0.03125 }, { "epoch": 0.7562784259442357, "step": 7649, "train/total_loss": 0.1305282711982727 }, { "entropy": 8.605138778686523, "epoch": 0.7563772987937513, "mean_token_accuracy": 0.7696078419685364, "num_tokens": 19006970.0, "step": 7650, "train/ce_loss": 0.8369215130805969 }, { "epoch": 0.7563772987937513, "step": 7650, "train/sim_loss": 0.09375 }, { "epoch": 0.7563772987937513, "step": 7650, "train/total_loss": 0.17744216322898865 }, { "entropy": 8.186044692993164, "epoch": 0.7564761716432667, "mean_token_accuracy": 0.737758457660675, "num_tokens": 19012562.0, "step": 7651, "train/ce_loss": 0.9426973462104797 }, { "epoch": 0.7564761716432667, "step": 7651, "train/sim_loss": 0.1015625 }, { "epoch": 0.7564761716432667, "step": 7651, "train/total_loss": 0.1958322376012802 }, { "entropy": 8.739931106567383, "epoch": 0.7565750444927822, "mean_token_accuracy": 0.7268232107162476, "num_tokens": 19017841.0, "step": 7652, "train/ce_loss": 0.5349259972572327 }, { "epoch": 0.7565750444927822, "step": 7652, "train/sim_loss": 0.04296875 }, { "epoch": 0.7565750444927822, "step": 7652, "train/total_loss": 0.09646135568618774 }, { "entropy": 8.697602272033691, "epoch": 0.7566739173422978, "mean_token_accuracy": 0.7437425255775452, "num_tokens": 19023154.0, "step": 7653, "train/ce_loss": 1.1454800367355347 }, { "epoch": 0.7566739173422978, "step": 7653, "train/sim_loss": 0.09375 }, { "epoch": 0.7566739173422978, "step": 7653, "train/total_loss": 0.208297997713089 }, { "entropy": 8.589914321899414, "epoch": 0.7567727901918133, "mean_token_accuracy": 0.7968127727508545, "num_tokens": 19028452.0, "step": 7654, "train/ce_loss": 0.7477198839187622 }, { "epoch": 0.7567727901918133, "step": 7654, "train/sim_loss": 0.0546875 }, { "epoch": 0.7567727901918133, "step": 7654, "train/total_loss": 0.12945950031280518 }, { "entropy": 8.548648834228516, "epoch": 0.7568716630413288, "mean_token_accuracy": 0.7398452758789062, "num_tokens": 19033960.0, "step": 7655, "train/ce_loss": 0.7226057648658752 }, { "epoch": 0.7568716630413288, "step": 7655, "train/sim_loss": 0.01171875 }, { "epoch": 0.7568716630413288, "step": 7655, "train/total_loss": 0.08397933095693588 }, { "entropy": 8.774779319763184, "epoch": 0.7569705358908444, "mean_token_accuracy": 0.7493638396263123, "num_tokens": 19039250.0, "step": 7656, "train/ce_loss": 0.4386369287967682 }, { "epoch": 0.7569705358908444, "step": 7656, "train/sim_loss": 0.046875 }, { "epoch": 0.7569705358908444, "step": 7656, "train/total_loss": 0.0907386988401413 }, { "entropy": 8.82728099822998, "epoch": 0.7570694087403599, "mean_token_accuracy": 0.7601156234741211, "num_tokens": 19044417.0, "step": 7657, "train/ce_loss": 0.7033409476280212 }, { "epoch": 0.7570694087403599, "step": 7657, "train/sim_loss": 0.046875 }, { "epoch": 0.7570694087403599, "step": 7657, "train/total_loss": 0.11720909923315048 }, { "entropy": 9.062734603881836, "epoch": 0.7571682815898754, "mean_token_accuracy": 0.7332268357276917, "num_tokens": 19049517.0, "step": 7658, "train/ce_loss": 0.7107903957366943 }, { "epoch": 0.7571682815898754, "step": 7658, "train/sim_loss": 0.0625 }, { "epoch": 0.7571682815898754, "step": 7658, "train/total_loss": 0.1335790455341339 }, { "entropy": 8.766202926635742, "epoch": 0.757267154439391, "mean_token_accuracy": 0.7806190848350525, "num_tokens": 19054753.0, "step": 7659, "train/ce_loss": 0.40278881788253784 }, { "epoch": 0.757267154439391, "step": 7659, "train/sim_loss": 0.03125 }, { "epoch": 0.757267154439391, "step": 7659, "train/total_loss": 0.07152888178825378 }, { "epoch": 0.7573660272889065, "grad_norm": 0.594711184501648, "learning_rate": 8.108836473322456e-06, "loss": 0.1295, "step": 7660 }, { "entropy": 9.129072189331055, "epoch": 0.7573660272889065, "mean_token_accuracy": 0.7789473533630371, "num_tokens": 19059764.0, "step": 7660, "train/ce_loss": 0.7323489785194397 }, { "epoch": 0.7573660272889065, "step": 7660, "train/sim_loss": 0.05078125 }, { "epoch": 0.7573660272889065, "step": 7660, "train/total_loss": 0.12401615083217621 }, { "entropy": 8.544218063354492, "epoch": 0.7574649001384219, "mean_token_accuracy": 0.7217194437980652, "num_tokens": 19065131.0, "step": 7661, "train/ce_loss": 0.8963987827301025 }, { "epoch": 0.7574649001384219, "step": 7661, "train/sim_loss": 0.0234375 }, { "epoch": 0.7574649001384219, "step": 7661, "train/total_loss": 0.11307737976312637 }, { "entropy": 8.525057792663574, "epoch": 0.7575637729879375, "mean_token_accuracy": 0.7340301871299744, "num_tokens": 19070457.0, "step": 7662, "train/ce_loss": 0.6648871898651123 }, { "epoch": 0.7575637729879375, "step": 7662, "train/sim_loss": 0.0390625 }, { "epoch": 0.7575637729879375, "step": 7662, "train/total_loss": 0.10555122047662735 }, { "entropy": 8.885919570922852, "epoch": 0.757662645837453, "mean_token_accuracy": 0.7124773859977722, "num_tokens": 19075429.0, "step": 7663, "train/ce_loss": 1.6520631334060454e-06 }, { "epoch": 0.757662645837453, "step": 7663, "train/sim_loss": 0.05859375 }, { "epoch": 0.757662645837453, "step": 7663, "train/total_loss": 0.05859391391277313 }, { "entropy": 9.214609146118164, "epoch": 0.7577615186869685, "mean_token_accuracy": 0.6887966990470886, "num_tokens": 19080345.0, "step": 7664, "train/ce_loss": 3.675885182019556e-06 }, { "epoch": 0.7577615186869685, "step": 7664, "train/sim_loss": 0.0390625 }, { "epoch": 0.7577615186869685, "step": 7664, "train/total_loss": 0.03906286880373955 }, { "entropy": 8.944068908691406, "epoch": 0.7578603915364841, "mean_token_accuracy": 0.696825385093689, "num_tokens": 19085386.0, "step": 7665, "train/ce_loss": 0.9921472072601318 }, { "epoch": 0.7578603915364841, "step": 7665, "train/sim_loss": 0.0390625 }, { "epoch": 0.7578603915364841, "step": 7665, "train/total_loss": 0.13827723264694214 }, { "entropy": 8.536942481994629, "epoch": 0.7579592643859996, "mean_token_accuracy": 0.7810304164886475, "num_tokens": 19090667.0, "step": 7666, "train/ce_loss": 0.8015487790107727 }, { "epoch": 0.7579592643859996, "step": 7666, "train/sim_loss": 0.01953125 }, { "epoch": 0.7579592643859996, "step": 7666, "train/total_loss": 0.09968613088130951 }, { "entropy": 8.433038711547852, "epoch": 0.7580581372355151, "mean_token_accuracy": 0.7573891878128052, "num_tokens": 19095914.0, "step": 7667, "train/ce_loss": 0.6970519423484802 }, { "epoch": 0.7580581372355151, "step": 7667, "train/sim_loss": 0.0859375 }, { "epoch": 0.7580581372355151, "step": 7667, "train/total_loss": 0.15564268827438354 }, { "entropy": 9.189191818237305, "epoch": 0.7581570100850307, "mean_token_accuracy": 0.8009592294692993, "num_tokens": 19100740.0, "step": 7668, "train/ce_loss": 1.1578792333602905 }, { "epoch": 0.7581570100850307, "step": 7668, "train/sim_loss": 0.0703125 }, { "epoch": 0.7581570100850307, "step": 7668, "train/total_loss": 0.18610042333602905 }, { "entropy": 8.543998718261719, "epoch": 0.7582558829345462, "mean_token_accuracy": 0.7582417726516724, "num_tokens": 19106022.0, "step": 7669, "train/ce_loss": 0.551040530204773 }, { "epoch": 0.7582558829345462, "step": 7669, "train/sim_loss": 0.03125 }, { "epoch": 0.7582558829345462, "step": 7669, "train/total_loss": 0.08635405451059341 }, { "entropy": 8.42531967163086, "epoch": 0.7583547557840618, "mean_token_accuracy": 0.7783669233322144, "num_tokens": 19111455.0, "step": 7670, "train/ce_loss": 0.9934836626052856 }, { "epoch": 0.7583547557840618, "step": 7670, "train/sim_loss": 0.0625 }, { "epoch": 0.7583547557840618, "step": 7670, "train/total_loss": 0.16184836626052856 }, { "entropy": 8.398706436157227, "epoch": 0.7584536286335772, "mean_token_accuracy": 0.770893394947052, "num_tokens": 19116607.0, "step": 7671, "train/ce_loss": 0.5111764073371887 }, { "epoch": 0.7584536286335772, "step": 7671, "train/sim_loss": 0.046875 }, { "epoch": 0.7584536286335772, "step": 7671, "train/total_loss": 0.09799264371395111 }, { "entropy": 8.344941139221191, "epoch": 0.7585525014830927, "mean_token_accuracy": 0.7381930351257324, "num_tokens": 19122070.0, "step": 7672, "train/ce_loss": 1.1934471130371094 }, { "epoch": 0.7585525014830927, "step": 7672, "train/sim_loss": 0.078125 }, { "epoch": 0.7585525014830927, "step": 7672, "train/total_loss": 0.19746971130371094 }, { "entropy": 8.620504379272461, "epoch": 0.7586513743326083, "mean_token_accuracy": 0.7267637252807617, "num_tokens": 19127440.0, "step": 7673, "train/ce_loss": 1.2869994640350342 }, { "epoch": 0.7586513743326083, "step": 7673, "train/sim_loss": 0.109375 }, { "epoch": 0.7586513743326083, "step": 7673, "train/total_loss": 0.23807494342327118 }, { "entropy": 8.172987937927246, "epoch": 0.7587502471821238, "mean_token_accuracy": 0.7366120219230652, "num_tokens": 19132849.0, "step": 7674, "train/ce_loss": 0.7104565501213074 }, { "epoch": 0.7587502471821238, "step": 7674, "train/sim_loss": 0.03125 }, { "epoch": 0.7587502471821238, "step": 7674, "train/total_loss": 0.1022956594824791 }, { "entropy": 8.5946683883667, "epoch": 0.7588491200316393, "mean_token_accuracy": 0.8109685182571411, "num_tokens": 19138180.0, "step": 7675, "train/ce_loss": 0.5208238363265991 }, { "epoch": 0.7588491200316393, "step": 7675, "train/sim_loss": 0.01953125 }, { "epoch": 0.7588491200316393, "step": 7675, "train/total_loss": 0.07161363959312439 }, { "entropy": 8.85075569152832, "epoch": 0.7589479928811549, "mean_token_accuracy": 0.8359073400497437, "num_tokens": 19143134.0, "step": 7676, "train/ce_loss": 1.2283612489700317 }, { "epoch": 0.7589479928811549, "step": 7676, "train/sim_loss": 0.03125 }, { "epoch": 0.7589479928811549, "step": 7676, "train/total_loss": 0.1540861278772354 }, { "entropy": 9.028573989868164, "epoch": 0.7590468657306704, "mean_token_accuracy": 0.7280265092849731, "num_tokens": 19148196.0, "step": 7677, "train/ce_loss": 1.2335330247879028 }, { "epoch": 0.7590468657306704, "step": 7677, "train/sim_loss": 0.09375 }, { "epoch": 0.7590468657306704, "step": 7677, "train/total_loss": 0.21710330247879028 }, { "entropy": 8.929908752441406, "epoch": 0.7591457385801859, "mean_token_accuracy": 0.7253731489181519, "num_tokens": 19153459.0, "step": 7678, "train/ce_loss": 1.0745279788970947 }, { "epoch": 0.7591457385801859, "step": 7678, "train/sim_loss": 0.04296875 }, { "epoch": 0.7591457385801859, "step": 7678, "train/total_loss": 0.15042155981063843 }, { "entropy": 8.580034255981445, "epoch": 0.7592446114297015, "mean_token_accuracy": 0.7003567218780518, "num_tokens": 19158817.0, "step": 7679, "train/ce_loss": 0.9963113069534302 }, { "epoch": 0.7592446114297015, "step": 7679, "train/sim_loss": 0.06640625 }, { "epoch": 0.7592446114297015, "step": 7679, "train/total_loss": 0.16603738069534302 }, { "epoch": 0.7593434842792169, "grad_norm": 0.6969523429870605, "learning_rate": 8.103891608564506e-06, "loss": 0.1333, "step": 7680 }, { "entropy": 8.960241317749023, "epoch": 0.7593434842792169, "mean_token_accuracy": 0.7313974499702454, "num_tokens": 19163806.0, "step": 7680, "train/ce_loss": 1.0162924528121948 }, { "epoch": 0.7593434842792169, "step": 7680, "train/sim_loss": 0.109375 }, { "epoch": 0.7593434842792169, "step": 7680, "train/total_loss": 0.21100425720214844 }, { "entropy": 8.218679428100586, "epoch": 0.7594423571287324, "mean_token_accuracy": 0.7494692206382751, "num_tokens": 19169271.0, "step": 7681, "train/ce_loss": 0.5055243968963623 }, { "epoch": 0.7594423571287324, "step": 7681, "train/sim_loss": 0.015625 }, { "epoch": 0.7594423571287324, "step": 7681, "train/total_loss": 0.06617744266986847 }, { "entropy": 8.814424514770508, "epoch": 0.759541229978248, "mean_token_accuracy": 0.6930533051490784, "num_tokens": 19174363.0, "step": 7682, "train/ce_loss": 1.045782446861267 }, { "epoch": 0.759541229978248, "step": 7682, "train/sim_loss": 0.046875 }, { "epoch": 0.759541229978248, "step": 7682, "train/total_loss": 0.15145325660705566 }, { "entropy": 8.498937606811523, "epoch": 0.7596401028277635, "mean_token_accuracy": 0.6663055419921875, "num_tokens": 19179733.0, "step": 7683, "train/ce_loss": 1.204154372215271 }, { "epoch": 0.7596401028277635, "step": 7683, "train/sim_loss": 0.03515625 }, { "epoch": 0.7596401028277635, "step": 7683, "train/total_loss": 0.15557169914245605 }, { "entropy": 8.231889724731445, "epoch": 0.759738975677279, "mean_token_accuracy": 0.7150654792785645, "num_tokens": 19185097.0, "step": 7684, "train/ce_loss": 0.5256108045578003 }, { "epoch": 0.759738975677279, "step": 7684, "train/sim_loss": 0.05078125 }, { "epoch": 0.759738975677279, "step": 7684, "train/total_loss": 0.10334233194589615 }, { "entropy": 8.79952621459961, "epoch": 0.7598378485267946, "mean_token_accuracy": 0.7369901537895203, "num_tokens": 19190257.0, "step": 7685, "train/ce_loss": 2.6327656996727455e-06 }, { "epoch": 0.7598378485267946, "step": 7685, "train/sim_loss": 0.03515625 }, { "epoch": 0.7598378485267946, "step": 7685, "train/total_loss": 0.03515651449561119 }, { "entropy": 8.854053497314453, "epoch": 0.7599367213763101, "mean_token_accuracy": 0.7942177057266235, "num_tokens": 19195280.0, "step": 7686, "train/ce_loss": 2.11320639209589e-06 }, { "epoch": 0.7599367213763101, "step": 7686, "train/sim_loss": 0.03515625 }, { "epoch": 0.7599367213763101, "step": 7686, "train/total_loss": 0.03515646234154701 }, { "entropy": 8.679353713989258, "epoch": 0.7600355942258256, "mean_token_accuracy": 0.7257217764854431, "num_tokens": 19200515.0, "step": 7687, "train/ce_loss": 1.6108123064041138 }, { "epoch": 0.7600355942258256, "step": 7687, "train/sim_loss": 0.0546875 }, { "epoch": 0.7600355942258256, "step": 7687, "train/total_loss": 0.2157687395811081 }, { "entropy": 8.521926879882812, "epoch": 0.7601344670753412, "mean_token_accuracy": 0.7382199168205261, "num_tokens": 19205680.0, "step": 7688, "train/ce_loss": 0.5365400910377502 }, { "epoch": 0.7601344670753412, "step": 7688, "train/sim_loss": 0.0625 }, { "epoch": 0.7601344670753412, "step": 7688, "train/total_loss": 0.1161540150642395 }, { "entropy": 8.752731323242188, "epoch": 0.7602333399248566, "mean_token_accuracy": 0.6937212944030762, "num_tokens": 19210798.0, "step": 7689, "train/ce_loss": 7.696427246628446e-07 }, { "epoch": 0.7602333399248566, "step": 7689, "train/sim_loss": 0.015625 }, { "epoch": 0.7602333399248566, "step": 7689, "train/total_loss": 0.01562507636845112 }, { "entropy": 9.152226448059082, "epoch": 0.7603322127743721, "mean_token_accuracy": 0.8024096488952637, "num_tokens": 19215634.0, "step": 7690, "train/ce_loss": 2.3737229639664292e-06 }, { "epoch": 0.7603322127743721, "step": 7690, "train/sim_loss": 0.04296875 }, { "epoch": 0.7603322127743721, "step": 7690, "train/total_loss": 0.0429689884185791 }, { "entropy": 8.405892372131348, "epoch": 0.7604310856238877, "mean_token_accuracy": 0.7084308862686157, "num_tokens": 19220961.0, "step": 7691, "train/ce_loss": 1.2519909143447876 }, { "epoch": 0.7604310856238877, "step": 7691, "train/sim_loss": 0.0546875 }, { "epoch": 0.7604310856238877, "step": 7691, "train/total_loss": 0.179886594414711 }, { "entropy": 8.562359809875488, "epoch": 0.7605299584734032, "mean_token_accuracy": 0.7176079750061035, "num_tokens": 19226386.0, "step": 7692, "train/ce_loss": 0.8045241236686707 }, { "epoch": 0.7605299584734032, "step": 7692, "train/sim_loss": 0.05078125 }, { "epoch": 0.7605299584734032, "step": 7692, "train/total_loss": 0.13123366236686707 }, { "entropy": 8.601509094238281, "epoch": 0.7606288313229187, "mean_token_accuracy": 0.7753058671951294, "num_tokens": 19231750.0, "step": 7693, "train/ce_loss": 1.0604190826416016 }, { "epoch": 0.7606288313229187, "step": 7693, "train/sim_loss": 0.06640625 }, { "epoch": 0.7606288313229187, "step": 7693, "train/total_loss": 0.17244815826416016 }, { "entropy": 8.766267776489258, "epoch": 0.7607277041724343, "mean_token_accuracy": 0.7398843765258789, "num_tokens": 19236937.0, "step": 7694, "train/ce_loss": 0.9179567694664001 }, { "epoch": 0.7607277041724343, "step": 7694, "train/sim_loss": 0.06640625 }, { "epoch": 0.7607277041724343, "step": 7694, "train/total_loss": 0.1582019329071045 }, { "entropy": 9.615768432617188, "epoch": 0.7608265770219498, "mean_token_accuracy": 0.7142857313156128, "num_tokens": 19241813.0, "step": 7695, "train/ce_loss": 1.6348321437835693 }, { "epoch": 0.7608265770219498, "step": 7695, "train/sim_loss": 0.07421875 }, { "epoch": 0.7608265770219498, "step": 7695, "train/total_loss": 0.23770196735858917 }, { "entropy": 9.021735191345215, "epoch": 0.7609254498714653, "mean_token_accuracy": 0.7733812928199768, "num_tokens": 19246819.0, "step": 7696, "train/ce_loss": 1.9381379843252944e-06 }, { "epoch": 0.7609254498714653, "step": 7696, "train/sim_loss": 0.04296875 }, { "epoch": 0.7609254498714653, "step": 7696, "train/total_loss": 0.04296894371509552 }, { "entropy": 8.358237266540527, "epoch": 0.7610243227209809, "mean_token_accuracy": 0.7392776608467102, "num_tokens": 19252203.0, "step": 7697, "train/ce_loss": 0.967032790184021 }, { "epoch": 0.7610243227209809, "step": 7697, "train/sim_loss": 0.07421875 }, { "epoch": 0.7610243227209809, "step": 7697, "train/total_loss": 0.17092204093933105 }, { "entropy": 8.386516571044922, "epoch": 0.7611231955704963, "mean_token_accuracy": 0.7639034390449524, "num_tokens": 19257644.0, "step": 7698, "train/ce_loss": 0.7730222344398499 }, { "epoch": 0.7611231955704963, "step": 7698, "train/sim_loss": 0.05078125 }, { "epoch": 0.7611231955704963, "step": 7698, "train/total_loss": 0.1280834674835205 }, { "entropy": 9.069533348083496, "epoch": 0.7612220684200118, "mean_token_accuracy": 0.7192192077636719, "num_tokens": 19262769.0, "step": 7699, "train/ce_loss": 0.921979546546936 }, { "epoch": 0.7612220684200118, "step": 7699, "train/sim_loss": 0.07421875 }, { "epoch": 0.7612220684200118, "step": 7699, "train/total_loss": 0.1664167046546936 }, { "epoch": 0.7613209412695274, "grad_norm": 0.6902703046798706, "learning_rate": 8.098946743806557e-06, "loss": 0.1421, "step": 7700 }, { "entropy": 8.63135051727295, "epoch": 0.7613209412695274, "mean_token_accuracy": 0.6991676688194275, "num_tokens": 19268070.0, "step": 7700, "train/ce_loss": 0.6126058101654053 }, { "epoch": 0.7613209412695274, "step": 7700, "train/sim_loss": 0.02734375 }, { "epoch": 0.7613209412695274, "step": 7700, "train/total_loss": 0.08860433101654053 }, { "entropy": 9.117762565612793, "epoch": 0.7614198141190429, "mean_token_accuracy": 0.7804877758026123, "num_tokens": 19273207.0, "step": 7701, "train/ce_loss": 0.7227744460105896 }, { "epoch": 0.7614198141190429, "step": 7701, "train/sim_loss": 0.0703125 }, { "epoch": 0.7614198141190429, "step": 7701, "train/total_loss": 0.14258995652198792 }, { "entropy": 8.944087982177734, "epoch": 0.7615186869685584, "mean_token_accuracy": 0.7549341917037964, "num_tokens": 19278312.0, "step": 7702, "train/ce_loss": 0.7299985885620117 }, { "epoch": 0.7615186869685584, "step": 7702, "train/sim_loss": 0.0390625 }, { "epoch": 0.7615186869685584, "step": 7702, "train/total_loss": 0.11206235736608505 }, { "entropy": 8.509510040283203, "epoch": 0.761617559818074, "mean_token_accuracy": 0.7464967966079712, "num_tokens": 19283519.0, "step": 7703, "train/ce_loss": 1.4568054676055908 }, { "epoch": 0.761617559818074, "step": 7703, "train/sim_loss": 0.0625 }, { "epoch": 0.761617559818074, "step": 7703, "train/total_loss": 0.20818054676055908 }, { "entropy": 8.294575691223145, "epoch": 0.7617164326675895, "mean_token_accuracy": 0.7869177460670471, "num_tokens": 19289017.0, "step": 7704, "train/ce_loss": 0.7559100389480591 }, { "epoch": 0.7617164326675895, "step": 7704, "train/sim_loss": 0.04296875 }, { "epoch": 0.7617164326675895, "step": 7704, "train/total_loss": 0.11855975538492203 }, { "entropy": 8.896890640258789, "epoch": 0.761815305517105, "mean_token_accuracy": 0.7196382284164429, "num_tokens": 19294196.0, "step": 7705, "train/ce_loss": 1.29708731174469 }, { "epoch": 0.761815305517105, "step": 7705, "train/sim_loss": 0.03515625 }, { "epoch": 0.761815305517105, "step": 7705, "train/total_loss": 0.16486498713493347 }, { "entropy": 8.722881317138672, "epoch": 0.7619141783666206, "mean_token_accuracy": 0.7140718698501587, "num_tokens": 19299325.0, "step": 7706, "train/ce_loss": 1.6583621501922607 }, { "epoch": 0.7619141783666206, "step": 7706, "train/sim_loss": 0.0703125 }, { "epoch": 0.7619141783666206, "step": 7706, "train/total_loss": 0.23614871501922607 }, { "entropy": 9.242901802062988, "epoch": 0.762013051216136, "mean_token_accuracy": 0.7539823055267334, "num_tokens": 19304310.0, "step": 7707, "train/ce_loss": 1.672262191772461 }, { "epoch": 0.762013051216136, "step": 7707, "train/sim_loss": 0.06640625 }, { "epoch": 0.762013051216136, "step": 7707, "train/total_loss": 0.23363247513771057 }, { "entropy": 8.592472076416016, "epoch": 0.7621119240656515, "mean_token_accuracy": 0.7590798735618591, "num_tokens": 19309635.0, "step": 7708, "train/ce_loss": 0.5775005221366882 }, { "epoch": 0.7621119240656515, "step": 7708, "train/sim_loss": 0.03125 }, { "epoch": 0.7621119240656515, "step": 7708, "train/total_loss": 0.08900005370378494 }, { "entropy": 8.906074523925781, "epoch": 0.7622107969151671, "mean_token_accuracy": 0.7651098966598511, "num_tokens": 19314794.0, "step": 7709, "train/ce_loss": 0.581382155418396 }, { "epoch": 0.7622107969151671, "step": 7709, "train/sim_loss": 0.03125 }, { "epoch": 0.7622107969151671, "step": 7709, "train/total_loss": 0.08938822150230408 }, { "entropy": 8.652992248535156, "epoch": 0.7623096697646826, "mean_token_accuracy": 0.7920299172401428, "num_tokens": 19320053.0, "step": 7710, "train/ce_loss": 1.3876551389694214 }, { "epoch": 0.7623096697646826, "step": 7710, "train/sim_loss": 0.08984375 }, { "epoch": 0.7623096697646826, "step": 7710, "train/total_loss": 0.22860926389694214 }, { "entropy": 9.118717193603516, "epoch": 0.7624085426141981, "mean_token_accuracy": 0.7709029912948608, "num_tokens": 19325094.0, "step": 7711, "train/ce_loss": 1.1129260063171387 }, { "epoch": 0.7624085426141981, "step": 7711, "train/sim_loss": 0.078125 }, { "epoch": 0.7624085426141981, "step": 7711, "train/total_loss": 0.18941760063171387 }, { "entropy": 9.099111557006836, "epoch": 0.7625074154637137, "mean_token_accuracy": 0.7534013390541077, "num_tokens": 19330099.0, "step": 7712, "train/ce_loss": 0.6406109929084778 }, { "epoch": 0.7625074154637137, "step": 7712, "train/sim_loss": 0.0234375 }, { "epoch": 0.7625074154637137, "step": 7712, "train/total_loss": 0.08749859780073166 }, { "entropy": 9.474559783935547, "epoch": 0.7626062883132292, "mean_token_accuracy": 0.7681940793991089, "num_tokens": 19334908.0, "step": 7713, "train/ce_loss": 2.5190438464051113e-06 }, { "epoch": 0.7626062883132292, "step": 7713, "train/sim_loss": 0.07421875 }, { "epoch": 0.7626062883132292, "step": 7713, "train/total_loss": 0.0742190033197403 }, { "entropy": 8.941295623779297, "epoch": 0.7627051611627447, "mean_token_accuracy": 0.7441217303276062, "num_tokens": 19340115.0, "step": 7714, "train/ce_loss": 0.86009681224823 }, { "epoch": 0.7627051611627447, "step": 7714, "train/sim_loss": 0.0625 }, { "epoch": 0.7627051611627447, "step": 7714, "train/total_loss": 0.148509681224823 }, { "entropy": 9.293888092041016, "epoch": 0.7628040340122603, "mean_token_accuracy": 0.7445651888847351, "num_tokens": 19345188.0, "step": 7715, "train/ce_loss": 6.578445663762977e-06 }, { "epoch": 0.7628040340122603, "step": 7715, "train/sim_loss": 0.015625 }, { "epoch": 0.7628040340122603, "step": 7715, "train/total_loss": 0.01562565751373768 }, { "entropy": 8.775178909301758, "epoch": 0.7629029068617758, "mean_token_accuracy": 0.7128713130950928, "num_tokens": 19350483.0, "step": 7716, "train/ce_loss": 0.5081300139427185 }, { "epoch": 0.7629029068617758, "step": 7716, "train/sim_loss": 0.0546875 }, { "epoch": 0.7629029068617758, "step": 7716, "train/total_loss": 0.10550050437450409 }, { "entropy": 8.708213806152344, "epoch": 0.7630017797112912, "mean_token_accuracy": 0.7303128242492676, "num_tokens": 19356076.0, "step": 7717, "train/ce_loss": 0.9680280089378357 }, { "epoch": 0.7630017797112912, "step": 7717, "train/sim_loss": 0.078125 }, { "epoch": 0.7630017797112912, "step": 7717, "train/total_loss": 0.17492780089378357 }, { "entropy": 9.063175201416016, "epoch": 0.7631006525608068, "mean_token_accuracy": 0.7174280881881714, "num_tokens": 19361135.0, "step": 7718, "train/ce_loss": 1.0929538011550903 }, { "epoch": 0.7631006525608068, "step": 7718, "train/sim_loss": 0.07421875 }, { "epoch": 0.7631006525608068, "step": 7718, "train/total_loss": 0.18351413309574127 }, { "entropy": 8.598108291625977, "epoch": 0.7631995254103223, "mean_token_accuracy": 0.7684848308563232, "num_tokens": 19366456.0, "step": 7719, "train/ce_loss": 0.6938397288322449 }, { "epoch": 0.7631995254103223, "step": 7719, "train/sim_loss": 0.046875 }, { "epoch": 0.7631995254103223, "step": 7719, "train/total_loss": 0.11625897139310837 }, { "epoch": 0.7632983982598378, "grad_norm": 0.7268943786621094, "learning_rate": 8.09400187904861e-06, "loss": 0.1302, "step": 7720 }, { "entropy": 9.239712715148926, "epoch": 0.7632983982598378, "mean_token_accuracy": 0.7887640595436096, "num_tokens": 19371358.0, "step": 7720, "train/ce_loss": 0.9097400903701782 }, { "epoch": 0.7632983982598378, "step": 7720, "train/sim_loss": 0.02734375 }, { "epoch": 0.7632983982598378, "step": 7720, "train/total_loss": 0.11831776052713394 }, { "entropy": 8.484024047851562, "epoch": 0.7633972711093534, "mean_token_accuracy": 0.8020594716072083, "num_tokens": 19376667.0, "step": 7721, "train/ce_loss": 0.716927170753479 }, { "epoch": 0.7633972711093534, "step": 7721, "train/sim_loss": 0.05859375 }, { "epoch": 0.7633972711093534, "step": 7721, "train/total_loss": 0.13028647005558014 }, { "entropy": 8.522327423095703, "epoch": 0.7634961439588689, "mean_token_accuracy": 0.7739975452423096, "num_tokens": 19381971.0, "step": 7722, "train/ce_loss": 0.5556038022041321 }, { "epoch": 0.7634961439588689, "step": 7722, "train/sim_loss": 0.03125 }, { "epoch": 0.7634961439588689, "step": 7722, "train/total_loss": 0.08681038022041321 }, { "entropy": 8.600162506103516, "epoch": 0.7635950168083844, "mean_token_accuracy": 0.7875862121582031, "num_tokens": 19387182.0, "step": 7723, "train/ce_loss": 0.3888521194458008 }, { "epoch": 0.7635950168083844, "step": 7723, "train/sim_loss": 0.02734375 }, { "epoch": 0.7635950168083844, "step": 7723, "train/total_loss": 0.0662289634346962 }, { "entropy": 8.18398380279541, "epoch": 0.7636938896579, "mean_token_accuracy": 0.7144240140914917, "num_tokens": 19392683.0, "step": 7724, "train/ce_loss": 0.916076123714447 }, { "epoch": 0.7636938896579, "step": 7724, "train/sim_loss": 0.01953125 }, { "epoch": 0.7636938896579, "step": 7724, "train/total_loss": 0.11113886535167694 }, { "entropy": 9.19622802734375, "epoch": 0.7637927625074155, "mean_token_accuracy": 0.7403846383094788, "num_tokens": 19397348.0, "step": 7725, "train/ce_loss": 3.756938167498447e-05 }, { "epoch": 0.7637927625074155, "step": 7725, "train/sim_loss": 0.046875 }, { "epoch": 0.7637927625074155, "step": 7725, "train/total_loss": 0.04687875509262085 }, { "entropy": 8.722827911376953, "epoch": 0.7638916353569309, "mean_token_accuracy": 0.7213695645332336, "num_tokens": 19402684.0, "step": 7726, "train/ce_loss": 0.4669986367225647 }, { "epoch": 0.7638916353569309, "step": 7726, "train/sim_loss": 0.02734375 }, { "epoch": 0.7638916353569309, "step": 7726, "train/total_loss": 0.07404361665248871 }, { "entropy": 9.330000877380371, "epoch": 0.7639905082064465, "mean_token_accuracy": 0.7877358198165894, "num_tokens": 19407544.0, "step": 7727, "train/ce_loss": 3.991949597548228e-06 }, { "epoch": 0.7639905082064465, "step": 7727, "train/sim_loss": 0.03125 }, { "epoch": 0.7639905082064465, "step": 7727, "train/total_loss": 0.031250398606061935 }, { "entropy": 8.632328987121582, "epoch": 0.764089381055962, "mean_token_accuracy": 0.732824444770813, "num_tokens": 19412908.0, "step": 7728, "train/ce_loss": 0.6812427043914795 }, { "epoch": 0.764089381055962, "step": 7728, "train/sim_loss": 0.03125 }, { "epoch": 0.764089381055962, "step": 7728, "train/total_loss": 0.09937427192926407 }, { "entropy": 9.08932113647461, "epoch": 0.7641882539054775, "mean_token_accuracy": 0.692307710647583, "num_tokens": 19417906.0, "step": 7729, "train/ce_loss": 2.049284375971183e-06 }, { "epoch": 0.7641882539054775, "step": 7729, "train/sim_loss": 0.04296875 }, { "epoch": 0.7641882539054775, "step": 7729, "train/total_loss": 0.042968954890966415 }, { "entropy": 9.123553276062012, "epoch": 0.7642871267549931, "mean_token_accuracy": 0.7763370871543884, "num_tokens": 19422907.0, "step": 7730, "train/ce_loss": 1.2972058057785034 }, { "epoch": 0.7642871267549931, "step": 7730, "train/sim_loss": 0.03125 }, { "epoch": 0.7642871267549931, "step": 7730, "train/total_loss": 0.16097058355808258 }, { "entropy": 9.108598709106445, "epoch": 0.7643859996045086, "mean_token_accuracy": 0.7902207970619202, "num_tokens": 19427959.0, "step": 7731, "train/ce_loss": 0.8736315965652466 }, { "epoch": 0.7643859996045086, "step": 7731, "train/sim_loss": 0.02734375 }, { "epoch": 0.7643859996045086, "step": 7731, "train/total_loss": 0.11470691114664078 }, { "entropy": 9.450088500976562, "epoch": 0.7644848724540241, "mean_token_accuracy": 0.7854166626930237, "num_tokens": 19432839.0, "step": 7732, "train/ce_loss": 1.6410548369094613e-06 }, { "epoch": 0.7644848724540241, "step": 7732, "train/sim_loss": 0.015625 }, { "epoch": 0.7644848724540241, "step": 7732, "train/total_loss": 0.015625163912773132 }, { "entropy": 9.395781517028809, "epoch": 0.7645837453035397, "mean_token_accuracy": 0.7614678740501404, "num_tokens": 19437698.0, "step": 7733, "train/ce_loss": 8.319220796693116e-06 }, { "epoch": 0.7645837453035397, "step": 7733, "train/sim_loss": 0.0390625 }, { "epoch": 0.7645837453035397, "step": 7733, "train/total_loss": 0.03906333073973656 }, { "entropy": 9.136443138122559, "epoch": 0.7646826181530552, "mean_token_accuracy": 0.7687296271324158, "num_tokens": 19442776.0, "step": 7734, "train/ce_loss": 1.2883407407571212e-06 }, { "epoch": 0.7646826181530552, "step": 7734, "train/sim_loss": 0.015625 }, { "epoch": 0.7646826181530552, "step": 7734, "train/total_loss": 0.015625128522515297 }, { "entropy": 9.37185287475586, "epoch": 0.7647814910025706, "mean_token_accuracy": 0.7629063129425049, "num_tokens": 19447748.0, "step": 7735, "train/ce_loss": 0.5439682602882385 }, { "epoch": 0.7647814910025706, "step": 7735, "train/sim_loss": 0.0625 }, { "epoch": 0.7647814910025706, "step": 7735, "train/total_loss": 0.11689682304859161 }, { "entropy": 9.62160587310791, "epoch": 0.7648803638520862, "mean_token_accuracy": 0.7392995953559875, "num_tokens": 19452418.0, "step": 7736, "train/ce_loss": 4.22458151660976e-06 }, { "epoch": 0.7648803638520862, "step": 7736, "train/sim_loss": 0.04296875 }, { "epoch": 0.7648803638520862, "step": 7736, "train/total_loss": 0.042969170957803726 }, { "entropy": 9.938680648803711, "epoch": 0.7649792367016017, "mean_token_accuracy": 0.7355931997299194, "num_tokens": 19457076.0, "step": 7737, "train/ce_loss": 3.6609737890103133e-06 }, { "epoch": 0.7649792367016017, "step": 7737, "train/sim_loss": 0.08203125 }, { "epoch": 0.7649792367016017, "step": 7737, "train/total_loss": 0.08203161507844925 }, { "entropy": 8.995643615722656, "epoch": 0.7650781095511172, "mean_token_accuracy": 0.7534818649291992, "num_tokens": 19462245.0, "step": 7738, "train/ce_loss": 0.4997852146625519 }, { "epoch": 0.7650781095511172, "step": 7738, "train/sim_loss": 0.0625 }, { "epoch": 0.7650781095511172, "step": 7738, "train/total_loss": 0.11247852444648743 }, { "entropy": 8.558038711547852, "epoch": 0.7651769824006328, "mean_token_accuracy": 0.7941550016403198, "num_tokens": 19467522.0, "step": 7739, "train/ce_loss": 0.5175499320030212 }, { "epoch": 0.7651769824006328, "step": 7739, "train/sim_loss": 0.09375 }, { "epoch": 0.7651769824006328, "step": 7739, "train/total_loss": 0.14550499618053436 }, { "epoch": 0.7652758552501483, "grad_norm": 0.7859126925468445, "learning_rate": 8.08905701429066e-06, "loss": 0.1202, "step": 7740 }, { "entropy": 8.586603164672852, "epoch": 0.7652758552501483, "mean_token_accuracy": 0.6853233575820923, "num_tokens": 19472819.0, "step": 7740, "train/ce_loss": 1.1033329963684082 }, { "epoch": 0.7652758552501483, "step": 7740, "train/sim_loss": 0.04296875 }, { "epoch": 0.7652758552501483, "step": 7740, "train/total_loss": 0.15330204367637634 }, { "entropy": 9.153886795043945, "epoch": 0.7653747280996638, "mean_token_accuracy": 0.7885532379150391, "num_tokens": 19477902.0, "step": 7741, "train/ce_loss": 1.0411827564239502 }, { "epoch": 0.7653747280996638, "step": 7741, "train/sim_loss": 0.04296875 }, { "epoch": 0.7653747280996638, "step": 7741, "train/total_loss": 0.14708703756332397 }, { "entropy": 8.211365699768066, "epoch": 0.7654736009491794, "mean_token_accuracy": 0.7018633484840393, "num_tokens": 19483491.0, "step": 7742, "train/ce_loss": 1.3559170961380005 }, { "epoch": 0.7654736009491794, "step": 7742, "train/sim_loss": 0.0703125 }, { "epoch": 0.7654736009491794, "step": 7742, "train/total_loss": 0.20590421557426453 }, { "entropy": 8.63299560546875, "epoch": 0.7655724737986949, "mean_token_accuracy": 0.7731529474258423, "num_tokens": 19489070.0, "step": 7743, "train/ce_loss": 0.27736401557922363 }, { "epoch": 0.7655724737986949, "step": 7743, "train/sim_loss": 0.01953125 }, { "epoch": 0.7655724737986949, "step": 7743, "train/total_loss": 0.04726765304803848 }, { "entropy": 8.707715034484863, "epoch": 0.7656713466482103, "mean_token_accuracy": 0.7670384049415588, "num_tokens": 19494288.0, "step": 7744, "train/ce_loss": 1.2483643293380737 }, { "epoch": 0.7656713466482103, "step": 7744, "train/sim_loss": 0.03515625 }, { "epoch": 0.7656713466482103, "step": 7744, "train/total_loss": 0.15999269485473633 }, { "entropy": 8.702041625976562, "epoch": 0.765770219497726, "mean_token_accuracy": 0.7841463685035706, "num_tokens": 19499582.0, "step": 7745, "train/ce_loss": 0.7944613695144653 }, { "epoch": 0.765770219497726, "step": 7745, "train/sim_loss": 0.08984375 }, { "epoch": 0.765770219497726, "step": 7745, "train/total_loss": 0.16928988695144653 }, { "entropy": 8.619491577148438, "epoch": 0.7658690923472414, "mean_token_accuracy": 0.7274590134620667, "num_tokens": 19505160.0, "step": 7746, "train/ce_loss": 0.8761892318725586 }, { "epoch": 0.7658690923472414, "step": 7746, "train/sim_loss": 0.03515625 }, { "epoch": 0.7658690923472414, "step": 7746, "train/total_loss": 0.12277517467737198 }, { "entropy": 8.589058876037598, "epoch": 0.7659679651967569, "mean_token_accuracy": 0.7404494285583496, "num_tokens": 19510518.0, "step": 7747, "train/ce_loss": 1.9129421710968018 }, { "epoch": 0.7659679651967569, "step": 7747, "train/sim_loss": 0.05078125 }, { "epoch": 0.7659679651967569, "step": 7747, "train/total_loss": 0.24207547307014465 }, { "entropy": 8.918684005737305, "epoch": 0.7660668380462725, "mean_token_accuracy": 0.7492957711219788, "num_tokens": 19515653.0, "step": 7748, "train/ce_loss": 0.5139737129211426 }, { "epoch": 0.7660668380462725, "step": 7748, "train/sim_loss": 0.05859375 }, { "epoch": 0.7660668380462725, "step": 7748, "train/total_loss": 0.10999111831188202 }, { "entropy": 8.723108291625977, "epoch": 0.766165710895788, "mean_token_accuracy": 0.7662178874015808, "num_tokens": 19520907.0, "step": 7749, "train/ce_loss": 0.4100935757160187 }, { "epoch": 0.766165710895788, "step": 7749, "train/sim_loss": 0.03515625 }, { "epoch": 0.766165710895788, "step": 7749, "train/total_loss": 0.07616560906171799 }, { "entropy": 8.271391868591309, "epoch": 0.7662645837453035, "mean_token_accuracy": 0.744027316570282, "num_tokens": 19526263.0, "step": 7750, "train/ce_loss": 0.9203963875770569 }, { "epoch": 0.7662645837453035, "step": 7750, "train/sim_loss": 0.06640625 }, { "epoch": 0.7662645837453035, "step": 7750, "train/total_loss": 0.15844589471817017 }, { "entropy": 8.823776245117188, "epoch": 0.7663634565948191, "mean_token_accuracy": 0.726123571395874, "num_tokens": 19531429.0, "step": 7751, "train/ce_loss": 0.8118509650230408 }, { "epoch": 0.7663634565948191, "step": 7751, "train/sim_loss": 0.0546875 }, { "epoch": 0.7663634565948191, "step": 7751, "train/total_loss": 0.13587260246276855 }, { "entropy": 9.326594352722168, "epoch": 0.7664623294443346, "mean_token_accuracy": 0.7789255976676941, "num_tokens": 19536297.0, "step": 7752, "train/ce_loss": 2.020547071879264e-06 }, { "epoch": 0.7664623294443346, "step": 7752, "train/sim_loss": 0.02734375 }, { "epoch": 0.7664623294443346, "step": 7752, "train/total_loss": 0.027343951165676117 }, { "entropy": 8.197793960571289, "epoch": 0.7665612022938502, "mean_token_accuracy": 0.7110352516174316, "num_tokens": 19541592.0, "step": 7753, "train/ce_loss": 0.9869194030761719 }, { "epoch": 0.7665612022938502, "step": 7753, "train/sim_loss": 0.0546875 }, { "epoch": 0.7665612022938502, "step": 7753, "train/total_loss": 0.1533794403076172 }, { "entropy": 8.259000778198242, "epoch": 0.7666600751433656, "mean_token_accuracy": 0.7468785643577576, "num_tokens": 19546926.0, "step": 7754, "train/ce_loss": 0.7809675931930542 }, { "epoch": 0.7666600751433656, "step": 7754, "train/sim_loss": 0.01953125 }, { "epoch": 0.7666600751433656, "step": 7754, "train/total_loss": 0.09762801229953766 }, { "entropy": 8.879019737243652, "epoch": 0.7667589479928811, "mean_token_accuracy": 0.7185473442077637, "num_tokens": 19552104.0, "step": 7755, "train/ce_loss": 1.5787826776504517 }, { "epoch": 0.7667589479928811, "step": 7755, "train/sim_loss": 0.12890625 }, { "epoch": 0.7667589479928811, "step": 7755, "train/total_loss": 0.2867845296859741 }, { "entropy": 8.442885398864746, "epoch": 0.7668578208423967, "mean_token_accuracy": 0.6954148411750793, "num_tokens": 19557481.0, "step": 7756, "train/ce_loss": 1.7079004049301147 }, { "epoch": 0.7668578208423967, "step": 7756, "train/sim_loss": 0.09765625 }, { "epoch": 0.7668578208423967, "step": 7756, "train/total_loss": 0.26844629645347595 }, { "entropy": 8.493907928466797, "epoch": 0.7669566936919122, "mean_token_accuracy": 0.7236994504928589, "num_tokens": 19562820.0, "step": 7757, "train/ce_loss": 0.990061342716217 }, { "epoch": 0.7669566936919122, "step": 7757, "train/sim_loss": 0.02734375 }, { "epoch": 0.7669566936919122, "step": 7757, "train/total_loss": 0.12634989619255066 }, { "entropy": 8.28232192993164, "epoch": 0.7670555665414277, "mean_token_accuracy": 0.6814891695976257, "num_tokens": 19568273.0, "step": 7758, "train/ce_loss": 0.9605075120925903 }, { "epoch": 0.7670555665414277, "step": 7758, "train/sim_loss": 0.07421875 }, { "epoch": 0.7670555665414277, "step": 7758, "train/total_loss": 0.17026950418949127 }, { "entropy": 8.471275329589844, "epoch": 0.7671544393909433, "mean_token_accuracy": 0.7878788113594055, "num_tokens": 19573710.0, "step": 7759, "train/ce_loss": 0.8600389361381531 }, { "epoch": 0.7671544393909433, "step": 7759, "train/sim_loss": 0.0546875 }, { "epoch": 0.7671544393909433, "step": 7759, "train/total_loss": 0.14069139957427979 }, { "epoch": 0.7672533122404588, "grad_norm": 0.5270076990127563, "learning_rate": 8.084112149532712e-06, "loss": 0.134, "step": 7760 }, { "entropy": 8.327375411987305, "epoch": 0.7672533122404588, "mean_token_accuracy": 0.7164750695228577, "num_tokens": 19579186.0, "step": 7760, "train/ce_loss": 0.4279042184352875 }, { "epoch": 0.7672533122404588, "step": 7760, "train/sim_loss": 0.0390625 }, { "epoch": 0.7672533122404588, "step": 7760, "train/total_loss": 0.08185292780399323 }, { "entropy": 8.624473571777344, "epoch": 0.7673521850899743, "mean_token_accuracy": 0.7292225360870361, "num_tokens": 19584373.0, "step": 7761, "train/ce_loss": 1.4330039448395837e-06 }, { "epoch": 0.7673521850899743, "step": 7761, "train/sim_loss": 0.03125 }, { "epoch": 0.7673521850899743, "step": 7761, "train/total_loss": 0.03125014156103134 }, { "entropy": 8.348209381103516, "epoch": 0.7674510579394899, "mean_token_accuracy": 0.769070029258728, "num_tokens": 19589785.0, "step": 7762, "train/ce_loss": 0.5385426878929138 }, { "epoch": 0.7674510579394899, "step": 7762, "train/sim_loss": 0.015625 }, { "epoch": 0.7674510579394899, "step": 7762, "train/total_loss": 0.06947927176952362 }, { "entropy": 8.514656066894531, "epoch": 0.7675499307890054, "mean_token_accuracy": 0.6915422677993774, "num_tokens": 19594851.0, "step": 7763, "train/ce_loss": 2.1961233615875244 }, { "epoch": 0.7675499307890054, "step": 7763, "train/sim_loss": 0.05859375 }, { "epoch": 0.7675499307890054, "step": 7763, "train/total_loss": 0.27820611000061035 }, { "entropy": 8.7510347366333, "epoch": 0.7676488036385208, "mean_token_accuracy": 0.7577937841415405, "num_tokens": 19600108.0, "step": 7764, "train/ce_loss": 0.3642151355743408 }, { "epoch": 0.7676488036385208, "step": 7764, "train/sim_loss": 0.046875 }, { "epoch": 0.7676488036385208, "step": 7764, "train/total_loss": 0.0832965150475502 }, { "entropy": 8.535284042358398, "epoch": 0.7677476764880364, "mean_token_accuracy": 0.7045454382896423, "num_tokens": 19605586.0, "step": 7765, "train/ce_loss": 0.8237572312355042 }, { "epoch": 0.7677476764880364, "step": 7765, "train/sim_loss": 0.02734375 }, { "epoch": 0.7677476764880364, "step": 7765, "train/total_loss": 0.10971947759389877 }, { "entropy": 8.440315246582031, "epoch": 0.7678465493375519, "mean_token_accuracy": 0.7381489872932434, "num_tokens": 19610983.0, "step": 7766, "train/ce_loss": 0.9059736132621765 }, { "epoch": 0.7678465493375519, "step": 7766, "train/sim_loss": 0.02734375 }, { "epoch": 0.7678465493375519, "step": 7766, "train/total_loss": 0.11794111132621765 }, { "entropy": 8.703001022338867, "epoch": 0.7679454221870674, "mean_token_accuracy": 0.6919592022895813, "num_tokens": 19616312.0, "step": 7767, "train/ce_loss": 1.5078632831573486 }, { "epoch": 0.7679454221870674, "step": 7767, "train/sim_loss": 0.09375 }, { "epoch": 0.7679454221870674, "step": 7767, "train/total_loss": 0.24453632533550262 }, { "entropy": 9.096664428710938, "epoch": 0.768044295036583, "mean_token_accuracy": 0.7035830616950989, "num_tokens": 19621410.0, "step": 7768, "train/ce_loss": 1.1963167190551758 }, { "epoch": 0.768044295036583, "step": 7768, "train/sim_loss": 0.09765625 }, { "epoch": 0.768044295036583, "step": 7768, "train/total_loss": 0.21728792786598206 }, { "entropy": 8.492799758911133, "epoch": 0.7681431678860985, "mean_token_accuracy": 0.7325301170349121, "num_tokens": 19626713.0, "step": 7769, "train/ce_loss": 0.6306684613227844 }, { "epoch": 0.7681431678860985, "step": 7769, "train/sim_loss": 0.078125 }, { "epoch": 0.7681431678860985, "step": 7769, "train/total_loss": 0.14119184017181396 }, { "entropy": 8.575288772583008, "epoch": 0.768242040735614, "mean_token_accuracy": 0.7642276287078857, "num_tokens": 19632059.0, "step": 7770, "train/ce_loss": 0.5693222284317017 }, { "epoch": 0.768242040735614, "step": 7770, "train/sim_loss": 0.03515625 }, { "epoch": 0.768242040735614, "step": 7770, "train/total_loss": 0.0920884758234024 }, { "entropy": 8.951510429382324, "epoch": 0.7683409135851296, "mean_token_accuracy": 0.6703296899795532, "num_tokens": 19637046.0, "step": 7771, "train/ce_loss": 2.1792502403259277 }, { "epoch": 0.7683409135851296, "step": 7771, "train/sim_loss": 0.0546875 }, { "epoch": 0.7683409135851296, "step": 7771, "train/total_loss": 0.2726125121116638 }, { "entropy": 8.60496711730957, "epoch": 0.7684397864346451, "mean_token_accuracy": 0.7296416759490967, "num_tokens": 19642430.0, "step": 7772, "train/ce_loss": 0.6584407687187195 }, { "epoch": 0.7684397864346451, "step": 7772, "train/sim_loss": 0.046875 }, { "epoch": 0.7684397864346451, "step": 7772, "train/total_loss": 0.1127190813422203 }, { "entropy": 8.587507247924805, "epoch": 0.7685386592841605, "mean_token_accuracy": 0.7205567359924316, "num_tokens": 19647828.0, "step": 7773, "train/ce_loss": 0.542307436466217 }, { "epoch": 0.7685386592841605, "step": 7773, "train/sim_loss": 0.0390625 }, { "epoch": 0.7685386592841605, "step": 7773, "train/total_loss": 0.09329324960708618 }, { "entropy": 8.893842697143555, "epoch": 0.7686375321336761, "mean_token_accuracy": 0.770380437374115, "num_tokens": 19652950.0, "step": 7774, "train/ce_loss": 1.1178854703903198 }, { "epoch": 0.7686375321336761, "step": 7774, "train/sim_loss": 0.1171875 }, { "epoch": 0.7686375321336761, "step": 7774, "train/total_loss": 0.2289760410785675 }, { "entropy": 8.655986785888672, "epoch": 0.7687364049831916, "mean_token_accuracy": 0.7772215008735657, "num_tokens": 19658209.0, "step": 7775, "train/ce_loss": 0.47028452157974243 }, { "epoch": 0.7687364049831916, "step": 7775, "train/sim_loss": 0.03515625 }, { "epoch": 0.7687364049831916, "step": 7775, "train/total_loss": 0.08218470215797424 }, { "entropy": 9.172534942626953, "epoch": 0.7688352778327071, "mean_token_accuracy": 0.7229129672050476, "num_tokens": 19663189.0, "step": 7776, "train/ce_loss": 1.1840168099297443e-06 }, { "epoch": 0.7688352778327071, "step": 7776, "train/sim_loss": 0.015625 }, { "epoch": 0.7688352778327071, "step": 7776, "train/total_loss": 0.01562511920928955 }, { "entropy": 8.723504066467285, "epoch": 0.7689341506822227, "mean_token_accuracy": 0.7420494556427002, "num_tokens": 19668212.0, "step": 7777, "train/ce_loss": 1.058593988418579 }, { "epoch": 0.7689341506822227, "step": 7777, "train/sim_loss": 0.046875 }, { "epoch": 0.7689341506822227, "step": 7777, "train/total_loss": 0.1527343988418579 }, { "entropy": 8.734945297241211, "epoch": 0.7690330235317382, "mean_token_accuracy": 0.7875568866729736, "num_tokens": 19673230.0, "step": 7778, "train/ce_loss": 0.6322168111801147 }, { "epoch": 0.7690330235317382, "step": 7778, "train/sim_loss": 0.05859375 }, { "epoch": 0.7690330235317382, "step": 7778, "train/total_loss": 0.12181543558835983 }, { "entropy": 8.808191299438477, "epoch": 0.7691318963812537, "mean_token_accuracy": 0.7994100451469421, "num_tokens": 19678355.0, "step": 7779, "train/ce_loss": 0.9891080260276794 }, { "epoch": 0.7691318963812537, "step": 7779, "train/sim_loss": 0.03515625 }, { "epoch": 0.7691318963812537, "step": 7779, "train/total_loss": 0.13406705856323242 }, { "epoch": 0.7692307692307693, "grad_norm": 0.5392338633537292, "learning_rate": 8.07916728477476e-06, "loss": 0.1423, "step": 7780 }, { "entropy": 8.632076263427734, "epoch": 0.7692307692307693, "mean_token_accuracy": 0.740406334400177, "num_tokens": 19683747.0, "step": 7780, "train/ce_loss": 0.6915012001991272 }, { "epoch": 0.7692307692307693, "step": 7780, "train/sim_loss": 0.06640625 }, { "epoch": 0.7692307692307693, "step": 7780, "train/total_loss": 0.13555637001991272 }, { "entropy": 8.572942733764648, "epoch": 0.7693296420802848, "mean_token_accuracy": 0.7436463832855225, "num_tokens": 19689155.0, "step": 7781, "train/ce_loss": 1.505046010017395 }, { "epoch": 0.7693296420802848, "step": 7781, "train/sim_loss": 0.046875 }, { "epoch": 0.7693296420802848, "step": 7781, "train/total_loss": 0.19737960398197174 }, { "entropy": 8.728029251098633, "epoch": 0.7694285149298002, "mean_token_accuracy": 0.7652778029441833, "num_tokens": 19694368.0, "step": 7782, "train/ce_loss": 0.4747575521469116 }, { "epoch": 0.7694285149298002, "step": 7782, "train/sim_loss": 0.0703125 }, { "epoch": 0.7694285149298002, "step": 7782, "train/total_loss": 0.11778825521469116 }, { "entropy": 8.992225646972656, "epoch": 0.7695273877793158, "mean_token_accuracy": 0.8357142806053162, "num_tokens": 19699488.0, "step": 7783, "train/ce_loss": 0.41708889603614807 }, { "epoch": 0.7695273877793158, "step": 7783, "train/sim_loss": 0.03125 }, { "epoch": 0.7695273877793158, "step": 7783, "train/total_loss": 0.07295888662338257 }, { "entropy": 9.007128715515137, "epoch": 0.7696262606288313, "mean_token_accuracy": 0.7003424763679504, "num_tokens": 19704472.0, "step": 7784, "train/ce_loss": 2.804467840178404e-06 }, { "epoch": 0.7696262606288313, "step": 7784, "train/sim_loss": 0.08203125 }, { "epoch": 0.7696262606288313, "step": 7784, "train/total_loss": 0.08203153312206268 }, { "entropy": 8.808753967285156, "epoch": 0.7697251334783468, "mean_token_accuracy": 0.7394578456878662, "num_tokens": 19709610.0, "step": 7785, "train/ce_loss": 5.0079838729288895e-06 }, { "epoch": 0.7697251334783468, "step": 7785, "train/sim_loss": 0.04296875 }, { "epoch": 0.7697251334783468, "step": 7785, "train/total_loss": 0.042969249188899994 }, { "entropy": 8.428844451904297, "epoch": 0.7698240063278624, "mean_token_accuracy": 0.7607496976852417, "num_tokens": 19715017.0, "step": 7786, "train/ce_loss": 0.3310222625732422 }, { "epoch": 0.7698240063278624, "step": 7786, "train/sim_loss": 0.046875 }, { "epoch": 0.7698240063278624, "step": 7786, "train/total_loss": 0.07997722923755646 }, { "entropy": 8.50446605682373, "epoch": 0.7699228791773779, "mean_token_accuracy": 0.8143203854560852, "num_tokens": 19720320.0, "step": 7787, "train/ce_loss": 0.5868676900863647 }, { "epoch": 0.7699228791773779, "step": 7787, "train/sim_loss": 0.01171875 }, { "epoch": 0.7699228791773779, "step": 7787, "train/total_loss": 0.0704055204987526 }, { "entropy": 8.214300155639648, "epoch": 0.7700217520268934, "mean_token_accuracy": 0.7140077948570251, "num_tokens": 19725838.0, "step": 7788, "train/ce_loss": 0.5881697535514832 }, { "epoch": 0.7700217520268934, "step": 7788, "train/sim_loss": 0.0390625 }, { "epoch": 0.7700217520268934, "step": 7788, "train/total_loss": 0.09787947684526443 }, { "entropy": 8.687908172607422, "epoch": 0.770120624876409, "mean_token_accuracy": 0.7725381255149841, "num_tokens": 19731025.0, "step": 7789, "train/ce_loss": 0.4396521747112274 }, { "epoch": 0.770120624876409, "step": 7789, "train/sim_loss": 0.03125 }, { "epoch": 0.770120624876409, "step": 7789, "train/total_loss": 0.07521522045135498 }, { "entropy": 8.553461074829102, "epoch": 0.7702194977259245, "mean_token_accuracy": 0.7950581312179565, "num_tokens": 19736225.0, "step": 7790, "train/ce_loss": 0.9271291494369507 }, { "epoch": 0.7702194977259245, "step": 7790, "train/sim_loss": 0.0390625 }, { "epoch": 0.7702194977259245, "step": 7790, "train/total_loss": 0.1317754089832306 }, { "entropy": 8.774941444396973, "epoch": 0.77031837057544, "mean_token_accuracy": 0.7512626051902771, "num_tokens": 19741488.0, "step": 7791, "train/ce_loss": 1.02925443649292 }, { "epoch": 0.77031837057544, "step": 7791, "train/sim_loss": 0.0703125 }, { "epoch": 0.77031837057544, "step": 7791, "train/total_loss": 0.17323794960975647 }, { "entropy": 8.831399917602539, "epoch": 0.7704172434249555, "mean_token_accuracy": 0.7074742317199707, "num_tokens": 19746683.0, "step": 7792, "train/ce_loss": 1.3657745122909546 }, { "epoch": 0.7704172434249555, "step": 7792, "train/sim_loss": 0.0703125 }, { "epoch": 0.7704172434249555, "step": 7792, "train/total_loss": 0.20688995718955994 }, { "entropy": 8.7943115234375, "epoch": 0.770516116274471, "mean_token_accuracy": 0.7727952003479004, "num_tokens": 19751793.0, "step": 7793, "train/ce_loss": 0.578055202960968 }, { "epoch": 0.770516116274471, "step": 7793, "train/sim_loss": 0.0859375 }, { "epoch": 0.770516116274471, "step": 7793, "train/total_loss": 0.14374302327632904 }, { "entropy": 8.386873245239258, "epoch": 0.7706149891239865, "mean_token_accuracy": 0.7150714993476868, "num_tokens": 19757159.0, "step": 7794, "train/ce_loss": 0.5064057111740112 }, { "epoch": 0.7706149891239865, "step": 7794, "train/sim_loss": 0.01953125 }, { "epoch": 0.7706149891239865, "step": 7794, "train/total_loss": 0.07017181813716888 }, { "entropy": 8.634651184082031, "epoch": 0.7707138619735021, "mean_token_accuracy": 0.7295454740524292, "num_tokens": 19762451.0, "step": 7795, "train/ce_loss": 0.6535912752151489 }, { "epoch": 0.7707138619735021, "step": 7795, "train/sim_loss": 0.03515625 }, { "epoch": 0.7707138619735021, "step": 7795, "train/total_loss": 0.10051538050174713 }, { "entropy": 8.393540382385254, "epoch": 0.7708127348230176, "mean_token_accuracy": 0.7106382846832275, "num_tokens": 19768056.0, "step": 7796, "train/ce_loss": 0.6739010810852051 }, { "epoch": 0.7708127348230176, "step": 7796, "train/sim_loss": 0.05078125 }, { "epoch": 0.7708127348230176, "step": 7796, "train/total_loss": 0.11817135661840439 }, { "entropy": 8.74815845489502, "epoch": 0.7709116076725331, "mean_token_accuracy": 0.7723463773727417, "num_tokens": 19773324.0, "step": 7797, "train/ce_loss": 0.4139121174812317 }, { "epoch": 0.7709116076725331, "step": 7797, "train/sim_loss": 0.03125 }, { "epoch": 0.7709116076725331, "step": 7797, "train/total_loss": 0.07264120876789093 }, { "entropy": 9.048473358154297, "epoch": 0.7710104805220487, "mean_token_accuracy": 0.7356114983558655, "num_tokens": 19778355.0, "step": 7798, "train/ce_loss": 2.1130688310222467e-06 }, { "epoch": 0.7710104805220487, "step": 7798, "train/sim_loss": 0.05859375 }, { "epoch": 0.7710104805220487, "step": 7798, "train/total_loss": 0.05859396234154701 }, { "entropy": 8.616962432861328, "epoch": 0.7711093533715642, "mean_token_accuracy": 0.7152858972549438, "num_tokens": 19783653.0, "step": 7799, "train/ce_loss": 0.5495577454566956 }, { "epoch": 0.7711093533715642, "step": 7799, "train/sim_loss": 0.03125 }, { "epoch": 0.7711093533715642, "step": 7799, "train/total_loss": 0.08620578050613403 }, { "epoch": 0.7712082262210797, "grad_norm": 0.6130492091178894, "learning_rate": 8.074222420016813e-06, "loss": 0.13, "step": 7800 }, { "entropy": 8.810811996459961, "epoch": 0.7712082262210797, "mean_token_accuracy": 0.7312661409378052, "num_tokens": 19788897.0, "step": 7800, "train/ce_loss": 0.4379151165485382 }, { "epoch": 0.7712082262210797, "step": 7800, "train/sim_loss": 0.05078125 }, { "epoch": 0.7712082262210797, "step": 7800, "train/total_loss": 0.0945727676153183 }, { "entropy": 8.515292167663574, "epoch": 0.7713070990705952, "mean_token_accuracy": 0.7429577708244324, "num_tokens": 19794191.0, "step": 7801, "train/ce_loss": 0.9319631457328796 }, { "epoch": 0.7713070990705952, "step": 7801, "train/sim_loss": 0.0703125 }, { "epoch": 0.7713070990705952, "step": 7801, "train/total_loss": 0.1635088175535202 }, { "entropy": 9.327791213989258, "epoch": 0.7714059719201107, "mean_token_accuracy": 0.7540650367736816, "num_tokens": 19799088.0, "step": 7802, "train/ce_loss": 4.191006610199111e-06 }, { "epoch": 0.7714059719201107, "step": 7802, "train/sim_loss": 0.02734375 }, { "epoch": 0.7714059719201107, "step": 7802, "train/total_loss": 0.027344169095158577 }, { "entropy": 8.99296760559082, "epoch": 0.7715048447696262, "mean_token_accuracy": 0.7601810097694397, "num_tokens": 19804183.0, "step": 7803, "train/ce_loss": 0.6521633863449097 }, { "epoch": 0.7715048447696262, "step": 7803, "train/sim_loss": 0.0234375 }, { "epoch": 0.7715048447696262, "step": 7803, "train/total_loss": 0.08865384012460709 }, { "entropy": 8.937102317810059, "epoch": 0.7716037176191418, "mean_token_accuracy": 0.7731829285621643, "num_tokens": 19809453.0, "step": 7804, "train/ce_loss": 1.5798154890944716e-06 }, { "epoch": 0.7716037176191418, "step": 7804, "train/sim_loss": 0.0390625 }, { "epoch": 0.7716037176191418, "step": 7804, "train/total_loss": 0.039062656462192535 }, { "entropy": 9.016761779785156, "epoch": 0.7717025904686573, "mean_token_accuracy": 0.7131931185722351, "num_tokens": 19814573.0, "step": 7805, "train/ce_loss": 1.0731233358383179 }, { "epoch": 0.7717025904686573, "step": 7805, "train/sim_loss": 0.03125 }, { "epoch": 0.7717025904686573, "step": 7805, "train/total_loss": 0.13856233656406403 }, { "entropy": 8.710519790649414, "epoch": 0.7718014633181728, "mean_token_accuracy": 0.7759398221969604, "num_tokens": 19819705.0, "step": 7806, "train/ce_loss": 0.8306262493133545 }, { "epoch": 0.7718014633181728, "step": 7806, "train/sim_loss": 0.0703125 }, { "epoch": 0.7718014633181728, "step": 7806, "train/total_loss": 0.15337511897087097 }, { "entropy": 8.52718734741211, "epoch": 0.7719003361676884, "mean_token_accuracy": 0.7378410696983337, "num_tokens": 19825064.0, "step": 7807, "train/ce_loss": 0.7315281629562378 }, { "epoch": 0.7719003361676884, "step": 7807, "train/sim_loss": 0.0625 }, { "epoch": 0.7719003361676884, "step": 7807, "train/total_loss": 0.1356528103351593 }, { "entropy": 8.339141845703125, "epoch": 0.7719992090172039, "mean_token_accuracy": 0.7184035181999207, "num_tokens": 19830435.0, "step": 7808, "train/ce_loss": 0.5292571783065796 }, { "epoch": 0.7719992090172039, "step": 7808, "train/sim_loss": 0.02734375 }, { "epoch": 0.7719992090172039, "step": 7808, "train/total_loss": 0.0802694708108902 }, { "entropy": 9.055339813232422, "epoch": 0.7720980818667194, "mean_token_accuracy": 0.7195122241973877, "num_tokens": 19835580.0, "step": 7809, "train/ce_loss": 1.4505902528762817 }, { "epoch": 0.7720980818667194, "step": 7809, "train/sim_loss": 0.05078125 }, { "epoch": 0.7720980818667194, "step": 7809, "train/total_loss": 0.1958402842283249 }, { "entropy": 9.082110404968262, "epoch": 0.772196954716235, "mean_token_accuracy": 0.7423664331436157, "num_tokens": 19840588.0, "step": 7810, "train/ce_loss": 1.3220585584640503 }, { "epoch": 0.772196954716235, "step": 7810, "train/sim_loss": 0.05078125 }, { "epoch": 0.772196954716235, "step": 7810, "train/total_loss": 0.18298710882663727 }, { "entropy": 8.77676773071289, "epoch": 0.7722958275657504, "mean_token_accuracy": 0.7403461933135986, "num_tokens": 19845779.0, "step": 7811, "train/ce_loss": 0.6185296177864075 }, { "epoch": 0.7722958275657504, "step": 7811, "train/sim_loss": 0.03125 }, { "epoch": 0.7722958275657504, "step": 7811, "train/total_loss": 0.09310296177864075 }, { "entropy": 8.527267456054688, "epoch": 0.7723947004152659, "mean_token_accuracy": 0.7119205594062805, "num_tokens": 19851209.0, "step": 7812, "train/ce_loss": 0.42909958958625793 }, { "epoch": 0.7723947004152659, "step": 7812, "train/sim_loss": 0.04296875 }, { "epoch": 0.7723947004152659, "step": 7812, "train/total_loss": 0.08587871491909027 }, { "entropy": 8.80923080444336, "epoch": 0.7724935732647815, "mean_token_accuracy": 0.7142857313156128, "num_tokens": 19856396.0, "step": 7813, "train/ce_loss": 1.7006196975708008 }, { "epoch": 0.7724935732647815, "step": 7813, "train/sim_loss": 0.0703125 }, { "epoch": 0.7724935732647815, "step": 7813, "train/total_loss": 0.24037447571754456 }, { "entropy": 8.203269958496094, "epoch": 0.772592446114297, "mean_token_accuracy": 0.7153153419494629, "num_tokens": 19861948.0, "step": 7814, "train/ce_loss": 0.7045519948005676 }, { "epoch": 0.772592446114297, "step": 7814, "train/sim_loss": 0.05078125 }, { "epoch": 0.772592446114297, "step": 7814, "train/total_loss": 0.12123645097017288 }, { "entropy": 9.388669967651367, "epoch": 0.7726913189638125, "mean_token_accuracy": 0.7602591514587402, "num_tokens": 19866819.0, "step": 7815, "train/ce_loss": 1.1423147916793823 }, { "epoch": 0.7726913189638125, "step": 7815, "train/sim_loss": 0.08984375 }, { "epoch": 0.7726913189638125, "step": 7815, "train/total_loss": 0.20407523214817047 }, { "entropy": 9.166075706481934, "epoch": 0.7727901918133281, "mean_token_accuracy": 0.8114104866981506, "num_tokens": 19871845.0, "step": 7816, "train/ce_loss": 0.8951804041862488 }, { "epoch": 0.7727901918133281, "step": 7816, "train/sim_loss": 0.0234375 }, { "epoch": 0.7727901918133281, "step": 7816, "train/total_loss": 0.11295554041862488 }, { "entropy": 8.625469207763672, "epoch": 0.7728890646628436, "mean_token_accuracy": 0.7141280174255371, "num_tokens": 19877227.0, "step": 7817, "train/ce_loss": 0.49586185812950134 }, { "epoch": 0.7728890646628436, "step": 7817, "train/sim_loss": 0.09375 }, { "epoch": 0.7728890646628436, "step": 7817, "train/total_loss": 0.1433361917734146 }, { "entropy": 8.812108993530273, "epoch": 0.7729879375123591, "mean_token_accuracy": 0.7391952276229858, "num_tokens": 19882372.0, "step": 7818, "train/ce_loss": 1.0758458375930786 }, { "epoch": 0.7729879375123591, "step": 7818, "train/sim_loss": 0.078125 }, { "epoch": 0.7729879375123591, "step": 7818, "train/total_loss": 0.18570959568023682 }, { "entropy": 9.89470100402832, "epoch": 0.7730868103618747, "mean_token_accuracy": 0.7422680258750916, "num_tokens": 19886928.0, "step": 7819, "train/ce_loss": 2.782318460958777e-06 }, { "epoch": 0.7730868103618747, "step": 7819, "train/sim_loss": 0.015625 }, { "epoch": 0.7730868103618747, "step": 7819, "train/total_loss": 0.015625277534127235 }, { "epoch": 0.7731856832113901, "grad_norm": 1.0795270204544067, "learning_rate": 8.069277555258863e-06, "loss": 0.1332, "step": 7820 }, { "entropy": 8.894759178161621, "epoch": 0.7731856832113901, "mean_token_accuracy": 0.7464183568954468, "num_tokens": 19892023.0, "step": 7820, "train/ce_loss": 0.7307823896408081 }, { "epoch": 0.7731856832113901, "step": 7820, "train/sim_loss": 0.04296875 }, { "epoch": 0.7731856832113901, "step": 7820, "train/total_loss": 0.11604698747396469 }, { "entropy": 9.182503700256348, "epoch": 0.7732845560609056, "mean_token_accuracy": 0.759013295173645, "num_tokens": 19897012.0, "step": 7821, "train/ce_loss": 1.2509524822235107 }, { "epoch": 0.7732845560609056, "step": 7821, "train/sim_loss": 0.03125 }, { "epoch": 0.7732845560609056, "step": 7821, "train/total_loss": 0.15634524822235107 }, { "entropy": 8.66482925415039, "epoch": 0.7733834289104212, "mean_token_accuracy": 0.709549069404602, "num_tokens": 19902223.0, "step": 7822, "train/ce_loss": 1.0512527227401733 }, { "epoch": 0.7733834289104212, "step": 7822, "train/sim_loss": 0.0546875 }, { "epoch": 0.7733834289104212, "step": 7822, "train/total_loss": 0.1598127782344818 }, { "entropy": 8.769386291503906, "epoch": 0.7734823017599367, "mean_token_accuracy": 0.810693621635437, "num_tokens": 19907394.0, "step": 7823, "train/ce_loss": 0.879429817199707 }, { "epoch": 0.7734823017599367, "step": 7823, "train/sim_loss": 0.07421875 }, { "epoch": 0.7734823017599367, "step": 7823, "train/total_loss": 0.16216173768043518 }, { "entropy": 8.26472282409668, "epoch": 0.7735811746094522, "mean_token_accuracy": 0.8351115584373474, "num_tokens": 19912907.0, "step": 7824, "train/ce_loss": 0.8675022125244141 }, { "epoch": 0.7735811746094522, "step": 7824, "train/sim_loss": 0.09765625 }, { "epoch": 0.7735811746094522, "step": 7824, "train/total_loss": 0.18440647423267365 }, { "entropy": 8.358294486999512, "epoch": 0.7736800474589678, "mean_token_accuracy": 0.6705202460289001, "num_tokens": 19918464.0, "step": 7825, "train/ce_loss": 0.9174467325210571 }, { "epoch": 0.7736800474589678, "step": 7825, "train/sim_loss": 0.078125 }, { "epoch": 0.7736800474589678, "step": 7825, "train/total_loss": 0.16986967623233795 }, { "entropy": 8.729463577270508, "epoch": 0.7737789203084833, "mean_token_accuracy": 0.7240259647369385, "num_tokens": 19923848.0, "step": 7826, "train/ce_loss": 0.843796968460083 }, { "epoch": 0.7737789203084833, "step": 7826, "train/sim_loss": 0.13671875 }, { "epoch": 0.7737789203084833, "step": 7826, "train/total_loss": 0.22109845280647278 }, { "entropy": 8.410392761230469, "epoch": 0.7738777931579988, "mean_token_accuracy": 0.7503410577774048, "num_tokens": 19929120.0, "step": 7827, "train/ce_loss": 0.8885999321937561 }, { "epoch": 0.7738777931579988, "step": 7827, "train/sim_loss": 0.03125 }, { "epoch": 0.7738777931579988, "step": 7827, "train/total_loss": 0.12010999768972397 }, { "entropy": 8.993410110473633, "epoch": 0.7739766660075144, "mean_token_accuracy": 0.800000011920929, "num_tokens": 19934179.0, "step": 7828, "train/ce_loss": 2.7117188437841833e-05 }, { "epoch": 0.7739766660075144, "step": 7828, "train/sim_loss": 0.0390625 }, { "epoch": 0.7739766660075144, "step": 7828, "train/total_loss": 0.03906521201133728 }, { "entropy": 9.25328254699707, "epoch": 0.7740755388570298, "mean_token_accuracy": 0.7326202988624573, "num_tokens": 19939189.0, "step": 7829, "train/ce_loss": 1.108992099761963 }, { "epoch": 0.7740755388570298, "step": 7829, "train/sim_loss": 0.0390625 }, { "epoch": 0.7740755388570298, "step": 7829, "train/total_loss": 0.1499617099761963 }, { "entropy": 9.52667236328125, "epoch": 0.7741744117065453, "mean_token_accuracy": 0.8308605551719666, "num_tokens": 19943900.0, "step": 7830, "train/ce_loss": 2.575121698100702e-06 }, { "epoch": 0.7741744117065453, "step": 7830, "train/sim_loss": 0.01171875 }, { "epoch": 0.7741744117065453, "step": 7830, "train/total_loss": 0.011719007976353168 }, { "entropy": 9.501398086547852, "epoch": 0.7742732845560609, "mean_token_accuracy": 0.8183807730674744, "num_tokens": 19948748.0, "step": 7831, "train/ce_loss": 2.076280452456558e-06 }, { "epoch": 0.7742732845560609, "step": 7831, "train/sim_loss": 0.0234375 }, { "epoch": 0.7742732845560609, "step": 7831, "train/total_loss": 0.023437706753611565 }, { "entropy": 8.508647918701172, "epoch": 0.7743721574055764, "mean_token_accuracy": 0.7374100685119629, "num_tokens": 19954051.0, "step": 7832, "train/ce_loss": 0.8255497217178345 }, { "epoch": 0.7743721574055764, "step": 7832, "train/sim_loss": 0.05859375 }, { "epoch": 0.7743721574055764, "step": 7832, "train/total_loss": 0.14114871621131897 }, { "entropy": 8.713500022888184, "epoch": 0.7744710302550919, "mean_token_accuracy": 0.7908496856689453, "num_tokens": 19959266.0, "step": 7833, "train/ce_loss": 1.030476450920105 }, { "epoch": 0.7744710302550919, "step": 7833, "train/sim_loss": 0.07421875 }, { "epoch": 0.7744710302550919, "step": 7833, "train/total_loss": 0.17726638913154602 }, { "entropy": 8.520734786987305, "epoch": 0.7745699031046075, "mean_token_accuracy": 0.8053553104400635, "num_tokens": 19964739.0, "step": 7834, "train/ce_loss": 0.7346962094306946 }, { "epoch": 0.7745699031046075, "step": 7834, "train/sim_loss": 0.04296875 }, { "epoch": 0.7745699031046075, "step": 7834, "train/total_loss": 0.1164383739233017 }, { "entropy": 8.950132369995117, "epoch": 0.774668775954123, "mean_token_accuracy": 0.7404129505157471, "num_tokens": 19969880.0, "step": 7835, "train/ce_loss": 0.5527032613754272 }, { "epoch": 0.774668775954123, "step": 7835, "train/sim_loss": 0.04296875 }, { "epoch": 0.774668775954123, "step": 7835, "train/total_loss": 0.09823907911777496 }, { "entropy": 8.807478904724121, "epoch": 0.7747676488036386, "mean_token_accuracy": 0.7493734359741211, "num_tokens": 19975201.0, "step": 7836, "train/ce_loss": 1.1441510915756226 }, { "epoch": 0.7747676488036386, "step": 7836, "train/sim_loss": 0.0625 }, { "epoch": 0.7747676488036386, "step": 7836, "train/total_loss": 0.17691510915756226 }, { "entropy": 8.433201789855957, "epoch": 0.7748665216531541, "mean_token_accuracy": 0.6755037307739258, "num_tokens": 19980635.0, "step": 7837, "train/ce_loss": 0.5758907198905945 }, { "epoch": 0.7748665216531541, "step": 7837, "train/sim_loss": 0.0703125 }, { "epoch": 0.7748665216531541, "step": 7837, "train/total_loss": 0.1279015690088272 }, { "entropy": 8.277987480163574, "epoch": 0.7749653945026695, "mean_token_accuracy": 0.7234762907028198, "num_tokens": 19986007.0, "step": 7838, "train/ce_loss": 0.9494882822036743 }, { "epoch": 0.7749653945026695, "step": 7838, "train/sim_loss": 0.0390625 }, { "epoch": 0.7749653945026695, "step": 7838, "train/total_loss": 0.13401132822036743 }, { "entropy": 8.675048828125, "epoch": 0.7750642673521851, "mean_token_accuracy": 0.7599999904632568, "num_tokens": 19991128.0, "step": 7839, "train/ce_loss": 1.2529450259535224e-06 }, { "epoch": 0.7750642673521851, "step": 7839, "train/sim_loss": 0.05078125 }, { "epoch": 0.7750642673521851, "step": 7839, "train/total_loss": 0.05078137665987015 }, { "epoch": 0.7751631402017006, "grad_norm": 0.7123085260391235, "learning_rate": 8.064332690500916e-06, "loss": 0.1327, "step": 7840 }, { "entropy": 9.40606689453125, "epoch": 0.7751631402017006, "mean_token_accuracy": 0.7947976589202881, "num_tokens": 19995897.0, "step": 7840, "train/ce_loss": 2.4185565052903257e-06 }, { "epoch": 0.7751631402017006, "step": 7840, "train/sim_loss": 0.078125 }, { "epoch": 0.7751631402017006, "step": 7840, "train/total_loss": 0.0781252384185791 }, { "entropy": 9.181451797485352, "epoch": 0.7752620130512161, "mean_token_accuracy": 0.75, "num_tokens": 20000705.0, "step": 7841, "train/ce_loss": 1.2251102924346924 }, { "epoch": 0.7752620130512161, "step": 7841, "train/sim_loss": 0.08203125 }, { "epoch": 0.7752620130512161, "step": 7841, "train/total_loss": 0.20454227924346924 }, { "entropy": 8.59832763671875, "epoch": 0.7753608859007317, "mean_token_accuracy": 0.8022598624229431, "num_tokens": 20006031.0, "step": 7842, "train/ce_loss": 0.5766798853874207 }, { "epoch": 0.7753608859007317, "step": 7842, "train/sim_loss": 0.05859375 }, { "epoch": 0.7753608859007317, "step": 7842, "train/total_loss": 0.11626173555850983 }, { "entropy": 8.97331428527832, "epoch": 0.7754597587502472, "mean_token_accuracy": 0.7272727489471436, "num_tokens": 20011111.0, "step": 7843, "train/ce_loss": 0.6290687918663025 }, { "epoch": 0.7754597587502472, "step": 7843, "train/sim_loss": 0.05859375 }, { "epoch": 0.7754597587502472, "step": 7843, "train/total_loss": 0.12150063365697861 }, { "entropy": 8.379244804382324, "epoch": 0.7755586315997627, "mean_token_accuracy": 0.7770419716835022, "num_tokens": 20016430.0, "step": 7844, "train/ce_loss": 0.4407392740249634 }, { "epoch": 0.7755586315997627, "step": 7844, "train/sim_loss": 0.03125 }, { "epoch": 0.7755586315997627, "step": 7844, "train/total_loss": 0.0753239244222641 }, { "entropy": 8.849149703979492, "epoch": 0.7756575044492783, "mean_token_accuracy": 0.7721179723739624, "num_tokens": 20021657.0, "step": 7845, "train/ce_loss": 0.9204382300376892 }, { "epoch": 0.7756575044492783, "step": 7845, "train/sim_loss": 0.0703125 }, { "epoch": 0.7756575044492783, "step": 7845, "train/total_loss": 0.16235631704330444 }, { "entropy": 8.597208023071289, "epoch": 0.7757563772987938, "mean_token_accuracy": 0.6629588603973389, "num_tokens": 20027039.0, "step": 7846, "train/ce_loss": 1.005993366241455 }, { "epoch": 0.7757563772987938, "step": 7846, "train/sim_loss": 0.078125 }, { "epoch": 0.7757563772987938, "step": 7846, "train/total_loss": 0.17872434854507446 }, { "entropy": 8.525615692138672, "epoch": 0.7758552501483093, "mean_token_accuracy": 0.7469879388809204, "num_tokens": 20032241.0, "step": 7847, "train/ce_loss": 0.6244814395904541 }, { "epoch": 0.7758552501483093, "step": 7847, "train/sim_loss": 0.04296875 }, { "epoch": 0.7758552501483093, "step": 7847, "train/total_loss": 0.10541689395904541 }, { "entropy": 8.570363998413086, "epoch": 0.7759541229978248, "mean_token_accuracy": 0.8177676796913147, "num_tokens": 20037628.0, "step": 7848, "train/ce_loss": 0.5132327675819397 }, { "epoch": 0.7759541229978248, "step": 7848, "train/sim_loss": 0.046875 }, { "epoch": 0.7759541229978248, "step": 7848, "train/total_loss": 0.09819827973842621 }, { "entropy": 8.65130615234375, "epoch": 0.7760529958473403, "mean_token_accuracy": 0.7279821634292603, "num_tokens": 20042958.0, "step": 7849, "train/ce_loss": 0.5276586413383484 }, { "epoch": 0.7760529958473403, "step": 7849, "train/sim_loss": 0.046875 }, { "epoch": 0.7760529958473403, "step": 7849, "train/total_loss": 0.0996408611536026 }, { "entropy": 8.43891716003418, "epoch": 0.7761518686968558, "mean_token_accuracy": 0.6803699731826782, "num_tokens": 20048392.0, "step": 7850, "train/ce_loss": 0.39941999316215515 }, { "epoch": 0.7761518686968558, "step": 7850, "train/sim_loss": 0.04296875 }, { "epoch": 0.7761518686968558, "step": 7850, "train/total_loss": 0.08291074633598328 }, { "entropy": 8.427473068237305, "epoch": 0.7762507415463714, "mean_token_accuracy": 0.7265135645866394, "num_tokens": 20053815.0, "step": 7851, "train/ce_loss": 0.7450786828994751 }, { "epoch": 0.7762507415463714, "step": 7851, "train/sim_loss": 0.07421875 }, { "epoch": 0.7762507415463714, "step": 7851, "train/total_loss": 0.14872661232948303 }, { "entropy": 8.747206687927246, "epoch": 0.7763496143958869, "mean_token_accuracy": 0.6984318494796753, "num_tokens": 20059083.0, "step": 7852, "train/ce_loss": 1.3575971126556396 }, { "epoch": 0.7763496143958869, "step": 7852, "train/sim_loss": 0.0859375 }, { "epoch": 0.7763496143958869, "step": 7852, "train/total_loss": 0.22169721126556396 }, { "entropy": 9.182523727416992, "epoch": 0.7764484872454024, "mean_token_accuracy": 0.7379844784736633, "num_tokens": 20064155.0, "step": 7853, "train/ce_loss": 1.1574733257293701 }, { "epoch": 0.7764484872454024, "step": 7853, "train/sim_loss": 0.078125 }, { "epoch": 0.7764484872454024, "step": 7853, "train/total_loss": 0.193872332572937 }, { "entropy": 9.358688354492188, "epoch": 0.776547360094918, "mean_token_accuracy": 0.8188976645469666, "num_tokens": 20068980.0, "step": 7854, "train/ce_loss": 0.8725805282592773 }, { "epoch": 0.776547360094918, "step": 7854, "train/sim_loss": 0.015625 }, { "epoch": 0.776547360094918, "step": 7854, "train/total_loss": 0.10288305580615997 }, { "entropy": 9.307382583618164, "epoch": 0.7766462329444335, "mean_token_accuracy": 0.744990885257721, "num_tokens": 20073911.0, "step": 7855, "train/ce_loss": 1.0040943622589111 }, { "epoch": 0.7766462329444335, "step": 7855, "train/sim_loss": 0.0390625 }, { "epoch": 0.7766462329444335, "step": 7855, "train/total_loss": 0.13947194814682007 }, { "entropy": 9.696178436279297, "epoch": 0.776745105793949, "mean_token_accuracy": 0.6959064602851868, "num_tokens": 20078626.0, "step": 7856, "train/ce_loss": 2.161552906036377 }, { "epoch": 0.776745105793949, "step": 7856, "train/sim_loss": 0.109375 }, { "epoch": 0.776745105793949, "step": 7856, "train/total_loss": 0.3255302906036377 }, { "entropy": 9.63293743133545, "epoch": 0.7768439786434645, "mean_token_accuracy": 0.8255813717842102, "num_tokens": 20083370.0, "step": 7857, "train/ce_loss": 1.1936830282211304 }, { "epoch": 0.7768439786434645, "step": 7857, "train/sim_loss": 0.04296875 }, { "epoch": 0.7768439786434645, "step": 7857, "train/total_loss": 0.162337064743042 }, { "entropy": 8.957457542419434, "epoch": 0.77694285149298, "mean_token_accuracy": 0.7108014225959778, "num_tokens": 20088397.0, "step": 7858, "train/ce_loss": 0.5364453196525574 }, { "epoch": 0.77694285149298, "step": 7858, "train/sim_loss": 0.0234375 }, { "epoch": 0.77694285149298, "step": 7858, "train/total_loss": 0.07708203792572021 }, { "entropy": 8.190643310546875, "epoch": 0.7770417243424955, "mean_token_accuracy": 0.7280939221382141, "num_tokens": 20093960.0, "step": 7859, "train/ce_loss": 0.6429663896560669 }, { "epoch": 0.7770417243424955, "step": 7859, "train/sim_loss": 0.05859375 }, { "epoch": 0.7770417243424955, "step": 7859, "train/total_loss": 0.12289039045572281 }, { "epoch": 0.7771405971920111, "grad_norm": 0.6672192215919495, "learning_rate": 8.059387825742966e-06, "loss": 0.1312, "step": 7860 }, { "entropy": 9.07497501373291, "epoch": 0.7771405971920111, "mean_token_accuracy": 0.7948275804519653, "num_tokens": 20098993.0, "step": 7860, "train/ce_loss": 0.8659631609916687 }, { "epoch": 0.7771405971920111, "step": 7860, "train/sim_loss": 0.0390625 }, { "epoch": 0.7771405971920111, "step": 7860, "train/total_loss": 0.1256588101387024 }, { "entropy": 8.92231559753418, "epoch": 0.7772394700415266, "mean_token_accuracy": 0.7153846025466919, "num_tokens": 20104232.0, "step": 7861, "train/ce_loss": 0.7884999513626099 }, { "epoch": 0.7772394700415266, "step": 7861, "train/sim_loss": 0.0390625 }, { "epoch": 0.7772394700415266, "step": 7861, "train/total_loss": 0.11791249364614487 }, { "entropy": 8.849882125854492, "epoch": 0.7773383428910421, "mean_token_accuracy": 0.7496598362922668, "num_tokens": 20109437.0, "step": 7862, "train/ce_loss": 1.1555202007293701 }, { "epoch": 0.7773383428910421, "step": 7862, "train/sim_loss": 0.05078125 }, { "epoch": 0.7773383428910421, "step": 7862, "train/total_loss": 0.16633327305316925 }, { "entropy": 8.588375091552734, "epoch": 0.7774372157405577, "mean_token_accuracy": 0.759115993976593, "num_tokens": 20114778.0, "step": 7863, "train/ce_loss": 0.625299334526062 }, { "epoch": 0.7774372157405577, "step": 7863, "train/sim_loss": 0.0625 }, { "epoch": 0.7774372157405577, "step": 7863, "train/total_loss": 0.12502993643283844 }, { "entropy": 8.494949340820312, "epoch": 0.7775360885900732, "mean_token_accuracy": 0.7898658514022827, "num_tokens": 20119964.0, "step": 7864, "train/ce_loss": 0.4647732079029083 }, { "epoch": 0.7775360885900732, "step": 7864, "train/sim_loss": 0.1015625 }, { "epoch": 0.7775360885900732, "step": 7864, "train/total_loss": 0.1480398178100586 }, { "entropy": 9.21607780456543, "epoch": 0.7776349614395887, "mean_token_accuracy": 0.8204697966575623, "num_tokens": 20124948.0, "step": 7865, "train/ce_loss": 8.904492574401957e-07 }, { "epoch": 0.7776349614395887, "step": 7865, "train/sim_loss": 0.0234375 }, { "epoch": 0.7776349614395887, "step": 7865, "train/total_loss": 0.023437589406967163 }, { "entropy": 9.210044860839844, "epoch": 0.7777338342891043, "mean_token_accuracy": 0.7599999904632568, "num_tokens": 20130002.0, "step": 7866, "train/ce_loss": 1.7427250895707402e-06 }, { "epoch": 0.7777338342891043, "step": 7866, "train/sim_loss": 0.0390625 }, { "epoch": 0.7777338342891043, "step": 7866, "train/total_loss": 0.03906267508864403 }, { "entropy": 8.752881050109863, "epoch": 0.7778327071386197, "mean_token_accuracy": 0.699999988079071, "num_tokens": 20135281.0, "step": 7867, "train/ce_loss": 0.5289360284805298 }, { "epoch": 0.7778327071386197, "step": 7867, "train/sim_loss": 0.046875 }, { "epoch": 0.7778327071386197, "step": 7867, "train/total_loss": 0.09976860880851746 }, { "entropy": 9.038463592529297, "epoch": 0.7779315799881352, "mean_token_accuracy": 0.6651446223258972, "num_tokens": 20140378.0, "step": 7868, "train/ce_loss": 1.573218822479248 }, { "epoch": 0.7779315799881352, "step": 7868, "train/sim_loss": 0.0703125 }, { "epoch": 0.7779315799881352, "step": 7868, "train/total_loss": 0.22763438522815704 }, { "entropy": 8.640933990478516, "epoch": 0.7780304528376508, "mean_token_accuracy": 0.7322134375572205, "num_tokens": 20145873.0, "step": 7869, "train/ce_loss": 1.3555995225906372 }, { "epoch": 0.7780304528376508, "step": 7869, "train/sim_loss": 0.05859375 }, { "epoch": 0.7780304528376508, "step": 7869, "train/total_loss": 0.19415371119976044 }, { "entropy": 8.888750076293945, "epoch": 0.7781293256871663, "mean_token_accuracy": 0.7715517282485962, "num_tokens": 20151019.0, "step": 7870, "train/ce_loss": 1.0199583768844604 }, { "epoch": 0.7781293256871663, "step": 7870, "train/sim_loss": 0.046875 }, { "epoch": 0.7781293256871663, "step": 7870, "train/total_loss": 0.14887084066867828 }, { "entropy": 8.983304977416992, "epoch": 0.7782281985366818, "mean_token_accuracy": 0.7315541505813599, "num_tokens": 20156135.0, "step": 7871, "train/ce_loss": 3.7230190628179116e-06 }, { "epoch": 0.7782281985366818, "step": 7871, "train/sim_loss": 0.0625 }, { "epoch": 0.7782281985366818, "step": 7871, "train/total_loss": 0.06250037252902985 }, { "entropy": 8.64794921875, "epoch": 0.7783270713861974, "mean_token_accuracy": 0.7871345281600952, "num_tokens": 20161460.0, "step": 7872, "train/ce_loss": 0.6233188509941101 }, { "epoch": 0.7783270713861974, "step": 7872, "train/sim_loss": 0.0390625 }, { "epoch": 0.7783270713861974, "step": 7872, "train/total_loss": 0.10139438509941101 }, { "entropy": 8.281718254089355, "epoch": 0.7784259442357129, "mean_token_accuracy": 0.7824831604957581, "num_tokens": 20166968.0, "step": 7873, "train/ce_loss": 0.7039061784744263 }, { "epoch": 0.7784259442357129, "step": 7873, "train/sim_loss": 0.05078125 }, { "epoch": 0.7784259442357129, "step": 7873, "train/total_loss": 0.12117186933755875 }, { "entropy": 8.540313720703125, "epoch": 0.7785248170852284, "mean_token_accuracy": 0.7196765542030334, "num_tokens": 20172248.0, "step": 7874, "train/ce_loss": 0.5756349563598633 }, { "epoch": 0.7785248170852284, "step": 7874, "train/sim_loss": 0.0625 }, { "epoch": 0.7785248170852284, "step": 7874, "train/total_loss": 0.12006349861621857 }, { "entropy": 8.901520729064941, "epoch": 0.778623689934744, "mean_token_accuracy": 0.733433723449707, "num_tokens": 20177335.0, "step": 7875, "train/ce_loss": 2.54775773100846e-06 }, { "epoch": 0.778623689934744, "step": 7875, "train/sim_loss": 0.09375 }, { "epoch": 0.778623689934744, "step": 7875, "train/total_loss": 0.0937502533197403 }, { "entropy": 8.65632438659668, "epoch": 0.7787225627842594, "mean_token_accuracy": 0.8341708779335022, "num_tokens": 20182542.0, "step": 7876, "train/ce_loss": 0.6827567219734192 }, { "epoch": 0.7787225627842594, "step": 7876, "train/sim_loss": 0.03125 }, { "epoch": 0.7787225627842594, "step": 7876, "train/total_loss": 0.09952567517757416 }, { "entropy": 9.449702262878418, "epoch": 0.7788214356337749, "mean_token_accuracy": 0.7385321259498596, "num_tokens": 20187400.0, "step": 7877, "train/ce_loss": 1.2082908153533936 }, { "epoch": 0.7788214356337749, "step": 7877, "train/sim_loss": 0.07421875 }, { "epoch": 0.7788214356337749, "step": 7877, "train/total_loss": 0.19504782557487488 }, { "entropy": 8.83372688293457, "epoch": 0.7789203084832905, "mean_token_accuracy": 0.7173333168029785, "num_tokens": 20192618.0, "step": 7878, "train/ce_loss": 0.7673982977867126 }, { "epoch": 0.7789203084832905, "step": 7878, "train/sim_loss": 0.0234375 }, { "epoch": 0.7789203084832905, "step": 7878, "train/total_loss": 0.1001773327589035 }, { "entropy": 8.807306289672852, "epoch": 0.779019181332806, "mean_token_accuracy": 0.7661290168762207, "num_tokens": 20197706.0, "step": 7879, "train/ce_loss": 3.00373426398437e-06 }, { "epoch": 0.779019181332806, "step": 7879, "train/sim_loss": 0.0625 }, { "epoch": 0.779019181332806, "step": 7879, "train/total_loss": 0.06250029802322388 }, { "epoch": 0.7791180541823215, "grad_norm": 0.6747611165046692, "learning_rate": 8.054442960985017e-06, "loss": 0.1315, "step": 7880 }, { "entropy": 8.52204704284668, "epoch": 0.7791180541823215, "mean_token_accuracy": 0.7667436599731445, "num_tokens": 20203057.0, "step": 7880, "train/ce_loss": 0.5791946053504944 }, { "epoch": 0.7791180541823215, "step": 7880, "train/sim_loss": 0.04296875 }, { "epoch": 0.7791180541823215, "step": 7880, "train/total_loss": 0.1008882075548172 }, { "entropy": 9.394721984863281, "epoch": 0.7792169270318371, "mean_token_accuracy": 0.7935871481895447, "num_tokens": 20207943.0, "step": 7881, "train/ce_loss": 1.1336095333099365 }, { "epoch": 0.7792169270318371, "step": 7881, "train/sim_loss": 0.046875 }, { "epoch": 0.7792169270318371, "step": 7881, "train/total_loss": 0.1602359563112259 }, { "entropy": 8.673254013061523, "epoch": 0.7793157998813526, "mean_token_accuracy": 0.7245509028434753, "num_tokens": 20213260.0, "step": 7882, "train/ce_loss": 0.628462016582489 }, { "epoch": 0.7793157998813526, "step": 7882, "train/sim_loss": 0.0390625 }, { "epoch": 0.7793157998813526, "step": 7882, "train/total_loss": 0.10190870612859726 }, { "entropy": 8.758913040161133, "epoch": 0.7794146727308681, "mean_token_accuracy": 0.7386215925216675, "num_tokens": 20218504.0, "step": 7883, "train/ce_loss": 0.5018093585968018 }, { "epoch": 0.7794146727308681, "step": 7883, "train/sim_loss": 0.0546875 }, { "epoch": 0.7794146727308681, "step": 7883, "train/total_loss": 0.10486844182014465 }, { "entropy": 9.086111068725586, "epoch": 0.7795135455803837, "mean_token_accuracy": 0.68006432056427, "num_tokens": 20223621.0, "step": 7884, "train/ce_loss": 1.9067862033843994 }, { "epoch": 0.7795135455803837, "step": 7884, "train/sim_loss": 0.06640625 }, { "epoch": 0.7795135455803837, "step": 7884, "train/total_loss": 0.2570848762989044 }, { "entropy": 8.832433700561523, "epoch": 0.7796124184298991, "mean_token_accuracy": 0.7552356123924255, "num_tokens": 20228894.0, "step": 7885, "train/ce_loss": 0.9636496305465698 }, { "epoch": 0.7796124184298991, "step": 7885, "train/sim_loss": 0.0625 }, { "epoch": 0.7796124184298991, "step": 7885, "train/total_loss": 0.15886497497558594 }, { "entropy": 9.566644668579102, "epoch": 0.7797112912794146, "mean_token_accuracy": 0.7421875, "num_tokens": 20233709.0, "step": 7886, "train/ce_loss": 2.084218978881836 }, { "epoch": 0.7797112912794146, "step": 7886, "train/sim_loss": 0.05078125 }, { "epoch": 0.7797112912794146, "step": 7886, "train/total_loss": 0.25920313596725464 }, { "entropy": 8.960962295532227, "epoch": 0.7798101641289302, "mean_token_accuracy": 0.6954838633537292, "num_tokens": 20238938.0, "step": 7887, "train/ce_loss": 1.7520071268081665 }, { "epoch": 0.7798101641289302, "step": 7887, "train/sim_loss": 0.08984375 }, { "epoch": 0.7798101641289302, "step": 7887, "train/total_loss": 0.2650444507598877 }, { "entropy": 9.25937271118164, "epoch": 0.7799090369784457, "mean_token_accuracy": 0.7517730593681335, "num_tokens": 20243936.0, "step": 7888, "train/ce_loss": 1.1822519302368164 }, { "epoch": 0.7799090369784457, "step": 7888, "train/sim_loss": 0.0390625 }, { "epoch": 0.7799090369784457, "step": 7888, "train/total_loss": 0.15728768706321716 }, { "entropy": 8.589618682861328, "epoch": 0.7800079098279612, "mean_token_accuracy": 0.7505091428756714, "num_tokens": 20249400.0, "step": 7889, "train/ce_loss": 0.8180780410766602 }, { "epoch": 0.7800079098279612, "step": 7889, "train/sim_loss": 0.046875 }, { "epoch": 0.7800079098279612, "step": 7889, "train/total_loss": 0.12868280708789825 }, { "entropy": 8.764129638671875, "epoch": 0.7801067826774768, "mean_token_accuracy": 0.7448107600212097, "num_tokens": 20254624.0, "step": 7890, "train/ce_loss": 0.923783540725708 }, { "epoch": 0.7801067826774768, "step": 7890, "train/sim_loss": 0.03515625 }, { "epoch": 0.7801067826774768, "step": 7890, "train/total_loss": 0.12753459811210632 }, { "entropy": 8.824312210083008, "epoch": 0.7802056555269923, "mean_token_accuracy": 0.7082429528236389, "num_tokens": 20259950.0, "step": 7891, "train/ce_loss": 0.5625573396682739 }, { "epoch": 0.7802056555269923, "step": 7891, "train/sim_loss": 0.046875 }, { "epoch": 0.7802056555269923, "step": 7891, "train/total_loss": 0.10313073545694351 }, { "entropy": 8.657915115356445, "epoch": 0.7803045283765078, "mean_token_accuracy": 0.7373167872428894, "num_tokens": 20265329.0, "step": 7892, "train/ce_loss": 0.8548310399055481 }, { "epoch": 0.7803045283765078, "step": 7892, "train/sim_loss": 0.078125 }, { "epoch": 0.7803045283765078, "step": 7892, "train/total_loss": 0.1636081039905548 }, { "entropy": 8.728321075439453, "epoch": 0.7804034012260234, "mean_token_accuracy": 0.7177419066429138, "num_tokens": 20270510.0, "step": 7893, "train/ce_loss": 1.0501341819763184 }, { "epoch": 0.7804034012260234, "step": 7893, "train/sim_loss": 0.09375 }, { "epoch": 0.7804034012260234, "step": 7893, "train/total_loss": 0.1987634301185608 }, { "entropy": 8.53168773651123, "epoch": 0.7805022740755388, "mean_token_accuracy": 0.7386243343353271, "num_tokens": 20275934.0, "step": 7894, "train/ce_loss": 0.6309541463851929 }, { "epoch": 0.7805022740755388, "step": 7894, "train/sim_loss": 0.01953125 }, { "epoch": 0.7805022740755388, "step": 7894, "train/total_loss": 0.08262666314840317 }, { "entropy": 8.515279769897461, "epoch": 0.7806011469250543, "mean_token_accuracy": 0.7110874056816101, "num_tokens": 20281360.0, "step": 7895, "train/ce_loss": 0.46416860818862915 }, { "epoch": 0.7806011469250543, "step": 7895, "train/sim_loss": 0.07421875 }, { "epoch": 0.7806011469250543, "step": 7895, "train/total_loss": 0.12063561379909515 }, { "entropy": 8.935840606689453, "epoch": 0.7807000197745699, "mean_token_accuracy": 0.75, "num_tokens": 20286587.0, "step": 7896, "train/ce_loss": 1.288562536239624 }, { "epoch": 0.7807000197745699, "step": 7896, "train/sim_loss": 0.01953125 }, { "epoch": 0.7807000197745699, "step": 7896, "train/total_loss": 0.14838750660419464 }, { "entropy": 9.0083589553833, "epoch": 0.7807988926240854, "mean_token_accuracy": 0.7654135227203369, "num_tokens": 20291683.0, "step": 7897, "train/ce_loss": 0.8810030817985535 }, { "epoch": 0.7807988926240854, "step": 7897, "train/sim_loss": 0.05078125 }, { "epoch": 0.7807988926240854, "step": 7897, "train/total_loss": 0.13888156414031982 }, { "entropy": 9.198946952819824, "epoch": 0.7808977654736009, "mean_token_accuracy": 0.6476923227310181, "num_tokens": 20296804.0, "step": 7898, "train/ce_loss": 1.6421705484390259 }, { "epoch": 0.7808977654736009, "step": 7898, "train/sim_loss": 0.07421875 }, { "epoch": 0.7808977654736009, "step": 7898, "train/total_loss": 0.2384358048439026 }, { "entropy": 8.83854866027832, "epoch": 0.7809966383231165, "mean_token_accuracy": 0.7840490937232971, "num_tokens": 20302066.0, "step": 7899, "train/ce_loss": 0.9571102857589722 }, { "epoch": 0.7809966383231165, "step": 7899, "train/sim_loss": 0.0703125 }, { "epoch": 0.7809966383231165, "step": 7899, "train/total_loss": 0.16602352261543274 }, { "epoch": 0.781095511172632, "grad_norm": 0.6194737553596497, "learning_rate": 8.049498096227069e-06, "loss": 0.1419, "step": 7900 }, { "entropy": 8.932303428649902, "epoch": 0.781095511172632, "mean_token_accuracy": 0.7843137383460999, "num_tokens": 20306917.0, "step": 7900, "train/ce_loss": 1.1366647481918335 }, { "epoch": 0.781095511172632, "step": 7900, "train/sim_loss": 0.03125 }, { "epoch": 0.781095511172632, "step": 7900, "train/total_loss": 0.14491647481918335 }, { "entropy": 8.959232330322266, "epoch": 0.7811943840221475, "mean_token_accuracy": 0.7389885783195496, "num_tokens": 20311969.0, "step": 7901, "train/ce_loss": 2.387908125456306e-06 }, { "epoch": 0.7811943840221475, "step": 7901, "train/sim_loss": 0.0390625 }, { "epoch": 0.7811943840221475, "step": 7901, "train/total_loss": 0.0390627384185791 }, { "entropy": 8.497319221496582, "epoch": 0.7812932568716631, "mean_token_accuracy": 0.7304452657699585, "num_tokens": 20317288.0, "step": 7902, "train/ce_loss": 0.5628913044929504 }, { "epoch": 0.7812932568716631, "step": 7902, "train/sim_loss": 0.046875 }, { "epoch": 0.7812932568716631, "step": 7902, "train/total_loss": 0.10316413640975952 }, { "entropy": 8.847746849060059, "epoch": 0.7813921297211786, "mean_token_accuracy": 0.75698322057724, "num_tokens": 20322479.0, "step": 7903, "train/ce_loss": 2.7168525775778107e-06 }, { "epoch": 0.7813921297211786, "step": 7903, "train/sim_loss": 0.03515625 }, { "epoch": 0.7813921297211786, "step": 7903, "train/total_loss": 0.03515652194619179 }, { "entropy": 8.57381820678711, "epoch": 0.781491002570694, "mean_token_accuracy": 0.7816377282142639, "num_tokens": 20327782.0, "step": 7904, "train/ce_loss": 0.9726953506469727 }, { "epoch": 0.781491002570694, "step": 7904, "train/sim_loss": 0.06640625 }, { "epoch": 0.781491002570694, "step": 7904, "train/total_loss": 0.16367578506469727 }, { "entropy": 8.720494270324707, "epoch": 0.7815898754202096, "mean_token_accuracy": 0.7451456189155579, "num_tokens": 20333043.0, "step": 7905, "train/ce_loss": 0.9451776742935181 }, { "epoch": 0.7815898754202096, "step": 7905, "train/sim_loss": 0.03515625 }, { "epoch": 0.7815898754202096, "step": 7905, "train/total_loss": 0.1296740174293518 }, { "entropy": 8.519255638122559, "epoch": 0.7816887482697251, "mean_token_accuracy": 0.7466487884521484, "num_tokens": 20338247.0, "step": 7906, "train/ce_loss": 0.9122191071510315 }, { "epoch": 0.7816887482697251, "step": 7906, "train/sim_loss": 0.08203125 }, { "epoch": 0.7816887482697251, "step": 7906, "train/total_loss": 0.1732531636953354 }, { "entropy": 8.998676300048828, "epoch": 0.7817876211192406, "mean_token_accuracy": 0.7673667073249817, "num_tokens": 20343328.0, "step": 7907, "train/ce_loss": 2.213766947534168e-06 }, { "epoch": 0.7817876211192406, "step": 7907, "train/sim_loss": 0.0390625 }, { "epoch": 0.7817876211192406, "step": 7907, "train/total_loss": 0.03906271979212761 }, { "entropy": 9.285778999328613, "epoch": 0.7818864939687562, "mean_token_accuracy": 0.7413793206214905, "num_tokens": 20348267.0, "step": 7908, "train/ce_loss": 1.0977128744125366 }, { "epoch": 0.7818864939687562, "step": 7908, "train/sim_loss": 0.0234375 }, { "epoch": 0.7818864939687562, "step": 7908, "train/total_loss": 0.13320878148078918 }, { "entropy": 8.340143203735352, "epoch": 0.7819853668182717, "mean_token_accuracy": 0.7763578295707703, "num_tokens": 20353650.0, "step": 7909, "train/ce_loss": 0.7174997925758362 }, { "epoch": 0.7819853668182717, "step": 7909, "train/sim_loss": 0.0390625 }, { "epoch": 0.7819853668182717, "step": 7909, "train/total_loss": 0.1108124777674675 }, { "entropy": 9.001302719116211, "epoch": 0.7820842396677872, "mean_token_accuracy": 0.7584615349769592, "num_tokens": 20358736.0, "step": 7910, "train/ce_loss": 0.7865976095199585 }, { "epoch": 0.7820842396677872, "step": 7910, "train/sim_loss": 0.03125 }, { "epoch": 0.7820842396677872, "step": 7910, "train/total_loss": 0.10990976542234421 }, { "entropy": 8.796815872192383, "epoch": 0.7821831125173028, "mean_token_accuracy": 0.7554697394371033, "num_tokens": 20363966.0, "step": 7911, "train/ce_loss": 0.8870112895965576 }, { "epoch": 0.7821831125173028, "step": 7911, "train/sim_loss": 0.05859375 }, { "epoch": 0.7821831125173028, "step": 7911, "train/total_loss": 0.14729487895965576 }, { "entropy": 8.566937446594238, "epoch": 0.7822819853668183, "mean_token_accuracy": 0.7266111969947815, "num_tokens": 20369407.0, "step": 7912, "train/ce_loss": 0.5319351553916931 }, { "epoch": 0.7822819853668183, "step": 7912, "train/sim_loss": 0.078125 }, { "epoch": 0.7822819853668183, "step": 7912, "train/total_loss": 0.13131850957870483 }, { "entropy": 8.470523834228516, "epoch": 0.7823808582163337, "mean_token_accuracy": 0.7873210906982422, "num_tokens": 20374871.0, "step": 7913, "train/ce_loss": 0.5656315088272095 }, { "epoch": 0.7823808582163337, "step": 7913, "train/sim_loss": 0.06640625 }, { "epoch": 0.7823808582163337, "step": 7913, "train/total_loss": 0.12296940386295319 }, { "entropy": 8.459247589111328, "epoch": 0.7824797310658493, "mean_token_accuracy": 0.747826099395752, "num_tokens": 20380133.0, "step": 7914, "train/ce_loss": 1.336376667022705 }, { "epoch": 0.7824797310658493, "step": 7914, "train/sim_loss": 0.0234375 }, { "epoch": 0.7824797310658493, "step": 7914, "train/total_loss": 0.1570751667022705 }, { "entropy": 8.66012191772461, "epoch": 0.7825786039153648, "mean_token_accuracy": 0.7441558241844177, "num_tokens": 20385392.0, "step": 7915, "train/ce_loss": 0.6318830251693726 }, { "epoch": 0.7825786039153648, "step": 7915, "train/sim_loss": 0.03515625 }, { "epoch": 0.7825786039153648, "step": 7915, "train/total_loss": 0.09834455698728561 }, { "entropy": 8.976568222045898, "epoch": 0.7826774767648803, "mean_token_accuracy": 0.6805348992347717, "num_tokens": 20390493.0, "step": 7916, "train/ce_loss": 0.5899818539619446 }, { "epoch": 0.7826774767648803, "step": 7916, "train/sim_loss": 0.0234375 }, { "epoch": 0.7826774767648803, "step": 7916, "train/total_loss": 0.08243568241596222 }, { "entropy": 8.40031623840332, "epoch": 0.7827763496143959, "mean_token_accuracy": 0.7359437942504883, "num_tokens": 20395980.0, "step": 7917, "train/ce_loss": 0.8957025408744812 }, { "epoch": 0.7827763496143959, "step": 7917, "train/sim_loss": 0.0390625 }, { "epoch": 0.7827763496143959, "step": 7917, "train/total_loss": 0.12863275408744812 }, { "entropy": 8.910615921020508, "epoch": 0.7828752224639114, "mean_token_accuracy": 0.7111756205558777, "num_tokens": 20401136.0, "step": 7918, "train/ce_loss": 0.6762477159500122 }, { "epoch": 0.7828752224639114, "step": 7918, "train/sim_loss": 0.0625 }, { "epoch": 0.7828752224639114, "step": 7918, "train/total_loss": 0.1301247775554657 }, { "entropy": 8.72070598602295, "epoch": 0.782974095313427, "mean_token_accuracy": 0.7680723071098328, "num_tokens": 20406278.0, "step": 7919, "train/ce_loss": 1.0571833848953247 }, { "epoch": 0.782974095313427, "step": 7919, "train/sim_loss": 0.05859375 }, { "epoch": 0.782974095313427, "step": 7919, "train/total_loss": 0.16431209444999695 }, { "epoch": 0.7830729681629425, "grad_norm": 0.6485387682914734, "learning_rate": 8.04455323146912e-06, "loss": 0.1316, "step": 7920 }, { "entropy": 8.218687057495117, "epoch": 0.7830729681629425, "mean_token_accuracy": 0.8042895197868347, "num_tokens": 20411890.0, "step": 7920, "train/ce_loss": 1.124735713005066 }, { "epoch": 0.7830729681629425, "step": 7920, "train/sim_loss": 0.046875 }, { "epoch": 0.7830729681629425, "step": 7920, "train/total_loss": 0.15934857726097107 }, { "entropy": 8.609413146972656, "epoch": 0.783171841012458, "mean_token_accuracy": 0.7561797499656677, "num_tokens": 20417281.0, "step": 7921, "train/ce_loss": 0.5667004585266113 }, { "epoch": 0.783171841012458, "step": 7921, "train/sim_loss": 0.09765625 }, { "epoch": 0.783171841012458, "step": 7921, "train/total_loss": 0.15432628989219666 }, { "entropy": 9.017016410827637, "epoch": 0.7832707138619736, "mean_token_accuracy": 0.7774389982223511, "num_tokens": 20422422.0, "step": 7922, "train/ce_loss": 1.0670339634089032e-06 }, { "epoch": 0.7832707138619736, "step": 7922, "train/sim_loss": 0.01171875 }, { "epoch": 0.7832707138619736, "step": 7922, "train/total_loss": 0.01171885710209608 }, { "entropy": 8.53697395324707, "epoch": 0.783369586711489, "mean_token_accuracy": 0.807603657245636, "num_tokens": 20427795.0, "step": 7923, "train/ce_loss": 0.6338130235671997 }, { "epoch": 0.783369586711489, "step": 7923, "train/sim_loss": 0.04296875 }, { "epoch": 0.783369586711489, "step": 7923, "train/total_loss": 0.10635005682706833 }, { "entropy": 8.64338493347168, "epoch": 0.7834684595610045, "mean_token_accuracy": 0.7033492922782898, "num_tokens": 20433096.0, "step": 7924, "train/ce_loss": 0.6053569316864014 }, { "epoch": 0.7834684595610045, "step": 7924, "train/sim_loss": 0.0859375 }, { "epoch": 0.7834684595610045, "step": 7924, "train/total_loss": 0.14647319912910461 }, { "entropy": 8.749120712280273, "epoch": 0.7835673324105201, "mean_token_accuracy": 0.7053571343421936, "num_tokens": 20438470.0, "step": 7925, "train/ce_loss": 0.5116437673568726 }, { "epoch": 0.7835673324105201, "step": 7925, "train/sim_loss": 0.02734375 }, { "epoch": 0.7835673324105201, "step": 7925, "train/total_loss": 0.07850812375545502 }, { "entropy": 8.944839477539062, "epoch": 0.7836662052600356, "mean_token_accuracy": 0.7326732873916626, "num_tokens": 20443596.0, "step": 7926, "train/ce_loss": 1.0792173147201538 }, { "epoch": 0.7836662052600356, "step": 7926, "train/sim_loss": 0.03125 }, { "epoch": 0.7836662052600356, "step": 7926, "train/total_loss": 0.13917173445224762 }, { "entropy": 8.53488540649414, "epoch": 0.7837650781095511, "mean_token_accuracy": 0.7253814339637756, "num_tokens": 20448762.0, "step": 7927, "train/ce_loss": 0.893704891204834 }, { "epoch": 0.7837650781095511, "step": 7927, "train/sim_loss": 0.078125 }, { "epoch": 0.7837650781095511, "step": 7927, "train/total_loss": 0.1674954891204834 }, { "entropy": 8.753499984741211, "epoch": 0.7838639509590667, "mean_token_accuracy": 0.6675094962120056, "num_tokens": 20454029.0, "step": 7928, "train/ce_loss": 1.5897996425628662 }, { "epoch": 0.7838639509590667, "step": 7928, "train/sim_loss": 0.06640625 }, { "epoch": 0.7838639509590667, "step": 7928, "train/total_loss": 0.22538621723651886 }, { "entropy": 8.803192138671875, "epoch": 0.7839628238085822, "mean_token_accuracy": 0.7758620977401733, "num_tokens": 20459208.0, "step": 7929, "train/ce_loss": 0.9285788536071777 }, { "epoch": 0.7839628238085822, "step": 7929, "train/sim_loss": 0.0703125 }, { "epoch": 0.7839628238085822, "step": 7929, "train/total_loss": 0.16317039728164673 }, { "entropy": 8.868133544921875, "epoch": 0.7840616966580977, "mean_token_accuracy": 0.7783641219139099, "num_tokens": 20464413.0, "step": 7930, "train/ce_loss": 1.8015189198195003e-06 }, { "epoch": 0.7840616966580977, "step": 7930, "train/sim_loss": 0.04296875 }, { "epoch": 0.7840616966580977, "step": 7930, "train/total_loss": 0.042968928813934326 }, { "entropy": 9.242012023925781, "epoch": 0.7841605695076133, "mean_token_accuracy": 0.7279411554336548, "num_tokens": 20469376.0, "step": 7931, "train/ce_loss": 0.6100656390190125 }, { "epoch": 0.7841605695076133, "step": 7931, "train/sim_loss": 0.03515625 }, { "epoch": 0.7841605695076133, "step": 7931, "train/total_loss": 0.096162810921669 }, { "entropy": 8.578420639038086, "epoch": 0.7842594423571287, "mean_token_accuracy": 0.6862967014312744, "num_tokens": 20474698.0, "step": 7932, "train/ce_loss": 1.4359699487686157 }, { "epoch": 0.7842594423571287, "step": 7932, "train/sim_loss": 0.03515625 }, { "epoch": 0.7842594423571287, "step": 7932, "train/total_loss": 0.17875324189662933 }, { "entropy": 8.901908874511719, "epoch": 0.7843583152066442, "mean_token_accuracy": 0.7245222926139832, "num_tokens": 20479970.0, "step": 7933, "train/ce_loss": 0.8366997838020325 }, { "epoch": 0.7843583152066442, "step": 7933, "train/sim_loss": 0.0859375 }, { "epoch": 0.7843583152066442, "step": 7933, "train/total_loss": 0.1696074903011322 }, { "entropy": 9.048408508300781, "epoch": 0.7844571880561598, "mean_token_accuracy": 0.7338235378265381, "num_tokens": 20485079.0, "step": 7934, "train/ce_loss": 0.9843783974647522 }, { "epoch": 0.7844571880561598, "step": 7934, "train/sim_loss": 0.0625 }, { "epoch": 0.7844571880561598, "step": 7934, "train/total_loss": 0.1609378457069397 }, { "entropy": 8.651874542236328, "epoch": 0.7845560609056753, "mean_token_accuracy": 0.7883771657943726, "num_tokens": 20490458.0, "step": 7935, "train/ce_loss": 0.5401686429977417 }, { "epoch": 0.7845560609056753, "step": 7935, "train/sim_loss": 0.0234375 }, { "epoch": 0.7845560609056753, "step": 7935, "train/total_loss": 0.07745436578989029 }, { "entropy": 8.963913917541504, "epoch": 0.7846549337551908, "mean_token_accuracy": 0.7274011373519897, "num_tokens": 20495576.0, "step": 7936, "train/ce_loss": 0.631771445274353 }, { "epoch": 0.7846549337551908, "step": 7936, "train/sim_loss": 0.0546875 }, { "epoch": 0.7846549337551908, "step": 7936, "train/total_loss": 0.11786464601755142 }, { "entropy": 8.555036544799805, "epoch": 0.7847538066047064, "mean_token_accuracy": 0.7471967339515686, "num_tokens": 20501050.0, "step": 7937, "train/ce_loss": 0.7506260275840759 }, { "epoch": 0.7847538066047064, "step": 7937, "train/sim_loss": 0.07421875 }, { "epoch": 0.7847538066047064, "step": 7937, "train/total_loss": 0.1492813527584076 }, { "entropy": 8.483772277832031, "epoch": 0.7848526794542219, "mean_token_accuracy": 0.7716371417045593, "num_tokens": 20506422.0, "step": 7938, "train/ce_loss": 0.5519025325775146 }, { "epoch": 0.7848526794542219, "step": 7938, "train/sim_loss": 0.046875 }, { "epoch": 0.7848526794542219, "step": 7938, "train/total_loss": 0.10206525027751923 }, { "entropy": 8.557881355285645, "epoch": 0.7849515523037374, "mean_token_accuracy": 0.7925764322280884, "num_tokens": 20511788.0, "step": 7939, "train/ce_loss": 0.6271799206733704 }, { "epoch": 0.7849515523037374, "step": 7939, "train/sim_loss": 0.04296875 }, { "epoch": 0.7849515523037374, "step": 7939, "train/total_loss": 0.1056867465376854 }, { "epoch": 0.785050425153253, "grad_norm": 0.5147098898887634, "learning_rate": 8.039608366711172e-06, "loss": 0.1314, "step": 7940 }, { "entropy": 9.699087142944336, "epoch": 0.785050425153253, "mean_token_accuracy": 0.7768816947937012, "num_tokens": 20516521.0, "step": 7940, "train/ce_loss": 1.6855492503964342e-06 }, { "epoch": 0.785050425153253, "step": 7940, "train/sim_loss": 0.01953125 }, { "epoch": 0.785050425153253, "step": 7940, "train/total_loss": 0.01953141763806343 }, { "entropy": 8.913328170776367, "epoch": 0.7851492980027684, "mean_token_accuracy": 0.7882736325263977, "num_tokens": 20521566.0, "step": 7941, "train/ce_loss": 0.806064248085022 }, { "epoch": 0.7851492980027684, "step": 7941, "train/sim_loss": 0.015625 }, { "epoch": 0.7851492980027684, "step": 7941, "train/total_loss": 0.09623142331838608 }, { "entropy": 9.045415878295898, "epoch": 0.7852481708522839, "mean_token_accuracy": 0.7320703864097595, "num_tokens": 20526796.0, "step": 7942, "train/ce_loss": 5.9377862271503545e-06 }, { "epoch": 0.7852481708522839, "step": 7942, "train/sim_loss": 0.0546875 }, { "epoch": 0.7852481708522839, "step": 7942, "train/total_loss": 0.054688092321157455 }, { "entropy": 8.949482917785645, "epoch": 0.7853470437017995, "mean_token_accuracy": 0.6782729625701904, "num_tokens": 20531958.0, "step": 7943, "train/ce_loss": 1.5178576707839966 }, { "epoch": 0.7853470437017995, "step": 7943, "train/sim_loss": 0.03125 }, { "epoch": 0.7853470437017995, "step": 7943, "train/total_loss": 0.18303577601909637 }, { "entropy": 8.98634147644043, "epoch": 0.785445916551315, "mean_token_accuracy": 0.739130437374115, "num_tokens": 20537152.0, "step": 7944, "train/ce_loss": 1.258015751838684 }, { "epoch": 0.785445916551315, "step": 7944, "train/sim_loss": 0.05859375 }, { "epoch": 0.785445916551315, "step": 7944, "train/total_loss": 0.18439532816410065 }, { "entropy": 8.618897438049316, "epoch": 0.7855447894008305, "mean_token_accuracy": 0.7541713118553162, "num_tokens": 20542613.0, "step": 7945, "train/ce_loss": 0.8079213500022888 }, { "epoch": 0.7855447894008305, "step": 7945, "train/sim_loss": 0.08984375 }, { "epoch": 0.7855447894008305, "step": 7945, "train/total_loss": 0.1706358790397644 }, { "entropy": 9.174298286437988, "epoch": 0.7856436622503461, "mean_token_accuracy": 0.7657807469367981, "num_tokens": 20547689.0, "step": 7946, "train/ce_loss": 5.641299139824696e-06 }, { "epoch": 0.7856436622503461, "step": 7946, "train/sim_loss": 0.0546875 }, { "epoch": 0.7856436622503461, "step": 7946, "train/total_loss": 0.05468806251883507 }, { "entropy": 9.220314025878906, "epoch": 0.7857425350998616, "mean_token_accuracy": 0.7574257254600525, "num_tokens": 20552669.0, "step": 7947, "train/ce_loss": 1.3368439674377441 }, { "epoch": 0.7857425350998616, "step": 7947, "train/sim_loss": 0.05859375 }, { "epoch": 0.7857425350998616, "step": 7947, "train/total_loss": 0.19227814674377441 }, { "entropy": 8.648846626281738, "epoch": 0.7858414079493771, "mean_token_accuracy": 0.7247058749198914, "num_tokens": 20557996.0, "step": 7948, "train/ce_loss": 0.4762094020843506 }, { "epoch": 0.7858414079493771, "step": 7948, "train/sim_loss": 0.05859375 }, { "epoch": 0.7858414079493771, "step": 7948, "train/total_loss": 0.10621468722820282 }, { "entropy": 8.662435531616211, "epoch": 0.7859402807988927, "mean_token_accuracy": 0.7303522825241089, "num_tokens": 20563177.0, "step": 7949, "train/ce_loss": 1.3563436269760132 }, { "epoch": 0.7859402807988927, "step": 7949, "train/sim_loss": 0.0390625 }, { "epoch": 0.7859402807988927, "step": 7949, "train/total_loss": 0.17469686269760132 }, { "entropy": 9.52070426940918, "epoch": 0.7860391536484082, "mean_token_accuracy": 0.7683284282684326, "num_tokens": 20567934.0, "step": 7950, "train/ce_loss": 1.9786020857281983e-06 }, { "epoch": 0.7860391536484082, "step": 7950, "train/sim_loss": 0.0390625 }, { "epoch": 0.7860391536484082, "step": 7950, "train/total_loss": 0.03906269744038582 }, { "entropy": 8.594144821166992, "epoch": 0.7861380264979236, "mean_token_accuracy": 0.7701793909072876, "num_tokens": 20573264.0, "step": 7951, "train/ce_loss": 0.4847142994403839 }, { "epoch": 0.7861380264979236, "step": 7951, "train/sim_loss": 0.0390625 }, { "epoch": 0.7861380264979236, "step": 7951, "train/total_loss": 0.08753393590450287 }, { "entropy": 8.669903755187988, "epoch": 0.7862368993474392, "mean_token_accuracy": 0.7776025533676147, "num_tokens": 20578340.0, "step": 7952, "train/ce_loss": 0.6803780198097229 }, { "epoch": 0.7862368993474392, "step": 7952, "train/sim_loss": 0.015625 }, { "epoch": 0.7862368993474392, "step": 7952, "train/total_loss": 0.08366280049085617 }, { "entropy": 8.677490234375, "epoch": 0.7863357721969547, "mean_token_accuracy": 0.7487562298774719, "num_tokens": 20583615.0, "step": 7953, "train/ce_loss": 0.7094335556030273 }, { "epoch": 0.7863357721969547, "step": 7953, "train/sim_loss": 0.0390625 }, { "epoch": 0.7863357721969547, "step": 7953, "train/total_loss": 0.11000585556030273 }, { "entropy": 8.330732345581055, "epoch": 0.7864346450464702, "mean_token_accuracy": 0.7355035543441772, "num_tokens": 20589087.0, "step": 7954, "train/ce_loss": 0.6643909215927124 }, { "epoch": 0.7864346450464702, "step": 7954, "train/sim_loss": 0.078125 }, { "epoch": 0.7864346450464702, "step": 7954, "train/total_loss": 0.14456409215927124 }, { "entropy": 8.504324913024902, "epoch": 0.7865335178959858, "mean_token_accuracy": 0.7805677056312561, "num_tokens": 20594501.0, "step": 7955, "train/ce_loss": 0.6007000803947449 }, { "epoch": 0.7865335178959858, "step": 7955, "train/sim_loss": 0.05859375 }, { "epoch": 0.7865335178959858, "step": 7955, "train/total_loss": 0.11866375803947449 }, { "entropy": 9.33056640625, "epoch": 0.7866323907455013, "mean_token_accuracy": 0.7306967973709106, "num_tokens": 20599418.0, "step": 7956, "train/ce_loss": 1.3013379573822021 }, { "epoch": 0.7866323907455013, "step": 7956, "train/sim_loss": 0.05859375 }, { "epoch": 0.7866323907455013, "step": 7956, "train/total_loss": 0.18872754275798798 }, { "entropy": 8.516487121582031, "epoch": 0.7867312635950168, "mean_token_accuracy": 0.6837030053138733, "num_tokens": 20604922.0, "step": 7957, "train/ce_loss": 1.3547759056091309 }, { "epoch": 0.7867312635950168, "step": 7957, "train/sim_loss": 0.078125 }, { "epoch": 0.7867312635950168, "step": 7957, "train/total_loss": 0.21360258758068085 }, { "entropy": 8.600809097290039, "epoch": 0.7868301364445324, "mean_token_accuracy": 0.7753201127052307, "num_tokens": 20610223.0, "step": 7958, "train/ce_loss": 0.9719870686531067 }, { "epoch": 0.7868301364445324, "step": 7958, "train/sim_loss": 0.0625 }, { "epoch": 0.7868301364445324, "step": 7958, "train/total_loss": 0.1596987098455429 }, { "entropy": 8.850839614868164, "epoch": 0.7869290092940479, "mean_token_accuracy": 0.7395944595336914, "num_tokens": 20615630.0, "step": 7959, "train/ce_loss": 1.0459824800491333 }, { "epoch": 0.7869290092940479, "step": 7959, "train/sim_loss": 0.046875 }, { "epoch": 0.7869290092940479, "step": 7959, "train/total_loss": 0.1514732539653778 }, { "epoch": 0.7870278821435633, "grad_norm": 0.7744261622428894, "learning_rate": 8.034663501953222e-06, "loss": 0.1369, "step": 7960 }, { "entropy": 8.881771087646484, "epoch": 0.7870278821435633, "mean_token_accuracy": 0.7635036706924438, "num_tokens": 20620788.0, "step": 7960, "train/ce_loss": 0.6608197689056396 }, { "epoch": 0.7870278821435633, "step": 7960, "train/sim_loss": 0.015625 }, { "epoch": 0.7870278821435633, "step": 7960, "train/total_loss": 0.08170697838068008 }, { "entropy": 8.501601219177246, "epoch": 0.7871267549930789, "mean_token_accuracy": 0.7590233683586121, "num_tokens": 20626220.0, "step": 7961, "train/ce_loss": 0.9903839230537415 }, { "epoch": 0.7871267549930789, "step": 7961, "train/sim_loss": 0.046875 }, { "epoch": 0.7871267549930789, "step": 7961, "train/total_loss": 0.14591339230537415 }, { "entropy": 8.645923614501953, "epoch": 0.7872256278425944, "mean_token_accuracy": 0.6985210180282593, "num_tokens": 20631757.0, "step": 7962, "train/ce_loss": 1.2180404663085938 }, { "epoch": 0.7872256278425944, "step": 7962, "train/sim_loss": 0.08984375 }, { "epoch": 0.7872256278425944, "step": 7962, "train/total_loss": 0.21164780855178833 }, { "entropy": 8.543001174926758, "epoch": 0.7873245006921099, "mean_token_accuracy": 0.7823721170425415, "num_tokens": 20637151.0, "step": 7963, "train/ce_loss": 0.7809872627258301 }, { "epoch": 0.7873245006921099, "step": 7963, "train/sim_loss": 0.015625 }, { "epoch": 0.7873245006921099, "step": 7963, "train/total_loss": 0.09372372925281525 }, { "entropy": 8.694469451904297, "epoch": 0.7874233735416255, "mean_token_accuracy": 0.7221029996871948, "num_tokens": 20642564.0, "step": 7964, "train/ce_loss": 0.7001290321350098 }, { "epoch": 0.7874233735416255, "step": 7964, "train/sim_loss": 0.05078125 }, { "epoch": 0.7874233735416255, "step": 7964, "train/total_loss": 0.1207941547036171 }, { "entropy": 9.358198165893555, "epoch": 0.787522246391141, "mean_token_accuracy": 0.7299145460128784, "num_tokens": 20647740.0, "step": 7965, "train/ce_loss": 0.8079087734222412 }, { "epoch": 0.787522246391141, "step": 7965, "train/sim_loss": 0.05078125 }, { "epoch": 0.787522246391141, "step": 7965, "train/total_loss": 0.13157212734222412 }, { "entropy": 8.975767135620117, "epoch": 0.7876211192406565, "mean_token_accuracy": 0.7756654024124146, "num_tokens": 20653030.0, "step": 7966, "train/ce_loss": 1.059606671333313 }, { "epoch": 0.7876211192406565, "step": 7966, "train/sim_loss": 0.046875 }, { "epoch": 0.7876211192406565, "step": 7966, "train/total_loss": 0.1528356671333313 }, { "entropy": 8.476731300354004, "epoch": 0.7877199920901721, "mean_token_accuracy": 0.7681007385253906, "num_tokens": 20658446.0, "step": 7967, "train/ce_loss": 0.644286572933197 }, { "epoch": 0.7877199920901721, "step": 7967, "train/sim_loss": 0.015625 }, { "epoch": 0.7877199920901721, "step": 7967, "train/total_loss": 0.0800536572933197 }, { "entropy": 8.317113876342773, "epoch": 0.7878188649396876, "mean_token_accuracy": 0.7533039450645447, "num_tokens": 20663819.0, "step": 7968, "train/ce_loss": 0.49962639808654785 }, { "epoch": 0.7878188649396876, "step": 7968, "train/sim_loss": 0.0390625 }, { "epoch": 0.7878188649396876, "step": 7968, "train/total_loss": 0.08902513980865479 }, { "entropy": 9.088321685791016, "epoch": 0.787917737789203, "mean_token_accuracy": 0.7987321615219116, "num_tokens": 20668916.0, "step": 7969, "train/ce_loss": 1.279579520225525 }, { "epoch": 0.787917737789203, "step": 7969, "train/sim_loss": 0.04296875 }, { "epoch": 0.787917737789203, "step": 7969, "train/total_loss": 0.17092670500278473 }, { "entropy": 8.945789337158203, "epoch": 0.7880166106387186, "mean_token_accuracy": 0.7312588691711426, "num_tokens": 20674098.0, "step": 7970, "train/ce_loss": 0.9316537380218506 }, { "epoch": 0.7880166106387186, "step": 7970, "train/sim_loss": 0.05078125 }, { "epoch": 0.7880166106387186, "step": 7970, "train/total_loss": 0.14394661784172058 }, { "entropy": 8.59889030456543, "epoch": 0.7881154834882341, "mean_token_accuracy": 0.667382001876831, "num_tokens": 20679676.0, "step": 7971, "train/ce_loss": 0.9962515830993652 }, { "epoch": 0.7881154834882341, "step": 7971, "train/sim_loss": 0.0546875 }, { "epoch": 0.7881154834882341, "step": 7971, "train/total_loss": 0.15431267023086548 }, { "entropy": 8.534368515014648, "epoch": 0.7882143563377496, "mean_token_accuracy": 0.7758007049560547, "num_tokens": 20684968.0, "step": 7972, "train/ce_loss": 0.6547650098800659 }, { "epoch": 0.7882143563377496, "step": 7972, "train/sim_loss": 0.06640625 }, { "epoch": 0.7882143563377496, "step": 7972, "train/total_loss": 0.13188275694847107 }, { "entropy": 8.79300594329834, "epoch": 0.7883132291872652, "mean_token_accuracy": 0.7480106353759766, "num_tokens": 20690174.0, "step": 7973, "train/ce_loss": 0.600091278553009 }, { "epoch": 0.7883132291872652, "step": 7973, "train/sim_loss": 0.0703125 }, { "epoch": 0.7883132291872652, "step": 7973, "train/total_loss": 0.13032162189483643 }, { "entropy": 9.573885917663574, "epoch": 0.7884121020367807, "mean_token_accuracy": 0.6720430254936218, "num_tokens": 20694967.0, "step": 7974, "train/ce_loss": 2.0439798831939697 }, { "epoch": 0.7884121020367807, "step": 7974, "train/sim_loss": 0.04296875 }, { "epoch": 0.7884121020367807, "step": 7974, "train/total_loss": 0.2473667412996292 }, { "entropy": 9.054139137268066, "epoch": 0.7885109748862962, "mean_token_accuracy": 0.7531914710998535, "num_tokens": 20700189.0, "step": 7975, "train/ce_loss": 0.9281861186027527 }, { "epoch": 0.7885109748862962, "step": 7975, "train/sim_loss": 0.0546875 }, { "epoch": 0.7885109748862962, "step": 7975, "train/total_loss": 0.14750611782073975 }, { "entropy": 8.449117660522461, "epoch": 0.7886098477358118, "mean_token_accuracy": 0.7948139905929565, "num_tokens": 20705548.0, "step": 7976, "train/ce_loss": 0.6490321755409241 }, { "epoch": 0.7886098477358118, "step": 7976, "train/sim_loss": 0.05859375 }, { "epoch": 0.7886098477358118, "step": 7976, "train/total_loss": 0.12349697202444077 }, { "entropy": 8.732007026672363, "epoch": 0.7887087205853273, "mean_token_accuracy": 0.7657784223556519, "num_tokens": 20710665.0, "step": 7977, "train/ce_loss": 0.5330588817596436 }, { "epoch": 0.7887087205853273, "step": 7977, "train/sim_loss": 0.0234375 }, { "epoch": 0.7887087205853273, "step": 7977, "train/total_loss": 0.07674339413642883 }, { "entropy": 9.389777183532715, "epoch": 0.7888075934348427, "mean_token_accuracy": 0.7830508351325989, "num_tokens": 20715658.0, "step": 7978, "train/ce_loss": 6.085639938646636e-07 }, { "epoch": 0.7888075934348427, "step": 7978, "train/sim_loss": 0.01953125 }, { "epoch": 0.7888075934348427, "step": 7978, "train/total_loss": 0.019531311467289925 }, { "entropy": 8.771812438964844, "epoch": 0.7889064662843583, "mean_token_accuracy": 0.7468944191932678, "num_tokens": 20720768.0, "step": 7979, "train/ce_loss": 1.1544526815414429 }, { "epoch": 0.7889064662843583, "step": 7979, "train/sim_loss": 0.015625 }, { "epoch": 0.7889064662843583, "step": 7979, "train/total_loss": 0.13107027113437653 }, { "epoch": 0.7890053391338738, "grad_norm": 0.67234867811203, "learning_rate": 8.029718637195273e-06, "loss": 0.1305, "step": 7980 }, { "entropy": 8.965940475463867, "epoch": 0.7890053391338738, "mean_token_accuracy": 0.7742448449134827, "num_tokens": 20725814.0, "step": 7980, "train/ce_loss": 1.0769838094711304 }, { "epoch": 0.7890053391338738, "step": 7980, "train/sim_loss": 0.0546875 }, { "epoch": 0.7890053391338738, "step": 7980, "train/total_loss": 0.16238588094711304 }, { "entropy": 8.755380630493164, "epoch": 0.7891042119833893, "mean_token_accuracy": 0.7167567610740662, "num_tokens": 20731220.0, "step": 7981, "train/ce_loss": 1.0310207605361938 }, { "epoch": 0.7891042119833893, "step": 7981, "train/sim_loss": 0.0703125 }, { "epoch": 0.7891042119833893, "step": 7981, "train/total_loss": 0.17341458797454834 }, { "entropy": 9.05705451965332, "epoch": 0.7892030848329049, "mean_token_accuracy": 0.7604166865348816, "num_tokens": 20736324.0, "step": 7982, "train/ce_loss": 1.1232382348680403e-06 }, { "epoch": 0.7892030848329049, "step": 7982, "train/sim_loss": 0.046875 }, { "epoch": 0.7892030848329049, "step": 7982, "train/total_loss": 0.046875111758708954 }, { "entropy": 8.760584831237793, "epoch": 0.7893019576824204, "mean_token_accuracy": 0.7796013951301575, "num_tokens": 20741626.0, "step": 7983, "train/ce_loss": 1.0431007146835327 }, { "epoch": 0.7893019576824204, "step": 7983, "train/sim_loss": 0.0390625 }, { "epoch": 0.7893019576824204, "step": 7983, "train/total_loss": 0.1433725655078888 }, { "entropy": 9.176156997680664, "epoch": 0.7894008305319359, "mean_token_accuracy": 0.7445651888847351, "num_tokens": 20746585.0, "step": 7984, "train/ce_loss": 5.271598638501018e-06 }, { "epoch": 0.7894008305319359, "step": 7984, "train/sim_loss": 0.04296875 }, { "epoch": 0.7894008305319359, "step": 7984, "train/total_loss": 0.04296927899122238 }, { "entropy": 8.639911651611328, "epoch": 0.7894997033814515, "mean_token_accuracy": 0.6930022835731506, "num_tokens": 20751976.0, "step": 7985, "train/ce_loss": 1.131714105606079 }, { "epoch": 0.7894997033814515, "step": 7985, "train/sim_loss": 0.06640625 }, { "epoch": 0.7894997033814515, "step": 7985, "train/total_loss": 0.17957766354084015 }, { "entropy": 9.173746109008789, "epoch": 0.789598576230967, "mean_token_accuracy": 0.7767145037651062, "num_tokens": 20757029.0, "step": 7986, "train/ce_loss": 0.716386079788208 }, { "epoch": 0.789598576230967, "step": 7986, "train/sim_loss": 0.015625 }, { "epoch": 0.789598576230967, "step": 7986, "train/total_loss": 0.08726360648870468 }, { "entropy": 8.37651252746582, "epoch": 0.7896974490804824, "mean_token_accuracy": 0.7713097929954529, "num_tokens": 20762505.0, "step": 7987, "train/ce_loss": 1.0379140377044678 }, { "epoch": 0.7896974490804824, "step": 7987, "train/sim_loss": 0.03125 }, { "epoch": 0.7896974490804824, "step": 7987, "train/total_loss": 0.13504141569137573 }, { "entropy": 8.751646041870117, "epoch": 0.789796321929998, "mean_token_accuracy": 0.7278287410736084, "num_tokens": 20767694.0, "step": 7988, "train/ce_loss": 1.356079906145169e-06 }, { "epoch": 0.789796321929998, "step": 7988, "train/sim_loss": 0.02734375 }, { "epoch": 0.789796321929998, "step": 7988, "train/total_loss": 0.027343885973095894 }, { "entropy": 9.12930679321289, "epoch": 0.7898951947795135, "mean_token_accuracy": 0.7540983557701111, "num_tokens": 20772824.0, "step": 7989, "train/ce_loss": 0.7866105437278748 }, { "epoch": 0.7898951947795135, "step": 7989, "train/sim_loss": 0.14453125 }, { "epoch": 0.7898951947795135, "step": 7989, "train/total_loss": 0.22319230437278748 }, { "entropy": 8.300350189208984, "epoch": 0.789994067629029, "mean_token_accuracy": 0.7761467695236206, "num_tokens": 20778429.0, "step": 7990, "train/ce_loss": 0.7868548631668091 }, { "epoch": 0.789994067629029, "step": 7990, "train/sim_loss": 0.0703125 }, { "epoch": 0.789994067629029, "step": 7990, "train/total_loss": 0.14899799227714539 }, { "entropy": 8.613931655883789, "epoch": 0.7900929404785446, "mean_token_accuracy": 0.8106951713562012, "num_tokens": 20783832.0, "step": 7991, "train/ce_loss": 0.6341633796691895 }, { "epoch": 0.7900929404785446, "step": 7991, "train/sim_loss": 0.015625 }, { "epoch": 0.7900929404785446, "step": 7991, "train/total_loss": 0.07904133945703506 }, { "entropy": 8.594196319580078, "epoch": 0.7901918133280601, "mean_token_accuracy": 0.7136514782905579, "num_tokens": 20789196.0, "step": 7992, "train/ce_loss": 1.3270422220230103 }, { "epoch": 0.7901918133280601, "step": 7992, "train/sim_loss": 0.0703125 }, { "epoch": 0.7901918133280601, "step": 7992, "train/total_loss": 0.2030167281627655 }, { "entropy": 8.562560081481934, "epoch": 0.7902906861775756, "mean_token_accuracy": 0.7399267554283142, "num_tokens": 20794525.0, "step": 7993, "train/ce_loss": 0.4412449896335602 }, { "epoch": 0.7902906861775756, "step": 7993, "train/sim_loss": 0.01171875 }, { "epoch": 0.7902906861775756, "step": 7993, "train/total_loss": 0.05584324896335602 }, { "entropy": 8.837336540222168, "epoch": 0.7903895590270912, "mean_token_accuracy": 0.7310705184936523, "num_tokens": 20800023.0, "step": 7994, "train/ce_loss": 0.6181901693344116 }, { "epoch": 0.7903895590270912, "step": 7994, "train/sim_loss": 0.0703125 }, { "epoch": 0.7903895590270912, "step": 7994, "train/total_loss": 0.13213151693344116 }, { "entropy": 8.306492805480957, "epoch": 0.7904884318766067, "mean_token_accuracy": 0.8111979365348816, "num_tokens": 20805260.0, "step": 7995, "train/ce_loss": 2.1152385670575313e-05 }, { "epoch": 0.7904884318766067, "step": 7995, "train/sim_loss": 0.0390625 }, { "epoch": 0.7904884318766067, "step": 7995, "train/total_loss": 0.039064615964889526 }, { "entropy": 8.57056999206543, "epoch": 0.7905873047261222, "mean_token_accuracy": 0.7406143546104431, "num_tokens": 20810589.0, "step": 7996, "train/ce_loss": 0.44312337040901184 }, { "epoch": 0.7905873047261222, "step": 7996, "train/sim_loss": 0.01953125 }, { "epoch": 0.7905873047261222, "step": 7996, "train/total_loss": 0.06384359300136566 }, { "entropy": 8.586959838867188, "epoch": 0.7906861775756377, "mean_token_accuracy": 0.7104018926620483, "num_tokens": 20815889.0, "step": 7997, "train/ce_loss": 1.5188014507293701 }, { "epoch": 0.7906861775756377, "step": 7997, "train/sim_loss": 0.04296875 }, { "epoch": 0.7906861775756377, "step": 7997, "train/total_loss": 0.194848895072937 }, { "entropy": 8.29849910736084, "epoch": 0.7907850504251532, "mean_token_accuracy": 0.7167235612869263, "num_tokens": 20821299.0, "step": 7998, "train/ce_loss": 0.7832455039024353 }, { "epoch": 0.7907850504251532, "step": 7998, "train/sim_loss": 0.05078125 }, { "epoch": 0.7907850504251532, "step": 7998, "train/total_loss": 0.129105806350708 }, { "entropy": 8.774063110351562, "epoch": 0.7908839232746687, "mean_token_accuracy": 0.7605262994766235, "num_tokens": 20826559.0, "step": 7999, "train/ce_loss": 0.4658834636211395 }, { "epoch": 0.7908839232746687, "step": 7999, "train/sim_loss": 0.05078125 }, { "epoch": 0.7908839232746687, "step": 7999, "train/total_loss": 0.09736959636211395 }, { "epoch": 0.7909827961241843, "grad_norm": 0.6165838837623596, "learning_rate": 8.024773772437325e-06, "loss": 0.1267, "step": 8000 }, { "entropy": 8.382328987121582, "epoch": 0.7909827961241843, "mean_token_accuracy": 0.6791630387306213, "num_tokens": 20832156.0, "step": 8000, "train/ce_loss": 1.113782286643982 }, { "epoch": 0.7909827961241843, "step": 8000, "train/sim_loss": 0.0859375 }, { "epoch": 0.7909827961241843, "step": 8000, "train/total_loss": 0.19731572270393372 }, { "entropy": 8.665933609008789, "epoch": 0.7910816689736998, "mean_token_accuracy": 0.7242206335067749, "num_tokens": 20837632.0, "step": 8001, "train/ce_loss": 1.1206034421920776 }, { "epoch": 0.7910816689736998, "step": 8001, "train/sim_loss": 0.07421875 }, { "epoch": 0.7910816689736998, "step": 8001, "train/total_loss": 0.1862790882587433 }, { "entropy": 8.408807754516602, "epoch": 0.7911805418232154, "mean_token_accuracy": 0.7311936020851135, "num_tokens": 20843124.0, "step": 8002, "train/ce_loss": 1.1915003061294556 }, { "epoch": 0.7911805418232154, "step": 8002, "train/sim_loss": 0.09375 }, { "epoch": 0.7911805418232154, "step": 8002, "train/total_loss": 0.2129000425338745 }, { "entropy": 8.198397636413574, "epoch": 0.7912794146727309, "mean_token_accuracy": 0.8179509043693542, "num_tokens": 20848756.0, "step": 8003, "train/ce_loss": 0.5002986788749695 }, { "epoch": 0.7912794146727309, "step": 8003, "train/sim_loss": 0.01953125 }, { "epoch": 0.7912794146727309, "step": 8003, "train/total_loss": 0.06956112384796143 }, { "entropy": 8.506534576416016, "epoch": 0.7913782875222464, "mean_token_accuracy": 0.7067209482192993, "num_tokens": 20854222.0, "step": 8004, "train/ce_loss": 0.7447643876075745 }, { "epoch": 0.7913782875222464, "step": 8004, "train/sim_loss": 0.0625 }, { "epoch": 0.7913782875222464, "step": 8004, "train/total_loss": 0.1369764506816864 }, { "entropy": 8.983867645263672, "epoch": 0.791477160371762, "mean_token_accuracy": 0.7689393758773804, "num_tokens": 20859472.0, "step": 8005, "train/ce_loss": 0.9216192960739136 }, { "epoch": 0.791477160371762, "step": 8005, "train/sim_loss": 0.109375 }, { "epoch": 0.791477160371762, "step": 8005, "train/total_loss": 0.20153692364692688 }, { "entropy": 8.91226577758789, "epoch": 0.7915760332212775, "mean_token_accuracy": 0.7316715717315674, "num_tokens": 20864596.0, "step": 8006, "train/ce_loss": 1.0091112852096558 }, { "epoch": 0.7915760332212775, "step": 8006, "train/sim_loss": 0.09375 }, { "epoch": 0.7915760332212775, "step": 8006, "train/total_loss": 0.19466114044189453 }, { "entropy": 9.32400894165039, "epoch": 0.7916749060707929, "mean_token_accuracy": 0.7775735259056091, "num_tokens": 20869586.0, "step": 8007, "train/ce_loss": 2.9520026600948768e-06 }, { "epoch": 0.7916749060707929, "step": 8007, "train/sim_loss": 0.05078125 }, { "epoch": 0.7916749060707929, "step": 8007, "train/total_loss": 0.05078154429793358 }, { "entropy": 8.588582992553711, "epoch": 0.7917737789203085, "mean_token_accuracy": 0.7236692905426025, "num_tokens": 20874962.0, "step": 8008, "train/ce_loss": 1.1916790008544922 }, { "epoch": 0.7917737789203085, "step": 8008, "train/sim_loss": 0.01953125 }, { "epoch": 0.7917737789203085, "step": 8008, "train/total_loss": 0.13869914412498474 }, { "entropy": 8.967981338500977, "epoch": 0.791872651769824, "mean_token_accuracy": 0.753926694393158, "num_tokens": 20880203.0, "step": 8009, "train/ce_loss": 0.5056743621826172 }, { "epoch": 0.791872651769824, "step": 8009, "train/sim_loss": 0.06640625 }, { "epoch": 0.791872651769824, "step": 8009, "train/total_loss": 0.11697368323802948 }, { "entropy": 9.474925994873047, "epoch": 0.7919715246193395, "mean_token_accuracy": 0.7214699983596802, "num_tokens": 20885184.0, "step": 8010, "train/ce_loss": 1.2627744674682617 }, { "epoch": 0.7919715246193395, "step": 8010, "train/sim_loss": 0.05859375 }, { "epoch": 0.7919715246193395, "step": 8010, "train/total_loss": 0.18487119674682617 }, { "entropy": 9.434576988220215, "epoch": 0.7920703974688551, "mean_token_accuracy": 0.6704225540161133, "num_tokens": 20889996.0, "step": 8011, "train/ce_loss": 0.5485074520111084 }, { "epoch": 0.7920703974688551, "step": 8011, "train/sim_loss": 0.07421875 }, { "epoch": 0.7920703974688551, "step": 8011, "train/total_loss": 0.1290694922208786 }, { "entropy": 8.907894134521484, "epoch": 0.7921692703183706, "mean_token_accuracy": 0.7462499737739563, "num_tokens": 20895250.0, "step": 8012, "train/ce_loss": 0.9570124745368958 }, { "epoch": 0.7921692703183706, "step": 8012, "train/sim_loss": 0.01953125 }, { "epoch": 0.7921692703183706, "step": 8012, "train/total_loss": 0.11523249745368958 }, { "entropy": 8.427125930786133, "epoch": 0.7922681431678861, "mean_token_accuracy": 0.7129337787628174, "num_tokens": 20900657.0, "step": 8013, "train/ce_loss": 1.1414391994476318 }, { "epoch": 0.7922681431678861, "step": 8013, "train/sim_loss": 0.046875 }, { "epoch": 0.7922681431678861, "step": 8013, "train/total_loss": 0.16101892292499542 }, { "entropy": 8.855546951293945, "epoch": 0.7923670160174017, "mean_token_accuracy": 0.7314356565475464, "num_tokens": 20905914.0, "step": 8014, "train/ce_loss": 1.0533287525177002 }, { "epoch": 0.7923670160174017, "step": 8014, "train/sim_loss": 0.04296875 }, { "epoch": 0.7923670160174017, "step": 8014, "train/total_loss": 0.1483016312122345 }, { "entropy": 9.2638521194458, "epoch": 0.7924658888669172, "mean_token_accuracy": 0.7286245226860046, "num_tokens": 20910943.0, "step": 8015, "train/ce_loss": 0.7868447303771973 }, { "epoch": 0.7924658888669172, "step": 8015, "train/sim_loss": 0.0546875 }, { "epoch": 0.7924658888669172, "step": 8015, "train/total_loss": 0.1333719789981842 }, { "entropy": 8.576969146728516, "epoch": 0.7925647617164326, "mean_token_accuracy": 0.720710039138794, "num_tokens": 20916322.0, "step": 8016, "train/ce_loss": 0.5343302488327026 }, { "epoch": 0.7925647617164326, "step": 8016, "train/sim_loss": 0.03515625 }, { "epoch": 0.7925647617164326, "step": 8016, "train/total_loss": 0.08858928084373474 }, { "entropy": 8.793830871582031, "epoch": 0.7926636345659482, "mean_token_accuracy": 0.7413554787635803, "num_tokens": 20921481.0, "step": 8017, "train/ce_loss": 0.7343287467956543 }, { "epoch": 0.7926636345659482, "step": 8017, "train/sim_loss": 0.05078125 }, { "epoch": 0.7926636345659482, "step": 8017, "train/total_loss": 0.12421412765979767 }, { "entropy": 8.47189712524414, "epoch": 0.7927625074154637, "mean_token_accuracy": 0.7388414144515991, "num_tokens": 20927218.0, "step": 8018, "train/ce_loss": 0.5181703567504883 }, { "epoch": 0.7927625074154637, "step": 8018, "train/sim_loss": 0.06640625 }, { "epoch": 0.7927625074154637, "step": 8018, "train/total_loss": 0.11822328716516495 }, { "entropy": 8.603368759155273, "epoch": 0.7928613802649792, "mean_token_accuracy": 0.7450058460235596, "num_tokens": 20932722.0, "step": 8019, "train/ce_loss": 0.7076160907745361 }, { "epoch": 0.7928613802649792, "step": 8019, "train/sim_loss": 0.03515625 }, { "epoch": 0.7928613802649792, "step": 8019, "train/total_loss": 0.10591786354780197 }, { "epoch": 0.7929602531144948, "grad_norm": 0.6106243133544922, "learning_rate": 8.019828907679375e-06, "loss": 0.1447, "step": 8020 }, { "entropy": 8.636609077453613, "epoch": 0.7929602531144948, "mean_token_accuracy": 0.6854565739631653, "num_tokens": 20938063.0, "step": 8020, "train/ce_loss": 1.7223507165908813 }, { "epoch": 0.7929602531144948, "step": 8020, "train/sim_loss": 0.0390625 }, { "epoch": 0.7929602531144948, "step": 8020, "train/total_loss": 0.21129757165908813 }, { "entropy": 8.813518524169922, "epoch": 0.7930591259640103, "mean_token_accuracy": 0.7285902500152588, "num_tokens": 20943288.0, "step": 8021, "train/ce_loss": 0.8579605221748352 }, { "epoch": 0.7930591259640103, "step": 8021, "train/sim_loss": 0.05078125 }, { "epoch": 0.7930591259640103, "step": 8021, "train/total_loss": 0.136577308177948 }, { "entropy": 8.812105178833008, "epoch": 0.7931579988135258, "mean_token_accuracy": 0.7507122755050659, "num_tokens": 20948458.0, "step": 8022, "train/ce_loss": 1.2979592084884644 }, { "epoch": 0.7931579988135258, "step": 8022, "train/sim_loss": 0.02734375 }, { "epoch": 0.7931579988135258, "step": 8022, "train/total_loss": 0.15713967382907867 }, { "entropy": 8.767423629760742, "epoch": 0.7932568716630414, "mean_token_accuracy": 0.7603305578231812, "num_tokens": 20953932.0, "step": 8023, "train/ce_loss": 0.4973965585231781 }, { "epoch": 0.7932568716630414, "step": 8023, "train/sim_loss": 0.0234375 }, { "epoch": 0.7932568716630414, "step": 8023, "train/total_loss": 0.07317715883255005 }, { "entropy": 8.79323673248291, "epoch": 0.7933557445125569, "mean_token_accuracy": 0.7204058766365051, "num_tokens": 20959292.0, "step": 8024, "train/ce_loss": 0.7244845628738403 }, { "epoch": 0.7933557445125569, "step": 8024, "train/sim_loss": 0.0234375 }, { "epoch": 0.7933557445125569, "step": 8024, "train/total_loss": 0.09588595479726791 }, { "entropy": 8.509698867797852, "epoch": 0.7934546173620723, "mean_token_accuracy": 0.7055630683898926, "num_tokens": 20964491.0, "step": 8025, "train/ce_loss": 0.5640966892242432 }, { "epoch": 0.7934546173620723, "step": 8025, "train/sim_loss": 0.05078125 }, { "epoch": 0.7934546173620723, "step": 8025, "train/total_loss": 0.10719092190265656 }, { "entropy": 8.53420352935791, "epoch": 0.7935534902115879, "mean_token_accuracy": 0.7690721750259399, "num_tokens": 20969951.0, "step": 8026, "train/ce_loss": 0.4616752564907074 }, { "epoch": 0.7935534902115879, "step": 8026, "train/sim_loss": 0.02734375 }, { "epoch": 0.7935534902115879, "step": 8026, "train/total_loss": 0.0735112726688385 }, { "entropy": 9.335734367370605, "epoch": 0.7936523630611034, "mean_token_accuracy": 0.7820267677307129, "num_tokens": 20974926.0, "step": 8027, "train/ce_loss": 1.5045855045318604 }, { "epoch": 0.7936523630611034, "step": 8027, "train/sim_loss": 0.0859375 }, { "epoch": 0.7936523630611034, "step": 8027, "train/total_loss": 0.23639605939388275 }, { "entropy": 8.86979866027832, "epoch": 0.7937512359106189, "mean_token_accuracy": 0.7473560571670532, "num_tokens": 20980290.0, "step": 8028, "train/ce_loss": 0.551834762096405 }, { "epoch": 0.7937512359106189, "step": 8028, "train/sim_loss": 0.03515625 }, { "epoch": 0.7937512359106189, "step": 8028, "train/total_loss": 0.09033972769975662 }, { "entropy": 9.030107498168945, "epoch": 0.7938501087601345, "mean_token_accuracy": 0.7059679627418518, "num_tokens": 20985458.0, "step": 8029, "train/ce_loss": 0.9660660624504089 }, { "epoch": 0.7938501087601345, "step": 8029, "train/sim_loss": 0.01953125 }, { "epoch": 0.7938501087601345, "step": 8029, "train/total_loss": 0.11613785475492477 }, { "entropy": 9.329030990600586, "epoch": 0.79394898160965, "mean_token_accuracy": 0.7026239037513733, "num_tokens": 20990211.0, "step": 8030, "train/ce_loss": 2.4383382424275624e-06 }, { "epoch": 0.79394898160965, "step": 8030, "train/sim_loss": 0.0390625 }, { "epoch": 0.79394898160965, "step": 8030, "train/total_loss": 0.0390627421438694 }, { "entropy": 9.173990249633789, "epoch": 0.7940478544591655, "mean_token_accuracy": 0.7726597189903259, "num_tokens": 20995291.0, "step": 8031, "train/ce_loss": 0.7765879034996033 }, { "epoch": 0.7940478544591655, "step": 8031, "train/sim_loss": 0.05078125 }, { "epoch": 0.7940478544591655, "step": 8031, "train/total_loss": 0.12844005227088928 }, { "entropy": 9.423505783081055, "epoch": 0.7941467273086811, "mean_token_accuracy": 0.6603773832321167, "num_tokens": 21000213.0, "step": 8032, "train/ce_loss": 4.033063305541873e-06 }, { "epoch": 0.7941467273086811, "step": 8032, "train/sim_loss": 0.0546875 }, { "epoch": 0.7941467273086811, "step": 8032, "train/total_loss": 0.054687902331352234 }, { "entropy": 8.714384078979492, "epoch": 0.7942456001581966, "mean_token_accuracy": 0.744516134262085, "num_tokens": 21005463.0, "step": 8033, "train/ce_loss": 0.688065767288208 }, { "epoch": 0.7942456001581966, "step": 8033, "train/sim_loss": 0.02734375 }, { "epoch": 0.7942456001581966, "step": 8033, "train/total_loss": 0.09615033119916916 }, { "entropy": 8.982434272766113, "epoch": 0.794344473007712, "mean_token_accuracy": 0.6989721059799194, "num_tokens": 21010597.0, "step": 8034, "train/ce_loss": 1.1527574062347412 }, { "epoch": 0.794344473007712, "step": 8034, "train/sim_loss": 0.14453125 }, { "epoch": 0.794344473007712, "step": 8034, "train/total_loss": 0.2598069906234741 }, { "entropy": 9.549768447875977, "epoch": 0.7944433458572276, "mean_token_accuracy": 0.6770833134651184, "num_tokens": 21015279.0, "step": 8035, "train/ce_loss": 4.862302830588305e-06 }, { "epoch": 0.7944433458572276, "step": 8035, "train/sim_loss": 0.0546875 }, { "epoch": 0.7944433458572276, "step": 8035, "train/total_loss": 0.0546879880130291 }, { "entropy": 8.494789123535156, "epoch": 0.7945422187067431, "mean_token_accuracy": 0.7814776301383972, "num_tokens": 21020720.0, "step": 8036, "train/ce_loss": 0.793206512928009 }, { "epoch": 0.7945422187067431, "step": 8036, "train/sim_loss": 0.0546875 }, { "epoch": 0.7945422187067431, "step": 8036, "train/total_loss": 0.13400815427303314 }, { "entropy": 8.888826370239258, "epoch": 0.7946410915562586, "mean_token_accuracy": 0.7537091970443726, "num_tokens": 21025807.0, "step": 8037, "train/ce_loss": 0.4757600724697113 }, { "epoch": 0.7946410915562586, "step": 8037, "train/sim_loss": 0.09375 }, { "epoch": 0.7946410915562586, "step": 8037, "train/total_loss": 0.14132601022720337 }, { "entropy": 8.643922805786133, "epoch": 0.7947399644057742, "mean_token_accuracy": 0.7627695798873901, "num_tokens": 21031175.0, "step": 8038, "train/ce_loss": 0.4287779927253723 }, { "epoch": 0.7947399644057742, "step": 8038, "train/sim_loss": 0.01953125 }, { "epoch": 0.7947399644057742, "step": 8038, "train/total_loss": 0.06240905076265335 }, { "entropy": 8.62867546081543, "epoch": 0.7948388372552897, "mean_token_accuracy": 0.7097480893135071, "num_tokens": 21036536.0, "step": 8039, "train/ce_loss": 1.2380282878875732 }, { "epoch": 0.7948388372552897, "step": 8039, "train/sim_loss": 0.140625 }, { "epoch": 0.7948388372552897, "step": 8039, "train/total_loss": 0.2644278407096863 }, { "epoch": 0.7949377101048052, "grad_norm": 0.6808719635009766, "learning_rate": 8.014884042921427e-06, "loss": 0.1365, "step": 8040 }, { "entropy": 9.270933151245117, "epoch": 0.7949377101048052, "mean_token_accuracy": 0.7643442749977112, "num_tokens": 21041484.0, "step": 8040, "train/ce_loss": 1.065250277519226 }, { "epoch": 0.7949377101048052, "step": 8040, "train/sim_loss": 0.0390625 }, { "epoch": 0.7949377101048052, "step": 8040, "train/total_loss": 0.14558753371238708 }, { "entropy": 8.76633358001709, "epoch": 0.7950365829543208, "mean_token_accuracy": 0.7448609471321106, "num_tokens": 21046722.0, "step": 8041, "train/ce_loss": 0.8189980387687683 }, { "epoch": 0.7950365829543208, "step": 8041, "train/sim_loss": 0.0390625 }, { "epoch": 0.7950365829543208, "step": 8041, "train/total_loss": 0.12096230685710907 }, { "entropy": 9.02629280090332, "epoch": 0.7951354558038363, "mean_token_accuracy": 0.7824143171310425, "num_tokens": 21051861.0, "step": 8042, "train/ce_loss": 1.332713007926941 }, { "epoch": 0.7951354558038363, "step": 8042, "train/sim_loss": 0.06640625 }, { "epoch": 0.7951354558038363, "step": 8042, "train/total_loss": 0.19967755675315857 }, { "entropy": 8.792691230773926, "epoch": 0.7952343286533518, "mean_token_accuracy": 0.7832929491996765, "num_tokens": 21057169.0, "step": 8043, "train/ce_loss": 0.7716123461723328 }, { "epoch": 0.7952343286533518, "step": 8043, "train/sim_loss": 0.0234375 }, { "epoch": 0.7952343286533518, "step": 8043, "train/total_loss": 0.10059873759746552 }, { "entropy": 8.570995330810547, "epoch": 0.7953332015028673, "mean_token_accuracy": 0.7436932325363159, "num_tokens": 21062602.0, "step": 8044, "train/ce_loss": 1.0313103199005127 }, { "epoch": 0.7953332015028673, "step": 8044, "train/sim_loss": 0.046875 }, { "epoch": 0.7953332015028673, "step": 8044, "train/total_loss": 0.1500060260295868 }, { "entropy": 8.827428817749023, "epoch": 0.7954320743523828, "mean_token_accuracy": 0.7553072571754456, "num_tokens": 21067957.0, "step": 8045, "train/ce_loss": 0.8951282501220703 }, { "epoch": 0.7954320743523828, "step": 8045, "train/sim_loss": 0.08984375 }, { "epoch": 0.7954320743523828, "step": 8045, "train/total_loss": 0.17935657501220703 }, { "entropy": 8.760117530822754, "epoch": 0.7955309472018983, "mean_token_accuracy": 0.7811704874038696, "num_tokens": 21073205.0, "step": 8046, "train/ce_loss": 1.0642935037612915 }, { "epoch": 0.7955309472018983, "step": 8046, "train/sim_loss": 0.04296875 }, { "epoch": 0.7955309472018983, "step": 8046, "train/total_loss": 0.1493981033563614 }, { "entropy": 8.706624031066895, "epoch": 0.7956298200514139, "mean_token_accuracy": 0.7051442861557007, "num_tokens": 21078485.0, "step": 8047, "train/ce_loss": 0.5215659141540527 }, { "epoch": 0.7956298200514139, "step": 8047, "train/sim_loss": 0.07421875 }, { "epoch": 0.7956298200514139, "step": 8047, "train/total_loss": 0.12637534737586975 }, { "entropy": 8.552886962890625, "epoch": 0.7957286929009294, "mean_token_accuracy": 0.769328236579895, "num_tokens": 21083766.0, "step": 8048, "train/ce_loss": 0.7825904488563538 }, { "epoch": 0.7957286929009294, "step": 8048, "train/sim_loss": 0.0546875 }, { "epoch": 0.7957286929009294, "step": 8048, "train/total_loss": 0.13294655084609985 }, { "entropy": 8.60584831237793, "epoch": 0.7958275657504449, "mean_token_accuracy": 0.740899384021759, "num_tokens": 21089199.0, "step": 8049, "train/ce_loss": 0.49789944291114807 }, { "epoch": 0.7958275657504449, "step": 8049, "train/sim_loss": 0.0546875 }, { "epoch": 0.7958275657504449, "step": 8049, "train/total_loss": 0.10447745025157928 }, { "entropy": 8.944599151611328, "epoch": 0.7959264385999605, "mean_token_accuracy": 0.7732843160629272, "num_tokens": 21094487.0, "step": 8050, "train/ce_loss": 1.007731318473816 }, { "epoch": 0.7959264385999605, "step": 8050, "train/sim_loss": 0.03125 }, { "epoch": 0.7959264385999605, "step": 8050, "train/total_loss": 0.13202312588691711 }, { "entropy": 8.915923118591309, "epoch": 0.796025311449476, "mean_token_accuracy": 0.7136498689651489, "num_tokens": 21099605.0, "step": 8051, "train/ce_loss": 1.4537798166275024 }, { "epoch": 0.796025311449476, "step": 8051, "train/sim_loss": 0.07421875 }, { "epoch": 0.796025311449476, "step": 8051, "train/total_loss": 0.219596728682518 }, { "entropy": 9.284667015075684, "epoch": 0.7961241842989915, "mean_token_accuracy": 0.762666642665863, "num_tokens": 21104423.0, "step": 8052, "train/ce_loss": 1.6215224266052246 }, { "epoch": 0.7961241842989915, "step": 8052, "train/sim_loss": 0.03515625 }, { "epoch": 0.7961241842989915, "step": 8052, "train/total_loss": 0.1973084956407547 }, { "entropy": 9.210882186889648, "epoch": 0.796223057148507, "mean_token_accuracy": 0.7402597665786743, "num_tokens": 21109519.0, "step": 8053, "train/ce_loss": 1.3008627891540527 }, { "epoch": 0.796223057148507, "step": 8053, "train/sim_loss": 0.10546875 }, { "epoch": 0.796223057148507, "step": 8053, "train/total_loss": 0.235555037856102 }, { "entropy": 9.005620956420898, "epoch": 0.7963219299980225, "mean_token_accuracy": 0.7664974331855774, "num_tokens": 21114766.0, "step": 8054, "train/ce_loss": 0.9678490161895752 }, { "epoch": 0.7963219299980225, "step": 8054, "train/sim_loss": 0.04296875 }, { "epoch": 0.7963219299980225, "step": 8054, "train/total_loss": 0.13975365459918976 }, { "entropy": 8.489038467407227, "epoch": 0.796420802847538, "mean_token_accuracy": 0.7626546621322632, "num_tokens": 21120156.0, "step": 8055, "train/ce_loss": 0.6151663661003113 }, { "epoch": 0.796420802847538, "step": 8055, "train/sim_loss": 0.11328125 }, { "epoch": 0.796420802847538, "step": 8055, "train/total_loss": 0.1747978925704956 }, { "entropy": 9.020843505859375, "epoch": 0.7965196756970536, "mean_token_accuracy": 0.682539701461792, "num_tokens": 21125371.0, "step": 8056, "train/ce_loss": 1.6898895502090454 }, { "epoch": 0.7965196756970536, "step": 8056, "train/sim_loss": 0.078125 }, { "epoch": 0.7965196756970536, "step": 8056, "train/total_loss": 0.24711395800113678 }, { "entropy": 9.3050537109375, "epoch": 0.7966185485465691, "mean_token_accuracy": 0.7439446449279785, "num_tokens": 21130385.0, "step": 8057, "train/ce_loss": 1.4615914821624756 }, { "epoch": 0.7966185485465691, "step": 8057, "train/sim_loss": 0.05078125 }, { "epoch": 0.7966185485465691, "step": 8057, "train/total_loss": 0.19694040715694427 }, { "entropy": 9.728808403015137, "epoch": 0.7967174213960846, "mean_token_accuracy": 0.7196765542030334, "num_tokens": 21135193.0, "step": 8058, "train/ce_loss": 8.455596798739862e-06 }, { "epoch": 0.7967174213960846, "step": 8058, "train/sim_loss": 0.04296875 }, { "epoch": 0.7967174213960846, "step": 8058, "train/total_loss": 0.04296959564089775 }, { "entropy": 8.709403038024902, "epoch": 0.7968162942456002, "mean_token_accuracy": 0.7583047151565552, "num_tokens": 21140515.0, "step": 8059, "train/ce_loss": 0.709441065788269 }, { "epoch": 0.7968162942456002, "step": 8059, "train/sim_loss": 0.03515625 }, { "epoch": 0.7968162942456002, "step": 8059, "train/total_loss": 0.10610035806894302 }, { "epoch": 0.7969151670951157, "grad_norm": 0.6506040692329407, "learning_rate": 8.009939178163478e-06, "loss": 0.1338, "step": 8060 }, { "entropy": 8.993186950683594, "epoch": 0.7969151670951157, "mean_token_accuracy": 0.7208976149559021, "num_tokens": 21145651.0, "step": 8060, "train/ce_loss": 0.6721799373626709 }, { "epoch": 0.7969151670951157, "step": 8060, "train/sim_loss": 0.0859375 }, { "epoch": 0.7969151670951157, "step": 8060, "train/total_loss": 0.15315550565719604 }, { "entropy": 9.022912979125977, "epoch": 0.7970140399446312, "mean_token_accuracy": 0.6871657967567444, "num_tokens": 21150860.0, "step": 8061, "train/ce_loss": 1.7690759897232056 }, { "epoch": 0.7970140399446312, "step": 8061, "train/sim_loss": 0.0390625 }, { "epoch": 0.7970140399446312, "step": 8061, "train/total_loss": 0.21597009897232056 }, { "entropy": 9.806182861328125, "epoch": 0.7971129127941468, "mean_token_accuracy": 0.8284023404121399, "num_tokens": 21155635.0, "step": 8062, "train/ce_loss": 5.1401830205577426e-06 }, { "epoch": 0.7971129127941468, "step": 8062, "train/sim_loss": 0.015625 }, { "epoch": 0.7971129127941468, "step": 8062, "train/total_loss": 0.015625514090061188 }, { "entropy": 9.324712753295898, "epoch": 0.7972117856436622, "mean_token_accuracy": 0.7597172856330872, "num_tokens": 21160607.0, "step": 8063, "train/ce_loss": 1.064432978630066 }, { "epoch": 0.7972117856436622, "step": 8063, "train/sim_loss": 0.05078125 }, { "epoch": 0.7972117856436622, "step": 8063, "train/total_loss": 0.15722455084323883 }, { "entropy": 8.54730224609375, "epoch": 0.7973106584931777, "mean_token_accuracy": 0.7011363506317139, "num_tokens": 21165949.0, "step": 8064, "train/ce_loss": 0.995823085308075 }, { "epoch": 0.7973106584931777, "step": 8064, "train/sim_loss": 0.0546875 }, { "epoch": 0.7973106584931777, "step": 8064, "train/total_loss": 0.15426981449127197 }, { "entropy": 8.729358673095703, "epoch": 0.7974095313426933, "mean_token_accuracy": 0.7662790417671204, "num_tokens": 21171319.0, "step": 8065, "train/ce_loss": 0.858303427696228 }, { "epoch": 0.7974095313426933, "step": 8065, "train/sim_loss": 0.046875 }, { "epoch": 0.7974095313426933, "step": 8065, "train/total_loss": 0.13270534574985504 }, { "entropy": 8.9976806640625, "epoch": 0.7975084041922088, "mean_token_accuracy": 0.7122905254364014, "num_tokens": 21176505.0, "step": 8066, "train/ce_loss": 2.1025643348693848 }, { "epoch": 0.7975084041922088, "step": 8066, "train/sim_loss": 0.08203125 }, { "epoch": 0.7975084041922088, "step": 8066, "train/total_loss": 0.2922877073287964 }, { "entropy": 8.699651718139648, "epoch": 0.7976072770417243, "mean_token_accuracy": 0.7576754093170166, "num_tokens": 21181869.0, "step": 8067, "train/ce_loss": 1.1232519149780273 }, { "epoch": 0.7976072770417243, "step": 8067, "train/sim_loss": 0.1328125 }, { "epoch": 0.7976072770417243, "step": 8067, "train/total_loss": 0.24513769149780273 }, { "entropy": 8.542829513549805, "epoch": 0.7977061498912399, "mean_token_accuracy": 0.694774329662323, "num_tokens": 21187195.0, "step": 8068, "train/ce_loss": 1.2314989566802979 }, { "epoch": 0.7977061498912399, "step": 8068, "train/sim_loss": 0.05078125 }, { "epoch": 0.7977061498912399, "step": 8068, "train/total_loss": 0.17393115162849426 }, { "entropy": 9.735811233520508, "epoch": 0.7978050227407554, "mean_token_accuracy": 0.7394958138465881, "num_tokens": 21191956.0, "step": 8069, "train/ce_loss": 1.8081225156784058 }, { "epoch": 0.7978050227407554, "step": 8069, "train/sim_loss": 0.0625 }, { "epoch": 0.7978050227407554, "step": 8069, "train/total_loss": 0.24331225454807281 }, { "entropy": 9.444601058959961, "epoch": 0.7979038955902709, "mean_token_accuracy": 0.7428571581840515, "num_tokens": 21196910.0, "step": 8070, "train/ce_loss": 2.05026503863337e-06 }, { "epoch": 0.7979038955902709, "step": 8070, "train/sim_loss": 0.01953125 }, { "epoch": 0.7979038955902709, "step": 8070, "train/total_loss": 0.019531454890966415 }, { "entropy": 9.299373626708984, "epoch": 0.7980027684397865, "mean_token_accuracy": 0.7241379022598267, "num_tokens": 21201912.0, "step": 8071, "train/ce_loss": 1.0788629055023193 }, { "epoch": 0.7980027684397865, "step": 8071, "train/sim_loss": 0.0703125 }, { "epoch": 0.7980027684397865, "step": 8071, "train/total_loss": 0.17819878458976746 }, { "entropy": 9.13897705078125, "epoch": 0.7981016412893019, "mean_token_accuracy": 0.744966447353363, "num_tokens": 21206880.0, "step": 8072, "train/ce_loss": 3.742259877981269e-06 }, { "epoch": 0.7981016412893019, "step": 8072, "train/sim_loss": 0.0390625 }, { "epoch": 0.7981016412893019, "step": 8072, "train/total_loss": 0.039062872529029846 }, { "entropy": 8.684549331665039, "epoch": 0.7982005141388174, "mean_token_accuracy": 0.764011800289154, "num_tokens": 21212030.0, "step": 8073, "train/ce_loss": 0.8428314328193665 }, { "epoch": 0.7982005141388174, "step": 8073, "train/sim_loss": 0.04296875 }, { "epoch": 0.7982005141388174, "step": 8073, "train/total_loss": 0.12725189328193665 }, { "entropy": 9.31668758392334, "epoch": 0.798299386988333, "mean_token_accuracy": 0.77625572681427, "num_tokens": 21217085.0, "step": 8074, "train/ce_loss": 5.788366706838133e-06 }, { "epoch": 0.798299386988333, "step": 8074, "train/sim_loss": 0.0859375 }, { "epoch": 0.798299386988333, "step": 8074, "train/total_loss": 0.08593808114528656 }, { "entropy": 8.60556411743164, "epoch": 0.7983982598378485, "mean_token_accuracy": 0.7404305934906006, "num_tokens": 21222449.0, "step": 8075, "train/ce_loss": 0.6787205934524536 }, { "epoch": 0.7983982598378485, "step": 8075, "train/sim_loss": 0.09375 }, { "epoch": 0.7983982598378485, "step": 8075, "train/total_loss": 0.1616220623254776 }, { "entropy": 9.245166778564453, "epoch": 0.798497132687364, "mean_token_accuracy": 0.7532051205635071, "num_tokens": 21227519.0, "step": 8076, "train/ce_loss": 1.3790321350097656 }, { "epoch": 0.798497132687364, "step": 8076, "train/sim_loss": 0.10546875 }, { "epoch": 0.798497132687364, "step": 8076, "train/total_loss": 0.24337196350097656 }, { "entropy": 8.748480796813965, "epoch": 0.7985960055368796, "mean_token_accuracy": 0.7029339671134949, "num_tokens": 21232773.0, "step": 8077, "train/ce_loss": 0.5744321346282959 }, { "epoch": 0.7985960055368796, "step": 8077, "train/sim_loss": 0.1015625 }, { "epoch": 0.7985960055368796, "step": 8077, "train/total_loss": 0.15900571644306183 }, { "entropy": 8.951897621154785, "epoch": 0.7986948783863951, "mean_token_accuracy": 0.6483516693115234, "num_tokens": 21237912.0, "step": 8078, "train/ce_loss": 1.4018466472625732 }, { "epoch": 0.7986948783863951, "step": 8078, "train/sim_loss": 0.07421875 }, { "epoch": 0.7986948783863951, "step": 8078, "train/total_loss": 0.2144034206867218 }, { "entropy": 9.206893920898438, "epoch": 0.7987937512359106, "mean_token_accuracy": 0.7587253451347351, "num_tokens": 21243002.0, "step": 8079, "train/ce_loss": 1.1328591108322144 }, { "epoch": 0.7987937512359106, "step": 8079, "train/sim_loss": 0.0546875 }, { "epoch": 0.7987937512359106, "step": 8079, "train/total_loss": 0.16797341406345367 }, { "epoch": 0.7988926240854262, "grad_norm": 0.7565287947654724, "learning_rate": 8.004994313405528e-06, "loss": 0.1375, "step": 8080 }, { "entropy": 8.812400817871094, "epoch": 0.7988926240854262, "mean_token_accuracy": 0.7281166911125183, "num_tokens": 21248212.0, "step": 8080, "train/ce_loss": 0.8350350856781006 }, { "epoch": 0.7988926240854262, "step": 8080, "train/sim_loss": 0.05859375 }, { "epoch": 0.7988926240854262, "step": 8080, "train/total_loss": 0.14209726452827454 }, { "entropy": 8.763813018798828, "epoch": 0.7989914969349416, "mean_token_accuracy": 0.6979866027832031, "num_tokens": 21253595.0, "step": 8081, "train/ce_loss": 1.2388736009597778 }, { "epoch": 0.7989914969349416, "step": 8081, "train/sim_loss": 0.06640625 }, { "epoch": 0.7989914969349416, "step": 8081, "train/total_loss": 0.19029361009597778 }, { "entropy": 9.12805461883545, "epoch": 0.7990903697844571, "mean_token_accuracy": 0.760869562625885, "num_tokens": 21258659.0, "step": 8082, "train/ce_loss": 0.6981890201568604 }, { "epoch": 0.7990903697844571, "step": 8082, "train/sim_loss": 0.0390625 }, { "epoch": 0.7990903697844571, "step": 8082, "train/total_loss": 0.1088814064860344 }, { "entropy": 8.688815116882324, "epoch": 0.7991892426339727, "mean_token_accuracy": 0.7022653818130493, "num_tokens": 21264000.0, "step": 8083, "train/ce_loss": 1.0230445861816406 }, { "epoch": 0.7991892426339727, "step": 8083, "train/sim_loss": 0.05078125 }, { "epoch": 0.7991892426339727, "step": 8083, "train/total_loss": 0.15308570861816406 }, { "entropy": 8.801924705505371, "epoch": 0.7992881154834882, "mean_token_accuracy": 0.737864077091217, "num_tokens": 21269218.0, "step": 8084, "train/ce_loss": 0.7019436955451965 }, { "epoch": 0.7992881154834882, "step": 8084, "train/sim_loss": 0.02734375 }, { "epoch": 0.7992881154834882, "step": 8084, "train/total_loss": 0.09753812104463577 }, { "entropy": 8.858683586120605, "epoch": 0.7993869883330038, "mean_token_accuracy": 0.6863309144973755, "num_tokens": 21274397.0, "step": 8085, "train/ce_loss": 1.792798399925232 }, { "epoch": 0.7993869883330038, "step": 8085, "train/sim_loss": 0.1953125 }, { "epoch": 0.7993869883330038, "step": 8085, "train/total_loss": 0.3745923638343811 }, { "entropy": 9.004164695739746, "epoch": 0.7994858611825193, "mean_token_accuracy": 0.7594594359397888, "num_tokens": 21279601.0, "step": 8086, "train/ce_loss": 0.7385054230690002 }, { "epoch": 0.7994858611825193, "step": 8086, "train/sim_loss": 0.109375 }, { "epoch": 0.7994858611825193, "step": 8086, "train/total_loss": 0.18322554230690002 }, { "entropy": 10.292235374450684, "epoch": 0.7995847340320348, "mean_token_accuracy": 1.0, "num_tokens": 21283981.0, "step": 8087, "train/ce_loss": 4.2019906686618924e-05 }, { "epoch": 0.7995847340320348, "step": 8087, "train/sim_loss": 0.06640625 }, { "epoch": 0.7995847340320348, "step": 8087, "train/total_loss": 0.06641045212745667 }, { "entropy": 8.526718139648438, "epoch": 0.7996836068815504, "mean_token_accuracy": 0.7063829898834229, "num_tokens": 21289353.0, "step": 8088, "train/ce_loss": 0.9399168491363525 }, { "epoch": 0.7996836068815504, "step": 8088, "train/sim_loss": 0.03125 }, { "epoch": 0.7996836068815504, "step": 8088, "train/total_loss": 0.1252416968345642 }, { "entropy": 8.619998931884766, "epoch": 0.7997824797310659, "mean_token_accuracy": 0.7592592835426331, "num_tokens": 21294710.0, "step": 8089, "train/ce_loss": 0.638979434967041 }, { "epoch": 0.7997824797310659, "step": 8089, "train/sim_loss": 0.06640625 }, { "epoch": 0.7997824797310659, "step": 8089, "train/total_loss": 0.13030418753623962 }, { "entropy": 8.820860862731934, "epoch": 0.7998813525805814, "mean_token_accuracy": 0.7468671798706055, "num_tokens": 21299985.0, "step": 8090, "train/ce_loss": 0.9259432554244995 }, { "epoch": 0.7998813525805814, "step": 8090, "train/sim_loss": 0.02734375 }, { "epoch": 0.7998813525805814, "step": 8090, "train/total_loss": 0.11993807554244995 }, { "entropy": 8.713180541992188, "epoch": 0.799980225430097, "mean_token_accuracy": 0.7084826827049255, "num_tokens": 21305282.0, "step": 8091, "train/ce_loss": 1.0818867683410645 }, { "epoch": 0.799980225430097, "step": 8091, "train/sim_loss": 0.078125 }, { "epoch": 0.799980225430097, "step": 8091, "train/total_loss": 0.1863136887550354 }, { "entropy": 8.716011047363281, "epoch": 0.8000790982796124, "mean_token_accuracy": 0.7217788100242615, "num_tokens": 21310640.0, "step": 8092, "train/ce_loss": 0.6730086803436279 }, { "epoch": 0.8000790982796124, "step": 8092, "train/sim_loss": 0.0625 }, { "epoch": 0.8000790982796124, "step": 8092, "train/total_loss": 0.12980087101459503 }, { "entropy": 8.94961166381836, "epoch": 0.8001779711291279, "mean_token_accuracy": 0.7549669146537781, "num_tokens": 21315713.0, "step": 8093, "train/ce_loss": 1.4025733470916748 }, { "epoch": 0.8001779711291279, "step": 8093, "train/sim_loss": 0.08203125 }, { "epoch": 0.8001779711291279, "step": 8093, "train/total_loss": 0.2222885936498642 }, { "entropy": 8.678363800048828, "epoch": 0.8002768439786435, "mean_token_accuracy": 0.6886574029922485, "num_tokens": 21321056.0, "step": 8094, "train/ce_loss": 1.2181483507156372 }, { "epoch": 0.8002768439786435, "step": 8094, "train/sim_loss": 0.07421875 }, { "epoch": 0.8002768439786435, "step": 8094, "train/total_loss": 0.19603359699249268 }, { "entropy": 8.87493896484375, "epoch": 0.800375716828159, "mean_token_accuracy": 0.7653213739395142, "num_tokens": 21326136.0, "step": 8095, "train/ce_loss": 0.7125213742256165 }, { "epoch": 0.800375716828159, "step": 8095, "train/sim_loss": 0.015625 }, { "epoch": 0.800375716828159, "step": 8095, "train/total_loss": 0.08687713742256165 }, { "entropy": 8.643080711364746, "epoch": 0.8004745896776745, "mean_token_accuracy": 0.7255638837814331, "num_tokens": 21331607.0, "step": 8096, "train/ce_loss": 0.5503685474395752 }, { "epoch": 0.8004745896776745, "step": 8096, "train/sim_loss": 0.03515625 }, { "epoch": 0.8004745896776745, "step": 8096, "train/total_loss": 0.09019310772418976 }, { "entropy": 8.950005531311035, "epoch": 0.8005734625271901, "mean_token_accuracy": 0.7805907130241394, "num_tokens": 21336787.0, "step": 8097, "train/ce_loss": 0.9524298310279846 }, { "epoch": 0.8005734625271901, "step": 8097, "train/sim_loss": 0.0234375 }, { "epoch": 0.8005734625271901, "step": 8097, "train/total_loss": 0.11868048459291458 }, { "entropy": 8.641998291015625, "epoch": 0.8006723353767056, "mean_token_accuracy": 0.7713178396224976, "num_tokens": 21342041.0, "step": 8098, "train/ce_loss": 0.896429181098938 }, { "epoch": 0.8006723353767056, "step": 8098, "train/sim_loss": 0.0625 }, { "epoch": 0.8006723353767056, "step": 8098, "train/total_loss": 0.15214291214942932 }, { "entropy": 9.116812705993652, "epoch": 0.800771208226221, "mean_token_accuracy": 0.7391952276229858, "num_tokens": 21347123.0, "step": 8099, "train/ce_loss": 3.7170384530327283e-06 }, { "epoch": 0.800771208226221, "step": 8099, "train/sim_loss": 0.0234375 }, { "epoch": 0.800771208226221, "step": 8099, "train/total_loss": 0.023437872529029846 }, { "epoch": 0.8008700810757367, "grad_norm": 0.6553953886032104, "learning_rate": 8.00004944864758e-06, "loss": 0.1359, "step": 8100 }, { "entropy": 8.752761840820312, "epoch": 0.8008700810757367, "mean_token_accuracy": 0.7319347262382507, "num_tokens": 21352467.0, "step": 8100, "train/ce_loss": 0.5254290103912354 }, { "epoch": 0.8008700810757367, "step": 8100, "train/sim_loss": 0.015625 }, { "epoch": 0.8008700810757367, "step": 8100, "train/total_loss": 0.06816790252923965 }, { "entropy": 8.795303344726562, "epoch": 0.8009689539252521, "mean_token_accuracy": 0.7713936567306519, "num_tokens": 21357741.0, "step": 8101, "train/ce_loss": 0.5583730936050415 }, { "epoch": 0.8009689539252521, "step": 8101, "train/sim_loss": 0.0703125 }, { "epoch": 0.8009689539252521, "step": 8101, "train/total_loss": 0.12614980340003967 }, { "entropy": 8.594700813293457, "epoch": 0.8010678267747676, "mean_token_accuracy": 0.7375565767288208, "num_tokens": 21363153.0, "step": 8102, "train/ce_loss": 1.0188671350479126 }, { "epoch": 0.8010678267747676, "step": 8102, "train/sim_loss": 0.078125 }, { "epoch": 0.8010678267747676, "step": 8102, "train/total_loss": 0.18001171946525574 }, { "entropy": 8.538590431213379, "epoch": 0.8011666996242832, "mean_token_accuracy": 0.7801578640937805, "num_tokens": 21368469.0, "step": 8103, "train/ce_loss": 0.700299084186554 }, { "epoch": 0.8011666996242832, "step": 8103, "train/sim_loss": 0.04296875 }, { "epoch": 0.8011666996242832, "step": 8103, "train/total_loss": 0.11299865692853928 }, { "entropy": 9.091485977172852, "epoch": 0.8012655724737987, "mean_token_accuracy": 0.7887067198753357, "num_tokens": 21373453.0, "step": 8104, "train/ce_loss": 1.0037702322006226 }, { "epoch": 0.8012655724737987, "step": 8104, "train/sim_loss": 0.07421875 }, { "epoch": 0.8012655724737987, "step": 8104, "train/total_loss": 0.17459577322006226 }, { "entropy": 9.3186674118042, "epoch": 0.8013644453233142, "mean_token_accuracy": 0.78925621509552, "num_tokens": 21378354.0, "step": 8105, "train/ce_loss": 1.93576056517486e-06 }, { "epoch": 0.8013644453233142, "step": 8105, "train/sim_loss": 0.05078125 }, { "epoch": 0.8013644453233142, "step": 8105, "train/total_loss": 0.05078144371509552 }, { "entropy": 8.80603313446045, "epoch": 0.8014633181728298, "mean_token_accuracy": 0.7463087439537048, "num_tokens": 21383590.0, "step": 8106, "train/ce_loss": 1.2183705568313599 }, { "epoch": 0.8014633181728298, "step": 8106, "train/sim_loss": 0.078125 }, { "epoch": 0.8014633181728298, "step": 8106, "train/total_loss": 0.1999620497226715 }, { "entropy": 9.395215034484863, "epoch": 0.8015621910223453, "mean_token_accuracy": 0.7768595218658447, "num_tokens": 21388364.0, "step": 8107, "train/ce_loss": 1.455410361289978 }, { "epoch": 0.8015621910223453, "step": 8107, "train/sim_loss": 0.05078125 }, { "epoch": 0.8015621910223453, "step": 8107, "train/total_loss": 0.19632229208946228 }, { "entropy": 8.796661376953125, "epoch": 0.8016610638718608, "mean_token_accuracy": 0.7198124527931213, "num_tokens": 21393700.0, "step": 8108, "train/ce_loss": 0.7744140625 }, { "epoch": 0.8016610638718608, "step": 8108, "train/sim_loss": 0.09375 }, { "epoch": 0.8016610638718608, "step": 8108, "train/total_loss": 0.17119140923023224 }, { "entropy": 8.673404693603516, "epoch": 0.8017599367213764, "mean_token_accuracy": 0.7109470963478088, "num_tokens": 21398962.0, "step": 8109, "train/ce_loss": 1.0927854776382446 }, { "epoch": 0.8017599367213764, "step": 8109, "train/sim_loss": 0.0625 }, { "epoch": 0.8017599367213764, "step": 8109, "train/total_loss": 0.17177855968475342 }, { "entropy": 8.964365005493164, "epoch": 0.8018588095708918, "mean_token_accuracy": 0.7806748747825623, "num_tokens": 21404080.0, "step": 8110, "train/ce_loss": 3.7938925743219443e-06 }, { "epoch": 0.8018588095708918, "step": 8110, "train/sim_loss": 0.04296875 }, { "epoch": 0.8018588095708918, "step": 8110, "train/total_loss": 0.04296912997961044 }, { "entropy": 8.62191104888916, "epoch": 0.8019576824204073, "mean_token_accuracy": 0.7578288316726685, "num_tokens": 21409560.0, "step": 8111, "train/ce_loss": 0.6696912050247192 }, { "epoch": 0.8019576824204073, "step": 8111, "train/sim_loss": 0.01953125 }, { "epoch": 0.8019576824204073, "step": 8111, "train/total_loss": 0.0865003690123558 }, { "entropy": 8.387670516967773, "epoch": 0.8020565552699229, "mean_token_accuracy": 0.7623188495635986, "num_tokens": 21415084.0, "step": 8112, "train/ce_loss": 0.8153111338615417 }, { "epoch": 0.8020565552699229, "step": 8112, "train/sim_loss": 0.0546875 }, { "epoch": 0.8020565552699229, "step": 8112, "train/total_loss": 0.1362186074256897 }, { "entropy": 8.499277114868164, "epoch": 0.8021554281194384, "mean_token_accuracy": 0.7409224510192871, "num_tokens": 21420575.0, "step": 8113, "train/ce_loss": 0.9124782085418701 }, { "epoch": 0.8021554281194384, "step": 8113, "train/sim_loss": 0.07421875 }, { "epoch": 0.8021554281194384, "step": 8113, "train/total_loss": 0.1654665768146515 }, { "entropy": 8.523259162902832, "epoch": 0.8022543009689539, "mean_token_accuracy": 0.7751396894454956, "num_tokens": 21425784.0, "step": 8114, "train/ce_loss": 1.2068164348602295 }, { "epoch": 0.8022543009689539, "step": 8114, "train/sim_loss": 0.02734375 }, { "epoch": 0.8022543009689539, "step": 8114, "train/total_loss": 0.14802539348602295 }, { "entropy": 8.784333229064941, "epoch": 0.8023531738184695, "mean_token_accuracy": 0.7310252785682678, "num_tokens": 21430948.0, "step": 8115, "train/ce_loss": 0.9043253064155579 }, { "epoch": 0.8023531738184695, "step": 8115, "train/sim_loss": 0.0546875 }, { "epoch": 0.8023531738184695, "step": 8115, "train/total_loss": 0.1451200246810913 }, { "entropy": 9.411741256713867, "epoch": 0.802452046667985, "mean_token_accuracy": 0.7320099472999573, "num_tokens": 21435781.0, "step": 8116, "train/ce_loss": 1.9460113048553467 }, { "epoch": 0.802452046667985, "step": 8116, "train/sim_loss": 0.11328125 }, { "epoch": 0.802452046667985, "step": 8116, "train/total_loss": 0.3078823685646057 }, { "entropy": 8.769495010375977, "epoch": 0.8025509195175005, "mean_token_accuracy": 0.7507836818695068, "num_tokens": 21440880.0, "step": 8117, "train/ce_loss": 0.932226300239563 }, { "epoch": 0.8025509195175005, "step": 8117, "train/sim_loss": 0.0234375 }, { "epoch": 0.8025509195175005, "step": 8117, "train/total_loss": 0.11666013300418854 }, { "entropy": 9.231952667236328, "epoch": 0.8026497923670161, "mean_token_accuracy": 0.7342550158500671, "num_tokens": 21446017.0, "step": 8118, "train/ce_loss": 1.7037280797958374 }, { "epoch": 0.8026497923670161, "step": 8118, "train/sim_loss": 0.09375 }, { "epoch": 0.8026497923670161, "step": 8118, "train/total_loss": 0.2641228139400482 }, { "entropy": 8.689058303833008, "epoch": 0.8027486652165315, "mean_token_accuracy": 0.7462499737739563, "num_tokens": 21451367.0, "step": 8119, "train/ce_loss": 0.5837413668632507 }, { "epoch": 0.8027486652165315, "step": 8119, "train/sim_loss": 0.05078125 }, { "epoch": 0.8027486652165315, "step": 8119, "train/total_loss": 0.10915538668632507 }, { "epoch": 0.802847538066047, "grad_norm": 0.6518300175666809, "learning_rate": 7.995104583889631e-06, "loss": 0.1314, "step": 8120 }, { "entropy": 9.131936073303223, "epoch": 0.802847538066047, "mean_token_accuracy": 0.8208954930305481, "num_tokens": 21456214.0, "step": 8120, "train/ce_loss": 4.40249687017058e-06 }, { "epoch": 0.802847538066047, "step": 8120, "train/sim_loss": 0.05859375 }, { "epoch": 0.802847538066047, "step": 8120, "train/total_loss": 0.05859418958425522 }, { "entropy": 9.137779235839844, "epoch": 0.8029464109155626, "mean_token_accuracy": 0.765925943851471, "num_tokens": 21461376.0, "step": 8121, "train/ce_loss": 0.7983360290527344 }, { "epoch": 0.8029464109155626, "step": 8121, "train/sim_loss": 0.046875 }, { "epoch": 0.8029464109155626, "step": 8121, "train/total_loss": 0.12670859694480896 }, { "entropy": 8.53607177734375, "epoch": 0.8030452837650781, "mean_token_accuracy": 0.7017017006874084, "num_tokens": 21466842.0, "step": 8122, "train/ce_loss": 0.70408034324646 }, { "epoch": 0.8030452837650781, "step": 8122, "train/sim_loss": 0.01171875 }, { "epoch": 0.8030452837650781, "step": 8122, "train/total_loss": 0.08212678879499435 }, { "entropy": 8.714844703674316, "epoch": 0.8031441566145936, "mean_token_accuracy": 0.7058252692222595, "num_tokens": 21472370.0, "step": 8123, "train/ce_loss": 1.137970209121704 }, { "epoch": 0.8031441566145936, "step": 8123, "train/sim_loss": 0.0234375 }, { "epoch": 0.8031441566145936, "step": 8123, "train/total_loss": 0.13723452389240265 }, { "entropy": 8.661134719848633, "epoch": 0.8032430294641092, "mean_token_accuracy": 0.756440281867981, "num_tokens": 21477721.0, "step": 8124, "train/ce_loss": 0.646306037902832 }, { "epoch": 0.8032430294641092, "step": 8124, "train/sim_loss": 0.0234375 }, { "epoch": 0.8032430294641092, "step": 8124, "train/total_loss": 0.08806810528039932 }, { "entropy": 9.107043266296387, "epoch": 0.8033419023136247, "mean_token_accuracy": 0.7185840606689453, "num_tokens": 21482741.0, "step": 8125, "train/ce_loss": 1.4271183013916016 }, { "epoch": 0.8033419023136247, "step": 8125, "train/sim_loss": 0.0390625 }, { "epoch": 0.8033419023136247, "step": 8125, "train/total_loss": 0.1817743331193924 }, { "entropy": 9.141106605529785, "epoch": 0.8034407751631402, "mean_token_accuracy": 0.687609076499939, "num_tokens": 21487739.0, "step": 8126, "train/ce_loss": 1.2258257865905762 }, { "epoch": 0.8034407751631402, "step": 8126, "train/sim_loss": 0.01953125 }, { "epoch": 0.8034407751631402, "step": 8126, "train/total_loss": 0.1421138346195221 }, { "entropy": 8.606668472290039, "epoch": 0.8035396480126558, "mean_token_accuracy": 0.7492130398750305, "num_tokens": 21493147.0, "step": 8127, "train/ce_loss": 0.83812415599823 }, { "epoch": 0.8035396480126558, "step": 8127, "train/sim_loss": 0.140625 }, { "epoch": 0.8035396480126558, "step": 8127, "train/total_loss": 0.224437415599823 }, { "entropy": 9.192540168762207, "epoch": 0.8036385208621712, "mean_token_accuracy": 0.7893129587173462, "num_tokens": 21498259.0, "step": 8128, "train/ce_loss": 2.97106703328609e-06 }, { "epoch": 0.8036385208621712, "step": 8128, "train/sim_loss": 0.015625 }, { "epoch": 0.8036385208621712, "step": 8128, "train/total_loss": 0.015625298023223877 }, { "entropy": 8.542991638183594, "epoch": 0.8037373937116867, "mean_token_accuracy": 0.7178378105163574, "num_tokens": 21503648.0, "step": 8129, "train/ce_loss": 0.6392026543617249 }, { "epoch": 0.8037373937116867, "step": 8129, "train/sim_loss": 0.0390625 }, { "epoch": 0.8037373937116867, "step": 8129, "train/total_loss": 0.1029827669262886 }, { "entropy": 8.937554359436035, "epoch": 0.8038362665612023, "mean_token_accuracy": 0.7507507801055908, "num_tokens": 21508919.0, "step": 8130, "train/ce_loss": 4.179452662356198e-05 }, { "epoch": 0.8038362665612023, "step": 8130, "train/sim_loss": 0.03515625 }, { "epoch": 0.8038362665612023, "step": 8130, "train/total_loss": 0.035160429775714874 }, { "entropy": 8.675518989562988, "epoch": 0.8039351394107178, "mean_token_accuracy": 0.6987804770469666, "num_tokens": 21514166.0, "step": 8131, "train/ce_loss": 0.961373507976532 }, { "epoch": 0.8039351394107178, "step": 8131, "train/sim_loss": 0.0625 }, { "epoch": 0.8039351394107178, "step": 8131, "train/total_loss": 0.15863734483718872 }, { "entropy": 9.307016372680664, "epoch": 0.8040340122602333, "mean_token_accuracy": 0.7984790802001953, "num_tokens": 21519108.0, "step": 8132, "train/ce_loss": 0.9115478992462158 }, { "epoch": 0.8040340122602333, "step": 8132, "train/sim_loss": 0.0234375 }, { "epoch": 0.8040340122602333, "step": 8132, "train/total_loss": 0.1145922914147377 }, { "entropy": 8.928248405456543, "epoch": 0.8041328851097489, "mean_token_accuracy": 0.7286184430122375, "num_tokens": 21524159.0, "step": 8133, "train/ce_loss": 0.7917159199714661 }, { "epoch": 0.8041328851097489, "step": 8133, "train/sim_loss": 0.0390625 }, { "epoch": 0.8041328851097489, "step": 8133, "train/total_loss": 0.11823409050703049 }, { "entropy": 8.766800880432129, "epoch": 0.8042317579592644, "mean_token_accuracy": 0.7781732082366943, "num_tokens": 21529497.0, "step": 8134, "train/ce_loss": 1.123258113861084 }, { "epoch": 0.8042317579592644, "step": 8134, "train/sim_loss": 0.04296875 }, { "epoch": 0.8042317579592644, "step": 8134, "train/total_loss": 0.15529456734657288 }, { "entropy": 8.762529373168945, "epoch": 0.8043306308087799, "mean_token_accuracy": 0.7233333587646484, "num_tokens": 21534555.0, "step": 8135, "train/ce_loss": 1.5933242138999049e-06 }, { "epoch": 0.8043306308087799, "step": 8135, "train/sim_loss": 0.0703125 }, { "epoch": 0.8043306308087799, "step": 8135, "train/total_loss": 0.07031265646219254 }, { "entropy": 9.216514587402344, "epoch": 0.8044295036582955, "mean_token_accuracy": 0.7174721360206604, "num_tokens": 21539554.0, "step": 8136, "train/ce_loss": 4.063111191499047e-06 }, { "epoch": 0.8044295036582955, "step": 8136, "train/sim_loss": 0.05859375 }, { "epoch": 0.8044295036582955, "step": 8136, "train/total_loss": 0.05859415605664253 }, { "entropy": 9.50296401977539, "epoch": 0.804528376507811, "mean_token_accuracy": 0.7253668904304504, "num_tokens": 21544465.0, "step": 8137, "train/ce_loss": 2.0438973903656006 }, { "epoch": 0.804528376507811, "step": 8137, "train/sim_loss": 0.046875 }, { "epoch": 0.804528376507811, "step": 8137, "train/total_loss": 0.251264750957489 }, { "entropy": 9.341086387634277, "epoch": 0.8046272493573264, "mean_token_accuracy": 0.7492625117301941, "num_tokens": 21549378.0, "step": 8138, "train/ce_loss": 1.6934353113174438 }, { "epoch": 0.8046272493573264, "step": 8138, "train/sim_loss": 0.08203125 }, { "epoch": 0.8046272493573264, "step": 8138, "train/total_loss": 0.2513747811317444 }, { "entropy": 8.868955612182617, "epoch": 0.804726122206842, "mean_token_accuracy": 0.6798365116119385, "num_tokens": 21554554.0, "step": 8139, "train/ce_loss": 1.5643260478973389 }, { "epoch": 0.804726122206842, "step": 8139, "train/sim_loss": 0.05859375 }, { "epoch": 0.804726122206842, "step": 8139, "train/total_loss": 0.2150263637304306 }, { "epoch": 0.8048249950563575, "grad_norm": 0.7821770906448364, "learning_rate": 7.990159719131683e-06, "loss": 0.1326, "step": 8140 }, { "entropy": 8.8527250289917, "epoch": 0.8048249950563575, "mean_token_accuracy": 0.7285902500152588, "num_tokens": 21559769.0, "step": 8140, "train/ce_loss": 1.3271492719650269 }, { "epoch": 0.8048249950563575, "step": 8140, "train/sim_loss": 0.02734375 }, { "epoch": 0.8048249950563575, "step": 8140, "train/total_loss": 0.16005867719650269 }, { "entropy": 9.02514362335205, "epoch": 0.804923867905873, "mean_token_accuracy": 0.7098214030265808, "num_tokens": 21564937.0, "step": 8141, "train/ce_loss": 1.5891838073730469 }, { "epoch": 0.804923867905873, "step": 8141, "train/sim_loss": 0.07421875 }, { "epoch": 0.804923867905873, "step": 8141, "train/total_loss": 0.2331371307373047 }, { "entropy": 8.961231231689453, "epoch": 0.8050227407553886, "mean_token_accuracy": 0.7310230731964111, "num_tokens": 21569969.0, "step": 8142, "train/ce_loss": 0.8391844034194946 }, { "epoch": 0.8050227407553886, "step": 8142, "train/sim_loss": 0.06640625 }, { "epoch": 0.8050227407553886, "step": 8142, "train/total_loss": 0.15032470226287842 }, { "entropy": 8.839179992675781, "epoch": 0.8051216136049041, "mean_token_accuracy": 0.7161997556686401, "num_tokens": 21575231.0, "step": 8143, "train/ce_loss": 1.0139042139053345 }, { "epoch": 0.8051216136049041, "step": 8143, "train/sim_loss": 0.07421875 }, { "epoch": 0.8051216136049041, "step": 8143, "train/total_loss": 0.17560917139053345 }, { "entropy": 9.141321182250977, "epoch": 0.8052204864544196, "mean_token_accuracy": 0.7846715450286865, "num_tokens": 21580195.0, "step": 8144, "train/ce_loss": 0.9555160403251648 }, { "epoch": 0.8052204864544196, "step": 8144, "train/sim_loss": 0.03125 }, { "epoch": 0.8052204864544196, "step": 8144, "train/total_loss": 0.12680160999298096 }, { "entropy": 8.899325370788574, "epoch": 0.8053193593039352, "mean_token_accuracy": 0.7378129363059998, "num_tokens": 21585434.0, "step": 8145, "train/ce_loss": 1.0690226554870605 }, { "epoch": 0.8053193593039352, "step": 8145, "train/sim_loss": 0.10546875 }, { "epoch": 0.8053193593039352, "step": 8145, "train/total_loss": 0.21237102150917053 }, { "entropy": 8.712418556213379, "epoch": 0.8054182321534507, "mean_token_accuracy": 0.8041236996650696, "num_tokens": 21590656.0, "step": 8146, "train/ce_loss": 0.6086009740829468 }, { "epoch": 0.8054182321534507, "step": 8146, "train/sim_loss": 0.015625 }, { "epoch": 0.8054182321534507, "step": 8146, "train/total_loss": 0.07648509740829468 }, { "entropy": 9.335983276367188, "epoch": 0.8055171050029661, "mean_token_accuracy": 0.7619834542274475, "num_tokens": 21595696.0, "step": 8147, "train/ce_loss": 0.8511669039726257 }, { "epoch": 0.8055171050029661, "step": 8147, "train/sim_loss": 0.078125 }, { "epoch": 0.8055171050029661, "step": 8147, "train/total_loss": 0.1632416844367981 }, { "entropy": 8.933341026306152, "epoch": 0.8056159778524817, "mean_token_accuracy": 0.7549574971199036, "num_tokens": 21600821.0, "step": 8148, "train/ce_loss": 1.0629116296768188 }, { "epoch": 0.8056159778524817, "step": 8148, "train/sim_loss": 0.03515625 }, { "epoch": 0.8056159778524817, "step": 8148, "train/total_loss": 0.14144742488861084 }, { "entropy": 8.205276489257812, "epoch": 0.8057148507019972, "mean_token_accuracy": 0.7639310956001282, "num_tokens": 21606265.0, "step": 8149, "train/ce_loss": 1.0072635412216187 }, { "epoch": 0.8057148507019972, "step": 8149, "train/sim_loss": 0.1015625 }, { "epoch": 0.8057148507019972, "step": 8149, "train/total_loss": 0.20228886604309082 }, { "entropy": 9.068489074707031, "epoch": 0.8058137235515127, "mean_token_accuracy": 0.707025408744812, "num_tokens": 21611427.0, "step": 8150, "train/ce_loss": 4.91906757815741e-06 }, { "epoch": 0.8058137235515127, "step": 8150, "train/sim_loss": 0.046875 }, { "epoch": 0.8058137235515127, "step": 8150, "train/total_loss": 0.0468754917383194 }, { "entropy": 8.444740295410156, "epoch": 0.8059125964010283, "mean_token_accuracy": 0.7334058880805969, "num_tokens": 21616869.0, "step": 8151, "train/ce_loss": 1.6713114976882935 }, { "epoch": 0.8059125964010283, "step": 8151, "train/sim_loss": 0.0703125 }, { "epoch": 0.8059125964010283, "step": 8151, "train/total_loss": 0.23744365572929382 }, { "entropy": 9.057316780090332, "epoch": 0.8060114692505438, "mean_token_accuracy": 0.7879341840744019, "num_tokens": 21621889.0, "step": 8152, "train/ce_loss": 4.6114215024317673e-07 }, { "epoch": 0.8060114692505438, "step": 8152, "train/sim_loss": 0.0390625 }, { "epoch": 0.8060114692505438, "step": 8152, "train/total_loss": 0.03906254470348358 }, { "entropy": 8.672420501708984, "epoch": 0.8061103421000593, "mean_token_accuracy": 0.800936758518219, "num_tokens": 21627189.0, "step": 8153, "train/ce_loss": 0.46436673402786255 }, { "epoch": 0.8061103421000593, "step": 8153, "train/sim_loss": 0.046875 }, { "epoch": 0.8061103421000593, "step": 8153, "train/total_loss": 0.09331167489290237 }, { "entropy": 8.96420669555664, "epoch": 0.8062092149495749, "mean_token_accuracy": 0.7317743897438049, "num_tokens": 21632434.0, "step": 8154, "train/ce_loss": 1.1457781791687012 }, { "epoch": 0.8062092149495749, "step": 8154, "train/sim_loss": 0.07421875 }, { "epoch": 0.8062092149495749, "step": 8154, "train/total_loss": 0.18879657983779907 }, { "entropy": 8.43289566040039, "epoch": 0.8063080877990904, "mean_token_accuracy": 0.6918798685073853, "num_tokens": 21637818.0, "step": 8155, "train/ce_loss": 0.8233134746551514 }, { "epoch": 0.8063080877990904, "step": 8155, "train/sim_loss": 0.046875 }, { "epoch": 0.8063080877990904, "step": 8155, "train/total_loss": 0.1292063593864441 }, { "entropy": 9.024656295776367, "epoch": 0.8064069606486058, "mean_token_accuracy": 0.7409162521362305, "num_tokens": 21642914.0, "step": 8156, "train/ce_loss": 0.43868541717529297 }, { "epoch": 0.8064069606486058, "step": 8156, "train/sim_loss": 0.03125 }, { "epoch": 0.8064069606486058, "step": 8156, "train/total_loss": 0.0751185417175293 }, { "entropy": 8.357614517211914, "epoch": 0.8065058334981214, "mean_token_accuracy": 0.6878452897071838, "num_tokens": 21648492.0, "step": 8157, "train/ce_loss": 1.4950826168060303 }, { "epoch": 0.8065058334981214, "step": 8157, "train/sim_loss": 0.03125 }, { "epoch": 0.8065058334981214, "step": 8157, "train/total_loss": 0.1807582676410675 }, { "entropy": 8.726078033447266, "epoch": 0.8066047063476369, "mean_token_accuracy": 0.7450110912322998, "num_tokens": 21653859.0, "step": 8158, "train/ce_loss": 0.613193154335022 }, { "epoch": 0.8066047063476369, "step": 8158, "train/sim_loss": 0.03125 }, { "epoch": 0.8066047063476369, "step": 8158, "train/total_loss": 0.09256932139396667 }, { "entropy": 9.057610511779785, "epoch": 0.8067035791971524, "mean_token_accuracy": 0.773181140422821, "num_tokens": 21659023.0, "step": 8159, "train/ce_loss": 1.34531831741333 }, { "epoch": 0.8067035791971524, "step": 8159, "train/sim_loss": 0.02734375 }, { "epoch": 0.8067035791971524, "step": 8159, "train/total_loss": 0.16187559068202972 }, { "epoch": 0.806802452046668, "grad_norm": 0.655254602432251, "learning_rate": 7.985214854373734e-06, "loss": 0.1312, "step": 8160 }, { "entropy": 8.64984130859375, "epoch": 0.806802452046668, "mean_token_accuracy": 0.7977805137634277, "num_tokens": 21664315.0, "step": 8160, "train/ce_loss": 0.7628602981567383 }, { "epoch": 0.806802452046668, "step": 8160, "train/sim_loss": 0.0234375 }, { "epoch": 0.806802452046668, "step": 8160, "train/total_loss": 0.09972353279590607 }, { "entropy": 9.269373893737793, "epoch": 0.8069013248961835, "mean_token_accuracy": 0.67405766248703, "num_tokens": 21669226.0, "step": 8161, "train/ce_loss": 1.1899515390396118 }, { "epoch": 0.8069013248961835, "step": 8161, "train/sim_loss": 0.01171875 }, { "epoch": 0.8069013248961835, "step": 8161, "train/total_loss": 0.13071390986442566 }, { "entropy": 9.361366271972656, "epoch": 0.807000197745699, "mean_token_accuracy": 0.8279069662094116, "num_tokens": 21674053.0, "step": 8162, "train/ce_loss": 1.273462176322937 }, { "epoch": 0.807000197745699, "step": 8162, "train/sim_loss": 0.03125 }, { "epoch": 0.807000197745699, "step": 8162, "train/total_loss": 0.1585962176322937 }, { "entropy": 8.576358795166016, "epoch": 0.8070990705952146, "mean_token_accuracy": 0.7305315136909485, "num_tokens": 21679304.0, "step": 8163, "train/ce_loss": 1.026901364326477 }, { "epoch": 0.8070990705952146, "step": 8163, "train/sim_loss": 0.04296875 }, { "epoch": 0.8070990705952146, "step": 8163, "train/total_loss": 0.14565888047218323 }, { "entropy": 8.602506637573242, "epoch": 0.8071979434447301, "mean_token_accuracy": 0.7146092653274536, "num_tokens": 21684674.0, "step": 8164, "train/ce_loss": 0.6335917115211487 }, { "epoch": 0.8071979434447301, "step": 8164, "train/sim_loss": 0.0625 }, { "epoch": 0.8071979434447301, "step": 8164, "train/total_loss": 0.12585917115211487 }, { "entropy": 9.482227325439453, "epoch": 0.8072968162942455, "mean_token_accuracy": 0.7589454054832458, "num_tokens": 21689605.0, "step": 8165, "train/ce_loss": 6.770823119950364e-07 }, { "epoch": 0.8072968162942455, "step": 8165, "train/sim_loss": 0.01953125 }, { "epoch": 0.8072968162942455, "step": 8165, "train/total_loss": 0.019531317055225372 }, { "entropy": 8.98696231842041, "epoch": 0.8073956891437611, "mean_token_accuracy": 0.7142857313156128, "num_tokens": 21694766.0, "step": 8166, "train/ce_loss": 0.6035987138748169 }, { "epoch": 0.8073956891437611, "step": 8166, "train/sim_loss": 0.04296875 }, { "epoch": 0.8073956891437611, "step": 8166, "train/total_loss": 0.10332862287759781 }, { "entropy": 8.953849792480469, "epoch": 0.8074945619932766, "mean_token_accuracy": 0.717391312122345, "num_tokens": 21699946.0, "step": 8167, "train/ce_loss": 1.9634870290756226 }, { "epoch": 0.8074945619932766, "step": 8167, "train/sim_loss": 0.03125 }, { "epoch": 0.8074945619932766, "step": 8167, "train/total_loss": 0.22759871184825897 }, { "entropy": 8.882444381713867, "epoch": 0.8075934348427922, "mean_token_accuracy": 0.7415730357170105, "num_tokens": 21704924.0, "step": 8168, "train/ce_loss": 0.8021706342697144 }, { "epoch": 0.8075934348427922, "step": 8168, "train/sim_loss": 0.05078125 }, { "epoch": 0.8075934348427922, "step": 8168, "train/total_loss": 0.13099831342697144 }, { "entropy": 8.663026809692383, "epoch": 0.8076923076923077, "mean_token_accuracy": 0.802879273891449, "num_tokens": 21710294.0, "step": 8169, "train/ce_loss": 0.42362141609191895 }, { "epoch": 0.8076923076923077, "step": 8169, "train/sim_loss": 0.015625 }, { "epoch": 0.8076923076923077, "step": 8169, "train/total_loss": 0.057987142354249954 }, { "entropy": 9.483896255493164, "epoch": 0.8077911805418232, "mean_token_accuracy": 0.7542856931686401, "num_tokens": 21715068.0, "step": 8170, "train/ce_loss": 5.225174845691072e-06 }, { "epoch": 0.8077911805418232, "step": 8170, "train/sim_loss": 0.05859375 }, { "epoch": 0.8077911805418232, "step": 8170, "train/total_loss": 0.058594271540641785 }, { "entropy": 8.993762969970703, "epoch": 0.8078900533913388, "mean_token_accuracy": 0.6810228824615479, "num_tokens": 21720460.0, "step": 8171, "train/ce_loss": 1.189120894196094e-06 }, { "epoch": 0.8078900533913388, "step": 8171, "train/sim_loss": 0.015625 }, { "epoch": 0.8078900533913388, "step": 8171, "train/total_loss": 0.01562511920928955 }, { "entropy": 8.781641006469727, "epoch": 0.8079889262408543, "mean_token_accuracy": 0.7355769276618958, "num_tokens": 21725690.0, "step": 8172, "train/ce_loss": 0.5689041614532471 }, { "epoch": 0.8079889262408543, "step": 8172, "train/sim_loss": 0.0546875 }, { "epoch": 0.8079889262408543, "step": 8172, "train/total_loss": 0.11157791316509247 }, { "entropy": 9.35464859008789, "epoch": 0.8080877990903698, "mean_token_accuracy": 0.8356807231903076, "num_tokens": 21730563.0, "step": 8173, "train/ce_loss": 1.0675036907196045 }, { "epoch": 0.8080877990903698, "step": 8173, "train/sim_loss": 0.03515625 }, { "epoch": 0.8080877990903698, "step": 8173, "train/total_loss": 0.14190661907196045 }, { "entropy": 8.79731273651123, "epoch": 0.8081866719398854, "mean_token_accuracy": 0.6955530047416687, "num_tokens": 21735924.0, "step": 8174, "train/ce_loss": 0.9024097919464111 }, { "epoch": 0.8081866719398854, "step": 8174, "train/sim_loss": 0.08203125 }, { "epoch": 0.8081866719398854, "step": 8174, "train/total_loss": 0.1722722351551056 }, { "entropy": 8.88913631439209, "epoch": 0.8082855447894008, "mean_token_accuracy": 0.7214191555976868, "num_tokens": 21741107.0, "step": 8175, "train/ce_loss": 0.5039877891540527 }, { "epoch": 0.8082855447894008, "step": 8175, "train/sim_loss": 0.0703125 }, { "epoch": 0.8082855447894008, "step": 8175, "train/total_loss": 0.12071128189563751 }, { "entropy": 8.999834060668945, "epoch": 0.8083844176389163, "mean_token_accuracy": 0.7393162250518799, "num_tokens": 21746213.0, "step": 8176, "train/ce_loss": 1.1778316497802734 }, { "epoch": 0.8083844176389163, "step": 8176, "train/sim_loss": 0.06640625 }, { "epoch": 0.8083844176389163, "step": 8176, "train/total_loss": 0.18418940901756287 }, { "entropy": 9.247943878173828, "epoch": 0.8084832904884319, "mean_token_accuracy": 0.6855670213699341, "num_tokens": 21751221.0, "step": 8177, "train/ce_loss": 1.1590492725372314 }, { "epoch": 0.8084832904884319, "step": 8177, "train/sim_loss": 0.046875 }, { "epoch": 0.8084832904884319, "step": 8177, "train/total_loss": 0.16277992725372314 }, { "entropy": 8.619396209716797, "epoch": 0.8085821633379474, "mean_token_accuracy": 0.7241014838218689, "num_tokens": 21756648.0, "step": 8178, "train/ce_loss": 2.023618221282959 }, { "epoch": 0.8085821633379474, "step": 8178, "train/sim_loss": 0.15625 }, { "epoch": 0.8085821633379474, "step": 8178, "train/total_loss": 0.3586118221282959 }, { "entropy": 8.900561332702637, "epoch": 0.8086810361874629, "mean_token_accuracy": 0.7817638516426086, "num_tokens": 21761811.0, "step": 8179, "train/ce_loss": 0.7690855860710144 }, { "epoch": 0.8086810361874629, "step": 8179, "train/sim_loss": 0.0390625 }, { "epoch": 0.8086810361874629, "step": 8179, "train/total_loss": 0.11597105860710144 }, { "epoch": 0.8087799090369785, "grad_norm": 0.5436288118362427, "learning_rate": 7.980269989615784e-06, "loss": 0.1367, "step": 8180 }, { "entropy": 8.950399398803711, "epoch": 0.8087799090369785, "mean_token_accuracy": 0.7471979856491089, "num_tokens": 21767080.0, "step": 8180, "train/ce_loss": 1.2610423564910889 }, { "epoch": 0.8087799090369785, "step": 8180, "train/sim_loss": 0.06640625 }, { "epoch": 0.8087799090369785, "step": 8180, "train/total_loss": 0.1925104856491089 }, { "entropy": 8.72748851776123, "epoch": 0.808878781886494, "mean_token_accuracy": 0.7057444453239441, "num_tokens": 21772390.0, "step": 8181, "train/ce_loss": 1.424280047416687 }, { "epoch": 0.808878781886494, "step": 8181, "train/sim_loss": 0.109375 }, { "epoch": 0.808878781886494, "step": 8181, "train/total_loss": 0.2518030107021332 }, { "entropy": 9.251352310180664, "epoch": 0.8089776547360095, "mean_token_accuracy": 0.7716049551963806, "num_tokens": 21777306.0, "step": 8182, "train/ce_loss": 3.123627266177209e-06 }, { "epoch": 0.8089776547360095, "step": 8182, "train/sim_loss": 0.04296875 }, { "epoch": 0.8089776547360095, "step": 8182, "train/total_loss": 0.04296906292438507 }, { "entropy": 9.193682670593262, "epoch": 0.8090765275855251, "mean_token_accuracy": 0.7491638660430908, "num_tokens": 21782342.0, "step": 8183, "train/ce_loss": 2.006776809692383 }, { "epoch": 0.8090765275855251, "step": 8183, "train/sim_loss": 0.08203125 }, { "epoch": 0.8090765275855251, "step": 8183, "train/total_loss": 0.28270894289016724 }, { "entropy": 8.846532821655273, "epoch": 0.8091754004350405, "mean_token_accuracy": 0.7214533090591431, "num_tokens": 21787379.0, "step": 8184, "train/ce_loss": 8.488844400744711e-07 }, { "epoch": 0.8091754004350405, "step": 8184, "train/sim_loss": 0.0625 }, { "epoch": 0.8091754004350405, "step": 8184, "train/total_loss": 0.06250008195638657 }, { "entropy": 9.142698287963867, "epoch": 0.809274273284556, "mean_token_accuracy": 0.6946688294410706, "num_tokens": 21792454.0, "step": 8185, "train/ce_loss": 4.0535462176194414e-05 }, { "epoch": 0.809274273284556, "step": 8185, "train/sim_loss": 0.03515625 }, { "epoch": 0.809274273284556, "step": 8185, "train/total_loss": 0.03516030311584473 }, { "entropy": 9.179564476013184, "epoch": 0.8093731461340716, "mean_token_accuracy": 0.7639639377593994, "num_tokens": 21797410.0, "step": 8186, "train/ce_loss": 6.890414852023241e-07 }, { "epoch": 0.8093731461340716, "step": 8186, "train/sim_loss": 0.04296875 }, { "epoch": 0.8093731461340716, "step": 8186, "train/total_loss": 0.04296881705522537 }, { "entropy": 9.330995559692383, "epoch": 0.8094720189835871, "mean_token_accuracy": 0.6957928538322449, "num_tokens": 21802469.0, "step": 8187, "train/ce_loss": 1.5272752046585083 }, { "epoch": 0.8094720189835871, "step": 8187, "train/sim_loss": 0.0625 }, { "epoch": 0.8094720189835871, "step": 8187, "train/total_loss": 0.21522752940654755 }, { "entropy": 9.198179244995117, "epoch": 0.8095708918331026, "mean_token_accuracy": 0.7378151416778564, "num_tokens": 21807485.0, "step": 8188, "train/ce_loss": 1.382462739944458 }, { "epoch": 0.8095708918331026, "step": 8188, "train/sim_loss": 0.05859375 }, { "epoch": 0.8095708918331026, "step": 8188, "train/total_loss": 0.19684003293514252 }, { "entropy": 8.477127075195312, "epoch": 0.8096697646826182, "mean_token_accuracy": 0.7734375, "num_tokens": 21812875.0, "step": 8189, "train/ce_loss": 0.6133180260658264 }, { "epoch": 0.8096697646826182, "step": 8189, "train/sim_loss": 0.01171875 }, { "epoch": 0.8096697646826182, "step": 8189, "train/total_loss": 0.07305055856704712 }, { "entropy": 8.476020812988281, "epoch": 0.8097686375321337, "mean_token_accuracy": 0.7351290583610535, "num_tokens": 21818262.0, "step": 8190, "train/ce_loss": 0.6485268473625183 }, { "epoch": 0.8097686375321337, "step": 8190, "train/sim_loss": 0.04296875 }, { "epoch": 0.8097686375321337, "step": 8190, "train/total_loss": 0.10782143473625183 }, { "entropy": 8.579954147338867, "epoch": 0.8098675103816492, "mean_token_accuracy": 0.7418032884597778, "num_tokens": 21823665.0, "step": 8191, "train/ce_loss": 0.7341662645339966 }, { "epoch": 0.8098675103816492, "step": 8191, "train/sim_loss": 0.046875 }, { "epoch": 0.8098675103816492, "step": 8191, "train/total_loss": 0.12029162794351578 }, { "entropy": 9.22266674041748, "epoch": 0.8099663832311648, "mean_token_accuracy": 0.7193763852119446, "num_tokens": 21828619.0, "step": 8192, "train/ce_loss": 2.1391375064849854 }, { "epoch": 0.8099663832311648, "step": 8192, "train/sim_loss": 0.06640625 }, { "epoch": 0.8099663832311648, "step": 8192, "train/total_loss": 0.2803199887275696 }, { "entropy": 8.551570892333984, "epoch": 0.8100652560806803, "mean_token_accuracy": 0.7967479825019836, "num_tokens": 21834084.0, "step": 8193, "train/ce_loss": 0.7586435675621033 }, { "epoch": 0.8100652560806803, "step": 8193, "train/sim_loss": 0.02734375 }, { "epoch": 0.8100652560806803, "step": 8193, "train/total_loss": 0.10320810973644257 }, { "entropy": 9.315771102905273, "epoch": 0.8101641289301957, "mean_token_accuracy": 0.7569573521614075, "num_tokens": 21839027.0, "step": 8194, "train/ce_loss": 1.9297350645065308 }, { "epoch": 0.8101641289301957, "step": 8194, "train/sim_loss": 0.0546875 }, { "epoch": 0.8101641289301957, "step": 8194, "train/total_loss": 0.24766100943088531 }, { "entropy": 8.790742874145508, "epoch": 0.8102630017797113, "mean_token_accuracy": 0.7670454382896423, "num_tokens": 21844536.0, "step": 8195, "train/ce_loss": 0.6125141382217407 }, { "epoch": 0.8102630017797113, "step": 8195, "train/sim_loss": 0.07421875 }, { "epoch": 0.8102630017797113, "step": 8195, "train/total_loss": 0.1354701668024063 }, { "entropy": 9.010497093200684, "epoch": 0.8103618746292268, "mean_token_accuracy": 0.7312925457954407, "num_tokens": 21849575.0, "step": 8196, "train/ce_loss": 0.7210679650306702 }, { "epoch": 0.8103618746292268, "step": 8196, "train/sim_loss": 0.046875 }, { "epoch": 0.8103618746292268, "step": 8196, "train/total_loss": 0.11898180097341537 }, { "entropy": 8.528179168701172, "epoch": 0.8104607474787423, "mean_token_accuracy": 0.7588516473770142, "num_tokens": 21855248.0, "step": 8197, "train/ce_loss": 0.9351038932800293 }, { "epoch": 0.8104607474787423, "step": 8197, "train/sim_loss": 0.0859375 }, { "epoch": 0.8104607474787423, "step": 8197, "train/total_loss": 0.17944788932800293 }, { "entropy": 9.088539123535156, "epoch": 0.8105596203282579, "mean_token_accuracy": 0.7485148310661316, "num_tokens": 21860220.0, "step": 8198, "train/ce_loss": 2.4155744540621527e-05 }, { "epoch": 0.8105596203282579, "step": 8198, "train/sim_loss": 0.03515625 }, { "epoch": 0.8105596203282579, "step": 8198, "train/total_loss": 0.0351586639881134 }, { "entropy": 8.88365364074707, "epoch": 0.8106584931777734, "mean_token_accuracy": 0.7418397665023804, "num_tokens": 21865383.0, "step": 8199, "train/ce_loss": 1.06198251247406 }, { "epoch": 0.8106584931777734, "step": 8199, "train/sim_loss": 0.078125 }, { "epoch": 0.8106584931777734, "step": 8199, "train/total_loss": 0.184323251247406 }, { "epoch": 0.8107573660272889, "grad_norm": 0.7262281775474548, "learning_rate": 7.975325124857835e-06, "loss": 0.1342, "step": 8200 }, { "entropy": 8.694978713989258, "epoch": 0.8107573660272889, "mean_token_accuracy": 0.7109470963478088, "num_tokens": 21870697.0, "step": 8200, "train/ce_loss": 1.0218790769577026 }, { "epoch": 0.8107573660272889, "step": 8200, "train/sim_loss": 0.06640625 }, { "epoch": 0.8107573660272889, "step": 8200, "train/total_loss": 0.1685941517353058 }, { "entropy": 9.071819305419922, "epoch": 0.8108562388768045, "mean_token_accuracy": 0.7636612057685852, "num_tokens": 21875848.0, "step": 8201, "train/ce_loss": 1.4440220594406128 }, { "epoch": 0.8108562388768045, "step": 8201, "train/sim_loss": 0.05078125 }, { "epoch": 0.8108562388768045, "step": 8201, "train/total_loss": 0.19518345594406128 }, { "entropy": 9.03312873840332, "epoch": 0.81095511172632, "mean_token_accuracy": 0.7526717782020569, "num_tokens": 21880982.0, "step": 8202, "train/ce_loss": 0.9207428097724915 }, { "epoch": 0.81095511172632, "step": 8202, "train/sim_loss": 0.046875 }, { "epoch": 0.81095511172632, "step": 8202, "train/total_loss": 0.13894927501678467 }, { "entropy": 8.990741729736328, "epoch": 0.8110539845758354, "mean_token_accuracy": 0.7273918986320496, "num_tokens": 21886231.0, "step": 8203, "train/ce_loss": 0.988568127155304 }, { "epoch": 0.8110539845758354, "step": 8203, "train/sim_loss": 0.0546875 }, { "epoch": 0.8110539845758354, "step": 8203, "train/total_loss": 0.15354430675506592 }, { "entropy": 9.083792686462402, "epoch": 0.811152857425351, "mean_token_accuracy": 0.7047308087348938, "num_tokens": 21891320.0, "step": 8204, "train/ce_loss": 1.0045556336990558e-05 }, { "epoch": 0.811152857425351, "step": 8204, "train/sim_loss": 0.046875 }, { "epoch": 0.811152857425351, "step": 8204, "train/total_loss": 0.046876005828380585 }, { "entropy": 8.488182067871094, "epoch": 0.8112517302748665, "mean_token_accuracy": 0.7369033694267273, "num_tokens": 21896692.0, "step": 8205, "train/ce_loss": 1.4631035327911377 }, { "epoch": 0.8112517302748665, "step": 8205, "train/sim_loss": 0.05078125 }, { "epoch": 0.8112517302748665, "step": 8205, "train/total_loss": 0.19709160923957825 }, { "entropy": 9.184462547302246, "epoch": 0.811350603124382, "mean_token_accuracy": 0.7267658114433289, "num_tokens": 21901678.0, "step": 8206, "train/ce_loss": 1.351523995399475 }, { "epoch": 0.811350603124382, "step": 8206, "train/sim_loss": 0.046875 }, { "epoch": 0.811350603124382, "step": 8206, "train/total_loss": 0.1820273995399475 }, { "entropy": 8.80001449584961, "epoch": 0.8114494759738976, "mean_token_accuracy": 0.6877419352531433, "num_tokens": 21906948.0, "step": 8207, "train/ce_loss": 7.886482489993796e-05 }, { "epoch": 0.8114494759738976, "step": 8207, "train/sim_loss": 0.0234375 }, { "epoch": 0.8114494759738976, "step": 8207, "train/total_loss": 0.023445386439561844 }, { "entropy": 9.19816780090332, "epoch": 0.8115483488234131, "mean_token_accuracy": 0.7311828136444092, "num_tokens": 21912085.0, "step": 8208, "train/ce_loss": 0.5881509780883789 }, { "epoch": 0.8115483488234131, "step": 8208, "train/sim_loss": 0.0703125 }, { "epoch": 0.8115483488234131, "step": 8208, "train/total_loss": 0.1291275918483734 }, { "entropy": 8.79111385345459, "epoch": 0.8116472216729286, "mean_token_accuracy": 0.6975609660148621, "num_tokens": 21917155.0, "step": 8209, "train/ce_loss": 1.086192011833191 }, { "epoch": 0.8116472216729286, "step": 8209, "train/sim_loss": 0.08984375 }, { "epoch": 0.8116472216729286, "step": 8209, "train/total_loss": 0.19846296310424805 }, { "entropy": 8.186117172241211, "epoch": 0.8117460945224442, "mean_token_accuracy": 0.7790403962135315, "num_tokens": 21922386.0, "step": 8210, "train/ce_loss": 0.6409353017807007 }, { "epoch": 0.8117460945224442, "step": 8210, "train/sim_loss": 0.0234375 }, { "epoch": 0.8117460945224442, "step": 8210, "train/total_loss": 0.08753103017807007 }, { "entropy": 8.486915588378906, "epoch": 0.8118449673719597, "mean_token_accuracy": 0.7954545617103577, "num_tokens": 21927842.0, "step": 8211, "train/ce_loss": 0.6763221621513367 }, { "epoch": 0.8118449673719597, "step": 8211, "train/sim_loss": 0.01953125 }, { "epoch": 0.8118449673719597, "step": 8211, "train/total_loss": 0.08716347068548203 }, { "entropy": 8.522668838500977, "epoch": 0.8119438402214751, "mean_token_accuracy": 0.7724137902259827, "num_tokens": 21933210.0, "step": 8212, "train/ce_loss": 0.39027079939842224 }, { "epoch": 0.8119438402214751, "step": 8212, "train/sim_loss": 0.03515625 }, { "epoch": 0.8119438402214751, "step": 8212, "train/total_loss": 0.07418332993984222 }, { "entropy": 9.335029602050781, "epoch": 0.8120427130709907, "mean_token_accuracy": 0.751396656036377, "num_tokens": 21938016.0, "step": 8213, "train/ce_loss": 0.0015728664584457874 }, { "epoch": 0.8120427130709907, "step": 8213, "train/sim_loss": 0.03515625 }, { "epoch": 0.8120427130709907, "step": 8213, "train/total_loss": 0.03531353548169136 }, { "entropy": 8.462377548217773, "epoch": 0.8121415859205062, "mean_token_accuracy": 0.7784945964813232, "num_tokens": 21943407.0, "step": 8214, "train/ce_loss": 0.7918002605438232 }, { "epoch": 0.8121415859205062, "step": 8214, "train/sim_loss": 0.03515625 }, { "epoch": 0.8121415859205062, "step": 8214, "train/total_loss": 0.1143362745642662 }, { "entropy": 8.499216079711914, "epoch": 0.8122404587700217, "mean_token_accuracy": 0.7302423715591431, "num_tokens": 21948804.0, "step": 8215, "train/ce_loss": 0.5783441066741943 }, { "epoch": 0.8122404587700217, "step": 8215, "train/sim_loss": 0.0390625 }, { "epoch": 0.8122404587700217, "step": 8215, "train/total_loss": 0.09689691662788391 }, { "entropy": 9.778457641601562, "epoch": 0.8123393316195373, "mean_token_accuracy": 0.7428571581840515, "num_tokens": 21953551.0, "step": 8216, "train/ce_loss": 1.4274528439273126e-06 }, { "epoch": 0.8123393316195373, "step": 8216, "train/sim_loss": 0.02734375 }, { "epoch": 0.8123393316195373, "step": 8216, "train/total_loss": 0.02734389342367649 }, { "entropy": 8.859435081481934, "epoch": 0.8124382044690528, "mean_token_accuracy": 0.7398601174354553, "num_tokens": 21959061.0, "step": 8217, "train/ce_loss": 0.7559324502944946 }, { "epoch": 0.8124382044690528, "step": 8217, "train/sim_loss": 0.0859375 }, { "epoch": 0.8124382044690528, "step": 8217, "train/total_loss": 0.1615307480096817 }, { "entropy": 8.961817741394043, "epoch": 0.8125370773185683, "mean_token_accuracy": 0.780802309513092, "num_tokens": 21964232.0, "step": 8218, "train/ce_loss": 0.7763864398002625 }, { "epoch": 0.8125370773185683, "step": 8218, "train/sim_loss": 0.09375 }, { "epoch": 0.8125370773185683, "step": 8218, "train/total_loss": 0.1713886559009552 }, { "entropy": 8.722240447998047, "epoch": 0.8126359501680839, "mean_token_accuracy": 0.7825000286102295, "num_tokens": 21969525.0, "step": 8219, "train/ce_loss": 0.6948031783103943 }, { "epoch": 0.8126359501680839, "step": 8219, "train/sim_loss": 0.046875 }, { "epoch": 0.8126359501680839, "step": 8219, "train/total_loss": 0.11635532230138779 }, { "epoch": 0.8127348230175994, "grad_norm": 0.5876989960670471, "learning_rate": 7.970380260099887e-06, "loss": 0.1333, "step": 8220 }, { "entropy": 9.117151260375977, "epoch": 0.8127348230175994, "mean_token_accuracy": 0.7641242742538452, "num_tokens": 21974903.0, "step": 8220, "train/ce_loss": 0.6785506010055542 }, { "epoch": 0.8127348230175994, "step": 8220, "train/sim_loss": 0.0546875 }, { "epoch": 0.8127348230175994, "step": 8220, "train/total_loss": 0.12254256010055542 }, { "entropy": 8.457649230957031, "epoch": 0.8128336958671148, "mean_token_accuracy": 0.7218309640884399, "num_tokens": 21980267.0, "step": 8221, "train/ce_loss": 1.3428711891174316 }, { "epoch": 0.8128336958671148, "step": 8221, "train/sim_loss": 0.07421875 }, { "epoch": 0.8128336958671148, "step": 8221, "train/total_loss": 0.20850586891174316 }, { "entropy": 9.19150161743164, "epoch": 0.8129325687166304, "mean_token_accuracy": 0.740234375, "num_tokens": 21985217.0, "step": 8222, "train/ce_loss": 0.7980934381484985 }, { "epoch": 0.8129325687166304, "step": 8222, "train/sim_loss": 0.04296875 }, { "epoch": 0.8129325687166304, "step": 8222, "train/total_loss": 0.12277809530496597 }, { "entropy": 9.105156898498535, "epoch": 0.8130314415661459, "mean_token_accuracy": 0.6800000071525574, "num_tokens": 21990177.0, "step": 8223, "train/ce_loss": 0.8357672095298767 }, { "epoch": 0.8130314415661459, "step": 8223, "train/sim_loss": 0.03515625 }, { "epoch": 0.8130314415661459, "step": 8223, "train/total_loss": 0.11873297393321991 }, { "entropy": 8.884305953979492, "epoch": 0.8131303144156614, "mean_token_accuracy": 0.7631579041481018, "num_tokens": 21995414.0, "step": 8224, "train/ce_loss": 3.9317649225267814e-07 }, { "epoch": 0.8131303144156614, "step": 8224, "train/sim_loss": 0.0390625 }, { "epoch": 0.8131303144156614, "step": 8224, "train/total_loss": 0.03906254097819328 }, { "entropy": 8.017794609069824, "epoch": 0.813229187265177, "mean_token_accuracy": 0.7348754405975342, "num_tokens": 22001014.0, "step": 8225, "train/ce_loss": 1.0558550357818604 }, { "epoch": 0.813229187265177, "step": 8225, "train/sim_loss": 0.0546875 }, { "epoch": 0.813229187265177, "step": 8225, "train/total_loss": 0.160273015499115 }, { "entropy": 8.938722610473633, "epoch": 0.8133280601146925, "mean_token_accuracy": 0.7399702668190002, "num_tokens": 22006115.0, "step": 8226, "train/ce_loss": 1.6660056114196777 }, { "epoch": 0.8133280601146925, "step": 8226, "train/sim_loss": 0.0625 }, { "epoch": 0.8133280601146925, "step": 8226, "train/total_loss": 0.2291005700826645 }, { "entropy": 9.419342994689941, "epoch": 0.813426932964208, "mean_token_accuracy": 0.7194244861602783, "num_tokens": 22010946.0, "step": 8227, "train/ce_loss": 4.596821554514463e-07 }, { "epoch": 0.813426932964208, "step": 8227, "train/sim_loss": 0.03125 }, { "epoch": 0.813426932964208, "step": 8227, "train/total_loss": 0.03125004470348358 }, { "entropy": 9.03645133972168, "epoch": 0.8135258058137236, "mean_token_accuracy": 0.7803468108177185, "num_tokens": 22016062.0, "step": 8228, "train/ce_loss": 6.380624313351291e-07 }, { "epoch": 0.8135258058137236, "step": 8228, "train/sim_loss": 0.04296875 }, { "epoch": 0.8135258058137236, "step": 8228, "train/total_loss": 0.042968813329935074 }, { "entropy": 8.901385307312012, "epoch": 0.8136246786632391, "mean_token_accuracy": 0.692187488079071, "num_tokens": 22021166.0, "step": 8229, "train/ce_loss": 6.219978786248248e-06 }, { "epoch": 0.8136246786632391, "step": 8229, "train/sim_loss": 0.03515625 }, { "epoch": 0.8136246786632391, "step": 8229, "train/total_loss": 0.03515687212347984 }, { "entropy": 8.753562927246094, "epoch": 0.8137235515127546, "mean_token_accuracy": 0.7471910119056702, "num_tokens": 22026560.0, "step": 8230, "train/ce_loss": 0.8991544246673584 }, { "epoch": 0.8137235515127546, "step": 8230, "train/sim_loss": 0.0546875 }, { "epoch": 0.8137235515127546, "step": 8230, "train/total_loss": 0.1446029543876648 }, { "entropy": 9.053617477416992, "epoch": 0.8138224243622701, "mean_token_accuracy": 0.7304624915122986, "num_tokens": 22031656.0, "step": 8231, "train/ce_loss": 6.243632242330932e-07 }, { "epoch": 0.8138224243622701, "step": 8231, "train/sim_loss": 0.0625 }, { "epoch": 0.8138224243622701, "step": 8231, "train/total_loss": 0.06250005960464478 }, { "entropy": 8.620099067687988, "epoch": 0.8139212972117856, "mean_token_accuracy": 0.6655328869819641, "num_tokens": 22036981.0, "step": 8232, "train/ce_loss": 1.7731776237487793 }, { "epoch": 0.8139212972117856, "step": 8232, "train/sim_loss": 0.08203125 }, { "epoch": 0.8139212972117856, "step": 8232, "train/total_loss": 0.2593490183353424 }, { "entropy": 8.56728744506836, "epoch": 0.8140201700613011, "mean_token_accuracy": 0.746582567691803, "num_tokens": 22042453.0, "step": 8233, "train/ce_loss": 0.9670370221138 }, { "epoch": 0.8140201700613011, "step": 8233, "train/sim_loss": 0.06640625 }, { "epoch": 0.8140201700613011, "step": 8233, "train/total_loss": 0.16310995817184448 }, { "entropy": 8.869913101196289, "epoch": 0.8141190429108167, "mean_token_accuracy": 0.7425068020820618, "num_tokens": 22047647.0, "step": 8234, "train/ce_loss": 0.49576520919799805 }, { "epoch": 0.8141190429108167, "step": 8234, "train/sim_loss": 0.01953125 }, { "epoch": 0.8141190429108167, "step": 8234, "train/total_loss": 0.0691077709197998 }, { "entropy": 8.869132995605469, "epoch": 0.8142179157603322, "mean_token_accuracy": 0.7114361524581909, "num_tokens": 22052916.0, "step": 8235, "train/ce_loss": 0.7772649526596069 }, { "epoch": 0.8142179157603322, "step": 8235, "train/sim_loss": 0.0390625 }, { "epoch": 0.8142179157603322, "step": 8235, "train/total_loss": 0.11678899824619293 }, { "entropy": 8.645116806030273, "epoch": 0.8143167886098477, "mean_token_accuracy": 0.7642679810523987, "num_tokens": 22058211.0, "step": 8236, "train/ce_loss": 1.0038399696350098 }, { "epoch": 0.8143167886098477, "step": 8236, "train/sim_loss": 0.02734375 }, { "epoch": 0.8143167886098477, "step": 8236, "train/total_loss": 0.12772774696350098 }, { "entropy": 8.418092727661133, "epoch": 0.8144156614593633, "mean_token_accuracy": 0.7742663621902466, "num_tokens": 22063569.0, "step": 8237, "train/ce_loss": 0.8072574734687805 }, { "epoch": 0.8144156614593633, "step": 8237, "train/sim_loss": 0.0546875 }, { "epoch": 0.8144156614593633, "step": 8237, "train/total_loss": 0.135413259267807 }, { "entropy": 8.732414245605469, "epoch": 0.8145145343088788, "mean_token_accuracy": 0.7214111685752869, "num_tokens": 22068807.0, "step": 8238, "train/ce_loss": 0.7674160599708557 }, { "epoch": 0.8145145343088788, "step": 8238, "train/sim_loss": 0.046875 }, { "epoch": 0.8145145343088788, "step": 8238, "train/total_loss": 0.12361660599708557 }, { "entropy": 8.821417808532715, "epoch": 0.8146134071583943, "mean_token_accuracy": 0.6559571623802185, "num_tokens": 22074002.0, "step": 8239, "train/ce_loss": 1.8144460916519165 }, { "epoch": 0.8146134071583943, "step": 8239, "train/sim_loss": 0.03515625 }, { "epoch": 0.8146134071583943, "step": 8239, "train/total_loss": 0.21660086512565613 }, { "epoch": 0.8147122800079099, "grad_norm": 0.7151651382446289, "learning_rate": 7.965435395341938e-06, "loss": 0.1417, "step": 8240 }, { "entropy": 9.104612350463867, "epoch": 0.8147122800079099, "mean_token_accuracy": 0.779321014881134, "num_tokens": 22079096.0, "step": 8240, "train/ce_loss": 0.8043537735939026 }, { "epoch": 0.8147122800079099, "step": 8240, "train/sim_loss": 0.01171875 }, { "epoch": 0.8147122800079099, "step": 8240, "train/total_loss": 0.0921541303396225 }, { "entropy": 9.01150131225586, "epoch": 0.8148111528574253, "mean_token_accuracy": 0.8212209343910217, "num_tokens": 22084214.0, "step": 8241, "train/ce_loss": 0.7468549013137817 }, { "epoch": 0.8148111528574253, "step": 8241, "train/sim_loss": 0.015625 }, { "epoch": 0.8148111528574253, "step": 8241, "train/total_loss": 0.0903104916214943 }, { "entropy": 8.795052528381348, "epoch": 0.8149100257069408, "mean_token_accuracy": 0.7451403737068176, "num_tokens": 22089596.0, "step": 8242, "train/ce_loss": 0.8570025563240051 }, { "epoch": 0.8149100257069408, "step": 8242, "train/sim_loss": 0.0703125 }, { "epoch": 0.8149100257069408, "step": 8242, "train/total_loss": 0.15601275861263275 }, { "entropy": 8.701278686523438, "epoch": 0.8150088985564564, "mean_token_accuracy": 0.7772151827812195, "num_tokens": 22094869.0, "step": 8243, "train/ce_loss": 0.9589567184448242 }, { "epoch": 0.8150088985564564, "step": 8243, "train/sim_loss": 0.1015625 }, { "epoch": 0.8150088985564564, "step": 8243, "train/total_loss": 0.1974581778049469 }, { "entropy": 8.662206649780273, "epoch": 0.8151077714059719, "mean_token_accuracy": 0.6931034326553345, "num_tokens": 22099893.0, "step": 8244, "train/ce_loss": 1.9629923105239868 }, { "epoch": 0.8151077714059719, "step": 8244, "train/sim_loss": 0.0546875 }, { "epoch": 0.8151077714059719, "step": 8244, "train/total_loss": 0.2509867548942566 }, { "entropy": 9.216302871704102, "epoch": 0.8152066442554874, "mean_token_accuracy": 0.7698675394058228, "num_tokens": 22104907.0, "step": 8245, "train/ce_loss": 0.9153091907501221 }, { "epoch": 0.8152066442554874, "step": 8245, "train/sim_loss": 0.03515625 }, { "epoch": 0.8152066442554874, "step": 8245, "train/total_loss": 0.1266871690750122 }, { "entropy": 8.466188430786133, "epoch": 0.815305517105003, "mean_token_accuracy": 0.7120253443717957, "num_tokens": 22110327.0, "step": 8246, "train/ce_loss": 0.749636173248291 }, { "epoch": 0.815305517105003, "step": 8246, "train/sim_loss": 0.09765625 }, { "epoch": 0.815305517105003, "step": 8246, "train/total_loss": 0.17261987924575806 }, { "entropy": 8.246451377868652, "epoch": 0.8154043899545185, "mean_token_accuracy": 0.7100409865379333, "num_tokens": 22115812.0, "step": 8247, "train/ce_loss": 1.0034377574920654 }, { "epoch": 0.8154043899545185, "step": 8247, "train/sim_loss": 0.05859375 }, { "epoch": 0.8154043899545185, "step": 8247, "train/total_loss": 0.15893752872943878 }, { "entropy": 8.852086067199707, "epoch": 0.815503262804034, "mean_token_accuracy": 0.7243510484695435, "num_tokens": 22121040.0, "step": 8248, "train/ce_loss": 0.5238107442855835 }, { "epoch": 0.815503262804034, "step": 8248, "train/sim_loss": 0.05859375 }, { "epoch": 0.815503262804034, "step": 8248, "train/total_loss": 0.11097482591867447 }, { "entropy": 8.911182403564453, "epoch": 0.8156021356535496, "mean_token_accuracy": 0.7021563053131104, "num_tokens": 22126240.0, "step": 8249, "train/ce_loss": 0.6195603609085083 }, { "epoch": 0.8156021356535496, "step": 8249, "train/sim_loss": 0.04296875 }, { "epoch": 0.8156021356535496, "step": 8249, "train/total_loss": 0.10492478311061859 }, { "entropy": 9.07373046875, "epoch": 0.815701008503065, "mean_token_accuracy": 0.730182945728302, "num_tokens": 22131352.0, "step": 8250, "train/ce_loss": 0.567441999912262 }, { "epoch": 0.815701008503065, "step": 8250, "train/sim_loss": 0.0390625 }, { "epoch": 0.815701008503065, "step": 8250, "train/total_loss": 0.09580670297145844 }, { "entropy": 8.514007568359375, "epoch": 0.8157998813525806, "mean_token_accuracy": 0.7369614243507385, "num_tokens": 22136710.0, "step": 8251, "train/ce_loss": 0.41268324851989746 }, { "epoch": 0.8157998813525806, "step": 8251, "train/sim_loss": 0.0234375 }, { "epoch": 0.8157998813525806, "step": 8251, "train/total_loss": 0.06470582634210587 }, { "entropy": 8.258570671081543, "epoch": 0.8158987542020961, "mean_token_accuracy": 0.6890848875045776, "num_tokens": 22142144.0, "step": 8252, "train/ce_loss": 1.0246692895889282 }, { "epoch": 0.8158987542020961, "step": 8252, "train/sim_loss": 0.046875 }, { "epoch": 0.8158987542020961, "step": 8252, "train/total_loss": 0.14934194087982178 }, { "entropy": 8.888803482055664, "epoch": 0.8159976270516116, "mean_token_accuracy": 0.7424441576004028, "num_tokens": 22147351.0, "step": 8253, "train/ce_loss": 0.8449608087539673 }, { "epoch": 0.8159976270516116, "step": 8253, "train/sim_loss": 0.03125 }, { "epoch": 0.8159976270516116, "step": 8253, "train/total_loss": 0.11574608087539673 }, { "entropy": 8.89084243774414, "epoch": 0.8160964999011272, "mean_token_accuracy": 0.7249190807342529, "num_tokens": 22152469.0, "step": 8254, "train/ce_loss": 0.8302783370018005 }, { "epoch": 0.8160964999011272, "step": 8254, "train/sim_loss": 0.0625 }, { "epoch": 0.8160964999011272, "step": 8254, "train/total_loss": 0.14552783966064453 }, { "entropy": 9.082358360290527, "epoch": 0.8161953727506427, "mean_token_accuracy": 0.7819444537162781, "num_tokens": 22157606.0, "step": 8255, "train/ce_loss": 1.2188773155212402 }, { "epoch": 0.8161953727506427, "step": 8255, "train/sim_loss": 0.0546875 }, { "epoch": 0.8161953727506427, "step": 8255, "train/total_loss": 0.17657524347305298 }, { "entropy": 8.382397651672363, "epoch": 0.8162942456001582, "mean_token_accuracy": 0.7291471362113953, "num_tokens": 22163092.0, "step": 8256, "train/ce_loss": 0.9671027064323425 }, { "epoch": 0.8162942456001582, "step": 8256, "train/sim_loss": 0.05859375 }, { "epoch": 0.8162942456001582, "step": 8256, "train/total_loss": 0.15530401468276978 }, { "entropy": 8.917795181274414, "epoch": 0.8163931184496738, "mean_token_accuracy": 0.7467249035835266, "num_tokens": 22168221.0, "step": 8257, "train/ce_loss": 8.391322126044543e-07 }, { "epoch": 0.8163931184496738, "step": 8257, "train/sim_loss": 0.08203125 }, { "epoch": 0.8163931184496738, "step": 8257, "train/total_loss": 0.08203133195638657 }, { "entropy": 9.265806198120117, "epoch": 0.8164919912991893, "mean_token_accuracy": 0.7693877816200256, "num_tokens": 22173150.0, "step": 8258, "train/ce_loss": 1.5727033615112305 }, { "epoch": 0.8164919912991893, "step": 8258, "train/sim_loss": 0.03515625 }, { "epoch": 0.8164919912991893, "step": 8258, "train/total_loss": 0.19242659211158752 }, { "entropy": 8.78251838684082, "epoch": 0.8165908641487047, "mean_token_accuracy": 0.8165829181671143, "num_tokens": 22178403.0, "step": 8259, "train/ce_loss": 0.8774014711380005 }, { "epoch": 0.8165908641487047, "step": 8259, "train/sim_loss": 0.0234375 }, { "epoch": 0.8165908641487047, "step": 8259, "train/total_loss": 0.11117764562368393 }, { "epoch": 0.8166897369982203, "grad_norm": 0.6642946004867554, "learning_rate": 7.96049053058399e-06, "loss": 0.134, "step": 8260 }, { "entropy": 8.415128707885742, "epoch": 0.8166897369982203, "mean_token_accuracy": 0.6840882897377014, "num_tokens": 22183787.0, "step": 8260, "train/ce_loss": 0.7524656653404236 }, { "epoch": 0.8166897369982203, "step": 8260, "train/sim_loss": 0.07421875 }, { "epoch": 0.8166897369982203, "step": 8260, "train/total_loss": 0.14946532249450684 }, { "entropy": 8.952371597290039, "epoch": 0.8167886098477358, "mean_token_accuracy": 0.7789165377616882, "num_tokens": 22188922.0, "step": 8261, "train/ce_loss": 1.5190229415893555 }, { "epoch": 0.8167886098477358, "step": 8261, "train/sim_loss": 0.1484375 }, { "epoch": 0.8167886098477358, "step": 8261, "train/total_loss": 0.30033981800079346 }, { "entropy": 8.918517112731934, "epoch": 0.8168874826972513, "mean_token_accuracy": 0.6932153105735779, "num_tokens": 22194040.0, "step": 8262, "train/ce_loss": 1.3680672645568848 }, { "epoch": 0.8168874826972513, "step": 8262, "train/sim_loss": 0.046875 }, { "epoch": 0.8168874826972513, "step": 8262, "train/total_loss": 0.18368172645568848 }, { "entropy": 8.656283378601074, "epoch": 0.8169863555467669, "mean_token_accuracy": 0.7665418386459351, "num_tokens": 22199355.0, "step": 8263, "train/ce_loss": 0.9361574053764343 }, { "epoch": 0.8169863555467669, "step": 8263, "train/sim_loss": 0.0625 }, { "epoch": 0.8169863555467669, "step": 8263, "train/total_loss": 0.15611574053764343 }, { "entropy": 8.89004898071289, "epoch": 0.8170852283962824, "mean_token_accuracy": 0.767160177230835, "num_tokens": 22204549.0, "step": 8264, "train/ce_loss": 0.7157173156738281 }, { "epoch": 0.8170852283962824, "step": 8264, "train/sim_loss": 0.0390625 }, { "epoch": 0.8170852283962824, "step": 8264, "train/total_loss": 0.1106342300772667 }, { "entropy": 8.740985870361328, "epoch": 0.8171841012457979, "mean_token_accuracy": 0.7036144733428955, "num_tokens": 22209881.0, "step": 8265, "train/ce_loss": 1.5244280099868774 }, { "epoch": 0.8171841012457979, "step": 8265, "train/sim_loss": 0.08984375 }, { "epoch": 0.8171841012457979, "step": 8265, "train/total_loss": 0.2422865480184555 }, { "entropy": 8.232596397399902, "epoch": 0.8172829740953135, "mean_token_accuracy": 0.7505694627761841, "num_tokens": 22215296.0, "step": 8266, "train/ce_loss": 0.7841481566429138 }, { "epoch": 0.8172829740953135, "step": 8266, "train/sim_loss": 0.0390625 }, { "epoch": 0.8172829740953135, "step": 8266, "train/total_loss": 0.11747732013463974 }, { "entropy": 8.950629234313965, "epoch": 0.817381846944829, "mean_token_accuracy": 0.7545582056045532, "num_tokens": 22220480.0, "step": 8267, "train/ce_loss": 0.7292152643203735 }, { "epoch": 0.817381846944829, "step": 8267, "train/sim_loss": 0.06640625 }, { "epoch": 0.817381846944829, "step": 8267, "train/total_loss": 0.1393277794122696 }, { "entropy": 8.48008918762207, "epoch": 0.8174807197943444, "mean_token_accuracy": 0.6881313323974609, "num_tokens": 22225729.0, "step": 8268, "train/ce_loss": 1.4195605899658403e-06 }, { "epoch": 0.8174807197943444, "step": 8268, "train/sim_loss": 0.05078125 }, { "epoch": 0.8174807197943444, "step": 8268, "train/total_loss": 0.05078139156103134 }, { "entropy": 9.015676498413086, "epoch": 0.81757959264386, "mean_token_accuracy": 0.7643312215805054, "num_tokens": 22230803.0, "step": 8269, "train/ce_loss": 0.6524415016174316 }, { "epoch": 0.81757959264386, "step": 8269, "train/sim_loss": 0.0703125 }, { "epoch": 0.81757959264386, "step": 8269, "train/total_loss": 0.1355566531419754 }, { "entropy": 8.901986122131348, "epoch": 0.8176784654933755, "mean_token_accuracy": 0.7915493249893188, "num_tokens": 22235973.0, "step": 8270, "train/ce_loss": 0.9602965712547302 }, { "epoch": 0.8176784654933755, "step": 8270, "train/sim_loss": 0.06640625 }, { "epoch": 0.8176784654933755, "step": 8270, "train/total_loss": 0.16243591904640198 }, { "entropy": 8.636728286743164, "epoch": 0.817777338342891, "mean_token_accuracy": 0.7170022130012512, "num_tokens": 22241334.0, "step": 8271, "train/ce_loss": 1.8001161813735962 }, { "epoch": 0.817777338342891, "step": 8271, "train/sim_loss": 0.046875 }, { "epoch": 0.817777338342891, "step": 8271, "train/total_loss": 0.22688661515712738 }, { "entropy": 9.415346145629883, "epoch": 0.8178762111924066, "mean_token_accuracy": 0.7160883545875549, "num_tokens": 22246057.0, "step": 8272, "train/ce_loss": 2.0104639530181885 }, { "epoch": 0.8178762111924066, "step": 8272, "train/sim_loss": 0.0625 }, { "epoch": 0.8178762111924066, "step": 8272, "train/total_loss": 0.2635464072227478 }, { "entropy": 9.019721031188965, "epoch": 0.8179750840419221, "mean_token_accuracy": 0.7765957713127136, "num_tokens": 22251219.0, "step": 8273, "train/ce_loss": 0.819446325302124 }, { "epoch": 0.8179750840419221, "step": 8273, "train/sim_loss": 0.0390625 }, { "epoch": 0.8179750840419221, "step": 8273, "train/total_loss": 0.12100713700056076 }, { "entropy": 8.677478790283203, "epoch": 0.8180739568914376, "mean_token_accuracy": 0.7400274872779846, "num_tokens": 22256400.0, "step": 8274, "train/ce_loss": 5.961961164757668e-07 }, { "epoch": 0.8180739568914376, "step": 8274, "train/sim_loss": 0.01953125 }, { "epoch": 0.8180739568914376, "step": 8274, "train/total_loss": 0.019531309604644775 }, { "entropy": 8.468635559082031, "epoch": 0.8181728297409532, "mean_token_accuracy": 0.8016194105148315, "num_tokens": 22261867.0, "step": 8275, "train/ce_loss": 0.6289389729499817 }, { "epoch": 0.8181728297409532, "step": 8275, "train/sim_loss": 0.0625 }, { "epoch": 0.8181728297409532, "step": 8275, "train/total_loss": 0.12539389729499817 }, { "entropy": 8.647253036499023, "epoch": 0.8182717025904687, "mean_token_accuracy": 0.734375, "num_tokens": 22267152.0, "step": 8276, "train/ce_loss": 0.6211887001991272 }, { "epoch": 0.8182717025904687, "step": 8276, "train/sim_loss": 0.03125 }, { "epoch": 0.8182717025904687, "step": 8276, "train/total_loss": 0.09336887300014496 }, { "entropy": 9.016571998596191, "epoch": 0.8183705754399841, "mean_token_accuracy": 0.765739381313324, "num_tokens": 22272265.0, "step": 8277, "train/ce_loss": 1.0259438753128052 }, { "epoch": 0.8183705754399841, "step": 8277, "train/sim_loss": 0.078125 }, { "epoch": 0.8183705754399841, "step": 8277, "train/total_loss": 0.18071939051151276 }, { "entropy": 8.80375862121582, "epoch": 0.8184694482894997, "mean_token_accuracy": 0.7493734359741211, "num_tokens": 22277490.0, "step": 8278, "train/ce_loss": 5.999174277349084e-07 }, { "epoch": 0.8184694482894997, "step": 8278, "train/sim_loss": 0.03515625 }, { "epoch": 0.8184694482894997, "step": 8278, "train/total_loss": 0.035156309604644775 }, { "entropy": 8.638599395751953, "epoch": 0.8185683211390152, "mean_token_accuracy": 0.7134292721748352, "num_tokens": 22282797.0, "step": 8279, "train/ce_loss": 0.7618140578269958 }, { "epoch": 0.8185683211390152, "step": 8279, "train/sim_loss": 0.05078125 }, { "epoch": 0.8185683211390152, "step": 8279, "train/total_loss": 0.12696266174316406 }, { "epoch": 0.8186671939885307, "grad_norm": 0.7438392043113708, "learning_rate": 7.95554566582604e-06, "loss": 0.1353, "step": 8280 }, { "entropy": 9.337332725524902, "epoch": 0.8186671939885307, "mean_token_accuracy": 0.7757847309112549, "num_tokens": 22287669.0, "step": 8280, "train/ce_loss": 1.153381586074829 }, { "epoch": 0.8186671939885307, "step": 8280, "train/sim_loss": 0.08203125 }, { "epoch": 0.8186671939885307, "step": 8280, "train/total_loss": 0.19736941158771515 }, { "entropy": 8.556232452392578, "epoch": 0.8187660668380463, "mean_token_accuracy": 0.692118227481842, "num_tokens": 22292920.0, "step": 8281, "train/ce_loss": 0.8099319338798523 }, { "epoch": 0.8187660668380463, "step": 8281, "train/sim_loss": 0.0234375 }, { "epoch": 0.8187660668380463, "step": 8281, "train/total_loss": 0.10443069785833359 }, { "entropy": 8.515803337097168, "epoch": 0.8188649396875618, "mean_token_accuracy": 0.6752136945724487, "num_tokens": 22298330.0, "step": 8282, "train/ce_loss": 1.8313559293746948 }, { "epoch": 0.8188649396875618, "step": 8282, "train/sim_loss": 0.0390625 }, { "epoch": 0.8188649396875618, "step": 8282, "train/total_loss": 0.22219809889793396 }, { "entropy": 9.142642974853516, "epoch": 0.8189638125370773, "mean_token_accuracy": 0.7852664589881897, "num_tokens": 22303398.0, "step": 8283, "train/ce_loss": 1.0307633876800537 }, { "epoch": 0.8189638125370773, "step": 8283, "train/sim_loss": 0.0546875 }, { "epoch": 0.8189638125370773, "step": 8283, "train/total_loss": 0.15776383876800537 }, { "entropy": 8.419479370117188, "epoch": 0.8190626853865929, "mean_token_accuracy": 0.7898229956626892, "num_tokens": 22308773.0, "step": 8284, "train/ce_loss": 0.5398300886154175 }, { "epoch": 0.8190626853865929, "step": 8284, "train/sim_loss": 0.0546875 }, { "epoch": 0.8190626853865929, "step": 8284, "train/total_loss": 0.10867051035165787 }, { "entropy": 8.660914421081543, "epoch": 0.8191615582361084, "mean_token_accuracy": 0.7525309324264526, "num_tokens": 22314115.0, "step": 8285, "train/ce_loss": 1.1084849834442139 }, { "epoch": 0.8191615582361084, "step": 8285, "train/sim_loss": 0.046875 }, { "epoch": 0.8191615582361084, "step": 8285, "train/total_loss": 0.15772350132465363 }, { "entropy": 8.585163116455078, "epoch": 0.8192604310856239, "mean_token_accuracy": 0.8113878965377808, "num_tokens": 22319453.0, "step": 8286, "train/ce_loss": 0.3588511645793915 }, { "epoch": 0.8192604310856239, "step": 8286, "train/sim_loss": 0.078125 }, { "epoch": 0.8192604310856239, "step": 8286, "train/total_loss": 0.11401011794805527 }, { "entropy": 8.49917221069336, "epoch": 0.8193593039351394, "mean_token_accuracy": 0.7005405426025391, "num_tokens": 22324888.0, "step": 8287, "train/ce_loss": 0.6729772090911865 }, { "epoch": 0.8193593039351394, "step": 8287, "train/sim_loss": 0.07421875 }, { "epoch": 0.8193593039351394, "step": 8287, "train/total_loss": 0.14151647686958313 }, { "entropy": 9.326393127441406, "epoch": 0.8194581767846549, "mean_token_accuracy": 0.723809540271759, "num_tokens": 22329744.0, "step": 8288, "train/ce_loss": 1.513746976852417 }, { "epoch": 0.8194581767846549, "step": 8288, "train/sim_loss": 0.09375 }, { "epoch": 0.8194581767846549, "step": 8288, "train/total_loss": 0.2451246976852417 }, { "entropy": 8.601880073547363, "epoch": 0.8195570496341704, "mean_token_accuracy": 0.7397260069847107, "num_tokens": 22335017.0, "step": 8289, "train/ce_loss": 0.7906304001808167 }, { "epoch": 0.8195570496341704, "step": 8289, "train/sim_loss": 0.0390625 }, { "epoch": 0.8195570496341704, "step": 8289, "train/total_loss": 0.1181255429983139 }, { "entropy": 8.5064115524292, "epoch": 0.819655922483686, "mean_token_accuracy": 0.7393674850463867, "num_tokens": 22340411.0, "step": 8290, "train/ce_loss": 0.7451791167259216 }, { "epoch": 0.819655922483686, "step": 8290, "train/sim_loss": 0.015625 }, { "epoch": 0.819655922483686, "step": 8290, "train/total_loss": 0.09014291316270828 }, { "entropy": 9.029192924499512, "epoch": 0.8197547953332015, "mean_token_accuracy": 0.7339743375778198, "num_tokens": 22345485.0, "step": 8291, "train/ce_loss": 0.9517990350723267 }, { "epoch": 0.8197547953332015, "step": 8291, "train/sim_loss": 0.05859375 }, { "epoch": 0.8197547953332015, "step": 8291, "train/total_loss": 0.15377366542816162 }, { "entropy": 8.917953491210938, "epoch": 0.819853668182717, "mean_token_accuracy": 0.7058823704719543, "num_tokens": 22350742.0, "step": 8292, "train/ce_loss": 1.5822000705156825e-06 }, { "epoch": 0.819853668182717, "step": 8292, "train/sim_loss": 0.03125 }, { "epoch": 0.819853668182717, "step": 8292, "train/total_loss": 0.031250156462192535 }, { "entropy": 9.037485122680664, "epoch": 0.8199525410322326, "mean_token_accuracy": 0.7503876090049744, "num_tokens": 22355827.0, "step": 8293, "train/ce_loss": 1.3138306140899658 }, { "epoch": 0.8199525410322326, "step": 8293, "train/sim_loss": 0.06640625 }, { "epoch": 0.8199525410322326, "step": 8293, "train/total_loss": 0.19778931140899658 }, { "entropy": 9.30839729309082, "epoch": 0.8200514138817481, "mean_token_accuracy": 0.7744361162185669, "num_tokens": 22360627.0, "step": 8294, "train/ce_loss": 1.0988813638687134 }, { "epoch": 0.8200514138817481, "step": 8294, "train/sim_loss": 0.0390625 }, { "epoch": 0.8200514138817481, "step": 8294, "train/total_loss": 0.14895063638687134 }, { "entropy": 8.727283477783203, "epoch": 0.8201502867312636, "mean_token_accuracy": 0.7434210777282715, "num_tokens": 22365805.0, "step": 8295, "train/ce_loss": 1.1040056943893433 }, { "epoch": 0.8201502867312636, "step": 8295, "train/sim_loss": 0.03125 }, { "epoch": 0.8201502867312636, "step": 8295, "train/total_loss": 0.14165057241916656 }, { "entropy": 8.835163116455078, "epoch": 0.8202491595807792, "mean_token_accuracy": 0.7066666483879089, "num_tokens": 22371104.0, "step": 8296, "train/ce_loss": 1.369777798652649 }, { "epoch": 0.8202491595807792, "step": 8296, "train/sim_loss": 0.05859375 }, { "epoch": 0.8202491595807792, "step": 8296, "train/total_loss": 0.19557152688503265 }, { "entropy": 8.338752746582031, "epoch": 0.8203480324302946, "mean_token_accuracy": 0.748913049697876, "num_tokens": 22376719.0, "step": 8297, "train/ce_loss": 0.8425332307815552 }, { "epoch": 0.8203480324302946, "step": 8297, "train/sim_loss": 0.0625 }, { "epoch": 0.8203480324302946, "step": 8297, "train/total_loss": 0.14675332605838776 }, { "entropy": 8.253923416137695, "epoch": 0.8204469052798101, "mean_token_accuracy": 0.7366803288459778, "num_tokens": 22382188.0, "step": 8298, "train/ce_loss": 0.9377067685127258 }, { "epoch": 0.8204469052798101, "step": 8298, "train/sim_loss": 0.0546875 }, { "epoch": 0.8204469052798101, "step": 8298, "train/total_loss": 0.14845818281173706 }, { "entropy": 9.257837295532227, "epoch": 0.8205457781293257, "mean_token_accuracy": 0.728314220905304, "num_tokens": 22387243.0, "step": 8299, "train/ce_loss": 0.8520750403404236 }, { "epoch": 0.8205457781293257, "step": 8299, "train/sim_loss": 0.03515625 }, { "epoch": 0.8205457781293257, "step": 8299, "train/total_loss": 0.1203637570142746 }, { "epoch": 0.8206446509788412, "grad_norm": 0.7130552530288696, "learning_rate": 7.950600801068091e-06, "loss": 0.1332, "step": 8300 }, { "entropy": 8.64598274230957, "epoch": 0.8206446509788412, "mean_token_accuracy": 0.7901375889778137, "num_tokens": 22392555.0, "step": 8300, "train/ce_loss": 0.6230838894844055 }, { "epoch": 0.8206446509788412, "step": 8300, "train/sim_loss": 0.11328125 }, { "epoch": 0.8206446509788412, "step": 8300, "train/total_loss": 0.1755896359682083 }, { "entropy": 9.163591384887695, "epoch": 0.8207435238283567, "mean_token_accuracy": 0.7315112352371216, "num_tokens": 22397643.0, "step": 8301, "train/ce_loss": 1.5040645599365234 }, { "epoch": 0.8207435238283567, "step": 8301, "train/sim_loss": 0.078125 }, { "epoch": 0.8207435238283567, "step": 8301, "train/total_loss": 0.22853146493434906 }, { "entropy": 9.302888870239258, "epoch": 0.8208423966778723, "mean_token_accuracy": 0.7225000262260437, "num_tokens": 22402433.0, "step": 8302, "train/ce_loss": 1.322007392445812e-06 }, { "epoch": 0.8208423966778723, "step": 8302, "train/sim_loss": 0.046875 }, { "epoch": 0.8208423966778723, "step": 8302, "train/total_loss": 0.046875130385160446 }, { "entropy": 9.107100486755371, "epoch": 0.8209412695273878, "mean_token_accuracy": 0.6994134783744812, "num_tokens": 22407533.0, "step": 8303, "train/ce_loss": 5.614239171336521e-07 }, { "epoch": 0.8209412695273878, "step": 8303, "train/sim_loss": 0.03515625 }, { "epoch": 0.8209412695273878, "step": 8303, "train/total_loss": 0.03515630587935448 }, { "entropy": 8.753803253173828, "epoch": 0.8210401423769033, "mean_token_accuracy": 0.746185839176178, "num_tokens": 22412736.0, "step": 8304, "train/ce_loss": 0.6114625334739685 }, { "epoch": 0.8210401423769033, "step": 8304, "train/sim_loss": 0.05078125 }, { "epoch": 0.8210401423769033, "step": 8304, "train/total_loss": 0.11192750930786133 }, { "entropy": 8.770864486694336, "epoch": 0.8211390152264189, "mean_token_accuracy": 0.6758373379707336, "num_tokens": 22418028.0, "step": 8305, "train/ce_loss": 1.266835331916809 }, { "epoch": 0.8211390152264189, "step": 8305, "train/sim_loss": 0.0703125 }, { "epoch": 0.8211390152264189, "step": 8305, "train/total_loss": 0.1969960331916809 }, { "entropy": 9.309890747070312, "epoch": 0.8212378880759343, "mean_token_accuracy": 0.7563451528549194, "num_tokens": 22423032.0, "step": 8306, "train/ce_loss": 1.0493959188461304 }, { "epoch": 0.8212378880759343, "step": 8306, "train/sim_loss": 0.09375 }, { "epoch": 0.8212378880759343, "step": 8306, "train/total_loss": 0.19868959486484528 }, { "entropy": 9.099388122558594, "epoch": 0.8213367609254498, "mean_token_accuracy": 0.8033536672592163, "num_tokens": 22428139.0, "step": 8307, "train/ce_loss": 0.6932326555252075 }, { "epoch": 0.8213367609254498, "step": 8307, "train/sim_loss": 0.015625 }, { "epoch": 0.8213367609254498, "step": 8307, "train/total_loss": 0.08494826406240463 }, { "entropy": 8.80750846862793, "epoch": 0.8214356337749654, "mean_token_accuracy": 0.8082840442657471, "num_tokens": 22433438.0, "step": 8308, "train/ce_loss": 0.5369587540626526 }, { "epoch": 0.8214356337749654, "step": 8308, "train/sim_loss": 0.03125 }, { "epoch": 0.8214356337749654, "step": 8308, "train/total_loss": 0.08494587242603302 }, { "entropy": 8.695101737976074, "epoch": 0.8215345066244809, "mean_token_accuracy": 0.7209302186965942, "num_tokens": 22438765.0, "step": 8309, "train/ce_loss": 1.1398298740386963 }, { "epoch": 0.8215345066244809, "step": 8309, "train/sim_loss": 0.06640625 }, { "epoch": 0.8215345066244809, "step": 8309, "train/total_loss": 0.18038924038410187 }, { "entropy": 9.937353134155273, "epoch": 0.8216333794739964, "mean_token_accuracy": 0.8384279608726501, "num_tokens": 22443405.0, "step": 8310, "train/ce_loss": 1.3805160961055662e-06 }, { "epoch": 0.8216333794739964, "step": 8310, "train/sim_loss": 0.01953125 }, { "epoch": 0.8216333794739964, "step": 8310, "train/total_loss": 0.019531387835741043 }, { "entropy": 9.165075302124023, "epoch": 0.821732252323512, "mean_token_accuracy": 0.7601476311683655, "num_tokens": 22448388.0, "step": 8311, "train/ce_loss": 0.6348003149032593 }, { "epoch": 0.821732252323512, "step": 8311, "train/sim_loss": 0.02734375 }, { "epoch": 0.821732252323512, "step": 8311, "train/total_loss": 0.09082378447055817 }, { "entropy": 9.357953071594238, "epoch": 0.8218311251730275, "mean_token_accuracy": 0.7235293984413147, "num_tokens": 22453288.0, "step": 8312, "train/ce_loss": 1.2712736129760742 }, { "epoch": 0.8218311251730275, "step": 8312, "train/sim_loss": 0.0390625 }, { "epoch": 0.8218311251730275, "step": 8312, "train/total_loss": 0.16618986427783966 }, { "entropy": 8.546711921691895, "epoch": 0.821929998022543, "mean_token_accuracy": 0.7825160026550293, "num_tokens": 22458686.0, "step": 8313, "train/ce_loss": 0.5331897735595703 }, { "epoch": 0.821929998022543, "step": 8313, "train/sim_loss": 0.015625 }, { "epoch": 0.821929998022543, "step": 8313, "train/total_loss": 0.06894397735595703 }, { "entropy": 8.55560302734375, "epoch": 0.8220288708720586, "mean_token_accuracy": 0.7384615540504456, "num_tokens": 22464101.0, "step": 8314, "train/ce_loss": 0.44517233967781067 }, { "epoch": 0.8220288708720586, "step": 8314, "train/sim_loss": 0.02734375 }, { "epoch": 0.8220288708720586, "step": 8314, "train/total_loss": 0.07186098396778107 }, { "entropy": 8.67685317993164, "epoch": 0.822127743721574, "mean_token_accuracy": 0.7303225994110107, "num_tokens": 22469378.0, "step": 8315, "train/ce_loss": 0.9484623670578003 }, { "epoch": 0.822127743721574, "step": 8315, "train/sim_loss": 0.0625 }, { "epoch": 0.822127743721574, "step": 8315, "train/total_loss": 0.15734624862670898 }, { "entropy": 8.476226806640625, "epoch": 0.8222266165710895, "mean_token_accuracy": 0.7451550364494324, "num_tokens": 22474939.0, "step": 8316, "train/ce_loss": 1.000063419342041 }, { "epoch": 0.8222266165710895, "step": 8316, "train/sim_loss": 0.046875 }, { "epoch": 0.8222266165710895, "step": 8316, "train/total_loss": 0.1468813419342041 }, { "entropy": 9.247249603271484, "epoch": 0.8223254894206051, "mean_token_accuracy": 0.6802217960357666, "num_tokens": 22479948.0, "step": 8317, "train/ce_loss": 1.2007057666778564 }, { "epoch": 0.8223254894206051, "step": 8317, "train/sim_loss": 0.09765625 }, { "epoch": 0.8223254894206051, "step": 8317, "train/total_loss": 0.21772682666778564 }, { "entropy": 9.541007995605469, "epoch": 0.8224243622701206, "mean_token_accuracy": 0.6674311757087708, "num_tokens": 22484793.0, "step": 8318, "train/ce_loss": 3.321171561765368e-06 }, { "epoch": 0.8224243622701206, "step": 8318, "train/sim_loss": 0.0234375 }, { "epoch": 0.8224243622701206, "step": 8318, "train/total_loss": 0.023437831550836563 }, { "entropy": 8.837417602539062, "epoch": 0.8225232351196361, "mean_token_accuracy": 0.6903954744338989, "num_tokens": 22490131.0, "step": 8319, "train/ce_loss": 1.2030870914459229 }, { "epoch": 0.8225232351196361, "step": 8319, "train/sim_loss": 0.0703125 }, { "epoch": 0.8225232351196361, "step": 8319, "train/total_loss": 0.19062121212482452 }, { "epoch": 0.8226221079691517, "grad_norm": 0.7055894732475281, "learning_rate": 7.945655936310143e-06, "loss": 0.1319, "step": 8320 }, { "entropy": 8.6664400100708, "epoch": 0.8226221079691517, "mean_token_accuracy": 0.7209821343421936, "num_tokens": 22495530.0, "step": 8320, "train/ce_loss": 1.1936757564544678 }, { "epoch": 0.8226221079691517, "step": 8320, "train/sim_loss": 0.0546875 }, { "epoch": 0.8226221079691517, "step": 8320, "train/total_loss": 0.1740550696849823 }, { "entropy": 9.077009201049805, "epoch": 0.8227209808186672, "mean_token_accuracy": 0.7269230484962463, "num_tokens": 22500516.0, "step": 8321, "train/ce_loss": 0.8068293929100037 }, { "epoch": 0.8227209808186672, "step": 8321, "train/sim_loss": 0.0390625 }, { "epoch": 0.8227209808186672, "step": 8321, "train/total_loss": 0.11974544078111649 }, { "entropy": 8.769912719726562, "epoch": 0.8228198536681827, "mean_token_accuracy": 0.6963350772857666, "num_tokens": 22505729.0, "step": 8322, "train/ce_loss": 8.714407158549875e-06 }, { "epoch": 0.8228198536681827, "step": 8322, "train/sim_loss": 0.02734375 }, { "epoch": 0.8228198536681827, "step": 8322, "train/total_loss": 0.02734462171792984 }, { "entropy": 9.142882347106934, "epoch": 0.8229187265176983, "mean_token_accuracy": 0.6964285969734192, "num_tokens": 22510860.0, "step": 8323, "train/ce_loss": 0.9637071490287781 }, { "epoch": 0.8229187265176983, "step": 8323, "train/sim_loss": 0.0859375 }, { "epoch": 0.8229187265176983, "step": 8323, "train/total_loss": 0.18230822682380676 }, { "entropy": 9.229522705078125, "epoch": 0.8230175993672137, "mean_token_accuracy": 0.7593167424201965, "num_tokens": 22515897.0, "step": 8324, "train/ce_loss": 5.397641871240921e-07 }, { "epoch": 0.8230175993672137, "step": 8324, "train/sim_loss": 0.01171875 }, { "epoch": 0.8230175993672137, "step": 8324, "train/total_loss": 0.011718804016709328 }, { "entropy": 8.921040534973145, "epoch": 0.8231164722167292, "mean_token_accuracy": 0.7547847032546997, "num_tokens": 22521199.0, "step": 8325, "train/ce_loss": 0.5268176794052124 }, { "epoch": 0.8231164722167292, "step": 8325, "train/sim_loss": 0.0390625 }, { "epoch": 0.8231164722167292, "step": 8325, "train/total_loss": 0.09174427390098572 }, { "entropy": 9.184289932250977, "epoch": 0.8232153450662448, "mean_token_accuracy": 0.7299703359603882, "num_tokens": 22526335.0, "step": 8326, "train/ce_loss": 0.8263388872146606 }, { "epoch": 0.8232153450662448, "step": 8326, "train/sim_loss": 0.08203125 }, { "epoch": 0.8232153450662448, "step": 8326, "train/total_loss": 0.1646651327610016 }, { "entropy": 8.599185943603516, "epoch": 0.8233142179157603, "mean_token_accuracy": 0.7813725471496582, "num_tokens": 22531824.0, "step": 8327, "train/ce_loss": 0.7705698609352112 }, { "epoch": 0.8233142179157603, "step": 8327, "train/sim_loss": 0.01953125 }, { "epoch": 0.8233142179157603, "step": 8327, "train/total_loss": 0.09658823907375336 }, { "entropy": 9.578577995300293, "epoch": 0.8234130907652758, "mean_token_accuracy": 0.7731958627700806, "num_tokens": 22536644.0, "step": 8328, "train/ce_loss": 1.8175444438384147e-06 }, { "epoch": 0.8234130907652758, "step": 8328, "train/sim_loss": 0.04296875 }, { "epoch": 0.8234130907652758, "step": 8328, "train/total_loss": 0.042968932539224625 }, { "entropy": 8.849867820739746, "epoch": 0.8235119636147914, "mean_token_accuracy": 0.7250280380249023, "num_tokens": 22541954.0, "step": 8329, "train/ce_loss": 1.332899808883667 }, { "epoch": 0.8235119636147914, "step": 8329, "train/sim_loss": 0.02734375 }, { "epoch": 0.8235119636147914, "step": 8329, "train/total_loss": 0.16063372790813446 }, { "entropy": 8.916130065917969, "epoch": 0.8236108364643069, "mean_token_accuracy": 0.7496318221092224, "num_tokens": 22547085.0, "step": 8330, "train/ce_loss": 0.9665127992630005 }, { "epoch": 0.8236108364643069, "step": 8330, "train/sim_loss": 0.05078125 }, { "epoch": 0.8236108364643069, "step": 8330, "train/total_loss": 0.14743253588676453 }, { "entropy": 8.864173889160156, "epoch": 0.8237097093138224, "mean_token_accuracy": 0.7624861001968384, "num_tokens": 22552479.0, "step": 8331, "train/ce_loss": 0.5066568851470947 }, { "epoch": 0.8237097093138224, "step": 8331, "train/sim_loss": 0.0859375 }, { "epoch": 0.8237097093138224, "step": 8331, "train/total_loss": 0.1366031914949417 }, { "entropy": 8.682044982910156, "epoch": 0.823808582163338, "mean_token_accuracy": 0.7339832782745361, "num_tokens": 22557646.0, "step": 8332, "train/ce_loss": 0.9429482817649841 }, { "epoch": 0.823808582163338, "step": 8332, "train/sim_loss": 0.109375 }, { "epoch": 0.823808582163338, "step": 8332, "train/total_loss": 0.20366983115673065 }, { "entropy": 8.33643627166748, "epoch": 0.8239074550128535, "mean_token_accuracy": 0.7113022208213806, "num_tokens": 22562989.0, "step": 8333, "train/ce_loss": 1.1955353021621704 }, { "epoch": 0.8239074550128535, "step": 8333, "train/sim_loss": 0.09375 }, { "epoch": 0.8239074550128535, "step": 8333, "train/total_loss": 0.21330353617668152 }, { "entropy": 9.05999755859375, "epoch": 0.824006327862369, "mean_token_accuracy": 0.7610872387886047, "num_tokens": 22568175.0, "step": 8334, "train/ce_loss": 4.788768706021074e-07 }, { "epoch": 0.824006327862369, "step": 8334, "train/sim_loss": 0.01171875 }, { "epoch": 0.824006327862369, "step": 8334, "train/total_loss": 0.011718797497451305 }, { "entropy": 8.767662048339844, "epoch": 0.8241052007118845, "mean_token_accuracy": 0.7311828136444092, "num_tokens": 22573392.0, "step": 8335, "train/ce_loss": 0.5941759347915649 }, { "epoch": 0.8241052007118845, "step": 8335, "train/sim_loss": 0.0546875 }, { "epoch": 0.8241052007118845, "step": 8335, "train/total_loss": 0.11410509049892426 }, { "entropy": 8.826053619384766, "epoch": 0.8242040735614, "mean_token_accuracy": 0.7337662577629089, "num_tokens": 22578487.0, "step": 8336, "train/ce_loss": 4.6122145249682944e-06 }, { "epoch": 0.8242040735614, "step": 8336, "train/sim_loss": 0.03515625 }, { "epoch": 0.8242040735614, "step": 8336, "train/total_loss": 0.03515671193599701 }, { "entropy": 9.424985885620117, "epoch": 0.8243029464109156, "mean_token_accuracy": 0.7524752616882324, "num_tokens": 22583428.0, "step": 8337, "train/ce_loss": 0.8262380957603455 }, { "epoch": 0.8243029464109156, "step": 8337, "train/sim_loss": 0.0546875 }, { "epoch": 0.8243029464109156, "step": 8337, "train/total_loss": 0.13731130957603455 }, { "entropy": 9.327249526977539, "epoch": 0.8244018192604311, "mean_token_accuracy": 0.7720306515693665, "num_tokens": 22588375.0, "step": 8338, "train/ce_loss": 0.8780062198638916 }, { "epoch": 0.8244018192604311, "step": 8338, "train/sim_loss": 0.0390625 }, { "epoch": 0.8244018192604311, "step": 8338, "train/total_loss": 0.12686312198638916 }, { "entropy": 8.657686233520508, "epoch": 0.8245006921099466, "mean_token_accuracy": 0.7360405921936035, "num_tokens": 22593823.0, "step": 8339, "train/ce_loss": 1.0873697996139526 }, { "epoch": 0.8245006921099466, "step": 8339, "train/sim_loss": 0.05078125 }, { "epoch": 0.8245006921099466, "step": 8339, "train/total_loss": 0.15951824188232422 }, { "epoch": 0.8245995649594622, "grad_norm": 0.573781430721283, "learning_rate": 7.940711071552194e-06, "loss": 0.1365, "step": 8340 }, { "entropy": 8.919189453125, "epoch": 0.8245995649594622, "mean_token_accuracy": 0.6984318494796753, "num_tokens": 22599110.0, "step": 8340, "train/ce_loss": 1.0977706909179688 }, { "epoch": 0.8245995649594622, "step": 8340, "train/sim_loss": 0.03515625 }, { "epoch": 0.8245995649594622, "step": 8340, "train/total_loss": 0.1449333131313324 }, { "entropy": 8.751079559326172, "epoch": 0.8246984378089777, "mean_token_accuracy": 0.6935867071151733, "num_tokens": 22604477.0, "step": 8341, "train/ce_loss": 1.095212459564209 }, { "epoch": 0.8246984378089777, "step": 8341, "train/sim_loss": 0.05859375 }, { "epoch": 0.8246984378089777, "step": 8341, "train/total_loss": 0.16811498999595642 }, { "entropy": 8.826139450073242, "epoch": 0.8247973106584932, "mean_token_accuracy": 0.7035236954689026, "num_tokens": 22609834.0, "step": 8342, "train/ce_loss": 1.364446759223938 }, { "epoch": 0.8247973106584932, "step": 8342, "train/sim_loss": 0.10546875 }, { "epoch": 0.8247973106584932, "step": 8342, "train/total_loss": 0.24191342294216156 }, { "entropy": 9.234088897705078, "epoch": 0.8248961835080088, "mean_token_accuracy": 0.7575277090072632, "num_tokens": 22614883.0, "step": 8343, "train/ce_loss": 1.440025806427002 }, { "epoch": 0.8248961835080088, "step": 8343, "train/sim_loss": 0.0234375 }, { "epoch": 0.8248961835080088, "step": 8343, "train/total_loss": 0.16744008660316467 }, { "entropy": 9.1658935546875, "epoch": 0.8249950563575242, "mean_token_accuracy": 0.743813693523407, "num_tokens": 22620039.0, "step": 8344, "train/ce_loss": 1.5082443952560425 }, { "epoch": 0.8249950563575242, "step": 8344, "train/sim_loss": 0.046875 }, { "epoch": 0.8249950563575242, "step": 8344, "train/total_loss": 0.1976994425058365 }, { "entropy": 9.100385665893555, "epoch": 0.8250939292070397, "mean_token_accuracy": 0.761689305305481, "num_tokens": 22625123.0, "step": 8345, "train/ce_loss": 8.795655048743356e-07 }, { "epoch": 0.8250939292070397, "step": 8345, "train/sim_loss": 0.05859375 }, { "epoch": 0.8250939292070397, "step": 8345, "train/total_loss": 0.05859383940696716 }, { "entropy": 9.031524658203125, "epoch": 0.8251928020565553, "mean_token_accuracy": 0.7572078704833984, "num_tokens": 22630206.0, "step": 8346, "train/ce_loss": 0.6743995547294617 }, { "epoch": 0.8251928020565553, "step": 8346, "train/sim_loss": 0.02734375 }, { "epoch": 0.8251928020565553, "step": 8346, "train/total_loss": 0.0947837084531784 }, { "entropy": 8.112039566040039, "epoch": 0.8252916749060708, "mean_token_accuracy": 0.7008771896362305, "num_tokens": 22635773.0, "step": 8347, "train/ce_loss": 1.1441271305084229 }, { "epoch": 0.8252916749060708, "step": 8347, "train/sim_loss": 0.0859375 }, { "epoch": 0.8252916749060708, "step": 8347, "train/total_loss": 0.20035022497177124 }, { "entropy": 8.473743438720703, "epoch": 0.8253905477555863, "mean_token_accuracy": 0.7245370149612427, "num_tokens": 22641116.0, "step": 8348, "train/ce_loss": 0.8390929698944092 }, { "epoch": 0.8253905477555863, "step": 8348, "train/sim_loss": 0.0703125 }, { "epoch": 0.8253905477555863, "step": 8348, "train/total_loss": 0.1542218029499054 }, { "entropy": 8.437721252441406, "epoch": 0.8254894206051019, "mean_token_accuracy": 0.725806474685669, "num_tokens": 22646642.0, "step": 8349, "train/ce_loss": 0.5069792866706848 }, { "epoch": 0.8254894206051019, "step": 8349, "train/sim_loss": 0.05078125 }, { "epoch": 0.8254894206051019, "step": 8349, "train/total_loss": 0.1014791801571846 }, { "entropy": 8.793678283691406, "epoch": 0.8255882934546174, "mean_token_accuracy": 0.7767969965934753, "num_tokens": 22651962.0, "step": 8350, "train/ce_loss": 0.43854376673698425 }, { "epoch": 0.8255882934546174, "step": 8350, "train/sim_loss": 0.05078125 }, { "epoch": 0.8255882934546174, "step": 8350, "train/total_loss": 0.09463562816381454 }, { "entropy": 8.788917541503906, "epoch": 0.8256871663041329, "mean_token_accuracy": 0.7319728136062622, "num_tokens": 22657170.0, "step": 8351, "train/ce_loss": 0.5017361640930176 }, { "epoch": 0.8256871663041329, "step": 8351, "train/sim_loss": 0.04296875 }, { "epoch": 0.8256871663041329, "step": 8351, "train/total_loss": 0.09314236789941788 }, { "entropy": 8.618673324584961, "epoch": 0.8257860391536485, "mean_token_accuracy": 0.6952965259552002, "num_tokens": 22662603.0, "step": 8352, "train/ce_loss": 1.5782723426818848 }, { "epoch": 0.8257860391536485, "step": 8352, "train/sim_loss": 0.078125 }, { "epoch": 0.8257860391536485, "step": 8352, "train/total_loss": 0.2359522432088852 }, { "entropy": 8.515067100524902, "epoch": 0.8258849120031639, "mean_token_accuracy": 0.7948718070983887, "num_tokens": 22667913.0, "step": 8353, "train/ce_loss": 0.9898000359535217 }, { "epoch": 0.8258849120031639, "step": 8353, "train/sim_loss": 0.046875 }, { "epoch": 0.8258849120031639, "step": 8353, "train/total_loss": 0.14585500955581665 }, { "entropy": 8.51097297668457, "epoch": 0.8259837848526794, "mean_token_accuracy": 0.7309874892234802, "num_tokens": 22673272.0, "step": 8354, "train/ce_loss": 0.6297245621681213 }, { "epoch": 0.8259837848526794, "step": 8354, "train/sim_loss": 0.03125 }, { "epoch": 0.8259837848526794, "step": 8354, "train/total_loss": 0.09422245621681213 }, { "entropy": 8.786314010620117, "epoch": 0.826082657702195, "mean_token_accuracy": 0.6924968957901001, "num_tokens": 22678561.0, "step": 8355, "train/ce_loss": 0.7038342952728271 }, { "epoch": 0.826082657702195, "step": 8355, "train/sim_loss": 0.03515625 }, { "epoch": 0.826082657702195, "step": 8355, "train/total_loss": 0.10553967952728271 }, { "entropy": 9.134121894836426, "epoch": 0.8261815305517105, "mean_token_accuracy": 0.8154860138893127, "num_tokens": 22683557.0, "step": 8356, "train/ce_loss": 5.090377044325578e-07 }, { "epoch": 0.8261815305517105, "step": 8356, "train/sim_loss": 0.046875 }, { "epoch": 0.8261815305517105, "step": 8356, "train/total_loss": 0.04687505215406418 }, { "entropy": 8.495359420776367, "epoch": 0.826280403401226, "mean_token_accuracy": 0.6715686321258545, "num_tokens": 22689015.0, "step": 8357, "train/ce_loss": 1.5298234224319458 }, { "epoch": 0.826280403401226, "step": 8357, "train/sim_loss": 0.0390625 }, { "epoch": 0.826280403401226, "step": 8357, "train/total_loss": 0.19204483926296234 }, { "entropy": 8.934258460998535, "epoch": 0.8263792762507416, "mean_token_accuracy": 0.75, "num_tokens": 22694265.0, "step": 8358, "train/ce_loss": 0.9743590950965881 }, { "epoch": 0.8263792762507416, "step": 8358, "train/sim_loss": 0.078125 }, { "epoch": 0.8263792762507416, "step": 8358, "train/total_loss": 0.17556092143058777 }, { "entropy": 9.121557235717773, "epoch": 0.8264781491002571, "mean_token_accuracy": 0.680672287940979, "num_tokens": 22699467.0, "step": 8359, "train/ce_loss": 0.8887518048286438 }, { "epoch": 0.8264781491002571, "step": 8359, "train/sim_loss": 0.046875 }, { "epoch": 0.8264781491002571, "step": 8359, "train/total_loss": 0.1357501745223999 }, { "epoch": 0.8265770219497726, "grad_norm": 0.7569577693939209, "learning_rate": 7.935766206794246e-06, "loss": 0.1418, "step": 8360 }, { "entropy": 8.900482177734375, "epoch": 0.8265770219497726, "mean_token_accuracy": 0.7642276287078857, "num_tokens": 22704663.0, "step": 8360, "train/ce_loss": 0.5398333072662354 }, { "epoch": 0.8265770219497726, "step": 8360, "train/sim_loss": 0.0625 }, { "epoch": 0.8265770219497726, "step": 8360, "train/total_loss": 0.11648333072662354 }, { "entropy": 9.118875503540039, "epoch": 0.8266758947992882, "mean_token_accuracy": 0.6995153427124023, "num_tokens": 22709685.0, "step": 8361, "train/ce_loss": 1.1485044524306431e-06 }, { "epoch": 0.8266758947992882, "step": 8361, "train/sim_loss": 0.03515625 }, { "epoch": 0.8266758947992882, "step": 8361, "train/total_loss": 0.03515636548399925 }, { "entropy": 8.710692405700684, "epoch": 0.8267747676488036, "mean_token_accuracy": 0.7369020581245422, "num_tokens": 22715058.0, "step": 8362, "train/ce_loss": 0.6468256711959839 }, { "epoch": 0.8267747676488036, "step": 8362, "train/sim_loss": 0.10546875 }, { "epoch": 0.8267747676488036, "step": 8362, "train/total_loss": 0.17015132308006287 }, { "entropy": 9.265089988708496, "epoch": 0.8268736404983191, "mean_token_accuracy": 0.7182866334915161, "num_tokens": 22720280.0, "step": 8363, "train/ce_loss": 1.0972142219543457 }, { "epoch": 0.8268736404983191, "step": 8363, "train/sim_loss": 0.0703125 }, { "epoch": 0.8268736404983191, "step": 8363, "train/total_loss": 0.18003392219543457 }, { "entropy": 8.912721633911133, "epoch": 0.8269725133478347, "mean_token_accuracy": 0.7468531727790833, "num_tokens": 22725493.0, "step": 8364, "train/ce_loss": 1.1656962897177436e-06 }, { "epoch": 0.8269725133478347, "step": 8364, "train/sim_loss": 0.05859375 }, { "epoch": 0.8269725133478347, "step": 8364, "train/total_loss": 0.05859386548399925 }, { "entropy": 9.387989044189453, "epoch": 0.8270713861973502, "mean_token_accuracy": 0.8360071182250977, "num_tokens": 22730463.0, "step": 8365, "train/ce_loss": 0.6007579565048218 }, { "epoch": 0.8270713861973502, "step": 8365, "train/sim_loss": 0.0859375 }, { "epoch": 0.8270713861973502, "step": 8365, "train/total_loss": 0.1460132896900177 }, { "entropy": 8.675314903259277, "epoch": 0.8271702590468657, "mean_token_accuracy": 0.7235772609710693, "num_tokens": 22735762.0, "step": 8366, "train/ce_loss": 0.8701668977737427 }, { "epoch": 0.8271702590468657, "step": 8366, "train/sim_loss": 0.11328125 }, { "epoch": 0.8271702590468657, "step": 8366, "train/total_loss": 0.20029795169830322 }, { "entropy": 9.281265258789062, "epoch": 0.8272691318963813, "mean_token_accuracy": 0.7655601501464844, "num_tokens": 22740697.0, "step": 8367, "train/ce_loss": 0.726944625377655 }, { "epoch": 0.8272691318963813, "step": 8367, "train/sim_loss": 0.05078125 }, { "epoch": 0.8272691318963813, "step": 8367, "train/total_loss": 0.12347571551799774 }, { "entropy": 9.316499710083008, "epoch": 0.8273680047458968, "mean_token_accuracy": 0.7140411138534546, "num_tokens": 22745762.0, "step": 8368, "train/ce_loss": 1.2663293773584883e-06 }, { "epoch": 0.8273680047458968, "step": 8368, "train/sim_loss": 0.04296875 }, { "epoch": 0.8273680047458968, "step": 8368, "train/total_loss": 0.04296887665987015 }, { "entropy": 8.754581451416016, "epoch": 0.8274668775954123, "mean_token_accuracy": 0.7535884976387024, "num_tokens": 22751003.0, "step": 8369, "train/ce_loss": 1.0439002513885498 }, { "epoch": 0.8274668775954123, "step": 8369, "train/sim_loss": 0.03515625 }, { "epoch": 0.8274668775954123, "step": 8369, "train/total_loss": 0.13954627513885498 }, { "entropy": 9.088933944702148, "epoch": 0.8275657504449279, "mean_token_accuracy": 0.745814323425293, "num_tokens": 22756149.0, "step": 8370, "train/ce_loss": 0.6570780873298645 }, { "epoch": 0.8275657504449279, "step": 8370, "train/sim_loss": 0.0859375 }, { "epoch": 0.8275657504449279, "step": 8370, "train/total_loss": 0.15164530277252197 }, { "entropy": 8.768074035644531, "epoch": 0.8276646232944433, "mean_token_accuracy": 0.7833982110023499, "num_tokens": 22761399.0, "step": 8371, "train/ce_loss": 0.6726462244987488 }, { "epoch": 0.8276646232944433, "step": 8371, "train/sim_loss": 0.02734375 }, { "epoch": 0.8276646232944433, "step": 8371, "train/total_loss": 0.094608373939991 }, { "entropy": 8.775765419006348, "epoch": 0.8277634961439588, "mean_token_accuracy": 0.7537227869033813, "num_tokens": 22766702.0, "step": 8372, "train/ce_loss": 0.612133800983429 }, { "epoch": 0.8277634961439588, "step": 8372, "train/sim_loss": 0.0234375 }, { "epoch": 0.8277634961439588, "step": 8372, "train/total_loss": 0.08465088158845901 }, { "entropy": 9.579498291015625, "epoch": 0.8278623689934744, "mean_token_accuracy": 0.6849710941314697, "num_tokens": 22771445.0, "step": 8373, "train/ce_loss": 1.8132412433624268 }, { "epoch": 0.8278623689934744, "step": 8373, "train/sim_loss": 0.05859375 }, { "epoch": 0.8278623689934744, "step": 8373, "train/total_loss": 0.23991787433624268 }, { "entropy": 8.698003768920898, "epoch": 0.8279612418429899, "mean_token_accuracy": 0.7521276473999023, "num_tokens": 22776877.0, "step": 8374, "train/ce_loss": 0.5366253852844238 }, { "epoch": 0.8279612418429899, "step": 8374, "train/sim_loss": 0.0546875 }, { "epoch": 0.8279612418429899, "step": 8374, "train/total_loss": 0.10835003852844238 }, { "entropy": 9.23279857635498, "epoch": 0.8280601146925054, "mean_token_accuracy": 0.7651376128196716, "num_tokens": 22781854.0, "step": 8375, "train/ce_loss": 0.7307769656181335 }, { "epoch": 0.8280601146925054, "step": 8375, "train/sim_loss": 0.0390625 }, { "epoch": 0.8280601146925054, "step": 8375, "train/total_loss": 0.11214020103216171 }, { "entropy": 8.741460800170898, "epoch": 0.828158987542021, "mean_token_accuracy": 0.7210884094238281, "num_tokens": 22787014.0, "step": 8376, "train/ce_loss": 0.9084756374359131 }, { "epoch": 0.828158987542021, "step": 8376, "train/sim_loss": 0.05859375 }, { "epoch": 0.828158987542021, "step": 8376, "train/total_loss": 0.14944131672382355 }, { "entropy": 8.50411605834961, "epoch": 0.8282578603915365, "mean_token_accuracy": 0.7203065156936646, "num_tokens": 22792294.0, "step": 8377, "train/ce_loss": 1.2492197751998901 }, { "epoch": 0.8282578603915365, "step": 8377, "train/sim_loss": 0.0859375 }, { "epoch": 0.8282578603915365, "step": 8377, "train/total_loss": 0.210859477519989 }, { "entropy": 8.712186813354492, "epoch": 0.828356733241052, "mean_token_accuracy": 0.7301587462425232, "num_tokens": 22797685.0, "step": 8378, "train/ce_loss": 0.7416990995407104 }, { "epoch": 0.828356733241052, "step": 8378, "train/sim_loss": 0.05078125 }, { "epoch": 0.828356733241052, "step": 8378, "train/total_loss": 0.12495116144418716 }, { "entropy": 8.593022346496582, "epoch": 0.8284556060905676, "mean_token_accuracy": 0.7089552283287048, "num_tokens": 22802988.0, "step": 8379, "train/ce_loss": 0.6530058979988098 }, { "epoch": 0.8284556060905676, "step": 8379, "train/sim_loss": 0.01953125 }, { "epoch": 0.8284556060905676, "step": 8379, "train/total_loss": 0.0848318412899971 }, { "epoch": 0.828554478940083, "grad_norm": 0.8061201572418213, "learning_rate": 7.930821342036296e-06, "loss": 0.1408, "step": 8380 }, { "entropy": 8.493386268615723, "epoch": 0.828554478940083, "mean_token_accuracy": 0.7535545229911804, "num_tokens": 22808315.0, "step": 8380, "train/ce_loss": 3.5311497867951402e-06 }, { "epoch": 0.828554478940083, "step": 8380, "train/sim_loss": 0.03515625 }, { "epoch": 0.828554478940083, "step": 8380, "train/total_loss": 0.035156603902578354 }, { "entropy": 8.784348487854004, "epoch": 0.8286533517895985, "mean_token_accuracy": 0.674054741859436, "num_tokens": 22813511.0, "step": 8381, "train/ce_loss": 1.5281237363815308 }, { "epoch": 0.8286533517895985, "step": 8381, "train/sim_loss": 0.09375 }, { "epoch": 0.8286533517895985, "step": 8381, "train/total_loss": 0.24656237661838531 }, { "entropy": 8.662527084350586, "epoch": 0.8287522246391141, "mean_token_accuracy": 0.7210144996643066, "num_tokens": 22818804.0, "step": 8382, "train/ce_loss": 0.9179139137268066 }, { "epoch": 0.8287522246391141, "step": 8382, "train/sim_loss": 0.06640625 }, { "epoch": 0.8287522246391141, "step": 8382, "train/total_loss": 0.15819764137268066 }, { "entropy": 9.229565620422363, "epoch": 0.8288510974886296, "mean_token_accuracy": 0.7356687784194946, "num_tokens": 22823921.0, "step": 8383, "train/ce_loss": 0.8467808365821838 }, { "epoch": 0.8288510974886296, "step": 8383, "train/sim_loss": 0.0625 }, { "epoch": 0.8288510974886296, "step": 8383, "train/total_loss": 0.14717808365821838 }, { "entropy": 9.11532974243164, "epoch": 0.8289499703381451, "mean_token_accuracy": 0.7118353247642517, "num_tokens": 22828969.0, "step": 8384, "train/ce_loss": 3.157426118850708 }, { "epoch": 0.8289499703381451, "step": 8384, "train/sim_loss": 0.0546875 }, { "epoch": 0.8289499703381451, "step": 8384, "train/total_loss": 0.3704301118850708 }, { "entropy": 8.697054862976074, "epoch": 0.8290488431876607, "mean_token_accuracy": 0.736540675163269, "num_tokens": 22834322.0, "step": 8385, "train/ce_loss": 0.3840124309062958 }, { "epoch": 0.8290488431876607, "step": 8385, "train/sim_loss": 0.04296875 }, { "epoch": 0.8290488431876607, "step": 8385, "train/total_loss": 0.08136999607086182 }, { "entropy": 8.476293563842773, "epoch": 0.8291477160371762, "mean_token_accuracy": 0.7830578684806824, "num_tokens": 22839780.0, "step": 8386, "train/ce_loss": 0.36207708716392517 }, { "epoch": 0.8291477160371762, "step": 8386, "train/sim_loss": 0.015625 }, { "epoch": 0.8291477160371762, "step": 8386, "train/total_loss": 0.05183270946145058 }, { "entropy": 8.90697956085205, "epoch": 0.8292465888866917, "mean_token_accuracy": 0.705633819103241, "num_tokens": 22844896.0, "step": 8387, "train/ce_loss": 1.3907127380371094 }, { "epoch": 0.8292465888866917, "step": 8387, "train/sim_loss": 0.0546875 }, { "epoch": 0.8292465888866917, "step": 8387, "train/total_loss": 0.1937587708234787 }, { "entropy": 9.319096565246582, "epoch": 0.8293454617362073, "mean_token_accuracy": 0.69140625, "num_tokens": 22849833.0, "step": 8388, "train/ce_loss": 7.63372918299865e-06 }, { "epoch": 0.8293454617362073, "step": 8388, "train/sim_loss": 0.015625 }, { "epoch": 0.8293454617362073, "step": 8388, "train/total_loss": 0.015625763684511185 }, { "entropy": 9.05936050415039, "epoch": 0.8294443345857228, "mean_token_accuracy": 0.6970198750495911, "num_tokens": 22854904.0, "step": 8389, "train/ce_loss": 1.126064419746399 }, { "epoch": 0.8294443345857228, "step": 8389, "train/sim_loss": 0.0625 }, { "epoch": 0.8294443345857228, "step": 8389, "train/total_loss": 0.17510643601417542 }, { "entropy": 9.503225326538086, "epoch": 0.8295432074352382, "mean_token_accuracy": 0.7247706651687622, "num_tokens": 22859662.0, "step": 8390, "train/ce_loss": 1.6506280644534854e-06 }, { "epoch": 0.8295432074352382, "step": 8390, "train/sim_loss": 0.0390625 }, { "epoch": 0.8295432074352382, "step": 8390, "train/total_loss": 0.03906266391277313 }, { "entropy": 8.62103271484375, "epoch": 0.8296420802847538, "mean_token_accuracy": 0.7653429508209229, "num_tokens": 22864963.0, "step": 8391, "train/ce_loss": 0.7772435545921326 }, { "epoch": 0.8296420802847538, "step": 8391, "train/sim_loss": 0.05859375 }, { "epoch": 0.8296420802847538, "step": 8391, "train/total_loss": 0.1363181173801422 }, { "entropy": 8.943082809448242, "epoch": 0.8297409531342693, "mean_token_accuracy": 0.6905370950698853, "num_tokens": 22870227.0, "step": 8392, "train/ce_loss": 1.8764548301696777 }, { "epoch": 0.8297409531342693, "step": 8392, "train/sim_loss": 0.11328125 }, { "epoch": 0.8297409531342693, "step": 8392, "train/total_loss": 0.30092674493789673 }, { "entropy": 9.290210723876953, "epoch": 0.8298398259837848, "mean_token_accuracy": 0.7767857313156128, "num_tokens": 22875191.0, "step": 8393, "train/ce_loss": 1.4096912145614624 }, { "epoch": 0.8298398259837848, "step": 8393, "train/sim_loss": 0.05078125 }, { "epoch": 0.8298398259837848, "step": 8393, "train/total_loss": 0.19175037741661072 }, { "entropy": 9.422948837280273, "epoch": 0.8299386988333004, "mean_token_accuracy": 0.7759036421775818, "num_tokens": 22880006.0, "step": 8394, "train/ce_loss": 3.2801135603222065e-06 }, { "epoch": 0.8299386988333004, "step": 8394, "train/sim_loss": 0.01953125 }, { "epoch": 0.8299386988333004, "step": 8394, "train/total_loss": 0.019531577825546265 }, { "entropy": 8.709457397460938, "epoch": 0.8300375716828159, "mean_token_accuracy": 0.6770833134651184, "num_tokens": 22885445.0, "step": 8395, "train/ce_loss": 1.0227208137512207 }, { "epoch": 0.8300375716828159, "step": 8395, "train/sim_loss": 0.08203125 }, { "epoch": 0.8300375716828159, "step": 8395, "train/total_loss": 0.18430334329605103 }, { "entropy": 8.994638442993164, "epoch": 0.8301364445323314, "mean_token_accuracy": 0.7828842997550964, "num_tokens": 22890524.0, "step": 8396, "train/ce_loss": 1.1766117811203003 }, { "epoch": 0.8301364445323314, "step": 8396, "train/sim_loss": 0.0546875 }, { "epoch": 0.8301364445323314, "step": 8396, "train/total_loss": 0.17234867811203003 }, { "entropy": 8.95175552368164, "epoch": 0.830235317381847, "mean_token_accuracy": 0.7016011476516724, "num_tokens": 22895628.0, "step": 8397, "train/ce_loss": 1.218043565750122 }, { "epoch": 0.830235317381847, "step": 8397, "train/sim_loss": 0.0546875 }, { "epoch": 0.830235317381847, "step": 8397, "train/total_loss": 0.1764918565750122 }, { "entropy": 8.634076118469238, "epoch": 0.8303341902313625, "mean_token_accuracy": 0.7063106894493103, "num_tokens": 22900889.0, "step": 8398, "train/ce_loss": 0.709955096244812 }, { "epoch": 0.8303341902313625, "step": 8398, "train/sim_loss": 0.03515625 }, { "epoch": 0.8303341902313625, "step": 8398, "train/total_loss": 0.1061517596244812 }, { "entropy": 8.409835815429688, "epoch": 0.8304330630808779, "mean_token_accuracy": 0.7674418687820435, "num_tokens": 22906310.0, "step": 8399, "train/ce_loss": 0.748572826385498 }, { "epoch": 0.8304330630808779, "step": 8399, "train/sim_loss": 0.0625 }, { "epoch": 0.8304330630808779, "step": 8399, "train/total_loss": 0.13735729455947876 }, { "epoch": 0.8305319359303935, "grad_norm": 0.5774850845336914, "learning_rate": 7.925876477278347e-06, "loss": 0.1424, "step": 8400 }, { "entropy": 9.510812759399414, "epoch": 0.8305319359303935, "mean_token_accuracy": 0.8356807231903076, "num_tokens": 22911129.0, "step": 8400, "train/ce_loss": 1.0387249176346813e-06 }, { "epoch": 0.8305319359303935, "step": 8400, "train/sim_loss": 0.015625 }, { "epoch": 0.8305319359303935, "step": 8400, "train/total_loss": 0.015625104308128357 }, { "entropy": 8.4102783203125, "epoch": 0.830630808779909, "mean_token_accuracy": 0.7367829084396362, "num_tokens": 22916518.0, "step": 8401, "train/ce_loss": 0.835471510887146 }, { "epoch": 0.830630808779909, "step": 8401, "train/sim_loss": 0.06640625 }, { "epoch": 0.830630808779909, "step": 8401, "train/total_loss": 0.14995339512825012 }, { "entropy": 8.879125595092773, "epoch": 0.8307296816294245, "mean_token_accuracy": 0.7929373979568481, "num_tokens": 22921647.0, "step": 8402, "train/ce_loss": 0.6748191118240356 }, { "epoch": 0.8307296816294245, "step": 8402, "train/sim_loss": 0.0234375 }, { "epoch": 0.8307296816294245, "step": 8402, "train/total_loss": 0.09091941267251968 }, { "entropy": 9.066545486450195, "epoch": 0.8308285544789401, "mean_token_accuracy": 0.7420634627342224, "num_tokens": 22926624.0, "step": 8403, "train/ce_loss": 0.577014148235321 }, { "epoch": 0.8308285544789401, "step": 8403, "train/sim_loss": 0.03125 }, { "epoch": 0.8308285544789401, "step": 8403, "train/total_loss": 0.08895141631364822 }, { "entropy": 8.172971725463867, "epoch": 0.8309274273284556, "mean_token_accuracy": 0.7911571264266968, "num_tokens": 22932192.0, "step": 8404, "train/ce_loss": 1.1235606670379639 }, { "epoch": 0.8309274273284556, "step": 8404, "train/sim_loss": 0.07421875 }, { "epoch": 0.8309274273284556, "step": 8404, "train/total_loss": 0.1865748167037964 }, { "entropy": 8.688688278198242, "epoch": 0.8310263001779711, "mean_token_accuracy": 0.7402597665786743, "num_tokens": 22937426.0, "step": 8405, "train/ce_loss": 0.8759625554084778 }, { "epoch": 0.8310263001779711, "step": 8405, "train/sim_loss": 0.0234375 }, { "epoch": 0.8310263001779711, "step": 8405, "train/total_loss": 0.11103376001119614 }, { "entropy": 8.854475021362305, "epoch": 0.8311251730274867, "mean_token_accuracy": 0.7823033928871155, "num_tokens": 22942586.0, "step": 8406, "train/ce_loss": 0.6007594466209412 }, { "epoch": 0.8311251730274867, "step": 8406, "train/sim_loss": 0.05078125 }, { "epoch": 0.8311251730274867, "step": 8406, "train/total_loss": 0.11085719615221024 }, { "entropy": 8.476692199707031, "epoch": 0.8312240458770022, "mean_token_accuracy": 0.7699680328369141, "num_tokens": 22948003.0, "step": 8407, "train/ce_loss": 0.741219162940979 }, { "epoch": 0.8312240458770022, "step": 8407, "train/sim_loss": 0.015625 }, { "epoch": 0.8312240458770022, "step": 8407, "train/total_loss": 0.08974691480398178 }, { "entropy": 8.197213172912598, "epoch": 0.8313229187265176, "mean_token_accuracy": 0.7603121399879456, "num_tokens": 22953378.0, "step": 8408, "train/ce_loss": 0.6864122748374939 }, { "epoch": 0.8313229187265176, "step": 8408, "train/sim_loss": 0.04296875 }, { "epoch": 0.8313229187265176, "step": 8408, "train/total_loss": 0.11160998046398163 }, { "entropy": 8.923828125, "epoch": 0.8314217915760332, "mean_token_accuracy": 0.6744548082351685, "num_tokens": 22958449.0, "step": 8409, "train/ce_loss": 1.0419980753795244e-06 }, { "epoch": 0.8314217915760332, "step": 8409, "train/sim_loss": 0.04296875 }, { "epoch": 0.8314217915760332, "step": 8409, "train/total_loss": 0.04296885430812836 }, { "entropy": 9.024328231811523, "epoch": 0.8315206644255487, "mean_token_accuracy": 0.7577413320541382, "num_tokens": 22963474.0, "step": 8410, "train/ce_loss": 0.8398553729057312 }, { "epoch": 0.8315206644255487, "step": 8410, "train/sim_loss": 0.0234375 }, { "epoch": 0.8315206644255487, "step": 8410, "train/total_loss": 0.10742303729057312 }, { "entropy": 8.459671020507812, "epoch": 0.8316195372750642, "mean_token_accuracy": 0.6199377179145813, "num_tokens": 22968874.0, "step": 8411, "train/ce_loss": 0.92495197057724 }, { "epoch": 0.8316195372750642, "step": 8411, "train/sim_loss": 0.0625 }, { "epoch": 0.8316195372750642, "step": 8411, "train/total_loss": 0.15499520301818848 }, { "entropy": 8.697453498840332, "epoch": 0.8317184101245798, "mean_token_accuracy": 0.7323232293128967, "num_tokens": 22974116.0, "step": 8412, "train/ce_loss": 0.7704938650131226 }, { "epoch": 0.8317184101245798, "step": 8412, "train/sim_loss": 0.05078125 }, { "epoch": 0.8317184101245798, "step": 8412, "train/total_loss": 0.1278306394815445 }, { "entropy": 8.825996398925781, "epoch": 0.8318172829740953, "mean_token_accuracy": 0.7680251002311707, "num_tokens": 22979196.0, "step": 8413, "train/ce_loss": 1.0850144624710083 }, { "epoch": 0.8318172829740953, "step": 8413, "train/sim_loss": 0.0234375 }, { "epoch": 0.8318172829740953, "step": 8413, "train/total_loss": 0.13193894922733307 }, { "entropy": 8.536054611206055, "epoch": 0.8319161558236108, "mean_token_accuracy": 0.7847380638122559, "num_tokens": 22984495.0, "step": 8414, "train/ce_loss": 0.5865741968154907 }, { "epoch": 0.8319161558236108, "step": 8414, "train/sim_loss": 0.02734375 }, { "epoch": 0.8319161558236108, "step": 8414, "train/total_loss": 0.08600117266178131 }, { "entropy": 8.073843002319336, "epoch": 0.8320150286731264, "mean_token_accuracy": 0.741847813129425, "num_tokens": 22990053.0, "step": 8415, "train/ce_loss": 0.7983760833740234 }, { "epoch": 0.8320150286731264, "step": 8415, "train/sim_loss": 0.05078125 }, { "epoch": 0.8320150286731264, "step": 8415, "train/total_loss": 0.1306188702583313 }, { "entropy": 8.630202293395996, "epoch": 0.8321139015226419, "mean_token_accuracy": 0.7215189933776855, "num_tokens": 22995255.0, "step": 8416, "train/ce_loss": 0.43194296956062317 }, { "epoch": 0.8321139015226419, "step": 8416, "train/sim_loss": 0.0625 }, { "epoch": 0.8321139015226419, "step": 8416, "train/total_loss": 0.10569429397583008 }, { "entropy": 8.924360275268555, "epoch": 0.8322127743721575, "mean_token_accuracy": 0.7236024737358093, "num_tokens": 23000323.0, "step": 8417, "train/ce_loss": 1.4973204135894775 }, { "epoch": 0.8322127743721575, "step": 8417, "train/sim_loss": 0.06640625 }, { "epoch": 0.8322127743721575, "step": 8417, "train/total_loss": 0.21613828837871552 }, { "entropy": 8.600046157836914, "epoch": 0.832311647221673, "mean_token_accuracy": 0.7043596506118774, "num_tokens": 23005480.0, "step": 8418, "train/ce_loss": 0.9622127413749695 }, { "epoch": 0.832311647221673, "step": 8418, "train/sim_loss": 0.08203125 }, { "epoch": 0.832311647221673, "step": 8418, "train/total_loss": 0.17825251817703247 }, { "entropy": 8.23831558227539, "epoch": 0.8324105200711884, "mean_token_accuracy": 0.7659574747085571, "num_tokens": 23011068.0, "step": 8419, "train/ce_loss": 1.023498296737671 }, { "epoch": 0.8324105200711884, "step": 8419, "train/sim_loss": 0.03125 }, { "epoch": 0.8324105200711884, "step": 8419, "train/total_loss": 0.13359983265399933 }, { "epoch": 0.832509392920704, "grad_norm": 0.5148611664772034, "learning_rate": 7.920931612520399e-06, "loss": 0.1285, "step": 8420 }, { "entropy": 8.815807342529297, "epoch": 0.832509392920704, "mean_token_accuracy": 0.793795645236969, "num_tokens": 23016096.0, "step": 8420, "train/ce_loss": 0.9615424871444702 }, { "epoch": 0.832509392920704, "step": 8420, "train/sim_loss": 0.046875 }, { "epoch": 0.832509392920704, "step": 8420, "train/total_loss": 0.14302924275398254 }, { "entropy": 9.1303129196167, "epoch": 0.8326082657702195, "mean_token_accuracy": 0.7280858755111694, "num_tokens": 23021063.0, "step": 8421, "train/ce_loss": 0.7835027575492859 }, { "epoch": 0.8326082657702195, "step": 8421, "train/sim_loss": 0.0234375 }, { "epoch": 0.8326082657702195, "step": 8421, "train/total_loss": 0.10178777575492859 }, { "entropy": 8.770580291748047, "epoch": 0.832707138619735, "mean_token_accuracy": 0.7217742204666138, "num_tokens": 23026414.0, "step": 8422, "train/ce_loss": 0.8574286103248596 }, { "epoch": 0.832707138619735, "step": 8422, "train/sim_loss": 0.0390625 }, { "epoch": 0.832707138619735, "step": 8422, "train/total_loss": 0.12480536103248596 }, { "entropy": 8.346721649169922, "epoch": 0.8328060114692506, "mean_token_accuracy": 0.7166344523429871, "num_tokens": 23032154.0, "step": 8423, "train/ce_loss": 0.6669735908508301 }, { "epoch": 0.8328060114692506, "step": 8423, "train/sim_loss": 0.078125 }, { "epoch": 0.8328060114692506, "step": 8423, "train/total_loss": 0.144822359085083 }, { "entropy": 8.428167343139648, "epoch": 0.8329048843187661, "mean_token_accuracy": 0.6963037252426147, "num_tokens": 23037691.0, "step": 8424, "train/ce_loss": 1.1446759700775146 }, { "epoch": 0.8329048843187661, "step": 8424, "train/sim_loss": 0.078125 }, { "epoch": 0.8329048843187661, "step": 8424, "train/total_loss": 0.192592591047287 }, { "entropy": 9.567185401916504, "epoch": 0.8330037571682816, "mean_token_accuracy": 0.7131147384643555, "num_tokens": 23042447.0, "step": 8425, "train/ce_loss": 1.6128241213664296e-06 }, { "epoch": 0.8330037571682816, "step": 8425, "train/sim_loss": 0.0390625 }, { "epoch": 0.8330037571682816, "step": 8425, "train/total_loss": 0.039062660187482834 }, { "entropy": 8.869767189025879, "epoch": 0.8331026300177972, "mean_token_accuracy": 0.757377028465271, "num_tokens": 23047558.0, "step": 8426, "train/ce_loss": 0.9084510803222656 }, { "epoch": 0.8331026300177972, "step": 8426, "train/sim_loss": 0.05078125 }, { "epoch": 0.8331026300177972, "step": 8426, "train/total_loss": 0.14162635803222656 }, { "entropy": 8.885663986206055, "epoch": 0.8332015028673126, "mean_token_accuracy": 0.7267002463340759, "num_tokens": 23052820.0, "step": 8427, "train/ce_loss": 0.9693460464477539 }, { "epoch": 0.8332015028673126, "step": 8427, "train/sim_loss": 0.03125 }, { "epoch": 0.8332015028673126, "step": 8427, "train/total_loss": 0.12818461656570435 }, { "entropy": 8.810038566589355, "epoch": 0.8333003757168281, "mean_token_accuracy": 0.779724657535553, "num_tokens": 23058062.0, "step": 8428, "train/ce_loss": 0.5381803512573242 }, { "epoch": 0.8333003757168281, "step": 8428, "train/sim_loss": 0.02734375 }, { "epoch": 0.8333003757168281, "step": 8428, "train/total_loss": 0.08116178214550018 }, { "entropy": 9.27984619140625, "epoch": 0.8333992485663437, "mean_token_accuracy": 0.694200336933136, "num_tokens": 23063100.0, "step": 8429, "train/ce_loss": 2.27346134185791 }, { "epoch": 0.8333992485663437, "step": 8429, "train/sim_loss": 0.10546875 }, { "epoch": 0.8333992485663437, "step": 8429, "train/total_loss": 0.33281487226486206 }, { "entropy": 9.171561241149902, "epoch": 0.8334981214158592, "mean_token_accuracy": 0.7915254235267639, "num_tokens": 23068114.0, "step": 8430, "train/ce_loss": 1.2283937849133508e-06 }, { "epoch": 0.8334981214158592, "step": 8430, "train/sim_loss": 0.046875 }, { "epoch": 0.8334981214158592, "step": 8430, "train/total_loss": 0.04687512293457985 }, { "entropy": 8.703938484191895, "epoch": 0.8335969942653747, "mean_token_accuracy": 0.7080045342445374, "num_tokens": 23073502.0, "step": 8431, "train/ce_loss": 1.0218861103057861 }, { "epoch": 0.8335969942653747, "step": 8431, "train/sim_loss": 0.078125 }, { "epoch": 0.8335969942653747, "step": 8431, "train/total_loss": 0.1803136169910431 }, { "entropy": 8.560479164123535, "epoch": 0.8336958671148903, "mean_token_accuracy": 0.7452300786972046, "num_tokens": 23078827.0, "step": 8432, "train/ce_loss": 0.6318869590759277 }, { "epoch": 0.8336958671148903, "step": 8432, "train/sim_loss": 0.046875 }, { "epoch": 0.8336958671148903, "step": 8432, "train/total_loss": 0.11006369441747665 }, { "entropy": 8.864185333251953, "epoch": 0.8337947399644058, "mean_token_accuracy": 0.7435897588729858, "num_tokens": 23083971.0, "step": 8433, "train/ce_loss": 1.4856287240982056 }, { "epoch": 0.8337947399644058, "step": 8433, "train/sim_loss": 0.05078125 }, { "epoch": 0.8337947399644058, "step": 8433, "train/total_loss": 0.19934412837028503 }, { "entropy": 8.569899559020996, "epoch": 0.8338936128139213, "mean_token_accuracy": 0.7508690357208252, "num_tokens": 23089307.0, "step": 8434, "train/ce_loss": 0.5698906183242798 }, { "epoch": 0.8338936128139213, "step": 8434, "train/sim_loss": 0.046875 }, { "epoch": 0.8338936128139213, "step": 8434, "train/total_loss": 0.10386405885219574 }, { "entropy": 8.418647766113281, "epoch": 0.8339924856634369, "mean_token_accuracy": 0.7304609417915344, "num_tokens": 23094802.0, "step": 8435, "train/ce_loss": 0.8897053599357605 }, { "epoch": 0.8339924856634369, "step": 8435, "train/sim_loss": 0.0546875 }, { "epoch": 0.8339924856634369, "step": 8435, "train/total_loss": 0.14365804195404053 }, { "entropy": 8.659425735473633, "epoch": 0.8340913585129524, "mean_token_accuracy": 0.78125, "num_tokens": 23100106.0, "step": 8436, "train/ce_loss": 0.287148654460907 }, { "epoch": 0.8340913585129524, "step": 8436, "train/sim_loss": 0.0234375 }, { "epoch": 0.8340913585129524, "step": 8436, "train/total_loss": 0.0521523654460907 }, { "entropy": 8.957656860351562, "epoch": 0.8341902313624678, "mean_token_accuracy": 0.8513761758804321, "num_tokens": 23105083.0, "step": 8437, "train/ce_loss": 0.9667260050773621 }, { "epoch": 0.8341902313624678, "step": 8437, "train/sim_loss": 0.015625 }, { "epoch": 0.8341902313624678, "step": 8437, "train/total_loss": 0.11229760199785233 }, { "entropy": 8.240507125854492, "epoch": 0.8342891042119834, "mean_token_accuracy": 0.772009015083313, "num_tokens": 23110549.0, "step": 8438, "train/ce_loss": 0.5577598214149475 }, { "epoch": 0.8342891042119834, "step": 8438, "train/sim_loss": 0.0390625 }, { "epoch": 0.8342891042119834, "step": 8438, "train/total_loss": 0.09483848512172699 }, { "entropy": 8.941741943359375, "epoch": 0.8343879770614989, "mean_token_accuracy": 0.7020057439804077, "num_tokens": 23115686.0, "step": 8439, "train/ce_loss": 1.318642258644104 }, { "epoch": 0.8343879770614989, "step": 8439, "train/sim_loss": 0.08203125 }, { "epoch": 0.8343879770614989, "step": 8439, "train/total_loss": 0.21389548480510712 }, { "epoch": 0.8344868499110144, "grad_norm": 0.7413841485977173, "learning_rate": 7.91598674776245e-06, "loss": 0.1269, "step": 8440 }, { "entropy": 9.051006317138672, "epoch": 0.8344868499110144, "mean_token_accuracy": 0.6917688250541687, "num_tokens": 23120688.0, "step": 8440, "train/ce_loss": 1.116180181503296 }, { "epoch": 0.8344868499110144, "step": 8440, "train/sim_loss": 0.0234375 }, { "epoch": 0.8344868499110144, "step": 8440, "train/total_loss": 0.1350555121898651 }, { "entropy": 8.562871932983398, "epoch": 0.83458572276053, "mean_token_accuracy": 0.751091718673706, "num_tokens": 23125839.0, "step": 8441, "train/ce_loss": 1.2514069080352783 }, { "epoch": 0.83458572276053, "step": 8441, "train/sim_loss": 0.0546875 }, { "epoch": 0.83458572276053, "step": 8441, "train/total_loss": 0.1798281967639923 }, { "entropy": 8.867137908935547, "epoch": 0.8346845956100455, "mean_token_accuracy": 0.7886075973510742, "num_tokens": 23131065.0, "step": 8442, "train/ce_loss": 0.664675772190094 }, { "epoch": 0.8346845956100455, "step": 8442, "train/sim_loss": 0.0546875 }, { "epoch": 0.8346845956100455, "step": 8442, "train/total_loss": 0.12115507572889328 }, { "entropy": 8.775603294372559, "epoch": 0.834783468459561, "mean_token_accuracy": 0.7020602226257324, "num_tokens": 23136150.0, "step": 8443, "train/ce_loss": 0.9842575788497925 }, { "epoch": 0.834783468459561, "step": 8443, "train/sim_loss": 0.03125 }, { "epoch": 0.834783468459561, "step": 8443, "train/total_loss": 0.1296757608652115 }, { "entropy": 8.78519344329834, "epoch": 0.8348823413090766, "mean_token_accuracy": 0.6872811913490295, "num_tokens": 23141520.0, "step": 8444, "train/ce_loss": 1.3550491333007812 }, { "epoch": 0.8348823413090766, "step": 8444, "train/sim_loss": 0.09375 }, { "epoch": 0.8348823413090766, "step": 8444, "train/total_loss": 0.22925491631031036 }, { "entropy": 8.138087272644043, "epoch": 0.8349812141585921, "mean_token_accuracy": 0.7243589758872986, "num_tokens": 23146908.0, "step": 8445, "train/ce_loss": 0.7495837211608887 }, { "epoch": 0.8349812141585921, "step": 8445, "train/sim_loss": 0.0625 }, { "epoch": 0.8349812141585921, "step": 8445, "train/total_loss": 0.13745838403701782 }, { "entropy": 8.91369342803955, "epoch": 0.8350800870081075, "mean_token_accuracy": 0.7582781314849854, "num_tokens": 23151983.0, "step": 8446, "train/ce_loss": 0.5326682925224304 }, { "epoch": 0.8350800870081075, "step": 8446, "train/sim_loss": 0.03125 }, { "epoch": 0.8350800870081075, "step": 8446, "train/total_loss": 0.08451683074235916 }, { "entropy": 8.554800033569336, "epoch": 0.8351789598576231, "mean_token_accuracy": 0.7573529481887817, "num_tokens": 23157260.0, "step": 8447, "train/ce_loss": 0.7137629389762878 }, { "epoch": 0.8351789598576231, "step": 8447, "train/sim_loss": 0.0625 }, { "epoch": 0.8351789598576231, "step": 8447, "train/total_loss": 0.13387629389762878 }, { "entropy": 9.390222549438477, "epoch": 0.8352778327071386, "mean_token_accuracy": 0.742222249507904, "num_tokens": 23162099.0, "step": 8448, "train/ce_loss": 4.685093244916061e-06 }, { "epoch": 0.8352778327071386, "step": 8448, "train/sim_loss": 0.05078125 }, { "epoch": 0.8352778327071386, "step": 8448, "train/total_loss": 0.050781719386577606 }, { "entropy": 8.345687866210938, "epoch": 0.8353767055566541, "mean_token_accuracy": 0.7475345134735107, "num_tokens": 23167566.0, "step": 8449, "train/ce_loss": 0.34656718373298645 }, { "epoch": 0.8353767055566541, "step": 8449, "train/sim_loss": 0.015625 }, { "epoch": 0.8353767055566541, "step": 8449, "train/total_loss": 0.050281718373298645 }, { "entropy": 9.12002182006836, "epoch": 0.8354755784061697, "mean_token_accuracy": 0.7476190328598022, "num_tokens": 23172605.0, "step": 8450, "train/ce_loss": 1.1007190942764282 }, { "epoch": 0.8354755784061697, "step": 8450, "train/sim_loss": 0.02734375 }, { "epoch": 0.8354755784061697, "step": 8450, "train/total_loss": 0.13741566240787506 }, { "entropy": 9.263957977294922, "epoch": 0.8355744512556852, "mean_token_accuracy": 0.7510040402412415, "num_tokens": 23177672.0, "step": 8451, "train/ce_loss": 2.7391645289753797e-06 }, { "epoch": 0.8355744512556852, "step": 8451, "train/sim_loss": 0.03515625 }, { "epoch": 0.8355744512556852, "step": 8451, "train/total_loss": 0.035156525671482086 }, { "entropy": 8.432599067687988, "epoch": 0.8356733241052007, "mean_token_accuracy": 0.7297297120094299, "num_tokens": 23183068.0, "step": 8452, "train/ce_loss": 0.8153901100158691 }, { "epoch": 0.8356733241052007, "step": 8452, "train/sim_loss": 0.015625 }, { "epoch": 0.8356733241052007, "step": 8452, "train/total_loss": 0.09716401249170303 }, { "entropy": 8.373237609863281, "epoch": 0.8357721969547163, "mean_token_accuracy": 0.7285407781600952, "num_tokens": 23188483.0, "step": 8453, "train/ce_loss": 1.45299232006073 }, { "epoch": 0.8357721969547163, "step": 8453, "train/sim_loss": 0.0625 }, { "epoch": 0.8357721969547163, "step": 8453, "train/total_loss": 0.20779924094676971 }, { "entropy": 8.922636032104492, "epoch": 0.8358710698042318, "mean_token_accuracy": 0.7309644818305969, "num_tokens": 23193585.0, "step": 8454, "train/ce_loss": 0.8643047213554382 }, { "epoch": 0.8358710698042318, "step": 8454, "train/sim_loss": 0.0703125 }, { "epoch": 0.8358710698042318, "step": 8454, "train/total_loss": 0.15674297511577606 }, { "entropy": 8.581033706665039, "epoch": 0.8359699426537472, "mean_token_accuracy": 0.761904776096344, "num_tokens": 23198944.0, "step": 8455, "train/ce_loss": 0.6584579944610596 }, { "epoch": 0.8359699426537472, "step": 8455, "train/sim_loss": 0.04296875 }, { "epoch": 0.8359699426537472, "step": 8455, "train/total_loss": 0.1088145524263382 }, { "entropy": 8.840801239013672, "epoch": 0.8360688155032628, "mean_token_accuracy": 0.754054069519043, "num_tokens": 23204141.0, "step": 8456, "train/ce_loss": 0.6707462072372437 }, { "epoch": 0.8360688155032628, "step": 8456, "train/sim_loss": 0.0703125 }, { "epoch": 0.8360688155032628, "step": 8456, "train/total_loss": 0.13738712668418884 }, { "entropy": 9.381414413452148, "epoch": 0.8361676883527783, "mean_token_accuracy": 0.8161616325378418, "num_tokens": 23209045.0, "step": 8457, "train/ce_loss": 6.694864396195044e-07 }, { "epoch": 0.8361676883527783, "step": 8457, "train/sim_loss": 0.015625 }, { "epoch": 0.8361676883527783, "step": 8457, "train/total_loss": 0.015625067055225372 }, { "entropy": 8.514923095703125, "epoch": 0.8362665612022938, "mean_token_accuracy": 0.7355371713638306, "num_tokens": 23214327.0, "step": 8458, "train/ce_loss": 1.0225625038146973 }, { "epoch": 0.8362665612022938, "step": 8458, "train/sim_loss": 0.0546875 }, { "epoch": 0.8362665612022938, "step": 8458, "train/total_loss": 0.15694375336170197 }, { "entropy": 8.720748901367188, "epoch": 0.8363654340518094, "mean_token_accuracy": 0.7418967485427856, "num_tokens": 23219806.0, "step": 8459, "train/ce_loss": 1.3102567195892334 }, { "epoch": 0.8363654340518094, "step": 8459, "train/sim_loss": 0.0859375 }, { "epoch": 0.8363654340518094, "step": 8459, "train/total_loss": 0.21696317195892334 }, { "epoch": 0.8364643069013249, "grad_norm": 0.6300832033157349, "learning_rate": 7.9110418830045e-06, "loss": 0.1355, "step": 8460 }, { "entropy": 8.613868713378906, "epoch": 0.8364643069013249, "mean_token_accuracy": 0.7577142715454102, "num_tokens": 23225144.0, "step": 8460, "train/ce_loss": 0.8855839967727661 }, { "epoch": 0.8364643069013249, "step": 8460, "train/sim_loss": 0.03515625 }, { "epoch": 0.8364643069013249, "step": 8460, "train/total_loss": 0.12371464818716049 }, { "entropy": 8.367238998413086, "epoch": 0.8365631797508404, "mean_token_accuracy": 0.7315436005592346, "num_tokens": 23230505.0, "step": 8461, "train/ce_loss": 1.0768791437149048 }, { "epoch": 0.8365631797508404, "step": 8461, "train/sim_loss": 0.0703125 }, { "epoch": 0.8365631797508404, "step": 8461, "train/total_loss": 0.17800042033195496 }, { "entropy": 8.522979736328125, "epoch": 0.836662052600356, "mean_token_accuracy": 0.7502837777137756, "num_tokens": 23235848.0, "step": 8462, "train/ce_loss": 0.714676022529602 }, { "epoch": 0.836662052600356, "step": 8462, "train/sim_loss": 0.05859375 }, { "epoch": 0.836662052600356, "step": 8462, "train/total_loss": 0.13006135821342468 }, { "entropy": 9.270840644836426, "epoch": 0.8367609254498715, "mean_token_accuracy": 0.762135922908783, "num_tokens": 23240680.0, "step": 8463, "train/ce_loss": 2.330828692720388e-06 }, { "epoch": 0.8367609254498715, "step": 8463, "train/sim_loss": 0.03125 }, { "epoch": 0.8367609254498715, "step": 8463, "train/total_loss": 0.0312502346932888 }, { "entropy": 9.12474536895752, "epoch": 0.836859798299387, "mean_token_accuracy": 0.760869562625885, "num_tokens": 23245576.0, "step": 8464, "train/ce_loss": 0.8324379324913025 }, { "epoch": 0.836859798299387, "step": 8464, "train/sim_loss": 0.04296875 }, { "epoch": 0.836859798299387, "step": 8464, "train/total_loss": 0.12621253728866577 }, { "entropy": 8.247017860412598, "epoch": 0.8369586711489025, "mean_token_accuracy": 0.7172264456748962, "num_tokens": 23250968.0, "step": 8465, "train/ce_loss": 1.229318618774414 }, { "epoch": 0.8369586711489025, "step": 8465, "train/sim_loss": 0.0703125 }, { "epoch": 0.8369586711489025, "step": 8465, "train/total_loss": 0.19324436783790588 }, { "entropy": 9.369348526000977, "epoch": 0.837057543998418, "mean_token_accuracy": 0.7494033575057983, "num_tokens": 23255798.0, "step": 8466, "train/ce_loss": 1.835410237312317 }, { "epoch": 0.837057543998418, "step": 8466, "train/sim_loss": 0.08203125 }, { "epoch": 0.837057543998418, "step": 8466, "train/total_loss": 0.26557227969169617 }, { "entropy": 8.333223342895508, "epoch": 0.8371564168479335, "mean_token_accuracy": 0.7402885556221008, "num_tokens": 23261203.0, "step": 8467, "train/ce_loss": 0.9438441395759583 }, { "epoch": 0.8371564168479335, "step": 8467, "train/sim_loss": 0.05859375 }, { "epoch": 0.8371564168479335, "step": 8467, "train/total_loss": 0.15297816693782806 }, { "entropy": 8.755775451660156, "epoch": 0.8372552896974491, "mean_token_accuracy": 0.7366504669189453, "num_tokens": 23266466.0, "step": 8468, "train/ce_loss": 0.7262039184570312 }, { "epoch": 0.8372552896974491, "step": 8468, "train/sim_loss": 0.046875 }, { "epoch": 0.8372552896974491, "step": 8468, "train/total_loss": 0.11949539184570312 }, { "entropy": 9.043832778930664, "epoch": 0.8373541625469646, "mean_token_accuracy": 0.7504159808158875, "num_tokens": 23271467.0, "step": 8469, "train/ce_loss": 1.2775529623031616 }, { "epoch": 0.8373541625469646, "step": 8469, "train/sim_loss": 0.0390625 }, { "epoch": 0.8373541625469646, "step": 8469, "train/total_loss": 0.1668177992105484 }, { "entropy": 9.056379318237305, "epoch": 0.8374530353964801, "mean_token_accuracy": 0.7526132464408875, "num_tokens": 23276509.0, "step": 8470, "train/ce_loss": 0.983027458190918 }, { "epoch": 0.8374530353964801, "step": 8470, "train/sim_loss": 0.07421875 }, { "epoch": 0.8374530353964801, "step": 8470, "train/total_loss": 0.17252150177955627 }, { "entropy": 9.1365385055542, "epoch": 0.8375519082459957, "mean_token_accuracy": 0.7578008770942688, "num_tokens": 23281638.0, "step": 8471, "train/ce_loss": 7.340035494962649e-07 }, { "epoch": 0.8375519082459957, "step": 8471, "train/sim_loss": 0.03125 }, { "epoch": 0.8375519082459957, "step": 8471, "train/total_loss": 0.03125007450580597 }, { "entropy": 8.997591018676758, "epoch": 0.8376507810955112, "mean_token_accuracy": 0.7923728823661804, "num_tokens": 23286777.0, "step": 8472, "train/ce_loss": 0.713519275188446 }, { "epoch": 0.8376507810955112, "step": 8472, "train/sim_loss": 0.015625 }, { "epoch": 0.8376507810955112, "step": 8472, "train/total_loss": 0.08697693049907684 }, { "entropy": 8.927431106567383, "epoch": 0.8377496539450267, "mean_token_accuracy": 0.6694214940071106, "num_tokens": 23291899.0, "step": 8473, "train/ce_loss": 1.389837384223938 }, { "epoch": 0.8377496539450267, "step": 8473, "train/sim_loss": 0.0546875 }, { "epoch": 0.8377496539450267, "step": 8473, "train/total_loss": 0.19367124140262604 }, { "entropy": 8.895759582519531, "epoch": 0.8378485267945422, "mean_token_accuracy": 0.718068540096283, "num_tokens": 23297049.0, "step": 8474, "train/ce_loss": 0.7009792327880859 }, { "epoch": 0.8378485267945422, "step": 8474, "train/sim_loss": 0.0234375 }, { "epoch": 0.8378485267945422, "step": 8474, "train/total_loss": 0.0935354232788086 }, { "entropy": 9.136212348937988, "epoch": 0.8379473996440577, "mean_token_accuracy": 0.7834862470626831, "num_tokens": 23302024.0, "step": 8475, "train/ce_loss": 0.6531566381454468 }, { "epoch": 0.8379473996440577, "step": 8475, "train/sim_loss": 0.0625 }, { "epoch": 0.8379473996440577, "step": 8475, "train/total_loss": 0.12781566381454468 }, { "entropy": 8.840536117553711, "epoch": 0.8380462724935732, "mean_token_accuracy": 0.7091906666755676, "num_tokens": 23307208.0, "step": 8476, "train/ce_loss": 1.3356698751449585 }, { "epoch": 0.8380462724935732, "step": 8476, "train/sim_loss": 0.0703125 }, { "epoch": 0.8380462724935732, "step": 8476, "train/total_loss": 0.2038794904947281 }, { "entropy": 8.715561866760254, "epoch": 0.8381451453430888, "mean_token_accuracy": 0.7680000066757202, "num_tokens": 23312434.0, "step": 8477, "train/ce_loss": 0.7491929531097412 }, { "epoch": 0.8381451453430888, "step": 8477, "train/sim_loss": 0.01171875 }, { "epoch": 0.8381451453430888, "step": 8477, "train/total_loss": 0.08663804829120636 }, { "entropy": 8.269938468933105, "epoch": 0.8382440181926043, "mean_token_accuracy": 0.6903846263885498, "num_tokens": 23317965.0, "step": 8478, "train/ce_loss": 0.7817928791046143 }, { "epoch": 0.8382440181926043, "step": 8478, "train/sim_loss": 0.0703125 }, { "epoch": 0.8382440181926043, "step": 8478, "train/total_loss": 0.14849179983139038 }, { "entropy": 8.730112075805664, "epoch": 0.8383428910421198, "mean_token_accuracy": 0.7268588542938232, "num_tokens": 23323042.0, "step": 8479, "train/ce_loss": 1.4868693351745605 }, { "epoch": 0.8383428910421198, "step": 8479, "train/sim_loss": 0.03515625 }, { "epoch": 0.8383428910421198, "step": 8479, "train/total_loss": 0.18384318053722382 }, { "epoch": 0.8384417638916354, "grad_norm": 0.7349727749824524, "learning_rate": 7.90609701824655e-06, "loss": 0.1321, "step": 8480 }, { "entropy": 9.112332344055176, "epoch": 0.8384417638916354, "mean_token_accuracy": 0.7666666507720947, "num_tokens": 23328042.0, "step": 8480, "train/ce_loss": 1.523605465888977 }, { "epoch": 0.8384417638916354, "step": 8480, "train/sim_loss": 0.078125 }, { "epoch": 0.8384417638916354, "step": 8480, "train/total_loss": 0.23048554360866547 }, { "entropy": 8.693452835083008, "epoch": 0.8385406367411509, "mean_token_accuracy": 0.7580246925354004, "num_tokens": 23333344.0, "step": 8481, "train/ce_loss": 0.7240076661109924 }, { "epoch": 0.8385406367411509, "step": 8481, "train/sim_loss": 0.0546875 }, { "epoch": 0.8385406367411509, "step": 8481, "train/total_loss": 0.1270882785320282 }, { "entropy": 9.14535140991211, "epoch": 0.8386395095906664, "mean_token_accuracy": 0.735988199710846, "num_tokens": 23338414.0, "step": 8482, "train/ce_loss": 0.896630585193634 }, { "epoch": 0.8386395095906664, "step": 8482, "train/sim_loss": 0.01171875 }, { "epoch": 0.8386395095906664, "step": 8482, "train/total_loss": 0.1013818085193634 }, { "entropy": 8.412368774414062, "epoch": 0.838738382440182, "mean_token_accuracy": 0.7041965126991272, "num_tokens": 23344043.0, "step": 8483, "train/ce_loss": 1.0863405466079712 }, { "epoch": 0.838738382440182, "step": 8483, "train/sim_loss": 0.05859375 }, { "epoch": 0.838738382440182, "step": 8483, "train/total_loss": 0.16722780466079712 }, { "entropy": 9.090755462646484, "epoch": 0.8388372552896974, "mean_token_accuracy": 0.7093425393104553, "num_tokens": 23349056.0, "step": 8484, "train/ce_loss": 1.0335745811462402 }, { "epoch": 0.8388372552896974, "step": 8484, "train/sim_loss": 0.0546875 }, { "epoch": 0.8388372552896974, "step": 8484, "train/total_loss": 0.1580449640750885 }, { "entropy": 9.019186973571777, "epoch": 0.8389361281392129, "mean_token_accuracy": 0.800000011920929, "num_tokens": 23354014.0, "step": 8485, "train/ce_loss": 9.221578238793882e-07 }, { "epoch": 0.8389361281392129, "step": 8485, "train/sim_loss": 0.03125 }, { "epoch": 0.8389361281392129, "step": 8485, "train/total_loss": 0.03125009313225746 }, { "entropy": 8.70306396484375, "epoch": 0.8390350009887285, "mean_token_accuracy": 0.7714681625366211, "num_tokens": 23359218.0, "step": 8486, "train/ce_loss": 0.4649391770362854 }, { "epoch": 0.8390350009887285, "step": 8486, "train/sim_loss": 0.01953125 }, { "epoch": 0.8390350009887285, "step": 8486, "train/total_loss": 0.06602516770362854 }, { "entropy": 8.45088005065918, "epoch": 0.839133873838244, "mean_token_accuracy": 0.7324613332748413, "num_tokens": 23364545.0, "step": 8487, "train/ce_loss": 0.860693097114563 }, { "epoch": 0.839133873838244, "step": 8487, "train/sim_loss": 0.0625 }, { "epoch": 0.839133873838244, "step": 8487, "train/total_loss": 0.14856931567192078 }, { "entropy": 8.545808792114258, "epoch": 0.8392327466877595, "mean_token_accuracy": 0.7394015192985535, "num_tokens": 23369802.0, "step": 8488, "train/ce_loss": 0.7315399646759033 }, { "epoch": 0.8392327466877595, "step": 8488, "train/sim_loss": 0.02734375 }, { "epoch": 0.8392327466877595, "step": 8488, "train/total_loss": 0.10049774497747421 }, { "entropy": 8.705459594726562, "epoch": 0.8393316195372751, "mean_token_accuracy": 0.7708830833435059, "num_tokens": 23375068.0, "step": 8489, "train/ce_loss": 0.6915591359138489 }, { "epoch": 0.8393316195372751, "step": 8489, "train/sim_loss": 0.046875 }, { "epoch": 0.8393316195372751, "step": 8489, "train/total_loss": 0.11603091657161713 }, { "entropy": 8.906850814819336, "epoch": 0.8394304923867906, "mean_token_accuracy": 0.7515822649002075, "num_tokens": 23380162.0, "step": 8490, "train/ce_loss": 0.8677210807800293 }, { "epoch": 0.8394304923867906, "step": 8490, "train/sim_loss": 0.07421875 }, { "epoch": 0.8394304923867906, "step": 8490, "train/total_loss": 0.1609908640384674 }, { "entropy": 8.584822654724121, "epoch": 0.8395293652363061, "mean_token_accuracy": 0.7032257914543152, "num_tokens": 23385409.0, "step": 8491, "train/ce_loss": 1.0996503829956055 }, { "epoch": 0.8395293652363061, "step": 8491, "train/sim_loss": 0.0703125 }, { "epoch": 0.8395293652363061, "step": 8491, "train/total_loss": 0.18027754127979279 }, { "entropy": 8.949121475219727, "epoch": 0.8396282380858217, "mean_token_accuracy": 0.771345853805542, "num_tokens": 23390561.0, "step": 8492, "train/ce_loss": 0.966265857219696 }, { "epoch": 0.8396282380858217, "step": 8492, "train/sim_loss": 0.0234375 }, { "epoch": 0.8396282380858217, "step": 8492, "train/total_loss": 0.12006408721208572 }, { "entropy": 9.422487258911133, "epoch": 0.8397271109353371, "mean_token_accuracy": 0.7461240291595459, "num_tokens": 23395468.0, "step": 8493, "train/ce_loss": 1.1239761114120483 }, { "epoch": 0.8397271109353371, "step": 8493, "train/sim_loss": 0.07421875 }, { "epoch": 0.8397271109353371, "step": 8493, "train/total_loss": 0.18661636114120483 }, { "entropy": 8.569074630737305, "epoch": 0.8398259837848526, "mean_token_accuracy": 0.7707602381706238, "num_tokens": 23400763.0, "step": 8494, "train/ce_loss": 0.8903215527534485 }, { "epoch": 0.8398259837848526, "step": 8494, "train/sim_loss": 0.046875 }, { "epoch": 0.8398259837848526, "step": 8494, "train/total_loss": 0.1359071582555771 }, { "entropy": 9.111557960510254, "epoch": 0.8399248566343682, "mean_token_accuracy": 0.7453798651695251, "num_tokens": 23405671.0, "step": 8495, "train/ce_loss": 1.4308420419692993 }, { "epoch": 0.8399248566343682, "step": 8495, "train/sim_loss": 0.0703125 }, { "epoch": 0.8399248566343682, "step": 8495, "train/total_loss": 0.21339671313762665 }, { "entropy": 8.813003540039062, "epoch": 0.8400237294838837, "mean_token_accuracy": 0.801536500453949, "num_tokens": 23411051.0, "step": 8496, "train/ce_loss": 0.3880203366279602 }, { "epoch": 0.8400237294838837, "step": 8496, "train/sim_loss": 0.03515625 }, { "epoch": 0.8400237294838837, "step": 8496, "train/total_loss": 0.07395828515291214 }, { "entropy": 9.437934875488281, "epoch": 0.8401226023333992, "mean_token_accuracy": 0.6849315166473389, "num_tokens": 23415900.0, "step": 8497, "train/ce_loss": 1.8858469724655151 }, { "epoch": 0.8401226023333992, "step": 8497, "train/sim_loss": 0.109375 }, { "epoch": 0.8401226023333992, "step": 8497, "train/total_loss": 0.29795968532562256 }, { "entropy": 8.867807388305664, "epoch": 0.8402214751829148, "mean_token_accuracy": 0.7607142925262451, "num_tokens": 23420951.0, "step": 8498, "train/ce_loss": 1.3461689150062739e-06 }, { "epoch": 0.8402214751829148, "step": 8498, "train/sim_loss": 0.05078125 }, { "epoch": 0.8402214751829148, "step": 8498, "train/total_loss": 0.050781384110450745 }, { "entropy": 8.785367965698242, "epoch": 0.8403203480324303, "mean_token_accuracy": 0.7249602675437927, "num_tokens": 23426042.0, "step": 8499, "train/ce_loss": 6.686180995529867e-07 }, { "epoch": 0.8403203480324303, "step": 8499, "train/sim_loss": 0.0625 }, { "epoch": 0.8403203480324303, "step": 8499, "train/total_loss": 0.06250006705522537 }, { "epoch": 0.8404192208819459, "grad_norm": 0.6899054646492004, "learning_rate": 7.901152153488603e-06, "loss": 0.1273, "step": 8500 }, { "entropy": 8.333955764770508, "epoch": 0.8404192208819459, "mean_token_accuracy": 0.7285569906234741, "num_tokens": 23431545.0, "step": 8500, "train/ce_loss": 1.2725051641464233 }, { "epoch": 0.8404192208819459, "step": 8500, "train/sim_loss": 0.078125 }, { "epoch": 0.8404192208819459, "step": 8500, "train/total_loss": 0.2053755223751068 }, { "entropy": 8.942061424255371, "epoch": 0.8405180937314614, "mean_token_accuracy": 0.7556270360946655, "num_tokens": 23436600.0, "step": 8501, "train/ce_loss": 0.9197985529899597 }, { "epoch": 0.8405180937314614, "step": 8501, "train/sim_loss": 0.046875 }, { "epoch": 0.8405180937314614, "step": 8501, "train/total_loss": 0.13885486125946045 }, { "entropy": 9.137856483459473, "epoch": 0.8406169665809768, "mean_token_accuracy": 0.8154761791229248, "num_tokens": 23441542.0, "step": 8502, "train/ce_loss": 1.5246752500534058 }, { "epoch": 0.8406169665809768, "step": 8502, "train/sim_loss": 0.05078125 }, { "epoch": 0.8406169665809768, "step": 8502, "train/total_loss": 0.2032487839460373 }, { "entropy": 8.56517219543457, "epoch": 0.8407158394304924, "mean_token_accuracy": 0.7099056839942932, "num_tokens": 23446869.0, "step": 8503, "train/ce_loss": 0.5327627062797546 }, { "epoch": 0.8407158394304924, "step": 8503, "train/sim_loss": 0.06640625 }, { "epoch": 0.8407158394304924, "step": 8503, "train/total_loss": 0.11968252062797546 }, { "entropy": 8.913006782531738, "epoch": 0.8408147122800079, "mean_token_accuracy": 0.7347221970558167, "num_tokens": 23451981.0, "step": 8504, "train/ce_loss": 1.4104857444763184 }, { "epoch": 0.8408147122800079, "step": 8504, "train/sim_loss": 0.0546875 }, { "epoch": 0.8408147122800079, "step": 8504, "train/total_loss": 0.1957360804080963 }, { "entropy": 8.650247573852539, "epoch": 0.8409135851295234, "mean_token_accuracy": 0.6997635960578918, "num_tokens": 23457235.0, "step": 8505, "train/ce_loss": 1.308672547340393 }, { "epoch": 0.8409135851295234, "step": 8505, "train/sim_loss": 0.0859375 }, { "epoch": 0.8409135851295234, "step": 8505, "train/total_loss": 0.21680475771427155 }, { "entropy": 8.899694442749023, "epoch": 0.841012457979039, "mean_token_accuracy": 0.7393526434898376, "num_tokens": 23462282.0, "step": 8506, "train/ce_loss": 4.857674298364145e-07 }, { "epoch": 0.841012457979039, "step": 8506, "train/sim_loss": 0.01953125 }, { "epoch": 0.841012457979039, "step": 8506, "train/total_loss": 0.01953129842877388 }, { "entropy": 8.43980884552002, "epoch": 0.8411113308285545, "mean_token_accuracy": 0.751091718673706, "num_tokens": 23467612.0, "step": 8507, "train/ce_loss": 0.5670673847198486 }, { "epoch": 0.8411113308285545, "step": 8507, "train/sim_loss": 0.02734375 }, { "epoch": 0.8411113308285545, "step": 8507, "train/total_loss": 0.0840504914522171 }, { "entropy": 8.516087532043457, "epoch": 0.84121020367807, "mean_token_accuracy": 0.7193396091461182, "num_tokens": 23472937.0, "step": 8508, "train/ce_loss": 0.6912945508956909 }, { "epoch": 0.84121020367807, "step": 8508, "train/sim_loss": 0.05859375 }, { "epoch": 0.84121020367807, "step": 8508, "train/total_loss": 0.12772321701049805 }, { "entropy": 9.018800735473633, "epoch": 0.8413090765275856, "mean_token_accuracy": 0.7811934947967529, "num_tokens": 23477954.0, "step": 8509, "train/ce_loss": 7.553363730039564e-07 }, { "epoch": 0.8413090765275856, "step": 8509, "train/sim_loss": 0.02734375 }, { "epoch": 0.8413090765275856, "step": 8509, "train/total_loss": 0.02734382636845112 }, { "entropy": 8.852306365966797, "epoch": 0.8414079493771011, "mean_token_accuracy": 0.7095046639442444, "num_tokens": 23483141.0, "step": 8510, "train/ce_loss": 1.117521047592163 }, { "epoch": 0.8414079493771011, "step": 8510, "train/sim_loss": 0.07421875 }, { "epoch": 0.8414079493771011, "step": 8510, "train/total_loss": 0.18597085773944855 }, { "entropy": 8.106832504272461, "epoch": 0.8415068222266165, "mean_token_accuracy": 0.6650148630142212, "num_tokens": 23488644.0, "step": 8511, "train/ce_loss": 1.5925387144088745 }, { "epoch": 0.8415068222266165, "step": 8511, "train/sim_loss": 0.02734375 }, { "epoch": 0.8415068222266165, "step": 8511, "train/total_loss": 0.18659763038158417 }, { "entropy": 8.672381401062012, "epoch": 0.8416056950761321, "mean_token_accuracy": 0.7424425482749939, "num_tokens": 23493957.0, "step": 8512, "train/ce_loss": 1.1097936630249023 }, { "epoch": 0.8416056950761321, "step": 8512, "train/sim_loss": 0.0546875 }, { "epoch": 0.8416056950761321, "step": 8512, "train/total_loss": 0.1656668782234192 }, { "entropy": 8.412344932556152, "epoch": 0.8417045679256476, "mean_token_accuracy": 0.7193638682365417, "num_tokens": 23499553.0, "step": 8513, "train/ce_loss": 0.8414641618728638 }, { "epoch": 0.8417045679256476, "step": 8513, "train/sim_loss": 0.0546875 }, { "epoch": 0.8417045679256476, "step": 8513, "train/total_loss": 0.1388339102268219 }, { "entropy": 8.977106094360352, "epoch": 0.8418034407751631, "mean_token_accuracy": 0.7268656492233276, "num_tokens": 23504695.0, "step": 8514, "train/ce_loss": 1.0664161443710327 }, { "epoch": 0.8418034407751631, "step": 8514, "train/sim_loss": 0.0703125 }, { "epoch": 0.8418034407751631, "step": 8514, "train/total_loss": 0.17695412039756775 }, { "entropy": 8.835411071777344, "epoch": 0.8419023136246787, "mean_token_accuracy": 0.731121301651001, "num_tokens": 23510047.0, "step": 8515, "train/ce_loss": 0.530247688293457 }, { "epoch": 0.8419023136246787, "step": 8515, "train/sim_loss": 0.0390625 }, { "epoch": 0.8419023136246787, "step": 8515, "train/total_loss": 0.0920872688293457 }, { "entropy": 8.61327075958252, "epoch": 0.8420011864741942, "mean_token_accuracy": 0.7627856135368347, "num_tokens": 23515473.0, "step": 8516, "train/ce_loss": 1.1188116073608398 }, { "epoch": 0.8420011864741942, "step": 8516, "train/sim_loss": 0.125 }, { "epoch": 0.8420011864741942, "step": 8516, "train/total_loss": 0.23688116669654846 }, { "entropy": 9.671828269958496, "epoch": 0.8421000593237097, "mean_token_accuracy": 0.834645688533783, "num_tokens": 23520133.0, "step": 8517, "train/ce_loss": 3.985355306213023e-06 }, { "epoch": 0.8421000593237097, "step": 8517, "train/sim_loss": 0.0703125 }, { "epoch": 0.8421000593237097, "step": 8517, "train/total_loss": 0.07031289488077164 }, { "entropy": 8.810830116271973, "epoch": 0.8421989321732253, "mean_token_accuracy": 0.7390710115432739, "num_tokens": 23525305.0, "step": 8518, "train/ce_loss": 1.1688063144683838 }, { "epoch": 0.8421989321732253, "step": 8518, "train/sim_loss": 0.06640625 }, { "epoch": 0.8421989321732253, "step": 8518, "train/total_loss": 0.1832868754863739 }, { "entropy": 9.445963859558105, "epoch": 0.8422978050227408, "mean_token_accuracy": 0.7844611406326294, "num_tokens": 23530088.0, "step": 8519, "train/ce_loss": 2.614632194308797e-06 }, { "epoch": 0.8422978050227408, "step": 8519, "train/sim_loss": 0.02734375 }, { "epoch": 0.8422978050227408, "step": 8519, "train/total_loss": 0.027344010770320892 }, { "epoch": 0.8423966778722562, "grad_norm": 0.7130556702613831, "learning_rate": 7.896207288730653e-06, "loss": 0.1326, "step": 8520 }, { "entropy": 9.060623168945312, "epoch": 0.8423966778722562, "mean_token_accuracy": 0.733564019203186, "num_tokens": 23535111.0, "step": 8520, "train/ce_loss": 1.1208001375198364 }, { "epoch": 0.8423966778722562, "step": 8520, "train/sim_loss": 0.01953125 }, { "epoch": 0.8423966778722562, "step": 8520, "train/total_loss": 0.13161125779151917 }, { "entropy": 8.928070068359375, "epoch": 0.8424955507217718, "mean_token_accuracy": 0.7264630794525146, "num_tokens": 23540361.0, "step": 8521, "train/ce_loss": 1.4061381816864014 }, { "epoch": 0.8424955507217718, "step": 8521, "train/sim_loss": 0.0625 }, { "epoch": 0.8424955507217718, "step": 8521, "train/total_loss": 0.20311382412910461 }, { "entropy": 8.617700576782227, "epoch": 0.8425944235712873, "mean_token_accuracy": 0.6704730987548828, "num_tokens": 23545414.0, "step": 8522, "train/ce_loss": 2.2443900108337402 }, { "epoch": 0.8425944235712873, "step": 8522, "train/sim_loss": 0.0703125 }, { "epoch": 0.8425944235712873, "step": 8522, "train/total_loss": 0.29475152492523193 }, { "entropy": 8.72222900390625, "epoch": 0.8426932964208028, "mean_token_accuracy": 0.7211201786994934, "num_tokens": 23550708.0, "step": 8523, "train/ce_loss": 0.5025374889373779 }, { "epoch": 0.8426932964208028, "step": 8523, "train/sim_loss": 0.03515625 }, { "epoch": 0.8426932964208028, "step": 8523, "train/total_loss": 0.08540999889373779 }, { "entropy": 8.840371131896973, "epoch": 0.8427921692703184, "mean_token_accuracy": 0.7091690301895142, "num_tokens": 23555829.0, "step": 8524, "train/ce_loss": 1.5976157188415527 }, { "epoch": 0.8427921692703184, "step": 8524, "train/sim_loss": 0.05078125 }, { "epoch": 0.8427921692703184, "step": 8524, "train/total_loss": 0.21054282784461975 }, { "entropy": 8.897891998291016, "epoch": 0.8428910421198339, "mean_token_accuracy": 0.7764876484870911, "num_tokens": 23561122.0, "step": 8525, "train/ce_loss": 0.9834213852882385 }, { "epoch": 0.8428910421198339, "step": 8525, "train/sim_loss": 0.02734375 }, { "epoch": 0.8428910421198339, "step": 8525, "train/total_loss": 0.1256859004497528 }, { "entropy": 9.54948616027832, "epoch": 0.8429899149693494, "mean_token_accuracy": 0.7002288103103638, "num_tokens": 23565997.0, "step": 8526, "train/ce_loss": 1.4457448287430452e-06 }, { "epoch": 0.8429899149693494, "step": 8526, "train/sim_loss": 0.06640625 }, { "epoch": 0.8429899149693494, "step": 8526, "train/total_loss": 0.06640639156103134 }, { "entropy": 8.320509910583496, "epoch": 0.843088787818865, "mean_token_accuracy": 0.7177914381027222, "num_tokens": 23571297.0, "step": 8527, "train/ce_loss": 0.9710502624511719 }, { "epoch": 0.843088787818865, "step": 8527, "train/sim_loss": 0.03125 }, { "epoch": 0.843088787818865, "step": 8527, "train/total_loss": 0.1283550262451172 }, { "entropy": 8.480062484741211, "epoch": 0.8431876606683805, "mean_token_accuracy": 0.7616875767707825, "num_tokens": 23576669.0, "step": 8528, "train/ce_loss": 0.5755949020385742 }, { "epoch": 0.8431876606683805, "step": 8528, "train/sim_loss": 0.015625 }, { "epoch": 0.8431876606683805, "step": 8528, "train/total_loss": 0.07318449020385742 }, { "entropy": 9.013786315917969, "epoch": 0.843286533517896, "mean_token_accuracy": 0.7514124512672424, "num_tokens": 23581811.0, "step": 8529, "train/ce_loss": 1.2379498481750488 }, { "epoch": 0.843286533517896, "step": 8529, "train/sim_loss": 0.05859375 }, { "epoch": 0.843286533517896, "step": 8529, "train/total_loss": 0.18238873779773712 }, { "entropy": 9.010156631469727, "epoch": 0.8433854063674115, "mean_token_accuracy": 0.7237654328346252, "num_tokens": 23586901.0, "step": 8530, "train/ce_loss": 0.8623937964439392 }, { "epoch": 0.8433854063674115, "step": 8530, "train/sim_loss": 0.02734375 }, { "epoch": 0.8433854063674115, "step": 8530, "train/total_loss": 0.11358313262462616 }, { "entropy": 8.625732421875, "epoch": 0.843484279216927, "mean_token_accuracy": 0.7690504193305969, "num_tokens": 23592195.0, "step": 8531, "train/ce_loss": 0.524692714214325 }, { "epoch": 0.843484279216927, "step": 8531, "train/sim_loss": 0.08203125 }, { "epoch": 0.843484279216927, "step": 8531, "train/total_loss": 0.13450051844120026 }, { "entropy": 9.116499900817871, "epoch": 0.8435831520664425, "mean_token_accuracy": 0.7751572132110596, "num_tokens": 23597256.0, "step": 8532, "train/ce_loss": 1.6770809888839722 }, { "epoch": 0.8435831520664425, "step": 8532, "train/sim_loss": 0.0546875 }, { "epoch": 0.8435831520664425, "step": 8532, "train/total_loss": 0.22239559888839722 }, { "entropy": 9.061075210571289, "epoch": 0.8436820249159581, "mean_token_accuracy": 0.7697160840034485, "num_tokens": 23602309.0, "step": 8533, "train/ce_loss": 0.5759576559066772 }, { "epoch": 0.8436820249159581, "step": 8533, "train/sim_loss": 0.03515625 }, { "epoch": 0.8436820249159581, "step": 8533, "train/total_loss": 0.09275201708078384 }, { "entropy": 9.309283256530762, "epoch": 0.8437808977654736, "mean_token_accuracy": 0.7600849270820618, "num_tokens": 23607234.0, "step": 8534, "train/ce_loss": 5.001173235541501e-07 }, { "epoch": 0.8437808977654736, "step": 8534, "train/sim_loss": 0.01953125 }, { "epoch": 0.8437808977654736, "step": 8534, "train/total_loss": 0.01953130029141903 }, { "entropy": 9.241487503051758, "epoch": 0.8438797706149891, "mean_token_accuracy": 0.6947040557861328, "num_tokens": 23612257.0, "step": 8535, "train/ce_loss": 1.3567728996276855 }, { "epoch": 0.8438797706149891, "step": 8535, "train/sim_loss": 0.0703125 }, { "epoch": 0.8438797706149891, "step": 8535, "train/total_loss": 0.2059897929430008 }, { "entropy": 8.804858207702637, "epoch": 0.8439786434645047, "mean_token_accuracy": 0.6940194964408875, "num_tokens": 23617462.0, "step": 8536, "train/ce_loss": 1.312057614326477 }, { "epoch": 0.8439786434645047, "step": 8536, "train/sim_loss": 0.0390625 }, { "epoch": 0.8439786434645047, "step": 8536, "train/total_loss": 0.17026826739311218 }, { "entropy": 8.754676818847656, "epoch": 0.8440775163140202, "mean_token_accuracy": 0.7330623269081116, "num_tokens": 23622683.0, "step": 8537, "train/ce_loss": 0.9614264369010925 }, { "epoch": 0.8440775163140202, "step": 8537, "train/sim_loss": 0.03515625 }, { "epoch": 0.8440775163140202, "step": 8537, "train/total_loss": 0.13129889965057373 }, { "entropy": 8.331369400024414, "epoch": 0.8441763891635357, "mean_token_accuracy": 0.7461140155792236, "num_tokens": 23628181.0, "step": 8538, "train/ce_loss": 0.43463143706321716 }, { "epoch": 0.8441763891635357, "step": 8538, "train/sim_loss": 0.01953125 }, { "epoch": 0.8441763891635357, "step": 8538, "train/total_loss": 0.06299439072608948 }, { "entropy": 8.614924430847168, "epoch": 0.8442752620130513, "mean_token_accuracy": 0.7688098549842834, "num_tokens": 23633417.0, "step": 8539, "train/ce_loss": 0.895168125629425 }, { "epoch": 0.8442752620130513, "step": 8539, "train/sim_loss": 0.09375 }, { "epoch": 0.8442752620130513, "step": 8539, "train/total_loss": 0.18326681852340698 }, { "epoch": 0.8443741348625667, "grad_norm": 0.6965118646621704, "learning_rate": 7.891262423972705e-06, "loss": 0.132, "step": 8540 }, { "entropy": 9.057525634765625, "epoch": 0.8443741348625667, "mean_token_accuracy": 0.7484939694404602, "num_tokens": 23638500.0, "step": 8540, "train/ce_loss": 4.804210789188801e-07 }, { "epoch": 0.8443741348625667, "step": 8540, "train/sim_loss": 0.01171875 }, { "epoch": 0.8443741348625667, "step": 8540, "train/total_loss": 0.01171879842877388 }, { "entropy": 8.604438781738281, "epoch": 0.8444730077120822, "mean_token_accuracy": 0.7147846221923828, "num_tokens": 23643836.0, "step": 8541, "train/ce_loss": 1.0626955032348633 }, { "epoch": 0.8444730077120822, "step": 8541, "train/sim_loss": 0.046875 }, { "epoch": 0.8444730077120822, "step": 8541, "train/total_loss": 0.15314455330371857 }, { "entropy": 8.803327560424805, "epoch": 0.8445718805615978, "mean_token_accuracy": 0.7428977489471436, "num_tokens": 23649029.0, "step": 8542, "train/ce_loss": 0.7747955918312073 }, { "epoch": 0.8445718805615978, "step": 8542, "train/sim_loss": 0.08203125 }, { "epoch": 0.8445718805615978, "step": 8542, "train/total_loss": 0.15951082110404968 }, { "entropy": 8.597539901733398, "epoch": 0.8446707534111133, "mean_token_accuracy": 0.7333333492279053, "num_tokens": 23654293.0, "step": 8543, "train/ce_loss": 0.9350537657737732 }, { "epoch": 0.8446707534111133, "step": 8543, "train/sim_loss": 0.04296875 }, { "epoch": 0.8446707534111133, "step": 8543, "train/total_loss": 0.1364741325378418 }, { "entropy": 8.989640235900879, "epoch": 0.8447696262606288, "mean_token_accuracy": 0.6534771919250488, "num_tokens": 23659552.0, "step": 8544, "train/ce_loss": 2.057020902633667 }, { "epoch": 0.8447696262606288, "step": 8544, "train/sim_loss": 0.1015625 }, { "epoch": 0.8447696262606288, "step": 8544, "train/total_loss": 0.3072645962238312 }, { "entropy": 8.553593635559082, "epoch": 0.8448684991101444, "mean_token_accuracy": 0.7130801677703857, "num_tokens": 23664935.0, "step": 8545, "train/ce_loss": 0.978156328201294 }, { "epoch": 0.8448684991101444, "step": 8545, "train/sim_loss": 0.16015625 }, { "epoch": 0.8448684991101444, "step": 8545, "train/total_loss": 0.2579718828201294 }, { "entropy": 8.832277297973633, "epoch": 0.8449673719596599, "mean_token_accuracy": 0.7181926369667053, "num_tokens": 23670259.0, "step": 8546, "train/ce_loss": 0.8887196779251099 }, { "epoch": 0.8449673719596599, "step": 8546, "train/sim_loss": 0.05078125 }, { "epoch": 0.8449673719596599, "step": 8546, "train/total_loss": 0.13965322077274323 }, { "entropy": 9.03927993774414, "epoch": 0.8450662448091754, "mean_token_accuracy": 0.7805194854736328, "num_tokens": 23675503.0, "step": 8547, "train/ce_loss": 0.836915910243988 }, { "epoch": 0.8450662448091754, "step": 8547, "train/sim_loss": 0.0234375 }, { "epoch": 0.8450662448091754, "step": 8547, "train/total_loss": 0.10712908953428268 }, { "entropy": 9.511265754699707, "epoch": 0.845165117658691, "mean_token_accuracy": 0.7051281929016113, "num_tokens": 23680415.0, "step": 8548, "train/ce_loss": 1.317632794380188 }, { "epoch": 0.845165117658691, "step": 8548, "train/sim_loss": 0.03515625 }, { "epoch": 0.845165117658691, "step": 8548, "train/total_loss": 0.1669195294380188 }, { "entropy": 8.35056209564209, "epoch": 0.8452639905082064, "mean_token_accuracy": 0.6816367506980896, "num_tokens": 23685928.0, "step": 8549, "train/ce_loss": 0.5039235353469849 }, { "epoch": 0.8452639905082064, "step": 8549, "train/sim_loss": 0.0625 }, { "epoch": 0.8452639905082064, "step": 8549, "train/total_loss": 0.11289235949516296 }, { "entropy": 8.82366943359375, "epoch": 0.8453628633577219, "mean_token_accuracy": 0.6337500214576721, "num_tokens": 23691174.0, "step": 8550, "train/ce_loss": 1.8179452419281006 }, { "epoch": 0.8453628633577219, "step": 8550, "train/sim_loss": 0.1015625 }, { "epoch": 0.8453628633577219, "step": 8550, "train/total_loss": 0.28335702419281006 }, { "entropy": 8.933710098266602, "epoch": 0.8454617362072375, "mean_token_accuracy": 0.7696709632873535, "num_tokens": 23696311.0, "step": 8551, "train/ce_loss": 0.43073558807373047 }, { "epoch": 0.8454617362072375, "step": 8551, "train/sim_loss": 0.0546875 }, { "epoch": 0.8454617362072375, "step": 8551, "train/total_loss": 0.09776106476783752 }, { "entropy": 8.46588134765625, "epoch": 0.845560609056753, "mean_token_accuracy": 0.7494845390319824, "num_tokens": 23701739.0, "step": 8552, "train/ce_loss": 1.047522783279419 }, { "epoch": 0.845560609056753, "step": 8552, "train/sim_loss": 0.0234375 }, { "epoch": 0.845560609056753, "step": 8552, "train/total_loss": 0.12818977236747742 }, { "entropy": 8.805068969726562, "epoch": 0.8456594819062685, "mean_token_accuracy": 0.7956104278564453, "num_tokens": 23706978.0, "step": 8553, "train/ce_loss": 1.2623344218809507e-06 }, { "epoch": 0.8456594819062685, "step": 8553, "train/sim_loss": 0.0546875 }, { "epoch": 0.8456594819062685, "step": 8553, "train/total_loss": 0.05468762665987015 }, { "entropy": 8.793703079223633, "epoch": 0.8457583547557841, "mean_token_accuracy": 0.7777777910232544, "num_tokens": 23712031.0, "step": 8554, "train/ce_loss": 0.9132997989654541 }, { "epoch": 0.8457583547557841, "step": 8554, "train/sim_loss": 0.0390625 }, { "epoch": 0.8457583547557841, "step": 8554, "train/total_loss": 0.13039249181747437 }, { "entropy": 8.934954643249512, "epoch": 0.8458572276052996, "mean_token_accuracy": 0.6742532253265381, "num_tokens": 23717178.0, "step": 8555, "train/ce_loss": 4.5444750185197336e-07 }, { "epoch": 0.8458572276052996, "step": 8555, "train/sim_loss": 0.015625 }, { "epoch": 0.8458572276052996, "step": 8555, "train/total_loss": 0.01562504470348358 }, { "entropy": 8.994284629821777, "epoch": 0.8459561004548151, "mean_token_accuracy": 0.7478134036064148, "num_tokens": 23722343.0, "step": 8556, "train/ce_loss": 1.5355756282806396 }, { "epoch": 0.8459561004548151, "step": 8556, "train/sim_loss": 0.06640625 }, { "epoch": 0.8459561004548151, "step": 8556, "train/total_loss": 0.21996381878852844 }, { "entropy": 8.861419677734375, "epoch": 0.8460549733043307, "mean_token_accuracy": 0.7296416759490967, "num_tokens": 23727741.0, "step": 8557, "train/ce_loss": 1.1478617191314697 }, { "epoch": 0.8460549733043307, "step": 8557, "train/sim_loss": 0.1171875 }, { "epoch": 0.8460549733043307, "step": 8557, "train/total_loss": 0.23197367787361145 }, { "entropy": 9.423959732055664, "epoch": 0.8461538461538461, "mean_token_accuracy": 0.7896679043769836, "num_tokens": 23732738.0, "step": 8558, "train/ce_loss": 0.9041481018066406 }, { "epoch": 0.8461538461538461, "step": 8558, "train/sim_loss": 0.0703125 }, { "epoch": 0.8461538461538461, "step": 8558, "train/total_loss": 0.16072732210159302 }, { "entropy": 8.978797912597656, "epoch": 0.8462527190033616, "mean_token_accuracy": 0.7086092829704285, "num_tokens": 23737967.0, "step": 8559, "train/ce_loss": 0.9606765508651733 }, { "epoch": 0.8462527190033616, "step": 8559, "train/sim_loss": 0.078125 }, { "epoch": 0.8462527190033616, "step": 8559, "train/total_loss": 0.1741926670074463 }, { "epoch": 0.8463515918528772, "grad_norm": 0.7191409468650818, "learning_rate": 7.886317559214756e-06, "loss": 0.1419, "step": 8560 }, { "entropy": 8.879575729370117, "epoch": 0.8463515918528772, "mean_token_accuracy": 0.7217962741851807, "num_tokens": 23743346.0, "step": 8560, "train/ce_loss": 0.960830807685852 }, { "epoch": 0.8463515918528772, "step": 8560, "train/sim_loss": 0.08984375 }, { "epoch": 0.8463515918528772, "step": 8560, "train/total_loss": 0.18592682480812073 }, { "entropy": 9.213173866271973, "epoch": 0.8464504647023927, "mean_token_accuracy": 0.7724252343177795, "num_tokens": 23748358.0, "step": 8561, "train/ce_loss": 0.8338235020637512 }, { "epoch": 0.8464504647023927, "step": 8561, "train/sim_loss": 0.03515625 }, { "epoch": 0.8464504647023927, "step": 8561, "train/total_loss": 0.11853860318660736 }, { "entropy": 8.947521209716797, "epoch": 0.8465493375519082, "mean_token_accuracy": 0.7548387050628662, "num_tokens": 23753612.0, "step": 8562, "train/ce_loss": 0.5646962523460388 }, { "epoch": 0.8465493375519082, "step": 8562, "train/sim_loss": 0.09765625 }, { "epoch": 0.8465493375519082, "step": 8562, "train/total_loss": 0.1541258692741394 }, { "entropy": 9.613811492919922, "epoch": 0.8466482104014238, "mean_token_accuracy": 0.7535211443901062, "num_tokens": 23758428.0, "step": 8563, "train/ce_loss": 7.110681963240495e-06 }, { "epoch": 0.8466482104014238, "step": 8563, "train/sim_loss": 0.05078125 }, { "epoch": 0.8466482104014238, "step": 8563, "train/total_loss": 0.050781961530447006 }, { "entropy": 9.07571029663086, "epoch": 0.8467470832509393, "mean_token_accuracy": 0.6820987462997437, "num_tokens": 23763537.0, "step": 8564, "train/ce_loss": 0.8961170315742493 }, { "epoch": 0.8467470832509393, "step": 8564, "train/sim_loss": 0.0390625 }, { "epoch": 0.8467470832509393, "step": 8564, "train/total_loss": 0.1286742091178894 }, { "entropy": 8.552220344543457, "epoch": 0.8468459561004548, "mean_token_accuracy": 0.7806385159492493, "num_tokens": 23768918.0, "step": 8565, "train/ce_loss": 0.8199772238731384 }, { "epoch": 0.8468459561004548, "step": 8565, "train/sim_loss": 0.03515625 }, { "epoch": 0.8468459561004548, "step": 8565, "train/total_loss": 0.11715397238731384 }, { "entropy": 9.405799865722656, "epoch": 0.8469448289499704, "mean_token_accuracy": 0.7757936716079712, "num_tokens": 23773824.0, "step": 8566, "train/ce_loss": 1.1489684581756592 }, { "epoch": 0.8469448289499704, "step": 8566, "train/sim_loss": 0.015625 }, { "epoch": 0.8469448289499704, "step": 8566, "train/total_loss": 0.13052184879779816 }, { "entropy": 8.768903732299805, "epoch": 0.8470437017994858, "mean_token_accuracy": 0.7369093298912048, "num_tokens": 23779078.0, "step": 8567, "train/ce_loss": 1.2097439765930176 }, { "epoch": 0.8470437017994858, "step": 8567, "train/sim_loss": 0.0546875 }, { "epoch": 0.8470437017994858, "step": 8567, "train/total_loss": 0.17566189169883728 }, { "entropy": 8.474605560302734, "epoch": 0.8471425746490013, "mean_token_accuracy": 0.7535714507102966, "num_tokens": 23784381.0, "step": 8568, "train/ce_loss": 0.6563302874565125 }, { "epoch": 0.8471425746490013, "step": 8568, "train/sim_loss": 0.01171875 }, { "epoch": 0.8471425746490013, "step": 8568, "train/total_loss": 0.07735177874565125 }, { "entropy": 8.380297660827637, "epoch": 0.8472414474985169, "mean_token_accuracy": 0.7276736497879028, "num_tokens": 23789765.0, "step": 8569, "train/ce_loss": 1.0520323514938354 }, { "epoch": 0.8472414474985169, "step": 8569, "train/sim_loss": 0.0390625 }, { "epoch": 0.8472414474985169, "step": 8569, "train/total_loss": 0.14426574110984802 }, { "entropy": 9.462567329406738, "epoch": 0.8473403203480324, "mean_token_accuracy": 0.75, "num_tokens": 23794751.0, "step": 8570, "train/ce_loss": 1.1305701264063828e-06 }, { "epoch": 0.8473403203480324, "step": 8570, "train/sim_loss": 0.0390625 }, { "epoch": 0.8473403203480324, "step": 8570, "train/total_loss": 0.039062611758708954 }, { "entropy": 8.370973587036133, "epoch": 0.8474391931975479, "mean_token_accuracy": 0.7162954211235046, "num_tokens": 23800133.0, "step": 8571, "train/ce_loss": 0.7542774677276611 }, { "epoch": 0.8474391931975479, "step": 8571, "train/sim_loss": 0.02734375 }, { "epoch": 0.8474391931975479, "step": 8571, "train/total_loss": 0.10277149826288223 }, { "entropy": 8.593929290771484, "epoch": 0.8475380660470635, "mean_token_accuracy": 0.7829294204711914, "num_tokens": 23805505.0, "step": 8572, "train/ce_loss": 1.0726655721664429 }, { "epoch": 0.8475380660470635, "step": 8572, "train/sim_loss": 0.0703125 }, { "epoch": 0.8475380660470635, "step": 8572, "train/total_loss": 0.17757906019687653 }, { "entropy": 8.853445053100586, "epoch": 0.847636938896579, "mean_token_accuracy": 0.7512755393981934, "num_tokens": 23810777.0, "step": 8573, "train/ce_loss": 1.2246817350387573 }, { "epoch": 0.847636938896579, "step": 8573, "train/sim_loss": 0.09375 }, { "epoch": 0.847636938896579, "step": 8573, "train/total_loss": 0.21621817350387573 }, { "entropy": 8.767151832580566, "epoch": 0.8477358117460945, "mean_token_accuracy": 0.7639665007591248, "num_tokens": 23815947.0, "step": 8574, "train/ce_loss": 0.9610381126403809 }, { "epoch": 0.8477358117460945, "step": 8574, "train/sim_loss": 0.04296875 }, { "epoch": 0.8477358117460945, "step": 8574, "train/total_loss": 0.13907256722450256 }, { "entropy": 8.860359191894531, "epoch": 0.8478346845956101, "mean_token_accuracy": 0.7936893105506897, "num_tokens": 23821210.0, "step": 8575, "train/ce_loss": 0.795008659362793 }, { "epoch": 0.8478346845956101, "step": 8575, "train/sim_loss": 0.05078125 }, { "epoch": 0.8478346845956101, "step": 8575, "train/total_loss": 0.13028211891651154 }, { "entropy": 8.468084335327148, "epoch": 0.8479335574451256, "mean_token_accuracy": 0.6940928101539612, "num_tokens": 23826593.0, "step": 8576, "train/ce_loss": 0.7455776333808899 }, { "epoch": 0.8479335574451256, "step": 8576, "train/sim_loss": 0.0390625 }, { "epoch": 0.8479335574451256, "step": 8576, "train/total_loss": 0.11362026631832123 }, { "entropy": 8.811766624450684, "epoch": 0.848032430294641, "mean_token_accuracy": 0.7359477281570435, "num_tokens": 23831802.0, "step": 8577, "train/ce_loss": 0.7632312178611755 }, { "epoch": 0.848032430294641, "step": 8577, "train/sim_loss": 0.05078125 }, { "epoch": 0.848032430294641, "step": 8577, "train/total_loss": 0.12710437178611755 }, { "entropy": 8.497659683227539, "epoch": 0.8481313031441566, "mean_token_accuracy": 0.715871274471283, "num_tokens": 23837209.0, "step": 8578, "train/ce_loss": 0.6656380295753479 }, { "epoch": 0.8481313031441566, "step": 8578, "train/sim_loss": 0.015625 }, { "epoch": 0.8481313031441566, "step": 8578, "train/total_loss": 0.08218880742788315 }, { "entropy": 8.694424629211426, "epoch": 0.8482301759936721, "mean_token_accuracy": 0.7347715497016907, "num_tokens": 23842508.0, "step": 8579, "train/ce_loss": 0.8246199488639832 }, { "epoch": 0.8482301759936721, "step": 8579, "train/sim_loss": 0.02734375 }, { "epoch": 0.8482301759936721, "step": 8579, "train/total_loss": 0.10980574786663055 }, { "epoch": 0.8483290488431876, "grad_norm": 0.6876291036605835, "learning_rate": 7.881372694456806e-06, "loss": 0.1312, "step": 8580 }, { "entropy": 9.102163314819336, "epoch": 0.8483290488431876, "mean_token_accuracy": 0.8110516667366028, "num_tokens": 23847560.0, "step": 8580, "train/ce_loss": 9.460008527639729e-07 }, { "epoch": 0.8483290488431876, "step": 8580, "train/sim_loss": 0.03515625 }, { "epoch": 0.8483290488431876, "step": 8580, "train/total_loss": 0.03515634313225746 }, { "entropy": 8.552837371826172, "epoch": 0.8484279216927032, "mean_token_accuracy": 0.7890088558197021, "num_tokens": 23853053.0, "step": 8581, "train/ce_loss": 0.6224937438964844 }, { "epoch": 0.8484279216927032, "step": 8581, "train/sim_loss": 0.0390625 }, { "epoch": 0.8484279216927032, "step": 8581, "train/total_loss": 0.10131187736988068 }, { "entropy": 8.294087409973145, "epoch": 0.8485267945422187, "mean_token_accuracy": 0.7723258137702942, "num_tokens": 23858541.0, "step": 8582, "train/ce_loss": 0.699788510799408 }, { "epoch": 0.8485267945422187, "step": 8582, "train/sim_loss": 0.04296875 }, { "epoch": 0.8485267945422187, "step": 8582, "train/total_loss": 0.11294760555028915 }, { "entropy": 9.38934326171875, "epoch": 0.8486256673917343, "mean_token_accuracy": 0.7436892986297607, "num_tokens": 23863503.0, "step": 8583, "train/ce_loss": 0.9503109455108643 }, { "epoch": 0.8486256673917343, "step": 8583, "train/sim_loss": 0.05078125 }, { "epoch": 0.8486256673917343, "step": 8583, "train/total_loss": 0.14581234753131866 }, { "entropy": 9.223760604858398, "epoch": 0.8487245402412498, "mean_token_accuracy": 0.7319148778915405, "num_tokens": 23868383.0, "step": 8584, "train/ce_loss": 5.062517516307707e-07 }, { "epoch": 0.8487245402412498, "step": 8584, "train/sim_loss": 0.05078125 }, { "epoch": 0.8487245402412498, "step": 8584, "train/total_loss": 0.05078130215406418 }, { "entropy": 8.907994270324707, "epoch": 0.8488234130907653, "mean_token_accuracy": 0.7239512801170349, "num_tokens": 23873611.0, "step": 8585, "train/ce_loss": 0.813693106174469 }, { "epoch": 0.8488234130907653, "step": 8585, "train/sim_loss": 0.046875 }, { "epoch": 0.8488234130907653, "step": 8585, "train/total_loss": 0.1282443106174469 }, { "entropy": 8.57661247253418, "epoch": 0.8489222859402809, "mean_token_accuracy": 0.7664429545402527, "num_tokens": 23878867.0, "step": 8586, "train/ce_loss": 0.8562458753585815 }, { "epoch": 0.8489222859402809, "step": 8586, "train/sim_loss": 0.03125 }, { "epoch": 0.8489222859402809, "step": 8586, "train/total_loss": 0.1168745905160904 }, { "entropy": 8.623764038085938, "epoch": 0.8490211587897963, "mean_token_accuracy": 0.7465224266052246, "num_tokens": 23884026.0, "step": 8587, "train/ce_loss": 0.5021078586578369 }, { "epoch": 0.8490211587897963, "step": 8587, "train/sim_loss": 0.06640625 }, { "epoch": 0.8490211587897963, "step": 8587, "train/total_loss": 0.11661703884601593 }, { "entropy": 9.092384338378906, "epoch": 0.8491200316393118, "mean_token_accuracy": 0.6960926055908203, "num_tokens": 23889140.0, "step": 8588, "train/ce_loss": 1.0651516914367676 }, { "epoch": 0.8491200316393118, "step": 8588, "train/sim_loss": 0.05859375 }, { "epoch": 0.8491200316393118, "step": 8588, "train/total_loss": 0.16510891914367676 }, { "entropy": 8.752415657043457, "epoch": 0.8492189044888274, "mean_token_accuracy": 0.7476537823677063, "num_tokens": 23894586.0, "step": 8589, "train/ce_loss": 0.5429462790489197 }, { "epoch": 0.8492189044888274, "step": 8589, "train/sim_loss": 0.078125 }, { "epoch": 0.8492189044888274, "step": 8589, "train/total_loss": 0.1324196308851242 }, { "entropy": 9.215569496154785, "epoch": 0.8493177773383429, "mean_token_accuracy": 0.8021978139877319, "num_tokens": 23899594.0, "step": 8590, "train/ce_loss": 8.209082693610981e-07 }, { "epoch": 0.8493177773383429, "step": 8590, "train/sim_loss": 0.0703125 }, { "epoch": 0.8493177773383429, "step": 8590, "train/total_loss": 0.07031258195638657 }, { "entropy": 8.700742721557617, "epoch": 0.8494166501878584, "mean_token_accuracy": 0.7142857313156128, "num_tokens": 23904918.0, "step": 8591, "train/ce_loss": 0.5889235138893127 }, { "epoch": 0.8494166501878584, "step": 8591, "train/sim_loss": 0.0390625 }, { "epoch": 0.8494166501878584, "step": 8591, "train/total_loss": 0.09795485436916351 }, { "entropy": 8.850866317749023, "epoch": 0.849515523037374, "mean_token_accuracy": 0.7548543810844421, "num_tokens": 23910206.0, "step": 8592, "train/ce_loss": 0.4724769592285156 }, { "epoch": 0.849515523037374, "step": 8592, "train/sim_loss": 0.05078125 }, { "epoch": 0.849515523037374, "step": 8592, "train/total_loss": 0.09802894294261932 }, { "entropy": 9.597957611083984, "epoch": 0.8496143958868895, "mean_token_accuracy": 0.738161563873291, "num_tokens": 23914980.0, "step": 8593, "train/ce_loss": 1.9964507818222046 }, { "epoch": 0.8496143958868895, "step": 8593, "train/sim_loss": 0.07421875 }, { "epoch": 0.8496143958868895, "step": 8593, "train/total_loss": 0.27386385202407837 }, { "entropy": 8.678569793701172, "epoch": 0.849713268736405, "mean_token_accuracy": 0.6766623258590698, "num_tokens": 23920252.0, "step": 8594, "train/ce_loss": 1.3998847007751465 }, { "epoch": 0.849713268736405, "step": 8594, "train/sim_loss": 0.0703125 }, { "epoch": 0.849713268736405, "step": 8594, "train/total_loss": 0.2103009670972824 }, { "entropy": 8.598791122436523, "epoch": 0.8498121415859206, "mean_token_accuracy": 0.7106481194496155, "num_tokens": 23925626.0, "step": 8595, "train/ce_loss": 0.6554878950119019 }, { "epoch": 0.8498121415859206, "step": 8595, "train/sim_loss": 0.0625 }, { "epoch": 0.8498121415859206, "step": 8595, "train/total_loss": 0.12804879248142242 }, { "entropy": 8.57380485534668, "epoch": 0.849911014435436, "mean_token_accuracy": 0.738170325756073, "num_tokens": 23930999.0, "step": 8596, "train/ce_loss": 1.0399413108825684 }, { "epoch": 0.849911014435436, "step": 8596, "train/sim_loss": 0.04296875 }, { "epoch": 0.849911014435436, "step": 8596, "train/total_loss": 0.14696288108825684 }, { "entropy": 8.762775421142578, "epoch": 0.8500098872849515, "mean_token_accuracy": 0.7331759333610535, "num_tokens": 23936310.0, "step": 8597, "train/ce_loss": 0.8365949988365173 }, { "epoch": 0.8500098872849515, "step": 8597, "train/sim_loss": 0.02734375 }, { "epoch": 0.8500098872849515, "step": 8597, "train/total_loss": 0.11100324988365173 }, { "entropy": 9.271051406860352, "epoch": 0.8501087601344671, "mean_token_accuracy": 0.7364568114280701, "num_tokens": 23941398.0, "step": 8598, "train/ce_loss": 1.2323108911514282 }, { "epoch": 0.8501087601344671, "step": 8598, "train/sim_loss": 0.03515625 }, { "epoch": 0.8501087601344671, "step": 8598, "train/total_loss": 0.15838733315467834 }, { "entropy": 8.725774765014648, "epoch": 0.8502076329839826, "mean_token_accuracy": 0.7077682018280029, "num_tokens": 23946664.0, "step": 8599, "train/ce_loss": 0.5549308657646179 }, { "epoch": 0.8502076329839826, "step": 8599, "train/sim_loss": 0.0625 }, { "epoch": 0.8502076329839826, "step": 8599, "train/total_loss": 0.11799308657646179 }, { "epoch": 0.8503065058334981, "grad_norm": 0.736488401889801, "learning_rate": 7.876427829698859e-06, "loss": 0.1359, "step": 8600 }, { "entropy": 8.868095397949219, "epoch": 0.8503065058334981, "mean_token_accuracy": 0.7949790954589844, "num_tokens": 23951870.0, "step": 8600, "train/ce_loss": 0.7157239317893982 }, { "epoch": 0.8503065058334981, "step": 8600, "train/sim_loss": 0.0390625 }, { "epoch": 0.8503065058334981, "step": 8600, "train/total_loss": 0.11063489317893982 }, { "entropy": 8.631006240844727, "epoch": 0.8504053786830137, "mean_token_accuracy": 0.7225305438041687, "num_tokens": 23957208.0, "step": 8601, "train/ce_loss": 0.7594647407531738 }, { "epoch": 0.8504053786830137, "step": 8601, "train/sim_loss": 0.04296875 }, { "epoch": 0.8504053786830137, "step": 8601, "train/total_loss": 0.11891522258520126 }, { "entropy": 8.989002227783203, "epoch": 0.8505042515325292, "mean_token_accuracy": 0.7481371164321899, "num_tokens": 23962275.0, "step": 8602, "train/ce_loss": 0.7256680727005005 }, { "epoch": 0.8505042515325292, "step": 8602, "train/sim_loss": 0.046875 }, { "epoch": 0.8505042515325292, "step": 8602, "train/total_loss": 0.11944180727005005 }, { "entropy": 8.472922325134277, "epoch": 0.8506031243820447, "mean_token_accuracy": 0.7381423115730286, "num_tokens": 23967743.0, "step": 8603, "train/ce_loss": 1.0811166763305664 }, { "epoch": 0.8506031243820447, "step": 8603, "train/sim_loss": 0.04296875 }, { "epoch": 0.8506031243820447, "step": 8603, "train/total_loss": 0.1510804295539856 }, { "entropy": 8.90719223022461, "epoch": 0.8507019972315603, "mean_token_accuracy": 0.730654776096344, "num_tokens": 23972855.0, "step": 8604, "train/ce_loss": 4.292879225431534e-07 }, { "epoch": 0.8507019972315603, "step": 8604, "train/sim_loss": 0.046875 }, { "epoch": 0.8507019972315603, "step": 8604, "train/total_loss": 0.04687504470348358 }, { "entropy": 8.540361404418945, "epoch": 0.8508008700810757, "mean_token_accuracy": 0.7266355156898499, "num_tokens": 23978191.0, "step": 8605, "train/ce_loss": 0.5352979898452759 }, { "epoch": 0.8508008700810757, "step": 8605, "train/sim_loss": 0.0234375 }, { "epoch": 0.8508008700810757, "step": 8605, "train/total_loss": 0.07696729898452759 }, { "entropy": 9.061471939086914, "epoch": 0.8508997429305912, "mean_token_accuracy": 0.8200590014457703, "num_tokens": 23983318.0, "step": 8606, "train/ce_loss": 7.412546665364061e-07 }, { "epoch": 0.8508997429305912, "step": 8606, "train/sim_loss": 0.0625 }, { "epoch": 0.8508997429305912, "step": 8606, "train/total_loss": 0.06250007450580597 }, { "entropy": 8.739973068237305, "epoch": 0.8509986157801068, "mean_token_accuracy": 0.7247474789619446, "num_tokens": 23988557.0, "step": 8607, "train/ce_loss": 0.7236993908882141 }, { "epoch": 0.8509986157801068, "step": 8607, "train/sim_loss": 0.07421875 }, { "epoch": 0.8509986157801068, "step": 8607, "train/total_loss": 0.14658868312835693 }, { "entropy": 9.018573760986328, "epoch": 0.8510974886296223, "mean_token_accuracy": 0.7268170714378357, "num_tokens": 23994006.0, "step": 8608, "train/ce_loss": 1.1890531778335571 }, { "epoch": 0.8510974886296223, "step": 8608, "train/sim_loss": 0.06640625 }, { "epoch": 0.8510974886296223, "step": 8608, "train/total_loss": 0.18531157076358795 }, { "entropy": 8.670655250549316, "epoch": 0.8511963614791378, "mean_token_accuracy": 0.7262773513793945, "num_tokens": 23999337.0, "step": 8609, "train/ce_loss": 0.41880103945732117 }, { "epoch": 0.8511963614791378, "step": 8609, "train/sim_loss": 0.015625 }, { "epoch": 0.8511963614791378, "step": 8609, "train/total_loss": 0.057505104690790176 }, { "entropy": 8.736785888671875, "epoch": 0.8512952343286534, "mean_token_accuracy": 0.7797872424125671, "num_tokens": 24004694.0, "step": 8610, "train/ce_loss": 0.5912141799926758 }, { "epoch": 0.8512952343286534, "step": 8610, "train/sim_loss": 0.03125 }, { "epoch": 0.8512952343286534, "step": 8610, "train/total_loss": 0.09037141501903534 }, { "entropy": 9.069501876831055, "epoch": 0.8513941071781689, "mean_token_accuracy": 0.7878260612487793, "num_tokens": 24009678.0, "step": 8611, "train/ce_loss": 5.896106358704856e-07 }, { "epoch": 0.8513941071781689, "step": 8611, "train/sim_loss": 0.046875 }, { "epoch": 0.8513941071781689, "step": 8611, "train/total_loss": 0.046875059604644775 }, { "entropy": 8.639055252075195, "epoch": 0.8514929800276844, "mean_token_accuracy": 0.7667887806892395, "num_tokens": 24014937.0, "step": 8612, "train/ce_loss": 0.49192896485328674 }, { "epoch": 0.8514929800276844, "step": 8612, "train/sim_loss": 0.0234375 }, { "epoch": 0.8514929800276844, "step": 8612, "train/total_loss": 0.0726303979754448 }, { "entropy": 8.987211227416992, "epoch": 0.8515918528772, "mean_token_accuracy": 0.7503566145896912, "num_tokens": 24020045.0, "step": 8613, "train/ce_loss": 2.2853257632959867e-06 }, { "epoch": 0.8515918528772, "step": 8613, "train/sim_loss": 0.06640625 }, { "epoch": 0.8515918528772, "step": 8613, "train/total_loss": 0.0664064809679985 }, { "entropy": 8.735445976257324, "epoch": 0.8516907257267154, "mean_token_accuracy": 0.7709359526634216, "num_tokens": 24025315.0, "step": 8614, "train/ce_loss": 0.8446625471115112 }, { "epoch": 0.8516907257267154, "step": 8614, "train/sim_loss": 0.02734375 }, { "epoch": 0.8516907257267154, "step": 8614, "train/total_loss": 0.11181000620126724 }, { "entropy": 8.674492835998535, "epoch": 0.8517895985762309, "mean_token_accuracy": 0.8137565851211548, "num_tokens": 24030721.0, "step": 8615, "train/ce_loss": 0.6308075785636902 }, { "epoch": 0.8517895985762309, "step": 8615, "train/sim_loss": 0.0546875 }, { "epoch": 0.8517895985762309, "step": 8615, "train/total_loss": 0.11776825785636902 }, { "entropy": 9.038165092468262, "epoch": 0.8518884714257465, "mean_token_accuracy": 0.6757457852363586, "num_tokens": 24035949.0, "step": 8616, "train/ce_loss": 1.302553415298462 }, { "epoch": 0.8518884714257465, "step": 8616, "train/sim_loss": 0.0546875 }, { "epoch": 0.8518884714257465, "step": 8616, "train/total_loss": 0.1849428415298462 }, { "entropy": 8.956336975097656, "epoch": 0.851987344275262, "mean_token_accuracy": 0.7837445735931396, "num_tokens": 24041072.0, "step": 8617, "train/ce_loss": 0.46485111117362976 }, { "epoch": 0.851987344275262, "step": 8617, "train/sim_loss": 0.01953125 }, { "epoch": 0.851987344275262, "step": 8617, "train/total_loss": 0.06601636111736298 }, { "entropy": 8.625548362731934, "epoch": 0.8520862171247775, "mean_token_accuracy": 0.7753530144691467, "num_tokens": 24046361.0, "step": 8618, "train/ce_loss": 0.8610982298851013 }, { "epoch": 0.8520862171247775, "step": 8618, "train/sim_loss": 0.01953125 }, { "epoch": 0.8520862171247775, "step": 8618, "train/total_loss": 0.10564107447862625 }, { "entropy": 8.861141204833984, "epoch": 0.8521850899742931, "mean_token_accuracy": 0.788170576095581, "num_tokens": 24051489.0, "step": 8619, "train/ce_loss": 0.5501077771186829 }, { "epoch": 0.8521850899742931, "step": 8619, "train/sim_loss": 0.06640625 }, { "epoch": 0.8521850899742931, "step": 8619, "train/total_loss": 0.12141703069210052 }, { "epoch": 0.8522839628238086, "grad_norm": 0.5112330913543701, "learning_rate": 7.87148296494091e-06, "loss": 0.1179, "step": 8620 }, { "entropy": 8.934500694274902, "epoch": 0.8522839628238086, "mean_token_accuracy": 0.727748692035675, "num_tokens": 24056752.0, "step": 8620, "train/ce_loss": 0.6823888421058655 }, { "epoch": 0.8522839628238086, "step": 8620, "train/sim_loss": 0.046875 }, { "epoch": 0.8522839628238086, "step": 8620, "train/total_loss": 0.11511388421058655 }, { "entropy": 9.102067947387695, "epoch": 0.8523828356733241, "mean_token_accuracy": 0.704402506351471, "num_tokens": 24061844.0, "step": 8621, "train/ce_loss": 1.138018012046814 }, { "epoch": 0.8523828356733241, "step": 8621, "train/sim_loss": 0.05859375 }, { "epoch": 0.8523828356733241, "step": 8621, "train/total_loss": 0.17239555716514587 }, { "entropy": 8.598770141601562, "epoch": 0.8524817085228397, "mean_token_accuracy": 0.7490774989128113, "num_tokens": 24067075.0, "step": 8622, "train/ce_loss": 0.7021009922027588 }, { "epoch": 0.8524817085228397, "step": 8622, "train/sim_loss": 0.0546875 }, { "epoch": 0.8524817085228397, "step": 8622, "train/total_loss": 0.12489759922027588 }, { "entropy": 8.624722480773926, "epoch": 0.8525805813723552, "mean_token_accuracy": 0.7587034702301025, "num_tokens": 24072299.0, "step": 8623, "train/ce_loss": 0.8237633109092712 }, { "epoch": 0.8525805813723552, "step": 8623, "train/sim_loss": 0.0390625 }, { "epoch": 0.8525805813723552, "step": 8623, "train/total_loss": 0.12143883109092712 }, { "entropy": 9.024396896362305, "epoch": 0.8526794542218706, "mean_token_accuracy": 0.779411792755127, "num_tokens": 24077336.0, "step": 8624, "train/ce_loss": 1.0187090635299683 }, { "epoch": 0.8526794542218706, "step": 8624, "train/sim_loss": 0.04296875 }, { "epoch": 0.8526794542218706, "step": 8624, "train/total_loss": 0.14483965933322906 }, { "entropy": 8.925800323486328, "epoch": 0.8527783270713862, "mean_token_accuracy": 0.7268722653388977, "num_tokens": 24082508.0, "step": 8625, "train/ce_loss": 1.5128824710845947 }, { "epoch": 0.8527783270713862, "step": 8625, "train/sim_loss": 0.05859375 }, { "epoch": 0.8527783270713862, "step": 8625, "train/total_loss": 0.2098820060491562 }, { "entropy": 8.57644271850586, "epoch": 0.8528771999209017, "mean_token_accuracy": 0.7945075631141663, "num_tokens": 24088036.0, "step": 8626, "train/ce_loss": 0.8464938402175903 }, { "epoch": 0.8528771999209017, "step": 8626, "train/sim_loss": 0.01953125 }, { "epoch": 0.8528771999209017, "step": 8626, "train/total_loss": 0.10418063402175903 }, { "entropy": 9.06151008605957, "epoch": 0.8529760727704172, "mean_token_accuracy": 0.7252747416496277, "num_tokens": 24093196.0, "step": 8627, "train/ce_loss": 1.6834861040115356 }, { "epoch": 0.8529760727704172, "step": 8627, "train/sim_loss": 0.0546875 }, { "epoch": 0.8529760727704172, "step": 8627, "train/total_loss": 0.22303611040115356 }, { "entropy": 9.575900077819824, "epoch": 0.8530749456199328, "mean_token_accuracy": 0.780701756477356, "num_tokens": 24097951.0, "step": 8628, "train/ce_loss": 1.3008063888264587e-06 }, { "epoch": 0.8530749456199328, "step": 8628, "train/sim_loss": 0.0546875 }, { "epoch": 0.8530749456199328, "step": 8628, "train/total_loss": 0.054687630385160446 }, { "entropy": 8.6054105758667, "epoch": 0.8531738184694483, "mean_token_accuracy": 0.695652186870575, "num_tokens": 24103249.0, "step": 8629, "train/ce_loss": 0.9870397448539734 }, { "epoch": 0.8531738184694483, "step": 8629, "train/sim_loss": 0.05078125 }, { "epoch": 0.8531738184694483, "step": 8629, "train/total_loss": 0.14948523044586182 }, { "entropy": 8.558268547058105, "epoch": 0.8532726913189638, "mean_token_accuracy": 0.8256983160972595, "num_tokens": 24108620.0, "step": 8630, "train/ce_loss": 0.5947093367576599 }, { "epoch": 0.8532726913189638, "step": 8630, "train/sim_loss": 0.0234375 }, { "epoch": 0.8532726913189638, "step": 8630, "train/total_loss": 0.08290843665599823 }, { "entropy": 9.279542922973633, "epoch": 0.8533715641684794, "mean_token_accuracy": 0.7066895365715027, "num_tokens": 24113653.0, "step": 8631, "train/ce_loss": 0.893915057182312 }, { "epoch": 0.8533715641684794, "step": 8631, "train/sim_loss": 0.078125 }, { "epoch": 0.8533715641684794, "step": 8631, "train/total_loss": 0.16751649975776672 }, { "entropy": 8.98904037475586, "epoch": 0.8534704370179949, "mean_token_accuracy": 0.7587301731109619, "num_tokens": 24118699.0, "step": 8632, "train/ce_loss": 0.8543035387992859 }, { "epoch": 0.8534704370179949, "step": 8632, "train/sim_loss": 0.05859375 }, { "epoch": 0.8534704370179949, "step": 8632, "train/total_loss": 0.1440241038799286 }, { "entropy": 9.05948257446289, "epoch": 0.8535693098675103, "mean_token_accuracy": 0.6973180174827576, "num_tokens": 24123664.0, "step": 8633, "train/ce_loss": 1.6334046125411987 }, { "epoch": 0.8535693098675103, "step": 8633, "train/sim_loss": 0.08203125 }, { "epoch": 0.8535693098675103, "step": 8633, "train/total_loss": 0.2453717142343521 }, { "entropy": 8.830093383789062, "epoch": 0.8536681827170259, "mean_token_accuracy": 0.7382388710975647, "num_tokens": 24128879.0, "step": 8634, "train/ce_loss": 1.3079240322113037 }, { "epoch": 0.8536681827170259, "step": 8634, "train/sim_loss": 0.06640625 }, { "epoch": 0.8536681827170259, "step": 8634, "train/total_loss": 0.19719865918159485 }, { "entropy": 8.617467880249023, "epoch": 0.8537670555665414, "mean_token_accuracy": 0.7291910648345947, "num_tokens": 24134170.0, "step": 8635, "train/ce_loss": 0.49795177578926086 }, { "epoch": 0.8537670555665414, "step": 8635, "train/sim_loss": 0.0546875 }, { "epoch": 0.8537670555665414, "step": 8635, "train/total_loss": 0.10448268055915833 }, { "entropy": 8.834949493408203, "epoch": 0.8538659284160569, "mean_token_accuracy": 0.7502726316452026, "num_tokens": 24139579.0, "step": 8636, "train/ce_loss": 0.7361787557601929 }, { "epoch": 0.8538659284160569, "step": 8636, "train/sim_loss": 0.046875 }, { "epoch": 0.8538659284160569, "step": 8636, "train/total_loss": 0.12049287557601929 }, { "entropy": 8.288768768310547, "epoch": 0.8539648012655725, "mean_token_accuracy": 0.7339534759521484, "num_tokens": 24145189.0, "step": 8637, "train/ce_loss": 1.0247743129730225 }, { "epoch": 0.8539648012655725, "step": 8637, "train/sim_loss": 0.046875 }, { "epoch": 0.8539648012655725, "step": 8637, "train/total_loss": 0.14935243129730225 }, { "entropy": 8.689209938049316, "epoch": 0.854063674115088, "mean_token_accuracy": 0.7563636302947998, "num_tokens": 24150462.0, "step": 8638, "train/ce_loss": 0.6432814002037048 }, { "epoch": 0.854063674115088, "step": 8638, "train/sim_loss": 0.046875 }, { "epoch": 0.854063674115088, "step": 8638, "train/total_loss": 0.1112031415104866 }, { "entropy": 8.928628921508789, "epoch": 0.8541625469646035, "mean_token_accuracy": 0.7103004455566406, "num_tokens": 24155427.0, "step": 8639, "train/ce_loss": 0.775364339351654 }, { "epoch": 0.8541625469646035, "step": 8639, "train/sim_loss": 0.0234375 }, { "epoch": 0.8541625469646035, "step": 8639, "train/total_loss": 0.1009739339351654 }, { "epoch": 0.8542614198141191, "grad_norm": 0.8426479697227478, "learning_rate": 7.866538100182961e-06, "loss": 0.1292, "step": 8640 }, { "entropy": 8.4351806640625, "epoch": 0.8542614198141191, "mean_token_accuracy": 0.8066465258598328, "num_tokens": 24160886.0, "step": 8640, "train/ce_loss": 0.2624002695083618 }, { "epoch": 0.8542614198141191, "step": 8640, "train/sim_loss": 0.0390625 }, { "epoch": 0.8542614198141191, "step": 8640, "train/total_loss": 0.0653025284409523 }, { "entropy": 9.260761260986328, "epoch": 0.8543602926636346, "mean_token_accuracy": 0.7677165269851685, "num_tokens": 24165991.0, "step": 8641, "train/ce_loss": 4.111280986762722e-07 }, { "epoch": 0.8543602926636346, "step": 8641, "train/sim_loss": 0.015625 }, { "epoch": 0.8543602926636346, "step": 8641, "train/total_loss": 0.015625040978193283 }, { "entropy": 8.621223449707031, "epoch": 0.85445916551315, "mean_token_accuracy": 0.7359307408332825, "num_tokens": 24171379.0, "step": 8642, "train/ce_loss": 0.6777894496917725 }, { "epoch": 0.85445916551315, "step": 8642, "train/sim_loss": 0.109375 }, { "epoch": 0.85445916551315, "step": 8642, "train/total_loss": 0.17715394496917725 }, { "entropy": 9.42569351196289, "epoch": 0.8545580383626656, "mean_token_accuracy": 0.7537593841552734, "num_tokens": 24176280.0, "step": 8643, "train/ce_loss": 1.375551462173462 }, { "epoch": 0.8545580383626656, "step": 8643, "train/sim_loss": 0.0234375 }, { "epoch": 0.8545580383626656, "step": 8643, "train/total_loss": 0.16099265217781067 }, { "entropy": 8.520685195922852, "epoch": 0.8546569112121811, "mean_token_accuracy": 0.7365339398384094, "num_tokens": 24181571.0, "step": 8644, "train/ce_loss": 1.1391706466674805 }, { "epoch": 0.8546569112121811, "step": 8644, "train/sim_loss": 0.0546875 }, { "epoch": 0.8546569112121811, "step": 8644, "train/total_loss": 0.16860456764698029 }, { "entropy": 8.459127426147461, "epoch": 0.8547557840616966, "mean_token_accuracy": 0.7465968728065491, "num_tokens": 24187035.0, "step": 8645, "train/ce_loss": 0.6572544574737549 }, { "epoch": 0.8547557840616966, "step": 8645, "train/sim_loss": 0.0234375 }, { "epoch": 0.8547557840616966, "step": 8645, "train/total_loss": 0.08916294574737549 }, { "entropy": 8.707165718078613, "epoch": 0.8548546569112122, "mean_token_accuracy": 0.7386519908905029, "num_tokens": 24192243.0, "step": 8646, "train/ce_loss": 0.7648477554321289 }, { "epoch": 0.8548546569112122, "step": 8646, "train/sim_loss": 0.03515625 }, { "epoch": 0.8548546569112122, "step": 8646, "train/total_loss": 0.11164102703332901 }, { "entropy": 8.758549690246582, "epoch": 0.8549535297607277, "mean_token_accuracy": 0.722908079624176, "num_tokens": 24197457.0, "step": 8647, "train/ce_loss": 0.8366193771362305 }, { "epoch": 0.8549535297607277, "step": 8647, "train/sim_loss": 0.0625 }, { "epoch": 0.8549535297607277, "step": 8647, "train/total_loss": 0.14616194367408752 }, { "entropy": 8.51551628112793, "epoch": 0.8550524026102432, "mean_token_accuracy": 0.6890308856964111, "num_tokens": 24202863.0, "step": 8648, "train/ce_loss": 1.5394750833511353 }, { "epoch": 0.8550524026102432, "step": 8648, "train/sim_loss": 0.0390625 }, { "epoch": 0.8550524026102432, "step": 8648, "train/total_loss": 0.19301001727581024 }, { "entropy": 8.53546142578125, "epoch": 0.8551512754597588, "mean_token_accuracy": 0.703592836856842, "num_tokens": 24208316.0, "step": 8649, "train/ce_loss": 0.7742993235588074 }, { "epoch": 0.8551512754597588, "step": 8649, "train/sim_loss": 0.03515625 }, { "epoch": 0.8551512754597588, "step": 8649, "train/total_loss": 0.11258618533611298 }, { "entropy": 8.493489265441895, "epoch": 0.8552501483092743, "mean_token_accuracy": 0.7212954163551331, "num_tokens": 24213864.0, "step": 8650, "train/ce_loss": 0.537199079990387 }, { "epoch": 0.8552501483092743, "step": 8650, "train/sim_loss": 0.02734375 }, { "epoch": 0.8552501483092743, "step": 8650, "train/total_loss": 0.0810636579990387 }, { "entropy": 8.240386009216309, "epoch": 0.8553490211587897, "mean_token_accuracy": 0.748680055141449, "num_tokens": 24219362.0, "step": 8651, "train/ce_loss": 1.250221610069275 }, { "epoch": 0.8553490211587897, "step": 8651, "train/sim_loss": 0.046875 }, { "epoch": 0.8553490211587897, "step": 8651, "train/total_loss": 0.17189715802669525 }, { "entropy": 9.68118667602539, "epoch": 0.8554478940083053, "mean_token_accuracy": 0.6902654767036438, "num_tokens": 24224195.0, "step": 8652, "train/ce_loss": 1.649112343788147 }, { "epoch": 0.8554478940083053, "step": 8652, "train/sim_loss": 0.078125 }, { "epoch": 0.8554478940083053, "step": 8652, "train/total_loss": 0.24303624033927917 }, { "entropy": 9.208187103271484, "epoch": 0.8555467668578208, "mean_token_accuracy": 0.7402597665786743, "num_tokens": 24229169.0, "step": 8653, "train/ce_loss": 0.9814550280570984 }, { "epoch": 0.8555467668578208, "step": 8653, "train/sim_loss": 0.02734375 }, { "epoch": 0.8555467668578208, "step": 8653, "train/total_loss": 0.1254892647266388 }, { "entropy": 8.391998291015625, "epoch": 0.8556456397073363, "mean_token_accuracy": 0.7403433322906494, "num_tokens": 24234544.0, "step": 8654, "train/ce_loss": 1.4496474266052246 }, { "epoch": 0.8556456397073363, "step": 8654, "train/sim_loss": 0.05859375 }, { "epoch": 0.8556456397073363, "step": 8654, "train/total_loss": 0.20355848968029022 }, { "entropy": 9.0525484085083, "epoch": 0.8557445125568519, "mean_token_accuracy": 0.789559543132782, "num_tokens": 24239631.0, "step": 8655, "train/ce_loss": 1.1713453531265259 }, { "epoch": 0.8557445125568519, "step": 8655, "train/sim_loss": 0.01953125 }, { "epoch": 0.8557445125568519, "step": 8655, "train/total_loss": 0.13666579127311707 }, { "entropy": 9.150043487548828, "epoch": 0.8558433854063674, "mean_token_accuracy": 0.7431610822677612, "num_tokens": 24244700.0, "step": 8656, "train/ce_loss": 2.7685500754159875e-06 }, { "epoch": 0.8558433854063674, "step": 8656, "train/sim_loss": 0.06640625 }, { "epoch": 0.8558433854063674, "step": 8656, "train/total_loss": 0.06640652567148209 }, { "entropy": 8.428903579711914, "epoch": 0.8559422582558829, "mean_token_accuracy": 0.7721261382102966, "num_tokens": 24250134.0, "step": 8657, "train/ce_loss": 0.8671013116836548 }, { "epoch": 0.8559422582558829, "step": 8657, "train/sim_loss": 0.06640625 }, { "epoch": 0.8559422582558829, "step": 8657, "train/total_loss": 0.153116375207901 }, { "entropy": 9.028989791870117, "epoch": 0.8560411311053985, "mean_token_accuracy": 0.7668308615684509, "num_tokens": 24255237.0, "step": 8658, "train/ce_loss": 0.7063049077987671 }, { "epoch": 0.8560411311053985, "step": 8658, "train/sim_loss": 0.01953125 }, { "epoch": 0.8560411311053985, "step": 8658, "train/total_loss": 0.09016174077987671 }, { "entropy": 8.776468276977539, "epoch": 0.856140003954914, "mean_token_accuracy": 0.7960928082466125, "num_tokens": 24260669.0, "step": 8659, "train/ce_loss": 0.8988680839538574 }, { "epoch": 0.856140003954914, "step": 8659, "train/sim_loss": 0.015625 }, { "epoch": 0.856140003954914, "step": 8659, "train/total_loss": 0.10551180690526962 }, { "epoch": 0.8562388768044294, "grad_norm": 0.543863832950592, "learning_rate": 7.861593235425012e-06, "loss": 0.1278, "step": 8660 }, { "entropy": 9.166704177856445, "epoch": 0.8562388768044294, "mean_token_accuracy": 0.7692307829856873, "num_tokens": 24265760.0, "step": 8660, "train/ce_loss": 1.2796474695205688 }, { "epoch": 0.8562388768044294, "step": 8660, "train/sim_loss": 0.0703125 }, { "epoch": 0.8562388768044294, "step": 8660, "train/total_loss": 0.19827724993228912 }, { "entropy": 8.707932472229004, "epoch": 0.856337749653945, "mean_token_accuracy": 0.7459584474563599, "num_tokens": 24271079.0, "step": 8661, "train/ce_loss": 0.7910167574882507 }, { "epoch": 0.856337749653945, "step": 8661, "train/sim_loss": 0.11328125 }, { "epoch": 0.856337749653945, "step": 8661, "train/total_loss": 0.19238293170928955 }, { "entropy": 8.982584953308105, "epoch": 0.8564366225034605, "mean_token_accuracy": 0.7586206793785095, "num_tokens": 24275989.0, "step": 8662, "train/ce_loss": 1.5561401844024658 }, { "epoch": 0.8564366225034605, "step": 8662, "train/sim_loss": 0.0546875 }, { "epoch": 0.8564366225034605, "step": 8662, "train/total_loss": 0.21030151844024658 }, { "entropy": 8.329089164733887, "epoch": 0.856535495352976, "mean_token_accuracy": 0.7138493061065674, "num_tokens": 24281442.0, "step": 8663, "train/ce_loss": 0.9314218759536743 }, { "epoch": 0.856535495352976, "step": 8663, "train/sim_loss": 0.046875 }, { "epoch": 0.856535495352976, "step": 8663, "train/total_loss": 0.14001718163490295 }, { "entropy": 8.926669120788574, "epoch": 0.8566343682024916, "mean_token_accuracy": 0.7155612111091614, "num_tokens": 24286662.0, "step": 8664, "train/ce_loss": 0.7832470536231995 }, { "epoch": 0.8566343682024916, "step": 8664, "train/sim_loss": 0.07421875 }, { "epoch": 0.8566343682024916, "step": 8664, "train/total_loss": 0.15254345536231995 }, { "entropy": 8.831933975219727, "epoch": 0.8567332410520071, "mean_token_accuracy": 0.7798560857772827, "num_tokens": 24291849.0, "step": 8665, "train/ce_loss": 2.490518511422124e-07 }, { "epoch": 0.8567332410520071, "step": 8665, "train/sim_loss": 0.01953125 }, { "epoch": 0.8567332410520071, "step": 8665, "train/total_loss": 0.01953127421438694 }, { "entropy": 8.320831298828125, "epoch": 0.8568321139015227, "mean_token_accuracy": 0.7524752616882324, "num_tokens": 24297357.0, "step": 8666, "train/ce_loss": 0.5641217827796936 }, { "epoch": 0.8568321139015227, "step": 8666, "train/sim_loss": 0.01171875 }, { "epoch": 0.8568321139015227, "step": 8666, "train/total_loss": 0.06813092529773712 }, { "entropy": 8.73794937133789, "epoch": 0.8569309867510382, "mean_token_accuracy": 0.7379972338676453, "num_tokens": 24302554.0, "step": 8667, "train/ce_loss": 1.0336923599243164 }, { "epoch": 0.8569309867510382, "step": 8667, "train/sim_loss": 0.0859375 }, { "epoch": 0.8569309867510382, "step": 8667, "train/total_loss": 0.18930673599243164 }, { "entropy": 9.101522445678711, "epoch": 0.8570298596005537, "mean_token_accuracy": 0.7204301357269287, "num_tokens": 24307562.0, "step": 8668, "train/ce_loss": 1.9060012102127075 }, { "epoch": 0.8570298596005537, "step": 8668, "train/sim_loss": 0.0625 }, { "epoch": 0.8570298596005537, "step": 8668, "train/total_loss": 0.25310012698173523 }, { "entropy": 9.16499137878418, "epoch": 0.8571287324500693, "mean_token_accuracy": 0.741605818271637, "num_tokens": 24312723.0, "step": 8669, "train/ce_loss": 2.9166920967327314e-07 }, { "epoch": 0.8571287324500693, "step": 8669, "train/sim_loss": 0.01953125 }, { "epoch": 0.8571287324500693, "step": 8669, "train/total_loss": 0.019531279802322388 }, { "entropy": 8.343894958496094, "epoch": 0.8572276052995847, "mean_token_accuracy": 0.7389557957649231, "num_tokens": 24317914.0, "step": 8670, "train/ce_loss": 1.0072234869003296 }, { "epoch": 0.8572276052995847, "step": 8670, "train/sim_loss": 0.05078125 }, { "epoch": 0.8572276052995847, "step": 8670, "train/total_loss": 0.15150359272956848 }, { "entropy": 8.694144248962402, "epoch": 0.8573264781491002, "mean_token_accuracy": 0.739386796951294, "num_tokens": 24323244.0, "step": 8671, "train/ce_loss": 0.7169451713562012 }, { "epoch": 0.8573264781491002, "step": 8671, "train/sim_loss": 0.0859375 }, { "epoch": 0.8573264781491002, "step": 8671, "train/total_loss": 0.1576320230960846 }, { "entropy": 8.642142295837402, "epoch": 0.8574253509986158, "mean_token_accuracy": 0.7232142686843872, "num_tokens": 24328527.0, "step": 8672, "train/ce_loss": 1.4176908731460571 }, { "epoch": 0.8574253509986158, "step": 8672, "train/sim_loss": 0.0625 }, { "epoch": 0.8574253509986158, "step": 8672, "train/total_loss": 0.20426909625530243 }, { "entropy": 8.432353973388672, "epoch": 0.8575242238481313, "mean_token_accuracy": 0.7180179953575134, "num_tokens": 24334127.0, "step": 8673, "train/ce_loss": 0.810718834400177 }, { "epoch": 0.8575242238481313, "step": 8673, "train/sim_loss": 0.05859375 }, { "epoch": 0.8575242238481313, "step": 8673, "train/total_loss": 0.1396656334400177 }, { "entropy": 8.835989952087402, "epoch": 0.8576230966976468, "mean_token_accuracy": 0.7057521939277649, "num_tokens": 24339514.0, "step": 8674, "train/ce_loss": 1.4376308917999268 }, { "epoch": 0.8576230966976468, "step": 8674, "train/sim_loss": 0.0703125 }, { "epoch": 0.8576230966976468, "step": 8674, "train/total_loss": 0.21407559514045715 }, { "entropy": 8.778553009033203, "epoch": 0.8577219695471624, "mean_token_accuracy": 0.7320512533187866, "num_tokens": 24344730.0, "step": 8675, "train/ce_loss": 0.897465169429779 }, { "epoch": 0.8577219695471624, "step": 8675, "train/sim_loss": 0.0234375 }, { "epoch": 0.8577219695471624, "step": 8675, "train/total_loss": 0.11318401992321014 }, { "entropy": 8.590299606323242, "epoch": 0.8578208423966779, "mean_token_accuracy": 0.7136150002479553, "num_tokens": 24350023.0, "step": 8676, "train/ce_loss": 1.29165780544281 }, { "epoch": 0.8578208423966779, "step": 8676, "train/sim_loss": 0.05078125 }, { "epoch": 0.8578208423966779, "step": 8676, "train/total_loss": 0.17994703352451324 }, { "entropy": 8.699274063110352, "epoch": 0.8579197152461934, "mean_token_accuracy": 0.7906976938247681, "num_tokens": 24355356.0, "step": 8677, "train/ce_loss": 0.42230668663978577 }, { "epoch": 0.8579197152461934, "step": 8677, "train/sim_loss": 0.02734375 }, { "epoch": 0.8579197152461934, "step": 8677, "train/total_loss": 0.06957441568374634 }, { "entropy": 9.221429824829102, "epoch": 0.858018588095709, "mean_token_accuracy": 0.7283018827438354, "num_tokens": 24360328.0, "step": 8678, "train/ce_loss": 0.7309057116508484 }, { "epoch": 0.858018588095709, "step": 8678, "train/sim_loss": 0.0546875 }, { "epoch": 0.858018588095709, "step": 8678, "train/total_loss": 0.1277780830860138 }, { "entropy": 8.58169937133789, "epoch": 0.8581174609452245, "mean_token_accuracy": 0.7274701595306396, "num_tokens": 24365720.0, "step": 8679, "train/ce_loss": 0.5765038132667542 }, { "epoch": 0.8581174609452245, "step": 8679, "train/sim_loss": 0.0234375 }, { "epoch": 0.8581174609452245, "step": 8679, "train/total_loss": 0.08108788728713989 }, { "epoch": 0.8582163337947399, "grad_norm": 0.5956012606620789, "learning_rate": 7.856648370667062e-06, "loss": 0.1323, "step": 8680 }, { "entropy": 9.144123077392578, "epoch": 0.8582163337947399, "mean_token_accuracy": 0.7272727489471436, "num_tokens": 24370787.0, "step": 8680, "train/ce_loss": 0.9091194272041321 }, { "epoch": 0.8582163337947399, "step": 8680, "train/sim_loss": 0.03515625 }, { "epoch": 0.8582163337947399, "step": 8680, "train/total_loss": 0.12606820464134216 }, { "entropy": 8.755536079406738, "epoch": 0.8583152066442555, "mean_token_accuracy": 0.7513020634651184, "num_tokens": 24376023.0, "step": 8681, "train/ce_loss": 1.099310278892517 }, { "epoch": 0.8583152066442555, "step": 8681, "train/sim_loss": 0.05078125 }, { "epoch": 0.8583152066442555, "step": 8681, "train/total_loss": 0.16071227192878723 }, { "entropy": 8.497855186462402, "epoch": 0.858414079493771, "mean_token_accuracy": 0.7962962985038757, "num_tokens": 24381515.0, "step": 8682, "train/ce_loss": 0.759404182434082 }, { "epoch": 0.858414079493771, "step": 8682, "train/sim_loss": 0.0546875 }, { "epoch": 0.858414079493771, "step": 8682, "train/total_loss": 0.13062793016433716 }, { "entropy": 8.67982292175293, "epoch": 0.8585129523432865, "mean_token_accuracy": 0.7808219194412231, "num_tokens": 24386825.0, "step": 8683, "train/ce_loss": 0.6889883875846863 }, { "epoch": 0.8585129523432865, "step": 8683, "train/sim_loss": 0.02734375 }, { "epoch": 0.8585129523432865, "step": 8683, "train/total_loss": 0.09624259173870087 }, { "entropy": 8.736405372619629, "epoch": 0.8586118251928021, "mean_token_accuracy": 0.7019950151443481, "num_tokens": 24392090.0, "step": 8684, "train/ce_loss": 1.2038031816482544 }, { "epoch": 0.8586118251928021, "step": 8684, "train/sim_loss": 0.078125 }, { "epoch": 0.8586118251928021, "step": 8684, "train/total_loss": 0.19850531220436096 }, { "entropy": 9.02114486694336, "epoch": 0.8587106980423176, "mean_token_accuracy": 0.7817531228065491, "num_tokens": 24397094.0, "step": 8685, "train/ce_loss": 1.209519863128662 }, { "epoch": 0.8587106980423176, "step": 8685, "train/sim_loss": 0.0390625 }, { "epoch": 0.8587106980423176, "step": 8685, "train/total_loss": 0.16001448035240173 }, { "entropy": 9.204957008361816, "epoch": 0.8588095708918331, "mean_token_accuracy": 0.8021582961082458, "num_tokens": 24402084.0, "step": 8686, "train/ce_loss": 1.132004285864241e-06 }, { "epoch": 0.8588095708918331, "step": 8686, "train/sim_loss": 0.015625 }, { "epoch": 0.8588095708918331, "step": 8686, "train/total_loss": 0.015625113621354103 }, { "entropy": 9.330758094787598, "epoch": 0.8589084437413487, "mean_token_accuracy": 0.8086304068565369, "num_tokens": 24407084.0, "step": 8687, "train/ce_loss": 0.6505013704299927 }, { "epoch": 0.8589084437413487, "step": 8687, "train/sim_loss": 0.01953125 }, { "epoch": 0.8589084437413487, "step": 8687, "train/total_loss": 0.0845813900232315 }, { "entropy": 8.71497631072998, "epoch": 0.8590073165908642, "mean_token_accuracy": 0.7440318465232849, "num_tokens": 24412280.0, "step": 8688, "train/ce_loss": 0.48954129219055176 }, { "epoch": 0.8590073165908642, "step": 8688, "train/sim_loss": 0.0625 }, { "epoch": 0.8590073165908642, "step": 8688, "train/total_loss": 0.11145412921905518 }, { "entropy": 8.394184112548828, "epoch": 0.8591061894403796, "mean_token_accuracy": 0.7381465435028076, "num_tokens": 24417632.0, "step": 8689, "train/ce_loss": 0.6080519556999207 }, { "epoch": 0.8591061894403796, "step": 8689, "train/sim_loss": 0.05078125 }, { "epoch": 0.8591061894403796, "step": 8689, "train/total_loss": 0.11158645153045654 }, { "entropy": 9.146610260009766, "epoch": 0.8592050622898952, "mean_token_accuracy": 0.7051281929016113, "num_tokens": 24422677.0, "step": 8690, "train/ce_loss": 1.0803229808807373 }, { "epoch": 0.8592050622898952, "step": 8690, "train/sim_loss": 0.06640625 }, { "epoch": 0.8592050622898952, "step": 8690, "train/total_loss": 0.17443855106830597 }, { "entropy": 8.834553718566895, "epoch": 0.8593039351394107, "mean_token_accuracy": 0.7250900268554688, "num_tokens": 24427992.0, "step": 8691, "train/ce_loss": 1.0625476837158203 }, { "epoch": 0.8593039351394107, "step": 8691, "train/sim_loss": 0.0625 }, { "epoch": 0.8593039351394107, "step": 8691, "train/total_loss": 0.16875477135181427 }, { "entropy": 9.251741409301758, "epoch": 0.8594028079889262, "mean_token_accuracy": 0.7544169425964355, "num_tokens": 24432969.0, "step": 8692, "train/ce_loss": 0.8937926888465881 }, { "epoch": 0.8594028079889262, "step": 8692, "train/sim_loss": 0.0625 }, { "epoch": 0.8594028079889262, "step": 8692, "train/total_loss": 0.15187928080558777 }, { "entropy": 9.002845764160156, "epoch": 0.8595016808384418, "mean_token_accuracy": 0.6908212304115295, "num_tokens": 24437999.0, "step": 8693, "train/ce_loss": 1.8824050426483154 }, { "epoch": 0.8595016808384418, "step": 8693, "train/sim_loss": 0.09765625 }, { "epoch": 0.8595016808384418, "step": 8693, "train/total_loss": 0.28589677810668945 }, { "entropy": 9.49638557434082, "epoch": 0.8596005536879573, "mean_token_accuracy": 0.6689655184745789, "num_tokens": 24442894.0, "step": 8694, "train/ce_loss": 3.628325373483676e-07 }, { "epoch": 0.8596005536879573, "step": 8694, "train/sim_loss": 0.01171875 }, { "epoch": 0.8596005536879573, "step": 8694, "train/total_loss": 0.01171878632158041 }, { "entropy": 8.463619232177734, "epoch": 0.8596994265374728, "mean_token_accuracy": 0.7720670104026794, "num_tokens": 24448289.0, "step": 8695, "train/ce_loss": 0.5734670162200928 }, { "epoch": 0.8596994265374728, "step": 8695, "train/sim_loss": 0.03125 }, { "epoch": 0.8596994265374728, "step": 8695, "train/total_loss": 0.08859670162200928 }, { "entropy": 8.894155502319336, "epoch": 0.8597982993869884, "mean_token_accuracy": 0.7293127775192261, "num_tokens": 24453448.0, "step": 8696, "train/ce_loss": 0.939193069934845 }, { "epoch": 0.8597982993869884, "step": 8696, "train/sim_loss": 0.0625 }, { "epoch": 0.8597982993869884, "step": 8696, "train/total_loss": 0.1564193069934845 }, { "entropy": 8.99164867401123, "epoch": 0.8598971722365039, "mean_token_accuracy": 0.7431906461715698, "num_tokens": 24458622.0, "step": 8697, "train/ce_loss": 1.413056492805481 }, { "epoch": 0.8598971722365039, "step": 8697, "train/sim_loss": 0.046875 }, { "epoch": 0.8598971722365039, "step": 8697, "train/total_loss": 0.18818065524101257 }, { "entropy": 8.630083084106445, "epoch": 0.8599960450860193, "mean_token_accuracy": 0.720200777053833, "num_tokens": 24463854.0, "step": 8698, "train/ce_loss": 0.9030309915542603 }, { "epoch": 0.8599960450860193, "step": 8698, "train/sim_loss": 0.10546875 }, { "epoch": 0.8599960450860193, "step": 8698, "train/total_loss": 0.19577184319496155 }, { "entropy": 8.450031280517578, "epoch": 0.8600949179355349, "mean_token_accuracy": 0.7494736909866333, "num_tokens": 24469293.0, "step": 8699, "train/ce_loss": 0.5924932956695557 }, { "epoch": 0.8600949179355349, "step": 8699, "train/sim_loss": 0.02734375 }, { "epoch": 0.8600949179355349, "step": 8699, "train/total_loss": 0.08659307658672333 }, { "epoch": 0.8601937907850504, "grad_norm": 0.5866478085517883, "learning_rate": 7.851703505909115e-06, "loss": 0.1359, "step": 8700 }, { "entropy": 8.636362075805664, "epoch": 0.8601937907850504, "mean_token_accuracy": 0.7581620216369629, "num_tokens": 24474585.0, "step": 8700, "train/ce_loss": 1.01895272731781 }, { "epoch": 0.8601937907850504, "step": 8700, "train/sim_loss": 0.078125 }, { "epoch": 0.8601937907850504, "step": 8700, "train/total_loss": 0.180020272731781 }, { "entropy": 9.017528533935547, "epoch": 0.8602926636345659, "mean_token_accuracy": 0.7680412530899048, "num_tokens": 24480023.0, "step": 8701, "train/ce_loss": 1.0584609508514404 }, { "epoch": 0.8602926636345659, "step": 8701, "train/sim_loss": 0.0703125 }, { "epoch": 0.8602926636345659, "step": 8701, "train/total_loss": 0.176158607006073 }, { "entropy": 8.56066608428955, "epoch": 0.8603915364840815, "mean_token_accuracy": 0.7193585634231567, "num_tokens": 24485343.0, "step": 8702, "train/ce_loss": 1.0047358274459839 }, { "epoch": 0.8603915364840815, "step": 8702, "train/sim_loss": 0.0625 }, { "epoch": 0.8603915364840815, "step": 8702, "train/total_loss": 0.1629735827445984 }, { "entropy": 8.613938331604004, "epoch": 0.860490409333597, "mean_token_accuracy": 0.7861557602882385, "num_tokens": 24490596.0, "step": 8703, "train/ce_loss": 0.46136343479156494 }, { "epoch": 0.860490409333597, "step": 8703, "train/sim_loss": 0.0546875 }, { "epoch": 0.860490409333597, "step": 8703, "train/total_loss": 0.10082384943962097 }, { "entropy": 8.38138484954834, "epoch": 0.8605892821831125, "mean_token_accuracy": 0.7080609798431396, "num_tokens": 24495944.0, "step": 8704, "train/ce_loss": 1.1422832012176514 }, { "epoch": 0.8605892821831125, "step": 8704, "train/sim_loss": 0.03515625 }, { "epoch": 0.8605892821831125, "step": 8704, "train/total_loss": 0.14938457310199738 }, { "entropy": 8.48853588104248, "epoch": 0.8606881550326281, "mean_token_accuracy": 0.7091295123100281, "num_tokens": 24501366.0, "step": 8705, "train/ce_loss": 0.797477662563324 }, { "epoch": 0.8606881550326281, "step": 8705, "train/sim_loss": 0.03515625 }, { "epoch": 0.8606881550326281, "step": 8705, "train/total_loss": 0.1149040162563324 }, { "entropy": 9.005110740661621, "epoch": 0.8607870278821436, "mean_token_accuracy": 0.7484076619148254, "num_tokens": 24506428.0, "step": 8706, "train/ce_loss": 1.3149898052215576 }, { "epoch": 0.8607870278821436, "step": 8706, "train/sim_loss": 0.04296875 }, { "epoch": 0.8607870278821436, "step": 8706, "train/total_loss": 0.17446772754192352 }, { "entropy": 8.744300842285156, "epoch": 0.860885900731659, "mean_token_accuracy": 0.8268733620643616, "num_tokens": 24511674.0, "step": 8707, "train/ce_loss": 0.35784006118774414 }, { "epoch": 0.860885900731659, "step": 8707, "train/sim_loss": 0.0390625 }, { "epoch": 0.860885900731659, "step": 8707, "train/total_loss": 0.07484650611877441 }, { "entropy": 8.888997077941895, "epoch": 0.8609847735811746, "mean_token_accuracy": 0.72541743516922, "num_tokens": 24516650.0, "step": 8708, "train/ce_loss": 1.081268310546875 }, { "epoch": 0.8609847735811746, "step": 8708, "train/sim_loss": 0.04296875 }, { "epoch": 0.8609847735811746, "step": 8708, "train/total_loss": 0.15109558403491974 }, { "entropy": 9.171119689941406, "epoch": 0.8610836464306901, "mean_token_accuracy": 0.7603550553321838, "num_tokens": 24521774.0, "step": 8709, "train/ce_loss": 0.6120818257331848 }, { "epoch": 0.8610836464306901, "step": 8709, "train/sim_loss": 0.109375 }, { "epoch": 0.8610836464306901, "step": 8709, "train/total_loss": 0.17058318853378296 }, { "entropy": 8.389473915100098, "epoch": 0.8611825192802056, "mean_token_accuracy": 0.7079002261161804, "num_tokens": 24527231.0, "step": 8710, "train/ce_loss": 1.2086772918701172 }, { "epoch": 0.8611825192802056, "step": 8710, "train/sim_loss": 0.05859375 }, { "epoch": 0.8611825192802056, "step": 8710, "train/total_loss": 0.17946147918701172 }, { "entropy": 8.300840377807617, "epoch": 0.8612813921297212, "mean_token_accuracy": 0.72667396068573, "num_tokens": 24532651.0, "step": 8711, "train/ce_loss": 0.726784884929657 }, { "epoch": 0.8612813921297212, "step": 8711, "train/sim_loss": 0.07421875 }, { "epoch": 0.8612813921297212, "step": 8711, "train/total_loss": 0.14689724147319794 }, { "entropy": 8.55239486694336, "epoch": 0.8613802649792367, "mean_token_accuracy": 0.7553735971450806, "num_tokens": 24538110.0, "step": 8712, "train/ce_loss": 0.4948091506958008 }, { "epoch": 0.8613802649792367, "step": 8712, "train/sim_loss": 0.0234375 }, { "epoch": 0.8613802649792367, "step": 8712, "train/total_loss": 0.07291841506958008 }, { "entropy": 9.009099960327148, "epoch": 0.8614791378287522, "mean_token_accuracy": 0.7603305578231812, "num_tokens": 24543177.0, "step": 8713, "train/ce_loss": 1.6545424461364746 }, { "epoch": 0.8614791378287522, "step": 8713, "train/sim_loss": 0.05078125 }, { "epoch": 0.8614791378287522, "step": 8713, "train/total_loss": 0.21623550355434418 }, { "entropy": 8.838738441467285, "epoch": 0.8615780106782678, "mean_token_accuracy": 0.6797385811805725, "num_tokens": 24548427.0, "step": 8714, "train/ce_loss": 0.9460775256156921 }, { "epoch": 0.8615780106782678, "step": 8714, "train/sim_loss": 0.05859375 }, { "epoch": 0.8615780106782678, "step": 8714, "train/total_loss": 0.15320150554180145 }, { "entropy": 9.11565113067627, "epoch": 0.8616768835277833, "mean_token_accuracy": 0.7608370780944824, "num_tokens": 24553517.0, "step": 8715, "train/ce_loss": 1.4782397747039795 }, { "epoch": 0.8616768835277833, "step": 8715, "train/sim_loss": 0.0625 }, { "epoch": 0.8616768835277833, "step": 8715, "train/total_loss": 0.2103239744901657 }, { "entropy": 8.60834789276123, "epoch": 0.8617757563772988, "mean_token_accuracy": 0.6983606815338135, "num_tokens": 24558912.0, "step": 8716, "train/ce_loss": 1.4724786281585693 }, { "epoch": 0.8617757563772988, "step": 8716, "train/sim_loss": 0.02734375 }, { "epoch": 0.8617757563772988, "step": 8716, "train/total_loss": 0.17459161579608917 }, { "entropy": 8.75145149230957, "epoch": 0.8618746292268143, "mean_token_accuracy": 0.7423133254051208, "num_tokens": 24564083.0, "step": 8717, "train/ce_loss": 0.9547011256217957 }, { "epoch": 0.8618746292268143, "step": 8717, "train/sim_loss": 0.11328125 }, { "epoch": 0.8618746292268143, "step": 8717, "train/total_loss": 0.2087513655424118 }, { "entropy": 8.839400291442871, "epoch": 0.8619735020763298, "mean_token_accuracy": 0.7345399856567383, "num_tokens": 24569171.0, "step": 8718, "train/ce_loss": 1.3656909465789795 }, { "epoch": 0.8619735020763298, "step": 8718, "train/sim_loss": 0.05859375 }, { "epoch": 0.8619735020763298, "step": 8718, "train/total_loss": 0.1951628476381302 }, { "entropy": 8.544647216796875, "epoch": 0.8620723749258453, "mean_token_accuracy": 0.7559462189674377, "num_tokens": 24574615.0, "step": 8719, "train/ce_loss": 0.7375919818878174 }, { "epoch": 0.8620723749258453, "step": 8719, "train/sim_loss": 0.09765625 }, { "epoch": 0.8620723749258453, "step": 8719, "train/total_loss": 0.17141544818878174 }, { "epoch": 0.8621712477753609, "grad_norm": 0.6967912912368774, "learning_rate": 7.846758641151165e-06, "loss": 0.1358, "step": 8720 }, { "entropy": 8.95772933959961, "epoch": 0.8621712477753609, "mean_token_accuracy": 0.6918518543243408, "num_tokens": 24579691.0, "step": 8720, "train/ce_loss": 1.047044277191162 }, { "epoch": 0.8621712477753609, "step": 8720, "train/sim_loss": 0.078125 }, { "epoch": 0.8621712477753609, "step": 8720, "train/total_loss": 0.18282943964004517 }, { "entropy": 8.738275527954102, "epoch": 0.8622701206248764, "mean_token_accuracy": 0.676980197429657, "num_tokens": 24584963.0, "step": 8721, "train/ce_loss": 0.7576073408126831 }, { "epoch": 0.8622701206248764, "step": 8721, "train/sim_loss": 0.0546875 }, { "epoch": 0.8622701206248764, "step": 8721, "train/total_loss": 0.13044823706150055 }, { "entropy": 8.73996353149414, "epoch": 0.8623689934743919, "mean_token_accuracy": 0.7284533381462097, "num_tokens": 24590246.0, "step": 8722, "train/ce_loss": 0.6583800315856934 }, { "epoch": 0.8623689934743919, "step": 8722, "train/sim_loss": 0.0390625 }, { "epoch": 0.8623689934743919, "step": 8722, "train/total_loss": 0.10490050166845322 }, { "entropy": 8.506805419921875, "epoch": 0.8624678663239075, "mean_token_accuracy": 0.7607433199882507, "num_tokens": 24595734.0, "step": 8723, "train/ce_loss": 0.6425955891609192 }, { "epoch": 0.8624678663239075, "step": 8723, "train/sim_loss": 0.0703125 }, { "epoch": 0.8624678663239075, "step": 8723, "train/total_loss": 0.13457205891609192 }, { "entropy": 8.412109375, "epoch": 0.862566739173423, "mean_token_accuracy": 0.6770114898681641, "num_tokens": 24601053.0, "step": 8724, "train/ce_loss": 1.2470697164535522 }, { "epoch": 0.862566739173423, "step": 8724, "train/sim_loss": 0.05859375 }, { "epoch": 0.862566739173423, "step": 8724, "train/total_loss": 0.18330073356628418 }, { "entropy": 8.725008010864258, "epoch": 0.8626656120229385, "mean_token_accuracy": 0.7747875452041626, "num_tokens": 24606234.0, "step": 8725, "train/ce_loss": 1.2740275859832764 }, { "epoch": 0.8626656120229385, "step": 8725, "train/sim_loss": 0.06640625 }, { "epoch": 0.8626656120229385, "step": 8725, "train/total_loss": 0.19380901753902435 }, { "entropy": 8.509666442871094, "epoch": 0.862764484872454, "mean_token_accuracy": 0.680190920829773, "num_tokens": 24611590.0, "step": 8726, "train/ce_loss": 0.9858887791633606 }, { "epoch": 0.862764484872454, "step": 8726, "train/sim_loss": 0.0703125 }, { "epoch": 0.862764484872454, "step": 8726, "train/total_loss": 0.16890138387680054 }, { "entropy": 8.78453254699707, "epoch": 0.8628633577219695, "mean_token_accuracy": 0.7397769689559937, "num_tokens": 24616850.0, "step": 8727, "train/ce_loss": 0.9157573580741882 }, { "epoch": 0.8628633577219695, "step": 8727, "train/sim_loss": 0.046875 }, { "epoch": 0.8628633577219695, "step": 8727, "train/total_loss": 0.1384507417678833 }, { "entropy": 9.509747505187988, "epoch": 0.862962230571485, "mean_token_accuracy": 0.6687116622924805, "num_tokens": 24621756.0, "step": 8728, "train/ce_loss": 2.419189929962158 }, { "epoch": 0.862962230571485, "step": 8728, "train/sim_loss": 0.02734375 }, { "epoch": 0.862962230571485, "step": 8728, "train/total_loss": 0.26926273107528687 }, { "entropy": 8.149187088012695, "epoch": 0.8630611034210006, "mean_token_accuracy": 0.7740345001220703, "num_tokens": 24627458.0, "step": 8729, "train/ce_loss": 0.5720027089118958 }, { "epoch": 0.8630611034210006, "step": 8729, "train/sim_loss": 0.0390625 }, { "epoch": 0.8630611034210006, "step": 8729, "train/total_loss": 0.09626276791095734 }, { "entropy": 9.031068801879883, "epoch": 0.8631599762705161, "mean_token_accuracy": 0.7213114500045776, "num_tokens": 24632551.0, "step": 8730, "train/ce_loss": 1.2604602575302124 }, { "epoch": 0.8631599762705161, "step": 8730, "train/sim_loss": 0.0703125 }, { "epoch": 0.8631599762705161, "step": 8730, "train/total_loss": 0.19635853171348572 }, { "entropy": 8.85468864440918, "epoch": 0.8632588491200316, "mean_token_accuracy": 0.7720588445663452, "num_tokens": 24637645.0, "step": 8731, "train/ce_loss": 1.0503281354904175 }, { "epoch": 0.8632588491200316, "step": 8731, "train/sim_loss": 0.01953125 }, { "epoch": 0.8632588491200316, "step": 8731, "train/total_loss": 0.12456406652927399 }, { "entropy": 9.003961563110352, "epoch": 0.8633577219695472, "mean_token_accuracy": 0.8041431307792664, "num_tokens": 24642631.0, "step": 8732, "train/ce_loss": 0.6411572098731995 }, { "epoch": 0.8633577219695472, "step": 8732, "train/sim_loss": 0.015625 }, { "epoch": 0.8633577219695472, "step": 8732, "train/total_loss": 0.0797407254576683 }, { "entropy": 8.73344898223877, "epoch": 0.8634565948190627, "mean_token_accuracy": 0.8298969268798828, "num_tokens": 24647867.0, "step": 8733, "train/ce_loss": 0.9211376905441284 }, { "epoch": 0.8634565948190627, "step": 8733, "train/sim_loss": 0.0546875 }, { "epoch": 0.8634565948190627, "step": 8733, "train/total_loss": 0.14680126309394836 }, { "entropy": 8.371345520019531, "epoch": 0.8635554676685782, "mean_token_accuracy": 0.7224669456481934, "num_tokens": 24653284.0, "step": 8734, "train/ce_loss": 1.3611029386520386 }, { "epoch": 0.8635554676685782, "step": 8734, "train/sim_loss": 0.078125 }, { "epoch": 0.8635554676685782, "step": 8734, "train/total_loss": 0.21423529088497162 }, { "entropy": 8.128787994384766, "epoch": 0.8636543405180938, "mean_token_accuracy": 0.8126079440116882, "num_tokens": 24658967.0, "step": 8735, "train/ce_loss": 0.3554665446281433 }, { "epoch": 0.8636543405180938, "step": 8735, "train/sim_loss": 0.015625 }, { "epoch": 0.8636543405180938, "step": 8735, "train/total_loss": 0.05117165669798851 }, { "entropy": 8.596895217895508, "epoch": 0.8637532133676092, "mean_token_accuracy": 0.750348687171936, "num_tokens": 24664138.0, "step": 8736, "train/ce_loss": 0.7949276566505432 }, { "epoch": 0.8637532133676092, "step": 8736, "train/sim_loss": 0.0546875 }, { "epoch": 0.8637532133676092, "step": 8736, "train/total_loss": 0.13418027758598328 }, { "entropy": 8.983320236206055, "epoch": 0.8638520862171247, "mean_token_accuracy": 0.7711864113807678, "num_tokens": 24669276.0, "step": 8737, "train/ce_loss": 0.9564729928970337 }, { "epoch": 0.8638520862171247, "step": 8737, "train/sim_loss": 0.1171875 }, { "epoch": 0.8638520862171247, "step": 8737, "train/total_loss": 0.21283480525016785 }, { "entropy": 8.86839485168457, "epoch": 0.8639509590666403, "mean_token_accuracy": 0.8213740587234497, "num_tokens": 24674384.0, "step": 8738, "train/ce_loss": 0.7088941931724548 }, { "epoch": 0.8639509590666403, "step": 8738, "train/sim_loss": 0.0390625 }, { "epoch": 0.8639509590666403, "step": 8738, "train/total_loss": 0.1099519208073616 }, { "entropy": 8.702260971069336, "epoch": 0.8640498319161558, "mean_token_accuracy": 0.7682619690895081, "num_tokens": 24679662.0, "step": 8739, "train/ce_loss": 0.48759010434150696 }, { "epoch": 0.8640498319161558, "step": 8739, "train/sim_loss": 0.0234375 }, { "epoch": 0.8640498319161558, "step": 8739, "train/total_loss": 0.07219651341438293 }, { "epoch": 0.8641487047656713, "grad_norm": 0.6484668850898743, "learning_rate": 7.841813776393217e-06, "loss": 0.1303, "step": 8740 }, { "entropy": 8.378477096557617, "epoch": 0.8641487047656713, "mean_token_accuracy": 0.7508571147918701, "num_tokens": 24685015.0, "step": 8740, "train/ce_loss": 0.791218638420105 }, { "epoch": 0.8641487047656713, "step": 8740, "train/sim_loss": 0.05859375 }, { "epoch": 0.8641487047656713, "step": 8740, "train/total_loss": 0.13771560788154602 }, { "entropy": 8.485795974731445, "epoch": 0.8642475776151869, "mean_token_accuracy": 0.7496991753578186, "num_tokens": 24690284.0, "step": 8741, "train/ce_loss": 1.1184836626052856 }, { "epoch": 0.8642475776151869, "step": 8741, "train/sim_loss": 0.0625 }, { "epoch": 0.8642475776151869, "step": 8741, "train/total_loss": 0.1743483692407608 }, { "entropy": 8.313455581665039, "epoch": 0.8643464504647024, "mean_token_accuracy": 0.8098591566085815, "num_tokens": 24695674.0, "step": 8742, "train/ce_loss": 0.8051064610481262 }, { "epoch": 0.8643464504647024, "step": 8742, "train/sim_loss": 0.06640625 }, { "epoch": 0.8643464504647024, "step": 8742, "train/total_loss": 0.14691689610481262 }, { "entropy": 8.535341262817383, "epoch": 0.8644453233142179, "mean_token_accuracy": 0.7163029313087463, "num_tokens": 24700922.0, "step": 8743, "train/ce_loss": 0.9688300490379333 }, { "epoch": 0.8644453233142179, "step": 8743, "train/sim_loss": 0.05078125 }, { "epoch": 0.8644453233142179, "step": 8743, "train/total_loss": 0.14766424894332886 }, { "entropy": 8.891427993774414, "epoch": 0.8645441961637335, "mean_token_accuracy": 0.771175742149353, "num_tokens": 24706199.0, "step": 8744, "train/ce_loss": 0.7586009502410889 }, { "epoch": 0.8645441961637335, "step": 8744, "train/sim_loss": 0.05078125 }, { "epoch": 0.8645441961637335, "step": 8744, "train/total_loss": 0.12664134800434113 }, { "entropy": 8.649910926818848, "epoch": 0.8646430690132489, "mean_token_accuracy": 0.7512376308441162, "num_tokens": 24711489.0, "step": 8745, "train/ce_loss": 0.7900895476341248 }, { "epoch": 0.8646430690132489, "step": 8745, "train/sim_loss": 0.0546875 }, { "epoch": 0.8646430690132489, "step": 8745, "train/total_loss": 0.13369646668434143 }, { "entropy": 8.408275604248047, "epoch": 0.8647419418627644, "mean_token_accuracy": 0.7441386580467224, "num_tokens": 24717007.0, "step": 8746, "train/ce_loss": 1.1202709674835205 }, { "epoch": 0.8647419418627644, "step": 8746, "train/sim_loss": 0.0859375 }, { "epoch": 0.8647419418627644, "step": 8746, "train/total_loss": 0.197964608669281 }, { "entropy": 8.852964401245117, "epoch": 0.86484081471228, "mean_token_accuracy": 0.6896046996116638, "num_tokens": 24722128.0, "step": 8747, "train/ce_loss": 4.757750502903946e-06 }, { "epoch": 0.86484081471228, "step": 8747, "train/sim_loss": 0.0390625 }, { "epoch": 0.86484081471228, "step": 8747, "train/total_loss": 0.0390629768371582 }, { "entropy": 8.401473999023438, "epoch": 0.8649396875617955, "mean_token_accuracy": 0.7679222226142883, "num_tokens": 24727533.0, "step": 8748, "train/ce_loss": 0.5992299914360046 }, { "epoch": 0.8649396875617955, "step": 8748, "train/sim_loss": 0.04296875 }, { "epoch": 0.8649396875617955, "step": 8748, "train/total_loss": 0.10289175063371658 }, { "entropy": 9.06220531463623, "epoch": 0.8650385604113111, "mean_token_accuracy": 0.7325102686882019, "num_tokens": 24732446.0, "step": 8749, "train/ce_loss": 6.240025527404214e-07 }, { "epoch": 0.8650385604113111, "step": 8749, "train/sim_loss": 0.046875 }, { "epoch": 0.8650385604113111, "step": 8749, "train/total_loss": 0.046875063329935074 }, { "entropy": 8.637744903564453, "epoch": 0.8651374332608266, "mean_token_accuracy": 0.7092198729515076, "num_tokens": 24737699.0, "step": 8750, "train/ce_loss": 1.3729150295257568 }, { "epoch": 0.8651374332608266, "step": 8750, "train/sim_loss": 0.0390625 }, { "epoch": 0.8651374332608266, "step": 8750, "train/total_loss": 0.17635400593280792 }, { "entropy": 8.494135856628418, "epoch": 0.8652363061103421, "mean_token_accuracy": 0.7586206793785095, "num_tokens": 24742937.0, "step": 8751, "train/ce_loss": 0.7793292999267578 }, { "epoch": 0.8652363061103421, "step": 8751, "train/sim_loss": 0.05078125 }, { "epoch": 0.8652363061103421, "step": 8751, "train/total_loss": 0.1287141740322113 }, { "entropy": 8.59860610961914, "epoch": 0.8653351789598577, "mean_token_accuracy": 0.7230255603790283, "num_tokens": 24748247.0, "step": 8752, "train/ce_loss": 0.9081260561943054 }, { "epoch": 0.8653351789598577, "step": 8752, "train/sim_loss": 0.0703125 }, { "epoch": 0.8653351789598577, "step": 8752, "train/total_loss": 0.16112510859966278 }, { "entropy": 8.650777816772461, "epoch": 0.8654340518093732, "mean_token_accuracy": 0.7725631594657898, "num_tokens": 24753533.0, "step": 8753, "train/ce_loss": 0.8317409753799438 }, { "epoch": 0.8654340518093732, "step": 8753, "train/sim_loss": 0.0859375 }, { "epoch": 0.8654340518093732, "step": 8753, "train/total_loss": 0.16911160945892334 }, { "entropy": 8.657723426818848, "epoch": 0.8655329246588886, "mean_token_accuracy": 0.6906077265739441, "num_tokens": 24758684.0, "step": 8754, "train/ce_loss": 1.6056824922561646 }, { "epoch": 0.8655329246588886, "step": 8754, "train/sim_loss": 0.046875 }, { "epoch": 0.8655329246588886, "step": 8754, "train/total_loss": 0.2074432522058487 }, { "entropy": 8.510689735412598, "epoch": 0.8656317975084042, "mean_token_accuracy": 0.783369779586792, "num_tokens": 24764084.0, "step": 8755, "train/ce_loss": 0.9446129202842712 }, { "epoch": 0.8656317975084042, "step": 8755, "train/sim_loss": 0.09375 }, { "epoch": 0.8656317975084042, "step": 8755, "train/total_loss": 0.18821129202842712 }, { "entropy": 8.236587524414062, "epoch": 0.8657306703579197, "mean_token_accuracy": 0.772357702255249, "num_tokens": 24769425.0, "step": 8756, "train/ce_loss": 0.6139279007911682 }, { "epoch": 0.8657306703579197, "step": 8756, "train/sim_loss": 0.0546875 }, { "epoch": 0.8657306703579197, "step": 8756, "train/total_loss": 0.11608029156923294 }, { "entropy": 8.884057998657227, "epoch": 0.8658295432074352, "mean_token_accuracy": 0.7586705088615417, "num_tokens": 24774582.0, "step": 8757, "train/ce_loss": 0.7076722979545593 }, { "epoch": 0.8658295432074352, "step": 8757, "train/sim_loss": 0.05078125 }, { "epoch": 0.8658295432074352, "step": 8757, "train/total_loss": 0.12154848128557205 }, { "entropy": 8.224753379821777, "epoch": 0.8659284160569508, "mean_token_accuracy": 0.7279999852180481, "num_tokens": 24780062.0, "step": 8758, "train/ce_loss": 1.2939484119415283 }, { "epoch": 0.8659284160569508, "step": 8758, "train/sim_loss": 0.0234375 }, { "epoch": 0.8659284160569508, "step": 8758, "train/total_loss": 0.15283234417438507 }, { "entropy": 8.87765884399414, "epoch": 0.8660272889064663, "mean_token_accuracy": 0.765562891960144, "num_tokens": 24785284.0, "step": 8759, "train/ce_loss": 1.3594692945480347 }, { "epoch": 0.8660272889064663, "step": 8759, "train/sim_loss": 0.06640625 }, { "epoch": 0.8660272889064663, "step": 8759, "train/total_loss": 0.20235317945480347 }, { "epoch": 0.8661261617559818, "grad_norm": 0.588288426399231, "learning_rate": 7.836868911635268e-06, "loss": 0.1316, "step": 8760 }, { "entropy": 9.202454566955566, "epoch": 0.8661261617559818, "mean_token_accuracy": 0.7523452043533325, "num_tokens": 24790218.0, "step": 8760, "train/ce_loss": 7.031700306470157e-07 }, { "epoch": 0.8661261617559818, "step": 8760, "train/sim_loss": 0.03515625 }, { "epoch": 0.8661261617559818, "step": 8760, "train/total_loss": 0.03515632078051567 }, { "entropy": 8.66160774230957, "epoch": 0.8662250346054974, "mean_token_accuracy": 0.6910466551780701, "num_tokens": 24795443.0, "step": 8761, "train/ce_loss": 0.6732712388038635 }, { "epoch": 0.8662250346054974, "step": 8761, "train/sim_loss": 0.1015625 }, { "epoch": 0.8662250346054974, "step": 8761, "train/total_loss": 0.1688896268606186 }, { "entropy": 9.296399116516113, "epoch": 0.8663239074550129, "mean_token_accuracy": 0.7829268574714661, "num_tokens": 24800288.0, "step": 8762, "train/ce_loss": 9.687728379503824e-07 }, { "epoch": 0.8663239074550129, "step": 8762, "train/sim_loss": 0.0234375 }, { "epoch": 0.8663239074550129, "step": 8762, "train/total_loss": 0.02343759685754776 }, { "entropy": 8.678385734558105, "epoch": 0.8664227803045284, "mean_token_accuracy": 0.7857142686843872, "num_tokens": 24805656.0, "step": 8763, "train/ce_loss": 0.9216625094413757 }, { "epoch": 0.8664227803045284, "step": 8763, "train/sim_loss": 0.02734375 }, { "epoch": 0.8664227803045284, "step": 8763, "train/total_loss": 0.11951000243425369 }, { "entropy": 9.073097229003906, "epoch": 0.866521653154044, "mean_token_accuracy": 0.7687074542045593, "num_tokens": 24810682.0, "step": 8764, "train/ce_loss": 0.7515096664428711 }, { "epoch": 0.866521653154044, "step": 8764, "train/sim_loss": 0.01171875 }, { "epoch": 0.866521653154044, "step": 8764, "train/total_loss": 0.08686971664428711 }, { "entropy": 8.48833179473877, "epoch": 0.8666205260035594, "mean_token_accuracy": 0.7521263957023621, "num_tokens": 24816012.0, "step": 8765, "train/ce_loss": 0.7278606295585632 }, { "epoch": 0.8666205260035594, "step": 8765, "train/sim_loss": 0.046875 }, { "epoch": 0.8666205260035594, "step": 8765, "train/total_loss": 0.11966106295585632 }, { "entropy": 8.351832389831543, "epoch": 0.8667193988530749, "mean_token_accuracy": 0.7191234827041626, "num_tokens": 24821523.0, "step": 8766, "train/ce_loss": 1.1557978391647339 }, { "epoch": 0.8667193988530749, "step": 8766, "train/sim_loss": 0.04296875 }, { "epoch": 0.8667193988530749, "step": 8766, "train/total_loss": 0.1585485339164734 }, { "entropy": 8.598573684692383, "epoch": 0.8668182717025905, "mean_token_accuracy": 0.7502762675285339, "num_tokens": 24826916.0, "step": 8767, "train/ce_loss": 0.9814897775650024 }, { "epoch": 0.8668182717025905, "step": 8767, "train/sim_loss": 0.07421875 }, { "epoch": 0.8668182717025905, "step": 8767, "train/total_loss": 0.17236772179603577 }, { "entropy": 8.701048851013184, "epoch": 0.866917144552106, "mean_token_accuracy": 0.7069825530052185, "num_tokens": 24832214.0, "step": 8768, "train/ce_loss": 1.222639560699463 }, { "epoch": 0.866917144552106, "step": 8768, "train/sim_loss": 0.06640625 }, { "epoch": 0.866917144552106, "step": 8768, "train/total_loss": 0.18867021799087524 }, { "entropy": 8.696617126464844, "epoch": 0.8670160174016215, "mean_token_accuracy": 0.7450058460235596, "num_tokens": 24837607.0, "step": 8769, "train/ce_loss": 0.3104584515094757 }, { "epoch": 0.8670160174016215, "step": 8769, "train/sim_loss": 0.0546875 }, { "epoch": 0.8670160174016215, "step": 8769, "train/total_loss": 0.08573334664106369 }, { "entropy": 8.912432670593262, "epoch": 0.8671148902511371, "mean_token_accuracy": 0.7912912964820862, "num_tokens": 24842729.0, "step": 8770, "train/ce_loss": 1.3645211458206177 }, { "epoch": 0.8671148902511371, "step": 8770, "train/sim_loss": 0.0703125 }, { "epoch": 0.8671148902511371, "step": 8770, "train/total_loss": 0.20676462352275848 }, { "entropy": 8.724910736083984, "epoch": 0.8672137631006526, "mean_token_accuracy": 0.7616279125213623, "num_tokens": 24847877.0, "step": 8771, "train/ce_loss": 1.2197141647338867 }, { "epoch": 0.8672137631006526, "step": 8771, "train/sim_loss": 0.078125 }, { "epoch": 0.8672137631006526, "step": 8771, "train/total_loss": 0.20009642839431763 }, { "entropy": 8.996208190917969, "epoch": 0.867312635950168, "mean_token_accuracy": 0.7210440635681152, "num_tokens": 24852966.0, "step": 8772, "train/ce_loss": 1.4611930847167969 }, { "epoch": 0.867312635950168, "step": 8772, "train/sim_loss": 0.0625 }, { "epoch": 0.867312635950168, "step": 8772, "train/total_loss": 0.20861931145191193 }, { "entropy": 8.577859878540039, "epoch": 0.8674115087996837, "mean_token_accuracy": 0.718191385269165, "num_tokens": 24858363.0, "step": 8773, "train/ce_loss": 1.5229592323303223 }, { "epoch": 0.8674115087996837, "step": 8773, "train/sim_loss": 0.046875 }, { "epoch": 0.8674115087996837, "step": 8773, "train/total_loss": 0.19917093217372894 }, { "entropy": 8.59080982208252, "epoch": 0.8675103816491991, "mean_token_accuracy": 0.7405966520309448, "num_tokens": 24863565.0, "step": 8774, "train/ce_loss": 0.35415276885032654 }, { "epoch": 0.8675103816491991, "step": 8774, "train/sim_loss": 0.03515625 }, { "epoch": 0.8675103816491991, "step": 8774, "train/total_loss": 0.07057152688503265 }, { "entropy": 8.744940757751465, "epoch": 0.8676092544987146, "mean_token_accuracy": 0.7650063633918762, "num_tokens": 24868777.0, "step": 8775, "train/ce_loss": 0.9832481741905212 }, { "epoch": 0.8676092544987146, "step": 8775, "train/sim_loss": 0.1171875 }, { "epoch": 0.8676092544987146, "step": 8775, "train/total_loss": 0.21551232039928436 }, { "entropy": 8.630331039428711, "epoch": 0.8677081273482302, "mean_token_accuracy": 0.7324913740158081, "num_tokens": 24874099.0, "step": 8776, "train/ce_loss": 0.6644471883773804 }, { "epoch": 0.8677081273482302, "step": 8776, "train/sim_loss": 0.0390625 }, { "epoch": 0.8677081273482302, "step": 8776, "train/total_loss": 0.10550721734762192 }, { "entropy": 8.871881484985352, "epoch": 0.8678070001977457, "mean_token_accuracy": 0.7710843086242676, "num_tokens": 24879211.0, "step": 8777, "train/ce_loss": 0.5619823336601257 }, { "epoch": 0.8678070001977457, "step": 8777, "train/sim_loss": 0.04296875 }, { "epoch": 0.8678070001977457, "step": 8777, "train/total_loss": 0.09916698932647705 }, { "entropy": 8.483622550964355, "epoch": 0.8679058730472612, "mean_token_accuracy": 0.7622682452201843, "num_tokens": 24884564.0, "step": 8778, "train/ce_loss": 0.8081666231155396 }, { "epoch": 0.8679058730472612, "step": 8778, "train/sim_loss": 0.05859375 }, { "epoch": 0.8679058730472612, "step": 8778, "train/total_loss": 0.13941040635108948 }, { "entropy": 8.644962310791016, "epoch": 0.8680047458967768, "mean_token_accuracy": 0.7714285850524902, "num_tokens": 24889883.0, "step": 8779, "train/ce_loss": 0.7450631856918335 }, { "epoch": 0.8680047458967768, "step": 8779, "train/sim_loss": 0.04296875 }, { "epoch": 0.8680047458967768, "step": 8779, "train/total_loss": 0.11747507005929947 }, { "epoch": 0.8681036187462923, "grad_norm": 0.5960235595703125, "learning_rate": 7.831924046877318e-06, "loss": 0.1282, "step": 8780 }, { "entropy": 8.878667831420898, "epoch": 0.8681036187462923, "mean_token_accuracy": 0.730715274810791, "num_tokens": 24895108.0, "step": 8780, "train/ce_loss": 0.5047854781150818 }, { "epoch": 0.8681036187462923, "step": 8780, "train/sim_loss": 0.1015625 }, { "epoch": 0.8681036187462923, "step": 8780, "train/total_loss": 0.15204104781150818 }, { "entropy": 8.98653793334961, "epoch": 0.8682024915958078, "mean_token_accuracy": 0.6356073021888733, "num_tokens": 24900180.0, "step": 8781, "train/ce_loss": 0.7897573709487915 }, { "epoch": 0.8682024915958078, "step": 8781, "train/sim_loss": 0.09375 }, { "epoch": 0.8682024915958078, "step": 8781, "train/total_loss": 0.17272573709487915 }, { "entropy": 9.035877227783203, "epoch": 0.8683013644453234, "mean_token_accuracy": 0.7129120826721191, "num_tokens": 24905321.0, "step": 8782, "train/ce_loss": 0.4941462278366089 }, { "epoch": 0.8683013644453234, "step": 8782, "train/sim_loss": 0.01953125 }, { "epoch": 0.8683013644453234, "step": 8782, "train/total_loss": 0.06894586980342865 }, { "entropy": 8.775835990905762, "epoch": 0.8684002372948388, "mean_token_accuracy": 0.7245950102806091, "num_tokens": 24910441.0, "step": 8783, "train/ce_loss": 5.589906777458964e-07 }, { "epoch": 0.8684002372948388, "step": 8783, "train/sim_loss": 0.046875 }, { "epoch": 0.8684002372948388, "step": 8783, "train/total_loss": 0.04687505587935448 }, { "entropy": 8.742904663085938, "epoch": 0.8684991101443543, "mean_token_accuracy": 0.7463592290878296, "num_tokens": 24915743.0, "step": 8784, "train/ce_loss": 0.47416871786117554 }, { "epoch": 0.8684991101443543, "step": 8784, "train/sim_loss": 0.05078125 }, { "epoch": 0.8684991101443543, "step": 8784, "train/total_loss": 0.09819812327623367 }, { "entropy": 8.247857093811035, "epoch": 0.8685979829938699, "mean_token_accuracy": 0.739294707775116, "num_tokens": 24921019.0, "step": 8785, "train/ce_loss": 1.2385609149932861 }, { "epoch": 0.8685979829938699, "step": 8785, "train/sim_loss": 0.04296875 }, { "epoch": 0.8685979829938699, "step": 8785, "train/total_loss": 0.1668248474597931 }, { "entropy": 8.880462646484375, "epoch": 0.8686968558433854, "mean_token_accuracy": 0.765925943851471, "num_tokens": 24926142.0, "step": 8786, "train/ce_loss": 0.6230125427246094 }, { "epoch": 0.8686968558433854, "step": 8786, "train/sim_loss": 0.0625 }, { "epoch": 0.8686968558433854, "step": 8786, "train/total_loss": 0.12480125576257706 }, { "entropy": 8.575641632080078, "epoch": 0.8687957286929009, "mean_token_accuracy": 0.7439724206924438, "num_tokens": 24931476.0, "step": 8787, "train/ce_loss": 0.9515129923820496 }, { "epoch": 0.8687957286929009, "step": 8787, "train/sim_loss": 0.05859375 }, { "epoch": 0.8687957286929009, "step": 8787, "train/total_loss": 0.15374505519866943 }, { "entropy": 8.655603408813477, "epoch": 0.8688946015424165, "mean_token_accuracy": 0.7214885950088501, "num_tokens": 24936757.0, "step": 8788, "train/ce_loss": 0.8460344672203064 }, { "epoch": 0.8688946015424165, "step": 8788, "train/sim_loss": 0.04296875 }, { "epoch": 0.8688946015424165, "step": 8788, "train/total_loss": 0.1275722086429596 }, { "entropy": 8.732928276062012, "epoch": 0.868993474391932, "mean_token_accuracy": 0.703611433506012, "num_tokens": 24942020.0, "step": 8789, "train/ce_loss": 1.4696121215820312 }, { "epoch": 0.868993474391932, "step": 8789, "train/sim_loss": 0.10546875 }, { "epoch": 0.868993474391932, "step": 8789, "train/total_loss": 0.2524299621582031 }, { "entropy": 8.986867904663086, "epoch": 0.8690923472414475, "mean_token_accuracy": 0.7361111044883728, "num_tokens": 24947228.0, "step": 8790, "train/ce_loss": 6.576165105798282e-07 }, { "epoch": 0.8690923472414475, "step": 8790, "train/sim_loss": 0.05078125 }, { "epoch": 0.8690923472414475, "step": 8790, "train/total_loss": 0.05078131705522537 }, { "entropy": 9.043817520141602, "epoch": 0.8691912200909631, "mean_token_accuracy": 0.7069182395935059, "num_tokens": 24952470.0, "step": 8791, "train/ce_loss": 0.8918421864509583 }, { "epoch": 0.8691912200909631, "step": 8791, "train/sim_loss": 0.04296875 }, { "epoch": 0.8691912200909631, "step": 8791, "train/total_loss": 0.1321529746055603 }, { "entropy": 8.527637481689453, "epoch": 0.8692900929404785, "mean_token_accuracy": 0.7156398296356201, "num_tokens": 24957819.0, "step": 8792, "train/ce_loss": 0.6995357871055603 }, { "epoch": 0.8692900929404785, "step": 8792, "train/sim_loss": 0.03125 }, { "epoch": 0.8692900929404785, "step": 8792, "train/total_loss": 0.10120358318090439 }, { "entropy": 8.938925743103027, "epoch": 0.869388965789994, "mean_token_accuracy": 0.7340824007987976, "num_tokens": 24963084.0, "step": 8793, "train/ce_loss": 0.5987036228179932 }, { "epoch": 0.869388965789994, "step": 8793, "train/sim_loss": 0.015625 }, { "epoch": 0.869388965789994, "step": 8793, "train/total_loss": 0.07549536228179932 }, { "entropy": 8.46339225769043, "epoch": 0.8694878386395096, "mean_token_accuracy": 0.7095837593078613, "num_tokens": 24968653.0, "step": 8794, "train/ce_loss": 1.06622314453125 }, { "epoch": 0.8694878386395096, "step": 8794, "train/sim_loss": 0.04296875 }, { "epoch": 0.8694878386395096, "step": 8794, "train/total_loss": 0.14959105849266052 }, { "entropy": 8.62234115600586, "epoch": 0.8695867114890251, "mean_token_accuracy": 0.7968923449516296, "num_tokens": 24973975.0, "step": 8795, "train/ce_loss": 0.8510656952857971 }, { "epoch": 0.8695867114890251, "step": 8795, "train/sim_loss": 0.015625 }, { "epoch": 0.8695867114890251, "step": 8795, "train/total_loss": 0.10073157399892807 }, { "entropy": 8.597803115844727, "epoch": 0.8696855843385406, "mean_token_accuracy": 0.6967545747756958, "num_tokens": 24979419.0, "step": 8796, "train/ce_loss": 1.3414621353149414 }, { "epoch": 0.8696855843385406, "step": 8796, "train/sim_loss": 0.0234375 }, { "epoch": 0.8696855843385406, "step": 8796, "train/total_loss": 0.15758371353149414 }, { "entropy": 8.722497940063477, "epoch": 0.8697844571880562, "mean_token_accuracy": 0.6937500238418579, "num_tokens": 24984711.0, "step": 8797, "train/ce_loss": 0.9156614542007446 }, { "epoch": 0.8697844571880562, "step": 8797, "train/sim_loss": 0.08203125 }, { "epoch": 0.8697844571880562, "step": 8797, "train/total_loss": 0.17359739542007446 }, { "entropy": 8.773809432983398, "epoch": 0.8698833300375717, "mean_token_accuracy": 0.7800875306129456, "num_tokens": 24990046.0, "step": 8798, "train/ce_loss": 0.6494016647338867 }, { "epoch": 0.8698833300375717, "step": 8798, "train/sim_loss": 0.0234375 }, { "epoch": 0.8698833300375717, "step": 8798, "train/total_loss": 0.08837766945362091 }, { "entropy": 9.861303329467773, "epoch": 0.8699822028870872, "mean_token_accuracy": 0.7676056623458862, "num_tokens": 24994736.0, "step": 8799, "train/ce_loss": 7.118906637515465e-07 }, { "epoch": 0.8699822028870872, "step": 8799, "train/sim_loss": 0.03515625 }, { "epoch": 0.8699822028870872, "step": 8799, "train/total_loss": 0.03515632078051567 }, { "epoch": 0.8700810757366028, "grad_norm": 0.8049528002738953, "learning_rate": 7.82697918211937e-06, "loss": 0.1392, "step": 8800 }, { "entropy": 9.053065299987793, "epoch": 0.8700810757366028, "mean_token_accuracy": 0.772357702255249, "num_tokens": 24999798.0, "step": 8800, "train/ce_loss": 1.8910090923309326 }, { "epoch": 0.8700810757366028, "step": 8800, "train/sim_loss": 0.078125 }, { "epoch": 0.8700810757366028, "step": 8800, "train/total_loss": 0.2672259211540222 }, { "entropy": 8.306099891662598, "epoch": 0.8701799485861182, "mean_token_accuracy": 0.7704485654830933, "num_tokens": 25005411.0, "step": 8801, "train/ce_loss": 0.46456578373908997 }, { "epoch": 0.8701799485861182, "step": 8801, "train/sim_loss": 0.01953125 }, { "epoch": 0.8701799485861182, "step": 8801, "train/total_loss": 0.06598782539367676 }, { "entropy": 8.760089874267578, "epoch": 0.8702788214356337, "mean_token_accuracy": 0.718482255935669, "num_tokens": 25010714.0, "step": 8802, "train/ce_loss": 0.6581257581710815 }, { "epoch": 0.8702788214356337, "step": 8802, "train/sim_loss": 0.04296875 }, { "epoch": 0.8702788214356337, "step": 8802, "train/total_loss": 0.10878133028745651 }, { "entropy": 8.544943809509277, "epoch": 0.8703776942851493, "mean_token_accuracy": 0.7622149586677551, "num_tokens": 25016044.0, "step": 8803, "train/ce_loss": 0.3731022775173187 }, { "epoch": 0.8703776942851493, "step": 8803, "train/sim_loss": 0.046875 }, { "epoch": 0.8703776942851493, "step": 8803, "train/total_loss": 0.08418522775173187 }, { "entropy": 9.079167366027832, "epoch": 0.8704765671346648, "mean_token_accuracy": 0.7332361340522766, "num_tokens": 25021135.0, "step": 8804, "train/ce_loss": 1.3193233013153076 }, { "epoch": 0.8704765671346648, "step": 8804, "train/sim_loss": 0.09375 }, { "epoch": 0.8704765671346648, "step": 8804, "train/total_loss": 0.225682333111763 }, { "entropy": 8.460450172424316, "epoch": 0.8705754399841803, "mean_token_accuracy": 0.7470398545265198, "num_tokens": 25026685.0, "step": 8805, "train/ce_loss": 0.6708948612213135 }, { "epoch": 0.8705754399841803, "step": 8805, "train/sim_loss": 0.046875 }, { "epoch": 0.8705754399841803, "step": 8805, "train/total_loss": 0.1139644905924797 }, { "entropy": 8.694866180419922, "epoch": 0.8706743128336959, "mean_token_accuracy": 0.7612500190734863, "num_tokens": 25031976.0, "step": 8806, "train/ce_loss": 1.2063027620315552 }, { "epoch": 0.8706743128336959, "step": 8806, "train/sim_loss": 0.05078125 }, { "epoch": 0.8706743128336959, "step": 8806, "train/total_loss": 0.17141152918338776 }, { "entropy": 9.025408744812012, "epoch": 0.8707731856832114, "mean_token_accuracy": 0.759381890296936, "num_tokens": 25036890.0, "step": 8807, "train/ce_loss": 0.5602459907531738 }, { "epoch": 0.8707731856832114, "step": 8807, "train/sim_loss": 0.05078125 }, { "epoch": 0.8707731856832114, "step": 8807, "train/total_loss": 0.10680584609508514 }, { "entropy": 9.421650886535645, "epoch": 0.8708720585327269, "mean_token_accuracy": 0.7763713002204895, "num_tokens": 25041789.0, "step": 8808, "train/ce_loss": 1.0831292867660522 }, { "epoch": 0.8708720585327269, "step": 8808, "train/sim_loss": 0.09765625 }, { "epoch": 0.8708720585327269, "step": 8808, "train/total_loss": 0.2059691846370697 }, { "entropy": 8.586231231689453, "epoch": 0.8709709313822425, "mean_token_accuracy": 0.7010989189147949, "num_tokens": 25047134.0, "step": 8809, "train/ce_loss": 1.241375207901001 }, { "epoch": 0.8709709313822425, "step": 8809, "train/sim_loss": 0.0859375 }, { "epoch": 0.8709709313822425, "step": 8809, "train/total_loss": 0.2100750207901001 }, { "entropy": 8.946133613586426, "epoch": 0.871069804231758, "mean_token_accuracy": 0.7638036608695984, "num_tokens": 25052235.0, "step": 8810, "train/ce_loss": 0.470211386680603 }, { "epoch": 0.871069804231758, "step": 8810, "train/sim_loss": 0.04296875 }, { "epoch": 0.871069804231758, "step": 8810, "train/total_loss": 0.08998988568782806 }, { "entropy": 9.209354400634766, "epoch": 0.8711686770812734, "mean_token_accuracy": 0.7810857892036438, "num_tokens": 25057229.0, "step": 8811, "train/ce_loss": 3.2818891781971615e-07 }, { "epoch": 0.8711686770812734, "step": 8811, "train/sim_loss": 0.03515625 }, { "epoch": 0.8711686770812734, "step": 8811, "train/total_loss": 0.035156283527612686 }, { "entropy": 9.166122436523438, "epoch": 0.871267549930789, "mean_token_accuracy": 0.7071197628974915, "num_tokens": 25062294.0, "step": 8812, "train/ce_loss": 2.000129461288452 }, { "epoch": 0.871267549930789, "step": 8812, "train/sim_loss": 0.0625 }, { "epoch": 0.871267549930789, "step": 8812, "train/total_loss": 0.2625129520893097 }, { "entropy": 9.153691291809082, "epoch": 0.8713664227803045, "mean_token_accuracy": 0.7372449040412903, "num_tokens": 25067145.0, "step": 8813, "train/ce_loss": 1.0874640565816662e-06 }, { "epoch": 0.8713664227803045, "step": 8813, "train/sim_loss": 0.03515625 }, { "epoch": 0.8713664227803045, "step": 8813, "train/total_loss": 0.035156358033418655 }, { "entropy": 9.020694732666016, "epoch": 0.87146529562982, "mean_token_accuracy": 0.7691154479980469, "num_tokens": 25072257.0, "step": 8814, "train/ce_loss": 0.9739776253700256 }, { "epoch": 0.87146529562982, "step": 8814, "train/sim_loss": 0.05078125 }, { "epoch": 0.87146529562982, "step": 8814, "train/total_loss": 0.14817902445793152 }, { "entropy": 8.523090362548828, "epoch": 0.8715641684793356, "mean_token_accuracy": 0.7251995205879211, "num_tokens": 25077597.0, "step": 8815, "train/ce_loss": 1.0241416692733765 }, { "epoch": 0.8715641684793356, "step": 8815, "train/sim_loss": 0.05859375 }, { "epoch": 0.8715641684793356, "step": 8815, "train/total_loss": 0.16100791096687317 }, { "entropy": 9.017142295837402, "epoch": 0.8716630413288511, "mean_token_accuracy": 0.7455196976661682, "num_tokens": 25082608.0, "step": 8816, "train/ce_loss": 1.1585428714752197 }, { "epoch": 0.8716630413288511, "step": 8816, "train/sim_loss": 0.06640625 }, { "epoch": 0.8716630413288511, "step": 8816, "train/total_loss": 0.18226054310798645 }, { "entropy": 8.302145004272461, "epoch": 0.8717619141783666, "mean_token_accuracy": 0.7987151741981506, "num_tokens": 25088050.0, "step": 8817, "train/ce_loss": 0.5422400236129761 }, { "epoch": 0.8717619141783666, "step": 8817, "train/sim_loss": 0.03515625 }, { "epoch": 0.8717619141783666, "step": 8817, "train/total_loss": 0.08938024938106537 }, { "entropy": 8.53742790222168, "epoch": 0.8718607870278822, "mean_token_accuracy": 0.7130434513092041, "num_tokens": 25093477.0, "step": 8818, "train/ce_loss": 0.6250754594802856 }, { "epoch": 0.8718607870278822, "step": 8818, "train/sim_loss": 0.109375 }, { "epoch": 0.8718607870278822, "step": 8818, "train/total_loss": 0.1718825399875641 }, { "entropy": 8.350768089294434, "epoch": 0.8719596598773977, "mean_token_accuracy": 0.76962810754776, "num_tokens": 25098957.0, "step": 8819, "train/ce_loss": 0.6983389258384705 }, { "epoch": 0.8719596598773977, "step": 8819, "train/sim_loss": 0.0234375 }, { "epoch": 0.8719596598773977, "step": 8819, "train/total_loss": 0.0932713970541954 }, { "epoch": 0.8720585327269131, "grad_norm": 0.6103883385658264, "learning_rate": 7.822034317361421e-06, "loss": 0.1331, "step": 8820 }, { "entropy": 8.73092269897461, "epoch": 0.8720585327269131, "mean_token_accuracy": 0.7148817777633667, "num_tokens": 25104127.0, "step": 8820, "train/ce_loss": 8.011184604583832e-07 }, { "epoch": 0.8720585327269131, "step": 8820, "train/sim_loss": 0.03125 }, { "epoch": 0.8720585327269131, "step": 8820, "train/total_loss": 0.031250081956386566 }, { "entropy": 8.991170883178711, "epoch": 0.8721574055764287, "mean_token_accuracy": 0.8072837591171265, "num_tokens": 25109231.0, "step": 8821, "train/ce_loss": 0.746583878993988 }, { "epoch": 0.8721574055764287, "step": 8821, "train/sim_loss": 0.015625 }, { "epoch": 0.8721574055764287, "step": 8821, "train/total_loss": 0.09028338640928268 }, { "entropy": 8.568203926086426, "epoch": 0.8722562784259442, "mean_token_accuracy": 0.7308707237243652, "num_tokens": 25114476.0, "step": 8822, "train/ce_loss": 1.2717561721801758 }, { "epoch": 0.8722562784259442, "step": 8822, "train/sim_loss": 0.05859375 }, { "epoch": 0.8722562784259442, "step": 8822, "train/total_loss": 0.18576936423778534 }, { "entropy": 8.938301086425781, "epoch": 0.8723551512754597, "mean_token_accuracy": 0.7442799210548401, "num_tokens": 25119680.0, "step": 8823, "train/ce_loss": 1.2276751704121125e-06 }, { "epoch": 0.8723551512754597, "step": 8823, "train/sim_loss": 0.04296875 }, { "epoch": 0.8723551512754597, "step": 8823, "train/total_loss": 0.04296887293457985 }, { "entropy": 8.760126113891602, "epoch": 0.8724540241249753, "mean_token_accuracy": 0.7533252835273743, "num_tokens": 25124967.0, "step": 8824, "train/ce_loss": 1.063308835029602 }, { "epoch": 0.8724540241249753, "step": 8824, "train/sim_loss": 0.05859375 }, { "epoch": 0.8724540241249753, "step": 8824, "train/total_loss": 0.16492463648319244 }, { "entropy": 8.983345031738281, "epoch": 0.8725528969744908, "mean_token_accuracy": 0.7981651425361633, "num_tokens": 25130067.0, "step": 8825, "train/ce_loss": 1.3582448959350586 }, { "epoch": 0.8725528969744908, "step": 8825, "train/sim_loss": 0.0234375 }, { "epoch": 0.8725528969744908, "step": 8825, "train/total_loss": 0.15926198661327362 }, { "entropy": 8.7062349319458, "epoch": 0.8726517698240063, "mean_token_accuracy": 0.7188295125961304, "num_tokens": 25135337.0, "step": 8826, "train/ce_loss": 0.798263669013977 }, { "epoch": 0.8726517698240063, "step": 8826, "train/sim_loss": 0.05859375 }, { "epoch": 0.8726517698240063, "step": 8826, "train/total_loss": 0.13842011988162994 }, { "entropy": 9.845602035522461, "epoch": 0.8727506426735219, "mean_token_accuracy": 0.6994949579238892, "num_tokens": 25140133.0, "step": 8827, "train/ce_loss": 1.5620946884155273 }, { "epoch": 0.8727506426735219, "step": 8827, "train/sim_loss": 0.06640625 }, { "epoch": 0.8727506426735219, "step": 8827, "train/total_loss": 0.22261571884155273 }, { "entropy": 9.002527236938477, "epoch": 0.8728495155230374, "mean_token_accuracy": 0.7468553185462952, "num_tokens": 25145188.0, "step": 8828, "train/ce_loss": 9.026667271427868e-07 }, { "epoch": 0.8728495155230374, "step": 8828, "train/sim_loss": 0.0546875 }, { "epoch": 0.8728495155230374, "step": 8828, "train/total_loss": 0.05468758940696716 }, { "entropy": 9.244731903076172, "epoch": 0.8729483883725528, "mean_token_accuracy": 0.7821100950241089, "num_tokens": 25150100.0, "step": 8829, "train/ce_loss": 0.8010627627372742 }, { "epoch": 0.8729483883725528, "step": 8829, "train/sim_loss": 0.0390625 }, { "epoch": 0.8729483883725528, "step": 8829, "train/total_loss": 0.11916878074407578 }, { "entropy": 8.634561538696289, "epoch": 0.8730472612220684, "mean_token_accuracy": 0.7494061589241028, "num_tokens": 25155403.0, "step": 8830, "train/ce_loss": 0.6409665942192078 }, { "epoch": 0.8730472612220684, "step": 8830, "train/sim_loss": 0.03125 }, { "epoch": 0.8730472612220684, "step": 8830, "train/total_loss": 0.09534665942192078 }, { "entropy": 8.507293701171875, "epoch": 0.8731461340715839, "mean_token_accuracy": 0.7270641922950745, "num_tokens": 25160774.0, "step": 8831, "train/ce_loss": 0.6567292809486389 }, { "epoch": 0.8731461340715839, "step": 8831, "train/sim_loss": 0.0234375 }, { "epoch": 0.8731461340715839, "step": 8831, "train/total_loss": 0.08911042660474777 }, { "entropy": 8.993139266967773, "epoch": 0.8732450069210995, "mean_token_accuracy": 0.7669291496276855, "num_tokens": 25165869.0, "step": 8832, "train/ce_loss": 1.3291172981262207 }, { "epoch": 0.8732450069210995, "step": 8832, "train/sim_loss": 0.05859375 }, { "epoch": 0.8732450069210995, "step": 8832, "train/total_loss": 0.19150547683238983 }, { "entropy": 8.491973876953125, "epoch": 0.873343879770615, "mean_token_accuracy": 0.7161226272583008, "num_tokens": 25171504.0, "step": 8833, "train/ce_loss": 0.9487578868865967 }, { "epoch": 0.873343879770615, "step": 8833, "train/sim_loss": 0.08203125 }, { "epoch": 0.873343879770615, "step": 8833, "train/total_loss": 0.1769070327281952 }, { "entropy": 9.158414840698242, "epoch": 0.8734427526201305, "mean_token_accuracy": 0.7287319302558899, "num_tokens": 25176575.0, "step": 8834, "train/ce_loss": 1.3763008117675781 }, { "epoch": 0.8734427526201305, "step": 8834, "train/sim_loss": 0.01953125 }, { "epoch": 0.8734427526201305, "step": 8834, "train/total_loss": 0.15716134011745453 }, { "entropy": 8.496709823608398, "epoch": 0.8735416254696461, "mean_token_accuracy": 0.6809210777282715, "num_tokens": 25181957.0, "step": 8835, "train/ce_loss": 1.1742103099822998 }, { "epoch": 0.8735416254696461, "step": 8835, "train/sim_loss": 0.0390625 }, { "epoch": 0.8735416254696461, "step": 8835, "train/total_loss": 0.15648353099822998 }, { "entropy": 9.131623268127441, "epoch": 0.8736404983191616, "mean_token_accuracy": 0.6885812878608704, "num_tokens": 25186968.0, "step": 8836, "train/ce_loss": 0.7909911274909973 }, { "epoch": 0.8736404983191616, "step": 8836, "train/sim_loss": 0.05078125 }, { "epoch": 0.8736404983191616, "step": 8836, "train/total_loss": 0.1298803687095642 }, { "entropy": 8.554312705993652, "epoch": 0.8737393711686771, "mean_token_accuracy": 0.7601432204246521, "num_tokens": 25192270.0, "step": 8837, "train/ce_loss": 0.972604513168335 }, { "epoch": 0.8737393711686771, "step": 8837, "train/sim_loss": 0.109375 }, { "epoch": 0.8737393711686771, "step": 8837, "train/total_loss": 0.20663544535636902 }, { "entropy": 8.629166603088379, "epoch": 0.8738382440181927, "mean_token_accuracy": 0.7542662024497986, "num_tokens": 25197672.0, "step": 8838, "train/ce_loss": 1.1455631256103516 }, { "epoch": 0.8738382440181927, "step": 8838, "train/sim_loss": 0.0625 }, { "epoch": 0.8738382440181927, "step": 8838, "train/total_loss": 0.17705631256103516 }, { "entropy": 8.445144653320312, "epoch": 0.8739371168677081, "mean_token_accuracy": 0.7559139728546143, "num_tokens": 25203111.0, "step": 8839, "train/ce_loss": 0.46726107597351074 }, { "epoch": 0.8739371168677081, "step": 8839, "train/sim_loss": 0.0234375 }, { "epoch": 0.8739371168677081, "step": 8839, "train/total_loss": 0.07016360759735107 }, { "epoch": 0.8740359897172236, "grad_norm": 0.5828443169593811, "learning_rate": 7.817089452603473e-06, "loss": 0.1283, "step": 8840 }, { "entropy": 8.578930854797363, "epoch": 0.8740359897172236, "mean_token_accuracy": 0.716911792755127, "num_tokens": 25208403.0, "step": 8840, "train/ce_loss": 0.8367867469787598 }, { "epoch": 0.8740359897172236, "step": 8840, "train/sim_loss": 0.0546875 }, { "epoch": 0.8740359897172236, "step": 8840, "train/total_loss": 0.13836617767810822 }, { "entropy": 9.337748527526855, "epoch": 0.8741348625667392, "mean_token_accuracy": 0.7061403393745422, "num_tokens": 25213254.0, "step": 8841, "train/ce_loss": 3.4821451322386565e-07 }, { "epoch": 0.8741348625667392, "step": 8841, "train/sim_loss": 0.015625 }, { "epoch": 0.8741348625667392, "step": 8841, "train/total_loss": 0.015625035390257835 }, { "entropy": 9.182525634765625, "epoch": 0.8742337354162547, "mean_token_accuracy": 0.7558139562606812, "num_tokens": 25218292.0, "step": 8842, "train/ce_loss": 0.8731189370155334 }, { "epoch": 0.8742337354162547, "step": 8842, "train/sim_loss": 0.0234375 }, { "epoch": 0.8742337354162547, "step": 8842, "train/total_loss": 0.11074939370155334 }, { "entropy": 9.116571426391602, "epoch": 0.8743326082657702, "mean_token_accuracy": 0.7578125, "num_tokens": 25223217.0, "step": 8843, "train/ce_loss": 1.6050145626068115 }, { "epoch": 0.8743326082657702, "step": 8843, "train/sim_loss": 0.0546875 }, { "epoch": 0.8743326082657702, "step": 8843, "train/total_loss": 0.21518896520137787 }, { "entropy": 8.42207145690918, "epoch": 0.8744314811152858, "mean_token_accuracy": 0.7744282484054565, "num_tokens": 25228672.0, "step": 8844, "train/ce_loss": 0.43781447410583496 }, { "epoch": 0.8744314811152858, "step": 8844, "train/sim_loss": 0.0234375 }, { "epoch": 0.8744314811152858, "step": 8844, "train/total_loss": 0.06721894443035126 }, { "entropy": 9.061729431152344, "epoch": 0.8745303539648013, "mean_token_accuracy": 0.7083947062492371, "num_tokens": 25233823.0, "step": 8845, "train/ce_loss": 1.604802131652832 }, { "epoch": 0.8745303539648013, "step": 8845, "train/sim_loss": 0.06640625 }, { "epoch": 0.8745303539648013, "step": 8845, "train/total_loss": 0.22688646614551544 }, { "entropy": 8.510478019714355, "epoch": 0.8746292268143168, "mean_token_accuracy": 0.748400866985321, "num_tokens": 25239165.0, "step": 8846, "train/ce_loss": 0.6801382303237915 }, { "epoch": 0.8746292268143168, "step": 8846, "train/sim_loss": 0.0390625 }, { "epoch": 0.8746292268143168, "step": 8846, "train/total_loss": 0.10707632452249527 }, { "entropy": 8.817245483398438, "epoch": 0.8747280996638324, "mean_token_accuracy": 0.7609921097755432, "num_tokens": 25244504.0, "step": 8847, "train/ce_loss": 1.1438863277435303 }, { "epoch": 0.8747280996638324, "step": 8847, "train/sim_loss": 0.0546875 }, { "epoch": 0.8747280996638324, "step": 8847, "train/total_loss": 0.16907614469528198 }, { "entropy": 8.993545532226562, "epoch": 0.8748269725133478, "mean_token_accuracy": 0.7871674299240112, "num_tokens": 25249611.0, "step": 8848, "train/ce_loss": 1.3642228841781616 }, { "epoch": 0.8748269725133478, "step": 8848, "train/sim_loss": 0.0625 }, { "epoch": 0.8748269725133478, "step": 8848, "train/total_loss": 0.1989222913980484 }, { "entropy": 8.39812183380127, "epoch": 0.8749258453628633, "mean_token_accuracy": 0.7722457647323608, "num_tokens": 25255046.0, "step": 8849, "train/ce_loss": 0.45712316036224365 }, { "epoch": 0.8749258453628633, "step": 8849, "train/sim_loss": 0.046875 }, { "epoch": 0.8749258453628633, "step": 8849, "train/total_loss": 0.09258732199668884 }, { "entropy": 8.450510025024414, "epoch": 0.8750247182123789, "mean_token_accuracy": 0.7152698040008545, "num_tokens": 25260336.0, "step": 8850, "train/ce_loss": 0.7078292369842529 }, { "epoch": 0.8750247182123789, "step": 8850, "train/sim_loss": 0.0234375 }, { "epoch": 0.8750247182123789, "step": 8850, "train/total_loss": 0.09422042220830917 }, { "entropy": 9.017822265625, "epoch": 0.8751235910618944, "mean_token_accuracy": 0.7021898031234741, "num_tokens": 25265448.0, "step": 8851, "train/ce_loss": 0.6941998600959778 }, { "epoch": 0.8751235910618944, "step": 8851, "train/sim_loss": 0.03125 }, { "epoch": 0.8751235910618944, "step": 8851, "train/total_loss": 0.1006699874997139 }, { "entropy": 9.01706314086914, "epoch": 0.8752224639114099, "mean_token_accuracy": 0.7002583742141724, "num_tokens": 25270700.0, "step": 8852, "train/ce_loss": 0.6723163723945618 }, { "epoch": 0.8752224639114099, "step": 8852, "train/sim_loss": 0.0546875 }, { "epoch": 0.8752224639114099, "step": 8852, "train/total_loss": 0.12191914021968842 }, { "entropy": 8.469293594360352, "epoch": 0.8753213367609255, "mean_token_accuracy": 0.7325714230537415, "num_tokens": 25276066.0, "step": 8853, "train/ce_loss": 0.9326720833778381 }, { "epoch": 0.8753213367609255, "step": 8853, "train/sim_loss": 0.04296875 }, { "epoch": 0.8753213367609255, "step": 8853, "train/total_loss": 0.13623595237731934 }, { "entropy": 9.226408004760742, "epoch": 0.875420209610441, "mean_token_accuracy": 0.7230769395828247, "num_tokens": 25281273.0, "step": 8854, "train/ce_loss": 1.058532953262329 }, { "epoch": 0.875420209610441, "step": 8854, "train/sim_loss": 0.07421875 }, { "epoch": 0.875420209610441, "step": 8854, "train/total_loss": 0.18007203936576843 }, { "entropy": 8.688737869262695, "epoch": 0.8755190824599565, "mean_token_accuracy": 0.7356194853782654, "num_tokens": 25286672.0, "step": 8855, "train/ce_loss": 0.6025984883308411 }, { "epoch": 0.8755190824599565, "step": 8855, "train/sim_loss": 0.03125 }, { "epoch": 0.8755190824599565, "step": 8855, "train/total_loss": 0.0915098488330841 }, { "entropy": 8.628371238708496, "epoch": 0.8756179553094721, "mean_token_accuracy": 0.7133243680000305, "num_tokens": 25291871.0, "step": 8856, "train/ce_loss": 0.9561830163002014 }, { "epoch": 0.8756179553094721, "step": 8856, "train/sim_loss": 0.05859375 }, { "epoch": 0.8756179553094721, "step": 8856, "train/total_loss": 0.15421205759048462 }, { "entropy": 8.08566665649414, "epoch": 0.8757168281589875, "mean_token_accuracy": 0.6803196668624878, "num_tokens": 25297367.0, "step": 8857, "train/ce_loss": 1.001320242881775 }, { "epoch": 0.8757168281589875, "step": 8857, "train/sim_loss": 0.0625 }, { "epoch": 0.8757168281589875, "step": 8857, "train/total_loss": 0.162632018327713 }, { "entropy": 9.131614685058594, "epoch": 0.875815701008503, "mean_token_accuracy": 0.7594202756881714, "num_tokens": 25302495.0, "step": 8858, "train/ce_loss": 0.6407871842384338 }, { "epoch": 0.875815701008503, "step": 8858, "train/sim_loss": 0.0234375 }, { "epoch": 0.875815701008503, "step": 8858, "train/total_loss": 0.08751621842384338 }, { "entropy": 8.983501434326172, "epoch": 0.8759145738580186, "mean_token_accuracy": 0.75, "num_tokens": 25307599.0, "step": 8859, "train/ce_loss": 1.0661547183990479 }, { "epoch": 0.8759145738580186, "step": 8859, "train/sim_loss": 0.05859375 }, { "epoch": 0.8759145738580186, "step": 8859, "train/total_loss": 0.16520923376083374 }, { "epoch": 0.8760134467075341, "grad_norm": 0.7033073306083679, "learning_rate": 7.812144587845522e-06, "loss": 0.1372, "step": 8860 }, { "entropy": 8.933948516845703, "epoch": 0.8760134467075341, "mean_token_accuracy": 0.7825342416763306, "num_tokens": 25312658.0, "step": 8860, "train/ce_loss": 0.9847171902656555 }, { "epoch": 0.8760134467075341, "step": 8860, "train/sim_loss": 0.0703125 }, { "epoch": 0.8760134467075341, "step": 8860, "train/total_loss": 0.1687842309474945 }, { "entropy": 8.694652557373047, "epoch": 0.8761123195570496, "mean_token_accuracy": 0.7148289084434509, "num_tokens": 25317887.0, "step": 8861, "train/ce_loss": 1.199857473373413 }, { "epoch": 0.8761123195570496, "step": 8861, "train/sim_loss": 0.05859375 }, { "epoch": 0.8761123195570496, "step": 8861, "train/total_loss": 0.17857950925827026 }, { "entropy": 8.699516296386719, "epoch": 0.8762111924065652, "mean_token_accuracy": 0.7503090500831604, "num_tokens": 25323166.0, "step": 8862, "train/ce_loss": 0.9494368433952332 }, { "epoch": 0.8762111924065652, "step": 8862, "train/sim_loss": 0.04296875 }, { "epoch": 0.8762111924065652, "step": 8862, "train/total_loss": 0.13791243731975555 }, { "entropy": 8.456817626953125, "epoch": 0.8763100652560807, "mean_token_accuracy": 0.8034188151359558, "num_tokens": 25328609.0, "step": 8863, "train/ce_loss": 0.7831841111183167 }, { "epoch": 0.8763100652560807, "step": 8863, "train/sim_loss": 0.0546875 }, { "epoch": 0.8763100652560807, "step": 8863, "train/total_loss": 0.13300591707229614 }, { "entropy": 8.716633796691895, "epoch": 0.8764089381055962, "mean_token_accuracy": 0.7696245908737183, "num_tokens": 25333649.0, "step": 8864, "train/ce_loss": 0.5097966194152832 }, { "epoch": 0.8764089381055962, "step": 8864, "train/sim_loss": 0.01953125 }, { "epoch": 0.8764089381055962, "step": 8864, "train/total_loss": 0.07051090896129608 }, { "entropy": 9.292539596557617, "epoch": 0.8765078109551118, "mean_token_accuracy": 0.7435897588729858, "num_tokens": 25338463.0, "step": 8865, "train/ce_loss": 1.7354304790496826 }, { "epoch": 0.8765078109551118, "step": 8865, "train/sim_loss": 0.1171875 }, { "epoch": 0.8765078109551118, "step": 8865, "train/total_loss": 0.2907305359840393 }, { "entropy": 8.276805877685547, "epoch": 0.8766066838046273, "mean_token_accuracy": 0.7647058963775635, "num_tokens": 25343993.0, "step": 8866, "train/ce_loss": 0.9382361769676208 }, { "epoch": 0.8766066838046273, "step": 8866, "train/sim_loss": 0.02734375 }, { "epoch": 0.8766066838046273, "step": 8866, "train/total_loss": 0.1211673691868782 }, { "entropy": 8.47601318359375, "epoch": 0.8767055566541427, "mean_token_accuracy": 0.7644970417022705, "num_tokens": 25349328.0, "step": 8867, "train/ce_loss": 1.3544975519180298 }, { "epoch": 0.8767055566541427, "step": 8867, "train/sim_loss": 0.0546875 }, { "epoch": 0.8767055566541427, "step": 8867, "train/total_loss": 0.19013725221157074 }, { "entropy": 8.886510848999023, "epoch": 0.8768044295036583, "mean_token_accuracy": 0.7770618796348572, "num_tokens": 25354554.0, "step": 8868, "train/ce_loss": 0.7130325436592102 }, { "epoch": 0.8768044295036583, "step": 8868, "train/sim_loss": 0.0859375 }, { "epoch": 0.8768044295036583, "step": 8868, "train/total_loss": 0.15724074840545654 }, { "entropy": 8.782245635986328, "epoch": 0.8769033023531738, "mean_token_accuracy": 0.7518247961997986, "num_tokens": 25359880.0, "step": 8869, "train/ce_loss": 0.6772040128707886 }, { "epoch": 0.8769033023531738, "step": 8869, "train/sim_loss": 0.0234375 }, { "epoch": 0.8769033023531738, "step": 8869, "train/total_loss": 0.09115790575742722 }, { "entropy": 9.141199111938477, "epoch": 0.8770021752026893, "mean_token_accuracy": 0.7357357144355774, "num_tokens": 25365008.0, "step": 8870, "train/ce_loss": 1.5943024158477783 }, { "epoch": 0.8770021752026893, "step": 8870, "train/sim_loss": 0.08984375 }, { "epoch": 0.8770021752026893, "step": 8870, "train/total_loss": 0.24927400052547455 }, { "entropy": 8.956819534301758, "epoch": 0.8771010480522049, "mean_token_accuracy": 0.752755880355835, "num_tokens": 25370085.0, "step": 8871, "train/ce_loss": 0.7061737775802612 }, { "epoch": 0.8771010480522049, "step": 8871, "train/sim_loss": 0.0234375 }, { "epoch": 0.8771010480522049, "step": 8871, "train/total_loss": 0.09405487775802612 }, { "entropy": 8.722127914428711, "epoch": 0.8771999209017204, "mean_token_accuracy": 0.7299363017082214, "num_tokens": 25375319.0, "step": 8872, "train/ce_loss": 0.9193618893623352 }, { "epoch": 0.8771999209017204, "step": 8872, "train/sim_loss": 0.046875 }, { "epoch": 0.8771999209017204, "step": 8872, "train/total_loss": 0.13881120085716248 }, { "entropy": 9.018770217895508, "epoch": 0.8772987937512359, "mean_token_accuracy": 0.7918623089790344, "num_tokens": 25380414.0, "step": 8873, "train/ce_loss": 0.9188916087150574 }, { "epoch": 0.8772987937512359, "step": 8873, "train/sim_loss": 0.046875 }, { "epoch": 0.8772987937512359, "step": 8873, "train/total_loss": 0.1387641727924347 }, { "entropy": 8.860586166381836, "epoch": 0.8773976666007515, "mean_token_accuracy": 0.73758864402771, "num_tokens": 25385740.0, "step": 8874, "train/ce_loss": 0.7601509690284729 }, { "epoch": 0.8773976666007515, "step": 8874, "train/sim_loss": 0.06640625 }, { "epoch": 0.8773976666007515, "step": 8874, "train/total_loss": 0.14242134988307953 }, { "entropy": 8.414628982543945, "epoch": 0.877496539450267, "mean_token_accuracy": 0.7476635575294495, "num_tokens": 25391182.0, "step": 8875, "train/ce_loss": 1.0848225355148315 }, { "epoch": 0.877496539450267, "step": 8875, "train/sim_loss": 0.02734375 }, { "epoch": 0.877496539450267, "step": 8875, "train/total_loss": 0.1358260065317154 }, { "entropy": 9.601459503173828, "epoch": 0.8775954122997824, "mean_token_accuracy": 0.7941888570785522, "num_tokens": 25396004.0, "step": 8876, "train/ce_loss": 0.9619978666305542 }, { "epoch": 0.8775954122997824, "step": 8876, "train/sim_loss": 0.01171875 }, { "epoch": 0.8775954122997824, "step": 8876, "train/total_loss": 0.10791853815317154 }, { "entropy": 8.563558578491211, "epoch": 0.877694285149298, "mean_token_accuracy": 0.740024209022522, "num_tokens": 25401315.0, "step": 8877, "train/ce_loss": 0.6377543807029724 }, { "epoch": 0.877694285149298, "step": 8877, "train/sim_loss": 0.015625 }, { "epoch": 0.877694285149298, "step": 8877, "train/total_loss": 0.0794004425406456 }, { "entropy": 8.761125564575195, "epoch": 0.8777931579988135, "mean_token_accuracy": 0.7519209384918213, "num_tokens": 25406665.0, "step": 8878, "train/ce_loss": 1.182747721672058 }, { "epoch": 0.8777931579988135, "step": 8878, "train/sim_loss": 0.0625 }, { "epoch": 0.8777931579988135, "step": 8878, "train/total_loss": 0.1807747781276703 }, { "entropy": 8.149176597595215, "epoch": 0.877892030848329, "mean_token_accuracy": 0.7310924530029297, "num_tokens": 25412100.0, "step": 8879, "train/ce_loss": 0.8335572481155396 }, { "epoch": 0.877892030848329, "step": 8879, "train/sim_loss": 0.04296875 }, { "epoch": 0.877892030848329, "step": 8879, "train/total_loss": 0.12632447481155396 }, { "epoch": 0.8779909036978446, "grad_norm": 0.6092635989189148, "learning_rate": 7.807199723087574e-06, "loss": 0.1257, "step": 8880 }, { "entropy": 8.752358436584473, "epoch": 0.8779909036978446, "mean_token_accuracy": 0.7354497313499451, "num_tokens": 25417280.0, "step": 8880, "train/ce_loss": 0.844520628452301 }, { "epoch": 0.8779909036978446, "step": 8880, "train/sim_loss": 0.046875 }, { "epoch": 0.8779909036978446, "step": 8880, "train/total_loss": 0.1313270628452301 }, { "entropy": 9.201775550842285, "epoch": 0.8780897765473601, "mean_token_accuracy": 0.7435158491134644, "num_tokens": 25422467.0, "step": 8881, "train/ce_loss": 5.139292511557869e-07 }, { "epoch": 0.8780897765473601, "step": 8881, "train/sim_loss": 0.0234375 }, { "epoch": 0.8780897765473601, "step": 8881, "train/total_loss": 0.02343755215406418 }, { "entropy": 9.13831901550293, "epoch": 0.8781886493968756, "mean_token_accuracy": 0.7526236772537231, "num_tokens": 25427702.0, "step": 8882, "train/ce_loss": 0.7612507343292236 }, { "epoch": 0.8781886493968756, "step": 8882, "train/sim_loss": 0.078125 }, { "epoch": 0.8781886493968756, "step": 8882, "train/total_loss": 0.15425008535385132 }, { "entropy": 8.566118240356445, "epoch": 0.8782875222463912, "mean_token_accuracy": 0.7414966225624084, "num_tokens": 25433054.0, "step": 8883, "train/ce_loss": 0.5385515093803406 }, { "epoch": 0.8782875222463912, "step": 8883, "train/sim_loss": 0.015625 }, { "epoch": 0.8782875222463912, "step": 8883, "train/total_loss": 0.06948015093803406 }, { "entropy": 9.248607635498047, "epoch": 0.8783863950959067, "mean_token_accuracy": 0.6926910281181335, "num_tokens": 25438066.0, "step": 8884, "train/ce_loss": 1.8416887521743774 }, { "epoch": 0.8783863950959067, "step": 8884, "train/sim_loss": 0.1875 }, { "epoch": 0.8783863950959067, "step": 8884, "train/total_loss": 0.37166887521743774 }, { "entropy": 8.906831741333008, "epoch": 0.8784852679454221, "mean_token_accuracy": 0.7623888254165649, "num_tokens": 25443328.0, "step": 8885, "train/ce_loss": 0.5900859832763672 }, { "epoch": 0.8784852679454221, "step": 8885, "train/sim_loss": 0.05078125 }, { "epoch": 0.8784852679454221, "step": 8885, "train/total_loss": 0.10978984832763672 }, { "entropy": 8.640851020812988, "epoch": 0.8785841407949377, "mean_token_accuracy": 0.696351945400238, "num_tokens": 25448728.0, "step": 8886, "train/ce_loss": 0.9450283050537109 }, { "epoch": 0.8785841407949377, "step": 8886, "train/sim_loss": 0.1171875 }, { "epoch": 0.8785841407949377, "step": 8886, "train/total_loss": 0.21169033646583557 }, { "entropy": 8.625853538513184, "epoch": 0.8786830136444532, "mean_token_accuracy": 0.7610993385314941, "num_tokens": 25454337.0, "step": 8887, "train/ce_loss": 0.9452787637710571 }, { "epoch": 0.8786830136444532, "step": 8887, "train/sim_loss": 0.04296875 }, { "epoch": 0.8786830136444532, "step": 8887, "train/total_loss": 0.13749662041664124 }, { "entropy": 8.575627326965332, "epoch": 0.8787818864939687, "mean_token_accuracy": 0.7576141953468323, "num_tokens": 25459571.0, "step": 8888, "train/ce_loss": 0.5424915552139282 }, { "epoch": 0.8787818864939687, "step": 8888, "train/sim_loss": 0.0234375 }, { "epoch": 0.8787818864939687, "step": 8888, "train/total_loss": 0.07768665254116058 }, { "entropy": 8.32967758178711, "epoch": 0.8788807593434843, "mean_token_accuracy": 0.7444238066673279, "num_tokens": 25465173.0, "step": 8889, "train/ce_loss": 0.8996832966804504 }, { "epoch": 0.8788807593434843, "step": 8889, "train/sim_loss": 0.08984375 }, { "epoch": 0.8788807593434843, "step": 8889, "train/total_loss": 0.17981207370758057 }, { "entropy": 8.881383895874023, "epoch": 0.8789796321929998, "mean_token_accuracy": 0.7243995070457458, "num_tokens": 25470418.0, "step": 8890, "train/ce_loss": 1.5166651010513306 }, { "epoch": 0.8789796321929998, "step": 8890, "train/sim_loss": 0.12890625 }, { "epoch": 0.8789796321929998, "step": 8890, "train/total_loss": 0.280572772026062 }, { "entropy": 8.884958267211914, "epoch": 0.8790785050425153, "mean_token_accuracy": 0.7473524808883667, "num_tokens": 25475540.0, "step": 8891, "train/ce_loss": 0.95103520154953 }, { "epoch": 0.8790785050425153, "step": 8891, "train/sim_loss": 0.046875 }, { "epoch": 0.8790785050425153, "step": 8891, "train/total_loss": 0.14197853207588196 }, { "entropy": 9.408562660217285, "epoch": 0.8791773778920309, "mean_token_accuracy": 0.8113207817077637, "num_tokens": 25480414.0, "step": 8892, "train/ce_loss": 2.3370089365926106e-06 }, { "epoch": 0.8791773778920309, "step": 8892, "train/sim_loss": 0.02734375 }, { "epoch": 0.8791773778920309, "step": 8892, "train/total_loss": 0.027343982830643654 }, { "entropy": 8.868795394897461, "epoch": 0.8792762507415464, "mean_token_accuracy": 0.7309237122535706, "num_tokens": 25485656.0, "step": 8893, "train/ce_loss": 1.4700690507888794 }, { "epoch": 0.8792762507415464, "step": 8893, "train/sim_loss": 0.0703125 }, { "epoch": 0.8792762507415464, "step": 8893, "train/total_loss": 0.21731941401958466 }, { "entropy": 8.755334854125977, "epoch": 0.8793751235910618, "mean_token_accuracy": 0.7582278251647949, "num_tokens": 25490928.0, "step": 8894, "train/ce_loss": 1.7055973557944526e-06 }, { "epoch": 0.8793751235910618, "step": 8894, "train/sim_loss": 0.03125 }, { "epoch": 0.8793751235910618, "step": 8894, "train/total_loss": 0.03125017136335373 }, { "entropy": 8.831209182739258, "epoch": 0.8794739964405774, "mean_token_accuracy": 0.7928921580314636, "num_tokens": 25496223.0, "step": 8895, "train/ce_loss": 1.1295431852340698 }, { "epoch": 0.8794739964405774, "step": 8895, "train/sim_loss": 0.015625 }, { "epoch": 0.8794739964405774, "step": 8895, "train/total_loss": 0.12857931852340698 }, { "entropy": 8.743522644042969, "epoch": 0.8795728692900929, "mean_token_accuracy": 0.7045161128044128, "num_tokens": 25501482.0, "step": 8896, "train/ce_loss": 0.559874951839447 }, { "epoch": 0.8795728692900929, "step": 8896, "train/sim_loss": 0.08203125 }, { "epoch": 0.8795728692900929, "step": 8896, "train/total_loss": 0.13801874220371246 }, { "entropy": 8.64323616027832, "epoch": 0.8796717421396084, "mean_token_accuracy": 0.7845528721809387, "num_tokens": 25506910.0, "step": 8897, "train/ce_loss": 0.48229286074638367 }, { "epoch": 0.8796717421396084, "step": 8897, "train/sim_loss": 0.0390625 }, { "epoch": 0.8796717421396084, "step": 8897, "train/total_loss": 0.08729179203510284 }, { "entropy": 8.610434532165527, "epoch": 0.879770614989124, "mean_token_accuracy": 0.7132784724235535, "num_tokens": 25512183.0, "step": 8898, "train/ce_loss": 0.48742854595184326 }, { "epoch": 0.879770614989124, "step": 8898, "train/sim_loss": 0.015625 }, { "epoch": 0.879770614989124, "step": 8898, "train/total_loss": 0.0643678605556488 }, { "entropy": 8.946176528930664, "epoch": 0.8798694878386395, "mean_token_accuracy": 0.6847222447395325, "num_tokens": 25517383.0, "step": 8899, "train/ce_loss": 0.8376083374023438 }, { "epoch": 0.8798694878386395, "step": 8899, "train/sim_loss": 0.0546875 }, { "epoch": 0.8798694878386395, "step": 8899, "train/total_loss": 0.1384483277797699 }, { "epoch": 0.879968360688155, "grad_norm": 0.725326657295227, "learning_rate": 7.802254858329625e-06, "loss": 0.1384, "step": 8900 }, { "entropy": 8.435836791992188, "epoch": 0.879968360688155, "mean_token_accuracy": 0.7551867365837097, "num_tokens": 25522877.0, "step": 8900, "train/ce_loss": 2.0146045684814453 }, { "epoch": 0.879968360688155, "step": 8900, "train/sim_loss": 0.05859375 }, { "epoch": 0.879968360688155, "step": 8900, "train/total_loss": 0.26005423069000244 }, { "entropy": 8.750149726867676, "epoch": 0.8800672335376706, "mean_token_accuracy": 0.7774968147277832, "num_tokens": 25528141.0, "step": 8901, "train/ce_loss": 0.4915764033794403 }, { "epoch": 0.8800672335376706, "step": 8901, "train/sim_loss": 0.03125 }, { "epoch": 0.8800672335376706, "step": 8901, "train/total_loss": 0.08040764182806015 }, { "entropy": 8.452852249145508, "epoch": 0.8801661063871861, "mean_token_accuracy": 0.755959153175354, "num_tokens": 25533531.0, "step": 8902, "train/ce_loss": 0.8060742020606995 }, { "epoch": 0.8801661063871861, "step": 8902, "train/sim_loss": 0.1328125 }, { "epoch": 0.8801661063871861, "step": 8902, "train/total_loss": 0.21341991424560547 }, { "entropy": 8.768773078918457, "epoch": 0.8802649792367016, "mean_token_accuracy": 0.7303370833396912, "num_tokens": 25538688.0, "step": 8903, "train/ce_loss": 0.4603393077850342 }, { "epoch": 0.8802649792367016, "step": 8903, "train/sim_loss": 0.0703125 }, { "epoch": 0.8802649792367016, "step": 8903, "train/total_loss": 0.11634643375873566 }, { "entropy": 8.699012756347656, "epoch": 0.8803638520862171, "mean_token_accuracy": 0.7281795740127563, "num_tokens": 25543955.0, "step": 8904, "train/ce_loss": 1.3716063499450684 }, { "epoch": 0.8803638520862171, "step": 8904, "train/sim_loss": 0.07421875 }, { "epoch": 0.8803638520862171, "step": 8904, "train/total_loss": 0.21137939393520355 }, { "entropy": 8.342909812927246, "epoch": 0.8804627249357326, "mean_token_accuracy": 0.7587548494338989, "num_tokens": 25549428.0, "step": 8905, "train/ce_loss": 1.0330997705459595 }, { "epoch": 0.8804627249357326, "step": 8905, "train/sim_loss": 0.0390625 }, { "epoch": 0.8804627249357326, "step": 8905, "train/total_loss": 0.1423724889755249 }, { "entropy": 8.701141357421875, "epoch": 0.8805615977852481, "mean_token_accuracy": 0.7564895153045654, "num_tokens": 25554642.0, "step": 8906, "train/ce_loss": 0.7028892040252686 }, { "epoch": 0.8805615977852481, "step": 8906, "train/sim_loss": 0.0625 }, { "epoch": 0.8805615977852481, "step": 8906, "train/total_loss": 0.13278892636299133 }, { "entropy": 8.76907730102539, "epoch": 0.8806604706347637, "mean_token_accuracy": 0.7080394625663757, "num_tokens": 25559834.0, "step": 8907, "train/ce_loss": 0.895540177822113 }, { "epoch": 0.8806604706347637, "step": 8907, "train/sim_loss": 0.046875 }, { "epoch": 0.8806604706347637, "step": 8907, "train/total_loss": 0.13642901182174683 }, { "entropy": 8.812956809997559, "epoch": 0.8807593434842792, "mean_token_accuracy": 0.729411780834198, "num_tokens": 25565162.0, "step": 8908, "train/ce_loss": 0.562024712562561 }, { "epoch": 0.8807593434842792, "step": 8908, "train/sim_loss": 0.04296875 }, { "epoch": 0.8807593434842792, "step": 8908, "train/total_loss": 0.0991712212562561 }, { "entropy": 8.56463623046875, "epoch": 0.8808582163337947, "mean_token_accuracy": 0.7385892271995544, "num_tokens": 25570584.0, "step": 8909, "train/ce_loss": 0.5374073386192322 }, { "epoch": 0.8808582163337947, "step": 8909, "train/sim_loss": 0.05078125 }, { "epoch": 0.8808582163337947, "step": 8909, "train/total_loss": 0.1045219898223877 }, { "entropy": 8.398541450500488, "epoch": 0.8809570891833103, "mean_token_accuracy": 0.6953405141830444, "num_tokens": 25575893.0, "step": 8910, "train/ce_loss": 0.5384745597839355 }, { "epoch": 0.8809570891833103, "step": 8910, "train/sim_loss": 0.0234375 }, { "epoch": 0.8809570891833103, "step": 8910, "train/total_loss": 0.07728496193885803 }, { "entropy": 8.852869033813477, "epoch": 0.8810559620328258, "mean_token_accuracy": 0.7487179636955261, "num_tokens": 25581057.0, "step": 8911, "train/ce_loss": 0.5818669199943542 }, { "epoch": 0.8810559620328258, "step": 8911, "train/sim_loss": 0.046875 }, { "epoch": 0.8810559620328258, "step": 8911, "train/total_loss": 0.10506169497966766 }, { "entropy": 8.566947937011719, "epoch": 0.8811548348823414, "mean_token_accuracy": 0.8086283206939697, "num_tokens": 25586451.0, "step": 8912, "train/ce_loss": 0.47238612174987793 }, { "epoch": 0.8811548348823414, "step": 8912, "train/sim_loss": 0.01953125 }, { "epoch": 0.8811548348823414, "step": 8912, "train/total_loss": 0.06676986813545227 }, { "entropy": 8.759763717651367, "epoch": 0.8812537077318569, "mean_token_accuracy": 0.7574626803398132, "num_tokens": 25591708.0, "step": 8913, "train/ce_loss": 0.5444103479385376 }, { "epoch": 0.8812537077318569, "step": 8913, "train/sim_loss": 0.05859375 }, { "epoch": 0.8812537077318569, "step": 8913, "train/total_loss": 0.11303478479385376 }, { "entropy": 8.827457427978516, "epoch": 0.8813525805813723, "mean_token_accuracy": 0.7457886934280396, "num_tokens": 25596775.0, "step": 8914, "train/ce_loss": 0.7621586918830872 }, { "epoch": 0.8813525805813723, "step": 8914, "train/sim_loss": 0.0625 }, { "epoch": 0.8813525805813723, "step": 8914, "train/total_loss": 0.13871586322784424 }, { "entropy": 8.827047348022461, "epoch": 0.8814514534308879, "mean_token_accuracy": 0.7344877123832703, "num_tokens": 25601953.0, "step": 8915, "train/ce_loss": 2.1599555015563965 }, { "epoch": 0.8814514534308879, "step": 8915, "train/sim_loss": 0.06640625 }, { "epoch": 0.8814514534308879, "step": 8915, "train/total_loss": 0.28240180015563965 }, { "entropy": 8.259698867797852, "epoch": 0.8815503262804034, "mean_token_accuracy": 0.7744755148887634, "num_tokens": 25607565.0, "step": 8916, "train/ce_loss": 0.24848879873752594 }, { "epoch": 0.8815503262804034, "step": 8916, "train/sim_loss": 0.03125 }, { "epoch": 0.8815503262804034, "step": 8916, "train/total_loss": 0.056098878383636475 }, { "entropy": 9.535283088684082, "epoch": 0.8816491991299189, "mean_token_accuracy": 0.821670413017273, "num_tokens": 25612404.0, "step": 8917, "train/ce_loss": 3.0784343607592746e-07 }, { "epoch": 0.8816491991299189, "step": 8917, "train/sim_loss": 0.015625 }, { "epoch": 0.8816491991299189, "step": 8917, "train/total_loss": 0.015625031664967537 }, { "entropy": 8.790726661682129, "epoch": 0.8817480719794345, "mean_token_accuracy": 0.707563042640686, "num_tokens": 25617497.0, "step": 8918, "train/ce_loss": 2.574655354692368e-06 }, { "epoch": 0.8817480719794345, "step": 8918, "train/sim_loss": 0.0625 }, { "epoch": 0.8817480719794345, "step": 8918, "train/total_loss": 0.06250026077032089 }, { "entropy": 9.147619247436523, "epoch": 0.88184694482895, "mean_token_accuracy": 0.7005163431167603, "num_tokens": 25622540.0, "step": 8919, "train/ce_loss": 8.420393555752526e-07 }, { "epoch": 0.88184694482895, "step": 8919, "train/sim_loss": 0.0390625 }, { "epoch": 0.88184694482895, "step": 8919, "train/total_loss": 0.039062585681676865 }, { "epoch": 0.8819458176784655, "grad_norm": 0.7121282815933228, "learning_rate": 7.797309993571677e-06, "loss": 0.133, "step": 8920 }, { "entropy": 8.917064666748047, "epoch": 0.8819458176784655, "mean_token_accuracy": 0.7324262857437134, "num_tokens": 25627895.0, "step": 8920, "train/ce_loss": 0.6380739808082581 }, { "epoch": 0.8819458176784655, "step": 8920, "train/sim_loss": 0.03515625 }, { "epoch": 0.8819458176784655, "step": 8920, "train/total_loss": 0.0989636480808258 }, { "entropy": 9.06655502319336, "epoch": 0.8820446905279811, "mean_token_accuracy": 0.7949080467224121, "num_tokens": 25633036.0, "step": 8921, "train/ce_loss": 0.5876515507698059 }, { "epoch": 0.8820446905279811, "step": 8921, "train/sim_loss": 0.0546875 }, { "epoch": 0.8820446905279811, "step": 8921, "train/total_loss": 0.11345265805721283 }, { "entropy": 8.67611026763916, "epoch": 0.8821435633774966, "mean_token_accuracy": 0.7111111283302307, "num_tokens": 25638258.0, "step": 8922, "train/ce_loss": 0.8048073053359985 }, { "epoch": 0.8821435633774966, "step": 8922, "train/sim_loss": 0.03515625 }, { "epoch": 0.8821435633774966, "step": 8922, "train/total_loss": 0.11563698202371597 }, { "entropy": 9.230369567871094, "epoch": 0.882242436227012, "mean_token_accuracy": 0.7452692985534668, "num_tokens": 25643372.0, "step": 8923, "train/ce_loss": 0.4094837009906769 }, { "epoch": 0.882242436227012, "step": 8923, "train/sim_loss": 0.05859375 }, { "epoch": 0.882242436227012, "step": 8923, "train/total_loss": 0.09954212605953217 }, { "entropy": 8.365276336669922, "epoch": 0.8823413090765276, "mean_token_accuracy": 0.7350254058837891, "num_tokens": 25648815.0, "step": 8924, "train/ce_loss": 0.9520606994628906 }, { "epoch": 0.8823413090765276, "step": 8924, "train/sim_loss": 0.0859375 }, { "epoch": 0.8823413090765276, "step": 8924, "train/total_loss": 0.18114358186721802 }, { "entropy": 8.996768951416016, "epoch": 0.8824401819260431, "mean_token_accuracy": 0.7585185170173645, "num_tokens": 25653965.0, "step": 8925, "train/ce_loss": 0.7729405164718628 }, { "epoch": 0.8824401819260431, "step": 8925, "train/sim_loss": 0.046875 }, { "epoch": 0.8824401819260431, "step": 8925, "train/total_loss": 0.12416905164718628 }, { "entropy": 8.97836971282959, "epoch": 0.8825390547755586, "mean_token_accuracy": 0.7607843279838562, "num_tokens": 25659203.0, "step": 8926, "train/ce_loss": 0.8300597071647644 }, { "epoch": 0.8825390547755586, "step": 8926, "train/sim_loss": 0.078125 }, { "epoch": 0.8825390547755586, "step": 8926, "train/total_loss": 0.16113096475601196 }, { "entropy": 8.835588455200195, "epoch": 0.8826379276250742, "mean_token_accuracy": 0.7110862135887146, "num_tokens": 25664553.0, "step": 8927, "train/ce_loss": 0.7836290597915649 }, { "epoch": 0.8826379276250742, "step": 8927, "train/sim_loss": 0.0234375 }, { "epoch": 0.8826379276250742, "step": 8927, "train/total_loss": 0.10180040448904037 }, { "entropy": 8.801051139831543, "epoch": 0.8827368004745897, "mean_token_accuracy": 0.746666669845581, "num_tokens": 25669887.0, "step": 8928, "train/ce_loss": 0.8353655934333801 }, { "epoch": 0.8827368004745897, "step": 8928, "train/sim_loss": 0.0390625 }, { "epoch": 0.8827368004745897, "step": 8928, "train/total_loss": 0.1225990578532219 }, { "entropy": 8.794574737548828, "epoch": 0.8828356733241052, "mean_token_accuracy": 0.7582292556762695, "num_tokens": 25675246.0, "step": 8929, "train/ce_loss": 0.35076427459716797 }, { "epoch": 0.8828356733241052, "step": 8929, "train/sim_loss": 0.015625 }, { "epoch": 0.8828356733241052, "step": 8929, "train/total_loss": 0.05070142820477486 }, { "entropy": 9.182476997375488, "epoch": 0.8829345461736208, "mean_token_accuracy": 0.7186991572380066, "num_tokens": 25680278.0, "step": 8930, "train/ce_loss": 9.869708037513192e-07 }, { "epoch": 0.8829345461736208, "step": 8930, "train/sim_loss": 0.03125 }, { "epoch": 0.8829345461736208, "step": 8930, "train/total_loss": 0.03125009685754776 }, { "entropy": 9.165849685668945, "epoch": 0.8830334190231363, "mean_token_accuracy": 0.7638669013977051, "num_tokens": 25685336.0, "step": 8931, "train/ce_loss": 1.3718249797821045 }, { "epoch": 0.8830334190231363, "step": 8931, "train/sim_loss": 0.0546875 }, { "epoch": 0.8830334190231363, "step": 8931, "train/total_loss": 0.19187000393867493 }, { "entropy": 9.511493682861328, "epoch": 0.8831322918726517, "mean_token_accuracy": 0.7290748953819275, "num_tokens": 25690213.0, "step": 8932, "train/ce_loss": 2.57829879046767e-06 }, { "epoch": 0.8831322918726517, "step": 8932, "train/sim_loss": 0.0546875 }, { "epoch": 0.8831322918726517, "step": 8932, "train/total_loss": 0.054687757045030594 }, { "entropy": 9.19780445098877, "epoch": 0.8832311647221673, "mean_token_accuracy": 0.8355704545974731, "num_tokens": 25695253.0, "step": 8933, "train/ce_loss": 0.6308323740959167 }, { "epoch": 0.8832311647221673, "step": 8933, "train/sim_loss": 0.0234375 }, { "epoch": 0.8832311647221673, "step": 8933, "train/total_loss": 0.0865207388997078 }, { "entropy": 8.815532684326172, "epoch": 0.8833300375716828, "mean_token_accuracy": 0.7622842192649841, "num_tokens": 25700488.0, "step": 8934, "train/ce_loss": 0.6281819343566895 }, { "epoch": 0.8833300375716828, "step": 8934, "train/sim_loss": 0.03125 }, { "epoch": 0.8833300375716828, "step": 8934, "train/total_loss": 0.09406819194555283 }, { "entropy": 8.927444458007812, "epoch": 0.8834289104211983, "mean_token_accuracy": 0.7278250455856323, "num_tokens": 25705765.0, "step": 8935, "train/ce_loss": 0.6441558599472046 }, { "epoch": 0.8834289104211983, "step": 8935, "train/sim_loss": 0.05078125 }, { "epoch": 0.8834289104211983, "step": 8935, "train/total_loss": 0.1151968389749527 }, { "entropy": 8.61522102355957, "epoch": 0.8835277832707139, "mean_token_accuracy": 0.7848244905471802, "num_tokens": 25711116.0, "step": 8936, "train/ce_loss": 1.077426552772522 }, { "epoch": 0.8835277832707139, "step": 8936, "train/sim_loss": 0.04296875 }, { "epoch": 0.8835277832707139, "step": 8936, "train/total_loss": 0.15071141719818115 }, { "entropy": 9.279502868652344, "epoch": 0.8836266561202294, "mean_token_accuracy": 0.6969178318977356, "num_tokens": 25716089.0, "step": 8937, "train/ce_loss": 1.4144339561462402 }, { "epoch": 0.8836266561202294, "step": 8937, "train/sim_loss": 0.05078125 }, { "epoch": 0.8836266561202294, "step": 8937, "train/total_loss": 0.1922246515750885 }, { "entropy": 8.89355754852295, "epoch": 0.8837255289697449, "mean_token_accuracy": 0.7366504669189453, "num_tokens": 25721422.0, "step": 8938, "train/ce_loss": 0.6931769847869873 }, { "epoch": 0.8837255289697449, "step": 8938, "train/sim_loss": 0.0234375 }, { "epoch": 0.8837255289697449, "step": 8938, "train/total_loss": 0.09275519847869873 }, { "entropy": 8.457046508789062, "epoch": 0.8838244018192605, "mean_token_accuracy": 0.7530864477157593, "num_tokens": 25726869.0, "step": 8939, "train/ce_loss": 0.583381712436676 }, { "epoch": 0.8838244018192605, "step": 8939, "train/sim_loss": 0.01953125 }, { "epoch": 0.8838244018192605, "step": 8939, "train/total_loss": 0.07786942273378372 }, { "epoch": 0.883923274668776, "grad_norm": 0.5730092525482178, "learning_rate": 7.792365128813727e-06, "loss": 0.1301, "step": 8940 }, { "entropy": 9.440145492553711, "epoch": 0.883923274668776, "mean_token_accuracy": 0.6542239785194397, "num_tokens": 25731837.0, "step": 8940, "train/ce_loss": 3.5538878440856934 }, { "epoch": 0.883923274668776, "step": 8940, "train/sim_loss": 0.046875 }, { "epoch": 0.883923274668776, "step": 8940, "train/total_loss": 0.4022637903690338 }, { "entropy": 8.689183235168457, "epoch": 0.8840221475182914, "mean_token_accuracy": 0.7245322465896606, "num_tokens": 25737243.0, "step": 8941, "train/ce_loss": 1.8543144464492798 }, { "epoch": 0.8840221475182914, "step": 8941, "train/sim_loss": 0.0703125 }, { "epoch": 0.8840221475182914, "step": 8941, "train/total_loss": 0.25574395060539246 }, { "entropy": 8.877262115478516, "epoch": 0.884121020367807, "mean_token_accuracy": 0.7209026217460632, "num_tokens": 25742519.0, "step": 8942, "train/ce_loss": 2.0073533058166504 }, { "epoch": 0.884121020367807, "step": 8942, "train/sim_loss": 0.07421875 }, { "epoch": 0.884121020367807, "step": 8942, "train/total_loss": 0.27495408058166504 }, { "entropy": 8.815057754516602, "epoch": 0.8842198932173225, "mean_token_accuracy": 0.7562254071235657, "num_tokens": 25747716.0, "step": 8943, "train/ce_loss": 0.8791753649711609 }, { "epoch": 0.8842198932173225, "step": 8943, "train/sim_loss": 0.0625 }, { "epoch": 0.8842198932173225, "step": 8943, "train/total_loss": 0.1504175364971161 }, { "entropy": 8.566951751708984, "epoch": 0.884318766066838, "mean_token_accuracy": 0.689734697341919, "num_tokens": 25753045.0, "step": 8944, "train/ce_loss": 1.353271245956421 }, { "epoch": 0.884318766066838, "step": 8944, "train/sim_loss": 0.109375 }, { "epoch": 0.884318766066838, "step": 8944, "train/total_loss": 0.24470213055610657 }, { "entropy": 8.79873275756836, "epoch": 0.8844176389163536, "mean_token_accuracy": 0.7470167279243469, "num_tokens": 25758511.0, "step": 8945, "train/ce_loss": 0.7518976330757141 }, { "epoch": 0.8844176389163536, "step": 8945, "train/sim_loss": 0.05859375 }, { "epoch": 0.8844176389163536, "step": 8945, "train/total_loss": 0.1337835192680359 }, { "entropy": 9.191802024841309, "epoch": 0.8845165117658691, "mean_token_accuracy": 0.7674825191497803, "num_tokens": 25763466.0, "step": 8946, "train/ce_loss": 0.6582119464874268 }, { "epoch": 0.8845165117658691, "step": 8946, "train/sim_loss": 0.0703125 }, { "epoch": 0.8845165117658691, "step": 8946, "train/total_loss": 0.13613370060920715 }, { "entropy": 8.626323699951172, "epoch": 0.8846153846153846, "mean_token_accuracy": 0.7957276105880737, "num_tokens": 25768667.0, "step": 8947, "train/ce_loss": 0.5319151878356934 }, { "epoch": 0.8846153846153846, "step": 8947, "train/sim_loss": 0.01953125 }, { "epoch": 0.8846153846153846, "step": 8947, "train/total_loss": 0.07272277027368546 }, { "entropy": 8.56462287902832, "epoch": 0.8847142574649002, "mean_token_accuracy": 0.7169590592384338, "num_tokens": 25774038.0, "step": 8948, "train/ce_loss": 1.3787561655044556 }, { "epoch": 0.8847142574649002, "step": 8948, "train/sim_loss": 0.09375 }, { "epoch": 0.8847142574649002, "step": 8948, "train/total_loss": 0.23162561655044556 }, { "entropy": 8.933222770690918, "epoch": 0.8848131303144157, "mean_token_accuracy": 0.7023121118545532, "num_tokens": 25779246.0, "step": 8949, "train/ce_loss": 1.3690531253814697 }, { "epoch": 0.8848131303144157, "step": 8949, "train/sim_loss": 0.0546875 }, { "epoch": 0.8848131303144157, "step": 8949, "train/total_loss": 0.19159281253814697 }, { "entropy": 9.15608024597168, "epoch": 0.8849120031639311, "mean_token_accuracy": 0.7441016435623169, "num_tokens": 25784243.0, "step": 8950, "train/ce_loss": 1.1210671663284302 }, { "epoch": 0.8849120031639311, "step": 8950, "train/sim_loss": 0.06640625 }, { "epoch": 0.8849120031639311, "step": 8950, "train/total_loss": 0.17851296067237854 }, { "entropy": 8.732675552368164, "epoch": 0.8850108760134467, "mean_token_accuracy": 0.6734475493431091, "num_tokens": 25789648.0, "step": 8951, "train/ce_loss": 2.0168395042419434 }, { "epoch": 0.8850108760134467, "step": 8951, "train/sim_loss": 0.03125 }, { "epoch": 0.8850108760134467, "step": 8951, "train/total_loss": 0.23293395340442657 }, { "entropy": 8.678586959838867, "epoch": 0.8851097488629622, "mean_token_accuracy": 0.795134425163269, "num_tokens": 25794883.0, "step": 8952, "train/ce_loss": 0.3202112317085266 }, { "epoch": 0.8851097488629622, "step": 8952, "train/sim_loss": 0.02734375 }, { "epoch": 0.8851097488629622, "step": 8952, "train/total_loss": 0.05936487391591072 }, { "entropy": 8.49631118774414, "epoch": 0.8852086217124777, "mean_token_accuracy": 0.8096304535865784, "num_tokens": 25800285.0, "step": 8953, "train/ce_loss": 0.6475458145141602 }, { "epoch": 0.8852086217124777, "step": 8953, "train/sim_loss": 0.03515625 }, { "epoch": 0.8852086217124777, "step": 8953, "train/total_loss": 0.09991083294153214 }, { "entropy": 8.76569938659668, "epoch": 0.8853074945619933, "mean_token_accuracy": 0.7706855535507202, "num_tokens": 25805590.0, "step": 8954, "train/ce_loss": 1.0109540224075317 }, { "epoch": 0.8853074945619933, "step": 8954, "train/sim_loss": 0.0859375 }, { "epoch": 0.8853074945619933, "step": 8954, "train/total_loss": 0.18703290820121765 }, { "entropy": 9.034889221191406, "epoch": 0.8854063674115088, "mean_token_accuracy": 0.6842105388641357, "num_tokens": 25810630.0, "step": 8955, "train/ce_loss": 1.3987741470336914 }, { "epoch": 0.8854063674115088, "step": 8955, "train/sim_loss": 0.0859375 }, { "epoch": 0.8854063674115088, "step": 8955, "train/total_loss": 0.22581492364406586 }, { "entropy": 8.44428825378418, "epoch": 0.8855052402610243, "mean_token_accuracy": 0.7245631814002991, "num_tokens": 25816120.0, "step": 8956, "train/ce_loss": 0.7968977093696594 }, { "epoch": 0.8855052402610243, "step": 8956, "train/sim_loss": 0.01171875 }, { "epoch": 0.8855052402610243, "step": 8956, "train/total_loss": 0.09140852093696594 }, { "entropy": 8.451257705688477, "epoch": 0.8856041131105399, "mean_token_accuracy": 0.7467532753944397, "num_tokens": 25821561.0, "step": 8957, "train/ce_loss": 0.9519866108894348 }, { "epoch": 0.8856041131105399, "step": 8957, "train/sim_loss": 0.02734375 }, { "epoch": 0.8856041131105399, "step": 8957, "train/total_loss": 0.12254241108894348 }, { "entropy": 9.339811325073242, "epoch": 0.8857029859600554, "mean_token_accuracy": 0.7164750695228577, "num_tokens": 25826653.0, "step": 8958, "train/ce_loss": 1.1054483652114868 }, { "epoch": 0.8857029859600554, "step": 8958, "train/sim_loss": 0.07421875 }, { "epoch": 0.8857029859600554, "step": 8958, "train/total_loss": 0.1847635805606842 }, { "entropy": 9.519251823425293, "epoch": 0.8858018588095709, "mean_token_accuracy": 0.6827957034111023, "num_tokens": 25831625.0, "step": 8959, "train/ce_loss": 0.9935401082038879 }, { "epoch": 0.8858018588095709, "step": 8959, "train/sim_loss": 0.07421875 }, { "epoch": 0.8858018588095709, "step": 8959, "train/total_loss": 0.17357276380062103 }, { "epoch": 0.8859007316590864, "grad_norm": 0.7268169522285461, "learning_rate": 7.787420264055778e-06, "loss": 0.1342, "step": 8960 }, { "entropy": 9.06598949432373, "epoch": 0.8859007316590864, "mean_token_accuracy": 0.697609007358551, "num_tokens": 25836814.0, "step": 8960, "train/ce_loss": 0.7908419370651245 }, { "epoch": 0.8859007316590864, "step": 8960, "train/sim_loss": 0.03515625 }, { "epoch": 0.8859007316590864, "step": 8960, "train/total_loss": 0.11424044519662857 }, { "entropy": 8.391468048095703, "epoch": 0.8859996045086019, "mean_token_accuracy": 0.7623947858810425, "num_tokens": 25842363.0, "step": 8961, "train/ce_loss": 0.9892774820327759 }, { "epoch": 0.8859996045086019, "step": 8961, "train/sim_loss": 0.0859375 }, { "epoch": 0.8859996045086019, "step": 8961, "train/total_loss": 0.18486525118350983 }, { "entropy": 9.127792358398438, "epoch": 0.8860984773581174, "mean_token_accuracy": 0.7111111283302307, "num_tokens": 25847520.0, "step": 8962, "train/ce_loss": 0.6025692224502563 }, { "epoch": 0.8860984773581174, "step": 8962, "train/sim_loss": 0.04296875 }, { "epoch": 0.8860984773581174, "step": 8962, "train/total_loss": 0.10322567820549011 }, { "entropy": 9.31008529663086, "epoch": 0.886197350207633, "mean_token_accuracy": 0.7643678188323975, "num_tokens": 25852467.0, "step": 8963, "train/ce_loss": 1.804742693901062 }, { "epoch": 0.886197350207633, "step": 8963, "train/sim_loss": 0.06640625 }, { "epoch": 0.886197350207633, "step": 8963, "train/total_loss": 0.24688051640987396 }, { "entropy": 8.78582763671875, "epoch": 0.8862962230571485, "mean_token_accuracy": 0.7967032790184021, "num_tokens": 25857661.0, "step": 8964, "train/ce_loss": 0.6282260417938232 }, { "epoch": 0.8862962230571485, "step": 8964, "train/sim_loss": 0.0234375 }, { "epoch": 0.8862962230571485, "step": 8964, "train/total_loss": 0.0862601026892662 }, { "entropy": 9.728044509887695, "epoch": 0.886395095906664, "mean_token_accuracy": 0.7571884989738464, "num_tokens": 25862386.0, "step": 8965, "train/ce_loss": 4.494114875797095e-07 }, { "epoch": 0.886395095906664, "step": 8965, "train/sim_loss": 0.0390625 }, { "epoch": 0.886395095906664, "step": 8965, "train/total_loss": 0.03906254470348358 }, { "entropy": 9.573162078857422, "epoch": 0.8864939687561796, "mean_token_accuracy": 0.7716186046600342, "num_tokens": 25867218.0, "step": 8966, "train/ce_loss": 6.132223688837257e-07 }, { "epoch": 0.8864939687561796, "step": 8966, "train/sim_loss": 0.03515625 }, { "epoch": 0.8864939687561796, "step": 8966, "train/total_loss": 0.035156309604644775 }, { "entropy": 8.320290565490723, "epoch": 0.8865928416056951, "mean_token_accuracy": 0.7339003682136536, "num_tokens": 25872510.0, "step": 8967, "train/ce_loss": 0.7447497844696045 }, { "epoch": 0.8865928416056951, "step": 8967, "train/sim_loss": 0.05078125 }, { "epoch": 0.8865928416056951, "step": 8967, "train/total_loss": 0.1252562403678894 }, { "entropy": 9.116350173950195, "epoch": 0.8866917144552106, "mean_token_accuracy": 0.7511811256408691, "num_tokens": 25877586.0, "step": 8968, "train/ce_loss": 4.29521065825611e-07 }, { "epoch": 0.8866917144552106, "step": 8968, "train/sim_loss": 0.046875 }, { "epoch": 0.8866917144552106, "step": 8968, "train/total_loss": 0.04687504470348358 }, { "entropy": 8.468484878540039, "epoch": 0.8867905873047262, "mean_token_accuracy": 0.7777777910232544, "num_tokens": 25882886.0, "step": 8969, "train/ce_loss": 0.45138150453567505 }, { "epoch": 0.8867905873047262, "step": 8969, "train/sim_loss": 0.03125 }, { "epoch": 0.8867905873047262, "step": 8969, "train/total_loss": 0.0763881504535675 }, { "entropy": 8.804032325744629, "epoch": 0.8868894601542416, "mean_token_accuracy": 0.7127799987792969, "num_tokens": 25888161.0, "step": 8970, "train/ce_loss": 0.6077730655670166 }, { "epoch": 0.8868894601542416, "step": 8970, "train/sim_loss": 0.01953125 }, { "epoch": 0.8868894601542416, "step": 8970, "train/total_loss": 0.08030855655670166 }, { "entropy": 8.558965682983398, "epoch": 0.8869883330037571, "mean_token_accuracy": 0.7459633946418762, "num_tokens": 25893581.0, "step": 8971, "train/ce_loss": 0.9098638296127319 }, { "epoch": 0.8869883330037571, "step": 8971, "train/sim_loss": 0.05078125 }, { "epoch": 0.8869883330037571, "step": 8971, "train/total_loss": 0.14176763594150543 }, { "entropy": 9.194341659545898, "epoch": 0.8870872058532727, "mean_token_accuracy": 0.7659574747085571, "num_tokens": 25898560.0, "step": 8972, "train/ce_loss": 1.1279083490371704 }, { "epoch": 0.8870872058532727, "step": 8972, "train/sim_loss": 0.06640625 }, { "epoch": 0.8870872058532727, "step": 8972, "train/total_loss": 0.17919708788394928 }, { "entropy": 9.268182754516602, "epoch": 0.8871860787027882, "mean_token_accuracy": 0.7967479825019836, "num_tokens": 25903590.0, "step": 8973, "train/ce_loss": 0.5328656435012817 }, { "epoch": 0.8871860787027882, "step": 8973, "train/sim_loss": 0.0234375 }, { "epoch": 0.8871860787027882, "step": 8973, "train/total_loss": 0.07672406733036041 }, { "entropy": 8.511130332946777, "epoch": 0.8872849515523037, "mean_token_accuracy": 0.7100238800048828, "num_tokens": 25908896.0, "step": 8974, "train/ce_loss": 0.8261290192604065 }, { "epoch": 0.8872849515523037, "step": 8974, "train/sim_loss": 0.04296875 }, { "epoch": 0.8872849515523037, "step": 8974, "train/total_loss": 0.12558165192604065 }, { "entropy": 8.378144264221191, "epoch": 0.8873838244018193, "mean_token_accuracy": 0.7524971961975098, "num_tokens": 25914255.0, "step": 8975, "train/ce_loss": 0.9754911661148071 }, { "epoch": 0.8873838244018193, "step": 8975, "train/sim_loss": 0.046875 }, { "epoch": 0.8873838244018193, "step": 8975, "train/total_loss": 0.14442411065101624 }, { "entropy": 9.569849014282227, "epoch": 0.8874826972513348, "mean_token_accuracy": 0.726190447807312, "num_tokens": 25919130.0, "step": 8976, "train/ce_loss": 8.707828555998276e-07 }, { "epoch": 0.8874826972513348, "step": 8976, "train/sim_loss": 0.0390625 }, { "epoch": 0.8874826972513348, "step": 8976, "train/total_loss": 0.039062585681676865 }, { "entropy": 9.503039360046387, "epoch": 0.8875815701008503, "mean_token_accuracy": 0.71856290102005, "num_tokens": 25924031.0, "step": 8977, "train/ce_loss": 6.605205271625891e-07 }, { "epoch": 0.8875815701008503, "step": 8977, "train/sim_loss": 0.05859375 }, { "epoch": 0.8875815701008503, "step": 8977, "train/total_loss": 0.05859381705522537 }, { "entropy": 9.024887084960938, "epoch": 0.8876804429503659, "mean_token_accuracy": 0.8118518590927124, "num_tokens": 25929125.0, "step": 8978, "train/ce_loss": 0.4504447877407074 }, { "epoch": 0.8876804429503659, "step": 8978, "train/sim_loss": 0.0234375 }, { "epoch": 0.8876804429503659, "step": 8978, "train/total_loss": 0.06848198175430298 }, { "entropy": 8.868086814880371, "epoch": 0.8877793157998813, "mean_token_accuracy": 0.7417417168617249, "num_tokens": 25934228.0, "step": 8979, "train/ce_loss": 1.3392902612686157 }, { "epoch": 0.8877793157998813, "step": 8979, "train/sim_loss": 0.05078125 }, { "epoch": 0.8877793157998813, "step": 8979, "train/total_loss": 0.1847102791070938 }, { "epoch": 0.8878781886493968, "grad_norm": 0.6756343841552734, "learning_rate": 7.78247539929783e-06, "loss": 0.1313, "step": 8980 }, { "entropy": 9.047006607055664, "epoch": 0.8878781886493968, "mean_token_accuracy": 0.729907751083374, "num_tokens": 25939570.0, "step": 8980, "train/ce_loss": 1.318399429321289 }, { "epoch": 0.8878781886493968, "step": 8980, "train/sim_loss": 0.09375 }, { "epoch": 0.8878781886493968, "step": 8980, "train/total_loss": 0.22558994591236115 }, { "entropy": 8.497231483459473, "epoch": 0.8879770614989124, "mean_token_accuracy": 0.7470930218696594, "num_tokens": 25945098.0, "step": 8981, "train/ce_loss": 0.9360732436180115 }, { "epoch": 0.8879770614989124, "step": 8981, "train/sim_loss": 0.046875 }, { "epoch": 0.8879770614989124, "step": 8981, "train/total_loss": 0.1404823362827301 }, { "entropy": 8.695652961730957, "epoch": 0.8880759343484279, "mean_token_accuracy": 0.734649121761322, "num_tokens": 25950459.0, "step": 8982, "train/ce_loss": 0.5582740902900696 }, { "epoch": 0.8880759343484279, "step": 8982, "train/sim_loss": 0.015625 }, { "epoch": 0.8880759343484279, "step": 8982, "train/total_loss": 0.07145240902900696 }, { "entropy": 8.390104293823242, "epoch": 0.8881748071979434, "mean_token_accuracy": 0.6779220700263977, "num_tokens": 25956086.0, "step": 8983, "train/ce_loss": 0.9199328422546387 }, { "epoch": 0.8881748071979434, "step": 8983, "train/sim_loss": 0.0390625 }, { "epoch": 0.8881748071979434, "step": 8983, "train/total_loss": 0.1310557872056961 }, { "entropy": 8.763376235961914, "epoch": 0.888273680047459, "mean_token_accuracy": 0.765389084815979, "num_tokens": 25961419.0, "step": 8984, "train/ce_loss": 0.5386528372764587 }, { "epoch": 0.888273680047459, "step": 8984, "train/sim_loss": 0.0234375 }, { "epoch": 0.888273680047459, "step": 8984, "train/total_loss": 0.07730278372764587 }, { "entropy": 8.878808975219727, "epoch": 0.8883725528969745, "mean_token_accuracy": 0.7311557531356812, "num_tokens": 25966562.0, "step": 8985, "train/ce_loss": 0.8655556440353394 }, { "epoch": 0.8883725528969745, "step": 8985, "train/sim_loss": 0.1171875 }, { "epoch": 0.8883725528969745, "step": 8985, "train/total_loss": 0.2037430703639984 }, { "entropy": 9.184910774230957, "epoch": 0.88847142574649, "mean_token_accuracy": 0.7799227833747864, "num_tokens": 25971506.0, "step": 8986, "train/ce_loss": 2.3841751328745886e-07 }, { "epoch": 0.88847142574649, "step": 8986, "train/sim_loss": 0.0390625 }, { "epoch": 0.88847142574649, "step": 8986, "train/total_loss": 0.03906252235174179 }, { "entropy": 8.740650177001953, "epoch": 0.8885702985960056, "mean_token_accuracy": 0.745932400226593, "num_tokens": 25976772.0, "step": 8987, "train/ce_loss": 0.5288046598434448 }, { "epoch": 0.8885702985960056, "step": 8987, "train/sim_loss": 0.05078125 }, { "epoch": 0.8885702985960056, "step": 8987, "train/total_loss": 0.10366171598434448 }, { "entropy": 8.849593162536621, "epoch": 0.888669171445521, "mean_token_accuracy": 0.737423300743103, "num_tokens": 25982054.0, "step": 8988, "train/ce_loss": 1.2712167501449585 }, { "epoch": 0.888669171445521, "step": 8988, "train/sim_loss": 0.0859375 }, { "epoch": 0.888669171445521, "step": 8988, "train/total_loss": 0.2130591720342636 }, { "entropy": 8.474994659423828, "epoch": 0.8887680442950365, "mean_token_accuracy": 0.7028301954269409, "num_tokens": 25987379.0, "step": 8989, "train/ce_loss": 0.5919832587242126 }, { "epoch": 0.8887680442950365, "step": 8989, "train/sim_loss": 0.03515625 }, { "epoch": 0.8887680442950365, "step": 8989, "train/total_loss": 0.09435457736253738 }, { "entropy": 8.605907440185547, "epoch": 0.8888669171445521, "mean_token_accuracy": 0.7706310749053955, "num_tokens": 25992705.0, "step": 8990, "train/ce_loss": 0.5869959592819214 }, { "epoch": 0.8888669171445521, "step": 8990, "train/sim_loss": 0.03125 }, { "epoch": 0.8888669171445521, "step": 8990, "train/total_loss": 0.0899495929479599 }, { "entropy": 8.663553237915039, "epoch": 0.8889657899940676, "mean_token_accuracy": 0.7052631378173828, "num_tokens": 25998037.0, "step": 8991, "train/ce_loss": 0.7423734068870544 }, { "epoch": 0.8889657899940676, "step": 8991, "train/sim_loss": 0.08203125 }, { "epoch": 0.8889657899940676, "step": 8991, "train/total_loss": 0.15626859664916992 }, { "entropy": 9.869404792785645, "epoch": 0.8890646628435831, "mean_token_accuracy": 0.748792290687561, "num_tokens": 26002662.0, "step": 8992, "train/ce_loss": 6.933688041499408e-07 }, { "epoch": 0.8890646628435831, "step": 8992, "train/sim_loss": 0.0234375 }, { "epoch": 0.8890646628435831, "step": 8992, "train/total_loss": 0.02343756891787052 }, { "entropy": 8.575206756591797, "epoch": 0.8891635356930987, "mean_token_accuracy": 0.6972789168357849, "num_tokens": 26008022.0, "step": 8993, "train/ce_loss": 0.8619474172592163 }, { "epoch": 0.8891635356930987, "step": 8993, "train/sim_loss": 0.03515625 }, { "epoch": 0.8891635356930987, "step": 8993, "train/total_loss": 0.12135099619626999 }, { "entropy": 9.302907943725586, "epoch": 0.8892624085426142, "mean_token_accuracy": 0.7334710955619812, "num_tokens": 26012932.0, "step": 8994, "train/ce_loss": 5.172261126062949e-07 }, { "epoch": 0.8892624085426142, "step": 8994, "train/sim_loss": 0.0546875 }, { "epoch": 0.8892624085426142, "step": 8994, "train/total_loss": 0.05468755215406418 }, { "entropy": 8.820615768432617, "epoch": 0.8893612813921298, "mean_token_accuracy": 0.7038043737411499, "num_tokens": 26018045.0, "step": 8995, "train/ce_loss": 8.845435331750195e-06 }, { "epoch": 0.8893612813921298, "step": 8995, "train/sim_loss": 0.05859375 }, { "epoch": 0.8893612813921298, "step": 8995, "train/total_loss": 0.058594632893800735 }, { "entropy": 8.942163467407227, "epoch": 0.8894601542416453, "mean_token_accuracy": 0.7947368621826172, "num_tokens": 26023442.0, "step": 8996, "train/ce_loss": 0.6891187429428101 }, { "epoch": 0.8894601542416453, "step": 8996, "train/sim_loss": 0.0625 }, { "epoch": 0.8894601542416453, "step": 8996, "train/total_loss": 0.13141188025474548 }, { "entropy": 8.762359619140625, "epoch": 0.8895590270911607, "mean_token_accuracy": 0.7494407296180725, "num_tokens": 26028765.0, "step": 8997, "train/ce_loss": 1.0854436159133911 }, { "epoch": 0.8895590270911607, "step": 8997, "train/sim_loss": 0.03125 }, { "epoch": 0.8895590270911607, "step": 8997, "train/total_loss": 0.13979436457157135 }, { "entropy": 8.596555709838867, "epoch": 0.8896578999406763, "mean_token_accuracy": 0.8172484636306763, "num_tokens": 26034234.0, "step": 8998, "train/ce_loss": 0.6125220656394958 }, { "epoch": 0.8896578999406763, "step": 8998, "train/sim_loss": 0.04296875 }, { "epoch": 0.8896578999406763, "step": 8998, "train/total_loss": 0.10422095656394958 }, { "entropy": 9.128072738647461, "epoch": 0.8897567727901918, "mean_token_accuracy": 0.6632201075553894, "num_tokens": 26039394.0, "step": 8999, "train/ce_loss": 0.9784678220748901 }, { "epoch": 0.8897567727901918, "step": 8999, "train/sim_loss": 0.05859375 }, { "epoch": 0.8897567727901918, "step": 8999, "train/total_loss": 0.15644052624702454 }, { "epoch": 0.8898556456397073, "grad_norm": 0.751347541809082, "learning_rate": 7.77753053453988e-06, "loss": 0.1359, "step": 9000 }, { "entropy": 8.731948852539062, "epoch": 0.8898556456397073, "mean_token_accuracy": 0.7823458313941956, "num_tokens": 26044735.0, "step": 9000, "train/ce_loss": 0.5468152165412903 }, { "epoch": 0.8898556456397073, "step": 9000, "train/sim_loss": 0.0234375 }, { "epoch": 0.8898556456397073, "step": 9000, "train/total_loss": 0.07811902463436127 }, { "entropy": 9.005437850952148, "epoch": 0.8899545184892229, "mean_token_accuracy": 0.695195198059082, "num_tokens": 26049826.0, "step": 9001, "train/ce_loss": 0.40771716833114624 }, { "epoch": 0.8899545184892229, "step": 9001, "train/sim_loss": 0.06640625 }, { "epoch": 0.8899545184892229, "step": 9001, "train/total_loss": 0.1071779727935791 }, { "entropy": 8.488256454467773, "epoch": 0.8900533913387384, "mean_token_accuracy": 0.75, "num_tokens": 26055136.0, "step": 9002, "train/ce_loss": 0.5950515270233154 }, { "epoch": 0.8900533913387384, "step": 9002, "train/sim_loss": 0.09375 }, { "epoch": 0.8900533913387384, "step": 9002, "train/total_loss": 0.1532551497220993 }, { "entropy": 8.892885208129883, "epoch": 0.8901522641882539, "mean_token_accuracy": 0.7684515118598938, "num_tokens": 26060244.0, "step": 9003, "train/ce_loss": 0.6378443837165833 }, { "epoch": 0.8901522641882539, "step": 9003, "train/sim_loss": 0.02734375 }, { "epoch": 0.8901522641882539, "step": 9003, "train/total_loss": 0.09112819284200668 }, { "entropy": 9.24199390411377, "epoch": 0.8902511370377695, "mean_token_accuracy": 0.7516629695892334, "num_tokens": 26065111.0, "step": 9004, "train/ce_loss": 1.6132978200912476 }, { "epoch": 0.8902511370377695, "step": 9004, "train/sim_loss": 0.0625 }, { "epoch": 0.8902511370377695, "step": 9004, "train/total_loss": 0.22382979094982147 }, { "entropy": 9.007030487060547, "epoch": 0.890350009887285, "mean_token_accuracy": 0.7781201601028442, "num_tokens": 26070247.0, "step": 9005, "train/ce_loss": 0.9047526121139526 }, { "epoch": 0.890350009887285, "step": 9005, "train/sim_loss": 0.0390625 }, { "epoch": 0.890350009887285, "step": 9005, "train/total_loss": 0.12953776121139526 }, { "entropy": 8.367483139038086, "epoch": 0.8904488827368005, "mean_token_accuracy": 0.7826552391052246, "num_tokens": 26075689.0, "step": 9006, "train/ce_loss": 0.6974188089370728 }, { "epoch": 0.8904488827368005, "step": 9006, "train/sim_loss": 0.046875 }, { "epoch": 0.8904488827368005, "step": 9006, "train/total_loss": 0.1166168823838234 }, { "entropy": 8.697525024414062, "epoch": 0.890547755586316, "mean_token_accuracy": 0.8026666641235352, "num_tokens": 26080902.0, "step": 9007, "train/ce_loss": 1.102607250213623 }, { "epoch": 0.890547755586316, "step": 9007, "train/sim_loss": 0.046875 }, { "epoch": 0.890547755586316, "step": 9007, "train/total_loss": 0.1571357250213623 }, { "entropy": 9.06612777709961, "epoch": 0.8906466284358315, "mean_token_accuracy": 0.744027316570282, "num_tokens": 26085963.0, "step": 9008, "train/ce_loss": 4.638117445665557e-07 }, { "epoch": 0.8906466284358315, "step": 9008, "train/sim_loss": 0.0234375 }, { "epoch": 0.8906466284358315, "step": 9008, "train/total_loss": 0.02343754656612873 }, { "entropy": 8.718547821044922, "epoch": 0.890745501285347, "mean_token_accuracy": 0.7577388882637024, "num_tokens": 26091164.0, "step": 9009, "train/ce_loss": 0.515997588634491 }, { "epoch": 0.890745501285347, "step": 9009, "train/sim_loss": 0.0390625 }, { "epoch": 0.890745501285347, "step": 9009, "train/total_loss": 0.09066225588321686 }, { "entropy": 8.434659957885742, "epoch": 0.8908443741348626, "mean_token_accuracy": 0.6988235116004944, "num_tokens": 26096475.0, "step": 9010, "train/ce_loss": 0.7154737710952759 }, { "epoch": 0.8908443741348626, "step": 9010, "train/sim_loss": 0.0390625 }, { "epoch": 0.8908443741348626, "step": 9010, "train/total_loss": 0.11060988157987595 }, { "entropy": 8.800464630126953, "epoch": 0.8909432469843781, "mean_token_accuracy": 0.771501898765564, "num_tokens": 26101733.0, "step": 9011, "train/ce_loss": 4.872323984272953e-07 }, { "epoch": 0.8909432469843781, "step": 9011, "train/sim_loss": 0.0546875 }, { "epoch": 0.8909432469843781, "step": 9011, "train/total_loss": 0.05468754842877388 }, { "entropy": 8.192608833312988, "epoch": 0.8910421198338936, "mean_token_accuracy": 0.7322916388511658, "num_tokens": 26107180.0, "step": 9012, "train/ce_loss": 0.6412844061851501 }, { "epoch": 0.8910421198338936, "step": 9012, "train/sim_loss": 0.05078125 }, { "epoch": 0.8910421198338936, "step": 9012, "train/total_loss": 0.11490969359874725 }, { "entropy": 9.043550491333008, "epoch": 0.8911409926834092, "mean_token_accuracy": 0.7675675749778748, "num_tokens": 26112352.0, "step": 9013, "train/ce_loss": 0.4754367172718048 }, { "epoch": 0.8911409926834092, "step": 9013, "train/sim_loss": 0.0546875 }, { "epoch": 0.8911409926834092, "step": 9013, "train/total_loss": 0.10223117470741272 }, { "entropy": 8.560033798217773, "epoch": 0.8912398655329247, "mean_token_accuracy": 0.7476303577423096, "num_tokens": 26117652.0, "step": 9014, "train/ce_loss": 1.167054533958435 }, { "epoch": 0.8912398655329247, "step": 9014, "train/sim_loss": 0.0546875 }, { "epoch": 0.8912398655329247, "step": 9014, "train/total_loss": 0.17139294743537903 }, { "entropy": 8.732704162597656, "epoch": 0.8913387383824402, "mean_token_accuracy": 0.7706043720245361, "num_tokens": 26122837.0, "step": 9015, "train/ce_loss": 0.5358596444129944 }, { "epoch": 0.8913387383824402, "step": 9015, "train/sim_loss": 0.04296875 }, { "epoch": 0.8913387383824402, "step": 9015, "train/total_loss": 0.0965547114610672 }, { "entropy": 8.724569320678711, "epoch": 0.8914376112319558, "mean_token_accuracy": 0.6988266110420227, "num_tokens": 26128083.0, "step": 9016, "train/ce_loss": 0.9315149784088135 }, { "epoch": 0.8914376112319558, "step": 9016, "train/sim_loss": 0.0390625 }, { "epoch": 0.8914376112319558, "step": 9016, "train/total_loss": 0.1322140097618103 }, { "entropy": 8.461874008178711, "epoch": 0.8915364840814712, "mean_token_accuracy": 0.7396335601806641, "num_tokens": 26133608.0, "step": 9017, "train/ce_loss": 0.8199291229248047 }, { "epoch": 0.8915364840814712, "step": 9017, "train/sim_loss": 0.10546875 }, { "epoch": 0.8915364840814712, "step": 9017, "train/total_loss": 0.18746167421340942 }, { "entropy": 8.35120964050293, "epoch": 0.8916353569309867, "mean_token_accuracy": 0.7727272510528564, "num_tokens": 26139153.0, "step": 9018, "train/ce_loss": 0.6267951130867004 }, { "epoch": 0.8916353569309867, "step": 9018, "train/sim_loss": 0.03515625 }, { "epoch": 0.8916353569309867, "step": 9018, "train/total_loss": 0.09783576428890228 }, { "entropy": 9.33806037902832, "epoch": 0.8917342297805023, "mean_token_accuracy": 0.7386138439178467, "num_tokens": 26144101.0, "step": 9019, "train/ce_loss": 1.6418176889419556 }, { "epoch": 0.8917342297805023, "step": 9019, "train/sim_loss": 0.05078125 }, { "epoch": 0.8917342297805023, "step": 9019, "train/total_loss": 0.21496301889419556 }, { "epoch": 0.8918331026300178, "grad_norm": 0.6972878575325012, "learning_rate": 7.772585669781933e-06, "loss": 0.1285, "step": 9020 }, { "entropy": 9.390382766723633, "epoch": 0.8918331026300178, "mean_token_accuracy": 0.7250608205795288, "num_tokens": 26148919.0, "step": 9020, "train/ce_loss": 1.5816576480865479 }, { "epoch": 0.8918331026300178, "step": 9020, "train/sim_loss": 0.03125 }, { "epoch": 0.8918331026300178, "step": 9020, "train/total_loss": 0.18941576778888702 }, { "entropy": 9.10663890838623, "epoch": 0.8919319754795333, "mean_token_accuracy": 0.7777777910232544, "num_tokens": 26154008.0, "step": 9021, "train/ce_loss": 1.269675850868225 }, { "epoch": 0.8919319754795333, "step": 9021, "train/sim_loss": 0.0703125 }, { "epoch": 0.8919319754795333, "step": 9021, "train/total_loss": 0.19728009402751923 }, { "entropy": 9.42596435546875, "epoch": 0.8920308483290489, "mean_token_accuracy": 0.7516930103302002, "num_tokens": 26158859.0, "step": 9022, "train/ce_loss": 4.456167630451091e-07 }, { "epoch": 0.8920308483290489, "step": 9022, "train/sim_loss": 0.0390625 }, { "epoch": 0.8920308483290489, "step": 9022, "train/total_loss": 0.03906254470348358 }, { "entropy": 8.594335556030273, "epoch": 0.8921297211785644, "mean_token_accuracy": 0.7644287347793579, "num_tokens": 26164362.0, "step": 9023, "train/ce_loss": 0.6554343104362488 }, { "epoch": 0.8921297211785644, "step": 9023, "train/sim_loss": 0.01953125 }, { "epoch": 0.8921297211785644, "step": 9023, "train/total_loss": 0.08507468551397324 }, { "entropy": 9.105646133422852, "epoch": 0.8922285940280799, "mean_token_accuracy": 0.692307710647583, "num_tokens": 26169576.0, "step": 9024, "train/ce_loss": 1.1244890689849854 }, { "epoch": 0.8922285940280799, "step": 9024, "train/sim_loss": 0.08984375 }, { "epoch": 0.8922285940280799, "step": 9024, "train/total_loss": 0.20229265093803406 }, { "entropy": 9.05662727355957, "epoch": 0.8923274668775955, "mean_token_accuracy": 0.7682291865348816, "num_tokens": 26174811.0, "step": 9025, "train/ce_loss": 0.9285861849784851 }, { "epoch": 0.8923274668775955, "step": 9025, "train/sim_loss": 0.07421875 }, { "epoch": 0.8923274668775955, "step": 9025, "train/total_loss": 0.16707736253738403 }, { "entropy": 8.581493377685547, "epoch": 0.8924263397271109, "mean_token_accuracy": 0.7704917788505554, "num_tokens": 26180168.0, "step": 9026, "train/ce_loss": 0.6383937001228333 }, { "epoch": 0.8924263397271109, "step": 9026, "train/sim_loss": 0.04296875 }, { "epoch": 0.8924263397271109, "step": 9026, "train/total_loss": 0.1068081185221672 }, { "entropy": 9.577193260192871, "epoch": 0.8925252125766264, "mean_token_accuracy": 0.782608687877655, "num_tokens": 26184875.0, "step": 9027, "train/ce_loss": 0.9829044342041016 }, { "epoch": 0.8925252125766264, "step": 9027, "train/sim_loss": 0.0390625 }, { "epoch": 0.8925252125766264, "step": 9027, "train/total_loss": 0.13735294342041016 }, { "entropy": 8.63807487487793, "epoch": 0.892624085426142, "mean_token_accuracy": 0.7050528526306152, "num_tokens": 26190204.0, "step": 9028, "train/ce_loss": 0.7294370532035828 }, { "epoch": 0.892624085426142, "step": 9028, "train/sim_loss": 0.06640625 }, { "epoch": 0.892624085426142, "step": 9028, "train/total_loss": 0.13934996724128723 }, { "entropy": 8.742045402526855, "epoch": 0.8927229582756575, "mean_token_accuracy": 0.662162184715271, "num_tokens": 26195414.0, "step": 9029, "train/ce_loss": 6.97838459018385e-07 }, { "epoch": 0.8927229582756575, "step": 9029, "train/sim_loss": 0.046875 }, { "epoch": 0.8927229582756575, "step": 9029, "train/total_loss": 0.04687507078051567 }, { "entropy": 8.24318790435791, "epoch": 0.892821831125173, "mean_token_accuracy": 0.7603686451911926, "num_tokens": 26200938.0, "step": 9030, "train/ce_loss": 0.8421469330787659 }, { "epoch": 0.892821831125173, "step": 9030, "train/sim_loss": 0.078125 }, { "epoch": 0.892821831125173, "step": 9030, "train/total_loss": 0.1623396873474121 }, { "entropy": 9.521178245544434, "epoch": 0.8929207039746886, "mean_token_accuracy": 0.6871035695075989, "num_tokens": 26205841.0, "step": 9031, "train/ce_loss": 2.0296339988708496 }, { "epoch": 0.8929207039746886, "step": 9031, "train/sim_loss": 0.0625 }, { "epoch": 0.8929207039746886, "step": 9031, "train/total_loss": 0.2654634118080139 }, { "entropy": 8.726725578308105, "epoch": 0.8930195768242041, "mean_token_accuracy": 0.8068965673446655, "num_tokens": 26211062.0, "step": 9032, "train/ce_loss": 0.7748110890388489 }, { "epoch": 0.8930195768242041, "step": 9032, "train/sim_loss": 0.015625 }, { "epoch": 0.8930195768242041, "step": 9032, "train/total_loss": 0.09310611337423325 }, { "entropy": 9.207667350769043, "epoch": 0.8931184496737196, "mean_token_accuracy": 0.7491227984428406, "num_tokens": 26216088.0, "step": 9033, "train/ce_loss": 0.8433367609977722 }, { "epoch": 0.8931184496737196, "step": 9033, "train/sim_loss": 0.078125 }, { "epoch": 0.8931184496737196, "step": 9033, "train/total_loss": 0.16245868802070618 }, { "entropy": 9.220256805419922, "epoch": 0.8932173225232352, "mean_token_accuracy": 0.7688524723052979, "num_tokens": 26221121.0, "step": 9034, "train/ce_loss": 0.7536109089851379 }, { "epoch": 0.8932173225232352, "step": 9034, "train/sim_loss": 0.05078125 }, { "epoch": 0.8932173225232352, "step": 9034, "train/total_loss": 0.12614235281944275 }, { "entropy": 9.368236541748047, "epoch": 0.8933161953727506, "mean_token_accuracy": 0.785263180732727, "num_tokens": 26226030.0, "step": 9035, "train/ce_loss": 1.0121777057647705 }, { "epoch": 0.8933161953727506, "step": 9035, "train/sim_loss": 0.03515625 }, { "epoch": 0.8933161953727506, "step": 9035, "train/total_loss": 0.13637402653694153 }, { "entropy": 8.360307693481445, "epoch": 0.8934150682222661, "mean_token_accuracy": 0.7129071354866028, "num_tokens": 26231407.0, "step": 9036, "train/ce_loss": 1.0447863340377808 }, { "epoch": 0.8934150682222661, "step": 9036, "train/sim_loss": 0.0625 }, { "epoch": 0.8934150682222661, "step": 9036, "train/total_loss": 0.1669786274433136 }, { "entropy": 8.6937255859375, "epoch": 0.8935139410717817, "mean_token_accuracy": 0.7613762617111206, "num_tokens": 26236818.0, "step": 9037, "train/ce_loss": 0.8467341661453247 }, { "epoch": 0.8935139410717817, "step": 9037, "train/sim_loss": 0.015625 }, { "epoch": 0.8935139410717817, "step": 9037, "train/total_loss": 0.10029841959476471 }, { "entropy": 8.954123497009277, "epoch": 0.8936128139212972, "mean_token_accuracy": 0.7614991664886475, "num_tokens": 26241818.0, "step": 9038, "train/ce_loss": 0.7165358066558838 }, { "epoch": 0.8936128139212972, "step": 9038, "train/sim_loss": 0.04296875 }, { "epoch": 0.8936128139212972, "step": 9038, "train/total_loss": 0.1146223321557045 }, { "entropy": 9.098472595214844, "epoch": 0.8937116867708127, "mean_token_accuracy": 0.7612456679344177, "num_tokens": 26246832.0, "step": 9039, "train/ce_loss": 1.1427669525146484 }, { "epoch": 0.8937116867708127, "step": 9039, "train/sim_loss": 0.05078125 }, { "epoch": 0.8937116867708127, "step": 9039, "train/total_loss": 0.1650579571723938 }, { "epoch": 0.8938105596203283, "grad_norm": 0.6742735505104065, "learning_rate": 7.767640805023983e-06, "loss": 0.1266, "step": 9040 }, { "entropy": 8.34196662902832, "epoch": 0.8938105596203283, "mean_token_accuracy": 0.7777777910232544, "num_tokens": 26252304.0, "step": 9040, "train/ce_loss": 1.012823462486267 }, { "epoch": 0.8938105596203283, "step": 9040, "train/sim_loss": 0.046875 }, { "epoch": 0.8938105596203283, "step": 9040, "train/total_loss": 0.14815735816955566 }, { "entropy": 8.855175971984863, "epoch": 0.8939094324698438, "mean_token_accuracy": 0.7379679083824158, "num_tokens": 26257507.0, "step": 9041, "train/ce_loss": 1.2165783643722534 }, { "epoch": 0.8939094324698438, "step": 9041, "train/sim_loss": 0.07421875 }, { "epoch": 0.8939094324698438, "step": 9041, "train/total_loss": 0.1958765983581543 }, { "entropy": 8.561556816101074, "epoch": 0.8940083053193593, "mean_token_accuracy": 0.7976694703102112, "num_tokens": 26262920.0, "step": 9042, "train/ce_loss": 0.725635826587677 }, { "epoch": 0.8940083053193593, "step": 9042, "train/sim_loss": 0.01953125 }, { "epoch": 0.8940083053193593, "step": 9042, "train/total_loss": 0.09209483116865158 }, { "entropy": 8.421992301940918, "epoch": 0.8941071781688749, "mean_token_accuracy": 0.7265269160270691, "num_tokens": 26268483.0, "step": 9043, "train/ce_loss": 1.123929738998413 }, { "epoch": 0.8941071781688749, "step": 9043, "train/sim_loss": 0.13671875 }, { "epoch": 0.8941071781688749, "step": 9043, "train/total_loss": 0.24911172688007355 }, { "entropy": 8.830625534057617, "epoch": 0.8942060510183903, "mean_token_accuracy": 0.7846607565879822, "num_tokens": 26273604.0, "step": 9044, "train/ce_loss": 1.676502506597899e-06 }, { "epoch": 0.8942060510183903, "step": 9044, "train/sim_loss": 0.08984375 }, { "epoch": 0.8942060510183903, "step": 9044, "train/total_loss": 0.08984392136335373 }, { "entropy": 9.134194374084473, "epoch": 0.8943049238679058, "mean_token_accuracy": 0.7523364424705505, "num_tokens": 26278673.0, "step": 9045, "train/ce_loss": 0.8511750102043152 }, { "epoch": 0.8943049238679058, "step": 9045, "train/sim_loss": 0.03515625 }, { "epoch": 0.8943049238679058, "step": 9045, "train/total_loss": 0.12027375400066376 }, { "entropy": 8.744175910949707, "epoch": 0.8944037967174214, "mean_token_accuracy": 0.70333331823349, "num_tokens": 26284089.0, "step": 9046, "train/ce_loss": 1.0774354934692383 }, { "epoch": 0.8944037967174214, "step": 9046, "train/sim_loss": 0.0859375 }, { "epoch": 0.8944037967174214, "step": 9046, "train/total_loss": 0.19368106126785278 }, { "entropy": 8.416398048400879, "epoch": 0.8945026695669369, "mean_token_accuracy": 0.7437295317649841, "num_tokens": 26289527.0, "step": 9047, "train/ce_loss": 0.5761348605155945 }, { "epoch": 0.8945026695669369, "step": 9047, "train/sim_loss": 0.046875 }, { "epoch": 0.8945026695669369, "step": 9047, "train/total_loss": 0.10448849201202393 }, { "entropy": 9.505701065063477, "epoch": 0.8946015424164524, "mean_token_accuracy": 0.7559241652488708, "num_tokens": 26294365.0, "step": 9048, "train/ce_loss": 1.436909556388855 }, { "epoch": 0.8946015424164524, "step": 9048, "train/sim_loss": 0.04296875 }, { "epoch": 0.8946015424164524, "step": 9048, "train/total_loss": 0.18665970861911774 }, { "entropy": 9.163416862487793, "epoch": 0.894700415265968, "mean_token_accuracy": 0.7727952003479004, "num_tokens": 26299507.0, "step": 9049, "train/ce_loss": 1.247650146484375 }, { "epoch": 0.894700415265968, "step": 9049, "train/sim_loss": 0.0546875 }, { "epoch": 0.894700415265968, "step": 9049, "train/total_loss": 0.17945250868797302 }, { "entropy": 9.116547584533691, "epoch": 0.8947992881154835, "mean_token_accuracy": 0.7496111989021301, "num_tokens": 26304595.0, "step": 9050, "train/ce_loss": 1.3614546060562134 }, { "epoch": 0.8947992881154835, "step": 9050, "train/sim_loss": 0.11328125 }, { "epoch": 0.8947992881154835, "step": 9050, "train/total_loss": 0.2494267076253891 }, { "entropy": 9.33833122253418, "epoch": 0.894898160964999, "mean_token_accuracy": 0.6994906663894653, "num_tokens": 26309668.0, "step": 9051, "train/ce_loss": 0.9015560150146484 }, { "epoch": 0.894898160964999, "step": 9051, "train/sim_loss": 0.0703125 }, { "epoch": 0.894898160964999, "step": 9051, "train/total_loss": 0.16046810150146484 }, { "entropy": 8.719079971313477, "epoch": 0.8949970338145146, "mean_token_accuracy": 0.787089467048645, "num_tokens": 26315022.0, "step": 9052, "train/ce_loss": 0.3523921072483063 }, { "epoch": 0.8949970338145146, "step": 9052, "train/sim_loss": 0.01171875 }, { "epoch": 0.8949970338145146, "step": 9052, "train/total_loss": 0.04695796221494675 }, { "entropy": 8.357917785644531, "epoch": 0.89509590666403, "mean_token_accuracy": 0.71685391664505, "num_tokens": 26320401.0, "step": 9053, "train/ce_loss": 1.4288767576217651 }, { "epoch": 0.89509590666403, "step": 9053, "train/sim_loss": 0.03125 }, { "epoch": 0.89509590666403, "step": 9053, "train/total_loss": 0.174137681722641 }, { "entropy": 9.23624038696289, "epoch": 0.8951947795135455, "mean_token_accuracy": 0.7727272510528564, "num_tokens": 26325419.0, "step": 9054, "train/ce_loss": 0.46162018179893494 }, { "epoch": 0.8951947795135455, "step": 9054, "train/sim_loss": 0.03125 }, { "epoch": 0.8951947795135455, "step": 9054, "train/total_loss": 0.07741202414035797 }, { "entropy": 8.519001007080078, "epoch": 0.8952936523630611, "mean_token_accuracy": 0.7407833933830261, "num_tokens": 26330690.0, "step": 9055, "train/ce_loss": 0.4581650495529175 }, { "epoch": 0.8952936523630611, "step": 9055, "train/sim_loss": 0.03125 }, { "epoch": 0.8952936523630611, "step": 9055, "train/total_loss": 0.07706651091575623 }, { "entropy": 9.182341575622559, "epoch": 0.8953925252125766, "mean_token_accuracy": 0.7250922322273254, "num_tokens": 26335664.0, "step": 9056, "train/ce_loss": 0.8898815512657166 }, { "epoch": 0.8953925252125766, "step": 9056, "train/sim_loss": 0.0546875 }, { "epoch": 0.8953925252125766, "step": 9056, "train/total_loss": 0.14367565512657166 }, { "entropy": 8.56419563293457, "epoch": 0.8954913980620921, "mean_token_accuracy": 0.7774358987808228, "num_tokens": 26341103.0, "step": 9057, "train/ce_loss": 0.5643113851547241 }, { "epoch": 0.8954913980620921, "step": 9057, "train/sim_loss": 0.05078125 }, { "epoch": 0.8954913980620921, "step": 9057, "train/total_loss": 0.10721239447593689 }, { "entropy": 8.964850425720215, "epoch": 0.8955902709116077, "mean_token_accuracy": 0.7044117450714111, "num_tokens": 26346212.0, "step": 9058, "train/ce_loss": 1.4405393600463867 }, { "epoch": 0.8955902709116077, "step": 9058, "train/sim_loss": 0.078125 }, { "epoch": 0.8955902709116077, "step": 9058, "train/total_loss": 0.22217893600463867 }, { "entropy": 8.850532531738281, "epoch": 0.8956891437611232, "mean_token_accuracy": 0.6666666865348816, "num_tokens": 26351549.0, "step": 9059, "train/ce_loss": 1.0828642845153809 }, { "epoch": 0.8956891437611232, "step": 9059, "train/sim_loss": 0.08984375 }, { "epoch": 0.8956891437611232, "step": 9059, "train/total_loss": 0.19813019037246704 }, { "epoch": 0.8957880166106387, "grad_norm": 0.6650404930114746, "learning_rate": 7.762695940266034e-06, "loss": 0.1373, "step": 9060 }, { "entropy": 8.52380084991455, "epoch": 0.8957880166106387, "mean_token_accuracy": 0.7497337460517883, "num_tokens": 26356938.0, "step": 9060, "train/ce_loss": 1.0425094366073608 }, { "epoch": 0.8957880166106387, "step": 9060, "train/sim_loss": 0.06640625 }, { "epoch": 0.8957880166106387, "step": 9060, "train/total_loss": 0.1706571877002716 }, { "entropy": 8.88840103149414, "epoch": 0.8958868894601543, "mean_token_accuracy": 0.7521994113922119, "num_tokens": 26362086.0, "step": 9061, "train/ce_loss": 1.2668105000557262e-06 }, { "epoch": 0.8958868894601543, "step": 9061, "train/sim_loss": 0.0234375 }, { "epoch": 0.8958868894601543, "step": 9061, "train/total_loss": 0.023437626659870148 }, { "entropy": 8.716756820678711, "epoch": 0.8959857623096698, "mean_token_accuracy": 0.7196701765060425, "num_tokens": 26367400.0, "step": 9062, "train/ce_loss": 0.8180581331253052 }, { "epoch": 0.8959857623096698, "step": 9062, "train/sim_loss": 0.0546875 }, { "epoch": 0.8959857623096698, "step": 9062, "train/total_loss": 0.13649332523345947 }, { "entropy": 9.058746337890625, "epoch": 0.8960846351591852, "mean_token_accuracy": 0.7813051342964172, "num_tokens": 26372600.0, "step": 9063, "train/ce_loss": 0.6065887212753296 }, { "epoch": 0.8960846351591852, "step": 9063, "train/sim_loss": 0.03515625 }, { "epoch": 0.8960846351591852, "step": 9063, "train/total_loss": 0.09581512212753296 }, { "entropy": 8.757225036621094, "epoch": 0.8961835080087008, "mean_token_accuracy": 0.6877419352531433, "num_tokens": 26377839.0, "step": 9064, "train/ce_loss": 1.433215618133545 }, { "epoch": 0.8961835080087008, "step": 9064, "train/sim_loss": 0.03125 }, { "epoch": 0.8961835080087008, "step": 9064, "train/total_loss": 0.17457155883312225 }, { "entropy": 8.651361465454102, "epoch": 0.8962823808582163, "mean_token_accuracy": 0.7403740286827087, "num_tokens": 26383210.0, "step": 9065, "train/ce_loss": 0.8476965427398682 }, { "epoch": 0.8962823808582163, "step": 9065, "train/sim_loss": 0.078125 }, { "epoch": 0.8962823808582163, "step": 9065, "train/total_loss": 0.16289466619491577 }, { "entropy": 9.399979591369629, "epoch": 0.8963812537077318, "mean_token_accuracy": 0.8402489423751831, "num_tokens": 26388116.0, "step": 9066, "train/ce_loss": 1.3859827518463135 }, { "epoch": 0.8963812537077318, "step": 9066, "train/sim_loss": 0.03125 }, { "epoch": 0.8963812537077318, "step": 9066, "train/total_loss": 0.1698482781648636 }, { "entropy": 8.580986022949219, "epoch": 0.8964801265572474, "mean_token_accuracy": 0.7384284138679504, "num_tokens": 26393511.0, "step": 9067, "train/ce_loss": 0.5452489852905273 }, { "epoch": 0.8964801265572474, "step": 9067, "train/sim_loss": 0.02734375 }, { "epoch": 0.8964801265572474, "step": 9067, "train/total_loss": 0.08186864852905273 }, { "entropy": 9.107563972473145, "epoch": 0.8965789994067629, "mean_token_accuracy": 0.716356098651886, "num_tokens": 26398411.0, "step": 9068, "train/ce_loss": 2.5687904357910156 }, { "epoch": 0.8965789994067629, "step": 9068, "train/sim_loss": 0.09765625 }, { "epoch": 0.8965789994067629, "step": 9068, "train/total_loss": 0.354535311460495 }, { "entropy": 8.618518829345703, "epoch": 0.8966778722562784, "mean_token_accuracy": 0.7164179086685181, "num_tokens": 26403833.0, "step": 9069, "train/ce_loss": 0.5482277870178223 }, { "epoch": 0.8966778722562784, "step": 9069, "train/sim_loss": 0.03515625 }, { "epoch": 0.8966778722562784, "step": 9069, "train/total_loss": 0.08997903019189835 }, { "entropy": 8.767492294311523, "epoch": 0.896776745105794, "mean_token_accuracy": 0.7418263554573059, "num_tokens": 26409137.0, "step": 9070, "train/ce_loss": 0.6119635105133057 }, { "epoch": 0.896776745105794, "step": 9070, "train/sim_loss": 0.109375 }, { "epoch": 0.896776745105794, "step": 9070, "train/total_loss": 0.17057135701179504 }, { "entropy": 8.644116401672363, "epoch": 0.8968756179553095, "mean_token_accuracy": 0.7927107214927673, "num_tokens": 26414534.0, "step": 9071, "train/ce_loss": 0.7480507493019104 }, { "epoch": 0.8968756179553095, "step": 9071, "train/sim_loss": 0.0859375 }, { "epoch": 0.8968756179553095, "step": 9071, "train/total_loss": 0.16074258089065552 }, { "entropy": 8.785004615783691, "epoch": 0.8969744908048249, "mean_token_accuracy": 0.7204545736312866, "num_tokens": 26419843.0, "step": 9072, "train/ce_loss": 1.3798236846923828 }, { "epoch": 0.8969744908048249, "step": 9072, "train/sim_loss": 0.06640625 }, { "epoch": 0.8969744908048249, "step": 9072, "train/total_loss": 0.20438861846923828 }, { "entropy": 8.707588195800781, "epoch": 0.8970733636543405, "mean_token_accuracy": 0.7354685664176941, "num_tokens": 26425176.0, "step": 9073, "train/ce_loss": 0.8631170988082886 }, { "epoch": 0.8970733636543405, "step": 9073, "train/sim_loss": 0.10546875 }, { "epoch": 0.8970733636543405, "step": 9073, "train/total_loss": 0.1917804628610611 }, { "entropy": 8.898002624511719, "epoch": 0.897172236503856, "mean_token_accuracy": 0.7440944910049438, "num_tokens": 26430433.0, "step": 9074, "train/ce_loss": 0.4725923240184784 }, { "epoch": 0.897172236503856, "step": 9074, "train/sim_loss": 0.046875 }, { "epoch": 0.897172236503856, "step": 9074, "train/total_loss": 0.09413423389196396 }, { "entropy": 9.20399284362793, "epoch": 0.8972711093533715, "mean_token_accuracy": 0.7118644118309021, "num_tokens": 26435497.0, "step": 9075, "train/ce_loss": 1.0065940618515015 }, { "epoch": 0.8972711093533715, "step": 9075, "train/sim_loss": 0.02734375 }, { "epoch": 0.8972711093533715, "step": 9075, "train/total_loss": 0.12800315022468567 }, { "entropy": 8.567405700683594, "epoch": 0.8973699822028871, "mean_token_accuracy": 0.7644171714782715, "num_tokens": 26440767.0, "step": 9076, "train/ce_loss": 0.484589159488678 }, { "epoch": 0.8973699822028871, "step": 9076, "train/sim_loss": 0.0390625 }, { "epoch": 0.8973699822028871, "step": 9076, "train/total_loss": 0.08752141892910004 }, { "entropy": 8.940652847290039, "epoch": 0.8974688550524026, "mean_token_accuracy": 0.750348687171936, "num_tokens": 26445893.0, "step": 9077, "train/ce_loss": 0.8887761831283569 }, { "epoch": 0.8974688550524026, "step": 9077, "train/sim_loss": 0.0703125 }, { "epoch": 0.8974688550524026, "step": 9077, "train/total_loss": 0.1591901183128357 }, { "entropy": 8.863910675048828, "epoch": 0.8975677279019182, "mean_token_accuracy": 0.7949735522270203, "num_tokens": 26451137.0, "step": 9078, "train/ce_loss": 0.5831772685050964 }, { "epoch": 0.8975677279019182, "step": 9078, "train/sim_loss": 0.046875 }, { "epoch": 0.8975677279019182, "step": 9078, "train/total_loss": 0.10519272834062576 }, { "entropy": 9.978321075439453, "epoch": 0.8976666007514337, "mean_token_accuracy": 0.7720588445663452, "num_tokens": 26455767.0, "step": 9079, "train/ce_loss": 8.344578077412734e-07 }, { "epoch": 0.8976666007514337, "step": 9079, "train/sim_loss": 0.01953125 }, { "epoch": 0.8976666007514337, "step": 9079, "train/total_loss": 0.019531333819031715 }, { "epoch": 0.8977654736009492, "grad_norm": 0.8531478047370911, "learning_rate": 7.757751075508086e-06, "loss": 0.1295, "step": 9080 }, { "entropy": 9.510713577270508, "epoch": 0.8977654736009492, "mean_token_accuracy": 0.7338501214981079, "num_tokens": 26460556.0, "step": 9080, "train/ce_loss": 1.867562174797058 }, { "epoch": 0.8977654736009492, "step": 9080, "train/sim_loss": 0.0625 }, { "epoch": 0.8977654736009492, "step": 9080, "train/total_loss": 0.2492562234401703 }, { "entropy": 8.828611373901367, "epoch": 0.8978643464504648, "mean_token_accuracy": 0.7374301552772522, "num_tokens": 26465710.0, "step": 9081, "train/ce_loss": 0.5925540924072266 }, { "epoch": 0.8978643464504648, "step": 9081, "train/sim_loss": 0.0859375 }, { "epoch": 0.8978643464504648, "step": 9081, "train/total_loss": 0.14519290626049042 }, { "entropy": 8.482282638549805, "epoch": 0.8979632192999802, "mean_token_accuracy": 0.792417049407959, "num_tokens": 26471268.0, "step": 9082, "train/ce_loss": 0.6010853052139282 }, { "epoch": 0.8979632192999802, "step": 9082, "train/sim_loss": 0.0703125 }, { "epoch": 0.8979632192999802, "step": 9082, "train/total_loss": 0.13042102754116058 }, { "entropy": 9.155592918395996, "epoch": 0.8980620921494957, "mean_token_accuracy": 0.7203539609909058, "num_tokens": 26476288.0, "step": 9083, "train/ce_loss": 3.7049531442789885e-07 }, { "epoch": 0.8980620921494957, "step": 9083, "train/sim_loss": 0.01171875 }, { "epoch": 0.8980620921494957, "step": 9083, "train/total_loss": 0.011718787252902985 }, { "entropy": 8.928726196289062, "epoch": 0.8981609649990113, "mean_token_accuracy": 0.7413127422332764, "num_tokens": 26481530.0, "step": 9084, "train/ce_loss": 0.9454926252365112 }, { "epoch": 0.8981609649990113, "step": 9084, "train/sim_loss": 0.08203125 }, { "epoch": 0.8981609649990113, "step": 9084, "train/total_loss": 0.1765805184841156 }, { "entropy": 9.107856750488281, "epoch": 0.8982598378485268, "mean_token_accuracy": 0.7364746928215027, "num_tokens": 26486612.0, "step": 9085, "train/ce_loss": 2.3217562272748182e-07 }, { "epoch": 0.8982598378485268, "step": 9085, "train/sim_loss": 0.02734375 }, { "epoch": 0.8982598378485268, "step": 9085, "train/total_loss": 0.02734377235174179 }, { "entropy": 9.56021785736084, "epoch": 0.8983587106980423, "mean_token_accuracy": 0.7511848211288452, "num_tokens": 26491412.0, "step": 9086, "train/ce_loss": 1.2456231117248535 }, { "epoch": 0.8983587106980423, "step": 9086, "train/sim_loss": 0.046875 }, { "epoch": 0.8983587106980423, "step": 9086, "train/total_loss": 0.1714373230934143 }, { "entropy": 9.498257637023926, "epoch": 0.8984575835475579, "mean_token_accuracy": 0.7429245114326477, "num_tokens": 26496233.0, "step": 9087, "train/ce_loss": 2.564113401604118e-07 }, { "epoch": 0.8984575835475579, "step": 9087, "train/sim_loss": 0.015625 }, { "epoch": 0.8984575835475579, "step": 9087, "train/total_loss": 0.01562502607703209 }, { "entropy": 9.015697479248047, "epoch": 0.8985564563970734, "mean_token_accuracy": 0.7487603425979614, "num_tokens": 26501311.0, "step": 9088, "train/ce_loss": 2.7880012112291297e-06 }, { "epoch": 0.8985564563970734, "step": 9088, "train/sim_loss": 0.0234375 }, { "epoch": 0.8985564563970734, "step": 9088, "train/total_loss": 0.023437779396772385 }, { "entropy": 9.054108619689941, "epoch": 0.8986553292465889, "mean_token_accuracy": 0.7352415323257446, "num_tokens": 26506316.0, "step": 9089, "train/ce_loss": 0.9395824074745178 }, { "epoch": 0.8986553292465889, "step": 9089, "train/sim_loss": 0.0546875 }, { "epoch": 0.8986553292465889, "step": 9089, "train/total_loss": 0.14864574372768402 }, { "entropy": 8.577143669128418, "epoch": 0.8987542020961045, "mean_token_accuracy": 0.7199612259864807, "num_tokens": 26511840.0, "step": 9090, "train/ce_loss": 0.6009590029716492 }, { "epoch": 0.8987542020961045, "step": 9090, "train/sim_loss": 0.03125 }, { "epoch": 0.8987542020961045, "step": 9090, "train/total_loss": 0.0913459062576294 }, { "entropy": 8.717094421386719, "epoch": 0.89885307494562, "mean_token_accuracy": 0.7784172892570496, "num_tokens": 26516988.0, "step": 9091, "train/ce_loss": 1.873485643955064e-06 }, { "epoch": 0.89885307494562, "step": 9091, "train/sim_loss": 0.046875 }, { "epoch": 0.89885307494562, "step": 9091, "train/total_loss": 0.04687518626451492 }, { "entropy": 9.038698196411133, "epoch": 0.8989519477951354, "mean_token_accuracy": 0.7762619256973267, "num_tokens": 26522208.0, "step": 9092, "train/ce_loss": 0.8083727359771729 }, { "epoch": 0.8989519477951354, "step": 9092, "train/sim_loss": 0.0390625 }, { "epoch": 0.8989519477951354, "step": 9092, "train/total_loss": 0.11989977210760117 }, { "entropy": 8.21472454071045, "epoch": 0.899050820644651, "mean_token_accuracy": 0.7547169923782349, "num_tokens": 26527721.0, "step": 9093, "train/ce_loss": 0.8184592723846436 }, { "epoch": 0.899050820644651, "step": 9093, "train/sim_loss": 0.0234375 }, { "epoch": 0.899050820644651, "step": 9093, "train/total_loss": 0.10528343170881271 }, { "entropy": 8.824597358703613, "epoch": 0.8991496934941665, "mean_token_accuracy": 0.6925031542778015, "num_tokens": 26532997.0, "step": 9094, "train/ce_loss": 1.0248957872390747 }, { "epoch": 0.8991496934941665, "step": 9094, "train/sim_loss": 0.05859375 }, { "epoch": 0.8991496934941665, "step": 9094, "train/total_loss": 0.16108334064483643 }, { "entropy": 8.745586395263672, "epoch": 0.899248566343682, "mean_token_accuracy": 0.6959999799728394, "num_tokens": 26538335.0, "step": 9095, "train/ce_loss": 0.4734397232532501 }, { "epoch": 0.899248566343682, "step": 9095, "train/sim_loss": 0.04296875 }, { "epoch": 0.899248566343682, "step": 9095, "train/total_loss": 0.09031271934509277 }, { "entropy": 8.67325210571289, "epoch": 0.8993474391931976, "mean_token_accuracy": 0.7426666617393494, "num_tokens": 26543510.0, "step": 9096, "train/ce_loss": 0.7213162183761597 }, { "epoch": 0.8993474391931976, "step": 9096, "train/sim_loss": 0.0546875 }, { "epoch": 0.8993474391931976, "step": 9096, "train/total_loss": 0.12681913375854492 }, { "entropy": 8.939607620239258, "epoch": 0.8994463120427131, "mean_token_accuracy": 0.7454031109809875, "num_tokens": 26548596.0, "step": 9097, "train/ce_loss": 0.9978237748146057 }, { "epoch": 0.8994463120427131, "step": 9097, "train/sim_loss": 0.0625 }, { "epoch": 0.8994463120427131, "step": 9097, "train/total_loss": 0.16228237748146057 }, { "entropy": 8.586393356323242, "epoch": 0.8995451848922286, "mean_token_accuracy": 0.787089467048645, "num_tokens": 26553991.0, "step": 9098, "train/ce_loss": 0.5694405436515808 }, { "epoch": 0.8995451848922286, "step": 9098, "train/sim_loss": 0.07421875 }, { "epoch": 0.8995451848922286, "step": 9098, "train/total_loss": 0.13116280734539032 }, { "entropy": 9.860730171203613, "epoch": 0.8996440577417442, "mean_token_accuracy": 0.8520709872245789, "num_tokens": 26558591.0, "step": 9099, "train/ce_loss": 1.3599603789771209e-06 }, { "epoch": 0.8996440577417442, "step": 9099, "train/sim_loss": 0.04296875 }, { "epoch": 0.8996440577417442, "step": 9099, "train/total_loss": 0.04296888783574104 }, { "epoch": 0.8997429305912596, "grad_norm": 0.9253165125846863, "learning_rate": 7.752806210750137e-06, "loss": 0.1263, "step": 9100 }, { "entropy": 8.981040000915527, "epoch": 0.8997429305912596, "mean_token_accuracy": 0.7735849022865295, "num_tokens": 26564011.0, "step": 9100, "train/ce_loss": 0.9932456016540527 }, { "epoch": 0.8997429305912596, "step": 9100, "train/sim_loss": 0.078125 }, { "epoch": 0.8997429305912596, "step": 9100, "train/total_loss": 0.1774495542049408 }, { "entropy": 8.719240188598633, "epoch": 0.8998418034407751, "mean_token_accuracy": 0.7344497442245483, "num_tokens": 26569279.0, "step": 9101, "train/ce_loss": 0.8734342455863953 }, { "epoch": 0.8998418034407751, "step": 9101, "train/sim_loss": 0.05859375 }, { "epoch": 0.8998418034407751, "step": 9101, "train/total_loss": 0.14593717455863953 }, { "entropy": 9.323492050170898, "epoch": 0.8999406762902907, "mean_token_accuracy": 0.7406716346740723, "num_tokens": 26574252.0, "step": 9102, "train/ce_loss": 1.1112210750579834 }, { "epoch": 0.8999406762902907, "step": 9102, "train/sim_loss": 0.11328125 }, { "epoch": 0.8999406762902907, "step": 9102, "train/total_loss": 0.22440335154533386 }, { "entropy": 8.75655746459961, "epoch": 0.9000395491398062, "mean_token_accuracy": 0.7513020634651184, "num_tokens": 26579511.0, "step": 9103, "train/ce_loss": 0.8895690441131592 }, { "epoch": 0.9000395491398062, "step": 9103, "train/sim_loss": 0.06640625 }, { "epoch": 0.9000395491398062, "step": 9103, "train/total_loss": 0.15536315739154816 }, { "entropy": 8.985508918762207, "epoch": 0.9001384219893217, "mean_token_accuracy": 0.780635416507721, "num_tokens": 26584613.0, "step": 9104, "train/ce_loss": 5.280485311232042e-07 }, { "epoch": 0.9001384219893217, "step": 9104, "train/sim_loss": 0.05859375 }, { "epoch": 0.9001384219893217, "step": 9104, "train/total_loss": 0.05859380215406418 }, { "entropy": 8.611551284790039, "epoch": 0.9002372948388373, "mean_token_accuracy": 0.7111801505088806, "num_tokens": 26590051.0, "step": 9105, "train/ce_loss": 1.0287556648254395 }, { "epoch": 0.9002372948388373, "step": 9105, "train/sim_loss": 0.0625 }, { "epoch": 0.9002372948388373, "step": 9105, "train/total_loss": 0.16537556052207947 }, { "entropy": 8.797918319702148, "epoch": 0.9003361676883528, "mean_token_accuracy": 0.7944133877754211, "num_tokens": 26595407.0, "step": 9106, "train/ce_loss": 0.426572322845459 }, { "epoch": 0.9003361676883528, "step": 9106, "train/sim_loss": 0.0234375 }, { "epoch": 0.9003361676883528, "step": 9106, "train/total_loss": 0.06609473377466202 }, { "entropy": 8.250228881835938, "epoch": 0.9004350405378683, "mean_token_accuracy": 0.7409178018569946, "num_tokens": 26600923.0, "step": 9107, "train/ce_loss": 1.0799992084503174 }, { "epoch": 0.9004350405378683, "step": 9107, "train/sim_loss": 0.05078125 }, { "epoch": 0.9004350405378683, "step": 9107, "train/total_loss": 0.15878117084503174 }, { "entropy": 9.036908149719238, "epoch": 0.9005339133873839, "mean_token_accuracy": 0.7595541477203369, "num_tokens": 26606050.0, "step": 9108, "train/ce_loss": 0.5973562598228455 }, { "epoch": 0.9005339133873839, "step": 9108, "train/sim_loss": 0.06640625 }, { "epoch": 0.9005339133873839, "step": 9108, "train/total_loss": 0.12614187598228455 }, { "entropy": 9.420276641845703, "epoch": 0.9006327862368994, "mean_token_accuracy": 0.800000011920929, "num_tokens": 26610906.0, "step": 9109, "train/ce_loss": 5.389318857851322e-07 }, { "epoch": 0.9006327862368994, "step": 9109, "train/sim_loss": 0.0390625 }, { "epoch": 0.9006327862368994, "step": 9109, "train/total_loss": 0.03906255215406418 }, { "entropy": 8.7485933303833, "epoch": 0.9007316590864148, "mean_token_accuracy": 0.7695035338401794, "num_tokens": 26615909.0, "step": 9110, "train/ce_loss": 6.372333245963091e-06 }, { "epoch": 0.9007316590864148, "step": 9110, "train/sim_loss": 0.03125 }, { "epoch": 0.9007316590864148, "step": 9110, "train/total_loss": 0.03125063702464104 }, { "entropy": 9.318817138671875, "epoch": 0.9008305319359304, "mean_token_accuracy": 0.748603343963623, "num_tokens": 26620884.0, "step": 9111, "train/ce_loss": 5.345463023331831e-07 }, { "epoch": 0.9008305319359304, "step": 9111, "train/sim_loss": 0.0390625 }, { "epoch": 0.9008305319359304, "step": 9111, "train/total_loss": 0.03906255215406418 }, { "entropy": 8.58206558227539, "epoch": 0.9009294047854459, "mean_token_accuracy": 0.7491289377212524, "num_tokens": 26626274.0, "step": 9112, "train/ce_loss": 0.705833911895752 }, { "epoch": 0.9009294047854459, "step": 9112, "train/sim_loss": 0.046875 }, { "epoch": 0.9009294047854459, "step": 9112, "train/total_loss": 0.11745839565992355 }, { "entropy": 8.656524658203125, "epoch": 0.9010282776349614, "mean_token_accuracy": 0.702570378780365, "num_tokens": 26631603.0, "step": 9113, "train/ce_loss": 1.5656601190567017 }, { "epoch": 0.9010282776349614, "step": 9113, "train/sim_loss": 0.0234375 }, { "epoch": 0.9010282776349614, "step": 9113, "train/total_loss": 0.18000350892543793 }, { "entropy": 8.704153060913086, "epoch": 0.901127150484477, "mean_token_accuracy": 0.7424789667129517, "num_tokens": 26636942.0, "step": 9114, "train/ce_loss": 0.9390100836753845 }, { "epoch": 0.901127150484477, "step": 9114, "train/sim_loss": 0.05078125 }, { "epoch": 0.901127150484477, "step": 9114, "train/total_loss": 0.14468225836753845 }, { "entropy": 8.923505783081055, "epoch": 0.9012260233339925, "mean_token_accuracy": 0.7388362884521484, "num_tokens": 26642169.0, "step": 9115, "train/ce_loss": 0.6768019795417786 }, { "epoch": 0.9012260233339925, "step": 9115, "train/sim_loss": 0.0625 }, { "epoch": 0.9012260233339925, "step": 9115, "train/total_loss": 0.1301802098751068 }, { "entropy": 8.667463302612305, "epoch": 0.901324896183508, "mean_token_accuracy": 0.7836593985557556, "num_tokens": 26647479.0, "step": 9116, "train/ce_loss": 0.4220693111419678 }, { "epoch": 0.901324896183508, "step": 9116, "train/sim_loss": 0.05859375 }, { "epoch": 0.901324896183508, "step": 9116, "train/total_loss": 0.10080067813396454 }, { "entropy": 8.569649696350098, "epoch": 0.9014237690330236, "mean_token_accuracy": 0.7390350699424744, "num_tokens": 26652858.0, "step": 9117, "train/ce_loss": 0.9989830851554871 }, { "epoch": 0.9014237690330236, "step": 9117, "train/sim_loss": 0.0546875 }, { "epoch": 0.9014237690330236, "step": 9117, "train/total_loss": 0.1545858085155487 }, { "entropy": 9.056396484375, "epoch": 0.901522641882539, "mean_token_accuracy": 0.7503759264945984, "num_tokens": 26657918.0, "step": 9118, "train/ce_loss": 0.9660353660583496 }, { "epoch": 0.901522641882539, "step": 9118, "train/sim_loss": 0.03125 }, { "epoch": 0.901522641882539, "step": 9118, "train/total_loss": 0.12785354256629944 }, { "entropy": 9.318904876708984, "epoch": 0.9016215147320545, "mean_token_accuracy": 0.7260536551475525, "num_tokens": 26662878.0, "step": 9119, "train/ce_loss": 1.0541210174560547 }, { "epoch": 0.9016215147320545, "step": 9119, "train/sim_loss": 0.03125 }, { "epoch": 0.9016215147320545, "step": 9119, "train/total_loss": 0.136662095785141 }, { "epoch": 0.9017203875815701, "grad_norm": 0.7299725413322449, "learning_rate": 7.747861345992189e-06, "loss": 0.1259, "step": 9120 }, { "entropy": 8.333454132080078, "epoch": 0.9017203875815701, "mean_token_accuracy": 0.7875458002090454, "num_tokens": 26668243.0, "step": 9120, "train/ce_loss": 0.542241632938385 }, { "epoch": 0.9017203875815701, "step": 9120, "train/sim_loss": 0.0546875 }, { "epoch": 0.9017203875815701, "step": 9120, "train/total_loss": 0.1089116632938385 }, { "entropy": 8.740531921386719, "epoch": 0.9018192604310856, "mean_token_accuracy": 0.7414966225624084, "num_tokens": 26673584.0, "step": 9121, "train/ce_loss": 0.5661129951477051 }, { "epoch": 0.9018192604310856, "step": 9121, "train/sim_loss": 0.06640625 }, { "epoch": 0.9018192604310856, "step": 9121, "train/total_loss": 0.12301754951477051 }, { "entropy": 8.611451148986816, "epoch": 0.9019181332806011, "mean_token_accuracy": 0.7427912354469299, "num_tokens": 26678920.0, "step": 9122, "train/ce_loss": 1.0637965202331543 }, { "epoch": 0.9019181332806011, "step": 9122, "train/sim_loss": 0.03515625 }, { "epoch": 0.9019181332806011, "step": 9122, "train/total_loss": 0.1415359079837799 }, { "entropy": 9.500768661499023, "epoch": 0.9020170061301167, "mean_token_accuracy": 0.8245614171028137, "num_tokens": 26683884.0, "step": 9123, "train/ce_loss": 8.357000638170575e-07 }, { "epoch": 0.9020170061301167, "step": 9123, "train/sim_loss": 0.05078125 }, { "epoch": 0.9020170061301167, "step": 9123, "train/total_loss": 0.050781331956386566 }, { "entropy": 9.072831153869629, "epoch": 0.9021158789796322, "mean_token_accuracy": 0.7768240571022034, "num_tokens": 26689055.0, "step": 9124, "train/ce_loss": 0.9636762142181396 }, { "epoch": 0.9021158789796322, "step": 9124, "train/sim_loss": 0.03125 }, { "epoch": 0.9021158789796322, "step": 9124, "train/total_loss": 0.12761762738227844 }, { "entropy": 8.837091445922852, "epoch": 0.9022147518291477, "mean_token_accuracy": 0.7314702272415161, "num_tokens": 26694300.0, "step": 9125, "train/ce_loss": 0.9927276372909546 }, { "epoch": 0.9022147518291477, "step": 9125, "train/sim_loss": 0.0703125 }, { "epoch": 0.9022147518291477, "step": 9125, "train/total_loss": 0.16958525776863098 }, { "entropy": 9.130998611450195, "epoch": 0.9023136246786633, "mean_token_accuracy": 0.7519999742507935, "num_tokens": 26699487.0, "step": 9126, "train/ce_loss": 1.4159026145935059 }, { "epoch": 0.9023136246786633, "step": 9126, "train/sim_loss": 0.0546875 }, { "epoch": 0.9023136246786633, "step": 9126, "train/total_loss": 0.19627776741981506 }, { "entropy": 8.802294731140137, "epoch": 0.9024124975281788, "mean_token_accuracy": 0.7866848111152649, "num_tokens": 26704706.0, "step": 9127, "train/ce_loss": 7.055182322801556e-07 }, { "epoch": 0.9024124975281788, "step": 9127, "train/sim_loss": 0.0234375 }, { "epoch": 0.9024124975281788, "step": 9127, "train/total_loss": 0.02343757078051567 }, { "entropy": 9.004886627197266, "epoch": 0.9025113703776942, "mean_token_accuracy": 0.7532467246055603, "num_tokens": 26709952.0, "step": 9128, "train/ce_loss": 0.871261715888977 }, { "epoch": 0.9025113703776942, "step": 9128, "train/sim_loss": 0.01953125 }, { "epoch": 0.9025113703776942, "step": 9128, "train/total_loss": 0.10665742307901382 }, { "entropy": 8.931509017944336, "epoch": 0.9026102432272098, "mean_token_accuracy": 0.7090908885002136, "num_tokens": 26715301.0, "step": 9129, "train/ce_loss": 0.673997700214386 }, { "epoch": 0.9026102432272098, "step": 9129, "train/sim_loss": 0.06640625 }, { "epoch": 0.9026102432272098, "step": 9129, "train/total_loss": 0.1338060200214386 }, { "entropy": 8.808271408081055, "epoch": 0.9027091160767253, "mean_token_accuracy": 0.8070422410964966, "num_tokens": 26720477.0, "step": 9130, "train/ce_loss": 0.6568292379379272 }, { "epoch": 0.9027091160767253, "step": 9130, "train/sim_loss": 0.0625 }, { "epoch": 0.9027091160767253, "step": 9130, "train/total_loss": 0.12818291783332825 }, { "entropy": 8.710620880126953, "epoch": 0.9028079889262408, "mean_token_accuracy": 0.7081589698791504, "num_tokens": 26725899.0, "step": 9131, "train/ce_loss": 1.0490682125091553 }, { "epoch": 0.9028079889262408, "step": 9131, "train/sim_loss": 0.0703125 }, { "epoch": 0.9028079889262408, "step": 9131, "train/total_loss": 0.17521932721138 }, { "entropy": 8.956960678100586, "epoch": 0.9029068617757564, "mean_token_accuracy": 0.7303797602653503, "num_tokens": 26731228.0, "step": 9132, "train/ce_loss": 1.0235652923583984 }, { "epoch": 0.9029068617757564, "step": 9132, "train/sim_loss": 0.1171875 }, { "epoch": 0.9029068617757564, "step": 9132, "train/total_loss": 0.21954402327537537 }, { "entropy": 9.084754943847656, "epoch": 0.9030057346252719, "mean_token_accuracy": 0.7411003112792969, "num_tokens": 26736233.0, "step": 9133, "train/ce_loss": 0.7100213766098022 }, { "epoch": 0.9030057346252719, "step": 9133, "train/sim_loss": 0.0234375 }, { "epoch": 0.9030057346252719, "step": 9133, "train/total_loss": 0.09443964064121246 }, { "entropy": 8.940507888793945, "epoch": 0.9031046074747874, "mean_token_accuracy": 0.7477341294288635, "num_tokens": 26741328.0, "step": 9134, "train/ce_loss": 0.31841954588890076 }, { "epoch": 0.9031046074747874, "step": 9134, "train/sim_loss": 0.0546875 }, { "epoch": 0.9031046074747874, "step": 9134, "train/total_loss": 0.0865294560790062 }, { "entropy": 8.825894355773926, "epoch": 0.903203480324303, "mean_token_accuracy": 0.6662983298301697, "num_tokens": 26746719.0, "step": 9135, "train/ce_loss": 1.049531102180481 }, { "epoch": 0.903203480324303, "step": 9135, "train/sim_loss": 0.0546875 }, { "epoch": 0.903203480324303, "step": 9135, "train/total_loss": 0.1596406102180481 }, { "entropy": 8.965797424316406, "epoch": 0.9033023531738185, "mean_token_accuracy": 0.7156084775924683, "num_tokens": 26751905.0, "step": 9136, "train/ce_loss": 1.2573646306991577 }, { "epoch": 0.9033023531738185, "step": 9136, "train/sim_loss": 0.05078125 }, { "epoch": 0.9033023531738185, "step": 9136, "train/total_loss": 0.17651771008968353 }, { "entropy": 8.739272117614746, "epoch": 0.903401226023334, "mean_token_accuracy": 0.772357702255249, "num_tokens": 26757181.0, "step": 9137, "train/ce_loss": 0.5272718071937561 }, { "epoch": 0.903401226023334, "step": 9137, "train/sim_loss": 0.04296875 }, { "epoch": 0.903401226023334, "step": 9137, "train/total_loss": 0.09569592773914337 }, { "entropy": 9.129547119140625, "epoch": 0.9035000988728495, "mean_token_accuracy": 0.6932408809661865, "num_tokens": 26762223.0, "step": 9138, "train/ce_loss": 1.9463756084442139 }, { "epoch": 0.9035000988728495, "step": 9138, "train/sim_loss": 0.05859375 }, { "epoch": 0.9035000988728495, "step": 9138, "train/total_loss": 0.25323131680488586 }, { "entropy": 8.961040496826172, "epoch": 0.903598971722365, "mean_token_accuracy": 0.753947377204895, "num_tokens": 26767426.0, "step": 9139, "train/ce_loss": 0.9590904712677002 }, { "epoch": 0.903598971722365, "step": 9139, "train/sim_loss": 0.1015625 }, { "epoch": 0.903598971722365, "step": 9139, "train/total_loss": 0.19747155904769897 }, { "epoch": 0.9036978445718805, "grad_norm": 0.5847175121307373, "learning_rate": 7.742916481234238e-06, "loss": 0.1273, "step": 9140 }, { "entropy": 8.927392959594727, "epoch": 0.9036978445718805, "mean_token_accuracy": 0.7094972133636475, "num_tokens": 26772608.0, "step": 9140, "train/ce_loss": 3.629513116720773e-07 }, { "epoch": 0.9036978445718805, "step": 9140, "train/sim_loss": 0.015625 }, { "epoch": 0.9036978445718805, "step": 9140, "train/total_loss": 0.015625035390257835 }, { "entropy": 8.195661544799805, "epoch": 0.9037967174213961, "mean_token_accuracy": 0.7666666507720947, "num_tokens": 26778244.0, "step": 9141, "train/ce_loss": 0.8570559024810791 }, { "epoch": 0.9037967174213961, "step": 9141, "train/sim_loss": 0.0234375 }, { "epoch": 0.9037967174213961, "step": 9141, "train/total_loss": 0.10914309322834015 }, { "entropy": 8.750200271606445, "epoch": 0.9038955902709116, "mean_token_accuracy": 0.7545344829559326, "num_tokens": 26783477.0, "step": 9142, "train/ce_loss": 0.7238687872886658 }, { "epoch": 0.9038955902709116, "step": 9142, "train/sim_loss": 0.0546875 }, { "epoch": 0.9038955902709116, "step": 9142, "train/total_loss": 0.12707439064979553 }, { "entropy": 9.928706169128418, "epoch": 0.9039944631204271, "mean_token_accuracy": 0.841269850730896, "num_tokens": 26788083.0, "step": 9143, "train/ce_loss": 1.1201769893887104e-06 }, { "epoch": 0.9039944631204271, "step": 9143, "train/sim_loss": 0.015625 }, { "epoch": 0.9039944631204271, "step": 9143, "train/total_loss": 0.015625111758708954 }, { "entropy": 8.867383003234863, "epoch": 0.9040933359699427, "mean_token_accuracy": 0.6898638606071472, "num_tokens": 26793199.0, "step": 9144, "train/ce_loss": 1.7946258594747633e-06 }, { "epoch": 0.9040933359699427, "step": 9144, "train/sim_loss": 0.0390625 }, { "epoch": 0.9040933359699427, "step": 9144, "train/total_loss": 0.039062678813934326 }, { "entropy": 8.619510650634766, "epoch": 0.9041922088194582, "mean_token_accuracy": 0.803983211517334, "num_tokens": 26798589.0, "step": 9145, "train/ce_loss": 0.8176717758178711 }, { "epoch": 0.9041922088194582, "step": 9145, "train/sim_loss": 0.04296875 }, { "epoch": 0.9041922088194582, "step": 9145, "train/total_loss": 0.12473592907190323 }, { "entropy": 9.34872817993164, "epoch": 0.9042910816689737, "mean_token_accuracy": 0.7686274647712708, "num_tokens": 26803531.0, "step": 9146, "train/ce_loss": 9.471125963500526e-07 }, { "epoch": 0.9042910816689737, "step": 9146, "train/sim_loss": 0.03515625 }, { "epoch": 0.9042910816689737, "step": 9146, "train/total_loss": 0.03515634313225746 }, { "entropy": 8.584484100341797, "epoch": 0.9043899545184892, "mean_token_accuracy": 0.7789968848228455, "num_tokens": 26808715.0, "step": 9147, "train/ce_loss": 1.163509726524353 }, { "epoch": 0.9043899545184892, "step": 9147, "train/sim_loss": 0.0703125 }, { "epoch": 0.9043899545184892, "step": 9147, "train/total_loss": 0.18666347861289978 }, { "entropy": 9.282931327819824, "epoch": 0.9044888273680047, "mean_token_accuracy": 0.6739864945411682, "num_tokens": 26813759.0, "step": 9148, "train/ce_loss": 1.4836020469665527 }, { "epoch": 0.9044888273680047, "step": 9148, "train/sim_loss": 0.0859375 }, { "epoch": 0.9044888273680047, "step": 9148, "train/total_loss": 0.2342977076768875 }, { "entropy": 8.913932800292969, "epoch": 0.9045877002175202, "mean_token_accuracy": 0.747035562992096, "num_tokens": 26819024.0, "step": 9149, "train/ce_loss": 0.3981608748435974 }, { "epoch": 0.9045877002175202, "step": 9149, "train/sim_loss": 0.09375 }, { "epoch": 0.9045877002175202, "step": 9149, "train/total_loss": 0.13356608152389526 }, { "entropy": 8.687820434570312, "epoch": 0.9046865730670358, "mean_token_accuracy": 0.7634803652763367, "num_tokens": 26824330.0, "step": 9150, "train/ce_loss": 0.4469693601131439 }, { "epoch": 0.9046865730670358, "step": 9150, "train/sim_loss": 0.03515625 }, { "epoch": 0.9046865730670358, "step": 9150, "train/total_loss": 0.07985319197177887 }, { "entropy": 8.851806640625, "epoch": 0.9047854459165513, "mean_token_accuracy": 0.7380627393722534, "num_tokens": 26829470.0, "step": 9151, "train/ce_loss": 1.8865202378037793e-07 }, { "epoch": 0.9047854459165513, "step": 9151, "train/sim_loss": 0.015625 }, { "epoch": 0.9047854459165513, "step": 9151, "train/total_loss": 0.015625018626451492 }, { "entropy": 8.814598083496094, "epoch": 0.9048843187660668, "mean_token_accuracy": 0.7309321761131287, "num_tokens": 26835016.0, "step": 9152, "train/ce_loss": 0.8167648315429688 }, { "epoch": 0.9048843187660668, "step": 9152, "train/sim_loss": 0.03125 }, { "epoch": 0.9048843187660668, "step": 9152, "train/total_loss": 0.11292648315429688 }, { "entropy": 9.176883697509766, "epoch": 0.9049831916155824, "mean_token_accuracy": 0.7152209281921387, "num_tokens": 26840035.0, "step": 9153, "train/ce_loss": 0.8641502261161804 }, { "epoch": 0.9049831916155824, "step": 9153, "train/sim_loss": 0.0390625 }, { "epoch": 0.9049831916155824, "step": 9153, "train/total_loss": 0.12547752261161804 }, { "entropy": 8.835829734802246, "epoch": 0.9050820644650979, "mean_token_accuracy": 0.7415881752967834, "num_tokens": 26845249.0, "step": 9154, "train/ce_loss": 0.39165768027305603 }, { "epoch": 0.9050820644650979, "step": 9154, "train/sim_loss": 0.046875 }, { "epoch": 0.9050820644650979, "step": 9154, "train/total_loss": 0.08604076504707336 }, { "entropy": 8.979759216308594, "epoch": 0.9051809373146134, "mean_token_accuracy": 0.7575757503509521, "num_tokens": 26850299.0, "step": 9155, "train/ce_loss": 1.1635652780532837 }, { "epoch": 0.9051809373146134, "step": 9155, "train/sim_loss": 0.078125 }, { "epoch": 0.9051809373146134, "step": 9155, "train/total_loss": 0.1944815218448639 }, { "entropy": 9.043808937072754, "epoch": 0.905279810164129, "mean_token_accuracy": 0.7861635088920593, "num_tokens": 26855454.0, "step": 9156, "train/ce_loss": 0.6003595590591431 }, { "epoch": 0.905279810164129, "step": 9156, "train/sim_loss": 0.0390625 }, { "epoch": 0.905279810164129, "step": 9156, "train/total_loss": 0.09909845888614655 }, { "entropy": 8.320194244384766, "epoch": 0.9053786830136444, "mean_token_accuracy": 0.7412678003311157, "num_tokens": 26860710.0, "step": 9157, "train/ce_loss": 0.7466564178466797 }, { "epoch": 0.9053786830136444, "step": 9157, "train/sim_loss": 0.06640625 }, { "epoch": 0.9053786830136444, "step": 9157, "train/total_loss": 0.1410718858242035 }, { "entropy": 8.66702651977539, "epoch": 0.9054775558631599, "mean_token_accuracy": 0.76171875, "num_tokens": 26865975.0, "step": 9158, "train/ce_loss": 0.6458284854888916 }, { "epoch": 0.9054775558631599, "step": 9158, "train/sim_loss": 0.0234375 }, { "epoch": 0.9054775558631599, "step": 9158, "train/total_loss": 0.08802034705877304 }, { "entropy": 8.560710906982422, "epoch": 0.9055764287126755, "mean_token_accuracy": 0.7720670104026794, "num_tokens": 26871375.0, "step": 9159, "train/ce_loss": 0.32802441716194153 }, { "epoch": 0.9055764287126755, "step": 9159, "train/sim_loss": 0.03125 }, { "epoch": 0.9055764287126755, "step": 9159, "train/total_loss": 0.06405244767665863 }, { "epoch": 0.905675301562191, "grad_norm": 0.6260942816734314, "learning_rate": 7.73797161647629e-06, "loss": 0.1289, "step": 9160 }, { "entropy": 8.676794052124023, "epoch": 0.905675301562191, "mean_token_accuracy": 0.7962154150009155, "num_tokens": 26876558.0, "step": 9160, "train/ce_loss": 0.5289694666862488 }, { "epoch": 0.905675301562191, "step": 9160, "train/sim_loss": 0.06640625 }, { "epoch": 0.905675301562191, "step": 9160, "train/total_loss": 0.11930319666862488 }, { "entropy": 9.213626861572266, "epoch": 0.9057741744117066, "mean_token_accuracy": 0.742671012878418, "num_tokens": 26881760.0, "step": 9161, "train/ce_loss": 1.0828567743301392 }, { "epoch": 0.9057741744117066, "step": 9161, "train/sim_loss": 0.0546875 }, { "epoch": 0.9057741744117066, "step": 9161, "train/total_loss": 0.16297318041324615 }, { "entropy": 8.690057754516602, "epoch": 0.9058730472612221, "mean_token_accuracy": 0.7410604357719421, "num_tokens": 26887077.0, "step": 9162, "train/ce_loss": 1.0695013999938965 }, { "epoch": 0.9058730472612221, "step": 9162, "train/sim_loss": 0.0703125 }, { "epoch": 0.9058730472612221, "step": 9162, "train/total_loss": 0.17726263403892517 }, { "entropy": 8.557741165161133, "epoch": 0.9059719201107376, "mean_token_accuracy": 0.7709497213363647, "num_tokens": 26892280.0, "step": 9163, "train/ce_loss": 0.44319382309913635 }, { "epoch": 0.9059719201107376, "step": 9163, "train/sim_loss": 0.046875 }, { "epoch": 0.9059719201107376, "step": 9163, "train/total_loss": 0.09119438380002975 }, { "entropy": 8.492541313171387, "epoch": 0.9060707929602532, "mean_token_accuracy": 0.6945652365684509, "num_tokens": 26897639.0, "step": 9164, "train/ce_loss": 1.1665364503860474 }, { "epoch": 0.9060707929602532, "step": 9164, "train/sim_loss": 0.1171875 }, { "epoch": 0.9060707929602532, "step": 9164, "train/total_loss": 0.2338411509990692 }, { "entropy": 8.92304801940918, "epoch": 0.9061696658097687, "mean_token_accuracy": 0.7109588980674744, "num_tokens": 26902832.0, "step": 9165, "train/ce_loss": 0.8163487911224365 }, { "epoch": 0.9061696658097687, "step": 9165, "train/sim_loss": 0.03515625 }, { "epoch": 0.9061696658097687, "step": 9165, "train/total_loss": 0.11679112911224365 }, { "entropy": 9.129404067993164, "epoch": 0.9062685386592841, "mean_token_accuracy": 0.7043918967247009, "num_tokens": 26907859.0, "step": 9166, "train/ce_loss": 0.8976324796676636 }, { "epoch": 0.9062685386592841, "step": 9166, "train/sim_loss": 0.11328125 }, { "epoch": 0.9062685386592841, "step": 9166, "train/total_loss": 0.20304450392723083 }, { "entropy": 9.423097610473633, "epoch": 0.9063674115087997, "mean_token_accuracy": 0.8117870688438416, "num_tokens": 26912793.0, "step": 9167, "train/ce_loss": 2.8646294936152117e-07 }, { "epoch": 0.9063674115087997, "step": 9167, "train/sim_loss": 0.015625 }, { "epoch": 0.9063674115087997, "step": 9167, "train/total_loss": 0.01562502793967724 }, { "entropy": 8.839334487915039, "epoch": 0.9064662843583152, "mean_token_accuracy": 0.7293844223022461, "num_tokens": 26918139.0, "step": 9168, "train/ce_loss": 1.317361831665039 }, { "epoch": 0.9064662843583152, "step": 9168, "train/sim_loss": 0.12890625 }, { "epoch": 0.9064662843583152, "step": 9168, "train/total_loss": 0.2606424391269684 }, { "entropy": 8.991203308105469, "epoch": 0.9065651572078307, "mean_token_accuracy": 0.7046070694923401, "num_tokens": 26923334.0, "step": 9169, "train/ce_loss": 0.9669110774993896 }, { "epoch": 0.9065651572078307, "step": 9169, "train/sim_loss": 0.05859375 }, { "epoch": 0.9065651572078307, "step": 9169, "train/total_loss": 0.1552848517894745 }, { "entropy": 8.864419937133789, "epoch": 0.9066640300573463, "mean_token_accuracy": 0.8049853444099426, "num_tokens": 26928485.0, "step": 9170, "train/ce_loss": 1.5716816506028408e-06 }, { "epoch": 0.9066640300573463, "step": 9170, "train/sim_loss": 0.0390625 }, { "epoch": 0.9066640300573463, "step": 9170, "train/total_loss": 0.039062656462192535 }, { "entropy": 8.876815795898438, "epoch": 0.9067629029068618, "mean_token_accuracy": 0.732467532157898, "num_tokens": 26933708.0, "step": 9171, "train/ce_loss": 7.270077730936464e-07 }, { "epoch": 0.9067629029068618, "step": 9171, "train/sim_loss": 0.0625 }, { "epoch": 0.9067629029068618, "step": 9171, "train/total_loss": 0.06250007450580597 }, { "entropy": 8.742498397827148, "epoch": 0.9068617757563773, "mean_token_accuracy": 0.7655718922615051, "num_tokens": 26939038.0, "step": 9172, "train/ce_loss": 0.66084885597229 }, { "epoch": 0.9068617757563773, "step": 9172, "train/sim_loss": 0.0703125 }, { "epoch": 0.9068617757563773, "step": 9172, "train/total_loss": 0.13639739155769348 }, { "entropy": 8.657337188720703, "epoch": 0.9069606486058929, "mean_token_accuracy": 0.7401197552680969, "num_tokens": 26944377.0, "step": 9173, "train/ce_loss": 0.8242636322975159 }, { "epoch": 0.9069606486058929, "step": 9173, "train/sim_loss": 0.03125 }, { "epoch": 0.9069606486058929, "step": 9173, "train/total_loss": 0.11367636173963547 }, { "entropy": 8.706947326660156, "epoch": 0.9070595214554084, "mean_token_accuracy": 0.7205039858818054, "num_tokens": 26949694.0, "step": 9174, "train/ce_loss": 1.2270557880401611 }, { "epoch": 0.9070595214554084, "step": 9174, "train/sim_loss": 0.06640625 }, { "epoch": 0.9070595214554084, "step": 9174, "train/total_loss": 0.1891118288040161 }, { "entropy": 8.381051063537598, "epoch": 0.9071583943049238, "mean_token_accuracy": 0.7837281227111816, "num_tokens": 26955155.0, "step": 9175, "train/ce_loss": 0.6724500060081482 }, { "epoch": 0.9071583943049238, "step": 9175, "train/sim_loss": 0.046875 }, { "epoch": 0.9071583943049238, "step": 9175, "train/total_loss": 0.1141199991106987 }, { "entropy": 8.379070281982422, "epoch": 0.9072572671544394, "mean_token_accuracy": 0.7655755281448364, "num_tokens": 26960616.0, "step": 9176, "train/ce_loss": 0.6805732250213623 }, { "epoch": 0.9072572671544394, "step": 9176, "train/sim_loss": 0.01953125 }, { "epoch": 0.9072572671544394, "step": 9176, "train/total_loss": 0.08758857101202011 }, { "entropy": 9.151988983154297, "epoch": 0.9073561400039549, "mean_token_accuracy": 0.6783999800682068, "num_tokens": 26965728.0, "step": 9177, "train/ce_loss": 1.9155022528138943e-06 }, { "epoch": 0.9073561400039549, "step": 9177, "train/sim_loss": 0.0546875 }, { "epoch": 0.9073561400039549, "step": 9177, "train/total_loss": 0.05468768998980522 }, { "entropy": 9.601924896240234, "epoch": 0.9074550128534704, "mean_token_accuracy": 0.7032967209815979, "num_tokens": 26970773.0, "step": 9178, "train/ce_loss": 1.1842122376037878e-06 }, { "epoch": 0.9074550128534704, "step": 9178, "train/sim_loss": 0.0625 }, { "epoch": 0.9074550128534704, "step": 9178, "train/total_loss": 0.06250011920928955 }, { "entropy": 8.63504409790039, "epoch": 0.907553885702986, "mean_token_accuracy": 0.7269076108932495, "num_tokens": 26975983.0, "step": 9179, "train/ce_loss": 1.1727397441864014 }, { "epoch": 0.907553885702986, "step": 9179, "train/sim_loss": 0.04296875 }, { "epoch": 0.907553885702986, "step": 9179, "train/total_loss": 0.1602427363395691 }, { "epoch": 0.9076527585525015, "grad_norm": 0.6241506338119507, "learning_rate": 7.73302675171834e-06, "loss": 0.1372, "step": 9180 }, { "entropy": 8.744926452636719, "epoch": 0.9076527585525015, "mean_token_accuracy": 0.7477124333381653, "num_tokens": 26981181.0, "step": 9180, "train/ce_loss": 0.494517058134079 }, { "epoch": 0.9076527585525015, "step": 9180, "train/sim_loss": 0.05859375 }, { "epoch": 0.9076527585525015, "step": 9180, "train/total_loss": 0.10804545879364014 }, { "entropy": 8.447515487670898, "epoch": 0.907751631402017, "mean_token_accuracy": 0.7246073484420776, "num_tokens": 26986567.0, "step": 9181, "train/ce_loss": 0.8649625182151794 }, { "epoch": 0.907751631402017, "step": 9181, "train/sim_loss": 0.04296875 }, { "epoch": 0.907751631402017, "step": 9181, "train/total_loss": 0.1294650137424469 }, { "entropy": 8.460390090942383, "epoch": 0.9078505042515326, "mean_token_accuracy": 0.7459839582443237, "num_tokens": 26991997.0, "step": 9182, "train/ce_loss": 0.8686043620109558 }, { "epoch": 0.9078505042515326, "step": 9182, "train/sim_loss": 0.0703125 }, { "epoch": 0.9078505042515326, "step": 9182, "train/total_loss": 0.15717294812202454 }, { "entropy": 9.152366638183594, "epoch": 0.9079493771010481, "mean_token_accuracy": 0.7311828136444092, "num_tokens": 26997101.0, "step": 9183, "train/ce_loss": 7.624874456269026e-07 }, { "epoch": 0.9079493771010481, "step": 9183, "train/sim_loss": 0.046875 }, { "epoch": 0.9079493771010481, "step": 9183, "train/total_loss": 0.04687507450580597 }, { "entropy": 9.098976135253906, "epoch": 0.9080482499505635, "mean_token_accuracy": 0.7281553149223328, "num_tokens": 27002238.0, "step": 9184, "train/ce_loss": 1.229797124862671 }, { "epoch": 0.9080482499505635, "step": 9184, "train/sim_loss": 0.0390625 }, { "epoch": 0.9080482499505635, "step": 9184, "train/total_loss": 0.16204221546649933 }, { "entropy": 8.807500839233398, "epoch": 0.9081471228000791, "mean_token_accuracy": 0.7618438005447388, "num_tokens": 27007489.0, "step": 9185, "train/ce_loss": 2.1185859111483296e-07 }, { "epoch": 0.9081471228000791, "step": 9185, "train/sim_loss": 0.015625 }, { "epoch": 0.9081471228000791, "step": 9185, "train/total_loss": 0.01562502048909664 }, { "entropy": 8.601675033569336, "epoch": 0.9082459956495946, "mean_token_accuracy": 0.7455310225486755, "num_tokens": 27012864.0, "step": 9186, "train/ce_loss": 0.5795109272003174 }, { "epoch": 0.9082459956495946, "step": 9186, "train/sim_loss": 0.04296875 }, { "epoch": 0.9082459956495946, "step": 9186, "train/total_loss": 0.10091984272003174 }, { "entropy": 8.90341567993164, "epoch": 0.9083448684991101, "mean_token_accuracy": 0.7690447568893433, "num_tokens": 27018125.0, "step": 9187, "train/ce_loss": 0.8771971464157104 }, { "epoch": 0.9083448684991101, "step": 9187, "train/sim_loss": 0.1015625 }, { "epoch": 0.9083448684991101, "step": 9187, "train/total_loss": 0.18928220868110657 }, { "entropy": 8.82425594329834, "epoch": 0.9084437413486257, "mean_token_accuracy": 0.7973778247833252, "num_tokens": 27023453.0, "step": 9188, "train/ce_loss": 0.5918988585472107 }, { "epoch": 0.9084437413486257, "step": 9188, "train/sim_loss": 0.04296875 }, { "epoch": 0.9084437413486257, "step": 9188, "train/total_loss": 0.10215863585472107 }, { "entropy": 9.128678321838379, "epoch": 0.9085426141981412, "mean_token_accuracy": 0.7230769395828247, "num_tokens": 27028529.0, "step": 9189, "train/ce_loss": 0.9039449095726013 }, { "epoch": 0.9085426141981412, "step": 9189, "train/sim_loss": 0.0390625 }, { "epoch": 0.9085426141981412, "step": 9189, "train/total_loss": 0.1294569969177246 }, { "entropy": 8.939802169799805, "epoch": 0.9086414870476567, "mean_token_accuracy": 0.7655259966850281, "num_tokens": 27033766.0, "step": 9190, "train/ce_loss": 0.656508207321167 }, { "epoch": 0.9086414870476567, "step": 9190, "train/sim_loss": 0.03125 }, { "epoch": 0.9086414870476567, "step": 9190, "train/total_loss": 0.0969008207321167 }, { "entropy": 9.163475036621094, "epoch": 0.9087403598971723, "mean_token_accuracy": 0.7784810066223145, "num_tokens": 27038851.0, "step": 9191, "train/ce_loss": 3.538524424584466e-07 }, { "epoch": 0.9087403598971723, "step": 9191, "train/sim_loss": 0.015625 }, { "epoch": 0.9087403598971723, "step": 9191, "train/total_loss": 0.015625035390257835 }, { "entropy": 9.445109367370605, "epoch": 0.9088392327466878, "mean_token_accuracy": 0.7329192757606506, "num_tokens": 27043810.0, "step": 9192, "train/ce_loss": 1.2226264476776123 }, { "epoch": 0.9088392327466878, "step": 9192, "train/sim_loss": 0.09765625 }, { "epoch": 0.9088392327466878, "step": 9192, "train/total_loss": 0.21991890668869019 }, { "entropy": 9.054713249206543, "epoch": 0.9089381055962032, "mean_token_accuracy": 0.7955145239830017, "num_tokens": 27049054.0, "step": 9193, "train/ce_loss": 1.1977680921554565 }, { "epoch": 0.9089381055962032, "step": 9193, "train/sim_loss": 0.08984375 }, { "epoch": 0.9089381055962032, "step": 9193, "train/total_loss": 0.20962056517601013 }, { "entropy": 8.675012588500977, "epoch": 0.9090369784457188, "mean_token_accuracy": 0.7595212459564209, "num_tokens": 27054398.0, "step": 9194, "train/ce_loss": 1.0104807615280151 }, { "epoch": 0.9090369784457188, "step": 9194, "train/sim_loss": 0.05078125 }, { "epoch": 0.9090369784457188, "step": 9194, "train/total_loss": 0.151829332113266 }, { "entropy": 9.217578887939453, "epoch": 0.9091358512952343, "mean_token_accuracy": 0.7096296548843384, "num_tokens": 27059540.0, "step": 9195, "train/ce_loss": 1.2349307537078857 }, { "epoch": 0.9091358512952343, "step": 9195, "train/sim_loss": 0.03125 }, { "epoch": 0.9091358512952343, "step": 9195, "train/total_loss": 0.15474307537078857 }, { "entropy": 9.376506805419922, "epoch": 0.9092347241447498, "mean_token_accuracy": 0.7658802270889282, "num_tokens": 27064537.0, "step": 9196, "train/ce_loss": 1.2885411706520244e-06 }, { "epoch": 0.9092347241447498, "step": 9196, "train/sim_loss": 0.046875 }, { "epoch": 0.9092347241447498, "step": 9196, "train/total_loss": 0.046875130385160446 }, { "entropy": 8.676100730895996, "epoch": 0.9093335969942654, "mean_token_accuracy": 0.7641752362251282, "num_tokens": 27069753.0, "step": 9197, "train/ce_loss": 0.9406799674034119 }, { "epoch": 0.9093335969942654, "step": 9197, "train/sim_loss": 0.0390625 }, { "epoch": 0.9093335969942654, "step": 9197, "train/total_loss": 0.1331304907798767 }, { "entropy": 10.059473991394043, "epoch": 0.9094324698437809, "mean_token_accuracy": 0.7715736031532288, "num_tokens": 27074306.0, "step": 9198, "train/ce_loss": 1.067428115675284e-06 }, { "epoch": 0.9094324698437809, "step": 9198, "train/sim_loss": 0.015625 }, { "epoch": 0.9094324698437809, "step": 9198, "train/total_loss": 0.015625106170773506 }, { "entropy": 9.032780647277832, "epoch": 0.9095313426932964, "mean_token_accuracy": 0.7309486865997314, "num_tokens": 27079421.0, "step": 9199, "train/ce_loss": 0.7582253217697144 }, { "epoch": 0.9095313426932964, "step": 9199, "train/sim_loss": 0.05078125 }, { "epoch": 0.9095313426932964, "step": 9199, "train/total_loss": 0.12660378217697144 }, { "epoch": 0.909630215542812, "grad_norm": 0.6809642910957336, "learning_rate": 7.728081886960393e-06, "loss": 0.1245, "step": 9200 }, { "entropy": 9.335073471069336, "epoch": 0.909630215542812, "mean_token_accuracy": 0.7510121464729309, "num_tokens": 27084389.0, "step": 9200, "train/ce_loss": 1.4134788513183594 }, { "epoch": 0.909630215542812, "step": 9200, "train/sim_loss": 0.06640625 }, { "epoch": 0.909630215542812, "step": 9200, "train/total_loss": 0.20775413513183594 }, { "entropy": 9.247215270996094, "epoch": 0.9097290883923275, "mean_token_accuracy": 0.726396918296814, "num_tokens": 27089343.0, "step": 9201, "train/ce_loss": 1.3582441806793213 }, { "epoch": 0.9097290883923275, "step": 9201, "train/sim_loss": 0.05078125 }, { "epoch": 0.9097290883923275, "step": 9201, "train/total_loss": 0.18660567700862885 }, { "entropy": 9.34528923034668, "epoch": 0.909827961241843, "mean_token_accuracy": 0.787564754486084, "num_tokens": 27094157.0, "step": 9202, "train/ce_loss": 1.572838544845581 }, { "epoch": 0.909827961241843, "step": 9202, "train/sim_loss": 0.0546875 }, { "epoch": 0.909827961241843, "step": 9202, "train/total_loss": 0.21197135746479034 }, { "entropy": 9.446272850036621, "epoch": 0.9099268340913585, "mean_token_accuracy": 0.7732793688774109, "num_tokens": 27099110.0, "step": 9203, "train/ce_loss": 0.8522922992706299 }, { "epoch": 0.9099268340913585, "step": 9203, "train/sim_loss": 0.08203125 }, { "epoch": 0.9099268340913585, "step": 9203, "train/total_loss": 0.16726048290729523 }, { "entropy": 8.76350212097168, "epoch": 0.910025706940874, "mean_token_accuracy": 0.740618109703064, "num_tokens": 27104477.0, "step": 9204, "train/ce_loss": 1.0039551258087158 }, { "epoch": 0.910025706940874, "step": 9204, "train/sim_loss": 0.015625 }, { "epoch": 0.910025706940874, "step": 9204, "train/total_loss": 0.11602051556110382 }, { "entropy": 8.420951843261719, "epoch": 0.9101245797903895, "mean_token_accuracy": 0.784009575843811, "num_tokens": 27109800.0, "step": 9205, "train/ce_loss": 0.7582249045372009 }, { "epoch": 0.9101245797903895, "step": 9205, "train/sim_loss": 0.01953125 }, { "epoch": 0.9101245797903895, "step": 9205, "train/total_loss": 0.09535374492406845 }, { "entropy": 9.049461364746094, "epoch": 0.9102234526399051, "mean_token_accuracy": 0.7334410548210144, "num_tokens": 27114884.0, "step": 9206, "train/ce_loss": 0.9187730550765991 }, { "epoch": 0.9102234526399051, "step": 9206, "train/sim_loss": 0.0234375 }, { "epoch": 0.9102234526399051, "step": 9206, "train/total_loss": 0.11531480401754379 }, { "entropy": 8.63083267211914, "epoch": 0.9103223254894206, "mean_token_accuracy": 0.7093023061752319, "num_tokens": 27120259.0, "step": 9207, "train/ce_loss": 0.7709442377090454 }, { "epoch": 0.9103223254894206, "step": 9207, "train/sim_loss": 0.0546875 }, { "epoch": 0.9103223254894206, "step": 9207, "train/total_loss": 0.1317819356918335 }, { "entropy": 8.907642364501953, "epoch": 0.9104211983389361, "mean_token_accuracy": 0.7167182564735413, "num_tokens": 27125329.0, "step": 9208, "train/ce_loss": 1.4443556070327759 }, { "epoch": 0.9104211983389361, "step": 9208, "train/sim_loss": 0.0703125 }, { "epoch": 0.9104211983389361, "step": 9208, "train/total_loss": 0.2147480696439743 }, { "entropy": 8.556644439697266, "epoch": 0.9105200711884517, "mean_token_accuracy": 0.7161290049552917, "num_tokens": 27130716.0, "step": 9209, "train/ce_loss": 1.3037917613983154 }, { "epoch": 0.9105200711884517, "step": 9209, "train/sim_loss": 0.046875 }, { "epoch": 0.9105200711884517, "step": 9209, "train/total_loss": 0.17725418508052826 }, { "entropy": 9.725595474243164, "epoch": 0.9106189440379672, "mean_token_accuracy": 0.9027777910232544, "num_tokens": 27135372.0, "step": 9210, "train/ce_loss": 1.4040017504157731e-06 }, { "epoch": 0.9106189440379672, "step": 9210, "train/sim_loss": 0.0625 }, { "epoch": 0.9106189440379672, "step": 9210, "train/total_loss": 0.06250014156103134 }, { "entropy": 8.989730834960938, "epoch": 0.9107178168874827, "mean_token_accuracy": 0.7469879388809204, "num_tokens": 27140475.0, "step": 9211, "train/ce_loss": 1.525792121887207 }, { "epoch": 0.9107178168874827, "step": 9211, "train/sim_loss": 0.04296875 }, { "epoch": 0.9107178168874827, "step": 9211, "train/total_loss": 0.19554796814918518 }, { "entropy": 8.68646240234375, "epoch": 0.9108166897369983, "mean_token_accuracy": 0.732467532157898, "num_tokens": 27145687.0, "step": 9212, "train/ce_loss": 1.2112396955490112 }, { "epoch": 0.9108166897369983, "step": 9212, "train/sim_loss": 0.06640625 }, { "epoch": 0.9108166897369983, "step": 9212, "train/total_loss": 0.18753021955490112 }, { "entropy": 8.59811782836914, "epoch": 0.9109155625865137, "mean_token_accuracy": 0.7418655157089233, "num_tokens": 27151078.0, "step": 9213, "train/ce_loss": 0.6997737884521484 }, { "epoch": 0.9109155625865137, "step": 9213, "train/sim_loss": 0.0234375 }, { "epoch": 0.9109155625865137, "step": 9213, "train/total_loss": 0.09341488033533096 }, { "entropy": 8.52330207824707, "epoch": 0.9110144354360292, "mean_token_accuracy": 0.7364583611488342, "num_tokens": 27156531.0, "step": 9214, "train/ce_loss": 0.8881973028182983 }, { "epoch": 0.9110144354360292, "step": 9214, "train/sim_loss": 0.0625 }, { "epoch": 0.9110144354360292, "step": 9214, "train/total_loss": 0.1513197422027588 }, { "entropy": 9.369311332702637, "epoch": 0.9111133082855448, "mean_token_accuracy": 0.7292490005493164, "num_tokens": 27161463.0, "step": 9215, "train/ce_loss": 4.153951522312127e-06 }, { "epoch": 0.9111133082855448, "step": 9215, "train/sim_loss": 0.05859375 }, { "epoch": 0.9111133082855448, "step": 9215, "train/total_loss": 0.05859416723251343 }, { "entropy": 8.633882522583008, "epoch": 0.9112121811350603, "mean_token_accuracy": 0.7447513937950134, "num_tokens": 27166985.0, "step": 9216, "train/ce_loss": 0.7126514911651611 }, { "epoch": 0.9112121811350603, "step": 9216, "train/sim_loss": 0.03125 }, { "epoch": 0.9112121811350603, "step": 9216, "train/total_loss": 0.10251515358686447 }, { "entropy": 8.838998794555664, "epoch": 0.9113110539845758, "mean_token_accuracy": 0.7244623899459839, "num_tokens": 27172206.0, "step": 9217, "train/ce_loss": 0.6789557933807373 }, { "epoch": 0.9113110539845758, "step": 9217, "train/sim_loss": 0.02734375 }, { "epoch": 0.9113110539845758, "step": 9217, "train/total_loss": 0.09523933380842209 }, { "entropy": 9.317330360412598, "epoch": 0.9114099268340914, "mean_token_accuracy": 0.7415094375610352, "num_tokens": 27177146.0, "step": 9218, "train/ce_loss": 1.1412620544433594 }, { "epoch": 0.9114099268340914, "step": 9218, "train/sim_loss": 0.03515625 }, { "epoch": 0.9114099268340914, "step": 9218, "train/total_loss": 0.14928245544433594 }, { "entropy": 8.91617202758789, "epoch": 0.9115087996836069, "mean_token_accuracy": 0.6881889700889587, "num_tokens": 27182261.0, "step": 9219, "train/ce_loss": 1.661460041999817 }, { "epoch": 0.9115087996836069, "step": 9219, "train/sim_loss": 0.0546875 }, { "epoch": 0.9115087996836069, "step": 9219, "train/total_loss": 0.22083351016044617 }, { "epoch": 0.9116076725331224, "grad_norm": 0.727852463722229, "learning_rate": 7.723137022202443e-06, "loss": 0.1244, "step": 9220 }, { "entropy": 9.493191719055176, "epoch": 0.9116076725331224, "mean_token_accuracy": 0.7830578684806824, "num_tokens": 27187157.0, "step": 9220, "train/ce_loss": 1.112162709236145 }, { "epoch": 0.9116076725331224, "step": 9220, "train/sim_loss": 0.03515625 }, { "epoch": 0.9116076725331224, "step": 9220, "train/total_loss": 0.14637252688407898 }, { "entropy": 8.813331604003906, "epoch": 0.911706545382638, "mean_token_accuracy": 0.7963483333587646, "num_tokens": 27192336.0, "step": 9221, "train/ce_loss": 0.5560468435287476 }, { "epoch": 0.911706545382638, "step": 9221, "train/sim_loss": 0.015625 }, { "epoch": 0.911706545382638, "step": 9221, "train/total_loss": 0.07122968137264252 }, { "entropy": 8.88023567199707, "epoch": 0.9118054182321534, "mean_token_accuracy": 0.7069988250732422, "num_tokens": 27197600.0, "step": 9222, "train/ce_loss": 1.0730652809143066 }, { "epoch": 0.9118054182321534, "step": 9222, "train/sim_loss": 0.0546875 }, { "epoch": 0.9118054182321534, "step": 9222, "train/total_loss": 0.16199404001235962 }, { "entropy": 8.567594528198242, "epoch": 0.9119042910816689, "mean_token_accuracy": 0.7450058460235596, "num_tokens": 27202955.0, "step": 9223, "train/ce_loss": 0.6554208397865295 }, { "epoch": 0.9119042910816689, "step": 9223, "train/sim_loss": 0.08984375 }, { "epoch": 0.9119042910816689, "step": 9223, "train/total_loss": 0.1553858369588852 }, { "entropy": 8.949714660644531, "epoch": 0.9120031639311845, "mean_token_accuracy": 0.7708333134651184, "num_tokens": 27208181.0, "step": 9224, "train/ce_loss": 0.5297962427139282 }, { "epoch": 0.9120031639311845, "step": 9224, "train/sim_loss": 0.03515625 }, { "epoch": 0.9120031639311845, "step": 9224, "train/total_loss": 0.08813587576150894 }, { "entropy": 9.567924499511719, "epoch": 0.9121020367807, "mean_token_accuracy": 0.7350649237632751, "num_tokens": 27212987.0, "step": 9225, "train/ce_loss": 3.7651375350833405e-07 }, { "epoch": 0.9121020367807, "step": 9225, "train/sim_loss": 0.015625 }, { "epoch": 0.9121020367807, "step": 9225, "train/total_loss": 0.015625037252902985 }, { "entropy": 8.928082466125488, "epoch": 0.9122009096302155, "mean_token_accuracy": 0.6947040557861328, "num_tokens": 27218082.0, "step": 9226, "train/ce_loss": 0.8295297622680664 }, { "epoch": 0.9122009096302155, "step": 9226, "train/sim_loss": 0.08984375 }, { "epoch": 0.9122009096302155, "step": 9226, "train/total_loss": 0.17279672622680664 }, { "entropy": 8.772733688354492, "epoch": 0.9122997824797311, "mean_token_accuracy": 0.734375, "num_tokens": 27223286.0, "step": 9227, "train/ce_loss": 0.479086697101593 }, { "epoch": 0.9122997824797311, "step": 9227, "train/sim_loss": 0.0390625 }, { "epoch": 0.9122997824797311, "step": 9227, "train/total_loss": 0.08697117120027542 }, { "entropy": 8.668851852416992, "epoch": 0.9123986553292466, "mean_token_accuracy": 0.7693236470222473, "num_tokens": 27228540.0, "step": 9228, "train/ce_loss": 0.6925671100616455 }, { "epoch": 0.9123986553292466, "step": 9228, "train/sim_loss": 0.015625 }, { "epoch": 0.9123986553292466, "step": 9228, "train/total_loss": 0.08488171547651291 }, { "entropy": 8.739944458007812, "epoch": 0.9124975281787621, "mean_token_accuracy": 0.7110519409179688, "num_tokens": 27233732.0, "step": 9229, "train/ce_loss": 0.7571595907211304 }, { "epoch": 0.9124975281787621, "step": 9229, "train/sim_loss": 0.0546875 }, { "epoch": 0.9124975281787621, "step": 9229, "train/total_loss": 0.13040345907211304 }, { "entropy": 8.30958366394043, "epoch": 0.9125964010282777, "mean_token_accuracy": 0.7227227091789246, "num_tokens": 27239173.0, "step": 9230, "train/ce_loss": 1.288848638534546 }, { "epoch": 0.9125964010282777, "step": 9230, "train/sim_loss": 0.1015625 }, { "epoch": 0.9125964010282777, "step": 9230, "train/total_loss": 0.23044736683368683 }, { "entropy": 8.77785587310791, "epoch": 0.9126952738777931, "mean_token_accuracy": 0.7530266046524048, "num_tokens": 27244481.0, "step": 9231, "train/ce_loss": 0.694959282875061 }, { "epoch": 0.9126952738777931, "step": 9231, "train/sim_loss": 0.01953125 }, { "epoch": 0.9126952738777931, "step": 9231, "train/total_loss": 0.08902718126773834 }, { "entropy": 8.481739044189453, "epoch": 0.9127941467273086, "mean_token_accuracy": 0.722806990146637, "num_tokens": 27249811.0, "step": 9232, "train/ce_loss": 0.933229386806488 }, { "epoch": 0.9127941467273086, "step": 9232, "train/sim_loss": 0.05859375 }, { "epoch": 0.9127941467273086, "step": 9232, "train/total_loss": 0.15191668272018433 }, { "entropy": 8.568881034851074, "epoch": 0.9128930195768242, "mean_token_accuracy": 0.7235932946205139, "num_tokens": 27255309.0, "step": 9233, "train/ce_loss": 0.6516080498695374 }, { "epoch": 0.9128930195768242, "step": 9233, "train/sim_loss": 0.03125 }, { "epoch": 0.9128930195768242, "step": 9233, "train/total_loss": 0.09641080349683762 }, { "entropy": 9.117914199829102, "epoch": 0.9129918924263397, "mean_token_accuracy": 0.8020231127738953, "num_tokens": 27260464.0, "step": 9234, "train/ce_loss": 0.9868393540382385 }, { "epoch": 0.9129918924263397, "step": 9234, "train/sim_loss": 0.03125 }, { "epoch": 0.9129918924263397, "step": 9234, "train/total_loss": 0.1299339383840561 }, { "entropy": 8.817948341369629, "epoch": 0.9130907652758552, "mean_token_accuracy": 0.7510040402412415, "num_tokens": 27265683.0, "step": 9235, "train/ce_loss": 0.8873830437660217 }, { "epoch": 0.9130907652758552, "step": 9235, "train/sim_loss": 0.04296875 }, { "epoch": 0.9130907652758552, "step": 9235, "train/total_loss": 0.1317070573568344 }, { "entropy": 8.456457138061523, "epoch": 0.9131896381253708, "mean_token_accuracy": 0.7397563457489014, "num_tokens": 27271089.0, "step": 9236, "train/ce_loss": 0.7246193885803223 }, { "epoch": 0.9131896381253708, "step": 9236, "train/sim_loss": 0.046875 }, { "epoch": 0.9131896381253708, "step": 9236, "train/total_loss": 0.11933694034814835 }, { "entropy": 8.499395370483398, "epoch": 0.9132885109748863, "mean_token_accuracy": 0.7075055241584778, "num_tokens": 27276488.0, "step": 9237, "train/ce_loss": 0.5785139799118042 }, { "epoch": 0.9132885109748863, "step": 9237, "train/sim_loss": 0.046875 }, { "epoch": 0.9132885109748863, "step": 9237, "train/total_loss": 0.1047264039516449 }, { "entropy": 8.961029052734375, "epoch": 0.9133873838244018, "mean_token_accuracy": 0.6569620370864868, "num_tokens": 27281784.0, "step": 9238, "train/ce_loss": 0.47236815094947815 }, { "epoch": 0.9133873838244018, "step": 9238, "train/sim_loss": 0.046875 }, { "epoch": 0.9133873838244018, "step": 9238, "train/total_loss": 0.09411181509494781 }, { "entropy": 8.695600509643555, "epoch": 0.9134862566739174, "mean_token_accuracy": 0.7176339030265808, "num_tokens": 27287120.0, "step": 9239, "train/ce_loss": 0.7086272835731506 }, { "epoch": 0.9134862566739174, "step": 9239, "train/sim_loss": 0.03515625 }, { "epoch": 0.9134862566739174, "step": 9239, "train/total_loss": 0.10601898282766342 }, { "epoch": 0.9135851295234328, "grad_norm": 0.606782853603363, "learning_rate": 7.718192157444494e-06, "loss": 0.1405, "step": 9240 }, { "entropy": 8.963159561157227, "epoch": 0.9135851295234328, "mean_token_accuracy": 0.771324872970581, "num_tokens": 27292104.0, "step": 9240, "train/ce_loss": 1.5950840711593628 }, { "epoch": 0.9135851295234328, "step": 9240, "train/sim_loss": 0.04296875 }, { "epoch": 0.9135851295234328, "step": 9240, "train/total_loss": 0.20247715711593628 }, { "entropy": 8.75002670288086, "epoch": 0.9136840023729483, "mean_token_accuracy": 0.78812575340271, "num_tokens": 27297428.0, "step": 9241, "train/ce_loss": 0.3548147976398468 }, { "epoch": 0.9136840023729483, "step": 9241, "train/sim_loss": 0.015625 }, { "epoch": 0.9136840023729483, "step": 9241, "train/total_loss": 0.05110647901892662 }, { "entropy": 8.583799362182617, "epoch": 0.9137828752224639, "mean_token_accuracy": 0.7675977945327759, "num_tokens": 27302770.0, "step": 9242, "train/ce_loss": 0.4433761537075043 }, { "epoch": 0.9137828752224639, "step": 9242, "train/sim_loss": 0.015625 }, { "epoch": 0.9137828752224639, "step": 9242, "train/total_loss": 0.05996261537075043 }, { "entropy": 8.733439445495605, "epoch": 0.9138817480719794, "mean_token_accuracy": 0.7619718313217163, "num_tokens": 27307975.0, "step": 9243, "train/ce_loss": 0.8882772922515869 }, { "epoch": 0.9138817480719794, "step": 9243, "train/sim_loss": 0.02734375 }, { "epoch": 0.9138817480719794, "step": 9243, "train/total_loss": 0.11617147922515869 }, { "entropy": 8.964921951293945, "epoch": 0.913980620921495, "mean_token_accuracy": 0.7971576452255249, "num_tokens": 27313209.0, "step": 9244, "train/ce_loss": 0.5068889856338501 }, { "epoch": 0.913980620921495, "step": 9244, "train/sim_loss": 0.0234375 }, { "epoch": 0.913980620921495, "step": 9244, "train/total_loss": 0.07412640005350113 }, { "entropy": 8.99893569946289, "epoch": 0.9140794937710105, "mean_token_accuracy": 0.7366071343421936, "num_tokens": 27318395.0, "step": 9245, "train/ce_loss": 0.6802981495857239 }, { "epoch": 0.9140794937710105, "step": 9245, "train/sim_loss": 0.0390625 }, { "epoch": 0.9140794937710105, "step": 9245, "train/total_loss": 0.10709231346845627 }, { "entropy": 8.335893630981445, "epoch": 0.914178366620526, "mean_token_accuracy": 0.7097457647323608, "num_tokens": 27323782.0, "step": 9246, "train/ce_loss": 1.2256686687469482 }, { "epoch": 0.914178366620526, "step": 9246, "train/sim_loss": 0.046875 }, { "epoch": 0.914178366620526, "step": 9246, "train/total_loss": 0.16944187879562378 }, { "entropy": 8.798724174499512, "epoch": 0.9142772394700416, "mean_token_accuracy": 0.7006451487541199, "num_tokens": 27328988.0, "step": 9247, "train/ce_loss": 3.9869163970251975e-07 }, { "epoch": 0.9142772394700416, "step": 9247, "train/sim_loss": 0.03125 }, { "epoch": 0.9142772394700416, "step": 9247, "train/total_loss": 0.03125004097819328 }, { "entropy": 8.986804008483887, "epoch": 0.9143761123195571, "mean_token_accuracy": 0.7445008754730225, "num_tokens": 27334065.0, "step": 9248, "train/ce_loss": 1.5509508848190308 }, { "epoch": 0.9143761123195571, "step": 9248, "train/sim_loss": 0.1328125 }, { "epoch": 0.9143761123195571, "step": 9248, "train/total_loss": 0.28790760040283203 }, { "entropy": 8.465126037597656, "epoch": 0.9144749851690726, "mean_token_accuracy": 0.7416666746139526, "num_tokens": 27339382.0, "step": 9249, "train/ce_loss": 1.0632635354995728 }, { "epoch": 0.9144749851690726, "step": 9249, "train/sim_loss": 0.08203125 }, { "epoch": 0.9144749851690726, "step": 9249, "train/total_loss": 0.18835760653018951 }, { "entropy": 8.786970138549805, "epoch": 0.9145738580185881, "mean_token_accuracy": 0.7538461685180664, "num_tokens": 27344615.0, "step": 9250, "train/ce_loss": 8.582583745919692e-07 }, { "epoch": 0.9145738580185881, "step": 9250, "train/sim_loss": 0.0390625 }, { "epoch": 0.9145738580185881, "step": 9250, "train/total_loss": 0.039062585681676865 }, { "entropy": 9.150355339050293, "epoch": 0.9146727308681036, "mean_token_accuracy": 0.6981481313705444, "num_tokens": 27349612.0, "step": 9251, "train/ce_loss": 0.901170551776886 }, { "epoch": 0.9146727308681036, "step": 9251, "train/sim_loss": 0.03125 }, { "epoch": 0.9146727308681036, "step": 9251, "train/total_loss": 0.12136705964803696 }, { "entropy": 8.961719512939453, "epoch": 0.9147716037176191, "mean_token_accuracy": 0.7664121985435486, "num_tokens": 27354652.0, "step": 9252, "train/ce_loss": 1.3180468082427979 }, { "epoch": 0.9147716037176191, "step": 9252, "train/sim_loss": 0.0703125 }, { "epoch": 0.9147716037176191, "step": 9252, "train/total_loss": 0.2021171897649765 }, { "entropy": 9.69161605834961, "epoch": 0.9148704765671347, "mean_token_accuracy": 0.7443820238113403, "num_tokens": 27359403.0, "step": 9253, "train/ce_loss": 3.000313597567583e-07 }, { "epoch": 0.9148704765671347, "step": 9253, "train/sim_loss": 0.015625 }, { "epoch": 0.9148704765671347, "step": 9253, "train/total_loss": 0.015625029802322388 }, { "entropy": 8.836685180664062, "epoch": 0.9149693494166502, "mean_token_accuracy": 0.7243107557296753, "num_tokens": 27364834.0, "step": 9254, "train/ce_loss": 0.7340941429138184 }, { "epoch": 0.9149693494166502, "step": 9254, "train/sim_loss": 0.05078125 }, { "epoch": 0.9149693494166502, "step": 9254, "train/total_loss": 0.12419066578149796 }, { "entropy": 8.632948875427246, "epoch": 0.9150682222661657, "mean_token_accuracy": 0.730681836605072, "num_tokens": 27370163.0, "step": 9255, "train/ce_loss": 1.0817639827728271 }, { "epoch": 0.9150682222661657, "step": 9255, "train/sim_loss": 0.0390625 }, { "epoch": 0.9150682222661657, "step": 9255, "train/total_loss": 0.14723891019821167 }, { "entropy": 8.910545349121094, "epoch": 0.9151670951156813, "mean_token_accuracy": 0.8300395011901855, "num_tokens": 27375394.0, "step": 9256, "train/ce_loss": 9.63684556154476e-07 }, { "epoch": 0.9151670951156813, "step": 9256, "train/sim_loss": 0.046875 }, { "epoch": 0.9151670951156813, "step": 9256, "train/total_loss": 0.04687509685754776 }, { "entropy": 9.36585521697998, "epoch": 0.9152659679651968, "mean_token_accuracy": 0.7733089327812195, "num_tokens": 27380386.0, "step": 9257, "train/ce_loss": 1.8916507826816087e-07 }, { "epoch": 0.9152659679651968, "step": 9257, "train/sim_loss": 0.015625 }, { "epoch": 0.9152659679651968, "step": 9257, "train/total_loss": 0.015625018626451492 }, { "entropy": 9.008048057556152, "epoch": 0.9153648408147123, "mean_token_accuracy": 0.727544903755188, "num_tokens": 27385501.0, "step": 9258, "train/ce_loss": 1.2201671600341797 }, { "epoch": 0.9153648408147123, "step": 9258, "train/sim_loss": 0.07421875 }, { "epoch": 0.9153648408147123, "step": 9258, "train/total_loss": 0.19623547792434692 }, { "entropy": 8.924967765808105, "epoch": 0.9154637136642279, "mean_token_accuracy": 0.75, "num_tokens": 27390665.0, "step": 9259, "train/ce_loss": 0.3701741695404053 }, { "epoch": 0.9154637136642279, "step": 9259, "train/sim_loss": 0.0390625 }, { "epoch": 0.9154637136642279, "step": 9259, "train/total_loss": 0.07607991993427277 }, { "epoch": 0.9155625865137433, "grad_norm": 0.612991452217102, "learning_rate": 7.713247292686546e-06, "loss": 0.1263, "step": 9260 }, { "entropy": 8.833109855651855, "epoch": 0.9155625865137433, "mean_token_accuracy": 0.7330677509307861, "num_tokens": 27395949.0, "step": 9260, "train/ce_loss": 0.6401509642601013 }, { "epoch": 0.9155625865137433, "step": 9260, "train/sim_loss": 0.02734375 }, { "epoch": 0.9155625865137433, "step": 9260, "train/total_loss": 0.09135884791612625 }, { "entropy": 9.314435958862305, "epoch": 0.9156614593632588, "mean_token_accuracy": 0.7307692170143127, "num_tokens": 27400866.0, "step": 9261, "train/ce_loss": 2.0196523666381836 }, { "epoch": 0.9156614593632588, "step": 9261, "train/sim_loss": 0.04296875 }, { "epoch": 0.9156614593632588, "step": 9261, "train/total_loss": 0.24493399262428284 }, { "entropy": 8.92432975769043, "epoch": 0.9157603322127744, "mean_token_accuracy": 0.7299270033836365, "num_tokens": 27406057.0, "step": 9262, "train/ce_loss": 1.7605401277542114 }, { "epoch": 0.9157603322127744, "step": 9262, "train/sim_loss": 0.08203125 }, { "epoch": 0.9157603322127744, "step": 9262, "train/total_loss": 0.2580852508544922 }, { "entropy": 9.119075775146484, "epoch": 0.9158592050622899, "mean_token_accuracy": 0.8072100281715393, "num_tokens": 27411098.0, "step": 9263, "train/ce_loss": 0.87204909324646 }, { "epoch": 0.9158592050622899, "step": 9263, "train/sim_loss": 0.01171875 }, { "epoch": 0.9158592050622899, "step": 9263, "train/total_loss": 0.09892366081476212 }, { "entropy": 8.559356689453125, "epoch": 0.9159580779118054, "mean_token_accuracy": 0.6959459185600281, "num_tokens": 27416476.0, "step": 9264, "train/ce_loss": 0.5877631306648254 }, { "epoch": 0.9159580779118054, "step": 9264, "train/sim_loss": 0.02734375 }, { "epoch": 0.9159580779118054, "step": 9264, "train/total_loss": 0.08612006902694702 }, { "entropy": 8.968629837036133, "epoch": 0.916056950761321, "mean_token_accuracy": 0.7583732008934021, "num_tokens": 27421731.0, "step": 9265, "train/ce_loss": 1.246213674545288 }, { "epoch": 0.916056950761321, "step": 9265, "train/sim_loss": 0.015625 }, { "epoch": 0.916056950761321, "step": 9265, "train/total_loss": 0.14024636149406433 }, { "entropy": 9.342061042785645, "epoch": 0.9161558236108365, "mean_token_accuracy": 0.7292817831039429, "num_tokens": 27426712.0, "step": 9266, "train/ce_loss": 1.7498931884765625 }, { "epoch": 0.9161558236108365, "step": 9266, "train/sim_loss": 0.02734375 }, { "epoch": 0.9161558236108365, "step": 9266, "train/total_loss": 0.20233307778835297 }, { "entropy": 9.135245323181152, "epoch": 0.916254696460352, "mean_token_accuracy": 0.8283828496932983, "num_tokens": 27431751.0, "step": 9267, "train/ce_loss": 3.4464170539649785e-07 }, { "epoch": 0.916254696460352, "step": 9267, "train/sim_loss": 0.0546875 }, { "epoch": 0.916254696460352, "step": 9267, "train/total_loss": 0.054687533527612686 }, { "entropy": 8.552834510803223, "epoch": 0.9163535693098676, "mean_token_accuracy": 0.7318059206008911, "num_tokens": 27436990.0, "step": 9268, "train/ce_loss": 1.207385778427124 }, { "epoch": 0.9163535693098676, "step": 9268, "train/sim_loss": 0.07421875 }, { "epoch": 0.9163535693098676, "step": 9268, "train/total_loss": 0.19495733082294464 }, { "entropy": 8.659819602966309, "epoch": 0.916452442159383, "mean_token_accuracy": 0.7121387124061584, "num_tokens": 27442306.0, "step": 9269, "train/ce_loss": 0.9533243179321289 }, { "epoch": 0.916452442159383, "step": 9269, "train/sim_loss": 0.03125 }, { "epoch": 0.916452442159383, "step": 9269, "train/total_loss": 0.12658244371414185 }, { "entropy": 8.757511138916016, "epoch": 0.9165513150088985, "mean_token_accuracy": 0.701298713684082, "num_tokens": 27447400.0, "step": 9270, "train/ce_loss": 7.575519589408941e-07 }, { "epoch": 0.9165513150088985, "step": 9270, "train/sim_loss": 0.05078125 }, { "epoch": 0.9165513150088985, "step": 9270, "train/total_loss": 0.05078132450580597 }, { "entropy": 8.658183097839355, "epoch": 0.9166501878584141, "mean_token_accuracy": 0.70652174949646, "num_tokens": 27452557.0, "step": 9271, "train/ce_loss": 0.825967013835907 }, { "epoch": 0.9166501878584141, "step": 9271, "train/sim_loss": 0.06640625 }, { "epoch": 0.9166501878584141, "step": 9271, "train/total_loss": 0.14900295436382294 }, { "entropy": 9.13985538482666, "epoch": 0.9167490607079296, "mean_token_accuracy": 0.7724770903587341, "num_tokens": 27457613.0, "step": 9272, "train/ce_loss": 0.9535839557647705 }, { "epoch": 0.9167490607079296, "step": 9272, "train/sim_loss": 0.01171875 }, { "epoch": 0.9167490607079296, "step": 9272, "train/total_loss": 0.10707714408636093 }, { "entropy": 9.225017547607422, "epoch": 0.9168479335574451, "mean_token_accuracy": 0.73758864402771, "num_tokens": 27462630.0, "step": 9273, "train/ce_loss": 0.764447033405304 }, { "epoch": 0.9168479335574451, "step": 9273, "train/sim_loss": 0.03125 }, { "epoch": 0.9168479335574451, "step": 9273, "train/total_loss": 0.10769470781087875 }, { "entropy": 8.432116508483887, "epoch": 0.9169468064069607, "mean_token_accuracy": 0.7677624821662903, "num_tokens": 27467996.0, "step": 9274, "train/ce_loss": 0.9285256862640381 }, { "epoch": 0.9169468064069607, "step": 9274, "train/sim_loss": 0.03125 }, { "epoch": 0.9169468064069607, "step": 9274, "train/total_loss": 0.12410257011651993 }, { "entropy": 8.640363693237305, "epoch": 0.9170456792564762, "mean_token_accuracy": 0.7194994688034058, "num_tokens": 27473427.0, "step": 9275, "train/ce_loss": 0.7433151602745056 }, { "epoch": 0.9170456792564762, "step": 9275, "train/sim_loss": 0.046875 }, { "epoch": 0.9170456792564762, "step": 9275, "train/total_loss": 0.12120651453733444 }, { "entropy": 10.058313369750977, "epoch": 0.9171445521059917, "mean_token_accuracy": 0.8109452724456787, "num_tokens": 27477965.0, "step": 9276, "train/ce_loss": 1.6321387192874681e-06 }, { "epoch": 0.9171445521059917, "step": 9276, "train/sim_loss": 0.05078125 }, { "epoch": 0.9171445521059917, "step": 9276, "train/total_loss": 0.05078141391277313 }, { "entropy": 8.52928352355957, "epoch": 0.9172434249555073, "mean_token_accuracy": 0.7412935495376587, "num_tokens": 27483468.0, "step": 9277, "train/ce_loss": 0.7360466718673706 }, { "epoch": 0.9172434249555073, "step": 9277, "train/sim_loss": 0.03515625 }, { "epoch": 0.9172434249555073, "step": 9277, "train/total_loss": 0.10876091569662094 }, { "entropy": 9.13144302368164, "epoch": 0.9173422978050227, "mean_token_accuracy": 0.75, "num_tokens": 27488506.0, "step": 9278, "train/ce_loss": 1.0652343034744263 }, { "epoch": 0.9173422978050227, "step": 9278, "train/sim_loss": 0.0390625 }, { "epoch": 0.9173422978050227, "step": 9278, "train/total_loss": 0.14558592438697815 }, { "entropy": 8.72749137878418, "epoch": 0.9174411706545382, "mean_token_accuracy": 0.7104247212409973, "num_tokens": 27493705.0, "step": 9279, "train/ce_loss": 1.1955887079238892 }, { "epoch": 0.9174411706545382, "step": 9279, "train/sim_loss": 0.04296875 }, { "epoch": 0.9174411706545382, "step": 9279, "train/total_loss": 0.16252762079238892 }, { "epoch": 0.9175400435040538, "grad_norm": 0.5956372618675232, "learning_rate": 7.708302427928596e-06, "loss": 0.1313, "step": 9280 }, { "entropy": 9.752147674560547, "epoch": 0.9175400435040538, "mean_token_accuracy": 0.7214611768722534, "num_tokens": 27498331.0, "step": 9280, "train/ce_loss": 1.589429871273751e-06 }, { "epoch": 0.9175400435040538, "step": 9280, "train/sim_loss": 0.03125 }, { "epoch": 0.9175400435040538, "step": 9280, "train/total_loss": 0.031250160187482834 }, { "entropy": 8.482603073120117, "epoch": 0.9176389163535693, "mean_token_accuracy": 0.7118483185768127, "num_tokens": 27503836.0, "step": 9281, "train/ce_loss": 1.0050632953643799 }, { "epoch": 0.9176389163535693, "step": 9281, "train/sim_loss": 0.08984375 }, { "epoch": 0.9176389163535693, "step": 9281, "train/total_loss": 0.19035008549690247 }, { "entropy": 9.328857421875, "epoch": 0.9177377892030848, "mean_token_accuracy": 0.7199169993400574, "num_tokens": 27508780.0, "step": 9282, "train/ce_loss": 0.933283269405365 }, { "epoch": 0.9177377892030848, "step": 9282, "train/sim_loss": 0.046875 }, { "epoch": 0.9177377892030848, "step": 9282, "train/total_loss": 0.1402033269405365 }, { "entropy": 8.806611061096191, "epoch": 0.9178366620526004, "mean_token_accuracy": 0.793608546257019, "num_tokens": 27513994.0, "step": 9283, "train/ce_loss": 0.7182518839836121 }, { "epoch": 0.9178366620526004, "step": 9283, "train/sim_loss": 0.0859375 }, { "epoch": 0.9178366620526004, "step": 9283, "train/total_loss": 0.15776269137859344 }, { "entropy": 8.265416145324707, "epoch": 0.9179355349021159, "mean_token_accuracy": 0.6988210082054138, "num_tokens": 27519390.0, "step": 9284, "train/ce_loss": 0.8868662118911743 }, { "epoch": 0.9179355349021159, "step": 9284, "train/sim_loss": 0.04296875 }, { "epoch": 0.9179355349021159, "step": 9284, "train/total_loss": 0.13165536522865295 }, { "entropy": 9.252330780029297, "epoch": 0.9180344077516314, "mean_token_accuracy": 0.7689655423164368, "num_tokens": 27524408.0, "step": 9285, "train/ce_loss": 2.17863515672434e-07 }, { "epoch": 0.9180344077516314, "step": 9285, "train/sim_loss": 0.0234375 }, { "epoch": 0.9180344077516314, "step": 9285, "train/total_loss": 0.02343752235174179 }, { "entropy": 8.831356048583984, "epoch": 0.918133280601147, "mean_token_accuracy": 0.7590512037277222, "num_tokens": 27529679.0, "step": 9286, "train/ce_loss": 0.5693334341049194 }, { "epoch": 0.918133280601147, "step": 9286, "train/sim_loss": 0.02734375 }, { "epoch": 0.918133280601147, "step": 9286, "train/total_loss": 0.08427709341049194 }, { "entropy": 9.128816604614258, "epoch": 0.9182321534506624, "mean_token_accuracy": 0.7772194147109985, "num_tokens": 27534745.0, "step": 9287, "train/ce_loss": 0.9913820028305054 }, { "epoch": 0.9182321534506624, "step": 9287, "train/sim_loss": 0.0703125 }, { "epoch": 0.9182321534506624, "step": 9287, "train/total_loss": 0.16945070028305054 }, { "entropy": 8.789335250854492, "epoch": 0.9183310263001779, "mean_token_accuracy": 0.7612121105194092, "num_tokens": 27540036.0, "step": 9288, "train/ce_loss": 0.47905784845352173 }, { "epoch": 0.9183310263001779, "step": 9288, "train/sim_loss": 0.046875 }, { "epoch": 0.9183310263001779, "step": 9288, "train/total_loss": 0.09478078782558441 }, { "entropy": 10.050600051879883, "epoch": 0.9184298991496935, "mean_token_accuracy": 0.7607361674308777, "num_tokens": 27544587.0, "step": 9289, "train/ce_loss": 2.6196956634521484 }, { "epoch": 0.9184298991496935, "step": 9289, "train/sim_loss": 0.09765625 }, { "epoch": 0.9184298991496935, "step": 9289, "train/total_loss": 0.35962581634521484 }, { "entropy": 8.77641773223877, "epoch": 0.918528771999209, "mean_token_accuracy": 0.717277467250824, "num_tokens": 27549822.0, "step": 9290, "train/ce_loss": 1.258701205253601 }, { "epoch": 0.918528771999209, "step": 9290, "train/sim_loss": 0.05078125 }, { "epoch": 0.918528771999209, "step": 9290, "train/total_loss": 0.17665137350559235 }, { "entropy": 9.14539909362793, "epoch": 0.9186276448487245, "mean_token_accuracy": 0.7543553709983826, "num_tokens": 27554853.0, "step": 9291, "train/ce_loss": 1.2595537900924683 }, { "epoch": 0.9186276448487245, "step": 9291, "train/sim_loss": 0.0703125 }, { "epoch": 0.9186276448487245, "step": 9291, "train/total_loss": 0.19626788794994354 }, { "entropy": 8.761069297790527, "epoch": 0.9187265176982401, "mean_token_accuracy": 0.7253668904304504, "num_tokens": 27560241.0, "step": 9292, "train/ce_loss": 1.3040637969970703 }, { "epoch": 0.9187265176982401, "step": 9292, "train/sim_loss": 0.0390625 }, { "epoch": 0.9187265176982401, "step": 9292, "train/total_loss": 0.16946887969970703 }, { "entropy": 8.862565994262695, "epoch": 0.9188253905477556, "mean_token_accuracy": 0.7532281279563904, "num_tokens": 27565362.0, "step": 9293, "train/ce_loss": 0.617352306842804 }, { "epoch": 0.9188253905477556, "step": 9293, "train/sim_loss": 0.03125 }, { "epoch": 0.9188253905477556, "step": 9293, "train/total_loss": 0.09298522770404816 }, { "entropy": 8.723288536071777, "epoch": 0.9189242633972711, "mean_token_accuracy": 0.6952140927314758, "num_tokens": 27570667.0, "step": 9294, "train/ce_loss": 0.7255082130432129 }, { "epoch": 0.9189242633972711, "step": 9294, "train/sim_loss": 0.02734375 }, { "epoch": 0.9189242633972711, "step": 9294, "train/total_loss": 0.09989457577466965 }, { "entropy": 8.683667182922363, "epoch": 0.9190231362467867, "mean_token_accuracy": 0.7905982732772827, "num_tokens": 27576021.0, "step": 9295, "train/ce_loss": 0.7735714316368103 }, { "epoch": 0.9190231362467867, "step": 9295, "train/sim_loss": 0.0234375 }, { "epoch": 0.9190231362467867, "step": 9295, "train/total_loss": 0.10079464316368103 }, { "entropy": 8.675325393676758, "epoch": 0.9191220090963022, "mean_token_accuracy": 0.6952879428863525, "num_tokens": 27581428.0, "step": 9296, "train/ce_loss": 0.9181460738182068 }, { "epoch": 0.9191220090963022, "step": 9296, "train/sim_loss": 0.0625 }, { "epoch": 0.9191220090963022, "step": 9296, "train/total_loss": 0.15431460738182068 }, { "entropy": 9.235428810119629, "epoch": 0.9192208819458176, "mean_token_accuracy": 0.7586206793785095, "num_tokens": 27586409.0, "step": 9297, "train/ce_loss": 1.6527061462402344 }, { "epoch": 0.9192208819458176, "step": 9297, "train/sim_loss": 0.08984375 }, { "epoch": 0.9192208819458176, "step": 9297, "train/total_loss": 0.2551143765449524 }, { "entropy": 9.978853225708008, "epoch": 0.9193197547953332, "mean_token_accuracy": 0.7686567306518555, "num_tokens": 27591004.0, "step": 9298, "train/ce_loss": 1.1262466159678297e-06 }, { "epoch": 0.9193197547953332, "step": 9298, "train/sim_loss": 0.0546875 }, { "epoch": 0.9193197547953332, "step": 9298, "train/total_loss": 0.054687611758708954 }, { "entropy": 8.849390983581543, "epoch": 0.9194186276448487, "mean_token_accuracy": 0.6966292262077332, "num_tokens": 27596258.0, "step": 9299, "train/ce_loss": 1.3365767002105713 }, { "epoch": 0.9194186276448487, "step": 9299, "train/sim_loss": 0.0859375 }, { "epoch": 0.9194186276448487, "step": 9299, "train/total_loss": 0.21959517896175385 }, { "epoch": 0.9195175004943642, "grad_norm": 0.7616487145423889, "learning_rate": 7.703357563170648e-06, "loss": 0.1308, "step": 9300 }, { "entropy": 9.030107498168945, "epoch": 0.9195175004943642, "mean_token_accuracy": 0.7249283790588379, "num_tokens": 27601384.0, "step": 9300, "train/ce_loss": 0.00010385373025201261 }, { "epoch": 0.9195175004943642, "step": 9300, "train/sim_loss": 0.0546875 }, { "epoch": 0.9195175004943642, "step": 9300, "train/total_loss": 0.05469788610935211 }, { "entropy": 8.875327110290527, "epoch": 0.9196163733438798, "mean_token_accuracy": 0.7274472117424011, "num_tokens": 27606329.0, "step": 9301, "train/ce_loss": 1.3728803396224976 }, { "epoch": 0.9196163733438798, "step": 9301, "train/sim_loss": 0.04296875 }, { "epoch": 0.9196163733438798, "step": 9301, "train/total_loss": 0.18025678396224976 }, { "entropy": 8.848821640014648, "epoch": 0.9197152461933953, "mean_token_accuracy": 0.7321637272834778, "num_tokens": 27611646.0, "step": 9302, "train/ce_loss": 0.8853605389595032 }, { "epoch": 0.9197152461933953, "step": 9302, "train/sim_loss": 0.14453125 }, { "epoch": 0.9197152461933953, "step": 9302, "train/total_loss": 0.23306730389595032 }, { "entropy": 8.887819290161133, "epoch": 0.9198141190429108, "mean_token_accuracy": 0.8480325937271118, "num_tokens": 27616848.0, "step": 9303, "train/ce_loss": 2.756186461283505e-07 }, { "epoch": 0.9198141190429108, "step": 9303, "train/sim_loss": 0.04296875 }, { "epoch": 0.9198141190429108, "step": 9303, "train/total_loss": 0.04296877607703209 }, { "entropy": 8.672847747802734, "epoch": 0.9199129918924264, "mean_token_accuracy": 0.7010309100151062, "num_tokens": 27622316.0, "step": 9304, "train/ce_loss": 0.9316827058792114 }, { "epoch": 0.9199129918924264, "step": 9304, "train/sim_loss": 0.06640625 }, { "epoch": 0.9199129918924264, "step": 9304, "train/total_loss": 0.15957452356815338 }, { "entropy": 8.53351879119873, "epoch": 0.9200118647419419, "mean_token_accuracy": 0.7390350699424744, "num_tokens": 27627718.0, "step": 9305, "train/ce_loss": 0.8892126083374023 }, { "epoch": 0.9200118647419419, "step": 9305, "train/sim_loss": 0.02734375 }, { "epoch": 0.9200118647419419, "step": 9305, "train/total_loss": 0.11626501381397247 }, { "entropy": 9.371720314025879, "epoch": 0.9201107375914573, "mean_token_accuracy": 0.7847533822059631, "num_tokens": 27632609.0, "step": 9306, "train/ce_loss": 3.378468704795523e-07 }, { "epoch": 0.9201107375914573, "step": 9306, "train/sim_loss": 0.0390625 }, { "epoch": 0.9201107375914573, "step": 9306, "train/total_loss": 0.039062533527612686 }, { "entropy": 8.608626365661621, "epoch": 0.9202096104409729, "mean_token_accuracy": 0.7905759215354919, "num_tokens": 27638025.0, "step": 9307, "train/ce_loss": 0.4734403192996979 }, { "epoch": 0.9202096104409729, "step": 9307, "train/sim_loss": 0.0234375 }, { "epoch": 0.9202096104409729, "step": 9307, "train/total_loss": 0.07078152894973755 }, { "entropy": 8.684558868408203, "epoch": 0.9203084832904884, "mean_token_accuracy": 0.759036123752594, "num_tokens": 27643228.0, "step": 9308, "train/ce_loss": 1.0258185863494873 }, { "epoch": 0.9203084832904884, "step": 9308, "train/sim_loss": 0.03125 }, { "epoch": 0.9203084832904884, "step": 9308, "train/total_loss": 0.13383185863494873 }, { "entropy": 8.951282501220703, "epoch": 0.9204073561400039, "mean_token_accuracy": 0.769336998462677, "num_tokens": 27648405.0, "step": 9309, "train/ce_loss": 0.7111250162124634 }, { "epoch": 0.9204073561400039, "step": 9309, "train/sim_loss": 0.01953125 }, { "epoch": 0.9204073561400039, "step": 9309, "train/total_loss": 0.0906437560915947 }, { "entropy": 9.013503074645996, "epoch": 0.9205062289895195, "mean_token_accuracy": 0.7174515128135681, "num_tokens": 27653765.0, "step": 9310, "train/ce_loss": 1.3770428895950317 }, { "epoch": 0.9205062289895195, "step": 9310, "train/sim_loss": 0.09765625 }, { "epoch": 0.9205062289895195, "step": 9310, "train/total_loss": 0.2353605479001999 }, { "entropy": 8.536752700805664, "epoch": 0.920605101839035, "mean_token_accuracy": 0.760613203048706, "num_tokens": 27659097.0, "step": 9311, "train/ce_loss": 0.510001003742218 }, { "epoch": 0.920605101839035, "step": 9311, "train/sim_loss": 0.03125 }, { "epoch": 0.920605101839035, "step": 9311, "train/total_loss": 0.08225010335445404 }, { "entropy": 8.710546493530273, "epoch": 0.9207039746885505, "mean_token_accuracy": 0.7442396283149719, "num_tokens": 27664485.0, "step": 9312, "train/ce_loss": 0.58183354139328 }, { "epoch": 0.9207039746885505, "step": 9312, "train/sim_loss": 0.0546875 }, { "epoch": 0.9207039746885505, "step": 9312, "train/total_loss": 0.11287085711956024 }, { "entropy": 8.914073944091797, "epoch": 0.9208028475380661, "mean_token_accuracy": 0.7561929821968079, "num_tokens": 27669741.0, "step": 9313, "train/ce_loss": 1.2763566970825195 }, { "epoch": 0.9208028475380661, "step": 9313, "train/sim_loss": 0.07421875 }, { "epoch": 0.9208028475380661, "step": 9313, "train/total_loss": 0.2018544226884842 }, { "entropy": 8.939849853515625, "epoch": 0.9209017203875816, "mean_token_accuracy": 0.6697009205818176, "num_tokens": 27674962.0, "step": 9314, "train/ce_loss": 1.2684932947158813 }, { "epoch": 0.9209017203875816, "step": 9314, "train/sim_loss": 0.07421875 }, { "epoch": 0.9209017203875816, "step": 9314, "train/total_loss": 0.20106808841228485 }, { "entropy": 9.103290557861328, "epoch": 0.921000593237097, "mean_token_accuracy": 0.7338345646858215, "num_tokens": 27680084.0, "step": 9315, "train/ce_loss": 0.7312984466552734 }, { "epoch": 0.921000593237097, "step": 9315, "train/sim_loss": 0.04296875 }, { "epoch": 0.921000593237097, "step": 9315, "train/total_loss": 0.11609859764575958 }, { "entropy": 8.95407485961914, "epoch": 0.9210994660866126, "mean_token_accuracy": 0.7327327132225037, "num_tokens": 27685230.0, "step": 9316, "train/ce_loss": 0.873866617679596 }, { "epoch": 0.9210994660866126, "step": 9316, "train/sim_loss": 0.0703125 }, { "epoch": 0.9210994660866126, "step": 9316, "train/total_loss": 0.15769916772842407 }, { "entropy": 9.344866752624512, "epoch": 0.9211983389361281, "mean_token_accuracy": 0.7724957466125488, "num_tokens": 27690251.0, "step": 9317, "train/ce_loss": 1.6272326774924295e-07 }, { "epoch": 0.9211983389361281, "step": 9317, "train/sim_loss": 0.01171875 }, { "epoch": 0.9211983389361281, "step": 9317, "train/total_loss": 0.011718765832483768 }, { "entropy": 8.525527000427246, "epoch": 0.9212972117856436, "mean_token_accuracy": 0.7003610134124756, "num_tokens": 27695584.0, "step": 9318, "train/ce_loss": 0.5618838667869568 }, { "epoch": 0.9212972117856436, "step": 9318, "train/sim_loss": 0.05859375 }, { "epoch": 0.9212972117856436, "step": 9318, "train/total_loss": 0.11478213965892792 }, { "entropy": 9.122682571411133, "epoch": 0.9213960846351592, "mean_token_accuracy": 0.6692759394645691, "num_tokens": 27700559.0, "step": 9319, "train/ce_loss": 2.500817402051325e-07 }, { "epoch": 0.9213960846351592, "step": 9319, "train/sim_loss": 0.02734375 }, { "epoch": 0.9213960846351592, "step": 9319, "train/total_loss": 0.02734377421438694 }, { "epoch": 0.9214949574846747, "grad_norm": 0.9534464478492737, "learning_rate": 7.698412698412699e-06, "loss": 0.1333, "step": 9320 }, { "entropy": 8.832344055175781, "epoch": 0.9214949574846747, "mean_token_accuracy": 0.7819650173187256, "num_tokens": 27705765.0, "step": 9320, "train/ce_loss": 0.5779180526733398 }, { "epoch": 0.9214949574846747, "step": 9320, "train/sim_loss": 0.03125 }, { "epoch": 0.9214949574846747, "step": 9320, "train/total_loss": 0.0890418067574501 }, { "entropy": 9.167750358581543, "epoch": 0.9215938303341902, "mean_token_accuracy": 0.7516447305679321, "num_tokens": 27710813.0, "step": 9321, "train/ce_loss": 1.5943471193313599 }, { "epoch": 0.9215938303341902, "step": 9321, "train/sim_loss": 0.02734375 }, { "epoch": 0.9215938303341902, "step": 9321, "train/total_loss": 0.1867784708738327 }, { "entropy": 8.775861740112305, "epoch": 0.9216927031837058, "mean_token_accuracy": 0.7531003355979919, "num_tokens": 27716209.0, "step": 9322, "train/ce_loss": 0.7420780658721924 }, { "epoch": 0.9216927031837058, "step": 9322, "train/sim_loss": 0.01953125 }, { "epoch": 0.9216927031837058, "step": 9322, "train/total_loss": 0.09373905509710312 }, { "entropy": 9.585281372070312, "epoch": 0.9217915760332213, "mean_token_accuracy": 0.7176781296730042, "num_tokens": 27720975.0, "step": 9323, "train/ce_loss": 2.351759672164917 }, { "epoch": 0.9217915760332213, "step": 9323, "train/sim_loss": 0.046875 }, { "epoch": 0.9217915760332213, "step": 9323, "train/total_loss": 0.2820509672164917 }, { "entropy": 8.793519973754883, "epoch": 0.9218904488827367, "mean_token_accuracy": 0.8018372654914856, "num_tokens": 27726209.0, "step": 9324, "train/ce_loss": 0.6098015904426575 }, { "epoch": 0.9218904488827367, "step": 9324, "train/sim_loss": 0.01953125 }, { "epoch": 0.9218904488827367, "step": 9324, "train/total_loss": 0.08051140606403351 }, { "entropy": 8.71558952331543, "epoch": 0.9219893217322523, "mean_token_accuracy": 0.725784420967102, "num_tokens": 27731440.0, "step": 9325, "train/ce_loss": 0.9672392010688782 }, { "epoch": 0.9219893217322523, "step": 9325, "train/sim_loss": 0.03125 }, { "epoch": 0.9219893217322523, "step": 9325, "train/total_loss": 0.12797391414642334 }, { "entropy": 9.141233444213867, "epoch": 0.9220881945817678, "mean_token_accuracy": 0.7003105878829956, "num_tokens": 27736475.0, "step": 9326, "train/ce_loss": 1.083046555519104 }, { "epoch": 0.9220881945817678, "step": 9326, "train/sim_loss": 0.08203125 }, { "epoch": 0.9220881945817678, "step": 9326, "train/total_loss": 0.19033589959144592 }, { "entropy": 8.981075286865234, "epoch": 0.9221870674312834, "mean_token_accuracy": 0.7578125, "num_tokens": 27741557.0, "step": 9327, "train/ce_loss": 1.4328120946884155 }, { "epoch": 0.9221870674312834, "step": 9327, "train/sim_loss": 0.07421875 }, { "epoch": 0.9221870674312834, "step": 9327, "train/total_loss": 0.21749995648860931 }, { "entropy": 8.807497024536133, "epoch": 0.9222859402807989, "mean_token_accuracy": 0.7117486596107483, "num_tokens": 27746743.0, "step": 9328, "train/ce_loss": 0.545606255531311 }, { "epoch": 0.9222859402807989, "step": 9328, "train/sim_loss": 0.05078125 }, { "epoch": 0.9222859402807989, "step": 9328, "train/total_loss": 0.10534188151359558 }, { "entropy": 9.165096282958984, "epoch": 0.9223848131303144, "mean_token_accuracy": 0.7166324257850647, "num_tokens": 27751672.0, "step": 9329, "train/ce_loss": 1.3440909385681152 }, { "epoch": 0.9223848131303144, "step": 9329, "train/sim_loss": 0.03125 }, { "epoch": 0.9223848131303144, "step": 9329, "train/total_loss": 0.165659099817276 }, { "entropy": 9.130973815917969, "epoch": 0.92248368597983, "mean_token_accuracy": 0.7927756905555725, "num_tokens": 27756659.0, "step": 9330, "train/ce_loss": 0.8847929835319519 }, { "epoch": 0.92248368597983, "step": 9330, "train/sim_loss": 0.046875 }, { "epoch": 0.92248368597983, "step": 9330, "train/total_loss": 0.13535431027412415 }, { "entropy": 9.225261688232422, "epoch": 0.9225825588293455, "mean_token_accuracy": 0.8013244867324829, "num_tokens": 27761720.0, "step": 9331, "train/ce_loss": 1.2016048431396484 }, { "epoch": 0.9225825588293455, "step": 9331, "train/sim_loss": 0.02734375 }, { "epoch": 0.9225825588293455, "step": 9331, "train/total_loss": 0.14750424027442932 }, { "entropy": 9.045722007751465, "epoch": 0.922681431678861, "mean_token_accuracy": 0.7472222447395325, "num_tokens": 27766929.0, "step": 9332, "train/ce_loss": 0.681461751461029 }, { "epoch": 0.922681431678861, "step": 9332, "train/sim_loss": 0.0546875 }, { "epoch": 0.922681431678861, "step": 9332, "train/total_loss": 0.12283367663621902 }, { "entropy": 8.748706817626953, "epoch": 0.9227803045283766, "mean_token_accuracy": 0.6838790774345398, "num_tokens": 27772216.0, "step": 9333, "train/ce_loss": 0.8040033578872681 }, { "epoch": 0.9227803045283766, "step": 9333, "train/sim_loss": 0.04296875 }, { "epoch": 0.9227803045283766, "step": 9333, "train/total_loss": 0.12336909025907516 }, { "entropy": 9.276330947875977, "epoch": 0.922879177377892, "mean_token_accuracy": 0.6828479170799255, "num_tokens": 27777279.0, "step": 9334, "train/ce_loss": 2.368741007785502e-07 }, { "epoch": 0.922879177377892, "step": 9334, "train/sim_loss": 0.015625 }, { "epoch": 0.922879177377892, "step": 9334, "train/total_loss": 0.01562502421438694 }, { "entropy": 9.032341003417969, "epoch": 0.9229780502274075, "mean_token_accuracy": 0.7513736486434937, "num_tokens": 27782429.0, "step": 9335, "train/ce_loss": 0.8243290781974792 }, { "epoch": 0.9229780502274075, "step": 9335, "train/sim_loss": 0.02734375 }, { "epoch": 0.9229780502274075, "step": 9335, "train/total_loss": 0.10977666079998016 }, { "entropy": 8.510507583618164, "epoch": 0.9230769230769231, "mean_token_accuracy": 0.8008130192756653, "num_tokens": 27787892.0, "step": 9336, "train/ce_loss": 0.4929116368293762 }, { "epoch": 0.9230769230769231, "step": 9336, "train/sim_loss": 0.02734375 }, { "epoch": 0.9230769230769231, "step": 9336, "train/total_loss": 0.07663491368293762 }, { "entropy": 8.87246322631836, "epoch": 0.9231757959264386, "mean_token_accuracy": 0.7857961058616638, "num_tokens": 27793215.0, "step": 9337, "train/ce_loss": 0.6730414628982544 }, { "epoch": 0.9231757959264386, "step": 9337, "train/sim_loss": 0.0625 }, { "epoch": 0.9231757959264386, "step": 9337, "train/total_loss": 0.12980414927005768 }, { "entropy": 9.145835876464844, "epoch": 0.9232746687759541, "mean_token_accuracy": 0.7697160840034485, "num_tokens": 27798264.0, "step": 9338, "train/ce_loss": 0.790299117565155 }, { "epoch": 0.9232746687759541, "step": 9338, "train/sim_loss": 0.06640625 }, { "epoch": 0.9232746687759541, "step": 9338, "train/total_loss": 0.14543616771697998 }, { "entropy": 8.914115905761719, "epoch": 0.9233735416254697, "mean_token_accuracy": 0.7496671080589294, "num_tokens": 27803469.0, "step": 9339, "train/ce_loss": 0.8739914298057556 }, { "epoch": 0.9233735416254697, "step": 9339, "train/sim_loss": 0.09765625 }, { "epoch": 0.9233735416254697, "step": 9339, "train/total_loss": 0.18505540490150452 }, { "epoch": 0.9234724144749852, "grad_norm": 0.6647242903709412, "learning_rate": 7.69346783365475e-06, "loss": 0.1258, "step": 9340 }, { "entropy": 8.883416175842285, "epoch": 0.9234724144749852, "mean_token_accuracy": 0.7220026254653931, "num_tokens": 27808710.0, "step": 9340, "train/ce_loss": 1.3461934328079224 }, { "epoch": 0.9234724144749852, "step": 9340, "train/sim_loss": 0.04296875 }, { "epoch": 0.9234724144749852, "step": 9340, "train/total_loss": 0.17758809030056 }, { "entropy": 8.845987319946289, "epoch": 0.9235712873245007, "mean_token_accuracy": 0.8050633072853088, "num_tokens": 27813966.0, "step": 9341, "train/ce_loss": 0.6493423581123352 }, { "epoch": 0.9235712873245007, "step": 9341, "train/sim_loss": 0.0234375 }, { "epoch": 0.9235712873245007, "step": 9341, "train/total_loss": 0.08837173879146576 }, { "entropy": 8.537727355957031, "epoch": 0.9236701601740163, "mean_token_accuracy": 0.7448609471321106, "num_tokens": 27819236.0, "step": 9342, "train/ce_loss": 0.8598325252532959 }, { "epoch": 0.9236701601740163, "step": 9342, "train/sim_loss": 0.0390625 }, { "epoch": 0.9236701601740163, "step": 9342, "train/total_loss": 0.1250457465648651 }, { "entropy": 8.727884292602539, "epoch": 0.9237690330235317, "mean_token_accuracy": 0.7356608510017395, "num_tokens": 27824499.0, "step": 9343, "train/ce_loss": 0.755394697189331 }, { "epoch": 0.9237690330235317, "step": 9343, "train/sim_loss": 0.0546875 }, { "epoch": 0.9237690330235317, "step": 9343, "train/total_loss": 0.1302269697189331 }, { "entropy": 8.807491302490234, "epoch": 0.9238679058730472, "mean_token_accuracy": 0.761904776096344, "num_tokens": 27829734.0, "step": 9344, "train/ce_loss": 0.4971083402633667 }, { "epoch": 0.9238679058730472, "step": 9344, "train/sim_loss": 0.03125 }, { "epoch": 0.9238679058730472, "step": 9344, "train/total_loss": 0.08096083998680115 }, { "entropy": 8.90900993347168, "epoch": 0.9239667787225628, "mean_token_accuracy": 0.7629969716072083, "num_tokens": 27834842.0, "step": 9345, "train/ce_loss": 4.6224951688600413e-07 }, { "epoch": 0.9239667787225628, "step": 9345, "train/sim_loss": 0.04296875 }, { "epoch": 0.9239667787225628, "step": 9345, "train/total_loss": 0.04296879470348358 }, { "entropy": 8.945581436157227, "epoch": 0.9240656515720783, "mean_token_accuracy": 0.636623740196228, "num_tokens": 27840000.0, "step": 9346, "train/ce_loss": 1.802316665649414 }, { "epoch": 0.9240656515720783, "step": 9346, "train/sim_loss": 0.05078125 }, { "epoch": 0.9240656515720783, "step": 9346, "train/total_loss": 0.23101292550563812 }, { "entropy": 8.90842056274414, "epoch": 0.9241645244215938, "mean_token_accuracy": 0.7927536368370056, "num_tokens": 27845147.0, "step": 9347, "train/ce_loss": 1.3514710664749146 }, { "epoch": 0.9241645244215938, "step": 9347, "train/sim_loss": 0.0625 }, { "epoch": 0.9241645244215938, "step": 9347, "train/total_loss": 0.1976471096277237 }, { "entropy": 9.230728149414062, "epoch": 0.9242633972711094, "mean_token_accuracy": 0.7515337467193604, "num_tokens": 27850251.0, "step": 9348, "train/ce_loss": 0.6681864857673645 }, { "epoch": 0.9242633972711094, "step": 9348, "train/sim_loss": 0.02734375 }, { "epoch": 0.9242633972711094, "step": 9348, "train/total_loss": 0.09416239708662033 }, { "entropy": 8.170320510864258, "epoch": 0.9243622701206249, "mean_token_accuracy": 0.7357609868049622, "num_tokens": 27855784.0, "step": 9349, "train/ce_loss": 1.053152084350586 }, { "epoch": 0.9243622701206249, "step": 9349, "train/sim_loss": 0.08984375 }, { "epoch": 0.9243622701206249, "step": 9349, "train/total_loss": 0.1951589584350586 }, { "entropy": 8.387100219726562, "epoch": 0.9244611429701404, "mean_token_accuracy": 0.7791411280632019, "num_tokens": 27861374.0, "step": 9350, "train/ce_loss": 0.7524048089981079 }, { "epoch": 0.9244611429701404, "step": 9350, "train/sim_loss": 0.078125 }, { "epoch": 0.9244611429701404, "step": 9350, "train/total_loss": 0.15336549282073975 }, { "entropy": 8.531319618225098, "epoch": 0.924560015819656, "mean_token_accuracy": 0.7187127470970154, "num_tokens": 27866691.0, "step": 9351, "train/ce_loss": 1.0648727416992188 }, { "epoch": 0.924560015819656, "step": 9351, "train/sim_loss": 0.0546875 }, { "epoch": 0.924560015819656, "step": 9351, "train/total_loss": 0.16117477416992188 }, { "entropy": 8.408615112304688, "epoch": 0.9246588886691715, "mean_token_accuracy": 0.7593184113502502, "num_tokens": 27872086.0, "step": 9352, "train/ce_loss": 0.732154905796051 }, { "epoch": 0.9246588886691715, "step": 9352, "train/sim_loss": 0.05078125 }, { "epoch": 0.9246588886691715, "step": 9352, "train/total_loss": 0.12399674206972122 }, { "entropy": 8.75208854675293, "epoch": 0.9247577615186869, "mean_token_accuracy": 0.7730769515037537, "num_tokens": 27877323.0, "step": 9353, "train/ce_loss": 1.2357757091522217 }, { "epoch": 0.9247577615186869, "step": 9353, "train/sim_loss": 0.046875 }, { "epoch": 0.9247577615186869, "step": 9353, "train/total_loss": 0.1704525649547577 }, { "entropy": 8.690408706665039, "epoch": 0.9248566343682025, "mean_token_accuracy": 0.6970338821411133, "num_tokens": 27882606.0, "step": 9354, "train/ce_loss": 0.5237799286842346 }, { "epoch": 0.9248566343682025, "step": 9354, "train/sim_loss": 0.0625 }, { "epoch": 0.9248566343682025, "step": 9354, "train/total_loss": 0.11487799882888794 }, { "entropy": 8.586967468261719, "epoch": 0.924955507217718, "mean_token_accuracy": 0.7449495196342468, "num_tokens": 27887896.0, "step": 9355, "train/ce_loss": 0.8277714252471924 }, { "epoch": 0.924955507217718, "step": 9355, "train/sim_loss": 0.0625 }, { "epoch": 0.924955507217718, "step": 9355, "train/total_loss": 0.14527714252471924 }, { "entropy": 8.509919166564941, "epoch": 0.9250543800672335, "mean_token_accuracy": 0.7360350489616394, "num_tokens": 27893289.0, "step": 9356, "train/ce_loss": 0.7717025876045227 }, { "epoch": 0.9250543800672335, "step": 9356, "train/sim_loss": 0.03125 }, { "epoch": 0.9250543800672335, "step": 9356, "train/total_loss": 0.10842026025056839 }, { "entropy": 8.69703483581543, "epoch": 0.9251532529167491, "mean_token_accuracy": 0.7157190442085266, "num_tokens": 27898670.0, "step": 9357, "train/ce_loss": 1.3000333309173584 }, { "epoch": 0.9251532529167491, "step": 9357, "train/sim_loss": 0.0625 }, { "epoch": 0.9251532529167491, "step": 9357, "train/total_loss": 0.19250333309173584 }, { "entropy": 8.952316284179688, "epoch": 0.9252521257662646, "mean_token_accuracy": 0.8059490323066711, "num_tokens": 27903898.0, "step": 9358, "train/ce_loss": 0.3204611837863922 }, { "epoch": 0.9252521257662646, "step": 9358, "train/sim_loss": 0.01953125 }, { "epoch": 0.9252521257662646, "step": 9358, "train/total_loss": 0.0515773706138134 }, { "entropy": 9.340235710144043, "epoch": 0.9253509986157801, "mean_token_accuracy": 0.8120915293693542, "num_tokens": 27908961.0, "step": 9359, "train/ce_loss": 0.7203558683395386 }, { "epoch": 0.9253509986157801, "step": 9359, "train/sim_loss": 0.01953125 }, { "epoch": 0.9253509986157801, "step": 9359, "train/total_loss": 0.09156683832406998 }, { "epoch": 0.9254498714652957, "grad_norm": 0.6123246550559998, "learning_rate": 7.688522968896802e-06, "loss": 0.1322, "step": 9360 }, { "entropy": 8.673837661743164, "epoch": 0.9254498714652957, "mean_token_accuracy": 0.7361446022987366, "num_tokens": 27914459.0, "step": 9360, "train/ce_loss": 1.0571768283843994 }, { "epoch": 0.9254498714652957, "step": 9360, "train/sim_loss": 0.02734375 }, { "epoch": 0.9254498714652957, "step": 9360, "train/total_loss": 0.13306143879890442 }, { "entropy": 8.64249038696289, "epoch": 0.9255487443148112, "mean_token_accuracy": 0.7620252966880798, "num_tokens": 27919767.0, "step": 9361, "train/ce_loss": 0.5044848918914795 }, { "epoch": 0.9255487443148112, "step": 9361, "train/sim_loss": 0.05859375 }, { "epoch": 0.9255487443148112, "step": 9361, "train/total_loss": 0.10904224216938019 }, { "entropy": 8.920316696166992, "epoch": 0.9256476171643266, "mean_token_accuracy": 0.7136498689651489, "num_tokens": 27924860.0, "step": 9362, "train/ce_loss": 3.5302545597915014e-07 }, { "epoch": 0.9256476171643266, "step": 9362, "train/sim_loss": 0.0390625 }, { "epoch": 0.9256476171643266, "step": 9362, "train/total_loss": 0.039062533527612686 }, { "entropy": 9.195442199707031, "epoch": 0.9257464900138422, "mean_token_accuracy": 0.7482993006706238, "num_tokens": 27929869.0, "step": 9363, "train/ce_loss": 0.5305820107460022 }, { "epoch": 0.9257464900138422, "step": 9363, "train/sim_loss": 0.02734375 }, { "epoch": 0.9257464900138422, "step": 9363, "train/total_loss": 0.0804019570350647 }, { "entropy": 9.042174339294434, "epoch": 0.9258453628633577, "mean_token_accuracy": 0.776562511920929, "num_tokens": 27934975.0, "step": 9364, "train/ce_loss": 0.9903772473335266 }, { "epoch": 0.9258453628633577, "step": 9364, "train/sim_loss": 0.02734375 }, { "epoch": 0.9258453628633577, "step": 9364, "train/total_loss": 0.12638148665428162 }, { "entropy": 8.419917106628418, "epoch": 0.9259442357128732, "mean_token_accuracy": 0.7669452428817749, "num_tokens": 27940552.0, "step": 9365, "train/ce_loss": 0.4101727306842804 }, { "epoch": 0.9259442357128732, "step": 9365, "train/sim_loss": 0.015625 }, { "epoch": 0.9259442357128732, "step": 9365, "train/total_loss": 0.05664227530360222 }, { "entropy": 8.653945922851562, "epoch": 0.9260431085623888, "mean_token_accuracy": 0.7473053932189941, "num_tokens": 27945920.0, "step": 9366, "train/ce_loss": 0.6925281286239624 }, { "epoch": 0.9260431085623888, "step": 9366, "train/sim_loss": 0.05859375 }, { "epoch": 0.9260431085623888, "step": 9366, "train/total_loss": 0.12784656882286072 }, { "entropy": 9.342437744140625, "epoch": 0.9261419814119043, "mean_token_accuracy": 0.8017751574516296, "num_tokens": 27950700.0, "step": 9367, "train/ce_loss": 1.431895498171798e-06 }, { "epoch": 0.9261419814119043, "step": 9367, "train/sim_loss": 0.05859375 }, { "epoch": 0.9261419814119043, "step": 9367, "train/total_loss": 0.05859389156103134 }, { "entropy": 8.90158462524414, "epoch": 0.9262408542614198, "mean_token_accuracy": 0.7121464014053345, "num_tokens": 27955770.0, "step": 9368, "train/ce_loss": 1.037416696548462 }, { "epoch": 0.9262408542614198, "step": 9368, "train/sim_loss": 0.05078125 }, { "epoch": 0.9262408542614198, "step": 9368, "train/total_loss": 0.15452292561531067 }, { "entropy": 9.231480598449707, "epoch": 0.9263397271109354, "mean_token_accuracy": 0.7996219396591187, "num_tokens": 27960757.0, "step": 9369, "train/ce_loss": 1.3427621126174927 }, { "epoch": 0.9263397271109354, "step": 9369, "train/sim_loss": 0.09375 }, { "epoch": 0.9263397271109354, "step": 9369, "train/total_loss": 0.22802621126174927 }, { "entropy": 9.150675773620605, "epoch": 0.9264385999604509, "mean_token_accuracy": 0.6822916865348816, "num_tokens": 27965964.0, "step": 9370, "train/ce_loss": 1.2984963655471802 }, { "epoch": 0.9264385999604509, "step": 9370, "train/sim_loss": 0.08984375 }, { "epoch": 0.9264385999604509, "step": 9370, "train/total_loss": 0.2196933925151825 }, { "entropy": 8.527593612670898, "epoch": 0.9265374728099663, "mean_token_accuracy": 0.7487179636955261, "num_tokens": 27971450.0, "step": 9371, "train/ce_loss": 0.7690181732177734 }, { "epoch": 0.9265374728099663, "step": 9371, "train/sim_loss": 0.02734375 }, { "epoch": 0.9265374728099663, "step": 9371, "train/total_loss": 0.10424556583166122 }, { "entropy": 9.013280868530273, "epoch": 0.9266363456594819, "mean_token_accuracy": 0.7367668151855469, "num_tokens": 27976582.0, "step": 9372, "train/ce_loss": 0.826474666595459 }, { "epoch": 0.9266363456594819, "step": 9372, "train/sim_loss": 0.109375 }, { "epoch": 0.9266363456594819, "step": 9372, "train/total_loss": 0.19202247262001038 }, { "entropy": 9.376861572265625, "epoch": 0.9267352185089974, "mean_token_accuracy": 0.631130039691925, "num_tokens": 27981476.0, "step": 9373, "train/ce_loss": 1.5703624486923218 }, { "epoch": 0.9267352185089974, "step": 9373, "train/sim_loss": 0.0390625 }, { "epoch": 0.9267352185089974, "step": 9373, "train/total_loss": 0.19609874486923218 }, { "entropy": 9.084300994873047, "epoch": 0.9268340913585129, "mean_token_accuracy": 0.7346647381782532, "num_tokens": 27986643.0, "step": 9374, "train/ce_loss": 1.1067943572998047 }, { "epoch": 0.9268340913585129, "step": 9374, "train/sim_loss": 0.07421875 }, { "epoch": 0.9268340913585129, "step": 9374, "train/total_loss": 0.18489819765090942 }, { "entropy": 8.769912719726562, "epoch": 0.9269329642080285, "mean_token_accuracy": 0.73072749376297, "num_tokens": 27992034.0, "step": 9375, "train/ce_loss": 0.39563554525375366 }, { "epoch": 0.9269329642080285, "step": 9375, "train/sim_loss": 0.02734375 }, { "epoch": 0.9269329642080285, "step": 9375, "train/total_loss": 0.06690730154514313 }, { "entropy": 8.79995346069336, "epoch": 0.927031837057544, "mean_token_accuracy": 0.7369047403335571, "num_tokens": 27997372.0, "step": 9376, "train/ce_loss": 1.4369641542434692 }, { "epoch": 0.927031837057544, "step": 9376, "train/sim_loss": 0.09375 }, { "epoch": 0.927031837057544, "step": 9376, "train/total_loss": 0.23744641244411469 }, { "entropy": 8.828529357910156, "epoch": 0.9271307099070595, "mean_token_accuracy": 0.6910377144813538, "num_tokens": 28002695.0, "step": 9377, "train/ce_loss": 1.3177534341812134 }, { "epoch": 0.9271307099070595, "step": 9377, "train/sim_loss": 0.046875 }, { "epoch": 0.9271307099070595, "step": 9377, "train/total_loss": 0.17865034937858582 }, { "entropy": 9.035560607910156, "epoch": 0.9272295827565751, "mean_token_accuracy": 0.7108792662620544, "num_tokens": 28007804.0, "step": 9378, "train/ce_loss": 1.0736099481582642 }, { "epoch": 0.9272295827565751, "step": 9378, "train/sim_loss": 0.03125 }, { "epoch": 0.9272295827565751, "step": 9378, "train/total_loss": 0.13861098885536194 }, { "entropy": 9.269739151000977, "epoch": 0.9273284556060906, "mean_token_accuracy": 0.7559523582458496, "num_tokens": 28012757.0, "step": 9379, "train/ce_loss": 3.0842841169942403e-07 }, { "epoch": 0.9273284556060906, "step": 9379, "train/sim_loss": 0.0234375 }, { "epoch": 0.9273284556060906, "step": 9379, "train/total_loss": 0.023437531664967537 }, { "epoch": 0.927427328455606, "grad_norm": 0.6609386801719666, "learning_rate": 7.683578104138852e-06, "loss": 0.1385, "step": 9380 }, { "entropy": 9.316780090332031, "epoch": 0.927427328455606, "mean_token_accuracy": 0.7272727489471436, "num_tokens": 28017765.0, "step": 9380, "train/ce_loss": 1.2086098194122314 }, { "epoch": 0.927427328455606, "step": 9380, "train/sim_loss": 0.1171875 }, { "epoch": 0.927427328455606, "step": 9380, "train/total_loss": 0.2380484938621521 }, { "entropy": 8.851558685302734, "epoch": 0.9275262013051216, "mean_token_accuracy": 0.7346938848495483, "num_tokens": 28022978.0, "step": 9381, "train/ce_loss": 0.7256733179092407 }, { "epoch": 0.9275262013051216, "step": 9381, "train/sim_loss": 0.0234375 }, { "epoch": 0.9275262013051216, "step": 9381, "train/total_loss": 0.09600483626127243 }, { "entropy": 8.732415199279785, "epoch": 0.9276250741546371, "mean_token_accuracy": 0.6887417435646057, "num_tokens": 28028004.0, "step": 9382, "train/ce_loss": 1.2488353252410889 }, { "epoch": 0.9276250741546371, "step": 9382, "train/sim_loss": 0.046875 }, { "epoch": 0.9276250741546371, "step": 9382, "train/total_loss": 0.1717585325241089 }, { "entropy": 8.534761428833008, "epoch": 0.9277239470041526, "mean_token_accuracy": 0.6759545803070068, "num_tokens": 28033425.0, "step": 9383, "train/ce_loss": 0.968550980091095 }, { "epoch": 0.9277239470041526, "step": 9383, "train/sim_loss": 0.046875 }, { "epoch": 0.9277239470041526, "step": 9383, "train/total_loss": 0.14373010396957397 }, { "entropy": 8.801989555358887, "epoch": 0.9278228198536682, "mean_token_accuracy": 0.824999988079071, "num_tokens": 28038650.0, "step": 9384, "train/ce_loss": 0.741737425327301 }, { "epoch": 0.9278228198536682, "step": 9384, "train/sim_loss": 0.02734375 }, { "epoch": 0.9278228198536682, "step": 9384, "train/total_loss": 0.10151749104261398 }, { "entropy": 8.70286750793457, "epoch": 0.9279216927031837, "mean_token_accuracy": 0.7546961307525635, "num_tokens": 28043969.0, "step": 9385, "train/ce_loss": 0.5636538863182068 }, { "epoch": 0.9279216927031837, "step": 9385, "train/sim_loss": 0.01953125 }, { "epoch": 0.9279216927031837, "step": 9385, "train/total_loss": 0.07589663565158844 }, { "entropy": 8.636743545532227, "epoch": 0.9280205655526992, "mean_token_accuracy": 0.767208993434906, "num_tokens": 28049265.0, "step": 9386, "train/ce_loss": 0.4064827561378479 }, { "epoch": 0.9280205655526992, "step": 9386, "train/sim_loss": 0.046875 }, { "epoch": 0.9280205655526992, "step": 9386, "train/total_loss": 0.08752328157424927 }, { "entropy": 8.807705879211426, "epoch": 0.9281194384022148, "mean_token_accuracy": 0.7386634945869446, "num_tokens": 28054567.0, "step": 9387, "train/ce_loss": 1.0495035648345947 }, { "epoch": 0.9281194384022148, "step": 9387, "train/sim_loss": 0.02734375 }, { "epoch": 0.9281194384022148, "step": 9387, "train/total_loss": 0.13229411840438843 }, { "entropy": 9.348652839660645, "epoch": 0.9282183112517303, "mean_token_accuracy": 0.7552182078361511, "num_tokens": 28059544.0, "step": 9388, "train/ce_loss": 1.1646846532821655 }, { "epoch": 0.9282183112517303, "step": 9388, "train/sim_loss": 0.01953125 }, { "epoch": 0.9282183112517303, "step": 9388, "train/total_loss": 0.13599970936775208 }, { "entropy": 9.150012016296387, "epoch": 0.9283171841012458, "mean_token_accuracy": 0.6988906264305115, "num_tokens": 28064639.0, "step": 9389, "train/ce_loss": 2.826244838161074e-07 }, { "epoch": 0.9283171841012458, "step": 9389, "train/sim_loss": 0.0078125 }, { "epoch": 0.9283171841012458, "step": 9389, "train/total_loss": 0.007812527939677238 }, { "entropy": 9.319042205810547, "epoch": 0.9284160569507613, "mean_token_accuracy": 0.8239316344261169, "num_tokens": 28069650.0, "step": 9390, "train/ce_loss": 0.6236292719841003 }, { "epoch": 0.9284160569507613, "step": 9390, "train/sim_loss": 0.015625 }, { "epoch": 0.9284160569507613, "step": 9390, "train/total_loss": 0.0779879242181778 }, { "entropy": 8.673835754394531, "epoch": 0.9285149298002768, "mean_token_accuracy": 0.7916136980056763, "num_tokens": 28074852.0, "step": 9391, "train/ce_loss": 0.8313379287719727 }, { "epoch": 0.9285149298002768, "step": 9391, "train/sim_loss": 0.05078125 }, { "epoch": 0.9285149298002768, "step": 9391, "train/total_loss": 0.1339150369167328 }, { "entropy": 9.542570114135742, "epoch": 0.9286138026497923, "mean_token_accuracy": 0.7303664684295654, "num_tokens": 28079667.0, "step": 9392, "train/ce_loss": 9.037328823069402e-07 }, { "epoch": 0.9286138026497923, "step": 9392, "train/sim_loss": 0.03125 }, { "epoch": 0.9286138026497923, "step": 9392, "train/total_loss": 0.03125008940696716 }, { "entropy": 8.512406349182129, "epoch": 0.9287126754993079, "mean_token_accuracy": 0.7486457228660583, "num_tokens": 28085098.0, "step": 9393, "train/ce_loss": 0.8508886694908142 }, { "epoch": 0.9287126754993079, "step": 9393, "train/sim_loss": 0.08203125 }, { "epoch": 0.9287126754993079, "step": 9393, "train/total_loss": 0.16712012887001038 }, { "entropy": 8.425409317016602, "epoch": 0.9288115483488234, "mean_token_accuracy": 0.7757973670959473, "num_tokens": 28090656.0, "step": 9394, "train/ce_loss": 0.9109476208686829 }, { "epoch": 0.9288115483488234, "step": 9394, "train/sim_loss": 0.0703125 }, { "epoch": 0.9288115483488234, "step": 9394, "train/total_loss": 0.1614072620868683 }, { "entropy": 8.87820816040039, "epoch": 0.9289104211983389, "mean_token_accuracy": 0.8025157451629639, "num_tokens": 28095867.0, "step": 9395, "train/ce_loss": 5.457933411889826e-07 }, { "epoch": 0.9289104211983389, "step": 9395, "train/sim_loss": 0.03125 }, { "epoch": 0.9289104211983389, "step": 9395, "train/total_loss": 0.03125005587935448 }, { "entropy": 8.390907287597656, "epoch": 0.9290092940478545, "mean_token_accuracy": 0.7643064856529236, "num_tokens": 28101557.0, "step": 9396, "train/ce_loss": 0.4423627257347107 }, { "epoch": 0.9290092940478545, "step": 9396, "train/sim_loss": 0.01171875 }, { "epoch": 0.9290092940478545, "step": 9396, "train/total_loss": 0.05595502257347107 }, { "entropy": 8.457071304321289, "epoch": 0.92910816689737, "mean_token_accuracy": 0.7335984110832214, "num_tokens": 28107049.0, "step": 9397, "train/ce_loss": 1.928679347038269 }, { "epoch": 0.92910816689737, "step": 9397, "train/sim_loss": 0.0859375 }, { "epoch": 0.92910816689737, "step": 9397, "train/total_loss": 0.2788054347038269 }, { "entropy": 8.952810287475586, "epoch": 0.9292070397468855, "mean_token_accuracy": 0.7200646996498108, "num_tokens": 28112238.0, "step": 9398, "train/ce_loss": 1.202625036239624 }, { "epoch": 0.9292070397468855, "step": 9398, "train/sim_loss": 0.0625 }, { "epoch": 0.9292070397468855, "step": 9398, "train/total_loss": 0.1827625036239624 }, { "entropy": 8.893738746643066, "epoch": 0.929305912596401, "mean_token_accuracy": 0.7468531727790833, "num_tokens": 28117434.0, "step": 9399, "train/ce_loss": 0.9753656983375549 }, { "epoch": 0.929305912596401, "step": 9399, "train/sim_loss": 0.0859375 }, { "epoch": 0.929305912596401, "step": 9399, "train/total_loss": 0.18347406387329102 }, { "epoch": 0.9294047854459165, "grad_norm": 0.6104568243026733, "learning_rate": 7.678633239380904e-06, "loss": 0.1266, "step": 9400 }, { "entropy": 9.174907684326172, "epoch": 0.9294047854459165, "mean_token_accuracy": 0.6466974020004272, "num_tokens": 28122494.0, "step": 9400, "train/ce_loss": 1.2414363622665405 }, { "epoch": 0.9294047854459165, "step": 9400, "train/sim_loss": 0.046875 }, { "epoch": 0.9294047854459165, "step": 9400, "train/total_loss": 0.17101863026618958 }, { "entropy": 8.795722961425781, "epoch": 0.929503658295432, "mean_token_accuracy": 0.7110266089439392, "num_tokens": 28127718.0, "step": 9401, "train/ce_loss": 0.6686466336250305 }, { "epoch": 0.929503658295432, "step": 9401, "train/sim_loss": 0.0703125 }, { "epoch": 0.929503658295432, "step": 9401, "train/total_loss": 0.13717716932296753 }, { "entropy": 8.616256713867188, "epoch": 0.9296025311449476, "mean_token_accuracy": 0.7277277112007141, "num_tokens": 28133177.0, "step": 9402, "train/ce_loss": 0.44011417031288147 }, { "epoch": 0.9296025311449476, "step": 9402, "train/sim_loss": 0.02734375 }, { "epoch": 0.9296025311449476, "step": 9402, "train/total_loss": 0.07135516405105591 }, { "entropy": 8.786222457885742, "epoch": 0.9297014039944631, "mean_token_accuracy": 0.774631917476654, "num_tokens": 28138520.0, "step": 9403, "train/ce_loss": 0.43570011854171753 }, { "epoch": 0.9297014039944631, "step": 9403, "train/sim_loss": 0.01171875 }, { "epoch": 0.9297014039944631, "step": 9403, "train/total_loss": 0.05528876185417175 }, { "entropy": 8.698478698730469, "epoch": 0.9298002768439786, "mean_token_accuracy": 0.7355072498321533, "num_tokens": 28143834.0, "step": 9404, "train/ce_loss": 1.078285813331604 }, { "epoch": 0.9298002768439786, "step": 9404, "train/sim_loss": 0.05859375 }, { "epoch": 0.9298002768439786, "step": 9404, "train/total_loss": 0.16642233729362488 }, { "entropy": 8.438079833984375, "epoch": 0.9298991496934942, "mean_token_accuracy": 0.7645788192749023, "num_tokens": 28149162.0, "step": 9405, "train/ce_loss": 0.39540475606918335 }, { "epoch": 0.9298991496934942, "step": 9405, "train/sim_loss": 0.0625 }, { "epoch": 0.9298991496934942, "step": 9405, "train/total_loss": 0.10204047709703445 }, { "entropy": 9.068194389343262, "epoch": 0.9299980225430097, "mean_token_accuracy": 0.7638484239578247, "num_tokens": 28154305.0, "step": 9406, "train/ce_loss": 0.7043539881706238 }, { "epoch": 0.9299980225430097, "step": 9406, "train/sim_loss": 0.0546875 }, { "epoch": 0.9299980225430097, "step": 9406, "train/total_loss": 0.12512290477752686 }, { "entropy": 8.68542766571045, "epoch": 0.9300968953925252, "mean_token_accuracy": 0.7389221787452698, "num_tokens": 28159634.0, "step": 9407, "train/ce_loss": 1.0234767198562622 }, { "epoch": 0.9300968953925252, "step": 9407, "train/sim_loss": 0.08203125 }, { "epoch": 0.9300968953925252, "step": 9407, "train/total_loss": 0.18437892198562622 }, { "entropy": 9.482927322387695, "epoch": 0.9301957682420408, "mean_token_accuracy": 0.7506775259971619, "num_tokens": 28164412.0, "step": 9408, "train/ce_loss": 4.923381311527919e-07 }, { "epoch": 0.9301957682420408, "step": 9408, "train/sim_loss": 0.0234375 }, { "epoch": 0.9301957682420408, "step": 9408, "train/total_loss": 0.02343754842877388 }, { "entropy": 9.366888046264648, "epoch": 0.9302946410915562, "mean_token_accuracy": 0.7749999761581421, "num_tokens": 28169334.0, "step": 9409, "train/ce_loss": 1.7896606922149658 }, { "epoch": 0.9302946410915562, "step": 9409, "train/sim_loss": 0.12890625 }, { "epoch": 0.9302946410915562, "step": 9409, "train/total_loss": 0.30787232518196106 }, { "entropy": 8.677665710449219, "epoch": 0.9303935139410718, "mean_token_accuracy": 0.7423912882804871, "num_tokens": 28174760.0, "step": 9410, "train/ce_loss": 0.6476882100105286 }, { "epoch": 0.9303935139410718, "step": 9410, "train/sim_loss": 0.0390625 }, { "epoch": 0.9303935139410718, "step": 9410, "train/total_loss": 0.10383132100105286 }, { "entropy": 8.895466804504395, "epoch": 0.9304923867905873, "mean_token_accuracy": 0.7579972147941589, "num_tokens": 28179943.0, "step": 9411, "train/ce_loss": 1.3835035562515259 }, { "epoch": 0.9304923867905873, "step": 9411, "train/sim_loss": 0.0859375 }, { "epoch": 0.9304923867905873, "step": 9411, "train/total_loss": 0.22428785264492035 }, { "entropy": 9.131093978881836, "epoch": 0.9305912596401028, "mean_token_accuracy": 0.7207207083702087, "num_tokens": 28185245.0, "step": 9412, "train/ce_loss": 0.8876646757125854 }, { "epoch": 0.9305912596401028, "step": 9412, "train/sim_loss": 0.0703125 }, { "epoch": 0.9305912596401028, "step": 9412, "train/total_loss": 0.15907897055149078 }, { "entropy": 9.342243194580078, "epoch": 0.9306901324896184, "mean_token_accuracy": 0.75, "num_tokens": 28190261.0, "step": 9413, "train/ce_loss": 0.8801150321960449 }, { "epoch": 0.9306901324896184, "step": 9413, "train/sim_loss": 0.0546875 }, { "epoch": 0.9306901324896184, "step": 9413, "train/total_loss": 0.1426990032196045 }, { "entropy": 8.698667526245117, "epoch": 0.9307890053391339, "mean_token_accuracy": 0.7549019455909729, "num_tokens": 28195539.0, "step": 9414, "train/ce_loss": 1.0844625234603882 }, { "epoch": 0.9307890053391339, "step": 9414, "train/sim_loss": 0.046875 }, { "epoch": 0.9307890053391339, "step": 9414, "train/total_loss": 0.15532125532627106 }, { "entropy": 8.73432445526123, "epoch": 0.9308878781886494, "mean_token_accuracy": 0.756898820400238, "num_tokens": 28200778.0, "step": 9415, "train/ce_loss": 0.7432786226272583 }, { "epoch": 0.9308878781886494, "step": 9415, "train/sim_loss": 0.0546875 }, { "epoch": 0.9308878781886494, "step": 9415, "train/total_loss": 0.12901535630226135 }, { "entropy": 8.85866928100586, "epoch": 0.930986751038165, "mean_token_accuracy": 0.6936936974525452, "num_tokens": 28205941.0, "step": 9416, "train/ce_loss": 2.242464065551758 }, { "epoch": 0.930986751038165, "step": 9416, "train/sim_loss": 0.07421875 }, { "epoch": 0.930986751038165, "step": 9416, "train/total_loss": 0.29846516251564026 }, { "entropy": 8.43039321899414, "epoch": 0.9310856238876805, "mean_token_accuracy": 0.7508090734481812, "num_tokens": 28211377.0, "step": 9417, "train/ce_loss": 0.8832802176475525 }, { "epoch": 0.9310856238876805, "step": 9417, "train/sim_loss": 0.05859375 }, { "epoch": 0.9310856238876805, "step": 9417, "train/total_loss": 0.1469217836856842 }, { "entropy": 8.777997970581055, "epoch": 0.9311844967371959, "mean_token_accuracy": 0.7008149027824402, "num_tokens": 28216684.0, "step": 9418, "train/ce_loss": 0.7578139305114746 }, { "epoch": 0.9311844967371959, "step": 9418, "train/sim_loss": 0.046875 }, { "epoch": 0.9311844967371959, "step": 9418, "train/total_loss": 0.12265639752149582 }, { "entropy": 9.463607788085938, "epoch": 0.9312833695867115, "mean_token_accuracy": 0.7788461446762085, "num_tokens": 28221485.0, "step": 9419, "train/ce_loss": 1.238187313079834 }, { "epoch": 0.9312833695867115, "step": 9419, "train/sim_loss": 0.04296875 }, { "epoch": 0.9312833695867115, "step": 9419, "train/total_loss": 0.16678747534751892 }, { "epoch": 0.931382242436227, "grad_norm": 0.8246861100196838, "learning_rate": 7.673688374622955e-06, "loss": 0.1339, "step": 9420 }, { "entropy": 8.770391464233398, "epoch": 0.931382242436227, "mean_token_accuracy": 0.7437499761581421, "num_tokens": 28226591.0, "step": 9420, "train/ce_loss": 1.1773754358291626 }, { "epoch": 0.931382242436227, "step": 9420, "train/sim_loss": 0.1171875 }, { "epoch": 0.931382242436227, "step": 9420, "train/total_loss": 0.2349250465631485 }, { "entropy": 8.86601448059082, "epoch": 0.9314811152857425, "mean_token_accuracy": 0.7668097019195557, "num_tokens": 28231728.0, "step": 9421, "train/ce_loss": 0.9965757727622986 }, { "epoch": 0.9314811152857425, "step": 9421, "train/sim_loss": 0.0390625 }, { "epoch": 0.9314811152857425, "step": 9421, "train/total_loss": 0.1387200802564621 }, { "entropy": 9.105216979980469, "epoch": 0.9315799881352581, "mean_token_accuracy": 0.7777777910232544, "num_tokens": 28236718.0, "step": 9422, "train/ce_loss": 1.0376790761947632 }, { "epoch": 0.9315799881352581, "step": 9422, "train/sim_loss": 0.0703125 }, { "epoch": 0.9315799881352581, "step": 9422, "train/total_loss": 0.17408040165901184 }, { "entropy": 8.490823745727539, "epoch": 0.9316788609847736, "mean_token_accuracy": 0.7954047918319702, "num_tokens": 28242119.0, "step": 9423, "train/ce_loss": 0.6964000463485718 }, { "epoch": 0.9316788609847736, "step": 9423, "train/sim_loss": 0.05078125 }, { "epoch": 0.9316788609847736, "step": 9423, "train/total_loss": 0.12042125314474106 }, { "entropy": 8.82767105102539, "epoch": 0.9317777338342891, "mean_token_accuracy": 0.7571428418159485, "num_tokens": 28247158.0, "step": 9424, "train/ce_loss": 3.042653418106056e-07 }, { "epoch": 0.9317777338342891, "step": 9424, "train/sim_loss": 0.0390625 }, { "epoch": 0.9317777338342891, "step": 9424, "train/total_loss": 0.03906252980232239 }, { "entropy": 8.905559539794922, "epoch": 0.9318766066838047, "mean_token_accuracy": 0.7159383296966553, "num_tokens": 28252397.0, "step": 9425, "train/ce_loss": 1.7327330112457275 }, { "epoch": 0.9318766066838047, "step": 9425, "train/sim_loss": 0.0703125 }, { "epoch": 0.9318766066838047, "step": 9425, "train/total_loss": 0.24358581006526947 }, { "entropy": 8.280866622924805, "epoch": 0.9319754795333202, "mean_token_accuracy": 0.8251879811286926, "num_tokens": 28257970.0, "step": 9426, "train/ce_loss": 0.32057952880859375 }, { "epoch": 0.9319754795333202, "step": 9426, "train/sim_loss": 0.01171875 }, { "epoch": 0.9319754795333202, "step": 9426, "train/total_loss": 0.043776702135801315 }, { "entropy": 8.459920883178711, "epoch": 0.9320743523828356, "mean_token_accuracy": 0.7546992301940918, "num_tokens": 28263529.0, "step": 9427, "train/ce_loss": 0.8341599702835083 }, { "epoch": 0.9320743523828356, "step": 9427, "train/sim_loss": 0.05078125 }, { "epoch": 0.9320743523828356, "step": 9427, "train/total_loss": 0.13419725000858307 }, { "entropy": 8.291421890258789, "epoch": 0.9321732252323512, "mean_token_accuracy": 0.7515856027603149, "num_tokens": 28268893.0, "step": 9428, "train/ce_loss": 0.6774831414222717 }, { "epoch": 0.9321732252323512, "step": 9428, "train/sim_loss": 0.046875 }, { "epoch": 0.9321732252323512, "step": 9428, "train/total_loss": 0.11462331563234329 }, { "entropy": 8.916852951049805, "epoch": 0.9322720980818667, "mean_token_accuracy": 0.7021276354789734, "num_tokens": 28274092.0, "step": 9429, "train/ce_loss": 1.4796149730682373 }, { "epoch": 0.9322720980818667, "step": 9429, "train/sim_loss": 0.07421875 }, { "epoch": 0.9322720980818667, "step": 9429, "train/total_loss": 0.22218024730682373 }, { "entropy": 8.927335739135742, "epoch": 0.9323709709313822, "mean_token_accuracy": 0.7384615540504456, "num_tokens": 28279173.0, "step": 9430, "train/ce_loss": 5.237801019575272e-07 }, { "epoch": 0.9323709709313822, "step": 9430, "train/sim_loss": 0.0390625 }, { "epoch": 0.9323709709313822, "step": 9430, "train/total_loss": 0.03906255215406418 }, { "entropy": 8.914237976074219, "epoch": 0.9324698437808978, "mean_token_accuracy": 0.7145135402679443, "num_tokens": 28284245.0, "step": 9431, "train/ce_loss": 7.544079494437028e-07 }, { "epoch": 0.9324698437808978, "step": 9431, "train/sim_loss": 0.0546875 }, { "epoch": 0.9324698437808978, "step": 9431, "train/total_loss": 0.05468757450580597 }, { "entropy": 9.386693954467773, "epoch": 0.9325687166304133, "mean_token_accuracy": 0.8069105744361877, "num_tokens": 28289149.0, "step": 9432, "train/ce_loss": 2.0158906011147337e-07 }, { "epoch": 0.9325687166304133, "step": 9432, "train/sim_loss": 0.02734375 }, { "epoch": 0.9325687166304133, "step": 9432, "train/total_loss": 0.02734377048909664 }, { "entropy": 8.992332458496094, "epoch": 0.9326675894799288, "mean_token_accuracy": 0.7102941274642944, "num_tokens": 28294284.0, "step": 9433, "train/ce_loss": 3.898809097790945e-07 }, { "epoch": 0.9326675894799288, "step": 9433, "train/sim_loss": 0.0390625 }, { "epoch": 0.9326675894799288, "step": 9433, "train/total_loss": 0.039062537252902985 }, { "entropy": 8.570638656616211, "epoch": 0.9327664623294444, "mean_token_accuracy": 0.7350119948387146, "num_tokens": 28299590.0, "step": 9434, "train/ce_loss": 1.2431139945983887 }, { "epoch": 0.9327664623294444, "step": 9434, "train/sim_loss": 0.02734375 }, { "epoch": 0.9327664623294444, "step": 9434, "train/total_loss": 0.1516551524400711 }, { "entropy": 9.032379150390625, "epoch": 0.9328653351789599, "mean_token_accuracy": 0.7147335410118103, "num_tokens": 28304706.0, "step": 9435, "train/ce_loss": 1.2679431438446045 }, { "epoch": 0.9328653351789599, "step": 9435, "train/sim_loss": 0.0546875 }, { "epoch": 0.9328653351789599, "step": 9435, "train/total_loss": 0.18148182332515717 }, { "entropy": 8.528156280517578, "epoch": 0.9329642080284754, "mean_token_accuracy": 0.7256729006767273, "num_tokens": 28310092.0, "step": 9436, "train/ce_loss": 1.0612057447433472 }, { "epoch": 0.9329642080284754, "step": 9436, "train/sim_loss": 0.0546875 }, { "epoch": 0.9329642080284754, "step": 9436, "train/total_loss": 0.16080808639526367 }, { "entropy": 9.329557418823242, "epoch": 0.933063080877991, "mean_token_accuracy": 0.7918455004692078, "num_tokens": 28315002.0, "step": 9437, "train/ce_loss": 1.523861289024353 }, { "epoch": 0.933063080877991, "step": 9437, "train/sim_loss": 0.0546875 }, { "epoch": 0.933063080877991, "step": 9437, "train/total_loss": 0.2070736289024353 }, { "entropy": 8.956287384033203, "epoch": 0.9331619537275064, "mean_token_accuracy": 0.8003876209259033, "num_tokens": 28319910.0, "step": 9438, "train/ce_loss": 6.311527727120847e-07 }, { "epoch": 0.9331619537275064, "step": 9438, "train/sim_loss": 0.0390625 }, { "epoch": 0.9331619537275064, "step": 9438, "train/total_loss": 0.039062563329935074 }, { "entropy": 9.165496826171875, "epoch": 0.9332608265770219, "mean_token_accuracy": 0.6877256035804749, "num_tokens": 28324890.0, "step": 9439, "train/ce_loss": 2.1929569244384766 }, { "epoch": 0.9332608265770219, "step": 9439, "train/sim_loss": 0.0859375 }, { "epoch": 0.9332608265770219, "step": 9439, "train/total_loss": 0.3052331805229187 }, { "epoch": 0.9333596994265375, "grad_norm": 0.7782477736473083, "learning_rate": 7.668743509865005e-06, "loss": 0.1292, "step": 9440 }, { "entropy": 9.423730850219727, "epoch": 0.9333596994265375, "mean_token_accuracy": 0.6735632419586182, "num_tokens": 28329728.0, "step": 9440, "train/ce_loss": 6.598913273592188e-07 }, { "epoch": 0.9333596994265375, "step": 9440, "train/sim_loss": 0.0546875 }, { "epoch": 0.9333596994265375, "step": 9440, "train/total_loss": 0.05468756705522537 }, { "entropy": 8.39457893371582, "epoch": 0.933458572276053, "mean_token_accuracy": 0.8268858790397644, "num_tokens": 28335274.0, "step": 9441, "train/ce_loss": 0.6306044459342957 }, { "epoch": 0.933458572276053, "step": 9441, "train/sim_loss": 0.05078125 }, { "epoch": 0.933458572276053, "step": 9441, "train/total_loss": 0.1138416975736618 }, { "entropy": 8.52895736694336, "epoch": 0.9335574451255685, "mean_token_accuracy": 0.7826552391052246, "num_tokens": 28340661.0, "step": 9442, "train/ce_loss": 0.7003907561302185 }, { "epoch": 0.9335574451255685, "step": 9442, "train/sim_loss": 0.046875 }, { "epoch": 0.9335574451255685, "step": 9442, "train/total_loss": 0.11691407859325409 }, { "entropy": 8.414928436279297, "epoch": 0.9336563179750841, "mean_token_accuracy": 0.7371188402175903, "num_tokens": 28346164.0, "step": 9443, "train/ce_loss": 1.0276933908462524 }, { "epoch": 0.9336563179750841, "step": 9443, "train/sim_loss": 0.046875 }, { "epoch": 0.9336563179750841, "step": 9443, "train/total_loss": 0.14964434504508972 }, { "entropy": 8.391592025756836, "epoch": 0.9337551908245996, "mean_token_accuracy": 0.6993339657783508, "num_tokens": 28351747.0, "step": 9444, "train/ce_loss": 1.360185980796814 }, { "epoch": 0.9337551908245996, "step": 9444, "train/sim_loss": 0.0703125 }, { "epoch": 0.9337551908245996, "step": 9444, "train/total_loss": 0.20633110404014587 }, { "entropy": 9.084711074829102, "epoch": 0.933854063674115, "mean_token_accuracy": 0.7538226246833801, "num_tokens": 28356847.0, "step": 9445, "train/ce_loss": 0.840169370174408 }, { "epoch": 0.933854063674115, "step": 9445, "train/sim_loss": 0.03125 }, { "epoch": 0.933854063674115, "step": 9445, "train/total_loss": 0.11526694148778915 }, { "entropy": 8.835979461669922, "epoch": 0.9339529365236307, "mean_token_accuracy": 0.7090908885002136, "num_tokens": 28361968.0, "step": 9446, "train/ce_loss": 1.1038906574249268 }, { "epoch": 0.9339529365236307, "step": 9446, "train/sim_loss": 0.0234375 }, { "epoch": 0.9339529365236307, "step": 9446, "train/total_loss": 0.13382656872272491 }, { "entropy": 8.689046859741211, "epoch": 0.9340518093731461, "mean_token_accuracy": 0.7245509028434753, "num_tokens": 28367278.0, "step": 9447, "train/ce_loss": 0.9410738348960876 }, { "epoch": 0.9340518093731461, "step": 9447, "train/sim_loss": 0.0625 }, { "epoch": 0.9340518093731461, "step": 9447, "train/total_loss": 0.15660738945007324 }, { "entropy": 9.413528442382812, "epoch": 0.9341506822226616, "mean_token_accuracy": 0.7511013150215149, "num_tokens": 28372140.0, "step": 9448, "train/ce_loss": 1.5572891235351562 }, { "epoch": 0.9341506822226616, "step": 9448, "train/sim_loss": 0.046875 }, { "epoch": 0.9341506822226616, "step": 9448, "train/total_loss": 0.20260392129421234 }, { "entropy": 8.6624755859375, "epoch": 0.9342495550721772, "mean_token_accuracy": 0.7384230494499207, "num_tokens": 28377348.0, "step": 9449, "train/ce_loss": 0.7418690323829651 }, { "epoch": 0.9342495550721772, "step": 9449, "train/sim_loss": 0.04296875 }, { "epoch": 0.9342495550721772, "step": 9449, "train/total_loss": 0.11715565621852875 }, { "entropy": 8.503235816955566, "epoch": 0.9343484279216927, "mean_token_accuracy": 0.7927736639976501, "num_tokens": 28382754.0, "step": 9450, "train/ce_loss": 0.5844284296035767 }, { "epoch": 0.9343484279216927, "step": 9450, "train/sim_loss": 0.02734375 }, { "epoch": 0.9343484279216927, "step": 9450, "train/total_loss": 0.0857865959405899 }, { "entropy": 8.794065475463867, "epoch": 0.9344473007712082, "mean_token_accuracy": 0.7134071588516235, "num_tokens": 28388029.0, "step": 9451, "train/ce_loss": 1.0954326391220093 }, { "epoch": 0.9344473007712082, "step": 9451, "train/sim_loss": 0.05859375 }, { "epoch": 0.9344473007712082, "step": 9451, "train/total_loss": 0.16813701391220093 }, { "entropy": 9.619552612304688, "epoch": 0.9345461736207238, "mean_token_accuracy": 0.7867435216903687, "num_tokens": 28392811.0, "step": 9452, "train/ce_loss": 1.8935411389975343e-06 }, { "epoch": 0.9345461736207238, "step": 9452, "train/sim_loss": 0.05078125 }, { "epoch": 0.9345461736207238, "step": 9452, "train/total_loss": 0.05078143998980522 }, { "entropy": 9.024389266967773, "epoch": 0.9346450464702393, "mean_token_accuracy": 0.8212290406227112, "num_tokens": 28397955.0, "step": 9453, "train/ce_loss": 1.036226990436262e-06 }, { "epoch": 0.9346450464702393, "step": 9453, "train/sim_loss": 0.03515625 }, { "epoch": 0.9346450464702393, "step": 9453, "train/total_loss": 0.03515635430812836 }, { "entropy": 8.194178581237793, "epoch": 0.9347439193197548, "mean_token_accuracy": 0.6833616495132446, "num_tokens": 28403671.0, "step": 9454, "train/ce_loss": 1.6191717386245728 }, { "epoch": 0.9347439193197548, "step": 9454, "train/sim_loss": 0.0703125 }, { "epoch": 0.9347439193197548, "step": 9454, "train/total_loss": 0.23222967982292175 }, { "entropy": 9.058138847351074, "epoch": 0.9348427921692704, "mean_token_accuracy": 0.6903137564659119, "num_tokens": 28408808.0, "step": 9455, "train/ce_loss": 1.0023647546768188 }, { "epoch": 0.9348427921692704, "step": 9455, "train/sim_loss": 0.1015625 }, { "epoch": 0.9348427921692704, "step": 9455, "train/total_loss": 0.20179897546768188 }, { "entropy": 8.40052604675293, "epoch": 0.9349416650187858, "mean_token_accuracy": 0.724258303642273, "num_tokens": 28414418.0, "step": 9456, "train/ce_loss": 0.5172601938247681 }, { "epoch": 0.9349416650187858, "step": 9456, "train/sim_loss": 0.015625 }, { "epoch": 0.9349416650187858, "step": 9456, "train/total_loss": 0.06735102087259293 }, { "entropy": 9.11497974395752, "epoch": 0.9350405378683013, "mean_token_accuracy": 0.7714285850524902, "num_tokens": 28419507.0, "step": 9457, "train/ce_loss": 0.9743853211402893 }, { "epoch": 0.9350405378683013, "step": 9457, "train/sim_loss": 0.0234375 }, { "epoch": 0.9350405378683013, "step": 9457, "train/total_loss": 0.12087603658437729 }, { "entropy": 9.073640823364258, "epoch": 0.9351394107178169, "mean_token_accuracy": 0.6483180522918701, "num_tokens": 28424605.0, "step": 9458, "train/ce_loss": 2.0779506826329452e-07 }, { "epoch": 0.9351394107178169, "step": 9458, "train/sim_loss": 0.015625 }, { "epoch": 0.9351394107178169, "step": 9458, "train/total_loss": 0.01562502048909664 }, { "entropy": 9.379833221435547, "epoch": 0.9352382835673324, "mean_token_accuracy": 0.7451403737068176, "num_tokens": 28429526.0, "step": 9459, "train/ce_loss": 2.451115790336189e-07 }, { "epoch": 0.9352382835673324, "step": 9459, "train/sim_loss": 0.015625 }, { "epoch": 0.9352382835673324, "step": 9459, "train/total_loss": 0.01562502421438694 }, { "epoch": 0.9353371564168479, "grad_norm": 0.743285059928894, "learning_rate": 7.663798645107056e-06, "loss": 0.1311, "step": 9460 }, { "entropy": 8.391401290893555, "epoch": 0.9353371564168479, "mean_token_accuracy": 0.757080614566803, "num_tokens": 28434897.0, "step": 9460, "train/ce_loss": 0.6055987477302551 }, { "epoch": 0.9353371564168479, "step": 9460, "train/sim_loss": 0.0234375 }, { "epoch": 0.9353371564168479, "step": 9460, "train/total_loss": 0.08399737626314163 }, { "entropy": 9.518046379089355, "epoch": 0.9354360292663635, "mean_token_accuracy": 0.7429466843605042, "num_tokens": 28439582.0, "step": 9461, "train/ce_loss": 1.153960511146579e-06 }, { "epoch": 0.9354360292663635, "step": 9461, "train/sim_loss": 0.0546875 }, { "epoch": 0.9354360292663635, "step": 9461, "train/total_loss": 0.05468761548399925 }, { "entropy": 8.630594253540039, "epoch": 0.935534902115879, "mean_token_accuracy": 0.7149643898010254, "num_tokens": 28444917.0, "step": 9462, "train/ce_loss": 0.5538694262504578 }, { "epoch": 0.935534902115879, "step": 9462, "train/sim_loss": 0.06640625 }, { "epoch": 0.935534902115879, "step": 9462, "train/total_loss": 0.12179319560527802 }, { "entropy": 9.009743690490723, "epoch": 0.9356337749653945, "mean_token_accuracy": 0.7209302186965942, "num_tokens": 28449949.0, "step": 9463, "train/ce_loss": 1.1485670804977417 }, { "epoch": 0.9356337749653945, "step": 9463, "train/sim_loss": 0.07421875 }, { "epoch": 0.9356337749653945, "step": 9463, "train/total_loss": 0.18907546997070312 }, { "entropy": 9.719734191894531, "epoch": 0.9357326478149101, "mean_token_accuracy": 0.7188405990600586, "num_tokens": 28454663.0, "step": 9464, "train/ce_loss": 3.317109076306224e-07 }, { "epoch": 0.9357326478149101, "step": 9464, "train/sim_loss": 0.01171875 }, { "epoch": 0.9357326478149101, "step": 9464, "train/total_loss": 0.011718783527612686 }, { "entropy": 8.511116027832031, "epoch": 0.9358315206644255, "mean_token_accuracy": 0.739506185054779, "num_tokens": 28459920.0, "step": 9465, "train/ce_loss": 0.5615465044975281 }, { "epoch": 0.9358315206644255, "step": 9465, "train/sim_loss": 0.0390625 }, { "epoch": 0.9358315206644255, "step": 9465, "train/total_loss": 0.09521715342998505 }, { "entropy": 8.738088607788086, "epoch": 0.935930393513941, "mean_token_accuracy": 0.7585799098014832, "num_tokens": 28465216.0, "step": 9466, "train/ce_loss": 0.5507985353469849 }, { "epoch": 0.935930393513941, "step": 9466, "train/sim_loss": 0.046875 }, { "epoch": 0.935930393513941, "step": 9466, "train/total_loss": 0.1019548550248146 }, { "entropy": 8.782062530517578, "epoch": 0.9360292663634566, "mean_token_accuracy": 0.7211155295372009, "num_tokens": 28470456.0, "step": 9467, "train/ce_loss": 0.7884370684623718 }, { "epoch": 0.9360292663634566, "step": 9467, "train/sim_loss": 0.08984375 }, { "epoch": 0.9360292663634566, "step": 9467, "train/total_loss": 0.16868746280670166 }, { "entropy": 8.478357315063477, "epoch": 0.9361281392129721, "mean_token_accuracy": 0.7252604365348816, "num_tokens": 28475703.0, "step": 9468, "train/ce_loss": 0.412435382604599 }, { "epoch": 0.9361281392129721, "step": 9468, "train/sim_loss": 0.015625 }, { "epoch": 0.9361281392129721, "step": 9468, "train/total_loss": 0.0568685382604599 }, { "entropy": 9.02198314666748, "epoch": 0.9362270120624876, "mean_token_accuracy": 0.781345546245575, "num_tokens": 28480785.0, "step": 9469, "train/ce_loss": 0.653954267501831 }, { "epoch": 0.9362270120624876, "step": 9469, "train/sim_loss": 0.0390625 }, { "epoch": 0.9362270120624876, "step": 9469, "train/total_loss": 0.10445792973041534 }, { "entropy": 9.033451080322266, "epoch": 0.9363258849120032, "mean_token_accuracy": 0.7223340272903442, "num_tokens": 28485735.0, "step": 9470, "train/ce_loss": 1.467859192416654e-06 }, { "epoch": 0.9363258849120032, "step": 9470, "train/sim_loss": 0.0625 }, { "epoch": 0.9363258849120032, "step": 9470, "train/total_loss": 0.06250014901161194 }, { "entropy": 8.486593246459961, "epoch": 0.9364247577615187, "mean_token_accuracy": 0.7819972038269043, "num_tokens": 28490908.0, "step": 9471, "train/ce_loss": 3.661734240267833e-07 }, { "epoch": 0.9364247577615187, "step": 9471, "train/sim_loss": 0.0234375 }, { "epoch": 0.9364247577615187, "step": 9471, "train/total_loss": 0.023437537252902985 }, { "entropy": 9.105995178222656, "epoch": 0.9365236306110342, "mean_token_accuracy": 0.7398496270179749, "num_tokens": 28496000.0, "step": 9472, "train/ce_loss": 2.3877581156739325e-07 }, { "epoch": 0.9365236306110342, "step": 9472, "train/sim_loss": 0.0234375 }, { "epoch": 0.9365236306110342, "step": 9472, "train/total_loss": 0.02343752421438694 }, { "entropy": 8.429557800292969, "epoch": 0.9366225034605498, "mean_token_accuracy": 0.769978404045105, "num_tokens": 28501386.0, "step": 9473, "train/ce_loss": 0.9409735798835754 }, { "epoch": 0.9366225034605498, "step": 9473, "train/sim_loss": 0.05078125 }, { "epoch": 0.9366225034605498, "step": 9473, "train/total_loss": 0.14487861096858978 }, { "entropy": 9.37339973449707, "epoch": 0.9367213763100652, "mean_token_accuracy": 0.7769376039505005, "num_tokens": 28506381.0, "step": 9474, "train/ce_loss": 4.75928715104601e-07 }, { "epoch": 0.9367213763100652, "step": 9474, "train/sim_loss": 0.0390625 }, { "epoch": 0.9367213763100652, "step": 9474, "train/total_loss": 0.03906254842877388 }, { "entropy": 8.381181716918945, "epoch": 0.9368202491595807, "mean_token_accuracy": 0.782608687877655, "num_tokens": 28511802.0, "step": 9475, "train/ce_loss": 0.6702274680137634 }, { "epoch": 0.9368202491595807, "step": 9475, "train/sim_loss": 0.0546875 }, { "epoch": 0.9368202491595807, "step": 9475, "train/total_loss": 0.12171024829149246 }, { "entropy": 9.021618843078613, "epoch": 0.9369191220090963, "mean_token_accuracy": 0.7468030452728271, "num_tokens": 28517043.0, "step": 9476, "train/ce_loss": 0.4683550298213959 }, { "epoch": 0.9369191220090963, "step": 9476, "train/sim_loss": 0.01953125 }, { "epoch": 0.9369191220090963, "step": 9476, "train/total_loss": 0.0663667544722557 }, { "entropy": 8.575700759887695, "epoch": 0.9370179948586118, "mean_token_accuracy": 0.736775815486908, "num_tokens": 28522247.0, "step": 9477, "train/ce_loss": 1.0939861536026 }, { "epoch": 0.9370179948586118, "step": 9477, "train/sim_loss": 0.03515625 }, { "epoch": 0.9370179948586118, "step": 9477, "train/total_loss": 0.14455486834049225 }, { "entropy": 8.39863395690918, "epoch": 0.9371168677081273, "mean_token_accuracy": 0.7544204592704773, "num_tokens": 28527742.0, "step": 9478, "train/ce_loss": 0.6728148460388184 }, { "epoch": 0.9371168677081273, "step": 9478, "train/sim_loss": 0.09765625 }, { "epoch": 0.9371168677081273, "step": 9478, "train/total_loss": 0.16493773460388184 }, { "entropy": 8.780609130859375, "epoch": 0.9372157405576429, "mean_token_accuracy": 0.7668965458869934, "num_tokens": 28532951.0, "step": 9479, "train/ce_loss": 1.119539499282837 }, { "epoch": 0.9372157405576429, "step": 9479, "train/sim_loss": 0.0546875 }, { "epoch": 0.9372157405576429, "step": 9479, "train/total_loss": 0.1666414439678192 }, { "epoch": 0.9373146134071584, "grad_norm": 0.5941998362541199, "learning_rate": 7.658853780349108e-06, "loss": 0.129, "step": 9480 }, { "entropy": 9.30009651184082, "epoch": 0.9373146134071584, "mean_token_accuracy": 0.7337883710861206, "num_tokens": 28538002.0, "step": 9480, "train/ce_loss": 3.8732500229343714e-07 }, { "epoch": 0.9373146134071584, "step": 9480, "train/sim_loss": 0.02734375 }, { "epoch": 0.9373146134071584, "step": 9480, "train/total_loss": 0.027343789115548134 }, { "entropy": 8.628240585327148, "epoch": 0.9374134862566739, "mean_token_accuracy": 0.692307710647583, "num_tokens": 28543458.0, "step": 9481, "train/ce_loss": 1.2278821468353271 }, { "epoch": 0.9374134862566739, "step": 9481, "train/sim_loss": 0.05859375 }, { "epoch": 0.9374134862566739, "step": 9481, "train/total_loss": 0.1813819706439972 }, { "entropy": 8.353310585021973, "epoch": 0.9375123591061895, "mean_token_accuracy": 0.769911527633667, "num_tokens": 28548977.0, "step": 9482, "train/ce_loss": 0.6554830074310303 }, { "epoch": 0.9375123591061895, "step": 9482, "train/sim_loss": 0.05859375 }, { "epoch": 0.9375123591061895, "step": 9482, "train/total_loss": 0.12414205074310303 }, { "entropy": 8.617682456970215, "epoch": 0.937611231955705, "mean_token_accuracy": 0.7256097793579102, "num_tokens": 28554441.0, "step": 9483, "train/ce_loss": 1.6344026327133179 }, { "epoch": 0.937611231955705, "step": 9483, "train/sim_loss": 0.09375 }, { "epoch": 0.937611231955705, "step": 9483, "train/total_loss": 0.2571902871131897 }, { "entropy": 9.161894798278809, "epoch": 0.9377101048052204, "mean_token_accuracy": 0.7301855087280273, "num_tokens": 28559481.0, "step": 9484, "train/ce_loss": 1.361616611480713 }, { "epoch": 0.9377101048052204, "step": 9484, "train/sim_loss": 0.05078125 }, { "epoch": 0.9377101048052204, "step": 9484, "train/total_loss": 0.186942920088768 }, { "entropy": 9.093269348144531, "epoch": 0.937808977654736, "mean_token_accuracy": 0.7424242496490479, "num_tokens": 28564504.0, "step": 9485, "train/ce_loss": 8.236112307713483e-07 }, { "epoch": 0.937808977654736, "step": 9485, "train/sim_loss": 0.0625 }, { "epoch": 0.937808977654736, "step": 9485, "train/total_loss": 0.06250008195638657 }, { "entropy": 8.668212890625, "epoch": 0.9379078505042515, "mean_token_accuracy": 0.693315863609314, "num_tokens": 28569736.0, "step": 9486, "train/ce_loss": 1.6782050132751465 }, { "epoch": 0.9379078505042515, "step": 9486, "train/sim_loss": 0.05078125 }, { "epoch": 0.9379078505042515, "step": 9486, "train/total_loss": 0.2186017483472824 }, { "entropy": 8.875524520874023, "epoch": 0.938006723353767, "mean_token_accuracy": 0.7543390989303589, "num_tokens": 28574911.0, "step": 9487, "train/ce_loss": 0.9683337807655334 }, { "epoch": 0.938006723353767, "step": 9487, "train/sim_loss": 0.05078125 }, { "epoch": 0.938006723353767, "step": 9487, "train/total_loss": 0.14761462807655334 }, { "entropy": 8.638969421386719, "epoch": 0.9381055962032826, "mean_token_accuracy": 0.7071611285209656, "num_tokens": 28580186.0, "step": 9488, "train/ce_loss": 0.5961774587631226 }, { "epoch": 0.9381055962032826, "step": 9488, "train/sim_loss": 0.03515625 }, { "epoch": 0.9381055962032826, "step": 9488, "train/total_loss": 0.09477399289608002 }, { "entropy": 8.519998550415039, "epoch": 0.9382044690527981, "mean_token_accuracy": 0.7333333492279053, "num_tokens": 28585438.0, "step": 9489, "train/ce_loss": 0.6842387914657593 }, { "epoch": 0.9382044690527981, "step": 9489, "train/sim_loss": 0.046875 }, { "epoch": 0.9382044690527981, "step": 9489, "train/total_loss": 0.11529888212680817 }, { "entropy": 8.97115421295166, "epoch": 0.9383033419023136, "mean_token_accuracy": 0.7360115051269531, "num_tokens": 28590564.0, "step": 9490, "train/ce_loss": 1.1503511667251587 }, { "epoch": 0.9383033419023136, "step": 9490, "train/sim_loss": 0.03515625 }, { "epoch": 0.9383033419023136, "step": 9490, "train/total_loss": 0.15019136667251587 }, { "entropy": 9.357909202575684, "epoch": 0.9384022147518292, "mean_token_accuracy": 0.7748593091964722, "num_tokens": 28595539.0, "step": 9491, "train/ce_loss": 0.5976631045341492 }, { "epoch": 0.9384022147518292, "step": 9491, "train/sim_loss": 0.0390625 }, { "epoch": 0.9384022147518292, "step": 9491, "train/total_loss": 0.09882880747318268 }, { "entropy": 8.69143009185791, "epoch": 0.9385010876013447, "mean_token_accuracy": 0.763052225112915, "num_tokens": 28600742.0, "step": 9492, "train/ce_loss": 1.2426862716674805 }, { "epoch": 0.9385010876013447, "step": 9492, "train/sim_loss": 0.07421875 }, { "epoch": 0.9385010876013447, "step": 9492, "train/total_loss": 0.19848737120628357 }, { "entropy": 8.563522338867188, "epoch": 0.9385999604508602, "mean_token_accuracy": 0.7716450095176697, "num_tokens": 28606121.0, "step": 9493, "train/ce_loss": 1.5273971557617188 }, { "epoch": 0.9385999604508602, "step": 9493, "train/sim_loss": 0.0703125 }, { "epoch": 0.9385999604508602, "step": 9493, "train/total_loss": 0.2230522185564041 }, { "entropy": 8.836255073547363, "epoch": 0.9386988333003757, "mean_token_accuracy": 0.7194631099700928, "num_tokens": 28611370.0, "step": 9494, "train/ce_loss": 0.778255820274353 }, { "epoch": 0.9386988333003757, "step": 9494, "train/sim_loss": 0.0234375 }, { "epoch": 0.9386988333003757, "step": 9494, "train/total_loss": 0.10126308351755142 }, { "entropy": 8.884140968322754, "epoch": 0.9387977061498912, "mean_token_accuracy": 0.7642752528190613, "num_tokens": 28616492.0, "step": 9495, "train/ce_loss": 0.8069877028465271 }, { "epoch": 0.9387977061498912, "step": 9495, "train/sim_loss": 0.05859375 }, { "epoch": 0.9387977061498912, "step": 9495, "train/total_loss": 0.13929252326488495 }, { "entropy": 8.872888565063477, "epoch": 0.9388965789994068, "mean_token_accuracy": 0.748308539390564, "num_tokens": 28621699.0, "step": 9496, "train/ce_loss": 0.8947360515594482 }, { "epoch": 0.9388965789994068, "step": 9496, "train/sim_loss": 0.046875 }, { "epoch": 0.9388965789994068, "step": 9496, "train/total_loss": 0.13634860515594482 }, { "entropy": 9.04621410369873, "epoch": 0.9389954518489223, "mean_token_accuracy": 0.7554054260253906, "num_tokens": 28626910.0, "step": 9497, "train/ce_loss": 0.47385963797569275 }, { "epoch": 0.9389954518489223, "step": 9497, "train/sim_loss": 0.0546875 }, { "epoch": 0.9389954518489223, "step": 9497, "train/total_loss": 0.10207346081733704 }, { "entropy": 8.489900588989258, "epoch": 0.9390943246984378, "mean_token_accuracy": 0.7533129453659058, "num_tokens": 28632354.0, "step": 9498, "train/ce_loss": 0.5772135257720947 }, { "epoch": 0.9390943246984378, "step": 9498, "train/sim_loss": 0.02734375 }, { "epoch": 0.9390943246984378, "step": 9498, "train/total_loss": 0.08506510406732559 }, { "entropy": 9.268988609313965, "epoch": 0.9391931975479534, "mean_token_accuracy": 0.782865583896637, "num_tokens": 28637473.0, "step": 9499, "train/ce_loss": 0.5241722464561462 }, { "epoch": 0.9391931975479534, "step": 9499, "train/sim_loss": 0.01953125 }, { "epoch": 0.9391931975479534, "step": 9499, "train/total_loss": 0.07194847613573074 }, { "epoch": 0.9392920703974689, "grad_norm": 0.6899769306182861, "learning_rate": 7.653908915591159e-06, "loss": 0.1295, "step": 9500 }, { "entropy": 8.788375854492188, "epoch": 0.9392920703974689, "mean_token_accuracy": 0.7633832693099976, "num_tokens": 28642838.0, "step": 9500, "train/ce_loss": 1.4281773567199707 }, { "epoch": 0.9392920703974689, "step": 9500, "train/sim_loss": 0.09375 }, { "epoch": 0.9392920703974689, "step": 9500, "train/total_loss": 0.23656773567199707 }, { "entropy": 8.71684455871582, "epoch": 0.9393909432469844, "mean_token_accuracy": 0.7353801131248474, "num_tokens": 28648153.0, "step": 9501, "train/ce_loss": 0.3031177520751953 }, { "epoch": 0.9393909432469844, "step": 9501, "train/sim_loss": 0.04296875 }, { "epoch": 0.9393909432469844, "step": 9501, "train/total_loss": 0.07328052818775177 }, { "entropy": 8.878063201904297, "epoch": 0.9394898160965, "mean_token_accuracy": 0.7370129823684692, "num_tokens": 28653270.0, "step": 9502, "train/ce_loss": 3.8936110513532185e-07 }, { "epoch": 0.9394898160965, "step": 9502, "train/sim_loss": 0.046875 }, { "epoch": 0.9394898160965, "step": 9502, "train/total_loss": 0.046875037252902985 }, { "entropy": 9.535175323486328, "epoch": 0.9395886889460154, "mean_token_accuracy": 0.7554945349693298, "num_tokens": 28658044.0, "step": 9503, "train/ce_loss": 8.357617389265215e-07 }, { "epoch": 0.9395886889460154, "step": 9503, "train/sim_loss": 0.03515625 }, { "epoch": 0.9395886889460154, "step": 9503, "train/total_loss": 0.035156331956386566 }, { "entropy": 8.738447189331055, "epoch": 0.9396875617955309, "mean_token_accuracy": 0.723294734954834, "num_tokens": 28663295.0, "step": 9504, "train/ce_loss": 1.1513490676879883 }, { "epoch": 0.9396875617955309, "step": 9504, "train/sim_loss": 0.04296875 }, { "epoch": 0.9396875617955309, "step": 9504, "train/total_loss": 0.15810365974903107 }, { "entropy": 8.636959075927734, "epoch": 0.9397864346450465, "mean_token_accuracy": 0.7348484992980957, "num_tokens": 28668564.0, "step": 9505, "train/ce_loss": 0.7040850520133972 }, { "epoch": 0.9397864346450465, "step": 9505, "train/sim_loss": 0.0546875 }, { "epoch": 0.9397864346450465, "step": 9505, "train/total_loss": 0.12509600818157196 }, { "entropy": 8.755784034729004, "epoch": 0.939885307494562, "mean_token_accuracy": 0.7001166939735413, "num_tokens": 28673910.0, "step": 9506, "train/ce_loss": 0.7709546685218811 }, { "epoch": 0.939885307494562, "step": 9506, "train/sim_loss": 0.02734375 }, { "epoch": 0.939885307494562, "step": 9506, "train/total_loss": 0.10443922132253647 }, { "entropy": 8.498984336853027, "epoch": 0.9399841803440775, "mean_token_accuracy": 0.8089758157730103, "num_tokens": 28679269.0, "step": 9507, "train/ce_loss": 0.44236424565315247 }, { "epoch": 0.9399841803440775, "step": 9507, "train/sim_loss": 0.01953125 }, { "epoch": 0.9399841803440775, "step": 9507, "train/total_loss": 0.06376767158508301 }, { "entropy": 9.350627899169922, "epoch": 0.9400830531935931, "mean_token_accuracy": 0.7412587404251099, "num_tokens": 28684233.0, "step": 9508, "train/ce_loss": 1.5312329530715942 }, { "epoch": 0.9400830531935931, "step": 9508, "train/sim_loss": 0.05859375 }, { "epoch": 0.9400830531935931, "step": 9508, "train/total_loss": 0.21171705424785614 }, { "entropy": 8.621431350708008, "epoch": 0.9401819260431086, "mean_token_accuracy": 0.7551867365837097, "num_tokens": 28689675.0, "step": 9509, "train/ce_loss": 0.46875298023223877 }, { "epoch": 0.9401819260431086, "step": 9509, "train/sim_loss": 0.015625 }, { "epoch": 0.9401819260431086, "step": 9509, "train/total_loss": 0.06250029802322388 }, { "entropy": 8.615789413452148, "epoch": 0.9402807988926241, "mean_token_accuracy": 0.767241358757019, "num_tokens": 28695205.0, "step": 9510, "train/ce_loss": 0.8325075507164001 }, { "epoch": 0.9402807988926241, "step": 9510, "train/sim_loss": 0.08984375 }, { "epoch": 0.9402807988926241, "step": 9510, "train/total_loss": 0.1730945110321045 }, { "entropy": 9.028203964233398, "epoch": 0.9403796717421397, "mean_token_accuracy": 0.743145763874054, "num_tokens": 28700348.0, "step": 9511, "train/ce_loss": 2.415129642940883e-07 }, { "epoch": 0.9403796717421397, "step": 9511, "train/sim_loss": 0.04296875 }, { "epoch": 0.9403796717421397, "step": 9511, "train/total_loss": 0.04296877235174179 }, { "entropy": 8.917841911315918, "epoch": 0.9404785445916551, "mean_token_accuracy": 0.7300613522529602, "num_tokens": 28705647.0, "step": 9512, "train/ce_loss": 0.7141034007072449 }, { "epoch": 0.9404785445916551, "step": 9512, "train/sim_loss": 0.02734375 }, { "epoch": 0.9404785445916551, "step": 9512, "train/total_loss": 0.09875409305095673 }, { "entropy": 8.543609619140625, "epoch": 0.9405774174411706, "mean_token_accuracy": 0.7538101077079773, "num_tokens": 28710939.0, "step": 9513, "train/ce_loss": 0.6660972833633423 }, { "epoch": 0.9405774174411706, "step": 9513, "train/sim_loss": 0.046875 }, { "epoch": 0.9405774174411706, "step": 9513, "train/total_loss": 0.11348473280668259 }, { "entropy": 9.336162567138672, "epoch": 0.9406762902906862, "mean_token_accuracy": 0.713385820388794, "num_tokens": 28715956.0, "step": 9514, "train/ce_loss": 1.1417412757873535 }, { "epoch": 0.9406762902906862, "step": 9514, "train/sim_loss": 0.0546875 }, { "epoch": 0.9406762902906862, "step": 9514, "train/total_loss": 0.16886162757873535 }, { "entropy": 8.69278335571289, "epoch": 0.9407751631402017, "mean_token_accuracy": 0.733668327331543, "num_tokens": 28721273.0, "step": 9515, "train/ce_loss": 0.9447028636932373 }, { "epoch": 0.9407751631402017, "step": 9515, "train/sim_loss": 0.01953125 }, { "epoch": 0.9407751631402017, "step": 9515, "train/total_loss": 0.11400153487920761 }, { "entropy": 8.511211395263672, "epoch": 0.9408740359897172, "mean_token_accuracy": 0.7770137786865234, "num_tokens": 28726782.0, "step": 9516, "train/ce_loss": 0.6004266738891602 }, { "epoch": 0.9408740359897172, "step": 9516, "train/sim_loss": 0.01953125 }, { "epoch": 0.9408740359897172, "step": 9516, "train/total_loss": 0.07957391440868378 }, { "entropy": 8.731058120727539, "epoch": 0.9409729088392328, "mean_token_accuracy": 0.7242236137390137, "num_tokens": 28732050.0, "step": 9517, "train/ce_loss": 0.6168516874313354 }, { "epoch": 0.9409729088392328, "step": 9517, "train/sim_loss": 0.05078125 }, { "epoch": 0.9409729088392328, "step": 9517, "train/total_loss": 0.11246642470359802 }, { "entropy": 8.753804206848145, "epoch": 0.9410717816887483, "mean_token_accuracy": 0.756041407585144, "num_tokens": 28737385.0, "step": 9518, "train/ce_loss": 0.41143131256103516 }, { "epoch": 0.9410717816887483, "step": 9518, "train/sim_loss": 0.04296875 }, { "epoch": 0.9410717816887483, "step": 9518, "train/total_loss": 0.08411188423633575 }, { "entropy": 8.625227928161621, "epoch": 0.9411706545382638, "mean_token_accuracy": 0.7270471453666687, "num_tokens": 28742634.0, "step": 9519, "train/ce_loss": 0.6203886866569519 }, { "epoch": 0.9411706545382638, "step": 9519, "train/sim_loss": 0.0390625 }, { "epoch": 0.9411706545382638, "step": 9519, "train/total_loss": 0.10110136866569519 }, { "epoch": 0.9412695273877794, "grad_norm": 0.676220178604126, "learning_rate": 7.64896405083321e-06, "loss": 0.1316, "step": 9520 }, { "entropy": 9.301724433898926, "epoch": 0.9412695273877794, "mean_token_accuracy": 0.7676923274993896, "num_tokens": 28747708.0, "step": 9520, "train/ce_loss": 0.675580620765686 }, { "epoch": 0.9412695273877794, "step": 9520, "train/sim_loss": 0.08203125 }, { "epoch": 0.9412695273877794, "step": 9520, "train/total_loss": 0.14958931505680084 }, { "entropy": 8.998207092285156, "epoch": 0.9413684002372948, "mean_token_accuracy": 0.7565698623657227, "num_tokens": 28752900.0, "step": 9521, "train/ce_loss": 1.0089915990829468 }, { "epoch": 0.9413684002372948, "step": 9521, "train/sim_loss": 0.03515625 }, { "epoch": 0.9413684002372948, "step": 9521, "train/total_loss": 0.13605540990829468 }, { "entropy": 8.510425567626953, "epoch": 0.9414672730868103, "mean_token_accuracy": 0.8020133972167969, "num_tokens": 28758310.0, "step": 9522, "train/ce_loss": 0.4035511314868927 }, { "epoch": 0.9414672730868103, "step": 9522, "train/sim_loss": 0.01953125 }, { "epoch": 0.9414672730868103, "step": 9522, "train/total_loss": 0.05988636240363121 }, { "entropy": 8.765780448913574, "epoch": 0.9415661459363259, "mean_token_accuracy": 0.7340686321258545, "num_tokens": 28763603.0, "step": 9523, "train/ce_loss": 1.7169021368026733 }, { "epoch": 0.9415661459363259, "step": 9523, "train/sim_loss": 0.05078125 }, { "epoch": 0.9415661459363259, "step": 9523, "train/total_loss": 0.2224714607000351 }, { "entropy": 8.83626937866211, "epoch": 0.9416650187858414, "mean_token_accuracy": 0.7493917346000671, "num_tokens": 28768926.0, "step": 9524, "train/ce_loss": 0.545708417892456 }, { "epoch": 0.9416650187858414, "step": 9524, "train/sim_loss": 0.0625 }, { "epoch": 0.9416650187858414, "step": 9524, "train/total_loss": 0.11707083880901337 }, { "entropy": 8.736515045166016, "epoch": 0.9417638916353569, "mean_token_accuracy": 0.7122762203216553, "num_tokens": 28774218.0, "step": 9525, "train/ce_loss": 0.6712250113487244 }, { "epoch": 0.9417638916353569, "step": 9525, "train/sim_loss": 0.05078125 }, { "epoch": 0.9417638916353569, "step": 9525, "train/total_loss": 0.11790375411510468 }, { "entropy": 8.839805603027344, "epoch": 0.9418627644848725, "mean_token_accuracy": 0.7200474739074707, "num_tokens": 28779521.0, "step": 9526, "train/ce_loss": 0.5456531047821045 }, { "epoch": 0.9418627644848725, "step": 9526, "train/sim_loss": 0.0546875 }, { "epoch": 0.9418627644848725, "step": 9526, "train/total_loss": 0.10925281047821045 }, { "entropy": 9.020331382751465, "epoch": 0.941961637334388, "mean_token_accuracy": 0.6974790096282959, "num_tokens": 28784698.0, "step": 9527, "train/ce_loss": 0.7585151791572571 }, { "epoch": 0.941961637334388, "step": 9527, "train/sim_loss": 0.06640625 }, { "epoch": 0.941961637334388, "step": 9527, "train/total_loss": 0.14225777983665466 }, { "entropy": 9.176788330078125, "epoch": 0.9420605101839035, "mean_token_accuracy": 0.7791798114776611, "num_tokens": 28789834.0, "step": 9528, "train/ce_loss": 1.0635613203048706 }, { "epoch": 0.9420605101839035, "step": 9528, "train/sim_loss": 0.04296875 }, { "epoch": 0.9420605101839035, "step": 9528, "train/total_loss": 0.14932489395141602 }, { "entropy": 9.404987335205078, "epoch": 0.9421593830334191, "mean_token_accuracy": 0.7586206793785095, "num_tokens": 28794813.0, "step": 9529, "train/ce_loss": 2.1721525911289064e-07 }, { "epoch": 0.9421593830334191, "step": 9529, "train/sim_loss": 0.015625 }, { "epoch": 0.9421593830334191, "step": 9529, "train/total_loss": 0.01562502235174179 }, { "entropy": 8.8800048828125, "epoch": 0.9422582558829345, "mean_token_accuracy": 0.7600519061088562, "num_tokens": 28800027.0, "step": 9530, "train/ce_loss": 0.807660698890686 }, { "epoch": 0.9422582558829345, "step": 9530, "train/sim_loss": 0.0390625 }, { "epoch": 0.9422582558829345, "step": 9530, "train/total_loss": 0.11982857435941696 }, { "entropy": 8.948602676391602, "epoch": 0.94235712873245, "mean_token_accuracy": 0.6991150379180908, "num_tokens": 28805025.0, "step": 9531, "train/ce_loss": 8.591353548581537e-07 }, { "epoch": 0.94235712873245, "step": 9531, "train/sim_loss": 0.0703125 }, { "epoch": 0.94235712873245, "step": 9531, "train/total_loss": 0.07031258940696716 }, { "entropy": 8.528826713562012, "epoch": 0.9424560015819656, "mean_token_accuracy": 0.7619577050209045, "num_tokens": 28810411.0, "step": 9532, "train/ce_loss": 0.952943742275238 }, { "epoch": 0.9424560015819656, "step": 9532, "train/sim_loss": 0.0390625 }, { "epoch": 0.9424560015819656, "step": 9532, "train/total_loss": 0.13435688614845276 }, { "entropy": 9.240312576293945, "epoch": 0.9425548744314811, "mean_token_accuracy": 0.7006173133850098, "num_tokens": 28815519.0, "step": 9533, "train/ce_loss": 1.3274903297424316 }, { "epoch": 0.9425548744314811, "step": 9533, "train/sim_loss": 0.046875 }, { "epoch": 0.9425548744314811, "step": 9533, "train/total_loss": 0.1796240359544754 }, { "entropy": 8.97305679321289, "epoch": 0.9426537472809966, "mean_token_accuracy": 0.7384615540504456, "num_tokens": 28820566.0, "step": 9534, "train/ce_loss": 0.9110743999481201 }, { "epoch": 0.9426537472809966, "step": 9534, "train/sim_loss": 0.0234375 }, { "epoch": 0.9426537472809966, "step": 9534, "train/total_loss": 0.11454494297504425 }, { "entropy": 8.747942924499512, "epoch": 0.9427526201305122, "mean_token_accuracy": 0.7578828930854797, "num_tokens": 28825885.0, "step": 9535, "train/ce_loss": 0.47831931710243225 }, { "epoch": 0.9427526201305122, "step": 9535, "train/sim_loss": 0.1015625 }, { "epoch": 0.9427526201305122, "step": 9535, "train/total_loss": 0.1493944376707077 }, { "entropy": 8.6371488571167, "epoch": 0.9428514929800277, "mean_token_accuracy": 0.7226791977882385, "num_tokens": 28831199.0, "step": 9536, "train/ce_loss": 0.798780083656311 }, { "epoch": 0.9428514929800277, "step": 9536, "train/sim_loss": 0.0546875 }, { "epoch": 0.9428514929800277, "step": 9536, "train/total_loss": 0.13456550240516663 }, { "entropy": 8.563827514648438, "epoch": 0.9429503658295432, "mean_token_accuracy": 0.7590090036392212, "num_tokens": 28836555.0, "step": 9537, "train/ce_loss": 0.910048246383667 }, { "epoch": 0.9429503658295432, "step": 9537, "train/sim_loss": 0.01953125 }, { "epoch": 0.9429503658295432, "step": 9537, "train/total_loss": 0.11053607612848282 }, { "entropy": 8.62954044342041, "epoch": 0.9430492386790588, "mean_token_accuracy": 0.7395833134651184, "num_tokens": 28841813.0, "step": 9538, "train/ce_loss": 0.6747787594795227 }, { "epoch": 0.9430492386790588, "step": 9538, "train/sim_loss": 0.0390625 }, { "epoch": 0.9430492386790588, "step": 9538, "train/total_loss": 0.10654037445783615 }, { "entropy": 8.663762092590332, "epoch": 0.9431481115285743, "mean_token_accuracy": 0.7041942477226257, "num_tokens": 28847202.0, "step": 9539, "train/ce_loss": 0.8284857273101807 }, { "epoch": 0.9431481115285743, "step": 9539, "train/sim_loss": 0.04296875 }, { "epoch": 0.9431481115285743, "step": 9539, "train/total_loss": 0.12581732869148254 }, { "epoch": 0.9432469843780897, "grad_norm": 0.6371222138404846, "learning_rate": 7.644019186075261e-06, "loss": 0.1344, "step": 9540 }, { "entropy": 9.424263954162598, "epoch": 0.9432469843780897, "mean_token_accuracy": 0.6541849970817566, "num_tokens": 28852115.0, "step": 9540, "train/ce_loss": 4.169666567577224e-07 }, { "epoch": 0.9432469843780897, "step": 9540, "train/sim_loss": 0.0390625 }, { "epoch": 0.9432469843780897, "step": 9540, "train/total_loss": 0.03906254097819328 }, { "entropy": 8.73521900177002, "epoch": 0.9433458572276053, "mean_token_accuracy": 0.7703889608383179, "num_tokens": 28857360.0, "step": 9541, "train/ce_loss": 0.6447766423225403 }, { "epoch": 0.9433458572276053, "step": 9541, "train/sim_loss": 0.0859375 }, { "epoch": 0.9433458572276053, "step": 9541, "train/total_loss": 0.15041516721248627 }, { "entropy": 8.16667366027832, "epoch": 0.9434447300771208, "mean_token_accuracy": 0.6919233798980713, "num_tokens": 28863070.0, "step": 9542, "train/ce_loss": 0.875781238079071 }, { "epoch": 0.9434447300771208, "step": 9542, "train/sim_loss": 0.10546875 }, { "epoch": 0.9434447300771208, "step": 9542, "train/total_loss": 0.19304686784744263 }, { "entropy": 8.911556243896484, "epoch": 0.9435436029266363, "mean_token_accuracy": 0.8066825866699219, "num_tokens": 28868394.0, "step": 9543, "train/ce_loss": 1.0784525871276855 }, { "epoch": 0.9435436029266363, "step": 9543, "train/sim_loss": 0.0390625 }, { "epoch": 0.9435436029266363, "step": 9543, "train/total_loss": 0.1469077616930008 }, { "entropy": 8.681024551391602, "epoch": 0.9436424757761519, "mean_token_accuracy": 0.7692307829856873, "num_tokens": 28873701.0, "step": 9544, "train/ce_loss": 0.6686069369316101 }, { "epoch": 0.9436424757761519, "step": 9544, "train/sim_loss": 0.0234375 }, { "epoch": 0.9436424757761519, "step": 9544, "train/total_loss": 0.09029819816350937 }, { "entropy": 9.512763977050781, "epoch": 0.9437413486256674, "mean_token_accuracy": 0.7889125943183899, "num_tokens": 28878591.0, "step": 9545, "train/ce_loss": 5.38847814368637e-07 }, { "epoch": 0.9437413486256674, "step": 9545, "train/sim_loss": 0.04296875 }, { "epoch": 0.9437413486256674, "step": 9545, "train/total_loss": 0.04296880215406418 }, { "entropy": 9.259182929992676, "epoch": 0.9438402214751829, "mean_token_accuracy": 0.7662538886070251, "num_tokens": 28883674.0, "step": 9546, "train/ce_loss": 0.5185627937316895 }, { "epoch": 0.9438402214751829, "step": 9546, "train/sim_loss": 0.015625 }, { "epoch": 0.9438402214751829, "step": 9546, "train/total_loss": 0.06748127937316895 }, { "entropy": 8.751737594604492, "epoch": 0.9439390943246985, "mean_token_accuracy": 0.7868080139160156, "num_tokens": 28888989.0, "step": 9547, "train/ce_loss": 0.4132890999317169 }, { "epoch": 0.9439390943246985, "step": 9547, "train/sim_loss": 0.0546875 }, { "epoch": 0.9439390943246985, "step": 9547, "train/total_loss": 0.09601640701293945 }, { "entropy": 8.520257949829102, "epoch": 0.944037967174214, "mean_token_accuracy": 0.6856866478919983, "num_tokens": 28894447.0, "step": 9548, "train/ce_loss": 0.6307772994041443 }, { "epoch": 0.944037967174214, "step": 9548, "train/sim_loss": 0.03125 }, { "epoch": 0.944037967174214, "step": 9548, "train/total_loss": 0.09432773292064667 }, { "entropy": 9.019133567810059, "epoch": 0.9441368400237294, "mean_token_accuracy": 0.7587253451347351, "num_tokens": 28899570.0, "step": 9549, "train/ce_loss": 0.7235116958618164 }, { "epoch": 0.9441368400237294, "step": 9549, "train/sim_loss": 0.04296875 }, { "epoch": 0.9441368400237294, "step": 9549, "train/total_loss": 0.11531992256641388 }, { "entropy": 8.642354965209961, "epoch": 0.944235712873245, "mean_token_accuracy": 0.7587336301803589, "num_tokens": 28904949.0, "step": 9550, "train/ce_loss": 0.7822985649108887 }, { "epoch": 0.944235712873245, "step": 9550, "train/sim_loss": 0.03125 }, { "epoch": 0.944235712873245, "step": 9550, "train/total_loss": 0.1094798594713211 }, { "entropy": 9.277965545654297, "epoch": 0.9443345857227605, "mean_token_accuracy": 0.7056530117988586, "num_tokens": 28909941.0, "step": 9551, "train/ce_loss": 0.6955422163009644 }, { "epoch": 0.9443345857227605, "step": 9551, "train/sim_loss": 0.046875 }, { "epoch": 0.9443345857227605, "step": 9551, "train/total_loss": 0.11642922461032867 }, { "entropy": 8.385626792907715, "epoch": 0.944433458572276, "mean_token_accuracy": 0.7405660152435303, "num_tokens": 28915504.0, "step": 9552, "train/ce_loss": 0.6856048107147217 }, { "epoch": 0.944433458572276, "step": 9552, "train/sim_loss": 0.04296875 }, { "epoch": 0.944433458572276, "step": 9552, "train/total_loss": 0.11152923107147217 }, { "entropy": 8.710573196411133, "epoch": 0.9445323314217916, "mean_token_accuracy": 0.7350993156433105, "num_tokens": 28920720.0, "step": 9553, "train/ce_loss": 0.7603187561035156 }, { "epoch": 0.9445323314217916, "step": 9553, "train/sim_loss": 0.0546875 }, { "epoch": 0.9445323314217916, "step": 9553, "train/total_loss": 0.1307193785905838 }, { "entropy": 8.891063690185547, "epoch": 0.9446312042713071, "mean_token_accuracy": 0.7756097316741943, "num_tokens": 28925965.0, "step": 9554, "train/ce_loss": 0.5832635760307312 }, { "epoch": 0.9446312042713071, "step": 9554, "train/sim_loss": 0.0390625 }, { "epoch": 0.9446312042713071, "step": 9554, "train/total_loss": 0.0973888635635376 }, { "entropy": 8.735498428344727, "epoch": 0.9447300771208226, "mean_token_accuracy": 0.7409793734550476, "num_tokens": 28931288.0, "step": 9555, "train/ce_loss": 0.45289525389671326 }, { "epoch": 0.9447300771208226, "step": 9555, "train/sim_loss": 0.0546875 }, { "epoch": 0.9447300771208226, "step": 9555, "train/total_loss": 0.0999770313501358 }, { "entropy": 9.089837074279785, "epoch": 0.9448289499703382, "mean_token_accuracy": 0.7692307829856873, "num_tokens": 28936466.0, "step": 9556, "train/ce_loss": 0.5755773186683655 }, { "epoch": 0.9448289499703382, "step": 9556, "train/sim_loss": 0.04296875 }, { "epoch": 0.9448289499703382, "step": 9556, "train/total_loss": 0.10052648186683655 }, { "entropy": 8.925742149353027, "epoch": 0.9449278228198537, "mean_token_accuracy": 0.7666263580322266, "num_tokens": 28941765.0, "step": 9557, "train/ce_loss": 0.8353256583213806 }, { "epoch": 0.9449278228198537, "step": 9557, "train/sim_loss": 0.10546875 }, { "epoch": 0.9449278228198537, "step": 9557, "train/total_loss": 0.18900132179260254 }, { "entropy": 9.195016860961914, "epoch": 0.9450266956693691, "mean_token_accuracy": 0.7957860827445984, "num_tokens": 28947015.0, "step": 9558, "train/ce_loss": 1.1745822429656982 }, { "epoch": 0.9450266956693691, "step": 9558, "train/sim_loss": 0.08984375 }, { "epoch": 0.9450266956693691, "step": 9558, "train/total_loss": 0.20730197429656982 }, { "entropy": 9.235591888427734, "epoch": 0.9451255685188847, "mean_token_accuracy": 0.7303225994110107, "num_tokens": 28952238.0, "step": 9559, "train/ce_loss": 1.6796906265881262e-07 }, { "epoch": 0.9451255685188847, "step": 9559, "train/sim_loss": 0.01953125 }, { "epoch": 0.9451255685188847, "step": 9559, "train/total_loss": 0.019531266763806343 }, { "epoch": 0.9452244413684002, "grad_norm": 0.6818765997886658, "learning_rate": 7.639074321317312e-06, "loss": 0.1339, "step": 9560 }, { "entropy": 8.612297058105469, "epoch": 0.9452244413684002, "mean_token_accuracy": 0.7153465151786804, "num_tokens": 28957528.0, "step": 9560, "train/ce_loss": 1.071324110031128 }, { "epoch": 0.9452244413684002, "step": 9560, "train/sim_loss": 0.06640625 }, { "epoch": 0.9452244413684002, "step": 9560, "train/total_loss": 0.17353865504264832 }, { "entropy": 8.844977378845215, "epoch": 0.9453233142179157, "mean_token_accuracy": 0.7661623358726501, "num_tokens": 28962730.0, "step": 9561, "train/ce_loss": 7.732745075372804e-07 }, { "epoch": 0.9453233142179157, "step": 9561, "train/sim_loss": 0.05078125 }, { "epoch": 0.9453233142179157, "step": 9561, "train/total_loss": 0.05078132823109627 }, { "entropy": 8.497743606567383, "epoch": 0.9454221870674313, "mean_token_accuracy": 0.7420249581336975, "num_tokens": 28967957.0, "step": 9562, "train/ce_loss": 1.2307610511779785 }, { "epoch": 0.9454221870674313, "step": 9562, "train/sim_loss": 0.04296875 }, { "epoch": 0.9454221870674313, "step": 9562, "train/total_loss": 0.16604486107826233 }, { "entropy": 8.593058586120605, "epoch": 0.9455210599169468, "mean_token_accuracy": 0.7852193713188171, "num_tokens": 28973312.0, "step": 9563, "train/ce_loss": 0.708541750907898 }, { "epoch": 0.9455210599169468, "step": 9563, "train/sim_loss": 0.0390625 }, { "epoch": 0.9455210599169468, "step": 9563, "train/total_loss": 0.10991667956113815 }, { "entropy": 9.197546005249023, "epoch": 0.9456199327664623, "mean_token_accuracy": 0.7704917788505554, "num_tokens": 28978302.0, "step": 9564, "train/ce_loss": 0.8882774114608765 }, { "epoch": 0.9456199327664623, "step": 9564, "train/sim_loss": 0.05078125 }, { "epoch": 0.9456199327664623, "step": 9564, "train/total_loss": 0.13960899412631989 }, { "entropy": 8.964488983154297, "epoch": 0.9457188056159779, "mean_token_accuracy": 0.7850098609924316, "num_tokens": 28983289.0, "step": 9565, "train/ce_loss": 1.0092006921768188 }, { "epoch": 0.9457188056159779, "step": 9565, "train/sim_loss": 0.0859375 }, { "epoch": 0.9457188056159779, "step": 9565, "train/total_loss": 0.18685758113861084 }, { "entropy": 8.841659545898438, "epoch": 0.9458176784654934, "mean_token_accuracy": 0.7730496525764465, "num_tokens": 28988462.0, "step": 9566, "train/ce_loss": 2.0214712619781494 }, { "epoch": 0.9458176784654934, "step": 9566, "train/sim_loss": 0.06640625 }, { "epoch": 0.9458176784654934, "step": 9566, "train/total_loss": 0.26855337619781494 }, { "entropy": 8.560609817504883, "epoch": 0.9459165513150088, "mean_token_accuracy": 0.6855733394622803, "num_tokens": 28993716.0, "step": 9567, "train/ce_loss": 0.9242957830429077 }, { "epoch": 0.9459165513150088, "step": 9567, "train/sim_loss": 0.05078125 }, { "epoch": 0.9459165513150088, "step": 9567, "train/total_loss": 0.14321082830429077 }, { "entropy": 9.03412914276123, "epoch": 0.9460154241645244, "mean_token_accuracy": 0.836241602897644, "num_tokens": 28998902.0, "step": 9568, "train/ce_loss": 0.48793551325798035 }, { "epoch": 0.9460154241645244, "step": 9568, "train/sim_loss": 0.02734375 }, { "epoch": 0.9460154241645244, "step": 9568, "train/total_loss": 0.07613730430603027 }, { "entropy": 8.861091613769531, "epoch": 0.9461142970140399, "mean_token_accuracy": 0.7330447435379028, "num_tokens": 29004087.0, "step": 9569, "train/ce_loss": 0.4134216010570526 }, { "epoch": 0.9461142970140399, "step": 9569, "train/sim_loss": 0.10546875 }, { "epoch": 0.9461142970140399, "step": 9569, "train/total_loss": 0.14681091904640198 }, { "entropy": 9.295025825500488, "epoch": 0.9462131698635554, "mean_token_accuracy": 0.739130437374115, "num_tokens": 29009015.0, "step": 9570, "train/ce_loss": 1.2365758419036865 }, { "epoch": 0.9462131698635554, "step": 9570, "train/sim_loss": 0.0625 }, { "epoch": 0.9462131698635554, "step": 9570, "train/total_loss": 0.18615758419036865 }, { "entropy": 8.862635612487793, "epoch": 0.946312042713071, "mean_token_accuracy": 0.7630890011787415, "num_tokens": 29014235.0, "step": 9571, "train/ce_loss": 0.8983027935028076 }, { "epoch": 0.946312042713071, "step": 9571, "train/sim_loss": 0.0546875 }, { "epoch": 0.946312042713071, "step": 9571, "train/total_loss": 0.14451777935028076 }, { "entropy": 9.31369400024414, "epoch": 0.9464109155625865, "mean_token_accuracy": 0.7960088849067688, "num_tokens": 29019124.0, "step": 9572, "train/ce_loss": 1.9161144495010376 }, { "epoch": 0.9464109155625865, "step": 9572, "train/sim_loss": 0.078125 }, { "epoch": 0.9464109155625865, "step": 9572, "train/total_loss": 0.26973646879196167 }, { "entropy": 9.24710464477539, "epoch": 0.946509788412102, "mean_token_accuracy": 0.715242862701416, "num_tokens": 29024179.0, "step": 9573, "train/ce_loss": 1.3356640338897705 }, { "epoch": 0.946509788412102, "step": 9573, "train/sim_loss": 0.046875 }, { "epoch": 0.946509788412102, "step": 9573, "train/total_loss": 0.18044140934944153 }, { "entropy": 8.64162540435791, "epoch": 0.9466086612616176, "mean_token_accuracy": 0.7286624312400818, "num_tokens": 29029453.0, "step": 9574, "train/ce_loss": 7.203916538855992e-07 }, { "epoch": 0.9466086612616176, "step": 9574, "train/sim_loss": 0.01953125 }, { "epoch": 0.9466086612616176, "step": 9574, "train/total_loss": 0.01953132264316082 }, { "entropy": 8.737985610961914, "epoch": 0.9467075341111331, "mean_token_accuracy": 0.6741440296173096, "num_tokens": 29034778.0, "step": 9575, "train/ce_loss": 1.2737239599227905 }, { "epoch": 0.9467075341111331, "step": 9575, "train/sim_loss": 0.07421875 }, { "epoch": 0.9467075341111331, "step": 9575, "train/total_loss": 0.2015911489725113 }, { "entropy": 8.338376998901367, "epoch": 0.9468064069606487, "mean_token_accuracy": 0.7080808281898499, "num_tokens": 29040270.0, "step": 9576, "train/ce_loss": 1.3967866897583008 }, { "epoch": 0.9468064069606487, "step": 9576, "train/sim_loss": 0.078125 }, { "epoch": 0.9468064069606487, "step": 9576, "train/total_loss": 0.21780367195606232 }, { "entropy": 8.711580276489258, "epoch": 0.9469052798101641, "mean_token_accuracy": 0.7848761677742004, "num_tokens": 29045465.0, "step": 9577, "train/ce_loss": 0.6465726494789124 }, { "epoch": 0.9469052798101641, "step": 9577, "train/sim_loss": 0.1328125 }, { "epoch": 0.9469052798101641, "step": 9577, "train/total_loss": 0.1974697709083557 }, { "entropy": 8.95461654663086, "epoch": 0.9470041526596796, "mean_token_accuracy": 0.7730711102485657, "num_tokens": 29050545.0, "step": 9578, "train/ce_loss": 0.6712116599082947 }, { "epoch": 0.9470041526596796, "step": 9578, "train/sim_loss": 0.1171875 }, { "epoch": 0.9470041526596796, "step": 9578, "train/total_loss": 0.18430867791175842 }, { "entropy": 8.306659698486328, "epoch": 0.9471030255091952, "mean_token_accuracy": 0.7609561681747437, "num_tokens": 29056056.0, "step": 9579, "train/ce_loss": 0.7028023600578308 }, { "epoch": 0.9471030255091952, "step": 9579, "train/sim_loss": 0.03125 }, { "epoch": 0.9471030255091952, "step": 9579, "train/total_loss": 0.10153023898601532 }, { "epoch": 0.9472018983587107, "grad_norm": 0.5578201413154602, "learning_rate": 7.634129456559364e-06, "loss": 0.1314, "step": 9580 }, { "entropy": 8.874184608459473, "epoch": 0.9472018983587107, "mean_token_accuracy": 0.761049747467041, "num_tokens": 29061251.0, "step": 9580, "train/ce_loss": 1.3506217002868652 }, { "epoch": 0.9472018983587107, "step": 9580, "train/sim_loss": 0.0625 }, { "epoch": 0.9472018983587107, "step": 9580, "train/total_loss": 0.19756217300891876 }, { "entropy": 8.735138893127441, "epoch": 0.9473007712082262, "mean_token_accuracy": 0.7437810897827148, "num_tokens": 29066553.0, "step": 9581, "train/ce_loss": 0.697642982006073 }, { "epoch": 0.9473007712082262, "step": 9581, "train/sim_loss": 0.046875 }, { "epoch": 0.9473007712082262, "step": 9581, "train/total_loss": 0.11663930118083954 }, { "entropy": 8.543025970458984, "epoch": 0.9473996440577418, "mean_token_accuracy": 0.7392815947532654, "num_tokens": 29071903.0, "step": 9582, "train/ce_loss": 1.2665374279022217 }, { "epoch": 0.9473996440577418, "step": 9582, "train/sim_loss": 0.08203125 }, { "epoch": 0.9473996440577418, "step": 9582, "train/total_loss": 0.2086849957704544 }, { "entropy": 8.526872634887695, "epoch": 0.9474985169072573, "mean_token_accuracy": 0.7544987201690674, "num_tokens": 29077161.0, "step": 9583, "train/ce_loss": 0.6691533923149109 }, { "epoch": 0.9474985169072573, "step": 9583, "train/sim_loss": 0.0625 }, { "epoch": 0.9474985169072573, "step": 9583, "train/total_loss": 0.1294153332710266 }, { "entropy": 8.957462310791016, "epoch": 0.9475973897567728, "mean_token_accuracy": 0.7225806713104248, "num_tokens": 29082393.0, "step": 9584, "train/ce_loss": 0.49990689754486084 }, { "epoch": 0.9475973897567728, "step": 9584, "train/sim_loss": 0.01953125 }, { "epoch": 0.9475973897567728, "step": 9584, "train/total_loss": 0.0695219412446022 }, { "entropy": 9.148205757141113, "epoch": 0.9476962626062884, "mean_token_accuracy": 0.7481343150138855, "num_tokens": 29087348.0, "step": 9585, "train/ce_loss": 1.5612250763297197e-06 }, { "epoch": 0.9476962626062884, "step": 9585, "train/sim_loss": 0.0390625 }, { "epoch": 0.9476962626062884, "step": 9585, "train/total_loss": 0.039062656462192535 }, { "entropy": 9.202255249023438, "epoch": 0.9477951354558038, "mean_token_accuracy": 0.7464454770088196, "num_tokens": 29092247.0, "step": 9586, "train/ce_loss": 9.728628356242552e-07 }, { "epoch": 0.9477951354558038, "step": 9586, "train/sim_loss": 0.03125 }, { "epoch": 0.9477951354558038, "step": 9586, "train/total_loss": 0.03125009685754776 }, { "entropy": 8.650840759277344, "epoch": 0.9478940083053193, "mean_token_accuracy": 0.7518337368965149, "num_tokens": 29097518.0, "step": 9587, "train/ce_loss": 0.9363239407539368 }, { "epoch": 0.9478940083053193, "step": 9587, "train/sim_loss": 0.078125 }, { "epoch": 0.9478940083053193, "step": 9587, "train/total_loss": 0.17175740003585815 }, { "entropy": 8.485133171081543, "epoch": 0.9479928811548349, "mean_token_accuracy": 0.7577720284461975, "num_tokens": 29102713.0, "step": 9588, "train/ce_loss": 0.8084627389907837 }, { "epoch": 0.9479928811548349, "step": 9588, "train/sim_loss": 0.05078125 }, { "epoch": 0.9479928811548349, "step": 9588, "train/total_loss": 0.13162752985954285 }, { "entropy": 8.302215576171875, "epoch": 0.9480917540043504, "mean_token_accuracy": 0.7535714507102966, "num_tokens": 29108249.0, "step": 9589, "train/ce_loss": 0.6899745464324951 }, { "epoch": 0.9480917540043504, "step": 9589, "train/sim_loss": 0.046875 }, { "epoch": 0.9480917540043504, "step": 9589, "train/total_loss": 0.11587245762348175 }, { "entropy": 9.150361061096191, "epoch": 0.9481906268538659, "mean_token_accuracy": 0.720588207244873, "num_tokens": 29113185.0, "step": 9590, "train/ce_loss": 9.790721833269345e-07 }, { "epoch": 0.9481906268538659, "step": 9590, "train/sim_loss": 0.03125 }, { "epoch": 0.9481906268538659, "step": 9590, "train/total_loss": 0.03125009685754776 }, { "entropy": 8.1705322265625, "epoch": 0.9482894997033815, "mean_token_accuracy": 0.7268232107162476, "num_tokens": 29118477.0, "step": 9591, "train/ce_loss": 0.5983821749687195 }, { "epoch": 0.9482894997033815, "step": 9591, "train/sim_loss": 0.03125 }, { "epoch": 0.9482894997033815, "step": 9591, "train/total_loss": 0.09108822047710419 }, { "entropy": 8.875567436218262, "epoch": 0.948388372552897, "mean_token_accuracy": 0.7209302186965942, "num_tokens": 29123381.0, "step": 9592, "train/ce_loss": 0.9020702242851257 }, { "epoch": 0.948388372552897, "step": 9592, "train/sim_loss": 0.0234375 }, { "epoch": 0.948388372552897, "step": 9592, "train/total_loss": 0.11364452540874481 }, { "entropy": 8.887754440307617, "epoch": 0.9484872454024125, "mean_token_accuracy": 0.7001434564590454, "num_tokens": 29128591.0, "step": 9593, "train/ce_loss": 4.049984170251264e-07 }, { "epoch": 0.9484872454024125, "step": 9593, "train/sim_loss": 0.0390625 }, { "epoch": 0.9484872454024125, "step": 9593, "train/total_loss": 0.03906254097819328 }, { "entropy": 8.799975395202637, "epoch": 0.9485861182519281, "mean_token_accuracy": 0.757615864276886, "num_tokens": 29133798.0, "step": 9594, "train/ce_loss": 1.4511492252349854 }, { "epoch": 0.9485861182519281, "step": 9594, "train/sim_loss": 0.0859375 }, { "epoch": 0.9485861182519281, "step": 9594, "train/total_loss": 0.231052428483963 }, { "entropy": 8.386930465698242, "epoch": 0.9486849911014436, "mean_token_accuracy": 0.8011173009872437, "num_tokens": 29139193.0, "step": 9595, "train/ce_loss": 0.666289210319519 }, { "epoch": 0.9486849911014436, "step": 9595, "train/sim_loss": 0.0234375 }, { "epoch": 0.9486849911014436, "step": 9595, "train/total_loss": 0.09006642550230026 }, { "entropy": 8.39547061920166, "epoch": 0.948783863950959, "mean_token_accuracy": 0.7128072381019592, "num_tokens": 29144436.0, "step": 9596, "train/ce_loss": 1.6115036010742188 }, { "epoch": 0.948783863950959, "step": 9596, "train/sim_loss": 0.0390625 }, { "epoch": 0.948783863950959, "step": 9596, "train/total_loss": 0.20021286606788635 }, { "entropy": 8.999906539916992, "epoch": 0.9488827368004746, "mean_token_accuracy": 0.7307692170143127, "num_tokens": 29149488.0, "step": 9597, "train/ce_loss": 1.4049520586922881e-06 }, { "epoch": 0.9488827368004746, "step": 9597, "train/sim_loss": 0.04296875 }, { "epoch": 0.9488827368004746, "step": 9597, "train/total_loss": 0.04296889156103134 }, { "entropy": 9.10368824005127, "epoch": 0.9489816096499901, "mean_token_accuracy": 0.830232560634613, "num_tokens": 29154363.0, "step": 9598, "train/ce_loss": 1.1234755516052246 }, { "epoch": 0.9489816096499901, "step": 9598, "train/sim_loss": 0.0234375 }, { "epoch": 0.9489816096499901, "step": 9598, "train/total_loss": 0.1357850581407547 }, { "entropy": 9.209823608398438, "epoch": 0.9490804824995056, "mean_token_accuracy": 0.8063943386077881, "num_tokens": 29159343.0, "step": 9599, "train/ce_loss": 0.7923817038536072 }, { "epoch": 0.9490804824995056, "step": 9599, "train/sim_loss": 0.04296875 }, { "epoch": 0.9490804824995056, "step": 9599, "train/total_loss": 0.1222069188952446 }, { "epoch": 0.9491793553490212, "grad_norm": 0.5473429560661316, "learning_rate": 7.629184591801415e-06, "loss": 0.1311, "step": 9600 }, { "entropy": 8.758648872375488, "epoch": 0.9491793553490212, "mean_token_accuracy": 0.7156726717948914, "num_tokens": 29164485.0, "step": 9600, "train/ce_loss": 1.3763682842254639 }, { "epoch": 0.9491793553490212, "step": 9600, "train/sim_loss": 0.046875 }, { "epoch": 0.9491793553490212, "step": 9600, "train/total_loss": 0.18451182544231415 }, { "entropy": 8.479524612426758, "epoch": 0.9492782281985367, "mean_token_accuracy": 0.7768199443817139, "num_tokens": 29170003.0, "step": 9601, "train/ce_loss": 0.7482802271842957 }, { "epoch": 0.9492782281985367, "step": 9601, "train/sim_loss": 0.0390625 }, { "epoch": 0.9492782281985367, "step": 9601, "train/total_loss": 0.11389052122831345 }, { "entropy": 8.709478378295898, "epoch": 0.9493771010480522, "mean_token_accuracy": 0.7390776872634888, "num_tokens": 29175249.0, "step": 9602, "train/ce_loss": 0.5524051785469055 }, { "epoch": 0.9493771010480522, "step": 9602, "train/sim_loss": 0.05078125 }, { "epoch": 0.9493771010480522, "step": 9602, "train/total_loss": 0.10602176934480667 }, { "entropy": 8.462508201599121, "epoch": 0.9494759738975678, "mean_token_accuracy": 0.8088942170143127, "num_tokens": 29180548.0, "step": 9603, "train/ce_loss": 0.45807701349258423 }, { "epoch": 0.9494759738975678, "step": 9603, "train/sim_loss": 0.03515625 }, { "epoch": 0.9494759738975678, "step": 9603, "train/total_loss": 0.08096395432949066 }, { "entropy": 8.42574405670166, "epoch": 0.9495748467470833, "mean_token_accuracy": 0.7428571581840515, "num_tokens": 29185774.0, "step": 9604, "train/ce_loss": 0.8734005689620972 }, { "epoch": 0.9495748467470833, "step": 9604, "train/sim_loss": 0.03515625 }, { "epoch": 0.9495748467470833, "step": 9604, "train/total_loss": 0.12249630689620972 }, { "entropy": 8.75879955291748, "epoch": 0.9496737195965987, "mean_token_accuracy": 0.7345844507217407, "num_tokens": 29191030.0, "step": 9605, "train/ce_loss": 1.0893501043319702 }, { "epoch": 0.9496737195965987, "step": 9605, "train/sim_loss": 0.02734375 }, { "epoch": 0.9496737195965987, "step": 9605, "train/total_loss": 0.13627876341342926 }, { "entropy": 8.954879760742188, "epoch": 0.9497725924461143, "mean_token_accuracy": 0.7334235310554504, "num_tokens": 29196218.0, "step": 9606, "train/ce_loss": 1.2382944822311401 }, { "epoch": 0.9497725924461143, "step": 9606, "train/sim_loss": 0.09765625 }, { "epoch": 0.9497725924461143, "step": 9606, "train/total_loss": 0.2214857041835785 }, { "entropy": 8.716231346130371, "epoch": 0.9498714652956298, "mean_token_accuracy": 0.7468671798706055, "num_tokens": 29201482.0, "step": 9607, "train/ce_loss": 0.9447196125984192 }, { "epoch": 0.9498714652956298, "step": 9607, "train/sim_loss": 0.04296875 }, { "epoch": 0.9498714652956298, "step": 9607, "train/total_loss": 0.13744071125984192 }, { "entropy": 8.335722923278809, "epoch": 0.9499703381451453, "mean_token_accuracy": 0.7696390748023987, "num_tokens": 29206827.0, "step": 9608, "train/ce_loss": 0.7796752452850342 }, { "epoch": 0.9499703381451453, "step": 9608, "train/sim_loss": 0.03515625 }, { "epoch": 0.9499703381451453, "step": 9608, "train/total_loss": 0.11312377452850342 }, { "entropy": 8.784276962280273, "epoch": 0.9500692109946609, "mean_token_accuracy": 0.7550200819969177, "num_tokens": 29212004.0, "step": 9609, "train/ce_loss": 1.1044058799743652 }, { "epoch": 0.9500692109946609, "step": 9609, "train/sim_loss": 0.109375 }, { "epoch": 0.9500692109946609, "step": 9609, "train/total_loss": 0.21981558203697205 }, { "entropy": 8.904178619384766, "epoch": 0.9501680838441764, "mean_token_accuracy": 0.7452830076217651, "num_tokens": 29217230.0, "step": 9610, "train/ce_loss": 2.821152236265334e-07 }, { "epoch": 0.9501680838441764, "step": 9610, "train/sim_loss": 0.0546875 }, { "epoch": 0.9501680838441764, "step": 9610, "train/total_loss": 0.05468752980232239 }, { "entropy": 9.369682312011719, "epoch": 0.9502669566936919, "mean_token_accuracy": 0.7763158082962036, "num_tokens": 29222069.0, "step": 9611, "train/ce_loss": 9.107867526836344e-07 }, { "epoch": 0.9502669566936919, "step": 9611, "train/sim_loss": 0.046875 }, { "epoch": 0.9502669566936919, "step": 9611, "train/total_loss": 0.04687508940696716 }, { "entropy": 8.254669189453125, "epoch": 0.9503658295432075, "mean_token_accuracy": 0.6848049163818359, "num_tokens": 29227423.0, "step": 9612, "train/ce_loss": 1.081194519996643 }, { "epoch": 0.9503658295432075, "step": 9612, "train/sim_loss": 0.08203125 }, { "epoch": 0.9503658295432075, "step": 9612, "train/total_loss": 0.19015070796012878 }, { "entropy": 8.887582778930664, "epoch": 0.950464702392723, "mean_token_accuracy": 0.7818182110786438, "num_tokens": 29232676.0, "step": 9613, "train/ce_loss": 1.269497715838952e-07 }, { "epoch": 0.950464702392723, "step": 9613, "train/sim_loss": 0.01171875 }, { "epoch": 0.950464702392723, "step": 9613, "train/total_loss": 0.011718763038516045 }, { "entropy": 8.436988830566406, "epoch": 0.9505635752422384, "mean_token_accuracy": 0.7754459381103516, "num_tokens": 29238129.0, "step": 9614, "train/ce_loss": 0.8593966364860535 }, { "epoch": 0.9505635752422384, "step": 9614, "train/sim_loss": 0.0390625 }, { "epoch": 0.9505635752422384, "step": 9614, "train/total_loss": 0.1250021755695343 }, { "entropy": 8.663818359375, "epoch": 0.950662448091754, "mean_token_accuracy": 0.746268630027771, "num_tokens": 29243334.0, "step": 9615, "train/ce_loss": 0.8685634136199951 }, { "epoch": 0.950662448091754, "step": 9615, "train/sim_loss": 0.015625 }, { "epoch": 0.950662448091754, "step": 9615, "train/total_loss": 0.10248134285211563 }, { "entropy": 9.107805252075195, "epoch": 0.9507613209412695, "mean_token_accuracy": 0.7438271641731262, "num_tokens": 29248443.0, "step": 9616, "train/ce_loss": 1.4567358493804932 }, { "epoch": 0.9507613209412695, "step": 9616, "train/sim_loss": 0.06640625 }, { "epoch": 0.9507613209412695, "step": 9616, "train/total_loss": 0.21207983791828156 }, { "entropy": 8.361902236938477, "epoch": 0.950860193790785, "mean_token_accuracy": 0.7422266602516174, "num_tokens": 29254118.0, "step": 9617, "train/ce_loss": 0.8464975357055664 }, { "epoch": 0.950860193790785, "step": 9617, "train/sim_loss": 0.08984375 }, { "epoch": 0.950860193790785, "step": 9617, "train/total_loss": 0.17449350655078888 }, { "entropy": 8.67348861694336, "epoch": 0.9509590666403006, "mean_token_accuracy": 0.7758620977401733, "num_tokens": 29259309.0, "step": 9618, "train/ce_loss": 0.9091631174087524 }, { "epoch": 0.9509590666403006, "step": 9618, "train/sim_loss": 0.046875 }, { "epoch": 0.9509590666403006, "step": 9618, "train/total_loss": 0.13779130578041077 }, { "entropy": 9.485261917114258, "epoch": 0.9510579394898161, "mean_token_accuracy": 0.7659090757369995, "num_tokens": 29264202.0, "step": 9619, "train/ce_loss": 1.1151223588967696e-06 }, { "epoch": 0.9510579394898161, "step": 9619, "train/sim_loss": 0.03125 }, { "epoch": 0.9510579394898161, "step": 9619, "train/total_loss": 0.031250111758708954 }, { "epoch": 0.9511568123393316, "grad_norm": 0.7135426998138428, "learning_rate": 7.624239727043467e-06, "loss": 0.1283, "step": 9620 }, { "entropy": 9.077592849731445, "epoch": 0.9511568123393316, "mean_token_accuracy": 0.7478849291801453, "num_tokens": 29269192.0, "step": 9620, "train/ce_loss": 2.533438134832977e-07 }, { "epoch": 0.9511568123393316, "step": 9620, "train/sim_loss": 0.01171875 }, { "epoch": 0.9511568123393316, "step": 9620, "train/total_loss": 0.011718775145709515 }, { "entropy": 8.475610733032227, "epoch": 0.9512556851888472, "mean_token_accuracy": 0.7614907026290894, "num_tokens": 29274501.0, "step": 9621, "train/ce_loss": 0.9760831594467163 }, { "epoch": 0.9512556851888472, "step": 9621, "train/sim_loss": 0.0625 }, { "epoch": 0.9512556851888472, "step": 9621, "train/total_loss": 0.16010832786560059 }, { "entropy": 8.620203018188477, "epoch": 0.9513545580383627, "mean_token_accuracy": 0.6976484060287476, "num_tokens": 29279850.0, "step": 9622, "train/ce_loss": 0.9753076434135437 }, { "epoch": 0.9513545580383627, "step": 9622, "train/sim_loss": 0.06640625 }, { "epoch": 0.9513545580383627, "step": 9622, "train/total_loss": 0.1639370173215866 }, { "entropy": 8.828330039978027, "epoch": 0.9514534308878781, "mean_token_accuracy": 0.6967320442199707, "num_tokens": 29285023.0, "step": 9623, "train/ce_loss": 1.4751416444778442 }, { "epoch": 0.9514534308878781, "step": 9623, "train/sim_loss": 0.0703125 }, { "epoch": 0.9514534308878781, "step": 9623, "train/total_loss": 0.21782666444778442 }, { "entropy": 9.476432800292969, "epoch": 0.9515523037373937, "mean_token_accuracy": 0.6666666865348816, "num_tokens": 29289840.0, "step": 9624, "train/ce_loss": 2.2870726585388184 }, { "epoch": 0.9515523037373937, "step": 9624, "train/sim_loss": 0.05078125 }, { "epoch": 0.9515523037373937, "step": 9624, "train/total_loss": 0.2794885039329529 }, { "entropy": 8.957849502563477, "epoch": 0.9516511765869092, "mean_token_accuracy": 0.7439544796943665, "num_tokens": 29294973.0, "step": 9625, "train/ce_loss": 0.8902990818023682 }, { "epoch": 0.9516511765869092, "step": 9625, "train/sim_loss": 0.0625 }, { "epoch": 0.9516511765869092, "step": 9625, "train/total_loss": 0.15152990818023682 }, { "entropy": 8.880248069763184, "epoch": 0.9517500494364247, "mean_token_accuracy": 0.7473261952400208, "num_tokens": 29300196.0, "step": 9626, "train/ce_loss": 0.47349148988723755 }, { "epoch": 0.9517500494364247, "step": 9626, "train/sim_loss": 0.09375 }, { "epoch": 0.9517500494364247, "step": 9626, "train/total_loss": 0.14109915494918823 }, { "entropy": 8.577116966247559, "epoch": 0.9518489222859403, "mean_token_accuracy": 0.7583892345428467, "num_tokens": 29305536.0, "step": 9627, "train/ce_loss": 0.6490175127983093 }, { "epoch": 0.9518489222859403, "step": 9627, "train/sim_loss": 0.08203125 }, { "epoch": 0.9518489222859403, "step": 9627, "train/total_loss": 0.14693300426006317 }, { "entropy": 9.41624927520752, "epoch": 0.9519477951354558, "mean_token_accuracy": 0.7373272180557251, "num_tokens": 29310379.0, "step": 9628, "train/ce_loss": 1.0523601770401 }, { "epoch": 0.9519477951354558, "step": 9628, "train/sim_loss": 0.05078125 }, { "epoch": 0.9519477951354558, "step": 9628, "train/total_loss": 0.1560172736644745 }, { "entropy": 8.359001159667969, "epoch": 0.9520466679849713, "mean_token_accuracy": 0.7457831501960754, "num_tokens": 29315681.0, "step": 9629, "train/ce_loss": 1.1068475246429443 }, { "epoch": 0.9520466679849713, "step": 9629, "train/sim_loss": 0.015625 }, { "epoch": 0.9520466679849713, "step": 9629, "train/total_loss": 0.12630975246429443 }, { "entropy": 9.151994705200195, "epoch": 0.9521455408344869, "mean_token_accuracy": 0.8147059082984924, "num_tokens": 29320810.0, "step": 9630, "train/ce_loss": 0.5198752284049988 }, { "epoch": 0.9521455408344869, "step": 9630, "train/sim_loss": 0.015625 }, { "epoch": 0.9521455408344869, "step": 9630, "train/total_loss": 0.06761252880096436 }, { "entropy": 9.170339584350586, "epoch": 0.9522444136840024, "mean_token_accuracy": 0.7709401845932007, "num_tokens": 29325847.0, "step": 9631, "train/ce_loss": 0.9556100368499756 }, { "epoch": 0.9522444136840024, "step": 9631, "train/sim_loss": 0.015625 }, { "epoch": 0.9522444136840024, "step": 9631, "train/total_loss": 0.11118600517511368 }, { "entropy": 8.444564819335938, "epoch": 0.9523432865335179, "mean_token_accuracy": 0.7725714445114136, "num_tokens": 29331221.0, "step": 9632, "train/ce_loss": 0.5062717795372009 }, { "epoch": 0.9523432865335179, "step": 9632, "train/sim_loss": 0.015625 }, { "epoch": 0.9523432865335179, "step": 9632, "train/total_loss": 0.06625217944383621 }, { "entropy": 9.152999877929688, "epoch": 0.9524421593830334, "mean_token_accuracy": 0.7718023061752319, "num_tokens": 29336331.0, "step": 9633, "train/ce_loss": 3.02873019109029e-07 }, { "epoch": 0.9524421593830334, "step": 9633, "train/sim_loss": 0.015625 }, { "epoch": 0.9524421593830334, "step": 9633, "train/total_loss": 0.015625029802322388 }, { "entropy": 9.484378814697266, "epoch": 0.9525410322325489, "mean_token_accuracy": 0.7777777910232544, "num_tokens": 29341224.0, "step": 9634, "train/ce_loss": 1.2847749530919828e-06 }, { "epoch": 0.9525410322325489, "step": 9634, "train/sim_loss": 0.0390625 }, { "epoch": 0.9525410322325489, "step": 9634, "train/total_loss": 0.03906262665987015 }, { "entropy": 8.592009544372559, "epoch": 0.9526399050820644, "mean_token_accuracy": 0.7072879076004028, "num_tokens": 29346541.0, "step": 9635, "train/ce_loss": 1.1064434051513672 }, { "epoch": 0.9526399050820644, "step": 9635, "train/sim_loss": 0.0546875 }, { "epoch": 0.9526399050820644, "step": 9635, "train/total_loss": 0.16533184051513672 }, { "entropy": 7.667773246765137, "epoch": 0.95273877793158, "mean_token_accuracy": 0.6940418481826782, "num_tokens": 29352255.0, "step": 9636, "train/ce_loss": 0.5391709804534912 }, { "epoch": 0.95273877793158, "step": 9636, "train/sim_loss": 0.03515625 }, { "epoch": 0.95273877793158, "step": 9636, "train/total_loss": 0.08907334506511688 }, { "entropy": 9.034952163696289, "epoch": 0.9528376507810955, "mean_token_accuracy": 0.7628294229507446, "num_tokens": 29357433.0, "step": 9637, "train/ce_loss": 0.8349732756614685 }, { "epoch": 0.9528376507810955, "step": 9637, "train/sim_loss": 0.078125 }, { "epoch": 0.9528376507810955, "step": 9637, "train/total_loss": 0.1616223305463791 }, { "entropy": 8.987411499023438, "epoch": 0.952936523630611, "mean_token_accuracy": 0.7757773995399475, "num_tokens": 29362482.0, "step": 9638, "train/ce_loss": 1.2512165307998657 }, { "epoch": 0.952936523630611, "step": 9638, "train/sim_loss": 0.05078125 }, { "epoch": 0.952936523630611, "step": 9638, "train/total_loss": 0.17590290307998657 }, { "entropy": 8.629709243774414, "epoch": 0.9530353964801266, "mean_token_accuracy": 0.7574031949043274, "num_tokens": 29367840.0, "step": 9639, "train/ce_loss": 0.7920249104499817 }, { "epoch": 0.9530353964801266, "step": 9639, "train/sim_loss": 0.078125 }, { "epoch": 0.9530353964801266, "step": 9639, "train/total_loss": 0.15732750296592712 }, { "epoch": 0.9531342693296421, "grad_norm": 0.6226643919944763, "learning_rate": 7.6192948622855164e-06, "loss": 0.1297, "step": 9640 }, { "entropy": 9.593673706054688, "epoch": 0.9531342693296421, "mean_token_accuracy": 0.7761557102203369, "num_tokens": 29372689.0, "step": 9640, "train/ce_loss": 0.9557827115058899 }, { "epoch": 0.9531342693296421, "step": 9640, "train/sim_loss": 0.0546875 }, { "epoch": 0.9531342693296421, "step": 9640, "train/total_loss": 0.15026578307151794 }, { "entropy": 8.888915061950684, "epoch": 0.9532331421791576, "mean_token_accuracy": 0.7330383658409119, "num_tokens": 29377870.0, "step": 9641, "train/ce_loss": 1.5397324562072754 }, { "epoch": 0.9532331421791576, "step": 9641, "train/sim_loss": 0.03125 }, { "epoch": 0.9532331421791576, "step": 9641, "train/total_loss": 0.18522325158119202 }, { "entropy": 8.38487434387207, "epoch": 0.9533320150286732, "mean_token_accuracy": 0.7719486355781555, "num_tokens": 29383274.0, "step": 9642, "train/ce_loss": 0.7030945420265198 }, { "epoch": 0.9533320150286732, "step": 9642, "train/sim_loss": 0.015625 }, { "epoch": 0.9533320150286732, "step": 9642, "train/total_loss": 0.08593445271253586 }, { "entropy": 8.955349922180176, "epoch": 0.9534308878781886, "mean_token_accuracy": 0.7538461685180664, "num_tokens": 29388606.0, "step": 9643, "train/ce_loss": 4.794979986399994e-07 }, { "epoch": 0.9534308878781886, "step": 9643, "train/sim_loss": 0.0546875 }, { "epoch": 0.9534308878781886, "step": 9643, "train/total_loss": 0.05468754842877388 }, { "entropy": 8.561123847961426, "epoch": 0.9535297607277041, "mean_token_accuracy": 0.8042105436325073, "num_tokens": 29394009.0, "step": 9644, "train/ce_loss": 0.6880059838294983 }, { "epoch": 0.9535297607277041, "step": 9644, "train/sim_loss": 0.04296875 }, { "epoch": 0.9535297607277041, "step": 9644, "train/total_loss": 0.11176934838294983 }, { "entropy": 8.632359504699707, "epoch": 0.9536286335772197, "mean_token_accuracy": 0.7747524976730347, "num_tokens": 29399300.0, "step": 9645, "train/ce_loss": 0.5286980867385864 }, { "epoch": 0.9536286335772197, "step": 9645, "train/sim_loss": 0.03125 }, { "epoch": 0.9536286335772197, "step": 9645, "train/total_loss": 0.08411981165409088 }, { "entropy": 9.39550495147705, "epoch": 0.9537275064267352, "mean_token_accuracy": 0.760765552520752, "num_tokens": 29404308.0, "step": 9646, "train/ce_loss": 0.5505694150924683 }, { "epoch": 0.9537275064267352, "step": 9646, "train/sim_loss": 0.01953125 }, { "epoch": 0.9537275064267352, "step": 9646, "train/total_loss": 0.07458819448947906 }, { "entropy": 9.014603614807129, "epoch": 0.9538263792762507, "mean_token_accuracy": 0.7614285945892334, "num_tokens": 29409535.0, "step": 9647, "train/ce_loss": 0.5262411832809448 }, { "epoch": 0.9538263792762507, "step": 9647, "train/sim_loss": 0.015625 }, { "epoch": 0.9538263792762507, "step": 9647, "train/total_loss": 0.06824912130832672 }, { "entropy": 9.35798168182373, "epoch": 0.9539252521257663, "mean_token_accuracy": 0.8034825921058655, "num_tokens": 29414344.0, "step": 9648, "train/ce_loss": 1.2550104856491089 }, { "epoch": 0.9539252521257663, "step": 9648, "train/sim_loss": 0.03515625 }, { "epoch": 0.9539252521257663, "step": 9648, "train/total_loss": 0.16065730154514313 }, { "entropy": 8.385096549987793, "epoch": 0.9540241249752818, "mean_token_accuracy": 0.7011995911598206, "num_tokens": 29419718.0, "step": 9649, "train/ce_loss": 0.684291422367096 }, { "epoch": 0.9540241249752818, "step": 9649, "train/sim_loss": 0.03125 }, { "epoch": 0.9540241249752818, "step": 9649, "train/total_loss": 0.0996791422367096 }, { "entropy": 9.243229866027832, "epoch": 0.9541229978247973, "mean_token_accuracy": 0.7060367465019226, "num_tokens": 29424501.0, "step": 9650, "train/ce_loss": 4.0800068745738827e-07 }, { "epoch": 0.9541229978247973, "step": 9650, "train/sim_loss": 0.015625 }, { "epoch": 0.9541229978247973, "step": 9650, "train/total_loss": 0.015625040978193283 }, { "entropy": 8.5402193069458, "epoch": 0.9542218706743129, "mean_token_accuracy": 0.7251521348953247, "num_tokens": 29429954.0, "step": 9651, "train/ce_loss": 0.5222027897834778 }, { "epoch": 0.9542218706743129, "step": 9651, "train/sim_loss": 0.01953125 }, { "epoch": 0.9542218706743129, "step": 9651, "train/total_loss": 0.07175153493881226 }, { "entropy": 8.449823379516602, "epoch": 0.9543207435238283, "mean_token_accuracy": 0.7398906946182251, "num_tokens": 29435328.0, "step": 9652, "train/ce_loss": 0.9733685255050659 }, { "epoch": 0.9543207435238283, "step": 9652, "train/sim_loss": 0.02734375 }, { "epoch": 0.9543207435238283, "step": 9652, "train/total_loss": 0.12468060106039047 }, { "entropy": 8.732869148254395, "epoch": 0.9544196163733438, "mean_token_accuracy": 0.7914831042289734, "num_tokens": 29440456.0, "step": 9653, "train/ce_loss": 0.6452661156654358 }, { "epoch": 0.9544196163733438, "step": 9653, "train/sim_loss": 0.0390625 }, { "epoch": 0.9544196163733438, "step": 9653, "train/total_loss": 0.10358911007642746 }, { "entropy": 9.277456283569336, "epoch": 0.9545184892228594, "mean_token_accuracy": 0.8242678046226501, "num_tokens": 29445359.0, "step": 9654, "train/ce_loss": 1.2522435188293457 }, { "epoch": 0.9545184892228594, "step": 9654, "train/sim_loss": 0.01171875 }, { "epoch": 0.9545184892228594, "step": 9654, "train/total_loss": 0.13694310188293457 }, { "entropy": 8.717188835144043, "epoch": 0.9546173620723749, "mean_token_accuracy": 0.7574257254600525, "num_tokens": 29450639.0, "step": 9655, "train/ce_loss": 0.5885151624679565 }, { "epoch": 0.9546173620723749, "step": 9655, "train/sim_loss": 0.03515625 }, { "epoch": 0.9546173620723749, "step": 9655, "train/total_loss": 0.09400776773691177 }, { "entropy": 9.030670166015625, "epoch": 0.9547162349218904, "mean_token_accuracy": 0.7245762944221497, "num_tokens": 29455741.0, "step": 9656, "train/ce_loss": 1.4345485510602884e-07 }, { "epoch": 0.9547162349218904, "step": 9656, "train/sim_loss": 0.015625 }, { "epoch": 0.9547162349218904, "step": 9656, "train/total_loss": 0.015625014901161194 }, { "entropy": 9.049999237060547, "epoch": 0.954815107771406, "mean_token_accuracy": 0.7521968483924866, "num_tokens": 29460741.0, "step": 9657, "train/ce_loss": 0.8397353887557983 }, { "epoch": 0.954815107771406, "step": 9657, "train/sim_loss": 0.04296875 }, { "epoch": 0.954815107771406, "step": 9657, "train/total_loss": 0.12694229185581207 }, { "entropy": 8.608626365661621, "epoch": 0.9549139806209215, "mean_token_accuracy": 0.7580274939537048, "num_tokens": 29466118.0, "step": 9658, "train/ce_loss": 0.45035797357559204 }, { "epoch": 0.9549139806209215, "step": 9658, "train/sim_loss": 0.05859375 }, { "epoch": 0.9549139806209215, "step": 9658, "train/total_loss": 0.10362954437732697 }, { "entropy": 9.093671798706055, "epoch": 0.9550128534704371, "mean_token_accuracy": 0.7730496525764465, "num_tokens": 29471203.0, "step": 9659, "train/ce_loss": 0.7704161405563354 }, { "epoch": 0.9550128534704371, "step": 9659, "train/sim_loss": 0.07421875 }, { "epoch": 0.9550128534704371, "step": 9659, "train/total_loss": 0.1512603759765625 }, { "epoch": 0.9551117263199526, "grad_norm": 0.7387588620185852, "learning_rate": 7.614349997527568e-06, "loss": 0.1241, "step": 9660 }, { "entropy": 8.689239501953125, "epoch": 0.9551117263199526, "mean_token_accuracy": 0.7180365324020386, "num_tokens": 29476548.0, "step": 9660, "train/ce_loss": 0.4345012605190277 }, { "epoch": 0.9551117263199526, "step": 9660, "train/sim_loss": 0.05078125 }, { "epoch": 0.9551117263199526, "step": 9660, "train/total_loss": 0.09423138201236725 }, { "entropy": 8.494304656982422, "epoch": 0.955210599169468, "mean_token_accuracy": 0.7832568883895874, "num_tokens": 29481913.0, "step": 9661, "train/ce_loss": 0.9814844131469727 }, { "epoch": 0.955210599169468, "step": 9661, "train/sim_loss": 0.02734375 }, { "epoch": 0.955210599169468, "step": 9661, "train/total_loss": 0.1254921853542328 }, { "entropy": 8.977173805236816, "epoch": 0.9553094720189836, "mean_token_accuracy": 0.755464494228363, "num_tokens": 29487064.0, "step": 9662, "train/ce_loss": 0.5466949939727783 }, { "epoch": 0.9553094720189836, "step": 9662, "train/sim_loss": 0.01953125 }, { "epoch": 0.9553094720189836, "step": 9662, "train/total_loss": 0.07420074939727783 }, { "entropy": 8.666604995727539, "epoch": 0.9554083448684991, "mean_token_accuracy": 0.7538644671440125, "num_tokens": 29492379.0, "step": 9663, "train/ce_loss": 1.08368718624115 }, { "epoch": 0.9554083448684991, "step": 9663, "train/sim_loss": 0.0546875 }, { "epoch": 0.9554083448684991, "step": 9663, "train/total_loss": 0.16305622458457947 }, { "entropy": 8.567361831665039, "epoch": 0.9555072177180146, "mean_token_accuracy": 0.747474730014801, "num_tokens": 29497766.0, "step": 9664, "train/ce_loss": 0.9273149967193604 }, { "epoch": 0.9555072177180146, "step": 9664, "train/sim_loss": 0.13671875 }, { "epoch": 0.9555072177180146, "step": 9664, "train/total_loss": 0.2294502556324005 }, { "entropy": 9.599896430969238, "epoch": 0.9556060905675302, "mean_token_accuracy": 0.7493734359741211, "num_tokens": 29502580.0, "step": 9665, "train/ce_loss": 2.07197642326355 }, { "epoch": 0.9556060905675302, "step": 9665, "train/sim_loss": 0.0703125 }, { "epoch": 0.9556060905675302, "step": 9665, "train/total_loss": 0.2775101661682129 }, { "entropy": 8.539717674255371, "epoch": 0.9557049634170457, "mean_token_accuracy": 0.7021898031234741, "num_tokens": 29507763.0, "step": 9666, "train/ce_loss": 0.8698290586471558 }, { "epoch": 0.9557049634170457, "step": 9666, "train/sim_loss": 0.046875 }, { "epoch": 0.9557049634170457, "step": 9666, "train/total_loss": 0.13385790586471558 }, { "entropy": 8.689737319946289, "epoch": 0.9558038362665612, "mean_token_accuracy": 0.7953714728355408, "num_tokens": 29513050.0, "step": 9667, "train/ce_loss": 0.3238702714443207 }, { "epoch": 0.9558038362665612, "step": 9667, "train/sim_loss": 0.015625 }, { "epoch": 0.9558038362665612, "step": 9667, "train/total_loss": 0.04801202937960625 }, { "entropy": 8.805400848388672, "epoch": 0.9559027091160768, "mean_token_accuracy": 0.7234352231025696, "num_tokens": 29518208.0, "step": 9668, "train/ce_loss": 1.4133466482162476 }, { "epoch": 0.9559027091160768, "step": 9668, "train/sim_loss": 0.0390625 }, { "epoch": 0.9559027091160768, "step": 9668, "train/total_loss": 0.180397167801857 }, { "entropy": 8.859384536743164, "epoch": 0.9560015819655923, "mean_token_accuracy": 0.6579973697662354, "num_tokens": 29523450.0, "step": 9669, "train/ce_loss": 1.8035385608673096 }, { "epoch": 0.9560015819655923, "step": 9669, "train/sim_loss": 0.05859375 }, { "epoch": 0.9560015819655923, "step": 9669, "train/total_loss": 0.23894761502742767 }, { "entropy": 8.606244087219238, "epoch": 0.9561004548151077, "mean_token_accuracy": 0.7643097639083862, "num_tokens": 29528827.0, "step": 9670, "train/ce_loss": 0.9713116884231567 }, { "epoch": 0.9561004548151077, "step": 9670, "train/sim_loss": 0.109375 }, { "epoch": 0.9561004548151077, "step": 9670, "train/total_loss": 0.2065061628818512 }, { "entropy": 8.814547538757324, "epoch": 0.9561993276646233, "mean_token_accuracy": 0.7670885920524597, "num_tokens": 29534110.0, "step": 9671, "train/ce_loss": 0.8379871845245361 }, { "epoch": 0.9561993276646233, "step": 9671, "train/sim_loss": 0.0625 }, { "epoch": 0.9561993276646233, "step": 9671, "train/total_loss": 0.14629872143268585 }, { "entropy": 8.33130931854248, "epoch": 0.9562982005141388, "mean_token_accuracy": 0.6969397664070129, "num_tokens": 29539627.0, "step": 9672, "train/ce_loss": 0.6200782656669617 }, { "epoch": 0.9562982005141388, "step": 9672, "train/sim_loss": 0.04296875 }, { "epoch": 0.9562982005141388, "step": 9672, "train/total_loss": 0.1049765795469284 }, { "entropy": 8.958049774169922, "epoch": 0.9563970733636543, "mean_token_accuracy": 0.7506925463676453, "num_tokens": 29544822.0, "step": 9673, "train/ce_loss": 0.6942681670188904 }, { "epoch": 0.9563970733636543, "step": 9673, "train/sim_loss": 0.06640625 }, { "epoch": 0.9563970733636543, "step": 9673, "train/total_loss": 0.13583306968212128 }, { "entropy": 8.575597763061523, "epoch": 0.9564959462131699, "mean_token_accuracy": 0.695364236831665, "num_tokens": 29550184.0, "step": 9674, "train/ce_loss": 0.8115186095237732 }, { "epoch": 0.9564959462131699, "step": 9674, "train/sim_loss": 0.0625 }, { "epoch": 0.9564959462131699, "step": 9674, "train/total_loss": 0.14365187287330627 }, { "entropy": 8.64834213256836, "epoch": 0.9565948190626854, "mean_token_accuracy": 0.7287761569023132, "num_tokens": 29555594.0, "step": 9675, "train/ce_loss": 0.45716869831085205 }, { "epoch": 0.9565948190626854, "step": 9675, "train/sim_loss": 0.09375 }, { "epoch": 0.9565948190626854, "step": 9675, "train/total_loss": 0.13946686685085297 }, { "entropy": 8.666889190673828, "epoch": 0.9566936919122009, "mean_token_accuracy": 0.7227272987365723, "num_tokens": 29560910.0, "step": 9676, "train/ce_loss": 0.3776177763938904 }, { "epoch": 0.9566936919122009, "step": 9676, "train/sim_loss": 0.05859375 }, { "epoch": 0.9566936919122009, "step": 9676, "train/total_loss": 0.09635552763938904 }, { "entropy": 8.791280746459961, "epoch": 0.9567925647617165, "mean_token_accuracy": 0.7641395926475525, "num_tokens": 29566179.0, "step": 9677, "train/ce_loss": 0.40096956491470337 }, { "epoch": 0.9567925647617165, "step": 9677, "train/sim_loss": 0.0234375 }, { "epoch": 0.9567925647617165, "step": 9677, "train/total_loss": 0.0635344535112381 }, { "entropy": 9.04377269744873, "epoch": 0.956891437611232, "mean_token_accuracy": 0.7180555462837219, "num_tokens": 29571332.0, "step": 9678, "train/ce_loss": 1.2874047756195068 }, { "epoch": 0.956891437611232, "step": 9678, "train/sim_loss": 0.03125 }, { "epoch": 0.956891437611232, "step": 9678, "train/total_loss": 0.15999047458171844 }, { "entropy": 9.064447402954102, "epoch": 0.9569903104607475, "mean_token_accuracy": 0.7198142409324646, "num_tokens": 29576401.0, "step": 9679, "train/ce_loss": 1.0077346563339233 }, { "epoch": 0.9569903104607475, "step": 9679, "train/sim_loss": 0.0234375 }, { "epoch": 0.9569903104607475, "step": 9679, "train/total_loss": 0.12421096861362457 }, { "epoch": 0.957089183310263, "grad_norm": 0.8103324770927429, "learning_rate": 7.609405132769619e-06, "loss": 0.1383, "step": 9680 }, { "entropy": 8.435659408569336, "epoch": 0.957089183310263, "mean_token_accuracy": 0.7245119214057922, "num_tokens": 29581723.0, "step": 9680, "train/ce_loss": 0.8044260740280151 }, { "epoch": 0.957089183310263, "step": 9680, "train/sim_loss": 0.05078125 }, { "epoch": 0.957089183310263, "step": 9680, "train/total_loss": 0.1312238574028015 }, { "entropy": 9.460543632507324, "epoch": 0.9571880561597785, "mean_token_accuracy": 0.7295454740524292, "num_tokens": 29586561.0, "step": 9681, "train/ce_loss": 1.2621893882751465 }, { "epoch": 0.9571880561597785, "step": 9681, "train/sim_loss": 0.0390625 }, { "epoch": 0.9571880561597785, "step": 9681, "train/total_loss": 0.16528144478797913 }, { "entropy": 8.640851974487305, "epoch": 0.957286929009294, "mean_token_accuracy": 0.7362045645713806, "num_tokens": 29591791.0, "step": 9682, "train/ce_loss": 0.948087751865387 }, { "epoch": 0.957286929009294, "step": 9682, "train/sim_loss": 0.078125 }, { "epoch": 0.957286929009294, "step": 9682, "train/total_loss": 0.17293378710746765 }, { "entropy": 9.918706893920898, "epoch": 0.9573858018588096, "mean_token_accuracy": 0.7696078419685364, "num_tokens": 29596377.0, "step": 9683, "train/ce_loss": 2.9881794452667236 }, { "epoch": 0.9573858018588096, "step": 9683, "train/sim_loss": 0.08984375 }, { "epoch": 0.9573858018588096, "step": 9683, "train/total_loss": 0.3886617124080658 }, { "entropy": 8.680938720703125, "epoch": 0.9574846747083251, "mean_token_accuracy": 0.7422279715538025, "num_tokens": 29601598.0, "step": 9684, "train/ce_loss": 0.5348566174507141 }, { "epoch": 0.9574846747083251, "step": 9684, "train/sim_loss": 0.046875 }, { "epoch": 0.9574846747083251, "step": 9684, "train/total_loss": 0.10036066174507141 }, { "entropy": 8.284825325012207, "epoch": 0.9575835475578406, "mean_token_accuracy": 0.7689906358718872, "num_tokens": 29607074.0, "step": 9685, "train/ce_loss": 0.5938931703567505 }, { "epoch": 0.9575835475578406, "step": 9685, "train/sim_loss": 0.04296875 }, { "epoch": 0.9575835475578406, "step": 9685, "train/total_loss": 0.10235807299613953 }, { "entropy": 8.49846363067627, "epoch": 0.9576824204073562, "mean_token_accuracy": 0.7897196412086487, "num_tokens": 29612427.0, "step": 9686, "train/ce_loss": 0.4811134338378906 }, { "epoch": 0.9576824204073562, "step": 9686, "train/sim_loss": 0.0234375 }, { "epoch": 0.9576824204073562, "step": 9686, "train/total_loss": 0.07154884934425354 }, { "entropy": 8.664314270019531, "epoch": 0.9577812932568717, "mean_token_accuracy": 0.720812201499939, "num_tokens": 29617668.0, "step": 9687, "train/ce_loss": 0.7171815037727356 }, { "epoch": 0.9577812932568717, "step": 9687, "train/sim_loss": 0.046875 }, { "epoch": 0.9577812932568717, "step": 9687, "train/total_loss": 0.11859314888715744 }, { "entropy": 9.304027557373047, "epoch": 0.9578801661063872, "mean_token_accuracy": 0.7219047546386719, "num_tokens": 29622602.0, "step": 9688, "train/ce_loss": 3.151641010390449e-07 }, { "epoch": 0.9578801661063872, "step": 9688, "train/sim_loss": 0.01953125 }, { "epoch": 0.9578801661063872, "step": 9688, "train/total_loss": 0.019531281664967537 }, { "entropy": 9.046564102172852, "epoch": 0.9579790389559028, "mean_token_accuracy": 0.6889952421188354, "num_tokens": 29627665.0, "step": 9689, "train/ce_loss": 1.4818629026412964 }, { "epoch": 0.9579790389559028, "step": 9689, "train/sim_loss": 0.109375 }, { "epoch": 0.9579790389559028, "step": 9689, "train/total_loss": 0.2575612962245941 }, { "entropy": 8.488359451293945, "epoch": 0.9580779118054182, "mean_token_accuracy": 0.7052153944969177, "num_tokens": 29633056.0, "step": 9690, "train/ce_loss": 0.4829596281051636 }, { "epoch": 0.9580779118054182, "step": 9690, "train/sim_loss": 0.03125 }, { "epoch": 0.9580779118054182, "step": 9690, "train/total_loss": 0.07954595983028412 }, { "entropy": 8.586043357849121, "epoch": 0.9581767846549337, "mean_token_accuracy": 0.7146092653274536, "num_tokens": 29638386.0, "step": 9691, "train/ce_loss": 1.2549149990081787 }, { "epoch": 0.9581767846549337, "step": 9691, "train/sim_loss": 0.0390625 }, { "epoch": 0.9581767846549337, "step": 9691, "train/total_loss": 0.16455399990081787 }, { "entropy": 8.421553611755371, "epoch": 0.9582756575044493, "mean_token_accuracy": 0.7447306513786316, "num_tokens": 29643718.0, "step": 9692, "train/ce_loss": 0.5929149985313416 }, { "epoch": 0.9582756575044493, "step": 9692, "train/sim_loss": 0.0859375 }, { "epoch": 0.9582756575044493, "step": 9692, "train/total_loss": 0.14522899687290192 }, { "entropy": 8.817549705505371, "epoch": 0.9583745303539648, "mean_token_accuracy": 0.6828644275665283, "num_tokens": 29648974.0, "step": 9693, "train/ce_loss": 1.9984155893325806 }, { "epoch": 0.9583745303539648, "step": 9693, "train/sim_loss": 0.046875 }, { "epoch": 0.9583745303539648, "step": 9693, "train/total_loss": 0.24671655893325806 }, { "entropy": 9.072053909301758, "epoch": 0.9584734032034803, "mean_token_accuracy": 0.7591911554336548, "num_tokens": 29653978.0, "step": 9694, "train/ce_loss": 8.633719517092686e-07 }, { "epoch": 0.9584734032034803, "step": 9694, "train/sim_loss": 0.05859375 }, { "epoch": 0.9584734032034803, "step": 9694, "train/total_loss": 0.058593835681676865 }, { "entropy": 9.154497146606445, "epoch": 0.9585722760529959, "mean_token_accuracy": 0.7762646079063416, "num_tokens": 29658966.0, "step": 9695, "train/ce_loss": 0.8793838620185852 }, { "epoch": 0.9585722760529959, "step": 9695, "train/sim_loss": 0.03125 }, { "epoch": 0.9585722760529959, "step": 9695, "train/total_loss": 0.11918839067220688 }, { "entropy": 8.997401237487793, "epoch": 0.9586711489025114, "mean_token_accuracy": 0.7197368144989014, "num_tokens": 29664156.0, "step": 9696, "train/ce_loss": 0.655886173248291 }, { "epoch": 0.9586711489025114, "step": 9696, "train/sim_loss": 0.05078125 }, { "epoch": 0.9586711489025114, "step": 9696, "train/total_loss": 0.11636986583471298 }, { "entropy": 8.906684875488281, "epoch": 0.9587700217520269, "mean_token_accuracy": 0.6468305587768555, "num_tokens": 29669328.0, "step": 9697, "train/ce_loss": 0.8485145568847656 }, { "epoch": 0.9587700217520269, "step": 9697, "train/sim_loss": 0.08203125 }, { "epoch": 0.9587700217520269, "step": 9697, "train/total_loss": 0.1668827086687088 }, { "entropy": 8.718497276306152, "epoch": 0.9588688946015425, "mean_token_accuracy": 0.751937985420227, "num_tokens": 29674657.0, "step": 9698, "train/ce_loss": 0.854433000087738 }, { "epoch": 0.9588688946015425, "step": 9698, "train/sim_loss": 0.02734375 }, { "epoch": 0.9588688946015425, "step": 9698, "train/total_loss": 0.11278705298900604 }, { "entropy": 8.88694953918457, "epoch": 0.9589677674510579, "mean_token_accuracy": 0.6712749600410461, "num_tokens": 29679726.0, "step": 9699, "train/ce_loss": 5.976818897579506e-07 }, { "epoch": 0.9589677674510579, "step": 9699, "train/sim_loss": 0.0234375 }, { "epoch": 0.9589677674510579, "step": 9699, "train/total_loss": 0.023437559604644775 }, { "epoch": 0.9590666403005734, "grad_norm": 0.7731952667236328, "learning_rate": 7.6044602680116705e-06, "loss": 0.1421, "step": 9700 }, { "entropy": 8.920503616333008, "epoch": 0.9590666403005734, "mean_token_accuracy": 0.7343957424163818, "num_tokens": 29684953.0, "step": 9700, "train/ce_loss": 3.4321760722377803e-07 }, { "epoch": 0.9590666403005734, "step": 9700, "train/sim_loss": 0.03515625 }, { "epoch": 0.9590666403005734, "step": 9700, "train/total_loss": 0.035156283527612686 }, { "entropy": 9.540019989013672, "epoch": 0.959165513150089, "mean_token_accuracy": 0.7209302186965942, "num_tokens": 29689795.0, "step": 9701, "train/ce_loss": 7.614484047735459e-07 }, { "epoch": 0.959165513150089, "step": 9701, "train/sim_loss": 0.03125 }, { "epoch": 0.959165513150089, "step": 9701, "train/total_loss": 0.03125007450580597 }, { "entropy": 8.575620651245117, "epoch": 0.9592643859996045, "mean_token_accuracy": 0.7465618848800659, "num_tokens": 29695306.0, "step": 9702, "train/ce_loss": 0.6707134246826172 }, { "epoch": 0.9592643859996045, "step": 9702, "train/sim_loss": 0.05859375 }, { "epoch": 0.9592643859996045, "step": 9702, "train/total_loss": 0.1256650984287262 }, { "entropy": 9.787461280822754, "epoch": 0.95936325884912, "mean_token_accuracy": 0.7609755992889404, "num_tokens": 29699883.0, "step": 9703, "train/ce_loss": 9.148747267317958e-06 }, { "epoch": 0.95936325884912, "step": 9703, "train/sim_loss": 0.03515625 }, { "epoch": 0.95936325884912, "step": 9703, "train/total_loss": 0.03515716642141342 }, { "entropy": 8.471220016479492, "epoch": 0.9594621316986356, "mean_token_accuracy": 0.7642998099327087, "num_tokens": 29705421.0, "step": 9704, "train/ce_loss": 0.848787248134613 }, { "epoch": 0.9594621316986356, "step": 9704, "train/sim_loss": 0.0390625 }, { "epoch": 0.9594621316986356, "step": 9704, "train/total_loss": 0.12394122779369354 }, { "entropy": 9.439668655395508, "epoch": 0.9595610045481511, "mean_token_accuracy": 0.7807376980781555, "num_tokens": 29710337.0, "step": 9705, "train/ce_loss": 0.8956339359283447 }, { "epoch": 0.9595610045481511, "step": 9705, "train/sim_loss": 0.0859375 }, { "epoch": 0.9595610045481511, "step": 9705, "train/total_loss": 0.17550089955329895 }, { "entropy": 8.944560050964355, "epoch": 0.9596598773976666, "mean_token_accuracy": 0.7627737522125244, "num_tokens": 29715623.0, "step": 9706, "train/ce_loss": 1.211319923400879 }, { "epoch": 0.9596598773976666, "step": 9706, "train/sim_loss": 0.0546875 }, { "epoch": 0.9596598773976666, "step": 9706, "train/total_loss": 0.1758194863796234 }, { "entropy": 8.496309280395508, "epoch": 0.9597587502471822, "mean_token_accuracy": 0.6829004287719727, "num_tokens": 29721051.0, "step": 9707, "train/ce_loss": 1.096146583557129 }, { "epoch": 0.9597587502471822, "step": 9707, "train/sim_loss": 0.109375 }, { "epoch": 0.9597587502471822, "step": 9707, "train/total_loss": 0.21898967027664185 }, { "entropy": 8.67544937133789, "epoch": 0.9598576230966976, "mean_token_accuracy": 0.730140209197998, "num_tokens": 29726388.0, "step": 9708, "train/ce_loss": 0.6697824597358704 }, { "epoch": 0.9598576230966976, "step": 9708, "train/sim_loss": 0.03515625 }, { "epoch": 0.9598576230966976, "step": 9708, "train/total_loss": 0.10213449597358704 }, { "entropy": 8.619380950927734, "epoch": 0.9599564959462131, "mean_token_accuracy": 0.689538836479187, "num_tokens": 29731719.0, "step": 9709, "train/ce_loss": 0.7001871466636658 }, { "epoch": 0.9599564959462131, "step": 9709, "train/sim_loss": 0.0390625 }, { "epoch": 0.9599564959462131, "step": 9709, "train/total_loss": 0.1090812161564827 }, { "entropy": 9.104570388793945, "epoch": 0.9600553687957287, "mean_token_accuracy": 0.7729941010475159, "num_tokens": 29736686.0, "step": 9710, "train/ce_loss": 1.1940386295318604 }, { "epoch": 0.9600553687957287, "step": 9710, "train/sim_loss": 0.05859375 }, { "epoch": 0.9600553687957287, "step": 9710, "train/total_loss": 0.1779976189136505 }, { "entropy": 8.630685806274414, "epoch": 0.9601542416452442, "mean_token_accuracy": 0.7542168498039246, "num_tokens": 29741962.0, "step": 9711, "train/ce_loss": 0.6684284210205078 }, { "epoch": 0.9601542416452442, "step": 9711, "train/sim_loss": 0.046875 }, { "epoch": 0.9601542416452442, "step": 9711, "train/total_loss": 0.11371784657239914 }, { "entropy": 9.062545776367188, "epoch": 0.9602531144947597, "mean_token_accuracy": 0.720447301864624, "num_tokens": 29747047.0, "step": 9712, "train/ce_loss": 1.7544846534729004 }, { "epoch": 0.9602531144947597, "step": 9712, "train/sim_loss": 0.078125 }, { "epoch": 0.9602531144947597, "step": 9712, "train/total_loss": 0.253573477268219 }, { "entropy": 10.125118255615234, "epoch": 0.9603519873442753, "mean_token_accuracy": 0.6926069855690002, "num_tokens": 29751652.0, "step": 9713, "train/ce_loss": 6.363983402479789e-07 }, { "epoch": 0.9603519873442753, "step": 9713, "train/sim_loss": 0.01953125 }, { "epoch": 0.9603519873442753, "step": 9713, "train/total_loss": 0.019531313329935074 }, { "entropy": 8.604927062988281, "epoch": 0.9604508601937908, "mean_token_accuracy": 0.7398785352706909, "num_tokens": 29757145.0, "step": 9714, "train/ce_loss": 0.7927497625350952 }, { "epoch": 0.9604508601937908, "step": 9714, "train/sim_loss": 0.08984375 }, { "epoch": 0.9604508601937908, "step": 9714, "train/total_loss": 0.169118732213974 }, { "entropy": 8.794660568237305, "epoch": 0.9605497330433063, "mean_token_accuracy": 0.796316385269165, "num_tokens": 29762488.0, "step": 9715, "train/ce_loss": 0.6840125918388367 }, { "epoch": 0.9605497330433063, "step": 9715, "train/sim_loss": 0.0625 }, { "epoch": 0.9605497330433063, "step": 9715, "train/total_loss": 0.1309012621641159 }, { "entropy": 8.498865127563477, "epoch": 0.9606486058928219, "mean_token_accuracy": 0.7266536951065063, "num_tokens": 29768016.0, "step": 9716, "train/ce_loss": 1.4905140399932861 }, { "epoch": 0.9606486058928219, "step": 9716, "train/sim_loss": 0.125 }, { "epoch": 0.9606486058928219, "step": 9716, "train/total_loss": 0.2740514278411865 }, { "entropy": 8.539307594299316, "epoch": 0.9607474787423373, "mean_token_accuracy": 0.6680244207382202, "num_tokens": 29773440.0, "step": 9717, "train/ce_loss": 0.664154589176178 }, { "epoch": 0.9607474787423373, "step": 9717, "train/sim_loss": 0.015625 }, { "epoch": 0.9607474787423373, "step": 9717, "train/total_loss": 0.0820404589176178 }, { "entropy": 9.315811157226562, "epoch": 0.9608463515918528, "mean_token_accuracy": 0.6927710771560669, "num_tokens": 29778373.0, "step": 9718, "train/ce_loss": 0.6773354411125183 }, { "epoch": 0.9608463515918528, "step": 9718, "train/sim_loss": 0.08203125 }, { "epoch": 0.9608463515918528, "step": 9718, "train/total_loss": 0.1497648060321808 }, { "entropy": 8.938533782958984, "epoch": 0.9609452244413684, "mean_token_accuracy": 0.6992481350898743, "num_tokens": 29783528.0, "step": 9719, "train/ce_loss": 1.8118160963058472 }, { "epoch": 0.9609452244413684, "step": 9719, "train/sim_loss": 0.08984375 }, { "epoch": 0.9609452244413684, "step": 9719, "train/total_loss": 0.2710253596305847 }, { "epoch": 0.9610440972908839, "grad_norm": 0.7465754151344299, "learning_rate": 7.599515403253722e-06, "loss": 0.1395, "step": 9720 }, { "entropy": 8.669353485107422, "epoch": 0.9610440972908839, "mean_token_accuracy": 0.7660500407218933, "num_tokens": 29788915.0, "step": 9720, "train/ce_loss": 0.6009553074836731 }, { "epoch": 0.9610440972908839, "step": 9720, "train/sim_loss": 0.01953125 }, { "epoch": 0.9610440972908839, "step": 9720, "train/total_loss": 0.07962678372859955 }, { "entropy": 8.840965270996094, "epoch": 0.9611429701403994, "mean_token_accuracy": 0.7968936562538147, "num_tokens": 29794204.0, "step": 9721, "train/ce_loss": 0.5751218795776367 }, { "epoch": 0.9611429701403994, "step": 9721, "train/sim_loss": 0.015625 }, { "epoch": 0.9611429701403994, "step": 9721, "train/total_loss": 0.07313719391822815 }, { "entropy": 8.781105041503906, "epoch": 0.961241842989915, "mean_token_accuracy": 0.8096385598182678, "num_tokens": 29799491.0, "step": 9722, "train/ce_loss": 0.7813106775283813 }, { "epoch": 0.961241842989915, "step": 9722, "train/sim_loss": 0.02734375 }, { "epoch": 0.961241842989915, "step": 9722, "train/total_loss": 0.10547482222318649 }, { "entropy": 8.61587905883789, "epoch": 0.9613407158394305, "mean_token_accuracy": 0.7680995464324951, "num_tokens": 29805027.0, "step": 9723, "train/ce_loss": 0.7303171157836914 }, { "epoch": 0.9613407158394305, "step": 9723, "train/sim_loss": 0.0390625 }, { "epoch": 0.9613407158394305, "step": 9723, "train/total_loss": 0.1120942160487175 }, { "entropy": 8.903885841369629, "epoch": 0.961439588688946, "mean_token_accuracy": 0.8101736903190613, "num_tokens": 29810293.0, "step": 9724, "train/ce_loss": 0.5742484927177429 }, { "epoch": 0.961439588688946, "step": 9724, "train/sim_loss": 0.0234375 }, { "epoch": 0.961439588688946, "step": 9724, "train/total_loss": 0.08086235076189041 }, { "entropy": 9.229190826416016, "epoch": 0.9615384615384616, "mean_token_accuracy": 0.7374179363250732, "num_tokens": 29815150.0, "step": 9725, "train/ce_loss": 1.182202935218811 }, { "epoch": 0.9615384615384616, "step": 9725, "train/sim_loss": 0.01171875 }, { "epoch": 0.9615384615384616, "step": 9725, "train/total_loss": 0.12993904948234558 }, { "entropy": 9.013776779174805, "epoch": 0.961637334387977, "mean_token_accuracy": 0.7624309659004211, "num_tokens": 29820350.0, "step": 9726, "train/ce_loss": 1.519352912902832 }, { "epoch": 0.961637334387977, "step": 9726, "train/sim_loss": 0.07421875 }, { "epoch": 0.961637334387977, "step": 9726, "train/total_loss": 0.22615404427051544 }, { "entropy": 9.120112419128418, "epoch": 0.9617362072374925, "mean_token_accuracy": 0.7324414849281311, "num_tokens": 29825351.0, "step": 9727, "train/ce_loss": 1.4603698253631592 }, { "epoch": 0.9617362072374925, "step": 9727, "train/sim_loss": 0.0546875 }, { "epoch": 0.9617362072374925, "step": 9727, "train/total_loss": 0.20072448253631592 }, { "entropy": 9.247553825378418, "epoch": 0.9618350800870081, "mean_token_accuracy": 0.7626526951789856, "num_tokens": 29830317.0, "step": 9728, "train/ce_loss": 0.6655154228210449 }, { "epoch": 0.9618350800870081, "step": 9728, "train/sim_loss": 0.046875 }, { "epoch": 0.9618350800870081, "step": 9728, "train/total_loss": 0.11342654377222061 }, { "entropy": 8.839900970458984, "epoch": 0.9619339529365236, "mean_token_accuracy": 0.7316129207611084, "num_tokens": 29835544.0, "step": 9729, "train/ce_loss": 1.413920283317566 }, { "epoch": 0.9619339529365236, "step": 9729, "train/sim_loss": 0.0546875 }, { "epoch": 0.9619339529365236, "step": 9729, "train/total_loss": 0.1960795372724533 }, { "entropy": 9.041462898254395, "epoch": 0.9620328257860391, "mean_token_accuracy": 0.7145161032676697, "num_tokens": 29840625.0, "step": 9730, "train/ce_loss": 0.8221074342727661 }, { "epoch": 0.9620328257860391, "step": 9730, "train/sim_loss": 0.03515625 }, { "epoch": 0.9620328257860391, "step": 9730, "train/total_loss": 0.11736699193716049 }, { "entropy": 8.483409881591797, "epoch": 0.9621316986355547, "mean_token_accuracy": 0.7784730792045593, "num_tokens": 29845908.0, "step": 9731, "train/ce_loss": 1.1115496158599854 }, { "epoch": 0.9621316986355547, "step": 9731, "train/sim_loss": 0.1328125 }, { "epoch": 0.9621316986355547, "step": 9731, "train/total_loss": 0.2439674735069275 }, { "entropy": 8.228540420532227, "epoch": 0.9622305714850702, "mean_token_accuracy": 0.7142857313156128, "num_tokens": 29851287.0, "step": 9732, "train/ce_loss": 0.6496933698654175 }, { "epoch": 0.9622305714850702, "step": 9732, "train/sim_loss": 0.0546875 }, { "epoch": 0.9622305714850702, "step": 9732, "train/total_loss": 0.11965683847665787 }, { "entropy": 8.675930976867676, "epoch": 0.9623294443345857, "mean_token_accuracy": 0.8305687308311462, "num_tokens": 29856605.0, "step": 9733, "train/ce_loss": 5.587366445070074e-07 }, { "epoch": 0.9623294443345857, "step": 9733, "train/sim_loss": 0.02734375 }, { "epoch": 0.9623294443345857, "step": 9733, "train/total_loss": 0.027343805879354477 }, { "entropy": 9.15136432647705, "epoch": 0.9624283171841013, "mean_token_accuracy": 0.676800012588501, "num_tokens": 29861676.0, "step": 9734, "train/ce_loss": 0.9130910038948059 }, { "epoch": 0.9624283171841013, "step": 9734, "train/sim_loss": 0.0625 }, { "epoch": 0.9624283171841013, "step": 9734, "train/total_loss": 0.1538091003894806 }, { "entropy": 8.893739700317383, "epoch": 0.9625271900336168, "mean_token_accuracy": 0.7581274509429932, "num_tokens": 29866891.0, "step": 9735, "train/ce_loss": 0.6514489650726318 }, { "epoch": 0.9625271900336168, "step": 9735, "train/sim_loss": 0.015625 }, { "epoch": 0.9625271900336168, "step": 9735, "train/total_loss": 0.08076989650726318 }, { "entropy": 8.613104820251465, "epoch": 0.9626260628831322, "mean_token_accuracy": 0.8061728477478027, "num_tokens": 29872167.0, "step": 9736, "train/ce_loss": 1.2039073705673218 }, { "epoch": 0.9626260628831322, "step": 9736, "train/sim_loss": 0.0234375 }, { "epoch": 0.9626260628831322, "step": 9736, "train/total_loss": 0.14382824301719666 }, { "entropy": 8.599849700927734, "epoch": 0.9627249357326478, "mean_token_accuracy": 0.7332563400268555, "num_tokens": 29877656.0, "step": 9737, "train/ce_loss": 0.8840844631195068 }, { "epoch": 0.9627249357326478, "step": 9737, "train/sim_loss": 0.07421875 }, { "epoch": 0.9627249357326478, "step": 9737, "train/total_loss": 0.1626271903514862 }, { "entropy": 8.765642166137695, "epoch": 0.9628238085821633, "mean_token_accuracy": 0.7921419739723206, "num_tokens": 29882890.0, "step": 9738, "train/ce_loss": 2.2119331788417185e-07 }, { "epoch": 0.9628238085821633, "step": 9738, "train/sim_loss": 0.015625 }, { "epoch": 0.9628238085821633, "step": 9738, "train/total_loss": 0.01562502235174179 }, { "entropy": 8.54442310333252, "epoch": 0.9629226814316788, "mean_token_accuracy": 0.7326139211654663, "num_tokens": 29888132.0, "step": 9739, "train/ce_loss": 1.1796483993530273 }, { "epoch": 0.9629226814316788, "step": 9739, "train/sim_loss": 0.05078125 }, { "epoch": 0.9629226814316788, "step": 9739, "train/total_loss": 0.16874608397483826 }, { "epoch": 0.9630215542811944, "grad_norm": 0.616840124130249, "learning_rate": 7.594570538495772e-06, "loss": 0.1206, "step": 9740 }, { "entropy": 8.68200969696045, "epoch": 0.9630215542811944, "mean_token_accuracy": 0.7403846383094788, "num_tokens": 29893468.0, "step": 9740, "train/ce_loss": 1.115727424621582 }, { "epoch": 0.9630215542811944, "step": 9740, "train/sim_loss": 0.078125 }, { "epoch": 0.9630215542811944, "step": 9740, "train/total_loss": 0.1896977424621582 }, { "entropy": 9.168376922607422, "epoch": 0.9631204271307099, "mean_token_accuracy": 0.7841140627861023, "num_tokens": 29898362.0, "step": 9741, "train/ce_loss": 1.3955055475234985 }, { "epoch": 0.9631204271307099, "step": 9741, "train/sim_loss": 0.046875 }, { "epoch": 0.9631204271307099, "step": 9741, "train/total_loss": 0.18642555177211761 }, { "entropy": 8.921894073486328, "epoch": 0.9632192999802255, "mean_token_accuracy": 0.723809540271759, "num_tokens": 29903562.0, "step": 9742, "train/ce_loss": 0.8586784601211548 }, { "epoch": 0.9632192999802255, "step": 9742, "train/sim_loss": 0.0234375 }, { "epoch": 0.9632192999802255, "step": 9742, "train/total_loss": 0.10930534452199936 }, { "entropy": 8.499792098999023, "epoch": 0.963318172829741, "mean_token_accuracy": 0.7182254195213318, "num_tokens": 29908899.0, "step": 9743, "train/ce_loss": 0.44669750332832336 }, { "epoch": 0.963318172829741, "step": 9743, "train/sim_loss": 0.06640625 }, { "epoch": 0.963318172829741, "step": 9743, "train/total_loss": 0.1110759973526001 }, { "entropy": 9.490238189697266, "epoch": 0.9634170456792565, "mean_token_accuracy": 0.7023121118545532, "num_tokens": 29913656.0, "step": 9744, "train/ce_loss": 1.694462537765503 }, { "epoch": 0.9634170456792565, "step": 9744, "train/sim_loss": 0.0390625 }, { "epoch": 0.9634170456792565, "step": 9744, "train/total_loss": 0.20850875973701477 }, { "entropy": 9.028064727783203, "epoch": 0.963515918528772, "mean_token_accuracy": 0.7840909361839294, "num_tokens": 29918706.0, "step": 9745, "train/ce_loss": 8.081343594312784e-07 }, { "epoch": 0.963515918528772, "step": 9745, "train/sim_loss": 0.046875 }, { "epoch": 0.963515918528772, "step": 9745, "train/total_loss": 0.046875081956386566 }, { "entropy": 9.367633819580078, "epoch": 0.9636147913782875, "mean_token_accuracy": 0.7660818696022034, "num_tokens": 29923828.0, "step": 9746, "train/ce_loss": 1.222727656364441 }, { "epoch": 0.9636147913782875, "step": 9746, "train/sim_loss": 0.109375 }, { "epoch": 0.9636147913782875, "step": 9746, "train/total_loss": 0.23164775967597961 }, { "entropy": 9.461725234985352, "epoch": 0.963713664227803, "mean_token_accuracy": 0.7025495767593384, "num_tokens": 29928614.0, "step": 9747, "train/ce_loss": 1.1468218872323632e-06 }, { "epoch": 0.963713664227803, "step": 9747, "train/sim_loss": 0.08203125 }, { "epoch": 0.963713664227803, "step": 9747, "train/total_loss": 0.08203136175870895 }, { "entropy": 9.182886123657227, "epoch": 0.9638125370773186, "mean_token_accuracy": 0.76897132396698, "num_tokens": 29933684.0, "step": 9748, "train/ce_loss": 3.44960398024341e-07 }, { "epoch": 0.9638125370773186, "step": 9748, "train/sim_loss": 0.03125 }, { "epoch": 0.9638125370773186, "step": 9748, "train/total_loss": 0.031250033527612686 }, { "entropy": 9.002074241638184, "epoch": 0.9639114099268341, "mean_token_accuracy": 0.7204142212867737, "num_tokens": 29938812.0, "step": 9749, "train/ce_loss": 0.9889717698097229 }, { "epoch": 0.9639114099268341, "step": 9749, "train/sim_loss": 0.08984375 }, { "epoch": 0.9639114099268341, "step": 9749, "train/total_loss": 0.18874093890190125 }, { "entropy": 8.901506423950195, "epoch": 0.9640102827763496, "mean_token_accuracy": 0.7672035098075867, "num_tokens": 29943939.0, "step": 9750, "train/ce_loss": 0.7076161503791809 }, { "epoch": 0.9640102827763496, "step": 9750, "train/sim_loss": 0.0625 }, { "epoch": 0.9640102827763496, "step": 9750, "train/total_loss": 0.13326162099838257 }, { "entropy": 8.964765548706055, "epoch": 0.9641091556258652, "mean_token_accuracy": 0.7090432643890381, "num_tokens": 29949172.0, "step": 9751, "train/ce_loss": 1.442529559135437 }, { "epoch": 0.9641091556258652, "step": 9751, "train/sim_loss": 0.05859375 }, { "epoch": 0.9641091556258652, "step": 9751, "train/total_loss": 0.2028467059135437 }, { "entropy": 8.82308578491211, "epoch": 0.9642080284753807, "mean_token_accuracy": 0.7761006355285645, "num_tokens": 29954325.0, "step": 9752, "train/ce_loss": 0.9967244267463684 }, { "epoch": 0.9642080284753807, "step": 9752, "train/sim_loss": 0.0625 }, { "epoch": 0.9642080284753807, "step": 9752, "train/total_loss": 0.16217243671417236 }, { "entropy": 9.71257209777832, "epoch": 0.9643069013248962, "mean_token_accuracy": 0.7117437720298767, "num_tokens": 29959030.0, "step": 9753, "train/ce_loss": 1.1878239547513658e-06 }, { "epoch": 0.9643069013248962, "step": 9753, "train/sim_loss": 0.03125 }, { "epoch": 0.9643069013248962, "step": 9753, "train/total_loss": 0.03125011920928955 }, { "entropy": 9.152084350585938, "epoch": 0.9644057741744118, "mean_token_accuracy": 0.7585227489471436, "num_tokens": 29964127.0, "step": 9754, "train/ce_loss": 1.111953616142273 }, { "epoch": 0.9644057741744118, "step": 9754, "train/sim_loss": 0.05078125 }, { "epoch": 0.9644057741744118, "step": 9754, "train/total_loss": 0.16197660565376282 }, { "entropy": 8.521442413330078, "epoch": 0.9645046470239272, "mean_token_accuracy": 0.7327141165733337, "num_tokens": 29969554.0, "step": 9755, "train/ce_loss": 0.7364586591720581 }, { "epoch": 0.9645046470239272, "step": 9755, "train/sim_loss": 0.0390625 }, { "epoch": 0.9645046470239272, "step": 9755, "train/total_loss": 0.11270836740732193 }, { "entropy": 8.969746589660645, "epoch": 0.9646035198734427, "mean_token_accuracy": 0.8072992563247681, "num_tokens": 29974716.0, "step": 9756, "train/ce_loss": 9.724466281113564e-07 }, { "epoch": 0.9646035198734427, "step": 9756, "train/sim_loss": 0.046875 }, { "epoch": 0.9646035198734427, "step": 9756, "train/total_loss": 0.04687509685754776 }, { "entropy": 8.370565414428711, "epoch": 0.9647023927229583, "mean_token_accuracy": 0.7384259104728699, "num_tokens": 29980014.0, "step": 9757, "train/ce_loss": 0.9871373176574707 }, { "epoch": 0.9647023927229583, "step": 9757, "train/sim_loss": 0.046875 }, { "epoch": 0.9647023927229583, "step": 9757, "train/total_loss": 0.1455887258052826 }, { "entropy": 8.649551391601562, "epoch": 0.9648012655724738, "mean_token_accuracy": 0.8205384016036987, "num_tokens": 29985526.0, "step": 9758, "train/ce_loss": 0.5213025212287903 }, { "epoch": 0.9648012655724738, "step": 9758, "train/sim_loss": 0.01953125 }, { "epoch": 0.9648012655724738, "step": 9758, "train/total_loss": 0.07166150212287903 }, { "entropy": 8.791582107543945, "epoch": 0.9649001384219893, "mean_token_accuracy": 0.7807424664497375, "num_tokens": 29990858.0, "step": 9759, "train/ce_loss": 1.0622608661651611 }, { "epoch": 0.9649001384219893, "step": 9759, "train/sim_loss": 0.0546875 }, { "epoch": 0.9649001384219893, "step": 9759, "train/total_loss": 0.1609135866165161 }, { "epoch": 0.9649990112715049, "grad_norm": 0.5699576735496521, "learning_rate": 7.589625673737824e-06, "loss": 0.1311, "step": 9760 }, { "entropy": 9.449071884155273, "epoch": 0.9649990112715049, "mean_token_accuracy": 0.6702355742454529, "num_tokens": 29995723.0, "step": 9760, "train/ce_loss": 9.597781627235236e-07 }, { "epoch": 0.9649990112715049, "step": 9760, "train/sim_loss": 0.046875 }, { "epoch": 0.9649990112715049, "step": 9760, "train/total_loss": 0.04687509685754776 }, { "entropy": 8.83869743347168, "epoch": 0.9650978841210204, "mean_token_accuracy": 0.6340621113777161, "num_tokens": 30001091.0, "step": 9761, "train/ce_loss": 0.7668175101280212 }, { "epoch": 0.9650978841210204, "step": 9761, "train/sim_loss": 0.09375 }, { "epoch": 0.9650978841210204, "step": 9761, "train/total_loss": 0.17043176293373108 }, { "entropy": 8.57378101348877, "epoch": 0.9651967569705359, "mean_token_accuracy": 0.6736842393875122, "num_tokens": 30006428.0, "step": 9762, "train/ce_loss": 0.937447190284729 }, { "epoch": 0.9651967569705359, "step": 9762, "train/sim_loss": 0.03515625 }, { "epoch": 0.9651967569705359, "step": 9762, "train/total_loss": 0.12890097498893738 }, { "entropy": 8.815035820007324, "epoch": 0.9652956298200515, "mean_token_accuracy": 0.7747858166694641, "num_tokens": 30011686.0, "step": 9763, "train/ce_loss": 0.5151112675666809 }, { "epoch": 0.9652956298200515, "step": 9763, "train/sim_loss": 0.0234375 }, { "epoch": 0.9652956298200515, "step": 9763, "train/total_loss": 0.07494862377643585 }, { "entropy": 8.976507186889648, "epoch": 0.965394502669567, "mean_token_accuracy": 0.7398081421852112, "num_tokens": 30016975.0, "step": 9764, "train/ce_loss": 0.5990418195724487 }, { "epoch": 0.965394502669567, "step": 9764, "train/sim_loss": 0.05859375 }, { "epoch": 0.965394502669567, "step": 9764, "train/total_loss": 0.11849793791770935 }, { "entropy": 9.343544960021973, "epoch": 0.9654933755190824, "mean_token_accuracy": 0.8410351276397705, "num_tokens": 30021936.0, "step": 9765, "train/ce_loss": 0.700653612613678 }, { "epoch": 0.9654933755190824, "step": 9765, "train/sim_loss": 0.04296875 }, { "epoch": 0.9654933755190824, "step": 9765, "train/total_loss": 0.11303411424160004 }, { "entropy": 8.677410125732422, "epoch": 0.965592248368598, "mean_token_accuracy": 0.7547393441200256, "num_tokens": 30027192.0, "step": 9766, "train/ce_loss": 0.7619554996490479 }, { "epoch": 0.965592248368598, "step": 9766, "train/sim_loss": 0.046875 }, { "epoch": 0.965592248368598, "step": 9766, "train/total_loss": 0.12307055294513702 }, { "entropy": 8.991375923156738, "epoch": 0.9656911212181135, "mean_token_accuracy": 0.7194656729698181, "num_tokens": 30032130.0, "step": 9767, "train/ce_loss": 1.2186259031295776 }, { "epoch": 0.9656911212181135, "step": 9767, "train/sim_loss": 0.0390625 }, { "epoch": 0.9656911212181135, "step": 9767, "train/total_loss": 0.16092509031295776 }, { "entropy": 8.920236587524414, "epoch": 0.965789994067629, "mean_token_accuracy": 0.767160177230835, "num_tokens": 30037370.0, "step": 9768, "train/ce_loss": 0.3864889442920685 }, { "epoch": 0.965789994067629, "step": 9768, "train/sim_loss": 0.09765625 }, { "epoch": 0.965789994067629, "step": 9768, "train/total_loss": 0.13630515336990356 }, { "entropy": 8.582466125488281, "epoch": 0.9658888669171446, "mean_token_accuracy": 0.7327766418457031, "num_tokens": 30042785.0, "step": 9769, "train/ce_loss": 1.0490230321884155 }, { "epoch": 0.9658888669171446, "step": 9769, "train/sim_loss": 0.0703125 }, { "epoch": 0.9658888669171446, "step": 9769, "train/total_loss": 0.17521479725837708 }, { "entropy": 8.956478118896484, "epoch": 0.9659877397666601, "mean_token_accuracy": 0.7528735399246216, "num_tokens": 30047924.0, "step": 9770, "train/ce_loss": 1.0998936891555786 }, { "epoch": 0.9659877397666601, "step": 9770, "train/sim_loss": 0.0546875 }, { "epoch": 0.9659877397666601, "step": 9770, "train/total_loss": 0.16467687487602234 }, { "entropy": 9.13542652130127, "epoch": 0.9660866126161756, "mean_token_accuracy": 0.6312500238418579, "num_tokens": 30053032.0, "step": 9771, "train/ce_loss": 7.420662768709008e-07 }, { "epoch": 0.9660866126161756, "step": 9771, "train/sim_loss": 0.046875 }, { "epoch": 0.9660866126161756, "step": 9771, "train/total_loss": 0.04687507450580597 }, { "entropy": 8.715719223022461, "epoch": 0.9661854854656912, "mean_token_accuracy": 0.7609391808509827, "num_tokens": 30058415.0, "step": 9772, "train/ce_loss": 0.7486229538917542 }, { "epoch": 0.9661854854656912, "step": 9772, "train/sim_loss": 0.0546875 }, { "epoch": 0.9661854854656912, "step": 9772, "train/total_loss": 0.1295498013496399 }, { "entropy": 8.704704284667969, "epoch": 0.9662843583152066, "mean_token_accuracy": 0.7709563374519348, "num_tokens": 30063731.0, "step": 9773, "train/ce_loss": 1.3971641063690186 }, { "epoch": 0.9662843583152066, "step": 9773, "train/sim_loss": 0.06640625 }, { "epoch": 0.9662843583152066, "step": 9773, "train/total_loss": 0.20612266659736633 }, { "entropy": 9.03307056427002, "epoch": 0.9663832311647221, "mean_token_accuracy": 0.7463414669036865, "num_tokens": 30068963.0, "step": 9774, "train/ce_loss": 1.5434439182281494 }, { "epoch": 0.9663832311647221, "step": 9774, "train/sim_loss": 0.08203125 }, { "epoch": 0.9663832311647221, "step": 9774, "train/total_loss": 0.23637564480304718 }, { "entropy": 8.791257858276367, "epoch": 0.9664821040142377, "mean_token_accuracy": 0.7603748440742493, "num_tokens": 30074113.0, "step": 9775, "train/ce_loss": 0.7728161811828613 }, { "epoch": 0.9664821040142377, "step": 9775, "train/sim_loss": 0.046875 }, { "epoch": 0.9664821040142377, "step": 9775, "train/total_loss": 0.12415661662817001 }, { "entropy": 8.576519966125488, "epoch": 0.9665809768637532, "mean_token_accuracy": 0.7609427571296692, "num_tokens": 30079485.0, "step": 9776, "train/ce_loss": 0.6334747672080994 }, { "epoch": 0.9665809768637532, "step": 9776, "train/sim_loss": 0.0703125 }, { "epoch": 0.9665809768637532, "step": 9776, "train/total_loss": 0.1336599886417389 }, { "entropy": 9.028536796569824, "epoch": 0.9666798497132687, "mean_token_accuracy": 0.7583333253860474, "num_tokens": 30084501.0, "step": 9777, "train/ce_loss": 1.0083562135696411 }, { "epoch": 0.9666798497132687, "step": 9777, "train/sim_loss": 0.02734375 }, { "epoch": 0.9666798497132687, "step": 9777, "train/total_loss": 0.1281793713569641 }, { "entropy": 8.261909484863281, "epoch": 0.9667787225627843, "mean_token_accuracy": 0.6924778819084167, "num_tokens": 30089816.0, "step": 9778, "train/ce_loss": 0.8240401148796082 }, { "epoch": 0.9667787225627843, "step": 9778, "train/sim_loss": 0.046875 }, { "epoch": 0.9667787225627843, "step": 9778, "train/total_loss": 0.1292790174484253 }, { "entropy": 8.421815872192383, "epoch": 0.9668775954122998, "mean_token_accuracy": 0.7316821217536926, "num_tokens": 30095429.0, "step": 9779, "train/ce_loss": 0.6641775369644165 }, { "epoch": 0.9668775954122998, "step": 9779, "train/sim_loss": 0.07421875 }, { "epoch": 0.9668775954122998, "step": 9779, "train/total_loss": 0.14063650369644165 }, { "epoch": 0.9669764682618153, "grad_norm": 0.6132939457893372, "learning_rate": 7.584680808979875e-06, "loss": 0.1351, "step": 9780 }, { "entropy": 8.637802124023438, "epoch": 0.9669764682618153, "mean_token_accuracy": 0.7306889295578003, "num_tokens": 30100871.0, "step": 9780, "train/ce_loss": 1.974214792251587 }, { "epoch": 0.9669764682618153, "step": 9780, "train/sim_loss": 0.13671875 }, { "epoch": 0.9669764682618153, "step": 9780, "train/total_loss": 0.33414024114608765 }, { "entropy": 8.890082359313965, "epoch": 0.9670753411113309, "mean_token_accuracy": 0.7455048561096191, "num_tokens": 30106019.0, "step": 9781, "train/ce_loss": 0.925648033618927 }, { "epoch": 0.9670753411113309, "step": 9781, "train/sim_loss": 0.05859375 }, { "epoch": 0.9670753411113309, "step": 9781, "train/total_loss": 0.15115855634212494 }, { "entropy": 8.773796081542969, "epoch": 0.9671742139608464, "mean_token_accuracy": 0.7570332288742065, "num_tokens": 30111273.0, "step": 9782, "train/ce_loss": 0.6232446432113647 }, { "epoch": 0.9671742139608464, "step": 9782, "train/sim_loss": 0.0234375 }, { "epoch": 0.9671742139608464, "step": 9782, "train/total_loss": 0.08576196432113647 }, { "entropy": 9.439830780029297, "epoch": 0.9672730868103618, "mean_token_accuracy": 0.7900000214576721, "num_tokens": 30116166.0, "step": 9783, "train/ce_loss": 0.6501454710960388 }, { "epoch": 0.9672730868103618, "step": 9783, "train/sim_loss": 0.046875 }, { "epoch": 0.9672730868103618, "step": 9783, "train/total_loss": 0.11188954859972 }, { "entropy": 8.262337684631348, "epoch": 0.9673719596598774, "mean_token_accuracy": 0.7378542423248291, "num_tokens": 30121630.0, "step": 9784, "train/ce_loss": 0.834562361240387 }, { "epoch": 0.9673719596598774, "step": 9784, "train/sim_loss": 0.0390625 }, { "epoch": 0.9673719596598774, "step": 9784, "train/total_loss": 0.12251874059438705 }, { "entropy": 9.01753044128418, "epoch": 0.9674708325093929, "mean_token_accuracy": 0.7933884263038635, "num_tokens": 30126694.0, "step": 9785, "train/ce_loss": 0.9453256130218506 }, { "epoch": 0.9674708325093929, "step": 9785, "train/sim_loss": 0.0390625 }, { "epoch": 0.9674708325093929, "step": 9785, "train/total_loss": 0.1335950642824173 }, { "entropy": 8.868257522583008, "epoch": 0.9675697053589084, "mean_token_accuracy": 0.6899999976158142, "num_tokens": 30131991.0, "step": 9786, "train/ce_loss": 1.6262303590774536 }, { "epoch": 0.9675697053589084, "step": 9786, "train/sim_loss": 0.0546875 }, { "epoch": 0.9675697053589084, "step": 9786, "train/total_loss": 0.21731053292751312 }, { "entropy": 8.569860458374023, "epoch": 0.967668578208424, "mean_token_accuracy": 0.7249712347984314, "num_tokens": 30137306.0, "step": 9787, "train/ce_loss": 1.28461492061615 }, { "epoch": 0.967668578208424, "step": 9787, "train/sim_loss": 0.046875 }, { "epoch": 0.967668578208424, "step": 9787, "train/total_loss": 0.17533649504184723 }, { "entropy": 8.76042366027832, "epoch": 0.9677674510579395, "mean_token_accuracy": 0.7638669013977051, "num_tokens": 30142376.0, "step": 9788, "train/ce_loss": 4.821223456019652e-07 }, { "epoch": 0.9677674510579395, "step": 9788, "train/sim_loss": 0.05859375 }, { "epoch": 0.9677674510579395, "step": 9788, "train/total_loss": 0.05859379842877388 }, { "entropy": 9.251687049865723, "epoch": 0.967866323907455, "mean_token_accuracy": 0.7751371264457703, "num_tokens": 30147381.0, "step": 9789, "train/ce_loss": 0.8808366656303406 }, { "epoch": 0.967866323907455, "step": 9789, "train/sim_loss": 0.046875 }, { "epoch": 0.967866323907455, "step": 9789, "train/total_loss": 0.1349586695432663 }, { "entropy": 8.435419082641602, "epoch": 0.9679651967569706, "mean_token_accuracy": 0.811965823173523, "num_tokens": 30152838.0, "step": 9790, "train/ce_loss": 0.6257777810096741 }, { "epoch": 0.9679651967569706, "step": 9790, "train/sim_loss": 0.01953125 }, { "epoch": 0.9679651967569706, "step": 9790, "train/total_loss": 0.08210902661085129 }, { "entropy": 8.560461044311523, "epoch": 0.968064069606486, "mean_token_accuracy": 0.7419700026512146, "num_tokens": 30158236.0, "step": 9791, "train/ce_loss": 0.6622475385665894 }, { "epoch": 0.968064069606486, "step": 9791, "train/sim_loss": 0.0234375 }, { "epoch": 0.968064069606486, "step": 9791, "train/total_loss": 0.08966225385665894 }, { "entropy": 8.549823760986328, "epoch": 0.9681629424560015, "mean_token_accuracy": 0.7554479241371155, "num_tokens": 30163545.0, "step": 9792, "train/ce_loss": 0.7888723611831665 }, { "epoch": 0.9681629424560015, "step": 9792, "train/sim_loss": 0.05859375 }, { "epoch": 0.9681629424560015, "step": 9792, "train/total_loss": 0.1374809890985489 }, { "entropy": 8.473400115966797, "epoch": 0.9682618153055171, "mean_token_accuracy": 0.742290735244751, "num_tokens": 30168950.0, "step": 9793, "train/ce_loss": 1.3957717418670654 }, { "epoch": 0.9682618153055171, "step": 9793, "train/sim_loss": 0.03125 }, { "epoch": 0.9682618153055171, "step": 9793, "train/total_loss": 0.17082718014717102 }, { "entropy": 8.768596649169922, "epoch": 0.9683606881550326, "mean_token_accuracy": 0.8123359680175781, "num_tokens": 30174239.0, "step": 9794, "train/ce_loss": 0.8686895966529846 }, { "epoch": 0.9683606881550326, "step": 9794, "train/sim_loss": 0.015625 }, { "epoch": 0.9683606881550326, "step": 9794, "train/total_loss": 0.10249396413564682 }, { "entropy": 9.506278991699219, "epoch": 0.9684595610045481, "mean_token_accuracy": 0.7516930103302002, "num_tokens": 30179078.0, "step": 9795, "train/ce_loss": 1.5441043376922607 }, { "epoch": 0.9684595610045481, "step": 9795, "train/sim_loss": 0.01953125 }, { "epoch": 0.9684595610045481, "step": 9795, "train/total_loss": 0.1739416867494583 }, { "entropy": 9.06515121459961, "epoch": 0.9685584338540637, "mean_token_accuracy": 0.7006579041481018, "num_tokens": 30184104.0, "step": 9796, "train/ce_loss": 1.252088189125061 }, { "epoch": 0.9685584338540637, "step": 9796, "train/sim_loss": 0.03125 }, { "epoch": 0.9685584338540637, "step": 9796, "train/total_loss": 0.15645882487297058 }, { "entropy": 8.85397720336914, "epoch": 0.9686573067035792, "mean_token_accuracy": 0.7564275860786438, "num_tokens": 30189344.0, "step": 9797, "train/ce_loss": 0.6353757381439209 }, { "epoch": 0.9686573067035792, "step": 9797, "train/sim_loss": 0.0390625 }, { "epoch": 0.9686573067035792, "step": 9797, "train/total_loss": 0.10260007530450821 }, { "entropy": 8.397137641906738, "epoch": 0.9687561795530947, "mean_token_accuracy": 0.7437499761581421, "num_tokens": 30194958.0, "step": 9798, "train/ce_loss": 0.8359341025352478 }, { "epoch": 0.9687561795530947, "step": 9798, "train/sim_loss": 0.04296875 }, { "epoch": 0.9687561795530947, "step": 9798, "train/total_loss": 0.12656216323375702 }, { "entropy": 8.923301696777344, "epoch": 0.9688550524026103, "mean_token_accuracy": 0.7220077514648438, "num_tokens": 30200219.0, "step": 9799, "train/ce_loss": 1.037619948387146 }, { "epoch": 0.9688550524026103, "step": 9799, "train/sim_loss": 0.08984375 }, { "epoch": 0.9688550524026103, "step": 9799, "train/total_loss": 0.19360575079917908 }, { "epoch": 0.9689539252521258, "grad_norm": 0.7058612108230591, "learning_rate": 7.5797359442219264e-06, "loss": 0.1257, "step": 9800 }, { "entropy": 8.534658432006836, "epoch": 0.9689539252521258, "mean_token_accuracy": 0.757446825504303, "num_tokens": 30205669.0, "step": 9800, "train/ce_loss": 0.5837336182594299 }, { "epoch": 0.9689539252521258, "step": 9800, "train/sim_loss": 0.015625 }, { "epoch": 0.9689539252521258, "step": 9800, "train/total_loss": 0.073998361825943 }, { "entropy": 8.685020446777344, "epoch": 0.9690527981016412, "mean_token_accuracy": 0.7762399315834045, "num_tokens": 30211013.0, "step": 9801, "train/ce_loss": 0.9707077741622925 }, { "epoch": 0.9690527981016412, "step": 9801, "train/sim_loss": 0.05859375 }, { "epoch": 0.9690527981016412, "step": 9801, "train/total_loss": 0.15566453337669373 }, { "entropy": 8.840173721313477, "epoch": 0.9691516709511568, "mean_token_accuracy": 0.763239860534668, "num_tokens": 30216148.0, "step": 9802, "train/ce_loss": 0.9764208197593689 }, { "epoch": 0.9691516709511568, "step": 9802, "train/sim_loss": 0.07421875 }, { "epoch": 0.9691516709511568, "step": 9802, "train/total_loss": 0.17186084389686584 }, { "entropy": 9.262995719909668, "epoch": 0.9692505438006723, "mean_token_accuracy": 0.7594501972198486, "num_tokens": 30221204.0, "step": 9803, "train/ce_loss": 1.0339422225952148 }, { "epoch": 0.9692505438006723, "step": 9803, "train/sim_loss": 0.08984375 }, { "epoch": 0.9692505438006723, "step": 9803, "train/total_loss": 0.19323797523975372 }, { "entropy": 8.904062271118164, "epoch": 0.9693494166501878, "mean_token_accuracy": 0.7573964595794678, "num_tokens": 30226408.0, "step": 9804, "train/ce_loss": 1.0313167572021484 }, { "epoch": 0.9693494166501878, "step": 9804, "train/sim_loss": 0.06640625 }, { "epoch": 0.9693494166501878, "step": 9804, "train/total_loss": 0.16953793168067932 }, { "entropy": 9.223979949951172, "epoch": 0.9694482894997034, "mean_token_accuracy": 0.7154255509376526, "num_tokens": 30231239.0, "step": 9805, "train/ce_loss": 1.9309848546981812 }, { "epoch": 0.9694482894997034, "step": 9805, "train/sim_loss": 0.05078125 }, { "epoch": 0.9694482894997034, "step": 9805, "train/total_loss": 0.24387973546981812 }, { "entropy": 8.908904075622559, "epoch": 0.9695471623492189, "mean_token_accuracy": 0.7402945160865784, "num_tokens": 30236399.0, "step": 9806, "train/ce_loss": 0.8049737811088562 }, { "epoch": 0.9695471623492189, "step": 9806, "train/sim_loss": 0.03125 }, { "epoch": 0.9695471623492189, "step": 9806, "train/total_loss": 0.1117473766207695 }, { "entropy": 9.124170303344727, "epoch": 0.9696460351987344, "mean_token_accuracy": 0.7725321650505066, "num_tokens": 30241528.0, "step": 9807, "train/ce_loss": 1.1656169891357422 }, { "epoch": 0.9696460351987344, "step": 9807, "train/sim_loss": 0.140625 }, { "epoch": 0.9696460351987344, "step": 9807, "train/total_loss": 0.2571867108345032 }, { "entropy": 8.931862831115723, "epoch": 0.96974490804825, "mean_token_accuracy": 0.7455621361732483, "num_tokens": 30246653.0, "step": 9808, "train/ce_loss": 1.578021764755249 }, { "epoch": 0.96974490804825, "step": 9808, "train/sim_loss": 0.0625 }, { "epoch": 0.96974490804825, "step": 9808, "train/total_loss": 0.22030217945575714 }, { "entropy": 8.949090957641602, "epoch": 0.9698437808977655, "mean_token_accuracy": 0.6895973086357117, "num_tokens": 30251677.0, "step": 9809, "train/ce_loss": 3.2962290674731776e-07 }, { "epoch": 0.9698437808977655, "step": 9809, "train/sim_loss": 0.0390625 }, { "epoch": 0.9698437808977655, "step": 9809, "train/total_loss": 0.039062533527612686 }, { "entropy": 8.885849952697754, "epoch": 0.969942653747281, "mean_token_accuracy": 0.7645348906517029, "num_tokens": 30256773.0, "step": 9810, "train/ce_loss": 0.618908703327179 }, { "epoch": 0.969942653747281, "step": 9810, "train/sim_loss": 0.03515625 }, { "epoch": 0.969942653747281, "step": 9810, "train/total_loss": 0.0970471203327179 }, { "entropy": 8.56889820098877, "epoch": 0.9700415265967965, "mean_token_accuracy": 0.805587887763977, "num_tokens": 30262103.0, "step": 9811, "train/ce_loss": 0.7377899289131165 }, { "epoch": 0.9700415265967965, "step": 9811, "train/sim_loss": 0.04296875 }, { "epoch": 0.9700415265967965, "step": 9811, "train/total_loss": 0.11674774438142776 }, { "entropy": 8.948206901550293, "epoch": 0.970140399446312, "mean_token_accuracy": 0.836454451084137, "num_tokens": 30267351.0, "step": 9812, "train/ce_loss": 0.5882524251937866 }, { "epoch": 0.970140399446312, "step": 9812, "train/sim_loss": 0.0234375 }, { "epoch": 0.970140399446312, "step": 9812, "train/total_loss": 0.08226273953914642 }, { "entropy": 8.484578132629395, "epoch": 0.9702392722958275, "mean_token_accuracy": 0.752653956413269, "num_tokens": 30272732.0, "step": 9813, "train/ce_loss": 0.6853912472724915 }, { "epoch": 0.9702392722958275, "step": 9813, "train/sim_loss": 0.07421875 }, { "epoch": 0.9702392722958275, "step": 9813, "train/total_loss": 0.14275787770748138 }, { "entropy": 8.83863353729248, "epoch": 0.9703381451453431, "mean_token_accuracy": 0.7642045617103577, "num_tokens": 30277918.0, "step": 9814, "train/ce_loss": 1.0013800859451294 }, { "epoch": 0.9703381451453431, "step": 9814, "train/sim_loss": 0.05078125 }, { "epoch": 0.9703381451453431, "step": 9814, "train/total_loss": 0.15091925859451294 }, { "entropy": 8.29507064819336, "epoch": 0.9704370179948586, "mean_token_accuracy": 0.7156756520271301, "num_tokens": 30283333.0, "step": 9815, "train/ce_loss": 1.232089877128601 }, { "epoch": 0.9704370179948586, "step": 9815, "train/sim_loss": 0.109375 }, { "epoch": 0.9704370179948586, "step": 9815, "train/total_loss": 0.23258399963378906 }, { "entropy": 9.107410430908203, "epoch": 0.9705358908443741, "mean_token_accuracy": 0.7877813577651978, "num_tokens": 30288411.0, "step": 9816, "train/ce_loss": 0.7494767904281616 }, { "epoch": 0.9705358908443741, "step": 9816, "train/sim_loss": 0.0234375 }, { "epoch": 0.9705358908443741, "step": 9816, "train/total_loss": 0.09838517755270004 }, { "entropy": 8.747941017150879, "epoch": 0.9706347636938897, "mean_token_accuracy": 0.7611940503120422, "num_tokens": 30293477.0, "step": 9817, "train/ce_loss": 1.2272424697875977 }, { "epoch": 0.9706347636938897, "step": 9817, "train/sim_loss": 0.02734375 }, { "epoch": 0.9706347636938897, "step": 9817, "train/total_loss": 0.150067999958992 }, { "entropy": 8.865287780761719, "epoch": 0.9707336365434052, "mean_token_accuracy": 0.7332214713096619, "num_tokens": 30298540.0, "step": 9818, "train/ce_loss": 0.9569790363311768 }, { "epoch": 0.9707336365434052, "step": 9818, "train/sim_loss": 0.05859375 }, { "epoch": 0.9707336365434052, "step": 9818, "train/total_loss": 0.15429165959358215 }, { "entropy": 8.483968734741211, "epoch": 0.9708325093929207, "mean_token_accuracy": 0.7448747158050537, "num_tokens": 30303910.0, "step": 9819, "train/ce_loss": 0.814598560333252 }, { "epoch": 0.9708325093929207, "step": 9819, "train/sim_loss": 0.05859375 }, { "epoch": 0.9708325093929207, "step": 9819, "train/total_loss": 0.14005360007286072 }, { "epoch": 0.9709313822424362, "grad_norm": 0.5963754057884216, "learning_rate": 7.574791079463977e-06, "loss": 0.1267, "step": 9820 }, { "entropy": 9.1385498046875, "epoch": 0.9709313822424362, "mean_token_accuracy": 0.7041096091270447, "num_tokens": 30309183.0, "step": 9820, "train/ce_loss": 1.7761496305465698 }, { "epoch": 0.9709313822424362, "step": 9820, "train/sim_loss": 0.03125 }, { "epoch": 0.9709313822424362, "step": 9820, "train/total_loss": 0.2088649719953537 }, { "entropy": 8.867751121520996, "epoch": 0.9710302550919517, "mean_token_accuracy": 0.6960408687591553, "num_tokens": 30314426.0, "step": 9821, "train/ce_loss": 0.4713476300239563 }, { "epoch": 0.9710302550919517, "step": 9821, "train/sim_loss": 0.0234375 }, { "epoch": 0.9710302550919517, "step": 9821, "train/total_loss": 0.07057226449251175 }, { "entropy": 8.420510292053223, "epoch": 0.9711291279414672, "mean_token_accuracy": 0.7186813354492188, "num_tokens": 30319966.0, "step": 9822, "train/ce_loss": 1.010133981704712 }, { "epoch": 0.9711291279414672, "step": 9822, "train/sim_loss": 0.09765625 }, { "epoch": 0.9711291279414672, "step": 9822, "train/total_loss": 0.1986696422100067 }, { "entropy": 9.160971641540527, "epoch": 0.9712280007909828, "mean_token_accuracy": 0.7020280957221985, "num_tokens": 30325034.0, "step": 9823, "train/ce_loss": 1.3547375202178955 }, { "epoch": 0.9712280007909828, "step": 9823, "train/sim_loss": 0.046875 }, { "epoch": 0.9712280007909828, "step": 9823, "train/total_loss": 0.18234875798225403 }, { "entropy": 8.668876647949219, "epoch": 0.9713268736404983, "mean_token_accuracy": 0.6980676054954529, "num_tokens": 30330338.0, "step": 9824, "train/ce_loss": 0.9471348524093628 }, { "epoch": 0.9713268736404983, "step": 9824, "train/sim_loss": 0.05859375 }, { "epoch": 0.9713268736404983, "step": 9824, "train/total_loss": 0.1533072292804718 }, { "entropy": 8.502266883850098, "epoch": 0.9714257464900139, "mean_token_accuracy": 0.7967391014099121, "num_tokens": 30335731.0, "step": 9825, "train/ce_loss": 0.61232990026474 }, { "epoch": 0.9714257464900139, "step": 9825, "train/sim_loss": 0.0390625 }, { "epoch": 0.9714257464900139, "step": 9825, "train/total_loss": 0.10029549151659012 }, { "entropy": 8.246334075927734, "epoch": 0.9715246193395294, "mean_token_accuracy": 0.7240592837333679, "num_tokens": 30341120.0, "step": 9826, "train/ce_loss": 1.5039557218551636 }, { "epoch": 0.9715246193395294, "step": 9826, "train/sim_loss": 0.07421875 }, { "epoch": 0.9715246193395294, "step": 9826, "train/total_loss": 0.22461432218551636 }, { "entropy": 8.657670974731445, "epoch": 0.9716234921890449, "mean_token_accuracy": 0.7759783864021301, "num_tokens": 30346333.0, "step": 9827, "train/ce_loss": 0.38430023193359375 }, { "epoch": 0.9716234921890449, "step": 9827, "train/sim_loss": 0.0234375 }, { "epoch": 0.9716234921890449, "step": 9827, "train/total_loss": 0.061867523938417435 }, { "entropy": 8.815181732177734, "epoch": 0.9717223650385605, "mean_token_accuracy": 0.737313449382782, "num_tokens": 30351417.0, "step": 9828, "train/ce_loss": 0.8075527548789978 }, { "epoch": 0.9717223650385605, "step": 9828, "train/sim_loss": 0.04296875 }, { "epoch": 0.9717223650385605, "step": 9828, "train/total_loss": 0.12372402846813202 }, { "entropy": 8.354762077331543, "epoch": 0.971821237888076, "mean_token_accuracy": 0.768878698348999, "num_tokens": 30356742.0, "step": 9829, "train/ce_loss": 0.6924364566802979 }, { "epoch": 0.971821237888076, "step": 9829, "train/sim_loss": 0.04296875 }, { "epoch": 0.971821237888076, "step": 9829, "train/total_loss": 0.1122123971581459 }, { "entropy": 8.414628982543945, "epoch": 0.9719201107375914, "mean_token_accuracy": 0.7400644421577454, "num_tokens": 30362147.0, "step": 9830, "train/ce_loss": 0.3731345236301422 }, { "epoch": 0.9719201107375914, "step": 9830, "train/sim_loss": 0.0625 }, { "epoch": 0.9719201107375914, "step": 9830, "train/total_loss": 0.09981345385313034 }, { "entropy": 8.397029876708984, "epoch": 0.972018983587107, "mean_token_accuracy": 0.759100615978241, "num_tokens": 30367529.0, "step": 9831, "train/ce_loss": 0.46859368681907654 }, { "epoch": 0.972018983587107, "step": 9831, "train/sim_loss": 0.05859375 }, { "epoch": 0.972018983587107, "step": 9831, "train/total_loss": 0.10545311868190765 }, { "entropy": 8.723847389221191, "epoch": 0.9721178564366225, "mean_token_accuracy": 0.6708715558052063, "num_tokens": 30372884.0, "step": 9832, "train/ce_loss": 0.9105349183082581 }, { "epoch": 0.9721178564366225, "step": 9832, "train/sim_loss": 0.078125 }, { "epoch": 0.9721178564366225, "step": 9832, "train/total_loss": 0.16917848587036133 }, { "entropy": 9.426420211791992, "epoch": 0.972216729286138, "mean_token_accuracy": 0.6875, "num_tokens": 30377721.0, "step": 9833, "train/ce_loss": 3.1561458110809326 }, { "epoch": 0.972216729286138, "step": 9833, "train/sim_loss": 0.04296875 }, { "epoch": 0.972216729286138, "step": 9833, "train/total_loss": 0.35858333110809326 }, { "entropy": 8.554241180419922, "epoch": 0.9723156021356536, "mean_token_accuracy": 0.7353658676147461, "num_tokens": 30383019.0, "step": 9834, "train/ce_loss": 0.9683541655540466 }, { "epoch": 0.9723156021356536, "step": 9834, "train/sim_loss": 0.046875 }, { "epoch": 0.9723156021356536, "step": 9834, "train/total_loss": 0.1437104195356369 }, { "entropy": 9.29680061340332, "epoch": 0.9724144749851691, "mean_token_accuracy": 0.7786259651184082, "num_tokens": 30387872.0, "step": 9835, "train/ce_loss": 3.724886425970908e-07 }, { "epoch": 0.9724144749851691, "step": 9835, "train/sim_loss": 0.0390625 }, { "epoch": 0.9724144749851691, "step": 9835, "train/total_loss": 0.039062537252902985 }, { "entropy": 9.01911735534668, "epoch": 0.9725133478346846, "mean_token_accuracy": 0.7025723457336426, "num_tokens": 30392988.0, "step": 9836, "train/ce_loss": 0.9779345393180847 }, { "epoch": 0.9725133478346846, "step": 9836, "train/sim_loss": 0.109375 }, { "epoch": 0.9725133478346846, "step": 9836, "train/total_loss": 0.20716845989227295 }, { "entropy": 8.451966285705566, "epoch": 0.9726122206842002, "mean_token_accuracy": 0.7625570893287659, "num_tokens": 30398387.0, "step": 9837, "train/ce_loss": 0.8094248175621033 }, { "epoch": 0.9726122206842002, "step": 9837, "train/sim_loss": 0.07421875 }, { "epoch": 0.9726122206842002, "step": 9837, "train/total_loss": 0.15516123175621033 }, { "entropy": 8.538702964782715, "epoch": 0.9727110935337157, "mean_token_accuracy": 0.7369033694267273, "num_tokens": 30403754.0, "step": 9838, "train/ce_loss": 0.5979429483413696 }, { "epoch": 0.9727110935337157, "step": 9838, "train/sim_loss": 0.09375 }, { "epoch": 0.9727110935337157, "step": 9838, "train/total_loss": 0.15354429185390472 }, { "entropy": 9.114009857177734, "epoch": 0.9728099663832311, "mean_token_accuracy": 0.7113401889801025, "num_tokens": 30408920.0, "step": 9839, "train/ce_loss": 1.1505573987960815 }, { "epoch": 0.9728099663832311, "step": 9839, "train/sim_loss": 0.05078125 }, { "epoch": 0.9728099663832311, "step": 9839, "train/total_loss": 0.16583698987960815 }, { "epoch": 0.9729088392327467, "grad_norm": 0.6539301872253418, "learning_rate": 7.569846214706028e-06, "loss": 0.1425, "step": 9840 }, { "entropy": 8.513267517089844, "epoch": 0.9729088392327467, "mean_token_accuracy": 0.706315815448761, "num_tokens": 30414386.0, "step": 9840, "train/ce_loss": 1.0598288774490356 }, { "epoch": 0.9729088392327467, "step": 9840, "train/sim_loss": 0.07421875 }, { "epoch": 0.9729088392327467, "step": 9840, "train/total_loss": 0.18020164966583252 }, { "entropy": 8.936271667480469, "epoch": 0.9730077120822622, "mean_token_accuracy": 0.7788461446762085, "num_tokens": 30419528.0, "step": 9841, "train/ce_loss": 1.055103063583374 }, { "epoch": 0.9730077120822622, "step": 9841, "train/sim_loss": 0.0546875 }, { "epoch": 0.9730077120822622, "step": 9841, "train/total_loss": 0.16019780933856964 }, { "entropy": 8.57362174987793, "epoch": 0.9731065849317777, "mean_token_accuracy": 0.7004279494285583, "num_tokens": 30424689.0, "step": 9842, "train/ce_loss": 1.0744620561599731 }, { "epoch": 0.9731065849317777, "step": 9842, "train/sim_loss": 0.05078125 }, { "epoch": 0.9731065849317777, "step": 9842, "train/total_loss": 0.15822745859622955 }, { "entropy": 9.38490104675293, "epoch": 0.9732054577812933, "mean_token_accuracy": 0.8166311383247375, "num_tokens": 30429612.0, "step": 9843, "train/ce_loss": 1.1905419796676142e-06 }, { "epoch": 0.9732054577812933, "step": 9843, "train/sim_loss": 0.046875 }, { "epoch": 0.9732054577812933, "step": 9843, "train/total_loss": 0.04687511920928955 }, { "entropy": 9.143271446228027, "epoch": 0.9733043306308088, "mean_token_accuracy": 0.711033284664154, "num_tokens": 30434596.0, "step": 9844, "train/ce_loss": 1.6245604753494263 }, { "epoch": 0.9733043306308088, "step": 9844, "train/sim_loss": 0.0234375 }, { "epoch": 0.9733043306308088, "step": 9844, "train/total_loss": 0.18589355051517487 }, { "entropy": 9.067255020141602, "epoch": 0.9734032034803243, "mean_token_accuracy": 0.7121879458427429, "num_tokens": 30439631.0, "step": 9845, "train/ce_loss": 2.583715286164079e-07 }, { "epoch": 0.9734032034803243, "step": 9845, "train/sim_loss": 0.01953125 }, { "epoch": 0.9734032034803243, "step": 9845, "train/total_loss": 0.01953127607703209 }, { "entropy": 9.127470970153809, "epoch": 0.9735020763298399, "mean_token_accuracy": 0.7874763011932373, "num_tokens": 30444609.0, "step": 9846, "train/ce_loss": 1.6459321975708008 }, { "epoch": 0.9735020763298399, "step": 9846, "train/sim_loss": 0.0390625 }, { "epoch": 0.9735020763298399, "step": 9846, "train/total_loss": 0.20365571975708008 }, { "entropy": 8.572263717651367, "epoch": 0.9736009491793554, "mean_token_accuracy": 0.7341115474700928, "num_tokens": 30449885.0, "step": 9847, "train/ce_loss": 1.071635127067566 }, { "epoch": 0.9736009491793554, "step": 9847, "train/sim_loss": 0.01953125 }, { "epoch": 0.9736009491793554, "step": 9847, "train/total_loss": 0.12669476866722107 }, { "entropy": 8.690587997436523, "epoch": 0.9736998220288708, "mean_token_accuracy": 0.748633861541748, "num_tokens": 30455114.0, "step": 9848, "train/ce_loss": 0.4136137366294861 }, { "epoch": 0.9736998220288708, "step": 9848, "train/sim_loss": 0.01171875 }, { "epoch": 0.9736998220288708, "step": 9848, "train/total_loss": 0.05308012291789055 }, { "entropy": 9.048288345336914, "epoch": 0.9737986948783864, "mean_token_accuracy": 0.7550644278526306, "num_tokens": 30460086.0, "step": 9849, "train/ce_loss": 0.7388678789138794 }, { "epoch": 0.9737986948783864, "step": 9849, "train/sim_loss": 0.03125 }, { "epoch": 0.9737986948783864, "step": 9849, "train/total_loss": 0.10513678938150406 }, { "entropy": 8.765560150146484, "epoch": 0.9738975677279019, "mean_token_accuracy": 0.7100840210914612, "num_tokens": 30465212.0, "step": 9850, "train/ce_loss": 0.4580962061882019 }, { "epoch": 0.9738975677279019, "step": 9850, "train/sim_loss": 0.046875 }, { "epoch": 0.9738975677279019, "step": 9850, "train/total_loss": 0.09268462657928467 }, { "entropy": 8.234114646911621, "epoch": 0.9739964405774174, "mean_token_accuracy": 0.7294520735740662, "num_tokens": 30470577.0, "step": 9851, "train/ce_loss": 0.8070613145828247 }, { "epoch": 0.9739964405774174, "step": 9851, "train/sim_loss": 0.046875 }, { "epoch": 0.9739964405774174, "step": 9851, "train/total_loss": 0.1275811344385147 }, { "entropy": 8.795877456665039, "epoch": 0.974095313426933, "mean_token_accuracy": 0.773553729057312, "num_tokens": 30475656.0, "step": 9852, "train/ce_loss": 6.72289729664044e-07 }, { "epoch": 0.974095313426933, "step": 9852, "train/sim_loss": 0.046875 }, { "epoch": 0.974095313426933, "step": 9852, "train/total_loss": 0.04687506705522537 }, { "entropy": 8.467682838439941, "epoch": 0.9741941862764485, "mean_token_accuracy": 0.7105262875556946, "num_tokens": 30481086.0, "step": 9853, "train/ce_loss": 1.3465851545333862 }, { "epoch": 0.9741941862764485, "step": 9853, "train/sim_loss": 0.078125 }, { "epoch": 0.9741941862764485, "step": 9853, "train/total_loss": 0.21278351545333862 }, { "entropy": 9.144447326660156, "epoch": 0.974293059125964, "mean_token_accuracy": 0.7205169796943665, "num_tokens": 30486170.0, "step": 9854, "train/ce_loss": 4.991715059077251e-07 }, { "epoch": 0.974293059125964, "step": 9854, "train/sim_loss": 0.03125 }, { "epoch": 0.974293059125964, "step": 9854, "train/total_loss": 0.03125004842877388 }, { "entropy": 9.285968780517578, "epoch": 0.9743919319754796, "mean_token_accuracy": 0.8068833947181702, "num_tokens": 30491137.0, "step": 9855, "train/ce_loss": 0.5230701565742493 }, { "epoch": 0.9743919319754796, "step": 9855, "train/sim_loss": 0.03515625 }, { "epoch": 0.9743919319754796, "step": 9855, "train/total_loss": 0.08746326714754105 }, { "entropy": 8.926168441772461, "epoch": 0.9744908048249951, "mean_token_accuracy": 0.7908309698104858, "num_tokens": 30496281.0, "step": 9856, "train/ce_loss": 0.7687076926231384 }, { "epoch": 0.9744908048249951, "step": 9856, "train/sim_loss": 0.04296875 }, { "epoch": 0.9744908048249951, "step": 9856, "train/total_loss": 0.11983951926231384 }, { "entropy": 8.906764030456543, "epoch": 0.9745896776745105, "mean_token_accuracy": 0.7139175534248352, "num_tokens": 30501532.0, "step": 9857, "train/ce_loss": 0.8398147821426392 }, { "epoch": 0.9745896776745105, "step": 9857, "train/sim_loss": 0.01953125 }, { "epoch": 0.9745896776745105, "step": 9857, "train/total_loss": 0.1035127267241478 }, { "entropy": 8.640974044799805, "epoch": 0.9746885505240261, "mean_token_accuracy": 0.7666666507720947, "num_tokens": 30506782.0, "step": 9858, "train/ce_loss": 0.8701812624931335 }, { "epoch": 0.9746885505240261, "step": 9858, "train/sim_loss": 0.0234375 }, { "epoch": 0.9746885505240261, "step": 9858, "train/total_loss": 0.11045562475919724 }, { "entropy": 8.79356861114502, "epoch": 0.9747874233735416, "mean_token_accuracy": 0.7195994257926941, "num_tokens": 30511942.0, "step": 9859, "train/ce_loss": 0.7857524752616882 }, { "epoch": 0.9747874233735416, "step": 9859, "train/sim_loss": 0.078125 }, { "epoch": 0.9747874233735416, "step": 9859, "train/total_loss": 0.1567002534866333 }, { "epoch": 0.9748862962230571, "grad_norm": 0.777504026889801, "learning_rate": 7.56490134994808e-06, "loss": 0.1331, "step": 9860 }, { "entropy": 9.06789779663086, "epoch": 0.9748862962230571, "mean_token_accuracy": 0.7567976117134094, "num_tokens": 30517046.0, "step": 9860, "train/ce_loss": 1.3188773393630981 }, { "epoch": 0.9748862962230571, "step": 9860, "train/sim_loss": 0.078125 }, { "epoch": 0.9748862962230571, "step": 9860, "train/total_loss": 0.21001273393630981 }, { "entropy": 8.430561065673828, "epoch": 0.9749851690725727, "mean_token_accuracy": 0.7646432518959045, "num_tokens": 30522474.0, "step": 9861, "train/ce_loss": 0.9632667899131775 }, { "epoch": 0.9749851690725727, "step": 9861, "train/sim_loss": 0.07421875 }, { "epoch": 0.9749851690725727, "step": 9861, "train/total_loss": 0.17054542899131775 }, { "entropy": 9.121234893798828, "epoch": 0.9750840419220882, "mean_token_accuracy": 0.7977991700172424, "num_tokens": 30527648.0, "step": 9862, "train/ce_loss": 1.0044971704483032 }, { "epoch": 0.9750840419220882, "step": 9862, "train/sim_loss": 0.05078125 }, { "epoch": 0.9750840419220882, "step": 9862, "train/total_loss": 0.15123096108436584 }, { "entropy": 8.365032196044922, "epoch": 0.9751829147716037, "mean_token_accuracy": 0.7340182662010193, "num_tokens": 30533041.0, "step": 9863, "train/ce_loss": 1.510496973991394 }, { "epoch": 0.9751829147716037, "step": 9863, "train/sim_loss": 0.0703125 }, { "epoch": 0.9751829147716037, "step": 9863, "train/total_loss": 0.22136220335960388 }, { "entropy": 8.620687484741211, "epoch": 0.9752817876211193, "mean_token_accuracy": 0.7404305934906006, "num_tokens": 30538414.0, "step": 9864, "train/ce_loss": 0.6511469483375549 }, { "epoch": 0.9752817876211193, "step": 9864, "train/sim_loss": 0.03125 }, { "epoch": 0.9752817876211193, "step": 9864, "train/total_loss": 0.09636469930410385 }, { "entropy": 8.600302696228027, "epoch": 0.9753806604706348, "mean_token_accuracy": 0.8005018830299377, "num_tokens": 30543687.0, "step": 9865, "train/ce_loss": 4.128142165882309e-07 }, { "epoch": 0.9753806604706348, "step": 9865, "train/sim_loss": 0.01953125 }, { "epoch": 0.9753806604706348, "step": 9865, "train/total_loss": 0.019531290978193283 }, { "entropy": 8.438641548156738, "epoch": 0.9754795333201502, "mean_token_accuracy": 0.7306532859802246, "num_tokens": 30549115.0, "step": 9866, "train/ce_loss": 0.852030336856842 }, { "epoch": 0.9754795333201502, "step": 9866, "train/sim_loss": 0.046875 }, { "epoch": 0.9754795333201502, "step": 9866, "train/total_loss": 0.13207803666591644 }, { "entropy": 8.44935131072998, "epoch": 0.9755784061696658, "mean_token_accuracy": 0.7590726017951965, "num_tokens": 30554567.0, "step": 9867, "train/ce_loss": 0.7305549383163452 }, { "epoch": 0.9755784061696658, "step": 9867, "train/sim_loss": 0.015625 }, { "epoch": 0.9755784061696658, "step": 9867, "train/total_loss": 0.08868049830198288 }, { "entropy": 9.04803466796875, "epoch": 0.9756772790191813, "mean_token_accuracy": 0.715242862701416, "num_tokens": 30559655.0, "step": 9868, "train/ce_loss": 9.432754382032726e-07 }, { "epoch": 0.9756772790191813, "step": 9868, "train/sim_loss": 0.046875 }, { "epoch": 0.9756772790191813, "step": 9868, "train/total_loss": 0.04687509313225746 }, { "entropy": 8.45159912109375, "epoch": 0.9757761518686968, "mean_token_accuracy": 0.7975757718086243, "num_tokens": 30564978.0, "step": 9869, "train/ce_loss": 0.6422327756881714 }, { "epoch": 0.9757761518686968, "step": 9869, "train/sim_loss": 0.02734375 }, { "epoch": 0.9757761518686968, "step": 9869, "train/total_loss": 0.0915670320391655 }, { "entropy": 8.481781005859375, "epoch": 0.9758750247182124, "mean_token_accuracy": 0.7322221994400024, "num_tokens": 30570348.0, "step": 9870, "train/ce_loss": 0.5732051730155945 }, { "epoch": 0.9758750247182124, "step": 9870, "train/sim_loss": 0.03125 }, { "epoch": 0.9758750247182124, "step": 9870, "train/total_loss": 0.08857052028179169 }, { "entropy": 8.707862854003906, "epoch": 0.9759738975677279, "mean_token_accuracy": 0.7855477929115295, "num_tokens": 30575692.0, "step": 9871, "train/ce_loss": 0.9808839559555054 }, { "epoch": 0.9759738975677279, "step": 9871, "train/sim_loss": 0.04296875 }, { "epoch": 0.9759738975677279, "step": 9871, "train/total_loss": 0.14105714857578278 }, { "entropy": 8.741270065307617, "epoch": 0.9760727704172434, "mean_token_accuracy": 0.7372549176216125, "num_tokens": 30580956.0, "step": 9872, "train/ce_loss": 1.205855369567871 }, { "epoch": 0.9760727704172434, "step": 9872, "train/sim_loss": 0.046875 }, { "epoch": 0.9760727704172434, "step": 9872, "train/total_loss": 0.16746053099632263 }, { "entropy": 8.641582489013672, "epoch": 0.976171643266759, "mean_token_accuracy": 0.7350705862045288, "num_tokens": 30586365.0, "step": 9873, "train/ce_loss": 0.45913925766944885 }, { "epoch": 0.976171643266759, "step": 9873, "train/sim_loss": 0.01953125 }, { "epoch": 0.976171643266759, "step": 9873, "train/total_loss": 0.065445177257061 }, { "entropy": 10.355825424194336, "epoch": 0.9762705161162745, "mean_token_accuracy": 1.0, "num_tokens": 30590747.0, "step": 9874, "train/ce_loss": 6.013480378896929e-05 }, { "epoch": 0.9762705161162745, "step": 9874, "train/sim_loss": 0.03515625 }, { "epoch": 0.9762705161162745, "step": 9874, "train/total_loss": 0.03516226261854172 }, { "entropy": 8.985115051269531, "epoch": 0.97636938896579, "mean_token_accuracy": 0.7647058963775635, "num_tokens": 30595756.0, "step": 9875, "train/ce_loss": 8.571366834075889e-07 }, { "epoch": 0.97636938896579, "step": 9875, "train/sim_loss": 0.046875 }, { "epoch": 0.97636938896579, "step": 9875, "train/total_loss": 0.046875085681676865 }, { "entropy": 8.91389274597168, "epoch": 0.9764682618153055, "mean_token_accuracy": 0.783382773399353, "num_tokens": 30600919.0, "step": 9876, "train/ce_loss": 3.1128601563068514e-07 }, { "epoch": 0.9764682618153055, "step": 9876, "train/sim_loss": 0.0390625 }, { "epoch": 0.9764682618153055, "step": 9876, "train/total_loss": 0.03906252980232239 }, { "entropy": 8.787052154541016, "epoch": 0.976567134664821, "mean_token_accuracy": 0.7713903784751892, "num_tokens": 30606118.0, "step": 9877, "train/ce_loss": 0.4978821277618408 }, { "epoch": 0.976567134664821, "step": 9877, "train/sim_loss": 0.015625 }, { "epoch": 0.976567134664821, "step": 9877, "train/total_loss": 0.0654132142663002 }, { "entropy": 8.785783767700195, "epoch": 0.9766660075143365, "mean_token_accuracy": 0.7283018827438354, "num_tokens": 30611376.0, "step": 9878, "train/ce_loss": 0.9105390310287476 }, { "epoch": 0.9766660075143365, "step": 9878, "train/sim_loss": 0.08203125 }, { "epoch": 0.9766660075143365, "step": 9878, "train/total_loss": 0.17308515310287476 }, { "entropy": 8.438060760498047, "epoch": 0.9767648803638521, "mean_token_accuracy": 0.7971563935279846, "num_tokens": 30616939.0, "step": 9879, "train/ce_loss": 0.8996409773826599 }, { "epoch": 0.9767648803638521, "step": 9879, "train/sim_loss": 0.046875 }, { "epoch": 0.9767648803638521, "step": 9879, "train/total_loss": 0.1368390917778015 }, { "epoch": 0.9768637532133676, "grad_norm": 0.510572075843811, "learning_rate": 7.559956485190131e-06, "loss": 0.1195, "step": 9880 }, { "entropy": 8.88320541381836, "epoch": 0.9768637532133676, "mean_token_accuracy": 0.718068540096283, "num_tokens": 30622004.0, "step": 9880, "train/ce_loss": 1.1161073446273804 }, { "epoch": 0.9768637532133676, "step": 9880, "train/sim_loss": 0.0234375 }, { "epoch": 0.9768637532133676, "step": 9880, "train/total_loss": 0.13504824042320251 }, { "entropy": 8.4711275100708, "epoch": 0.9769626260628831, "mean_token_accuracy": 0.7662650346755981, "num_tokens": 30627319.0, "step": 9881, "train/ce_loss": 0.8220862746238708 }, { "epoch": 0.9769626260628831, "step": 9881, "train/sim_loss": 0.0546875 }, { "epoch": 0.9769626260628831, "step": 9881, "train/total_loss": 0.13689613342285156 }, { "entropy": 8.859695434570312, "epoch": 0.9770614989123987, "mean_token_accuracy": 0.8397040963172913, "num_tokens": 30632589.0, "step": 9882, "train/ce_loss": 0.789210855960846 }, { "epoch": 0.9770614989123987, "step": 9882, "train/sim_loss": 0.1171875 }, { "epoch": 0.9770614989123987, "step": 9882, "train/total_loss": 0.19610857963562012 }, { "entropy": 8.766387939453125, "epoch": 0.9771603717619142, "mean_token_accuracy": 0.7624831199645996, "num_tokens": 30637801.0, "step": 9883, "train/ce_loss": 1.42343008518219 }, { "epoch": 0.9771603717619142, "step": 9883, "train/sim_loss": 0.125 }, { "epoch": 0.9771603717619142, "step": 9883, "train/total_loss": 0.26734301447868347 }, { "entropy": 8.970907211303711, "epoch": 0.9772592446114297, "mean_token_accuracy": 0.7400881052017212, "num_tokens": 30642949.0, "step": 9884, "train/ce_loss": 1.1563928127288818 }, { "epoch": 0.9772592446114297, "step": 9884, "train/sim_loss": 0.0546875 }, { "epoch": 0.9772592446114297, "step": 9884, "train/total_loss": 0.17032678425312042 }, { "entropy": 8.41048526763916, "epoch": 0.9773581174609453, "mean_token_accuracy": 0.7832817435264587, "num_tokens": 30648399.0, "step": 9885, "train/ce_loss": 0.7665212154388428 }, { "epoch": 0.9773581174609453, "step": 9885, "train/sim_loss": 0.0546875 }, { "epoch": 0.9773581174609453, "step": 9885, "train/total_loss": 0.13133962452411652 }, { "entropy": 8.745559692382812, "epoch": 0.9774569903104607, "mean_token_accuracy": 0.736923098564148, "num_tokens": 30653559.0, "step": 9886, "train/ce_loss": 1.0831034183502197 }, { "epoch": 0.9774569903104607, "step": 9886, "train/sim_loss": 0.02734375 }, { "epoch": 0.9774569903104607, "step": 9886, "train/total_loss": 0.13565409183502197 }, { "entropy": 8.54442024230957, "epoch": 0.9775558631599762, "mean_token_accuracy": 0.750295877456665, "num_tokens": 30658783.0, "step": 9887, "train/ce_loss": 1.1568293571472168 }, { "epoch": 0.9775558631599762, "step": 9887, "train/sim_loss": 0.02734375 }, { "epoch": 0.9775558631599762, "step": 9887, "train/total_loss": 0.1430266797542572 }, { "entropy": 8.582108497619629, "epoch": 0.9776547360094918, "mean_token_accuracy": 0.7840467095375061, "num_tokens": 30664269.0, "step": 9888, "train/ce_loss": 0.4403001368045807 }, { "epoch": 0.9776547360094918, "step": 9888, "train/sim_loss": 0.06640625 }, { "epoch": 0.9776547360094918, "step": 9888, "train/total_loss": 0.11043626070022583 }, { "entropy": 8.791365623474121, "epoch": 0.9777536088590073, "mean_token_accuracy": 0.7666666507720947, "num_tokens": 30669518.0, "step": 9889, "train/ce_loss": 0.5905187726020813 }, { "epoch": 0.9777536088590073, "step": 9889, "train/sim_loss": 0.0234375 }, { "epoch": 0.9777536088590073, "step": 9889, "train/total_loss": 0.08248937875032425 }, { "entropy": 8.640741348266602, "epoch": 0.9778524817085228, "mean_token_accuracy": 0.7566371560096741, "num_tokens": 30674853.0, "step": 9890, "train/ce_loss": 0.721811056137085 }, { "epoch": 0.9778524817085228, "step": 9890, "train/sim_loss": 0.02734375 }, { "epoch": 0.9778524817085228, "step": 9890, "train/total_loss": 0.0995248556137085 }, { "entropy": 8.512459754943848, "epoch": 0.9779513545580384, "mean_token_accuracy": 0.7124260067939758, "num_tokens": 30680183.0, "step": 9891, "train/ce_loss": 0.8118852972984314 }, { "epoch": 0.9779513545580384, "step": 9891, "train/sim_loss": 0.046875 }, { "epoch": 0.9779513545580384, "step": 9891, "train/total_loss": 0.12806352972984314 }, { "entropy": 8.836570739746094, "epoch": 0.9780502274075539, "mean_token_accuracy": 0.8242074847221375, "num_tokens": 30685331.0, "step": 9892, "train/ce_loss": 0.5848731398582458 }, { "epoch": 0.9780502274075539, "step": 9892, "train/sim_loss": 0.015625 }, { "epoch": 0.9780502274075539, "step": 9892, "train/total_loss": 0.07411231100559235 }, { "entropy": 8.382086753845215, "epoch": 0.9781491002570694, "mean_token_accuracy": 0.7394678592681885, "num_tokens": 30690664.0, "step": 9893, "train/ce_loss": 0.9393115043640137 }, { "epoch": 0.9781491002570694, "step": 9893, "train/sim_loss": 0.046875 }, { "epoch": 0.9781491002570694, "step": 9893, "train/total_loss": 0.1408061534166336 }, { "entropy": 8.651121139526367, "epoch": 0.978247973106585, "mean_token_accuracy": 0.7192254662513733, "num_tokens": 30695806.0, "step": 9894, "train/ce_loss": 1.034562110900879 }, { "epoch": 0.978247973106585, "step": 9894, "train/sim_loss": 0.09375 }, { "epoch": 0.978247973106585, "step": 9894, "train/total_loss": 0.19720621407032013 }, { "entropy": 9.303420066833496, "epoch": 0.9783468459561004, "mean_token_accuracy": 0.6924528479576111, "num_tokens": 30700778.0, "step": 9895, "train/ce_loss": 1.5140877962112427 }, { "epoch": 0.9783468459561004, "step": 9895, "train/sim_loss": 0.03515625 }, { "epoch": 0.9783468459561004, "step": 9895, "train/total_loss": 0.18656502664089203 }, { "entropy": 9.45178508758545, "epoch": 0.9784457188056159, "mean_token_accuracy": 0.7640449404716492, "num_tokens": 30705674.0, "step": 9896, "train/ce_loss": 2.3216692568439612e-07 }, { "epoch": 0.9784457188056159, "step": 9896, "train/sim_loss": 0.0078125 }, { "epoch": 0.9784457188056159, "step": 9896, "train/total_loss": 0.007812523283064365 }, { "entropy": 8.834487915039062, "epoch": 0.9785445916551315, "mean_token_accuracy": 0.7036328911781311, "num_tokens": 30710659.0, "step": 9897, "train/ce_loss": 1.789854884147644 }, { "epoch": 0.9785445916551315, "step": 9897, "train/sim_loss": 0.046875 }, { "epoch": 0.9785445916551315, "step": 9897, "train/total_loss": 0.22586049139499664 }, { "entropy": 8.278316497802734, "epoch": 0.978643464504647, "mean_token_accuracy": 0.7219917178153992, "num_tokens": 30716035.0, "step": 9898, "train/ce_loss": 0.8721766471862793 }, { "epoch": 0.978643464504647, "step": 9898, "train/sim_loss": 0.05859375 }, { "epoch": 0.978643464504647, "step": 9898, "train/total_loss": 0.14581140875816345 }, { "entropy": 9.036575317382812, "epoch": 0.9787423373541625, "mean_token_accuracy": 0.6861110925674438, "num_tokens": 30721229.0, "step": 9899, "train/ce_loss": 1.099528193473816 }, { "epoch": 0.9787423373541625, "step": 9899, "train/sim_loss": 0.04296875 }, { "epoch": 0.9787423373541625, "step": 9899, "train/total_loss": 0.15292157232761383 }, { "epoch": 0.9788412102036781, "grad_norm": 0.7387040853500366, "learning_rate": 7.555011620432182e-06, "loss": 0.1293, "step": 9900 }, { "entropy": 8.581748962402344, "epoch": 0.9788412102036781, "mean_token_accuracy": 0.689486563205719, "num_tokens": 30726478.0, "step": 9900, "train/ce_loss": 0.6335959434509277 }, { "epoch": 0.9788412102036781, "step": 9900, "train/sim_loss": 0.03125 }, { "epoch": 0.9788412102036781, "step": 9900, "train/total_loss": 0.09460959583520889 }, { "entropy": 8.285743713378906, "epoch": 0.9789400830531936, "mean_token_accuracy": 0.7516650557518005, "num_tokens": 30732196.0, "step": 9901, "train/ce_loss": 1.0870752334594727 }, { "epoch": 0.9789400830531936, "step": 9901, "train/sim_loss": 0.078125 }, { "epoch": 0.9789400830531936, "step": 9901, "train/total_loss": 0.1868325173854828 }, { "entropy": 8.982463836669922, "epoch": 0.9790389559027091, "mean_token_accuracy": 0.6801406145095825, "num_tokens": 30737219.0, "step": 9902, "train/ce_loss": 1.3527101278305054 }, { "epoch": 0.9790389559027091, "step": 9902, "train/sim_loss": 0.0546875 }, { "epoch": 0.9790389559027091, "step": 9902, "train/total_loss": 0.18995851278305054 }, { "entropy": 9.142914772033691, "epoch": 0.9791378287522247, "mean_token_accuracy": 0.7255892157554626, "num_tokens": 30742283.0, "step": 9903, "train/ce_loss": 1.7833095788955688 }, { "epoch": 0.9791378287522247, "step": 9903, "train/sim_loss": 0.06640625 }, { "epoch": 0.9791378287522247, "step": 9903, "train/total_loss": 0.24473720788955688 }, { "entropy": 8.998891830444336, "epoch": 0.9792367016017401, "mean_token_accuracy": 0.7496296167373657, "num_tokens": 30747397.0, "step": 9904, "train/ce_loss": 0.5943091511726379 }, { "epoch": 0.9792367016017401, "step": 9904, "train/sim_loss": 0.0234375 }, { "epoch": 0.9792367016017401, "step": 9904, "train/total_loss": 0.08286841213703156 }, { "entropy": 8.840842247009277, "epoch": 0.9793355744512556, "mean_token_accuracy": 0.7252252101898193, "num_tokens": 30752514.0, "step": 9905, "train/ce_loss": 1.216805338859558 }, { "epoch": 0.9793355744512556, "step": 9905, "train/sim_loss": 0.0703125 }, { "epoch": 0.9793355744512556, "step": 9905, "train/total_loss": 0.19199302792549133 }, { "entropy": 8.774174690246582, "epoch": 0.9794344473007712, "mean_token_accuracy": 0.7604422569274902, "num_tokens": 30757778.0, "step": 9906, "train/ce_loss": 0.9100068807601929 }, { "epoch": 0.9794344473007712, "step": 9906, "train/sim_loss": 0.03515625 }, { "epoch": 0.9794344473007712, "step": 9906, "train/total_loss": 0.12615694105625153 }, { "entropy": 8.940999984741211, "epoch": 0.9795333201502867, "mean_token_accuracy": 0.7410179376602173, "num_tokens": 30762903.0, "step": 9907, "train/ce_loss": 0.520973801612854 }, { "epoch": 0.9795333201502867, "step": 9907, "train/sim_loss": 0.046875 }, { "epoch": 0.9795333201502867, "step": 9907, "train/total_loss": 0.0989723801612854 }, { "entropy": 8.454092025756836, "epoch": 0.9796321929998023, "mean_token_accuracy": 0.7634069323539734, "num_tokens": 30768319.0, "step": 9908, "train/ce_loss": 0.9952600598335266 }, { "epoch": 0.9796321929998023, "step": 9908, "train/sim_loss": 0.06640625 }, { "epoch": 0.9796321929998023, "step": 9908, "train/total_loss": 0.16593226790428162 }, { "entropy": 8.704395294189453, "epoch": 0.9797310658493178, "mean_token_accuracy": 0.646039605140686, "num_tokens": 30773610.0, "step": 9909, "train/ce_loss": 1.4009902477264404 }, { "epoch": 0.9797310658493178, "step": 9909, "train/sim_loss": 0.06640625 }, { "epoch": 0.9797310658493178, "step": 9909, "train/total_loss": 0.20650528371334076 }, { "entropy": 8.383907318115234, "epoch": 0.9798299386988333, "mean_token_accuracy": 0.7507853507995605, "num_tokens": 30779058.0, "step": 9910, "train/ce_loss": 0.7845810055732727 }, { "epoch": 0.9798299386988333, "step": 9910, "train/sim_loss": 0.0390625 }, { "epoch": 0.9798299386988333, "step": 9910, "train/total_loss": 0.11752060055732727 }, { "entropy": 8.56901741027832, "epoch": 0.9799288115483489, "mean_token_accuracy": 0.7284946441650391, "num_tokens": 30784258.0, "step": 9911, "train/ce_loss": 0.6746019721031189 }, { "epoch": 0.9799288115483489, "step": 9911, "train/sim_loss": 0.0390625 }, { "epoch": 0.9799288115483489, "step": 9911, "train/total_loss": 0.10652270168066025 }, { "entropy": 8.998884201049805, "epoch": 0.9800276843978644, "mean_token_accuracy": 0.8318318128585815, "num_tokens": 30789417.0, "step": 9912, "train/ce_loss": 0.777534008026123 }, { "epoch": 0.9800276843978644, "step": 9912, "train/sim_loss": 0.01953125 }, { "epoch": 0.9800276843978644, "step": 9912, "train/total_loss": 0.09728465229272842 }, { "entropy": 8.481942176818848, "epoch": 0.9801265572473798, "mean_token_accuracy": 0.7428229451179504, "num_tokens": 30794865.0, "step": 9913, "train/ce_loss": 0.9887452721595764 }, { "epoch": 0.9801265572473798, "step": 9913, "train/sim_loss": 0.08203125 }, { "epoch": 0.9801265572473798, "step": 9913, "train/total_loss": 0.1809057891368866 }, { "entropy": 8.863540649414062, "epoch": 0.9802254300968954, "mean_token_accuracy": 0.7674094438552856, "num_tokens": 30800041.0, "step": 9914, "train/ce_loss": 1.9064351320266724 }, { "epoch": 0.9802254300968954, "step": 9914, "train/sim_loss": 0.0625 }, { "epoch": 0.9802254300968954, "step": 9914, "train/total_loss": 0.2531435191631317 }, { "entropy": 8.945619583129883, "epoch": 0.9803243029464109, "mean_token_accuracy": 0.7545271515846252, "num_tokens": 30805020.0, "step": 9915, "train/ce_loss": 0.5835563540458679 }, { "epoch": 0.9803243029464109, "step": 9915, "train/sim_loss": 0.0234375 }, { "epoch": 0.9803243029464109, "step": 9915, "train/total_loss": 0.08179313689470291 }, { "entropy": 9.063322067260742, "epoch": 0.9804231757959264, "mean_token_accuracy": 0.7410604357719421, "num_tokens": 30810451.0, "step": 9916, "train/ce_loss": 0.8868422508239746 }, { "epoch": 0.9804231757959264, "step": 9916, "train/sim_loss": 0.02734375 }, { "epoch": 0.9804231757959264, "step": 9916, "train/total_loss": 0.11602797359228134 }, { "entropy": 8.314630508422852, "epoch": 0.980522048645442, "mean_token_accuracy": 0.7217973470687866, "num_tokens": 30815993.0, "step": 9917, "train/ce_loss": 0.9607058167457581 }, { "epoch": 0.980522048645442, "step": 9917, "train/sim_loss": 0.01953125 }, { "epoch": 0.980522048645442, "step": 9917, "train/total_loss": 0.11560183018445969 }, { "entropy": 8.702873229980469, "epoch": 0.9806209214949575, "mean_token_accuracy": 0.6839160919189453, "num_tokens": 30821165.0, "step": 9918, "train/ce_loss": 0.7455655336380005 }, { "epoch": 0.9806209214949575, "step": 9918, "train/sim_loss": 0.05078125 }, { "epoch": 0.9806209214949575, "step": 9918, "train/total_loss": 0.12533780932426453 }, { "entropy": 8.36336612701416, "epoch": 0.980719794344473, "mean_token_accuracy": 0.8054474592208862, "num_tokens": 30826831.0, "step": 9919, "train/ce_loss": 0.691750705242157 }, { "epoch": 0.980719794344473, "step": 9919, "train/sim_loss": 0.0625 }, { "epoch": 0.980719794344473, "step": 9919, "train/total_loss": 0.13167506456375122 }, { "epoch": 0.9808186671939886, "grad_norm": 0.5420098900794983, "learning_rate": 7.550066755674232e-06, "loss": 0.1404, "step": 9920 }, { "entropy": 8.432720184326172, "epoch": 0.9808186671939886, "mean_token_accuracy": 0.6704196333885193, "num_tokens": 30832308.0, "step": 9920, "train/ce_loss": 1.9303783178329468 }, { "epoch": 0.9808186671939886, "step": 9920, "train/sim_loss": 0.04296875 }, { "epoch": 0.9808186671939886, "step": 9920, "train/total_loss": 0.23600658774375916 }, { "entropy": 8.786901473999023, "epoch": 0.9809175400435041, "mean_token_accuracy": 0.7964236736297607, "num_tokens": 30837514.0, "step": 9921, "train/ce_loss": 5.456970484374324e-07 }, { "epoch": 0.9809175400435041, "step": 9921, "train/sim_loss": 0.04296875 }, { "epoch": 0.9809175400435041, "step": 9921, "train/total_loss": 0.04296880587935448 }, { "entropy": 8.23611831665039, "epoch": 0.9810164128930196, "mean_token_accuracy": 0.7310061454772949, "num_tokens": 30843032.0, "step": 9922, "train/ce_loss": 0.5126314163208008 }, { "epoch": 0.9810164128930196, "step": 9922, "train/sim_loss": 0.03515625 }, { "epoch": 0.9810164128930196, "step": 9922, "train/total_loss": 0.08641938865184784 }, { "entropy": 8.348541259765625, "epoch": 0.9811152857425351, "mean_token_accuracy": 0.776190459728241, "num_tokens": 30848329.0, "step": 9923, "train/ce_loss": 0.7847875952720642 }, { "epoch": 0.9811152857425351, "step": 9923, "train/sim_loss": 0.078125 }, { "epoch": 0.9811152857425351, "step": 9923, "train/total_loss": 0.15660375356674194 }, { "entropy": 8.548707008361816, "epoch": 0.9812141585920506, "mean_token_accuracy": 0.7929824590682983, "num_tokens": 30853656.0, "step": 9924, "train/ce_loss": 0.8370770812034607 }, { "epoch": 0.9812141585920506, "step": 9924, "train/sim_loss": 0.04296875 }, { "epoch": 0.9812141585920506, "step": 9924, "train/total_loss": 0.12667647004127502 }, { "entropy": 8.849676132202148, "epoch": 0.9813130314415661, "mean_token_accuracy": 0.7673179507255554, "num_tokens": 30858608.0, "step": 9925, "train/ce_loss": 0.9295164346694946 }, { "epoch": 0.9813130314415661, "step": 9925, "train/sim_loss": 0.0390625 }, { "epoch": 0.9813130314415661, "step": 9925, "train/total_loss": 0.13201415538787842 }, { "entropy": 8.875197410583496, "epoch": 0.9814119042910817, "mean_token_accuracy": 0.7397959232330322, "num_tokens": 30863848.0, "step": 9926, "train/ce_loss": 0.45025020837783813 }, { "epoch": 0.9814119042910817, "step": 9926, "train/sim_loss": 0.078125 }, { "epoch": 0.9814119042910817, "step": 9926, "train/total_loss": 0.12315002083778381 }, { "entropy": 8.888309478759766, "epoch": 0.9815107771405972, "mean_token_accuracy": 0.7327459454536438, "num_tokens": 30868968.0, "step": 9927, "train/ce_loss": 0.6028944253921509 }, { "epoch": 0.9815107771405972, "step": 9927, "train/sim_loss": 0.04296875 }, { "epoch": 0.9815107771405972, "step": 9927, "train/total_loss": 0.10325819253921509 }, { "entropy": 8.876193046569824, "epoch": 0.9816096499901127, "mean_token_accuracy": 0.7164179086685181, "num_tokens": 30874115.0, "step": 9928, "train/ce_loss": 0.8704211115837097 }, { "epoch": 0.9816096499901127, "step": 9928, "train/sim_loss": 0.03125 }, { "epoch": 0.9816096499901127, "step": 9928, "train/total_loss": 0.11829211562871933 }, { "entropy": 8.968652725219727, "epoch": 0.9817085228396283, "mean_token_accuracy": 0.7547169923782349, "num_tokens": 30879250.0, "step": 9929, "train/ce_loss": 0.651799738407135 }, { "epoch": 0.9817085228396283, "step": 9929, "train/sim_loss": 0.015625 }, { "epoch": 0.9817085228396283, "step": 9929, "train/total_loss": 0.0808049738407135 }, { "entropy": 8.65110969543457, "epoch": 0.9818073956891438, "mean_token_accuracy": 0.790043294429779, "num_tokens": 30884875.0, "step": 9930, "train/ce_loss": 1.005576491355896 }, { "epoch": 0.9818073956891438, "step": 9930, "train/sim_loss": 0.14453125 }, { "epoch": 0.9818073956891438, "step": 9930, "train/total_loss": 0.24508890509605408 }, { "entropy": 9.129095077514648, "epoch": 0.9819062685386593, "mean_token_accuracy": 0.7457627058029175, "num_tokens": 30889864.0, "step": 9931, "train/ce_loss": 9.725036989038927e-07 }, { "epoch": 0.9819062685386593, "step": 9931, "train/sim_loss": 0.04296875 }, { "epoch": 0.9819062685386593, "step": 9931, "train/total_loss": 0.04296884685754776 }, { "entropy": 9.076364517211914, "epoch": 0.9820051413881749, "mean_token_accuracy": 0.6855670213699341, "num_tokens": 30894878.0, "step": 9932, "train/ce_loss": 0.7352795004844666 }, { "epoch": 0.9820051413881749, "step": 9932, "train/sim_loss": 0.03125 }, { "epoch": 0.9820051413881749, "step": 9932, "train/total_loss": 0.10477795451879501 }, { "entropy": 8.525617599487305, "epoch": 0.9821040142376903, "mean_token_accuracy": 0.7788162231445312, "num_tokens": 30900269.0, "step": 9933, "train/ce_loss": 0.4009047746658325 }, { "epoch": 0.9821040142376903, "step": 9933, "train/sim_loss": 0.0390625 }, { "epoch": 0.9821040142376903, "step": 9933, "train/total_loss": 0.07915297895669937 }, { "entropy": 8.45478343963623, "epoch": 0.9822028870872058, "mean_token_accuracy": 0.7869757413864136, "num_tokens": 30905662.0, "step": 9934, "train/ce_loss": 0.6240054965019226 }, { "epoch": 0.9822028870872058, "step": 9934, "train/sim_loss": 0.03125 }, { "epoch": 0.9822028870872058, "step": 9934, "train/total_loss": 0.09365054965019226 }, { "entropy": 8.567974090576172, "epoch": 0.9823017599367214, "mean_token_accuracy": 0.7644444704055786, "num_tokens": 30911045.0, "step": 9935, "train/ce_loss": 0.5513052940368652 }, { "epoch": 0.9823017599367214, "step": 9935, "train/sim_loss": 0.02734375 }, { "epoch": 0.9823017599367214, "step": 9935, "train/total_loss": 0.08247427642345428 }, { "entropy": 8.663185119628906, "epoch": 0.9824006327862369, "mean_token_accuracy": 0.7214452028274536, "num_tokens": 30916391.0, "step": 9936, "train/ce_loss": 1.0856975317001343 }, { "epoch": 0.9824006327862369, "step": 9936, "train/sim_loss": 0.0625 }, { "epoch": 0.9824006327862369, "step": 9936, "train/total_loss": 0.17106975615024567 }, { "entropy": 8.67253303527832, "epoch": 0.9824995056357524, "mean_token_accuracy": 0.7493606209754944, "num_tokens": 30921630.0, "step": 9937, "train/ce_loss": 0.9833881855010986 }, { "epoch": 0.9824995056357524, "step": 9937, "train/sim_loss": 0.05859375 }, { "epoch": 0.9824995056357524, "step": 9937, "train/total_loss": 0.15693256258964539 }, { "entropy": 8.700638771057129, "epoch": 0.982598378485268, "mean_token_accuracy": 0.6666666865348816, "num_tokens": 30926947.0, "step": 9938, "train/ce_loss": 0.9952269196510315 }, { "epoch": 0.982598378485268, "step": 9938, "train/sim_loss": 0.046875 }, { "epoch": 0.982598378485268, "step": 9938, "train/total_loss": 0.1463976949453354 }, { "entropy": 9.080299377441406, "epoch": 0.9826972513347835, "mean_token_accuracy": 0.8273026347160339, "num_tokens": 30932016.0, "step": 9939, "train/ce_loss": 0.6579766869544983 }, { "epoch": 0.9826972513347835, "step": 9939, "train/sim_loss": 0.0234375 }, { "epoch": 0.9826972513347835, "step": 9939, "train/total_loss": 0.08923517167568207 }, { "epoch": 0.982796124184299, "grad_norm": 0.5730013251304626, "learning_rate": 7.545121890916283e-06, "loss": 0.1323, "step": 9940 }, { "entropy": 8.32789421081543, "epoch": 0.982796124184299, "mean_token_accuracy": 0.7092130780220032, "num_tokens": 30937548.0, "step": 9940, "train/ce_loss": 1.2583142518997192 }, { "epoch": 0.982796124184299, "step": 9940, "train/sim_loss": 0.04296875 }, { "epoch": 0.982796124184299, "step": 9940, "train/total_loss": 0.16880017518997192 }, { "entropy": 8.597822189331055, "epoch": 0.9828949970338146, "mean_token_accuracy": 0.7548179626464844, "num_tokens": 30942910.0, "step": 9941, "train/ce_loss": 0.8048480153083801 }, { "epoch": 0.9828949970338146, "step": 9941, "train/sim_loss": 0.05859375 }, { "epoch": 0.9828949970338146, "step": 9941, "train/total_loss": 0.1390785574913025 }, { "entropy": 8.579482078552246, "epoch": 0.98299386988333, "mean_token_accuracy": 0.7096773982048035, "num_tokens": 30948202.0, "step": 9942, "train/ce_loss": 0.7927011251449585 }, { "epoch": 0.98299386988333, "step": 9942, "train/sim_loss": 0.03125 }, { "epoch": 0.98299386988333, "step": 9942, "train/total_loss": 0.11052011698484421 }, { "entropy": 8.698169708251953, "epoch": 0.9830927427328455, "mean_token_accuracy": 0.7479091882705688, "num_tokens": 30953496.0, "step": 9943, "train/ce_loss": 0.5432027578353882 }, { "epoch": 0.9830927427328455, "step": 9943, "train/sim_loss": 0.05078125 }, { "epoch": 0.9830927427328455, "step": 9943, "train/total_loss": 0.10510152578353882 }, { "entropy": 9.422571182250977, "epoch": 0.9831916155823611, "mean_token_accuracy": 0.7852882742881775, "num_tokens": 30958416.0, "step": 9944, "train/ce_loss": 1.8939794017569511e-06 }, { "epoch": 0.9831916155823611, "step": 9944, "train/sim_loss": 0.03125 }, { "epoch": 0.9831916155823611, "step": 9944, "train/total_loss": 0.03125018998980522 }, { "entropy": 8.789154052734375, "epoch": 0.9832904884318766, "mean_token_accuracy": 0.7508854866027832, "num_tokens": 30963690.0, "step": 9945, "train/ce_loss": 0.2710076868534088 }, { "epoch": 0.9832904884318766, "step": 9945, "train/sim_loss": 0.01171875 }, { "epoch": 0.9832904884318766, "step": 9945, "train/total_loss": 0.03881952166557312 }, { "entropy": 8.839859008789062, "epoch": 0.9833893612813921, "mean_token_accuracy": 0.708020031452179, "num_tokens": 30968956.0, "step": 9946, "train/ce_loss": 0.5631778836250305 }, { "epoch": 0.9833893612813921, "step": 9946, "train/sim_loss": 0.0703125 }, { "epoch": 0.9833893612813921, "step": 9946, "train/total_loss": 0.1266302913427353 }, { "entropy": 8.943111419677734, "epoch": 0.9834882341309077, "mean_token_accuracy": 0.7838214635848999, "num_tokens": 30974140.0, "step": 9947, "train/ce_loss": 3.6909526102135715e-07 }, { "epoch": 0.9834882341309077, "step": 9947, "train/sim_loss": 0.0390625 }, { "epoch": 0.9834882341309077, "step": 9947, "train/total_loss": 0.039062537252902985 }, { "entropy": 9.292435646057129, "epoch": 0.9835871069804232, "mean_token_accuracy": 0.8214936256408691, "num_tokens": 30979104.0, "step": 9948, "train/ce_loss": 1.589453262340612e-07 }, { "epoch": 0.9835871069804232, "step": 9948, "train/sim_loss": 0.01953125 }, { "epoch": 0.9835871069804232, "step": 9948, "train/total_loss": 0.019531266763806343 }, { "entropy": 8.4045991897583, "epoch": 0.9836859798299387, "mean_token_accuracy": 0.7729138135910034, "num_tokens": 30984340.0, "step": 9949, "train/ce_loss": 0.5247573256492615 }, { "epoch": 0.9836859798299387, "step": 9949, "train/sim_loss": 0.0390625 }, { "epoch": 0.9836859798299387, "step": 9949, "train/total_loss": 0.09153823554515839 }, { "entropy": 8.655339241027832, "epoch": 0.9837848526794543, "mean_token_accuracy": 0.8016877770423889, "num_tokens": 30989909.0, "step": 9950, "train/ce_loss": 0.4090072810649872 }, { "epoch": 0.9837848526794543, "step": 9950, "train/sim_loss": 0.015625 }, { "epoch": 0.9837848526794543, "step": 9950, "train/total_loss": 0.05652572959661484 }, { "entropy": 8.748329162597656, "epoch": 0.9838837255289697, "mean_token_accuracy": 0.7439024448394775, "num_tokens": 30995145.0, "step": 9951, "train/ce_loss": 0.8711205720901489 }, { "epoch": 0.9838837255289697, "step": 9951, "train/sim_loss": 0.03515625 }, { "epoch": 0.9838837255289697, "step": 9951, "train/total_loss": 0.12226831167936325 }, { "entropy": 8.843263626098633, "epoch": 0.9839825983784852, "mean_token_accuracy": 0.7362045645713806, "num_tokens": 31000515.0, "step": 9952, "train/ce_loss": 1.092915415763855 }, { "epoch": 0.9839825983784852, "step": 9952, "train/sim_loss": 0.10546875 }, { "epoch": 0.9839825983784852, "step": 9952, "train/total_loss": 0.21476030349731445 }, { "entropy": 8.46907901763916, "epoch": 0.9840814712280008, "mean_token_accuracy": 0.7589175701141357, "num_tokens": 31005985.0, "step": 9953, "train/ce_loss": 0.49849000573158264 }, { "epoch": 0.9840814712280008, "step": 9953, "train/sim_loss": 0.0703125 }, { "epoch": 0.9840814712280008, "step": 9953, "train/total_loss": 0.1201615035533905 }, { "entropy": 8.036636352539062, "epoch": 0.9841803440775163, "mean_token_accuracy": 0.7485265135765076, "num_tokens": 31011469.0, "step": 9954, "train/ce_loss": 1.0809634923934937 }, { "epoch": 0.9841803440775163, "step": 9954, "train/sim_loss": 0.05078125 }, { "epoch": 0.9841803440775163, "step": 9954, "train/total_loss": 0.15887761116027832 }, { "entropy": 9.49459457397461, "epoch": 0.9842792169270318, "mean_token_accuracy": 0.7575757503509521, "num_tokens": 31016328.0, "step": 9955, "train/ce_loss": 1.3868564367294312 }, { "epoch": 0.9842792169270318, "step": 9955, "train/sim_loss": 0.03125 }, { "epoch": 0.9842792169270318, "step": 9955, "train/total_loss": 0.16993564367294312 }, { "entropy": 8.62130355834961, "epoch": 0.9843780897765474, "mean_token_accuracy": 0.7247806787490845, "num_tokens": 31021779.0, "step": 9956, "train/ce_loss": 0.7187850475311279 }, { "epoch": 0.9843780897765474, "step": 9956, "train/sim_loss": 0.03515625 }, { "epoch": 0.9843780897765474, "step": 9956, "train/total_loss": 0.10703475773334503 }, { "entropy": 8.984066009521484, "epoch": 0.9844769626260629, "mean_token_accuracy": 0.7504835724830627, "num_tokens": 31026751.0, "step": 9957, "train/ce_loss": 0.8555747866630554 }, { "epoch": 0.9844769626260629, "step": 9957, "train/sim_loss": 0.02734375 }, { "epoch": 0.9844769626260629, "step": 9957, "train/total_loss": 0.1129012331366539 }, { "entropy": 8.909862518310547, "epoch": 0.9845758354755784, "mean_token_accuracy": 0.7599451541900635, "num_tokens": 31031971.0, "step": 9958, "train/ce_loss": 1.0698707103729248 }, { "epoch": 0.9845758354755784, "step": 9958, "train/sim_loss": 0.05859375 }, { "epoch": 0.9845758354755784, "step": 9958, "train/total_loss": 0.16558082401752472 }, { "entropy": 8.192459106445312, "epoch": 0.984674708325094, "mean_token_accuracy": 0.7552602291107178, "num_tokens": 31037396.0, "step": 9959, "train/ce_loss": 0.49683094024658203 }, { "epoch": 0.984674708325094, "step": 9959, "train/sim_loss": 0.05859375 }, { "epoch": 0.984674708325094, "step": 9959, "train/total_loss": 0.1082768440246582 }, { "epoch": 0.9847735811746094, "grad_norm": 0.6442089080810547, "learning_rate": 7.540177026158335e-06, "loss": 0.1207, "step": 9960 }, { "entropy": 8.275558471679688, "epoch": 0.9847735811746094, "mean_token_accuracy": 0.7203311920166016, "num_tokens": 31042946.0, "step": 9960, "train/ce_loss": 0.5132285356521606 }, { "epoch": 0.9847735811746094, "step": 9960, "train/sim_loss": 0.03515625 }, { "epoch": 0.9847735811746094, "step": 9960, "train/total_loss": 0.08647910505533218 }, { "entropy": 8.636802673339844, "epoch": 0.9848724540241249, "mean_token_accuracy": 0.7493857741355896, "num_tokens": 31048241.0, "step": 9961, "train/ce_loss": 0.7567731142044067 }, { "epoch": 0.9848724540241249, "step": 9961, "train/sim_loss": 0.05078125 }, { "epoch": 0.9848724540241249, "step": 9961, "train/total_loss": 0.1264585554599762 }, { "entropy": 8.473033905029297, "epoch": 0.9849713268736405, "mean_token_accuracy": 0.7456258535385132, "num_tokens": 31053448.0, "step": 9962, "train/ce_loss": 0.773692786693573 }, { "epoch": 0.9849713268736405, "step": 9962, "train/sim_loss": 0.0390625 }, { "epoch": 0.9849713268736405, "step": 9962, "train/total_loss": 0.11643178015947342 }, { "entropy": 8.31374740600586, "epoch": 0.985070199723156, "mean_token_accuracy": 0.7385475039482117, "num_tokens": 31058793.0, "step": 9963, "train/ce_loss": 1.0783629417419434 }, { "epoch": 0.985070199723156, "step": 9963, "train/sim_loss": 0.046875 }, { "epoch": 0.985070199723156, "step": 9963, "train/total_loss": 0.1547113060951233 }, { "entropy": 8.551907539367676, "epoch": 0.9851690725726715, "mean_token_accuracy": 0.7377245426177979, "num_tokens": 31064149.0, "step": 9964, "train/ce_loss": 1.03456449508667 }, { "epoch": 0.9851690725726715, "step": 9964, "train/sim_loss": 0.07421875 }, { "epoch": 0.9851690725726715, "step": 9964, "train/total_loss": 0.17767520248889923 }, { "entropy": 8.632230758666992, "epoch": 0.9852679454221871, "mean_token_accuracy": 0.751396656036377, "num_tokens": 31069309.0, "step": 9965, "train/ce_loss": 0.8975366353988647 }, { "epoch": 0.9852679454221871, "step": 9965, "train/sim_loss": 0.0390625 }, { "epoch": 0.9852679454221871, "step": 9965, "train/total_loss": 0.128816157579422 }, { "entropy": 8.25493049621582, "epoch": 0.9853668182717026, "mean_token_accuracy": 0.7153518199920654, "num_tokens": 31074731.0, "step": 9966, "train/ce_loss": 1.4290907382965088 }, { "epoch": 0.9853668182717026, "step": 9966, "train/sim_loss": 0.046875 }, { "epoch": 0.9853668182717026, "step": 9966, "train/total_loss": 0.18978407979011536 }, { "entropy": 8.804088592529297, "epoch": 0.9854656911212181, "mean_token_accuracy": 0.737500011920929, "num_tokens": 31079930.0, "step": 9967, "train/ce_loss": 0.7617242336273193 }, { "epoch": 0.9854656911212181, "step": 9967, "train/sim_loss": 0.05078125 }, { "epoch": 0.9854656911212181, "step": 9967, "train/total_loss": 0.12695367634296417 }, { "entropy": 8.463518142700195, "epoch": 0.9855645639707337, "mean_token_accuracy": 0.7080292105674744, "num_tokens": 31085205.0, "step": 9968, "train/ce_loss": 0.8394144177436829 }, { "epoch": 0.9855645639707337, "step": 9968, "train/sim_loss": 0.01953125 }, { "epoch": 0.9855645639707337, "step": 9968, "train/total_loss": 0.10347269475460052 }, { "entropy": 8.643590927124023, "epoch": 0.9856634368202492, "mean_token_accuracy": 0.7765700221061707, "num_tokens": 31090452.0, "step": 9969, "train/ce_loss": 0.3944287896156311 }, { "epoch": 0.9856634368202492, "step": 9969, "train/sim_loss": 0.015625 }, { "epoch": 0.9856634368202492, "step": 9969, "train/total_loss": 0.05506787821650505 }, { "entropy": 8.440271377563477, "epoch": 0.9857623096697646, "mean_token_accuracy": 0.7650334239006042, "num_tokens": 31095819.0, "step": 9970, "train/ce_loss": 0.8097338676452637 }, { "epoch": 0.9857623096697646, "step": 9970, "train/sim_loss": 0.046875 }, { "epoch": 0.9857623096697646, "step": 9970, "train/total_loss": 0.12784838676452637 }, { "entropy": 8.892143249511719, "epoch": 0.9858611825192802, "mean_token_accuracy": 0.7469244003295898, "num_tokens": 31100858.0, "step": 9971, "train/ce_loss": 1.3040673732757568 }, { "epoch": 0.9858611825192802, "step": 9971, "train/sim_loss": 0.04296875 }, { "epoch": 0.9858611825192802, "step": 9971, "train/total_loss": 0.17337548732757568 }, { "entropy": 9.170970916748047, "epoch": 0.9859600553687957, "mean_token_accuracy": 0.8060606122016907, "num_tokens": 31105753.0, "step": 9972, "train/ce_loss": 1.0176302194595337 }, { "epoch": 0.9859600553687957, "step": 9972, "train/sim_loss": 0.015625 }, { "epoch": 0.9859600553687957, "step": 9972, "train/total_loss": 0.11738802492618561 }, { "entropy": 8.85882568359375, "epoch": 0.9860589282183112, "mean_token_accuracy": 0.6770708560943604, "num_tokens": 31111062.0, "step": 9973, "train/ce_loss": 0.7416624426841736 }, { "epoch": 0.9860589282183112, "step": 9973, "train/sim_loss": 0.0625 }, { "epoch": 0.9860589282183112, "step": 9973, "train/total_loss": 0.13666623830795288 }, { "entropy": 8.846567153930664, "epoch": 0.9861578010678268, "mean_token_accuracy": 0.822603702545166, "num_tokens": 31116244.0, "step": 9974, "train/ce_loss": 0.8458290100097656 }, { "epoch": 0.9861578010678268, "step": 9974, "train/sim_loss": 0.0390625 }, { "epoch": 0.9861578010678268, "step": 9974, "train/total_loss": 0.12364540249109268 }, { "entropy": 8.67448616027832, "epoch": 0.9862566739173423, "mean_token_accuracy": 0.7335957884788513, "num_tokens": 31121455.0, "step": 9975, "train/ce_loss": 0.8351233601570129 }, { "epoch": 0.9862566739173423, "step": 9975, "train/sim_loss": 0.0703125 }, { "epoch": 0.9862566739173423, "step": 9975, "train/total_loss": 0.1538248360157013 }, { "entropy": 8.290175437927246, "epoch": 0.9863555467668578, "mean_token_accuracy": 0.742484986782074, "num_tokens": 31126914.0, "step": 9976, "train/ce_loss": 1.2690761089324951 }, { "epoch": 0.9863555467668578, "step": 9976, "train/sim_loss": 0.0546875 }, { "epoch": 0.9863555467668578, "step": 9976, "train/total_loss": 0.181595116853714 }, { "entropy": 8.776311874389648, "epoch": 0.9864544196163734, "mean_token_accuracy": 0.739534854888916, "num_tokens": 31132012.0, "step": 9977, "train/ce_loss": 0.7153046131134033 }, { "epoch": 0.9864544196163734, "step": 9977, "train/sim_loss": 0.0546875 }, { "epoch": 0.9864544196163734, "step": 9977, "train/total_loss": 0.12621796131134033 }, { "entropy": 8.852508544921875, "epoch": 0.9865532924658889, "mean_token_accuracy": 0.671999990940094, "num_tokens": 31137088.0, "step": 9978, "train/ce_loss": 7.408016244880855e-07 }, { "epoch": 0.9865532924658889, "step": 9978, "train/sim_loss": 0.05859375 }, { "epoch": 0.9865532924658889, "step": 9978, "train/total_loss": 0.05859382450580597 }, { "entropy": 9.135875701904297, "epoch": 0.9866521653154043, "mean_token_accuracy": 0.7842639684677124, "num_tokens": 31141936.0, "step": 9979, "train/ce_loss": 1.3566555026045535e-06 }, { "epoch": 0.9866521653154043, "step": 9979, "train/sim_loss": 0.046875 }, { "epoch": 0.9866521653154043, "step": 9979, "train/total_loss": 0.046875134110450745 }, { "epoch": 0.9867510381649199, "grad_norm": 0.7542499899864197, "learning_rate": 7.535232161400386e-06, "loss": 0.1305, "step": 9980 }, { "entropy": 8.518350601196289, "epoch": 0.9867510381649199, "mean_token_accuracy": 0.7614973187446594, "num_tokens": 31147269.0, "step": 9980, "train/ce_loss": 0.5412923693656921 }, { "epoch": 0.9867510381649199, "step": 9980, "train/sim_loss": 0.015625 }, { "epoch": 0.9867510381649199, "step": 9980, "train/total_loss": 0.06975424289703369 }, { "entropy": 8.914718627929688, "epoch": 0.9868499110144354, "mean_token_accuracy": 0.7020348906517029, "num_tokens": 31152404.0, "step": 9981, "train/ce_loss": 0.8738462924957275 }, { "epoch": 0.9868499110144354, "step": 9981, "train/sim_loss": 0.125 }, { "epoch": 0.9868499110144354, "step": 9981, "train/total_loss": 0.2123846411705017 }, { "entropy": 8.742120742797852, "epoch": 0.9869487838639509, "mean_token_accuracy": 0.7226277589797974, "num_tokens": 31157670.0, "step": 9982, "train/ce_loss": 0.7680981755256653 }, { "epoch": 0.9869487838639509, "step": 9982, "train/sim_loss": 0.04296875 }, { "epoch": 0.9869487838639509, "step": 9982, "train/total_loss": 0.11977856606245041 }, { "entropy": 8.708446502685547, "epoch": 0.9870476567134665, "mean_token_accuracy": 0.7712895274162292, "num_tokens": 31162978.0, "step": 9983, "train/ce_loss": 0.5740722417831421 }, { "epoch": 0.9870476567134665, "step": 9983, "train/sim_loss": 0.03125 }, { "epoch": 0.9870476567134665, "step": 9983, "train/total_loss": 0.08865723013877869 }, { "entropy": 8.663925170898438, "epoch": 0.987146529562982, "mean_token_accuracy": 0.7137305736541748, "num_tokens": 31168202.0, "step": 9984, "train/ce_loss": 1.1544108390808105 }, { "epoch": 0.987146529562982, "step": 9984, "train/sim_loss": 0.03515625 }, { "epoch": 0.987146529562982, "step": 9984, "train/total_loss": 0.15059733390808105 }, { "entropy": 8.859031677246094, "epoch": 0.9872454024124975, "mean_token_accuracy": 0.7522522807121277, "num_tokens": 31173490.0, "step": 9985, "train/ce_loss": 1.1717761754989624 }, { "epoch": 0.9872454024124975, "step": 9985, "train/sim_loss": 0.0859375 }, { "epoch": 0.9872454024124975, "step": 9985, "train/total_loss": 0.20311512053012848 }, { "entropy": 8.618402481079102, "epoch": 0.9873442752620131, "mean_token_accuracy": 0.7758620977401733, "num_tokens": 31178664.0, "step": 9986, "train/ce_loss": 0.8400186896324158 }, { "epoch": 0.9873442752620131, "step": 9986, "train/sim_loss": 0.03515625 }, { "epoch": 0.9873442752620131, "step": 9986, "train/total_loss": 0.11915811896324158 }, { "entropy": 9.091707229614258, "epoch": 0.9874431481115286, "mean_token_accuracy": 0.8032000064849854, "num_tokens": 31183757.0, "step": 9987, "train/ce_loss": 8.002979825505463e-07 }, { "epoch": 0.9874431481115286, "step": 9987, "train/sim_loss": 0.0234375 }, { "epoch": 0.9874431481115286, "step": 9987, "train/total_loss": 0.023437580093741417 }, { "entropy": 8.52845287322998, "epoch": 0.987542020961044, "mean_token_accuracy": 0.7927232384681702, "num_tokens": 31189178.0, "step": 9988, "train/ce_loss": 0.7290910482406616 }, { "epoch": 0.987542020961044, "step": 9988, "train/sim_loss": 0.0546875 }, { "epoch": 0.987542020961044, "step": 9988, "train/total_loss": 0.12759661674499512 }, { "entropy": 9.61940860748291, "epoch": 0.9876408938105596, "mean_token_accuracy": 0.7434554696083069, "num_tokens": 31193922.0, "step": 9989, "train/ce_loss": 3.083202955167508e-07 }, { "epoch": 0.9876408938105596, "step": 9989, "train/sim_loss": 0.01171875 }, { "epoch": 0.9876408938105596, "step": 9989, "train/total_loss": 0.011718780733644962 }, { "entropy": 8.437448501586914, "epoch": 0.9877397666600751, "mean_token_accuracy": 0.7885652780532837, "num_tokens": 31199360.0, "step": 9990, "train/ce_loss": 0.2778100073337555 }, { "epoch": 0.9877397666600751, "step": 9990, "train/sim_loss": 0.015625 }, { "epoch": 0.9877397666600751, "step": 9990, "train/total_loss": 0.04340600222349167 }, { "entropy": 9.451400756835938, "epoch": 0.9878386395095907, "mean_token_accuracy": 0.747863233089447, "num_tokens": 31204240.0, "step": 9991, "train/ce_loss": 0.9985411167144775 }, { "epoch": 0.9878386395095907, "step": 9991, "train/sim_loss": 0.06640625 }, { "epoch": 0.9878386395095907, "step": 9991, "train/total_loss": 0.16626036167144775 }, { "entropy": 8.895739555358887, "epoch": 0.9879375123591062, "mean_token_accuracy": 0.7634561061859131, "num_tokens": 31209503.0, "step": 9992, "train/ce_loss": 1.0490940809249878 }, { "epoch": 0.9879375123591062, "step": 9992, "train/sim_loss": 0.02734375 }, { "epoch": 0.9879375123591062, "step": 9992, "train/total_loss": 0.13225317001342773 }, { "entropy": 9.31628704071045, "epoch": 0.9880363852086217, "mean_token_accuracy": 0.7696850299835205, "num_tokens": 31214458.0, "step": 9993, "train/ce_loss": 0.9760105609893799 }, { "epoch": 0.9880363852086217, "step": 9993, "train/sim_loss": 0.01953125 }, { "epoch": 0.9880363852086217, "step": 9993, "train/total_loss": 0.11713230609893799 }, { "entropy": 8.331695556640625, "epoch": 0.9881352580581373, "mean_token_accuracy": 0.7680355310440063, "num_tokens": 31219831.0, "step": 9994, "train/ce_loss": 0.7177839279174805 }, { "epoch": 0.9881352580581373, "step": 9994, "train/sim_loss": 0.0234375 }, { "epoch": 0.9881352580581373, "step": 9994, "train/total_loss": 0.09521589428186417 }, { "entropy": 8.767294883728027, "epoch": 0.9882341309076528, "mean_token_accuracy": 0.799501895904541, "num_tokens": 31225104.0, "step": 9995, "train/ce_loss": 0.5451284050941467 }, { "epoch": 0.9882341309076528, "step": 9995, "train/sim_loss": 0.0234375 }, { "epoch": 0.9882341309076528, "step": 9995, "train/total_loss": 0.07795034348964691 }, { "entropy": 8.49264907836914, "epoch": 0.9883330037571683, "mean_token_accuracy": 0.7464065551757812, "num_tokens": 31230563.0, "step": 9996, "train/ce_loss": 0.4141041040420532 }, { "epoch": 0.9883330037571683, "step": 9996, "train/sim_loss": 0.015625 }, { "epoch": 0.9883330037571683, "step": 9996, "train/total_loss": 0.0570354126393795 }, { "entropy": 8.719945907592773, "epoch": 0.9884318766066839, "mean_token_accuracy": 0.7779156565666199, "num_tokens": 31235811.0, "step": 9997, "train/ce_loss": 0.7226426005363464 }, { "epoch": 0.9884318766066839, "step": 9997, "train/sim_loss": 0.01953125 }, { "epoch": 0.9884318766066839, "step": 9997, "train/total_loss": 0.09179551154375076 }, { "entropy": 9.418034553527832, "epoch": 0.9885307494561993, "mean_token_accuracy": 0.7284482717514038, "num_tokens": 31240665.0, "step": 9998, "train/ce_loss": 1.6247684955596924 }, { "epoch": 0.9885307494561993, "step": 9998, "train/sim_loss": 0.0703125 }, { "epoch": 0.9885307494561993, "step": 9998, "train/total_loss": 0.23278935253620148 }, { "entropy": 8.844606399536133, "epoch": 0.9886296223057148, "mean_token_accuracy": 0.7281553149223328, "num_tokens": 31245851.0, "step": 9999, "train/ce_loss": 0.621844470500946 }, { "epoch": 0.9886296223057148, "step": 9999, "train/sim_loss": 0.08984375 }, { "epoch": 0.9886296223057148, "step": 9999, "train/total_loss": 0.15202820301055908 }, { "epoch": 0.9887284951552304, "grad_norm": 0.7090817093849182, "learning_rate": 7.5302872966424374e-06, "loss": 0.1195, "step": 10000 } ], "logging_steps": 20, "max_steps": 40456, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.41961109815296e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }