{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.39549139806209216, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 3.8872127532958984, "epoch": 0, "mean_token_accuracy": 0.5917355418205261, "num_tokens": 50381.0, "step": 0, "train/ce_loss": 3.7070188522338867 }, { "epoch": 0, "step": 0, "train/sim_loss": 0.9783360958099365 }, { "epoch": 0, "step": 0, "train/total_loss": 1.349038004875183 }, { "entropy": 3.8865139484405518, "epoch": 0.00019774569903104609, "mean_token_accuracy": 0.6157925724983215, "num_tokens": 88316.0, "step": 1, "train/ce_loss": 2.21616268157959 }, { "epoch": 0.00019774569903104609, "step": 1, "train/sim_loss": 0.9697240591049194 }, { "epoch": 0.00019774569903104609, "step": 1, "train/total_loss": 1.1913403272628784 }, { "entropy": 4.0025811195373535, "epoch": 0.00039549139806209217, "mean_token_accuracy": 0.6337064504623413, "num_tokens": 119001.0, "step": 2, "train/ce_loss": 2.502980947494507 }, { "epoch": 0.00039549139806209217, "step": 2, "train/sim_loss": 0.9777464270591736 }, { "epoch": 0.00039549139806209217, "step": 2, "train/total_loss": 1.2280445098876953 }, { "entropy": 3.9231350421905518, "epoch": 0.0005932370970931383, "mean_token_accuracy": 0.6126453280448914, "num_tokens": 169333.0, "step": 3, "train/ce_loss": 2.1544790267944336 }, { "epoch": 0.0005932370970931383, "step": 3, "train/sim_loss": 0.9749578237533569 }, { "epoch": 0.0005932370970931383, "step": 3, "train/total_loss": 1.1904057264328003 }, { "entropy": 4.304129123687744, "epoch": 0.0007909827961241843, "mean_token_accuracy": 0.5909090638160706, "num_tokens": 205473.0, "step": 4, "train/ce_loss": 0.5527390241622925 }, { "epoch": 0.0007909827961241843, "step": 4, "train/sim_loss": 0.9380554556846619 }, { "epoch": 0.0007909827961241843, "step": 4, "train/total_loss": 0.9933293461799622 }, { "entropy": 3.7595107555389404, "epoch": 0.0009887284951552303, "mean_token_accuracy": 0.686661422252655, "num_tokens": 236391.0, "step": 5, "train/ce_loss": 1.8855355978012085 }, { "epoch": 0.0009887284951552303, "step": 5, "train/sim_loss": 0.9098438024520874 }, { "epoch": 0.0009887284951552303, "step": 5, "train/total_loss": 1.0983973741531372 }, { "entropy": 3.959226608276367, "epoch": 0.0011864741941862765, "mean_token_accuracy": 0.6334134340286255, "num_tokens": 284751.0, "step": 6, "train/ce_loss": 0.6815671920776367 }, { "epoch": 0.0011864741941862765, "step": 6, "train/sim_loss": 0.8965779542922974 }, { "epoch": 0.0011864741941862765, "step": 6, "train/total_loss": 0.964734673500061 }, { "entropy": 4.297743320465088, "epoch": 0.0013842198932173225, "mean_token_accuracy": 0.6939629316329956, "num_tokens": 345807.0, "step": 7, "train/ce_loss": 1.2219197750091553 }, { "epoch": 0.0013842198932173225, "step": 7, "train/sim_loss": 0.8181607127189636 }, { "epoch": 0.0013842198932173225, "step": 7, "train/total_loss": 0.9403526782989502 }, { "entropy": 4.212011337280273, "epoch": 0.0015819655922483687, "mean_token_accuracy": 0.656129002571106, "num_tokens": 383636.0, "step": 8, "train/ce_loss": 1.95631742477417 }, { "epoch": 0.0015819655922483687, "step": 8, "train/sim_loss": 0.797670841217041 }, { "epoch": 0.0015819655922483687, "step": 8, "train/total_loss": 0.993302583694458 }, { "entropy": 4.223197937011719, "epoch": 0.0017797112912794147, "mean_token_accuracy": 0.7255005240440369, "num_tokens": 436381.0, "step": 9, "train/ce_loss": 1.2108083963394165 }, { "epoch": 0.0017797112912794147, "step": 9, "train/sim_loss": 0.7214459776878357 }, { "epoch": 0.0017797112912794147, "step": 9, "train/total_loss": 0.8425267934799194 }, { "entropy": 4.3371076583862305, "epoch": 0.0019774569903104606, "mean_token_accuracy": 0.7260100841522217, "num_tokens": 482695.0, "step": 10, "train/ce_loss": 0.7096691131591797 }, { "epoch": 0.0019774569903104606, "step": 10, "train/sim_loss": 0.60673987865448 }, { "epoch": 0.0019774569903104606, "step": 10, "train/total_loss": 0.677706778049469 }, { "entropy": 4.30339241027832, "epoch": 0.002175202689341507, "mean_token_accuracy": 0.6805555820465088, "num_tokens": 514188.0, "step": 11, "train/ce_loss": 1.195139765739441 }, { "epoch": 0.002175202689341507, "step": 11, "train/sim_loss": 0.5427866578102112 }, { "epoch": 0.002175202689341507, "step": 11, "train/total_loss": 0.6623006463050842 }, { "entropy": 4.659426212310791, "epoch": 0.002372948388372553, "mean_token_accuracy": 0.667060911655426, "num_tokens": 576342.0, "step": 12, "train/ce_loss": 0.8507458567619324 }, { "epoch": 0.002372948388372553, "step": 12, "train/sim_loss": 0.42880260944366455 }, { "epoch": 0.002372948388372553, "step": 12, "train/total_loss": 0.5138772130012512 }, { "entropy": 4.532121658325195, "epoch": 0.002570694087403599, "mean_token_accuracy": 0.7205169796943665, "num_tokens": 612471.0, "step": 13, "train/ce_loss": 1.5486241579055786 }, { "epoch": 0.002570694087403599, "step": 13, "train/sim_loss": 0.3347446918487549 }, { "epoch": 0.002570694087403599, "step": 13, "train/total_loss": 0.4896070957183838 }, { "entropy": 4.673823356628418, "epoch": 0.002768439786434645, "mean_token_accuracy": 0.7434467077255249, "num_tokens": 653550.0, "step": 14, "train/ce_loss": 0.858471155166626 }, { "epoch": 0.002768439786434645, "step": 14, "train/sim_loss": 0.3023592233657837 }, { "epoch": 0.002768439786434645, "step": 14, "train/total_loss": 0.3882063329219818 }, { "entropy": 5.127185821533203, "epoch": 0.002966185485465691, "mean_token_accuracy": 0.7166778445243835, "num_tokens": 699346.0, "step": 15, "train/ce_loss": 1.2532557249069214 }, { "epoch": 0.002966185485465691, "step": 15, "train/sim_loss": 0.27063441276550293 }, { "epoch": 0.002966185485465691, "step": 15, "train/total_loss": 0.3959599733352661 }, { "entropy": 4.686318397521973, "epoch": 0.0031639311844967374, "mean_token_accuracy": 0.7198697328567505, "num_tokens": 733245.0, "step": 16, "train/ce_loss": 1.1497808694839478 }, { "epoch": 0.0031639311844967374, "step": 16, "train/sim_loss": 0.1867692470550537 }, { "epoch": 0.0031639311844967374, "step": 16, "train/total_loss": 0.30174732208251953 }, { "entropy": 5.101997375488281, "epoch": 0.003361676883527783, "mean_token_accuracy": 0.6893437504768372, "num_tokens": 781633.0, "step": 17, "train/ce_loss": 1.009338140487671 }, { "epoch": 0.003361676883527783, "step": 17, "train/sim_loss": 0.18192684650421143 }, { "epoch": 0.003361676883527783, "step": 17, "train/total_loss": 0.282860666513443 }, { "entropy": 4.994562149047852, "epoch": 0.0035594225825588293, "mean_token_accuracy": 0.7290909290313721, "num_tokens": 828056.0, "step": 18, "train/ce_loss": 1.1494468450546265 }, { "epoch": 0.0035594225825588293, "step": 18, "train/sim_loss": 0.1335231065750122 }, { "epoch": 0.0035594225825588293, "step": 18, "train/total_loss": 0.2484678030014038 }, { "entropy": 5.193882465362549, "epoch": 0.0037571682815898755, "mean_token_accuracy": 0.6987607479095459, "num_tokens": 863720.0, "step": 19, "train/ce_loss": 0.24169014394283295 }, { "epoch": 0.0037571682815898755, "step": 19, "train/sim_loss": 0.13817179203033447 }, { "epoch": 0.0037571682815898755, "step": 19, "train/total_loss": 0.16234080493450165 }, { "epoch": 0.003954913980620921, "grad_norm": 1.1003997325897217, "learning_rate": 9.995548521119796e-06, "loss": 0.7391, "step": 20 }, { "entropy": 5.302530288696289, "epoch": 0.003954913980620921, "mean_token_accuracy": 0.7212894558906555, "num_tokens": 920031.0, "step": 20, "train/ce_loss": 0.8538235425949097 }, { "epoch": 0.003954913980620921, "step": 20, "train/sim_loss": 0.11618638038635254 }, { "epoch": 0.003954913980620921, "step": 20, "train/total_loss": 0.20156873762607574 }, { "entropy": 5.189628601074219, "epoch": 0.004152659679651968, "mean_token_accuracy": 0.7469471096992493, "num_tokens": 958302.0, "step": 21, "train/ce_loss": 1.0184314250946045 }, { "epoch": 0.004152659679651968, "step": 21, "train/sim_loss": 0.11410486698150635 }, { "epoch": 0.004152659679651968, "step": 21, "train/total_loss": 0.21594801545143127 }, { "entropy": 5.541162967681885, "epoch": 0.004350405378683014, "mean_token_accuracy": 0.7521833777427673, "num_tokens": 1008897.0, "step": 22, "train/ce_loss": 0.2181837558746338 }, { "epoch": 0.004350405378683014, "step": 22, "train/sim_loss": 0.09087485074996948 }, { "epoch": 0.004350405378683014, "step": 22, "train/total_loss": 0.11269322782754898 }, { "entropy": 5.499630928039551, "epoch": 0.004548151077714059, "mean_token_accuracy": 0.704803466796875, "num_tokens": 1039229.0, "step": 23, "train/ce_loss": 0.15805459022521973 }, { "epoch": 0.004548151077714059, "step": 23, "train/sim_loss": 0.06486129760742188 }, { "epoch": 0.004548151077714059, "step": 23, "train/total_loss": 0.08066675812005997 }, { "entropy": 5.3893723487854, "epoch": 0.004745896776745106, "mean_token_accuracy": 0.7516212463378906, "num_tokens": 1067063.0, "step": 24, "train/ce_loss": 1.3459582328796387 }, { "epoch": 0.004745896776745106, "step": 24, "train/sim_loss": 0.0835641622543335 }, { "epoch": 0.004745896776745106, "step": 24, "train/total_loss": 0.2181599885225296 }, { "entropy": 5.630428791046143, "epoch": 0.004943642475776152, "mean_token_accuracy": 0.7260273694992065, "num_tokens": 1118631.0, "step": 25, "train/ce_loss": 0.9441943168640137 }, { "epoch": 0.004943642475776152, "step": 25, "train/sim_loss": 0.06369775533676147 }, { "epoch": 0.004943642475776152, "step": 25, "train/total_loss": 0.15811719000339508 }, { "entropy": 5.429488182067871, "epoch": 0.005141388174807198, "mean_token_accuracy": 0.699050784111023, "num_tokens": 1164403.0, "step": 26, "train/ce_loss": 0.9720959663391113 }, { "epoch": 0.005141388174807198, "step": 26, "train/sim_loss": 0.0705181360244751 }, { "epoch": 0.005141388174807198, "step": 26, "train/total_loss": 0.1677277386188507 }, { "entropy": 5.347123146057129, "epoch": 0.005339133873838244, "mean_token_accuracy": 0.7375654578208923, "num_tokens": 1213403.0, "step": 27, "train/ce_loss": 1.3343250751495361 }, { "epoch": 0.005339133873838244, "step": 27, "train/sim_loss": 0.06407308578491211 }, { "epoch": 0.005339133873838244, "step": 27, "train/total_loss": 0.19750559329986572 }, { "entropy": 5.57736873626709, "epoch": 0.00553687957286929, "mean_token_accuracy": 0.7152956128120422, "num_tokens": 1251376.0, "step": 28, "train/ce_loss": 0.612929105758667 }, { "epoch": 0.00553687957286929, "step": 28, "train/sim_loss": 0.09657418727874756 }, { "epoch": 0.00553687957286929, "step": 28, "train/total_loss": 0.15786710381507874 }, { "entropy": 5.669429779052734, "epoch": 0.005734625271900337, "mean_token_accuracy": 0.6957566142082214, "num_tokens": 1286881.0, "step": 29, "train/ce_loss": 1.626391887664795 }, { "epoch": 0.005734625271900337, "step": 29, "train/sim_loss": 0.05084109306335449 }, { "epoch": 0.005734625271900337, "step": 29, "train/total_loss": 0.21348027884960175 }, { "entropy": 5.377643585205078, "epoch": 0.005932370970931382, "mean_token_accuracy": 0.6972353458404541, "num_tokens": 1330883.0, "step": 30, "train/ce_loss": 1.1438572406768799 }, { "epoch": 0.005932370970931382, "step": 30, "train/sim_loss": 0.05107426643371582 }, { "epoch": 0.005932370970931382, "step": 30, "train/total_loss": 0.1654599905014038 }, { "entropy": 5.8812408447265625, "epoch": 0.006130116669962428, "mean_token_accuracy": 0.7341325879096985, "num_tokens": 1374529.0, "step": 31, "train/ce_loss": 1.1452523469924927 }, { "epoch": 0.006130116669962428, "step": 31, "train/sim_loss": 0.036649227142333984 }, { "epoch": 0.006130116669962428, "step": 31, "train/total_loss": 0.15117445588111877 }, { "entropy": 5.719261646270752, "epoch": 0.006327862368993475, "mean_token_accuracy": 0.7351318001747131, "num_tokens": 1444366.0, "step": 32, "train/ce_loss": 0.8688517808914185 }, { "epoch": 0.006327862368993475, "step": 32, "train/sim_loss": 0.031846046447753906 }, { "epoch": 0.006327862368993475, "step": 32, "train/total_loss": 0.11873122304677963 }, { "entropy": 5.123924255371094, "epoch": 0.0065256080680245205, "mean_token_accuracy": 0.7303506731987, "num_tokens": 1493371.0, "step": 33, "train/ce_loss": 1.2104504108428955 }, { "epoch": 0.0065256080680245205, "step": 33, "train/sim_loss": 0.037885427474975586 }, { "epoch": 0.0065256080680245205, "step": 33, "train/total_loss": 0.1589304804801941 }, { "entropy": 5.795130729675293, "epoch": 0.006723353767055566, "mean_token_accuracy": 0.7095237970352173, "num_tokens": 1536482.0, "step": 34, "train/ce_loss": 1.0465365648269653 }, { "epoch": 0.006723353767055566, "step": 34, "train/sim_loss": 0.0243949294090271 }, { "epoch": 0.006723353767055566, "step": 34, "train/total_loss": 0.12904858589172363 }, { "entropy": 5.8453369140625, "epoch": 0.006921099466086613, "mean_token_accuracy": 0.687543511390686, "num_tokens": 1590189.0, "step": 35, "train/ce_loss": 0.7026560306549072 }, { "epoch": 0.006921099466086613, "step": 35, "train/sim_loss": 0.031241536140441895 }, { "epoch": 0.006921099466086613, "step": 35, "train/total_loss": 0.10150714218616486 }, { "entropy": 5.763403415679932, "epoch": 0.007118845165117659, "mean_token_accuracy": 0.7474815249443054, "num_tokens": 1636524.0, "step": 36, "train/ce_loss": 0.7884125709533691 }, { "epoch": 0.007118845165117659, "step": 36, "train/sim_loss": 0.02793663740158081 }, { "epoch": 0.007118845165117659, "step": 36, "train/total_loss": 0.10677789896726608 }, { "entropy": 5.952225685119629, "epoch": 0.007316590864148704, "mean_token_accuracy": 0.707317054271698, "num_tokens": 1682755.0, "step": 37, "train/ce_loss": 1.22393000125885 }, { "epoch": 0.007316590864148704, "step": 37, "train/sim_loss": 0.0796809196472168 }, { "epoch": 0.007316590864148704, "step": 37, "train/total_loss": 0.20207393169403076 }, { "entropy": 5.348940849304199, "epoch": 0.007514336563179751, "mean_token_accuracy": 0.7175257802009583, "num_tokens": 1708659.0, "step": 38, "train/ce_loss": 0.857283353805542 }, { "epoch": 0.007514336563179751, "step": 38, "train/sim_loss": 0.027422010898590088 }, { "epoch": 0.007514336563179751, "step": 38, "train/total_loss": 0.11315035074949265 }, { "entropy": 5.800405025482178, "epoch": 0.007712082262210797, "mean_token_accuracy": 0.7185545563697815, "num_tokens": 1755228.0, "step": 39, "train/ce_loss": 0.07463020831346512 }, { "epoch": 0.007712082262210797, "step": 39, "train/sim_loss": 0.02935636043548584 }, { "epoch": 0.007712082262210797, "step": 39, "train/total_loss": 0.03681937977671623 }, { "epoch": 0.007909827961241843, "grad_norm": 0.6892441511154175, "learning_rate": 9.985656345830448e-06, "loss": 0.1568, "step": 40 }, { "entropy": 5.29024076461792, "epoch": 0.007909827961241843, "mean_token_accuracy": 0.7888594269752502, "num_tokens": 1784689.0, "step": 40, "train/ce_loss": 1.036678433418274 }, { "epoch": 0.007909827961241843, "step": 40, "train/sim_loss": 0.03383827209472656 }, { "epoch": 0.007909827961241843, "step": 40, "train/total_loss": 0.1375061273574829 }, { "entropy": 5.048645973205566, "epoch": 0.008107573660272888, "mean_token_accuracy": 0.792941153049469, "num_tokens": 1821025.0, "step": 41, "train/ce_loss": 1.2434687614440918 }, { "epoch": 0.008107573660272888, "step": 41, "train/sim_loss": 0.016914010047912598 }, { "epoch": 0.008107573660272888, "step": 41, "train/total_loss": 0.14126089215278625 }, { "entropy": 5.37271785736084, "epoch": 0.008305319359303936, "mean_token_accuracy": 0.7529330849647522, "num_tokens": 1881553.0, "step": 42, "train/ce_loss": 0.9140902757644653 }, { "epoch": 0.008305319359303936, "step": 42, "train/sim_loss": 0.022767961025238037 }, { "epoch": 0.008305319359303936, "step": 42, "train/total_loss": 0.11417698860168457 }, { "entropy": 5.911882400512695, "epoch": 0.008503065058334982, "mean_token_accuracy": 0.7285171151161194, "num_tokens": 1930503.0, "step": 43, "train/ce_loss": 0.9356524348258972 }, { "epoch": 0.008503065058334982, "step": 43, "train/sim_loss": 0.026882469654083252 }, { "epoch": 0.008503065058334982, "step": 43, "train/total_loss": 0.12044771760702133 }, { "entropy": 5.5693583488464355, "epoch": 0.008700810757366027, "mean_token_accuracy": 0.7413793206214905, "num_tokens": 1979141.0, "step": 44, "train/ce_loss": 1.2723579406738281 }, { "epoch": 0.008700810757366027, "step": 44, "train/sim_loss": 0.02841949462890625 }, { "epoch": 0.008700810757366027, "step": 44, "train/total_loss": 0.15565529465675354 }, { "entropy": 5.416491508483887, "epoch": 0.008898556456397073, "mean_token_accuracy": 0.7378814816474915, "num_tokens": 2015945.0, "step": 45, "train/ce_loss": 0.06876914203166962 }, { "epoch": 0.008898556456397073, "step": 45, "train/sim_loss": 0.026132822036743164 }, { "epoch": 0.008898556456397073, "step": 45, "train/total_loss": 0.033009737730026245 }, { "entropy": 5.6775336265563965, "epoch": 0.009096302155428119, "mean_token_accuracy": 0.7012696266174316, "num_tokens": 2063490.0, "step": 46, "train/ce_loss": 1.0801770687103271 }, { "epoch": 0.009096302155428119, "step": 46, "train/sim_loss": 0.01758420467376709 }, { "epoch": 0.009096302155428119, "step": 46, "train/total_loss": 0.12560191750526428 }, { "entropy": 5.940728187561035, "epoch": 0.009294047854459166, "mean_token_accuracy": 0.7204229831695557, "num_tokens": 2119844.0, "step": 47, "train/ce_loss": 0.8718334436416626 }, { "epoch": 0.009294047854459166, "step": 47, "train/sim_loss": 0.01655399799346924 }, { "epoch": 0.009294047854459166, "step": 47, "train/total_loss": 0.10373734682798386 }, { "entropy": 5.615647315979004, "epoch": 0.009491793553490212, "mean_token_accuracy": 0.7609942555427551, "num_tokens": 2154926.0, "step": 48, "train/ce_loss": 0.4964156150817871 }, { "epoch": 0.009491793553490212, "step": 48, "train/sim_loss": 0.022137999534606934 }, { "epoch": 0.009491793553490212, "step": 48, "train/total_loss": 0.07177956402301788 }, { "entropy": 5.767778396606445, "epoch": 0.009689539252521258, "mean_token_accuracy": 0.7297297120094299, "num_tokens": 2217865.0, "step": 49, "train/ce_loss": 1.4015429019927979 }, { "epoch": 0.009689539252521258, "step": 49, "train/sim_loss": 0.019148528575897217 }, { "epoch": 0.009689539252521258, "step": 49, "train/total_loss": 0.15930281579494476 }, { "entropy": 5.838512420654297, "epoch": 0.009887284951552304, "mean_token_accuracy": 0.6993517279624939, "num_tokens": 2279842.0, "step": 50, "train/ce_loss": 0.05263669416308403 }, { "epoch": 0.009887284951552304, "step": 50, "train/sim_loss": 0.022365093231201172 }, { "epoch": 0.009887284951552304, "step": 50, "train/total_loss": 0.027628762647509575 }, { "entropy": 5.840867042541504, "epoch": 0.01008503065058335, "mean_token_accuracy": 0.723802924156189, "num_tokens": 2319437.0, "step": 51, "train/ce_loss": 0.7736684083938599 }, { "epoch": 0.01008503065058335, "step": 51, "train/sim_loss": 0.018631458282470703 }, { "epoch": 0.01008503065058335, "step": 51, "train/total_loss": 0.09599830210208893 }, { "entropy": 5.601288795471191, "epoch": 0.010282776349614395, "mean_token_accuracy": 0.7476038336753845, "num_tokens": 2355925.0, "step": 52, "train/ce_loss": 0.033609502017498016 }, { "epoch": 0.010282776349614395, "step": 52, "train/sim_loss": 0.017129063606262207 }, { "epoch": 0.010282776349614395, "step": 52, "train/total_loss": 0.02049001306295395 }, { "entropy": 5.635674953460693, "epoch": 0.010480522048645443, "mean_token_accuracy": 0.6860670447349548, "num_tokens": 2408875.0, "step": 53, "train/ce_loss": 0.7140606045722961 }, { "epoch": 0.010480522048645443, "step": 53, "train/sim_loss": 0.01459360122680664 }, { "epoch": 0.010480522048645443, "step": 53, "train/total_loss": 0.08599966019392014 }, { "entropy": 5.578782558441162, "epoch": 0.010678267747676488, "mean_token_accuracy": 0.721043050289154, "num_tokens": 2454770.0, "step": 54, "train/ce_loss": 0.9930257201194763 }, { "epoch": 0.010678267747676488, "step": 54, "train/sim_loss": 0.01340937614440918 }, { "epoch": 0.010678267747676488, "step": 54, "train/total_loss": 0.11271195113658905 }, { "entropy": 5.980039596557617, "epoch": 0.010876013446707534, "mean_token_accuracy": 0.7248935103416443, "num_tokens": 2505120.0, "step": 55, "train/ce_loss": 1.336120367050171 }, { "epoch": 0.010876013446707534, "step": 55, "train/sim_loss": 0.015119194984436035 }, { "epoch": 0.010876013446707534, "step": 55, "train/total_loss": 0.14873123168945312 }, { "entropy": 5.498307228088379, "epoch": 0.01107375914573858, "mean_token_accuracy": 0.7366071343421936, "num_tokens": 2531944.0, "step": 56, "train/ce_loss": 0.030888434499502182 }, { "epoch": 0.01107375914573858, "step": 56, "train/sim_loss": 0.011683642864227295 }, { "epoch": 0.01107375914573858, "step": 56, "train/total_loss": 0.014772485941648483 }, { "entropy": 5.753145694732666, "epoch": 0.011271504844769626, "mean_token_accuracy": 0.7308743000030518, "num_tokens": 2582537.0, "step": 57, "train/ce_loss": 1.0170098543167114 }, { "epoch": 0.011271504844769626, "step": 57, "train/sim_loss": 0.014897525310516357 }, { "epoch": 0.011271504844769626, "step": 57, "train/total_loss": 0.11659850925207138 }, { "entropy": 5.675912380218506, "epoch": 0.011469250543800673, "mean_token_accuracy": 0.7284226417541504, "num_tokens": 2620548.0, "step": 58, "train/ce_loss": 1.6090497970581055 }, { "epoch": 0.011469250543800673, "step": 58, "train/sim_loss": 0.01339501142501831 }, { "epoch": 0.011469250543800673, "step": 58, "train/total_loss": 0.17430000007152557 }, { "entropy": 5.741678237915039, "epoch": 0.011666996242831719, "mean_token_accuracy": 0.706620991230011, "num_tokens": 2672242.0, "step": 59, "train/ce_loss": 0.6681963205337524 }, { "epoch": 0.011666996242831719, "step": 59, "train/sim_loss": 0.012975931167602539 }, { "epoch": 0.011666996242831719, "step": 59, "train/total_loss": 0.07979556173086166 }, { "epoch": 0.011864741941862765, "grad_norm": 0.5891191363334656, "learning_rate": 9.975764170541102e-06, "loss": 0.1095, "step": 60 }, { "entropy": 5.678043842315674, "epoch": 0.011864741941862765, "mean_token_accuracy": 0.716366171836853, "num_tokens": 2733718.0, "step": 60, "train/ce_loss": 0.5850629806518555 }, { "epoch": 0.011864741941862765, "step": 60, "train/sim_loss": 0.00974273681640625 }, { "epoch": 0.011864741941862765, "step": 60, "train/total_loss": 0.06824903190135956 }, { "entropy": 5.396895408630371, "epoch": 0.01206248764089381, "mean_token_accuracy": 0.7372159361839294, "num_tokens": 2779582.0, "step": 61, "train/ce_loss": 0.9019386768341064 }, { "epoch": 0.01206248764089381, "step": 61, "train/sim_loss": 0.011626720428466797 }, { "epoch": 0.01206248764089381, "step": 61, "train/total_loss": 0.10182058811187744 }, { "entropy": 5.455183506011963, "epoch": 0.012260233339924856, "mean_token_accuracy": 0.7514084577560425, "num_tokens": 2817753.0, "step": 62, "train/ce_loss": 0.6628961563110352 }, { "epoch": 0.012260233339924856, "step": 62, "train/sim_loss": 0.014423489570617676 }, { "epoch": 0.012260233339924856, "step": 62, "train/total_loss": 0.08071310818195343 }, { "entropy": 5.595412254333496, "epoch": 0.012457979038955902, "mean_token_accuracy": 0.7558290362358093, "num_tokens": 2859766.0, "step": 63, "train/ce_loss": 1.1596070528030396 }, { "epoch": 0.012457979038955902, "step": 63, "train/sim_loss": 0.016056954860687256 }, { "epoch": 0.012457979038955902, "step": 63, "train/total_loss": 0.13201767206192017 }, { "entropy": 5.657646179199219, "epoch": 0.01265572473798695, "mean_token_accuracy": 0.6865285038948059, "num_tokens": 2924566.0, "step": 64, "train/ce_loss": 0.7790413498878479 }, { "epoch": 0.01265572473798695, "step": 64, "train/sim_loss": 0.02332603931427002 }, { "epoch": 0.01265572473798695, "step": 64, "train/total_loss": 0.10123017430305481 }, { "entropy": 5.2199859619140625, "epoch": 0.012853470437017995, "mean_token_accuracy": 0.738111674785614, "num_tokens": 2983747.0, "step": 65, "train/ce_loss": 0.9148899912834167 }, { "epoch": 0.012853470437017995, "step": 65, "train/sim_loss": 0.008977055549621582 }, { "epoch": 0.012853470437017995, "step": 65, "train/total_loss": 0.1004660576581955 }, { "entropy": 5.3019185066223145, "epoch": 0.013051216136049041, "mean_token_accuracy": 0.7261987328529358, "num_tokens": 3031616.0, "step": 66, "train/ce_loss": 1.7916879653930664 }, { "epoch": 0.013051216136049041, "step": 66, "train/sim_loss": 0.015873193740844727 }, { "epoch": 0.013051216136049041, "step": 66, "train/total_loss": 0.19504199922084808 }, { "entropy": 5.24873161315918, "epoch": 0.013248961835080087, "mean_token_accuracy": 0.7167919874191284, "num_tokens": 3083071.0, "step": 67, "train/ce_loss": 0.7067745923995972 }, { "epoch": 0.013248961835080087, "step": 67, "train/sim_loss": 0.008074700832366943 }, { "epoch": 0.013248961835080087, "step": 67, "train/total_loss": 0.07875216007232666 }, { "entropy": 5.577188491821289, "epoch": 0.013446707534111133, "mean_token_accuracy": 0.7472527623176575, "num_tokens": 3120509.0, "step": 68, "train/ce_loss": 1.185904622077942 }, { "epoch": 0.013446707534111133, "step": 68, "train/sim_loss": 0.012984931468963623 }, { "epoch": 0.013446707534111133, "step": 68, "train/total_loss": 0.13157540559768677 }, { "entropy": 5.400413513183594, "epoch": 0.01364445323314218, "mean_token_accuracy": 0.7184873819351196, "num_tokens": 3164265.0, "step": 69, "train/ce_loss": 0.9857748746871948 }, { "epoch": 0.01364445323314218, "step": 69, "train/sim_loss": 0.012157440185546875 }, { "epoch": 0.01364445323314218, "step": 69, "train/total_loss": 0.11073493212461472 }, { "entropy": 5.56998348236084, "epoch": 0.013842198932173226, "mean_token_accuracy": 0.7203513979911804, "num_tokens": 3210151.0, "step": 70, "train/ce_loss": 1.1573768854141235 }, { "epoch": 0.013842198932173226, "step": 70, "train/sim_loss": 0.009069263935089111 }, { "epoch": 0.013842198932173226, "step": 70, "train/total_loss": 0.1248069554567337 }, { "entropy": 5.540490627288818, "epoch": 0.014039944631204272, "mean_token_accuracy": 0.7358185052871704, "num_tokens": 3273493.0, "step": 71, "train/ce_loss": 1.3981571197509766 }, { "epoch": 0.014039944631204272, "step": 71, "train/sim_loss": 0.01232600212097168 }, { "epoch": 0.014039944631204272, "step": 71, "train/total_loss": 0.1521417200565338 }, { "entropy": 5.438465118408203, "epoch": 0.014237690330235317, "mean_token_accuracy": 0.7278953790664673, "num_tokens": 3335518.0, "step": 72, "train/ce_loss": 0.7087013125419617 }, { "epoch": 0.014237690330235317, "step": 72, "train/sim_loss": 0.012213468551635742 }, { "epoch": 0.014237690330235317, "step": 72, "train/total_loss": 0.08308359980583191 }, { "entropy": 5.416237831115723, "epoch": 0.014435436029266363, "mean_token_accuracy": 0.7145318984985352, "num_tokens": 3379354.0, "step": 73, "train/ce_loss": 0.7383574843406677 }, { "epoch": 0.014435436029266363, "step": 73, "train/sim_loss": 0.009310543537139893 }, { "epoch": 0.014435436029266363, "step": 73, "train/total_loss": 0.08314629644155502 }, { "entropy": 5.388800621032715, "epoch": 0.014633181728297409, "mean_token_accuracy": 0.7173202633857727, "num_tokens": 3414118.0, "step": 74, "train/ce_loss": 1.306372046470642 }, { "epoch": 0.014633181728297409, "step": 74, "train/sim_loss": 0.009606719017028809 }, { "epoch": 0.014633181728297409, "step": 74, "train/total_loss": 0.14024393260478973 }, { "entropy": 5.539669990539551, "epoch": 0.014830927427328456, "mean_token_accuracy": 0.7327001094818115, "num_tokens": 3474437.0, "step": 75, "train/ce_loss": 0.6185916662216187 }, { "epoch": 0.014830927427328456, "step": 75, "train/sim_loss": 0.009356856346130371 }, { "epoch": 0.014830927427328456, "step": 75, "train/total_loss": 0.07121602445840836 }, { "entropy": 5.505520820617676, "epoch": 0.015028673126359502, "mean_token_accuracy": 0.7172808051109314, "num_tokens": 3520150.0, "step": 76, "train/ce_loss": 0.9408271312713623 }, { "epoch": 0.015028673126359502, "step": 76, "train/sim_loss": 0.00921785831451416 }, { "epoch": 0.015028673126359502, "step": 76, "train/total_loss": 0.10330057144165039 }, { "entropy": 5.351016044616699, "epoch": 0.015226418825390548, "mean_token_accuracy": 0.7317073345184326, "num_tokens": 3554075.0, "step": 77, "train/ce_loss": 0.8012406826019287 }, { "epoch": 0.015226418825390548, "step": 77, "train/sim_loss": 0.018411636352539062 }, { "epoch": 0.015226418825390548, "step": 77, "train/total_loss": 0.09853570908308029 }, { "entropy": 5.4517717361450195, "epoch": 0.015424164524421594, "mean_token_accuracy": 0.7124137878417969, "num_tokens": 3604120.0, "step": 78, "train/ce_loss": 1.2851883172988892 }, { "epoch": 0.015424164524421594, "step": 78, "train/sim_loss": 0.011327207088470459 }, { "epoch": 0.015424164524421594, "step": 78, "train/total_loss": 0.1398460417985916 }, { "entropy": 5.316267967224121, "epoch": 0.01562191022345264, "mean_token_accuracy": 0.734446108341217, "num_tokens": 3644125.0, "step": 79, "train/ce_loss": 1.6733952760696411 }, { "epoch": 0.01562191022345264, "step": 79, "train/sim_loss": 0.010225892066955566 }, { "epoch": 0.01562191022345264, "step": 79, "train/total_loss": 0.17756542563438416 }, { "epoch": 0.015819655922483685, "grad_norm": 0.6507789492607117, "learning_rate": 9.965871995251758e-06, "loss": 0.1029, "step": 80 }, { "entropy": 5.068170547485352, "epoch": 0.015819655922483685, "mean_token_accuracy": 0.7239344120025635, "num_tokens": 3690792.0, "step": 80, "train/ce_loss": 0.4748115837574005 }, { "epoch": 0.015819655922483685, "step": 80, "train/sim_loss": 0.010663747787475586 }, { "epoch": 0.015819655922483685, "step": 80, "train/total_loss": 0.058144908398389816 }, { "entropy": 5.177664756774902, "epoch": 0.016017401621514733, "mean_token_accuracy": 0.7609329223632812, "num_tokens": 3743243.0, "step": 81, "train/ce_loss": 0.002738288603723049 }, { "epoch": 0.016017401621514733, "step": 81, "train/sim_loss": 0.009984314441680908 }, { "epoch": 0.016017401621514733, "step": 81, "train/total_loss": 0.0102581437677145 }, { "entropy": 5.632203102111816, "epoch": 0.016215147320545777, "mean_token_accuracy": 0.7121473550796509, "num_tokens": 3785666.0, "step": 82, "train/ce_loss": 0.9313804507255554 }, { "epoch": 0.016215147320545777, "step": 82, "train/sim_loss": 0.012984156608581543 }, { "epoch": 0.016215147320545777, "step": 82, "train/total_loss": 0.1061222031712532 }, { "entropy": 5.345677852630615, "epoch": 0.016412893019576824, "mean_token_accuracy": 0.6928062438964844, "num_tokens": 3836862.0, "step": 83, "train/ce_loss": 1.7581712007522583 }, { "epoch": 0.016412893019576824, "step": 83, "train/sim_loss": 0.017396092414855957 }, { "epoch": 0.016412893019576824, "step": 83, "train/total_loss": 0.19321320950984955 }, { "entropy": 5.141568183898926, "epoch": 0.01661063871860787, "mean_token_accuracy": 0.7244299650192261, "num_tokens": 3872775.0, "step": 84, "train/ce_loss": 1.2081438302993774 }, { "epoch": 0.01661063871860787, "step": 84, "train/sim_loss": 0.007030963897705078 }, { "epoch": 0.01661063871860787, "step": 84, "train/total_loss": 0.12784534692764282 }, { "entropy": 5.377555847167969, "epoch": 0.016808384417638916, "mean_token_accuracy": 0.7298456430435181, "num_tokens": 3921378.0, "step": 85, "train/ce_loss": 1.488629698753357 }, { "epoch": 0.016808384417638916, "step": 85, "train/sim_loss": 0.008724331855773926 }, { "epoch": 0.016808384417638916, "step": 85, "train/total_loss": 0.15758730471134186 }, { "entropy": 5.391998767852783, "epoch": 0.017006130116669963, "mean_token_accuracy": 0.7074985504150391, "num_tokens": 3963132.0, "step": 86, "train/ce_loss": 0.8642844557762146 }, { "epoch": 0.017006130116669963, "step": 86, "train/sim_loss": 0.007748663425445557 }, { "epoch": 0.017006130116669963, "step": 86, "train/total_loss": 0.09417711198329926 }, { "entropy": 5.126368522644043, "epoch": 0.017203875815701007, "mean_token_accuracy": 0.7468274235725403, "num_tokens": 4007118.0, "step": 87, "train/ce_loss": 0.9252598285675049 }, { "epoch": 0.017203875815701007, "step": 87, "train/sim_loss": 0.015508532524108887 }, { "epoch": 0.017203875815701007, "step": 87, "train/total_loss": 0.10803451389074326 }, { "entropy": 5.572304725646973, "epoch": 0.017401621514732055, "mean_token_accuracy": 0.7294626235961914, "num_tokens": 4053058.0, "step": 88, "train/ce_loss": 0.8977378010749817 }, { "epoch": 0.017401621514732055, "step": 88, "train/sim_loss": 0.009112119674682617 }, { "epoch": 0.017401621514732055, "step": 88, "train/total_loss": 0.0988859012722969 }, { "entropy": 5.34841251373291, "epoch": 0.017599367213763102, "mean_token_accuracy": 0.7450096607208252, "num_tokens": 4089489.0, "step": 89, "train/ce_loss": 1.0037304162979126 }, { "epoch": 0.017599367213763102, "step": 89, "train/sim_loss": 0.007692694664001465 }, { "epoch": 0.017599367213763102, "step": 89, "train/total_loss": 0.10806573927402496 }, { "entropy": 5.680527687072754, "epoch": 0.017797112912794146, "mean_token_accuracy": 0.7086279988288879, "num_tokens": 4140713.0, "step": 90, "train/ce_loss": 1.4682397842407227 }, { "epoch": 0.017797112912794146, "step": 90, "train/sim_loss": 0.009348750114440918 }, { "epoch": 0.017797112912794146, "step": 90, "train/total_loss": 0.1561727374792099 }, { "entropy": 5.265936374664307, "epoch": 0.017994858611825194, "mean_token_accuracy": 0.7166494131088257, "num_tokens": 4184299.0, "step": 91, "train/ce_loss": 1.0135834217071533 }, { "epoch": 0.017994858611825194, "step": 91, "train/sim_loss": 0.005894780158996582 }, { "epoch": 0.017994858611825194, "step": 91, "train/total_loss": 0.10725312680006027 }, { "entropy": 5.374821662902832, "epoch": 0.018192604310856238, "mean_token_accuracy": 0.7211155295372009, "num_tokens": 4227245.0, "step": 92, "train/ce_loss": 0.7675289511680603 }, { "epoch": 0.018192604310856238, "step": 92, "train/sim_loss": 0.007227063179016113 }, { "epoch": 0.018192604310856238, "step": 92, "train/total_loss": 0.08397995680570602 }, { "entropy": 5.416843414306641, "epoch": 0.018390350009887285, "mean_token_accuracy": 0.7084124088287354, "num_tokens": 4276561.0, "step": 93, "train/ce_loss": 1.8257437944412231 }, { "epoch": 0.018390350009887285, "step": 93, "train/sim_loss": 0.006943881511688232 }, { "epoch": 0.018390350009887285, "step": 93, "train/total_loss": 0.1895182579755783 }, { "entropy": 5.174567699432373, "epoch": 0.018588095708918333, "mean_token_accuracy": 0.7669903039932251, "num_tokens": 4305239.0, "step": 94, "train/ce_loss": 0.7895597815513611 }, { "epoch": 0.018588095708918333, "step": 94, "train/sim_loss": 0.0063204169273376465 }, { "epoch": 0.018588095708918333, "step": 94, "train/total_loss": 0.08527639508247375 }, { "entropy": 5.221612930297852, "epoch": 0.018785841407949377, "mean_token_accuracy": 0.7777048945426941, "num_tokens": 4349861.0, "step": 95, "train/ce_loss": 0.7057914733886719 }, { "epoch": 0.018785841407949377, "step": 95, "train/sim_loss": 0.004695534706115723 }, { "epoch": 0.018785841407949377, "step": 95, "train/total_loss": 0.07527468353509903 }, { "entropy": 5.189736366271973, "epoch": 0.018983587106980424, "mean_token_accuracy": 0.7209442853927612, "num_tokens": 4382655.0, "step": 96, "train/ce_loss": 1.1945136785507202 }, { "epoch": 0.018983587106980424, "step": 96, "train/sim_loss": 0.005962789058685303 }, { "epoch": 0.018983587106980424, "step": 96, "train/total_loss": 0.1254141628742218 }, { "entropy": 5.224713325500488, "epoch": 0.019181332806011468, "mean_token_accuracy": 0.7371295094490051, "num_tokens": 4439952.0, "step": 97, "train/ce_loss": 1.3761649131774902 }, { "epoch": 0.019181332806011468, "step": 97, "train/sim_loss": 0.008097052574157715 }, { "epoch": 0.019181332806011468, "step": 97, "train/total_loss": 0.14571355283260345 }, { "entropy": 5.1256232261657715, "epoch": 0.019379078505042516, "mean_token_accuracy": 0.7552631497383118, "num_tokens": 4489161.0, "step": 98, "train/ce_loss": 1.1334357261657715 }, { "epoch": 0.019379078505042516, "step": 98, "train/sim_loss": 0.004998207092285156 }, { "epoch": 0.019379078505042516, "step": 98, "train/total_loss": 0.11834178119897842 }, { "entropy": 5.543116569519043, "epoch": 0.01957682420407356, "mean_token_accuracy": 0.7534090876579285, "num_tokens": 4542798.0, "step": 99, "train/ce_loss": 1.117351770401001 }, { "epoch": 0.01957682420407356, "step": 99, "train/sim_loss": 0.014609992504119873 }, { "epoch": 0.01957682420407356, "step": 99, "train/total_loss": 0.1263451725244522 }, { "epoch": 0.019774569903104607, "grad_norm": 0.5300605893135071, "learning_rate": 9.95597981996241e-06, "loss": 0.0966, "step": 100 }, { "entropy": 5.512630462646484, "epoch": 0.019774569903104607, "mean_token_accuracy": 0.7240192890167236, "num_tokens": 4590496.0, "step": 100, "train/ce_loss": 0.639026939868927 }, { "epoch": 0.019774569903104607, "step": 100, "train/sim_loss": 0.014202475547790527 }, { "epoch": 0.019774569903104607, "step": 100, "train/total_loss": 0.07810517400503159 }, { "entropy": 4.808377265930176, "epoch": 0.019972315602135655, "mean_token_accuracy": 0.7302904725074768, "num_tokens": 4623863.0, "step": 101, "train/ce_loss": 0.8830198645591736 }, { "epoch": 0.019972315602135655, "step": 101, "train/sim_loss": 0.006345272064208984 }, { "epoch": 0.019972315602135655, "step": 101, "train/total_loss": 0.09464725852012634 }, { "entropy": 5.364417552947998, "epoch": 0.0201700613011667, "mean_token_accuracy": 0.759205162525177, "num_tokens": 4663567.0, "step": 102, "train/ce_loss": 0.45664215087890625 }, { "epoch": 0.0201700613011667, "step": 102, "train/sim_loss": 0.004687130451202393 }, { "epoch": 0.0201700613011667, "step": 102, "train/total_loss": 0.0503513477742672 }, { "entropy": 5.702671051025391, "epoch": 0.020367807000197746, "mean_token_accuracy": 0.7070844769477844, "num_tokens": 4712955.0, "step": 103, "train/ce_loss": 1.2859517335891724 }, { "epoch": 0.020367807000197746, "step": 103, "train/sim_loss": 0.007664144039154053 }, { "epoch": 0.020367807000197746, "step": 103, "train/total_loss": 0.1362593173980713 }, { "entropy": 5.550717353820801, "epoch": 0.02056555269922879, "mean_token_accuracy": 0.7071428298950195, "num_tokens": 4744408.0, "step": 104, "train/ce_loss": 1.0909379720687866 }, { "epoch": 0.02056555269922879, "step": 104, "train/sim_loss": 0.006180524826049805 }, { "epoch": 0.02056555269922879, "step": 104, "train/total_loss": 0.1152743250131607 }, { "entropy": 5.506555080413818, "epoch": 0.020763298398259838, "mean_token_accuracy": 0.724723219871521, "num_tokens": 4797407.0, "step": 105, "train/ce_loss": 0.9258939623832703 }, { "epoch": 0.020763298398259838, "step": 105, "train/sim_loss": 0.0070953369140625 }, { "epoch": 0.020763298398259838, "step": 105, "train/total_loss": 0.09968473762273788 }, { "entropy": 5.350674629211426, "epoch": 0.020961044097290885, "mean_token_accuracy": 0.7451456189155579, "num_tokens": 4841627.0, "step": 106, "train/ce_loss": 1.0658727884292603 }, { "epoch": 0.020961044097290885, "step": 106, "train/sim_loss": 0.006538271903991699 }, { "epoch": 0.020961044097290885, "step": 106, "train/total_loss": 0.11312555521726608 }, { "entropy": 5.66782283782959, "epoch": 0.02115878979632193, "mean_token_accuracy": 0.7390317916870117, "num_tokens": 4883121.0, "step": 107, "train/ce_loss": 1.08169686794281 }, { "epoch": 0.02115878979632193, "step": 107, "train/sim_loss": 0.007291853427886963 }, { "epoch": 0.02115878979632193, "step": 107, "train/total_loss": 0.11546154320240021 }, { "entropy": 5.281320571899414, "epoch": 0.021356535495352977, "mean_token_accuracy": 0.7618476152420044, "num_tokens": 4921997.0, "step": 108, "train/ce_loss": 0.9735116362571716 }, { "epoch": 0.021356535495352977, "step": 108, "train/sim_loss": 0.007384598255157471 }, { "epoch": 0.021356535495352977, "step": 108, "train/total_loss": 0.10473576188087463 }, { "entropy": 5.693179607391357, "epoch": 0.02155428119438402, "mean_token_accuracy": 0.7148175835609436, "num_tokens": 4953838.0, "step": 109, "train/ce_loss": 1.0692379474639893 }, { "epoch": 0.02155428119438402, "step": 109, "train/sim_loss": 0.0036603212356567383 }, { "epoch": 0.02155428119438402, "step": 109, "train/total_loss": 0.11058411747217178 }, { "entropy": 5.420840740203857, "epoch": 0.02175202689341507, "mean_token_accuracy": 0.7463768124580383, "num_tokens": 4992953.0, "step": 110, "train/ce_loss": 1.043798804283142 }, { "epoch": 0.02175202689341507, "step": 110, "train/sim_loss": 0.007619380950927734 }, { "epoch": 0.02175202689341507, "step": 110, "train/total_loss": 0.1119992658495903 }, { "entropy": 5.623539924621582, "epoch": 0.021949772592446116, "mean_token_accuracy": 0.7385804653167725, "num_tokens": 5053408.0, "step": 111, "train/ce_loss": 1.0319174528121948 }, { "epoch": 0.021949772592446116, "step": 111, "train/sim_loss": 0.007094025611877441 }, { "epoch": 0.021949772592446116, "step": 111, "train/total_loss": 0.11028577387332916 }, { "entropy": 4.89241886138916, "epoch": 0.02214751829147716, "mean_token_accuracy": 0.7945659160614014, "num_tokens": 5081843.0, "step": 112, "train/ce_loss": 0.000525900162756443 }, { "epoch": 0.02214751829147716, "step": 112, "train/sim_loss": 0.007423520088195801 }, { "epoch": 0.02214751829147716, "step": 112, "train/total_loss": 0.007476110011339188 }, { "entropy": 5.382798194885254, "epoch": 0.022345263990508207, "mean_token_accuracy": 0.7507163286209106, "num_tokens": 5119732.0, "step": 113, "train/ce_loss": 0.5792762041091919 }, { "epoch": 0.022345263990508207, "step": 113, "train/sim_loss": 0.006399989128112793 }, { "epoch": 0.022345263990508207, "step": 113, "train/total_loss": 0.06432761251926422 }, { "entropy": 5.605935573577881, "epoch": 0.02254300968953925, "mean_token_accuracy": 0.727770984172821, "num_tokens": 5158328.0, "step": 114, "train/ce_loss": 0.7038564085960388 }, { "epoch": 0.02254300968953925, "step": 114, "train/sim_loss": 0.0072547197341918945 }, { "epoch": 0.02254300968953925, "step": 114, "train/total_loss": 0.0776403620839119 }, { "entropy": 5.722214221954346, "epoch": 0.0227407553885703, "mean_token_accuracy": 0.7047738432884216, "num_tokens": 5213603.0, "step": 115, "train/ce_loss": 1.6293336153030396 }, { "epoch": 0.0227407553885703, "step": 115, "train/sim_loss": 0.009015679359436035 }, { "epoch": 0.0227407553885703, "step": 115, "train/total_loss": 0.17194904386997223 }, { "entropy": 5.876331329345703, "epoch": 0.022938501087601346, "mean_token_accuracy": 0.7291325926780701, "num_tokens": 5261337.0, "step": 116, "train/ce_loss": 1.9582345485687256 }, { "epoch": 0.022938501087601346, "step": 116, "train/sim_loss": 0.005879878997802734 }, { "epoch": 0.022938501087601346, "step": 116, "train/total_loss": 0.20170333981513977 }, { "entropy": 5.420672416687012, "epoch": 0.02313624678663239, "mean_token_accuracy": 0.7590987682342529, "num_tokens": 5305708.0, "step": 117, "train/ce_loss": 0.792396605014801 }, { "epoch": 0.02313624678663239, "step": 117, "train/sim_loss": 0.0044023990631103516 }, { "epoch": 0.02313624678663239, "step": 117, "train/total_loss": 0.08364205807447433 }, { "entropy": 5.694249153137207, "epoch": 0.023333992485663438, "mean_token_accuracy": 0.7737388610839844, "num_tokens": 5351882.0, "step": 118, "train/ce_loss": 1.1884180307388306 }, { "epoch": 0.023333992485663438, "step": 118, "train/sim_loss": 0.0035119056701660156 }, { "epoch": 0.023333992485663438, "step": 118, "train/total_loss": 0.12235371023416519 }, { "entropy": 5.683180809020996, "epoch": 0.023531738184694482, "mean_token_accuracy": 0.7089297771453857, "num_tokens": 5389925.0, "step": 119, "train/ce_loss": 1.1468416452407837 }, { "epoch": 0.023531738184694482, "step": 119, "train/sim_loss": 0.0064803361892700195 }, { "epoch": 0.023531738184694482, "step": 119, "train/total_loss": 0.12116450071334839 }, { "epoch": 0.02372948388372553, "grad_norm": 0.5984599590301514, "learning_rate": 9.946087644673065e-06, "loss": 0.0917, "step": 120 }, { "entropy": 5.748069763183594, "epoch": 0.02372948388372553, "mean_token_accuracy": 0.7271353006362915, "num_tokens": 5442802.0, "step": 120, "train/ce_loss": 0.32922646403312683 }, { "epoch": 0.02372948388372553, "step": 120, "train/sim_loss": 0.005884289741516113 }, { "epoch": 0.02372948388372553, "step": 120, "train/total_loss": 0.038806937634944916 }, { "entropy": 5.55022668838501, "epoch": 0.023927229582756573, "mean_token_accuracy": 0.6989640593528748, "num_tokens": 5498700.0, "step": 121, "train/ce_loss": 0.9323674440383911 }, { "epoch": 0.023927229582756573, "step": 121, "train/sim_loss": 0.016794323921203613 }, { "epoch": 0.023927229582756573, "step": 121, "train/total_loss": 0.11003106832504272 }, { "entropy": 5.745837688446045, "epoch": 0.02412497528178762, "mean_token_accuracy": 0.7287381887435913, "num_tokens": 5542415.0, "step": 122, "train/ce_loss": 1.4613248109817505 }, { "epoch": 0.02412497528178762, "step": 122, "train/sim_loss": 0.005347490310668945 }, { "epoch": 0.02412497528178762, "step": 122, "train/total_loss": 0.15147997438907623 }, { "entropy": 5.599244117736816, "epoch": 0.02432272098081867, "mean_token_accuracy": 0.727213978767395, "num_tokens": 5581644.0, "step": 123, "train/ce_loss": 0.813494086265564 }, { "epoch": 0.02432272098081867, "step": 123, "train/sim_loss": 0.004829049110412598 }, { "epoch": 0.02432272098081867, "step": 123, "train/total_loss": 0.08617845922708511 }, { "entropy": 5.808537483215332, "epoch": 0.024520466679849712, "mean_token_accuracy": 0.721981406211853, "num_tokens": 5641315.0, "step": 124, "train/ce_loss": 0.8339769840240479 }, { "epoch": 0.024520466679849712, "step": 124, "train/sim_loss": 0.011134862899780273 }, { "epoch": 0.024520466679849712, "step": 124, "train/total_loss": 0.0945325642824173 }, { "entropy": 5.680286407470703, "epoch": 0.02471821237888076, "mean_token_accuracy": 0.7337042689323425, "num_tokens": 5688190.0, "step": 125, "train/ce_loss": 1.5351011753082275 }, { "epoch": 0.02471821237888076, "step": 125, "train/sim_loss": 0.007368206977844238 }, { "epoch": 0.02471821237888076, "step": 125, "train/total_loss": 0.16087833046913147 }, { "entropy": 5.348459243774414, "epoch": 0.024915958077911804, "mean_token_accuracy": 0.7431243062019348, "num_tokens": 5736378.0, "step": 126, "train/ce_loss": 1.119261384010315 }, { "epoch": 0.024915958077911804, "step": 126, "train/sim_loss": 0.004918396472930908 }, { "epoch": 0.024915958077911804, "step": 126, "train/total_loss": 0.1168445348739624 }, { "entropy": 5.411444187164307, "epoch": 0.02511370377694285, "mean_token_accuracy": 0.722296416759491, "num_tokens": 5784644.0, "step": 127, "train/ce_loss": 1.6119855642318726 }, { "epoch": 0.02511370377694285, "step": 127, "train/sim_loss": 0.004353582859039307 }, { "epoch": 0.02511370377694285, "step": 127, "train/total_loss": 0.16555213928222656 }, { "entropy": 5.37313175201416, "epoch": 0.0253114494759739, "mean_token_accuracy": 0.7407675385475159, "num_tokens": 5821620.0, "step": 128, "train/ce_loss": 0.6660165786743164 }, { "epoch": 0.0253114494759739, "step": 128, "train/sim_loss": 0.007208049297332764 }, { "epoch": 0.0253114494759739, "step": 128, "train/total_loss": 0.07380970567464828 }, { "entropy": 5.482139587402344, "epoch": 0.025509195175004943, "mean_token_accuracy": 0.7065775990486145, "num_tokens": 5859340.0, "step": 129, "train/ce_loss": 0.7602159380912781 }, { "epoch": 0.025509195175004943, "step": 129, "train/sim_loss": 0.008024394512176514 }, { "epoch": 0.025509195175004943, "step": 129, "train/total_loss": 0.08404599130153656 }, { "entropy": 5.538297653198242, "epoch": 0.02570694087403599, "mean_token_accuracy": 0.7385229468345642, "num_tokens": 5897134.0, "step": 130, "train/ce_loss": 0.9417870044708252 }, { "epoch": 0.02570694087403599, "step": 130, "train/sim_loss": 0.004256844520568848 }, { "epoch": 0.02570694087403599, "step": 130, "train/total_loss": 0.09843554347753525 }, { "entropy": 5.579758167266846, "epoch": 0.025904686573067034, "mean_token_accuracy": 0.6976889371871948, "num_tokens": 5935086.0, "step": 131, "train/ce_loss": 1.9783289432525635 }, { "epoch": 0.025904686573067034, "step": 131, "train/sim_loss": 0.005571246147155762 }, { "epoch": 0.025904686573067034, "step": 131, "train/total_loss": 0.20340414345264435 }, { "entropy": 5.988212585449219, "epoch": 0.026102432272098082, "mean_token_accuracy": 0.7078891396522522, "num_tokens": 5971683.0, "step": 132, "train/ce_loss": 2.2108871936798096 }, { "epoch": 0.026102432272098082, "step": 132, "train/sim_loss": 0.0072019100189208984 }, { "epoch": 0.026102432272098082, "step": 132, "train/total_loss": 0.2282906323671341 }, { "entropy": 5.6699066162109375, "epoch": 0.02630017797112913, "mean_token_accuracy": 0.722161591053009, "num_tokens": 6022017.0, "step": 133, "train/ce_loss": 0.8808659911155701 }, { "epoch": 0.02630017797112913, "step": 133, "train/sim_loss": 0.007794320583343506 }, { "epoch": 0.02630017797112913, "step": 133, "train/total_loss": 0.0958809182047844 }, { "entropy": 5.320906162261963, "epoch": 0.026497923670160173, "mean_token_accuracy": 0.7456485033035278, "num_tokens": 6051847.0, "step": 134, "train/ce_loss": 0.4920349419116974 }, { "epoch": 0.026497923670160173, "step": 134, "train/sim_loss": 0.006436824798583984 }, { "epoch": 0.026497923670160173, "step": 134, "train/total_loss": 0.0556403212249279 }, { "entropy": 5.507585525512695, "epoch": 0.02669566936919122, "mean_token_accuracy": 0.7073030471801758, "num_tokens": 6097862.0, "step": 135, "train/ce_loss": 1.3671016693115234 }, { "epoch": 0.02669566936919122, "step": 135, "train/sim_loss": 0.004433751106262207 }, { "epoch": 0.02669566936919122, "step": 135, "train/total_loss": 0.14114391803741455 }, { "entropy": 5.793395042419434, "epoch": 0.026893415068222265, "mean_token_accuracy": 0.7203579545021057, "num_tokens": 6149933.0, "step": 136, "train/ce_loss": 1.1749088764190674 }, { "epoch": 0.026893415068222265, "step": 136, "train/sim_loss": 0.004670143127441406 }, { "epoch": 0.026893415068222265, "step": 136, "train/total_loss": 0.12216103076934814 }, { "entropy": 5.465941429138184, "epoch": 0.027091160767253313, "mean_token_accuracy": 0.7089804410934448, "num_tokens": 6193233.0, "step": 137, "train/ce_loss": 0.9060531854629517 }, { "epoch": 0.027091160767253313, "step": 137, "train/sim_loss": 0.006893575191497803 }, { "epoch": 0.027091160767253313, "step": 137, "train/total_loss": 0.09749889373779297 }, { "entropy": 5.414890766143799, "epoch": 0.02728890646628436, "mean_token_accuracy": 0.7749003767967224, "num_tokens": 6239737.0, "step": 138, "train/ce_loss": 0.00047445620293729007 }, { "epoch": 0.02728890646628436, "step": 138, "train/sim_loss": 0.005631506443023682 }, { "epoch": 0.02728890646628436, "step": 138, "train/total_loss": 0.00567895220592618 }, { "entropy": 5.316211223602295, "epoch": 0.027486652165315404, "mean_token_accuracy": 0.7571690082550049, "num_tokens": 6300436.0, "step": 139, "train/ce_loss": 0.6843438744544983 }, { "epoch": 0.027486652165315404, "step": 139, "train/sim_loss": 0.003938794136047363 }, { "epoch": 0.027486652165315404, "step": 139, "train/total_loss": 0.07237318158149719 }, { "epoch": 0.02768439786434645, "grad_norm": 0.545576274394989, "learning_rate": 9.936195469383719e-06, "loss": 0.0944, "step": 140 }, { "entropy": 5.233609199523926, "epoch": 0.02768439786434645, "mean_token_accuracy": 0.731517493724823, "num_tokens": 6338773.0, "step": 140, "train/ce_loss": 1.0613957643508911 }, { "epoch": 0.02768439786434645, "step": 140, "train/sim_loss": 0.005838155746459961 }, { "epoch": 0.02768439786434645, "step": 140, "train/total_loss": 0.11197773367166519 }, { "entropy": 5.680075645446777, "epoch": 0.027882143563377496, "mean_token_accuracy": 0.7465635538101196, "num_tokens": 6381966.0, "step": 141, "train/ce_loss": 1.232954978942871 }, { "epoch": 0.027882143563377496, "step": 141, "train/sim_loss": 0.004732489585876465 }, { "epoch": 0.027882143563377496, "step": 141, "train/total_loss": 0.1280279904603958 }, { "entropy": 5.214236259460449, "epoch": 0.028079889262408543, "mean_token_accuracy": 0.7743216156959534, "num_tokens": 6418496.0, "step": 142, "train/ce_loss": 0.9455273151397705 }, { "epoch": 0.028079889262408543, "step": 142, "train/sim_loss": 0.0056452155113220215 }, { "epoch": 0.028079889262408543, "step": 142, "train/total_loss": 0.10019794851541519 }, { "entropy": 5.252363681793213, "epoch": 0.028277634961439587, "mean_token_accuracy": 0.7348178029060364, "num_tokens": 6459912.0, "step": 143, "train/ce_loss": 1.075695514678955 }, { "epoch": 0.028277634961439587, "step": 143, "train/sim_loss": 0.0039196014404296875 }, { "epoch": 0.028277634961439587, "step": 143, "train/total_loss": 0.11148915439844131 }, { "entropy": 5.728428840637207, "epoch": 0.028475380660470635, "mean_token_accuracy": 0.7162944674491882, "num_tokens": 6515943.0, "step": 144, "train/ce_loss": 0.0005405514966696501 }, { "epoch": 0.028475380660470635, "step": 144, "train/sim_loss": 0.0037211179733276367 }, { "epoch": 0.028475380660470635, "step": 144, "train/total_loss": 0.0037751730997115374 }, { "entropy": 5.364627838134766, "epoch": 0.028673126359501682, "mean_token_accuracy": 0.7668038606643677, "num_tokens": 6560416.0, "step": 145, "train/ce_loss": 0.9375442266464233 }, { "epoch": 0.028673126359501682, "step": 145, "train/sim_loss": 0.0027421116828918457 }, { "epoch": 0.028673126359501682, "step": 145, "train/total_loss": 0.09649653732776642 }, { "entropy": 5.397754669189453, "epoch": 0.028870872058532726, "mean_token_accuracy": 0.7442010045051575, "num_tokens": 6595646.0, "step": 146, "train/ce_loss": 1.0882431268692017 }, { "epoch": 0.028870872058532726, "step": 146, "train/sim_loss": 0.00524449348449707 }, { "epoch": 0.028870872058532726, "step": 146, "train/total_loss": 0.11406880617141724 }, { "entropy": 5.156062126159668, "epoch": 0.029068617757563774, "mean_token_accuracy": 0.7266187071800232, "num_tokens": 6637582.0, "step": 147, "train/ce_loss": 1.4853625297546387 }, { "epoch": 0.029068617757563774, "step": 147, "train/sim_loss": 0.005172371864318848 }, { "epoch": 0.029068617757563774, "step": 147, "train/total_loss": 0.15370862185955048 }, { "entropy": 5.460251808166504, "epoch": 0.029266363456594818, "mean_token_accuracy": 0.7469570636749268, "num_tokens": 6673395.0, "step": 148, "train/ce_loss": 0.9272188544273376 }, { "epoch": 0.029266363456594818, "step": 148, "train/sim_loss": 0.004868626594543457 }, { "epoch": 0.029266363456594818, "step": 148, "train/total_loss": 0.09759051352739334 }, { "entropy": 5.467174530029297, "epoch": 0.029464109155625865, "mean_token_accuracy": 0.7337110638618469, "num_tokens": 6721023.0, "step": 149, "train/ce_loss": 0.0006408291519619524 }, { "epoch": 0.029464109155625865, "step": 149, "train/sim_loss": 0.006495177745819092 }, { "epoch": 0.029464109155625865, "step": 149, "train/total_loss": 0.006559260655194521 }, { "entropy": 5.751723289489746, "epoch": 0.029661854854656913, "mean_token_accuracy": 0.7048693299293518, "num_tokens": 6761319.0, "step": 150, "train/ce_loss": 0.6587111353874207 }, { "epoch": 0.029661854854656913, "step": 150, "train/sim_loss": 0.003396272659301758 }, { "epoch": 0.029661854854656913, "step": 150, "train/total_loss": 0.0692673847079277 }, { "entropy": 5.4323201179504395, "epoch": 0.029859600553687957, "mean_token_accuracy": 0.7268746495246887, "num_tokens": 6795690.0, "step": 151, "train/ce_loss": 0.8870315551757812 }, { "epoch": 0.029859600553687957, "step": 151, "train/sim_loss": 0.004121303558349609 }, { "epoch": 0.029859600553687957, "step": 151, "train/total_loss": 0.09282445907592773 }, { "entropy": 5.6616363525390625, "epoch": 0.030057346252719004, "mean_token_accuracy": 0.7005987763404846, "num_tokens": 6848856.0, "step": 152, "train/ce_loss": 0.5608669519424438 }, { "epoch": 0.030057346252719004, "step": 152, "train/sim_loss": 0.003920316696166992 }, { "epoch": 0.030057346252719004, "step": 152, "train/total_loss": 0.060007013380527496 }, { "entropy": 5.5218915939331055, "epoch": 0.030255091951750048, "mean_token_accuracy": 0.7388114333152771, "num_tokens": 6891426.0, "step": 153, "train/ce_loss": 1.165250539779663 }, { "epoch": 0.030255091951750048, "step": 153, "train/sim_loss": 0.004454255104064941 }, { "epoch": 0.030255091951750048, "step": 153, "train/total_loss": 0.12097930908203125 }, { "entropy": 5.450357437133789, "epoch": 0.030452837650781096, "mean_token_accuracy": 0.7557603716850281, "num_tokens": 6929090.0, "step": 154, "train/ce_loss": 1.2035101652145386 }, { "epoch": 0.030452837650781096, "step": 154, "train/sim_loss": 0.005906999111175537 }, { "epoch": 0.030452837650781096, "step": 154, "train/total_loss": 0.1262580156326294 }, { "entropy": 5.523527145385742, "epoch": 0.030650583349812143, "mean_token_accuracy": 0.7550058960914612, "num_tokens": 6974097.0, "step": 155, "train/ce_loss": 1.3062076568603516 }, { "epoch": 0.030650583349812143, "step": 155, "train/sim_loss": 0.005739331245422363 }, { "epoch": 0.030650583349812143, "step": 155, "train/total_loss": 0.13636009395122528 }, { "entropy": 5.767185211181641, "epoch": 0.030848329048843187, "mean_token_accuracy": 0.7418375015258789, "num_tokens": 7031584.0, "step": 156, "train/ce_loss": 1.0888110399246216 }, { "epoch": 0.030848329048843187, "step": 156, "train/sim_loss": 0.004229307174682617 }, { "epoch": 0.030848329048843187, "step": 156, "train/total_loss": 0.11311041563749313 }, { "entropy": 5.958285808563232, "epoch": 0.031046074747874235, "mean_token_accuracy": 0.6795976758003235, "num_tokens": 7091179.0, "step": 157, "train/ce_loss": 1.1375224590301514 }, { "epoch": 0.031046074747874235, "step": 157, "train/sim_loss": 0.004027307033538818 }, { "epoch": 0.031046074747874235, "step": 157, "train/total_loss": 0.11777955293655396 }, { "entropy": 5.519291877746582, "epoch": 0.03124382044690528, "mean_token_accuracy": 0.7203626036643982, "num_tokens": 7126542.0, "step": 158, "train/ce_loss": 0.9161903858184814 }, { "epoch": 0.03124382044690528, "step": 158, "train/sim_loss": 0.0032401084899902344 }, { "epoch": 0.03124382044690528, "step": 158, "train/total_loss": 0.09485914558172226 }, { "entropy": 5.9572529792785645, "epoch": 0.03144156614593632, "mean_token_accuracy": 0.7558308839797974, "num_tokens": 7174443.0, "step": 159, "train/ce_loss": 1.5331377983093262 }, { "epoch": 0.03144156614593632, "step": 159, "train/sim_loss": 0.005185186862945557 }, { "epoch": 0.03144156614593632, "step": 159, "train/total_loss": 0.15849897265434265 }, { "epoch": 0.03163931184496737, "grad_norm": 0.5786049962043762, "learning_rate": 9.926303294094373e-06, "loss": 0.0916, "step": 160 }, { "entropy": 5.580877304077148, "epoch": 0.03163931184496737, "mean_token_accuracy": 0.7304843068122864, "num_tokens": 7217499.0, "step": 160, "train/ce_loss": 1.1334261894226074 }, { "epoch": 0.03163931184496737, "step": 160, "train/sim_loss": 0.004884243011474609 }, { "epoch": 0.03163931184496737, "step": 160, "train/total_loss": 0.11822686344385147 }, { "entropy": 5.774509906768799, "epoch": 0.03183705754399842, "mean_token_accuracy": 0.7099286317825317, "num_tokens": 7282485.0, "step": 161, "train/ce_loss": 0.694904625415802 }, { "epoch": 0.03183705754399842, "step": 161, "train/sim_loss": 0.003537297248840332 }, { "epoch": 0.03183705754399842, "step": 161, "train/total_loss": 0.07302775979042053 }, { "entropy": 5.514602184295654, "epoch": 0.032034803243029465, "mean_token_accuracy": 0.718492329120636, "num_tokens": 7322801.0, "step": 162, "train/ce_loss": 1.6375269889831543 }, { "epoch": 0.032034803243029465, "step": 162, "train/sim_loss": 0.00483173131942749 }, { "epoch": 0.032034803243029465, "step": 162, "train/total_loss": 0.1685844361782074 }, { "entropy": 5.769126892089844, "epoch": 0.03223254894206051, "mean_token_accuracy": 0.6916488409042358, "num_tokens": 7388528.0, "step": 163, "train/ce_loss": 0.7750133275985718 }, { "epoch": 0.03223254894206051, "step": 163, "train/sim_loss": 0.003557562828063965 }, { "epoch": 0.03223254894206051, "step": 163, "train/total_loss": 0.08105889707803726 }, { "entropy": 6.287981033325195, "epoch": 0.03243029464109155, "mean_token_accuracy": 0.722352921962738, "num_tokens": 7432664.0, "step": 164, "train/ce_loss": 0.7397950887680054 }, { "epoch": 0.03243029464109155, "step": 164, "train/sim_loss": 0.0031853914260864258 }, { "epoch": 0.03243029464109155, "step": 164, "train/total_loss": 0.0771649032831192 }, { "entropy": 6.117288112640381, "epoch": 0.0326280403401226, "mean_token_accuracy": 0.695652186870575, "num_tokens": 7478323.0, "step": 165, "train/ce_loss": 0.7992004752159119 }, { "epoch": 0.0326280403401226, "step": 165, "train/sim_loss": 0.0033379793167114258 }, { "epoch": 0.0326280403401226, "step": 165, "train/total_loss": 0.08325802534818649 }, { "entropy": 6.241514205932617, "epoch": 0.03282578603915365, "mean_token_accuracy": 0.7395070195198059, "num_tokens": 7538402.0, "step": 166, "train/ce_loss": 1.2715126276016235 }, { "epoch": 0.03282578603915365, "step": 166, "train/sim_loss": 0.003983497619628906 }, { "epoch": 0.03282578603915365, "step": 166, "train/total_loss": 0.1311347633600235 }, { "entropy": 5.749470233917236, "epoch": 0.033023531738184696, "mean_token_accuracy": 0.7528020143508911, "num_tokens": 7574529.0, "step": 167, "train/ce_loss": 1.3394511938095093 }, { "epoch": 0.033023531738184696, "step": 167, "train/sim_loss": 0.004103124141693115 }, { "epoch": 0.033023531738184696, "step": 167, "train/total_loss": 0.13804824650287628 }, { "entropy": 6.116143226623535, "epoch": 0.03322127743721574, "mean_token_accuracy": 0.73225998878479, "num_tokens": 7634395.0, "step": 168, "train/ce_loss": 0.6306108832359314 }, { "epoch": 0.03322127743721574, "step": 168, "train/sim_loss": 0.003449380397796631 }, { "epoch": 0.03322127743721574, "step": 168, "train/total_loss": 0.06651046872138977 }, { "entropy": 5.5470476150512695, "epoch": 0.033419023136246784, "mean_token_accuracy": 0.7710843086242676, "num_tokens": 7666749.0, "step": 169, "train/ce_loss": 0.6491318941116333 }, { "epoch": 0.033419023136246784, "step": 169, "train/sim_loss": 0.0023813247680664062 }, { "epoch": 0.033419023136246784, "step": 169, "train/total_loss": 0.06729451566934586 }, { "entropy": 5.86069917678833, "epoch": 0.03361676883527783, "mean_token_accuracy": 0.7391051650047302, "num_tokens": 7703702.0, "step": 170, "train/ce_loss": 0.9565059542655945 }, { "epoch": 0.03361676883527783, "step": 170, "train/sim_loss": 0.003596961498260498 }, { "epoch": 0.03361676883527783, "step": 170, "train/total_loss": 0.09924755990505219 }, { "entropy": 5.672433853149414, "epoch": 0.03381451453430888, "mean_token_accuracy": 0.7640527486801147, "num_tokens": 7745794.0, "step": 171, "train/ce_loss": 0.0002499893307685852 }, { "epoch": 0.03381451453430888, "step": 171, "train/sim_loss": 0.0026894211769104004 }, { "epoch": 0.03381451453430888, "step": 171, "train/total_loss": 0.0027144202031195164 }, { "entropy": 5.823018550872803, "epoch": 0.034012260233339926, "mean_token_accuracy": 0.7798948884010315, "num_tokens": 7788738.0, "step": 172, "train/ce_loss": 0.00025093115982599556 }, { "epoch": 0.034012260233339926, "step": 172, "train/sim_loss": 0.0022336244583129883 }, { "epoch": 0.034012260233339926, "step": 172, "train/total_loss": 0.0022587175481021404 }, { "entropy": 6.174272537231445, "epoch": 0.034210005932370974, "mean_token_accuracy": 0.7249626517295837, "num_tokens": 7836003.0, "step": 173, "train/ce_loss": 0.6946882605552673 }, { "epoch": 0.034210005932370974, "step": 173, "train/sim_loss": 0.0040721893310546875 }, { "epoch": 0.034210005932370974, "step": 173, "train/total_loss": 0.07354101538658142 }, { "entropy": 6.1251139640808105, "epoch": 0.034407751631402014, "mean_token_accuracy": 0.7021433711051941, "num_tokens": 7885129.0, "step": 174, "train/ce_loss": 1.2037456035614014 }, { "epoch": 0.034407751631402014, "step": 174, "train/sim_loss": 0.004275202751159668 }, { "epoch": 0.034407751631402014, "step": 174, "train/total_loss": 0.1246497631072998 }, { "entropy": 5.81584358215332, "epoch": 0.03460549733043306, "mean_token_accuracy": 0.7445651888847351, "num_tokens": 7922323.0, "step": 175, "train/ce_loss": 0.8722188472747803 }, { "epoch": 0.03460549733043306, "step": 175, "train/sim_loss": 0.0023966431617736816 }, { "epoch": 0.03460549733043306, "step": 175, "train/total_loss": 0.08961852639913559 }, { "entropy": 5.958613395690918, "epoch": 0.03480324302946411, "mean_token_accuracy": 0.6873920559883118, "num_tokens": 7976728.0, "step": 176, "train/ce_loss": 0.00033002105192281306 }, { "epoch": 0.03480324302946411, "step": 176, "train/sim_loss": 0.0048059821128845215 }, { "epoch": 0.03480324302946411, "step": 176, "train/total_loss": 0.004838983993977308 }, { "entropy": 5.762041091918945, "epoch": 0.03500098872849516, "mean_token_accuracy": 0.7691780924797058, "num_tokens": 8019122.0, "step": 177, "train/ce_loss": 0.0002223849151050672 }, { "epoch": 0.03500098872849516, "step": 177, "train/sim_loss": 0.003806173801422119 }, { "epoch": 0.03500098872849516, "step": 177, "train/total_loss": 0.003828412387520075 }, { "entropy": 6.083193778991699, "epoch": 0.035198734427526204, "mean_token_accuracy": 0.7230303287506104, "num_tokens": 8067116.0, "step": 178, "train/ce_loss": 0.6755728721618652 }, { "epoch": 0.035198734427526204, "step": 178, "train/sim_loss": 0.0029006004333496094 }, { "epoch": 0.035198734427526204, "step": 178, "train/total_loss": 0.07045789062976837 }, { "entropy": 5.940494537353516, "epoch": 0.035396480126557245, "mean_token_accuracy": 0.7220149040222168, "num_tokens": 8112743.0, "step": 179, "train/ce_loss": 0.8784121870994568 }, { "epoch": 0.035396480126557245, "step": 179, "train/sim_loss": 0.0030535459518432617 }, { "epoch": 0.035396480126557245, "step": 179, "train/total_loss": 0.09089476615190506 }, { "epoch": 0.03559422582558829, "grad_norm": 0.666103720664978, "learning_rate": 9.916411118805026e-06, "loss": 0.0914, "step": 180 }, { "entropy": 6.117097854614258, "epoch": 0.03559422582558829, "mean_token_accuracy": 0.7763480544090271, "num_tokens": 8170560.0, "step": 180, "train/ce_loss": 0.8865546584129333 }, { "epoch": 0.03559422582558829, "step": 180, "train/sim_loss": 0.0035605430603027344 }, { "epoch": 0.03559422582558829, "step": 180, "train/total_loss": 0.09221600741147995 }, { "entropy": 5.7888407707214355, "epoch": 0.03579197152461934, "mean_token_accuracy": 0.7389404773712158, "num_tokens": 8202151.0, "step": 181, "train/ce_loss": 0.8060799837112427 }, { "epoch": 0.03579197152461934, "step": 181, "train/sim_loss": 0.004577577114105225 }, { "epoch": 0.03579197152461934, "step": 181, "train/total_loss": 0.08518557995557785 }, { "entropy": 5.5391716957092285, "epoch": 0.03598971722365039, "mean_token_accuracy": 0.7300115823745728, "num_tokens": 8246441.0, "step": 182, "train/ce_loss": 0.5409895777702332 }, { "epoch": 0.03598971722365039, "step": 182, "train/sim_loss": 0.0020212531089782715 }, { "epoch": 0.03598971722365039, "step": 182, "train/total_loss": 0.056120213121175766 }, { "entropy": 6.090121746063232, "epoch": 0.036187462922681435, "mean_token_accuracy": 0.7157490253448486, "num_tokens": 8298368.0, "step": 183, "train/ce_loss": 1.004600167274475 }, { "epoch": 0.036187462922681435, "step": 183, "train/sim_loss": 0.003901243209838867 }, { "epoch": 0.036187462922681435, "step": 183, "train/total_loss": 0.10436125844717026 }, { "entropy": 6.063948154449463, "epoch": 0.036385208621712475, "mean_token_accuracy": 0.7183800339698792, "num_tokens": 8343397.0, "step": 184, "train/ce_loss": 1.0527777671813965 }, { "epoch": 0.036385208621712475, "step": 184, "train/sim_loss": 0.00450211763381958 }, { "epoch": 0.036385208621712475, "step": 184, "train/total_loss": 0.10977989435195923 }, { "entropy": 5.76295280456543, "epoch": 0.03658295432074352, "mean_token_accuracy": 0.7416342496871948, "num_tokens": 8393831.0, "step": 185, "train/ce_loss": 1.0993934869766235 }, { "epoch": 0.03658295432074352, "step": 185, "train/sim_loss": 0.003838658332824707 }, { "epoch": 0.03658295432074352, "step": 185, "train/total_loss": 0.1137780100107193 }, { "entropy": 5.717108726501465, "epoch": 0.03678070001977457, "mean_token_accuracy": 0.7425622344017029, "num_tokens": 8442191.0, "step": 186, "train/ce_loss": 1.1778340339660645 }, { "epoch": 0.03678070001977457, "step": 186, "train/sim_loss": 0.0022584199905395508 }, { "epoch": 0.03678070001977457, "step": 186, "train/total_loss": 0.12004182487726212 }, { "entropy": 5.806510925292969, "epoch": 0.03697844571880562, "mean_token_accuracy": 0.7416818141937256, "num_tokens": 8483022.0, "step": 187, "train/ce_loss": 0.5648406744003296 }, { "epoch": 0.03697844571880562, "step": 187, "train/sim_loss": 0.0033875107765197754 }, { "epoch": 0.03697844571880562, "step": 187, "train/total_loss": 0.05987158045172691 }, { "entropy": 5.990212917327881, "epoch": 0.037176191417836665, "mean_token_accuracy": 0.7352941036224365, "num_tokens": 8538777.0, "step": 188, "train/ce_loss": 1.5028339624404907 }, { "epoch": 0.037176191417836665, "step": 188, "train/sim_loss": 0.0038264989852905273 }, { "epoch": 0.037176191417836665, "step": 188, "train/total_loss": 0.1541098952293396 }, { "entropy": 5.475760459899902, "epoch": 0.037373937116867706, "mean_token_accuracy": 0.7936508059501648, "num_tokens": 8573057.0, "step": 189, "train/ce_loss": 0.00035292404936626554 }, { "epoch": 0.037373937116867706, "step": 189, "train/sim_loss": 0.0022844672203063965 }, { "epoch": 0.037373937116867706, "step": 189, "train/total_loss": 0.00231975968927145 }, { "entropy": 5.689772605895996, "epoch": 0.03757168281589875, "mean_token_accuracy": 0.7415565252304077, "num_tokens": 8621388.0, "step": 190, "train/ce_loss": 1.190209150314331 }, { "epoch": 0.03757168281589875, "step": 190, "train/sim_loss": 0.004219114780426025 }, { "epoch": 0.03757168281589875, "step": 190, "train/total_loss": 0.12324003130197525 }, { "entropy": 5.840243339538574, "epoch": 0.0377694285149298, "mean_token_accuracy": 0.7367501258850098, "num_tokens": 8671354.0, "step": 191, "train/ce_loss": 1.479458212852478 }, { "epoch": 0.0377694285149298, "step": 191, "train/sim_loss": 0.003091275691986084 }, { "epoch": 0.0377694285149298, "step": 191, "train/total_loss": 0.1510370969772339 }, { "entropy": 5.999958038330078, "epoch": 0.03796717421396085, "mean_token_accuracy": 0.7124239802360535, "num_tokens": 8709356.0, "step": 192, "train/ce_loss": 0.00025326182367280126 }, { "epoch": 0.03796717421396085, "step": 192, "train/sim_loss": 0.004060626029968262 }, { "epoch": 0.03796717421396085, "step": 192, "train/total_loss": 0.004085952416062355 }, { "entropy": 6.155965328216553, "epoch": 0.03816491991299189, "mean_token_accuracy": 0.7362401485443115, "num_tokens": 8752532.0, "step": 193, "train/ce_loss": 0.7762113213539124 }, { "epoch": 0.03816491991299189, "step": 193, "train/sim_loss": 0.0023941993713378906 }, { "epoch": 0.03816491991299189, "step": 193, "train/total_loss": 0.08001533150672913 }, { "entropy": 5.965462684631348, "epoch": 0.038362665612022936, "mean_token_accuracy": 0.7516148090362549, "num_tokens": 8803702.0, "step": 194, "train/ce_loss": 0.9389039874076843 }, { "epoch": 0.038362665612022936, "step": 194, "train/sim_loss": 0.002212047576904297 }, { "epoch": 0.038362665612022936, "step": 194, "train/total_loss": 0.09610244631767273 }, { "entropy": 6.204084396362305, "epoch": 0.038560411311053984, "mean_token_accuracy": 0.7568181753158569, "num_tokens": 8868075.0, "step": 195, "train/ce_loss": 0.5641533732414246 }, { "epoch": 0.038560411311053984, "step": 195, "train/sim_loss": 0.003572523593902588 }, { "epoch": 0.038560411311053984, "step": 195, "train/total_loss": 0.059987861663103104 }, { "entropy": 5.38918399810791, "epoch": 0.03875815701008503, "mean_token_accuracy": 0.7129372358322144, "num_tokens": 8903182.0, "step": 196, "train/ce_loss": 0.00019254493236076087 }, { "epoch": 0.03875815701008503, "step": 196, "train/sim_loss": 0.0034416913986206055 }, { "epoch": 0.03875815701008503, "step": 196, "train/total_loss": 0.0034609457943588495 }, { "entropy": 5.793299674987793, "epoch": 0.03895590270911608, "mean_token_accuracy": 0.7517517805099487, "num_tokens": 8946468.0, "step": 197, "train/ce_loss": 2.9437146186828613 }, { "epoch": 0.03895590270911608, "step": 197, "train/sim_loss": 0.0029891133308410645 }, { "epoch": 0.03895590270911608, "step": 197, "train/total_loss": 0.2973605692386627 }, { "entropy": 5.817845821380615, "epoch": 0.03915364840814712, "mean_token_accuracy": 0.7493351101875305, "num_tokens": 8982705.0, "step": 198, "train/ce_loss": 1.143243670463562 }, { "epoch": 0.03915364840814712, "step": 198, "train/sim_loss": 0.004785358905792236 }, { "epoch": 0.03915364840814712, "step": 198, "train/total_loss": 0.11910972744226456 }, { "entropy": 5.907153606414795, "epoch": 0.03935139410717817, "mean_token_accuracy": 0.6792332530021667, "num_tokens": 9034041.0, "step": 199, "train/ce_loss": 1.6776996850967407 }, { "epoch": 0.03935139410717817, "step": 199, "train/sim_loss": 0.003486156463623047 }, { "epoch": 0.03935139410717817, "step": 199, "train/total_loss": 0.17125612497329712 }, { "epoch": 0.039549139806209214, "grad_norm": 0.5853844285011292, "learning_rate": 9.90651894351568e-06, "loss": 0.0888, "step": 200 }, { "entropy": 6.259209632873535, "epoch": 0.039549139806209214, "mean_token_accuracy": 0.708537757396698, "num_tokens": 9078391.0, "step": 200, "train/ce_loss": 0.00033320128568448126 }, { "epoch": 0.039549139806209214, "step": 200, "train/sim_loss": 0.002574145793914795 }, { "epoch": 0.039549139806209214, "step": 200, "train/total_loss": 0.0026074659544974566 }, { "entropy": 6.029792785644531, "epoch": 0.03974688550524026, "mean_token_accuracy": 0.7275747656822205, "num_tokens": 9113850.0, "step": 201, "train/ce_loss": 0.805978536605835 }, { "epoch": 0.03974688550524026, "step": 201, "train/sim_loss": 0.002693653106689453 }, { "epoch": 0.03974688550524026, "step": 201, "train/total_loss": 0.08329150825738907 }, { "entropy": 6.183684349060059, "epoch": 0.03994463120427131, "mean_token_accuracy": 0.7209857106208801, "num_tokens": 9150751.0, "step": 202, "train/ce_loss": 1.180189609527588 }, { "epoch": 0.03994463120427131, "step": 202, "train/sim_loss": 0.002086818218231201 }, { "epoch": 0.03994463120427131, "step": 202, "train/total_loss": 0.12010578066110611 }, { "entropy": 5.6927666664123535, "epoch": 0.04014237690330235, "mean_token_accuracy": 0.7832911610603333, "num_tokens": 9208972.0, "step": 203, "train/ce_loss": 0.5599971413612366 }, { "epoch": 0.04014237690330235, "step": 203, "train/sim_loss": 0.002585887908935547 }, { "epoch": 0.04014237690330235, "step": 203, "train/total_loss": 0.058585602790117264 }, { "entropy": 6.247528076171875, "epoch": 0.0403401226023334, "mean_token_accuracy": 0.7285223603248596, "num_tokens": 9262684.0, "step": 204, "train/ce_loss": 1.093955159187317 }, { "epoch": 0.0403401226023334, "step": 204, "train/sim_loss": 0.001821756362915039 }, { "epoch": 0.0403401226023334, "step": 204, "train/total_loss": 0.11121727526187897 }, { "entropy": 6.141802787780762, "epoch": 0.040537868301364445, "mean_token_accuracy": 0.754182755947113, "num_tokens": 9298667.0, "step": 205, "train/ce_loss": 0.7415148615837097 }, { "epoch": 0.040537868301364445, "step": 205, "train/sim_loss": 0.0043299198150634766 }, { "epoch": 0.040537868301364445, "step": 205, "train/total_loss": 0.07848140597343445 }, { "entropy": 5.489391326904297, "epoch": 0.04073561400039549, "mean_token_accuracy": 0.7525648474693298, "num_tokens": 9333002.0, "step": 206, "train/ce_loss": 0.8401524424552917 }, { "epoch": 0.04073561400039549, "step": 206, "train/sim_loss": 0.0029736757278442383 }, { "epoch": 0.04073561400039549, "step": 206, "train/total_loss": 0.0869889184832573 }, { "entropy": 6.360934257507324, "epoch": 0.04093335969942654, "mean_token_accuracy": 0.7248984575271606, "num_tokens": 9379747.0, "step": 207, "train/ce_loss": 1.1799793243408203 }, { "epoch": 0.04093335969942654, "step": 207, "train/sim_loss": 0.002317667007446289 }, { "epoch": 0.04093335969942654, "step": 207, "train/total_loss": 0.12031560391187668 }, { "entropy": 5.848916053771973, "epoch": 0.04113110539845758, "mean_token_accuracy": 0.7017654180526733, "num_tokens": 9412595.0, "step": 208, "train/ce_loss": 1.5947428941726685 }, { "epoch": 0.04113110539845758, "step": 208, "train/sim_loss": 0.0036807656288146973 }, { "epoch": 0.04113110539845758, "step": 208, "train/total_loss": 0.16315506398677826 }, { "entropy": 5.726275444030762, "epoch": 0.04132885109748863, "mean_token_accuracy": 0.7552289366722107, "num_tokens": 9439094.0, "step": 209, "train/ce_loss": 0.7303430438041687 }, { "epoch": 0.04132885109748863, "step": 209, "train/sim_loss": 0.0016942024230957031 }, { "epoch": 0.04132885109748863, "step": 209, "train/total_loss": 0.07472851127386093 }, { "entropy": 6.09914493560791, "epoch": 0.041526596796519676, "mean_token_accuracy": 0.7238895297050476, "num_tokens": 9482266.0, "step": 210, "train/ce_loss": 0.9625969529151917 }, { "epoch": 0.041526596796519676, "step": 210, "train/sim_loss": 0.003124713897705078 }, { "epoch": 0.041526596796519676, "step": 210, "train/total_loss": 0.09938441216945648 }, { "entropy": 6.405793190002441, "epoch": 0.04172434249555072, "mean_token_accuracy": 0.7262958288192749, "num_tokens": 9541590.0, "step": 211, "train/ce_loss": 0.6267989873886108 }, { "epoch": 0.04172434249555072, "step": 211, "train/sim_loss": 0.0021789073944091797 }, { "epoch": 0.04172434249555072, "step": 211, "train/total_loss": 0.0648588091135025 }, { "entropy": 5.900712966918945, "epoch": 0.04192208819458177, "mean_token_accuracy": 0.7347161769866943, "num_tokens": 9574450.0, "step": 212, "train/ce_loss": 0.6492196321487427 }, { "epoch": 0.04192208819458177, "step": 212, "train/sim_loss": 0.002523183822631836 }, { "epoch": 0.04192208819458177, "step": 212, "train/total_loss": 0.06744515150785446 }, { "entropy": 6.324042320251465, "epoch": 0.04211983389361281, "mean_token_accuracy": 0.6870503425598145, "num_tokens": 9616027.0, "step": 213, "train/ce_loss": 0.00024846437736414373 }, { "epoch": 0.04211983389361281, "step": 213, "train/sim_loss": 0.002086639404296875 }, { "epoch": 0.04211983389361281, "step": 213, "train/total_loss": 0.0021114859264343977 }, { "entropy": 6.256585121154785, "epoch": 0.04231757959264386, "mean_token_accuracy": 0.7065404653549194, "num_tokens": 9664841.0, "step": 214, "train/ce_loss": 1.436423897743225 }, { "epoch": 0.04231757959264386, "step": 214, "train/sim_loss": 0.0035344362258911133 }, { "epoch": 0.04231757959264386, "step": 214, "train/total_loss": 0.1471768319606781 }, { "entropy": 6.073983192443848, "epoch": 0.042515325291674906, "mean_token_accuracy": 0.6726190447807312, "num_tokens": 9739468.0, "step": 215, "train/ce_loss": 1.0304441452026367 }, { "epoch": 0.042515325291674906, "step": 215, "train/sim_loss": 0.0018131136894226074 }, { "epoch": 0.042515325291674906, "step": 215, "train/total_loss": 0.10485752671957016 }, { "entropy": 6.123852729797363, "epoch": 0.042713070990705954, "mean_token_accuracy": 0.7542017102241516, "num_tokens": 9793690.0, "step": 216, "train/ce_loss": 0.7021613717079163 }, { "epoch": 0.042713070990705954, "step": 216, "train/sim_loss": 0.002382218837738037 }, { "epoch": 0.042713070990705954, "step": 216, "train/total_loss": 0.07259836047887802 }, { "entropy": 6.070633888244629, "epoch": 0.042910816689737, "mean_token_accuracy": 0.7711238861083984, "num_tokens": 9826525.0, "step": 217, "train/ce_loss": 0.00020065269200131297 }, { "epoch": 0.042910816689737, "step": 217, "train/sim_loss": 0.0029862523078918457 }, { "epoch": 0.042910816689737, "step": 217, "train/total_loss": 0.003006317652761936 }, { "entropy": 6.136026382446289, "epoch": 0.04310856238876804, "mean_token_accuracy": 0.6945454478263855, "num_tokens": 9872177.0, "step": 218, "train/ce_loss": 0.6764545440673828 }, { "epoch": 0.04310856238876804, "step": 218, "train/sim_loss": 0.002591729164123535 }, { "epoch": 0.04310856238876804, "step": 218, "train/total_loss": 0.0702371820807457 }, { "entropy": 6.383015155792236, "epoch": 0.04330630808779909, "mean_token_accuracy": 0.7058383226394653, "num_tokens": 9935257.0, "step": 219, "train/ce_loss": 2.997581720352173 }, { "epoch": 0.04330630808779909, "step": 219, "train/sim_loss": 0.0017222762107849121 }, { "epoch": 0.04330630808779909, "step": 219, "train/total_loss": 0.3014804422855377 }, { "epoch": 0.04350405378683014, "grad_norm": 0.587738573551178, "learning_rate": 9.896626768226334e-06, "loss": 0.0922, "step": 220 }, { "entropy": 6.304775238037109, "epoch": 0.04350405378683014, "mean_token_accuracy": 0.7520350813865662, "num_tokens": 9991703.0, "step": 220, "train/ce_loss": 0.9331344962120056 }, { "epoch": 0.04350405378683014, "step": 220, "train/sim_loss": 0.0024532675743103027 }, { "epoch": 0.04350405378683014, "step": 220, "train/total_loss": 0.09576671570539474 }, { "entropy": 6.126867294311523, "epoch": 0.043701799485861184, "mean_token_accuracy": 0.7199730277061462, "num_tokens": 10045301.0, "step": 221, "train/ce_loss": 1.2255245447158813 }, { "epoch": 0.043701799485861184, "step": 221, "train/sim_loss": 0.0021390914916992188 }, { "epoch": 0.043701799485861184, "step": 221, "train/total_loss": 0.12469154596328735 }, { "entropy": 6.105471611022949, "epoch": 0.04389954518489223, "mean_token_accuracy": 0.7292700409889221, "num_tokens": 10092191.0, "step": 222, "train/ce_loss": 1.0318121910095215 }, { "epoch": 0.04389954518489223, "step": 222, "train/sim_loss": 0.0020540952682495117 }, { "epoch": 0.04389954518489223, "step": 222, "train/total_loss": 0.10523531585931778 }, { "entropy": 6.419918060302734, "epoch": 0.04409729088392327, "mean_token_accuracy": 0.7411764860153198, "num_tokens": 10145208.0, "step": 223, "train/ce_loss": 0.7109538912773132 }, { "epoch": 0.04409729088392327, "step": 223, "train/sim_loss": 0.001876533031463623 }, { "epoch": 0.04409729088392327, "step": 223, "train/total_loss": 0.07297192513942719 }, { "entropy": 6.312893390655518, "epoch": 0.04429503658295432, "mean_token_accuracy": 0.6968153119087219, "num_tokens": 10186364.0, "step": 224, "train/ce_loss": 0.00015741604147478938 }, { "epoch": 0.04429503658295432, "step": 224, "train/sim_loss": 0.0018547773361206055 }, { "epoch": 0.04429503658295432, "step": 224, "train/total_loss": 0.0018705188995227218 }, { "entropy": 6.516845226287842, "epoch": 0.04449278228198537, "mean_token_accuracy": 0.7437417507171631, "num_tokens": 10235721.0, "step": 225, "train/ce_loss": 0.8701533079147339 }, { "epoch": 0.04449278228198537, "step": 225, "train/sim_loss": 0.002685844898223877 }, { "epoch": 0.04449278228198537, "step": 225, "train/total_loss": 0.08970117568969727 }, { "entropy": 6.257672309875488, "epoch": 0.044690527981016415, "mean_token_accuracy": 0.708737850189209, "num_tokens": 10283426.0, "step": 226, "train/ce_loss": 0.5786328315734863 }, { "epoch": 0.044690527981016415, "step": 226, "train/sim_loss": 0.0027266740798950195 }, { "epoch": 0.044690527981016415, "step": 226, "train/total_loss": 0.06058995798230171 }, { "entropy": 6.176913261413574, "epoch": 0.04488827368004746, "mean_token_accuracy": 0.6750614047050476, "num_tokens": 10337660.0, "step": 227, "train/ce_loss": 0.7932107448577881 }, { "epoch": 0.04488827368004746, "step": 227, "train/sim_loss": 0.0027088522911071777 }, { "epoch": 0.04488827368004746, "step": 227, "train/total_loss": 0.08202993124723434 }, { "entropy": 6.086831569671631, "epoch": 0.0450860193790785, "mean_token_accuracy": 0.7388818264007568, "num_tokens": 10384636.0, "step": 228, "train/ce_loss": 0.7747422456741333 }, { "epoch": 0.0450860193790785, "step": 228, "train/sim_loss": 0.0025826692581176758 }, { "epoch": 0.0450860193790785, "step": 228, "train/total_loss": 0.08005689829587936 }, { "entropy": 6.016354560852051, "epoch": 0.04528376507810955, "mean_token_accuracy": 0.761904776096344, "num_tokens": 10424338.0, "step": 229, "train/ce_loss": 0.5654276609420776 }, { "epoch": 0.04528376507810955, "step": 229, "train/sim_loss": 0.00267714262008667 }, { "epoch": 0.04528376507810955, "step": 229, "train/total_loss": 0.059219907969236374 }, { "entropy": 6.204808235168457, "epoch": 0.0454815107771406, "mean_token_accuracy": 0.7272727489471436, "num_tokens": 10468351.0, "step": 230, "train/ce_loss": 1.1364598274230957 }, { "epoch": 0.0454815107771406, "step": 230, "train/sim_loss": 0.0024920105934143066 }, { "epoch": 0.0454815107771406, "step": 230, "train/total_loss": 0.11613799631595612 }, { "entropy": 5.669295310974121, "epoch": 0.045679256476171645, "mean_token_accuracy": 0.7445708513259888, "num_tokens": 10508617.0, "step": 231, "train/ce_loss": 1.046099305152893 }, { "epoch": 0.045679256476171645, "step": 231, "train/sim_loss": 0.0022145509719848633 }, { "epoch": 0.045679256476171645, "step": 231, "train/total_loss": 0.10682447999715805 }, { "entropy": 6.14771032333374, "epoch": 0.04587700217520269, "mean_token_accuracy": 0.7320692539215088, "num_tokens": 10548351.0, "step": 232, "train/ce_loss": 0.7977413535118103 }, { "epoch": 0.04587700217520269, "step": 232, "train/sim_loss": 0.0034532546997070312 }, { "epoch": 0.04587700217520269, "step": 232, "train/total_loss": 0.08322738856077194 }, { "entropy": 6.247295379638672, "epoch": 0.04607474787423373, "mean_token_accuracy": 0.7699429988861084, "num_tokens": 10591988.0, "step": 233, "train/ce_loss": 0.9703869819641113 }, { "epoch": 0.04607474787423373, "step": 233, "train/sim_loss": 0.0028352737426757812 }, { "epoch": 0.04607474787423373, "step": 233, "train/total_loss": 0.09987397491931915 }, { "entropy": 6.068336486816406, "epoch": 0.04627249357326478, "mean_token_accuracy": 0.7375415563583374, "num_tokens": 10634536.0, "step": 234, "train/ce_loss": 1.1217594146728516 }, { "epoch": 0.04627249357326478, "step": 234, "train/sim_loss": 0.003766179084777832 }, { "epoch": 0.04627249357326478, "step": 234, "train/total_loss": 0.11594212055206299 }, { "entropy": 5.7838592529296875, "epoch": 0.04647023927229583, "mean_token_accuracy": 0.7052127122879028, "num_tokens": 10679071.0, "step": 235, "train/ce_loss": 1.1026235818862915 }, { "epoch": 0.04647023927229583, "step": 235, "train/sim_loss": 0.00549924373626709 }, { "epoch": 0.04647023927229583, "step": 235, "train/total_loss": 0.11576160043478012 }, { "entropy": 6.118648052215576, "epoch": 0.046667984971326876, "mean_token_accuracy": 0.7218934893608093, "num_tokens": 10722618.0, "step": 236, "train/ce_loss": 0.8957979083061218 }, { "epoch": 0.046667984971326876, "step": 236, "train/sim_loss": 0.0033072233200073242 }, { "epoch": 0.046667984971326876, "step": 236, "train/total_loss": 0.0928870141506195 }, { "entropy": 6.124458312988281, "epoch": 0.046865730670357916, "mean_token_accuracy": 0.7123622894287109, "num_tokens": 10767881.0, "step": 237, "train/ce_loss": 0.0002700022014323622 }, { "epoch": 0.046865730670357916, "step": 237, "train/sim_loss": 0.0033934712409973145 }, { "epoch": 0.046865730670357916, "step": 237, "train/total_loss": 0.0034204714465886354 }, { "entropy": 5.831693172454834, "epoch": 0.047063476369388964, "mean_token_accuracy": 0.7399317622184753, "num_tokens": 10806280.0, "step": 238, "train/ce_loss": 1.1556239128112793 }, { "epoch": 0.047063476369388964, "step": 238, "train/sim_loss": 0.003062605857849121 }, { "epoch": 0.047063476369388964, "step": 238, "train/total_loss": 0.11862500011920929 }, { "entropy": 5.825222015380859, "epoch": 0.04726122206842001, "mean_token_accuracy": 0.7465091347694397, "num_tokens": 10843647.0, "step": 239, "train/ce_loss": 0.5987352132797241 }, { "epoch": 0.04726122206842001, "step": 239, "train/sim_loss": 0.001917719841003418 }, { "epoch": 0.04726122206842001, "step": 239, "train/total_loss": 0.06179124116897583 }, { "epoch": 0.04745896776745106, "grad_norm": 0.4572809934616089, "learning_rate": 9.886734592936988e-06, "loss": 0.0896, "step": 240 }, { "entropy": 5.674968719482422, "epoch": 0.04745896776745106, "mean_token_accuracy": 0.7773497700691223, "num_tokens": 10905274.0, "step": 240, "train/ce_loss": 0.800529956817627 }, { "epoch": 0.04745896776745106, "step": 240, "train/sim_loss": 0.0018821954727172852 }, { "epoch": 0.04745896776745106, "step": 240, "train/total_loss": 0.08193518966436386 }, { "entropy": 6.348086357116699, "epoch": 0.047656713466482106, "mean_token_accuracy": 0.713274359703064, "num_tokens": 10966824.0, "step": 241, "train/ce_loss": 0.9583829045295715 }, { "epoch": 0.047656713466482106, "step": 241, "train/sim_loss": 0.002040565013885498 }, { "epoch": 0.047656713466482106, "step": 241, "train/total_loss": 0.09787885844707489 }, { "entropy": 6.321876525878906, "epoch": 0.04785445916551315, "mean_token_accuracy": 0.7387717962265015, "num_tokens": 11018147.0, "step": 242, "train/ce_loss": 1.4425816535949707 }, { "epoch": 0.04785445916551315, "step": 242, "train/sim_loss": 0.0025421977043151855 }, { "epoch": 0.04785445916551315, "step": 242, "train/total_loss": 0.14680036902427673 }, { "entropy": 6.439914703369141, "epoch": 0.048052204864544194, "mean_token_accuracy": 0.7307692170143127, "num_tokens": 11084941.0, "step": 243, "train/ce_loss": 1.0359135866165161 }, { "epoch": 0.048052204864544194, "step": 243, "train/sim_loss": 0.003669261932373047 }, { "epoch": 0.048052204864544194, "step": 243, "train/total_loss": 0.10726062208414078 }, { "entropy": 5.890529632568359, "epoch": 0.04824995056357524, "mean_token_accuracy": 0.7131280303001404, "num_tokens": 11129736.0, "step": 244, "train/ce_loss": 1.4633880853652954 }, { "epoch": 0.04824995056357524, "step": 244, "train/sim_loss": 0.002194643020629883 }, { "epoch": 0.04824995056357524, "step": 244, "train/total_loss": 0.14853344857692719 }, { "entropy": 6.110105991363525, "epoch": 0.04844769626260629, "mean_token_accuracy": 0.725653886795044, "num_tokens": 11172543.0, "step": 245, "train/ce_loss": 1.0649223327636719 }, { "epoch": 0.04844769626260629, "step": 245, "train/sim_loss": 0.003980457782745361 }, { "epoch": 0.04844769626260629, "step": 245, "train/total_loss": 0.11047269403934479 }, { "entropy": 6.205379962921143, "epoch": 0.04864544196163734, "mean_token_accuracy": 0.7057584524154663, "num_tokens": 11239631.0, "step": 246, "train/ce_loss": 0.9687055945396423 }, { "epoch": 0.04864544196163734, "step": 246, "train/sim_loss": 0.0014594793319702148 }, { "epoch": 0.04864544196163734, "step": 246, "train/total_loss": 0.0983300432562828 }, { "entropy": 5.718387603759766, "epoch": 0.04884318766066838, "mean_token_accuracy": 0.7027677297592163, "num_tokens": 11283731.0, "step": 247, "train/ce_loss": 0.643815279006958 }, { "epoch": 0.04884318766066838, "step": 247, "train/sim_loss": 0.0014005303382873535 }, { "epoch": 0.04884318766066838, "step": 247, "train/total_loss": 0.06578206270933151 }, { "entropy": 6.194766998291016, "epoch": 0.049040933359699425, "mean_token_accuracy": 0.7172236442565918, "num_tokens": 11340606.0, "step": 248, "train/ce_loss": 0.887425422668457 }, { "epoch": 0.049040933359699425, "step": 248, "train/sim_loss": 0.0011725425720214844 }, { "epoch": 0.049040933359699425, "step": 248, "train/total_loss": 0.08991508930921555 }, { "entropy": 5.855417251586914, "epoch": 0.04923867905873047, "mean_token_accuracy": 0.6762028336524963, "num_tokens": 11388470.0, "step": 249, "train/ce_loss": 0.8820322155952454 }, { "epoch": 0.04923867905873047, "step": 249, "train/sim_loss": 0.0020778775215148926 }, { "epoch": 0.04923867905873047, "step": 249, "train/total_loss": 0.09028109908103943 }, { "entropy": 6.009776592254639, "epoch": 0.04943642475776152, "mean_token_accuracy": 0.728205144405365, "num_tokens": 11425393.0, "step": 250, "train/ce_loss": 0.8835954666137695 }, { "epoch": 0.04943642475776152, "step": 250, "train/sim_loss": 0.0020388364791870117 }, { "epoch": 0.04943642475776152, "step": 250, "train/total_loss": 0.0903983861207962 }, { "entropy": 5.9451446533203125, "epoch": 0.04963417045679257, "mean_token_accuracy": 0.728977620601654, "num_tokens": 11468654.0, "step": 251, "train/ce_loss": 2.279505491256714 }, { "epoch": 0.04963417045679257, "step": 251, "train/sim_loss": 0.008562326431274414 }, { "epoch": 0.04963417045679257, "step": 251, "train/total_loss": 0.23651288449764252 }, { "entropy": 6.021491050720215, "epoch": 0.04983191615582361, "mean_token_accuracy": 0.7490752339363098, "num_tokens": 11526049.0, "step": 252, "train/ce_loss": 0.9156975150108337 }, { "epoch": 0.04983191615582361, "step": 252, "train/sim_loss": 0.002097010612487793 }, { "epoch": 0.04983191615582361, "step": 252, "train/total_loss": 0.09366676211357117 }, { "entropy": 5.818253517150879, "epoch": 0.050029661854854655, "mean_token_accuracy": 0.7422892451286316, "num_tokens": 11566686.0, "step": 253, "train/ce_loss": 1.047959804534912 }, { "epoch": 0.050029661854854655, "step": 253, "train/sim_loss": 0.001704096794128418 }, { "epoch": 0.050029661854854655, "step": 253, "train/total_loss": 0.10650008171796799 }, { "entropy": 5.6636152267456055, "epoch": 0.0502274075538857, "mean_token_accuracy": 0.7504244446754456, "num_tokens": 11600404.0, "step": 254, "train/ce_loss": 0.00017380574718117714 }, { "epoch": 0.0502274075538857, "step": 254, "train/sim_loss": 0.002116560935974121 }, { "epoch": 0.0502274075538857, "step": 254, "train/total_loss": 0.002133941510692239 }, { "entropy": 6.298640251159668, "epoch": 0.05042515325291675, "mean_token_accuracy": 0.6981580257415771, "num_tokens": 11656577.0, "step": 255, "train/ce_loss": 0.3462703824043274 }, { "epoch": 0.05042515325291675, "step": 255, "train/sim_loss": 0.0029479265213012695 }, { "epoch": 0.05042515325291675, "step": 255, "train/total_loss": 0.03757496550679207 }, { "entropy": 5.899532318115234, "epoch": 0.0506228989519478, "mean_token_accuracy": 0.7111255526542664, "num_tokens": 11707915.0, "step": 256, "train/ce_loss": 1.1099574565887451 }, { "epoch": 0.0506228989519478, "step": 256, "train/sim_loss": 0.00199282169342041 }, { "epoch": 0.0506228989519478, "step": 256, "train/total_loss": 0.11298856884241104 }, { "entropy": 5.848351955413818, "epoch": 0.05082064465097884, "mean_token_accuracy": 0.72721928358078, "num_tokens": 11757105.0, "step": 257, "train/ce_loss": 0.7929616570472717 }, { "epoch": 0.05082064465097884, "step": 257, "train/sim_loss": 0.0012878179550170898 }, { "epoch": 0.05082064465097884, "step": 257, "train/total_loss": 0.08058398216962814 }, { "entropy": 5.542336463928223, "epoch": 0.051018390350009886, "mean_token_accuracy": 0.7655657529830933, "num_tokens": 11787537.0, "step": 258, "train/ce_loss": 1.1375694274902344 }, { "epoch": 0.051018390350009886, "step": 258, "train/sim_loss": 0.0020345449447631836 }, { "epoch": 0.051018390350009886, "step": 258, "train/total_loss": 0.11579149216413498 }, { "entropy": 6.119966506958008, "epoch": 0.05121613604904093, "mean_token_accuracy": 0.7631798982620239, "num_tokens": 11837051.0, "step": 259, "train/ce_loss": 1.3085471391677856 }, { "epoch": 0.05121613604904093, "step": 259, "train/sim_loss": 0.0022999048233032227 }, { "epoch": 0.05121613604904093, "step": 259, "train/total_loss": 0.13315461575984955 }, { "epoch": 0.05141388174807198, "grad_norm": 0.5896568894386292, "learning_rate": 9.876842417647641e-06, "loss": 0.0902, "step": 260 }, { "entropy": 5.958221912384033, "epoch": 0.05141388174807198, "mean_token_accuracy": 0.7310855388641357, "num_tokens": 11883715.0, "step": 260, "train/ce_loss": 1.1801217794418335 }, { "epoch": 0.05141388174807198, "step": 260, "train/sim_loss": 0.0013539791107177734 }, { "epoch": 0.05141388174807198, "step": 260, "train/total_loss": 0.11936616152524948 }, { "entropy": 5.965867519378662, "epoch": 0.05161162744710303, "mean_token_accuracy": 0.7255370020866394, "num_tokens": 11938768.0, "step": 261, "train/ce_loss": 1.0407382249832153 }, { "epoch": 0.05161162744710303, "step": 261, "train/sim_loss": 0.002217888832092285 }, { "epoch": 0.05161162744710303, "step": 261, "train/total_loss": 0.10629171133041382 }, { "entropy": 5.955913543701172, "epoch": 0.05180937314613407, "mean_token_accuracy": 0.7188478112220764, "num_tokens": 11986270.0, "step": 262, "train/ce_loss": 0.0005326005048118532 }, { "epoch": 0.05180937314613407, "step": 262, "train/sim_loss": 0.0012707710266113281 }, { "epoch": 0.05180937314613407, "step": 262, "train/total_loss": 0.0013240310363471508 }, { "entropy": 5.890755653381348, "epoch": 0.052007118845165116, "mean_token_accuracy": 0.7373737096786499, "num_tokens": 12051102.0, "step": 263, "train/ce_loss": 0.8749343156814575 }, { "epoch": 0.052007118845165116, "step": 263, "train/sim_loss": 0.001895904541015625 }, { "epoch": 0.052007118845165116, "step": 263, "train/total_loss": 0.08938933908939362 }, { "entropy": 5.9732208251953125, "epoch": 0.052204864544196164, "mean_token_accuracy": 0.7333333492279053, "num_tokens": 12094221.0, "step": 264, "train/ce_loss": 0.5004105567932129 }, { "epoch": 0.052204864544196164, "step": 264, "train/sim_loss": 0.003201723098754883 }, { "epoch": 0.052204864544196164, "step": 264, "train/total_loss": 0.05324278026819229 }, { "entropy": 5.334837436676025, "epoch": 0.05240261024322721, "mean_token_accuracy": 0.7498565912246704, "num_tokens": 12130391.0, "step": 265, "train/ce_loss": 0.7336531281471252 }, { "epoch": 0.05240261024322721, "step": 265, "train/sim_loss": 0.001495659351348877 }, { "epoch": 0.05240261024322721, "step": 265, "train/total_loss": 0.07486097514629364 }, { "entropy": 5.526782035827637, "epoch": 0.05260035594225826, "mean_token_accuracy": 0.7653631567955017, "num_tokens": 12174262.0, "step": 266, "train/ce_loss": 1.1221624612808228 }, { "epoch": 0.05260035594225826, "step": 266, "train/sim_loss": 0.0025675296783447266 }, { "epoch": 0.05260035594225826, "step": 266, "train/total_loss": 0.11478377878665924 }, { "entropy": 5.969976425170898, "epoch": 0.0527981016412893, "mean_token_accuracy": 0.7200000286102295, "num_tokens": 12203725.0, "step": 267, "train/ce_loss": 0.9926760792732239 }, { "epoch": 0.0527981016412893, "step": 267, "train/sim_loss": 0.0021628141403198242 }, { "epoch": 0.0527981016412893, "step": 267, "train/total_loss": 0.10143042355775833 }, { "entropy": 5.886158466339111, "epoch": 0.05299584734032035, "mean_token_accuracy": 0.7078838348388672, "num_tokens": 12243849.0, "step": 268, "train/ce_loss": 0.00017611519433557987 }, { "epoch": 0.05299584734032035, "step": 268, "train/sim_loss": 0.002360224723815918 }, { "epoch": 0.05299584734032035, "step": 268, "train/total_loss": 0.0023778362665325403 }, { "entropy": 5.6253862380981445, "epoch": 0.053193593039351394, "mean_token_accuracy": 0.7533286809921265, "num_tokens": 12280589.0, "step": 269, "train/ce_loss": 0.7438363432884216 }, { "epoch": 0.053193593039351394, "step": 269, "train/sim_loss": 0.0026435256004333496 }, { "epoch": 0.053193593039351394, "step": 269, "train/total_loss": 0.07702716439962387 }, { "entropy": 5.864531517028809, "epoch": 0.05339133873838244, "mean_token_accuracy": 0.7315084338188171, "num_tokens": 12328621.0, "step": 270, "train/ce_loss": 1.057058572769165 }, { "epoch": 0.05339133873838244, "step": 270, "train/sim_loss": 0.0016450881958007812 }, { "epoch": 0.05339133873838244, "step": 270, "train/total_loss": 0.10735094547271729 }, { "entropy": 5.19373893737793, "epoch": 0.05358908443741349, "mean_token_accuracy": 0.7917320728302002, "num_tokens": 12353029.0, "step": 271, "train/ce_loss": 0.4839043915271759 }, { "epoch": 0.05358908443741349, "step": 271, "train/sim_loss": 0.0011012554168701172 }, { "epoch": 0.05358908443741349, "step": 271, "train/total_loss": 0.04949169605970383 }, { "entropy": 5.731775760650635, "epoch": 0.05378683013644453, "mean_token_accuracy": 0.692105233669281, "num_tokens": 12392405.0, "step": 272, "train/ce_loss": 1.8190622329711914 }, { "epoch": 0.05378683013644453, "step": 272, "train/sim_loss": 0.002282381057739258 }, { "epoch": 0.05378683013644453, "step": 272, "train/total_loss": 0.1841886043548584 }, { "entropy": 6.082909107208252, "epoch": 0.05398457583547558, "mean_token_accuracy": 0.7408786416053772, "num_tokens": 12450431.0, "step": 273, "train/ce_loss": 0.7111964821815491 }, { "epoch": 0.05398457583547558, "step": 273, "train/sim_loss": 0.002157151699066162 }, { "epoch": 0.05398457583547558, "step": 273, "train/total_loss": 0.07327680289745331 }, { "entropy": 6.1704277992248535, "epoch": 0.054182321534506625, "mean_token_accuracy": 0.7453675866127014, "num_tokens": 12490863.0, "step": 274, "train/ce_loss": 0.5982339382171631 }, { "epoch": 0.054182321534506625, "step": 274, "train/sim_loss": 0.0021625161170959473 }, { "epoch": 0.054182321534506625, "step": 274, "train/total_loss": 0.061985909938812256 }, { "entropy": 5.587904930114746, "epoch": 0.05438006723353767, "mean_token_accuracy": 0.739099383354187, "num_tokens": 12516018.0, "step": 275, "train/ce_loss": 1.0016882419586182 }, { "epoch": 0.05438006723353767, "step": 275, "train/sim_loss": 0.002178788185119629 }, { "epoch": 0.05438006723353767, "step": 275, "train/total_loss": 0.10234761238098145 }, { "entropy": 5.735330581665039, "epoch": 0.05457781293256872, "mean_token_accuracy": 0.7441176176071167, "num_tokens": 12558879.0, "step": 276, "train/ce_loss": 1.0810245275497437 }, { "epoch": 0.05457781293256872, "step": 276, "train/sim_loss": 0.001596689224243164 }, { "epoch": 0.05457781293256872, "step": 276, "train/total_loss": 0.10969914495944977 }, { "entropy": 5.743949890136719, "epoch": 0.05477555863159976, "mean_token_accuracy": 0.7487468719482422, "num_tokens": 12592679.0, "step": 277, "train/ce_loss": 0.9998674392700195 }, { "epoch": 0.05477555863159976, "step": 277, "train/sim_loss": 0.0014492273330688477 }, { "epoch": 0.05477555863159976, "step": 277, "train/total_loss": 0.10143597424030304 }, { "entropy": 5.798332214355469, "epoch": 0.05497330433063081, "mean_token_accuracy": 0.7839272022247314, "num_tokens": 12631996.0, "step": 278, "train/ce_loss": 1.0264854431152344 }, { "epoch": 0.05497330433063081, "step": 278, "train/sim_loss": 0.0023467540740966797 }, { "epoch": 0.05497330433063081, "step": 278, "train/total_loss": 0.10499530285596848 }, { "entropy": 6.0974836349487305, "epoch": 0.055171050029661856, "mean_token_accuracy": 0.7379032373428345, "num_tokens": 12681884.0, "step": 279, "train/ce_loss": 0.6904160976409912 }, { "epoch": 0.055171050029661856, "step": 279, "train/sim_loss": 0.0030857324600219727 }, { "epoch": 0.055171050029661856, "step": 279, "train/total_loss": 0.0721273422241211 }, { "epoch": 0.0553687957286929, "grad_norm": 0.5108522772789001, "learning_rate": 9.866950242358295e-06, "loss": 0.085, "step": 280 }, { "entropy": 6.142452239990234, "epoch": 0.0553687957286929, "mean_token_accuracy": 0.7491013407707214, "num_tokens": 12737554.0, "step": 280, "train/ce_loss": 1.2742245197296143 }, { "epoch": 0.0553687957286929, "step": 280, "train/sim_loss": 0.0017330050468444824 }, { "epoch": 0.0553687957286929, "step": 280, "train/total_loss": 0.1291554570198059 }, { "entropy": 6.096884727478027, "epoch": 0.055566541427723944, "mean_token_accuracy": 0.7319767475128174, "num_tokens": 12789597.0, "step": 281, "train/ce_loss": 1.268289566040039 }, { "epoch": 0.055566541427723944, "step": 281, "train/sim_loss": 0.0016939640045166016 }, { "epoch": 0.055566541427723944, "step": 281, "train/total_loss": 0.12852291762828827 }, { "entropy": 6.105064392089844, "epoch": 0.05576428712675499, "mean_token_accuracy": 0.6846104860305786, "num_tokens": 12849712.0, "step": 282, "train/ce_loss": 0.9685271382331848 }, { "epoch": 0.05576428712675499, "step": 282, "train/sim_loss": 0.002525627613067627 }, { "epoch": 0.05576428712675499, "step": 282, "train/total_loss": 0.09937833994626999 }, { "entropy": 5.959161758422852, "epoch": 0.05596203282578604, "mean_token_accuracy": 0.7590939998626709, "num_tokens": 12892363.0, "step": 283, "train/ce_loss": 0.8132340908050537 }, { "epoch": 0.05596203282578604, "step": 283, "train/sim_loss": 0.0024794340133666992 }, { "epoch": 0.05596203282578604, "step": 283, "train/total_loss": 0.08380284160375595 }, { "entropy": 5.726922512054443, "epoch": 0.056159778524817086, "mean_token_accuracy": 0.7754266262054443, "num_tokens": 12942314.0, "step": 284, "train/ce_loss": 0.0001389797980664298 }, { "epoch": 0.056159778524817086, "step": 284, "train/sim_loss": 0.0017485618591308594 }, { "epoch": 0.056159778524817086, "step": 284, "train/total_loss": 0.0017624598694965243 }, { "entropy": 5.83951473236084, "epoch": 0.056357524223848134, "mean_token_accuracy": 0.7591241002082825, "num_tokens": 12984801.0, "step": 285, "train/ce_loss": 1.5241178274154663 }, { "epoch": 0.056357524223848134, "step": 285, "train/sim_loss": 0.0022482872009277344 }, { "epoch": 0.056357524223848134, "step": 285, "train/total_loss": 0.15466007590293884 }, { "entropy": 6.160141944885254, "epoch": 0.056555269922879174, "mean_token_accuracy": 0.7547274827957153, "num_tokens": 13038400.0, "step": 286, "train/ce_loss": 0.00011649807856883854 }, { "epoch": 0.056555269922879174, "step": 286, "train/sim_loss": 0.0018707513809204102 }, { "epoch": 0.056555269922879174, "step": 286, "train/total_loss": 0.0018824011785909534 }, { "entropy": 6.242450714111328, "epoch": 0.05675301562191022, "mean_token_accuracy": 0.678463876247406, "num_tokens": 13090582.0, "step": 287, "train/ce_loss": 1.0219491720199585 }, { "epoch": 0.05675301562191022, "step": 287, "train/sim_loss": 0.0021335482597351074 }, { "epoch": 0.05675301562191022, "step": 287, "train/total_loss": 0.1043284684419632 }, { "entropy": 5.902544975280762, "epoch": 0.05695076132094127, "mean_token_accuracy": 0.7410387992858887, "num_tokens": 13125570.0, "step": 288, "train/ce_loss": 0.6679409146308899 }, { "epoch": 0.05695076132094127, "step": 288, "train/sim_loss": 0.002152740955352783 }, { "epoch": 0.05695076132094127, "step": 288, "train/total_loss": 0.06894683092832565 }, { "entropy": 5.653456687927246, "epoch": 0.05714850701997232, "mean_token_accuracy": 0.7258732914924622, "num_tokens": 13162145.0, "step": 289, "train/ce_loss": 0.5031962394714355 }, { "epoch": 0.05714850701997232, "step": 289, "train/sim_loss": 0.002287924289703369 }, { "epoch": 0.05714850701997232, "step": 289, "train/total_loss": 0.052607547491788864 }, { "entropy": 5.51302433013916, "epoch": 0.057346252719003364, "mean_token_accuracy": 0.7627336978912354, "num_tokens": 13210249.0, "step": 290, "train/ce_loss": 0.00010999407095368952 }, { "epoch": 0.057346252719003364, "step": 290, "train/sim_loss": 0.0015815496444702148 }, { "epoch": 0.057346252719003364, "step": 290, "train/total_loss": 0.001592549029737711 }, { "entropy": 6.196866989135742, "epoch": 0.057543998418034405, "mean_token_accuracy": 0.7023255825042725, "num_tokens": 13246574.0, "step": 291, "train/ce_loss": 0.6263709664344788 }, { "epoch": 0.057543998418034405, "step": 291, "train/sim_loss": 0.0023353099822998047 }, { "epoch": 0.057543998418034405, "step": 291, "train/total_loss": 0.0649724081158638 }, { "entropy": 5.841832160949707, "epoch": 0.05774174411706545, "mean_token_accuracy": 0.7458893656730652, "num_tokens": 13282396.0, "step": 292, "train/ce_loss": 1.175521969795227 }, { "epoch": 0.05774174411706545, "step": 292, "train/sim_loss": 0.0010464191436767578 }, { "epoch": 0.05774174411706545, "step": 292, "train/total_loss": 0.11859861761331558 }, { "entropy": 5.576370716094971, "epoch": 0.0579394898160965, "mean_token_accuracy": 0.7662721872329712, "num_tokens": 13304905.0, "step": 293, "train/ce_loss": 0.7450346946716309 }, { "epoch": 0.0579394898160965, "step": 293, "train/sim_loss": 0.0013440847396850586 }, { "epoch": 0.0579394898160965, "step": 293, "train/total_loss": 0.0758475586771965 }, { "entropy": 5.783541679382324, "epoch": 0.05813723551512755, "mean_token_accuracy": 0.7494394779205322, "num_tokens": 13336413.0, "step": 294, "train/ce_loss": 1.2474857568740845 }, { "epoch": 0.05813723551512755, "step": 294, "train/sim_loss": 0.0022690296173095703 }, { "epoch": 0.05813723551512755, "step": 294, "train/total_loss": 0.12701761722564697 }, { "entropy": 5.863532066345215, "epoch": 0.058334981214158595, "mean_token_accuracy": 0.7401315569877625, "num_tokens": 13375420.0, "step": 295, "train/ce_loss": 1.2076274156570435 }, { "epoch": 0.058334981214158595, "step": 295, "train/sim_loss": 0.0014764070510864258 }, { "epoch": 0.058334981214158595, "step": 295, "train/total_loss": 0.12223915010690689 }, { "entropy": 6.113058090209961, "epoch": 0.058532726913189635, "mean_token_accuracy": 0.7369808554649353, "num_tokens": 13418064.0, "step": 296, "train/ce_loss": 1.1633683443069458 }, { "epoch": 0.058532726913189635, "step": 296, "train/sim_loss": 0.0016730427742004395 }, { "epoch": 0.058532726913189635, "step": 296, "train/total_loss": 0.11800988018512726 }, { "entropy": 6.340071678161621, "epoch": 0.05873047261222068, "mean_token_accuracy": 0.7265116572380066, "num_tokens": 13462515.0, "step": 297, "train/ce_loss": 0.00016325325123034418 }, { "epoch": 0.05873047261222068, "step": 297, "train/sim_loss": 0.001781463623046875 }, { "epoch": 0.05873047261222068, "step": 297, "train/total_loss": 0.0017977888928726315 }, { "entropy": 6.041450500488281, "epoch": 0.05892821831125173, "mean_token_accuracy": 0.7169139385223389, "num_tokens": 13509172.0, "step": 298, "train/ce_loss": 0.5503990054130554 }, { "epoch": 0.05892821831125173, "step": 298, "train/sim_loss": 0.002329230308532715 }, { "epoch": 0.05892821831125173, "step": 298, "train/total_loss": 0.057369131594896317 }, { "entropy": 6.087851524353027, "epoch": 0.05912596401028278, "mean_token_accuracy": 0.7258254885673523, "num_tokens": 13571343.0, "step": 299, "train/ce_loss": 9.420052811037749e-05 }, { "epoch": 0.05912596401028278, "step": 299, "train/sim_loss": 0.001353144645690918 }, { "epoch": 0.05912596401028278, "step": 299, "train/total_loss": 0.0013625647407025099 }, { "epoch": 0.059323709709313825, "grad_norm": 0.5439019203186035, "learning_rate": 9.857058067068949e-06, "loss": 0.0864, "step": 300 }, { "entropy": 6.264642715454102, "epoch": 0.059323709709313825, "mean_token_accuracy": 0.7396256923675537, "num_tokens": 13636809.0, "step": 300, "train/ce_loss": 0.8735285997390747 }, { "epoch": 0.059323709709313825, "step": 300, "train/sim_loss": 0.0011110901832580566 }, { "epoch": 0.059323709709313825, "step": 300, "train/total_loss": 0.08846395462751389 }, { "entropy": 5.732763290405273, "epoch": 0.059521455408344866, "mean_token_accuracy": 0.7995391488075256, "num_tokens": 13676197.0, "step": 301, "train/ce_loss": 0.0001247594627784565 }, { "epoch": 0.059521455408344866, "step": 301, "train/sim_loss": 0.0012198686599731445 }, { "epoch": 0.059521455408344866, "step": 301, "train/total_loss": 0.0012323446571826935 }, { "entropy": 5.901372909545898, "epoch": 0.05971920110737591, "mean_token_accuracy": 0.7047913670539856, "num_tokens": 13731485.0, "step": 302, "train/ce_loss": 1.2384577989578247 }, { "epoch": 0.05971920110737591, "step": 302, "train/sim_loss": 0.0013376474380493164 }, { "epoch": 0.05971920110737591, "step": 302, "train/total_loss": 0.12518343329429626 }, { "entropy": 6.319486618041992, "epoch": 0.05991694680640696, "mean_token_accuracy": 0.7421307563781738, "num_tokens": 13778428.0, "step": 303, "train/ce_loss": 1.8248335123062134 }, { "epoch": 0.05991694680640696, "step": 303, "train/sim_loss": 0.0032384395599365234 }, { "epoch": 0.05991694680640696, "step": 303, "train/total_loss": 0.18572179973125458 }, { "entropy": 5.9470953941345215, "epoch": 0.06011469250543801, "mean_token_accuracy": 0.7545030117034912, "num_tokens": 13814233.0, "step": 304, "train/ce_loss": 0.6549165844917297 }, { "epoch": 0.06011469250543801, "step": 304, "train/sim_loss": 0.0019332170486450195 }, { "epoch": 0.06011469250543801, "step": 304, "train/total_loss": 0.06742487847805023 }, { "entropy": 6.249849319458008, "epoch": 0.060312438204469056, "mean_token_accuracy": 0.740963876247406, "num_tokens": 13867855.0, "step": 305, "train/ce_loss": 0.6073223948478699 }, { "epoch": 0.060312438204469056, "step": 305, "train/sim_loss": 0.0026482343673706055 }, { "epoch": 0.060312438204469056, "step": 305, "train/total_loss": 0.06338047981262207 }, { "entropy": 5.8195977210998535, "epoch": 0.060510183903500096, "mean_token_accuracy": 0.7681512832641602, "num_tokens": 13915114.0, "step": 306, "train/ce_loss": 0.3583773672580719 }, { "epoch": 0.060510183903500096, "step": 306, "train/sim_loss": 0.0014423131942749023 }, { "epoch": 0.060510183903500096, "step": 306, "train/total_loss": 0.03728004917502403 }, { "entropy": 6.427165985107422, "epoch": 0.060707929602531144, "mean_token_accuracy": 0.7174603343009949, "num_tokens": 13969806.0, "step": 307, "train/ce_loss": 1.5441391468048096 }, { "epoch": 0.060707929602531144, "step": 307, "train/sim_loss": 0.0013355016708374023 }, { "epoch": 0.060707929602531144, "step": 307, "train/total_loss": 0.15574942529201508 }, { "entropy": 6.313803195953369, "epoch": 0.06090567530156219, "mean_token_accuracy": 0.7409090995788574, "num_tokens": 14028598.0, "step": 308, "train/ce_loss": 0.00019275076920166612 }, { "epoch": 0.06090567530156219, "step": 308, "train/sim_loss": 0.0011289119720458984 }, { "epoch": 0.06090567530156219, "step": 308, "train/total_loss": 0.0011481870897114277 }, { "entropy": 5.752597332000732, "epoch": 0.06110342100059324, "mean_token_accuracy": 0.7355738878250122, "num_tokens": 14066023.0, "step": 309, "train/ce_loss": 1.304394006729126 }, { "epoch": 0.06110342100059324, "step": 309, "train/sim_loss": 0.0024842023849487305 }, { "epoch": 0.06110342100059324, "step": 309, "train/total_loss": 0.13292360305786133 }, { "entropy": 6.162275314331055, "epoch": 0.061301166699624286, "mean_token_accuracy": 0.7623604536056519, "num_tokens": 14097171.0, "step": 310, "train/ce_loss": 0.9813752174377441 }, { "epoch": 0.061301166699624286, "step": 310, "train/sim_loss": 0.0012891292572021484 }, { "epoch": 0.061301166699624286, "step": 310, "train/total_loss": 0.09942664951086044 }, { "entropy": 6.18077278137207, "epoch": 0.06149891239865533, "mean_token_accuracy": 0.7569745779037476, "num_tokens": 14138928.0, "step": 311, "train/ce_loss": 1.078444004058838 }, { "epoch": 0.06149891239865533, "step": 311, "train/sim_loss": 0.0013306736946105957 }, { "epoch": 0.06149891239865533, "step": 311, "train/total_loss": 0.10917507857084274 }, { "entropy": 5.914327621459961, "epoch": 0.061696658097686374, "mean_token_accuracy": 0.7730870842933655, "num_tokens": 14178838.0, "step": 312, "train/ce_loss": 0.6977765560150146 }, { "epoch": 0.061696658097686374, "step": 312, "train/sim_loss": 0.001223444938659668 }, { "epoch": 0.061696658097686374, "step": 312, "train/total_loss": 0.07100110501050949 }, { "entropy": 5.5692033767700195, "epoch": 0.06189440379671742, "mean_token_accuracy": 0.7747068405151367, "num_tokens": 14213201.0, "step": 313, "train/ce_loss": 0.00013404346827883273 }, { "epoch": 0.06189440379671742, "step": 313, "train/sim_loss": 0.0012976527214050293 }, { "epoch": 0.06189440379671742, "step": 313, "train/total_loss": 0.0013110570143908262 }, { "entropy": 6.05233907699585, "epoch": 0.06209214949574847, "mean_token_accuracy": 0.7868852615356445, "num_tokens": 14243891.0, "step": 314, "train/ce_loss": 0.6892787218093872 }, { "epoch": 0.06209214949574847, "step": 314, "train/sim_loss": 0.0020035505294799805 }, { "epoch": 0.06209214949574847, "step": 314, "train/total_loss": 0.07093142718076706 }, { "entropy": 5.945684432983398, "epoch": 0.06228989519477952, "mean_token_accuracy": 0.7172710299491882, "num_tokens": 14277227.0, "step": 315, "train/ce_loss": 0.5299190878868103 }, { "epoch": 0.06228989519477952, "step": 315, "train/sim_loss": 0.0011898279190063477 }, { "epoch": 0.06228989519477952, "step": 315, "train/total_loss": 0.05418173596262932 }, { "entropy": 5.618294715881348, "epoch": 0.06248764089381056, "mean_token_accuracy": 0.7681499123573303, "num_tokens": 14306133.0, "step": 316, "train/ce_loss": 0.00013580457016360015 }, { "epoch": 0.06248764089381056, "step": 316, "train/sim_loss": 0.001158595085144043 }, { "epoch": 0.06248764089381056, "step": 316, "train/total_loss": 0.001172175514511764 }, { "entropy": 6.073228359222412, "epoch": 0.0626853865928416, "mean_token_accuracy": 0.7142857313156128, "num_tokens": 14364588.0, "step": 317, "train/ce_loss": 0.9131065011024475 }, { "epoch": 0.0626853865928416, "step": 317, "train/sim_loss": 0.002227962017059326 }, { "epoch": 0.0626853865928416, "step": 317, "train/total_loss": 0.09353861212730408 }, { "entropy": 5.877865314483643, "epoch": 0.06288313229187265, "mean_token_accuracy": 0.7689573168754578, "num_tokens": 14414408.0, "step": 318, "train/ce_loss": 9.880107245407999e-05 }, { "epoch": 0.06288313229187265, "step": 318, "train/sim_loss": 0.0018352866172790527 }, { "epoch": 0.06288313229187265, "step": 318, "train/total_loss": 0.0018451666692271829 }, { "entropy": 6.043904781341553, "epoch": 0.0630808779909037, "mean_token_accuracy": 0.6984984874725342, "num_tokens": 14461167.0, "step": 319, "train/ce_loss": 0.6597232222557068 }, { "epoch": 0.0630808779909037, "step": 319, "train/sim_loss": 0.001633286476135254 }, { "epoch": 0.0630808779909037, "step": 319, "train/total_loss": 0.06760560721158981 }, { "epoch": 0.06327862368993474, "grad_norm": 0.5783921480178833, "learning_rate": 9.847165891779603e-06, "loss": 0.0833, "step": 320 }, { "entropy": 6.382856369018555, "epoch": 0.06327862368993474, "mean_token_accuracy": 0.7016011476516724, "num_tokens": 14517055.0, "step": 320, "train/ce_loss": 0.00015250418800860643 }, { "epoch": 0.06327862368993474, "step": 320, "train/sim_loss": 0.0016808509826660156 }, { "epoch": 0.06327862368993474, "step": 320, "train/total_loss": 0.001696101389825344 }, { "entropy": 6.227334499359131, "epoch": 0.0634763693889658, "mean_token_accuracy": 0.7280150651931763, "num_tokens": 14565975.0, "step": 321, "train/ce_loss": 0.75583815574646 }, { "epoch": 0.0634763693889658, "step": 321, "train/sim_loss": 0.001871943473815918 }, { "epoch": 0.0634763693889658, "step": 321, "train/total_loss": 0.07745575904846191 }, { "entropy": 6.435788154602051, "epoch": 0.06367411508799684, "mean_token_accuracy": 0.7305699586868286, "num_tokens": 14617187.0, "step": 322, "train/ce_loss": 0.5745871067047119 }, { "epoch": 0.06367411508799684, "step": 322, "train/sim_loss": 0.001839280128479004 }, { "epoch": 0.06367411508799684, "step": 322, "train/total_loss": 0.059297990053892136 }, { "entropy": 5.939131736755371, "epoch": 0.06387186078702788, "mean_token_accuracy": 0.7696759104728699, "num_tokens": 14665269.0, "step": 323, "train/ce_loss": 0.4711361527442932 }, { "epoch": 0.06387186078702788, "step": 323, "train/sim_loss": 0.0011136531829833984 }, { "epoch": 0.06387186078702788, "step": 323, "train/total_loss": 0.04822726920247078 }, { "entropy": 5.787720680236816, "epoch": 0.06406960648605893, "mean_token_accuracy": 0.7085227370262146, "num_tokens": 14705892.0, "step": 324, "train/ce_loss": 0.8080941438674927 }, { "epoch": 0.06406960648605893, "step": 324, "train/sim_loss": 0.003852546215057373 }, { "epoch": 0.06406960648605893, "step": 324, "train/total_loss": 0.08466196060180664 }, { "entropy": 5.824300289154053, "epoch": 0.06426735218508997, "mean_token_accuracy": 0.7172533273696899, "num_tokens": 14761982.0, "step": 325, "train/ce_loss": 0.5543658137321472 }, { "epoch": 0.06426735218508997, "step": 325, "train/sim_loss": 0.001959860324859619 }, { "epoch": 0.06426735218508997, "step": 325, "train/total_loss": 0.05739644169807434 }, { "entropy": 6.001358985900879, "epoch": 0.06446509788412103, "mean_token_accuracy": 0.7427884340286255, "num_tokens": 14808296.0, "step": 326, "train/ce_loss": 0.00013316630793269724 }, { "epoch": 0.06446509788412103, "step": 326, "train/sim_loss": 0.0015674829483032227 }, { "epoch": 0.06446509788412103, "step": 326, "train/total_loss": 0.001580799580551684 }, { "entropy": 5.867360591888428, "epoch": 0.06466284358315207, "mean_token_accuracy": 0.7153284549713135, "num_tokens": 14850728.0, "step": 327, "train/ce_loss": 1.6962165832519531 }, { "epoch": 0.06466284358315207, "step": 327, "train/sim_loss": 0.0013698339462280273 }, { "epoch": 0.06466284358315207, "step": 327, "train/total_loss": 0.17099149525165558 }, { "entropy": 5.964676856994629, "epoch": 0.0648605892821831, "mean_token_accuracy": 0.7357819676399231, "num_tokens": 14902592.0, "step": 328, "train/ce_loss": 0.5692281723022461 }, { "epoch": 0.0648605892821831, "step": 328, "train/sim_loss": 0.0013669729232788086 }, { "epoch": 0.0648605892821831, "step": 328, "train/total_loss": 0.0582897923886776 }, { "entropy": 6.3392333984375, "epoch": 0.06505833498121416, "mean_token_accuracy": 0.7177157402038574, "num_tokens": 14979007.0, "step": 329, "train/ce_loss": 1.2037159204483032 }, { "epoch": 0.06505833498121416, "step": 329, "train/sim_loss": 0.0012908577919006348 }, { "epoch": 0.06505833498121416, "step": 329, "train/total_loss": 0.1216624528169632 }, { "entropy": 5.999922752380371, "epoch": 0.0652560806802452, "mean_token_accuracy": 0.7093750238418579, "num_tokens": 15020443.0, "step": 330, "train/ce_loss": 0.6855100989341736 }, { "epoch": 0.0652560806802452, "step": 330, "train/sim_loss": 0.0019969940185546875 }, { "epoch": 0.0652560806802452, "step": 330, "train/total_loss": 0.07054800540208817 }, { "entropy": 6.044669151306152, "epoch": 0.06545382637927626, "mean_token_accuracy": 0.725242018699646, "num_tokens": 15064602.0, "step": 331, "train/ce_loss": 0.847236692905426 }, { "epoch": 0.06545382637927626, "step": 331, "train/sim_loss": 0.0018666982650756836 }, { "epoch": 0.06545382637927626, "step": 331, "train/total_loss": 0.08659037202596664 }, { "entropy": 5.606973171234131, "epoch": 0.0656515720783073, "mean_token_accuracy": 0.7402952313423157, "num_tokens": 15115658.0, "step": 332, "train/ce_loss": 1.1097733974456787 }, { "epoch": 0.0656515720783073, "step": 332, "train/sim_loss": 0.002302408218383789 }, { "epoch": 0.0656515720783073, "step": 332, "train/total_loss": 0.11327975243330002 }, { "entropy": 5.975988864898682, "epoch": 0.06584931777733834, "mean_token_accuracy": 0.7722263336181641, "num_tokens": 15165994.0, "step": 333, "train/ce_loss": 1.0870298147201538 }, { "epoch": 0.06584931777733834, "step": 333, "train/sim_loss": 0.0015590786933898926 }, { "epoch": 0.06584931777733834, "step": 333, "train/total_loss": 0.11026205867528915 }, { "entropy": 5.99609375, "epoch": 0.06604706347636939, "mean_token_accuracy": 0.7440347075462341, "num_tokens": 15207104.0, "step": 334, "train/ce_loss": 0.01861746609210968 }, { "epoch": 0.06604706347636939, "step": 334, "train/sim_loss": 0.0015545487403869629 }, { "epoch": 0.06604706347636939, "step": 334, "train/total_loss": 0.0034162953961640596 }, { "entropy": 5.903363227844238, "epoch": 0.06624480917540043, "mean_token_accuracy": 0.7365710139274597, "num_tokens": 15247275.0, "step": 335, "train/ce_loss": 1.2350883483886719 }, { "epoch": 0.06624480917540043, "step": 335, "train/sim_loss": 0.001577138900756836 }, { "epoch": 0.06624480917540043, "step": 335, "train/total_loss": 0.1250859797000885 }, { "entropy": 5.901702404022217, "epoch": 0.06644255487443149, "mean_token_accuracy": 0.7422600388526917, "num_tokens": 15290524.0, "step": 336, "train/ce_loss": 1.1615396738052368 }, { "epoch": 0.06644255487443149, "step": 336, "train/sim_loss": 0.003594636917114258 }, { "epoch": 0.06644255487443149, "step": 336, "train/total_loss": 0.11974860727787018 }, { "entropy": 6.2594380378723145, "epoch": 0.06664030057346253, "mean_token_accuracy": 0.7089508175849915, "num_tokens": 15339581.0, "step": 337, "train/ce_loss": 1.1257671117782593 }, { "epoch": 0.06664030057346253, "step": 337, "train/sim_loss": 0.002087116241455078 }, { "epoch": 0.06664030057346253, "step": 337, "train/total_loss": 0.11466383188962936 }, { "entropy": 5.9496307373046875, "epoch": 0.06683804627249357, "mean_token_accuracy": 0.7795857787132263, "num_tokens": 15379658.0, "step": 338, "train/ce_loss": 0.8622129559516907 }, { "epoch": 0.06683804627249357, "step": 338, "train/sim_loss": 0.0021062493324279785 }, { "epoch": 0.06683804627249357, "step": 338, "train/total_loss": 0.0883275493979454 }, { "entropy": 6.164770126342773, "epoch": 0.06703579197152462, "mean_token_accuracy": 0.7254408001899719, "num_tokens": 15426225.0, "step": 339, "train/ce_loss": 0.6297210454940796 }, { "epoch": 0.06703579197152462, "step": 339, "train/sim_loss": 0.0010211467742919922 }, { "epoch": 0.06703579197152462, "step": 339, "train/total_loss": 0.06399325281381607 }, { "epoch": 0.06723353767055566, "grad_norm": 0.5418144464492798, "learning_rate": 9.837273716490257e-06, "loss": 0.0872, "step": 340 }, { "entropy": 6.28240442276001, "epoch": 0.06723353767055566, "mean_token_accuracy": 0.7107709050178528, "num_tokens": 15474020.0, "step": 340, "train/ce_loss": 1.0083770751953125 }, { "epoch": 0.06723353767055566, "step": 340, "train/sim_loss": 0.0015277862548828125 }, { "epoch": 0.06723353767055566, "step": 340, "train/total_loss": 0.10236549377441406 }, { "entropy": 6.137720108032227, "epoch": 0.06743128336958672, "mean_token_accuracy": 0.7497446537017822, "num_tokens": 15533559.0, "step": 341, "train/ce_loss": 2.276036262512207 }, { "epoch": 0.06743128336958672, "step": 341, "train/sim_loss": 0.002442479133605957 }, { "epoch": 0.06743128336958672, "step": 341, "train/total_loss": 0.2300461083650589 }, { "entropy": 5.962731838226318, "epoch": 0.06762902906861776, "mean_token_accuracy": 0.741167426109314, "num_tokens": 15580850.0, "step": 342, "train/ce_loss": 1.0314046144485474 }, { "epoch": 0.06762902906861776, "step": 342, "train/sim_loss": 0.001607060432434082 }, { "epoch": 0.06762902906861776, "step": 342, "train/total_loss": 0.10474752634763718 }, { "entropy": 6.27766752243042, "epoch": 0.0678267747676488, "mean_token_accuracy": 0.7351804375648499, "num_tokens": 15625586.0, "step": 343, "train/ce_loss": 0.7232856154441833 }, { "epoch": 0.0678267747676488, "step": 343, "train/sim_loss": 0.0009726285934448242 }, { "epoch": 0.0678267747676488, "step": 343, "train/total_loss": 0.07330118864774704 }, { "entropy": 5.876796245574951, "epoch": 0.06802452046667985, "mean_token_accuracy": 0.7566001415252686, "num_tokens": 15681239.0, "step": 344, "train/ce_loss": 1.3944395780563354 }, { "epoch": 0.06802452046667985, "step": 344, "train/sim_loss": 0.0033742189407348633 }, { "epoch": 0.06802452046667985, "step": 344, "train/total_loss": 0.14281818270683289 }, { "entropy": 6.229395866394043, "epoch": 0.0682222661657109, "mean_token_accuracy": 0.7417530417442322, "num_tokens": 15733631.0, "step": 345, "train/ce_loss": 0.0001287110208068043 }, { "epoch": 0.0682222661657109, "step": 345, "train/sim_loss": 0.0009920001029968262 }, { "epoch": 0.0682222661657109, "step": 345, "train/total_loss": 0.0010048712138086557 }, { "entropy": 5.914237976074219, "epoch": 0.06842001186474195, "mean_token_accuracy": 0.7313546538352966, "num_tokens": 15784667.0, "step": 346, "train/ce_loss": 1.0020078420639038 }, { "epoch": 0.06842001186474195, "step": 346, "train/sim_loss": 0.0010837316513061523 }, { "epoch": 0.06842001186474195, "step": 346, "train/total_loss": 0.10128451883792877 }, { "entropy": 6.154259204864502, "epoch": 0.06861775756377299, "mean_token_accuracy": 0.7324737906455994, "num_tokens": 15808461.0, "step": 347, "train/ce_loss": 1.4126274585723877 }, { "epoch": 0.06861775756377299, "step": 347, "train/sim_loss": 0.0013793110847473145 }, { "epoch": 0.06861775756377299, "step": 347, "train/total_loss": 0.1426420658826828 }, { "entropy": 6.447692394256592, "epoch": 0.06881550326280403, "mean_token_accuracy": 0.7140340805053711, "num_tokens": 15869103.0, "step": 348, "train/ce_loss": 0.7733914852142334 }, { "epoch": 0.06881550326280403, "step": 348, "train/sim_loss": 0.0015562176704406738 }, { "epoch": 0.06881550326280403, "step": 348, "train/total_loss": 0.07889536768198013 }, { "entropy": 6.218447208404541, "epoch": 0.06901324896183508, "mean_token_accuracy": 0.7074122428894043, "num_tokens": 15929338.0, "step": 349, "train/ce_loss": 1.8632811307907104 }, { "epoch": 0.06901324896183508, "step": 349, "train/sim_loss": 0.0022896528244018555 }, { "epoch": 0.06901324896183508, "step": 349, "train/total_loss": 0.1886177659034729 }, { "entropy": 6.150102615356445, "epoch": 0.06921099466086612, "mean_token_accuracy": 0.7139272093772888, "num_tokens": 15973462.0, "step": 350, "train/ce_loss": 0.00010025887604570016 }, { "epoch": 0.06921099466086612, "step": 350, "train/sim_loss": 0.0021195411682128906 }, { "epoch": 0.06921099466086612, "step": 350, "train/total_loss": 0.00212956708855927 }, { "entropy": 6.097103595733643, "epoch": 0.06940874035989718, "mean_token_accuracy": 0.7563840746879578, "num_tokens": 16020880.0, "step": 351, "train/ce_loss": 0.4633747935295105 }, { "epoch": 0.06940874035989718, "step": 351, "train/sim_loss": 0.0008648037910461426 }, { "epoch": 0.06940874035989718, "step": 351, "train/total_loss": 0.04720228537917137 }, { "entropy": 6.250653266906738, "epoch": 0.06960648605892822, "mean_token_accuracy": 0.7070193290710449, "num_tokens": 16068478.0, "step": 352, "train/ce_loss": 1.1940531730651855 }, { "epoch": 0.06960648605892822, "step": 352, "train/sim_loss": 0.0016661882400512695 }, { "epoch": 0.06960648605892822, "step": 352, "train/total_loss": 0.12107151001691818 }, { "entropy": 6.2395548820495605, "epoch": 0.06980423175795926, "mean_token_accuracy": 0.7440597414970398, "num_tokens": 16112290.0, "step": 353, "train/ce_loss": 1.2748665809631348 }, { "epoch": 0.06980423175795926, "step": 353, "train/sim_loss": 0.0020273327827453613 }, { "epoch": 0.06980423175795926, "step": 353, "train/total_loss": 0.12951399385929108 }, { "entropy": 6.003415107727051, "epoch": 0.07000197745699031, "mean_token_accuracy": 0.7471675872802734, "num_tokens": 16159412.0, "step": 354, "train/ce_loss": 1.0085188150405884 }, { "epoch": 0.07000197745699031, "step": 354, "train/sim_loss": 0.001086592674255371 }, { "epoch": 0.07000197745699031, "step": 354, "train/total_loss": 0.10193847864866257 }, { "entropy": 6.333674430847168, "epoch": 0.07019972315602135, "mean_token_accuracy": 0.7337962985038757, "num_tokens": 16201374.0, "step": 355, "train/ce_loss": 0.6348781585693359 }, { "epoch": 0.07019972315602135, "step": 355, "train/sim_loss": 0.0015856027603149414 }, { "epoch": 0.07019972315602135, "step": 355, "train/total_loss": 0.0650734230875969 }, { "entropy": 6.411984443664551, "epoch": 0.07039746885505241, "mean_token_accuracy": 0.7186030745506287, "num_tokens": 16254007.0, "step": 356, "train/ce_loss": 1.0449293851852417 }, { "epoch": 0.07039746885505241, "step": 356, "train/sim_loss": 0.0014511942863464355 }, { "epoch": 0.07039746885505241, "step": 356, "train/total_loss": 0.10594413429498672 }, { "entropy": 5.8798394203186035, "epoch": 0.07059521455408345, "mean_token_accuracy": 0.7560647130012512, "num_tokens": 16285297.0, "step": 357, "train/ce_loss": 0.6497535109519958 }, { "epoch": 0.07059521455408345, "step": 357, "train/sim_loss": 0.001465439796447754 }, { "epoch": 0.07059521455408345, "step": 357, "train/total_loss": 0.06644079089164734 }, { "entropy": 6.338820457458496, "epoch": 0.07079296025311449, "mean_token_accuracy": 0.7133492231369019, "num_tokens": 16336367.0, "step": 358, "train/ce_loss": 0.966612696647644 }, { "epoch": 0.07079296025311449, "step": 358, "train/sim_loss": 0.0014145374298095703 }, { "epoch": 0.07079296025311449, "step": 358, "train/total_loss": 0.09807580709457397 }, { "entropy": 6.192384719848633, "epoch": 0.07099070595214554, "mean_token_accuracy": 0.7148514986038208, "num_tokens": 16388058.0, "step": 359, "train/ce_loss": 0.8275354504585266 }, { "epoch": 0.07099070595214554, "step": 359, "train/sim_loss": 0.0012810230255126953 }, { "epoch": 0.07099070595214554, "step": 359, "train/total_loss": 0.08403456956148148 }, { "epoch": 0.07118845165117658, "grad_norm": 0.5341076850891113, "learning_rate": 9.82738154120091e-06, "loss": 0.0885, "step": 360 }, { "entropy": 5.922525882720947, "epoch": 0.07118845165117658, "mean_token_accuracy": 0.7432113289833069, "num_tokens": 16429697.0, "step": 360, "train/ce_loss": 0.8996303677558899 }, { "epoch": 0.07118845165117658, "step": 360, "train/sim_loss": 0.003126204013824463 }, { "epoch": 0.07118845165117658, "step": 360, "train/total_loss": 0.09308924525976181 }, { "entropy": 6.162260055541992, "epoch": 0.07138619735020764, "mean_token_accuracy": 0.7539135813713074, "num_tokens": 16476931.0, "step": 361, "train/ce_loss": 0.6976432204246521 }, { "epoch": 0.07138619735020764, "step": 361, "train/sim_loss": 0.0008309483528137207 }, { "epoch": 0.07138619735020764, "step": 361, "train/total_loss": 0.07059527188539505 }, { "entropy": 6.1700334548950195, "epoch": 0.07158394304923868, "mean_token_accuracy": 0.7542856931686401, "num_tokens": 16542445.0, "step": 362, "train/ce_loss": 0.9747052788734436 }, { "epoch": 0.07158394304923868, "step": 362, "train/sim_loss": 0.0011420249938964844 }, { "epoch": 0.07158394304923868, "step": 362, "train/total_loss": 0.09861255437135696 }, { "entropy": 6.033313274383545, "epoch": 0.07178168874826972, "mean_token_accuracy": 0.7295660972595215, "num_tokens": 16575611.0, "step": 363, "train/ce_loss": 1.6613017320632935 }, { "epoch": 0.07178168874826972, "step": 363, "train/sim_loss": 0.001795351505279541 }, { "epoch": 0.07178168874826972, "step": 363, "train/total_loss": 0.16792552173137665 }, { "entropy": 5.53885555267334, "epoch": 0.07197943444730077, "mean_token_accuracy": 0.765462338924408, "num_tokens": 16610960.0, "step": 364, "train/ce_loss": 0.42008453607559204 }, { "epoch": 0.07197943444730077, "step": 364, "train/sim_loss": 0.0018650293350219727 }, { "epoch": 0.07197943444730077, "step": 364, "train/total_loss": 0.043873485177755356 }, { "entropy": 5.995070457458496, "epoch": 0.07217718014633182, "mean_token_accuracy": 0.7192546725273132, "num_tokens": 16660008.0, "step": 365, "train/ce_loss": 0.7772306203842163 }, { "epoch": 0.07217718014633182, "step": 365, "train/sim_loss": 0.0012853145599365234 }, { "epoch": 0.07217718014633182, "step": 365, "train/total_loss": 0.07900837808847427 }, { "entropy": 6.227198600769043, "epoch": 0.07237492584536287, "mean_token_accuracy": 0.746434211730957, "num_tokens": 16704871.0, "step": 366, "train/ce_loss": 0.00010236837988486513 }, { "epoch": 0.07237492584536287, "step": 366, "train/sim_loss": 0.001741647720336914 }, { "epoch": 0.07237492584536287, "step": 366, "train/total_loss": 0.0017518845852464437 }, { "entropy": 6.246466636657715, "epoch": 0.07257267154439391, "mean_token_accuracy": 0.7452692985534668, "num_tokens": 16752441.0, "step": 367, "train/ce_loss": 1.0860624313354492 }, { "epoch": 0.07257267154439391, "step": 367, "train/sim_loss": 0.002068936824798584 }, { "epoch": 0.07257267154439391, "step": 367, "train/total_loss": 0.11067517846822739 }, { "entropy": 6.148958206176758, "epoch": 0.07277041724342495, "mean_token_accuracy": 0.7360000014305115, "num_tokens": 16791023.0, "step": 368, "train/ce_loss": 1.5211011171340942 }, { "epoch": 0.07277041724342495, "step": 368, "train/sim_loss": 0.0013931989669799805 }, { "epoch": 0.07277041724342495, "step": 368, "train/total_loss": 0.15350331366062164 }, { "entropy": 6.246423721313477, "epoch": 0.072968162942456, "mean_token_accuracy": 0.7436661720275879, "num_tokens": 16830398.0, "step": 369, "train/ce_loss": 0.6596885323524475 }, { "epoch": 0.072968162942456, "step": 369, "train/sim_loss": 0.0010967254638671875 }, { "epoch": 0.072968162942456, "step": 369, "train/total_loss": 0.06706558167934418 }, { "entropy": 6.2328338623046875, "epoch": 0.07316590864148705, "mean_token_accuracy": 0.7167192697525024, "num_tokens": 16879160.0, "step": 370, "train/ce_loss": 0.9408416748046875 }, { "epoch": 0.07316590864148705, "step": 370, "train/sim_loss": 0.0009969472885131836 }, { "epoch": 0.07316590864148705, "step": 370, "train/total_loss": 0.09508111327886581 }, { "entropy": 5.729766368865967, "epoch": 0.0733636543405181, "mean_token_accuracy": 0.7772170305252075, "num_tokens": 16910298.0, "step": 371, "train/ce_loss": 0.7892571687698364 }, { "epoch": 0.0733636543405181, "step": 371, "train/sim_loss": 0.0010811090469360352 }, { "epoch": 0.0733636543405181, "step": 371, "train/total_loss": 0.08000683039426804 }, { "entropy": 5.768686294555664, "epoch": 0.07356140003954914, "mean_token_accuracy": 0.7559055089950562, "num_tokens": 16957430.0, "step": 372, "train/ce_loss": 0.7414770126342773 }, { "epoch": 0.07356140003954914, "step": 372, "train/sim_loss": 0.0010215044021606445 }, { "epoch": 0.07356140003954914, "step": 372, "train/total_loss": 0.07516920566558838 }, { "entropy": 6.284891605377197, "epoch": 0.07375914573858018, "mean_token_accuracy": 0.7111650705337524, "num_tokens": 17011535.0, "step": 373, "train/ce_loss": 0.00011944158177357167 }, { "epoch": 0.07375914573858018, "step": 373, "train/sim_loss": 0.001505136489868164 }, { "epoch": 0.07375914573858018, "step": 373, "train/total_loss": 0.0015170807018876076 }, { "entropy": 6.041722297668457, "epoch": 0.07395689143761124, "mean_token_accuracy": 0.7495373487472534, "num_tokens": 17043630.0, "step": 374, "train/ce_loss": 0.8823565244674683 }, { "epoch": 0.07395689143761124, "step": 374, "train/sim_loss": 0.0010445117950439453 }, { "epoch": 0.07395689143761124, "step": 374, "train/total_loss": 0.08928016573190689 }, { "entropy": 5.791815280914307, "epoch": 0.07415463713664228, "mean_token_accuracy": 0.7478936910629272, "num_tokens": 17075465.0, "step": 375, "train/ce_loss": 0.7836744785308838 }, { "epoch": 0.07415463713664228, "step": 375, "train/sim_loss": 0.00152510404586792 }, { "epoch": 0.07415463713664228, "step": 375, "train/total_loss": 0.07989255338907242 }, { "entropy": 6.269449710845947, "epoch": 0.07435238283567333, "mean_token_accuracy": 0.7455934286117554, "num_tokens": 17129246.0, "step": 376, "train/ce_loss": 1.2043771743774414 }, { "epoch": 0.07435238283567333, "step": 376, "train/sim_loss": 0.00163191556930542 }, { "epoch": 0.07435238283567333, "step": 376, "train/total_loss": 0.12206963449716568 }, { "entropy": 6.412583351135254, "epoch": 0.07455012853470437, "mean_token_accuracy": 0.7422969341278076, "num_tokens": 17176431.0, "step": 377, "train/ce_loss": 1.4139810800552368 }, { "epoch": 0.07455012853470437, "step": 377, "train/sim_loss": 0.0021806955337524414 }, { "epoch": 0.07455012853470437, "step": 377, "train/total_loss": 0.14357881247997284 }, { "entropy": 6.022729873657227, "epoch": 0.07474787423373541, "mean_token_accuracy": 0.7728740572929382, "num_tokens": 17208081.0, "step": 378, "train/ce_loss": 1.5451982021331787 }, { "epoch": 0.07474787423373541, "step": 378, "train/sim_loss": 0.001235365867614746 }, { "epoch": 0.07474787423373541, "step": 378, "train/total_loss": 0.1557551920413971 }, { "entropy": 6.342796802520752, "epoch": 0.07494561993276647, "mean_token_accuracy": 0.6875, "num_tokens": 17253547.0, "step": 379, "train/ce_loss": 2.5981838703155518 }, { "epoch": 0.07494561993276647, "step": 379, "train/sim_loss": 0.0015532970428466797 }, { "epoch": 0.07494561993276647, "step": 379, "train/total_loss": 0.2613717019557953 }, { "epoch": 0.0751433656317975, "grad_norm": 0.6407986879348755, "learning_rate": 9.817489365911564e-06, "loss": 0.0842, "step": 380 }, { "entropy": 5.832916259765625, "epoch": 0.0751433656317975, "mean_token_accuracy": 0.7366809844970703, "num_tokens": 17308779.0, "step": 380, "train/ce_loss": 0.5865389108657837 }, { "epoch": 0.0751433656317975, "step": 380, "train/sim_loss": 0.001540064811706543 }, { "epoch": 0.0751433656317975, "step": 380, "train/total_loss": 0.06019395589828491 }, { "entropy": 5.73239803314209, "epoch": 0.07534111133082856, "mean_token_accuracy": 0.7305936217308044, "num_tokens": 17347922.0, "step": 381, "train/ce_loss": 1.1817069053649902 }, { "epoch": 0.07534111133082856, "step": 381, "train/sim_loss": 0.002941131591796875 }, { "epoch": 0.07534111133082856, "step": 381, "train/total_loss": 0.12111182510852814 }, { "entropy": 6.088849067687988, "epoch": 0.0755388570298596, "mean_token_accuracy": 0.7150837779045105, "num_tokens": 17386113.0, "step": 382, "train/ce_loss": 9.200163185596466e-05 }, { "epoch": 0.0755388570298596, "step": 382, "train/sim_loss": 0.0029191970825195312 }, { "epoch": 0.0755388570298596, "step": 382, "train/total_loss": 0.0029283971525728703 }, { "entropy": 6.23739767074585, "epoch": 0.07573660272889064, "mean_token_accuracy": 0.7508038878440857, "num_tokens": 17428799.0, "step": 383, "train/ce_loss": 2.0188961029052734 }, { "epoch": 0.07573660272889064, "step": 383, "train/sim_loss": 0.0014382004737854004 }, { "epoch": 0.07573660272889064, "step": 383, "train/total_loss": 0.20332781970500946 }, { "entropy": 5.45576286315918, "epoch": 0.0759343484279217, "mean_token_accuracy": 0.7662807703018188, "num_tokens": 17451717.0, "step": 384, "train/ce_loss": 0.8522616028785706 }, { "epoch": 0.0759343484279217, "step": 384, "train/sim_loss": 0.0007863044738769531 }, { "epoch": 0.0759343484279217, "step": 384, "train/total_loss": 0.08601246774196625 }, { "entropy": 6.119839191436768, "epoch": 0.07613209412695274, "mean_token_accuracy": 0.7458471655845642, "num_tokens": 17492263.0, "step": 385, "train/ce_loss": 0.7344909906387329 }, { "epoch": 0.07613209412695274, "step": 385, "train/sim_loss": 0.0013491511344909668 }, { "epoch": 0.07613209412695274, "step": 385, "train/total_loss": 0.07479824870824814 }, { "entropy": 6.301417350769043, "epoch": 0.07632983982598378, "mean_token_accuracy": 0.7534148097038269, "num_tokens": 17541312.0, "step": 386, "train/ce_loss": 8.824572432786226e-05 }, { "epoch": 0.07632983982598378, "step": 386, "train/sim_loss": 0.0015096664428710938 }, { "epoch": 0.07632983982598378, "step": 386, "train/total_loss": 0.001518490957096219 }, { "entropy": 6.083306312561035, "epoch": 0.07652758552501483, "mean_token_accuracy": 0.7189964056015015, "num_tokens": 17589946.0, "step": 387, "train/ce_loss": 0.00010419396858196706 }, { "epoch": 0.07652758552501483, "step": 387, "train/sim_loss": 0.0008211135864257812 }, { "epoch": 0.07652758552501483, "step": 387, "train/total_loss": 0.0008315329905599356 }, { "entropy": 5.98771858215332, "epoch": 0.07672533122404587, "mean_token_accuracy": 0.7475308775901794, "num_tokens": 17646721.0, "step": 388, "train/ce_loss": 7.023834041319788e-05 }, { "epoch": 0.07672533122404587, "step": 388, "train/sim_loss": 0.0018520355224609375 }, { "epoch": 0.07672533122404587, "step": 388, "train/total_loss": 0.0018590593244880438 }, { "entropy": 5.858847618103027, "epoch": 0.07692307692307693, "mean_token_accuracy": 0.7289992570877075, "num_tokens": 17693018.0, "step": 389, "train/ce_loss": 1.20111083984375 }, { "epoch": 0.07692307692307693, "step": 389, "train/sim_loss": 0.0011598467826843262 }, { "epoch": 0.07692307692307693, "step": 389, "train/total_loss": 0.12127093225717545 }, { "entropy": 6.2136993408203125, "epoch": 0.07712082262210797, "mean_token_accuracy": 0.7407647371292114, "num_tokens": 17737005.0, "step": 390, "train/ce_loss": 0.865757167339325 }, { "epoch": 0.07712082262210797, "step": 390, "train/sim_loss": 0.0014815926551818848 }, { "epoch": 0.07712082262210797, "step": 390, "train/total_loss": 0.08805730938911438 }, { "entropy": 6.0749616622924805, "epoch": 0.07731856832113901, "mean_token_accuracy": 0.7226277589797974, "num_tokens": 17781742.0, "step": 391, "train/ce_loss": 0.9141871333122253 }, { "epoch": 0.07731856832113901, "step": 391, "train/sim_loss": 0.0011489391326904297 }, { "epoch": 0.07731856832113901, "step": 391, "train/total_loss": 0.09256765246391296 }, { "entropy": 5.922192573547363, "epoch": 0.07751631402017006, "mean_token_accuracy": 0.772370457649231, "num_tokens": 17818180.0, "step": 392, "train/ce_loss": 0.6291870474815369 }, { "epoch": 0.07751631402017006, "step": 392, "train/sim_loss": 0.0012137889862060547 }, { "epoch": 0.07751631402017006, "step": 392, "train/total_loss": 0.06413249671459198 }, { "entropy": 5.878640174865723, "epoch": 0.0777140597192011, "mean_token_accuracy": 0.752470076084137, "num_tokens": 17858461.0, "step": 393, "train/ce_loss": 0.6556269526481628 }, { "epoch": 0.0777140597192011, "step": 393, "train/sim_loss": 0.0014565587043762207 }, { "epoch": 0.0777140597192011, "step": 393, "train/total_loss": 0.0670192539691925 }, { "entropy": 5.655073165893555, "epoch": 0.07791180541823216, "mean_token_accuracy": 0.7182642221450806, "num_tokens": 17897859.0, "step": 394, "train/ce_loss": 1.6870887279510498 }, { "epoch": 0.07791180541823216, "step": 394, "train/sim_loss": 0.0010265707969665527 }, { "epoch": 0.07791180541823216, "step": 394, "train/total_loss": 0.16973544657230377 }, { "entropy": 6.233129024505615, "epoch": 0.0781095511172632, "mean_token_accuracy": 0.7042660713195801, "num_tokens": 17947647.0, "step": 395, "train/ce_loss": 1.9940696954727173 }, { "epoch": 0.0781095511172632, "step": 395, "train/sim_loss": 0.0012454986572265625 }, { "epoch": 0.0781095511172632, "step": 395, "train/total_loss": 0.20065246522426605 }, { "entropy": 6.033013343811035, "epoch": 0.07830729681629424, "mean_token_accuracy": 0.7528180480003357, "num_tokens": 18005985.0, "step": 396, "train/ce_loss": 1.0584192276000977 }, { "epoch": 0.07830729681629424, "step": 396, "train/sim_loss": 0.0012069940567016602 }, { "epoch": 0.07830729681629424, "step": 396, "train/total_loss": 0.10704892128705978 }, { "entropy": 6.010364055633545, "epoch": 0.0785050425153253, "mean_token_accuracy": 0.7189189195632935, "num_tokens": 18051022.0, "step": 397, "train/ce_loss": 0.7069169878959656 }, { "epoch": 0.0785050425153253, "step": 397, "train/sim_loss": 0.0009278059005737305 }, { "epoch": 0.0785050425153253, "step": 397, "train/total_loss": 0.07161950320005417 }, { "entropy": 5.939867973327637, "epoch": 0.07870278821435633, "mean_token_accuracy": 0.737092912197113, "num_tokens": 18096149.0, "step": 398, "train/ce_loss": 1.5014406442642212 }, { "epoch": 0.07870278821435633, "step": 398, "train/sim_loss": 0.0019299983978271484 }, { "epoch": 0.07870278821435633, "step": 398, "train/total_loss": 0.15207406878471375 }, { "entropy": 5.585721492767334, "epoch": 0.07890053391338739, "mean_token_accuracy": 0.765856921672821, "num_tokens": 18141152.0, "step": 399, "train/ce_loss": 0.8293741941452026 }, { "epoch": 0.07890053391338739, "step": 399, "train/sim_loss": 0.0019614696502685547 }, { "epoch": 0.07890053391338739, "step": 399, "train/total_loss": 0.08489888906478882 }, { "epoch": 0.07909827961241843, "grad_norm": 0.47396498918533325, "learning_rate": 9.80759719062222e-06, "loss": 0.0856, "step": 400 }, { "entropy": 5.953599452972412, "epoch": 0.07909827961241843, "mean_token_accuracy": 0.7022900581359863, "num_tokens": 18173551.0, "step": 400, "train/ce_loss": 2.074883222579956 }, { "epoch": 0.07909827961241843, "step": 400, "train/sim_loss": 0.0012990236282348633 }, { "epoch": 0.07909827961241843, "step": 400, "train/total_loss": 0.20878735184669495 }, { "entropy": 6.008562088012695, "epoch": 0.07929602531144947, "mean_token_accuracy": 0.740764319896698, "num_tokens": 18219187.0, "step": 401, "train/ce_loss": 1.0479708909988403 }, { "epoch": 0.07929602531144947, "step": 401, "train/sim_loss": 0.0010545849800109863 }, { "epoch": 0.07929602531144947, "step": 401, "train/total_loss": 0.1058516725897789 }, { "entropy": 5.940791130065918, "epoch": 0.07949377101048052, "mean_token_accuracy": 0.7264528870582581, "num_tokens": 18260607.0, "step": 402, "train/ce_loss": 1.1632741689682007 }, { "epoch": 0.07949377101048052, "step": 402, "train/sim_loss": 0.0017009377479553223 }, { "epoch": 0.07949377101048052, "step": 402, "train/total_loss": 0.11802835762500763 }, { "entropy": 6.11130952835083, "epoch": 0.07969151670951156, "mean_token_accuracy": 0.7183544039726257, "num_tokens": 18302220.0, "step": 403, "train/ce_loss": 7.635834481334314e-05 }, { "epoch": 0.07969151670951156, "step": 403, "train/sim_loss": 0.0021973848342895508 }, { "epoch": 0.07969151670951156, "step": 403, "train/total_loss": 0.002205020748078823 }, { "entropy": 6.0885515213012695, "epoch": 0.07988926240854262, "mean_token_accuracy": 0.7244898080825806, "num_tokens": 18352723.0, "step": 404, "train/ce_loss": 1.074416995048523 }, { "epoch": 0.07988926240854262, "step": 404, "train/sim_loss": 0.0013431310653686523 }, { "epoch": 0.07988926240854262, "step": 404, "train/total_loss": 0.10878483206033707 }, { "entropy": 6.179445743560791, "epoch": 0.08008700810757366, "mean_token_accuracy": 0.7109867334365845, "num_tokens": 18403762.0, "step": 405, "train/ce_loss": 2.100004196166992 }, { "epoch": 0.08008700810757366, "step": 405, "train/sim_loss": 0.0016506314277648926 }, { "epoch": 0.08008700810757366, "step": 405, "train/total_loss": 0.2116510570049286 }, { "entropy": 5.981734275817871, "epoch": 0.0802847538066047, "mean_token_accuracy": 0.7338003516197205, "num_tokens": 18444684.0, "step": 406, "train/ce_loss": 8.46993934828788e-05 }, { "epoch": 0.0802847538066047, "step": 406, "train/sim_loss": 0.0015105009078979492 }, { "epoch": 0.0802847538066047, "step": 406, "train/total_loss": 0.0015189708210527897 }, { "entropy": 6.184047698974609, "epoch": 0.08048249950563575, "mean_token_accuracy": 0.7317661046981812, "num_tokens": 18503396.0, "step": 407, "train/ce_loss": 1.6971664428710938 }, { "epoch": 0.08048249950563575, "step": 407, "train/sim_loss": 0.0018453598022460938 }, { "epoch": 0.08048249950563575, "step": 407, "train/total_loss": 0.17156200110912323 }, { "entropy": 6.081608772277832, "epoch": 0.0806802452046668, "mean_token_accuracy": 0.7521035671234131, "num_tokens": 18562974.0, "step": 408, "train/ce_loss": 0.44196462631225586 }, { "epoch": 0.0806802452046668, "step": 408, "train/sim_loss": 0.000741124153137207 }, { "epoch": 0.0806802452046668, "step": 408, "train/total_loss": 0.04493758827447891 }, { "entropy": 5.921474933624268, "epoch": 0.08087799090369785, "mean_token_accuracy": 0.7202492356300354, "num_tokens": 18617625.0, "step": 409, "train/ce_loss": 0.8562267422676086 }, { "epoch": 0.08087799090369785, "step": 409, "train/sim_loss": 0.0009495019912719727 }, { "epoch": 0.08087799090369785, "step": 409, "train/total_loss": 0.08657217770814896 }, { "entropy": 5.972265720367432, "epoch": 0.08107573660272889, "mean_token_accuracy": 0.7147949934005737, "num_tokens": 18657157.0, "step": 410, "train/ce_loss": 0.9952260851860046 }, { "epoch": 0.08107573660272889, "step": 410, "train/sim_loss": 0.0013058185577392578 }, { "epoch": 0.08107573660272889, "step": 410, "train/total_loss": 0.10082843154668808 }, { "entropy": 5.710962772369385, "epoch": 0.08127348230175993, "mean_token_accuracy": 0.7293092608451843, "num_tokens": 18713136.0, "step": 411, "train/ce_loss": 1.2463315725326538 }, { "epoch": 0.08127348230175993, "step": 411, "train/sim_loss": 0.0011734962463378906 }, { "epoch": 0.08127348230175993, "step": 411, "train/total_loss": 0.12580665946006775 }, { "entropy": 6.080829620361328, "epoch": 0.08147122800079099, "mean_token_accuracy": 0.7763975262641907, "num_tokens": 18767418.0, "step": 412, "train/ce_loss": 0.9504113793373108 }, { "epoch": 0.08147122800079099, "step": 412, "train/sim_loss": 0.0012881755828857422 }, { "epoch": 0.08147122800079099, "step": 412, "train/total_loss": 0.09632931649684906 }, { "entropy": 6.017111301422119, "epoch": 0.08166897369982203, "mean_token_accuracy": 0.7417942881584167, "num_tokens": 18810928.0, "step": 413, "train/ce_loss": 0.9983757734298706 }, { "epoch": 0.08166897369982203, "step": 413, "train/sim_loss": 0.0016800165176391602 }, { "epoch": 0.08166897369982203, "step": 413, "train/total_loss": 0.10151759535074234 }, { "entropy": 5.810848236083984, "epoch": 0.08186671939885308, "mean_token_accuracy": 0.6991814374923706, "num_tokens": 18840634.0, "step": 414, "train/ce_loss": 7.213951175799593e-05 }, { "epoch": 0.08186671939885308, "step": 414, "train/sim_loss": 0.0009849071502685547 }, { "epoch": 0.08186671939885308, "step": 414, "train/total_loss": 0.0009921210585162044 }, { "entropy": 5.889314651489258, "epoch": 0.08206446509788412, "mean_token_accuracy": 0.7479674816131592, "num_tokens": 18886604.0, "step": 415, "train/ce_loss": 0.7202638983726501 }, { "epoch": 0.08206446509788412, "step": 415, "train/sim_loss": 0.0011483430862426758 }, { "epoch": 0.08206446509788412, "step": 415, "train/total_loss": 0.07317473739385605 }, { "entropy": 5.938218116760254, "epoch": 0.08226221079691516, "mean_token_accuracy": 0.7669280171394348, "num_tokens": 18925876.0, "step": 416, "train/ce_loss": 7.044220546958968e-05 }, { "epoch": 0.08226221079691516, "step": 416, "train/sim_loss": 0.0007337331771850586 }, { "epoch": 0.08226221079691516, "step": 416, "train/total_loss": 0.0007407774101011455 }, { "entropy": 6.1819257736206055, "epoch": 0.08245995649594622, "mean_token_accuracy": 0.7039815187454224, "num_tokens": 18969311.0, "step": 417, "train/ce_loss": 0.3881903886795044 }, { "epoch": 0.08245995649594622, "step": 417, "train/sim_loss": 0.0010862946510314941 }, { "epoch": 0.08245995649594622, "step": 417, "train/total_loss": 0.03990533575415611 }, { "entropy": 5.923312187194824, "epoch": 0.08265770219497726, "mean_token_accuracy": 0.7248189449310303, "num_tokens": 19018021.0, "step": 418, "train/ce_loss": 1.3059251308441162 }, { "epoch": 0.08265770219497726, "step": 418, "train/sim_loss": 0.0021437406539916992 }, { "epoch": 0.08265770219497726, "step": 418, "train/total_loss": 0.13273625075817108 }, { "entropy": 5.983886241912842, "epoch": 0.08285544789400831, "mean_token_accuracy": 0.7505057454109192, "num_tokens": 19062759.0, "step": 419, "train/ce_loss": 0.6964156627655029 }, { "epoch": 0.08285544789400831, "step": 419, "train/sim_loss": 0.0009859204292297363 }, { "epoch": 0.08285544789400831, "step": 419, "train/total_loss": 0.07062748819589615 }, { "epoch": 0.08305319359303935, "grad_norm": 0.4855821430683136, "learning_rate": 9.797705015332872e-06, "loss": 0.089, "step": 420 }, { "entropy": 5.929636001586914, "epoch": 0.08305319359303935, "mean_token_accuracy": 0.7211481928825378, "num_tokens": 19114982.0, "step": 420, "train/ce_loss": 0.6401516199111938 }, { "epoch": 0.08305319359303935, "step": 420, "train/sim_loss": 0.0011960268020629883 }, { "epoch": 0.08305319359303935, "step": 420, "train/total_loss": 0.06521119177341461 }, { "entropy": 6.073713302612305, "epoch": 0.08325093929207039, "mean_token_accuracy": 0.710189163684845, "num_tokens": 19158994.0, "step": 421, "train/ce_loss": 0.5889050364494324 }, { "epoch": 0.08325093929207039, "step": 421, "train/sim_loss": 0.0010290741920471191 }, { "epoch": 0.08325093929207039, "step": 421, "train/total_loss": 0.0599195770919323 }, { "entropy": 6.2894392013549805, "epoch": 0.08344868499110145, "mean_token_accuracy": 0.7010631561279297, "num_tokens": 19218102.0, "step": 422, "train/ce_loss": 0.6545810103416443 }, { "epoch": 0.08344868499110145, "step": 422, "train/sim_loss": 0.0012134313583374023 }, { "epoch": 0.08344868499110145, "step": 422, "train/total_loss": 0.06667153537273407 }, { "entropy": 6.114630699157715, "epoch": 0.08364643069013249, "mean_token_accuracy": 0.7083616852760315, "num_tokens": 19263707.0, "step": 423, "train/ce_loss": 0.8851117491722107 }, { "epoch": 0.08364643069013249, "step": 423, "train/sim_loss": 0.001911759376525879 }, { "epoch": 0.08364643069013249, "step": 423, "train/total_loss": 0.09042293578386307 }, { "entropy": 6.150749206542969, "epoch": 0.08384417638916354, "mean_token_accuracy": 0.744339644908905, "num_tokens": 19301833.0, "step": 424, "train/ce_loss": 1.6356947422027588 }, { "epoch": 0.08384417638916354, "step": 424, "train/sim_loss": 0.001331031322479248 }, { "epoch": 0.08384417638916354, "step": 424, "train/total_loss": 0.1649005115032196 }, { "entropy": 5.7360382080078125, "epoch": 0.08404192208819458, "mean_token_accuracy": 0.7447289228439331, "num_tokens": 19338087.0, "step": 425, "train/ce_loss": 0.7059090733528137 }, { "epoch": 0.08404192208819458, "step": 425, "train/sim_loss": 0.0007269382476806641 }, { "epoch": 0.08404192208819458, "step": 425, "train/total_loss": 0.07131784409284592 }, { "entropy": 6.125059604644775, "epoch": 0.08423966778722562, "mean_token_accuracy": 0.7220902442932129, "num_tokens": 19389955.0, "step": 426, "train/ce_loss": 0.00011510140757309273 }, { "epoch": 0.08423966778722562, "step": 426, "train/sim_loss": 0.001309812068939209 }, { "epoch": 0.08423966778722562, "step": 426, "train/total_loss": 0.0013213221682235599 }, { "entropy": 6.458940505981445, "epoch": 0.08443741348625668, "mean_token_accuracy": 0.7235252261161804, "num_tokens": 19444368.0, "step": 427, "train/ce_loss": 1.2005345821380615 }, { "epoch": 0.08443741348625668, "step": 427, "train/sim_loss": 0.0012345314025878906 }, { "epoch": 0.08443741348625668, "step": 427, "train/total_loss": 0.1212879940867424 }, { "entropy": 6.112157821655273, "epoch": 0.08463515918528772, "mean_token_accuracy": 0.739534854888916, "num_tokens": 19483231.0, "step": 428, "train/ce_loss": 0.8143983483314514 }, { "epoch": 0.08463515918528772, "step": 428, "train/sim_loss": 0.0013194084167480469 }, { "epoch": 0.08463515918528772, "step": 428, "train/total_loss": 0.08275924623012543 }, { "entropy": 5.957883834838867, "epoch": 0.08483290488431877, "mean_token_accuracy": 0.7077675461769104, "num_tokens": 19524702.0, "step": 429, "train/ce_loss": 0.6525189280509949 }, { "epoch": 0.08483290488431877, "step": 429, "train/sim_loss": 0.0008578300476074219 }, { "epoch": 0.08483290488431877, "step": 429, "train/total_loss": 0.06610972434282303 }, { "entropy": 6.2308831214904785, "epoch": 0.08503065058334981, "mean_token_accuracy": 0.6970428228378296, "num_tokens": 19574800.0, "step": 430, "train/ce_loss": 1.1981221437454224 }, { "epoch": 0.08503065058334981, "step": 430, "train/sim_loss": 0.0017489194869995117 }, { "epoch": 0.08503065058334981, "step": 430, "train/total_loss": 0.12156113237142563 }, { "entropy": 5.792413711547852, "epoch": 0.08522839628238085, "mean_token_accuracy": 0.7235516309738159, "num_tokens": 19606897.0, "step": 431, "train/ce_loss": 1.9031213521957397 }, { "epoch": 0.08522839628238085, "step": 431, "train/sim_loss": 0.0010125041007995605 }, { "epoch": 0.08522839628238085, "step": 431, "train/total_loss": 0.1913246363401413 }, { "entropy": 5.678318500518799, "epoch": 0.08542614198141191, "mean_token_accuracy": 0.765553891658783, "num_tokens": 19641653.0, "step": 432, "train/ce_loss": 0.6807713508605957 }, { "epoch": 0.08542614198141191, "step": 432, "train/sim_loss": 0.0008646249771118164 }, { "epoch": 0.08542614198141191, "step": 432, "train/total_loss": 0.06894176453351974 }, { "entropy": 6.236268043518066, "epoch": 0.08562388768044295, "mean_token_accuracy": 0.7112210988998413, "num_tokens": 19692746.0, "step": 433, "train/ce_loss": 7.689603808103129e-05 }, { "epoch": 0.08562388768044295, "step": 433, "train/sim_loss": 0.001190781593322754 }, { "epoch": 0.08562388768044295, "step": 433, "train/total_loss": 0.0011984711745753884 }, { "entropy": 5.982734680175781, "epoch": 0.085821633379474, "mean_token_accuracy": 0.7550632953643799, "num_tokens": 19750500.0, "step": 434, "train/ce_loss": 0.939193069934845 }, { "epoch": 0.085821633379474, "step": 434, "train/sim_loss": 0.0007295608520507812 }, { "epoch": 0.085821633379474, "step": 434, "train/total_loss": 0.09464886784553528 }, { "entropy": 6.128968238830566, "epoch": 0.08601937907850504, "mean_token_accuracy": 0.747538685798645, "num_tokens": 19781153.0, "step": 435, "train/ce_loss": 0.6718581914901733 }, { "epoch": 0.08601937907850504, "step": 435, "train/sim_loss": 0.0007437467575073242 }, { "epoch": 0.08601937907850504, "step": 435, "train/total_loss": 0.06792956590652466 }, { "entropy": 5.993041038513184, "epoch": 0.08621712477753608, "mean_token_accuracy": 0.738054633140564, "num_tokens": 19818320.0, "step": 436, "train/ce_loss": 0.8144791126251221 }, { "epoch": 0.08621712477753608, "step": 436, "train/sim_loss": 0.001184701919555664 }, { "epoch": 0.08621712477753608, "step": 436, "train/total_loss": 0.08263261616230011 }, { "entropy": 6.323106288909912, "epoch": 0.08641487047656714, "mean_token_accuracy": 0.7322229146957397, "num_tokens": 19863772.0, "step": 437, "train/ce_loss": 0.8280262351036072 }, { "epoch": 0.08641487047656714, "step": 437, "train/sim_loss": 0.0011897683143615723 }, { "epoch": 0.08641487047656714, "step": 437, "train/total_loss": 0.08399239182472229 }, { "entropy": 6.171309471130371, "epoch": 0.08661261617559818, "mean_token_accuracy": 0.7144653797149658, "num_tokens": 19906408.0, "step": 438, "train/ce_loss": 1.0299571752548218 }, { "epoch": 0.08661261617559818, "step": 438, "train/sim_loss": 0.0010467767715454102 }, { "epoch": 0.08661261617559818, "step": 438, "train/total_loss": 0.10404249280691147 }, { "entropy": 6.157327651977539, "epoch": 0.08681036187462923, "mean_token_accuracy": 0.7407614588737488, "num_tokens": 19959065.0, "step": 439, "train/ce_loss": 1.297670841217041 }, { "epoch": 0.08681036187462923, "step": 439, "train/sim_loss": 0.001376032829284668 }, { "epoch": 0.08681036187462923, "step": 439, "train/total_loss": 0.13114312291145325 }, { "epoch": 0.08700810757366027, "grad_norm": 0.44035378098487854, "learning_rate": 9.787812840043527e-06, "loss": 0.0891, "step": 440 }, { "entropy": 5.9398393630981445, "epoch": 0.08700810757366027, "mean_token_accuracy": 0.7409836053848267, "num_tokens": 20000110.0, "step": 440, "train/ce_loss": 0.6510357856750488 }, { "epoch": 0.08700810757366027, "step": 440, "train/sim_loss": 0.0007523894309997559 }, { "epoch": 0.08700810757366027, "step": 440, "train/total_loss": 0.065855972468853 }, { "entropy": 6.217242240905762, "epoch": 0.08720585327269131, "mean_token_accuracy": 0.7209944725036621, "num_tokens": 20035245.0, "step": 441, "train/ce_loss": 1.7070637941360474 }, { "epoch": 0.08720585327269131, "step": 441, "train/sim_loss": 0.0008425116539001465 }, { "epoch": 0.08720585327269131, "step": 441, "train/total_loss": 0.17154888808727264 }, { "entropy": 6.182831764221191, "epoch": 0.08740359897172237, "mean_token_accuracy": 0.7221108675003052, "num_tokens": 20091769.0, "step": 442, "train/ce_loss": 7.313759124372154e-05 }, { "epoch": 0.08740359897172237, "step": 442, "train/sim_loss": 0.0006185770034790039 }, { "epoch": 0.08740359897172237, "step": 442, "train/total_loss": 0.0006258907378651202 }, { "entropy": 5.823101997375488, "epoch": 0.08760134467075341, "mean_token_accuracy": 0.7562642097473145, "num_tokens": 20132894.0, "step": 443, "train/ce_loss": 1.0454251766204834 }, { "epoch": 0.08760134467075341, "step": 443, "train/sim_loss": 0.0010925531387329102 }, { "epoch": 0.08760134467075341, "step": 443, "train/total_loss": 0.10563506931066513 }, { "entropy": 5.961187362670898, "epoch": 0.08779909036978446, "mean_token_accuracy": 0.7215836644172668, "num_tokens": 20173235.0, "step": 444, "train/ce_loss": 0.7024710774421692 }, { "epoch": 0.08779909036978446, "step": 444, "train/sim_loss": 0.0008199214935302734 }, { "epoch": 0.08779909036978446, "step": 444, "train/total_loss": 0.07106702774763107 }, { "entropy": 6.1648173332214355, "epoch": 0.0879968360688155, "mean_token_accuracy": 0.747657299041748, "num_tokens": 20224815.0, "step": 445, "train/ce_loss": 1.1009680032730103 }, { "epoch": 0.0879968360688155, "step": 445, "train/sim_loss": 0.0018263459205627441 }, { "epoch": 0.0879968360688155, "step": 445, "train/total_loss": 0.11192315071821213 }, { "entropy": 5.985129356384277, "epoch": 0.08819458176784654, "mean_token_accuracy": 0.7470816969871521, "num_tokens": 20276802.0, "step": 446, "train/ce_loss": 0.8671373128890991 }, { "epoch": 0.08819458176784654, "step": 446, "train/sim_loss": 0.0010516643524169922 }, { "epoch": 0.08819458176784654, "step": 446, "train/total_loss": 0.0877653956413269 }, { "entropy": 6.009296894073486, "epoch": 0.0883923274668776, "mean_token_accuracy": 0.738095223903656, "num_tokens": 20323106.0, "step": 447, "train/ce_loss": 0.5126538872718811 }, { "epoch": 0.0883923274668776, "step": 447, "train/sim_loss": 0.0006364583969116211 }, { "epoch": 0.0883923274668776, "step": 447, "train/total_loss": 0.05190184712409973 }, { "entropy": 5.952524185180664, "epoch": 0.08859007316590864, "mean_token_accuracy": 0.7003725171089172, "num_tokens": 20369160.0, "step": 448, "train/ce_loss": 0.7302852272987366 }, { "epoch": 0.08859007316590864, "step": 448, "train/sim_loss": 0.0012155771255493164 }, { "epoch": 0.08859007316590864, "step": 448, "train/total_loss": 0.07424410432577133 }, { "entropy": 6.056340217590332, "epoch": 0.0887878188649397, "mean_token_accuracy": 0.7320687770843506, "num_tokens": 20410214.0, "step": 449, "train/ce_loss": 6.42470404272899e-05 }, { "epoch": 0.0887878188649397, "step": 449, "train/sim_loss": 0.0007627010345458984 }, { "epoch": 0.0887878188649397, "step": 449, "train/total_loss": 0.0007691257633268833 }, { "entropy": 6.094886779785156, "epoch": 0.08898556456397073, "mean_token_accuracy": 0.7111631631851196, "num_tokens": 20461912.0, "step": 450, "train/ce_loss": 1.015572190284729 }, { "epoch": 0.08898556456397073, "step": 450, "train/sim_loss": 0.0015051960945129395 }, { "epoch": 0.08898556456397073, "step": 450, "train/total_loss": 0.10306241363286972 }, { "entropy": 6.131892204284668, "epoch": 0.08918331026300177, "mean_token_accuracy": 0.7490013241767883, "num_tokens": 20516572.0, "step": 451, "train/ce_loss": 0.7508107423782349 }, { "epoch": 0.08918331026300177, "step": 451, "train/sim_loss": 0.0006638765335083008 }, { "epoch": 0.08918331026300177, "step": 451, "train/total_loss": 0.07574494928121567 }, { "entropy": 6.230250358581543, "epoch": 0.08938105596203283, "mean_token_accuracy": 0.7038081884384155, "num_tokens": 20565254.0, "step": 452, "train/ce_loss": 7.176168583100662e-05 }, { "epoch": 0.08938105596203283, "step": 452, "train/sim_loss": 0.0017697811126708984 }, { "epoch": 0.08938105596203283, "step": 452, "train/total_loss": 0.0017769573023542762 }, { "entropy": 5.470293045043945, "epoch": 0.08957880166106387, "mean_token_accuracy": 0.7934176921844482, "num_tokens": 20595320.0, "step": 453, "train/ce_loss": 0.5771066546440125 }, { "epoch": 0.08957880166106387, "step": 453, "train/sim_loss": 0.0006079673767089844 }, { "epoch": 0.08957880166106387, "step": 453, "train/total_loss": 0.05831863358616829 }, { "entropy": 6.141108512878418, "epoch": 0.08977654736009492, "mean_token_accuracy": 0.6969283223152161, "num_tokens": 20638791.0, "step": 454, "train/ce_loss": 0.9435936212539673 }, { "epoch": 0.08977654736009492, "step": 454, "train/sim_loss": 0.0011292099952697754 }, { "epoch": 0.08977654736009492, "step": 454, "train/total_loss": 0.09548857063055038 }, { "entropy": 5.977075099945068, "epoch": 0.08997429305912596, "mean_token_accuracy": 0.7308048009872437, "num_tokens": 20666641.0, "step": 455, "train/ce_loss": 1.9542707204818726 }, { "epoch": 0.08997429305912596, "step": 455, "train/sim_loss": 0.0013605356216430664 }, { "epoch": 0.08997429305912596, "step": 455, "train/total_loss": 0.19678761065006256 }, { "entropy": 6.12550687789917, "epoch": 0.090172038758157, "mean_token_accuracy": 0.768937349319458, "num_tokens": 20714247.0, "step": 456, "train/ce_loss": 0.8150110840797424 }, { "epoch": 0.090172038758157, "step": 456, "train/sim_loss": 0.001068115234375 }, { "epoch": 0.090172038758157, "step": 456, "train/total_loss": 0.08256922662258148 }, { "entropy": 5.753043174743652, "epoch": 0.09036978445718806, "mean_token_accuracy": 0.7469419240951538, "num_tokens": 20747455.0, "step": 457, "train/ce_loss": 0.9216404557228088 }, { "epoch": 0.09036978445718806, "step": 457, "train/sim_loss": 0.0006186962127685547 }, { "epoch": 0.09036978445718806, "step": 457, "train/total_loss": 0.09278274327516556 }, { "entropy": 6.282642364501953, "epoch": 0.0905675301562191, "mean_token_accuracy": 0.727785587310791, "num_tokens": 20781445.0, "step": 458, "train/ce_loss": 0.9478515982627869 }, { "epoch": 0.0905675301562191, "step": 458, "train/sim_loss": 0.0014216899871826172 }, { "epoch": 0.0905675301562191, "step": 458, "train/total_loss": 0.09620685130357742 }, { "entropy": 6.059443473815918, "epoch": 0.09076527585525015, "mean_token_accuracy": 0.7445194125175476, "num_tokens": 20815720.0, "step": 459, "train/ce_loss": 0.7592887282371521 }, { "epoch": 0.09076527585525015, "step": 459, "train/sim_loss": 0.0008331537246704102 }, { "epoch": 0.09076527585525015, "step": 459, "train/total_loss": 0.07676202803850174 }, { "epoch": 0.0909630215542812, "grad_norm": 0.5159197449684143, "learning_rate": 9.77792066475418e-06, "loss": 0.0868, "step": 460 }, { "entropy": 6.354032039642334, "epoch": 0.0909630215542812, "mean_token_accuracy": 0.6747126579284668, "num_tokens": 20871140.0, "step": 460, "train/ce_loss": 0.6539819240570068 }, { "epoch": 0.0909630215542812, "step": 460, "train/sim_loss": 0.0013225078582763672 }, { "epoch": 0.0909630215542812, "step": 460, "train/total_loss": 0.06672070175409317 }, { "entropy": 5.987180709838867, "epoch": 0.09116076725331224, "mean_token_accuracy": 0.7626321911811829, "num_tokens": 20922494.0, "step": 461, "train/ce_loss": 0.45016562938690186 }, { "epoch": 0.09116076725331224, "step": 461, "train/sim_loss": 0.0006824135780334473 }, { "epoch": 0.09116076725331224, "step": 461, "train/total_loss": 0.04569897800683975 }, { "entropy": 6.053849220275879, "epoch": 0.09135851295234329, "mean_token_accuracy": 0.7168294191360474, "num_tokens": 20987706.0, "step": 462, "train/ce_loss": 0.6947836875915527 }, { "epoch": 0.09135851295234329, "step": 462, "train/sim_loss": 0.0012488365173339844 }, { "epoch": 0.09135851295234329, "step": 462, "train/total_loss": 0.07072720676660538 }, { "entropy": 6.348329544067383, "epoch": 0.09155625865137433, "mean_token_accuracy": 0.732758641242981, "num_tokens": 21049375.0, "step": 463, "train/ce_loss": 1.201309323310852 }, { "epoch": 0.09155625865137433, "step": 463, "train/sim_loss": 0.0015772581100463867 }, { "epoch": 0.09155625865137433, "step": 463, "train/total_loss": 0.12170819193124771 }, { "entropy": 6.089416980743408, "epoch": 0.09175400435040539, "mean_token_accuracy": 0.7021158337593079, "num_tokens": 21108126.0, "step": 464, "train/ce_loss": 0.8971300721168518 }, { "epoch": 0.09175400435040539, "step": 464, "train/sim_loss": 0.0011579394340515137 }, { "epoch": 0.09175400435040539, "step": 464, "train/total_loss": 0.0908709466457367 }, { "entropy": 6.055810928344727, "epoch": 0.09195175004943643, "mean_token_accuracy": 0.7465667724609375, "num_tokens": 21163273.0, "step": 465, "train/ce_loss": 1.2266682386398315 }, { "epoch": 0.09195175004943643, "step": 465, "train/sim_loss": 0.0013442039489746094 }, { "epoch": 0.09195175004943643, "step": 465, "train/total_loss": 0.12401103228330612 }, { "entropy": 5.954634189605713, "epoch": 0.09214949574846747, "mean_token_accuracy": 0.7111853361129761, "num_tokens": 21200877.0, "step": 466, "train/ce_loss": 0.8385537266731262 }, { "epoch": 0.09214949574846747, "step": 466, "train/sim_loss": 0.0008686184883117676 }, { "epoch": 0.09214949574846747, "step": 466, "train/total_loss": 0.08472399413585663 }, { "entropy": 6.2491559982299805, "epoch": 0.09234724144749852, "mean_token_accuracy": 0.7563636302947998, "num_tokens": 21255653.0, "step": 467, "train/ce_loss": 1.198103904724121 }, { "epoch": 0.09234724144749852, "step": 467, "train/sim_loss": 0.000634312629699707 }, { "epoch": 0.09234724144749852, "step": 467, "train/total_loss": 0.12044470757246017 }, { "entropy": 5.634122848510742, "epoch": 0.09254498714652956, "mean_token_accuracy": 0.7703663110733032, "num_tokens": 21294923.0, "step": 468, "train/ce_loss": 0.45932653546333313 }, { "epoch": 0.09254498714652956, "step": 468, "train/sim_loss": 0.0011487007141113281 }, { "epoch": 0.09254498714652956, "step": 468, "train/total_loss": 0.0470813550055027 }, { "entropy": 5.979020118713379, "epoch": 0.09274273284556062, "mean_token_accuracy": 0.7658565640449524, "num_tokens": 21341383.0, "step": 469, "train/ce_loss": 0.7719852328300476 }, { "epoch": 0.09274273284556062, "step": 469, "train/sim_loss": 0.0016307830810546875 }, { "epoch": 0.09274273284556062, "step": 469, "train/total_loss": 0.0788293108344078 }, { "entropy": 5.905631065368652, "epoch": 0.09294047854459166, "mean_token_accuracy": 0.7725393176078796, "num_tokens": 21395142.0, "step": 470, "train/ce_loss": 0.8003997206687927 }, { "epoch": 0.09294047854459166, "step": 470, "train/sim_loss": 0.0019464492797851562 }, { "epoch": 0.09294047854459166, "step": 470, "train/total_loss": 0.08198641985654831 }, { "entropy": 5.461054801940918, "epoch": 0.0931382242436227, "mean_token_accuracy": 0.7866894006729126, "num_tokens": 21422954.0, "step": 471, "train/ce_loss": 0.6253111362457275 }, { "epoch": 0.0931382242436227, "step": 471, "train/sim_loss": 0.0007066130638122559 }, { "epoch": 0.0931382242436227, "step": 471, "train/total_loss": 0.06323772668838501 }, { "entropy": 5.806418418884277, "epoch": 0.09333596994265375, "mean_token_accuracy": 0.7262958288192749, "num_tokens": 21461112.0, "step": 472, "train/ce_loss": 0.9238807559013367 }, { "epoch": 0.09333596994265375, "step": 472, "train/sim_loss": 0.0006926655769348145 }, { "epoch": 0.09333596994265375, "step": 472, "train/total_loss": 0.09308074414730072 }, { "entropy": 6.040583610534668, "epoch": 0.09353371564168479, "mean_token_accuracy": 0.7684647440910339, "num_tokens": 21508407.0, "step": 473, "train/ce_loss": 0.7117552161216736 }, { "epoch": 0.09353371564168479, "step": 473, "train/sim_loss": 0.0008240938186645508 }, { "epoch": 0.09353371564168479, "step": 473, "train/total_loss": 0.07199961692094803 }, { "entropy": 5.911423683166504, "epoch": 0.09373146134071583, "mean_token_accuracy": 0.7165242433547974, "num_tokens": 21554395.0, "step": 474, "train/ce_loss": 6.049881994840689e-05 }, { "epoch": 0.09373146134071583, "step": 474, "train/sim_loss": 0.0008189082145690918 }, { "epoch": 0.09373146134071583, "step": 474, "train/total_loss": 0.0008249580860137939 }, { "entropy": 5.4729390144348145, "epoch": 0.09392920703974689, "mean_token_accuracy": 0.7410972118377686, "num_tokens": 21597806.0, "step": 475, "train/ce_loss": 1.2782552242279053 }, { "epoch": 0.09392920703974689, "step": 475, "train/sim_loss": 0.0010161399841308594 }, { "epoch": 0.09392920703974689, "step": 475, "train/total_loss": 0.12884166836738586 }, { "entropy": 5.779295444488525, "epoch": 0.09412695273877793, "mean_token_accuracy": 0.7582260370254517, "num_tokens": 21630372.0, "step": 476, "train/ce_loss": 1.0493532419204712 }, { "epoch": 0.09412695273877793, "step": 476, "train/sim_loss": 0.0012088418006896973 }, { "epoch": 0.09412695273877793, "step": 476, "train/total_loss": 0.10614416748285294 }, { "entropy": 6.038074016571045, "epoch": 0.09432469843780898, "mean_token_accuracy": 0.6895954012870789, "num_tokens": 21671288.0, "step": 477, "train/ce_loss": 1.4092198610305786 }, { "epoch": 0.09432469843780898, "step": 477, "train/sim_loss": 0.0007559657096862793 }, { "epoch": 0.09432469843780898, "step": 477, "train/total_loss": 0.14167796075344086 }, { "entropy": 6.199128150939941, "epoch": 0.09452244413684002, "mean_token_accuracy": 0.7579318284988403, "num_tokens": 21725630.0, "step": 478, "train/ce_loss": 0.827947199344635 }, { "epoch": 0.09452244413684002, "step": 478, "train/sim_loss": 0.0007905960083007812 }, { "epoch": 0.09452244413684002, "step": 478, "train/total_loss": 0.08358531445264816 }, { "entropy": 6.202330589294434, "epoch": 0.09472018983587106, "mean_token_accuracy": 0.7173169255256653, "num_tokens": 21774920.0, "step": 479, "train/ce_loss": 0.9447615742683411 }, { "epoch": 0.09472018983587106, "step": 479, "train/sim_loss": 0.000665128231048584 }, { "epoch": 0.09472018983587106, "step": 479, "train/total_loss": 0.09514128416776657 }, { "epoch": 0.09491793553490212, "grad_norm": 0.5236664414405823, "learning_rate": 9.768028489464833e-06, "loss": 0.0859, "step": 480 }, { "entropy": 6.026163101196289, "epoch": 0.09491793553490212, "mean_token_accuracy": 0.7227505445480347, "num_tokens": 21813273.0, "step": 480, "train/ce_loss": 1.0833412408828735 }, { "epoch": 0.09491793553490212, "step": 480, "train/sim_loss": 0.0013110637664794922 }, { "epoch": 0.09491793553490212, "step": 480, "train/total_loss": 0.10964518785476685 }, { "entropy": 5.619831085205078, "epoch": 0.09511568123393316, "mean_token_accuracy": 0.768693208694458, "num_tokens": 21849432.0, "step": 481, "train/ce_loss": 0.6608303189277649 }, { "epoch": 0.09511568123393316, "step": 481, "train/sim_loss": 0.0007346868515014648 }, { "epoch": 0.09511568123393316, "step": 481, "train/total_loss": 0.06681772321462631 }, { "entropy": 6.025839805603027, "epoch": 0.09531342693296421, "mean_token_accuracy": 0.7261227369308472, "num_tokens": 21912445.0, "step": 482, "train/ce_loss": 1.0038230419158936 }, { "epoch": 0.09531342693296421, "step": 482, "train/sim_loss": 0.0011608600616455078 }, { "epoch": 0.09531342693296421, "step": 482, "train/total_loss": 0.10154316574335098 }, { "entropy": 6.114964485168457, "epoch": 0.09551117263199525, "mean_token_accuracy": 0.7281947135925293, "num_tokens": 21949741.0, "step": 483, "train/ce_loss": 0.9397491216659546 }, { "epoch": 0.09551117263199525, "step": 483, "train/sim_loss": 0.0015996694564819336 }, { "epoch": 0.09551117263199525, "step": 483, "train/total_loss": 0.09557458013296127 }, { "entropy": 6.026642799377441, "epoch": 0.0957089183310263, "mean_token_accuracy": 0.7163388729095459, "num_tokens": 21987534.0, "step": 484, "train/ce_loss": 7.584181730635464e-05 }, { "epoch": 0.0957089183310263, "step": 484, "train/sim_loss": 0.0006910562515258789 }, { "epoch": 0.0957089183310263, "step": 484, "train/total_loss": 0.0006986404187045991 }, { "entropy": 6.212000370025635, "epoch": 0.09590666403005735, "mean_token_accuracy": 0.6988322138786316, "num_tokens": 22023357.0, "step": 485, "train/ce_loss": 0.913593053817749 }, { "epoch": 0.09590666403005735, "step": 485, "train/sim_loss": 0.0008903741836547852 }, { "epoch": 0.09590666403005735, "step": 485, "train/total_loss": 0.09224968403577805 }, { "entropy": 6.405904769897461, "epoch": 0.09610440972908839, "mean_token_accuracy": 0.7386530041694641, "num_tokens": 22068225.0, "step": 486, "train/ce_loss": 0.5052089691162109 }, { "epoch": 0.09610440972908839, "step": 486, "train/sim_loss": 0.0008275508880615234 }, { "epoch": 0.09610440972908839, "step": 486, "train/total_loss": 0.05134844779968262 }, { "entropy": 6.179349422454834, "epoch": 0.09630215542811944, "mean_token_accuracy": 0.792620837688446, "num_tokens": 22119811.0, "step": 487, "train/ce_loss": 1.053547739982605 }, { "epoch": 0.09630215542811944, "step": 487, "train/sim_loss": 0.0016101598739624023 }, { "epoch": 0.09630215542811944, "step": 487, "train/total_loss": 0.10696493834257126 }, { "entropy": 5.742912769317627, "epoch": 0.09649990112715048, "mean_token_accuracy": 0.7148664593696594, "num_tokens": 22161920.0, "step": 488, "train/ce_loss": 0.6989427804946899 }, { "epoch": 0.09649990112715048, "step": 488, "train/sim_loss": 0.0012930035591125488 }, { "epoch": 0.09649990112715048, "step": 488, "train/total_loss": 0.07118728011846542 }, { "entropy": 5.9550886154174805, "epoch": 0.09669764682618152, "mean_token_accuracy": 0.7752721905708313, "num_tokens": 22195808.0, "step": 489, "train/ce_loss": 0.7598577737808228 }, { "epoch": 0.09669764682618152, "step": 489, "train/sim_loss": 0.0006237030029296875 }, { "epoch": 0.09669764682618152, "step": 489, "train/total_loss": 0.07660948485136032 }, { "entropy": 6.171591758728027, "epoch": 0.09689539252521258, "mean_token_accuracy": 0.7138211131095886, "num_tokens": 22257773.0, "step": 490, "train/ce_loss": 0.8441061973571777 }, { "epoch": 0.09689539252521258, "step": 490, "train/sim_loss": 0.001112818717956543 }, { "epoch": 0.09689539252521258, "step": 490, "train/total_loss": 0.08552344143390656 }, { "entropy": 5.717827796936035, "epoch": 0.09709313822424362, "mean_token_accuracy": 0.7474674582481384, "num_tokens": 22308040.0, "step": 491, "train/ce_loss": 1.0153752565383911 }, { "epoch": 0.09709313822424362, "step": 491, "train/sim_loss": 0.0006552934646606445 }, { "epoch": 0.09709313822424362, "step": 491, "train/total_loss": 0.10219281911849976 }, { "entropy": 5.570613861083984, "epoch": 0.09729088392327467, "mean_token_accuracy": 0.7306168675422668, "num_tokens": 22338089.0, "step": 492, "train/ce_loss": 1.100622296333313 }, { "epoch": 0.09729088392327467, "step": 492, "train/sim_loss": 0.001097261905670166 }, { "epoch": 0.09729088392327467, "step": 492, "train/total_loss": 0.11115949600934982 }, { "entropy": 6.270557880401611, "epoch": 0.09748862962230571, "mean_token_accuracy": 0.75, "num_tokens": 22378999.0, "step": 493, "train/ce_loss": 0.7971228361129761 }, { "epoch": 0.09748862962230571, "step": 493, "train/sim_loss": 0.0007027387619018555 }, { "epoch": 0.09748862962230571, "step": 493, "train/total_loss": 0.0804150253534317 }, { "entropy": 6.1581807136535645, "epoch": 0.09768637532133675, "mean_token_accuracy": 0.7287721633911133, "num_tokens": 22419192.0, "step": 494, "train/ce_loss": 0.7943023443222046 }, { "epoch": 0.09768637532133675, "step": 494, "train/sim_loss": 0.0009785890579223633 }, { "epoch": 0.09768637532133675, "step": 494, "train/total_loss": 0.08040882647037506 }, { "entropy": 6.349737167358398, "epoch": 0.09788412102036781, "mean_token_accuracy": 0.7147937417030334, "num_tokens": 22454197.0, "step": 495, "train/ce_loss": 1.1528983116149902 }, { "epoch": 0.09788412102036781, "step": 495, "train/sim_loss": 0.0007842183113098145 }, { "epoch": 0.09788412102036781, "step": 495, "train/total_loss": 0.11607404798269272 }, { "entropy": 6.077477931976318, "epoch": 0.09808186671939885, "mean_token_accuracy": 0.7346801161766052, "num_tokens": 22503382.0, "step": 496, "train/ce_loss": 0.9282881021499634 }, { "epoch": 0.09808186671939885, "step": 496, "train/sim_loss": 0.0008760690689086914 }, { "epoch": 0.09808186671939885, "step": 496, "train/total_loss": 0.09370487928390503 }, { "entropy": 6.355741500854492, "epoch": 0.0982796124184299, "mean_token_accuracy": 0.7536092400550842, "num_tokens": 22545665.0, "step": 497, "train/ce_loss": 8.04343944764696e-05 }, { "epoch": 0.0982796124184299, "step": 497, "train/sim_loss": 0.000703275203704834 }, { "epoch": 0.0982796124184299, "step": 497, "train/total_loss": 0.0007113186293281615 }, { "entropy": 6.248959064483643, "epoch": 0.09847735811746094, "mean_token_accuracy": 0.7269919514656067, "num_tokens": 22587609.0, "step": 498, "train/ce_loss": 0.892748236656189 }, { "epoch": 0.09847735811746094, "step": 498, "train/sim_loss": 0.001089632511138916 }, { "epoch": 0.09847735811746094, "step": 498, "train/total_loss": 0.09036445617675781 }, { "entropy": 6.112403392791748, "epoch": 0.09867510381649199, "mean_token_accuracy": 0.7230514287948608, "num_tokens": 22650070.0, "step": 499, "train/ce_loss": 0.7257183194160461 }, { "epoch": 0.09867510381649199, "step": 499, "train/sim_loss": 0.0008617639541625977 }, { "epoch": 0.09867510381649199, "step": 499, "train/total_loss": 0.07343360036611557 }, { "epoch": 0.09887284951552304, "grad_norm": 0.4497354030609131, "learning_rate": 9.758136314175488e-06, "loss": 0.0872, "step": 500 }, { "entropy": 6.303040981292725, "epoch": 0.09887284951552304, "mean_token_accuracy": 0.7665029764175415, "num_tokens": 22692389.0, "step": 500, "train/ce_loss": 1.043304681777954 }, { "epoch": 0.09887284951552304, "step": 500, "train/sim_loss": 0.0007751584053039551 }, { "epoch": 0.09887284951552304, "step": 500, "train/total_loss": 0.10510563105344772 }, { "entropy": 6.558235168457031, "epoch": 0.09907059521455408, "mean_token_accuracy": 0.7208016514778137, "num_tokens": 22730393.0, "step": 501, "train/ce_loss": 0.8740500807762146 }, { "epoch": 0.09907059521455408, "step": 501, "train/sim_loss": 0.0011798739433288574 }, { "epoch": 0.09907059521455408, "step": 501, "train/total_loss": 0.08858488500118256 }, { "entropy": 5.862274169921875, "epoch": 0.09926834091358513, "mean_token_accuracy": 0.7287166714668274, "num_tokens": 22754663.0, "step": 502, "train/ce_loss": 0.6824101805686951 }, { "epoch": 0.09926834091358513, "step": 502, "train/sim_loss": 0.0006320476531982422 }, { "epoch": 0.09926834091358513, "step": 502, "train/total_loss": 0.06887307018041611 }, { "entropy": 6.232660293579102, "epoch": 0.09946608661261618, "mean_token_accuracy": 0.7246049642562866, "num_tokens": 22814982.0, "step": 503, "train/ce_loss": 1.2115979194641113 }, { "epoch": 0.09946608661261618, "step": 503, "train/sim_loss": 0.001230001449584961 }, { "epoch": 0.09946608661261618, "step": 503, "train/total_loss": 0.1223897933959961 }, { "entropy": 6.198065280914307, "epoch": 0.09966383231164722, "mean_token_accuracy": 0.7532680034637451, "num_tokens": 22867004.0, "step": 504, "train/ce_loss": 0.7978671789169312 }, { "epoch": 0.09966383231164722, "step": 504, "train/sim_loss": 0.000995039939880371 }, { "epoch": 0.09966383231164722, "step": 504, "train/total_loss": 0.08078175783157349 }, { "entropy": 6.011065483093262, "epoch": 0.09986157801067827, "mean_token_accuracy": 0.7274662852287292, "num_tokens": 22911775.0, "step": 505, "train/ce_loss": 1.2242186069488525 }, { "epoch": 0.09986157801067827, "step": 505, "train/sim_loss": 0.0020309090614318848 }, { "epoch": 0.09986157801067827, "step": 505, "train/total_loss": 0.12445276975631714 }, { "entropy": 5.814265251159668, "epoch": 0.10005932370970931, "mean_token_accuracy": 0.7400835156440735, "num_tokens": 22972333.0, "step": 506, "train/ce_loss": 0.5037992000579834 }, { "epoch": 0.10005932370970931, "step": 506, "train/sim_loss": 0.0006310939788818359 }, { "epoch": 0.10005932370970931, "step": 506, "train/total_loss": 0.051011014729738235 }, { "entropy": 6.188089370727539, "epoch": 0.10025706940874037, "mean_token_accuracy": 0.738849401473999, "num_tokens": 23020296.0, "step": 507, "train/ce_loss": 0.7422274351119995 }, { "epoch": 0.10025706940874037, "step": 507, "train/sim_loss": 0.0009174942970275879 }, { "epoch": 0.10025706940874037, "step": 507, "train/total_loss": 0.07514023780822754 }, { "entropy": 6.348583221435547, "epoch": 0.1004548151077714, "mean_token_accuracy": 0.6840659379959106, "num_tokens": 23069724.0, "step": 508, "train/ce_loss": 1.1375383138656616 }, { "epoch": 0.1004548151077714, "step": 508, "train/sim_loss": 0.0008388757705688477 }, { "epoch": 0.1004548151077714, "step": 508, "train/total_loss": 0.11459270864725113 }, { "entropy": 6.534713268280029, "epoch": 0.10065256080680245, "mean_token_accuracy": 0.7163398861885071, "num_tokens": 23118177.0, "step": 509, "train/ce_loss": 0.7182133793830872 }, { "epoch": 0.10065256080680245, "step": 509, "train/sim_loss": 0.0008765459060668945 }, { "epoch": 0.10065256080680245, "step": 509, "train/total_loss": 0.07269788533449173 }, { "entropy": 6.182679176330566, "epoch": 0.1008503065058335, "mean_token_accuracy": 0.7166144251823425, "num_tokens": 23168475.0, "step": 510, "train/ce_loss": 1.4234827756881714 }, { "epoch": 0.1008503065058335, "step": 510, "train/sim_loss": 0.0011848807334899902 }, { "epoch": 0.1008503065058335, "step": 510, "train/total_loss": 0.1435331553220749 }, { "entropy": 5.863994598388672, "epoch": 0.10104805220486454, "mean_token_accuracy": 0.7439796924591064, "num_tokens": 23202808.0, "step": 511, "train/ce_loss": 0.7732111811637878 }, { "epoch": 0.10104805220486454, "step": 511, "train/sim_loss": 0.001001119613647461 }, { "epoch": 0.10104805220486454, "step": 511, "train/total_loss": 0.07832223922014236 }, { "entropy": 6.300995349884033, "epoch": 0.1012457979038956, "mean_token_accuracy": 0.7377466559410095, "num_tokens": 23258113.0, "step": 512, "train/ce_loss": 0.6175611019134521 }, { "epoch": 0.1012457979038956, "step": 512, "train/sim_loss": 0.0020503997802734375 }, { "epoch": 0.1012457979038956, "step": 512, "train/total_loss": 0.06380651146173477 }, { "entropy": 6.010631561279297, "epoch": 0.10144354360292664, "mean_token_accuracy": 0.772656261920929, "num_tokens": 23309962.0, "step": 513, "train/ce_loss": 1.2076680660247803 }, { "epoch": 0.10144354360292664, "step": 513, "train/sim_loss": 0.0008578300476074219 }, { "epoch": 0.10144354360292664, "step": 513, "train/total_loss": 0.12162464112043381 }, { "entropy": 6.465473651885986, "epoch": 0.10164128930195768, "mean_token_accuracy": 0.7377850413322449, "num_tokens": 23357713.0, "step": 514, "train/ce_loss": 0.4932093918323517 }, { "epoch": 0.10164128930195768, "step": 514, "train/sim_loss": 0.0011118650436401367 }, { "epoch": 0.10164128930195768, "step": 514, "train/total_loss": 0.050432804971933365 }, { "entropy": 5.9339094161987305, "epoch": 0.10183903500098873, "mean_token_accuracy": 0.777413010597229, "num_tokens": 23392005.0, "step": 515, "train/ce_loss": 4.434643415152095e-05 }, { "epoch": 0.10183903500098873, "step": 515, "train/sim_loss": 0.0009027719497680664 }, { "epoch": 0.10183903500098873, "step": 515, "train/total_loss": 0.0009072066168300807 }, { "entropy": 6.143435478210449, "epoch": 0.10203678070001977, "mean_token_accuracy": 0.7352564334869385, "num_tokens": 23439573.0, "step": 516, "train/ce_loss": 4.460178388399072e-05 }, { "epoch": 0.10203678070001977, "step": 516, "train/sim_loss": 0.0007305145263671875 }, { "epoch": 0.10203678070001977, "step": 516, "train/total_loss": 0.0007349746883846819 }, { "entropy": 6.147747993469238, "epoch": 0.10223452639905083, "mean_token_accuracy": 0.7380211353302002, "num_tokens": 23487748.0, "step": 517, "train/ce_loss": 1.0860873460769653 }, { "epoch": 0.10223452639905083, "step": 517, "train/sim_loss": 0.0006400346755981445 }, { "epoch": 0.10223452639905083, "step": 517, "train/total_loss": 0.10924877226352692 }, { "entropy": 6.383942604064941, "epoch": 0.10243227209808187, "mean_token_accuracy": 0.742277979850769, "num_tokens": 23529454.0, "step": 518, "train/ce_loss": 7.575904601253569e-05 }, { "epoch": 0.10243227209808187, "step": 518, "train/sim_loss": 0.0009056329727172852 }, { "epoch": 0.10243227209808187, "step": 518, "train/total_loss": 0.0009132088744081557 }, { "entropy": 6.498430252075195, "epoch": 0.10263001779711291, "mean_token_accuracy": 0.7092476487159729, "num_tokens": 23576426.0, "step": 519, "train/ce_loss": 0.9268960356712341 }, { "epoch": 0.10263001779711291, "step": 519, "train/sim_loss": 0.0008155107498168945 }, { "epoch": 0.10263001779711291, "step": 519, "train/total_loss": 0.09350511431694031 }, { "epoch": 0.10282776349614396, "grad_norm": 0.5595378875732422, "learning_rate": 9.748244138886142e-06, "loss": 0.086, "step": 520 }, { "entropy": 6.6813507080078125, "epoch": 0.10282776349614396, "mean_token_accuracy": 0.721327006816864, "num_tokens": 23617910.0, "step": 520, "train/ce_loss": 6.241967639653012e-05 }, { "epoch": 0.10282776349614396, "step": 520, "train/sim_loss": 0.0007625818252563477 }, { "epoch": 0.10282776349614396, "step": 520, "train/total_loss": 0.0007688237819820642 }, { "entropy": 6.065478324890137, "epoch": 0.103025509195175, "mean_token_accuracy": 0.7028274536132812, "num_tokens": 23662609.0, "step": 521, "train/ce_loss": 1.0790481567382812 }, { "epoch": 0.103025509195175, "step": 521, "train/sim_loss": 0.0013287067413330078 }, { "epoch": 0.103025509195175, "step": 521, "train/total_loss": 0.10923352092504501 }, { "entropy": 6.25161075592041, "epoch": 0.10322325489420606, "mean_token_accuracy": 0.732243537902832, "num_tokens": 23727817.0, "step": 522, "train/ce_loss": 0.724105954170227 }, { "epoch": 0.10322325489420606, "step": 522, "train/sim_loss": 0.001020193099975586 }, { "epoch": 0.10322325489420606, "step": 522, "train/total_loss": 0.07343079149723053 }, { "entropy": 6.262462615966797, "epoch": 0.1034210005932371, "mean_token_accuracy": 0.7252625823020935, "num_tokens": 23763175.0, "step": 523, "train/ce_loss": 0.5744786262512207 }, { "epoch": 0.1034210005932371, "step": 523, "train/sim_loss": 0.0005340576171875 }, { "epoch": 0.1034210005932371, "step": 523, "train/total_loss": 0.05798191949725151 }, { "entropy": 5.974859714508057, "epoch": 0.10361874629226814, "mean_token_accuracy": 0.7408758997917175, "num_tokens": 23816064.0, "step": 524, "train/ce_loss": 0.7810661792755127 }, { "epoch": 0.10361874629226814, "step": 524, "train/sim_loss": 0.0007131099700927734 }, { "epoch": 0.10361874629226814, "step": 524, "train/total_loss": 0.07881972938776016 }, { "entropy": 6.331967353820801, "epoch": 0.10381649199129919, "mean_token_accuracy": 0.7422003149986267, "num_tokens": 23874373.0, "step": 525, "train/ce_loss": 0.9171954989433289 }, { "epoch": 0.10381649199129919, "step": 525, "train/sim_loss": 0.0013189315795898438 }, { "epoch": 0.10381649199129919, "step": 525, "train/total_loss": 0.09303848445415497 }, { "entropy": 6.113881587982178, "epoch": 0.10401423769033023, "mean_token_accuracy": 0.7659409046173096, "num_tokens": 23919370.0, "step": 526, "train/ce_loss": 6.670393486274406e-05 }, { "epoch": 0.10401423769033023, "step": 526, "train/sim_loss": 0.0007198452949523926 }, { "epoch": 0.10401423769033023, "step": 526, "train/total_loss": 0.0007265156600624323 }, { "entropy": 6.075135707855225, "epoch": 0.10421198338936129, "mean_token_accuracy": 0.7524971961975098, "num_tokens": 23961558.0, "step": 527, "train/ce_loss": 0.7214177250862122 }, { "epoch": 0.10421198338936129, "step": 527, "train/sim_loss": 0.0005724430084228516 }, { "epoch": 0.10421198338936129, "step": 527, "train/total_loss": 0.07271421700716019 }, { "entropy": 6.095463275909424, "epoch": 0.10440972908839233, "mean_token_accuracy": 0.7211920619010925, "num_tokens": 23991766.0, "step": 528, "train/ce_loss": 0.5668331980705261 }, { "epoch": 0.10440972908839233, "step": 528, "train/sim_loss": 0.0009710788726806641 }, { "epoch": 0.10440972908839233, "step": 528, "train/total_loss": 0.057654399424791336 }, { "entropy": 6.030959129333496, "epoch": 0.10460747478742337, "mean_token_accuracy": 0.7401091456413269, "num_tokens": 24033857.0, "step": 529, "train/ce_loss": 0.7261236906051636 }, { "epoch": 0.10460747478742337, "step": 529, "train/sim_loss": 0.0006560087203979492 }, { "epoch": 0.10460747478742337, "step": 529, "train/total_loss": 0.07326837629079819 }, { "entropy": 5.643402099609375, "epoch": 0.10480522048645442, "mean_token_accuracy": 0.7996794581413269, "num_tokens": 24057450.0, "step": 530, "train/ce_loss": 0.8065379858016968 }, { "epoch": 0.10480522048645442, "step": 530, "train/sim_loss": 0.0006707906723022461 }, { "epoch": 0.10480522048645442, "step": 530, "train/total_loss": 0.08132459223270416 }, { "entropy": 6.438591957092285, "epoch": 0.10500296618548546, "mean_token_accuracy": 0.7362573146820068, "num_tokens": 24103629.0, "step": 531, "train/ce_loss": 0.4807679057121277 }, { "epoch": 0.10500296618548546, "step": 531, "train/sim_loss": 0.0010447502136230469 }, { "epoch": 0.10500296618548546, "step": 531, "train/total_loss": 0.049121540039777756 }, { "entropy": 6.236705780029297, "epoch": 0.10520071188451652, "mean_token_accuracy": 0.7466007471084595, "num_tokens": 24138764.0, "step": 532, "train/ce_loss": 3.2946019172668457 }, { "epoch": 0.10520071188451652, "step": 532, "train/sim_loss": 0.0011935234069824219 }, { "epoch": 0.10520071188451652, "step": 532, "train/total_loss": 0.33065372705459595 }, { "entropy": 6.3466362953186035, "epoch": 0.10539845758354756, "mean_token_accuracy": 0.7505003213882446, "num_tokens": 24175075.0, "step": 533, "train/ce_loss": 0.9297951459884644 }, { "epoch": 0.10539845758354756, "step": 533, "train/sim_loss": 0.001353442668914795 }, { "epoch": 0.10539845758354756, "step": 533, "train/total_loss": 0.09433295577764511 }, { "entropy": 6.395438194274902, "epoch": 0.1055962032825786, "mean_token_accuracy": 0.7288609147071838, "num_tokens": 24224506.0, "step": 534, "train/ce_loss": 0.8246214985847473 }, { "epoch": 0.1055962032825786, "step": 534, "train/sim_loss": 0.00109785795211792 }, { "epoch": 0.1055962032825786, "step": 534, "train/total_loss": 0.08356001228094101 }, { "entropy": 6.128121376037598, "epoch": 0.10579394898160965, "mean_token_accuracy": 0.7014188766479492, "num_tokens": 24279395.0, "step": 535, "train/ce_loss": 0.954718291759491 }, { "epoch": 0.10579394898160965, "step": 535, "train/sim_loss": 0.0014132857322692871 }, { "epoch": 0.10579394898160965, "step": 535, "train/total_loss": 0.09688511490821838 }, { "entropy": 6.306400299072266, "epoch": 0.1059916946806407, "mean_token_accuracy": 0.7367491126060486, "num_tokens": 24342778.0, "step": 536, "train/ce_loss": 6.253004539757967e-05 }, { "epoch": 0.1059916946806407, "step": 536, "train/sim_loss": 0.0011567473411560059 }, { "epoch": 0.1059916946806407, "step": 536, "train/total_loss": 0.001163000357337296 }, { "entropy": 6.6188859939575195, "epoch": 0.10618944037967175, "mean_token_accuracy": 0.7391673922538757, "num_tokens": 24382386.0, "step": 537, "train/ce_loss": 1.4127438068389893 }, { "epoch": 0.10618944037967175, "step": 537, "train/sim_loss": 0.0013848543167114258 }, { "epoch": 0.10618944037967175, "step": 537, "train/total_loss": 0.1426592320203781 }, { "entropy": 6.366977691650391, "epoch": 0.10638718607870279, "mean_token_accuracy": 0.7269043922424316, "num_tokens": 24416619.0, "step": 538, "train/ce_loss": 0.9243720173835754 }, { "epoch": 0.10638718607870279, "step": 538, "train/sim_loss": 0.0007497072219848633 }, { "epoch": 0.10638718607870279, "step": 538, "train/total_loss": 0.09318690747022629 }, { "entropy": 6.0450639724731445, "epoch": 0.10658493177773383, "mean_token_accuracy": 0.8035439252853394, "num_tokens": 24447786.0, "step": 539, "train/ce_loss": 5.7396242482354864e-05 }, { "epoch": 0.10658493177773383, "step": 539, "train/sim_loss": 0.0007891058921813965 }, { "epoch": 0.10658493177773383, "step": 539, "train/total_loss": 0.0007948455167934299 }, { "epoch": 0.10678267747676488, "grad_norm": 0.43274030089378357, "learning_rate": 9.738351963596796e-06, "loss": 0.0832, "step": 540 }, { "entropy": 6.1666717529296875, "epoch": 0.10678267747676488, "mean_token_accuracy": 0.7817853689193726, "num_tokens": 24484519.0, "step": 540, "train/ce_loss": 0.5907310247421265 }, { "epoch": 0.10678267747676488, "step": 540, "train/sim_loss": 0.0007482171058654785 }, { "epoch": 0.10678267747676488, "step": 540, "train/total_loss": 0.059821318835020065 }, { "entropy": 6.46751594543457, "epoch": 0.10698042317579592, "mean_token_accuracy": 0.7428414225578308, "num_tokens": 24541249.0, "step": 541, "train/ce_loss": 1.1208765506744385 }, { "epoch": 0.10698042317579592, "step": 541, "train/sim_loss": 0.0009992122650146484 }, { "epoch": 0.10698042317579592, "step": 541, "train/total_loss": 0.11308687180280685 }, { "entropy": 6.217024326324463, "epoch": 0.10717816887482698, "mean_token_accuracy": 0.7402768731117249, "num_tokens": 24584546.0, "step": 542, "train/ce_loss": 0.8005713820457458 }, { "epoch": 0.10717816887482698, "step": 542, "train/sim_loss": 0.0006681680679321289 }, { "epoch": 0.10717816887482698, "step": 542, "train/total_loss": 0.0807253047823906 }, { "entropy": 6.071673393249512, "epoch": 0.10737591457385802, "mean_token_accuracy": 0.7296442985534668, "num_tokens": 24625825.0, "step": 543, "train/ce_loss": 1.3873510360717773 }, { "epoch": 0.10737591457385802, "step": 543, "train/sim_loss": 0.0005273818969726562 }, { "epoch": 0.10737591457385802, "step": 543, "train/total_loss": 0.13926248252391815 }, { "entropy": 6.190598487854004, "epoch": 0.10757366027288906, "mean_token_accuracy": 0.7614178657531738, "num_tokens": 24691862.0, "step": 544, "train/ce_loss": 4.465912206796929e-05 }, { "epoch": 0.10757366027288906, "step": 544, "train/sim_loss": 0.0007704496383666992 }, { "epoch": 0.10757366027288906, "step": 544, "train/total_loss": 0.0007749155629426241 }, { "entropy": 6.14871883392334, "epoch": 0.10777140597192011, "mean_token_accuracy": 0.7230527400970459, "num_tokens": 24730082.0, "step": 545, "train/ce_loss": 0.7179839015007019 }, { "epoch": 0.10777140597192011, "step": 545, "train/sim_loss": 0.000574946403503418 }, { "epoch": 0.10777140597192011, "step": 545, "train/total_loss": 0.07237333804368973 }, { "entropy": 6.302890777587891, "epoch": 0.10796915167095116, "mean_token_accuracy": 0.7251856923103333, "num_tokens": 24776849.0, "step": 546, "train/ce_loss": 0.7483782768249512 }, { "epoch": 0.10796915167095116, "step": 546, "train/sim_loss": 0.0006003379821777344 }, { "epoch": 0.10796915167095116, "step": 546, "train/total_loss": 0.07543816417455673 }, { "entropy": 6.157774925231934, "epoch": 0.10816689736998221, "mean_token_accuracy": 0.72920823097229, "num_tokens": 24811474.0, "step": 547, "train/ce_loss": 0.8783758878707886 }, { "epoch": 0.10816689736998221, "step": 547, "train/sim_loss": 0.0007457137107849121 }, { "epoch": 0.10816689736998221, "step": 547, "train/total_loss": 0.08858330547809601 }, { "entropy": 6.068285942077637, "epoch": 0.10836464306901325, "mean_token_accuracy": 0.7579445838928223, "num_tokens": 24862123.0, "step": 548, "train/ce_loss": 4.365808126749471e-05 }, { "epoch": 0.10836464306901325, "step": 548, "train/sim_loss": 0.0021712779998779297 }, { "epoch": 0.10836464306901325, "step": 548, "train/total_loss": 0.0021756438072770834 }, { "entropy": 5.870238304138184, "epoch": 0.10856238876804429, "mean_token_accuracy": 0.7197802066802979, "num_tokens": 24910174.0, "step": 549, "train/ce_loss": 5.321910066413693e-05 }, { "epoch": 0.10856238876804429, "step": 549, "train/sim_loss": 0.0005890130996704102 }, { "epoch": 0.10856238876804429, "step": 549, "train/total_loss": 0.0005943350261077285 }, { "entropy": 5.763305187225342, "epoch": 0.10876013446707535, "mean_token_accuracy": 0.7143691778182983, "num_tokens": 24949768.0, "step": 550, "train/ce_loss": 0.7010257244110107 }, { "epoch": 0.10876013446707535, "step": 550, "train/sim_loss": 0.001278996467590332 }, { "epoch": 0.10876013446707535, "step": 550, "train/total_loss": 0.0713815689086914 }, { "entropy": 6.219409942626953, "epoch": 0.10895788016610639, "mean_token_accuracy": 0.7433071136474609, "num_tokens": 25014192.0, "step": 551, "train/ce_loss": 5.341826908988878e-05 }, { "epoch": 0.10895788016610639, "step": 551, "train/sim_loss": 0.0013689994812011719 }, { "epoch": 0.10895788016610639, "step": 551, "train/total_loss": 0.0013743413146585226 }, { "entropy": 6.397554874420166, "epoch": 0.10915562586513744, "mean_token_accuracy": 0.7329420447349548, "num_tokens": 25071355.0, "step": 552, "train/ce_loss": 0.7818087935447693 }, { "epoch": 0.10915562586513744, "step": 552, "train/sim_loss": 0.0006763339042663574 }, { "epoch": 0.10915562586513744, "step": 552, "train/total_loss": 0.07885721325874329 }, { "entropy": 6.0607500076293945, "epoch": 0.10935337156416848, "mean_token_accuracy": 0.7586206793785095, "num_tokens": 25113631.0, "step": 553, "train/ce_loss": 0.7450578212738037 }, { "epoch": 0.10935337156416848, "step": 553, "train/sim_loss": 0.0009107589721679688 }, { "epoch": 0.10935337156416848, "step": 553, "train/total_loss": 0.07541654258966446 }, { "entropy": 6.066739559173584, "epoch": 0.10955111726319952, "mean_token_accuracy": 0.7402113080024719, "num_tokens": 25145008.0, "step": 554, "train/ce_loss": 1.0869847536087036 }, { "epoch": 0.10955111726319952, "step": 554, "train/sim_loss": 0.0008156299591064453 }, { "epoch": 0.10955111726319952, "step": 554, "train/total_loss": 0.10951410979032516 }, { "entropy": 5.586544990539551, "epoch": 0.10974886296223058, "mean_token_accuracy": 0.7697160840034485, "num_tokens": 25175366.0, "step": 555, "train/ce_loss": 0.46354061365127563 }, { "epoch": 0.10974886296223058, "step": 555, "train/sim_loss": 0.0008311271667480469 }, { "epoch": 0.10974886296223058, "step": 555, "train/total_loss": 0.04718519002199173 }, { "entropy": 6.174548149108887, "epoch": 0.10994660866126162, "mean_token_accuracy": 0.7084218263626099, "num_tokens": 25218340.0, "step": 556, "train/ce_loss": 1.2301552295684814 }, { "epoch": 0.10994660866126162, "step": 556, "train/sim_loss": 0.000667572021484375 }, { "epoch": 0.10994660866126162, "step": 556, "train/total_loss": 0.12368309497833252 }, { "entropy": 6.254624366760254, "epoch": 0.11014435436029267, "mean_token_accuracy": 0.7572289109230042, "num_tokens": 25254042.0, "step": 557, "train/ce_loss": 0.5924925208091736 }, { "epoch": 0.11014435436029267, "step": 557, "train/sim_loss": 0.001293480396270752 }, { "epoch": 0.11014435436029267, "step": 557, "train/total_loss": 0.06054273247718811 }, { "entropy": 6.109245777130127, "epoch": 0.11034210005932371, "mean_token_accuracy": 0.7356792092323303, "num_tokens": 25288419.0, "step": 558, "train/ce_loss": 1.1919480562210083 }, { "epoch": 0.11034210005932371, "step": 558, "train/sim_loss": 0.0004930496215820312 }, { "epoch": 0.11034210005932371, "step": 558, "train/total_loss": 0.11968785524368286 }, { "entropy": 6.231842041015625, "epoch": 0.11053984575835475, "mean_token_accuracy": 0.7064706087112427, "num_tokens": 25325252.0, "step": 559, "train/ce_loss": 1.5713086128234863 }, { "epoch": 0.11053984575835475, "step": 559, "train/sim_loss": 0.0005136728286743164 }, { "epoch": 0.11053984575835475, "step": 559, "train/total_loss": 0.15764454007148743 }, { "epoch": 0.1107375914573858, "grad_norm": 0.505063533782959, "learning_rate": 9.72845978830745e-06, "loss": 0.0842, "step": 560 }, { "entropy": 6.405361652374268, "epoch": 0.1107375914573858, "mean_token_accuracy": 0.7434523701667786, "num_tokens": 25370314.0, "step": 560, "train/ce_loss": 1.066406488418579 }, { "epoch": 0.1107375914573858, "step": 560, "train/sim_loss": 0.00106048583984375 }, { "epoch": 0.1107375914573858, "step": 560, "train/total_loss": 0.1077011376619339 }, { "entropy": 6.419428825378418, "epoch": 0.11093533715641685, "mean_token_accuracy": 0.7562540173530579, "num_tokens": 25406154.0, "step": 561, "train/ce_loss": 1.1596894264221191 }, { "epoch": 0.11093533715641685, "step": 561, "train/sim_loss": 0.001968562602996826 }, { "epoch": 0.11093533715641685, "step": 561, "train/total_loss": 0.11793750524520874 }, { "entropy": 6.0027570724487305, "epoch": 0.11113308285544789, "mean_token_accuracy": 0.7699458003044128, "num_tokens": 25469649.0, "step": 562, "train/ce_loss": 0.6311708092689514 }, { "epoch": 0.11113308285544789, "step": 562, "train/sim_loss": 0.0006566643714904785 }, { "epoch": 0.11113308285544789, "step": 562, "train/total_loss": 0.0637737438082695 }, { "entropy": 6.086897850036621, "epoch": 0.11133082855447894, "mean_token_accuracy": 0.6969875693321228, "num_tokens": 25526905.0, "step": 563, "train/ce_loss": 0.7936128377914429 }, { "epoch": 0.11133082855447894, "step": 563, "train/sim_loss": 0.0008391737937927246 }, { "epoch": 0.11133082855447894, "step": 563, "train/total_loss": 0.08020045608282089 }, { "entropy": 6.4272379875183105, "epoch": 0.11152857425350998, "mean_token_accuracy": 0.7324343323707581, "num_tokens": 25582684.0, "step": 564, "train/ce_loss": 0.7258131504058838 }, { "epoch": 0.11152857425350998, "step": 564, "train/sim_loss": 0.0009008049964904785 }, { "epoch": 0.11152857425350998, "step": 564, "train/total_loss": 0.07348211854696274 }, { "entropy": 5.720972061157227, "epoch": 0.11172631995254104, "mean_token_accuracy": 0.7526881694793701, "num_tokens": 25633614.0, "step": 565, "train/ce_loss": 0.9064192771911621 }, { "epoch": 0.11172631995254104, "step": 565, "train/sim_loss": 0.0008062124252319336 }, { "epoch": 0.11172631995254104, "step": 565, "train/total_loss": 0.09144814312458038 }, { "entropy": 5.943207740783691, "epoch": 0.11192406565157208, "mean_token_accuracy": 0.76449054479599, "num_tokens": 25676171.0, "step": 566, "train/ce_loss": 0.8792154788970947 }, { "epoch": 0.11192406565157208, "step": 566, "train/sim_loss": 0.0008357763290405273 }, { "epoch": 0.11192406565157208, "step": 566, "train/total_loss": 0.08875732868909836 }, { "entropy": 6.148874282836914, "epoch": 0.11212181135060312, "mean_token_accuracy": 0.7268785834312439, "num_tokens": 25719580.0, "step": 567, "train/ce_loss": 1.7416837215423584 }, { "epoch": 0.11212181135060312, "step": 567, "train/sim_loss": 0.0007559657096862793 }, { "epoch": 0.11212181135060312, "step": 567, "train/total_loss": 0.1749243438243866 }, { "entropy": 6.203708648681641, "epoch": 0.11231955704963417, "mean_token_accuracy": 0.7255638837814331, "num_tokens": 25766924.0, "step": 568, "train/ce_loss": 0.8673768043518066 }, { "epoch": 0.11231955704963417, "step": 568, "train/sim_loss": 0.0005576610565185547 }, { "epoch": 0.11231955704963417, "step": 568, "train/total_loss": 0.08729534596204758 }, { "entropy": 5.845222473144531, "epoch": 0.11251730274866521, "mean_token_accuracy": 0.7429398894309998, "num_tokens": 25811745.0, "step": 569, "train/ce_loss": 4.1496379708405584e-05 }, { "epoch": 0.11251730274866521, "step": 569, "train/sim_loss": 0.0007607936859130859 }, { "epoch": 0.11251730274866521, "step": 569, "train/total_loss": 0.000764943310059607 }, { "entropy": 6.2429046630859375, "epoch": 0.11271504844769627, "mean_token_accuracy": 0.7744593024253845, "num_tokens": 25864920.0, "step": 570, "train/ce_loss": 6.612328434130177e-05 }, { "epoch": 0.11271504844769627, "step": 570, "train/sim_loss": 0.0006710290908813477 }, { "epoch": 0.11271504844769627, "step": 570, "train/total_loss": 0.0006776414229534566 }, { "entropy": 6.054330825805664, "epoch": 0.11291279414672731, "mean_token_accuracy": 0.7166017889976501, "num_tokens": 25909164.0, "step": 571, "train/ce_loss": 0.6597997546195984 }, { "epoch": 0.11291279414672731, "step": 571, "train/sim_loss": 0.0008932352066040039 }, { "epoch": 0.11291279414672731, "step": 571, "train/total_loss": 0.0668732151389122 }, { "entropy": 6.152830600738525, "epoch": 0.11311053984575835, "mean_token_accuracy": 0.7377710342407227, "num_tokens": 25954939.0, "step": 572, "train/ce_loss": 0.715154230594635 }, { "epoch": 0.11311053984575835, "step": 572, "train/sim_loss": 0.0010331273078918457 }, { "epoch": 0.11311053984575835, "step": 572, "train/total_loss": 0.07254855334758759 }, { "entropy": 5.868516445159912, "epoch": 0.1133082855447894, "mean_token_accuracy": 0.719298243522644, "num_tokens": 26008996.0, "step": 573, "train/ce_loss": 4.856556552113034e-05 }, { "epoch": 0.1133082855447894, "step": 573, "train/sim_loss": 0.0009305477142333984 }, { "epoch": 0.1133082855447894, "step": 573, "train/total_loss": 0.0009354042704217136 }, { "entropy": 6.296710014343262, "epoch": 0.11350603124382044, "mean_token_accuracy": 0.7751519083976746, "num_tokens": 26078198.0, "step": 574, "train/ce_loss": 0.6451379060745239 }, { "epoch": 0.11350603124382044, "step": 574, "train/sim_loss": 0.0008420348167419434 }, { "epoch": 0.11350603124382044, "step": 574, "train/total_loss": 0.0653558298945427 }, { "entropy": 5.953096866607666, "epoch": 0.1137037769428515, "mean_token_accuracy": 0.772891104221344, "num_tokens": 26112916.0, "step": 575, "train/ce_loss": 0.8817339539527893 }, { "epoch": 0.1137037769428515, "step": 575, "train/sim_loss": 0.0009093880653381348 }, { "epoch": 0.1137037769428515, "step": 575, "train/total_loss": 0.08908278495073318 }, { "entropy": 6.099413871765137, "epoch": 0.11390152264188254, "mean_token_accuracy": 0.7819767594337463, "num_tokens": 26160403.0, "step": 576, "train/ce_loss": 0.9004310965538025 }, { "epoch": 0.11390152264188254, "step": 576, "train/sim_loss": 0.0006569623947143555 }, { "epoch": 0.11390152264188254, "step": 576, "train/total_loss": 0.09070007503032684 }, { "entropy": 6.197218894958496, "epoch": 0.11409926834091358, "mean_token_accuracy": 0.7176923155784607, "num_tokens": 26194869.0, "step": 577, "train/ce_loss": 0.542341947555542 }, { "epoch": 0.11409926834091358, "step": 577, "train/sim_loss": 0.0008060932159423828 }, { "epoch": 0.11409926834091358, "step": 577, "train/total_loss": 0.05504028871655464 }, { "entropy": 6.0669474601745605, "epoch": 0.11429701403994463, "mean_token_accuracy": 0.730715274810791, "num_tokens": 26231436.0, "step": 578, "train/ce_loss": 1.292199730873108 }, { "epoch": 0.11429701403994463, "step": 578, "train/sim_loss": 0.0016291141510009766 }, { "epoch": 0.11429701403994463, "step": 578, "train/total_loss": 0.13084909319877625 }, { "entropy": 6.257142066955566, "epoch": 0.11449475973897567, "mean_token_accuracy": 0.7356853485107422, "num_tokens": 26275057.0, "step": 579, "train/ce_loss": 0.6967375874519348 }, { "epoch": 0.11449475973897567, "step": 579, "train/sim_loss": 0.0010458827018737793 }, { "epoch": 0.11449475973897567, "step": 579, "train/total_loss": 0.0707196444272995 }, { "epoch": 0.11469250543800673, "grad_norm": 0.44161853194236755, "learning_rate": 9.718567613018103e-06, "loss": 0.0836, "step": 580 }, { "entropy": 6.3711371421813965, "epoch": 0.11469250543800673, "mean_token_accuracy": 0.7261484265327454, "num_tokens": 26316536.0, "step": 580, "train/ce_loss": 0.9306738972663879 }, { "epoch": 0.11469250543800673, "step": 580, "train/sim_loss": 0.0005167126655578613 }, { "epoch": 0.11469250543800673, "step": 580, "train/total_loss": 0.0935841053724289 }, { "entropy": 6.22993278503418, "epoch": 0.11489025113703777, "mean_token_accuracy": 0.7381545901298523, "num_tokens": 26353655.0, "step": 581, "train/ce_loss": 0.8340519666671753 }, { "epoch": 0.11489025113703777, "step": 581, "train/sim_loss": 0.0008862018585205078 }, { "epoch": 0.11489025113703777, "step": 581, "train/total_loss": 0.08429139852523804 }, { "entropy": 6.436459541320801, "epoch": 0.11508799683606881, "mean_token_accuracy": 0.7028921842575073, "num_tokens": 26420185.0, "step": 582, "train/ce_loss": 5.348266495275311e-05 }, { "epoch": 0.11508799683606881, "step": 582, "train/sim_loss": 0.0005248785018920898 }, { "epoch": 0.11508799683606881, "step": 582, "train/total_loss": 0.000530226796399802 }, { "entropy": 6.437438011169434, "epoch": 0.11528574253509986, "mean_token_accuracy": 0.7246474027633667, "num_tokens": 26470448.0, "step": 583, "train/ce_loss": 1.38981294631958 }, { "epoch": 0.11528574253509986, "step": 583, "train/sim_loss": 0.000644981861114502 }, { "epoch": 0.11528574253509986, "step": 583, "train/total_loss": 0.13962627947330475 }, { "entropy": 6.171667098999023, "epoch": 0.1154834882341309, "mean_token_accuracy": 0.7574324607849121, "num_tokens": 26507450.0, "step": 584, "train/ce_loss": 0.8380984663963318 }, { "epoch": 0.1154834882341309, "step": 584, "train/sim_loss": 0.0013026595115661621 }, { "epoch": 0.1154834882341309, "step": 584, "train/total_loss": 0.08511250466108322 }, { "entropy": 5.828327178955078, "epoch": 0.11568123393316196, "mean_token_accuracy": 0.7249122858047485, "num_tokens": 26555230.0, "step": 585, "train/ce_loss": 0.8947806358337402 }, { "epoch": 0.11568123393316196, "step": 585, "train/sim_loss": 0.0004693269729614258 }, { "epoch": 0.11568123393316196, "step": 585, "train/total_loss": 0.08994739502668381 }, { "entropy": 5.898695945739746, "epoch": 0.115878979632193, "mean_token_accuracy": 0.7683197855949402, "num_tokens": 26578615.0, "step": 586, "train/ce_loss": 0.792346179485321 }, { "epoch": 0.115878979632193, "step": 586, "train/sim_loss": 0.0005991458892822266 }, { "epoch": 0.115878979632193, "step": 586, "train/total_loss": 0.07983376830816269 }, { "entropy": 6.411154747009277, "epoch": 0.11607672533122404, "mean_token_accuracy": 0.6960716843605042, "num_tokens": 26646273.0, "step": 587, "train/ce_loss": 1.0121382474899292 }, { "epoch": 0.11607672533122404, "step": 587, "train/sim_loss": 0.001188039779663086 }, { "epoch": 0.11607672533122404, "step": 587, "train/total_loss": 0.10240186750888824 }, { "entropy": 5.886495113372803, "epoch": 0.1162744710302551, "mean_token_accuracy": 0.7207829356193542, "num_tokens": 26693482.0, "step": 588, "train/ce_loss": 3.53668219759129e-05 }, { "epoch": 0.1162744710302551, "step": 588, "train/sim_loss": 0.0006122589111328125 }, { "epoch": 0.1162744710302551, "step": 588, "train/total_loss": 0.0006157956086099148 }, { "entropy": 5.913239479064941, "epoch": 0.11647221672928613, "mean_token_accuracy": 0.7543859481811523, "num_tokens": 26732319.0, "step": 589, "train/ce_loss": 0.6096445322036743 }, { "epoch": 0.11647221672928613, "step": 589, "train/sim_loss": 0.0007977485656738281 }, { "epoch": 0.11647221672928613, "step": 589, "train/total_loss": 0.06176220253109932 }, { "entropy": 6.314866542816162, "epoch": 0.11666996242831719, "mean_token_accuracy": 0.7250257730484009, "num_tokens": 26786051.0, "step": 590, "train/ce_loss": 1.4157036542892456 }, { "epoch": 0.11666996242831719, "step": 590, "train/sim_loss": 0.0008903741836547852 }, { "epoch": 0.11666996242831719, "step": 590, "train/total_loss": 0.14246074855327606 }, { "entropy": 6.134860992431641, "epoch": 0.11686770812734823, "mean_token_accuracy": 0.7410447597503662, "num_tokens": 26831411.0, "step": 591, "train/ce_loss": 1.2146337032318115 }, { "epoch": 0.11686770812734823, "step": 591, "train/sim_loss": 0.0004647970199584961 }, { "epoch": 0.11686770812734823, "step": 591, "train/total_loss": 0.12192817032337189 }, { "entropy": 6.290523052215576, "epoch": 0.11706545382637927, "mean_token_accuracy": 0.7322670221328735, "num_tokens": 26871152.0, "step": 592, "train/ce_loss": 1.0195738077163696 }, { "epoch": 0.11706545382637927, "step": 592, "train/sim_loss": 0.0013827085494995117 }, { "epoch": 0.11706545382637927, "step": 592, "train/total_loss": 0.10334008932113647 }, { "entropy": 6.0504045486450195, "epoch": 0.11726319952541032, "mean_token_accuracy": 0.7530266046524048, "num_tokens": 26916088.0, "step": 593, "train/ce_loss": 1.1978813409805298 }, { "epoch": 0.11726319952541032, "step": 593, "train/sim_loss": 0.0007534623146057129 }, { "epoch": 0.11726319952541032, "step": 593, "train/total_loss": 0.12054159492254257 }, { "entropy": 6.288121223449707, "epoch": 0.11746094522444137, "mean_token_accuracy": 0.7054610848426819, "num_tokens": 26946797.0, "step": 594, "train/ce_loss": 3.206779718399048 }, { "epoch": 0.11746094522444137, "step": 594, "train/sim_loss": 0.0009601116180419922 }, { "epoch": 0.11746094522444137, "step": 594, "train/total_loss": 0.3216380774974823 }, { "entropy": 6.189278602600098, "epoch": 0.11765869092347242, "mean_token_accuracy": 0.7248592376708984, "num_tokens": 26987564.0, "step": 595, "train/ce_loss": 1.640616774559021 }, { "epoch": 0.11765869092347242, "step": 595, "train/sim_loss": 0.0006165504455566406 }, { "epoch": 0.11765869092347242, "step": 595, "train/total_loss": 0.16467823088169098 }, { "entropy": 5.899967193603516, "epoch": 0.11785643662250346, "mean_token_accuracy": 0.7563451528549194, "num_tokens": 27024486.0, "step": 596, "train/ce_loss": 0.79423987865448 }, { "epoch": 0.11785643662250346, "step": 596, "train/sim_loss": 0.0009961128234863281 }, { "epoch": 0.11785643662250346, "step": 596, "train/total_loss": 0.0804200991988182 }, { "entropy": 5.764100074768066, "epoch": 0.1180541823215345, "mean_token_accuracy": 0.8059045076370239, "num_tokens": 27059085.0, "step": 597, "train/ce_loss": 1.0257465839385986 }, { "epoch": 0.1180541823215345, "step": 597, "train/sim_loss": 0.0010786056518554688 }, { "epoch": 0.1180541823215345, "step": 597, "train/total_loss": 0.10365326702594757 }, { "entropy": 5.749194622039795, "epoch": 0.11825192802056556, "mean_token_accuracy": 0.7518987059593201, "num_tokens": 27105011.0, "step": 598, "train/ce_loss": 3.773485150304623e-05 }, { "epoch": 0.11825192802056556, "step": 598, "train/sim_loss": 0.0005005598068237305 }, { "epoch": 0.11825192802056556, "step": 598, "train/total_loss": 0.0005043332930654287 }, { "entropy": 6.098031044006348, "epoch": 0.1184496737195966, "mean_token_accuracy": 0.7251815795898438, "num_tokens": 27139034.0, "step": 599, "train/ce_loss": 1.490221381187439 }, { "epoch": 0.1184496737195966, "step": 599, "train/sim_loss": 0.0007370114326477051 }, { "epoch": 0.1184496737195966, "step": 599, "train/total_loss": 0.14975915849208832 }, { "epoch": 0.11864741941862765, "grad_norm": 0.47269588708877563, "learning_rate": 9.708675437728757e-06, "loss": 0.086, "step": 600 }, { "entropy": 5.730496406555176, "epoch": 0.11864741941862765, "mean_token_accuracy": 0.7840909361839294, "num_tokens": 27175765.0, "step": 600, "train/ce_loss": 4.59193361166399e-05 }, { "epoch": 0.11864741941862765, "step": 600, "train/sim_loss": 0.0004843473434448242 }, { "epoch": 0.11864741941862765, "step": 600, "train/total_loss": 0.0004889392876066267 }, { "entropy": 6.048929214477539, "epoch": 0.11884516511765869, "mean_token_accuracy": 0.745976984500885, "num_tokens": 27213198.0, "step": 601, "train/ce_loss": 0.5301898121833801 }, { "epoch": 0.11884516511765869, "step": 601, "train/sim_loss": 0.0005449056625366211 }, { "epoch": 0.11884516511765869, "step": 601, "train/total_loss": 0.05356388911604881 }, { "entropy": 5.885995864868164, "epoch": 0.11904291081668973, "mean_token_accuracy": 0.75, "num_tokens": 27267389.0, "step": 602, "train/ce_loss": 0.7561390995979309 }, { "epoch": 0.11904291081668973, "step": 602, "train/sim_loss": 0.0010575652122497559 }, { "epoch": 0.11904291081668973, "step": 602, "train/total_loss": 0.07667147368192673 }, { "entropy": 6.033019065856934, "epoch": 0.11924065651572079, "mean_token_accuracy": 0.752417802810669, "num_tokens": 27306267.0, "step": 603, "train/ce_loss": 1.0019031763076782 }, { "epoch": 0.11924065651572079, "step": 603, "train/sim_loss": 0.001365363597869873 }, { "epoch": 0.11924065651572079, "step": 603, "train/total_loss": 0.10155568271875381 }, { "entropy": 6.013729095458984, "epoch": 0.11943840221475183, "mean_token_accuracy": 0.7458033561706543, "num_tokens": 27357104.0, "step": 604, "train/ce_loss": 1.4700828790664673 }, { "epoch": 0.11943840221475183, "step": 604, "train/sim_loss": 0.0005685091018676758 }, { "epoch": 0.11943840221475183, "step": 604, "train/total_loss": 0.14757679402828217 }, { "entropy": 5.626277923583984, "epoch": 0.11963614791378288, "mean_token_accuracy": 0.7213470935821533, "num_tokens": 27398343.0, "step": 605, "train/ce_loss": 1.2139307260513306 }, { "epoch": 0.11963614791378288, "step": 605, "train/sim_loss": 0.001175224781036377 }, { "epoch": 0.11963614791378288, "step": 605, "train/total_loss": 0.12256830185651779 }, { "entropy": 5.936385631561279, "epoch": 0.11983389361281392, "mean_token_accuracy": 0.7411764860153198, "num_tokens": 27430985.0, "step": 606, "train/ce_loss": 1.0427016019821167 }, { "epoch": 0.11983389361281392, "step": 606, "train/sim_loss": 0.000569760799407959 }, { "epoch": 0.11983389361281392, "step": 606, "train/total_loss": 0.10483992099761963 }, { "entropy": 5.760608673095703, "epoch": 0.12003163931184496, "mean_token_accuracy": 0.7372764945030212, "num_tokens": 27465684.0, "step": 607, "train/ce_loss": 1.2509468793869019 }, { "epoch": 0.12003163931184496, "step": 607, "train/sim_loss": 0.0009860992431640625 }, { "epoch": 0.12003163931184496, "step": 607, "train/total_loss": 0.12608079612255096 }, { "entropy": 6.067965507507324, "epoch": 0.12022938501087602, "mean_token_accuracy": 0.7058823704719543, "num_tokens": 27502172.0, "step": 608, "train/ce_loss": 0.7017411589622498 }, { "epoch": 0.12022938501087602, "step": 608, "train/sim_loss": 0.0011396408081054688 }, { "epoch": 0.12022938501087602, "step": 608, "train/total_loss": 0.0713137611746788 }, { "entropy": 6.298610210418701, "epoch": 0.12042713070990706, "mean_token_accuracy": 0.7591801881790161, "num_tokens": 27537761.0, "step": 609, "train/ce_loss": 1.02512526512146 }, { "epoch": 0.12042713070990706, "step": 609, "train/sim_loss": 0.0007987022399902344 }, { "epoch": 0.12042713070990706, "step": 609, "train/total_loss": 0.10331123322248459 }, { "entropy": 5.7649993896484375, "epoch": 0.12062487640893811, "mean_token_accuracy": 0.7249674797058105, "num_tokens": 27570643.0, "step": 610, "train/ce_loss": 0.7448219060897827 }, { "epoch": 0.12062487640893811, "step": 610, "train/sim_loss": 0.0008988380432128906 }, { "epoch": 0.12062487640893811, "step": 610, "train/total_loss": 0.07538103312253952 }, { "entropy": 5.92007303237915, "epoch": 0.12082262210796915, "mean_token_accuracy": 0.7068575620651245, "num_tokens": 27616090.0, "step": 611, "train/ce_loss": 0.8748180270195007 }, { "epoch": 0.12082262210796915, "step": 611, "train/sim_loss": 0.000769495964050293 }, { "epoch": 0.12082262210796915, "step": 611, "train/total_loss": 0.08825130015611649 }, { "entropy": 6.096446990966797, "epoch": 0.12102036780700019, "mean_token_accuracy": 0.7452107071876526, "num_tokens": 27654741.0, "step": 612, "train/ce_loss": 5.736296225222759e-05 }, { "epoch": 0.12102036780700019, "step": 612, "train/sim_loss": 0.0010297894477844238 }, { "epoch": 0.12102036780700019, "step": 612, "train/total_loss": 0.0010355256963521242 }, { "entropy": 6.233041763305664, "epoch": 0.12121811350603125, "mean_token_accuracy": 0.7386594414710999, "num_tokens": 27707063.0, "step": 613, "train/ce_loss": 0.9807993173599243 }, { "epoch": 0.12121811350603125, "step": 613, "train/sim_loss": 0.0007704496383666992 }, { "epoch": 0.12121811350603125, "step": 613, "train/total_loss": 0.09885038435459137 }, { "entropy": 6.305136680603027, "epoch": 0.12141585920506229, "mean_token_accuracy": 0.7460216283798218, "num_tokens": 27760476.0, "step": 614, "train/ce_loss": 0.3908658027648926 }, { "epoch": 0.12141585920506229, "step": 614, "train/sim_loss": 0.0007993578910827637 }, { "epoch": 0.12141585920506229, "step": 614, "train/total_loss": 0.03988593816757202 }, { "entropy": 6.200961112976074, "epoch": 0.12161360490409334, "mean_token_accuracy": 0.6477572321891785, "num_tokens": 27806152.0, "step": 615, "train/ce_loss": 1.664127230644226 }, { "epoch": 0.12161360490409334, "step": 615, "train/sim_loss": 0.000926673412322998 }, { "epoch": 0.12161360490409334, "step": 615, "train/total_loss": 0.16733939945697784 }, { "entropy": 6.190495491027832, "epoch": 0.12181135060312438, "mean_token_accuracy": 0.7042006850242615, "num_tokens": 27861655.0, "step": 616, "train/ce_loss": 0.9056597948074341 }, { "epoch": 0.12181135060312438, "step": 616, "train/sim_loss": 0.0005937814712524414 }, { "epoch": 0.12181135060312438, "step": 616, "train/total_loss": 0.09115976095199585 }, { "entropy": 6.236837387084961, "epoch": 0.12200909630215542, "mean_token_accuracy": 0.7435590028762817, "num_tokens": 27930271.0, "step": 617, "train/ce_loss": 0.8794528841972351 }, { "epoch": 0.12200909630215542, "step": 617, "train/sim_loss": 0.0005555152893066406 }, { "epoch": 0.12200909630215542, "step": 617, "train/total_loss": 0.08850080519914627 }, { "entropy": 6.051233291625977, "epoch": 0.12220684200118648, "mean_token_accuracy": 0.7270047068595886, "num_tokens": 27978238.0, "step": 618, "train/ce_loss": 0.6954624056816101 }, { "epoch": 0.12220684200118648, "step": 618, "train/sim_loss": 0.0013319849967956543 }, { "epoch": 0.12220684200118648, "step": 618, "train/total_loss": 0.07087823003530502 }, { "entropy": 6.313536643981934, "epoch": 0.12240458770021752, "mean_token_accuracy": 0.7079697251319885, "num_tokens": 28033777.0, "step": 619, "train/ce_loss": 1.0160987377166748 }, { "epoch": 0.12240458770021752, "step": 619, "train/sim_loss": 0.0007264614105224609 }, { "epoch": 0.12240458770021752, "step": 619, "train/total_loss": 0.1023363396525383 }, { "epoch": 0.12260233339924857, "grad_norm": 0.4921729266643524, "learning_rate": 9.698783262439411e-06, "loss": 0.0884, "step": 620 }, { "entropy": 5.988088607788086, "epoch": 0.12260233339924857, "mean_token_accuracy": 0.732172966003418, "num_tokens": 28088650.0, "step": 620, "train/ce_loss": 0.3487662971019745 }, { "epoch": 0.12260233339924857, "step": 620, "train/sim_loss": 0.0005491971969604492 }, { "epoch": 0.12260233339924857, "step": 620, "train/total_loss": 0.0354258269071579 }, { "entropy": 5.716334342956543, "epoch": 0.12280007909827961, "mean_token_accuracy": 0.775347888469696, "num_tokens": 28118604.0, "step": 621, "train/ce_loss": 0.9337778091430664 }, { "epoch": 0.12280007909827961, "step": 621, "train/sim_loss": 0.0006767511367797852 }, { "epoch": 0.12280007909827961, "step": 621, "train/total_loss": 0.09405453503131866 }, { "entropy": 5.812374114990234, "epoch": 0.12299782479731065, "mean_token_accuracy": 0.7210702300071716, "num_tokens": 28168397.0, "step": 622, "train/ce_loss": 1.260109782218933 }, { "epoch": 0.12299782479731065, "step": 622, "train/sim_loss": 0.0007009506225585938 }, { "epoch": 0.12299782479731065, "step": 622, "train/total_loss": 0.12671193480491638 }, { "entropy": 5.769130706787109, "epoch": 0.12319557049634171, "mean_token_accuracy": 0.7231214046478271, "num_tokens": 28228167.0, "step": 623, "train/ce_loss": 0.8372365832328796 }, { "epoch": 0.12319557049634171, "step": 623, "train/sim_loss": 0.0013037323951721191 }, { "epoch": 0.12319557049634171, "step": 623, "train/total_loss": 0.08502738922834396 }, { "entropy": 6.039603233337402, "epoch": 0.12339331619537275, "mean_token_accuracy": 0.7154046893119812, "num_tokens": 28279411.0, "step": 624, "train/ce_loss": 0.8043203949928284 }, { "epoch": 0.12339331619537275, "step": 624, "train/sim_loss": 0.0005688667297363281 }, { "epoch": 0.12339331619537275, "step": 624, "train/total_loss": 0.0810009092092514 }, { "entropy": 5.811825275421143, "epoch": 0.1235910618944038, "mean_token_accuracy": 0.7452521324157715, "num_tokens": 28326923.0, "step": 625, "train/ce_loss": 0.5874802470207214 }, { "epoch": 0.1235910618944038, "step": 625, "train/sim_loss": 0.0004895329475402832 }, { "epoch": 0.1235910618944038, "step": 625, "train/total_loss": 0.059237558394670486 }, { "entropy": 6.454201698303223, "epoch": 0.12378880759343484, "mean_token_accuracy": 0.7321131229400635, "num_tokens": 28376171.0, "step": 626, "train/ce_loss": 0.7096596360206604 }, { "epoch": 0.12378880759343484, "step": 626, "train/sim_loss": 0.0005436539649963379 }, { "epoch": 0.12378880759343484, "step": 626, "train/total_loss": 0.07150962203741074 }, { "entropy": 5.800426483154297, "epoch": 0.12398655329246588, "mean_token_accuracy": 0.7654896974563599, "num_tokens": 28402806.0, "step": 627, "train/ce_loss": 4.670398629968986e-05 }, { "epoch": 0.12398655329246588, "step": 627, "train/sim_loss": 0.00042629241943359375 }, { "epoch": 0.12398655329246588, "step": 627, "train/total_loss": 0.00043096282752230763 }, { "entropy": 6.122576713562012, "epoch": 0.12418429899149694, "mean_token_accuracy": 0.7404537796974182, "num_tokens": 28459232.0, "step": 628, "train/ce_loss": 0.6282673478126526 }, { "epoch": 0.12418429899149694, "step": 628, "train/sim_loss": 0.00042551755905151367 }, { "epoch": 0.12418429899149694, "step": 628, "train/total_loss": 0.06325225532054901 }, { "entropy": 6.106772422790527, "epoch": 0.12438204469052798, "mean_token_accuracy": 0.7569342851638794, "num_tokens": 28501930.0, "step": 629, "train/ce_loss": 2.0183098316192627 }, { "epoch": 0.12438204469052798, "step": 629, "train/sim_loss": 0.0006026029586791992 }, { "epoch": 0.12438204469052798, "step": 629, "train/total_loss": 0.20243358612060547 }, { "entropy": 6.110541343688965, "epoch": 0.12457979038955903, "mean_token_accuracy": 0.7582733631134033, "num_tokens": 28544484.0, "step": 630, "train/ce_loss": 1.0897371768951416 }, { "epoch": 0.12457979038955903, "step": 630, "train/sim_loss": 0.0007355809211730957 }, { "epoch": 0.12457979038955903, "step": 630, "train/total_loss": 0.10970930010080338 }, { "entropy": 6.307455062866211, "epoch": 0.12477753608859007, "mean_token_accuracy": 0.7700934410095215, "num_tokens": 28604277.0, "step": 631, "train/ce_loss": 0.9179310202598572 }, { "epoch": 0.12477753608859007, "step": 631, "train/sim_loss": 0.0005244016647338867 }, { "epoch": 0.12477753608859007, "step": 631, "train/total_loss": 0.09231750667095184 }, { "entropy": 6.10416841506958, "epoch": 0.12497528178762111, "mean_token_accuracy": 0.7643384337425232, "num_tokens": 28658072.0, "step": 632, "train/ce_loss": 0.958868145942688 }, { "epoch": 0.12497528178762111, "step": 632, "train/sim_loss": 0.0005815625190734863 }, { "epoch": 0.12497528178762111, "step": 632, "train/total_loss": 0.09646838158369064 }, { "entropy": 6.012268543243408, "epoch": 0.12517302748665216, "mean_token_accuracy": 0.743697464466095, "num_tokens": 28705393.0, "step": 633, "train/ce_loss": 0.9592368006706238 }, { "epoch": 0.12517302748665216, "step": 633, "train/sim_loss": 0.0006198883056640625 }, { "epoch": 0.12517302748665216, "step": 633, "train/total_loss": 0.0965435728430748 }, { "entropy": 6.371953010559082, "epoch": 0.1253707731856832, "mean_token_accuracy": 0.7328370809555054, "num_tokens": 28743875.0, "step": 634, "train/ce_loss": 1.093881368637085 }, { "epoch": 0.1253707731856832, "step": 634, "train/sim_loss": 0.0008979439735412598 }, { "epoch": 0.1253707731856832, "step": 634, "train/total_loss": 0.11028607934713364 }, { "entropy": 6.293490886688232, "epoch": 0.12556851888471426, "mean_token_accuracy": 0.7227112650871277, "num_tokens": 28785061.0, "step": 635, "train/ce_loss": 0.9544775485992432 }, { "epoch": 0.12556851888471426, "step": 635, "train/sim_loss": 0.00072479248046875 }, { "epoch": 0.12556851888471426, "step": 635, "train/total_loss": 0.09617254883050919 }, { "entropy": 6.305379867553711, "epoch": 0.1257662645837453, "mean_token_accuracy": 0.7565698623657227, "num_tokens": 28841286.0, "step": 636, "train/ce_loss": 1.2007282972335815 }, { "epoch": 0.1257662645837453, "step": 636, "train/sim_loss": 0.0008318424224853516 }, { "epoch": 0.1257662645837453, "step": 636, "train/total_loss": 0.12090467661619186 }, { "entropy": 6.229506015777588, "epoch": 0.12596401028277635, "mean_token_accuracy": 0.7303071022033691, "num_tokens": 28876434.0, "step": 637, "train/ce_loss": 1.2764211893081665 }, { "epoch": 0.12596401028277635, "step": 637, "train/sim_loss": 0.0007686614990234375 }, { "epoch": 0.12596401028277635, "step": 637, "train/total_loss": 0.12841078639030457 }, { "entropy": 6.347451210021973, "epoch": 0.1261617559818074, "mean_token_accuracy": 0.7249814867973328, "num_tokens": 28926463.0, "step": 638, "train/ce_loss": 0.7618205547332764 }, { "epoch": 0.1261617559818074, "step": 638, "train/sim_loss": 0.0007395148277282715 }, { "epoch": 0.1261617559818074, "step": 638, "train/total_loss": 0.07692157477140427 }, { "entropy": 5.7793426513671875, "epoch": 0.12635950168083845, "mean_token_accuracy": 0.7544052600860596, "num_tokens": 28967719.0, "step": 639, "train/ce_loss": 0.7371929883956909 }, { "epoch": 0.12635950168083845, "step": 639, "train/sim_loss": 0.0005372762680053711 }, { "epoch": 0.12635950168083845, "step": 639, "train/total_loss": 0.07425657659769058 }, { "epoch": 0.12655724737986948, "grad_norm": 0.4491388201713562, "learning_rate": 9.688891087150065e-06, "loss": 0.0839, "step": 640 }, { "entropy": 5.839798450469971, "epoch": 0.12655724737986948, "mean_token_accuracy": 0.746835470199585, "num_tokens": 28998767.0, "step": 640, "train/ce_loss": 0.6868319511413574 }, { "epoch": 0.12655724737986948, "step": 640, "train/sim_loss": 0.0005218982696533203 }, { "epoch": 0.12655724737986948, "step": 640, "train/total_loss": 0.06920509785413742 }, { "entropy": 5.814058303833008, "epoch": 0.12675499307890054, "mean_token_accuracy": 0.777694046497345, "num_tokens": 29036498.0, "step": 641, "train/ce_loss": 1.3241469860076904 }, { "epoch": 0.12675499307890054, "step": 641, "train/sim_loss": 0.0008108615875244141 }, { "epoch": 0.12675499307890054, "step": 641, "train/total_loss": 0.13322556018829346 }, { "entropy": 6.00761604309082, "epoch": 0.1269527387779316, "mean_token_accuracy": 0.6857334971427917, "num_tokens": 29076495.0, "step": 642, "train/ce_loss": 1.1248598098754883 }, { "epoch": 0.1269527387779316, "step": 642, "train/sim_loss": 0.0007562637329101562 }, { "epoch": 0.1269527387779316, "step": 642, "train/total_loss": 0.1132422462105751 }, { "entropy": 5.5292816162109375, "epoch": 0.12715048447696262, "mean_token_accuracy": 0.7838957905769348, "num_tokens": 29108838.0, "step": 643, "train/ce_loss": 0.44833803176879883 }, { "epoch": 0.12715048447696262, "step": 643, "train/sim_loss": 0.0005987882614135742 }, { "epoch": 0.12715048447696262, "step": 643, "train/total_loss": 0.045432593673467636 }, { "entropy": 5.629228115081787, "epoch": 0.12734823017599367, "mean_token_accuracy": 0.7352281212806702, "num_tokens": 29137366.0, "step": 644, "train/ce_loss": 4.926456676912494e-05 }, { "epoch": 0.12734823017599367, "step": 644, "train/sim_loss": 0.000510096549987793 }, { "epoch": 0.12734823017599367, "step": 644, "train/total_loss": 0.0005150230135768652 }, { "entropy": 6.210299491882324, "epoch": 0.12754597587502473, "mean_token_accuracy": 0.7634092569351196, "num_tokens": 29177100.0, "step": 645, "train/ce_loss": 3.811829083133489e-05 }, { "epoch": 0.12754597587502473, "step": 645, "train/sim_loss": 0.0009427666664123535 }, { "epoch": 0.12754597587502473, "step": 645, "train/total_loss": 0.0009465785115025938 }, { "entropy": 6.072360038757324, "epoch": 0.12774372157405575, "mean_token_accuracy": 0.7041719555854797, "num_tokens": 29212895.0, "step": 646, "train/ce_loss": 1.0919660329818726 }, { "epoch": 0.12774372157405575, "step": 646, "train/sim_loss": 0.0005402565002441406 }, { "epoch": 0.12774372157405575, "step": 646, "train/total_loss": 0.1097368597984314 }, { "entropy": 6.282230377197266, "epoch": 0.1279414672730868, "mean_token_accuracy": 0.6912133693695068, "num_tokens": 29250077.0, "step": 647, "train/ce_loss": 1.10418701171875 }, { "epoch": 0.1279414672730868, "step": 647, "train/sim_loss": 0.0009850263595581055 }, { "epoch": 0.1279414672730868, "step": 647, "train/total_loss": 0.11140372604131699 }, { "entropy": 6.279695987701416, "epoch": 0.12813921297211786, "mean_token_accuracy": 0.7065727710723877, "num_tokens": 29313159.0, "step": 648, "train/ce_loss": 1.6936777830123901 }, { "epoch": 0.12813921297211786, "step": 648, "train/sim_loss": 0.0005993247032165527 }, { "epoch": 0.12813921297211786, "step": 648, "train/total_loss": 0.16996710002422333 }, { "entropy": 5.914213180541992, "epoch": 0.12833695867114892, "mean_token_accuracy": 0.7654321193695068, "num_tokens": 29347356.0, "step": 649, "train/ce_loss": 1.0455437898635864 }, { "epoch": 0.12833695867114892, "step": 649, "train/sim_loss": 0.0003940463066101074 }, { "epoch": 0.12833695867114892, "step": 649, "train/total_loss": 0.10494842380285263 }, { "entropy": 6.000546455383301, "epoch": 0.12853470437017994, "mean_token_accuracy": 0.760047972202301, "num_tokens": 29368828.0, "step": 650, "train/ce_loss": 0.4242617189884186 }, { "epoch": 0.12853470437017994, "step": 650, "train/sim_loss": 0.0007671117782592773 }, { "epoch": 0.12853470437017994, "step": 650, "train/total_loss": 0.043193284422159195 }, { "entropy": 6.137625694274902, "epoch": 0.128732450069211, "mean_token_accuracy": 0.7590782046318054, "num_tokens": 29428547.0, "step": 651, "train/ce_loss": 0.6093527674674988 }, { "epoch": 0.128732450069211, "step": 651, "train/sim_loss": 0.0004203319549560547 }, { "epoch": 0.128732450069211, "step": 651, "train/total_loss": 0.06135560944676399 }, { "entropy": 5.8990373611450195, "epoch": 0.12893019576824205, "mean_token_accuracy": 0.7393749952316284, "num_tokens": 29469953.0, "step": 652, "train/ce_loss": 1.277830958366394 }, { "epoch": 0.12893019576824205, "step": 652, "train/sim_loss": 0.0006027817726135254 }, { "epoch": 0.12893019576824205, "step": 652, "train/total_loss": 0.12838588654994965 }, { "entropy": 5.540538787841797, "epoch": 0.12912794146727308, "mean_token_accuracy": 0.7984886765480042, "num_tokens": 29496042.0, "step": 653, "train/ce_loss": 3.1145948014454916e-05 }, { "epoch": 0.12912794146727308, "step": 653, "train/sim_loss": 0.0007616281509399414 }, { "epoch": 0.12912794146727308, "step": 653, "train/total_loss": 0.0007647427264600992 }, { "entropy": 6.063568592071533, "epoch": 0.12932568716630413, "mean_token_accuracy": 0.7481824159622192, "num_tokens": 29538917.0, "step": 654, "train/ce_loss": 0.8467016220092773 }, { "epoch": 0.12932568716630413, "step": 654, "train/sim_loss": 0.0004944801330566406 }, { "epoch": 0.12932568716630413, "step": 654, "train/total_loss": 0.0851646438241005 }, { "entropy": 6.1175618171691895, "epoch": 0.1295234328653352, "mean_token_accuracy": 0.7189655303955078, "num_tokens": 29591602.0, "step": 655, "train/ce_loss": 0.30977362394332886 }, { "epoch": 0.1295234328653352, "step": 655, "train/sim_loss": 0.0006098747253417969 }, { "epoch": 0.1295234328653352, "step": 655, "train/total_loss": 0.03158723562955856 }, { "entropy": 6.039461135864258, "epoch": 0.1297211785643662, "mean_token_accuracy": 0.7529904246330261, "num_tokens": 29636892.0, "step": 656, "train/ce_loss": 0.9232975244522095 }, { "epoch": 0.1297211785643662, "step": 656, "train/sim_loss": 0.0006821155548095703 }, { "epoch": 0.1297211785643662, "step": 656, "train/total_loss": 0.09301187098026276 }, { "entropy": 5.921460151672363, "epoch": 0.12991892426339727, "mean_token_accuracy": 0.7633175849914551, "num_tokens": 29685969.0, "step": 657, "train/ce_loss": 3.33777024934534e-05 }, { "epoch": 0.12991892426339727, "step": 657, "train/sim_loss": 0.0005954504013061523 }, { "epoch": 0.12991892426339727, "step": 657, "train/total_loss": 0.0005987881449982524 }, { "entropy": 6.045784950256348, "epoch": 0.13011666996242832, "mean_token_accuracy": 0.7382328510284424, "num_tokens": 29722203.0, "step": 658, "train/ce_loss": 0.751658022403717 }, { "epoch": 0.13011666996242832, "step": 658, "train/sim_loss": 0.00047600269317626953 }, { "epoch": 0.13011666996242832, "step": 658, "train/total_loss": 0.07564180344343185 }, { "entropy": 6.198486328125, "epoch": 0.13031441566145938, "mean_token_accuracy": 0.7074204683303833, "num_tokens": 29774303.0, "step": 659, "train/ce_loss": 0.9177547097206116 }, { "epoch": 0.13031441566145938, "step": 659, "train/sim_loss": 0.0006203651428222656 }, { "epoch": 0.13031441566145938, "step": 659, "train/total_loss": 0.0923958346247673 }, { "epoch": 0.1305121613604904, "grad_norm": 0.4769200384616852, "learning_rate": 9.678998911860718e-06, "loss": 0.0825, "step": 660 }, { "entropy": 6.363011360168457, "epoch": 0.1305121613604904, "mean_token_accuracy": 0.7548161149024963, "num_tokens": 29814969.0, "step": 660, "train/ce_loss": 0.8302666544914246 }, { "epoch": 0.1305121613604904, "step": 660, "train/sim_loss": 0.0008617639541625977 }, { "epoch": 0.1305121613604904, "step": 660, "train/total_loss": 0.08388843387365341 }, { "entropy": 6.038259983062744, "epoch": 0.13070990705952146, "mean_token_accuracy": 0.7933383584022522, "num_tokens": 29852335.0, "step": 661, "train/ce_loss": 3.717758954735473e-05 }, { "epoch": 0.13070990705952146, "step": 661, "train/sim_loss": 0.0011419057846069336 }, { "epoch": 0.13070990705952146, "step": 661, "train/total_loss": 0.0011456235079094768 }, { "entropy": 6.083418846130371, "epoch": 0.1309076527585525, "mean_token_accuracy": 0.7268332242965698, "num_tokens": 29892020.0, "step": 662, "train/ce_loss": 1.1799134016036987 }, { "epoch": 0.1309076527585525, "step": 662, "train/sim_loss": 0.0009133815765380859 }, { "epoch": 0.1309076527585525, "step": 662, "train/total_loss": 0.1189047247171402 }, { "entropy": 5.984925746917725, "epoch": 0.13110539845758354, "mean_token_accuracy": 0.709956705570221, "num_tokens": 29943473.0, "step": 663, "train/ce_loss": 0.6088041663169861 }, { "epoch": 0.13110539845758354, "step": 663, "train/sim_loss": 0.0006122589111328125 }, { "epoch": 0.13110539845758354, "step": 663, "train/total_loss": 0.0614926777780056 }, { "entropy": 6.3466081619262695, "epoch": 0.1313031441566146, "mean_token_accuracy": 0.7011265754699707, "num_tokens": 29990171.0, "step": 664, "train/ce_loss": 4.21528093283996e-05 }, { "epoch": 0.1313031441566146, "step": 664, "train/sim_loss": 0.0005725622177124023 }, { "epoch": 0.1313031441566146, "step": 664, "train/total_loss": 0.0005767775001004338 }, { "entropy": 6.151966571807861, "epoch": 0.13150088985564565, "mean_token_accuracy": 0.7633175849914551, "num_tokens": 30054092.0, "step": 665, "train/ce_loss": 0.41795840859413147 }, { "epoch": 0.13150088985564565, "step": 665, "train/sim_loss": 0.0006021261215209961 }, { "epoch": 0.13150088985564565, "step": 665, "train/total_loss": 0.04239796847105026 }, { "entropy": 5.769698143005371, "epoch": 0.13169863555467667, "mean_token_accuracy": 0.7265238761901855, "num_tokens": 30108278.0, "step": 666, "train/ce_loss": 0.5056857466697693 }, { "epoch": 0.13169863555467667, "step": 666, "train/sim_loss": 0.0010325908660888672 }, { "epoch": 0.13169863555467667, "step": 666, "train/total_loss": 0.051601167768239975 }, { "entropy": 5.67519474029541, "epoch": 0.13189638125370773, "mean_token_accuracy": 0.7461851239204407, "num_tokens": 30143653.0, "step": 667, "train/ce_loss": 0.5801792740821838 }, { "epoch": 0.13189638125370773, "step": 667, "train/sim_loss": 0.00043892860412597656 }, { "epoch": 0.13189638125370773, "step": 667, "train/total_loss": 0.05845685675740242 }, { "entropy": 5.860019683837891, "epoch": 0.13209412695273878, "mean_token_accuracy": 0.7636494040489197, "num_tokens": 30202123.0, "step": 668, "train/ce_loss": 0.7695927023887634 }, { "epoch": 0.13209412695273878, "step": 668, "train/sim_loss": 0.000885009765625 }, { "epoch": 0.13209412695273878, "step": 668, "train/total_loss": 0.0778442844748497 }, { "entropy": 6.007426738739014, "epoch": 0.13229187265176984, "mean_token_accuracy": 0.7546838521957397, "num_tokens": 30236701.0, "step": 669, "train/ce_loss": 0.5337738394737244 }, { "epoch": 0.13229187265176984, "step": 669, "train/sim_loss": 0.0006247758865356445 }, { "epoch": 0.13229187265176984, "step": 669, "train/total_loss": 0.05400216206908226 }, { "entropy": 5.741494178771973, "epoch": 0.13248961835080086, "mean_token_accuracy": 0.6916996240615845, "num_tokens": 30278355.0, "step": 670, "train/ce_loss": 2.7709038257598877 }, { "epoch": 0.13248961835080086, "step": 670, "train/sim_loss": 0.0005031228065490723 }, { "epoch": 0.13248961835080086, "step": 670, "train/total_loss": 0.2775935232639313 }, { "entropy": 6.099565029144287, "epoch": 0.13268736404983192, "mean_token_accuracy": 0.7235659956932068, "num_tokens": 30318328.0, "step": 671, "train/ce_loss": 1.3175170421600342 }, { "epoch": 0.13268736404983192, "step": 671, "train/sim_loss": 0.0007691383361816406 }, { "epoch": 0.13268736404983192, "step": 671, "train/total_loss": 0.13252083957195282 }, { "entropy": 5.900182723999023, "epoch": 0.13288510974886297, "mean_token_accuracy": 0.7734909653663635, "num_tokens": 30362311.0, "step": 672, "train/ce_loss": 3.783823922276497e-05 }, { "epoch": 0.13288510974886297, "step": 672, "train/sim_loss": 0.0008071064949035645 }, { "epoch": 0.13288510974886297, "step": 672, "train/total_loss": 0.0008108903421089053 }, { "entropy": 6.384413242340088, "epoch": 0.133082855447894, "mean_token_accuracy": 0.7354739904403687, "num_tokens": 30402913.0, "step": 673, "train/ce_loss": 1.2271872758865356 }, { "epoch": 0.133082855447894, "step": 673, "train/sim_loss": 0.0007339715957641602 }, { "epoch": 0.133082855447894, "step": 673, "train/total_loss": 0.12345270067453384 }, { "entropy": 6.100738048553467, "epoch": 0.13328060114692505, "mean_token_accuracy": 0.7361563444137573, "num_tokens": 30457917.0, "step": 674, "train/ce_loss": 1.0595135688781738 }, { "epoch": 0.13328060114692505, "step": 674, "train/sim_loss": 0.00040531158447265625 }, { "epoch": 0.13328060114692505, "step": 674, "train/total_loss": 0.1063566729426384 }, { "entropy": 5.9133195877075195, "epoch": 0.1334783468459561, "mean_token_accuracy": 0.7486979365348816, "num_tokens": 30497894.0, "step": 675, "train/ce_loss": 1.087918996810913 }, { "epoch": 0.1334783468459561, "step": 675, "train/sim_loss": 0.0007911920547485352 }, { "epoch": 0.1334783468459561, "step": 675, "train/total_loss": 0.10958309471607208 }, { "entropy": 6.162140846252441, "epoch": 0.13367609254498714, "mean_token_accuracy": 0.7315658926963806, "num_tokens": 30529849.0, "step": 676, "train/ce_loss": 4.023990550194867e-05 }, { "epoch": 0.13367609254498714, "step": 676, "train/sim_loss": 0.0010644793510437012 }, { "epoch": 0.13367609254498714, "step": 676, "train/total_loss": 0.001068503363057971 }, { "entropy": 6.2546563148498535, "epoch": 0.1338738382440182, "mean_token_accuracy": 0.6955719590187073, "num_tokens": 30582617.0, "step": 677, "train/ce_loss": 1.9307942390441895 }, { "epoch": 0.1338738382440182, "step": 677, "train/sim_loss": 0.0005082488059997559 }, { "epoch": 0.1338738382440182, "step": 677, "train/total_loss": 0.19358767569065094 }, { "entropy": 6.2351274490356445, "epoch": 0.13407158394304924, "mean_token_accuracy": 0.72547847032547, "num_tokens": 30625134.0, "step": 678, "train/ce_loss": 1.6317099332809448 }, { "epoch": 0.13407158394304924, "step": 678, "train/sim_loss": 0.0008195042610168457 }, { "epoch": 0.13407158394304924, "step": 678, "train/total_loss": 0.16399049758911133 }, { "entropy": 6.325404644012451, "epoch": 0.1342693296420803, "mean_token_accuracy": 0.698630154132843, "num_tokens": 30661121.0, "step": 679, "train/ce_loss": 0.6604559421539307 }, { "epoch": 0.1342693296420803, "step": 679, "train/sim_loss": 0.0007506608963012695 }, { "epoch": 0.1342693296420803, "step": 679, "train/total_loss": 0.06679625809192657 }, { "epoch": 0.13446707534111133, "grad_norm": 0.5012522339820862, "learning_rate": 9.669106736571374e-06, "loss": 0.0861, "step": 680 }, { "entropy": 6.185981750488281, "epoch": 0.13446707534111133, "mean_token_accuracy": 0.7355466485023499, "num_tokens": 30707110.0, "step": 680, "train/ce_loss": 0.3940977454185486 }, { "epoch": 0.13446707534111133, "step": 680, "train/sim_loss": 0.0007114410400390625 }, { "epoch": 0.13446707534111133, "step": 680, "train/total_loss": 0.04012121632695198 }, { "entropy": 5.910550117492676, "epoch": 0.13466482104014238, "mean_token_accuracy": 0.7567370533943176, "num_tokens": 30746556.0, "step": 681, "train/ce_loss": 1.8076609373092651 }, { "epoch": 0.13466482104014238, "step": 681, "train/sim_loss": 0.0006428360939025879 }, { "epoch": 0.13466482104014238, "step": 681, "train/total_loss": 0.18140892684459686 }, { "entropy": 6.329290390014648, "epoch": 0.13486256673917343, "mean_token_accuracy": 0.7456250190734863, "num_tokens": 30798055.0, "step": 682, "train/ce_loss": 0.5806108117103577 }, { "epoch": 0.13486256673917343, "step": 682, "train/sim_loss": 0.0005775094032287598 }, { "epoch": 0.13486256673917343, "step": 682, "train/total_loss": 0.058638591319322586 }, { "entropy": 5.773369789123535, "epoch": 0.13506031243820446, "mean_token_accuracy": 0.7305629849433899, "num_tokens": 30826022.0, "step": 683, "train/ce_loss": 1.5867148637771606 }, { "epoch": 0.13506031243820446, "step": 683, "train/sim_loss": 0.0008674860000610352 }, { "epoch": 0.13506031243820446, "step": 683, "train/total_loss": 0.15953896939754486 }, { "entropy": 6.180400848388672, "epoch": 0.13525805813723552, "mean_token_accuracy": 0.7767475247383118, "num_tokens": 30876388.0, "step": 684, "train/ce_loss": 1.07284677028656 }, { "epoch": 0.13525805813723552, "step": 684, "train/sim_loss": 0.00077056884765625 }, { "epoch": 0.13525805813723552, "step": 684, "train/total_loss": 0.1080552488565445 }, { "entropy": 6.345798492431641, "epoch": 0.13545580383626657, "mean_token_accuracy": 0.7222653031349182, "num_tokens": 30916068.0, "step": 685, "train/ce_loss": 4.0786366298561916e-05 }, { "epoch": 0.13545580383626657, "step": 685, "train/sim_loss": 0.0004131197929382324 }, { "epoch": 0.13545580383626657, "step": 685, "train/total_loss": 0.00041719843284226954 }, { "entropy": 5.7451348304748535, "epoch": 0.1356535495352976, "mean_token_accuracy": 0.7398884296417236, "num_tokens": 30968774.0, "step": 686, "train/ce_loss": 0.41754600405693054 }, { "epoch": 0.1356535495352976, "step": 686, "train/sim_loss": 0.0007041096687316895 }, { "epoch": 0.1356535495352976, "step": 686, "train/total_loss": 0.042458709329366684 }, { "entropy": 6.1932373046875, "epoch": 0.13585129523432865, "mean_token_accuracy": 0.71875, "num_tokens": 31015548.0, "step": 687, "train/ce_loss": 1.065834879875183 }, { "epoch": 0.13585129523432865, "step": 687, "train/sim_loss": 0.0008228421211242676 }, { "epoch": 0.13585129523432865, "step": 687, "train/total_loss": 0.10740633308887482 }, { "entropy": 6.070703506469727, "epoch": 0.1360490409333597, "mean_token_accuracy": 0.6784660816192627, "num_tokens": 31068193.0, "step": 688, "train/ce_loss": 1.2736165523529053 }, { "epoch": 0.1360490409333597, "step": 688, "train/sim_loss": 0.0007831454277038574 }, { "epoch": 0.1360490409333597, "step": 688, "train/total_loss": 0.12814480066299438 }, { "entropy": 5.592737197875977, "epoch": 0.13624678663239073, "mean_token_accuracy": 0.7121661901473999, "num_tokens": 31096695.0, "step": 689, "train/ce_loss": 3.57753669959493e-05 }, { "epoch": 0.13624678663239073, "step": 689, "train/sim_loss": 0.0004055500030517578 }, { "epoch": 0.13624678663239073, "step": 689, "train/total_loss": 0.0004091275332029909 }, { "entropy": 6.19500732421875, "epoch": 0.1364445323314218, "mean_token_accuracy": 0.7396088242530823, "num_tokens": 31163757.0, "step": 690, "train/ce_loss": 0.7207539081573486 }, { "epoch": 0.1364445323314218, "step": 690, "train/sim_loss": 0.00038909912109375 }, { "epoch": 0.1364445323314218, "step": 690, "train/total_loss": 0.0724644884467125 }, { "entropy": 5.935831069946289, "epoch": 0.13664227803045284, "mean_token_accuracy": 0.7315484881401062, "num_tokens": 31199672.0, "step": 691, "train/ce_loss": 0.7263574004173279 }, { "epoch": 0.13664227803045284, "step": 691, "train/sim_loss": 0.000958561897277832 }, { "epoch": 0.13664227803045284, "step": 691, "train/total_loss": 0.07359430193901062 }, { "entropy": 5.733369827270508, "epoch": 0.1368400237294839, "mean_token_accuracy": 0.7704455852508545, "num_tokens": 31231448.0, "step": 692, "train/ce_loss": 1.227258563041687 }, { "epoch": 0.1368400237294839, "step": 692, "train/sim_loss": 0.001041114330291748 }, { "epoch": 0.1368400237294839, "step": 692, "train/total_loss": 0.12376697361469269 }, { "entropy": 6.060691833496094, "epoch": 0.13703776942851492, "mean_token_accuracy": 0.7437137365341187, "num_tokens": 31277360.0, "step": 693, "train/ce_loss": 4.265638563083485e-05 }, { "epoch": 0.13703776942851492, "step": 693, "train/sim_loss": 0.000894010066986084 }, { "epoch": 0.13703776942851492, "step": 693, "train/total_loss": 0.0008982756990008056 }, { "entropy": 6.047507286071777, "epoch": 0.13723551512754598, "mean_token_accuracy": 0.7446945309638977, "num_tokens": 31315229.0, "step": 694, "train/ce_loss": 1.249376893043518 }, { "epoch": 0.13723551512754598, "step": 694, "train/sim_loss": 0.0011130571365356445 }, { "epoch": 0.13723551512754598, "step": 694, "train/total_loss": 0.12605074048042297 }, { "entropy": 6.26873779296875, "epoch": 0.13743326082657703, "mean_token_accuracy": 0.6860395669937134, "num_tokens": 31353871.0, "step": 695, "train/ce_loss": 1.3255025148391724 }, { "epoch": 0.13743326082657703, "step": 695, "train/sim_loss": 0.0007224082946777344 }, { "epoch": 0.13743326082657703, "step": 695, "train/total_loss": 0.1332726627588272 }, { "entropy": 6.002578258514404, "epoch": 0.13763100652560806, "mean_token_accuracy": 0.7297857403755188, "num_tokens": 31392104.0, "step": 696, "train/ce_loss": 0.713679313659668 }, { "epoch": 0.13763100652560806, "step": 696, "train/sim_loss": 0.0010158419609069824 }, { "epoch": 0.13763100652560806, "step": 696, "train/total_loss": 0.07238377630710602 }, { "entropy": 6.107769966125488, "epoch": 0.1378287522246391, "mean_token_accuracy": 0.718299150466919, "num_tokens": 31428611.0, "step": 697, "train/ce_loss": 1.565159559249878 }, { "epoch": 0.1378287522246391, "step": 697, "train/sim_loss": 0.000714421272277832 }, { "epoch": 0.1378287522246391, "step": 697, "train/total_loss": 0.15723037719726562 }, { "entropy": 6.153082370758057, "epoch": 0.13802649792367017, "mean_token_accuracy": 0.7036734819412231, "num_tokens": 31478868.0, "step": 698, "train/ce_loss": 1.2591923475265503 }, { "epoch": 0.13802649792367017, "step": 698, "train/sim_loss": 0.0008510351181030273 }, { "epoch": 0.13802649792367017, "step": 698, "train/total_loss": 0.1267702728509903 }, { "entropy": 6.093291282653809, "epoch": 0.1382242436227012, "mean_token_accuracy": 0.7334167957305908, "num_tokens": 31527463.0, "step": 699, "train/ce_loss": 0.7204099297523499 }, { "epoch": 0.1382242436227012, "step": 699, "train/sim_loss": 0.0007330179214477539 }, { "epoch": 0.1382242436227012, "step": 699, "train/total_loss": 0.0727740153670311 }, { "epoch": 0.13842198932173225, "grad_norm": 0.4504731297492981, "learning_rate": 9.659214561282026e-06, "loss": 0.0882, "step": 700 }, { "entropy": 6.046587944030762, "epoch": 0.13842198932173225, "mean_token_accuracy": 0.7513340711593628, "num_tokens": 31560203.0, "step": 700, "train/ce_loss": 1.379116415977478 }, { "epoch": 0.13842198932173225, "step": 700, "train/sim_loss": 0.0007834434509277344 }, { "epoch": 0.13842198932173225, "step": 700, "train/total_loss": 0.13869509100914001 }, { "entropy": 6.281664848327637, "epoch": 0.1386197350207633, "mean_token_accuracy": 0.6857541799545288, "num_tokens": 31617436.0, "step": 701, "train/ce_loss": 0.7510918974876404 }, { "epoch": 0.1386197350207633, "step": 701, "train/sim_loss": 0.0009597539901733398 }, { "epoch": 0.1386197350207633, "step": 701, "train/total_loss": 0.0760689452290535 }, { "entropy": 6.157711029052734, "epoch": 0.13881748071979436, "mean_token_accuracy": 0.7136363387107849, "num_tokens": 31656438.0, "step": 702, "train/ce_loss": 1.3220241069793701 }, { "epoch": 0.13881748071979436, "step": 702, "train/sim_loss": 0.0006500482559204102 }, { "epoch": 0.13881748071979436, "step": 702, "train/total_loss": 0.1328524649143219 }, { "entropy": 5.7258734703063965, "epoch": 0.13901522641882538, "mean_token_accuracy": 0.7597617506980896, "num_tokens": 31684138.0, "step": 703, "train/ce_loss": 0.9069976210594177 }, { "epoch": 0.13901522641882538, "step": 703, "train/sim_loss": 0.0007346868515014648 }, { "epoch": 0.13901522641882538, "step": 703, "train/total_loss": 0.09143444895744324 }, { "entropy": 6.295951843261719, "epoch": 0.13921297211785644, "mean_token_accuracy": 0.6741753816604614, "num_tokens": 31746605.0, "step": 704, "train/ce_loss": 1.3952454328536987 }, { "epoch": 0.13921297211785644, "step": 704, "train/sim_loss": 0.0008940696716308594 }, { "epoch": 0.13921297211785644, "step": 704, "train/total_loss": 0.1404186189174652 }, { "entropy": 5.348919868469238, "epoch": 0.1394107178168875, "mean_token_accuracy": 0.7763941287994385, "num_tokens": 31770304.0, "step": 705, "train/ce_loss": 0.5714664459228516 }, { "epoch": 0.1394107178168875, "step": 705, "train/sim_loss": 0.0004017353057861328 }, { "epoch": 0.1394107178168875, "step": 705, "train/total_loss": 0.05754838138818741 }, { "entropy": 5.997974395751953, "epoch": 0.13960846351591852, "mean_token_accuracy": 0.7185128927230835, "num_tokens": 31815145.0, "step": 706, "train/ce_loss": 1.6170854568481445 }, { "epoch": 0.13960846351591852, "step": 706, "train/sim_loss": 0.000601649284362793 }, { "epoch": 0.13960846351591852, "step": 706, "train/total_loss": 0.16231019794940948 }, { "entropy": 5.863480567932129, "epoch": 0.13980620921494957, "mean_token_accuracy": 0.7472903728485107, "num_tokens": 31858552.0, "step": 707, "train/ce_loss": 1.3668495416641235 }, { "epoch": 0.13980620921494957, "step": 707, "train/sim_loss": 0.0006902217864990234 }, { "epoch": 0.13980620921494957, "step": 707, "train/total_loss": 0.13737517595291138 }, { "entropy": 6.02211856842041, "epoch": 0.14000395491398063, "mean_token_accuracy": 0.7336292862892151, "num_tokens": 31920010.0, "step": 708, "train/ce_loss": 1.0591322183609009 }, { "epoch": 0.14000395491398063, "step": 708, "train/sim_loss": 0.0006671547889709473 }, { "epoch": 0.14000395491398063, "step": 708, "train/total_loss": 0.10658037662506104 }, { "entropy": 5.834864616394043, "epoch": 0.14020170061301165, "mean_token_accuracy": 0.7911671996116638, "num_tokens": 31964695.0, "step": 709, "train/ce_loss": 0.7407451272010803 }, { "epoch": 0.14020170061301165, "step": 709, "train/sim_loss": 0.0006744861602783203 }, { "epoch": 0.14020170061301165, "step": 709, "train/total_loss": 0.07474900037050247 }, { "entropy": 5.750908851623535, "epoch": 0.1403994463120427, "mean_token_accuracy": 0.7739071249961853, "num_tokens": 31995968.0, "step": 710, "train/ce_loss": 0.9534330368041992 }, { "epoch": 0.1403994463120427, "step": 710, "train/sim_loss": 0.0006870031356811523 }, { "epoch": 0.1403994463120427, "step": 710, "train/total_loss": 0.09603030979633331 }, { "entropy": 6.03861141204834, "epoch": 0.14059719201107376, "mean_token_accuracy": 0.7707865238189697, "num_tokens": 32048225.0, "step": 711, "train/ce_loss": 0.8444477915763855 }, { "epoch": 0.14059719201107376, "step": 711, "train/sim_loss": 0.0005996227264404297 }, { "epoch": 0.14059719201107376, "step": 711, "train/total_loss": 0.08504440635442734 }, { "entropy": 6.2528791427612305, "epoch": 0.14079493771010482, "mean_token_accuracy": 0.7300122976303101, "num_tokens": 32096301.0, "step": 712, "train/ce_loss": 1.235609531402588 }, { "epoch": 0.14079493771010482, "step": 712, "train/sim_loss": 0.0005751848220825195 }, { "epoch": 0.14079493771010482, "step": 712, "train/total_loss": 0.12413614243268967 }, { "entropy": 6.059607028961182, "epoch": 0.14099268340913584, "mean_token_accuracy": 0.7098727822303772, "num_tokens": 32144282.0, "step": 713, "train/ce_loss": 1.403409481048584 }, { "epoch": 0.14099268340913584, "step": 713, "train/sim_loss": 0.0006203651428222656 }, { "epoch": 0.14099268340913584, "step": 713, "train/total_loss": 0.14096131920814514 }, { "entropy": 6.17418098449707, "epoch": 0.1411904291081669, "mean_token_accuracy": 0.7330357432365417, "num_tokens": 32184647.0, "step": 714, "train/ce_loss": 1.6752488613128662 }, { "epoch": 0.1411904291081669, "step": 714, "train/sim_loss": 0.0005712509155273438 }, { "epoch": 0.1411904291081669, "step": 714, "train/total_loss": 0.1680961400270462 }, { "entropy": 5.903757095336914, "epoch": 0.14138817480719795, "mean_token_accuracy": 0.7401162981987, "num_tokens": 32225076.0, "step": 715, "train/ce_loss": 0.6776177883148193 }, { "epoch": 0.14138817480719795, "step": 715, "train/sim_loss": 0.0004767179489135742 }, { "epoch": 0.14138817480719795, "step": 715, "train/total_loss": 0.06823849678039551 }, { "entropy": 6.100008964538574, "epoch": 0.14158592050622898, "mean_token_accuracy": 0.7437461614608765, "num_tokens": 32257936.0, "step": 716, "train/ce_loss": 0.8269609808921814 }, { "epoch": 0.14158592050622898, "step": 716, "train/sim_loss": 0.0005573034286499023 }, { "epoch": 0.14158592050622898, "step": 716, "train/total_loss": 0.0832534059882164 }, { "entropy": 5.991170883178711, "epoch": 0.14178366620526003, "mean_token_accuracy": 0.7524116039276123, "num_tokens": 32311605.0, "step": 717, "train/ce_loss": 0.6601826548576355 }, { "epoch": 0.14178366620526003, "step": 717, "train/sim_loss": 0.0004388093948364258 }, { "epoch": 0.14178366620526003, "step": 717, "train/total_loss": 0.06645707786083221 }, { "entropy": 6.18347692489624, "epoch": 0.1419814119042911, "mean_token_accuracy": 0.7178118824958801, "num_tokens": 32365556.0, "step": 718, "train/ce_loss": 0.6420957446098328 }, { "epoch": 0.1419814119042911, "step": 718, "train/sim_loss": 0.0006856918334960938 }, { "epoch": 0.1419814119042911, "step": 718, "train/total_loss": 0.06489526480436325 }, { "entropy": 5.873352527618408, "epoch": 0.14217915760332211, "mean_token_accuracy": 0.7496412992477417, "num_tokens": 32405909.0, "step": 719, "train/ce_loss": 1.1812493801116943 }, { "epoch": 0.14217915760332211, "step": 719, "train/sim_loss": 0.0006456375122070312 }, { "epoch": 0.14217915760332211, "step": 719, "train/total_loss": 0.11877057701349258 }, { "epoch": 0.14237690330235317, "grad_norm": 0.48519372940063477, "learning_rate": 9.649322385992681e-06, "loss": 0.084, "step": 720 }, { "entropy": 5.831648826599121, "epoch": 0.14237690330235317, "mean_token_accuracy": 0.754182755947113, "num_tokens": 32436279.0, "step": 720, "train/ce_loss": 1.2216434478759766 }, { "epoch": 0.14237690330235317, "step": 720, "train/sim_loss": 0.0007121562957763672 }, { "epoch": 0.14237690330235317, "step": 720, "train/total_loss": 0.12287650257349014 }, { "entropy": 6.160427093505859, "epoch": 0.14257464900138422, "mean_token_accuracy": 0.7222222089767456, "num_tokens": 32475281.0, "step": 721, "train/ce_loss": 0.5460183620452881 }, { "epoch": 0.14257464900138422, "step": 721, "train/sim_loss": 0.0008324384689331055 }, { "epoch": 0.14257464900138422, "step": 721, "train/total_loss": 0.055434275418519974 }, { "entropy": 6.039376258850098, "epoch": 0.14277239470041528, "mean_token_accuracy": 0.7141905426979065, "num_tokens": 32525213.0, "step": 722, "train/ce_loss": 1.4990922212600708 }, { "epoch": 0.14277239470041528, "step": 722, "train/sim_loss": 0.000691533088684082 }, { "epoch": 0.14277239470041528, "step": 722, "train/total_loss": 0.15060076117515564 }, { "entropy": 6.0113325119018555, "epoch": 0.1429701403994463, "mean_token_accuracy": 0.7737594842910767, "num_tokens": 32579038.0, "step": 723, "train/ce_loss": 1.159472942352295 }, { "epoch": 0.1429701403994463, "step": 723, "train/sim_loss": 0.0007061362266540527 }, { "epoch": 0.1429701403994463, "step": 723, "train/total_loss": 0.1166534349322319 }, { "entropy": 5.932990074157715, "epoch": 0.14316788609847736, "mean_token_accuracy": 0.7091866731643677, "num_tokens": 32624143.0, "step": 724, "train/ce_loss": 1.295810580253601 }, { "epoch": 0.14316788609847736, "step": 724, "train/sim_loss": 0.0005172491073608398 }, { "epoch": 0.14316788609847736, "step": 724, "train/total_loss": 0.13009831309318542 }, { "entropy": 6.0218706130981445, "epoch": 0.1433656317975084, "mean_token_accuracy": 0.7509627938270569, "num_tokens": 32653970.0, "step": 725, "train/ce_loss": 0.8803297281265259 }, { "epoch": 0.1433656317975084, "step": 725, "train/sim_loss": 0.0006763339042663574 }, { "epoch": 0.1433656317975084, "step": 725, "train/total_loss": 0.08870930969715118 }, { "entropy": 5.63425350189209, "epoch": 0.14356337749653944, "mean_token_accuracy": 0.7155476808547974, "num_tokens": 32700212.0, "step": 726, "train/ce_loss": 1.343698501586914 }, { "epoch": 0.14356337749653944, "step": 726, "train/sim_loss": 0.0006854534149169922 }, { "epoch": 0.14356337749653944, "step": 726, "train/total_loss": 0.1350553035736084 }, { "entropy": 6.26544189453125, "epoch": 0.1437611231955705, "mean_token_accuracy": 0.7293606400489807, "num_tokens": 32748768.0, "step": 727, "train/ce_loss": 0.5677874684333801 }, { "epoch": 0.1437611231955705, "step": 727, "train/sim_loss": 0.0004892349243164062 }, { "epoch": 0.1437611231955705, "step": 727, "train/total_loss": 0.05726798251271248 }, { "entropy": 5.97520637512207, "epoch": 0.14395886889460155, "mean_token_accuracy": 0.7523961663246155, "num_tokens": 32797810.0, "step": 728, "train/ce_loss": 3.527695662342012e-05 }, { "epoch": 0.14395886889460155, "step": 728, "train/sim_loss": 0.0007425546646118164 }, { "epoch": 0.14395886889460155, "step": 728, "train/total_loss": 0.0007460823399014771 }, { "entropy": 5.722914695739746, "epoch": 0.14415661459363258, "mean_token_accuracy": 0.7763794660568237, "num_tokens": 32839352.0, "step": 729, "train/ce_loss": 4.563731999951415e-05 }, { "epoch": 0.14415661459363258, "step": 729, "train/sim_loss": 0.0005604028701782227 }, { "epoch": 0.14415661459363258, "step": 729, "train/total_loss": 0.0005649665836244822 }, { "entropy": 6.058287620544434, "epoch": 0.14435436029266363, "mean_token_accuracy": 0.7021709680557251, "num_tokens": 32906164.0, "step": 730, "train/ce_loss": 0.9201800227165222 }, { "epoch": 0.14435436029266363, "step": 730, "train/sim_loss": 0.0004374384880065918 }, { "epoch": 0.14435436029266363, "step": 730, "train/total_loss": 0.0924554392695427 }, { "entropy": 6.204686641693115, "epoch": 0.14455210599169468, "mean_token_accuracy": 0.7571906447410583, "num_tokens": 32957592.0, "step": 731, "train/ce_loss": 0.6215150952339172 }, { "epoch": 0.14455210599169468, "step": 731, "train/sim_loss": 0.000523686408996582 }, { "epoch": 0.14455210599169468, "step": 731, "train/total_loss": 0.06267519295215607 }, { "entropy": 6.13161039352417, "epoch": 0.14474985169072574, "mean_token_accuracy": 0.7216828465461731, "num_tokens": 33002311.0, "step": 732, "train/ce_loss": 0.7559165954589844 }, { "epoch": 0.14474985169072574, "step": 732, "train/sim_loss": 0.0008817911148071289 }, { "epoch": 0.14474985169072574, "step": 732, "train/total_loss": 0.07647345215082169 }, { "entropy": 6.210601806640625, "epoch": 0.14494759738975677, "mean_token_accuracy": 0.7308282256126404, "num_tokens": 33064840.0, "step": 733, "train/ce_loss": 0.9161707162857056 }, { "epoch": 0.14494759738975677, "step": 733, "train/sim_loss": 0.0003381967544555664 }, { "epoch": 0.14494759738975677, "step": 733, "train/total_loss": 0.09195526689291 }, { "entropy": 6.355133533477783, "epoch": 0.14514534308878782, "mean_token_accuracy": 0.6993180513381958, "num_tokens": 33112297.0, "step": 734, "train/ce_loss": 1.594114065170288 }, { "epoch": 0.14514534308878782, "step": 734, "train/sim_loss": 0.0010364055633544922 }, { "epoch": 0.14514534308878782, "step": 734, "train/total_loss": 0.16044782102108002 }, { "entropy": 5.707343101501465, "epoch": 0.14534308878781887, "mean_token_accuracy": 0.7393283843994141, "num_tokens": 33143605.0, "step": 735, "train/ce_loss": 0.9844399094581604 }, { "epoch": 0.14534308878781887, "step": 735, "train/sim_loss": 0.00039523839950561523 }, { "epoch": 0.14534308878781887, "step": 735, "train/total_loss": 0.09883923083543777 }, { "entropy": 6.507383823394775, "epoch": 0.1455408344868499, "mean_token_accuracy": 0.7348423600196838, "num_tokens": 33190125.0, "step": 736, "train/ce_loss": 1.8183165788650513 }, { "epoch": 0.1455408344868499, "step": 736, "train/sim_loss": 0.00042235851287841797 }, { "epoch": 0.1455408344868499, "step": 736, "train/total_loss": 0.18225401639938354 }, { "entropy": 6.10075044631958, "epoch": 0.14573858018588096, "mean_token_accuracy": 0.774193525314331, "num_tokens": 33240964.0, "step": 737, "train/ce_loss": 3.8869835407240316e-05 }, { "epoch": 0.14573858018588096, "step": 737, "train/sim_loss": 0.0006636977195739746 }, { "epoch": 0.14573858018588096, "step": 737, "train/total_loss": 0.0006675847107544541 }, { "entropy": 6.510662078857422, "epoch": 0.145936325884912, "mean_token_accuracy": 0.7103244662284851, "num_tokens": 33274594.0, "step": 738, "train/ce_loss": 1.4206814765930176 }, { "epoch": 0.145936325884912, "step": 738, "train/sim_loss": 0.0006055831909179688 }, { "epoch": 0.145936325884912, "step": 738, "train/total_loss": 0.14267373085021973 }, { "entropy": 6.229083061218262, "epoch": 0.14613407158394304, "mean_token_accuracy": 0.6993540525436401, "num_tokens": 33320558.0, "step": 739, "train/ce_loss": 0.5512683391571045 }, { "epoch": 0.14613407158394304, "step": 739, "train/sim_loss": 0.0006592273712158203 }, { "epoch": 0.14613407158394304, "step": 739, "train/total_loss": 0.05578606203198433 }, { "epoch": 0.1463318172829741, "grad_norm": 0.49461597204208374, "learning_rate": 9.639430210703335e-06, "loss": 0.0856, "step": 740 }, { "entropy": 6.374995231628418, "epoch": 0.1463318172829741, "mean_token_accuracy": 0.7316076159477234, "num_tokens": 33377307.0, "step": 740, "train/ce_loss": 1.501476526260376 }, { "epoch": 0.1463318172829741, "step": 740, "train/sim_loss": 0.0006122589111328125 }, { "epoch": 0.1463318172829741, "step": 740, "train/total_loss": 0.15075992047786713 }, { "entropy": 6.279999256134033, "epoch": 0.14652956298200515, "mean_token_accuracy": 0.7415914535522461, "num_tokens": 33428391.0, "step": 741, "train/ce_loss": 1.717004418373108 }, { "epoch": 0.14652956298200515, "step": 741, "train/sim_loss": 0.0007467269897460938 }, { "epoch": 0.14652956298200515, "step": 741, "train/total_loss": 0.17244717478752136 }, { "entropy": 6.060567855834961, "epoch": 0.1467273086810362, "mean_token_accuracy": 0.760127067565918, "num_tokens": 33460027.0, "step": 742, "train/ce_loss": 1.6731353998184204 }, { "epoch": 0.1467273086810362, "step": 742, "train/sim_loss": 0.0006374120712280273 }, { "epoch": 0.1467273086810362, "step": 742, "train/total_loss": 0.16795095801353455 }, { "entropy": 6.0965046882629395, "epoch": 0.14692505438006723, "mean_token_accuracy": 0.7474371194839478, "num_tokens": 33514317.0, "step": 743, "train/ce_loss": 4.245695163263008e-05 }, { "epoch": 0.14692505438006723, "step": 743, "train/sim_loss": 0.0008668899536132812 }, { "epoch": 0.14692505438006723, "step": 743, "train/total_loss": 0.0008711356204003096 }, { "entropy": 6.12230110168457, "epoch": 0.14712280007909828, "mean_token_accuracy": 0.7373737096786499, "num_tokens": 33571807.0, "step": 744, "train/ce_loss": 0.6823589205741882 }, { "epoch": 0.14712280007909828, "step": 744, "train/sim_loss": 0.0004271864891052246 }, { "epoch": 0.14712280007909828, "step": 744, "train/total_loss": 0.0686630830168724 }, { "entropy": 6.031367301940918, "epoch": 0.14732054577812934, "mean_token_accuracy": 0.732087254524231, "num_tokens": 33615702.0, "step": 745, "train/ce_loss": 1.614130973815918 }, { "epoch": 0.14732054577812934, "step": 745, "train/sim_loss": 0.00038236379623413086 }, { "epoch": 0.14732054577812934, "step": 745, "train/total_loss": 0.1617954671382904 }, { "entropy": 5.896158695220947, "epoch": 0.14751829147716036, "mean_token_accuracy": 0.7440310716629028, "num_tokens": 33652287.0, "step": 746, "train/ce_loss": 0.7640425562858582 }, { "epoch": 0.14751829147716036, "step": 746, "train/sim_loss": 0.00046509504318237305 }, { "epoch": 0.14751829147716036, "step": 746, "train/total_loss": 0.07686935365200043 }, { "entropy": 5.848008155822754, "epoch": 0.14771603717619142, "mean_token_accuracy": 0.7485254406929016, "num_tokens": 33687130.0, "step": 747, "train/ce_loss": 1.2946670055389404 }, { "epoch": 0.14771603717619142, "step": 747, "train/sim_loss": 0.000528872013092041 }, { "epoch": 0.14771603717619142, "step": 747, "train/total_loss": 0.12999556958675385 }, { "entropy": 6.412802696228027, "epoch": 0.14791378287522247, "mean_token_accuracy": 0.7773279547691345, "num_tokens": 33730052.0, "step": 748, "train/ce_loss": 1.2585899829864502 }, { "epoch": 0.14791378287522247, "step": 748, "train/sim_loss": 0.000408172607421875 }, { "epoch": 0.14791378287522247, "step": 748, "train/total_loss": 0.1262671798467636 }, { "entropy": 6.520142555236816, "epoch": 0.1481115285742535, "mean_token_accuracy": 0.7015341520309448, "num_tokens": 33777295.0, "step": 749, "train/ce_loss": 1.2205653190612793 }, { "epoch": 0.1481115285742535, "step": 749, "train/sim_loss": 0.0007026791572570801 }, { "epoch": 0.1481115285742535, "step": 749, "train/total_loss": 0.12275921553373337 }, { "entropy": 6.229394435882568, "epoch": 0.14830927427328455, "mean_token_accuracy": 0.70888352394104, "num_tokens": 33822846.0, "step": 750, "train/ce_loss": 0.981556236743927 }, { "epoch": 0.14830927427328455, "step": 750, "train/sim_loss": 0.0004981756210327148 }, { "epoch": 0.14830927427328455, "step": 750, "train/total_loss": 0.09865380078554153 }, { "entropy": 5.982748985290527, "epoch": 0.1485070199723156, "mean_token_accuracy": 0.7230662703514099, "num_tokens": 33870125.0, "step": 751, "train/ce_loss": 1.9742752313613892 }, { "epoch": 0.1485070199723156, "step": 751, "train/sim_loss": 0.0005538463592529297 }, { "epoch": 0.1485070199723156, "step": 751, "train/total_loss": 0.19798137247562408 }, { "entropy": 5.756265163421631, "epoch": 0.14870476567134666, "mean_token_accuracy": 0.7548661828041077, "num_tokens": 33906166.0, "step": 752, "train/ce_loss": 0.8023564219474792 }, { "epoch": 0.14870476567134666, "step": 752, "train/sim_loss": 0.000459134578704834 }, { "epoch": 0.14870476567134666, "step": 752, "train/total_loss": 0.080694779753685 }, { "entropy": 6.303708076477051, "epoch": 0.1489025113703777, "mean_token_accuracy": 0.7541729807853699, "num_tokens": 33955559.0, "step": 753, "train/ce_loss": 1.4032233953475952 }, { "epoch": 0.1489025113703777, "step": 753, "train/sim_loss": 0.0007995963096618652 }, { "epoch": 0.1489025113703777, "step": 753, "train/total_loss": 0.14112193882465363 }, { "entropy": 5.829625129699707, "epoch": 0.14910025706940874, "mean_token_accuracy": 0.7309185266494751, "num_tokens": 33995630.0, "step": 754, "train/ce_loss": 2.0085930824279785 }, { "epoch": 0.14910025706940874, "step": 754, "train/sim_loss": 0.0005683302879333496 }, { "epoch": 0.14910025706940874, "step": 754, "train/total_loss": 0.2014276385307312 }, { "entropy": 6.611815452575684, "epoch": 0.1492980027684398, "mean_token_accuracy": 0.7684478163719177, "num_tokens": 34048454.0, "step": 755, "train/ce_loss": 1.0315526723861694 }, { "epoch": 0.1492980027684398, "step": 755, "train/sim_loss": 0.000591278076171875 }, { "epoch": 0.1492980027684398, "step": 755, "train/total_loss": 0.10374654829502106 }, { "entropy": 6.025767803192139, "epoch": 0.14949574846747082, "mean_token_accuracy": 0.7703281044960022, "num_tokens": 34095801.0, "step": 756, "train/ce_loss": 0.8362480998039246 }, { "epoch": 0.14949574846747082, "step": 756, "train/sim_loss": 0.0008872747421264648 }, { "epoch": 0.14949574846747082, "step": 756, "train/total_loss": 0.08451208472251892 }, { "entropy": 6.137965679168701, "epoch": 0.14969349416650188, "mean_token_accuracy": 0.6931506991386414, "num_tokens": 34140086.0, "step": 757, "train/ce_loss": 0.9251447319984436 }, { "epoch": 0.14969349416650188, "step": 757, "train/sim_loss": 0.0007396340370178223 }, { "epoch": 0.14969349416650188, "step": 757, "train/total_loss": 0.09325411170721054 }, { "entropy": 6.259452819824219, "epoch": 0.14989123986553293, "mean_token_accuracy": 0.7591522336006165, "num_tokens": 34210950.0, "step": 758, "train/ce_loss": 0.6991536617279053 }, { "epoch": 0.14989123986553293, "step": 758, "train/sim_loss": 0.0006464123725891113 }, { "epoch": 0.14989123986553293, "step": 758, "train/total_loss": 0.07056178152561188 }, { "entropy": 6.661359786987305, "epoch": 0.15008898556456396, "mean_token_accuracy": 0.7376237511634827, "num_tokens": 34265581.0, "step": 759, "train/ce_loss": 0.5623676180839539 }, { "epoch": 0.15008898556456396, "step": 759, "train/sim_loss": 0.0006248950958251953 }, { "epoch": 0.15008898556456396, "step": 759, "train/total_loss": 0.05686165764927864 }, { "epoch": 0.150286731263595, "grad_norm": 0.4608074724674225, "learning_rate": 9.629538035413987e-06, "loss": 0.0846, "step": 760 }, { "entropy": 5.99024772644043, "epoch": 0.150286731263595, "mean_token_accuracy": 0.718497097492218, "num_tokens": 34324633.0, "step": 760, "train/ce_loss": 0.7353611588478088 }, { "epoch": 0.150286731263595, "step": 760, "train/sim_loss": 0.0005794763565063477 }, { "epoch": 0.150286731263595, "step": 760, "train/total_loss": 0.07411559671163559 }, { "entropy": 6.132070064544678, "epoch": 0.15048447696262607, "mean_token_accuracy": 0.706403911113739, "num_tokens": 34368221.0, "step": 761, "train/ce_loss": 1.3388326168060303 }, { "epoch": 0.15048447696262607, "step": 761, "train/sim_loss": 0.00043070316314697266 }, { "epoch": 0.15048447696262607, "step": 761, "train/total_loss": 0.13431397080421448 }, { "entropy": 6.42775297164917, "epoch": 0.15068222266165712, "mean_token_accuracy": 0.707115650177002, "num_tokens": 34417725.0, "step": 762, "train/ce_loss": 0.9217014908790588 }, { "epoch": 0.15068222266165712, "step": 762, "train/sim_loss": 0.0005899667739868164 }, { "epoch": 0.15068222266165712, "step": 762, "train/total_loss": 0.0927601158618927 }, { "entropy": 6.1792802810668945, "epoch": 0.15087996836068815, "mean_token_accuracy": 0.7744593024253845, "num_tokens": 34449982.0, "step": 763, "train/ce_loss": 0.8618647456169128 }, { "epoch": 0.15087996836068815, "step": 763, "train/sim_loss": 0.0003795623779296875 }, { "epoch": 0.15087996836068815, "step": 763, "train/total_loss": 0.08656603842973709 }, { "entropy": 6.229071617126465, "epoch": 0.1510777140597192, "mean_token_accuracy": 0.6895284652709961, "num_tokens": 34494259.0, "step": 764, "train/ce_loss": 0.7572550177574158 }, { "epoch": 0.1510777140597192, "step": 764, "train/sim_loss": 0.0009020566940307617 }, { "epoch": 0.1510777140597192, "step": 764, "train/total_loss": 0.07662755995988846 }, { "entropy": 6.086216926574707, "epoch": 0.15127545975875026, "mean_token_accuracy": 0.7234656810760498, "num_tokens": 34528923.0, "step": 765, "train/ce_loss": 1.1789156198501587 }, { "epoch": 0.15127545975875026, "step": 765, "train/sim_loss": 0.0005021095275878906 }, { "epoch": 0.15127545975875026, "step": 765, "train/total_loss": 0.118393674492836 }, { "entropy": 6.110109329223633, "epoch": 0.15147320545778128, "mean_token_accuracy": 0.729522705078125, "num_tokens": 34561530.0, "step": 766, "train/ce_loss": 1.031272053718567 }, { "epoch": 0.15147320545778128, "step": 766, "train/sim_loss": 0.0007088184356689453 }, { "epoch": 0.15147320545778128, "step": 766, "train/total_loss": 0.10383602231740952 }, { "entropy": 6.414558410644531, "epoch": 0.15167095115681234, "mean_token_accuracy": 0.7561436891555786, "num_tokens": 34609631.0, "step": 767, "train/ce_loss": 3.5446220863377675e-05 }, { "epoch": 0.15167095115681234, "step": 767, "train/sim_loss": 0.000560462474822998 }, { "epoch": 0.15167095115681234, "step": 767, "train/total_loss": 0.0005640070885419846 }, { "entropy": 6.092993259429932, "epoch": 0.1518686968558434, "mean_token_accuracy": 0.728646993637085, "num_tokens": 34652535.0, "step": 768, "train/ce_loss": 3.09530551021453e-05 }, { "epoch": 0.1518686968558434, "step": 768, "train/sim_loss": 0.0006473660469055176 }, { "epoch": 0.1518686968558434, "step": 768, "train/total_loss": 0.000650461355689913 }, { "entropy": 6.042799949645996, "epoch": 0.15206644255487442, "mean_token_accuracy": 0.7326362729072571, "num_tokens": 34714220.0, "step": 769, "train/ce_loss": 0.5616521835327148 }, { "epoch": 0.15206644255487442, "step": 769, "train/sim_loss": 0.0003840923309326172 }, { "epoch": 0.15206644255487442, "step": 769, "train/total_loss": 0.0565493106842041 }, { "entropy": 6.104544639587402, "epoch": 0.15226418825390547, "mean_token_accuracy": 0.7774193286895752, "num_tokens": 34744367.0, "step": 770, "train/ce_loss": 0.7824622392654419 }, { "epoch": 0.15226418825390547, "step": 770, "train/sim_loss": 0.0005940794944763184 }, { "epoch": 0.15226418825390547, "step": 770, "train/total_loss": 0.07884030789136887 }, { "entropy": 6.117010593414307, "epoch": 0.15246193395293653, "mean_token_accuracy": 0.7243688702583313, "num_tokens": 34804019.0, "step": 771, "train/ce_loss": 0.4804728031158447 }, { "epoch": 0.15246193395293653, "step": 771, "train/sim_loss": 0.0007261037826538086 }, { "epoch": 0.15246193395293653, "step": 771, "train/total_loss": 0.0487733855843544 }, { "entropy": 6.06578254699707, "epoch": 0.15265967965196756, "mean_token_accuracy": 0.7423049211502075, "num_tokens": 34846043.0, "step": 772, "train/ce_loss": 0.778624415397644 }, { "epoch": 0.15265967965196756, "step": 772, "train/sim_loss": 0.00039315223693847656 }, { "epoch": 0.15265967965196756, "step": 772, "train/total_loss": 0.07825559377670288 }, { "entropy": 6.078032493591309, "epoch": 0.1528574253509986, "mean_token_accuracy": 0.722082257270813, "num_tokens": 34892416.0, "step": 773, "train/ce_loss": 1.2663419246673584 }, { "epoch": 0.1528574253509986, "step": 773, "train/sim_loss": 0.0006077289581298828 }, { "epoch": 0.1528574253509986, "step": 773, "train/total_loss": 0.12724192440509796 }, { "entropy": 5.946030616760254, "epoch": 0.15305517105002966, "mean_token_accuracy": 0.7539302706718445, "num_tokens": 34932340.0, "step": 774, "train/ce_loss": 0.440004825592041 }, { "epoch": 0.15305517105002966, "step": 774, "train/sim_loss": 0.00042241811752319336 }, { "epoch": 0.15305517105002966, "step": 774, "train/total_loss": 0.044422902166843414 }, { "entropy": 6.500945091247559, "epoch": 0.15325291674906072, "mean_token_accuracy": 0.7091875672340393, "num_tokens": 34987553.0, "step": 775, "train/ce_loss": 1.682155966758728 }, { "epoch": 0.15325291674906072, "step": 775, "train/sim_loss": 0.0005405545234680176 }, { "epoch": 0.15325291674906072, "step": 775, "train/total_loss": 0.1687561571598053 }, { "entropy": 6.2921319007873535, "epoch": 0.15345066244809175, "mean_token_accuracy": 0.7085751295089722, "num_tokens": 35043044.0, "step": 776, "train/ce_loss": 1.095621943473816 }, { "epoch": 0.15345066244809175, "step": 776, "train/sim_loss": 0.0008479952812194824 }, { "epoch": 0.15345066244809175, "step": 776, "train/total_loss": 0.1104101911187172 }, { "entropy": 6.38136100769043, "epoch": 0.1536484081471228, "mean_token_accuracy": 0.727216362953186, "num_tokens": 35092241.0, "step": 777, "train/ce_loss": 0.6614619493484497 }, { "epoch": 0.1536484081471228, "step": 777, "train/sim_loss": 0.0005806088447570801 }, { "epoch": 0.1536484081471228, "step": 777, "train/total_loss": 0.06672680377960205 }, { "entropy": 6.465221405029297, "epoch": 0.15384615384615385, "mean_token_accuracy": 0.6899676322937012, "num_tokens": 35135672.0, "step": 778, "train/ce_loss": 2.742137908935547 }, { "epoch": 0.15384615384615385, "step": 778, "train/sim_loss": 0.0004805922508239746 }, { "epoch": 0.15384615384615385, "step": 778, "train/total_loss": 0.27469438314437866 }, { "entropy": 6.159089088439941, "epoch": 0.15404389954518488, "mean_token_accuracy": 0.703318178653717, "num_tokens": 35177991.0, "step": 779, "train/ce_loss": 0.6879115104675293 }, { "epoch": 0.15404389954518488, "step": 779, "train/sim_loss": 0.0003846883773803711 }, { "epoch": 0.15404389954518488, "step": 779, "train/total_loss": 0.0691758394241333 }, { "epoch": 0.15424164524421594, "grad_norm": 0.4756080210208893, "learning_rate": 9.619645860124643e-06, "loss": 0.0886, "step": 780 }, { "entropy": 6.611437797546387, "epoch": 0.15424164524421594, "mean_token_accuracy": 0.7034728527069092, "num_tokens": 35212155.0, "step": 780, "train/ce_loss": 3.49659712810535e-05 }, { "epoch": 0.15424164524421594, "step": 780, "train/sim_loss": 0.00042808055877685547 }, { "epoch": 0.15424164524421594, "step": 780, "train/total_loss": 0.00043157715117558837 }, { "entropy": 6.413640022277832, "epoch": 0.154439390943247, "mean_token_accuracy": 0.71387779712677, "num_tokens": 35277273.0, "step": 781, "train/ce_loss": 1.1239272356033325 }, { "epoch": 0.154439390943247, "step": 781, "train/sim_loss": 0.0009385347366333008 }, { "epoch": 0.154439390943247, "step": 781, "train/total_loss": 0.11333125829696655 }, { "entropy": 6.403033256530762, "epoch": 0.15463713664227802, "mean_token_accuracy": 0.7463445663452148, "num_tokens": 35336308.0, "step": 782, "train/ce_loss": 1.0557019710540771 }, { "epoch": 0.15463713664227802, "step": 782, "train/sim_loss": 0.0004553794860839844 }, { "epoch": 0.15463713664227802, "step": 782, "train/total_loss": 0.1060255765914917 }, { "entropy": 6.200300216674805, "epoch": 0.15483488234130907, "mean_token_accuracy": 0.777319610118866, "num_tokens": 35384381.0, "step": 783, "train/ce_loss": 1.0834182500839233 }, { "epoch": 0.15483488234130907, "step": 783, "train/sim_loss": 0.0009672641754150391 }, { "epoch": 0.15483488234130907, "step": 783, "train/total_loss": 0.10930909216403961 }, { "entropy": 5.918460845947266, "epoch": 0.15503262804034013, "mean_token_accuracy": 0.7202838659286499, "num_tokens": 35427721.0, "step": 784, "train/ce_loss": 1.630872130393982 }, { "epoch": 0.15503262804034013, "step": 784, "train/sim_loss": 0.0006432533264160156 }, { "epoch": 0.15503262804034013, "step": 784, "train/total_loss": 0.1637304723262787 }, { "entropy": 6.40040397644043, "epoch": 0.15523037373937118, "mean_token_accuracy": 0.729238748550415, "num_tokens": 35473385.0, "step": 785, "train/ce_loss": 4.1187195165548474e-05 }, { "epoch": 0.15523037373937118, "step": 785, "train/sim_loss": 0.00044220685958862305 }, { "epoch": 0.15523037373937118, "step": 785, "train/total_loss": 0.0004463255754671991 }, { "entropy": 5.755337238311768, "epoch": 0.1554281194384022, "mean_token_accuracy": 0.7762085795402527, "num_tokens": 35525230.0, "step": 786, "train/ce_loss": 0.51157146692276 }, { "epoch": 0.1554281194384022, "step": 786, "train/sim_loss": 0.00043845176696777344 }, { "epoch": 0.1554281194384022, "step": 786, "train/total_loss": 0.051595598459243774 }, { "entropy": 6.1873345375061035, "epoch": 0.15562586513743326, "mean_token_accuracy": 0.7570215463638306, "num_tokens": 35572373.0, "step": 787, "train/ce_loss": 0.4118484556674957 }, { "epoch": 0.15562586513743326, "step": 787, "train/sim_loss": 0.000567317008972168 }, { "epoch": 0.15562586513743326, "step": 787, "train/total_loss": 0.0417521633207798 }, { "entropy": 5.807949066162109, "epoch": 0.15582361083646432, "mean_token_accuracy": 0.7335680723190308, "num_tokens": 35600801.0, "step": 788, "train/ce_loss": 0.5406607389450073 }, { "epoch": 0.15582361083646432, "step": 788, "train/sim_loss": 0.00042569637298583984 }, { "epoch": 0.15582361083646432, "step": 788, "train/total_loss": 0.05449176952242851 }, { "entropy": 6.093209266662598, "epoch": 0.15602135653549534, "mean_token_accuracy": 0.7223300933837891, "num_tokens": 35631396.0, "step": 789, "train/ce_loss": 0.9416353702545166 }, { "epoch": 0.15602135653549534, "step": 789, "train/sim_loss": 0.0005903840065002441 }, { "epoch": 0.15602135653549534, "step": 789, "train/total_loss": 0.0947539210319519 }, { "entropy": 5.860041618347168, "epoch": 0.1562191022345264, "mean_token_accuracy": 0.7643818855285645, "num_tokens": 35667996.0, "step": 790, "train/ce_loss": 1.3608359098434448 }, { "epoch": 0.1562191022345264, "step": 790, "train/sim_loss": 0.0006351470947265625 }, { "epoch": 0.1562191022345264, "step": 790, "train/total_loss": 0.1367187350988388 }, { "entropy": 5.466909408569336, "epoch": 0.15641684793355745, "mean_token_accuracy": 0.7891201972961426, "num_tokens": 35709128.0, "step": 791, "train/ce_loss": 2.6416564651299268e-05 }, { "epoch": 0.15641684793355745, "step": 791, "train/sim_loss": 0.00038874149322509766 }, { "epoch": 0.15641684793355745, "step": 791, "train/total_loss": 0.000391383160604164 }, { "entropy": 5.976718902587891, "epoch": 0.15661459363258848, "mean_token_accuracy": 0.7595287561416626, "num_tokens": 35757916.0, "step": 792, "train/ce_loss": 1.1642018556594849 }, { "epoch": 0.15661459363258848, "step": 792, "train/sim_loss": 0.0004552602767944336 }, { "epoch": 0.15661459363258848, "step": 792, "train/total_loss": 0.11687544733285904 }, { "entropy": 5.695167541503906, "epoch": 0.15681233933161953, "mean_token_accuracy": 0.7426650524139404, "num_tokens": 35797908.0, "step": 793, "train/ce_loss": 0.4294622242450714 }, { "epoch": 0.15681233933161953, "step": 793, "train/sim_loss": 0.0005609989166259766 }, { "epoch": 0.15681233933161953, "step": 793, "train/total_loss": 0.04350722208619118 }, { "entropy": 6.2554755210876465, "epoch": 0.1570100850306506, "mean_token_accuracy": 0.7431507110595703, "num_tokens": 35845439.0, "step": 794, "train/ce_loss": 0.6149212121963501 }, { "epoch": 0.1570100850306506, "step": 794, "train/sim_loss": 0.0005919933319091797 }, { "epoch": 0.1570100850306506, "step": 794, "train/total_loss": 0.06208411604166031 }, { "entropy": 6.239692687988281, "epoch": 0.15720783072968164, "mean_token_accuracy": 0.7469803094863892, "num_tokens": 35892225.0, "step": 795, "train/ce_loss": 1.068001627922058 }, { "epoch": 0.15720783072968164, "step": 795, "train/sim_loss": 0.0004540681838989258 }, { "epoch": 0.15720783072968164, "step": 795, "train/total_loss": 0.10725422948598862 }, { "entropy": 6.0857253074646, "epoch": 0.15740557642871267, "mean_token_accuracy": 0.7736873626708984, "num_tokens": 35939406.0, "step": 796, "train/ce_loss": 0.538107693195343 }, { "epoch": 0.15740557642871267, "step": 796, "train/sim_loss": 0.00041353702545166016 }, { "epoch": 0.15740557642871267, "step": 796, "train/total_loss": 0.05422430858016014 }, { "entropy": 5.920829772949219, "epoch": 0.15760332212774372, "mean_token_accuracy": 0.760617733001709, "num_tokens": 35987011.0, "step": 797, "train/ce_loss": 1.4633541107177734 }, { "epoch": 0.15760332212774372, "step": 797, "train/sim_loss": 0.0007446408271789551 }, { "epoch": 0.15760332212774372, "step": 797, "train/total_loss": 0.14708004891872406 }, { "entropy": 5.816672325134277, "epoch": 0.15780106782677478, "mean_token_accuracy": 0.75, "num_tokens": 36035054.0, "step": 798, "train/ce_loss": 0.7522972226142883 }, { "epoch": 0.15780106782677478, "step": 798, "train/sim_loss": 0.00031441450119018555 }, { "epoch": 0.15780106782677478, "step": 798, "train/total_loss": 0.07554414123296738 }, { "entropy": 5.873414993286133, "epoch": 0.1579988135258058, "mean_token_accuracy": 0.7119497060775757, "num_tokens": 36084664.0, "step": 799, "train/ce_loss": 1.4956581592559814 }, { "epoch": 0.1579988135258058, "step": 799, "train/sim_loss": 0.0005009174346923828 }, { "epoch": 0.1579988135258058, "step": 799, "train/total_loss": 0.15006673336029053 }, { "epoch": 0.15819655922483686, "grad_norm": 0.4992196261882782, "learning_rate": 9.609753684835296e-06, "loss": 0.0817, "step": 800 }, { "entropy": 6.190710067749023, "epoch": 0.15819655922483686, "mean_token_accuracy": 0.7223230600357056, "num_tokens": 36132588.0, "step": 800, "train/ce_loss": 0.8538002371788025 }, { "epoch": 0.15819655922483686, "step": 800, "train/sim_loss": 0.0013027191162109375 }, { "epoch": 0.15819655922483686, "step": 800, "train/total_loss": 0.0866827443242073 }, { "entropy": 6.134610176086426, "epoch": 0.1583943049238679, "mean_token_accuracy": 0.7444113492965698, "num_tokens": 36176577.0, "step": 801, "train/ce_loss": 4.121221354580484e-05 }, { "epoch": 0.1583943049238679, "step": 801, "train/sim_loss": 0.0007519125938415527 }, { "epoch": 0.1583943049238679, "step": 801, "train/total_loss": 0.000756033812649548 }, { "entropy": 6.099167346954346, "epoch": 0.15859205062289894, "mean_token_accuracy": 0.7731277346611023, "num_tokens": 36224174.0, "step": 802, "train/ce_loss": 1.13960862159729 }, { "epoch": 0.15859205062289894, "step": 802, "train/sim_loss": 0.0006726980209350586 }, { "epoch": 0.15859205062289894, "step": 802, "train/total_loss": 0.11463356018066406 }, { "entropy": 6.260166645050049, "epoch": 0.15878979632193, "mean_token_accuracy": 0.7448617815971375, "num_tokens": 36283375.0, "step": 803, "train/ce_loss": 0.6627668142318726 }, { "epoch": 0.15878979632193, "step": 803, "train/sim_loss": 0.0009391903877258301 }, { "epoch": 0.15878979632193, "step": 803, "train/total_loss": 0.06721587479114532 }, { "entropy": 5.594262599945068, "epoch": 0.15898754202096105, "mean_token_accuracy": 0.7552370429039001, "num_tokens": 36331509.0, "step": 804, "train/ce_loss": 0.5497221946716309 }, { "epoch": 0.15898754202096105, "step": 804, "train/sim_loss": 0.0005052089691162109 }, { "epoch": 0.15898754202096105, "step": 804, "train/total_loss": 0.05547742918133736 }, { "entropy": 6.495787620544434, "epoch": 0.1591852877199921, "mean_token_accuracy": 0.7160725593566895, "num_tokens": 36381959.0, "step": 805, "train/ce_loss": 0.8676860332489014 }, { "epoch": 0.1591852877199921, "step": 805, "train/sim_loss": 0.0006629228591918945 }, { "epoch": 0.1591852877199921, "step": 805, "train/total_loss": 0.08743152767419815 }, { "entropy": 6.389760494232178, "epoch": 0.15938303341902313, "mean_token_accuracy": 0.7168508172035217, "num_tokens": 36436642.0, "step": 806, "train/ce_loss": 0.5473613142967224 }, { "epoch": 0.15938303341902313, "step": 806, "train/sim_loss": 0.0005332231521606445 }, { "epoch": 0.15938303341902313, "step": 806, "train/total_loss": 0.055269356817007065 }, { "entropy": 5.9624104499816895, "epoch": 0.15958077911805418, "mean_token_accuracy": 0.7268798351287842, "num_tokens": 36485948.0, "step": 807, "train/ce_loss": 3.5513294278644025e-05 }, { "epoch": 0.15958077911805418, "step": 807, "train/sim_loss": 0.0006628632545471191 }, { "epoch": 0.15958077911805418, "step": 807, "train/total_loss": 0.0006664145621471107 }, { "entropy": 6.512443542480469, "epoch": 0.15977852481708524, "mean_token_accuracy": 0.7596091032028198, "num_tokens": 36545803.0, "step": 808, "train/ce_loss": 1.3727333545684814 }, { "epoch": 0.15977852481708524, "step": 808, "train/sim_loss": 0.000707089900970459 }, { "epoch": 0.15977852481708524, "step": 808, "train/total_loss": 0.13798043131828308 }, { "entropy": 5.735389709472656, "epoch": 0.15997627051611626, "mean_token_accuracy": 0.7597765326499939, "num_tokens": 36567531.0, "step": 809, "train/ce_loss": 1.940700888633728 }, { "epoch": 0.15997627051611626, "step": 809, "train/sim_loss": 0.0006239414215087891 }, { "epoch": 0.15997627051611626, "step": 809, "train/total_loss": 0.19469402730464935 }, { "entropy": 6.097994327545166, "epoch": 0.16017401621514732, "mean_token_accuracy": 0.773402214050293, "num_tokens": 36626697.0, "step": 810, "train/ce_loss": 0.7916562557220459 }, { "epoch": 0.16017401621514732, "step": 810, "train/sim_loss": 0.00035011768341064453 }, { "epoch": 0.16017401621514732, "step": 810, "train/total_loss": 0.07951574772596359 }, { "entropy": 5.857694149017334, "epoch": 0.16037176191417837, "mean_token_accuracy": 0.7578166127204895, "num_tokens": 36671596.0, "step": 811, "train/ce_loss": 0.6413581371307373 }, { "epoch": 0.16037176191417837, "step": 811, "train/sim_loss": 0.000847935676574707 }, { "epoch": 0.16037176191417837, "step": 811, "train/total_loss": 0.06498374789953232 }, { "entropy": 6.248139381408691, "epoch": 0.1605695076132094, "mean_token_accuracy": 0.7416728138923645, "num_tokens": 36712428.0, "step": 812, "train/ce_loss": 3.632614607340656e-05 }, { "epoch": 0.1605695076132094, "step": 812, "train/sim_loss": 0.00033551454544067383 }, { "epoch": 0.1605695076132094, "step": 812, "train/total_loss": 0.0003391471691429615 }, { "entropy": 6.253540992736816, "epoch": 0.16076725331224045, "mean_token_accuracy": 0.7048950791358948, "num_tokens": 36755288.0, "step": 813, "train/ce_loss": 1.2946990728378296 }, { "epoch": 0.16076725331224045, "step": 813, "train/sim_loss": 0.0004850625991821289 }, { "epoch": 0.16076725331224045, "step": 813, "train/total_loss": 0.1299549788236618 }, { "entropy": 5.997068881988525, "epoch": 0.1609649990112715, "mean_token_accuracy": 0.7517843246459961, "num_tokens": 36788414.0, "step": 814, "train/ce_loss": 0.8462032079696655 }, { "epoch": 0.1609649990112715, "step": 814, "train/sim_loss": 0.0004951953887939453 }, { "epoch": 0.1609649990112715, "step": 814, "train/total_loss": 0.08511551469564438 }, { "entropy": 5.76879358291626, "epoch": 0.16116274471030256, "mean_token_accuracy": 0.7209994196891785, "num_tokens": 36834097.0, "step": 815, "train/ce_loss": 1.5258455276489258 }, { "epoch": 0.16116274471030256, "step": 815, "train/sim_loss": 0.0006303787231445312 }, { "epoch": 0.16116274471030256, "step": 815, "train/total_loss": 0.1532149314880371 }, { "entropy": 6.401705741882324, "epoch": 0.1613604904093336, "mean_token_accuracy": 0.7501810193061829, "num_tokens": 36864394.0, "step": 816, "train/ce_loss": 0.43305104970932007 }, { "epoch": 0.1613604904093336, "step": 816, "train/sim_loss": 0.0008289813995361328 }, { "epoch": 0.1613604904093336, "step": 816, "train/total_loss": 0.04413408786058426 }, { "entropy": 5.814763069152832, "epoch": 0.16155823610836464, "mean_token_accuracy": 0.7739602327346802, "num_tokens": 36900415.0, "step": 817, "train/ce_loss": 0.3828214108943939 }, { "epoch": 0.16155823610836464, "step": 817, "train/sim_loss": 0.0003387331962585449 }, { "epoch": 0.16155823610836464, "step": 817, "train/total_loss": 0.03862087428569794 }, { "entropy": 5.838335990905762, "epoch": 0.1617559818073957, "mean_token_accuracy": 0.7919947504997253, "num_tokens": 36954338.0, "step": 818, "train/ce_loss": 0.5943712592124939 }, { "epoch": 0.1617559818073957, "step": 818, "train/sim_loss": 0.0006898045539855957 }, { "epoch": 0.1617559818073957, "step": 818, "train/total_loss": 0.060126930475234985 }, { "entropy": 6.038624286651611, "epoch": 0.16195372750642673, "mean_token_accuracy": 0.7573726773262024, "num_tokens": 36996544.0, "step": 819, "train/ce_loss": 0.9424594640731812 }, { "epoch": 0.16195372750642673, "step": 819, "train/sim_loss": 0.0003249645233154297 }, { "epoch": 0.16195372750642673, "step": 819, "train/total_loss": 0.09457091242074966 }, { "epoch": 0.16215147320545778, "grad_norm": 0.43687674403190613, "learning_rate": 9.59986150954595e-06, "loss": 0.0808, "step": 820 }, { "entropy": 6.421820640563965, "epoch": 0.16215147320545778, "mean_token_accuracy": 0.7325466871261597, "num_tokens": 37032444.0, "step": 820, "train/ce_loss": 0.6137392520904541 }, { "epoch": 0.16215147320545778, "step": 820, "train/sim_loss": 0.0003642439842224121 }, { "epoch": 0.16215147320545778, "step": 820, "train/total_loss": 0.06173817068338394 }, { "entropy": 6.042006969451904, "epoch": 0.16234921890448883, "mean_token_accuracy": 0.72586590051651, "num_tokens": 37072174.0, "step": 821, "train/ce_loss": 3.122860653093085e-05 }, { "epoch": 0.16234921890448883, "step": 821, "train/sim_loss": 0.0006390810012817383 }, { "epoch": 0.16234921890448883, "step": 821, "train/total_loss": 0.0006422038422897458 }, { "entropy": 6.308150768280029, "epoch": 0.16254696460351986, "mean_token_accuracy": 0.7253424525260925, "num_tokens": 37111573.0, "step": 822, "train/ce_loss": 0.5570074915885925 }, { "epoch": 0.16254696460351986, "step": 822, "train/sim_loss": 0.0007982254028320312 }, { "epoch": 0.16254696460351986, "step": 822, "train/total_loss": 0.056498974561691284 }, { "entropy": 6.11309289932251, "epoch": 0.16274471030255092, "mean_token_accuracy": 0.6963109374046326, "num_tokens": 37157361.0, "step": 823, "train/ce_loss": 2.99302555504255e-05 }, { "epoch": 0.16274471030255092, "step": 823, "train/sim_loss": 0.0005219578742980957 }, { "epoch": 0.16274471030255092, "step": 823, "train/total_loss": 0.0005249509122222662 }, { "entropy": 6.6020612716674805, "epoch": 0.16294245600158197, "mean_token_accuracy": 0.7255043387413025, "num_tokens": 37199735.0, "step": 824, "train/ce_loss": 1.26649808883667 }, { "epoch": 0.16294245600158197, "step": 824, "train/sim_loss": 0.0006380081176757812 }, { "epoch": 0.16294245600158197, "step": 824, "train/total_loss": 0.127287819981575 }, { "entropy": 6.061012268066406, "epoch": 0.16314020170061302, "mean_token_accuracy": 0.7201998829841614, "num_tokens": 37248149.0, "step": 825, "train/ce_loss": 3.0307804991025478e-05 }, { "epoch": 0.16314020170061302, "step": 825, "train/sim_loss": 0.0005083084106445312 }, { "epoch": 0.16314020170061302, "step": 825, "train/total_loss": 0.0005113391671329737 }, { "entropy": 6.557894229888916, "epoch": 0.16333794739964405, "mean_token_accuracy": 0.6984957456588745, "num_tokens": 37299392.0, "step": 826, "train/ce_loss": 0.7592353224754333 }, { "epoch": 0.16333794739964405, "step": 826, "train/sim_loss": 0.00033992528915405273 }, { "epoch": 0.16333794739964405, "step": 826, "train/total_loss": 0.07626345753669739 }, { "entropy": 5.689159393310547, "epoch": 0.1635356930986751, "mean_token_accuracy": 0.7286773920059204, "num_tokens": 37330733.0, "step": 827, "train/ce_loss": 0.9632502198219299 }, { "epoch": 0.1635356930986751, "step": 827, "train/sim_loss": 0.0004526376724243164 }, { "epoch": 0.1635356930986751, "step": 827, "train/total_loss": 0.09677766263484955 }, { "entropy": 6.383375644683838, "epoch": 0.16373343879770616, "mean_token_accuracy": 0.7261261343955994, "num_tokens": 37368977.0, "step": 828, "train/ce_loss": 1.537707805633545 }, { "epoch": 0.16373343879770616, "step": 828, "train/sim_loss": 0.0003993511199951172 }, { "epoch": 0.16373343879770616, "step": 828, "train/total_loss": 0.15417014062404633 }, { "entropy": 6.461726665496826, "epoch": 0.1639311844967372, "mean_token_accuracy": 0.7003890872001648, "num_tokens": 37424072.0, "step": 829, "train/ce_loss": 1.0270617008209229 }, { "epoch": 0.1639311844967372, "step": 829, "train/sim_loss": 0.0007734298706054688 }, { "epoch": 0.1639311844967372, "step": 829, "train/total_loss": 0.10347960144281387 }, { "entropy": 6.034839630126953, "epoch": 0.16412893019576824, "mean_token_accuracy": 0.71387779712677, "num_tokens": 37464152.0, "step": 830, "train/ce_loss": 1.2900217771530151 }, { "epoch": 0.16412893019576824, "step": 830, "train/sim_loss": 0.0005019903182983398 }, { "epoch": 0.16412893019576824, "step": 830, "train/total_loss": 0.12950417399406433 }, { "entropy": 5.858022212982178, "epoch": 0.1643266758947993, "mean_token_accuracy": 0.7901726365089417, "num_tokens": 37504901.0, "step": 831, "train/ce_loss": 0.8047574758529663 }, { "epoch": 0.1643266758947993, "step": 831, "train/sim_loss": 0.0005994439125061035 }, { "epoch": 0.1643266758947993, "step": 831, "train/total_loss": 0.08107519149780273 }, { "entropy": 6.355953216552734, "epoch": 0.16452442159383032, "mean_token_accuracy": 0.7534050345420837, "num_tokens": 37539509.0, "step": 832, "train/ce_loss": 0.4613479971885681 }, { "epoch": 0.16452442159383032, "step": 832, "train/sim_loss": 0.0004165768623352051 }, { "epoch": 0.16452442159383032, "step": 832, "train/total_loss": 0.04655137658119202 }, { "entropy": 6.002435684204102, "epoch": 0.16472216729286138, "mean_token_accuracy": 0.680701732635498, "num_tokens": 37583957.0, "step": 833, "train/ce_loss": 2.268211841583252 }, { "epoch": 0.16472216729286138, "step": 833, "train/sim_loss": 0.000425875186920166 }, { "epoch": 0.16472216729286138, "step": 833, "train/total_loss": 0.22724705934524536 }, { "entropy": 6.310410499572754, "epoch": 0.16491991299189243, "mean_token_accuracy": 0.7442273497581482, "num_tokens": 37624162.0, "step": 834, "train/ce_loss": 0.9455983638763428 }, { "epoch": 0.16491991299189243, "step": 834, "train/sim_loss": 0.00045430660247802734 }, { "epoch": 0.16491991299189243, "step": 834, "train/total_loss": 0.09501414746046066 }, { "entropy": 6.299015998840332, "epoch": 0.16511765869092349, "mean_token_accuracy": 0.704483687877655, "num_tokens": 37670103.0, "step": 835, "train/ce_loss": 0.8755614161491394 }, { "epoch": 0.16511765869092349, "step": 835, "train/sim_loss": 0.0005204677581787109 }, { "epoch": 0.16511765869092349, "step": 835, "train/total_loss": 0.08807661384344101 }, { "entropy": 6.009383678436279, "epoch": 0.1653154043899545, "mean_token_accuracy": 0.7513644695281982, "num_tokens": 37710146.0, "step": 836, "train/ce_loss": 0.7953025698661804 }, { "epoch": 0.1653154043899545, "step": 836, "train/sim_loss": 0.0008255839347839355 }, { "epoch": 0.1653154043899545, "step": 836, "train/total_loss": 0.08035584539175034 }, { "entropy": 6.124383926391602, "epoch": 0.16551315008898557, "mean_token_accuracy": 0.7519566416740417, "num_tokens": 37745339.0, "step": 837, "train/ce_loss": 1.2637065649032593 }, { "epoch": 0.16551315008898557, "step": 837, "train/sim_loss": 0.0004146695137023926 }, { "epoch": 0.16551315008898557, "step": 837, "train/total_loss": 0.12678532302379608 }, { "entropy": 6.323497772216797, "epoch": 0.16571089578801662, "mean_token_accuracy": 0.7380585670471191, "num_tokens": 37790124.0, "step": 838, "train/ce_loss": 1.0167328119277954 }, { "epoch": 0.16571089578801662, "step": 838, "train/sim_loss": 0.000578761100769043 }, { "epoch": 0.16571089578801662, "step": 838, "train/total_loss": 0.1022520437836647 }, { "entropy": 6.189668655395508, "epoch": 0.16590864148704765, "mean_token_accuracy": 0.7155115604400635, "num_tokens": 37825060.0, "step": 839, "train/ce_loss": 0.7132323980331421 }, { "epoch": 0.16590864148704765, "step": 839, "train/sim_loss": 0.0007999539375305176 }, { "epoch": 0.16590864148704765, "step": 839, "train/total_loss": 0.07212319225072861 }, { "epoch": 0.1661063871860787, "grad_norm": 0.47489091753959656, "learning_rate": 9.589969334256604e-06, "loss": 0.0885, "step": 840 }, { "entropy": 6.144478797912598, "epoch": 0.1661063871860787, "mean_token_accuracy": 0.732421875, "num_tokens": 37868127.0, "step": 840, "train/ce_loss": 0.578076183795929 }, { "epoch": 0.1661063871860787, "step": 840, "train/sim_loss": 0.00034356117248535156 }, { "epoch": 0.1661063871860787, "step": 840, "train/total_loss": 0.058151181787252426 }, { "entropy": 6.121541500091553, "epoch": 0.16630413288510976, "mean_token_accuracy": 0.7436440587043762, "num_tokens": 37923608.0, "step": 841, "train/ce_loss": 1.4891854524612427 }, { "epoch": 0.16630413288510976, "step": 841, "train/sim_loss": 0.00045502185821533203 }, { "epoch": 0.16630413288510976, "step": 841, "train/total_loss": 0.14937357604503632 }, { "entropy": 6.027796745300293, "epoch": 0.16650187858414078, "mean_token_accuracy": 0.7598627805709839, "num_tokens": 37984659.0, "step": 842, "train/ce_loss": 0.8870397806167603 }, { "epoch": 0.16650187858414078, "step": 842, "train/sim_loss": 0.00029909610748291016 }, { "epoch": 0.16650187858414078, "step": 842, "train/total_loss": 0.0890030786395073 }, { "entropy": 6.112740516662598, "epoch": 0.16669962428317184, "mean_token_accuracy": 0.7417135834693909, "num_tokens": 38028547.0, "step": 843, "train/ce_loss": 2.358464917051606e-05 }, { "epoch": 0.16669962428317184, "step": 843, "train/sim_loss": 0.0008102059364318848 }, { "epoch": 0.16669962428317184, "step": 843, "train/total_loss": 0.0008125643944367766 }, { "entropy": 6.142219066619873, "epoch": 0.1668973699822029, "mean_token_accuracy": 0.7256894111633301, "num_tokens": 38061977.0, "step": 844, "train/ce_loss": 0.7927985191345215 }, { "epoch": 0.1668973699822029, "step": 844, "train/sim_loss": 0.0005742311477661133 }, { "epoch": 0.1668973699822029, "step": 844, "train/total_loss": 0.0798540860414505 }, { "entropy": 6.43042516708374, "epoch": 0.16709511568123395, "mean_token_accuracy": 0.7336719632148743, "num_tokens": 38106580.0, "step": 845, "train/ce_loss": 1.0883145332336426 }, { "epoch": 0.16709511568123395, "step": 845, "train/sim_loss": 0.00046896934509277344 }, { "epoch": 0.16709511568123395, "step": 845, "train/total_loss": 0.10930042713880539 }, { "entropy": 6.226681709289551, "epoch": 0.16729286138026497, "mean_token_accuracy": 0.7424242496490479, "num_tokens": 38153077.0, "step": 846, "train/ce_loss": 0.7316372990608215 }, { "epoch": 0.16729286138026497, "step": 846, "train/sim_loss": 0.001212775707244873 }, { "epoch": 0.16729286138026497, "step": 846, "train/total_loss": 0.07437650859355927 }, { "entropy": 6.272769451141357, "epoch": 0.16749060707929603, "mean_token_accuracy": 0.726052463054657, "num_tokens": 38188600.0, "step": 847, "train/ce_loss": 0.9749823808670044 }, { "epoch": 0.16749060707929603, "step": 847, "train/sim_loss": 0.0003444552421569824 }, { "epoch": 0.16749060707929603, "step": 847, "train/total_loss": 0.09784269332885742 }, { "entropy": 6.511717796325684, "epoch": 0.16768835277832708, "mean_token_accuracy": 0.7506281137466431, "num_tokens": 38233069.0, "step": 848, "train/ce_loss": 0.5407779812812805 }, { "epoch": 0.16768835277832708, "step": 848, "train/sim_loss": 0.0006200075149536133 }, { "epoch": 0.16768835277832708, "step": 848, "train/total_loss": 0.054697807878255844 }, { "entropy": 6.194171905517578, "epoch": 0.1678860984773581, "mean_token_accuracy": 0.7612903118133545, "num_tokens": 38284069.0, "step": 849, "train/ce_loss": 0.8805084228515625 }, { "epoch": 0.1678860984773581, "step": 849, "train/sim_loss": 0.0008324384689331055 }, { "epoch": 0.1678860984773581, "step": 849, "train/total_loss": 0.08888328075408936 }, { "entropy": 6.105377674102783, "epoch": 0.16808384417638916, "mean_token_accuracy": 0.7136101126670837, "num_tokens": 38315688.0, "step": 850, "train/ce_loss": 0.6117926239967346 }, { "epoch": 0.16808384417638916, "step": 850, "train/sim_loss": 0.0003508329391479492 }, { "epoch": 0.16808384417638916, "step": 850, "train/total_loss": 0.06153009459376335 }, { "entropy": 6.150570392608643, "epoch": 0.16828158987542022, "mean_token_accuracy": 0.6991661190986633, "num_tokens": 38365577.0, "step": 851, "train/ce_loss": 1.0462929010391235 }, { "epoch": 0.16828158987542022, "step": 851, "train/sim_loss": 0.0005286335945129395 }, { "epoch": 0.16828158987542022, "step": 851, "train/total_loss": 0.10515792667865753 }, { "entropy": 6.2901105880737305, "epoch": 0.16847933557445124, "mean_token_accuracy": 0.7516425848007202, "num_tokens": 38412733.0, "step": 852, "train/ce_loss": 2.1955584088573232e-05 }, { "epoch": 0.16847933557445124, "step": 852, "train/sim_loss": 0.00047659873962402344 }, { "epoch": 0.16847933557445124, "step": 852, "train/total_loss": 0.00047879430348984897 }, { "entropy": 6.549300670623779, "epoch": 0.1686770812734823, "mean_token_accuracy": 0.7533875107765198, "num_tokens": 38471111.0, "step": 853, "train/ce_loss": 4.007011739304289e-05 }, { "epoch": 0.1686770812734823, "step": 853, "train/sim_loss": 0.0004069805145263672 }, { "epoch": 0.1686770812734823, "step": 853, "train/total_loss": 0.0004109875299036503 }, { "entropy": 6.314935684204102, "epoch": 0.16887482697251335, "mean_token_accuracy": 0.7308868765830994, "num_tokens": 38516085.0, "step": 854, "train/ce_loss": 3.8365506043192e-05 }, { "epoch": 0.16887482697251335, "step": 854, "train/sim_loss": 0.0005425214767456055 }, { "epoch": 0.16887482697251335, "step": 854, "train/total_loss": 0.000546358001884073 }, { "entropy": 6.1727294921875, "epoch": 0.1690725726715444, "mean_token_accuracy": 0.7445185780525208, "num_tokens": 38548480.0, "step": 855, "train/ce_loss": 3.487121648504399e-05 }, { "epoch": 0.1690725726715444, "step": 855, "train/sim_loss": 0.0006808042526245117 }, { "epoch": 0.1690725726715444, "step": 855, "train/total_loss": 0.0006842913571745157 }, { "entropy": 6.168739318847656, "epoch": 0.16927031837057543, "mean_token_accuracy": 0.7487080097198486, "num_tokens": 38582884.0, "step": 856, "train/ce_loss": 1.0915284156799316 }, { "epoch": 0.16927031837057543, "step": 856, "train/sim_loss": 0.0004928708076477051 }, { "epoch": 0.16927031837057543, "step": 856, "train/total_loss": 0.10964571684598923 }, { "entropy": 6.370624542236328, "epoch": 0.1694680640696065, "mean_token_accuracy": 0.7331550717353821, "num_tokens": 38627361.0, "step": 857, "train/ce_loss": 0.7229024767875671 }, { "epoch": 0.1694680640696065, "step": 857, "train/sim_loss": 0.0006742477416992188 }, { "epoch": 0.1694680640696065, "step": 857, "train/total_loss": 0.07296449691057205 }, { "entropy": 6.006468772888184, "epoch": 0.16966580976863754, "mean_token_accuracy": 0.766835629940033, "num_tokens": 38679415.0, "step": 858, "train/ce_loss": 0.7404318451881409 }, { "epoch": 0.16966580976863754, "step": 858, "train/sim_loss": 0.0006622076034545898 }, { "epoch": 0.16966580976863754, "step": 858, "train/total_loss": 0.07470539212226868 }, { "entropy": 6.507725238800049, "epoch": 0.16986355546766857, "mean_token_accuracy": 0.7589082717895508, "num_tokens": 38714996.0, "step": 859, "train/ce_loss": 1.1401838064193726 }, { "epoch": 0.16986355546766857, "step": 859, "train/sim_loss": 0.0004978179931640625 }, { "epoch": 0.16986355546766857, "step": 859, "train/total_loss": 0.11451619863510132 }, { "epoch": 0.17006130116669962, "grad_norm": 0.48583880066871643, "learning_rate": 9.580077158967258e-06, "loss": 0.0842, "step": 860 }, { "entropy": 6.1049885749816895, "epoch": 0.17006130116669962, "mean_token_accuracy": 0.7207724452018738, "num_tokens": 38757061.0, "step": 860, "train/ce_loss": 0.622389018535614 }, { "epoch": 0.17006130116669962, "step": 860, "train/sim_loss": 0.000490725040435791 }, { "epoch": 0.17006130116669962, "step": 860, "train/total_loss": 0.06272962689399719 }, { "entropy": 6.2620849609375, "epoch": 0.17025904686573068, "mean_token_accuracy": 0.7105788588523865, "num_tokens": 38813639.0, "step": 861, "train/ce_loss": 0.4613713324069977 }, { "epoch": 0.17025904686573068, "step": 861, "train/sim_loss": 0.0004611015319824219 }, { "epoch": 0.17025904686573068, "step": 861, "train/total_loss": 0.04659823700785637 }, { "entropy": 5.932345390319824, "epoch": 0.1704567925647617, "mean_token_accuracy": 0.7539432048797607, "num_tokens": 38853109.0, "step": 862, "train/ce_loss": 0.8443958759307861 }, { "epoch": 0.1704567925647617, "step": 862, "train/sim_loss": 0.00038802623748779297 }, { "epoch": 0.1704567925647617, "step": 862, "train/total_loss": 0.08482761681079865 }, { "entropy": 6.420476913452148, "epoch": 0.17065453826379276, "mean_token_accuracy": 0.7263940572738647, "num_tokens": 38908040.0, "step": 863, "train/ce_loss": 3.618231130531058e-05 }, { "epoch": 0.17065453826379276, "step": 863, "train/sim_loss": 0.00042682886123657227 }, { "epoch": 0.17065453826379276, "step": 863, "train/total_loss": 0.00043044707854278386 }, { "entropy": 6.431980609893799, "epoch": 0.17085228396282381, "mean_token_accuracy": 0.7231183052062988, "num_tokens": 38953875.0, "step": 864, "train/ce_loss": 3.182882574037649e-05 }, { "epoch": 0.17085228396282381, "step": 864, "train/sim_loss": 0.0005233287811279297 }, { "epoch": 0.17085228396282381, "step": 864, "train/total_loss": 0.0005265116924419999 }, { "entropy": 5.7065935134887695, "epoch": 0.17105002966185484, "mean_token_accuracy": 0.7581893801689148, "num_tokens": 38995695.0, "step": 865, "train/ce_loss": 0.5963133573532104 }, { "epoch": 0.17105002966185484, "step": 865, "train/sim_loss": 0.000335693359375 }, { "epoch": 0.17105002966185484, "step": 865, "train/total_loss": 0.059967029839754105 }, { "entropy": 6.153328895568848, "epoch": 0.1712477753608859, "mean_token_accuracy": 0.7296677827835083, "num_tokens": 39048484.0, "step": 866, "train/ce_loss": 1.2630339860916138 }, { "epoch": 0.1712477753608859, "step": 866, "train/sim_loss": 0.0005652904510498047 }, { "epoch": 0.1712477753608859, "step": 866, "train/total_loss": 0.12686869502067566 }, { "entropy": 6.138699531555176, "epoch": 0.17144552105991695, "mean_token_accuracy": 0.7694703936576843, "num_tokens": 39086662.0, "step": 867, "train/ce_loss": 0.5129541158676147 }, { "epoch": 0.17144552105991695, "step": 867, "train/sim_loss": 0.0003323554992675781 }, { "epoch": 0.17144552105991695, "step": 867, "train/total_loss": 0.05162776634097099 }, { "entropy": 6.0272979736328125, "epoch": 0.171643266758948, "mean_token_accuracy": 0.7514156103134155, "num_tokens": 39133929.0, "step": 868, "train/ce_loss": 0.48970478773117065 }, { "epoch": 0.171643266758948, "step": 868, "train/sim_loss": 0.00029581785202026367 }, { "epoch": 0.171643266758948, "step": 868, "train/total_loss": 0.04926629737019539 }, { "entropy": 6.383090972900391, "epoch": 0.17184101245797903, "mean_token_accuracy": 0.6998680830001831, "num_tokens": 39187930.0, "step": 869, "train/ce_loss": 0.5336440205574036 }, { "epoch": 0.17184101245797903, "step": 869, "train/sim_loss": 0.00047910213470458984 }, { "epoch": 0.17184101245797903, "step": 869, "train/total_loss": 0.053843505680561066 }, { "entropy": 6.009572505950928, "epoch": 0.17203875815701009, "mean_token_accuracy": 0.7035024166107178, "num_tokens": 39217776.0, "step": 870, "train/ce_loss": 0.6906639337539673 }, { "epoch": 0.17203875815701009, "step": 870, "train/sim_loss": 0.0005308985710144043 }, { "epoch": 0.17203875815701009, "step": 870, "train/total_loss": 0.06959729641675949 }, { "entropy": 5.709681034088135, "epoch": 0.17223650385604114, "mean_token_accuracy": 0.7505720853805542, "num_tokens": 39250022.0, "step": 871, "train/ce_loss": 0.7183809280395508 }, { "epoch": 0.17223650385604114, "step": 871, "train/sim_loss": 0.0003713369369506836 }, { "epoch": 0.17223650385604114, "step": 871, "train/total_loss": 0.072209432721138 }, { "entropy": 6.341579437255859, "epoch": 0.17243424955507217, "mean_token_accuracy": 0.7193947434425354, "num_tokens": 39294977.0, "step": 872, "train/ce_loss": 1.166833758354187 }, { "epoch": 0.17243424955507217, "step": 872, "train/sim_loss": 0.00048041343688964844 }, { "epoch": 0.17243424955507217, "step": 872, "train/total_loss": 0.11716379225254059 }, { "entropy": 5.940237998962402, "epoch": 0.17263199525410322, "mean_token_accuracy": 0.7555266618728638, "num_tokens": 39335844.0, "step": 873, "train/ce_loss": 0.7204118371009827 }, { "epoch": 0.17263199525410322, "step": 873, "train/sim_loss": 0.0005341768264770508 }, { "epoch": 0.17263199525410322, "step": 873, "train/total_loss": 0.07257536053657532 }, { "entropy": 6.1406145095825195, "epoch": 0.17282974095313428, "mean_token_accuracy": 0.724250316619873, "num_tokens": 39387048.0, "step": 874, "train/ce_loss": 0.7937323451042175 }, { "epoch": 0.17282974095313428, "step": 874, "train/sim_loss": 0.00043892860412597656 }, { "epoch": 0.17282974095313428, "step": 874, "train/total_loss": 0.07981216162443161 }, { "entropy": 6.134888648986816, "epoch": 0.1730274866521653, "mean_token_accuracy": 0.7562310099601746, "num_tokens": 39437741.0, "step": 875, "train/ce_loss": 0.9624637961387634 }, { "epoch": 0.1730274866521653, "step": 875, "train/sim_loss": 0.0005238056182861328 }, { "epoch": 0.1730274866521653, "step": 875, "train/total_loss": 0.09677018970251083 }, { "entropy": 6.2160844802856445, "epoch": 0.17322523235119636, "mean_token_accuracy": 0.7269399762153625, "num_tokens": 39479035.0, "step": 876, "train/ce_loss": 0.7640460133552551 }, { "epoch": 0.17322523235119636, "step": 876, "train/sim_loss": 0.0007624626159667969 }, { "epoch": 0.17322523235119636, "step": 876, "train/total_loss": 0.07716706395149231 }, { "entropy": 6.355103492736816, "epoch": 0.1734229780502274, "mean_token_accuracy": 0.7287307977676392, "num_tokens": 39537312.0, "step": 877, "train/ce_loss": 0.5845659375190735 }, { "epoch": 0.1734229780502274, "step": 877, "train/sim_loss": 0.0007725358009338379 }, { "epoch": 0.1734229780502274, "step": 877, "train/total_loss": 0.059229131788015366 }, { "entropy": 5.9649457931518555, "epoch": 0.17362072374925847, "mean_token_accuracy": 0.7468982338905334, "num_tokens": 39581340.0, "step": 878, "train/ce_loss": 3.158990875817835e-05 }, { "epoch": 0.17362072374925847, "step": 878, "train/sim_loss": 0.00048673152923583984 }, { "epoch": 0.17362072374925847, "step": 878, "train/total_loss": 0.0004898905172012746 }, { "entropy": 5.915836334228516, "epoch": 0.1738184694482895, "mean_token_accuracy": 0.759215235710144, "num_tokens": 39617724.0, "step": 879, "train/ce_loss": 0.6222426891326904 }, { "epoch": 0.1738184694482895, "step": 879, "train/sim_loss": 0.0006793737411499023 }, { "epoch": 0.1738184694482895, "step": 879, "train/total_loss": 0.06290364265441895 }, { "epoch": 0.17401621514732055, "grad_norm": 0.4477567672729492, "learning_rate": 9.570184983677912e-06, "loss": 0.0853, "step": 880 }, { "entropy": 5.933104038238525, "epoch": 0.17401621514732055, "mean_token_accuracy": 0.767344057559967, "num_tokens": 39645314.0, "step": 880, "train/ce_loss": 1.3763948678970337 }, { "epoch": 0.17401621514732055, "step": 880, "train/sim_loss": 0.0004824399948120117 }, { "epoch": 0.17401621514732055, "step": 880, "train/total_loss": 0.13812193274497986 }, { "entropy": 5.671149253845215, "epoch": 0.1742139608463516, "mean_token_accuracy": 0.7520504593849182, "num_tokens": 39672955.0, "step": 881, "train/ce_loss": 0.3762763440608978 }, { "epoch": 0.1742139608463516, "step": 881, "train/sim_loss": 0.0005101561546325684 }, { "epoch": 0.1742139608463516, "step": 881, "train/total_loss": 0.03813778981566429 }, { "entropy": 5.822959899902344, "epoch": 0.17441170654538263, "mean_token_accuracy": 0.7465986609458923, "num_tokens": 39710823.0, "step": 882, "train/ce_loss": 3.469746297923848e-05 }, { "epoch": 0.17441170654538263, "step": 882, "train/sim_loss": 0.00031638145446777344 }, { "epoch": 0.17441170654538263, "step": 882, "train/total_loss": 0.00031985121313482523 }, { "entropy": 5.887517929077148, "epoch": 0.17460945224441368, "mean_token_accuracy": 0.779627799987793, "num_tokens": 39761188.0, "step": 883, "train/ce_loss": 0.8551323413848877 }, { "epoch": 0.17460945224441368, "step": 883, "train/sim_loss": 0.0003191232681274414 }, { "epoch": 0.17460945224441368, "step": 883, "train/total_loss": 0.08583235740661621 }, { "entropy": 5.781937599182129, "epoch": 0.17480719794344474, "mean_token_accuracy": 0.7555555701255798, "num_tokens": 39806716.0, "step": 884, "train/ce_loss": 3.398406624910422e-05 }, { "epoch": 0.17480719794344474, "step": 884, "train/sim_loss": 0.00031173229217529297 }, { "epoch": 0.17480719794344474, "step": 884, "train/total_loss": 0.00031513068825006485 }, { "entropy": 6.146018028259277, "epoch": 0.17500494364247576, "mean_token_accuracy": 0.7533432245254517, "num_tokens": 39852855.0, "step": 885, "train/ce_loss": 0.7430900931358337 }, { "epoch": 0.17500494364247576, "step": 885, "train/sim_loss": 0.0006860494613647461 }, { "epoch": 0.17500494364247576, "step": 885, "train/total_loss": 0.07499506324529648 }, { "entropy": 5.861197471618652, "epoch": 0.17520268934150682, "mean_token_accuracy": 0.7303047776222229, "num_tokens": 39893692.0, "step": 886, "train/ce_loss": 0.7413029670715332 }, { "epoch": 0.17520268934150682, "step": 886, "train/sim_loss": 0.00046956539154052734 }, { "epoch": 0.17520268934150682, "step": 886, "train/total_loss": 0.07459986209869385 }, { "entropy": 6.177164077758789, "epoch": 0.17540043504053787, "mean_token_accuracy": 0.6938775777816772, "num_tokens": 39932116.0, "step": 887, "train/ce_loss": 1.1344823837280273 }, { "epoch": 0.17540043504053787, "step": 887, "train/sim_loss": 0.0005655288696289062 }, { "epoch": 0.17540043504053787, "step": 887, "train/total_loss": 0.11401376873254776 }, { "entropy": 5.914151191711426, "epoch": 0.17559818073956893, "mean_token_accuracy": 0.7164009213447571, "num_tokens": 39977603.0, "step": 888, "train/ce_loss": 0.9021461606025696 }, { "epoch": 0.17559818073956893, "step": 888, "train/sim_loss": 0.00033921003341674805 }, { "epoch": 0.17559818073956893, "step": 888, "train/total_loss": 0.09055382758378983 }, { "entropy": 5.507960319519043, "epoch": 0.17579592643859995, "mean_token_accuracy": 0.7788881659507751, "num_tokens": 40012180.0, "step": 889, "train/ce_loss": 0.9448145627975464 }, { "epoch": 0.17579592643859995, "step": 889, "train/sim_loss": 0.0006470680236816406 }, { "epoch": 0.17579592643859995, "step": 889, "train/total_loss": 0.09512852877378464 }, { "entropy": 5.913802146911621, "epoch": 0.175993672137631, "mean_token_accuracy": 0.7600519061088562, "num_tokens": 40064782.0, "step": 890, "train/ce_loss": 0.8061873316764832 }, { "epoch": 0.175993672137631, "step": 890, "train/sim_loss": 0.0004913210868835449 }, { "epoch": 0.175993672137631, "step": 890, "train/total_loss": 0.08111005276441574 }, { "entropy": 6.091452598571777, "epoch": 0.17619141783666206, "mean_token_accuracy": 0.7137150168418884, "num_tokens": 40116475.0, "step": 891, "train/ce_loss": 1.4320881366729736 }, { "epoch": 0.17619141783666206, "step": 891, "train/sim_loss": 0.00039821863174438477 }, { "epoch": 0.17619141783666206, "step": 891, "train/total_loss": 0.143607035279274 }, { "entropy": 5.9276604652404785, "epoch": 0.1763891635356931, "mean_token_accuracy": 0.7619655132293701, "num_tokens": 40160062.0, "step": 892, "train/ce_loss": 1.189579725265503 }, { "epoch": 0.1763891635356931, "step": 892, "train/sim_loss": 0.00036787986755371094 }, { "epoch": 0.1763891635356931, "step": 892, "train/total_loss": 0.11932585388422012 }, { "entropy": 6.359746932983398, "epoch": 0.17658690923472414, "mean_token_accuracy": 0.7454090118408203, "num_tokens": 40211474.0, "step": 893, "train/ce_loss": 0.7903146743774414 }, { "epoch": 0.17658690923472414, "step": 893, "train/sim_loss": 0.0003693103790283203 }, { "epoch": 0.17658690923472414, "step": 893, "train/total_loss": 0.07940077781677246 }, { "entropy": 6.205320358276367, "epoch": 0.1767846549337552, "mean_token_accuracy": 0.7249398827552795, "num_tokens": 40264273.0, "step": 894, "train/ce_loss": 2.9258801077958196e-05 }, { "epoch": 0.1767846549337552, "step": 894, "train/sim_loss": 0.00031745433807373047 }, { "epoch": 0.1767846549337552, "step": 894, "train/total_loss": 0.0003203802043572068 }, { "entropy": 6.140684127807617, "epoch": 0.17698240063278622, "mean_token_accuracy": 0.7570621371269226, "num_tokens": 40304034.0, "step": 895, "train/ce_loss": 0.8934596180915833 }, { "epoch": 0.17698240063278622, "step": 895, "train/sim_loss": 0.000562131404876709 }, { "epoch": 0.17698240063278622, "step": 895, "train/total_loss": 0.08990809321403503 }, { "entropy": 5.876838684082031, "epoch": 0.17718014633181728, "mean_token_accuracy": 0.7276037931442261, "num_tokens": 40343481.0, "step": 896, "train/ce_loss": 1.5445058345794678 }, { "epoch": 0.17718014633181728, "step": 896, "train/sim_loss": 0.00028717517852783203 }, { "epoch": 0.17718014633181728, "step": 896, "train/total_loss": 0.15473775565624237 }, { "entropy": 5.990553855895996, "epoch": 0.17737789203084833, "mean_token_accuracy": 0.7552674412727356, "num_tokens": 40383853.0, "step": 897, "train/ce_loss": 1.4029604196548462 }, { "epoch": 0.17737789203084833, "step": 897, "train/sim_loss": 0.000529170036315918 }, { "epoch": 0.17737789203084833, "step": 897, "train/total_loss": 0.14082521200180054 }, { "entropy": 5.664402008056641, "epoch": 0.1775756377298794, "mean_token_accuracy": 0.7524547576904297, "num_tokens": 40422862.0, "step": 898, "train/ce_loss": 0.4607609212398529 }, { "epoch": 0.1775756377298794, "step": 898, "train/sim_loss": 0.00045228004455566406 }, { "epoch": 0.1775756377298794, "step": 898, "train/total_loss": 0.046528372913599014 }, { "entropy": 6.116058349609375, "epoch": 0.17777338342891041, "mean_token_accuracy": 0.7257187366485596, "num_tokens": 40471454.0, "step": 899, "train/ce_loss": 0.986572265625 }, { "epoch": 0.17777338342891041, "step": 899, "train/sim_loss": 0.0004464387893676758 }, { "epoch": 0.17777338342891041, "step": 899, "train/total_loss": 0.0991036668419838 }, { "epoch": 0.17797112912794147, "grad_norm": 0.5447693467140198, "learning_rate": 9.560292808388565e-06, "loss": 0.0812, "step": 900 }, { "entropy": 5.695709228515625, "epoch": 0.17797112912794147, "mean_token_accuracy": 0.7344573140144348, "num_tokens": 40522312.0, "step": 900, "train/ce_loss": 0.8247905969619751 }, { "epoch": 0.17797112912794147, "step": 900, "train/sim_loss": 0.0003930330276489258 }, { "epoch": 0.17797112912794147, "step": 900, "train/total_loss": 0.08287209272384644 }, { "entropy": 5.6394548416137695, "epoch": 0.17816887482697252, "mean_token_accuracy": 0.7413268685340881, "num_tokens": 40557739.0, "step": 901, "train/ce_loss": 0.8608778715133667 }, { "epoch": 0.17816887482697252, "step": 901, "train/sim_loss": 0.0006534457206726074 }, { "epoch": 0.17816887482697252, "step": 901, "train/total_loss": 0.08674123138189316 }, { "entropy": 5.615354537963867, "epoch": 0.17836662052600355, "mean_token_accuracy": 0.7687671184539795, "num_tokens": 40591361.0, "step": 902, "train/ce_loss": 0.7310686111450195 }, { "epoch": 0.17836662052600355, "step": 902, "train/sim_loss": 0.0002745389938354492 }, { "epoch": 0.17836662052600355, "step": 902, "train/total_loss": 0.07338140159845352 }, { "entropy": 5.983602046966553, "epoch": 0.1785643662250346, "mean_token_accuracy": 0.7626459002494812, "num_tokens": 40636027.0, "step": 903, "train/ce_loss": 0.572303831577301 }, { "epoch": 0.1785643662250346, "step": 903, "train/sim_loss": 0.0006339550018310547 }, { "epoch": 0.1785643662250346, "step": 903, "train/total_loss": 0.05786433815956116 }, { "entropy": 5.787463665008545, "epoch": 0.17876211192406566, "mean_token_accuracy": 0.7627118825912476, "num_tokens": 40684123.0, "step": 904, "train/ce_loss": 0.5522582530975342 }, { "epoch": 0.17876211192406566, "step": 904, "train/sim_loss": 0.0008505582809448242 }, { "epoch": 0.17876211192406566, "step": 904, "train/total_loss": 0.05607638508081436 }, { "entropy": 5.646628379821777, "epoch": 0.17895985762309669, "mean_token_accuracy": 0.7684407234191895, "num_tokens": 40709820.0, "step": 905, "train/ce_loss": 1.5415079593658447 }, { "epoch": 0.17895985762309669, "step": 905, "train/sim_loss": 0.0004050135612487793 }, { "epoch": 0.17895985762309669, "step": 905, "train/total_loss": 0.1545558124780655 }, { "entropy": 5.752814769744873, "epoch": 0.17915760332212774, "mean_token_accuracy": 0.716284990310669, "num_tokens": 40746107.0, "step": 906, "train/ce_loss": 0.8470278978347778 }, { "epoch": 0.17915760332212774, "step": 906, "train/sim_loss": 0.0004647374153137207 }, { "epoch": 0.17915760332212774, "step": 906, "train/total_loss": 0.0851675271987915 }, { "entropy": 5.951000213623047, "epoch": 0.1793553490211588, "mean_token_accuracy": 0.7640918493270874, "num_tokens": 40784210.0, "step": 907, "train/ce_loss": 1.4278758764266968 }, { "epoch": 0.1793553490211588, "step": 907, "train/sim_loss": 0.0005139708518981934 }, { "epoch": 0.1793553490211588, "step": 907, "train/total_loss": 0.1433015614748001 }, { "entropy": 5.829477787017822, "epoch": 0.17955309472018985, "mean_token_accuracy": 0.7200512290000916, "num_tokens": 40823908.0, "step": 908, "train/ce_loss": 0.9055891036987305 }, { "epoch": 0.17955309472018985, "step": 908, "train/sim_loss": 0.0003383159637451172 }, { "epoch": 0.17955309472018985, "step": 908, "train/total_loss": 0.09089722484350204 }, { "entropy": 5.680323600769043, "epoch": 0.17975084041922088, "mean_token_accuracy": 0.7300736904144287, "num_tokens": 40862663.0, "step": 909, "train/ce_loss": 2.1668562112608925e-05 }, { "epoch": 0.17975084041922088, "step": 909, "train/sim_loss": 0.0006186962127685547 }, { "epoch": 0.17975084041922088, "step": 909, "train/total_loss": 0.0006208630511537194 }, { "entropy": 5.834210395812988, "epoch": 0.17994858611825193, "mean_token_accuracy": 0.7213114500045776, "num_tokens": 40919880.0, "step": 910, "train/ce_loss": 0.9477481842041016 }, { "epoch": 0.17994858611825193, "step": 910, "train/sim_loss": 0.0004839897155761719 }, { "epoch": 0.17994858611825193, "step": 910, "train/total_loss": 0.09525880962610245 }, { "entropy": 5.730003356933594, "epoch": 0.18014633181728298, "mean_token_accuracy": 0.7355769276618958, "num_tokens": 40958927.0, "step": 911, "train/ce_loss": 2.8620426746783778e-05 }, { "epoch": 0.18014633181728298, "step": 911, "train/sim_loss": 0.000247955322265625 }, { "epoch": 0.18014633181728298, "step": 911, "train/total_loss": 0.00025081736384890974 }, { "entropy": 5.886023998260498, "epoch": 0.180344077516314, "mean_token_accuracy": 0.7641456723213196, "num_tokens": 41014602.0, "step": 912, "train/ce_loss": 1.0743240118026733 }, { "epoch": 0.180344077516314, "step": 912, "train/sim_loss": 0.00039076805114746094 }, { "epoch": 0.180344077516314, "step": 912, "train/total_loss": 0.10782317072153091 }, { "entropy": 5.883519172668457, "epoch": 0.18054182321534507, "mean_token_accuracy": 0.7728310227394104, "num_tokens": 41060851.0, "step": 913, "train/ce_loss": 4.4186639570398256e-05 }, { "epoch": 0.18054182321534507, "step": 913, "train/sim_loss": 0.00040853023529052734 }, { "epoch": 0.18054182321534507, "step": 913, "train/total_loss": 0.0004129488952457905 }, { "entropy": 6.174501895904541, "epoch": 0.18073956891437612, "mean_token_accuracy": 0.7061855792999268, "num_tokens": 41092886.0, "step": 914, "train/ce_loss": 1.044195532798767 }, { "epoch": 0.18073956891437612, "step": 914, "train/sim_loss": 0.0003470182418823242 }, { "epoch": 0.18073956891437612, "step": 914, "train/total_loss": 0.10476657003164291 }, { "entropy": 5.921706199645996, "epoch": 0.18093731461340715, "mean_token_accuracy": 0.7036144733428955, "num_tokens": 41125480.0, "step": 915, "train/ce_loss": 0.6598717570304871 }, { "epoch": 0.18093731461340715, "step": 915, "train/sim_loss": 0.0003033876419067383 }, { "epoch": 0.18093731461340715, "step": 915, "train/total_loss": 0.06629056483507156 }, { "entropy": 5.825878620147705, "epoch": 0.1811350603124382, "mean_token_accuracy": 0.761838436126709, "num_tokens": 41180751.0, "step": 916, "train/ce_loss": 0.7637384533882141 }, { "epoch": 0.1811350603124382, "step": 916, "train/sim_loss": 0.00033587217330932617 }, { "epoch": 0.1811350603124382, "step": 916, "train/total_loss": 0.07670971751213074 }, { "entropy": 5.624827861785889, "epoch": 0.18133280601146926, "mean_token_accuracy": 0.7425578832626343, "num_tokens": 41222640.0, "step": 917, "train/ce_loss": 1.1379565000534058 }, { "epoch": 0.18133280601146926, "step": 917, "train/sim_loss": 0.0009222626686096191 }, { "epoch": 0.18133280601146926, "step": 917, "train/total_loss": 0.11471791565418243 }, { "entropy": 5.632491111755371, "epoch": 0.1815305517105003, "mean_token_accuracy": 0.7354064583778381, "num_tokens": 41259078.0, "step": 918, "train/ce_loss": 0.707190215587616 }, { "epoch": 0.1815305517105003, "step": 918, "train/sim_loss": 0.0007188916206359863 }, { "epoch": 0.1815305517105003, "step": 918, "train/total_loss": 0.07143791764974594 }, { "entropy": 6.014286994934082, "epoch": 0.18172829740953134, "mean_token_accuracy": 0.7127496004104614, "num_tokens": 41294242.0, "step": 919, "train/ce_loss": 0.6055211424827576 }, { "epoch": 0.18172829740953134, "step": 919, "train/sim_loss": 0.0008692741394042969 }, { "epoch": 0.18172829740953134, "step": 919, "train/total_loss": 0.06142139062285423 }, { "epoch": 0.1819260431085624, "grad_norm": 0.5434746742248535, "learning_rate": 9.550400633099219e-06, "loss": 0.0823, "step": 920 }, { "entropy": 5.864728927612305, "epoch": 0.1819260431085624, "mean_token_accuracy": 0.7417551875114441, "num_tokens": 41338938.0, "step": 920, "train/ce_loss": 0.7124691605567932 }, { "epoch": 0.1819260431085624, "step": 920, "train/sim_loss": 0.0004849433898925781 }, { "epoch": 0.1819260431085624, "step": 920, "train/total_loss": 0.07173185795545578 }, { "entropy": 5.861666679382324, "epoch": 0.18212378880759345, "mean_token_accuracy": 0.7254259586334229, "num_tokens": 41390750.0, "step": 921, "train/ce_loss": 0.7540968656539917 }, { "epoch": 0.18212378880759345, "step": 921, "train/sim_loss": 0.0002512335777282715 }, { "epoch": 0.18212378880759345, "step": 921, "train/total_loss": 0.07566092163324356 }, { "entropy": 5.757942199707031, "epoch": 0.18232153450662447, "mean_token_accuracy": 0.761040985584259, "num_tokens": 41426348.0, "step": 922, "train/ce_loss": 3.153954094159417e-05 }, { "epoch": 0.18232153450662447, "step": 922, "train/sim_loss": 0.0006340742111206055 }, { "epoch": 0.18232153450662447, "step": 922, "train/total_loss": 0.0006372281932272017 }, { "entropy": 5.632686614990234, "epoch": 0.18251928020565553, "mean_token_accuracy": 0.7644683718681335, "num_tokens": 41459000.0, "step": 923, "train/ce_loss": 0.7295912504196167 }, { "epoch": 0.18251928020565553, "step": 923, "train/sim_loss": 0.00032514333724975586 }, { "epoch": 0.18251928020565553, "step": 923, "train/total_loss": 0.07328426837921143 }, { "entropy": 5.81019401550293, "epoch": 0.18271702590468658, "mean_token_accuracy": 0.7487223148345947, "num_tokens": 41501846.0, "step": 924, "train/ce_loss": 1.745457649230957 }, { "epoch": 0.18271702590468658, "step": 924, "train/sim_loss": 0.0005629062652587891 }, { "epoch": 0.18271702590468658, "step": 924, "train/total_loss": 0.1751086711883545 }, { "entropy": 5.602986812591553, "epoch": 0.1829147716037176, "mean_token_accuracy": 0.7757712602615356, "num_tokens": 41548228.0, "step": 925, "train/ce_loss": 2.5053112040041015e-05 }, { "epoch": 0.1829147716037176, "step": 925, "train/sim_loss": 0.00044035911560058594 }, { "epoch": 0.1829147716037176, "step": 925, "train/total_loss": 0.0004428644315339625 }, { "entropy": 6.029115200042725, "epoch": 0.18311251730274866, "mean_token_accuracy": 0.7302977442741394, "num_tokens": 41616043.0, "step": 926, "train/ce_loss": 1.502369999885559 }, { "epoch": 0.18311251730274866, "step": 926, "train/sim_loss": 0.0005166530609130859 }, { "epoch": 0.18311251730274866, "step": 926, "train/total_loss": 0.1507536619901657 }, { "entropy": 5.737114429473877, "epoch": 0.18331026300177972, "mean_token_accuracy": 0.7373877167701721, "num_tokens": 41665121.0, "step": 927, "train/ce_loss": 0.8905923366546631 }, { "epoch": 0.18331026300177972, "step": 927, "train/sim_loss": 0.00029969215393066406 }, { "epoch": 0.18331026300177972, "step": 927, "train/total_loss": 0.08935892581939697 }, { "entropy": 5.944769382476807, "epoch": 0.18350800870081077, "mean_token_accuracy": 0.7258805632591248, "num_tokens": 41701623.0, "step": 928, "train/ce_loss": 1.213098406791687 }, { "epoch": 0.18350800870081077, "step": 928, "train/sim_loss": 0.0003858208656311035 }, { "epoch": 0.18350800870081077, "step": 928, "train/total_loss": 0.12169566005468369 }, { "entropy": 6.29250431060791, "epoch": 0.1837057543998418, "mean_token_accuracy": 0.7293606400489807, "num_tokens": 41751498.0, "step": 929, "train/ce_loss": 0.998897135257721 }, { "epoch": 0.1837057543998418, "step": 929, "train/sim_loss": 0.0005041956901550293 }, { "epoch": 0.1837057543998418, "step": 929, "train/total_loss": 0.10039391368627548 }, { "entropy": 5.996811866760254, "epoch": 0.18390350009887285, "mean_token_accuracy": 0.7442116737365723, "num_tokens": 41786940.0, "step": 930, "train/ce_loss": 1.0927090644836426 }, { "epoch": 0.18390350009887285, "step": 930, "train/sim_loss": 0.0005134344100952148 }, { "epoch": 0.18390350009887285, "step": 930, "train/total_loss": 0.10978434234857559 }, { "entropy": 5.791335105895996, "epoch": 0.1841012457979039, "mean_token_accuracy": 0.7507787942886353, "num_tokens": 41828224.0, "step": 931, "train/ce_loss": 2.0682871763710864e-05 }, { "epoch": 0.1841012457979039, "step": 931, "train/sim_loss": 0.0006006360054016113 }, { "epoch": 0.1841012457979039, "step": 931, "train/total_loss": 0.0006027042982168496 }, { "entropy": 6.16992712020874, "epoch": 0.18429899149693493, "mean_token_accuracy": 0.6826722621917725, "num_tokens": 41876992.0, "step": 932, "train/ce_loss": 3.221597580704838e-05 }, { "epoch": 0.18429899149693493, "step": 932, "train/sim_loss": 0.00036585330963134766 }, { "epoch": 0.18429899149693493, "step": 932, "train/total_loss": 0.0003690748999360949 }, { "entropy": 5.910127639770508, "epoch": 0.184496737195966, "mean_token_accuracy": 0.7185809016227722, "num_tokens": 41940196.0, "step": 933, "train/ce_loss": 0.7783108353614807 }, { "epoch": 0.184496737195966, "step": 933, "train/sim_loss": 0.0003973245620727539 }, { "epoch": 0.184496737195966, "step": 933, "train/total_loss": 0.0782284066081047 }, { "entropy": 5.893465042114258, "epoch": 0.18469448289499704, "mean_token_accuracy": 0.7353119254112244, "num_tokens": 41979545.0, "step": 934, "train/ce_loss": 0.6887893080711365 }, { "epoch": 0.18469448289499704, "step": 934, "train/sim_loss": 0.0005317330360412598 }, { "epoch": 0.18469448289499704, "step": 934, "train/total_loss": 0.06941066682338715 }, { "entropy": 6.209988117218018, "epoch": 0.18489222859402807, "mean_token_accuracy": 0.7304204702377319, "num_tokens": 42033396.0, "step": 935, "train/ce_loss": 0.8151450157165527 }, { "epoch": 0.18489222859402807, "step": 935, "train/sim_loss": 0.0003324151039123535 }, { "epoch": 0.18489222859402807, "step": 935, "train/total_loss": 0.08184691518545151 }, { "entropy": 6.1435723304748535, "epoch": 0.18508997429305912, "mean_token_accuracy": 0.7108541131019592, "num_tokens": 42072081.0, "step": 936, "train/ce_loss": 0.8562251329421997 }, { "epoch": 0.18508997429305912, "step": 936, "train/sim_loss": 0.0004717707633972168 }, { "epoch": 0.18508997429305912, "step": 936, "train/total_loss": 0.08609428256750107 }, { "entropy": 5.61570405960083, "epoch": 0.18528771999209018, "mean_token_accuracy": 0.7305993437767029, "num_tokens": 42109162.0, "step": 937, "train/ce_loss": 0.48524266481399536 }, { "epoch": 0.18528771999209018, "step": 937, "train/sim_loss": 0.0006175041198730469 }, { "epoch": 0.18528771999209018, "step": 937, "train/total_loss": 0.0491417720913887 }, { "entropy": 5.812820911407471, "epoch": 0.18548546569112123, "mean_token_accuracy": 0.7201274633407593, "num_tokens": 42144545.0, "step": 938, "train/ce_loss": 0.7408753633499146 }, { "epoch": 0.18548546569112123, "step": 938, "train/sim_loss": 0.00039196014404296875 }, { "epoch": 0.18548546569112123, "step": 938, "train/total_loss": 0.07447949796915054 }, { "entropy": 5.9596967697143555, "epoch": 0.18568321139015226, "mean_token_accuracy": 0.7596726417541504, "num_tokens": 42202828.0, "step": 939, "train/ce_loss": 0.5127313137054443 }, { "epoch": 0.18568321139015226, "step": 939, "train/sim_loss": 0.0004585385322570801 }, { "epoch": 0.18568321139015226, "step": 939, "train/total_loss": 0.05173167213797569 }, { "epoch": 0.1858809570891833, "grad_norm": 0.4961676597595215, "learning_rate": 9.540508457809873e-06, "loss": 0.0853, "step": 940 }, { "entropy": 6.069070339202881, "epoch": 0.1858809570891833, "mean_token_accuracy": 0.7138461470603943, "num_tokens": 42237251.0, "step": 940, "train/ce_loss": 1.579426646232605 }, { "epoch": 0.1858809570891833, "step": 940, "train/sim_loss": 0.0004264712333679199 }, { "epoch": 0.1858809570891833, "step": 940, "train/total_loss": 0.15836913883686066 }, { "entropy": 5.470828056335449, "epoch": 0.18607870278821437, "mean_token_accuracy": 0.730434775352478, "num_tokens": 42278996.0, "step": 941, "train/ce_loss": 2.17940778384218e-05 }, { "epoch": 0.18607870278821437, "step": 941, "train/sim_loss": 0.00033605098724365234 }, { "epoch": 0.18607870278821437, "step": 941, "train/total_loss": 0.0003382303984835744 }, { "entropy": 5.408329963684082, "epoch": 0.1862764484872454, "mean_token_accuracy": 0.7528011202812195, "num_tokens": 42308550.0, "step": 942, "train/ce_loss": 1.2251064777374268 }, { "epoch": 0.1862764484872454, "step": 942, "train/sim_loss": 0.000446319580078125 }, { "epoch": 0.1862764484872454, "step": 942, "train/total_loss": 0.12295696884393692 }, { "entropy": 5.715401649475098, "epoch": 0.18647419418627645, "mean_token_accuracy": 0.7581612467765808, "num_tokens": 42342164.0, "step": 943, "train/ce_loss": 0.5403480529785156 }, { "epoch": 0.18647419418627645, "step": 943, "train/sim_loss": 0.00045669078826904297 }, { "epoch": 0.18647419418627645, "step": 943, "train/total_loss": 0.054491497576236725 }, { "entropy": 5.816168785095215, "epoch": 0.1866719398853075, "mean_token_accuracy": 0.7220486998558044, "num_tokens": 42392176.0, "step": 944, "train/ce_loss": 0.8888463973999023 }, { "epoch": 0.1866719398853075, "step": 944, "train/sim_loss": 0.0004544854164123535 }, { "epoch": 0.1866719398853075, "step": 944, "train/total_loss": 0.08933912962675095 }, { "entropy": 5.978943824768066, "epoch": 0.18686968558433853, "mean_token_accuracy": 0.740882933139801, "num_tokens": 42445845.0, "step": 945, "train/ce_loss": 0.7342298626899719 }, { "epoch": 0.18686968558433853, "step": 945, "train/sim_loss": 0.0005200505256652832 }, { "epoch": 0.18686968558433853, "step": 945, "train/total_loss": 0.07394304126501083 }, { "entropy": 5.415502548217773, "epoch": 0.18706743128336958, "mean_token_accuracy": 0.7520278096199036, "num_tokens": 42470230.0, "step": 946, "train/ce_loss": 0.7045917510986328 }, { "epoch": 0.18706743128336958, "step": 946, "train/sim_loss": 0.00048661231994628906 }, { "epoch": 0.18706743128336958, "step": 946, "train/total_loss": 0.07094579190015793 }, { "entropy": 5.813493728637695, "epoch": 0.18726517698240064, "mean_token_accuracy": 0.7440476417541504, "num_tokens": 42523762.0, "step": 947, "train/ce_loss": 3.7436006095958874e-05 }, { "epoch": 0.18726517698240064, "step": 947, "train/sim_loss": 0.0006382465362548828 }, { "epoch": 0.18726517698240064, "step": 947, "train/total_loss": 0.0006419901619665325 }, { "entropy": 5.901955604553223, "epoch": 0.18746292268143167, "mean_token_accuracy": 0.713087260723114, "num_tokens": 42569032.0, "step": 948, "train/ce_loss": 0.46022793650627136 }, { "epoch": 0.18746292268143167, "step": 948, "train/sim_loss": 0.00033092498779296875 }, { "epoch": 0.18746292268143167, "step": 948, "train/total_loss": 0.046353720128536224 }, { "entropy": 5.973097801208496, "epoch": 0.18766066838046272, "mean_token_accuracy": 0.6803455948829651, "num_tokens": 42621975.0, "step": 949, "train/ce_loss": 3.3210584660992026e-05 }, { "epoch": 0.18766066838046272, "step": 949, "train/sim_loss": 0.0006328821182250977 }, { "epoch": 0.18766066838046272, "step": 949, "train/total_loss": 0.0006362031563185155 }, { "entropy": 5.812294006347656, "epoch": 0.18785841407949377, "mean_token_accuracy": 0.7305737137794495, "num_tokens": 42663586.0, "step": 950, "train/ce_loss": 0.7329044342041016 }, { "epoch": 0.18785841407949377, "step": 950, "train/sim_loss": 0.00046372413635253906 }, { "epoch": 0.18785841407949377, "step": 950, "train/total_loss": 0.07375416904687881 }, { "entropy": 5.702945232391357, "epoch": 0.18805615977852483, "mean_token_accuracy": 0.7189542651176453, "num_tokens": 42700653.0, "step": 951, "train/ce_loss": 0.9103171229362488 }, { "epoch": 0.18805615977852483, "step": 951, "train/sim_loss": 0.00042831897735595703 }, { "epoch": 0.18805615977852483, "step": 951, "train/total_loss": 0.09146003425121307 }, { "entropy": 5.746790409088135, "epoch": 0.18825390547755586, "mean_token_accuracy": 0.7622571587562561, "num_tokens": 42736537.0, "step": 952, "train/ce_loss": 1.1315609216690063 }, { "epoch": 0.18825390547755586, "step": 952, "train/sim_loss": 0.0016847848892211914 }, { "epoch": 0.18825390547755586, "step": 952, "train/total_loss": 0.11484088003635406 }, { "entropy": 5.878786087036133, "epoch": 0.1884516511765869, "mean_token_accuracy": 0.694968581199646, "num_tokens": 42774836.0, "step": 953, "train/ce_loss": 1.6037302017211914 }, { "epoch": 0.1884516511765869, "step": 953, "train/sim_loss": 0.000614166259765625 }, { "epoch": 0.1884516511765869, "step": 953, "train/total_loss": 0.16098718345165253 }, { "entropy": 6.253015518188477, "epoch": 0.18864939687561796, "mean_token_accuracy": 0.7459227442741394, "num_tokens": 42809709.0, "step": 954, "train/ce_loss": 1.0775833129882812 }, { "epoch": 0.18864939687561796, "step": 954, "train/sim_loss": 0.00042873620986938477 }, { "epoch": 0.18864939687561796, "step": 954, "train/total_loss": 0.10818707197904587 }, { "entropy": 5.793221473693848, "epoch": 0.188847142574649, "mean_token_accuracy": 0.71161949634552, "num_tokens": 42852253.0, "step": 955, "train/ce_loss": 0.974926769733429 }, { "epoch": 0.188847142574649, "step": 955, "train/sim_loss": 0.00044524669647216797 }, { "epoch": 0.188847142574649, "step": 955, "train/total_loss": 0.0979379266500473 }, { "entropy": 5.761362075805664, "epoch": 0.18904488827368005, "mean_token_accuracy": 0.7494866251945496, "num_tokens": 42894440.0, "step": 956, "train/ce_loss": 1.092433214187622 }, { "epoch": 0.18904488827368005, "step": 956, "train/sim_loss": 0.0003101825714111328 }, { "epoch": 0.18904488827368005, "step": 956, "train/total_loss": 0.1095535084605217 }, { "entropy": 5.910808563232422, "epoch": 0.1892426339727111, "mean_token_accuracy": 0.737300455570221, "num_tokens": 42950563.0, "step": 957, "train/ce_loss": 0.7922112345695496 }, { "epoch": 0.1892426339727111, "step": 957, "train/sim_loss": 0.0005276203155517578 }, { "epoch": 0.1892426339727111, "step": 957, "train/total_loss": 0.0797487422823906 }, { "entropy": 5.843334674835205, "epoch": 0.18944037967174213, "mean_token_accuracy": 0.7237299680709839, "num_tokens": 42984260.0, "step": 958, "train/ce_loss": 0.8279839754104614 }, { "epoch": 0.18944037967174213, "step": 958, "train/sim_loss": 0.00034606456756591797 }, { "epoch": 0.18944037967174213, "step": 958, "train/total_loss": 0.08314446359872818 }, { "entropy": 6.091283798217773, "epoch": 0.18963812537077318, "mean_token_accuracy": 0.7645705342292786, "num_tokens": 43021219.0, "step": 959, "train/ce_loss": 2.6053259716718458e-05 }, { "epoch": 0.18963812537077318, "step": 959, "train/sim_loss": 0.0005582571029663086 }, { "epoch": 0.18963812537077318, "step": 959, "train/total_loss": 0.0005608624196611345 }, { "epoch": 0.18983587106980424, "grad_norm": 0.4637255370616913, "learning_rate": 9.530616282520527e-06, "loss": 0.0864, "step": 960 }, { "entropy": 6.1225433349609375, "epoch": 0.18983587106980424, "mean_token_accuracy": 0.6696012616157532, "num_tokens": 43063437.0, "step": 960, "train/ce_loss": 0.8244902491569519 }, { "epoch": 0.18983587106980424, "step": 960, "train/sim_loss": 0.0007408857345581055 }, { "epoch": 0.18983587106980424, "step": 960, "train/total_loss": 0.08318991214036942 }, { "entropy": 6.150982856750488, "epoch": 0.1900336167688353, "mean_token_accuracy": 0.7101727724075317, "num_tokens": 43091025.0, "step": 961, "train/ce_loss": 3.078684312640689e-05 }, { "epoch": 0.1900336167688353, "step": 961, "train/sim_loss": 0.00028133392333984375 }, { "epoch": 0.1900336167688353, "step": 961, "train/total_loss": 0.0002844126138370484 }, { "entropy": 6.3805975914001465, "epoch": 0.19023136246786632, "mean_token_accuracy": 0.7265685796737671, "num_tokens": 43130799.0, "step": 962, "train/ce_loss": 2.4413666324107908e-05 }, { "epoch": 0.19023136246786632, "step": 962, "train/sim_loss": 0.0006200075149536133 }, { "epoch": 0.19023136246786632, "step": 962, "train/total_loss": 0.0006224488606676459 }, { "entropy": 6.2934980392456055, "epoch": 0.19042910816689737, "mean_token_accuracy": 0.6909430623054504, "num_tokens": 43174355.0, "step": 963, "train/ce_loss": 1.042553424835205 }, { "epoch": 0.19042910816689737, "step": 963, "train/sim_loss": 0.0004419684410095215 }, { "epoch": 0.19042910816689737, "step": 963, "train/total_loss": 0.10469730943441391 }, { "entropy": 5.686549186706543, "epoch": 0.19062685386592843, "mean_token_accuracy": 0.7647776007652283, "num_tokens": 43210406.0, "step": 964, "train/ce_loss": 0.9205357432365417 }, { "epoch": 0.19062685386592843, "step": 964, "train/sim_loss": 0.0006437897682189941 }, { "epoch": 0.19062685386592843, "step": 964, "train/total_loss": 0.09269736707210541 }, { "entropy": 6.159442901611328, "epoch": 0.19082459956495945, "mean_token_accuracy": 0.7344236969947815, "num_tokens": 43253581.0, "step": 965, "train/ce_loss": 1.036731243133545 }, { "epoch": 0.19082459956495945, "step": 965, "train/sim_loss": 0.0006538629531860352 }, { "epoch": 0.19082459956495945, "step": 965, "train/total_loss": 0.10432698577642441 }, { "entropy": 5.937551498413086, "epoch": 0.1910223452639905, "mean_token_accuracy": 0.7308294177055359, "num_tokens": 43301883.0, "step": 966, "train/ce_loss": 1.479828119277954 }, { "epoch": 0.1910223452639905, "step": 966, "train/sim_loss": 0.0007500648498535156 }, { "epoch": 0.1910223452639905, "step": 966, "train/total_loss": 0.14873288571834564 }, { "entropy": 6.236640930175781, "epoch": 0.19122009096302156, "mean_token_accuracy": 0.7266915440559387, "num_tokens": 43353260.0, "step": 967, "train/ce_loss": 0.9945668578147888 }, { "epoch": 0.19122009096302156, "step": 967, "train/sim_loss": 0.00044476985931396484 }, { "epoch": 0.19122009096302156, "step": 967, "train/total_loss": 0.0999014601111412 }, { "entropy": 6.038466453552246, "epoch": 0.1914178366620526, "mean_token_accuracy": 0.7235257029533386, "num_tokens": 43394881.0, "step": 968, "train/ce_loss": 0.537767231464386 }, { "epoch": 0.1914178366620526, "step": 968, "train/sim_loss": 0.0003763437271118164 }, { "epoch": 0.1914178366620526, "step": 968, "train/total_loss": 0.054153066128492355 }, { "entropy": 6.228448867797852, "epoch": 0.19161558236108364, "mean_token_accuracy": 0.7155230045318604, "num_tokens": 43453382.0, "step": 969, "train/ce_loss": 1.0318231582641602 }, { "epoch": 0.19161558236108364, "step": 969, "train/sim_loss": 0.0004165768623352051 }, { "epoch": 0.19161558236108364, "step": 969, "train/total_loss": 0.10359889268875122 }, { "entropy": 6.058676719665527, "epoch": 0.1918133280601147, "mean_token_accuracy": 0.7243356108665466, "num_tokens": 43507588.0, "step": 970, "train/ce_loss": 1.0591105222702026 }, { "epoch": 0.1918133280601147, "step": 970, "train/sim_loss": 0.00043392181396484375 }, { "epoch": 0.1918133280601147, "step": 970, "train/total_loss": 0.10634497553110123 }, { "entropy": 5.992038726806641, "epoch": 0.19201107375914575, "mean_token_accuracy": 0.731663703918457, "num_tokens": 43539363.0, "step": 971, "train/ce_loss": 0.424455463886261 }, { "epoch": 0.19201107375914575, "step": 971, "train/sim_loss": 0.0005370378494262695 }, { "epoch": 0.19201107375914575, "step": 971, "train/total_loss": 0.04298258572816849 }, { "entropy": 5.816704750061035, "epoch": 0.19220881945817678, "mean_token_accuracy": 0.7436636090278625, "num_tokens": 43586625.0, "step": 972, "train/ce_loss": 0.9371786713600159 }, { "epoch": 0.19220881945817678, "step": 972, "train/sim_loss": 0.0004233121871948242 }, { "epoch": 0.19220881945817678, "step": 972, "train/total_loss": 0.09414117783308029 }, { "entropy": 5.7081618309021, "epoch": 0.19240656515720783, "mean_token_accuracy": 0.7347368597984314, "num_tokens": 43624667.0, "step": 973, "train/ce_loss": 0.933086097240448 }, { "epoch": 0.19240656515720783, "step": 973, "train/sim_loss": 0.00046443939208984375 }, { "epoch": 0.19240656515720783, "step": 973, "train/total_loss": 0.09377305209636688 }, { "entropy": 6.036599159240723, "epoch": 0.1926043108562389, "mean_token_accuracy": 0.7520184516906738, "num_tokens": 43679739.0, "step": 974, "train/ce_loss": 1.1406548023223877 }, { "epoch": 0.1926043108562389, "step": 974, "train/sim_loss": 0.0003896951675415039 }, { "epoch": 0.1926043108562389, "step": 974, "train/total_loss": 0.11445517838001251 }, { "entropy": 5.857789516448975, "epoch": 0.1928020565552699, "mean_token_accuracy": 0.7631925940513611, "num_tokens": 43714148.0, "step": 975, "train/ce_loss": 0.3526795208454132 }, { "epoch": 0.1928020565552699, "step": 975, "train/sim_loss": 0.000546574592590332 }, { "epoch": 0.1928020565552699, "step": 975, "train/total_loss": 0.03581452742218971 }, { "entropy": 5.9729814529418945, "epoch": 0.19299980225430097, "mean_token_accuracy": 0.7710437774658203, "num_tokens": 43761642.0, "step": 976, "train/ce_loss": 0.2963966429233551 }, { "epoch": 0.19299980225430097, "step": 976, "train/sim_loss": 0.0004448890686035156 }, { "epoch": 0.19299980225430097, "step": 976, "train/total_loss": 0.030084554105997086 }, { "entropy": 6.26218318939209, "epoch": 0.19319754795333202, "mean_token_accuracy": 0.7142857313156128, "num_tokens": 43795596.0, "step": 977, "train/ce_loss": 0.975942850112915 }, { "epoch": 0.19319754795333202, "step": 977, "train/sim_loss": 0.00048041343688964844 }, { "epoch": 0.19319754795333202, "step": 977, "train/total_loss": 0.09807469695806503 }, { "entropy": 5.736576080322266, "epoch": 0.19339529365236305, "mean_token_accuracy": 0.764642059803009, "num_tokens": 43844778.0, "step": 978, "train/ce_loss": 0.502390444278717 }, { "epoch": 0.19339529365236305, "step": 978, "train/sim_loss": 0.000293731689453125 }, { "epoch": 0.19339529365236305, "step": 978, "train/total_loss": 0.05053277686238289 }, { "entropy": 6.562127113342285, "epoch": 0.1935930393513941, "mean_token_accuracy": 0.7076433300971985, "num_tokens": 43890016.0, "step": 979, "train/ce_loss": 0.7542132139205933 }, { "epoch": 0.1935930393513941, "step": 979, "train/sim_loss": 0.00029659271240234375 }, { "epoch": 0.1935930393513941, "step": 979, "train/total_loss": 0.07571791857481003 }, { "epoch": 0.19379078505042516, "grad_norm": 0.4861571788787842, "learning_rate": 9.52072410723118e-06, "loss": 0.0883, "step": 980 }, { "entropy": 5.915984153747559, "epoch": 0.19379078505042516, "mean_token_accuracy": 0.7669073939323425, "num_tokens": 43934504.0, "step": 980, "train/ce_loss": 1.0402363538742065 }, { "epoch": 0.19379078505042516, "step": 980, "train/sim_loss": 0.00035750865936279297 }, { "epoch": 0.19379078505042516, "step": 980, "train/total_loss": 0.10438114404678345 }, { "entropy": 5.624630928039551, "epoch": 0.1939885307494562, "mean_token_accuracy": 0.7816677093505859, "num_tokens": 43967964.0, "step": 981, "train/ce_loss": 0.6465548276901245 }, { "epoch": 0.1939885307494562, "step": 981, "train/sim_loss": 0.0005551576614379883 }, { "epoch": 0.1939885307494562, "step": 981, "train/total_loss": 0.06521064043045044 }, { "entropy": 6.444609642028809, "epoch": 0.19418627644848724, "mean_token_accuracy": 0.7502034306526184, "num_tokens": 44023574.0, "step": 982, "train/ce_loss": 0.8994408845901489 }, { "epoch": 0.19418627644848724, "step": 982, "train/sim_loss": 0.0003396272659301758 }, { "epoch": 0.19418627644848724, "step": 982, "train/total_loss": 0.09028371423482895 }, { "entropy": 6.4282684326171875, "epoch": 0.1943840221475183, "mean_token_accuracy": 0.7362401485443115, "num_tokens": 44072766.0, "step": 983, "train/ce_loss": 1.5299649238586426 }, { "epoch": 0.1943840221475183, "step": 983, "train/sim_loss": 0.00043570995330810547 }, { "epoch": 0.1943840221475183, "step": 983, "train/total_loss": 0.1534322053194046 }, { "entropy": 6.114212989807129, "epoch": 0.19458176784654935, "mean_token_accuracy": 0.7150101661682129, "num_tokens": 44121382.0, "step": 984, "train/ce_loss": 3.91886496799998e-05 }, { "epoch": 0.19458176784654935, "step": 984, "train/sim_loss": 0.00029015541076660156 }, { "epoch": 0.19458176784654935, "step": 984, "train/total_loss": 0.0002940742706414312 }, { "entropy": 6.259116172790527, "epoch": 0.19477951354558037, "mean_token_accuracy": 0.7310038805007935, "num_tokens": 44168881.0, "step": 985, "train/ce_loss": 0.4740426242351532 }, { "epoch": 0.19477951354558037, "step": 985, "train/sim_loss": 0.000514984130859375 }, { "epoch": 0.19477951354558037, "step": 985, "train/total_loss": 0.047919247299432755 }, { "entropy": 6.098090171813965, "epoch": 0.19497725924461143, "mean_token_accuracy": 0.7200854420661926, "num_tokens": 44222018.0, "step": 986, "train/ce_loss": 0.558929979801178 }, { "epoch": 0.19497725924461143, "step": 986, "train/sim_loss": 0.0004582405090332031 }, { "epoch": 0.19497725924461143, "step": 986, "train/total_loss": 0.05635124072432518 }, { "entropy": 6.347962379455566, "epoch": 0.19517500494364248, "mean_token_accuracy": 0.7223042845726013, "num_tokens": 44264903.0, "step": 987, "train/ce_loss": 1.169192910194397 }, { "epoch": 0.19517500494364248, "step": 987, "train/sim_loss": 0.0005123615264892578 }, { "epoch": 0.19517500494364248, "step": 987, "train/total_loss": 0.1174316555261612 }, { "entropy": 6.156818866729736, "epoch": 0.1953727506426735, "mean_token_accuracy": 0.7587230801582336, "num_tokens": 44303201.0, "step": 988, "train/ce_loss": 1.013159990310669 }, { "epoch": 0.1953727506426735, "step": 988, "train/sim_loss": 0.0006837844848632812 }, { "epoch": 0.1953727506426735, "step": 988, "train/total_loss": 0.10199978202581406 }, { "entropy": 6.041546821594238, "epoch": 0.19557049634170456, "mean_token_accuracy": 0.7581903338432312, "num_tokens": 44341341.0, "step": 989, "train/ce_loss": 1.8265820741653442 }, { "epoch": 0.19557049634170456, "step": 989, "train/sim_loss": 0.0006162524223327637 }, { "epoch": 0.19557049634170456, "step": 989, "train/total_loss": 0.18327446281909943 }, { "entropy": 6.501068592071533, "epoch": 0.19576824204073562, "mean_token_accuracy": 0.6789883375167847, "num_tokens": 44384632.0, "step": 990, "train/ce_loss": 0.7801655530929565 }, { "epoch": 0.19576824204073562, "step": 990, "train/sim_loss": 0.0006493926048278809 }, { "epoch": 0.19576824204073562, "step": 990, "train/total_loss": 0.07866594940423965 }, { "entropy": 6.125299453735352, "epoch": 0.19596598773976667, "mean_token_accuracy": 0.7168923020362854, "num_tokens": 44423511.0, "step": 991, "train/ce_loss": 1.8085107803344727 }, { "epoch": 0.19596598773976667, "step": 991, "train/sim_loss": 0.0003319978713989258 }, { "epoch": 0.19596598773976667, "step": 991, "train/total_loss": 0.1811830848455429 }, { "entropy": 5.905462265014648, "epoch": 0.1961637334387977, "mean_token_accuracy": 0.7977527976036072, "num_tokens": 44467536.0, "step": 992, "train/ce_loss": 0.7122392654418945 }, { "epoch": 0.1961637334387977, "step": 992, "train/sim_loss": 0.00031173229217529297 }, { "epoch": 0.1961637334387977, "step": 992, "train/total_loss": 0.07153566181659698 }, { "entropy": 5.875614166259766, "epoch": 0.19636147913782875, "mean_token_accuracy": 0.753778338432312, "num_tokens": 44496034.0, "step": 993, "train/ce_loss": 0.5827322006225586 }, { "epoch": 0.19636147913782875, "step": 993, "train/sim_loss": 0.00041294097900390625 }, { "epoch": 0.19636147913782875, "step": 993, "train/total_loss": 0.058686163276433945 }, { "entropy": 6.0705885887146, "epoch": 0.1965592248368598, "mean_token_accuracy": 0.7874464988708496, "num_tokens": 44534593.0, "step": 994, "train/ce_loss": 0.5598700642585754 }, { "epoch": 0.1965592248368598, "step": 994, "train/sim_loss": 0.00028395652770996094 }, { "epoch": 0.1965592248368598, "step": 994, "train/total_loss": 0.056270964443683624 }, { "entropy": 6.102170944213867, "epoch": 0.19675697053589083, "mean_token_accuracy": 0.7650537490844727, "num_tokens": 44569457.0, "step": 995, "train/ce_loss": 0.9188746213912964 }, { "epoch": 0.19675697053589083, "step": 995, "train/sim_loss": 0.0005621910095214844 }, { "epoch": 0.19675697053589083, "step": 995, "train/total_loss": 0.09244965761899948 }, { "entropy": 6.1860575675964355, "epoch": 0.1969547162349219, "mean_token_accuracy": 0.752996027469635, "num_tokens": 44600732.0, "step": 996, "train/ce_loss": 0.3550959825515747 }, { "epoch": 0.1969547162349219, "step": 996, "train/sim_loss": 0.0002982616424560547 }, { "epoch": 0.1969547162349219, "step": 996, "train/total_loss": 0.035807859152555466 }, { "entropy": 6.31489372253418, "epoch": 0.19715246193395294, "mean_token_accuracy": 0.6981026530265808, "num_tokens": 44660336.0, "step": 997, "train/ce_loss": 0.8951104283332825 }, { "epoch": 0.19715246193395294, "step": 997, "train/sim_loss": 0.0005097389221191406 }, { "epoch": 0.19715246193395294, "step": 997, "train/total_loss": 0.09002078324556351 }, { "entropy": 6.250098705291748, "epoch": 0.19735020763298397, "mean_token_accuracy": 0.7416209578514099, "num_tokens": 44704588.0, "step": 998, "train/ce_loss": 1.0169382095336914 }, { "epoch": 0.19735020763298397, "step": 998, "train/sim_loss": 0.00047385692596435547 }, { "epoch": 0.19735020763298397, "step": 998, "train/total_loss": 0.10216768085956573 }, { "entropy": 6.217045783996582, "epoch": 0.19754795333201502, "mean_token_accuracy": 0.7171216011047363, "num_tokens": 44748232.0, "step": 999, "train/ce_loss": 1.1147325038909912 }, { "epoch": 0.19754795333201502, "step": 999, "train/sim_loss": 0.00041878223419189453 }, { "epoch": 0.19754795333201502, "step": 999, "train/total_loss": 0.11189203709363937 }, { "epoch": 0.19774569903104608, "grad_norm": 0.4421977698802948, "learning_rate": 9.510831931941836e-06, "loss": 0.0824, "step": 1000 }, { "entropy": 5.983539581298828, "epoch": 0.19774569903104608, "mean_token_accuracy": 0.7685050964355469, "num_tokens": 44797869.0, "step": 1000, "train/ce_loss": 0.868344783782959 }, { "epoch": 0.19774569903104608, "step": 1000, "train/sim_loss": 0.000499725341796875 }, { "epoch": 0.19774569903104608, "step": 1000, "train/total_loss": 0.08733420819044113 }, { "entropy": 6.473170280456543, "epoch": 0.19794344473007713, "mean_token_accuracy": 0.7495145797729492, "num_tokens": 44840900.0, "step": 1001, "train/ce_loss": 0.9148979783058167 }, { "epoch": 0.19794344473007713, "step": 1001, "train/sim_loss": 0.0004048943519592285 }, { "epoch": 0.19794344473007713, "step": 1001, "train/total_loss": 0.09189469367265701 }, { "entropy": 6.082024574279785, "epoch": 0.19814119042910816, "mean_token_accuracy": 0.7754642367362976, "num_tokens": 44882105.0, "step": 1002, "train/ce_loss": 0.8419674038887024 }, { "epoch": 0.19814119042910816, "step": 1002, "train/sim_loss": 0.0005565881729125977 }, { "epoch": 0.19814119042910816, "step": 1002, "train/total_loss": 0.08475332707166672 }, { "entropy": 5.93507719039917, "epoch": 0.19833893612813921, "mean_token_accuracy": 0.767175555229187, "num_tokens": 44910846.0, "step": 1003, "train/ce_loss": 0.945040225982666 }, { "epoch": 0.19833893612813921, "step": 1003, "train/sim_loss": 0.0006377696990966797 }, { "epoch": 0.19833893612813921, "step": 1003, "train/total_loss": 0.09514179080724716 }, { "entropy": 6.194492816925049, "epoch": 0.19853668182717027, "mean_token_accuracy": 0.7555898427963257, "num_tokens": 44940080.0, "step": 1004, "train/ce_loss": 2.4737013518461026e-05 }, { "epoch": 0.19853668182717027, "step": 1004, "train/sim_loss": 0.00028192996978759766 }, { "epoch": 0.19853668182717027, "step": 1004, "train/total_loss": 0.0002844036789610982 }, { "entropy": 5.352424621582031, "epoch": 0.1987344275262013, "mean_token_accuracy": 0.7651880383491516, "num_tokens": 44965508.0, "step": 1005, "train/ce_loss": 0.6431272625923157 }, { "epoch": 0.1987344275262013, "step": 1005, "train/sim_loss": 0.00032401084899902344 }, { "epoch": 0.1987344275262013, "step": 1005, "train/total_loss": 0.06463673710823059 }, { "entropy": 6.029069900512695, "epoch": 0.19893217322523235, "mean_token_accuracy": 0.7058353424072266, "num_tokens": 45015537.0, "step": 1006, "train/ce_loss": 1.2153663635253906 }, { "epoch": 0.19893217322523235, "step": 1006, "train/sim_loss": 0.00036156177520751953 }, { "epoch": 0.19893217322523235, "step": 1006, "train/total_loss": 0.12189819663763046 }, { "entropy": 6.2679643630981445, "epoch": 0.1991299189242634, "mean_token_accuracy": 0.729138195514679, "num_tokens": 45063324.0, "step": 1007, "train/ce_loss": 1.5730035305023193 }, { "epoch": 0.1991299189242634, "step": 1007, "train/sim_loss": 0.0002652406692504883 }, { "epoch": 0.1991299189242634, "step": 1007, "train/total_loss": 0.15756559371948242 }, { "entropy": 5.701669216156006, "epoch": 0.19932766462329443, "mean_token_accuracy": 0.7797356843948364, "num_tokens": 45086846.0, "step": 1008, "train/ce_loss": 2.001578286581207e-05 }, { "epoch": 0.19932766462329443, "step": 1008, "train/sim_loss": 0.00032395124435424805 }, { "epoch": 0.19932766462329443, "step": 1008, "train/total_loss": 0.00032595283119007945 }, { "entropy": 5.897169589996338, "epoch": 0.19952541032232549, "mean_token_accuracy": 0.7540983557701111, "num_tokens": 45137736.0, "step": 1009, "train/ce_loss": 4.534307663561776e-05 }, { "epoch": 0.19952541032232549, "step": 1009, "train/sim_loss": 0.00032508373260498047 }, { "epoch": 0.19952541032232549, "step": 1009, "train/total_loss": 0.00032961805118247867 }, { "entropy": 5.710657119750977, "epoch": 0.19972315602135654, "mean_token_accuracy": 0.7623604536056519, "num_tokens": 45180659.0, "step": 1010, "train/ce_loss": 0.750190258026123 }, { "epoch": 0.19972315602135654, "step": 1010, "train/sim_loss": 0.0004521608352661133 }, { "epoch": 0.19972315602135654, "step": 1010, "train/total_loss": 0.0754711851477623 }, { "entropy": 5.656459808349609, "epoch": 0.1999209017203876, "mean_token_accuracy": 0.7815079689025879, "num_tokens": 45220020.0, "step": 1011, "train/ce_loss": 0.4199903607368469 }, { "epoch": 0.1999209017203876, "step": 1011, "train/sim_loss": 0.0002822279930114746 }, { "epoch": 0.1999209017203876, "step": 1011, "train/total_loss": 0.042281266301870346 }, { "entropy": 5.809752464294434, "epoch": 0.20011864741941862, "mean_token_accuracy": 0.7469955682754517, "num_tokens": 45250088.0, "step": 1012, "train/ce_loss": 0.8475485444068909 }, { "epoch": 0.20011864741941862, "step": 1012, "train/sim_loss": 0.00033915042877197266 }, { "epoch": 0.20011864741941862, "step": 1012, "train/total_loss": 0.08509400486946106 }, { "entropy": 6.288045883178711, "epoch": 0.20031639311844968, "mean_token_accuracy": 0.7477724552154541, "num_tokens": 45295721.0, "step": 1013, "train/ce_loss": 2.2431147954193875e-05 }, { "epoch": 0.20031639311844968, "step": 1013, "train/sim_loss": 0.0002568364143371582 }, { "epoch": 0.20031639311844968, "step": 1013, "train/total_loss": 0.00025907953386195004 }, { "entropy": 5.943202018737793, "epoch": 0.20051413881748073, "mean_token_accuracy": 0.7153419852256775, "num_tokens": 45358433.0, "step": 1014, "train/ce_loss": 0.9144335985183716 }, { "epoch": 0.20051413881748073, "step": 1014, "train/sim_loss": 0.00038683414459228516 }, { "epoch": 0.20051413881748073, "step": 1014, "train/total_loss": 0.09183019399642944 }, { "entropy": 5.749565124511719, "epoch": 0.20071188451651176, "mean_token_accuracy": 0.7291842103004456, "num_tokens": 45394821.0, "step": 1015, "train/ce_loss": 0.5818427801132202 }, { "epoch": 0.20071188451651176, "step": 1015, "train/sim_loss": 0.00025212764739990234 }, { "epoch": 0.20071188451651176, "step": 1015, "train/total_loss": 0.058436404913663864 }, { "entropy": 6.25107479095459, "epoch": 0.2009096302155428, "mean_token_accuracy": 0.7398160099983215, "num_tokens": 45439039.0, "step": 1016, "train/ce_loss": 0.6534475684165955 }, { "epoch": 0.2009096302155428, "step": 1016, "train/sim_loss": 0.0005404353141784668 }, { "epoch": 0.2009096302155428, "step": 1016, "train/total_loss": 0.06588519364595413 }, { "entropy": 6.056707859039307, "epoch": 0.20110737591457387, "mean_token_accuracy": 0.7296969890594482, "num_tokens": 45487666.0, "step": 1017, "train/ce_loss": 1.9487282037734985 }, { "epoch": 0.20110737591457387, "step": 1017, "train/sim_loss": 0.0003439188003540039 }, { "epoch": 0.20110737591457387, "step": 1017, "train/total_loss": 0.19521674513816833 }, { "entropy": 6.0989789962768555, "epoch": 0.2013051216136049, "mean_token_accuracy": 0.7081829309463501, "num_tokens": 45542907.0, "step": 1018, "train/ce_loss": 0.789156436920166 }, { "epoch": 0.2013051216136049, "step": 1018, "train/sim_loss": 0.0003185272216796875 }, { "epoch": 0.2013051216136049, "step": 1018, "train/total_loss": 0.07923417538404465 }, { "entropy": 6.158151626586914, "epoch": 0.20150286731263595, "mean_token_accuracy": 0.7469168901443481, "num_tokens": 45597569.0, "step": 1019, "train/ce_loss": 0.9469456672668457 }, { "epoch": 0.20150286731263595, "step": 1019, "train/sim_loss": 0.0008144378662109375 }, { "epoch": 0.20150286731263595, "step": 1019, "train/total_loss": 0.09550900757312775 }, { "epoch": 0.201700613011667, "grad_norm": 0.4267827570438385, "learning_rate": 9.500939756652488e-06, "loss": 0.08, "step": 1020 }, { "entropy": 6.254552841186523, "epoch": 0.201700613011667, "mean_token_accuracy": 0.712950587272644, "num_tokens": 45647756.0, "step": 1020, "train/ce_loss": 0.9021135568618774 }, { "epoch": 0.201700613011667, "step": 1020, "train/sim_loss": 0.00037044286727905273 }, { "epoch": 0.201700613011667, "step": 1020, "train/total_loss": 0.09058179706335068 }, { "entropy": 6.139871597290039, "epoch": 0.20189835871069806, "mean_token_accuracy": 0.7238723635673523, "num_tokens": 45674270.0, "step": 1021, "train/ce_loss": 1.0148656368255615 }, { "epoch": 0.20189835871069806, "step": 1021, "train/sim_loss": 0.0005040168762207031 }, { "epoch": 0.20189835871069806, "step": 1021, "train/total_loss": 0.10199058055877686 }, { "entropy": 5.938094139099121, "epoch": 0.20209610440972908, "mean_token_accuracy": 0.748516321182251, "num_tokens": 45720535.0, "step": 1022, "train/ce_loss": 0.8035510778427124 }, { "epoch": 0.20209610440972908, "step": 1022, "train/sim_loss": 0.00033652782440185547 }, { "epoch": 0.20209610440972908, "step": 1022, "train/total_loss": 0.0806916356086731 }, { "entropy": 5.951905250549316, "epoch": 0.20229385010876014, "mean_token_accuracy": 0.7182823419570923, "num_tokens": 45759033.0, "step": 1023, "train/ce_loss": 0.8089202642440796 }, { "epoch": 0.20229385010876014, "step": 1023, "train/sim_loss": 0.00039565563201904297 }, { "epoch": 0.20229385010876014, "step": 1023, "train/total_loss": 0.081287682056427 }, { "entropy": 6.196842193603516, "epoch": 0.2024915958077912, "mean_token_accuracy": 0.7570815682411194, "num_tokens": 45794287.0, "step": 1024, "train/ce_loss": 0.6589667201042175 }, { "epoch": 0.2024915958077912, "step": 1024, "train/sim_loss": 0.000494837760925293 }, { "epoch": 0.2024915958077912, "step": 1024, "train/total_loss": 0.06639151275157928 }, { "entropy": 5.9435811042785645, "epoch": 0.20268934150682222, "mean_token_accuracy": 0.7467602491378784, "num_tokens": 45836095.0, "step": 1025, "train/ce_loss": 0.8063828945159912 }, { "epoch": 0.20268934150682222, "step": 1025, "train/sim_loss": 0.00034236907958984375 }, { "epoch": 0.20268934150682222, "step": 1025, "train/total_loss": 0.08098065853118896 }, { "entropy": 6.362430572509766, "epoch": 0.20288708720585327, "mean_token_accuracy": 0.7242285013198853, "num_tokens": 45887530.0, "step": 1026, "train/ce_loss": 1.2805064916610718 }, { "epoch": 0.20288708720585327, "step": 1026, "train/sim_loss": 0.000517427921295166 }, { "epoch": 0.20288708720585327, "step": 1026, "train/total_loss": 0.12856808304786682 }, { "entropy": 5.960529327392578, "epoch": 0.20308483290488433, "mean_token_accuracy": 0.7313769459724426, "num_tokens": 45925546.0, "step": 1027, "train/ce_loss": 0.7811924815177917 }, { "epoch": 0.20308483290488433, "step": 1027, "train/sim_loss": 0.000525355339050293 }, { "epoch": 0.20308483290488433, "step": 1027, "train/total_loss": 0.07864460349082947 }, { "entropy": 6.218160629272461, "epoch": 0.20328257860391535, "mean_token_accuracy": 0.7322275042533875, "num_tokens": 45971015.0, "step": 1028, "train/ce_loss": 1.655188798904419 }, { "epoch": 0.20328257860391535, "step": 1028, "train/sim_loss": 0.000348508358001709 }, { "epoch": 0.20328257860391535, "step": 1028, "train/total_loss": 0.1658673882484436 }, { "entropy": 6.478936195373535, "epoch": 0.2034803243029464, "mean_token_accuracy": 0.7397034764289856, "num_tokens": 46022235.0, "step": 1029, "train/ce_loss": 0.4891010522842407 }, { "epoch": 0.2034803243029464, "step": 1029, "train/sim_loss": 0.00026303529739379883 }, { "epoch": 0.2034803243029464, "step": 1029, "train/total_loss": 0.04917314276099205 }, { "entropy": 5.648528099060059, "epoch": 0.20367807000197746, "mean_token_accuracy": 0.7862686514854431, "num_tokens": 46049143.0, "step": 1030, "train/ce_loss": 0.48738327622413635 }, { "epoch": 0.20367807000197746, "step": 1030, "train/sim_loss": 0.0002626180648803711 }, { "epoch": 0.20367807000197746, "step": 1030, "train/total_loss": 0.04900094494223595 }, { "entropy": 5.806298732757568, "epoch": 0.2038758157010085, "mean_token_accuracy": 0.7353219985961914, "num_tokens": 46083088.0, "step": 1031, "train/ce_loss": 0.8828989267349243 }, { "epoch": 0.2038758157010085, "step": 1031, "train/sim_loss": 0.0008012056350708008 }, { "epoch": 0.2038758157010085, "step": 1031, "train/total_loss": 0.08909109979867935 }, { "entropy": 6.024219512939453, "epoch": 0.20407356140003954, "mean_token_accuracy": 0.7304737567901611, "num_tokens": 46148110.0, "step": 1032, "train/ce_loss": 1.1280580759048462 }, { "epoch": 0.20407356140003954, "step": 1032, "train/sim_loss": 0.0003752708435058594 }, { "epoch": 0.20407356140003954, "step": 1032, "train/total_loss": 0.11318107694387436 }, { "entropy": 5.9187469482421875, "epoch": 0.2042713070990706, "mean_token_accuracy": 0.7297464609146118, "num_tokens": 46186857.0, "step": 1033, "train/ce_loss": 1.3814356327056885 }, { "epoch": 0.2042713070990706, "step": 1033, "train/sim_loss": 0.0002452731132507324 }, { "epoch": 0.2042713070990706, "step": 1033, "train/total_loss": 0.13838884234428406 }, { "entropy": 6.145115852355957, "epoch": 0.20446905279810165, "mean_token_accuracy": 0.7368420958518982, "num_tokens": 46226418.0, "step": 1034, "train/ce_loss": 0.8486335873603821 }, { "epoch": 0.20446905279810165, "step": 1034, "train/sim_loss": 0.00033664703369140625 }, { "epoch": 0.20446905279810165, "step": 1034, "train/total_loss": 0.0852000042796135 }, { "entropy": 5.760893821716309, "epoch": 0.20466679849713268, "mean_token_accuracy": 0.7280949950218201, "num_tokens": 46274343.0, "step": 1035, "train/ce_loss": 0.7585851550102234 }, { "epoch": 0.20466679849713268, "step": 1035, "train/sim_loss": 0.0004571676254272461 }, { "epoch": 0.20466679849713268, "step": 1035, "train/total_loss": 0.07631568610668182 }, { "entropy": 6.210639953613281, "epoch": 0.20486454419616373, "mean_token_accuracy": 0.7331057786941528, "num_tokens": 46322535.0, "step": 1036, "train/ce_loss": 0.5566270351409912 }, { "epoch": 0.20486454419616373, "step": 1036, "train/sim_loss": 0.00042116641998291016 }, { "epoch": 0.20486454419616373, "step": 1036, "train/total_loss": 0.05608386918902397 }, { "entropy": 5.908239364624023, "epoch": 0.2050622898951948, "mean_token_accuracy": 0.7379912734031677, "num_tokens": 46357671.0, "step": 1037, "train/ce_loss": 2.112256333930418e-05 }, { "epoch": 0.2050622898951948, "step": 1037, "train/sim_loss": 0.0002582073211669922 }, { "epoch": 0.2050622898951948, "step": 1037, "train/total_loss": 0.00026031958987005055 }, { "entropy": 5.545375823974609, "epoch": 0.20526003559422581, "mean_token_accuracy": 0.7534050345420837, "num_tokens": 46398677.0, "step": 1038, "train/ce_loss": 0.7306080460548401 }, { "epoch": 0.20526003559422581, "step": 1038, "train/sim_loss": 0.00046640634536743164 }, { "epoch": 0.20526003559422581, "step": 1038, "train/total_loss": 0.07352720946073532 }, { "entropy": 5.926036357879639, "epoch": 0.20545778129325687, "mean_token_accuracy": 0.7252221703529358, "num_tokens": 46447986.0, "step": 1039, "train/ce_loss": 0.966347873210907 }, { "epoch": 0.20545778129325687, "step": 1039, "train/sim_loss": 0.00047475099563598633 }, { "epoch": 0.20545778129325687, "step": 1039, "train/total_loss": 0.09710954129695892 }, { "epoch": 0.20565552699228792, "grad_norm": 0.4881480634212494, "learning_rate": 9.491047581363143e-06, "loss": 0.0855, "step": 1040 }, { "entropy": 6.023126602172852, "epoch": 0.20565552699228792, "mean_token_accuracy": 0.769753098487854, "num_tokens": 46492228.0, "step": 1040, "train/ce_loss": 1.0231924057006836 }, { "epoch": 0.20565552699228792, "step": 1040, "train/sim_loss": 0.00034809112548828125 }, { "epoch": 0.20565552699228792, "step": 1040, "train/total_loss": 0.10266733169555664 }, { "entropy": 6.148863315582275, "epoch": 0.20585327269131895, "mean_token_accuracy": 0.7279661297798157, "num_tokens": 46537054.0, "step": 1041, "train/ce_loss": 2.5932765007019043 }, { "epoch": 0.20585327269131895, "step": 1041, "train/sim_loss": 0.0004494786262512207 }, { "epoch": 0.20585327269131895, "step": 1041, "train/total_loss": 0.25977712869644165 }, { "entropy": 5.614251136779785, "epoch": 0.20605101839035, "mean_token_accuracy": 0.749868631362915, "num_tokens": 46581891.0, "step": 1042, "train/ce_loss": 0.6356918811798096 }, { "epoch": 0.20605101839035, "step": 1042, "train/sim_loss": 0.00039207935333251953 }, { "epoch": 0.20605101839035, "step": 1042, "train/total_loss": 0.06396126747131348 }, { "entropy": 5.8701605796813965, "epoch": 0.20624876408938106, "mean_token_accuracy": 0.7747184038162231, "num_tokens": 46627311.0, "step": 1043, "train/ce_loss": 0.9279744625091553 }, { "epoch": 0.20624876408938106, "step": 1043, "train/sim_loss": 0.0005053281784057617 }, { "epoch": 0.20624876408938106, "step": 1043, "train/total_loss": 0.09330277889966965 }, { "entropy": 6.162171363830566, "epoch": 0.2064465097884121, "mean_token_accuracy": 0.6770588159561157, "num_tokens": 46687258.0, "step": 1044, "train/ce_loss": 1.0166577100753784 }, { "epoch": 0.2064465097884121, "step": 1044, "train/sim_loss": 0.0002677440643310547 }, { "epoch": 0.2064465097884121, "step": 1044, "train/total_loss": 0.10193351656198502 }, { "entropy": 6.061986923217773, "epoch": 0.20664425548744314, "mean_token_accuracy": 0.8037225008010864, "num_tokens": 46732342.0, "step": 1045, "train/ce_loss": 2.2333411834551953e-05 }, { "epoch": 0.20664425548744314, "step": 1045, "train/sim_loss": 0.00038498640060424805 }, { "epoch": 0.20664425548744314, "step": 1045, "train/total_loss": 0.0003872197412420064 }, { "entropy": 6.113218307495117, "epoch": 0.2068420011864742, "mean_token_accuracy": 0.7439500093460083, "num_tokens": 46773690.0, "step": 1046, "train/ce_loss": 1.4594131708145142 }, { "epoch": 0.2068420011864742, "step": 1046, "train/sim_loss": 0.0004006624221801758 }, { "epoch": 0.2068420011864742, "step": 1046, "train/total_loss": 0.1463419795036316 }, { "entropy": 5.872550010681152, "epoch": 0.20703974688550525, "mean_token_accuracy": 0.7553191781044006, "num_tokens": 46818128.0, "step": 1047, "train/ce_loss": 0.970711886882782 }, { "epoch": 0.20703974688550525, "step": 1047, "train/sim_loss": 0.00043010711669921875 }, { "epoch": 0.20703974688550525, "step": 1047, "train/total_loss": 0.09750130027532578 }, { "entropy": 6.362067699432373, "epoch": 0.20723749258453628, "mean_token_accuracy": 0.7579948306083679, "num_tokens": 46849058.0, "step": 1048, "train/ce_loss": 0.9864766597747803 }, { "epoch": 0.20723749258453628, "step": 1048, "train/sim_loss": 0.0003693699836730957 }, { "epoch": 0.20723749258453628, "step": 1048, "train/total_loss": 0.09901703894138336 }, { "entropy": 5.6553635597229, "epoch": 0.20743523828356733, "mean_token_accuracy": 0.8005974888801575, "num_tokens": 46882324.0, "step": 1049, "train/ce_loss": 0.8187140226364136 }, { "epoch": 0.20743523828356733, "step": 1049, "train/sim_loss": 0.00047898292541503906 }, { "epoch": 0.20743523828356733, "step": 1049, "train/total_loss": 0.08235038816928864 }, { "entropy": 5.514379024505615, "epoch": 0.20763298398259838, "mean_token_accuracy": 0.7468112111091614, "num_tokens": 46922201.0, "step": 1050, "train/ce_loss": 1.7246036804863252e-05 }, { "epoch": 0.20763298398259838, "step": 1050, "train/sim_loss": 0.0003171563148498535 }, { "epoch": 0.20763298398259838, "step": 1050, "train/total_loss": 0.0003188809205312282 }, { "entropy": 6.495547294616699, "epoch": 0.2078307296816294, "mean_token_accuracy": 0.7523809671401978, "num_tokens": 46960936.0, "step": 1051, "train/ce_loss": 0.936090886592865 }, { "epoch": 0.2078307296816294, "step": 1051, "train/sim_loss": 0.0006090402603149414 }, { "epoch": 0.2078307296816294, "step": 1051, "train/total_loss": 0.09421812742948532 }, { "entropy": 6.3015642166137695, "epoch": 0.20802847538066047, "mean_token_accuracy": 0.7240411639213562, "num_tokens": 47007771.0, "step": 1052, "train/ce_loss": 2.8225518690305762e-05 }, { "epoch": 0.20802847538066047, "step": 1052, "train/sim_loss": 0.00032383203506469727 }, { "epoch": 0.20802847538066047, "step": 1052, "train/total_loss": 0.0003266545827500522 }, { "entropy": 6.009166240692139, "epoch": 0.20822622107969152, "mean_token_accuracy": 0.6869356036186218, "num_tokens": 47058374.0, "step": 1053, "train/ce_loss": 2.2697598934173584 }, { "epoch": 0.20822622107969152, "step": 1053, "train/sim_loss": 0.0003638267517089844 }, { "epoch": 0.20822622107969152, "step": 1053, "train/total_loss": 0.22733981907367706 }, { "entropy": 6.188570022583008, "epoch": 0.20842396677872257, "mean_token_accuracy": 0.7249047160148621, "num_tokens": 47096021.0, "step": 1054, "train/ce_loss": 1.217147707939148 }, { "epoch": 0.20842396677872257, "step": 1054, "train/sim_loss": 0.0003101825714111328 }, { "epoch": 0.20842396677872257, "step": 1054, "train/total_loss": 0.12202495336532593 }, { "entropy": 5.949759483337402, "epoch": 0.2086217124777536, "mean_token_accuracy": 0.7622950673103333, "num_tokens": 47141552.0, "step": 1055, "train/ce_loss": 2.2958187400945462e-05 }, { "epoch": 0.2086217124777536, "step": 1055, "train/sim_loss": 0.00040799379348754883 }, { "epoch": 0.2086217124777536, "step": 1055, "train/total_loss": 0.0004102896200492978 }, { "entropy": 5.893932819366455, "epoch": 0.20881945817678466, "mean_token_accuracy": 0.7661035060882568, "num_tokens": 47186997.0, "step": 1056, "train/ce_loss": 0.7557312846183777 }, { "epoch": 0.20881945817678466, "step": 1056, "train/sim_loss": 0.0004686117172241211 }, { "epoch": 0.20881945817678466, "step": 1056, "train/total_loss": 0.07604174315929413 }, { "entropy": 6.092228889465332, "epoch": 0.2090172038758157, "mean_token_accuracy": 0.7453027367591858, "num_tokens": 47239740.0, "step": 1057, "train/ce_loss": 0.6433855891227722 }, { "epoch": 0.2090172038758157, "step": 1057, "train/sim_loss": 0.00038182735443115234 }, { "epoch": 0.2090172038758157, "step": 1057, "train/total_loss": 0.06472038477659225 }, { "entropy": 5.7330522537231445, "epoch": 0.20921494957484674, "mean_token_accuracy": 0.7211635112762451, "num_tokens": 47277028.0, "step": 1058, "train/ce_loss": 2.579156898718793e-05 }, { "epoch": 0.20921494957484674, "step": 1058, "train/sim_loss": 0.0003523826599121094 }, { "epoch": 0.20921494957484674, "step": 1058, "train/total_loss": 0.00035496181226335466 }, { "entropy": 5.921323776245117, "epoch": 0.2094126952738778, "mean_token_accuracy": 0.7399193644523621, "num_tokens": 47321723.0, "step": 1059, "train/ce_loss": 0.9779353141784668 }, { "epoch": 0.2094126952738778, "step": 1059, "train/sim_loss": 0.0004584193229675293 }, { "epoch": 0.2094126952738778, "step": 1059, "train/total_loss": 0.09825195372104645 }, { "epoch": 0.20961044097290885, "grad_norm": 0.4362231492996216, "learning_rate": 9.481155406073797e-06, "loss": 0.0817, "step": 1060 }, { "entropy": 6.361217021942139, "epoch": 0.20961044097290885, "mean_token_accuracy": 0.7070707082748413, "num_tokens": 47375966.0, "step": 1060, "train/ce_loss": 2.0788625988643616e-05 }, { "epoch": 0.20961044097290885, "step": 1060, "train/sim_loss": 0.0002797842025756836 }, { "epoch": 0.20961044097290885, "step": 1060, "train/total_loss": 0.0002818630600813776 }, { "entropy": 5.987520217895508, "epoch": 0.20980818667193987, "mean_token_accuracy": 0.7455782294273376, "num_tokens": 47420920.0, "step": 1061, "train/ce_loss": 1.9867504835128784 }, { "epoch": 0.20980818667193987, "step": 1061, "train/sim_loss": 0.00044739246368408203 }, { "epoch": 0.20980818667193987, "step": 1061, "train/total_loss": 0.19912244379520416 }, { "entropy": 6.144518852233887, "epoch": 0.21000593237097093, "mean_token_accuracy": 0.7474424839019775, "num_tokens": 47473736.0, "step": 1062, "train/ce_loss": 0.800940215587616 }, { "epoch": 0.21000593237097093, "step": 1062, "train/sim_loss": 0.0003224015235900879 }, { "epoch": 0.21000593237097093, "step": 1062, "train/total_loss": 0.08041642606258392 }, { "entropy": 6.216766357421875, "epoch": 0.21020367807000198, "mean_token_accuracy": 0.7048710584640503, "num_tokens": 47510650.0, "step": 1063, "train/ce_loss": 1.0543867349624634 }, { "epoch": 0.21020367807000198, "step": 1063, "train/sim_loss": 0.0003001689910888672 }, { "epoch": 0.21020367807000198, "step": 1063, "train/total_loss": 0.10573884099721909 }, { "entropy": 6.141896724700928, "epoch": 0.21040142376903304, "mean_token_accuracy": 0.725153386592865, "num_tokens": 47551701.0, "step": 1064, "train/ce_loss": 1.0344234704971313 }, { "epoch": 0.21040142376903304, "step": 1064, "train/sim_loss": 0.0007352828979492188 }, { "epoch": 0.21040142376903304, "step": 1064, "train/total_loss": 0.10417763143777847 }, { "entropy": 6.547467231750488, "epoch": 0.21059916946806406, "mean_token_accuracy": 0.7368804812431335, "num_tokens": 47589965.0, "step": 1065, "train/ce_loss": 1.0359593629837036 }, { "epoch": 0.21059916946806406, "step": 1065, "train/sim_loss": 0.00036466121673583984 }, { "epoch": 0.21059916946806406, "step": 1065, "train/total_loss": 0.10396059602499008 }, { "entropy": 6.000206470489502, "epoch": 0.21079691516709512, "mean_token_accuracy": 0.6977300047874451, "num_tokens": 47639064.0, "step": 1066, "train/ce_loss": 0.864838719367981 }, { "epoch": 0.21079691516709512, "step": 1066, "train/sim_loss": 0.0004286766052246094 }, { "epoch": 0.21079691516709512, "step": 1066, "train/total_loss": 0.08691255003213882 }, { "entropy": 5.898134231567383, "epoch": 0.21099466086612617, "mean_token_accuracy": 0.768216073513031, "num_tokens": 47683300.0, "step": 1067, "train/ce_loss": 0.7967759966850281 }, { "epoch": 0.21099466086612617, "step": 1067, "train/sim_loss": 0.00023025274276733398 }, { "epoch": 0.21099466086612617, "step": 1067, "train/total_loss": 0.0799078568816185 }, { "entropy": 6.141915321350098, "epoch": 0.2111924065651572, "mean_token_accuracy": 0.7678207755088806, "num_tokens": 47724471.0, "step": 1068, "train/ce_loss": 0.7146511673927307 }, { "epoch": 0.2111924065651572, "step": 1068, "train/sim_loss": 0.0003947019577026367 }, { "epoch": 0.2111924065651572, "step": 1068, "train/total_loss": 0.07185982167720795 }, { "entropy": 6.328557968139648, "epoch": 0.21139015226418825, "mean_token_accuracy": 0.6702127456665039, "num_tokens": 47764411.0, "step": 1069, "train/ce_loss": 1.1589215993881226 }, { "epoch": 0.21139015226418825, "step": 1069, "train/sim_loss": 0.0004633665084838867 }, { "epoch": 0.21139015226418825, "step": 1069, "train/total_loss": 0.1163555309176445 }, { "entropy": 6.191598892211914, "epoch": 0.2115878979632193, "mean_token_accuracy": 0.7064803242683411, "num_tokens": 47802940.0, "step": 1070, "train/ce_loss": 3.3804026315920055e-05 }, { "epoch": 0.2115878979632193, "step": 1070, "train/sim_loss": 0.0004284381866455078 }, { "epoch": 0.2115878979632193, "step": 1070, "train/total_loss": 0.00043181859655305743 }, { "entropy": 5.94265604019165, "epoch": 0.21178564366225033, "mean_token_accuracy": 0.7563884258270264, "num_tokens": 47837160.0, "step": 1071, "train/ce_loss": 0.854489266872406 }, { "epoch": 0.21178564366225033, "step": 1071, "train/sim_loss": 0.00047153234481811523 }, { "epoch": 0.21178564366225033, "step": 1071, "train/total_loss": 0.08592046052217484 }, { "entropy": 6.52748441696167, "epoch": 0.2119833893612814, "mean_token_accuracy": 0.7448405027389526, "num_tokens": 47897340.0, "step": 1072, "train/ce_loss": 0.9125421047210693 }, { "epoch": 0.2119833893612814, "step": 1072, "train/sim_loss": 0.0003542900085449219 }, { "epoch": 0.2119833893612814, "step": 1072, "train/total_loss": 0.09160850197076797 }, { "entropy": 5.829516887664795, "epoch": 0.21218113506031244, "mean_token_accuracy": 0.7530413866043091, "num_tokens": 47933357.0, "step": 1073, "train/ce_loss": 0.5278152823448181 }, { "epoch": 0.21218113506031244, "step": 1073, "train/sim_loss": 0.00024175643920898438 }, { "epoch": 0.21218113506031244, "step": 1073, "train/total_loss": 0.053023286163806915 }, { "entropy": 6.018360137939453, "epoch": 0.2123788807593435, "mean_token_accuracy": 0.7241586446762085, "num_tokens": 47966699.0, "step": 1074, "train/ce_loss": 4.1228624468203634e-05 }, { "epoch": 0.2123788807593435, "step": 1074, "train/sim_loss": 0.00033396482467651367 }, { "epoch": 0.2123788807593435, "step": 1074, "train/total_loss": 0.00033808767329901457 }, { "entropy": 6.172513961791992, "epoch": 0.21257662645837452, "mean_token_accuracy": 0.7122460007667542, "num_tokens": 48017028.0, "step": 1075, "train/ce_loss": 0.7399613857269287 }, { "epoch": 0.21257662645837452, "step": 1075, "train/sim_loss": 0.00048154592514038086 }, { "epoch": 0.21257662645837452, "step": 1075, "train/total_loss": 0.07447768747806549 }, { "entropy": 6.224673271179199, "epoch": 0.21277437215740558, "mean_token_accuracy": 0.746666669845581, "num_tokens": 48048013.0, "step": 1076, "train/ce_loss": 1.072978138923645 }, { "epoch": 0.21277437215740558, "step": 1076, "train/sim_loss": 0.00041162967681884766 }, { "epoch": 0.21277437215740558, "step": 1076, "train/total_loss": 0.10770944505929947 }, { "entropy": 6.047102928161621, "epoch": 0.21297211785643663, "mean_token_accuracy": 0.7266880869865417, "num_tokens": 48098669.0, "step": 1077, "train/ce_loss": 0.9643751978874207 }, { "epoch": 0.21297211785643663, "step": 1077, "train/sim_loss": 0.0004208087921142578 }, { "epoch": 0.21297211785643663, "step": 1077, "train/total_loss": 0.09685833007097244 }, { "entropy": 5.925528526306152, "epoch": 0.21316986355546766, "mean_token_accuracy": 0.7236136198043823, "num_tokens": 48148634.0, "step": 1078, "train/ce_loss": 1.1011146306991577 }, { "epoch": 0.21316986355546766, "step": 1078, "train/sim_loss": 0.0002818107604980469 }, { "epoch": 0.21316986355546766, "step": 1078, "train/total_loss": 0.11039327830076218 }, { "entropy": 6.198968887329102, "epoch": 0.2133676092544987, "mean_token_accuracy": 0.7535321712493896, "num_tokens": 48192152.0, "step": 1079, "train/ce_loss": 1.0184736251831055 }, { "epoch": 0.2133676092544987, "step": 1079, "train/sim_loss": 0.0002613663673400879 }, { "epoch": 0.2133676092544987, "step": 1079, "train/total_loss": 0.10210873186588287 }, { "epoch": 0.21356535495352977, "grad_norm": 0.47805699706077576, "learning_rate": 9.47126323078445e-06, "loss": 0.0874, "step": 1080 }, { "entropy": 6.117816925048828, "epoch": 0.21356535495352977, "mean_token_accuracy": 0.6847360730171204, "num_tokens": 48242905.0, "step": 1080, "train/ce_loss": 1.3381807804107666 }, { "epoch": 0.21356535495352977, "step": 1080, "train/sim_loss": 0.00040334463119506836 }, { "epoch": 0.21356535495352977, "step": 1080, "train/total_loss": 0.1342214196920395 }, { "entropy": 5.966245651245117, "epoch": 0.2137631006525608, "mean_token_accuracy": 0.7915676832199097, "num_tokens": 48305084.0, "step": 1081, "train/ce_loss": 0.33169975876808167 }, { "epoch": 0.2137631006525608, "step": 1081, "train/sim_loss": 0.0005632638931274414 }, { "epoch": 0.2137631006525608, "step": 1081, "train/total_loss": 0.03373324126005173 }, { "entropy": 5.901087760925293, "epoch": 0.21396084635159185, "mean_token_accuracy": 0.7416452169418335, "num_tokens": 48336836.0, "step": 1082, "train/ce_loss": 1.3340651988983154 }, { "epoch": 0.21396084635159185, "step": 1082, "train/sim_loss": 0.00047767162322998047 }, { "epoch": 0.21396084635159185, "step": 1082, "train/total_loss": 0.13388419151306152 }, { "entropy": 5.723097801208496, "epoch": 0.2141585920506229, "mean_token_accuracy": 0.7662050127983093, "num_tokens": 48368758.0, "step": 1083, "train/ce_loss": 0.8831948637962341 }, { "epoch": 0.2141585920506229, "step": 1083, "train/sim_loss": 0.00033658742904663086 }, { "epoch": 0.2141585920506229, "step": 1083, "train/total_loss": 0.08865607529878616 }, { "entropy": 5.8788676261901855, "epoch": 0.21435633774965396, "mean_token_accuracy": 0.6975269317626953, "num_tokens": 48408151.0, "step": 1084, "train/ce_loss": 1.4550950527191162 }, { "epoch": 0.21435633774965396, "step": 1084, "train/sim_loss": 0.0002719759941101074 }, { "epoch": 0.21435633774965396, "step": 1084, "train/total_loss": 0.1457814872264862 }, { "entropy": 5.564487457275391, "epoch": 0.21455408344868498, "mean_token_accuracy": 0.7836147546768188, "num_tokens": 48436768.0, "step": 1085, "train/ce_loss": 0.6476079821586609 }, { "epoch": 0.21455408344868498, "step": 1085, "train/sim_loss": 0.0002187490463256836 }, { "epoch": 0.21455408344868498, "step": 1085, "train/total_loss": 0.06497954577207565 }, { "entropy": 6.023898124694824, "epoch": 0.21475182914771604, "mean_token_accuracy": 0.7396199703216553, "num_tokens": 48486309.0, "step": 1086, "train/ce_loss": 0.8605237007141113 }, { "epoch": 0.21475182914771604, "step": 1086, "train/sim_loss": 0.0002884864807128906 }, { "epoch": 0.21475182914771604, "step": 1086, "train/total_loss": 0.08634085953235626 }, { "entropy": 6.157839775085449, "epoch": 0.2149495748467471, "mean_token_accuracy": 0.7018181681632996, "num_tokens": 48538481.0, "step": 1087, "train/ce_loss": 2.4668728656251915e-05 }, { "epoch": 0.2149495748467471, "step": 1087, "train/sim_loss": 0.00046259164810180664 }, { "epoch": 0.2149495748467471, "step": 1087, "train/total_loss": 0.00046505851787514985 }, { "entropy": 6.021604537963867, "epoch": 0.21514732054577812, "mean_token_accuracy": 0.7290448546409607, "num_tokens": 48591738.0, "step": 1088, "train/ce_loss": 4.139711018069647e-05 }, { "epoch": 0.21514732054577812, "step": 1088, "train/sim_loss": 0.0002580881118774414 }, { "epoch": 0.21514732054577812, "step": 1088, "train/total_loss": 0.00026222781161777675 }, { "entropy": 5.861176013946533, "epoch": 0.21534506624480917, "mean_token_accuracy": 0.7482941746711731, "num_tokens": 48635752.0, "step": 1089, "train/ce_loss": 3.0291121220216155e-05 }, { "epoch": 0.21534506624480917, "step": 1089, "train/sim_loss": 0.00024890899658203125 }, { "epoch": 0.21534506624480917, "step": 1089, "train/total_loss": 0.0002519381232559681 }, { "entropy": 6.439871788024902, "epoch": 0.21554281194384023, "mean_token_accuracy": 0.711033284664154, "num_tokens": 48671909.0, "step": 1090, "train/ce_loss": 2.199871301651001 }, { "epoch": 0.21554281194384023, "step": 1090, "train/sim_loss": 0.0003153681755065918 }, { "epoch": 0.21554281194384023, "step": 1090, "train/total_loss": 0.2203025072813034 }, { "entropy": 5.312363147735596, "epoch": 0.21574055764287126, "mean_token_accuracy": 0.7704828977584839, "num_tokens": 48711758.0, "step": 1091, "train/ce_loss": 0.8372965455055237 }, { "epoch": 0.21574055764287126, "step": 1091, "train/sim_loss": 0.0005120038986206055 }, { "epoch": 0.21574055764287126, "step": 1091, "train/total_loss": 0.08424165844917297 }, { "entropy": 5.776908874511719, "epoch": 0.2159383033419023, "mean_token_accuracy": 0.7255146503448486, "num_tokens": 48767689.0, "step": 1092, "train/ce_loss": 1.610575054655783e-05 }, { "epoch": 0.2159383033419023, "step": 1092, "train/sim_loss": 0.00032722949981689453 }, { "epoch": 0.2159383033419023, "step": 1092, "train/total_loss": 0.0003288400766905397 }, { "entropy": 6.073269367218018, "epoch": 0.21613604904093336, "mean_token_accuracy": 0.7429227232933044, "num_tokens": 48806321.0, "step": 1093, "train/ce_loss": 0.8174327611923218 }, { "epoch": 0.21613604904093336, "step": 1093, "train/sim_loss": 0.0006530284881591797 }, { "epoch": 0.21613604904093336, "step": 1093, "train/total_loss": 0.08239630609750748 }, { "entropy": 6.328851222991943, "epoch": 0.21633379473996442, "mean_token_accuracy": 0.7359798550605774, "num_tokens": 48846624.0, "step": 1094, "train/ce_loss": 1.0080665349960327 }, { "epoch": 0.21633379473996442, "step": 1094, "train/sim_loss": 0.0002511739730834961 }, { "epoch": 0.21633379473996442, "step": 1094, "train/total_loss": 0.10105782747268677 }, { "entropy": 5.857417583465576, "epoch": 0.21653154043899545, "mean_token_accuracy": 0.7667322754859924, "num_tokens": 48880030.0, "step": 1095, "train/ce_loss": 0.6346655488014221 }, { "epoch": 0.21653154043899545, "step": 1095, "train/sim_loss": 0.0002499818801879883 }, { "epoch": 0.21653154043899545, "step": 1095, "train/total_loss": 0.06371653825044632 }, { "entropy": 5.770508766174316, "epoch": 0.2167292861380265, "mean_token_accuracy": 0.7358943819999695, "num_tokens": 48929026.0, "step": 1096, "train/ce_loss": 1.5513008832931519 }, { "epoch": 0.2167292861380265, "step": 1096, "train/sim_loss": 0.0003472566604614258 }, { "epoch": 0.2167292861380265, "step": 1096, "train/total_loss": 0.1554773449897766 }, { "entropy": 5.934349060058594, "epoch": 0.21692703183705755, "mean_token_accuracy": 0.7225760817527771, "num_tokens": 48978797.0, "step": 1097, "train/ce_loss": 0.8812382817268372 }, { "epoch": 0.21692703183705755, "step": 1097, "train/sim_loss": 0.0004302859306335449 }, { "epoch": 0.21692703183705755, "step": 1097, "train/total_loss": 0.08855411410331726 }, { "entropy": 5.890908241271973, "epoch": 0.21712477753608858, "mean_token_accuracy": 0.7196064591407776, "num_tokens": 49036856.0, "step": 1098, "train/ce_loss": 0.41337236762046814 }, { "epoch": 0.21712477753608858, "step": 1098, "train/sim_loss": 0.00036406517028808594 }, { "epoch": 0.21712477753608858, "step": 1098, "train/total_loss": 0.0417013019323349 }, { "entropy": 5.544255256652832, "epoch": 0.21732252323511964, "mean_token_accuracy": 0.7781862616539001, "num_tokens": 49092275.0, "step": 1099, "train/ce_loss": 0.6806511878967285 }, { "epoch": 0.21732252323511964, "step": 1099, "train/sim_loss": 0.00032639503479003906 }, { "epoch": 0.21732252323511964, "step": 1099, "train/total_loss": 0.06839151680469513 }, { "epoch": 0.2175202689341507, "grad_norm": 0.3805777132511139, "learning_rate": 9.461371055495105e-06, "loss": 0.0846, "step": 1100 }, { "entropy": 6.180237293243408, "epoch": 0.2175202689341507, "mean_token_accuracy": 0.7253121733665466, "num_tokens": 49141158.0, "step": 1100, "train/ce_loss": 0.9295773506164551 }, { "epoch": 0.2175202689341507, "step": 1100, "train/sim_loss": 0.00029456615447998047 }, { "epoch": 0.2175202689341507, "step": 1100, "train/total_loss": 0.09325230121612549 }, { "entropy": 5.854374885559082, "epoch": 0.21771801463318172, "mean_token_accuracy": 0.7408270239830017, "num_tokens": 49181902.0, "step": 1101, "train/ce_loss": 0.8353855609893799 }, { "epoch": 0.21771801463318172, "step": 1101, "train/sim_loss": 0.00027680397033691406 }, { "epoch": 0.21771801463318172, "step": 1101, "train/total_loss": 0.08381535857915878 }, { "entropy": 6.131408214569092, "epoch": 0.21791576033221277, "mean_token_accuracy": 0.6856250166893005, "num_tokens": 49221924.0, "step": 1102, "train/ce_loss": 0.6708939075469971 }, { "epoch": 0.21791576033221277, "step": 1102, "train/sim_loss": 0.0002741217613220215 }, { "epoch": 0.21791576033221277, "step": 1102, "train/total_loss": 0.06736351549625397 }, { "entropy": 5.519102573394775, "epoch": 0.21811350603124383, "mean_token_accuracy": 0.8155197501182556, "num_tokens": 49254194.0, "step": 1103, "train/ce_loss": 1.9617002180893905e-05 }, { "epoch": 0.21811350603124383, "step": 1103, "train/sim_loss": 0.00022721290588378906 }, { "epoch": 0.21811350603124383, "step": 1103, "train/total_loss": 0.0002291746059199795 }, { "entropy": 6.124795913696289, "epoch": 0.21831125173027488, "mean_token_accuracy": 0.7296457290649414, "num_tokens": 49306608.0, "step": 1104, "train/ce_loss": 0.9262959957122803 }, { "epoch": 0.21831125173027488, "step": 1104, "train/sim_loss": 0.0003998279571533203 }, { "epoch": 0.21831125173027488, "step": 1104, "train/total_loss": 0.0930294319987297 }, { "entropy": 5.57341194152832, "epoch": 0.2185089974293059, "mean_token_accuracy": 0.7545931935310364, "num_tokens": 49346170.0, "step": 1105, "train/ce_loss": 0.5100508332252502 }, { "epoch": 0.2185089974293059, "step": 1105, "train/sim_loss": 0.0005506277084350586 }, { "epoch": 0.2185089974293059, "step": 1105, "train/total_loss": 0.05155571177601814 }, { "entropy": 6.066858291625977, "epoch": 0.21870674312833696, "mean_token_accuracy": 0.7392815947532654, "num_tokens": 49396188.0, "step": 1106, "train/ce_loss": 2.0990960597991943 }, { "epoch": 0.21870674312833696, "step": 1106, "train/sim_loss": 0.00022077560424804688 }, { "epoch": 0.21870674312833696, "step": 1106, "train/total_loss": 0.21013037860393524 }, { "entropy": 5.929189682006836, "epoch": 0.21890448882736802, "mean_token_accuracy": 0.7740916013717651, "num_tokens": 49441130.0, "step": 1107, "train/ce_loss": 0.9507960677146912 }, { "epoch": 0.21890448882736802, "step": 1107, "train/sim_loss": 0.00036901235580444336 }, { "epoch": 0.21890448882736802, "step": 1107, "train/total_loss": 0.09544862061738968 }, { "entropy": 6.063515663146973, "epoch": 0.21910223452639904, "mean_token_accuracy": 0.710349440574646, "num_tokens": 49490841.0, "step": 1108, "train/ce_loss": 1.0607272386550903 }, { "epoch": 0.21910223452639904, "step": 1108, "train/sim_loss": 0.0005377531051635742 }, { "epoch": 0.21910223452639904, "step": 1108, "train/total_loss": 0.10661047697067261 }, { "entropy": 6.051987648010254, "epoch": 0.2192999802254301, "mean_token_accuracy": 0.7273308038711548, "num_tokens": 49556386.0, "step": 1109, "train/ce_loss": 1.8280409574508667 }, { "epoch": 0.2192999802254301, "step": 1109, "train/sim_loss": 0.0005906224250793457 }, { "epoch": 0.2192999802254301, "step": 1109, "train/total_loss": 0.18339471518993378 }, { "entropy": 6.02731990814209, "epoch": 0.21949772592446115, "mean_token_accuracy": 0.7232277393341064, "num_tokens": 49599518.0, "step": 1110, "train/ce_loss": 1.2449404001235962 }, { "epoch": 0.21949772592446115, "step": 1110, "train/sim_loss": 0.0003104209899902344 }, { "epoch": 0.21949772592446115, "step": 1110, "train/total_loss": 0.12480445951223373 }, { "entropy": 6.283064842224121, "epoch": 0.21969547162349218, "mean_token_accuracy": 0.7236180901527405, "num_tokens": 49639428.0, "step": 1111, "train/ce_loss": 0.8899973034858704 }, { "epoch": 0.21969547162349218, "step": 1111, "train/sim_loss": 0.0003790855407714844 }, { "epoch": 0.21969547162349218, "step": 1111, "train/total_loss": 0.08937881886959076 }, { "entropy": 5.891690254211426, "epoch": 0.21989321732252323, "mean_token_accuracy": 0.7308917045593262, "num_tokens": 49678079.0, "step": 1112, "train/ce_loss": 1.157725214958191 }, { "epoch": 0.21989321732252323, "step": 1112, "train/sim_loss": 0.0004885196685791016 }, { "epoch": 0.21989321732252323, "step": 1112, "train/total_loss": 0.11626104265451431 }, { "entropy": 6.180918216705322, "epoch": 0.2200909630215543, "mean_token_accuracy": 0.75, "num_tokens": 49721520.0, "step": 1113, "train/ce_loss": 0.53685063123703 }, { "epoch": 0.2200909630215543, "step": 1113, "train/sim_loss": 0.0005154609680175781 }, { "epoch": 0.2200909630215543, "step": 1113, "train/total_loss": 0.05420052632689476 }, { "entropy": 6.299198627471924, "epoch": 0.22028870872058534, "mean_token_accuracy": 0.7201550602912903, "num_tokens": 49768249.0, "step": 1114, "train/ce_loss": 0.9566022157669067 }, { "epoch": 0.22028870872058534, "step": 1114, "train/sim_loss": 0.00045043230056762695 }, { "epoch": 0.22028870872058534, "step": 1114, "train/total_loss": 0.09611065685749054 }, { "entropy": 5.756984710693359, "epoch": 0.22048645441961637, "mean_token_accuracy": 0.7552552819252014, "num_tokens": 49813755.0, "step": 1115, "train/ce_loss": 2.7999964004266076e-05 }, { "epoch": 0.22048645441961637, "step": 1115, "train/sim_loss": 0.00023472309112548828 }, { "epoch": 0.22048645441961637, "step": 1115, "train/total_loss": 0.00023752308334223926 }, { "entropy": 6.046344757080078, "epoch": 0.22068420011864742, "mean_token_accuracy": 0.7598299384117126, "num_tokens": 49850865.0, "step": 1116, "train/ce_loss": 0.6546515226364136 }, { "epoch": 0.22068420011864742, "step": 1116, "train/sim_loss": 0.0002541542053222656 }, { "epoch": 0.22068420011864742, "step": 1116, "train/total_loss": 0.06571930646896362 }, { "entropy": 5.86164665222168, "epoch": 0.22088194581767848, "mean_token_accuracy": 0.7590176463127136, "num_tokens": 49888519.0, "step": 1117, "train/ce_loss": 0.6624237895011902 }, { "epoch": 0.22088194581767848, "step": 1117, "train/sim_loss": 0.00025081634521484375 }, { "epoch": 0.22088194581767848, "step": 1117, "train/total_loss": 0.0664931982755661 }, { "entropy": 6.205061435699463, "epoch": 0.2210796915167095, "mean_token_accuracy": 0.7034529447555542, "num_tokens": 49933768.0, "step": 1118, "train/ce_loss": 0.9625222682952881 }, { "epoch": 0.2210796915167095, "step": 1118, "train/sim_loss": 0.00041544437408447266 }, { "epoch": 0.2210796915167095, "step": 1118, "train/total_loss": 0.09666766971349716 }, { "entropy": 5.844366550445557, "epoch": 0.22127743721574056, "mean_token_accuracy": 0.7193877696990967, "num_tokens": 49972136.0, "step": 1119, "train/ce_loss": 0.7475121021270752 }, { "epoch": 0.22127743721574056, "step": 1119, "train/sim_loss": 0.00045603513717651367 }, { "epoch": 0.22127743721574056, "step": 1119, "train/total_loss": 0.07520724833011627 }, { "epoch": 0.2214751829147716, "grad_norm": 0.43034714460372925, "learning_rate": 9.451478880205758e-06, "loss": 0.0845, "step": 1120 }, { "entropy": 5.978951930999756, "epoch": 0.2214751829147716, "mean_token_accuracy": 0.7349323630332947, "num_tokens": 50015863.0, "step": 1120, "train/ce_loss": 0.8477593064308167 }, { "epoch": 0.2214751829147716, "step": 1120, "train/sim_loss": 0.00034314393997192383 }, { "epoch": 0.2214751829147716, "step": 1120, "train/total_loss": 0.08511907607316971 }, { "entropy": 5.990959167480469, "epoch": 0.22167292861380264, "mean_token_accuracy": 0.7031063437461853, "num_tokens": 50052069.0, "step": 1121, "train/ce_loss": 0.6111167073249817 }, { "epoch": 0.22167292861380264, "step": 1121, "train/sim_loss": 0.00023603439331054688 }, { "epoch": 0.22167292861380264, "step": 1121, "train/total_loss": 0.061347704380750656 }, { "entropy": 5.753885269165039, "epoch": 0.2218706743128337, "mean_token_accuracy": 0.7893863916397095, "num_tokens": 50087070.0, "step": 1122, "train/ce_loss": 0.8053613901138306 }, { "epoch": 0.2218706743128337, "step": 1122, "train/sim_loss": 0.00027489662170410156 }, { "epoch": 0.2218706743128337, "step": 1122, "train/total_loss": 0.0808110386133194 }, { "entropy": 5.576076030731201, "epoch": 0.22206842001186475, "mean_token_accuracy": 0.7582911849021912, "num_tokens": 50132994.0, "step": 1123, "train/ce_loss": 0.7599890232086182 }, { "epoch": 0.22206842001186475, "step": 1123, "train/sim_loss": 0.0002847909927368164 }, { "epoch": 0.22206842001186475, "step": 1123, "train/total_loss": 0.07628369331359863 }, { "entropy": 5.976726055145264, "epoch": 0.22226616571089577, "mean_token_accuracy": 0.7264337539672852, "num_tokens": 50173132.0, "step": 1124, "train/ce_loss": 0.7527846693992615 }, { "epoch": 0.22226616571089577, "step": 1124, "train/sim_loss": 0.0004214644432067871 }, { "epoch": 0.22226616571089577, "step": 1124, "train/total_loss": 0.07569993287324905 }, { "entropy": 5.604635238647461, "epoch": 0.22246391140992683, "mean_token_accuracy": 0.7679955959320068, "num_tokens": 50220783.0, "step": 1125, "train/ce_loss": 1.1338902711868286 }, { "epoch": 0.22246391140992683, "step": 1125, "train/sim_loss": 0.00034618377685546875 }, { "epoch": 0.22246391140992683, "step": 1125, "train/total_loss": 0.11373521387577057 }, { "entropy": 6.043375492095947, "epoch": 0.22266165710895788, "mean_token_accuracy": 0.7134955525398254, "num_tokens": 50270948.0, "step": 1126, "train/ce_loss": 0.8533519506454468 }, { "epoch": 0.22266165710895788, "step": 1126, "train/sim_loss": 0.0002529621124267578 }, { "epoch": 0.22266165710895788, "step": 1126, "train/total_loss": 0.08558815717697144 }, { "entropy": 5.856296539306641, "epoch": 0.22285940280798894, "mean_token_accuracy": 0.741193413734436, "num_tokens": 50327768.0, "step": 1127, "train/ce_loss": 1.8555216229287907e-05 }, { "epoch": 0.22285940280798894, "step": 1127, "train/sim_loss": 0.00025969743728637695 }, { "epoch": 0.22285940280798894, "step": 1127, "train/total_loss": 0.000261552951997146 }, { "entropy": 6.305148124694824, "epoch": 0.22305714850701996, "mean_token_accuracy": 0.7064117193222046, "num_tokens": 50392287.0, "step": 1128, "train/ce_loss": 0.74908047914505 }, { "epoch": 0.22305714850701996, "step": 1128, "train/sim_loss": 0.00032657384872436523 }, { "epoch": 0.22305714850701996, "step": 1128, "train/total_loss": 0.07523462176322937 }, { "entropy": 5.390703201293945, "epoch": 0.22325489420605102, "mean_token_accuracy": 0.7614277005195618, "num_tokens": 50418499.0, "step": 1129, "train/ce_loss": 1.0476371049880981 }, { "epoch": 0.22325489420605102, "step": 1129, "train/sim_loss": 0.00033670663833618164 }, { "epoch": 0.22325489420605102, "step": 1129, "train/total_loss": 0.10510041564702988 }, { "entropy": 6.171955108642578, "epoch": 0.22345263990508207, "mean_token_accuracy": 0.7136563658714294, "num_tokens": 50478058.0, "step": 1130, "train/ce_loss": 0.9195505976676941 }, { "epoch": 0.22345263990508207, "step": 1130, "train/sim_loss": 0.00034880638122558594 }, { "epoch": 0.22345263990508207, "step": 1130, "train/total_loss": 0.09230386465787888 }, { "entropy": 5.784954071044922, "epoch": 0.2236503856041131, "mean_token_accuracy": 0.7764383554458618, "num_tokens": 50523510.0, "step": 1131, "train/ce_loss": 0.7119393944740295 }, { "epoch": 0.2236503856041131, "step": 1131, "train/sim_loss": 0.00022399425506591797 }, { "epoch": 0.2236503856041131, "step": 1131, "train/total_loss": 0.07141793519258499 }, { "entropy": 6.00910758972168, "epoch": 0.22384813130314415, "mean_token_accuracy": 0.7231734991073608, "num_tokens": 50585334.0, "step": 1132, "train/ce_loss": 0.6507935523986816 }, { "epoch": 0.22384813130314415, "step": 1132, "train/sim_loss": 0.0004887580871582031 }, { "epoch": 0.22384813130314415, "step": 1132, "train/total_loss": 0.06556811183691025 }, { "entropy": 6.012314796447754, "epoch": 0.2240458770021752, "mean_token_accuracy": 0.7092297077178955, "num_tokens": 50641268.0, "step": 1133, "train/ce_loss": 1.642403244972229 }, { "epoch": 0.2240458770021752, "step": 1133, "train/sim_loss": 0.0006501078605651855 }, { "epoch": 0.2240458770021752, "step": 1133, "train/total_loss": 0.16489043831825256 }, { "entropy": 5.498167037963867, "epoch": 0.22424362270120624, "mean_token_accuracy": 0.7669441103935242, "num_tokens": 50665793.0, "step": 1134, "train/ce_loss": 0.6159619092941284 }, { "epoch": 0.22424362270120624, "step": 1134, "train/sim_loss": 0.00021326541900634766 }, { "epoch": 0.22424362270120624, "step": 1134, "train/total_loss": 0.06180945783853531 }, { "entropy": 5.804330825805664, "epoch": 0.2244413684002373, "mean_token_accuracy": 0.7575083374977112, "num_tokens": 50711810.0, "step": 1135, "train/ce_loss": 3.7181620427872986e-05 }, { "epoch": 0.2244413684002373, "step": 1135, "train/sim_loss": 0.0003809332847595215 }, { "epoch": 0.2244413684002373, "step": 1135, "train/total_loss": 0.0003846514446195215 }, { "entropy": 5.502416610717773, "epoch": 0.22463911409926834, "mean_token_accuracy": 0.7616613507270813, "num_tokens": 50740506.0, "step": 1136, "train/ce_loss": 0.5819190144538879 }, { "epoch": 0.22463911409926834, "step": 1136, "train/sim_loss": 0.0002078413963317871 }, { "epoch": 0.22463911409926834, "step": 1136, "train/total_loss": 0.0583997443318367 }, { "entropy": 5.692489147186279, "epoch": 0.2248368597982994, "mean_token_accuracy": 0.7635829448699951, "num_tokens": 50774817.0, "step": 1137, "train/ce_loss": 0.5752754807472229 }, { "epoch": 0.2248368597982994, "step": 1137, "train/sim_loss": 0.0004502534866333008 }, { "epoch": 0.2248368597982994, "step": 1137, "train/total_loss": 0.05797780305147171 }, { "entropy": 5.999688148498535, "epoch": 0.22503460549733043, "mean_token_accuracy": 0.7308598160743713, "num_tokens": 50815563.0, "step": 1138, "train/ce_loss": 1.5985478162765503 }, { "epoch": 0.22503460549733043, "step": 1138, "train/sim_loss": 0.0006422996520996094 }, { "epoch": 0.22503460549733043, "step": 1138, "train/total_loss": 0.16049708425998688 }, { "entropy": 5.908845901489258, "epoch": 0.22523235119636148, "mean_token_accuracy": 0.745932400226593, "num_tokens": 50858458.0, "step": 1139, "train/ce_loss": 1.0876805782318115 }, { "epoch": 0.22523235119636148, "step": 1139, "train/sim_loss": 0.00029087066650390625 }, { "epoch": 0.22523235119636148, "step": 1139, "train/total_loss": 0.1090589314699173 }, { "epoch": 0.22543009689539253, "grad_norm": 0.4276716709136963, "learning_rate": 9.441586704916412e-06, "loss": 0.0826, "step": 1140 }, { "entropy": 6.012338638305664, "epoch": 0.22543009689539253, "mean_token_accuracy": 0.7453874349594116, "num_tokens": 50897636.0, "step": 1140, "train/ce_loss": 0.4048922657966614 }, { "epoch": 0.22543009689539253, "step": 1140, "train/sim_loss": 0.00022470951080322266 }, { "epoch": 0.22543009689539253, "step": 1140, "train/total_loss": 0.04071393609046936 }, { "entropy": 5.9301252365112305, "epoch": 0.22562784259442356, "mean_token_accuracy": 0.7413905262947083, "num_tokens": 50951866.0, "step": 1141, "train/ce_loss": 0.6463845372200012 }, { "epoch": 0.22562784259442356, "step": 1141, "train/sim_loss": 0.00027233362197875977 }, { "epoch": 0.22562784259442356, "step": 1141, "train/total_loss": 0.06491079181432724 }, { "entropy": 6.160374641418457, "epoch": 0.22582558829345462, "mean_token_accuracy": 0.6941081285476685, "num_tokens": 51004511.0, "step": 1142, "train/ce_loss": 2.5828856450971216e-05 }, { "epoch": 0.22582558829345462, "step": 1142, "train/sim_loss": 0.0002231597900390625 }, { "epoch": 0.22582558829345462, "step": 1142, "train/total_loss": 0.00022574268223252147 }, { "entropy": 6.083847522735596, "epoch": 0.22602333399248567, "mean_token_accuracy": 0.7028753757476807, "num_tokens": 51050640.0, "step": 1143, "train/ce_loss": 1.252753496170044 }, { "epoch": 0.22602333399248567, "step": 1143, "train/sim_loss": 0.00018513202667236328 }, { "epoch": 0.22602333399248567, "step": 1143, "train/total_loss": 0.12546049058437347 }, { "entropy": 5.968011856079102, "epoch": 0.2262210796915167, "mean_token_accuracy": 0.7299196720123291, "num_tokens": 51083545.0, "step": 1144, "train/ce_loss": 1.3787789344787598 }, { "epoch": 0.2262210796915167, "step": 1144, "train/sim_loss": 0.00043487548828125 }, { "epoch": 0.2262210796915167, "step": 1144, "train/total_loss": 0.13831277191638947 }, { "entropy": 5.666933059692383, "epoch": 0.22641882539054775, "mean_token_accuracy": 0.7435897588729858, "num_tokens": 51130098.0, "step": 1145, "train/ce_loss": 0.36126187443733215 }, { "epoch": 0.22641882539054775, "step": 1145, "train/sim_loss": 0.0006210803985595703 }, { "epoch": 0.22641882539054775, "step": 1145, "train/total_loss": 0.036747269332408905 }, { "entropy": 5.897841453552246, "epoch": 0.2266165710895788, "mean_token_accuracy": 0.6889692544937134, "num_tokens": 51163047.0, "step": 1146, "train/ce_loss": 1.846625804901123 }, { "epoch": 0.2266165710895788, "step": 1146, "train/sim_loss": 0.00033080577850341797 }, { "epoch": 0.2266165710895788, "step": 1146, "train/total_loss": 0.18499338626861572 }, { "entropy": 5.868244171142578, "epoch": 0.22681431678860986, "mean_token_accuracy": 0.7568470239639282, "num_tokens": 51208258.0, "step": 1147, "train/ce_loss": 1.0842819213867188 }, { "epoch": 0.22681431678860986, "step": 1147, "train/sim_loss": 0.00047338008880615234 }, { "epoch": 0.22681431678860986, "step": 1147, "train/total_loss": 0.10890157520771027 }, { "entropy": 5.933119773864746, "epoch": 0.2270120624876409, "mean_token_accuracy": 0.7512580752372742, "num_tokens": 51253681.0, "step": 1148, "train/ce_loss": 0.7384254932403564 }, { "epoch": 0.2270120624876409, "step": 1148, "train/sim_loss": 0.0005393624305725098 }, { "epoch": 0.2270120624876409, "step": 1148, "train/total_loss": 0.07438191026449203 }, { "entropy": 6.439493179321289, "epoch": 0.22720980818667194, "mean_token_accuracy": 0.7226229310035706, "num_tokens": 51297491.0, "step": 1149, "train/ce_loss": 1.6878731912584044e-05 }, { "epoch": 0.22720980818667194, "step": 1149, "train/sim_loss": 0.0003979802131652832 }, { "epoch": 0.22720980818667194, "step": 1149, "train/total_loss": 0.0003996680898126215 }, { "entropy": 6.060522079467773, "epoch": 0.227407553885703, "mean_token_accuracy": 0.7684594392776489, "num_tokens": 51335957.0, "step": 1150, "train/ce_loss": 0.6134547591209412 }, { "epoch": 0.227407553885703, "step": 1150, "train/sim_loss": 0.00031000375747680664 }, { "epoch": 0.227407553885703, "step": 1150, "train/total_loss": 0.06165548041462898 }, { "entropy": 5.723703384399414, "epoch": 0.22760529958473402, "mean_token_accuracy": 0.7689133286476135, "num_tokens": 51375681.0, "step": 1151, "train/ce_loss": 3.064799966523424e-05 }, { "epoch": 0.22760529958473402, "step": 1151, "train/sim_loss": 0.0004704594612121582 }, { "epoch": 0.22760529958473402, "step": 1151, "train/total_loss": 0.000473524269182235 }, { "entropy": 5.863536834716797, "epoch": 0.22780304528376508, "mean_token_accuracy": 0.7467309236526489, "num_tokens": 51423601.0, "step": 1152, "train/ce_loss": 1.0006659030914307 }, { "epoch": 0.22780304528376508, "step": 1152, "train/sim_loss": 0.00031942129135131836 }, { "epoch": 0.22780304528376508, "step": 1152, "train/total_loss": 0.10038601607084274 }, { "entropy": 5.872834205627441, "epoch": 0.22800079098279613, "mean_token_accuracy": 0.7408595085144043, "num_tokens": 51464645.0, "step": 1153, "train/ce_loss": 1.6340745787601918e-05 }, { "epoch": 0.22800079098279613, "step": 1153, "train/sim_loss": 0.0004011392593383789 }, { "epoch": 0.22800079098279613, "step": 1153, "train/total_loss": 0.0004027733230032027 }, { "entropy": 5.921611785888672, "epoch": 0.22819853668182716, "mean_token_accuracy": 0.7027027010917664, "num_tokens": 51497495.0, "step": 1154, "train/ce_loss": 0.8081992864608765 }, { "epoch": 0.22819853668182716, "step": 1154, "train/sim_loss": 0.0005131959915161133 }, { "epoch": 0.22819853668182716, "step": 1154, "train/total_loss": 0.08133312314748764 }, { "entropy": 5.793512344360352, "epoch": 0.2283962823808582, "mean_token_accuracy": 0.7209908962249756, "num_tokens": 51557308.0, "step": 1155, "train/ce_loss": 1.1303554773330688 }, { "epoch": 0.2283962823808582, "step": 1155, "train/sim_loss": 0.00044983625411987305 }, { "epoch": 0.2283962823808582, "step": 1155, "train/total_loss": 0.11348538845777512 }, { "entropy": 5.662315368652344, "epoch": 0.22859402807988927, "mean_token_accuracy": 0.7520541548728943, "num_tokens": 51605964.0, "step": 1156, "train/ce_loss": 0.8130104541778564 }, { "epoch": 0.22859402807988927, "step": 1156, "train/sim_loss": 0.0004443526268005371 }, { "epoch": 0.22859402807988927, "step": 1156, "train/total_loss": 0.08174540102481842 }, { "entropy": 5.669113636016846, "epoch": 0.22879177377892032, "mean_token_accuracy": 0.7622249126434326, "num_tokens": 51665473.0, "step": 1157, "train/ce_loss": 0.5843952298164368 }, { "epoch": 0.22879177377892032, "step": 1157, "train/sim_loss": 0.00039970874786376953 }, { "epoch": 0.22879177377892032, "step": 1157, "train/total_loss": 0.058839231729507446 }, { "entropy": 5.816442966461182, "epoch": 0.22898951947795135, "mean_token_accuracy": 0.7468085289001465, "num_tokens": 51713924.0, "step": 1158, "train/ce_loss": 1.3618675470352173 }, { "epoch": 0.22898951947795135, "step": 1158, "train/sim_loss": 0.00022852420806884766 }, { "epoch": 0.22898951947795135, "step": 1158, "train/total_loss": 0.1364152878522873 }, { "entropy": 5.861784934997559, "epoch": 0.2291872651769824, "mean_token_accuracy": 0.7409090995788574, "num_tokens": 51750944.0, "step": 1159, "train/ce_loss": 0.7192519307136536 }, { "epoch": 0.2291872651769824, "step": 1159, "train/sim_loss": 0.00035858154296875 }, { "epoch": 0.2291872651769824, "step": 1159, "train/total_loss": 0.0722837746143341 }, { "epoch": 0.22938501087601346, "grad_norm": 0.4420951306819916, "learning_rate": 9.431694529627066e-06, "loss": 0.0855, "step": 1160 }, { "entropy": 5.608598709106445, "epoch": 0.22938501087601346, "mean_token_accuracy": 0.7588956952095032, "num_tokens": 51783539.0, "step": 1160, "train/ce_loss": 0.7391158938407898 }, { "epoch": 0.22938501087601346, "step": 1160, "train/sim_loss": 0.00022417306900024414 }, { "epoch": 0.22938501087601346, "step": 1160, "train/total_loss": 0.07413576543331146 }, { "entropy": 6.185880661010742, "epoch": 0.22958275657504448, "mean_token_accuracy": 0.7306748628616333, "num_tokens": 51838076.0, "step": 1161, "train/ce_loss": 0.715273916721344 }, { "epoch": 0.22958275657504448, "step": 1161, "train/sim_loss": 0.0003848075866699219 }, { "epoch": 0.22958275657504448, "step": 1161, "train/total_loss": 0.07191219925880432 }, { "entropy": 6.347365379333496, "epoch": 0.22978050227407554, "mean_token_accuracy": 0.7342799305915833, "num_tokens": 51888710.0, "step": 1162, "train/ce_loss": 0.8607775568962097 }, { "epoch": 0.22978050227407554, "step": 1162, "train/sim_loss": 0.00020736455917358398 }, { "epoch": 0.22978050227407554, "step": 1162, "train/total_loss": 0.08628512173891068 }, { "entropy": 5.822361946105957, "epoch": 0.2299782479731066, "mean_token_accuracy": 0.7519209384918213, "num_tokens": 51926833.0, "step": 1163, "train/ce_loss": 0.6632264852523804 }, { "epoch": 0.2299782479731066, "step": 1163, "train/sim_loss": 0.00024890899658203125 }, { "epoch": 0.2299782479731066, "step": 1163, "train/total_loss": 0.06657155603170395 }, { "entropy": 5.866372108459473, "epoch": 0.23017599367213762, "mean_token_accuracy": 0.7675585150718689, "num_tokens": 51969321.0, "step": 1164, "train/ce_loss": 1.6521739959716797 }, { "epoch": 0.23017599367213762, "step": 1164, "train/sim_loss": 0.00034868717193603516 }, { "epoch": 0.23017599367213762, "step": 1164, "train/total_loss": 0.165566086769104 }, { "entropy": 5.918087482452393, "epoch": 0.23037373937116867, "mean_token_accuracy": 0.7795918583869934, "num_tokens": 52019259.0, "step": 1165, "train/ce_loss": 1.3267898559570312 }, { "epoch": 0.23037373937116867, "step": 1165, "train/sim_loss": 0.00026929378509521484 }, { "epoch": 0.23037373937116867, "step": 1165, "train/total_loss": 0.13294827938079834 }, { "entropy": 6.047554969787598, "epoch": 0.23057148507019973, "mean_token_accuracy": 0.7228915691375732, "num_tokens": 52055740.0, "step": 1166, "train/ce_loss": 1.6315568685531616 }, { "epoch": 0.23057148507019973, "step": 1166, "train/sim_loss": 0.0003871917724609375 }, { "epoch": 0.23057148507019973, "step": 1166, "train/total_loss": 0.16354288160800934 }, { "entropy": 5.541162967681885, "epoch": 0.23076923076923078, "mean_token_accuracy": 0.7316403388977051, "num_tokens": 52085681.0, "step": 1167, "train/ce_loss": 0.6286728382110596 }, { "epoch": 0.23076923076923078, "step": 1167, "train/sim_loss": 0.00028133392333984375 }, { "epoch": 0.23076923076923078, "step": 1167, "train/total_loss": 0.0631486177444458 }, { "entropy": 5.808832168579102, "epoch": 0.2309669764682618, "mean_token_accuracy": 0.7523703575134277, "num_tokens": 52131006.0, "step": 1168, "train/ce_loss": 0.4238070249557495 }, { "epoch": 0.2309669764682618, "step": 1168, "train/sim_loss": 0.00023186206817626953 }, { "epoch": 0.2309669764682618, "step": 1168, "train/total_loss": 0.04261256381869316 }, { "entropy": 5.837436676025391, "epoch": 0.23116472216729286, "mean_token_accuracy": 0.748008668422699, "num_tokens": 52164636.0, "step": 1169, "train/ce_loss": 1.1386468410491943 }, { "epoch": 0.23116472216729286, "step": 1169, "train/sim_loss": 0.0004233121871948242 }, { "epoch": 0.23116472216729286, "step": 1169, "train/total_loss": 0.11428799480199814 }, { "entropy": 5.538986682891846, "epoch": 0.23136246786632392, "mean_token_accuracy": 0.7613168954849243, "num_tokens": 52198142.0, "step": 1170, "train/ce_loss": 0.8227111101150513 }, { "epoch": 0.23136246786632392, "step": 1170, "train/sim_loss": 0.0002866983413696289 }, { "epoch": 0.23136246786632392, "step": 1170, "train/total_loss": 0.082557812333107 }, { "entropy": 5.868678092956543, "epoch": 0.23156021356535494, "mean_token_accuracy": 0.7067209482192993, "num_tokens": 52243523.0, "step": 1171, "train/ce_loss": 0.653983473777771 }, { "epoch": 0.23156021356535494, "step": 1171, "train/sim_loss": 0.00038754940032958984 }, { "epoch": 0.23156021356535494, "step": 1171, "train/total_loss": 0.06578589975833893 }, { "entropy": 5.790794372558594, "epoch": 0.231757959264386, "mean_token_accuracy": 0.7630752921104431, "num_tokens": 52279536.0, "step": 1172, "train/ce_loss": 0.9814574718475342 }, { "epoch": 0.231757959264386, "step": 1172, "train/sim_loss": 0.00019693374633789062 }, { "epoch": 0.231757959264386, "step": 1172, "train/total_loss": 0.09834267944097519 }, { "entropy": 6.053695201873779, "epoch": 0.23195570496341705, "mean_token_accuracy": 0.7412095665931702, "num_tokens": 52321587.0, "step": 1173, "train/ce_loss": 0.9427739381790161 }, { "epoch": 0.23195570496341705, "step": 1173, "train/sim_loss": 0.00031936168670654297 }, { "epoch": 0.23195570496341705, "step": 1173, "train/total_loss": 0.0945967584848404 }, { "entropy": 6.018607139587402, "epoch": 0.23215345066244808, "mean_token_accuracy": 0.7060931921005249, "num_tokens": 52376413.0, "step": 1174, "train/ce_loss": 0.7009943723678589 }, { "epoch": 0.23215345066244808, "step": 1174, "train/sim_loss": 0.0003142356872558594 }, { "epoch": 0.23215345066244808, "step": 1174, "train/total_loss": 0.07041367143392563 }, { "entropy": 6.071520805358887, "epoch": 0.23235119636147913, "mean_token_accuracy": 0.7039999961853027, "num_tokens": 52429318.0, "step": 1175, "train/ce_loss": 0.607112467288971 }, { "epoch": 0.23235119636147913, "step": 1175, "train/sim_loss": 0.00038868188858032227 }, { "epoch": 0.23235119636147913, "step": 1175, "train/total_loss": 0.06109992787241936 }, { "entropy": 5.656872749328613, "epoch": 0.2325489420605102, "mean_token_accuracy": 0.7534246444702148, "num_tokens": 52476661.0, "step": 1176, "train/ce_loss": 1.5123836994171143 }, { "epoch": 0.2325489420605102, "step": 1176, "train/sim_loss": 0.0005486011505126953 }, { "epoch": 0.2325489420605102, "step": 1176, "train/total_loss": 0.15178696811199188 }, { "entropy": 5.810354709625244, "epoch": 0.23274668775954124, "mean_token_accuracy": 0.7209622263908386, "num_tokens": 52535794.0, "step": 1177, "train/ce_loss": 1.30222749710083 }, { "epoch": 0.23274668775954124, "step": 1177, "train/sim_loss": 0.00025147199630737305 }, { "epoch": 0.23274668775954124, "step": 1177, "train/total_loss": 0.13047422468662262 }, { "entropy": 5.955678939819336, "epoch": 0.23294443345857227, "mean_token_accuracy": 0.7071651220321655, "num_tokens": 52579314.0, "step": 1178, "train/ce_loss": 0.8244516849517822 }, { "epoch": 0.23294443345857227, "step": 1178, "train/sim_loss": 0.00021958351135253906 }, { "epoch": 0.23294443345857227, "step": 1178, "train/total_loss": 0.08266475051641464 }, { "entropy": 5.843437194824219, "epoch": 0.23314217915760332, "mean_token_accuracy": 0.7711598873138428, "num_tokens": 52621160.0, "step": 1179, "train/ce_loss": 1.5801642803126015e-05 }, { "epoch": 0.23314217915760332, "step": 1179, "train/sim_loss": 0.0002523660659790039 }, { "epoch": 0.23314217915760332, "step": 1179, "train/total_loss": 0.0002539462293498218 }, { "epoch": 0.23333992485663438, "grad_norm": 0.4895051419734955, "learning_rate": 9.42180235433772e-06, "loss": 0.084, "step": 1180 }, { "entropy": 5.547832489013672, "epoch": 0.23333992485663438, "mean_token_accuracy": 0.7597551941871643, "num_tokens": 52667712.0, "step": 1180, "train/ce_loss": 0.9289265275001526 }, { "epoch": 0.23333992485663438, "step": 1180, "train/sim_loss": 0.00048530101776123047 }, { "epoch": 0.23333992485663438, "step": 1180, "train/total_loss": 0.09337795525789261 }, { "entropy": 5.587679386138916, "epoch": 0.2335376705556654, "mean_token_accuracy": 0.7393681406974792, "num_tokens": 52710627.0, "step": 1181, "train/ce_loss": 0.6742366552352905 }, { "epoch": 0.2335376705556654, "step": 1181, "train/sim_loss": 0.00022125244140625 }, { "epoch": 0.2335376705556654, "step": 1181, "train/total_loss": 0.06764491647481918 }, { "entropy": 5.781608581542969, "epoch": 0.23373541625469646, "mean_token_accuracy": 0.7276914119720459, "num_tokens": 52756684.0, "step": 1182, "train/ce_loss": 0.8890633583068848 }, { "epoch": 0.23373541625469646, "step": 1182, "train/sim_loss": 0.0004399418830871582 }, { "epoch": 0.23373541625469646, "step": 1182, "train/total_loss": 0.08934628218412399 }, { "entropy": 5.585827827453613, "epoch": 0.23393316195372751, "mean_token_accuracy": 0.7302631735801697, "num_tokens": 52795110.0, "step": 1183, "train/ce_loss": 0.9464160203933716 }, { "epoch": 0.23393316195372751, "step": 1183, "train/sim_loss": 0.000265657901763916 }, { "epoch": 0.23393316195372751, "step": 1183, "train/total_loss": 0.0949072614312172 }, { "entropy": 5.836297035217285, "epoch": 0.23413090765275854, "mean_token_accuracy": 0.7392601370811462, "num_tokens": 52825189.0, "step": 1184, "train/ce_loss": 0.6723836660385132 }, { "epoch": 0.23413090765275854, "step": 1184, "train/sim_loss": 0.00035262107849121094 }, { "epoch": 0.23413090765275854, "step": 1184, "train/total_loss": 0.06759098917245865 }, { "entropy": 5.986987113952637, "epoch": 0.2343286533517896, "mean_token_accuracy": 0.7333333492279053, "num_tokens": 52878265.0, "step": 1185, "train/ce_loss": 1.6108606359921396e-05 }, { "epoch": 0.2343286533517896, "step": 1185, "train/sim_loss": 0.00034809112548828125 }, { "epoch": 0.2343286533517896, "step": 1185, "train/total_loss": 0.000349701993400231 }, { "entropy": 5.856890678405762, "epoch": 0.23452639905082065, "mean_token_accuracy": 0.7591078281402588, "num_tokens": 52933056.0, "step": 1186, "train/ce_loss": 0.6489793062210083 }, { "epoch": 0.23452639905082065, "step": 1186, "train/sim_loss": 0.0004298686981201172 }, { "epoch": 0.23452639905082065, "step": 1186, "train/total_loss": 0.06532780081033707 }, { "entropy": 6.122570037841797, "epoch": 0.2347241447498517, "mean_token_accuracy": 0.7312746644020081, "num_tokens": 52977615.0, "step": 1187, "train/ce_loss": 1.1979541778564453 }, { "epoch": 0.2347241447498517, "step": 1187, "train/sim_loss": 0.0004056692123413086 }, { "epoch": 0.2347241447498517, "step": 1187, "train/total_loss": 0.12020108848810196 }, { "entropy": 5.7684712409973145, "epoch": 0.23492189044888273, "mean_token_accuracy": 0.765893816947937, "num_tokens": 53014208.0, "step": 1188, "train/ce_loss": 2.280274566146545e-05 }, { "epoch": 0.23492189044888273, "step": 1188, "train/sim_loss": 0.0002682209014892578 }, { "epoch": 0.23492189044888273, "step": 1188, "train/total_loss": 0.0002705011866055429 }, { "entropy": 5.8952789306640625, "epoch": 0.23511963614791379, "mean_token_accuracy": 0.7509363293647766, "num_tokens": 53061347.0, "step": 1189, "train/ce_loss": 1.070253610610962 }, { "epoch": 0.23511963614791379, "step": 1189, "train/sim_loss": 0.00029021501541137695 }, { "epoch": 0.23511963614791379, "step": 1189, "train/total_loss": 0.10731557756662369 }, { "entropy": 6.050017833709717, "epoch": 0.23531738184694484, "mean_token_accuracy": 0.6971881985664368, "num_tokens": 53101648.0, "step": 1190, "train/ce_loss": 1.1303833723068237 }, { "epoch": 0.23531738184694484, "step": 1190, "train/sim_loss": 0.00044780969619750977 }, { "epoch": 0.23531738184694484, "step": 1190, "train/total_loss": 0.113486148416996 }, { "entropy": 6.503205299377441, "epoch": 0.23551512754597587, "mean_token_accuracy": 0.7160963416099548, "num_tokens": 53146606.0, "step": 1191, "train/ce_loss": 0.921988308429718 }, { "epoch": 0.23551512754597587, "step": 1191, "train/sim_loss": 0.0007126331329345703 }, { "epoch": 0.23551512754597587, "step": 1191, "train/total_loss": 0.09291146695613861 }, { "entropy": 5.5841593742370605, "epoch": 0.23571287324500692, "mean_token_accuracy": 0.774014413356781, "num_tokens": 53189838.0, "step": 1192, "train/ce_loss": 0.5184661746025085 }, { "epoch": 0.23571287324500692, "step": 1192, "train/sim_loss": 0.0003504157066345215 }, { "epoch": 0.23571287324500692, "step": 1192, "train/total_loss": 0.052197035402059555 }, { "entropy": 5.7388482093811035, "epoch": 0.23591061894403798, "mean_token_accuracy": 0.7010309100151062, "num_tokens": 53236468.0, "step": 1193, "train/ce_loss": 0.6677466034889221 }, { "epoch": 0.23591061894403798, "step": 1193, "train/sim_loss": 0.0003669261932373047 }, { "epoch": 0.23591061894403798, "step": 1193, "train/total_loss": 0.0671415850520134 }, { "entropy": 6.169657230377197, "epoch": 0.236108364643069, "mean_token_accuracy": 0.7224615216255188, "num_tokens": 53304433.0, "step": 1194, "train/ce_loss": 0.8266186118125916 }, { "epoch": 0.236108364643069, "step": 1194, "train/sim_loss": 0.0002676248550415039 }, { "epoch": 0.236108364643069, "step": 1194, "train/total_loss": 0.08292948454618454 }, { "entropy": 5.999200344085693, "epoch": 0.23630611034210006, "mean_token_accuracy": 0.7700626254081726, "num_tokens": 53361386.0, "step": 1195, "train/ce_loss": 1.021740436553955 }, { "epoch": 0.23630611034210006, "step": 1195, "train/sim_loss": 0.00034672021865844727 }, { "epoch": 0.23630611034210006, "step": 1195, "train/total_loss": 0.10252076387405396 }, { "entropy": 5.453333377838135, "epoch": 0.2365038560411311, "mean_token_accuracy": 0.7490397095680237, "num_tokens": 53406192.0, "step": 1196, "train/ce_loss": 1.0814729928970337 }, { "epoch": 0.2365038560411311, "step": 1196, "train/sim_loss": 0.00040841102600097656 }, { "epoch": 0.2365038560411311, "step": 1196, "train/total_loss": 0.10855571180582047 }, { "entropy": 5.628547668457031, "epoch": 0.23670160174016217, "mean_token_accuracy": 0.7346698045730591, "num_tokens": 53436949.0, "step": 1197, "train/ce_loss": 1.431207299232483 }, { "epoch": 0.23670160174016217, "step": 1197, "train/sim_loss": 0.0003171563148498535 }, { "epoch": 0.23670160174016217, "step": 1197, "train/total_loss": 0.14343789219856262 }, { "entropy": 5.898207664489746, "epoch": 0.2368993474391932, "mean_token_accuracy": 0.7220566272735596, "num_tokens": 53496790.0, "step": 1198, "train/ce_loss": 1.5372397899627686 }, { "epoch": 0.2368993474391932, "step": 1198, "train/sim_loss": 0.00035244226455688477 }, { "epoch": 0.2368993474391932, "step": 1198, "train/total_loss": 0.15407642722129822 }, { "entropy": 6.314878463745117, "epoch": 0.23709709313822425, "mean_token_accuracy": 0.7492043375968933, "num_tokens": 53550871.0, "step": 1199, "train/ce_loss": 1.0768669843673706 }, { "epoch": 0.23709709313822425, "step": 1199, "train/sim_loss": 0.0004062652587890625 }, { "epoch": 0.23709709313822425, "step": 1199, "train/total_loss": 0.10809296369552612 }, { "epoch": 0.2372948388372553, "grad_norm": 0.4059201776981354, "learning_rate": 9.411910179048373e-06, "loss": 0.0848, "step": 1200 }, { "entropy": 6.226415157318115, "epoch": 0.2372948388372553, "mean_token_accuracy": 0.7487623691558838, "num_tokens": 53597723.0, "step": 1200, "train/ce_loss": 0.4376180171966553 }, { "epoch": 0.2372948388372553, "step": 1200, "train/sim_loss": 0.00019544363021850586 }, { "epoch": 0.2372948388372553, "step": 1200, "train/total_loss": 0.043957244604825974 }, { "entropy": 5.837862014770508, "epoch": 0.23749258453628633, "mean_token_accuracy": 0.7320221066474915, "num_tokens": 53639229.0, "step": 1201, "train/ce_loss": 0.806725025177002 }, { "epoch": 0.23749258453628633, "step": 1201, "train/sim_loss": 0.00033974647521972656 }, { "epoch": 0.23749258453628633, "step": 1201, "train/total_loss": 0.08101224899291992 }, { "entropy": 5.8404541015625, "epoch": 0.23769033023531738, "mean_token_accuracy": 0.7667121291160583, "num_tokens": 53674333.0, "step": 1202, "train/ce_loss": 1.2042052745819092 }, { "epoch": 0.23769033023531738, "step": 1202, "train/sim_loss": 0.0004082918167114258 }, { "epoch": 0.23769033023531738, "step": 1202, "train/total_loss": 0.12082882225513458 }, { "entropy": 5.868091583251953, "epoch": 0.23788807593434844, "mean_token_accuracy": 0.7188459038734436, "num_tokens": 53729832.0, "step": 1203, "train/ce_loss": 1.2671868717006873e-05 }, { "epoch": 0.23788807593434844, "step": 1203, "train/sim_loss": 0.0004392862319946289 }, { "epoch": 0.23788807593434844, "step": 1203, "train/total_loss": 0.0004405534127727151 }, { "entropy": 6.241964340209961, "epoch": 0.23808582163337946, "mean_token_accuracy": 0.7466539144515991, "num_tokens": 53777216.0, "step": 1204, "train/ce_loss": 1.2132008075714111 }, { "epoch": 0.23808582163337946, "step": 1204, "train/sim_loss": 0.0002875328063964844 }, { "epoch": 0.23808582163337946, "step": 1204, "train/total_loss": 0.12160761654376984 }, { "entropy": 6.192549705505371, "epoch": 0.23828356733241052, "mean_token_accuracy": 0.7421777248382568, "num_tokens": 53843384.0, "step": 1205, "train/ce_loss": 0.9330734610557556 }, { "epoch": 0.23828356733241052, "step": 1205, "train/sim_loss": 0.00019127130508422852 }, { "epoch": 0.23828356733241052, "step": 1205, "train/total_loss": 0.09349861741065979 }, { "entropy": 6.159868240356445, "epoch": 0.23848131303144157, "mean_token_accuracy": 0.7216748595237732, "num_tokens": 53870878.0, "step": 1206, "train/ce_loss": 0.8875803351402283 }, { "epoch": 0.23848131303144157, "step": 1206, "train/sim_loss": 0.0004292130470275879 }, { "epoch": 0.23848131303144157, "step": 1206, "train/total_loss": 0.08918724954128265 }, { "entropy": 6.184939861297607, "epoch": 0.2386790587304726, "mean_token_accuracy": 0.7361842393875122, "num_tokens": 53919247.0, "step": 1207, "train/ce_loss": 0.7359360456466675 }, { "epoch": 0.2386790587304726, "step": 1207, "train/sim_loss": 0.0004892349243164062 }, { "epoch": 0.2386790587304726, "step": 1207, "train/total_loss": 0.07408284395933151 }, { "entropy": 5.851834774017334, "epoch": 0.23887680442950365, "mean_token_accuracy": 0.7123287916183472, "num_tokens": 53968483.0, "step": 1208, "train/ce_loss": 1.0829722881317139 }, { "epoch": 0.23887680442950365, "step": 1208, "train/sim_loss": 0.00021249055862426758 }, { "epoch": 0.23887680442950365, "step": 1208, "train/total_loss": 0.10850971937179565 }, { "entropy": 5.975038051605225, "epoch": 0.2390745501285347, "mean_token_accuracy": 0.7063806056976318, "num_tokens": 54022945.0, "step": 1209, "train/ce_loss": 1.1755521297454834 }, { "epoch": 0.2390745501285347, "step": 1209, "train/sim_loss": 0.00045931339263916016 }, { "epoch": 0.2390745501285347, "step": 1209, "train/total_loss": 0.11801452934741974 }, { "entropy": 6.030392169952393, "epoch": 0.23927229582756576, "mean_token_accuracy": 0.7731262445449829, "num_tokens": 54072191.0, "step": 1210, "train/ce_loss": 0.8486732840538025 }, { "epoch": 0.23927229582756576, "step": 1210, "train/sim_loss": 0.0003103017807006836 }, { "epoch": 0.23927229582756576, "step": 1210, "train/total_loss": 0.08517763018608093 }, { "entropy": 6.063719749450684, "epoch": 0.2394700415265968, "mean_token_accuracy": 0.6972789168357849, "num_tokens": 54122407.0, "step": 1211, "train/ce_loss": 0.6796739101409912 }, { "epoch": 0.2394700415265968, "step": 1211, "train/sim_loss": 0.00029861927032470703 }, { "epoch": 0.2394700415265968, "step": 1211, "train/total_loss": 0.06826601177453995 }, { "entropy": 5.517884731292725, "epoch": 0.23966778722562784, "mean_token_accuracy": 0.8118143677711487, "num_tokens": 54173022.0, "step": 1212, "train/ce_loss": 2.0011348169646226e-05 }, { "epoch": 0.23966778722562784, "step": 1212, "train/sim_loss": 0.00019431114196777344 }, { "epoch": 0.23966778722562784, "step": 1212, "train/total_loss": 0.00019631227769423276 }, { "entropy": 5.486027717590332, "epoch": 0.2398655329246589, "mean_token_accuracy": 0.7770609259605408, "num_tokens": 54210222.0, "step": 1213, "train/ce_loss": 1.102411150932312 }, { "epoch": 0.2398655329246589, "step": 1213, "train/sim_loss": 0.00036787986755371094 }, { "epoch": 0.2398655329246589, "step": 1213, "train/total_loss": 0.11060899496078491 }, { "entropy": 5.778380393981934, "epoch": 0.24006327862368992, "mean_token_accuracy": 0.718679666519165, "num_tokens": 54258270.0, "step": 1214, "train/ce_loss": 1.524660547147505e-05 }, { "epoch": 0.24006327862368992, "step": 1214, "train/sim_loss": 0.0002492070198059082 }, { "epoch": 0.24006327862368992, "step": 1214, "train/total_loss": 0.0002507316821720451 }, { "entropy": 6.292346954345703, "epoch": 0.24026102432272098, "mean_token_accuracy": 0.7227488160133362, "num_tokens": 54303788.0, "step": 1215, "train/ce_loss": 1.1038190126419067 }, { "epoch": 0.24026102432272098, "step": 1215, "train/sim_loss": 0.00036269426345825195 }, { "epoch": 0.24026102432272098, "step": 1215, "train/total_loss": 0.11074459552764893 }, { "entropy": 5.971410751342773, "epoch": 0.24045877002175203, "mean_token_accuracy": 0.7156862616539001, "num_tokens": 54342924.0, "step": 1216, "train/ce_loss": 1.4501633813779335e-05 }, { "epoch": 0.24045877002175203, "step": 1216, "train/sim_loss": 0.00033462047576904297 }, { "epoch": 0.24045877002175203, "step": 1216, "train/total_loss": 0.00033607063232921064 }, { "entropy": 5.983178615570068, "epoch": 0.24065651572078306, "mean_token_accuracy": 0.7384030222892761, "num_tokens": 54407870.0, "step": 1217, "train/ce_loss": 1.581124342919793e-05 }, { "epoch": 0.24065651572078306, "step": 1217, "train/sim_loss": 0.00024759769439697266 }, { "epoch": 0.24065651572078306, "step": 1217, "train/total_loss": 0.00024917881819419563 }, { "entropy": 5.632805347442627, "epoch": 0.24085426141981411, "mean_token_accuracy": 0.766508162021637, "num_tokens": 54439066.0, "step": 1218, "train/ce_loss": 0.27858585119247437 }, { "epoch": 0.24085426141981411, "step": 1218, "train/sim_loss": 0.0002263784408569336 }, { "epoch": 0.24085426141981411, "step": 1218, "train/total_loss": 0.02808496356010437 }, { "entropy": 5.765274524688721, "epoch": 0.24105200711884517, "mean_token_accuracy": 0.7465165853500366, "num_tokens": 54492031.0, "step": 1219, "train/ce_loss": 0.7967976927757263 }, { "epoch": 0.24105200711884517, "step": 1219, "train/sim_loss": 0.0002721548080444336 }, { "epoch": 0.24105200711884517, "step": 1219, "train/total_loss": 0.0799519270658493 }, { "epoch": 0.24124975281787622, "grad_norm": 0.4171260893344879, "learning_rate": 9.402018003759027e-06, "loss": 0.0829, "step": 1220 }, { "entropy": 6.1074628829956055, "epoch": 0.24124975281787622, "mean_token_accuracy": 0.7236559391021729, "num_tokens": 54542751.0, "step": 1220, "train/ce_loss": 0.6603206992149353 }, { "epoch": 0.24124975281787622, "step": 1220, "train/sim_loss": 0.0005448460578918457 }, { "epoch": 0.24124975281787622, "step": 1220, "train/total_loss": 0.06657692044973373 }, { "entropy": 5.870491981506348, "epoch": 0.24144749851690725, "mean_token_accuracy": 0.7573587894439697, "num_tokens": 54595930.0, "step": 1221, "train/ce_loss": 1.434145450592041 }, { "epoch": 0.24144749851690725, "step": 1221, "train/sim_loss": 0.0003865957260131836 }, { "epoch": 0.24144749851690725, "step": 1221, "train/total_loss": 0.14380113780498505 }, { "entropy": 5.77119255065918, "epoch": 0.2416452442159383, "mean_token_accuracy": 0.758269727230072, "num_tokens": 54640646.0, "step": 1222, "train/ce_loss": 0.8254532814025879 }, { "epoch": 0.2416452442159383, "step": 1222, "train/sim_loss": 0.00032007694244384766 }, { "epoch": 0.2416452442159383, "step": 1222, "train/total_loss": 0.082865409553051 }, { "entropy": 5.694089412689209, "epoch": 0.24184298991496936, "mean_token_accuracy": 0.739635169506073, "num_tokens": 54687643.0, "step": 1223, "train/ce_loss": 0.7926769852638245 }, { "epoch": 0.24184298991496936, "step": 1223, "train/sim_loss": 0.0005179643630981445 }, { "epoch": 0.24184298991496936, "step": 1223, "train/total_loss": 0.07978566735982895 }, { "entropy": 5.8186187744140625, "epoch": 0.24204073561400039, "mean_token_accuracy": 0.7060703039169312, "num_tokens": 54729020.0, "step": 1224, "train/ce_loss": 1.0105623006820679 }, { "epoch": 0.24204073561400039, "step": 1224, "train/sim_loss": 0.0006204843521118164 }, { "epoch": 0.24204073561400039, "step": 1224, "train/total_loss": 0.10167671740055084 }, { "entropy": 5.738733291625977, "epoch": 0.24223848131303144, "mean_token_accuracy": 0.7205387353897095, "num_tokens": 54776337.0, "step": 1225, "train/ce_loss": 0.6728471517562866 }, { "epoch": 0.24223848131303144, "step": 1225, "train/sim_loss": 0.00038695335388183594 }, { "epoch": 0.24223848131303144, "step": 1225, "train/total_loss": 0.06767167150974274 }, { "entropy": 5.525790214538574, "epoch": 0.2424362270120625, "mean_token_accuracy": 0.7762345671653748, "num_tokens": 54802503.0, "step": 1226, "train/ce_loss": 1.843574136728421e-05 }, { "epoch": 0.2424362270120625, "step": 1226, "train/sim_loss": 0.0002624988555908203 }, { "epoch": 0.2424362270120625, "step": 1226, "train/total_loss": 0.0002643424377311021 }, { "entropy": 6.05937385559082, "epoch": 0.24263397271109352, "mean_token_accuracy": 0.7371225357055664, "num_tokens": 54855247.0, "step": 1227, "train/ce_loss": 1.6501481533050537 }, { "epoch": 0.24263397271109352, "step": 1227, "train/sim_loss": 0.00038886070251464844 }, { "epoch": 0.24263397271109352, "step": 1227, "train/total_loss": 0.16540367901325226 }, { "entropy": 6.015216827392578, "epoch": 0.24283171841012458, "mean_token_accuracy": 0.7000681757926941, "num_tokens": 54885229.0, "step": 1228, "train/ce_loss": 1.7075341020245105e-05 }, { "epoch": 0.24283171841012458, "step": 1228, "train/sim_loss": 0.0002181529998779297 }, { "epoch": 0.24283171841012458, "step": 1228, "train/total_loss": 0.00021986053616274148 }, { "entropy": 5.891243934631348, "epoch": 0.24302946410915563, "mean_token_accuracy": 0.7153987288475037, "num_tokens": 54929386.0, "step": 1229, "train/ce_loss": 0.38417530059814453 }, { "epoch": 0.24302946410915563, "step": 1229, "train/sim_loss": 0.00027120113372802734 }, { "epoch": 0.24302946410915563, "step": 1229, "train/total_loss": 0.03868873044848442 }, { "entropy": 6.0579423904418945, "epoch": 0.24322720980818668, "mean_token_accuracy": 0.7365028262138367, "num_tokens": 54968093.0, "step": 1230, "train/ce_loss": 0.9748556017875671 }, { "epoch": 0.24322720980818668, "step": 1230, "train/sim_loss": 0.00025141239166259766 }, { "epoch": 0.24322720980818668, "step": 1230, "train/total_loss": 0.09773697704076767 }, { "entropy": 6.035906791687012, "epoch": 0.2434249555072177, "mean_token_accuracy": 0.7453449964523315, "num_tokens": 55003382.0, "step": 1231, "train/ce_loss": 0.878312349319458 }, { "epoch": 0.2434249555072177, "step": 1231, "train/sim_loss": 0.0003275871276855469 }, { "epoch": 0.2434249555072177, "step": 1231, "train/total_loss": 0.08815882354974747 }, { "entropy": 6.0100507736206055, "epoch": 0.24362270120624877, "mean_token_accuracy": 0.7519336938858032, "num_tokens": 55059948.0, "step": 1232, "train/ce_loss": 2.388536214828491 }, { "epoch": 0.24362270120624877, "step": 1232, "train/sim_loss": 0.00036716461181640625 }, { "epoch": 0.24362270120624877, "step": 1232, "train/total_loss": 0.2392207831144333 }, { "entropy": 5.491940975189209, "epoch": 0.24382044690527982, "mean_token_accuracy": 0.7736877799034119, "num_tokens": 55090847.0, "step": 1233, "train/ce_loss": 0.832378089427948 }, { "epoch": 0.24382044690527982, "step": 1233, "train/sim_loss": 0.0003936290740966797 }, { "epoch": 0.24382044690527982, "step": 1233, "train/total_loss": 0.08363144099712372 }, { "entropy": 5.831950664520264, "epoch": 0.24401819260431085, "mean_token_accuracy": 0.7598802447319031, "num_tokens": 55139682.0, "step": 1234, "train/ce_loss": 0.911975622177124 }, { "epoch": 0.24401819260431085, "step": 1234, "train/sim_loss": 0.0005290508270263672 }, { "epoch": 0.24401819260431085, "step": 1234, "train/total_loss": 0.09172661602497101 }, { "entropy": 5.727273941040039, "epoch": 0.2442159383033419, "mean_token_accuracy": 0.7379181981086731, "num_tokens": 55187160.0, "step": 1235, "train/ce_loss": 0.46820536255836487 }, { "epoch": 0.2442159383033419, "step": 1235, "train/sim_loss": 0.00031429529190063477 }, { "epoch": 0.2442159383033419, "step": 1235, "train/total_loss": 0.04713483154773712 }, { "entropy": 6.004815101623535, "epoch": 0.24441368400237296, "mean_token_accuracy": 0.7283398509025574, "num_tokens": 55239424.0, "step": 1236, "train/ce_loss": 0.8027140498161316 }, { "epoch": 0.24441368400237296, "step": 1236, "train/sim_loss": 0.0005010366439819336 }, { "epoch": 0.24441368400237296, "step": 1236, "train/total_loss": 0.08077244460582733 }, { "entropy": 5.9520440101623535, "epoch": 0.24461142970140398, "mean_token_accuracy": 0.7462173104286194, "num_tokens": 55289673.0, "step": 1237, "train/ce_loss": 0.4935908019542694 }, { "epoch": 0.24461142970140398, "step": 1237, "train/sim_loss": 0.00033932924270629883 }, { "epoch": 0.24461142970140398, "step": 1237, "train/total_loss": 0.04969840869307518 }, { "entropy": 5.376651763916016, "epoch": 0.24480917540043504, "mean_token_accuracy": 0.7673814296722412, "num_tokens": 55327530.0, "step": 1238, "train/ce_loss": 0.5782356262207031 }, { "epoch": 0.24480917540043504, "step": 1238, "train/sim_loss": 0.0003508925437927246 }, { "epoch": 0.24480917540043504, "step": 1238, "train/total_loss": 0.058174457401037216 }, { "entropy": 5.854668617248535, "epoch": 0.2450069210994661, "mean_token_accuracy": 0.6963874101638794, "num_tokens": 55373777.0, "step": 1239, "train/ce_loss": 2.1182597265578806e-05 }, { "epoch": 0.2450069210994661, "step": 1239, "train/sim_loss": 0.00019562244415283203 }, { "epoch": 0.2450069210994661, "step": 1239, "train/total_loss": 0.00019774070824496448 }, { "epoch": 0.24520466679849715, "grad_norm": 0.5180304050445557, "learning_rate": 9.392125828469681e-06, "loss": 0.0838, "step": 1240 }, { "entropy": 5.7653489112854, "epoch": 0.24520466679849715, "mean_token_accuracy": 0.7511211037635803, "num_tokens": 55406960.0, "step": 1240, "train/ce_loss": 0.9828577637672424 }, { "epoch": 0.24520466679849715, "step": 1240, "train/sim_loss": 0.0002503395080566406 }, { "epoch": 0.24520466679849715, "step": 1240, "train/total_loss": 0.09853611886501312 }, { "entropy": 6.089593410491943, "epoch": 0.24540241249752817, "mean_token_accuracy": 0.7304737567901611, "num_tokens": 55444953.0, "step": 1241, "train/ce_loss": 1.0972157716751099 }, { "epoch": 0.24540241249752817, "step": 1241, "train/sim_loss": 0.0003223419189453125 }, { "epoch": 0.24540241249752817, "step": 1241, "train/total_loss": 0.11004392057657242 }, { "entropy": 5.862956523895264, "epoch": 0.24560015819655923, "mean_token_accuracy": 0.7493589520454407, "num_tokens": 55484021.0, "step": 1242, "train/ce_loss": 1.0332452058792114 }, { "epoch": 0.24560015819655923, "step": 1242, "train/sim_loss": 0.0003361701965332031 }, { "epoch": 0.24560015819655923, "step": 1242, "train/total_loss": 0.1036606952548027 }, { "entropy": 5.711444854736328, "epoch": 0.24579790389559028, "mean_token_accuracy": 0.7593297958374023, "num_tokens": 55527842.0, "step": 1243, "train/ce_loss": 0.5303989052772522 }, { "epoch": 0.24579790389559028, "step": 1243, "train/sim_loss": 0.0003070235252380371 }, { "epoch": 0.24579790389559028, "step": 1243, "train/total_loss": 0.0533469133079052 }, { "entropy": 5.621028423309326, "epoch": 0.2459956495946213, "mean_token_accuracy": 0.7622489929199219, "num_tokens": 55570181.0, "step": 1244, "train/ce_loss": 1.160593032836914 }, { "epoch": 0.2459956495946213, "step": 1244, "train/sim_loss": 0.00029474496841430664 }, { "epoch": 0.2459956495946213, "step": 1244, "train/total_loss": 0.11635404825210571 }, { "entropy": 6.070384979248047, "epoch": 0.24619339529365236, "mean_token_accuracy": 0.754325270652771, "num_tokens": 55609545.0, "step": 1245, "train/ce_loss": 0.8557243347167969 }, { "epoch": 0.24619339529365236, "step": 1245, "train/sim_loss": 0.0006021261215209961 }, { "epoch": 0.24619339529365236, "step": 1245, "train/total_loss": 0.08617456257343292 }, { "entropy": 5.882753372192383, "epoch": 0.24639114099268342, "mean_token_accuracy": 0.7487764954566956, "num_tokens": 55661425.0, "step": 1246, "train/ce_loss": 0.843774139881134 }, { "epoch": 0.24639114099268342, "step": 1246, "train/sim_loss": 0.00027358531951904297 }, { "epoch": 0.24639114099268342, "step": 1246, "train/total_loss": 0.08465100079774857 }, { "entropy": 6.246391296386719, "epoch": 0.24658888669171444, "mean_token_accuracy": 0.7358642816543579, "num_tokens": 55698213.0, "step": 1247, "train/ce_loss": 1.3138315677642822 }, { "epoch": 0.24658888669171444, "step": 1247, "train/sim_loss": 0.0002796649932861328 }, { "epoch": 0.24658888669171444, "step": 1247, "train/total_loss": 0.13166283071041107 }, { "entropy": 5.73565673828125, "epoch": 0.2467866323907455, "mean_token_accuracy": 0.7382001280784607, "num_tokens": 55757013.0, "step": 1248, "train/ce_loss": 0.8469803929328918 }, { "epoch": 0.2467866323907455, "step": 1248, "train/sim_loss": 0.00019752979278564453 }, { "epoch": 0.2467866323907455, "step": 1248, "train/total_loss": 0.08489557355642319 }, { "entropy": 5.54352331161499, "epoch": 0.24698437808977655, "mean_token_accuracy": 0.7115275263786316, "num_tokens": 55800122.0, "step": 1249, "train/ce_loss": 0.5487363934516907 }, { "epoch": 0.24698437808977655, "step": 1249, "train/sim_loss": 0.00036132335662841797 }, { "epoch": 0.24698437808977655, "step": 1249, "train/total_loss": 0.055234964936971664 }, { "entropy": 5.86241340637207, "epoch": 0.2471821237888076, "mean_token_accuracy": 0.7360953688621521, "num_tokens": 55846056.0, "step": 1250, "train/ce_loss": 1.0971832275390625 }, { "epoch": 0.2471821237888076, "step": 1250, "train/sim_loss": 0.00035262107849121094 }, { "epoch": 0.2471821237888076, "step": 1250, "train/total_loss": 0.11007094383239746 }, { "entropy": 6.085351943969727, "epoch": 0.24737986948783863, "mean_token_accuracy": 0.7056758403778076, "num_tokens": 55889548.0, "step": 1251, "train/ce_loss": 0.5552483201026917 }, { "epoch": 0.24737986948783863, "step": 1251, "train/sim_loss": 0.000284731388092041 }, { "epoch": 0.24737986948783863, "step": 1251, "train/total_loss": 0.055809564888477325 }, { "entropy": 5.881979942321777, "epoch": 0.2475776151868697, "mean_token_accuracy": 0.7589040994644165, "num_tokens": 55933944.0, "step": 1252, "train/ce_loss": 0.6917106509208679 }, { "epoch": 0.2475776151868697, "step": 1252, "train/sim_loss": 0.00019097328186035156 }, { "epoch": 0.2475776151868697, "step": 1252, "train/total_loss": 0.06936203688383102 }, { "entropy": 5.9351806640625, "epoch": 0.24777536088590074, "mean_token_accuracy": 0.7558139562606812, "num_tokens": 55987527.0, "step": 1253, "train/ce_loss": 1.9566325136111118e-05 }, { "epoch": 0.24777536088590074, "step": 1253, "train/sim_loss": 0.00034546852111816406 }, { "epoch": 0.24777536088590074, "step": 1253, "train/total_loss": 0.0003474251425359398 }, { "entropy": 6.072530746459961, "epoch": 0.24797310658493177, "mean_token_accuracy": 0.7654321193695068, "num_tokens": 56041453.0, "step": 1254, "train/ce_loss": 1.0737606287002563 }, { "epoch": 0.24797310658493177, "step": 1254, "train/sim_loss": 0.00033092498779296875 }, { "epoch": 0.24797310658493177, "step": 1254, "train/total_loss": 0.10770698636770248 }, { "entropy": 6.07515287399292, "epoch": 0.24817085228396282, "mean_token_accuracy": 0.7456790208816528, "num_tokens": 56090928.0, "step": 1255, "train/ce_loss": 1.343637228012085 }, { "epoch": 0.24817085228396282, "step": 1255, "train/sim_loss": 0.00028121471405029297 }, { "epoch": 0.24817085228396282, "step": 1255, "train/total_loss": 0.13464494049549103 }, { "entropy": 6.207268714904785, "epoch": 0.24836859798299388, "mean_token_accuracy": 0.7649456262588501, "num_tokens": 56138173.0, "step": 1256, "train/ce_loss": 0.702512264251709 }, { "epoch": 0.24836859798299388, "step": 1256, "train/sim_loss": 0.0003521442413330078 }, { "epoch": 0.24836859798299388, "step": 1256, "train/total_loss": 0.0706033706665039 }, { "entropy": 5.7743730545043945, "epoch": 0.2485663436820249, "mean_token_accuracy": 0.764502763748169, "num_tokens": 56186727.0, "step": 1257, "train/ce_loss": 1.6220285033341497e-05 }, { "epoch": 0.2485663436820249, "step": 1257, "train/sim_loss": 0.00029796361923217773 }, { "epoch": 0.2485663436820249, "step": 1257, "train/total_loss": 0.0002995856339111924 }, { "entropy": 5.338169574737549, "epoch": 0.24876408938105596, "mean_token_accuracy": 0.7659574747085571, "num_tokens": 56224141.0, "step": 1258, "train/ce_loss": 0.5819286108016968 }, { "epoch": 0.24876408938105596, "step": 1258, "train/sim_loss": 0.00020813941955566406 }, { "epoch": 0.24876408938105596, "step": 1258, "train/total_loss": 0.05840099975466728 }, { "entropy": 6.040826320648193, "epoch": 0.248961835080087, "mean_token_accuracy": 0.7420058250427246, "num_tokens": 56262148.0, "step": 1259, "train/ce_loss": 0.7698171734809875 }, { "epoch": 0.248961835080087, "step": 1259, "train/sim_loss": 0.0002600550651550293 }, { "epoch": 0.248961835080087, "step": 1259, "train/total_loss": 0.07724177092313766 }, { "epoch": 0.24915958077911807, "grad_norm": 0.48992618918418884, "learning_rate": 9.382233653180335e-06, "loss": 0.0812, "step": 1260 }, { "entropy": 6.206862449645996, "epoch": 0.24915958077911807, "mean_token_accuracy": 0.735837459564209, "num_tokens": 56321522.0, "step": 1260, "train/ce_loss": 1.0010055303573608 }, { "epoch": 0.24915958077911807, "step": 1260, "train/sim_loss": 0.00029146671295166016 }, { "epoch": 0.24915958077911807, "step": 1260, "train/total_loss": 0.10039202123880386 }, { "entropy": 6.006711959838867, "epoch": 0.2493573264781491, "mean_token_accuracy": 0.7182596325874329, "num_tokens": 56367609.0, "step": 1261, "train/ce_loss": 1.4898699191689957e-05 }, { "epoch": 0.2493573264781491, "step": 1261, "train/sim_loss": 0.00031071901321411133 }, { "epoch": 0.2493573264781491, "step": 1261, "train/total_loss": 0.00031220889650285244 }, { "entropy": 5.498718738555908, "epoch": 0.24955507217718015, "mean_token_accuracy": 0.7189329862594604, "num_tokens": 56414362.0, "step": 1262, "train/ce_loss": 0.8674229979515076 }, { "epoch": 0.24955507217718015, "step": 1262, "train/sim_loss": 0.00021886825561523438 }, { "epoch": 0.24955507217718015, "step": 1262, "train/total_loss": 0.08696117252111435 }, { "entropy": 6.102687835693359, "epoch": 0.2497528178762112, "mean_token_accuracy": 0.7281553149223328, "num_tokens": 56472504.0, "step": 1263, "train/ce_loss": 1.1943846940994263 }, { "epoch": 0.2497528178762112, "step": 1263, "train/sim_loss": 0.00030863285064697266 }, { "epoch": 0.2497528178762112, "step": 1263, "train/total_loss": 0.1197471022605896 }, { "entropy": 5.5306477546691895, "epoch": 0.24995056357524223, "mean_token_accuracy": 0.7647058963775635, "num_tokens": 56502422.0, "step": 1264, "train/ce_loss": 1.480146884918213 }, { "epoch": 0.24995056357524223, "step": 1264, "train/sim_loss": 0.0003775358200073242 }, { "epoch": 0.24995056357524223, "step": 1264, "train/total_loss": 0.1483922302722931 }, { "entropy": 5.929191589355469, "epoch": 0.2501483092742733, "mean_token_accuracy": 0.7279129028320312, "num_tokens": 56556330.0, "step": 1265, "train/ce_loss": 0.7833368182182312 }, { "epoch": 0.2501483092742733, "step": 1265, "train/sim_loss": 0.00033092498779296875 }, { "epoch": 0.2501483092742733, "step": 1265, "train/total_loss": 0.07866460829973221 }, { "entropy": 5.4893107414245605, "epoch": 0.2503460549733043, "mean_token_accuracy": 0.7286908030509949, "num_tokens": 56600794.0, "step": 1266, "train/ce_loss": 0.6296938061714172 }, { "epoch": 0.2503460549733043, "step": 1266, "train/sim_loss": 0.00029265880584716797 }, { "epoch": 0.2503460549733043, "step": 1266, "train/total_loss": 0.06326203793287277 }, { "entropy": 5.377072334289551, "epoch": 0.2505438006723354, "mean_token_accuracy": 0.7241169214248657, "num_tokens": 56638559.0, "step": 1267, "train/ce_loss": 0.8974431753158569 }, { "epoch": 0.2505438006723354, "step": 1267, "train/sim_loss": 0.0003088712692260742 }, { "epoch": 0.2505438006723354, "step": 1267, "train/total_loss": 0.09005319327116013 }, { "entropy": 5.604376316070557, "epoch": 0.2507415463713664, "mean_token_accuracy": 0.7772079706192017, "num_tokens": 56664576.0, "step": 1268, "train/ce_loss": 0.7021110653877258 }, { "epoch": 0.2507415463713664, "step": 1268, "train/sim_loss": 0.00025647878646850586 }, { "epoch": 0.2507415463713664, "step": 1268, "train/total_loss": 0.07046758383512497 }, { "entropy": 5.7430243492126465, "epoch": 0.25093929207039745, "mean_token_accuracy": 0.7548291087150574, "num_tokens": 56704778.0, "step": 1269, "train/ce_loss": 1.110605001449585 }, { "epoch": 0.25093929207039745, "step": 1269, "train/sim_loss": 0.0003763437271118164 }, { "epoch": 0.25093929207039745, "step": 1269, "train/total_loss": 0.11143684387207031 }, { "entropy": 5.951332092285156, "epoch": 0.25113703776942853, "mean_token_accuracy": 0.7565982341766357, "num_tokens": 56767396.0, "step": 1270, "train/ce_loss": 0.7061643004417419 }, { "epoch": 0.25113703776942853, "step": 1270, "train/sim_loss": 0.0003116130828857422 }, { "epoch": 0.25113703776942853, "step": 1270, "train/total_loss": 0.07092804461717606 }, { "entropy": 6.068436622619629, "epoch": 0.25133478346845955, "mean_token_accuracy": 0.7329509258270264, "num_tokens": 56814123.0, "step": 1271, "train/ce_loss": 1.3661968296219129e-05 }, { "epoch": 0.25133478346845955, "step": 1271, "train/sim_loss": 0.00024205446243286133 }, { "epoch": 0.25133478346845955, "step": 1271, "train/total_loss": 0.00024342065444216132 }, { "entropy": 6.389737606048584, "epoch": 0.2515325291674906, "mean_token_accuracy": 0.7386196851730347, "num_tokens": 56865312.0, "step": 1272, "train/ce_loss": 0.6694696545600891 }, { "epoch": 0.2515325291674906, "step": 1272, "train/sim_loss": 0.00046813488006591797 }, { "epoch": 0.2515325291674906, "step": 1272, "train/total_loss": 0.06741510331630707 }, { "entropy": 5.9168877601623535, "epoch": 0.25173027486652166, "mean_token_accuracy": 0.7103357315063477, "num_tokens": 56905652.0, "step": 1273, "train/ce_loss": 1.3506368398666382 }, { "epoch": 0.25173027486652166, "step": 1273, "train/sim_loss": 0.0002620816230773926 }, { "epoch": 0.25173027486652166, "step": 1273, "train/total_loss": 0.13532577455043793 }, { "entropy": 6.030937194824219, "epoch": 0.2519280205655527, "mean_token_accuracy": 0.7358490824699402, "num_tokens": 56956504.0, "step": 1274, "train/ce_loss": 1.2225251197814941 }, { "epoch": 0.2519280205655527, "step": 1274, "train/sim_loss": 0.0006496310234069824 }, { "epoch": 0.2519280205655527, "step": 1274, "train/total_loss": 0.12290214747190475 }, { "entropy": 5.80412483215332, "epoch": 0.2521257662645838, "mean_token_accuracy": 0.7382550239562988, "num_tokens": 56998622.0, "step": 1275, "train/ce_loss": 0.8080923557281494 }, { "epoch": 0.2521257662645838, "step": 1275, "train/sim_loss": 0.0002747178077697754 }, { "epoch": 0.2521257662645838, "step": 1275, "train/total_loss": 0.08108395338058472 }, { "entropy": 5.97095251083374, "epoch": 0.2523235119636148, "mean_token_accuracy": 0.7545515894889832, "num_tokens": 57033536.0, "step": 1276, "train/ce_loss": 0.7915042638778687 }, { "epoch": 0.2523235119636148, "step": 1276, "train/sim_loss": 0.0002612471580505371 }, { "epoch": 0.2523235119636148, "step": 1276, "train/total_loss": 0.07941167801618576 }, { "entropy": 5.850551605224609, "epoch": 0.2525212576626458, "mean_token_accuracy": 0.7318132519721985, "num_tokens": 57054092.0, "step": 1277, "train/ce_loss": 1.1236928701400757 }, { "epoch": 0.2525212576626458, "step": 1277, "train/sim_loss": 0.0003484487533569336 }, { "epoch": 0.2525212576626458, "step": 1277, "train/total_loss": 0.11271774023771286 }, { "entropy": 6.523205280303955, "epoch": 0.2527190033616769, "mean_token_accuracy": 0.7283774018287659, "num_tokens": 57098055.0, "step": 1278, "train/ce_loss": 0.995484471321106 }, { "epoch": 0.2527190033616769, "step": 1278, "train/sim_loss": 0.0003865361213684082 }, { "epoch": 0.2527190033616769, "step": 1278, "train/total_loss": 0.09993498772382736 }, { "entropy": 6.397531509399414, "epoch": 0.25291674906070793, "mean_token_accuracy": 0.7350543737411499, "num_tokens": 57135401.0, "step": 1279, "train/ce_loss": 0.8108571171760559 }, { "epoch": 0.25291674906070793, "step": 1279, "train/sim_loss": 0.00041306018829345703 }, { "epoch": 0.25291674906070793, "step": 1279, "train/total_loss": 0.08149877190589905 }, { "epoch": 0.25311449475973896, "grad_norm": 0.4755316972732544, "learning_rate": 9.37234147789099e-06, "loss": 0.0851, "step": 1280 }, { "entropy": 6.2127299308776855, "epoch": 0.25311449475973896, "mean_token_accuracy": 0.7144959568977356, "num_tokens": 57186510.0, "step": 1280, "train/ce_loss": 0.6497376561164856 }, { "epoch": 0.25311449475973896, "step": 1280, "train/sim_loss": 0.0002669095993041992 }, { "epoch": 0.25311449475973896, "step": 1280, "train/total_loss": 0.06524067372083664 }, { "entropy": 5.851686477661133, "epoch": 0.25331224045877004, "mean_token_accuracy": 0.7364228963851929, "num_tokens": 57225947.0, "step": 1281, "train/ce_loss": 1.0835120677947998 }, { "epoch": 0.25331224045877004, "step": 1281, "train/sim_loss": 0.0001996159553527832 }, { "epoch": 0.25331224045877004, "step": 1281, "train/total_loss": 0.10855082422494888 }, { "entropy": 6.332733154296875, "epoch": 0.25350998615780107, "mean_token_accuracy": 0.7128335237503052, "num_tokens": 57270532.0, "step": 1282, "train/ce_loss": 0.6809115409851074 }, { "epoch": 0.25350998615780107, "step": 1282, "train/sim_loss": 0.00024563074111938477 }, { "epoch": 0.25350998615780107, "step": 1282, "train/total_loss": 0.06833678483963013 }, { "entropy": 6.449740409851074, "epoch": 0.2537077318568321, "mean_token_accuracy": 0.6977518796920776, "num_tokens": 57326779.0, "step": 1283, "train/ce_loss": 1.7895911931991577 }, { "epoch": 0.2537077318568321, "step": 1283, "train/sim_loss": 0.00033080577850341797 }, { "epoch": 0.2537077318568321, "step": 1283, "train/total_loss": 0.17928992211818695 }, { "entropy": 5.881281852722168, "epoch": 0.2539054775558632, "mean_token_accuracy": 0.7159152626991272, "num_tokens": 57357262.0, "step": 1284, "train/ce_loss": 1.26857590675354 }, { "epoch": 0.2539054775558632, "step": 1284, "train/sim_loss": 0.00035578012466430664 }, { "epoch": 0.2539054775558632, "step": 1284, "train/total_loss": 0.12721337378025055 }, { "entropy": 6.561927318572998, "epoch": 0.2541032232548942, "mean_token_accuracy": 0.7215909361839294, "num_tokens": 57396628.0, "step": 1285, "train/ce_loss": 0.6432398557662964 }, { "epoch": 0.2541032232548942, "step": 1285, "train/sim_loss": 0.0004121065139770508 }, { "epoch": 0.2541032232548942, "step": 1285, "train/total_loss": 0.06473609060049057 }, { "entropy": 6.295108795166016, "epoch": 0.25430096895392523, "mean_token_accuracy": 0.7311111092567444, "num_tokens": 57458241.0, "step": 1286, "train/ce_loss": 0.8091775178909302 }, { "epoch": 0.25430096895392523, "step": 1286, "train/sim_loss": 0.0003040432929992676 }, { "epoch": 0.25430096895392523, "step": 1286, "train/total_loss": 0.0812217965722084 }, { "entropy": 6.387172698974609, "epoch": 0.2544987146529563, "mean_token_accuracy": 0.7280513644218445, "num_tokens": 57511217.0, "step": 1287, "train/ce_loss": 1.4238043149816804e-05 }, { "epoch": 0.2544987146529563, "step": 1287, "train/sim_loss": 0.0003173351287841797 }, { "epoch": 0.2544987146529563, "step": 1287, "train/total_loss": 0.000318758946377784 }, { "entropy": 5.79270076751709, "epoch": 0.25469646035198734, "mean_token_accuracy": 0.7780579924583435, "num_tokens": 57542576.0, "step": 1288, "train/ce_loss": 1.5856937170028687 }, { "epoch": 0.25469646035198734, "step": 1288, "train/sim_loss": 0.00024700164794921875 }, { "epoch": 0.25469646035198734, "step": 1288, "train/total_loss": 0.1588163822889328 }, { "entropy": 6.316830635070801, "epoch": 0.25489420605101837, "mean_token_accuracy": 0.703180193901062, "num_tokens": 57602536.0, "step": 1289, "train/ce_loss": 1.2999309301376343 }, { "epoch": 0.25489420605101837, "step": 1289, "train/sim_loss": 0.0004925727844238281 }, { "epoch": 0.25489420605101837, "step": 1289, "train/total_loss": 0.1304856687784195 }, { "entropy": 5.703634262084961, "epoch": 0.25509195175004945, "mean_token_accuracy": 0.7742165327072144, "num_tokens": 57658809.0, "step": 1290, "train/ce_loss": 0.5969295501708984 }, { "epoch": 0.25509195175004945, "step": 1290, "train/sim_loss": 0.00018715858459472656 }, { "epoch": 0.25509195175004945, "step": 1290, "train/total_loss": 0.05988011509180069 }, { "entropy": 6.2296905517578125, "epoch": 0.2552896974490805, "mean_token_accuracy": 0.7053789496421814, "num_tokens": 57716232.0, "step": 1291, "train/ce_loss": 0.8254818320274353 }, { "epoch": 0.2552896974490805, "step": 1291, "train/sim_loss": 0.00040984153747558594 }, { "epoch": 0.2552896974490805, "step": 1291, "train/total_loss": 0.08295802772045135 }, { "entropy": 6.338239669799805, "epoch": 0.2554874431481115, "mean_token_accuracy": 0.6860632300376892, "num_tokens": 57759005.0, "step": 1292, "train/ce_loss": 0.7095423936843872 }, { "epoch": 0.2554874431481115, "step": 1292, "train/sim_loss": 0.0005066394805908203 }, { "epoch": 0.2554874431481115, "step": 1292, "train/total_loss": 0.07146088033914566 }, { "entropy": 6.229414463043213, "epoch": 0.2556851888471426, "mean_token_accuracy": 0.7250274419784546, "num_tokens": 57816765.0, "step": 1293, "train/ce_loss": 1.3293912410736084 }, { "epoch": 0.2556851888471426, "step": 1293, "train/sim_loss": 0.0002855062484741211 }, { "epoch": 0.2556851888471426, "step": 1293, "train/total_loss": 0.13322463631629944 }, { "entropy": 5.2939252853393555, "epoch": 0.2558829345461736, "mean_token_accuracy": 0.7633587718009949, "num_tokens": 57838291.0, "step": 1294, "train/ce_loss": 0.7462392449378967 }, { "epoch": 0.2558829345461736, "step": 1294, "train/sim_loss": 0.000209808349609375 }, { "epoch": 0.2558829345461736, "step": 1294, "train/total_loss": 0.07483373582363129 }, { "entropy": 5.9818220138549805, "epoch": 0.25608068024520464, "mean_token_accuracy": 0.7176656126976013, "num_tokens": 57869883.0, "step": 1295, "train/ce_loss": 1.333632469177246 }, { "epoch": 0.25608068024520464, "step": 1295, "train/sim_loss": 0.0003781914710998535 }, { "epoch": 0.25608068024520464, "step": 1295, "train/total_loss": 0.13374143838882446 }, { "entropy": 6.190181732177734, "epoch": 0.2562784259442357, "mean_token_accuracy": 0.7364438772201538, "num_tokens": 57941097.0, "step": 1296, "train/ce_loss": 1.2313194274902344 }, { "epoch": 0.2562784259442357, "step": 1296, "train/sim_loss": 0.00027054548263549805 }, { "epoch": 0.2562784259442357, "step": 1296, "train/total_loss": 0.12340249121189117 }, { "entropy": 5.88594913482666, "epoch": 0.25647617164326675, "mean_token_accuracy": 0.7324841022491455, "num_tokens": 57987742.0, "step": 1297, "train/ce_loss": 0.4647485911846161 }, { "epoch": 0.25647617164326675, "step": 1297, "train/sim_loss": 0.00020432472229003906 }, { "epoch": 0.25647617164326675, "step": 1297, "train/total_loss": 0.04667918384075165 }, { "entropy": 6.441249370574951, "epoch": 0.25667391734229783, "mean_token_accuracy": 0.7121986746788025, "num_tokens": 58029685.0, "step": 1298, "train/ce_loss": 1.9317487478256226 }, { "epoch": 0.25667391734229783, "step": 1298, "train/sim_loss": 0.00022232532501220703 }, { "epoch": 0.25667391734229783, "step": 1298, "train/total_loss": 0.19339720904827118 }, { "entropy": 5.746822357177734, "epoch": 0.25687166304132886, "mean_token_accuracy": 0.7718204259872437, "num_tokens": 58066338.0, "step": 1299, "train/ce_loss": 0.49266213178634644 }, { "epoch": 0.25687166304132886, "step": 1299, "train/sim_loss": 0.00022470951080322266 }, { "epoch": 0.25687166304132886, "step": 1299, "train/total_loss": 0.049490924924612045 }, { "epoch": 0.2570694087403599, "grad_norm": 0.4081153869628906, "learning_rate": 9.362449302601642e-06, "loss": 0.0878, "step": 1300 }, { "entropy": 6.4213666915893555, "epoch": 0.2570694087403599, "mean_token_accuracy": 0.7611650228500366, "num_tokens": 58112445.0, "step": 1300, "train/ce_loss": 1.2952055931091309 }, { "epoch": 0.2570694087403599, "step": 1300, "train/sim_loss": 0.0005875229835510254 }, { "epoch": 0.2570694087403599, "step": 1300, "train/total_loss": 0.1301080882549286 }, { "entropy": 6.138303756713867, "epoch": 0.25726715443939097, "mean_token_accuracy": 0.7562130093574524, "num_tokens": 58165257.0, "step": 1301, "train/ce_loss": 0.7196416258811951 }, { "epoch": 0.25726715443939097, "step": 1301, "train/sim_loss": 0.0003655552864074707 }, { "epoch": 0.25726715443939097, "step": 1301, "train/total_loss": 0.07232972234487534 }, { "entropy": 5.969266414642334, "epoch": 0.257464900138422, "mean_token_accuracy": 0.7607934474945068, "num_tokens": 58198012.0, "step": 1302, "train/ce_loss": 1.067848801612854 }, { "epoch": 0.257464900138422, "step": 1302, "train/sim_loss": 0.00032711029052734375 }, { "epoch": 0.257464900138422, "step": 1302, "train/total_loss": 0.10711199045181274 }, { "entropy": 5.9509077072143555, "epoch": 0.257662645837453, "mean_token_accuracy": 0.795332133769989, "num_tokens": 58244263.0, "step": 1303, "train/ce_loss": 0.5198935270309448 }, { "epoch": 0.257662645837453, "step": 1303, "train/sim_loss": 0.0003113746643066406 }, { "epoch": 0.257662645837453, "step": 1303, "train/total_loss": 0.05230072885751724 }, { "entropy": 6.314679145812988, "epoch": 0.2578603915364841, "mean_token_accuracy": 0.7444444298744202, "num_tokens": 58279136.0, "step": 1304, "train/ce_loss": 0.9659485816955566 }, { "epoch": 0.2578603915364841, "step": 1304, "train/sim_loss": 0.00027561187744140625 }, { "epoch": 0.2578603915364841, "step": 1304, "train/total_loss": 0.09687047451734543 }, { "entropy": 6.383233070373535, "epoch": 0.25805813723551513, "mean_token_accuracy": 0.7294034361839294, "num_tokens": 58337625.0, "step": 1305, "train/ce_loss": 1.5179091691970825 }, { "epoch": 0.25805813723551513, "step": 1305, "train/sim_loss": 0.00028842687606811523 }, { "epoch": 0.25805813723551513, "step": 1305, "train/total_loss": 0.15207934379577637 }, { "entropy": 6.098579406738281, "epoch": 0.25825588293454615, "mean_token_accuracy": 0.7629338502883911, "num_tokens": 58384850.0, "step": 1306, "train/ce_loss": 1.1931419372558594 }, { "epoch": 0.25825588293454615, "step": 1306, "train/sim_loss": 0.0002529025077819824 }, { "epoch": 0.25825588293454615, "step": 1306, "train/total_loss": 0.11956709623336792 }, { "entropy": 5.8787736892700195, "epoch": 0.25845362863357724, "mean_token_accuracy": 0.7380484342575073, "num_tokens": 58432141.0, "step": 1307, "train/ce_loss": 1.1292500495910645 }, { "epoch": 0.25845362863357724, "step": 1307, "train/sim_loss": 0.00020647048950195312 }, { "epoch": 0.25845362863357724, "step": 1307, "train/total_loss": 0.11313147842884064 }, { "entropy": 5.877622604370117, "epoch": 0.25865137433260826, "mean_token_accuracy": 0.7585752010345459, "num_tokens": 58474208.0, "step": 1308, "train/ce_loss": 0.8630056977272034 }, { "epoch": 0.25865137433260826, "step": 1308, "train/sim_loss": 0.00032061338424682617 }, { "epoch": 0.25865137433260826, "step": 1308, "train/total_loss": 0.08662118762731552 }, { "entropy": 6.271310806274414, "epoch": 0.2588491200316393, "mean_token_accuracy": 0.7387802004814148, "num_tokens": 58518241.0, "step": 1309, "train/ce_loss": 0.6936272978782654 }, { "epoch": 0.2588491200316393, "step": 1309, "train/sim_loss": 0.00022804737091064453 }, { "epoch": 0.2588491200316393, "step": 1309, "train/total_loss": 0.06959077715873718 }, { "entropy": 5.577438831329346, "epoch": 0.2590468657306704, "mean_token_accuracy": 0.7831325531005859, "num_tokens": 58547125.0, "step": 1310, "train/ce_loss": 0.477845698595047 }, { "epoch": 0.2590468657306704, "step": 1310, "train/sim_loss": 0.00023174285888671875 }, { "epoch": 0.2590468657306704, "step": 1310, "train/total_loss": 0.04801631346344948 }, { "entropy": 6.3159379959106445, "epoch": 0.2592446114297014, "mean_token_accuracy": 0.7442207336425781, "num_tokens": 58593769.0, "step": 1311, "train/ce_loss": 0.994236946105957 }, { "epoch": 0.2592446114297014, "step": 1311, "train/sim_loss": 0.0003254413604736328 }, { "epoch": 0.2592446114297014, "step": 1311, "train/total_loss": 0.0997491404414177 }, { "entropy": 5.958308219909668, "epoch": 0.2594423571287324, "mean_token_accuracy": 0.7758726477622986, "num_tokens": 58651814.0, "step": 1312, "train/ce_loss": 0.5538480877876282 }, { "epoch": 0.2594423571287324, "step": 1312, "train/sim_loss": 0.0003402233123779297 }, { "epoch": 0.2594423571287324, "step": 1312, "train/total_loss": 0.055725034326314926 }, { "entropy": 6.272611618041992, "epoch": 0.2596401028277635, "mean_token_accuracy": 0.6979830265045166, "num_tokens": 58690152.0, "step": 1313, "train/ce_loss": 0.9527662396430969 }, { "epoch": 0.2596401028277635, "step": 1313, "train/sim_loss": 0.00035381317138671875 }, { "epoch": 0.2596401028277635, "step": 1313, "train/total_loss": 0.09563043713569641 }, { "entropy": 5.954226016998291, "epoch": 0.25983784852679453, "mean_token_accuracy": 0.7438551187515259, "num_tokens": 58733944.0, "step": 1314, "train/ce_loss": 1.3499070519173983e-05 }, { "epoch": 0.25983784852679453, "step": 1314, "train/sim_loss": 0.0003113746643066406 }, { "epoch": 0.25983784852679453, "step": 1314, "train/total_loss": 0.00031272455817088485 }, { "entropy": 5.73858642578125, "epoch": 0.26003559422582556, "mean_token_accuracy": 0.7592717409133911, "num_tokens": 58777791.0, "step": 1315, "train/ce_loss": 0.9121809005737305 }, { "epoch": 0.26003559422582556, "step": 1315, "train/sim_loss": 0.0002446174621582031 }, { "epoch": 0.26003559422582556, "step": 1315, "train/total_loss": 0.09146270900964737 }, { "entropy": 5.921746253967285, "epoch": 0.26023333992485664, "mean_token_accuracy": 0.7443249821662903, "num_tokens": 58824906.0, "step": 1316, "train/ce_loss": 0.46426501870155334 }, { "epoch": 0.26023333992485664, "step": 1316, "train/sim_loss": 0.00022614002227783203 }, { "epoch": 0.26023333992485664, "step": 1316, "train/total_loss": 0.04665264114737511 }, { "entropy": 6.136545181274414, "epoch": 0.26043108562388767, "mean_token_accuracy": 0.6989462971687317, "num_tokens": 58878654.0, "step": 1317, "train/ce_loss": 0.9630584120750427 }, { "epoch": 0.26043108562388767, "step": 1317, "train/sim_loss": 0.00031304359436035156 }, { "epoch": 0.26043108562388767, "step": 1317, "train/total_loss": 0.0966188833117485 }, { "entropy": 6.244336128234863, "epoch": 0.26062883132291875, "mean_token_accuracy": 0.7428896427154541, "num_tokens": 58924784.0, "step": 1318, "train/ce_loss": 0.5975376963615417 }, { "epoch": 0.26062883132291875, "step": 1318, "train/sim_loss": 0.00026720762252807617 }, { "epoch": 0.26062883132291875, "step": 1318, "train/total_loss": 0.06002097949385643 }, { "entropy": 6.346209526062012, "epoch": 0.2608265770219498, "mean_token_accuracy": 0.7331311702728271, "num_tokens": 58964396.0, "step": 1319, "train/ce_loss": 0.7357563376426697 }, { "epoch": 0.2608265770219498, "step": 1319, "train/sim_loss": 0.00020319223403930664 }, { "epoch": 0.2608265770219498, "step": 1319, "train/total_loss": 0.07377883046865463 }, { "epoch": 0.2610243227209808, "grad_norm": 0.6864363551139832, "learning_rate": 9.352557127312298e-06, "loss": 0.0804, "step": 1320 }, { "entropy": 6.420966148376465, "epoch": 0.2610243227209808, "mean_token_accuracy": 0.7283072471618652, "num_tokens": 59016394.0, "step": 1320, "train/ce_loss": 0.622209906578064 }, { "epoch": 0.2610243227209808, "step": 1320, "train/sim_loss": 0.00020456314086914062 }, { "epoch": 0.2610243227209808, "step": 1320, "train/total_loss": 0.06242555379867554 }, { "entropy": 5.950719833374023, "epoch": 0.2612220684200119, "mean_token_accuracy": 0.7336421608924866, "num_tokens": 59053189.0, "step": 1321, "train/ce_loss": 1.0666755437850952 }, { "epoch": 0.2612220684200119, "step": 1321, "train/sim_loss": 0.00023066997528076172 }, { "epoch": 0.2612220684200119, "step": 1321, "train/total_loss": 0.1068982258439064 }, { "entropy": 5.782868385314941, "epoch": 0.2614198141190429, "mean_token_accuracy": 0.7770154476165771, "num_tokens": 59091049.0, "step": 1322, "train/ce_loss": 0.5226247906684875 }, { "epoch": 0.2614198141190429, "step": 1322, "train/sim_loss": 0.0002409219741821289 }, { "epoch": 0.2614198141190429, "step": 1322, "train/total_loss": 0.05250340327620506 }, { "entropy": 6.117471694946289, "epoch": 0.26161755981807394, "mean_token_accuracy": 0.7546218633651733, "num_tokens": 59128603.0, "step": 1323, "train/ce_loss": 1.143065094947815 }, { "epoch": 0.26161755981807394, "step": 1323, "train/sim_loss": 0.0002847909927368164 }, { "epoch": 0.26161755981807394, "step": 1323, "train/total_loss": 0.11459130048751831 }, { "entropy": 5.859009742736816, "epoch": 0.261815305517105, "mean_token_accuracy": 0.7645348906517029, "num_tokens": 59173000.0, "step": 1324, "train/ce_loss": 1.2702845335006714 }, { "epoch": 0.261815305517105, "step": 1324, "train/sim_loss": 0.00045752525329589844 }, { "epoch": 0.261815305517105, "step": 1324, "train/total_loss": 0.1274859756231308 }, { "entropy": 6.281252861022949, "epoch": 0.26201305121613605, "mean_token_accuracy": 0.7083616852760315, "num_tokens": 59225497.0, "step": 1325, "train/ce_loss": 2.1315839290618896 }, { "epoch": 0.26201305121613605, "step": 1325, "train/sim_loss": 0.0002747178077697754 }, { "epoch": 0.26201305121613605, "step": 1325, "train/total_loss": 0.21343311667442322 }, { "entropy": 6.231717586517334, "epoch": 0.2622107969151671, "mean_token_accuracy": 0.7184409499168396, "num_tokens": 59268325.0, "step": 1326, "train/ce_loss": 1.2389147281646729 }, { "epoch": 0.2622107969151671, "step": 1326, "train/sim_loss": 0.0002588033676147461 }, { "epoch": 0.2622107969151671, "step": 1326, "train/total_loss": 0.12415027618408203 }, { "entropy": 5.929417610168457, "epoch": 0.26240854261419816, "mean_token_accuracy": 0.760789155960083, "num_tokens": 59299330.0, "step": 1327, "train/ce_loss": 0.6290435791015625 }, { "epoch": 0.26240854261419816, "step": 1327, "train/sim_loss": 0.000299990177154541 }, { "epoch": 0.26240854261419816, "step": 1327, "train/total_loss": 0.06320434808731079 }, { "entropy": 6.375881195068359, "epoch": 0.2626062883132292, "mean_token_accuracy": 0.7250274419784546, "num_tokens": 59365783.0, "step": 1328, "train/ce_loss": 0.9284252524375916 }, { "epoch": 0.2626062883132292, "step": 1328, "train/sim_loss": 0.0002168416976928711 }, { "epoch": 0.2626062883132292, "step": 1328, "train/total_loss": 0.09305936843156815 }, { "entropy": 6.1087422370910645, "epoch": 0.2628040340122602, "mean_token_accuracy": 0.7306733131408691, "num_tokens": 59407871.0, "step": 1329, "train/ce_loss": 0.7903437614440918 }, { "epoch": 0.2628040340122602, "step": 1329, "train/sim_loss": 0.00035321712493896484 }, { "epoch": 0.2628040340122602, "step": 1329, "train/total_loss": 0.0793875977396965 }, { "entropy": 6.0588059425354, "epoch": 0.2630017797112913, "mean_token_accuracy": 0.7764298319816589, "num_tokens": 59442187.0, "step": 1330, "train/ce_loss": 0.8955581784248352 }, { "epoch": 0.2630017797112913, "step": 1330, "train/sim_loss": 0.00032007694244384766 }, { "epoch": 0.2630017797112913, "step": 1330, "train/total_loss": 0.08987589925527573 }, { "entropy": 6.362881183624268, "epoch": 0.2631995254103223, "mean_token_accuracy": 0.7603349089622498, "num_tokens": 59497718.0, "step": 1331, "train/ce_loss": 1.0137184858322144 }, { "epoch": 0.2631995254103223, "step": 1331, "train/sim_loss": 0.00032275915145874023 }, { "epoch": 0.2631995254103223, "step": 1331, "train/total_loss": 0.10169460624456406 }, { "entropy": 6.302515983581543, "epoch": 0.26339727110935335, "mean_token_accuracy": 0.72817462682724, "num_tokens": 59534155.0, "step": 1332, "train/ce_loss": 0.803471565246582 }, { "epoch": 0.26339727110935335, "step": 1332, "train/sim_loss": 0.00018608570098876953 }, { "epoch": 0.26339727110935335, "step": 1332, "train/total_loss": 0.08053324371576309 }, { "entropy": 6.633875370025635, "epoch": 0.26359501680838443, "mean_token_accuracy": 0.7007299065589905, "num_tokens": 59581649.0, "step": 1333, "train/ce_loss": 1.8258558511734009 }, { "epoch": 0.26359501680838443, "step": 1333, "train/sim_loss": 0.0002543926239013672 }, { "epoch": 0.26359501680838443, "step": 1333, "train/total_loss": 0.18283997476100922 }, { "entropy": 6.4653849601745605, "epoch": 0.26379276250741546, "mean_token_accuracy": 0.7173610925674438, "num_tokens": 59627601.0, "step": 1334, "train/ce_loss": 0.6507107019424438 }, { "epoch": 0.26379276250741546, "step": 1334, "train/sim_loss": 0.00027573108673095703 }, { "epoch": 0.26379276250741546, "step": 1334, "train/total_loss": 0.06534679979085922 }, { "entropy": 5.757517337799072, "epoch": 0.2639905082064465, "mean_token_accuracy": 0.7795893549919128, "num_tokens": 59664495.0, "step": 1335, "train/ce_loss": 0.7920358777046204 }, { "epoch": 0.2639905082064465, "step": 1335, "train/sim_loss": 0.00032263994216918945 }, { "epoch": 0.2639905082064465, "step": 1335, "train/total_loss": 0.07952623069286346 }, { "entropy": 5.985734939575195, "epoch": 0.26418825390547757, "mean_token_accuracy": 0.7261098623275757, "num_tokens": 59695126.0, "step": 1336, "train/ce_loss": 1.7828788757324219 }, { "epoch": 0.26418825390547757, "step": 1336, "train/sim_loss": 0.00026738643646240234 }, { "epoch": 0.26418825390547757, "step": 1336, "train/total_loss": 0.17855527997016907 }, { "entropy": 6.253944396972656, "epoch": 0.2643859996045086, "mean_token_accuracy": 0.7226480841636658, "num_tokens": 59741985.0, "step": 1337, "train/ce_loss": 0.6657653450965881 }, { "epoch": 0.2643859996045086, "step": 1337, "train/sim_loss": 0.0002827644348144531 }, { "epoch": 0.2643859996045086, "step": 1337, "train/total_loss": 0.06685929745435715 }, { "entropy": 5.967124938964844, "epoch": 0.2645837453035397, "mean_token_accuracy": 0.7378013134002686, "num_tokens": 59782787.0, "step": 1338, "train/ce_loss": 1.587923526763916 }, { "epoch": 0.2645837453035397, "step": 1338, "train/sim_loss": 0.0004928112030029297 }, { "epoch": 0.2645837453035397, "step": 1338, "train/total_loss": 0.15928517282009125 }, { "entropy": 5.852090835571289, "epoch": 0.2647814910025707, "mean_token_accuracy": 0.7722488045692444, "num_tokens": 59831713.0, "step": 1339, "train/ce_loss": 0.33183005452156067 }, { "epoch": 0.2647814910025707, "step": 1339, "train/sim_loss": 0.00024366378784179688 }, { "epoch": 0.2647814910025707, "step": 1339, "train/total_loss": 0.033426668494939804 }, { "epoch": 0.26497923670160173, "grad_norm": 0.3704085350036621, "learning_rate": 9.342664952022951e-06, "loss": 0.0824, "step": 1340 }, { "entropy": 6.289078712463379, "epoch": 0.26497923670160173, "mean_token_accuracy": 0.7489014267921448, "num_tokens": 59878800.0, "step": 1340, "train/ce_loss": 0.6070172190666199 }, { "epoch": 0.26497923670160173, "step": 1340, "train/sim_loss": 0.0002525448799133301 }, { "epoch": 0.26497923670160173, "step": 1340, "train/total_loss": 0.060954269021749496 }, { "entropy": 5.938503742218018, "epoch": 0.2651769824006328, "mean_token_accuracy": 0.7033898234367371, "num_tokens": 59927382.0, "step": 1341, "train/ce_loss": 1.3991137742996216 }, { "epoch": 0.2651769824006328, "step": 1341, "train/sim_loss": 0.00036722421646118164 }, { "epoch": 0.2651769824006328, "step": 1341, "train/total_loss": 0.14027860760688782 }, { "entropy": 5.8275251388549805, "epoch": 0.26537472809966384, "mean_token_accuracy": 0.7165924310684204, "num_tokens": 59971008.0, "step": 1342, "train/ce_loss": 0.9583948850631714 }, { "epoch": 0.26537472809966384, "step": 1342, "train/sim_loss": 0.0003762245178222656 }, { "epoch": 0.26537472809966384, "step": 1342, "train/total_loss": 0.09621571749448776 }, { "entropy": 5.894725322723389, "epoch": 0.26557247379869486, "mean_token_accuracy": 0.7832576036453247, "num_tokens": 60031426.0, "step": 1343, "train/ce_loss": 0.7346280217170715 }, { "epoch": 0.26557247379869486, "step": 1343, "train/sim_loss": 0.00019127130508422852 }, { "epoch": 0.26557247379869486, "step": 1343, "train/total_loss": 0.07365407794713974 }, { "entropy": 5.865812301635742, "epoch": 0.26577021949772595, "mean_token_accuracy": 0.7231777310371399, "num_tokens": 60061957.0, "step": 1344, "train/ce_loss": 0.7600436806678772 }, { "epoch": 0.26577021949772595, "step": 1344, "train/sim_loss": 0.0002067089080810547 }, { "epoch": 0.26577021949772595, "step": 1344, "train/total_loss": 0.07621107995510101 }, { "entropy": 5.840634346008301, "epoch": 0.265967965196757, "mean_token_accuracy": 0.7806574106216431, "num_tokens": 60102563.0, "step": 1345, "train/ce_loss": 1.407803483743919e-05 }, { "epoch": 0.265967965196757, "step": 1345, "train/sim_loss": 0.00020194053649902344 }, { "epoch": 0.265967965196757, "step": 1345, "train/total_loss": 0.00020334834698587656 }, { "entropy": 6.201423645019531, "epoch": 0.266165710895788, "mean_token_accuracy": 0.71656334400177, "num_tokens": 60148971.0, "step": 1346, "train/ce_loss": 0.781093955039978 }, { "epoch": 0.266165710895788, "step": 1346, "train/sim_loss": 0.0002593398094177246 }, { "epoch": 0.266165710895788, "step": 1346, "train/total_loss": 0.07836873829364777 }, { "entropy": 6.027127742767334, "epoch": 0.2663634565948191, "mean_token_accuracy": 0.7191079258918762, "num_tokens": 60184299.0, "step": 1347, "train/ce_loss": 0.32918816804885864 }, { "epoch": 0.2663634565948191, "step": 1347, "train/sim_loss": 0.0002771615982055664 }, { "epoch": 0.2663634565948191, "step": 1347, "train/total_loss": 0.03319597989320755 }, { "entropy": 6.15208625793457, "epoch": 0.2665612022938501, "mean_token_accuracy": 0.7348684072494507, "num_tokens": 60239196.0, "step": 1348, "train/ce_loss": 0.48299190402030945 }, { "epoch": 0.2665612022938501, "step": 1348, "train/sim_loss": 0.00023740530014038086 }, { "epoch": 0.2665612022938501, "step": 1348, "train/total_loss": 0.048536594957113266 }, { "entropy": 6.103736400604248, "epoch": 0.26675894799288113, "mean_token_accuracy": 0.7480550408363342, "num_tokens": 60265703.0, "step": 1349, "train/ce_loss": 1.1562050580978394 }, { "epoch": 0.26675894799288113, "step": 1349, "train/sim_loss": 0.00024318695068359375 }, { "epoch": 0.26675894799288113, "step": 1349, "train/total_loss": 0.11586369574069977 }, { "entropy": 6.069911479949951, "epoch": 0.2669566936919122, "mean_token_accuracy": 0.7397660613059998, "num_tokens": 60322486.0, "step": 1350, "train/ce_loss": 0.5336400270462036 }, { "epoch": 0.2669566936919122, "step": 1350, "train/sim_loss": 0.0002906322479248047 }, { "epoch": 0.2669566936919122, "step": 1350, "train/total_loss": 0.053654637187719345 }, { "entropy": 6.359891891479492, "epoch": 0.26715443939094324, "mean_token_accuracy": 0.7544615268707275, "num_tokens": 60378315.0, "step": 1351, "train/ce_loss": 0.5258625149726868 }, { "epoch": 0.26715443939094324, "step": 1351, "train/sim_loss": 0.0003567337989807129 }, { "epoch": 0.26715443939094324, "step": 1351, "train/total_loss": 0.05294298753142357 }, { "entropy": 6.270388603210449, "epoch": 0.26735218508997427, "mean_token_accuracy": 0.7365911602973938, "num_tokens": 60416002.0, "step": 1352, "train/ce_loss": 0.7681063413619995 }, { "epoch": 0.26735218508997427, "step": 1352, "train/sim_loss": 0.00031065940856933594 }, { "epoch": 0.26735218508997427, "step": 1352, "train/total_loss": 0.0771212950348854 }, { "entropy": 6.206828594207764, "epoch": 0.26754993078900535, "mean_token_accuracy": 0.7062807679176331, "num_tokens": 60462549.0, "step": 1353, "train/ce_loss": 1.3601770401000977 }, { "epoch": 0.26754993078900535, "step": 1353, "train/sim_loss": 0.0002492666244506836 }, { "epoch": 0.26754993078900535, "step": 1353, "train/total_loss": 0.13626697659492493 }, { "entropy": 5.954756736755371, "epoch": 0.2677476764880364, "mean_token_accuracy": 0.7648809552192688, "num_tokens": 60508890.0, "step": 1354, "train/ce_loss": 0.9473959803581238 }, { "epoch": 0.2677476764880364, "step": 1354, "train/sim_loss": 0.0002645254135131836 }, { "epoch": 0.2677476764880364, "step": 1354, "train/total_loss": 0.0950041264295578 }, { "entropy": 6.382312774658203, "epoch": 0.2679454221870674, "mean_token_accuracy": 0.7585539221763611, "num_tokens": 60559853.0, "step": 1355, "train/ce_loss": 0.42190396785736084 }, { "epoch": 0.2679454221870674, "step": 1355, "train/sim_loss": 0.00018745660781860352 }, { "epoch": 0.2679454221870674, "step": 1355, "train/total_loss": 0.04237785562872887 }, { "entropy": 5.994254112243652, "epoch": 0.2681431678860985, "mean_token_accuracy": 0.7468434572219849, "num_tokens": 60598080.0, "step": 1356, "train/ce_loss": 0.5108550190925598 }, { "epoch": 0.2681431678860985, "step": 1356, "train/sim_loss": 0.0002740621566772461 }, { "epoch": 0.2681431678860985, "step": 1356, "train/total_loss": 0.05135956406593323 }, { "entropy": 6.226233959197998, "epoch": 0.2683409135851295, "mean_token_accuracy": 0.7225988507270813, "num_tokens": 60647138.0, "step": 1357, "train/ce_loss": 1.4839868545532227 }, { "epoch": 0.2683409135851295, "step": 1357, "train/sim_loss": 0.0003104209899902344 }, { "epoch": 0.2683409135851295, "step": 1357, "train/total_loss": 0.14870910346508026 }, { "entropy": 5.702395439147949, "epoch": 0.2685386592841606, "mean_token_accuracy": 0.7674169540405273, "num_tokens": 60691909.0, "step": 1358, "train/ce_loss": 0.4122723937034607 }, { "epoch": 0.2685386592841606, "step": 1358, "train/sim_loss": 0.0002434849739074707 }, { "epoch": 0.2685386592841606, "step": 1358, "train/total_loss": 0.0414707250893116 }, { "entropy": 6.148886680603027, "epoch": 0.2687364049831916, "mean_token_accuracy": 0.6930473446846008, "num_tokens": 60738116.0, "step": 1359, "train/ce_loss": 1.0965486764907837 }, { "epoch": 0.2687364049831916, "step": 1359, "train/sim_loss": 0.0004233121871948242 }, { "epoch": 0.2687364049831916, "step": 1359, "train/total_loss": 0.11007817834615707 }, { "epoch": 0.26893415068222265, "grad_norm": 0.5233904719352722, "learning_rate": 9.332772776733604e-06, "loss": 0.0846, "step": 1360 }, { "entropy": 6.292816162109375, "epoch": 0.26893415068222265, "mean_token_accuracy": 0.7706649899482727, "num_tokens": 60797062.0, "step": 1360, "train/ce_loss": 0.5395644903182983 }, { "epoch": 0.26893415068222265, "step": 1360, "train/sim_loss": 0.00019997358322143555 }, { "epoch": 0.26893415068222265, "step": 1360, "train/total_loss": 0.05415642261505127 }, { "entropy": 6.435740947723389, "epoch": 0.26913189638125373, "mean_token_accuracy": 0.7250859141349792, "num_tokens": 60862723.0, "step": 1361, "train/ce_loss": 0.31629225611686707 }, { "epoch": 0.26913189638125373, "step": 1361, "train/sim_loss": 0.00025910139083862305 }, { "epoch": 0.26913189638125373, "step": 1361, "train/total_loss": 0.03188832849264145 }, { "entropy": 6.065546989440918, "epoch": 0.26932964208028476, "mean_token_accuracy": 0.7607510685920715, "num_tokens": 60906788.0, "step": 1362, "train/ce_loss": 0.6397671699523926 }, { "epoch": 0.26932964208028476, "step": 1362, "train/sim_loss": 0.00026541948318481445 }, { "epoch": 0.26932964208028476, "step": 1362, "train/total_loss": 0.06424213945865631 }, { "entropy": 6.3836259841918945, "epoch": 0.2695273877793158, "mean_token_accuracy": 0.7370588183403015, "num_tokens": 60953182.0, "step": 1363, "train/ce_loss": 0.5570991635322571 }, { "epoch": 0.2695273877793158, "step": 1363, "train/sim_loss": 0.000250399112701416 }, { "epoch": 0.2695273877793158, "step": 1363, "train/total_loss": 0.055960316210985184 }, { "entropy": 6.048716068267822, "epoch": 0.26972513347834687, "mean_token_accuracy": 0.7350771427154541, "num_tokens": 60997441.0, "step": 1364, "train/ce_loss": 1.48622993947356e-05 }, { "epoch": 0.26972513347834687, "step": 1364, "train/sim_loss": 0.00019180774688720703 }, { "epoch": 0.26972513347834687, "step": 1364, "train/total_loss": 0.00019329397764522582 }, { "entropy": 6.051727294921875, "epoch": 0.2699228791773779, "mean_token_accuracy": 0.7454751133918762, "num_tokens": 61046289.0, "step": 1365, "train/ce_loss": 0.6880075931549072 }, { "epoch": 0.2699228791773779, "step": 1365, "train/sim_loss": 0.00034934282302856445 }, { "epoch": 0.2699228791773779, "step": 1365, "train/total_loss": 0.06915010511875153 }, { "entropy": 6.421989440917969, "epoch": 0.2701206248764089, "mean_token_accuracy": 0.7367506623268127, "num_tokens": 61095981.0, "step": 1366, "train/ce_loss": 0.8941583633422852 }, { "epoch": 0.2701206248764089, "step": 1366, "train/sim_loss": 0.0002747774124145508 }, { "epoch": 0.2701206248764089, "step": 1366, "train/total_loss": 0.08969061821699142 }, { "entropy": 5.98145055770874, "epoch": 0.27031837057544, "mean_token_accuracy": 0.7563636302947998, "num_tokens": 61144585.0, "step": 1367, "train/ce_loss": 1.2577928304672241 }, { "epoch": 0.27031837057544, "step": 1367, "train/sim_loss": 0.00043463706970214844 }, { "epoch": 0.27031837057544, "step": 1367, "train/total_loss": 0.1262139230966568 }, { "entropy": 6.247781276702881, "epoch": 0.27051611627447103, "mean_token_accuracy": 0.7488452792167664, "num_tokens": 61181129.0, "step": 1368, "train/ce_loss": 0.6106777191162109 }, { "epoch": 0.27051611627447103, "step": 1368, "train/sim_loss": 0.0002110004425048828 }, { "epoch": 0.27051611627447103, "step": 1368, "train/total_loss": 0.06127877160906792 }, { "entropy": 6.317516803741455, "epoch": 0.27071386197350206, "mean_token_accuracy": 0.737574577331543, "num_tokens": 61223450.0, "step": 1369, "train/ce_loss": 1.4925047159194946 }, { "epoch": 0.27071386197350206, "step": 1369, "train/sim_loss": 0.0013512372970581055 }, { "epoch": 0.27071386197350206, "step": 1369, "train/total_loss": 0.15060171484947205 }, { "entropy": 6.289669036865234, "epoch": 0.27091160767253314, "mean_token_accuracy": 0.7411392331123352, "num_tokens": 61257095.0, "step": 1370, "train/ce_loss": 0.616107165813446 }, { "epoch": 0.27091160767253314, "step": 1370, "train/sim_loss": 0.00020015239715576172 }, { "epoch": 0.27091160767253314, "step": 1370, "train/total_loss": 0.061810869723558426 }, { "entropy": 5.71242618560791, "epoch": 0.27110935337156417, "mean_token_accuracy": 0.7532838582992554, "num_tokens": 61293006.0, "step": 1371, "train/ce_loss": 0.6007556915283203 }, { "epoch": 0.27110935337156417, "step": 1371, "train/sim_loss": 0.00021636486053466797 }, { "epoch": 0.27110935337156417, "step": 1371, "train/total_loss": 0.06029193475842476 }, { "entropy": 6.399554252624512, "epoch": 0.2713070990705952, "mean_token_accuracy": 0.7336480021476746, "num_tokens": 61346940.0, "step": 1372, "train/ce_loss": 0.9628363251686096 }, { "epoch": 0.2713070990705952, "step": 1372, "train/sim_loss": 0.0003769397735595703 }, { "epoch": 0.2713070990705952, "step": 1372, "train/total_loss": 0.09666057676076889 }, { "entropy": 6.070538520812988, "epoch": 0.2715048447696263, "mean_token_accuracy": 0.7230246663093567, "num_tokens": 61396442.0, "step": 1373, "train/ce_loss": 1.5727644495200366e-05 }, { "epoch": 0.2715048447696263, "step": 1373, "train/sim_loss": 0.00021147727966308594 }, { "epoch": 0.2715048447696263, "step": 1373, "train/total_loss": 0.00021305005066096783 }, { "entropy": 6.054449081420898, "epoch": 0.2717025904686573, "mean_token_accuracy": 0.7436201572418213, "num_tokens": 61457957.0, "step": 1374, "train/ce_loss": 1.0261921882629395 }, { "epoch": 0.2717025904686573, "step": 1374, "train/sim_loss": 0.0003148317337036133 }, { "epoch": 0.2717025904686573, "step": 1374, "train/total_loss": 0.10293405503034592 }, { "entropy": 6.253711700439453, "epoch": 0.27190033616768833, "mean_token_accuracy": 0.7683352828025818, "num_tokens": 61503893.0, "step": 1375, "train/ce_loss": 1.7619853679207154e-05 }, { "epoch": 0.27190033616768833, "step": 1375, "train/sim_loss": 0.000279843807220459 }, { "epoch": 0.27190033616768833, "step": 1375, "train/total_loss": 0.0002816057822201401 }, { "entropy": 5.915518760681152, "epoch": 0.2720980818667194, "mean_token_accuracy": 0.7664042115211487, "num_tokens": 61539856.0, "step": 1376, "train/ce_loss": 0.9755957126617432 }, { "epoch": 0.2720980818667194, "step": 1376, "train/sim_loss": 0.00030791759490966797 }, { "epoch": 0.2720980818667194, "step": 1376, "train/total_loss": 0.09786748886108398 }, { "entropy": 5.932598114013672, "epoch": 0.27229582756575044, "mean_token_accuracy": 0.7266233563423157, "num_tokens": 61593833.0, "step": 1377, "train/ce_loss": 0.5503478646278381 }, { "epoch": 0.27229582756575044, "step": 1377, "train/sim_loss": 0.00030815601348876953 }, { "epoch": 0.27229582756575044, "step": 1377, "train/total_loss": 0.05534294247627258 }, { "entropy": 5.900007247924805, "epoch": 0.27249357326478146, "mean_token_accuracy": 0.7630952596664429, "num_tokens": 61643572.0, "step": 1378, "train/ce_loss": 0.6552371978759766 }, { "epoch": 0.27249357326478146, "step": 1378, "train/sim_loss": 0.0002549290657043457 }, { "epoch": 0.27249357326478146, "step": 1378, "train/total_loss": 0.06577865034341812 }, { "entropy": 5.658809661865234, "epoch": 0.27269131896381255, "mean_token_accuracy": 0.7510620355606079, "num_tokens": 61686966.0, "step": 1379, "train/ce_loss": 1.7848893404006958 }, { "epoch": 0.27269131896381255, "step": 1379, "train/sim_loss": 0.00021332502365112305 }, { "epoch": 0.27269131896381255, "step": 1379, "train/total_loss": 0.17870226502418518 }, { "epoch": 0.2728890646628436, "grad_norm": 0.4981587529182434, "learning_rate": 9.322880601444259e-06, "loss": 0.0804, "step": 1380 }, { "entropy": 5.787121772766113, "epoch": 0.2728890646628436, "mean_token_accuracy": 0.6885578632354736, "num_tokens": 61724816.0, "step": 1380, "train/ce_loss": 1.6117253303527832 }, { "epoch": 0.2728890646628436, "step": 1380, "train/sim_loss": 0.0003090500831604004 }, { "epoch": 0.2728890646628436, "step": 1380, "train/total_loss": 0.1614815890789032 }, { "entropy": 6.231237411499023, "epoch": 0.27308681036187465, "mean_token_accuracy": 0.7282378077507019, "num_tokens": 61762733.0, "step": 1381, "train/ce_loss": 0.7178558707237244 }, { "epoch": 0.27308681036187465, "step": 1381, "train/sim_loss": 0.00033289194107055664 }, { "epoch": 0.27308681036187465, "step": 1381, "train/total_loss": 0.07211848348379135 }, { "entropy": 5.944671154022217, "epoch": 0.2732845560609057, "mean_token_accuracy": 0.7332233190536499, "num_tokens": 61819228.0, "step": 1382, "train/ce_loss": 0.5611568093299866 }, { "epoch": 0.2732845560609057, "step": 1382, "train/sim_loss": 0.0001875162124633789 }, { "epoch": 0.2732845560609057, "step": 1382, "train/total_loss": 0.056303199380636215 }, { "entropy": 6.008012771606445, "epoch": 0.2734823017599367, "mean_token_accuracy": 0.7330439686775208, "num_tokens": 61871317.0, "step": 1383, "train/ce_loss": 0.6517830491065979 }, { "epoch": 0.2734823017599367, "step": 1383, "train/sim_loss": 0.00033038854598999023 }, { "epoch": 0.2734823017599367, "step": 1383, "train/total_loss": 0.06550869345664978 }, { "entropy": 6.291975021362305, "epoch": 0.2736800474589678, "mean_token_accuracy": 0.7562833428382874, "num_tokens": 61911708.0, "step": 1384, "train/ce_loss": 0.8824570775032043 }, { "epoch": 0.2736800474589678, "step": 1384, "train/sim_loss": 0.0004266500473022461 }, { "epoch": 0.2736800474589678, "step": 1384, "train/total_loss": 0.08867236226797104 }, { "entropy": 6.354132652282715, "epoch": 0.2738777931579988, "mean_token_accuracy": 0.7431620955467224, "num_tokens": 61966138.0, "step": 1385, "train/ce_loss": 1.5758882761001587 }, { "epoch": 0.2738777931579988, "step": 1385, "train/sim_loss": 0.00026994943618774414 }, { "epoch": 0.2738777931579988, "step": 1385, "train/total_loss": 0.15785877406597137 }, { "entropy": 5.87414026260376, "epoch": 0.27407553885702984, "mean_token_accuracy": 0.7397698163986206, "num_tokens": 61993413.0, "step": 1386, "train/ce_loss": 0.4033581614494324 }, { "epoch": 0.27407553885702984, "step": 1386, "train/sim_loss": 0.0002428889274597168 }, { "epoch": 0.27407553885702984, "step": 1386, "train/total_loss": 0.040578704327344894 }, { "entropy": 6.41604471206665, "epoch": 0.2742732845560609, "mean_token_accuracy": 0.7270220518112183, "num_tokens": 62045302.0, "step": 1387, "train/ce_loss": 1.6757650882937014e-05 }, { "epoch": 0.2742732845560609, "step": 1387, "train/sim_loss": 0.00031828880310058594 }, { "epoch": 0.2742732845560609, "step": 1387, "train/total_loss": 0.0003199645725544542 }, { "entropy": 6.223320007324219, "epoch": 0.27447103025509195, "mean_token_accuracy": 0.7312373518943787, "num_tokens": 62082340.0, "step": 1388, "train/ce_loss": 1.6301006078720093 }, { "epoch": 0.27447103025509195, "step": 1388, "train/sim_loss": 0.00033092498779296875 }, { "epoch": 0.27447103025509195, "step": 1388, "train/total_loss": 0.1633409857749939 }, { "entropy": 5.442910194396973, "epoch": 0.274668775954123, "mean_token_accuracy": 0.7678265571594238, "num_tokens": 62125201.0, "step": 1389, "train/ce_loss": 0.8043228983879089 }, { "epoch": 0.274668775954123, "step": 1389, "train/sim_loss": 0.00022208690643310547 }, { "epoch": 0.274668775954123, "step": 1389, "train/total_loss": 0.08065437525510788 }, { "entropy": 6.253296852111816, "epoch": 0.27486652165315406, "mean_token_accuracy": 0.7373678088188171, "num_tokens": 62170140.0, "step": 1390, "train/ce_loss": 0.7942336797714233 }, { "epoch": 0.27486652165315406, "step": 1390, "train/sim_loss": 0.0003923177719116211 }, { "epoch": 0.27486652165315406, "step": 1390, "train/total_loss": 0.07981568574905396 }, { "entropy": 6.449943542480469, "epoch": 0.2750642673521851, "mean_token_accuracy": 0.7494089603424072, "num_tokens": 62219809.0, "step": 1391, "train/ce_loss": 1.611235893506091e-05 }, { "epoch": 0.2750642673521851, "step": 1391, "train/sim_loss": 0.00032514333724975586 }, { "epoch": 0.2750642673521851, "step": 1391, "train/total_loss": 0.00032675458351150155 }, { "entropy": 5.839452743530273, "epoch": 0.2752620130512161, "mean_token_accuracy": 0.7373102903366089, "num_tokens": 62258764.0, "step": 1392, "train/ce_loss": 0.6881932616233826 }, { "epoch": 0.2752620130512161, "step": 1392, "train/sim_loss": 0.0001970529556274414 }, { "epoch": 0.2752620130512161, "step": 1392, "train/total_loss": 0.06901638209819794 }, { "entropy": 6.244833946228027, "epoch": 0.2754597587502472, "mean_token_accuracy": 0.7162790894508362, "num_tokens": 62309119.0, "step": 1393, "train/ce_loss": 1.4514369468088262e-05 }, { "epoch": 0.2754597587502472, "step": 1393, "train/sim_loss": 0.000308990478515625 }, { "epoch": 0.2754597587502472, "step": 1393, "train/total_loss": 0.00031044191564433277 }, { "entropy": 6.296979904174805, "epoch": 0.2756575044492782, "mean_token_accuracy": 0.7317763566970825, "num_tokens": 62348614.0, "step": 1394, "train/ce_loss": 1.3974946737289429 }, { "epoch": 0.2756575044492782, "step": 1394, "train/sim_loss": 0.0001805424690246582 }, { "epoch": 0.2756575044492782, "step": 1394, "train/total_loss": 0.13993000984191895 }, { "entropy": 6.127070426940918, "epoch": 0.27585525014830925, "mean_token_accuracy": 0.7695473432540894, "num_tokens": 62399636.0, "step": 1395, "train/ce_loss": 1.1526918411254883 }, { "epoch": 0.27585525014830925, "step": 1395, "train/sim_loss": 0.00018858909606933594 }, { "epoch": 0.27585525014830925, "step": 1395, "train/total_loss": 0.11545777320861816 }, { "entropy": 5.795981407165527, "epoch": 0.27605299584734033, "mean_token_accuracy": 0.7801014184951782, "num_tokens": 62431454.0, "step": 1396, "train/ce_loss": 0.6496817469596863 }, { "epoch": 0.27605299584734033, "step": 1396, "train/sim_loss": 0.00027310848236083984 }, { "epoch": 0.27605299584734033, "step": 1396, "train/total_loss": 0.06524128466844559 }, { "entropy": 5.988924980163574, "epoch": 0.27625074154637136, "mean_token_accuracy": 0.7583333253860474, "num_tokens": 62481871.0, "step": 1397, "train/ce_loss": 0.46986618638038635 }, { "epoch": 0.27625074154637136, "step": 1397, "train/sim_loss": 0.00040096044540405273 }, { "epoch": 0.27625074154637136, "step": 1397, "train/total_loss": 0.04738758131861687 }, { "entropy": 6.2803144454956055, "epoch": 0.2764484872454024, "mean_token_accuracy": 0.7153996229171753, "num_tokens": 62529733.0, "step": 1398, "train/ce_loss": 0.7618244886398315 }, { "epoch": 0.2764484872454024, "step": 1398, "train/sim_loss": 0.00031751394271850586 }, { "epoch": 0.2764484872454024, "step": 1398, "train/total_loss": 0.07649996131658554 }, { "entropy": 5.976651668548584, "epoch": 0.27664623294443347, "mean_token_accuracy": 0.7334348559379578, "num_tokens": 62560550.0, "step": 1399, "train/ce_loss": 0.9461105465888977 }, { "epoch": 0.27664623294443347, "step": 1399, "train/sim_loss": 0.00019037723541259766 }, { "epoch": 0.27664623294443347, "step": 1399, "train/total_loss": 0.09480143338441849 }, { "epoch": 0.2768439786434645, "grad_norm": 0.48993703722953796, "learning_rate": 9.312988426154913e-06, "loss": 0.0826, "step": 1400 }, { "entropy": 5.992923736572266, "epoch": 0.2768439786434645, "mean_token_accuracy": 0.7256823182106018, "num_tokens": 62604268.0, "step": 1400, "train/ce_loss": 0.4261343777179718 }, { "epoch": 0.2768439786434645, "step": 1400, "train/sim_loss": 0.00029277801513671875 }, { "epoch": 0.2768439786434645, "step": 1400, "train/total_loss": 0.04290621727705002 }, { "entropy": 5.915910720825195, "epoch": 0.2770417243424956, "mean_token_accuracy": 0.7546296119689941, "num_tokens": 62649243.0, "step": 1401, "train/ce_loss": 1.106372356414795 }, { "epoch": 0.2770417243424956, "step": 1401, "train/sim_loss": 0.00032138824462890625 }, { "epoch": 0.2770417243424956, "step": 1401, "train/total_loss": 0.11095862835645676 }, { "entropy": 6.118398666381836, "epoch": 0.2772394700415266, "mean_token_accuracy": 0.7428343892097473, "num_tokens": 62696255.0, "step": 1402, "train/ce_loss": 1.2982144653506111e-05 }, { "epoch": 0.2772394700415266, "step": 1402, "train/sim_loss": 0.0002875328063964844 }, { "epoch": 0.2772394700415266, "step": 1402, "train/total_loss": 0.00028883101185783744 }, { "entropy": 5.956717014312744, "epoch": 0.27743721574055763, "mean_token_accuracy": 0.732692301273346, "num_tokens": 62744325.0, "step": 1403, "train/ce_loss": 0.5516884922981262 }, { "epoch": 0.27743721574055763, "step": 1403, "train/sim_loss": 0.00029963254928588867 }, { "epoch": 0.27743721574055763, "step": 1403, "train/total_loss": 0.05546848103404045 }, { "entropy": 5.446832656860352, "epoch": 0.2776349614395887, "mean_token_accuracy": 0.789650559425354, "num_tokens": 62779167.0, "step": 1404, "train/ce_loss": 0.668845534324646 }, { "epoch": 0.2776349614395887, "step": 1404, "train/sim_loss": 0.00037729740142822266 }, { "epoch": 0.2776349614395887, "step": 1404, "train/total_loss": 0.06726185232400894 }, { "entropy": 5.8923773765563965, "epoch": 0.27783270713861974, "mean_token_accuracy": 0.7249022126197815, "num_tokens": 62823257.0, "step": 1405, "train/ce_loss": 1.2722424268722534 }, { "epoch": 0.27783270713861974, "step": 1405, "train/sim_loss": 0.00026935338973999023 }, { "epoch": 0.27783270713861974, "step": 1405, "train/total_loss": 0.12749360501766205 }, { "entropy": 5.596715927124023, "epoch": 0.27803045283765077, "mean_token_accuracy": 0.8043217062950134, "num_tokens": 62848009.0, "step": 1406, "train/ce_loss": 0.7534134387969971 }, { "epoch": 0.27803045283765077, "step": 1406, "train/sim_loss": 0.00020879507064819336 }, { "epoch": 0.27803045283765077, "step": 1406, "train/total_loss": 0.0755501389503479 }, { "entropy": 6.01859712600708, "epoch": 0.27822819853668185, "mean_token_accuracy": 0.733371913433075, "num_tokens": 62903753.0, "step": 1407, "train/ce_loss": 0.5718063712120056 }, { "epoch": 0.27822819853668185, "step": 1407, "train/sim_loss": 0.0002492666244506836 }, { "epoch": 0.27822819853668185, "step": 1407, "train/total_loss": 0.057429905980825424 }, { "entropy": 5.870102882385254, "epoch": 0.2784259442357129, "mean_token_accuracy": 0.7549669146537781, "num_tokens": 62952236.0, "step": 1408, "train/ce_loss": 1.0341897010803223 }, { "epoch": 0.2784259442357129, "step": 1408, "train/sim_loss": 0.00025093555450439453 }, { "epoch": 0.2784259442357129, "step": 1408, "train/total_loss": 0.1036699041724205 }, { "entropy": 6.150113105773926, "epoch": 0.2786236899347439, "mean_token_accuracy": 0.7013986110687256, "num_tokens": 62997797.0, "step": 1409, "train/ce_loss": 1.1609407663345337 }, { "epoch": 0.2786236899347439, "step": 1409, "train/sim_loss": 0.00020372867584228516 }, { "epoch": 0.2786236899347439, "step": 1409, "train/total_loss": 0.11629780381917953 }, { "entropy": 5.954914569854736, "epoch": 0.278821435633775, "mean_token_accuracy": 0.7541322112083435, "num_tokens": 63035619.0, "step": 1410, "train/ce_loss": 1.5484540462493896 }, { "epoch": 0.278821435633775, "step": 1410, "train/sim_loss": 0.00034677982330322266 }, { "epoch": 0.278821435633775, "step": 1410, "train/total_loss": 0.15519218146800995 }, { "entropy": 6.087646484375, "epoch": 0.279019181332806, "mean_token_accuracy": 0.7615511417388916, "num_tokens": 63072132.0, "step": 1411, "train/ce_loss": 2.9104137420654297 }, { "epoch": 0.279019181332806, "step": 1411, "train/sim_loss": 0.00032329559326171875 }, { "epoch": 0.279019181332806, "step": 1411, "train/total_loss": 0.2913646697998047 }, { "entropy": 6.141111850738525, "epoch": 0.27921692703183704, "mean_token_accuracy": 0.7242251634597778, "num_tokens": 63116979.0, "step": 1412, "train/ce_loss": 1.0956629921565764e-05 }, { "epoch": 0.27921692703183704, "step": 1412, "train/sim_loss": 0.0003504753112792969 }, { "epoch": 0.27921692703183704, "step": 1412, "train/total_loss": 0.00035157098318450153 }, { "entropy": 6.1542439460754395, "epoch": 0.2794146727308681, "mean_token_accuracy": 0.6829710006713867, "num_tokens": 63170141.0, "step": 1413, "train/ce_loss": 1.7890115976333618 }, { "epoch": 0.2794146727308681, "step": 1413, "train/sim_loss": 0.00021147727966308594 }, { "epoch": 0.2794146727308681, "step": 1413, "train/total_loss": 0.17911264300346375 }, { "entropy": 6.173246383666992, "epoch": 0.27961241842989915, "mean_token_accuracy": 0.7514124512672424, "num_tokens": 63210553.0, "step": 1414, "train/ce_loss": 0.7665231823921204 }, { "epoch": 0.27961241842989915, "step": 1414, "train/sim_loss": 0.0003802776336669922 }, { "epoch": 0.27961241842989915, "step": 1414, "train/total_loss": 0.07703259587287903 }, { "entropy": 5.966665744781494, "epoch": 0.27981016412893017, "mean_token_accuracy": 0.7069737911224365, "num_tokens": 63259869.0, "step": 1415, "train/ce_loss": 1.0489590167999268 }, { "epoch": 0.27981016412893017, "step": 1415, "train/sim_loss": 0.0001977086067199707 }, { "epoch": 0.27981016412893017, "step": 1415, "train/total_loss": 0.10509361326694489 }, { "entropy": 6.106112480163574, "epoch": 0.28000790982796125, "mean_token_accuracy": 0.7264381647109985, "num_tokens": 63306021.0, "step": 1416, "train/ce_loss": 0.5641176700592041 }, { "epoch": 0.28000790982796125, "step": 1416, "train/sim_loss": 0.0002638101577758789 }, { "epoch": 0.28000790982796125, "step": 1416, "train/total_loss": 0.05667557939887047 }, { "entropy": 6.037538528442383, "epoch": 0.2802056555269923, "mean_token_accuracy": 0.7466238141059875, "num_tokens": 63358878.0, "step": 1417, "train/ce_loss": 0.9319711327552795 }, { "epoch": 0.2802056555269923, "step": 1417, "train/sim_loss": 0.00024259090423583984 }, { "epoch": 0.2802056555269923, "step": 1417, "train/total_loss": 0.09343970566987991 }, { "entropy": 6.223165512084961, "epoch": 0.2804034012260233, "mean_token_accuracy": 0.7651951313018799, "num_tokens": 63429750.0, "step": 1418, "train/ce_loss": 0.35234662890434265 }, { "epoch": 0.2804034012260233, "step": 1418, "train/sim_loss": 0.00026601552963256836 }, { "epoch": 0.2804034012260233, "step": 1418, "train/total_loss": 0.03550067916512489 }, { "entropy": 5.977346420288086, "epoch": 0.2806011469250544, "mean_token_accuracy": 0.7104042172431946, "num_tokens": 63469683.0, "step": 1419, "train/ce_loss": 1.0976272821426392 }, { "epoch": 0.2806011469250544, "step": 1419, "train/sim_loss": 0.00020378828048706055 }, { "epoch": 0.2806011469250544, "step": 1419, "train/total_loss": 0.10996651649475098 }, { "epoch": 0.2807988926240854, "grad_norm": 0.4753243327140808, "learning_rate": 9.303096250865567e-06, "loss": 0.0829, "step": 1420 }, { "entropy": 6.172545909881592, "epoch": 0.2807988926240854, "mean_token_accuracy": 0.7151360511779785, "num_tokens": 63516875.0, "step": 1420, "train/ce_loss": 1.552350295241922e-05 }, { "epoch": 0.2807988926240854, "step": 1420, "train/sim_loss": 0.0002124309539794922 }, { "epoch": 0.2807988926240854, "step": 1420, "train/total_loss": 0.00021398330864030868 }, { "entropy": 6.31565523147583, "epoch": 0.2809966383231165, "mean_token_accuracy": 0.7802887558937073, "num_tokens": 63570455.0, "step": 1421, "train/ce_loss": 0.6174848079681396 }, { "epoch": 0.2809966383231165, "step": 1421, "train/sim_loss": 0.0005453824996948242 }, { "epoch": 0.2809966383231165, "step": 1421, "train/total_loss": 0.06229386478662491 }, { "entropy": 6.19658088684082, "epoch": 0.2811943840221475, "mean_token_accuracy": 0.7389853000640869, "num_tokens": 63634189.0, "step": 1422, "train/ce_loss": 1.1900368008355144e-05 }, { "epoch": 0.2811943840221475, "step": 1422, "train/sim_loss": 0.000314176082611084 }, { "epoch": 0.2811943840221475, "step": 1422, "train/total_loss": 0.00031536610913462937 }, { "entropy": 6.150001525878906, "epoch": 0.28139212972117855, "mean_token_accuracy": 0.7410256266593933, "num_tokens": 63700122.0, "step": 1423, "train/ce_loss": 0.7634169459342957 }, { "epoch": 0.28139212972117855, "step": 1423, "train/sim_loss": 0.00019621849060058594 }, { "epoch": 0.28139212972117855, "step": 1423, "train/total_loss": 0.07653791457414627 }, { "entropy": 6.029879570007324, "epoch": 0.28158987542020963, "mean_token_accuracy": 0.7153502106666565, "num_tokens": 63743347.0, "step": 1424, "train/ce_loss": 1.4326769814942963e-05 }, { "epoch": 0.28158987542020963, "step": 1424, "train/sim_loss": 0.00019252300262451172 }, { "epoch": 0.28158987542020963, "step": 1424, "train/total_loss": 0.00019395568233449012 }, { "entropy": 5.99974250793457, "epoch": 0.28178762111924066, "mean_token_accuracy": 0.7370479106903076, "num_tokens": 63782602.0, "step": 1425, "train/ce_loss": 2.462014675140381 }, { "epoch": 0.28178762111924066, "step": 1425, "train/sim_loss": 0.00019478797912597656 }, { "epoch": 0.28178762111924066, "step": 1425, "train/total_loss": 0.2463962584733963 }, { "entropy": 6.248898983001709, "epoch": 0.2819853668182717, "mean_token_accuracy": 0.736407458782196, "num_tokens": 63838520.0, "step": 1426, "train/ce_loss": 1.0594528913497925 }, { "epoch": 0.2819853668182717, "step": 1426, "train/sim_loss": 0.00034111738204956055 }, { "epoch": 0.2819853668182717, "step": 1426, "train/total_loss": 0.10628640651702881 }, { "entropy": 6.0354156494140625, "epoch": 0.28218311251730277, "mean_token_accuracy": 0.7437829971313477, "num_tokens": 63889818.0, "step": 1427, "train/ce_loss": 1.0755252838134766 }, { "epoch": 0.28218311251730277, "step": 1427, "train/sim_loss": 0.00033652782440185547 }, { "epoch": 0.28218311251730277, "step": 1427, "train/total_loss": 0.10788905620574951 }, { "entropy": 5.806759834289551, "epoch": 0.2823808582163338, "mean_token_accuracy": 0.7314867377281189, "num_tokens": 63921104.0, "step": 1428, "train/ce_loss": 0.3772438168525696 }, { "epoch": 0.2823808582163338, "step": 1428, "train/sim_loss": 0.000186920166015625 }, { "epoch": 0.2823808582163338, "step": 1428, "train/total_loss": 0.0379113033413887 }, { "entropy": 5.970833778381348, "epoch": 0.2825786039153648, "mean_token_accuracy": 0.7332490682601929, "num_tokens": 63959808.0, "step": 1429, "train/ce_loss": 0.862689197063446 }, { "epoch": 0.2825786039153648, "step": 1429, "train/sim_loss": 0.0002732276916503906 }, { "epoch": 0.2825786039153648, "step": 1429, "train/total_loss": 0.08654215186834335 }, { "entropy": 5.928610801696777, "epoch": 0.2827763496143959, "mean_token_accuracy": 0.7264842987060547, "num_tokens": 64033663.0, "step": 1430, "train/ce_loss": 0.7347158193588257 }, { "epoch": 0.2827763496143959, "step": 1430, "train/sim_loss": 0.00029289722442626953 }, { "epoch": 0.2827763496143959, "step": 1430, "train/total_loss": 0.07376448065042496 }, { "entropy": 6.208909034729004, "epoch": 0.28297409531342693, "mean_token_accuracy": 0.7769652605056763, "num_tokens": 64094443.0, "step": 1431, "train/ce_loss": 0.6953509449958801 }, { "epoch": 0.28297409531342693, "step": 1431, "train/sim_loss": 0.00017398595809936523 }, { "epoch": 0.28297409531342693, "step": 1431, "train/total_loss": 0.06970908492803574 }, { "entropy": 5.604248523712158, "epoch": 0.28317184101245796, "mean_token_accuracy": 0.7547516822814941, "num_tokens": 64117914.0, "step": 1432, "train/ce_loss": 1.484061360359192 }, { "epoch": 0.28317184101245796, "step": 1432, "train/sim_loss": 0.00029158592224121094 }, { "epoch": 0.28317184101245796, "step": 1432, "train/total_loss": 0.14869771897792816 }, { "entropy": 6.060870170593262, "epoch": 0.28336958671148904, "mean_token_accuracy": 0.7179327607154846, "num_tokens": 64160109.0, "step": 1433, "train/ce_loss": 0.8291856050491333 }, { "epoch": 0.28336958671148904, "step": 1433, "train/sim_loss": 0.00021010637283325195 }, { "epoch": 0.28336958671148904, "step": 1433, "train/total_loss": 0.0831286683678627 }, { "entropy": 5.437710285186768, "epoch": 0.28356733241052007, "mean_token_accuracy": 0.7407814860343933, "num_tokens": 64192927.0, "step": 1434, "train/ce_loss": 1.2184054851531982 }, { "epoch": 0.28356733241052007, "step": 1434, "train/sim_loss": 0.0003135800361633301 }, { "epoch": 0.28356733241052007, "step": 1434, "train/total_loss": 0.1221541315317154 }, { "entropy": 5.872725486755371, "epoch": 0.2837650781095511, "mean_token_accuracy": 0.7366369962692261, "num_tokens": 64235192.0, "step": 1435, "train/ce_loss": 0.5141510963439941 }, { "epoch": 0.2837650781095511, "step": 1435, "train/sim_loss": 0.00025022029876708984 }, { "epoch": 0.2837650781095511, "step": 1435, "train/total_loss": 0.05166533216834068 }, { "entropy": 5.896030426025391, "epoch": 0.2839628238085822, "mean_token_accuracy": 0.7357840538024902, "num_tokens": 64285834.0, "step": 1436, "train/ce_loss": 0.7063482999801636 }, { "epoch": 0.2839628238085822, "step": 1436, "train/sim_loss": 0.00021451711654663086 }, { "epoch": 0.2839628238085822, "step": 1436, "train/total_loss": 0.07084935158491135 }, { "entropy": 5.9187822341918945, "epoch": 0.2841605695076132, "mean_token_accuracy": 0.771731436252594, "num_tokens": 64342193.0, "step": 1437, "train/ce_loss": 0.5503979325294495 }, { "epoch": 0.2841605695076132, "step": 1437, "train/sim_loss": 0.00019609928131103516 }, { "epoch": 0.2841605695076132, "step": 1437, "train/total_loss": 0.05523589253425598 }, { "entropy": 5.711573600769043, "epoch": 0.28435831520664423, "mean_token_accuracy": 0.7647733688354492, "num_tokens": 64386059.0, "step": 1438, "train/ce_loss": 0.3834146559238434 }, { "epoch": 0.28435831520664423, "step": 1438, "train/sim_loss": 0.00015020370483398438 }, { "epoch": 0.28435831520664423, "step": 1438, "train/total_loss": 0.03849167004227638 }, { "entropy": 5.787826061248779, "epoch": 0.2845560609056753, "mean_token_accuracy": 0.771256148815155, "num_tokens": 64426398.0, "step": 1439, "train/ce_loss": 0.6445726752281189 }, { "epoch": 0.2845560609056753, "step": 1439, "train/sim_loss": 0.00035965442657470703 }, { "epoch": 0.2845560609056753, "step": 1439, "train/total_loss": 0.0648169219493866 }, { "epoch": 0.28475380660470634, "grad_norm": 0.380705863237381, "learning_rate": 9.29320407557622e-06, "loss": 0.0829, "step": 1440 }, { "entropy": 6.339787483215332, "epoch": 0.28475380660470634, "mean_token_accuracy": 0.7430830001831055, "num_tokens": 64459190.0, "step": 1440, "train/ce_loss": 1.5312705039978027 }, { "epoch": 0.28475380660470634, "step": 1440, "train/sim_loss": 0.00028955936431884766 }, { "epoch": 0.28475380660470634, "step": 1440, "train/total_loss": 0.15341661870479584 }, { "entropy": 6.210602760314941, "epoch": 0.2849515523037374, "mean_token_accuracy": 0.744143009185791, "num_tokens": 64516194.0, "step": 1441, "train/ce_loss": 0.4678441882133484 }, { "epoch": 0.2849515523037374, "step": 1441, "train/sim_loss": 0.00023758411407470703 }, { "epoch": 0.2849515523037374, "step": 1441, "train/total_loss": 0.047022003680467606 }, { "entropy": 5.962409019470215, "epoch": 0.28514929800276845, "mean_token_accuracy": 0.7442668676376343, "num_tokens": 64567626.0, "step": 1442, "train/ce_loss": 1.3795554637908936 }, { "epoch": 0.28514929800276845, "step": 1442, "train/sim_loss": 0.00020563602447509766 }, { "epoch": 0.28514929800276845, "step": 1442, "train/total_loss": 0.13816118240356445 }, { "entropy": 5.790746688842773, "epoch": 0.2853470437017995, "mean_token_accuracy": 0.7530180811882019, "num_tokens": 64614220.0, "step": 1443, "train/ce_loss": 0.9321748614311218 }, { "epoch": 0.2853470437017995, "step": 1443, "train/sim_loss": 0.00022917985916137695 }, { "epoch": 0.2853470437017995, "step": 1443, "train/total_loss": 0.09344666451215744 }, { "entropy": 5.905367851257324, "epoch": 0.28554478940083056, "mean_token_accuracy": 0.7390350699424744, "num_tokens": 64658960.0, "step": 1444, "train/ce_loss": 1.0060112476348877 }, { "epoch": 0.28554478940083056, "step": 1444, "train/sim_loss": 0.00038057565689086914 }, { "epoch": 0.28554478940083056, "step": 1444, "train/total_loss": 0.100981704890728 }, { "entropy": 6.114255905151367, "epoch": 0.2857425350998616, "mean_token_accuracy": 0.7096512317657471, "num_tokens": 64699523.0, "step": 1445, "train/ce_loss": 0.7237657904624939 }, { "epoch": 0.2857425350998616, "step": 1445, "train/sim_loss": 0.00022912025451660156 }, { "epoch": 0.2857425350998616, "step": 1445, "train/total_loss": 0.07260569930076599 }, { "entropy": 5.289156436920166, "epoch": 0.2859402807988926, "mean_token_accuracy": 0.7716535329818726, "num_tokens": 64725859.0, "step": 1446, "train/ce_loss": 0.6372517347335815 }, { "epoch": 0.2859402807988926, "step": 1446, "train/sim_loss": 0.0001780986785888672 }, { "epoch": 0.2859402807988926, "step": 1446, "train/total_loss": 0.06390327215194702 }, { "entropy": 6.091620445251465, "epoch": 0.2861380264979237, "mean_token_accuracy": 0.7257525324821472, "num_tokens": 64779091.0, "step": 1447, "train/ce_loss": 1.4193204641342163 }, { "epoch": 0.2861380264979237, "step": 1447, "train/sim_loss": 0.00048613548278808594 }, { "epoch": 0.2861380264979237, "step": 1447, "train/total_loss": 0.14241819083690643 }, { "entropy": 5.805756568908691, "epoch": 0.2863357721969547, "mean_token_accuracy": 0.7938342690467834, "num_tokens": 64822042.0, "step": 1448, "train/ce_loss": 0.8306125998497009 }, { "epoch": 0.2863357721969547, "step": 1448, "train/sim_loss": 0.0004235506057739258 }, { "epoch": 0.2863357721969547, "step": 1448, "train/total_loss": 0.08348481357097626 }, { "entropy": 5.975780487060547, "epoch": 0.28653351789598575, "mean_token_accuracy": 0.768324613571167, "num_tokens": 64865610.0, "step": 1449, "train/ce_loss": 0.9472398161888123 }, { "epoch": 0.28653351789598575, "step": 1449, "train/sim_loss": 0.00023216009140014648 }, { "epoch": 0.28653351789598575, "step": 1449, "train/total_loss": 0.09495614469051361 }, { "entropy": 5.896078109741211, "epoch": 0.2867312635950168, "mean_token_accuracy": 0.7426655292510986, "num_tokens": 64908338.0, "step": 1450, "train/ce_loss": 1.3569269180297852 }, { "epoch": 0.2867312635950168, "step": 1450, "train/sim_loss": 0.00021946430206298828 }, { "epoch": 0.2867312635950168, "step": 1450, "train/total_loss": 0.13591216504573822 }, { "entropy": 5.4925737380981445, "epoch": 0.28692900929404785, "mean_token_accuracy": 0.7287706136703491, "num_tokens": 64942481.0, "step": 1451, "train/ce_loss": 1.0106927156448364 }, { "epoch": 0.28692900929404785, "step": 1451, "train/sim_loss": 0.0004379749298095703 }, { "epoch": 0.28692900929404785, "step": 1451, "train/total_loss": 0.10150724649429321 }, { "entropy": 5.790771484375, "epoch": 0.2871267549930789, "mean_token_accuracy": 0.718482255935669, "num_tokens": 64989887.0, "step": 1452, "train/ce_loss": 1.852949857711792 }, { "epoch": 0.2871267549930789, "step": 1452, "train/sim_loss": 0.00023794174194335938 }, { "epoch": 0.2871267549930789, "step": 1452, "train/total_loss": 0.18553292751312256 }, { "entropy": 6.0849690437316895, "epoch": 0.28732450069210996, "mean_token_accuracy": 0.731203019618988, "num_tokens": 65042027.0, "step": 1453, "train/ce_loss": 0.5080971121788025 }, { "epoch": 0.28732450069210996, "step": 1453, "train/sim_loss": 0.00016051530838012695 }, { "epoch": 0.28732450069210996, "step": 1453, "train/total_loss": 0.050970226526260376 }, { "entropy": 5.4657883644104, "epoch": 0.287522246391141, "mean_token_accuracy": 0.7833333611488342, "num_tokens": 65075244.0, "step": 1454, "train/ce_loss": 0.765856146812439 }, { "epoch": 0.287522246391141, "step": 1454, "train/sim_loss": 0.00021123886108398438 }, { "epoch": 0.287522246391141, "step": 1454, "train/total_loss": 0.07679685205221176 }, { "entropy": 6.139813423156738, "epoch": 0.287719992090172, "mean_token_accuracy": 0.7775378227233887, "num_tokens": 65113107.0, "step": 1455, "train/ce_loss": 0.7063860893249512 }, { "epoch": 0.287719992090172, "step": 1455, "train/sim_loss": 0.00019407272338867188 }, { "epoch": 0.287719992090172, "step": 1455, "train/total_loss": 0.07083268463611603 }, { "entropy": 5.971603870391846, "epoch": 0.2879177377892031, "mean_token_accuracy": 0.7026568651199341, "num_tokens": 65147975.0, "step": 1456, "train/ce_loss": 0.6307923197746277 }, { "epoch": 0.2879177377892031, "step": 1456, "train/sim_loss": 0.00024116039276123047 }, { "epoch": 0.2879177377892031, "step": 1456, "train/total_loss": 0.06332039088010788 }, { "entropy": 6.03143835067749, "epoch": 0.2881154834882341, "mean_token_accuracy": 0.7232787013053894, "num_tokens": 65192706.0, "step": 1457, "train/ce_loss": 1.2786309719085693 }, { "epoch": 0.2881154834882341, "step": 1457, "train/sim_loss": 0.0002071857452392578 }, { "epoch": 0.2881154834882341, "step": 1457, "train/total_loss": 0.12807027995586395 }, { "entropy": 5.651984214782715, "epoch": 0.28831322918726515, "mean_token_accuracy": 0.7308435440063477, "num_tokens": 65222891.0, "step": 1458, "train/ce_loss": 0.5028856992721558 }, { "epoch": 0.28831322918726515, "step": 1458, "train/sim_loss": 0.0002111196517944336 }, { "epoch": 0.28831322918726515, "step": 1458, "train/total_loss": 0.05049968883395195 }, { "entropy": 5.894759178161621, "epoch": 0.28851097488629623, "mean_token_accuracy": 0.7307236194610596, "num_tokens": 65259029.0, "step": 1459, "train/ce_loss": 0.5747893452644348 }, { "epoch": 0.28851097488629623, "step": 1459, "train/sim_loss": 0.00029599666595458984 }, { "epoch": 0.28851097488629623, "step": 1459, "train/total_loss": 0.05777493119239807 }, { "epoch": 0.28870872058532726, "grad_norm": 0.45021986961364746, "learning_rate": 9.283311900286874e-06, "loss": 0.0824, "step": 1460 }, { "entropy": 5.759481906890869, "epoch": 0.28870872058532726, "mean_token_accuracy": 0.7411764860153198, "num_tokens": 65301709.0, "step": 1460, "train/ce_loss": 1.422661542892456 }, { "epoch": 0.28870872058532726, "step": 1460, "train/sim_loss": 0.00022304058074951172 }, { "epoch": 0.28870872058532726, "step": 1460, "train/total_loss": 0.14248919486999512 }, { "entropy": 5.557753086090088, "epoch": 0.2889064662843583, "mean_token_accuracy": 0.7494145035743713, "num_tokens": 65357008.0, "step": 1461, "train/ce_loss": 1.028348445892334 }, { "epoch": 0.2889064662843583, "step": 1461, "train/sim_loss": 0.0003300905227661133 }, { "epoch": 0.2889064662843583, "step": 1461, "train/total_loss": 0.10316493362188339 }, { "entropy": 6.081916809082031, "epoch": 0.28910421198338937, "mean_token_accuracy": 0.7429643273353577, "num_tokens": 65400256.0, "step": 1462, "train/ce_loss": 1.1857891082763672 }, { "epoch": 0.28910421198338937, "step": 1462, "train/sim_loss": 0.00045943260192871094 }, { "epoch": 0.28910421198338937, "step": 1462, "train/total_loss": 0.11903834342956543 }, { "entropy": 5.928780555725098, "epoch": 0.2893019576824204, "mean_token_accuracy": 0.767192006111145, "num_tokens": 65441144.0, "step": 1463, "train/ce_loss": 0.7332123517990112 }, { "epoch": 0.2893019576824204, "step": 1463, "train/sim_loss": 0.0003141164779663086 }, { "epoch": 0.2893019576824204, "step": 1463, "train/total_loss": 0.07363535463809967 }, { "entropy": 5.828678131103516, "epoch": 0.2894997033814515, "mean_token_accuracy": 0.7271514534950256, "num_tokens": 65483600.0, "step": 1464, "train/ce_loss": 0.8361809253692627 }, { "epoch": 0.2894997033814515, "step": 1464, "train/sim_loss": 0.0002942085266113281 }, { "epoch": 0.2894997033814515, "step": 1464, "train/total_loss": 0.08391230553388596 }, { "entropy": 6.146675109863281, "epoch": 0.2896974490804825, "mean_token_accuracy": 0.7143927812576294, "num_tokens": 65526119.0, "step": 1465, "train/ce_loss": 0.8248622417449951 }, { "epoch": 0.2896974490804825, "step": 1465, "train/sim_loss": 0.00020617246627807617 }, { "epoch": 0.2896974490804825, "step": 1465, "train/total_loss": 0.08269239962100983 }, { "entropy": 5.607232570648193, "epoch": 0.28989519477951353, "mean_token_accuracy": 0.7512048482894897, "num_tokens": 65577500.0, "step": 1466, "train/ce_loss": 0.8292957544326782 }, { "epoch": 0.28989519477951353, "step": 1466, "train/sim_loss": 0.0001819133758544922 }, { "epoch": 0.28989519477951353, "step": 1466, "train/total_loss": 0.0831114873290062 }, { "entropy": 6.151317119598389, "epoch": 0.2900929404785446, "mean_token_accuracy": 0.7510349154472351, "num_tokens": 65652784.0, "step": 1467, "train/ce_loss": 0.5285210013389587 }, { "epoch": 0.2900929404785446, "step": 1467, "train/sim_loss": 0.00036656856536865234 }, { "epoch": 0.2900929404785446, "step": 1467, "train/total_loss": 0.053218670189380646 }, { "entropy": 5.866268157958984, "epoch": 0.29029068617757564, "mean_token_accuracy": 0.7391774654388428, "num_tokens": 65703900.0, "step": 1468, "train/ce_loss": 0.9081763029098511 }, { "epoch": 0.29029068617757564, "step": 1468, "train/sim_loss": 0.0004570484161376953 }, { "epoch": 0.29029068617757564, "step": 1468, "train/total_loss": 0.0912746787071228 }, { "entropy": 5.732693672180176, "epoch": 0.29048843187660667, "mean_token_accuracy": 0.7163705825805664, "num_tokens": 65751676.0, "step": 1469, "train/ce_loss": 0.9196526408195496 }, { "epoch": 0.29048843187660667, "step": 1469, "train/sim_loss": 0.0002079606056213379 }, { "epoch": 0.29048843187660667, "step": 1469, "train/total_loss": 0.09217322617769241 }, { "entropy": 5.982329368591309, "epoch": 0.29068617757563775, "mean_token_accuracy": 0.721727192401886, "num_tokens": 65799544.0, "step": 1470, "train/ce_loss": 0.6256153583526611 }, { "epoch": 0.29068617757563775, "step": 1470, "train/sim_loss": 0.00031495094299316406 }, { "epoch": 0.29068617757563775, "step": 1470, "train/total_loss": 0.06287648528814316 }, { "entropy": 6.074132919311523, "epoch": 0.2908839232746688, "mean_token_accuracy": 0.7131336331367493, "num_tokens": 65853294.0, "step": 1471, "train/ce_loss": 0.6660249829292297 }, { "epoch": 0.2908839232746688, "step": 1471, "train/sim_loss": 0.00022095441818237305 }, { "epoch": 0.2908839232746688, "step": 1471, "train/total_loss": 0.06682345271110535 }, { "entropy": 5.894937038421631, "epoch": 0.2910816689736998, "mean_token_accuracy": 0.7399103045463562, "num_tokens": 65900973.0, "step": 1472, "train/ce_loss": 2.238689661026001 }, { "epoch": 0.2910816689736998, "step": 1472, "train/sim_loss": 0.00025159120559692383 }, { "epoch": 0.2910816689736998, "step": 1472, "train/total_loss": 0.22412055730819702 }, { "entropy": 5.737550258636475, "epoch": 0.2912794146727309, "mean_token_accuracy": 0.7328730821609497, "num_tokens": 65936918.0, "step": 1473, "train/ce_loss": 0.6758273243904114 }, { "epoch": 0.2912794146727309, "step": 1473, "train/sim_loss": 0.0004336833953857422 }, { "epoch": 0.2912794146727309, "step": 1473, "train/total_loss": 0.068016417324543 }, { "entropy": 6.027665138244629, "epoch": 0.2914771603717619, "mean_token_accuracy": 0.7115839123725891, "num_tokens": 65985169.0, "step": 1474, "train/ce_loss": 0.8537374138832092 }, { "epoch": 0.2914771603717619, "step": 1474, "train/sim_loss": 0.0002504587173461914 }, { "epoch": 0.2914771603717619, "step": 1474, "train/total_loss": 0.08562420308589935 }, { "entropy": 6.082272529602051, "epoch": 0.29167490607079294, "mean_token_accuracy": 0.7532467246055603, "num_tokens": 66033388.0, "step": 1475, "train/ce_loss": 1.3205516338348389 }, { "epoch": 0.29167490607079294, "step": 1475, "train/sim_loss": 0.0003281831741333008 }, { "epoch": 0.29167490607079294, "step": 1475, "train/total_loss": 0.1323833465576172 }, { "entropy": 5.686689853668213, "epoch": 0.291872651769824, "mean_token_accuracy": 0.7519756555557251, "num_tokens": 66073388.0, "step": 1476, "train/ce_loss": 0.35750287771224976 }, { "epoch": 0.291872651769824, "step": 1476, "train/sim_loss": 0.00026357173919677734 }, { "epoch": 0.291872651769824, "step": 1476, "train/total_loss": 0.03601386025547981 }, { "entropy": 5.524979591369629, "epoch": 0.29207039746885505, "mean_token_accuracy": 0.7528542876243591, "num_tokens": 66113469.0, "step": 1477, "train/ce_loss": 1.0137901306152344 }, { "epoch": 0.29207039746885505, "step": 1477, "train/sim_loss": 0.00015628337860107422 }, { "epoch": 0.29207039746885505, "step": 1477, "train/total_loss": 0.10153529793024063 }, { "entropy": 6.142505645751953, "epoch": 0.2922681431678861, "mean_token_accuracy": 0.7268750071525574, "num_tokens": 66173238.0, "step": 1478, "train/ce_loss": 0.9867656230926514 }, { "epoch": 0.2922681431678861, "step": 1478, "train/sim_loss": 0.0004901289939880371 }, { "epoch": 0.2922681431678861, "step": 1478, "train/total_loss": 0.09916669130325317 }, { "entropy": 6.181461334228516, "epoch": 0.29246588886691716, "mean_token_accuracy": 0.7170263528823853, "num_tokens": 66218561.0, "step": 1479, "train/ce_loss": 0.8689871430397034 }, { "epoch": 0.29246588886691716, "step": 1479, "train/sim_loss": 0.0002586841583251953 }, { "epoch": 0.29246588886691716, "step": 1479, "train/total_loss": 0.08715739846229553 }, { "epoch": 0.2926636345659482, "grad_norm": 0.5142648220062256, "learning_rate": 9.273419724997528e-06, "loss": 0.0846, "step": 1480 }, { "entropy": 6.035006046295166, "epoch": 0.2926636345659482, "mean_token_accuracy": 0.733000636100769, "num_tokens": 66263044.0, "step": 1480, "train/ce_loss": 0.8867052793502808 }, { "epoch": 0.2926636345659482, "step": 1480, "train/sim_loss": 0.00033742189407348633 }, { "epoch": 0.2926636345659482, "step": 1480, "train/total_loss": 0.08900795131921768 }, { "entropy": 5.998033046722412, "epoch": 0.2928613802649792, "mean_token_accuracy": 0.6971946954727173, "num_tokens": 66298308.0, "step": 1481, "train/ce_loss": 1.00356125831604 }, { "epoch": 0.2928613802649792, "step": 1481, "train/sim_loss": 0.0003718137741088867 }, { "epoch": 0.2928613802649792, "step": 1481, "train/total_loss": 0.10072793811559677 }, { "entropy": 6.058512210845947, "epoch": 0.2930591259640103, "mean_token_accuracy": 0.7218590974807739, "num_tokens": 66347234.0, "step": 1482, "train/ce_loss": 0.6414934396743774 }, { "epoch": 0.2930591259640103, "step": 1482, "train/sim_loss": 0.00026595592498779297 }, { "epoch": 0.2930591259640103, "step": 1482, "train/total_loss": 0.06441529840230942 }, { "entropy": 5.891698837280273, "epoch": 0.2932568716630413, "mean_token_accuracy": 0.7430747747421265, "num_tokens": 66384760.0, "step": 1483, "train/ce_loss": 1.1216681741643697e-05 }, { "epoch": 0.2932568716630413, "step": 1483, "train/sim_loss": 0.00021827220916748047 }, { "epoch": 0.2932568716630413, "step": 1483, "train/total_loss": 0.00021939387079328299 }, { "entropy": 5.637735366821289, "epoch": 0.2934546173620724, "mean_token_accuracy": 0.7662588357925415, "num_tokens": 66412216.0, "step": 1484, "train/ce_loss": 1.2735013115161564e-05 }, { "epoch": 0.2934546173620724, "step": 1484, "train/sim_loss": 0.00020182132720947266 }, { "epoch": 0.2934546173620724, "step": 1484, "train/total_loss": 0.00020309482351876795 }, { "entropy": 5.372897148132324, "epoch": 0.2936523630611034, "mean_token_accuracy": 0.7803308963775635, "num_tokens": 66462499.0, "step": 1485, "train/ce_loss": 1.3607293367385864 }, { "epoch": 0.2936523630611034, "step": 1485, "train/sim_loss": 0.00023293495178222656 }, { "epoch": 0.2936523630611034, "step": 1485, "train/total_loss": 0.13630586862564087 }, { "entropy": 5.531595230102539, "epoch": 0.29385010876013445, "mean_token_accuracy": 0.7485714554786682, "num_tokens": 66506711.0, "step": 1486, "train/ce_loss": 1.6661767959594727 }, { "epoch": 0.29385010876013445, "step": 1486, "train/sim_loss": 0.0002630949020385742 }, { "epoch": 0.29385010876013445, "step": 1486, "train/total_loss": 0.1668807715177536 }, { "entropy": 5.675688743591309, "epoch": 0.29404785445916554, "mean_token_accuracy": 0.7483870983123779, "num_tokens": 66544615.0, "step": 1487, "train/ce_loss": 0.9859394431114197 }, { "epoch": 0.29404785445916554, "step": 1487, "train/sim_loss": 0.00043195486068725586 }, { "epoch": 0.29404785445916554, "step": 1487, "train/total_loss": 0.0990258976817131 }, { "entropy": 6.232668399810791, "epoch": 0.29424560015819656, "mean_token_accuracy": 0.738212525844574, "num_tokens": 66590891.0, "step": 1488, "train/ce_loss": 0.6246801614761353 }, { "epoch": 0.29424560015819656, "step": 1488, "train/sim_loss": 0.00021451711654663086 }, { "epoch": 0.29424560015819656, "step": 1488, "train/total_loss": 0.06268253922462463 }, { "entropy": 5.885590553283691, "epoch": 0.2944433458572276, "mean_token_accuracy": 0.7490494251251221, "num_tokens": 66631083.0, "step": 1489, "train/ce_loss": 2.240498361061327e-05 }, { "epoch": 0.2944433458572276, "step": 1489, "train/sim_loss": 0.00027501583099365234 }, { "epoch": 0.2944433458572276, "step": 1489, "train/total_loss": 0.0002772563311737031 }, { "entropy": 5.893329620361328, "epoch": 0.29464109155625867, "mean_token_accuracy": 0.7720338702201843, "num_tokens": 66673504.0, "step": 1490, "train/ce_loss": 0.6373405456542969 }, { "epoch": 0.29464109155625867, "step": 1490, "train/sim_loss": 0.00030738115310668945 }, { "epoch": 0.29464109155625867, "step": 1490, "train/total_loss": 0.06404143571853638 }, { "entropy": 5.755450248718262, "epoch": 0.2948388372552897, "mean_token_accuracy": 0.7358121275901794, "num_tokens": 66722383.0, "step": 1491, "train/ce_loss": 1.406410574913025 }, { "epoch": 0.2948388372552897, "step": 1491, "train/sim_loss": 0.0003186464309692383 }, { "epoch": 0.2948388372552897, "step": 1491, "train/total_loss": 0.1409597098827362 }, { "entropy": 5.721223831176758, "epoch": 0.2950365829543207, "mean_token_accuracy": 0.737500011920929, "num_tokens": 66760766.0, "step": 1492, "train/ce_loss": 0.7019999623298645 }, { "epoch": 0.2950365829543207, "step": 1492, "train/sim_loss": 0.00024169683456420898 }, { "epoch": 0.2950365829543207, "step": 1492, "train/total_loss": 0.07044169306755066 }, { "entropy": 5.858222961425781, "epoch": 0.2952343286533518, "mean_token_accuracy": 0.7493438124656677, "num_tokens": 66797424.0, "step": 1493, "train/ce_loss": 1.1844556331634521 }, { "epoch": 0.2952343286533518, "step": 1493, "train/sim_loss": 0.00035965442657470703 }, { "epoch": 0.2952343286533518, "step": 1493, "train/total_loss": 0.11880522221326828 }, { "entropy": 6.240923881530762, "epoch": 0.29543207435238283, "mean_token_accuracy": 0.7474683523178101, "num_tokens": 66852429.0, "step": 1494, "train/ce_loss": 1.0986298322677612 }, { "epoch": 0.29543207435238283, "step": 1494, "train/sim_loss": 0.0003554821014404297 }, { "epoch": 0.29543207435238283, "step": 1494, "train/total_loss": 0.11021846532821655 }, { "entropy": 5.9874114990234375, "epoch": 0.29562982005141386, "mean_token_accuracy": 0.7387518286705017, "num_tokens": 66911945.0, "step": 1495, "train/ce_loss": 0.8695228695869446 }, { "epoch": 0.29562982005141386, "step": 1495, "train/sim_loss": 0.00021386146545410156 }, { "epoch": 0.29562982005141386, "step": 1495, "train/total_loss": 0.08716615289449692 }, { "entropy": 6.061871528625488, "epoch": 0.29582756575044494, "mean_token_accuracy": 0.7263056039810181, "num_tokens": 66941703.0, "step": 1496, "train/ce_loss": 1.462289571762085 }, { "epoch": 0.29582756575044494, "step": 1496, "train/sim_loss": 0.0003440380096435547 }, { "epoch": 0.29582756575044494, "step": 1496, "train/total_loss": 0.1465729922056198 }, { "entropy": 5.709127426147461, "epoch": 0.29602531144947597, "mean_token_accuracy": 0.700402557849884, "num_tokens": 66992717.0, "step": 1497, "train/ce_loss": 0.8397037982940674 }, { "epoch": 0.29602531144947597, "step": 1497, "train/sim_loss": 0.00018870830535888672 }, { "epoch": 0.29602531144947597, "step": 1497, "train/total_loss": 0.08415909111499786 }, { "entropy": 5.914474010467529, "epoch": 0.296223057148507, "mean_token_accuracy": 0.759878396987915, "num_tokens": 67037797.0, "step": 1498, "train/ce_loss": 1.4016030036145821e-05 }, { "epoch": 0.296223057148507, "step": 1498, "train/sim_loss": 0.0002155303955078125 }, { "epoch": 0.296223057148507, "step": 1498, "train/total_loss": 0.0002169319923268631 }, { "entropy": 6.2138671875, "epoch": 0.2964208028475381, "mean_token_accuracy": 0.7172995805740356, "num_tokens": 67068403.0, "step": 1499, "train/ce_loss": 2.308985948562622 }, { "epoch": 0.2964208028475381, "step": 1499, "train/sim_loss": 0.0006559491157531738 }, { "epoch": 0.2964208028475381, "step": 1499, "train/total_loss": 0.2315545529127121 }, { "epoch": 0.2966185485465691, "grad_norm": 0.5828509330749512, "learning_rate": 9.263527549708182e-06, "loss": 0.0835, "step": 1500 }, { "entropy": 5.833737373352051, "epoch": 0.2966185485465691, "mean_token_accuracy": 0.760401725769043, "num_tokens": 67116288.0, "step": 1500, "train/ce_loss": 1.7021087408065796 }, { "epoch": 0.2966185485465691, "step": 1500, "train/sim_loss": 0.0002110600471496582 }, { "epoch": 0.2966185485465691, "step": 1500, "train/total_loss": 0.17042194306850433 }, { "entropy": 5.457770347595215, "epoch": 0.29681629424560013, "mean_token_accuracy": 0.7180887460708618, "num_tokens": 67157305.0, "step": 1501, "train/ce_loss": 1.0894605111388955e-05 }, { "epoch": 0.29681629424560013, "step": 1501, "train/sim_loss": 0.00016635656356811523 }, { "epoch": 0.29681629424560013, "step": 1501, "train/total_loss": 0.00016744602180551738 }, { "entropy": 6.196662902832031, "epoch": 0.2970140399446312, "mean_token_accuracy": 0.7318339347839355, "num_tokens": 67197083.0, "step": 1502, "train/ce_loss": 0.6177138090133667 }, { "epoch": 0.2970140399446312, "step": 1502, "train/sim_loss": 0.0002110004425048828 }, { "epoch": 0.2970140399446312, "step": 1502, "train/total_loss": 0.06198238208889961 }, { "entropy": 6.125579357147217, "epoch": 0.29721178564366224, "mean_token_accuracy": 0.7271156907081604, "num_tokens": 67234470.0, "step": 1503, "train/ce_loss": 1.2252994775772095 }, { "epoch": 0.29721178564366224, "step": 1503, "train/sim_loss": 0.00022673606872558594 }, { "epoch": 0.29721178564366224, "step": 1503, "train/total_loss": 0.12275668233633041 }, { "entropy": 5.814309120178223, "epoch": 0.2974095313426933, "mean_token_accuracy": 0.7440159320831299, "num_tokens": 67281840.0, "step": 1504, "train/ce_loss": 0.7162976264953613 }, { "epoch": 0.2974095313426933, "step": 1504, "train/sim_loss": 0.000362396240234375 }, { "epoch": 0.2974095313426933, "step": 1504, "train/total_loss": 0.07199215888977051 }, { "entropy": 6.397547245025635, "epoch": 0.29760727704172435, "mean_token_accuracy": 0.739547610282898, "num_tokens": 67328252.0, "step": 1505, "train/ce_loss": 1.3194579878472723e-05 }, { "epoch": 0.29760727704172435, "step": 1505, "train/sim_loss": 0.0002478361129760742 }, { "epoch": 0.29760727704172435, "step": 1505, "train/total_loss": 0.0002491555642336607 }, { "entropy": 6.09009313583374, "epoch": 0.2978050227407554, "mean_token_accuracy": 0.7391037344932556, "num_tokens": 67370769.0, "step": 1506, "train/ce_loss": 0.4349030554294586 }, { "epoch": 0.2978050227407554, "step": 1506, "train/sim_loss": 0.00028520822525024414 }, { "epoch": 0.2978050227407554, "step": 1506, "train/total_loss": 0.043775513768196106 }, { "entropy": 5.772714614868164, "epoch": 0.29800276843978646, "mean_token_accuracy": 0.758378803730011, "num_tokens": 67409650.0, "step": 1507, "train/ce_loss": 1.575462556502316e-05 }, { "epoch": 0.29800276843978646, "step": 1507, "train/sim_loss": 0.00042551755905151367 }, { "epoch": 0.29800276843978646, "step": 1507, "train/total_loss": 0.0004270930076017976 }, { "entropy": 5.855850696563721, "epoch": 0.2982005141388175, "mean_token_accuracy": 0.7743862271308899, "num_tokens": 67456957.0, "step": 1508, "train/ce_loss": 0.826910138130188 }, { "epoch": 0.2982005141388175, "step": 1508, "train/sim_loss": 0.0002499818801879883 }, { "epoch": 0.2982005141388175, "step": 1508, "train/total_loss": 0.08294099569320679 }, { "entropy": 5.684581756591797, "epoch": 0.2983982598378485, "mean_token_accuracy": 0.7474005818367004, "num_tokens": 67500817.0, "step": 1509, "train/ce_loss": 1.2037760019302368 }, { "epoch": 0.2983982598378485, "step": 1509, "train/sim_loss": 0.00037670135498046875 }, { "epoch": 0.2983982598378485, "step": 1509, "train/total_loss": 0.12075430154800415 }, { "entropy": 5.613366603851318, "epoch": 0.2985960055368796, "mean_token_accuracy": 0.7475185990333557, "num_tokens": 67560056.0, "step": 1510, "train/ce_loss": 0.8367668986320496 }, { "epoch": 0.2985960055368796, "step": 1510, "train/sim_loss": 0.0002034902572631836 }, { "epoch": 0.2985960055368796, "step": 1510, "train/total_loss": 0.08388017863035202 }, { "entropy": 5.950087547302246, "epoch": 0.2987937512359106, "mean_token_accuracy": 0.7322074770927429, "num_tokens": 67609953.0, "step": 1511, "train/ce_loss": 0.39897269010543823 }, { "epoch": 0.2987937512359106, "step": 1511, "train/sim_loss": 0.00022619962692260742 }, { "epoch": 0.2987937512359106, "step": 1511, "train/total_loss": 0.04012347012758255 }, { "entropy": 6.0636515617370605, "epoch": 0.29899149693494165, "mean_token_accuracy": 0.7261345982551575, "num_tokens": 67658045.0, "step": 1512, "train/ce_loss": 1.103432077798061e-05 }, { "epoch": 0.29899149693494165, "step": 1512, "train/sim_loss": 0.00033479928970336914 }, { "epoch": 0.29899149693494165, "step": 1512, "train/total_loss": 0.00033590273233130574 }, { "entropy": 5.9765825271606445, "epoch": 0.29918924263397273, "mean_token_accuracy": 0.7380560040473938, "num_tokens": 67710947.0, "step": 1513, "train/ce_loss": 6.20907885604538e-05 }, { "epoch": 0.29918924263397273, "step": 1513, "train/sim_loss": 0.0002663731575012207 }, { "epoch": 0.29918924263397273, "step": 1513, "train/total_loss": 0.0002725822268985212 }, { "entropy": 6.130129814147949, "epoch": 0.29938698833300376, "mean_token_accuracy": 0.7681041359901428, "num_tokens": 67768706.0, "step": 1514, "train/ce_loss": 1.5033651834528428e-05 }, { "epoch": 0.29938698833300376, "step": 1514, "train/sim_loss": 0.00018775463104248047 }, { "epoch": 0.29938698833300376, "step": 1514, "train/total_loss": 0.00018925798940472305 }, { "entropy": 5.93087100982666, "epoch": 0.2995847340320348, "mean_token_accuracy": 0.7659906148910522, "num_tokens": 67807224.0, "step": 1515, "train/ce_loss": 1.499138761573704e-05 }, { "epoch": 0.2995847340320348, "step": 1515, "train/sim_loss": 0.0001659393310546875 }, { "epoch": 0.2995847340320348, "step": 1515, "train/total_loss": 0.00016743846936151385 }, { "entropy": 5.848367691040039, "epoch": 0.29978247973106587, "mean_token_accuracy": 0.8175018429756165, "num_tokens": 67840951.0, "step": 1516, "train/ce_loss": 1.54821955220541e-05 }, { "epoch": 0.29978247973106587, "step": 1516, "train/sim_loss": 0.00026100873947143555 }, { "epoch": 0.29978247973106587, "step": 1516, "train/total_loss": 0.00026255694683641195 }, { "entropy": 6.064877510070801, "epoch": 0.2999802254300969, "mean_token_accuracy": 0.708717942237854, "num_tokens": 67881224.0, "step": 1517, "train/ce_loss": 1.8075492334901355e-05 }, { "epoch": 0.2999802254300969, "step": 1517, "train/sim_loss": 0.00026214122772216797 }, { "epoch": 0.2999802254300969, "step": 1517, "train/total_loss": 0.00026394877932034433 }, { "entropy": 6.428577423095703, "epoch": 0.3001779711291279, "mean_token_accuracy": 0.7547169923782349, "num_tokens": 67917631.0, "step": 1518, "train/ce_loss": 1.675365924835205 }, { "epoch": 0.3001779711291279, "step": 1518, "train/sim_loss": 0.00018155574798583984 }, { "epoch": 0.3001779711291279, "step": 1518, "train/total_loss": 0.16771815717220306 }, { "entropy": 6.500916004180908, "epoch": 0.300375716828159, "mean_token_accuracy": 0.7785046696662903, "num_tokens": 67955400.0, "step": 1519, "train/ce_loss": 0.9798368215560913 }, { "epoch": 0.300375716828159, "step": 1519, "train/sim_loss": 0.00020927190780639648 }, { "epoch": 0.300375716828159, "step": 1519, "train/total_loss": 0.09819295257329941 }, { "epoch": 0.30057346252719, "grad_norm": 0.5000779628753662, "learning_rate": 9.253635374418835e-06, "loss": 0.0796, "step": 1520 }, { "entropy": 6.5430426597595215, "epoch": 0.30057346252719, "mean_token_accuracy": 0.7208624482154846, "num_tokens": 68018926.0, "step": 1520, "train/ce_loss": 1.3101915121078491 }, { "epoch": 0.30057346252719, "step": 1520, "train/sim_loss": 0.0002537369728088379 }, { "epoch": 0.30057346252719, "step": 1520, "train/total_loss": 0.13127289712429047 }, { "entropy": 6.051292419433594, "epoch": 0.30077120822622105, "mean_token_accuracy": 0.746835470199585, "num_tokens": 68054395.0, "step": 1521, "train/ce_loss": 2.0887155187665485e-05 }, { "epoch": 0.30077120822622105, "step": 1521, "train/sim_loss": 0.00025206804275512695 }, { "epoch": 0.30077120822622105, "step": 1521, "train/total_loss": 0.0002541567664593458 }, { "entropy": 6.113526344299316, "epoch": 0.30096895392525214, "mean_token_accuracy": 0.7457855939865112, "num_tokens": 68107299.0, "step": 1522, "train/ce_loss": 0.6250994205474854 }, { "epoch": 0.30096895392525214, "step": 1522, "train/sim_loss": 0.00022327899932861328 }, { "epoch": 0.30096895392525214, "step": 1522, "train/total_loss": 0.0627332255244255 }, { "entropy": 6.151803016662598, "epoch": 0.30116669962428316, "mean_token_accuracy": 0.7215859293937683, "num_tokens": 68142425.0, "step": 1523, "train/ce_loss": 1.3586208820343018 }, { "epoch": 0.30116669962428316, "step": 1523, "train/sim_loss": 0.00029718875885009766 }, { "epoch": 0.30116669962428316, "step": 1523, "train/total_loss": 0.136159285902977 }, { "entropy": 5.683213233947754, "epoch": 0.30136444532331425, "mean_token_accuracy": 0.7754838466644287, "num_tokens": 68183513.0, "step": 1524, "train/ce_loss": 0.3094063103199005 }, { "epoch": 0.30136444532331425, "step": 1524, "train/sim_loss": 0.00018680095672607422 }, { "epoch": 0.30136444532331425, "step": 1524, "train/total_loss": 0.031127432361245155 }, { "entropy": 5.984888076782227, "epoch": 0.30156219102234527, "mean_token_accuracy": 0.7547683715820312, "num_tokens": 68216805.0, "step": 1525, "train/ce_loss": 0.9415696263313293 }, { "epoch": 0.30156219102234527, "step": 1525, "train/sim_loss": 0.00024080276489257812 }, { "epoch": 0.30156219102234527, "step": 1525, "train/total_loss": 0.09439776837825775 }, { "entropy": 5.726061820983887, "epoch": 0.3017599367213763, "mean_token_accuracy": 0.7542213797569275, "num_tokens": 68246671.0, "step": 1526, "train/ce_loss": 1.0707508325576782 }, { "epoch": 0.3017599367213763, "step": 1526, "train/sim_loss": 0.00016897916793823242 }, { "epoch": 0.3017599367213763, "step": 1526, "train/total_loss": 0.10724406689405441 }, { "entropy": 5.789563179016113, "epoch": 0.3019576824204074, "mean_token_accuracy": 0.783758282661438, "num_tokens": 68281008.0, "step": 1527, "train/ce_loss": 0.9314910173416138 }, { "epoch": 0.3019576824204074, "step": 1527, "train/sim_loss": 0.00021028518676757812 }, { "epoch": 0.3019576824204074, "step": 1527, "train/total_loss": 0.09335938841104507 }, { "entropy": 6.010985374450684, "epoch": 0.3021554281194384, "mean_token_accuracy": 0.7081165313720703, "num_tokens": 68331059.0, "step": 1528, "train/ce_loss": 1.3657710552215576 }, { "epoch": 0.3021554281194384, "step": 1528, "train/sim_loss": 0.00021708011627197266 }, { "epoch": 0.3021554281194384, "step": 1528, "train/total_loss": 0.13679419457912445 }, { "entropy": 5.667337417602539, "epoch": 0.30235317381846943, "mean_token_accuracy": 0.7899793982505798, "num_tokens": 68380385.0, "step": 1529, "train/ce_loss": 0.8637521266937256 }, { "epoch": 0.30235317381846943, "step": 1529, "train/sim_loss": 0.00022029876708984375 }, { "epoch": 0.30235317381846943, "step": 1529, "train/total_loss": 0.08659551292657852 }, { "entropy": 6.261979103088379, "epoch": 0.3025509195175005, "mean_token_accuracy": 0.7090229392051697, "num_tokens": 68423892.0, "step": 1530, "train/ce_loss": 1.274030089378357 }, { "epoch": 0.3025509195175005, "step": 1530, "train/sim_loss": 0.0003215670585632324 }, { "epoch": 0.3025509195175005, "step": 1530, "train/total_loss": 0.1277245730161667 }, { "entropy": 5.8460798263549805, "epoch": 0.30274866521653154, "mean_token_accuracy": 0.7335660457611084, "num_tokens": 68462560.0, "step": 1531, "train/ce_loss": 1.2954620122909546 }, { "epoch": 0.30274866521653154, "step": 1531, "train/sim_loss": 0.00033223628997802734 }, { "epoch": 0.30274866521653154, "step": 1531, "train/total_loss": 0.1298784464597702 }, { "entropy": 5.96157693862915, "epoch": 0.30294641091556257, "mean_token_accuracy": 0.7454017400741577, "num_tokens": 68505389.0, "step": 1532, "train/ce_loss": 0.8508079648017883 }, { "epoch": 0.30294641091556257, "step": 1532, "train/sim_loss": 0.0003427863121032715 }, { "epoch": 0.30294641091556257, "step": 1532, "train/total_loss": 0.08542358130216599 }, { "entropy": 6.001040458679199, "epoch": 0.30314415661459365, "mean_token_accuracy": 0.7335285544395447, "num_tokens": 68569854.0, "step": 1533, "train/ce_loss": 1.2013323307037354 }, { "epoch": 0.30314415661459365, "step": 1533, "train/sim_loss": 0.0002681612968444824 }, { "epoch": 0.30314415661459365, "step": 1533, "train/total_loss": 0.12040139734745026 }, { "entropy": 5.731022834777832, "epoch": 0.3033419023136247, "mean_token_accuracy": 0.7950617074966431, "num_tokens": 68595139.0, "step": 1534, "train/ce_loss": 0.7603418827056885 }, { "epoch": 0.3033419023136247, "step": 1534, "train/sim_loss": 0.000186920166015625 }, { "epoch": 0.3033419023136247, "step": 1534, "train/total_loss": 0.07622110843658447 }, { "entropy": 5.495149612426758, "epoch": 0.3035396480126557, "mean_token_accuracy": 0.7738814949989319, "num_tokens": 68635651.0, "step": 1535, "train/ce_loss": 0.390705406665802 }, { "epoch": 0.3035396480126557, "step": 1535, "train/sim_loss": 0.00033992528915405273 }, { "epoch": 0.3035396480126557, "step": 1535, "train/total_loss": 0.03941046819090843 }, { "entropy": 6.16923713684082, "epoch": 0.3037373937116868, "mean_token_accuracy": 0.7414529919624329, "num_tokens": 68703096.0, "step": 1536, "train/ce_loss": 1.0846631084859837e-05 }, { "epoch": 0.3037373937116868, "step": 1536, "train/sim_loss": 0.00021076202392578125 }, { "epoch": 0.3037373937116868, "step": 1536, "train/total_loss": 0.00021184668003115803 }, { "entropy": 5.927695274353027, "epoch": 0.3039351394107178, "mean_token_accuracy": 0.7559666037559509, "num_tokens": 68747746.0, "step": 1537, "train/ce_loss": 1.4092307090759277 }, { "epoch": 0.3039351394107178, "step": 1537, "train/sim_loss": 0.0003051161766052246 }, { "epoch": 0.3039351394107178, "step": 1537, "train/total_loss": 0.14122818410396576 }, { "entropy": 6.008458614349365, "epoch": 0.30413288510974884, "mean_token_accuracy": 0.7167463898658752, "num_tokens": 68790143.0, "step": 1538, "train/ce_loss": 0.9903160333633423 }, { "epoch": 0.30413288510974884, "step": 1538, "train/sim_loss": 0.00026553869247436523 }, { "epoch": 0.30413288510974884, "step": 1538, "train/total_loss": 0.09929714351892471 }, { "entropy": 6.1904296875, "epoch": 0.3043306308087799, "mean_token_accuracy": 0.713798999786377, "num_tokens": 68856908.0, "step": 1539, "train/ce_loss": 1.0354466438293457 }, { "epoch": 0.3043306308087799, "step": 1539, "train/sim_loss": 0.0003279447555541992 }, { "epoch": 0.3043306308087799, "step": 1539, "train/total_loss": 0.10387261211872101 }, { "epoch": 0.30452837650781095, "grad_norm": 0.5074532628059387, "learning_rate": 9.243743199129489e-06, "loss": 0.0805, "step": 1540 }, { "entropy": 5.855683326721191, "epoch": 0.30452837650781095, "mean_token_accuracy": 0.7121034264564514, "num_tokens": 68900998.0, "step": 1540, "train/ce_loss": 9.684146789368242e-06 }, { "epoch": 0.30452837650781095, "step": 1540, "train/sim_loss": 0.00032806396484375 }, { "epoch": 0.30452837650781095, "step": 1540, "train/total_loss": 0.00032903236569836736 }, { "entropy": 5.843552589416504, "epoch": 0.304726122206842, "mean_token_accuracy": 0.7601307034492493, "num_tokens": 68939679.0, "step": 1541, "train/ce_loss": 0.7689717411994934 }, { "epoch": 0.304726122206842, "step": 1541, "train/sim_loss": 0.0001952052116394043 }, { "epoch": 0.304726122206842, "step": 1541, "train/total_loss": 0.07709237933158875 }, { "entropy": 6.295602798461914, "epoch": 0.30492386790587306, "mean_token_accuracy": 0.7510699033737183, "num_tokens": 68986680.0, "step": 1542, "train/ce_loss": 1.2521625757217407 }, { "epoch": 0.30492386790587306, "step": 1542, "train/sim_loss": 0.0003012418746948242 }, { "epoch": 0.30492386790587306, "step": 1542, "train/total_loss": 0.12551750242710114 }, { "entropy": 6.181563854217529, "epoch": 0.3051216136049041, "mean_token_accuracy": 0.7050960659980774, "num_tokens": 69038204.0, "step": 1543, "train/ce_loss": 1.2319424152374268 }, { "epoch": 0.3051216136049041, "step": 1543, "train/sim_loss": 0.00019109249114990234 }, { "epoch": 0.3051216136049041, "step": 1543, "train/total_loss": 0.12338533252477646 }, { "entropy": 5.873246192932129, "epoch": 0.3053193593039351, "mean_token_accuracy": 0.7467783689498901, "num_tokens": 69072748.0, "step": 1544, "train/ce_loss": 0.4848366975784302 }, { "epoch": 0.3053193593039351, "step": 1544, "train/sim_loss": 0.00020062923431396484 }, { "epoch": 0.3053193593039351, "step": 1544, "train/total_loss": 0.04868429899215698 }, { "entropy": 5.89625358581543, "epoch": 0.3055171050029662, "mean_token_accuracy": 0.7389215230941772, "num_tokens": 69119553.0, "step": 1545, "train/ce_loss": 0.9560545682907104 }, { "epoch": 0.3055171050029662, "step": 1545, "train/sim_loss": 0.00016379356384277344 }, { "epoch": 0.3055171050029662, "step": 1545, "train/total_loss": 0.0957692489027977 }, { "entropy": 5.994850158691406, "epoch": 0.3057148507019972, "mean_token_accuracy": 0.7247324585914612, "num_tokens": 69165678.0, "step": 1546, "train/ce_loss": 0.6724380254745483 }, { "epoch": 0.3057148507019972, "step": 1546, "train/sim_loss": 0.0002785921096801758 }, { "epoch": 0.3057148507019972, "step": 1546, "train/total_loss": 0.06752239912748337 }, { "entropy": 6.126839637756348, "epoch": 0.3059125964010283, "mean_token_accuracy": 0.7246835231781006, "num_tokens": 69202995.0, "step": 1547, "train/ce_loss": 1.4648791551589966 }, { "epoch": 0.3059125964010283, "step": 1547, "train/sim_loss": 0.00019979476928710938 }, { "epoch": 0.3059125964010283, "step": 1547, "train/total_loss": 0.14668771624565125 }, { "entropy": 5.974391937255859, "epoch": 0.30611034210005933, "mean_token_accuracy": 0.7675628662109375, "num_tokens": 69238160.0, "step": 1548, "train/ce_loss": 1.503043677075766e-05 }, { "epoch": 0.30611034210005933, "step": 1548, "train/sim_loss": 0.0001850128173828125 }, { "epoch": 0.30611034210005933, "step": 1548, "train/total_loss": 0.00018651585560292006 }, { "entropy": 6.357900619506836, "epoch": 0.30630808779909036, "mean_token_accuracy": 0.6803921461105347, "num_tokens": 69285988.0, "step": 1549, "train/ce_loss": 1.399558663368225 }, { "epoch": 0.30630808779909036, "step": 1549, "train/sim_loss": 0.0001671314239501953 }, { "epoch": 0.30630808779909036, "step": 1549, "train/total_loss": 0.14012299478054047 }, { "entropy": 5.951502799987793, "epoch": 0.30650583349812144, "mean_token_accuracy": 0.7535545229911804, "num_tokens": 69318864.0, "step": 1550, "train/ce_loss": 0.5347445011138916 }, { "epoch": 0.30650583349812144, "step": 1550, "train/sim_loss": 0.00029522180557250977 }, { "epoch": 0.30650583349812144, "step": 1550, "train/total_loss": 0.05376967415213585 }, { "entropy": 5.835508346557617, "epoch": 0.30670357919715246, "mean_token_accuracy": 0.7760898470878601, "num_tokens": 69362598.0, "step": 1551, "train/ce_loss": 0.6889052391052246 }, { "epoch": 0.30670357919715246, "step": 1551, "train/sim_loss": 0.0001804828643798828 }, { "epoch": 0.30670357919715246, "step": 1551, "train/total_loss": 0.06907100975513458 }, { "entropy": 5.819210529327393, "epoch": 0.3069013248961835, "mean_token_accuracy": 0.769291341304779, "num_tokens": 69415546.0, "step": 1552, "train/ce_loss": 0.7487441301345825 }, { "epoch": 0.3069013248961835, "step": 1552, "train/sim_loss": 0.00016254186630249023 }, { "epoch": 0.3069013248961835, "step": 1552, "train/total_loss": 0.07503695785999298 }, { "entropy": 5.955567359924316, "epoch": 0.3070990705952146, "mean_token_accuracy": 0.7164573669433594, "num_tokens": 69463186.0, "step": 1553, "train/ce_loss": 1.2402429580688477 }, { "epoch": 0.3070990705952146, "step": 1553, "train/sim_loss": 0.00022304058074951172 }, { "epoch": 0.3070990705952146, "step": 1553, "train/total_loss": 0.12424733489751816 }, { "entropy": 6.076925754547119, "epoch": 0.3072968162942456, "mean_token_accuracy": 0.7153846025466919, "num_tokens": 69521845.0, "step": 1554, "train/ce_loss": 1.2325592251727358e-05 }, { "epoch": 0.3072968162942456, "step": 1554, "train/sim_loss": 0.00017547607421875 }, { "epoch": 0.3072968162942456, "step": 1554, "train/total_loss": 0.0001767086359905079 }, { "entropy": 6.190805912017822, "epoch": 0.3074945619932766, "mean_token_accuracy": 0.7128658294677734, "num_tokens": 69569417.0, "step": 1555, "train/ce_loss": 1.5458953380584717 }, { "epoch": 0.3074945619932766, "step": 1555, "train/sim_loss": 0.00027376413345336914 }, { "epoch": 0.3074945619932766, "step": 1555, "train/total_loss": 0.15486329793930054 }, { "entropy": 6.206090927124023, "epoch": 0.3076923076923077, "mean_token_accuracy": 0.7552631497383118, "num_tokens": 69609259.0, "step": 1556, "train/ce_loss": 0.8690729141235352 }, { "epoch": 0.3076923076923077, "step": 1556, "train/sim_loss": 0.00027042627334594727 }, { "epoch": 0.3076923076923077, "step": 1556, "train/total_loss": 0.08717771619558334 }, { "entropy": 5.850266456604004, "epoch": 0.30789005339133874, "mean_token_accuracy": 0.7618069648742676, "num_tokens": 69641928.0, "step": 1557, "train/ce_loss": 1.1051791906356812 }, { "epoch": 0.30789005339133874, "step": 1557, "train/sim_loss": 0.00020706653594970703 }, { "epoch": 0.30789005339133874, "step": 1557, "train/total_loss": 0.11072498559951782 }, { "entropy": 6.256167411804199, "epoch": 0.30808779909036976, "mean_token_accuracy": 0.7100149393081665, "num_tokens": 69689847.0, "step": 1558, "train/ce_loss": 1.4848990440368652 }, { "epoch": 0.30808779909036976, "step": 1558, "train/sim_loss": 0.00020515918731689453 }, { "epoch": 0.30808779909036976, "step": 1558, "train/total_loss": 0.14869506657123566 }, { "entropy": 5.910398006439209, "epoch": 0.30828554478940084, "mean_token_accuracy": 0.7450532913208008, "num_tokens": 69731555.0, "step": 1559, "train/ce_loss": 1.153900484496262e-05 }, { "epoch": 0.30828554478940084, "step": 1559, "train/sim_loss": 0.0002396106719970703 }, { "epoch": 0.30828554478940084, "step": 1559, "train/total_loss": 0.00024076456611510366 }, { "epoch": 0.30848329048843187, "grad_norm": 0.46878471970558167, "learning_rate": 9.233851023840143e-06, "loss": 0.0855, "step": 1560 }, { "entropy": 5.768556594848633, "epoch": 0.30848329048843187, "mean_token_accuracy": 0.7191516757011414, "num_tokens": 69769884.0, "step": 1560, "train/ce_loss": 0.5238661766052246 }, { "epoch": 0.30848329048843187, "step": 1560, "train/sim_loss": 0.00018227100372314453 }, { "epoch": 0.30848329048843187, "step": 1560, "train/total_loss": 0.052568890154361725 }, { "entropy": 5.91499137878418, "epoch": 0.3086810361874629, "mean_token_accuracy": 0.7383309602737427, "num_tokens": 69827337.0, "step": 1561, "train/ce_loss": 0.7043523192405701 }, { "epoch": 0.3086810361874629, "step": 1561, "train/sim_loss": 0.0002053380012512207 }, { "epoch": 0.3086810361874629, "step": 1561, "train/total_loss": 0.07064057141542435 }, { "entropy": 6.195272445678711, "epoch": 0.308878781886494, "mean_token_accuracy": 0.7307953834533691, "num_tokens": 69875128.0, "step": 1562, "train/ce_loss": 1.0634255409240723 }, { "epoch": 0.308878781886494, "step": 1562, "train/sim_loss": 0.0001888275146484375 }, { "epoch": 0.308878781886494, "step": 1562, "train/total_loss": 0.10653138160705566 }, { "entropy": 6.205271244049072, "epoch": 0.309076527585525, "mean_token_accuracy": 0.7559958100318909, "num_tokens": 69923545.0, "step": 1563, "train/ce_loss": 1.578784576850012e-05 }, { "epoch": 0.309076527585525, "step": 1563, "train/sim_loss": 0.00016558170318603516 }, { "epoch": 0.309076527585525, "step": 1563, "train/total_loss": 0.00016716048412490636 }, { "entropy": 6.355605125427246, "epoch": 0.30927427328455603, "mean_token_accuracy": 0.7026074528694153, "num_tokens": 69959869.0, "step": 1564, "train/ce_loss": 1.2277339696884155 }, { "epoch": 0.30927427328455603, "step": 1564, "train/sim_loss": 0.0002636909484863281 }, { "epoch": 0.30927427328455603, "step": 1564, "train/total_loss": 0.12303709238767624 }, { "entropy": 6.2356157302856445, "epoch": 0.3094720189835871, "mean_token_accuracy": 0.7207654118537903, "num_tokens": 70001969.0, "step": 1565, "train/ce_loss": 1.2253266504558269e-05 }, { "epoch": 0.3094720189835871, "step": 1565, "train/sim_loss": 0.00014990568161010742 }, { "epoch": 0.3094720189835871, "step": 1565, "train/total_loss": 0.00015113101107999682 }, { "entropy": 6.294546127319336, "epoch": 0.30966976468261814, "mean_token_accuracy": 0.740903377532959, "num_tokens": 70057578.0, "step": 1566, "train/ce_loss": 0.7297672033309937 }, { "epoch": 0.30966976468261814, "step": 1566, "train/sim_loss": 0.00024777650833129883 }, { "epoch": 0.30966976468261814, "step": 1566, "train/total_loss": 0.0732244998216629 }, { "entropy": 6.0408172607421875, "epoch": 0.3098675103816492, "mean_token_accuracy": 0.7681512832641602, "num_tokens": 70113060.0, "step": 1567, "train/ce_loss": 0.6225643754005432 }, { "epoch": 0.3098675103816492, "step": 1567, "train/sim_loss": 0.0001678466796875 }, { "epoch": 0.3098675103816492, "step": 1567, "train/total_loss": 0.06242428347468376 }, { "entropy": 6.145441055297852, "epoch": 0.31006525608068025, "mean_token_accuracy": 0.7157257795333862, "num_tokens": 70171503.0, "step": 1568, "train/ce_loss": 1.1928907632827759 }, { "epoch": 0.31006525608068025, "step": 1568, "train/sim_loss": 0.00022363662719726562 }, { "epoch": 0.31006525608068025, "step": 1568, "train/total_loss": 0.11951271444559097 }, { "entropy": 6.511309623718262, "epoch": 0.3102630017797113, "mean_token_accuracy": 0.7077151536941528, "num_tokens": 70221513.0, "step": 1569, "train/ce_loss": 1.4161993265151978 }, { "epoch": 0.3102630017797113, "step": 1569, "train/sim_loss": 0.00023066997528076172 }, { "epoch": 0.3102630017797113, "step": 1569, "train/total_loss": 0.14185060560703278 }, { "entropy": 6.176023483276367, "epoch": 0.31046074747874236, "mean_token_accuracy": 0.7637271285057068, "num_tokens": 70262110.0, "step": 1570, "train/ce_loss": 0.533210277557373 }, { "epoch": 0.31046074747874236, "step": 1570, "train/sim_loss": 0.0003396272659301758 }, { "epoch": 0.31046074747874236, "step": 1570, "train/total_loss": 0.05366065725684166 }, { "entropy": 6.185259819030762, "epoch": 0.3106584931777734, "mean_token_accuracy": 0.7521489858627319, "num_tokens": 70304773.0, "step": 1571, "train/ce_loss": 1.2143501043319702 }, { "epoch": 0.3106584931777734, "step": 1571, "train/sim_loss": 0.00023627281188964844 }, { "epoch": 0.3106584931777734, "step": 1571, "train/total_loss": 0.12167128175497055 }, { "entropy": 5.998712539672852, "epoch": 0.3108562388768044, "mean_token_accuracy": 0.7579010128974915, "num_tokens": 70348221.0, "step": 1572, "train/ce_loss": 0.8397617936134338 }, { "epoch": 0.3108562388768044, "step": 1572, "train/sim_loss": 0.00026804208755493164 }, { "epoch": 0.3108562388768044, "step": 1572, "train/total_loss": 0.08424422144889832 }, { "entropy": 5.852245330810547, "epoch": 0.3110539845758355, "mean_token_accuracy": 0.7600806355476379, "num_tokens": 70374497.0, "step": 1573, "train/ce_loss": 1.0149910849577282e-05 }, { "epoch": 0.3110539845758355, "step": 1573, "train/sim_loss": 0.0002677440643310547 }, { "epoch": 0.3110539845758355, "step": 1573, "train/total_loss": 0.0002687590604182333 }, { "entropy": 6.2509307861328125, "epoch": 0.3112517302748665, "mean_token_accuracy": 0.7209985256195068, "num_tokens": 70435913.0, "step": 1574, "train/ce_loss": 1.5841840649954975e-05 }, { "epoch": 0.3112517302748665, "step": 1574, "train/sim_loss": 0.00045311450958251953 }, { "epoch": 0.3112517302748665, "step": 1574, "train/total_loss": 0.00045469868928194046 }, { "entropy": 6.130251884460449, "epoch": 0.31144947597389755, "mean_token_accuracy": 0.7370544672012329, "num_tokens": 70488571.0, "step": 1575, "train/ce_loss": 1.337831735610962 }, { "epoch": 0.31144947597389755, "step": 1575, "train/sim_loss": 0.0003120303153991699 }, { "epoch": 0.31144947597389755, "step": 1575, "train/total_loss": 0.1340952068567276 }, { "entropy": 6.320976257324219, "epoch": 0.31164722167292863, "mean_token_accuracy": 0.7368420958518982, "num_tokens": 70537790.0, "step": 1576, "train/ce_loss": 0.994045078754425 }, { "epoch": 0.31164722167292863, "step": 1576, "train/sim_loss": 0.00026428699493408203 }, { "epoch": 0.31164722167292863, "step": 1576, "train/total_loss": 0.09966879338026047 }, { "entropy": 5.633877754211426, "epoch": 0.31184496737195966, "mean_token_accuracy": 0.7617514729499817, "num_tokens": 70584025.0, "step": 1577, "train/ce_loss": 0.725443959236145 }, { "epoch": 0.31184496737195966, "step": 1577, "train/sim_loss": 0.00017827749252319336 }, { "epoch": 0.31184496737195966, "step": 1577, "train/total_loss": 0.0727226734161377 }, { "entropy": 6.204786777496338, "epoch": 0.3120427130709907, "mean_token_accuracy": 0.7387686967849731, "num_tokens": 70640388.0, "step": 1578, "train/ce_loss": 0.5542773604393005 }, { "epoch": 0.3120427130709907, "step": 1578, "train/sim_loss": 0.00029671192169189453 }, { "epoch": 0.3120427130709907, "step": 1578, "train/total_loss": 0.05572444945573807 }, { "entropy": 5.848178863525391, "epoch": 0.31224045877002177, "mean_token_accuracy": 0.7121439576148987, "num_tokens": 70689900.0, "step": 1579, "train/ce_loss": 0.667990505695343 }, { "epoch": 0.31224045877002177, "step": 1579, "train/sim_loss": 0.00016546249389648438 }, { "epoch": 0.31224045877002177, "step": 1579, "train/total_loss": 0.0669645145535469 }, { "epoch": 0.3124382044690528, "grad_norm": 0.49329009652137756, "learning_rate": 9.223958848550797e-06, "loss": 0.084, "step": 1580 }, { "entropy": 5.788503170013428, "epoch": 0.3124382044690528, "mean_token_accuracy": 0.7210386395454407, "num_tokens": 70742415.0, "step": 1580, "train/ce_loss": 0.7791708111763 }, { "epoch": 0.3124382044690528, "step": 1580, "train/sim_loss": 0.0001951456069946289 }, { "epoch": 0.3124382044690528, "step": 1580, "train/total_loss": 0.07811222970485687 }, { "entropy": 6.170810699462891, "epoch": 0.3126359501680838, "mean_token_accuracy": 0.7043247818946838, "num_tokens": 70777159.0, "step": 1581, "train/ce_loss": 1.3015174865722656 }, { "epoch": 0.3126359501680838, "step": 1581, "train/sim_loss": 0.0001823902130126953 }, { "epoch": 0.3126359501680838, "step": 1581, "train/total_loss": 0.13033413887023926 }, { "entropy": 5.940251350402832, "epoch": 0.3128336958671149, "mean_token_accuracy": 0.759188175201416, "num_tokens": 70821574.0, "step": 1582, "train/ce_loss": 0.8619784712791443 }, { "epoch": 0.3128336958671149, "step": 1582, "train/sim_loss": 0.00022214651107788086 }, { "epoch": 0.3128336958671149, "step": 1582, "train/total_loss": 0.08641999214887619 }, { "entropy": 6.0452728271484375, "epoch": 0.31303144156614593, "mean_token_accuracy": 0.7579415440559387, "num_tokens": 70859794.0, "step": 1583, "train/ce_loss": 1.0120624210685492e-05 }, { "epoch": 0.31303144156614593, "step": 1583, "train/sim_loss": 0.0001729130744934082 }, { "epoch": 0.31303144156614593, "step": 1583, "train/total_loss": 0.00017392513109371066 }, { "entropy": 6.364413261413574, "epoch": 0.31322918726517696, "mean_token_accuracy": 0.7215719223022461, "num_tokens": 70896552.0, "step": 1584, "train/ce_loss": 0.5258574485778809 }, { "epoch": 0.31322918726517696, "step": 1584, "train/sim_loss": 0.0002154707908630371 }, { "epoch": 0.31322918726517696, "step": 1584, "train/total_loss": 0.0528012178838253 }, { "entropy": 5.808774471282959, "epoch": 0.31342693296420804, "mean_token_accuracy": 0.7243935465812683, "num_tokens": 70946774.0, "step": 1585, "train/ce_loss": 0.7202849388122559 }, { "epoch": 0.31342693296420804, "step": 1585, "train/sim_loss": 0.0002944469451904297 }, { "epoch": 0.31342693296420804, "step": 1585, "train/total_loss": 0.07232294231653214 }, { "entropy": 6.2091593742370605, "epoch": 0.31362467866323906, "mean_token_accuracy": 0.7663352489471436, "num_tokens": 70995414.0, "step": 1586, "train/ce_loss": 0.9082958698272705 }, { "epoch": 0.31362467866323906, "step": 1586, "train/sim_loss": 0.00021004676818847656 }, { "epoch": 0.31362467866323906, "step": 1586, "train/total_loss": 0.09103963524103165 }, { "entropy": 6.012488842010498, "epoch": 0.31382242436227015, "mean_token_accuracy": 0.7175018191337585, "num_tokens": 71042364.0, "step": 1587, "train/ce_loss": 1.1824564933776855 }, { "epoch": 0.31382242436227015, "step": 1587, "train/sim_loss": 0.00023305416107177734 }, { "epoch": 0.31382242436227015, "step": 1587, "train/total_loss": 0.11847870796918869 }, { "entropy": 6.147686004638672, "epoch": 0.3140201700613012, "mean_token_accuracy": 0.7322946190834045, "num_tokens": 71087380.0, "step": 1588, "train/ce_loss": 0.7048010230064392 }, { "epoch": 0.3140201700613012, "step": 1588, "train/sim_loss": 0.00016885995864868164 }, { "epoch": 0.3140201700613012, "step": 1588, "train/total_loss": 0.07064896076917648 }, { "entropy": 6.189990043640137, "epoch": 0.3142179157603322, "mean_token_accuracy": 0.7457162737846375, "num_tokens": 71121426.0, "step": 1589, "train/ce_loss": 1.5964447259902954 }, { "epoch": 0.3142179157603322, "step": 1589, "train/sim_loss": 0.00021970272064208984 }, { "epoch": 0.3142179157603322, "step": 1589, "train/total_loss": 0.1598641723394394 }, { "entropy": 6.2066473960876465, "epoch": 0.3144156614593633, "mean_token_accuracy": 0.7354670166969299, "num_tokens": 71170424.0, "step": 1590, "train/ce_loss": 1.4355498552322388 }, { "epoch": 0.3144156614593633, "step": 1590, "train/sim_loss": 0.000288546085357666 }, { "epoch": 0.3144156614593633, "step": 1590, "train/total_loss": 0.14384353160858154 }, { "entropy": 6.22827672958374, "epoch": 0.3146134071583943, "mean_token_accuracy": 0.741194486618042, "num_tokens": 71209038.0, "step": 1591, "train/ce_loss": 0.9919328093528748 }, { "epoch": 0.3146134071583943, "step": 1591, "train/sim_loss": 0.0003058910369873047 }, { "epoch": 0.3146134071583943, "step": 1591, "train/total_loss": 0.0994991734623909 }, { "entropy": 5.670672416687012, "epoch": 0.31481115285742534, "mean_token_accuracy": 0.7571337819099426, "num_tokens": 71245036.0, "step": 1592, "train/ce_loss": 0.49923598766326904 }, { "epoch": 0.31481115285742534, "step": 1592, "train/sim_loss": 0.00018805265426635742 }, { "epoch": 0.31481115285742534, "step": 1592, "train/total_loss": 0.05011165142059326 }, { "entropy": 5.829217910766602, "epoch": 0.3150088985564564, "mean_token_accuracy": 0.7419533729553223, "num_tokens": 71297021.0, "step": 1593, "train/ce_loss": 0.7683616280555725 }, { "epoch": 0.3150088985564564, "step": 1593, "train/sim_loss": 0.0002714395523071289 }, { "epoch": 0.3150088985564564, "step": 1593, "train/total_loss": 0.07710760086774826 }, { "entropy": 5.9882001876831055, "epoch": 0.31520664425548744, "mean_token_accuracy": 0.7423349022865295, "num_tokens": 71357553.0, "step": 1594, "train/ce_loss": 0.7401898503303528 }, { "epoch": 0.31520664425548744, "step": 1594, "train/sim_loss": 0.00022715330123901367 }, { "epoch": 0.31520664425548744, "step": 1594, "train/total_loss": 0.07424613833427429 }, { "entropy": 5.989432334899902, "epoch": 0.31540438995451847, "mean_token_accuracy": 0.7284366488456726, "num_tokens": 71390977.0, "step": 1595, "train/ce_loss": 1.9914532899856567 }, { "epoch": 0.31540438995451847, "step": 1595, "train/sim_loss": 0.00018155574798583984 }, { "epoch": 0.31540438995451847, "step": 1595, "train/total_loss": 0.19932688772678375 }, { "entropy": 6.023273944854736, "epoch": 0.31560213565354955, "mean_token_accuracy": 0.7119784951210022, "num_tokens": 71440689.0, "step": 1596, "train/ce_loss": 0.5858834981918335 }, { "epoch": 0.31560213565354955, "step": 1596, "train/sim_loss": 0.00016611814498901367 }, { "epoch": 0.31560213565354955, "step": 1596, "train/total_loss": 0.05875447019934654 }, { "entropy": 6.010629653930664, "epoch": 0.3157998813525806, "mean_token_accuracy": 0.7583596110343933, "num_tokens": 71489455.0, "step": 1597, "train/ce_loss": 0.43513426184654236 }, { "epoch": 0.3157998813525806, "step": 1597, "train/sim_loss": 0.0002499818801879883 }, { "epoch": 0.3157998813525806, "step": 1597, "train/total_loss": 0.0437634103000164 }, { "entropy": 6.236546516418457, "epoch": 0.3159976270516116, "mean_token_accuracy": 0.7404948472976685, "num_tokens": 71529076.0, "step": 1598, "train/ce_loss": 0.6493671536445618 }, { "epoch": 0.3159976270516116, "step": 1598, "train/sim_loss": 0.00026494264602661133 }, { "epoch": 0.3159976270516116, "step": 1598, "train/total_loss": 0.06520166248083115 }, { "entropy": 6.596467971801758, "epoch": 0.3161953727506427, "mean_token_accuracy": 0.709160327911377, "num_tokens": 71576196.0, "step": 1599, "train/ce_loss": 1.2739825248718262 }, { "epoch": 0.3161953727506427, "step": 1599, "train/sim_loss": 0.0002586245536804199 }, { "epoch": 0.3161953727506427, "step": 1599, "train/total_loss": 0.12765687704086304 }, { "epoch": 0.3163931184496737, "grad_norm": 0.4892312288284302, "learning_rate": 9.214066673261452e-06, "loss": 0.0842, "step": 1600 }, { "entropy": 6.10526180267334, "epoch": 0.3163931184496737, "mean_token_accuracy": 0.7279411554336548, "num_tokens": 71635748.0, "step": 1600, "train/ce_loss": 1.0776954889297485 }, { "epoch": 0.3163931184496737, "step": 1600, "train/sim_loss": 0.00023949146270751953 }, { "epoch": 0.3163931184496737, "step": 1600, "train/total_loss": 0.10800904035568237 }, { "entropy": 6.456125259399414, "epoch": 0.31659086414870474, "mean_token_accuracy": 0.7370948195457458, "num_tokens": 71685427.0, "step": 1601, "train/ce_loss": 1.5391483306884766 }, { "epoch": 0.31659086414870474, "step": 1601, "train/sim_loss": 0.0002504587173461914 }, { "epoch": 0.31659086414870474, "step": 1601, "train/total_loss": 0.15416529774665833 }, { "entropy": 6.337928295135498, "epoch": 0.3167886098477358, "mean_token_accuracy": 0.7524917125701904, "num_tokens": 71730064.0, "step": 1602, "train/ce_loss": 1.1501989364624023 }, { "epoch": 0.3167886098477358, "step": 1602, "train/sim_loss": 0.00023818016052246094 }, { "epoch": 0.3167886098477358, "step": 1602, "train/total_loss": 0.11525807529687881 }, { "entropy": 6.107874870300293, "epoch": 0.31698635554676685, "mean_token_accuracy": 0.7255370020866394, "num_tokens": 71782253.0, "step": 1603, "train/ce_loss": 0.8669885396957397 }, { "epoch": 0.31698635554676685, "step": 1603, "train/sim_loss": 0.00017833709716796875 }, { "epoch": 0.31698635554676685, "step": 1603, "train/total_loss": 0.08687718957662582 }, { "entropy": 6.472430229187012, "epoch": 0.3171841012457979, "mean_token_accuracy": 0.7235257029533386, "num_tokens": 71833743.0, "step": 1604, "train/ce_loss": 0.8704966306686401 }, { "epoch": 0.3171841012457979, "step": 1604, "train/sim_loss": 0.00017321109771728516 }, { "epoch": 0.3171841012457979, "step": 1604, "train/total_loss": 0.0872228741645813 }, { "entropy": 6.368939399719238, "epoch": 0.31738184694482896, "mean_token_accuracy": 0.762565016746521, "num_tokens": 71882744.0, "step": 1605, "train/ce_loss": 0.6850404143333435 }, { "epoch": 0.31738184694482896, "step": 1605, "train/sim_loss": 0.0001628398895263672 }, { "epoch": 0.31738184694482896, "step": 1605, "train/total_loss": 0.06866688281297684 }, { "entropy": 6.247353553771973, "epoch": 0.31757959264386, "mean_token_accuracy": 0.7535014152526855, "num_tokens": 71934896.0, "step": 1606, "train/ce_loss": 0.8764733076095581 }, { "epoch": 0.31757959264386, "step": 1606, "train/sim_loss": 0.00016105175018310547 }, { "epoch": 0.31757959264386, "step": 1606, "train/total_loss": 0.08780838549137115 }, { "entropy": 5.943172931671143, "epoch": 0.31777733834289107, "mean_token_accuracy": 0.7567567825317383, "num_tokens": 71976299.0, "step": 1607, "train/ce_loss": 1.3445524928101804e-05 }, { "epoch": 0.31777733834289107, "step": 1607, "train/sim_loss": 0.00020313262939453125 }, { "epoch": 0.31777733834289107, "step": 1607, "train/total_loss": 0.00020447718270588666 }, { "entropy": 6.165987968444824, "epoch": 0.3179750840419221, "mean_token_accuracy": 0.7483574151992798, "num_tokens": 72035771.0, "step": 1608, "train/ce_loss": 0.4871208667755127 }, { "epoch": 0.3179750840419221, "step": 1608, "train/sim_loss": 0.00020569562911987305 }, { "epoch": 0.3179750840419221, "step": 1608, "train/total_loss": 0.04891778156161308 }, { "entropy": 6.2843451499938965, "epoch": 0.3181728297409531, "mean_token_accuracy": 0.7198641896247864, "num_tokens": 72066560.0, "step": 1609, "train/ce_loss": 2.904594293795526e-05 }, { "epoch": 0.3181728297409531, "step": 1609, "train/sim_loss": 0.00019663572311401367 }, { "epoch": 0.3181728297409531, "step": 1609, "train/total_loss": 0.00019954031449742615 }, { "entropy": 6.084291934967041, "epoch": 0.3183705754399842, "mean_token_accuracy": 0.7556818127632141, "num_tokens": 72117039.0, "step": 1610, "train/ce_loss": 1.4188599586486816 }, { "epoch": 0.3183705754399842, "step": 1610, "train/sim_loss": 0.00030094385147094727 }, { "epoch": 0.3183705754399842, "step": 1610, "train/total_loss": 0.1421869397163391 }, { "entropy": 6.057844638824463, "epoch": 0.31856832113901523, "mean_token_accuracy": 0.7647058963775635, "num_tokens": 72156923.0, "step": 1611, "train/ce_loss": 0.4745127558708191 }, { "epoch": 0.31856832113901523, "step": 1611, "train/sim_loss": 0.0002308487892150879 }, { "epoch": 0.31856832113901523, "step": 1611, "train/total_loss": 0.04768212512135506 }, { "entropy": 6.639603614807129, "epoch": 0.31876606683804626, "mean_token_accuracy": 0.7146154046058655, "num_tokens": 72214905.0, "step": 1612, "train/ce_loss": 0.996002197265625 }, { "epoch": 0.31876606683804626, "step": 1612, "train/sim_loss": 0.00018095970153808594 }, { "epoch": 0.31876606683804626, "step": 1612, "train/total_loss": 0.09978117793798447 }, { "entropy": 6.127943992614746, "epoch": 0.31896381253707734, "mean_token_accuracy": 0.7495934963226318, "num_tokens": 72246218.0, "step": 1613, "train/ce_loss": 1.2718943253275938e-05 }, { "epoch": 0.31896381253707734, "step": 1613, "train/sim_loss": 0.0001571178436279297 }, { "epoch": 0.31896381253707734, "step": 1613, "train/total_loss": 0.00015838973922654986 }, { "entropy": 6.324923515319824, "epoch": 0.31916155823610837, "mean_token_accuracy": 0.759729266166687, "num_tokens": 72295600.0, "step": 1614, "train/ce_loss": 1.5871111154556274 }, { "epoch": 0.31916155823610837, "step": 1614, "train/sim_loss": 0.00024753808975219727 }, { "epoch": 0.31916155823610837, "step": 1614, "train/total_loss": 0.15895865857601166 }, { "entropy": 6.525955677032471, "epoch": 0.3193593039351394, "mean_token_accuracy": 0.7145438194274902, "num_tokens": 72338652.0, "step": 1615, "train/ce_loss": 1.4024099982634652e-05 }, { "epoch": 0.3193593039351394, "step": 1615, "train/sim_loss": 0.00034737586975097656 }, { "epoch": 0.3193593039351394, "step": 1615, "train/total_loss": 0.00034877826692536473 }, { "entropy": 5.8268723487854, "epoch": 0.3195570496341705, "mean_token_accuracy": 0.7965335249900818, "num_tokens": 72374609.0, "step": 1616, "train/ce_loss": 1.3842271982866805e-05 }, { "epoch": 0.3195570496341705, "step": 1616, "train/sim_loss": 0.0003148317337036133 }, { "epoch": 0.3195570496341705, "step": 1616, "train/total_loss": 0.00031621597008779645 }, { "entropy": 6.109858512878418, "epoch": 0.3197547953332015, "mean_token_accuracy": 0.7240418195724487, "num_tokens": 72417834.0, "step": 1617, "train/ce_loss": 0.8482828736305237 }, { "epoch": 0.3197547953332015, "step": 1617, "train/sim_loss": 0.0003674030303955078 }, { "epoch": 0.3197547953332015, "step": 1617, "train/total_loss": 0.08519569039344788 }, { "entropy": 6.337536811828613, "epoch": 0.31995254103223253, "mean_token_accuracy": 0.7476500272750854, "num_tokens": 72473936.0, "step": 1618, "train/ce_loss": 1.0699886083602905 }, { "epoch": 0.31995254103223253, "step": 1618, "train/sim_loss": 0.0003343820571899414 }, { "epoch": 0.31995254103223253, "step": 1618, "train/total_loss": 0.107333242893219 }, { "entropy": 6.304073333740234, "epoch": 0.3201502867312636, "mean_token_accuracy": 0.761500358581543, "num_tokens": 72516892.0, "step": 1619, "train/ce_loss": 0.611953616142273 }, { "epoch": 0.3201502867312636, "step": 1619, "train/sim_loss": 0.00020450353622436523 }, { "epoch": 0.3201502867312636, "step": 1619, "train/total_loss": 0.06139986589550972 }, { "epoch": 0.32034803243029464, "grad_norm": 0.44861191511154175, "learning_rate": 9.204174497972104e-06, "loss": 0.0808, "step": 1620 }, { "entropy": 6.42220401763916, "epoch": 0.32034803243029464, "mean_token_accuracy": 0.7811188697814941, "num_tokens": 72565856.0, "step": 1620, "train/ce_loss": 1.2492501809902024e-05 }, { "epoch": 0.32034803243029464, "step": 1620, "train/sim_loss": 0.00039398670196533203 }, { "epoch": 0.32034803243029464, "step": 1620, "train/total_loss": 0.00039523595478385687 }, { "entropy": 6.290999412536621, "epoch": 0.32054577812932566, "mean_token_accuracy": 0.7287264466285706, "num_tokens": 72598934.0, "step": 1621, "train/ce_loss": 1.0212270021438599 }, { "epoch": 0.32054577812932566, "step": 1621, "train/sim_loss": 0.00022560358047485352 }, { "epoch": 0.32054577812932566, "step": 1621, "train/total_loss": 0.10234830528497696 }, { "entropy": 6.325187683105469, "epoch": 0.32074352382835675, "mean_token_accuracy": 0.6915988922119141, "num_tokens": 72650234.0, "step": 1622, "train/ce_loss": 0.6990964412689209 }, { "epoch": 0.32074352382835675, "step": 1622, "train/sim_loss": 0.0003097057342529297 }, { "epoch": 0.32074352382835675, "step": 1622, "train/total_loss": 0.07021935284137726 }, { "entropy": 6.358364582061768, "epoch": 0.3209412695273878, "mean_token_accuracy": 0.7516746520996094, "num_tokens": 72701928.0, "step": 1623, "train/ce_loss": 0.6949672102928162 }, { "epoch": 0.3209412695273878, "step": 1623, "train/sim_loss": 0.0002595186233520508 }, { "epoch": 0.3209412695273878, "step": 1623, "train/total_loss": 0.06975623965263367 }, { "entropy": 5.638031005859375, "epoch": 0.3211390152264188, "mean_token_accuracy": 0.7710418105125427, "num_tokens": 72732890.0, "step": 1624, "train/ce_loss": 8.39411950437352e-06 }, { "epoch": 0.3211390152264188, "step": 1624, "train/sim_loss": 0.0002511739730834961 }, { "epoch": 0.3211390152264188, "step": 1624, "train/total_loss": 0.0002520133857615292 }, { "entropy": 6.2067461013793945, "epoch": 0.3213367609254499, "mean_token_accuracy": 0.7123362421989441, "num_tokens": 72784513.0, "step": 1625, "train/ce_loss": 0.770240843296051 }, { "epoch": 0.3213367609254499, "step": 1625, "train/sim_loss": 0.00023806095123291016 }, { "epoch": 0.3213367609254499, "step": 1625, "train/total_loss": 0.07726214826107025 }, { "entropy": 6.431839466094971, "epoch": 0.3215345066244809, "mean_token_accuracy": 0.7283688187599182, "num_tokens": 72838152.0, "step": 1626, "train/ce_loss": 1.054310917854309 }, { "epoch": 0.3215345066244809, "step": 1626, "train/sim_loss": 0.00025212764739990234 }, { "epoch": 0.3215345066244809, "step": 1626, "train/total_loss": 0.10568322241306305 }, { "entropy": 6.201539993286133, "epoch": 0.321732252323512, "mean_token_accuracy": 0.7606837749481201, "num_tokens": 72902559.0, "step": 1627, "train/ce_loss": 0.5868589282035828 }, { "epoch": 0.321732252323512, "step": 1627, "train/sim_loss": 0.0001830458641052246 }, { "epoch": 0.321732252323512, "step": 1627, "train/total_loss": 0.05886894091963768 }, { "entropy": 6.296112060546875, "epoch": 0.321929998022543, "mean_token_accuracy": 0.7485761046409607, "num_tokens": 72942998.0, "step": 1628, "train/ce_loss": 1.1173640489578247 }, { "epoch": 0.321929998022543, "step": 1628, "train/sim_loss": 0.0002504587173461914 }, { "epoch": 0.321929998022543, "step": 1628, "train/total_loss": 0.11198686808347702 }, { "entropy": 6.161230087280273, "epoch": 0.32212774372157404, "mean_token_accuracy": 0.7026049494743347, "num_tokens": 72988241.0, "step": 1629, "train/ce_loss": 1.1306266784667969 }, { "epoch": 0.32212774372157404, "step": 1629, "train/sim_loss": 0.0002506375312805176 }, { "epoch": 0.32212774372157404, "step": 1629, "train/total_loss": 0.11331330984830856 }, { "entropy": 5.987273216247559, "epoch": 0.3223254894206051, "mean_token_accuracy": 0.7670600414276123, "num_tokens": 73045353.0, "step": 1630, "train/ce_loss": 0.4402099847793579 }, { "epoch": 0.3223254894206051, "step": 1630, "train/sim_loss": 0.0001614093780517578 }, { "epoch": 0.3223254894206051, "step": 1630, "train/total_loss": 0.04418240860104561 }, { "entropy": 6.623687744140625, "epoch": 0.32252323511963615, "mean_token_accuracy": 0.7180094718933105, "num_tokens": 73079917.0, "step": 1631, "train/ce_loss": 1.1400508880615234 }, { "epoch": 0.32252323511963615, "step": 1631, "train/sim_loss": 0.00031006336212158203 }, { "epoch": 0.32252323511963615, "step": 1631, "train/total_loss": 0.11431515216827393 }, { "entropy": 6.544501304626465, "epoch": 0.3227209808186672, "mean_token_accuracy": 0.7426470518112183, "num_tokens": 73120126.0, "step": 1632, "train/ce_loss": 0.9874280691146851 }, { "epoch": 0.3227209808186672, "step": 1632, "train/sim_loss": 0.00022679567337036133 }, { "epoch": 0.3227209808186672, "step": 1632, "train/total_loss": 0.09896960109472275 }, { "entropy": 6.566156387329102, "epoch": 0.32291872651769826, "mean_token_accuracy": 0.7511563301086426, "num_tokens": 73175557.0, "step": 1633, "train/ce_loss": 0.7887973189353943 }, { "epoch": 0.32291872651769826, "step": 1633, "train/sim_loss": 0.00016105175018310547 }, { "epoch": 0.32291872651769826, "step": 1633, "train/total_loss": 0.07904078811407089 }, { "entropy": 6.410069942474365, "epoch": 0.3231164722167293, "mean_token_accuracy": 0.7941558361053467, "num_tokens": 73211780.0, "step": 1634, "train/ce_loss": 0.8326653242111206 }, { "epoch": 0.3231164722167293, "step": 1634, "train/sim_loss": 0.00023651123046875 }, { "epoch": 0.3231164722167293, "step": 1634, "train/total_loss": 0.08350304514169693 }, { "entropy": 6.278820514678955, "epoch": 0.3233142179157603, "mean_token_accuracy": 0.7411971688270569, "num_tokens": 73249125.0, "step": 1635, "train/ce_loss": 0.9474406838417053 }, { "epoch": 0.3233142179157603, "step": 1635, "train/sim_loss": 0.00020879507064819336 }, { "epoch": 0.3233142179157603, "step": 1635, "train/total_loss": 0.09495286643505096 }, { "entropy": 6.402674198150635, "epoch": 0.3235119636147914, "mean_token_accuracy": 0.769612729549408, "num_tokens": 73296999.0, "step": 1636, "train/ce_loss": 1.6753454474383034e-05 }, { "epoch": 0.3235119636147914, "step": 1636, "train/sim_loss": 0.00015223026275634766 }, { "epoch": 0.3235119636147914, "step": 1636, "train/total_loss": 0.0001539056102046743 }, { "entropy": 6.530575275421143, "epoch": 0.3237097093138224, "mean_token_accuracy": 0.7426666617393494, "num_tokens": 73342052.0, "step": 1637, "train/ce_loss": 1.0498636960983276 }, { "epoch": 0.3237097093138224, "step": 1637, "train/sim_loss": 0.00022995471954345703 }, { "epoch": 0.3237097093138224, "step": 1637, "train/total_loss": 0.10521632432937622 }, { "entropy": 6.212288856506348, "epoch": 0.32390745501285345, "mean_token_accuracy": 0.75, "num_tokens": 73379523.0, "step": 1638, "train/ce_loss": 1.03407621383667 }, { "epoch": 0.32390745501285345, "step": 1638, "train/sim_loss": 0.0003720521926879883 }, { "epoch": 0.32390745501285345, "step": 1638, "train/total_loss": 0.10377967357635498 }, { "entropy": 6.547874450683594, "epoch": 0.32410520071188453, "mean_token_accuracy": 0.7273768782615662, "num_tokens": 73437663.0, "step": 1639, "train/ce_loss": 0.7882471680641174 }, { "epoch": 0.32410520071188453, "step": 1639, "train/sim_loss": 0.00020694732666015625 }, { "epoch": 0.32410520071188453, "step": 1639, "train/total_loss": 0.07903166860342026 }, { "epoch": 0.32430294641091556, "grad_norm": 0.420431911945343, "learning_rate": 9.194282322682758e-06, "loss": 0.0823, "step": 1640 }, { "entropy": 5.8788251876831055, "epoch": 0.32430294641091556, "mean_token_accuracy": 0.792556881904602, "num_tokens": 73472992.0, "step": 1640, "train/ce_loss": 1.3141814470291138 }, { "epoch": 0.32430294641091556, "step": 1640, "train/sim_loss": 0.00017076730728149414 }, { "epoch": 0.32430294641091556, "step": 1640, "train/total_loss": 0.1315889209508896 }, { "entropy": 6.167997360229492, "epoch": 0.3245006921099466, "mean_token_accuracy": 0.7706766724586487, "num_tokens": 73506930.0, "step": 1641, "train/ce_loss": 0.7814515829086304 }, { "epoch": 0.3245006921099466, "step": 1641, "train/sim_loss": 0.00041419267654418945 }, { "epoch": 0.3245006921099466, "step": 1641, "train/total_loss": 0.07855935394763947 }, { "entropy": 5.97220516204834, "epoch": 0.32469843780897767, "mean_token_accuracy": 0.7305296063423157, "num_tokens": 73542257.0, "step": 1642, "train/ce_loss": 9.323018275608774e-06 }, { "epoch": 0.32469843780897767, "step": 1642, "train/sim_loss": 0.00018656253814697266 }, { "epoch": 0.32469843780897767, "step": 1642, "train/total_loss": 0.00018749483569990844 }, { "entropy": 5.817999839782715, "epoch": 0.3248961835080087, "mean_token_accuracy": 0.754181444644928, "num_tokens": 73594482.0, "step": 1643, "train/ce_loss": 0.5945837497711182 }, { "epoch": 0.3248961835080087, "step": 1643, "train/sim_loss": 0.00015032291412353516 }, { "epoch": 0.3248961835080087, "step": 1643, "train/total_loss": 0.05960869789123535 }, { "entropy": 6.276334762573242, "epoch": 0.3250939292070397, "mean_token_accuracy": 0.7516340017318726, "num_tokens": 73639576.0, "step": 1644, "train/ce_loss": 1.0801662938320078e-05 }, { "epoch": 0.3250939292070397, "step": 1644, "train/sim_loss": 0.00018543004989624023 }, { "epoch": 0.3250939292070397, "step": 1644, "train/total_loss": 0.00018651020945981145 }, { "entropy": 6.360741138458252, "epoch": 0.3252916749060708, "mean_token_accuracy": 0.75, "num_tokens": 73680121.0, "step": 1645, "train/ce_loss": 0.6565601825714111 }, { "epoch": 0.3252916749060708, "step": 1645, "train/sim_loss": 0.00019407272338867188 }, { "epoch": 0.3252916749060708, "step": 1645, "train/total_loss": 0.06585009396076202 }, { "entropy": 6.218259811401367, "epoch": 0.32548942060510183, "mean_token_accuracy": 0.7561357617378235, "num_tokens": 73748258.0, "step": 1646, "train/ce_loss": 0.8452968001365662 }, { "epoch": 0.32548942060510183, "step": 1646, "train/sim_loss": 0.00020378828048706055 }, { "epoch": 0.32548942060510183, "step": 1646, "train/total_loss": 0.08473347127437592 }, { "entropy": 6.1693806648254395, "epoch": 0.32568716630413286, "mean_token_accuracy": 0.7783051133155823, "num_tokens": 73788737.0, "step": 1647, "train/ce_loss": 1.0526691767154261e-05 }, { "epoch": 0.32568716630413286, "step": 1647, "train/sim_loss": 0.00018346309661865234 }, { "epoch": 0.32568716630413286, "step": 1647, "train/total_loss": 0.00018451576761435717 }, { "entropy": 6.453582763671875, "epoch": 0.32588491200316394, "mean_token_accuracy": 0.7192429304122925, "num_tokens": 73823868.0, "step": 1648, "train/ce_loss": 1.6280677300528623e-05 }, { "epoch": 0.32588491200316394, "step": 1648, "train/sim_loss": 0.00014609098434448242 }, { "epoch": 0.32588491200316394, "step": 1648, "train/total_loss": 0.0001477190526202321 }, { "entropy": 6.205606460571289, "epoch": 0.32608265770219497, "mean_token_accuracy": 0.7510676980018616, "num_tokens": 73872094.0, "step": 1649, "train/ce_loss": 0.8201037645339966 }, { "epoch": 0.32608265770219497, "step": 1649, "train/sim_loss": 0.0002409219741821289 }, { "epoch": 0.32608265770219497, "step": 1649, "train/total_loss": 0.08225130289793015 }, { "entropy": 6.3160905838012695, "epoch": 0.32628040340122605, "mean_token_accuracy": 0.7571428418159485, "num_tokens": 73929520.0, "step": 1650, "train/ce_loss": 0.8476790189743042 }, { "epoch": 0.32628040340122605, "step": 1650, "train/sim_loss": 0.0003694891929626465 }, { "epoch": 0.32628040340122605, "step": 1650, "train/total_loss": 0.08513738960027695 }, { "entropy": 6.018482208251953, "epoch": 0.3264781491002571, "mean_token_accuracy": 0.7398101687431335, "num_tokens": 73983459.0, "step": 1651, "train/ce_loss": 0.6787801384925842 }, { "epoch": 0.3264781491002571, "step": 1651, "train/sim_loss": 0.0003529191017150879 }, { "epoch": 0.3264781491002571, "step": 1651, "train/total_loss": 0.06823093444108963 }, { "entropy": 6.0467400550842285, "epoch": 0.3266758947992881, "mean_token_accuracy": 0.7277919054031372, "num_tokens": 74021485.0, "step": 1652, "train/ce_loss": 1.4354430437088013 }, { "epoch": 0.3266758947992881, "step": 1652, "train/sim_loss": 0.00018459558486938477 }, { "epoch": 0.3266758947992881, "step": 1652, "train/total_loss": 0.14372889697551727 }, { "entropy": 5.772747993469238, "epoch": 0.3268736404983192, "mean_token_accuracy": 0.7522059082984924, "num_tokens": 74064505.0, "step": 1653, "train/ce_loss": 1.0713084520830307e-05 }, { "epoch": 0.3268736404983192, "step": 1653, "train/sim_loss": 0.00022393465042114258 }, { "epoch": 0.3268736404983192, "step": 1653, "train/total_loss": 0.00022500596242025495 }, { "entropy": 6.574682235717773, "epoch": 0.3270713861973502, "mean_token_accuracy": 0.7577181458473206, "num_tokens": 74102967.0, "step": 1654, "train/ce_loss": 1.0047454452433158e-05 }, { "epoch": 0.3270713861973502, "step": 1654, "train/sim_loss": 0.00022721290588378906 }, { "epoch": 0.3270713861973502, "step": 1654, "train/total_loss": 0.00022821765742264688 }, { "entropy": 5.811762809753418, "epoch": 0.32726913189638124, "mean_token_accuracy": 0.7531279921531677, "num_tokens": 74145682.0, "step": 1655, "train/ce_loss": 0.5710539817810059 }, { "epoch": 0.32726913189638124, "step": 1655, "train/sim_loss": 0.0001868605613708496 }, { "epoch": 0.32726913189638124, "step": 1655, "train/total_loss": 0.057292260229587555 }, { "entropy": 6.116443634033203, "epoch": 0.3274668775954123, "mean_token_accuracy": 0.7550894618034363, "num_tokens": 74202627.0, "step": 1656, "train/ce_loss": 0.7788888216018677 }, { "epoch": 0.3274668775954123, "step": 1656, "train/sim_loss": 0.00016623735427856445 }, { "epoch": 0.3274668775954123, "step": 1656, "train/total_loss": 0.07805512100458145 }, { "entropy": 6.450084209442139, "epoch": 0.32766462329444335, "mean_token_accuracy": 0.7305785417556763, "num_tokens": 74253264.0, "step": 1657, "train/ce_loss": 0.9340744614601135 }, { "epoch": 0.32766462329444335, "step": 1657, "train/sim_loss": 0.0002530813217163086 }, { "epoch": 0.32766462329444335, "step": 1657, "train/total_loss": 0.09366052597761154 }, { "entropy": 6.241091728210449, "epoch": 0.3278623689934744, "mean_token_accuracy": 0.7393617033958435, "num_tokens": 74310815.0, "step": 1658, "train/ce_loss": 1.1376692056655884 }, { "epoch": 0.3278623689934744, "step": 1658, "train/sim_loss": 0.0005877017974853516 }, { "epoch": 0.3278623689934744, "step": 1658, "train/total_loss": 0.11435462534427643 }, { "entropy": 6.411901950836182, "epoch": 0.32806011469250546, "mean_token_accuracy": 0.7090694904327393, "num_tokens": 74343786.0, "step": 1659, "train/ce_loss": 1.168367624282837 }, { "epoch": 0.32806011469250546, "step": 1659, "train/sim_loss": 0.0002676248550415039 }, { "epoch": 0.32806011469250546, "step": 1659, "train/total_loss": 0.11710438877344131 }, { "epoch": 0.3282578603915365, "grad_norm": 0.42853111028671265, "learning_rate": 9.184390147393413e-06, "loss": 0.0811, "step": 1660 }, { "entropy": 6.389330863952637, "epoch": 0.3282578603915365, "mean_token_accuracy": 0.7364264726638794, "num_tokens": 74380828.0, "step": 1660, "train/ce_loss": 1.6101612345664762e-05 }, { "epoch": 0.3282578603915365, "step": 1660, "train/sim_loss": 0.00024199485778808594 }, { "epoch": 0.3282578603915365, "step": 1660, "train/total_loss": 0.0002436050126561895 }, { "entropy": 6.104854583740234, "epoch": 0.3284556060905675, "mean_token_accuracy": 0.8263118863105774, "num_tokens": 74405137.0, "step": 1661, "train/ce_loss": 0.5735926032066345 }, { "epoch": 0.3284556060905675, "step": 1661, "train/sim_loss": 0.00026279687881469727 }, { "epoch": 0.3284556060905675, "step": 1661, "train/total_loss": 0.05762205645442009 }, { "entropy": 6.623139381408691, "epoch": 0.3286533517895986, "mean_token_accuracy": 0.7398971319198608, "num_tokens": 74462728.0, "step": 1662, "train/ce_loss": 1.130248785018921 }, { "epoch": 0.3286533517895986, "step": 1662, "train/sim_loss": 0.00023949146270751953 }, { "epoch": 0.3286533517895986, "step": 1662, "train/total_loss": 0.11326437443494797 }, { "entropy": 6.43631649017334, "epoch": 0.3288510974886296, "mean_token_accuracy": 0.7104136943817139, "num_tokens": 74505563.0, "step": 1663, "train/ce_loss": 1.186050295829773 }, { "epoch": 0.3288510974886296, "step": 1663, "train/sim_loss": 0.00017130374908447266 }, { "epoch": 0.3288510974886296, "step": 1663, "train/total_loss": 0.118776336312294 }, { "entropy": 5.959081649780273, "epoch": 0.32904884318766064, "mean_token_accuracy": 0.7891601920127869, "num_tokens": 74535597.0, "step": 1664, "train/ce_loss": 0.7691791653633118 }, { "epoch": 0.32904884318766064, "step": 1664, "train/sim_loss": 0.0003573894500732422 }, { "epoch": 0.32904884318766064, "step": 1664, "train/total_loss": 0.07727530598640442 }, { "entropy": 5.96266508102417, "epoch": 0.3292465888866917, "mean_token_accuracy": 0.7762938141822815, "num_tokens": 74576235.0, "step": 1665, "train/ce_loss": 1.3694672816200182e-05 }, { "epoch": 0.3292465888866917, "step": 1665, "train/sim_loss": 0.00022226572036743164 }, { "epoch": 0.3292465888866917, "step": 1665, "train/total_loss": 0.00022363518655765802 }, { "entropy": 6.620163917541504, "epoch": 0.32944433458572275, "mean_token_accuracy": 0.7544072866439819, "num_tokens": 74620743.0, "step": 1666, "train/ce_loss": 0.6106863021850586 }, { "epoch": 0.32944433458572275, "step": 1666, "train/sim_loss": 0.00021284818649291992 }, { "epoch": 0.32944433458572275, "step": 1666, "train/total_loss": 0.0612814798951149 }, { "entropy": 6.339188098907471, "epoch": 0.3296420802847538, "mean_token_accuracy": 0.762734591960907, "num_tokens": 74660808.0, "step": 1667, "train/ce_loss": 9.373600732942577e-06 }, { "epoch": 0.3296420802847538, "step": 1667, "train/sim_loss": 0.0002244114875793457 }, { "epoch": 0.3296420802847538, "step": 1667, "train/total_loss": 0.00022534884919878095 }, { "entropy": 6.049135208129883, "epoch": 0.32983982598378486, "mean_token_accuracy": 0.7421013712882996, "num_tokens": 74705793.0, "step": 1668, "train/ce_loss": 1.1038785487471614e-05 }, { "epoch": 0.32983982598378486, "step": 1668, "train/sim_loss": 0.0002377033233642578 }, { "epoch": 0.32983982598378486, "step": 1668, "train/total_loss": 0.00023880720254965127 }, { "entropy": 5.957433223724365, "epoch": 0.3300375716828159, "mean_token_accuracy": 0.7676886916160583, "num_tokens": 74758421.0, "step": 1669, "train/ce_loss": 0.32052096724510193 }, { "epoch": 0.3300375716828159, "step": 1669, "train/sim_loss": 0.0001710653305053711 }, { "epoch": 0.3300375716828159, "step": 1669, "train/total_loss": 0.032223161309957504 }, { "entropy": 6.142133712768555, "epoch": 0.33023531738184697, "mean_token_accuracy": 0.7357926368713379, "num_tokens": 74798709.0, "step": 1670, "train/ce_loss": 0.6591081023216248 }, { "epoch": 0.33023531738184697, "step": 1670, "train/sim_loss": 0.00034224987030029297 }, { "epoch": 0.33023531738184697, "step": 1670, "train/total_loss": 0.06625305861234665 }, { "entropy": 6.139787197113037, "epoch": 0.330433063080878, "mean_token_accuracy": 0.8035914897918701, "num_tokens": 74826439.0, "step": 1671, "train/ce_loss": 2.9110729883541353e-05 }, { "epoch": 0.330433063080878, "step": 1671, "train/sim_loss": 0.0001576542854309082 }, { "epoch": 0.330433063080878, "step": 1671, "train/total_loss": 0.0001605653524165973 }, { "entropy": 6.107159614562988, "epoch": 0.330630808779909, "mean_token_accuracy": 0.7717528343200684, "num_tokens": 74867362.0, "step": 1672, "train/ce_loss": 0.5285351276397705 }, { "epoch": 0.330630808779909, "step": 1672, "train/sim_loss": 0.0001500844955444336 }, { "epoch": 0.330630808779909, "step": 1672, "train/total_loss": 0.053003598004579544 }, { "entropy": 6.406487464904785, "epoch": 0.3308285544789401, "mean_token_accuracy": 0.6923868060112, "num_tokens": 74909975.0, "step": 1673, "train/ce_loss": 1.384498318657279e-05 }, { "epoch": 0.3308285544789401, "step": 1673, "train/sim_loss": 0.0003864765167236328 }, { "epoch": 0.3308285544789401, "step": 1673, "train/total_loss": 0.0003878610150422901 }, { "entropy": 5.717354774475098, "epoch": 0.33102630017797113, "mean_token_accuracy": 0.7490272521972656, "num_tokens": 74951320.0, "step": 1674, "train/ce_loss": 1.3543238639831543 }, { "epoch": 0.33102630017797113, "step": 1674, "train/sim_loss": 0.0002906322479248047 }, { "epoch": 0.33102630017797113, "step": 1674, "train/total_loss": 0.1357230246067047 }, { "entropy": 5.891085147857666, "epoch": 0.33122404587700216, "mean_token_accuracy": 0.7323377132415771, "num_tokens": 75003161.0, "step": 1675, "train/ce_loss": 0.7429784536361694 }, { "epoch": 0.33122404587700216, "step": 1675, "train/sim_loss": 0.0002777576446533203 }, { "epoch": 0.33122404587700216, "step": 1675, "train/total_loss": 0.07457560300827026 }, { "entropy": 6.340249061584473, "epoch": 0.33142179157603324, "mean_token_accuracy": 0.7580453157424927, "num_tokens": 75050939.0, "step": 1676, "train/ce_loss": 0.7265517115592957 }, { "epoch": 0.33142179157603324, "step": 1676, "train/sim_loss": 0.00021588802337646484 }, { "epoch": 0.33142179157603324, "step": 1676, "train/total_loss": 0.07287105917930603 }, { "entropy": 6.108093738555908, "epoch": 0.33161953727506427, "mean_token_accuracy": 0.7449874877929688, "num_tokens": 75088342.0, "step": 1677, "train/ce_loss": 0.7358986139297485 }, { "epoch": 0.33161953727506427, "step": 1677, "train/sim_loss": 0.00014281272888183594 }, { "epoch": 0.33161953727506427, "step": 1677, "train/total_loss": 0.07373267412185669 }, { "entropy": 6.2817583084106445, "epoch": 0.3318172829740953, "mean_token_accuracy": 0.7383939623832703, "num_tokens": 75122244.0, "step": 1678, "train/ce_loss": 1.3641729354858398 }, { "epoch": 0.3318172829740953, "step": 1678, "train/sim_loss": 0.00029206275939941406 }, { "epoch": 0.3318172829740953, "step": 1678, "train/total_loss": 0.13670936226844788 }, { "entropy": 5.761523723602295, "epoch": 0.3320150286731264, "mean_token_accuracy": 0.7603383660316467, "num_tokens": 75169075.0, "step": 1679, "train/ce_loss": 0.8304466009140015 }, { "epoch": 0.3320150286731264, "step": 1679, "train/sim_loss": 0.00029265880584716797 }, { "epoch": 0.3320150286731264, "step": 1679, "train/total_loss": 0.08333732187747955 }, { "epoch": 0.3322127743721574, "grad_norm": 0.34152713418006897, "learning_rate": 9.174497972104065e-06, "loss": 0.0774, "step": 1680 }, { "entropy": 6.071650981903076, "epoch": 0.3322127743721574, "mean_token_accuracy": 0.7423934936523438, "num_tokens": 75221515.0, "step": 1680, "train/ce_loss": 2.037940740585327 }, { "epoch": 0.3322127743721574, "step": 1680, "train/sim_loss": 0.00017005205154418945 }, { "epoch": 0.3322127743721574, "step": 1680, "train/total_loss": 0.20396412909030914 }, { "entropy": 6.135915756225586, "epoch": 0.33241052007118843, "mean_token_accuracy": 0.7422053217887878, "num_tokens": 75271836.0, "step": 1681, "train/ce_loss": 0.7717385292053223 }, { "epoch": 0.33241052007118843, "step": 1681, "train/sim_loss": 0.0001469254493713379 }, { "epoch": 0.33241052007118843, "step": 1681, "train/total_loss": 0.07732077687978745 }, { "entropy": 5.963191986083984, "epoch": 0.3326082657702195, "mean_token_accuracy": 0.7491329312324524, "num_tokens": 75311470.0, "step": 1682, "train/ce_loss": 0.8551445007324219 }, { "epoch": 0.3326082657702195, "step": 1682, "train/sim_loss": 0.0002454519271850586 }, { "epoch": 0.3326082657702195, "step": 1682, "train/total_loss": 0.08575990051031113 }, { "entropy": 6.0057830810546875, "epoch": 0.33280601146925054, "mean_token_accuracy": 0.758746325969696, "num_tokens": 75355138.0, "step": 1683, "train/ce_loss": 1.524310209788382e-05 }, { "epoch": 0.33280601146925054, "step": 1683, "train/sim_loss": 0.00013506412506103516 }, { "epoch": 0.33280601146925054, "step": 1683, "train/total_loss": 0.00013658843818120658 }, { "entropy": 6.133098125457764, "epoch": 0.33300375716828157, "mean_token_accuracy": 0.7718213200569153, "num_tokens": 75395273.0, "step": 1684, "train/ce_loss": 0.9677779674530029 }, { "epoch": 0.33300375716828157, "step": 1684, "train/sim_loss": 0.00023984909057617188 }, { "epoch": 0.33300375716828157, "step": 1684, "train/total_loss": 0.09701764583587646 }, { "entropy": 6.188331604003906, "epoch": 0.33320150286731265, "mean_token_accuracy": 0.7801418304443359, "num_tokens": 75433444.0, "step": 1685, "train/ce_loss": 1.4263498783111572 }, { "epoch": 0.33320150286731265, "step": 1685, "train/sim_loss": 0.0003056526184082031 }, { "epoch": 0.33320150286731265, "step": 1685, "train/total_loss": 0.14294064044952393 }, { "entropy": 6.30792760848999, "epoch": 0.3333992485663437, "mean_token_accuracy": 0.7049180269241333, "num_tokens": 75477211.0, "step": 1686, "train/ce_loss": 2.4814364910125732 }, { "epoch": 0.3333992485663437, "step": 1686, "train/sim_loss": 0.00017881393432617188 }, { "epoch": 0.3333992485663437, "step": 1686, "train/total_loss": 0.2483224719762802 }, { "entropy": 5.873821258544922, "epoch": 0.3335969942653747, "mean_token_accuracy": 0.7508436441421509, "num_tokens": 75514782.0, "step": 1687, "train/ce_loss": 8.596649422543123e-06 }, { "epoch": 0.3335969942653747, "step": 1687, "train/sim_loss": 0.00025272369384765625 }, { "epoch": 0.3335969942653747, "step": 1687, "train/total_loss": 0.00025358336279168725 }, { "entropy": 6.335683345794678, "epoch": 0.3337947399644058, "mean_token_accuracy": 0.771163284778595, "num_tokens": 75559344.0, "step": 1688, "train/ce_loss": 0.5565885305404663 }, { "epoch": 0.3337947399644058, "step": 1688, "train/sim_loss": 0.0001811981201171875 }, { "epoch": 0.3337947399644058, "step": 1688, "train/total_loss": 0.05584005266427994 }, { "entropy": 6.199748992919922, "epoch": 0.3339924856634368, "mean_token_accuracy": 0.7737407088279724, "num_tokens": 75588257.0, "step": 1689, "train/ce_loss": 1.2464968676795252e-05 }, { "epoch": 0.3339924856634368, "step": 1689, "train/sim_loss": 0.00024640560150146484 }, { "epoch": 0.3339924856634368, "step": 1689, "train/total_loss": 0.0002476520894560963 }, { "entropy": 6.328556060791016, "epoch": 0.3341902313624679, "mean_token_accuracy": 0.7670713067054749, "num_tokens": 75637398.0, "step": 1690, "train/ce_loss": 0.7003502249717712 }, { "epoch": 0.3341902313624679, "step": 1690, "train/sim_loss": 0.0001512765884399414 }, { "epoch": 0.3341902313624679, "step": 1690, "train/total_loss": 0.0701863020658493 }, { "entropy": 6.199260711669922, "epoch": 0.3343879770614989, "mean_token_accuracy": 0.7588545680046082, "num_tokens": 75690512.0, "step": 1691, "train/ce_loss": 0.5478453636169434 }, { "epoch": 0.3343879770614989, "step": 1691, "train/sim_loss": 0.00023448467254638672 }, { "epoch": 0.3343879770614989, "step": 1691, "train/total_loss": 0.05501902103424072 }, { "entropy": 5.880649089813232, "epoch": 0.33458572276052995, "mean_token_accuracy": 0.8216307759284973, "num_tokens": 75725275.0, "step": 1692, "train/ce_loss": 0.8755099773406982 }, { "epoch": 0.33458572276052995, "step": 1692, "train/sim_loss": 0.0002079606056213379 }, { "epoch": 0.33458572276052995, "step": 1692, "train/total_loss": 0.08775895833969116 }, { "entropy": 6.328742027282715, "epoch": 0.33478346845956103, "mean_token_accuracy": 0.7158671617507935, "num_tokens": 75765686.0, "step": 1693, "train/ce_loss": 1.6588307619094849 }, { "epoch": 0.33478346845956103, "step": 1693, "train/sim_loss": 0.0002897977828979492 }, { "epoch": 0.33478346845956103, "step": 1693, "train/total_loss": 0.16617287695407867 }, { "entropy": 6.410037517547607, "epoch": 0.33498121415859206, "mean_token_accuracy": 0.7382588982582092, "num_tokens": 75824358.0, "step": 1694, "train/ce_loss": 0.7009553909301758 }, { "epoch": 0.33498121415859206, "step": 1694, "train/sim_loss": 0.00028586387634277344 }, { "epoch": 0.33498121415859206, "step": 1694, "train/total_loss": 0.07038140296936035 }, { "entropy": 6.543137550354004, "epoch": 0.3351789598576231, "mean_token_accuracy": 0.7143762111663818, "num_tokens": 75879695.0, "step": 1695, "train/ce_loss": 0.6596512198448181 }, { "epoch": 0.3351789598576231, "step": 1695, "train/sim_loss": 0.00020551681518554688 }, { "epoch": 0.3351789598576231, "step": 1695, "train/total_loss": 0.06617064028978348 }, { "entropy": 5.835155487060547, "epoch": 0.33537670555665416, "mean_token_accuracy": 0.8119205236434937, "num_tokens": 75920808.0, "step": 1696, "train/ce_loss": 1.6971522200037725e-05 }, { "epoch": 0.33537670555665416, "step": 1696, "train/sim_loss": 0.00027424097061157227 }, { "epoch": 0.33537670555665416, "step": 1696, "train/total_loss": 0.00027593813138082623 }, { "entropy": 6.04110050201416, "epoch": 0.3355744512556852, "mean_token_accuracy": 0.7522371411323547, "num_tokens": 75968874.0, "step": 1697, "train/ce_loss": 0.4499416947364807 }, { "epoch": 0.3355744512556852, "step": 1697, "train/sim_loss": 0.00017750263214111328 }, { "epoch": 0.3355744512556852, "step": 1697, "train/total_loss": 0.045171674340963364 }, { "entropy": 6.073241233825684, "epoch": 0.3357721969547162, "mean_token_accuracy": 0.7506250143051147, "num_tokens": 76007837.0, "step": 1698, "train/ce_loss": 1.6487672328948975 }, { "epoch": 0.3357721969547162, "step": 1698, "train/sim_loss": 0.00024962425231933594 }, { "epoch": 0.3357721969547162, "step": 1698, "train/total_loss": 0.16512635350227356 }, { "entropy": 6.078444004058838, "epoch": 0.3359699426537473, "mean_token_accuracy": 0.7737313508987427, "num_tokens": 76065572.0, "step": 1699, "train/ce_loss": 0.4198875427246094 }, { "epoch": 0.3359699426537473, "step": 1699, "train/sim_loss": 0.00016814470291137695 }, { "epoch": 0.3359699426537473, "step": 1699, "train/total_loss": 0.042156901210546494 }, { "epoch": 0.3361676883527783, "grad_norm": 0.38280192017555237, "learning_rate": 9.164605796814721e-06, "loss": 0.077, "step": 1700 }, { "entropy": 6.2137861251831055, "epoch": 0.3361676883527783, "mean_token_accuracy": 0.7495543956756592, "num_tokens": 76093102.0, "step": 1700, "train/ce_loss": 1.2420107850630302e-05 }, { "epoch": 0.3361676883527783, "step": 1700, "train/sim_loss": 0.00026428699493408203 }, { "epoch": 0.3361676883527783, "step": 1700, "train/total_loss": 0.00026552900089882314 }, { "entropy": 6.237618446350098, "epoch": 0.33636543405180935, "mean_token_accuracy": 0.7205691933631897, "num_tokens": 76152160.0, "step": 1701, "train/ce_loss": 1.316186785697937 }, { "epoch": 0.33636543405180935, "step": 1701, "train/sim_loss": 0.0001977682113647461 }, { "epoch": 0.33636543405180935, "step": 1701, "train/total_loss": 0.13181644678115845 }, { "entropy": 6.346428394317627, "epoch": 0.33656317975084044, "mean_token_accuracy": 0.7282535433769226, "num_tokens": 76220052.0, "step": 1702, "train/ce_loss": 1.1516450643539429 }, { "epoch": 0.33656317975084044, "step": 1702, "train/sim_loss": 0.00021463632583618164 }, { "epoch": 0.33656317975084044, "step": 1702, "train/total_loss": 0.11537914723157883 }, { "entropy": 6.383146286010742, "epoch": 0.33676092544987146, "mean_token_accuracy": 0.7565279006958008, "num_tokens": 76268575.0, "step": 1703, "train/ce_loss": 1.3182975053787231 }, { "epoch": 0.33676092544987146, "step": 1703, "train/sim_loss": 0.00018537044525146484 }, { "epoch": 0.33676092544987146, "step": 1703, "train/total_loss": 0.13201512396335602 }, { "entropy": 5.921137809753418, "epoch": 0.3369586711489025, "mean_token_accuracy": 0.7692767381668091, "num_tokens": 76309244.0, "step": 1704, "train/ce_loss": 0.4978574514389038 }, { "epoch": 0.3369586711489025, "step": 1704, "train/sim_loss": 0.0002448558807373047 }, { "epoch": 0.3369586711489025, "step": 1704, "train/total_loss": 0.050030600279569626 }, { "entropy": 6.220261573791504, "epoch": 0.33715641684793357, "mean_token_accuracy": 0.7188841104507446, "num_tokens": 76346240.0, "step": 1705, "train/ce_loss": 0.6414035558700562 }, { "epoch": 0.33715641684793357, "step": 1705, "train/sim_loss": 0.0002554655075073242 }, { "epoch": 0.33715641684793357, "step": 1705, "train/total_loss": 0.06439582258462906 }, { "entropy": 5.854345798492432, "epoch": 0.3373541625469646, "mean_token_accuracy": 0.7549457550048828, "num_tokens": 76392976.0, "step": 1706, "train/ce_loss": 0.7379885911941528 }, { "epoch": 0.3373541625469646, "step": 1706, "train/sim_loss": 0.00023865699768066406 }, { "epoch": 0.3373541625469646, "step": 1706, "train/total_loss": 0.07403751462697983 }, { "entropy": 5.906292915344238, "epoch": 0.3375519082459956, "mean_token_accuracy": 0.747698962688446, "num_tokens": 76427545.0, "step": 1707, "train/ce_loss": 0.8706995844841003 }, { "epoch": 0.3375519082459956, "step": 1707, "train/sim_loss": 0.0002893209457397461 }, { "epoch": 0.3375519082459956, "step": 1707, "train/total_loss": 0.08735927939414978 }, { "entropy": 6.167821407318115, "epoch": 0.3377496539450267, "mean_token_accuracy": 0.7275640964508057, "num_tokens": 76457637.0, "step": 1708, "train/ce_loss": 1.3948078155517578 }, { "epoch": 0.3377496539450267, "step": 1708, "train/sim_loss": 0.00020772218704223633 }, { "epoch": 0.3377496539450267, "step": 1708, "train/total_loss": 0.13968850672245026 }, { "entropy": 5.932746887207031, "epoch": 0.33794739964405773, "mean_token_accuracy": 0.7784313559532166, "num_tokens": 76499554.0, "step": 1709, "train/ce_loss": 0.3955308198928833 }, { "epoch": 0.33794739964405773, "step": 1709, "train/sim_loss": 0.00015306472778320312 }, { "epoch": 0.33794739964405773, "step": 1709, "train/total_loss": 0.03970614820718765 }, { "entropy": 6.086979389190674, "epoch": 0.3381451453430888, "mean_token_accuracy": 0.7652081251144409, "num_tokens": 76543988.0, "step": 1710, "train/ce_loss": 0.2910439074039459 }, { "epoch": 0.3381451453430888, "step": 1710, "train/sim_loss": 0.00020688772201538086 }, { "epoch": 0.3381451453430888, "step": 1710, "train/total_loss": 0.029311278834939003 }, { "entropy": 6.359226703643799, "epoch": 0.33834289104211984, "mean_token_accuracy": 0.741216242313385, "num_tokens": 76602492.0, "step": 1711, "train/ce_loss": 1.150869607925415 }, { "epoch": 0.33834289104211984, "step": 1711, "train/sim_loss": 0.00016236305236816406 }, { "epoch": 0.33834289104211984, "step": 1711, "train/total_loss": 0.11524932831525803 }, { "entropy": 6.23283576965332, "epoch": 0.33854063674115087, "mean_token_accuracy": 0.7716480493545532, "num_tokens": 76644337.0, "step": 1712, "train/ce_loss": 0.9162403345108032 }, { "epoch": 0.33854063674115087, "step": 1712, "train/sim_loss": 0.00021201372146606445 }, { "epoch": 0.33854063674115087, "step": 1712, "train/total_loss": 0.09183605015277863 }, { "entropy": 6.319002628326416, "epoch": 0.33873838244018195, "mean_token_accuracy": 0.7455253005027771, "num_tokens": 76707420.0, "step": 1713, "train/ce_loss": 0.702610969543457 }, { "epoch": 0.33873838244018195, "step": 1713, "train/sim_loss": 0.00019252300262451172 }, { "epoch": 0.33873838244018195, "step": 1713, "train/total_loss": 0.07045362144708633 }, { "entropy": 5.9130330085754395, "epoch": 0.338936128139213, "mean_token_accuracy": 0.7647457718849182, "num_tokens": 76763492.0, "step": 1714, "train/ce_loss": 0.5083461999893188 }, { "epoch": 0.338936128139213, "step": 1714, "train/sim_loss": 0.00019061565399169922 }, { "epoch": 0.338936128139213, "step": 1714, "train/total_loss": 0.05102523788809776 }, { "entropy": 5.811820983886719, "epoch": 0.339133873838244, "mean_token_accuracy": 0.7398764491081238, "num_tokens": 76798881.0, "step": 1715, "train/ce_loss": 0.49513494968414307 }, { "epoch": 0.339133873838244, "step": 1715, "train/sim_loss": 0.00014829635620117188 }, { "epoch": 0.339133873838244, "step": 1715, "train/total_loss": 0.0496617928147316 }, { "entropy": 6.334724426269531, "epoch": 0.3393316195372751, "mean_token_accuracy": 0.6995486617088318, "num_tokens": 76864678.0, "step": 1716, "train/ce_loss": 0.8611987233161926 }, { "epoch": 0.3393316195372751, "step": 1716, "train/sim_loss": 0.00028330087661743164 }, { "epoch": 0.3393316195372751, "step": 1716, "train/total_loss": 0.08640317618846893 }, { "entropy": 5.87380313873291, "epoch": 0.3395293652363061, "mean_token_accuracy": 0.7204486727714539, "num_tokens": 76903748.0, "step": 1717, "train/ce_loss": 1.7040132284164429 }, { "epoch": 0.3395293652363061, "step": 1717, "train/sim_loss": 0.00027692317962646484 }, { "epoch": 0.3395293652363061, "step": 1717, "train/total_loss": 0.1706782430410385 }, { "entropy": 5.798242568969727, "epoch": 0.33972711093533714, "mean_token_accuracy": 0.7629846334457397, "num_tokens": 76931831.0, "step": 1718, "train/ce_loss": 1.0434828996658325 }, { "epoch": 0.33972711093533714, "step": 1718, "train/sim_loss": 0.0002956390380859375 }, { "epoch": 0.33972711093533714, "step": 1718, "train/total_loss": 0.10464393347501755 }, { "entropy": 6.271583080291748, "epoch": 0.3399248566343682, "mean_token_accuracy": 0.7412364482879639, "num_tokens": 76977418.0, "step": 1719, "train/ce_loss": 0.5859628915786743 }, { "epoch": 0.3399248566343682, "step": 1719, "train/sim_loss": 0.0002416372299194336 }, { "epoch": 0.3399248566343682, "step": 1719, "train/total_loss": 0.058837927877902985 }, { "epoch": 0.34012260233339925, "grad_norm": 0.43278372287750244, "learning_rate": 9.154713621525375e-06, "loss": 0.0812, "step": 1720 }, { "entropy": 6.235049247741699, "epoch": 0.34012260233339925, "mean_token_accuracy": 0.7360544204711914, "num_tokens": 77008485.0, "step": 1720, "train/ce_loss": 1.1726974248886108 }, { "epoch": 0.34012260233339925, "step": 1720, "train/sim_loss": 0.0003103017807006836 }, { "epoch": 0.34012260233339925, "step": 1720, "train/total_loss": 0.11758004873991013 }, { "entropy": 6.311173439025879, "epoch": 0.3403203480324303, "mean_token_accuracy": 0.7116898894309998, "num_tokens": 77052870.0, "step": 1721, "train/ce_loss": 0.8047730922698975 }, { "epoch": 0.3403203480324303, "step": 1721, "train/sim_loss": 0.0001601576805114746 }, { "epoch": 0.3403203480324303, "step": 1721, "train/total_loss": 0.08063746988773346 }, { "entropy": 6.241524696350098, "epoch": 0.34051809373146136, "mean_token_accuracy": 0.7298474907875061, "num_tokens": 77101662.0, "step": 1722, "train/ce_loss": 1.270914077758789 }, { "epoch": 0.34051809373146136, "step": 1722, "train/sim_loss": 0.0001672506332397461 }, { "epoch": 0.34051809373146136, "step": 1722, "train/total_loss": 0.12725865840911865 }, { "entropy": 6.294099807739258, "epoch": 0.3407158394304924, "mean_token_accuracy": 0.7465020418167114, "num_tokens": 77155201.0, "step": 1723, "train/ce_loss": 0.6727654337882996 }, { "epoch": 0.3407158394304924, "step": 1723, "train/sim_loss": 0.00029075145721435547 }, { "epoch": 0.3407158394304924, "step": 1723, "train/total_loss": 0.06756729632616043 }, { "entropy": 6.296210765838623, "epoch": 0.3409135851295234, "mean_token_accuracy": 0.7506702542304993, "num_tokens": 77204847.0, "step": 1724, "train/ce_loss": 0.7087648510932922 }, { "epoch": 0.3409135851295234, "step": 1724, "train/sim_loss": 0.00020307302474975586 }, { "epoch": 0.3409135851295234, "step": 1724, "train/total_loss": 0.0710795596241951 }, { "entropy": 6.185664176940918, "epoch": 0.3411113308285545, "mean_token_accuracy": 0.7356521487236023, "num_tokens": 77250990.0, "step": 1725, "train/ce_loss": 1.0004631280899048 }, { "epoch": 0.3411113308285545, "step": 1725, "train/sim_loss": 0.00030052661895751953 }, { "epoch": 0.3411113308285545, "step": 1725, "train/total_loss": 0.10034684091806412 }, { "entropy": 6.245695114135742, "epoch": 0.3413090765275855, "mean_token_accuracy": 0.7048675417900085, "num_tokens": 77294137.0, "step": 1726, "train/ce_loss": 1.1982176303863525 }, { "epoch": 0.3413090765275855, "step": 1726, "train/sim_loss": 0.00019532442092895508 }, { "epoch": 0.3413090765275855, "step": 1726, "train/total_loss": 0.12001708894968033 }, { "entropy": 5.763768672943115, "epoch": 0.34150682222661655, "mean_token_accuracy": 0.7775496244430542, "num_tokens": 77329455.0, "step": 1727, "train/ce_loss": 1.1198452711105347 }, { "epoch": 0.34150682222661655, "step": 1727, "train/sim_loss": 0.0002040266990661621 }, { "epoch": 0.34150682222661655, "step": 1727, "train/total_loss": 0.11218855530023575 }, { "entropy": 6.201149940490723, "epoch": 0.34170456792564763, "mean_token_accuracy": 0.7629724740982056, "num_tokens": 77372148.0, "step": 1728, "train/ce_loss": 0.5911086201667786 }, { "epoch": 0.34170456792564763, "step": 1728, "train/sim_loss": 0.00013434886932373047 }, { "epoch": 0.34170456792564763, "step": 1728, "train/total_loss": 0.05924521014094353 }, { "entropy": 5.915065765380859, "epoch": 0.34190231362467866, "mean_token_accuracy": 0.7381818294525146, "num_tokens": 77426785.0, "step": 1729, "train/ce_loss": 0.797240674495697 }, { "epoch": 0.34190231362467866, "step": 1729, "train/sim_loss": 0.0001556873321533203 }, { "epoch": 0.34190231362467866, "step": 1729, "train/total_loss": 0.0798797532916069 }, { "entropy": 6.1647233963012695, "epoch": 0.3421000593237097, "mean_token_accuracy": 0.749356210231781, "num_tokens": 77472547.0, "step": 1730, "train/ce_loss": 1.715311050415039 }, { "epoch": 0.3421000593237097, "step": 1730, "train/sim_loss": 0.00018918514251708984 }, { "epoch": 0.3421000593237097, "step": 1730, "train/total_loss": 0.17172029614448547 }, { "entropy": 6.0263261795043945, "epoch": 0.34229780502274076, "mean_token_accuracy": 0.7155050039291382, "num_tokens": 77513400.0, "step": 1731, "train/ce_loss": 0.9808324575424194 }, { "epoch": 0.34229780502274076, "step": 1731, "train/sim_loss": 0.00018829107284545898 }, { "epoch": 0.34229780502274076, "step": 1731, "train/total_loss": 0.09827154129743576 }, { "entropy": 6.274771690368652, "epoch": 0.3424955507217718, "mean_token_accuracy": 0.7378716468811035, "num_tokens": 77568483.0, "step": 1732, "train/ce_loss": 0.668099582195282 }, { "epoch": 0.3424955507217718, "step": 1732, "train/sim_loss": 0.00019156932830810547 }, { "epoch": 0.3424955507217718, "step": 1732, "train/total_loss": 0.06700152903795242 }, { "entropy": 5.991031646728516, "epoch": 0.3426932964208029, "mean_token_accuracy": 0.7736982703208923, "num_tokens": 77607308.0, "step": 1733, "train/ce_loss": 0.6657684445381165 }, { "epoch": 0.3426932964208029, "step": 1733, "train/sim_loss": 0.000247955322265625 }, { "epoch": 0.3426932964208029, "step": 1733, "train/total_loss": 0.06682480126619339 }, { "entropy": 6.345213413238525, "epoch": 0.3428910421198339, "mean_token_accuracy": 0.7215073704719543, "num_tokens": 77670859.0, "step": 1734, "train/ce_loss": 2.3643016815185547 }, { "epoch": 0.3428910421198339, "step": 1734, "train/sim_loss": 0.00023496150970458984 }, { "epoch": 0.3428910421198339, "step": 1734, "train/total_loss": 0.23666512966156006 }, { "entropy": 6.0105671882629395, "epoch": 0.3430887878188649, "mean_token_accuracy": 0.757885754108429, "num_tokens": 77724077.0, "step": 1735, "train/ce_loss": 0.5529216527938843 }, { "epoch": 0.3430887878188649, "step": 1735, "train/sim_loss": 0.00019788742065429688 }, { "epoch": 0.3430887878188649, "step": 1735, "train/total_loss": 0.055490054190158844 }, { "entropy": 5.985905647277832, "epoch": 0.343286533517896, "mean_token_accuracy": 0.7306622266769409, "num_tokens": 77767811.0, "step": 1736, "train/ce_loss": 0.5630609393119812 }, { "epoch": 0.343286533517896, "step": 1736, "train/sim_loss": 0.00020867586135864258 }, { "epoch": 0.343286533517896, "step": 1736, "train/total_loss": 0.05651476979255676 }, { "entropy": 5.892918586730957, "epoch": 0.34348427921692704, "mean_token_accuracy": 0.7505845427513123, "num_tokens": 77805434.0, "step": 1737, "train/ce_loss": 1.1694572094711475e-05 }, { "epoch": 0.34348427921692704, "step": 1737, "train/sim_loss": 0.0001742839813232422 }, { "epoch": 0.34348427921692704, "step": 1737, "train/total_loss": 0.00017545343143865466 }, { "entropy": 6.242980003356934, "epoch": 0.34368202491595806, "mean_token_accuracy": 0.7441860437393188, "num_tokens": 77852484.0, "step": 1738, "train/ce_loss": 0.5579925775527954 }, { "epoch": 0.34368202491595806, "step": 1738, "train/sim_loss": 0.00016224384307861328 }, { "epoch": 0.34368202491595806, "step": 1738, "train/total_loss": 0.055961500853300095 }, { "entropy": 6.182841777801514, "epoch": 0.34387977061498914, "mean_token_accuracy": 0.7262135744094849, "num_tokens": 77907424.0, "step": 1739, "train/ce_loss": 0.8554466366767883 }, { "epoch": 0.34387977061498914, "step": 1739, "train/sim_loss": 0.0002218484878540039 }, { "epoch": 0.34387977061498914, "step": 1739, "train/total_loss": 0.0857665166258812 }, { "epoch": 0.34407751631402017, "grad_norm": 0.557971715927124, "learning_rate": 9.144821446236028e-06, "loss": 0.082, "step": 1740 }, { "entropy": 6.374668121337891, "epoch": 0.34407751631402017, "mean_token_accuracy": 0.6971098184585571, "num_tokens": 77955332.0, "step": 1740, "train/ce_loss": 0.9146985411643982 }, { "epoch": 0.34407751631402017, "step": 1740, "train/sim_loss": 0.00017583370208740234 }, { "epoch": 0.34407751631402017, "step": 1740, "train/total_loss": 0.09164568781852722 }, { "entropy": 5.856602191925049, "epoch": 0.3442752620130512, "mean_token_accuracy": 0.760103166103363, "num_tokens": 77996215.0, "step": 1741, "train/ce_loss": 0.9756723046302795 }, { "epoch": 0.3442752620130512, "step": 1741, "train/sim_loss": 0.00019550323486328125 }, { "epoch": 0.3442752620130512, "step": 1741, "train/total_loss": 0.09776273369789124 }, { "entropy": 6.228270530700684, "epoch": 0.3444730077120823, "mean_token_accuracy": 0.7140804529190063, "num_tokens": 78029236.0, "step": 1742, "train/ce_loss": 1.421807885169983 }, { "epoch": 0.3444730077120823, "step": 1742, "train/sim_loss": 0.0002644062042236328 }, { "epoch": 0.3444730077120823, "step": 1742, "train/total_loss": 0.14244519174098969 }, { "entropy": 5.838635444641113, "epoch": 0.3446707534111133, "mean_token_accuracy": 0.7591331005096436, "num_tokens": 78097094.0, "step": 1743, "train/ce_loss": 1.1457115411758423 }, { "epoch": 0.3446707534111133, "step": 1743, "train/sim_loss": 0.00014352798461914062 }, { "epoch": 0.3446707534111133, "step": 1743, "train/total_loss": 0.11471468210220337 }, { "entropy": 6.312458038330078, "epoch": 0.34486849911014433, "mean_token_accuracy": 0.6959414482116699, "num_tokens": 78147890.0, "step": 1744, "train/ce_loss": 0.9086865186691284 }, { "epoch": 0.34486849911014433, "step": 1744, "train/sim_loss": 0.0004178285598754883 }, { "epoch": 0.34486849911014433, "step": 1744, "train/total_loss": 0.09128648042678833 }, { "entropy": 6.262222766876221, "epoch": 0.3450662448091754, "mean_token_accuracy": 0.7420520186424255, "num_tokens": 78202328.0, "step": 1745, "train/ce_loss": 1.1169111728668213 }, { "epoch": 0.3450662448091754, "step": 1745, "train/sim_loss": 0.00020015239715576172 }, { "epoch": 0.3450662448091754, "step": 1745, "train/total_loss": 0.11189126968383789 }, { "entropy": 5.651692867279053, "epoch": 0.34526399050820644, "mean_token_accuracy": 0.7682737112045288, "num_tokens": 78232205.0, "step": 1746, "train/ce_loss": 0.7634576559066772 }, { "epoch": 0.34526399050820644, "step": 1746, "train/sim_loss": 0.00031185150146484375 }, { "epoch": 0.34526399050820644, "step": 1746, "train/total_loss": 0.07665761560201645 }, { "entropy": 6.08376407623291, "epoch": 0.34546173620723747, "mean_token_accuracy": 0.7337164878845215, "num_tokens": 78293055.0, "step": 1747, "train/ce_loss": 0.6026018261909485 }, { "epoch": 0.34546173620723747, "step": 1747, "train/sim_loss": 0.00014901161193847656 }, { "epoch": 0.34546173620723747, "step": 1747, "train/total_loss": 0.060409195721149445 }, { "entropy": 5.835596561431885, "epoch": 0.34565948190626855, "mean_token_accuracy": 0.76417076587677, "num_tokens": 78342410.0, "step": 1748, "train/ce_loss": 0.5755059719085693 }, { "epoch": 0.34565948190626855, "step": 1748, "train/sim_loss": 0.00015026330947875977 }, { "epoch": 0.34565948190626855, "step": 1748, "train/total_loss": 0.05770086124539375 }, { "entropy": 6.042150020599365, "epoch": 0.3458572276052996, "mean_token_accuracy": 0.7770320773124695, "num_tokens": 78402110.0, "step": 1749, "train/ce_loss": 0.6937757134437561 }, { "epoch": 0.3458572276052996, "step": 1749, "train/sim_loss": 0.00028955936431884766 }, { "epoch": 0.3458572276052996, "step": 1749, "train/total_loss": 0.06966713070869446 }, { "entropy": 6.174585819244385, "epoch": 0.3460549733043306, "mean_token_accuracy": 0.7173431515693665, "num_tokens": 78449426.0, "step": 1750, "train/ce_loss": 0.8439496159553528 }, { "epoch": 0.3460549733043306, "step": 1750, "train/sim_loss": 0.00024968385696411133 }, { "epoch": 0.3460549733043306, "step": 1750, "train/total_loss": 0.08464464545249939 }, { "entropy": 6.3498125076293945, "epoch": 0.3462527190033617, "mean_token_accuracy": 0.7368055582046509, "num_tokens": 78511492.0, "step": 1751, "train/ce_loss": 0.6445308923721313 }, { "epoch": 0.3462527190033617, "step": 1751, "train/sim_loss": 0.0001659393310546875 }, { "epoch": 0.3462527190033617, "step": 1751, "train/total_loss": 0.0646190270781517 }, { "entropy": 5.964466094970703, "epoch": 0.3464504647023927, "mean_token_accuracy": 0.715452253818512, "num_tokens": 78560036.0, "step": 1752, "train/ce_loss": 1.464551568031311 }, { "epoch": 0.3464504647023927, "step": 1752, "train/sim_loss": 0.00020939111709594727 }, { "epoch": 0.3464504647023927, "step": 1752, "train/total_loss": 0.1466645449399948 }, { "entropy": 5.977078914642334, "epoch": 0.3466482104014238, "mean_token_accuracy": 0.7019657492637634, "num_tokens": 78610764.0, "step": 1753, "train/ce_loss": 0.8040879368782043 }, { "epoch": 0.3466482104014238, "step": 1753, "train/sim_loss": 0.00021350383758544922 }, { "epoch": 0.3466482104014238, "step": 1753, "train/total_loss": 0.08062230050563812 }, { "entropy": 6.131739616394043, "epoch": 0.3468459561004548, "mean_token_accuracy": 0.7060086131095886, "num_tokens": 78675212.0, "step": 1754, "train/ce_loss": 1.035931944847107 }, { "epoch": 0.3468459561004548, "step": 1754, "train/sim_loss": 0.00014460086822509766 }, { "epoch": 0.3468459561004548, "step": 1754, "train/total_loss": 0.10373779386281967 }, { "entropy": 5.8209733963012695, "epoch": 0.34704370179948585, "mean_token_accuracy": 0.7647432088851929, "num_tokens": 78711729.0, "step": 1755, "train/ce_loss": 1.1156770597153809e-05 }, { "epoch": 0.34704370179948585, "step": 1755, "train/sim_loss": 0.0001926422119140625 }, { "epoch": 0.34704370179948585, "step": 1755, "train/total_loss": 0.00019375789270270616 }, { "entropy": 6.189640998840332, "epoch": 0.34724144749851693, "mean_token_accuracy": 0.7059145569801331, "num_tokens": 78770982.0, "step": 1756, "train/ce_loss": 0.8762401342391968 }, { "epoch": 0.34724144749851693, "step": 1756, "train/sim_loss": 0.0002300739288330078 }, { "epoch": 0.34724144749851693, "step": 1756, "train/total_loss": 0.08785408735275269 }, { "entropy": 5.977341651916504, "epoch": 0.34743919319754796, "mean_token_accuracy": 0.7450980544090271, "num_tokens": 78825717.0, "step": 1757, "train/ce_loss": 0.7477531433105469 }, { "epoch": 0.34743919319754796, "step": 1757, "train/sim_loss": 0.00016808509826660156 }, { "epoch": 0.34743919319754796, "step": 1757, "train/total_loss": 0.07494340091943741 }, { "entropy": 6.0181474685668945, "epoch": 0.347636938896579, "mean_token_accuracy": 0.7187708616256714, "num_tokens": 78869331.0, "step": 1758, "train/ce_loss": 1.1067860126495361 }, { "epoch": 0.347636938896579, "step": 1758, "train/sim_loss": 0.0001875162124633789 }, { "epoch": 0.347636938896579, "step": 1758, "train/total_loss": 0.11086612194776535 }, { "entropy": 5.990894317626953, "epoch": 0.34783468459561007, "mean_token_accuracy": 0.740867555141449, "num_tokens": 78919195.0, "step": 1759, "train/ce_loss": 0.6151270866394043 }, { "epoch": 0.34783468459561007, "step": 1759, "train/sim_loss": 0.00014984607696533203 }, { "epoch": 0.34783468459561007, "step": 1759, "train/total_loss": 0.06166255474090576 }, { "epoch": 0.3480324302946411, "grad_norm": 0.399882048368454, "learning_rate": 9.134929270946682e-06, "loss": 0.0868, "step": 1760 }, { "entropy": 6.073004722595215, "epoch": 0.3480324302946411, "mean_token_accuracy": 0.7451851963996887, "num_tokens": 78967235.0, "step": 1760, "train/ce_loss": 1.288806652155472e-05 }, { "epoch": 0.3480324302946411, "step": 1760, "train/sim_loss": 0.00017875432968139648 }, { "epoch": 0.3480324302946411, "step": 1760, "train/total_loss": 0.00018004313460551202 }, { "entropy": 6.086967468261719, "epoch": 0.3482301759936721, "mean_token_accuracy": 0.7350800633430481, "num_tokens": 79014487.0, "step": 1761, "train/ce_loss": 1.747042179107666 }, { "epoch": 0.3482301759936721, "step": 1761, "train/sim_loss": 0.00019741058349609375 }, { "epoch": 0.3482301759936721, "step": 1761, "train/total_loss": 0.17490163445472717 }, { "entropy": 5.940788269042969, "epoch": 0.3484279216927032, "mean_token_accuracy": 0.7224755883216858, "num_tokens": 79066979.0, "step": 1762, "train/ce_loss": 0.8074043989181519 }, { "epoch": 0.3484279216927032, "step": 1762, "train/sim_loss": 0.0001507401466369629 }, { "epoch": 0.3484279216927032, "step": 1762, "train/total_loss": 0.0808911845088005 }, { "entropy": 6.116911888122559, "epoch": 0.34862566739173423, "mean_token_accuracy": 0.7553101181983948, "num_tokens": 79104190.0, "step": 1763, "train/ce_loss": 0.6544486880302429 }, { "epoch": 0.34862566739173423, "step": 1763, "train/sim_loss": 0.00012743473052978516 }, { "epoch": 0.34862566739173423, "step": 1763, "train/total_loss": 0.06557230651378632 }, { "entropy": 6.456245422363281, "epoch": 0.34882341309076526, "mean_token_accuracy": 0.7202889323234558, "num_tokens": 79161755.0, "step": 1764, "train/ce_loss": 0.973090648651123 }, { "epoch": 0.34882341309076526, "step": 1764, "train/sim_loss": 0.00022113323211669922 }, { "epoch": 0.34882341309076526, "step": 1764, "train/total_loss": 0.09753020107746124 }, { "entropy": 5.925066947937012, "epoch": 0.34902115878979634, "mean_token_accuracy": 0.762968897819519, "num_tokens": 79220846.0, "step": 1765, "train/ce_loss": 0.6138321757316589 }, { "epoch": 0.34902115878979634, "step": 1765, "train/sim_loss": 0.0001367330551147461 }, { "epoch": 0.34902115878979634, "step": 1765, "train/total_loss": 0.06151995062828064 }, { "entropy": 5.986207485198975, "epoch": 0.34921890448882736, "mean_token_accuracy": 0.7516375780105591, "num_tokens": 79265309.0, "step": 1766, "train/ce_loss": 0.5571813583374023 }, { "epoch": 0.34921890448882736, "step": 1766, "train/sim_loss": 0.00016736984252929688 }, { "epoch": 0.34921890448882736, "step": 1766, "train/total_loss": 0.05588550493121147 }, { "entropy": 6.3133134841918945, "epoch": 0.3494166501878584, "mean_token_accuracy": 0.7508440017700195, "num_tokens": 79334202.0, "step": 1767, "train/ce_loss": 0.6085557341575623 }, { "epoch": 0.3494166501878584, "step": 1767, "train/sim_loss": 0.00014919042587280273 }, { "epoch": 0.3494166501878584, "step": 1767, "train/total_loss": 0.06100476533174515 }, { "entropy": 5.973306655883789, "epoch": 0.3496143958868895, "mean_token_accuracy": 0.7950507402420044, "num_tokens": 79367415.0, "step": 1768, "train/ce_loss": 0.4127938151359558 }, { "epoch": 0.3496143958868895, "step": 1768, "train/sim_loss": 0.0002561807632446289 }, { "epoch": 0.3496143958868895, "step": 1768, "train/total_loss": 0.04153556376695633 }, { "entropy": 6.125088691711426, "epoch": 0.3498121415859205, "mean_token_accuracy": 0.739701509475708, "num_tokens": 79413182.0, "step": 1769, "train/ce_loss": 1.1855803728103638 }, { "epoch": 0.3498121415859205, "step": 1769, "train/sim_loss": 0.00022232532501220703 }, { "epoch": 0.3498121415859205, "step": 1769, "train/total_loss": 0.11878036707639694 }, { "entropy": 5.9014129638671875, "epoch": 0.3500098872849515, "mean_token_accuracy": 0.7438187003135681, "num_tokens": 79453199.0, "step": 1770, "train/ce_loss": 1.2984066009521484 }, { "epoch": 0.3500098872849515, "step": 1770, "train/sim_loss": 0.0003647804260253906 }, { "epoch": 0.3500098872849515, "step": 1770, "train/total_loss": 0.130205437541008 }, { "entropy": 6.195622444152832, "epoch": 0.3502076329839826, "mean_token_accuracy": 0.7164705991744995, "num_tokens": 79502615.0, "step": 1771, "train/ce_loss": 0.9158042073249817 }, { "epoch": 0.3502076329839826, "step": 1771, "train/sim_loss": 0.00024127960205078125 }, { "epoch": 0.3502076329839826, "step": 1771, "train/total_loss": 0.09182170033454895 }, { "entropy": 5.538210391998291, "epoch": 0.35040537868301364, "mean_token_accuracy": 0.7541766166687012, "num_tokens": 79557531.0, "step": 1772, "train/ce_loss": 0.4872627854347229 }, { "epoch": 0.35040537868301364, "step": 1772, "train/sim_loss": 0.0001723766326904297 }, { "epoch": 0.35040537868301364, "step": 1772, "train/total_loss": 0.04889865592122078 }, { "entropy": 6.026872634887695, "epoch": 0.3506031243820447, "mean_token_accuracy": 0.7685459852218628, "num_tokens": 79605510.0, "step": 1773, "train/ce_loss": 0.49908193945884705 }, { "epoch": 0.3506031243820447, "step": 1773, "train/sim_loss": 0.00021779537200927734 }, { "epoch": 0.3506031243820447, "step": 1773, "train/total_loss": 0.05012599006295204 }, { "entropy": 6.027985095977783, "epoch": 0.35080087008107574, "mean_token_accuracy": 0.7369175553321838, "num_tokens": 79646237.0, "step": 1774, "train/ce_loss": 1.1940449476242065 }, { "epoch": 0.35080087008107574, "step": 1774, "train/sim_loss": 0.0002541542053222656 }, { "epoch": 0.35080087008107574, "step": 1774, "train/total_loss": 0.11965864896774292 }, { "entropy": 5.974953651428223, "epoch": 0.35099861578010677, "mean_token_accuracy": 0.7234042286872864, "num_tokens": 79706811.0, "step": 1775, "train/ce_loss": 0.9147178530693054 }, { "epoch": 0.35099861578010677, "step": 1775, "train/sim_loss": 0.0002678036689758301 }, { "epoch": 0.35099861578010677, "step": 1775, "train/total_loss": 0.09173958748579025 }, { "entropy": 6.076504707336426, "epoch": 0.35119636147913785, "mean_token_accuracy": 0.7720306515693665, "num_tokens": 79742800.0, "step": 1776, "train/ce_loss": 1.3476043939590454 }, { "epoch": 0.35119636147913785, "step": 1776, "train/sim_loss": 0.00020194053649902344 }, { "epoch": 0.35119636147913785, "step": 1776, "train/total_loss": 0.13496237993240356 }, { "entropy": 5.901402473449707, "epoch": 0.3513941071781689, "mean_token_accuracy": 0.7347174286842346, "num_tokens": 79789443.0, "step": 1777, "train/ce_loss": 0.8784245848655701 }, { "epoch": 0.3513941071781689, "step": 1777, "train/sim_loss": 0.00017714500427246094 }, { "epoch": 0.3513941071781689, "step": 1777, "train/total_loss": 0.08801960200071335 }, { "entropy": 6.303762435913086, "epoch": 0.3515918528771999, "mean_token_accuracy": 0.7362728714942932, "num_tokens": 79842036.0, "step": 1778, "train/ce_loss": 0.8497328758239746 }, { "epoch": 0.3515918528771999, "step": 1778, "train/sim_loss": 0.0002727508544921875 }, { "epoch": 0.3515918528771999, "step": 1778, "train/total_loss": 0.08524604141712189 }, { "entropy": 6.018318176269531, "epoch": 0.351789598576231, "mean_token_accuracy": 0.7512242794036865, "num_tokens": 79878536.0, "step": 1779, "train/ce_loss": 1.1513512134552002 }, { "epoch": 0.351789598576231, "step": 1779, "train/sim_loss": 0.00019073486328125 }, { "epoch": 0.351789598576231, "step": 1779, "train/total_loss": 0.11532586067914963 }, { "epoch": 0.351987344275262, "grad_norm": 0.4959181845188141, "learning_rate": 9.125037095657336e-06, "loss": 0.0814, "step": 1780 }, { "entropy": 6.003194808959961, "epoch": 0.351987344275262, "mean_token_accuracy": 0.7177215218544006, "num_tokens": 79939412.0, "step": 1780, "train/ce_loss": 1.6251469850540161 }, { "epoch": 0.351987344275262, "step": 1780, "train/sim_loss": 0.00021696090698242188 }, { "epoch": 0.351987344275262, "step": 1780, "train/total_loss": 0.16273166239261627 }, { "entropy": 6.050382614135742, "epoch": 0.35218508997429304, "mean_token_accuracy": 0.7178571224212646, "num_tokens": 79984599.0, "step": 1781, "train/ce_loss": 1.3915221691131592 }, { "epoch": 0.35218508997429304, "step": 1781, "train/sim_loss": 0.0003274679183959961 }, { "epoch": 0.35218508997429304, "step": 1781, "train/total_loss": 0.13947968184947968 }, { "entropy": 6.0952067375183105, "epoch": 0.3523828356733241, "mean_token_accuracy": 0.7584884762763977, "num_tokens": 80025026.0, "step": 1782, "train/ce_loss": 0.5810282230377197 }, { "epoch": 0.3523828356733241, "step": 1782, "train/sim_loss": 0.0002524852752685547 }, { "epoch": 0.3523828356733241, "step": 1782, "train/total_loss": 0.05835530906915665 }, { "entropy": 6.060956001281738, "epoch": 0.35258058137235515, "mean_token_accuracy": 0.7444005012512207, "num_tokens": 80059707.0, "step": 1783, "train/ce_loss": 1.0536068677902222 }, { "epoch": 0.35258058137235515, "step": 1783, "train/sim_loss": 0.0002874135971069336 }, { "epoch": 0.35258058137235515, "step": 1783, "train/total_loss": 0.10564810037612915 }, { "entropy": 6.120932102203369, "epoch": 0.3527783270713862, "mean_token_accuracy": 0.7497699856758118, "num_tokens": 80102043.0, "step": 1784, "train/ce_loss": 1.3810806194669567e-05 }, { "epoch": 0.3527783270713862, "step": 1784, "train/sim_loss": 0.00020635128021240234 }, { "epoch": 0.3527783270713862, "step": 1784, "train/total_loss": 0.00020773235883098096 }, { "entropy": 6.213570594787598, "epoch": 0.35297607277041726, "mean_token_accuracy": 0.7548426389694214, "num_tokens": 80142819.0, "step": 1785, "train/ce_loss": 0.6616263389587402 }, { "epoch": 0.35297607277041726, "step": 1785, "train/sim_loss": 0.0001373887062072754 }, { "epoch": 0.35297607277041726, "step": 1785, "train/total_loss": 0.06630002707242966 }, { "entropy": 5.900242328643799, "epoch": 0.3531738184694483, "mean_token_accuracy": 0.7589703798294067, "num_tokens": 80179607.0, "step": 1786, "train/ce_loss": 0.9848667979240417 }, { "epoch": 0.3531738184694483, "step": 1786, "train/sim_loss": 0.00027763843536376953 }, { "epoch": 0.3531738184694483, "step": 1786, "train/total_loss": 0.0987643226981163 }, { "entropy": 6.16032075881958, "epoch": 0.3533715641684793, "mean_token_accuracy": 0.7382234334945679, "num_tokens": 80236002.0, "step": 1787, "train/ce_loss": 1.3326481580734253 }, { "epoch": 0.3533715641684793, "step": 1787, "train/sim_loss": 0.00045412778854370117 }, { "epoch": 0.3533715641684793, "step": 1787, "train/total_loss": 0.13371895253658295 }, { "entropy": 5.849244594573975, "epoch": 0.3535693098675104, "mean_token_accuracy": 0.8115702271461487, "num_tokens": 80276354.0, "step": 1788, "train/ce_loss": 1.2821327447891235 }, { "epoch": 0.3535693098675104, "step": 1788, "train/sim_loss": 0.0001512765884399414 }, { "epoch": 0.3535693098675104, "step": 1788, "train/total_loss": 0.12836454808712006 }, { "entropy": 6.437813758850098, "epoch": 0.3537670555665414, "mean_token_accuracy": 0.7139334082603455, "num_tokens": 80328643.0, "step": 1789, "train/ce_loss": 0.9272111654281616 }, { "epoch": 0.3537670555665414, "step": 1789, "train/sim_loss": 0.00016486644744873047 }, { "epoch": 0.3537670555665414, "step": 1789, "train/total_loss": 0.09288598597049713 }, { "entropy": 5.733613967895508, "epoch": 0.35396480126557245, "mean_token_accuracy": 0.7924888730049133, "num_tokens": 80367904.0, "step": 1790, "train/ce_loss": 0.45803555846214294 }, { "epoch": 0.35396480126557245, "step": 1790, "train/sim_loss": 0.0002396106719970703 }, { "epoch": 0.35396480126557245, "step": 1790, "train/total_loss": 0.046043168753385544 }, { "entropy": 6.049384117126465, "epoch": 0.35416254696460353, "mean_token_accuracy": 0.68808913230896, "num_tokens": 80401583.0, "step": 1791, "train/ce_loss": 0.8602838516235352 }, { "epoch": 0.35416254696460353, "step": 1791, "train/sim_loss": 0.00024437904357910156 }, { "epoch": 0.35416254696460353, "step": 1791, "train/total_loss": 0.08627276867628098 }, { "entropy": 5.821619033813477, "epoch": 0.35436029266363456, "mean_token_accuracy": 0.7954133749008179, "num_tokens": 80444306.0, "step": 1792, "train/ce_loss": 0.5160769820213318 }, { "epoch": 0.35436029266363456, "step": 1792, "train/sim_loss": 0.00016188621520996094 }, { "epoch": 0.35436029266363456, "step": 1792, "train/total_loss": 0.05176958441734314 }, { "entropy": 6.063204765319824, "epoch": 0.35455803836266564, "mean_token_accuracy": 0.7680525183677673, "num_tokens": 80499704.0, "step": 1793, "train/ce_loss": 1.3098848285153508e-05 }, { "epoch": 0.35455803836266564, "step": 1793, "train/sim_loss": 0.00016105175018310547 }, { "epoch": 0.35455803836266564, "step": 1793, "train/total_loss": 0.0001623616408323869 }, { "entropy": 6.349546432495117, "epoch": 0.35475578406169667, "mean_token_accuracy": 0.7473958134651184, "num_tokens": 80527994.0, "step": 1794, "train/ce_loss": 1.0305641889572144 }, { "epoch": 0.35475578406169667, "step": 1794, "train/sim_loss": 0.0002021193504333496 }, { "epoch": 0.35475578406169667, "step": 1794, "train/total_loss": 0.10325854271650314 }, { "entropy": 6.369498252868652, "epoch": 0.3549535297607277, "mean_token_accuracy": 0.7216676473617554, "num_tokens": 80579639.0, "step": 1795, "train/ce_loss": 1.1241194009780884 }, { "epoch": 0.3549535297607277, "step": 1795, "train/sim_loss": 0.0001544952392578125 }, { "epoch": 0.3549535297607277, "step": 1795, "train/total_loss": 0.11256643384695053 }, { "entropy": 6.566048622131348, "epoch": 0.3551512754597588, "mean_token_accuracy": 0.7097232341766357, "num_tokens": 80622771.0, "step": 1796, "train/ce_loss": 0.7719599008560181 }, { "epoch": 0.3551512754597588, "step": 1796, "train/sim_loss": 0.00023996829986572266 }, { "epoch": 0.3551512754597588, "step": 1796, "train/total_loss": 0.07743596285581589 }, { "entropy": 6.323087215423584, "epoch": 0.3553490211587898, "mean_token_accuracy": 0.7241647243499756, "num_tokens": 80675233.0, "step": 1797, "train/ce_loss": 1.1191059350967407 }, { "epoch": 0.3553490211587898, "step": 1797, "train/sim_loss": 0.00027811527252197266 }, { "epoch": 0.3553490211587898, "step": 1797, "train/total_loss": 0.11218871176242828 }, { "entropy": 6.518558025360107, "epoch": 0.35554676685782083, "mean_token_accuracy": 0.7302291393280029, "num_tokens": 80734904.0, "step": 1798, "train/ce_loss": 1.1112103462219238 }, { "epoch": 0.35554676685782083, "step": 1798, "train/sim_loss": 0.00015342235565185547 }, { "epoch": 0.35554676685782083, "step": 1798, "train/total_loss": 0.11127445846796036 }, { "entropy": 5.8131256103515625, "epoch": 0.3557445125568519, "mean_token_accuracy": 0.7648136615753174, "num_tokens": 80772515.0, "step": 1799, "train/ce_loss": 1.3206366300582886 }, { "epoch": 0.3557445125568519, "step": 1799, "train/sim_loss": 0.0002715587615966797 }, { "epoch": 0.3557445125568519, "step": 1799, "train/total_loss": 0.13233523070812225 }, { "epoch": 0.35594225825588294, "grad_norm": 0.4200820326805115, "learning_rate": 9.11514492036799e-06, "loss": 0.0807, "step": 1800 }, { "entropy": 5.859002113342285, "epoch": 0.35594225825588294, "mean_token_accuracy": 0.7891825437545776, "num_tokens": 80818473.0, "step": 1800, "train/ce_loss": 0.5021631717681885 }, { "epoch": 0.35594225825588294, "step": 1800, "train/sim_loss": 0.00014930963516235352 }, { "epoch": 0.35594225825588294, "step": 1800, "train/total_loss": 0.0503656268119812 }, { "entropy": 6.591703414916992, "epoch": 0.35614000395491396, "mean_token_accuracy": 0.7069096565246582, "num_tokens": 80876879.0, "step": 1801, "train/ce_loss": 0.8159557580947876 }, { "epoch": 0.35614000395491396, "step": 1801, "train/sim_loss": 0.00022363662719726562 }, { "epoch": 0.35614000395491396, "step": 1801, "train/total_loss": 0.08181921392679214 }, { "entropy": 5.820387840270996, "epoch": 0.35633774965394505, "mean_token_accuracy": 0.7509933710098267, "num_tokens": 80920096.0, "step": 1802, "train/ce_loss": 0.8704583644866943 }, { "epoch": 0.35633774965394505, "step": 1802, "train/sim_loss": 0.00020802021026611328 }, { "epoch": 0.35633774965394505, "step": 1802, "train/total_loss": 0.0872538611292839 }, { "entropy": 6.057682514190674, "epoch": 0.3565354953529761, "mean_token_accuracy": 0.7572815418243408, "num_tokens": 80971889.0, "step": 1803, "train/ce_loss": 0.5231915712356567 }, { "epoch": 0.3565354953529761, "step": 1803, "train/sim_loss": 0.0002429485321044922 }, { "epoch": 0.3565354953529761, "step": 1803, "train/total_loss": 0.052562106400728226 }, { "entropy": 6.176255702972412, "epoch": 0.3567332410520071, "mean_token_accuracy": 0.7248322367668152, "num_tokens": 81017789.0, "step": 1804, "train/ce_loss": 1.0594435930252075 }, { "epoch": 0.3567332410520071, "step": 1804, "train/sim_loss": 0.0001583695411682129 }, { "epoch": 0.3567332410520071, "step": 1804, "train/total_loss": 0.10610272735357285 }, { "entropy": 6.044563293457031, "epoch": 0.3569309867510382, "mean_token_accuracy": 0.7498396635055542, "num_tokens": 81064056.0, "step": 1805, "train/ce_loss": 0.9281615614891052 }, { "epoch": 0.3569309867510382, "step": 1805, "train/sim_loss": 0.0001800060272216797 }, { "epoch": 0.3569309867510382, "step": 1805, "train/total_loss": 0.09299616515636444 }, { "entropy": 6.000401496887207, "epoch": 0.3571287324500692, "mean_token_accuracy": 0.7730220556259155, "num_tokens": 81089235.0, "step": 1806, "train/ce_loss": 0.5928955078125 }, { "epoch": 0.3571287324500692, "step": 1806, "train/sim_loss": 0.0002669095993041992 }, { "epoch": 0.3571287324500692, "step": 1806, "train/total_loss": 0.05955646187067032 }, { "entropy": 6.12520694732666, "epoch": 0.35732647814910024, "mean_token_accuracy": 0.7178065180778503, "num_tokens": 81134757.0, "step": 1807, "train/ce_loss": 1.823119878768921 }, { "epoch": 0.35732647814910024, "step": 1807, "train/sim_loss": 0.0002372264862060547 }, { "epoch": 0.35732647814910024, "step": 1807, "train/total_loss": 0.18254922330379486 }, { "entropy": 5.959286212921143, "epoch": 0.3575242238481313, "mean_token_accuracy": 0.7384615540504456, "num_tokens": 81171378.0, "step": 1808, "train/ce_loss": 0.4494324028491974 }, { "epoch": 0.3575242238481313, "step": 1808, "train/sim_loss": 0.0003319382667541504 }, { "epoch": 0.3575242238481313, "step": 1808, "train/total_loss": 0.04527517780661583 }, { "entropy": 5.630306243896484, "epoch": 0.35772196954716234, "mean_token_accuracy": 0.7854803204536438, "num_tokens": 81190704.0, "step": 1809, "train/ce_loss": 0.5759479999542236 }, { "epoch": 0.35772196954716234, "step": 1809, "train/sim_loss": 0.0001380443572998047 }, { "epoch": 0.35772196954716234, "step": 1809, "train/total_loss": 0.05773284658789635 }, { "entropy": 6.4989542961120605, "epoch": 0.35791971524619337, "mean_token_accuracy": 0.7216775417327881, "num_tokens": 81249840.0, "step": 1810, "train/ce_loss": 0.7515940070152283 }, { "epoch": 0.35791971524619337, "step": 1810, "train/sim_loss": 0.0002327561378479004 }, { "epoch": 0.35791971524619337, "step": 1810, "train/total_loss": 0.07539215683937073 }, { "entropy": 6.6525373458862305, "epoch": 0.35811746094522445, "mean_token_accuracy": 0.7067092657089233, "num_tokens": 81296326.0, "step": 1811, "train/ce_loss": 1.0938971042633057 }, { "epoch": 0.35811746094522445, "step": 1811, "train/sim_loss": 0.00020712614059448242 }, { "epoch": 0.35811746094522445, "step": 1811, "train/total_loss": 0.10959684103727341 }, { "entropy": 6.254176616668701, "epoch": 0.3583152066442555, "mean_token_accuracy": 0.7251184582710266, "num_tokens": 81341604.0, "step": 1812, "train/ce_loss": 0.5500666499137878 }, { "epoch": 0.3583152066442555, "step": 1812, "train/sim_loss": 0.00017893314361572266 }, { "epoch": 0.3583152066442555, "step": 1812, "train/total_loss": 0.05518559738993645 }, { "entropy": 6.0066423416137695, "epoch": 0.3585129523432865, "mean_token_accuracy": 0.725895345211029, "num_tokens": 81380145.0, "step": 1813, "train/ce_loss": 0.5484873652458191 }, { "epoch": 0.3585129523432865, "step": 1813, "train/sim_loss": 0.0001976490020751953 }, { "epoch": 0.3585129523432865, "step": 1813, "train/total_loss": 0.055046387016773224 }, { "entropy": 5.788744926452637, "epoch": 0.3587106980423176, "mean_token_accuracy": 0.7958656549453735, "num_tokens": 81420119.0, "step": 1814, "train/ce_loss": 0.6565447449684143 }, { "epoch": 0.3587106980423176, "step": 1814, "train/sim_loss": 0.0001621246337890625 }, { "epoch": 0.3587106980423176, "step": 1814, "train/total_loss": 0.06581660360097885 }, { "entropy": 6.443427085876465, "epoch": 0.3589084437413486, "mean_token_accuracy": 0.7292358875274658, "num_tokens": 81456175.0, "step": 1815, "train/ce_loss": 1.263802170753479 }, { "epoch": 0.3589084437413486, "step": 1815, "train/sim_loss": 0.0001481175422668457 }, { "epoch": 0.3589084437413486, "step": 1815, "train/total_loss": 0.12652833759784698 }, { "entropy": 6.410154819488525, "epoch": 0.3591061894403797, "mean_token_accuracy": 0.7457107901573181, "num_tokens": 81516172.0, "step": 1816, "train/ce_loss": 8.359592357010115e-06 }, { "epoch": 0.3591061894403797, "step": 1816, "train/sim_loss": 0.00016641616821289062 }, { "epoch": 0.3591061894403797, "step": 1816, "train/total_loss": 0.00016725213208701462 }, { "entropy": 6.333199501037598, "epoch": 0.3593039351394107, "mean_token_accuracy": 0.6834094524383545, "num_tokens": 81557723.0, "step": 1817, "train/ce_loss": 1.2023274898529053 }, { "epoch": 0.3593039351394107, "step": 1817, "train/sim_loss": 0.00026988983154296875 }, { "epoch": 0.3593039351394107, "step": 1817, "train/total_loss": 0.12050264328718185 }, { "entropy": 5.851015567779541, "epoch": 0.35950168083844175, "mean_token_accuracy": 0.7444089651107788, "num_tokens": 81591992.0, "step": 1818, "train/ce_loss": 0.5465871691703796 }, { "epoch": 0.35950168083844175, "step": 1818, "train/sim_loss": 0.00021183490753173828 }, { "epoch": 0.35950168083844175, "step": 1818, "train/total_loss": 0.05487055331468582 }, { "entropy": 6.263816833496094, "epoch": 0.35969942653747283, "mean_token_accuracy": 0.765802264213562, "num_tokens": 81642950.0, "step": 1819, "train/ce_loss": 0.7612847089767456 }, { "epoch": 0.35969942653747283, "step": 1819, "train/sim_loss": 0.00018966197967529297 }, { "epoch": 0.35969942653747283, "step": 1819, "train/total_loss": 0.07631813734769821 }, { "epoch": 0.35989717223650386, "grad_norm": 0.4855090081691742, "learning_rate": 9.105252745078644e-06, "loss": 0.0823, "step": 1820 }, { "entropy": 6.202285289764404, "epoch": 0.35989717223650386, "mean_token_accuracy": 0.7472406029701233, "num_tokens": 81687151.0, "step": 1820, "train/ce_loss": 1.529732584953308 }, { "epoch": 0.35989717223650386, "step": 1820, "train/sim_loss": 0.00017970800399780273 }, { "epoch": 0.35989717223650386, "step": 1820, "train/total_loss": 0.1531529724597931 }, { "entropy": 5.862597465515137, "epoch": 0.3600949179355349, "mean_token_accuracy": 0.7639612555503845, "num_tokens": 81721220.0, "step": 1821, "train/ce_loss": 1.0389450788497925 }, { "epoch": 0.3600949179355349, "step": 1821, "train/sim_loss": 0.00016200542449951172 }, { "epoch": 0.3600949179355349, "step": 1821, "train/total_loss": 0.10405651479959488 }, { "entropy": 6.475543975830078, "epoch": 0.36029266363456597, "mean_token_accuracy": 0.721485435962677, "num_tokens": 81771975.0, "step": 1822, "train/ce_loss": 1.163027627626434e-05 }, { "epoch": 0.36029266363456597, "step": 1822, "train/sim_loss": 0.0001690983772277832 }, { "epoch": 0.36029266363456597, "step": 1822, "train/total_loss": 0.00017026140994857997 }, { "entropy": 6.0276570320129395, "epoch": 0.360490409333597, "mean_token_accuracy": 0.7204782962799072, "num_tokens": 81819449.0, "step": 1823, "train/ce_loss": 1.1519674062728882 }, { "epoch": 0.360490409333597, "step": 1823, "train/sim_loss": 0.0001302957534790039 }, { "epoch": 0.360490409333597, "step": 1823, "train/total_loss": 0.11532703787088394 }, { "entropy": 5.5307698249816895, "epoch": 0.360688155032628, "mean_token_accuracy": 0.7698882222175598, "num_tokens": 81848461.0, "step": 1824, "train/ce_loss": 7.656307388970163e-06 }, { "epoch": 0.360688155032628, "step": 1824, "train/sim_loss": 0.0002701878547668457 }, { "epoch": 0.360688155032628, "step": 1824, "train/total_loss": 0.000270953489234671 }, { "entropy": 6.291306495666504, "epoch": 0.3608859007316591, "mean_token_accuracy": 0.7485294342041016, "num_tokens": 81908185.0, "step": 1825, "train/ce_loss": 1.0478092432022095 }, { "epoch": 0.3608859007316591, "step": 1825, "train/sim_loss": 0.0001252889633178711 }, { "epoch": 0.3608859007316591, "step": 1825, "train/total_loss": 0.10490621626377106 }, { "entropy": 6.136411666870117, "epoch": 0.36108364643069013, "mean_token_accuracy": 0.7530339956283569, "num_tokens": 81960936.0, "step": 1826, "train/ce_loss": 0.7194365859031677 }, { "epoch": 0.36108364643069013, "step": 1826, "train/sim_loss": 0.00011658668518066406 }, { "epoch": 0.36108364643069013, "step": 1826, "train/total_loss": 0.0720602497458458 }, { "entropy": 6.281103134155273, "epoch": 0.36128139212972116, "mean_token_accuracy": 0.7221494317054749, "num_tokens": 82001497.0, "step": 1827, "train/ce_loss": 0.7426247596740723 }, { "epoch": 0.36128139212972116, "step": 1827, "train/sim_loss": 0.0002067089080810547 }, { "epoch": 0.36128139212972116, "step": 1827, "train/total_loss": 0.0744691863656044 }, { "entropy": 5.877315521240234, "epoch": 0.36147913782875224, "mean_token_accuracy": 0.7627507448196411, "num_tokens": 82046770.0, "step": 1828, "train/ce_loss": 0.850477397441864 }, { "epoch": 0.36147913782875224, "step": 1828, "train/sim_loss": 0.00017410516738891602 }, { "epoch": 0.36147913782875224, "step": 1828, "train/total_loss": 0.08522184938192368 }, { "entropy": 6.1913251876831055, "epoch": 0.36167688352778327, "mean_token_accuracy": 0.7895077466964722, "num_tokens": 82094849.0, "step": 1829, "train/ce_loss": 0.6848989129066467 }, { "epoch": 0.36167688352778327, "step": 1829, "train/sim_loss": 0.0002480745315551758 }, { "epoch": 0.36167688352778327, "step": 1829, "train/total_loss": 0.06873796880245209 }, { "entropy": 6.20527458190918, "epoch": 0.3618746292268143, "mean_token_accuracy": 0.7558227181434631, "num_tokens": 82152238.0, "step": 1830, "train/ce_loss": 1.0304306745529175 }, { "epoch": 0.3618746292268143, "step": 1830, "train/sim_loss": 0.00019693374633789062 }, { "epoch": 0.3618746292268143, "step": 1830, "train/total_loss": 0.103240005671978 }, { "entropy": 6.153907299041748, "epoch": 0.3620723749258454, "mean_token_accuracy": 0.7454417943954468, "num_tokens": 82204627.0, "step": 1831, "train/ce_loss": 0.6517117619514465 }, { "epoch": 0.3620723749258454, "step": 1831, "train/sim_loss": 0.00014400482177734375 }, { "epoch": 0.3620723749258454, "step": 1831, "train/total_loss": 0.06531517952680588 }, { "entropy": 6.275851249694824, "epoch": 0.3622701206248764, "mean_token_accuracy": 0.7161971926689148, "num_tokens": 82237488.0, "step": 1832, "train/ce_loss": 1.1381467580795288 }, { "epoch": 0.3622701206248764, "step": 1832, "train/sim_loss": 0.0002925395965576172 }, { "epoch": 0.3622701206248764, "step": 1832, "train/total_loss": 0.11410721391439438 }, { "entropy": 5.773592948913574, "epoch": 0.36246786632390743, "mean_token_accuracy": 0.8042789101600647, "num_tokens": 82291188.0, "step": 1833, "train/ce_loss": 9.821369530982338e-06 }, { "epoch": 0.36246786632390743, "step": 1833, "train/sim_loss": 0.00024753808975219727 }, { "epoch": 0.36246786632390743, "step": 1833, "train/total_loss": 0.0002485202276147902 }, { "entropy": 5.734610557556152, "epoch": 0.3626656120229385, "mean_token_accuracy": 0.7498428821563721, "num_tokens": 82324097.0, "step": 1834, "train/ce_loss": 0.596554696559906 }, { "epoch": 0.3626656120229385, "step": 1834, "train/sim_loss": 0.00023126602172851562 }, { "epoch": 0.3626656120229385, "step": 1834, "train/total_loss": 0.059886734932661057 }, { "entropy": 6.189540863037109, "epoch": 0.36286335772196954, "mean_token_accuracy": 0.7746199369430542, "num_tokens": 82370747.0, "step": 1835, "train/ce_loss": 0.7789009809494019 }, { "epoch": 0.36286335772196954, "step": 1835, "train/sim_loss": 0.00015592575073242188 }, { "epoch": 0.36286335772196954, "step": 1835, "train/total_loss": 0.07804602384567261 }, { "entropy": 6.057797431945801, "epoch": 0.3630611034210006, "mean_token_accuracy": 0.7535301446914673, "num_tokens": 82415224.0, "step": 1836, "train/ce_loss": 0.631793737411499 }, { "epoch": 0.3630611034210006, "step": 1836, "train/sim_loss": 0.00015115737915039062 }, { "epoch": 0.3630611034210006, "step": 1836, "train/total_loss": 0.06333053112030029 }, { "entropy": 6.126429557800293, "epoch": 0.36325884912003165, "mean_token_accuracy": 0.7236743569374084, "num_tokens": 82484897.0, "step": 1837, "train/ce_loss": 1.0451244115829468 }, { "epoch": 0.36325884912003165, "step": 1837, "train/sim_loss": 0.00019365549087524414 }, { "epoch": 0.36325884912003165, "step": 1837, "train/total_loss": 0.10470610111951828 }, { "entropy": 6.2502593994140625, "epoch": 0.3634565948190627, "mean_token_accuracy": 0.7167785167694092, "num_tokens": 82525858.0, "step": 1838, "train/ce_loss": 0.8499458432197571 }, { "epoch": 0.3634565948190627, "step": 1838, "train/sim_loss": 0.0001983642578125 }, { "epoch": 0.3634565948190627, "step": 1838, "train/total_loss": 0.08519294857978821 }, { "entropy": 5.821659088134766, "epoch": 0.36365434051809375, "mean_token_accuracy": 0.7562540173530579, "num_tokens": 82560615.0, "step": 1839, "train/ce_loss": 1.203177571296692 }, { "epoch": 0.36365434051809375, "step": 1839, "train/sim_loss": 0.00022870302200317383 }, { "epoch": 0.36365434051809375, "step": 1839, "train/total_loss": 0.12054646015167236 }, { "epoch": 0.3638520862171248, "grad_norm": 0.4399433135986328, "learning_rate": 9.095360569789297e-06, "loss": 0.0787, "step": 1840 }, { "entropy": 6.203226089477539, "epoch": 0.3638520862171248, "mean_token_accuracy": 0.7299226522445679, "num_tokens": 82605772.0, "step": 1840, "train/ce_loss": 0.5816347002983093 }, { "epoch": 0.3638520862171248, "step": 1840, "train/sim_loss": 0.0001704692840576172 }, { "epoch": 0.3638520862171248, "step": 1840, "train/total_loss": 0.05833394080400467 }, { "entropy": 5.775381088256836, "epoch": 0.3640498319161558, "mean_token_accuracy": 0.7685643434524536, "num_tokens": 82642750.0, "step": 1841, "train/ce_loss": 1.0155216455459595 }, { "epoch": 0.3640498319161558, "step": 1841, "train/sim_loss": 0.00013619661331176758 }, { "epoch": 0.3640498319161558, "step": 1841, "train/total_loss": 0.10168836265802383 }, { "entropy": 5.968633651733398, "epoch": 0.3642475776151869, "mean_token_accuracy": 0.7653429508209229, "num_tokens": 82686824.0, "step": 1842, "train/ce_loss": 1.0095977783203125 }, { "epoch": 0.3642475776151869, "step": 1842, "train/sim_loss": 0.00019311904907226562 }, { "epoch": 0.3642475776151869, "step": 1842, "train/total_loss": 0.10115289688110352 }, { "entropy": 6.140104293823242, "epoch": 0.3644453233142179, "mean_token_accuracy": 0.788597047328949, "num_tokens": 82742237.0, "step": 1843, "train/ce_loss": 0.7454175353050232 }, { "epoch": 0.3644453233142179, "step": 1843, "train/sim_loss": 0.000249326229095459 }, { "epoch": 0.3644453233142179, "step": 1843, "train/total_loss": 0.0747910812497139 }, { "entropy": 5.648642539978027, "epoch": 0.36464306901324894, "mean_token_accuracy": 0.7754164338111877, "num_tokens": 82766615.0, "step": 1844, "train/ce_loss": 0.7156115174293518 }, { "epoch": 0.36464306901324894, "step": 1844, "train/sim_loss": 0.0002117156982421875 }, { "epoch": 0.36464306901324894, "step": 1844, "train/total_loss": 0.07177286595106125 }, { "entropy": 6.493882179260254, "epoch": 0.36484081471228, "mean_token_accuracy": 0.7534935474395752, "num_tokens": 82815092.0, "step": 1845, "train/ce_loss": 0.5802046060562134 }, { "epoch": 0.36484081471228, "step": 1845, "train/sim_loss": 0.00018042325973510742 }, { "epoch": 0.36484081471228, "step": 1845, "train/total_loss": 0.058200884610414505 }, { "entropy": 5.474713325500488, "epoch": 0.36503856041131105, "mean_token_accuracy": 0.7637069225311279, "num_tokens": 82842344.0, "step": 1846, "train/ce_loss": 0.5311844944953918 }, { "epoch": 0.36503856041131105, "step": 1846, "train/sim_loss": 0.00021731853485107422 }, { "epoch": 0.36503856041131105, "step": 1846, "train/total_loss": 0.0533357672393322 }, { "entropy": 6.195293426513672, "epoch": 0.3652363061103421, "mean_token_accuracy": 0.7339339256286621, "num_tokens": 82891548.0, "step": 1847, "train/ce_loss": 1.566123127937317 }, { "epoch": 0.3652363061103421, "step": 1847, "train/sim_loss": 0.0001533031463623047 }, { "epoch": 0.3652363061103421, "step": 1847, "train/total_loss": 0.1567656248807907 }, { "entropy": 6.198780059814453, "epoch": 0.36543405180937316, "mean_token_accuracy": 0.7517588138580322, "num_tokens": 82929252.0, "step": 1848, "train/ce_loss": 1.0466408729553223 }, { "epoch": 0.36543405180937316, "step": 1848, "train/sim_loss": 0.00017380714416503906 }, { "epoch": 0.36543405180937316, "step": 1848, "train/total_loss": 0.10483789443969727 }, { "entropy": 5.934572696685791, "epoch": 0.3656317975084042, "mean_token_accuracy": 0.74842768907547, "num_tokens": 82967714.0, "step": 1849, "train/ce_loss": 1.2167446613311768 }, { "epoch": 0.3656317975084042, "step": 1849, "train/sim_loss": 0.00017821788787841797 }, { "epoch": 0.3656317975084042, "step": 1849, "train/total_loss": 0.12185268849134445 }, { "entropy": 6.40000581741333, "epoch": 0.3658295432074352, "mean_token_accuracy": 0.7446954250335693, "num_tokens": 83020008.0, "step": 1850, "train/ce_loss": 0.9348931312561035 }, { "epoch": 0.3658295432074352, "step": 1850, "train/sim_loss": 0.0001551508903503418 }, { "epoch": 0.3658295432074352, "step": 1850, "train/total_loss": 0.09364446252584457 }, { "entropy": 6.090867042541504, "epoch": 0.3660272889064663, "mean_token_accuracy": 0.7518672347068787, "num_tokens": 83075045.0, "step": 1851, "train/ce_loss": 1.6406481266021729 }, { "epoch": 0.3660272889064663, "step": 1851, "train/sim_loss": 0.0001944899559020996 }, { "epoch": 0.3660272889064663, "step": 1851, "train/total_loss": 0.16425929963588715 }, { "entropy": 5.976355075836182, "epoch": 0.3662250346054973, "mean_token_accuracy": 0.7680327892303467, "num_tokens": 83115671.0, "step": 1852, "train/ce_loss": 0.7440586686134338 }, { "epoch": 0.3662250346054973, "step": 1852, "train/sim_loss": 0.0001481771469116211 }, { "epoch": 0.3662250346054973, "step": 1852, "train/total_loss": 0.07455404847860336 }, { "entropy": 5.829061508178711, "epoch": 0.36642278030452835, "mean_token_accuracy": 0.7303487658500671, "num_tokens": 83161646.0, "step": 1853, "train/ce_loss": 0.6767574548721313 }, { "epoch": 0.36642278030452835, "step": 1853, "train/sim_loss": 0.000263214111328125 }, { "epoch": 0.36642278030452835, "step": 1853, "train/total_loss": 0.06793896108865738 }, { "entropy": 6.028472423553467, "epoch": 0.36662052600355943, "mean_token_accuracy": 0.7358490824699402, "num_tokens": 83198270.0, "step": 1854, "train/ce_loss": 0.7003992795944214 }, { "epoch": 0.36662052600355943, "step": 1854, "train/sim_loss": 0.00015980005264282227 }, { "epoch": 0.36662052600355943, "step": 1854, "train/total_loss": 0.07019972801208496 }, { "entropy": 6.281445503234863, "epoch": 0.36681827170259046, "mean_token_accuracy": 0.7589605450630188, "num_tokens": 83242564.0, "step": 1855, "train/ce_loss": 1.0395845174789429 }, { "epoch": 0.36681827170259046, "step": 1855, "train/sim_loss": 0.00022071599960327148 }, { "epoch": 0.36681827170259046, "step": 1855, "train/total_loss": 0.10417916625738144 }, { "entropy": 6.2009992599487305, "epoch": 0.36701601740162154, "mean_token_accuracy": 0.7439024448394775, "num_tokens": 83277000.0, "step": 1856, "train/ce_loss": 0.7145689129829407 }, { "epoch": 0.36701601740162154, "step": 1856, "train/sim_loss": 0.0002931356430053711 }, { "epoch": 0.36701601740162154, "step": 1856, "train/total_loss": 0.07175002992153168 }, { "entropy": 6.257477760314941, "epoch": 0.36721376310065257, "mean_token_accuracy": 0.7114893794059753, "num_tokens": 83338076.0, "step": 1857, "train/ce_loss": 1.3078763484954834 }, { "epoch": 0.36721376310065257, "step": 1857, "train/sim_loss": 0.00015878677368164062 }, { "epoch": 0.36721376310065257, "step": 1857, "train/total_loss": 0.13094642758369446 }, { "entropy": 6.220616817474365, "epoch": 0.3674115087996836, "mean_token_accuracy": 0.7706422209739685, "num_tokens": 83394172.0, "step": 1858, "train/ce_loss": 0.6507598757743835 }, { "epoch": 0.3674115087996836, "step": 1858, "train/sim_loss": 0.00016224384307861328 }, { "epoch": 0.3674115087996836, "step": 1858, "train/total_loss": 0.06523822993040085 }, { "entropy": 6.091424465179443, "epoch": 0.3676092544987147, "mean_token_accuracy": 0.7475728392601013, "num_tokens": 83436646.0, "step": 1859, "train/ce_loss": 1.2102510929107666 }, { "epoch": 0.3676092544987147, "step": 1859, "train/sim_loss": 0.00016313791275024414 }, { "epoch": 0.3676092544987147, "step": 1859, "train/total_loss": 0.12118824571371078 }, { "epoch": 0.3678070001977457, "grad_norm": 0.4087671637535095, "learning_rate": 9.085468394499951e-06, "loss": 0.0791, "step": 1860 }, { "entropy": 6.262997150421143, "epoch": 0.3678070001977457, "mean_token_accuracy": 0.7635682821273804, "num_tokens": 83471534.0, "step": 1860, "train/ce_loss": 0.6804749965667725 }, { "epoch": 0.3678070001977457, "step": 1860, "train/sim_loss": 0.0002468228340148926 }, { "epoch": 0.3678070001977457, "step": 1860, "train/total_loss": 0.06829432398080826 }, { "entropy": 6.088583946228027, "epoch": 0.36800474589677673, "mean_token_accuracy": 0.7319999933242798, "num_tokens": 83512236.0, "step": 1861, "train/ce_loss": 1.050081491470337 }, { "epoch": 0.36800474589677673, "step": 1861, "train/sim_loss": 0.00016576051712036133 }, { "epoch": 0.36800474589677673, "step": 1861, "train/total_loss": 0.10517390817403793 }, { "entropy": 6.036880016326904, "epoch": 0.3682024915958078, "mean_token_accuracy": 0.7622377872467041, "num_tokens": 83553474.0, "step": 1862, "train/ce_loss": 1.1656856258923654e-05 }, { "epoch": 0.3682024915958078, "step": 1862, "train/sim_loss": 0.00016999244689941406 }, { "epoch": 0.3682024915958078, "step": 1862, "train/total_loss": 0.0001711581280687824 }, { "entropy": 5.922101020812988, "epoch": 0.36840023729483884, "mean_token_accuracy": 0.7426218390464783, "num_tokens": 83593650.0, "step": 1863, "train/ce_loss": 7.939136594359297e-06 }, { "epoch": 0.36840023729483884, "step": 1863, "train/sim_loss": 0.0003217458724975586 }, { "epoch": 0.36840023729483884, "step": 1863, "train/total_loss": 0.00032253979588858783 }, { "entropy": 6.018083572387695, "epoch": 0.36859798299386987, "mean_token_accuracy": 0.7697076797485352, "num_tokens": 83652446.0, "step": 1864, "train/ce_loss": 1.3244602680206299 }, { "epoch": 0.36859798299386987, "step": 1864, "train/sim_loss": 0.0002772808074951172 }, { "epoch": 0.36859798299386987, "step": 1864, "train/total_loss": 0.13272331655025482 }, { "entropy": 6.152430534362793, "epoch": 0.36879572869290095, "mean_token_accuracy": 0.767005443572998, "num_tokens": 83688701.0, "step": 1865, "train/ce_loss": 0.8243263959884644 }, { "epoch": 0.36879572869290095, "step": 1865, "train/sim_loss": 0.00014221668243408203 }, { "epoch": 0.36879572869290095, "step": 1865, "train/total_loss": 0.08257485926151276 }, { "entropy": 6.246474266052246, "epoch": 0.368993474391932, "mean_token_accuracy": 0.7760695219039917, "num_tokens": 83741026.0, "step": 1866, "train/ce_loss": 0.7809668779373169 }, { "epoch": 0.368993474391932, "step": 1866, "train/sim_loss": 0.00013464689254760742 }, { "epoch": 0.368993474391932, "step": 1866, "train/total_loss": 0.0782313346862793 }, { "entropy": 6.418489456176758, "epoch": 0.369191220090963, "mean_token_accuracy": 0.7541452050209045, "num_tokens": 83791671.0, "step": 1867, "train/ce_loss": 0.946120023727417 }, { "epoch": 0.369191220090963, "step": 1867, "train/sim_loss": 0.00015085935592651367 }, { "epoch": 0.369191220090963, "step": 1867, "train/total_loss": 0.09476286172866821 }, { "entropy": 6.275266647338867, "epoch": 0.3693889657899941, "mean_token_accuracy": 0.7347561120986938, "num_tokens": 83844911.0, "step": 1868, "train/ce_loss": 0.6528725624084473 }, { "epoch": 0.3693889657899941, "step": 1868, "train/sim_loss": 0.00017088651657104492 }, { "epoch": 0.3693889657899941, "step": 1868, "train/total_loss": 0.06545814126729965 }, { "entropy": 6.360476970672607, "epoch": 0.3695867114890251, "mean_token_accuracy": 0.733130693435669, "num_tokens": 83894643.0, "step": 1869, "train/ce_loss": 0.6079747080802917 }, { "epoch": 0.3695867114890251, "step": 1869, "train/sim_loss": 0.00020116567611694336 }, { "epoch": 0.3695867114890251, "step": 1869, "train/total_loss": 0.06099863722920418 }, { "entropy": 6.05117130279541, "epoch": 0.36978445718805614, "mean_token_accuracy": 0.7699386477470398, "num_tokens": 83940821.0, "step": 1870, "train/ce_loss": 0.9335567355155945 }, { "epoch": 0.36978445718805614, "step": 1870, "train/sim_loss": 0.0002480149269104004 }, { "epoch": 0.36978445718805614, "step": 1870, "train/total_loss": 0.0936036929488182 }, { "entropy": 6.132176399230957, "epoch": 0.3699822028870872, "mean_token_accuracy": 0.7645650506019592, "num_tokens": 83982609.0, "step": 1871, "train/ce_loss": 1.4046333490114193e-05 }, { "epoch": 0.3699822028870872, "step": 1871, "train/sim_loss": 0.0003097057342529297 }, { "epoch": 0.3699822028870872, "step": 1871, "train/total_loss": 0.000311110372422263 }, { "entropy": 6.25678014755249, "epoch": 0.37017994858611825, "mean_token_accuracy": 0.7489878535270691, "num_tokens": 84025864.0, "step": 1872, "train/ce_loss": 0.8967486619949341 }, { "epoch": 0.37017994858611825, "step": 1872, "train/sim_loss": 0.0001971125602722168 }, { "epoch": 0.37017994858611825, "step": 1872, "train/total_loss": 0.08987198024988174 }, { "entropy": 5.7154974937438965, "epoch": 0.3703776942851493, "mean_token_accuracy": 0.7723183631896973, "num_tokens": 84055032.0, "step": 1873, "train/ce_loss": 1.9379434888833202e-05 }, { "epoch": 0.3703776942851493, "step": 1873, "train/sim_loss": 0.00015807151794433594 }, { "epoch": 0.3703776942851493, "step": 1873, "train/total_loss": 0.00016000945470295846 }, { "entropy": 5.760051727294922, "epoch": 0.37057543998418035, "mean_token_accuracy": 0.7432432174682617, "num_tokens": 84103327.0, "step": 1874, "train/ce_loss": 1.0436153411865234 }, { "epoch": 0.37057543998418035, "step": 1874, "train/sim_loss": 0.0002467632293701172 }, { "epoch": 0.37057543998418035, "step": 1874, "train/total_loss": 0.10460829734802246 }, { "entropy": 5.646867752075195, "epoch": 0.3707731856832114, "mean_token_accuracy": 0.723929762840271, "num_tokens": 84158650.0, "step": 1875, "train/ce_loss": 0.8801656365394592 }, { "epoch": 0.3707731856832114, "step": 1875, "train/sim_loss": 0.00022530555725097656 }, { "epoch": 0.3707731856832114, "step": 1875, "train/total_loss": 0.08824186772108078 }, { "entropy": 5.677369117736816, "epoch": 0.37097093138224246, "mean_token_accuracy": 0.7874564528465271, "num_tokens": 84202068.0, "step": 1876, "train/ce_loss": 0.979835033416748 }, { "epoch": 0.37097093138224246, "step": 1876, "train/sim_loss": 0.00029033422470092773 }, { "epoch": 0.37097093138224246, "step": 1876, "train/total_loss": 0.09827383607625961 }, { "entropy": 6.246989727020264, "epoch": 0.3711686770812735, "mean_token_accuracy": 0.7247191071510315, "num_tokens": 84260576.0, "step": 1877, "train/ce_loss": 1.0398510694503784 }, { "epoch": 0.3711686770812735, "step": 1877, "train/sim_loss": 0.00022494792938232422 }, { "epoch": 0.3711686770812735, "step": 1877, "train/total_loss": 0.10421005636453629 }, { "entropy": 6.3699798583984375, "epoch": 0.3713664227803045, "mean_token_accuracy": 0.730402946472168, "num_tokens": 84310359.0, "step": 1878, "train/ce_loss": 1.0224618911743164 }, { "epoch": 0.3713664227803045, "step": 1878, "train/sim_loss": 0.00020140409469604492 }, { "epoch": 0.3713664227803045, "step": 1878, "train/total_loss": 0.10244759172201157 }, { "entropy": 6.368810653686523, "epoch": 0.3715641684793356, "mean_token_accuracy": 0.717875063419342, "num_tokens": 84376182.0, "step": 1879, "train/ce_loss": 0.9234561324119568 }, { "epoch": 0.3715641684793356, "step": 1879, "train/sim_loss": 0.0002194046974182129 }, { "epoch": 0.3715641684793356, "step": 1879, "train/total_loss": 0.09256502240896225 }, { "epoch": 0.3717619141783666, "grad_norm": 0.4851798415184021, "learning_rate": 9.075576219210605e-06, "loss": 0.079, "step": 1880 }, { "entropy": 5.754037857055664, "epoch": 0.3717619141783666, "mean_token_accuracy": 0.739234447479248, "num_tokens": 84413628.0, "step": 1880, "train/ce_loss": 1.182148814201355 }, { "epoch": 0.3717619141783666, "step": 1880, "train/sim_loss": 0.0001679062843322754 }, { "epoch": 0.3717619141783666, "step": 1880, "train/total_loss": 0.11838278919458389 }, { "entropy": 6.09008264541626, "epoch": 0.37195965987739765, "mean_token_accuracy": 0.7384076714515686, "num_tokens": 84446005.0, "step": 1881, "train/ce_loss": 0.9617998003959656 }, { "epoch": 0.37195965987739765, "step": 1881, "train/sim_loss": 0.0001156926155090332 }, { "epoch": 0.37195965987739765, "step": 1881, "train/total_loss": 0.09629567712545395 }, { "entropy": 6.6140947341918945, "epoch": 0.37215740557642873, "mean_token_accuracy": 0.7330595254898071, "num_tokens": 84505051.0, "step": 1882, "train/ce_loss": 1.4366703033447266 }, { "epoch": 0.37215740557642873, "step": 1882, "train/sim_loss": 0.0003299713134765625 }, { "epoch": 0.37215740557642873, "step": 1882, "train/total_loss": 0.14399699866771698 }, { "entropy": 6.090902328491211, "epoch": 0.37235515127545976, "mean_token_accuracy": 0.7442290186882019, "num_tokens": 84554171.0, "step": 1883, "train/ce_loss": 1.2306835742492694e-05 }, { "epoch": 0.37235515127545976, "step": 1883, "train/sim_loss": 0.00015532970428466797 }, { "epoch": 0.37235515127545976, "step": 1883, "train/total_loss": 0.0001565603888593614 }, { "entropy": 6.286965370178223, "epoch": 0.3725528969744908, "mean_token_accuracy": 0.7331223487854004, "num_tokens": 84609303.0, "step": 1884, "train/ce_loss": 0.8240832090377808 }, { "epoch": 0.3725528969744908, "step": 1884, "train/sim_loss": 0.00033402442932128906 }, { "epoch": 0.3725528969744908, "step": 1884, "train/total_loss": 0.0827423483133316 }, { "entropy": 5.804598331451416, "epoch": 0.37275064267352187, "mean_token_accuracy": 0.7324093580245972, "num_tokens": 84665232.0, "step": 1885, "train/ce_loss": 0.542401909828186 }, { "epoch": 0.37275064267352187, "step": 1885, "train/sim_loss": 0.00016069412231445312 }, { "epoch": 0.37275064267352187, "step": 1885, "train/total_loss": 0.054400887340307236 }, { "entropy": 6.370386600494385, "epoch": 0.3729483883725529, "mean_token_accuracy": 0.7567999958992004, "num_tokens": 84707272.0, "step": 1886, "train/ce_loss": 0.9210249781608582 }, { "epoch": 0.3729483883725529, "step": 1886, "train/sim_loss": 0.0001690387725830078 }, { "epoch": 0.3729483883725529, "step": 1886, "train/total_loss": 0.09227153658866882 }, { "entropy": 6.130235195159912, "epoch": 0.3731461340715839, "mean_token_accuracy": 0.7226162552833557, "num_tokens": 84744045.0, "step": 1887, "train/ce_loss": 1.0500678399694152e-05 }, { "epoch": 0.3731461340715839, "step": 1887, "train/sim_loss": 0.00015801191329956055 }, { "epoch": 0.3731461340715839, "step": 1887, "train/total_loss": 0.0001590619795024395 }, { "entropy": 6.006278038024902, "epoch": 0.373343879770615, "mean_token_accuracy": 0.7312947511672974, "num_tokens": 84794491.0, "step": 1888, "train/ce_loss": 7.510292107326677e-06 }, { "epoch": 0.373343879770615, "step": 1888, "train/sim_loss": 0.00023698806762695312 }, { "epoch": 0.373343879770615, "step": 1888, "train/total_loss": 0.00023773909197188914 }, { "entropy": 5.935866832733154, "epoch": 0.37354162546964603, "mean_token_accuracy": 0.7655417323112488, "num_tokens": 84824255.0, "step": 1889, "train/ce_loss": 0.8779361844062805 }, { "epoch": 0.37354162546964603, "step": 1889, "train/sim_loss": 0.0002677440643310547 }, { "epoch": 0.37354162546964603, "step": 1889, "train/total_loss": 0.0880613625049591 }, { "entropy": 6.144777774810791, "epoch": 0.37373937116867706, "mean_token_accuracy": 0.76917564868927, "num_tokens": 84856756.0, "step": 1890, "train/ce_loss": 0.7362905740737915 }, { "epoch": 0.37373937116867706, "step": 1890, "train/sim_loss": 0.00018006563186645508 }, { "epoch": 0.37373937116867706, "step": 1890, "train/total_loss": 0.07380912452936172 }, { "entropy": 6.353994846343994, "epoch": 0.37393711686770814, "mean_token_accuracy": 0.7568947672843933, "num_tokens": 84897711.0, "step": 1891, "train/ce_loss": 0.5456266403198242 }, { "epoch": 0.37393711686770814, "step": 1891, "train/sim_loss": 0.00014770030975341797 }, { "epoch": 0.37393711686770814, "step": 1891, "train/total_loss": 0.05471036583185196 }, { "entropy": 6.237025260925293, "epoch": 0.37413486256673917, "mean_token_accuracy": 0.7227324843406677, "num_tokens": 84938019.0, "step": 1892, "train/ce_loss": 1.1014094352722168 }, { "epoch": 0.37413486256673917, "step": 1892, "train/sim_loss": 0.00015437602996826172 }, { "epoch": 0.37413486256673917, "step": 1892, "train/total_loss": 0.11029531806707382 }, { "entropy": 6.156904220581055, "epoch": 0.3743326082657702, "mean_token_accuracy": 0.7321267127990723, "num_tokens": 84975096.0, "step": 1893, "train/ce_loss": 1.0120208571606781e-05 }, { "epoch": 0.3743326082657702, "step": 1893, "train/sim_loss": 0.0002841949462890625 }, { "epoch": 0.3743326082657702, "step": 1893, "train/total_loss": 0.0002852069737855345 }, { "entropy": 6.372241497039795, "epoch": 0.3745303539648013, "mean_token_accuracy": 0.7936508059501648, "num_tokens": 85020267.0, "step": 1894, "train/ce_loss": 0.3738952577114105 }, { "epoch": 0.3745303539648013, "step": 1894, "train/sim_loss": 0.00017774105072021484 }, { "epoch": 0.3745303539648013, "step": 1894, "train/total_loss": 0.037567269057035446 }, { "entropy": 6.474673271179199, "epoch": 0.3747280996638323, "mean_token_accuracy": 0.747391939163208, "num_tokens": 85082746.0, "step": 1895, "train/ce_loss": 8.222382348321844e-06 }, { "epoch": 0.3747280996638323, "step": 1895, "train/sim_loss": 0.00017386674880981445 }, { "epoch": 0.3747280996638323, "step": 1895, "train/total_loss": 0.0001746889902278781 }, { "entropy": 6.481321811676025, "epoch": 0.37492584536286333, "mean_token_accuracy": 0.7345678806304932, "num_tokens": 85125924.0, "step": 1896, "train/ce_loss": 0.7734512686729431 }, { "epoch": 0.37492584536286333, "step": 1896, "train/sim_loss": 0.0001977086067199707 }, { "epoch": 0.37492584536286333, "step": 1896, "train/total_loss": 0.07754283398389816 }, { "entropy": 6.450306415557861, "epoch": 0.3751235910618944, "mean_token_accuracy": 0.7059490084648132, "num_tokens": 85170033.0, "step": 1897, "train/ce_loss": 0.725158154964447 }, { "epoch": 0.3751235910618944, "step": 1897, "train/sim_loss": 0.0002072453498840332 }, { "epoch": 0.3751235910618944, "step": 1897, "train/total_loss": 0.07272306084632874 }, { "entropy": 5.941999435424805, "epoch": 0.37532133676092544, "mean_token_accuracy": 0.7831395268440247, "num_tokens": 85216084.0, "step": 1898, "train/ce_loss": 0.6726963520050049 }, { "epoch": 0.37532133676092544, "step": 1898, "train/sim_loss": 0.00019174814224243164 }, { "epoch": 0.37532133676092544, "step": 1898, "train/total_loss": 0.06746138632297516 }, { "entropy": 6.151829242706299, "epoch": 0.3755190824599565, "mean_token_accuracy": 0.7535070180892944, "num_tokens": 85254777.0, "step": 1899, "train/ce_loss": 0.8875383138656616 }, { "epoch": 0.3755190824599565, "step": 1899, "train/sim_loss": 0.00022649765014648438 }, { "epoch": 0.3755190824599565, "step": 1899, "train/total_loss": 0.08898033201694489 }, { "epoch": 0.37571682815898755, "grad_norm": 0.4473031759262085, "learning_rate": 9.065684043921259e-06, "loss": 0.0815, "step": 1900 }, { "entropy": 6.508297443389893, "epoch": 0.37571682815898755, "mean_token_accuracy": 0.7588817477226257, "num_tokens": 85306414.0, "step": 1900, "train/ce_loss": 0.6376824975013733 }, { "epoch": 0.37571682815898755, "step": 1900, "train/sim_loss": 0.0001672506332397461 }, { "epoch": 0.37571682815898755, "step": 1900, "train/total_loss": 0.06393550336360931 }, { "entropy": 6.259031772613525, "epoch": 0.3759145738580186, "mean_token_accuracy": 0.7590435147285461, "num_tokens": 85366951.0, "step": 1901, "train/ce_loss": 1.0132285356521606 }, { "epoch": 0.3759145738580186, "step": 1901, "train/sim_loss": 0.00018674135208129883 }, { "epoch": 0.3759145738580186, "step": 1901, "train/total_loss": 0.10150959342718124 }, { "entropy": 6.011051654815674, "epoch": 0.37611231955704966, "mean_token_accuracy": 0.7715486884117126, "num_tokens": 85422460.0, "step": 1902, "train/ce_loss": 0.8330803513526917 }, { "epoch": 0.37611231955704966, "step": 1902, "train/sim_loss": 0.00017344951629638672 }, { "epoch": 0.37611231955704966, "step": 1902, "train/total_loss": 0.08348148316144943 }, { "entropy": 5.906074047088623, "epoch": 0.3763100652560807, "mean_token_accuracy": 0.7617924809455872, "num_tokens": 85464174.0, "step": 1903, "train/ce_loss": 2.4432053565979004 }, { "epoch": 0.3763100652560807, "step": 1903, "train/sim_loss": 0.00012892484664916992 }, { "epoch": 0.3763100652560807, "step": 1903, "train/total_loss": 0.2444494664669037 }, { "entropy": 6.090970993041992, "epoch": 0.3765078109551117, "mean_token_accuracy": 0.7569919228553772, "num_tokens": 85498720.0, "step": 1904, "train/ce_loss": 0.7793447375297546 }, { "epoch": 0.3765078109551117, "step": 1904, "train/sim_loss": 0.00014543533325195312 }, { "epoch": 0.3765078109551117, "step": 1904, "train/total_loss": 0.07807990908622742 }, { "entropy": 6.218658447265625, "epoch": 0.3767055566541428, "mean_token_accuracy": 0.7051671743392944, "num_tokens": 85538940.0, "step": 1905, "train/ce_loss": 0.7835392951965332 }, { "epoch": 0.3767055566541428, "step": 1905, "train/sim_loss": 0.0002193450927734375 }, { "epoch": 0.3767055566541428, "step": 1905, "train/total_loss": 0.07857327908277512 }, { "entropy": 5.899871826171875, "epoch": 0.3769033023531738, "mean_token_accuracy": 0.7463818192481995, "num_tokens": 85572711.0, "step": 1906, "train/ce_loss": 1.161563754081726 }, { "epoch": 0.3769033023531738, "step": 1906, "train/sim_loss": 0.0002557039260864258 }, { "epoch": 0.3769033023531738, "step": 1906, "train/total_loss": 0.11641208082437515 }, { "entropy": 6.164103984832764, "epoch": 0.37710104805220485, "mean_token_accuracy": 0.8037518262863159, "num_tokens": 85622636.0, "step": 1907, "train/ce_loss": 0.8630029559135437 }, { "epoch": 0.37710104805220485, "step": 1907, "train/sim_loss": 0.00016099214553833008 }, { "epoch": 0.37710104805220485, "step": 1907, "train/total_loss": 0.08646129071712494 }, { "entropy": 6.152756690979004, "epoch": 0.37729879375123593, "mean_token_accuracy": 0.7341693043708801, "num_tokens": 85665893.0, "step": 1908, "train/ce_loss": 0.6181596517562866 }, { "epoch": 0.37729879375123593, "step": 1908, "train/sim_loss": 0.0002543926239013672 }, { "epoch": 0.37729879375123593, "step": 1908, "train/total_loss": 0.06207035854458809 }, { "entropy": 6.09741735458374, "epoch": 0.37749653945026695, "mean_token_accuracy": 0.733116865158081, "num_tokens": 85723373.0, "step": 1909, "train/ce_loss": 0.6055279970169067 }, { "epoch": 0.37749653945026695, "step": 1909, "train/sim_loss": 0.00014781951904296875 }, { "epoch": 0.37749653945026695, "step": 1909, "train/total_loss": 0.06070062145590782 }, { "entropy": 5.923820495605469, "epoch": 0.377694285149298, "mean_token_accuracy": 0.7679944038391113, "num_tokens": 85766450.0, "step": 1910, "train/ce_loss": 1.2476615905761719 }, { "epoch": 0.377694285149298, "step": 1910, "train/sim_loss": 0.0002835988998413086 }, { "epoch": 0.377694285149298, "step": 1910, "train/total_loss": 0.12504976987838745 }, { "entropy": 5.9649763107299805, "epoch": 0.37789203084832906, "mean_token_accuracy": 0.7631403803825378, "num_tokens": 85807710.0, "step": 1911, "train/ce_loss": 0.448843389749527 }, { "epoch": 0.37789203084832906, "step": 1911, "train/sim_loss": 0.00017088651657104492 }, { "epoch": 0.37789203084832906, "step": 1911, "train/total_loss": 0.04505522549152374 }, { "entropy": 6.743001937866211, "epoch": 0.3780897765473601, "mean_token_accuracy": 0.7161961197853088, "num_tokens": 85843015.0, "step": 1912, "train/ce_loss": 0.7061569094657898 }, { "epoch": 0.3780897765473601, "step": 1912, "train/sim_loss": 0.00016188621520996094 }, { "epoch": 0.3780897765473601, "step": 1912, "train/total_loss": 0.07077758014202118 }, { "entropy": 6.496891498565674, "epoch": 0.3782875222463911, "mean_token_accuracy": 0.7296395301818848, "num_tokens": 85891355.0, "step": 1913, "train/ce_loss": 0.6313892006874084 }, { "epoch": 0.3782875222463911, "step": 1913, "train/sim_loss": 0.0002276301383972168 }, { "epoch": 0.3782875222463911, "step": 1913, "train/total_loss": 0.06336655467748642 }, { "entropy": 6.693994998931885, "epoch": 0.3784852679454222, "mean_token_accuracy": 0.7475214600563049, "num_tokens": 85940970.0, "step": 1914, "train/ce_loss": 8.859516128723044e-06 }, { "epoch": 0.3784852679454222, "step": 1914, "train/sim_loss": 0.00021064281463623047 }, { "epoch": 0.3784852679454222, "step": 1914, "train/total_loss": 0.0002115287643391639 }, { "entropy": 6.518868446350098, "epoch": 0.3786830136444532, "mean_token_accuracy": 0.7288046479225159, "num_tokens": 85979247.0, "step": 1915, "train/ce_loss": 0.7041361927986145 }, { "epoch": 0.3786830136444532, "step": 1915, "train/sim_loss": 0.00020313262939453125 }, { "epoch": 0.3786830136444532, "step": 1915, "train/total_loss": 0.07061675190925598 }, { "entropy": 6.334962844848633, "epoch": 0.37888075934348425, "mean_token_accuracy": 0.7424242496490479, "num_tokens": 86013233.0, "step": 1916, "train/ce_loss": 9.938314178725705e-06 }, { "epoch": 0.37888075934348425, "step": 1916, "train/sim_loss": 0.00014740228652954102 }, { "epoch": 0.37888075934348425, "step": 1916, "train/total_loss": 0.00014839612413197756 }, { "entropy": 6.264568328857422, "epoch": 0.37907850504251533, "mean_token_accuracy": 0.7065868377685547, "num_tokens": 86073438.0, "step": 1917, "train/ce_loss": 1.3842999935150146 }, { "epoch": 0.37907850504251533, "step": 1917, "train/sim_loss": 0.00042432546615600586 }, { "epoch": 0.37907850504251533, "step": 1917, "train/total_loss": 0.13885432481765747 }, { "entropy": 5.774651527404785, "epoch": 0.37927625074154636, "mean_token_accuracy": 0.7754759192466736, "num_tokens": 86109238.0, "step": 1918, "train/ce_loss": 0.855889618396759 }, { "epoch": 0.37927625074154636, "step": 1918, "train/sim_loss": 0.00022411346435546875 }, { "epoch": 0.37927625074154636, "step": 1918, "train/total_loss": 0.08581307530403137 }, { "entropy": 6.119989395141602, "epoch": 0.37947399644057744, "mean_token_accuracy": 0.758142352104187, "num_tokens": 86162048.0, "step": 1919, "train/ce_loss": 0.8780182003974915 }, { "epoch": 0.37947399644057744, "step": 1919, "train/sim_loss": 0.0001901388168334961 }, { "epoch": 0.37947399644057744, "step": 1919, "train/total_loss": 0.08799196034669876 }, { "epoch": 0.37967174213960847, "grad_norm": 0.3981471657752991, "learning_rate": 9.055791868631912e-06, "loss": 0.0805, "step": 1920 }, { "entropy": 5.620644569396973, "epoch": 0.37967174213960847, "mean_token_accuracy": 0.7318702340126038, "num_tokens": 86205326.0, "step": 1920, "train/ce_loss": 1.0452855825424194 }, { "epoch": 0.37967174213960847, "step": 1920, "train/sim_loss": 0.0001857280731201172 }, { "epoch": 0.37967174213960847, "step": 1920, "train/total_loss": 0.1047142893075943 }, { "entropy": 5.714784622192383, "epoch": 0.3798694878386395, "mean_token_accuracy": 0.7796609997749329, "num_tokens": 86256222.0, "step": 1921, "train/ce_loss": 0.9451001286506653 }, { "epoch": 0.3798694878386395, "step": 1921, "train/sim_loss": 0.00016993284225463867 }, { "epoch": 0.3798694878386395, "step": 1921, "train/total_loss": 0.09467994421720505 }, { "entropy": 5.646666526794434, "epoch": 0.3800672335376706, "mean_token_accuracy": 0.7481967210769653, "num_tokens": 86290808.0, "step": 1922, "train/ce_loss": 1.3011598587036133 }, { "epoch": 0.3800672335376706, "step": 1922, "train/sim_loss": 0.00018209218978881836 }, { "epoch": 0.3800672335376706, "step": 1922, "train/total_loss": 0.13029807806015015 }, { "entropy": 5.7297210693359375, "epoch": 0.3802649792367016, "mean_token_accuracy": 0.7730299830436707, "num_tokens": 86327918.0, "step": 1923, "train/ce_loss": 0.6802741885185242 }, { "epoch": 0.3802649792367016, "step": 1923, "train/sim_loss": 0.0001970529556274414 }, { "epoch": 0.3802649792367016, "step": 1923, "train/total_loss": 0.0682244747877121 }, { "entropy": 5.811336517333984, "epoch": 0.38046272493573263, "mean_token_accuracy": 0.7640782594680786, "num_tokens": 86389496.0, "step": 1924, "train/ce_loss": 1.0145856142044067 }, { "epoch": 0.38046272493573263, "step": 1924, "train/sim_loss": 0.00020927190780639648 }, { "epoch": 0.38046272493573263, "step": 1924, "train/total_loss": 0.10166783630847931 }, { "entropy": 5.830902099609375, "epoch": 0.3806604706347637, "mean_token_accuracy": 0.7422113418579102, "num_tokens": 86431798.0, "step": 1925, "train/ce_loss": 0.5430341362953186 }, { "epoch": 0.3806604706347637, "step": 1925, "train/sim_loss": 0.00013011693954467773 }, { "epoch": 0.3806604706347637, "step": 1925, "train/total_loss": 0.05443353205919266 }, { "entropy": 5.674495697021484, "epoch": 0.38085821633379474, "mean_token_accuracy": 0.7488458156585693, "num_tokens": 86472240.0, "step": 1926, "train/ce_loss": 2.4088070858852006e-05 }, { "epoch": 0.38085821633379474, "step": 1926, "train/sim_loss": 0.0002949237823486328 }, { "epoch": 0.38085821633379474, "step": 1926, "train/total_loss": 0.00029733258998021483 }, { "entropy": 6.012594223022461, "epoch": 0.38105596203282577, "mean_token_accuracy": 0.7437666058540344, "num_tokens": 86526530.0, "step": 1927, "train/ce_loss": 1.1365164518356323 }, { "epoch": 0.38105596203282577, "step": 1927, "train/sim_loss": 0.00023502111434936523 }, { "epoch": 0.38105596203282577, "step": 1927, "train/total_loss": 0.11388666927814484 }, { "entropy": 5.9150919914245605, "epoch": 0.38125370773185685, "mean_token_accuracy": 0.6944881677627563, "num_tokens": 86585806.0, "step": 1928, "train/ce_loss": 2.176605224609375 }, { "epoch": 0.38125370773185685, "step": 1928, "train/sim_loss": 0.00018346309661865234 }, { "epoch": 0.38125370773185685, "step": 1928, "train/total_loss": 0.21784399449825287 }, { "entropy": 5.795348167419434, "epoch": 0.3814514534308879, "mean_token_accuracy": 0.7648663520812988, "num_tokens": 86623994.0, "step": 1929, "train/ce_loss": 0.5205368995666504 }, { "epoch": 0.3814514534308879, "step": 1929, "train/sim_loss": 0.0001266002655029297 }, { "epoch": 0.3814514534308879, "step": 1929, "train/total_loss": 0.05218029022216797 }, { "entropy": 5.689846038818359, "epoch": 0.3816491991299189, "mean_token_accuracy": 0.7676203846931458, "num_tokens": 86685970.0, "step": 1930, "train/ce_loss": 0.8714039921760559 }, { "epoch": 0.3816491991299189, "step": 1930, "train/sim_loss": 0.000278472900390625 }, { "epoch": 0.3816491991299189, "step": 1930, "train/total_loss": 0.08741887658834457 }, { "entropy": 6.306318283081055, "epoch": 0.38184694482895, "mean_token_accuracy": 0.721666693687439, "num_tokens": 86739707.0, "step": 1931, "train/ce_loss": 0.514968991279602 }, { "epoch": 0.38184694482895, "step": 1931, "train/sim_loss": 0.00015151500701904297 }, { "epoch": 0.38184694482895, "step": 1931, "train/total_loss": 0.05164841562509537 }, { "entropy": 5.770925521850586, "epoch": 0.382044690527981, "mean_token_accuracy": 0.7441224455833435, "num_tokens": 86776367.0, "step": 1932, "train/ce_loss": 1.1303026676177979 }, { "epoch": 0.382044690527981, "step": 1932, "train/sim_loss": 0.0002562999725341797 }, { "epoch": 0.382044690527981, "step": 1932, "train/total_loss": 0.1132865697145462 }, { "entropy": 5.919334888458252, "epoch": 0.38224243622701204, "mean_token_accuracy": 0.7199512124061584, "num_tokens": 86821971.0, "step": 1933, "train/ce_loss": 0.9035099148750305 }, { "epoch": 0.38224243622701204, "step": 1933, "train/sim_loss": 0.00014966726303100586 }, { "epoch": 0.38224243622701204, "step": 1933, "train/total_loss": 0.09050066024065018 }, { "entropy": 6.359022617340088, "epoch": 0.3824401819260431, "mean_token_accuracy": 0.7567370533943176, "num_tokens": 86863022.0, "step": 1934, "train/ce_loss": 1.1745451956812758e-05 }, { "epoch": 0.3824401819260431, "step": 1934, "train/sim_loss": 0.0001659989356994629 }, { "epoch": 0.3824401819260431, "step": 1934, "train/total_loss": 0.0001671734789852053 }, { "entropy": 6.375410079956055, "epoch": 0.38263792762507415, "mean_token_accuracy": 0.7087643146514893, "num_tokens": 86914727.0, "step": 1935, "train/ce_loss": 0.7581233382225037 }, { "epoch": 0.38263792762507415, "step": 1935, "train/sim_loss": 0.00018978118896484375 }, { "epoch": 0.38263792762507415, "step": 1935, "train/total_loss": 0.07600211352109909 }, { "entropy": 6.498897552490234, "epoch": 0.3828356733241052, "mean_token_accuracy": 0.7204861044883728, "num_tokens": 86969116.0, "step": 1936, "train/ce_loss": 1.0400906801223755 }, { "epoch": 0.3828356733241052, "step": 1936, "train/sim_loss": 0.00019788742065429688 }, { "epoch": 0.3828356733241052, "step": 1936, "train/total_loss": 0.10420695692300797 }, { "entropy": 5.93052864074707, "epoch": 0.38303341902313626, "mean_token_accuracy": 0.7533496022224426, "num_tokens": 87012390.0, "step": 1937, "train/ce_loss": 0.704498827457428 }, { "epoch": 0.38303341902313626, "step": 1937, "train/sim_loss": 0.0002002716064453125 }, { "epoch": 0.38303341902313626, "step": 1937, "train/total_loss": 0.07065015286207199 }, { "entropy": 6.304135322570801, "epoch": 0.3832311647221673, "mean_token_accuracy": 0.7157434225082397, "num_tokens": 87053169.0, "step": 1938, "train/ce_loss": 3.29619251715485e-05 }, { "epoch": 0.3832311647221673, "step": 1938, "train/sim_loss": 0.0002353191375732422 }, { "epoch": 0.3832311647221673, "step": 1938, "train/total_loss": 0.00023861533554736525 }, { "entropy": 6.115264892578125, "epoch": 0.38342891042119837, "mean_token_accuracy": 0.727956235408783, "num_tokens": 87107475.0, "step": 1939, "train/ce_loss": 0.559650719165802 }, { "epoch": 0.38342891042119837, "step": 1939, "train/sim_loss": 0.0002410411834716797 }, { "epoch": 0.38342891042119837, "step": 1939, "train/total_loss": 0.056206114590168 }, { "epoch": 0.3836266561202294, "grad_norm": 0.451919823884964, "learning_rate": 9.045899693342566e-06, "loss": 0.0819, "step": 1940 }, { "entropy": 6.024674415588379, "epoch": 0.3836266561202294, "mean_token_accuracy": 0.7206012606620789, "num_tokens": 87152259.0, "step": 1940, "train/ce_loss": 2.45851042564027e-05 }, { "epoch": 0.3836266561202294, "step": 1940, "train/sim_loss": 0.00014168024063110352 }, { "epoch": 0.3836266561202294, "step": 1940, "train/total_loss": 0.00014413875760510564 }, { "entropy": 5.9431562423706055, "epoch": 0.3838244018192604, "mean_token_accuracy": 0.7560241222381592, "num_tokens": 87198189.0, "step": 1941, "train/ce_loss": 1.6512462025275454e-05 }, { "epoch": 0.3838244018192604, "step": 1941, "train/sim_loss": 0.00021976232528686523 }, { "epoch": 0.3838244018192604, "step": 1941, "train/total_loss": 0.0002214135747635737 }, { "entropy": 5.778876304626465, "epoch": 0.3840221475182915, "mean_token_accuracy": 0.7502662539482117, "num_tokens": 87241371.0, "step": 1942, "train/ce_loss": 0.7181722521781921 }, { "epoch": 0.3840221475182915, "step": 1942, "train/sim_loss": 0.00013393163681030273 }, { "epoch": 0.3840221475182915, "step": 1942, "train/total_loss": 0.07195115834474564 }, { "entropy": 6.417158126831055, "epoch": 0.38421989321732253, "mean_token_accuracy": 0.7325789332389832, "num_tokens": 87282191.0, "step": 1943, "train/ce_loss": 7.2126231316360645e-06 }, { "epoch": 0.38421989321732253, "step": 1943, "train/sim_loss": 0.00013196468353271484 }, { "epoch": 0.38421989321732253, "step": 1943, "train/total_loss": 0.00013268594921100885 }, { "entropy": 6.20965051651001, "epoch": 0.38441763891635355, "mean_token_accuracy": 0.7586891651153564, "num_tokens": 87342549.0, "step": 1944, "train/ce_loss": 1.553360743855592e-05 }, { "epoch": 0.38441763891635355, "step": 1944, "train/sim_loss": 0.00020706653594970703 }, { "epoch": 0.38441763891635355, "step": 1944, "train/total_loss": 0.00020861989469267428 }, { "entropy": 6.468358039855957, "epoch": 0.38461538461538464, "mean_token_accuracy": 0.751091718673706, "num_tokens": 87391938.0, "step": 1945, "train/ce_loss": 0.9629147052764893 }, { "epoch": 0.38461538461538464, "step": 1945, "train/sim_loss": 0.00016796588897705078 }, { "epoch": 0.38461538461538464, "step": 1945, "train/total_loss": 0.09645944088697433 }, { "entropy": 6.1775312423706055, "epoch": 0.38481313031441566, "mean_token_accuracy": 0.7633135914802551, "num_tokens": 87445719.0, "step": 1946, "train/ce_loss": 9.853770279732998e-06 }, { "epoch": 0.38481313031441566, "step": 1946, "train/sim_loss": 0.0002193450927734375 }, { "epoch": 0.38481313031441566, "step": 1946, "train/total_loss": 0.00022033047571312636 }, { "entropy": 5.998003005981445, "epoch": 0.3850108760134467, "mean_token_accuracy": 0.7326732873916626, "num_tokens": 87481332.0, "step": 1947, "train/ce_loss": 1.7461724281311035 }, { "epoch": 0.3850108760134467, "step": 1947, "train/sim_loss": 0.00021123886108398438 }, { "epoch": 0.3850108760134467, "step": 1947, "train/total_loss": 0.17482848465442657 }, { "entropy": 6.069235801696777, "epoch": 0.3852086217124778, "mean_token_accuracy": 0.7501357793807983, "num_tokens": 87542984.0, "step": 1948, "train/ce_loss": 0.7298363447189331 }, { "epoch": 0.3852086217124778, "step": 1948, "train/sim_loss": 0.00015872716903686523 }, { "epoch": 0.3852086217124778, "step": 1948, "train/total_loss": 0.07314236462116241 }, { "entropy": 6.212136745452881, "epoch": 0.3854063674115088, "mean_token_accuracy": 0.7622237205505371, "num_tokens": 87595392.0, "step": 1949, "train/ce_loss": 0.3615112602710724 }, { "epoch": 0.3854063674115088, "step": 1949, "train/sim_loss": 0.00012755393981933594 }, { "epoch": 0.3854063674115088, "step": 1949, "train/total_loss": 0.036278679966926575 }, { "entropy": 6.4500508308410645, "epoch": 0.3856041131105398, "mean_token_accuracy": 0.713678240776062, "num_tokens": 87642149.0, "step": 1950, "train/ce_loss": 0.6503891348838806 }, { "epoch": 0.3856041131105398, "step": 1950, "train/sim_loss": 0.00025331974029541016 }, { "epoch": 0.3856041131105398, "step": 1950, "train/total_loss": 0.06529223173856735 }, { "entropy": 6.343816757202148, "epoch": 0.3858018588095709, "mean_token_accuracy": 0.7336212992668152, "num_tokens": 87697796.0, "step": 1951, "train/ce_loss": 1.122354507446289 }, { "epoch": 0.3858018588095709, "step": 1951, "train/sim_loss": 0.00014293193817138672 }, { "epoch": 0.3858018588095709, "step": 1951, "train/total_loss": 0.11237838119268417 }, { "entropy": 6.02321720123291, "epoch": 0.38599960450860193, "mean_token_accuracy": 0.7826759219169617, "num_tokens": 87734063.0, "step": 1952, "train/ce_loss": 0.776482880115509 }, { "epoch": 0.38599960450860193, "step": 1952, "train/sim_loss": 0.00019490718841552734 }, { "epoch": 0.38599960450860193, "step": 1952, "train/total_loss": 0.07784319669008255 }, { "entropy": 5.77499532699585, "epoch": 0.38619735020763296, "mean_token_accuracy": 0.7732852101325989, "num_tokens": 87777415.0, "step": 1953, "train/ce_loss": 0.8424825668334961 }, { "epoch": 0.38619735020763296, "step": 1953, "train/sim_loss": 0.00010406970977783203 }, { "epoch": 0.38619735020763296, "step": 1953, "train/total_loss": 0.08435232937335968 }, { "entropy": 6.297221660614014, "epoch": 0.38639509590666404, "mean_token_accuracy": 0.7265122532844543, "num_tokens": 87828122.0, "step": 1954, "train/ce_loss": 1.524063229560852 }, { "epoch": 0.38639509590666404, "step": 1954, "train/sim_loss": 0.00023365020751953125 }, { "epoch": 0.38639509590666404, "step": 1954, "train/total_loss": 0.1526399701833725 }, { "entropy": 6.110097885131836, "epoch": 0.38659284160569507, "mean_token_accuracy": 0.739924430847168, "num_tokens": 87881565.0, "step": 1955, "train/ce_loss": 0.8241598606109619 }, { "epoch": 0.38659284160569507, "step": 1955, "train/sim_loss": 0.0001666545867919922 }, { "epoch": 0.38659284160569507, "step": 1955, "train/total_loss": 0.08258264511823654 }, { "entropy": 6.297881126403809, "epoch": 0.3867905873047261, "mean_token_accuracy": 0.7364506125450134, "num_tokens": 87936328.0, "step": 1956, "train/ce_loss": 1.274543285369873 }, { "epoch": 0.3867905873047261, "step": 1956, "train/sim_loss": 0.0002097487449645996 }, { "epoch": 0.3867905873047261, "step": 1956, "train/total_loss": 0.12766407430171967 }, { "entropy": 6.248499870300293, "epoch": 0.3869883330037572, "mean_token_accuracy": 0.7293998003005981, "num_tokens": 87986180.0, "step": 1957, "train/ce_loss": 1.022087574005127 }, { "epoch": 0.3869883330037572, "step": 1957, "train/sim_loss": 0.00013494491577148438 }, { "epoch": 0.3869883330037572, "step": 1957, "train/total_loss": 0.10234370082616806 }, { "entropy": 6.403501510620117, "epoch": 0.3871860787027882, "mean_token_accuracy": 0.7261462211608887, "num_tokens": 88048892.0, "step": 1958, "train/ce_loss": 0.8112246990203857 }, { "epoch": 0.3871860787027882, "step": 1958, "train/sim_loss": 0.0001710653305053711 }, { "epoch": 0.3871860787027882, "step": 1958, "train/total_loss": 0.08129353821277618 }, { "entropy": 6.0917582511901855, "epoch": 0.3873838244018193, "mean_token_accuracy": 0.7613843083381653, "num_tokens": 88104036.0, "step": 1959, "train/ce_loss": 1.7140192985534668 }, { "epoch": 0.3873838244018193, "step": 1959, "train/sim_loss": 0.00016248226165771484 }, { "epoch": 0.3873838244018193, "step": 1959, "train/total_loss": 0.17156441509723663 }, { "epoch": 0.3875815701008503, "grad_norm": 0.47982779145240784, "learning_rate": 9.03600751805322e-06, "loss": 0.0812, "step": 1960 }, { "entropy": 6.18646240234375, "epoch": 0.3875815701008503, "mean_token_accuracy": 0.7345553040504456, "num_tokens": 88138811.0, "step": 1960, "train/ce_loss": 1.824692964553833 }, { "epoch": 0.3875815701008503, "step": 1960, "train/sim_loss": 0.00023889541625976562 }, { "epoch": 0.3875815701008503, "step": 1960, "train/total_loss": 0.18270818889141083 }, { "entropy": 6.402781009674072, "epoch": 0.38777931579988134, "mean_token_accuracy": 0.7473118305206299, "num_tokens": 88182404.0, "step": 1961, "train/ce_loss": 6.960686732782051e-06 }, { "epoch": 0.38777931579988134, "step": 1961, "train/sim_loss": 0.0002535581588745117 }, { "epoch": 0.38777931579988134, "step": 1961, "train/total_loss": 0.0002542542351875454 }, { "entropy": 6.044205665588379, "epoch": 0.3879770614989124, "mean_token_accuracy": 0.7533227205276489, "num_tokens": 88220709.0, "step": 1962, "train/ce_loss": 0.5493988990783691 }, { "epoch": 0.3879770614989124, "step": 1962, "train/sim_loss": 0.00017499923706054688 }, { "epoch": 0.3879770614989124, "step": 1962, "train/total_loss": 0.05511489138007164 }, { "entropy": 6.084802627563477, "epoch": 0.38817480719794345, "mean_token_accuracy": 0.7562465071678162, "num_tokens": 88276288.0, "step": 1963, "train/ce_loss": 0.7418662309646606 }, { "epoch": 0.38817480719794345, "step": 1963, "train/sim_loss": 0.00022804737091064453 }, { "epoch": 0.38817480719794345, "step": 1963, "train/total_loss": 0.07441467046737671 }, { "entropy": 6.229714870452881, "epoch": 0.3883725528969745, "mean_token_accuracy": 0.7454909682273865, "num_tokens": 88330854.0, "step": 1964, "train/ce_loss": 6.729419510520529e-06 }, { "epoch": 0.3883725528969745, "step": 1964, "train/sim_loss": 0.00017881393432617188 }, { "epoch": 0.3883725528969745, "step": 1964, "train/total_loss": 0.00017948687309399247 }, { "entropy": 6.124083995819092, "epoch": 0.38857029859600556, "mean_token_accuracy": 0.7525525689125061, "num_tokens": 88379347.0, "step": 1965, "train/ce_loss": 0.86416095495224 }, { "epoch": 0.38857029859600556, "step": 1965, "train/sim_loss": 0.00015914440155029297 }, { "epoch": 0.38857029859600556, "step": 1965, "train/total_loss": 0.08657523989677429 }, { "entropy": 6.149815559387207, "epoch": 0.3887680442950366, "mean_token_accuracy": 0.744966447353363, "num_tokens": 88418489.0, "step": 1966, "train/ce_loss": 9.589797627995722e-06 }, { "epoch": 0.3887680442950366, "step": 1966, "train/sim_loss": 0.00010597705841064453 }, { "epoch": 0.3887680442950366, "step": 1966, "train/total_loss": 0.00010693603690015152 }, { "entropy": 5.795426368713379, "epoch": 0.3889657899940676, "mean_token_accuracy": 0.7503075003623962, "num_tokens": 88452255.0, "step": 1967, "train/ce_loss": 0.6485480666160583 }, { "epoch": 0.3889657899940676, "step": 1967, "train/sim_loss": 0.00013899803161621094 }, { "epoch": 0.3889657899940676, "step": 1967, "train/total_loss": 0.06499380618333817 }, { "entropy": 6.425619125366211, "epoch": 0.3891635356930987, "mean_token_accuracy": 0.7255638837814331, "num_tokens": 88503489.0, "step": 1968, "train/ce_loss": 1.5066901445388794 }, { "epoch": 0.3891635356930987, "step": 1968, "train/sim_loss": 0.00038105249404907227 }, { "epoch": 0.3891635356930987, "step": 1968, "train/total_loss": 0.15105007588863373 }, { "entropy": 5.990150451660156, "epoch": 0.3893612813921297, "mean_token_accuracy": 0.7546082735061646, "num_tokens": 88547357.0, "step": 1969, "train/ce_loss": 0.6238672137260437 }, { "epoch": 0.3893612813921297, "step": 1969, "train/sim_loss": 0.0001385211944580078 }, { "epoch": 0.3893612813921297, "step": 1969, "train/total_loss": 0.06252524256706238 }, { "entropy": 6.329071521759033, "epoch": 0.38955902709116075, "mean_token_accuracy": 0.7297677397727966, "num_tokens": 88592531.0, "step": 1970, "train/ce_loss": 1.1106252670288086 }, { "epoch": 0.38955902709116075, "step": 1970, "train/sim_loss": 0.00012314319610595703 }, { "epoch": 0.38955902709116075, "step": 1970, "train/total_loss": 0.11118566989898682 }, { "entropy": 6.059565544128418, "epoch": 0.38975677279019183, "mean_token_accuracy": 0.7216610312461853, "num_tokens": 88644102.0, "step": 1971, "train/ce_loss": 0.4148382246494293 }, { "epoch": 0.38975677279019183, "step": 1971, "train/sim_loss": 0.0001461505889892578 }, { "epoch": 0.38975677279019183, "step": 1971, "train/total_loss": 0.04162997379899025 }, { "entropy": 6.065770626068115, "epoch": 0.38995451848922286, "mean_token_accuracy": 0.7433510422706604, "num_tokens": 88673571.0, "step": 1972, "train/ce_loss": 0.8441992998123169 }, { "epoch": 0.38995451848922286, "step": 1972, "train/sim_loss": 0.00025832653045654297 }, { "epoch": 0.38995451848922286, "step": 1972, "train/total_loss": 0.08467825502157211 }, { "entropy": 6.2567572593688965, "epoch": 0.3901522641882539, "mean_token_accuracy": 0.7231671810150146, "num_tokens": 88706487.0, "step": 1973, "train/ce_loss": 0.9166598916053772 }, { "epoch": 0.3901522641882539, "step": 1973, "train/sim_loss": 0.00020760297775268555 }, { "epoch": 0.3901522641882539, "step": 1973, "train/total_loss": 0.09187359362840652 }, { "entropy": 6.447089672088623, "epoch": 0.39035000988728497, "mean_token_accuracy": 0.7345559597015381, "num_tokens": 88744113.0, "step": 1974, "train/ce_loss": 1.13687002658844 }, { "epoch": 0.39035000988728497, "step": 1974, "train/sim_loss": 0.00014919042587280273 }, { "epoch": 0.39035000988728497, "step": 1974, "train/total_loss": 0.11383619159460068 }, { "entropy": 6.541308403015137, "epoch": 0.390547755586316, "mean_token_accuracy": 0.7667984366416931, "num_tokens": 88785491.0, "step": 1975, "train/ce_loss": 1.2084623575210571 }, { "epoch": 0.390547755586316, "step": 1975, "train/sim_loss": 0.0002219080924987793 }, { "epoch": 0.390547755586316, "step": 1975, "train/total_loss": 0.12106814235448837 }, { "entropy": 6.65529727935791, "epoch": 0.390745501285347, "mean_token_accuracy": 0.7177597880363464, "num_tokens": 88837975.0, "step": 1976, "train/ce_loss": 0.6913167834281921 }, { "epoch": 0.390745501285347, "step": 1976, "train/sim_loss": 0.00017130374908447266 }, { "epoch": 0.390745501285347, "step": 1976, "train/total_loss": 0.0693029835820198 }, { "entropy": 6.332866668701172, "epoch": 0.3909432469843781, "mean_token_accuracy": 0.7379612326622009, "num_tokens": 88878214.0, "step": 1977, "train/ce_loss": 0.6891587376594543 }, { "epoch": 0.3909432469843781, "step": 1977, "train/sim_loss": 0.00011944770812988281 }, { "epoch": 0.3909432469843781, "step": 1977, "train/total_loss": 0.06903532147407532 }, { "entropy": 6.271721839904785, "epoch": 0.3911409926834091, "mean_token_accuracy": 0.7735071778297424, "num_tokens": 88927838.0, "step": 1978, "train/ce_loss": 0.7623811364173889 }, { "epoch": 0.3911409926834091, "step": 1978, "train/sim_loss": 0.00019925832748413086 }, { "epoch": 0.3911409926834091, "step": 1978, "train/total_loss": 0.07643737643957138 }, { "entropy": 6.3429694175720215, "epoch": 0.39133873838244015, "mean_token_accuracy": 0.7527151107788086, "num_tokens": 88987475.0, "step": 1979, "train/ce_loss": 0.8517413139343262 }, { "epoch": 0.39133873838244015, "step": 1979, "train/sim_loss": 0.000244140625 }, { "epoch": 0.39133873838244015, "step": 1979, "train/total_loss": 0.08541827648878098 }, { "epoch": 0.39153648408147124, "grad_norm": 0.4612235724925995, "learning_rate": 9.026115342763875e-06, "loss": 0.0825, "step": 1980 }, { "entropy": 6.16206693649292, "epoch": 0.39153648408147124, "mean_token_accuracy": 0.7810794711112976, "num_tokens": 89024888.0, "step": 1980, "train/ce_loss": 0.6198760271072388 }, { "epoch": 0.39153648408147124, "step": 1980, "train/sim_loss": 0.0002754330635070801 }, { "epoch": 0.39153648408147124, "step": 1980, "train/total_loss": 0.062263038009405136 }, { "entropy": 6.187088489532471, "epoch": 0.39173422978050226, "mean_token_accuracy": 0.7558528184890747, "num_tokens": 89073545.0, "step": 1981, "train/ce_loss": 1.494591236114502 }, { "epoch": 0.39173422978050226, "step": 1981, "train/sim_loss": 0.00016039609909057617 }, { "epoch": 0.39173422978050226, "step": 1981, "train/total_loss": 0.14961951971054077 }, { "entropy": 6.5332159996032715, "epoch": 0.39193197547953335, "mean_token_accuracy": 0.7580772042274475, "num_tokens": 89114014.0, "step": 1982, "train/ce_loss": 8.30612043500878e-06 }, { "epoch": 0.39193197547953335, "step": 1982, "train/sim_loss": 0.00013703107833862305 }, { "epoch": 0.39193197547953335, "step": 1982, "train/total_loss": 0.000137861687107943 }, { "entropy": 5.913962364196777, "epoch": 0.39212972117856437, "mean_token_accuracy": 0.758695662021637, "num_tokens": 89154891.0, "step": 1983, "train/ce_loss": 0.9601818323135376 }, { "epoch": 0.39212972117856437, "step": 1983, "train/sim_loss": 0.00016552209854125977 }, { "epoch": 0.39212972117856437, "step": 1983, "train/total_loss": 0.09618370980024338 }, { "entropy": 6.155887603759766, "epoch": 0.3923274668775954, "mean_token_accuracy": 0.7681592106819153, "num_tokens": 89190165.0, "step": 1984, "train/ce_loss": 1.0633881092071533 }, { "epoch": 0.3923274668775954, "step": 1984, "train/sim_loss": 0.0001970529556274414 }, { "epoch": 0.3923274668775954, "step": 1984, "train/total_loss": 0.10653586685657501 }, { "entropy": 5.797464370727539, "epoch": 0.3925252125766265, "mean_token_accuracy": 0.7194758653640747, "num_tokens": 89234247.0, "step": 1985, "train/ce_loss": 0.7898057699203491 }, { "epoch": 0.3925252125766265, "step": 1985, "train/sim_loss": 0.00013595819473266602 }, { "epoch": 0.3925252125766265, "step": 1985, "train/total_loss": 0.07911653816699982 }, { "entropy": 5.9519243240356445, "epoch": 0.3927229582756575, "mean_token_accuracy": 0.7551487684249878, "num_tokens": 89292998.0, "step": 1986, "train/ce_loss": 0.565156102180481 }, { "epoch": 0.3927229582756575, "step": 1986, "train/sim_loss": 0.00014352798461914062 }, { "epoch": 0.3927229582756575, "step": 1986, "train/total_loss": 0.056659139692783356 }, { "entropy": 6.003956317901611, "epoch": 0.39292070397468853, "mean_token_accuracy": 0.6902238130569458, "num_tokens": 89334442.0, "step": 1987, "train/ce_loss": 1.279573917388916 }, { "epoch": 0.39292070397468853, "step": 1987, "train/sim_loss": 0.0002516508102416992 }, { "epoch": 0.39292070397468853, "step": 1987, "train/total_loss": 0.12820903956890106 }, { "entropy": 5.957432270050049, "epoch": 0.3931184496737196, "mean_token_accuracy": 0.7746478915214539, "num_tokens": 89360452.0, "step": 1988, "train/ce_loss": 1.7213869094848633 }, { "epoch": 0.3931184496737196, "step": 1988, "train/sim_loss": 0.00020903348922729492 }, { "epoch": 0.3931184496737196, "step": 1988, "train/total_loss": 0.17234772443771362 }, { "entropy": 6.102113723754883, "epoch": 0.39331619537275064, "mean_token_accuracy": 0.7493734359741211, "num_tokens": 89410091.0, "step": 1989, "train/ce_loss": 1.0554633140563965 }, { "epoch": 0.39331619537275064, "step": 1989, "train/sim_loss": 0.00018596649169921875 }, { "epoch": 0.39331619537275064, "step": 1989, "train/total_loss": 0.10573229938745499 }, { "entropy": 6.084328651428223, "epoch": 0.39351394107178167, "mean_token_accuracy": 0.7306979894638062, "num_tokens": 89454206.0, "step": 1990, "train/ce_loss": 0.8533467054367065 }, { "epoch": 0.39351394107178167, "step": 1990, "train/sim_loss": 0.00021892786026000977 }, { "epoch": 0.39351394107178167, "step": 1990, "train/total_loss": 0.0855536013841629 }, { "entropy": 6.4037933349609375, "epoch": 0.39371168677081275, "mean_token_accuracy": 0.7511062026023865, "num_tokens": 89499194.0, "step": 1991, "train/ce_loss": 0.433034747838974 }, { "epoch": 0.39371168677081275, "step": 1991, "train/sim_loss": 0.0002180337905883789 }, { "epoch": 0.39371168677081275, "step": 1991, "train/total_loss": 0.04352150857448578 }, { "entropy": 5.813656806945801, "epoch": 0.3939094324698438, "mean_token_accuracy": 0.7396807670593262, "num_tokens": 89548915.0, "step": 1992, "train/ce_loss": 1.1250476837158203 }, { "epoch": 0.3939094324698438, "step": 1992, "train/sim_loss": 0.0001932382583618164 }, { "epoch": 0.3939094324698438, "step": 1992, "train/total_loss": 0.1126980111002922 }, { "entropy": 5.865546226501465, "epoch": 0.3941071781688748, "mean_token_accuracy": 0.7507147192955017, "num_tokens": 89598245.0, "step": 1993, "train/ce_loss": 0.5513539910316467 }, { "epoch": 0.3941071781688748, "step": 1993, "train/sim_loss": 0.00019723176956176758 }, { "epoch": 0.3941071781688748, "step": 1993, "train/total_loss": 0.05533263087272644 }, { "entropy": 6.144070625305176, "epoch": 0.3943049238679059, "mean_token_accuracy": 0.7114551067352295, "num_tokens": 89646532.0, "step": 1994, "train/ce_loss": 0.9539219737052917 }, { "epoch": 0.3943049238679059, "step": 1994, "train/sim_loss": 0.00015556812286376953 }, { "epoch": 0.3943049238679059, "step": 1994, "train/total_loss": 0.09554776549339294 }, { "entropy": 5.859287261962891, "epoch": 0.3945026695669369, "mean_token_accuracy": 0.7569499611854553, "num_tokens": 89679901.0, "step": 1995, "train/ce_loss": 0.8501414656639099 }, { "epoch": 0.3945026695669369, "step": 1995, "train/sim_loss": 0.0002887248992919922 }, { "epoch": 0.3945026695669369, "step": 1995, "train/total_loss": 0.08530287444591522 }, { "entropy": 5.94838809967041, "epoch": 0.39470041526596794, "mean_token_accuracy": 0.7410894632339478, "num_tokens": 89720775.0, "step": 1996, "train/ce_loss": 1.228342056274414 }, { "epoch": 0.39470041526596794, "step": 1996, "train/sim_loss": 0.00013577938079833984 }, { "epoch": 0.39470041526596794, "step": 1996, "train/total_loss": 0.12296998500823975 }, { "entropy": 6.127821922302246, "epoch": 0.394898160964999, "mean_token_accuracy": 0.6999109387397766, "num_tokens": 89774458.0, "step": 1997, "train/ce_loss": 8.915570106182713e-06 }, { "epoch": 0.394898160964999, "step": 1997, "train/sim_loss": 0.00017547607421875 }, { "epoch": 0.394898160964999, "step": 1997, "train/total_loss": 0.00017636762640904635 }, { "entropy": 6.193144798278809, "epoch": 0.39509590666403005, "mean_token_accuracy": 0.7533664107322693, "num_tokens": 89821518.0, "step": 1998, "train/ce_loss": 1.1472810506820679 }, { "epoch": 0.39509590666403005, "step": 1998, "train/sim_loss": 0.00019431114196777344 }, { "epoch": 0.39509590666403005, "step": 1998, "train/total_loss": 0.1149224191904068 }, { "entropy": 5.970088958740234, "epoch": 0.3952936523630611, "mean_token_accuracy": 0.7551990747451782, "num_tokens": 89866856.0, "step": 1999, "train/ce_loss": 0.33696892857551575 }, { "epoch": 0.3952936523630611, "step": 1999, "train/sim_loss": 0.0002478361129760742 }, { "epoch": 0.3952936523630611, "step": 1999, "train/total_loss": 0.03394472971558571 }, { "epoch": 0.39549139806209216, "grad_norm": 0.41220617294311523, "learning_rate": 9.016223167474527e-06, "loss": 0.0808, "step": 2000 } ], "logging_steps": 20, "max_steps": 20228, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.5795557969218765e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }