diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -2,9 +2,9 @@ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, - "epoch": 1.1864741941862764, + "epoch": 1.5819655922483686, "eval_steps": 500, - "global_step": 12000, + "global_step": 16000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -220208,6 +220208,73406 @@ "learning_rate": 7.03580082084755e-06, "loss": 0.0943, "step": 12000 + }, + { + "entropy": 9.30201244354248, + "epoch": 1.1864741941862764, + "mean_token_accuracy": 0.8526466488838196, + "num_tokens": 9544.0, + "step": 12000, + "train/ce_loss": 0.39435845613479614 + }, + { + "epoch": 1.1864741941862764, + "step": 12000, + "train/sim_loss": 0.05044734477996826 + }, + { + "epoch": 1.1864741941862764, + "step": 12000, + "train/total_loss": 0.08988319337368011 + }, + { + "entropy": 8.75871467590332, + "epoch": 1.186573067035792, + "mean_token_accuracy": 0.8403990268707275, + "num_tokens": 20398.0, + "step": 12001, + "train/ce_loss": 0.916489839553833 + }, + { + "epoch": 1.186573067035792, + "step": 12001, + "train/sim_loss": 0.03775280714035034 + }, + { + "epoch": 1.186573067035792, + "step": 12001, + "train/total_loss": 0.1294018030166626 + }, + { + "entropy": 9.408636093139648, + "epoch": 1.1866719398853074, + "mean_token_accuracy": 0.8077436685562134, + "num_tokens": 35568.0, + "step": 12002, + "train/ce_loss": 0.5907993912696838 + }, + { + "epoch": 1.1866719398853074, + "step": 12002, + "train/sim_loss": 0.04150593280792236 + }, + { + "epoch": 1.1866719398853074, + "step": 12002, + "train/total_loss": 0.10058587789535522 + }, + { + "entropy": 9.48786735534668, + "epoch": 1.186770812734823, + "mean_token_accuracy": 0.8110328912734985, + "num_tokens": 47694.0, + "step": 12003, + "train/ce_loss": 0.4138023257255554 + }, + { + "epoch": 1.186770812734823, + "step": 12003, + "train/sim_loss": 0.009717047214508057 + }, + { + "epoch": 1.186770812734823, + "step": 12003, + "train/total_loss": 0.05109728127717972 + }, + { + "entropy": 9.710404396057129, + "epoch": 1.1868696855843386, + "mean_token_accuracy": 0.9132652878761292, + "num_tokens": 63339.0, + "step": 12004, + "train/ce_loss": 0.3520352244377136 + }, + { + "epoch": 1.1868696855843386, + "step": 12004, + "train/sim_loss": 0.04371142387390137 + }, + { + "epoch": 1.1868696855843386, + "step": 12004, + "train/total_loss": 0.07891494780778885 + }, + { + "entropy": 9.743324279785156, + "epoch": 1.186968558433854, + "mean_token_accuracy": 0.8215384483337402, + "num_tokens": 77923.0, + "step": 12005, + "train/ce_loss": 0.5179856419563293 + }, + { + "epoch": 1.186968558433854, + "step": 12005, + "train/sim_loss": 0.06120210886001587 + }, + { + "epoch": 1.186968558433854, + "step": 12005, + "train/total_loss": 0.11300067603588104 + }, + { + "entropy": 8.992076873779297, + "epoch": 1.1870674312833696, + "mean_token_accuracy": 0.80580073595047, + "num_tokens": 84500.0, + "step": 12006, + "train/ce_loss": 0.5422663688659668 + }, + { + "epoch": 1.1870674312833696, + "step": 12006, + "train/sim_loss": 0.06298482418060303 + }, + { + "epoch": 1.1870674312833696, + "step": 12006, + "train/total_loss": 0.11721146106719971 + }, + { + "entropy": 8.836404800415039, + "epoch": 1.187166304132885, + "mean_token_accuracy": 0.821007490158081, + "num_tokens": 92230.0, + "step": 12007, + "train/ce_loss": 0.5428987741470337 + }, + { + "epoch": 1.187166304132885, + "step": 12007, + "train/sim_loss": 0.016510426998138428 + }, + { + "epoch": 1.187166304132885, + "step": 12007, + "train/total_loss": 0.0708003044128418 + }, + { + "entropy": 9.739350318908691, + "epoch": 1.1872651769824007, + "mean_token_accuracy": 0.8442105054855347, + "num_tokens": 107339.0, + "step": 12008, + "train/ce_loss": 1.0259137752655079e-06 + }, + { + "epoch": 1.1872651769824007, + "step": 12008, + "train/sim_loss": 0.08228504657745361 + }, + { + "epoch": 1.1872651769824007, + "step": 12008, + "train/total_loss": 0.08228515088558197 + }, + { + "entropy": 9.461175918579102, + "epoch": 1.1873640498319162, + "mean_token_accuracy": 0.8055235743522644, + "num_tokens": 122074.0, + "step": 12009, + "train/ce_loss": 0.830836296081543 + }, + { + "epoch": 1.1873640498319162, + "step": 12009, + "train/sim_loss": 0.04163050651550293 + }, + { + "epoch": 1.1873640498319162, + "step": 12009, + "train/total_loss": 0.12471413612365723 + }, + { + "entropy": 9.422439575195312, + "epoch": 1.1874629226814317, + "mean_token_accuracy": 0.8019272089004517, + "num_tokens": 134438.0, + "step": 12010, + "train/ce_loss": 0.365692138671875 + }, + { + "epoch": 1.1874629226814317, + "step": 12010, + "train/sim_loss": 0.015139222145080566 + }, + { + "epoch": 1.1874629226814317, + "step": 12010, + "train/total_loss": 0.051708437502384186 + }, + { + "entropy": 9.36741828918457, + "epoch": 1.1875617955309472, + "mean_token_accuracy": 0.8067915439605713, + "num_tokens": 148472.0, + "step": 12011, + "train/ce_loss": 1.1829642057418823 + }, + { + "epoch": 1.1875617955309472, + "step": 12011, + "train/sim_loss": 0.06788480281829834 + }, + { + "epoch": 1.1875617955309472, + "step": 12011, + "train/total_loss": 0.1861812174320221 + }, + { + "entropy": 9.74792194366455, + "epoch": 1.1876606683804627, + "mean_token_accuracy": 0.796226441860199, + "num_tokens": 162349.0, + "step": 12012, + "train/ce_loss": 1.0777757779578678e-06 + }, + { + "epoch": 1.1876606683804627, + "step": 12012, + "train/sim_loss": 0.030178308486938477 + }, + { + "epoch": 1.1876606683804627, + "step": 12012, + "train/total_loss": 0.030178416520357132 + }, + { + "entropy": 9.774438858032227, + "epoch": 1.1877595412299782, + "mean_token_accuracy": 0.9015151262283325, + "num_tokens": 177582.0, + "step": 12013, + "train/ce_loss": 0.6199422478675842 + }, + { + "epoch": 1.1877595412299782, + "step": 12013, + "train/sim_loss": 0.05528092384338379 + }, + { + "epoch": 1.1877595412299782, + "step": 12013, + "train/total_loss": 0.11727514863014221 + }, + { + "entropy": 9.463468551635742, + "epoch": 1.1878584140794937, + "mean_token_accuracy": 0.8229954838752747, + "num_tokens": 184631.0, + "step": 12014, + "train/ce_loss": 0.5613468289375305 + }, + { + "epoch": 1.1878584140794937, + "step": 12014, + "train/sim_loss": 0.03141629695892334 + }, + { + "epoch": 1.1878584140794937, + "step": 12014, + "train/total_loss": 0.08755098283290863 + }, + { + "entropy": 9.382086753845215, + "epoch": 1.1879572869290094, + "mean_token_accuracy": 0.7988826632499695, + "num_tokens": 199753.0, + "step": 12015, + "train/ce_loss": 0.39102935791015625 + }, + { + "epoch": 1.1879572869290094, + "step": 12015, + "train/sim_loss": 0.04786229133605957 + }, + { + "epoch": 1.1879572869290094, + "step": 12015, + "train/total_loss": 0.08696523308753967 + }, + { + "entropy": 9.441227912902832, + "epoch": 1.1880561597785249, + "mean_token_accuracy": 0.8721109628677368, + "num_tokens": 212182.0, + "step": 12016, + "train/ce_loss": 0.22381126880645752 + }, + { + "epoch": 1.1880561597785249, + "step": 12016, + "train/sim_loss": 0.060930728912353516 + }, + { + "epoch": 1.1880561597785249, + "step": 12016, + "train/total_loss": 0.08331185579299927 + }, + { + "entropy": 8.795269966125488, + "epoch": 1.1881550326280403, + "mean_token_accuracy": 0.8785471320152283, + "num_tokens": 219740.0, + "step": 12017, + "train/ce_loss": 0.5001271367073059 + }, + { + "epoch": 1.1881550326280403, + "step": 12017, + "train/sim_loss": 0.014128923416137695 + }, + { + "epoch": 1.1881550326280403, + "step": 12017, + "train/total_loss": 0.0641416385769844 + }, + { + "entropy": 9.494636535644531, + "epoch": 1.1882539054775558, + "mean_token_accuracy": 0.862582802772522, + "num_tokens": 229196.0, + "step": 12018, + "train/ce_loss": 0.555290699005127 + }, + { + "epoch": 1.1882539054775558, + "step": 12018, + "train/sim_loss": 0.11465728282928467 + }, + { + "epoch": 1.1882539054775558, + "step": 12018, + "train/total_loss": 0.1701863557100296 + }, + { + "entropy": 9.587408065795898, + "epoch": 1.1883527783270713, + "mean_token_accuracy": 0.9308176040649414, + "num_tokens": 239392.0, + "step": 12019, + "train/ce_loss": 0.2490909844636917 + }, + { + "epoch": 1.1883527783270713, + "step": 12019, + "train/sim_loss": 0.02233600616455078 + }, + { + "epoch": 1.1883527783270713, + "step": 12019, + "train/total_loss": 0.04724510759115219 + }, + { + "epoch": 1.188451651176587, + "grad_norm": 0.4616815745830536, + "learning_rate": 7.0308559560896015e-06, + "loss": 0.0937, + "step": 12020 + }, + { + "entropy": 9.592860221862793, + "epoch": 1.188451651176587, + "mean_token_accuracy": 0.8927614092826843, + "num_tokens": 251868.0, + "step": 12020, + "train/ce_loss": 0.8992545008659363 + }, + { + "epoch": 1.188451651176587, + "step": 12020, + "train/sim_loss": 0.07834380865097046 + }, + { + "epoch": 1.188451651176587, + "step": 12020, + "train/total_loss": 0.16826926171779633 + }, + { + "entropy": 9.482629776000977, + "epoch": 1.1885505240261025, + "mean_token_accuracy": 0.7986230850219727, + "num_tokens": 259939.0, + "step": 12021, + "train/ce_loss": 0.35070058703422546 + }, + { + "epoch": 1.1885505240261025, + "step": 12021, + "train/sim_loss": 0.02086174488067627 + }, + { + "epoch": 1.1885505240261025, + "step": 12021, + "train/total_loss": 0.055931802839040756 + }, + { + "entropy": 9.550813674926758, + "epoch": 1.188649396875618, + "mean_token_accuracy": 0.8342105150222778, + "num_tokens": 273843.0, + "step": 12022, + "train/ce_loss": 0.5223795175552368 + }, + { + "epoch": 1.188649396875618, + "step": 12022, + "train/sim_loss": 0.04405522346496582 + }, + { + "epoch": 1.188649396875618, + "step": 12022, + "train/total_loss": 0.09629318118095398 + }, + { + "entropy": 10.006155967712402, + "epoch": 1.1887482697251335, + "mean_token_accuracy": 0.8641975522041321, + "num_tokens": 281547.0, + "step": 12023, + "train/ce_loss": 1.0146080255508423 + }, + { + "epoch": 1.1887482697251335, + "step": 12023, + "train/sim_loss": 0.055779337882995605 + }, + { + "epoch": 1.1887482697251335, + "step": 12023, + "train/total_loss": 0.1572401523590088 + }, + { + "entropy": 9.169214248657227, + "epoch": 1.188847142574649, + "mean_token_accuracy": 0.8339060544967651, + "num_tokens": 292241.0, + "step": 12024, + "train/ce_loss": 0.4551922678947449 + }, + { + "epoch": 1.188847142574649, + "step": 12024, + "train/sim_loss": 0.05973595380783081 + }, + { + "epoch": 1.188847142574649, + "step": 12024, + "train/total_loss": 0.10525518655776978 + }, + { + "entropy": 9.212973594665527, + "epoch": 1.1889460154241644, + "mean_token_accuracy": 0.8286802172660828, + "num_tokens": 304355.0, + "step": 12025, + "train/ce_loss": 0.6572568416595459 + }, + { + "epoch": 1.1889460154241644, + "step": 12025, + "train/sim_loss": 0.03473907709121704 + }, + { + "epoch": 1.1889460154241644, + "step": 12025, + "train/total_loss": 0.10046476125717163 + }, + { + "entropy": 8.730758666992188, + "epoch": 1.18904488827368, + "mean_token_accuracy": 0.8013902902603149, + "num_tokens": 316620.0, + "step": 12026, + "train/ce_loss": 0.5829775333404541 + }, + { + "epoch": 1.18904488827368, + "step": 12026, + "train/sim_loss": 0.012981176376342773 + }, + { + "epoch": 1.18904488827368, + "step": 12026, + "train/total_loss": 0.07127892971038818 + }, + { + "entropy": 9.724416732788086, + "epoch": 1.1891437611231956, + "mean_token_accuracy": 0.8766520023345947, + "num_tokens": 330396.0, + "step": 12027, + "train/ce_loss": 1.103801965713501 + }, + { + "epoch": 1.1891437611231956, + "step": 12027, + "train/sim_loss": 0.06405043601989746 + }, + { + "epoch": 1.1891437611231956, + "step": 12027, + "train/total_loss": 0.17443063855171204 + }, + { + "entropy": 9.27188491821289, + "epoch": 1.1892426339727111, + "mean_token_accuracy": 0.8417639136314392, + "num_tokens": 347392.0, + "step": 12028, + "train/ce_loss": 0.2766914665699005 + }, + { + "epoch": 1.1892426339727111, + "step": 12028, + "train/sim_loss": 0.04362964630126953 + }, + { + "epoch": 1.1892426339727111, + "step": 12028, + "train/total_loss": 0.07129879295825958 + }, + { + "entropy": 9.513599395751953, + "epoch": 1.1893415068222266, + "mean_token_accuracy": 0.8352553248405457, + "num_tokens": 361879.0, + "step": 12029, + "train/ce_loss": 0.5797502994537354 + }, + { + "epoch": 1.1893415068222266, + "step": 12029, + "train/sim_loss": 0.03149235248565674 + }, + { + "epoch": 1.1893415068222266, + "step": 12029, + "train/total_loss": 0.08946738392114639 + }, + { + "entropy": 9.180732727050781, + "epoch": 1.189440379671742, + "mean_token_accuracy": 0.791208803653717, + "num_tokens": 372395.0, + "step": 12030, + "train/ce_loss": 0.4821467399597168 + }, + { + "epoch": 1.189440379671742, + "step": 12030, + "train/sim_loss": 0.0370105504989624 + }, + { + "epoch": 1.189440379671742, + "step": 12030, + "train/total_loss": 0.08522522449493408 + }, + { + "entropy": 9.51494026184082, + "epoch": 1.1895392525212576, + "mean_token_accuracy": 0.8430420756340027, + "num_tokens": 381952.0, + "step": 12031, + "train/ce_loss": 0.6470370292663574 + }, + { + "epoch": 1.1895392525212576, + "step": 12031, + "train/sim_loss": 0.08071708679199219 + }, + { + "epoch": 1.1895392525212576, + "step": 12031, + "train/total_loss": 0.14542078971862793 + }, + { + "entropy": 9.642601013183594, + "epoch": 1.1896381253707733, + "mean_token_accuracy": 0.8832487463951111, + "num_tokens": 393600.0, + "step": 12032, + "train/ce_loss": 1.6546489405300235e-06 + }, + { + "epoch": 1.1896381253707733, + "step": 12032, + "train/sim_loss": 0.06389486789703369 + }, + { + "epoch": 1.1896381253707733, + "step": 12032, + "train/total_loss": 0.06389503180980682 + }, + { + "entropy": 9.495450019836426, + "epoch": 1.1897369982202888, + "mean_token_accuracy": 0.8302945494651794, + "num_tokens": 402543.0, + "step": 12033, + "train/ce_loss": 0.5930079221725464 + }, + { + "epoch": 1.1897369982202888, + "step": 12033, + "train/sim_loss": 0.014825582504272461 + }, + { + "epoch": 1.1897369982202888, + "step": 12033, + "train/total_loss": 0.07412637770175934 + }, + { + "entropy": 9.219415664672852, + "epoch": 1.1898358710698043, + "mean_token_accuracy": 0.7366771101951599, + "num_tokens": 413360.0, + "step": 12034, + "train/ce_loss": 0.4417794942855835 + }, + { + "epoch": 1.1898358710698043, + "step": 12034, + "train/sim_loss": 0.03398865461349487 + }, + { + "epoch": 1.1898358710698043, + "step": 12034, + "train/total_loss": 0.07816660404205322 + }, + { + "entropy": 8.824728012084961, + "epoch": 1.1899347439193197, + "mean_token_accuracy": 0.790554404258728, + "num_tokens": 424889.0, + "step": 12035, + "train/ce_loss": 0.19741539657115936 + }, + { + "epoch": 1.1899347439193197, + "step": 12035, + "train/sim_loss": 0.06809580326080322 + }, + { + "epoch": 1.1899347439193197, + "step": 12035, + "train/total_loss": 0.0878373458981514 + }, + { + "entropy": 9.937858581542969, + "epoch": 1.1900336167688352, + "mean_token_accuracy": 0.8237623572349548, + "num_tokens": 432684.0, + "step": 12036, + "train/ce_loss": 4.4284007572059636e-07 + }, + { + "epoch": 1.1900336167688352, + "step": 12036, + "train/sim_loss": 0.01019984483718872 + }, + { + "epoch": 1.1900336167688352, + "step": 12036, + "train/total_loss": 0.010199889540672302 + }, + { + "entropy": 9.609661102294922, + "epoch": 1.1901324896183507, + "mean_token_accuracy": 0.800744891166687, + "num_tokens": 440813.0, + "step": 12037, + "train/ce_loss": 0.9094429016113281 + }, + { + "epoch": 1.1901324896183507, + "step": 12037, + "train/sim_loss": 0.04582613706588745 + }, + { + "epoch": 1.1901324896183507, + "step": 12037, + "train/total_loss": 0.13677042722702026 + }, + { + "entropy": 9.575846672058105, + "epoch": 1.1902313624678662, + "mean_token_accuracy": 0.9330708384513855, + "num_tokens": 450384.0, + "step": 12038, + "train/ce_loss": 1.064387333826744e-06 + }, + { + "epoch": 1.1902313624678662, + "step": 12038, + "train/sim_loss": 0.03794825077056885 + }, + { + "epoch": 1.1902313624678662, + "step": 12038, + "train/total_loss": 0.0379483588039875 + }, + { + "entropy": 9.461681365966797, + "epoch": 1.190330235317382, + "mean_token_accuracy": 0.7713921070098877, + "num_tokens": 462933.0, + "step": 12039, + "train/ce_loss": 0.8886452913284302 + }, + { + "epoch": 1.190330235317382, + "step": 12039, + "train/sim_loss": 0.060841381549835205 + }, + { + "epoch": 1.190330235317382, + "step": 12039, + "train/total_loss": 0.1497059166431427 + }, + { + "epoch": 1.1904291081668974, + "grad_norm": 0.7158655524253845, + "learning_rate": 7.025911091331653e-06, + "loss": 0.0975, + "step": 12040 + }, + { + "entropy": 9.491741180419922, + "epoch": 1.1904291081668974, + "mean_token_accuracy": 0.7917261123657227, + "num_tokens": 475304.0, + "step": 12040, + "train/ce_loss": 0.4612691402435303 + }, + { + "epoch": 1.1904291081668974, + "step": 12040, + "train/sim_loss": 0.04487931728363037 + }, + { + "epoch": 1.1904291081668974, + "step": 12040, + "train/total_loss": 0.09100623428821564 + }, + { + "entropy": 10.048495292663574, + "epoch": 1.1905279810164129, + "mean_token_accuracy": 0.8459459543228149, + "num_tokens": 487361.0, + "step": 12041, + "train/ce_loss": 0.8365214467048645 + }, + { + "epoch": 1.1905279810164129, + "step": 12041, + "train/sim_loss": 0.09021323919296265 + }, + { + "epoch": 1.1905279810164129, + "step": 12041, + "train/total_loss": 0.17386537790298462 + }, + { + "entropy": 8.998584747314453, + "epoch": 1.1906268538659284, + "mean_token_accuracy": 0.8698030710220337, + "num_tokens": 500767.0, + "step": 12042, + "train/ce_loss": 0.501583993434906 + }, + { + "epoch": 1.1906268538659284, + "step": 12042, + "train/sim_loss": 0.028455376625061035 + }, + { + "epoch": 1.1906268538659284, + "step": 12042, + "train/total_loss": 0.0786137729883194 + }, + { + "entropy": 9.247148513793945, + "epoch": 1.1907257267154439, + "mean_token_accuracy": 0.8541952967643738, + "num_tokens": 509765.0, + "step": 12043, + "train/ce_loss": 0.45714202523231506 + }, + { + "epoch": 1.1907257267154439, + "step": 12043, + "train/sim_loss": 0.049756407737731934 + }, + { + "epoch": 1.1907257267154439, + "step": 12043, + "train/total_loss": 0.0954706072807312 + }, + { + "entropy": 9.959579467773438, + "epoch": 1.1908245995649596, + "mean_token_accuracy": 0.8979057669639587, + "num_tokens": 521193.0, + "step": 12044, + "train/ce_loss": 0.7483283877372742 + }, + { + "epoch": 1.1908245995649596, + "step": 12044, + "train/sim_loss": 0.05144083499908447 + }, + { + "epoch": 1.1908245995649596, + "step": 12044, + "train/total_loss": 0.12627367675304413 + }, + { + "entropy": 9.292121887207031, + "epoch": 1.190923472414475, + "mean_token_accuracy": 0.8470066785812378, + "num_tokens": 534207.0, + "step": 12045, + "train/ce_loss": 0.34064149856567383 + }, + { + "epoch": 1.190923472414475, + "step": 12045, + "train/sim_loss": 0.020230770111083984 + }, + { + "epoch": 1.190923472414475, + "step": 12045, + "train/total_loss": 0.05429492145776749 + }, + { + "entropy": 9.328018188476562, + "epoch": 1.1910223452639905, + "mean_token_accuracy": 0.809135377407074, + "num_tokens": 542109.0, + "step": 12046, + "train/ce_loss": 0.9192597270011902 + }, + { + "epoch": 1.1910223452639905, + "step": 12046, + "train/sim_loss": 0.187871515750885 + }, + { + "epoch": 1.1910223452639905, + "step": 12046, + "train/total_loss": 0.2797974944114685 + }, + { + "entropy": 9.292183876037598, + "epoch": 1.191121218113506, + "mean_token_accuracy": 0.9157894849777222, + "num_tokens": 559294.0, + "step": 12047, + "train/ce_loss": 0.18550372123718262 + }, + { + "epoch": 1.191121218113506, + "step": 12047, + "train/sim_loss": 0.01846921443939209 + }, + { + "epoch": 1.191121218113506, + "step": 12047, + "train/total_loss": 0.03701958805322647 + }, + { + "entropy": 9.027030944824219, + "epoch": 1.1912200909630215, + "mean_token_accuracy": 0.7867246866226196, + "num_tokens": 571706.0, + "step": 12048, + "train/ce_loss": 0.6393879652023315 + }, + { + "epoch": 1.1912200909630215, + "step": 12048, + "train/sim_loss": 0.03819429874420166 + }, + { + "epoch": 1.1912200909630215, + "step": 12048, + "train/total_loss": 0.10213309526443481 + }, + { + "entropy": 9.575033187866211, + "epoch": 1.191318963812537, + "mean_token_accuracy": 0.8444676399230957, + "num_tokens": 589031.0, + "step": 12049, + "train/ce_loss": 0.9840396046638489 + }, + { + "epoch": 1.191318963812537, + "step": 12049, + "train/sim_loss": 0.07640314102172852 + }, + { + "epoch": 1.191318963812537, + "step": 12049, + "train/total_loss": 0.1748071014881134 + }, + { + "entropy": 9.486078262329102, + "epoch": 1.1914178366620525, + "mean_token_accuracy": 0.874828040599823, + "num_tokens": 601382.0, + "step": 12050, + "train/ce_loss": 0.3907480835914612 + }, + { + "epoch": 1.1914178366620525, + "step": 12050, + "train/sim_loss": 0.03967946767807007 + }, + { + "epoch": 1.1914178366620525, + "step": 12050, + "train/total_loss": 0.07875427603721619 + }, + { + "entropy": 9.289798736572266, + "epoch": 1.1915167095115682, + "mean_token_accuracy": 0.8670588135719299, + "num_tokens": 613850.0, + "step": 12051, + "train/ce_loss": 0.33720317482948303 + }, + { + "epoch": 1.1915167095115682, + "step": 12051, + "train/sim_loss": 0.07178628444671631 + }, + { + "epoch": 1.1915167095115682, + "step": 12051, + "train/total_loss": 0.10550659894943237 + }, + { + "entropy": 9.744569778442383, + "epoch": 1.1916155823610837, + "mean_token_accuracy": 0.8702290058135986, + "num_tokens": 628234.0, + "step": 12052, + "train/ce_loss": 0.4665185809135437 + }, + { + "epoch": 1.1916155823610837, + "step": 12052, + "train/sim_loss": 0.03358179330825806 + }, + { + "epoch": 1.1916155823610837, + "step": 12052, + "train/total_loss": 0.08023364841938019 + }, + { + "entropy": 9.390167236328125, + "epoch": 1.1917144552105992, + "mean_token_accuracy": 0.8349650502204895, + "num_tokens": 644535.0, + "step": 12053, + "train/ce_loss": 0.4468449056148529 + }, + { + "epoch": 1.1917144552105992, + "step": 12053, + "train/sim_loss": 0.04113757610321045 + }, + { + "epoch": 1.1917144552105992, + "step": 12053, + "train/total_loss": 0.08582206815481186 + }, + { + "entropy": 8.975379943847656, + "epoch": 1.1918133280601146, + "mean_token_accuracy": 0.8037974834442139, + "num_tokens": 664143.0, + "step": 12054, + "train/ce_loss": 0.7025911211967468 + }, + { + "epoch": 1.1918133280601146, + "step": 12054, + "train/sim_loss": 0.060774803161621094 + }, + { + "epoch": 1.1918133280601146, + "step": 12054, + "train/total_loss": 0.13103392720222473 + }, + { + "entropy": 9.233539581298828, + "epoch": 1.1919122009096301, + "mean_token_accuracy": 0.8031980395317078, + "num_tokens": 673301.0, + "step": 12055, + "train/ce_loss": 0.7352741360664368 + }, + { + "epoch": 1.1919122009096301, + "step": 12055, + "train/sim_loss": 0.05402684211730957 + }, + { + "epoch": 1.1919122009096301, + "step": 12055, + "train/total_loss": 0.1275542676448822 + }, + { + "entropy": 9.532390594482422, + "epoch": 1.1920110737591458, + "mean_token_accuracy": 0.7444314360618591, + "num_tokens": 681387.0, + "step": 12056, + "train/ce_loss": 0.33589479327201843 + }, + { + "epoch": 1.1920110737591458, + "step": 12056, + "train/sim_loss": 0.07970273494720459 + }, + { + "epoch": 1.1920110737591458, + "step": 12056, + "train/total_loss": 0.11329221725463867 + }, + { + "entropy": 8.868392944335938, + "epoch": 1.1921099466086613, + "mean_token_accuracy": 0.8555825352668762, + "num_tokens": 690958.0, + "step": 12057, + "train/ce_loss": 0.4572591781616211 + }, + { + "epoch": 1.1921099466086613, + "step": 12057, + "train/sim_loss": 0.01708054542541504 + }, + { + "epoch": 1.1921099466086613, + "step": 12057, + "train/total_loss": 0.06280646473169327 + }, + { + "entropy": 9.008177757263184, + "epoch": 1.1922088194581768, + "mean_token_accuracy": 0.7865731716156006, + "num_tokens": 699328.0, + "step": 12058, + "train/ce_loss": 0.6299508213996887 + }, + { + "epoch": 1.1922088194581768, + "step": 12058, + "train/sim_loss": 0.05885803699493408 + }, + { + "epoch": 1.1922088194581768, + "step": 12058, + "train/total_loss": 0.12185312062501907 + }, + { + "entropy": 9.41628646850586, + "epoch": 1.1923076923076923, + "mean_token_accuracy": 0.8197832107543945, + "num_tokens": 716188.0, + "step": 12059, + "train/ce_loss": 1.7468699979872326e-06 + }, + { + "epoch": 1.1923076923076923, + "step": 12059, + "train/sim_loss": 0.05688881874084473 + }, + { + "epoch": 1.1923076923076923, + "step": 12059, + "train/total_loss": 0.056888993829488754 + }, + { + "epoch": 1.1924065651572078, + "grad_norm": 0.6295737624168396, + "learning_rate": 7.020966226573703e-06, + "loss": 0.0971, + "step": 12060 + }, + { + "entropy": 9.262917518615723, + "epoch": 1.1924065651572078, + "mean_token_accuracy": 0.8351955413818359, + "num_tokens": 728622.0, + "step": 12060, + "train/ce_loss": 0.5943187475204468 + }, + { + "epoch": 1.1924065651572078, + "step": 12060, + "train/sim_loss": 0.06626951694488525 + }, + { + "epoch": 1.1924065651572078, + "step": 12060, + "train/total_loss": 0.1257013976573944 + }, + { + "entropy": 8.861442565917969, + "epoch": 1.1925054380067233, + "mean_token_accuracy": 0.8307475447654724, + "num_tokens": 739543.0, + "step": 12061, + "train/ce_loss": 0.6109131574630737 + }, + { + "epoch": 1.1925054380067233, + "step": 12061, + "train/sim_loss": 0.01046133041381836 + }, + { + "epoch": 1.1925054380067233, + "step": 12061, + "train/total_loss": 0.07155264914035797 + }, + { + "entropy": 9.406874656677246, + "epoch": 1.192604310856239, + "mean_token_accuracy": 0.8773006200790405, + "num_tokens": 754765.0, + "step": 12062, + "train/ce_loss": 0.25997987389564514 + }, + { + "epoch": 1.192604310856239, + "step": 12062, + "train/sim_loss": 0.04769432544708252 + }, + { + "epoch": 1.192604310856239, + "step": 12062, + "train/total_loss": 0.07369231432676315 + }, + { + "entropy": 9.336954116821289, + "epoch": 1.1927031837057545, + "mean_token_accuracy": 0.8637565970420837, + "num_tokens": 768215.0, + "step": 12063, + "train/ce_loss": 0.33020737767219543 + }, + { + "epoch": 1.1927031837057545, + "step": 12063, + "train/sim_loss": 0.023285269737243652 + }, + { + "epoch": 1.1927031837057545, + "step": 12063, + "train/total_loss": 0.056306008249521255 + }, + { + "entropy": 9.2068510055542, + "epoch": 1.19280205655527, + "mean_token_accuracy": 0.8610389828681946, + "num_tokens": 776841.0, + "step": 12064, + "train/ce_loss": 0.22270946204662323 + }, + { + "epoch": 1.19280205655527, + "step": 12064, + "train/sim_loss": 0.041854143142700195 + }, + { + "epoch": 1.19280205655527, + "step": 12064, + "train/total_loss": 0.06412509083747864 + }, + { + "entropy": 9.4176664352417, + "epoch": 1.1929009294047854, + "mean_token_accuracy": 0.8270777463912964, + "num_tokens": 786904.0, + "step": 12065, + "train/ce_loss": 0.3809674084186554 + }, + { + "epoch": 1.1929009294047854, + "step": 12065, + "train/sim_loss": 0.03417205810546875 + }, + { + "epoch": 1.1929009294047854, + "step": 12065, + "train/total_loss": 0.07226879894733429 + }, + { + "entropy": 9.278510093688965, + "epoch": 1.192999802254301, + "mean_token_accuracy": 0.8178331851959229, + "num_tokens": 800074.0, + "step": 12066, + "train/ce_loss": 0.22525423765182495 + }, + { + "epoch": 1.192999802254301, + "step": 12066, + "train/sim_loss": 0.04959690570831299 + }, + { + "epoch": 1.192999802254301, + "step": 12066, + "train/total_loss": 0.07212232798337936 + }, + { + "entropy": 8.892887115478516, + "epoch": 1.1930986751038164, + "mean_token_accuracy": 0.84555983543396, + "num_tokens": 812406.0, + "step": 12067, + "train/ce_loss": 0.38810858130455017 + }, + { + "epoch": 1.1930986751038164, + "step": 12067, + "train/sim_loss": 0.04263496398925781 + }, + { + "epoch": 1.1930986751038164, + "step": 12067, + "train/total_loss": 0.08144582808017731 + }, + { + "entropy": 9.399478912353516, + "epoch": 1.193197547953332, + "mean_token_accuracy": 0.8847150206565857, + "num_tokens": 821927.0, + "step": 12068, + "train/ce_loss": 0.5344647765159607 + }, + { + "epoch": 1.193197547953332, + "step": 12068, + "train/sim_loss": 0.06310045719146729 + }, + { + "epoch": 1.193197547953332, + "step": 12068, + "train/total_loss": 0.11654693633317947 + }, + { + "entropy": 9.344433784484863, + "epoch": 1.1932964208028476, + "mean_token_accuracy": 0.8644859790802002, + "num_tokens": 830320.0, + "step": 12069, + "train/ce_loss": 0.6354048848152161 + }, + { + "epoch": 1.1932964208028476, + "step": 12069, + "train/sim_loss": 0.027473092079162598 + }, + { + "epoch": 1.1932964208028476, + "step": 12069, + "train/total_loss": 0.0910135805606842 + }, + { + "entropy": 8.958080291748047, + "epoch": 1.193395293652363, + "mean_token_accuracy": 0.8669725060462952, + "num_tokens": 843418.0, + "step": 12070, + "train/ce_loss": 0.34080150723457336 + }, + { + "epoch": 1.193395293652363, + "step": 12070, + "train/sim_loss": 0.015337944030761719 + }, + { + "epoch": 1.193395293652363, + "step": 12070, + "train/total_loss": 0.049418095499277115 + }, + { + "entropy": 9.152567863464355, + "epoch": 1.1934941665018786, + "mean_token_accuracy": 0.8374485373497009, + "num_tokens": 856534.0, + "step": 12071, + "train/ce_loss": 0.7473486661911011 + }, + { + "epoch": 1.1934941665018786, + "step": 12071, + "train/sim_loss": 0.024274826049804688 + }, + { + "epoch": 1.1934941665018786, + "step": 12071, + "train/total_loss": 0.0990096926689148 + }, + { + "entropy": 9.59975814819336, + "epoch": 1.193593039351394, + "mean_token_accuracy": 0.8587127327919006, + "num_tokens": 869298.0, + "step": 12072, + "train/ce_loss": 0.568401038646698 + }, + { + "epoch": 1.193593039351394, + "step": 12072, + "train/sim_loss": 0.022824108600616455 + }, + { + "epoch": 1.193593039351394, + "step": 12072, + "train/total_loss": 0.0796642154455185 + }, + { + "entropy": 9.749897956848145, + "epoch": 1.1936919122009098, + "mean_token_accuracy": 0.7792068719863892, + "num_tokens": 880516.0, + "step": 12073, + "train/ce_loss": 0.9955918192863464 + }, + { + "epoch": 1.1936919122009098, + "step": 12073, + "train/sim_loss": 0.06358104944229126 + }, + { + "epoch": 1.1936919122009098, + "step": 12073, + "train/total_loss": 0.16314023733139038 + }, + { + "entropy": 9.397902488708496, + "epoch": 1.1937907850504252, + "mean_token_accuracy": 0.9073171019554138, + "num_tokens": 889334.0, + "step": 12074, + "train/ce_loss": 0.3924555480480194 + }, + { + "epoch": 1.1937907850504252, + "step": 12074, + "train/sim_loss": 0.09632503986358643 + }, + { + "epoch": 1.1937907850504252, + "step": 12074, + "train/total_loss": 0.13557060062885284 + }, + { + "entropy": 9.164909362792969, + "epoch": 1.1938896578999407, + "mean_token_accuracy": 0.8458150029182434, + "num_tokens": 904676.0, + "step": 12075, + "train/ce_loss": 0.5299190878868103 + }, + { + "epoch": 1.1938896578999407, + "step": 12075, + "train/sim_loss": 0.0263519287109375 + }, + { + "epoch": 1.1938896578999407, + "step": 12075, + "train/total_loss": 0.07934384047985077 + }, + { + "entropy": 8.938480377197266, + "epoch": 1.1939885307494562, + "mean_token_accuracy": 0.8671140670776367, + "num_tokens": 913658.0, + "step": 12076, + "train/ce_loss": 0.4312075674533844 + }, + { + "epoch": 1.1939885307494562, + "step": 12076, + "train/sim_loss": 0.041255950927734375 + }, + { + "epoch": 1.1939885307494562, + "step": 12076, + "train/total_loss": 0.08437670767307281 + }, + { + "entropy": 8.97673225402832, + "epoch": 1.1940874035989717, + "mean_token_accuracy": 0.852300226688385, + "num_tokens": 923384.0, + "step": 12077, + "train/ce_loss": 0.44107967615127563 + }, + { + "epoch": 1.1940874035989717, + "step": 12077, + "train/sim_loss": 0.0886850357055664 + }, + { + "epoch": 1.1940874035989717, + "step": 12077, + "train/total_loss": 0.13279300928115845 + }, + { + "entropy": 9.616897583007812, + "epoch": 1.1941862764484872, + "mean_token_accuracy": 0.8265971541404724, + "num_tokens": 941203.0, + "step": 12078, + "train/ce_loss": 0.5399758219718933 + }, + { + "epoch": 1.1941862764484872, + "step": 12078, + "train/sim_loss": 0.0775301456451416 + }, + { + "epoch": 1.1941862764484872, + "step": 12078, + "train/total_loss": 0.13152772188186646 + }, + { + "entropy": 9.82278060913086, + "epoch": 1.1942851492980027, + "mean_token_accuracy": 0.9047619104385376, + "num_tokens": 956698.0, + "step": 12079, + "train/ce_loss": 7.482650516976719e-07 + }, + { + "epoch": 1.1942851492980027, + "step": 12079, + "train/sim_loss": 0.03780418634414673 + }, + { + "epoch": 1.1942851492980027, + "step": 12079, + "train/total_loss": 0.0378042608499527 + }, + { + "epoch": 1.1943840221475184, + "grad_norm": 0.5720798373222351, + "learning_rate": 7.016021361815755e-06, + "loss": 0.0861, + "step": 12080 + }, + { + "entropy": 9.370224952697754, + "epoch": 1.1943840221475184, + "mean_token_accuracy": 0.8197183012962341, + "num_tokens": 970540.0, + "step": 12080, + "train/ce_loss": 0.6688541769981384 + }, + { + "epoch": 1.1943840221475184, + "step": 12080, + "train/sim_loss": 0.10815274715423584 + }, + { + "epoch": 1.1943840221475184, + "step": 12080, + "train/total_loss": 0.1750381588935852 + }, + { + "entropy": 9.51002025604248, + "epoch": 1.1944828949970339, + "mean_token_accuracy": 0.8737300634384155, + "num_tokens": 982706.0, + "step": 12081, + "train/ce_loss": 0.36985576152801514 + }, + { + "epoch": 1.1944828949970339, + "step": 12081, + "train/sim_loss": 0.026745319366455078 + }, + { + "epoch": 1.1944828949970339, + "step": 12081, + "train/total_loss": 0.06373089551925659 + }, + { + "entropy": 9.330716133117676, + "epoch": 1.1945817678465493, + "mean_token_accuracy": 0.81186443567276, + "num_tokens": 999174.0, + "step": 12082, + "train/ce_loss": 7.928131822154683e-07 + }, + { + "epoch": 1.1945817678465493, + "step": 12082, + "train/sim_loss": 0.025990188121795654 + }, + { + "epoch": 1.1945817678465493, + "step": 12082, + "train/total_loss": 0.02599026821553707 + }, + { + "entropy": 9.868609428405762, + "epoch": 1.1946806406960648, + "mean_token_accuracy": 0.8585858345031738, + "num_tokens": 1011778.0, + "step": 12083, + "train/ce_loss": 6.959786560400971e-07 + }, + { + "epoch": 1.1946806406960648, + "step": 12083, + "train/sim_loss": 0.013536572456359863 + }, + { + "epoch": 1.1946806406960648, + "step": 12083, + "train/total_loss": 0.01353664230555296 + }, + { + "entropy": 9.247125625610352, + "epoch": 1.1947795135455803, + "mean_token_accuracy": 0.8719052672386169, + "num_tokens": 1024947.0, + "step": 12084, + "train/ce_loss": 0.19491904973983765 + }, + { + "epoch": 1.1947795135455803, + "step": 12084, + "train/sim_loss": 0.026473581790924072 + }, + { + "epoch": 1.1947795135455803, + "step": 12084, + "train/total_loss": 0.04596548527479172 + }, + { + "entropy": 9.40192699432373, + "epoch": 1.194878386395096, + "mean_token_accuracy": 0.8672769069671631, + "num_tokens": 1036653.0, + "step": 12085, + "train/ce_loss": 0.6764732599258423 + }, + { + "epoch": 1.194878386395096, + "step": 12085, + "train/sim_loss": 0.03803229331970215 + }, + { + "epoch": 1.194878386395096, + "step": 12085, + "train/total_loss": 0.10567962378263474 + }, + { + "entropy": 9.466707229614258, + "epoch": 1.1949772592446115, + "mean_token_accuracy": 0.850602388381958, + "num_tokens": 1051918.0, + "step": 12086, + "train/ce_loss": 0.17141534388065338 + }, + { + "epoch": 1.1949772592446115, + "step": 12086, + "train/sim_loss": 0.019203543663024902 + }, + { + "epoch": 1.1949772592446115, + "step": 12086, + "train/total_loss": 0.03634507954120636 + }, + { + "entropy": 9.795360565185547, + "epoch": 1.195076132094127, + "mean_token_accuracy": 0.891986072063446, + "num_tokens": 1065894.0, + "step": 12087, + "train/ce_loss": 0.5359116196632385 + }, + { + "epoch": 1.195076132094127, + "step": 12087, + "train/sim_loss": 0.03264331817626953 + }, + { + "epoch": 1.195076132094127, + "step": 12087, + "train/total_loss": 0.08623448014259338 + }, + { + "entropy": 9.288105010986328, + "epoch": 1.1951750049436425, + "mean_token_accuracy": 0.8983268737792969, + "num_tokens": 1078273.0, + "step": 12088, + "train/ce_loss": 0.6029052138328552 + }, + { + "epoch": 1.1951750049436425, + "step": 12088, + "train/sim_loss": 0.020281672477722168 + }, + { + "epoch": 1.1951750049436425, + "step": 12088, + "train/total_loss": 0.08057219535112381 + }, + { + "entropy": 9.573914527893066, + "epoch": 1.195273877793158, + "mean_token_accuracy": 0.8405545949935913, + "num_tokens": 1088727.0, + "step": 12089, + "train/ce_loss": 0.6595571637153625 + }, + { + "epoch": 1.195273877793158, + "step": 12089, + "train/sim_loss": 0.029127538204193115 + }, + { + "epoch": 1.195273877793158, + "step": 12089, + "train/total_loss": 0.09508325904607773 + }, + { + "entropy": 9.271018028259277, + "epoch": 1.1953727506426735, + "mean_token_accuracy": 0.8300561904907227, + "num_tokens": 1097829.0, + "step": 12090, + "train/ce_loss": 0.560325026512146 + }, + { + "epoch": 1.1953727506426735, + "step": 12090, + "train/sim_loss": 0.057840943336486816 + }, + { + "epoch": 1.1953727506426735, + "step": 12090, + "train/total_loss": 0.1138734519481659 + }, + { + "entropy": 9.63856029510498, + "epoch": 1.195471623492189, + "mean_token_accuracy": 0.9023622274398804, + "num_tokens": 1115860.0, + "step": 12091, + "train/ce_loss": 5.233837896412297e-07 + }, + { + "epoch": 1.195471623492189, + "step": 12091, + "train/sim_loss": 0.024469733238220215 + }, + { + "epoch": 1.195471623492189, + "step": 12091, + "train/total_loss": 0.024469785392284393 + }, + { + "entropy": 9.479328155517578, + "epoch": 1.1955704963417046, + "mean_token_accuracy": 0.8435754179954529, + "num_tokens": 1128043.0, + "step": 12092, + "train/ce_loss": 0.6530123949050903 + }, + { + "epoch": 1.1955704963417046, + "step": 12092, + "train/sim_loss": 0.0838133692741394 + }, + { + "epoch": 1.1955704963417046, + "step": 12092, + "train/total_loss": 0.14911460876464844 + }, + { + "entropy": 9.846307754516602, + "epoch": 1.1956693691912201, + "mean_token_accuracy": 0.8733850121498108, + "num_tokens": 1137200.0, + "step": 12093, + "train/ce_loss": 3.191216819686815e-07 + }, + { + "epoch": 1.1956693691912201, + "step": 12093, + "train/sim_loss": 0.015852630138397217 + }, + { + "epoch": 1.1956693691912201, + "step": 12093, + "train/total_loss": 0.015852661803364754 + }, + { + "entropy": 9.546796798706055, + "epoch": 1.1957682420407356, + "mean_token_accuracy": 0.8292682766914368, + "num_tokens": 1148779.0, + "step": 12094, + "train/ce_loss": 0.5408859252929688 + }, + { + "epoch": 1.1957682420407356, + "step": 12094, + "train/sim_loss": 0.08034634590148926 + }, + { + "epoch": 1.1957682420407356, + "step": 12094, + "train/total_loss": 0.13443493843078613 + }, + { + "entropy": 9.604915618896484, + "epoch": 1.195867114890251, + "mean_token_accuracy": 0.8296020030975342, + "num_tokens": 1159843.0, + "step": 12095, + "train/ce_loss": 1.0134859849131317e-06 + }, + { + "epoch": 1.195867114890251, + "step": 12095, + "train/sim_loss": 0.02939826250076294 + }, + { + "epoch": 1.195867114890251, + "step": 12095, + "train/total_loss": 0.029398363083600998 + }, + { + "entropy": 9.42379093170166, + "epoch": 1.1959659877397666, + "mean_token_accuracy": 0.792610228061676, + "num_tokens": 1172810.0, + "step": 12096, + "train/ce_loss": 0.8766727447509766 + }, + { + "epoch": 1.1959659877397666, + "step": 12096, + "train/sim_loss": 0.060923993587493896 + }, + { + "epoch": 1.1959659877397666, + "step": 12096, + "train/total_loss": 0.1485912799835205 + }, + { + "entropy": 9.247201919555664, + "epoch": 1.1960648605892823, + "mean_token_accuracy": 0.8081395626068115, + "num_tokens": 1181014.0, + "step": 12097, + "train/ce_loss": 0.33187901973724365 + }, + { + "epoch": 1.1960648605892823, + "step": 12097, + "train/sim_loss": 0.013956427574157715 + }, + { + "epoch": 1.1960648605892823, + "step": 12097, + "train/total_loss": 0.0471443310379982 + }, + { + "entropy": 9.0596342086792, + "epoch": 1.1961637334387978, + "mean_token_accuracy": 0.9267643094062805, + "num_tokens": 1187219.0, + "step": 12098, + "train/ce_loss": 0.26496243476867676 + }, + { + "epoch": 1.1961637334387978, + "step": 12098, + "train/sim_loss": 0.012967705726623535 + }, + { + "epoch": 1.1961637334387978, + "step": 12098, + "train/total_loss": 0.03946395218372345 + }, + { + "entropy": 9.475400924682617, + "epoch": 1.1962626062883133, + "mean_token_accuracy": 0.9202127456665039, + "num_tokens": 1207057.0, + "step": 12099, + "train/ce_loss": 0.4954540729522705 + }, + { + "epoch": 1.1962626062883133, + "step": 12099, + "train/sim_loss": 0.02165842056274414 + }, + { + "epoch": 1.1962626062883133, + "step": 12099, + "train/total_loss": 0.07120382785797119 + }, + { + "epoch": 1.1963614791378288, + "grad_norm": 0.4209743142127991, + "learning_rate": 7.011076497057806e-06, + "loss": 0.0845, + "step": 12100 + }, + { + "entropy": 8.876734733581543, + "epoch": 1.1963614791378288, + "mean_token_accuracy": 0.8740554451942444, + "num_tokens": 1217185.0, + "step": 12100, + "train/ce_loss": 0.40256252884864807 + }, + { + "epoch": 1.1963614791378288, + "step": 12100, + "train/sim_loss": 0.0296475887298584 + }, + { + "epoch": 1.1963614791378288, + "step": 12100, + "train/total_loss": 0.06990384310483932 + }, + { + "entropy": 9.111523628234863, + "epoch": 1.1964603519873442, + "mean_token_accuracy": 0.8627204298973083, + "num_tokens": 1225787.0, + "step": 12101, + "train/ce_loss": 0.696497917175293 + }, + { + "epoch": 1.1964603519873442, + "step": 12101, + "train/sim_loss": 0.0836416482925415 + }, + { + "epoch": 1.1964603519873442, + "step": 12101, + "train/total_loss": 0.15329143404960632 + }, + { + "entropy": 8.761823654174805, + "epoch": 1.1965592248368597, + "mean_token_accuracy": 0.8700000047683716, + "num_tokens": 1239066.0, + "step": 12102, + "train/ce_loss": 0.16250558197498322 + }, + { + "epoch": 1.1965592248368597, + "step": 12102, + "train/sim_loss": 0.01332390308380127 + }, + { + "epoch": 1.1965592248368597, + "step": 12102, + "train/total_loss": 0.02957446128129959 + }, + { + "entropy": 9.958925247192383, + "epoch": 1.1966580976863752, + "mean_token_accuracy": 0.9054726362228394, + "num_tokens": 1248026.0, + "step": 12103, + "train/ce_loss": 0.34873977303504944 + }, + { + "epoch": 1.1966580976863752, + "step": 12103, + "train/sim_loss": 0.05459702014923096 + }, + { + "epoch": 1.1966580976863752, + "step": 12103, + "train/total_loss": 0.0894709974527359 + }, + { + "entropy": 9.24943733215332, + "epoch": 1.196756970535891, + "mean_token_accuracy": 0.8559814095497131, + "num_tokens": 1256945.0, + "step": 12104, + "train/ce_loss": 0.49449771642684937 + }, + { + "epoch": 1.196756970535891, + "step": 12104, + "train/sim_loss": 0.03945362567901611 + }, + { + "epoch": 1.196756970535891, + "step": 12104, + "train/total_loss": 0.08890339732170105 + }, + { + "entropy": 9.082710266113281, + "epoch": 1.1968558433854064, + "mean_token_accuracy": 0.8098098039627075, + "num_tokens": 1267578.0, + "step": 12105, + "train/ce_loss": 0.6963963508605957 + }, + { + "epoch": 1.1968558433854064, + "step": 12105, + "train/sim_loss": 0.03248190879821777 + }, + { + "epoch": 1.1968558433854064, + "step": 12105, + "train/total_loss": 0.10212154686450958 + }, + { + "entropy": 9.587442398071289, + "epoch": 1.196954716234922, + "mean_token_accuracy": 0.9048951268196106, + "num_tokens": 1280547.0, + "step": 12106, + "train/ce_loss": 0.2983993887901306 + }, + { + "epoch": 1.196954716234922, + "step": 12106, + "train/sim_loss": 0.014323770999908447 + }, + { + "epoch": 1.196954716234922, + "step": 12106, + "train/total_loss": 0.04416371136903763 + }, + { + "entropy": 9.255990982055664, + "epoch": 1.1970535890844374, + "mean_token_accuracy": 0.7966963052749634, + "num_tokens": 1288029.0, + "step": 12107, + "train/ce_loss": 0.5605956315994263 + }, + { + "epoch": 1.1970535890844374, + "step": 12107, + "train/sim_loss": 0.08429121971130371 + }, + { + "epoch": 1.1970535890844374, + "step": 12107, + "train/total_loss": 0.14035078883171082 + }, + { + "entropy": 9.308679580688477, + "epoch": 1.1971524619339529, + "mean_token_accuracy": 0.7510121464729309, + "num_tokens": 1299570.0, + "step": 12108, + "train/ce_loss": 0.560217559337616 + }, + { + "epoch": 1.1971524619339529, + "step": 12108, + "train/sim_loss": 0.07597827911376953 + }, + { + "epoch": 1.1971524619339529, + "step": 12108, + "train/total_loss": 0.13200002908706665 + }, + { + "entropy": 9.553647994995117, + "epoch": 1.1972513347834686, + "mean_token_accuracy": 0.8830845952033997, + "num_tokens": 1319504.0, + "step": 12109, + "train/ce_loss": 0.6032390594482422 + }, + { + "epoch": 1.1972513347834686, + "step": 12109, + "train/sim_loss": 0.08760958909988403 + }, + { + "epoch": 1.1972513347834686, + "step": 12109, + "train/total_loss": 0.1479334980249405 + }, + { + "entropy": 9.353020668029785, + "epoch": 1.197350207632984, + "mean_token_accuracy": 0.8259162306785583, + "num_tokens": 1328328.0, + "step": 12110, + "train/ce_loss": 0.6692216992378235 + }, + { + "epoch": 1.197350207632984, + "step": 12110, + "train/sim_loss": 0.07592082023620605 + }, + { + "epoch": 1.197350207632984, + "step": 12110, + "train/total_loss": 0.14284299314022064 + }, + { + "entropy": 9.372318267822266, + "epoch": 1.1974490804824995, + "mean_token_accuracy": 0.8688711524009705, + "num_tokens": 1344492.0, + "step": 12111, + "train/ce_loss": 0.32438117265701294 + }, + { + "epoch": 1.1974490804824995, + "step": 12111, + "train/sim_loss": 0.024227619171142578 + }, + { + "epoch": 1.1974490804824995, + "step": 12111, + "train/total_loss": 0.05666573718190193 + }, + { + "entropy": 9.029775619506836, + "epoch": 1.197547953332015, + "mean_token_accuracy": 0.8551941514015198, + "num_tokens": 1358476.0, + "step": 12112, + "train/ce_loss": 0.2469124048948288 + }, + { + "epoch": 1.197547953332015, + "step": 12112, + "train/sim_loss": 0.040833234786987305 + }, + { + "epoch": 1.197547953332015, + "step": 12112, + "train/total_loss": 0.06552447378635406 + }, + { + "entropy": 9.859222412109375, + "epoch": 1.1976468261815305, + "mean_token_accuracy": 0.8467375040054321, + "num_tokens": 1368312.0, + "step": 12113, + "train/ce_loss": 0.24658040702342987 + }, + { + "epoch": 1.1976468261815305, + "step": 12113, + "train/sim_loss": 0.061048030853271484 + }, + { + "epoch": 1.1976468261815305, + "step": 12113, + "train/total_loss": 0.08570607006549835 + }, + { + "entropy": 9.50908088684082, + "epoch": 1.197745699031046, + "mean_token_accuracy": 0.9246676564216614, + "num_tokens": 1383021.0, + "step": 12114, + "train/ce_loss": 0.3936172425746918 + }, + { + "epoch": 1.197745699031046, + "step": 12114, + "train/sim_loss": 0.02631688117980957 + }, + { + "epoch": 1.197745699031046, + "step": 12114, + "train/total_loss": 0.06567861139774323 + }, + { + "entropy": 9.241902351379395, + "epoch": 1.1978445718805615, + "mean_token_accuracy": 0.8345221281051636, + "num_tokens": 1395229.0, + "step": 12115, + "train/ce_loss": 0.5625475645065308 + }, + { + "epoch": 1.1978445718805615, + "step": 12115, + "train/sim_loss": 0.04268288612365723 + }, + { + "epoch": 1.1978445718805615, + "step": 12115, + "train/total_loss": 0.09893764555454254 + }, + { + "entropy": 9.58127498626709, + "epoch": 1.1979434447300772, + "mean_token_accuracy": 0.852185070514679, + "num_tokens": 1409068.0, + "step": 12116, + "train/ce_loss": 0.45877087116241455 + }, + { + "epoch": 1.1979434447300772, + "step": 12116, + "train/sim_loss": 0.08430284261703491 + }, + { + "epoch": 1.1979434447300772, + "step": 12116, + "train/total_loss": 0.13017992675304413 + }, + { + "entropy": 9.710277557373047, + "epoch": 1.1980423175795927, + "mean_token_accuracy": 0.8474576473236084, + "num_tokens": 1418220.0, + "step": 12117, + "train/ce_loss": 0.7761214375495911 + }, + { + "epoch": 1.1980423175795927, + "step": 12117, + "train/sim_loss": 0.06970691680908203 + }, + { + "epoch": 1.1980423175795927, + "step": 12117, + "train/total_loss": 0.14731906354427338 + }, + { + "entropy": 9.549999237060547, + "epoch": 1.1981411904291082, + "mean_token_accuracy": 0.8468084931373596, + "num_tokens": 1434305.0, + "step": 12118, + "train/ce_loss": 0.5371553897857666 + }, + { + "epoch": 1.1981411904291082, + "step": 12118, + "train/sim_loss": 0.026676058769226074 + }, + { + "epoch": 1.1981411904291082, + "step": 12118, + "train/total_loss": 0.08039160072803497 + }, + { + "entropy": 8.922538757324219, + "epoch": 1.1982400632786236, + "mean_token_accuracy": 0.8542413115501404, + "num_tokens": 1442325.0, + "step": 12119, + "train/ce_loss": 0.43053075671195984 + }, + { + "epoch": 1.1982400632786236, + "step": 12119, + "train/sim_loss": 0.046777963638305664 + }, + { + "epoch": 1.1982400632786236, + "step": 12119, + "train/total_loss": 0.08983103930950165 + }, + { + "epoch": 1.1983389361281391, + "grad_norm": 0.5345555543899536, + "learning_rate": 7.006131632299857e-06, + "loss": 0.0878, + "step": 12120 + }, + { + "entropy": 9.012600898742676, + "epoch": 1.1983389361281391, + "mean_token_accuracy": 0.876335859298706, + "num_tokens": 1448635.0, + "step": 12120, + "train/ce_loss": 0.6052045822143555 + }, + { + "epoch": 1.1983389361281391, + "step": 12120, + "train/sim_loss": 0.05603635311126709 + }, + { + "epoch": 1.1983389361281391, + "step": 12120, + "train/total_loss": 0.1165568083524704 + }, + { + "entropy": 9.524401664733887, + "epoch": 1.1984378089776548, + "mean_token_accuracy": 0.8351144790649414, + "num_tokens": 1458974.0, + "step": 12121, + "train/ce_loss": 0.4853256046772003 + }, + { + "epoch": 1.1984378089776548, + "step": 12121, + "train/sim_loss": 0.04915893077850342 + }, + { + "epoch": 1.1984378089776548, + "step": 12121, + "train/total_loss": 0.09769149124622345 + }, + { + "entropy": 9.536958694458008, + "epoch": 1.1985366818271703, + "mean_token_accuracy": 0.8387096524238586, + "num_tokens": 1465952.0, + "step": 12122, + "train/ce_loss": 2.699491403745924e-07 + }, + { + "epoch": 1.1985366818271703, + "step": 12122, + "train/sim_loss": 0.012252986431121826 + }, + { + "epoch": 1.1985366818271703, + "step": 12122, + "train/total_loss": 0.01225301343947649 + }, + { + "entropy": 9.51460075378418, + "epoch": 1.1986355546766858, + "mean_token_accuracy": 0.8655834794044495, + "num_tokens": 1475400.0, + "step": 12123, + "train/ce_loss": 0.8551988005638123 + }, + { + "epoch": 1.1986355546766858, + "step": 12123, + "train/sim_loss": 0.09383344650268555 + }, + { + "epoch": 1.1986355546766858, + "step": 12123, + "train/total_loss": 0.17935332655906677 + }, + { + "entropy": 8.706031799316406, + "epoch": 1.1987344275262013, + "mean_token_accuracy": 0.8612663149833679, + "num_tokens": 1484564.0, + "step": 12124, + "train/ce_loss": 0.4237210154533386 + }, + { + "epoch": 1.1987344275262013, + "step": 12124, + "train/sim_loss": 0.0176699161529541 + }, + { + "epoch": 1.1987344275262013, + "step": 12124, + "train/total_loss": 0.06004201993346214 + }, + { + "entropy": 8.584253311157227, + "epoch": 1.1988333003757168, + "mean_token_accuracy": 0.8349999785423279, + "num_tokens": 1490820.0, + "step": 12125, + "train/ce_loss": 0.3456590473651886 + }, + { + "epoch": 1.1988333003757168, + "step": 12125, + "train/sim_loss": 0.013428092002868652 + }, + { + "epoch": 1.1988333003757168, + "step": 12125, + "train/total_loss": 0.04799399897456169 + }, + { + "entropy": 9.084688186645508, + "epoch": 1.1989321732252323, + "mean_token_accuracy": 0.7972545027732849, + "num_tokens": 1505046.0, + "step": 12126, + "train/ce_loss": 0.5957202315330505 + }, + { + "epoch": 1.1989321732252323, + "step": 12126, + "train/sim_loss": 0.021021366119384766 + }, + { + "epoch": 1.1989321732252323, + "step": 12126, + "train/total_loss": 0.08059339225292206 + }, + { + "entropy": 10.190340995788574, + "epoch": 1.1990310460747478, + "mean_token_accuracy": 0.8651315569877625, + "num_tokens": 1519649.0, + "step": 12127, + "train/ce_loss": 1.5640140771865845 + }, + { + "epoch": 1.1990310460747478, + "step": 12127, + "train/sim_loss": 0.06370729207992554 + }, + { + "epoch": 1.1990310460747478, + "step": 12127, + "train/total_loss": 0.22010870277881622 + }, + { + "entropy": 9.334735870361328, + "epoch": 1.1991299189242635, + "mean_token_accuracy": 0.8612099885940552, + "num_tokens": 1529790.0, + "step": 12128, + "train/ce_loss": 0.3440960943698883 + }, + { + "epoch": 1.1991299189242635, + "step": 12128, + "train/sim_loss": 0.04507249593734741 + }, + { + "epoch": 1.1991299189242635, + "step": 12128, + "train/total_loss": 0.07948210835456848 + }, + { + "entropy": 9.81424331665039, + "epoch": 1.199228791773779, + "mean_token_accuracy": 0.8529887199401855, + "num_tokens": 1546300.0, + "step": 12129, + "train/ce_loss": 1.0653038771124557e-06 + }, + { + "epoch": 1.199228791773779, + "step": 12129, + "train/sim_loss": 0.036967694759368896 + }, + { + "epoch": 1.199228791773779, + "step": 12129, + "train/total_loss": 0.03696780279278755 + }, + { + "entropy": 9.881717681884766, + "epoch": 1.1993276646232944, + "mean_token_accuracy": 0.8582230806350708, + "num_tokens": 1552774.0, + "step": 12130, + "train/ce_loss": 4.831402407035057e-07 + }, + { + "epoch": 1.1993276646232944, + "step": 12130, + "train/sim_loss": 0.014988422393798828 + }, + { + "epoch": 1.1993276646232944, + "step": 12130, + "train/total_loss": 0.014988470822572708 + }, + { + "entropy": 8.86518383026123, + "epoch": 1.19942653747281, + "mean_token_accuracy": 0.8999999761581421, + "num_tokens": 1560982.0, + "step": 12131, + "train/ce_loss": 0.4368402659893036 + }, + { + "epoch": 1.19942653747281, + "step": 12131, + "train/sim_loss": 0.032755255699157715 + }, + { + "epoch": 1.19942653747281, + "step": 12131, + "train/total_loss": 0.0764392837882042 + }, + { + "entropy": 9.926994323730469, + "epoch": 1.1995254103223254, + "mean_token_accuracy": 0.8383620977401733, + "num_tokens": 1577357.0, + "step": 12132, + "train/ce_loss": 1.0235223726340337e-06 + }, + { + "epoch": 1.1995254103223254, + "step": 12132, + "train/sim_loss": 0.024066686630249023 + }, + { + "epoch": 1.1995254103223254, + "step": 12132, + "train/total_loss": 0.02406678907573223 + }, + { + "entropy": 9.565913200378418, + "epoch": 1.199624283171841, + "mean_token_accuracy": 0.8641025424003601, + "num_tokens": 1590718.0, + "step": 12133, + "train/ce_loss": 0.643301248550415 + }, + { + "epoch": 1.199624283171841, + "step": 12133, + "train/sim_loss": 0.011903524398803711 + }, + { + "epoch": 1.199624283171841, + "step": 12133, + "train/total_loss": 0.0762336477637291 + }, + { + "entropy": 8.88635540008545, + "epoch": 1.1997231560213566, + "mean_token_accuracy": 0.8212512135505676, + "num_tokens": 1602830.0, + "step": 12134, + "train/ce_loss": 0.5473271012306213 + }, + { + "epoch": 1.1997231560213566, + "step": 12134, + "train/sim_loss": 0.03407520055770874 + }, + { + "epoch": 1.1997231560213566, + "step": 12134, + "train/total_loss": 0.08880791068077087 + }, + { + "entropy": 9.348816871643066, + "epoch": 1.199822028870872, + "mean_token_accuracy": 0.8135011196136475, + "num_tokens": 1614289.0, + "step": 12135, + "train/ce_loss": 0.4334621727466583 + }, + { + "epoch": 1.199822028870872, + "step": 12135, + "train/sim_loss": 0.02456200122833252 + }, + { + "epoch": 1.199822028870872, + "step": 12135, + "train/total_loss": 0.06790821999311447 + }, + { + "entropy": 8.726845741271973, + "epoch": 1.1999209017203876, + "mean_token_accuracy": 0.8807339668273926, + "num_tokens": 1622817.0, + "step": 12136, + "train/ce_loss": 0.3294817805290222 + }, + { + "epoch": 1.1999209017203876, + "step": 12136, + "train/sim_loss": 0.014603793621063232 + }, + { + "epoch": 1.1999209017203876, + "step": 12136, + "train/total_loss": 0.047551970928907394 + }, + { + "entropy": 9.205098152160645, + "epoch": 1.200019774569903, + "mean_token_accuracy": 0.8359788656234741, + "num_tokens": 1638210.0, + "step": 12137, + "train/ce_loss": 0.5120014548301697 + }, + { + "epoch": 1.200019774569903, + "step": 12137, + "train/sim_loss": 0.03835815191268921 + }, + { + "epoch": 1.200019774569903, + "step": 12137, + "train/total_loss": 0.08955830335617065 + }, + { + "entropy": 9.273774147033691, + "epoch": 1.2001186474194185, + "mean_token_accuracy": 0.8041958212852478, + "num_tokens": 1646227.0, + "step": 12138, + "train/ce_loss": 0.5968320369720459 + }, + { + "epoch": 1.2001186474194185, + "step": 12138, + "train/sim_loss": 0.015178561210632324 + }, + { + "epoch": 1.2001186474194185, + "step": 12138, + "train/total_loss": 0.07486176490783691 + }, + { + "entropy": 9.079368591308594, + "epoch": 1.2002175202689342, + "mean_token_accuracy": 0.8787528872489929, + "num_tokens": 1656096.0, + "step": 12139, + "train/ce_loss": 0.14538533985614777 + }, + { + "epoch": 1.2002175202689342, + "step": 12139, + "train/sim_loss": 0.04369783401489258 + }, + { + "epoch": 1.2002175202689342, + "step": 12139, + "train/total_loss": 0.058236368000507355 + }, + { + "epoch": 1.2003163931184497, + "grad_norm": 0.7734224200248718, + "learning_rate": 7.001186767541909e-06, + "loss": 0.0875, + "step": 12140 + }, + { + "entropy": 9.950226783752441, + "epoch": 1.2003163931184497, + "mean_token_accuracy": 0.850931704044342, + "num_tokens": 1669573.0, + "step": 12140, + "train/ce_loss": 7.325186857087829e-07 + }, + { + "epoch": 1.2003163931184497, + "step": 12140, + "train/sim_loss": 0.009495139122009277 + }, + { + "epoch": 1.2003163931184497, + "step": 12140, + "train/total_loss": 0.009495212696492672 + }, + { + "entropy": 9.177801132202148, + "epoch": 1.2004152659679652, + "mean_token_accuracy": 0.8340163826942444, + "num_tokens": 1682513.0, + "step": 12141, + "train/ce_loss": 0.4339035451412201 + }, + { + "epoch": 1.2004152659679652, + "step": 12141, + "train/sim_loss": 0.06792211532592773 + }, + { + "epoch": 1.2004152659679652, + "step": 12141, + "train/total_loss": 0.11131247133016586 + }, + { + "entropy": 9.60444450378418, + "epoch": 1.2005141388174807, + "mean_token_accuracy": 0.8707224130630493, + "num_tokens": 1698644.0, + "step": 12142, + "train/ce_loss": 0.1577463299036026 + }, + { + "epoch": 1.2005141388174807, + "step": 12142, + "train/sim_loss": 0.041687726974487305 + }, + { + "epoch": 1.2005141388174807, + "step": 12142, + "train/total_loss": 0.057462360709905624 + }, + { + "entropy": 9.52314281463623, + "epoch": 1.2006130116669962, + "mean_token_accuracy": 0.8273381590843201, + "num_tokens": 1717089.0, + "step": 12143, + "train/ce_loss": 0.48214656114578247 + }, + { + "epoch": 1.2006130116669962, + "step": 12143, + "train/sim_loss": 0.06520092487335205 + }, + { + "epoch": 1.2006130116669962, + "step": 12143, + "train/total_loss": 0.11341558396816254 + }, + { + "entropy": 9.217514038085938, + "epoch": 1.2007118845165117, + "mean_token_accuracy": 0.8986867070198059, + "num_tokens": 1724807.0, + "step": 12144, + "train/ce_loss": 0.2043696492910385 + }, + { + "epoch": 1.2007118845165117, + "step": 12144, + "train/sim_loss": 0.011166810989379883 + }, + { + "epoch": 1.2007118845165117, + "step": 12144, + "train/total_loss": 0.031603775918483734 + }, + { + "entropy": 9.407533645629883, + "epoch": 1.2008107573660274, + "mean_token_accuracy": 0.792682945728302, + "num_tokens": 1736565.0, + "step": 12145, + "train/ce_loss": 0.6348121762275696 + }, + { + "epoch": 1.2008107573660274, + "step": 12145, + "train/sim_loss": 0.10307300090789795 + }, + { + "epoch": 1.2008107573660274, + "step": 12145, + "train/total_loss": 0.16655421257019043 + }, + { + "entropy": 9.329944610595703, + "epoch": 1.2009096302155429, + "mean_token_accuracy": 0.8714069724082947, + "num_tokens": 1746061.0, + "step": 12146, + "train/ce_loss": 0.4714162349700928 + }, + { + "epoch": 1.2009096302155429, + "step": 12146, + "train/sim_loss": 0.04445481300354004 + }, + { + "epoch": 1.2009096302155429, + "step": 12146, + "train/total_loss": 0.09159643948078156 + }, + { + "entropy": 9.821382522583008, + "epoch": 1.2010085030650584, + "mean_token_accuracy": 0.9001161456108093, + "num_tokens": 1760735.0, + "step": 12147, + "train/ce_loss": 0.27110734581947327 + }, + { + "epoch": 1.2010085030650584, + "step": 12147, + "train/sim_loss": 0.05752342939376831 + }, + { + "epoch": 1.2010085030650584, + "step": 12147, + "train/total_loss": 0.08463416248559952 + }, + { + "entropy": 9.546095848083496, + "epoch": 1.2011073759145738, + "mean_token_accuracy": 0.913095235824585, + "num_tokens": 1778109.0, + "step": 12148, + "train/ce_loss": 0.456252783536911 + }, + { + "epoch": 1.2011073759145738, + "step": 12148, + "train/sim_loss": 0.07020151615142822 + }, + { + "epoch": 1.2011073759145738, + "step": 12148, + "train/total_loss": 0.1158268004655838 + }, + { + "entropy": 9.313358306884766, + "epoch": 1.2012062487640893, + "mean_token_accuracy": 0.8604938387870789, + "num_tokens": 1787314.0, + "step": 12149, + "train/ce_loss": 0.332504004240036 + }, + { + "epoch": 1.2012062487640893, + "step": 12149, + "train/sim_loss": 0.01864480972290039 + }, + { + "epoch": 1.2012062487640893, + "step": 12149, + "train/total_loss": 0.05189521238207817 + }, + { + "entropy": 9.596182823181152, + "epoch": 1.2013051216136048, + "mean_token_accuracy": 0.8676789402961731, + "num_tokens": 1801924.0, + "step": 12150, + "train/ce_loss": 0.15596602857112885 + }, + { + "epoch": 1.2013051216136048, + "step": 12150, + "train/sim_loss": 0.016765236854553223 + }, + { + "epoch": 1.2013051216136048, + "step": 12150, + "train/total_loss": 0.03236183896660805 + }, + { + "entropy": 9.833520889282227, + "epoch": 1.2014039944631205, + "mean_token_accuracy": 0.8756756782531738, + "num_tokens": 1818267.0, + "step": 12151, + "train/ce_loss": 0.16582484543323517 + }, + { + "epoch": 1.2014039944631205, + "step": 12151, + "train/sim_loss": 0.05924201011657715 + }, + { + "epoch": 1.2014039944631205, + "step": 12151, + "train/total_loss": 0.07582449913024902 + }, + { + "entropy": 9.515667915344238, + "epoch": 1.201502867312636, + "mean_token_accuracy": 0.8164793848991394, + "num_tokens": 1834628.0, + "step": 12152, + "train/ce_loss": 0.5541511178016663 + }, + { + "epoch": 1.201502867312636, + "step": 12152, + "train/sim_loss": 0.02357625961303711 + }, + { + "epoch": 1.201502867312636, + "step": 12152, + "train/total_loss": 0.0789913684129715 + }, + { + "entropy": 9.332179069519043, + "epoch": 1.2016017401621515, + "mean_token_accuracy": 0.8186089992523193, + "num_tokens": 1849289.0, + "step": 12153, + "train/ce_loss": 0.5142397284507751 + }, + { + "epoch": 1.2016017401621515, + "step": 12153, + "train/sim_loss": 0.022013604640960693 + }, + { + "epoch": 1.2016017401621515, + "step": 12153, + "train/total_loss": 0.07343757897615433 + }, + { + "entropy": 9.321317672729492, + "epoch": 1.201700613011667, + "mean_token_accuracy": 0.817241370677948, + "num_tokens": 1862230.0, + "step": 12154, + "train/ce_loss": 0.6118091940879822 + }, + { + "epoch": 1.201700613011667, + "step": 12154, + "train/sim_loss": 0.021961867809295654 + }, + { + "epoch": 1.201700613011667, + "step": 12154, + "train/total_loss": 0.08314278721809387 + }, + { + "entropy": 9.626823425292969, + "epoch": 1.2017994858611825, + "mean_token_accuracy": 0.8264331221580505, + "num_tokens": 1875356.0, + "step": 12155, + "train/ce_loss": 0.6919722557067871 + }, + { + "epoch": 1.2017994858611825, + "step": 12155, + "train/sim_loss": 0.07463961839675903 + }, + { + "epoch": 1.2017994858611825, + "step": 12155, + "train/total_loss": 0.1438368558883667 + }, + { + "entropy": 9.29214096069336, + "epoch": 1.201898358710698, + "mean_token_accuracy": 0.798701286315918, + "num_tokens": 1882514.0, + "step": 12156, + "train/ce_loss": 0.7698619365692139 + }, + { + "epoch": 1.201898358710698, + "step": 12156, + "train/sim_loss": 0.07003462314605713 + }, + { + "epoch": 1.201898358710698, + "step": 12156, + "train/total_loss": 0.14702081680297852 + }, + { + "entropy": 9.100775718688965, + "epoch": 1.2019972315602137, + "mean_token_accuracy": 0.8444656729698181, + "num_tokens": 1891284.0, + "step": 12157, + "train/ce_loss": 0.4280892312526703 + }, + { + "epoch": 1.2019972315602137, + "step": 12157, + "train/sim_loss": 0.04686117172241211 + }, + { + "epoch": 1.2019972315602137, + "step": 12157, + "train/total_loss": 0.0896700918674469 + }, + { + "entropy": 9.44741153717041, + "epoch": 1.2020961044097291, + "mean_token_accuracy": 0.874154269695282, + "num_tokens": 1904187.0, + "step": 12158, + "train/ce_loss": 0.41716268658638 + }, + { + "epoch": 1.2020961044097291, + "step": 12158, + "train/sim_loss": 0.03289520740509033 + }, + { + "epoch": 1.2020961044097291, + "step": 12158, + "train/total_loss": 0.07461147755384445 + }, + { + "entropy": 9.159854888916016, + "epoch": 1.2021949772592446, + "mean_token_accuracy": 0.866995096206665, + "num_tokens": 1917837.0, + "step": 12159, + "train/ce_loss": 0.6766729354858398 + }, + { + "epoch": 1.2021949772592446, + "step": 12159, + "train/sim_loss": 0.01881551742553711 + }, + { + "epoch": 1.2021949772592446, + "step": 12159, + "train/total_loss": 0.08648281544446945 + }, + { + "epoch": 1.20229385010876, + "grad_norm": 0.6737831234931946, + "learning_rate": 6.996241902783959e-06, + "loss": 0.0906, + "step": 12160 + }, + { + "entropy": 9.595084190368652, + "epoch": 1.20229385010876, + "mean_token_accuracy": 0.7872596383094788, + "num_tokens": 1929890.0, + "step": 12160, + "train/ce_loss": 0.6010480523109436 + }, + { + "epoch": 1.20229385010876, + "step": 12160, + "train/sim_loss": 0.035374224185943604 + }, + { + "epoch": 1.20229385010876, + "step": 12160, + "train/total_loss": 0.09547902643680573 + }, + { + "entropy": 9.430290222167969, + "epoch": 1.2023927229582756, + "mean_token_accuracy": 0.8241134881973267, + "num_tokens": 1940719.0, + "step": 12161, + "train/ce_loss": 0.4699902832508087 + }, + { + "epoch": 1.2023927229582756, + "step": 12161, + "train/sim_loss": 0.01396799087524414 + }, + { + "epoch": 1.2023927229582756, + "step": 12161, + "train/total_loss": 0.06096702069044113 + }, + { + "entropy": 9.670385360717773, + "epoch": 1.2024915958077913, + "mean_token_accuracy": 0.8561983704566956, + "num_tokens": 1955929.0, + "step": 12162, + "train/ce_loss": 0.45736220479011536 + }, + { + "epoch": 1.2024915958077913, + "step": 12162, + "train/sim_loss": 0.016169726848602295 + }, + { + "epoch": 1.2024915958077913, + "step": 12162, + "train/total_loss": 0.06190594658255577 + }, + { + "entropy": 9.653350830078125, + "epoch": 1.2025904686573068, + "mean_token_accuracy": 0.8267857432365417, + "num_tokens": 1962755.0, + "step": 12163, + "train/ce_loss": 4.001981892542972e-07 + }, + { + "epoch": 1.2025904686573068, + "step": 12163, + "train/sim_loss": 0.014146924018859863 + }, + { + "epoch": 1.2025904686573068, + "step": 12163, + "train/total_loss": 0.014146964065730572 + }, + { + "entropy": 9.771574020385742, + "epoch": 1.2026893415068223, + "mean_token_accuracy": 0.7768421173095703, + "num_tokens": 1978422.0, + "step": 12164, + "train/ce_loss": 0.6787973046302795 + }, + { + "epoch": 1.2026893415068223, + "step": 12164, + "train/sim_loss": 0.027910292148590088 + }, + { + "epoch": 1.2026893415068223, + "step": 12164, + "train/total_loss": 0.09579002112150192 + }, + { + "entropy": 8.732379913330078, + "epoch": 1.2027882143563378, + "mean_token_accuracy": 0.8592017889022827, + "num_tokens": 1985763.0, + "step": 12165, + "train/ce_loss": 0.2771172821521759 + }, + { + "epoch": 1.2027882143563378, + "step": 12165, + "train/sim_loss": 0.01950836181640625 + }, + { + "epoch": 1.2027882143563378, + "step": 12165, + "train/total_loss": 0.04722008854150772 + }, + { + "entropy": 9.732826232910156, + "epoch": 1.2028870872058532, + "mean_token_accuracy": 0.882758617401123, + "num_tokens": 2001968.0, + "step": 12166, + "train/ce_loss": 0.28700727224349976 + }, + { + "epoch": 1.2028870872058532, + "step": 12166, + "train/sim_loss": 0.03727531433105469 + }, + { + "epoch": 1.2028870872058532, + "step": 12166, + "train/total_loss": 0.06597603857517242 + }, + { + "entropy": 9.557085037231445, + "epoch": 1.2029859600553687, + "mean_token_accuracy": 0.8508771657943726, + "num_tokens": 2012462.0, + "step": 12167, + "train/ce_loss": 0.6607643961906433 + }, + { + "epoch": 1.2029859600553687, + "step": 12167, + "train/sim_loss": 0.046958982944488525 + }, + { + "epoch": 1.2029859600553687, + "step": 12167, + "train/total_loss": 0.1130354255437851 + }, + { + "entropy": 9.470657348632812, + "epoch": 1.2030848329048842, + "mean_token_accuracy": 0.8539682626724243, + "num_tokens": 2023140.0, + "step": 12168, + "train/ce_loss": 0.4614185094833374 + }, + { + "epoch": 1.2030848329048842, + "step": 12168, + "train/sim_loss": 0.033758699893951416 + }, + { + "epoch": 1.2030848329048842, + "step": 12168, + "train/total_loss": 0.07990054786205292 + }, + { + "entropy": 9.033346176147461, + "epoch": 1.2031837057544, + "mean_token_accuracy": 0.8583691120147705, + "num_tokens": 2035350.0, + "step": 12169, + "train/ce_loss": 0.7361543774604797 + }, + { + "epoch": 1.2031837057544, + "step": 12169, + "train/sim_loss": 0.07124799489974976 + }, + { + "epoch": 1.2031837057544, + "step": 12169, + "train/total_loss": 0.14486342668533325 + }, + { + "entropy": 9.305068969726562, + "epoch": 1.2032825786039154, + "mean_token_accuracy": 0.7921928763389587, + "num_tokens": 2044959.0, + "step": 12170, + "train/ce_loss": 0.6473608613014221 + }, + { + "epoch": 1.2032825786039154, + "step": 12170, + "train/sim_loss": 0.08282959461212158 + }, + { + "epoch": 1.2032825786039154, + "step": 12170, + "train/total_loss": 0.14756569266319275 + }, + { + "entropy": 10.060918807983398, + "epoch": 1.203381451453431, + "mean_token_accuracy": 0.8911392688751221, + "num_tokens": 2060782.0, + "step": 12171, + "train/ce_loss": 4.176816048584442e-07 + }, + { + "epoch": 1.203381451453431, + "step": 12171, + "train/sim_loss": 0.016863465309143066 + }, + { + "epoch": 1.203381451453431, + "step": 12171, + "train/total_loss": 0.01686350628733635 + }, + { + "entropy": 9.675952911376953, + "epoch": 1.2034803243029464, + "mean_token_accuracy": 0.8021534085273743, + "num_tokens": 2072741.0, + "step": 12172, + "train/ce_loss": 0.4099577069282532 + }, + { + "epoch": 1.2034803243029464, + "step": 12172, + "train/sim_loss": 0.02839970588684082 + }, + { + "epoch": 1.2034803243029464, + "step": 12172, + "train/total_loss": 0.06939548254013062 + }, + { + "entropy": 9.281343460083008, + "epoch": 1.2035791971524619, + "mean_token_accuracy": 0.8051947951316833, + "num_tokens": 2087538.0, + "step": 12173, + "train/ce_loss": 0.643671989440918 + }, + { + "epoch": 1.2035791971524619, + "step": 12173, + "train/sim_loss": 0.043750762939453125 + }, + { + "epoch": 1.2035791971524619, + "step": 12173, + "train/total_loss": 0.1081179603934288 + }, + { + "entropy": 9.005239486694336, + "epoch": 1.2036780700019776, + "mean_token_accuracy": 0.8327645063400269, + "num_tokens": 2096725.0, + "step": 12174, + "train/ce_loss": 0.35321828722953796 + }, + { + "epoch": 1.2036780700019776, + "step": 12174, + "train/sim_loss": 0.011792659759521484 + }, + { + "epoch": 1.2036780700019776, + "step": 12174, + "train/total_loss": 0.04711448773741722 + }, + { + "entropy": 9.276435852050781, + "epoch": 1.203776942851493, + "mean_token_accuracy": 0.8241205811500549, + "num_tokens": 2107806.0, + "step": 12175, + "train/ce_loss": 5.097748498883448e-07 + }, + { + "epoch": 1.203776942851493, + "step": 12175, + "train/sim_loss": 0.04264187812805176 + }, + { + "epoch": 1.203776942851493, + "step": 12175, + "train/total_loss": 0.042641930282115936 + }, + { + "entropy": 8.981546401977539, + "epoch": 1.2038758157010085, + "mean_token_accuracy": 0.7779660820960999, + "num_tokens": 2116482.0, + "step": 12176, + "train/ce_loss": 0.5609048008918762 + }, + { + "epoch": 1.2038758157010085, + "step": 12176, + "train/sim_loss": 0.06995970010757446 + }, + { + "epoch": 1.2038758157010085, + "step": 12176, + "train/total_loss": 0.1260501742362976 + }, + { + "entropy": 9.034652709960938, + "epoch": 1.203974688550524, + "mean_token_accuracy": 0.8401287794113159, + "num_tokens": 2127039.0, + "step": 12177, + "train/ce_loss": 0.3839402198791504 + }, + { + "epoch": 1.203974688550524, + "step": 12177, + "train/sim_loss": 0.01740974187850952 + }, + { + "epoch": 1.203974688550524, + "step": 12177, + "train/total_loss": 0.05580376461148262 + }, + { + "entropy": 9.610013961791992, + "epoch": 1.2040735614000395, + "mean_token_accuracy": 0.8167539238929749, + "num_tokens": 2148209.0, + "step": 12178, + "train/ce_loss": 0.774265468120575 + }, + { + "epoch": 1.2040735614000395, + "step": 12178, + "train/sim_loss": 0.045218706130981445 + }, + { + "epoch": 1.2040735614000395, + "step": 12178, + "train/total_loss": 0.12264525145292282 + }, + { + "entropy": 9.410388946533203, + "epoch": 1.204172434249555, + "mean_token_accuracy": 0.8195488452911377, + "num_tokens": 2162357.0, + "step": 12179, + "train/ce_loss": 0.40761128067970276 + }, + { + "epoch": 1.204172434249555, + "step": 12179, + "train/sim_loss": 0.03149592876434326 + }, + { + "epoch": 1.204172434249555, + "step": 12179, + "train/total_loss": 0.07225705683231354 + }, + { + "epoch": 1.2042713070990705, + "grad_norm": 0.5288592576980591, + "learning_rate": 6.991297038026011e-06, + "loss": 0.092, + "step": 12180 + }, + { + "entropy": 9.144697189331055, + "epoch": 1.2042713070990705, + "mean_token_accuracy": 0.8385772705078125, + "num_tokens": 2173591.0, + "step": 12180, + "train/ce_loss": 0.6467904448509216 + }, + { + "epoch": 1.2042713070990705, + "step": 12180, + "train/sim_loss": 0.020577073097229004 + }, + { + "epoch": 1.2042713070990705, + "step": 12180, + "train/total_loss": 0.08525612205266953 + }, + { + "entropy": 9.488515853881836, + "epoch": 1.2043701799485862, + "mean_token_accuracy": 0.8171557784080505, + "num_tokens": 2185033.0, + "step": 12181, + "train/ce_loss": 0.5253749489784241 + }, + { + "epoch": 1.2043701799485862, + "step": 12181, + "train/sim_loss": 0.11799407005310059 + }, + { + "epoch": 1.2043701799485862, + "step": 12181, + "train/total_loss": 0.17053157091140747 + }, + { + "entropy": 8.959880828857422, + "epoch": 1.2044690527981017, + "mean_token_accuracy": 0.8539071083068848, + "num_tokens": 2194259.0, + "step": 12182, + "train/ce_loss": 0.27124571800231934 + }, + { + "epoch": 1.2044690527981017, + "step": 12182, + "train/sim_loss": 0.08130085468292236 + }, + { + "epoch": 1.2044690527981017, + "step": 12182, + "train/total_loss": 0.10842542350292206 + }, + { + "entropy": 10.028404235839844, + "epoch": 1.2045679256476172, + "mean_token_accuracy": 0.8431034684181213, + "num_tokens": 2206242.0, + "step": 12183, + "train/ce_loss": 0.7606911063194275 + }, + { + "epoch": 1.2045679256476172, + "step": 12183, + "train/sim_loss": 0.07238459587097168 + }, + { + "epoch": 1.2045679256476172, + "step": 12183, + "train/total_loss": 0.1484537124633789 + }, + { + "entropy": 9.16627311706543, + "epoch": 1.2046667984971327, + "mean_token_accuracy": 0.85447758436203, + "num_tokens": 2219340.0, + "step": 12184, + "train/ce_loss": 0.4203639328479767 + }, + { + "epoch": 1.2046667984971327, + "step": 12184, + "train/sim_loss": 0.05333155393600464 + }, + { + "epoch": 1.2046667984971327, + "step": 12184, + "train/total_loss": 0.09536795318126678 + }, + { + "entropy": 9.166338920593262, + "epoch": 1.2047656713466481, + "mean_token_accuracy": 0.8248704671859741, + "num_tokens": 2231823.0, + "step": 12185, + "train/ce_loss": 0.5637363791465759 + }, + { + "epoch": 1.2047656713466481, + "step": 12185, + "train/sim_loss": 0.09938555955886841 + }, + { + "epoch": 1.2047656713466481, + "step": 12185, + "train/total_loss": 0.15575920045375824 + }, + { + "entropy": 9.595993995666504, + "epoch": 1.2048645441961638, + "mean_token_accuracy": 0.8193103671073914, + "num_tokens": 2242823.0, + "step": 12186, + "train/ce_loss": 0.4139774739742279 + }, + { + "epoch": 1.2048645441961638, + "step": 12186, + "train/sim_loss": 0.04324650764465332 + }, + { + "epoch": 1.2048645441961638, + "step": 12186, + "train/total_loss": 0.08464425802230835 + }, + { + "entropy": 9.604408264160156, + "epoch": 1.2049634170456793, + "mean_token_accuracy": 0.7986486554145813, + "num_tokens": 2256903.0, + "step": 12187, + "train/ce_loss": 0.5459436178207397 + }, + { + "epoch": 1.2049634170456793, + "step": 12187, + "train/sim_loss": 0.048603355884552 + }, + { + "epoch": 1.2049634170456793, + "step": 12187, + "train/total_loss": 0.10319772362709045 + }, + { + "entropy": 10.101354598999023, + "epoch": 1.2050622898951948, + "mean_token_accuracy": 0.8679927587509155, + "num_tokens": 2267773.0, + "step": 12188, + "train/ce_loss": 4.069889598667942e-07 + }, + { + "epoch": 1.2050622898951948, + "step": 12188, + "train/sim_loss": 0.015943527221679688 + }, + { + "epoch": 1.2050622898951948, + "step": 12188, + "train/total_loss": 0.01594356819987297 + }, + { + "entropy": 9.065542221069336, + "epoch": 1.2051611627447103, + "mean_token_accuracy": 0.8514007329940796, + "num_tokens": 2276319.0, + "step": 12189, + "train/ce_loss": 0.6425339579582214 + }, + { + "epoch": 1.2051611627447103, + "step": 12189, + "train/sim_loss": 0.03162097930908203 + }, + { + "epoch": 1.2051611627447103, + "step": 12189, + "train/total_loss": 0.0958743765950203 + }, + { + "entropy": 9.896890640258789, + "epoch": 1.2052600355942258, + "mean_token_accuracy": 0.8412942886352539, + "num_tokens": 2292020.0, + "step": 12190, + "train/ce_loss": 0.5184510350227356 + }, + { + "epoch": 1.2052600355942258, + "step": 12190, + "train/sim_loss": 0.030397653579711914 + }, + { + "epoch": 1.2052600355942258, + "step": 12190, + "train/total_loss": 0.08224275708198547 + }, + { + "entropy": 9.135892868041992, + "epoch": 1.2053589084437413, + "mean_token_accuracy": 0.8941018581390381, + "num_tokens": 2299149.0, + "step": 12191, + "train/ce_loss": 0.27342572808265686 + }, + { + "epoch": 1.2053589084437413, + "step": 12191, + "train/sim_loss": 0.011980652809143066 + }, + { + "epoch": 1.2053589084437413, + "step": 12191, + "train/total_loss": 0.03932322561740875 + }, + { + "entropy": 8.986479759216309, + "epoch": 1.2054577812932568, + "mean_token_accuracy": 0.8250591158866882, + "num_tokens": 2310379.0, + "step": 12192, + "train/ce_loss": 0.520305335521698 + }, + { + "epoch": 1.2054577812932568, + "step": 12192, + "train/sim_loss": 0.04466700553894043 + }, + { + "epoch": 1.2054577812932568, + "step": 12192, + "train/total_loss": 0.09669753909111023 + }, + { + "entropy": 9.657670021057129, + "epoch": 1.2055566541427725, + "mean_token_accuracy": 0.8168557286262512, + "num_tokens": 2327306.0, + "step": 12193, + "train/ce_loss": 0.6602516770362854 + }, + { + "epoch": 1.2055566541427725, + "step": 12193, + "train/sim_loss": 0.07642483711242676 + }, + { + "epoch": 1.2055566541427725, + "step": 12193, + "train/total_loss": 0.1424500048160553 + }, + { + "entropy": 9.705907821655273, + "epoch": 1.205655526992288, + "mean_token_accuracy": 0.8859180212020874, + "num_tokens": 2339899.0, + "step": 12194, + "train/ce_loss": 0.6695759296417236 + }, + { + "epoch": 1.205655526992288, + "step": 12194, + "train/sim_loss": 0.06932282447814941 + }, + { + "epoch": 1.205655526992288, + "step": 12194, + "train/total_loss": 0.13628041744232178 + }, + { + "entropy": 9.120492935180664, + "epoch": 1.2057543998418034, + "mean_token_accuracy": 0.8951841592788696, + "num_tokens": 2350476.0, + "step": 12195, + "train/ce_loss": 0.3009778559207916 + }, + { + "epoch": 1.2057543998418034, + "step": 12195, + "train/sim_loss": 0.055516839027404785 + }, + { + "epoch": 1.2057543998418034, + "step": 12195, + "train/total_loss": 0.08561462163925171 + }, + { + "entropy": 9.465991973876953, + "epoch": 1.205853272691319, + "mean_token_accuracy": 0.8128654956817627, + "num_tokens": 2362675.0, + "step": 12196, + "train/ce_loss": 1.3523025512695312 + }, + { + "epoch": 1.205853272691319, + "step": 12196, + "train/sim_loss": 0.028966546058654785 + }, + { + "epoch": 1.205853272691319, + "step": 12196, + "train/total_loss": 0.16419680416584015 + }, + { + "entropy": 9.69385814666748, + "epoch": 1.2059521455408344, + "mean_token_accuracy": 0.8931451439857483, + "num_tokens": 2373871.0, + "step": 12197, + "train/ce_loss": 1.8120207414540346e-06 + }, + { + "epoch": 1.2059521455408344, + "step": 12197, + "train/sim_loss": 0.0604405403137207 + }, + { + "epoch": 1.2059521455408344, + "step": 12197, + "train/total_loss": 0.06044072285294533 + }, + { + "entropy": 9.032966613769531, + "epoch": 1.2060510183903501, + "mean_token_accuracy": 0.8331360816955566, + "num_tokens": 2384369.0, + "step": 12198, + "train/ce_loss": 0.43658390641212463 + }, + { + "epoch": 1.2060510183903501, + "step": 12198, + "train/sim_loss": 0.027806222438812256 + }, + { + "epoch": 1.2060510183903501, + "step": 12198, + "train/total_loss": 0.07146461308002472 + }, + { + "entropy": 9.231586456298828, + "epoch": 1.2061498912398656, + "mean_token_accuracy": 0.8705103993415833, + "num_tokens": 2398428.0, + "step": 12199, + "train/ce_loss": 0.26593562960624695 + }, + { + "epoch": 1.2061498912398656, + "step": 12199, + "train/sim_loss": 0.044879794120788574 + }, + { + "epoch": 1.2061498912398656, + "step": 12199, + "train/total_loss": 0.07147336006164551 + }, + { + "epoch": 1.206248764089381, + "grad_norm": 0.5902646780014038, + "learning_rate": 6.986352173268062e-06, + "loss": 0.0901, + "step": 12200 + }, + { + "entropy": 8.856341361999512, + "epoch": 1.206248764089381, + "mean_token_accuracy": 0.853881299495697, + "num_tokens": 2404564.0, + "step": 12200, + "train/ce_loss": 0.5701154470443726 + }, + { + "epoch": 1.206248764089381, + "step": 12200, + "train/sim_loss": 0.027989983558654785 + }, + { + "epoch": 1.206248764089381, + "step": 12200, + "train/total_loss": 0.08500152826309204 + }, + { + "entropy": 9.723925590515137, + "epoch": 1.2063476369388966, + "mean_token_accuracy": 0.8642659187316895, + "num_tokens": 2423476.0, + "step": 12201, + "train/ce_loss": 0.7688032984733582 + }, + { + "epoch": 1.2063476369388966, + "step": 12201, + "train/sim_loss": 0.042231976985931396 + }, + { + "epoch": 1.2063476369388966, + "step": 12201, + "train/total_loss": 0.11911230534315109 + }, + { + "entropy": 9.662643432617188, + "epoch": 1.206446509788412, + "mean_token_accuracy": 0.8672566413879395, + "num_tokens": 2437224.0, + "step": 12202, + "train/ce_loss": 0.14265012741088867 + }, + { + "epoch": 1.206446509788412, + "step": 12202, + "train/sim_loss": 0.020578861236572266 + }, + { + "epoch": 1.206446509788412, + "step": 12202, + "train/total_loss": 0.03484387323260307 + }, + { + "entropy": 9.619638442993164, + "epoch": 1.2065453826379275, + "mean_token_accuracy": 0.820351779460907, + "num_tokens": 2455599.0, + "step": 12203, + "train/ce_loss": 0.49675193428993225 + }, + { + "epoch": 1.2065453826379275, + "step": 12203, + "train/sim_loss": 0.020619750022888184 + }, + { + "epoch": 1.2065453826379275, + "step": 12203, + "train/total_loss": 0.07029494643211365 + }, + { + "entropy": 10.220514297485352, + "epoch": 1.206644255487443, + "mean_token_accuracy": 0.9295154213905334, + "num_tokens": 2471273.0, + "step": 12204, + "train/ce_loss": 2.4659955215611262e-06 + }, + { + "epoch": 1.206644255487443, + "step": 12204, + "train/sim_loss": 0.05857568979263306 + }, + { + "epoch": 1.206644255487443, + "step": 12204, + "train/total_loss": 0.058575935661792755 + }, + { + "entropy": 9.227325439453125, + "epoch": 1.2067431283369587, + "mean_token_accuracy": 0.8931937217712402, + "num_tokens": 2480488.0, + "step": 12205, + "train/ce_loss": 0.07859855890274048 + }, + { + "epoch": 1.2067431283369587, + "step": 12205, + "train/sim_loss": 0.032501935958862305 + }, + { + "epoch": 1.2067431283369587, + "step": 12205, + "train/total_loss": 0.04036179184913635 + }, + { + "entropy": 9.504983901977539, + "epoch": 1.2068420011864742, + "mean_token_accuracy": 0.9038759469985962, + "num_tokens": 2494502.0, + "step": 12206, + "train/ce_loss": 0.6858880519866943 + }, + { + "epoch": 1.2068420011864742, + "step": 12206, + "train/sim_loss": 0.038382649421691895 + }, + { + "epoch": 1.2068420011864742, + "step": 12206, + "train/total_loss": 0.10697145760059357 + }, + { + "entropy": 9.798738479614258, + "epoch": 1.2069408740359897, + "mean_token_accuracy": 0.8977987170219421, + "num_tokens": 2505372.0, + "step": 12207, + "train/ce_loss": 0.5564303398132324 + }, + { + "epoch": 1.2069408740359897, + "step": 12207, + "train/sim_loss": 0.05490124225616455 + }, + { + "epoch": 1.2069408740359897, + "step": 12207, + "train/total_loss": 0.11054427921772003 + }, + { + "entropy": 9.676867485046387, + "epoch": 1.2070397468855052, + "mean_token_accuracy": 0.9287671446800232, + "num_tokens": 2517549.0, + "step": 12208, + "train/ce_loss": 0.4633353352546692 + }, + { + "epoch": 1.2070397468855052, + "step": 12208, + "train/sim_loss": 0.0360679030418396 + }, + { + "epoch": 1.2070397468855052, + "step": 12208, + "train/total_loss": 0.08240143954753876 + }, + { + "entropy": 9.770922660827637, + "epoch": 1.2071386197350207, + "mean_token_accuracy": 0.8837209343910217, + "num_tokens": 2529002.0, + "step": 12209, + "train/ce_loss": 2.6340374006394995e-06 + }, + { + "epoch": 1.2071386197350207, + "step": 12209, + "train/sim_loss": 0.040137290954589844 + }, + { + "epoch": 1.2071386197350207, + "step": 12209, + "train/total_loss": 0.040137555450201035 + }, + { + "entropy": 9.788351058959961, + "epoch": 1.2072374925845364, + "mean_token_accuracy": 0.8784067034721375, + "num_tokens": 2537656.0, + "step": 12210, + "train/ce_loss": 0.5521018505096436 + }, + { + "epoch": 1.2072374925845364, + "step": 12210, + "train/sim_loss": 0.03652787208557129 + }, + { + "epoch": 1.2072374925845364, + "step": 12210, + "train/total_loss": 0.09173806011676788 + }, + { + "entropy": 9.48051643371582, + "epoch": 1.2073363654340519, + "mean_token_accuracy": 0.8264706134796143, + "num_tokens": 2546036.0, + "step": 12211, + "train/ce_loss": 0.7311780452728271 + }, + { + "epoch": 1.2073363654340519, + "step": 12211, + "train/sim_loss": 0.0746464729309082 + }, + { + "epoch": 1.2073363654340519, + "step": 12211, + "train/total_loss": 0.14776428043842316 + }, + { + "entropy": 9.121892929077148, + "epoch": 1.2074352382835674, + "mean_token_accuracy": 0.8902742862701416, + "num_tokens": 2554604.0, + "step": 12212, + "train/ce_loss": 0.33202728629112244 + }, + { + "epoch": 1.2074352382835674, + "step": 12212, + "train/sim_loss": 0.06832319498062134 + }, + { + "epoch": 1.2074352382835674, + "step": 12212, + "train/total_loss": 0.1015259250998497 + }, + { + "entropy": 9.484994888305664, + "epoch": 1.2075341111330828, + "mean_token_accuracy": 0.8528863787651062, + "num_tokens": 2566318.0, + "step": 12213, + "train/ce_loss": 0.6667621731758118 + }, + { + "epoch": 1.2075341111330828, + "step": 12213, + "train/sim_loss": 0.05124872922897339 + }, + { + "epoch": 1.2075341111330828, + "step": 12213, + "train/total_loss": 0.11792495101690292 + }, + { + "entropy": 8.317331314086914, + "epoch": 1.2076329839825983, + "mean_token_accuracy": 0.9128738641738892, + "num_tokens": 2575564.0, + "step": 12214, + "train/ce_loss": 5.778896934316435e-07 + }, + { + "epoch": 1.2076329839825983, + "step": 12214, + "train/sim_loss": 0.02132391929626465 + }, + { + "epoch": 1.2076329839825983, + "step": 12214, + "train/total_loss": 0.021323977038264275 + }, + { + "entropy": 9.249664306640625, + "epoch": 1.2077318568321138, + "mean_token_accuracy": 0.7947434186935425, + "num_tokens": 2590283.0, + "step": 12215, + "train/ce_loss": 0.766125500202179 + }, + { + "epoch": 1.2077318568321138, + "step": 12215, + "train/sim_loss": 0.08211171627044678 + }, + { + "epoch": 1.2077318568321138, + "step": 12215, + "train/total_loss": 0.15872427821159363 + }, + { + "entropy": 9.45874309539795, + "epoch": 1.2078307296816293, + "mean_token_accuracy": 0.8799999952316284, + "num_tokens": 2605682.0, + "step": 12216, + "train/ce_loss": 0.3945120573043823 + }, + { + "epoch": 1.2078307296816293, + "step": 12216, + "train/sim_loss": 0.03697460889816284 + }, + { + "epoch": 1.2078307296816293, + "step": 12216, + "train/total_loss": 0.07642582058906555 + }, + { + "entropy": 9.45685863494873, + "epoch": 1.207929602531145, + "mean_token_accuracy": 0.8338461518287659, + "num_tokens": 2614282.0, + "step": 12217, + "train/ce_loss": 0.9388144016265869 + }, + { + "epoch": 1.207929602531145, + "step": 12217, + "train/sim_loss": 0.06778228282928467 + }, + { + "epoch": 1.207929602531145, + "step": 12217, + "train/total_loss": 0.1616637259721756 + }, + { + "entropy": 10.234899520874023, + "epoch": 1.2080284753806605, + "mean_token_accuracy": 0.8965517282485962, + "num_tokens": 2630177.0, + "step": 12218, + "train/ce_loss": 8.27603287234524e-07 + }, + { + "epoch": 1.2080284753806605, + "step": 12218, + "train/sim_loss": 0.03127121925354004 + }, + { + "epoch": 1.2080284753806605, + "step": 12218, + "train/total_loss": 0.031271301209926605 + }, + { + "entropy": 9.240243911743164, + "epoch": 1.208127348230176, + "mean_token_accuracy": 0.8640776872634888, + "num_tokens": 2638901.0, + "step": 12219, + "train/ce_loss": 0.6288719773292542 + }, + { + "epoch": 1.208127348230176, + "step": 12219, + "train/sim_loss": 0.06481480598449707 + }, + { + "epoch": 1.208127348230176, + "step": 12219, + "train/total_loss": 0.127701997756958 + }, + { + "epoch": 1.2082262210796915, + "grad_norm": 0.5556718111038208, + "learning_rate": 6.981407308510113e-06, + "loss": 0.0798, + "step": 12220 + }, + { + "entropy": 9.278646469116211, + "epoch": 1.2082262210796915, + "mean_token_accuracy": 0.8272095322608948, + "num_tokens": 2652494.0, + "step": 12220, + "train/ce_loss": 1.2371797561645508 + }, + { + "epoch": 1.2082262210796915, + "step": 12220, + "train/sim_loss": 0.07004183530807495 + }, + { + "epoch": 1.2082262210796915, + "step": 12220, + "train/total_loss": 0.19375981390476227 + }, + { + "entropy": 9.267435073852539, + "epoch": 1.208325093929207, + "mean_token_accuracy": 0.8363914489746094, + "num_tokens": 2665288.0, + "step": 12221, + "train/ce_loss": 0.26108524203300476 + }, + { + "epoch": 1.208325093929207, + "step": 12221, + "train/sim_loss": 0.021169543266296387 + }, + { + "epoch": 1.208325093929207, + "step": 12221, + "train/total_loss": 0.04727806895971298 + }, + { + "entropy": 9.38145923614502, + "epoch": 1.2084239667787227, + "mean_token_accuracy": 0.780458390712738, + "num_tokens": 2679497.0, + "step": 12222, + "train/ce_loss": 0.7495726346969604 + }, + { + "epoch": 1.2084239667787227, + "step": 12222, + "train/sim_loss": 0.02008974552154541 + }, + { + "epoch": 1.2084239667787227, + "step": 12222, + "train/total_loss": 0.0950470119714737 + }, + { + "entropy": 9.676080703735352, + "epoch": 1.2085228396282381, + "mean_token_accuracy": 0.8308724761009216, + "num_tokens": 2689411.0, + "step": 12223, + "train/ce_loss": 0.7849757075309753 + }, + { + "epoch": 1.2085228396282381, + "step": 12223, + "train/sim_loss": 0.0590328574180603 + }, + { + "epoch": 1.2085228396282381, + "step": 12223, + "train/total_loss": 0.13753043115139008 + }, + { + "entropy": 9.388385772705078, + "epoch": 1.2086217124777536, + "mean_token_accuracy": 0.8528826832771301, + "num_tokens": 2703927.0, + "step": 12224, + "train/ce_loss": 6.503082659037318e-07 + }, + { + "epoch": 1.2086217124777536, + "step": 12224, + "train/sim_loss": 0.028040528297424316 + }, + { + "epoch": 1.2086217124777536, + "step": 12224, + "train/total_loss": 0.02804059349000454 + }, + { + "entropy": 9.37136173248291, + "epoch": 1.2087205853272691, + "mean_token_accuracy": 0.857740581035614, + "num_tokens": 2715622.0, + "step": 12225, + "train/ce_loss": 0.16458599269390106 + }, + { + "epoch": 1.2087205853272691, + "step": 12225, + "train/sim_loss": 0.04318368434906006 + }, + { + "epoch": 1.2087205853272691, + "step": 12225, + "train/total_loss": 0.059642285108566284 + }, + { + "entropy": 8.962467193603516, + "epoch": 1.2088194581767846, + "mean_token_accuracy": 0.8780251741409302, + "num_tokens": 2727368.0, + "step": 12226, + "train/ce_loss": 0.4291505813598633 + }, + { + "epoch": 1.2088194581767846, + "step": 12226, + "train/sim_loss": 0.03036057949066162 + }, + { + "epoch": 1.2088194581767846, + "step": 12226, + "train/total_loss": 0.07327564060688019 + }, + { + "entropy": 9.62371826171875, + "epoch": 1.2089183310263, + "mean_token_accuracy": 0.7886179089546204, + "num_tokens": 2740371.0, + "step": 12227, + "train/ce_loss": 0.5640349984169006 + }, + { + "epoch": 1.2089183310263, + "step": 12227, + "train/sim_loss": 0.05989682674407959 + }, + { + "epoch": 1.2089183310263, + "step": 12227, + "train/total_loss": 0.11630032956600189 + }, + { + "entropy": 9.263019561767578, + "epoch": 1.2090172038758158, + "mean_token_accuracy": 0.8441203236579895, + "num_tokens": 2754034.0, + "step": 12228, + "train/ce_loss": 0.14474889636039734 + }, + { + "epoch": 1.2090172038758158, + "step": 12228, + "train/sim_loss": 0.03335118293762207 + }, + { + "epoch": 1.2090172038758158, + "step": 12228, + "train/total_loss": 0.047826074063777924 + }, + { + "entropy": 9.469720840454102, + "epoch": 1.2091160767253313, + "mean_token_accuracy": 0.8363858461380005, + "num_tokens": 2767906.0, + "step": 12229, + "train/ce_loss": 0.4620227813720703 + }, + { + "epoch": 1.2091160767253313, + "step": 12229, + "train/sim_loss": 0.019924819469451904 + }, + { + "epoch": 1.2091160767253313, + "step": 12229, + "train/total_loss": 0.06612709909677505 + }, + { + "entropy": 9.606344223022461, + "epoch": 1.2092149495748468, + "mean_token_accuracy": 0.9025521874427795, + "num_tokens": 2775599.0, + "step": 12230, + "train/ce_loss": 3.6591429761756444e-06 + }, + { + "epoch": 1.2092149495748468, + "step": 12230, + "train/sim_loss": 0.030186891555786133 + }, + { + "epoch": 1.2092149495748468, + "step": 12230, + "train/total_loss": 0.030187256634235382 + }, + { + "entropy": 9.429762840270996, + "epoch": 1.2093138224243623, + "mean_token_accuracy": 0.8445229530334473, + "num_tokens": 2791748.0, + "step": 12231, + "train/ce_loss": 6.79012487125874e-07 + }, + { + "epoch": 1.2093138224243623, + "step": 12231, + "train/sim_loss": 0.02401578426361084 + }, + { + "epoch": 1.2093138224243623, + "step": 12231, + "train/total_loss": 0.024015851318836212 + }, + { + "entropy": 9.275896072387695, + "epoch": 1.2094126952738777, + "mean_token_accuracy": 0.8571428656578064, + "num_tokens": 2805746.0, + "step": 12232, + "train/ce_loss": 0.5949133634567261 + }, + { + "epoch": 1.2094126952738777, + "step": 12232, + "train/sim_loss": 0.04895240068435669 + }, + { + "epoch": 1.2094126952738777, + "step": 12232, + "train/total_loss": 0.1084437370300293 + }, + { + "entropy": 9.553608894348145, + "epoch": 1.2095115681233932, + "mean_token_accuracy": 0.8246376514434814, + "num_tokens": 2815075.0, + "step": 12233, + "train/ce_loss": 0.7189662456512451 + }, + { + "epoch": 1.2095115681233932, + "step": 12233, + "train/sim_loss": 0.04627037048339844 + }, + { + "epoch": 1.2095115681233932, + "step": 12233, + "train/total_loss": 0.11816699802875519 + }, + { + "entropy": 9.976279258728027, + "epoch": 1.209610440972909, + "mean_token_accuracy": 0.8242990374565125, + "num_tokens": 2833405.0, + "step": 12234, + "train/ce_loss": 8.333225309797854e-07 + }, + { + "epoch": 1.209610440972909, + "step": 12234, + "train/sim_loss": 0.04210895299911499 + }, + { + "epoch": 1.209610440972909, + "step": 12234, + "train/total_loss": 0.042109034955501556 + }, + { + "entropy": 9.667464256286621, + "epoch": 1.2097093138224244, + "mean_token_accuracy": 0.8497340679168701, + "num_tokens": 2845214.0, + "step": 12235, + "train/ce_loss": 0.17807628214359283 + }, + { + "epoch": 1.2097093138224244, + "step": 12235, + "train/sim_loss": 0.04200410842895508 + }, + { + "epoch": 1.2097093138224244, + "step": 12235, + "train/total_loss": 0.05981173738837242 + }, + { + "entropy": 9.149913787841797, + "epoch": 1.20980818667194, + "mean_token_accuracy": 0.851898729801178, + "num_tokens": 2853336.0, + "step": 12236, + "train/ce_loss": 0.4186973571777344 + }, + { + "epoch": 1.20980818667194, + "step": 12236, + "train/sim_loss": 0.05288374423980713 + }, + { + "epoch": 1.20980818667194, + "step": 12236, + "train/total_loss": 0.09475348144769669 + }, + { + "entropy": 9.496471405029297, + "epoch": 1.2099070595214554, + "mean_token_accuracy": 0.8970588445663452, + "num_tokens": 2869592.0, + "step": 12237, + "train/ce_loss": 0.4630652070045471 + }, + { + "epoch": 1.2099070595214554, + "step": 12237, + "train/sim_loss": 0.01071476936340332 + }, + { + "epoch": 1.2099070595214554, + "step": 12237, + "train/total_loss": 0.05702129006385803 + }, + { + "entropy": 9.644886016845703, + "epoch": 1.2100059323709709, + "mean_token_accuracy": 0.8414815068244934, + "num_tokens": 2886771.0, + "step": 12238, + "train/ce_loss": 0.6650797128677368 + }, + { + "epoch": 1.2100059323709709, + "step": 12238, + "train/sim_loss": 0.038480162620544434 + }, + { + "epoch": 1.2100059323709709, + "step": 12238, + "train/total_loss": 0.10498813539743423 + }, + { + "entropy": 9.302558898925781, + "epoch": 1.2101048052204866, + "mean_token_accuracy": 0.8571428656578064, + "num_tokens": 2897861.0, + "step": 12239, + "train/ce_loss": 0.16932201385498047 + }, + { + "epoch": 1.2101048052204866, + "step": 12239, + "train/sim_loss": 0.0828620195388794 + }, + { + "epoch": 1.2101048052204866, + "step": 12239, + "train/total_loss": 0.09979422390460968 + }, + { + "epoch": 1.210203678070002, + "grad_norm": 0.5084015130996704, + "learning_rate": 6.976462443752165e-06, + "loss": 0.0868, + "step": 12240 + }, + { + "entropy": 9.473886489868164, + "epoch": 1.210203678070002, + "mean_token_accuracy": 0.8002793192863464, + "num_tokens": 2911141.0, + "step": 12240, + "train/ce_loss": 0.6809278130531311 + }, + { + "epoch": 1.210203678070002, + "step": 12240, + "train/sim_loss": 0.048586249351501465 + }, + { + "epoch": 1.210203678070002, + "step": 12240, + "train/total_loss": 0.11667903512716293 + }, + { + "entropy": 9.322026252746582, + "epoch": 1.2103025509195176, + "mean_token_accuracy": 0.8264706134796143, + "num_tokens": 2918490.0, + "step": 12241, + "train/ce_loss": 1.8075364778269432e-06 + }, + { + "epoch": 1.2103025509195176, + "step": 12241, + "train/sim_loss": 0.06574487686157227 + }, + { + "epoch": 1.2103025509195176, + "step": 12241, + "train/total_loss": 0.06574505567550659 + }, + { + "entropy": 9.409202575683594, + "epoch": 1.210401423769033, + "mean_token_accuracy": 0.8352059721946716, + "num_tokens": 2929001.0, + "step": 12242, + "train/ce_loss": 0.5462270975112915 + }, + { + "epoch": 1.210401423769033, + "step": 12242, + "train/sim_loss": 0.046414971351623535 + }, + { + "epoch": 1.210401423769033, + "step": 12242, + "train/total_loss": 0.10103768110275269 + }, + { + "entropy": 9.471015930175781, + "epoch": 1.2105002966185485, + "mean_token_accuracy": 0.8516284823417664, + "num_tokens": 2940257.0, + "step": 12243, + "train/ce_loss": 0.3225162923336029 + }, + { + "epoch": 1.2105002966185485, + "step": 12243, + "train/sim_loss": 0.05386930704116821 + }, + { + "epoch": 1.2105002966185485, + "step": 12243, + "train/total_loss": 0.08612093329429626 + }, + { + "entropy": 9.788667678833008, + "epoch": 1.210599169468064, + "mean_token_accuracy": 0.8198581337928772, + "num_tokens": 2951510.0, + "step": 12244, + "train/ce_loss": 0.4873996675014496 + }, + { + "epoch": 1.210599169468064, + "step": 12244, + "train/sim_loss": 0.07179605960845947 + }, + { + "epoch": 1.210599169468064, + "step": 12244, + "train/total_loss": 0.12053602933883667 + }, + { + "entropy": 9.726455688476562, + "epoch": 1.2106980423175795, + "mean_token_accuracy": 0.8335832357406616, + "num_tokens": 2960305.0, + "step": 12245, + "train/ce_loss": 0.4327826201915741 + }, + { + "epoch": 1.2106980423175795, + "step": 12245, + "train/sim_loss": 0.02301579713821411 + }, + { + "epoch": 1.2106980423175795, + "step": 12245, + "train/total_loss": 0.06629405915737152 + }, + { + "entropy": 9.38766098022461, + "epoch": 1.2107969151670952, + "mean_token_accuracy": 0.909621000289917, + "num_tokens": 2969720.0, + "step": 12246, + "train/ce_loss": 0.2943173944950104 + }, + { + "epoch": 1.2107969151670952, + "step": 12246, + "train/sim_loss": 0.03780180215835571 + }, + { + "epoch": 1.2107969151670952, + "step": 12246, + "train/total_loss": 0.06723354011774063 + }, + { + "entropy": 9.381096839904785, + "epoch": 1.2108957880166107, + "mean_token_accuracy": 0.8319838047027588, + "num_tokens": 2986659.0, + "step": 12247, + "train/ce_loss": 0.5737438201904297 + }, + { + "epoch": 1.2108957880166107, + "step": 12247, + "train/sim_loss": 0.06304645538330078 + }, + { + "epoch": 1.2108957880166107, + "step": 12247, + "train/total_loss": 0.12042084336280823 + }, + { + "entropy": 9.141307830810547, + "epoch": 1.2109946608661262, + "mean_token_accuracy": 0.873576283454895, + "num_tokens": 3002274.0, + "step": 12248, + "train/ce_loss": 0.5204167366027832 + }, + { + "epoch": 1.2109946608661262, + "step": 12248, + "train/sim_loss": 0.021251797676086426 + }, + { + "epoch": 1.2109946608661262, + "step": 12248, + "train/total_loss": 0.07329347729682922 + }, + { + "entropy": 9.434374809265137, + "epoch": 1.2110935337156417, + "mean_token_accuracy": 0.8389355540275574, + "num_tokens": 3011743.0, + "step": 12249, + "train/ce_loss": 0.5472025871276855 + }, + { + "epoch": 1.2110935337156417, + "step": 12249, + "train/sim_loss": 0.03639703989028931 + }, + { + "epoch": 1.2110935337156417, + "step": 12249, + "train/total_loss": 0.09111730009317398 + }, + { + "entropy": 9.559883117675781, + "epoch": 1.2111924065651571, + "mean_token_accuracy": 0.8649425506591797, + "num_tokens": 3024893.0, + "step": 12250, + "train/ce_loss": 0.15959307551383972 + }, + { + "epoch": 1.2111924065651571, + "step": 12250, + "train/sim_loss": 0.03602403402328491 + }, + { + "epoch": 1.2111924065651571, + "step": 12250, + "train/total_loss": 0.051983341574668884 + }, + { + "entropy": 9.502554893493652, + "epoch": 1.2112912794146729, + "mean_token_accuracy": 0.8571428656578064, + "num_tokens": 3035254.0, + "step": 12251, + "train/ce_loss": 0.5642011165618896 + }, + { + "epoch": 1.2112912794146729, + "step": 12251, + "train/sim_loss": 0.04087817668914795 + }, + { + "epoch": 1.2112912794146729, + "step": 12251, + "train/total_loss": 0.09729829430580139 + }, + { + "entropy": 9.543970108032227, + "epoch": 1.2113901522641883, + "mean_token_accuracy": 0.793020486831665, + "num_tokens": 3044314.0, + "step": 12252, + "train/ce_loss": 0.7522476315498352 + }, + { + "epoch": 1.2113901522641883, + "step": 12252, + "train/sim_loss": 0.04063594341278076 + }, + { + "epoch": 1.2113901522641883, + "step": 12252, + "train/total_loss": 0.1158607080578804 + }, + { + "entropy": 9.385408401489258, + "epoch": 1.2114890251137038, + "mean_token_accuracy": 0.897001326084137, + "num_tokens": 3056699.0, + "step": 12253, + "train/ce_loss": 5.265576419333229e-07 + }, + { + "epoch": 1.2114890251137038, + "step": 12253, + "train/sim_loss": 0.04018962383270264 + }, + { + "epoch": 1.2114890251137038, + "step": 12253, + "train/total_loss": 0.040189675986766815 + }, + { + "entropy": 10.100996971130371, + "epoch": 1.2115878979632193, + "mean_token_accuracy": 0.7988929748535156, + "num_tokens": 3067890.0, + "step": 12254, + "train/ce_loss": 6.765283728782379e-07 + }, + { + "epoch": 1.2115878979632193, + "step": 12254, + "train/sim_loss": 0.045752644538879395 + }, + { + "epoch": 1.2115878979632193, + "step": 12254, + "train/total_loss": 0.04575271159410477 + }, + { + "entropy": 10.278759002685547, + "epoch": 1.2116867708127348, + "mean_token_accuracy": 0.9102040529251099, + "num_tokens": 3077476.0, + "step": 12255, + "train/ce_loss": 1.1334028244018555 + }, + { + "epoch": 1.2116867708127348, + "step": 12255, + "train/sim_loss": 0.03492838144302368 + }, + { + "epoch": 1.2116867708127348, + "step": 12255, + "train/total_loss": 0.1482686698436737 + }, + { + "entropy": 9.038031578063965, + "epoch": 1.2117856436622503, + "mean_token_accuracy": 0.8623566031455994, + "num_tokens": 3086456.0, + "step": 12256, + "train/ce_loss": 0.5298771858215332 + }, + { + "epoch": 1.2117856436622503, + "step": 12256, + "train/sim_loss": 0.08001303672790527 + }, + { + "epoch": 1.2117856436622503, + "step": 12256, + "train/total_loss": 0.13300076127052307 + }, + { + "entropy": 9.558409690856934, + "epoch": 1.2118845165117658, + "mean_token_accuracy": 0.8067331910133362, + "num_tokens": 3097288.0, + "step": 12257, + "train/ce_loss": 0.6172279715538025 + }, + { + "epoch": 1.2118845165117658, + "step": 12257, + "train/sim_loss": 0.04998064041137695 + }, + { + "epoch": 1.2118845165117658, + "step": 12257, + "train/total_loss": 0.11170344054698944 + }, + { + "entropy": 9.678045272827148, + "epoch": 1.2119833893612815, + "mean_token_accuracy": 0.8677165508270264, + "num_tokens": 3111820.0, + "step": 12258, + "train/ce_loss": 0.4554758667945862 + }, + { + "epoch": 1.2119833893612815, + "step": 12258, + "train/sim_loss": 0.04017883539199829 + }, + { + "epoch": 1.2119833893612815, + "step": 12258, + "train/total_loss": 0.08572642505168915 + }, + { + "entropy": 9.432576179504395, + "epoch": 1.212082262210797, + "mean_token_accuracy": 0.9056016802787781, + "num_tokens": 3129852.0, + "step": 12259, + "train/ce_loss": 0.1388675421476364 + }, + { + "epoch": 1.212082262210797, + "step": 12259, + "train/sim_loss": 0.05271303653717041 + }, + { + "epoch": 1.212082262210797, + "step": 12259, + "train/total_loss": 0.06659979373216629 + }, + { + "epoch": 1.2121811350603124, + "grad_norm": 0.5102376937866211, + "learning_rate": 6.971517578994214e-06, + "loss": 0.0928, + "step": 12260 + }, + { + "entropy": 9.274757385253906, + "epoch": 1.2121811350603124, + "mean_token_accuracy": 0.8348519206047058, + "num_tokens": 3137617.0, + "step": 12260, + "train/ce_loss": 0.34904998540878296 + }, + { + "epoch": 1.2121811350603124, + "step": 12260, + "train/sim_loss": 0.01555478572845459 + }, + { + "epoch": 1.2121811350603124, + "step": 12260, + "train/total_loss": 0.050459783524274826 + }, + { + "entropy": 9.15090560913086, + "epoch": 1.212280007909828, + "mean_token_accuracy": 0.8342036604881287, + "num_tokens": 3149853.0, + "step": 12261, + "train/ce_loss": 0.3983233571052551 + }, + { + "epoch": 1.212280007909828, + "step": 12261, + "train/sim_loss": 0.055590152740478516 + }, + { + "epoch": 1.212280007909828, + "step": 12261, + "train/total_loss": 0.09542249143123627 + }, + { + "entropy": 9.39038372039795, + "epoch": 1.2123788807593434, + "mean_token_accuracy": 0.8303797245025635, + "num_tokens": 3157999.0, + "step": 12262, + "train/ce_loss": 2.860988956854271e-07 + }, + { + "epoch": 1.2123788807593434, + "step": 12262, + "train/sim_loss": 0.024332165718078613 + }, + { + "epoch": 1.2123788807593434, + "step": 12262, + "train/total_loss": 0.024332193657755852 + }, + { + "entropy": 9.210709571838379, + "epoch": 1.2124777536088591, + "mean_token_accuracy": 0.8363844156265259, + "num_tokens": 3168869.0, + "step": 12263, + "train/ce_loss": 0.4014941453933716 + }, + { + "epoch": 1.2124777536088591, + "step": 12263, + "train/sim_loss": 0.030262887477874756 + }, + { + "epoch": 1.2124777536088591, + "step": 12263, + "train/total_loss": 0.07041230797767639 + }, + { + "entropy": 9.768019676208496, + "epoch": 1.2125766264583746, + "mean_token_accuracy": 0.8154761791229248, + "num_tokens": 3182848.0, + "step": 12264, + "train/ce_loss": 0.5411247611045837 + }, + { + "epoch": 1.2125766264583746, + "step": 12264, + "train/sim_loss": 0.06482458114624023 + }, + { + "epoch": 1.2125766264583746, + "step": 12264, + "train/total_loss": 0.11893706023693085 + }, + { + "entropy": 9.321430206298828, + "epoch": 1.21267549930789, + "mean_token_accuracy": 0.785932719707489, + "num_tokens": 3195638.0, + "step": 12265, + "train/ce_loss": 0.6612563133239746 + }, + { + "epoch": 1.21267549930789, + "step": 12265, + "train/sim_loss": 0.041078805923461914 + }, + { + "epoch": 1.21267549930789, + "step": 12265, + "train/total_loss": 0.10720443725585938 + }, + { + "entropy": 9.278233528137207, + "epoch": 1.2127743721574056, + "mean_token_accuracy": 0.8101266026496887, + "num_tokens": 3203963.0, + "step": 12266, + "train/ce_loss": 0.5850991606712341 + }, + { + "epoch": 1.2127743721574056, + "step": 12266, + "train/sim_loss": 0.027595937252044678 + }, + { + "epoch": 1.2127743721574056, + "step": 12266, + "train/total_loss": 0.08610585331916809 + }, + { + "entropy": 9.924612045288086, + "epoch": 1.212873245006921, + "mean_token_accuracy": 0.875, + "num_tokens": 3212840.0, + "step": 12267, + "train/ce_loss": 0.41301417350769043 + }, + { + "epoch": 1.212873245006921, + "step": 12267, + "train/sim_loss": 0.07111793756484985 + }, + { + "epoch": 1.212873245006921, + "step": 12267, + "train/total_loss": 0.11241935193538666 + }, + { + "entropy": 9.427352905273438, + "epoch": 1.2129721178564365, + "mean_token_accuracy": 0.8047091364860535, + "num_tokens": 3224391.0, + "step": 12268, + "train/ce_loss": 0.7264388799667358 + }, + { + "epoch": 1.2129721178564365, + "step": 12268, + "train/sim_loss": 0.050833821296691895 + }, + { + "epoch": 1.2129721178564365, + "step": 12268, + "train/total_loss": 0.12347771227359772 + }, + { + "entropy": 9.322540283203125, + "epoch": 1.213070990705952, + "mean_token_accuracy": 0.8223289251327515, + "num_tokens": 3239398.0, + "step": 12269, + "train/ce_loss": 0.7172441482543945 + }, + { + "epoch": 1.213070990705952, + "step": 12269, + "train/sim_loss": 0.09057271480560303 + }, + { + "epoch": 1.213070990705952, + "step": 12269, + "train/total_loss": 0.16229712963104248 + }, + { + "entropy": 9.730963706970215, + "epoch": 1.2131698635554677, + "mean_token_accuracy": 0.8208092451095581, + "num_tokens": 3253291.0, + "step": 12270, + "train/ce_loss": 0.6440098285675049 + }, + { + "epoch": 1.2131698635554677, + "step": 12270, + "train/sim_loss": 0.03764986991882324 + }, + { + "epoch": 1.2131698635554677, + "step": 12270, + "train/total_loss": 0.10205085575580597 + }, + { + "entropy": 9.614448547363281, + "epoch": 1.2132687364049832, + "mean_token_accuracy": 0.908180296421051, + "num_tokens": 3266587.0, + "step": 12271, + "train/ce_loss": 0.5902631878852844 + }, + { + "epoch": 1.2132687364049832, + "step": 12271, + "train/sim_loss": 0.06152904033660889 + }, + { + "epoch": 1.2132687364049832, + "step": 12271, + "train/total_loss": 0.12055535614490509 + }, + { + "entropy": 10.11133098602295, + "epoch": 1.2133676092544987, + "mean_token_accuracy": 0.8509749174118042, + "num_tokens": 3279540.0, + "step": 12272, + "train/ce_loss": 0.5651184916496277 + }, + { + "epoch": 1.2133676092544987, + "step": 12272, + "train/sim_loss": 0.04851233959197998 + }, + { + "epoch": 1.2133676092544987, + "step": 12272, + "train/total_loss": 0.10502418875694275 + }, + { + "entropy": 9.585952758789062, + "epoch": 1.2134664821040142, + "mean_token_accuracy": 0.8866906762123108, + "num_tokens": 3291489.0, + "step": 12273, + "train/ce_loss": 0.27247220277786255 + }, + { + "epoch": 1.2134664821040142, + "step": 12273, + "train/sim_loss": 0.03867900371551514 + }, + { + "epoch": 1.2134664821040142, + "step": 12273, + "train/total_loss": 0.06592622399330139 + }, + { + "entropy": 9.623138427734375, + "epoch": 1.2135653549535297, + "mean_token_accuracy": 0.744705855846405, + "num_tokens": 3306444.0, + "step": 12274, + "train/ce_loss": 0.7882283926010132 + }, + { + "epoch": 1.2135653549535297, + "step": 12274, + "train/sim_loss": 0.01895582675933838 + }, + { + "epoch": 1.2135653549535297, + "step": 12274, + "train/total_loss": 0.09777867048978806 + }, + { + "entropy": 9.658477783203125, + "epoch": 1.2136642278030454, + "mean_token_accuracy": 0.8786231875419617, + "num_tokens": 3319412.0, + "step": 12275, + "train/ce_loss": 0.5868175625801086 + }, + { + "epoch": 1.2136642278030454, + "step": 12275, + "train/sim_loss": 0.045105159282684326 + }, + { + "epoch": 1.2136642278030454, + "step": 12275, + "train/total_loss": 0.10378691554069519 + }, + { + "entropy": 9.666300773620605, + "epoch": 1.2137631006525609, + "mean_token_accuracy": 0.8773841857910156, + "num_tokens": 3333847.0, + "step": 12276, + "train/ce_loss": 0.23846760392189026 + }, + { + "epoch": 1.2137631006525609, + "step": 12276, + "train/sim_loss": 0.03780055046081543 + }, + { + "epoch": 1.2137631006525609, + "step": 12276, + "train/total_loss": 0.061647310853004456 + }, + { + "entropy": 9.26974868774414, + "epoch": 1.2138619735020764, + "mean_token_accuracy": 0.8442105054855347, + "num_tokens": 3353314.0, + "step": 12277, + "train/ce_loss": 0.4140678942203522 + }, + { + "epoch": 1.2138619735020764, + "step": 12277, + "train/sim_loss": 0.030120134353637695 + }, + { + "epoch": 1.2138619735020764, + "step": 12277, + "train/total_loss": 0.07152692973613739 + }, + { + "entropy": 8.6697998046875, + "epoch": 1.2139608463515918, + "mean_token_accuracy": 0.8168462514877319, + "num_tokens": 3362538.0, + "step": 12278, + "train/ce_loss": 0.7147200107574463 + }, + { + "epoch": 1.2139608463515918, + "step": 12278, + "train/sim_loss": 0.08428287506103516 + }, + { + "epoch": 1.2139608463515918, + "step": 12278, + "train/total_loss": 0.15575487911701202 + }, + { + "entropy": 9.49748420715332, + "epoch": 1.2140597192011073, + "mean_token_accuracy": 0.8317757248878479, + "num_tokens": 3372842.0, + "step": 12279, + "train/ce_loss": 0.27470070123672485 + }, + { + "epoch": 1.2140597192011073, + "step": 12279, + "train/sim_loss": 0.056224703788757324 + }, + { + "epoch": 1.2140597192011073, + "step": 12279, + "train/total_loss": 0.08369477093219757 + }, + { + "epoch": 1.2141585920506228, + "grad_norm": 0.6940965056419373, + "learning_rate": 6.966572714236266e-06, + "loss": 0.0948, + "step": 12280 + }, + { + "entropy": 8.92327880859375, + "epoch": 1.2141585920506228, + "mean_token_accuracy": 0.886929452419281, + "num_tokens": 3380421.0, + "step": 12280, + "train/ce_loss": 0.6108546853065491 + }, + { + "epoch": 1.2141585920506228, + "step": 12280, + "train/sim_loss": 0.015032052993774414 + }, + { + "epoch": 1.2141585920506228, + "step": 12280, + "train/total_loss": 0.07611752301454544 + }, + { + "entropy": 9.093962669372559, + "epoch": 1.2142574649001383, + "mean_token_accuracy": 0.7895362377166748, + "num_tokens": 3390966.0, + "step": 12281, + "train/ce_loss": 0.5347420573234558 + }, + { + "epoch": 1.2142574649001383, + "step": 12281, + "train/sim_loss": 0.03823888301849365 + }, + { + "epoch": 1.2142574649001383, + "step": 12281, + "train/total_loss": 0.091713085770607 + }, + { + "entropy": 9.469560623168945, + "epoch": 1.214356337749654, + "mean_token_accuracy": 0.8153098225593567, + "num_tokens": 3400477.0, + "step": 12282, + "train/ce_loss": 0.7187314033508301 + }, + { + "epoch": 1.214356337749654, + "step": 12282, + "train/sim_loss": 0.05461311340332031 + }, + { + "epoch": 1.214356337749654, + "step": 12282, + "train/total_loss": 0.12648625671863556 + }, + { + "entropy": 9.047447204589844, + "epoch": 1.2144552105991695, + "mean_token_accuracy": 0.8236074447631836, + "num_tokens": 3411679.0, + "step": 12283, + "train/ce_loss": 3.933542416234559e-07 + }, + { + "epoch": 1.2144552105991695, + "step": 12283, + "train/sim_loss": 0.030900418758392334 + }, + { + "epoch": 1.2144552105991695, + "step": 12283, + "train/total_loss": 0.030900457873940468 + }, + { + "entropy": 9.238042831420898, + "epoch": 1.214554083448685, + "mean_token_accuracy": 0.8476977348327637, + "num_tokens": 3421416.0, + "step": 12284, + "train/ce_loss": 0.46117132902145386 + }, + { + "epoch": 1.214554083448685, + "step": 12284, + "train/sim_loss": 0.01477748155593872 + }, + { + "epoch": 1.214554083448685, + "step": 12284, + "train/total_loss": 0.060894615948200226 + }, + { + "entropy": 9.205521583557129, + "epoch": 1.2146529562982005, + "mean_token_accuracy": 0.8563829660415649, + "num_tokens": 3428440.0, + "step": 12285, + "train/ce_loss": 0.36273786425590515 + }, + { + "epoch": 1.2146529562982005, + "step": 12285, + "train/sim_loss": 0.03350633382797241 + }, + { + "epoch": 1.2146529562982005, + "step": 12285, + "train/total_loss": 0.0697801262140274 + }, + { + "entropy": 9.45505142211914, + "epoch": 1.214751829147716, + "mean_token_accuracy": 0.8297872543334961, + "num_tokens": 3444198.0, + "step": 12286, + "train/ce_loss": 0.445887953042984 + }, + { + "epoch": 1.214751829147716, + "step": 12286, + "train/sim_loss": 0.010697484016418457 + }, + { + "epoch": 1.214751829147716, + "step": 12286, + "train/total_loss": 0.05528628081083298 + }, + { + "entropy": 9.909345626831055, + "epoch": 1.2148507019972317, + "mean_token_accuracy": 0.8648648858070374, + "num_tokens": 3457507.0, + "step": 12287, + "train/ce_loss": 0.47901347279548645 + }, + { + "epoch": 1.2148507019972317, + "step": 12287, + "train/sim_loss": 0.02479785680770874 + }, + { + "epoch": 1.2148507019972317, + "step": 12287, + "train/total_loss": 0.07269920408725739 + }, + { + "entropy": 9.689492225646973, + "epoch": 1.2149495748467471, + "mean_token_accuracy": 0.8455284833908081, + "num_tokens": 3469132.0, + "step": 12288, + "train/ce_loss": 1.2123879287173622e-06 + }, + { + "epoch": 1.2149495748467471, + "step": 12288, + "train/sim_loss": 0.026602566242218018 + }, + { + "epoch": 1.2149495748467471, + "step": 12288, + "train/total_loss": 0.026602687314152718 + }, + { + "entropy": 9.80856704711914, + "epoch": 1.2150484476962626, + "mean_token_accuracy": 0.8174341917037964, + "num_tokens": 3488849.0, + "step": 12289, + "train/ce_loss": 1.0022011995315552 + }, + { + "epoch": 1.2150484476962626, + "step": 12289, + "train/sim_loss": 0.05867576599121094 + }, + { + "epoch": 1.2150484476962626, + "step": 12289, + "train/total_loss": 0.15889587998390198 + }, + { + "entropy": 10.091397285461426, + "epoch": 1.2151473205457781, + "mean_token_accuracy": 0.9075000286102295, + "num_tokens": 3502872.0, + "step": 12290, + "train/ce_loss": 5.257066959529766e-07 + }, + { + "epoch": 1.2151473205457781, + "step": 12290, + "train/sim_loss": 0.01393425464630127 + }, + { + "epoch": 1.2151473205457781, + "step": 12290, + "train/total_loss": 0.013934306800365448 + }, + { + "entropy": 9.62080192565918, + "epoch": 1.2152461933952936, + "mean_token_accuracy": 0.8434504866600037, + "num_tokens": 3517919.0, + "step": 12291, + "train/ce_loss": 9.833385092861135e-07 + }, + { + "epoch": 1.2152461933952936, + "step": 12291, + "train/sim_loss": 0.04971814155578613 + }, + { + "epoch": 1.2152461933952936, + "step": 12291, + "train/total_loss": 0.04971823841333389 + }, + { + "entropy": 9.845029830932617, + "epoch": 1.215345066244809, + "mean_token_accuracy": 0.8502673506736755, + "num_tokens": 3528232.0, + "step": 12292, + "train/ce_loss": 2.923897852724622e-07 + }, + { + "epoch": 1.215345066244809, + "step": 12292, + "train/sim_loss": 0.019643545150756836 + }, + { + "epoch": 1.215345066244809, + "step": 12292, + "train/total_loss": 0.019643574953079224 + }, + { + "entropy": 8.985353469848633, + "epoch": 1.2154439390943246, + "mean_token_accuracy": 0.8337730765342712, + "num_tokens": 3541277.0, + "step": 12293, + "train/ce_loss": 0.42083829641342163 + }, + { + "epoch": 1.2154439390943246, + "step": 12293, + "train/sim_loss": 0.08484834432601929 + }, + { + "epoch": 1.2154439390943246, + "step": 12293, + "train/total_loss": 0.12693217396736145 + }, + { + "entropy": 9.643804550170898, + "epoch": 1.2155428119438403, + "mean_token_accuracy": 0.9222462177276611, + "num_tokens": 3549435.0, + "step": 12294, + "train/ce_loss": 0.3274211287498474 + }, + { + "epoch": 1.2155428119438403, + "step": 12294, + "train/sim_loss": 0.029454171657562256 + }, + { + "epoch": 1.2155428119438403, + "step": 12294, + "train/total_loss": 0.062196284532547 + }, + { + "entropy": 9.928210258483887, + "epoch": 1.2156416847933558, + "mean_token_accuracy": 0.8674521446228027, + "num_tokens": 3560138.0, + "step": 12295, + "train/ce_loss": 1.080005858966615e-06 + }, + { + "epoch": 1.2156416847933558, + "step": 12295, + "train/sim_loss": 0.05282461643218994 + }, + { + "epoch": 1.2156416847933558, + "step": 12295, + "train/total_loss": 0.0528247244656086 + }, + { + "entropy": 8.686418533325195, + "epoch": 1.2157405576428713, + "mean_token_accuracy": 0.8437843918800354, + "num_tokens": 3571160.0, + "step": 12296, + "train/ce_loss": 0.5691115260124207 + }, + { + "epoch": 1.2157405576428713, + "step": 12296, + "train/sim_loss": 0.05983102321624756 + }, + { + "epoch": 1.2157405576428713, + "step": 12296, + "train/total_loss": 0.11674217879772186 + }, + { + "entropy": 8.719575881958008, + "epoch": 1.2158394304923867, + "mean_token_accuracy": 0.8511777520179749, + "num_tokens": 3582563.0, + "step": 12297, + "train/ce_loss": 0.2600997984409332 + }, + { + "epoch": 1.2158394304923867, + "step": 12297, + "train/sim_loss": 0.01802980899810791 + }, + { + "epoch": 1.2158394304923867, + "step": 12297, + "train/total_loss": 0.04403978958725929 + }, + { + "entropy": 9.178729057312012, + "epoch": 1.2159383033419022, + "mean_token_accuracy": 0.8883048892021179, + "num_tokens": 3592495.0, + "step": 12298, + "train/ce_loss": 5.714357484976063e-07 + }, + { + "epoch": 1.2159383033419022, + "step": 12298, + "train/sim_loss": 0.03861212730407715 + }, + { + "epoch": 1.2159383033419022, + "step": 12298, + "train/total_loss": 0.038612183183431625 + }, + { + "entropy": 9.807744979858398, + "epoch": 1.216037176191418, + "mean_token_accuracy": 0.7743467688560486, + "num_tokens": 3612534.0, + "step": 12299, + "train/ce_loss": 0.5408740639686584 + }, + { + "epoch": 1.216037176191418, + "step": 12299, + "train/sim_loss": 0.07960867881774902 + }, + { + "epoch": 1.216037176191418, + "step": 12299, + "train/total_loss": 0.1336960792541504 + }, + { + "epoch": 1.2161360490409334, + "grad_norm": 0.6871027946472168, + "learning_rate": 6.961627849478317e-06, + "loss": 0.0876, + "step": 12300 + }, + { + "entropy": 9.384878158569336, + "epoch": 1.2161360490409334, + "mean_token_accuracy": 0.8629689812660217, + "num_tokens": 3624189.0, + "step": 12300, + "train/ce_loss": 0.43817102909088135 + }, + { + "epoch": 1.2161360490409334, + "step": 12300, + "train/sim_loss": 0.06493651866912842 + }, + { + "epoch": 1.2161360490409334, + "step": 12300, + "train/total_loss": 0.10875362157821655 + }, + { + "entropy": 9.708819389343262, + "epoch": 1.216234921890449, + "mean_token_accuracy": 0.8342939615249634, + "num_tokens": 3633925.0, + "step": 12301, + "train/ce_loss": 2.3566822449083702e-07 + }, + { + "epoch": 1.216234921890449, + "step": 12301, + "train/sim_loss": 0.012864291667938232 + }, + { + "epoch": 1.216234921890449, + "step": 12301, + "train/total_loss": 0.012864314951002598 + }, + { + "entropy": 9.553540229797363, + "epoch": 1.2163337947399644, + "mean_token_accuracy": 0.9016602635383606, + "num_tokens": 3644741.0, + "step": 12302, + "train/ce_loss": 0.19168874621391296 + }, + { + "epoch": 1.2163337947399644, + "step": 12302, + "train/sim_loss": 0.052059948444366455 + }, + { + "epoch": 1.2163337947399644, + "step": 12302, + "train/total_loss": 0.07122882455587387 + }, + { + "entropy": 9.515405654907227, + "epoch": 1.2164326675894799, + "mean_token_accuracy": 0.8718905448913574, + "num_tokens": 3655100.0, + "step": 12303, + "train/ce_loss": 0.5175080299377441 + }, + { + "epoch": 1.2164326675894799, + "step": 12303, + "train/sim_loss": 0.04464101791381836 + }, + { + "epoch": 1.2164326675894799, + "step": 12303, + "train/total_loss": 0.09639182686805725 + }, + { + "entropy": 9.340581893920898, + "epoch": 1.2165315404389954, + "mean_token_accuracy": 0.8656716346740723, + "num_tokens": 3663892.0, + "step": 12304, + "train/ce_loss": 0.4172906279563904 + }, + { + "epoch": 1.2165315404389954, + "step": 12304, + "train/sim_loss": 0.019451022148132324 + }, + { + "epoch": 1.2165315404389954, + "step": 12304, + "train/total_loss": 0.06118008494377136 + }, + { + "entropy": 9.869073867797852, + "epoch": 1.216630413288511, + "mean_token_accuracy": 0.8770833611488342, + "num_tokens": 3673938.0, + "step": 12305, + "train/ce_loss": 0.923169732093811 + }, + { + "epoch": 1.216630413288511, + "step": 12305, + "train/sim_loss": 0.08020496368408203 + }, + { + "epoch": 1.216630413288511, + "step": 12305, + "train/total_loss": 0.1725219488143921 + }, + { + "entropy": 9.676857948303223, + "epoch": 1.2167292861380266, + "mean_token_accuracy": 0.8607407212257385, + "num_tokens": 3691108.0, + "step": 12306, + "train/ce_loss": 0.8635454773902893 + }, + { + "epoch": 1.2167292861380266, + "step": 12306, + "train/sim_loss": 0.057217419147491455 + }, + { + "epoch": 1.2167292861380266, + "step": 12306, + "train/total_loss": 0.14357197284698486 + }, + { + "entropy": 8.871986389160156, + "epoch": 1.216828158987542, + "mean_token_accuracy": 0.8072654008865356, + "num_tokens": 3699974.0, + "step": 12307, + "train/ce_loss": 0.6024616956710815 + }, + { + "epoch": 1.216828158987542, + "step": 12307, + "train/sim_loss": 0.035323381423950195 + }, + { + "epoch": 1.216828158987542, + "step": 12307, + "train/total_loss": 0.09556955099105835 + }, + { + "entropy": 9.080808639526367, + "epoch": 1.2169270318370575, + "mean_token_accuracy": 0.8386713862419128, + "num_tokens": 3710211.0, + "step": 12308, + "train/ce_loss": 0.3975769579410553 + }, + { + "epoch": 1.2169270318370575, + "step": 12308, + "train/sim_loss": 0.11464095115661621 + }, + { + "epoch": 1.2169270318370575, + "step": 12308, + "train/total_loss": 0.15439864993095398 + }, + { + "entropy": 9.520320892333984, + "epoch": 1.217025904686573, + "mean_token_accuracy": 0.8543859720230103, + "num_tokens": 3726257.0, + "step": 12309, + "train/ce_loss": 0.6950336694717407 + }, + { + "epoch": 1.217025904686573, + "step": 12309, + "train/sim_loss": 0.022997140884399414 + }, + { + "epoch": 1.217025904686573, + "step": 12309, + "train/total_loss": 0.09250050783157349 + }, + { + "entropy": 9.331123352050781, + "epoch": 1.2171247775360885, + "mean_token_accuracy": 0.8445322513580322, + "num_tokens": 3744054.0, + "step": 12310, + "train/ce_loss": 0.12932337820529938 + }, + { + "epoch": 1.2171247775360885, + "step": 12310, + "train/sim_loss": 0.03776240348815918 + }, + { + "epoch": 1.2171247775360885, + "step": 12310, + "train/total_loss": 0.05069474130868912 + }, + { + "entropy": 9.24036979675293, + "epoch": 1.2172236503856042, + "mean_token_accuracy": 0.8433734774589539, + "num_tokens": 3756212.0, + "step": 12311, + "train/ce_loss": 0.5036128759384155 + }, + { + "epoch": 1.2172236503856042, + "step": 12311, + "train/sim_loss": 0.03628057241439819 + }, + { + "epoch": 1.2172236503856042, + "step": 12311, + "train/total_loss": 0.08664186298847198 + }, + { + "entropy": 9.036075592041016, + "epoch": 1.2173225232351197, + "mean_token_accuracy": 0.8622908592224121, + "num_tokens": 3767984.0, + "step": 12312, + "train/ce_loss": 0.3991820514202118 + }, + { + "epoch": 1.2173225232351197, + "step": 12312, + "train/sim_loss": 0.03398585319519043 + }, + { + "epoch": 1.2173225232351197, + "step": 12312, + "train/total_loss": 0.07390405982732773 + }, + { + "entropy": 9.673563003540039, + "epoch": 1.2174213960846352, + "mean_token_accuracy": 0.8912280797958374, + "num_tokens": 3786623.0, + "step": 12313, + "train/ce_loss": 0.20584052801132202 + }, + { + "epoch": 1.2174213960846352, + "step": 12313, + "train/sim_loss": 0.06562995910644531 + }, + { + "epoch": 1.2174213960846352, + "step": 12313, + "train/total_loss": 0.08621401339769363 + }, + { + "entropy": 9.491785049438477, + "epoch": 1.2175202689341507, + "mean_token_accuracy": 0.8531540632247925, + "num_tokens": 3799215.0, + "step": 12314, + "train/ce_loss": 0.4801539182662964 + }, + { + "epoch": 1.2175202689341507, + "step": 12314, + "train/sim_loss": 0.04869621992111206 + }, + { + "epoch": 1.2175202689341507, + "step": 12314, + "train/total_loss": 0.09671161323785782 + }, + { + "entropy": 9.631752967834473, + "epoch": 1.2176191417836661, + "mean_token_accuracy": 0.8176100850105286, + "num_tokens": 3812642.0, + "step": 12315, + "train/ce_loss": 0.3101538419723511 + }, + { + "epoch": 1.2176191417836661, + "step": 12315, + "train/sim_loss": 0.0314178466796875 + }, + { + "epoch": 1.2176191417836661, + "step": 12315, + "train/total_loss": 0.06243323162198067 + }, + { + "entropy": 8.960750579833984, + "epoch": 1.2177180146331816, + "mean_token_accuracy": 0.8539215922355652, + "num_tokens": 3825185.0, + "step": 12316, + "train/ce_loss": 0.34747081995010376 + }, + { + "epoch": 1.2177180146331816, + "step": 12316, + "train/sim_loss": 0.017311692237854004 + }, + { + "epoch": 1.2177180146331816, + "step": 12316, + "train/total_loss": 0.05205877497792244 + }, + { + "entropy": 9.547222137451172, + "epoch": 1.2178168874826973, + "mean_token_accuracy": 0.8421807885169983, + "num_tokens": 3836967.0, + "step": 12317, + "train/ce_loss": 0.6304792165756226 + }, + { + "epoch": 1.2178168874826973, + "step": 12317, + "train/sim_loss": 0.06548649072647095 + }, + { + "epoch": 1.2178168874826973, + "step": 12317, + "train/total_loss": 0.12853440642356873 + }, + { + "entropy": 9.670757293701172, + "epoch": 1.2179157603322128, + "mean_token_accuracy": 0.8216957449913025, + "num_tokens": 3849991.0, + "step": 12318, + "train/ce_loss": 0.4450819790363312 + }, + { + "epoch": 1.2179157603322128, + "step": 12318, + "train/sim_loss": 0.01151418685913086 + }, + { + "epoch": 1.2179157603322128, + "step": 12318, + "train/total_loss": 0.056022386997938156 + }, + { + "entropy": 9.392099380493164, + "epoch": 1.2180146331817283, + "mean_token_accuracy": 0.8558897376060486, + "num_tokens": 3860909.0, + "step": 12319, + "train/ce_loss": 0.5241879820823669 + }, + { + "epoch": 1.2180146331817283, + "step": 12319, + "train/sim_loss": 0.041209518909454346 + }, + { + "epoch": 1.2180146331817283, + "step": 12319, + "train/total_loss": 0.09362831711769104 + }, + { + "epoch": 1.2181135060312438, + "grad_norm": 0.5880166888237, + "learning_rate": 6.956682984720368e-06, + "loss": 0.0863, + "step": 12320 + }, + { + "entropy": 9.763325691223145, + "epoch": 1.2181135060312438, + "mean_token_accuracy": 0.8939759135246277, + "num_tokens": 3871964.0, + "step": 12320, + "train/ce_loss": 5.836881769027968e-07 + }, + { + "epoch": 1.2181135060312438, + "step": 12320, + "train/sim_loss": 0.027262449264526367 + }, + { + "epoch": 1.2181135060312438, + "step": 12320, + "train/total_loss": 0.027262507006525993 + }, + { + "entropy": 8.8307523727417, + "epoch": 1.2182123788807593, + "mean_token_accuracy": 0.890641450881958, + "num_tokens": 3883191.0, + "step": 12321, + "train/ce_loss": 0.3308497369289398 + }, + { + "epoch": 1.2182123788807593, + "step": 12321, + "train/sim_loss": 0.013187229633331299 + }, + { + "epoch": 1.2182123788807593, + "step": 12321, + "train/total_loss": 0.04627220332622528 + }, + { + "entropy": 9.367931365966797, + "epoch": 1.2183112517302748, + "mean_token_accuracy": 0.8479729890823364, + "num_tokens": 3898406.0, + "step": 12322, + "train/ce_loss": 0.3847808837890625 + }, + { + "epoch": 1.2183112517302748, + "step": 12322, + "train/sim_loss": 0.03282594680786133 + }, + { + "epoch": 1.2183112517302748, + "step": 12322, + "train/total_loss": 0.07130403816699982 + }, + { + "entropy": 9.760305404663086, + "epoch": 1.2184101245797905, + "mean_token_accuracy": 0.7850208282470703, + "num_tokens": 3910609.0, + "step": 12323, + "train/ce_loss": 0.5534500479698181 + }, + { + "epoch": 1.2184101245797905, + "step": 12323, + "train/sim_loss": 0.07405924797058105 + }, + { + "epoch": 1.2184101245797905, + "step": 12323, + "train/total_loss": 0.1294042468070984 + }, + { + "entropy": 9.584075927734375, + "epoch": 1.218508997429306, + "mean_token_accuracy": 0.8181818127632141, + "num_tokens": 3925180.0, + "step": 12324, + "train/ce_loss": 0.259078711271286 + }, + { + "epoch": 1.218508997429306, + "step": 12324, + "train/sim_loss": 0.039851605892181396 + }, + { + "epoch": 1.218508997429306, + "step": 12324, + "train/total_loss": 0.06575947999954224 + }, + { + "entropy": 8.97484016418457, + "epoch": 1.2186078702788214, + "mean_token_accuracy": 0.8784722089767456, + "num_tokens": 3934245.0, + "step": 12325, + "train/ce_loss": 0.20682069659233093 + }, + { + "epoch": 1.2186078702788214, + "step": 12325, + "train/sim_loss": 0.01220470666885376 + }, + { + "epoch": 1.2186078702788214, + "step": 12325, + "train/total_loss": 0.03288677707314491 + }, + { + "entropy": 9.67483139038086, + "epoch": 1.218706743128337, + "mean_token_accuracy": 0.8971630930900574, + "num_tokens": 3950804.0, + "step": 12326, + "train/ce_loss": 0.7821524143218994 + }, + { + "epoch": 1.218706743128337, + "step": 12326, + "train/sim_loss": 0.025313735008239746 + }, + { + "epoch": 1.218706743128337, + "step": 12326, + "train/total_loss": 0.10352897644042969 + }, + { + "entropy": 10.071149826049805, + "epoch": 1.2188056159778524, + "mean_token_accuracy": 0.9163879752159119, + "num_tokens": 3960701.0, + "step": 12327, + "train/ce_loss": 3.254604280300555e-06 + }, + { + "epoch": 1.2188056159778524, + "step": 12327, + "train/sim_loss": 0.030878722667694092 + }, + { + "epoch": 1.2188056159778524, + "step": 12327, + "train/total_loss": 0.030879048630595207 + }, + { + "entropy": 9.022608757019043, + "epoch": 1.2189044888273681, + "mean_token_accuracy": 0.8713991641998291, + "num_tokens": 3970036.0, + "step": 12328, + "train/ce_loss": 0.5458477735519409 + }, + { + "epoch": 1.2189044888273681, + "step": 12328, + "train/sim_loss": 0.10224282741546631 + }, + { + "epoch": 1.2189044888273681, + "step": 12328, + "train/total_loss": 0.15682759881019592 + }, + { + "entropy": 9.50587272644043, + "epoch": 1.2190033616768836, + "mean_token_accuracy": 0.8327939510345459, + "num_tokens": 3985623.0, + "step": 12329, + "train/ce_loss": 0.6517170667648315 + }, + { + "epoch": 1.2190033616768836, + "step": 12329, + "train/sim_loss": 0.03561145067214966 + }, + { + "epoch": 1.2190033616768836, + "step": 12329, + "train/total_loss": 0.10078316181898117 + }, + { + "entropy": 9.867895126342773, + "epoch": 1.219102234526399, + "mean_token_accuracy": 0.7835051417350769, + "num_tokens": 4000135.0, + "step": 12330, + "train/ce_loss": 0.535205066204071 + }, + { + "epoch": 1.219102234526399, + "step": 12330, + "train/sim_loss": 0.05292689800262451 + }, + { + "epoch": 1.219102234526399, + "step": 12330, + "train/total_loss": 0.10644740611314774 + }, + { + "entropy": 9.227341651916504, + "epoch": 1.2192011073759146, + "mean_token_accuracy": 0.8117977380752563, + "num_tokens": 4011958.0, + "step": 12331, + "train/ce_loss": 9.629942496758304e-07 + }, + { + "epoch": 1.2192011073759146, + "step": 12331, + "train/sim_loss": 0.04131597280502319 + }, + { + "epoch": 1.2192011073759146, + "step": 12331, + "train/total_loss": 0.04131606966257095 + }, + { + "entropy": 9.557458877563477, + "epoch": 1.21929998022543, + "mean_token_accuracy": 0.8197734355926514, + "num_tokens": 4031104.0, + "step": 12332, + "train/ce_loss": 0.7599210739135742 + }, + { + "epoch": 1.21929998022543, + "step": 12332, + "train/sim_loss": 0.05150175094604492 + }, + { + "epoch": 1.21929998022543, + "step": 12332, + "train/total_loss": 0.12749385833740234 + }, + { + "entropy": 9.70240306854248, + "epoch": 1.2193988530749456, + "mean_token_accuracy": 0.8521008491516113, + "num_tokens": 4047151.0, + "step": 12333, + "train/ce_loss": 0.9153469800949097 + }, + { + "epoch": 1.2193988530749456, + "step": 12333, + "train/sim_loss": 0.045163869857788086 + }, + { + "epoch": 1.2193988530749456, + "step": 12333, + "train/total_loss": 0.13669857382774353 + }, + { + "entropy": 9.47489070892334, + "epoch": 1.219497725924461, + "mean_token_accuracy": 0.8583235144615173, + "num_tokens": 4059756.0, + "step": 12334, + "train/ce_loss": 0.563121497631073 + }, + { + "epoch": 1.219497725924461, + "step": 12334, + "train/sim_loss": 0.03403586149215698 + }, + { + "epoch": 1.219497725924461, + "step": 12334, + "train/total_loss": 0.0903480127453804 + }, + { + "entropy": 9.62340259552002, + "epoch": 1.2195965987739767, + "mean_token_accuracy": 0.81210857629776, + "num_tokens": 4074755.0, + "step": 12335, + "train/ce_loss": 0.7077773213386536 + }, + { + "epoch": 1.2195965987739767, + "step": 12335, + "train/sim_loss": 0.04929083585739136 + }, + { + "epoch": 1.2195965987739767, + "step": 12335, + "train/total_loss": 0.12006857246160507 + }, + { + "entropy": 9.59853744506836, + "epoch": 1.2196954716234922, + "mean_token_accuracy": 0.8969520926475525, + "num_tokens": 4087306.0, + "step": 12336, + "train/ce_loss": 0.7181105017662048 + }, + { + "epoch": 1.2196954716234922, + "step": 12336, + "train/sim_loss": 0.058077335357666016 + }, + { + "epoch": 1.2196954716234922, + "step": 12336, + "train/total_loss": 0.1298883855342865 + }, + { + "entropy": 9.513152122497559, + "epoch": 1.2197943444730077, + "mean_token_accuracy": 0.8516057729721069, + "num_tokens": 4096988.0, + "step": 12337, + "train/ce_loss": 0.3586418330669403 + }, + { + "epoch": 1.2197943444730077, + "step": 12337, + "train/sim_loss": 0.06388974189758301 + }, + { + "epoch": 1.2197943444730077, + "step": 12337, + "train/total_loss": 0.09975393116474152 + }, + { + "entropy": 9.515629768371582, + "epoch": 1.2198932173225232, + "mean_token_accuracy": 0.9105960130691528, + "num_tokens": 4104368.0, + "step": 12338, + "train/ce_loss": 0.3413555324077606 + }, + { + "epoch": 1.2198932173225232, + "step": 12338, + "train/sim_loss": 0.04742586612701416 + }, + { + "epoch": 1.2198932173225232, + "step": 12338, + "train/total_loss": 0.08156141638755798 + }, + { + "entropy": 9.409269332885742, + "epoch": 1.2199920901720387, + "mean_token_accuracy": 0.85429447889328, + "num_tokens": 4120615.0, + "step": 12339, + "train/ce_loss": 4.417277068569092e-07 + }, + { + "epoch": 1.2199920901720387, + "step": 12339, + "train/sim_loss": 0.0581478476524353 + }, + { + "epoch": 1.2199920901720387, + "step": 12339, + "train/total_loss": 0.058147892355918884 + }, + { + "epoch": 1.2200909630215544, + "grad_norm": 0.640548050403595, + "learning_rate": 6.95173811996242e-06, + "loss": 0.0855, + "step": 12340 + }, + { + "entropy": 9.906530380249023, + "epoch": 1.2200909630215544, + "mean_token_accuracy": 0.8279703259468079, + "num_tokens": 4135645.0, + "step": 12340, + "train/ce_loss": 0.38863763213157654 + }, + { + "epoch": 1.2200909630215544, + "step": 12340, + "train/sim_loss": 0.0400620698928833 + }, + { + "epoch": 1.2200909630215544, + "step": 12340, + "train/total_loss": 0.07892583310604095 + }, + { + "entropy": 9.141192436218262, + "epoch": 1.2201898358710699, + "mean_token_accuracy": 0.8433476686477661, + "num_tokens": 4143936.0, + "step": 12341, + "train/ce_loss": 0.368844211101532 + }, + { + "epoch": 1.2201898358710699, + "step": 12341, + "train/sim_loss": 0.01979696750640869 + }, + { + "epoch": 1.2201898358710699, + "step": 12341, + "train/total_loss": 0.05668139085173607 + }, + { + "entropy": 9.395427703857422, + "epoch": 1.2202887087205854, + "mean_token_accuracy": 0.8431654572486877, + "num_tokens": 4156520.0, + "step": 12342, + "train/ce_loss": 0.5941901206970215 + }, + { + "epoch": 1.2202887087205854, + "step": 12342, + "train/sim_loss": 0.04603922367095947 + }, + { + "epoch": 1.2202887087205854, + "step": 12342, + "train/total_loss": 0.10545823723077774 + }, + { + "entropy": 9.780287742614746, + "epoch": 1.2203875815701009, + "mean_token_accuracy": 0.8151260614395142, + "num_tokens": 4168185.0, + "step": 12343, + "train/ce_loss": 1.32302341171453e-06 + }, + { + "epoch": 1.2203875815701009, + "step": 12343, + "train/sim_loss": 0.03741949796676636 + }, + { + "epoch": 1.2203875815701009, + "step": 12343, + "train/total_loss": 0.0374196320772171 + }, + { + "entropy": 9.730535507202148, + "epoch": 1.2204864544196163, + "mean_token_accuracy": 0.8634259104728699, + "num_tokens": 4181134.0, + "step": 12344, + "train/ce_loss": 7.329047662096855e-07 + }, + { + "epoch": 1.2204864544196163, + "step": 12344, + "train/sim_loss": 0.02247035503387451 + }, + { + "epoch": 1.2204864544196163, + "step": 12344, + "train/total_loss": 0.02247042767703533 + }, + { + "entropy": 9.097251892089844, + "epoch": 1.2205853272691318, + "mean_token_accuracy": 0.8600000143051147, + "num_tokens": 4194819.0, + "step": 12345, + "train/ce_loss": 0.3062162399291992 + }, + { + "epoch": 1.2205853272691318, + "step": 12345, + "train/sim_loss": 0.014584898948669434 + }, + { + "epoch": 1.2205853272691318, + "step": 12345, + "train/total_loss": 0.045206524431705475 + }, + { + "entropy": 8.900970458984375, + "epoch": 1.2206842001186473, + "mean_token_accuracy": 0.8650553822517395, + "num_tokens": 4202498.0, + "step": 12346, + "train/ce_loss": 0.4773501753807068 + }, + { + "epoch": 1.2206842001186473, + "step": 12346, + "train/sim_loss": 0.017909526824951172 + }, + { + "epoch": 1.2206842001186473, + "step": 12346, + "train/total_loss": 0.06564454734325409 + }, + { + "entropy": 9.797491073608398, + "epoch": 1.220783072968163, + "mean_token_accuracy": 0.8335208296775818, + "num_tokens": 4216374.0, + "step": 12347, + "train/ce_loss": 0.413166344165802 + }, + { + "epoch": 1.220783072968163, + "step": 12347, + "train/sim_loss": 0.04388928413391113 + }, + { + "epoch": 1.220783072968163, + "step": 12347, + "train/total_loss": 0.08520592004060745 + }, + { + "entropy": 9.410884857177734, + "epoch": 1.2208819458176785, + "mean_token_accuracy": 0.8807212114334106, + "num_tokens": 4228807.0, + "step": 12348, + "train/ce_loss": 0.40800851583480835 + }, + { + "epoch": 1.2208819458176785, + "step": 12348, + "train/sim_loss": 0.015671908855438232 + }, + { + "epoch": 1.2208819458176785, + "step": 12348, + "train/total_loss": 0.05647275969386101 + }, + { + "entropy": 9.636482238769531, + "epoch": 1.220980818667194, + "mean_token_accuracy": 0.8264604806900024, + "num_tokens": 4237608.0, + "step": 12349, + "train/ce_loss": 0.42452892661094666 + }, + { + "epoch": 1.220980818667194, + "step": 12349, + "train/sim_loss": 0.06715363264083862 + }, + { + "epoch": 1.220980818667194, + "step": 12349, + "train/total_loss": 0.10960652679204941 + }, + { + "entropy": 9.616299629211426, + "epoch": 1.2210796915167095, + "mean_token_accuracy": 0.8145695328712463, + "num_tokens": 4252169.0, + "step": 12350, + "train/ce_loss": 0.5743029117584229 + }, + { + "epoch": 1.2210796915167095, + "step": 12350, + "train/sim_loss": 0.04488629102706909 + }, + { + "epoch": 1.2210796915167095, + "step": 12350, + "train/total_loss": 0.10231658816337585 + }, + { + "entropy": 9.562432289123535, + "epoch": 1.221178564366225, + "mean_token_accuracy": 0.8462603688240051, + "num_tokens": 4263555.0, + "step": 12351, + "train/ce_loss": 0.6762869954109192 + }, + { + "epoch": 1.221178564366225, + "step": 12351, + "train/sim_loss": 0.02525186538696289 + }, + { + "epoch": 1.221178564366225, + "step": 12351, + "train/total_loss": 0.09288056939840317 + }, + { + "entropy": 9.140308380126953, + "epoch": 1.2212774372157407, + "mean_token_accuracy": 0.8250303864479065, + "num_tokens": 4275890.0, + "step": 12352, + "train/ce_loss": 0.6391190886497498 + }, + { + "epoch": 1.2212774372157407, + "step": 12352, + "train/sim_loss": 0.03883129358291626 + }, + { + "epoch": 1.2212774372157407, + "step": 12352, + "train/total_loss": 0.10274320095777512 + }, + { + "entropy": 9.192422866821289, + "epoch": 1.2213763100652562, + "mean_token_accuracy": 0.8087141513824463, + "num_tokens": 4285599.0, + "step": 12353, + "train/ce_loss": 0.5590543746948242 + }, + { + "epoch": 1.2213763100652562, + "step": 12353, + "train/sim_loss": 0.053655028343200684 + }, + { + "epoch": 1.2213763100652562, + "step": 12353, + "train/total_loss": 0.10956046730279922 + }, + { + "entropy": 9.54885482788086, + "epoch": 1.2214751829147716, + "mean_token_accuracy": 0.8761290311813354, + "num_tokens": 4295127.0, + "step": 12354, + "train/ce_loss": 0.5270962119102478 + }, + { + "epoch": 1.2214751829147716, + "step": 12354, + "train/sim_loss": 0.0819389820098877 + }, + { + "epoch": 1.2214751829147716, + "step": 12354, + "train/total_loss": 0.13464860618114471 + }, + { + "entropy": 9.195196151733398, + "epoch": 1.2215740557642871, + "mean_token_accuracy": 0.836662769317627, + "num_tokens": 4309306.0, + "step": 12355, + "train/ce_loss": 0.44527071714401245 + }, + { + "epoch": 1.2215740557642871, + "step": 12355, + "train/sim_loss": 0.07294583320617676 + }, + { + "epoch": 1.2215740557642871, + "step": 12355, + "train/total_loss": 0.11747290194034576 + }, + { + "entropy": 9.093071937561035, + "epoch": 1.2216729286138026, + "mean_token_accuracy": 0.774956226348877, + "num_tokens": 4318825.0, + "step": 12356, + "train/ce_loss": 0.35762447118759155 + }, + { + "epoch": 1.2216729286138026, + "step": 12356, + "train/sim_loss": 0.014440298080444336 + }, + { + "epoch": 1.2216729286138026, + "step": 12356, + "train/total_loss": 0.05020274594426155 + }, + { + "entropy": 9.569374084472656, + "epoch": 1.221771801463318, + "mean_token_accuracy": 0.9116541147232056, + "num_tokens": 4331891.0, + "step": 12357, + "train/ce_loss": 0.7480858564376831 + }, + { + "epoch": 1.221771801463318, + "step": 12357, + "train/sim_loss": 0.03594452142715454 + }, + { + "epoch": 1.221771801463318, + "step": 12357, + "train/total_loss": 0.11075311154127121 + }, + { + "entropy": 9.362945556640625, + "epoch": 1.2218706743128336, + "mean_token_accuracy": 0.8836565017700195, + "num_tokens": 4346040.0, + "step": 12358, + "train/ce_loss": 0.30595526099205017 + }, + { + "epoch": 1.2218706743128336, + "step": 12358, + "train/sim_loss": 0.03167062997817993 + }, + { + "epoch": 1.2218706743128336, + "step": 12358, + "train/total_loss": 0.06226615607738495 + }, + { + "entropy": 9.535125732421875, + "epoch": 1.2219695471623493, + "mean_token_accuracy": 0.913223147392273, + "num_tokens": 4356054.0, + "step": 12359, + "train/ce_loss": 0.6061103940010071 + }, + { + "epoch": 1.2219695471623493, + "step": 12359, + "train/sim_loss": 0.058463335037231445 + }, + { + "epoch": 1.2219695471623493, + "step": 12359, + "train/total_loss": 0.11907437443733215 + }, + { + "epoch": 1.2220684200118648, + "grad_norm": 0.5171182751655579, + "learning_rate": 6.94679325520447e-06, + "loss": 0.0901, + "step": 12360 + }, + { + "entropy": 9.4806547164917, + "epoch": 1.2220684200118648, + "mean_token_accuracy": 0.7947179079055786, + "num_tokens": 4370484.0, + "step": 12360, + "train/ce_loss": 0.5266485214233398 + }, + { + "epoch": 1.2220684200118648, + "step": 12360, + "train/sim_loss": 0.02971804141998291 + }, + { + "epoch": 1.2220684200118648, + "step": 12360, + "train/total_loss": 0.08238289505243301 + }, + { + "entropy": 8.80954360961914, + "epoch": 1.2221672928613803, + "mean_token_accuracy": 0.873150110244751, + "num_tokens": 4381098.0, + "step": 12361, + "train/ce_loss": 0.2623887062072754 + }, + { + "epoch": 1.2221672928613803, + "step": 12361, + "train/sim_loss": 0.017757058143615723 + }, + { + "epoch": 1.2221672928613803, + "step": 12361, + "train/total_loss": 0.0439959317445755 + }, + { + "entropy": 9.33731460571289, + "epoch": 1.2222661657108957, + "mean_token_accuracy": 0.8459421992301941, + "num_tokens": 4389414.0, + "step": 12362, + "train/ce_loss": 0.5098050236701965 + }, + { + "epoch": 1.2222661657108957, + "step": 12362, + "train/sim_loss": 0.06003469228744507 + }, + { + "epoch": 1.2222661657108957, + "step": 12362, + "train/total_loss": 0.1110152006149292 + }, + { + "entropy": 9.509520530700684, + "epoch": 1.2223650385604112, + "mean_token_accuracy": 0.8911392688751221, + "num_tokens": 4403282.0, + "step": 12363, + "train/ce_loss": 0.24246221780776978 + }, + { + "epoch": 1.2223650385604112, + "step": 12363, + "train/sim_loss": 0.03142958879470825 + }, + { + "epoch": 1.2223650385604112, + "step": 12363, + "train/total_loss": 0.05567581206560135 + }, + { + "entropy": 8.983186721801758, + "epoch": 1.222463911409927, + "mean_token_accuracy": 0.8684210777282715, + "num_tokens": 4412565.0, + "step": 12364, + "train/ce_loss": 0.7521303296089172 + }, + { + "epoch": 1.222463911409927, + "step": 12364, + "train/sim_loss": 0.06092226505279541 + }, + { + "epoch": 1.222463911409927, + "step": 12364, + "train/total_loss": 0.1361353099346161 + }, + { + "entropy": 9.361138343811035, + "epoch": 1.2225627842594424, + "mean_token_accuracy": 0.8092909455299377, + "num_tokens": 4429213.0, + "step": 12365, + "train/ce_loss": 0.884473443031311 + }, + { + "epoch": 1.2225627842594424, + "step": 12365, + "train/sim_loss": 0.020297110080718994 + }, + { + "epoch": 1.2225627842594424, + "step": 12365, + "train/total_loss": 0.10874445736408234 + }, + { + "entropy": 9.476067543029785, + "epoch": 1.222661657108958, + "mean_token_accuracy": 0.8112980723381042, + "num_tokens": 4442192.0, + "step": 12366, + "train/ce_loss": 0.4974093735218048 + }, + { + "epoch": 1.222661657108958, + "step": 12366, + "train/sim_loss": 0.03709840774536133 + }, + { + "epoch": 1.222661657108958, + "step": 12366, + "train/total_loss": 0.08683934807777405 + }, + { + "entropy": 9.447927474975586, + "epoch": 1.2227605299584734, + "mean_token_accuracy": 0.8094262480735779, + "num_tokens": 4455426.0, + "step": 12367, + "train/ce_loss": 0.5658778548240662 + }, + { + "epoch": 1.2227605299584734, + "step": 12367, + "train/sim_loss": 0.04463988542556763 + }, + { + "epoch": 1.2227605299584734, + "step": 12367, + "train/total_loss": 0.10122767090797424 + }, + { + "entropy": 9.468875885009766, + "epoch": 1.2228594028079889, + "mean_token_accuracy": 0.8833333253860474, + "num_tokens": 4475994.0, + "step": 12368, + "train/ce_loss": 6.668242349405773e-07 + }, + { + "epoch": 1.2228594028079889, + "step": 12368, + "train/sim_loss": 0.018172025680541992 + }, + { + "epoch": 1.2228594028079889, + "step": 12368, + "train/total_loss": 0.018172092735767365 + }, + { + "entropy": 9.388229370117188, + "epoch": 1.2229582756575044, + "mean_token_accuracy": 0.8399452567100525, + "num_tokens": 4487990.0, + "step": 12369, + "train/ce_loss": 0.8118572235107422 + }, + { + "epoch": 1.2229582756575044, + "step": 12369, + "train/sim_loss": 0.05589640140533447 + }, + { + "epoch": 1.2229582756575044, + "step": 12369, + "train/total_loss": 0.13708212971687317 + }, + { + "entropy": 9.696677207946777, + "epoch": 1.2230571485070199, + "mean_token_accuracy": 0.8084479570388794, + "num_tokens": 4506592.0, + "step": 12370, + "train/ce_loss": 0.4316536486148834 + }, + { + "epoch": 1.2230571485070199, + "step": 12370, + "train/sim_loss": 0.03211575746536255 + }, + { + "epoch": 1.2230571485070199, + "step": 12370, + "train/total_loss": 0.07528112828731537 + }, + { + "entropy": 9.652165412902832, + "epoch": 1.2231560213565356, + "mean_token_accuracy": 0.8236842155456543, + "num_tokens": 4521381.0, + "step": 12371, + "train/ce_loss": 0.48102930188179016 + }, + { + "epoch": 1.2231560213565356, + "step": 12371, + "train/sim_loss": 0.034020066261291504 + }, + { + "epoch": 1.2231560213565356, + "step": 12371, + "train/total_loss": 0.08212299644947052 + }, + { + "entropy": 9.011503219604492, + "epoch": 1.223254894206051, + "mean_token_accuracy": 0.8094144463539124, + "num_tokens": 4530219.0, + "step": 12372, + "train/ce_loss": 0.7125208973884583 + }, + { + "epoch": 1.223254894206051, + "step": 12372, + "train/sim_loss": 0.04491305351257324 + }, + { + "epoch": 1.223254894206051, + "step": 12372, + "train/total_loss": 0.1161651462316513 + }, + { + "entropy": 9.25747299194336, + "epoch": 1.2233537670555665, + "mean_token_accuracy": 0.8801652789115906, + "num_tokens": 4537211.0, + "step": 12373, + "train/ce_loss": 0.6589058041572571 + }, + { + "epoch": 1.2233537670555665, + "step": 12373, + "train/sim_loss": 0.06527441740036011 + }, + { + "epoch": 1.2233537670555665, + "step": 12373, + "train/total_loss": 0.13116499781608582 + }, + { + "entropy": 9.669172286987305, + "epoch": 1.223452639905082, + "mean_token_accuracy": 0.8123287558555603, + "num_tokens": 4550962.0, + "step": 12374, + "train/ce_loss": 0.5225564241409302 + }, + { + "epoch": 1.223452639905082, + "step": 12374, + "train/sim_loss": 0.06595069169998169 + }, + { + "epoch": 1.223452639905082, + "step": 12374, + "train/total_loss": 0.11820633709430695 + }, + { + "entropy": 9.5338773727417, + "epoch": 1.2235515127545975, + "mean_token_accuracy": 0.8474971055984497, + "num_tokens": 4568010.0, + "step": 12375, + "train/ce_loss": 0.7912534475326538 + }, + { + "epoch": 1.2235515127545975, + "step": 12375, + "train/sim_loss": 0.09344804286956787 + }, + { + "epoch": 1.2235515127545975, + "step": 12375, + "train/total_loss": 0.17257338762283325 + }, + { + "entropy": 9.222970962524414, + "epoch": 1.2236503856041132, + "mean_token_accuracy": 0.8763388395309448, + "num_tokens": 4584944.0, + "step": 12376, + "train/ce_loss": 0.3567762076854706 + }, + { + "epoch": 1.2236503856041132, + "step": 12376, + "train/sim_loss": 0.020408034324645996 + }, + { + "epoch": 1.2236503856041132, + "step": 12376, + "train/total_loss": 0.05608565732836723 + }, + { + "entropy": 9.575040817260742, + "epoch": 1.2237492584536287, + "mean_token_accuracy": 0.8571428656578064, + "num_tokens": 4593806.0, + "step": 12377, + "train/ce_loss": 2.1750265943865088e-07 + }, + { + "epoch": 1.2237492584536287, + "step": 12377, + "train/sim_loss": 0.016683459281921387 + }, + { + "epoch": 1.2237492584536287, + "step": 12377, + "train/total_loss": 0.016683481633663177 + }, + { + "entropy": 9.474279403686523, + "epoch": 1.2238481313031442, + "mean_token_accuracy": 0.8167028427124023, + "num_tokens": 4609807.0, + "step": 12378, + "train/ce_loss": 0.21390284597873688 + }, + { + "epoch": 1.2238481313031442, + "step": 12378, + "train/sim_loss": 0.028393983840942383 + }, + { + "epoch": 1.2238481313031442, + "step": 12378, + "train/total_loss": 0.04978426918387413 + }, + { + "entropy": 9.53254508972168, + "epoch": 1.2239470041526597, + "mean_token_accuracy": 0.8180722594261169, + "num_tokens": 4626654.0, + "step": 12379, + "train/ce_loss": 0.6253156661987305 + }, + { + "epoch": 1.2239470041526597, + "step": 12379, + "train/sim_loss": 0.030759453773498535 + }, + { + "epoch": 1.2239470041526597, + "step": 12379, + "train/total_loss": 0.0932910218834877 + }, + { + "epoch": 1.2240458770021752, + "grad_norm": 0.5809435248374939, + "learning_rate": 6.941848390446522e-06, + "loss": 0.0884, + "step": 12380 + }, + { + "entropy": 9.517359733581543, + "epoch": 1.2240458770021752, + "mean_token_accuracy": 0.8548185229301453, + "num_tokens": 4644492.0, + "step": 12380, + "train/ce_loss": 0.6107867360115051 + }, + { + "epoch": 1.2240458770021752, + "step": 12380, + "train/sim_loss": 0.05364727973937988 + }, + { + "epoch": 1.2240458770021752, + "step": 12380, + "train/total_loss": 0.11472595483064651 + }, + { + "entropy": 9.673927307128906, + "epoch": 1.2241447498516906, + "mean_token_accuracy": 0.8971962332725525, + "num_tokens": 4655572.0, + "step": 12381, + "train/ce_loss": 1.9798851553787244e-06 + }, + { + "epoch": 1.2241447498516906, + "step": 12381, + "train/sim_loss": 0.03334391117095947 + }, + { + "epoch": 1.2241447498516906, + "step": 12381, + "train/total_loss": 0.03334410861134529 + }, + { + "entropy": 8.865463256835938, + "epoch": 1.2242436227012061, + "mean_token_accuracy": 0.8544652462005615, + "num_tokens": 4661922.0, + "step": 12382, + "train/ce_loss": 0.48242536187171936 + }, + { + "epoch": 1.2242436227012061, + "step": 12382, + "train/sim_loss": 0.012728869915008545 + }, + { + "epoch": 1.2242436227012061, + "step": 12382, + "train/total_loss": 0.06097140535712242 + }, + { + "entropy": 9.32944393157959, + "epoch": 1.2243424955507218, + "mean_token_accuracy": 0.8180645108222961, + "num_tokens": 4670531.0, + "step": 12383, + "train/ce_loss": 0.4107396602630615 + }, + { + "epoch": 1.2243424955507218, + "step": 12383, + "train/sim_loss": 0.053674399852752686 + }, + { + "epoch": 1.2243424955507218, + "step": 12383, + "train/total_loss": 0.0947483628988266 + }, + { + "entropy": 9.731674194335938, + "epoch": 1.2244413684002373, + "mean_token_accuracy": 0.9801980257034302, + "num_tokens": 4685228.0, + "step": 12384, + "train/ce_loss": 2.539891056585475e-06 + }, + { + "epoch": 1.2244413684002373, + "step": 12384, + "train/sim_loss": 0.029163122177124023 + }, + { + "epoch": 1.2244413684002373, + "step": 12384, + "train/total_loss": 0.02916337549686432 + }, + { + "entropy": 9.225257873535156, + "epoch": 1.2245402412497528, + "mean_token_accuracy": 0.8507890701293945, + "num_tokens": 4697542.0, + "step": 12385, + "train/ce_loss": 0.21270178258419037 + }, + { + "epoch": 1.2245402412497528, + "step": 12385, + "train/sim_loss": 0.027409791946411133 + }, + { + "epoch": 1.2245402412497528, + "step": 12385, + "train/total_loss": 0.04867997020483017 + }, + { + "entropy": 8.879048347473145, + "epoch": 1.2246391140992683, + "mean_token_accuracy": 0.8844085931777954, + "num_tokens": 4704985.0, + "step": 12386, + "train/ce_loss": 0.45468753576278687 + }, + { + "epoch": 1.2246391140992683, + "step": 12386, + "train/sim_loss": 0.011609494686126709 + }, + { + "epoch": 1.2246391140992683, + "step": 12386, + "train/total_loss": 0.057078249752521515 + }, + { + "entropy": 8.890928268432617, + "epoch": 1.2247379869487838, + "mean_token_accuracy": 0.8380024433135986, + "num_tokens": 4713554.0, + "step": 12387, + "train/ce_loss": 0.7089390158653259 + }, + { + "epoch": 1.2247379869487838, + "step": 12387, + "train/sim_loss": 0.0219915509223938 + }, + { + "epoch": 1.2247379869487838, + "step": 12387, + "train/total_loss": 0.09288545697927475 + }, + { + "entropy": 9.730527877807617, + "epoch": 1.2248368597982995, + "mean_token_accuracy": 0.84375, + "num_tokens": 4727974.0, + "step": 12388, + "train/ce_loss": 0.4192206859588623 + }, + { + "epoch": 1.2248368597982995, + "step": 12388, + "train/sim_loss": 0.03598308563232422 + }, + { + "epoch": 1.2248368597982995, + "step": 12388, + "train/total_loss": 0.07790515571832657 + }, + { + "entropy": 9.12160587310791, + "epoch": 1.224935732647815, + "mean_token_accuracy": 0.8302386999130249, + "num_tokens": 4734601.0, + "step": 12389, + "train/ce_loss": 0.39863651990890503 + }, + { + "epoch": 1.224935732647815, + "step": 12389, + "train/sim_loss": 0.01768583059310913 + }, + { + "epoch": 1.224935732647815, + "step": 12389, + "train/total_loss": 0.05754948407411575 + }, + { + "entropy": 9.285722732543945, + "epoch": 1.2250346054973305, + "mean_token_accuracy": 0.8978328108787537, + "num_tokens": 4745897.0, + "step": 12390, + "train/ce_loss": 0.22871120274066925 + }, + { + "epoch": 1.2250346054973305, + "step": 12390, + "train/sim_loss": 0.09077060222625732 + }, + { + "epoch": 1.2250346054973305, + "step": 12390, + "train/total_loss": 0.11364172399044037 + }, + { + "entropy": 9.513272285461426, + "epoch": 1.225133478346846, + "mean_token_accuracy": 0.8449931144714355, + "num_tokens": 4756657.0, + "step": 12391, + "train/ce_loss": 0.4087177813053131 + }, + { + "epoch": 1.225133478346846, + "step": 12391, + "train/sim_loss": 0.04097718000411987 + }, + { + "epoch": 1.225133478346846, + "step": 12391, + "train/total_loss": 0.08184896409511566 + }, + { + "entropy": 9.516277313232422, + "epoch": 1.2252323511963614, + "mean_token_accuracy": 0.8475936055183411, + "num_tokens": 4773042.0, + "step": 12392, + "train/ce_loss": 0.6994681358337402 + }, + { + "epoch": 1.2252323511963614, + "step": 12392, + "train/sim_loss": 0.06619501113891602 + }, + { + "epoch": 1.2252323511963614, + "step": 12392, + "train/total_loss": 0.136141836643219 + }, + { + "entropy": 9.101799011230469, + "epoch": 1.225331224045877, + "mean_token_accuracy": 0.8070588111877441, + "num_tokens": 4782504.0, + "step": 12393, + "train/ce_loss": 0.6721773147583008 + }, + { + "epoch": 1.225331224045877, + "step": 12393, + "train/sim_loss": 0.03840458393096924 + }, + { + "epoch": 1.225331224045877, + "step": 12393, + "train/total_loss": 0.1056223139166832 + }, + { + "entropy": 9.377408981323242, + "epoch": 1.2254300968953926, + "mean_token_accuracy": 0.8682719469070435, + "num_tokens": 4792203.0, + "step": 12394, + "train/ce_loss": 0.31414225697517395 + }, + { + "epoch": 1.2254300968953926, + "step": 12394, + "train/sim_loss": 0.023420274257659912 + }, + { + "epoch": 1.2254300968953926, + "step": 12394, + "train/total_loss": 0.05483449995517731 + }, + { + "entropy": 9.342248916625977, + "epoch": 1.225528969744908, + "mean_token_accuracy": 0.8178001642227173, + "num_tokens": 4803778.0, + "step": 12395, + "train/ce_loss": 0.6206685304641724 + }, + { + "epoch": 1.225528969744908, + "step": 12395, + "train/sim_loss": 0.10102343559265137 + }, + { + "epoch": 1.225528969744908, + "step": 12395, + "train/total_loss": 0.1630902886390686 + }, + { + "entropy": 9.219236373901367, + "epoch": 1.2256278425944236, + "mean_token_accuracy": 0.8836601376533508, + "num_tokens": 4816799.0, + "step": 12396, + "train/ce_loss": 0.3841201364994049 + }, + { + "epoch": 1.2256278425944236, + "step": 12396, + "train/sim_loss": 0.023147881031036377 + }, + { + "epoch": 1.2256278425944236, + "step": 12396, + "train/total_loss": 0.06155989691615105 + }, + { + "entropy": 9.599056243896484, + "epoch": 1.225726715443939, + "mean_token_accuracy": 0.7945736646652222, + "num_tokens": 4833816.0, + "step": 12397, + "train/ce_loss": 0.5748825669288635 + }, + { + "epoch": 1.225726715443939, + "step": 12397, + "train/sim_loss": 0.015457510948181152 + }, + { + "epoch": 1.225726715443939, + "step": 12397, + "train/total_loss": 0.07294577360153198 + }, + { + "entropy": 9.547582626342773, + "epoch": 1.2258255882934546, + "mean_token_accuracy": 0.8844221234321594, + "num_tokens": 4844244.0, + "step": 12398, + "train/ce_loss": 2.3661911541239533e-07 + }, + { + "epoch": 1.2258255882934546, + "step": 12398, + "train/sim_loss": 0.016715049743652344 + }, + { + "epoch": 1.2258255882934546, + "step": 12398, + "train/total_loss": 0.016715073958039284 + }, + { + "entropy": 9.729368209838867, + "epoch": 1.22592446114297, + "mean_token_accuracy": 0.8758465051651001, + "num_tokens": 4863989.0, + "step": 12399, + "train/ce_loss": 6.727252070959366e-07 + }, + { + "epoch": 1.22592446114297, + "step": 12399, + "train/sim_loss": 0.013350605964660645 + }, + { + "epoch": 1.22592446114297, + "step": 12399, + "train/total_loss": 0.013350673019886017 + }, + { + "epoch": 1.2260233339924858, + "grad_norm": 0.744208574295044, + "learning_rate": 6.936903525688573e-06, + "loss": 0.0826, + "step": 12400 + }, + { + "entropy": 9.208099365234375, + "epoch": 1.2260233339924858, + "mean_token_accuracy": 0.8347205519676208, + "num_tokens": 4875166.0, + "step": 12400, + "train/ce_loss": 0.8244107961654663 + }, + { + "epoch": 1.2260233339924858, + "step": 12400, + "train/sim_loss": 0.03491705656051636 + }, + { + "epoch": 1.2260233339924858, + "step": 12400, + "train/total_loss": 0.11735814064741135 + }, + { + "entropy": 9.390965461730957, + "epoch": 1.2261222068420012, + "mean_token_accuracy": 0.889502763748169, + "num_tokens": 4888086.0, + "step": 12401, + "train/ce_loss": 0.4298509955406189 + }, + { + "epoch": 1.2261222068420012, + "step": 12401, + "train/sim_loss": 0.03531622886657715 + }, + { + "epoch": 1.2261222068420012, + "step": 12401, + "train/total_loss": 0.0783013254404068 + }, + { + "entropy": 9.706430435180664, + "epoch": 1.2262210796915167, + "mean_token_accuracy": 0.8614457845687866, + "num_tokens": 4896852.0, + "step": 12402, + "train/ce_loss": 0.9077231884002686 + }, + { + "epoch": 1.2262210796915167, + "step": 12402, + "train/sim_loss": 0.04546856880187988 + }, + { + "epoch": 1.2262210796915167, + "step": 12402, + "train/total_loss": 0.1362408995628357 + }, + { + "entropy": 9.516220092773438, + "epoch": 1.2263199525410322, + "mean_token_accuracy": 0.8599397540092468, + "num_tokens": 4909127.0, + "step": 12403, + "train/ce_loss": 0.49620088934898376 + }, + { + "epoch": 1.2263199525410322, + "step": 12403, + "train/sim_loss": 0.06034022569656372 + }, + { + "epoch": 1.2263199525410322, + "step": 12403, + "train/total_loss": 0.10996031761169434 + }, + { + "entropy": 9.519196510314941, + "epoch": 1.2264188253905477, + "mean_token_accuracy": 0.8771626353263855, + "num_tokens": 4921529.0, + "step": 12404, + "train/ce_loss": 0.23944006860256195 + }, + { + "epoch": 1.2264188253905477, + "step": 12404, + "train/sim_loss": 0.03766143321990967 + }, + { + "epoch": 1.2264188253905477, + "step": 12404, + "train/total_loss": 0.061605438590049744 + }, + { + "entropy": 9.215254783630371, + "epoch": 1.2265176982400634, + "mean_token_accuracy": 0.8462516069412231, + "num_tokens": 4934872.0, + "step": 12405, + "train/ce_loss": 0.23261551558971405 + }, + { + "epoch": 1.2265176982400634, + "step": 12405, + "train/sim_loss": 0.045020878314971924 + }, + { + "epoch": 1.2265176982400634, + "step": 12405, + "train/total_loss": 0.06828243285417557 + }, + { + "entropy": 9.547462463378906, + "epoch": 1.2266165710895789, + "mean_token_accuracy": 0.8205574750900269, + "num_tokens": 4949133.0, + "step": 12406, + "train/ce_loss": 0.9159778952598572 + }, + { + "epoch": 1.2266165710895789, + "step": 12406, + "train/sim_loss": 0.04731863737106323 + }, + { + "epoch": 1.2266165710895789, + "step": 12406, + "train/total_loss": 0.13891643285751343 + }, + { + "entropy": 9.48952865600586, + "epoch": 1.2267154439390944, + "mean_token_accuracy": 0.9135338068008423, + "num_tokens": 4955469.0, + "step": 12407, + "train/ce_loss": 0.21772153675556183 + }, + { + "epoch": 1.2267154439390944, + "step": 12407, + "train/sim_loss": 0.02523171901702881 + }, + { + "epoch": 1.2267154439390944, + "step": 12407, + "train/total_loss": 0.04700387269258499 + }, + { + "entropy": 9.166736602783203, + "epoch": 1.2268143167886099, + "mean_token_accuracy": 0.8178913593292236, + "num_tokens": 4969705.0, + "step": 12408, + "train/ce_loss": 0.5579422116279602 + }, + { + "epoch": 1.2268143167886099, + "step": 12408, + "train/sim_loss": 0.07109487056732178 + }, + { + "epoch": 1.2268143167886099, + "step": 12408, + "train/total_loss": 0.12688909471035004 + }, + { + "entropy": 9.629936218261719, + "epoch": 1.2269131896381253, + "mean_token_accuracy": 0.8745519518852234, + "num_tokens": 4981312.0, + "step": 12409, + "train/ce_loss": 0.5755468010902405 + }, + { + "epoch": 1.2269131896381253, + "step": 12409, + "train/sim_loss": 0.028397083282470703 + }, + { + "epoch": 1.2269131896381253, + "step": 12409, + "train/total_loss": 0.08595176041126251 + }, + { + "entropy": 9.211981773376465, + "epoch": 1.2270120624876408, + "mean_token_accuracy": 0.8352112770080566, + "num_tokens": 4997914.0, + "step": 12410, + "train/ce_loss": 0.5182022452354431 + }, + { + "epoch": 1.2270120624876408, + "step": 12410, + "train/sim_loss": 0.025291025638580322 + }, + { + "epoch": 1.2270120624876408, + "step": 12410, + "train/total_loss": 0.07711125165224075 + }, + { + "entropy": 9.325172424316406, + "epoch": 1.2271109353371563, + "mean_token_accuracy": 0.853157103061676, + "num_tokens": 5006862.0, + "step": 12411, + "train/ce_loss": 0.23209604620933533 + }, + { + "epoch": 1.2271109353371563, + "step": 12411, + "train/sim_loss": 0.028362274169921875 + }, + { + "epoch": 1.2271109353371563, + "step": 12411, + "train/total_loss": 0.05157187953591347 + }, + { + "entropy": 9.191216468811035, + "epoch": 1.227209808186672, + "mean_token_accuracy": 0.8590381145477295, + "num_tokens": 5014920.0, + "step": 12412, + "train/ce_loss": 1.1505085240059998e-06 + }, + { + "epoch": 1.227209808186672, + "step": 12412, + "train/sim_loss": 0.04185539484024048 + }, + { + "epoch": 1.227209808186672, + "step": 12412, + "train/total_loss": 0.04185551032423973 + }, + { + "entropy": 9.782812118530273, + "epoch": 1.2273086810361875, + "mean_token_accuracy": 0.7754880785942078, + "num_tokens": 5029792.0, + "step": 12413, + "train/ce_loss": 0.7442930340766907 + }, + { + "epoch": 1.2273086810361875, + "step": 12413, + "train/sim_loss": 0.019739210605621338 + }, + { + "epoch": 1.2273086810361875, + "step": 12413, + "train/total_loss": 0.0941685140132904 + }, + { + "entropy": 9.745296478271484, + "epoch": 1.227407553885703, + "mean_token_accuracy": 0.876800000667572, + "num_tokens": 5041655.0, + "step": 12414, + "train/ce_loss": 0.2919980585575104 + }, + { + "epoch": 1.227407553885703, + "step": 12414, + "train/sim_loss": 0.013218998908996582 + }, + { + "epoch": 1.227407553885703, + "step": 12414, + "train/total_loss": 0.04241880774497986 + }, + { + "entropy": 9.896008491516113, + "epoch": 1.2275064267352185, + "mean_token_accuracy": 0.8580508232116699, + "num_tokens": 5050730.0, + "step": 12415, + "train/ce_loss": 1.118806004524231 + }, + { + "epoch": 1.2275064267352185, + "step": 12415, + "train/sim_loss": 0.08187556266784668 + }, + { + "epoch": 1.2275064267352185, + "step": 12415, + "train/total_loss": 0.19375616312026978 + }, + { + "entropy": 9.753684997558594, + "epoch": 1.227605299584734, + "mean_token_accuracy": 0.9139072895050049, + "num_tokens": 5064053.0, + "step": 12416, + "train/ce_loss": 9.315593842984526e-07 + }, + { + "epoch": 1.227605299584734, + "step": 12416, + "train/sim_loss": 0.054150521755218506 + }, + { + "epoch": 1.227605299584734, + "step": 12416, + "train/total_loss": 0.05415061488747597 + }, + { + "entropy": 9.705270767211914, + "epoch": 1.2277041724342497, + "mean_token_accuracy": 0.9247058629989624, + "num_tokens": 5073542.0, + "step": 12417, + "train/ce_loss": 0.7012736201286316 + }, + { + "epoch": 1.2277041724342497, + "step": 12417, + "train/sim_loss": 0.040013134479522705 + }, + { + "epoch": 1.2277041724342497, + "step": 12417, + "train/total_loss": 0.11014049500226974 + }, + { + "entropy": 9.542625427246094, + "epoch": 1.2278030452837652, + "mean_token_accuracy": 0.8251366019248962, + "num_tokens": 5090805.0, + "step": 12418, + "train/ce_loss": 0.6970458030700684 + }, + { + "epoch": 1.2278030452837652, + "step": 12418, + "train/sim_loss": 0.02558720111846924 + }, + { + "epoch": 1.2278030452837652, + "step": 12418, + "train/total_loss": 0.09529178589582443 + }, + { + "entropy": 9.123435974121094, + "epoch": 1.2279019181332806, + "mean_token_accuracy": 0.8058252334594727, + "num_tokens": 5099902.0, + "step": 12419, + "train/ce_loss": 0.4122408330440521 + }, + { + "epoch": 1.2279019181332806, + "step": 12419, + "train/sim_loss": 0.027768433094024658 + }, + { + "epoch": 1.2279019181332806, + "step": 12419, + "train/total_loss": 0.06899251788854599 + }, + { + "epoch": 1.2280007909827961, + "grad_norm": 0.589252769947052, + "learning_rate": 6.931958660930624e-06, + "loss": 0.0885, + "step": 12420 + }, + { + "entropy": 10.022637367248535, + "epoch": 1.2280007909827961, + "mean_token_accuracy": 0.8295254707336426, + "num_tokens": 5113687.0, + "step": 12420, + "train/ce_loss": 1.5862498230490019e-06 + }, + { + "epoch": 1.2280007909827961, + "step": 12420, + "train/sim_loss": 0.028637290000915527 + }, + { + "epoch": 1.2280007909827961, + "step": 12420, + "train/total_loss": 0.028637448325753212 + }, + { + "entropy": 8.89145278930664, + "epoch": 1.2280996638323116, + "mean_token_accuracy": 0.8545454740524292, + "num_tokens": 5121906.0, + "step": 12421, + "train/ce_loss": 0.4930512011051178 + }, + { + "epoch": 1.2280996638323116, + "step": 12421, + "train/sim_loss": 0.011698484420776367 + }, + { + "epoch": 1.2280996638323116, + "step": 12421, + "train/total_loss": 0.061003606766462326 + }, + { + "entropy": 9.3292236328125, + "epoch": 1.228198536681827, + "mean_token_accuracy": 0.8485772609710693, + "num_tokens": 5131019.0, + "step": 12422, + "train/ce_loss": 0.5123193860054016 + }, + { + "epoch": 1.228198536681827, + "step": 12422, + "train/sim_loss": 0.09281051158905029 + }, + { + "epoch": 1.228198536681827, + "step": 12422, + "train/total_loss": 0.14404244720935822 + }, + { + "entropy": 9.462812423706055, + "epoch": 1.2282974095313426, + "mean_token_accuracy": 0.838472843170166, + "num_tokens": 5140651.0, + "step": 12423, + "train/ce_loss": 2.114592803081905e-07 + }, + { + "epoch": 1.2282974095313426, + "step": 12423, + "train/sim_loss": 0.015249967575073242 + }, + { + "epoch": 1.2282974095313426, + "step": 12423, + "train/total_loss": 0.015249988995492458 + }, + { + "entropy": 9.648966789245605, + "epoch": 1.2283962823808583, + "mean_token_accuracy": 0.9206762313842773, + "num_tokens": 5157402.0, + "step": 12424, + "train/ce_loss": 0.14071950316429138 + }, + { + "epoch": 1.2283962823808583, + "step": 12424, + "train/sim_loss": 0.06274908781051636 + }, + { + "epoch": 1.2283962823808583, + "step": 12424, + "train/total_loss": 0.07682103663682938 + }, + { + "entropy": 9.087193489074707, + "epoch": 1.2284951552303738, + "mean_token_accuracy": 0.8405228853225708, + "num_tokens": 5174064.0, + "step": 12425, + "train/ce_loss": 0.9491622447967529 + }, + { + "epoch": 1.2284951552303738, + "step": 12425, + "train/sim_loss": 0.04753375053405762 + }, + { + "epoch": 1.2284951552303738, + "step": 12425, + "train/total_loss": 0.1424499750137329 + }, + { + "entropy": 9.162986755371094, + "epoch": 1.2285940280798893, + "mean_token_accuracy": 0.8419666290283203, + "num_tokens": 5189724.0, + "step": 12426, + "train/ce_loss": 0.5192875266075134 + }, + { + "epoch": 1.2285940280798893, + "step": 12426, + "train/sim_loss": 0.07237029075622559 + }, + { + "epoch": 1.2285940280798893, + "step": 12426, + "train/total_loss": 0.1242990493774414 + }, + { + "entropy": 9.384571075439453, + "epoch": 1.2286929009294048, + "mean_token_accuracy": 0.8397849202156067, + "num_tokens": 5200160.0, + "step": 12427, + "train/ce_loss": 0.709438145160675 + }, + { + "epoch": 1.2286929009294048, + "step": 12427, + "train/sim_loss": 0.04928278923034668 + }, + { + "epoch": 1.2286929009294048, + "step": 12427, + "train/total_loss": 0.12022660672664642 + }, + { + "entropy": 9.610116958618164, + "epoch": 1.2287917737789202, + "mean_token_accuracy": 0.8641975522041321, + "num_tokens": 5216196.0, + "step": 12428, + "train/ce_loss": 0.5381380915641785 + }, + { + "epoch": 1.2287917737789202, + "step": 12428, + "train/sim_loss": 0.06663978099822998 + }, + { + "epoch": 1.2287917737789202, + "step": 12428, + "train/total_loss": 0.1204535961151123 + }, + { + "entropy": 9.137977600097656, + "epoch": 1.228890646628436, + "mean_token_accuracy": 0.8694332242012024, + "num_tokens": 5233253.0, + "step": 12429, + "train/ce_loss": 0.42883843183517456 + }, + { + "epoch": 1.228890646628436, + "step": 12429, + "train/sim_loss": 0.02306145429611206 + }, + { + "epoch": 1.228890646628436, + "step": 12429, + "train/total_loss": 0.06594529747962952 + }, + { + "entropy": 9.160816192626953, + "epoch": 1.2289895194779514, + "mean_token_accuracy": 0.8139013648033142, + "num_tokens": 5245880.0, + "step": 12430, + "train/ce_loss": 0.7171189785003662 + }, + { + "epoch": 1.2289895194779514, + "step": 12430, + "train/sim_loss": 0.009831547737121582 + }, + { + "epoch": 1.2289895194779514, + "step": 12430, + "train/total_loss": 0.0815434455871582 + }, + { + "entropy": 9.983589172363281, + "epoch": 1.229088392327467, + "mean_token_accuracy": 0.8783783912658691, + "num_tokens": 5259360.0, + "step": 12431, + "train/ce_loss": 0.3174327313899994 + }, + { + "epoch": 1.229088392327467, + "step": 12431, + "train/sim_loss": 0.07921034097671509 + }, + { + "epoch": 1.229088392327467, + "step": 12431, + "train/total_loss": 0.11095361411571503 + }, + { + "entropy": 9.169278144836426, + "epoch": 1.2291872651769824, + "mean_token_accuracy": 0.8207316994667053, + "num_tokens": 5267773.0, + "step": 12432, + "train/ce_loss": 0.5206990838050842 + }, + { + "epoch": 1.2291872651769824, + "step": 12432, + "train/sim_loss": 0.05264401435852051 + }, + { + "epoch": 1.2291872651769824, + "step": 12432, + "train/total_loss": 0.10471392422914505 + }, + { + "entropy": 9.600081443786621, + "epoch": 1.2292861380264979, + "mean_token_accuracy": 0.8444444537162781, + "num_tokens": 5279828.0, + "step": 12433, + "train/ce_loss": 0.26086845993995667 + }, + { + "epoch": 1.2292861380264979, + "step": 12433, + "train/sim_loss": 0.048281192779541016 + }, + { + "epoch": 1.2292861380264979, + "step": 12433, + "train/total_loss": 0.07436803728342056 + }, + { + "entropy": 9.299436569213867, + "epoch": 1.2293850108760134, + "mean_token_accuracy": 0.8666666746139526, + "num_tokens": 5294141.0, + "step": 12434, + "train/ce_loss": 0.19532343745231628 + }, + { + "epoch": 1.2293850108760134, + "step": 12434, + "train/sim_loss": 0.014566957950592041 + }, + { + "epoch": 1.2293850108760134, + "step": 12434, + "train/total_loss": 0.03409930318593979 + }, + { + "entropy": 9.37980842590332, + "epoch": 1.2294838837255289, + "mean_token_accuracy": 0.8727272748947144, + "num_tokens": 5299911.0, + "step": 12435, + "train/ce_loss": 0.5873945951461792 + }, + { + "epoch": 1.2294838837255289, + "step": 12435, + "train/sim_loss": 0.034600913524627686 + }, + { + "epoch": 1.2294838837255289, + "step": 12435, + "train/total_loss": 0.09334037452936172 + }, + { + "entropy": 9.55136489868164, + "epoch": 1.2295827565750446, + "mean_token_accuracy": 0.790723979473114, + "num_tokens": 5312137.0, + "step": 12436, + "train/ce_loss": 0.5139425992965698 + }, + { + "epoch": 1.2295827565750446, + "step": 12436, + "train/sim_loss": 0.027994513511657715 + }, + { + "epoch": 1.2295827565750446, + "step": 12436, + "train/total_loss": 0.07938877493143082 + }, + { + "entropy": 9.52511215209961, + "epoch": 1.22968162942456, + "mean_token_accuracy": 0.8498659729957581, + "num_tokens": 5328248.0, + "step": 12437, + "train/ce_loss": 0.4110487997531891 + }, + { + "epoch": 1.22968162942456, + "step": 12437, + "train/sim_loss": 0.015843451023101807 + }, + { + "epoch": 1.22968162942456, + "step": 12437, + "train/total_loss": 0.056948330253362656 + }, + { + "entropy": 9.450922012329102, + "epoch": 1.2297805022740755, + "mean_token_accuracy": 0.8630136847496033, + "num_tokens": 5341216.0, + "step": 12438, + "train/ce_loss": 0.1601411998271942 + }, + { + "epoch": 1.2297805022740755, + "step": 12438, + "train/sim_loss": 0.019965529441833496 + }, + { + "epoch": 1.2297805022740755, + "step": 12438, + "train/total_loss": 0.03597965091466904 + }, + { + "entropy": 9.75070858001709, + "epoch": 1.229879375123591, + "mean_token_accuracy": 0.8550295829772949, + "num_tokens": 5354554.0, + "step": 12439, + "train/ce_loss": 1.537574803478492e-06 + }, + { + "epoch": 1.229879375123591, + "step": 12439, + "train/sim_loss": 0.026693344116210938 + }, + { + "epoch": 1.229879375123591, + "step": 12439, + "train/total_loss": 0.026693498715758324 + }, + { + "epoch": 1.2299782479731065, + "grad_norm": 0.5268516540527344, + "learning_rate": 6.927013796172676e-06, + "loss": 0.0866, + "step": 12440 + }, + { + "entropy": 9.465740203857422, + "epoch": 1.2299782479731065, + "mean_token_accuracy": 0.8464052081108093, + "num_tokens": 5367984.0, + "step": 12440, + "train/ce_loss": 0.44715672731399536 + }, + { + "epoch": 1.2299782479731065, + "step": 12440, + "train/sim_loss": 0.025038957595825195 + }, + { + "epoch": 1.2299782479731065, + "step": 12440, + "train/total_loss": 0.06975463032722473 + }, + { + "entropy": 9.206547737121582, + "epoch": 1.2300771208226222, + "mean_token_accuracy": 0.8252212405204773, + "num_tokens": 5377752.0, + "step": 12441, + "train/ce_loss": 0.9432539343833923 + }, + { + "epoch": 1.2300771208226222, + "step": 12441, + "train/sim_loss": 0.05017435550689697 + }, + { + "epoch": 1.2300771208226222, + "step": 12441, + "train/total_loss": 0.1444997489452362 + }, + { + "entropy": 9.76183795928955, + "epoch": 1.2301759936721377, + "mean_token_accuracy": 0.8775168061256409, + "num_tokens": 5396597.0, + "step": 12442, + "train/ce_loss": 0.7493266463279724 + }, + { + "epoch": 1.2301759936721377, + "step": 12442, + "train/sim_loss": 0.08016061782836914 + }, + { + "epoch": 1.2301759936721377, + "step": 12442, + "train/total_loss": 0.15509328246116638 + }, + { + "entropy": 9.420984268188477, + "epoch": 1.2302748665216532, + "mean_token_accuracy": 0.8983333110809326, + "num_tokens": 5402448.0, + "step": 12443, + "train/ce_loss": 0.29821160435676575 + }, + { + "epoch": 1.2302748665216532, + "step": 12443, + "train/sim_loss": 0.03963500261306763 + }, + { + "epoch": 1.2302748665216532, + "step": 12443, + "train/total_loss": 0.06945616006851196 + }, + { + "entropy": 9.610349655151367, + "epoch": 1.2303737393711687, + "mean_token_accuracy": 0.8164196014404297, + "num_tokens": 5416374.0, + "step": 12444, + "train/ce_loss": 0.7041547298431396 + }, + { + "epoch": 1.2303737393711687, + "step": 12444, + "train/sim_loss": 0.029363393783569336 + }, + { + "epoch": 1.2303737393711687, + "step": 12444, + "train/total_loss": 0.09977886825799942 + }, + { + "entropy": 9.37643814086914, + "epoch": 1.2304726122206842, + "mean_token_accuracy": 0.8650927543640137, + "num_tokens": 5429210.0, + "step": 12445, + "train/ce_loss": 0.44344717264175415 + }, + { + "epoch": 1.2304726122206842, + "step": 12445, + "train/sim_loss": 0.038101255893707275 + }, + { + "epoch": 1.2304726122206842, + "step": 12445, + "train/total_loss": 0.08244597911834717 + }, + { + "entropy": 9.308713912963867, + "epoch": 1.2305714850701996, + "mean_token_accuracy": 0.8024356961250305, + "num_tokens": 5436881.0, + "step": 12446, + "train/ce_loss": 0.9937395453453064 + }, + { + "epoch": 1.2305714850701996, + "step": 12446, + "train/sim_loss": 0.09052455425262451 + }, + { + "epoch": 1.2305714850701996, + "step": 12446, + "train/total_loss": 0.1898985207080841 + }, + { + "entropy": 9.612218856811523, + "epoch": 1.2306703579197151, + "mean_token_accuracy": 0.8211920261383057, + "num_tokens": 5449379.0, + "step": 12447, + "train/ce_loss": 0.6599594950675964 + }, + { + "epoch": 1.2306703579197151, + "step": 12447, + "train/sim_loss": 0.04008924961090088 + }, + { + "epoch": 1.2306703579197151, + "step": 12447, + "train/total_loss": 0.10608520358800888 + }, + { + "entropy": 9.342053413391113, + "epoch": 1.2307692307692308, + "mean_token_accuracy": 0.8617363572120667, + "num_tokens": 5456506.0, + "step": 12448, + "train/ce_loss": 0.40646037459373474 + }, + { + "epoch": 1.2307692307692308, + "step": 12448, + "train/sim_loss": 0.011219501495361328 + }, + { + "epoch": 1.2307692307692308, + "step": 12448, + "train/total_loss": 0.05186554044485092 + }, + { + "entropy": 9.427370071411133, + "epoch": 1.2308681036187463, + "mean_token_accuracy": 0.8538922071456909, + "num_tokens": 5467035.0, + "step": 12449, + "train/ce_loss": 0.3231382369995117 + }, + { + "epoch": 1.2308681036187463, + "step": 12449, + "train/sim_loss": 0.06397968530654907 + }, + { + "epoch": 1.2308681036187463, + "step": 12449, + "train/total_loss": 0.09629350900650024 + }, + { + "entropy": 9.359611511230469, + "epoch": 1.2309669764682618, + "mean_token_accuracy": 0.8438966870307922, + "num_tokens": 5481945.0, + "step": 12450, + "train/ce_loss": 0.3683290481567383 + }, + { + "epoch": 1.2309669764682618, + "step": 12450, + "train/sim_loss": 0.014558553695678711 + }, + { + "epoch": 1.2309669764682618, + "step": 12450, + "train/total_loss": 0.05139146000146866 + }, + { + "entropy": 9.286359786987305, + "epoch": 1.2310658493177773, + "mean_token_accuracy": 0.800212562084198, + "num_tokens": 5491784.0, + "step": 12451, + "train/ce_loss": 0.7032936215400696 + }, + { + "epoch": 1.2310658493177773, + "step": 12451, + "train/sim_loss": 0.056739211082458496 + }, + { + "epoch": 1.2310658493177773, + "step": 12451, + "train/total_loss": 0.12706857919692993 + }, + { + "entropy": 9.792630195617676, + "epoch": 1.2311647221672928, + "mean_token_accuracy": 0.8547903895378113, + "num_tokens": 5500820.0, + "step": 12452, + "train/ce_loss": 0.34886306524276733 + }, + { + "epoch": 1.2311647221672928, + "step": 12452, + "train/sim_loss": 0.04011821746826172 + }, + { + "epoch": 1.2311647221672928, + "step": 12452, + "train/total_loss": 0.07500452548265457 + }, + { + "entropy": 9.51370620727539, + "epoch": 1.2312635950168085, + "mean_token_accuracy": 0.8457223176956177, + "num_tokens": 5511198.0, + "step": 12453, + "train/ce_loss": 0.21164701879024506 + }, + { + "epoch": 1.2312635950168085, + "step": 12453, + "train/sim_loss": 0.014962315559387207 + }, + { + "epoch": 1.2312635950168085, + "step": 12453, + "train/total_loss": 0.03612701594829559 + }, + { + "entropy": 9.100114822387695, + "epoch": 1.231362467866324, + "mean_token_accuracy": 0.7976744174957275, + "num_tokens": 5521596.0, + "step": 12454, + "train/ce_loss": 0.4854869842529297 + }, + { + "epoch": 1.231362467866324, + "step": 12454, + "train/sim_loss": 0.07070291042327881 + }, + { + "epoch": 1.231362467866324, + "step": 12454, + "train/total_loss": 0.11925160884857178 + }, + { + "entropy": 8.829423904418945, + "epoch": 1.2314613407158395, + "mean_token_accuracy": 0.8865313529968262, + "num_tokens": 5530128.0, + "step": 12455, + "train/ce_loss": 0.4094746708869934 + }, + { + "epoch": 1.2314613407158395, + "step": 12455, + "train/sim_loss": 0.05196654796600342 + }, + { + "epoch": 1.2314613407158395, + "step": 12455, + "train/total_loss": 0.09291401505470276 + }, + { + "entropy": 9.203730583190918, + "epoch": 1.231560213565355, + "mean_token_accuracy": 0.7853982448577881, + "num_tokens": 5542569.0, + "step": 12456, + "train/ce_loss": 0.41456273198127747 + }, + { + "epoch": 1.231560213565355, + "step": 12456, + "train/sim_loss": 0.03759753704071045 + }, + { + "epoch": 1.231560213565355, + "step": 12456, + "train/total_loss": 0.07905381172895432 + }, + { + "entropy": 9.641671180725098, + "epoch": 1.2316590864148704, + "mean_token_accuracy": 0.8998242616653442, + "num_tokens": 5556301.0, + "step": 12457, + "train/ce_loss": 0.5174147486686707 + }, + { + "epoch": 1.2316590864148704, + "step": 12457, + "train/sim_loss": 0.0427403450012207 + }, + { + "epoch": 1.2316590864148704, + "step": 12457, + "train/total_loss": 0.09448182582855225 + }, + { + "entropy": 9.230289459228516, + "epoch": 1.231757959264386, + "mean_token_accuracy": 0.8237885236740112, + "num_tokens": 5567258.0, + "step": 12458, + "train/ce_loss": 0.7489534020423889 + }, + { + "epoch": 1.231757959264386, + "step": 12458, + "train/sim_loss": 0.019918322563171387 + }, + { + "epoch": 1.231757959264386, + "step": 12458, + "train/total_loss": 0.09481366723775864 + }, + { + "entropy": 9.201830863952637, + "epoch": 1.2318568321139014, + "mean_token_accuracy": 0.8834661245346069, + "num_tokens": 5578138.0, + "step": 12459, + "train/ce_loss": 0.2346048206090927 + }, + { + "epoch": 1.2318568321139014, + "step": 12459, + "train/sim_loss": 0.04182928800582886 + }, + { + "epoch": 1.2318568321139014, + "step": 12459, + "train/total_loss": 0.06528977304697037 + }, + { + "epoch": 1.231955704963417, + "grad_norm": 0.5643858313560486, + "learning_rate": 6.922068931414726e-06, + "loss": 0.0853, + "step": 12460 + }, + { + "entropy": 9.480239868164062, + "epoch": 1.231955704963417, + "mean_token_accuracy": 0.8741976618766785, + "num_tokens": 5587170.0, + "step": 12460, + "train/ce_loss": 0.6033692359924316 + }, + { + "epoch": 1.231955704963417, + "step": 12460, + "train/sim_loss": 0.04319632053375244 + }, + { + "epoch": 1.231955704963417, + "step": 12460, + "train/total_loss": 0.10353324562311172 + }, + { + "entropy": 9.867774963378906, + "epoch": 1.2320545778129326, + "mean_token_accuracy": 0.8771384358406067, + "num_tokens": 5601621.0, + "step": 12461, + "train/ce_loss": 0.7003886699676514 + }, + { + "epoch": 1.2320545778129326, + "step": 12461, + "train/sim_loss": 0.0574641227722168 + }, + { + "epoch": 1.2320545778129326, + "step": 12461, + "train/total_loss": 0.12750299274921417 + }, + { + "entropy": 9.474895477294922, + "epoch": 1.232153450662448, + "mean_token_accuracy": 0.8121951222419739, + "num_tokens": 5614562.0, + "step": 12462, + "train/ce_loss": 0.27277207374572754 + }, + { + "epoch": 1.232153450662448, + "step": 12462, + "train/sim_loss": 0.026337921619415283 + }, + { + "epoch": 1.232153450662448, + "step": 12462, + "train/total_loss": 0.053615130484104156 + }, + { + "entropy": 9.66238784790039, + "epoch": 1.2322523235119636, + "mean_token_accuracy": 0.8161592483520508, + "num_tokens": 5630087.0, + "step": 12463, + "train/ce_loss": 0.7972533702850342 + }, + { + "epoch": 1.2322523235119636, + "step": 12463, + "train/sim_loss": 0.08376604318618774 + }, + { + "epoch": 1.2322523235119636, + "step": 12463, + "train/total_loss": 0.1634913831949234 + }, + { + "entropy": 9.630582809448242, + "epoch": 1.232351196361479, + "mean_token_accuracy": 0.8286139965057373, + "num_tokens": 5639398.0, + "step": 12464, + "train/ce_loss": 0.4199960231781006 + }, + { + "epoch": 1.232351196361479, + "step": 12464, + "train/sim_loss": 0.06499940156936646 + }, + { + "epoch": 1.232351196361479, + "step": 12464, + "train/total_loss": 0.10699900984764099 + }, + { + "entropy": 9.726856231689453, + "epoch": 1.2324500692109948, + "mean_token_accuracy": 0.7735849022865295, + "num_tokens": 5657568.0, + "step": 12465, + "train/ce_loss": 0.5927906632423401 + }, + { + "epoch": 1.2324500692109948, + "step": 12465, + "train/sim_loss": 0.06410503387451172 + }, + { + "epoch": 1.2324500692109948, + "step": 12465, + "train/total_loss": 0.12338410317897797 + }, + { + "entropy": 9.572921752929688, + "epoch": 1.2325489420605102, + "mean_token_accuracy": 0.8241042494773865, + "num_tokens": 5672319.0, + "step": 12466, + "train/ce_loss": 0.8301641345024109 + }, + { + "epoch": 1.2325489420605102, + "step": 12466, + "train/sim_loss": 0.027299702167510986 + }, + { + "epoch": 1.2325489420605102, + "step": 12466, + "train/total_loss": 0.11031612008810043 + }, + { + "entropy": 9.626396179199219, + "epoch": 1.2326478149100257, + "mean_token_accuracy": 0.8758170008659363, + "num_tokens": 5683167.0, + "step": 12467, + "train/ce_loss": 3.825554983905022e-07 + }, + { + "epoch": 1.2326478149100257, + "step": 12467, + "train/sim_loss": 0.01179724931716919 + }, + { + "epoch": 1.2326478149100257, + "step": 12467, + "train/total_loss": 0.011797287501394749 + }, + { + "entropy": 9.597445487976074, + "epoch": 1.2327466877595412, + "mean_token_accuracy": 0.8238341808319092, + "num_tokens": 5697922.0, + "step": 12468, + "train/ce_loss": 0.9417304992675781 + }, + { + "epoch": 1.2327466877595412, + "step": 12468, + "train/sim_loss": 0.09607744216918945 + }, + { + "epoch": 1.2327466877595412, + "step": 12468, + "train/total_loss": 0.1902504861354828 + }, + { + "entropy": 9.581724166870117, + "epoch": 1.2328455606090567, + "mean_token_accuracy": 0.8299086689949036, + "num_tokens": 5713676.0, + "step": 12469, + "train/ce_loss": 0.5589268803596497 + }, + { + "epoch": 1.2328455606090567, + "step": 12469, + "train/sim_loss": 0.04972398281097412 + }, + { + "epoch": 1.2328455606090567, + "step": 12469, + "train/total_loss": 0.10561667382717133 + }, + { + "entropy": 9.554108619689941, + "epoch": 1.2329444334585722, + "mean_token_accuracy": 0.9014251828193665, + "num_tokens": 5728394.0, + "step": 12470, + "train/ce_loss": 0.18829452991485596 + }, + { + "epoch": 1.2329444334585722, + "step": 12470, + "train/sim_loss": 0.04577231407165527 + }, + { + "epoch": 1.2329444334585722, + "step": 12470, + "train/total_loss": 0.06460176408290863 + }, + { + "entropy": 9.649129867553711, + "epoch": 1.233043306308088, + "mean_token_accuracy": 0.802096962928772, + "num_tokens": 5737932.0, + "step": 12471, + "train/ce_loss": 0.5513995885848999 + }, + { + "epoch": 1.233043306308088, + "step": 12471, + "train/sim_loss": 0.038683295249938965 + }, + { + "epoch": 1.233043306308088, + "step": 12471, + "train/total_loss": 0.09382325410842896 + }, + { + "entropy": 9.984392166137695, + "epoch": 1.2331421791576034, + "mean_token_accuracy": 0.8786610960960388, + "num_tokens": 5751677.0, + "step": 12472, + "train/ce_loss": 5.725946152779215e-07 + }, + { + "epoch": 1.2331421791576034, + "step": 12472, + "train/sim_loss": 0.04288738965988159 + }, + { + "epoch": 1.2331421791576034, + "step": 12472, + "train/total_loss": 0.04288744553923607 + }, + { + "entropy": 9.565532684326172, + "epoch": 1.2332410520071189, + "mean_token_accuracy": 0.8809523582458496, + "num_tokens": 5763906.0, + "step": 12473, + "train/ce_loss": 0.5356455445289612 + }, + { + "epoch": 1.2332410520071189, + "step": 12473, + "train/sim_loss": 0.04218250513076782 + }, + { + "epoch": 1.2332410520071189, + "step": 12473, + "train/total_loss": 0.09574706107378006 + }, + { + "entropy": 9.345226287841797, + "epoch": 1.2333399248566344, + "mean_token_accuracy": 0.9257057905197144, + "num_tokens": 5772689.0, + "step": 12474, + "train/ce_loss": 0.25208982825279236 + }, + { + "epoch": 1.2333399248566344, + "step": 12474, + "train/sim_loss": 0.04620957374572754 + }, + { + "epoch": 1.2333399248566344, + "step": 12474, + "train/total_loss": 0.07141855359077454 + }, + { + "entropy": 9.311262130737305, + "epoch": 1.2334387977061498, + "mean_token_accuracy": 0.9022082090377808, + "num_tokens": 5788595.0, + "step": 12475, + "train/ce_loss": 0.7717061042785645 + }, + { + "epoch": 1.2334387977061498, + "step": 12475, + "train/sim_loss": 0.059177517890930176 + }, + { + "epoch": 1.2334387977061498, + "step": 12475, + "train/total_loss": 0.13634812831878662 + }, + { + "entropy": 9.53399658203125, + "epoch": 1.2335376705556653, + "mean_token_accuracy": 0.8736842274665833, + "num_tokens": 5801693.0, + "step": 12476, + "train/ce_loss": 0.44051027297973633 + }, + { + "epoch": 1.2335376705556653, + "step": 12476, + "train/sim_loss": 0.00910043716430664 + }, + { + "epoch": 1.2335376705556653, + "step": 12476, + "train/total_loss": 0.05315146595239639 + }, + { + "entropy": 9.273439407348633, + "epoch": 1.233636543405181, + "mean_token_accuracy": 0.8287461996078491, + "num_tokens": 5814086.0, + "step": 12477, + "train/ce_loss": 0.3366101384162903 + }, + { + "epoch": 1.233636543405181, + "step": 12477, + "train/sim_loss": 0.024870216846466064 + }, + { + "epoch": 1.233636543405181, + "step": 12477, + "train/total_loss": 0.05853123217821121 + }, + { + "entropy": 9.760299682617188, + "epoch": 1.2337354162546965, + "mean_token_accuracy": 0.8692971467971802, + "num_tokens": 5831533.0, + "step": 12478, + "train/ce_loss": 0.5210216045379639 + }, + { + "epoch": 1.2337354162546965, + "step": 12478, + "train/sim_loss": 0.04120635986328125 + }, + { + "epoch": 1.2337354162546965, + "step": 12478, + "train/total_loss": 0.09330852329730988 + }, + { + "entropy": 9.081663131713867, + "epoch": 1.233834289104212, + "mean_token_accuracy": 0.8174945712089539, + "num_tokens": 5839567.0, + "step": 12479, + "train/ce_loss": 0.2594170868396759 + }, + { + "epoch": 1.233834289104212, + "step": 12479, + "train/sim_loss": 0.01993274688720703 + }, + { + "epoch": 1.233834289104212, + "step": 12479, + "train/total_loss": 0.0458744540810585 + }, + { + "epoch": 1.2339331619537275, + "grad_norm": 0.5771082043647766, + "learning_rate": 6.9171240666567776e-06, + "loss": 0.0934, + "step": 12480 + }, + { + "entropy": 9.168163299560547, + "epoch": 1.2339331619537275, + "mean_token_accuracy": 0.848143994808197, + "num_tokens": 5850167.0, + "step": 12480, + "train/ce_loss": 0.6882924437522888 + }, + { + "epoch": 1.2339331619537275, + "step": 12480, + "train/sim_loss": 0.0538715124130249 + }, + { + "epoch": 1.2339331619537275, + "step": 12480, + "train/total_loss": 0.1227007582783699 + }, + { + "entropy": 9.555922508239746, + "epoch": 1.234032034803243, + "mean_token_accuracy": 0.8021390438079834, + "num_tokens": 5861361.0, + "step": 12481, + "train/ce_loss": 0.21629928052425385 + }, + { + "epoch": 1.234032034803243, + "step": 12481, + "train/sim_loss": 0.010497093200683594 + }, + { + "epoch": 1.234032034803243, + "step": 12481, + "train/total_loss": 0.0321270227432251 + }, + { + "entropy": 9.644549369812012, + "epoch": 1.2341309076527585, + "mean_token_accuracy": 0.8292253613471985, + "num_tokens": 5868776.0, + "step": 12482, + "train/ce_loss": 0.4859948456287384 + }, + { + "epoch": 1.2341309076527585, + "step": 12482, + "train/sim_loss": 0.044355571269989014 + }, + { + "epoch": 1.2341309076527585, + "step": 12482, + "train/total_loss": 0.09295505285263062 + }, + { + "entropy": 8.849251747131348, + "epoch": 1.2342297805022742, + "mean_token_accuracy": 0.8456678986549377, + "num_tokens": 5877952.0, + "step": 12483, + "train/ce_loss": 0.5158043503761292 + }, + { + "epoch": 1.2342297805022742, + "step": 12483, + "train/sim_loss": 0.013993620872497559 + }, + { + "epoch": 1.2342297805022742, + "step": 12483, + "train/total_loss": 0.06557405740022659 + }, + { + "entropy": 9.88748836517334, + "epoch": 1.2343286533517897, + "mean_token_accuracy": 0.8928571343421936, + "num_tokens": 5896730.0, + "step": 12484, + "train/ce_loss": 1.3165811196813593e-06 + }, + { + "epoch": 1.2343286533517897, + "step": 12484, + "train/sim_loss": 0.027594685554504395 + }, + { + "epoch": 1.2343286533517897, + "step": 12484, + "train/total_loss": 0.02759481780230999 + }, + { + "entropy": 9.198343276977539, + "epoch": 1.2344275262013051, + "mean_token_accuracy": 0.8692403435707092, + "num_tokens": 5906788.0, + "step": 12485, + "train/ce_loss": 0.39673346281051636 + }, + { + "epoch": 1.2344275262013051, + "step": 12485, + "train/sim_loss": 0.0956500768661499 + }, + { + "epoch": 1.2344275262013051, + "step": 12485, + "train/total_loss": 0.1353234201669693 + }, + { + "entropy": 9.525102615356445, + "epoch": 1.2345263990508206, + "mean_token_accuracy": 0.8640645742416382, + "num_tokens": 5924588.0, + "step": 12486, + "train/ce_loss": 0.2710881531238556 + }, + { + "epoch": 1.2345263990508206, + "step": 12486, + "train/sim_loss": 0.09870290756225586 + }, + { + "epoch": 1.2345263990508206, + "step": 12486, + "train/total_loss": 0.12581172585487366 + }, + { + "entropy": 9.915403366088867, + "epoch": 1.234625271900336, + "mean_token_accuracy": 0.8421927094459534, + "num_tokens": 5936275.0, + "step": 12487, + "train/ce_loss": 0.2628486752510071 + }, + { + "epoch": 1.234625271900336, + "step": 12487, + "train/sim_loss": 0.047317445278167725 + }, + { + "epoch": 1.234625271900336, + "step": 12487, + "train/total_loss": 0.07360231131315231 + }, + { + "entropy": 9.317927360534668, + "epoch": 1.2347241447498516, + "mean_token_accuracy": 0.8699878454208374, + "num_tokens": 5949269.0, + "step": 12488, + "train/ce_loss": 0.5481224656105042 + }, + { + "epoch": 1.2347241447498516, + "step": 12488, + "train/sim_loss": 0.04385662078857422 + }, + { + "epoch": 1.2347241447498516, + "step": 12488, + "train/total_loss": 0.09866887331008911 + }, + { + "entropy": 9.892882347106934, + "epoch": 1.2348230175993673, + "mean_token_accuracy": 0.7982832789421082, + "num_tokens": 5959546.0, + "step": 12489, + "train/ce_loss": 0.8517626523971558 + }, + { + "epoch": 1.2348230175993673, + "step": 12489, + "train/sim_loss": 0.09499841928482056 + }, + { + "epoch": 1.2348230175993673, + "step": 12489, + "train/total_loss": 0.18017467856407166 + }, + { + "entropy": 9.802144050598145, + "epoch": 1.2349218904488828, + "mean_token_accuracy": 0.9072978496551514, + "num_tokens": 5969444.0, + "step": 12490, + "train/ce_loss": 2.38887196246651e-07 + }, + { + "epoch": 1.2349218904488828, + "step": 12490, + "train/sim_loss": 0.016750216484069824 + }, + { + "epoch": 1.2349218904488828, + "step": 12490, + "train/total_loss": 0.016750240698456764 + }, + { + "entropy": 9.209623336791992, + "epoch": 1.2350207632983983, + "mean_token_accuracy": 0.8722891807556152, + "num_tokens": 5979707.0, + "step": 12491, + "train/ce_loss": 0.30886051058769226 + }, + { + "epoch": 1.2350207632983983, + "step": 12491, + "train/sim_loss": 0.04227471351623535 + }, + { + "epoch": 1.2350207632983983, + "step": 12491, + "train/total_loss": 0.07316076755523682 + }, + { + "entropy": 9.369318008422852, + "epoch": 1.2351196361479138, + "mean_token_accuracy": 0.8744939565658569, + "num_tokens": 5990479.0, + "step": 12492, + "train/ce_loss": 0.6724351048469543 + }, + { + "epoch": 1.2351196361479138, + "step": 12492, + "train/sim_loss": 0.05241149663925171 + }, + { + "epoch": 1.2351196361479138, + "step": 12492, + "train/total_loss": 0.11965500563383102 + }, + { + "entropy": 9.501274108886719, + "epoch": 1.2352185089974292, + "mean_token_accuracy": 0.8501741886138916, + "num_tokens": 6004678.0, + "step": 12493, + "train/ce_loss": 0.2511841356754303 + }, + { + "epoch": 1.2352185089974292, + "step": 12493, + "train/sim_loss": 0.015908002853393555 + }, + { + "epoch": 1.2352185089974292, + "step": 12493, + "train/total_loss": 0.041026417165994644 + }, + { + "entropy": 10.016584396362305, + "epoch": 1.235317381846945, + "mean_token_accuracy": 0.9027777910232544, + "num_tokens": 6013065.0, + "step": 12494, + "train/ce_loss": 0.7323745489120483 + }, + { + "epoch": 1.235317381846945, + "step": 12494, + "train/sim_loss": 0.04405325651168823 + }, + { + "epoch": 1.235317381846945, + "step": 12494, + "train/total_loss": 0.11729071289300919 + }, + { + "entropy": 9.515050888061523, + "epoch": 1.2354162546964604, + "mean_token_accuracy": 0.8471023440361023, + "num_tokens": 6026301.0, + "step": 12495, + "train/ce_loss": 0.4303852915763855 + }, + { + "epoch": 1.2354162546964604, + "step": 12495, + "train/sim_loss": 0.07597202062606812 + }, + { + "epoch": 1.2354162546964604, + "step": 12495, + "train/total_loss": 0.1190105527639389 + }, + { + "entropy": 9.877035140991211, + "epoch": 1.235515127545976, + "mean_token_accuracy": 0.836320161819458, + "num_tokens": 6038237.0, + "step": 12496, + "train/ce_loss": 0.58847576379776 + }, + { + "epoch": 1.235515127545976, + "step": 12496, + "train/sim_loss": 0.051689326763153076 + }, + { + "epoch": 1.235515127545976, + "step": 12496, + "train/total_loss": 0.11053690314292908 + }, + { + "entropy": 9.821012496948242, + "epoch": 1.2356140003954914, + "mean_token_accuracy": 0.8529014587402344, + "num_tokens": 6049827.0, + "step": 12497, + "train/ce_loss": 0.272104948759079 + }, + { + "epoch": 1.2356140003954914, + "step": 12497, + "train/sim_loss": 0.07392865419387817 + }, + { + "epoch": 1.2356140003954914, + "step": 12497, + "train/total_loss": 0.10113915055990219 + }, + { + "entropy": 8.968945503234863, + "epoch": 1.235712873245007, + "mean_token_accuracy": 0.8654545545578003, + "num_tokens": 6055508.0, + "step": 12498, + "train/ce_loss": 0.4220104515552521 + }, + { + "epoch": 1.235712873245007, + "step": 12498, + "train/sim_loss": 0.054599761962890625 + }, + { + "epoch": 1.235712873245007, + "step": 12498, + "train/total_loss": 0.0968008041381836 + }, + { + "entropy": 9.297551155090332, + "epoch": 1.2358117460945224, + "mean_token_accuracy": 0.8555327653884888, + "num_tokens": 6073515.0, + "step": 12499, + "train/ce_loss": 0.6243859529495239 + }, + { + "epoch": 1.2358117460945224, + "step": 12499, + "train/sim_loss": 0.08227699995040894 + }, + { + "epoch": 1.2358117460945224, + "step": 12499, + "train/total_loss": 0.1447155922651291 + }, + { + "epoch": 1.2359106189440379, + "grad_norm": 0.6121355295181274, + "learning_rate": 6.912179201898829e-06, + "loss": 0.0879, + "step": 12500 + }, + { + "entropy": 9.467950820922852, + "epoch": 1.2359106189440379, + "mean_token_accuracy": 0.8100000023841858, + "num_tokens": 6085296.0, + "step": 12500, + "train/ce_loss": 0.2544325292110443 + }, + { + "epoch": 1.2359106189440379, + "step": 12500, + "train/sim_loss": 0.0621340274810791 + }, + { + "epoch": 1.2359106189440379, + "step": 12500, + "train/total_loss": 0.08757728338241577 + }, + { + "entropy": 9.451425552368164, + "epoch": 1.2360094917935536, + "mean_token_accuracy": 0.8419452905654907, + "num_tokens": 6098209.0, + "step": 12501, + "train/ce_loss": 5.826237270412093e-07 + }, + { + "epoch": 1.2360094917935536, + "step": 12501, + "train/sim_loss": 0.03557419776916504 + }, + { + "epoch": 1.2360094917935536, + "step": 12501, + "train/total_loss": 0.035574257373809814 + }, + { + "entropy": 9.56895637512207, + "epoch": 1.236108364643069, + "mean_token_accuracy": 0.8644859790802002, + "num_tokens": 6116622.0, + "step": 12502, + "train/ce_loss": 0.17975090444087982 + }, + { + "epoch": 1.236108364643069, + "step": 12502, + "train/sim_loss": 0.04574263095855713 + }, + { + "epoch": 1.236108364643069, + "step": 12502, + "train/total_loss": 0.06371772289276123 + }, + { + "entropy": 9.708013534545898, + "epoch": 1.2362072374925845, + "mean_token_accuracy": 0.8803641200065613, + "num_tokens": 6133966.0, + "step": 12503, + "train/ce_loss": 0.5630883574485779 + }, + { + "epoch": 1.2362072374925845, + "step": 12503, + "train/sim_loss": 0.040194690227508545 + }, + { + "epoch": 1.2362072374925845, + "step": 12503, + "train/total_loss": 0.09650352597236633 + }, + { + "entropy": 9.294231414794922, + "epoch": 1.2363061103421, + "mean_token_accuracy": 0.8172690868377686, + "num_tokens": 6149243.0, + "step": 12504, + "train/ce_loss": 0.516626238822937 + }, + { + "epoch": 1.2363061103421, + "step": 12504, + "train/sim_loss": 0.08356082439422607 + }, + { + "epoch": 1.2363061103421, + "step": 12504, + "train/total_loss": 0.13522344827651978 + }, + { + "entropy": 9.624732971191406, + "epoch": 1.2364049831916155, + "mean_token_accuracy": 0.8764783143997192, + "num_tokens": 6164063.0, + "step": 12505, + "train/ce_loss": 0.3617064952850342 + }, + { + "epoch": 1.2364049831916155, + "step": 12505, + "train/sim_loss": 0.016138553619384766 + }, + { + "epoch": 1.2364049831916155, + "step": 12505, + "train/total_loss": 0.05230920389294624 + }, + { + "entropy": 9.522979736328125, + "epoch": 1.2365038560411312, + "mean_token_accuracy": 0.8826945424079895, + "num_tokens": 6182220.0, + "step": 12506, + "train/ce_loss": 0.4249723255634308 + }, + { + "epoch": 1.2365038560411312, + "step": 12506, + "train/sim_loss": 0.11097675561904907 + }, + { + "epoch": 1.2365038560411312, + "step": 12506, + "train/total_loss": 0.15347398817539215 + }, + { + "entropy": 8.954086303710938, + "epoch": 1.2366027288906467, + "mean_token_accuracy": 0.8430812954902649, + "num_tokens": 6190469.0, + "step": 12507, + "train/ce_loss": 0.616412341594696 + }, + { + "epoch": 1.2366027288906467, + "step": 12507, + "train/sim_loss": 0.03135812282562256 + }, + { + "epoch": 1.2366027288906467, + "step": 12507, + "train/total_loss": 0.09299935400485992 + }, + { + "entropy": 9.139724731445312, + "epoch": 1.2367016017401622, + "mean_token_accuracy": 0.8344298005104065, + "num_tokens": 6200175.0, + "step": 12508, + "train/ce_loss": 0.901433527469635 + }, + { + "epoch": 1.2367016017401622, + "step": 12508, + "train/sim_loss": 0.03955376148223877 + }, + { + "epoch": 1.2367016017401622, + "step": 12508, + "train/total_loss": 0.12969711422920227 + }, + { + "entropy": 8.803183555603027, + "epoch": 1.2368004745896777, + "mean_token_accuracy": 0.8596938848495483, + "num_tokens": 6207906.0, + "step": 12509, + "train/ce_loss": 6.848121074654046e-07 + }, + { + "epoch": 1.2368004745896777, + "step": 12509, + "train/sim_loss": 0.03197091817855835 + }, + { + "epoch": 1.2368004745896777, + "step": 12509, + "train/total_loss": 0.03197098523378372 + }, + { + "entropy": 9.908498764038086, + "epoch": 1.2368993474391932, + "mean_token_accuracy": 0.8388158082962036, + "num_tokens": 6226936.0, + "step": 12510, + "train/ce_loss": 0.9327084422111511 + }, + { + "epoch": 1.2368993474391932, + "step": 12510, + "train/sim_loss": 0.0554385781288147 + }, + { + "epoch": 1.2368993474391932, + "step": 12510, + "train/total_loss": 0.14870941638946533 + }, + { + "entropy": 9.735718727111816, + "epoch": 1.2369982202887086, + "mean_token_accuracy": 0.8188011050224304, + "num_tokens": 6239187.0, + "step": 12511, + "train/ce_loss": 0.42931950092315674 + }, + { + "epoch": 1.2369982202887086, + "step": 12511, + "train/sim_loss": 0.02213764190673828 + }, + { + "epoch": 1.2369982202887086, + "step": 12511, + "train/total_loss": 0.06506959348917007 + }, + { + "entropy": 9.827651977539062, + "epoch": 1.2370970931382241, + "mean_token_accuracy": 0.8454440832138062, + "num_tokens": 6256298.0, + "step": 12512, + "train/ce_loss": 0.5822435617446899 + }, + { + "epoch": 1.2370970931382241, + "step": 12512, + "train/sim_loss": 0.03908050060272217 + }, + { + "epoch": 1.2370970931382241, + "step": 12512, + "train/total_loss": 0.09730485826730728 + }, + { + "entropy": 9.732207298278809, + "epoch": 1.2371959659877398, + "mean_token_accuracy": 0.8579545617103577, + "num_tokens": 6267626.0, + "step": 12513, + "train/ce_loss": 0.3230358064174652 + }, + { + "epoch": 1.2371959659877398, + "step": 12513, + "train/sim_loss": 0.04725009202957153 + }, + { + "epoch": 1.2371959659877398, + "step": 12513, + "train/total_loss": 0.07955367863178253 + }, + { + "entropy": 9.298759460449219, + "epoch": 1.2372948388372553, + "mean_token_accuracy": 0.841549277305603, + "num_tokens": 6279150.0, + "step": 12514, + "train/ce_loss": 0.31454503536224365 + }, + { + "epoch": 1.2372948388372553, + "step": 12514, + "train/sim_loss": 0.019755542278289795 + }, + { + "epoch": 1.2372948388372553, + "step": 12514, + "train/total_loss": 0.05121004581451416 + }, + { + "entropy": 9.77219009399414, + "epoch": 1.2373937116867708, + "mean_token_accuracy": 0.8429319262504578, + "num_tokens": 6292542.0, + "step": 12515, + "train/ce_loss": 0.48323914408683777 + }, + { + "epoch": 1.2373937116867708, + "step": 12515, + "train/sim_loss": 0.020880699157714844 + }, + { + "epoch": 1.2373937116867708, + "step": 12515, + "train/total_loss": 0.06920461356639862 + }, + { + "entropy": 9.229963302612305, + "epoch": 1.2374925845362863, + "mean_token_accuracy": 0.8739583492279053, + "num_tokens": 6307477.0, + "step": 12516, + "train/ce_loss": 0.23296482861042023 + }, + { + "epoch": 1.2374925845362863, + "step": 12516, + "train/sim_loss": 0.04744213819503784 + }, + { + "epoch": 1.2374925845362863, + "step": 12516, + "train/total_loss": 0.07073862105607986 + }, + { + "entropy": 9.68316650390625, + "epoch": 1.2375914573858018, + "mean_token_accuracy": 0.8110944628715515, + "num_tokens": 6316624.0, + "step": 12517, + "train/ce_loss": 0.6885124444961548 + }, + { + "epoch": 1.2375914573858018, + "step": 12517, + "train/sim_loss": 0.059769272804260254 + }, + { + "epoch": 1.2375914573858018, + "step": 12517, + "train/total_loss": 0.12862052023410797 + }, + { + "entropy": 9.211031913757324, + "epoch": 1.2376903302353175, + "mean_token_accuracy": 0.8523908257484436, + "num_tokens": 6328790.0, + "step": 12518, + "train/ce_loss": 0.6386392712593079 + }, + { + "epoch": 1.2376903302353175, + "step": 12518, + "train/sim_loss": 0.08869814872741699 + }, + { + "epoch": 1.2376903302353175, + "step": 12518, + "train/total_loss": 0.15256208181381226 + }, + { + "entropy": 9.816085815429688, + "epoch": 1.237789203084833, + "mean_token_accuracy": 0.886904776096344, + "num_tokens": 6337582.0, + "step": 12519, + "train/ce_loss": 0.43800079822540283 + }, + { + "epoch": 1.237789203084833, + "step": 12519, + "train/sim_loss": 0.07974517345428467 + }, + { + "epoch": 1.237789203084833, + "step": 12519, + "train/total_loss": 0.12354525923728943 + }, + { + "epoch": 1.2378880759343485, + "grad_norm": 0.5662554502487183, + "learning_rate": 6.90723433714088e-06, + "loss": 0.0904, + "step": 12520 + }, + { + "entropy": 9.47894287109375, + "epoch": 1.2378880759343485, + "mean_token_accuracy": 0.875, + "num_tokens": 6351807.0, + "step": 12520, + "train/ce_loss": 1.0522586535444134e-06 + }, + { + "epoch": 1.2378880759343485, + "step": 12520, + "train/sim_loss": 0.03230494260787964 + }, + { + "epoch": 1.2378880759343485, + "step": 12520, + "train/total_loss": 0.032305046916007996 + }, + { + "entropy": 9.801132202148438, + "epoch": 1.237986948783864, + "mean_token_accuracy": 0.8424437046051025, + "num_tokens": 6369184.0, + "step": 12521, + "train/ce_loss": 0.4584807753562927 + }, + { + "epoch": 1.237986948783864, + "step": 12521, + "train/sim_loss": 0.017684102058410645 + }, + { + "epoch": 1.237986948783864, + "step": 12521, + "train/total_loss": 0.06353218108415604 + }, + { + "entropy": 10.064308166503906, + "epoch": 1.2380858216333794, + "mean_token_accuracy": 0.8552916049957275, + "num_tokens": 6383328.0, + "step": 12522, + "train/ce_loss": 0.5781126022338867 + }, + { + "epoch": 1.2380858216333794, + "step": 12522, + "train/sim_loss": 0.06443345546722412 + }, + { + "epoch": 1.2380858216333794, + "step": 12522, + "train/total_loss": 0.12224471569061279 + }, + { + "entropy": 9.48878288269043, + "epoch": 1.238184694482895, + "mean_token_accuracy": 0.8353344798088074, + "num_tokens": 6394866.0, + "step": 12523, + "train/ce_loss": 0.7190162539482117 + }, + { + "epoch": 1.238184694482895, + "step": 12523, + "train/sim_loss": 0.03738921880722046 + }, + { + "epoch": 1.238184694482895, + "step": 12523, + "train/total_loss": 0.10929084569215775 + }, + { + "entropy": 9.650094032287598, + "epoch": 1.2382835673324104, + "mean_token_accuracy": 0.830788791179657, + "num_tokens": 6414896.0, + "step": 12524, + "train/ce_loss": 0.6103621125221252 + }, + { + "epoch": 1.2382835673324104, + "step": 12524, + "train/sim_loss": 0.018635094165802002 + }, + { + "epoch": 1.2382835673324104, + "step": 12524, + "train/total_loss": 0.07967130839824677 + }, + { + "entropy": 9.647974014282227, + "epoch": 1.2383824401819261, + "mean_token_accuracy": 0.825123131275177, + "num_tokens": 6430542.0, + "step": 12525, + "train/ce_loss": 0.5854089856147766 + }, + { + "epoch": 1.2383824401819261, + "step": 12525, + "train/sim_loss": 0.056499361991882324 + }, + { + "epoch": 1.2383824401819261, + "step": 12525, + "train/total_loss": 0.11504025757312775 + }, + { + "entropy": 9.748770713806152, + "epoch": 1.2384813130314416, + "mean_token_accuracy": 0.8492063283920288, + "num_tokens": 6438279.0, + "step": 12526, + "train/ce_loss": 0.4207918643951416 + }, + { + "epoch": 1.2384813130314416, + "step": 12526, + "train/sim_loss": 0.035882651805877686 + }, + { + "epoch": 1.2384813130314416, + "step": 12526, + "train/total_loss": 0.07796183973550797 + }, + { + "entropy": 9.931772232055664, + "epoch": 1.238580185880957, + "mean_token_accuracy": 0.9395604133605957, + "num_tokens": 6445527.0, + "step": 12527, + "train/ce_loss": 3.748951485249563e-06 + }, + { + "epoch": 1.238580185880957, + "step": 12527, + "train/sim_loss": 0.05252277851104736 + }, + { + "epoch": 1.238580185880957, + "step": 12527, + "train/total_loss": 0.05252315476536751 + }, + { + "entropy": 9.390485763549805, + "epoch": 1.2386790587304726, + "mean_token_accuracy": 0.8474341034889221, + "num_tokens": 6455543.0, + "step": 12528, + "train/ce_loss": 0.23237992823123932 + }, + { + "epoch": 1.2386790587304726, + "step": 12528, + "train/sim_loss": 0.0659404993057251 + }, + { + "epoch": 1.2386790587304726, + "step": 12528, + "train/total_loss": 0.08917849510908127 + }, + { + "entropy": 9.601454734802246, + "epoch": 1.238777931579988, + "mean_token_accuracy": 0.8335419297218323, + "num_tokens": 6472856.0, + "step": 12529, + "train/ce_loss": 0.40244901180267334 + }, + { + "epoch": 1.238777931579988, + "step": 12529, + "train/sim_loss": 0.04056304693222046 + }, + { + "epoch": 1.238777931579988, + "step": 12529, + "train/total_loss": 0.08080795407295227 + }, + { + "entropy": 9.432394027709961, + "epoch": 1.2388768044295038, + "mean_token_accuracy": 0.8253588676452637, + "num_tokens": 6484469.0, + "step": 12530, + "train/ce_loss": 0.5916703343391418 + }, + { + "epoch": 1.2388768044295038, + "step": 12530, + "train/sim_loss": 0.026930928230285645 + }, + { + "epoch": 1.2388768044295038, + "step": 12530, + "train/total_loss": 0.08609796315431595 + }, + { + "entropy": 9.49737548828125, + "epoch": 1.2389756772790192, + "mean_token_accuracy": 0.869328498840332, + "num_tokens": 6499332.0, + "step": 12531, + "train/ce_loss": 0.6128791570663452 + }, + { + "epoch": 1.2389756772790192, + "step": 12531, + "train/sim_loss": 0.04885673522949219 + }, + { + "epoch": 1.2389756772790192, + "step": 12531, + "train/total_loss": 0.11014465242624283 + }, + { + "entropy": 9.206061363220215, + "epoch": 1.2390745501285347, + "mean_token_accuracy": 0.8484848737716675, + "num_tokens": 6509899.0, + "step": 12532, + "train/ce_loss": 0.298514723777771 + }, + { + "epoch": 1.2390745501285347, + "step": 12532, + "train/sim_loss": 0.05087077617645264 + }, + { + "epoch": 1.2390745501285347, + "step": 12532, + "train/total_loss": 0.08072225004434586 + }, + { + "entropy": 9.704033851623535, + "epoch": 1.2391734229780502, + "mean_token_accuracy": 0.7702702879905701, + "num_tokens": 6528154.0, + "step": 12533, + "train/ce_loss": 0.3917025625705719 + }, + { + "epoch": 1.2391734229780502, + "step": 12533, + "train/sim_loss": 0.05463087558746338 + }, + { + "epoch": 1.2391734229780502, + "step": 12533, + "train/total_loss": 0.09380113333463669 + }, + { + "entropy": 9.78607177734375, + "epoch": 1.2392722958275657, + "mean_token_accuracy": 0.8564102649688721, + "num_tokens": 6544207.0, + "step": 12534, + "train/ce_loss": 0.5542641878128052 + }, + { + "epoch": 1.2392722958275657, + "step": 12534, + "train/sim_loss": 0.04285550117492676 + }, + { + "epoch": 1.2392722958275657, + "step": 12534, + "train/total_loss": 0.09828191995620728 + }, + { + "entropy": 9.473562240600586, + "epoch": 1.2393711686770812, + "mean_token_accuracy": 0.8727678656578064, + "num_tokens": 6557606.0, + "step": 12535, + "train/ce_loss": 0.3084079921245575 + }, + { + "epoch": 1.2393711686770812, + "step": 12535, + "train/sim_loss": 0.017737627029418945 + }, + { + "epoch": 1.2393711686770812, + "step": 12535, + "train/total_loss": 0.048578426241874695 + }, + { + "entropy": 9.31233024597168, + "epoch": 1.2394700415265967, + "mean_token_accuracy": 0.8079710006713867, + "num_tokens": 6569527.0, + "step": 12536, + "train/ce_loss": 0.4415064752101898 + }, + { + "epoch": 1.2394700415265967, + "step": 12536, + "train/sim_loss": 0.03932034969329834 + }, + { + "epoch": 1.2394700415265967, + "step": 12536, + "train/total_loss": 0.08347100019454956 + }, + { + "entropy": 9.499988555908203, + "epoch": 1.2395689143761124, + "mean_token_accuracy": 0.8333333134651184, + "num_tokens": 6586029.0, + "step": 12537, + "train/ce_loss": 3.3645611097199435e-07 + }, + { + "epoch": 1.2395689143761124, + "step": 12537, + "train/sim_loss": 0.04025864601135254 + }, + { + "epoch": 1.2395689143761124, + "step": 12537, + "train/total_loss": 0.040258679538965225 + }, + { + "entropy": 9.703176498413086, + "epoch": 1.2396677872256279, + "mean_token_accuracy": 0.8882681727409363, + "num_tokens": 6600159.0, + "step": 12538, + "train/ce_loss": 2.3619709565991798e-07 + }, + { + "epoch": 1.2396677872256279, + "step": 12538, + "train/sim_loss": 0.01202702522277832 + }, + { + "epoch": 1.2396677872256279, + "step": 12538, + "train/total_loss": 0.012027048505842686 + }, + { + "entropy": 9.006195068359375, + "epoch": 1.2397666600751434, + "mean_token_accuracy": 0.8919752836227417, + "num_tokens": 6613674.0, + "step": 12539, + "train/ce_loss": 0.12960083782672882 + }, + { + "epoch": 1.2397666600751434, + "step": 12539, + "train/sim_loss": 0.020288825035095215 + }, + { + "epoch": 1.2397666600751434, + "step": 12539, + "train/total_loss": 0.0332489088177681 + }, + { + "epoch": 1.2398655329246588, + "grad_norm": 0.5579644441604614, + "learning_rate": 6.90228947238293e-06, + "loss": 0.0874, + "step": 12540 + }, + { + "entropy": 9.187555313110352, + "epoch": 1.2398655329246588, + "mean_token_accuracy": 0.8638184070587158, + "num_tokens": 6623331.0, + "step": 12540, + "train/ce_loss": 0.4486141800880432 + }, + { + "epoch": 1.2398655329246588, + "step": 12540, + "train/sim_loss": 0.07071542739868164 + }, + { + "epoch": 1.2398655329246588, + "step": 12540, + "train/total_loss": 0.1155768483877182 + }, + { + "entropy": 9.43659496307373, + "epoch": 1.2399644057741743, + "mean_token_accuracy": 0.8653250932693481, + "num_tokens": 6634154.0, + "step": 12541, + "train/ce_loss": 0.6436477303504944 + }, + { + "epoch": 1.2399644057741743, + "step": 12541, + "train/sim_loss": 0.06096041202545166 + }, + { + "epoch": 1.2399644057741743, + "step": 12541, + "train/total_loss": 0.12532518804073334 + }, + { + "entropy": 9.337491989135742, + "epoch": 1.24006327862369, + "mean_token_accuracy": 0.8693820238113403, + "num_tokens": 6649949.0, + "step": 12542, + "train/ce_loss": 4.0249085486721015e-07 + }, + { + "epoch": 1.24006327862369, + "step": 12542, + "train/sim_loss": 0.032538533210754395 + }, + { + "epoch": 1.24006327862369, + "step": 12542, + "train/total_loss": 0.03253857418894768 + }, + { + "entropy": 9.531417846679688, + "epoch": 1.2401621514732055, + "mean_token_accuracy": 0.7793880701065063, + "num_tokens": 6663107.0, + "step": 12543, + "train/ce_loss": 0.6499949097633362 + }, + { + "epoch": 1.2401621514732055, + "step": 12543, + "train/sim_loss": 0.07677298784255981 + }, + { + "epoch": 1.2401621514732055, + "step": 12543, + "train/total_loss": 0.14177247881889343 + }, + { + "entropy": 9.85240650177002, + "epoch": 1.240261024322721, + "mean_token_accuracy": 0.8368421196937561, + "num_tokens": 6676718.0, + "step": 12544, + "train/ce_loss": 0.5161629915237427 + }, + { + "epoch": 1.240261024322721, + "step": 12544, + "train/sim_loss": 0.07053720951080322 + }, + { + "epoch": 1.240261024322721, + "step": 12544, + "train/total_loss": 0.12215350568294525 + }, + { + "entropy": 9.770992279052734, + "epoch": 1.2403598971722365, + "mean_token_accuracy": 0.9109195470809937, + "num_tokens": 6688257.0, + "step": 12545, + "train/ce_loss": 0.5384774208068848 + }, + { + "epoch": 1.2403598971722365, + "step": 12545, + "train/sim_loss": 0.03627502918243408 + }, + { + "epoch": 1.2403598971722365, + "step": 12545, + "train/total_loss": 0.0901227742433548 + }, + { + "entropy": 9.804027557373047, + "epoch": 1.240458770021752, + "mean_token_accuracy": 0.8611940145492554, + "num_tokens": 6701132.0, + "step": 12546, + "train/ce_loss": 6.390824864865863e-07 + }, + { + "epoch": 1.240458770021752, + "step": 12546, + "train/sim_loss": 0.032839179039001465 + }, + { + "epoch": 1.240458770021752, + "step": 12546, + "train/total_loss": 0.03283924236893654 + }, + { + "entropy": 9.352958679199219, + "epoch": 1.2405576428712675, + "mean_token_accuracy": 0.8220930099487305, + "num_tokens": 6710665.0, + "step": 12547, + "train/ce_loss": 0.6975446343421936 + }, + { + "epoch": 1.2405576428712675, + "step": 12547, + "train/sim_loss": 0.03659236431121826 + }, + { + "epoch": 1.2405576428712675, + "step": 12547, + "train/total_loss": 0.10634683072566986 + }, + { + "entropy": 9.777923583984375, + "epoch": 1.240656515720783, + "mean_token_accuracy": 0.9186046719551086, + "num_tokens": 6728013.0, + "step": 12548, + "train/ce_loss": 5.267332312541839e-07 + }, + { + "epoch": 1.240656515720783, + "step": 12548, + "train/sim_loss": 0.0320553183555603 + }, + { + "epoch": 1.240656515720783, + "step": 12548, + "train/total_loss": 0.03205537050962448 + }, + { + "entropy": 9.50805377960205, + "epoch": 1.2407553885702987, + "mean_token_accuracy": 0.8259526491165161, + "num_tokens": 6746286.0, + "step": 12549, + "train/ce_loss": 0.3701225817203522 + }, + { + "epoch": 1.2407553885702987, + "step": 12549, + "train/sim_loss": 0.01557457447052002 + }, + { + "epoch": 1.2407553885702987, + "step": 12549, + "train/total_loss": 0.052586834877729416 + }, + { + "entropy": 9.285633087158203, + "epoch": 1.2408542614198141, + "mean_token_accuracy": 0.827102780342102, + "num_tokens": 6756543.0, + "step": 12550, + "train/ce_loss": 0.2407236546278 + }, + { + "epoch": 1.2408542614198141, + "step": 12550, + "train/sim_loss": 0.01242220401763916 + }, + { + "epoch": 1.2408542614198141, + "step": 12550, + "train/total_loss": 0.03649456799030304 + }, + { + "entropy": 9.004837989807129, + "epoch": 1.2409531342693296, + "mean_token_accuracy": 0.868852436542511, + "num_tokens": 6764018.0, + "step": 12551, + "train/ce_loss": 0.40045708417892456 + }, + { + "epoch": 1.2409531342693296, + "step": 12551, + "train/sim_loss": 0.04090076684951782 + }, + { + "epoch": 1.2409531342693296, + "step": 12551, + "train/total_loss": 0.08094647526741028 + }, + { + "entropy": 9.174986839294434, + "epoch": 1.2410520071188451, + "mean_token_accuracy": 0.8452380895614624, + "num_tokens": 6775989.0, + "step": 12552, + "train/ce_loss": 0.6266543865203857 + }, + { + "epoch": 1.2410520071188451, + "step": 12552, + "train/sim_loss": 0.03172159194946289 + }, + { + "epoch": 1.2410520071188451, + "step": 12552, + "train/total_loss": 0.09438703209161758 + }, + { + "entropy": 9.253941535949707, + "epoch": 1.2411508799683606, + "mean_token_accuracy": 0.864522397518158, + "num_tokens": 6792631.0, + "step": 12553, + "train/ce_loss": 0.25184372067451477 + }, + { + "epoch": 1.2411508799683606, + "step": 12553, + "train/sim_loss": 0.014913439750671387 + }, + { + "epoch": 1.2411508799683606, + "step": 12553, + "train/total_loss": 0.040097810328006744 + }, + { + "entropy": 9.407503128051758, + "epoch": 1.2412497528178763, + "mean_token_accuracy": 0.8824742436408997, + "num_tokens": 6806903.0, + "step": 12554, + "train/ce_loss": 0.379149854183197 + }, + { + "epoch": 1.2412497528178763, + "step": 12554, + "train/sim_loss": 0.08832180500030518 + }, + { + "epoch": 1.2412497528178763, + "step": 12554, + "train/total_loss": 0.12623679637908936 + }, + { + "entropy": 9.72073745727539, + "epoch": 1.2413486256673918, + "mean_token_accuracy": 0.800000011920929, + "num_tokens": 6819175.0, + "step": 12555, + "train/ce_loss": 0.5071384310722351 + }, + { + "epoch": 1.2413486256673918, + "step": 12555, + "train/sim_loss": 0.04947417974472046 + }, + { + "epoch": 1.2413486256673918, + "step": 12555, + "train/total_loss": 0.10018802434206009 + }, + { + "entropy": 9.946680068969727, + "epoch": 1.2414474985169073, + "mean_token_accuracy": 0.8226690888404846, + "num_tokens": 6831360.0, + "step": 12556, + "train/ce_loss": 0.8999125361442566 + }, + { + "epoch": 1.2414474985169073, + "step": 12556, + "train/sim_loss": 0.05390232801437378 + }, + { + "epoch": 1.2414474985169073, + "step": 12556, + "train/total_loss": 0.14389358460903168 + }, + { + "entropy": 9.451179504394531, + "epoch": 1.2415463713664228, + "mean_token_accuracy": 0.8704225420951843, + "num_tokens": 6847674.0, + "step": 12557, + "train/ce_loss": 3.47885617202337e-07 + }, + { + "epoch": 1.2415463713664228, + "step": 12557, + "train/sim_loss": 0.03375351428985596 + }, + { + "epoch": 1.2415463713664228, + "step": 12557, + "train/total_loss": 0.03375354781746864 + }, + { + "entropy": 9.173002243041992, + "epoch": 1.2416452442159382, + "mean_token_accuracy": 0.8742038011550903, + "num_tokens": 6859830.0, + "step": 12558, + "train/ce_loss": 0.39153510332107544 + }, + { + "epoch": 1.2416452442159382, + "step": 12558, + "train/sim_loss": 0.04718506336212158 + }, + { + "epoch": 1.2416452442159382, + "step": 12558, + "train/total_loss": 0.0863385796546936 + }, + { + "entropy": 9.578485488891602, + "epoch": 1.2417441170654537, + "mean_token_accuracy": 0.8493648171424866, + "num_tokens": 6872890.0, + "step": 12559, + "train/ce_loss": 2.248008513561217e-06 + }, + { + "epoch": 1.2417441170654537, + "step": 12559, + "train/sim_loss": 0.03954946994781494 + }, + { + "epoch": 1.2417441170654537, + "step": 12559, + "train/total_loss": 0.03954969346523285 + }, + { + "epoch": 1.2418429899149694, + "grad_norm": 0.6418776512145996, + "learning_rate": 6.897344607624981e-06, + "loss": 0.0875, + "step": 12560 + }, + { + "entropy": 9.304548263549805, + "epoch": 1.2418429899149694, + "mean_token_accuracy": 0.8702461123466492, + "num_tokens": 6885190.0, + "step": 12560, + "train/ce_loss": 0.5140474438667297 + }, + { + "epoch": 1.2418429899149694, + "step": 12560, + "train/sim_loss": 0.07676124572753906 + }, + { + "epoch": 1.2418429899149694, + "step": 12560, + "train/total_loss": 0.12816599011421204 + }, + { + "entropy": 9.268477439880371, + "epoch": 1.241941862764485, + "mean_token_accuracy": 0.8174863457679749, + "num_tokens": 6898023.0, + "step": 12561, + "train/ce_loss": 0.434886634349823 + }, + { + "epoch": 1.241941862764485, + "step": 12561, + "train/sim_loss": 0.11710512638092041 + }, + { + "epoch": 1.241941862764485, + "step": 12561, + "train/total_loss": 0.16059379279613495 + }, + { + "entropy": 9.197189331054688, + "epoch": 1.2420407356140004, + "mean_token_accuracy": 0.8119325637817383, + "num_tokens": 6906944.0, + "step": 12562, + "train/ce_loss": 0.49903541803359985 + }, + { + "epoch": 1.2420407356140004, + "step": 12562, + "train/sim_loss": 0.1175355315208435 + }, + { + "epoch": 1.2420407356140004, + "step": 12562, + "train/total_loss": 0.1674390733242035 + }, + { + "entropy": 9.5530424118042, + "epoch": 1.242139608463516, + "mean_token_accuracy": 0.8274559378623962, + "num_tokens": 6920208.0, + "step": 12563, + "train/ce_loss": 0.28955140709877014 + }, + { + "epoch": 1.242139608463516, + "step": 12563, + "train/sim_loss": 0.015600800514221191 + }, + { + "epoch": 1.242139608463516, + "step": 12563, + "train/total_loss": 0.044555939733982086 + }, + { + "entropy": 9.042476654052734, + "epoch": 1.2422384813130314, + "mean_token_accuracy": 0.8309859037399292, + "num_tokens": 6932204.0, + "step": 12564, + "train/ce_loss": 0.38161414861679077 + }, + { + "epoch": 1.2422384813130314, + "step": 12564, + "train/sim_loss": 0.03875374794006348 + }, + { + "epoch": 1.2422384813130314, + "step": 12564, + "train/total_loss": 0.07691515982151031 + }, + { + "entropy": 9.521804809570312, + "epoch": 1.2423373541625469, + "mean_token_accuracy": 0.8196022510528564, + "num_tokens": 6945433.0, + "step": 12565, + "train/ce_loss": 0.6691235303878784 + }, + { + "epoch": 1.2423373541625469, + "step": 12565, + "train/sim_loss": 0.03127557039260864 + }, + { + "epoch": 1.2423373541625469, + "step": 12565, + "train/total_loss": 0.09818792343139648 + }, + { + "entropy": 9.957832336425781, + "epoch": 1.2424362270120626, + "mean_token_accuracy": 0.9268292784690857, + "num_tokens": 6956859.0, + "step": 12566, + "train/ce_loss": 4.328970248934638e-07 + }, + { + "epoch": 1.2424362270120626, + "step": 12566, + "train/sim_loss": 0.014637768268585205 + }, + { + "epoch": 1.2424362270120626, + "step": 12566, + "train/total_loss": 0.014637811109423637 + }, + { + "entropy": 8.738134384155273, + "epoch": 1.242535099861578, + "mean_token_accuracy": 0.8565264344215393, + "num_tokens": 6962379.0, + "step": 12567, + "train/ce_loss": 0.6388853788375854 + }, + { + "epoch": 1.242535099861578, + "step": 12567, + "train/sim_loss": 0.015933096408843994 + }, + { + "epoch": 1.242535099861578, + "step": 12567, + "train/total_loss": 0.0798216387629509 + }, + { + "entropy": 9.714546203613281, + "epoch": 1.2426339727110935, + "mean_token_accuracy": 0.8915929198265076, + "num_tokens": 6973224.0, + "step": 12568, + "train/ce_loss": 0.7710983157157898 + }, + { + "epoch": 1.2426339727110935, + "step": 12568, + "train/sim_loss": 0.06190598011016846 + }, + { + "epoch": 1.2426339727110935, + "step": 12568, + "train/total_loss": 0.1390158236026764 + }, + { + "entropy": 9.863862037658691, + "epoch": 1.242732845560609, + "mean_token_accuracy": 0.8902077078819275, + "num_tokens": 6992148.0, + "step": 12569, + "train/ce_loss": 0.26855477690696716 + }, + { + "epoch": 1.242732845560609, + "step": 12569, + "train/sim_loss": 0.07689625024795532 + }, + { + "epoch": 1.242732845560609, + "step": 12569, + "train/total_loss": 0.10375172644853592 + }, + { + "entropy": 9.804666519165039, + "epoch": 1.2428317184101245, + "mean_token_accuracy": 0.8407225012779236, + "num_tokens": 6999976.0, + "step": 12570, + "train/ce_loss": 2.505527731955226e-07 + }, + { + "epoch": 1.2428317184101245, + "step": 12570, + "train/sim_loss": 0.0168914794921875 + }, + { + "epoch": 1.2428317184101245, + "step": 12570, + "train/total_loss": 0.01689150370657444 + }, + { + "entropy": 9.358682632446289, + "epoch": 1.2429305912596402, + "mean_token_accuracy": 0.7913752794265747, + "num_tokens": 7008586.0, + "step": 12571, + "train/ce_loss": 0.3753480017185211 + }, + { + "epoch": 1.2429305912596402, + "step": 12571, + "train/sim_loss": 0.08813691139221191 + }, + { + "epoch": 1.2429305912596402, + "step": 12571, + "train/total_loss": 0.12567171454429626 + }, + { + "entropy": 8.548755645751953, + "epoch": 1.2430294641091557, + "mean_token_accuracy": 0.8115384578704834, + "num_tokens": 7018005.0, + "step": 12572, + "train/ce_loss": 0.2934868037700653 + }, + { + "epoch": 1.2430294641091557, + "step": 12572, + "train/sim_loss": 0.012798190116882324 + }, + { + "epoch": 1.2430294641091557, + "step": 12572, + "train/total_loss": 0.042146869003772736 + }, + { + "entropy": 9.626946449279785, + "epoch": 1.2431283369586712, + "mean_token_accuracy": 0.7915936708450317, + "num_tokens": 7033068.0, + "step": 12573, + "train/ce_loss": 0.599529504776001 + }, + { + "epoch": 1.2431283369586712, + "step": 12573, + "train/sim_loss": 0.043592214584350586 + }, + { + "epoch": 1.2431283369586712, + "step": 12573, + "train/total_loss": 0.1035451665520668 + }, + { + "entropy": 9.535713195800781, + "epoch": 1.2432272098081867, + "mean_token_accuracy": 0.8640275597572327, + "num_tokens": 7047783.0, + "step": 12574, + "train/ce_loss": 0.4901694655418396 + }, + { + "epoch": 1.2432272098081867, + "step": 12574, + "train/sim_loss": 0.029898524284362793 + }, + { + "epoch": 1.2432272098081867, + "step": 12574, + "train/total_loss": 0.07891547679901123 + }, + { + "entropy": 9.685901641845703, + "epoch": 1.2433260826577022, + "mean_token_accuracy": 0.842424213886261, + "num_tokens": 7057087.0, + "step": 12575, + "train/ce_loss": 0.3682090640068054 + }, + { + "epoch": 1.2433260826577022, + "step": 12575, + "train/sim_loss": 0.04559898376464844 + }, + { + "epoch": 1.2433260826577022, + "step": 12575, + "train/total_loss": 0.08241988718509674 + }, + { + "entropy": 9.483298301696777, + "epoch": 1.2434249555072177, + "mean_token_accuracy": 0.8409343957901001, + "num_tokens": 7067163.0, + "step": 12576, + "train/ce_loss": 0.5489999651908875 + }, + { + "epoch": 1.2434249555072177, + "step": 12576, + "train/sim_loss": 0.08381998538970947 + }, + { + "epoch": 1.2434249555072177, + "step": 12576, + "train/total_loss": 0.13871997594833374 + }, + { + "entropy": 9.196770668029785, + "epoch": 1.2435238283567331, + "mean_token_accuracy": 0.8414239287376404, + "num_tokens": 7076360.0, + "step": 12577, + "train/ce_loss": 0.4489936828613281 + }, + { + "epoch": 1.2435238283567331, + "step": 12577, + "train/sim_loss": 0.06471145153045654 + }, + { + "epoch": 1.2435238283567331, + "step": 12577, + "train/total_loss": 0.10961082577705383 + }, + { + "entropy": 9.240026473999023, + "epoch": 1.2436227012062488, + "mean_token_accuracy": 0.7854122519493103, + "num_tokens": 7089565.0, + "step": 12578, + "train/ce_loss": 1.6674221754074097 + }, + { + "epoch": 1.2436227012062488, + "step": 12578, + "train/sim_loss": 0.10402649641036987 + }, + { + "epoch": 1.2436227012062488, + "step": 12578, + "train/total_loss": 0.2707687020301819 + }, + { + "entropy": 9.402301788330078, + "epoch": 1.2437215740557643, + "mean_token_accuracy": 0.8900462985038757, + "num_tokens": 7106394.0, + "step": 12579, + "train/ce_loss": 0.21143294870853424 + }, + { + "epoch": 1.2437215740557643, + "step": 12579, + "train/sim_loss": 0.06567192077636719 + }, + { + "epoch": 1.2437215740557643, + "step": 12579, + "train/total_loss": 0.08681521564722061 + }, + { + "epoch": 1.2438204469052798, + "grad_norm": 0.5220931172370911, + "learning_rate": 6.892399742867033e-06, + "loss": 0.0963, + "step": 12580 + }, + { + "entropy": 9.152870178222656, + "epoch": 1.2438204469052798, + "mean_token_accuracy": 0.8765432238578796, + "num_tokens": 7116473.0, + "step": 12580, + "train/ce_loss": 0.36211785674095154 + }, + { + "epoch": 1.2438204469052798, + "step": 12580, + "train/sim_loss": 0.035111188888549805 + }, + { + "epoch": 1.2438204469052798, + "step": 12580, + "train/total_loss": 0.0713229775428772 + }, + { + "entropy": 9.30038833618164, + "epoch": 1.2439193197547953, + "mean_token_accuracy": 0.8645357489585876, + "num_tokens": 7124147.0, + "step": 12581, + "train/ce_loss": 0.5847383737564087 + }, + { + "epoch": 1.2439193197547953, + "step": 12581, + "train/sim_loss": 0.029063522815704346 + }, + { + "epoch": 1.2439193197547953, + "step": 12581, + "train/total_loss": 0.08753736317157745 + }, + { + "entropy": 9.416177749633789, + "epoch": 1.2440181926043108, + "mean_token_accuracy": 0.8666666746139526, + "num_tokens": 7136747.0, + "step": 12582, + "train/ce_loss": 0.6413390636444092 + }, + { + "epoch": 1.2440181926043108, + "step": 12582, + "train/sim_loss": 0.08061325550079346 + }, + { + "epoch": 1.2440181926043108, + "step": 12582, + "train/total_loss": 0.14474716782569885 + }, + { + "entropy": 9.445638656616211, + "epoch": 1.2441170654538265, + "mean_token_accuracy": 0.8224852085113525, + "num_tokens": 7150126.0, + "step": 12583, + "train/ce_loss": 0.6366064548492432 + }, + { + "epoch": 1.2441170654538265, + "step": 12583, + "train/sim_loss": 0.02001798152923584 + }, + { + "epoch": 1.2441170654538265, + "step": 12583, + "train/total_loss": 0.08367862552404404 + }, + { + "entropy": 9.820512771606445, + "epoch": 1.244215938303342, + "mean_token_accuracy": 0.9054877758026123, + "num_tokens": 7162614.0, + "step": 12584, + "train/ce_loss": 0.3817916810512543 + }, + { + "epoch": 1.244215938303342, + "step": 12584, + "train/sim_loss": 0.041934967041015625 + }, + { + "epoch": 1.244215938303342, + "step": 12584, + "train/total_loss": 0.08011414110660553 + }, + { + "entropy": 9.828611373901367, + "epoch": 1.2443148111528575, + "mean_token_accuracy": 0.7941176295280457, + "num_tokens": 7178068.0, + "step": 12585, + "train/ce_loss": 0.4500652551651001 + }, + { + "epoch": 1.2443148111528575, + "step": 12585, + "train/sim_loss": 0.026416540145874023 + }, + { + "epoch": 1.2443148111528575, + "step": 12585, + "train/total_loss": 0.07142306864261627 + }, + { + "entropy": 9.373201370239258, + "epoch": 1.244413684002373, + "mean_token_accuracy": 0.8153526782989502, + "num_tokens": 7189318.0, + "step": 12586, + "train/ce_loss": 0.5014328360557556 + }, + { + "epoch": 1.244413684002373, + "step": 12586, + "train/sim_loss": 0.07668745517730713 + }, + { + "epoch": 1.244413684002373, + "step": 12586, + "train/total_loss": 0.12683074176311493 + }, + { + "entropy": 9.731372833251953, + "epoch": 1.2445125568518884, + "mean_token_accuracy": 0.8096969723701477, + "num_tokens": 7206396.0, + "step": 12587, + "train/ce_loss": 0.3698316812515259 + }, + { + "epoch": 1.2445125568518884, + "step": 12587, + "train/sim_loss": 0.015968918800354004 + }, + { + "epoch": 1.2445125568518884, + "step": 12587, + "train/total_loss": 0.05295208841562271 + }, + { + "entropy": 9.294719696044922, + "epoch": 1.244611429701404, + "mean_token_accuracy": 0.8298342823982239, + "num_tokens": 7218375.0, + "step": 12588, + "train/ce_loss": 0.07514957338571548 + }, + { + "epoch": 1.244611429701404, + "step": 12588, + "train/sim_loss": 0.07976877689361572 + }, + { + "epoch": 1.244611429701404, + "step": 12588, + "train/total_loss": 0.08728373050689697 + }, + { + "entropy": 10.244794845581055, + "epoch": 1.2447103025509194, + "mean_token_accuracy": 0.868852436542511, + "num_tokens": 7237365.0, + "step": 12589, + "train/ce_loss": 0.6003563404083252 + }, + { + "epoch": 1.2447103025509194, + "step": 12589, + "train/sim_loss": 0.029913604259490967 + }, + { + "epoch": 1.2447103025509194, + "step": 12589, + "train/total_loss": 0.08994923532009125 + }, + { + "entropy": 8.784473419189453, + "epoch": 1.2448091754004351, + "mean_token_accuracy": 0.8342077732086182, + "num_tokens": 7246181.0, + "step": 12590, + "train/ce_loss": 0.3176264464855194 + }, + { + "epoch": 1.2448091754004351, + "step": 12590, + "train/sim_loss": 0.0352855920791626 + }, + { + "epoch": 1.2448091754004351, + "step": 12590, + "train/total_loss": 0.06704823672771454 + }, + { + "entropy": 9.398210525512695, + "epoch": 1.2449080482499506, + "mean_token_accuracy": 0.8532423377037048, + "num_tokens": 7262550.0, + "step": 12591, + "train/ce_loss": 0.48297348618507385 + }, + { + "epoch": 1.2449080482499506, + "step": 12591, + "train/sim_loss": 0.023864328861236572 + }, + { + "epoch": 1.2449080482499506, + "step": 12591, + "train/total_loss": 0.07216167449951172 + }, + { + "entropy": 9.845165252685547, + "epoch": 1.245006921099466, + "mean_token_accuracy": 0.8117839694023132, + "num_tokens": 7276142.0, + "step": 12592, + "train/ce_loss": 2.0993176974570815e-07 + }, + { + "epoch": 1.245006921099466, + "step": 12592, + "train/sim_loss": 0.01584041118621826 + }, + { + "epoch": 1.245006921099466, + "step": 12592, + "train/total_loss": 0.015840431675314903 + }, + { + "entropy": 9.492792129516602, + "epoch": 1.2451057939489816, + "mean_token_accuracy": 0.8159420490264893, + "num_tokens": 7287783.0, + "step": 12593, + "train/ce_loss": 3.37237082703723e-07 + }, + { + "epoch": 1.2451057939489816, + "step": 12593, + "train/sim_loss": 0.05180215835571289 + }, + { + "epoch": 1.2451057939489816, + "step": 12593, + "train/total_loss": 0.05180219188332558 + }, + { + "entropy": 9.44188404083252, + "epoch": 1.245204666798497, + "mean_token_accuracy": 0.8673965930938721, + "num_tokens": 7300368.0, + "step": 12594, + "train/ce_loss": 0.5309848785400391 + }, + { + "epoch": 1.245204666798497, + "step": 12594, + "train/sim_loss": 0.015539765357971191 + }, + { + "epoch": 1.245204666798497, + "step": 12594, + "train/total_loss": 0.06863825023174286 + }, + { + "entropy": 9.788150787353516, + "epoch": 1.2453035396480128, + "mean_token_accuracy": 0.8410852551460266, + "num_tokens": 7309110.0, + "step": 12595, + "train/ce_loss": 0.4779239594936371 + }, + { + "epoch": 1.2453035396480128, + "step": 12595, + "train/sim_loss": 0.03678023815155029 + }, + { + "epoch": 1.2453035396480128, + "step": 12595, + "train/total_loss": 0.08457263559103012 + }, + { + "entropy": 9.074737548828125, + "epoch": 1.2454024124975283, + "mean_token_accuracy": 0.7994157671928406, + "num_tokens": 7321182.0, + "step": 12596, + "train/ce_loss": 0.5592366456985474 + }, + { + "epoch": 1.2454024124975283, + "step": 12596, + "train/sim_loss": 0.12027019262313843 + }, + { + "epoch": 1.2454024124975283, + "step": 12596, + "train/total_loss": 0.17619386315345764 + }, + { + "entropy": 9.509611129760742, + "epoch": 1.2455012853470437, + "mean_token_accuracy": 0.8242990374565125, + "num_tokens": 7331271.0, + "step": 12597, + "train/ce_loss": 0.8618720173835754 + }, + { + "epoch": 1.2455012853470437, + "step": 12597, + "train/sim_loss": 0.08952862024307251 + }, + { + "epoch": 1.2455012853470437, + "step": 12597, + "train/total_loss": 0.175715833902359 + }, + { + "entropy": 9.661395072937012, + "epoch": 1.2456001581965592, + "mean_token_accuracy": 0.8367646932601929, + "num_tokens": 7339658.0, + "step": 12598, + "train/ce_loss": 0.7901215553283691 + }, + { + "epoch": 1.2456001581965592, + "step": 12598, + "train/sim_loss": 0.04457336664199829 + }, + { + "epoch": 1.2456001581965592, + "step": 12598, + "train/total_loss": 0.1235855221748352 + }, + { + "entropy": 9.056669235229492, + "epoch": 1.2456990310460747, + "mean_token_accuracy": 0.8522727489471436, + "num_tokens": 7352291.0, + "step": 12599, + "train/ce_loss": 0.8010520935058594 + }, + { + "epoch": 1.2456990310460747, + "step": 12599, + "train/sim_loss": 0.04820144176483154 + }, + { + "epoch": 1.2456990310460747, + "step": 12599, + "train/total_loss": 0.12830665707588196 + }, + { + "epoch": 1.2457979038955902, + "grad_norm": 0.6022546887397766, + "learning_rate": 6.887454878109084e-06, + "loss": 0.0886, + "step": 12600 + }, + { + "entropy": 9.479053497314453, + "epoch": 1.2457979038955902, + "mean_token_accuracy": 0.8575581312179565, + "num_tokens": 7363670.0, + "step": 12600, + "train/ce_loss": 0.4072502553462982 + }, + { + "epoch": 1.2457979038955902, + "step": 12600, + "train/sim_loss": 0.0330965518951416 + }, + { + "epoch": 1.2457979038955902, + "step": 12600, + "train/total_loss": 0.07382157444953918 + }, + { + "entropy": 9.482100486755371, + "epoch": 1.2458967767451057, + "mean_token_accuracy": 0.9136000275611877, + "num_tokens": 7375552.0, + "step": 12601, + "train/ce_loss": 0.34109872579574585 + }, + { + "epoch": 1.2458967767451057, + "step": 12601, + "train/sim_loss": 0.0991249680519104 + }, + { + "epoch": 1.2458967767451057, + "step": 12601, + "train/total_loss": 0.13323484361171722 + }, + { + "entropy": 9.453620910644531, + "epoch": 1.2459956495946214, + "mean_token_accuracy": 0.8694736957550049, + "num_tokens": 7392702.0, + "step": 12602, + "train/ce_loss": 0.7939415574073792 + }, + { + "epoch": 1.2459956495946214, + "step": 12602, + "train/sim_loss": 0.01910102367401123 + }, + { + "epoch": 1.2459956495946214, + "step": 12602, + "train/total_loss": 0.09849517792463303 + }, + { + "entropy": 9.367427825927734, + "epoch": 1.2460945224441369, + "mean_token_accuracy": 0.8909090757369995, + "num_tokens": 7403269.0, + "step": 12603, + "train/ce_loss": 0.6233584880828857 + }, + { + "epoch": 1.2460945224441369, + "step": 12603, + "train/sim_loss": 0.042314767837524414 + }, + { + "epoch": 1.2460945224441369, + "step": 12603, + "train/total_loss": 0.10465061664581299 + }, + { + "entropy": 9.878939628601074, + "epoch": 1.2461933952936524, + "mean_token_accuracy": 0.87578946352005, + "num_tokens": 7418199.0, + "step": 12604, + "train/ce_loss": 0.4266588091850281 + }, + { + "epoch": 1.2461933952936524, + "step": 12604, + "train/sim_loss": 0.0433804988861084 + }, + { + "epoch": 1.2461933952936524, + "step": 12604, + "train/total_loss": 0.08604638278484344 + }, + { + "entropy": 9.666665077209473, + "epoch": 1.2462922681431678, + "mean_token_accuracy": 0.8164464235305786, + "num_tokens": 7426745.0, + "step": 12605, + "train/ce_loss": 0.6630136966705322 + }, + { + "epoch": 1.2462922681431678, + "step": 12605, + "train/sim_loss": 0.04352813959121704 + }, + { + "epoch": 1.2462922681431678, + "step": 12605, + "train/total_loss": 0.10982950776815414 + }, + { + "entropy": 9.557476997375488, + "epoch": 1.2463911409926833, + "mean_token_accuracy": 0.8592592477798462, + "num_tokens": 7442769.0, + "step": 12606, + "train/ce_loss": 0.5018996596336365 + }, + { + "epoch": 1.2463911409926833, + "step": 12606, + "train/sim_loss": 0.05732482671737671 + }, + { + "epoch": 1.2463911409926833, + "step": 12606, + "train/total_loss": 0.10751479864120483 + }, + { + "entropy": 9.296944618225098, + "epoch": 1.246490013842199, + "mean_token_accuracy": 0.8031319975852966, + "num_tokens": 7454297.0, + "step": 12607, + "train/ce_loss": 0.4880140423774719 + }, + { + "epoch": 1.246490013842199, + "step": 12607, + "train/sim_loss": 0.03784346580505371 + }, + { + "epoch": 1.246490013842199, + "step": 12607, + "train/total_loss": 0.08664487302303314 + }, + { + "entropy": 9.872957229614258, + "epoch": 1.2465888866917145, + "mean_token_accuracy": 0.8583691120147705, + "num_tokens": 7465419.0, + "step": 12608, + "train/ce_loss": 0.2198733389377594 + }, + { + "epoch": 1.2465888866917145, + "step": 12608, + "train/sim_loss": 0.07700741291046143 + }, + { + "epoch": 1.2465888866917145, + "step": 12608, + "train/total_loss": 0.09899474680423737 + }, + { + "entropy": 9.629270553588867, + "epoch": 1.24668775954123, + "mean_token_accuracy": 0.8812615871429443, + "num_tokens": 7474005.0, + "step": 12609, + "train/ce_loss": 0.27311745285987854 + }, + { + "epoch": 1.24668775954123, + "step": 12609, + "train/sim_loss": 0.014318466186523438 + }, + { + "epoch": 1.24668775954123, + "step": 12609, + "train/total_loss": 0.04163021221756935 + }, + { + "entropy": 9.300914764404297, + "epoch": 1.2467866323907455, + "mean_token_accuracy": 0.8425925970077515, + "num_tokens": 7491499.0, + "step": 12610, + "train/ce_loss": 0.36965686082839966 + }, + { + "epoch": 1.2467866323907455, + "step": 12610, + "train/sim_loss": 0.019617557525634766 + }, + { + "epoch": 1.2467866323907455, + "step": 12610, + "train/total_loss": 0.05658324435353279 + }, + { + "entropy": 9.885923385620117, + "epoch": 1.246885505240261, + "mean_token_accuracy": 0.8914100527763367, + "num_tokens": 7505701.0, + "step": 12611, + "train/ce_loss": 0.21140554547309875 + }, + { + "epoch": 1.246885505240261, + "step": 12611, + "train/sim_loss": 0.02591484785079956 + }, + { + "epoch": 1.246885505240261, + "step": 12611, + "train/total_loss": 0.04705540090799332 + }, + { + "entropy": 9.491903305053711, + "epoch": 1.2469843780897765, + "mean_token_accuracy": 0.8080094456672668, + "num_tokens": 7518731.0, + "step": 12612, + "train/ce_loss": 0.15621906518936157 + }, + { + "epoch": 1.2469843780897765, + "step": 12612, + "train/sim_loss": 0.06828749179840088 + }, + { + "epoch": 1.2469843780897765, + "step": 12612, + "train/total_loss": 0.08390939980745316 + }, + { + "entropy": 8.836261749267578, + "epoch": 1.247083250939292, + "mean_token_accuracy": 0.8552631735801697, + "num_tokens": 7529586.0, + "step": 12613, + "train/ce_loss": 0.5002504587173462 + }, + { + "epoch": 1.247083250939292, + "step": 12613, + "train/sim_loss": 0.026151180267333984 + }, + { + "epoch": 1.247083250939292, + "step": 12613, + "train/total_loss": 0.0761762261390686 + }, + { + "entropy": 9.055011749267578, + "epoch": 1.2471821237888077, + "mean_token_accuracy": 0.8142998814582825, + "num_tokens": 7543068.0, + "step": 12614, + "train/ce_loss": 0.5520996451377869 + }, + { + "epoch": 1.2471821237888077, + "step": 12614, + "train/sim_loss": 0.04373490810394287 + }, + { + "epoch": 1.2471821237888077, + "step": 12614, + "train/total_loss": 0.09894487261772156 + }, + { + "entropy": 9.448066711425781, + "epoch": 1.2472809966383231, + "mean_token_accuracy": 0.8105960488319397, + "num_tokens": 7555936.0, + "step": 12615, + "train/ce_loss": 0.43895384669303894 + }, + { + "epoch": 1.2472809966383231, + "step": 12615, + "train/sim_loss": 0.024202823638916016 + }, + { + "epoch": 1.2472809966383231, + "step": 12615, + "train/total_loss": 0.06809820979833603 + }, + { + "entropy": 8.970404624938965, + "epoch": 1.2473798694878386, + "mean_token_accuracy": 0.8158220052719116, + "num_tokens": 7564466.0, + "step": 12616, + "train/ce_loss": 0.27012667059898376 + }, + { + "epoch": 1.2473798694878386, + "step": 12616, + "train/sim_loss": 0.04904484748840332 + }, + { + "epoch": 1.2473798694878386, + "step": 12616, + "train/total_loss": 0.07605751603841782 + }, + { + "entropy": 9.777994155883789, + "epoch": 1.2474787423373541, + "mean_token_accuracy": 0.800000011920929, + "num_tokens": 7578764.0, + "step": 12617, + "train/ce_loss": 0.3930429518222809 + }, + { + "epoch": 1.2474787423373541, + "step": 12617, + "train/sim_loss": 0.02729618549346924 + }, + { + "epoch": 1.2474787423373541, + "step": 12617, + "train/total_loss": 0.0666004866361618 + }, + { + "entropy": 9.597955703735352, + "epoch": 1.2475776151868696, + "mean_token_accuracy": 0.8225308656692505, + "num_tokens": 7593760.0, + "step": 12618, + "train/ce_loss": 0.5091554522514343 + }, + { + "epoch": 1.2475776151868696, + "step": 12618, + "train/sim_loss": 0.02008652687072754 + }, + { + "epoch": 1.2475776151868696, + "step": 12618, + "train/total_loss": 0.07100207358598709 + }, + { + "entropy": 9.421695709228516, + "epoch": 1.2476764880363853, + "mean_token_accuracy": 0.8288177251815796, + "num_tokens": 7604184.0, + "step": 12619, + "train/ce_loss": 0.6279529929161072 + }, + { + "epoch": 1.2476764880363853, + "step": 12619, + "train/sim_loss": 0.0525667667388916 + }, + { + "epoch": 1.2476764880363853, + "step": 12619, + "train/total_loss": 0.11536207050085068 + }, + { + "epoch": 1.2477753608859008, + "grad_norm": 0.5922865867614746, + "learning_rate": 6.882510013351135e-06, + "loss": 0.0883, + "step": 12620 + }, + { + "entropy": 9.940569877624512, + "epoch": 1.2477753608859008, + "mean_token_accuracy": 0.9207921028137207, + "num_tokens": 7618631.0, + "step": 12620, + "train/ce_loss": 1.6888870959519409e-06 + }, + { + "epoch": 1.2477753608859008, + "step": 12620, + "train/sim_loss": 0.024010896682739258 + }, + { + "epoch": 1.2477753608859008, + "step": 12620, + "train/total_loss": 0.024011066183447838 + }, + { + "entropy": 9.910547256469727, + "epoch": 1.2478742337354163, + "mean_token_accuracy": 0.9012739062309265, + "num_tokens": 7635064.0, + "step": 12621, + "train/ce_loss": 0.17047393321990967 + }, + { + "epoch": 1.2478742337354163, + "step": 12621, + "train/sim_loss": 0.019928455352783203 + }, + { + "epoch": 1.2478742337354163, + "step": 12621, + "train/total_loss": 0.03697584941983223 + }, + { + "entropy": 10.050522804260254, + "epoch": 1.2479731065849318, + "mean_token_accuracy": 0.9030100107192993, + "num_tokens": 7645828.0, + "step": 12622, + "train/ce_loss": 0.44241783022880554 + }, + { + "epoch": 1.2479731065849318, + "step": 12622, + "train/sim_loss": 0.03582584857940674 + }, + { + "epoch": 1.2479731065849318, + "step": 12622, + "train/total_loss": 0.08006763458251953 + }, + { + "entropy": 10.212974548339844, + "epoch": 1.2480719794344473, + "mean_token_accuracy": 0.8694030046463013, + "num_tokens": 7664150.0, + "step": 12623, + "train/ce_loss": 1.563501238822937 + }, + { + "epoch": 1.2480719794344473, + "step": 12623, + "train/sim_loss": 0.04954421520233154 + }, + { + "epoch": 1.2480719794344473, + "step": 12623, + "train/total_loss": 0.205894336104393 + }, + { + "entropy": 9.384220123291016, + "epoch": 1.2481708522839627, + "mean_token_accuracy": 0.810844898223877, + "num_tokens": 7678572.0, + "step": 12624, + "train/ce_loss": 0.6171109080314636 + }, + { + "epoch": 1.2481708522839627, + "step": 12624, + "train/sim_loss": 0.030553221702575684 + }, + { + "epoch": 1.2481708522839627, + "step": 12624, + "train/total_loss": 0.09226430952548981 + }, + { + "entropy": 9.838833808898926, + "epoch": 1.2482697251334782, + "mean_token_accuracy": 0.9312795996665955, + "num_tokens": 7694125.0, + "step": 12625, + "train/ce_loss": 7.898197509348392e-07 + }, + { + "epoch": 1.2482697251334782, + "step": 12625, + "train/sim_loss": 0.0562366247177124 + }, + { + "epoch": 1.2482697251334782, + "step": 12625, + "train/total_loss": 0.05623670294880867 + }, + { + "entropy": 9.438700675964355, + "epoch": 1.248368597982994, + "mean_token_accuracy": 0.863218367099762, + "num_tokens": 7704294.0, + "step": 12626, + "train/ce_loss": 0.4362410604953766 + }, + { + "epoch": 1.248368597982994, + "step": 12626, + "train/sim_loss": 0.033902525901794434 + }, + { + "epoch": 1.248368597982994, + "step": 12626, + "train/total_loss": 0.07752662897109985 + }, + { + "entropy": 9.82502555847168, + "epoch": 1.2484674708325094, + "mean_token_accuracy": 0.8920266032218933, + "num_tokens": 7720866.0, + "step": 12627, + "train/ce_loss": 5.108881282467337e-07 + }, + { + "epoch": 1.2484674708325094, + "step": 12627, + "train/sim_loss": 0.040180325508117676 + }, + { + "epoch": 1.2484674708325094, + "step": 12627, + "train/total_loss": 0.040180377662181854 + }, + { + "entropy": 9.444761276245117, + "epoch": 1.248566343682025, + "mean_token_accuracy": 0.8590116500854492, + "num_tokens": 7733269.0, + "step": 12628, + "train/ce_loss": 1.1365597174517461e-06 + }, + { + "epoch": 1.248566343682025, + "step": 12628, + "train/sim_loss": 0.04255950450897217 + }, + { + "epoch": 1.248566343682025, + "step": 12628, + "train/total_loss": 0.04255961999297142 + }, + { + "entropy": 9.131903648376465, + "epoch": 1.2486652165315404, + "mean_token_accuracy": 0.8815789222717285, + "num_tokens": 7746564.0, + "step": 12629, + "train/ce_loss": 0.40896373987197876 + }, + { + "epoch": 1.2486652165315404, + "step": 12629, + "train/sim_loss": 0.06210982799530029 + }, + { + "epoch": 1.2486652165315404, + "step": 12629, + "train/total_loss": 0.10300619900226593 + }, + { + "entropy": 8.695856094360352, + "epoch": 1.2487640893810559, + "mean_token_accuracy": 0.8683314323425293, + "num_tokens": 7754063.0, + "step": 12630, + "train/ce_loss": 0.44635313749313354 + }, + { + "epoch": 1.2487640893810559, + "step": 12630, + "train/sim_loss": 0.01075136661529541 + }, + { + "epoch": 1.2487640893810559, + "step": 12630, + "train/total_loss": 0.055386681109666824 + }, + { + "entropy": 9.222757339477539, + "epoch": 1.2488629622305716, + "mean_token_accuracy": 0.8363874554634094, + "num_tokens": 7767994.0, + "step": 12631, + "train/ce_loss": 3.9069888657650154e-07 + }, + { + "epoch": 1.2488629622305716, + "step": 12631, + "train/sim_loss": 0.04330509901046753 + }, + { + "epoch": 1.2488629622305716, + "step": 12631, + "train/total_loss": 0.043305136263370514 + }, + { + "entropy": 9.077625274658203, + "epoch": 1.248961835080087, + "mean_token_accuracy": 0.8650168776512146, + "num_tokens": 7776758.0, + "step": 12632, + "train/ce_loss": 0.3943471908569336 + }, + { + "epoch": 1.248961835080087, + "step": 12632, + "train/sim_loss": 0.014565825462341309 + }, + { + "epoch": 1.248961835080087, + "step": 12632, + "train/total_loss": 0.05400054529309273 + }, + { + "entropy": 9.901128768920898, + "epoch": 1.2490607079296026, + "mean_token_accuracy": 0.8644763827323914, + "num_tokens": 7789841.0, + "step": 12633, + "train/ce_loss": 0.2337159812450409 + }, + { + "epoch": 1.2490607079296026, + "step": 12633, + "train/sim_loss": 0.04009526968002319 + }, + { + "epoch": 1.2490607079296026, + "step": 12633, + "train/total_loss": 0.0634668692946434 + }, + { + "entropy": 9.472487449645996, + "epoch": 1.249159580779118, + "mean_token_accuracy": 0.7975528240203857, + "num_tokens": 7804652.0, + "step": 12634, + "train/ce_loss": 0.6033667922019958 + }, + { + "epoch": 1.249159580779118, + "step": 12634, + "train/sim_loss": 0.10087859630584717 + }, + { + "epoch": 1.249159580779118, + "step": 12634, + "train/total_loss": 0.16121527552604675 + }, + { + "entropy": 9.754524230957031, + "epoch": 1.2492584536286335, + "mean_token_accuracy": 0.8289655447006226, + "num_tokens": 7821303.0, + "step": 12635, + "train/ce_loss": 0.4414679706096649 + }, + { + "epoch": 1.2492584536286335, + "step": 12635, + "train/sim_loss": 0.05181598663330078 + }, + { + "epoch": 1.2492584536286335, + "step": 12635, + "train/total_loss": 0.09596278518438339 + }, + { + "entropy": 9.431463241577148, + "epoch": 1.249357326478149, + "mean_token_accuracy": 0.8178808093070984, + "num_tokens": 7831420.0, + "step": 12636, + "train/ce_loss": 1.4974789337429684e-06 + }, + { + "epoch": 1.249357326478149, + "step": 12636, + "train/sim_loss": 0.037284791469573975 + }, + { + "epoch": 1.249357326478149, + "step": 12636, + "train/total_loss": 0.03728494048118591 + }, + { + "entropy": 9.330081939697266, + "epoch": 1.2494561993276647, + "mean_token_accuracy": 0.8408521413803101, + "num_tokens": 7845294.0, + "step": 12637, + "train/ce_loss": 0.23457986116409302 + }, + { + "epoch": 1.2494561993276647, + "step": 12637, + "train/sim_loss": 0.025067448616027832 + }, + { + "epoch": 1.2494561993276647, + "step": 12637, + "train/total_loss": 0.04852543771266937 + }, + { + "entropy": 9.134090423583984, + "epoch": 1.2495550721771802, + "mean_token_accuracy": 0.823699414730072, + "num_tokens": 7856676.0, + "step": 12638, + "train/ce_loss": 0.6076108813285828 + }, + { + "epoch": 1.2495550721771802, + "step": 12638, + "train/sim_loss": 0.01723158359527588 + }, + { + "epoch": 1.2495550721771802, + "step": 12638, + "train/total_loss": 0.07799267768859863 + }, + { + "entropy": 9.370524406433105, + "epoch": 1.2496539450266957, + "mean_token_accuracy": 0.8177514672279358, + "num_tokens": 7873023.0, + "step": 12639, + "train/ce_loss": 0.16112828254699707 + }, + { + "epoch": 1.2496539450266957, + "step": 12639, + "train/sim_loss": 0.04887521266937256 + }, + { + "epoch": 1.2496539450266957, + "step": 12639, + "train/total_loss": 0.06498803943395615 + }, + { + "epoch": 1.2497528178762112, + "grad_norm": 0.5949915647506714, + "learning_rate": 6.877565148593186e-06, + "loss": 0.0844, + "step": 12640 + }, + { + "entropy": 9.253595352172852, + "epoch": 1.2497528178762112, + "mean_token_accuracy": 0.8508968353271484, + "num_tokens": 7891021.0, + "step": 12640, + "train/ce_loss": 0.5922147631645203 + }, + { + "epoch": 1.2497528178762112, + "step": 12640, + "train/sim_loss": 0.05849289894104004 + }, + { + "epoch": 1.2497528178762112, + "step": 12640, + "train/total_loss": 0.11771437525749207 + }, + { + "entropy": 9.600678443908691, + "epoch": 1.2498516907257267, + "mean_token_accuracy": 0.8611111044883728, + "num_tokens": 7903189.0, + "step": 12641, + "train/ce_loss": 0.21878641843795776 + }, + { + "epoch": 1.2498516907257267, + "step": 12641, + "train/sim_loss": 0.02677607536315918 + }, + { + "epoch": 1.2498516907257267, + "step": 12641, + "train/total_loss": 0.048654720187187195 + }, + { + "entropy": 9.001873970031738, + "epoch": 1.2499505635752421, + "mean_token_accuracy": 0.8157894611358643, + "num_tokens": 7911936.0, + "step": 12642, + "train/ce_loss": 0.5523393154144287 + }, + { + "epoch": 1.2499505635752421, + "step": 12642, + "train/sim_loss": 0.06516551971435547 + }, + { + "epoch": 1.2499505635752421, + "step": 12642, + "train/total_loss": 0.12039945274591446 + }, + { + "entropy": 9.776985168457031, + "epoch": 1.2500494364247579, + "mean_token_accuracy": 0.9016393423080444, + "num_tokens": 7922821.0, + "step": 12643, + "train/ce_loss": 2.403593271083082e-06 + }, + { + "epoch": 1.2500494364247579, + "step": 12643, + "train/sim_loss": 0.027695059776306152 + }, + { + "epoch": 1.2500494364247579, + "step": 12643, + "train/total_loss": 0.027695300057530403 + }, + { + "entropy": 9.659421920776367, + "epoch": 1.2501483092742733, + "mean_token_accuracy": 0.8600311279296875, + "num_tokens": 7937959.0, + "step": 12644, + "train/ce_loss": 0.6119539141654968 + }, + { + "epoch": 1.2501483092742733, + "step": 12644, + "train/sim_loss": 0.04972827434539795 + }, + { + "epoch": 1.2501483092742733, + "step": 12644, + "train/total_loss": 0.1109236627817154 + }, + { + "entropy": 9.494071960449219, + "epoch": 1.2502471821237888, + "mean_token_accuracy": 0.8335146903991699, + "num_tokens": 7953059.0, + "step": 12645, + "train/ce_loss": 0.5664892792701721 + }, + { + "epoch": 1.2502471821237888, + "step": 12645, + "train/sim_loss": 0.08037972450256348 + }, + { + "epoch": 1.2502471821237888, + "step": 12645, + "train/total_loss": 0.13702864944934845 + }, + { + "entropy": 9.040186882019043, + "epoch": 1.2503460549733043, + "mean_token_accuracy": 0.8970944285392761, + "num_tokens": 7965212.0, + "step": 12646, + "train/ce_loss": 0.1640634983778 + }, + { + "epoch": 1.2503460549733043, + "step": 12646, + "train/sim_loss": 0.058515191078186035 + }, + { + "epoch": 1.2503460549733043, + "step": 12646, + "train/total_loss": 0.07492154091596603 + }, + { + "entropy": 9.192051887512207, + "epoch": 1.2504449278228198, + "mean_token_accuracy": 0.8658410906791687, + "num_tokens": 7977395.0, + "step": 12647, + "train/ce_loss": 0.23638194799423218 + }, + { + "epoch": 1.2504449278228198, + "step": 12647, + "train/sim_loss": 0.01192927360534668 + }, + { + "epoch": 1.2504449278228198, + "step": 12647, + "train/total_loss": 0.03556746989488602 + }, + { + "entropy": 9.091503143310547, + "epoch": 1.2505438006723355, + "mean_token_accuracy": 0.8099352121353149, + "num_tokens": 7987204.0, + "step": 12648, + "train/ce_loss": 0.6075723171234131 + }, + { + "epoch": 1.2505438006723355, + "step": 12648, + "train/sim_loss": 0.030734658241271973 + }, + { + "epoch": 1.2505438006723355, + "step": 12648, + "train/total_loss": 0.09149189293384552 + }, + { + "entropy": 9.23653507232666, + "epoch": 1.2506426735218508, + "mean_token_accuracy": 0.839385449886322, + "num_tokens": 7999376.0, + "step": 12649, + "train/ce_loss": 0.6430754661560059 + }, + { + "epoch": 1.2506426735218508, + "step": 12649, + "train/sim_loss": 0.048655688762664795 + }, + { + "epoch": 1.2506426735218508, + "step": 12649, + "train/total_loss": 0.1129632368683815 + }, + { + "entropy": 8.634485244750977, + "epoch": 1.2507415463713665, + "mean_token_accuracy": 0.8175803422927856, + "num_tokens": 8007890.0, + "step": 12650, + "train/ce_loss": 0.39688774943351746 + }, + { + "epoch": 1.2507415463713665, + "step": 12650, + "train/sim_loss": 0.03489053249359131 + }, + { + "epoch": 1.2507415463713665, + "step": 12650, + "train/total_loss": 0.07457931339740753 + }, + { + "entropy": 9.067602157592773, + "epoch": 1.250840419220882, + "mean_token_accuracy": 0.8852223753929138, + "num_tokens": 8016580.0, + "step": 12651, + "train/ce_loss": 0.4621660113334656 + }, + { + "epoch": 1.250840419220882, + "step": 12651, + "train/sim_loss": 0.06775206327438354 + }, + { + "epoch": 1.250840419220882, + "step": 12651, + "train/total_loss": 0.11396867036819458 + }, + { + "entropy": 9.400471687316895, + "epoch": 1.2509392920703974, + "mean_token_accuracy": 0.8784440755844116, + "num_tokens": 8025444.0, + "step": 12652, + "train/ce_loss": 0.6572991013526917 + }, + { + "epoch": 1.2509392920703974, + "step": 12652, + "train/sim_loss": 0.0293503999710083 + }, + { + "epoch": 1.2509392920703974, + "step": 12652, + "train/total_loss": 0.09508030861616135 + }, + { + "entropy": 8.884125709533691, + "epoch": 1.251038164919913, + "mean_token_accuracy": 0.8491083383560181, + "num_tokens": 8037721.0, + "step": 12653, + "train/ce_loss": 0.45969682931900024 + }, + { + "epoch": 1.251038164919913, + "step": 12653, + "train/sim_loss": 0.03448605537414551 + }, + { + "epoch": 1.251038164919913, + "step": 12653, + "train/total_loss": 0.0804557353258133 + }, + { + "entropy": 9.987768173217773, + "epoch": 1.2511370377694284, + "mean_token_accuracy": 0.8877338767051697, + "num_tokens": 8052230.0, + "step": 12654, + "train/ce_loss": 0.4643118679523468 + }, + { + "epoch": 1.2511370377694284, + "step": 12654, + "train/sim_loss": 0.0282709002494812 + }, + { + "epoch": 1.2511370377694284, + "step": 12654, + "train/total_loss": 0.07470208406448364 + }, + { + "entropy": 9.622217178344727, + "epoch": 1.2512359106189441, + "mean_token_accuracy": 0.8584371209144592, + "num_tokens": 8071619.0, + "step": 12655, + "train/ce_loss": 0.481721431016922 + }, + { + "epoch": 1.2512359106189441, + "step": 12655, + "train/sim_loss": 0.012574195861816406 + }, + { + "epoch": 1.2512359106189441, + "step": 12655, + "train/total_loss": 0.060746338218450546 + }, + { + "entropy": 9.561429023742676, + "epoch": 1.2513347834684596, + "mean_token_accuracy": 0.859078586101532, + "num_tokens": 8086975.0, + "step": 12656, + "train/ce_loss": 5.595249490397691e-07 + }, + { + "epoch": 1.2513347834684596, + "step": 12656, + "train/sim_loss": 0.03082329034805298 + }, + { + "epoch": 1.2513347834684596, + "step": 12656, + "train/total_loss": 0.030823346227407455 + }, + { + "entropy": 9.255630493164062, + "epoch": 1.251433656317975, + "mean_token_accuracy": 0.8014440536499023, + "num_tokens": 8097434.0, + "step": 12657, + "train/ce_loss": 0.6654903292655945 + }, + { + "epoch": 1.251433656317975, + "step": 12657, + "train/sim_loss": 0.08412861824035645 + }, + { + "epoch": 1.251433656317975, + "step": 12657, + "train/total_loss": 0.1506776511669159 + }, + { + "entropy": 9.849204063415527, + "epoch": 1.2515325291674906, + "mean_token_accuracy": 0.8706395626068115, + "num_tokens": 8109402.0, + "step": 12658, + "train/ce_loss": 0.3475286364555359 + }, + { + "epoch": 1.2515325291674906, + "step": 12658, + "train/sim_loss": 0.03437274694442749 + }, + { + "epoch": 1.2515325291674906, + "step": 12658, + "train/total_loss": 0.06912560760974884 + }, + { + "entropy": 9.614819526672363, + "epoch": 1.251631402017006, + "mean_token_accuracy": 0.8931750655174255, + "num_tokens": 8124533.0, + "step": 12659, + "train/ce_loss": 0.703245222568512 + }, + { + "epoch": 1.251631402017006, + "step": 12659, + "train/sim_loss": 0.165662944316864 + }, + { + "epoch": 1.251631402017006, + "step": 12659, + "train/total_loss": 0.23598746955394745 + }, + { + "epoch": 1.2517302748665218, + "grad_norm": 0.5998234748840332, + "learning_rate": 6.872620283835237e-06, + "loss": 0.0873, + "step": 12660 + }, + { + "entropy": 9.234707832336426, + "epoch": 1.2517302748665218, + "mean_token_accuracy": 0.8265895843505859, + "num_tokens": 8132742.0, + "step": 12660, + "train/ce_loss": 0.7071554660797119 + }, + { + "epoch": 1.2517302748665218, + "step": 12660, + "train/sim_loss": 0.051543354988098145 + }, + { + "epoch": 1.2517302748665218, + "step": 12660, + "train/total_loss": 0.12225890159606934 + }, + { + "entropy": 9.330114364624023, + "epoch": 1.251829147716037, + "mean_token_accuracy": 0.8029020428657532, + "num_tokens": 8145681.0, + "step": 12661, + "train/ce_loss": 0.6556068658828735 + }, + { + "epoch": 1.251829147716037, + "step": 12661, + "train/sim_loss": 0.025757968425750732 + }, + { + "epoch": 1.251829147716037, + "step": 12661, + "train/total_loss": 0.09131865948438644 + }, + { + "entropy": 9.516839981079102, + "epoch": 1.2519280205655527, + "mean_token_accuracy": 0.9024896025657654, + "num_tokens": 8157964.0, + "step": 12662, + "train/ce_loss": 0.5589202046394348 + }, + { + "epoch": 1.2519280205655527, + "step": 12662, + "train/sim_loss": 0.05768299102783203 + }, + { + "epoch": 1.2519280205655527, + "step": 12662, + "train/total_loss": 0.11357501149177551 + }, + { + "entropy": 9.66316032409668, + "epoch": 1.2520268934150682, + "mean_token_accuracy": 0.8290366530418396, + "num_tokens": 8172523.0, + "step": 12663, + "train/ce_loss": 1.086316704750061 + }, + { + "epoch": 1.2520268934150682, + "step": 12663, + "train/sim_loss": 0.1008070707321167 + }, + { + "epoch": 1.2520268934150682, + "step": 12663, + "train/total_loss": 0.2094387412071228 + }, + { + "entropy": 9.409005165100098, + "epoch": 1.2521257662645837, + "mean_token_accuracy": 0.8234597444534302, + "num_tokens": 8186364.0, + "step": 12664, + "train/ce_loss": 0.5642682909965515 + }, + { + "epoch": 1.2521257662645837, + "step": 12664, + "train/sim_loss": 0.0465582013130188 + }, + { + "epoch": 1.2521257662645837, + "step": 12664, + "train/total_loss": 0.10298503190279007 + }, + { + "entropy": 9.27552318572998, + "epoch": 1.2522246391140992, + "mean_token_accuracy": 0.8506355881690979, + "num_tokens": 8195665.0, + "step": 12665, + "train/ce_loss": 0.7657904624938965 + }, + { + "epoch": 1.2522246391140992, + "step": 12665, + "train/sim_loss": 0.053546905517578125 + }, + { + "epoch": 1.2522246391140992, + "step": 12665, + "train/total_loss": 0.1301259547472 + }, + { + "entropy": 9.524836540222168, + "epoch": 1.2523235119636147, + "mean_token_accuracy": 0.8528528809547424, + "num_tokens": 8203210.0, + "step": 12666, + "train/ce_loss": 0.5172804594039917 + }, + { + "epoch": 1.2523235119636147, + "step": 12666, + "train/sim_loss": 0.017771005630493164 + }, + { + "epoch": 1.2523235119636147, + "step": 12666, + "train/total_loss": 0.06949905306100845 + }, + { + "entropy": 9.696816444396973, + "epoch": 1.2524223848131304, + "mean_token_accuracy": 0.8286413550376892, + "num_tokens": 8215051.0, + "step": 12667, + "train/ce_loss": 0.3929714560508728 + }, + { + "epoch": 1.2524223848131304, + "step": 12667, + "train/sim_loss": 0.033643364906311035 + }, + { + "epoch": 1.2524223848131304, + "step": 12667, + "train/total_loss": 0.07294051349163055 + }, + { + "entropy": 10.029134750366211, + "epoch": 1.2525212576626459, + "mean_token_accuracy": 0.8545034527778625, + "num_tokens": 8220655.0, + "step": 12668, + "train/ce_loss": 0.6372989416122437 + }, + { + "epoch": 1.2525212576626459, + "step": 12668, + "train/sim_loss": 0.02772343158721924 + }, + { + "epoch": 1.2525212576626459, + "step": 12668, + "train/total_loss": 0.09145332872867584 + }, + { + "entropy": 10.067891120910645, + "epoch": 1.2526201305121614, + "mean_token_accuracy": 0.8995901346206665, + "num_tokens": 8228535.0, + "step": 12669, + "train/ce_loss": 3.2733433386056277e-07 + }, + { + "epoch": 1.2526201305121614, + "step": 12669, + "train/sim_loss": 0.018014371395111084 + }, + { + "epoch": 1.2526201305121614, + "step": 12669, + "train/total_loss": 0.01801440492272377 + }, + { + "entropy": 10.034614562988281, + "epoch": 1.2527190033616769, + "mean_token_accuracy": 0.8269230723381042, + "num_tokens": 8239449.0, + "step": 12670, + "train/ce_loss": 0.6883161067962646 + }, + { + "epoch": 1.2527190033616769, + "step": 12670, + "train/sim_loss": 0.11217570304870605 + }, + { + "epoch": 1.2527190033616769, + "step": 12670, + "train/total_loss": 0.18100732564926147 + }, + { + "entropy": 9.657501220703125, + "epoch": 1.2528178762111923, + "mean_token_accuracy": 0.8851269483566284, + "num_tokens": 8252530.0, + "step": 12671, + "train/ce_loss": 0.4024362862110138 + }, + { + "epoch": 1.2528178762111923, + "step": 12671, + "train/sim_loss": 0.07090586423873901 + }, + { + "epoch": 1.2528178762111923, + "step": 12671, + "train/total_loss": 0.11114948987960815 + }, + { + "entropy": 9.522186279296875, + "epoch": 1.252916749060708, + "mean_token_accuracy": 0.8288707733154297, + "num_tokens": 8262833.0, + "step": 12672, + "train/ce_loss": 0.503608763217926 + }, + { + "epoch": 1.252916749060708, + "step": 12672, + "train/sim_loss": 0.02808547019958496 + }, + { + "epoch": 1.252916749060708, + "step": 12672, + "train/total_loss": 0.07844634354114532 + }, + { + "entropy": 9.903358459472656, + "epoch": 1.2530156219102235, + "mean_token_accuracy": 0.8548123836517334, + "num_tokens": 8275148.0, + "step": 12673, + "train/ce_loss": 0.197514608502388 + }, + { + "epoch": 1.2530156219102235, + "step": 12673, + "train/sim_loss": 0.05439567565917969 + }, + { + "epoch": 1.2530156219102235, + "step": 12673, + "train/total_loss": 0.07414713501930237 + }, + { + "entropy": 9.281057357788086, + "epoch": 1.253114494759739, + "mean_token_accuracy": 0.8690629005432129, + "num_tokens": 8291786.0, + "step": 12674, + "train/ce_loss": 0.22326835989952087 + }, + { + "epoch": 1.253114494759739, + "step": 12674, + "train/sim_loss": 0.09142637252807617 + }, + { + "epoch": 1.253114494759739, + "step": 12674, + "train/total_loss": 0.11375320702791214 + }, + { + "entropy": 9.784383773803711, + "epoch": 1.2532133676092545, + "mean_token_accuracy": 0.884482741355896, + "num_tokens": 8301850.0, + "step": 12675, + "train/ce_loss": 0.10194792598485947 + }, + { + "epoch": 1.2532133676092545, + "step": 12675, + "train/sim_loss": 0.040476977825164795 + }, + { + "epoch": 1.2532133676092545, + "step": 12675, + "train/total_loss": 0.0506717711687088 + }, + { + "entropy": 9.881271362304688, + "epoch": 1.25331224045877, + "mean_token_accuracy": 0.88225257396698, + "num_tokens": 8314240.0, + "step": 12676, + "train/ce_loss": 0.2945507764816284 + }, + { + "epoch": 1.25331224045877, + "step": 12676, + "train/sim_loss": 0.03696775436401367 + }, + { + "epoch": 1.25331224045877, + "step": 12676, + "train/total_loss": 0.06642283499240875 + }, + { + "entropy": 9.017027854919434, + "epoch": 1.2534111133082855, + "mean_token_accuracy": 0.8226414918899536, + "num_tokens": 8325738.0, + "step": 12677, + "train/ce_loss": 3.7906380612184876e-07 + }, + { + "epoch": 1.2534111133082855, + "step": 12677, + "train/sim_loss": 0.034742653369903564 + }, + { + "epoch": 1.2534111133082855, + "step": 12677, + "train/total_loss": 0.03474269062280655 + }, + { + "entropy": 9.895064353942871, + "epoch": 1.253509986157801, + "mean_token_accuracy": 0.8148148059844971, + "num_tokens": 8343194.0, + "step": 12678, + "train/ce_loss": 0.4144485890865326 + }, + { + "epoch": 1.253509986157801, + "step": 12678, + "train/sim_loss": 0.0272558331489563 + }, + { + "epoch": 1.253509986157801, + "step": 12678, + "train/total_loss": 0.06870069354772568 + }, + { + "entropy": 9.313603401184082, + "epoch": 1.2536088590073167, + "mean_token_accuracy": 0.8337696194648743, + "num_tokens": 8352083.0, + "step": 12679, + "train/ce_loss": 0.5982350707054138 + }, + { + "epoch": 1.2536088590073167, + "step": 12679, + "train/sim_loss": 0.06929874420166016 + }, + { + "epoch": 1.2536088590073167, + "step": 12679, + "train/total_loss": 0.12912225723266602 + }, + { + "epoch": 1.2537077318568322, + "grad_norm": 0.5796728730201721, + "learning_rate": 6.867675419077289e-06, + "loss": 0.0894, + "step": 12680 + }, + { + "entropy": 9.687044143676758, + "epoch": 1.2537077318568322, + "mean_token_accuracy": 0.8417431116104126, + "num_tokens": 8367363.0, + "step": 12680, + "train/ce_loss": 0.3085719048976898 + }, + { + "epoch": 1.2537077318568322, + "step": 12680, + "train/sim_loss": 0.08539444208145142 + }, + { + "epoch": 1.2537077318568322, + "step": 12680, + "train/total_loss": 0.1162516325712204 + }, + { + "entropy": 10.121847152709961, + "epoch": 1.2538066047063476, + "mean_token_accuracy": 0.869300901889801, + "num_tokens": 8384956.0, + "step": 12681, + "train/ce_loss": 1.0768544598249719e-06 + }, + { + "epoch": 1.2538066047063476, + "step": 12681, + "train/sim_loss": 0.0359840989112854 + }, + { + "epoch": 1.2538066047063476, + "step": 12681, + "train/total_loss": 0.035984206944704056 + }, + { + "entropy": 9.0270414352417, + "epoch": 1.2539054775558631, + "mean_token_accuracy": 0.8090153932571411, + "num_tokens": 8393552.0, + "step": 12682, + "train/ce_loss": 0.999611496925354 + }, + { + "epoch": 1.2539054775558631, + "step": 12682, + "train/sim_loss": 0.09956502914428711 + }, + { + "epoch": 1.2539054775558631, + "step": 12682, + "train/total_loss": 0.19952619075775146 + }, + { + "entropy": 9.487794876098633, + "epoch": 1.2540043504053786, + "mean_token_accuracy": 0.7785571217536926, + "num_tokens": 8402907.0, + "step": 12683, + "train/ce_loss": 0.9225205779075623 + }, + { + "epoch": 1.2540043504053786, + "step": 12683, + "train/sim_loss": 0.10934579372406006 + }, + { + "epoch": 1.2540043504053786, + "step": 12683, + "train/total_loss": 0.20159785449504852 + }, + { + "entropy": 9.86191177368164, + "epoch": 1.2541032232548943, + "mean_token_accuracy": 0.890656054019928, + "num_tokens": 8417060.0, + "step": 12684, + "train/ce_loss": 0.19192013144493103 + }, + { + "epoch": 1.2541032232548943, + "step": 12684, + "train/sim_loss": 0.051243603229522705 + }, + { + "epoch": 1.2541032232548943, + "step": 12684, + "train/total_loss": 0.07043561339378357 + }, + { + "entropy": 9.724533081054688, + "epoch": 1.2542020961044098, + "mean_token_accuracy": 0.8395061492919922, + "num_tokens": 8425945.0, + "step": 12685, + "train/ce_loss": 0.6022769808769226 + }, + { + "epoch": 1.2542020961044098, + "step": 12685, + "train/sim_loss": 0.08446073532104492 + }, + { + "epoch": 1.2542020961044098, + "step": 12685, + "train/total_loss": 0.1446884274482727 + }, + { + "entropy": 10.083585739135742, + "epoch": 1.2543009689539253, + "mean_token_accuracy": 0.8704062104225159, + "num_tokens": 8440643.0, + "step": 12686, + "train/ce_loss": 0.5269709825515747 + }, + { + "epoch": 1.2543009689539253, + "step": 12686, + "train/sim_loss": 0.06690925359725952 + }, + { + "epoch": 1.2543009689539253, + "step": 12686, + "train/total_loss": 0.11960635334253311 + }, + { + "entropy": 9.467130661010742, + "epoch": 1.2543998418034408, + "mean_token_accuracy": 0.8283313512802124, + "num_tokens": 8457998.0, + "step": 12687, + "train/ce_loss": 0.39501360058784485 + }, + { + "epoch": 1.2543998418034408, + "step": 12687, + "train/sim_loss": 0.020638465881347656 + }, + { + "epoch": 1.2543998418034408, + "step": 12687, + "train/total_loss": 0.06013982743024826 + }, + { + "entropy": 9.802976608276367, + "epoch": 1.2544987146529563, + "mean_token_accuracy": 0.8319559097290039, + "num_tokens": 8468417.0, + "step": 12688, + "train/ce_loss": 1.1374988844181644e-06 + }, + { + "epoch": 1.2544987146529563, + "step": 12688, + "train/sim_loss": 0.03156113624572754 + }, + { + "epoch": 1.2544987146529563, + "step": 12688, + "train/total_loss": 0.03156125172972679 + }, + { + "entropy": 9.601916313171387, + "epoch": 1.2545975875024717, + "mean_token_accuracy": 0.8651852011680603, + "num_tokens": 8485640.0, + "step": 12689, + "train/ce_loss": 0.3804961144924164 + }, + { + "epoch": 1.2545975875024717, + "step": 12689, + "train/sim_loss": 0.018755793571472168 + }, + { + "epoch": 1.2545975875024717, + "step": 12689, + "train/total_loss": 0.056805405765771866 + }, + { + "entropy": 8.863661766052246, + "epoch": 1.2546964603519872, + "mean_token_accuracy": 0.8588621616363525, + "num_tokens": 8494949.0, + "step": 12690, + "train/ce_loss": 0.8100442290306091 + }, + { + "epoch": 1.2546964603519872, + "step": 12690, + "train/sim_loss": 0.05264627933502197 + }, + { + "epoch": 1.2546964603519872, + "step": 12690, + "train/total_loss": 0.13365070521831512 + }, + { + "entropy": 9.641239166259766, + "epoch": 1.254795333201503, + "mean_token_accuracy": 0.867559552192688, + "num_tokens": 8502831.0, + "step": 12691, + "train/ce_loss": 0.38303548097610474 + }, + { + "epoch": 1.254795333201503, + "step": 12691, + "train/sim_loss": 0.02663254737854004 + }, + { + "epoch": 1.254795333201503, + "step": 12691, + "train/total_loss": 0.06493610143661499 + }, + { + "entropy": 9.919465065002441, + "epoch": 1.2548942060510184, + "mean_token_accuracy": 0.8563135266304016, + "num_tokens": 8517127.0, + "step": 12692, + "train/ce_loss": 0.5184690952301025 + }, + { + "epoch": 1.2548942060510184, + "step": 12692, + "train/sim_loss": 0.07936203479766846 + }, + { + "epoch": 1.2548942060510184, + "step": 12692, + "train/total_loss": 0.13120894134044647 + }, + { + "entropy": 9.7489013671875, + "epoch": 1.254993078900534, + "mean_token_accuracy": 0.8195592164993286, + "num_tokens": 8535279.0, + "step": 12693, + "train/ce_loss": 0.7429525256156921 + }, + { + "epoch": 1.254993078900534, + "step": 12693, + "train/sim_loss": 0.1056104302406311 + }, + { + "epoch": 1.254993078900534, + "step": 12693, + "train/total_loss": 0.17990568280220032 + }, + { + "entropy": 9.561041831970215, + "epoch": 1.2550919517500494, + "mean_token_accuracy": 0.8813868761062622, + "num_tokens": 8551672.0, + "step": 12694, + "train/ce_loss": 0.499374121427536 + }, + { + "epoch": 1.2550919517500494, + "step": 12694, + "train/sim_loss": 0.013732373714447021 + }, + { + "epoch": 1.2550919517500494, + "step": 12694, + "train/total_loss": 0.06366978585720062 + }, + { + "entropy": 9.227283477783203, + "epoch": 1.2551908245995649, + "mean_token_accuracy": 0.8901869058609009, + "num_tokens": 8565566.0, + "step": 12695, + "train/ce_loss": 0.37271901965141296 + }, + { + "epoch": 1.2551908245995649, + "step": 12695, + "train/sim_loss": 0.04226875305175781 + }, + { + "epoch": 1.2551908245995649, + "step": 12695, + "train/total_loss": 0.07954065501689911 + }, + { + "entropy": 9.479565620422363, + "epoch": 1.2552896974490806, + "mean_token_accuracy": 0.8180839419364929, + "num_tokens": 8579463.0, + "step": 12696, + "train/ce_loss": 0.22688646614551544 + }, + { + "epoch": 1.2552896974490806, + "step": 12696, + "train/sim_loss": 0.0425875186920166 + }, + { + "epoch": 1.2552896974490806, + "step": 12696, + "train/total_loss": 0.06527616828680038 + }, + { + "entropy": 9.653861999511719, + "epoch": 1.255388570298596, + "mean_token_accuracy": 0.8642150163650513, + "num_tokens": 8595685.0, + "step": 12697, + "train/ce_loss": 0.37679728865623474 + }, + { + "epoch": 1.255388570298596, + "step": 12697, + "train/sim_loss": 0.032444775104522705 + }, + { + "epoch": 1.255388570298596, + "step": 12697, + "train/total_loss": 0.07012450695037842 + }, + { + "entropy": 9.649554252624512, + "epoch": 1.2554874431481116, + "mean_token_accuracy": 0.810841977596283, + "num_tokens": 8606586.0, + "step": 12698, + "train/ce_loss": 0.38052427768707275 + }, + { + "epoch": 1.2554874431481116, + "step": 12698, + "train/sim_loss": 0.036086320877075195 + }, + { + "epoch": 1.2554874431481116, + "step": 12698, + "train/total_loss": 0.07413874566555023 + }, + { + "entropy": 9.707101821899414, + "epoch": 1.255586315997627, + "mean_token_accuracy": 0.8323809504508972, + "num_tokens": 8618141.0, + "step": 12699, + "train/ce_loss": 0.8104786276817322 + }, + { + "epoch": 1.255586315997627, + "step": 12699, + "train/sim_loss": 0.059245407581329346 + }, + { + "epoch": 1.255586315997627, + "step": 12699, + "train/total_loss": 0.14029327034950256 + }, + { + "epoch": 1.2556851888471425, + "grad_norm": 0.7000972032546997, + "learning_rate": 6.86273055431934e-06, + "loss": 0.099, + "step": 12700 + }, + { + "entropy": 9.548104286193848, + "epoch": 1.2556851888471425, + "mean_token_accuracy": 0.8383533954620361, + "num_tokens": 8635128.0, + "step": 12700, + "train/ce_loss": 0.8172045350074768 + }, + { + "epoch": 1.2556851888471425, + "step": 12700, + "train/sim_loss": 0.03824430704116821 + }, + { + "epoch": 1.2556851888471425, + "step": 12700, + "train/total_loss": 0.11996476352214813 + }, + { + "entropy": 9.663156509399414, + "epoch": 1.255784061696658, + "mean_token_accuracy": 0.824455201625824, + "num_tokens": 8648517.0, + "step": 12701, + "train/ce_loss": 0.41376832127571106 + }, + { + "epoch": 1.255784061696658, + "step": 12701, + "train/sim_loss": 0.04393112659454346 + }, + { + "epoch": 1.255784061696658, + "step": 12701, + "train/total_loss": 0.08530795574188232 + }, + { + "entropy": 8.729065895080566, + "epoch": 1.2558829345461735, + "mean_token_accuracy": 0.8912280797958374, + "num_tokens": 8654867.0, + "step": 12702, + "train/ce_loss": 0.5108869075775146 + }, + { + "epoch": 1.2558829345461735, + "step": 12702, + "train/sim_loss": 0.010988593101501465 + }, + { + "epoch": 1.2558829345461735, + "step": 12702, + "train/total_loss": 0.06207728385925293 + }, + { + "entropy": 9.306417465209961, + "epoch": 1.2559818073956892, + "mean_token_accuracy": 0.8136792182922363, + "num_tokens": 8662863.0, + "step": 12703, + "train/ce_loss": 0.19639529287815094 + }, + { + "epoch": 1.2559818073956892, + "step": 12703, + "train/sim_loss": 0.01732426881790161 + }, + { + "epoch": 1.2559818073956892, + "step": 12703, + "train/total_loss": 0.036963798105716705 + }, + { + "entropy": 9.620906829833984, + "epoch": 1.2560806802452047, + "mean_token_accuracy": 0.8821656107902527, + "num_tokens": 8673744.0, + "step": 12704, + "train/ce_loss": 0.6768789887428284 + }, + { + "epoch": 1.2560806802452047, + "step": 12704, + "train/sim_loss": 0.060414910316467285 + }, + { + "epoch": 1.2560806802452047, + "step": 12704, + "train/total_loss": 0.12810280919075012 + }, + { + "entropy": 9.698619842529297, + "epoch": 1.2561795530947202, + "mean_token_accuracy": 0.8765624761581421, + "num_tokens": 8682599.0, + "step": 12705, + "train/ce_loss": 0.4179479479789734 + }, + { + "epoch": 1.2561795530947202, + "step": 12705, + "train/sim_loss": 0.03901374340057373 + }, + { + "epoch": 1.2561795530947202, + "step": 12705, + "train/total_loss": 0.08080853521823883 + }, + { + "entropy": 9.341653823852539, + "epoch": 1.2562784259442357, + "mean_token_accuracy": 0.8388375043869019, + "num_tokens": 8698461.0, + "step": 12706, + "train/ce_loss": 0.3671342134475708 + }, + { + "epoch": 1.2562784259442357, + "step": 12706, + "train/sim_loss": 0.06108808517456055 + }, + { + "epoch": 1.2562784259442357, + "step": 12706, + "train/total_loss": 0.09780150651931763 + }, + { + "entropy": 9.904624938964844, + "epoch": 1.2563772987937512, + "mean_token_accuracy": 0.8528347611427307, + "num_tokens": 8719709.0, + "step": 12707, + "train/ce_loss": 0.3976067006587982 + }, + { + "epoch": 1.2563772987937512, + "step": 12707, + "train/sim_loss": 0.024749755859375 + }, + { + "epoch": 1.2563772987937512, + "step": 12707, + "train/total_loss": 0.06451042741537094 + }, + { + "entropy": 9.800010681152344, + "epoch": 1.2564761716432669, + "mean_token_accuracy": 0.8712446093559265, + "num_tokens": 8731871.0, + "step": 12708, + "train/ce_loss": 0.36555638909339905 + }, + { + "epoch": 1.2564761716432669, + "step": 12708, + "train/sim_loss": 0.009993553161621094 + }, + { + "epoch": 1.2564761716432669, + "step": 12708, + "train/total_loss": 0.04654919356107712 + }, + { + "entropy": 9.13694953918457, + "epoch": 1.2565750444927823, + "mean_token_accuracy": 0.8063291311264038, + "num_tokens": 8744330.0, + "step": 12709, + "train/ce_loss": 0.4347933530807495 + }, + { + "epoch": 1.2565750444927823, + "step": 12709, + "train/sim_loss": 0.05201667547225952 + }, + { + "epoch": 1.2565750444927823, + "step": 12709, + "train/total_loss": 0.09549601376056671 + }, + { + "entropy": 9.302119255065918, + "epoch": 1.2566739173422978, + "mean_token_accuracy": 0.7954022884368896, + "num_tokens": 8757002.0, + "step": 12710, + "train/ce_loss": 0.7015745639801025 + }, + { + "epoch": 1.2566739173422978, + "step": 12710, + "train/sim_loss": 0.07903486490249634 + }, + { + "epoch": 1.2566739173422978, + "step": 12710, + "train/total_loss": 0.14919233322143555 + }, + { + "entropy": 9.816524505615234, + "epoch": 1.2567727901918133, + "mean_token_accuracy": 0.9118236303329468, + "num_tokens": 8768377.0, + "step": 12711, + "train/ce_loss": 0.6039707660675049 + }, + { + "epoch": 1.2567727901918133, + "step": 12711, + "train/sim_loss": 0.04327106475830078 + }, + { + "epoch": 1.2567727901918133, + "step": 12711, + "train/total_loss": 0.10366813838481903 + }, + { + "entropy": 9.371637344360352, + "epoch": 1.2568716630413288, + "mean_token_accuracy": 0.8265424966812134, + "num_tokens": 8786014.0, + "step": 12712, + "train/ce_loss": 0.15587118268013 + }, + { + "epoch": 1.2568716630413288, + "step": 12712, + "train/sim_loss": 0.04055368900299072 + }, + { + "epoch": 1.2568716630413288, + "step": 12712, + "train/total_loss": 0.05614080652594566 + }, + { + "entropy": 9.041444778442383, + "epoch": 1.2569705358908445, + "mean_token_accuracy": 0.8550335764884949, + "num_tokens": 8791382.0, + "step": 12713, + "train/ce_loss": 0.3806450664997101 + }, + { + "epoch": 1.2569705358908445, + "step": 12713, + "train/sim_loss": 0.012490630149841309 + }, + { + "epoch": 1.2569705358908445, + "step": 12713, + "train/total_loss": 0.05055513605475426 + }, + { + "entropy": 9.694938659667969, + "epoch": 1.2570694087403598, + "mean_token_accuracy": 0.8885869383811951, + "num_tokens": 8804754.0, + "step": 12714, + "train/ce_loss": 0.2958345115184784 + }, + { + "epoch": 1.2570694087403598, + "step": 12714, + "train/sim_loss": 0.0410768985748291 + }, + { + "epoch": 1.2570694087403598, + "step": 12714, + "train/total_loss": 0.07066035270690918 + }, + { + "entropy": 10.200640678405762, + "epoch": 1.2571682815898755, + "mean_token_accuracy": 0.8775510191917419, + "num_tokens": 8816229.0, + "step": 12715, + "train/ce_loss": 0.6872371435165405 + }, + { + "epoch": 1.2571682815898755, + "step": 12715, + "train/sim_loss": 0.0747917890548706 + }, + { + "epoch": 1.2571682815898755, + "step": 12715, + "train/total_loss": 0.14351549744606018 + }, + { + "entropy": 9.342012405395508, + "epoch": 1.257267154439391, + "mean_token_accuracy": 0.8312138915061951, + "num_tokens": 8830955.0, + "step": 12716, + "train/ce_loss": 0.5296273231506348 + }, + { + "epoch": 1.257267154439391, + "step": 12716, + "train/sim_loss": 0.030289649963378906 + }, + { + "epoch": 1.257267154439391, + "step": 12716, + "train/total_loss": 0.08325238525867462 + }, + { + "entropy": 9.547781944274902, + "epoch": 1.2573660272889065, + "mean_token_accuracy": 0.9054545164108276, + "num_tokens": 8846001.0, + "step": 12717, + "train/ce_loss": 0.3353313207626343 + }, + { + "epoch": 1.2573660272889065, + "step": 12717, + "train/sim_loss": 0.06514930725097656 + }, + { + "epoch": 1.2573660272889065, + "step": 12717, + "train/total_loss": 0.09868244081735611 + }, + { + "entropy": 9.793542861938477, + "epoch": 1.257464900138422, + "mean_token_accuracy": 0.9152941107749939, + "num_tokens": 8855646.0, + "step": 12718, + "train/ce_loss": 0.22468791902065277 + }, + { + "epoch": 1.257464900138422, + "step": 12718, + "train/sim_loss": 0.042836129665374756 + }, + { + "epoch": 1.257464900138422, + "step": 12718, + "train/total_loss": 0.06530492007732391 + }, + { + "entropy": 9.498424530029297, + "epoch": 1.2575637729879374, + "mean_token_accuracy": 0.8796296119689941, + "num_tokens": 8871156.0, + "step": 12719, + "train/ce_loss": 9.039862902682216e-07 + }, + { + "epoch": 1.2575637729879374, + "step": 12719, + "train/sim_loss": 0.037425875663757324 + }, + { + "epoch": 1.2575637729879374, + "step": 12719, + "train/total_loss": 0.03742596507072449 + }, + { + "epoch": 1.2576626458374531, + "grad_norm": 0.6859956383705139, + "learning_rate": 6.857785689561391e-06, + "loss": 0.0804, + "step": 12720 + }, + { + "entropy": 9.436904907226562, + "epoch": 1.2576626458374531, + "mean_token_accuracy": 0.9067484736442566, + "num_tokens": 8879965.0, + "step": 12720, + "train/ce_loss": 0.1339704990386963 + }, + { + "epoch": 1.2576626458374531, + "step": 12720, + "train/sim_loss": 0.046109676361083984 + }, + { + "epoch": 1.2576626458374531, + "step": 12720, + "train/total_loss": 0.059506725519895554 + }, + { + "entropy": 9.214302062988281, + "epoch": 1.2577615186869686, + "mean_token_accuracy": 0.8189252614974976, + "num_tokens": 8896679.0, + "step": 12721, + "train/ce_loss": 0.44853639602661133 + }, + { + "epoch": 1.2577615186869686, + "step": 12721, + "train/sim_loss": 0.016767501831054688 + }, + { + "epoch": 1.2577615186869686, + "step": 12721, + "train/total_loss": 0.06162114068865776 + }, + { + "entropy": 9.370660781860352, + "epoch": 1.257860391536484, + "mean_token_accuracy": 0.8465062975883484, + "num_tokens": 8904880.0, + "step": 12722, + "train/ce_loss": 0.4549790322780609 + }, + { + "epoch": 1.257860391536484, + "step": 12722, + "train/sim_loss": 0.04206979274749756 + }, + { + "epoch": 1.257860391536484, + "step": 12722, + "train/total_loss": 0.08756770193576813 + }, + { + "entropy": 10.253509521484375, + "epoch": 1.2579592643859996, + "mean_token_accuracy": 0.8914728760719299, + "num_tokens": 8918432.0, + "step": 12723, + "train/ce_loss": 0.6233571171760559 + }, + { + "epoch": 1.2579592643859996, + "step": 12723, + "train/sim_loss": 0.021959543228149414 + }, + { + "epoch": 1.2579592643859996, + "step": 12723, + "train/total_loss": 0.08429525792598724 + }, + { + "entropy": 9.516708374023438, + "epoch": 1.258058137235515, + "mean_token_accuracy": 0.7868852615356445, + "num_tokens": 8935860.0, + "step": 12724, + "train/ce_loss": 0.9911019802093506 + }, + { + "epoch": 1.258058137235515, + "step": 12724, + "train/sim_loss": 0.024582743644714355 + }, + { + "epoch": 1.258058137235515, + "step": 12724, + "train/total_loss": 0.12369294464588165 + }, + { + "entropy": 9.764565467834473, + "epoch": 1.2581570100850308, + "mean_token_accuracy": 0.8666666746139526, + "num_tokens": 8951305.0, + "step": 12725, + "train/ce_loss": 8.179670771824021e-07 + }, + { + "epoch": 1.2581570100850308, + "step": 12725, + "train/sim_loss": 0.03292214870452881 + }, + { + "epoch": 1.2581570100850308, + "step": 12725, + "train/total_loss": 0.032922230660915375 + }, + { + "entropy": 9.367612838745117, + "epoch": 1.258255882934546, + "mean_token_accuracy": 0.8945578336715698, + "num_tokens": 8963469.0, + "step": 12726, + "train/ce_loss": 0.22726884484291077 + }, + { + "epoch": 1.258255882934546, + "step": 12726, + "train/sim_loss": 0.05956625938415527 + }, + { + "epoch": 1.258255882934546, + "step": 12726, + "train/total_loss": 0.08229314535856247 + }, + { + "entropy": 9.502275466918945, + "epoch": 1.2583547557840618, + "mean_token_accuracy": 0.8914728760719299, + "num_tokens": 8978530.0, + "step": 12727, + "train/ce_loss": 0.5612844824790955 + }, + { + "epoch": 1.2583547557840618, + "step": 12727, + "train/sim_loss": 0.0383223295211792 + }, + { + "epoch": 1.2583547557840618, + "step": 12727, + "train/total_loss": 0.09445077925920486 + }, + { + "entropy": 9.059556007385254, + "epoch": 1.2584536286335772, + "mean_token_accuracy": 0.8041236996650696, + "num_tokens": 8990734.0, + "step": 12728, + "train/ce_loss": 0.6179470419883728 + }, + { + "epoch": 1.2584536286335772, + "step": 12728, + "train/sim_loss": 0.011602640151977539 + }, + { + "epoch": 1.2584536286335772, + "step": 12728, + "train/total_loss": 0.07339734584093094 + }, + { + "entropy": 9.894216537475586, + "epoch": 1.2585525014830927, + "mean_token_accuracy": 0.8815080523490906, + "num_tokens": 9004469.0, + "step": 12729, + "train/ce_loss": 0.20416365563869476 + }, + { + "epoch": 1.2585525014830927, + "step": 12729, + "train/sim_loss": 0.057431578636169434 + }, + { + "epoch": 1.2585525014830927, + "step": 12729, + "train/total_loss": 0.07784794270992279 + }, + { + "entropy": 9.120265007019043, + "epoch": 1.2586513743326082, + "mean_token_accuracy": 0.8578312993049622, + "num_tokens": 9011385.0, + "step": 12730, + "train/ce_loss": 0.30020222067832947 + }, + { + "epoch": 1.2586513743326082, + "step": 12730, + "train/sim_loss": 0.03302741050720215 + }, + { + "epoch": 1.2586513743326082, + "step": 12730, + "train/total_loss": 0.0630476325750351 + }, + { + "entropy": 9.32209587097168, + "epoch": 1.2587502471821237, + "mean_token_accuracy": 0.884839653968811, + "num_tokens": 9027320.0, + "step": 12731, + "train/ce_loss": 0.7169942259788513 + }, + { + "epoch": 1.2587502471821237, + "step": 12731, + "train/sim_loss": 0.05140620470046997 + }, + { + "epoch": 1.2587502471821237, + "step": 12731, + "train/total_loss": 0.12310563027858734 + }, + { + "entropy": 9.343070030212402, + "epoch": 1.2588491200316394, + "mean_token_accuracy": 0.8468033671379089, + "num_tokens": 9038857.0, + "step": 12732, + "train/ce_loss": 0.525370180606842 + }, + { + "epoch": 1.2588491200316394, + "step": 12732, + "train/sim_loss": 0.016681194305419922 + }, + { + "epoch": 1.2588491200316394, + "step": 12732, + "train/total_loss": 0.0692182183265686 + }, + { + "entropy": 9.475774765014648, + "epoch": 1.2589479928811549, + "mean_token_accuracy": 0.8481848239898682, + "num_tokens": 9052427.0, + "step": 12733, + "train/ce_loss": 0.47056064009666443 + }, + { + "epoch": 1.2589479928811549, + "step": 12733, + "train/sim_loss": 0.03792440891265869 + }, + { + "epoch": 1.2589479928811549, + "step": 12733, + "train/total_loss": 0.08498047292232513 + }, + { + "entropy": 8.86440658569336, + "epoch": 1.2590468657306704, + "mean_token_accuracy": 0.8363844156265259, + "num_tokens": 9060247.0, + "step": 12734, + "train/ce_loss": 0.2736436128616333 + }, + { + "epoch": 1.2590468657306704, + "step": 12734, + "train/sim_loss": 0.01475745439529419 + }, + { + "epoch": 1.2590468657306704, + "step": 12734, + "train/total_loss": 0.04212181642651558 + }, + { + "entropy": 9.556379318237305, + "epoch": 1.2591457385801859, + "mean_token_accuracy": 0.9118065237998962, + "num_tokens": 9069327.0, + "step": 12735, + "train/ce_loss": 5.555016855396389e-07 + }, + { + "epoch": 1.2591457385801859, + "step": 12735, + "train/sim_loss": 0.02085733413696289 + }, + { + "epoch": 1.2591457385801859, + "step": 12735, + "train/total_loss": 0.020857390016317368 + }, + { + "entropy": 9.676614761352539, + "epoch": 1.2592446114297013, + "mean_token_accuracy": 0.8622668385505676, + "num_tokens": 9082286.0, + "step": 12736, + "train/ce_loss": 0.4486018419265747 + }, + { + "epoch": 1.2592446114297013, + "step": 12736, + "train/sim_loss": 0.08669328689575195 + }, + { + "epoch": 1.2592446114297013, + "step": 12736, + "train/total_loss": 0.13155347108840942 + }, + { + "entropy": 9.6787109375, + "epoch": 1.259343484279217, + "mean_token_accuracy": 0.8773291707038879, + "num_tokens": 9094011.0, + "step": 12737, + "train/ce_loss": 0.32851773500442505 + }, + { + "epoch": 1.259343484279217, + "step": 12737, + "train/sim_loss": 0.030120849609375 + }, + { + "epoch": 1.259343484279217, + "step": 12737, + "train/total_loss": 0.06297262012958527 + }, + { + "entropy": 9.81328010559082, + "epoch": 1.2594423571287323, + "mean_token_accuracy": 0.9054794311523438, + "num_tokens": 9111451.0, + "step": 12738, + "train/ce_loss": 0.2699775695800781 + }, + { + "epoch": 1.2594423571287323, + "step": 12738, + "train/sim_loss": 0.0441933274269104 + }, + { + "epoch": 1.2594423571287323, + "step": 12738, + "train/total_loss": 0.07119108736515045 + }, + { + "entropy": 9.171382904052734, + "epoch": 1.259541229978248, + "mean_token_accuracy": 0.8228128552436829, + "num_tokens": 9127456.0, + "step": 12739, + "train/ce_loss": 0.5504099130630493 + }, + { + "epoch": 1.259541229978248, + "step": 12739, + "train/sim_loss": 0.021279513835906982 + }, + { + "epoch": 1.259541229978248, + "step": 12739, + "train/total_loss": 0.07632050663232803 + }, + { + "epoch": 1.2596401028277635, + "grad_norm": 0.6012497544288635, + "learning_rate": 6.852840824803442e-06, + "loss": 0.0796, + "step": 12740 + }, + { + "entropy": 9.356725692749023, + "epoch": 1.2596401028277635, + "mean_token_accuracy": 0.8240436911582947, + "num_tokens": 9136234.0, + "step": 12740, + "train/ce_loss": 0.6920087337493896 + }, + { + "epoch": 1.2596401028277635, + "step": 12740, + "train/sim_loss": 0.05856132507324219 + }, + { + "epoch": 1.2596401028277635, + "step": 12740, + "train/total_loss": 0.12776219844818115 + }, + { + "entropy": 9.386275291442871, + "epoch": 1.259738975677279, + "mean_token_accuracy": 0.7915376424789429, + "num_tokens": 9148306.0, + "step": 12741, + "train/ce_loss": 0.6972427368164062 + }, + { + "epoch": 1.259738975677279, + "step": 12741, + "train/sim_loss": 0.05080074071884155 + }, + { + "epoch": 1.259738975677279, + "step": 12741, + "train/total_loss": 0.12052501738071442 + }, + { + "entropy": 9.092527389526367, + "epoch": 1.2598378485267945, + "mean_token_accuracy": 0.8625180721282959, + "num_tokens": 9163193.0, + "step": 12742, + "train/ce_loss": 8.818753940431634e-07 + }, + { + "epoch": 1.2598378485267945, + "step": 12742, + "train/sim_loss": 0.04187709093093872 + }, + { + "epoch": 1.2598378485267945, + "step": 12742, + "train/total_loss": 0.041877180337905884 + }, + { + "entropy": 9.167816162109375, + "epoch": 1.25993672137631, + "mean_token_accuracy": 0.8257309794425964, + "num_tokens": 9171674.0, + "step": 12743, + "train/ce_loss": 0.587572455406189 + }, + { + "epoch": 1.25993672137631, + "step": 12743, + "train/sim_loss": 0.09564608335494995 + }, + { + "epoch": 1.25993672137631, + "step": 12743, + "train/total_loss": 0.15440332889556885 + }, + { + "entropy": 9.05340576171875, + "epoch": 1.2600355942258257, + "mean_token_accuracy": 0.8708891868591309, + "num_tokens": 9185424.0, + "step": 12744, + "train/ce_loss": 0.388030081987381 + }, + { + "epoch": 1.2600355942258257, + "step": 12744, + "train/sim_loss": 0.0173262357711792 + }, + { + "epoch": 1.2600355942258257, + "step": 12744, + "train/total_loss": 0.05612924322485924 + }, + { + "entropy": 9.579507827758789, + "epoch": 1.2601344670753412, + "mean_token_accuracy": 0.8867069482803345, + "num_tokens": 9199988.0, + "step": 12745, + "train/ce_loss": 0.36653071641921997 + }, + { + "epoch": 1.2601344670753412, + "step": 12745, + "train/sim_loss": 0.036616742610931396 + }, + { + "epoch": 1.2601344670753412, + "step": 12745, + "train/total_loss": 0.0732698142528534 + }, + { + "entropy": 9.182151794433594, + "epoch": 1.2602333399248566, + "mean_token_accuracy": 0.8297872543334961, + "num_tokens": 9216771.0, + "step": 12746, + "train/ce_loss": 0.335417777299881 + }, + { + "epoch": 1.2602333399248566, + "step": 12746, + "train/sim_loss": 0.016872286796569824 + }, + { + "epoch": 1.2602333399248566, + "step": 12746, + "train/total_loss": 0.0504140667617321 + }, + { + "entropy": 8.975199699401855, + "epoch": 1.2603322127743721, + "mean_token_accuracy": 0.8535791635513306, + "num_tokens": 9224983.0, + "step": 12747, + "train/ce_loss": 0.3790697753429413 + }, + { + "epoch": 1.2603322127743721, + "step": 12747, + "train/sim_loss": 0.04359853267669678 + }, + { + "epoch": 1.2603322127743721, + "step": 12747, + "train/total_loss": 0.08150550723075867 + }, + { + "entropy": 9.34610366821289, + "epoch": 1.2604310856238876, + "mean_token_accuracy": 0.8059298992156982, + "num_tokens": 9242538.0, + "step": 12748, + "train/ce_loss": 0.45960789918899536 + }, + { + "epoch": 1.2604310856238876, + "step": 12748, + "train/sim_loss": 0.0474623441696167 + }, + { + "epoch": 1.2604310856238876, + "step": 12748, + "train/total_loss": 0.09342313557863235 + }, + { + "entropy": 9.321577072143555, + "epoch": 1.2605299584734033, + "mean_token_accuracy": 0.8363636136054993, + "num_tokens": 9253075.0, + "step": 12749, + "train/ce_loss": 0.5265463590621948 + }, + { + "epoch": 1.2605299584734033, + "step": 12749, + "train/sim_loss": 0.10895073413848877 + }, + { + "epoch": 1.2605299584734033, + "step": 12749, + "train/total_loss": 0.1616053730249405 + }, + { + "entropy": 9.660869598388672, + "epoch": 1.2606288313229188, + "mean_token_accuracy": 0.860326886177063, + "num_tokens": 9268223.0, + "step": 12750, + "train/ce_loss": 0.28168413043022156 + }, + { + "epoch": 1.2606288313229188, + "step": 12750, + "train/sim_loss": 0.01671469211578369 + }, + { + "epoch": 1.2606288313229188, + "step": 12750, + "train/total_loss": 0.04488310590386391 + }, + { + "entropy": 9.25925064086914, + "epoch": 1.2607277041724343, + "mean_token_accuracy": 0.8470045924186707, + "num_tokens": 9281565.0, + "step": 12751, + "train/ce_loss": 0.5143239498138428 + }, + { + "epoch": 1.2607277041724343, + "step": 12751, + "train/sim_loss": 0.014307618141174316 + }, + { + "epoch": 1.2607277041724343, + "step": 12751, + "train/total_loss": 0.06574001908302307 + }, + { + "entropy": 9.917739868164062, + "epoch": 1.2608265770219498, + "mean_token_accuracy": 0.8583691120147705, + "num_tokens": 9288796.0, + "step": 12752, + "train/ce_loss": 0.5287806391716003 + }, + { + "epoch": 1.2608265770219498, + "step": 12752, + "train/sim_loss": 0.04308563470840454 + }, + { + "epoch": 1.2608265770219498, + "step": 12752, + "train/total_loss": 0.09596370160579681 + }, + { + "entropy": 9.529983520507812, + "epoch": 1.2609254498714653, + "mean_token_accuracy": 0.8616647124290466, + "num_tokens": 9302713.0, + "step": 12753, + "train/ce_loss": 0.4605177342891693 + }, + { + "epoch": 1.2609254498714653, + "step": 12753, + "train/sim_loss": 0.06663894653320312 + }, + { + "epoch": 1.2609254498714653, + "step": 12753, + "train/total_loss": 0.11269071698188782 + }, + { + "entropy": 9.943115234375, + "epoch": 1.2610243227209808, + "mean_token_accuracy": 0.838518500328064, + "num_tokens": 9313620.0, + "step": 12754, + "train/ce_loss": 0.297884076833725 + }, + { + "epoch": 1.2610243227209808, + "step": 12754, + "train/sim_loss": 0.04441434144973755 + }, + { + "epoch": 1.2610243227209808, + "step": 12754, + "train/total_loss": 0.07420274615287781 + }, + { + "entropy": 9.837543487548828, + "epoch": 1.2611231955704962, + "mean_token_accuracy": 0.8590971231460571, + "num_tokens": 9330375.0, + "step": 12755, + "train/ce_loss": 0.49149274826049805 + }, + { + "epoch": 1.2611231955704962, + "step": 12755, + "train/sim_loss": 0.031954824924468994 + }, + { + "epoch": 1.2611231955704962, + "step": 12755, + "train/total_loss": 0.0811040997505188 + }, + { + "entropy": 9.7786865234375, + "epoch": 1.261222068420012, + "mean_token_accuracy": 0.8422781825065613, + "num_tokens": 9344973.0, + "step": 12756, + "train/ce_loss": 0.5100749731063843 + }, + { + "epoch": 1.261222068420012, + "step": 12756, + "train/sim_loss": 0.02133166790008545 + }, + { + "epoch": 1.261222068420012, + "step": 12756, + "train/total_loss": 0.07233916223049164 + }, + { + "entropy": 8.809988021850586, + "epoch": 1.2613209412695274, + "mean_token_accuracy": 0.85012286901474, + "num_tokens": 9352514.0, + "step": 12757, + "train/ce_loss": 0.5268095135688782 + }, + { + "epoch": 1.2613209412695274, + "step": 12757, + "train/sim_loss": 0.03351926803588867 + }, + { + "epoch": 1.2613209412695274, + "step": 12757, + "train/total_loss": 0.08620022237300873 + }, + { + "entropy": 8.852376937866211, + "epoch": 1.261419814119043, + "mean_token_accuracy": 0.8745210766792297, + "num_tokens": 9365257.0, + "step": 12758, + "train/ce_loss": 0.3201243281364441 + }, + { + "epoch": 1.261419814119043, + "step": 12758, + "train/sim_loss": 0.04046022891998291 + }, + { + "epoch": 1.261419814119043, + "step": 12758, + "train/total_loss": 0.07247266173362732 + }, + { + "entropy": 9.32492733001709, + "epoch": 1.2615186869685584, + "mean_token_accuracy": 0.8921985626220703, + "num_tokens": 9378934.0, + "step": 12759, + "train/ce_loss": 0.31019580364227295 + }, + { + "epoch": 1.2615186869685584, + "step": 12759, + "train/sim_loss": 0.03870558738708496 + }, + { + "epoch": 1.2615186869685584, + "step": 12759, + "train/total_loss": 0.0697251707315445 + }, + { + "epoch": 1.2616175598180739, + "grad_norm": 0.49254310131073, + "learning_rate": 6.847895960045493e-06, + "loss": 0.0943, + "step": 12760 + }, + { + "entropy": 10.020928382873535, + "epoch": 1.2616175598180739, + "mean_token_accuracy": 0.9111111164093018, + "num_tokens": 9387920.0, + "step": 12760, + "train/ce_loss": 0.6224057078361511 + }, + { + "epoch": 1.2616175598180739, + "step": 12760, + "train/sim_loss": 0.02447068691253662 + }, + { + "epoch": 1.2616175598180739, + "step": 12760, + "train/total_loss": 0.08671125769615173 + }, + { + "entropy": 9.626797676086426, + "epoch": 1.2617164326675896, + "mean_token_accuracy": 0.841891884803772, + "num_tokens": 9401968.0, + "step": 12761, + "train/ce_loss": 0.49072983860969543 + }, + { + "epoch": 1.2617164326675896, + "step": 12761, + "train/sim_loss": 0.01957756280899048 + }, + { + "epoch": 1.2617164326675896, + "step": 12761, + "train/total_loss": 0.06865054368972778 + }, + { + "entropy": 9.18535041809082, + "epoch": 1.261815305517105, + "mean_token_accuracy": 0.8626799583435059, + "num_tokens": 9415026.0, + "step": 12762, + "train/ce_loss": 0.571781575679779 + }, + { + "epoch": 1.261815305517105, + "step": 12762, + "train/sim_loss": 0.07390576601028442 + }, + { + "epoch": 1.261815305517105, + "step": 12762, + "train/total_loss": 0.1310839205980301 + }, + { + "entropy": 9.367549896240234, + "epoch": 1.2619141783666206, + "mean_token_accuracy": 0.9048625826835632, + "num_tokens": 9428320.0, + "step": 12763, + "train/ce_loss": 6.361084388117888e-07 + }, + { + "epoch": 1.2619141783666206, + "step": 12763, + "train/sim_loss": 0.017416059970855713 + }, + { + "epoch": 1.2619141783666206, + "step": 12763, + "train/total_loss": 0.017416123300790787 + }, + { + "entropy": 9.450424194335938, + "epoch": 1.262013051216136, + "mean_token_accuracy": 0.84659743309021, + "num_tokens": 9440322.0, + "step": 12764, + "train/ce_loss": 0.5549013614654541 + }, + { + "epoch": 1.262013051216136, + "step": 12764, + "train/sim_loss": 0.07276839017868042 + }, + { + "epoch": 1.262013051216136, + "step": 12764, + "train/total_loss": 0.12825852632522583 + }, + { + "entropy": 9.516936302185059, + "epoch": 1.2621119240656515, + "mean_token_accuracy": 0.8824503421783447, + "num_tokens": 9457713.0, + "step": 12765, + "train/ce_loss": 9.662675211075111e-07 + }, + { + "epoch": 1.2621119240656515, + "step": 12765, + "train/sim_loss": 0.039858222007751465 + }, + { + "epoch": 1.2621119240656515, + "step": 12765, + "train/total_loss": 0.039858318865299225 + }, + { + "entropy": 9.169913291931152, + "epoch": 1.262210796915167, + "mean_token_accuracy": 0.7899878025054932, + "num_tokens": 9467816.0, + "step": 12766, + "train/ce_loss": 0.7269015908241272 + }, + { + "epoch": 1.262210796915167, + "step": 12766, + "train/sim_loss": 0.05562460422515869 + }, + { + "epoch": 1.262210796915167, + "step": 12766, + "train/total_loss": 0.1283147633075714 + }, + { + "entropy": 9.167139053344727, + "epoch": 1.2623096697646825, + "mean_token_accuracy": 0.8177777528762817, + "num_tokens": 9480213.0, + "step": 12767, + "train/ce_loss": 0.44596606492996216 + }, + { + "epoch": 1.2623096697646825, + "step": 12767, + "train/sim_loss": 0.04867154359817505 + }, + { + "epoch": 1.2623096697646825, + "step": 12767, + "train/total_loss": 0.09326815605163574 + }, + { + "entropy": 8.987945556640625, + "epoch": 1.2624085426141982, + "mean_token_accuracy": 0.8766094446182251, + "num_tokens": 9489010.0, + "step": 12768, + "train/ce_loss": 0.35679522156715393 + }, + { + "epoch": 1.2624085426141982, + "step": 12768, + "train/sim_loss": 0.06333565711975098 + }, + { + "epoch": 1.2624085426141982, + "step": 12768, + "train/total_loss": 0.09901517629623413 + }, + { + "entropy": 9.259897232055664, + "epoch": 1.2625074154637137, + "mean_token_accuracy": 0.8579710125923157, + "num_tokens": 9499034.0, + "step": 12769, + "train/ce_loss": 0.20899853110313416 + }, + { + "epoch": 1.2625074154637137, + "step": 12769, + "train/sim_loss": 0.05027341842651367 + }, + { + "epoch": 1.2625074154637137, + "step": 12769, + "train/total_loss": 0.0711732730269432 + }, + { + "entropy": 9.56091022491455, + "epoch": 1.2626062883132292, + "mean_token_accuracy": 0.8907923102378845, + "num_tokens": 9516544.0, + "step": 12770, + "train/ce_loss": 0.5011967420578003 + }, + { + "epoch": 1.2626062883132292, + "step": 12770, + "train/sim_loss": 0.06192970275878906 + }, + { + "epoch": 1.2626062883132292, + "step": 12770, + "train/total_loss": 0.11204937845468521 + }, + { + "entropy": 9.686830520629883, + "epoch": 1.2627051611627447, + "mean_token_accuracy": 0.8085585832595825, + "num_tokens": 9535223.0, + "step": 12771, + "train/ce_loss": 0.7835299968719482 + }, + { + "epoch": 1.2627051611627447, + "step": 12771, + "train/sim_loss": 0.06464505195617676 + }, + { + "epoch": 1.2627051611627447, + "step": 12771, + "train/total_loss": 0.14299805462360382 + }, + { + "entropy": 9.335103034973145, + "epoch": 1.2628040340122602, + "mean_token_accuracy": 0.8198198080062866, + "num_tokens": 9543095.0, + "step": 12772, + "train/ce_loss": 0.571284294128418 + }, + { + "epoch": 1.2628040340122602, + "step": 12772, + "train/sim_loss": 0.03557854890823364 + }, + { + "epoch": 1.2628040340122602, + "step": 12772, + "train/total_loss": 0.09270697832107544 + }, + { + "entropy": 9.198476791381836, + "epoch": 1.2629029068617759, + "mean_token_accuracy": 0.8210399150848389, + "num_tokens": 9557993.0, + "step": 12773, + "train/ce_loss": 0.5793756246566772 + }, + { + "epoch": 1.2629029068617759, + "step": 12773, + "train/sim_loss": 0.025729060173034668 + }, + { + "epoch": 1.2629029068617759, + "step": 12773, + "train/total_loss": 0.08366662263870239 + }, + { + "entropy": 9.166987419128418, + "epoch": 1.2630017797112914, + "mean_token_accuracy": 0.8785310983657837, + "num_tokens": 9568979.0, + "step": 12774, + "train/ce_loss": 0.4508848786354065 + }, + { + "epoch": 1.2630017797112914, + "step": 12774, + "train/sim_loss": 0.05754512548446655 + }, + { + "epoch": 1.2630017797112914, + "step": 12774, + "train/total_loss": 0.10263361036777496 + }, + { + "entropy": 9.838275909423828, + "epoch": 1.2631006525608068, + "mean_token_accuracy": 0.865470826625824, + "num_tokens": 9579405.0, + "step": 12775, + "train/ce_loss": 0.7103832960128784 + }, + { + "epoch": 1.2631006525608068, + "step": 12775, + "train/sim_loss": 0.07466983795166016 + }, + { + "epoch": 1.2631006525608068, + "step": 12775, + "train/total_loss": 0.14570817351341248 + }, + { + "entropy": 9.392705917358398, + "epoch": 1.2631995254103223, + "mean_token_accuracy": 0.841856062412262, + "num_tokens": 9594808.0, + "step": 12776, + "train/ce_loss": 0.5375726222991943 + }, + { + "epoch": 1.2631995254103223, + "step": 12776, + "train/sim_loss": 0.05580151081085205 + }, + { + "epoch": 1.2631995254103223, + "step": 12776, + "train/total_loss": 0.10955877602100372 + }, + { + "entropy": 9.618677139282227, + "epoch": 1.2632983982598378, + "mean_token_accuracy": 0.8233917951583862, + "num_tokens": 9611344.0, + "step": 12777, + "train/ce_loss": 0.39912882447242737 + }, + { + "epoch": 1.2632983982598378, + "step": 12777, + "train/sim_loss": 0.045117855072021484 + }, + { + "epoch": 1.2632983982598378, + "step": 12777, + "train/total_loss": 0.08503073453903198 + }, + { + "entropy": 9.544301986694336, + "epoch": 1.2633972711093533, + "mean_token_accuracy": 0.8589928150177002, + "num_tokens": 9622607.0, + "step": 12778, + "train/ce_loss": 0.2594378888607025 + }, + { + "epoch": 1.2633972711093533, + "step": 12778, + "train/sim_loss": 0.017545104026794434 + }, + { + "epoch": 1.2633972711093533, + "step": 12778, + "train/total_loss": 0.043488893657922745 + }, + { + "entropy": 9.078885078430176, + "epoch": 1.2634961439588688, + "mean_token_accuracy": 0.8090575337409973, + "num_tokens": 9631473.0, + "step": 12779, + "train/ce_loss": 0.293070912361145 + }, + { + "epoch": 1.2634961439588688, + "step": 12779, + "train/sim_loss": 0.06060636043548584 + }, + { + "epoch": 1.2634961439588688, + "step": 12779, + "train/total_loss": 0.08991345018148422 + }, + { + "epoch": 1.2635950168083845, + "grad_norm": 0.5830499529838562, + "learning_rate": 6.8429510952875445e-06, + "loss": 0.0825, + "step": 12780 + }, + { + "entropy": 9.482677459716797, + "epoch": 1.2635950168083845, + "mean_token_accuracy": 0.8157067894935608, + "num_tokens": 9646106.0, + "step": 12780, + "train/ce_loss": 0.9609065651893616 + }, + { + "epoch": 1.2635950168083845, + "step": 12780, + "train/sim_loss": 0.10019946098327637 + }, + { + "epoch": 1.2635950168083845, + "step": 12780, + "train/total_loss": 0.19629012048244476 + }, + { + "entropy": 9.753530502319336, + "epoch": 1.2636938896579, + "mean_token_accuracy": 0.8550724387168884, + "num_tokens": 9659063.0, + "step": 12781, + "train/ce_loss": 0.4874815046787262 + }, + { + "epoch": 1.2636938896579, + "step": 12781, + "train/sim_loss": 0.042707741260528564 + }, + { + "epoch": 1.2636938896579, + "step": 12781, + "train/total_loss": 0.09145589172840118 + }, + { + "entropy": 9.408348083496094, + "epoch": 1.2637927625074155, + "mean_token_accuracy": 0.8588098287582397, + "num_tokens": 9671860.0, + "step": 12782, + "train/ce_loss": 0.35435402393341064 + }, + { + "epoch": 1.2637927625074155, + "step": 12782, + "train/sim_loss": 0.0569341778755188 + }, + { + "epoch": 1.2637927625074155, + "step": 12782, + "train/total_loss": 0.09236958622932434 + }, + { + "entropy": 9.671721458435059, + "epoch": 1.263891635356931, + "mean_token_accuracy": 0.8765008449554443, + "num_tokens": 9685503.0, + "step": 12783, + "train/ce_loss": 0.6386334300041199 + }, + { + "epoch": 1.263891635356931, + "step": 12783, + "train/sim_loss": 0.05825507640838623 + }, + { + "epoch": 1.263891635356931, + "step": 12783, + "train/total_loss": 0.12211842089891434 + }, + { + "entropy": 8.712270736694336, + "epoch": 1.2639905082064464, + "mean_token_accuracy": 0.8475426435470581, + "num_tokens": 9698739.0, + "step": 12784, + "train/ce_loss": 0.4301394820213318 + }, + { + "epoch": 1.2639905082064464, + "step": 12784, + "train/sim_loss": 0.10561871528625488 + }, + { + "epoch": 1.2639905082064464, + "step": 12784, + "train/total_loss": 0.14863266050815582 + }, + { + "entropy": 9.43802547454834, + "epoch": 1.2640893810559621, + "mean_token_accuracy": 0.9028831720352173, + "num_tokens": 9706005.0, + "step": 12785, + "train/ce_loss": 0.3258366584777832 + }, + { + "epoch": 1.2640893810559621, + "step": 12785, + "train/sim_loss": 0.010836482048034668 + }, + { + "epoch": 1.2640893810559621, + "step": 12785, + "train/total_loss": 0.04342014715075493 + }, + { + "entropy": 9.445589065551758, + "epoch": 1.2641882539054776, + "mean_token_accuracy": 0.8647260069847107, + "num_tokens": 9713080.0, + "step": 12786, + "train/ce_loss": 0.9329484701156616 + }, + { + "epoch": 1.2641882539054776, + "step": 12786, + "train/sim_loss": 0.04513108730316162 + }, + { + "epoch": 1.2641882539054776, + "step": 12786, + "train/total_loss": 0.13842594623565674 + }, + { + "entropy": 9.641006469726562, + "epoch": 1.264287126754993, + "mean_token_accuracy": 0.7825503349304199, + "num_tokens": 9723545.0, + "step": 12787, + "train/ce_loss": 0.7713178992271423 + }, + { + "epoch": 1.264287126754993, + "step": 12787, + "train/sim_loss": 0.07544755935668945 + }, + { + "epoch": 1.264287126754993, + "step": 12787, + "train/total_loss": 0.15257935225963593 + }, + { + "entropy": 9.350127220153809, + "epoch": 1.2643859996045086, + "mean_token_accuracy": 0.8149425387382507, + "num_tokens": 9740622.0, + "step": 12788, + "train/ce_loss": 0.38725775480270386 + }, + { + "epoch": 1.2643859996045086, + "step": 12788, + "train/sim_loss": 0.016412317752838135 + }, + { + "epoch": 1.2643859996045086, + "step": 12788, + "train/total_loss": 0.05513809248805046 + }, + { + "entropy": 9.671087265014648, + "epoch": 1.264484872454024, + "mean_token_accuracy": 0.856637179851532, + "num_tokens": 9750772.0, + "step": 12789, + "train/ce_loss": 0.5801568031311035 + }, + { + "epoch": 1.264484872454024, + "step": 12789, + "train/sim_loss": 0.034660160541534424 + }, + { + "epoch": 1.264484872454024, + "step": 12789, + "train/total_loss": 0.0926758423447609 + }, + { + "entropy": 9.238956451416016, + "epoch": 1.2645837453035398, + "mean_token_accuracy": 0.8456725478172302, + "num_tokens": 9761882.0, + "step": 12790, + "train/ce_loss": 0.943893313407898 + }, + { + "epoch": 1.2645837453035398, + "step": 12790, + "train/sim_loss": 0.09198868274688721 + }, + { + "epoch": 1.2645837453035398, + "step": 12790, + "train/total_loss": 0.18637801706790924 + }, + { + "entropy": 9.234472274780273, + "epoch": 1.264682618153055, + "mean_token_accuracy": 0.8207547068595886, + "num_tokens": 9773334.0, + "step": 12791, + "train/ce_loss": 0.5819154381752014 + }, + { + "epoch": 1.264682618153055, + "step": 12791, + "train/sim_loss": 0.050011277198791504 + }, + { + "epoch": 1.264682618153055, + "step": 12791, + "train/total_loss": 0.10820282250642776 + }, + { + "entropy": 9.26746654510498, + "epoch": 1.2647814910025708, + "mean_token_accuracy": 0.8828541040420532, + "num_tokens": 9788185.0, + "step": 12792, + "train/ce_loss": 0.2975376844406128 + }, + { + "epoch": 1.2647814910025708, + "step": 12792, + "train/sim_loss": 0.05528712272644043 + }, + { + "epoch": 1.2647814910025708, + "step": 12792, + "train/total_loss": 0.08504088968038559 + }, + { + "entropy": 8.944171905517578, + "epoch": 1.2648803638520862, + "mean_token_accuracy": 0.870547354221344, + "num_tokens": 9800052.0, + "step": 12793, + "train/ce_loss": 0.2081010490655899 + }, + { + "epoch": 1.2648803638520862, + "step": 12793, + "train/sim_loss": 0.02528858184814453 + }, + { + "epoch": 1.2648803638520862, + "step": 12793, + "train/total_loss": 0.04609868675470352 + }, + { + "entropy": 9.711508750915527, + "epoch": 1.2649792367016017, + "mean_token_accuracy": 0.8055555820465088, + "num_tokens": 9815201.0, + "step": 12794, + "train/ce_loss": 0.39901861548423767 + }, + { + "epoch": 1.2649792367016017, + "step": 12794, + "train/sim_loss": 0.05005776882171631 + }, + { + "epoch": 1.2649792367016017, + "step": 12794, + "train/total_loss": 0.08995963633060455 + }, + { + "entropy": 9.572793960571289, + "epoch": 1.2650781095511172, + "mean_token_accuracy": 0.8917748928070068, + "num_tokens": 9828363.0, + "step": 12795, + "train/ce_loss": 8.146443519763125e-07 + }, + { + "epoch": 1.2650781095511172, + "step": 12795, + "train/sim_loss": 0.03024601936340332 + }, + { + "epoch": 1.2650781095511172, + "step": 12795, + "train/total_loss": 0.030246101319789886 + }, + { + "entropy": 9.01865291595459, + "epoch": 1.2651769824006327, + "mean_token_accuracy": 0.831941545009613, + "num_tokens": 9837757.0, + "step": 12796, + "train/ce_loss": 0.28839749097824097 + }, + { + "epoch": 1.2651769824006327, + "step": 12796, + "train/sim_loss": 0.055702388286590576 + }, + { + "epoch": 1.2651769824006327, + "step": 12796, + "train/total_loss": 0.08454214036464691 + }, + { + "entropy": 9.570634841918945, + "epoch": 1.2652758552501484, + "mean_token_accuracy": 0.8054187297821045, + "num_tokens": 9855225.0, + "step": 12797, + "train/ce_loss": 0.34266775846481323 + }, + { + "epoch": 1.2652758552501484, + "step": 12797, + "train/sim_loss": 0.032707273960113525 + }, + { + "epoch": 1.2652758552501484, + "step": 12797, + "train/total_loss": 0.06697405129671097 + }, + { + "entropy": 9.608091354370117, + "epoch": 1.265374728099664, + "mean_token_accuracy": 0.779347836971283, + "num_tokens": 9870021.0, + "step": 12798, + "train/ce_loss": 0.5523983836174011 + }, + { + "epoch": 1.265374728099664, + "step": 12798, + "train/sim_loss": 0.030515193939208984 + }, + { + "epoch": 1.265374728099664, + "step": 12798, + "train/total_loss": 0.08575503528118134 + }, + { + "entropy": 9.130521774291992, + "epoch": 1.2654736009491794, + "mean_token_accuracy": 0.827625572681427, + "num_tokens": 9880139.0, + "step": 12799, + "train/ce_loss": 0.5600592494010925 + }, + { + "epoch": 1.2654736009491794, + "step": 12799, + "train/sim_loss": 0.01083076000213623 + }, + { + "epoch": 1.2654736009491794, + "step": 12799, + "train/total_loss": 0.06683668494224548 + }, + { + "epoch": 1.2655724737986949, + "grad_norm": 0.5180277228355408, + "learning_rate": 6.838006230529596e-06, + "loss": 0.0922, + "step": 12800 + }, + { + "entropy": 9.195127487182617, + "epoch": 1.2655724737986949, + "mean_token_accuracy": 0.9004149436950684, + "num_tokens": 9896647.0, + "step": 12800, + "train/ce_loss": 0.4013512432575226 + }, + { + "epoch": 1.2655724737986949, + "step": 12800, + "train/sim_loss": 0.008779525756835938 + }, + { + "epoch": 1.2655724737986949, + "step": 12800, + "train/total_loss": 0.048914652317762375 + }, + { + "entropy": 9.91077995300293, + "epoch": 1.2656713466482103, + "mean_token_accuracy": 0.859413206577301, + "num_tokens": 9912724.0, + "step": 12801, + "train/ce_loss": 0.1299242377281189 + }, + { + "epoch": 1.2656713466482103, + "step": 12801, + "train/sim_loss": 0.027864933013916016 + }, + { + "epoch": 1.2656713466482103, + "step": 12801, + "train/total_loss": 0.040857356041669846 + }, + { + "entropy": 9.274356842041016, + "epoch": 1.265770219497726, + "mean_token_accuracy": 0.8392637968063354, + "num_tokens": 9920224.0, + "step": 12802, + "train/ce_loss": 0.3567548990249634 + }, + { + "epoch": 1.265770219497726, + "step": 12802, + "train/sim_loss": 0.013494014739990234 + }, + { + "epoch": 1.265770219497726, + "step": 12802, + "train/total_loss": 0.04916950687766075 + }, + { + "entropy": 9.692560195922852, + "epoch": 1.2658690923472413, + "mean_token_accuracy": 0.8793103694915771, + "num_tokens": 9931079.0, + "step": 12803, + "train/ce_loss": 0.7089366912841797 + }, + { + "epoch": 1.2658690923472413, + "step": 12803, + "train/sim_loss": 0.03937649726867676 + }, + { + "epoch": 1.2658690923472413, + "step": 12803, + "train/total_loss": 0.11027016490697861 + }, + { + "entropy": 9.396546363830566, + "epoch": 1.265967965196757, + "mean_token_accuracy": 0.8333333134651184, + "num_tokens": 9944404.0, + "step": 12804, + "train/ce_loss": 8.235290351876756e-07 + }, + { + "epoch": 1.265967965196757, + "step": 12804, + "train/sim_loss": 0.02363884449005127 + }, + { + "epoch": 1.265967965196757, + "step": 12804, + "train/total_loss": 0.023638926446437836 + }, + { + "entropy": 8.874279975891113, + "epoch": 1.2660668380462725, + "mean_token_accuracy": 0.8561946749687195, + "num_tokens": 9955413.0, + "step": 12805, + "train/ce_loss": 0.39018356800079346 + }, + { + "epoch": 1.2660668380462725, + "step": 12805, + "train/sim_loss": 0.08340883255004883 + }, + { + "epoch": 1.2660668380462725, + "step": 12805, + "train/total_loss": 0.12242719531059265 + }, + { + "entropy": 9.846128463745117, + "epoch": 1.266165710895788, + "mean_token_accuracy": 0.8729792237281799, + "num_tokens": 9967579.0, + "step": 12806, + "train/ce_loss": 0.408256858587265 + }, + { + "epoch": 1.266165710895788, + "step": 12806, + "train/sim_loss": 0.020813703536987305 + }, + { + "epoch": 1.266165710895788, + "step": 12806, + "train/total_loss": 0.061639390885829926 + }, + { + "entropy": 9.585199356079102, + "epoch": 1.2662645837453035, + "mean_token_accuracy": 0.8017241358757019, + "num_tokens": 9981533.0, + "step": 12807, + "train/ce_loss": 5.610958169199876e-07 + }, + { + "epoch": 1.2662645837453035, + "step": 12807, + "train/sim_loss": 0.02945554256439209 + }, + { + "epoch": 1.2662645837453035, + "step": 12807, + "train/total_loss": 0.029455598443746567 + }, + { + "entropy": 9.826873779296875, + "epoch": 1.266363456594819, + "mean_token_accuracy": 0.8185714483261108, + "num_tokens": 9989518.0, + "step": 12808, + "train/ce_loss": 0.2296587973833084 + }, + { + "epoch": 1.266363456594819, + "step": 12808, + "train/sim_loss": 0.04655468463897705 + }, + { + "epoch": 1.266363456594819, + "step": 12808, + "train/total_loss": 0.06952056288719177 + }, + { + "entropy": 9.132806777954102, + "epoch": 1.2664623294443347, + "mean_token_accuracy": 0.8133472204208374, + "num_tokens": 10001205.0, + "step": 12809, + "train/ce_loss": 0.6702746748924255 + }, + { + "epoch": 1.2664623294443347, + "step": 12809, + "train/sim_loss": 0.033663153648376465 + }, + { + "epoch": 1.2664623294443347, + "step": 12809, + "train/total_loss": 0.10069062560796738 + }, + { + "entropy": 9.627890586853027, + "epoch": 1.2665612022938502, + "mean_token_accuracy": 0.868175745010376, + "num_tokens": 10013784.0, + "step": 12810, + "train/ce_loss": 0.3673674762248993 + }, + { + "epoch": 1.2665612022938502, + "step": 12810, + "train/sim_loss": 0.025563955307006836 + }, + { + "epoch": 1.2665612022938502, + "step": 12810, + "train/total_loss": 0.062300704419612885 + }, + { + "entropy": 9.782004356384277, + "epoch": 1.2666600751433656, + "mean_token_accuracy": 0.8218465447425842, + "num_tokens": 10030118.0, + "step": 12811, + "train/ce_loss": 0.4605577886104584 + }, + { + "epoch": 1.2666600751433656, + "step": 12811, + "train/sim_loss": 0.016956627368927002 + }, + { + "epoch": 1.2666600751433656, + "step": 12811, + "train/total_loss": 0.06301240622997284 + }, + { + "entropy": 9.386852264404297, + "epoch": 1.2667589479928811, + "mean_token_accuracy": 0.8537142872810364, + "num_tokens": 10038714.0, + "step": 12812, + "train/ce_loss": 0.4646368622779846 + }, + { + "epoch": 1.2667589479928811, + "step": 12812, + "train/sim_loss": 0.0916627049446106 + }, + { + "epoch": 1.2667589479928811, + "step": 12812, + "train/total_loss": 0.13812638819217682 + }, + { + "entropy": 9.208868980407715, + "epoch": 1.2668578208423966, + "mean_token_accuracy": 0.8630653023719788, + "num_tokens": 10044905.0, + "step": 12813, + "train/ce_loss": 0.1973225176334381 + }, + { + "epoch": 1.2668578208423966, + "step": 12813, + "train/sim_loss": 0.017725825309753418 + }, + { + "epoch": 1.2668578208423966, + "step": 12813, + "train/total_loss": 0.03745807707309723 + }, + { + "entropy": 9.372699737548828, + "epoch": 1.2669566936919123, + "mean_token_accuracy": 0.8540608882904053, + "num_tokens": 10057787.0, + "step": 12814, + "train/ce_loss": 0.2452024221420288 + }, + { + "epoch": 1.2669566936919123, + "step": 12814, + "train/sim_loss": 0.021753370761871338 + }, + { + "epoch": 1.2669566936919123, + "step": 12814, + "train/total_loss": 0.0462736114859581 + }, + { + "entropy": 10.067998886108398, + "epoch": 1.2670555665414276, + "mean_token_accuracy": 0.8810344934463501, + "num_tokens": 10074774.0, + "step": 12815, + "train/ce_loss": 1.492050046181248e-06 + }, + { + "epoch": 1.2670555665414276, + "step": 12815, + "train/sim_loss": 0.0521855354309082 + }, + { + "epoch": 1.2670555665414276, + "step": 12815, + "train/total_loss": 0.05218568444252014 + }, + { + "entropy": 9.684222221374512, + "epoch": 1.2671544393909433, + "mean_token_accuracy": 0.8785424828529358, + "num_tokens": 10087629.0, + "step": 12816, + "train/ce_loss": 0.3092295825481415 + }, + { + "epoch": 1.2671544393909433, + "step": 12816, + "train/sim_loss": 0.05873960256576538 + }, + { + "epoch": 1.2671544393909433, + "step": 12816, + "train/total_loss": 0.08966255933046341 + }, + { + "entropy": 9.53558349609375, + "epoch": 1.2672533122404588, + "mean_token_accuracy": 0.8597285151481628, + "num_tokens": 10104025.0, + "step": 12817, + "train/ce_loss": 0.22431109845638275 + }, + { + "epoch": 1.2672533122404588, + "step": 12817, + "train/sim_loss": 0.020424485206604004 + }, + { + "epoch": 1.2672533122404588, + "step": 12817, + "train/total_loss": 0.04285559803247452 + }, + { + "entropy": 9.49242115020752, + "epoch": 1.2673521850899743, + "mean_token_accuracy": 0.8932714462280273, + "num_tokens": 10114424.0, + "step": 12818, + "train/ce_loss": 0.10065628588199615 + }, + { + "epoch": 1.2673521850899743, + "step": 12818, + "train/sim_loss": 0.07621783018112183 + }, + { + "epoch": 1.2673521850899743, + "step": 12818, + "train/total_loss": 0.08628346025943756 + }, + { + "entropy": 9.580371856689453, + "epoch": 1.2674510579394898, + "mean_token_accuracy": 0.8321078419685364, + "num_tokens": 10125776.0, + "step": 12819, + "train/ce_loss": 0.37977728247642517 + }, + { + "epoch": 1.2674510579394898, + "step": 12819, + "train/sim_loss": 0.06345295906066895 + }, + { + "epoch": 1.2674510579394898, + "step": 12819, + "train/total_loss": 0.10143068432807922 + }, + { + "epoch": 1.2675499307890052, + "grad_norm": 0.6301184892654419, + "learning_rate": 6.833061365771647e-06, + "loss": 0.0872, + "step": 12820 + }, + { + "entropy": 9.813413619995117, + "epoch": 1.2675499307890052, + "mean_token_accuracy": 0.8411513566970825, + "num_tokens": 10140947.0, + "step": 12820, + "train/ce_loss": 0.6068336367607117 + }, + { + "epoch": 1.2675499307890052, + "step": 12820, + "train/sim_loss": 0.04136449098587036 + }, + { + "epoch": 1.2675499307890052, + "step": 12820, + "train/total_loss": 0.102047860622406 + }, + { + "entropy": 9.882776260375977, + "epoch": 1.267648803638521, + "mean_token_accuracy": 0.8425655961036682, + "num_tokens": 10151741.0, + "step": 12821, + "train/ce_loss": 0.7029473185539246 + }, + { + "epoch": 1.267648803638521, + "step": 12821, + "train/sim_loss": 0.07983541488647461 + }, + { + "epoch": 1.267648803638521, + "step": 12821, + "train/total_loss": 0.15013015270233154 + }, + { + "entropy": 9.542869567871094, + "epoch": 1.2677476764880364, + "mean_token_accuracy": 0.9260935187339783, + "num_tokens": 10164689.0, + "step": 12822, + "train/ce_loss": 0.466524600982666 + }, + { + "epoch": 1.2677476764880364, + "step": 12822, + "train/sim_loss": 0.012492060661315918 + }, + { + "epoch": 1.2677476764880364, + "step": 12822, + "train/total_loss": 0.0591445229947567 + }, + { + "entropy": 9.519414901733398, + "epoch": 1.267846549337552, + "mean_token_accuracy": 0.8414096832275391, + "num_tokens": 10179186.0, + "step": 12823, + "train/ce_loss": 0.8580834269523621 + }, + { + "epoch": 1.267846549337552, + "step": 12823, + "train/sim_loss": 0.06407153606414795 + }, + { + "epoch": 1.267846549337552, + "step": 12823, + "train/total_loss": 0.14987987279891968 + }, + { + "entropy": 9.613899230957031, + "epoch": 1.2679454221870674, + "mean_token_accuracy": 0.8960468769073486, + "num_tokens": 10191470.0, + "step": 12824, + "train/ce_loss": 0.27059799432754517 + }, + { + "epoch": 1.2679454221870674, + "step": 12824, + "train/sim_loss": 0.013365387916564941 + }, + { + "epoch": 1.2679454221870674, + "step": 12824, + "train/total_loss": 0.04042518883943558 + }, + { + "entropy": 9.57276439666748, + "epoch": 1.268044295036583, + "mean_token_accuracy": 0.9030023217201233, + "num_tokens": 10206141.0, + "step": 12825, + "train/ce_loss": 0.40258118510246277 + }, + { + "epoch": 1.268044295036583, + "step": 12825, + "train/sim_loss": 0.03818368911743164 + }, + { + "epoch": 1.268044295036583, + "step": 12825, + "train/total_loss": 0.0784418135881424 + }, + { + "entropy": 9.601176261901855, + "epoch": 1.2681431678860986, + "mean_token_accuracy": 0.8608815670013428, + "num_tokens": 10215648.0, + "step": 12826, + "train/ce_loss": 0.3592725992202759 + }, + { + "epoch": 1.2681431678860986, + "step": 12826, + "train/sim_loss": 0.03890347480773926 + }, + { + "epoch": 1.2681431678860986, + "step": 12826, + "train/total_loss": 0.07483074069023132 + }, + { + "entropy": 9.13232421875, + "epoch": 1.2682420407356139, + "mean_token_accuracy": 0.8706293702125549, + "num_tokens": 10228544.0, + "step": 12827, + "train/ce_loss": 0.14161376655101776 + }, + { + "epoch": 1.2682420407356139, + "step": 12827, + "train/sim_loss": 0.020155787467956543 + }, + { + "epoch": 1.2682420407356139, + "step": 12827, + "train/total_loss": 0.03431716561317444 + }, + { + "entropy": 9.69119644165039, + "epoch": 1.2683409135851296, + "mean_token_accuracy": 0.8641811013221741, + "num_tokens": 10240304.0, + "step": 12828, + "train/ce_loss": 0.6991057991981506 + }, + { + "epoch": 1.2683409135851296, + "step": 12828, + "train/sim_loss": 0.03556257486343384 + }, + { + "epoch": 1.2683409135851296, + "step": 12828, + "train/total_loss": 0.10547315329313278 + }, + { + "entropy": 9.334115982055664, + "epoch": 1.268439786434645, + "mean_token_accuracy": 0.9136407971382141, + "num_tokens": 10255238.0, + "step": 12829, + "train/ce_loss": 0.3357826769351959 + }, + { + "epoch": 1.268439786434645, + "step": 12829, + "train/sim_loss": 0.018750131130218506 + }, + { + "epoch": 1.268439786434645, + "step": 12829, + "train/total_loss": 0.05232840031385422 + }, + { + "entropy": 9.47854232788086, + "epoch": 1.2685386592841605, + "mean_token_accuracy": 0.8609431385993958, + "num_tokens": 10269759.0, + "step": 12830, + "train/ce_loss": 0.2964229881763458 + }, + { + "epoch": 1.2685386592841605, + "step": 12830, + "train/sim_loss": 0.05493509769439697 + }, + { + "epoch": 1.2685386592841605, + "step": 12830, + "train/total_loss": 0.08457739651203156 + }, + { + "entropy": 8.755617141723633, + "epoch": 1.268637532133676, + "mean_token_accuracy": 0.8873917460441589, + "num_tokens": 10280465.0, + "step": 12831, + "train/ce_loss": 0.14584386348724365 + }, + { + "epoch": 1.268637532133676, + "step": 12831, + "train/sim_loss": 0.00857764482498169 + }, + { + "epoch": 1.268637532133676, + "step": 12831, + "train/total_loss": 0.023162031546235085 + }, + { + "entropy": 9.275739669799805, + "epoch": 1.2687364049831915, + "mean_token_accuracy": 0.9208754301071167, + "num_tokens": 10292849.0, + "step": 12832, + "train/ce_loss": 0.43017706274986267 + }, + { + "epoch": 1.2687364049831915, + "step": 12832, + "train/sim_loss": 0.028207004070281982 + }, + { + "epoch": 1.2687364049831915, + "step": 12832, + "train/total_loss": 0.07122471183538437 + }, + { + "entropy": 9.33808708190918, + "epoch": 1.2688352778327072, + "mean_token_accuracy": 0.8126649260520935, + "num_tokens": 10307123.0, + "step": 12833, + "train/ce_loss": 0.3861454427242279 + }, + { + "epoch": 1.2688352778327072, + "step": 12833, + "train/sim_loss": 0.023770511150360107 + }, + { + "epoch": 1.2688352778327072, + "step": 12833, + "train/total_loss": 0.06238505616784096 + }, + { + "entropy": 9.772672653198242, + "epoch": 1.2689341506822227, + "mean_token_accuracy": 0.8642972707748413, + "num_tokens": 10323329.0, + "step": 12834, + "train/ce_loss": 0.40519845485687256 + }, + { + "epoch": 1.2689341506822227, + "step": 12834, + "train/sim_loss": 0.018803834915161133 + }, + { + "epoch": 1.2689341506822227, + "step": 12834, + "train/total_loss": 0.05932367965579033 + }, + { + "entropy": 9.17082405090332, + "epoch": 1.2690330235317382, + "mean_token_accuracy": 0.8909090757369995, + "num_tokens": 10341613.0, + "step": 12835, + "train/ce_loss": 0.3547309935092926 + }, + { + "epoch": 1.2690330235317382, + "step": 12835, + "train/sim_loss": 0.06762611865997314 + }, + { + "epoch": 1.2690330235317382, + "step": 12835, + "train/total_loss": 0.10309921950101852 + }, + { + "entropy": 9.90186882019043, + "epoch": 1.2691318963812537, + "mean_token_accuracy": 0.8655462265014648, + "num_tokens": 10362542.0, + "step": 12836, + "train/ce_loss": 0.2048296183347702 + }, + { + "epoch": 1.2691318963812537, + "step": 12836, + "train/sim_loss": 0.03684735298156738 + }, + { + "epoch": 1.2691318963812537, + "step": 12836, + "train/total_loss": 0.05733031779527664 + }, + { + "entropy": 9.431830406188965, + "epoch": 1.2692307692307692, + "mean_token_accuracy": 0.8576122522354126, + "num_tokens": 10378066.0, + "step": 12837, + "train/ce_loss": 0.6615436673164368 + }, + { + "epoch": 1.2692307692307692, + "step": 12837, + "train/sim_loss": 0.0270310640335083 + }, + { + "epoch": 1.2692307692307692, + "step": 12837, + "train/total_loss": 0.0931854322552681 + }, + { + "entropy": 9.315984725952148, + "epoch": 1.2693296420802849, + "mean_token_accuracy": 0.879161536693573, + "num_tokens": 10390776.0, + "step": 12838, + "train/ce_loss": 0.4354919493198395 + }, + { + "epoch": 1.2693296420802849, + "step": 12838, + "train/sim_loss": 0.04391467571258545 + }, + { + "epoch": 1.2693296420802849, + "step": 12838, + "train/total_loss": 0.0874638706445694 + }, + { + "entropy": 9.11844539642334, + "epoch": 1.2694285149298004, + "mean_token_accuracy": 0.9107142686843872, + "num_tokens": 10402707.0, + "step": 12839, + "train/ce_loss": 0.3173166513442993 + }, + { + "epoch": 1.2694285149298004, + "step": 12839, + "train/sim_loss": 0.04141813516616821 + }, + { + "epoch": 1.2694285149298004, + "step": 12839, + "train/total_loss": 0.07314980030059814 + }, + { + "epoch": 1.2695273877793158, + "grad_norm": 0.45692238211631775, + "learning_rate": 6.828116501013698e-06, + "loss": 0.0778, + "step": 12840 + }, + { + "entropy": 9.466839790344238, + "epoch": 1.2695273877793158, + "mean_token_accuracy": 0.8736979365348816, + "num_tokens": 10413386.0, + "step": 12840, + "train/ce_loss": 0.2875478267669678 + }, + { + "epoch": 1.2695273877793158, + "step": 12840, + "train/sim_loss": 0.041203439235687256 + }, + { + "epoch": 1.2695273877793158, + "step": 12840, + "train/total_loss": 0.06995822489261627 + }, + { + "entropy": 9.326166152954102, + "epoch": 1.2696262606288313, + "mean_token_accuracy": 0.8712446093559265, + "num_tokens": 10426997.0, + "step": 12841, + "train/ce_loss": 0.30876070261001587 + }, + { + "epoch": 1.2696262606288313, + "step": 12841, + "train/sim_loss": 0.04334080219268799 + }, + { + "epoch": 1.2696262606288313, + "step": 12841, + "train/total_loss": 0.07421687245368958 + }, + { + "entropy": 9.377447128295898, + "epoch": 1.2697251334783468, + "mean_token_accuracy": 0.9017341136932373, + "num_tokens": 10433536.0, + "step": 12842, + "train/ce_loss": 2.9698699677283e-07 + }, + { + "epoch": 1.2697251334783468, + "step": 12842, + "train/sim_loss": 0.012763679027557373 + }, + { + "epoch": 1.2697251334783468, + "step": 12842, + "train/total_loss": 0.01276370882987976 + }, + { + "entropy": 9.425416946411133, + "epoch": 1.2698240063278623, + "mean_token_accuracy": 0.8898622989654541, + "num_tokens": 10451032.0, + "step": 12843, + "train/ce_loss": 0.32763734459877014 + }, + { + "epoch": 1.2698240063278623, + "step": 12843, + "train/sim_loss": 0.06461858749389648 + }, + { + "epoch": 1.2698240063278623, + "step": 12843, + "train/total_loss": 0.0973823219537735 + }, + { + "entropy": 9.51432991027832, + "epoch": 1.2699228791773778, + "mean_token_accuracy": 0.8933623433113098, + "num_tokens": 10465733.0, + "step": 12844, + "train/ce_loss": 0.4133424460887909 + }, + { + "epoch": 1.2699228791773778, + "step": 12844, + "train/sim_loss": 0.054549336433410645 + }, + { + "epoch": 1.2699228791773778, + "step": 12844, + "train/total_loss": 0.0958835780620575 + }, + { + "entropy": 8.86455249786377, + "epoch": 1.2700217520268935, + "mean_token_accuracy": 0.8845700621604919, + "num_tokens": 10477040.0, + "step": 12845, + "train/ce_loss": 0.2379945069551468 + }, + { + "epoch": 1.2700217520268935, + "step": 12845, + "train/sim_loss": 0.01464223861694336 + }, + { + "epoch": 1.2700217520268935, + "step": 12845, + "train/total_loss": 0.03844168782234192 + }, + { + "entropy": 9.415678977966309, + "epoch": 1.270120624876409, + "mean_token_accuracy": 0.8838383555412292, + "num_tokens": 10494559.0, + "step": 12846, + "train/ce_loss": 0.46324872970581055 + }, + { + "epoch": 1.270120624876409, + "step": 12846, + "train/sim_loss": 0.06444603204727173 + }, + { + "epoch": 1.270120624876409, + "step": 12846, + "train/total_loss": 0.11077091097831726 + }, + { + "entropy": 9.951716423034668, + "epoch": 1.2702194977259245, + "mean_token_accuracy": 0.8830083608627319, + "num_tokens": 10505994.0, + "step": 12847, + "train/ce_loss": 0.7806200981140137 + }, + { + "epoch": 1.2702194977259245, + "step": 12847, + "train/sim_loss": 0.06866556406021118 + }, + { + "epoch": 1.2702194977259245, + "step": 12847, + "train/total_loss": 0.1467275768518448 + }, + { + "entropy": 9.043008804321289, + "epoch": 1.27031837057544, + "mean_token_accuracy": 0.8675957918167114, + "num_tokens": 10519148.0, + "step": 12848, + "train/ce_loss": 0.6046025156974792 + }, + { + "epoch": 1.27031837057544, + "step": 12848, + "train/sim_loss": 0.024025321006774902 + }, + { + "epoch": 1.27031837057544, + "step": 12848, + "train/total_loss": 0.08448557555675507 + }, + { + "entropy": 9.269571304321289, + "epoch": 1.2704172434249554, + "mean_token_accuracy": 0.8871989846229553, + "num_tokens": 10531870.0, + "step": 12849, + "train/ce_loss": 0.1636287122964859 + }, + { + "epoch": 1.2704172434249554, + "step": 12849, + "train/sim_loss": 0.02899688482284546 + }, + { + "epoch": 1.2704172434249554, + "step": 12849, + "train/total_loss": 0.04535975679755211 + }, + { + "entropy": 9.297759056091309, + "epoch": 1.2705161162744711, + "mean_token_accuracy": 0.8469387888908386, + "num_tokens": 10542232.0, + "step": 12850, + "train/ce_loss": 0.41870370507240295 + }, + { + "epoch": 1.2705161162744711, + "step": 12850, + "train/sim_loss": 0.021418213844299316 + }, + { + "epoch": 1.2705161162744711, + "step": 12850, + "train/total_loss": 0.06328858435153961 + }, + { + "entropy": 8.85481071472168, + "epoch": 1.2706149891239866, + "mean_token_accuracy": 0.8829113841056824, + "num_tokens": 10551734.0, + "step": 12851, + "train/ce_loss": 0.3821364641189575 + }, + { + "epoch": 1.2706149891239866, + "step": 12851, + "train/sim_loss": 0.04763352870941162 + }, + { + "epoch": 1.2706149891239866, + "step": 12851, + "train/total_loss": 0.08584717661142349 + }, + { + "entropy": 9.41864013671875, + "epoch": 1.2707138619735021, + "mean_token_accuracy": 0.8307692408561707, + "num_tokens": 10567234.0, + "step": 12852, + "train/ce_loss": 0.8851425051689148 + }, + { + "epoch": 1.2707138619735021, + "step": 12852, + "train/sim_loss": 0.23812776803970337 + }, + { + "epoch": 1.2707138619735021, + "step": 12852, + "train/total_loss": 0.3266420364379883 + }, + { + "entropy": 9.425151824951172, + "epoch": 1.2708127348230176, + "mean_token_accuracy": 0.9269521236419678, + "num_tokens": 10578559.0, + "step": 12853, + "train/ce_loss": 0.1879885494709015 + }, + { + "epoch": 1.2708127348230176, + "step": 12853, + "train/sim_loss": 0.03508925437927246 + }, + { + "epoch": 1.2708127348230176, + "step": 12853, + "train/total_loss": 0.05388811230659485 + }, + { + "entropy": 9.383564949035645, + "epoch": 1.270911607672533, + "mean_token_accuracy": 0.8232118487358093, + "num_tokens": 10587585.0, + "step": 12854, + "train/ce_loss": 0.17227381467819214 + }, + { + "epoch": 1.270911607672533, + "step": 12854, + "train/sim_loss": 0.08015823364257812 + }, + { + "epoch": 1.270911607672533, + "step": 12854, + "train/total_loss": 0.09738561511039734 + }, + { + "entropy": 9.26484489440918, + "epoch": 1.2710104805220486, + "mean_token_accuracy": 0.8438617587089539, + "num_tokens": 10596892.0, + "step": 12855, + "train/ce_loss": 0.34974876046180725 + }, + { + "epoch": 1.2710104805220486, + "step": 12855, + "train/sim_loss": 0.04492974281311035 + }, + { + "epoch": 1.2710104805220486, + "step": 12855, + "train/total_loss": 0.07990461587905884 + }, + { + "entropy": 9.067476272583008, + "epoch": 1.271109353371564, + "mean_token_accuracy": 0.9198369383811951, + "num_tokens": 10606937.0, + "step": 12856, + "train/ce_loss": 0.3200516402721405 + }, + { + "epoch": 1.271109353371564, + "step": 12856, + "train/sim_loss": 0.01579207181930542 + }, + { + "epoch": 1.271109353371564, + "step": 12856, + "train/total_loss": 0.04779723659157753 + }, + { + "entropy": 9.265363693237305, + "epoch": 1.2712082262210798, + "mean_token_accuracy": 0.8551723957061768, + "num_tokens": 10617379.0, + "step": 12857, + "train/ce_loss": 0.33400315046310425 + }, + { + "epoch": 1.2712082262210798, + "step": 12857, + "train/sim_loss": 0.076183021068573 + }, + { + "epoch": 1.2712082262210798, + "step": 12857, + "train/total_loss": 0.10958333313465118 + }, + { + "entropy": 9.662313461303711, + "epoch": 1.2713070990705952, + "mean_token_accuracy": 0.8550295829772949, + "num_tokens": 10633966.0, + "step": 12858, + "train/ce_loss": 0.5440589189529419 + }, + { + "epoch": 1.2713070990705952, + "step": 12858, + "train/sim_loss": 0.07033681869506836 + }, + { + "epoch": 1.2713070990705952, + "step": 12858, + "train/total_loss": 0.12474271655082703 + }, + { + "entropy": 9.688051223754883, + "epoch": 1.2714059719201107, + "mean_token_accuracy": 0.8228005170822144, + "num_tokens": 10645329.0, + "step": 12859, + "train/ce_loss": 0.5974895358085632 + }, + { + "epoch": 1.2714059719201107, + "step": 12859, + "train/sim_loss": 0.09848952293395996 + }, + { + "epoch": 1.2714059719201107, + "step": 12859, + "train/total_loss": 0.1582384705543518 + }, + { + "epoch": 1.2715048447696262, + "grad_norm": 0.7325953841209412, + "learning_rate": 6.823171636255749e-06, + "loss": 0.0848, + "step": 12860 + }, + { + "entropy": 8.972709655761719, + "epoch": 1.2715048447696262, + "mean_token_accuracy": 0.8667481541633606, + "num_tokens": 10660357.0, + "step": 12860, + "train/ce_loss": 3.4625546163624676e-07 + }, + { + "epoch": 1.2715048447696262, + "step": 12860, + "train/sim_loss": 0.021664977073669434 + }, + { + "epoch": 1.2715048447696262, + "step": 12860, + "train/total_loss": 0.02166501246392727 + }, + { + "entropy": 9.856558799743652, + "epoch": 1.2716037176191417, + "mean_token_accuracy": 0.8412256240844727, + "num_tokens": 10673879.0, + "step": 12861, + "train/ce_loss": 0.8639615774154663 + }, + { + "epoch": 1.2716037176191417, + "step": 12861, + "train/sim_loss": 0.023216545581817627 + }, + { + "epoch": 1.2716037176191417, + "step": 12861, + "train/total_loss": 0.10961270332336426 + }, + { + "entropy": 9.455751419067383, + "epoch": 1.2717025904686574, + "mean_token_accuracy": 0.797919750213623, + "num_tokens": 10690168.0, + "step": 12862, + "train/ce_loss": 0.6829213500022888 + }, + { + "epoch": 1.2717025904686574, + "step": 12862, + "train/sim_loss": 0.046751320362091064 + }, + { + "epoch": 1.2717025904686574, + "step": 12862, + "train/total_loss": 0.11504345387220383 + }, + { + "entropy": 9.338129043579102, + "epoch": 1.271801463318173, + "mean_token_accuracy": 0.8517786264419556, + "num_tokens": 10707618.0, + "step": 12863, + "train/ce_loss": 0.5255270600318909 + }, + { + "epoch": 1.271801463318173, + "step": 12863, + "train/sim_loss": 0.08790343999862671 + }, + { + "epoch": 1.271801463318173, + "step": 12863, + "train/total_loss": 0.14045614004135132 + }, + { + "entropy": 9.611677169799805, + "epoch": 1.2719003361676884, + "mean_token_accuracy": 0.8721910119056702, + "num_tokens": 10719201.0, + "step": 12864, + "train/ce_loss": 8.605386483395705e-07 + }, + { + "epoch": 1.2719003361676884, + "step": 12864, + "train/sim_loss": 0.02801060676574707 + }, + { + "epoch": 1.2719003361676884, + "step": 12864, + "train/total_loss": 0.028010692447423935 + }, + { + "entropy": 9.163503646850586, + "epoch": 1.2719992090172039, + "mean_token_accuracy": 0.8856858611106873, + "num_tokens": 10732461.0, + "step": 12865, + "train/ce_loss": 0.4596976637840271 + }, + { + "epoch": 1.2719992090172039, + "step": 12865, + "train/sim_loss": 0.05127251148223877 + }, + { + "epoch": 1.2719992090172039, + "step": 12865, + "train/total_loss": 0.09724228084087372 + }, + { + "entropy": 9.028854370117188, + "epoch": 1.2720980818667194, + "mean_token_accuracy": 0.8508287072181702, + "num_tokens": 10740819.0, + "step": 12866, + "train/ce_loss": 0.5023791790008545 + }, + { + "epoch": 1.2720980818667194, + "step": 12866, + "train/sim_loss": 0.03816366195678711 + }, + { + "epoch": 1.2720980818667194, + "step": 12866, + "train/total_loss": 0.08840158581733704 + }, + { + "entropy": 9.657938003540039, + "epoch": 1.2721969547162348, + "mean_token_accuracy": 0.9273021221160889, + "num_tokens": 10752624.0, + "step": 12867, + "train/ce_loss": 2.750071814716648e-07 + }, + { + "epoch": 1.2721969547162348, + "step": 12867, + "train/sim_loss": 0.01670891046524048 + }, + { + "epoch": 1.2721969547162348, + "step": 12867, + "train/total_loss": 0.016708938404917717 + }, + { + "entropy": 9.606839179992676, + "epoch": 1.2722958275657503, + "mean_token_accuracy": 0.8235294222831726, + "num_tokens": 10769917.0, + "step": 12868, + "train/ce_loss": 0.19444730877876282 + }, + { + "epoch": 1.2722958275657503, + "step": 12868, + "train/sim_loss": 0.055281877517700195 + }, + { + "epoch": 1.2722958275657503, + "step": 12868, + "train/total_loss": 0.07472661137580872 + }, + { + "entropy": 9.499293327331543, + "epoch": 1.272394700415266, + "mean_token_accuracy": 0.9100610017776489, + "num_tokens": 10781537.0, + "step": 12869, + "train/ce_loss": 2.9365932618929946e-07 + }, + { + "epoch": 1.272394700415266, + "step": 12869, + "train/sim_loss": 0.0466001033782959 + }, + { + "epoch": 1.272394700415266, + "step": 12869, + "train/total_loss": 0.046600133180618286 + }, + { + "entropy": 9.380199432373047, + "epoch": 1.2724935732647815, + "mean_token_accuracy": 0.8732084035873413, + "num_tokens": 10798993.0, + "step": 12870, + "train/ce_loss": 0.27717041969299316 + }, + { + "epoch": 1.2724935732647815, + "step": 12870, + "train/sim_loss": 0.04885822534561157 + }, + { + "epoch": 1.2724935732647815, + "step": 12870, + "train/total_loss": 0.07657526433467865 + }, + { + "entropy": 9.414281845092773, + "epoch": 1.272592446114297, + "mean_token_accuracy": 0.8654592633247375, + "num_tokens": 10809420.0, + "step": 12871, + "train/ce_loss": 0.41920721530914307 + }, + { + "epoch": 1.272592446114297, + "step": 12871, + "train/sim_loss": 0.08399653434753418 + }, + { + "epoch": 1.272592446114297, + "step": 12871, + "train/total_loss": 0.1259172558784485 + }, + { + "entropy": 9.167779922485352, + "epoch": 1.2726913189638125, + "mean_token_accuracy": 0.896774172782898, + "num_tokens": 10822573.0, + "step": 12872, + "train/ce_loss": 0.5726273655891418 + }, + { + "epoch": 1.2726913189638125, + "step": 12872, + "train/sim_loss": 0.030106425285339355 + }, + { + "epoch": 1.2726913189638125, + "step": 12872, + "train/total_loss": 0.0873691588640213 + }, + { + "entropy": 9.893659591674805, + "epoch": 1.272790191813328, + "mean_token_accuracy": 0.8980099558830261, + "num_tokens": 10834640.0, + "step": 12873, + "train/ce_loss": 0.6606689691543579 + }, + { + "epoch": 1.272790191813328, + "step": 12873, + "train/sim_loss": 0.05638605356216431 + }, + { + "epoch": 1.272790191813328, + "step": 12873, + "train/total_loss": 0.12245295196771622 + }, + { + "entropy": 9.623441696166992, + "epoch": 1.2728890646628437, + "mean_token_accuracy": 0.8672922253608704, + "num_tokens": 10844972.0, + "step": 12874, + "train/ce_loss": 0.5743618607521057 + }, + { + "epoch": 1.2728890646628437, + "step": 12874, + "train/sim_loss": 0.05228376388549805 + }, + { + "epoch": 1.2728890646628437, + "step": 12874, + "train/total_loss": 0.10971994698047638 + }, + { + "entropy": 9.214603424072266, + "epoch": 1.2729879375123592, + "mean_token_accuracy": 0.8153215050697327, + "num_tokens": 10855002.0, + "step": 12875, + "train/ce_loss": 0.7875350117683411 + }, + { + "epoch": 1.2729879375123592, + "step": 12875, + "train/sim_loss": 0.03626811504364014 + }, + { + "epoch": 1.2729879375123592, + "step": 12875, + "train/total_loss": 0.11502161622047424 + }, + { + "entropy": 9.302963256835938, + "epoch": 1.2730868103618747, + "mean_token_accuracy": 0.8846761584281921, + "num_tokens": 10864505.0, + "step": 12876, + "train/ce_loss": 0.16745224595069885 + }, + { + "epoch": 1.2730868103618747, + "step": 12876, + "train/sim_loss": 0.021616220474243164 + }, + { + "epoch": 1.2730868103618747, + "step": 12876, + "train/total_loss": 0.03836144506931305 + }, + { + "entropy": 9.26881217956543, + "epoch": 1.2731856832113901, + "mean_token_accuracy": 0.870512843132019, + "num_tokens": 10875086.0, + "step": 12877, + "train/ce_loss": 0.15100212395191193 + }, + { + "epoch": 1.2731856832113901, + "step": 12877, + "train/sim_loss": 0.04492682218551636 + }, + { + "epoch": 1.2731856832113901, + "step": 12877, + "train/total_loss": 0.06002703309059143 + }, + { + "entropy": 9.123637199401855, + "epoch": 1.2732845560609056, + "mean_token_accuracy": 0.8615049123764038, + "num_tokens": 10890092.0, + "step": 12878, + "train/ce_loss": 0.34509676694869995 + }, + { + "epoch": 1.2732845560609056, + "step": 12878, + "train/sim_loss": 0.028052330017089844 + }, + { + "epoch": 1.2732845560609056, + "step": 12878, + "train/total_loss": 0.0625620037317276 + }, + { + "entropy": 9.367683410644531, + "epoch": 1.2733834289104213, + "mean_token_accuracy": 0.8745837807655334, + "num_tokens": 10905557.0, + "step": 12879, + "train/ce_loss": 0.33795711398124695 + }, + { + "epoch": 1.2733834289104213, + "step": 12879, + "train/sim_loss": 0.03937196731567383 + }, + { + "epoch": 1.2733834289104213, + "step": 12879, + "train/total_loss": 0.07316768169403076 + }, + { + "epoch": 1.2734823017599366, + "grad_norm": 0.4916248321533203, + "learning_rate": 6.8182267714978004e-06, + "loss": 0.08, + "step": 12880 + }, + { + "entropy": 9.190549850463867, + "epoch": 1.2734823017599366, + "mean_token_accuracy": 0.8542510271072388, + "num_tokens": 10917629.0, + "step": 12880, + "train/ce_loss": 0.1854662001132965 + }, + { + "epoch": 1.2734823017599366, + "step": 12880, + "train/sim_loss": 0.031783342361450195 + }, + { + "epoch": 1.2734823017599366, + "step": 12880, + "train/total_loss": 0.05032996088266373 + }, + { + "entropy": 9.544387817382812, + "epoch": 1.2735811746094523, + "mean_token_accuracy": 0.8853800892829895, + "num_tokens": 10933582.0, + "step": 12881, + "train/ce_loss": 0.3964660167694092 + }, + { + "epoch": 1.2735811746094523, + "step": 12881, + "train/sim_loss": 0.07878780364990234 + }, + { + "epoch": 1.2735811746094523, + "step": 12881, + "train/total_loss": 0.11843440681695938 + }, + { + "entropy": 9.470745086669922, + "epoch": 1.2736800474589678, + "mean_token_accuracy": 0.8999999761581421, + "num_tokens": 10946289.0, + "step": 12882, + "train/ce_loss": 0.3081769347190857 + }, + { + "epoch": 1.2736800474589678, + "step": 12882, + "train/sim_loss": 0.05209165811538696 + }, + { + "epoch": 1.2736800474589678, + "step": 12882, + "train/total_loss": 0.08290935307741165 + }, + { + "entropy": 9.780569076538086, + "epoch": 1.2737789203084833, + "mean_token_accuracy": 0.9085487127304077, + "num_tokens": 10957661.0, + "step": 12883, + "train/ce_loss": 9.394166795573256e-07 + }, + { + "epoch": 1.2737789203084833, + "step": 12883, + "train/sim_loss": 0.025608301162719727 + }, + { + "epoch": 1.2737789203084833, + "step": 12883, + "train/total_loss": 0.025608394294977188 + }, + { + "entropy": 9.575065612792969, + "epoch": 1.2738777931579988, + "mean_token_accuracy": 0.8319814801216125, + "num_tokens": 10972367.0, + "step": 12884, + "train/ce_loss": 0.8653284311294556 + }, + { + "epoch": 1.2738777931579988, + "step": 12884, + "train/sim_loss": 0.08291912078857422 + }, + { + "epoch": 1.2738777931579988, + "step": 12884, + "train/total_loss": 0.16945196688175201 + }, + { + "entropy": 9.54018783569336, + "epoch": 1.2739766660075142, + "mean_token_accuracy": 0.9135220050811768, + "num_tokens": 10985883.0, + "step": 12885, + "train/ce_loss": 9.236409255208855e-07 + }, + { + "epoch": 1.2739766660075142, + "step": 12885, + "train/sim_loss": 0.03618675470352173 + }, + { + "epoch": 1.2739766660075142, + "step": 12885, + "train/total_loss": 0.03618684783577919 + }, + { + "entropy": 8.899423599243164, + "epoch": 1.27407553885703, + "mean_token_accuracy": 0.8929845690727234, + "num_tokens": 10993766.0, + "step": 12886, + "train/ce_loss": 0.15406814217567444 + }, + { + "epoch": 1.27407553885703, + "step": 12886, + "train/sim_loss": 0.01567983627319336 + }, + { + "epoch": 1.27407553885703, + "step": 12886, + "train/total_loss": 0.031086649745702744 + }, + { + "entropy": 9.381353378295898, + "epoch": 1.2741744117065454, + "mean_token_accuracy": 0.863070547580719, + "num_tokens": 11001804.0, + "step": 12887, + "train/ce_loss": 0.4480895698070526 + }, + { + "epoch": 1.2741744117065454, + "step": 12887, + "train/sim_loss": 0.0385814905166626 + }, + { + "epoch": 1.2741744117065454, + "step": 12887, + "train/total_loss": 0.08339044451713562 + }, + { + "entropy": 9.725179672241211, + "epoch": 1.274273284556061, + "mean_token_accuracy": 0.8949771523475647, + "num_tokens": 11017104.0, + "step": 12888, + "train/ce_loss": 8.933851631809375e-07 + }, + { + "epoch": 1.274273284556061, + "step": 12888, + "train/sim_loss": 0.08071398735046387 + }, + { + "epoch": 1.274273284556061, + "step": 12888, + "train/total_loss": 0.08071407675743103 + }, + { + "entropy": 9.930419921875, + "epoch": 1.2743721574055764, + "mean_token_accuracy": 0.9350348114967346, + "num_tokens": 11030269.0, + "step": 12889, + "train/ce_loss": 1.5056369875310338e-06 + }, + { + "epoch": 1.2743721574055764, + "step": 12889, + "train/sim_loss": 0.025670170783996582 + }, + { + "epoch": 1.2743721574055764, + "step": 12889, + "train/total_loss": 0.02567032165825367 + }, + { + "entropy": 9.491585731506348, + "epoch": 1.274471030255092, + "mean_token_accuracy": 0.8394495248794556, + "num_tokens": 11042253.0, + "step": 12890, + "train/ce_loss": 0.6282882690429688 + }, + { + "epoch": 1.274471030255092, + "step": 12890, + "train/sim_loss": 0.08633261919021606 + }, + { + "epoch": 1.274471030255092, + "step": 12890, + "train/total_loss": 0.1491614580154419 + }, + { + "entropy": 9.917547225952148, + "epoch": 1.2745699031046076, + "mean_token_accuracy": 0.9126505851745605, + "num_tokens": 11049955.0, + "step": 12891, + "train/ce_loss": 0.587088942527771 + }, + { + "epoch": 1.2745699031046076, + "step": 12891, + "train/sim_loss": 0.03376877307891846 + }, + { + "epoch": 1.2745699031046076, + "step": 12891, + "train/total_loss": 0.09247766435146332 + }, + { + "entropy": 8.805567741394043, + "epoch": 1.2746687759541229, + "mean_token_accuracy": 0.8465062975883484, + "num_tokens": 11060466.0, + "step": 12892, + "train/ce_loss": 0.45881983637809753 + }, + { + "epoch": 1.2746687759541229, + "step": 12892, + "train/sim_loss": 0.04710441827774048 + }, + { + "epoch": 1.2746687759541229, + "step": 12892, + "train/total_loss": 0.09298640489578247 + }, + { + "entropy": 9.090951919555664, + "epoch": 1.2747676488036386, + "mean_token_accuracy": 0.887499988079071, + "num_tokens": 11075024.0, + "step": 12893, + "train/ce_loss": 0.4158516526222229 + }, + { + "epoch": 1.2747676488036386, + "step": 12893, + "train/sim_loss": 0.026337623596191406 + }, + { + "epoch": 1.2747676488036386, + "step": 12893, + "train/total_loss": 0.06792278587818146 + }, + { + "entropy": 9.275331497192383, + "epoch": 1.274866521653154, + "mean_token_accuracy": 0.8596287965774536, + "num_tokens": 11084681.0, + "step": 12894, + "train/ce_loss": 0.4413143992424011 + }, + { + "epoch": 1.274866521653154, + "step": 12894, + "train/sim_loss": 0.04353976249694824 + }, + { + "epoch": 1.274866521653154, + "step": 12894, + "train/total_loss": 0.0876712054014206 + }, + { + "entropy": 9.427873611450195, + "epoch": 1.2749653945026695, + "mean_token_accuracy": 0.9107142686843872, + "num_tokens": 11098595.0, + "step": 12895, + "train/ce_loss": 0.1429075002670288 + }, + { + "epoch": 1.2749653945026695, + "step": 12895, + "train/sim_loss": 0.05932819843292236 + }, + { + "epoch": 1.2749653945026695, + "step": 12895, + "train/total_loss": 0.07361894845962524 + }, + { + "entropy": 9.652931213378906, + "epoch": 1.275064267352185, + "mean_token_accuracy": 0.877458393573761, + "num_tokens": 11109332.0, + "step": 12896, + "train/ce_loss": 5.01351053117105e-07 + }, + { + "epoch": 1.275064267352185, + "step": 12896, + "train/sim_loss": 0.05498909950256348 + }, + { + "epoch": 1.275064267352185, + "step": 12896, + "train/total_loss": 0.05498914793133736 + }, + { + "entropy": 9.849924087524414, + "epoch": 1.2751631402017005, + "mean_token_accuracy": 0.8157129287719727, + "num_tokens": 11125016.0, + "step": 12897, + "train/ce_loss": 0.6250059008598328 + }, + { + "epoch": 1.2751631402017005, + "step": 12897, + "train/sim_loss": 0.07773637771606445 + }, + { + "epoch": 1.2751631402017005, + "step": 12897, + "train/total_loss": 0.1402369737625122 + }, + { + "entropy": 9.121601104736328, + "epoch": 1.2752620130512162, + "mean_token_accuracy": 0.8822894096374512, + "num_tokens": 11134836.0, + "step": 12898, + "train/ce_loss": 0.364602655172348 + }, + { + "epoch": 1.2752620130512162, + "step": 12898, + "train/sim_loss": 0.014560341835021973 + }, + { + "epoch": 1.2752620130512162, + "step": 12898, + "train/total_loss": 0.051020607352256775 + }, + { + "entropy": 9.123418807983398, + "epoch": 1.2753608859007317, + "mean_token_accuracy": 0.8558375835418701, + "num_tokens": 11148091.0, + "step": 12899, + "train/ce_loss": 0.24942605197429657 + }, + { + "epoch": 1.2753608859007317, + "step": 12899, + "train/sim_loss": 0.041168153285980225 + }, + { + "epoch": 1.2753608859007317, + "step": 12899, + "train/total_loss": 0.066110759973526 + }, + { + "epoch": 1.2754597587502472, + "grad_norm": 0.5641077160835266, + "learning_rate": 6.813281906739852e-06, + "loss": 0.0788, + "step": 12900 + }, + { + "entropy": 9.598296165466309, + "epoch": 1.2754597587502472, + "mean_token_accuracy": 0.8377358317375183, + "num_tokens": 11160390.0, + "step": 12900, + "train/ce_loss": 1.4295042092271615e-06 + }, + { + "epoch": 1.2754597587502472, + "step": 12900, + "train/sim_loss": 0.043539226055145264 + }, + { + "epoch": 1.2754597587502472, + "step": 12900, + "train/total_loss": 0.043539367616176605 + }, + { + "entropy": 9.668094635009766, + "epoch": 1.2755586315997627, + "mean_token_accuracy": 0.9078947305679321, + "num_tokens": 11174707.0, + "step": 12901, + "train/ce_loss": 0.4072307348251343 + }, + { + "epoch": 1.2755586315997627, + "step": 12901, + "train/sim_loss": 0.027898848056793213 + }, + { + "epoch": 1.2755586315997627, + "step": 12901, + "train/total_loss": 0.0686219185590744 + }, + { + "entropy": 9.373266220092773, + "epoch": 1.2756575044492782, + "mean_token_accuracy": 0.8591549396514893, + "num_tokens": 11182526.0, + "step": 12902, + "train/ce_loss": 0.5251780152320862 + }, + { + "epoch": 1.2756575044492782, + "step": 12902, + "train/sim_loss": 0.05905574560165405 + }, + { + "epoch": 1.2756575044492782, + "step": 12902, + "train/total_loss": 0.11157354712486267 + }, + { + "entropy": 9.836627960205078, + "epoch": 1.2757563772987939, + "mean_token_accuracy": 0.8520625829696655, + "num_tokens": 11195378.0, + "step": 12903, + "train/ce_loss": 0.5883368849754333 + }, + { + "epoch": 1.2757563772987939, + "step": 12903, + "train/sim_loss": 0.08017158508300781 + }, + { + "epoch": 1.2757563772987939, + "step": 12903, + "train/total_loss": 0.13900527358055115 + }, + { + "entropy": 9.909394264221191, + "epoch": 1.2758552501483091, + "mean_token_accuracy": 0.9075000286102295, + "num_tokens": 11208759.0, + "step": 12904, + "train/ce_loss": 0.7503929138183594 + }, + { + "epoch": 1.2758552501483091, + "step": 12904, + "train/sim_loss": 0.05865490436553955 + }, + { + "epoch": 1.2758552501483091, + "step": 12904, + "train/total_loss": 0.13369420170783997 + }, + { + "entropy": 9.530678749084473, + "epoch": 1.2759541229978248, + "mean_token_accuracy": 0.8797546029090881, + "num_tokens": 11221720.0, + "step": 12905, + "train/ce_loss": 0.2794561982154846 + }, + { + "epoch": 1.2759541229978248, + "step": 12905, + "train/sim_loss": 0.032744407653808594 + }, + { + "epoch": 1.2759541229978248, + "step": 12905, + "train/total_loss": 0.060690030455589294 + }, + { + "entropy": 9.661216735839844, + "epoch": 1.2760529958473403, + "mean_token_accuracy": 0.8767334222793579, + "num_tokens": 11232376.0, + "step": 12906, + "train/ce_loss": 0.38822323083877563 + }, + { + "epoch": 1.2760529958473403, + "step": 12906, + "train/sim_loss": 0.05662953853607178 + }, + { + "epoch": 1.2760529958473403, + "step": 12906, + "train/total_loss": 0.09545186161994934 + }, + { + "entropy": 8.945301055908203, + "epoch": 1.2761518686968558, + "mean_token_accuracy": 0.9063509106636047, + "num_tokens": 11239122.0, + "step": 12907, + "train/ce_loss": 0.3270571231842041 + }, + { + "epoch": 1.2761518686968558, + "step": 12907, + "train/sim_loss": 0.014863133430480957 + }, + { + "epoch": 1.2761518686968558, + "step": 12907, + "train/total_loss": 0.04756884649395943 + }, + { + "entropy": 9.671579360961914, + "epoch": 1.2762507415463713, + "mean_token_accuracy": 0.9205297827720642, + "num_tokens": 11254146.0, + "step": 12908, + "train/ce_loss": 0.293605238199234 + }, + { + "epoch": 1.2762507415463713, + "step": 12908, + "train/sim_loss": 0.0389631986618042 + }, + { + "epoch": 1.2762507415463713, + "step": 12908, + "train/total_loss": 0.06832372397184372 + }, + { + "entropy": 9.357673645019531, + "epoch": 1.2763496143958868, + "mean_token_accuracy": 0.9043062329292297, + "num_tokens": 11266099.0, + "step": 12909, + "train/ce_loss": 0.14650849997997284 + }, + { + "epoch": 1.2763496143958868, + "step": 12909, + "train/sim_loss": 0.031180202960968018 + }, + { + "epoch": 1.2763496143958868, + "step": 12909, + "train/total_loss": 0.04583105444908142 + }, + { + "entropy": 9.904541015625, + "epoch": 1.2764484872454025, + "mean_token_accuracy": 0.8938193321228027, + "num_tokens": 11273004.0, + "step": 12910, + "train/ce_loss": 0.08387462049722672 + }, + { + "epoch": 1.2764484872454025, + "step": 12910, + "train/sim_loss": 0.05894196033477783 + }, + { + "epoch": 1.2764484872454025, + "step": 12910, + "train/total_loss": 0.06732942163944244 + }, + { + "entropy": 9.630657196044922, + "epoch": 1.276547360094918, + "mean_token_accuracy": 0.8491189479827881, + "num_tokens": 11291505.0, + "step": 12911, + "train/ce_loss": 0.10257188230752945 + }, + { + "epoch": 1.276547360094918, + "step": 12911, + "train/sim_loss": 0.041209638118743896 + }, + { + "epoch": 1.276547360094918, + "step": 12911, + "train/total_loss": 0.05146682634949684 + }, + { + "entropy": 9.387051582336426, + "epoch": 1.2766462329444335, + "mean_token_accuracy": 0.8692206144332886, + "num_tokens": 11298962.0, + "step": 12912, + "train/ce_loss": 0.4747609496116638 + }, + { + "epoch": 1.2766462329444335, + "step": 12912, + "train/sim_loss": 0.014206647872924805 + }, + { + "epoch": 1.2766462329444335, + "step": 12912, + "train/total_loss": 0.06168274208903313 + }, + { + "entropy": 9.302311897277832, + "epoch": 1.276745105793949, + "mean_token_accuracy": 0.8399280309677124, + "num_tokens": 11309526.0, + "step": 12913, + "train/ce_loss": 7.126564582904393e-07 + }, + { + "epoch": 1.276745105793949, + "step": 12913, + "train/sim_loss": 0.05009746551513672 + }, + { + "epoch": 1.276745105793949, + "step": 12913, + "train/total_loss": 0.05009753629565239 + }, + { + "entropy": 9.368050575256348, + "epoch": 1.2768439786434644, + "mean_token_accuracy": 0.9163346886634827, + "num_tokens": 11319583.0, + "step": 12914, + "train/ce_loss": 0.2693035900592804 + }, + { + "epoch": 1.2768439786434644, + "step": 12914, + "train/sim_loss": 0.03359723091125488 + }, + { + "epoch": 1.2768439786434644, + "step": 12914, + "train/total_loss": 0.06052759289741516 + }, + { + "entropy": 9.605445861816406, + "epoch": 1.2769428514929801, + "mean_token_accuracy": 0.8668639063835144, + "num_tokens": 11333222.0, + "step": 12915, + "train/ce_loss": 2.5957729121728335e-07 + }, + { + "epoch": 1.2769428514929801, + "step": 12915, + "train/sim_loss": 0.008783340454101562 + }, + { + "epoch": 1.2769428514929801, + "step": 12915, + "train/total_loss": 0.008783366531133652 + }, + { + "entropy": 9.520974159240723, + "epoch": 1.2770417243424956, + "mean_token_accuracy": 0.927315354347229, + "num_tokens": 11347466.0, + "step": 12916, + "train/ce_loss": 0.07297045737504959 + }, + { + "epoch": 1.2770417243424956, + "step": 12916, + "train/sim_loss": 0.0535581111907959 + }, + { + "epoch": 1.2770417243424956, + "step": 12916, + "train/total_loss": 0.06085515767335892 + }, + { + "entropy": 9.776784896850586, + "epoch": 1.2771405971920111, + "mean_token_accuracy": 0.8961625099182129, + "num_tokens": 11357917.0, + "step": 12917, + "train/ce_loss": 0.5082886815071106 + }, + { + "epoch": 1.2771405971920111, + "step": 12917, + "train/sim_loss": 0.015129685401916504 + }, + { + "epoch": 1.2771405971920111, + "step": 12917, + "train/total_loss": 0.06595855951309204 + }, + { + "entropy": 9.852518081665039, + "epoch": 1.2772394700415266, + "mean_token_accuracy": 0.9133192300796509, + "num_tokens": 11375023.0, + "step": 12918, + "train/ce_loss": 5.766289632447297e-07 + }, + { + "epoch": 1.2772394700415266, + "step": 12918, + "train/sim_loss": 0.029983043670654297 + }, + { + "epoch": 1.2772394700415266, + "step": 12918, + "train/total_loss": 0.029983101412653923 + }, + { + "entropy": 9.345401763916016, + "epoch": 1.277338342891042, + "mean_token_accuracy": 0.8569604158401489, + "num_tokens": 11385145.0, + "step": 12919, + "train/ce_loss": 0.44319847226142883 + }, + { + "epoch": 1.277338342891042, + "step": 12919, + "train/sim_loss": 0.04707467555999756 + }, + { + "epoch": 1.277338342891042, + "step": 12919, + "train/total_loss": 0.09139452874660492 + }, + { + "epoch": 1.2774372157405576, + "grad_norm": 0.5451325178146362, + "learning_rate": 6.808337041981903e-06, + "loss": 0.0774, + "step": 12920 + }, + { + "entropy": 9.269536972045898, + "epoch": 1.2774372157405576, + "mean_token_accuracy": 0.8406015038490295, + "num_tokens": 11396710.0, + "step": 12920, + "train/ce_loss": 0.25890758633613586 + }, + { + "epoch": 1.2774372157405576, + "step": 12920, + "train/sim_loss": 0.039826393127441406 + }, + { + "epoch": 1.2774372157405576, + "step": 12920, + "train/total_loss": 0.06571715325117111 + }, + { + "entropy": 9.236770629882812, + "epoch": 1.277536088590073, + "mean_token_accuracy": 0.8681564331054688, + "num_tokens": 11410477.0, + "step": 12921, + "train/ce_loss": 0.3329896032810211 + }, + { + "epoch": 1.277536088590073, + "step": 12921, + "train/sim_loss": 0.031981706619262695 + }, + { + "epoch": 1.277536088590073, + "step": 12921, + "train/total_loss": 0.06528066843748093 + }, + { + "entropy": 9.342031478881836, + "epoch": 1.2776349614395888, + "mean_token_accuracy": 0.8614009022712708, + "num_tokens": 11423147.0, + "step": 12922, + "train/ce_loss": 0.4361996650695801 + }, + { + "epoch": 1.2776349614395888, + "step": 12922, + "train/sim_loss": 0.0406375527381897 + }, + { + "epoch": 1.2776349614395888, + "step": 12922, + "train/total_loss": 0.08425752073526382 + }, + { + "entropy": 9.000225067138672, + "epoch": 1.2777338342891043, + "mean_token_accuracy": 0.8739290237426758, + "num_tokens": 11429823.0, + "step": 12923, + "train/ce_loss": 0.4474848210811615 + }, + { + "epoch": 1.2777338342891043, + "step": 12923, + "train/sim_loss": 0.01239931583404541 + }, + { + "epoch": 1.2777338342891043, + "step": 12923, + "train/total_loss": 0.0571477971971035 + }, + { + "entropy": 9.318437576293945, + "epoch": 1.2778327071386197, + "mean_token_accuracy": 0.8759689927101135, + "num_tokens": 11444048.0, + "step": 12924, + "train/ce_loss": 0.4634571671485901 + }, + { + "epoch": 1.2778327071386197, + "step": 12924, + "train/sim_loss": 0.0560649037361145 + }, + { + "epoch": 1.2778327071386197, + "step": 12924, + "train/total_loss": 0.10241062194108963 + }, + { + "entropy": 9.264131546020508, + "epoch": 1.2779315799881352, + "mean_token_accuracy": 0.8906497359275818, + "num_tokens": 11455751.0, + "step": 12925, + "train/ce_loss": 0.5527202486991882 + }, + { + "epoch": 1.2779315799881352, + "step": 12925, + "train/sim_loss": 0.019465923309326172 + }, + { + "epoch": 1.2779315799881352, + "step": 12925, + "train/total_loss": 0.07473795115947723 + }, + { + "entropy": 9.027931213378906, + "epoch": 1.2780304528376507, + "mean_token_accuracy": 0.8628205060958862, + "num_tokens": 11463312.0, + "step": 12926, + "train/ce_loss": 0.5055566430091858 + }, + { + "epoch": 1.2780304528376507, + "step": 12926, + "train/sim_loss": 0.07330453395843506 + }, + { + "epoch": 1.2780304528376507, + "step": 12926, + "train/total_loss": 0.1238601952791214 + }, + { + "entropy": 9.217985153198242, + "epoch": 1.2781293256871664, + "mean_token_accuracy": 0.9097065329551697, + "num_tokens": 11470607.0, + "step": 12927, + "train/ce_loss": 0.2728806436061859 + }, + { + "epoch": 1.2781293256871664, + "step": 12927, + "train/sim_loss": 0.08198416233062744 + }, + { + "epoch": 1.2781293256871664, + "step": 12927, + "train/total_loss": 0.10927222669124603 + }, + { + "entropy": 9.483732223510742, + "epoch": 1.278228198536682, + "mean_token_accuracy": 0.8821243643760681, + "num_tokens": 11486266.0, + "step": 12928, + "train/ce_loss": 0.25447413325309753 + }, + { + "epoch": 1.278228198536682, + "step": 12928, + "train/sim_loss": 0.030162811279296875 + }, + { + "epoch": 1.278228198536682, + "step": 12928, + "train/total_loss": 0.05561022460460663 + }, + { + "entropy": 9.572426795959473, + "epoch": 1.2783270713861974, + "mean_token_accuracy": 0.8589341640472412, + "num_tokens": 11499935.0, + "step": 12929, + "train/ce_loss": 0.35796329379081726 + }, + { + "epoch": 1.2783270713861974, + "step": 12929, + "train/sim_loss": 0.0562129020690918 + }, + { + "epoch": 1.2783270713861974, + "step": 12929, + "train/total_loss": 0.09200923144817352 + }, + { + "entropy": 9.439659118652344, + "epoch": 1.2784259442357129, + "mean_token_accuracy": 0.856071949005127, + "num_tokens": 11509785.0, + "step": 12930, + "train/ce_loss": 0.2800532281398773 + }, + { + "epoch": 1.2784259442357129, + "step": 12930, + "train/sim_loss": 0.06019330024719238 + }, + { + "epoch": 1.2784259442357129, + "step": 12930, + "train/total_loss": 0.08819862455129623 + }, + { + "entropy": 9.51968765258789, + "epoch": 1.2785248170852284, + "mean_token_accuracy": 0.8959537744522095, + "num_tokens": 11525090.0, + "step": 12931, + "train/ce_loss": 0.2622526288032532 + }, + { + "epoch": 1.2785248170852284, + "step": 12931, + "train/sim_loss": 0.03496283292770386 + }, + { + "epoch": 1.2785248170852284, + "step": 12931, + "train/total_loss": 0.061188094317913055 + }, + { + "entropy": 9.555574417114258, + "epoch": 1.2786236899347438, + "mean_token_accuracy": 0.806598424911499, + "num_tokens": 11541031.0, + "step": 12932, + "train/ce_loss": 0.607266902923584 + }, + { + "epoch": 1.2786236899347438, + "step": 12932, + "train/sim_loss": 0.01637864112854004 + }, + { + "epoch": 1.2786236899347438, + "step": 12932, + "train/total_loss": 0.0771053284406662 + }, + { + "entropy": 9.520565032958984, + "epoch": 1.2787225627842593, + "mean_token_accuracy": 0.9147005677223206, + "num_tokens": 11549939.0, + "step": 12933, + "train/ce_loss": 0.5232093930244446 + }, + { + "epoch": 1.2787225627842593, + "step": 12933, + "train/sim_loss": 0.034966886043548584 + }, + { + "epoch": 1.2787225627842593, + "step": 12933, + "train/total_loss": 0.08728782832622528 + }, + { + "entropy": 8.955808639526367, + "epoch": 1.278821435633775, + "mean_token_accuracy": 0.8764204382896423, + "num_tokens": 11561339.0, + "step": 12934, + "train/ce_loss": 0.8780666589736938 + }, + { + "epoch": 1.278821435633775, + "step": 12934, + "train/sim_loss": 0.04956328868865967 + }, + { + "epoch": 1.278821435633775, + "step": 12934, + "train/total_loss": 0.13736996054649353 + }, + { + "entropy": 9.769216537475586, + "epoch": 1.2789203084832905, + "mean_token_accuracy": 0.8863636255264282, + "num_tokens": 11573360.0, + "step": 12935, + "train/ce_loss": 0.3455081582069397 + }, + { + "epoch": 1.2789203084832905, + "step": 12935, + "train/sim_loss": 0.04456758499145508 + }, + { + "epoch": 1.2789203084832905, + "step": 12935, + "train/total_loss": 0.07911840081214905 + }, + { + "entropy": 9.458979606628418, + "epoch": 1.279019181332806, + "mean_token_accuracy": 0.8643853068351746, + "num_tokens": 11581490.0, + "step": 12936, + "train/ce_loss": 1.1080622673034668 + }, + { + "epoch": 1.279019181332806, + "step": 12936, + "train/sim_loss": 0.10067594051361084 + }, + { + "epoch": 1.279019181332806, + "step": 12936, + "train/total_loss": 0.21148216724395752 + }, + { + "entropy": 10.061141014099121, + "epoch": 1.2791180541823215, + "mean_token_accuracy": 0.8983451724052429, + "num_tokens": 11595929.0, + "step": 12937, + "train/ce_loss": 0.5977064967155457 + }, + { + "epoch": 1.2791180541823215, + "step": 12937, + "train/sim_loss": 0.058582842350006104 + }, + { + "epoch": 1.2791180541823215, + "step": 12937, + "train/total_loss": 0.11835349351167679 + }, + { + "entropy": 9.80169677734375, + "epoch": 1.279216927031837, + "mean_token_accuracy": 0.7926023602485657, + "num_tokens": 11610077.0, + "step": 12938, + "train/ce_loss": 1.735131718305638e-06 + }, + { + "epoch": 1.279216927031837, + "step": 12938, + "train/sim_loss": 0.03147125244140625 + }, + { + "epoch": 1.279216927031837, + "step": 12938, + "train/total_loss": 0.03147142753005028 + }, + { + "entropy": 9.28862190246582, + "epoch": 1.2793157998813527, + "mean_token_accuracy": 0.8580096960067749, + "num_tokens": 11620544.0, + "step": 12939, + "train/ce_loss": 0.45516011118888855 + }, + { + "epoch": 1.2793157998813527, + "step": 12939, + "train/sim_loss": 0.014690756797790527 + }, + { + "epoch": 1.2793157998813527, + "step": 12939, + "train/total_loss": 0.06020676717162132 + }, + { + "epoch": 1.2794146727308682, + "grad_norm": 0.5630451440811157, + "learning_rate": 6.803392177223953e-06, + "loss": 0.082, + "step": 12940 + }, + { + "entropy": 9.523945808410645, + "epoch": 1.2794146727308682, + "mean_token_accuracy": 0.8779931664466858, + "num_tokens": 11634518.0, + "step": 12940, + "train/ce_loss": 0.25446775555610657 + }, + { + "epoch": 1.2794146727308682, + "step": 12940, + "train/sim_loss": 0.0397721529006958 + }, + { + "epoch": 1.2794146727308682, + "step": 12940, + "train/total_loss": 0.06521892547607422 + }, + { + "entropy": 9.752896308898926, + "epoch": 1.2795135455803837, + "mean_token_accuracy": 0.9204108119010925, + "num_tokens": 11650774.0, + "step": 12941, + "train/ce_loss": 0.21303080022335052 + }, + { + "epoch": 1.2795135455803837, + "step": 12941, + "train/sim_loss": 0.03723627328872681 + }, + { + "epoch": 1.2795135455803837, + "step": 12941, + "train/total_loss": 0.05853935331106186 + }, + { + "entropy": 9.403512954711914, + "epoch": 1.2796124184298991, + "mean_token_accuracy": 0.8948717713356018, + "num_tokens": 11660324.0, + "step": 12942, + "train/ce_loss": 0.4336525499820709 + }, + { + "epoch": 1.2796124184298991, + "step": 12942, + "train/sim_loss": 0.057283759117126465 + }, + { + "epoch": 1.2796124184298991, + "step": 12942, + "train/total_loss": 0.10064901411533356 + }, + { + "entropy": 9.285959243774414, + "epoch": 1.2797112912794146, + "mean_token_accuracy": 0.8646986484527588, + "num_tokens": 11672018.0, + "step": 12943, + "train/ce_loss": 0.4271654784679413 + }, + { + "epoch": 1.2797112912794146, + "step": 12943, + "train/sim_loss": 0.09946966171264648 + }, + { + "epoch": 1.2797112912794146, + "step": 12943, + "train/total_loss": 0.1421862095594406 + }, + { + "entropy": 9.417591094970703, + "epoch": 1.2798101641289301, + "mean_token_accuracy": 0.8500000238418579, + "num_tokens": 11680702.0, + "step": 12944, + "train/ce_loss": 0.7377545237541199 + }, + { + "epoch": 1.2798101641289301, + "step": 12944, + "train/sim_loss": 0.01272130012512207 + }, + { + "epoch": 1.2798101641289301, + "step": 12944, + "train/total_loss": 0.0864967554807663 + }, + { + "entropy": 9.738114356994629, + "epoch": 1.2799090369784456, + "mean_token_accuracy": 0.8892580270767212, + "num_tokens": 11698150.0, + "step": 12945, + "train/ce_loss": 0.1633247435092926 + }, + { + "epoch": 1.2799090369784456, + "step": 12945, + "train/sim_loss": 0.01568007469177246 + }, + { + "epoch": 1.2799090369784456, + "step": 12945, + "train/total_loss": 0.03201255202293396 + }, + { + "entropy": 9.961888313293457, + "epoch": 1.2800079098279613, + "mean_token_accuracy": 0.8324099779129028, + "num_tokens": 11712490.0, + "step": 12946, + "train/ce_loss": 0.4052243232727051 + }, + { + "epoch": 1.2800079098279613, + "step": 12946, + "train/sim_loss": 0.03706181049346924 + }, + { + "epoch": 1.2800079098279613, + "step": 12946, + "train/total_loss": 0.07758424431085587 + }, + { + "entropy": 9.426576614379883, + "epoch": 1.2801067826774768, + "mean_token_accuracy": 0.8541666865348816, + "num_tokens": 11722302.0, + "step": 12947, + "train/ce_loss": 0.553267240524292 + }, + { + "epoch": 1.2801067826774768, + "step": 12947, + "train/sim_loss": 0.08478933572769165 + }, + { + "epoch": 1.2801067826774768, + "step": 12947, + "train/total_loss": 0.14011606574058533 + }, + { + "entropy": 9.499398231506348, + "epoch": 1.2802056555269923, + "mean_token_accuracy": 0.8849673271179199, + "num_tokens": 11733723.0, + "step": 12948, + "train/ce_loss": 0.12066023796796799 + }, + { + "epoch": 1.2802056555269923, + "step": 12948, + "train/sim_loss": 0.02078402042388916 + }, + { + "epoch": 1.2802056555269923, + "step": 12948, + "train/total_loss": 0.03285004571080208 + }, + { + "entropy": 9.637162208557129, + "epoch": 1.2803045283765078, + "mean_token_accuracy": 0.8430379629135132, + "num_tokens": 11750927.0, + "step": 12949, + "train/ce_loss": 0.4681958258152008 + }, + { + "epoch": 1.2803045283765078, + "step": 12949, + "train/sim_loss": 0.05105322599411011 + }, + { + "epoch": 1.2803045283765078, + "step": 12949, + "train/total_loss": 0.09787280857563019 + }, + { + "entropy": 9.74954891204834, + "epoch": 1.2804034012260233, + "mean_token_accuracy": 0.8766233921051025, + "num_tokens": 11767646.0, + "step": 12950, + "train/ce_loss": 0.1837867647409439 + }, + { + "epoch": 1.2804034012260233, + "step": 12950, + "train/sim_loss": 0.07086187601089478 + }, + { + "epoch": 1.2804034012260233, + "step": 12950, + "train/total_loss": 0.08924055099487305 + }, + { + "entropy": 9.843122482299805, + "epoch": 1.280502274075539, + "mean_token_accuracy": 0.8663303852081299, + "num_tokens": 11787759.0, + "step": 12951, + "train/ce_loss": 0.36105480790138245 + }, + { + "epoch": 1.280502274075539, + "step": 12951, + "train/sim_loss": 0.04012531042098999 + }, + { + "epoch": 1.280502274075539, + "step": 12951, + "train/total_loss": 0.07623079419136047 + }, + { + "entropy": 8.896821975708008, + "epoch": 1.2806011469250544, + "mean_token_accuracy": 0.8949275612831116, + "num_tokens": 11798284.0, + "step": 12952, + "train/ce_loss": 0.32959872484207153 + }, + { + "epoch": 1.2806011469250544, + "step": 12952, + "train/sim_loss": 0.04459512233734131 + }, + { + "epoch": 1.2806011469250544, + "step": 12952, + "train/total_loss": 0.07755500078201294 + }, + { + "entropy": 9.756082534790039, + "epoch": 1.28070001977457, + "mean_token_accuracy": 0.8414096832275391, + "num_tokens": 11812540.0, + "step": 12953, + "train/ce_loss": 0.573798418045044 + }, + { + "epoch": 1.28070001977457, + "step": 12953, + "train/sim_loss": 0.10851144790649414 + }, + { + "epoch": 1.28070001977457, + "step": 12953, + "train/total_loss": 0.16589128971099854 + }, + { + "entropy": 10.08863639831543, + "epoch": 1.2807988926240854, + "mean_token_accuracy": 0.8584070801734924, + "num_tokens": 11825068.0, + "step": 12954, + "train/ce_loss": 1.1055494724132586e-06 + }, + { + "epoch": 1.2807988926240854, + "step": 12954, + "train/sim_loss": 0.016824007034301758 + }, + { + "epoch": 1.2807988926240854, + "step": 12954, + "train/total_loss": 0.016824116930365562 + }, + { + "entropy": 9.688840866088867, + "epoch": 1.280897765473601, + "mean_token_accuracy": 0.842293918132782, + "num_tokens": 11837412.0, + "step": 12955, + "train/ce_loss": 0.6198422908782959 + }, + { + "epoch": 1.280897765473601, + "step": 12955, + "train/sim_loss": 0.07183682918548584 + }, + { + "epoch": 1.280897765473601, + "step": 12955, + "train/total_loss": 0.1338210552930832 + }, + { + "entropy": 9.828227043151855, + "epoch": 1.2809966383231166, + "mean_token_accuracy": 0.8741058707237244, + "num_tokens": 11854611.0, + "step": 12956, + "train/ce_loss": 0.44628405570983887 + }, + { + "epoch": 1.2809966383231166, + "step": 12956, + "train/sim_loss": 0.050131797790527344 + }, + { + "epoch": 1.2809966383231166, + "step": 12956, + "train/total_loss": 0.09476020932197571 + }, + { + "entropy": 9.379045486450195, + "epoch": 1.2810955111726319, + "mean_token_accuracy": 0.900447428226471, + "num_tokens": 11866776.0, + "step": 12957, + "train/ce_loss": 0.12529045343399048 + }, + { + "epoch": 1.2810955111726319, + "step": 12957, + "train/sim_loss": 0.04168200492858887 + }, + { + "epoch": 1.2810955111726319, + "step": 12957, + "train/total_loss": 0.054211050271987915 + }, + { + "entropy": 9.871954917907715, + "epoch": 1.2811943840221476, + "mean_token_accuracy": 0.8185231685638428, + "num_tokens": 11883655.0, + "step": 12958, + "train/ce_loss": 7.161143003031611e-07 + }, + { + "epoch": 1.2811943840221476, + "step": 12958, + "train/sim_loss": 0.030262649059295654 + }, + { + "epoch": 1.2811943840221476, + "step": 12958, + "train/total_loss": 0.030262719839811325 + }, + { + "entropy": 10.007369041442871, + "epoch": 1.281293256871663, + "mean_token_accuracy": 0.907010018825531, + "num_tokens": 11900366.0, + "step": 12959, + "train/ce_loss": 8.567578788643004e-07 + }, + { + "epoch": 1.281293256871663, + "step": 12959, + "train/sim_loss": 0.044447124004364014 + }, + { + "epoch": 1.281293256871663, + "step": 12959, + "train/total_loss": 0.04444720968604088 + }, + { + "epoch": 1.2813921297211786, + "grad_norm": 0.5157791972160339, + "learning_rate": 6.798447312466004e-06, + "loss": 0.0871, + "step": 12960 + }, + { + "entropy": 9.811491012573242, + "epoch": 1.2813921297211786, + "mean_token_accuracy": 0.842424213886261, + "num_tokens": 11920902.0, + "step": 12960, + "train/ce_loss": 0.5699785351753235 + }, + { + "epoch": 1.2813921297211786, + "step": 12960, + "train/sim_loss": 0.018766045570373535 + }, + { + "epoch": 1.2813921297211786, + "step": 12960, + "train/total_loss": 0.07576389610767365 + }, + { + "entropy": 9.498687744140625, + "epoch": 1.281491002570694, + "mean_token_accuracy": 0.8299999833106995, + "num_tokens": 11934315.0, + "step": 12961, + "train/ce_loss": 0.3713780641555786 + }, + { + "epoch": 1.281491002570694, + "step": 12961, + "train/sim_loss": 0.07231569290161133 + }, + { + "epoch": 1.281491002570694, + "step": 12961, + "train/total_loss": 0.10945349931716919 + }, + { + "entropy": 9.928874969482422, + "epoch": 1.2815898754202095, + "mean_token_accuracy": 0.8613013625144958, + "num_tokens": 11942628.0, + "step": 12962, + "train/ce_loss": 2.7026021598430816e-07 + }, + { + "epoch": 1.2815898754202095, + "step": 12962, + "train/sim_loss": 0.02053588628768921 + }, + { + "epoch": 1.2815898754202095, + "step": 12962, + "train/total_loss": 0.020535914227366447 + }, + { + "entropy": 9.25062370300293, + "epoch": 1.2816887482697252, + "mean_token_accuracy": 0.8456464409828186, + "num_tokens": 11959804.0, + "step": 12963, + "train/ce_loss": 0.4102458655834198 + }, + { + "epoch": 1.2816887482697252, + "step": 12963, + "train/sim_loss": 0.031222224235534668 + }, + { + "epoch": 1.2816887482697252, + "step": 12963, + "train/total_loss": 0.07224681228399277 + }, + { + "entropy": 9.245183944702148, + "epoch": 1.2817876211192407, + "mean_token_accuracy": 0.8817204236984253, + "num_tokens": 11969604.0, + "step": 12964, + "train/ce_loss": 0.746025800704956 + }, + { + "epoch": 1.2817876211192407, + "step": 12964, + "train/sim_loss": 0.05681443214416504 + }, + { + "epoch": 1.2817876211192407, + "step": 12964, + "train/total_loss": 0.13141700625419617 + }, + { + "entropy": 10.215110778808594, + "epoch": 1.2818864939687562, + "mean_token_accuracy": 0.9112903475761414, + "num_tokens": 11980603.0, + "step": 12965, + "train/ce_loss": 4.370970430045418e-07 + }, + { + "epoch": 1.2818864939687562, + "step": 12965, + "train/sim_loss": 0.018746495246887207 + }, + { + "epoch": 1.2818864939687562, + "step": 12965, + "train/total_loss": 0.01874653808772564 + }, + { + "entropy": 9.470993041992188, + "epoch": 1.2819853668182717, + "mean_token_accuracy": 0.8322916626930237, + "num_tokens": 11996180.0, + "step": 12966, + "train/ce_loss": 0.4643584191799164 + }, + { + "epoch": 1.2819853668182717, + "step": 12966, + "train/sim_loss": 0.02905261516571045 + }, + { + "epoch": 1.2819853668182717, + "step": 12966, + "train/total_loss": 0.07548846304416656 + }, + { + "entropy": 9.78118896484375, + "epoch": 1.2820842396677872, + "mean_token_accuracy": 0.8640973567962646, + "num_tokens": 12009521.0, + "step": 12967, + "train/ce_loss": 1.1354547950759297e-06 + }, + { + "epoch": 1.2820842396677872, + "step": 12967, + "train/sim_loss": 0.07165062427520752 + }, + { + "epoch": 1.2820842396677872, + "step": 12967, + "train/total_loss": 0.07165073603391647 + }, + { + "entropy": 9.506247520446777, + "epoch": 1.2821831125173029, + "mean_token_accuracy": 0.8703956604003906, + "num_tokens": 12025322.0, + "step": 12968, + "train/ce_loss": 0.4044579267501831 + }, + { + "epoch": 1.2821831125173029, + "step": 12968, + "train/sim_loss": 0.030740320682525635 + }, + { + "epoch": 1.2821831125173029, + "step": 12968, + "train/total_loss": 0.0711861103773117 + }, + { + "entropy": 9.974388122558594, + "epoch": 1.2822819853668181, + "mean_token_accuracy": 0.8569023609161377, + "num_tokens": 12036987.0, + "step": 12969, + "train/ce_loss": 0.4935429096221924 + }, + { + "epoch": 1.2822819853668181, + "step": 12969, + "train/sim_loss": 0.04527020454406738 + }, + { + "epoch": 1.2822819853668181, + "step": 12969, + "train/total_loss": 0.09462449699640274 + }, + { + "entropy": 9.346803665161133, + "epoch": 1.2823808582163339, + "mean_token_accuracy": 0.9053876399993896, + "num_tokens": 12045829.0, + "step": 12970, + "train/ce_loss": 0.24795612692832947 + }, + { + "epoch": 1.2823808582163339, + "step": 12970, + "train/sim_loss": 0.012238025665283203 + }, + { + "epoch": 1.2823808582163339, + "step": 12970, + "train/total_loss": 0.03703363984823227 + }, + { + "entropy": 8.757455825805664, + "epoch": 1.2824797310658493, + "mean_token_accuracy": 0.8492063283920288, + "num_tokens": 12054729.0, + "step": 12971, + "train/ce_loss": 0.22845420241355896 + }, + { + "epoch": 1.2824797310658493, + "step": 12971, + "train/sim_loss": 0.01436924934387207 + }, + { + "epoch": 1.2824797310658493, + "step": 12971, + "train/total_loss": 0.037214670330286026 + }, + { + "entropy": 9.555585861206055, + "epoch": 1.2825786039153648, + "mean_token_accuracy": 0.8112149238586426, + "num_tokens": 12063902.0, + "step": 12972, + "train/ce_loss": 0.6622101068496704 + }, + { + "epoch": 1.2825786039153648, + "step": 12972, + "train/sim_loss": 0.06461137533187866 + }, + { + "epoch": 1.2825786039153648, + "step": 12972, + "train/total_loss": 0.13083238899707794 + }, + { + "entropy": 9.088242530822754, + "epoch": 1.2826774767648803, + "mean_token_accuracy": 0.8767908215522766, + "num_tokens": 12076679.0, + "step": 12973, + "train/ce_loss": 0.3502975404262543 + }, + { + "epoch": 1.2826774767648803, + "step": 12973, + "train/sim_loss": 0.027918100357055664 + }, + { + "epoch": 1.2826774767648803, + "step": 12973, + "train/total_loss": 0.06294785439968109 + }, + { + "entropy": 9.139226913452148, + "epoch": 1.2827763496143958, + "mean_token_accuracy": 0.8765652775764465, + "num_tokens": 12093267.0, + "step": 12974, + "train/ce_loss": 0.49255385994911194 + }, + { + "epoch": 1.2827763496143958, + "step": 12974, + "train/sim_loss": 0.03808951377868652 + }, + { + "epoch": 1.2827763496143958, + "step": 12974, + "train/total_loss": 0.08734489977359772 + }, + { + "entropy": 9.463539123535156, + "epoch": 1.2828752224639115, + "mean_token_accuracy": 0.8223404288291931, + "num_tokens": 12114844.0, + "step": 12975, + "train/ce_loss": 0.41156792640686035 + }, + { + "epoch": 1.2828752224639115, + "step": 12975, + "train/sim_loss": 0.03877818584442139 + }, + { + "epoch": 1.2828752224639115, + "step": 12975, + "train/total_loss": 0.0799349844455719 + }, + { + "entropy": 9.633298873901367, + "epoch": 1.282974095313427, + "mean_token_accuracy": 0.8840000033378601, + "num_tokens": 12132888.0, + "step": 12976, + "train/ce_loss": 0.30822840332984924 + }, + { + "epoch": 1.282974095313427, + "step": 12976, + "train/sim_loss": 0.017367541790008545 + }, + { + "epoch": 1.282974095313427, + "step": 12976, + "train/total_loss": 0.04819038510322571 + }, + { + "entropy": 9.202038764953613, + "epoch": 1.2830729681629425, + "mean_token_accuracy": 0.8978675603866577, + "num_tokens": 12147864.0, + "step": 12977, + "train/ce_loss": 0.21986229717731476 + }, + { + "epoch": 1.2830729681629425, + "step": 12977, + "train/sim_loss": 0.019484639167785645 + }, + { + "epoch": 1.2830729681629425, + "step": 12977, + "train/total_loss": 0.04147087037563324 + }, + { + "entropy": 9.19525146484375, + "epoch": 1.283171841012458, + "mean_token_accuracy": 0.8484848737716675, + "num_tokens": 12157644.0, + "step": 12978, + "train/ce_loss": 0.5642848014831543 + }, + { + "epoch": 1.283171841012458, + "step": 12978, + "train/sim_loss": 0.08712482452392578 + }, + { + "epoch": 1.283171841012458, + "step": 12978, + "train/total_loss": 0.14355330169200897 + }, + { + "entropy": 9.124534606933594, + "epoch": 1.2832707138619734, + "mean_token_accuracy": 0.8945946097373962, + "num_tokens": 12163197.0, + "step": 12979, + "train/ce_loss": 0.22113816440105438 + }, + { + "epoch": 1.2832707138619734, + "step": 12979, + "train/sim_loss": 0.01871490478515625 + }, + { + "epoch": 1.2832707138619734, + "step": 12979, + "train/total_loss": 0.04082871973514557 + }, + { + "epoch": 1.2833695867114892, + "grad_norm": 0.40939491987228394, + "learning_rate": 6.7935024477080555e-06, + "loss": 0.0797, + "step": 12980 + }, + { + "entropy": 9.44078254699707, + "epoch": 1.2833695867114892, + "mean_token_accuracy": 0.8239355683326721, + "num_tokens": 12176205.0, + "step": 12980, + "train/ce_loss": 0.4068082273006439 + }, + { + "epoch": 1.2833695867114892, + "step": 12980, + "train/sim_loss": 0.06491190195083618 + }, + { + "epoch": 1.2833695867114892, + "step": 12980, + "train/total_loss": 0.10559272766113281 + }, + { + "entropy": 9.697690963745117, + "epoch": 1.2834684595610044, + "mean_token_accuracy": 0.8480325937271118, + "num_tokens": 12185792.0, + "step": 12981, + "train/ce_loss": 0.6118180751800537 + }, + { + "epoch": 1.2834684595610044, + "step": 12981, + "train/sim_loss": 0.033529043197631836 + }, + { + "epoch": 1.2834684595610044, + "step": 12981, + "train/total_loss": 0.09471085667610168 + }, + { + "entropy": 8.979894638061523, + "epoch": 1.2835673324105201, + "mean_token_accuracy": 0.858397364616394, + "num_tokens": 12193551.0, + "step": 12982, + "train/ce_loss": 0.37243515253067017 + }, + { + "epoch": 1.2835673324105201, + "step": 12982, + "train/sim_loss": 0.043096184730529785 + }, + { + "epoch": 1.2835673324105201, + "step": 12982, + "train/total_loss": 0.0803396999835968 + }, + { + "entropy": 9.252466201782227, + "epoch": 1.2836662052600356, + "mean_token_accuracy": 0.8653421401977539, + "num_tokens": 12204770.0, + "step": 12983, + "train/ce_loss": 0.17733080685138702 + }, + { + "epoch": 1.2836662052600356, + "step": 12983, + "train/sim_loss": 0.03362762928009033 + }, + { + "epoch": 1.2836662052600356, + "step": 12983, + "train/total_loss": 0.051360711455345154 + }, + { + "entropy": 9.139366149902344, + "epoch": 1.283765078109551, + "mean_token_accuracy": 0.8668122291564941, + "num_tokens": 12216015.0, + "step": 12984, + "train/ce_loss": 0.33234915137290955 + }, + { + "epoch": 1.283765078109551, + "step": 12984, + "train/sim_loss": 0.06320828199386597 + }, + { + "epoch": 1.283765078109551, + "step": 12984, + "train/total_loss": 0.09644319862127304 + }, + { + "entropy": 9.516939163208008, + "epoch": 1.2838639509590666, + "mean_token_accuracy": 0.8306818008422852, + "num_tokens": 12230155.0, + "step": 12985, + "train/ce_loss": 0.626083254814148 + }, + { + "epoch": 1.2838639509590666, + "step": 12985, + "train/sim_loss": 0.031049251556396484 + }, + { + "epoch": 1.2838639509590666, + "step": 12985, + "train/total_loss": 0.09365757554769516 + }, + { + "entropy": 9.147111892700195, + "epoch": 1.283962823808582, + "mean_token_accuracy": 0.8549946546554565, + "num_tokens": 12244487.0, + "step": 12986, + "train/ce_loss": 0.31498026847839355 + }, + { + "epoch": 1.283962823808582, + "step": 12986, + "train/sim_loss": 0.03178691864013672 + }, + { + "epoch": 1.283962823808582, + "step": 12986, + "train/total_loss": 0.06328494846820831 + }, + { + "entropy": 9.225622177124023, + "epoch": 1.2840616966580978, + "mean_token_accuracy": 0.9197530746459961, + "num_tokens": 12259701.0, + "step": 12987, + "train/ce_loss": 0.2836921215057373 + }, + { + "epoch": 1.2840616966580978, + "step": 12987, + "train/sim_loss": 0.040438055992126465 + }, + { + "epoch": 1.2840616966580978, + "step": 12987, + "train/total_loss": 0.06880726665258408 + }, + { + "entropy": 9.733598709106445, + "epoch": 1.2841605695076133, + "mean_token_accuracy": 0.8878748416900635, + "num_tokens": 12280348.0, + "step": 12988, + "train/ce_loss": 0.2785850763320923 + }, + { + "epoch": 1.2841605695076133, + "step": 12988, + "train/sim_loss": 0.02507704496383667 + }, + { + "epoch": 1.2841605695076133, + "step": 12988, + "train/total_loss": 0.05293555557727814 + }, + { + "entropy": 9.106463432312012, + "epoch": 1.2842594423571287, + "mean_token_accuracy": 0.8888888955116272, + "num_tokens": 12293124.0, + "step": 12989, + "train/ce_loss": 3.620383211000444e-07 + }, + { + "epoch": 1.2842594423571287, + "step": 12989, + "train/sim_loss": 0.028873026371002197 + }, + { + "epoch": 1.2842594423571287, + "step": 12989, + "train/total_loss": 0.028873061761260033 + }, + { + "entropy": 8.788655281066895, + "epoch": 1.2843583152066442, + "mean_token_accuracy": 0.8947927951812744, + "num_tokens": 12302221.0, + "step": 12990, + "train/ce_loss": 0.2705395519733429 + }, + { + "epoch": 1.2843583152066442, + "step": 12990, + "train/sim_loss": 0.014887332916259766 + }, + { + "epoch": 1.2843583152066442, + "step": 12990, + "train/total_loss": 0.041941288858652115 + }, + { + "entropy": 9.461660385131836, + "epoch": 1.2844571880561597, + "mean_token_accuracy": 0.8790523409843445, + "num_tokens": 12318662.0, + "step": 12991, + "train/ce_loss": 8.989106277113024e-07 + }, + { + "epoch": 1.2844571880561597, + "step": 12991, + "train/sim_loss": 0.04280751943588257 + }, + { + "epoch": 1.2844571880561597, + "step": 12991, + "train/total_loss": 0.04280760884284973 + }, + { + "entropy": 8.914325714111328, + "epoch": 1.2845560609056754, + "mean_token_accuracy": 0.88310307264328, + "num_tokens": 12331338.0, + "step": 12992, + "train/ce_loss": 0.3682178258895874 + }, + { + "epoch": 1.2845560609056754, + "step": 12992, + "train/sim_loss": 0.04645872116088867 + }, + { + "epoch": 1.2845560609056754, + "step": 12992, + "train/total_loss": 0.08328050374984741 + }, + { + "entropy": 9.20543384552002, + "epoch": 1.2846549337551907, + "mean_token_accuracy": 0.8356009125709534, + "num_tokens": 12340921.0, + "step": 12993, + "train/ce_loss": 0.5528079271316528 + }, + { + "epoch": 1.2846549337551907, + "step": 12993, + "train/sim_loss": 0.08084249496459961 + }, + { + "epoch": 1.2846549337551907, + "step": 12993, + "train/total_loss": 0.13612328469753265 + }, + { + "entropy": 10.29845905303955, + "epoch": 1.2847538066047064, + "mean_token_accuracy": 0.8814969062805176, + "num_tokens": 12351644.0, + "step": 12994, + "train/ce_loss": 0.5378929376602173 + }, + { + "epoch": 1.2847538066047064, + "step": 12994, + "train/sim_loss": 0.07647418975830078 + }, + { + "epoch": 1.2847538066047064, + "step": 12994, + "train/total_loss": 0.13026347756385803 + }, + { + "entropy": 9.489006042480469, + "epoch": 1.2848526794542219, + "mean_token_accuracy": 0.8719397187232971, + "num_tokens": 12361481.0, + "step": 12995, + "train/ce_loss": 0.6254003643989563 + }, + { + "epoch": 1.2848526794542219, + "step": 12995, + "train/sim_loss": 0.06245875358581543 + }, + { + "epoch": 1.2848526794542219, + "step": 12995, + "train/total_loss": 0.1249987930059433 + }, + { + "entropy": 9.355842590332031, + "epoch": 1.2849515523037374, + "mean_token_accuracy": 0.8553921580314636, + "num_tokens": 12375986.0, + "step": 12996, + "train/ce_loss": 0.3054891526699066 + }, + { + "epoch": 1.2849515523037374, + "step": 12996, + "train/sim_loss": 0.039202213287353516 + }, + { + "epoch": 1.2849515523037374, + "step": 12996, + "train/total_loss": 0.06975112855434418 + }, + { + "entropy": 9.636269569396973, + "epoch": 1.2850504251532529, + "mean_token_accuracy": 0.8635236024856567, + "num_tokens": 12391053.0, + "step": 12997, + "train/ce_loss": 0.46270453929901123 + }, + { + "epoch": 1.2850504251532529, + "step": 12997, + "train/sim_loss": 0.072884202003479 + }, + { + "epoch": 1.2850504251532529, + "step": 12997, + "train/total_loss": 0.1191546618938446 + }, + { + "entropy": 9.796125411987305, + "epoch": 1.2851492980027683, + "mean_token_accuracy": 0.8383705615997314, + "num_tokens": 12406854.0, + "step": 12998, + "train/ce_loss": 0.6346380114555359 + }, + { + "epoch": 1.2851492980027683, + "step": 12998, + "train/sim_loss": 0.020436465740203857 + }, + { + "epoch": 1.2851492980027683, + "step": 12998, + "train/total_loss": 0.08390026539564133 + }, + { + "entropy": 9.611650466918945, + "epoch": 1.285248170852284, + "mean_token_accuracy": 0.8716813921928406, + "num_tokens": 12418653.0, + "step": 12999, + "train/ce_loss": 2.644382277594559e-07 + }, + { + "epoch": 1.285248170852284, + "step": 12999, + "train/sim_loss": 0.01682138442993164 + }, + { + "epoch": 1.285248170852284, + "step": 12999, + "train/total_loss": 0.01682141050696373 + }, + { + "epoch": 1.2853470437017995, + "grad_norm": 0.569176435470581, + "learning_rate": 6.788557582950107e-06, + "loss": 0.0809, + "step": 13000 + }, + { + "entropy": 8.938741683959961, + "epoch": 1.2853470437017995, + "mean_token_accuracy": 0.8804537653923035, + "num_tokens": 12430637.0, + "step": 13000, + "train/ce_loss": 0.4516219198703766 + }, + { + "epoch": 1.2853470437017995, + "step": 13000, + "train/sim_loss": 0.07674437761306763 + }, + { + "epoch": 1.2853470437017995, + "step": 13000, + "train/total_loss": 0.1219065710902214 + }, + { + "entropy": 9.229232788085938, + "epoch": 1.285445916551315, + "mean_token_accuracy": 0.8586698174476624, + "num_tokens": 12444151.0, + "step": 13001, + "train/ce_loss": 0.5840215086936951 + }, + { + "epoch": 1.285445916551315, + "step": 13001, + "train/sim_loss": 0.09987807273864746 + }, + { + "epoch": 1.285445916551315, + "step": 13001, + "train/total_loss": 0.15828022360801697 + }, + { + "entropy": 9.434066772460938, + "epoch": 1.2855447894008305, + "mean_token_accuracy": 0.8739224076271057, + "num_tokens": 12460807.0, + "step": 13002, + "train/ce_loss": 0.2062690705060959 + }, + { + "epoch": 1.2855447894008305, + "step": 13002, + "train/sim_loss": 0.06934690475463867 + }, + { + "epoch": 1.2855447894008305, + "step": 13002, + "train/total_loss": 0.0899738147854805 + }, + { + "entropy": 9.200807571411133, + "epoch": 1.285643662250346, + "mean_token_accuracy": 0.8917410969734192, + "num_tokens": 12468651.0, + "step": 13003, + "train/ce_loss": 0.3471197783946991 + }, + { + "epoch": 1.285643662250346, + "step": 13003, + "train/sim_loss": 0.06424319744110107 + }, + { + "epoch": 1.285643662250346, + "step": 13003, + "train/total_loss": 0.0989551767706871 + }, + { + "entropy": 9.896810531616211, + "epoch": 1.2857425350998617, + "mean_token_accuracy": 0.9065573811531067, + "num_tokens": 12480839.0, + "step": 13004, + "train/ce_loss": 0.1811918318271637 + }, + { + "epoch": 1.2857425350998617, + "step": 13004, + "train/sim_loss": 0.03980153799057007 + }, + { + "epoch": 1.2857425350998617, + "step": 13004, + "train/total_loss": 0.05792072415351868 + }, + { + "entropy": 9.85608196258545, + "epoch": 1.2858414079493772, + "mean_token_accuracy": 0.8346709609031677, + "num_tokens": 12490942.0, + "step": 13005, + "train/ce_loss": 0.7595687508583069 + }, + { + "epoch": 1.2858414079493772, + "step": 13005, + "train/sim_loss": 0.08854860067367554 + }, + { + "epoch": 1.2858414079493772, + "step": 13005, + "train/total_loss": 0.1645054817199707 + }, + { + "entropy": 9.189538955688477, + "epoch": 1.2859402807988927, + "mean_token_accuracy": 0.8730964660644531, + "num_tokens": 12499574.0, + "step": 13006, + "train/ce_loss": 0.3579713702201843 + }, + { + "epoch": 1.2859402807988927, + "step": 13006, + "train/sim_loss": 0.017612338066101074 + }, + { + "epoch": 1.2859402807988927, + "step": 13006, + "train/total_loss": 0.053409475833177567 + }, + { + "entropy": 8.681164741516113, + "epoch": 1.2860391536484082, + "mean_token_accuracy": 0.8767908215522766, + "num_tokens": 12507270.0, + "step": 13007, + "train/ce_loss": 0.15647734701633453 + }, + { + "epoch": 1.2860391536484082, + "step": 13007, + "train/sim_loss": 0.014286816120147705 + }, + { + "epoch": 1.2860391536484082, + "step": 13007, + "train/total_loss": 0.029934551566839218 + }, + { + "entropy": 9.711187362670898, + "epoch": 1.2861380264979236, + "mean_token_accuracy": 0.8968824744224548, + "num_tokens": 12520520.0, + "step": 13008, + "train/ce_loss": 0.7829767465591431 + }, + { + "epoch": 1.2861380264979236, + "step": 13008, + "train/sim_loss": 0.043418049812316895 + }, + { + "epoch": 1.2861380264979236, + "step": 13008, + "train/total_loss": 0.1217157244682312 + }, + { + "entropy": 9.614130020141602, + "epoch": 1.2862368993474391, + "mean_token_accuracy": 0.8562259078025818, + "num_tokens": 12536011.0, + "step": 13009, + "train/ce_loss": 0.6164280772209167 + }, + { + "epoch": 1.2862368993474391, + "step": 13009, + "train/sim_loss": 0.05992370843887329 + }, + { + "epoch": 1.2862368993474391, + "step": 13009, + "train/total_loss": 0.1215665191411972 + }, + { + "entropy": 9.751737594604492, + "epoch": 1.2863357721969546, + "mean_token_accuracy": 0.9310924410820007, + "num_tokens": 12547143.0, + "step": 13010, + "train/ce_loss": 0.13692016899585724 + }, + { + "epoch": 1.2863357721969546, + "step": 13010, + "train/sim_loss": 0.03399991989135742 + }, + { + "epoch": 1.2863357721969546, + "step": 13010, + "train/total_loss": 0.047691937536001205 + }, + { + "entropy": 9.19014835357666, + "epoch": 1.2864346450464703, + "mean_token_accuracy": 0.8950104117393494, + "num_tokens": 12559532.0, + "step": 13011, + "train/ce_loss": 0.33307069540023804 + }, + { + "epoch": 1.2864346450464703, + "step": 13011, + "train/sim_loss": 0.04633688926696777 + }, + { + "epoch": 1.2864346450464703, + "step": 13011, + "train/total_loss": 0.07964396476745605 + }, + { + "entropy": 9.205676078796387, + "epoch": 1.2865335178959858, + "mean_token_accuracy": 0.8727272748947144, + "num_tokens": 12570747.0, + "step": 13012, + "train/ce_loss": 0.3711928427219391 + }, + { + "epoch": 1.2865335178959858, + "step": 13012, + "train/sim_loss": 0.02355945110321045 + }, + { + "epoch": 1.2865335178959858, + "step": 13012, + "train/total_loss": 0.06067873537540436 + }, + { + "entropy": 9.548750877380371, + "epoch": 1.2866323907455013, + "mean_token_accuracy": 0.8588390350341797, + "num_tokens": 12582284.0, + "step": 13013, + "train/ce_loss": 0.43635880947113037 + }, + { + "epoch": 1.2866323907455013, + "step": 13013, + "train/sim_loss": 0.051698923110961914 + }, + { + "epoch": 1.2866323907455013, + "step": 13013, + "train/total_loss": 0.09533480554819107 + }, + { + "entropy": 9.163959503173828, + "epoch": 1.2867312635950168, + "mean_token_accuracy": 0.8495440483093262, + "num_tokens": 12593553.0, + "step": 13014, + "train/ce_loss": 0.6834109425544739 + }, + { + "epoch": 1.2867312635950168, + "step": 13014, + "train/sim_loss": 0.06467247009277344 + }, + { + "epoch": 1.2867312635950168, + "step": 13014, + "train/total_loss": 0.13301357626914978 + }, + { + "entropy": 9.650543212890625, + "epoch": 1.2868301364445323, + "mean_token_accuracy": 0.8990654349327087, + "num_tokens": 12605268.0, + "step": 13015, + "train/ce_loss": 3.1313618364947615e-06 + }, + { + "epoch": 1.2868301364445323, + "step": 13015, + "train/sim_loss": 0.027446746826171875 + }, + { + "epoch": 1.2868301364445323, + "step": 13015, + "train/total_loss": 0.027447059750556946 + }, + { + "entropy": 9.006299018859863, + "epoch": 1.286929009294048, + "mean_token_accuracy": 0.8707926273345947, + "num_tokens": 12616541.0, + "step": 13016, + "train/ce_loss": 0.4165702164173126 + }, + { + "epoch": 1.286929009294048, + "step": 13016, + "train/sim_loss": 0.07888913154602051 + }, + { + "epoch": 1.286929009294048, + "step": 13016, + "train/total_loss": 0.12054615467786789 + }, + { + "entropy": 9.205194473266602, + "epoch": 1.2870278821435635, + "mean_token_accuracy": 0.8264840245246887, + "num_tokens": 12624889.0, + "step": 13017, + "train/ce_loss": 0.7218993306159973 + }, + { + "epoch": 1.2870278821435635, + "step": 13017, + "train/sim_loss": 0.06390857696533203 + }, + { + "epoch": 1.2870278821435635, + "step": 13017, + "train/total_loss": 0.13609850406646729 + }, + { + "entropy": 9.109619140625, + "epoch": 1.287126754993079, + "mean_token_accuracy": 0.8142144680023193, + "num_tokens": 12634469.0, + "step": 13018, + "train/ce_loss": 1.1729758977890015 + }, + { + "epoch": 1.287126754993079, + "step": 13018, + "train/sim_loss": 0.054232776165008545 + }, + { + "epoch": 1.287126754993079, + "step": 13018, + "train/total_loss": 0.1715303659439087 + }, + { + "entropy": 9.460038185119629, + "epoch": 1.2872256278425944, + "mean_token_accuracy": 0.8557692170143127, + "num_tokens": 12649655.0, + "step": 13019, + "train/ce_loss": 0.4511798024177551 + }, + { + "epoch": 1.2872256278425944, + "step": 13019, + "train/sim_loss": 0.06505084037780762 + }, + { + "epoch": 1.2872256278425944, + "step": 13019, + "train/total_loss": 0.11016882210969925 + }, + { + "epoch": 1.28732450069211, + "grad_norm": 0.5911986827850342, + "learning_rate": 6.783612718192158e-06, + "loss": 0.081, + "step": 13020 + }, + { + "entropy": 9.442934036254883, + "epoch": 1.28732450069211, + "mean_token_accuracy": 0.9079498052597046, + "num_tokens": 12665744.0, + "step": 13020, + "train/ce_loss": 0.2655591666698456 + }, + { + "epoch": 1.28732450069211, + "step": 13020, + "train/sim_loss": 0.019342541694641113 + }, + { + "epoch": 1.28732450069211, + "step": 13020, + "train/total_loss": 0.04589845985174179 + }, + { + "entropy": 9.474649429321289, + "epoch": 1.2874233735416254, + "mean_token_accuracy": 0.831626832485199, + "num_tokens": 12680397.0, + "step": 13021, + "train/ce_loss": 0.5881555676460266 + }, + { + "epoch": 1.2874233735416254, + "step": 13021, + "train/sim_loss": 0.05088990926742554 + }, + { + "epoch": 1.2874233735416254, + "step": 13021, + "train/total_loss": 0.10970546305179596 + }, + { + "entropy": 9.256025314331055, + "epoch": 1.2875222463911409, + "mean_token_accuracy": 0.8836565017700195, + "num_tokens": 12690696.0, + "step": 13022, + "train/ce_loss": 0.36928775906562805 + }, + { + "epoch": 1.2875222463911409, + "step": 13022, + "train/sim_loss": 0.015546143054962158 + }, + { + "epoch": 1.2875222463911409, + "step": 13022, + "train/total_loss": 0.05247491970658302 + }, + { + "entropy": 9.41031265258789, + "epoch": 1.2876211192406566, + "mean_token_accuracy": 0.8571428656578064, + "num_tokens": 12702311.0, + "step": 13023, + "train/ce_loss": 2.921235591202276e-06 + }, + { + "epoch": 1.2876211192406566, + "step": 13023, + "train/sim_loss": 0.02530771493911743 + }, + { + "epoch": 1.2876211192406566, + "step": 13023, + "train/total_loss": 0.02530800737440586 + }, + { + "entropy": 9.235027313232422, + "epoch": 1.287719992090172, + "mean_token_accuracy": 0.8804780840873718, + "num_tokens": 12717124.0, + "step": 13024, + "train/ce_loss": 0.234087735414505 + }, + { + "epoch": 1.287719992090172, + "step": 13024, + "train/sim_loss": 0.019966602325439453 + }, + { + "epoch": 1.287719992090172, + "step": 13024, + "train/total_loss": 0.04337537661194801 + }, + { + "entropy": 9.481306076049805, + "epoch": 1.2878188649396876, + "mean_token_accuracy": 0.8789308071136475, + "num_tokens": 12725670.0, + "step": 13025, + "train/ce_loss": 0.3730338215827942 + }, + { + "epoch": 1.2878188649396876, + "step": 13025, + "train/sim_loss": 0.06320726871490479 + }, + { + "epoch": 1.2878188649396876, + "step": 13025, + "train/total_loss": 0.10051065683364868 + }, + { + "entropy": 9.158496856689453, + "epoch": 1.287917737789203, + "mean_token_accuracy": 0.8717948794364929, + "num_tokens": 12735767.0, + "step": 13026, + "train/ce_loss": 0.4169906675815582 + }, + { + "epoch": 1.287917737789203, + "step": 13026, + "train/sim_loss": 0.0371553897857666 + }, + { + "epoch": 1.287917737789203, + "step": 13026, + "train/total_loss": 0.07885445654392242 + }, + { + "entropy": 8.841415405273438, + "epoch": 1.2880166106387185, + "mean_token_accuracy": 0.8052631616592407, + "num_tokens": 12744488.0, + "step": 13027, + "train/ce_loss": 0.5360863208770752 + }, + { + "epoch": 1.2880166106387185, + "step": 13027, + "train/sim_loss": 0.10206902027130127 + }, + { + "epoch": 1.2880166106387185, + "step": 13027, + "train/total_loss": 0.1556776463985443 + }, + { + "entropy": 9.430191040039062, + "epoch": 1.2881154834882342, + "mean_token_accuracy": 0.8529411554336548, + "num_tokens": 12754960.0, + "step": 13028, + "train/ce_loss": 0.5913705229759216 + }, + { + "epoch": 1.2881154834882342, + "step": 13028, + "train/sim_loss": 0.048468172550201416 + }, + { + "epoch": 1.2881154834882342, + "step": 13028, + "train/total_loss": 0.1076052263379097 + }, + { + "entropy": 9.321270942687988, + "epoch": 1.2882143563377497, + "mean_token_accuracy": 0.8591715693473816, + "num_tokens": 12767843.0, + "step": 13029, + "train/ce_loss": 0.5033043622970581 + }, + { + "epoch": 1.2882143563377497, + "step": 13029, + "train/sim_loss": 0.02137458324432373 + }, + { + "epoch": 1.2882143563377497, + "step": 13029, + "train/total_loss": 0.07170502096414566 + }, + { + "entropy": 8.949934005737305, + "epoch": 1.2883132291872652, + "mean_token_accuracy": 0.9045120477676392, + "num_tokens": 12777048.0, + "step": 13030, + "train/ce_loss": 0.14151713252067566 + }, + { + "epoch": 1.2883132291872652, + "step": 13030, + "train/sim_loss": 0.009978532791137695 + }, + { + "epoch": 1.2883132291872652, + "step": 13030, + "train/total_loss": 0.02413024753332138 + }, + { + "entropy": 9.087729454040527, + "epoch": 1.2884121020367807, + "mean_token_accuracy": 0.8233333230018616, + "num_tokens": 12784404.0, + "step": 13031, + "train/ce_loss": 0.6930145621299744 + }, + { + "epoch": 1.2884121020367807, + "step": 13031, + "train/sim_loss": 0.055955708026885986 + }, + { + "epoch": 1.2884121020367807, + "step": 13031, + "train/total_loss": 0.12525716423988342 + }, + { + "entropy": 9.190378189086914, + "epoch": 1.2885109748862962, + "mean_token_accuracy": 0.8727915287017822, + "num_tokens": 12795841.0, + "step": 13032, + "train/ce_loss": 0.30363017320632935 + }, + { + "epoch": 1.2885109748862962, + "step": 13032, + "train/sim_loss": 0.034922242164611816 + }, + { + "epoch": 1.2885109748862962, + "step": 13032, + "train/total_loss": 0.06528525799512863 + }, + { + "entropy": 9.077205657958984, + "epoch": 1.2886098477358117, + "mean_token_accuracy": 0.8470728993415833, + "num_tokens": 12805310.0, + "step": 13033, + "train/ce_loss": 0.5670459270477295 + }, + { + "epoch": 1.2886098477358117, + "step": 13033, + "train/sim_loss": 0.08460491895675659 + }, + { + "epoch": 1.2886098477358117, + "step": 13033, + "train/total_loss": 0.14130951464176178 + }, + { + "entropy": 9.868510246276855, + "epoch": 1.2887087205853271, + "mean_token_accuracy": 0.8392226099967957, + "num_tokens": 12817067.0, + "step": 13034, + "train/ce_loss": 0.7817869186401367 + }, + { + "epoch": 1.2887087205853271, + "step": 13034, + "train/sim_loss": 0.06673568487167358 + }, + { + "epoch": 1.2887087205853271, + "step": 13034, + "train/total_loss": 0.1449143886566162 + }, + { + "entropy": 9.357596397399902, + "epoch": 1.2888075934348429, + "mean_token_accuracy": 0.9069111347198486, + "num_tokens": 12829806.0, + "step": 13035, + "train/ce_loss": 0.4573175311088562 + }, + { + "epoch": 1.2888075934348429, + "step": 13035, + "train/sim_loss": 0.02254319190979004 + }, + { + "epoch": 1.2888075934348429, + "step": 13035, + "train/total_loss": 0.06827494502067566 + }, + { + "entropy": 9.334470748901367, + "epoch": 1.2889064662843583, + "mean_token_accuracy": 0.8784403800964355, + "num_tokens": 12843685.0, + "step": 13036, + "train/ce_loss": 0.49102556705474854 + }, + { + "epoch": 1.2889064662843583, + "step": 13036, + "train/sim_loss": 0.04014480113983154 + }, + { + "epoch": 1.2889064662843583, + "step": 13036, + "train/total_loss": 0.08924736082553864 + }, + { + "entropy": 9.536188125610352, + "epoch": 1.2890053391338738, + "mean_token_accuracy": 0.839712917804718, + "num_tokens": 12861820.0, + "step": 13037, + "train/ce_loss": 0.47465893626213074 + }, + { + "epoch": 1.2890053391338738, + "step": 13037, + "train/sim_loss": 0.04150599241256714 + }, + { + "epoch": 1.2890053391338738, + "step": 13037, + "train/total_loss": 0.08897188305854797 + }, + { + "entropy": 9.923676490783691, + "epoch": 1.2891042119833893, + "mean_token_accuracy": 0.8959660530090332, + "num_tokens": 12873962.0, + "step": 13038, + "train/ce_loss": 0.6406236886978149 + }, + { + "epoch": 1.2891042119833893, + "step": 13038, + "train/sim_loss": 0.05017441511154175 + }, + { + "epoch": 1.2891042119833893, + "step": 13038, + "train/total_loss": 0.11423678696155548 + }, + { + "entropy": 9.440290451049805, + "epoch": 1.2892030848329048, + "mean_token_accuracy": 0.8957983255386353, + "num_tokens": 12884980.0, + "step": 13039, + "train/ce_loss": 0.28186577558517456 + }, + { + "epoch": 1.2892030848329048, + "step": 13039, + "train/sim_loss": 0.038825392723083496 + }, + { + "epoch": 1.2892030848329048, + "step": 13039, + "train/total_loss": 0.06701196730136871 + }, + { + "epoch": 1.2893019576824205, + "grad_norm": 0.5625731945037842, + "learning_rate": 6.778667853434209e-06, + "loss": 0.0818, + "step": 13040 + }, + { + "entropy": 9.482606887817383, + "epoch": 1.2893019576824205, + "mean_token_accuracy": 0.8689740300178528, + "num_tokens": 12898671.0, + "step": 13040, + "train/ce_loss": 0.3588700592517853 + }, + { + "epoch": 1.2893019576824205, + "step": 13040, + "train/sim_loss": 0.022780954837799072 + }, + { + "epoch": 1.2893019576824205, + "step": 13040, + "train/total_loss": 0.05866796150803566 + }, + { + "entropy": 9.971031188964844, + "epoch": 1.289400830531936, + "mean_token_accuracy": 0.8705281019210815, + "num_tokens": 12907608.0, + "step": 13041, + "train/ce_loss": 0.5518602728843689 + }, + { + "epoch": 1.289400830531936, + "step": 13041, + "train/sim_loss": 0.07832455635070801 + }, + { + "epoch": 1.289400830531936, + "step": 13041, + "train/total_loss": 0.13351058959960938 + }, + { + "entropy": 9.36384391784668, + "epoch": 1.2894997033814515, + "mean_token_accuracy": 0.8831658363342285, + "num_tokens": 12922479.0, + "step": 13042, + "train/ce_loss": 0.39922958612442017 + }, + { + "epoch": 1.2894997033814515, + "step": 13042, + "train/sim_loss": 0.04287588596343994 + }, + { + "epoch": 1.2894997033814515, + "step": 13042, + "train/total_loss": 0.08279884606599808 + }, + { + "entropy": 9.079627990722656, + "epoch": 1.289598576230967, + "mean_token_accuracy": 0.8762446641921997, + "num_tokens": 12931023.0, + "step": 13043, + "train/ce_loss": 0.4928549826145172 + }, + { + "epoch": 1.289598576230967, + "step": 13043, + "train/sim_loss": 0.01631850004196167 + }, + { + "epoch": 1.289598576230967, + "step": 13043, + "train/total_loss": 0.06560400128364563 + }, + { + "entropy": 9.293705940246582, + "epoch": 1.2896974490804824, + "mean_token_accuracy": 0.852522611618042, + "num_tokens": 12945551.0, + "step": 13044, + "train/ce_loss": 0.32825469970703125 + }, + { + "epoch": 1.2896974490804824, + "step": 13044, + "train/sim_loss": 0.02342844009399414 + }, + { + "epoch": 1.2896974490804824, + "step": 13044, + "train/total_loss": 0.056253910064697266 + }, + { + "entropy": 9.614543914794922, + "epoch": 1.2897963219299982, + "mean_token_accuracy": 0.8235294222831726, + "num_tokens": 12953954.0, + "step": 13045, + "train/ce_loss": 0.3750837743282318 + }, + { + "epoch": 1.2897963219299982, + "step": 13045, + "train/sim_loss": 0.05955815315246582 + }, + { + "epoch": 1.2897963219299982, + "step": 13045, + "train/total_loss": 0.09706653654575348 + }, + { + "entropy": 9.132091522216797, + "epoch": 1.2898951947795134, + "mean_token_accuracy": 0.8264299631118774, + "num_tokens": 12965667.0, + "step": 13046, + "train/ce_loss": 0.4459173083305359 + }, + { + "epoch": 1.2898951947795134, + "step": 13046, + "train/sim_loss": 0.014135479927062988 + }, + { + "epoch": 1.2898951947795134, + "step": 13046, + "train/total_loss": 0.0587272122502327 + }, + { + "entropy": 9.3330717086792, + "epoch": 1.2899940676290291, + "mean_token_accuracy": 0.8854489326477051, + "num_tokens": 12981175.0, + "step": 13047, + "train/ce_loss": 0.48994114995002747 + }, + { + "epoch": 1.2899940676290291, + "step": 13047, + "train/sim_loss": 0.03432339429855347 + }, + { + "epoch": 1.2899940676290291, + "step": 13047, + "train/total_loss": 0.08331751078367233 + }, + { + "entropy": 9.740999221801758, + "epoch": 1.2900929404785446, + "mean_token_accuracy": 0.9089692234992981, + "num_tokens": 12997815.0, + "step": 13048, + "train/ce_loss": 0.19907549023628235 + }, + { + "epoch": 1.2900929404785446, + "step": 13048, + "train/sim_loss": 0.054367661476135254 + }, + { + "epoch": 1.2900929404785446, + "step": 13048, + "train/total_loss": 0.07427521049976349 + }, + { + "entropy": 9.416789054870605, + "epoch": 1.29019181332806, + "mean_token_accuracy": 0.8485169410705566, + "num_tokens": 13022059.0, + "step": 13049, + "train/ce_loss": 0.42558062076568604 + }, + { + "epoch": 1.29019181332806, + "step": 13049, + "train/sim_loss": 0.022301137447357178 + }, + { + "epoch": 1.29019181332806, + "step": 13049, + "train/total_loss": 0.06485919654369354 + }, + { + "entropy": 9.43781566619873, + "epoch": 1.2902906861775756, + "mean_token_accuracy": 0.853467583656311, + "num_tokens": 13034740.0, + "step": 13050, + "train/ce_loss": 0.3732900321483612 + }, + { + "epoch": 1.2902906861775756, + "step": 13050, + "train/sim_loss": 0.05958282947540283 + }, + { + "epoch": 1.2902906861775756, + "step": 13050, + "train/total_loss": 0.09691183269023895 + }, + { + "entropy": 9.087618827819824, + "epoch": 1.290389559027091, + "mean_token_accuracy": 0.8637316823005676, + "num_tokens": 13048783.0, + "step": 13051, + "train/ce_loss": 0.5169771313667297 + }, + { + "epoch": 1.290389559027091, + "step": 13051, + "train/sim_loss": 0.03225135803222656 + }, + { + "epoch": 1.290389559027091, + "step": 13051, + "train/total_loss": 0.08394907414913177 + }, + { + "entropy": 9.794479370117188, + "epoch": 1.2904884318766068, + "mean_token_accuracy": 0.8111273646354675, + "num_tokens": 13061276.0, + "step": 13052, + "train/ce_loss": 0.6261052489280701 + }, + { + "epoch": 1.2904884318766068, + "step": 13052, + "train/sim_loss": 0.030115962028503418 + }, + { + "epoch": 1.2904884318766068, + "step": 13052, + "train/total_loss": 0.09272649139165878 + }, + { + "entropy": 9.198439598083496, + "epoch": 1.2905873047261223, + "mean_token_accuracy": 0.856662929058075, + "num_tokens": 13076335.0, + "step": 13053, + "train/ce_loss": 0.2338748276233673 + }, + { + "epoch": 1.2905873047261223, + "step": 13053, + "train/sim_loss": 0.017232060432434082 + }, + { + "epoch": 1.2905873047261223, + "step": 13053, + "train/total_loss": 0.04061954468488693 + }, + { + "entropy": 9.449787139892578, + "epoch": 1.2906861775756377, + "mean_token_accuracy": 0.8713375926017761, + "num_tokens": 13089306.0, + "step": 13054, + "train/ce_loss": 0.3367420732975006 + }, + { + "epoch": 1.2906861775756377, + "step": 13054, + "train/sim_loss": 0.07797914743423462 + }, + { + "epoch": 1.2906861775756377, + "step": 13054, + "train/total_loss": 0.11165335774421692 + }, + { + "entropy": 9.498661041259766, + "epoch": 1.2907850504251532, + "mean_token_accuracy": 0.8694362044334412, + "num_tokens": 13101699.0, + "step": 13055, + "train/ce_loss": 0.5713818073272705 + }, + { + "epoch": 1.2907850504251532, + "step": 13055, + "train/sim_loss": 0.05074125528335571 + }, + { + "epoch": 1.2907850504251532, + "step": 13055, + "train/total_loss": 0.10787943750619888 + }, + { + "entropy": 9.527546882629395, + "epoch": 1.2908839232746687, + "mean_token_accuracy": 0.828976035118103, + "num_tokens": 13118797.0, + "step": 13056, + "train/ce_loss": 0.1305551379919052 + }, + { + "epoch": 1.2908839232746687, + "step": 13056, + "train/sim_loss": 0.03185391426086426 + }, + { + "epoch": 1.2908839232746687, + "step": 13056, + "train/total_loss": 0.04490942880511284 + }, + { + "entropy": 9.304027557373047, + "epoch": 1.2909827961241844, + "mean_token_accuracy": 0.8765281438827515, + "num_tokens": 13131137.0, + "step": 13057, + "train/ce_loss": 0.352775514125824 + }, + { + "epoch": 1.2909827961241844, + "step": 13057, + "train/sim_loss": 0.03927129507064819 + }, + { + "epoch": 1.2909827961241844, + "step": 13057, + "train/total_loss": 0.07454884797334671 + }, + { + "entropy": 9.837218284606934, + "epoch": 1.2910816689736997, + "mean_token_accuracy": 0.8941441178321838, + "num_tokens": 13146178.0, + "step": 13058, + "train/ce_loss": 1.1685367822647095 + }, + { + "epoch": 1.2910816689736997, + "step": 13058, + "train/sim_loss": 0.11298716068267822 + }, + { + "epoch": 1.2910816689736997, + "step": 13058, + "train/total_loss": 0.22984084486961365 + }, + { + "entropy": 9.291985511779785, + "epoch": 1.2911805418232154, + "mean_token_accuracy": 0.8456375598907471, + "num_tokens": 13159320.0, + "step": 13059, + "train/ce_loss": 0.25340786576271057 + }, + { + "epoch": 1.2911805418232154, + "step": 13059, + "train/sim_loss": 0.011356532573699951 + }, + { + "epoch": 1.2911805418232154, + "step": 13059, + "train/total_loss": 0.03669732064008713 + }, + { + "epoch": 1.2912794146727309, + "grad_norm": 0.5461663007736206, + "learning_rate": 6.77372298867626e-06, + "loss": 0.0854, + "step": 13060 + }, + { + "entropy": 9.011316299438477, + "epoch": 1.2912794146727309, + "mean_token_accuracy": 0.8656575083732605, + "num_tokens": 13168992.0, + "step": 13060, + "train/ce_loss": 0.1426311880350113 + }, + { + "epoch": 1.2912794146727309, + "step": 13060, + "train/sim_loss": 0.06444334983825684 + }, + { + "epoch": 1.2912794146727309, + "step": 13060, + "train/total_loss": 0.07870646566152573 + }, + { + "entropy": 9.246992111206055, + "epoch": 1.2913782875222464, + "mean_token_accuracy": 0.8256658315658569, + "num_tokens": 13178969.0, + "step": 13061, + "train/ce_loss": 0.8810110688209534 + }, + { + "epoch": 1.2913782875222464, + "step": 13061, + "train/sim_loss": 0.036243438720703125 + }, + { + "epoch": 1.2913782875222464, + "step": 13061, + "train/total_loss": 0.12434455007314682 + }, + { + "entropy": 9.681699752807617, + "epoch": 1.2914771603717619, + "mean_token_accuracy": 0.9032257795333862, + "num_tokens": 13189916.0, + "step": 13062, + "train/ce_loss": 0.26828497648239136 + }, + { + "epoch": 1.2914771603717619, + "step": 13062, + "train/sim_loss": 0.025420665740966797 + }, + { + "epoch": 1.2914771603717619, + "step": 13062, + "train/total_loss": 0.05224916338920593 + }, + { + "entropy": 9.528358459472656, + "epoch": 1.2915760332212773, + "mean_token_accuracy": 0.8762136101722717, + "num_tokens": 13206244.0, + "step": 13063, + "train/ce_loss": 0.8320903778076172 + }, + { + "epoch": 1.2915760332212773, + "step": 13063, + "train/sim_loss": 0.07412457466125488 + }, + { + "epoch": 1.2915760332212773, + "step": 13063, + "train/total_loss": 0.1573336124420166 + }, + { + "entropy": 9.20278263092041, + "epoch": 1.291674906070793, + "mean_token_accuracy": 0.8501827120780945, + "num_tokens": 13217541.0, + "step": 13064, + "train/ce_loss": 0.7315074801445007 + }, + { + "epoch": 1.291674906070793, + "step": 13064, + "train/sim_loss": 0.07071518898010254 + }, + { + "epoch": 1.291674906070793, + "step": 13064, + "train/total_loss": 0.1438659429550171 + }, + { + "entropy": 9.550895690917969, + "epoch": 1.2917737789203085, + "mean_token_accuracy": 0.865486741065979, + "num_tokens": 13231779.0, + "step": 13065, + "train/ce_loss": 4.861128672928317e-07 + }, + { + "epoch": 1.2917737789203085, + "step": 13065, + "train/sim_loss": 0.025662720203399658 + }, + { + "epoch": 1.2917737789203085, + "step": 13065, + "train/total_loss": 0.025662768632173538 + }, + { + "entropy": 8.858539581298828, + "epoch": 1.291872651769824, + "mean_token_accuracy": 0.8225629925727844, + "num_tokens": 13241484.0, + "step": 13066, + "train/ce_loss": 0.09721887856721878 + }, + { + "epoch": 1.291872651769824, + "step": 13066, + "train/sim_loss": 0.03934645652770996 + }, + { + "epoch": 1.291872651769824, + "step": 13066, + "train/total_loss": 0.04906834661960602 + }, + { + "entropy": 9.291193962097168, + "epoch": 1.2919715246193395, + "mean_token_accuracy": 0.9084699749946594, + "num_tokens": 13253315.0, + "step": 13067, + "train/ce_loss": 0.14668579399585724 + }, + { + "epoch": 1.2919715246193395, + "step": 13067, + "train/sim_loss": 0.028425276279449463 + }, + { + "epoch": 1.2919715246193395, + "step": 13067, + "train/total_loss": 0.043093856424093246 + }, + { + "entropy": 9.161734580993652, + "epoch": 1.292070397468855, + "mean_token_accuracy": 0.8708791136741638, + "num_tokens": 13262271.0, + "step": 13068, + "train/ce_loss": 0.48392271995544434 + }, + { + "epoch": 1.292070397468855, + "step": 13068, + "train/sim_loss": 0.015258491039276123 + }, + { + "epoch": 1.292070397468855, + "step": 13068, + "train/total_loss": 0.06365076452493668 + }, + { + "entropy": 9.008269309997559, + "epoch": 1.2921692703183707, + "mean_token_accuracy": 0.8685939311981201, + "num_tokens": 13275316.0, + "step": 13069, + "train/ce_loss": 2.775779819330637e-07 + }, + { + "epoch": 1.2921692703183707, + "step": 13069, + "train/sim_loss": 0.02984565496444702 + }, + { + "epoch": 1.2921692703183707, + "step": 13069, + "train/total_loss": 0.02984568290412426 + }, + { + "entropy": 9.406621932983398, + "epoch": 1.292268143167886, + "mean_token_accuracy": 0.8754246830940247, + "num_tokens": 13288995.0, + "step": 13070, + "train/ce_loss": 0.4735138416290283 + }, + { + "epoch": 1.292268143167886, + "step": 13070, + "train/sim_loss": 0.07045257091522217 + }, + { + "epoch": 1.292268143167886, + "step": 13070, + "train/total_loss": 0.11780396103858948 + }, + { + "entropy": 9.62076187133789, + "epoch": 1.2923670160174017, + "mean_token_accuracy": 0.8633193969726562, + "num_tokens": 13306221.0, + "step": 13071, + "train/ce_loss": 0.2859983742237091 + }, + { + "epoch": 1.2923670160174017, + "step": 13071, + "train/sim_loss": 0.02640688419342041 + }, + { + "epoch": 1.2923670160174017, + "step": 13071, + "train/total_loss": 0.0550067201256752 + }, + { + "entropy": 9.41617202758789, + "epoch": 1.2924658888669172, + "mean_token_accuracy": 0.8579310178756714, + "num_tokens": 13316650.0, + "step": 13072, + "train/ce_loss": 0.4691223204135895 + }, + { + "epoch": 1.2924658888669172, + "step": 13072, + "train/sim_loss": 0.027465462684631348 + }, + { + "epoch": 1.2924658888669172, + "step": 13072, + "train/total_loss": 0.07437770068645477 + }, + { + "entropy": 9.544830322265625, + "epoch": 1.2925647617164326, + "mean_token_accuracy": 0.8479087352752686, + "num_tokens": 13329826.0, + "step": 13073, + "train/ce_loss": 0.3885248303413391 + }, + { + "epoch": 1.2925647617164326, + "step": 13073, + "train/sim_loss": 0.058549463748931885 + }, + { + "epoch": 1.2925647617164326, + "step": 13073, + "train/total_loss": 0.0974019467830658 + }, + { + "entropy": 9.366340637207031, + "epoch": 1.2926636345659481, + "mean_token_accuracy": 0.8333333134651184, + "num_tokens": 13343653.0, + "step": 13074, + "train/ce_loss": 0.4542709290981293 + }, + { + "epoch": 1.2926636345659481, + "step": 13074, + "train/sim_loss": 0.027785897254943848 + }, + { + "epoch": 1.2926636345659481, + "step": 13074, + "train/total_loss": 0.07321299612522125 + }, + { + "entropy": 9.118303298950195, + "epoch": 1.2927625074154636, + "mean_token_accuracy": 0.8617157340049744, + "num_tokens": 13353351.0, + "step": 13075, + "train/ce_loss": 0.29134035110473633 + }, + { + "epoch": 1.2927625074154636, + "step": 13075, + "train/sim_loss": 0.05139780044555664 + }, + { + "epoch": 1.2927625074154636, + "step": 13075, + "train/total_loss": 0.08053183555603027 + }, + { + "entropy": 9.473846435546875, + "epoch": 1.2928613802649793, + "mean_token_accuracy": 0.8260233998298645, + "num_tokens": 13362900.0, + "step": 13076, + "train/ce_loss": 0.425932914018631 + }, + { + "epoch": 1.2928613802649793, + "step": 13076, + "train/sim_loss": 0.044214487075805664 + }, + { + "epoch": 1.2928613802649793, + "step": 13076, + "train/total_loss": 0.08680777996778488 + }, + { + "entropy": 9.86034107208252, + "epoch": 1.2929602531144948, + "mean_token_accuracy": 0.8295454382896423, + "num_tokens": 13372271.0, + "step": 13077, + "train/ce_loss": 3.034389521872072e-07 + }, + { + "epoch": 1.2929602531144948, + "step": 13077, + "train/sim_loss": 0.01226741075515747 + }, + { + "epoch": 1.2929602531144948, + "step": 13077, + "train/total_loss": 0.012267441488802433 + }, + { + "entropy": 9.4127197265625, + "epoch": 1.2930591259640103, + "mean_token_accuracy": 0.8975672125816345, + "num_tokens": 13384405.0, + "step": 13078, + "train/ce_loss": 0.15102115273475647 + }, + { + "epoch": 1.2930591259640103, + "step": 13078, + "train/sim_loss": 0.021513819694519043 + }, + { + "epoch": 1.2930591259640103, + "step": 13078, + "train/total_loss": 0.03661593422293663 + }, + { + "entropy": 9.622239112854004, + "epoch": 1.2931579988135258, + "mean_token_accuracy": 0.8808724880218506, + "num_tokens": 13400157.0, + "step": 13079, + "train/ce_loss": 0.4750438928604126 + }, + { + "epoch": 1.2931579988135258, + "step": 13079, + "train/sim_loss": 0.0881049633026123 + }, + { + "epoch": 1.2931579988135258, + "step": 13079, + "train/total_loss": 0.13560935854911804 + }, + { + "epoch": 1.2932568716630413, + "grad_norm": 0.5940082669258118, + "learning_rate": 6.7687781239183115e-06, + "loss": 0.0853, + "step": 13080 + }, + { + "entropy": 9.124014854431152, + "epoch": 1.2932568716630413, + "mean_token_accuracy": 0.8927038908004761, + "num_tokens": 13412323.0, + "step": 13080, + "train/ce_loss": 3.315276160265057e-07 + }, + { + "epoch": 1.2932568716630413, + "step": 13080, + "train/sim_loss": 0.03536707162857056 + }, + { + "epoch": 1.2932568716630413, + "step": 13080, + "train/total_loss": 0.03536710515618324 + }, + { + "entropy": 9.405513763427734, + "epoch": 1.293355744512557, + "mean_token_accuracy": 0.859375, + "num_tokens": 13419843.0, + "step": 13081, + "train/ce_loss": 0.5409700870513916 + }, + { + "epoch": 1.293355744512557, + "step": 13081, + "train/sim_loss": 0.05448406934738159 + }, + { + "epoch": 1.293355744512557, + "step": 13081, + "train/total_loss": 0.10858108103275299 + }, + { + "entropy": 9.556428909301758, + "epoch": 1.2934546173620725, + "mean_token_accuracy": 0.8547249436378479, + "num_tokens": 13428182.0, + "step": 13082, + "train/ce_loss": 3.2416448902949924e-07 + }, + { + "epoch": 1.2934546173620725, + "step": 13082, + "train/sim_loss": 0.019087672233581543 + }, + { + "epoch": 1.2934546173620725, + "step": 13082, + "train/total_loss": 0.01908770389854908 + }, + { + "entropy": 9.124223709106445, + "epoch": 1.293553490211588, + "mean_token_accuracy": 0.8767772316932678, + "num_tokens": 13436475.0, + "step": 13083, + "train/ce_loss": 0.398989200592041 + }, + { + "epoch": 1.293553490211588, + "step": 13083, + "train/sim_loss": 0.054577767848968506 + }, + { + "epoch": 1.293553490211588, + "step": 13083, + "train/total_loss": 0.09447668492794037 + }, + { + "entropy": 8.77580451965332, + "epoch": 1.2936523630611034, + "mean_token_accuracy": 0.887837827205658, + "num_tokens": 13452360.0, + "step": 13084, + "train/ce_loss": 0.40761542320251465 + }, + { + "epoch": 1.2936523630611034, + "step": 13084, + "train/sim_loss": 0.041974008083343506 + }, + { + "epoch": 1.2936523630611034, + "step": 13084, + "train/total_loss": 0.08273555338382721 + }, + { + "entropy": 9.342655181884766, + "epoch": 1.293751235910619, + "mean_token_accuracy": 0.8850574493408203, + "num_tokens": 13464899.0, + "step": 13085, + "train/ce_loss": 0.5208089351654053 + }, + { + "epoch": 1.293751235910619, + "step": 13085, + "train/sim_loss": 0.0366782546043396 + }, + { + "epoch": 1.293751235910619, + "step": 13085, + "train/total_loss": 0.0887591540813446 + }, + { + "entropy": 8.677899360656738, + "epoch": 1.2938501087601344, + "mean_token_accuracy": 0.8240252733230591, + "num_tokens": 13474765.0, + "step": 13086, + "train/ce_loss": 0.7126064896583557 + }, + { + "epoch": 1.2938501087601344, + "step": 13086, + "train/sim_loss": 0.04331141710281372 + }, + { + "epoch": 1.2938501087601344, + "step": 13086, + "train/total_loss": 0.11457207053899765 + }, + { + "entropy": 9.575531005859375, + "epoch": 1.2939489816096499, + "mean_token_accuracy": 0.9041533470153809, + "num_tokens": 13489866.0, + "step": 13087, + "train/ce_loss": 0.3832440972328186 + }, + { + "epoch": 1.2939489816096499, + "step": 13087, + "train/sim_loss": 0.0184134840965271 + }, + { + "epoch": 1.2939489816096499, + "step": 13087, + "train/total_loss": 0.05673789605498314 + }, + { + "entropy": 9.642181396484375, + "epoch": 1.2940478544591656, + "mean_token_accuracy": 0.9011345505714417, + "num_tokens": 13502272.0, + "step": 13088, + "train/ce_loss": 0.686722457408905 + }, + { + "epoch": 1.2940478544591656, + "step": 13088, + "train/sim_loss": 0.05413675308227539 + }, + { + "epoch": 1.2940478544591656, + "step": 13088, + "train/total_loss": 0.12280900031328201 + }, + { + "entropy": 8.789238929748535, + "epoch": 1.294146727308681, + "mean_token_accuracy": 0.8306537866592407, + "num_tokens": 13511080.0, + "step": 13089, + "train/ce_loss": 0.5047439932823181 + }, + { + "epoch": 1.294146727308681, + "step": 13089, + "train/sim_loss": 0.03142380714416504 + }, + { + "epoch": 1.294146727308681, + "step": 13089, + "train/total_loss": 0.08189821243286133 + }, + { + "entropy": 9.48184871673584, + "epoch": 1.2942456001581966, + "mean_token_accuracy": 0.8896210789680481, + "num_tokens": 13527292.0, + "step": 13090, + "train/ce_loss": 0.36951744556427 + }, + { + "epoch": 1.2942456001581966, + "step": 13090, + "train/sim_loss": 0.011165142059326172 + }, + { + "epoch": 1.2942456001581966, + "step": 13090, + "train/total_loss": 0.04811688885092735 + }, + { + "entropy": 9.827218055725098, + "epoch": 1.294344473007712, + "mean_token_accuracy": 0.8034397959709167, + "num_tokens": 13537780.0, + "step": 13091, + "train/ce_loss": 0.5967043042182922 + }, + { + "epoch": 1.294344473007712, + "step": 13091, + "train/sim_loss": 0.0839613676071167 + }, + { + "epoch": 1.294344473007712, + "step": 13091, + "train/total_loss": 0.14363180100917816 + }, + { + "entropy": 9.241697311401367, + "epoch": 1.2944433458572275, + "mean_token_accuracy": 0.8678898811340332, + "num_tokens": 13546228.0, + "step": 13092, + "train/ce_loss": 1.5948562577250414e-06 + }, + { + "epoch": 1.2944433458572275, + "step": 13092, + "train/sim_loss": 0.044665515422821045 + }, + { + "epoch": 1.2944433458572275, + "step": 13092, + "train/total_loss": 0.04466567561030388 + }, + { + "entropy": 9.764741897583008, + "epoch": 1.2945422187067432, + "mean_token_accuracy": 0.9053254723548889, + "num_tokens": 13559294.0, + "step": 13093, + "train/ce_loss": 0.6199662089347839 + }, + { + "epoch": 1.2945422187067432, + "step": 13093, + "train/sim_loss": 0.051414430141448975 + }, + { + "epoch": 1.2945422187067432, + "step": 13093, + "train/total_loss": 0.1134110540151596 + }, + { + "entropy": 9.120697021484375, + "epoch": 1.2946410915562587, + "mean_token_accuracy": 0.8648648858070374, + "num_tokens": 13568304.0, + "step": 13094, + "train/ce_loss": 0.22121326625347137 + }, + { + "epoch": 1.2946410915562587, + "step": 13094, + "train/sim_loss": 0.037068188190460205 + }, + { + "epoch": 1.2946410915562587, + "step": 13094, + "train/total_loss": 0.05918951332569122 + }, + { + "entropy": 9.68697738647461, + "epoch": 1.2947399644057742, + "mean_token_accuracy": 0.9045454263687134, + "num_tokens": 13582491.0, + "step": 13095, + "train/ce_loss": 0.32061225175857544 + }, + { + "epoch": 1.2947399644057742, + "step": 13095, + "train/sim_loss": 0.0158158540725708 + }, + { + "epoch": 1.2947399644057742, + "step": 13095, + "train/total_loss": 0.047877080738544464 + }, + { + "entropy": 9.044538497924805, + "epoch": 1.2948388372552897, + "mean_token_accuracy": 0.8446691036224365, + "num_tokens": 13595863.0, + "step": 13096, + "train/ce_loss": 0.5701990127563477 + }, + { + "epoch": 1.2948388372552897, + "step": 13096, + "train/sim_loss": 0.07687157392501831 + }, + { + "epoch": 1.2948388372552897, + "step": 13096, + "train/total_loss": 0.13389147818088531 + }, + { + "entropy": 9.527178764343262, + "epoch": 1.2949377101048052, + "mean_token_accuracy": 0.8921348452568054, + "num_tokens": 13608402.0, + "step": 13097, + "train/ce_loss": 6.632745339629764e-07 + }, + { + "epoch": 1.2949377101048052, + "step": 13097, + "train/sim_loss": 0.019504249095916748 + }, + { + "epoch": 1.2949377101048052, + "step": 13097, + "train/total_loss": 0.01950431615114212 + }, + { + "entropy": 9.698792457580566, + "epoch": 1.2950365829543207, + "mean_token_accuracy": 0.8473581075668335, + "num_tokens": 13621105.0, + "step": 13098, + "train/ce_loss": 0.6170358657836914 + }, + { + "epoch": 1.2950365829543207, + "step": 13098, + "train/sim_loss": 0.023021936416625977 + }, + { + "epoch": 1.2950365829543207, + "step": 13098, + "train/total_loss": 0.0847255289554596 + }, + { + "entropy": 8.972661972045898, + "epoch": 1.2951354558038362, + "mean_token_accuracy": 0.8751472234725952, + "num_tokens": 13630339.0, + "step": 13099, + "train/ce_loss": 0.17931701242923737 + }, + { + "epoch": 1.2951354558038362, + "step": 13099, + "train/sim_loss": 0.11662393808364868 + }, + { + "epoch": 1.2951354558038362, + "step": 13099, + "train/total_loss": 0.1345556378364563 + }, + { + "epoch": 1.2952343286533519, + "grad_norm": 0.5291997194290161, + "learning_rate": 6.763833259160363e-06, + "loss": 0.0776, + "step": 13100 + }, + { + "entropy": 9.281671524047852, + "epoch": 1.2952343286533519, + "mean_token_accuracy": 0.8405627012252808, + "num_tokens": 13641934.0, + "step": 13100, + "train/ce_loss": 0.6161355972290039 + }, + { + "epoch": 1.2952343286533519, + "step": 13100, + "train/sim_loss": 0.04977136850357056 + }, + { + "epoch": 1.2952343286533519, + "step": 13100, + "train/total_loss": 0.11138492822647095 + }, + { + "entropy": 9.592227935791016, + "epoch": 1.2953332015028673, + "mean_token_accuracy": 0.9016393423080444, + "num_tokens": 13650605.0, + "step": 13101, + "train/ce_loss": 0.5007851123809814 + }, + { + "epoch": 1.2953332015028673, + "step": 13101, + "train/sim_loss": 0.03263533115386963 + }, + { + "epoch": 1.2953332015028673, + "step": 13101, + "train/total_loss": 0.08271384239196777 + }, + { + "entropy": 9.455703735351562, + "epoch": 1.2954320743523828, + "mean_token_accuracy": 0.8732572793960571, + "num_tokens": 13665855.0, + "step": 13102, + "train/ce_loss": 0.5191970467567444 + }, + { + "epoch": 1.2954320743523828, + "step": 13102, + "train/sim_loss": 0.07345271110534668 + }, + { + "epoch": 1.2954320743523828, + "step": 13102, + "train/total_loss": 0.12537240982055664 + }, + { + "entropy": 9.503490447998047, + "epoch": 1.2955309472018983, + "mean_token_accuracy": 0.8457648754119873, + "num_tokens": 13679828.0, + "step": 13103, + "train/ce_loss": 0.5015388131141663 + }, + { + "epoch": 1.2955309472018983, + "step": 13103, + "train/sim_loss": 0.05129271745681763 + }, + { + "epoch": 1.2955309472018983, + "step": 13103, + "train/total_loss": 0.10144659876823425 + }, + { + "entropy": 9.691165924072266, + "epoch": 1.2956298200514138, + "mean_token_accuracy": 0.8900169134140015, + "num_tokens": 13696005.0, + "step": 13104, + "train/ce_loss": 0.3017542064189911 + }, + { + "epoch": 1.2956298200514138, + "step": 13104, + "train/sim_loss": 0.0735321044921875 + }, + { + "epoch": 1.2956298200514138, + "step": 13104, + "train/total_loss": 0.10370752215385437 + }, + { + "entropy": 9.408878326416016, + "epoch": 1.2957286929009295, + "mean_token_accuracy": 0.865311324596405, + "num_tokens": 13712312.0, + "step": 13105, + "train/ce_loss": 0.1951957792043686 + }, + { + "epoch": 1.2957286929009295, + "step": 13105, + "train/sim_loss": 0.04794180393218994 + }, + { + "epoch": 1.2957286929009295, + "step": 13105, + "train/total_loss": 0.06746138632297516 + }, + { + "entropy": 9.274435043334961, + "epoch": 1.295827565750445, + "mean_token_accuracy": 0.8407557606697083, + "num_tokens": 13721451.0, + "step": 13106, + "train/ce_loss": 0.45940858125686646 + }, + { + "epoch": 1.295827565750445, + "step": 13106, + "train/sim_loss": 0.02748185396194458 + }, + { + "epoch": 1.295827565750445, + "step": 13106, + "train/total_loss": 0.07342271506786346 + }, + { + "entropy": 10.012338638305664, + "epoch": 1.2959264385999605, + "mean_token_accuracy": 0.9556313753128052, + "num_tokens": 13728390.0, + "step": 13107, + "train/ce_loss": 4.351160896476358e-06 + }, + { + "epoch": 1.2959264385999605, + "step": 13107, + "train/sim_loss": 0.04941201210021973 + }, + { + "epoch": 1.2959264385999605, + "step": 13107, + "train/total_loss": 0.04941244795918465 + }, + { + "entropy": 9.509374618530273, + "epoch": 1.296025311449476, + "mean_token_accuracy": 0.8449198007583618, + "num_tokens": 13743743.0, + "step": 13108, + "train/ce_loss": 0.2931765615940094 + }, + { + "epoch": 1.296025311449476, + "step": 13108, + "train/sim_loss": 0.02236419916152954 + }, + { + "epoch": 1.296025311449476, + "step": 13108, + "train/total_loss": 0.05168185383081436 + }, + { + "entropy": 9.284505844116211, + "epoch": 1.2961241842989915, + "mean_token_accuracy": 0.8208954930305481, + "num_tokens": 13757060.0, + "step": 13109, + "train/ce_loss": 0.20285698771476746 + }, + { + "epoch": 1.2961241842989915, + "step": 13109, + "train/sim_loss": 0.01105189323425293 + }, + { + "epoch": 1.2961241842989915, + "step": 13109, + "train/total_loss": 0.031337592750787735 + }, + { + "entropy": 9.87600326538086, + "epoch": 1.296223057148507, + "mean_token_accuracy": 0.906521737575531, + "num_tokens": 13773099.0, + "step": 13110, + "train/ce_loss": 4.65428826146308e-07 + }, + { + "epoch": 1.296223057148507, + "step": 13110, + "train/sim_loss": 0.01277303695678711 + }, + { + "epoch": 1.296223057148507, + "step": 13110, + "train/total_loss": 0.01277308352291584 + }, + { + "entropy": 9.226179122924805, + "epoch": 1.2963219299980224, + "mean_token_accuracy": 0.8820093274116516, + "num_tokens": 13784676.0, + "step": 13111, + "train/ce_loss": 0.19234341382980347 + }, + { + "epoch": 1.2963219299980224, + "step": 13111, + "train/sim_loss": 0.03894931077957153 + }, + { + "epoch": 1.2963219299980224, + "step": 13111, + "train/total_loss": 0.05818365514278412 + }, + { + "entropy": 9.308832168579102, + "epoch": 1.2964208028475381, + "mean_token_accuracy": 0.8698011040687561, + "num_tokens": 13795289.0, + "step": 13112, + "train/ce_loss": 0.8517903089523315 + }, + { + "epoch": 1.2964208028475381, + "step": 13112, + "train/sim_loss": 0.09312289953231812 + }, + { + "epoch": 1.2964208028475381, + "step": 13112, + "train/total_loss": 0.17830193042755127 + }, + { + "entropy": 9.723417282104492, + "epoch": 1.2965196756970536, + "mean_token_accuracy": 0.8886076211929321, + "num_tokens": 13802282.0, + "step": 13113, + "train/ce_loss": 1.680291006778134e-06 + }, + { + "epoch": 1.2965196756970536, + "step": 13113, + "train/sim_loss": 0.03993880748748779 + }, + { + "epoch": 1.2965196756970536, + "step": 13113, + "train/total_loss": 0.039938975125551224 + }, + { + "entropy": 9.202308654785156, + "epoch": 1.296618548546569, + "mean_token_accuracy": 0.8139796853065491, + "num_tokens": 13819706.0, + "step": 13114, + "train/ce_loss": 0.9013089537620544 + }, + { + "epoch": 1.296618548546569, + "step": 13114, + "train/sim_loss": 0.0649614930152893 + }, + { + "epoch": 1.296618548546569, + "step": 13114, + "train/total_loss": 0.15509238839149475 + }, + { + "entropy": 9.314160346984863, + "epoch": 1.2967174213960846, + "mean_token_accuracy": 0.8836292028427124, + "num_tokens": 13828037.0, + "step": 13115, + "train/ce_loss": 2.449684870953206e-06 + }, + { + "epoch": 1.2967174213960846, + "step": 13115, + "train/sim_loss": 0.032829225063323975 + }, + { + "epoch": 1.2967174213960846, + "step": 13115, + "train/total_loss": 0.03282947093248367 + }, + { + "entropy": 8.944099426269531, + "epoch": 1.2968162942456, + "mean_token_accuracy": 0.8250377178192139, + "num_tokens": 13844207.0, + "step": 13116, + "train/ce_loss": 9.040062423082418e-07 + }, + { + "epoch": 1.2968162942456, + "step": 13116, + "train/sim_loss": 0.025196731090545654 + }, + { + "epoch": 1.2968162942456, + "step": 13116, + "train/total_loss": 0.025196822360157967 + }, + { + "entropy": 8.897087097167969, + "epoch": 1.2969151670951158, + "mean_token_accuracy": 0.8104737997055054, + "num_tokens": 13852878.0, + "step": 13117, + "train/ce_loss": 0.7091268301010132 + }, + { + "epoch": 1.2969151670951158, + "step": 13117, + "train/sim_loss": 0.020534753799438477 + }, + { + "epoch": 1.2969151670951158, + "step": 13117, + "train/total_loss": 0.09144743531942368 + }, + { + "entropy": 9.762693405151367, + "epoch": 1.2970140399446313, + "mean_token_accuracy": 0.8555347323417664, + "num_tokens": 13861696.0, + "step": 13118, + "train/ce_loss": 0.4173201620578766 + }, + { + "epoch": 1.2970140399446313, + "step": 13118, + "train/sim_loss": 0.04226189851760864 + }, + { + "epoch": 1.2970140399446313, + "step": 13118, + "train/total_loss": 0.08399391174316406 + }, + { + "entropy": 9.759932518005371, + "epoch": 1.2971129127941468, + "mean_token_accuracy": 0.8089887499809265, + "num_tokens": 13874056.0, + "step": 13119, + "train/ce_loss": 2.1022462988184998e-06 + }, + { + "epoch": 1.2971129127941468, + "step": 13119, + "train/sim_loss": 0.05022847652435303 + }, + { + "epoch": 1.2971129127941468, + "step": 13119, + "train/total_loss": 0.05022868514060974 + }, + { + "epoch": 1.2972117856436622, + "grad_norm": 0.623751163482666, + "learning_rate": 6.758888394402414e-06, + "loss": 0.0802, + "step": 13120 + }, + { + "entropy": 10.192384719848633, + "epoch": 1.2972117856436622, + "mean_token_accuracy": 0.9220563769340515, + "num_tokens": 13886343.0, + "step": 13120, + "train/ce_loss": 0.6319201588630676 + }, + { + "epoch": 1.2972117856436622, + "step": 13120, + "train/sim_loss": 0.02441030740737915 + }, + { + "epoch": 1.2972117856436622, + "step": 13120, + "train/total_loss": 0.08760232478380203 + }, + { + "entropy": 9.19694709777832, + "epoch": 1.2973106584931777, + "mean_token_accuracy": 0.861261248588562, + "num_tokens": 13896163.0, + "step": 13121, + "train/ce_loss": 0.6540277600288391 + }, + { + "epoch": 1.2973106584931777, + "step": 13121, + "train/sim_loss": 0.030909180641174316 + }, + { + "epoch": 1.2973106584931777, + "step": 13121, + "train/total_loss": 0.09631195664405823 + }, + { + "entropy": 8.704383850097656, + "epoch": 1.2974095313426934, + "mean_token_accuracy": 0.8450134992599487, + "num_tokens": 13907783.0, + "step": 13122, + "train/ce_loss": 0.24624699354171753 + }, + { + "epoch": 1.2974095313426934, + "step": 13122, + "train/sim_loss": 0.05740994215011597 + }, + { + "epoch": 1.2974095313426934, + "step": 13122, + "train/total_loss": 0.0820346400141716 + }, + { + "entropy": 9.449711799621582, + "epoch": 1.2975084041922087, + "mean_token_accuracy": 0.8057742714881897, + "num_tokens": 13921317.0, + "step": 13123, + "train/ce_loss": 0.9785907864570618 + }, + { + "epoch": 1.2975084041922087, + "step": 13123, + "train/sim_loss": 0.047439396381378174 + }, + { + "epoch": 1.2975084041922087, + "step": 13123, + "train/total_loss": 0.14529848098754883 + }, + { + "entropy": 9.544682502746582, + "epoch": 1.2976072770417244, + "mean_token_accuracy": 0.8949843049049377, + "num_tokens": 13936378.0, + "step": 13124, + "train/ce_loss": 1.857876100075373e-06 + }, + { + "epoch": 1.2976072770417244, + "step": 13124, + "train/sim_loss": 0.034526824951171875 + }, + { + "epoch": 1.2976072770417244, + "step": 13124, + "train/total_loss": 0.0345270112156868 + }, + { + "entropy": 9.719030380249023, + "epoch": 1.29770614989124, + "mean_token_accuracy": 0.8160779476165771, + "num_tokens": 13948268.0, + "step": 13125, + "train/ce_loss": 0.6971856951713562 + }, + { + "epoch": 1.29770614989124, + "step": 13125, + "train/sim_loss": 0.08775758743286133 + }, + { + "epoch": 1.29770614989124, + "step": 13125, + "train/total_loss": 0.15747615694999695 + }, + { + "entropy": 9.344571113586426, + "epoch": 1.2978050227407554, + "mean_token_accuracy": 0.898383378982544, + "num_tokens": 13959526.0, + "step": 13126, + "train/ce_loss": 0.29354315996170044 + }, + { + "epoch": 1.2978050227407554, + "step": 13126, + "train/sim_loss": 0.06936240196228027 + }, + { + "epoch": 1.2978050227407554, + "step": 13126, + "train/total_loss": 0.09871672093868256 + }, + { + "entropy": 9.316919326782227, + "epoch": 1.2979038955902709, + "mean_token_accuracy": 0.8283092975616455, + "num_tokens": 13970609.0, + "step": 13127, + "train/ce_loss": 0.41934823989868164 + }, + { + "epoch": 1.2979038955902709, + "step": 13127, + "train/sim_loss": 0.04355788230895996 + }, + { + "epoch": 1.2979038955902709, + "step": 13127, + "train/total_loss": 0.08549270778894424 + }, + { + "entropy": 9.821736335754395, + "epoch": 1.2980027684397863, + "mean_token_accuracy": 0.8934240341186523, + "num_tokens": 13984195.0, + "step": 13128, + "train/ce_loss": 1.8347589048062218e-06 + }, + { + "epoch": 1.2980027684397863, + "step": 13128, + "train/sim_loss": 0.02890145778656006 + }, + { + "epoch": 1.2980027684397863, + "step": 13128, + "train/total_loss": 0.028901642188429832 + }, + { + "entropy": 9.239294052124023, + "epoch": 1.298101641289302, + "mean_token_accuracy": 0.8551068902015686, + "num_tokens": 13992770.0, + "step": 13129, + "train/ce_loss": 0.39185091853141785 + }, + { + "epoch": 1.298101641289302, + "step": 13129, + "train/sim_loss": 0.069385826587677 + }, + { + "epoch": 1.298101641289302, + "step": 13129, + "train/total_loss": 0.10857091844081879 + }, + { + "entropy": 9.305929183959961, + "epoch": 1.2982005141388175, + "mean_token_accuracy": 0.8497596383094788, + "num_tokens": 14005026.0, + "step": 13130, + "train/ce_loss": 0.4627133905887604 + }, + { + "epoch": 1.2982005141388175, + "step": 13130, + "train/sim_loss": 0.0468592643737793 + }, + { + "epoch": 1.2982005141388175, + "step": 13130, + "train/total_loss": 0.09313060343265533 + }, + { + "entropy": 9.747225761413574, + "epoch": 1.298299386988333, + "mean_token_accuracy": 0.8474074006080627, + "num_tokens": 14019287.0, + "step": 13131, + "train/ce_loss": 0.25270092487335205 + }, + { + "epoch": 1.298299386988333, + "step": 13131, + "train/sim_loss": 0.027756214141845703 + }, + { + "epoch": 1.298299386988333, + "step": 13131, + "train/total_loss": 0.05302630737423897 + }, + { + "entropy": 9.5150728225708, + "epoch": 1.2983982598378485, + "mean_token_accuracy": 0.8503937125205994, + "num_tokens": 14031146.0, + "step": 13132, + "train/ce_loss": 0.7391044497489929 + }, + { + "epoch": 1.2983982598378485, + "step": 13132, + "train/sim_loss": 0.0504419207572937 + }, + { + "epoch": 1.2983982598378485, + "step": 13132, + "train/total_loss": 0.124352365732193 + }, + { + "entropy": 8.800956726074219, + "epoch": 1.298497132687364, + "mean_token_accuracy": 0.8109965920448303, + "num_tokens": 14043263.0, + "step": 13133, + "train/ce_loss": 0.8692041039466858 + }, + { + "epoch": 1.298497132687364, + "step": 13133, + "train/sim_loss": 0.026735961437225342 + }, + { + "epoch": 1.298497132687364, + "step": 13133, + "train/total_loss": 0.11365637183189392 + }, + { + "entropy": 9.24026107788086, + "epoch": 1.2985960055368797, + "mean_token_accuracy": 0.8144208192825317, + "num_tokens": 14060653.0, + "step": 13134, + "train/ce_loss": 0.4899321496486664 + }, + { + "epoch": 1.2985960055368797, + "step": 13134, + "train/sim_loss": 0.02009439468383789 + }, + { + "epoch": 1.2985960055368797, + "step": 13134, + "train/total_loss": 0.06908760964870453 + }, + { + "entropy": 9.38184642791748, + "epoch": 1.298694878386395, + "mean_token_accuracy": 0.8172323703765869, + "num_tokens": 14075478.0, + "step": 13135, + "train/ce_loss": 0.5574176907539368 + }, + { + "epoch": 1.298694878386395, + "step": 13135, + "train/sim_loss": 0.028722167015075684 + }, + { + "epoch": 1.298694878386395, + "step": 13135, + "train/total_loss": 0.0844639390707016 + }, + { + "entropy": 9.396990776062012, + "epoch": 1.2987937512359107, + "mean_token_accuracy": 0.8035714030265808, + "num_tokens": 14093390.0, + "step": 13136, + "train/ce_loss": 0.3155410885810852 + }, + { + "epoch": 1.2987937512359107, + "step": 13136, + "train/sim_loss": 0.06227445602416992 + }, + { + "epoch": 1.2987937512359107, + "step": 13136, + "train/total_loss": 0.09382856637239456 + }, + { + "entropy": 9.506364822387695, + "epoch": 1.2988926240854262, + "mean_token_accuracy": 0.8092909455299377, + "num_tokens": 14102657.0, + "step": 13137, + "train/ce_loss": 0.6746295094490051 + }, + { + "epoch": 1.2988926240854262, + "step": 13137, + "train/sim_loss": 0.05371814966201782 + }, + { + "epoch": 1.2988926240854262, + "step": 13137, + "train/total_loss": 0.12118110060691833 + }, + { + "entropy": 9.751786231994629, + "epoch": 1.2989914969349416, + "mean_token_accuracy": 0.8434237837791443, + "num_tokens": 14114209.0, + "step": 13138, + "train/ce_loss": 4.072648152941838e-06 + }, + { + "epoch": 1.2989914969349416, + "step": 13138, + "train/sim_loss": 0.04010665416717529 + }, + { + "epoch": 1.2989914969349416, + "step": 13138, + "train/total_loss": 0.040107060223817825 + }, + { + "entropy": 9.258001327514648, + "epoch": 1.2990903697844571, + "mean_token_accuracy": 0.8510638475418091, + "num_tokens": 14128153.0, + "step": 13139, + "train/ce_loss": 0.6846909523010254 + }, + { + "epoch": 1.2990903697844571, + "step": 13139, + "train/sim_loss": 0.07423406839370728 + }, + { + "epoch": 1.2990903697844571, + "step": 13139, + "train/total_loss": 0.14270317554473877 + }, + { + "epoch": 1.2991892426339726, + "grad_norm": 0.5746552348136902, + "learning_rate": 6.753943529644465e-06, + "loss": 0.0879, + "step": 13140 + }, + { + "entropy": 9.61387825012207, + "epoch": 1.2991892426339726, + "mean_token_accuracy": 0.832504153251648, + "num_tokens": 14141599.0, + "step": 13140, + "train/ce_loss": 7.235422572193784e-07 + }, + { + "epoch": 1.2991892426339726, + "step": 13140, + "train/sim_loss": 0.03465205430984497 + }, + { + "epoch": 1.2991892426339726, + "step": 13140, + "train/total_loss": 0.03465212509036064 + }, + { + "entropy": 9.359336853027344, + "epoch": 1.2992881154834883, + "mean_token_accuracy": 0.8723404407501221, + "num_tokens": 14156855.0, + "step": 13141, + "train/ce_loss": 0.45386970043182373 + }, + { + "epoch": 1.2992881154834883, + "step": 13141, + "train/sim_loss": 0.03487503528594971 + }, + { + "epoch": 1.2992881154834883, + "step": 13141, + "train/total_loss": 0.08026200532913208 + }, + { + "entropy": 10.298580169677734, + "epoch": 1.2993869883330038, + "mean_token_accuracy": 0.8993902206420898, + "num_tokens": 14171593.0, + "step": 13142, + "train/ce_loss": 8.562597031414043e-07 + }, + { + "epoch": 1.2993869883330038, + "step": 13142, + "train/sim_loss": 0.013264298439025879 + }, + { + "epoch": 1.2993869883330038, + "step": 13142, + "train/total_loss": 0.013264384120702744 + }, + { + "entropy": 9.313058853149414, + "epoch": 1.2994858611825193, + "mean_token_accuracy": 0.8779134154319763, + "num_tokens": 14187635.0, + "step": 13143, + "train/ce_loss": 0.1529359668493271 + }, + { + "epoch": 1.2994858611825193, + "step": 13143, + "train/sim_loss": 0.06065011024475098 + }, + { + "epoch": 1.2994858611825193, + "step": 13143, + "train/total_loss": 0.0759437084197998 + }, + { + "entropy": 10.214778900146484, + "epoch": 1.2995847340320348, + "mean_token_accuracy": 0.8894736766815186, + "num_tokens": 14196507.0, + "step": 13144, + "train/ce_loss": 1.1556688832570217e-06 + }, + { + "epoch": 1.2995847340320348, + "step": 13144, + "train/sim_loss": 0.018024325370788574 + }, + { + "epoch": 1.2995847340320348, + "step": 13144, + "train/total_loss": 0.018024440854787827 + }, + { + "entropy": 9.373856544494629, + "epoch": 1.2996836068815503, + "mean_token_accuracy": 0.813747227191925, + "num_tokens": 14209681.0, + "step": 13145, + "train/ce_loss": 0.4630506634712219 + }, + { + "epoch": 1.2996836068815503, + "step": 13145, + "train/sim_loss": 0.04540371894836426 + }, + { + "epoch": 1.2996836068815503, + "step": 13145, + "train/total_loss": 0.09170878678560257 + }, + { + "entropy": 9.427072525024414, + "epoch": 1.299782479731066, + "mean_token_accuracy": 0.8947368264198303, + "num_tokens": 14220749.0, + "step": 13146, + "train/ce_loss": 6.661920792794263e-07 + }, + { + "epoch": 1.299782479731066, + "step": 13146, + "train/sim_loss": 0.031617939472198486 + }, + { + "epoch": 1.299782479731066, + "step": 13146, + "train/total_loss": 0.03161800652742386 + }, + { + "entropy": 9.386902809143066, + "epoch": 1.2998813525805812, + "mean_token_accuracy": 0.8577464818954468, + "num_tokens": 14231744.0, + "step": 13147, + "train/ce_loss": 0.3469593822956085 + }, + { + "epoch": 1.2998813525805812, + "step": 13147, + "train/sim_loss": 0.018856167793273926 + }, + { + "epoch": 1.2998813525805812, + "step": 13147, + "train/total_loss": 0.05355210602283478 + }, + { + "entropy": 10.146183013916016, + "epoch": 1.299980225430097, + "mean_token_accuracy": 0.890489935874939, + "num_tokens": 14244286.0, + "step": 13148, + "train/ce_loss": 2.614911863929592e-06 + }, + { + "epoch": 1.299980225430097, + "step": 13148, + "train/sim_loss": 0.03158068656921387 + }, + { + "epoch": 1.299980225430097, + "step": 13148, + "train/total_loss": 0.03158094733953476 + }, + { + "entropy": 9.640454292297363, + "epoch": 1.3000790982796124, + "mean_token_accuracy": 0.8057324886322021, + "num_tokens": 14257377.0, + "step": 13149, + "train/ce_loss": 7.14482041530573e-07 + }, + { + "epoch": 1.3000790982796124, + "step": 13149, + "train/sim_loss": 0.03323554992675781 + }, + { + "epoch": 1.3000790982796124, + "step": 13149, + "train/total_loss": 0.03323562070727348 + }, + { + "entropy": 9.666902542114258, + "epoch": 1.300177971129128, + "mean_token_accuracy": 0.8621169924736023, + "num_tokens": 14266955.0, + "step": 13150, + "train/ce_loss": 0.6673262715339661 + }, + { + "epoch": 1.300177971129128, + "step": 13150, + "train/sim_loss": 0.044910550117492676 + }, + { + "epoch": 1.300177971129128, + "step": 13150, + "train/total_loss": 0.11164318025112152 + }, + { + "entropy": 10.205889701843262, + "epoch": 1.3002768439786434, + "mean_token_accuracy": 0.910179615020752, + "num_tokens": 14276408.0, + "step": 13151, + "train/ce_loss": 3.5867374208464753e-06 + }, + { + "epoch": 1.3002768439786434, + "step": 13151, + "train/sim_loss": 0.05065619945526123 + }, + { + "epoch": 1.3002768439786434, + "step": 13151, + "train/total_loss": 0.05065655708312988 + }, + { + "entropy": 10.085803985595703, + "epoch": 1.300375716828159, + "mean_token_accuracy": 0.8474576473236084, + "num_tokens": 14289813.0, + "step": 13152, + "train/ce_loss": 0.5472782254219055 + }, + { + "epoch": 1.300375716828159, + "step": 13152, + "train/sim_loss": 0.047034382820129395 + }, + { + "epoch": 1.300375716828159, + "step": 13152, + "train/total_loss": 0.10176220536231995 + }, + { + "entropy": 9.904716491699219, + "epoch": 1.3004745896776746, + "mean_token_accuracy": 0.8942486047744751, + "num_tokens": 14299329.0, + "step": 13153, + "train/ce_loss": 2.314879793630098e-06 + }, + { + "epoch": 1.3004745896776746, + "step": 13153, + "train/sim_loss": 0.008944153785705566 + }, + { + "epoch": 1.3004745896776746, + "step": 13153, + "train/total_loss": 0.008944385685026646 + }, + { + "entropy": 10.051366806030273, + "epoch": 1.30057346252719, + "mean_token_accuracy": 0.8052287697792053, + "num_tokens": 14313796.0, + "step": 13154, + "train/ce_loss": 0.9834200739860535 + }, + { + "epoch": 1.30057346252719, + "step": 13154, + "train/sim_loss": 0.05449175834655762 + }, + { + "epoch": 1.30057346252719, + "step": 13154, + "train/total_loss": 0.1528337597846985 + }, + { + "entropy": 9.82452392578125, + "epoch": 1.3006723353767056, + "mean_token_accuracy": 0.8454258441925049, + "num_tokens": 14333639.0, + "step": 13155, + "train/ce_loss": 0.5773757100105286 + }, + { + "epoch": 1.3006723353767056, + "step": 13155, + "train/sim_loss": 0.03813529014587402 + }, + { + "epoch": 1.3006723353767056, + "step": 13155, + "train/total_loss": 0.09587286412715912 + }, + { + "entropy": 9.163028717041016, + "epoch": 1.300771208226221, + "mean_token_accuracy": 0.8169761300086975, + "num_tokens": 14344122.0, + "step": 13156, + "train/ce_loss": 5.027554266234802e-07 + }, + { + "epoch": 1.300771208226221, + "step": 13156, + "train/sim_loss": 0.043901681900024414 + }, + { + "epoch": 1.300771208226221, + "step": 13156, + "train/total_loss": 0.043901730328798294 + }, + { + "entropy": 10.261436462402344, + "epoch": 1.3008700810757365, + "mean_token_accuracy": 0.9157509207725525, + "num_tokens": 14355420.0, + "step": 13157, + "train/ce_loss": 1.0227054357528687 + }, + { + "epoch": 1.3008700810757365, + "step": 13157, + "train/sim_loss": 0.07275861501693726 + }, + { + "epoch": 1.3008700810757365, + "step": 13157, + "train/total_loss": 0.17502915859222412 + }, + { + "entropy": 9.429755210876465, + "epoch": 1.3009689539252522, + "mean_token_accuracy": 0.8372365236282349, + "num_tokens": 14370477.0, + "step": 13158, + "train/ce_loss": 0.36099973320961 + }, + { + "epoch": 1.3009689539252522, + "step": 13158, + "train/sim_loss": 0.02510702610015869 + }, + { + "epoch": 1.3009689539252522, + "step": 13158, + "train/total_loss": 0.06120700016617775 + }, + { + "entropy": 9.763957023620605, + "epoch": 1.3010678267747675, + "mean_token_accuracy": 0.8139904737472534, + "num_tokens": 14388348.0, + "step": 13159, + "train/ce_loss": 6.125183631411346e-07 + }, + { + "epoch": 1.3010678267747675, + "step": 13159, + "train/sim_loss": 0.020826756954193115 + }, + { + "epoch": 1.3010678267747675, + "step": 13159, + "train/total_loss": 0.02082681842148304 + }, + { + "epoch": 1.3011666996242832, + "grad_norm": 0.6165289282798767, + "learning_rate": 6.748998664886516e-06, + "loss": 0.0792, + "step": 13160 + }, + { + "entropy": 9.449103355407715, + "epoch": 1.3011666996242832, + "mean_token_accuracy": 0.8151515126228333, + "num_tokens": 14397563.0, + "step": 13160, + "train/ce_loss": 0.5814468860626221 + }, + { + "epoch": 1.3011666996242832, + "step": 13160, + "train/sim_loss": 0.08368384838104248 + }, + { + "epoch": 1.3011666996242832, + "step": 13160, + "train/total_loss": 0.1418285369873047 + }, + { + "entropy": 10.00915241241455, + "epoch": 1.3012655724737987, + "mean_token_accuracy": 0.8947368264198303, + "num_tokens": 14407858.0, + "step": 13161, + "train/ce_loss": 0.1817721426486969 + }, + { + "epoch": 1.3012655724737987, + "step": 13161, + "train/sim_loss": 0.0632559061050415 + }, + { + "epoch": 1.3012655724737987, + "step": 13161, + "train/total_loss": 0.08143311738967896 + }, + { + "entropy": 8.916040420532227, + "epoch": 1.3013644453233142, + "mean_token_accuracy": 0.8938547372817993, + "num_tokens": 14417677.0, + "step": 13162, + "train/ce_loss": 0.2117401510477066 + }, + { + "epoch": 1.3013644453233142, + "step": 13162, + "train/sim_loss": 0.01695108413696289 + }, + { + "epoch": 1.3013644453233142, + "step": 13162, + "train/total_loss": 0.03812509775161743 + }, + { + "entropy": 9.553396224975586, + "epoch": 1.3014633181728297, + "mean_token_accuracy": 0.8671755790710449, + "num_tokens": 14430282.0, + "step": 13163, + "train/ce_loss": 0.4043174684047699 + }, + { + "epoch": 1.3014633181728297, + "step": 13163, + "train/sim_loss": 0.04423642158508301 + }, + { + "epoch": 1.3014633181728297, + "step": 13163, + "train/total_loss": 0.08466817438602448 + }, + { + "entropy": 9.80239486694336, + "epoch": 1.3015621910223452, + "mean_token_accuracy": 0.8057553768157959, + "num_tokens": 14439193.0, + "step": 13164, + "train/ce_loss": 0.6873816847801208 + }, + { + "epoch": 1.3015621910223452, + "step": 13164, + "train/sim_loss": 0.07000702619552612 + }, + { + "epoch": 1.3015621910223452, + "step": 13164, + "train/total_loss": 0.13874518871307373 + }, + { + "entropy": 9.46338939666748, + "epoch": 1.3016610638718609, + "mean_token_accuracy": 0.8680465817451477, + "num_tokens": 14449809.0, + "step": 13165, + "train/ce_loss": 0.9209633469581604 + }, + { + "epoch": 1.3016610638718609, + "step": 13165, + "train/sim_loss": 0.07517343759536743 + }, + { + "epoch": 1.3016610638718609, + "step": 13165, + "train/total_loss": 0.167269766330719 + }, + { + "entropy": 9.633916854858398, + "epoch": 1.3017599367213764, + "mean_token_accuracy": 0.9120111465454102, + "num_tokens": 14459754.0, + "step": 13166, + "train/ce_loss": 0.18505364656448364 + }, + { + "epoch": 1.3017599367213764, + "step": 13166, + "train/sim_loss": 0.016085028648376465 + }, + { + "epoch": 1.3017599367213764, + "step": 13166, + "train/total_loss": 0.03459039330482483 + }, + { + "entropy": 9.397195816040039, + "epoch": 1.3018588095708918, + "mean_token_accuracy": 0.8607021570205688, + "num_tokens": 14469339.0, + "step": 13167, + "train/ce_loss": 0.454009085893631 + }, + { + "epoch": 1.3018588095708918, + "step": 13167, + "train/sim_loss": 0.03241688013076782 + }, + { + "epoch": 1.3018588095708918, + "step": 13167, + "train/total_loss": 0.07781779021024704 + }, + { + "entropy": 10.210906982421875, + "epoch": 1.3019576824204073, + "mean_token_accuracy": 0.8973104953765869, + "num_tokens": 14480521.0, + "step": 13168, + "train/ce_loss": 0.5546484589576721 + }, + { + "epoch": 1.3019576824204073, + "step": 13168, + "train/sim_loss": 0.050768375396728516 + }, + { + "epoch": 1.3019576824204073, + "step": 13168, + "train/total_loss": 0.10623322427272797 + }, + { + "entropy": 9.363116264343262, + "epoch": 1.3020565552699228, + "mean_token_accuracy": 0.8199999928474426, + "num_tokens": 14488188.0, + "step": 13169, + "train/ce_loss": 0.6060940027236938 + }, + { + "epoch": 1.3020565552699228, + "step": 13169, + "train/sim_loss": 0.012627124786376953 + }, + { + "epoch": 1.3020565552699228, + "step": 13169, + "train/total_loss": 0.07323652505874634 + }, + { + "entropy": 9.18015193939209, + "epoch": 1.3021554281194385, + "mean_token_accuracy": 0.8470715880393982, + "num_tokens": 14499108.0, + "step": 13170, + "train/ce_loss": 0.26271215081214905 + }, + { + "epoch": 1.3021554281194385, + "step": 13170, + "train/sim_loss": 0.0935051441192627 + }, + { + "epoch": 1.3021554281194385, + "step": 13170, + "train/total_loss": 0.11977636069059372 + }, + { + "entropy": 9.292664527893066, + "epoch": 1.302254300968954, + "mean_token_accuracy": 0.8489999771118164, + "num_tokens": 14514391.0, + "step": 13171, + "train/ce_loss": 0.3660622537136078 + }, + { + "epoch": 1.302254300968954, + "step": 13171, + "train/sim_loss": 0.0486292839050293 + }, + { + "epoch": 1.302254300968954, + "step": 13171, + "train/total_loss": 0.08523550629615784 + }, + { + "entropy": 9.395051956176758, + "epoch": 1.3023531738184695, + "mean_token_accuracy": 0.9041298031806946, + "num_tokens": 14528717.0, + "step": 13172, + "train/ce_loss": 0.20152100920677185 + }, + { + "epoch": 1.3023531738184695, + "step": 13172, + "train/sim_loss": 0.03031831979751587 + }, + { + "epoch": 1.3023531738184695, + "step": 13172, + "train/total_loss": 0.050470419228076935 + }, + { + "entropy": 9.566810607910156, + "epoch": 1.302452046667985, + "mean_token_accuracy": 0.8472400307655334, + "num_tokens": 14545285.0, + "step": 13173, + "train/ce_loss": 0.2516162097454071 + }, + { + "epoch": 1.302452046667985, + "step": 13173, + "train/sim_loss": 0.0890955924987793 + }, + { + "epoch": 1.302452046667985, + "step": 13173, + "train/total_loss": 0.11425721645355225 + }, + { + "entropy": 9.282527923583984, + "epoch": 1.3025509195175005, + "mean_token_accuracy": 0.7780979871749878, + "num_tokens": 14556347.0, + "step": 13174, + "train/ce_loss": 0.7014796137809753 + }, + { + "epoch": 1.3025509195175005, + "step": 13174, + "train/sim_loss": 0.04534769058227539 + }, + { + "epoch": 1.3025509195175005, + "step": 13174, + "train/total_loss": 0.11549565196037292 + }, + { + "entropy": 9.414593696594238, + "epoch": 1.302649792367016, + "mean_token_accuracy": 0.7716346383094788, + "num_tokens": 14568216.0, + "step": 13175, + "train/ce_loss": 0.5885396003723145 + }, + { + "epoch": 1.302649792367016, + "step": 13175, + "train/sim_loss": 0.02770918607711792 + }, + { + "epoch": 1.302649792367016, + "step": 13175, + "train/total_loss": 0.08656314760446548 + }, + { + "entropy": 9.36762809753418, + "epoch": 1.3027486652165314, + "mean_token_accuracy": 0.8669275641441345, + "num_tokens": 14582613.0, + "step": 13176, + "train/ce_loss": 0.6488838195800781 + }, + { + "epoch": 1.3027486652165314, + "step": 13176, + "train/sim_loss": 0.025758981704711914 + }, + { + "epoch": 1.3027486652165314, + "step": 13176, + "train/total_loss": 0.09064736217260361 + }, + { + "entropy": 9.405370712280273, + "epoch": 1.3028475380660471, + "mean_token_accuracy": 0.8852223753929138, + "num_tokens": 14589572.0, + "step": 13177, + "train/ce_loss": 7.183110142250371e-07 + }, + { + "epoch": 1.3028475380660471, + "step": 13177, + "train/sim_loss": 0.016896188259124756 + }, + { + "epoch": 1.3028475380660471, + "step": 13177, + "train/total_loss": 0.016896260902285576 + }, + { + "entropy": 9.747247695922852, + "epoch": 1.3029464109155626, + "mean_token_accuracy": 0.828101634979248, + "num_tokens": 14605092.0, + "step": 13178, + "train/ce_loss": 0.4774790406227112 + }, + { + "epoch": 1.3029464109155626, + "step": 13178, + "train/sim_loss": 0.08198857307434082 + }, + { + "epoch": 1.3029464109155626, + "step": 13178, + "train/total_loss": 0.12973648309707642 + }, + { + "entropy": 10.045919418334961, + "epoch": 1.303045283765078, + "mean_token_accuracy": 0.8763736486434937, + "num_tokens": 14613881.0, + "step": 13179, + "train/ce_loss": 0.7222087383270264 + }, + { + "epoch": 1.303045283765078, + "step": 13179, + "train/sim_loss": 0.07051420211791992 + }, + { + "epoch": 1.303045283765078, + "step": 13179, + "train/total_loss": 0.1427350789308548 + }, + { + "epoch": 1.3031441566145936, + "grad_norm": 0.6325729489326477, + "learning_rate": 6.744053800128567e-06, + "loss": 0.086, + "step": 13180 + }, + { + "entropy": 9.83344841003418, + "epoch": 1.3031441566145936, + "mean_token_accuracy": 0.8464849591255188, + "num_tokens": 14633775.0, + "step": 13180, + "train/ce_loss": 0.7041210532188416 + }, + { + "epoch": 1.3031441566145936, + "step": 13180, + "train/sim_loss": 0.041417598724365234 + }, + { + "epoch": 1.3031441566145936, + "step": 13180, + "train/total_loss": 0.11182970553636551 + }, + { + "entropy": 9.725290298461914, + "epoch": 1.303243029464109, + "mean_token_accuracy": 0.8370702266693115, + "num_tokens": 14652074.0, + "step": 13181, + "train/ce_loss": 0.5655458569526672 + }, + { + "epoch": 1.303243029464109, + "step": 13181, + "train/sim_loss": 0.07773298025131226 + }, + { + "epoch": 1.303243029464109, + "step": 13181, + "train/total_loss": 0.13428756594657898 + }, + { + "entropy": 9.252305030822754, + "epoch": 1.3033419023136248, + "mean_token_accuracy": 0.886956512928009, + "num_tokens": 14663894.0, + "step": 13182, + "train/ce_loss": 0.44023609161376953 + }, + { + "epoch": 1.3033419023136248, + "step": 13182, + "train/sim_loss": 0.00937032699584961 + }, + { + "epoch": 1.3033419023136248, + "step": 13182, + "train/total_loss": 0.05339393764734268 + }, + { + "entropy": 9.621431350708008, + "epoch": 1.3034407751631403, + "mean_token_accuracy": 0.8571428656578064, + "num_tokens": 14670275.0, + "step": 13183, + "train/ce_loss": 0.45068058371543884 + }, + { + "epoch": 1.3034407751631403, + "step": 13183, + "train/sim_loss": 0.04922431707382202 + }, + { + "epoch": 1.3034407751631403, + "step": 13183, + "train/total_loss": 0.09429237246513367 + }, + { + "entropy": 9.30063247680664, + "epoch": 1.3035396480126558, + "mean_token_accuracy": 0.8778625726699829, + "num_tokens": 14683716.0, + "step": 13184, + "train/ce_loss": 0.3205988109111786 + }, + { + "epoch": 1.3035396480126558, + "step": 13184, + "train/sim_loss": 0.043942153453826904 + }, + { + "epoch": 1.3035396480126558, + "step": 13184, + "train/total_loss": 0.07600203156471252 + }, + { + "entropy": 8.959026336669922, + "epoch": 1.3036385208621712, + "mean_token_accuracy": 0.815815806388855, + "num_tokens": 14693667.0, + "step": 13185, + "train/ce_loss": 0.37297338247299194 + }, + { + "epoch": 1.3036385208621712, + "step": 13185, + "train/sim_loss": 0.05258333683013916 + }, + { + "epoch": 1.3036385208621712, + "step": 13185, + "train/total_loss": 0.08988067507743835 + }, + { + "entropy": 10.131430625915527, + "epoch": 1.3037373937116867, + "mean_token_accuracy": 0.8534201979637146, + "num_tokens": 14709716.0, + "step": 13186, + "train/ce_loss": 5.31189471075777e-07 + }, + { + "epoch": 1.3037373937116867, + "step": 13186, + "train/sim_loss": 0.012303471565246582 + }, + { + "epoch": 1.3037373937116867, + "step": 13186, + "train/total_loss": 0.012303524650633335 + }, + { + "entropy": 9.464167594909668, + "epoch": 1.3038362665612022, + "mean_token_accuracy": 0.902531623840332, + "num_tokens": 14730576.0, + "step": 13187, + "train/ce_loss": 0.6141715049743652 + }, + { + "epoch": 1.3038362665612022, + "step": 13187, + "train/sim_loss": 0.04940927028656006 + }, + { + "epoch": 1.3038362665612022, + "step": 13187, + "train/total_loss": 0.11082641780376434 + }, + { + "entropy": 9.219900131225586, + "epoch": 1.3039351394107177, + "mean_token_accuracy": 0.7968923449516296, + "num_tokens": 14744818.0, + "step": 13188, + "train/ce_loss": 0.922296941280365 + }, + { + "epoch": 1.3039351394107177, + "step": 13188, + "train/sim_loss": 0.07188999652862549 + }, + { + "epoch": 1.3039351394107177, + "step": 13188, + "train/total_loss": 0.164119690656662 + }, + { + "entropy": 9.40198802947998, + "epoch": 1.3040340122602334, + "mean_token_accuracy": 0.8670967817306519, + "num_tokens": 14757570.0, + "step": 13189, + "train/ce_loss": 0.31747516989707947 + }, + { + "epoch": 1.3040340122602334, + "step": 13189, + "train/sim_loss": 0.03321045637130737 + }, + { + "epoch": 1.3040340122602334, + "step": 13189, + "train/total_loss": 0.06495797634124756 + }, + { + "entropy": 9.40805721282959, + "epoch": 1.304132885109749, + "mean_token_accuracy": 0.8199672698974609, + "num_tokens": 14774196.0, + "step": 13190, + "train/ce_loss": 0.5267090201377869 + }, + { + "epoch": 1.304132885109749, + "step": 13190, + "train/sim_loss": 0.0333516001701355 + }, + { + "epoch": 1.304132885109749, + "step": 13190, + "train/total_loss": 0.0860225036740303 + }, + { + "entropy": 9.902514457702637, + "epoch": 1.3042317579592644, + "mean_token_accuracy": 0.9147465229034424, + "num_tokens": 14780431.0, + "step": 13191, + "train/ce_loss": 0.35574284195899963 + }, + { + "epoch": 1.3042317579592644, + "step": 13191, + "train/sim_loss": 0.0707165002822876 + }, + { + "epoch": 1.3042317579592644, + "step": 13191, + "train/total_loss": 0.1062907874584198 + }, + { + "entropy": 10.009881973266602, + "epoch": 1.3043306308087799, + "mean_token_accuracy": 0.8556962013244629, + "num_tokens": 14800686.0, + "step": 13192, + "train/ce_loss": 0.7086774706840515 + }, + { + "epoch": 1.3043306308087799, + "step": 13192, + "train/sim_loss": 0.08544707298278809 + }, + { + "epoch": 1.3043306308087799, + "step": 13192, + "train/total_loss": 0.15631482005119324 + }, + { + "entropy": 9.578474044799805, + "epoch": 1.3044295036582954, + "mean_token_accuracy": 0.8523748517036438, + "num_tokens": 14817500.0, + "step": 13193, + "train/ce_loss": 0.4298597276210785 + }, + { + "epoch": 1.3044295036582954, + "step": 13193, + "train/sim_loss": 0.014729022979736328 + }, + { + "epoch": 1.3044295036582954, + "step": 13193, + "train/total_loss": 0.05771499499678612 + }, + { + "entropy": 9.57083511352539, + "epoch": 1.304528376507811, + "mean_token_accuracy": 0.8260162472724915, + "num_tokens": 14829295.0, + "step": 13194, + "train/ce_loss": 1.9931696897401707e-06 + }, + { + "epoch": 1.304528376507811, + "step": 13194, + "train/sim_loss": 0.030447423458099365 + }, + { + "epoch": 1.304528376507811, + "step": 13194, + "train/total_loss": 0.030447622761130333 + }, + { + "entropy": 8.730384826660156, + "epoch": 1.3046272493573265, + "mean_token_accuracy": 0.8215271234512329, + "num_tokens": 14841118.0, + "step": 13195, + "train/ce_loss": 0.5868284702301025 + }, + { + "epoch": 1.3046272493573265, + "step": 13195, + "train/sim_loss": 0.0314253568649292 + }, + { + "epoch": 1.3046272493573265, + "step": 13195, + "train/total_loss": 0.09010820090770721 + }, + { + "entropy": 9.568653106689453, + "epoch": 1.304726122206842, + "mean_token_accuracy": 0.8376811742782593, + "num_tokens": 14851926.0, + "step": 13196, + "train/ce_loss": 0.5921724438667297 + }, + { + "epoch": 1.304726122206842, + "step": 13196, + "train/sim_loss": 0.03058987855911255 + }, + { + "epoch": 1.304726122206842, + "step": 13196, + "train/total_loss": 0.08980712294578552 + }, + { + "entropy": 9.318244934082031, + "epoch": 1.3048249950563575, + "mean_token_accuracy": 0.8523809313774109, + "num_tokens": 14862415.0, + "step": 13197, + "train/ce_loss": 0.35269710421562195 + }, + { + "epoch": 1.3048249950563575, + "step": 13197, + "train/sim_loss": 0.03982067108154297 + }, + { + "epoch": 1.3048249950563575, + "step": 13197, + "train/total_loss": 0.07509037852287292 + }, + { + "entropy": 9.681339263916016, + "epoch": 1.304923867905873, + "mean_token_accuracy": 0.8303571343421936, + "num_tokens": 14874245.0, + "step": 13198, + "train/ce_loss": 0.6492642164230347 + }, + { + "epoch": 1.304923867905873, + "step": 13198, + "train/sim_loss": 0.0333176851272583 + }, + { + "epoch": 1.304923867905873, + "step": 13198, + "train/total_loss": 0.09824410825967789 + }, + { + "entropy": 9.750045776367188, + "epoch": 1.3050227407553885, + "mean_token_accuracy": 0.8640776872634888, + "num_tokens": 14887312.0, + "step": 13199, + "train/ce_loss": 8.880601285454759e-07 + }, + { + "epoch": 1.3050227407553885, + "step": 13199, + "train/sim_loss": 0.01916193962097168 + }, + { + "epoch": 1.3050227407553885, + "step": 13199, + "train/total_loss": 0.019162029027938843 + }, + { + "epoch": 1.305121613604904, + "grad_norm": 0.5508050322532654, + "learning_rate": 6.739108935370619e-06, + "loss": 0.0862, + "step": 13200 + }, + { + "entropy": 9.381288528442383, + "epoch": 1.305121613604904, + "mean_token_accuracy": 0.8625429272651672, + "num_tokens": 14904413.0, + "step": 13200, + "train/ce_loss": 0.2029883861541748 + }, + { + "epoch": 1.305121613604904, + "step": 13200, + "train/sim_loss": 0.030652940273284912 + }, + { + "epoch": 1.305121613604904, + "step": 13200, + "train/total_loss": 0.05095177888870239 + }, + { + "entropy": 9.793725967407227, + "epoch": 1.3052204864544197, + "mean_token_accuracy": 0.8113821148872375, + "num_tokens": 14913884.0, + "step": 13201, + "train/ce_loss": 0.3236582577228546 + }, + { + "epoch": 1.3052204864544197, + "step": 13201, + "train/sim_loss": 0.03635132312774658 + }, + { + "epoch": 1.3052204864544197, + "step": 13201, + "train/total_loss": 0.06871715188026428 + }, + { + "entropy": 9.55381965637207, + "epoch": 1.3053193593039352, + "mean_token_accuracy": 0.9035221934318542, + "num_tokens": 14921140.0, + "step": 13202, + "train/ce_loss": 0.23723477125167847 + }, + { + "epoch": 1.3053193593039352, + "step": 13202, + "train/sim_loss": 0.009784102439880371 + }, + { + "epoch": 1.3053193593039352, + "step": 13202, + "train/total_loss": 0.0335075780749321 + }, + { + "entropy": 9.158967971801758, + "epoch": 1.3054182321534507, + "mean_token_accuracy": 0.8320356011390686, + "num_tokens": 14933116.0, + "step": 13203, + "train/ce_loss": 0.6990651488304138 + }, + { + "epoch": 1.3054182321534507, + "step": 13203, + "train/sim_loss": 0.05325007438659668 + }, + { + "epoch": 1.3054182321534507, + "step": 13203, + "train/total_loss": 0.1231565922498703 + }, + { + "entropy": 9.13525390625, + "epoch": 1.3055171050029661, + "mean_token_accuracy": 0.8673355579376221, + "num_tokens": 14944334.0, + "step": 13204, + "train/ce_loss": 0.36935707926750183 + }, + { + "epoch": 1.3055171050029661, + "step": 13204, + "train/sim_loss": 0.04679363965988159 + }, + { + "epoch": 1.3055171050029661, + "step": 13204, + "train/total_loss": 0.0837293490767479 + }, + { + "entropy": 9.508122444152832, + "epoch": 1.3056159778524816, + "mean_token_accuracy": 0.8586065769195557, + "num_tokens": 14959737.0, + "step": 13205, + "train/ce_loss": 0.18870018422603607 + }, + { + "epoch": 1.3056159778524816, + "step": 13205, + "train/sim_loss": 0.015869617462158203 + }, + { + "epoch": 1.3056159778524816, + "step": 13205, + "train/total_loss": 0.03473963588476181 + }, + { + "entropy": 9.747536659240723, + "epoch": 1.3057148507019973, + "mean_token_accuracy": 0.8151447772979736, + "num_tokens": 14974814.0, + "step": 13206, + "train/ce_loss": 0.4672286808490753 + }, + { + "epoch": 1.3057148507019973, + "step": 13206, + "train/sim_loss": 0.07646894454956055 + }, + { + "epoch": 1.3057148507019973, + "step": 13206, + "train/total_loss": 0.12319181859493256 + }, + { + "entropy": 9.267086029052734, + "epoch": 1.3058137235515128, + "mean_token_accuracy": 0.8469387888908386, + "num_tokens": 14986638.0, + "step": 13207, + "train/ce_loss": 0.46803486347198486 + }, + { + "epoch": 1.3058137235515128, + "step": 13207, + "train/sim_loss": 0.020465731620788574 + }, + { + "epoch": 1.3058137235515128, + "step": 13207, + "train/total_loss": 0.0672692209482193 + }, + { + "entropy": 10.19399642944336, + "epoch": 1.3059125964010283, + "mean_token_accuracy": 0.8403141498565674, + "num_tokens": 14994931.0, + "step": 13208, + "train/ce_loss": 1.3082225322723389 + }, + { + "epoch": 1.3059125964010283, + "step": 13208, + "train/sim_loss": 0.0957687497138977 + }, + { + "epoch": 1.3059125964010283, + "step": 13208, + "train/total_loss": 0.22659100592136383 + }, + { + "entropy": 9.25290298461914, + "epoch": 1.3060114692505438, + "mean_token_accuracy": 0.8571428656578064, + "num_tokens": 15008363.0, + "step": 13209, + "train/ce_loss": 0.18188899755477905 + }, + { + "epoch": 1.3060114692505438, + "step": 13209, + "train/sim_loss": 0.029367566108703613 + }, + { + "epoch": 1.3060114692505438, + "step": 13209, + "train/total_loss": 0.04755646735429764 + }, + { + "entropy": 9.671241760253906, + "epoch": 1.3061103421000593, + "mean_token_accuracy": 0.9566037654876709, + "num_tokens": 15019386.0, + "step": 13210, + "train/ce_loss": 6.018830163156963e-07 + }, + { + "epoch": 1.3061103421000593, + "step": 13210, + "train/sim_loss": 0.01726222038269043 + }, + { + "epoch": 1.3061103421000593, + "step": 13210, + "train/total_loss": 0.017262279987335205 + }, + { + "entropy": 9.418876647949219, + "epoch": 1.306209214949575, + "mean_token_accuracy": 0.8764045238494873, + "num_tokens": 15028292.0, + "step": 13211, + "train/ce_loss": 0.4674012064933777 + }, + { + "epoch": 1.306209214949575, + "step": 13211, + "train/sim_loss": 0.012782156467437744 + }, + { + "epoch": 1.306209214949575, + "step": 13211, + "train/total_loss": 0.05952227860689163 + }, + { + "entropy": 9.395637512207031, + "epoch": 1.3063080877990902, + "mean_token_accuracy": 0.7966850996017456, + "num_tokens": 15044747.0, + "step": 13212, + "train/ce_loss": 0.6854750514030457 + }, + { + "epoch": 1.3063080877990902, + "step": 13212, + "train/sim_loss": 0.02730858325958252 + }, + { + "epoch": 1.3063080877990902, + "step": 13212, + "train/total_loss": 0.09585609287023544 + }, + { + "entropy": 9.993544578552246, + "epoch": 1.306406960648606, + "mean_token_accuracy": 0.7888000011444092, + "num_tokens": 15053672.0, + "step": 13213, + "train/ce_loss": 0.7080709934234619 + }, + { + "epoch": 1.306406960648606, + "step": 13213, + "train/sim_loss": 0.039871156215667725 + }, + { + "epoch": 1.306406960648606, + "step": 13213, + "train/total_loss": 0.11067825555801392 + }, + { + "entropy": 8.856908798217773, + "epoch": 1.3065058334981214, + "mean_token_accuracy": 0.8221957087516785, + "num_tokens": 15062234.0, + "step": 13214, + "train/ce_loss": 0.3612391948699951 + }, + { + "epoch": 1.3065058334981214, + "step": 13214, + "train/sim_loss": 0.0573083758354187 + }, + { + "epoch": 1.3065058334981214, + "step": 13214, + "train/total_loss": 0.09343229234218597 + }, + { + "entropy": 9.787304878234863, + "epoch": 1.306604706347637, + "mean_token_accuracy": 0.8575899600982666, + "num_tokens": 15073528.0, + "step": 13215, + "train/ce_loss": 3.343043317727279e-07 + }, + { + "epoch": 1.306604706347637, + "step": 13215, + "train/sim_loss": 0.017250537872314453 + }, + { + "epoch": 1.306604706347637, + "step": 13215, + "train/total_loss": 0.01725057139992714 + }, + { + "entropy": 9.389260292053223, + "epoch": 1.3067035791971524, + "mean_token_accuracy": 0.8545246124267578, + "num_tokens": 15087628.0, + "step": 13216, + "train/ce_loss": 0.41126155853271484 + }, + { + "epoch": 1.3067035791971524, + "step": 13216, + "train/sim_loss": 0.02150595188140869 + }, + { + "epoch": 1.3067035791971524, + "step": 13216, + "train/total_loss": 0.06263211369514465 + }, + { + "entropy": 9.605131149291992, + "epoch": 1.306802452046668, + "mean_token_accuracy": 0.8642745614051819, + "num_tokens": 15098005.0, + "step": 13217, + "train/ce_loss": 0.3895731568336487 + }, + { + "epoch": 1.306802452046668, + "step": 13217, + "train/sim_loss": 0.07484996318817139 + }, + { + "epoch": 1.306802452046668, + "step": 13217, + "train/total_loss": 0.11380727589130402 + }, + { + "entropy": 9.512226104736328, + "epoch": 1.3069013248961836, + "mean_token_accuracy": 0.8938461542129517, + "num_tokens": 15112399.0, + "step": 13218, + "train/ce_loss": 0.4923429489135742 + }, + { + "epoch": 1.3069013248961836, + "step": 13218, + "train/sim_loss": 0.01605808734893799 + }, + { + "epoch": 1.3069013248961836, + "step": 13218, + "train/total_loss": 0.06529238820075989 + }, + { + "entropy": 9.512334823608398, + "epoch": 1.307000197745699, + "mean_token_accuracy": 0.9112903475761414, + "num_tokens": 15128337.0, + "step": 13219, + "train/ce_loss": 0.13441656529903412 + }, + { + "epoch": 1.307000197745699, + "step": 13219, + "train/sim_loss": 0.028031229972839355 + }, + { + "epoch": 1.307000197745699, + "step": 13219, + "train/total_loss": 0.04147288575768471 + }, + { + "epoch": 1.3070990705952146, + "grad_norm": 0.517873227596283, + "learning_rate": 6.734164070612668e-06, + "loss": 0.0816, + "step": 13220 + }, + { + "entropy": 9.242612838745117, + "epoch": 1.3070990705952146, + "mean_token_accuracy": 0.8681204319000244, + "num_tokens": 15140742.0, + "step": 13220, + "train/ce_loss": 0.1891753226518631 + }, + { + "epoch": 1.3070990705952146, + "step": 13220, + "train/sim_loss": 0.03297531604766846 + }, + { + "epoch": 1.3070990705952146, + "step": 13220, + "train/total_loss": 0.05189284682273865 + }, + { + "entropy": 9.94182300567627, + "epoch": 1.30719794344473, + "mean_token_accuracy": 0.8818181753158569, + "num_tokens": 15155699.0, + "step": 13221, + "train/ce_loss": 0.35273101925849915 + }, + { + "epoch": 1.30719794344473, + "step": 13221, + "train/sim_loss": 0.05994975566864014 + }, + { + "epoch": 1.30719794344473, + "step": 13221, + "train/total_loss": 0.09522286057472229 + }, + { + "entropy": 9.807064056396484, + "epoch": 1.3072968162942455, + "mean_token_accuracy": 0.921658992767334, + "num_tokens": 15175297.0, + "step": 13222, + "train/ce_loss": 9.62430817708082e-07 + }, + { + "epoch": 1.3072968162942455, + "step": 13222, + "train/sim_loss": 0.01610386371612549 + }, + { + "epoch": 1.3072968162942455, + "step": 13222, + "train/total_loss": 0.01610396057367325 + }, + { + "entropy": 9.765094757080078, + "epoch": 1.3073956891437613, + "mean_token_accuracy": 0.875, + "num_tokens": 15192193.0, + "step": 13223, + "train/ce_loss": 0.3935185372829437 + }, + { + "epoch": 1.3073956891437613, + "step": 13223, + "train/sim_loss": 0.07014656066894531 + }, + { + "epoch": 1.3073956891437613, + "step": 13223, + "train/total_loss": 0.10949841141700745 + }, + { + "entropy": 9.576555252075195, + "epoch": 1.3074945619932765, + "mean_token_accuracy": 0.8006952404975891, + "num_tokens": 15205957.0, + "step": 13224, + "train/ce_loss": 0.2596498727798462 + }, + { + "epoch": 1.3074945619932765, + "step": 13224, + "train/sim_loss": 0.04136979579925537 + }, + { + "epoch": 1.3074945619932765, + "step": 13224, + "train/total_loss": 0.06733478605747223 + }, + { + "entropy": 9.65625, + "epoch": 1.3075934348427922, + "mean_token_accuracy": 0.8343881964683533, + "num_tokens": 15221085.0, + "step": 13225, + "train/ce_loss": 0.2729432284832001 + }, + { + "epoch": 1.3075934348427922, + "step": 13225, + "train/sim_loss": 0.061240434646606445 + }, + { + "epoch": 1.3075934348427922, + "step": 13225, + "train/total_loss": 0.08853475749492645 + }, + { + "entropy": 9.614233016967773, + "epoch": 1.3076923076923077, + "mean_token_accuracy": 0.8159420490264893, + "num_tokens": 15231564.0, + "step": 13226, + "train/ce_loss": 0.42258766293525696 + }, + { + "epoch": 1.3076923076923077, + "step": 13226, + "train/sim_loss": 0.08919823169708252 + }, + { + "epoch": 1.3076923076923077, + "step": 13226, + "train/total_loss": 0.13145700097084045 + }, + { + "entropy": 9.402826309204102, + "epoch": 1.3077911805418232, + "mean_token_accuracy": 0.8289156556129456, + "num_tokens": 15242217.0, + "step": 13227, + "train/ce_loss": 0.9291769862174988 + }, + { + "epoch": 1.3077911805418232, + "step": 13227, + "train/sim_loss": 0.13813555240631104 + }, + { + "epoch": 1.3077911805418232, + "step": 13227, + "train/total_loss": 0.23105326294898987 + }, + { + "entropy": 9.569188117980957, + "epoch": 1.3078900533913387, + "mean_token_accuracy": 0.8678237795829773, + "num_tokens": 15252258.0, + "step": 13228, + "train/ce_loss": 0.6606099009513855 + }, + { + "epoch": 1.3078900533913387, + "step": 13228, + "train/sim_loss": 0.037116050720214844 + }, + { + "epoch": 1.3078900533913387, + "step": 13228, + "train/total_loss": 0.1031770408153534 + }, + { + "entropy": 9.10611343383789, + "epoch": 1.3079889262408542, + "mean_token_accuracy": 0.8525280952453613, + "num_tokens": 15259982.0, + "step": 13229, + "train/ce_loss": 0.47944074869155884 + }, + { + "epoch": 1.3079889262408542, + "step": 13229, + "train/sim_loss": 0.062201857566833496 + }, + { + "epoch": 1.3079889262408542, + "step": 13229, + "train/total_loss": 0.1101459339261055 + }, + { + "entropy": 9.268068313598633, + "epoch": 1.3080877990903699, + "mean_token_accuracy": 0.779036819934845, + "num_tokens": 15271161.0, + "step": 13230, + "train/ce_loss": 0.9132916331291199 + }, + { + "epoch": 1.3080877990903699, + "step": 13230, + "train/sim_loss": 0.09513324499130249 + }, + { + "epoch": 1.3080877990903699, + "step": 13230, + "train/total_loss": 0.18646240234375 + }, + { + "entropy": 10.01840591430664, + "epoch": 1.3081866719398854, + "mean_token_accuracy": 0.8607594966888428, + "num_tokens": 15285901.0, + "step": 13231, + "train/ce_loss": 7.506876045226818e-07 + }, + { + "epoch": 1.3081866719398854, + "step": 13231, + "train/sim_loss": 0.09741199016571045 + }, + { + "epoch": 1.3081866719398854, + "step": 13231, + "train/total_loss": 0.09741206467151642 + }, + { + "entropy": 9.562801361083984, + "epoch": 1.3082855447894008, + "mean_token_accuracy": 0.9336609244346619, + "num_tokens": 15295060.0, + "step": 13232, + "train/ce_loss": 1.1328842219882063e-06 + }, + { + "epoch": 1.3082855447894008, + "step": 13232, + "train/sim_loss": 0.021329164505004883 + }, + { + "epoch": 1.3082855447894008, + "step": 13232, + "train/total_loss": 0.021329278126358986 + }, + { + "entropy": 9.328283309936523, + "epoch": 1.3083844176389163, + "mean_token_accuracy": 0.7982359528541565, + "num_tokens": 15309665.0, + "step": 13233, + "train/ce_loss": 0.6160065531730652 + }, + { + "epoch": 1.3083844176389163, + "step": 13233, + "train/sim_loss": 0.12769609689712524 + }, + { + "epoch": 1.3083844176389163, + "step": 13233, + "train/total_loss": 0.18929675221443176 + }, + { + "entropy": 9.25172233581543, + "epoch": 1.3084832904884318, + "mean_token_accuracy": 0.7938044667243958, + "num_tokens": 15322984.0, + "step": 13234, + "train/ce_loss": 0.33106106519699097 + }, + { + "epoch": 1.3084832904884318, + "step": 13234, + "train/sim_loss": 0.01059412956237793 + }, + { + "epoch": 1.3084832904884318, + "step": 13234, + "train/total_loss": 0.043700236827135086 + }, + { + "entropy": 9.762537002563477, + "epoch": 1.3085821633379475, + "mean_token_accuracy": 0.8087953925132751, + "num_tokens": 15334994.0, + "step": 13235, + "train/ce_loss": 1.0402456521987915 + }, + { + "epoch": 1.3085821633379475, + "step": 13235, + "train/sim_loss": 0.026613295078277588 + }, + { + "epoch": 1.3085821633379475, + "step": 13235, + "train/total_loss": 0.13063785433769226 + }, + { + "entropy": 9.295156478881836, + "epoch": 1.3086810361874628, + "mean_token_accuracy": 0.8864696621894836, + "num_tokens": 15348089.0, + "step": 13236, + "train/ce_loss": 0.4934113323688507 + }, + { + "epoch": 1.3086810361874628, + "step": 13236, + "train/sim_loss": 0.022666215896606445 + }, + { + "epoch": 1.3086810361874628, + "step": 13236, + "train/total_loss": 0.07200735062360764 + }, + { + "entropy": 9.977472305297852, + "epoch": 1.3087799090369785, + "mean_token_accuracy": 0.8378728628158569, + "num_tokens": 15365907.0, + "step": 13237, + "train/ce_loss": 0.6907341480255127 + }, + { + "epoch": 1.3087799090369785, + "step": 13237, + "train/sim_loss": 0.12429928779602051 + }, + { + "epoch": 1.3087799090369785, + "step": 13237, + "train/total_loss": 0.1933726966381073 + }, + { + "entropy": 9.767967224121094, + "epoch": 1.308878781886494, + "mean_token_accuracy": 0.8632872700691223, + "num_tokens": 15385769.0, + "step": 13238, + "train/ce_loss": 0.6495885252952576 + }, + { + "epoch": 1.308878781886494, + "step": 13238, + "train/sim_loss": 0.0628156065940857 + }, + { + "epoch": 1.308878781886494, + "step": 13238, + "train/total_loss": 0.1277744621038437 + }, + { + "entropy": 9.444685935974121, + "epoch": 1.3089776547360095, + "mean_token_accuracy": 0.8341463208198547, + "num_tokens": 15394754.0, + "step": 13239, + "train/ce_loss": 0.6045830249786377 + }, + { + "epoch": 1.3089776547360095, + "step": 13239, + "train/sim_loss": 0.03816688060760498 + }, + { + "epoch": 1.3089776547360095, + "step": 13239, + "train/total_loss": 0.09862518310546875 + }, + { + "epoch": 1.309076527585525, + "grad_norm": 0.5379523038864136, + "learning_rate": 6.72921920585472e-06, + "loss": 0.0945, + "step": 13240 + }, + { + "entropy": 9.830916404724121, + "epoch": 1.309076527585525, + "mean_token_accuracy": 0.8752515316009521, + "num_tokens": 15414080.0, + "step": 13240, + "train/ce_loss": 9.123929203269654e-07 + }, + { + "epoch": 1.309076527585525, + "step": 13240, + "train/sim_loss": 0.017548680305480957 + }, + { + "epoch": 1.309076527585525, + "step": 13240, + "train/total_loss": 0.01754877157509327 + }, + { + "entropy": 10.064002990722656, + "epoch": 1.3091754004350404, + "mean_token_accuracy": 0.9610389471054077, + "num_tokens": 15422611.0, + "step": 13241, + "train/ce_loss": 0.36480894684791565 + }, + { + "epoch": 1.3091754004350404, + "step": 13241, + "train/sim_loss": 0.05633670091629028 + }, + { + "epoch": 1.3091754004350404, + "step": 13241, + "train/total_loss": 0.09281759709119797 + }, + { + "entropy": 9.635149002075195, + "epoch": 1.3092742732845561, + "mean_token_accuracy": 0.7641878724098206, + "num_tokens": 15437324.0, + "step": 13242, + "train/ce_loss": 0.5563686490058899 + }, + { + "epoch": 1.3092742732845561, + "step": 13242, + "train/sim_loss": 0.07098066806793213 + }, + { + "epoch": 1.3092742732845561, + "step": 13242, + "train/total_loss": 0.12661753594875336 + }, + { + "entropy": 10.079427719116211, + "epoch": 1.3093731461340716, + "mean_token_accuracy": 0.9068009853363037, + "num_tokens": 15442623.0, + "step": 13243, + "train/ce_loss": 0.4167982339859009 + }, + { + "epoch": 1.3093731461340716, + "step": 13243, + "train/sim_loss": 0.05972999334335327 + }, + { + "epoch": 1.3093731461340716, + "step": 13243, + "train/total_loss": 0.10140982270240784 + }, + { + "entropy": 9.642324447631836, + "epoch": 1.3094720189835871, + "mean_token_accuracy": 0.8450450301170349, + "num_tokens": 15451516.0, + "step": 13244, + "train/ce_loss": 4.794074470737542e-07 + }, + { + "epoch": 1.3094720189835871, + "step": 13244, + "train/sim_loss": 0.019448518753051758 + }, + { + "epoch": 1.3094720189835871, + "step": 13244, + "train/total_loss": 0.019448567181825638 + }, + { + "entropy": 9.726848602294922, + "epoch": 1.3095708918331026, + "mean_token_accuracy": 0.84112149477005, + "num_tokens": 15465435.0, + "step": 13245, + "train/ce_loss": 0.5860307216644287 + }, + { + "epoch": 1.3095708918331026, + "step": 13245, + "train/sim_loss": 0.03600311279296875 + }, + { + "epoch": 1.3095708918331026, + "step": 13245, + "train/total_loss": 0.0946061909198761 + }, + { + "entropy": 9.584689140319824, + "epoch": 1.309669764682618, + "mean_token_accuracy": 0.8236040472984314, + "num_tokens": 15486187.0, + "step": 13246, + "train/ce_loss": 0.4710257947444916 + }, + { + "epoch": 1.309669764682618, + "step": 13246, + "train/sim_loss": 0.03569304943084717 + }, + { + "epoch": 1.309669764682618, + "step": 13246, + "train/total_loss": 0.0827956348657608 + }, + { + "entropy": 9.40968132019043, + "epoch": 1.3097686375321338, + "mean_token_accuracy": 0.7952853441238403, + "num_tokens": 15496836.0, + "step": 13247, + "train/ce_loss": 0.9075015783309937 + }, + { + "epoch": 1.3097686375321338, + "step": 13247, + "train/sim_loss": 0.05806654691696167 + }, + { + "epoch": 1.3097686375321338, + "step": 13247, + "train/total_loss": 0.14881670475006104 + }, + { + "entropy": 9.655336380004883, + "epoch": 1.3098675103816493, + "mean_token_accuracy": 0.8788598775863647, + "num_tokens": 15512321.0, + "step": 13248, + "train/ce_loss": 0.3883976638317108 + }, + { + "epoch": 1.3098675103816493, + "step": 13248, + "train/sim_loss": 0.015882670879364014 + }, + { + "epoch": 1.3098675103816493, + "step": 13248, + "train/total_loss": 0.054722439497709274 + }, + { + "entropy": 9.281705856323242, + "epoch": 1.3099663832311648, + "mean_token_accuracy": 0.8519448041915894, + "num_tokens": 15528734.0, + "step": 13249, + "train/ce_loss": 0.2770186960697174 + }, + { + "epoch": 1.3099663832311648, + "step": 13249, + "train/sim_loss": 0.029078423976898193 + }, + { + "epoch": 1.3099663832311648, + "step": 13249, + "train/total_loss": 0.056780293583869934 + }, + { + "entropy": 9.73990249633789, + "epoch": 1.3100652560806803, + "mean_token_accuracy": 0.7733026742935181, + "num_tokens": 15543279.0, + "step": 13250, + "train/ce_loss": 0.703486442565918 + }, + { + "epoch": 1.3100652560806803, + "step": 13250, + "train/sim_loss": 0.05717611312866211 + }, + { + "epoch": 1.3100652560806803, + "step": 13250, + "train/total_loss": 0.12752476334571838 + }, + { + "entropy": 8.974239349365234, + "epoch": 1.3101641289301957, + "mean_token_accuracy": 0.82869952917099, + "num_tokens": 15560246.0, + "step": 13251, + "train/ce_loss": 0.4623435437679291 + }, + { + "epoch": 1.3101641289301957, + "step": 13251, + "train/sim_loss": 0.01466447114944458 + }, + { + "epoch": 1.3101641289301957, + "step": 13251, + "train/total_loss": 0.06089882552623749 + }, + { + "entropy": 10.007963180541992, + "epoch": 1.3102630017797112, + "mean_token_accuracy": 0.834515392780304, + "num_tokens": 15573194.0, + "step": 13252, + "train/ce_loss": 0.947297215461731 + }, + { + "epoch": 1.3102630017797112, + "step": 13252, + "train/sim_loss": 0.037101149559020996 + }, + { + "epoch": 1.3102630017797112, + "step": 13252, + "train/total_loss": 0.1318308711051941 + }, + { + "entropy": 9.536144256591797, + "epoch": 1.3103618746292267, + "mean_token_accuracy": 0.7978378534317017, + "num_tokens": 15586144.0, + "step": 13253, + "train/ce_loss": 0.469237744808197 + }, + { + "epoch": 1.3103618746292267, + "step": 13253, + "train/sim_loss": 0.03378558158874512 + }, + { + "epoch": 1.3103618746292267, + "step": 13253, + "train/total_loss": 0.08070935308933258 + }, + { + "entropy": 9.555150985717773, + "epoch": 1.3104607474787424, + "mean_token_accuracy": 0.8914728760719299, + "num_tokens": 15598759.0, + "step": 13254, + "train/ce_loss": 0.35658419132232666 + }, + { + "epoch": 1.3104607474787424, + "step": 13254, + "train/sim_loss": 0.07318544387817383 + }, + { + "epoch": 1.3104607474787424, + "step": 13254, + "train/total_loss": 0.1088438630104065 + }, + { + "entropy": 9.65597152709961, + "epoch": 1.310559620328258, + "mean_token_accuracy": 0.8688046932220459, + "num_tokens": 15609255.0, + "step": 13255, + "train/ce_loss": 0.23309171199798584 + }, + { + "epoch": 1.310559620328258, + "step": 13255, + "train/sim_loss": 0.03384882211685181 + }, + { + "epoch": 1.310559620328258, + "step": 13255, + "train/total_loss": 0.05715799331665039 + }, + { + "entropy": 9.549776077270508, + "epoch": 1.3106584931777734, + "mean_token_accuracy": 0.8623188138008118, + "num_tokens": 15623701.0, + "step": 13256, + "train/ce_loss": 0.7365236282348633 + }, + { + "epoch": 1.3106584931777734, + "step": 13256, + "train/sim_loss": 0.07027459144592285 + }, + { + "epoch": 1.3106584931777734, + "step": 13256, + "train/total_loss": 0.1439269483089447 + }, + { + "entropy": 9.442778587341309, + "epoch": 1.3107573660272889, + "mean_token_accuracy": 0.856940507888794, + "num_tokens": 15635942.0, + "step": 13257, + "train/ce_loss": 0.38359105587005615 + }, + { + "epoch": 1.3107573660272889, + "step": 13257, + "train/sim_loss": 0.052019715309143066 + }, + { + "epoch": 1.3107573660272889, + "step": 13257, + "train/total_loss": 0.09037882089614868 + }, + { + "entropy": 9.329708099365234, + "epoch": 1.3108562388768044, + "mean_token_accuracy": 0.828742504119873, + "num_tokens": 15645018.0, + "step": 13258, + "train/ce_loss": 0.698815643787384 + }, + { + "epoch": 1.3108562388768044, + "step": 13258, + "train/sim_loss": 0.03616166114807129 + }, + { + "epoch": 1.3108562388768044, + "step": 13258, + "train/total_loss": 0.10604322701692581 + }, + { + "entropy": 9.25457763671875, + "epoch": 1.31095511172632, + "mean_token_accuracy": 0.8444180488586426, + "num_tokens": 15659926.0, + "step": 13259, + "train/ce_loss": 0.29922056198120117 + }, + { + "epoch": 1.31095511172632, + "step": 13259, + "train/sim_loss": 0.03262674808502197 + }, + { + "epoch": 1.31095511172632, + "step": 13259, + "train/total_loss": 0.06254880130290985 + }, + { + "epoch": 1.3110539845758356, + "grad_norm": 0.5061968564987183, + "learning_rate": 6.724274341096771e-06, + "loss": 0.0846, + "step": 13260 + }, + { + "entropy": 9.601715087890625, + "epoch": 1.3110539845758356, + "mean_token_accuracy": 0.8428571224212646, + "num_tokens": 15667677.0, + "step": 13260, + "train/ce_loss": 9.689593980510836e-07 + }, + { + "epoch": 1.3110539845758356, + "step": 13260, + "train/sim_loss": 0.023805737495422363 + }, + { + "epoch": 1.3110539845758356, + "step": 13260, + "train/total_loss": 0.023805834352970123 + }, + { + "entropy": 9.128297805786133, + "epoch": 1.311152857425351, + "mean_token_accuracy": 0.837284505367279, + "num_tokens": 15675322.0, + "step": 13261, + "train/ce_loss": 0.38914254307746887 + }, + { + "epoch": 1.311152857425351, + "step": 13261, + "train/sim_loss": 0.02879476547241211 + }, + { + "epoch": 1.311152857425351, + "step": 13261, + "train/total_loss": 0.06770902127027512 + }, + { + "entropy": 9.953546524047852, + "epoch": 1.3112517302748665, + "mean_token_accuracy": 0.8213740587234497, + "num_tokens": 15694381.0, + "step": 13262, + "train/ce_loss": 1.0649863497746992e-06 + }, + { + "epoch": 1.3112517302748665, + "step": 13262, + "train/sim_loss": 0.10946166515350342 + }, + { + "epoch": 1.3112517302748665, + "step": 13262, + "train/total_loss": 0.10946176946163177 + }, + { + "entropy": 9.810630798339844, + "epoch": 1.311350603124382, + "mean_token_accuracy": 0.8104667663574219, + "num_tokens": 15710130.0, + "step": 13263, + "train/ce_loss": 0.5712739825248718 + }, + { + "epoch": 1.311350603124382, + "step": 13263, + "train/sim_loss": 0.07756364345550537 + }, + { + "epoch": 1.311350603124382, + "step": 13263, + "train/total_loss": 0.1346910446882248 + }, + { + "entropy": 9.469917297363281, + "epoch": 1.3114494759738975, + "mean_token_accuracy": 0.790123462677002, + "num_tokens": 15722798.0, + "step": 13264, + "train/ce_loss": 0.8327891230583191 + }, + { + "epoch": 1.3114494759738975, + "step": 13264, + "train/sim_loss": 0.10225826501846313 + }, + { + "epoch": 1.3114494759738975, + "step": 13264, + "train/total_loss": 0.185537189245224 + }, + { + "entropy": 9.447912216186523, + "epoch": 1.311548348823413, + "mean_token_accuracy": 0.8545888066291809, + "num_tokens": 15737420.0, + "step": 13265, + "train/ce_loss": 0.5051562190055847 + }, + { + "epoch": 1.311548348823413, + "step": 13265, + "train/sim_loss": 0.08984094858169556 + }, + { + "epoch": 1.311548348823413, + "step": 13265, + "train/total_loss": 0.14035657048225403 + }, + { + "entropy": 9.47281265258789, + "epoch": 1.3116472216729287, + "mean_token_accuracy": 0.8613989353179932, + "num_tokens": 15751386.0, + "step": 13266, + "train/ce_loss": 0.5571385622024536 + }, + { + "epoch": 1.3116472216729287, + "step": 13266, + "train/sim_loss": 0.0638090968132019 + }, + { + "epoch": 1.3116472216729287, + "step": 13266, + "train/total_loss": 0.11952295899391174 + }, + { + "entropy": 9.491016387939453, + "epoch": 1.3117460945224442, + "mean_token_accuracy": 0.7954545617103577, + "num_tokens": 15763039.0, + "step": 13267, + "train/ce_loss": 0.5769677758216858 + }, + { + "epoch": 1.3117460945224442, + "step": 13267, + "train/sim_loss": 0.05328333377838135 + }, + { + "epoch": 1.3117460945224442, + "step": 13267, + "train/total_loss": 0.11098010838031769 + }, + { + "entropy": 9.265564918518066, + "epoch": 1.3118449673719597, + "mean_token_accuracy": 0.8763736486434937, + "num_tokens": 15774542.0, + "step": 13268, + "train/ce_loss": 0.5268097519874573 + }, + { + "epoch": 1.3118449673719597, + "step": 13268, + "train/sim_loss": 0.021155714988708496 + }, + { + "epoch": 1.3118449673719597, + "step": 13268, + "train/total_loss": 0.07383669167757034 + }, + { + "entropy": 9.195451736450195, + "epoch": 1.3119438402214751, + "mean_token_accuracy": 0.8351514935493469, + "num_tokens": 15789778.0, + "step": 13269, + "train/ce_loss": 0.45682743191719055 + }, + { + "epoch": 1.3119438402214751, + "step": 13269, + "train/sim_loss": 0.04500812292098999 + }, + { + "epoch": 1.3119438402214751, + "step": 13269, + "train/total_loss": 0.09069086611270905 + }, + { + "entropy": 9.586082458496094, + "epoch": 1.3120427130709906, + "mean_token_accuracy": 0.8098676204681396, + "num_tokens": 15801573.0, + "step": 13270, + "train/ce_loss": 0.42488357424736023 + }, + { + "epoch": 1.3120427130709906, + "step": 13270, + "train/sim_loss": 0.051146507263183594 + }, + { + "epoch": 1.3120427130709906, + "step": 13270, + "train/total_loss": 0.09363486617803574 + }, + { + "entropy": 9.389373779296875, + "epoch": 1.3121415859205063, + "mean_token_accuracy": 0.8034979701042175, + "num_tokens": 15819117.0, + "step": 13271, + "train/ce_loss": 0.500239372253418 + }, + { + "epoch": 1.3121415859205063, + "step": 13271, + "train/sim_loss": 0.057620465755462646 + }, + { + "epoch": 1.3121415859205063, + "step": 13271, + "train/total_loss": 0.10764440894126892 + }, + { + "entropy": 9.484944343566895, + "epoch": 1.3122404587700218, + "mean_token_accuracy": 0.8206451535224915, + "num_tokens": 15835257.0, + "step": 13272, + "train/ce_loss": 0.4109496772289276 + }, + { + "epoch": 1.3122404587700218, + "step": 13272, + "train/sim_loss": 0.01649075746536255 + }, + { + "epoch": 1.3122404587700218, + "step": 13272, + "train/total_loss": 0.05758572742342949 + }, + { + "entropy": 9.747709274291992, + "epoch": 1.3123393316195373, + "mean_token_accuracy": 0.7889087796211243, + "num_tokens": 15846293.0, + "step": 13273, + "train/ce_loss": 0.4498741626739502 + }, + { + "epoch": 1.3123393316195373, + "step": 13273, + "train/sim_loss": 0.09330624341964722 + }, + { + "epoch": 1.3123393316195373, + "step": 13273, + "train/total_loss": 0.13829365372657776 + }, + { + "entropy": 9.078459739685059, + "epoch": 1.3124382044690528, + "mean_token_accuracy": 0.8560940027236938, + "num_tokens": 15860542.0, + "step": 13274, + "train/ce_loss": 0.5382640957832336 + }, + { + "epoch": 1.3124382044690528, + "step": 13274, + "train/sim_loss": 0.05973392724990845 + }, + { + "epoch": 1.3124382044690528, + "step": 13274, + "train/total_loss": 0.11356033384799957 + }, + { + "entropy": 9.692535400390625, + "epoch": 1.3125370773185683, + "mean_token_accuracy": 0.8599269390106201, + "num_tokens": 15873552.0, + "step": 13275, + "train/ce_loss": 0.5221970081329346 + }, + { + "epoch": 1.3125370773185683, + "step": 13275, + "train/sim_loss": 0.03947955369949341 + }, + { + "epoch": 1.3125370773185683, + "step": 13275, + "train/total_loss": 0.0916992574930191 + }, + { + "entropy": 9.292692184448242, + "epoch": 1.3126359501680838, + "mean_token_accuracy": 0.8247422575950623, + "num_tokens": 15881872.0, + "step": 13276, + "train/ce_loss": 0.6829899549484253 + }, + { + "epoch": 1.3126359501680838, + "step": 13276, + "train/sim_loss": 0.06167459487915039 + }, + { + "epoch": 1.3126359501680838, + "step": 13276, + "train/total_loss": 0.12997359037399292 + }, + { + "entropy": 9.78298568725586, + "epoch": 1.3127348230175993, + "mean_token_accuracy": 0.850220263004303, + "num_tokens": 15892656.0, + "step": 13277, + "train/ce_loss": 0.927311897277832 + }, + { + "epoch": 1.3127348230175993, + "step": 13277, + "train/sim_loss": 0.08626186847686768 + }, + { + "epoch": 1.3127348230175993, + "step": 13277, + "train/total_loss": 0.17899306118488312 + }, + { + "entropy": 9.343063354492188, + "epoch": 1.312833695867115, + "mean_token_accuracy": 0.8350168466567993, + "num_tokens": 15906051.0, + "step": 13278, + "train/ce_loss": 0.38373714685440063 + }, + { + "epoch": 1.312833695867115, + "step": 13278, + "train/sim_loss": 0.020357489585876465 + }, + { + "epoch": 1.312833695867115, + "step": 13278, + "train/total_loss": 0.05873120576143265 + }, + { + "entropy": 9.197990417480469, + "epoch": 1.3129325687166304, + "mean_token_accuracy": 0.8497853875160217, + "num_tokens": 15918439.0, + "step": 13279, + "train/ce_loss": 0.31681719422340393 + }, + { + "epoch": 1.3129325687166304, + "step": 13279, + "train/sim_loss": 0.03252053260803223 + }, + { + "epoch": 1.3129325687166304, + "step": 13279, + "train/total_loss": 0.06420224905014038 + }, + { + "epoch": 1.313031441566146, + "grad_norm": 0.4938790500164032, + "learning_rate": 6.7193294763388225e-06, + "loss": 0.0982, + "step": 13280 + }, + { + "entropy": 9.301772117614746, + "epoch": 1.313031441566146, + "mean_token_accuracy": 0.8719345927238464, + "num_tokens": 15925419.0, + "step": 13280, + "train/ce_loss": 3.430050128372386e-07 + }, + { + "epoch": 1.313031441566146, + "step": 13280, + "train/sim_loss": 0.012969017028808594 + }, + { + "epoch": 1.313031441566146, + "step": 13280, + "train/total_loss": 0.012969051487743855 + }, + { + "entropy": 9.790163040161133, + "epoch": 1.3131303144156614, + "mean_token_accuracy": 0.8023809790611267, + "num_tokens": 15939491.0, + "step": 13281, + "train/ce_loss": 0.7105481624603271 + }, + { + "epoch": 1.3131303144156614, + "step": 13281, + "train/sim_loss": 0.05545806884765625 + }, + { + "epoch": 1.3131303144156614, + "step": 13281, + "train/total_loss": 0.12651288509368896 + }, + { + "entropy": 9.709348678588867, + "epoch": 1.313229187265177, + "mean_token_accuracy": 0.8862385153770447, + "num_tokens": 15948485.0, + "step": 13282, + "train/ce_loss": 0.2408611923456192 + }, + { + "epoch": 1.313229187265177, + "step": 13282, + "train/sim_loss": 0.05338853597640991 + }, + { + "epoch": 1.313229187265177, + "step": 13282, + "train/total_loss": 0.07747465372085571 + }, + { + "entropy": 9.818010330200195, + "epoch": 1.3133280601146926, + "mean_token_accuracy": 0.8202764987945557, + "num_tokens": 15959169.0, + "step": 13283, + "train/ce_loss": 0.660810649394989 + }, + { + "epoch": 1.3133280601146926, + "step": 13283, + "train/sim_loss": 0.07966184616088867 + }, + { + "epoch": 1.3133280601146926, + "step": 13283, + "train/total_loss": 0.14574292302131653 + }, + { + "entropy": 9.271066665649414, + "epoch": 1.313426932964208, + "mean_token_accuracy": 0.8053333163261414, + "num_tokens": 15966814.0, + "step": 13284, + "train/ce_loss": 0.5254663228988647 + }, + { + "epoch": 1.313426932964208, + "step": 13284, + "train/sim_loss": 0.05412471294403076 + }, + { + "epoch": 1.313426932964208, + "step": 13284, + "train/total_loss": 0.10667134821414948 + }, + { + "entropy": 9.800627708435059, + "epoch": 1.3135258058137236, + "mean_token_accuracy": 0.8460490703582764, + "num_tokens": 15986551.0, + "step": 13285, + "train/ce_loss": 0.30457672476768494 + }, + { + "epoch": 1.3135258058137236, + "step": 13285, + "train/sim_loss": 0.04534614086151123 + }, + { + "epoch": 1.3135258058137236, + "step": 13285, + "train/total_loss": 0.07580381631851196 + }, + { + "entropy": 9.595293998718262, + "epoch": 1.313624678663239, + "mean_token_accuracy": 0.8788968920707703, + "num_tokens": 16001058.0, + "step": 13286, + "train/ce_loss": 0.4398408830165863 + }, + { + "epoch": 1.313624678663239, + "step": 13286, + "train/sim_loss": 0.056293487548828125 + }, + { + "epoch": 1.313624678663239, + "step": 13286, + "train/total_loss": 0.10027757287025452 + }, + { + "entropy": 9.415979385375977, + "epoch": 1.3137235515127546, + "mean_token_accuracy": 0.8989546895027161, + "num_tokens": 16011946.0, + "step": 13287, + "train/ce_loss": 0.5814066529273987 + }, + { + "epoch": 1.3137235515127546, + "step": 13287, + "train/sim_loss": 0.028397083282470703 + }, + { + "epoch": 1.3137235515127546, + "step": 13287, + "train/total_loss": 0.08653774857521057 + }, + { + "entropy": 9.7645845413208, + "epoch": 1.3138224243622703, + "mean_token_accuracy": 0.8416206240653992, + "num_tokens": 16024093.0, + "step": 13288, + "train/ce_loss": 0.5698719620704651 + }, + { + "epoch": 1.3138224243622703, + "step": 13288, + "train/sim_loss": 0.08101314306259155 + }, + { + "epoch": 1.3138224243622703, + "step": 13288, + "train/total_loss": 0.13800033926963806 + }, + { + "entropy": 8.966838836669922, + "epoch": 1.3139212972117855, + "mean_token_accuracy": 0.8009592294692993, + "num_tokens": 16037160.0, + "step": 13289, + "train/ce_loss": 0.47579389810562134 + }, + { + "epoch": 1.3139212972117855, + "step": 13289, + "train/sim_loss": 0.014253973960876465 + }, + { + "epoch": 1.3139212972117855, + "step": 13289, + "train/total_loss": 0.06183336302638054 + }, + { + "entropy": 9.477373123168945, + "epoch": 1.3140201700613012, + "mean_token_accuracy": 0.8788461685180664, + "num_tokens": 16049100.0, + "step": 13290, + "train/ce_loss": 0.6032007336616516 + }, + { + "epoch": 1.3140201700613012, + "step": 13290, + "train/sim_loss": 0.019521236419677734 + }, + { + "epoch": 1.3140201700613012, + "step": 13290, + "train/total_loss": 0.07984131574630737 + }, + { + "entropy": 9.307170867919922, + "epoch": 1.3141190429108167, + "mean_token_accuracy": 0.8251121044158936, + "num_tokens": 16061529.0, + "step": 13291, + "train/ce_loss": 0.5812641978263855 + }, + { + "epoch": 1.3141190429108167, + "step": 13291, + "train/sim_loss": 0.05625259876251221 + }, + { + "epoch": 1.3141190429108167, + "step": 13291, + "train/total_loss": 0.11437901854515076 + }, + { + "entropy": 9.453691482543945, + "epoch": 1.3142179157603322, + "mean_token_accuracy": 0.8848560452461243, + "num_tokens": 16074062.0, + "step": 13292, + "train/ce_loss": 0.21901443600654602 + }, + { + "epoch": 1.3142179157603322, + "step": 13292, + "train/sim_loss": 0.061551451683044434 + }, + { + "epoch": 1.3142179157603322, + "step": 13292, + "train/total_loss": 0.08345289528369904 + }, + { + "entropy": 9.259035110473633, + "epoch": 1.3143167886098477, + "mean_token_accuracy": 0.8242424130439758, + "num_tokens": 16082143.0, + "step": 13293, + "train/ce_loss": 0.5094939470291138 + }, + { + "epoch": 1.3143167886098477, + "step": 13293, + "train/sim_loss": 0.05190092325210571 + }, + { + "epoch": 1.3143167886098477, + "step": 13293, + "train/total_loss": 0.10285031795501709 + }, + { + "entropy": 9.867105484008789, + "epoch": 1.3144156614593632, + "mean_token_accuracy": 0.828125, + "num_tokens": 16097530.0, + "step": 13294, + "train/ce_loss": 0.9570987820625305 + }, + { + "epoch": 1.3144156614593632, + "step": 13294, + "train/sim_loss": 0.11310327053070068 + }, + { + "epoch": 1.3144156614593632, + "step": 13294, + "train/total_loss": 0.2088131606578827 + }, + { + "entropy": 9.249149322509766, + "epoch": 1.3145145343088789, + "mean_token_accuracy": 0.8319225907325745, + "num_tokens": 16108117.0, + "step": 13295, + "train/ce_loss": 0.6358902454376221 + }, + { + "epoch": 1.3145145343088789, + "step": 13295, + "train/sim_loss": 0.043181657791137695 + }, + { + "epoch": 1.3145145343088789, + "step": 13295, + "train/total_loss": 0.10677068680524826 + }, + { + "entropy": 10.018238067626953, + "epoch": 1.3146134071583944, + "mean_token_accuracy": 0.8387096524238586, + "num_tokens": 16120467.0, + "step": 13296, + "train/ce_loss": 0.9205040335655212 + }, + { + "epoch": 1.3146134071583944, + "step": 13296, + "train/sim_loss": 0.02829599380493164 + }, + { + "epoch": 1.3146134071583944, + "step": 13296, + "train/total_loss": 0.12034639716148376 + }, + { + "entropy": 9.139952659606934, + "epoch": 1.3147122800079099, + "mean_token_accuracy": 0.8026159405708313, + "num_tokens": 16129692.0, + "step": 13297, + "train/ce_loss": 0.5568993091583252 + }, + { + "epoch": 1.3147122800079099, + "step": 13297, + "train/sim_loss": 0.12284243106842041 + }, + { + "epoch": 1.3147122800079099, + "step": 13297, + "train/total_loss": 0.17853236198425293 + }, + { + "entropy": 9.267021179199219, + "epoch": 1.3148111528574253, + "mean_token_accuracy": 0.8613037467002869, + "num_tokens": 16139135.0, + "step": 13298, + "train/ce_loss": 0.3739243447780609 + }, + { + "epoch": 1.3148111528574253, + "step": 13298, + "train/sim_loss": 0.045975327491760254 + }, + { + "epoch": 1.3148111528574253, + "step": 13298, + "train/total_loss": 0.08336776494979858 + }, + { + "entropy": 8.665166854858398, + "epoch": 1.3149100257069408, + "mean_token_accuracy": 0.8399532437324524, + "num_tokens": 16149650.0, + "step": 13299, + "train/ce_loss": 0.5015440583229065 + }, + { + "epoch": 1.3149100257069408, + "step": 13299, + "train/sim_loss": 0.012157201766967773 + }, + { + "epoch": 1.3149100257069408, + "step": 13299, + "train/total_loss": 0.06231160834431648 + }, + { + "epoch": 1.3150088985564565, + "grad_norm": 0.5182445049285889, + "learning_rate": 6.714384611580874e-06, + "loss": 0.0913, + "step": 13300 + }, + { + "entropy": 9.200881958007812, + "epoch": 1.3150088985564565, + "mean_token_accuracy": 0.833734929561615, + "num_tokens": 16162617.0, + "step": 13300, + "train/ce_loss": 0.6039873957633972 + }, + { + "epoch": 1.3150088985564565, + "step": 13300, + "train/sim_loss": 0.08311909437179565 + }, + { + "epoch": 1.3150088985564565, + "step": 13300, + "train/total_loss": 0.14351783692836761 + }, + { + "entropy": 8.984333038330078, + "epoch": 1.3151077714059718, + "mean_token_accuracy": 0.8395061492919922, + "num_tokens": 16176795.0, + "step": 13301, + "train/ce_loss": 0.42495936155319214 + }, + { + "epoch": 1.3151077714059718, + "step": 13301, + "train/sim_loss": 0.01959294080734253 + }, + { + "epoch": 1.3151077714059718, + "step": 13301, + "train/total_loss": 0.06208887696266174 + }, + { + "entropy": 9.457180976867676, + "epoch": 1.3152066442554875, + "mean_token_accuracy": 0.8701799511909485, + "num_tokens": 16194602.0, + "step": 13302, + "train/ce_loss": 0.4313220977783203 + }, + { + "epoch": 1.3152066442554875, + "step": 13302, + "train/sim_loss": 0.038755059242248535 + }, + { + "epoch": 1.3152066442554875, + "step": 13302, + "train/total_loss": 0.08188727498054504 + }, + { + "entropy": 9.205987930297852, + "epoch": 1.315305517105003, + "mean_token_accuracy": 0.7995642423629761, + "num_tokens": 16213183.0, + "step": 13303, + "train/ce_loss": 0.6315588355064392 + }, + { + "epoch": 1.315305517105003, + "step": 13303, + "train/sim_loss": 0.05254495143890381 + }, + { + "epoch": 1.315305517105003, + "step": 13303, + "train/total_loss": 0.11570083349943161 + }, + { + "entropy": 9.450347900390625, + "epoch": 1.3154043899545185, + "mean_token_accuracy": 0.8224431872367859, + "num_tokens": 16226028.0, + "step": 13304, + "train/ce_loss": 1.1675585508346558 + }, + { + "epoch": 1.3154043899545185, + "step": 13304, + "train/sim_loss": 0.08740627765655518 + }, + { + "epoch": 1.3154043899545185, + "step": 13304, + "train/total_loss": 0.204162135720253 + }, + { + "entropy": 9.110198020935059, + "epoch": 1.315503262804034, + "mean_token_accuracy": 0.870512843132019, + "num_tokens": 16233151.0, + "step": 13305, + "train/ce_loss": 0.21320819854736328 + }, + { + "epoch": 1.315503262804034, + "step": 13305, + "train/sim_loss": 0.02051079273223877 + }, + { + "epoch": 1.315503262804034, + "step": 13305, + "train/total_loss": 0.0418316125869751 + }, + { + "entropy": 9.120857238769531, + "epoch": 1.3156021356535494, + "mean_token_accuracy": 0.8163030743598938, + "num_tokens": 16243520.0, + "step": 13306, + "train/ce_loss": 0.251565545797348 + }, + { + "epoch": 1.3156021356535494, + "step": 13306, + "train/sim_loss": 0.022052884101867676 + }, + { + "epoch": 1.3156021356535494, + "step": 13306, + "train/total_loss": 0.04720944166183472 + }, + { + "entropy": 9.848006248474121, + "epoch": 1.3157010085030652, + "mean_token_accuracy": 0.8487805128097534, + "num_tokens": 16259455.0, + "step": 13307, + "train/ce_loss": 0.5702502727508545 + }, + { + "epoch": 1.3157010085030652, + "step": 13307, + "train/sim_loss": 0.06905478239059448 + }, + { + "epoch": 1.3157010085030652, + "step": 13307, + "train/total_loss": 0.12607981264591217 + }, + { + "entropy": 9.589466094970703, + "epoch": 1.3157998813525806, + "mean_token_accuracy": 0.8761220574378967, + "num_tokens": 16273072.0, + "step": 13308, + "train/ce_loss": 0.4226364195346832 + }, + { + "epoch": 1.3157998813525806, + "step": 13308, + "train/sim_loss": 0.04786825180053711 + }, + { + "epoch": 1.3157998813525806, + "step": 13308, + "train/total_loss": 0.09013189375400543 + }, + { + "entropy": 9.009057998657227, + "epoch": 1.3158987542020961, + "mean_token_accuracy": 0.8696498274803162, + "num_tokens": 16285373.0, + "step": 13309, + "train/ce_loss": 0.529474139213562 + }, + { + "epoch": 1.3158987542020961, + "step": 13309, + "train/sim_loss": 0.047280311584472656 + }, + { + "epoch": 1.3158987542020961, + "step": 13309, + "train/total_loss": 0.1002277284860611 + }, + { + "entropy": 9.60416030883789, + "epoch": 1.3159976270516116, + "mean_token_accuracy": 0.8119440674781799, + "num_tokens": 16295054.0, + "step": 13310, + "train/ce_loss": 0.5916775465011597 + }, + { + "epoch": 1.3159976270516116, + "step": 13310, + "train/sim_loss": 0.05926191806793213 + }, + { + "epoch": 1.3159976270516116, + "step": 13310, + "train/total_loss": 0.11842967569828033 + }, + { + "entropy": 9.306371688842773, + "epoch": 1.316096499901127, + "mean_token_accuracy": 0.8609195351600647, + "num_tokens": 16306258.0, + "step": 13311, + "train/ce_loss": 0.5497068762779236 + }, + { + "epoch": 1.316096499901127, + "step": 13311, + "train/sim_loss": 0.029345393180847168 + }, + { + "epoch": 1.316096499901127, + "step": 13311, + "train/total_loss": 0.08431608229875565 + }, + { + "entropy": 9.501955032348633, + "epoch": 1.3161953727506428, + "mean_token_accuracy": 0.797897219657898, + "num_tokens": 16319759.0, + "step": 13312, + "train/ce_loss": 0.7561444640159607 + }, + { + "epoch": 1.3161953727506428, + "step": 13312, + "train/sim_loss": 0.028530240058898926 + }, + { + "epoch": 1.3161953727506428, + "step": 13312, + "train/total_loss": 0.10414468497037888 + }, + { + "entropy": 10.20763111114502, + "epoch": 1.316294245600158, + "mean_token_accuracy": 0.8722466826438904, + "num_tokens": 16332850.0, + "step": 13313, + "train/ce_loss": 0.22600097954273224 + }, + { + "epoch": 1.316294245600158, + "step": 13313, + "train/sim_loss": 0.028550684452056885 + }, + { + "epoch": 1.316294245600158, + "step": 13313, + "train/total_loss": 0.05115078389644623 + }, + { + "entropy": 9.1973876953125, + "epoch": 1.3163931184496738, + "mean_token_accuracy": 0.7797537446022034, + "num_tokens": 16344511.0, + "step": 13314, + "train/ce_loss": 0.6608161926269531 + }, + { + "epoch": 1.3163931184496738, + "step": 13314, + "train/sim_loss": 0.030228495597839355 + }, + { + "epoch": 1.3163931184496738, + "step": 13314, + "train/total_loss": 0.09631011635065079 + }, + { + "entropy": 9.955551147460938, + "epoch": 1.3164919912991893, + "mean_token_accuracy": 0.8991596698760986, + "num_tokens": 16364138.0, + "step": 13315, + "train/ce_loss": 4.993113634554902e-06 + }, + { + "epoch": 1.3164919912991893, + "step": 13315, + "train/sim_loss": 0.027871370315551758 + }, + { + "epoch": 1.3164919912991893, + "step": 13315, + "train/total_loss": 0.02787186950445175 + }, + { + "entropy": 9.42524528503418, + "epoch": 1.3165908641487047, + "mean_token_accuracy": 0.8283582329750061, + "num_tokens": 16381164.0, + "step": 13316, + "train/ce_loss": 0.689607560634613 + }, + { + "epoch": 1.3165908641487047, + "step": 13316, + "train/sim_loss": 0.07067745923995972 + }, + { + "epoch": 1.3165908641487047, + "step": 13316, + "train/total_loss": 0.13963821530342102 + }, + { + "entropy": 9.540969848632812, + "epoch": 1.3166897369982202, + "mean_token_accuracy": 0.8491879105567932, + "num_tokens": 16392569.0, + "step": 13317, + "train/ce_loss": 0.4534202516078949 + }, + { + "epoch": 1.3166897369982202, + "step": 13317, + "train/sim_loss": 0.07307600975036621 + }, + { + "epoch": 1.3166897369982202, + "step": 13317, + "train/total_loss": 0.11841803789138794 + }, + { + "entropy": 9.479650497436523, + "epoch": 1.3167886098477357, + "mean_token_accuracy": 0.8330188393592834, + "num_tokens": 16406821.0, + "step": 13318, + "train/ce_loss": 0.6757592558860779 + }, + { + "epoch": 1.3167886098477357, + "step": 13318, + "train/sim_loss": 0.06847906112670898 + }, + { + "epoch": 1.3167886098477357, + "step": 13318, + "train/total_loss": 0.13605499267578125 + }, + { + "entropy": 9.366633415222168, + "epoch": 1.3168874826972514, + "mean_token_accuracy": 0.8270777463912964, + "num_tokens": 16418780.0, + "step": 13319, + "train/ce_loss": 0.7439886927604675 + }, + { + "epoch": 1.3168874826972514, + "step": 13319, + "train/sim_loss": 0.04758375883102417 + }, + { + "epoch": 1.3168874826972514, + "step": 13319, + "train/total_loss": 0.1219826266169548 + }, + { + "epoch": 1.316986355546767, + "grad_norm": 0.5829504132270813, + "learning_rate": 6.709439746822924e-06, + "loss": 0.0904, + "step": 13320 + }, + { + "entropy": 9.968992233276367, + "epoch": 1.316986355546767, + "mean_token_accuracy": 0.8494381904602051, + "num_tokens": 16434274.0, + "step": 13320, + "train/ce_loss": 0.6203958988189697 + }, + { + "epoch": 1.316986355546767, + "step": 13320, + "train/sim_loss": 0.023411154747009277 + }, + { + "epoch": 1.316986355546767, + "step": 13320, + "train/total_loss": 0.08545074611902237 + }, + { + "entropy": 9.514514923095703, + "epoch": 1.3170852283962824, + "mean_token_accuracy": 0.8128078579902649, + "num_tokens": 16446185.0, + "step": 13321, + "train/ce_loss": 0.6017529964447021 + }, + { + "epoch": 1.3170852283962824, + "step": 13321, + "train/sim_loss": 0.026019930839538574 + }, + { + "epoch": 1.3170852283962824, + "step": 13321, + "train/total_loss": 0.08619523048400879 + }, + { + "entropy": 9.80773639678955, + "epoch": 1.3171841012457979, + "mean_token_accuracy": 0.8360000252723694, + "num_tokens": 16459807.0, + "step": 13322, + "train/ce_loss": 0.5345301628112793 + }, + { + "epoch": 1.3171841012457979, + "step": 13322, + "train/sim_loss": 0.06510937213897705 + }, + { + "epoch": 1.3171841012457979, + "step": 13322, + "train/total_loss": 0.11856238543987274 + }, + { + "entropy": 9.247859954833984, + "epoch": 1.3172829740953134, + "mean_token_accuracy": 0.7932285666465759, + "num_tokens": 16472955.0, + "step": 13323, + "train/ce_loss": 0.5377441048622131 + }, + { + "epoch": 1.3172829740953134, + "step": 13323, + "train/sim_loss": 0.03604632616043091 + }, + { + "epoch": 1.3172829740953134, + "step": 13323, + "train/total_loss": 0.0898207426071167 + }, + { + "entropy": 9.532901763916016, + "epoch": 1.317381846944829, + "mean_token_accuracy": 0.8527214527130127, + "num_tokens": 16492371.0, + "step": 13324, + "train/ce_loss": 0.21012701094150543 + }, + { + "epoch": 1.317381846944829, + "step": 13324, + "train/sim_loss": 0.019334614276885986 + }, + { + "epoch": 1.317381846944829, + "step": 13324, + "train/total_loss": 0.04034731537103653 + }, + { + "entropy": 9.708731651306152, + "epoch": 1.3174807197943443, + "mean_token_accuracy": 0.8589420914649963, + "num_tokens": 16503020.0, + "step": 13325, + "train/ce_loss": 0.36530792713165283 + }, + { + "epoch": 1.3174807197943443, + "step": 13325, + "train/sim_loss": 0.05860328674316406 + }, + { + "epoch": 1.3174807197943443, + "step": 13325, + "train/total_loss": 0.09513407945632935 + }, + { + "entropy": 9.397686958312988, + "epoch": 1.31757959264386, + "mean_token_accuracy": 0.8518930673599243, + "num_tokens": 16516428.0, + "step": 13326, + "train/ce_loss": 0.2654215097427368 + }, + { + "epoch": 1.31757959264386, + "step": 13326, + "train/sim_loss": 0.014943122863769531 + }, + { + "epoch": 1.31757959264386, + "step": 13326, + "train/total_loss": 0.041485272347927094 + }, + { + "entropy": 9.515253067016602, + "epoch": 1.3176784654933755, + "mean_token_accuracy": 0.8015783429145813, + "num_tokens": 16530732.0, + "step": 13327, + "train/ce_loss": 0.680653989315033 + }, + { + "epoch": 1.3176784654933755, + "step": 13327, + "train/sim_loss": 0.043163418769836426 + }, + { + "epoch": 1.3176784654933755, + "step": 13327, + "train/total_loss": 0.1112288162112236 + }, + { + "entropy": 9.86998176574707, + "epoch": 1.317777338342891, + "mean_token_accuracy": 0.9064327478408813, + "num_tokens": 16545635.0, + "step": 13328, + "train/ce_loss": 2.813554374370142e-06 + }, + { + "epoch": 1.317777338342891, + "step": 13328, + "train/sim_loss": 0.052941083908081055 + }, + { + "epoch": 1.317777338342891, + "step": 13328, + "train/total_loss": 0.05294136703014374 + }, + { + "entropy": 8.736130714416504, + "epoch": 1.3178762111924065, + "mean_token_accuracy": 0.8657143115997314, + "num_tokens": 16555962.0, + "step": 13329, + "train/ce_loss": 0.2432689517736435 + }, + { + "epoch": 1.3178762111924065, + "step": 13329, + "train/sim_loss": 0.012568473815917969 + }, + { + "epoch": 1.3178762111924065, + "step": 13329, + "train/total_loss": 0.03689537197351456 + }, + { + "entropy": 9.402727127075195, + "epoch": 1.317975084041922, + "mean_token_accuracy": 0.8739612102508545, + "num_tokens": 16571447.0, + "step": 13330, + "train/ce_loss": 0.351159930229187 + }, + { + "epoch": 1.317975084041922, + "step": 13330, + "train/sim_loss": 0.03125882148742676 + }, + { + "epoch": 1.317975084041922, + "step": 13330, + "train/total_loss": 0.06637481600046158 + }, + { + "entropy": 9.803570747375488, + "epoch": 1.3180739568914377, + "mean_token_accuracy": 0.8662499785423279, + "num_tokens": 16586897.0, + "step": 13331, + "train/ce_loss": 0.5733577609062195 + }, + { + "epoch": 1.3180739568914377, + "step": 13331, + "train/sim_loss": 0.05171525478363037 + }, + { + "epoch": 1.3180739568914377, + "step": 13331, + "train/total_loss": 0.10905103385448456 + }, + { + "entropy": 10.505146980285645, + "epoch": 1.3181728297409532, + "mean_token_accuracy": 0.8833333253860474, + "num_tokens": 16592450.0, + "step": 13332, + "train/ce_loss": 2.5403940071555553e-06 + }, + { + "epoch": 1.3181728297409532, + "step": 13332, + "train/sim_loss": 0.012817919254302979 + }, + { + "epoch": 1.3181728297409532, + "step": 13332, + "train/total_loss": 0.012818173505365849 + }, + { + "entropy": 10.017816543579102, + "epoch": 1.3182717025904687, + "mean_token_accuracy": 0.9144254326820374, + "num_tokens": 16604638.0, + "step": 13333, + "train/ce_loss": 1.2042991102134692e-06 + }, + { + "epoch": 1.3182717025904687, + "step": 13333, + "train/sim_loss": 0.015104591846466064 + }, + { + "epoch": 1.3182717025904687, + "step": 13333, + "train/total_loss": 0.01510471198707819 + }, + { + "entropy": 9.497638702392578, + "epoch": 1.3183705754399841, + "mean_token_accuracy": 0.8265306353569031, + "num_tokens": 16624495.0, + "step": 13334, + "train/ce_loss": 0.9980210065841675 + }, + { + "epoch": 1.3183705754399841, + "step": 13334, + "train/sim_loss": 0.03364729881286621 + }, + { + "epoch": 1.3183705754399841, + "step": 13334, + "train/total_loss": 0.13344940543174744 + }, + { + "entropy": 9.221397399902344, + "epoch": 1.3184694482894996, + "mean_token_accuracy": 0.8243902325630188, + "num_tokens": 16635613.0, + "step": 13335, + "train/ce_loss": 0.33361881971359253 + }, + { + "epoch": 1.3184694482894996, + "step": 13335, + "train/sim_loss": 0.01978135108947754 + }, + { + "epoch": 1.3184694482894996, + "step": 13335, + "train/total_loss": 0.05314323306083679 + }, + { + "entropy": 9.419354438781738, + "epoch": 1.3185683211390153, + "mean_token_accuracy": 0.8520475625991821, + "num_tokens": 16647932.0, + "step": 13336, + "train/ce_loss": 0.44778022170066833 + }, + { + "epoch": 1.3185683211390153, + "step": 13336, + "train/sim_loss": 0.029895901679992676 + }, + { + "epoch": 1.3185683211390153, + "step": 13336, + "train/total_loss": 0.07467392086982727 + }, + { + "entropy": 8.769939422607422, + "epoch": 1.3186671939885308, + "mean_token_accuracy": 0.8230165839195251, + "num_tokens": 16657448.0, + "step": 13337, + "train/ce_loss": 0.5906682014465332 + }, + { + "epoch": 1.3186671939885308, + "step": 13337, + "train/sim_loss": 0.038675546646118164 + }, + { + "epoch": 1.3186671939885308, + "step": 13337, + "train/total_loss": 0.09774236381053925 + }, + { + "entropy": 9.782853126525879, + "epoch": 1.3187660668380463, + "mean_token_accuracy": 0.8199234008789062, + "num_tokens": 16668429.0, + "step": 13338, + "train/ce_loss": 0.7399955987930298 + }, + { + "epoch": 1.3187660668380463, + "step": 13338, + "train/sim_loss": 0.024748563766479492 + }, + { + "epoch": 1.3187660668380463, + "step": 13338, + "train/total_loss": 0.09874812513589859 + }, + { + "entropy": 9.9364013671875, + "epoch": 1.3188649396875618, + "mean_token_accuracy": 0.8843187689781189, + "num_tokens": 16689110.0, + "step": 13339, + "train/ce_loss": 0.26804086565971375 + }, + { + "epoch": 1.3188649396875618, + "step": 13339, + "train/sim_loss": 0.023616671562194824 + }, + { + "epoch": 1.3188649396875618, + "step": 13339, + "train/total_loss": 0.05042076110839844 + }, + { + "epoch": 1.3189638125370773, + "grad_norm": 0.5686473846435547, + "learning_rate": 6.704494882064976e-06, + "loss": 0.0811, + "step": 13340 + }, + { + "entropy": 9.605273246765137, + "epoch": 1.3189638125370773, + "mean_token_accuracy": 0.9013453125953674, + "num_tokens": 16694350.0, + "step": 13340, + "train/ce_loss": 5.50600248061528e-07 + }, + { + "epoch": 1.3189638125370773, + "step": 13340, + "train/sim_loss": 0.015198051929473877 + }, + { + "epoch": 1.3189638125370773, + "step": 13340, + "train/total_loss": 0.01519810687750578 + }, + { + "entropy": 9.472112655639648, + "epoch": 1.3190626853865928, + "mean_token_accuracy": 0.8813775777816772, + "num_tokens": 16707778.0, + "step": 13341, + "train/ce_loss": 0.1986587643623352 + }, + { + "epoch": 1.3190626853865928, + "step": 13341, + "train/sim_loss": 0.05924046039581299 + }, + { + "epoch": 1.3190626853865928, + "step": 13341, + "train/total_loss": 0.07910633832216263 + }, + { + "entropy": 9.364163398742676, + "epoch": 1.3191615582361083, + "mean_token_accuracy": 0.8670212626457214, + "num_tokens": 16726607.0, + "step": 13342, + "train/ce_loss": 0.42232567071914673 + }, + { + "epoch": 1.3191615582361083, + "step": 13342, + "train/sim_loss": 0.056381165981292725 + }, + { + "epoch": 1.3191615582361083, + "step": 13342, + "train/total_loss": 0.09861373901367188 + }, + { + "entropy": 10.078015327453613, + "epoch": 1.319260431085624, + "mean_token_accuracy": 0.895348846912384, + "num_tokens": 16738416.0, + "step": 13343, + "train/ce_loss": 0.2605947256088257 + }, + { + "epoch": 1.319260431085624, + "step": 13343, + "train/sim_loss": 0.017624318599700928 + }, + { + "epoch": 1.319260431085624, + "step": 13343, + "train/total_loss": 0.04368378967046738 + }, + { + "entropy": 9.93891716003418, + "epoch": 1.3193593039351394, + "mean_token_accuracy": 0.8469387888908386, + "num_tokens": 16749252.0, + "step": 13344, + "train/ce_loss": 1.99307191905973e-06 + }, + { + "epoch": 1.3193593039351394, + "step": 13344, + "train/sim_loss": 0.04736602306365967 + }, + { + "epoch": 1.3193593039351394, + "step": 13344, + "train/total_loss": 0.047366224229335785 + }, + { + "entropy": 9.96468734741211, + "epoch": 1.319458176784655, + "mean_token_accuracy": 0.8227360248565674, + "num_tokens": 16762115.0, + "step": 13345, + "train/ce_loss": 1.333030809291813e-06 + }, + { + "epoch": 1.319458176784655, + "step": 13345, + "train/sim_loss": 0.04345673322677612 + }, + { + "epoch": 1.319458176784655, + "step": 13345, + "train/total_loss": 0.04345686733722687 + }, + { + "entropy": 9.632813453674316, + "epoch": 1.3195570496341704, + "mean_token_accuracy": 0.8886827230453491, + "num_tokens": 16774770.0, + "step": 13346, + "train/ce_loss": 4.82136726986937e-07 + }, + { + "epoch": 1.3195570496341704, + "step": 13346, + "train/sim_loss": 0.03149259090423584 + }, + { + "epoch": 1.3195570496341704, + "step": 13346, + "train/total_loss": 0.03149263933300972 + }, + { + "entropy": 8.710588455200195, + "epoch": 1.319655922483686, + "mean_token_accuracy": 0.9213197827339172, + "num_tokens": 16784808.0, + "step": 13347, + "train/ce_loss": 0.21479202806949615 + }, + { + "epoch": 1.319655922483686, + "step": 13347, + "train/sim_loss": 0.014715135097503662 + }, + { + "epoch": 1.319655922483686, + "step": 13347, + "train/total_loss": 0.0361943393945694 + }, + { + "entropy": 9.066690444946289, + "epoch": 1.3197547953332016, + "mean_token_accuracy": 0.8705752491950989, + "num_tokens": 16794806.0, + "step": 13348, + "train/ce_loss": 0.2556961476802826 + }, + { + "epoch": 1.3197547953332016, + "step": 13348, + "train/sim_loss": 0.04867511987686157 + }, + { + "epoch": 1.3197547953332016, + "step": 13348, + "train/total_loss": 0.07424473762512207 + }, + { + "entropy": 9.58807373046875, + "epoch": 1.319853668182717, + "mean_token_accuracy": 0.8474576473236084, + "num_tokens": 16810244.0, + "step": 13349, + "train/ce_loss": 0.7407410144805908 + }, + { + "epoch": 1.319853668182717, + "step": 13349, + "train/sim_loss": 0.0840572714805603 + }, + { + "epoch": 1.319853668182717, + "step": 13349, + "train/total_loss": 0.15813137590885162 + }, + { + "entropy": 9.83108901977539, + "epoch": 1.3199525410322326, + "mean_token_accuracy": 0.8507223129272461, + "num_tokens": 16828474.0, + "step": 13350, + "train/ce_loss": 0.7402809858322144 + }, + { + "epoch": 1.3199525410322326, + "step": 13350, + "train/sim_loss": 0.07725071907043457 + }, + { + "epoch": 1.3199525410322326, + "step": 13350, + "train/total_loss": 0.15127882361412048 + }, + { + "entropy": 9.301416397094727, + "epoch": 1.320051413881748, + "mean_token_accuracy": 0.8013157844543457, + "num_tokens": 16842122.0, + "step": 13351, + "train/ce_loss": 0.49225053191185 + }, + { + "epoch": 1.320051413881748, + "step": 13351, + "train/sim_loss": 0.014360427856445312 + }, + { + "epoch": 1.320051413881748, + "step": 13351, + "train/total_loss": 0.06358548253774643 + }, + { + "entropy": 9.782999992370605, + "epoch": 1.3201502867312636, + "mean_token_accuracy": 0.8571428656578064, + "num_tokens": 16857888.0, + "step": 13352, + "train/ce_loss": 0.5123775601387024 + }, + { + "epoch": 1.3201502867312636, + "step": 13352, + "train/sim_loss": 0.034225642681121826 + }, + { + "epoch": 1.3201502867312636, + "step": 13352, + "train/total_loss": 0.08546340465545654 + }, + { + "entropy": 9.418386459350586, + "epoch": 1.320249159580779, + "mean_token_accuracy": 0.822407603263855, + "num_tokens": 16867518.0, + "step": 13353, + "train/ce_loss": 0.4372901916503906 + }, + { + "epoch": 1.320249159580779, + "step": 13353, + "train/sim_loss": 0.011119484901428223 + }, + { + "epoch": 1.320249159580779, + "step": 13353, + "train/total_loss": 0.054848503321409225 + }, + { + "entropy": 9.566298484802246, + "epoch": 1.3203480324302945, + "mean_token_accuracy": 0.9153605103492737, + "num_tokens": 16881423.0, + "step": 13354, + "train/ce_loss": 3.255742740293499e-06 + }, + { + "epoch": 1.3203480324302945, + "step": 13354, + "train/sim_loss": 0.029431462287902832 + }, + { + "epoch": 1.3203480324302945, + "step": 13354, + "train/total_loss": 0.029431788250803947 + }, + { + "entropy": 9.56184196472168, + "epoch": 1.3204469052798102, + "mean_token_accuracy": 0.8295454382896423, + "num_tokens": 16893410.0, + "step": 13355, + "train/ce_loss": 0.45033469796180725 + }, + { + "epoch": 1.3204469052798102, + "step": 13355, + "train/sim_loss": 0.019436359405517578 + }, + { + "epoch": 1.3204469052798102, + "step": 13355, + "train/total_loss": 0.0644698292016983 + }, + { + "entropy": 9.618050575256348, + "epoch": 1.3205457781293257, + "mean_token_accuracy": 0.7946428656578064, + "num_tokens": 16903492.0, + "step": 13356, + "train/ce_loss": 0.5597832798957825 + }, + { + "epoch": 1.3205457781293257, + "step": 13356, + "train/sim_loss": 0.10462832450866699 + }, + { + "epoch": 1.3205457781293257, + "step": 13356, + "train/total_loss": 0.16060665249824524 + }, + { + "entropy": 8.824474334716797, + "epoch": 1.3206446509788412, + "mean_token_accuracy": 0.8293691873550415, + "num_tokens": 16911528.0, + "step": 13357, + "train/ce_loss": 0.4972597658634186 + }, + { + "epoch": 1.3206446509788412, + "step": 13357, + "train/sim_loss": 0.06785935163497925 + }, + { + "epoch": 1.3206446509788412, + "step": 13357, + "train/total_loss": 0.11758533120155334 + }, + { + "entropy": 8.961605072021484, + "epoch": 1.3207435238283567, + "mean_token_accuracy": 0.7925407886505127, + "num_tokens": 16924537.0, + "step": 13358, + "train/ce_loss": 0.5306475162506104 + }, + { + "epoch": 1.3207435238283567, + "step": 13358, + "train/sim_loss": 0.0584028959274292 + }, + { + "epoch": 1.3207435238283567, + "step": 13358, + "train/total_loss": 0.111467644572258 + }, + { + "entropy": 9.53011703491211, + "epoch": 1.3208423966778722, + "mean_token_accuracy": 0.7841945290565491, + "num_tokens": 16938332.0, + "step": 13359, + "train/ce_loss": 0.7194317579269409 + }, + { + "epoch": 1.3208423966778722, + "step": 13359, + "train/sim_loss": 0.03494620323181152 + }, + { + "epoch": 1.3208423966778722, + "step": 13359, + "train/total_loss": 0.10688938200473785 + }, + { + "epoch": 1.3209412695273879, + "grad_norm": 0.614790678024292, + "learning_rate": 6.699550017307027e-06, + "loss": 0.087, + "step": 13360 + }, + { + "entropy": 9.455802917480469, + "epoch": 1.3209412695273879, + "mean_token_accuracy": 0.7990654110908508, + "num_tokens": 16952644.0, + "step": 13360, + "train/ce_loss": 0.4522811472415924 + }, + { + "epoch": 1.3209412695273879, + "step": 13360, + "train/sim_loss": 0.06350016593933105 + }, + { + "epoch": 1.3209412695273879, + "step": 13360, + "train/total_loss": 0.10872828215360641 + }, + { + "entropy": 9.031377792358398, + "epoch": 1.3210401423769034, + "mean_token_accuracy": 0.8294117450714111, + "num_tokens": 16965962.0, + "step": 13361, + "train/ce_loss": 0.701546847820282 + }, + { + "epoch": 1.3210401423769034, + "step": 13361, + "train/sim_loss": 0.032414019107818604 + }, + { + "epoch": 1.3210401423769034, + "step": 13361, + "train/total_loss": 0.10256870836019516 + }, + { + "entropy": 9.156750679016113, + "epoch": 1.3211390152264189, + "mean_token_accuracy": 0.8315946459770203, + "num_tokens": 16976130.0, + "step": 13362, + "train/ce_loss": 2.309880301254452e-06 + }, + { + "epoch": 1.3211390152264189, + "step": 13362, + "train/sim_loss": 0.037273287773132324 + }, + { + "epoch": 1.3211390152264189, + "step": 13362, + "train/total_loss": 0.03727351874113083 + }, + { + "entropy": 8.656023979187012, + "epoch": 1.3212378880759343, + "mean_token_accuracy": 0.8696498274803162, + "num_tokens": 16983492.0, + "step": 13363, + "train/ce_loss": 0.38693082332611084 + }, + { + "epoch": 1.3212378880759343, + "step": 13363, + "train/sim_loss": 0.01578420400619507 + }, + { + "epoch": 1.3212378880759343, + "step": 13363, + "train/total_loss": 0.05447728559374809 + }, + { + "entropy": 9.544281005859375, + "epoch": 1.3213367609254498, + "mean_token_accuracy": 0.8110137581825256, + "num_tokens": 16998508.0, + "step": 13364, + "train/ce_loss": 0.5533183813095093 + }, + { + "epoch": 1.3213367609254498, + "step": 13364, + "train/sim_loss": 0.03815150260925293 + }, + { + "epoch": 1.3213367609254498, + "step": 13364, + "train/total_loss": 0.0934833437204361 + }, + { + "entropy": 9.153199195861816, + "epoch": 1.3214356337749653, + "mean_token_accuracy": 0.7812197208404541, + "num_tokens": 17012635.0, + "step": 13365, + "train/ce_loss": 0.36611923575401306 + }, + { + "epoch": 1.3214356337749653, + "step": 13365, + "train/sim_loss": 0.035717785358428955 + }, + { + "epoch": 1.3214356337749653, + "step": 13365, + "train/total_loss": 0.07232971489429474 + }, + { + "entropy": 9.509355545043945, + "epoch": 1.3215345066244808, + "mean_token_accuracy": 0.8704581260681152, + "num_tokens": 17024125.0, + "step": 13366, + "train/ce_loss": 0.559939980506897 + }, + { + "epoch": 1.3215345066244808, + "step": 13366, + "train/sim_loss": 0.033730387687683105 + }, + { + "epoch": 1.3215345066244808, + "step": 13366, + "train/total_loss": 0.08972439169883728 + }, + { + "entropy": 9.697420120239258, + "epoch": 1.3216333794739965, + "mean_token_accuracy": 0.837837815284729, + "num_tokens": 17040370.0, + "step": 13367, + "train/ce_loss": 0.3447698950767517 + }, + { + "epoch": 1.3216333794739965, + "step": 13367, + "train/sim_loss": 0.04288607835769653 + }, + { + "epoch": 1.3216333794739965, + "step": 13367, + "train/total_loss": 0.07736307382583618 + }, + { + "entropy": 9.710683822631836, + "epoch": 1.321732252323512, + "mean_token_accuracy": 0.9085603356361389, + "num_tokens": 17061312.0, + "step": 13368, + "train/ce_loss": 0.36309510469436646 + }, + { + "epoch": 1.321732252323512, + "step": 13368, + "train/sim_loss": 0.04643082618713379 + }, + { + "epoch": 1.321732252323512, + "step": 13368, + "train/total_loss": 0.08274033665657043 + }, + { + "entropy": 9.383424758911133, + "epoch": 1.3218311251730275, + "mean_token_accuracy": 0.8314606547355652, + "num_tokens": 17074009.0, + "step": 13369, + "train/ce_loss": 0.7880679965019226 + }, + { + "epoch": 1.3218311251730275, + "step": 13369, + "train/sim_loss": 0.03302884101867676 + }, + { + "epoch": 1.3218311251730275, + "step": 13369, + "train/total_loss": 0.11183564364910126 + }, + { + "entropy": 9.107256889343262, + "epoch": 1.321929998022543, + "mean_token_accuracy": 0.8347508907318115, + "num_tokens": 17085050.0, + "step": 13370, + "train/ce_loss": 0.18107858300209045 + }, + { + "epoch": 1.321929998022543, + "step": 13370, + "train/sim_loss": 0.016140878200531006 + }, + { + "epoch": 1.321929998022543, + "step": 13370, + "train/total_loss": 0.03424873948097229 + }, + { + "entropy": 9.757635116577148, + "epoch": 1.3220288708720584, + "mean_token_accuracy": 0.8694581389427185, + "num_tokens": 17095490.0, + "step": 13371, + "train/ce_loss": 1.005323497338395e-06 + }, + { + "epoch": 1.3220288708720584, + "step": 13371, + "train/sim_loss": 0.025426149368286133 + }, + { + "epoch": 1.3220288708720584, + "step": 13371, + "train/total_loss": 0.02542624995112419 + }, + { + "entropy": 9.366535186767578, + "epoch": 1.3221277437215742, + "mean_token_accuracy": 0.860162615776062, + "num_tokens": 17109378.0, + "step": 13372, + "train/ce_loss": 0.5272496342658997 + }, + { + "epoch": 1.3221277437215742, + "step": 13372, + "train/sim_loss": 0.04813110828399658 + }, + { + "epoch": 1.3221277437215742, + "step": 13372, + "train/total_loss": 0.10085607320070267 + }, + { + "entropy": 9.654485702514648, + "epoch": 1.3222266165710896, + "mean_token_accuracy": 0.7940025925636292, + "num_tokens": 17119469.0, + "step": 13373, + "train/ce_loss": 0.4759131669998169 + }, + { + "epoch": 1.3222266165710896, + "step": 13373, + "train/sim_loss": 0.04499399662017822 + }, + { + "epoch": 1.3222266165710896, + "step": 13373, + "train/total_loss": 0.09258531033992767 + }, + { + "entropy": 9.261266708374023, + "epoch": 1.3223254894206051, + "mean_token_accuracy": 0.8626570701599121, + "num_tokens": 17132333.0, + "step": 13374, + "train/ce_loss": 0.30826255679130554 + }, + { + "epoch": 1.3223254894206051, + "step": 13374, + "train/sim_loss": 0.014085531234741211 + }, + { + "epoch": 1.3223254894206051, + "step": 13374, + "train/total_loss": 0.044911786913871765 + }, + { + "entropy": 8.930421829223633, + "epoch": 1.3224243622701206, + "mean_token_accuracy": 0.85628741979599, + "num_tokens": 17149733.0, + "step": 13375, + "train/ce_loss": 0.8179171681404114 + }, + { + "epoch": 1.3224243622701206, + "step": 13375, + "train/sim_loss": 0.0487859845161438 + }, + { + "epoch": 1.3224243622701206, + "step": 13375, + "train/total_loss": 0.1305777132511139 + }, + { + "entropy": 9.570283889770508, + "epoch": 1.322523235119636, + "mean_token_accuracy": 0.8466981053352356, + "num_tokens": 17159108.0, + "step": 13376, + "train/ce_loss": 0.5554689168930054 + }, + { + "epoch": 1.322523235119636, + "step": 13376, + "train/sim_loss": 0.06321543455123901 + }, + { + "epoch": 1.322523235119636, + "step": 13376, + "train/total_loss": 0.11876232922077179 + }, + { + "entropy": 10.217780113220215, + "epoch": 1.3226221079691518, + "mean_token_accuracy": 0.8732057213783264, + "num_tokens": 17169106.0, + "step": 13377, + "train/ce_loss": 0.26183784008026123 + }, + { + "epoch": 1.3226221079691518, + "step": 13377, + "train/sim_loss": 0.09610611200332642 + }, + { + "epoch": 1.3226221079691518, + "step": 13377, + "train/total_loss": 0.12228989601135254 + }, + { + "entropy": 9.580485343933105, + "epoch": 1.322720980818667, + "mean_token_accuracy": 0.8399532437324524, + "num_tokens": 17183770.0, + "step": 13378, + "train/ce_loss": 0.5350359082221985 + }, + { + "epoch": 1.322720980818667, + "step": 13378, + "train/sim_loss": 0.026187419891357422 + }, + { + "epoch": 1.322720980818667, + "step": 13378, + "train/total_loss": 0.07969100773334503 + }, + { + "entropy": 10.165133476257324, + "epoch": 1.3228198536681828, + "mean_token_accuracy": 0.8695651888847351, + "num_tokens": 17190499.0, + "step": 13379, + "train/ce_loss": 5.382727977121249e-06 + }, + { + "epoch": 1.3228198536681828, + "step": 13379, + "train/sim_loss": 0.030317068099975586 + }, + { + "epoch": 1.3228198536681828, + "step": 13379, + "train/total_loss": 0.030317606404423714 + }, + { + "epoch": 1.3229187265176983, + "grad_norm": 0.5934211015701294, + "learning_rate": 6.694605152549078e-06, + "loss": 0.0888, + "step": 13380 + }, + { + "entropy": 9.549409866333008, + "epoch": 1.3229187265176983, + "mean_token_accuracy": 0.8488371968269348, + "num_tokens": 17206576.0, + "step": 13380, + "train/ce_loss": 0.20215801894664764 + }, + { + "epoch": 1.3229187265176983, + "step": 13380, + "train/sim_loss": 0.021937012672424316 + }, + { + "epoch": 1.3229187265176983, + "step": 13380, + "train/total_loss": 0.04215281456708908 + }, + { + "entropy": 10.079780578613281, + "epoch": 1.3230175993672137, + "mean_token_accuracy": 0.849686861038208, + "num_tokens": 17219674.0, + "step": 13381, + "train/ce_loss": 0.8022977113723755 + }, + { + "epoch": 1.3230175993672137, + "step": 13381, + "train/sim_loss": 0.018021106719970703 + }, + { + "epoch": 1.3230175993672137, + "step": 13381, + "train/total_loss": 0.09825088083744049 + }, + { + "entropy": 9.562725067138672, + "epoch": 1.3231164722167292, + "mean_token_accuracy": 0.8976486921310425, + "num_tokens": 17228265.0, + "step": 13382, + "train/ce_loss": 0.5008400678634644 + }, + { + "epoch": 1.3231164722167292, + "step": 13382, + "train/sim_loss": 0.06121569871902466 + }, + { + "epoch": 1.3231164722167292, + "step": 13382, + "train/total_loss": 0.11129970848560333 + }, + { + "entropy": 9.578832626342773, + "epoch": 1.3232153450662447, + "mean_token_accuracy": 0.8445532321929932, + "num_tokens": 17240849.0, + "step": 13383, + "train/ce_loss": 0.44490087032318115 + }, + { + "epoch": 1.3232153450662447, + "step": 13383, + "train/sim_loss": 0.016199827194213867 + }, + { + "epoch": 1.3232153450662447, + "step": 13383, + "train/total_loss": 0.06068991497159004 + }, + { + "entropy": 9.493255615234375, + "epoch": 1.3233142179157604, + "mean_token_accuracy": 0.8438596725463867, + "num_tokens": 17252946.0, + "step": 13384, + "train/ce_loss": 0.5100287199020386 + }, + { + "epoch": 1.3233142179157604, + "step": 13384, + "train/sim_loss": 0.0296974778175354 + }, + { + "epoch": 1.3233142179157604, + "step": 13384, + "train/total_loss": 0.0807003527879715 + }, + { + "entropy": 9.678180694580078, + "epoch": 1.323413090765276, + "mean_token_accuracy": 0.8427562117576599, + "num_tokens": 17260770.0, + "step": 13385, + "train/ce_loss": 5.711831363441888e-07 + }, + { + "epoch": 1.323413090765276, + "step": 13385, + "train/sim_loss": 0.01353001594543457 + }, + { + "epoch": 1.323413090765276, + "step": 13385, + "train/total_loss": 0.013530072756111622 + }, + { + "entropy": 10.156087875366211, + "epoch": 1.3235119636147914, + "mean_token_accuracy": 0.8544973731040955, + "num_tokens": 17268564.0, + "step": 13386, + "train/ce_loss": 5.626108645628847e-07 + }, + { + "epoch": 1.3235119636147914, + "step": 13386, + "train/sim_loss": 0.018782854080200195 + }, + { + "epoch": 1.3235119636147914, + "step": 13386, + "train/total_loss": 0.018782909959554672 + }, + { + "entropy": 9.318023681640625, + "epoch": 1.3236108364643069, + "mean_token_accuracy": 0.8521462678909302, + "num_tokens": 17285892.0, + "step": 13387, + "train/ce_loss": 0.7071394920349121 + }, + { + "epoch": 1.3236108364643069, + "step": 13387, + "train/sim_loss": 0.03576469421386719 + }, + { + "epoch": 1.3236108364643069, + "step": 13387, + "train/total_loss": 0.10647864639759064 + }, + { + "entropy": 9.268230438232422, + "epoch": 1.3237097093138224, + "mean_token_accuracy": 0.8358974456787109, + "num_tokens": 17295722.0, + "step": 13388, + "train/ce_loss": 0.6516973376274109 + }, + { + "epoch": 1.3237097093138224, + "step": 13388, + "train/sim_loss": 0.03938788175582886 + }, + { + "epoch": 1.3237097093138224, + "step": 13388, + "train/total_loss": 0.10455761849880219 + }, + { + "entropy": 9.778114318847656, + "epoch": 1.323808582163338, + "mean_token_accuracy": 0.8486111164093018, + "num_tokens": 17309833.0, + "step": 13389, + "train/ce_loss": 0.5977028012275696 + }, + { + "epoch": 1.323808582163338, + "step": 13389, + "train/sim_loss": 0.03786182403564453 + }, + { + "epoch": 1.323808582163338, + "step": 13389, + "train/total_loss": 0.09763211011886597 + }, + { + "entropy": 9.230982780456543, + "epoch": 1.3239074550128533, + "mean_token_accuracy": 0.8407257795333862, + "num_tokens": 17321546.0, + "step": 13390, + "train/ce_loss": 0.6178915500640869 + }, + { + "epoch": 1.3239074550128533, + "step": 13390, + "train/sim_loss": 0.04175448417663574 + }, + { + "epoch": 1.3239074550128533, + "step": 13390, + "train/total_loss": 0.10354363918304443 + }, + { + "entropy": 9.23946762084961, + "epoch": 1.324006327862369, + "mean_token_accuracy": 0.8524844646453857, + "num_tokens": 17331200.0, + "step": 13391, + "train/ce_loss": 0.4426475763320923 + }, + { + "epoch": 1.324006327862369, + "step": 13391, + "train/sim_loss": 0.06673377752304077 + }, + { + "epoch": 1.324006327862369, + "step": 13391, + "train/total_loss": 0.11099854111671448 + }, + { + "entropy": 9.495895385742188, + "epoch": 1.3241052007118845, + "mean_token_accuracy": 0.8227488398551941, + "num_tokens": 17345209.0, + "step": 13392, + "train/ce_loss": 0.44118639826774597 + }, + { + "epoch": 1.3241052007118845, + "step": 13392, + "train/sim_loss": 0.03713393211364746 + }, + { + "epoch": 1.3241052007118845, + "step": 13392, + "train/total_loss": 0.0812525749206543 + }, + { + "entropy": 9.776432037353516, + "epoch": 1.3242040735614, + "mean_token_accuracy": 0.8364688754081726, + "num_tokens": 17361364.0, + "step": 13393, + "train/ce_loss": 0.535634458065033 + }, + { + "epoch": 1.3242040735614, + "step": 13393, + "train/sim_loss": 0.017652153968811035 + }, + { + "epoch": 1.3242040735614, + "step": 13393, + "train/total_loss": 0.07121559977531433 + }, + { + "entropy": 9.754926681518555, + "epoch": 1.3243029464109155, + "mean_token_accuracy": 0.8741865754127502, + "num_tokens": 17376698.0, + "step": 13394, + "train/ce_loss": 0.8127319812774658 + }, + { + "epoch": 1.3243029464109155, + "step": 13394, + "train/sim_loss": 0.04404401779174805 + }, + { + "epoch": 1.3243029464109155, + "step": 13394, + "train/total_loss": 0.12531721591949463 + }, + { + "entropy": 8.636177062988281, + "epoch": 1.324401819260431, + "mean_token_accuracy": 0.8808080554008484, + "num_tokens": 17383237.0, + "step": 13395, + "train/ce_loss": 0.6401941180229187 + }, + { + "epoch": 1.324401819260431, + "step": 13395, + "train/sim_loss": 0.14000415802001953 + }, + { + "epoch": 1.324401819260431, + "step": 13395, + "train/total_loss": 0.2040235698223114 + }, + { + "entropy": 9.787298202514648, + "epoch": 1.3245006921099467, + "mean_token_accuracy": 0.8571428656578064, + "num_tokens": 17390584.0, + "step": 13396, + "train/ce_loss": 0.5638439059257507 + }, + { + "epoch": 1.3245006921099467, + "step": 13396, + "train/sim_loss": 0.13879072666168213 + }, + { + "epoch": 1.3245006921099467, + "step": 13396, + "train/total_loss": 0.19517511129379272 + }, + { + "entropy": 9.499061584472656, + "epoch": 1.3245995649594622, + "mean_token_accuracy": 0.8280657529830933, + "num_tokens": 17402543.0, + "step": 13397, + "train/ce_loss": 0.21656183898448944 + }, + { + "epoch": 1.3245995649594622, + "step": 13397, + "train/sim_loss": 0.010110616683959961 + }, + { + "epoch": 1.3245995649594622, + "step": 13397, + "train/total_loss": 0.031766802072525024 + }, + { + "entropy": 9.436878204345703, + "epoch": 1.3246984378089777, + "mean_token_accuracy": 0.9028077721595764, + "num_tokens": 17417319.0, + "step": 13398, + "train/ce_loss": 6.096830134083575e-07 + }, + { + "epoch": 1.3246984378089777, + "step": 13398, + "train/sim_loss": 0.030514776706695557 + }, + { + "epoch": 1.3246984378089777, + "step": 13398, + "train/total_loss": 0.03051483817398548 + }, + { + "entropy": 9.023988723754883, + "epoch": 1.3247973106584932, + "mean_token_accuracy": 0.8538367748260498, + "num_tokens": 17426942.0, + "step": 13399, + "train/ce_loss": 0.42580607533454895 + }, + { + "epoch": 1.3247973106584932, + "step": 13399, + "train/sim_loss": 0.030776560306549072 + }, + { + "epoch": 1.3247973106584932, + "step": 13399, + "train/total_loss": 0.07335716485977173 + }, + { + "epoch": 1.3248961835080086, + "grad_norm": 0.5857319831848145, + "learning_rate": 6.68966028779113e-06, + "loss": 0.0861, + "step": 13400 + }, + { + "entropy": 8.656081199645996, + "epoch": 1.3248961835080086, + "mean_token_accuracy": 0.8378099203109741, + "num_tokens": 17441944.0, + "step": 13400, + "train/ce_loss": 0.43294909596443176 + }, + { + "epoch": 1.3248961835080086, + "step": 13400, + "train/sim_loss": 0.01580679416656494 + }, + { + "epoch": 1.3248961835080086, + "step": 13400, + "train/total_loss": 0.05910170450806618 + }, + { + "entropy": 9.18591594696045, + "epoch": 1.3249950563575243, + "mean_token_accuracy": 0.8318408131599426, + "num_tokens": 17457305.0, + "step": 13401, + "train/ce_loss": 0.5830603837966919 + }, + { + "epoch": 1.3249950563575243, + "step": 13401, + "train/sim_loss": 0.07842451333999634 + }, + { + "epoch": 1.3249950563575243, + "step": 13401, + "train/total_loss": 0.13673055171966553 + }, + { + "entropy": 9.52663803100586, + "epoch": 1.3250939292070396, + "mean_token_accuracy": 0.8541353344917297, + "num_tokens": 17464204.0, + "step": 13402, + "train/ce_loss": 3.1478049322686275e-07 + }, + { + "epoch": 1.3250939292070396, + "step": 13402, + "train/sim_loss": 0.008518099784851074 + }, + { + "epoch": 1.3250939292070396, + "step": 13402, + "train/total_loss": 0.008518131449818611 + }, + { + "entropy": 9.35667610168457, + "epoch": 1.3251928020565553, + "mean_token_accuracy": 0.8393063545227051, + "num_tokens": 17481287.0, + "step": 13403, + "train/ce_loss": 0.20222939550876617 + }, + { + "epoch": 1.3251928020565553, + "step": 13403, + "train/sim_loss": 0.023484885692596436 + }, + { + "epoch": 1.3251928020565553, + "step": 13403, + "train/total_loss": 0.04370782524347305 + }, + { + "entropy": 9.45348834991455, + "epoch": 1.3252916749060708, + "mean_token_accuracy": 0.8585858345031738, + "num_tokens": 17495042.0, + "step": 13404, + "train/ce_loss": 0.33068883419036865 + }, + { + "epoch": 1.3252916749060708, + "step": 13404, + "train/sim_loss": 0.036371707916259766 + }, + { + "epoch": 1.3252916749060708, + "step": 13404, + "train/total_loss": 0.06944058835506439 + }, + { + "entropy": 9.823037147521973, + "epoch": 1.3253905477555863, + "mean_token_accuracy": 0.8552036285400391, + "num_tokens": 17502752.0, + "step": 13405, + "train/ce_loss": 0.5795084834098816 + }, + { + "epoch": 1.3253905477555863, + "step": 13405, + "train/sim_loss": 0.0388789176940918 + }, + { + "epoch": 1.3253905477555863, + "step": 13405, + "train/total_loss": 0.09682977199554443 + }, + { + "entropy": 9.665068626403809, + "epoch": 1.3254894206051018, + "mean_token_accuracy": 0.8879310488700867, + "num_tokens": 17524239.0, + "step": 13406, + "train/ce_loss": 0.12118284404277802 + }, + { + "epoch": 1.3254894206051018, + "step": 13406, + "train/sim_loss": 0.04744154214859009 + }, + { + "epoch": 1.3254894206051018, + "step": 13406, + "train/total_loss": 0.05955982580780983 + }, + { + "entropy": 9.253952026367188, + "epoch": 1.3255882934546173, + "mean_token_accuracy": 0.8920780420303345, + "num_tokens": 17539569.0, + "step": 13407, + "train/ce_loss": 0.326447069644928 + }, + { + "epoch": 1.3255882934546173, + "step": 13407, + "train/sim_loss": 0.024942636489868164 + }, + { + "epoch": 1.3255882934546173, + "step": 13407, + "train/total_loss": 0.05758734419941902 + }, + { + "entropy": 9.452646255493164, + "epoch": 1.325687166304133, + "mean_token_accuracy": 0.8880000114440918, + "num_tokens": 17550035.0, + "step": 13408, + "train/ce_loss": 3.496753038234601e-07 + }, + { + "epoch": 1.325687166304133, + "step": 13408, + "train/sim_loss": 0.014201760292053223 + }, + { + "epoch": 1.325687166304133, + "step": 13408, + "train/total_loss": 0.014201795682311058 + }, + { + "entropy": 9.520074844360352, + "epoch": 1.3257860391536485, + "mean_token_accuracy": 0.8482758402824402, + "num_tokens": 17562190.0, + "step": 13409, + "train/ce_loss": 0.6697500944137573 + }, + { + "epoch": 1.3257860391536485, + "step": 13409, + "train/sim_loss": 0.040941834449768066 + }, + { + "epoch": 1.3257860391536485, + "step": 13409, + "train/total_loss": 0.10791684687137604 + }, + { + "entropy": 9.813138961791992, + "epoch": 1.325884912003164, + "mean_token_accuracy": 0.842783510684967, + "num_tokens": 17572301.0, + "step": 13410, + "train/ce_loss": 9.831434226725833e-07 + }, + { + "epoch": 1.325884912003164, + "step": 13410, + "train/sim_loss": 0.013533294200897217 + }, + { + "epoch": 1.325884912003164, + "step": 13410, + "train/total_loss": 0.013533392921090126 + }, + { + "entropy": 9.882838249206543, + "epoch": 1.3259837848526794, + "mean_token_accuracy": 0.9271758198738098, + "num_tokens": 17584648.0, + "step": 13411, + "train/ce_loss": 0.7409204244613647 + }, + { + "epoch": 1.3259837848526794, + "step": 13411, + "train/sim_loss": 0.045946717262268066 + }, + { + "epoch": 1.3259837848526794, + "step": 13411, + "train/total_loss": 0.12003876268863678 + }, + { + "entropy": 9.198545455932617, + "epoch": 1.326082657702195, + "mean_token_accuracy": 0.8588873744010925, + "num_tokens": 17599604.0, + "step": 13412, + "train/ce_loss": 0.47017940878868103 + }, + { + "epoch": 1.326082657702195, + "step": 13412, + "train/sim_loss": 0.09526044130325317 + }, + { + "epoch": 1.326082657702195, + "step": 13412, + "train/total_loss": 0.14227838814258575 + }, + { + "entropy": 8.926258087158203, + "epoch": 1.3261815305517106, + "mean_token_accuracy": 0.8481153249740601, + "num_tokens": 17610342.0, + "step": 13413, + "train/ce_loss": 0.1956724226474762 + }, + { + "epoch": 1.3261815305517106, + "step": 13413, + "train/sim_loss": 0.06295239925384521 + }, + { + "epoch": 1.3261815305517106, + "step": 13413, + "train/total_loss": 0.08251964300870895 + }, + { + "entropy": 9.241889953613281, + "epoch": 1.326280403401226, + "mean_token_accuracy": 0.8482532501220703, + "num_tokens": 17623721.0, + "step": 13414, + "train/ce_loss": 0.5698719024658203 + }, + { + "epoch": 1.326280403401226, + "step": 13414, + "train/sim_loss": 0.08861804008483887 + }, + { + "epoch": 1.326280403401226, + "step": 13414, + "train/total_loss": 0.14560523629188538 + }, + { + "entropy": 9.674175262451172, + "epoch": 1.3263792762507416, + "mean_token_accuracy": 0.8609112501144409, + "num_tokens": 17641794.0, + "step": 13415, + "train/ce_loss": 0.37267735600471497 + }, + { + "epoch": 1.3263792762507416, + "step": 13415, + "train/sim_loss": 0.016665101051330566 + }, + { + "epoch": 1.3263792762507416, + "step": 13415, + "train/total_loss": 0.05393283814191818 + }, + { + "entropy": 9.320576667785645, + "epoch": 1.326478149100257, + "mean_token_accuracy": 0.8548035025596619, + "num_tokens": 17658106.0, + "step": 13416, + "train/ce_loss": 0.43375933170318604 + }, + { + "epoch": 1.326478149100257, + "step": 13416, + "train/sim_loss": 0.07074224948883057 + }, + { + "epoch": 1.326478149100257, + "step": 13416, + "train/total_loss": 0.11411818861961365 + }, + { + "entropy": 9.340551376342773, + "epoch": 1.3265770219497726, + "mean_token_accuracy": 0.842285692691803, + "num_tokens": 17673133.0, + "step": 13417, + "train/ce_loss": 0.23295927047729492 + }, + { + "epoch": 1.3265770219497726, + "step": 13417, + "train/sim_loss": 0.03235518932342529 + }, + { + "epoch": 1.3265770219497726, + "step": 13417, + "train/total_loss": 0.055651117116212845 + }, + { + "entropy": 9.106839179992676, + "epoch": 1.326675894799288, + "mean_token_accuracy": 0.7826510667800903, + "num_tokens": 17685236.0, + "step": 13418, + "train/ce_loss": 0.612234890460968 + }, + { + "epoch": 1.326675894799288, + "step": 13418, + "train/sim_loss": 0.07363414764404297 + }, + { + "epoch": 1.326675894799288, + "step": 13418, + "train/total_loss": 0.134857639670372 + }, + { + "entropy": 9.978455543518066, + "epoch": 1.3267747676488035, + "mean_token_accuracy": 0.9054545164108276, + "num_tokens": 17694575.0, + "step": 13419, + "train/ce_loss": 4.776969717568136e-07 + }, + { + "epoch": 1.3267747676488035, + "step": 13419, + "train/sim_loss": 0.007936716079711914 + }, + { + "epoch": 1.3267747676488035, + "step": 13419, + "train/total_loss": 0.00793676357716322 + }, + { + "epoch": 1.3268736404983192, + "grad_norm": 0.460967481136322, + "learning_rate": 6.68471542303318e-06, + "loss": 0.0845, + "step": 13420 + }, + { + "entropy": 8.997169494628906, + "epoch": 1.3268736404983192, + "mean_token_accuracy": 0.853157103061676, + "num_tokens": 17702229.0, + "step": 13420, + "train/ce_loss": 1.5879022612352856e-06 + }, + { + "epoch": 1.3268736404983192, + "step": 13420, + "train/sim_loss": 0.05131363868713379 + }, + { + "epoch": 1.3268736404983192, + "step": 13420, + "train/total_loss": 0.05131379887461662 + }, + { + "entropy": 9.354909896850586, + "epoch": 1.3269725133478347, + "mean_token_accuracy": 0.8880707025527954, + "num_tokens": 17718491.0, + "step": 13421, + "train/ce_loss": 0.3112185597419739 + }, + { + "epoch": 1.3269725133478347, + "step": 13421, + "train/sim_loss": 0.08755373954772949 + }, + { + "epoch": 1.3269725133478347, + "step": 13421, + "train/total_loss": 0.118675597012043 + }, + { + "entropy": 9.948894500732422, + "epoch": 1.3270713861973502, + "mean_token_accuracy": 0.8422818779945374, + "num_tokens": 17732042.0, + "step": 13422, + "train/ce_loss": 1.2160199958088924e-06 + }, + { + "epoch": 1.3270713861973502, + "step": 13422, + "train/sim_loss": 0.02722907066345215 + }, + { + "epoch": 1.3270713861973502, + "step": 13422, + "train/total_loss": 0.02722919173538685 + }, + { + "entropy": 9.197949409484863, + "epoch": 1.3271702590468657, + "mean_token_accuracy": 0.8467561602592468, + "num_tokens": 17741337.0, + "step": 13423, + "train/ce_loss": 0.4781422019004822 + }, + { + "epoch": 1.3271702590468657, + "step": 13423, + "train/sim_loss": 0.044617533683776855 + }, + { + "epoch": 1.3271702590468657, + "step": 13423, + "train/total_loss": 0.09243175387382507 + }, + { + "entropy": 9.198509216308594, + "epoch": 1.3272691318963812, + "mean_token_accuracy": 0.8591408729553223, + "num_tokens": 17757441.0, + "step": 13424, + "train/ce_loss": 0.23874755203723907 + }, + { + "epoch": 1.3272691318963812, + "step": 13424, + "train/sim_loss": 0.0228843092918396 + }, + { + "epoch": 1.3272691318963812, + "step": 13424, + "train/total_loss": 0.04675906524062157 + }, + { + "entropy": 8.843297958374023, + "epoch": 1.327368004745897, + "mean_token_accuracy": 0.8217270374298096, + "num_tokens": 17766364.0, + "step": 13425, + "train/ce_loss": 1.0846635103225708 + }, + { + "epoch": 1.327368004745897, + "step": 13425, + "train/sim_loss": 0.04282248020172119 + }, + { + "epoch": 1.327368004745897, + "step": 13425, + "train/total_loss": 0.15128883719444275 + }, + { + "entropy": 9.508880615234375, + "epoch": 1.3274668775954124, + "mean_token_accuracy": 0.8257491588592529, + "num_tokens": 17783797.0, + "step": 13426, + "train/ce_loss": 0.4051012694835663 + }, + { + "epoch": 1.3274668775954124, + "step": 13426, + "train/sim_loss": 0.02033168077468872 + }, + { + "epoch": 1.3274668775954124, + "step": 13426, + "train/total_loss": 0.06084180995821953 + }, + { + "entropy": 9.441737174987793, + "epoch": 1.3275657504449279, + "mean_token_accuracy": 0.8347222208976746, + "num_tokens": 17796349.0, + "step": 13427, + "train/ce_loss": 0.46221059560775757 + }, + { + "epoch": 1.3275657504449279, + "step": 13427, + "train/sim_loss": 0.036103010177612305 + }, + { + "epoch": 1.3275657504449279, + "step": 13427, + "train/total_loss": 0.0823240727186203 + }, + { + "entropy": 9.303205490112305, + "epoch": 1.3276646232944433, + "mean_token_accuracy": 0.8409371376037598, + "num_tokens": 17806108.0, + "step": 13428, + "train/ce_loss": 0.4816519618034363 + }, + { + "epoch": 1.3276646232944433, + "step": 13428, + "train/sim_loss": 0.02608025074005127 + }, + { + "epoch": 1.3276646232944433, + "step": 13428, + "train/total_loss": 0.07424545288085938 + }, + { + "entropy": 9.678404808044434, + "epoch": 1.3277634961439588, + "mean_token_accuracy": 0.8771929740905762, + "num_tokens": 17823029.0, + "step": 13429, + "train/ce_loss": 1.3683070392289665e-06 + }, + { + "epoch": 1.3277634961439588, + "step": 13429, + "train/sim_loss": 0.025435566902160645 + }, + { + "epoch": 1.3277634961439588, + "step": 13429, + "train/total_loss": 0.02543570287525654 + }, + { + "entropy": 9.830622673034668, + "epoch": 1.3278623689934743, + "mean_token_accuracy": 0.8526490330696106, + "num_tokens": 17838679.0, + "step": 13430, + "train/ce_loss": 0.5576057434082031 + }, + { + "epoch": 1.3278623689934743, + "step": 13430, + "train/sim_loss": 0.11928701400756836 + }, + { + "epoch": 1.3278623689934743, + "step": 13430, + "train/total_loss": 0.1750475913286209 + }, + { + "entropy": 9.086605072021484, + "epoch": 1.3279612418429898, + "mean_token_accuracy": 0.8188889026641846, + "num_tokens": 17853268.0, + "step": 13431, + "train/ce_loss": 1.1442152261734009 + }, + { + "epoch": 1.3279612418429898, + "step": 13431, + "train/sim_loss": 0.1030653715133667 + }, + { + "epoch": 1.3279612418429898, + "step": 13431, + "train/total_loss": 0.2174868881702423 + }, + { + "entropy": 9.621644973754883, + "epoch": 1.3280601146925055, + "mean_token_accuracy": 0.8470744490623474, + "num_tokens": 17862187.0, + "step": 13432, + "train/ce_loss": 0.8173866868019104 + }, + { + "epoch": 1.3280601146925055, + "step": 13432, + "train/sim_loss": 0.07135868072509766 + }, + { + "epoch": 1.3280601146925055, + "step": 13432, + "train/total_loss": 0.15309736132621765 + }, + { + "entropy": 9.020383834838867, + "epoch": 1.328158987542021, + "mean_token_accuracy": 0.8002114295959473, + "num_tokens": 17871295.0, + "step": 13433, + "train/ce_loss": 0.2359107881784439 + }, + { + "epoch": 1.328158987542021, + "step": 13433, + "train/sim_loss": 0.03886038064956665 + }, + { + "epoch": 1.328158987542021, + "step": 13433, + "train/total_loss": 0.06245145946741104 + }, + { + "entropy": 9.740494728088379, + "epoch": 1.3282578603915365, + "mean_token_accuracy": 0.904347836971283, + "num_tokens": 17883373.0, + "step": 13434, + "train/ce_loss": 3.6248866308596916e-06 + }, + { + "epoch": 1.3282578603915365, + "step": 13434, + "train/sim_loss": 0.0309523344039917 + }, + { + "epoch": 1.3282578603915365, + "step": 13434, + "train/total_loss": 0.0309526976197958 + }, + { + "entropy": 9.48328685760498, + "epoch": 1.328356733241052, + "mean_token_accuracy": 0.8802395462989807, + "num_tokens": 17891899.0, + "step": 13435, + "train/ce_loss": 0.3746306896209717 + }, + { + "epoch": 1.328356733241052, + "step": 13435, + "train/sim_loss": 0.03842878341674805 + }, + { + "epoch": 1.328356733241052, + "step": 13435, + "train/total_loss": 0.07589185237884521 + }, + { + "entropy": 9.232120513916016, + "epoch": 1.3284556060905675, + "mean_token_accuracy": 0.8641456365585327, + "num_tokens": 17899945.0, + "step": 13436, + "train/ce_loss": 0.4016989469528198 + }, + { + "epoch": 1.3284556060905675, + "step": 13436, + "train/sim_loss": 0.03352391719818115 + }, + { + "epoch": 1.3284556060905675, + "step": 13436, + "train/total_loss": 0.07369381189346313 + }, + { + "entropy": 9.467859268188477, + "epoch": 1.3285544789400832, + "mean_token_accuracy": 0.8794991970062256, + "num_tokens": 17907152.0, + "step": 13437, + "train/ce_loss": 0.2982741892337799 + }, + { + "epoch": 1.3285544789400832, + "step": 13437, + "train/sim_loss": 0.01435232162475586 + }, + { + "epoch": 1.3285544789400832, + "step": 13437, + "train/total_loss": 0.04417974129319191 + }, + { + "entropy": 9.652729034423828, + "epoch": 1.3286533517895986, + "mean_token_accuracy": 0.8345959782600403, + "num_tokens": 17925412.0, + "step": 13438, + "train/ce_loss": 0.6078199148178101 + }, + { + "epoch": 1.3286533517895986, + "step": 13438, + "train/sim_loss": 0.03314697742462158 + }, + { + "epoch": 1.3286533517895986, + "step": 13438, + "train/total_loss": 0.09392897039651871 + }, + { + "entropy": 9.649467468261719, + "epoch": 1.3287522246391141, + "mean_token_accuracy": 0.927943766117096, + "num_tokens": 17939361.0, + "step": 13439, + "train/ce_loss": 5.145384989191371e-07 + }, + { + "epoch": 1.3287522246391141, + "step": 13439, + "train/sim_loss": 0.034844934940338135 + }, + { + "epoch": 1.3287522246391141, + "step": 13439, + "train/total_loss": 0.03484498709440231 + }, + { + "epoch": 1.3288510974886296, + "grad_norm": 0.5230907797813416, + "learning_rate": 6.679770558275232e-06, + "loss": 0.0888, + "step": 13440 + }, + { + "entropy": 10.0940580368042, + "epoch": 1.3288510974886296, + "mean_token_accuracy": 0.8588770627975464, + "num_tokens": 17953673.0, + "step": 13440, + "train/ce_loss": 0.5994626879692078 + }, + { + "epoch": 1.3288510974886296, + "step": 13440, + "train/sim_loss": 0.04774516820907593 + }, + { + "epoch": 1.3288510974886296, + "step": 13440, + "train/total_loss": 0.1076914370059967 + }, + { + "entropy": 9.200372695922852, + "epoch": 1.328949970338145, + "mean_token_accuracy": 0.823687732219696, + "num_tokens": 17962246.0, + "step": 13441, + "train/ce_loss": 0.6430437564849854 + }, + { + "epoch": 1.328949970338145, + "step": 13441, + "train/sim_loss": 0.034832537174224854 + }, + { + "epoch": 1.328949970338145, + "step": 13441, + "train/total_loss": 0.09913691133260727 + }, + { + "entropy": 8.951394081115723, + "epoch": 1.3290488431876606, + "mean_token_accuracy": 0.8709677457809448, + "num_tokens": 17968069.0, + "step": 13442, + "train/ce_loss": 0.49412569403648376 + }, + { + "epoch": 1.3290488431876606, + "step": 13442, + "train/sim_loss": 0.06723189353942871 + }, + { + "epoch": 1.3290488431876606, + "step": 13442, + "train/total_loss": 0.1166444644331932 + }, + { + "entropy": 9.536846160888672, + "epoch": 1.329147716037176, + "mean_token_accuracy": 0.8189415335655212, + "num_tokens": 17982160.0, + "step": 13443, + "train/ce_loss": 0.5579275488853455 + }, + { + "epoch": 1.329147716037176, + "step": 13443, + "train/sim_loss": 0.028571724891662598 + }, + { + "epoch": 1.329147716037176, + "step": 13443, + "train/total_loss": 0.08436448127031326 + }, + { + "entropy": 8.98958969116211, + "epoch": 1.3292465888866918, + "mean_token_accuracy": 0.8895781636238098, + "num_tokens": 17991827.0, + "step": 13444, + "train/ce_loss": 3.75075444480899e-07 + }, + { + "epoch": 1.3292465888866918, + "step": 13444, + "train/sim_loss": 0.03155684471130371 + }, + { + "epoch": 1.3292465888866918, + "step": 13444, + "train/total_loss": 0.031556881964206696 + }, + { + "entropy": 9.455078125, + "epoch": 1.3293454617362073, + "mean_token_accuracy": 0.845610499382019, + "num_tokens": 18006734.0, + "step": 13445, + "train/ce_loss": 0.4707501530647278 + }, + { + "epoch": 1.3293454617362073, + "step": 13445, + "train/sim_loss": 0.03955042362213135 + }, + { + "epoch": 1.3293454617362073, + "step": 13445, + "train/total_loss": 0.08662544190883636 + }, + { + "entropy": 9.593320846557617, + "epoch": 1.3294443345857228, + "mean_token_accuracy": 0.8599790930747986, + "num_tokens": 18017542.0, + "step": 13446, + "train/ce_loss": 0.3347267508506775 + }, + { + "epoch": 1.3294443345857228, + "step": 13446, + "train/sim_loss": 0.041707396507263184 + }, + { + "epoch": 1.3294443345857228, + "step": 13446, + "train/total_loss": 0.0751800686120987 + }, + { + "entropy": 9.574045181274414, + "epoch": 1.3295432074352382, + "mean_token_accuracy": 0.8284883499145508, + "num_tokens": 18032984.0, + "step": 13447, + "train/ce_loss": 3.1970578220352763e-06 + }, + { + "epoch": 1.3295432074352382, + "step": 13447, + "train/sim_loss": 0.026103317737579346 + }, + { + "epoch": 1.3295432074352382, + "step": 13447, + "train/total_loss": 0.026103638112545013 + }, + { + "entropy": 9.540597915649414, + "epoch": 1.3296420802847537, + "mean_token_accuracy": 0.874799370765686, + "num_tokens": 18043262.0, + "step": 13448, + "train/ce_loss": 6.490265604952583e-07 + }, + { + "epoch": 1.3296420802847537, + "step": 13448, + "train/sim_loss": 0.023434221744537354 + }, + { + "epoch": 1.3296420802847537, + "step": 13448, + "train/total_loss": 0.023434286937117577 + }, + { + "entropy": 9.531448364257812, + "epoch": 1.3297409531342694, + "mean_token_accuracy": 0.8573072552680969, + "num_tokens": 18054481.0, + "step": 13449, + "train/ce_loss": 0.4978332221508026 + }, + { + "epoch": 1.3297409531342694, + "step": 13449, + "train/sim_loss": 0.05885589122772217 + }, + { + "epoch": 1.3297409531342694, + "step": 13449, + "train/total_loss": 0.10863921046257019 + }, + { + "entropy": 9.672698020935059, + "epoch": 1.329839825983785, + "mean_token_accuracy": 0.9118942618370056, + "num_tokens": 18069236.0, + "step": 13450, + "train/ce_loss": 9.032352750182326e-07 + }, + { + "epoch": 1.329839825983785, + "step": 13450, + "train/sim_loss": 0.021299123764038086 + }, + { + "epoch": 1.329839825983785, + "step": 13450, + "train/total_loss": 0.02129921317100525 + }, + { + "entropy": 9.15734577178955, + "epoch": 1.3299386988333004, + "mean_token_accuracy": 0.8573529124259949, + "num_tokens": 18080366.0, + "step": 13451, + "train/ce_loss": 0.4846787452697754 + }, + { + "epoch": 1.3299386988333004, + "step": 13451, + "train/sim_loss": 0.04588437080383301 + }, + { + "epoch": 1.3299386988333004, + "step": 13451, + "train/total_loss": 0.09435224533081055 + }, + { + "entropy": 9.356194496154785, + "epoch": 1.3300375716828159, + "mean_token_accuracy": 0.8742856979370117, + "num_tokens": 18092157.0, + "step": 13452, + "train/ce_loss": 0.25760549306869507 + }, + { + "epoch": 1.3300375716828159, + "step": 13452, + "train/sim_loss": 0.014703094959259033 + }, + { + "epoch": 1.3300375716828159, + "step": 13452, + "train/total_loss": 0.0404636450111866 + }, + { + "entropy": 9.26579761505127, + "epoch": 1.3301364445323314, + "mean_token_accuracy": 0.8293172717094421, + "num_tokens": 18108642.0, + "step": 13453, + "train/ce_loss": 0.2077663391828537 + }, + { + "epoch": 1.3301364445323314, + "step": 13453, + "train/sim_loss": 0.026954293251037598 + }, + { + "epoch": 1.3301364445323314, + "step": 13453, + "train/total_loss": 0.047730930149555206 + }, + { + "entropy": 9.076017379760742, + "epoch": 1.330235317381847, + "mean_token_accuracy": 0.8227571249008179, + "num_tokens": 18117715.0, + "step": 13454, + "train/ce_loss": 0.5053238868713379 + }, + { + "epoch": 1.330235317381847, + "step": 13454, + "train/sim_loss": 0.05724930763244629 + }, + { + "epoch": 1.330235317381847, + "step": 13454, + "train/total_loss": 0.10778169333934784 + }, + { + "entropy": 9.029875755310059, + "epoch": 1.3303341902313623, + "mean_token_accuracy": 0.8095238208770752, + "num_tokens": 18130474.0, + "step": 13455, + "train/ce_loss": 0.35933631658554077 + }, + { + "epoch": 1.3303341902313623, + "step": 13455, + "train/sim_loss": 0.047861695289611816 + }, + { + "epoch": 1.3303341902313623, + "step": 13455, + "train/total_loss": 0.08379532396793365 + }, + { + "entropy": 9.900814056396484, + "epoch": 1.330433063080878, + "mean_token_accuracy": 0.9140461087226868, + "num_tokens": 18139750.0, + "step": 13456, + "train/ce_loss": 4.288485513370688e-07 + }, + { + "epoch": 1.330433063080878, + "step": 13456, + "train/sim_loss": 0.012339115142822266 + }, + { + "epoch": 1.330433063080878, + "step": 13456, + "train/total_loss": 0.012339157983660698 + }, + { + "entropy": 9.97657299041748, + "epoch": 1.3305319359303935, + "mean_token_accuracy": 0.8695651888847351, + "num_tokens": 18147470.0, + "step": 13457, + "train/ce_loss": 4.474862180359196e-06 + }, + { + "epoch": 1.3305319359303935, + "step": 13457, + "train/sim_loss": 0.03776431083679199 + }, + { + "epoch": 1.3305319359303935, + "step": 13457, + "train/total_loss": 0.03776475787162781 + }, + { + "entropy": 9.393903732299805, + "epoch": 1.330630808779909, + "mean_token_accuracy": 0.8630527853965759, + "num_tokens": 18154077.0, + "step": 13458, + "train/ce_loss": 0.22982366383075714 + }, + { + "epoch": 1.330630808779909, + "step": 13458, + "train/sim_loss": 0.010138869285583496 + }, + { + "epoch": 1.330630808779909, + "step": 13458, + "train/total_loss": 0.03312123566865921 + }, + { + "entropy": 9.650249481201172, + "epoch": 1.3307296816294245, + "mean_token_accuracy": 0.8598870038986206, + "num_tokens": 18171377.0, + "step": 13459, + "train/ce_loss": 0.4766111969947815 + }, + { + "epoch": 1.3307296816294245, + "step": 13459, + "train/sim_loss": 0.05140268802642822 + }, + { + "epoch": 1.3307296816294245, + "step": 13459, + "train/total_loss": 0.09906381368637085 + }, + { + "epoch": 1.33082855447894, + "grad_norm": 0.5107682943344116, + "learning_rate": 6.674825693517283e-06, + "loss": 0.0795, + "step": 13460 + }, + { + "entropy": 9.739215850830078, + "epoch": 1.33082855447894, + "mean_token_accuracy": 0.8568181991577148, + "num_tokens": 18179674.0, + "step": 13460, + "train/ce_loss": 4.286331204639282e-06 + }, + { + "epoch": 1.33082855447894, + "step": 13460, + "train/sim_loss": 0.09944701194763184 + }, + { + "epoch": 1.33082855447894, + "step": 13460, + "train/total_loss": 0.09944744408130646 + }, + { + "entropy": 9.827133178710938, + "epoch": 1.3309274273284557, + "mean_token_accuracy": 0.8345864415168762, + "num_tokens": 18193374.0, + "step": 13461, + "train/ce_loss": 0.7144044637680054 + }, + { + "epoch": 1.3309274273284557, + "step": 13461, + "train/sim_loss": 0.06739234924316406 + }, + { + "epoch": 1.3309274273284557, + "step": 13461, + "train/total_loss": 0.13883280754089355 + }, + { + "entropy": 9.092930793762207, + "epoch": 1.3310263001779712, + "mean_token_accuracy": 0.8046709299087524, + "num_tokens": 18203921.0, + "step": 13462, + "train/ce_loss": 0.8048780560493469 + }, + { + "epoch": 1.3310263001779712, + "step": 13462, + "train/sim_loss": 0.053087472915649414 + }, + { + "epoch": 1.3310263001779712, + "step": 13462, + "train/total_loss": 0.13357529044151306 + }, + { + "entropy": 9.212769508361816, + "epoch": 1.3311251730274867, + "mean_token_accuracy": 0.871666669845581, + "num_tokens": 18219575.0, + "step": 13463, + "train/ce_loss": 3.5762306538345e-07 + }, + { + "epoch": 1.3311251730274867, + "step": 13463, + "train/sim_loss": 0.028629958629608154 + }, + { + "epoch": 1.3311251730274867, + "step": 13463, + "train/total_loss": 0.02862999401986599 + }, + { + "entropy": 9.081279754638672, + "epoch": 1.3312240458770022, + "mean_token_accuracy": 0.808917224407196, + "num_tokens": 18232782.0, + "step": 13464, + "train/ce_loss": 0.53873211145401 + }, + { + "epoch": 1.3312240458770022, + "step": 13464, + "train/sim_loss": 0.03347158432006836 + }, + { + "epoch": 1.3312240458770022, + "step": 13464, + "train/total_loss": 0.08734479546546936 + }, + { + "entropy": 9.592998504638672, + "epoch": 1.3313229187265176, + "mean_token_accuracy": 0.8360450267791748, + "num_tokens": 18249608.0, + "step": 13465, + "train/ce_loss": 0.7001285552978516 + }, + { + "epoch": 1.3313229187265176, + "step": 13465, + "train/sim_loss": 0.1456437110900879 + }, + { + "epoch": 1.3313229187265176, + "step": 13465, + "train/total_loss": 0.215656578540802 + }, + { + "entropy": 9.248577117919922, + "epoch": 1.3314217915760334, + "mean_token_accuracy": 0.8429487347602844, + "num_tokens": 18265644.0, + "step": 13466, + "train/ce_loss": 0.48515722155570984 + }, + { + "epoch": 1.3314217915760334, + "step": 13466, + "train/sim_loss": 0.04316842555999756 + }, + { + "epoch": 1.3314217915760334, + "step": 13466, + "train/total_loss": 0.09168414771556854 + }, + { + "entropy": 9.647929191589355, + "epoch": 1.3315206644255486, + "mean_token_accuracy": 0.8045822381973267, + "num_tokens": 18275042.0, + "step": 13467, + "train/ce_loss": 0.526470422744751 + }, + { + "epoch": 1.3315206644255486, + "step": 13467, + "train/sim_loss": 0.03321760892868042 + }, + { + "epoch": 1.3315206644255486, + "step": 13467, + "train/total_loss": 0.08586464822292328 + }, + { + "entropy": 9.525763511657715, + "epoch": 1.3316195372750643, + "mean_token_accuracy": 0.8503703474998474, + "num_tokens": 18285066.0, + "step": 13468, + "train/ce_loss": 0.5453870296478271 + }, + { + "epoch": 1.3316195372750643, + "step": 13468, + "train/sim_loss": 0.015427768230438232 + }, + { + "epoch": 1.3316195372750643, + "step": 13468, + "train/total_loss": 0.06996647268533707 + }, + { + "entropy": 9.38901138305664, + "epoch": 1.3317184101245798, + "mean_token_accuracy": 0.8479912877082825, + "num_tokens": 18295725.0, + "step": 13469, + "train/ce_loss": 0.425552636384964 + }, + { + "epoch": 1.3317184101245798, + "step": 13469, + "train/sim_loss": 0.03697967529296875 + }, + { + "epoch": 1.3317184101245798, + "step": 13469, + "train/total_loss": 0.07953494042158127 + }, + { + "entropy": 9.42606258392334, + "epoch": 1.3318172829740953, + "mean_token_accuracy": 0.8392018675804138, + "num_tokens": 18304996.0, + "step": 13470, + "train/ce_loss": 0.8494725227355957 + }, + { + "epoch": 1.3318172829740953, + "step": 13470, + "train/sim_loss": 0.09325259923934937 + }, + { + "epoch": 1.3318172829740953, + "step": 13470, + "train/total_loss": 0.1781998574733734 + }, + { + "entropy": 9.390159606933594, + "epoch": 1.3319161558236108, + "mean_token_accuracy": 0.8665767908096313, + "num_tokens": 18314179.0, + "step": 13471, + "train/ce_loss": 0.46162620186805725 + }, + { + "epoch": 1.3319161558236108, + "step": 13471, + "train/sim_loss": 0.012387752532958984 + }, + { + "epoch": 1.3319161558236108, + "step": 13471, + "train/total_loss": 0.05855037271976471 + }, + { + "entropy": 9.26565170288086, + "epoch": 1.3320150286731263, + "mean_token_accuracy": 0.8246575593948364, + "num_tokens": 18326557.0, + "step": 13472, + "train/ce_loss": 0.5305537581443787 + }, + { + "epoch": 1.3320150286731263, + "step": 13472, + "train/sim_loss": 0.03049832582473755 + }, + { + "epoch": 1.3320150286731263, + "step": 13472, + "train/total_loss": 0.08355370163917542 + }, + { + "entropy": 8.936902046203613, + "epoch": 1.332113901522642, + "mean_token_accuracy": 0.826718270778656, + "num_tokens": 18339914.0, + "step": 13473, + "train/ce_loss": 0.26999616622924805 + }, + { + "epoch": 1.332113901522642, + "step": 13473, + "train/sim_loss": 0.04848754405975342 + }, + { + "epoch": 1.332113901522642, + "step": 13473, + "train/total_loss": 0.0754871591925621 + }, + { + "entropy": 9.87765121459961, + "epoch": 1.3322127743721575, + "mean_token_accuracy": 0.9317803382873535, + "num_tokens": 18354838.0, + "step": 13474, + "train/ce_loss": 0.32581907510757446 + }, + { + "epoch": 1.3322127743721575, + "step": 13474, + "train/sim_loss": 0.03650385141372681 + }, + { + "epoch": 1.3322127743721575, + "step": 13474, + "train/total_loss": 0.06908576190471649 + }, + { + "entropy": 10.164448738098145, + "epoch": 1.332311647221673, + "mean_token_accuracy": 0.932467520236969, + "num_tokens": 18370290.0, + "step": 13475, + "train/ce_loss": 0.30414265394210815 + }, + { + "epoch": 1.332311647221673, + "step": 13475, + "train/sim_loss": 0.030722439289093018 + }, + { + "epoch": 1.332311647221673, + "step": 13475, + "train/total_loss": 0.06113670766353607 + }, + { + "entropy": 9.46872329711914, + "epoch": 1.3324105200711884, + "mean_token_accuracy": 0.8046989440917969, + "num_tokens": 18383810.0, + "step": 13476, + "train/ce_loss": 0.5069012641906738 + }, + { + "epoch": 1.3324105200711884, + "step": 13476, + "train/sim_loss": 0.017332851886749268 + }, + { + "epoch": 1.3324105200711884, + "step": 13476, + "train/total_loss": 0.06802298128604889 + }, + { + "entropy": 9.818872451782227, + "epoch": 1.332509392920704, + "mean_token_accuracy": 0.8454258441925049, + "num_tokens": 18399299.0, + "step": 13477, + "train/ce_loss": 0.49848657846450806 + }, + { + "epoch": 1.332509392920704, + "step": 13477, + "train/sim_loss": 0.01293271780014038 + }, + { + "epoch": 1.332509392920704, + "step": 13477, + "train/total_loss": 0.06278137862682343 + }, + { + "entropy": 9.399162292480469, + "epoch": 1.3326082657702196, + "mean_token_accuracy": 0.823399543762207, + "num_tokens": 18413919.0, + "step": 13478, + "train/ce_loss": 0.6858054399490356 + }, + { + "epoch": 1.3326082657702196, + "step": 13478, + "train/sim_loss": 0.04874074459075928 + }, + { + "epoch": 1.3326082657702196, + "step": 13478, + "train/total_loss": 0.11732129007577896 + }, + { + "entropy": 9.23643684387207, + "epoch": 1.3327071386197349, + "mean_token_accuracy": 0.8774271607398987, + "num_tokens": 18421157.0, + "step": 13479, + "train/ce_loss": 0.18248297274112701 + }, + { + "epoch": 1.3327071386197349, + "step": 13479, + "train/sim_loss": 0.014130651950836182 + }, + { + "epoch": 1.3327071386197349, + "step": 13479, + "train/total_loss": 0.03237894922494888 + }, + { + "epoch": 1.3328060114692506, + "grad_norm": 0.4802892804145813, + "learning_rate": 6.669880828759334e-06, + "loss": 0.089, + "step": 13480 + }, + { + "entropy": 10.121118545532227, + "epoch": 1.3328060114692506, + "mean_token_accuracy": 0.870517909526825, + "num_tokens": 18433842.0, + "step": 13480, + "train/ce_loss": 2.887597929657204e-07 + }, + { + "epoch": 1.3328060114692506, + "step": 13480, + "train/sim_loss": 0.014039278030395508 + }, + { + "epoch": 1.3328060114692506, + "step": 13480, + "train/total_loss": 0.014039306901395321 + }, + { + "entropy": 9.221611022949219, + "epoch": 1.332904884318766, + "mean_token_accuracy": 0.863218367099762, + "num_tokens": 18448801.0, + "step": 13481, + "train/ce_loss": 0.5723136067390442 + }, + { + "epoch": 1.332904884318766, + "step": 13481, + "train/sim_loss": 0.013035893440246582 + }, + { + "epoch": 1.332904884318766, + "step": 13481, + "train/total_loss": 0.07026726007461548 + }, + { + "entropy": 9.335853576660156, + "epoch": 1.3330037571682816, + "mean_token_accuracy": 0.8921052813529968, + "num_tokens": 18459064.0, + "step": 13482, + "train/ce_loss": 0.6065882444381714 + }, + { + "epoch": 1.3330037571682816, + "step": 13482, + "train/sim_loss": 0.03913760185241699 + }, + { + "epoch": 1.3330037571682816, + "step": 13482, + "train/total_loss": 0.09979642927646637 + }, + { + "entropy": 9.58371639251709, + "epoch": 1.333102630017797, + "mean_token_accuracy": 0.8446043133735657, + "num_tokens": 18474160.0, + "step": 13483, + "train/ce_loss": 0.9524046182632446 + }, + { + "epoch": 1.333102630017797, + "step": 13483, + "train/sim_loss": 0.06479716300964355 + }, + { + "epoch": 1.333102630017797, + "step": 13483, + "train/total_loss": 0.16003763675689697 + }, + { + "entropy": 9.990737915039062, + "epoch": 1.3332015028673125, + "mean_token_accuracy": 0.9300912022590637, + "num_tokens": 18485546.0, + "step": 13484, + "train/ce_loss": 0.3979331851005554 + }, + { + "epoch": 1.3332015028673125, + "step": 13484, + "train/sim_loss": 0.06562626361846924 + }, + { + "epoch": 1.3332015028673125, + "step": 13484, + "train/total_loss": 0.1054195836186409 + }, + { + "entropy": 9.762046813964844, + "epoch": 1.3333003757168282, + "mean_token_accuracy": 0.8955512642860413, + "num_tokens": 18494953.0, + "step": 13485, + "train/ce_loss": 1.3049977951595793e-06 + }, + { + "epoch": 1.3333003757168282, + "step": 13485, + "train/sim_loss": 0.03309178352355957 + }, + { + "epoch": 1.3333003757168282, + "step": 13485, + "train/total_loss": 0.033091913908720016 + }, + { + "entropy": 9.647361755371094, + "epoch": 1.3333992485663437, + "mean_token_accuracy": 0.8736517429351807, + "num_tokens": 18503708.0, + "step": 13486, + "train/ce_loss": 0.4027664363384247 + }, + { + "epoch": 1.3333992485663437, + "step": 13486, + "train/sim_loss": 0.06507790088653564 + }, + { + "epoch": 1.3333992485663437, + "step": 13486, + "train/total_loss": 0.10535454750061035 + }, + { + "entropy": 9.673874855041504, + "epoch": 1.3334981214158592, + "mean_token_accuracy": 0.914185643196106, + "num_tokens": 18521040.0, + "step": 13487, + "train/ce_loss": 0.30635255575180054 + }, + { + "epoch": 1.3334981214158592, + "step": 13487, + "train/sim_loss": 0.017303824424743652 + }, + { + "epoch": 1.3334981214158592, + "step": 13487, + "train/total_loss": 0.047939080744981766 + }, + { + "entropy": 9.343833923339844, + "epoch": 1.3335969942653747, + "mean_token_accuracy": 0.8577347993850708, + "num_tokens": 18533906.0, + "step": 13488, + "train/ce_loss": 1.860975316958502e-06 + }, + { + "epoch": 1.3335969942653747, + "step": 13488, + "train/sim_loss": 0.03636246919631958 + }, + { + "epoch": 1.3335969942653747, + "step": 13488, + "train/total_loss": 0.0363626554608345 + }, + { + "entropy": 8.8909273147583, + "epoch": 1.3336958671148902, + "mean_token_accuracy": 0.8178368210792542, + "num_tokens": 18541923.0, + "step": 13489, + "train/ce_loss": 0.6487076282501221 + }, + { + "epoch": 1.3336958671148902, + "step": 13489, + "train/sim_loss": 0.0514412522315979 + }, + { + "epoch": 1.3336958671148902, + "step": 13489, + "train/total_loss": 0.11631201952695847 + }, + { + "entropy": 9.075348854064941, + "epoch": 1.333794739964406, + "mean_token_accuracy": 0.8560924530029297, + "num_tokens": 18552339.0, + "step": 13490, + "train/ce_loss": 0.3160974085330963 + }, + { + "epoch": 1.333794739964406, + "step": 13490, + "train/sim_loss": 0.04060566425323486 + }, + { + "epoch": 1.333794739964406, + "step": 13490, + "train/total_loss": 0.07221540808677673 + }, + { + "entropy": 9.531278610229492, + "epoch": 1.3338936128139212, + "mean_token_accuracy": 0.8612059354782104, + "num_tokens": 18566152.0, + "step": 13491, + "train/ce_loss": 0.29868870973587036 + }, + { + "epoch": 1.3338936128139212, + "step": 13491, + "train/sim_loss": 0.0586894154548645 + }, + { + "epoch": 1.3338936128139212, + "step": 13491, + "train/total_loss": 0.08855828642845154 + }, + { + "entropy": 9.513487815856934, + "epoch": 1.3339924856634369, + "mean_token_accuracy": 0.8806262016296387, + "num_tokens": 18574277.0, + "step": 13492, + "train/ce_loss": 1.6048676343416446e-06 + }, + { + "epoch": 1.3339924856634369, + "step": 13492, + "train/sim_loss": 0.02819955348968506 + }, + { + "epoch": 1.3339924856634369, + "step": 13492, + "train/total_loss": 0.028199713677167892 + }, + { + "entropy": 9.670633316040039, + "epoch": 1.3340913585129524, + "mean_token_accuracy": 0.8657143115997314, + "num_tokens": 18582361.0, + "step": 13493, + "train/ce_loss": 0.589466392993927 + }, + { + "epoch": 1.3340913585129524, + "step": 13493, + "train/sim_loss": 0.04310500621795654 + }, + { + "epoch": 1.3340913585129524, + "step": 13493, + "train/total_loss": 0.10205164551734924 + }, + { + "entropy": 9.42350959777832, + "epoch": 1.3341902313624678, + "mean_token_accuracy": 0.805330216884613, + "num_tokens": 18599147.0, + "step": 13494, + "train/ce_loss": 0.4057849049568176 + }, + { + "epoch": 1.3341902313624678, + "step": 13494, + "train/sim_loss": 0.019587457180023193 + }, + { + "epoch": 1.3341902313624678, + "step": 13494, + "train/total_loss": 0.060165949165821075 + }, + { + "entropy": 9.751670837402344, + "epoch": 1.3342891042119833, + "mean_token_accuracy": 0.9142857193946838, + "num_tokens": 18611414.0, + "step": 13495, + "train/ce_loss": 1.8851786762752454e-06 + }, + { + "epoch": 1.3342891042119833, + "step": 13495, + "train/sim_loss": 0.033787548542022705 + }, + { + "epoch": 1.3342891042119833, + "step": 13495, + "train/total_loss": 0.03378773853182793 + }, + { + "entropy": 9.631170272827148, + "epoch": 1.3343879770614988, + "mean_token_accuracy": 0.8918918967247009, + "num_tokens": 18629175.0, + "step": 13496, + "train/ce_loss": 0.11096319556236267 + }, + { + "epoch": 1.3343879770614988, + "step": 13496, + "train/sim_loss": 0.028615295886993408 + }, + { + "epoch": 1.3343879770614988, + "step": 13496, + "train/total_loss": 0.039711616933345795 + }, + { + "entropy": 9.805849075317383, + "epoch": 1.3344868499110145, + "mean_token_accuracy": 0.8671209812164307, + "num_tokens": 18640841.0, + "step": 13497, + "train/ce_loss": 0.24653537571430206 + }, + { + "epoch": 1.3344868499110145, + "step": 13497, + "train/sim_loss": 0.040515244007110596 + }, + { + "epoch": 1.3344868499110145, + "step": 13497, + "train/total_loss": 0.06516878306865692 + }, + { + "entropy": 9.267699241638184, + "epoch": 1.33458572276053, + "mean_token_accuracy": 0.9035087823867798, + "num_tokens": 18649310.0, + "step": 13498, + "train/ce_loss": 0.6242950558662415 + }, + { + "epoch": 1.33458572276053, + "step": 13498, + "train/sim_loss": 0.03385603427886963 + }, + { + "epoch": 1.33458572276053, + "step": 13498, + "train/total_loss": 0.09628553688526154 + }, + { + "entropy": 9.00866413116455, + "epoch": 1.3346845956100455, + "mean_token_accuracy": 0.8987603187561035, + "num_tokens": 18661036.0, + "step": 13499, + "train/ce_loss": 0.5310581922531128 + }, + { + "epoch": 1.3346845956100455, + "step": 13499, + "train/sim_loss": 0.014681220054626465 + }, + { + "epoch": 1.3346845956100455, + "step": 13499, + "train/total_loss": 0.0677870362997055 + }, + { + "epoch": 1.334783468459561, + "grad_norm": 0.42444729804992676, + "learning_rate": 6.664935964001386e-06, + "loss": 0.0743, + "step": 13500 + }, + { + "entropy": 9.636869430541992, + "epoch": 1.334783468459561, + "mean_token_accuracy": 0.8103448152542114, + "num_tokens": 18669425.0, + "step": 13500, + "train/ce_loss": 0.8090416789054871 + }, + { + "epoch": 1.334783468459561, + "step": 13500, + "train/sim_loss": 0.06575942039489746 + }, + { + "epoch": 1.334783468459561, + "step": 13500, + "train/total_loss": 0.1466635912656784 + }, + { + "entropy": 9.920717239379883, + "epoch": 1.3348823413090765, + "mean_token_accuracy": 0.9201030731201172, + "num_tokens": 18683848.0, + "step": 13501, + "train/ce_loss": 1.9575891201384366e-06 + }, + { + "epoch": 1.3348823413090765, + "step": 13501, + "train/sim_loss": 0.028731107711791992 + }, + { + "epoch": 1.3348823413090765, + "step": 13501, + "train/total_loss": 0.02873130328953266 + }, + { + "entropy": 9.67567253112793, + "epoch": 1.3349812141585922, + "mean_token_accuracy": 0.8305882215499878, + "num_tokens": 18698277.0, + "step": 13502, + "train/ce_loss": 0.46411484479904175 + }, + { + "epoch": 1.3349812141585922, + "step": 13502, + "train/sim_loss": 0.05380791425704956 + }, + { + "epoch": 1.3349812141585922, + "step": 13502, + "train/total_loss": 0.10021939873695374 + }, + { + "entropy": 9.600638389587402, + "epoch": 1.3350800870081077, + "mean_token_accuracy": 0.8303571343421936, + "num_tokens": 18714792.0, + "step": 13503, + "train/ce_loss": 0.36519819498062134 + }, + { + "epoch": 1.3350800870081077, + "step": 13503, + "train/sim_loss": 0.015095531940460205 + }, + { + "epoch": 1.3350800870081077, + "step": 13503, + "train/total_loss": 0.05161535367369652 + }, + { + "entropy": 9.768040657043457, + "epoch": 1.3351789598576231, + "mean_token_accuracy": 0.8810096383094788, + "num_tokens": 18732174.0, + "step": 13504, + "train/ce_loss": 0.11829567700624466 + }, + { + "epoch": 1.3351789598576231, + "step": 13504, + "train/sim_loss": 0.08843076229095459 + }, + { + "epoch": 1.3351789598576231, + "step": 13504, + "train/total_loss": 0.10026033222675323 + }, + { + "entropy": 9.818610191345215, + "epoch": 1.3352778327071386, + "mean_token_accuracy": 0.789825975894928, + "num_tokens": 18743513.0, + "step": 13505, + "train/ce_loss": 0.5940858125686646 + }, + { + "epoch": 1.3352778327071386, + "step": 13505, + "train/sim_loss": 0.10785812139511108 + }, + { + "epoch": 1.3352778327071386, + "step": 13505, + "train/total_loss": 0.16726669669151306 + }, + { + "entropy": 10.026337623596191, + "epoch": 1.335376705556654, + "mean_token_accuracy": 0.9222221970558167, + "num_tokens": 18753454.0, + "step": 13506, + "train/ce_loss": 3.1176148240774637e-06 + }, + { + "epoch": 1.335376705556654, + "step": 13506, + "train/sim_loss": 0.035172224044799805 + }, + { + "epoch": 1.335376705556654, + "step": 13506, + "train/total_loss": 0.035172536969184875 + }, + { + "entropy": 9.495004653930664, + "epoch": 1.3354755784061696, + "mean_token_accuracy": 0.9341772198677063, + "num_tokens": 18766034.0, + "step": 13507, + "train/ce_loss": 0.6117904186248779 + }, + { + "epoch": 1.3354755784061696, + "step": 13507, + "train/sim_loss": 0.04343152046203613 + }, + { + "epoch": 1.3354755784061696, + "step": 13507, + "train/total_loss": 0.10461056232452393 + }, + { + "entropy": 9.418638229370117, + "epoch": 1.335574451255685, + "mean_token_accuracy": 0.8789592981338501, + "num_tokens": 18782546.0, + "step": 13508, + "train/ce_loss": 0.2836463451385498 + }, + { + "epoch": 1.335574451255685, + "step": 13508, + "train/sim_loss": 0.035382091999053955 + }, + { + "epoch": 1.335574451255685, + "step": 13508, + "train/total_loss": 0.06374672800302505 + }, + { + "entropy": 9.454000473022461, + "epoch": 1.3356733241052008, + "mean_token_accuracy": 0.8163716793060303, + "num_tokens": 18794556.0, + "step": 13509, + "train/ce_loss": 0.4932464361190796 + }, + { + "epoch": 1.3356733241052008, + "step": 13509, + "train/sim_loss": 0.034016430377960205 + }, + { + "epoch": 1.3356733241052008, + "step": 13509, + "train/total_loss": 0.0833410769701004 + }, + { + "entropy": 9.732643127441406, + "epoch": 1.3357721969547163, + "mean_token_accuracy": 0.8535109162330627, + "num_tokens": 18809385.0, + "step": 13510, + "train/ce_loss": 0.8326937556266785 + }, + { + "epoch": 1.3357721969547163, + "step": 13510, + "train/sim_loss": 0.059931933879852295 + }, + { + "epoch": 1.3357721969547163, + "step": 13510, + "train/total_loss": 0.1432013213634491 + }, + { + "entropy": 9.373628616333008, + "epoch": 1.3358710698042318, + "mean_token_accuracy": 0.8824289441108704, + "num_tokens": 18816859.0, + "step": 13511, + "train/ce_loss": 0.6594961881637573 + }, + { + "epoch": 1.3358710698042318, + "step": 13511, + "train/sim_loss": 0.09601253271102905 + }, + { + "epoch": 1.3358710698042318, + "step": 13511, + "train/total_loss": 0.16196215152740479 + }, + { + "entropy": 9.906652450561523, + "epoch": 1.3359699426537472, + "mean_token_accuracy": 0.8821954727172852, + "num_tokens": 18836100.0, + "step": 13512, + "train/ce_loss": 0.2891111671924591 + }, + { + "epoch": 1.3359699426537472, + "step": 13512, + "train/sim_loss": 0.019168734550476074 + }, + { + "epoch": 1.3359699426537472, + "step": 13512, + "train/total_loss": 0.048079852014780045 + }, + { + "entropy": 9.49520492553711, + "epoch": 1.3360688155032627, + "mean_token_accuracy": 0.8362069129943848, + "num_tokens": 18849442.0, + "step": 13513, + "train/ce_loss": 0.18281249701976776 + }, + { + "epoch": 1.3360688155032627, + "step": 13513, + "train/sim_loss": 0.05869072675704956 + }, + { + "epoch": 1.3360688155032627, + "step": 13513, + "train/total_loss": 0.07697197794914246 + }, + { + "entropy": 9.54029655456543, + "epoch": 1.3361676883527784, + "mean_token_accuracy": 0.8785489201545715, + "num_tokens": 18856875.0, + "step": 13514, + "train/ce_loss": 1.342397467851697e-06 + }, + { + "epoch": 1.3361676883527784, + "step": 13514, + "train/sim_loss": 0.033305585384368896 + }, + { + "epoch": 1.3361676883527784, + "step": 13514, + "train/total_loss": 0.03330571949481964 + }, + { + "entropy": 10.38245677947998, + "epoch": 1.336266561202294, + "mean_token_accuracy": 0.875, + "num_tokens": 18865686.0, + "step": 13515, + "train/ce_loss": 0.5615636110305786 + }, + { + "epoch": 1.336266561202294, + "step": 13515, + "train/sim_loss": 0.06519949436187744 + }, + { + "epoch": 1.336266561202294, + "step": 13515, + "train/total_loss": 0.12135586142539978 + }, + { + "entropy": 9.723403930664062, + "epoch": 1.3363654340518094, + "mean_token_accuracy": 0.8655569553375244, + "num_tokens": 18882800.0, + "step": 13516, + "train/ce_loss": 0.6725497245788574 + }, + { + "epoch": 1.3363654340518094, + "step": 13516, + "train/sim_loss": 0.05205738544464111 + }, + { + "epoch": 1.3363654340518094, + "step": 13516, + "train/total_loss": 0.1193123608827591 + }, + { + "entropy": 9.699748039245605, + "epoch": 1.336464306901325, + "mean_token_accuracy": 0.8222222328186035, + "num_tokens": 18896240.0, + "step": 13517, + "train/ce_loss": 0.3377651274204254 + }, + { + "epoch": 1.336464306901325, + "step": 13517, + "train/sim_loss": 0.04856395721435547 + }, + { + "epoch": 1.336464306901325, + "step": 13517, + "train/total_loss": 0.08234047144651413 + }, + { + "entropy": 9.828364372253418, + "epoch": 1.3365631797508404, + "mean_token_accuracy": 0.7888402342796326, + "num_tokens": 18914756.0, + "step": 13518, + "train/ce_loss": 0.6367921233177185 + }, + { + "epoch": 1.3365631797508404, + "step": 13518, + "train/sim_loss": 0.07141423225402832 + }, + { + "epoch": 1.3365631797508404, + "step": 13518, + "train/total_loss": 0.13509345054626465 + }, + { + "entropy": 9.785238265991211, + "epoch": 1.3366620526003559, + "mean_token_accuracy": 0.9086115956306458, + "num_tokens": 18931936.0, + "step": 13519, + "train/ce_loss": 0.3511945605278015 + }, + { + "epoch": 1.3366620526003559, + "step": 13519, + "train/sim_loss": 0.048592567443847656 + }, + { + "epoch": 1.3366620526003559, + "step": 13519, + "train/total_loss": 0.08371202647686005 + }, + { + "epoch": 1.3367609254498714, + "grad_norm": 0.5489282608032227, + "learning_rate": 6.659991099243436e-06, + "loss": 0.0848, + "step": 13520 + }, + { + "entropy": 9.904827117919922, + "epoch": 1.3367609254498714, + "mean_token_accuracy": 0.8709121942520142, + "num_tokens": 18948496.0, + "step": 13520, + "train/ce_loss": 0.6965283751487732 + }, + { + "epoch": 1.3367609254498714, + "step": 13520, + "train/sim_loss": 0.04053914546966553 + }, + { + "epoch": 1.3367609254498714, + "step": 13520, + "train/total_loss": 0.11019198596477509 + }, + { + "entropy": 9.41297721862793, + "epoch": 1.336859798299387, + "mean_token_accuracy": 0.8552631735801697, + "num_tokens": 18960041.0, + "step": 13521, + "train/ce_loss": 0.39000624418258667 + }, + { + "epoch": 1.336859798299387, + "step": 13521, + "train/sim_loss": 0.07366037368774414 + }, + { + "epoch": 1.336859798299387, + "step": 13521, + "train/total_loss": 0.11266100406646729 + }, + { + "entropy": 9.087406158447266, + "epoch": 1.3369586711489025, + "mean_token_accuracy": 0.838443398475647, + "num_tokens": 18967219.0, + "step": 13522, + "train/ce_loss": 0.40639203786849976 + }, + { + "epoch": 1.3369586711489025, + "step": 13522, + "train/sim_loss": 0.042794227600097656 + }, + { + "epoch": 1.3369586711489025, + "step": 13522, + "train/total_loss": 0.08343343436717987 + }, + { + "entropy": 9.638022422790527, + "epoch": 1.337057543998418, + "mean_token_accuracy": 0.8327272534370422, + "num_tokens": 18981926.0, + "step": 13523, + "train/ce_loss": 0.7521167993545532 + }, + { + "epoch": 1.337057543998418, + "step": 13523, + "train/sim_loss": 0.05374997854232788 + }, + { + "epoch": 1.337057543998418, + "step": 13523, + "train/total_loss": 0.12896165251731873 + }, + { + "entropy": 9.41408920288086, + "epoch": 1.3371564168479335, + "mean_token_accuracy": 0.8241379261016846, + "num_tokens": 18993127.0, + "step": 13524, + "train/ce_loss": 0.25118568539619446 + }, + { + "epoch": 1.3371564168479335, + "step": 13524, + "train/sim_loss": 0.061635732650756836 + }, + { + "epoch": 1.3371564168479335, + "step": 13524, + "train/total_loss": 0.08675429970026016 + }, + { + "entropy": 9.391714096069336, + "epoch": 1.337255289697449, + "mean_token_accuracy": 0.8257575631141663, + "num_tokens": 19002906.0, + "step": 13525, + "train/ce_loss": 0.8358643651008606 + }, + { + "epoch": 1.337255289697449, + "step": 13525, + "train/sim_loss": 0.11333250999450684 + }, + { + "epoch": 1.337255289697449, + "step": 13525, + "train/total_loss": 0.19691894948482513 + }, + { + "entropy": 9.749296188354492, + "epoch": 1.3373541625469647, + "mean_token_accuracy": 0.9164133667945862, + "num_tokens": 19015306.0, + "step": 13526, + "train/ce_loss": 0.08244309574365616 + }, + { + "epoch": 1.3373541625469647, + "step": 13526, + "train/sim_loss": 0.05138289928436279 + }, + { + "epoch": 1.3373541625469647, + "step": 13526, + "train/total_loss": 0.05962720885872841 + }, + { + "entropy": 9.319541931152344, + "epoch": 1.3374530353964802, + "mean_token_accuracy": 0.8327832818031311, + "num_tokens": 19028258.0, + "step": 13527, + "train/ce_loss": 0.40757375955581665 + }, + { + "epoch": 1.3374530353964802, + "step": 13527, + "train/sim_loss": 0.01650071144104004 + }, + { + "epoch": 1.3374530353964802, + "step": 13527, + "train/total_loss": 0.057258088141679764 + }, + { + "entropy": 9.26878833770752, + "epoch": 1.3375519082459957, + "mean_token_accuracy": 0.8251833915710449, + "num_tokens": 19037901.0, + "step": 13528, + "train/ce_loss": 0.5413814187049866 + }, + { + "epoch": 1.3375519082459957, + "step": 13528, + "train/sim_loss": 0.055571913719177246 + }, + { + "epoch": 1.3375519082459957, + "step": 13528, + "train/total_loss": 0.10971005260944366 + }, + { + "entropy": 9.091133117675781, + "epoch": 1.3376507810955112, + "mean_token_accuracy": 0.8182701468467712, + "num_tokens": 19047855.0, + "step": 13529, + "train/ce_loss": 0.21083186566829681 + }, + { + "epoch": 1.3376507810955112, + "step": 13529, + "train/sim_loss": 0.04896259307861328 + }, + { + "epoch": 1.3376507810955112, + "step": 13529, + "train/total_loss": 0.07004578411579132 + }, + { + "entropy": 9.745767593383789, + "epoch": 1.3377496539450267, + "mean_token_accuracy": 0.8041504621505737, + "num_tokens": 19058427.0, + "step": 13530, + "train/ce_loss": 0.8275084495544434 + }, + { + "epoch": 1.3377496539450267, + "step": 13530, + "train/sim_loss": 0.07096338272094727 + }, + { + "epoch": 1.3377496539450267, + "step": 13530, + "train/total_loss": 0.15371423959732056 + }, + { + "entropy": 9.226648330688477, + "epoch": 1.3378485267945421, + "mean_token_accuracy": 0.8681876063346863, + "num_tokens": 19064843.0, + "step": 13531, + "train/ce_loss": 0.23098430037498474 + }, + { + "epoch": 1.3378485267945421, + "step": 13531, + "train/sim_loss": 0.01638782024383545 + }, + { + "epoch": 1.3378485267945421, + "step": 13531, + "train/total_loss": 0.03948625177145004 + }, + { + "entropy": 9.401315689086914, + "epoch": 1.3379473996440576, + "mean_token_accuracy": 0.8643649816513062, + "num_tokens": 19073414.0, + "step": 13532, + "train/ce_loss": 0.2602488398551941 + }, + { + "epoch": 1.3379473996440576, + "step": 13532, + "train/sim_loss": 0.0186728835105896 + }, + { + "epoch": 1.3379473996440576, + "step": 13532, + "train/total_loss": 0.04469776898622513 + }, + { + "entropy": 9.406947135925293, + "epoch": 1.3380462724935733, + "mean_token_accuracy": 0.8776077628135681, + "num_tokens": 19087264.0, + "step": 13533, + "train/ce_loss": 0.2574727237224579 + }, + { + "epoch": 1.3380462724935733, + "step": 13533, + "train/sim_loss": 0.030915141105651855 + }, + { + "epoch": 1.3380462724935733, + "step": 13533, + "train/total_loss": 0.056662414222955704 + }, + { + "entropy": 9.14692497253418, + "epoch": 1.3381451453430888, + "mean_token_accuracy": 0.8478027582168579, + "num_tokens": 19096268.0, + "step": 13534, + "train/ce_loss": 0.2166273593902588 + }, + { + "epoch": 1.3381451453430888, + "step": 13534, + "train/sim_loss": 0.051958680152893066 + }, + { + "epoch": 1.3381451453430888, + "step": 13534, + "train/total_loss": 0.07362141460180283 + }, + { + "entropy": 9.771675109863281, + "epoch": 1.3382440181926043, + "mean_token_accuracy": 0.8660998940467834, + "num_tokens": 19111146.0, + "step": 13535, + "train/ce_loss": 0.45143210887908936 + }, + { + "epoch": 1.3382440181926043, + "step": 13535, + "train/sim_loss": 0.06527233123779297 + }, + { + "epoch": 1.3382440181926043, + "step": 13535, + "train/total_loss": 0.11041554808616638 + }, + { + "entropy": 9.645147323608398, + "epoch": 1.3383428910421198, + "mean_token_accuracy": 0.8206018805503845, + "num_tokens": 19127344.0, + "step": 13536, + "train/ce_loss": 0.657215416431427 + }, + { + "epoch": 1.3383428910421198, + "step": 13536, + "train/sim_loss": 0.023496389389038086 + }, + { + "epoch": 1.3383428910421198, + "step": 13536, + "train/total_loss": 0.08921793103218079 + }, + { + "entropy": 10.037797927856445, + "epoch": 1.3384417638916353, + "mean_token_accuracy": 0.8133116960525513, + "num_tokens": 19141766.0, + "step": 13537, + "train/ce_loss": 0.5891017317771912 + }, + { + "epoch": 1.3384417638916353, + "step": 13537, + "train/sim_loss": 0.05746269226074219 + }, + { + "epoch": 1.3384417638916353, + "step": 13537, + "train/total_loss": 0.11637286841869354 + }, + { + "entropy": 9.472969055175781, + "epoch": 1.338540636741151, + "mean_token_accuracy": 0.8759036064147949, + "num_tokens": 19151986.0, + "step": 13538, + "train/ce_loss": 0.1475655734539032 + }, + { + "epoch": 1.338540636741151, + "step": 13538, + "train/sim_loss": 0.0776669979095459 + }, + { + "epoch": 1.338540636741151, + "step": 13538, + "train/total_loss": 0.09242355823516846 + }, + { + "entropy": 9.897106170654297, + "epoch": 1.3386395095906665, + "mean_token_accuracy": 0.8754152655601501, + "num_tokens": 19166259.0, + "step": 13539, + "train/ce_loss": 0.18878290057182312 + }, + { + "epoch": 1.3386395095906665, + "step": 13539, + "train/sim_loss": 0.05571359395980835 + }, + { + "epoch": 1.3386395095906665, + "step": 13539, + "train/total_loss": 0.07459188252687454 + }, + { + "epoch": 1.338738382440182, + "grad_norm": 0.5886859893798828, + "learning_rate": 6.6550462344854875e-06, + "loss": 0.0927, + "step": 13540 + }, + { + "entropy": 9.833443641662598, + "epoch": 1.338738382440182, + "mean_token_accuracy": 0.869415819644928, + "num_tokens": 19181486.0, + "step": 13540, + "train/ce_loss": 0.2144763022661209 + }, + { + "epoch": 1.338738382440182, + "step": 13540, + "train/sim_loss": 0.03117382526397705 + }, + { + "epoch": 1.338738382440182, + "step": 13540, + "train/total_loss": 0.05262145400047302 + }, + { + "entropy": 9.807891845703125, + "epoch": 1.3388372552896974, + "mean_token_accuracy": 0.8933143615722656, + "num_tokens": 19201542.0, + "step": 13541, + "train/ce_loss": 0.25754234194755554 + }, + { + "epoch": 1.3388372552896974, + "step": 13541, + "train/sim_loss": 0.02144235372543335 + }, + { + "epoch": 1.3388372552896974, + "step": 13541, + "train/total_loss": 0.04719658941030502 + }, + { + "entropy": 9.572542190551758, + "epoch": 1.338936128139213, + "mean_token_accuracy": 0.8642951250076294, + "num_tokens": 19214003.0, + "step": 13542, + "train/ce_loss": 0.35857293009757996 + }, + { + "epoch": 1.338936128139213, + "step": 13542, + "train/sim_loss": 0.049110352993011475 + }, + { + "epoch": 1.338936128139213, + "step": 13542, + "train/total_loss": 0.08496764302253723 + }, + { + "entropy": 9.857975959777832, + "epoch": 1.3390350009887286, + "mean_token_accuracy": 0.8365921974182129, + "num_tokens": 19233158.0, + "step": 13543, + "train/ce_loss": 0.40653562545776367 + }, + { + "epoch": 1.3390350009887286, + "step": 13543, + "train/sim_loss": 0.026249349117279053 + }, + { + "epoch": 1.3390350009887286, + "step": 13543, + "train/total_loss": 0.06690291315317154 + }, + { + "entropy": 9.08765983581543, + "epoch": 1.339133873838244, + "mean_token_accuracy": 0.8552941083908081, + "num_tokens": 19241865.0, + "step": 13544, + "train/ce_loss": 0.29469600319862366 + }, + { + "epoch": 1.339133873838244, + "step": 13544, + "train/sim_loss": 0.021645426750183105 + }, + { + "epoch": 1.339133873838244, + "step": 13544, + "train/total_loss": 0.05111502856016159 + }, + { + "entropy": 9.660457611083984, + "epoch": 1.3392327466877596, + "mean_token_accuracy": 0.8072487711906433, + "num_tokens": 19253561.0, + "step": 13545, + "train/ce_loss": 6.684928166578175e-07 + }, + { + "epoch": 1.3392327466877596, + "step": 13545, + "train/sim_loss": 0.028104424476623535 + }, + { + "epoch": 1.3392327466877596, + "step": 13545, + "train/total_loss": 0.028104491531848907 + }, + { + "entropy": 9.855910301208496, + "epoch": 1.339331619537275, + "mean_token_accuracy": 0.8328690528869629, + "num_tokens": 19272490.0, + "step": 13546, + "train/ce_loss": 0.4258272349834442 + }, + { + "epoch": 1.339331619537275, + "step": 13546, + "train/sim_loss": 0.04787200689315796 + }, + { + "epoch": 1.339331619537275, + "step": 13546, + "train/total_loss": 0.09045472741127014 + }, + { + "entropy": 9.657157897949219, + "epoch": 1.3394304923867906, + "mean_token_accuracy": 0.8271308541297913, + "num_tokens": 19287716.0, + "step": 13547, + "train/ce_loss": 0.29674842953681946 + }, + { + "epoch": 1.3394304923867906, + "step": 13547, + "train/sim_loss": 0.0409398078918457 + }, + { + "epoch": 1.3394304923867906, + "step": 13547, + "train/total_loss": 0.07061465084552765 + }, + { + "entropy": 9.824300765991211, + "epoch": 1.339529365236306, + "mean_token_accuracy": 0.8191489577293396, + "num_tokens": 19303479.0, + "step": 13548, + "train/ce_loss": 0.6712304353713989 + }, + { + "epoch": 1.339529365236306, + "step": 13548, + "train/sim_loss": 0.10312390327453613 + }, + { + "epoch": 1.339529365236306, + "step": 13548, + "train/total_loss": 0.17024695873260498 + }, + { + "entropy": 9.354917526245117, + "epoch": 1.3396282380858215, + "mean_token_accuracy": 0.8436974883079529, + "num_tokens": 19310325.0, + "step": 13549, + "train/ce_loss": 2.037721060332842e-06 + }, + { + "epoch": 1.3396282380858215, + "step": 13549, + "train/sim_loss": 0.03398030996322632 + }, + { + "epoch": 1.3396282380858215, + "step": 13549, + "train/total_loss": 0.033980514854192734 + }, + { + "entropy": 9.398126602172852, + "epoch": 1.3397271109353373, + "mean_token_accuracy": 0.8647058606147766, + "num_tokens": 19316454.0, + "step": 13550, + "train/ce_loss": 0.5516201853752136 + }, + { + "epoch": 1.3397271109353373, + "step": 13550, + "train/sim_loss": 0.045667409896850586 + }, + { + "epoch": 1.3397271109353373, + "step": 13550, + "train/total_loss": 0.10082942992448807 + }, + { + "entropy": 9.662240982055664, + "epoch": 1.3398259837848527, + "mean_token_accuracy": 0.8893740773200989, + "num_tokens": 19326688.0, + "step": 13551, + "train/ce_loss": 0.6727936267852783 + }, + { + "epoch": 1.3398259837848527, + "step": 13551, + "train/sim_loss": 0.05254411697387695 + }, + { + "epoch": 1.3398259837848527, + "step": 13551, + "train/total_loss": 0.11982347816228867 + }, + { + "entropy": 9.347929000854492, + "epoch": 1.3399248566343682, + "mean_token_accuracy": 0.8233438730239868, + "num_tokens": 19339204.0, + "step": 13552, + "train/ce_loss": 0.34309664368629456 + }, + { + "epoch": 1.3399248566343682, + "step": 13552, + "train/sim_loss": 0.03176999092102051 + }, + { + "epoch": 1.3399248566343682, + "step": 13552, + "train/total_loss": 0.06607966125011444 + }, + { + "entropy": 9.953289985656738, + "epoch": 1.3400237294838837, + "mean_token_accuracy": 0.836569607257843, + "num_tokens": 19352675.0, + "step": 13553, + "train/ce_loss": 0.3186705410480499 + }, + { + "epoch": 1.3400237294838837, + "step": 13553, + "train/sim_loss": 0.018562674522399902 + }, + { + "epoch": 1.3400237294838837, + "step": 13553, + "train/total_loss": 0.050429727882146835 + }, + { + "entropy": 9.999734878540039, + "epoch": 1.3401226023333992, + "mean_token_accuracy": 0.8535432815551758, + "num_tokens": 19362840.0, + "step": 13554, + "train/ce_loss": 0.6450260877609253 + }, + { + "epoch": 1.3401226023333992, + "step": 13554, + "train/sim_loss": 0.03486478328704834 + }, + { + "epoch": 1.3401226023333992, + "step": 13554, + "train/total_loss": 0.09936739504337311 + }, + { + "entropy": 9.093164443969727, + "epoch": 1.340221475182915, + "mean_token_accuracy": 0.8682634830474854, + "num_tokens": 19371446.0, + "step": 13555, + "train/ce_loss": 0.5733265280723572 + }, + { + "epoch": 1.340221475182915, + "step": 13555, + "train/sim_loss": 0.042746663093566895 + }, + { + "epoch": 1.340221475182915, + "step": 13555, + "train/total_loss": 0.10007931292057037 + }, + { + "entropy": 9.736268043518066, + "epoch": 1.3403203480324302, + "mean_token_accuracy": 0.858208954334259, + "num_tokens": 19385930.0, + "step": 13556, + "train/ce_loss": 0.3899311125278473 + }, + { + "epoch": 1.3403203480324302, + "step": 13556, + "train/sim_loss": 0.03881347179412842 + }, + { + "epoch": 1.3403203480324302, + "step": 13556, + "train/total_loss": 0.07780658453702927 + }, + { + "entropy": 9.450453758239746, + "epoch": 1.3404192208819459, + "mean_token_accuracy": 0.8491335511207581, + "num_tokens": 19397967.0, + "step": 13557, + "train/ce_loss": 0.3154570758342743 + }, + { + "epoch": 1.3404192208819459, + "step": 13557, + "train/sim_loss": 0.06584632396697998 + }, + { + "epoch": 1.3404192208819459, + "step": 13557, + "train/total_loss": 0.09739203751087189 + }, + { + "entropy": 9.386117935180664, + "epoch": 1.3405180937314614, + "mean_token_accuracy": 0.8270676732063293, + "num_tokens": 19412064.0, + "step": 13558, + "train/ce_loss": 0.6730096340179443 + }, + { + "epoch": 1.3405180937314614, + "step": 13558, + "train/sim_loss": 0.07894819974899292 + }, + { + "epoch": 1.3405180937314614, + "step": 13558, + "train/total_loss": 0.1462491750717163 + }, + { + "entropy": 9.715444564819336, + "epoch": 1.3406169665809768, + "mean_token_accuracy": 0.867012083530426, + "num_tokens": 19425815.0, + "step": 13559, + "train/ce_loss": 4.1506672232571873e-07 + }, + { + "epoch": 1.3406169665809768, + "step": 13559, + "train/sim_loss": 0.06383174657821655 + }, + { + "epoch": 1.3406169665809768, + "step": 13559, + "train/total_loss": 0.06383179128170013 + }, + { + "epoch": 1.3407158394304923, + "grad_norm": 0.6179267168045044, + "learning_rate": 6.650101369727539e-06, + "loss": 0.0863, + "step": 13560 + }, + { + "entropy": 9.615544319152832, + "epoch": 1.3407158394304923, + "mean_token_accuracy": 0.8928104639053345, + "num_tokens": 19441683.0, + "step": 13560, + "train/ce_loss": 0.4030003845691681 + }, + { + "epoch": 1.3407158394304923, + "step": 13560, + "train/sim_loss": 0.07347965240478516 + }, + { + "epoch": 1.3407158394304923, + "step": 13560, + "train/total_loss": 0.1137796938419342 + }, + { + "entropy": 9.9869384765625, + "epoch": 1.3408147122800078, + "mean_token_accuracy": 0.8688889145851135, + "num_tokens": 19458540.0, + "step": 13561, + "train/ce_loss": 4.068969019499491e-07 + }, + { + "epoch": 1.3408147122800078, + "step": 13561, + "train/sim_loss": 0.029190540313720703 + }, + { + "epoch": 1.3408147122800078, + "step": 13561, + "train/total_loss": 0.029190581291913986 + }, + { + "entropy": 9.973976135253906, + "epoch": 1.3409135851295235, + "mean_token_accuracy": 0.8272425532341003, + "num_tokens": 19471284.0, + "step": 13562, + "train/ce_loss": 0.45761728286743164 + }, + { + "epoch": 1.3409135851295235, + "step": 13562, + "train/sim_loss": 0.03156554698944092 + }, + { + "epoch": 1.3409135851295235, + "step": 13562, + "train/total_loss": 0.07732728123664856 + }, + { + "entropy": 9.350919723510742, + "epoch": 1.341012457979039, + "mean_token_accuracy": 0.8494381904602051, + "num_tokens": 19485704.0, + "step": 13563, + "train/ce_loss": 0.16522137820720673 + }, + { + "epoch": 1.341012457979039, + "step": 13563, + "train/sim_loss": 0.03376150131225586 + }, + { + "epoch": 1.341012457979039, + "step": 13563, + "train/total_loss": 0.05028364062309265 + }, + { + "entropy": 9.653467178344727, + "epoch": 1.3411113308285545, + "mean_token_accuracy": 0.8383961319923401, + "num_tokens": 19500026.0, + "step": 13564, + "train/ce_loss": 0.7405121922492981 + }, + { + "epoch": 1.3411113308285545, + "step": 13564, + "train/sim_loss": 0.0767400860786438 + }, + { + "epoch": 1.3411113308285545, + "step": 13564, + "train/total_loss": 0.15079131722450256 + }, + { + "entropy": 9.292234420776367, + "epoch": 1.34121020367807, + "mean_token_accuracy": 0.8059867024421692, + "num_tokens": 19512673.0, + "step": 13565, + "train/ce_loss": 0.3923459053039551 + }, + { + "epoch": 1.34121020367807, + "step": 13565, + "train/sim_loss": 0.013261497020721436 + }, + { + "epoch": 1.34121020367807, + "step": 13565, + "train/total_loss": 0.052496086806058884 + }, + { + "entropy": 9.618090629577637, + "epoch": 1.3413090765275855, + "mean_token_accuracy": 0.8007335066795349, + "num_tokens": 19523296.0, + "step": 13566, + "train/ce_loss": 0.7371588945388794 + }, + { + "epoch": 1.3413090765275855, + "step": 13566, + "train/sim_loss": 0.047038912773132324 + }, + { + "epoch": 1.3413090765275855, + "step": 13566, + "train/total_loss": 0.12075480073690414 + }, + { + "entropy": 9.872977256774902, + "epoch": 1.3414079493771012, + "mean_token_accuracy": 0.791304349899292, + "num_tokens": 19535772.0, + "step": 13567, + "train/ce_loss": 2.2864254844989773e-07 + }, + { + "epoch": 1.3414079493771012, + "step": 13567, + "train/sim_loss": 0.016932368278503418 + }, + { + "epoch": 1.3414079493771012, + "step": 13567, + "train/total_loss": 0.01693239063024521 + }, + { + "entropy": 9.673417091369629, + "epoch": 1.3415068222266164, + "mean_token_accuracy": 0.8866396546363831, + "num_tokens": 19545222.0, + "step": 13568, + "train/ce_loss": 0.789874792098999 + }, + { + "epoch": 1.3415068222266164, + "step": 13568, + "train/sim_loss": 0.08060187101364136 + }, + { + "epoch": 1.3415068222266164, + "step": 13568, + "train/total_loss": 0.15958935022354126 + }, + { + "entropy": 9.042848587036133, + "epoch": 1.3416056950761321, + "mean_token_accuracy": 0.8273009061813354, + "num_tokens": 19556518.0, + "step": 13569, + "train/ce_loss": 0.4836054742336273 + }, + { + "epoch": 1.3416056950761321, + "step": 13569, + "train/sim_loss": 0.012441635131835938 + }, + { + "epoch": 1.3416056950761321, + "step": 13569, + "train/total_loss": 0.06080218404531479 + }, + { + "entropy": 9.647737503051758, + "epoch": 1.3417045679256476, + "mean_token_accuracy": 0.8462469577789307, + "num_tokens": 19570340.0, + "step": 13570, + "train/ce_loss": 0.3788299262523651 + }, + { + "epoch": 1.3417045679256476, + "step": 13570, + "train/sim_loss": 0.02360522747039795 + }, + { + "epoch": 1.3417045679256476, + "step": 13570, + "train/total_loss": 0.06148822233080864 + }, + { + "entropy": 9.409280776977539, + "epoch": 1.3418034407751631, + "mean_token_accuracy": 0.8925170302391052, + "num_tokens": 19582193.0, + "step": 13571, + "train/ce_loss": 0.22772449254989624 + }, + { + "epoch": 1.3418034407751631, + "step": 13571, + "train/sim_loss": 0.018004179000854492 + }, + { + "epoch": 1.3418034407751631, + "step": 13571, + "train/total_loss": 0.040776629000902176 + }, + { + "entropy": 9.328826904296875, + "epoch": 1.3419023136246786, + "mean_token_accuracy": 0.8728222846984863, + "num_tokens": 19594135.0, + "step": 13572, + "train/ce_loss": 0.515288770198822 + }, + { + "epoch": 1.3419023136246786, + "step": 13572, + "train/sim_loss": 0.01822561025619507 + }, + { + "epoch": 1.3419023136246786, + "step": 13572, + "train/total_loss": 0.06975448876619339 + }, + { + "entropy": 9.040600776672363, + "epoch": 1.342001186474194, + "mean_token_accuracy": 0.8202247023582458, + "num_tokens": 19611480.0, + "step": 13573, + "train/ce_loss": 0.7212572693824768 + }, + { + "epoch": 1.342001186474194, + "step": 13573, + "train/sim_loss": 0.1380366086959839 + }, + { + "epoch": 1.342001186474194, + "step": 13573, + "train/total_loss": 0.21016234159469604 + }, + { + "entropy": 9.851640701293945, + "epoch": 1.3421000593237098, + "mean_token_accuracy": 0.8745980858802795, + "num_tokens": 19629122.0, + "step": 13574, + "train/ce_loss": 0.7503463625907898 + }, + { + "epoch": 1.3421000593237098, + "step": 13574, + "train/sim_loss": 0.031088829040527344 + }, + { + "epoch": 1.3421000593237098, + "step": 13574, + "train/total_loss": 0.10612346976995468 + }, + { + "entropy": 9.703510284423828, + "epoch": 1.3421989321732253, + "mean_token_accuracy": 0.8968692421913147, + "num_tokens": 19637399.0, + "step": 13575, + "train/ce_loss": 0.48465511202812195 + }, + { + "epoch": 1.3421989321732253, + "step": 13575, + "train/sim_loss": 0.033691227436065674 + }, + { + "epoch": 1.3421989321732253, + "step": 13575, + "train/total_loss": 0.08215674012899399 + }, + { + "entropy": 9.563325881958008, + "epoch": 1.3422978050227408, + "mean_token_accuracy": 0.8985201120376587, + "num_tokens": 19648072.0, + "step": 13576, + "train/ce_loss": 0.6726912260055542 + }, + { + "epoch": 1.3422978050227408, + "step": 13576, + "train/sim_loss": 0.03433239459991455 + }, + { + "epoch": 1.3422978050227408, + "step": 13576, + "train/total_loss": 0.10160151869058609 + }, + { + "entropy": 9.122987747192383, + "epoch": 1.3423966778722562, + "mean_token_accuracy": 0.790996789932251, + "num_tokens": 19660564.0, + "step": 13577, + "train/ce_loss": 0.635726273059845 + }, + { + "epoch": 1.3423966778722562, + "step": 13577, + "train/sim_loss": 0.04226493835449219 + }, + { + "epoch": 1.3423966778722562, + "step": 13577, + "train/total_loss": 0.10583756864070892 + }, + { + "entropy": 10.047834396362305, + "epoch": 1.3424955507217717, + "mean_token_accuracy": 0.8961937427520752, + "num_tokens": 19681034.0, + "step": 13578, + "train/ce_loss": 0.41040951013565063 + }, + { + "epoch": 1.3424955507217717, + "step": 13578, + "train/sim_loss": 0.020171165466308594 + }, + { + "epoch": 1.3424955507217717, + "step": 13578, + "train/total_loss": 0.061212118715047836 + }, + { + "entropy": 9.785310745239258, + "epoch": 1.3425944235712874, + "mean_token_accuracy": 0.8171428442001343, + "num_tokens": 19689431.0, + "step": 13579, + "train/ce_loss": 0.46057045459747314 + }, + { + "epoch": 1.3425944235712874, + "step": 13579, + "train/sim_loss": 0.04997837543487549 + }, + { + "epoch": 1.3425944235712874, + "step": 13579, + "train/total_loss": 0.0960354208946228 + }, + { + "epoch": 1.342693296420803, + "grad_norm": 0.59897381067276, + "learning_rate": 6.64515650496959e-06, + "loss": 0.0881, + "step": 13580 + }, + { + "entropy": 9.51736831665039, + "epoch": 1.342693296420803, + "mean_token_accuracy": 0.8598425388336182, + "num_tokens": 19700533.0, + "step": 13580, + "train/ce_loss": 0.3612298369407654 + }, + { + "epoch": 1.342693296420803, + "step": 13580, + "train/sim_loss": 0.028240442276000977 + }, + { + "epoch": 1.342693296420803, + "step": 13580, + "train/total_loss": 0.06436342746019363 + }, + { + "entropy": 9.354118347167969, + "epoch": 1.3427921692703184, + "mean_token_accuracy": 0.8354576826095581, + "num_tokens": 19713656.0, + "step": 13581, + "train/ce_loss": 2.563735677085788e-07 + }, + { + "epoch": 1.3427921692703184, + "step": 13581, + "train/sim_loss": 0.03699994087219238 + }, + { + "epoch": 1.3427921692703184, + "step": 13581, + "train/total_loss": 0.03699996694922447 + }, + { + "entropy": 9.851869583129883, + "epoch": 1.342891042119834, + "mean_token_accuracy": 0.8285714387893677, + "num_tokens": 19729331.0, + "step": 13582, + "train/ce_loss": 1.3871601819992065 + }, + { + "epoch": 1.342891042119834, + "step": 13582, + "train/sim_loss": 0.07534223794937134 + }, + { + "epoch": 1.342891042119834, + "step": 13582, + "train/total_loss": 0.2140582650899887 + }, + { + "entropy": 9.662891387939453, + "epoch": 1.3429899149693494, + "mean_token_accuracy": 0.8817365169525146, + "num_tokens": 19748695.0, + "step": 13583, + "train/ce_loss": 0.3684845268726349 + }, + { + "epoch": 1.3429899149693494, + "step": 13583, + "train/sim_loss": 0.01833784580230713 + }, + { + "epoch": 1.3429899149693494, + "step": 13583, + "train/total_loss": 0.05518629774451256 + }, + { + "entropy": 9.616646766662598, + "epoch": 1.3430887878188649, + "mean_token_accuracy": 0.8583061695098877, + "num_tokens": 19762621.0, + "step": 13584, + "train/ce_loss": 0.38202133774757385 + }, + { + "epoch": 1.3430887878188649, + "step": 13584, + "train/sim_loss": 0.03622502088546753 + }, + { + "epoch": 1.3430887878188649, + "step": 13584, + "train/total_loss": 0.07442715764045715 + }, + { + "entropy": 9.424081802368164, + "epoch": 1.3431876606683804, + "mean_token_accuracy": 0.9105545878410339, + "num_tokens": 19781609.0, + "step": 13585, + "train/ce_loss": 0.34666457772254944 + }, + { + "epoch": 1.3431876606683804, + "step": 13585, + "train/sim_loss": 0.027629971504211426 + }, + { + "epoch": 1.3431876606683804, + "step": 13585, + "train/total_loss": 0.06229643151164055 + }, + { + "entropy": 9.223265647888184, + "epoch": 1.343286533517896, + "mean_token_accuracy": 0.8934240341186523, + "num_tokens": 19793957.0, + "step": 13586, + "train/ce_loss": 0.2509134113788605 + }, + { + "epoch": 1.343286533517896, + "step": 13586, + "train/sim_loss": 0.014609456062316895 + }, + { + "epoch": 1.343286533517896, + "step": 13586, + "train/total_loss": 0.03970079869031906 + }, + { + "entropy": 9.288301467895508, + "epoch": 1.3433854063674115, + "mean_token_accuracy": 0.8120218515396118, + "num_tokens": 19805111.0, + "step": 13587, + "train/ce_loss": 0.20293115079402924 + }, + { + "epoch": 1.3433854063674115, + "step": 13587, + "train/sim_loss": 0.058609724044799805 + }, + { + "epoch": 1.3433854063674115, + "step": 13587, + "train/total_loss": 0.07890284061431885 + }, + { + "entropy": 9.91805362701416, + "epoch": 1.343484279216927, + "mean_token_accuracy": 0.8583765029907227, + "num_tokens": 19817885.0, + "step": 13588, + "train/ce_loss": 5.781227514489728e-07 + }, + { + "epoch": 1.343484279216927, + "step": 13588, + "train/sim_loss": 0.016373872756958008 + }, + { + "epoch": 1.343484279216927, + "step": 13588, + "train/total_loss": 0.016373930498957634 + }, + { + "entropy": 9.168167114257812, + "epoch": 1.3435831520664425, + "mean_token_accuracy": 0.8522727489471436, + "num_tokens": 19826230.0, + "step": 13589, + "train/ce_loss": 5.289738851388393e-07 + }, + { + "epoch": 1.3435831520664425, + "step": 13589, + "train/sim_loss": 0.019660472869873047 + }, + { + "epoch": 1.3435831520664425, + "step": 13589, + "train/total_loss": 0.019660525023937225 + }, + { + "entropy": 9.519360542297363, + "epoch": 1.343682024915958, + "mean_token_accuracy": 0.8500604629516602, + "num_tokens": 19838897.0, + "step": 13590, + "train/ce_loss": 0.43069130182266235 + }, + { + "epoch": 1.343682024915958, + "step": 13590, + "train/sim_loss": 0.022626757621765137 + }, + { + "epoch": 1.343682024915958, + "step": 13590, + "train/total_loss": 0.06569588929414749 + }, + { + "entropy": 9.540843963623047, + "epoch": 1.3437808977654737, + "mean_token_accuracy": 0.8241881132125854, + "num_tokens": 19850848.0, + "step": 13591, + "train/ce_loss": 0.794585108757019 + }, + { + "epoch": 1.3437808977654737, + "step": 13591, + "train/sim_loss": 0.08952504396438599 + }, + { + "epoch": 1.3437808977654737, + "step": 13591, + "train/total_loss": 0.1689835488796234 + }, + { + "entropy": 9.974708557128906, + "epoch": 1.3438797706149892, + "mean_token_accuracy": 0.8402062058448792, + "num_tokens": 19863841.0, + "step": 13592, + "train/ce_loss": 0.3788131773471832 + }, + { + "epoch": 1.3438797706149892, + "step": 13592, + "train/sim_loss": 0.0358661413192749 + }, + { + "epoch": 1.3438797706149892, + "step": 13592, + "train/total_loss": 0.07374745607376099 + }, + { + "entropy": 9.840885162353516, + "epoch": 1.3439786434645047, + "mean_token_accuracy": 0.8660714030265808, + "num_tokens": 19882988.0, + "step": 13593, + "train/ce_loss": 4.108423183879495e-07 + }, + { + "epoch": 1.3439786434645047, + "step": 13593, + "train/sim_loss": 0.027041733264923096 + }, + { + "epoch": 1.3439786434645047, + "step": 13593, + "train/total_loss": 0.02704177424311638 + }, + { + "entropy": 9.367142677307129, + "epoch": 1.3440775163140202, + "mean_token_accuracy": 0.8329979777336121, + "num_tokens": 19895540.0, + "step": 13594, + "train/ce_loss": 0.6015684008598328 + }, + { + "epoch": 1.3440775163140202, + "step": 13594, + "train/sim_loss": 0.05323910713195801 + }, + { + "epoch": 1.3440775163140202, + "step": 13594, + "train/total_loss": 0.11339594423770905 + }, + { + "entropy": 10.025371551513672, + "epoch": 1.3441763891635357, + "mean_token_accuracy": 0.8899456262588501, + "num_tokens": 19908488.0, + "step": 13595, + "train/ce_loss": 0.5156491994857788 + }, + { + "epoch": 1.3441763891635357, + "step": 13595, + "train/sim_loss": 0.08424896001815796 + }, + { + "epoch": 1.3441763891635357, + "step": 13595, + "train/total_loss": 0.1358138769865036 + }, + { + "entropy": 9.182255744934082, + "epoch": 1.3442752620130511, + "mean_token_accuracy": 0.8697142601013184, + "num_tokens": 19924045.0, + "step": 13596, + "train/ce_loss": 0.25417837500572205 + }, + { + "epoch": 1.3442752620130511, + "step": 13596, + "train/sim_loss": 0.029248416423797607 + }, + { + "epoch": 1.3442752620130511, + "step": 13596, + "train/total_loss": 0.05466625466942787 + }, + { + "entropy": 10.283531188964844, + "epoch": 1.3443741348625666, + "mean_token_accuracy": 0.8611111044883728, + "num_tokens": 19933923.0, + "step": 13597, + "train/ce_loss": 4.635882930870139e-07 + }, + { + "epoch": 1.3443741348625666, + "step": 13597, + "train/sim_loss": 0.010008096694946289 + }, + { + "epoch": 1.3443741348625666, + "step": 13597, + "train/total_loss": 0.01000814326107502 + }, + { + "entropy": 9.804117202758789, + "epoch": 1.3444730077120823, + "mean_token_accuracy": 0.8625730872154236, + "num_tokens": 19943481.0, + "step": 13598, + "train/ce_loss": 0.8266734480857849 + }, + { + "epoch": 1.3444730077120823, + "step": 13598, + "train/sim_loss": 0.052347540855407715 + }, + { + "epoch": 1.3444730077120823, + "step": 13598, + "train/total_loss": 0.13501489162445068 + }, + { + "entropy": 9.632442474365234, + "epoch": 1.3445718805615978, + "mean_token_accuracy": 0.8757061958312988, + "num_tokens": 19951914.0, + "step": 13599, + "train/ce_loss": 0.2392933964729309 + }, + { + "epoch": 1.3445718805615978, + "step": 13599, + "train/sim_loss": 0.03530752658843994 + }, + { + "epoch": 1.3445718805615978, + "step": 13599, + "train/total_loss": 0.05923686921596527 + }, + { + "epoch": 1.3446707534111133, + "grad_norm": 0.6099292635917664, + "learning_rate": 6.640211640211642e-06, + "loss": 0.0788, + "step": 13600 + }, + { + "entropy": 9.116918563842773, + "epoch": 1.3446707534111133, + "mean_token_accuracy": 0.7920997738838196, + "num_tokens": 19966559.0, + "step": 13600, + "train/ce_loss": 0.6516214609146118 + }, + { + "epoch": 1.3446707534111133, + "step": 13600, + "train/sim_loss": 0.02228713035583496 + }, + { + "epoch": 1.3446707534111133, + "step": 13600, + "train/total_loss": 0.08744927495718002 + }, + { + "entropy": 9.732169151306152, + "epoch": 1.3447696262606288, + "mean_token_accuracy": 0.9019908308982849, + "num_tokens": 19987844.0, + "step": 13601, + "train/ce_loss": 3.395518319848634e-07 + }, + { + "epoch": 1.3447696262606288, + "step": 13601, + "train/sim_loss": 0.04507499933242798 + }, + { + "epoch": 1.3447696262606288, + "step": 13601, + "train/total_loss": 0.045075032860040665 + }, + { + "entropy": 9.636966705322266, + "epoch": 1.3448684991101443, + "mean_token_accuracy": 0.8079096078872681, + "num_tokens": 20001098.0, + "step": 13602, + "train/ce_loss": 0.23643890023231506 + }, + { + "epoch": 1.3448684991101443, + "step": 13602, + "train/sim_loss": 0.03898477554321289 + }, + { + "epoch": 1.3448684991101443, + "step": 13602, + "train/total_loss": 0.06262866407632828 + }, + { + "entropy": 9.269072532653809, + "epoch": 1.34496737195966, + "mean_token_accuracy": 0.8314465284347534, + "num_tokens": 20014458.0, + "step": 13603, + "train/ce_loss": 0.23603354394435883 + }, + { + "epoch": 1.34496737195966, + "step": 13603, + "train/sim_loss": 0.04462563991546631 + }, + { + "epoch": 1.34496737195966, + "step": 13603, + "train/total_loss": 0.06822899729013443 + }, + { + "entropy": 9.383710861206055, + "epoch": 1.3450662448091755, + "mean_token_accuracy": 0.8688311576843262, + "num_tokens": 20027765.0, + "step": 13604, + "train/ce_loss": 0.4489801526069641 + }, + { + "epoch": 1.3450662448091755, + "step": 13604, + "train/sim_loss": 0.03891271352767944 + }, + { + "epoch": 1.3450662448091755, + "step": 13604, + "train/total_loss": 0.0838107317686081 + }, + { + "entropy": 9.617308616638184, + "epoch": 1.345165117658691, + "mean_token_accuracy": 0.8843648433685303, + "num_tokens": 20044400.0, + "step": 13605, + "train/ce_loss": 0.34660425782203674 + }, + { + "epoch": 1.345165117658691, + "step": 13605, + "train/sim_loss": 0.032576799392700195 + }, + { + "epoch": 1.345165117658691, + "step": 13605, + "train/total_loss": 0.06723722815513611 + }, + { + "entropy": 9.302160263061523, + "epoch": 1.3452639905082064, + "mean_token_accuracy": 0.8597491383552551, + "num_tokens": 20056743.0, + "step": 13606, + "train/ce_loss": 0.24314725399017334 + }, + { + "epoch": 1.3452639905082064, + "step": 13606, + "train/sim_loss": 0.04395115375518799 + }, + { + "epoch": 1.3452639905082064, + "step": 13606, + "train/total_loss": 0.0682658776640892 + }, + { + "entropy": 8.66811752319336, + "epoch": 1.345362863357722, + "mean_token_accuracy": 0.8136882185935974, + "num_tokens": 20063061.0, + "step": 13607, + "train/ce_loss": 0.8484485745429993 + }, + { + "epoch": 1.345362863357722, + "step": 13607, + "train/sim_loss": 0.018839657306671143 + }, + { + "epoch": 1.345362863357722, + "step": 13607, + "train/total_loss": 0.10368451476097107 + }, + { + "entropy": 9.458782196044922, + "epoch": 1.3454617362072374, + "mean_token_accuracy": 0.8242343664169312, + "num_tokens": 20081002.0, + "step": 13608, + "train/ce_loss": 0.3745451271533966 + }, + { + "epoch": 1.3454617362072374, + "step": 13608, + "train/sim_loss": 0.025951802730560303 + }, + { + "epoch": 1.3454617362072374, + "step": 13608, + "train/total_loss": 0.0634063184261322 + }, + { + "entropy": 9.509448051452637, + "epoch": 1.345560609056753, + "mean_token_accuracy": 0.8662576675415039, + "num_tokens": 20094519.0, + "step": 13609, + "train/ce_loss": 0.20524606108665466 + }, + { + "epoch": 1.345560609056753, + "step": 13609, + "train/sim_loss": 0.030240178108215332 + }, + { + "epoch": 1.345560609056753, + "step": 13609, + "train/total_loss": 0.0507647842168808 + }, + { + "entropy": 9.626548767089844, + "epoch": 1.3456594819062686, + "mean_token_accuracy": 0.8792452812194824, + "num_tokens": 20102084.0, + "step": 13610, + "train/ce_loss": 0.48105230927467346 + }, + { + "epoch": 1.3456594819062686, + "step": 13610, + "train/sim_loss": 0.008506178855895996 + }, + { + "epoch": 1.3456594819062686, + "step": 13610, + "train/total_loss": 0.05661141127347946 + }, + { + "entropy": 9.25865650177002, + "epoch": 1.345758354755784, + "mean_token_accuracy": 0.8698554039001465, + "num_tokens": 20121098.0, + "step": 13611, + "train/ce_loss": 0.3869575560092926 + }, + { + "epoch": 1.345758354755784, + "step": 13611, + "train/sim_loss": 0.020962417125701904 + }, + { + "epoch": 1.345758354755784, + "step": 13611, + "train/total_loss": 0.059658173471689224 + }, + { + "entropy": 9.72959041595459, + "epoch": 1.3458572276052996, + "mean_token_accuracy": 0.8493150472640991, + "num_tokens": 20138347.0, + "step": 13612, + "train/ce_loss": 0.3165067136287689 + }, + { + "epoch": 1.3458572276052996, + "step": 13612, + "train/sim_loss": 0.07444977760314941 + }, + { + "epoch": 1.3458572276052996, + "step": 13612, + "train/total_loss": 0.10610045492649078 + }, + { + "entropy": 9.49731159210205, + "epoch": 1.345956100454815, + "mean_token_accuracy": 0.9328969120979309, + "num_tokens": 20153390.0, + "step": 13613, + "train/ce_loss": 4.3312525122018997e-07 + }, + { + "epoch": 1.345956100454815, + "step": 13613, + "train/sim_loss": 0.021566152572631836 + }, + { + "epoch": 1.345956100454815, + "step": 13613, + "train/total_loss": 0.021566195413470268 + }, + { + "entropy": 9.528189659118652, + "epoch": 1.3460549733043305, + "mean_token_accuracy": 0.8307952880859375, + "num_tokens": 20168369.0, + "step": 13614, + "train/ce_loss": 0.5909230709075928 + }, + { + "epoch": 1.3460549733043305, + "step": 13614, + "train/sim_loss": 0.03931379318237305 + }, + { + "epoch": 1.3460549733043305, + "step": 13614, + "train/total_loss": 0.0984061062335968 + }, + { + "entropy": 9.459526062011719, + "epoch": 1.3461538461538463, + "mean_token_accuracy": 0.8363874554634094, + "num_tokens": 20177946.0, + "step": 13615, + "train/ce_loss": 0.6375317573547363 + }, + { + "epoch": 1.3461538461538463, + "step": 13615, + "train/sim_loss": 0.04911285638809204 + }, + { + "epoch": 1.3461538461538463, + "step": 13615, + "train/total_loss": 0.11286603659391403 + }, + { + "entropy": 9.670404434204102, + "epoch": 1.3462527190033617, + "mean_token_accuracy": 0.8445322513580322, + "num_tokens": 20195162.0, + "step": 13616, + "train/ce_loss": 0.35727784037590027 + }, + { + "epoch": 1.3462527190033617, + "step": 13616, + "train/sim_loss": 0.023502230644226074 + }, + { + "epoch": 1.3462527190033617, + "step": 13616, + "train/total_loss": 0.0592300146818161 + }, + { + "entropy": 9.980717658996582, + "epoch": 1.3463515918528772, + "mean_token_accuracy": 0.8619676828384399, + "num_tokens": 20214692.0, + "step": 13617, + "train/ce_loss": 4.194108100818994e-07 + }, + { + "epoch": 1.3463515918528772, + "step": 13617, + "train/sim_loss": 0.06244248151779175 + }, + { + "epoch": 1.3463515918528772, + "step": 13617, + "train/total_loss": 0.06244252249598503 + }, + { + "entropy": 9.578248977661133, + "epoch": 1.3464504647023927, + "mean_token_accuracy": 0.83253014087677, + "num_tokens": 20228308.0, + "step": 13618, + "train/ce_loss": 0.8120304942131042 + }, + { + "epoch": 1.3464504647023927, + "step": 13618, + "train/sim_loss": 0.045076847076416016 + }, + { + "epoch": 1.3464504647023927, + "step": 13618, + "train/total_loss": 0.12627989053726196 + }, + { + "entropy": 9.37696647644043, + "epoch": 1.3465493375519082, + "mean_token_accuracy": 0.8727034330368042, + "num_tokens": 20242268.0, + "step": 13619, + "train/ce_loss": 0.32988667488098145 + }, + { + "epoch": 1.3465493375519082, + "step": 13619, + "train/sim_loss": 0.012613654136657715 + }, + { + "epoch": 1.3465493375519082, + "step": 13619, + "train/total_loss": 0.04560232162475586 + }, + { + "epoch": 1.346648210401424, + "grad_norm": 0.5265718698501587, + "learning_rate": 6.635266775453691e-06, + "loss": 0.0835, + "step": 13620 + }, + { + "entropy": 9.315948486328125, + "epoch": 1.346648210401424, + "mean_token_accuracy": 0.783643901348114, + "num_tokens": 20255239.0, + "step": 13620, + "train/ce_loss": 0.5185098648071289 + }, + { + "epoch": 1.346648210401424, + "step": 13620, + "train/sim_loss": 0.026797175407409668 + }, + { + "epoch": 1.346648210401424, + "step": 13620, + "train/total_loss": 0.0786481648683548 + }, + { + "entropy": 9.470430374145508, + "epoch": 1.3467470832509392, + "mean_token_accuracy": 0.8527004718780518, + "num_tokens": 20268970.0, + "step": 13621, + "train/ce_loss": 0.3200956881046295 + }, + { + "epoch": 1.3467470832509392, + "step": 13621, + "train/sim_loss": 0.07174944877624512 + }, + { + "epoch": 1.3467470832509392, + "step": 13621, + "train/total_loss": 0.10375902056694031 + }, + { + "entropy": 9.548873901367188, + "epoch": 1.3468459561004549, + "mean_token_accuracy": 0.8209959864616394, + "num_tokens": 20290135.0, + "step": 13622, + "train/ce_loss": 0.632245659828186 + }, + { + "epoch": 1.3468459561004549, + "step": 13622, + "train/sim_loss": 0.024839401245117188 + }, + { + "epoch": 1.3468459561004549, + "step": 13622, + "train/total_loss": 0.08806397020816803 + }, + { + "entropy": 9.696945190429688, + "epoch": 1.3469448289499704, + "mean_token_accuracy": 0.8534350991249084, + "num_tokens": 20305002.0, + "step": 13623, + "train/ce_loss": 0.552831768989563 + }, + { + "epoch": 1.3469448289499704, + "step": 13623, + "train/sim_loss": 0.05154407024383545 + }, + { + "epoch": 1.3469448289499704, + "step": 13623, + "train/total_loss": 0.10682724416255951 + }, + { + "entropy": 9.514297485351562, + "epoch": 1.3470437017994858, + "mean_token_accuracy": 0.8972895741462708, + "num_tokens": 20319600.0, + "step": 13624, + "train/ce_loss": 2.9793520184284716e-07 + }, + { + "epoch": 1.3470437017994858, + "step": 13624, + "train/sim_loss": 0.02889871597290039 + }, + { + "epoch": 1.3470437017994858, + "step": 13624, + "train/total_loss": 0.02889874577522278 + }, + { + "entropy": 8.909339904785156, + "epoch": 1.3471425746490013, + "mean_token_accuracy": 0.8767123222351074, + "num_tokens": 20327783.0, + "step": 13625, + "train/ce_loss": 0.47266384959220886 + }, + { + "epoch": 1.3471425746490013, + "step": 13625, + "train/sim_loss": 0.06042623519897461 + }, + { + "epoch": 1.3471425746490013, + "step": 13625, + "train/total_loss": 0.10769262164831161 + }, + { + "entropy": 9.452773094177246, + "epoch": 1.3472414474985168, + "mean_token_accuracy": 0.8178771138191223, + "num_tokens": 20342473.0, + "step": 13626, + "train/ce_loss": 0.6101201176643372 + }, + { + "epoch": 1.3472414474985168, + "step": 13626, + "train/sim_loss": 0.03364157676696777 + }, + { + "epoch": 1.3472414474985168, + "step": 13626, + "train/total_loss": 0.09465359151363373 + }, + { + "entropy": 9.54005241394043, + "epoch": 1.3473403203480325, + "mean_token_accuracy": 0.8163265585899353, + "num_tokens": 20358916.0, + "step": 13627, + "train/ce_loss": 0.4906456470489502 + }, + { + "epoch": 1.3473403203480325, + "step": 13627, + "train/sim_loss": 0.02690225839614868 + }, + { + "epoch": 1.3473403203480325, + "step": 13627, + "train/total_loss": 0.07596682012081146 + }, + { + "entropy": 9.509601593017578, + "epoch": 1.347439193197548, + "mean_token_accuracy": 0.8799048662185669, + "num_tokens": 20373702.0, + "step": 13628, + "train/ce_loss": 0.3665846288204193 + }, + { + "epoch": 1.347439193197548, + "step": 13628, + "train/sim_loss": 0.03023582696914673 + }, + { + "epoch": 1.347439193197548, + "step": 13628, + "train/total_loss": 0.0668942928314209 + }, + { + "entropy": 9.306737899780273, + "epoch": 1.3475380660470635, + "mean_token_accuracy": 0.8589341640472412, + "num_tokens": 20389435.0, + "step": 13629, + "train/ce_loss": 0.5503881573677063 + }, + { + "epoch": 1.3475380660470635, + "step": 13629, + "train/sim_loss": 0.03125309944152832 + }, + { + "epoch": 1.3475380660470635, + "step": 13629, + "train/total_loss": 0.08629191666841507 + }, + { + "entropy": 9.794876098632812, + "epoch": 1.347636938896579, + "mean_token_accuracy": 0.895638644695282, + "num_tokens": 20402012.0, + "step": 13630, + "train/ce_loss": 0.5362863540649414 + }, + { + "epoch": 1.347636938896579, + "step": 13630, + "train/sim_loss": 0.030851125717163086 + }, + { + "epoch": 1.347636938896579, + "step": 13630, + "train/total_loss": 0.08447976410388947 + }, + { + "entropy": 9.21249771118164, + "epoch": 1.3477358117460945, + "mean_token_accuracy": 0.8070175647735596, + "num_tokens": 20413932.0, + "step": 13631, + "train/ce_loss": 0.3146662712097168 + }, + { + "epoch": 1.3477358117460945, + "step": 13631, + "train/sim_loss": 0.04091811180114746 + }, + { + "epoch": 1.3477358117460945, + "step": 13631, + "train/total_loss": 0.07238474488258362 + }, + { + "entropy": 9.142019271850586, + "epoch": 1.3478346845956102, + "mean_token_accuracy": 0.8062677979469299, + "num_tokens": 20431626.0, + "step": 13632, + "train/ce_loss": 0.3594343066215515 + }, + { + "epoch": 1.3478346845956102, + "step": 13632, + "train/sim_loss": 0.02195107936859131 + }, + { + "epoch": 1.3478346845956102, + "step": 13632, + "train/total_loss": 0.0578945092856884 + }, + { + "entropy": 9.546784400939941, + "epoch": 1.3479335574451254, + "mean_token_accuracy": 0.8297567963600159, + "num_tokens": 20443020.0, + "step": 13633, + "train/ce_loss": 0.5253205299377441 + }, + { + "epoch": 1.3479335574451254, + "step": 13633, + "train/sim_loss": 0.01568591594696045 + }, + { + "epoch": 1.3479335574451254, + "step": 13633, + "train/total_loss": 0.06821797043085098 + }, + { + "entropy": 9.750186920166016, + "epoch": 1.3480324302946411, + "mean_token_accuracy": 0.8548951148986816, + "num_tokens": 20452133.0, + "step": 13634, + "train/ce_loss": 3.842978060220048e-07 + }, + { + "epoch": 1.3480324302946411, + "step": 13634, + "train/sim_loss": 0.00917971134185791 + }, + { + "epoch": 1.3480324302946411, + "step": 13634, + "train/total_loss": 0.00917974952608347 + }, + { + "entropy": 9.534496307373047, + "epoch": 1.3481313031441566, + "mean_token_accuracy": 0.8071979284286499, + "num_tokens": 20468388.0, + "step": 13635, + "train/ce_loss": 0.5086488127708435 + }, + { + "epoch": 1.3481313031441566, + "step": 13635, + "train/sim_loss": 0.03552985191345215 + }, + { + "epoch": 1.3481313031441566, + "step": 13635, + "train/total_loss": 0.08639473468065262 + }, + { + "entropy": 9.680959701538086, + "epoch": 1.3482301759936721, + "mean_token_accuracy": 0.8237951993942261, + "num_tokens": 20479570.0, + "step": 13636, + "train/ce_loss": 0.8860619068145752 + }, + { + "epoch": 1.3482301759936721, + "step": 13636, + "train/sim_loss": 0.11530864238739014 + }, + { + "epoch": 1.3482301759936721, + "step": 13636, + "train/total_loss": 0.2039148360490799 + }, + { + "entropy": 9.519018173217773, + "epoch": 1.3483290488431876, + "mean_token_accuracy": 0.8521126508712769, + "num_tokens": 20496176.0, + "step": 13637, + "train/ce_loss": 0.4856070578098297 + }, + { + "epoch": 1.3483290488431876, + "step": 13637, + "train/sim_loss": 0.055460333824157715 + }, + { + "epoch": 1.3483290488431876, + "step": 13637, + "train/total_loss": 0.10402104258537292 + }, + { + "entropy": 9.438509941101074, + "epoch": 1.348427921692703, + "mean_token_accuracy": 0.7975460290908813, + "num_tokens": 20509065.0, + "step": 13638, + "train/ce_loss": 0.622859537601471 + }, + { + "epoch": 1.348427921692703, + "step": 13638, + "train/sim_loss": 0.05760383605957031 + }, + { + "epoch": 1.348427921692703, + "step": 13638, + "train/total_loss": 0.11988979578018188 + }, + { + "entropy": 9.369848251342773, + "epoch": 1.3485267945422188, + "mean_token_accuracy": 0.8187995553016663, + "num_tokens": 20527732.0, + "step": 13639, + "train/ce_loss": 0.3842400312423706 + }, + { + "epoch": 1.3485267945422188, + "step": 13639, + "train/sim_loss": 0.02767401933670044 + }, + { + "epoch": 1.3485267945422188, + "step": 13639, + "train/total_loss": 0.06609801948070526 + }, + { + "epoch": 1.3486256673917343, + "grad_norm": 0.5913474559783936, + "learning_rate": 6.630321910695743e-06, + "loss": 0.0924, + "step": 13640 + }, + { + "entropy": 9.618206977844238, + "epoch": 1.3486256673917343, + "mean_token_accuracy": 0.8842832446098328, + "num_tokens": 20541718.0, + "step": 13640, + "train/ce_loss": 0.4600289762020111 + }, + { + "epoch": 1.3486256673917343, + "step": 13640, + "train/sim_loss": 0.01607358455657959 + }, + { + "epoch": 1.3486256673917343, + "step": 13640, + "train/total_loss": 0.06207648292183876 + }, + { + "entropy": 9.824331283569336, + "epoch": 1.3487245402412498, + "mean_token_accuracy": 0.8595317602157593, + "num_tokens": 20550348.0, + "step": 13641, + "train/ce_loss": 0.5811305642127991 + }, + { + "epoch": 1.3487245402412498, + "step": 13641, + "train/sim_loss": 0.02928173542022705 + }, + { + "epoch": 1.3487245402412498, + "step": 13641, + "train/total_loss": 0.08739478886127472 + }, + { + "entropy": 9.925785064697266, + "epoch": 1.3488234130907653, + "mean_token_accuracy": 0.8289855122566223, + "num_tokens": 20567451.0, + "step": 13642, + "train/ce_loss": 0.8075533509254456 + }, + { + "epoch": 1.3488234130907653, + "step": 13642, + "train/sim_loss": 0.028055429458618164 + }, + { + "epoch": 1.3488234130907653, + "step": 13642, + "train/total_loss": 0.10881076753139496 + }, + { + "entropy": 9.561555862426758, + "epoch": 1.3489222859402807, + "mean_token_accuracy": 0.8559423685073853, + "num_tokens": 20581283.0, + "step": 13643, + "train/ce_loss": 0.36728715896606445 + }, + { + "epoch": 1.3489222859402807, + "step": 13643, + "train/sim_loss": 0.015116453170776367 + }, + { + "epoch": 1.3489222859402807, + "step": 13643, + "train/total_loss": 0.05184517055749893 + }, + { + "entropy": 9.54364013671875, + "epoch": 1.3490211587897964, + "mean_token_accuracy": 0.8482428193092346, + "num_tokens": 20596965.0, + "step": 13644, + "train/ce_loss": 0.37834715843200684 + }, + { + "epoch": 1.3490211587897964, + "step": 13644, + "train/sim_loss": 0.01733088493347168 + }, + { + "epoch": 1.3490211587897964, + "step": 13644, + "train/total_loss": 0.055165600031614304 + }, + { + "entropy": 9.53550910949707, + "epoch": 1.3491200316393117, + "mean_token_accuracy": 0.8628389239311218, + "num_tokens": 20612701.0, + "step": 13645, + "train/ce_loss": 3.992613528680522e-07 + }, + { + "epoch": 1.3491200316393117, + "step": 13645, + "train/sim_loss": 0.05732637643814087 + }, + { + "epoch": 1.3491200316393117, + "step": 13645, + "train/total_loss": 0.05732641741633415 + }, + { + "entropy": 9.463737487792969, + "epoch": 1.3492189044888274, + "mean_token_accuracy": 0.8579952120780945, + "num_tokens": 20622943.0, + "step": 13646, + "train/ce_loss": 0.479726105928421 + }, + { + "epoch": 1.3492189044888274, + "step": 13646, + "train/sim_loss": 0.017206192016601562 + }, + { + "epoch": 1.3492189044888274, + "step": 13646, + "train/total_loss": 0.06517880409955978 + }, + { + "entropy": 9.052145004272461, + "epoch": 1.349317777338343, + "mean_token_accuracy": 0.8390341997146606, + "num_tokens": 20636548.0, + "step": 13647, + "train/ce_loss": 0.7623302340507507 + }, + { + "epoch": 1.349317777338343, + "step": 13647, + "train/sim_loss": 0.060335636138916016 + }, + { + "epoch": 1.349317777338343, + "step": 13647, + "train/total_loss": 0.13656866550445557 + }, + { + "entropy": 9.632833480834961, + "epoch": 1.3494166501878584, + "mean_token_accuracy": 0.861586332321167, + "num_tokens": 20654666.0, + "step": 13648, + "train/ce_loss": 0.4108925759792328 + }, + { + "epoch": 1.3494166501878584, + "step": 13648, + "train/sim_loss": 0.03578448295593262 + }, + { + "epoch": 1.3494166501878584, + "step": 13648, + "train/total_loss": 0.07687374204397202 + }, + { + "entropy": 9.77056884765625, + "epoch": 1.3495155230373739, + "mean_token_accuracy": 0.8675417900085449, + "num_tokens": 20675921.0, + "step": 13649, + "train/ce_loss": 0.26306048035621643 + }, + { + "epoch": 1.3495155230373739, + "step": 13649, + "train/sim_loss": 0.026702404022216797 + }, + { + "epoch": 1.3495155230373739, + "step": 13649, + "train/total_loss": 0.05300845205783844 + }, + { + "entropy": 9.755699157714844, + "epoch": 1.3496143958868894, + "mean_token_accuracy": 0.8380634188652039, + "num_tokens": 20688672.0, + "step": 13650, + "train/ce_loss": 0.4000292420387268 + }, + { + "epoch": 1.3496143958868894, + "step": 13650, + "train/sim_loss": 0.04509615898132324 + }, + { + "epoch": 1.3496143958868894, + "step": 13650, + "train/total_loss": 0.08509908616542816 + }, + { + "entropy": 8.938912391662598, + "epoch": 1.349713268736405, + "mean_token_accuracy": 0.8433981537818909, + "num_tokens": 20695950.0, + "step": 13651, + "train/ce_loss": 0.3357605040073395 + }, + { + "epoch": 1.349713268736405, + "step": 13651, + "train/sim_loss": 0.02139115333557129 + }, + { + "epoch": 1.349713268736405, + "step": 13651, + "train/total_loss": 0.054967205971479416 + }, + { + "entropy": 9.311410903930664, + "epoch": 1.3498121415859206, + "mean_token_accuracy": 0.7918604612350464, + "num_tokens": 20706612.0, + "step": 13652, + "train/ce_loss": 0.6576704382896423 + }, + { + "epoch": 1.3498121415859206, + "step": 13652, + "train/sim_loss": 0.05026531219482422 + }, + { + "epoch": 1.3498121415859206, + "step": 13652, + "train/total_loss": 0.11603235453367233 + }, + { + "entropy": 9.417867660522461, + "epoch": 1.349911014435436, + "mean_token_accuracy": 0.8429448008537292, + "num_tokens": 20720129.0, + "step": 13653, + "train/ce_loss": 0.7458851337432861 + }, + { + "epoch": 1.349911014435436, + "step": 13653, + "train/sim_loss": 0.06490862369537354 + }, + { + "epoch": 1.349911014435436, + "step": 13653, + "train/total_loss": 0.13949713110923767 + }, + { + "entropy": 9.088125228881836, + "epoch": 1.3500098872849515, + "mean_token_accuracy": 0.8180593252182007, + "num_tokens": 20729136.0, + "step": 13654, + "train/ce_loss": 0.9032076001167297 + }, + { + "epoch": 1.3500098872849515, + "step": 13654, + "train/sim_loss": 0.04316675662994385 + }, + { + "epoch": 1.3500098872849515, + "step": 13654, + "train/total_loss": 0.1334875226020813 + }, + { + "entropy": 9.390620231628418, + "epoch": 1.350108760134467, + "mean_token_accuracy": 0.8893557190895081, + "num_tokens": 20741578.0, + "step": 13655, + "train/ce_loss": 6.744927532054135e-07 + }, + { + "epoch": 1.350108760134467, + "step": 13655, + "train/sim_loss": 0.06166565418243408 + }, + { + "epoch": 1.350108760134467, + "step": 13655, + "train/total_loss": 0.061665721237659454 + }, + { + "entropy": 9.346436500549316, + "epoch": 1.3502076329839827, + "mean_token_accuracy": 0.7911602258682251, + "num_tokens": 20755029.0, + "step": 13656, + "train/ce_loss": 0.6362446546554565 + }, + { + "epoch": 1.3502076329839827, + "step": 13656, + "train/sim_loss": 0.06780064105987549 + }, + { + "epoch": 1.3502076329839827, + "step": 13656, + "train/total_loss": 0.13142511248588562 + }, + { + "entropy": 9.513689041137695, + "epoch": 1.350306505833498, + "mean_token_accuracy": 0.8301886916160583, + "num_tokens": 20769170.0, + "step": 13657, + "train/ce_loss": 0.338259756565094 + }, + { + "epoch": 1.350306505833498, + "step": 13657, + "train/sim_loss": 0.04737520217895508 + }, + { + "epoch": 1.350306505833498, + "step": 13657, + "train/total_loss": 0.08120118081569672 + }, + { + "entropy": 9.279478073120117, + "epoch": 1.3504053786830137, + "mean_token_accuracy": 0.8365527391433716, + "num_tokens": 20787704.0, + "step": 13658, + "train/ce_loss": 0.3791244626045227 + }, + { + "epoch": 1.3504053786830137, + "step": 13658, + "train/sim_loss": 0.044151127338409424 + }, + { + "epoch": 1.3504053786830137, + "step": 13658, + "train/total_loss": 0.08206357061862946 + }, + { + "entropy": 8.715532302856445, + "epoch": 1.3505042515325292, + "mean_token_accuracy": 0.8713858127593994, + "num_tokens": 20799038.0, + "step": 13659, + "train/ce_loss": 0.260022908449173 + }, + { + "epoch": 1.3505042515325292, + "step": 13659, + "train/sim_loss": 0.06436890363693237 + }, + { + "epoch": 1.3505042515325292, + "step": 13659, + "train/total_loss": 0.09037119150161743 + }, + { + "epoch": 1.3506031243820447, + "grad_norm": 0.47040724754333496, + "learning_rate": 6.625377045937794e-06, + "loss": 0.0852, + "step": 13660 + }, + { + "entropy": 9.460280418395996, + "epoch": 1.3506031243820447, + "mean_token_accuracy": 0.8517315983772278, + "num_tokens": 20811703.0, + "step": 13660, + "train/ce_loss": 0.3336544930934906 + }, + { + "epoch": 1.3506031243820447, + "step": 13660, + "train/sim_loss": 0.07105123996734619 + }, + { + "epoch": 1.3506031243820447, + "step": 13660, + "train/total_loss": 0.10441669076681137 + }, + { + "entropy": 9.226823806762695, + "epoch": 1.3507019972315601, + "mean_token_accuracy": 0.8804205060005188, + "num_tokens": 20824569.0, + "step": 13661, + "train/ce_loss": 0.4465678632259369 + }, + { + "epoch": 1.3507019972315601, + "step": 13661, + "train/sim_loss": 0.029268205165863037 + }, + { + "epoch": 1.3507019972315601, + "step": 13661, + "train/total_loss": 0.07392498850822449 + }, + { + "entropy": 9.114995956420898, + "epoch": 1.3508008700810756, + "mean_token_accuracy": 0.8483221530914307, + "num_tokens": 20832409.0, + "step": 13662, + "train/ce_loss": 0.5511818528175354 + }, + { + "epoch": 1.3508008700810756, + "step": 13662, + "train/sim_loss": 0.05144548416137695 + }, + { + "epoch": 1.3508008700810756, + "step": 13662, + "train/total_loss": 0.10656367242336273 + }, + { + "entropy": 9.861185073852539, + "epoch": 1.3508997429305913, + "mean_token_accuracy": 0.8830769062042236, + "num_tokens": 20846320.0, + "step": 13663, + "train/ce_loss": 0.19680428504943848 + }, + { + "epoch": 1.3508997429305913, + "step": 13663, + "train/sim_loss": 0.028049826622009277 + }, + { + "epoch": 1.3508997429305913, + "step": 13663, + "train/total_loss": 0.047730255872011185 + }, + { + "entropy": 9.307769775390625, + "epoch": 1.3509986157801068, + "mean_token_accuracy": 0.8421052694320679, + "num_tokens": 20861964.0, + "step": 13664, + "train/ce_loss": 0.6888612508773804 + }, + { + "epoch": 1.3509986157801068, + "step": 13664, + "train/sim_loss": 0.03980463743209839 + }, + { + "epoch": 1.3509986157801068, + "step": 13664, + "train/total_loss": 0.1086907610297203 + }, + { + "entropy": 9.584297180175781, + "epoch": 1.3510974886296223, + "mean_token_accuracy": 0.840531587600708, + "num_tokens": 20878062.0, + "step": 13665, + "train/ce_loss": 0.7937121391296387 + }, + { + "epoch": 1.3510974886296223, + "step": 13665, + "train/sim_loss": 0.035603880882263184 + }, + { + "epoch": 1.3510974886296223, + "step": 13665, + "train/total_loss": 0.11497509479522705 + }, + { + "entropy": 9.446189880371094, + "epoch": 1.3511963614791378, + "mean_token_accuracy": 0.8957415819168091, + "num_tokens": 20889840.0, + "step": 13666, + "train/ce_loss": 0.5858819484710693 + }, + { + "epoch": 1.3511963614791378, + "step": 13666, + "train/sim_loss": 0.07067757844924927 + }, + { + "epoch": 1.3511963614791378, + "step": 13666, + "train/total_loss": 0.12926577031612396 + }, + { + "entropy": 9.713780403137207, + "epoch": 1.3512952343286533, + "mean_token_accuracy": 0.8760330677032471, + "num_tokens": 20900071.0, + "step": 13667, + "train/ce_loss": 2.1856669718545163e-06 + }, + { + "epoch": 1.3512952343286533, + "step": 13667, + "train/sim_loss": 0.043507933616638184 + }, + { + "epoch": 1.3512952343286533, + "step": 13667, + "train/total_loss": 0.04350815340876579 + }, + { + "entropy": 8.882857322692871, + "epoch": 1.351394107178169, + "mean_token_accuracy": 0.84375, + "num_tokens": 20910655.0, + "step": 13668, + "train/ce_loss": 0.5003335475921631 + }, + { + "epoch": 1.351394107178169, + "step": 13668, + "train/sim_loss": 0.05475962162017822 + }, + { + "epoch": 1.351394107178169, + "step": 13668, + "train/total_loss": 0.10479298233985901 + }, + { + "entropy": 9.451465606689453, + "epoch": 1.3514929800276845, + "mean_token_accuracy": 0.8666666746139526, + "num_tokens": 20924978.0, + "step": 13669, + "train/ce_loss": 0.14235422015190125 + }, + { + "epoch": 1.3514929800276845, + "step": 13669, + "train/sim_loss": 0.040856897830963135 + }, + { + "epoch": 1.3514929800276845, + "step": 13669, + "train/total_loss": 0.05509231984615326 + }, + { + "entropy": 9.571219444274902, + "epoch": 1.3515918528772, + "mean_token_accuracy": 0.8622589707374573, + "num_tokens": 20938361.0, + "step": 13670, + "train/ce_loss": 0.39895856380462646 + }, + { + "epoch": 1.3515918528772, + "step": 13670, + "train/sim_loss": 0.045949697494506836 + }, + { + "epoch": 1.3515918528772, + "step": 13670, + "train/total_loss": 0.08584555983543396 + }, + { + "entropy": 10.024397850036621, + "epoch": 1.3516907257267154, + "mean_token_accuracy": 0.8823529481887817, + "num_tokens": 20954979.0, + "step": 13671, + "train/ce_loss": 1.5115524547582027e-06 + }, + { + "epoch": 1.3516907257267154, + "step": 13671, + "train/sim_loss": 0.032278478145599365 + }, + { + "epoch": 1.3516907257267154, + "step": 13671, + "train/total_loss": 0.0322786308825016 + }, + { + "entropy": 9.903566360473633, + "epoch": 1.351789598576231, + "mean_token_accuracy": 0.8595317602157593, + "num_tokens": 20963075.0, + "step": 13672, + "train/ce_loss": 0.9620798230171204 + }, + { + "epoch": 1.351789598576231, + "step": 13672, + "train/sim_loss": 0.03511011600494385 + }, + { + "epoch": 1.351789598576231, + "step": 13672, + "train/total_loss": 0.1313180923461914 + }, + { + "entropy": 9.321784973144531, + "epoch": 1.3518884714257464, + "mean_token_accuracy": 0.861495852470398, + "num_tokens": 20974903.0, + "step": 13673, + "train/ce_loss": 0.3781823515892029 + }, + { + "epoch": 1.3518884714257464, + "step": 13673, + "train/sim_loss": 0.05600583553314209 + }, + { + "epoch": 1.3518884714257464, + "step": 13673, + "train/total_loss": 0.09382407367229462 + }, + { + "entropy": 9.845005989074707, + "epoch": 1.351987344275262, + "mean_token_accuracy": 0.8668407201766968, + "num_tokens": 20992111.0, + "step": 13674, + "train/ce_loss": 0.5103499889373779 + }, + { + "epoch": 1.351987344275262, + "step": 13674, + "train/sim_loss": 0.07189428806304932 + }, + { + "epoch": 1.351987344275262, + "step": 13674, + "train/total_loss": 0.12292928993701935 + }, + { + "entropy": 9.970853805541992, + "epoch": 1.3520862171247776, + "mean_token_accuracy": 1.0, + "num_tokens": 21009247.0, + "step": 13675, + "train/ce_loss": 9.953827429853845e-06 + }, + { + "epoch": 1.3520862171247776, + "step": 13675, + "train/sim_loss": 0.030089378356933594 + }, + { + "epoch": 1.3520862171247776, + "step": 13675, + "train/total_loss": 0.030090373009443283 + }, + { + "entropy": 9.394556045532227, + "epoch": 1.352185089974293, + "mean_token_accuracy": 0.8264151215553284, + "num_tokens": 21019435.0, + "step": 13676, + "train/ce_loss": 0.9093856811523438 + }, + { + "epoch": 1.352185089974293, + "step": 13676, + "train/sim_loss": 0.07012403011322021 + }, + { + "epoch": 1.352185089974293, + "step": 13676, + "train/total_loss": 0.1610625982284546 + }, + { + "entropy": 9.150138854980469, + "epoch": 1.3522839628238086, + "mean_token_accuracy": 0.794350266456604, + "num_tokens": 21033386.0, + "step": 13677, + "train/ce_loss": 0.879384458065033 + }, + { + "epoch": 1.3522839628238086, + "step": 13677, + "train/sim_loss": 0.04772663116455078 + }, + { + "epoch": 1.3522839628238086, + "step": 13677, + "train/total_loss": 0.13566508889198303 + }, + { + "entropy": 9.36704158782959, + "epoch": 1.352382835673324, + "mean_token_accuracy": 0.8592178821563721, + "num_tokens": 21048317.0, + "step": 13678, + "train/ce_loss": 0.45957478880882263 + }, + { + "epoch": 1.352382835673324, + "step": 13678, + "train/sim_loss": 0.061781346797943115 + }, + { + "epoch": 1.352382835673324, + "step": 13678, + "train/total_loss": 0.10773882269859314 + }, + { + "entropy": 9.314252853393555, + "epoch": 1.3524817085228396, + "mean_token_accuracy": 0.8839957118034363, + "num_tokens": 21059525.0, + "step": 13679, + "train/ce_loss": 0.5363433957099915 + }, + { + "epoch": 1.3524817085228396, + "step": 13679, + "train/sim_loss": 0.05536627769470215 + }, + { + "epoch": 1.3524817085228396, + "step": 13679, + "train/total_loss": 0.10900062322616577 + }, + { + "epoch": 1.3525805813723553, + "grad_norm": 0.49083560705184937, + "learning_rate": 6.620432181179845e-06, + "loss": 0.0814, + "step": 13680 + }, + { + "entropy": 9.882791519165039, + "epoch": 1.3525805813723553, + "mean_token_accuracy": 0.9039735198020935, + "num_tokens": 21068243.0, + "step": 13680, + "train/ce_loss": 0.7804428935050964 + }, + { + "epoch": 1.3525805813723553, + "step": 13680, + "train/sim_loss": 0.024962246417999268 + }, + { + "epoch": 1.3525805813723553, + "step": 13680, + "train/total_loss": 0.10300653427839279 + }, + { + "entropy": 9.343620300292969, + "epoch": 1.3526794542218707, + "mean_token_accuracy": 0.7877461910247803, + "num_tokens": 21079036.0, + "step": 13681, + "train/ce_loss": 0.8876268267631531 + }, + { + "epoch": 1.3526794542218707, + "step": 13681, + "train/sim_loss": 0.03623753786087036 + }, + { + "epoch": 1.3526794542218707, + "step": 13681, + "train/total_loss": 0.1250002235174179 + }, + { + "entropy": 10.111467361450195, + "epoch": 1.3527783270713862, + "mean_token_accuracy": 0.8901098966598511, + "num_tokens": 21094325.0, + "step": 13682, + "train/ce_loss": 5.512391680895234e-07 + }, + { + "epoch": 1.3527783270713862, + "step": 13682, + "train/sim_loss": 0.011517763137817383 + }, + { + "epoch": 1.3527783270713862, + "step": 13682, + "train/total_loss": 0.011517818085849285 + }, + { + "entropy": 9.968816757202148, + "epoch": 1.3528771999209017, + "mean_token_accuracy": 0.8844936490058899, + "num_tokens": 21106474.0, + "step": 13683, + "train/ce_loss": 0.5474779009819031 + }, + { + "epoch": 1.3528771999209017, + "step": 13683, + "train/sim_loss": 0.030763864517211914 + }, + { + "epoch": 1.3528771999209017, + "step": 13683, + "train/total_loss": 0.08551165461540222 + }, + { + "entropy": 8.74228286743164, + "epoch": 1.3529760727704172, + "mean_token_accuracy": 0.8237179517745972, + "num_tokens": 21116885.0, + "step": 13684, + "train/ce_loss": 0.4563940465450287 + }, + { + "epoch": 1.3529760727704172, + "step": 13684, + "train/sim_loss": 0.01993584632873535 + }, + { + "epoch": 1.3529760727704172, + "step": 13684, + "train/total_loss": 0.0655752569437027 + }, + { + "entropy": 9.917462348937988, + "epoch": 1.3530749456199327, + "mean_token_accuracy": 0.8086591958999634, + "num_tokens": 21129874.0, + "step": 13685, + "train/ce_loss": 7.744950494270597e-07 + }, + { + "epoch": 1.3530749456199327, + "step": 13685, + "train/sim_loss": 0.07148236036300659 + }, + { + "epoch": 1.3530749456199327, + "step": 13685, + "train/total_loss": 0.07148243486881256 + }, + { + "entropy": 8.942914009094238, + "epoch": 1.3531738184694482, + "mean_token_accuracy": 0.8819188475608826, + "num_tokens": 21137511.0, + "step": 13686, + "train/ce_loss": 0.3395788371562958 + }, + { + "epoch": 1.3531738184694482, + "step": 13686, + "train/sim_loss": 0.04911959171295166 + }, + { + "epoch": 1.3531738184694482, + "step": 13686, + "train/total_loss": 0.08307747542858124 + }, + { + "entropy": 9.62411880493164, + "epoch": 1.3532726913189639, + "mean_token_accuracy": 0.8528784513473511, + "num_tokens": 21149696.0, + "step": 13687, + "train/ce_loss": 0.5020937323570251 + }, + { + "epoch": 1.3532726913189639, + "step": 13687, + "train/sim_loss": 0.036615967750549316 + }, + { + "epoch": 1.3532726913189639, + "step": 13687, + "train/total_loss": 0.08682534098625183 + }, + { + "entropy": 9.437527656555176, + "epoch": 1.3533715641684794, + "mean_token_accuracy": 0.7902330756187439, + "num_tokens": 21162134.0, + "step": 13688, + "train/ce_loss": 0.6496618390083313 + }, + { + "epoch": 1.3533715641684794, + "step": 13688, + "train/sim_loss": 0.12764215469360352 + }, + { + "epoch": 1.3533715641684794, + "step": 13688, + "train/total_loss": 0.19260834157466888 + }, + { + "entropy": 9.93929672241211, + "epoch": 1.3534704370179949, + "mean_token_accuracy": 0.8837606906890869, + "num_tokens": 21179339.0, + "step": 13689, + "train/ce_loss": 4.694923347869917e-07 + }, + { + "epoch": 1.3534704370179949, + "step": 13689, + "train/sim_loss": 0.04545736312866211 + }, + { + "epoch": 1.3534704370179949, + "step": 13689, + "train/total_loss": 0.04545741155743599 + }, + { + "entropy": 9.728919982910156, + "epoch": 1.3535693098675103, + "mean_token_accuracy": 0.8656716346740723, + "num_tokens": 21192419.0, + "step": 13690, + "train/ce_loss": 0.7525399327278137 + }, + { + "epoch": 1.3535693098675103, + "step": 13690, + "train/sim_loss": 0.0697554349899292 + }, + { + "epoch": 1.3535693098675103, + "step": 13690, + "train/total_loss": 0.14500942826271057 + }, + { + "entropy": 9.032828330993652, + "epoch": 1.3536681827170258, + "mean_token_accuracy": 0.8991097807884216, + "num_tokens": 21203843.0, + "step": 13691, + "train/ce_loss": 0.48004278540611267 + }, + { + "epoch": 1.3536681827170258, + "step": 13691, + "train/sim_loss": 0.030698657035827637 + }, + { + "epoch": 1.3536681827170258, + "step": 13691, + "train/total_loss": 0.07870294153690338 + }, + { + "entropy": 9.838924407958984, + "epoch": 1.3537670555665415, + "mean_token_accuracy": 0.831818163394928, + "num_tokens": 21220022.0, + "step": 13692, + "train/ce_loss": 0.6308091282844543 + }, + { + "epoch": 1.3537670555665415, + "step": 13692, + "train/sim_loss": 0.02589893341064453 + }, + { + "epoch": 1.3537670555665415, + "step": 13692, + "train/total_loss": 0.08897984772920609 + }, + { + "entropy": 9.38890266418457, + "epoch": 1.353865928416057, + "mean_token_accuracy": 0.8523908257484436, + "num_tokens": 21237516.0, + "step": 13693, + "train/ce_loss": 0.19335439801216125 + }, + { + "epoch": 1.353865928416057, + "step": 13693, + "train/sim_loss": 0.028039216995239258 + }, + { + "epoch": 1.353865928416057, + "step": 13693, + "train/total_loss": 0.0473746582865715 + }, + { + "entropy": 9.17097282409668, + "epoch": 1.3539648012655725, + "mean_token_accuracy": 0.8223981857299805, + "num_tokens": 21248070.0, + "step": 13694, + "train/ce_loss": 0.3473811149597168 + }, + { + "epoch": 1.3539648012655725, + "step": 13694, + "train/sim_loss": 0.04520869255065918 + }, + { + "epoch": 1.3539648012655725, + "step": 13694, + "train/total_loss": 0.07994680106639862 + }, + { + "entropy": 9.45095443725586, + "epoch": 1.354063674115088, + "mean_token_accuracy": 0.863173246383667, + "num_tokens": 21259601.0, + "step": 13695, + "train/ce_loss": 0.2778593897819519 + }, + { + "epoch": 1.354063674115088, + "step": 13695, + "train/sim_loss": 0.02582484483718872 + }, + { + "epoch": 1.354063674115088, + "step": 13695, + "train/total_loss": 0.05361078679561615 + }, + { + "entropy": 9.597597122192383, + "epoch": 1.3541625469646035, + "mean_token_accuracy": 0.8496376872062683, + "num_tokens": 21266408.0, + "step": 13696, + "train/ce_loss": 0.6507938504219055 + }, + { + "epoch": 1.3541625469646035, + "step": 13696, + "train/sim_loss": 0.02554112672805786 + }, + { + "epoch": 1.3541625469646035, + "step": 13696, + "train/total_loss": 0.0906205102801323 + }, + { + "entropy": 9.837652206420898, + "epoch": 1.354261419814119, + "mean_token_accuracy": 0.8276422619819641, + "num_tokens": 21278528.0, + "step": 13697, + "train/ce_loss": 3.5665334507939406e-07 + }, + { + "epoch": 1.354261419814119, + "step": 13697, + "train/sim_loss": 0.011739969253540039 + }, + { + "epoch": 1.354261419814119, + "step": 13697, + "train/total_loss": 0.011740004643797874 + }, + { + "entropy": 9.7898588180542, + "epoch": 1.3543602926636344, + "mean_token_accuracy": 0.8633802533149719, + "num_tokens": 21292872.0, + "step": 13698, + "train/ce_loss": 0.44085487723350525 + }, + { + "epoch": 1.3543602926636344, + "step": 13698, + "train/sim_loss": 0.014790177345275879 + }, + { + "epoch": 1.3543602926636344, + "step": 13698, + "train/total_loss": 0.058875665068626404 + }, + { + "entropy": 8.924560546875, + "epoch": 1.3544591655131502, + "mean_token_accuracy": 0.8479408621788025, + "num_tokens": 21304539.0, + "step": 13699, + "train/ce_loss": 0.4733443260192871 + }, + { + "epoch": 1.3544591655131502, + "step": 13699, + "train/sim_loss": 0.01756906509399414 + }, + { + "epoch": 1.3544591655131502, + "step": 13699, + "train/total_loss": 0.06490349769592285 + }, + { + "epoch": 1.3545580383626656, + "grad_norm": 0.46372318267822266, + "learning_rate": 6.615487316421897e-06, + "loss": 0.0832, + "step": 13700 + }, + { + "entropy": 9.423088073730469, + "epoch": 1.3545580383626656, + "mean_token_accuracy": 0.906593382358551, + "num_tokens": 21321097.0, + "step": 13700, + "train/ce_loss": 3.624274711455655e-07 + }, + { + "epoch": 1.3545580383626656, + "step": 13700, + "train/sim_loss": 0.048324644565582275 + }, + { + "epoch": 1.3545580383626656, + "step": 13700, + "train/total_loss": 0.04832468181848526 + }, + { + "entropy": 9.702707290649414, + "epoch": 1.3546569112121811, + "mean_token_accuracy": 0.866847813129425, + "num_tokens": 21335777.0, + "step": 13701, + "train/ce_loss": 0.5067835450172424 + }, + { + "epoch": 1.3546569112121811, + "step": 13701, + "train/sim_loss": 0.09978657960891724 + }, + { + "epoch": 1.3546569112121811, + "step": 13701, + "train/total_loss": 0.15046493709087372 + }, + { + "entropy": 10.293310165405273, + "epoch": 1.3547557840616966, + "mean_token_accuracy": 0.8739495873451233, + "num_tokens": 21342497.0, + "step": 13702, + "train/ce_loss": 0.79531329870224 + }, + { + "epoch": 1.3547557840616966, + "step": 13702, + "train/sim_loss": 0.05019557476043701 + }, + { + "epoch": 1.3547557840616966, + "step": 13702, + "train/total_loss": 0.12972691655158997 + }, + { + "entropy": 9.38435173034668, + "epoch": 1.354854656911212, + "mean_token_accuracy": 0.8415094614028931, + "num_tokens": 21351929.0, + "step": 13703, + "train/ce_loss": 0.5858197808265686 + }, + { + "epoch": 1.354854656911212, + "step": 13703, + "train/sim_loss": 0.03939276933670044 + }, + { + "epoch": 1.354854656911212, + "step": 13703, + "train/total_loss": 0.0979747474193573 + }, + { + "entropy": 9.524249076843262, + "epoch": 1.3549535297607278, + "mean_token_accuracy": 0.8055555820465088, + "num_tokens": 21366576.0, + "step": 13704, + "train/ce_loss": 0.5867825150489807 + }, + { + "epoch": 1.3549535297607278, + "step": 13704, + "train/sim_loss": 0.0211142897605896 + }, + { + "epoch": 1.3549535297607278, + "step": 13704, + "train/total_loss": 0.07979254424571991 + }, + { + "entropy": 9.725940704345703, + "epoch": 1.3550524026102433, + "mean_token_accuracy": 0.8510131239891052, + "num_tokens": 21379694.0, + "step": 13705, + "train/ce_loss": 0.3601626455783844 + }, + { + "epoch": 1.3550524026102433, + "step": 13705, + "train/sim_loss": 0.02266395092010498 + }, + { + "epoch": 1.3550524026102433, + "step": 13705, + "train/total_loss": 0.0586802177131176 + }, + { + "entropy": 10.03100872039795, + "epoch": 1.3551512754597588, + "mean_token_accuracy": 0.8421052694320679, + "num_tokens": 21388634.0, + "step": 13706, + "train/ce_loss": 0.6063358187675476 + }, + { + "epoch": 1.3551512754597588, + "step": 13706, + "train/sim_loss": 0.04362577199935913 + }, + { + "epoch": 1.3551512754597588, + "step": 13706, + "train/total_loss": 0.10425935685634613 + }, + { + "entropy": 9.478357315063477, + "epoch": 1.3552501483092743, + "mean_token_accuracy": 0.7989071011543274, + "num_tokens": 21402674.0, + "step": 13707, + "train/ce_loss": 0.3340933918952942 + }, + { + "epoch": 1.3552501483092743, + "step": 13707, + "train/sim_loss": 0.07743042707443237 + }, + { + "epoch": 1.3552501483092743, + "step": 13707, + "train/total_loss": 0.11083976924419403 + }, + { + "entropy": 9.229923248291016, + "epoch": 1.3553490211587897, + "mean_token_accuracy": 0.8595564961433411, + "num_tokens": 21414831.0, + "step": 13708, + "train/ce_loss": 0.38915473222732544 + }, + { + "epoch": 1.3553490211587897, + "step": 13708, + "train/sim_loss": 0.05059254169464111 + }, + { + "epoch": 1.3553490211587897, + "step": 13708, + "train/total_loss": 0.08950801193714142 + }, + { + "entropy": 10.00566291809082, + "epoch": 1.3554478940083055, + "mean_token_accuracy": 0.8647058606147766, + "num_tokens": 21432151.0, + "step": 13709, + "train/ce_loss": 4.137231996992341e-07 + }, + { + "epoch": 1.3554478940083055, + "step": 13709, + "train/sim_loss": 0.019805073738098145 + }, + { + "epoch": 1.3554478940083055, + "step": 13709, + "train/total_loss": 0.019805114716291428 + }, + { + "entropy": 9.698070526123047, + "epoch": 1.3555467668578207, + "mean_token_accuracy": 0.8344155550003052, + "num_tokens": 21449998.0, + "step": 13710, + "train/ce_loss": 0.8143669366836548 + }, + { + "epoch": 1.3555467668578207, + "step": 13710, + "train/sim_loss": 0.023020625114440918 + }, + { + "epoch": 1.3555467668578207, + "step": 13710, + "train/total_loss": 0.1044573187828064 + }, + { + "entropy": 9.66541576385498, + "epoch": 1.3556456397073364, + "mean_token_accuracy": 0.7910447716712952, + "num_tokens": 21462878.0, + "step": 13711, + "train/ce_loss": 0.7044131755828857 + }, + { + "epoch": 1.3556456397073364, + "step": 13711, + "train/sim_loss": 0.04507744312286377 + }, + { + "epoch": 1.3556456397073364, + "step": 13711, + "train/total_loss": 0.11551876366138458 + }, + { + "entropy": 8.82071304321289, + "epoch": 1.355744512556852, + "mean_token_accuracy": 0.8370786309242249, + "num_tokens": 21474831.0, + "step": 13712, + "train/ce_loss": 0.5589843392372131 + }, + { + "epoch": 1.355744512556852, + "step": 13712, + "train/sim_loss": 0.056587278842926025 + }, + { + "epoch": 1.355744512556852, + "step": 13712, + "train/total_loss": 0.11248571425676346 + }, + { + "entropy": 9.653894424438477, + "epoch": 1.3558433854063674, + "mean_token_accuracy": 0.9009370803833008, + "num_tokens": 21485833.0, + "step": 13713, + "train/ce_loss": 0.23732690513134003 + }, + { + "epoch": 1.3558433854063674, + "step": 13713, + "train/sim_loss": 0.020706892013549805 + }, + { + "epoch": 1.3558433854063674, + "step": 13713, + "train/total_loss": 0.04443958401679993 + }, + { + "entropy": 9.596099853515625, + "epoch": 1.3559422582558829, + "mean_token_accuracy": 0.8961424231529236, + "num_tokens": 21500121.0, + "step": 13714, + "train/ce_loss": 0.294996440410614 + }, + { + "epoch": 1.3559422582558829, + "step": 13714, + "train/sim_loss": 0.020581960678100586 + }, + { + "epoch": 1.3559422582558829, + "step": 13714, + "train/total_loss": 0.05008160322904587 + }, + { + "entropy": 9.208581924438477, + "epoch": 1.3560411311053984, + "mean_token_accuracy": 0.8562434315681458, + "num_tokens": 21512711.0, + "step": 13715, + "train/ce_loss": 0.44226470589637756 + }, + { + "epoch": 1.3560411311053984, + "step": 13715, + "train/sim_loss": 0.012945175170898438 + }, + { + "epoch": 1.3560411311053984, + "step": 13715, + "train/total_loss": 0.057171646505594254 + }, + { + "entropy": 9.735888481140137, + "epoch": 1.356140003954914, + "mean_token_accuracy": 0.8375394344329834, + "num_tokens": 21533223.0, + "step": 13716, + "train/ce_loss": 0.544191300868988 + }, + { + "epoch": 1.356140003954914, + "step": 13716, + "train/sim_loss": 0.022939562797546387 + }, + { + "epoch": 1.356140003954914, + "step": 13716, + "train/total_loss": 0.07735869288444519 + }, + { + "entropy": 9.719367027282715, + "epoch": 1.3562388768044296, + "mean_token_accuracy": 0.8491947054862976, + "num_tokens": 21546021.0, + "step": 13717, + "train/ce_loss": 0.6548551917076111 + }, + { + "epoch": 1.3562388768044296, + "step": 13717, + "train/sim_loss": 0.06005251407623291 + }, + { + "epoch": 1.3562388768044296, + "step": 13717, + "train/total_loss": 0.12553803622722626 + }, + { + "entropy": 8.95343017578125, + "epoch": 1.356337749653945, + "mean_token_accuracy": 0.842424213886261, + "num_tokens": 21554574.0, + "step": 13718, + "train/ce_loss": 0.6780514121055603 + }, + { + "epoch": 1.356337749653945, + "step": 13718, + "train/sim_loss": 0.04437756538391113 + }, + { + "epoch": 1.356337749653945, + "step": 13718, + "train/total_loss": 0.11218270659446716 + }, + { + "entropy": 9.142863273620605, + "epoch": 1.3564366225034605, + "mean_token_accuracy": 0.8671532869338989, + "num_tokens": 21569782.0, + "step": 13719, + "train/ce_loss": 0.5245649814605713 + }, + { + "epoch": 1.3564366225034605, + "step": 13719, + "train/sim_loss": 0.012881934642791748 + }, + { + "epoch": 1.3564366225034605, + "step": 13719, + "train/total_loss": 0.06533843278884888 + }, + { + "epoch": 1.356535495352976, + "grad_norm": 0.5584450364112854, + "learning_rate": 6.610542451663947e-06, + "loss": 0.083, + "step": 13720 + }, + { + "entropy": 9.506569862365723, + "epoch": 1.356535495352976, + "mean_token_accuracy": 0.8161865472793579, + "num_tokens": 21584406.0, + "step": 13720, + "train/ce_loss": 0.3363630175590515 + }, + { + "epoch": 1.356535495352976, + "step": 13720, + "train/sim_loss": 0.039182186126708984 + }, + { + "epoch": 1.356535495352976, + "step": 13720, + "train/total_loss": 0.07281848788261414 + }, + { + "entropy": 9.385215759277344, + "epoch": 1.3566343682024917, + "mean_token_accuracy": 0.844362735748291, + "num_tokens": 21597704.0, + "step": 13721, + "train/ce_loss": 0.26551562547683716 + }, + { + "epoch": 1.3566343682024917, + "step": 13721, + "train/sim_loss": 0.030759453773498535 + }, + { + "epoch": 1.3566343682024917, + "step": 13721, + "train/total_loss": 0.05731101706624031 + }, + { + "entropy": 9.368226051330566, + "epoch": 1.356733241052007, + "mean_token_accuracy": 0.7856257557868958, + "num_tokens": 21610012.0, + "step": 13722, + "train/ce_loss": 0.7172117829322815 + }, + { + "epoch": 1.356733241052007, + "step": 13722, + "train/sim_loss": 0.08666861057281494 + }, + { + "epoch": 1.356733241052007, + "step": 13722, + "train/total_loss": 0.15838979184627533 + }, + { + "entropy": 9.22604751586914, + "epoch": 1.3568321139015227, + "mean_token_accuracy": 0.8425480723381042, + "num_tokens": 21622312.0, + "step": 13723, + "train/ce_loss": 0.6751191020011902 + }, + { + "epoch": 1.3568321139015227, + "step": 13723, + "train/sim_loss": 0.04663431644439697 + }, + { + "epoch": 1.3568321139015227, + "step": 13723, + "train/total_loss": 0.11414622515439987 + }, + { + "entropy": 9.278923988342285, + "epoch": 1.3569309867510382, + "mean_token_accuracy": 0.849056601524353, + "num_tokens": 21635520.0, + "step": 13724, + "train/ce_loss": 0.5171721577644348 + }, + { + "epoch": 1.3569309867510382, + "step": 13724, + "train/sim_loss": 0.09123289585113525 + }, + { + "epoch": 1.3569309867510382, + "step": 13724, + "train/total_loss": 0.1429501175880432 + }, + { + "entropy": 9.39232063293457, + "epoch": 1.3570298596005537, + "mean_token_accuracy": 0.8677685856819153, + "num_tokens": 21649819.0, + "step": 13725, + "train/ce_loss": 0.5698904991149902 + }, + { + "epoch": 1.3570298596005537, + "step": 13725, + "train/sim_loss": 0.07716536521911621 + }, + { + "epoch": 1.3570298596005537, + "step": 13725, + "train/total_loss": 0.13415440917015076 + }, + { + "entropy": 9.330062866210938, + "epoch": 1.3571287324500692, + "mean_token_accuracy": 0.8401534557342529, + "num_tokens": 21658057.0, + "step": 13726, + "train/ce_loss": 0.45410269498825073 + }, + { + "epoch": 1.3571287324500692, + "step": 13726, + "train/sim_loss": 0.035578250885009766 + }, + { + "epoch": 1.3571287324500692, + "step": 13726, + "train/total_loss": 0.08098852634429932 + }, + { + "entropy": 9.598749160766602, + "epoch": 1.3572276052995846, + "mean_token_accuracy": 0.8447368144989014, + "num_tokens": 21665728.0, + "step": 13727, + "train/ce_loss": 0.3600887358188629 + }, + { + "epoch": 1.3572276052995846, + "step": 13727, + "train/sim_loss": 0.09355688095092773 + }, + { + "epoch": 1.3572276052995846, + "step": 13727, + "train/total_loss": 0.1295657604932785 + }, + { + "entropy": 9.815214157104492, + "epoch": 1.3573264781491003, + "mean_token_accuracy": 0.7874817848205566, + "num_tokens": 21678419.0, + "step": 13728, + "train/ce_loss": 1.1066559553146362 + }, + { + "epoch": 1.3573264781491003, + "step": 13728, + "train/sim_loss": 0.09902441501617432 + }, + { + "epoch": 1.3573264781491003, + "step": 13728, + "train/total_loss": 0.20969000458717346 + }, + { + "entropy": 9.287603378295898, + "epoch": 1.3574253509986158, + "mean_token_accuracy": 0.8450854420661926, + "num_tokens": 21693978.0, + "step": 13729, + "train/ce_loss": 0.7392051219940186 + }, + { + "epoch": 1.3574253509986158, + "step": 13729, + "train/sim_loss": 0.09325677156448364 + }, + { + "epoch": 1.3574253509986158, + "step": 13729, + "train/total_loss": 0.16717728972434998 + }, + { + "entropy": 9.847122192382812, + "epoch": 1.3575242238481313, + "mean_token_accuracy": 0.8353413939476013, + "num_tokens": 21704085.0, + "step": 13730, + "train/ce_loss": 0.40673473477363586 + }, + { + "epoch": 1.3575242238481313, + "step": 13730, + "train/sim_loss": 0.06107461452484131 + }, + { + "epoch": 1.3575242238481313, + "step": 13730, + "train/total_loss": 0.10174809396266937 + }, + { + "entropy": 8.93281364440918, + "epoch": 1.3576230966976468, + "mean_token_accuracy": 0.8054162263870239, + "num_tokens": 21714207.0, + "step": 13731, + "train/ce_loss": 0.19387811422348022 + }, + { + "epoch": 1.3576230966976468, + "step": 13731, + "train/sim_loss": 0.014207243919372559 + }, + { + "epoch": 1.3576230966976468, + "step": 13731, + "train/total_loss": 0.03359505534172058 + }, + { + "entropy": 8.428970336914062, + "epoch": 1.3577219695471623, + "mean_token_accuracy": 0.8509052395820618, + "num_tokens": 21719361.0, + "step": 13732, + "train/ce_loss": 0.3665456771850586 + }, + { + "epoch": 1.3577219695471623, + "step": 13732, + "train/sim_loss": 0.015523910522460938 + }, + { + "epoch": 1.3577219695471623, + "step": 13732, + "train/total_loss": 0.052178479731082916 + }, + { + "entropy": 9.257729530334473, + "epoch": 1.357820842396678, + "mean_token_accuracy": 0.849943995475769, + "num_tokens": 21726789.0, + "step": 13733, + "train/ce_loss": 0.6288134455680847 + }, + { + "epoch": 1.357820842396678, + "step": 13733, + "train/sim_loss": 0.03122687339782715 + }, + { + "epoch": 1.357820842396678, + "step": 13733, + "train/total_loss": 0.0941082164645195 + }, + { + "entropy": 9.537420272827148, + "epoch": 1.3579197152461933, + "mean_token_accuracy": 0.8435114622116089, + "num_tokens": 21742778.0, + "step": 13734, + "train/ce_loss": 0.6128895878791809 + }, + { + "epoch": 1.3579197152461933, + "step": 13734, + "train/sim_loss": 0.03668332099914551 + }, + { + "epoch": 1.3579197152461933, + "step": 13734, + "train/total_loss": 0.09797228127717972 + }, + { + "entropy": 9.535186767578125, + "epoch": 1.358018588095709, + "mean_token_accuracy": 0.8047618865966797, + "num_tokens": 21759325.0, + "step": 13735, + "train/ce_loss": 0.39758390188217163 + }, + { + "epoch": 1.358018588095709, + "step": 13735, + "train/sim_loss": 0.07508194446563721 + }, + { + "epoch": 1.358018588095709, + "step": 13735, + "train/total_loss": 0.11484033614397049 + }, + { + "entropy": 10.066858291625977, + "epoch": 1.3581174609452245, + "mean_token_accuracy": 0.843478262424469, + "num_tokens": 21770592.0, + "step": 13736, + "train/ce_loss": 0.8727384209632874 + }, + { + "epoch": 1.3581174609452245, + "step": 13736, + "train/sim_loss": 0.049549400806427 + }, + { + "epoch": 1.3581174609452245, + "step": 13736, + "train/total_loss": 0.13682323694229126 + }, + { + "entropy": 9.72442626953125, + "epoch": 1.35821633379474, + "mean_token_accuracy": 0.794949471950531, + "num_tokens": 21784313.0, + "step": 13737, + "train/ce_loss": 0.5856649875640869 + }, + { + "epoch": 1.35821633379474, + "step": 13737, + "train/sim_loss": 0.0428696870803833 + }, + { + "epoch": 1.35821633379474, + "step": 13737, + "train/total_loss": 0.10143618285655975 + }, + { + "entropy": 9.613783836364746, + "epoch": 1.3583152066442554, + "mean_token_accuracy": 0.8316953182220459, + "num_tokens": 21797546.0, + "step": 13738, + "train/ce_loss": 0.44312161207199097 + }, + { + "epoch": 1.3583152066442554, + "step": 13738, + "train/sim_loss": 0.050975680351257324 + }, + { + "epoch": 1.3583152066442554, + "step": 13738, + "train/total_loss": 0.09528784453868866 + }, + { + "entropy": 9.132417678833008, + "epoch": 1.358414079493771, + "mean_token_accuracy": 0.8368663787841797, + "num_tokens": 21808159.0, + "step": 13739, + "train/ce_loss": 0.9006848335266113 + }, + { + "epoch": 1.358414079493771, + "step": 13739, + "train/sim_loss": 0.059739530086517334 + }, + { + "epoch": 1.358414079493771, + "step": 13739, + "train/total_loss": 0.14980801939964294 + }, + { + "epoch": 1.3585129523432866, + "grad_norm": 0.5144840478897095, + "learning_rate": 6.6055975869059986e-06, + "loss": 0.0921, + "step": 13740 + }, + { + "entropy": 9.318131446838379, + "epoch": 1.3585129523432866, + "mean_token_accuracy": 0.8277153372764587, + "num_tokens": 21815399.0, + "step": 13740, + "train/ce_loss": 0.4838864803314209 + }, + { + "epoch": 1.3585129523432866, + "step": 13740, + "train/sim_loss": 0.04403948783874512 + }, + { + "epoch": 1.3585129523432866, + "step": 13740, + "train/total_loss": 0.09242813289165497 + }, + { + "entropy": 9.427881240844727, + "epoch": 1.358611825192802, + "mean_token_accuracy": 0.8169934749603271, + "num_tokens": 21828876.0, + "step": 13741, + "train/ce_loss": 0.1636670082807541 + }, + { + "epoch": 1.358611825192802, + "step": 13741, + "train/sim_loss": 0.06764793395996094 + }, + { + "epoch": 1.358611825192802, + "step": 13741, + "train/total_loss": 0.0840146392583847 + }, + { + "entropy": 9.18632698059082, + "epoch": 1.3587106980423176, + "mean_token_accuracy": 0.8660022020339966, + "num_tokens": 21837751.0, + "step": 13742, + "train/ce_loss": 0.3551768958568573 + }, + { + "epoch": 1.3587106980423176, + "step": 13742, + "train/sim_loss": 0.051291704177856445 + }, + { + "epoch": 1.3587106980423176, + "step": 13742, + "train/total_loss": 0.08680939674377441 + }, + { + "entropy": 9.165215492248535, + "epoch": 1.358809570891833, + "mean_token_accuracy": 0.893410861492157, + "num_tokens": 21851010.0, + "step": 13743, + "train/ce_loss": 0.23093701899051666 + }, + { + "epoch": 1.358809570891833, + "step": 13743, + "train/sim_loss": 0.022817671298980713 + }, + { + "epoch": 1.358809570891833, + "step": 13743, + "train/total_loss": 0.04591137170791626 + }, + { + "entropy": 9.799308776855469, + "epoch": 1.3589084437413486, + "mean_token_accuracy": 0.8176637887954712, + "num_tokens": 21858993.0, + "step": 13744, + "train/ce_loss": 0.6270349621772766 + }, + { + "epoch": 1.3589084437413486, + "step": 13744, + "train/sim_loss": 0.04397517442703247 + }, + { + "epoch": 1.3589084437413486, + "step": 13744, + "train/total_loss": 0.10667867213487625 + }, + { + "entropy": 9.702397346496582, + "epoch": 1.3590073165908643, + "mean_token_accuracy": 0.8784860372543335, + "num_tokens": 21871394.0, + "step": 13745, + "train/ce_loss": 6.886484698043205e-07 + }, + { + "epoch": 1.3590073165908643, + "step": 13745, + "train/sim_loss": 0.02575218677520752 + }, + { + "epoch": 1.3590073165908643, + "step": 13745, + "train/total_loss": 0.02575225569307804 + }, + { + "entropy": 9.745864868164062, + "epoch": 1.3591061894403798, + "mean_token_accuracy": 0.8721804618835449, + "num_tokens": 21887580.0, + "step": 13746, + "train/ce_loss": 3.3700848689477425e-07 + }, + { + "epoch": 1.3591061894403798, + "step": 13746, + "train/sim_loss": 0.02713751792907715 + }, + { + "epoch": 1.3591061894403798, + "step": 13746, + "train/total_loss": 0.027137551456689835 + }, + { + "entropy": 9.509721755981445, + "epoch": 1.3592050622898952, + "mean_token_accuracy": 0.8097208142280579, + "num_tokens": 21902447.0, + "step": 13747, + "train/ce_loss": 0.9047937989234924 + }, + { + "epoch": 1.3592050622898952, + "step": 13747, + "train/sim_loss": 0.08272767066955566 + }, + { + "epoch": 1.3592050622898952, + "step": 13747, + "train/total_loss": 0.17320704460144043 + }, + { + "entropy": 9.955342292785645, + "epoch": 1.3593039351394107, + "mean_token_accuracy": 0.9046321511268616, + "num_tokens": 21909794.0, + "step": 13748, + "train/ce_loss": 0.36131057143211365 + }, + { + "epoch": 1.3593039351394107, + "step": 13748, + "train/sim_loss": 0.04520845413208008 + }, + { + "epoch": 1.3593039351394107, + "step": 13748, + "train/total_loss": 0.0813395082950592 + }, + { + "entropy": 9.488768577575684, + "epoch": 1.3594028079889262, + "mean_token_accuracy": 0.8511087894439697, + "num_tokens": 21926350.0, + "step": 13749, + "train/ce_loss": 0.4631899297237396 + }, + { + "epoch": 1.3594028079889262, + "step": 13749, + "train/sim_loss": 0.020753324031829834 + }, + { + "epoch": 1.3594028079889262, + "step": 13749, + "train/total_loss": 0.0670723170042038 + }, + { + "entropy": 9.252395629882812, + "epoch": 1.3595016808384417, + "mean_token_accuracy": 0.8278145790100098, + "num_tokens": 21937892.0, + "step": 13750, + "train/ce_loss": 0.3636316955089569 + }, + { + "epoch": 1.3595016808384417, + "step": 13750, + "train/sim_loss": 0.04173409938812256 + }, + { + "epoch": 1.3595016808384417, + "step": 13750, + "train/total_loss": 0.07809726893901825 + }, + { + "entropy": 9.457666397094727, + "epoch": 1.3596005536879572, + "mean_token_accuracy": 0.819753110408783, + "num_tokens": 21947641.0, + "step": 13751, + "train/ce_loss": 0.15119598805904388 + }, + { + "epoch": 1.3596005536879572, + "step": 13751, + "train/sim_loss": 0.01574563980102539 + }, + { + "epoch": 1.3596005536879572, + "step": 13751, + "train/total_loss": 0.03086523897945881 + }, + { + "entropy": 9.814287185668945, + "epoch": 1.3596994265374729, + "mean_token_accuracy": 0.843137264251709, + "num_tokens": 21965263.0, + "step": 13752, + "train/ce_loss": 0.43915584683418274 + }, + { + "epoch": 1.3596994265374729, + "step": 13752, + "train/sim_loss": 0.03240227699279785 + }, + { + "epoch": 1.3596994265374729, + "step": 13752, + "train/total_loss": 0.07631786167621613 + }, + { + "entropy": 9.627860069274902, + "epoch": 1.3597982993869884, + "mean_token_accuracy": 0.9442307949066162, + "num_tokens": 21973983.0, + "step": 13753, + "train/ce_loss": 9.022940616887354e-07 + }, + { + "epoch": 1.3597982993869884, + "step": 13753, + "train/sim_loss": 0.028763234615325928 + }, + { + "epoch": 1.3597982993869884, + "step": 13753, + "train/total_loss": 0.02876332402229309 + }, + { + "entropy": 10.320704460144043, + "epoch": 1.3598971722365039, + "mean_token_accuracy": 0.931506872177124, + "num_tokens": 21983375.0, + "step": 13754, + "train/ce_loss": 0.9198481440544128 + }, + { + "epoch": 1.3598971722365039, + "step": 13754, + "train/sim_loss": 0.043970346450805664 + }, + { + "epoch": 1.3598971722365039, + "step": 13754, + "train/total_loss": 0.13595515489578247 + }, + { + "entropy": 9.816895484924316, + "epoch": 1.3599960450860193, + "mean_token_accuracy": 0.8704512119293213, + "num_tokens": 21999562.0, + "step": 13755, + "train/ce_loss": 9.654304449213669e-07 + }, + { + "epoch": 1.3599960450860193, + "step": 13755, + "train/sim_loss": 0.0383983850479126 + }, + { + "epoch": 1.3599960450860193, + "step": 13755, + "train/total_loss": 0.03839848190546036 + }, + { + "entropy": 9.419590950012207, + "epoch": 1.3600949179355348, + "mean_token_accuracy": 0.8966202735900879, + "num_tokens": 22011610.0, + "step": 13756, + "train/ce_loss": 0.7946957945823669 + }, + { + "epoch": 1.3600949179355348, + "step": 13756, + "train/sim_loss": 0.00991898775100708 + }, + { + "epoch": 1.3600949179355348, + "step": 13756, + "train/total_loss": 0.08938857167959213 + }, + { + "entropy": 9.436821937561035, + "epoch": 1.3601937907850505, + "mean_token_accuracy": 0.9071428775787354, + "num_tokens": 22022063.0, + "step": 13757, + "train/ce_loss": 0.2929241359233856 + }, + { + "epoch": 1.3601937907850505, + "step": 13757, + "train/sim_loss": 0.06512469053268433 + }, + { + "epoch": 1.3601937907850505, + "step": 13757, + "train/total_loss": 0.09441710263490677 + }, + { + "entropy": 9.639632225036621, + "epoch": 1.360292663634566, + "mean_token_accuracy": 0.8406374454498291, + "num_tokens": 22040525.0, + "step": 13758, + "train/ce_loss": 2.026383185693703e-07 + }, + { + "epoch": 1.360292663634566, + "step": 13758, + "train/sim_loss": 0.03166705369949341 + }, + { + "epoch": 1.360292663634566, + "step": 13758, + "train/total_loss": 0.0316670723259449 + }, + { + "entropy": 9.46492862701416, + "epoch": 1.3603915364840815, + "mean_token_accuracy": 0.8357616066932678, + "num_tokens": 22050594.0, + "step": 13759, + "train/ce_loss": 0.29218509793281555 + }, + { + "epoch": 1.3603915364840815, + "step": 13759, + "train/sim_loss": 0.05460190773010254 + }, + { + "epoch": 1.3603915364840815, + "step": 13759, + "train/total_loss": 0.0838204175233841 + }, + { + "epoch": 1.360490409333597, + "grad_norm": 0.6090583205223083, + "learning_rate": 6.60065272214805e-06, + "loss": 0.0811, + "step": 13760 + }, + { + "entropy": 9.507867813110352, + "epoch": 1.360490409333597, + "mean_token_accuracy": 0.8434547781944275, + "num_tokens": 22063645.0, + "step": 13760, + "train/ce_loss": 0.5449047088623047 + }, + { + "epoch": 1.360490409333597, + "step": 13760, + "train/sim_loss": 0.03253781795501709 + }, + { + "epoch": 1.360490409333597, + "step": 13760, + "train/total_loss": 0.08702829480171204 + }, + { + "entropy": 9.504903793334961, + "epoch": 1.3605892821831125, + "mean_token_accuracy": 0.837520956993103, + "num_tokens": 22077380.0, + "step": 13761, + "train/ce_loss": 2.5459173684794223e-06 + }, + { + "epoch": 1.3605892821831125, + "step": 13761, + "train/sim_loss": 0.030605077743530273 + }, + { + "epoch": 1.3605892821831125, + "step": 13761, + "train/total_loss": 0.030605332925915718 + }, + { + "entropy": 8.888824462890625, + "epoch": 1.360688155032628, + "mean_token_accuracy": 0.8658959269523621, + "num_tokens": 22087339.0, + "step": 13762, + "train/ce_loss": 1.130559439843637e-06 + }, + { + "epoch": 1.360688155032628, + "step": 13762, + "train/sim_loss": 0.03938126564025879 + }, + { + "epoch": 1.360688155032628, + "step": 13762, + "train/total_loss": 0.03938137739896774 + }, + { + "entropy": 9.128804206848145, + "epoch": 1.3607870278821435, + "mean_token_accuracy": 0.8780487775802612, + "num_tokens": 22095059.0, + "step": 13763, + "train/ce_loss": 0.540686845779419 + }, + { + "epoch": 1.3607870278821435, + "step": 13763, + "train/sim_loss": 0.02905738353729248 + }, + { + "epoch": 1.3607870278821435, + "step": 13763, + "train/total_loss": 0.08312606811523438 + }, + { + "entropy": 9.620512008666992, + "epoch": 1.3608859007316592, + "mean_token_accuracy": 0.8685259222984314, + "num_tokens": 22111318.0, + "step": 13764, + "train/ce_loss": 0.2170589566230774 + }, + { + "epoch": 1.3608859007316592, + "step": 13764, + "train/sim_loss": 0.021599531173706055 + }, + { + "epoch": 1.3608859007316592, + "step": 13764, + "train/total_loss": 0.043305426836013794 + }, + { + "entropy": 9.798940658569336, + "epoch": 1.3609847735811746, + "mean_token_accuracy": 0.8665568232536316, + "num_tokens": 22126543.0, + "step": 13765, + "train/ce_loss": 0.6541737914085388 + }, + { + "epoch": 1.3609847735811746, + "step": 13765, + "train/sim_loss": 0.05143725872039795 + }, + { + "epoch": 1.3609847735811746, + "step": 13765, + "train/total_loss": 0.11685463786125183 + }, + { + "entropy": 9.325217247009277, + "epoch": 1.3610836464306901, + "mean_token_accuracy": 0.8617143034934998, + "num_tokens": 22145891.0, + "step": 13766, + "train/ce_loss": 0.4651808440685272 + }, + { + "epoch": 1.3610836464306901, + "step": 13766, + "train/sim_loss": 0.01902461051940918 + }, + { + "epoch": 1.3610836464306901, + "step": 13766, + "train/total_loss": 0.06554269790649414 + }, + { + "entropy": 9.543397903442383, + "epoch": 1.3611825192802056, + "mean_token_accuracy": 0.8253557682037354, + "num_tokens": 22155526.0, + "step": 13767, + "train/ce_loss": 0.6952235102653503 + }, + { + "epoch": 1.3611825192802056, + "step": 13767, + "train/sim_loss": 0.05067867040634155 + }, + { + "epoch": 1.3611825192802056, + "step": 13767, + "train/total_loss": 0.12020102143287659 + }, + { + "entropy": 9.318105697631836, + "epoch": 1.361281392129721, + "mean_token_accuracy": 0.7857142686843872, + "num_tokens": 22162947.0, + "step": 13768, + "train/ce_loss": 0.5866307616233826 + }, + { + "epoch": 1.361281392129721, + "step": 13768, + "train/sim_loss": 0.044571518898010254 + }, + { + "epoch": 1.361281392129721, + "step": 13768, + "train/total_loss": 0.10323459655046463 + }, + { + "entropy": 9.561079978942871, + "epoch": 1.3613802649792368, + "mean_token_accuracy": 0.8120300769805908, + "num_tokens": 22176783.0, + "step": 13769, + "train/ce_loss": 0.7409642934799194 + }, + { + "epoch": 1.3613802649792368, + "step": 13769, + "train/sim_loss": 0.05173540115356445 + }, + { + "epoch": 1.3613802649792368, + "step": 13769, + "train/total_loss": 0.12583184242248535 + }, + { + "entropy": 9.289384841918945, + "epoch": 1.3614791378287523, + "mean_token_accuracy": 0.8877995610237122, + "num_tokens": 22189974.0, + "step": 13770, + "train/ce_loss": 0.5473302602767944 + }, + { + "epoch": 1.3614791378287523, + "step": 13770, + "train/sim_loss": 0.04575395584106445 + }, + { + "epoch": 1.3614791378287523, + "step": 13770, + "train/total_loss": 0.10048697888851166 + }, + { + "entropy": 9.054268836975098, + "epoch": 1.3615780106782678, + "mean_token_accuracy": 0.8464328646659851, + "num_tokens": 22201480.0, + "step": 13771, + "train/ce_loss": 0.23982346057891846 + }, + { + "epoch": 1.3615780106782678, + "step": 13771, + "train/sim_loss": 0.029253065586090088 + }, + { + "epoch": 1.3615780106782678, + "step": 13771, + "train/total_loss": 0.053235411643981934 + }, + { + "entropy": 9.636343002319336, + "epoch": 1.3616768835277833, + "mean_token_accuracy": 0.846905529499054, + "num_tokens": 22212011.0, + "step": 13772, + "train/ce_loss": 0.46790215373039246 + }, + { + "epoch": 1.3616768835277833, + "step": 13772, + "train/sim_loss": 0.09198999404907227 + }, + { + "epoch": 1.3616768835277833, + "step": 13772, + "train/total_loss": 0.13878020644187927 + }, + { + "entropy": 9.22850513458252, + "epoch": 1.3617757563772988, + "mean_token_accuracy": 0.8795698881149292, + "num_tokens": 22226999.0, + "step": 13773, + "train/ce_loss": 0.4019180238246918 + }, + { + "epoch": 1.3617757563772988, + "step": 13773, + "train/sim_loss": 0.016980528831481934 + }, + { + "epoch": 1.3617757563772988, + "step": 13773, + "train/total_loss": 0.05717233195900917 + }, + { + "entropy": 9.525383949279785, + "epoch": 1.3618746292268142, + "mean_token_accuracy": 0.8571428656578064, + "num_tokens": 22245112.0, + "step": 13774, + "train/ce_loss": 0.5558558702468872 + }, + { + "epoch": 1.3618746292268142, + "step": 13774, + "train/sim_loss": 0.06919050216674805 + }, + { + "epoch": 1.3618746292268142, + "step": 13774, + "train/total_loss": 0.12477609515190125 + }, + { + "entropy": 9.80687427520752, + "epoch": 1.3619735020763297, + "mean_token_accuracy": 0.8426666855812073, + "num_tokens": 22261276.0, + "step": 13775, + "train/ce_loss": 0.4626316428184509 + }, + { + "epoch": 1.3619735020763297, + "step": 13775, + "train/sim_loss": 0.05873727798461914 + }, + { + "epoch": 1.3619735020763297, + "step": 13775, + "train/total_loss": 0.10500044375658035 + }, + { + "entropy": 9.666775703430176, + "epoch": 1.3620723749258454, + "mean_token_accuracy": 0.8627907037734985, + "num_tokens": 22279456.0, + "step": 13776, + "train/ce_loss": 0.2729234993457794 + }, + { + "epoch": 1.3620723749258454, + "step": 13776, + "train/sim_loss": 0.014458119869232178 + }, + { + "epoch": 1.3620723749258454, + "step": 13776, + "train/total_loss": 0.041750468313694 + }, + { + "entropy": 9.637613296508789, + "epoch": 1.362171247775361, + "mean_token_accuracy": 0.8621907830238342, + "num_tokens": 22291842.0, + "step": 13777, + "train/ce_loss": 3.7573590816464275e-07 + }, + { + "epoch": 1.362171247775361, + "step": 13777, + "train/sim_loss": 0.04910087585449219 + }, + { + "epoch": 1.362171247775361, + "step": 13777, + "train/total_loss": 0.04910091310739517 + }, + { + "entropy": 9.316720962524414, + "epoch": 1.3622701206248764, + "mean_token_accuracy": 0.7687074542045593, + "num_tokens": 22301477.0, + "step": 13778, + "train/ce_loss": 0.5648329854011536 + }, + { + "epoch": 1.3622701206248764, + "step": 13778, + "train/sim_loss": 0.044412434101104736 + }, + { + "epoch": 1.3622701206248764, + "step": 13778, + "train/total_loss": 0.10089573264122009 + }, + { + "entropy": 9.73898983001709, + "epoch": 1.3623689934743919, + "mean_token_accuracy": 0.8494423627853394, + "num_tokens": 22309617.0, + "step": 13779, + "train/ce_loss": 8.853928079588513e-07 + }, + { + "epoch": 1.3623689934743919, + "step": 13779, + "train/sim_loss": 0.029130220413208008 + }, + { + "epoch": 1.3623689934743919, + "step": 13779, + "train/total_loss": 0.02913030982017517 + }, + { + "epoch": 1.3624678663239074, + "grad_norm": 0.603261411190033, + "learning_rate": 6.595707857390101e-06, + "loss": 0.0875, + "step": 13780 + }, + { + "entropy": 9.58203125, + "epoch": 1.3624678663239074, + "mean_token_accuracy": 0.913294792175293, + "num_tokens": 22325940.0, + "step": 13780, + "train/ce_loss": 1.0032551926997257e-06 + }, + { + "epoch": 1.3624678663239074, + "step": 13780, + "train/sim_loss": 0.048482656478881836 + }, + { + "epoch": 1.3624678663239074, + "step": 13780, + "train/total_loss": 0.048482757061719894 + }, + { + "entropy": 9.183082580566406, + "epoch": 1.362566739173423, + "mean_token_accuracy": 0.8667563796043396, + "num_tokens": 22340829.0, + "step": 13781, + "train/ce_loss": 5.211015832173871e-07 + }, + { + "epoch": 1.362566739173423, + "step": 13781, + "train/sim_loss": 0.062201738357543945 + }, + { + "epoch": 1.362566739173423, + "step": 13781, + "train/total_loss": 0.062201790511608124 + }, + { + "entropy": 9.374377250671387, + "epoch": 1.3626656120229386, + "mean_token_accuracy": 0.8112000226974487, + "num_tokens": 22353717.0, + "step": 13782, + "train/ce_loss": 0.4230117201805115 + }, + { + "epoch": 1.3626656120229386, + "step": 13782, + "train/sim_loss": 0.04367411136627197 + }, + { + "epoch": 1.3626656120229386, + "step": 13782, + "train/total_loss": 0.0859752893447876 + }, + { + "entropy": 8.806699752807617, + "epoch": 1.362764484872454, + "mean_token_accuracy": 0.8436853289604187, + "num_tokens": 22361112.0, + "step": 13783, + "train/ce_loss": 0.44216111302375793 + }, + { + "epoch": 1.362764484872454, + "step": 13783, + "train/sim_loss": 0.009497880935668945 + }, + { + "epoch": 1.362764484872454, + "step": 13783, + "train/total_loss": 0.05371399223804474 + }, + { + "entropy": 9.399908065795898, + "epoch": 1.3628633577219695, + "mean_token_accuracy": 0.8557142615318298, + "num_tokens": 22375874.0, + "step": 13784, + "train/ce_loss": 0.5909030437469482 + }, + { + "epoch": 1.3628633577219695, + "step": 13784, + "train/sim_loss": 0.025271892547607422 + }, + { + "epoch": 1.3628633577219695, + "step": 13784, + "train/total_loss": 0.08436219394207001 + }, + { + "entropy": 9.761590003967285, + "epoch": 1.362962230571485, + "mean_token_accuracy": 0.8425584435462952, + "num_tokens": 22385906.0, + "step": 13785, + "train/ce_loss": 0.48448970913887024 + }, + { + "epoch": 1.362962230571485, + "step": 13785, + "train/sim_loss": 0.043299078941345215 + }, + { + "epoch": 1.362962230571485, + "step": 13785, + "train/total_loss": 0.09174805134534836 + }, + { + "entropy": 9.411831855773926, + "epoch": 1.3630611034210007, + "mean_token_accuracy": 0.8635810017585754, + "num_tokens": 22399717.0, + "step": 13786, + "train/ce_loss": 0.15826527774333954 + }, + { + "epoch": 1.3630611034210007, + "step": 13786, + "train/sim_loss": 0.057818055152893066 + }, + { + "epoch": 1.3630611034210007, + "step": 13786, + "train/total_loss": 0.07364458590745926 + }, + { + "entropy": 9.348626136779785, + "epoch": 1.363159976270516, + "mean_token_accuracy": 0.9185888767242432, + "num_tokens": 22409639.0, + "step": 13787, + "train/ce_loss": 0.39919111132621765 + }, + { + "epoch": 1.363159976270516, + "step": 13787, + "train/sim_loss": 0.10341942310333252 + }, + { + "epoch": 1.363159976270516, + "step": 13787, + "train/total_loss": 0.14333853125572205 + }, + { + "entropy": 9.650938987731934, + "epoch": 1.3632588491200317, + "mean_token_accuracy": 0.8594377636909485, + "num_tokens": 22422665.0, + "step": 13788, + "train/ce_loss": 0.16773928701877594 + }, + { + "epoch": 1.3632588491200317, + "step": 13788, + "train/sim_loss": 0.04828232526779175 + }, + { + "epoch": 1.3632588491200317, + "step": 13788, + "train/total_loss": 0.06505625694990158 + }, + { + "entropy": 9.399871826171875, + "epoch": 1.3633577219695472, + "mean_token_accuracy": 0.840665876865387, + "num_tokens": 22445352.0, + "step": 13789, + "train/ce_loss": 0.5938544273376465 + }, + { + "epoch": 1.3633577219695472, + "step": 13789, + "train/sim_loss": 0.05567371845245361 + }, + { + "epoch": 1.3633577219695472, + "step": 13789, + "train/total_loss": 0.11505916714668274 + }, + { + "entropy": 10.050697326660156, + "epoch": 1.3634565948190627, + "mean_token_accuracy": 0.8237287998199463, + "num_tokens": 22459618.0, + "step": 13790, + "train/ce_loss": 0.5311241149902344 + }, + { + "epoch": 1.3634565948190627, + "step": 13790, + "train/sim_loss": 0.04304957389831543 + }, + { + "epoch": 1.3634565948190627, + "step": 13790, + "train/total_loss": 0.09616199135780334 + }, + { + "entropy": 9.039783477783203, + "epoch": 1.3635554676685782, + "mean_token_accuracy": 0.8644444346427917, + "num_tokens": 22469673.0, + "step": 13791, + "train/ce_loss": 0.7265397310256958 + }, + { + "epoch": 1.3635554676685782, + "step": 13791, + "train/sim_loss": 0.04138600826263428 + }, + { + "epoch": 1.3635554676685782, + "step": 13791, + "train/total_loss": 0.11403997987508774 + }, + { + "entropy": 9.265087127685547, + "epoch": 1.3636543405180936, + "mean_token_accuracy": 0.8869779109954834, + "num_tokens": 22476886.0, + "step": 13792, + "train/ce_loss": 0.5423145890235901 + }, + { + "epoch": 1.3636543405180936, + "step": 13792, + "train/sim_loss": 0.0698699951171875 + }, + { + "epoch": 1.3636543405180936, + "step": 13792, + "train/total_loss": 0.12410145998001099 + }, + { + "entropy": 9.597753524780273, + "epoch": 1.3637532133676094, + "mean_token_accuracy": 0.836241602897644, + "num_tokens": 22492726.0, + "step": 13793, + "train/ce_loss": 0.2747665345668793 + }, + { + "epoch": 1.3637532133676094, + "step": 13793, + "train/sim_loss": 0.011340856552124023 + }, + { + "epoch": 1.3637532133676094, + "step": 13793, + "train/total_loss": 0.03881751000881195 + }, + { + "entropy": 9.214323043823242, + "epoch": 1.3638520862171248, + "mean_token_accuracy": 0.8056679964065552, + "num_tokens": 22503809.0, + "step": 13794, + "train/ce_loss": 0.3831706941127777 + }, + { + "epoch": 1.3638520862171248, + "step": 13794, + "train/sim_loss": 0.042816996574401855 + }, + { + "epoch": 1.3638520862171248, + "step": 13794, + "train/total_loss": 0.08113406598567963 + }, + { + "entropy": 9.329965591430664, + "epoch": 1.3639509590666403, + "mean_token_accuracy": 0.8658008575439453, + "num_tokens": 22517106.0, + "step": 13795, + "train/ce_loss": 0.7100604772567749 + }, + { + "epoch": 1.3639509590666403, + "step": 13795, + "train/sim_loss": 0.02104783058166504 + }, + { + "epoch": 1.3639509590666403, + "step": 13795, + "train/total_loss": 0.09205388277769089 + }, + { + "entropy": 9.34455680847168, + "epoch": 1.3640498319161558, + "mean_token_accuracy": 0.8619102239608765, + "num_tokens": 22528640.0, + "step": 13796, + "train/ce_loss": 0.6453511714935303 + }, + { + "epoch": 1.3640498319161558, + "step": 13796, + "train/sim_loss": 0.01763629913330078 + }, + { + "epoch": 1.3640498319161558, + "step": 13796, + "train/total_loss": 0.08217141777276993 + }, + { + "entropy": 9.319913864135742, + "epoch": 1.3641487047656713, + "mean_token_accuracy": 0.8393574357032776, + "num_tokens": 22537580.0, + "step": 13797, + "train/ce_loss": 0.4693279564380646 + }, + { + "epoch": 1.3641487047656713, + "step": 13797, + "train/sim_loss": 0.05976152420043945 + }, + { + "epoch": 1.3641487047656713, + "step": 13797, + "train/total_loss": 0.10669432580471039 + }, + { + "entropy": 9.039377212524414, + "epoch": 1.364247577615187, + "mean_token_accuracy": 0.8886389136314392, + "num_tokens": 22549620.0, + "step": 13798, + "train/ce_loss": 0.5141133069992065 + }, + { + "epoch": 1.364247577615187, + "step": 13798, + "train/sim_loss": 0.048925697803497314 + }, + { + "epoch": 1.364247577615187, + "step": 13798, + "train/total_loss": 0.10033702850341797 + }, + { + "entropy": 9.494223594665527, + "epoch": 1.3643464504647023, + "mean_token_accuracy": 0.8790322542190552, + "num_tokens": 22566774.0, + "step": 13799, + "train/ce_loss": 0.5808945894241333 + }, + { + "epoch": 1.3643464504647023, + "step": 13799, + "train/sim_loss": 0.03744858503341675 + }, + { + "epoch": 1.3643464504647023, + "step": 13799, + "train/total_loss": 0.09553804993629456 + }, + { + "epoch": 1.364445323314218, + "grad_norm": 0.6429176926612854, + "learning_rate": 6.590762992632153e-06, + "loss": 0.081, + "step": 13800 + }, + { + "entropy": 10.002534866333008, + "epoch": 1.364445323314218, + "mean_token_accuracy": 0.8695651888847351, + "num_tokens": 22578378.0, + "step": 13800, + "train/ce_loss": 0.6245404481887817 + }, + { + "epoch": 1.364445323314218, + "step": 13800, + "train/sim_loss": 0.08859765529632568 + }, + { + "epoch": 1.364445323314218, + "step": 13800, + "train/total_loss": 0.15105170011520386 + }, + { + "entropy": 9.438724517822266, + "epoch": 1.3645441961637335, + "mean_token_accuracy": 0.8979187607765198, + "num_tokens": 22595963.0, + "step": 13801, + "train/ce_loss": 0.27470311522483826 + }, + { + "epoch": 1.3645441961637335, + "step": 13801, + "train/sim_loss": 0.024792194366455078 + }, + { + "epoch": 1.3645441961637335, + "step": 13801, + "train/total_loss": 0.05226250737905502 + }, + { + "entropy": 8.79354476928711, + "epoch": 1.364643069013249, + "mean_token_accuracy": 0.8488593101501465, + "num_tokens": 22602907.0, + "step": 13802, + "train/ce_loss": 0.22997814416885376 + }, + { + "epoch": 1.364643069013249, + "step": 13802, + "train/sim_loss": 0.052928924560546875 + }, + { + "epoch": 1.364643069013249, + "step": 13802, + "train/total_loss": 0.07592673599720001 + }, + { + "entropy": 9.304563522338867, + "epoch": 1.3647419418627644, + "mean_token_accuracy": 0.8359941840171814, + "num_tokens": 22612845.0, + "step": 13803, + "train/ce_loss": 0.7079283595085144 + }, + { + "epoch": 1.3647419418627644, + "step": 13803, + "train/sim_loss": 0.11785954236984253 + }, + { + "epoch": 1.3647419418627644, + "step": 13803, + "train/total_loss": 0.1886523813009262 + }, + { + "entropy": 9.975330352783203, + "epoch": 1.36484081471228, + "mean_token_accuracy": 0.8355342149734497, + "num_tokens": 22628804.0, + "step": 13804, + "train/ce_loss": 0.5214459896087646 + }, + { + "epoch": 1.36484081471228, + "step": 13804, + "train/sim_loss": 0.055860936641693115 + }, + { + "epoch": 1.36484081471228, + "step": 13804, + "train/total_loss": 0.10800553858280182 + }, + { + "entropy": 9.360086441040039, + "epoch": 1.3649396875617956, + "mean_token_accuracy": 0.8514644503593445, + "num_tokens": 22638682.0, + "step": 13805, + "train/ce_loss": 0.4692053496837616 + }, + { + "epoch": 1.3649396875617956, + "step": 13805, + "train/sim_loss": 0.1064954400062561 + }, + { + "epoch": 1.3649396875617956, + "step": 13805, + "train/total_loss": 0.1534159779548645 + }, + { + "entropy": 8.908499717712402, + "epoch": 1.365038560411311, + "mean_token_accuracy": 0.8592848777770996, + "num_tokens": 22645416.0, + "step": 13806, + "train/ce_loss": 0.42634740471839905 + }, + { + "epoch": 1.365038560411311, + "step": 13806, + "train/sim_loss": 0.022678494453430176 + }, + { + "epoch": 1.365038560411311, + "step": 13806, + "train/total_loss": 0.06531323492527008 + }, + { + "entropy": 8.93994140625, + "epoch": 1.3651374332608266, + "mean_token_accuracy": 0.7755444049835205, + "num_tokens": 22656898.0, + "step": 13807, + "train/ce_loss": 0.28506770730018616 + }, + { + "epoch": 1.3651374332608266, + "step": 13807, + "train/sim_loss": 0.015801072120666504 + }, + { + "epoch": 1.3651374332608266, + "step": 13807, + "train/total_loss": 0.04430784285068512 + }, + { + "entropy": 9.646768569946289, + "epoch": 1.365236306110342, + "mean_token_accuracy": 0.8371501564979553, + "num_tokens": 22667358.0, + "step": 13808, + "train/ce_loss": 0.8057633638381958 + }, + { + "epoch": 1.365236306110342, + "step": 13808, + "train/sim_loss": 0.035383403301239014 + }, + { + "epoch": 1.365236306110342, + "step": 13808, + "train/total_loss": 0.11595974117517471 + }, + { + "entropy": 9.259330749511719, + "epoch": 1.3653351789598576, + "mean_token_accuracy": 0.9010238647460938, + "num_tokens": 22682558.0, + "step": 13809, + "train/ce_loss": 0.17765958607196808 + }, + { + "epoch": 1.3653351789598576, + "step": 13809, + "train/sim_loss": 0.016211748123168945 + }, + { + "epoch": 1.3653351789598576, + "step": 13809, + "train/total_loss": 0.03397770971059799 + }, + { + "entropy": 9.558192253112793, + "epoch": 1.3654340518093733, + "mean_token_accuracy": 0.8568904399871826, + "num_tokens": 22694010.0, + "step": 13810, + "train/ce_loss": 0.3196695148944855 + }, + { + "epoch": 1.3654340518093733, + "step": 13810, + "train/sim_loss": 0.0304223895072937 + }, + { + "epoch": 1.3654340518093733, + "step": 13810, + "train/total_loss": 0.06238934025168419 + }, + { + "entropy": 10.004034042358398, + "epoch": 1.3655329246588885, + "mean_token_accuracy": 0.8904429078102112, + "num_tokens": 22702769.0, + "step": 13811, + "train/ce_loss": 0.3175959289073944 + }, + { + "epoch": 1.3655329246588885, + "step": 13811, + "train/sim_loss": 0.07649952173233032 + }, + { + "epoch": 1.3655329246588885, + "step": 13811, + "train/total_loss": 0.10825911164283752 + }, + { + "entropy": 9.715599060058594, + "epoch": 1.3656317975084042, + "mean_token_accuracy": 0.8883720636367798, + "num_tokens": 22710989.0, + "step": 13812, + "train/ce_loss": 0.4905047118663788 + }, + { + "epoch": 1.3656317975084042, + "step": 13812, + "train/sim_loss": 0.03819084167480469 + }, + { + "epoch": 1.3656317975084042, + "step": 13812, + "train/total_loss": 0.08724131435155869 + }, + { + "entropy": 8.9828462600708, + "epoch": 1.3657306703579197, + "mean_token_accuracy": 0.8634204268455505, + "num_tokens": 22724109.0, + "step": 13813, + "train/ce_loss": 0.291395366191864 + }, + { + "epoch": 1.3657306703579197, + "step": 13813, + "train/sim_loss": 0.014636814594268799 + }, + { + "epoch": 1.3657306703579197, + "step": 13813, + "train/total_loss": 0.04377635195851326 + }, + { + "entropy": 9.392688751220703, + "epoch": 1.3658295432074352, + "mean_token_accuracy": 0.8074866533279419, + "num_tokens": 22735382.0, + "step": 13814, + "train/ce_loss": 0.7404588460922241 + }, + { + "epoch": 1.3658295432074352, + "step": 13814, + "train/sim_loss": 0.04902148246765137 + }, + { + "epoch": 1.3658295432074352, + "step": 13814, + "train/total_loss": 0.12306737154722214 + }, + { + "entropy": 9.595955848693848, + "epoch": 1.3659284160569507, + "mean_token_accuracy": 0.8359046578407288, + "num_tokens": 22752101.0, + "step": 13815, + "train/ce_loss": 0.582929253578186 + }, + { + "epoch": 1.3659284160569507, + "step": 13815, + "train/sim_loss": 0.016118228435516357 + }, + { + "epoch": 1.3659284160569507, + "step": 13815, + "train/total_loss": 0.07441115379333496 + }, + { + "entropy": 9.817838668823242, + "epoch": 1.3660272889064662, + "mean_token_accuracy": 0.8566493988037109, + "num_tokens": 22765713.0, + "step": 13816, + "train/ce_loss": 0.6447961330413818 + }, + { + "epoch": 1.3660272889064662, + "step": 13816, + "train/sim_loss": 0.04389607906341553 + }, + { + "epoch": 1.3660272889064662, + "step": 13816, + "train/total_loss": 0.10837569087743759 + }, + { + "entropy": 9.220440864562988, + "epoch": 1.366126161755982, + "mean_token_accuracy": 0.8738019466400146, + "num_tokens": 22782226.0, + "step": 13817, + "train/ce_loss": 0.32990583777427673 + }, + { + "epoch": 1.366126161755982, + "step": 13817, + "train/sim_loss": 0.04321587085723877 + }, + { + "epoch": 1.366126161755982, + "step": 13817, + "train/total_loss": 0.07620646059513092 + }, + { + "entropy": 9.497188568115234, + "epoch": 1.3662250346054974, + "mean_token_accuracy": 0.9191918969154358, + "num_tokens": 22792876.0, + "step": 13818, + "train/ce_loss": 0.5710245966911316 + }, + { + "epoch": 1.3662250346054974, + "step": 13818, + "train/sim_loss": 0.01718771457672119 + }, + { + "epoch": 1.3662250346054974, + "step": 13818, + "train/total_loss": 0.07429017126560211 + }, + { + "entropy": 9.31828498840332, + "epoch": 1.3663239074550129, + "mean_token_accuracy": 0.8834951519966125, + "num_tokens": 22806116.0, + "step": 13819, + "train/ce_loss": 0.15948434174060822 + }, + { + "epoch": 1.3663239074550129, + "step": 13819, + "train/sim_loss": 0.02209937572479248 + }, + { + "epoch": 1.3663239074550129, + "step": 13819, + "train/total_loss": 0.03804781287908554 + }, + { + "epoch": 1.3664227803045284, + "grad_norm": 0.4967081844806671, + "learning_rate": 6.585818127874203e-06, + "loss": 0.0807, + "step": 13820 + }, + { + "entropy": 8.853270530700684, + "epoch": 1.3664227803045284, + "mean_token_accuracy": 0.8371859192848206, + "num_tokens": 22819385.0, + "step": 13820, + "train/ce_loss": 0.47179949283599854 + }, + { + "epoch": 1.3664227803045284, + "step": 13820, + "train/sim_loss": 0.06973481178283691 + }, + { + "epoch": 1.3664227803045284, + "step": 13820, + "train/total_loss": 0.116914764046669 + }, + { + "entropy": 8.934610366821289, + "epoch": 1.3665216531540438, + "mean_token_accuracy": 0.814254879951477, + "num_tokens": 22830779.0, + "step": 13821, + "train/ce_loss": 0.4189877212047577 + }, + { + "epoch": 1.3665216531540438, + "step": 13821, + "train/sim_loss": 0.013635814189910889 + }, + { + "epoch": 1.3665216531540438, + "step": 13821, + "train/total_loss": 0.05553458631038666 + }, + { + "entropy": 9.704974174499512, + "epoch": 1.3666205260035595, + "mean_token_accuracy": 0.8293918967247009, + "num_tokens": 22841601.0, + "step": 13822, + "train/ce_loss": 0.4118225574493408 + }, + { + "epoch": 1.3666205260035595, + "step": 13822, + "train/sim_loss": 0.024481475353240967 + }, + { + "epoch": 1.3666205260035595, + "step": 13822, + "train/total_loss": 0.06566373258829117 + }, + { + "entropy": 8.9993896484375, + "epoch": 1.3667193988530748, + "mean_token_accuracy": 0.8629876375198364, + "num_tokens": 22851555.0, + "step": 13823, + "train/ce_loss": 0.48661747574806213 + }, + { + "epoch": 1.3667193988530748, + "step": 13823, + "train/sim_loss": 0.11721408367156982 + }, + { + "epoch": 1.3667193988530748, + "step": 13823, + "train/total_loss": 0.16587583720684052 + }, + { + "entropy": 9.170077323913574, + "epoch": 1.3668182717025905, + "mean_token_accuracy": 0.8661710023880005, + "num_tokens": 22864062.0, + "step": 13824, + "train/ce_loss": 0.5262790322303772 + }, + { + "epoch": 1.3668182717025905, + "step": 13824, + "train/sim_loss": 0.0285794734954834 + }, + { + "epoch": 1.3668182717025905, + "step": 13824, + "train/total_loss": 0.08120737969875336 + }, + { + "entropy": 9.990426063537598, + "epoch": 1.366917144552106, + "mean_token_accuracy": 0.8927335739135742, + "num_tokens": 22877025.0, + "step": 13825, + "train/ce_loss": 0.2161870300769806 + }, + { + "epoch": 1.366917144552106, + "step": 13825, + "train/sim_loss": 0.03201007843017578 + }, + { + "epoch": 1.366917144552106, + "step": 13825, + "train/total_loss": 0.05362877994775772 + }, + { + "entropy": 9.58093547821045, + "epoch": 1.3670160174016215, + "mean_token_accuracy": 0.7885952591896057, + "num_tokens": 22887124.0, + "step": 13826, + "train/ce_loss": 0.5279332399368286 + }, + { + "epoch": 1.3670160174016215, + "step": 13826, + "train/sim_loss": 0.02901935577392578 + }, + { + "epoch": 1.3670160174016215, + "step": 13826, + "train/total_loss": 0.08181267976760864 + }, + { + "entropy": 8.834610939025879, + "epoch": 1.367114890251137, + "mean_token_accuracy": 0.8394815325737, + "num_tokens": 22896521.0, + "step": 13827, + "train/ce_loss": 0.5142495632171631 + }, + { + "epoch": 1.367114890251137, + "step": 13827, + "train/sim_loss": 0.02963203191757202 + }, + { + "epoch": 1.367114890251137, + "step": 13827, + "train/total_loss": 0.08105698972940445 + }, + { + "entropy": 9.658327102661133, + "epoch": 1.3672137631006525, + "mean_token_accuracy": 0.8745046257972717, + "num_tokens": 22916828.0, + "step": 13828, + "train/ce_loss": 0.35286518931388855 + }, + { + "epoch": 1.3672137631006525, + "step": 13828, + "train/sim_loss": 0.0182797908782959 + }, + { + "epoch": 1.3672137631006525, + "step": 13828, + "train/total_loss": 0.05356631055474281 + }, + { + "entropy": 9.472847938537598, + "epoch": 1.3673126359501682, + "mean_token_accuracy": 0.89952152967453, + "num_tokens": 22929669.0, + "step": 13829, + "train/ce_loss": 3.855736281366262e-07 + }, + { + "epoch": 1.3673126359501682, + "step": 13829, + "train/sim_loss": 0.02244579792022705 + }, + { + "epoch": 1.3673126359501682, + "step": 13829, + "train/total_loss": 0.022445837035775185 + }, + { + "entropy": 9.679444313049316, + "epoch": 1.3674115087996837, + "mean_token_accuracy": 0.8872548937797546, + "num_tokens": 22944728.0, + "step": 13830, + "train/ce_loss": 0.395760178565979 + }, + { + "epoch": 1.3674115087996837, + "step": 13830, + "train/sim_loss": 0.020945310592651367 + }, + { + "epoch": 1.3674115087996837, + "step": 13830, + "train/total_loss": 0.06052133068442345 + }, + { + "entropy": 9.544538497924805, + "epoch": 1.3675103816491991, + "mean_token_accuracy": 0.888198733329773, + "num_tokens": 22958817.0, + "step": 13831, + "train/ce_loss": 0.28417539596557617 + }, + { + "epoch": 1.3675103816491991, + "step": 13831, + "train/sim_loss": 0.06374335289001465 + }, + { + "epoch": 1.3675103816491991, + "step": 13831, + "train/total_loss": 0.0921608954668045 + }, + { + "entropy": 9.3396635055542, + "epoch": 1.3676092544987146, + "mean_token_accuracy": 0.8082026243209839, + "num_tokens": 22971232.0, + "step": 13832, + "train/ce_loss": 0.7004625797271729 + }, + { + "epoch": 1.3676092544987146, + "step": 13832, + "train/sim_loss": 0.05066436529159546 + }, + { + "epoch": 1.3676092544987146, + "step": 13832, + "train/total_loss": 0.12071062624454498 + }, + { + "entropy": 9.183473587036133, + "epoch": 1.36770812734823, + "mean_token_accuracy": 0.8923327922821045, + "num_tokens": 22982547.0, + "step": 13833, + "train/ce_loss": 2.1158056995318475e-07 + }, + { + "epoch": 1.36770812734823, + "step": 13833, + "train/sim_loss": 0.04799675941467285 + }, + { + "epoch": 1.36770812734823, + "step": 13833, + "train/total_loss": 0.04799678176641464 + }, + { + "entropy": 8.896663665771484, + "epoch": 1.3678070001977458, + "mean_token_accuracy": 0.8410732746124268, + "num_tokens": 22992218.0, + "step": 13834, + "train/ce_loss": 0.441775918006897 + }, + { + "epoch": 1.3678070001977458, + "step": 13834, + "train/sim_loss": 0.026129603385925293 + }, + { + "epoch": 1.3678070001977458, + "step": 13834, + "train/total_loss": 0.07030719518661499 + }, + { + "entropy": 9.360769271850586, + "epoch": 1.3679058730472613, + "mean_token_accuracy": 0.840694010257721, + "num_tokens": 23001615.0, + "step": 13835, + "train/ce_loss": 4.0237185316982504e-07 + }, + { + "epoch": 1.3679058730472613, + "step": 13835, + "train/sim_loss": 0.045170605182647705 + }, + { + "epoch": 1.3679058730472613, + "step": 13835, + "train/total_loss": 0.04517064616084099 + }, + { + "entropy": 8.93310260772705, + "epoch": 1.3680047458967768, + "mean_token_accuracy": 0.8203724026679993, + "num_tokens": 23015085.0, + "step": 13836, + "train/ce_loss": 0.6279071569442749 + }, + { + "epoch": 1.3680047458967768, + "step": 13836, + "train/sim_loss": 0.02397477626800537 + }, + { + "epoch": 1.3680047458967768, + "step": 13836, + "train/total_loss": 0.08676549047231674 + }, + { + "entropy": 9.190844535827637, + "epoch": 1.3681036187462923, + "mean_token_accuracy": 0.8805257081985474, + "num_tokens": 23023557.0, + "step": 13837, + "train/ce_loss": 0.2973804175853729 + }, + { + "epoch": 1.3681036187462923, + "step": 13837, + "train/sim_loss": 0.009469270706176758 + }, + { + "epoch": 1.3681036187462923, + "step": 13837, + "train/total_loss": 0.03920731320977211 + }, + { + "entropy": 9.057822227478027, + "epoch": 1.3682024915958078, + "mean_token_accuracy": 0.8719676733016968, + "num_tokens": 23035472.0, + "step": 13838, + "train/ce_loss": 3.2517101544726756e-07 + }, + { + "epoch": 1.3682024915958078, + "step": 13838, + "train/sim_loss": 0.042849838733673096 + }, + { + "epoch": 1.3682024915958078, + "step": 13838, + "train/total_loss": 0.04284987226128578 + }, + { + "entropy": 9.490015029907227, + "epoch": 1.3683013644453232, + "mean_token_accuracy": 0.8623853325843811, + "num_tokens": 23050579.0, + "step": 13839, + "train/ce_loss": 0.667823076248169 + }, + { + "epoch": 1.3683013644453232, + "step": 13839, + "train/sim_loss": 0.03509920835494995 + }, + { + "epoch": 1.3683013644453232, + "step": 13839, + "train/total_loss": 0.10188151895999908 + }, + { + "epoch": 1.3684002372948387, + "grad_norm": 0.5980611443519592, + "learning_rate": 6.5808732631162545e-06, + "loss": 0.0825, + "step": 13840 + }, + { + "entropy": 9.546745300292969, + "epoch": 1.3684002372948387, + "mean_token_accuracy": 0.841176450252533, + "num_tokens": 23064401.0, + "step": 13840, + "train/ce_loss": 4.1979825482485467e-07 + }, + { + "epoch": 1.3684002372948387, + "step": 13840, + "train/sim_loss": 0.036231279373168945 + }, + { + "epoch": 1.3684002372948387, + "step": 13840, + "train/total_loss": 0.03623132035136223 + }, + { + "entropy": 8.985108375549316, + "epoch": 1.3684991101443544, + "mean_token_accuracy": 0.855332612991333, + "num_tokens": 23074431.0, + "step": 13841, + "train/ce_loss": 0.6043205261230469 + }, + { + "epoch": 1.3684991101443544, + "step": 13841, + "train/sim_loss": 0.03976947069168091 + }, + { + "epoch": 1.3684991101443544, + "step": 13841, + "train/total_loss": 0.10020152479410172 + }, + { + "entropy": 9.377220153808594, + "epoch": 1.36859798299387, + "mean_token_accuracy": 0.9094955325126648, + "num_tokens": 23092775.0, + "step": 13842, + "train/ce_loss": 0.2673787474632263 + }, + { + "epoch": 1.36859798299387, + "step": 13842, + "train/sim_loss": 0.034796953201293945 + }, + { + "epoch": 1.36859798299387, + "step": 13842, + "train/total_loss": 0.0615348294377327 + }, + { + "entropy": 9.577933311462402, + "epoch": 1.3686968558433854, + "mean_token_accuracy": 0.8747252821922302, + "num_tokens": 23109091.0, + "step": 13843, + "train/ce_loss": 0.6946851015090942 + }, + { + "epoch": 1.3686968558433854, + "step": 13843, + "train/sim_loss": 0.022708117961883545 + }, + { + "epoch": 1.3686968558433854, + "step": 13843, + "train/total_loss": 0.09217663109302521 + }, + { + "entropy": 9.912851333618164, + "epoch": 1.368795728692901, + "mean_token_accuracy": 0.8661290407180786, + "num_tokens": 23120397.0, + "step": 13844, + "train/ce_loss": 0.2145165503025055 + }, + { + "epoch": 1.368795728692901, + "step": 13844, + "train/sim_loss": 0.022054195404052734 + }, + { + "epoch": 1.368795728692901, + "step": 13844, + "train/total_loss": 0.04350585117936134 + }, + { + "entropy": 9.363027572631836, + "epoch": 1.3688946015424164, + "mean_token_accuracy": 0.8846737742424011, + "num_tokens": 23130834.0, + "step": 13845, + "train/ce_loss": 5.35433343884506e-07 + }, + { + "epoch": 1.3688946015424164, + "step": 13845, + "train/sim_loss": 0.037619709968566895 + }, + { + "epoch": 1.3688946015424164, + "step": 13845, + "train/total_loss": 0.03761976212263107 + }, + { + "entropy": 9.841047286987305, + "epoch": 1.368993474391932, + "mean_token_accuracy": 0.8355855941772461, + "num_tokens": 23149080.0, + "step": 13846, + "train/ce_loss": 0.7877315282821655 + }, + { + "epoch": 1.368993474391932, + "step": 13846, + "train/sim_loss": 0.018597304821014404 + }, + { + "epoch": 1.368993474391932, + "step": 13846, + "train/total_loss": 0.0973704606294632 + }, + { + "entropy": 9.281170845031738, + "epoch": 1.3690923472414476, + "mean_token_accuracy": 0.8545627593994141, + "num_tokens": 23162407.0, + "step": 13847, + "train/ce_loss": 0.7930374145507812 + }, + { + "epoch": 1.3690923472414476, + "step": 13847, + "train/sim_loss": 0.0523684024810791 + }, + { + "epoch": 1.3690923472414476, + "step": 13847, + "train/total_loss": 0.13167214393615723 + }, + { + "entropy": 9.592119216918945, + "epoch": 1.369191220090963, + "mean_token_accuracy": 0.8291855454444885, + "num_tokens": 23176481.0, + "step": 13848, + "train/ce_loss": 0.6325310468673706 + }, + { + "epoch": 1.369191220090963, + "step": 13848, + "train/sim_loss": 0.0315474271774292 + }, + { + "epoch": 1.369191220090963, + "step": 13848, + "train/total_loss": 0.09480053186416626 + }, + { + "entropy": 9.10464096069336, + "epoch": 1.3692900929404785, + "mean_token_accuracy": 0.8161849975585938, + "num_tokens": 23188828.0, + "step": 13849, + "train/ce_loss": 0.5941178798675537 + }, + { + "epoch": 1.3692900929404785, + "step": 13849, + "train/sim_loss": 0.06406450271606445 + }, + { + "epoch": 1.3692900929404785, + "step": 13849, + "train/total_loss": 0.1234762966632843 + }, + { + "entropy": 9.400124549865723, + "epoch": 1.369388965789994, + "mean_token_accuracy": 0.8223684430122375, + "num_tokens": 23204022.0, + "step": 13850, + "train/ce_loss": 0.32337653636932373 + }, + { + "epoch": 1.369388965789994, + "step": 13850, + "train/sim_loss": 0.050863444805145264 + }, + { + "epoch": 1.369388965789994, + "step": 13850, + "train/total_loss": 0.0832010954618454 + }, + { + "entropy": 9.556974411010742, + "epoch": 1.3694878386395095, + "mean_token_accuracy": 0.8822463750839233, + "num_tokens": 23216692.0, + "step": 13851, + "train/ce_loss": 0.45948436856269836 + }, + { + "epoch": 1.3694878386395095, + "step": 13851, + "train/sim_loss": 0.04212313890457153 + }, + { + "epoch": 1.3694878386395095, + "step": 13851, + "train/total_loss": 0.08807157725095749 + }, + { + "entropy": 9.700048446655273, + "epoch": 1.369586711489025, + "mean_token_accuracy": 0.7855297327041626, + "num_tokens": 23227890.0, + "step": 13852, + "train/ce_loss": 0.3947431445121765 + }, + { + "epoch": 1.369586711489025, + "step": 13852, + "train/sim_loss": 0.03872114419937134 + }, + { + "epoch": 1.369586711489025, + "step": 13852, + "train/total_loss": 0.07819546014070511 + }, + { + "entropy": 9.323882102966309, + "epoch": 1.3696855843385407, + "mean_token_accuracy": 0.8564867973327637, + "num_tokens": 23242632.0, + "step": 13853, + "train/ce_loss": 0.7864463329315186 + }, + { + "epoch": 1.3696855843385407, + "step": 13853, + "train/sim_loss": 0.039597153663635254 + }, + { + "epoch": 1.3696855843385407, + "step": 13853, + "train/total_loss": 0.11824178695678711 + }, + { + "entropy": 9.558794975280762, + "epoch": 1.3697844571880562, + "mean_token_accuracy": 0.8249694108963013, + "num_tokens": 23260743.0, + "step": 13854, + "train/ce_loss": 0.5903465747833252 + }, + { + "epoch": 1.3697844571880562, + "step": 13854, + "train/sim_loss": 0.04441887140274048 + }, + { + "epoch": 1.3697844571880562, + "step": 13854, + "train/total_loss": 0.10345353186130524 + }, + { + "entropy": 8.775455474853516, + "epoch": 1.3698833300375717, + "mean_token_accuracy": 0.8708487153053284, + "num_tokens": 23267848.0, + "step": 13855, + "train/ce_loss": 0.32121405005455017 + }, + { + "epoch": 1.3698833300375717, + "step": 13855, + "train/sim_loss": 0.014717638492584229 + }, + { + "epoch": 1.3698833300375717, + "step": 13855, + "train/total_loss": 0.046839043498039246 + }, + { + "entropy": 9.524299621582031, + "epoch": 1.3699822028870872, + "mean_token_accuracy": 0.8480300307273865, + "num_tokens": 23278843.0, + "step": 13856, + "train/ce_loss": 6.530632390422397e-07 + }, + { + "epoch": 1.3699822028870872, + "step": 13856, + "train/sim_loss": 0.03584599494934082 + }, + { + "epoch": 1.3699822028870872, + "step": 13856, + "train/total_loss": 0.03584606200456619 + }, + { + "entropy": 9.386935234069824, + "epoch": 1.3700810757366026, + "mean_token_accuracy": 0.9055555462837219, + "num_tokens": 23289604.0, + "step": 13857, + "train/ce_loss": 2.960327378787042e-07 + }, + { + "epoch": 1.3700810757366026, + "step": 13857, + "train/sim_loss": 0.058025479316711426 + }, + { + "epoch": 1.3700810757366026, + "step": 13857, + "train/total_loss": 0.058025509119033813 + }, + { + "entropy": 9.601119995117188, + "epoch": 1.3701799485861184, + "mean_token_accuracy": 0.8100407123565674, + "num_tokens": 23300170.0, + "step": 13858, + "train/ce_loss": 0.7122638821601868 + }, + { + "epoch": 1.3701799485861184, + "step": 13858, + "train/sim_loss": 0.0498543381690979 + }, + { + "epoch": 1.3701799485861184, + "step": 13858, + "train/total_loss": 0.12108072638511658 + }, + { + "entropy": 9.15296745300293, + "epoch": 1.3702788214356338, + "mean_token_accuracy": 0.8225806355476379, + "num_tokens": 23312843.0, + "step": 13859, + "train/ce_loss": 0.323698490858078 + }, + { + "epoch": 1.3702788214356338, + "step": 13859, + "train/sim_loss": 0.06182914972305298 + }, + { + "epoch": 1.3702788214356338, + "step": 13859, + "train/total_loss": 0.09419900178909302 + }, + { + "epoch": 1.3703776942851493, + "grad_norm": 0.586679995059967, + "learning_rate": 6.575928398358306e-06, + "loss": 0.0844, + "step": 13860 + }, + { + "entropy": 9.338920593261719, + "epoch": 1.3703776942851493, + "mean_token_accuracy": 0.8853288292884827, + "num_tokens": 23325470.0, + "step": 13860, + "train/ce_loss": 2.6615762749315763e-07 + }, + { + "epoch": 1.3703776942851493, + "step": 13860, + "train/sim_loss": 0.021440744400024414 + }, + { + "epoch": 1.3703776942851493, + "step": 13860, + "train/total_loss": 0.021440770477056503 + }, + { + "entropy": 8.90053939819336, + "epoch": 1.3704765671346648, + "mean_token_accuracy": 0.8438966870307922, + "num_tokens": 23332219.0, + "step": 13861, + "train/ce_loss": 0.28921636939048767 + }, + { + "epoch": 1.3704765671346648, + "step": 13861, + "train/sim_loss": 0.04733169078826904 + }, + { + "epoch": 1.3704765671346648, + "step": 13861, + "train/total_loss": 0.07625332474708557 + }, + { + "entropy": 9.202983856201172, + "epoch": 1.3705754399841803, + "mean_token_accuracy": 0.8450899124145508, + "num_tokens": 23346096.0, + "step": 13862, + "train/ce_loss": 0.5870763659477234 + }, + { + "epoch": 1.3705754399841803, + "step": 13862, + "train/sim_loss": 0.06635856628417969 + }, + { + "epoch": 1.3705754399841803, + "step": 13862, + "train/total_loss": 0.12506620585918427 + }, + { + "entropy": 9.201610565185547, + "epoch": 1.3706743128336958, + "mean_token_accuracy": 0.8447653651237488, + "num_tokens": 23358210.0, + "step": 13863, + "train/ce_loss": 0.4843589663505554 + }, + { + "epoch": 1.3706743128336958, + "step": 13863, + "train/sim_loss": 0.038543701171875 + }, + { + "epoch": 1.3706743128336958, + "step": 13863, + "train/total_loss": 0.08697959780693054 + }, + { + "entropy": 8.85639476776123, + "epoch": 1.3707731856832113, + "mean_token_accuracy": 0.8571428656578064, + "num_tokens": 23370157.0, + "step": 13864, + "train/ce_loss": 0.5905900001525879 + }, + { + "epoch": 1.3707731856832113, + "step": 13864, + "train/sim_loss": 0.052366673946380615 + }, + { + "epoch": 1.3707731856832113, + "step": 13864, + "train/total_loss": 0.11142567545175552 + }, + { + "entropy": 9.271858215332031, + "epoch": 1.370872058532727, + "mean_token_accuracy": 0.8229398727416992, + "num_tokens": 23388325.0, + "step": 13865, + "train/ce_loss": 0.374661386013031 + }, + { + "epoch": 1.370872058532727, + "step": 13865, + "train/sim_loss": 0.025042712688446045 + }, + { + "epoch": 1.370872058532727, + "step": 13865, + "train/total_loss": 0.06250885128974915 + }, + { + "entropy": 9.112791061401367, + "epoch": 1.3709709313822425, + "mean_token_accuracy": 0.8331360816955566, + "num_tokens": 23399851.0, + "step": 13866, + "train/ce_loss": 0.596876859664917 + }, + { + "epoch": 1.3709709313822425, + "step": 13866, + "train/sim_loss": 0.05633729696273804 + }, + { + "epoch": 1.3709709313822425, + "step": 13866, + "train/total_loss": 0.11602498590946198 + }, + { + "entropy": 9.368254661560059, + "epoch": 1.371069804231758, + "mean_token_accuracy": 0.8711864352226257, + "num_tokens": 23415519.0, + "step": 13867, + "train/ce_loss": 0.5216188430786133 + }, + { + "epoch": 1.371069804231758, + "step": 13867, + "train/sim_loss": 0.030410945415496826 + }, + { + "epoch": 1.371069804231758, + "step": 13867, + "train/total_loss": 0.0825728327035904 + }, + { + "entropy": 9.29050064086914, + "epoch": 1.3711686770812734, + "mean_token_accuracy": 0.8210961818695068, + "num_tokens": 23430200.0, + "step": 13868, + "train/ce_loss": 0.6519060730934143 + }, + { + "epoch": 1.3711686770812734, + "step": 13868, + "train/sim_loss": 0.0635608434677124 + }, + { + "epoch": 1.3711686770812734, + "step": 13868, + "train/total_loss": 0.1287514567375183 + }, + { + "entropy": 9.493026733398438, + "epoch": 1.371267549930789, + "mean_token_accuracy": 0.8105781078338623, + "num_tokens": 23446529.0, + "step": 13869, + "train/ce_loss": 0.6488566994667053 + }, + { + "epoch": 1.371267549930789, + "step": 13869, + "train/sim_loss": 0.031896889209747314 + }, + { + "epoch": 1.371267549930789, + "step": 13869, + "train/total_loss": 0.09678255766630173 + }, + { + "entropy": 9.54654312133789, + "epoch": 1.3713664227803046, + "mean_token_accuracy": 0.8886986374855042, + "num_tokens": 23459644.0, + "step": 13870, + "train/ce_loss": 0.6801572442054749 + }, + { + "epoch": 1.3713664227803046, + "step": 13870, + "train/sim_loss": 0.0486147403717041 + }, + { + "epoch": 1.3713664227803046, + "step": 13870, + "train/total_loss": 0.11663046479225159 + }, + { + "entropy": 9.68115234375, + "epoch": 1.3714652956298201, + "mean_token_accuracy": 0.8181818127632141, + "num_tokens": 23474832.0, + "step": 13871, + "train/ce_loss": 0.40134429931640625 + }, + { + "epoch": 1.3714652956298201, + "step": 13871, + "train/sim_loss": 0.035157203674316406 + }, + { + "epoch": 1.3714652956298201, + "step": 13871, + "train/total_loss": 0.07529163360595703 + }, + { + "entropy": 9.706581115722656, + "epoch": 1.3715641684793356, + "mean_token_accuracy": 0.8092209696769714, + "num_tokens": 23492100.0, + "step": 13872, + "train/ce_loss": 0.6459500789642334 + }, + { + "epoch": 1.3715641684793356, + "step": 13872, + "train/sim_loss": 0.07242649793624878 + }, + { + "epoch": 1.3715641684793356, + "step": 13872, + "train/total_loss": 0.1370215117931366 + }, + { + "entropy": 9.640811920166016, + "epoch": 1.371663041328851, + "mean_token_accuracy": 0.8376963138580322, + "num_tokens": 23509375.0, + "step": 13873, + "train/ce_loss": 2.8023134746035794e-07 + }, + { + "epoch": 1.371663041328851, + "step": 13873, + "train/sim_loss": 0.02453714609146118 + }, + { + "epoch": 1.371663041328851, + "step": 13873, + "train/total_loss": 0.02453717403113842 + }, + { + "entropy": 9.564970016479492, + "epoch": 1.3717619141783666, + "mean_token_accuracy": 0.843137264251709, + "num_tokens": 23520256.0, + "step": 13874, + "train/ce_loss": 0.6295363306999207 + }, + { + "epoch": 1.3717619141783666, + "step": 13874, + "train/sim_loss": 0.04815167188644409 + }, + { + "epoch": 1.3717619141783666, + "step": 13874, + "train/total_loss": 0.1111053079366684 + }, + { + "entropy": 8.770133018493652, + "epoch": 1.3718607870278823, + "mean_token_accuracy": 0.7971333861351013, + "num_tokens": 23530613.0, + "step": 13875, + "train/ce_loss": 0.45354145765304565 + }, + { + "epoch": 1.3718607870278823, + "step": 13875, + "train/sim_loss": 0.011299848556518555 + }, + { + "epoch": 1.3718607870278823, + "step": 13875, + "train/total_loss": 0.05665399506688118 + }, + { + "entropy": 9.373443603515625, + "epoch": 1.3719596598773975, + "mean_token_accuracy": 0.8611897826194763, + "num_tokens": 23543747.0, + "step": 13876, + "train/ce_loss": 0.3383879065513611 + }, + { + "epoch": 1.3719596598773975, + "step": 13876, + "train/sim_loss": 0.009656429290771484 + }, + { + "epoch": 1.3719596598773975, + "step": 13876, + "train/total_loss": 0.04349521920084953 + }, + { + "entropy": 9.683053016662598, + "epoch": 1.3720585327269132, + "mean_token_accuracy": 0.9176201224327087, + "num_tokens": 23551221.0, + "step": 13877, + "train/ce_loss": 0.33135759830474854 + }, + { + "epoch": 1.3720585327269132, + "step": 13877, + "train/sim_loss": 0.053089439868927 + }, + { + "epoch": 1.3720585327269132, + "step": 13877, + "train/total_loss": 0.08622519671916962 + }, + { + "entropy": 9.554137229919434, + "epoch": 1.3721574055764287, + "mean_token_accuracy": 0.8421052694320679, + "num_tokens": 23565223.0, + "step": 13878, + "train/ce_loss": 0.4061722159385681 + }, + { + "epoch": 1.3721574055764287, + "step": 13878, + "train/sim_loss": 0.043727874755859375 + }, + { + "epoch": 1.3721574055764287, + "step": 13878, + "train/total_loss": 0.08434510231018066 + }, + { + "entropy": 9.651819229125977, + "epoch": 1.3722562784259442, + "mean_token_accuracy": 0.8381502628326416, + "num_tokens": 23581995.0, + "step": 13879, + "train/ce_loss": 0.4548113942146301 + }, + { + "epoch": 1.3722562784259442, + "step": 13879, + "train/sim_loss": 0.07445961236953735 + }, + { + "epoch": 1.3722562784259442, + "step": 13879, + "train/total_loss": 0.11994075775146484 + }, + { + "epoch": 1.3723551512754597, + "grad_norm": 0.6217468976974487, + "learning_rate": 6.570983533600357e-06, + "loss": 0.089, + "step": 13880 + }, + { + "entropy": 9.835947036743164, + "epoch": 1.3723551512754597, + "mean_token_accuracy": 0.8703703880310059, + "num_tokens": 23592169.0, + "step": 13880, + "train/ce_loss": 5.474722684084554e-07 + }, + { + "epoch": 1.3723551512754597, + "step": 13880, + "train/sim_loss": 0.018162548542022705 + }, + { + "epoch": 1.3723551512754597, + "step": 13880, + "train/total_loss": 0.018162602558732033 + }, + { + "entropy": 9.357010841369629, + "epoch": 1.3724540241249752, + "mean_token_accuracy": 0.9120567440986633, + "num_tokens": 23607811.0, + "step": 13881, + "train/ce_loss": 3.4561762163320964e-07 + }, + { + "epoch": 1.3724540241249752, + "step": 13881, + "train/sim_loss": 0.06667006015777588 + }, + { + "epoch": 1.3724540241249752, + "step": 13881, + "train/total_loss": 0.06667009741067886 + }, + { + "entropy": 9.429508209228516, + "epoch": 1.372552896974491, + "mean_token_accuracy": 0.8478500843048096, + "num_tokens": 23622627.0, + "step": 13882, + "train/ce_loss": 0.1990702599287033 + }, + { + "epoch": 1.372552896974491, + "step": 13882, + "train/sim_loss": 0.08470934629440308 + }, + { + "epoch": 1.372552896974491, + "step": 13882, + "train/total_loss": 0.10461637377738953 + }, + { + "entropy": 9.359426498413086, + "epoch": 1.3726517698240064, + "mean_token_accuracy": 0.8058645129203796, + "num_tokens": 23638727.0, + "step": 13883, + "train/ce_loss": 0.5365924835205078 + }, + { + "epoch": 1.3726517698240064, + "step": 13883, + "train/sim_loss": 0.0306851863861084 + }, + { + "epoch": 1.3726517698240064, + "step": 13883, + "train/total_loss": 0.08434443175792694 + }, + { + "entropy": 9.047636032104492, + "epoch": 1.3727506426735219, + "mean_token_accuracy": 0.8172690868377686, + "num_tokens": 23651038.0, + "step": 13884, + "train/ce_loss": 0.43311789631843567 + }, + { + "epoch": 1.3727506426735219, + "step": 13884, + "train/sim_loss": 0.014465272426605225 + }, + { + "epoch": 1.3727506426735219, + "step": 13884, + "train/total_loss": 0.05777706205844879 + }, + { + "entropy": 9.252368927001953, + "epoch": 1.3728495155230374, + "mean_token_accuracy": 0.8511363863945007, + "num_tokens": 23669328.0, + "step": 13885, + "train/ce_loss": 0.7370021343231201 + }, + { + "epoch": 1.3728495155230374, + "step": 13885, + "train/sim_loss": 0.02308875322341919 + }, + { + "epoch": 1.3728495155230374, + "step": 13885, + "train/total_loss": 0.09678896516561508 + }, + { + "entropy": 8.975217819213867, + "epoch": 1.3729483883725528, + "mean_token_accuracy": 0.8001998066902161, + "num_tokens": 23678992.0, + "step": 13886, + "train/ce_loss": 0.534784197807312 + }, + { + "epoch": 1.3729483883725528, + "step": 13886, + "train/sim_loss": 0.06230175495147705 + }, + { + "epoch": 1.3729483883725528, + "step": 13886, + "train/total_loss": 0.11578017473220825 + }, + { + "entropy": 9.707601547241211, + "epoch": 1.3730472612220685, + "mean_token_accuracy": 0.9799196720123291, + "num_tokens": 23691656.0, + "step": 13887, + "train/ce_loss": 1.4649427839685814e-06 + }, + { + "epoch": 1.3730472612220685, + "step": 13887, + "train/sim_loss": 0.02529221773147583 + }, + { + "epoch": 1.3730472612220685, + "step": 13887, + "train/total_loss": 0.02529236488044262 + }, + { + "entropy": 9.996026992797852, + "epoch": 1.3731461340715838, + "mean_token_accuracy": 0.859375, + "num_tokens": 23699501.0, + "step": 13888, + "train/ce_loss": 3.416591596305807e-07 + }, + { + "epoch": 1.3731461340715838, + "step": 13888, + "train/sim_loss": 0.012556195259094238 + }, + { + "epoch": 1.3731461340715838, + "step": 13888, + "train/total_loss": 0.012556229718029499 + }, + { + "entropy": 9.280986785888672, + "epoch": 1.3732450069210995, + "mean_token_accuracy": 0.872107207775116, + "num_tokens": 23711349.0, + "step": 13889, + "train/ce_loss": 0.3821379840373993 + }, + { + "epoch": 1.3732450069210995, + "step": 13889, + "train/sim_loss": 0.05900174379348755 + }, + { + "epoch": 1.3732450069210995, + "step": 13889, + "train/total_loss": 0.09721554815769196 + }, + { + "entropy": 9.361623764038086, + "epoch": 1.373343879770615, + "mean_token_accuracy": 0.8544303774833679, + "num_tokens": 23722433.0, + "step": 13890, + "train/ce_loss": 4.436294602783164e-07 + }, + { + "epoch": 1.373343879770615, + "step": 13890, + "train/sim_loss": 0.033450543880462646 + }, + { + "epoch": 1.373343879770615, + "step": 13890, + "train/total_loss": 0.03345058858394623 + }, + { + "entropy": 9.144758224487305, + "epoch": 1.3734427526201305, + "mean_token_accuracy": 0.849723756313324, + "num_tokens": 23738463.0, + "step": 13891, + "train/ce_loss": 0.2834640443325043 + }, + { + "epoch": 1.3734427526201305, + "step": 13891, + "train/sim_loss": 0.031150519847869873 + }, + { + "epoch": 1.3734427526201305, + "step": 13891, + "train/total_loss": 0.0594969242811203 + }, + { + "entropy": 9.11716079711914, + "epoch": 1.373541625469646, + "mean_token_accuracy": 0.8898963928222656, + "num_tokens": 23746682.0, + "step": 13892, + "train/ce_loss": 0.1867993175983429 + }, + { + "epoch": 1.373541625469646, + "step": 13892, + "train/sim_loss": 0.05629622936248779 + }, + { + "epoch": 1.373541625469646, + "step": 13892, + "train/total_loss": 0.07497616112232208 + }, + { + "entropy": 9.04564094543457, + "epoch": 1.3736404983191615, + "mean_token_accuracy": 0.8505997657775879, + "num_tokens": 23755851.0, + "step": 13893, + "train/ce_loss": 0.48511719703674316 + }, + { + "epoch": 1.3736404983191615, + "step": 13893, + "train/sim_loss": 0.015531182289123535 + }, + { + "epoch": 1.3736404983191615, + "step": 13893, + "train/total_loss": 0.06404290348291397 + }, + { + "entropy": 9.082109451293945, + "epoch": 1.3737393711686772, + "mean_token_accuracy": 0.8122171759605408, + "num_tokens": 23765451.0, + "step": 13894, + "train/ce_loss": 0.4640989601612091 + }, + { + "epoch": 1.3737393711686772, + "step": 13894, + "train/sim_loss": 0.03855210542678833 + }, + { + "epoch": 1.3737393711686772, + "step": 13894, + "train/total_loss": 0.08496200293302536 + }, + { + "entropy": 9.815646171569824, + "epoch": 1.3738382440181927, + "mean_token_accuracy": 0.9178082346916199, + "num_tokens": 23775146.0, + "step": 13895, + "train/ce_loss": 0.5184015035629272 + }, + { + "epoch": 1.3738382440181927, + "step": 13895, + "train/sim_loss": 0.041051387786865234 + }, + { + "epoch": 1.3738382440181927, + "step": 13895, + "train/total_loss": 0.09289154410362244 + }, + { + "entropy": 8.916810989379883, + "epoch": 1.3739371168677081, + "mean_token_accuracy": 0.8216432929039001, + "num_tokens": 23783069.0, + "step": 13896, + "train/ce_loss": 0.4635941684246063 + }, + { + "epoch": 1.3739371168677081, + "step": 13896, + "train/sim_loss": 0.013851940631866455 + }, + { + "epoch": 1.3739371168677081, + "step": 13896, + "train/total_loss": 0.06021135672926903 + }, + { + "entropy": 9.749626159667969, + "epoch": 1.3740359897172236, + "mean_token_accuracy": 0.8520833253860474, + "num_tokens": 23799957.0, + "step": 13897, + "train/ce_loss": 0.5450055599212646 + }, + { + "epoch": 1.3740359897172236, + "step": 13897, + "train/sim_loss": 0.0568767786026001 + }, + { + "epoch": 1.3740359897172236, + "step": 13897, + "train/total_loss": 0.11137733608484268 + }, + { + "entropy": 9.04585075378418, + "epoch": 1.3741348625667391, + "mean_token_accuracy": 0.8258317112922668, + "num_tokens": 23808713.0, + "step": 13898, + "train/ce_loss": 0.5014678239822388 + }, + { + "epoch": 1.3741348625667391, + "step": 13898, + "train/sim_loss": 0.062149643898010254 + }, + { + "epoch": 1.3741348625667391, + "step": 13898, + "train/total_loss": 0.11229643225669861 + }, + { + "entropy": 9.61094856262207, + "epoch": 1.3742337354162548, + "mean_token_accuracy": 0.8388888835906982, + "num_tokens": 23822401.0, + "step": 13899, + "train/ce_loss": 0.36625993251800537 + }, + { + "epoch": 1.3742337354162548, + "step": 13899, + "train/sim_loss": 0.04985332489013672 + }, + { + "epoch": 1.3742337354162548, + "step": 13899, + "train/total_loss": 0.0864793211221695 + }, + { + "epoch": 1.37433260826577, + "grad_norm": 0.6422215104103088, + "learning_rate": 6.566038668842407e-06, + "loss": 0.0845, + "step": 13900 + }, + { + "entropy": 9.22319221496582, + "epoch": 1.37433260826577, + "mean_token_accuracy": 0.8927943706512451, + "num_tokens": 23833236.0, + "step": 13900, + "train/ce_loss": 4.6593564206887095e-07 + }, + { + "epoch": 1.37433260826577, + "step": 13900, + "train/sim_loss": 0.04534804821014404 + }, + { + "epoch": 1.37433260826577, + "step": 13900, + "train/total_loss": 0.04534809663891792 + }, + { + "entropy": 9.744295120239258, + "epoch": 1.3744314811152858, + "mean_token_accuracy": 0.8227611780166626, + "num_tokens": 23844814.0, + "step": 13901, + "train/ce_loss": 0.8917611837387085 + }, + { + "epoch": 1.3744314811152858, + "step": 13901, + "train/sim_loss": 0.03811770677566528 + }, + { + "epoch": 1.3744314811152858, + "step": 13901, + "train/total_loss": 0.12729382514953613 + }, + { + "entropy": 9.680051803588867, + "epoch": 1.3745303539648013, + "mean_token_accuracy": 0.8676844835281372, + "num_tokens": 23857366.0, + "step": 13902, + "train/ce_loss": 0.34043797850608826 + }, + { + "epoch": 1.3745303539648013, + "step": 13902, + "train/sim_loss": 0.03372180461883545 + }, + { + "epoch": 1.3745303539648013, + "step": 13902, + "train/total_loss": 0.06776560842990875 + }, + { + "entropy": 9.020137786865234, + "epoch": 1.3746292268143168, + "mean_token_accuracy": 0.8838028311729431, + "num_tokens": 23868945.0, + "step": 13903, + "train/ce_loss": 0.44548550248146057 + }, + { + "epoch": 1.3746292268143168, + "step": 13903, + "train/sim_loss": 0.04125833511352539 + }, + { + "epoch": 1.3746292268143168, + "step": 13903, + "train/total_loss": 0.08580689132213593 + }, + { + "entropy": 9.563875198364258, + "epoch": 1.3747280996638322, + "mean_token_accuracy": 0.8431635499000549, + "num_tokens": 23886211.0, + "step": 13904, + "train/ce_loss": 3.157566652589594e-07 + }, + { + "epoch": 1.3747280996638322, + "step": 13904, + "train/sim_loss": 0.023174524307250977 + }, + { + "epoch": 1.3747280996638322, + "step": 13904, + "train/total_loss": 0.023174555972218513 + }, + { + "entropy": 9.585658073425293, + "epoch": 1.3748269725133477, + "mean_token_accuracy": 0.8926174640655518, + "num_tokens": 23902136.0, + "step": 13905, + "train/ce_loss": 0.4532662332057953 + }, + { + "epoch": 1.3748269725133477, + "step": 13905, + "train/sim_loss": 0.03964042663574219 + }, + { + "epoch": 1.3748269725133477, + "step": 13905, + "train/total_loss": 0.08496704697608948 + }, + { + "entropy": 9.628188133239746, + "epoch": 1.3749258453628634, + "mean_token_accuracy": 0.877237856388092, + "num_tokens": 23919129.0, + "step": 13906, + "train/ce_loss": 0.3498459756374359 + }, + { + "epoch": 1.3749258453628634, + "step": 13906, + "train/sim_loss": 0.03224003314971924 + }, + { + "epoch": 1.3749258453628634, + "step": 13906, + "train/total_loss": 0.06722463667392731 + }, + { + "entropy": 9.86685562133789, + "epoch": 1.375024718212379, + "mean_token_accuracy": 0.8715953230857849, + "num_tokens": 23927234.0, + "step": 13907, + "train/ce_loss": 0.44608330726623535 + }, + { + "epoch": 1.375024718212379, + "step": 13907, + "train/sim_loss": 0.026126205921173096 + }, + { + "epoch": 1.375024718212379, + "step": 13907, + "train/total_loss": 0.07073453813791275 + }, + { + "entropy": 8.858743667602539, + "epoch": 1.3751235910618944, + "mean_token_accuracy": 0.802955687046051, + "num_tokens": 23936038.0, + "step": 13908, + "train/ce_loss": 0.3579162061214447 + }, + { + "epoch": 1.3751235910618944, + "step": 13908, + "train/sim_loss": 0.07682573795318604 + }, + { + "epoch": 1.3751235910618944, + "step": 13908, + "train/total_loss": 0.1126173585653305 + }, + { + "entropy": 9.546225547790527, + "epoch": 1.37522246391141, + "mean_token_accuracy": 0.7743966579437256, + "num_tokens": 23950399.0, + "step": 13909, + "train/ce_loss": 0.49134591221809387 + }, + { + "epoch": 1.37522246391141, + "step": 13909, + "train/sim_loss": 0.026303648948669434 + }, + { + "epoch": 1.37522246391141, + "step": 13909, + "train/total_loss": 0.0754382461309433 + }, + { + "entropy": 8.961647033691406, + "epoch": 1.3753213367609254, + "mean_token_accuracy": 0.8258293867111206, + "num_tokens": 23962331.0, + "step": 13910, + "train/ce_loss": 0.4550477862358093 + }, + { + "epoch": 1.3753213367609254, + "step": 13910, + "train/sim_loss": 0.020793795585632324 + }, + { + "epoch": 1.3753213367609254, + "step": 13910, + "train/total_loss": 0.06629857420921326 + }, + { + "entropy": 9.429853439331055, + "epoch": 1.375420209610441, + "mean_token_accuracy": 0.8755707740783691, + "num_tokens": 23975466.0, + "step": 13911, + "train/ce_loss": 0.36771366000175476 + }, + { + "epoch": 1.375420209610441, + "step": 13911, + "train/sim_loss": 0.042024075984954834 + }, + { + "epoch": 1.375420209610441, + "step": 13911, + "train/total_loss": 0.07879544794559479 + }, + { + "entropy": 9.514701843261719, + "epoch": 1.3755190824599566, + "mean_token_accuracy": 0.8344113826751709, + "num_tokens": 23987303.0, + "step": 13912, + "train/ce_loss": 0.6417713761329651 + }, + { + "epoch": 1.3755190824599566, + "step": 13912, + "train/sim_loss": 0.034774839878082275 + }, + { + "epoch": 1.3755190824599566, + "step": 13912, + "train/total_loss": 0.09895198047161102 + }, + { + "entropy": 9.2638578414917, + "epoch": 1.375617955309472, + "mean_token_accuracy": 0.8743094205856323, + "num_tokens": 23996271.0, + "step": 13913, + "train/ce_loss": 7.442018841175013e-07 + }, + { + "epoch": 1.375617955309472, + "step": 13913, + "train/sim_loss": 0.03149980306625366 + }, + { + "epoch": 1.375617955309472, + "step": 13913, + "train/total_loss": 0.03149987757205963 + }, + { + "entropy": 9.402384757995605, + "epoch": 1.3757168281589875, + "mean_token_accuracy": 0.8666666746139526, + "num_tokens": 24009211.0, + "step": 13914, + "train/ce_loss": 0.4099857211112976 + }, + { + "epoch": 1.3757168281589875, + "step": 13914, + "train/sim_loss": 0.026082634925842285 + }, + { + "epoch": 1.3757168281589875, + "step": 13914, + "train/total_loss": 0.06708121299743652 + }, + { + "entropy": 9.456110954284668, + "epoch": 1.375815701008503, + "mean_token_accuracy": 0.7928416728973389, + "num_tokens": 24023172.0, + "step": 13915, + "train/ce_loss": 0.6386025547981262 + }, + { + "epoch": 1.375815701008503, + "step": 13915, + "train/sim_loss": 0.017979025840759277 + }, + { + "epoch": 1.375815701008503, + "step": 13915, + "train/total_loss": 0.08183928579092026 + }, + { + "entropy": 9.514813423156738, + "epoch": 1.3759145738580185, + "mean_token_accuracy": 0.8050633072853088, + "num_tokens": 24041063.0, + "step": 13916, + "train/ce_loss": 0.6663211584091187 + }, + { + "epoch": 1.3759145738580185, + "step": 13916, + "train/sim_loss": 0.07780206203460693 + }, + { + "epoch": 1.3759145738580185, + "step": 13916, + "train/total_loss": 0.14443418383598328 + }, + { + "entropy": 9.40833568572998, + "epoch": 1.376013446707534, + "mean_token_accuracy": 0.8537455201148987, + "num_tokens": 24056677.0, + "step": 13917, + "train/ce_loss": 0.7187623381614685 + }, + { + "epoch": 1.376013446707534, + "step": 13917, + "train/sim_loss": 0.09195059537887573 + }, + { + "epoch": 1.376013446707534, + "step": 13917, + "train/total_loss": 0.1638268232345581 + }, + { + "entropy": 9.534385681152344, + "epoch": 1.3761123195570497, + "mean_token_accuracy": 0.82492995262146, + "num_tokens": 24070674.0, + "step": 13918, + "train/ce_loss": 0.5182105898857117 + }, + { + "epoch": 1.3761123195570497, + "step": 13918, + "train/sim_loss": 0.04710465669631958 + }, + { + "epoch": 1.3761123195570497, + "step": 13918, + "train/total_loss": 0.09892571717500687 + }, + { + "entropy": 9.192604064941406, + "epoch": 1.3762111924065652, + "mean_token_accuracy": 0.8597475290298462, + "num_tokens": 24085850.0, + "step": 13919, + "train/ce_loss": 0.6294293999671936 + }, + { + "epoch": 1.3762111924065652, + "step": 13919, + "train/sim_loss": 0.03343033790588379 + }, + { + "epoch": 1.3762111924065652, + "step": 13919, + "train/total_loss": 0.09637328237295151 + }, + { + "epoch": 1.3763100652560807, + "grad_norm": 0.4817509353160858, + "learning_rate": 6.561093804084458e-06, + "loss": 0.0847, + "step": 13920 + }, + { + "entropy": 9.162508964538574, + "epoch": 1.3763100652560807, + "mean_token_accuracy": 0.8537005186080933, + "num_tokens": 24097493.0, + "step": 13920, + "train/ce_loss": 1.1098378896713257 + }, + { + "epoch": 1.3763100652560807, + "step": 13920, + "train/sim_loss": 0.039204955101013184 + }, + { + "epoch": 1.3763100652560807, + "step": 13920, + "train/total_loss": 0.15018874406814575 + }, + { + "entropy": 9.242898941040039, + "epoch": 1.3764089381055962, + "mean_token_accuracy": 0.8625180721282959, + "num_tokens": 24108852.0, + "step": 13921, + "train/ce_loss": 0.34860602021217346 + }, + { + "epoch": 1.3764089381055962, + "step": 13921, + "train/sim_loss": 0.03486287593841553 + }, + { + "epoch": 1.3764089381055962, + "step": 13921, + "train/total_loss": 0.06972347944974899 + }, + { + "entropy": 9.636077880859375, + "epoch": 1.3765078109551117, + "mean_token_accuracy": 0.8766404390335083, + "num_tokens": 24117649.0, + "step": 13922, + "train/ce_loss": 0.2387883961200714 + }, + { + "epoch": 1.3765078109551117, + "step": 13922, + "train/sim_loss": 0.01286923885345459 + }, + { + "epoch": 1.3765078109551117, + "step": 13922, + "train/total_loss": 0.03674808144569397 + }, + { + "entropy": 8.923932075500488, + "epoch": 1.3766066838046274, + "mean_token_accuracy": 0.8524203300476074, + "num_tokens": 24128110.0, + "step": 13923, + "train/ce_loss": 0.43957871198654175 + }, + { + "epoch": 1.3766066838046274, + "step": 13923, + "train/sim_loss": 0.0831761360168457 + }, + { + "epoch": 1.3766066838046274, + "step": 13923, + "train/total_loss": 0.12713401019573212 + }, + { + "entropy": 9.377098083496094, + "epoch": 1.3767055566541428, + "mean_token_accuracy": 0.8163506984710693, + "num_tokens": 24141077.0, + "step": 13924, + "train/ce_loss": 0.45431649684906006 + }, + { + "epoch": 1.3767055566541428, + "step": 13924, + "train/sim_loss": 0.027185142040252686 + }, + { + "epoch": 1.3767055566541428, + "step": 13924, + "train/total_loss": 0.07261679321527481 + }, + { + "entropy": 9.190387725830078, + "epoch": 1.3768044295036583, + "mean_token_accuracy": 0.8302122354507446, + "num_tokens": 24149266.0, + "step": 13925, + "train/ce_loss": 0.5034080743789673 + }, + { + "epoch": 1.3768044295036583, + "step": 13925, + "train/sim_loss": 0.03440237045288086 + }, + { + "epoch": 1.3768044295036583, + "step": 13925, + "train/total_loss": 0.08474317938089371 + }, + { + "entropy": 8.88524341583252, + "epoch": 1.3769033023531738, + "mean_token_accuracy": 0.8940397500991821, + "num_tokens": 24156076.0, + "step": 13926, + "train/ce_loss": 0.36972326040267944 + }, + { + "epoch": 1.3769033023531738, + "step": 13926, + "train/sim_loss": 0.0798954963684082 + }, + { + "epoch": 1.3769033023531738, + "step": 13926, + "train/total_loss": 0.11686782538890839 + }, + { + "entropy": 9.603087425231934, + "epoch": 1.3770021752026893, + "mean_token_accuracy": 0.852011501789093, + "num_tokens": 24168560.0, + "step": 13927, + "train/ce_loss": 0.7463496327400208 + }, + { + "epoch": 1.3770021752026893, + "step": 13927, + "train/sim_loss": 0.03291213512420654 + }, + { + "epoch": 1.3770021752026893, + "step": 13927, + "train/total_loss": 0.1075470969080925 + }, + { + "entropy": 9.1751127243042, + "epoch": 1.3771010480522048, + "mean_token_accuracy": 0.8302158117294312, + "num_tokens": 24178144.0, + "step": 13928, + "train/ce_loss": 0.5554742217063904 + }, + { + "epoch": 1.3771010480522048, + "step": 13928, + "train/sim_loss": 0.040030717849731445 + }, + { + "epoch": 1.3771010480522048, + "step": 13928, + "train/total_loss": 0.0955781415104866 + }, + { + "entropy": 9.801437377929688, + "epoch": 1.3771999209017203, + "mean_token_accuracy": 0.9030390977859497, + "num_tokens": 24197469.0, + "step": 13929, + "train/ce_loss": 0.33995094895362854 + }, + { + "epoch": 1.3771999209017203, + "step": 13929, + "train/sim_loss": 0.057780921459198 + }, + { + "epoch": 1.3771999209017203, + "step": 13929, + "train/total_loss": 0.09177601337432861 + }, + { + "entropy": 9.34759521484375, + "epoch": 1.377298793751236, + "mean_token_accuracy": 0.8424479365348816, + "num_tokens": 24209468.0, + "step": 13930, + "train/ce_loss": 0.47585511207580566 + }, + { + "epoch": 1.377298793751236, + "step": 13930, + "train/sim_loss": 0.03333193063735962 + }, + { + "epoch": 1.377298793751236, + "step": 13930, + "train/total_loss": 0.08091744780540466 + }, + { + "entropy": 9.385797500610352, + "epoch": 1.3773976666007515, + "mean_token_accuracy": 0.8270858526229858, + "num_tokens": 24222782.0, + "step": 13931, + "train/ce_loss": 0.432742714881897 + }, + { + "epoch": 1.3773976666007515, + "step": 13931, + "train/sim_loss": 0.05934441089630127 + }, + { + "epoch": 1.3773976666007515, + "step": 13931, + "train/total_loss": 0.10261867940425873 + }, + { + "entropy": 9.346695899963379, + "epoch": 1.377496539450267, + "mean_token_accuracy": 0.8623853325843811, + "num_tokens": 24236986.0, + "step": 13932, + "train/ce_loss": 0.17061109840869904 + }, + { + "epoch": 1.377496539450267, + "step": 13932, + "train/sim_loss": 0.016151785850524902 + }, + { + "epoch": 1.377496539450267, + "step": 13932, + "train/total_loss": 0.033212896436452866 + }, + { + "entropy": 9.240543365478516, + "epoch": 1.3775954122997824, + "mean_token_accuracy": 0.8133848309516907, + "num_tokens": 24254806.0, + "step": 13933, + "train/ce_loss": 0.6171711683273315 + }, + { + "epoch": 1.3775954122997824, + "step": 13933, + "train/sim_loss": 0.03143829107284546 + }, + { + "epoch": 1.3775954122997824, + "step": 13933, + "train/total_loss": 0.09315541386604309 + }, + { + "entropy": 9.103118896484375, + "epoch": 1.377694285149298, + "mean_token_accuracy": 0.8528610467910767, + "num_tokens": 24266360.0, + "step": 13934, + "train/ce_loss": 0.6123046278953552 + }, + { + "epoch": 1.377694285149298, + "step": 13934, + "train/sim_loss": 0.06892436742782593 + }, + { + "epoch": 1.377694285149298, + "step": 13934, + "train/total_loss": 0.1301548331975937 + }, + { + "entropy": 9.600377082824707, + "epoch": 1.3777931579988136, + "mean_token_accuracy": 0.860832154750824, + "num_tokens": 24279667.0, + "step": 13935, + "train/ce_loss": 0.4007144868373871 + }, + { + "epoch": 1.3777931579988136, + "step": 13935, + "train/sim_loss": 0.040634751319885254 + }, + { + "epoch": 1.3777931579988136, + "step": 13935, + "train/total_loss": 0.08070620149374008 + }, + { + "entropy": 9.082670211791992, + "epoch": 1.3778920308483291, + "mean_token_accuracy": 0.8536880016326904, + "num_tokens": 24291199.0, + "step": 13936, + "train/ce_loss": 0.20365990698337555 + }, + { + "epoch": 1.3778920308483291, + "step": 13936, + "train/sim_loss": 0.018965601921081543 + }, + { + "epoch": 1.3778920308483291, + "step": 13936, + "train/total_loss": 0.0393315926194191 + }, + { + "entropy": 9.592559814453125, + "epoch": 1.3779909036978446, + "mean_token_accuracy": 0.9068047404289246, + "num_tokens": 24302935.0, + "step": 13937, + "train/ce_loss": 1.6153110493632994e-07 + }, + { + "epoch": 1.3779909036978446, + "step": 13937, + "train/sim_loss": 0.01793515682220459 + }, + { + "epoch": 1.3779909036978446, + "step": 13937, + "train/total_loss": 0.017935173586010933 + }, + { + "entropy": 9.751811981201172, + "epoch": 1.37808977654736, + "mean_token_accuracy": 0.8080939650535583, + "num_tokens": 24313488.0, + "step": 13938, + "train/ce_loss": 0.19907835125923157 + }, + { + "epoch": 1.37808977654736, + "step": 13938, + "train/sim_loss": 0.03594022989273071 + }, + { + "epoch": 1.37808977654736, + "step": 13938, + "train/total_loss": 0.05584806576371193 + }, + { + "entropy": 9.777763366699219, + "epoch": 1.3781886493968756, + "mean_token_accuracy": 0.8362069129943848, + "num_tokens": 24323632.0, + "step": 13939, + "train/ce_loss": 0.3024597764015198 + }, + { + "epoch": 1.3781886493968756, + "step": 13939, + "train/sim_loss": 0.015307247638702393 + }, + { + "epoch": 1.3781886493968756, + "step": 13939, + "train/total_loss": 0.04555322602391243 + }, + { + "epoch": 1.378287522246391, + "grad_norm": 0.5356742143630981, + "learning_rate": 6.5561489393265096e-06, + "loss": 0.085, + "step": 13940 + }, + { + "entropy": 9.605423927307129, + "epoch": 1.378287522246391, + "mean_token_accuracy": 0.8461538553237915, + "num_tokens": 24337736.0, + "step": 13940, + "train/ce_loss": 0.41716501116752625 + }, + { + "epoch": 1.378287522246391, + "step": 13940, + "train/sim_loss": 0.0203513503074646 + }, + { + "epoch": 1.378287522246391, + "step": 13940, + "train/total_loss": 0.062067851424217224 + }, + { + "entropy": 9.460667610168457, + "epoch": 1.3783863950959065, + "mean_token_accuracy": 0.8281829357147217, + "num_tokens": 24349003.0, + "step": 13941, + "train/ce_loss": 0.5912578701972961 + }, + { + "epoch": 1.3783863950959065, + "step": 13941, + "train/sim_loss": 0.02574169635772705 + }, + { + "epoch": 1.3783863950959065, + "step": 13941, + "train/total_loss": 0.08486748486757278 + }, + { + "entropy": 9.625324249267578, + "epoch": 1.3784852679454223, + "mean_token_accuracy": 0.8526703715324402, + "num_tokens": 24357372.0, + "step": 13942, + "train/ce_loss": 2.01864963855769e-06 + }, + { + "epoch": 1.3784852679454223, + "step": 13942, + "train/sim_loss": 0.043318867683410645 + }, + { + "epoch": 1.3784852679454223, + "step": 13942, + "train/total_loss": 0.04331906884908676 + }, + { + "entropy": 9.703758239746094, + "epoch": 1.3785841407949377, + "mean_token_accuracy": 0.8556700944900513, + "num_tokens": 24374802.0, + "step": 13943, + "train/ce_loss": 0.2547847330570221 + }, + { + "epoch": 1.3785841407949377, + "step": 13943, + "train/sim_loss": 0.13040035963058472 + }, + { + "epoch": 1.3785841407949377, + "step": 13943, + "train/total_loss": 0.15587882697582245 + }, + { + "entropy": 9.515959739685059, + "epoch": 1.3786830136444532, + "mean_token_accuracy": 0.7988636493682861, + "num_tokens": 24384055.0, + "step": 13944, + "train/ce_loss": 0.5653519630432129 + }, + { + "epoch": 1.3786830136444532, + "step": 13944, + "train/sim_loss": 0.03519797325134277 + }, + { + "epoch": 1.3786830136444532, + "step": 13944, + "train/total_loss": 0.0917331725358963 + }, + { + "entropy": 9.178869247436523, + "epoch": 1.3787818864939687, + "mean_token_accuracy": 0.828262984752655, + "num_tokens": 24395591.0, + "step": 13945, + "train/ce_loss": 0.532359778881073 + }, + { + "epoch": 1.3787818864939687, + "step": 13945, + "train/sim_loss": 0.029424071311950684 + }, + { + "epoch": 1.3787818864939687, + "step": 13945, + "train/total_loss": 0.08266004920005798 + }, + { + "entropy": 9.548254013061523, + "epoch": 1.3788807593434842, + "mean_token_accuracy": 0.8647798895835876, + "num_tokens": 24402889.0, + "step": 13946, + "train/ce_loss": 3.126402248199156e-07 + }, + { + "epoch": 1.3788807593434842, + "step": 13946, + "train/sim_loss": 0.013827800750732422 + }, + { + "epoch": 1.3788807593434842, + "step": 13946, + "train/total_loss": 0.013827832415699959 + }, + { + "entropy": 9.651055335998535, + "epoch": 1.378979632193, + "mean_token_accuracy": 0.8146666884422302, + "num_tokens": 24413969.0, + "step": 13947, + "train/ce_loss": 0.7344833016395569 + }, + { + "epoch": 1.378979632193, + "step": 13947, + "train/sim_loss": 0.04426783323287964 + }, + { + "epoch": 1.378979632193, + "step": 13947, + "train/total_loss": 0.11771616339683533 + }, + { + "entropy": 9.316275596618652, + "epoch": 1.3790785050425154, + "mean_token_accuracy": 0.8421733379364014, + "num_tokens": 24427306.0, + "step": 13948, + "train/ce_loss": 0.8433458805084229 + }, + { + "epoch": 1.3790785050425154, + "step": 13948, + "train/sim_loss": 0.05987751483917236 + }, + { + "epoch": 1.3790785050425154, + "step": 13948, + "train/total_loss": 0.14421209692955017 + }, + { + "entropy": 9.658449172973633, + "epoch": 1.3791773778920309, + "mean_token_accuracy": 0.8171683549880981, + "num_tokens": 24445174.0, + "step": 13949, + "train/ce_loss": 0.3563327491283417 + }, + { + "epoch": 1.3791773778920309, + "step": 13949, + "train/sim_loss": 0.03467607498168945 + }, + { + "epoch": 1.3791773778920309, + "step": 13949, + "train/total_loss": 0.0703093558549881 + }, + { + "entropy": 9.662166595458984, + "epoch": 1.3792762507415464, + "mean_token_accuracy": 0.8539007306098938, + "num_tokens": 24456538.0, + "step": 13950, + "train/ce_loss": 0.4726163446903229 + }, + { + "epoch": 1.3792762507415464, + "step": 13950, + "train/sim_loss": 0.10309648513793945 + }, + { + "epoch": 1.3792762507415464, + "step": 13950, + "train/total_loss": 0.15035812556743622 + }, + { + "entropy": 8.626102447509766, + "epoch": 1.3793751235910618, + "mean_token_accuracy": 0.8871415257453918, + "num_tokens": 24465518.0, + "step": 13951, + "train/ce_loss": 0.40115946531295776 + }, + { + "epoch": 1.3793751235910618, + "step": 13951, + "train/sim_loss": 0.07675635814666748 + }, + { + "epoch": 1.3793751235910618, + "step": 13951, + "train/total_loss": 0.11687231063842773 + }, + { + "entropy": 9.159401893615723, + "epoch": 1.3794739964405776, + "mean_token_accuracy": 0.8613975048065186, + "num_tokens": 24483479.0, + "step": 13952, + "train/ce_loss": 0.5351526737213135 + }, + { + "epoch": 1.3794739964405776, + "step": 13952, + "train/sim_loss": 0.028619110584259033 + }, + { + "epoch": 1.3794739964405776, + "step": 13952, + "train/total_loss": 0.08213438093662262 + }, + { + "entropy": 9.411161422729492, + "epoch": 1.3795728692900928, + "mean_token_accuracy": 0.8382165431976318, + "num_tokens": 24494088.0, + "step": 13953, + "train/ce_loss": 0.47281596064567566 + }, + { + "epoch": 1.3795728692900928, + "step": 13953, + "train/sim_loss": 0.026639282703399658 + }, + { + "epoch": 1.3795728692900928, + "step": 13953, + "train/total_loss": 0.07392087578773499 + }, + { + "entropy": 9.27473258972168, + "epoch": 1.3796717421396085, + "mean_token_accuracy": 0.7527352571487427, + "num_tokens": 24503123.0, + "step": 13954, + "train/ce_loss": 0.8293408751487732 + }, + { + "epoch": 1.3796717421396085, + "step": 13954, + "train/sim_loss": 0.06826651096343994 + }, + { + "epoch": 1.3796717421396085, + "step": 13954, + "train/total_loss": 0.15120059251785278 + }, + { + "entropy": 8.702776908874512, + "epoch": 1.379770614989124, + "mean_token_accuracy": 0.85617595911026, + "num_tokens": 24518368.0, + "step": 13955, + "train/ce_loss": 0.2924647331237793 + }, + { + "epoch": 1.379770614989124, + "step": 13955, + "train/sim_loss": 0.01953721046447754 + }, + { + "epoch": 1.379770614989124, + "step": 13955, + "train/total_loss": 0.04878368228673935 + }, + { + "entropy": 9.600364685058594, + "epoch": 1.3798694878386395, + "mean_token_accuracy": 0.837027370929718, + "num_tokens": 24539034.0, + "step": 13956, + "train/ce_loss": 0.7065325975418091 + }, + { + "epoch": 1.3798694878386395, + "step": 13956, + "train/sim_loss": 0.04092049598693848 + }, + { + "epoch": 1.3798694878386395, + "step": 13956, + "train/total_loss": 0.11157375574111938 + }, + { + "entropy": 8.896234512329102, + "epoch": 1.379968360688155, + "mean_token_accuracy": 0.8700565099716187, + "num_tokens": 24545840.0, + "step": 13957, + "train/ce_loss": 0.4211970865726471 + }, + { + "epoch": 1.379968360688155, + "step": 13957, + "train/sim_loss": 0.05045616626739502 + }, + { + "epoch": 1.379968360688155, + "step": 13957, + "train/total_loss": 0.09257587790489197 + }, + { + "entropy": 8.951017379760742, + "epoch": 1.3800672335376705, + "mean_token_accuracy": 0.8079096078872681, + "num_tokens": 24552387.0, + "step": 13958, + "train/ce_loss": 0.682729184627533 + }, + { + "epoch": 1.3800672335376705, + "step": 13958, + "train/sim_loss": 0.04528486728668213 + }, + { + "epoch": 1.3800672335376705, + "step": 13958, + "train/total_loss": 0.11355778574943542 + }, + { + "entropy": 9.363370895385742, + "epoch": 1.3801661063871862, + "mean_token_accuracy": 0.8687499761581421, + "num_tokens": 24565155.0, + "step": 13959, + "train/ce_loss": 0.40528279542922974 + }, + { + "epoch": 1.3801661063871862, + "step": 13959, + "train/sim_loss": 0.03563332557678223 + }, + { + "epoch": 1.3801661063871862, + "step": 13959, + "train/total_loss": 0.07616160809993744 + }, + { + "epoch": 1.3802649792367017, + "grad_norm": 0.5601577758789062, + "learning_rate": 6.551204074568561e-06, + "loss": 0.0899, + "step": 13960 + }, + { + "entropy": 9.208361625671387, + "epoch": 1.3802649792367017, + "mean_token_accuracy": 0.8745046257972717, + "num_tokens": 24577612.0, + "step": 13960, + "train/ce_loss": 0.18961280584335327 + }, + { + "epoch": 1.3802649792367017, + "step": 13960, + "train/sim_loss": 0.03767871856689453 + }, + { + "epoch": 1.3802649792367017, + "step": 13960, + "train/total_loss": 0.05663999915122986 + }, + { + "entropy": 8.88204574584961, + "epoch": 1.3803638520862171, + "mean_token_accuracy": 0.8047847151756287, + "num_tokens": 24585793.0, + "step": 13961, + "train/ce_loss": 0.48044800758361816 + }, + { + "epoch": 1.3803638520862171, + "step": 13961, + "train/sim_loss": 0.017166197299957275 + }, + { + "epoch": 1.3803638520862171, + "step": 13961, + "train/total_loss": 0.06521099805831909 + }, + { + "entropy": 9.428874969482422, + "epoch": 1.3804627249357326, + "mean_token_accuracy": 0.8470728993415833, + "num_tokens": 24603224.0, + "step": 13962, + "train/ce_loss": 0.5682137608528137 + }, + { + "epoch": 1.3804627249357326, + "step": 13962, + "train/sim_loss": 0.06026589870452881 + }, + { + "epoch": 1.3804627249357326, + "step": 13962, + "train/total_loss": 0.11708727478981018 + }, + { + "entropy": 9.057958602905273, + "epoch": 1.3805615977852481, + "mean_token_accuracy": 0.8435294032096863, + "num_tokens": 24618147.0, + "step": 13963, + "train/ce_loss": 0.2648930549621582 + }, + { + "epoch": 1.3805615977852481, + "step": 13963, + "train/sim_loss": 0.019452333450317383 + }, + { + "epoch": 1.3805615977852481, + "step": 13963, + "train/total_loss": 0.04594163969159126 + }, + { + "entropy": 9.29734992980957, + "epoch": 1.3806604706347638, + "mean_token_accuracy": 0.8186638355255127, + "num_tokens": 24631219.0, + "step": 13964, + "train/ce_loss": 0.3625255525112152 + }, + { + "epoch": 1.3806604706347638, + "step": 13964, + "train/sim_loss": 0.012954175472259521 + }, + { + "epoch": 1.3806604706347638, + "step": 13964, + "train/total_loss": 0.04920672997832298 + }, + { + "entropy": 9.278820037841797, + "epoch": 1.380759343484279, + "mean_token_accuracy": 0.8587896227836609, + "num_tokens": 24644079.0, + "step": 13965, + "train/ce_loss": 0.26741668581962585 + }, + { + "epoch": 1.380759343484279, + "step": 13965, + "train/sim_loss": 0.02462548017501831 + }, + { + "epoch": 1.380759343484279, + "step": 13965, + "train/total_loss": 0.051367148756980896 + }, + { + "entropy": 9.514371871948242, + "epoch": 1.3808582163337948, + "mean_token_accuracy": 0.863539457321167, + "num_tokens": 24656152.0, + "step": 13966, + "train/ce_loss": 9.871872634903411e-07 + }, + { + "epoch": 1.3808582163337948, + "step": 13966, + "train/sim_loss": 0.07532918453216553 + }, + { + "epoch": 1.3808582163337948, + "step": 13966, + "train/total_loss": 0.07532928138971329 + }, + { + "entropy": 9.261968612670898, + "epoch": 1.3809570891833103, + "mean_token_accuracy": 0.8550488352775574, + "num_tokens": 24665537.0, + "step": 13967, + "train/ce_loss": 0.6481295228004456 + }, + { + "epoch": 1.3809570891833103, + "step": 13967, + "train/sim_loss": 0.061447978019714355 + }, + { + "epoch": 1.3809570891833103, + "step": 13967, + "train/total_loss": 0.1262609362602234 + }, + { + "entropy": 9.542393684387207, + "epoch": 1.3810559620328258, + "mean_token_accuracy": 0.8783314228057861, + "num_tokens": 24684413.0, + "step": 13968, + "train/ce_loss": 0.31097936630249023 + }, + { + "epoch": 1.3810559620328258, + "step": 13968, + "train/sim_loss": 0.03506582975387573 + }, + { + "epoch": 1.3810559620328258, + "step": 13968, + "train/total_loss": 0.06616376340389252 + }, + { + "entropy": 9.124422073364258, + "epoch": 1.3811548348823413, + "mean_token_accuracy": 0.8317025303840637, + "num_tokens": 24694355.0, + "step": 13969, + "train/ce_loss": 0.8262174129486084 + }, + { + "epoch": 1.3811548348823413, + "step": 13969, + "train/sim_loss": 0.11357623338699341 + }, + { + "epoch": 1.3811548348823413, + "step": 13969, + "train/total_loss": 0.1961979866027832 + }, + { + "entropy": 10.007556915283203, + "epoch": 1.3812537077318567, + "mean_token_accuracy": 0.9166666865348816, + "num_tokens": 24708971.0, + "step": 13970, + "train/ce_loss": 1.1620887517929077 + }, + { + "epoch": 1.3812537077318567, + "step": 13970, + "train/sim_loss": 0.05059230327606201 + }, + { + "epoch": 1.3812537077318567, + "step": 13970, + "train/total_loss": 0.16680118441581726 + }, + { + "entropy": 9.609212875366211, + "epoch": 1.3813525805813724, + "mean_token_accuracy": 0.9074491858482361, + "num_tokens": 24724959.0, + "step": 13971, + "train/ce_loss": 0.2791804373264313 + }, + { + "epoch": 1.3813525805813724, + "step": 13971, + "train/sim_loss": 0.028432726860046387 + }, + { + "epoch": 1.3813525805813724, + "step": 13971, + "train/total_loss": 0.056350771337747574 + }, + { + "entropy": 8.759716987609863, + "epoch": 1.381451453430888, + "mean_token_accuracy": 0.898876428604126, + "num_tokens": 24735262.0, + "step": 13972, + "train/ce_loss": 0.31972265243530273 + }, + { + "epoch": 1.381451453430888, + "step": 13972, + "train/sim_loss": 0.016495108604431152 + }, + { + "epoch": 1.381451453430888, + "step": 13972, + "train/total_loss": 0.048467375338077545 + }, + { + "entropy": 9.407781600952148, + "epoch": 1.3815503262804034, + "mean_token_accuracy": 0.8451749682426453, + "num_tokens": 24746179.0, + "step": 13973, + "train/ce_loss": 0.2860535979270935 + }, + { + "epoch": 1.3815503262804034, + "step": 13973, + "train/sim_loss": 0.03618597984313965 + }, + { + "epoch": 1.3815503262804034, + "step": 13973, + "train/total_loss": 0.06479133665561676 + }, + { + "entropy": 9.270545959472656, + "epoch": 1.381649199129919, + "mean_token_accuracy": 0.856965184211731, + "num_tokens": 24761415.0, + "step": 13974, + "train/ce_loss": 0.33775991201400757 + }, + { + "epoch": 1.381649199129919, + "step": 13974, + "train/sim_loss": 0.048484086990356445 + }, + { + "epoch": 1.381649199129919, + "step": 13974, + "train/total_loss": 0.08226007968187332 + }, + { + "entropy": 9.261489868164062, + "epoch": 1.3817480719794344, + "mean_token_accuracy": 0.8680444955825806, + "num_tokens": 24778587.0, + "step": 13975, + "train/ce_loss": 0.6420547366142273 + }, + { + "epoch": 1.3817480719794344, + "step": 13975, + "train/sim_loss": 0.048647940158843994 + }, + { + "epoch": 1.3817480719794344, + "step": 13975, + "train/total_loss": 0.11285341531038284 + }, + { + "entropy": 9.454740524291992, + "epoch": 1.38184694482895, + "mean_token_accuracy": 0.898888885974884, + "num_tokens": 24796027.0, + "step": 13976, + "train/ce_loss": 0.364955872297287 + }, + { + "epoch": 1.38184694482895, + "step": 13976, + "train/sim_loss": 0.05308091640472412 + }, + { + "epoch": 1.38184694482895, + "step": 13976, + "train/total_loss": 0.08957650512456894 + }, + { + "entropy": 9.540069580078125, + "epoch": 1.3819458176784654, + "mean_token_accuracy": 0.8444444537162781, + "num_tokens": 24811535.0, + "step": 13977, + "train/ce_loss": 0.910991907119751 + }, + { + "epoch": 1.3819458176784654, + "step": 13977, + "train/sim_loss": 0.09313368797302246 + }, + { + "epoch": 1.3819458176784654, + "step": 13977, + "train/total_loss": 0.1842328906059265 + }, + { + "entropy": 8.7493314743042, + "epoch": 1.382044690527981, + "mean_token_accuracy": 0.8218527436256409, + "num_tokens": 24820105.0, + "step": 13978, + "train/ce_loss": 0.7417550086975098 + }, + { + "epoch": 1.382044690527981, + "step": 13978, + "train/sim_loss": 0.05584216117858887 + }, + { + "epoch": 1.382044690527981, + "step": 13978, + "train/total_loss": 0.13001766800880432 + }, + { + "entropy": 9.076089859008789, + "epoch": 1.3821435633774966, + "mean_token_accuracy": 0.8338398933410645, + "num_tokens": 24831963.0, + "step": 13979, + "train/ce_loss": 0.33969739079475403 + }, + { + "epoch": 1.3821435633774966, + "step": 13979, + "train/sim_loss": 0.0860905647277832 + }, + { + "epoch": 1.3821435633774966, + "step": 13979, + "train/total_loss": 0.12006030976772308 + }, + { + "epoch": 1.382242436227012, + "grad_norm": 0.5435335636138916, + "learning_rate": 6.546259209810612e-06, + "loss": 0.0841, + "step": 13980 + }, + { + "entropy": 9.213099479675293, + "epoch": 1.382242436227012, + "mean_token_accuracy": 0.8413098454475403, + "num_tokens": 24842350.0, + "step": 13980, + "train/ce_loss": 0.41887813806533813 + }, + { + "epoch": 1.382242436227012, + "step": 13980, + "train/sim_loss": 0.06337672472000122 + }, + { + "epoch": 1.382242436227012, + "step": 13980, + "train/total_loss": 0.10526454448699951 + }, + { + "entropy": 8.534372329711914, + "epoch": 1.3823413090765275, + "mean_token_accuracy": 0.8390532732009888, + "num_tokens": 24856303.0, + "step": 13981, + "train/ce_loss": 0.37684813141822815 + }, + { + "epoch": 1.3823413090765275, + "step": 13981, + "train/sim_loss": 0.044118642807006836 + }, + { + "epoch": 1.3823413090765275, + "step": 13981, + "train/total_loss": 0.08180345594882965 + }, + { + "entropy": 9.700693130493164, + "epoch": 1.382440181926043, + "mean_token_accuracy": 0.8742514848709106, + "num_tokens": 24863851.0, + "step": 13982, + "train/ce_loss": 2.579281499492936e-07 + }, + { + "epoch": 1.382440181926043, + "step": 13982, + "train/sim_loss": 0.020282089710235596 + }, + { + "epoch": 1.382440181926043, + "step": 13982, + "train/total_loss": 0.020282115787267685 + }, + { + "entropy": 9.503582954406738, + "epoch": 1.3825390547755587, + "mean_token_accuracy": 0.8761467933654785, + "num_tokens": 24878969.0, + "step": 13983, + "train/ce_loss": 0.19360339641571045 + }, + { + "epoch": 1.3825390547755587, + "step": 13983, + "train/sim_loss": 0.047195613384246826 + }, + { + "epoch": 1.3825390547755587, + "step": 13983, + "train/total_loss": 0.06655595451593399 + }, + { + "entropy": 9.330087661743164, + "epoch": 1.3826379276250742, + "mean_token_accuracy": 0.8441558480262756, + "num_tokens": 24896232.0, + "step": 13984, + "train/ce_loss": 0.45729678869247437 + }, + { + "epoch": 1.3826379276250742, + "step": 13984, + "train/sim_loss": 0.022771477699279785 + }, + { + "epoch": 1.3826379276250742, + "step": 13984, + "train/total_loss": 0.06850115954875946 + }, + { + "entropy": 9.581954956054688, + "epoch": 1.3827368004745897, + "mean_token_accuracy": 0.8815789222717285, + "num_tokens": 24909764.0, + "step": 13985, + "train/ce_loss": 0.3952173590660095 + }, + { + "epoch": 1.3827368004745897, + "step": 13985, + "train/sim_loss": 0.07204341888427734 + }, + { + "epoch": 1.3827368004745897, + "step": 13985, + "train/total_loss": 0.11156515777111053 + }, + { + "entropy": 9.401514053344727, + "epoch": 1.3828356733241052, + "mean_token_accuracy": 0.8357142806053162, + "num_tokens": 24928405.0, + "step": 13986, + "train/ce_loss": 0.5011827945709229 + }, + { + "epoch": 1.3828356733241052, + "step": 13986, + "train/sim_loss": 0.02591043710708618 + }, + { + "epoch": 1.3828356733241052, + "step": 13986, + "train/total_loss": 0.0760287195444107 + }, + { + "entropy": 9.357701301574707, + "epoch": 1.3829345461736207, + "mean_token_accuracy": 0.7961711883544922, + "num_tokens": 24938057.0, + "step": 13987, + "train/ce_loss": 0.651103138923645 + }, + { + "epoch": 1.3829345461736207, + "step": 13987, + "train/sim_loss": 0.09391152858734131 + }, + { + "epoch": 1.3829345461736207, + "step": 13987, + "train/total_loss": 0.15902185440063477 + }, + { + "entropy": 9.2534818649292, + "epoch": 1.3830334190231364, + "mean_token_accuracy": 0.8456375598907471, + "num_tokens": 24952088.0, + "step": 13988, + "train/ce_loss": 0.4825076460838318 + }, + { + "epoch": 1.3830334190231364, + "step": 13988, + "train/sim_loss": 0.030556023120880127 + }, + { + "epoch": 1.3830334190231364, + "step": 13988, + "train/total_loss": 0.0788067877292633 + }, + { + "entropy": 8.766721725463867, + "epoch": 1.3831322918726516, + "mean_token_accuracy": 0.8260869383811951, + "num_tokens": 24961227.0, + "step": 13989, + "train/ce_loss": 0.39475229382514954 + }, + { + "epoch": 1.3831322918726516, + "step": 13989, + "train/sim_loss": 0.04077470302581787 + }, + { + "epoch": 1.3831322918726516, + "step": 13989, + "train/total_loss": 0.08024993538856506 + }, + { + "entropy": 9.92740249633789, + "epoch": 1.3832311647221673, + "mean_token_accuracy": 0.9086161851882935, + "num_tokens": 24972237.0, + "step": 13990, + "train/ce_loss": 8.814401439849462e-07 + }, + { + "epoch": 1.3832311647221673, + "step": 13990, + "train/sim_loss": 0.0408705472946167 + }, + { + "epoch": 1.3832311647221673, + "step": 13990, + "train/total_loss": 0.04087063670158386 + }, + { + "entropy": 9.749592781066895, + "epoch": 1.3833300375716828, + "mean_token_accuracy": 0.9108911156654358, + "num_tokens": 24984014.0, + "step": 13991, + "train/ce_loss": 3.6600577004719526e-06 + }, + { + "epoch": 1.3833300375716828, + "step": 13991, + "train/sim_loss": 0.037566184997558594 + }, + { + "epoch": 1.3833300375716828, + "step": 13991, + "train/total_loss": 0.03756655007600784 + }, + { + "entropy": 9.635738372802734, + "epoch": 1.3834289104211983, + "mean_token_accuracy": 0.9186256527900696, + "num_tokens": 25001609.0, + "step": 13992, + "train/ce_loss": 0.27938416600227356 + }, + { + "epoch": 1.3834289104211983, + "step": 13992, + "train/sim_loss": 0.09388279914855957 + }, + { + "epoch": 1.3834289104211983, + "step": 13992, + "train/total_loss": 0.12182121723890305 + }, + { + "entropy": 9.131599426269531, + "epoch": 1.3835277832707138, + "mean_token_accuracy": 0.8538461327552795, + "num_tokens": 25016192.0, + "step": 13993, + "train/ce_loss": 0.5029550790786743 + }, + { + "epoch": 1.3835277832707138, + "step": 13993, + "train/sim_loss": 0.15482890605926514 + }, + { + "epoch": 1.3835277832707138, + "step": 13993, + "train/total_loss": 0.2051244080066681 + }, + { + "entropy": 9.683431625366211, + "epoch": 1.3836266561202293, + "mean_token_accuracy": 0.882539689540863, + "num_tokens": 25027667.0, + "step": 13994, + "train/ce_loss": 1.5273279814209673e-06 + }, + { + "epoch": 1.3836266561202293, + "step": 13994, + "train/sim_loss": 0.025699257850646973 + }, + { + "epoch": 1.3836266561202293, + "step": 13994, + "train/total_loss": 0.02569941058754921 + }, + { + "entropy": 9.299657821655273, + "epoch": 1.383725528969745, + "mean_token_accuracy": 0.8186274766921997, + "num_tokens": 25039840.0, + "step": 13995, + "train/ce_loss": 0.6369435787200928 + }, + { + "epoch": 1.383725528969745, + "step": 13995, + "train/sim_loss": 0.04391920566558838 + }, + { + "epoch": 1.383725528969745, + "step": 13995, + "train/total_loss": 0.10761356353759766 + }, + { + "entropy": 8.842829704284668, + "epoch": 1.3838244018192605, + "mean_token_accuracy": 0.8895348906517029, + "num_tokens": 25047854.0, + "step": 13996, + "train/ce_loss": 1.2280310102141812e-06 + }, + { + "epoch": 1.3838244018192605, + "step": 13996, + "train/sim_loss": 0.04753124713897705 + }, + { + "epoch": 1.3838244018192605, + "step": 13996, + "train/total_loss": 0.0475313700735569 + }, + { + "entropy": 9.45857048034668, + "epoch": 1.383923274668776, + "mean_token_accuracy": 0.8611111044883728, + "num_tokens": 25064354.0, + "step": 13997, + "train/ce_loss": 0.22842444479465485 + }, + { + "epoch": 1.383923274668776, + "step": 13997, + "train/sim_loss": 0.06466370820999146 + }, + { + "epoch": 1.383923274668776, + "step": 13997, + "train/total_loss": 0.08750615268945694 + }, + { + "entropy": 9.073253631591797, + "epoch": 1.3840221475182914, + "mean_token_accuracy": 0.8625393509864807, + "num_tokens": 25080447.0, + "step": 13998, + "train/ce_loss": 0.46598201990127563 + }, + { + "epoch": 1.3840221475182914, + "step": 13998, + "train/sim_loss": 0.01157844066619873 + }, + { + "epoch": 1.3840221475182914, + "step": 13998, + "train/total_loss": 0.05817664414644241 + }, + { + "entropy": 8.988044738769531, + "epoch": 1.384121020367807, + "mean_token_accuracy": 0.8172972798347473, + "num_tokens": 25088104.0, + "step": 13999, + "train/ce_loss": 0.33954450488090515 + }, + { + "epoch": 1.384121020367807, + "step": 13999, + "train/sim_loss": 0.017025113105773926 + }, + { + "epoch": 1.384121020367807, + "step": 13999, + "train/total_loss": 0.05097956582903862 + }, + { + "epoch": 1.3842198932173226, + "grad_norm": 0.4969352185726166, + "learning_rate": 6.541314345052663e-06, + "loss": 0.0898, + "step": 14000 + }, + { + "entropy": 9.75169849395752, + "epoch": 1.3842198932173226, + "mean_token_accuracy": 0.8854680061340332, + "num_tokens": 25099128.0, + "step": 14000, + "train/ce_loss": 2.1199079469624849e-07 + }, + { + "epoch": 1.3842198932173226, + "step": 14000, + "train/sim_loss": 0.011401474475860596 + }, + { + "epoch": 1.3842198932173226, + "step": 14000, + "train/total_loss": 0.011401495896279812 + }, + { + "entropy": 9.10491943359375, + "epoch": 1.3843187660668381, + "mean_token_accuracy": 0.8685120940208435, + "num_tokens": 25110076.0, + "step": 14001, + "train/ce_loss": 0.17699788510799408 + }, + { + "epoch": 1.3843187660668381, + "step": 14001, + "train/sim_loss": 0.04452919960021973 + }, + { + "epoch": 1.3843187660668381, + "step": 14001, + "train/total_loss": 0.062228988856077194 + }, + { + "entropy": 9.679015159606934, + "epoch": 1.3844176389163536, + "mean_token_accuracy": 0.842293918132782, + "num_tokens": 25129936.0, + "step": 14002, + "train/ce_loss": 4.5461368358701293e-07 + }, + { + "epoch": 1.3844176389163536, + "step": 14002, + "train/sim_loss": 0.050757646560668945 + }, + { + "epoch": 1.3844176389163536, + "step": 14002, + "train/total_loss": 0.05075769126415253 + }, + { + "entropy": 9.483434677124023, + "epoch": 1.384516511765869, + "mean_token_accuracy": 0.8841871023178101, + "num_tokens": 25143040.0, + "step": 14003, + "train/ce_loss": 0.4275735020637512 + }, + { + "epoch": 1.384516511765869, + "step": 14003, + "train/sim_loss": 0.042192935943603516 + }, + { + "epoch": 1.384516511765869, + "step": 14003, + "train/total_loss": 0.0849502831697464 + }, + { + "entropy": 9.572535514831543, + "epoch": 1.3846153846153846, + "mean_token_accuracy": 0.8704783916473389, + "num_tokens": 25157576.0, + "step": 14004, + "train/ce_loss": 0.4864366948604584 + }, + { + "epoch": 1.3846153846153846, + "step": 14004, + "train/sim_loss": 0.050843119621276855 + }, + { + "epoch": 1.3846153846153846, + "step": 14004, + "train/total_loss": 0.09948679059743881 + }, + { + "entropy": 9.373445510864258, + "epoch": 1.3847142574649, + "mean_token_accuracy": 0.8592493534088135, + "num_tokens": 25168781.0, + "step": 14005, + "train/ce_loss": 0.4306119382381439 + }, + { + "epoch": 1.3847142574649, + "step": 14005, + "train/sim_loss": 0.046961188316345215 + }, + { + "epoch": 1.3847142574649, + "step": 14005, + "train/total_loss": 0.09002238512039185 + }, + { + "entropy": 9.590492248535156, + "epoch": 1.3848131303144156, + "mean_token_accuracy": 0.8218859434127808, + "num_tokens": 25187969.0, + "step": 14006, + "train/ce_loss": 2.831018832694099e-07 + }, + { + "epoch": 1.3848131303144156, + "step": 14006, + "train/sim_loss": 0.030978798866271973 + }, + { + "epoch": 1.3848131303144156, + "step": 14006, + "train/total_loss": 0.03097882680594921 + }, + { + "entropy": 9.954916000366211, + "epoch": 1.3849120031639313, + "mean_token_accuracy": 0.895061731338501, + "num_tokens": 25198322.0, + "step": 14007, + "train/ce_loss": 0.6503703594207764 + }, + { + "epoch": 1.3849120031639313, + "step": 14007, + "train/sim_loss": 0.03550088405609131 + }, + { + "epoch": 1.3849120031639313, + "step": 14007, + "train/total_loss": 0.10053791850805283 + }, + { + "entropy": 9.833845138549805, + "epoch": 1.3850108760134467, + "mean_token_accuracy": 0.8937381505966187, + "num_tokens": 25207042.0, + "step": 14008, + "train/ce_loss": 0.6679763197898865 + }, + { + "epoch": 1.3850108760134467, + "step": 14008, + "train/sim_loss": 0.05257296562194824 + }, + { + "epoch": 1.3850108760134467, + "step": 14008, + "train/total_loss": 0.11937060207128525 + }, + { + "entropy": 9.568957328796387, + "epoch": 1.3851097488629622, + "mean_token_accuracy": 0.9319371581077576, + "num_tokens": 25218191.0, + "step": 14009, + "train/ce_loss": 4.2190947624476394e-07 + }, + { + "epoch": 1.3851097488629622, + "step": 14009, + "train/sim_loss": 0.0477178692817688 + }, + { + "epoch": 1.3851097488629622, + "step": 14009, + "train/total_loss": 0.04771791025996208 + }, + { + "entropy": 9.351359367370605, + "epoch": 1.3852086217124777, + "mean_token_accuracy": 0.8210735321044922, + "num_tokens": 25234743.0, + "step": 14010, + "train/ce_loss": 0.40465047955513 + }, + { + "epoch": 1.3852086217124777, + "step": 14010, + "train/sim_loss": 0.029979944229125977 + }, + { + "epoch": 1.3852086217124777, + "step": 14010, + "train/total_loss": 0.0704449936747551 + }, + { + "entropy": 8.996642112731934, + "epoch": 1.3853074945619932, + "mean_token_accuracy": 0.8622754216194153, + "num_tokens": 25250659.0, + "step": 14011, + "train/ce_loss": 0.40548425912857056 + }, + { + "epoch": 1.3853074945619932, + "step": 14011, + "train/sim_loss": 0.01789015531539917 + }, + { + "epoch": 1.3853074945619932, + "step": 14011, + "train/total_loss": 0.058438580483198166 + }, + { + "entropy": 9.667001724243164, + "epoch": 1.385406367411509, + "mean_token_accuracy": 0.8221614360809326, + "num_tokens": 25265625.0, + "step": 14012, + "train/ce_loss": 0.2023642510175705 + }, + { + "epoch": 1.385406367411509, + "step": 14012, + "train/sim_loss": 0.013689398765563965 + }, + { + "epoch": 1.385406367411509, + "step": 14012, + "train/total_loss": 0.033925823867321014 + }, + { + "entropy": 9.617258071899414, + "epoch": 1.3855052402610244, + "mean_token_accuracy": 0.9160104990005493, + "num_tokens": 25281475.0, + "step": 14013, + "train/ce_loss": 0.19414277374744415 + }, + { + "epoch": 1.3855052402610244, + "step": 14013, + "train/sim_loss": 0.01849263906478882 + }, + { + "epoch": 1.3855052402610244, + "step": 14013, + "train/total_loss": 0.037906914949417114 + }, + { + "entropy": 9.432373046875, + "epoch": 1.3856041131105399, + "mean_token_accuracy": 0.8149883151054382, + "num_tokens": 25296094.0, + "step": 14014, + "train/ce_loss": 0.37925097346305847 + }, + { + "epoch": 1.3856041131105399, + "step": 14014, + "train/sim_loss": 0.07155591249465942 + }, + { + "epoch": 1.3856041131105399, + "step": 14014, + "train/total_loss": 0.10948100686073303 + }, + { + "entropy": 9.496255874633789, + "epoch": 1.3857029859600554, + "mean_token_accuracy": 0.8761220574378967, + "num_tokens": 25305824.0, + "step": 14015, + "train/ce_loss": 0.37911567091941833 + }, + { + "epoch": 1.3857029859600554, + "step": 14015, + "train/sim_loss": 0.07154619693756104 + }, + { + "epoch": 1.3857029859600554, + "step": 14015, + "train/total_loss": 0.10945776104927063 + }, + { + "entropy": 9.591556549072266, + "epoch": 1.3858018588095709, + "mean_token_accuracy": 0.857594907283783, + "num_tokens": 25323496.0, + "step": 14016, + "train/ce_loss": 0.5828253626823425 + }, + { + "epoch": 1.3858018588095709, + "step": 14016, + "train/sim_loss": 0.017704486846923828 + }, + { + "epoch": 1.3858018588095709, + "step": 14016, + "train/total_loss": 0.07598702609539032 + }, + { + "entropy": 9.745766639709473, + "epoch": 1.3859007316590863, + "mean_token_accuracy": 0.8309115171432495, + "num_tokens": 25334559.0, + "step": 14017, + "train/ce_loss": 0.4028666317462921 + }, + { + "epoch": 1.3859007316590863, + "step": 14017, + "train/sim_loss": 0.07392251491546631 + }, + { + "epoch": 1.3859007316590863, + "step": 14017, + "train/total_loss": 0.11420917510986328 + }, + { + "entropy": 9.011651992797852, + "epoch": 1.3859996045086018, + "mean_token_accuracy": 0.9278350472450256, + "num_tokens": 25344875.0, + "step": 14018, + "train/ce_loss": 0.188345804810524 + }, + { + "epoch": 1.3859996045086018, + "step": 14018, + "train/sim_loss": 0.028509438037872314 + }, + { + "epoch": 1.3859996045086018, + "step": 14018, + "train/total_loss": 0.04734402149915695 + }, + { + "entropy": 9.675342559814453, + "epoch": 1.3860984773581175, + "mean_token_accuracy": 0.8485342264175415, + "num_tokens": 25357682.0, + "step": 14019, + "train/ce_loss": 0.32185840606689453 + }, + { + "epoch": 1.3860984773581175, + "step": 14019, + "train/sim_loss": 0.04026174545288086 + }, + { + "epoch": 1.3860984773581175, + "step": 14019, + "train/total_loss": 0.07244758307933807 + }, + { + "epoch": 1.386197350207633, + "grad_norm": 0.643006443977356, + "learning_rate": 6.536369480294714e-06, + "loss": 0.0809, + "step": 14020 + }, + { + "entropy": 9.054669380187988, + "epoch": 1.386197350207633, + "mean_token_accuracy": 0.8245398998260498, + "num_tokens": 25369037.0, + "step": 14020, + "train/ce_loss": 0.491157591342926 + }, + { + "epoch": 1.386197350207633, + "step": 14020, + "train/sim_loss": 0.014986038208007812 + }, + { + "epoch": 1.386197350207633, + "step": 14020, + "train/total_loss": 0.06410180032253265 + }, + { + "entropy": 9.58296012878418, + "epoch": 1.3862962230571485, + "mean_token_accuracy": 0.8561403751373291, + "num_tokens": 25381814.0, + "step": 14021, + "train/ce_loss": 1.083272536561708e-06 + }, + { + "epoch": 1.3862962230571485, + "step": 14021, + "train/sim_loss": 0.027234196662902832 + }, + { + "epoch": 1.3862962230571485, + "step": 14021, + "train/total_loss": 0.027234304696321487 + }, + { + "entropy": 9.346487045288086, + "epoch": 1.386395095906664, + "mean_token_accuracy": 0.8042744398117065, + "num_tokens": 25393780.0, + "step": 14022, + "train/ce_loss": 0.8199933767318726 + }, + { + "epoch": 1.386395095906664, + "step": 14022, + "train/sim_loss": 0.12418663501739502 + }, + { + "epoch": 1.386395095906664, + "step": 14022, + "train/total_loss": 0.2061859667301178 + }, + { + "entropy": 9.312490463256836, + "epoch": 1.3864939687561795, + "mean_token_accuracy": 0.821052610874176, + "num_tokens": 25408324.0, + "step": 14023, + "train/ce_loss": 0.8790779709815979 + }, + { + "epoch": 1.3864939687561795, + "step": 14023, + "train/sim_loss": 0.08946788311004639 + }, + { + "epoch": 1.3864939687561795, + "step": 14023, + "train/total_loss": 0.1773756742477417 + }, + { + "entropy": 9.220161437988281, + "epoch": 1.3865928416056952, + "mean_token_accuracy": 0.8987013101577759, + "num_tokens": 25425614.0, + "step": 14024, + "train/ce_loss": 0.6357032060623169 + }, + { + "epoch": 1.3865928416056952, + "step": 14024, + "train/sim_loss": 0.04261314868927002 + }, + { + "epoch": 1.3865928416056952, + "step": 14024, + "train/total_loss": 0.10618346929550171 + }, + { + "entropy": 9.239956855773926, + "epoch": 1.3866917144552107, + "mean_token_accuracy": 0.8655256628990173, + "num_tokens": 25436227.0, + "step": 14025, + "train/ce_loss": 0.45986631512641907 + }, + { + "epoch": 1.3866917144552107, + "step": 14025, + "train/sim_loss": 0.05266404151916504 + }, + { + "epoch": 1.3866917144552107, + "step": 14025, + "train/total_loss": 0.09865067899227142 + }, + { + "entropy": 9.167726516723633, + "epoch": 1.3867905873047262, + "mean_token_accuracy": 0.8085299730300903, + "num_tokens": 25448212.0, + "step": 14026, + "train/ce_loss": 0.7773721218109131 + }, + { + "epoch": 1.3867905873047262, + "step": 14026, + "train/sim_loss": 0.05899554491043091 + }, + { + "epoch": 1.3867905873047262, + "step": 14026, + "train/total_loss": 0.13673275709152222 + }, + { + "entropy": 9.397178649902344, + "epoch": 1.3868894601542416, + "mean_token_accuracy": 0.8717948794364929, + "num_tokens": 25464814.0, + "step": 14027, + "train/ce_loss": 0.45298466086387634 + }, + { + "epoch": 1.3868894601542416, + "step": 14027, + "train/sim_loss": 0.024284720420837402 + }, + { + "epoch": 1.3868894601542416, + "step": 14027, + "train/total_loss": 0.06958319246768951 + }, + { + "entropy": 9.696096420288086, + "epoch": 1.3869883330037571, + "mean_token_accuracy": 0.8436481952667236, + "num_tokens": 25483189.0, + "step": 14028, + "train/ce_loss": 0.16928745806217194 + }, + { + "epoch": 1.3869883330037571, + "step": 14028, + "train/sim_loss": 0.01957613229751587 + }, + { + "epoch": 1.3869883330037571, + "step": 14028, + "train/total_loss": 0.03650487959384918 + }, + { + "entropy": 9.650108337402344, + "epoch": 1.3870872058532726, + "mean_token_accuracy": 0.9322493076324463, + "num_tokens": 25497986.0, + "step": 14029, + "train/ce_loss": 0.15087345242500305 + }, + { + "epoch": 1.3870872058532726, + "step": 14029, + "train/sim_loss": 0.017380595207214355 + }, + { + "epoch": 1.3870872058532726, + "step": 14029, + "train/total_loss": 0.03246793895959854 + }, + { + "entropy": 9.4080228805542, + "epoch": 1.387186078702788, + "mean_token_accuracy": 0.8096446990966797, + "num_tokens": 25512486.0, + "step": 14030, + "train/ce_loss": 0.3807747960090637 + }, + { + "epoch": 1.387186078702788, + "step": 14030, + "train/sim_loss": 0.0959165096282959 + }, + { + "epoch": 1.387186078702788, + "step": 14030, + "train/total_loss": 0.1339939832687378 + }, + { + "entropy": 9.410938262939453, + "epoch": 1.3872849515523038, + "mean_token_accuracy": 0.8498789072036743, + "num_tokens": 25530418.0, + "step": 14031, + "train/ce_loss": 0.47926566004753113 + }, + { + "epoch": 1.3872849515523038, + "step": 14031, + "train/sim_loss": 0.022745132446289062 + }, + { + "epoch": 1.3872849515523038, + "step": 14031, + "train/total_loss": 0.0706716999411583 + }, + { + "entropy": 9.60828685760498, + "epoch": 1.3873838244018193, + "mean_token_accuracy": 0.8894736766815186, + "num_tokens": 25546571.0, + "step": 14032, + "train/ce_loss": 0.2418988198041916 + }, + { + "epoch": 1.3873838244018193, + "step": 14032, + "train/sim_loss": 0.08084940910339355 + }, + { + "epoch": 1.3873838244018193, + "step": 14032, + "train/total_loss": 0.10503929108381271 + }, + { + "entropy": 9.585840225219727, + "epoch": 1.3874826972513348, + "mean_token_accuracy": 0.8806818127632141, + "num_tokens": 25559226.0, + "step": 14033, + "train/ce_loss": 0.54721599817276 + }, + { + "epoch": 1.3874826972513348, + "step": 14033, + "train/sim_loss": 0.06526511907577515 + }, + { + "epoch": 1.3874826972513348, + "step": 14033, + "train/total_loss": 0.11998672038316727 + }, + { + "entropy": 9.340727806091309, + "epoch": 1.3875815701008503, + "mean_token_accuracy": 0.8523676991462708, + "num_tokens": 25568415.0, + "step": 14034, + "train/ce_loss": 0.8156701922416687 + }, + { + "epoch": 1.3875815701008503, + "step": 14034, + "train/sim_loss": 0.062397003173828125 + }, + { + "epoch": 1.3875815701008503, + "step": 14034, + "train/total_loss": 0.143964022397995 + }, + { + "entropy": 8.965322494506836, + "epoch": 1.3876804429503657, + "mean_token_accuracy": 0.8582781553268433, + "num_tokens": 25578065.0, + "step": 14035, + "train/ce_loss": 0.4829643964767456 + }, + { + "epoch": 1.3876804429503657, + "step": 14035, + "train/sim_loss": 0.06271147727966309 + }, + { + "epoch": 1.3876804429503657, + "step": 14035, + "train/total_loss": 0.11100791394710541 + }, + { + "entropy": 9.74696159362793, + "epoch": 1.3877793157998815, + "mean_token_accuracy": 0.8479999899864197, + "num_tokens": 25591752.0, + "step": 14036, + "train/ce_loss": 2.1359501261031255e-06 + }, + { + "epoch": 1.3877793157998815, + "step": 14036, + "train/sim_loss": 0.037896037101745605 + }, + { + "epoch": 1.3877793157998815, + "step": 14036, + "train/total_loss": 0.03789624944329262 + }, + { + "entropy": 9.327229499816895, + "epoch": 1.387878188649397, + "mean_token_accuracy": 0.8644264340400696, + "num_tokens": 25602634.0, + "step": 14037, + "train/ce_loss": 0.36755576729774475 + }, + { + "epoch": 1.387878188649397, + "step": 14037, + "train/sim_loss": 0.07553064823150635 + }, + { + "epoch": 1.387878188649397, + "step": 14037, + "train/total_loss": 0.11228622496128082 + }, + { + "entropy": 9.081170082092285, + "epoch": 1.3879770614989124, + "mean_token_accuracy": 0.8777142763137817, + "num_tokens": 25611348.0, + "step": 14038, + "train/ce_loss": 0.3461114466190338 + }, + { + "epoch": 1.3879770614989124, + "step": 14038, + "train/sim_loss": 0.05831336975097656 + }, + { + "epoch": 1.3879770614989124, + "step": 14038, + "train/total_loss": 0.09292452037334442 + }, + { + "entropy": 8.974950790405273, + "epoch": 1.388075934348428, + "mean_token_accuracy": 0.8190854787826538, + "num_tokens": 25624531.0, + "step": 14039, + "train/ce_loss": 0.4502817690372467 + }, + { + "epoch": 1.388075934348428, + "step": 14039, + "train/sim_loss": 0.08775323629379272 + }, + { + "epoch": 1.388075934348428, + "step": 14039, + "train/total_loss": 0.13278141617774963 + }, + { + "epoch": 1.3881748071979434, + "grad_norm": 0.610990583896637, + "learning_rate": 6.5314246155367655e-06, + "loss": 0.0893, + "step": 14040 + }, + { + "entropy": 9.27134895324707, + "epoch": 1.3881748071979434, + "mean_token_accuracy": 0.8279688954353333, + "num_tokens": 25636275.0, + "step": 14040, + "train/ce_loss": 0.42268431186676025 + }, + { + "epoch": 1.3881748071979434, + "step": 14040, + "train/sim_loss": 0.05205368995666504 + }, + { + "epoch": 1.3881748071979434, + "step": 14040, + "train/total_loss": 0.09432212263345718 + }, + { + "entropy": 9.55746841430664, + "epoch": 1.388273680047459, + "mean_token_accuracy": 0.8922222256660461, + "num_tokens": 25655734.0, + "step": 14041, + "train/ce_loss": 0.2997441291809082 + }, + { + "epoch": 1.388273680047459, + "step": 14041, + "train/sim_loss": 0.06533944606781006 + }, + { + "epoch": 1.388273680047459, + "step": 14041, + "train/total_loss": 0.09531386196613312 + }, + { + "entropy": 9.48387336730957, + "epoch": 1.3883725528969744, + "mean_token_accuracy": 0.8707482814788818, + "num_tokens": 25670343.0, + "step": 14042, + "train/ce_loss": 7.338829846048611e-07 + }, + { + "epoch": 1.3883725528969744, + "step": 14042, + "train/sim_loss": 0.030928492546081543 + }, + { + "epoch": 1.3883725528969744, + "step": 14042, + "train/total_loss": 0.030928565189242363 + }, + { + "entropy": 9.42895793914795, + "epoch": 1.38847142574649, + "mean_token_accuracy": 0.8800880312919617, + "num_tokens": 25684484.0, + "step": 14043, + "train/ce_loss": 0.9201368689537048 + }, + { + "epoch": 1.38847142574649, + "step": 14043, + "train/sim_loss": 0.05624270439147949 + }, + { + "epoch": 1.38847142574649, + "step": 14043, + "train/total_loss": 0.14825639128684998 + }, + { + "entropy": 9.339324951171875, + "epoch": 1.3885702985960056, + "mean_token_accuracy": 0.8221970796585083, + "num_tokens": 25696366.0, + "step": 14044, + "train/ce_loss": 0.5463277697563171 + }, + { + "epoch": 1.3885702985960056, + "step": 14044, + "train/sim_loss": 0.07044404745101929 + }, + { + "epoch": 1.3885702985960056, + "step": 14044, + "train/total_loss": 0.12507683038711548 + }, + { + "entropy": 9.679258346557617, + "epoch": 1.388669171445521, + "mean_token_accuracy": 0.8810741901397705, + "num_tokens": 25712785.0, + "step": 14045, + "train/ce_loss": 0.19450925290584564 + }, + { + "epoch": 1.388669171445521, + "step": 14045, + "train/sim_loss": 0.017884552478790283 + }, + { + "epoch": 1.388669171445521, + "step": 14045, + "train/total_loss": 0.03733547776937485 + }, + { + "entropy": 9.929631233215332, + "epoch": 1.3887680442950365, + "mean_token_accuracy": 0.9146341681480408, + "num_tokens": 25723293.0, + "step": 14046, + "train/ce_loss": 8.699187219463056e-07 + }, + { + "epoch": 1.3887680442950365, + "step": 14046, + "train/sim_loss": 0.008471846580505371 + }, + { + "epoch": 1.3887680442950365, + "step": 14046, + "train/total_loss": 0.00847193319350481 + }, + { + "entropy": 9.146450996398926, + "epoch": 1.388866917144552, + "mean_token_accuracy": 0.8388746976852417, + "num_tokens": 25735815.0, + "step": 14047, + "train/ce_loss": 0.3778129518032074 + }, + { + "epoch": 1.388866917144552, + "step": 14047, + "train/sim_loss": 0.04096400737762451 + }, + { + "epoch": 1.388866917144552, + "step": 14047, + "train/total_loss": 0.07874530553817749 + }, + { + "entropy": 9.041343688964844, + "epoch": 1.3889657899940677, + "mean_token_accuracy": 0.831556499004364, + "num_tokens": 25745109.0, + "step": 14048, + "train/ce_loss": 0.22948627173900604 + }, + { + "epoch": 1.3889657899940677, + "step": 14048, + "train/sim_loss": 0.00926828384399414 + }, + { + "epoch": 1.3889657899940677, + "step": 14048, + "train/total_loss": 0.032216913998126984 + }, + { + "entropy": 9.091493606567383, + "epoch": 1.3890646628435832, + "mean_token_accuracy": 0.854651153087616, + "num_tokens": 25754701.0, + "step": 14049, + "train/ce_loss": 0.731563150882721 + }, + { + "epoch": 1.3890646628435832, + "step": 14049, + "train/sim_loss": 0.07209742069244385 + }, + { + "epoch": 1.3890646628435832, + "step": 14049, + "train/total_loss": 0.1452537477016449 + }, + { + "entropy": 9.61160659790039, + "epoch": 1.3891635356930987, + "mean_token_accuracy": 0.7878788113594055, + "num_tokens": 25767803.0, + "step": 14050, + "train/ce_loss": 0.8553800582885742 + }, + { + "epoch": 1.3891635356930987, + "step": 14050, + "train/sim_loss": 0.06771808862686157 + }, + { + "epoch": 1.3891635356930987, + "step": 14050, + "train/total_loss": 0.15325608849525452 + }, + { + "entropy": 9.450563430786133, + "epoch": 1.3892624085426142, + "mean_token_accuracy": 0.9021739363670349, + "num_tokens": 25781199.0, + "step": 14051, + "train/ce_loss": 0.3062465488910675 + }, + { + "epoch": 1.3892624085426142, + "step": 14051, + "train/sim_loss": 0.017527759075164795 + }, + { + "epoch": 1.3892624085426142, + "step": 14051, + "train/total_loss": 0.048152416944503784 + }, + { + "entropy": 8.978357315063477, + "epoch": 1.3893612813921297, + "mean_token_accuracy": 0.8539576530456543, + "num_tokens": 25792543.0, + "step": 14052, + "train/ce_loss": 0.42179247736930847 + }, + { + "epoch": 1.3893612813921297, + "step": 14052, + "train/sim_loss": 0.013476371765136719 + }, + { + "epoch": 1.3893612813921297, + "step": 14052, + "train/total_loss": 0.055655620992183685 + }, + { + "entropy": 9.032546997070312, + "epoch": 1.3894601542416454, + "mean_token_accuracy": 0.8438617587089539, + "num_tokens": 25805443.0, + "step": 14053, + "train/ce_loss": 0.5148121118545532 + }, + { + "epoch": 1.3894601542416454, + "step": 14053, + "train/sim_loss": 0.056095004081726074 + }, + { + "epoch": 1.3894601542416454, + "step": 14053, + "train/total_loss": 0.10757622122764587 + }, + { + "entropy": 9.597373962402344, + "epoch": 1.3895590270911606, + "mean_token_accuracy": 0.8722786903381348, + "num_tokens": 25817634.0, + "step": 14054, + "train/ce_loss": 0.6934501528739929 + }, + { + "epoch": 1.3895590270911606, + "step": 14054, + "train/sim_loss": 0.03846561908721924 + }, + { + "epoch": 1.3895590270911606, + "step": 14054, + "train/total_loss": 0.10781063884496689 + }, + { + "entropy": 9.795202255249023, + "epoch": 1.3896578999406763, + "mean_token_accuracy": 0.8469945192337036, + "num_tokens": 25829433.0, + "step": 14055, + "train/ce_loss": 0.5855966210365295 + }, + { + "epoch": 1.3896578999406763, + "step": 14055, + "train/sim_loss": 0.07165825366973877 + }, + { + "epoch": 1.3896578999406763, + "step": 14055, + "train/total_loss": 0.13021790981292725 + }, + { + "entropy": 9.209331512451172, + "epoch": 1.3897567727901918, + "mean_token_accuracy": 0.8310023546218872, + "num_tokens": 25841240.0, + "step": 14056, + "train/ce_loss": 0.37751978635787964 + }, + { + "epoch": 1.3897567727901918, + "step": 14056, + "train/sim_loss": 0.01350855827331543 + }, + { + "epoch": 1.3897567727901918, + "step": 14056, + "train/total_loss": 0.05126053839921951 + }, + { + "entropy": 9.047378540039062, + "epoch": 1.3898556456397073, + "mean_token_accuracy": 0.790043294429779, + "num_tokens": 25857062.0, + "step": 14057, + "train/ce_loss": 0.480477511882782 + }, + { + "epoch": 1.3898556456397073, + "step": 14057, + "train/sim_loss": 0.047186434268951416 + }, + { + "epoch": 1.3898556456397073, + "step": 14057, + "train/total_loss": 0.09523418545722961 + }, + { + "entropy": 8.952238082885742, + "epoch": 1.3899545184892228, + "mean_token_accuracy": 0.8383961319923401, + "num_tokens": 25864893.0, + "step": 14058, + "train/ce_loss": 0.4213572144508362 + }, + { + "epoch": 1.3899545184892228, + "step": 14058, + "train/sim_loss": 0.03878164291381836 + }, + { + "epoch": 1.3899545184892228, + "step": 14058, + "train/total_loss": 0.0809173658490181 + }, + { + "entropy": 9.537979125976562, + "epoch": 1.3900533913387383, + "mean_token_accuracy": 0.8560940027236938, + "num_tokens": 25873715.0, + "step": 14059, + "train/ce_loss": 0.8129847049713135 + }, + { + "epoch": 1.3900533913387383, + "step": 14059, + "train/sim_loss": 0.033663272857666016 + }, + { + "epoch": 1.3900533913387383, + "step": 14059, + "train/total_loss": 0.11496174335479736 + }, + { + "epoch": 1.390152264188254, + "grad_norm": 0.5604841113090515, + "learning_rate": 6.526479750778817e-06, + "loss": 0.0838, + "step": 14060 + }, + { + "entropy": 9.611379623413086, + "epoch": 1.390152264188254, + "mean_token_accuracy": 0.8219326734542847, + "num_tokens": 25886654.0, + "step": 14060, + "train/ce_loss": 0.5190297365188599 + }, + { + "epoch": 1.390152264188254, + "step": 14060, + "train/sim_loss": 0.027638494968414307 + }, + { + "epoch": 1.390152264188254, + "step": 14060, + "train/total_loss": 0.07954147458076477 + }, + { + "entropy": 9.139570236206055, + "epoch": 1.3902511370377695, + "mean_token_accuracy": 0.8227040767669678, + "num_tokens": 25894119.0, + "step": 14061, + "train/ce_loss": 0.3280947208404541 + }, + { + "epoch": 1.3902511370377695, + "step": 14061, + "train/sim_loss": 0.05499887466430664 + }, + { + "epoch": 1.3902511370377695, + "step": 14061, + "train/total_loss": 0.08780834823846817 + }, + { + "entropy": 10.024404525756836, + "epoch": 1.390350009887285, + "mean_token_accuracy": 0.9289772510528564, + "num_tokens": 25905127.0, + "step": 14062, + "train/ce_loss": 0.5774950981140137 + }, + { + "epoch": 1.390350009887285, + "step": 14062, + "train/sim_loss": 0.032843172550201416 + }, + { + "epoch": 1.390350009887285, + "step": 14062, + "train/total_loss": 0.09059268236160278 + }, + { + "entropy": 9.592483520507812, + "epoch": 1.3904488827368005, + "mean_token_accuracy": 0.8157894611358643, + "num_tokens": 25917849.0, + "step": 14063, + "train/ce_loss": 0.7472907304763794 + }, + { + "epoch": 1.3904488827368005, + "step": 14063, + "train/sim_loss": 0.08761835098266602 + }, + { + "epoch": 1.3904488827368005, + "step": 14063, + "train/total_loss": 0.1623474359512329 + }, + { + "entropy": 9.486358642578125, + "epoch": 1.390547755586316, + "mean_token_accuracy": 0.8269230723381042, + "num_tokens": 25934006.0, + "step": 14064, + "train/ce_loss": 0.3795771598815918 + }, + { + "epoch": 1.390547755586316, + "step": 14064, + "train/sim_loss": 0.029220938682556152 + }, + { + "epoch": 1.390547755586316, + "step": 14064, + "train/total_loss": 0.0671786516904831 + }, + { + "entropy": 10.334770202636719, + "epoch": 1.3906466284358316, + "mean_token_accuracy": 1.0, + "num_tokens": 25940219.0, + "step": 14065, + "train/ce_loss": 1.2596215128723998e-05 + }, + { + "epoch": 1.3906466284358316, + "step": 14065, + "train/sim_loss": 0.029698550701141357 + }, + { + "epoch": 1.3906466284358316, + "step": 14065, + "train/total_loss": 0.029699809849262238 + }, + { + "entropy": 9.582427978515625, + "epoch": 1.390745501285347, + "mean_token_accuracy": 0.8588435649871826, + "num_tokens": 25957314.0, + "step": 14066, + "train/ce_loss": 0.4207130968570709 + }, + { + "epoch": 1.390745501285347, + "step": 14066, + "train/sim_loss": 0.053751587867736816 + }, + { + "epoch": 1.390745501285347, + "step": 14066, + "train/total_loss": 0.09582290053367615 + }, + { + "entropy": 9.382261276245117, + "epoch": 1.3908443741348626, + "mean_token_accuracy": 0.8842653036117554, + "num_tokens": 25967832.0, + "step": 14067, + "train/ce_loss": 0.45449113845825195 + }, + { + "epoch": 1.3908443741348626, + "step": 14067, + "train/sim_loss": 0.04912972450256348 + }, + { + "epoch": 1.3908443741348626, + "step": 14067, + "train/total_loss": 0.09457883983850479 + }, + { + "entropy": 9.313314437866211, + "epoch": 1.390943246984378, + "mean_token_accuracy": 0.8501093983650208, + "num_tokens": 25977019.0, + "step": 14068, + "train/ce_loss": 0.3008173108100891 + }, + { + "epoch": 1.390943246984378, + "step": 14068, + "train/sim_loss": 0.013377785682678223 + }, + { + "epoch": 1.390943246984378, + "step": 14068, + "train/total_loss": 0.04345951974391937 + }, + { + "entropy": 9.708199501037598, + "epoch": 1.3910421198338936, + "mean_token_accuracy": 0.8627737164497375, + "num_tokens": 25989935.0, + "step": 14069, + "train/ce_loss": 0.5160689353942871 + }, + { + "epoch": 1.3910421198338936, + "step": 14069, + "train/sim_loss": 0.03329014778137207 + }, + { + "epoch": 1.3910421198338936, + "step": 14069, + "train/total_loss": 0.08489704132080078 + }, + { + "entropy": 9.689962387084961, + "epoch": 1.391140992683409, + "mean_token_accuracy": 0.9017980694770813, + "num_tokens": 26005853.0, + "step": 14070, + "train/ce_loss": 0.3815387785434723 + }, + { + "epoch": 1.391140992683409, + "step": 14070, + "train/sim_loss": 0.020282447338104248 + }, + { + "epoch": 1.391140992683409, + "step": 14070, + "train/total_loss": 0.058436326682567596 + }, + { + "entropy": 9.217988967895508, + "epoch": 1.3912398655329246, + "mean_token_accuracy": 0.8405994772911072, + "num_tokens": 26017399.0, + "step": 14071, + "train/ce_loss": 0.4385863244533539 + }, + { + "epoch": 1.3912398655329246, + "step": 14071, + "train/sim_loss": 0.01625847816467285 + }, + { + "epoch": 1.3912398655329246, + "step": 14071, + "train/total_loss": 0.06011711061000824 + }, + { + "entropy": 9.933448791503906, + "epoch": 1.3913387383824403, + "mean_token_accuracy": 0.8924999833106995, + "num_tokens": 26038422.0, + "step": 14072, + "train/ce_loss": 0.5027812123298645 + }, + { + "epoch": 1.3913387383824403, + "step": 14072, + "train/sim_loss": 0.03435760736465454 + }, + { + "epoch": 1.3913387383824403, + "step": 14072, + "train/total_loss": 0.08463573455810547 + }, + { + "entropy": 9.468256950378418, + "epoch": 1.3914376112319558, + "mean_token_accuracy": 0.8243412971496582, + "num_tokens": 26052732.0, + "step": 14073, + "train/ce_loss": 0.7417545914649963 + }, + { + "epoch": 1.3914376112319558, + "step": 14073, + "train/sim_loss": 0.06044769287109375 + }, + { + "epoch": 1.3914376112319558, + "step": 14073, + "train/total_loss": 0.13462315499782562 + }, + { + "entropy": 8.830217361450195, + "epoch": 1.3915364840814712, + "mean_token_accuracy": 0.8643274903297424, + "num_tokens": 26061244.0, + "step": 14074, + "train/ce_loss": 0.1322992444038391 + }, + { + "epoch": 1.3915364840814712, + "step": 14074, + "train/sim_loss": 0.03148394823074341 + }, + { + "epoch": 1.3915364840814712, + "step": 14074, + "train/total_loss": 0.04471387341618538 + }, + { + "entropy": 8.882186889648438, + "epoch": 1.3916353569309867, + "mean_token_accuracy": 0.8614609837532043, + "num_tokens": 26073351.0, + "step": 14075, + "train/ce_loss": 0.471713125705719 + }, + { + "epoch": 1.3916353569309867, + "step": 14075, + "train/sim_loss": 0.01143336296081543 + }, + { + "epoch": 1.3916353569309867, + "step": 14075, + "train/total_loss": 0.05860467627644539 + }, + { + "entropy": 9.392906188964844, + "epoch": 1.3917342297805022, + "mean_token_accuracy": 0.8171589374542236, + "num_tokens": 26088513.0, + "step": 14076, + "train/ce_loss": 0.7270969748497009 + }, + { + "epoch": 1.3917342297805022, + "step": 14076, + "train/sim_loss": 0.04870486259460449 + }, + { + "epoch": 1.3917342297805022, + "step": 14076, + "train/total_loss": 0.12141456454992294 + }, + { + "entropy": 9.432867050170898, + "epoch": 1.391833102630018, + "mean_token_accuracy": 0.8647959232330322, + "num_tokens": 26099450.0, + "step": 14077, + "train/ce_loss": 0.40868330001831055 + }, + { + "epoch": 1.391833102630018, + "step": 14077, + "train/sim_loss": 0.03255724906921387 + }, + { + "epoch": 1.391833102630018, + "step": 14077, + "train/total_loss": 0.07342557609081268 + }, + { + "entropy": 9.750722885131836, + "epoch": 1.3919319754795334, + "mean_token_accuracy": 0.8812500238418579, + "num_tokens": 26107345.0, + "step": 14078, + "train/ce_loss": 6.392396016963175e-07 + }, + { + "epoch": 1.3919319754795334, + "step": 14078, + "train/sim_loss": 0.01257014274597168 + }, + { + "epoch": 1.3919319754795334, + "step": 14078, + "train/total_loss": 0.012570207007229328 + }, + { + "entropy": 9.636151313781738, + "epoch": 1.3920308483290489, + "mean_token_accuracy": 0.8775834441184998, + "num_tokens": 26121047.0, + "step": 14079, + "train/ce_loss": 0.2569136917591095 + }, + { + "epoch": 1.3920308483290489, + "step": 14079, + "train/sim_loss": 0.04882347583770752 + }, + { + "epoch": 1.3920308483290489, + "step": 14079, + "train/total_loss": 0.07451484352350235 + }, + { + "epoch": 1.3921297211785644, + "grad_norm": 0.6687290668487549, + "learning_rate": 6.521534886020868e-06, + "loss": 0.0818, + "step": 14080 + }, + { + "entropy": 9.271010398864746, + "epoch": 1.3921297211785644, + "mean_token_accuracy": 0.8903878331184387, + "num_tokens": 26127498.0, + "step": 14080, + "train/ce_loss": 0.7251719236373901 + }, + { + "epoch": 1.3921297211785644, + "step": 14080, + "train/sim_loss": 0.010542988777160645 + }, + { + "epoch": 1.3921297211785644, + "step": 14080, + "train/total_loss": 0.08306018263101578 + }, + { + "entropy": 9.381009101867676, + "epoch": 1.3922285940280799, + "mean_token_accuracy": 0.8602287173271179, + "num_tokens": 26144290.0, + "step": 14081, + "train/ce_loss": 0.3897065222263336 + }, + { + "epoch": 1.3922285940280799, + "step": 14081, + "train/sim_loss": 0.01523745059967041 + }, + { + "epoch": 1.3922285940280799, + "step": 14081, + "train/total_loss": 0.05420810356736183 + }, + { + "entropy": 9.710844993591309, + "epoch": 1.3923274668775953, + "mean_token_accuracy": 0.8996960520744324, + "num_tokens": 26152955.0, + "step": 14082, + "train/ce_loss": 0.743600606918335 + }, + { + "epoch": 1.3923274668775953, + "step": 14082, + "train/sim_loss": 0.026758134365081787 + }, + { + "epoch": 1.3923274668775953, + "step": 14082, + "train/total_loss": 0.10111819952726364 + }, + { + "entropy": 9.135896682739258, + "epoch": 1.3924263397271108, + "mean_token_accuracy": 0.86834317445755, + "num_tokens": 26163116.0, + "step": 14083, + "train/ce_loss": 0.36926382780075073 + }, + { + "epoch": 1.3924263397271108, + "step": 14083, + "train/sim_loss": 0.028567612171173096 + }, + { + "epoch": 1.3924263397271108, + "step": 14083, + "train/total_loss": 0.06549400091171265 + }, + { + "entropy": 8.944341659545898, + "epoch": 1.3925252125766265, + "mean_token_accuracy": 0.8343788981437683, + "num_tokens": 26173553.0, + "step": 14084, + "train/ce_loss": 0.6492791175842285 + }, + { + "epoch": 1.3925252125766265, + "step": 14084, + "train/sim_loss": 0.041113853454589844 + }, + { + "epoch": 1.3925252125766265, + "step": 14084, + "train/total_loss": 0.10604176670312881 + }, + { + "entropy": 9.219717025756836, + "epoch": 1.392624085426142, + "mean_token_accuracy": 0.8061224222183228, + "num_tokens": 26186718.0, + "step": 14085, + "train/ce_loss": 0.5571999549865723 + }, + { + "epoch": 1.392624085426142, + "step": 14085, + "train/sim_loss": 0.0401763916015625 + }, + { + "epoch": 1.392624085426142, + "step": 14085, + "train/total_loss": 0.0958963930606842 + }, + { + "entropy": 9.389852523803711, + "epoch": 1.3927229582756575, + "mean_token_accuracy": 0.8920764923095703, + "num_tokens": 26205238.0, + "step": 14086, + "train/ce_loss": 0.2692595422267914 + }, + { + "epoch": 1.3927229582756575, + "step": 14086, + "train/sim_loss": 0.016110599040985107 + }, + { + "epoch": 1.3927229582756575, + "step": 14086, + "train/total_loss": 0.043036554008722305 + }, + { + "entropy": 9.410992622375488, + "epoch": 1.392821831125173, + "mean_token_accuracy": 0.8601036071777344, + "num_tokens": 26218053.0, + "step": 14087, + "train/ce_loss": 0.415080726146698 + }, + { + "epoch": 1.392821831125173, + "step": 14087, + "train/sim_loss": 0.050650835037231445 + }, + { + "epoch": 1.392821831125173, + "step": 14087, + "train/total_loss": 0.09215891361236572 + }, + { + "entropy": 8.925359725952148, + "epoch": 1.3929207039746885, + "mean_token_accuracy": 0.814385175704956, + "num_tokens": 26232083.0, + "step": 14088, + "train/ce_loss": 0.6625407338142395 + }, + { + "epoch": 1.3929207039746885, + "step": 14088, + "train/sim_loss": 0.027989625930786133 + }, + { + "epoch": 1.3929207039746885, + "step": 14088, + "train/total_loss": 0.09424369782209396 + }, + { + "entropy": 9.333984375, + "epoch": 1.3930195768242042, + "mean_token_accuracy": 0.8229665160179138, + "num_tokens": 26240771.0, + "step": 14089, + "train/ce_loss": 0.8361597657203674 + }, + { + "epoch": 1.3930195768242042, + "step": 14089, + "train/sim_loss": 0.07001149654388428 + }, + { + "epoch": 1.3930195768242042, + "step": 14089, + "train/total_loss": 0.15362748503684998 + }, + { + "entropy": 9.626256942749023, + "epoch": 1.3931184496737197, + "mean_token_accuracy": 0.9191374778747559, + "num_tokens": 26250846.0, + "step": 14090, + "train/ce_loss": 0.7027310132980347 + }, + { + "epoch": 1.3931184496737197, + "step": 14090, + "train/sim_loss": 0.03424990177154541 + }, + { + "epoch": 1.3931184496737197, + "step": 14090, + "train/total_loss": 0.10452300310134888 + }, + { + "entropy": 9.908906936645508, + "epoch": 1.3932173225232352, + "mean_token_accuracy": 0.8756097555160522, + "num_tokens": 26257101.0, + "step": 14091, + "train/ce_loss": 1.4141783140075859e-06 + }, + { + "epoch": 1.3932173225232352, + "step": 14091, + "train/sim_loss": 0.042156755924224854 + }, + { + "epoch": 1.3932173225232352, + "step": 14091, + "train/total_loss": 0.042156897485256195 + }, + { + "entropy": 9.370878219604492, + "epoch": 1.3933161953727506, + "mean_token_accuracy": 0.8350754976272583, + "num_tokens": 26271824.0, + "step": 14092, + "train/ce_loss": 0.7225604057312012 + }, + { + "epoch": 1.3933161953727506, + "step": 14092, + "train/sim_loss": 0.025003433227539062 + }, + { + "epoch": 1.3933161953727506, + "step": 14092, + "train/total_loss": 0.09725947678089142 + }, + { + "entropy": 9.168994903564453, + "epoch": 1.3934150682222661, + "mean_token_accuracy": 0.8340135812759399, + "num_tokens": 26283412.0, + "step": 14093, + "train/ce_loss": 0.5871109962463379 + }, + { + "epoch": 1.3934150682222661, + "step": 14093, + "train/sim_loss": 0.05074810981750488 + }, + { + "epoch": 1.3934150682222661, + "step": 14093, + "train/total_loss": 0.10945920646190643 + }, + { + "entropy": 8.696952819824219, + "epoch": 1.3935139410717816, + "mean_token_accuracy": 0.8155555725097656, + "num_tokens": 26292079.0, + "step": 14094, + "train/ce_loss": 0.605261504650116 + }, + { + "epoch": 1.3935139410717816, + "step": 14094, + "train/sim_loss": 0.07752549648284912 + }, + { + "epoch": 1.3935139410717816, + "step": 14094, + "train/total_loss": 0.13805164396762848 + }, + { + "entropy": 9.615062713623047, + "epoch": 1.393612813921297, + "mean_token_accuracy": 0.778859555721283, + "num_tokens": 26306999.0, + "step": 14095, + "train/ce_loss": 0.455190509557724 + }, + { + "epoch": 1.393612813921297, + "step": 14095, + "train/sim_loss": 0.017029762268066406 + }, + { + "epoch": 1.393612813921297, + "step": 14095, + "train/total_loss": 0.06254881620407104 + }, + { + "entropy": 9.02385425567627, + "epoch": 1.3937116867708128, + "mean_token_accuracy": 0.8754028081893921, + "num_tokens": 26317490.0, + "step": 14096, + "train/ce_loss": 0.32681408524513245 + }, + { + "epoch": 1.3937116867708128, + "step": 14096, + "train/sim_loss": 0.03540003299713135 + }, + { + "epoch": 1.3937116867708128, + "step": 14096, + "train/total_loss": 0.06808143854141235 + }, + { + "entropy": 9.566242218017578, + "epoch": 1.3938105596203283, + "mean_token_accuracy": 0.8358038663864136, + "num_tokens": 26331248.0, + "step": 14097, + "train/ce_loss": 0.28903400897979736 + }, + { + "epoch": 1.3938105596203283, + "step": 14097, + "train/sim_loss": 0.04151350259780884 + }, + { + "epoch": 1.3938105596203283, + "step": 14097, + "train/total_loss": 0.0704169049859047 + }, + { + "entropy": 9.18105697631836, + "epoch": 1.3939094324698438, + "mean_token_accuracy": 0.8834285736083984, + "num_tokens": 26346429.0, + "step": 14098, + "train/ce_loss": 0.6997601389884949 + }, + { + "epoch": 1.3939094324698438, + "step": 14098, + "train/sim_loss": 0.04632139205932617 + }, + { + "epoch": 1.3939094324698438, + "step": 14098, + "train/total_loss": 0.1162974089384079 + }, + { + "entropy": 8.825766563415527, + "epoch": 1.3940083053193593, + "mean_token_accuracy": 0.8014861941337585, + "num_tokens": 26357313.0, + "step": 14099, + "train/ce_loss": 0.48156580328941345 + }, + { + "epoch": 1.3940083053193593, + "step": 14099, + "train/sim_loss": 0.057558417320251465 + }, + { + "epoch": 1.3940083053193593, + "step": 14099, + "train/total_loss": 0.10571499913930893 + }, + { + "epoch": 1.3941071781688747, + "grad_norm": 0.5783944725990295, + "learning_rate": 6.516590021262919e-06, + "loss": 0.0863, + "step": 14100 + }, + { + "entropy": 9.309133529663086, + "epoch": 1.3941071781688747, + "mean_token_accuracy": 0.8667563796043396, + "num_tokens": 26371347.0, + "step": 14100, + "train/ce_loss": 0.3117145597934723 + }, + { + "epoch": 1.3941071781688747, + "step": 14100, + "train/sim_loss": 0.026380836963653564 + }, + { + "epoch": 1.3941071781688747, + "step": 14100, + "train/total_loss": 0.05755229294300079 + }, + { + "entropy": 8.60973072052002, + "epoch": 1.3942060510183905, + "mean_token_accuracy": 0.8866799473762512, + "num_tokens": 26383563.0, + "step": 14101, + "train/ce_loss": 0.3507865071296692 + }, + { + "epoch": 1.3942060510183905, + "step": 14101, + "train/sim_loss": 0.03311574459075928 + }, + { + "epoch": 1.3942060510183905, + "step": 14101, + "train/total_loss": 0.06819439679384232 + }, + { + "entropy": 9.474096298217773, + "epoch": 1.394304923867906, + "mean_token_accuracy": 0.8805257081985474, + "num_tokens": 26397541.0, + "step": 14102, + "train/ce_loss": 0.22669164836406708 + }, + { + "epoch": 1.394304923867906, + "step": 14102, + "train/sim_loss": 0.028120338916778564 + }, + { + "epoch": 1.394304923867906, + "step": 14102, + "train/total_loss": 0.05078950524330139 + }, + { + "entropy": 9.389063835144043, + "epoch": 1.3944037967174214, + "mean_token_accuracy": 0.8457583785057068, + "num_tokens": 26408738.0, + "step": 14103, + "train/ce_loss": 0.26264408230781555 + }, + { + "epoch": 1.3944037967174214, + "step": 14103, + "train/sim_loss": 0.05194813013076782 + }, + { + "epoch": 1.3944037967174214, + "step": 14103, + "train/total_loss": 0.07821253687143326 + }, + { + "entropy": 9.49044418334961, + "epoch": 1.394502669566937, + "mean_token_accuracy": 0.8960302472114563, + "num_tokens": 26415532.0, + "step": 14104, + "train/ce_loss": 0.4610804319381714 + }, + { + "epoch": 1.394502669566937, + "step": 14104, + "train/sim_loss": 0.0560680627822876 + }, + { + "epoch": 1.394502669566937, + "step": 14104, + "train/total_loss": 0.10217610746622086 + }, + { + "entropy": 9.053086280822754, + "epoch": 1.3946015424164524, + "mean_token_accuracy": 0.817808210849762, + "num_tokens": 26426747.0, + "step": 14105, + "train/ce_loss": 0.48443537950515747 + }, + { + "epoch": 1.3946015424164524, + "step": 14105, + "train/sim_loss": 0.017889976501464844 + }, + { + "epoch": 1.3946015424164524, + "step": 14105, + "train/total_loss": 0.06633351743221283 + }, + { + "entropy": 8.96740436553955, + "epoch": 1.3947004152659679, + "mean_token_accuracy": 0.8249227404594421, + "num_tokens": 26437150.0, + "step": 14106, + "train/ce_loss": 0.5634850859642029 + }, + { + "epoch": 1.3947004152659679, + "step": 14106, + "train/sim_loss": 0.08068990707397461 + }, + { + "epoch": 1.3947004152659679, + "step": 14106, + "train/total_loss": 0.13703840970993042 + }, + { + "entropy": 9.453581809997559, + "epoch": 1.3947992881154834, + "mean_token_accuracy": 0.8294573426246643, + "num_tokens": 26450445.0, + "step": 14107, + "train/ce_loss": 0.9194504022598267 + }, + { + "epoch": 1.3947992881154834, + "step": 14107, + "train/sim_loss": 0.08611512184143066 + }, + { + "epoch": 1.3947992881154834, + "step": 14107, + "train/total_loss": 0.17806017398834229 + }, + { + "entropy": 9.357580184936523, + "epoch": 1.394898160964999, + "mean_token_accuracy": 0.7848297357559204, + "num_tokens": 26467023.0, + "step": 14108, + "train/ce_loss": 4.332819685259892e-07 + }, + { + "epoch": 1.394898160964999, + "step": 14108, + "train/sim_loss": 0.02853214740753174 + }, + { + "epoch": 1.394898160964999, + "step": 14108, + "train/total_loss": 0.02853219024837017 + }, + { + "entropy": 9.638638496398926, + "epoch": 1.3949970338145146, + "mean_token_accuracy": 0.8448637127876282, + "num_tokens": 26482032.0, + "step": 14109, + "train/ce_loss": 0.625886857509613 + }, + { + "epoch": 1.3949970338145146, + "step": 14109, + "train/sim_loss": 0.021572768688201904 + }, + { + "epoch": 1.3949970338145146, + "step": 14109, + "train/total_loss": 0.08416145294904709 + }, + { + "entropy": 9.296353340148926, + "epoch": 1.39509590666403, + "mean_token_accuracy": 0.8714810013771057, + "num_tokens": 26498753.0, + "step": 14110, + "train/ce_loss": 0.5662516355514526 + }, + { + "epoch": 1.39509590666403, + "step": 14110, + "train/sim_loss": 0.05545198917388916 + }, + { + "epoch": 1.39509590666403, + "step": 14110, + "train/total_loss": 0.11207715421915054 + }, + { + "entropy": 9.793333053588867, + "epoch": 1.3951947795135455, + "mean_token_accuracy": 0.8552188277244568, + "num_tokens": 26511172.0, + "step": 14111, + "train/ce_loss": 1.1374319228707463e-06 + }, + { + "epoch": 1.3951947795135455, + "step": 14111, + "train/sim_loss": 0.04541975259780884 + }, + { + "epoch": 1.3951947795135455, + "step": 14111, + "train/total_loss": 0.04541986808180809 + }, + { + "entropy": 9.038379669189453, + "epoch": 1.395293652363061, + "mean_token_accuracy": 0.8355408310890198, + "num_tokens": 26524681.0, + "step": 14112, + "train/ce_loss": 0.22139647603034973 + }, + { + "epoch": 1.395293652363061, + "step": 14112, + "train/sim_loss": 0.03800356388092041 + }, + { + "epoch": 1.395293652363061, + "step": 14112, + "train/total_loss": 0.060143209993839264 + }, + { + "entropy": 9.582745552062988, + "epoch": 1.3953925252125767, + "mean_token_accuracy": 0.8365508317947388, + "num_tokens": 26536126.0, + "step": 14113, + "train/ce_loss": 0.6734885573387146 + }, + { + "epoch": 1.3953925252125767, + "step": 14113, + "train/sim_loss": 0.04375052452087402 + }, + { + "epoch": 1.3953925252125767, + "step": 14113, + "train/total_loss": 0.11109938472509384 + }, + { + "entropy": 9.205771446228027, + "epoch": 1.3954913980620922, + "mean_token_accuracy": 0.8702771067619324, + "num_tokens": 26547887.0, + "step": 14114, + "train/ce_loss": 0.2654516398906708 + }, + { + "epoch": 1.3954913980620922, + "step": 14114, + "train/sim_loss": 0.047982215881347656 + }, + { + "epoch": 1.3954913980620922, + "step": 14114, + "train/total_loss": 0.07452738285064697 + }, + { + "entropy": 9.22810173034668, + "epoch": 1.3955902709116077, + "mean_token_accuracy": 0.8299680948257446, + "num_tokens": 26561619.0, + "step": 14115, + "train/ce_loss": 0.6034418344497681 + }, + { + "epoch": 1.3955902709116077, + "step": 14115, + "train/sim_loss": 0.06083393096923828 + }, + { + "epoch": 1.3955902709116077, + "step": 14115, + "train/total_loss": 0.12117812037467957 + }, + { + "entropy": 9.362455368041992, + "epoch": 1.3956891437611232, + "mean_token_accuracy": 0.8709090948104858, + "num_tokens": 26571582.0, + "step": 14116, + "train/ce_loss": 5.600552412943216e-07 + }, + { + "epoch": 1.3956891437611232, + "step": 14116, + "train/sim_loss": 0.034783363342285156 + }, + { + "epoch": 1.3956891437611232, + "step": 14116, + "train/total_loss": 0.03478341922163963 + }, + { + "entropy": 9.2037353515625, + "epoch": 1.3957880166106387, + "mean_token_accuracy": 0.843137264251709, + "num_tokens": 26580939.0, + "step": 14117, + "train/ce_loss": 0.37184908986091614 + }, + { + "epoch": 1.3957880166106387, + "step": 14117, + "train/sim_loss": 0.03899741172790527 + }, + { + "epoch": 1.3957880166106387, + "step": 14117, + "train/total_loss": 0.07618232071399689 + }, + { + "entropy": 9.501696586608887, + "epoch": 1.3958868894601544, + "mean_token_accuracy": 0.8020231127738953, + "num_tokens": 26589705.0, + "step": 14118, + "train/ce_loss": 0.4921643137931824 + }, + { + "epoch": 1.3958868894601544, + "step": 14118, + "train/sim_loss": 0.03668081760406494 + }, + { + "epoch": 1.3958868894601544, + "step": 14118, + "train/total_loss": 0.08589725196361542 + }, + { + "entropy": 9.306829452514648, + "epoch": 1.3959857623096696, + "mean_token_accuracy": 0.8443465232849121, + "num_tokens": 26605886.0, + "step": 14119, + "train/ce_loss": 0.3387848734855652 + }, + { + "epoch": 1.3959857623096696, + "step": 14119, + "train/sim_loss": 0.05703318119049072 + }, + { + "epoch": 1.3959857623096696, + "step": 14119, + "train/total_loss": 0.09091167151927948 + }, + { + "epoch": 1.3960846351591853, + "grad_norm": 0.6303807497024536, + "learning_rate": 6.51164515650497e-06, + "loss": 0.0878, + "step": 14120 + }, + { + "entropy": 8.99302864074707, + "epoch": 1.3960846351591853, + "mean_token_accuracy": 0.8494736552238464, + "num_tokens": 26615753.0, + "step": 14120, + "train/ce_loss": 0.26754796504974365 + }, + { + "epoch": 1.3960846351591853, + "step": 14120, + "train/sim_loss": 0.05915522575378418 + }, + { + "epoch": 1.3960846351591853, + "step": 14120, + "train/total_loss": 0.08591002225875854 + }, + { + "entropy": 9.796385765075684, + "epoch": 1.3961835080087008, + "mean_token_accuracy": 0.8648208379745483, + "num_tokens": 26631218.0, + "step": 14121, + "train/ce_loss": 1.5445414192072349e-06 + }, + { + "epoch": 1.3961835080087008, + "step": 14121, + "train/sim_loss": 0.027706623077392578 + }, + { + "epoch": 1.3961835080087008, + "step": 14121, + "train/total_loss": 0.027706777676939964 + }, + { + "entropy": 9.640462875366211, + "epoch": 1.3962823808582163, + "mean_token_accuracy": 0.8642659187316895, + "num_tokens": 26640852.0, + "step": 14122, + "train/ce_loss": 0.5893134474754333 + }, + { + "epoch": 1.3962823808582163, + "step": 14122, + "train/sim_loss": 0.056488096714019775 + }, + { + "epoch": 1.3962823808582163, + "step": 14122, + "train/total_loss": 0.11541944742202759 + }, + { + "entropy": 9.120355606079102, + "epoch": 1.3963812537077318, + "mean_token_accuracy": 0.8122448921203613, + "num_tokens": 26653877.0, + "step": 14123, + "train/ce_loss": 0.5024378895759583 + }, + { + "epoch": 1.3963812537077318, + "step": 14123, + "train/sim_loss": 0.0637366771697998 + }, + { + "epoch": 1.3963812537077318, + "step": 14123, + "train/total_loss": 0.11398047208786011 + }, + { + "entropy": 9.502204895019531, + "epoch": 1.3964801265572473, + "mean_token_accuracy": 0.7993680834770203, + "num_tokens": 26666739.0, + "step": 14124, + "train/ce_loss": 0.896040141582489 + }, + { + "epoch": 1.3964801265572473, + "step": 14124, + "train/sim_loss": 0.12580835819244385 + }, + { + "epoch": 1.3964801265572473, + "step": 14124, + "train/total_loss": 0.21541237831115723 + }, + { + "entropy": 9.26923942565918, + "epoch": 1.396578999406763, + "mean_token_accuracy": 0.83012455701828, + "num_tokens": 26676316.0, + "step": 14125, + "train/ce_loss": 0.3996444046497345 + }, + { + "epoch": 1.396578999406763, + "step": 14125, + "train/sim_loss": 0.024196982383728027 + }, + { + "epoch": 1.396578999406763, + "step": 14125, + "train/total_loss": 0.06416141986846924 + }, + { + "entropy": 9.375186920166016, + "epoch": 1.3966778722562785, + "mean_token_accuracy": 0.8301886916160583, + "num_tokens": 26689625.0, + "step": 14126, + "train/ce_loss": 0.561835527420044 + }, + { + "epoch": 1.3966778722562785, + "step": 14126, + "train/sim_loss": 0.03848278522491455 + }, + { + "epoch": 1.3966778722562785, + "step": 14126, + "train/total_loss": 0.09466633945703506 + }, + { + "entropy": 9.54641342163086, + "epoch": 1.396776745105794, + "mean_token_accuracy": 0.882170557975769, + "num_tokens": 26705128.0, + "step": 14127, + "train/ce_loss": 0.5843651294708252 + }, + { + "epoch": 1.396776745105794, + "step": 14127, + "train/sim_loss": 0.031666576862335205 + }, + { + "epoch": 1.396776745105794, + "step": 14127, + "train/total_loss": 0.09010308980941772 + }, + { + "entropy": 9.0413818359375, + "epoch": 1.3968756179553095, + "mean_token_accuracy": 0.8347368240356445, + "num_tokens": 26718707.0, + "step": 14128, + "train/ce_loss": 0.47375088930130005 + }, + { + "epoch": 1.3968756179553095, + "step": 14128, + "train/sim_loss": 0.02434241771697998 + }, + { + "epoch": 1.3968756179553095, + "step": 14128, + "train/total_loss": 0.0717175081372261 + }, + { + "entropy": 9.436563491821289, + "epoch": 1.396974490804825, + "mean_token_accuracy": 0.8543922901153564, + "num_tokens": 26738460.0, + "step": 14129, + "train/ce_loss": 0.45659828186035156 + }, + { + "epoch": 1.396974490804825, + "step": 14129, + "train/sim_loss": 0.03713500499725342 + }, + { + "epoch": 1.396974490804825, + "step": 14129, + "train/total_loss": 0.08279483020305634 + }, + { + "entropy": 9.666936874389648, + "epoch": 1.3970733636543406, + "mean_token_accuracy": 0.7862481474876404, + "num_tokens": 26751743.0, + "step": 14130, + "train/ce_loss": 6.856529921606125e-07 + }, + { + "epoch": 1.3970733636543406, + "step": 14130, + "train/sim_loss": 0.01907658576965332 + }, + { + "epoch": 1.3970733636543406, + "step": 14130, + "train/total_loss": 0.019076654687523842 + }, + { + "entropy": 9.291583061218262, + "epoch": 1.397172236503856, + "mean_token_accuracy": 0.8586309552192688, + "num_tokens": 26764904.0, + "step": 14131, + "train/ce_loss": 0.34795060753822327 + }, + { + "epoch": 1.397172236503856, + "step": 14131, + "train/sim_loss": 0.030128061771392822 + }, + { + "epoch": 1.397172236503856, + "step": 14131, + "train/total_loss": 0.06492312252521515 + }, + { + "entropy": 9.269630432128906, + "epoch": 1.3972711093533716, + "mean_token_accuracy": 0.7867830395698547, + "num_tokens": 26781167.0, + "step": 14132, + "train/ce_loss": 0.5804487466812134 + }, + { + "epoch": 1.3972711093533716, + "step": 14132, + "train/sim_loss": 0.06688195466995239 + }, + { + "epoch": 1.3972711093533716, + "step": 14132, + "train/total_loss": 0.12492683529853821 + }, + { + "entropy": 8.754179954528809, + "epoch": 1.397369982202887, + "mean_token_accuracy": 0.8543599247932434, + "num_tokens": 26789172.0, + "step": 14133, + "train/ce_loss": 0.4011830687522888 + }, + { + "epoch": 1.397369982202887, + "step": 14133, + "train/sim_loss": 0.08824604749679565 + }, + { + "epoch": 1.397369982202887, + "step": 14133, + "train/total_loss": 0.12836435437202454 + }, + { + "entropy": 9.822469711303711, + "epoch": 1.3974688550524026, + "mean_token_accuracy": 0.8569682240486145, + "num_tokens": 26809331.0, + "step": 14134, + "train/ce_loss": 0.38028544187545776 + }, + { + "epoch": 1.3974688550524026, + "step": 14134, + "train/sim_loss": 0.047749876976013184 + }, + { + "epoch": 1.3974688550524026, + "step": 14134, + "train/total_loss": 0.08577842265367508 + }, + { + "entropy": 9.706966400146484, + "epoch": 1.397567727901918, + "mean_token_accuracy": 0.8450331091880798, + "num_tokens": 26828930.0, + "step": 14135, + "train/ce_loss": 0.5403458476066589 + }, + { + "epoch": 1.397567727901918, + "step": 14135, + "train/sim_loss": 0.01358330249786377 + }, + { + "epoch": 1.397567727901918, + "step": 14135, + "train/total_loss": 0.06761789321899414 + }, + { + "entropy": 9.319454193115234, + "epoch": 1.3976666007514336, + "mean_token_accuracy": 0.831364095211029, + "num_tokens": 26845349.0, + "step": 14136, + "train/ce_loss": 0.4463956952095032 + }, + { + "epoch": 1.3976666007514336, + "step": 14136, + "train/sim_loss": 0.02828383445739746 + }, + { + "epoch": 1.3976666007514336, + "step": 14136, + "train/total_loss": 0.07292340695858002 + }, + { + "entropy": 9.080450057983398, + "epoch": 1.3977654736009493, + "mean_token_accuracy": 0.8497174978256226, + "num_tokens": 26855746.0, + "step": 14137, + "train/ce_loss": 0.2929580509662628 + }, + { + "epoch": 1.3977654736009493, + "step": 14137, + "train/sim_loss": 0.05523878335952759 + }, + { + "epoch": 1.3977654736009493, + "step": 14137, + "train/total_loss": 0.08453458547592163 + }, + { + "entropy": 9.257818222045898, + "epoch": 1.3978643464504648, + "mean_token_accuracy": 0.8212974071502686, + "num_tokens": 26871531.0, + "step": 14138, + "train/ce_loss": 0.40176519751548767 + }, + { + "epoch": 1.3978643464504648, + "step": 14138, + "train/sim_loss": 0.012143909931182861 + }, + { + "epoch": 1.3978643464504648, + "step": 14138, + "train/total_loss": 0.05232043191790581 + }, + { + "entropy": 9.46998405456543, + "epoch": 1.3979632192999802, + "mean_token_accuracy": 0.8257978558540344, + "num_tokens": 26883377.0, + "step": 14139, + "train/ce_loss": 0.4190909266471863 + }, + { + "epoch": 1.3979632192999802, + "step": 14139, + "train/sim_loss": 0.06797409057617188 + }, + { + "epoch": 1.3979632192999802, + "step": 14139, + "train/total_loss": 0.10988318920135498 + }, + { + "epoch": 1.3980620921494957, + "grad_norm": 0.7345295548439026, + "learning_rate": 6.5067002917470214e-06, + "loss": 0.094, + "step": 14140 + }, + { + "entropy": 9.53476619720459, + "epoch": 1.3980620921494957, + "mean_token_accuracy": 0.8163265585899353, + "num_tokens": 26895530.0, + "step": 14140, + "train/ce_loss": 1.1091632843017578 + }, + { + "epoch": 1.3980620921494957, + "step": 14140, + "train/sim_loss": 0.05519622564315796 + }, + { + "epoch": 1.3980620921494957, + "step": 14140, + "train/total_loss": 0.16611255705356598 + }, + { + "entropy": 9.334039688110352, + "epoch": 1.3981609649990112, + "mean_token_accuracy": 0.8335570693016052, + "num_tokens": 26908029.0, + "step": 14141, + "train/ce_loss": 0.5155856013298035 + }, + { + "epoch": 1.3981609649990112, + "step": 14141, + "train/sim_loss": 0.04421931505203247 + }, + { + "epoch": 1.3981609649990112, + "step": 14141, + "train/total_loss": 0.09577787667512894 + }, + { + "entropy": 9.18412971496582, + "epoch": 1.398259837848527, + "mean_token_accuracy": 0.8537054657936096, + "num_tokens": 26922052.0, + "step": 14142, + "train/ce_loss": 0.43836700916290283 + }, + { + "epoch": 1.398259837848527, + "step": 14142, + "train/sim_loss": 0.06558454036712646 + }, + { + "epoch": 1.398259837848527, + "step": 14142, + "train/total_loss": 0.10942123830318451 + }, + { + "entropy": 9.586336135864258, + "epoch": 1.3983587106980422, + "mean_token_accuracy": 0.840723991394043, + "num_tokens": 26938452.0, + "step": 14143, + "train/ce_loss": 0.8287760019302368 + }, + { + "epoch": 1.3983587106980422, + "step": 14143, + "train/sim_loss": 0.05059456825256348 + }, + { + "epoch": 1.3983587106980422, + "step": 14143, + "train/total_loss": 0.13347217440605164 + }, + { + "entropy": 9.962640762329102, + "epoch": 1.398457583547558, + "mean_token_accuracy": 0.9239766001701355, + "num_tokens": 26949711.0, + "step": 14144, + "train/ce_loss": 0.36340612173080444 + }, + { + "epoch": 1.398457583547558, + "step": 14144, + "train/sim_loss": 0.020492494106292725 + }, + { + "epoch": 1.398457583547558, + "step": 14144, + "train/total_loss": 0.05683310702443123 + }, + { + "entropy": 9.328707695007324, + "epoch": 1.3985564563970734, + "mean_token_accuracy": 0.7987878918647766, + "num_tokens": 26959304.0, + "step": 14145, + "train/ce_loss": 0.7212763428688049 + }, + { + "epoch": 1.3985564563970734, + "step": 14145, + "train/sim_loss": 0.10036635398864746 + }, + { + "epoch": 1.3985564563970734, + "step": 14145, + "train/total_loss": 0.17249399423599243 + }, + { + "entropy": 9.239861488342285, + "epoch": 1.3986553292465889, + "mean_token_accuracy": 0.8591885566711426, + "num_tokens": 26972393.0, + "step": 14146, + "train/ce_loss": 0.3850778341293335 + }, + { + "epoch": 1.3986553292465889, + "step": 14146, + "train/sim_loss": 0.01769310235977173 + }, + { + "epoch": 1.3986553292465889, + "step": 14146, + "train/total_loss": 0.05620088800787926 + }, + { + "entropy": 9.480884552001953, + "epoch": 1.3987542020961043, + "mean_token_accuracy": 0.9122516512870789, + "num_tokens": 26983681.0, + "step": 14147, + "train/ce_loss": 0.32706865668296814 + }, + { + "epoch": 1.3987542020961043, + "step": 14147, + "train/sim_loss": 0.0642848014831543 + }, + { + "epoch": 1.3987542020961043, + "step": 14147, + "train/total_loss": 0.09699167311191559 + }, + { + "entropy": 9.072394371032715, + "epoch": 1.3988530749456198, + "mean_token_accuracy": 0.8615384697914124, + "num_tokens": 26997677.0, + "step": 14148, + "train/ce_loss": 0.10032809525728226 + }, + { + "epoch": 1.3988530749456198, + "step": 14148, + "train/sim_loss": 0.028287172317504883 + }, + { + "epoch": 1.3988530749456198, + "step": 14148, + "train/total_loss": 0.03831998258829117 + }, + { + "entropy": 9.815958023071289, + "epoch": 1.3989519477951355, + "mean_token_accuracy": 0.8973606824874878, + "num_tokens": 27011793.0, + "step": 14149, + "train/ce_loss": 7.592925612698309e-07 + }, + { + "epoch": 1.3989519477951355, + "step": 14149, + "train/sim_loss": 0.036225199699401855 + }, + { + "epoch": 1.3989519477951355, + "step": 14149, + "train/total_loss": 0.036225274205207825 + }, + { + "entropy": 9.016768455505371, + "epoch": 1.399050820644651, + "mean_token_accuracy": 0.8404977321624756, + "num_tokens": 27024405.0, + "step": 14150, + "train/ce_loss": 0.5143503546714783 + }, + { + "epoch": 1.399050820644651, + "step": 14150, + "train/sim_loss": 0.06525516510009766 + }, + { + "epoch": 1.399050820644651, + "step": 14150, + "train/total_loss": 0.11669020354747772 + }, + { + "entropy": 9.339387893676758, + "epoch": 1.3991496934941665, + "mean_token_accuracy": 0.8741188049316406, + "num_tokens": 27038687.0, + "step": 14151, + "train/ce_loss": 0.45462456345558167 + }, + { + "epoch": 1.3991496934941665, + "step": 14151, + "train/sim_loss": 0.03202521800994873 + }, + { + "epoch": 1.3991496934941665, + "step": 14151, + "train/total_loss": 0.07748767733573914 + }, + { + "entropy": 9.329050064086914, + "epoch": 1.399248566343682, + "mean_token_accuracy": 0.8062015771865845, + "num_tokens": 27055551.0, + "step": 14152, + "train/ce_loss": 0.5252780318260193 + }, + { + "epoch": 1.399248566343682, + "step": 14152, + "train/sim_loss": 0.036393940448760986 + }, + { + "epoch": 1.399248566343682, + "step": 14152, + "train/total_loss": 0.08892174065113068 + }, + { + "entropy": 9.586254119873047, + "epoch": 1.3993474391931975, + "mean_token_accuracy": 0.8568507432937622, + "num_tokens": 27062682.0, + "step": 14153, + "train/ce_loss": 3.2992049909807974e-06 + }, + { + "epoch": 1.3993474391931975, + "step": 14153, + "train/sim_loss": 0.012563705444335938 + }, + { + "epoch": 1.3993474391931975, + "step": 14153, + "train/total_loss": 0.012564035132527351 + }, + { + "entropy": 9.237415313720703, + "epoch": 1.3994463120427132, + "mean_token_accuracy": 0.9522935748100281, + "num_tokens": 27071329.0, + "step": 14154, + "train/ce_loss": 0.4352723956108093 + }, + { + "epoch": 1.3994463120427132, + "step": 14154, + "train/sim_loss": 0.07825767993927002 + }, + { + "epoch": 1.3994463120427132, + "step": 14154, + "train/total_loss": 0.12178492546081543 + }, + { + "entropy": 9.939704895019531, + "epoch": 1.3995451848922285, + "mean_token_accuracy": 0.8577648997306824, + "num_tokens": 27084753.0, + "step": 14155, + "train/ce_loss": 3.0796726946391573e-07 + }, + { + "epoch": 1.3995451848922285, + "step": 14155, + "train/sim_loss": 0.01684170961380005 + }, + { + "epoch": 1.3995451848922285, + "step": 14155, + "train/total_loss": 0.016841741278767586 + }, + { + "entropy": 9.040067672729492, + "epoch": 1.3996440577417442, + "mean_token_accuracy": 0.9158415794372559, + "num_tokens": 27093332.0, + "step": 14156, + "train/ce_loss": 0.20001405477523804 + }, + { + "epoch": 1.3996440577417442, + "step": 14156, + "train/sim_loss": 0.04807126522064209 + }, + { + "epoch": 1.3996440577417442, + "step": 14156, + "train/total_loss": 0.06807266920804977 + }, + { + "entropy": 9.020735740661621, + "epoch": 1.3997429305912596, + "mean_token_accuracy": 0.8571428656578064, + "num_tokens": 27106162.0, + "step": 14157, + "train/ce_loss": 0.27936089038848877 + }, + { + "epoch": 1.3997429305912596, + "step": 14157, + "train/sim_loss": 0.0379335880279541 + }, + { + "epoch": 1.3997429305912596, + "step": 14157, + "train/total_loss": 0.06586967408657074 + }, + { + "entropy": 8.805767059326172, + "epoch": 1.3998418034407751, + "mean_token_accuracy": 0.8250652551651001, + "num_tokens": 27118632.0, + "step": 14158, + "train/ce_loss": 0.6047290563583374 + }, + { + "epoch": 1.3998418034407751, + "step": 14158, + "train/sim_loss": 0.016580581665039062 + }, + { + "epoch": 1.3998418034407751, + "step": 14158, + "train/total_loss": 0.0770534873008728 + }, + { + "entropy": 9.324532508850098, + "epoch": 1.3999406762902906, + "mean_token_accuracy": 0.8474264740943909, + "num_tokens": 27135188.0, + "step": 14159, + "train/ce_loss": 0.6736559867858887 + }, + { + "epoch": 1.3999406762902906, + "step": 14159, + "train/sim_loss": 0.028274178504943848 + }, + { + "epoch": 1.3999406762902906, + "step": 14159, + "train/total_loss": 0.09563978016376495 + }, + { + "epoch": 1.400039549139806, + "grad_norm": 0.5910269618034363, + "learning_rate": 6.501755426989073e-06, + "loss": 0.0841, + "step": 14160 + }, + { + "entropy": 9.322026252746582, + "epoch": 1.400039549139806, + "mean_token_accuracy": 0.8643791079521179, + "num_tokens": 27148273.0, + "step": 14160, + "train/ce_loss": 0.5815840363502502 + }, + { + "epoch": 1.400039549139806, + "step": 14160, + "train/sim_loss": 0.05074191093444824 + }, + { + "epoch": 1.400039549139806, + "step": 14160, + "train/total_loss": 0.10890031605958939 + }, + { + "entropy": 9.311187744140625, + "epoch": 1.4001384219893218, + "mean_token_accuracy": 0.8952381014823914, + "num_tokens": 27164006.0, + "step": 14161, + "train/ce_loss": 0.20482639968395233 + }, + { + "epoch": 1.4001384219893218, + "step": 14161, + "train/sim_loss": 0.03773391246795654 + }, + { + "epoch": 1.4001384219893218, + "step": 14161, + "train/total_loss": 0.058216553181409836 + }, + { + "entropy": 8.684549331665039, + "epoch": 1.4002372948388373, + "mean_token_accuracy": 0.8861788511276245, + "num_tokens": 27173046.0, + "step": 14162, + "train/ce_loss": 0.36284297704696655 + }, + { + "epoch": 1.4002372948388373, + "step": 14162, + "train/sim_loss": 0.048328280448913574 + }, + { + "epoch": 1.4002372948388373, + "step": 14162, + "train/total_loss": 0.08461257815361023 + }, + { + "entropy": 9.619462013244629, + "epoch": 1.4003361676883528, + "mean_token_accuracy": 0.8744939565658569, + "num_tokens": 27181863.0, + "step": 14163, + "train/ce_loss": 0.7575337886810303 + }, + { + "epoch": 1.4003361676883528, + "step": 14163, + "train/sim_loss": 0.05691969394683838 + }, + { + "epoch": 1.4003361676883528, + "step": 14163, + "train/total_loss": 0.13267308473587036 + }, + { + "entropy": 9.14434814453125, + "epoch": 1.4004350405378683, + "mean_token_accuracy": 0.8536251783370972, + "num_tokens": 27194354.0, + "step": 14164, + "train/ce_loss": 0.4435311257839203 + }, + { + "epoch": 1.4004350405378683, + "step": 14164, + "train/sim_loss": 0.01723867654800415 + }, + { + "epoch": 1.4004350405378683, + "step": 14164, + "train/total_loss": 0.06159178912639618 + }, + { + "entropy": 9.215421676635742, + "epoch": 1.4005339133873838, + "mean_token_accuracy": 0.8814504742622375, + "num_tokens": 27202477.0, + "step": 14165, + "train/ce_loss": 0.420479953289032 + }, + { + "epoch": 1.4005339133873838, + "step": 14165, + "train/sim_loss": 0.011348724365234375 + }, + { + "epoch": 1.4005339133873838, + "step": 14165, + "train/total_loss": 0.05339672043919563 + }, + { + "entropy": 9.413853645324707, + "epoch": 1.4006327862368995, + "mean_token_accuracy": 0.838918924331665, + "num_tokens": 27217669.0, + "step": 14166, + "train/ce_loss": 0.33635854721069336 + }, + { + "epoch": 1.4006327862368995, + "step": 14166, + "train/sim_loss": 0.029948115348815918 + }, + { + "epoch": 1.4006327862368995, + "step": 14166, + "train/total_loss": 0.06358397006988525 + }, + { + "entropy": 9.127863883972168, + "epoch": 1.400731659086415, + "mean_token_accuracy": 0.8860887289047241, + "num_tokens": 27230403.0, + "step": 14167, + "train/ce_loss": 0.5573869347572327 + }, + { + "epoch": 1.400731659086415, + "step": 14167, + "train/sim_loss": 0.051133573055267334 + }, + { + "epoch": 1.400731659086415, + "step": 14167, + "train/total_loss": 0.10687226802110672 + }, + { + "entropy": 9.414217948913574, + "epoch": 1.4008305319359304, + "mean_token_accuracy": 0.8888888955116272, + "num_tokens": 27245198.0, + "step": 14168, + "train/ce_loss": 0.07046722620725632 + }, + { + "epoch": 1.4008305319359304, + "step": 14168, + "train/sim_loss": 0.022479355335235596 + }, + { + "epoch": 1.4008305319359304, + "step": 14168, + "train/total_loss": 0.029526077210903168 + }, + { + "entropy": 9.023843765258789, + "epoch": 1.400929404785446, + "mean_token_accuracy": 0.8640226721763611, + "num_tokens": 27257272.0, + "step": 14169, + "train/ce_loss": 0.8677409887313843 + }, + { + "epoch": 1.400929404785446, + "step": 14169, + "train/sim_loss": 0.05940401554107666 + }, + { + "epoch": 1.400929404785446, + "step": 14169, + "train/total_loss": 0.14617812633514404 + }, + { + "entropy": 9.713257789611816, + "epoch": 1.4010282776349614, + "mean_token_accuracy": 0.8571428656578064, + "num_tokens": 27272960.0, + "step": 14170, + "train/ce_loss": 4.689993318152119e-07 + }, + { + "epoch": 1.4010282776349614, + "step": 14170, + "train/sim_loss": 0.01860123872756958 + }, + { + "epoch": 1.4010282776349614, + "step": 14170, + "train/total_loss": 0.01860128529369831 + }, + { + "entropy": 9.445023536682129, + "epoch": 1.401127150484477, + "mean_token_accuracy": 0.855614960193634, + "num_tokens": 27288366.0, + "step": 14171, + "train/ce_loss": 0.4481832683086395 + }, + { + "epoch": 1.401127150484477, + "step": 14171, + "train/sim_loss": 0.07221800088882446 + }, + { + "epoch": 1.401127150484477, + "step": 14171, + "train/total_loss": 0.11703632771968842 + }, + { + "entropy": 9.140592575073242, + "epoch": 1.4012260233339924, + "mean_token_accuracy": 0.8625377416610718, + "num_tokens": 27301141.0, + "step": 14172, + "train/ce_loss": 0.587911069393158 + }, + { + "epoch": 1.4012260233339924, + "step": 14172, + "train/sim_loss": 0.03548729419708252 + }, + { + "epoch": 1.4012260233339924, + "step": 14172, + "train/total_loss": 0.09427840262651443 + }, + { + "entropy": 9.35733699798584, + "epoch": 1.401324896183508, + "mean_token_accuracy": 0.821471631526947, + "num_tokens": 27313483.0, + "step": 14173, + "train/ce_loss": 0.5477089881896973 + }, + { + "epoch": 1.401324896183508, + "step": 14173, + "train/sim_loss": 0.049971938133239746 + }, + { + "epoch": 1.401324896183508, + "step": 14173, + "train/total_loss": 0.10474283993244171 + }, + { + "entropy": 9.520551681518555, + "epoch": 1.4014237690330236, + "mean_token_accuracy": 0.8942857384681702, + "num_tokens": 27326679.0, + "step": 14174, + "train/ce_loss": 0.6945672631263733 + }, + { + "epoch": 1.4014237690330236, + "step": 14174, + "train/sim_loss": 0.07798570394515991 + }, + { + "epoch": 1.4014237690330236, + "step": 14174, + "train/total_loss": 0.14744243025779724 + }, + { + "entropy": 9.999822616577148, + "epoch": 1.401522641882539, + "mean_token_accuracy": 0.875, + "num_tokens": 27339290.0, + "step": 14175, + "train/ce_loss": 0.5413763523101807 + }, + { + "epoch": 1.401522641882539, + "step": 14175, + "train/sim_loss": 0.10713762044906616 + }, + { + "epoch": 1.401522641882539, + "step": 14175, + "train/total_loss": 0.161275252699852 + }, + { + "entropy": 8.920557975769043, + "epoch": 1.4016215147320545, + "mean_token_accuracy": 0.8911495208740234, + "num_tokens": 27355865.0, + "step": 14176, + "train/ce_loss": 0.46847832202911377 + }, + { + "epoch": 1.4016215147320545, + "step": 14176, + "train/sim_loss": 0.017401576042175293 + }, + { + "epoch": 1.4016215147320545, + "step": 14176, + "train/total_loss": 0.06424941122531891 + }, + { + "entropy": 9.197328567504883, + "epoch": 1.40172038758157, + "mean_token_accuracy": 0.8907034993171692, + "num_tokens": 27364397.0, + "step": 14177, + "train/ce_loss": 0.1925896257162094 + }, + { + "epoch": 1.40172038758157, + "step": 14177, + "train/sim_loss": 0.030437469482421875 + }, + { + "epoch": 1.40172038758157, + "step": 14177, + "train/total_loss": 0.0496964305639267 + }, + { + "entropy": 9.730057716369629, + "epoch": 1.4018192604310857, + "mean_token_accuracy": 0.850632905960083, + "num_tokens": 27381616.0, + "step": 14178, + "train/ce_loss": 0.6101085543632507 + }, + { + "epoch": 1.4018192604310857, + "step": 14178, + "train/sim_loss": 0.037182509899139404 + }, + { + "epoch": 1.4018192604310857, + "step": 14178, + "train/total_loss": 0.09819336235523224 + }, + { + "entropy": 9.580209732055664, + "epoch": 1.4019181332806012, + "mean_token_accuracy": 0.9064327478408813, + "num_tokens": 27399190.0, + "step": 14179, + "train/ce_loss": 0.5726090669631958 + }, + { + "epoch": 1.4019181332806012, + "step": 14179, + "train/sim_loss": 0.03652697801589966 + }, + { + "epoch": 1.4019181332806012, + "step": 14179, + "train/total_loss": 0.09378788620233536 + }, + { + "epoch": 1.4020170061301167, + "grad_norm": 0.7239306569099426, + "learning_rate": 6.496810562231124e-06, + "loss": 0.0796, + "step": 14180 + }, + { + "entropy": 9.06196403503418, + "epoch": 1.4020170061301167, + "mean_token_accuracy": 0.835518479347229, + "num_tokens": 27409357.0, + "step": 14180, + "train/ce_loss": 0.2558625340461731 + }, + { + "epoch": 1.4020170061301167, + "step": 14180, + "train/sim_loss": 0.05019569396972656 + }, + { + "epoch": 1.4020170061301167, + "step": 14180, + "train/total_loss": 0.07578194886445999 + }, + { + "entropy": 9.452101707458496, + "epoch": 1.4021158789796322, + "mean_token_accuracy": 0.9005168080329895, + "num_tokens": 27426687.0, + "step": 14181, + "train/ce_loss": 0.509581983089447 + }, + { + "epoch": 1.4021158789796322, + "step": 14181, + "train/sim_loss": 0.06066262722015381 + }, + { + "epoch": 1.4021158789796322, + "step": 14181, + "train/total_loss": 0.11162082850933075 + }, + { + "entropy": 9.453571319580078, + "epoch": 1.4022147518291477, + "mean_token_accuracy": 0.8208661675453186, + "num_tokens": 27439698.0, + "step": 14182, + "train/ce_loss": 0.41606405377388 + }, + { + "epoch": 1.4022147518291477, + "step": 14182, + "train/sim_loss": 0.03730803728103638 + }, + { + "epoch": 1.4022147518291477, + "step": 14182, + "train/total_loss": 0.07891444861888885 + }, + { + "entropy": 9.319069862365723, + "epoch": 1.4023136246786632, + "mean_token_accuracy": 0.8188405632972717, + "num_tokens": 27456414.0, + "step": 14183, + "train/ce_loss": 0.553254246711731 + }, + { + "epoch": 1.4023136246786632, + "step": 14183, + "train/sim_loss": 0.03456830978393555 + }, + { + "epoch": 1.4023136246786632, + "step": 14183, + "train/total_loss": 0.08989373594522476 + }, + { + "entropy": 9.711193084716797, + "epoch": 1.4024124975281786, + "mean_token_accuracy": 0.8632258176803589, + "num_tokens": 27478834.0, + "step": 14184, + "train/ce_loss": 0.5878923535346985 + }, + { + "epoch": 1.4024124975281786, + "step": 14184, + "train/sim_loss": 0.04541778564453125 + }, + { + "epoch": 1.4024124975281786, + "step": 14184, + "train/total_loss": 0.10420702397823334 + }, + { + "entropy": 9.660788536071777, + "epoch": 1.4025113703776944, + "mean_token_accuracy": 0.9113923907279968, + "num_tokens": 27490656.0, + "step": 14185, + "train/ce_loss": 0.25232192873954773 + }, + { + "epoch": 1.4025113703776944, + "step": 14185, + "train/sim_loss": 0.016915202140808105 + }, + { + "epoch": 1.4025113703776944, + "step": 14185, + "train/total_loss": 0.04214739799499512 + }, + { + "entropy": 9.508828163146973, + "epoch": 1.4026102432272098, + "mean_token_accuracy": 0.8282828330993652, + "num_tokens": 27501781.0, + "step": 14186, + "train/ce_loss": 0.47912514209747314 + }, + { + "epoch": 1.4026102432272098, + "step": 14186, + "train/sim_loss": 0.07863616943359375 + }, + { + "epoch": 1.4026102432272098, + "step": 14186, + "train/total_loss": 0.1265486776828766 + }, + { + "entropy": 9.679401397705078, + "epoch": 1.4027091160767253, + "mean_token_accuracy": 0.817307710647583, + "num_tokens": 27515464.0, + "step": 14187, + "train/ce_loss": 1.3371645763982087e-06 + }, + { + "epoch": 1.4027091160767253, + "step": 14187, + "train/sim_loss": 0.02718961238861084 + }, + { + "epoch": 1.4027091160767253, + "step": 14187, + "train/total_loss": 0.027189746499061584 + }, + { + "entropy": 8.831350326538086, + "epoch": 1.4028079889262408, + "mean_token_accuracy": 0.8260019421577454, + "num_tokens": 27528193.0, + "step": 14188, + "train/ce_loss": 0.5756431221961975 + }, + { + "epoch": 1.4028079889262408, + "step": 14188, + "train/sim_loss": 0.06856465339660645 + }, + { + "epoch": 1.4028079889262408, + "step": 14188, + "train/total_loss": 0.12612897157669067 + }, + { + "entropy": 9.718804359436035, + "epoch": 1.4029068617757563, + "mean_token_accuracy": 0.8651399612426758, + "num_tokens": 27536829.0, + "step": 14189, + "train/ce_loss": 2.499429854196933e-07 + }, + { + "epoch": 1.4029068617757563, + "step": 14189, + "train/sim_loss": 0.014994025230407715 + }, + { + "epoch": 1.4029068617757563, + "step": 14189, + "train/total_loss": 0.01499405037611723 + }, + { + "entropy": 9.01399040222168, + "epoch": 1.403005734625272, + "mean_token_accuracy": 0.7895256876945496, + "num_tokens": 27549018.0, + "step": 14190, + "train/ce_loss": 0.4924055337905884 + }, + { + "epoch": 1.403005734625272, + "step": 14190, + "train/sim_loss": 0.017076492309570312 + }, + { + "epoch": 1.403005734625272, + "step": 14190, + "train/total_loss": 0.06631705164909363 + }, + { + "entropy": 9.353961944580078, + "epoch": 1.4031046074747875, + "mean_token_accuracy": 0.866847813129425, + "num_tokens": 27560982.0, + "step": 14191, + "train/ce_loss": 0.3337879478931427 + }, + { + "epoch": 1.4031046074747875, + "step": 14191, + "train/sim_loss": 0.012837111949920654 + }, + { + "epoch": 1.4031046074747875, + "step": 14191, + "train/total_loss": 0.046215906739234924 + }, + { + "entropy": 8.779327392578125, + "epoch": 1.403203480324303, + "mean_token_accuracy": 0.830258309841156, + "num_tokens": 27568989.0, + "step": 14192, + "train/ce_loss": 0.8403013944625854 + }, + { + "epoch": 1.403203480324303, + "step": 14192, + "train/sim_loss": 0.0627240538597107 + }, + { + "epoch": 1.403203480324303, + "step": 14192, + "train/total_loss": 0.1467542052268982 + }, + { + "entropy": 9.433804512023926, + "epoch": 1.4033023531738185, + "mean_token_accuracy": 0.8598442673683167, + "num_tokens": 27583108.0, + "step": 14193, + "train/ce_loss": 0.44933944940567017 + }, + { + "epoch": 1.4033023531738185, + "step": 14193, + "train/sim_loss": 0.04882270097732544 + }, + { + "epoch": 1.4033023531738185, + "step": 14193, + "train/total_loss": 0.09375664591789246 + }, + { + "entropy": 8.95838737487793, + "epoch": 1.403401226023334, + "mean_token_accuracy": 0.8671978712081909, + "num_tokens": 27589766.0, + "step": 14194, + "train/ce_loss": 0.3195834755897522 + }, + { + "epoch": 1.403401226023334, + "step": 14194, + "train/sim_loss": 0.01691746711730957 + }, + { + "epoch": 1.403401226023334, + "step": 14194, + "train/total_loss": 0.04887581616640091 + }, + { + "entropy": 9.290105819702148, + "epoch": 1.4035000988728494, + "mean_token_accuracy": 0.8065326809883118, + "num_tokens": 27604763.0, + "step": 14195, + "train/ce_loss": 0.5561066269874573 + }, + { + "epoch": 1.4035000988728494, + "step": 14195, + "train/sim_loss": 0.02982938289642334 + }, + { + "epoch": 1.4035000988728494, + "step": 14195, + "train/total_loss": 0.08544004708528519 + }, + { + "entropy": 9.474761009216309, + "epoch": 1.403598971722365, + "mean_token_accuracy": 0.8832487463951111, + "num_tokens": 27620447.0, + "step": 14196, + "train/ce_loss": 0.43080461025238037 + }, + { + "epoch": 1.403598971722365, + "step": 14196, + "train/sim_loss": 0.020471692085266113 + }, + { + "epoch": 1.403598971722365, + "step": 14196, + "train/total_loss": 0.06355215609073639 + }, + { + "entropy": 9.196749687194824, + "epoch": 1.4036978445718806, + "mean_token_accuracy": 0.8366164565086365, + "num_tokens": 27629907.0, + "step": 14197, + "train/ce_loss": 0.4853244721889496 + }, + { + "epoch": 1.4036978445718806, + "step": 14197, + "train/sim_loss": 0.04002225399017334 + }, + { + "epoch": 1.4036978445718806, + "step": 14197, + "train/total_loss": 0.08855470269918442 + }, + { + "entropy": 8.442893981933594, + "epoch": 1.4037967174213961, + "mean_token_accuracy": 0.8258132338523865, + "num_tokens": 27638294.0, + "step": 14198, + "train/ce_loss": 0.39905834197998047 + }, + { + "epoch": 1.4037967174213961, + "step": 14198, + "train/sim_loss": 0.03891193866729736 + }, + { + "epoch": 1.4037967174213961, + "step": 14198, + "train/total_loss": 0.07881776988506317 + }, + { + "entropy": 9.377008438110352, + "epoch": 1.4038955902709116, + "mean_token_accuracy": 0.8316383957862854, + "num_tokens": 27650977.0, + "step": 14199, + "train/ce_loss": 0.35153621435165405 + }, + { + "epoch": 1.4038955902709116, + "step": 14199, + "train/sim_loss": 0.051178574562072754 + }, + { + "epoch": 1.4038955902709116, + "step": 14199, + "train/total_loss": 0.08633220195770264 + }, + { + "epoch": 1.403994463120427, + "grad_norm": 0.6151056289672852, + "learning_rate": 6.491865697473174e-06, + "loss": 0.0896, + "step": 14200 + }, + { + "entropy": 9.024818420410156, + "epoch": 1.403994463120427, + "mean_token_accuracy": 0.8580247163772583, + "num_tokens": 27661793.0, + "step": 14200, + "train/ce_loss": 0.2031448483467102 + }, + { + "epoch": 1.403994463120427, + "step": 14200, + "train/sim_loss": 0.029285669326782227 + }, + { + "epoch": 1.403994463120427, + "step": 14200, + "train/total_loss": 0.04960015416145325 + }, + { + "entropy": 9.436050415039062, + "epoch": 1.4040933359699426, + "mean_token_accuracy": 0.8520709872245789, + "num_tokens": 27672484.0, + "step": 14201, + "train/ce_loss": 0.18369656801223755 + }, + { + "epoch": 1.4040933359699426, + "step": 14201, + "train/sim_loss": 0.014928460121154785 + }, + { + "epoch": 1.4040933359699426, + "step": 14201, + "train/total_loss": 0.03329811990261078 + }, + { + "entropy": 9.686580657958984, + "epoch": 1.4041922088194583, + "mean_token_accuracy": 0.9232209920883179, + "num_tokens": 27688722.0, + "step": 14202, + "train/ce_loss": 9.384598911310604e-07 + }, + { + "epoch": 1.4041922088194583, + "step": 14202, + "train/sim_loss": 0.02097344398498535 + }, + { + "epoch": 1.4041922088194583, + "step": 14202, + "train/total_loss": 0.020973537117242813 + }, + { + "entropy": 9.349278450012207, + "epoch": 1.4042910816689738, + "mean_token_accuracy": 0.8253806829452515, + "num_tokens": 27704327.0, + "step": 14203, + "train/ce_loss": 0.5760983824729919 + }, + { + "epoch": 1.4042910816689738, + "step": 14203, + "train/sim_loss": 0.045537590980529785 + }, + { + "epoch": 1.4042910816689738, + "step": 14203, + "train/total_loss": 0.10314743220806122 + }, + { + "entropy": 9.183860778808594, + "epoch": 1.4043899545184892, + "mean_token_accuracy": 0.8640081882476807, + "num_tokens": 27714433.0, + "step": 14204, + "train/ce_loss": 0.2678177058696747 + }, + { + "epoch": 1.4043899545184892, + "step": 14204, + "train/sim_loss": 0.009834885597229004 + }, + { + "epoch": 1.4043899545184892, + "step": 14204, + "train/total_loss": 0.03661665692925453 + }, + { + "entropy": 8.659086227416992, + "epoch": 1.4044888273680047, + "mean_token_accuracy": 0.8157894611358643, + "num_tokens": 27723114.0, + "step": 14205, + "train/ce_loss": 0.4051755368709564 + }, + { + "epoch": 1.4044888273680047, + "step": 14205, + "train/sim_loss": 0.01604539155960083 + }, + { + "epoch": 1.4044888273680047, + "step": 14205, + "train/total_loss": 0.05656294524669647 + }, + { + "entropy": 9.388836860656738, + "epoch": 1.4045877002175202, + "mean_token_accuracy": 0.7744565010070801, + "num_tokens": 27733556.0, + "step": 14206, + "train/ce_loss": 7.508503472308803e-07 + }, + { + "epoch": 1.4045877002175202, + "step": 14206, + "train/sim_loss": 0.04637080430984497 + }, + { + "epoch": 1.4045877002175202, + "step": 14206, + "train/total_loss": 0.04637087881565094 + }, + { + "entropy": 9.538055419921875, + "epoch": 1.404686573067036, + "mean_token_accuracy": 0.8601108193397522, + "num_tokens": 27751556.0, + "step": 14207, + "train/ce_loss": 0.14420342445373535 + }, + { + "epoch": 1.404686573067036, + "step": 14207, + "train/sim_loss": 0.04985696077346802 + }, + { + "epoch": 1.404686573067036, + "step": 14207, + "train/total_loss": 0.06427730619907379 + }, + { + "entropy": 9.59081745147705, + "epoch": 1.4047854459165512, + "mean_token_accuracy": 0.9023255705833435, + "num_tokens": 27765542.0, + "step": 14208, + "train/ce_loss": 2.9828318020008737e-06 + }, + { + "epoch": 1.4047854459165512, + "step": 14208, + "train/sim_loss": 0.05160146951675415 + }, + { + "epoch": 1.4047854459165512, + "step": 14208, + "train/total_loss": 0.05160176753997803 + }, + { + "entropy": 9.601303100585938, + "epoch": 1.404884318766067, + "mean_token_accuracy": 0.8634615540504456, + "num_tokens": 27782577.0, + "step": 14209, + "train/ce_loss": 0.5266227126121521 + }, + { + "epoch": 1.404884318766067, + "step": 14209, + "train/sim_loss": 0.07945060729980469 + }, + { + "epoch": 1.404884318766067, + "step": 14209, + "train/total_loss": 0.13211287558078766 + }, + { + "entropy": 9.45840072631836, + "epoch": 1.4049831916155824, + "mean_token_accuracy": 0.8219178318977356, + "num_tokens": 27790978.0, + "step": 14210, + "train/ce_loss": 0.6380736231803894 + }, + { + "epoch": 1.4049831916155824, + "step": 14210, + "train/sim_loss": 0.0931321382522583 + }, + { + "epoch": 1.4049831916155824, + "step": 14210, + "train/total_loss": 0.15693950653076172 + }, + { + "entropy": 9.425167083740234, + "epoch": 1.4050820644650979, + "mean_token_accuracy": 0.8771384358406067, + "num_tokens": 27799480.0, + "step": 14211, + "train/ce_loss": 0.4648033082485199 + }, + { + "epoch": 1.4050820644650979, + "step": 14211, + "train/sim_loss": 0.024567484855651855 + }, + { + "epoch": 1.4050820644650979, + "step": 14211, + "train/total_loss": 0.0710478127002716 + }, + { + "entropy": 9.55706787109375, + "epoch": 1.4051809373146134, + "mean_token_accuracy": 0.8878718614578247, + "num_tokens": 27814099.0, + "step": 14212, + "train/ce_loss": 1.2100449566787574e-06 + }, + { + "epoch": 1.4051809373146134, + "step": 14212, + "train/sim_loss": 0.045882463455200195 + }, + { + "epoch": 1.4051809373146134, + "step": 14212, + "train/total_loss": 0.045882582664489746 + }, + { + "entropy": 9.265987396240234, + "epoch": 1.4052798101641288, + "mean_token_accuracy": 0.8933823704719543, + "num_tokens": 27823490.0, + "step": 14213, + "train/ce_loss": 0.26309117674827576 + }, + { + "epoch": 1.4052798101641288, + "step": 14213, + "train/sim_loss": 0.03615903854370117 + }, + { + "epoch": 1.4052798101641288, + "step": 14213, + "train/total_loss": 0.06246815621852875 + }, + { + "entropy": 9.02297592163086, + "epoch": 1.4053786830136445, + "mean_token_accuracy": 0.852090060710907, + "num_tokens": 27833244.0, + "step": 14214, + "train/ce_loss": 0.576809287071228 + }, + { + "epoch": 1.4053786830136445, + "step": 14214, + "train/sim_loss": 0.036932408809661865 + }, + { + "epoch": 1.4053786830136445, + "step": 14214, + "train/total_loss": 0.09461334347724915 + }, + { + "entropy": 9.104659080505371, + "epoch": 1.40547755586316, + "mean_token_accuracy": 0.8504273295402527, + "num_tokens": 27843550.0, + "step": 14215, + "train/ce_loss": 0.4653536081314087 + }, + { + "epoch": 1.40547755586316, + "step": 14215, + "train/sim_loss": 0.01907259225845337 + }, + { + "epoch": 1.40547755586316, + "step": 14215, + "train/total_loss": 0.065607950091362 + }, + { + "entropy": 8.867043495178223, + "epoch": 1.4055764287126755, + "mean_token_accuracy": 0.8569868803024292, + "num_tokens": 27852175.0, + "step": 14216, + "train/ce_loss": 0.2209147810935974 + }, + { + "epoch": 1.4055764287126755, + "step": 14216, + "train/sim_loss": 0.06297308206558228 + }, + { + "epoch": 1.4055764287126755, + "step": 14216, + "train/total_loss": 0.08506456017494202 + }, + { + "entropy": 9.305876731872559, + "epoch": 1.405675301562191, + "mean_token_accuracy": 0.8625730872154236, + "num_tokens": 27863908.0, + "step": 14217, + "train/ce_loss": 2.969745196423901e-07 + }, + { + "epoch": 1.405675301562191, + "step": 14217, + "train/sim_loss": 0.034040212631225586 + }, + { + "epoch": 1.405675301562191, + "step": 14217, + "train/total_loss": 0.034040242433547974 + }, + { + "entropy": 9.208902359008789, + "epoch": 1.4057741744117065, + "mean_token_accuracy": 0.8860759735107422, + "num_tokens": 27875832.0, + "step": 14218, + "train/ce_loss": 0.41206979751586914 + }, + { + "epoch": 1.4057741744117065, + "step": 14218, + "train/sim_loss": 0.051875948905944824 + }, + { + "epoch": 1.4057741744117065, + "step": 14218, + "train/total_loss": 0.09308293461799622 + }, + { + "entropy": 9.57554817199707, + "epoch": 1.4058730472612222, + "mean_token_accuracy": 0.8656195402145386, + "num_tokens": 27888942.0, + "step": 14219, + "train/ce_loss": 0.5134620070457458 + }, + { + "epoch": 1.4058730472612222, + "step": 14219, + "train/sim_loss": 0.031425535678863525 + }, + { + "epoch": 1.4058730472612222, + "step": 14219, + "train/total_loss": 0.08277173340320587 + }, + { + "epoch": 1.4059719201107375, + "grad_norm": 0.5564767718315125, + "learning_rate": 6.486920832715225e-06, + "loss": 0.0875, + "step": 14220 + }, + { + "entropy": 9.473921775817871, + "epoch": 1.4059719201107375, + "mean_token_accuracy": 0.7913593053817749, + "num_tokens": 27908772.0, + "step": 14220, + "train/ce_loss": 0.5201936364173889 + }, + { + "epoch": 1.4059719201107375, + "step": 14220, + "train/sim_loss": 0.03775954246520996 + }, + { + "epoch": 1.4059719201107375, + "step": 14220, + "train/total_loss": 0.08977890759706497 + }, + { + "entropy": 9.617219924926758, + "epoch": 1.4060707929602532, + "mean_token_accuracy": 0.8743455410003662, + "num_tokens": 27921494.0, + "step": 14221, + "train/ce_loss": 8.587903153056686e-07 + }, + { + "epoch": 1.4060707929602532, + "step": 14221, + "train/sim_loss": 0.0492480993270874 + }, + { + "epoch": 1.4060707929602532, + "step": 14221, + "train/total_loss": 0.04924818500876427 + }, + { + "entropy": 9.722689628601074, + "epoch": 1.4061696658097687, + "mean_token_accuracy": 0.832812488079071, + "num_tokens": 27938479.0, + "step": 14222, + "train/ce_loss": 7.420597398777318e-07 + }, + { + "epoch": 1.4061696658097687, + "step": 14222, + "train/sim_loss": 0.02514880895614624 + }, + { + "epoch": 1.4061696658097687, + "step": 14222, + "train/total_loss": 0.02514888346195221 + }, + { + "entropy": 9.177149772644043, + "epoch": 1.4062685386592841, + "mean_token_accuracy": 0.8439226746559143, + "num_tokens": 27948963.0, + "step": 14223, + "train/ce_loss": 0.3710871636867523 + }, + { + "epoch": 1.4062685386592841, + "step": 14223, + "train/sim_loss": 0.02628624439239502 + }, + { + "epoch": 1.4062685386592841, + "step": 14223, + "train/total_loss": 0.06339496374130249 + }, + { + "entropy": 9.41274356842041, + "epoch": 1.4063674115087996, + "mean_token_accuracy": 0.8328025341033936, + "num_tokens": 27956014.0, + "step": 14224, + "train/ce_loss": 0.42649728059768677 + }, + { + "epoch": 1.4063674115087996, + "step": 14224, + "train/sim_loss": 0.044586181640625 + }, + { + "epoch": 1.4063674115087996, + "step": 14224, + "train/total_loss": 0.08723591268062592 + }, + { + "entropy": 9.240615844726562, + "epoch": 1.406466284358315, + "mean_token_accuracy": 0.8350126147270203, + "num_tokens": 27970026.0, + "step": 14225, + "train/ce_loss": 0.4747823476791382 + }, + { + "epoch": 1.406466284358315, + "step": 14225, + "train/sim_loss": 0.02812516689300537 + }, + { + "epoch": 1.406466284358315, + "step": 14225, + "train/total_loss": 0.07560340315103531 + }, + { + "entropy": 9.349905014038086, + "epoch": 1.4065651572078308, + "mean_token_accuracy": 0.8415961265563965, + "num_tokens": 27980346.0, + "step": 14226, + "train/ce_loss": 0.5441060066223145 + }, + { + "epoch": 1.4065651572078308, + "step": 14226, + "train/sim_loss": 0.020978450775146484 + }, + { + "epoch": 1.4065651572078308, + "step": 14226, + "train/total_loss": 0.07538905739784241 + }, + { + "entropy": 8.976632118225098, + "epoch": 1.4066640300573463, + "mean_token_accuracy": 0.8708133697509766, + "num_tokens": 27987920.0, + "step": 14227, + "train/ce_loss": 0.43601930141448975 + }, + { + "epoch": 1.4066640300573463, + "step": 14227, + "train/sim_loss": 0.06726586818695068 + }, + { + "epoch": 1.4066640300573463, + "step": 14227, + "train/total_loss": 0.11086779832839966 + }, + { + "entropy": 8.982340812683105, + "epoch": 1.4067629029068618, + "mean_token_accuracy": 0.8377550840377808, + "num_tokens": 27995692.0, + "step": 14228, + "train/ce_loss": 0.4550510346889496 + }, + { + "epoch": 1.4067629029068618, + "step": 14228, + "train/sim_loss": 0.013629257678985596 + }, + { + "epoch": 1.4067629029068618, + "step": 14228, + "train/total_loss": 0.059134360402822495 + }, + { + "entropy": 9.149374961853027, + "epoch": 1.4068617757563773, + "mean_token_accuracy": 0.8581418395042419, + "num_tokens": 28009126.0, + "step": 14229, + "train/ce_loss": 0.3423170745372772 + }, + { + "epoch": 1.4068617757563773, + "step": 14229, + "train/sim_loss": 0.08062756061553955 + }, + { + "epoch": 1.4068617757563773, + "step": 14229, + "train/total_loss": 0.11485926806926727 + }, + { + "entropy": 9.430305480957031, + "epoch": 1.4069606486058928, + "mean_token_accuracy": 0.8530150651931763, + "num_tokens": 28024928.0, + "step": 14230, + "train/ce_loss": 2.911303624841821e-07 + }, + { + "epoch": 1.4069606486058928, + "step": 14230, + "train/sim_loss": 0.04798412322998047 + }, + { + "epoch": 1.4069606486058928, + "step": 14230, + "train/total_loss": 0.047984153032302856 + }, + { + "entropy": 9.20423698425293, + "epoch": 1.4070595214554085, + "mean_token_accuracy": 0.8619102239608765, + "num_tokens": 28041419.0, + "step": 14231, + "train/ce_loss": 0.506929337978363 + }, + { + "epoch": 1.4070595214554085, + "step": 14231, + "train/sim_loss": 0.04560661315917969 + }, + { + "epoch": 1.4070595214554085, + "step": 14231, + "train/total_loss": 0.09629954397678375 + }, + { + "entropy": 9.389885902404785, + "epoch": 1.4071583943049237, + "mean_token_accuracy": 0.8216560482978821, + "num_tokens": 28058383.0, + "step": 14232, + "train/ce_loss": 0.6290305852890015 + }, + { + "epoch": 1.4071583943049237, + "step": 14232, + "train/sim_loss": 0.015353679656982422 + }, + { + "epoch": 1.4071583943049237, + "step": 14232, + "train/total_loss": 0.07825674116611481 + }, + { + "entropy": 8.667591094970703, + "epoch": 1.4072572671544394, + "mean_token_accuracy": 0.8364661931991577, + "num_tokens": 28065859.0, + "step": 14233, + "train/ce_loss": 0.2421276718378067 + }, + { + "epoch": 1.4072572671544394, + "step": 14233, + "train/sim_loss": 0.011673808097839355 + }, + { + "epoch": 1.4072572671544394, + "step": 14233, + "train/total_loss": 0.035886578261852264 + }, + { + "entropy": 10.214483261108398, + "epoch": 1.407356140003955, + "mean_token_accuracy": 0.9251870512962341, + "num_tokens": 28072085.0, + "step": 14234, + "train/ce_loss": 8.050137125792389e-07 + }, + { + "epoch": 1.407356140003955, + "step": 14234, + "train/sim_loss": 0.011264801025390625 + }, + { + "epoch": 1.407356140003955, + "step": 14234, + "train/total_loss": 0.011264881119132042 + }, + { + "entropy": 9.125544548034668, + "epoch": 1.4074550128534704, + "mean_token_accuracy": 0.81210857629776, + "num_tokens": 28081997.0, + "step": 14235, + "train/ce_loss": 0.6522428393363953 + }, + { + "epoch": 1.4074550128534704, + "step": 14235, + "train/sim_loss": 0.030881166458129883 + }, + { + "epoch": 1.4074550128534704, + "step": 14235, + "train/total_loss": 0.09610544890165329 + }, + { + "entropy": 10.09282398223877, + "epoch": 1.407553885702986, + "mean_token_accuracy": 0.9154929518699646, + "num_tokens": 28089172.0, + "step": 14236, + "train/ce_loss": 1.177521198769682e-06 + }, + { + "epoch": 1.407553885702986, + "step": 14236, + "train/sim_loss": 0.010081887245178223 + }, + { + "epoch": 1.407553885702986, + "step": 14236, + "train/total_loss": 0.010082004591822624 + }, + { + "entropy": 9.18012809753418, + "epoch": 1.4076527585525014, + "mean_token_accuracy": 0.8436657786369324, + "num_tokens": 28106418.0, + "step": 14237, + "train/ce_loss": 0.9275426864624023 + }, + { + "epoch": 1.4076527585525014, + "step": 14237, + "train/sim_loss": 0.05833864212036133 + }, + { + "epoch": 1.4076527585525014, + "step": 14237, + "train/total_loss": 0.15109291672706604 + }, + { + "entropy": 9.366405487060547, + "epoch": 1.407751631402017, + "mean_token_accuracy": 0.7922077775001526, + "num_tokens": 28119142.0, + "step": 14238, + "train/ce_loss": 0.3623669445514679 + }, + { + "epoch": 1.407751631402017, + "step": 14238, + "train/sim_loss": 0.03539818525314331 + }, + { + "epoch": 1.407751631402017, + "step": 14238, + "train/total_loss": 0.07163488119840622 + }, + { + "entropy": 9.12481689453125, + "epoch": 1.4078505042515326, + "mean_token_accuracy": 0.8475452065467834, + "num_tokens": 28132176.0, + "step": 14239, + "train/ce_loss": 0.36228370666503906 + }, + { + "epoch": 1.4078505042515326, + "step": 14239, + "train/sim_loss": 0.03432798385620117 + }, + { + "epoch": 1.4078505042515326, + "step": 14239, + "train/total_loss": 0.07055635750293732 + }, + { + "epoch": 1.407949377101048, + "grad_norm": 0.5617564916610718, + "learning_rate": 6.4819759679572765e-06, + "loss": 0.0817, + "step": 14240 + }, + { + "entropy": 9.343055725097656, + "epoch": 1.407949377101048, + "mean_token_accuracy": 0.8709288239479065, + "num_tokens": 28151507.0, + "step": 14240, + "train/ce_loss": 0.5070720911026001 + }, + { + "epoch": 1.407949377101048, + "step": 14240, + "train/sim_loss": 0.05181610584259033 + }, + { + "epoch": 1.407949377101048, + "step": 14240, + "train/total_loss": 0.1025233119726181 + }, + { + "entropy": 9.063824653625488, + "epoch": 1.4080482499505635, + "mean_token_accuracy": 0.8472727537155151, + "num_tokens": 28160606.0, + "step": 14241, + "train/ce_loss": 0.3983953595161438 + }, + { + "epoch": 1.4080482499505635, + "step": 14241, + "train/sim_loss": 0.09733200073242188 + }, + { + "epoch": 1.4080482499505635, + "step": 14241, + "train/total_loss": 0.13717153668403625 + }, + { + "entropy": 9.543461799621582, + "epoch": 1.408147122800079, + "mean_token_accuracy": 0.8490771055221558, + "num_tokens": 28171267.0, + "step": 14242, + "train/ce_loss": 0.4872809052467346 + }, + { + "epoch": 1.408147122800079, + "step": 14242, + "train/sim_loss": 0.05380427837371826 + }, + { + "epoch": 1.408147122800079, + "step": 14242, + "train/total_loss": 0.10253237187862396 + }, + { + "entropy": 9.555644989013672, + "epoch": 1.4082459956495947, + "mean_token_accuracy": 0.9418604373931885, + "num_tokens": 28185615.0, + "step": 14243, + "train/ce_loss": 0.4200593829154968 + }, + { + "epoch": 1.4082459956495947, + "step": 14243, + "train/sim_loss": 0.017520546913146973 + }, + { + "epoch": 1.4082459956495947, + "step": 14243, + "train/total_loss": 0.059526484459638596 + }, + { + "entropy": 9.342386245727539, + "epoch": 1.4083448684991102, + "mean_token_accuracy": 0.9300912022590637, + "num_tokens": 28199329.0, + "step": 14244, + "train/ce_loss": 0.235498309135437 + }, + { + "epoch": 1.4083448684991102, + "step": 14244, + "train/sim_loss": 0.0696333646774292 + }, + { + "epoch": 1.4083448684991102, + "step": 14244, + "train/total_loss": 0.09318319708108902 + }, + { + "entropy": 9.119025230407715, + "epoch": 1.4084437413486257, + "mean_token_accuracy": 0.8287037014961243, + "num_tokens": 28216188.0, + "step": 14245, + "train/ce_loss": 0.5650016665458679 + }, + { + "epoch": 1.4084437413486257, + "step": 14245, + "train/sim_loss": 0.045283496379852295 + }, + { + "epoch": 1.4084437413486257, + "step": 14245, + "train/total_loss": 0.10178366303443909 + }, + { + "entropy": 9.377260208129883, + "epoch": 1.4085426141981412, + "mean_token_accuracy": 0.8891786336898804, + "num_tokens": 28234770.0, + "step": 14246, + "train/ce_loss": 0.3827865719795227 + }, + { + "epoch": 1.4085426141981412, + "step": 14246, + "train/sim_loss": 0.025823354721069336 + }, + { + "epoch": 1.4085426141981412, + "step": 14246, + "train/total_loss": 0.06410200893878937 + }, + { + "entropy": 8.969637870788574, + "epoch": 1.4086414870476567, + "mean_token_accuracy": 0.828199028968811, + "num_tokens": 28249505.0, + "step": 14247, + "train/ce_loss": 0.5427882075309753 + }, + { + "epoch": 1.4086414870476567, + "step": 14247, + "train/sim_loss": 0.08265483379364014 + }, + { + "epoch": 1.4086414870476567, + "step": 14247, + "train/total_loss": 0.13693365454673767 + }, + { + "entropy": 10.083497047424316, + "epoch": 1.4087403598971722, + "mean_token_accuracy": 0.9729729890823364, + "num_tokens": 28263003.0, + "step": 14248, + "train/ce_loss": 9.321846050625027e-07 + }, + { + "epoch": 1.4087403598971722, + "step": 14248, + "train/sim_loss": 0.019632399082183838 + }, + { + "epoch": 1.4087403598971722, + "step": 14248, + "train/total_loss": 0.0196324922144413 + }, + { + "entropy": 9.503331184387207, + "epoch": 1.4088392327466877, + "mean_token_accuracy": 0.8320126533508301, + "num_tokens": 28273882.0, + "step": 14249, + "train/ce_loss": 0.47996336221694946 + }, + { + "epoch": 1.4088392327466877, + "step": 14249, + "train/sim_loss": 0.019311726093292236 + }, + { + "epoch": 1.4088392327466877, + "step": 14249, + "train/total_loss": 0.06730806827545166 + }, + { + "entropy": 8.651681900024414, + "epoch": 1.4089381055962034, + "mean_token_accuracy": 0.8427350521087646, + "num_tokens": 28286767.0, + "step": 14250, + "train/ce_loss": 0.17477910220623016 + }, + { + "epoch": 1.4089381055962034, + "step": 14250, + "train/sim_loss": 0.03268331289291382 + }, + { + "epoch": 1.4089381055962034, + "step": 14250, + "train/total_loss": 0.050161223858594894 + }, + { + "entropy": 9.496428489685059, + "epoch": 1.4090369784457188, + "mean_token_accuracy": 0.8898305296897888, + "num_tokens": 28294947.0, + "step": 14251, + "train/ce_loss": 0.3902758061885834 + }, + { + "epoch": 1.4090369784457188, + "step": 14251, + "train/sim_loss": 0.03953206539154053 + }, + { + "epoch": 1.4090369784457188, + "step": 14251, + "train/total_loss": 0.07855965197086334 + }, + { + "entropy": 9.821812629699707, + "epoch": 1.4091358512952343, + "mean_token_accuracy": 0.8875969052314758, + "num_tokens": 28308333.0, + "step": 14252, + "train/ce_loss": 6.422387741622515e-07 + }, + { + "epoch": 1.4091358512952343, + "step": 14252, + "train/sim_loss": 0.04664289951324463 + }, + { + "epoch": 1.4091358512952343, + "step": 14252, + "train/total_loss": 0.0466429628431797 + }, + { + "entropy": 9.235160827636719, + "epoch": 1.4092347241447498, + "mean_token_accuracy": 0.8611422181129456, + "num_tokens": 28320965.0, + "step": 14253, + "train/ce_loss": 0.3174266219139099 + }, + { + "epoch": 1.4092347241447498, + "step": 14253, + "train/sim_loss": 0.027759671211242676 + }, + { + "epoch": 1.4092347241447498, + "step": 14253, + "train/total_loss": 0.05950233340263367 + }, + { + "entropy": 9.309961318969727, + "epoch": 1.4093335969942653, + "mean_token_accuracy": 0.9182156324386597, + "num_tokens": 28333586.0, + "step": 14254, + "train/ce_loss": 0.401362806558609 + }, + { + "epoch": 1.4093335969942653, + "step": 14254, + "train/sim_loss": 0.03723883628845215 + }, + { + "epoch": 1.4093335969942653, + "step": 14254, + "train/total_loss": 0.07737511396408081 + }, + { + "entropy": 8.93638801574707, + "epoch": 1.409432469843781, + "mean_token_accuracy": 0.8336842060089111, + "num_tokens": 28344622.0, + "step": 14255, + "train/ce_loss": 0.5443292260169983 + }, + { + "epoch": 1.409432469843781, + "step": 14255, + "train/sim_loss": 0.09119164943695068 + }, + { + "epoch": 1.409432469843781, + "step": 14255, + "train/total_loss": 0.145624577999115 + }, + { + "entropy": 9.29684066772461, + "epoch": 1.4095313426932965, + "mean_token_accuracy": 0.8730366230010986, + "num_tokens": 28361412.0, + "step": 14256, + "train/ce_loss": 0.4822956621646881 + }, + { + "epoch": 1.4095313426932965, + "step": 14256, + "train/sim_loss": 0.03931081295013428 + }, + { + "epoch": 1.4095313426932965, + "step": 14256, + "train/total_loss": 0.08754038065671921 + }, + { + "entropy": 9.253896713256836, + "epoch": 1.409630215542812, + "mean_token_accuracy": 0.8912500143051147, + "num_tokens": 28368509.0, + "step": 14257, + "train/ce_loss": 2.7536816560314037e-07 + }, + { + "epoch": 1.409630215542812, + "step": 14257, + "train/sim_loss": 0.019469261169433594 + }, + { + "epoch": 1.409630215542812, + "step": 14257, + "train/total_loss": 0.019469289109110832 + }, + { + "entropy": 8.833481788635254, + "epoch": 1.4097290883923275, + "mean_token_accuracy": 0.8147763013839722, + "num_tokens": 28383196.0, + "step": 14258, + "train/ce_loss": 0.45867493748664856 + }, + { + "epoch": 1.4097290883923275, + "step": 14258, + "train/sim_loss": 0.03757011890411377 + }, + { + "epoch": 1.4097290883923275, + "step": 14258, + "train/total_loss": 0.08343761414289474 + }, + { + "entropy": 9.73717975616455, + "epoch": 1.409827961241843, + "mean_token_accuracy": 0.920634925365448, + "num_tokens": 28392652.0, + "step": 14259, + "train/ce_loss": 0.3221738636493683 + }, + { + "epoch": 1.409827961241843, + "step": 14259, + "train/sim_loss": 0.04229164123535156 + }, + { + "epoch": 1.409827961241843, + "step": 14259, + "train/total_loss": 0.07450902462005615 + }, + { + "epoch": 1.4099268340913584, + "grad_norm": 0.5055751800537109, + "learning_rate": 6.477031103199328e-06, + "loss": 0.0745, + "step": 14260 + }, + { + "entropy": 8.710456848144531, + "epoch": 1.4099268340913584, + "mean_token_accuracy": 0.8289225101470947, + "num_tokens": 28402596.0, + "step": 14260, + "train/ce_loss": 0.18225079774856567 + }, + { + "epoch": 1.4099268340913584, + "step": 14260, + "train/sim_loss": 0.03698331117630005 + }, + { + "epoch": 1.4099268340913584, + "step": 14260, + "train/total_loss": 0.055208392441272736 + }, + { + "entropy": 9.210020065307617, + "epoch": 1.410025706940874, + "mean_token_accuracy": 0.8707482814788818, + "num_tokens": 28410247.0, + "step": 14261, + "train/ce_loss": 1.3687636055692565e-06 + }, + { + "epoch": 1.410025706940874, + "step": 14261, + "train/sim_loss": 0.035164713859558105 + }, + { + "epoch": 1.410025706940874, + "step": 14261, + "train/total_loss": 0.03516485169529915 + }, + { + "entropy": 9.128801345825195, + "epoch": 1.4101245797903896, + "mean_token_accuracy": 0.8663967847824097, + "num_tokens": 28417711.0, + "step": 14262, + "train/ce_loss": 0.5580455660820007 + }, + { + "epoch": 1.4101245797903896, + "step": 14262, + "train/sim_loss": 0.042402684688568115 + }, + { + "epoch": 1.4101245797903896, + "step": 14262, + "train/total_loss": 0.09820724278688431 + }, + { + "entropy": 8.12077808380127, + "epoch": 1.4102234526399051, + "mean_token_accuracy": 0.8874239325523376, + "num_tokens": 28424219.0, + "step": 14263, + "train/ce_loss": 0.13642330467700958 + }, + { + "epoch": 1.4102234526399051, + "step": 14263, + "train/sim_loss": 0.013514518737792969 + }, + { + "epoch": 1.4102234526399051, + "step": 14263, + "train/total_loss": 0.027156848460435867 + }, + { + "entropy": 9.415255546569824, + "epoch": 1.4103223254894206, + "mean_token_accuracy": 0.9184890389442444, + "num_tokens": 28435719.0, + "step": 14264, + "train/ce_loss": 1.0067307130157133e-06 + }, + { + "epoch": 1.4103223254894206, + "step": 14264, + "train/sim_loss": 0.03249925374984741 + }, + { + "epoch": 1.4103223254894206, + "step": 14264, + "train/total_loss": 0.03249935433268547 + }, + { + "entropy": 9.570289611816406, + "epoch": 1.410421198338936, + "mean_token_accuracy": 0.8730606436729431, + "num_tokens": 28452314.0, + "step": 14265, + "train/ce_loss": 0.16737647354602814 + }, + { + "epoch": 1.410421198338936, + "step": 14265, + "train/sim_loss": 0.04563939571380615 + }, + { + "epoch": 1.410421198338936, + "step": 14265, + "train/total_loss": 0.062377043068408966 + }, + { + "entropy": 9.648322105407715, + "epoch": 1.4105200711884516, + "mean_token_accuracy": 0.8352059721946716, + "num_tokens": 28466814.0, + "step": 14266, + "train/ce_loss": 0.47914373874664307 + }, + { + "epoch": 1.4105200711884516, + "step": 14266, + "train/sim_loss": 0.07389557361602783 + }, + { + "epoch": 1.4105200711884516, + "step": 14266, + "train/total_loss": 0.1218099445104599 + }, + { + "entropy": 9.596237182617188, + "epoch": 1.4106189440379673, + "mean_token_accuracy": 0.824416995048523, + "num_tokens": 28475654.0, + "step": 14267, + "train/ce_loss": 0.11226487904787064 + }, + { + "epoch": 1.4106189440379673, + "step": 14267, + "train/sim_loss": 0.0785905122756958 + }, + { + "epoch": 1.4106189440379673, + "step": 14267, + "train/total_loss": 0.08981700241565704 + }, + { + "entropy": 9.189953804016113, + "epoch": 1.4107178168874828, + "mean_token_accuracy": 0.88376384973526, + "num_tokens": 28482674.0, + "step": 14268, + "train/ce_loss": 0.65351402759552 + }, + { + "epoch": 1.4107178168874828, + "step": 14268, + "train/sim_loss": 0.03409087657928467 + }, + { + "epoch": 1.4107178168874828, + "step": 14268, + "train/total_loss": 0.09944228082895279 + }, + { + "entropy": 9.43425178527832, + "epoch": 1.4108166897369983, + "mean_token_accuracy": 0.8715789318084717, + "num_tokens": 28490442.0, + "step": 14269, + "train/ce_loss": 0.34705492854118347 + }, + { + "epoch": 1.4108166897369983, + "step": 14269, + "train/sim_loss": 0.0286405086517334 + }, + { + "epoch": 1.4108166897369983, + "step": 14269, + "train/total_loss": 0.0633459985256195 + }, + { + "entropy": 9.140392303466797, + "epoch": 1.4109155625865137, + "mean_token_accuracy": 0.8202115297317505, + "num_tokens": 28501521.0, + "step": 14270, + "train/ce_loss": 0.6529157757759094 + }, + { + "epoch": 1.4109155625865137, + "step": 14270, + "train/sim_loss": 0.10659193992614746 + }, + { + "epoch": 1.4109155625865137, + "step": 14270, + "train/total_loss": 0.17188352346420288 + }, + { + "entropy": 9.096192359924316, + "epoch": 1.4110144354360292, + "mean_token_accuracy": 0.8313513398170471, + "num_tokens": 28517353.0, + "step": 14271, + "train/ce_loss": 0.7547451853752136 + }, + { + "epoch": 1.4110144354360292, + "step": 14271, + "train/sim_loss": 0.07989513874053955 + }, + { + "epoch": 1.4110144354360292, + "step": 14271, + "train/total_loss": 0.15536966919898987 + }, + { + "entropy": 9.156052589416504, + "epoch": 1.4111133082855447, + "mean_token_accuracy": 0.8849797248840332, + "num_tokens": 28524928.0, + "step": 14272, + "train/ce_loss": 0.458545058965683 + }, + { + "epoch": 1.4111133082855447, + "step": 14272, + "train/sim_loss": 0.030409395694732666 + }, + { + "epoch": 1.4111133082855447, + "step": 14272, + "train/total_loss": 0.0762639045715332 + }, + { + "entropy": 9.716094970703125, + "epoch": 1.4112121811350602, + "mean_token_accuracy": 0.8603491187095642, + "num_tokens": 28545406.0, + "step": 14273, + "train/ce_loss": 0.46057504415512085 + }, + { + "epoch": 1.4112121811350602, + "step": 14273, + "train/sim_loss": 0.07278978824615479 + }, + { + "epoch": 1.4112121811350602, + "step": 14273, + "train/total_loss": 0.11884729564189911 + }, + { + "entropy": 9.13350772857666, + "epoch": 1.411311053984576, + "mean_token_accuracy": 0.7783902883529663, + "num_tokens": 28554032.0, + "step": 14274, + "train/ce_loss": 0.8681187033653259 + }, + { + "epoch": 1.411311053984576, + "step": 14274, + "train/sim_loss": 0.03182709217071533 + }, + { + "epoch": 1.411311053984576, + "step": 14274, + "train/total_loss": 0.11863896250724792 + }, + { + "entropy": 9.140863418579102, + "epoch": 1.4114099268340914, + "mean_token_accuracy": 0.8331242203712463, + "num_tokens": 28566067.0, + "step": 14275, + "train/ce_loss": 0.4652175307273865 + }, + { + "epoch": 1.4114099268340914, + "step": 14275, + "train/sim_loss": 0.047193169593811035 + }, + { + "epoch": 1.4114099268340914, + "step": 14275, + "train/total_loss": 0.09371492266654968 + }, + { + "entropy": 9.41482162475586, + "epoch": 1.4115087996836069, + "mean_token_accuracy": 0.8374864459037781, + "num_tokens": 28584298.0, + "step": 14276, + "train/ce_loss": 0.39541590213775635 + }, + { + "epoch": 1.4115087996836069, + "step": 14276, + "train/sim_loss": 0.04432392120361328 + }, + { + "epoch": 1.4115087996836069, + "step": 14276, + "train/total_loss": 0.08386550843715668 + }, + { + "entropy": 9.31795883178711, + "epoch": 1.4116076725331224, + "mean_token_accuracy": 0.8668699264526367, + "num_tokens": 28598768.0, + "step": 14277, + "train/ce_loss": 0.6279720067977905 + }, + { + "epoch": 1.4116076725331224, + "step": 14277, + "train/sim_loss": 0.03175783157348633 + }, + { + "epoch": 1.4116076725331224, + "step": 14277, + "train/total_loss": 0.09455503523349762 + }, + { + "entropy": 9.275676727294922, + "epoch": 1.4117065453826378, + "mean_token_accuracy": 0.8776435256004333, + "num_tokens": 28607148.0, + "step": 14278, + "train/ce_loss": 0.5999038815498352 + }, + { + "epoch": 1.4117065453826378, + "step": 14278, + "train/sim_loss": 0.04885643720626831 + }, + { + "epoch": 1.4117065453826378, + "step": 14278, + "train/total_loss": 0.10884682834148407 + }, + { + "entropy": 9.17181396484375, + "epoch": 1.4118054182321536, + "mean_token_accuracy": 0.8955696225166321, + "num_tokens": 28627995.0, + "step": 14279, + "train/ce_loss": 0.27505791187286377 + }, + { + "epoch": 1.4118054182321536, + "step": 14279, + "train/sim_loss": 0.018035292625427246 + }, + { + "epoch": 1.4118054182321536, + "step": 14279, + "train/total_loss": 0.04554108530282974 + }, + { + "epoch": 1.411904291081669, + "grad_norm": 0.517031192779541, + "learning_rate": 6.472086238441379e-06, + "loss": 0.0804, + "step": 14280 + }, + { + "entropy": 9.223255157470703, + "epoch": 1.411904291081669, + "mean_token_accuracy": 0.8128872513771057, + "num_tokens": 28638582.0, + "step": 14280, + "train/ce_loss": 0.4330977201461792 + }, + { + "epoch": 1.411904291081669, + "step": 14280, + "train/sim_loss": 0.04688602685928345 + }, + { + "epoch": 1.411904291081669, + "step": 14280, + "train/total_loss": 0.09019580483436584 + }, + { + "entropy": 9.596845626831055, + "epoch": 1.4120031639311845, + "mean_token_accuracy": 0.8857142925262451, + "num_tokens": 28651317.0, + "step": 14281, + "train/ce_loss": 0.4993003010749817 + }, + { + "epoch": 1.4120031639311845, + "step": 14281, + "train/sim_loss": 0.03654450178146362 + }, + { + "epoch": 1.4120031639311845, + "step": 14281, + "train/total_loss": 0.08647453784942627 + }, + { + "entropy": 9.055822372436523, + "epoch": 1.4121020367807, + "mean_token_accuracy": 0.8208954930305481, + "num_tokens": 28662178.0, + "step": 14282, + "train/ce_loss": 8.867312430993479e-07 + }, + { + "epoch": 1.4121020367807, + "step": 14282, + "train/sim_loss": 0.0308840274810791 + }, + { + "epoch": 1.4121020367807, + "step": 14282, + "train/total_loss": 0.030884116888046265 + }, + { + "entropy": 9.328622817993164, + "epoch": 1.4122009096302155, + "mean_token_accuracy": 0.8149779438972473, + "num_tokens": 28679618.0, + "step": 14283, + "train/ce_loss": 0.5404912233352661 + }, + { + "epoch": 1.4122009096302155, + "step": 14283, + "train/sim_loss": 0.020751595497131348 + }, + { + "epoch": 1.4122009096302155, + "step": 14283, + "train/total_loss": 0.07480071485042572 + }, + { + "entropy": 9.726393699645996, + "epoch": 1.4122997824797312, + "mean_token_accuracy": 0.8688085675239563, + "num_tokens": 28700811.0, + "step": 14284, + "train/ce_loss": 0.38007354736328125 + }, + { + "epoch": 1.4122997824797312, + "step": 14284, + "train/sim_loss": 0.017499804496765137 + }, + { + "epoch": 1.4122997824797312, + "step": 14284, + "train/total_loss": 0.05550716072320938 + }, + { + "entropy": 9.516536712646484, + "epoch": 1.4123986553292465, + "mean_token_accuracy": 0.8271334767341614, + "num_tokens": 28718786.0, + "step": 14285, + "train/ce_loss": 0.8557976484298706 + }, + { + "epoch": 1.4123986553292465, + "step": 14285, + "train/sim_loss": 0.057289183139801025 + }, + { + "epoch": 1.4123986553292465, + "step": 14285, + "train/total_loss": 0.14286895096302032 + }, + { + "entropy": 9.103117942810059, + "epoch": 1.4124975281787622, + "mean_token_accuracy": 0.8545454740524292, + "num_tokens": 28729382.0, + "step": 14286, + "train/ce_loss": 0.6927191615104675 + }, + { + "epoch": 1.4124975281787622, + "step": 14286, + "train/sim_loss": 0.050718843936920166 + }, + { + "epoch": 1.4124975281787622, + "step": 14286, + "train/total_loss": 0.1199907585978508 + }, + { + "entropy": 9.97776985168457, + "epoch": 1.4125964010282777, + "mean_token_accuracy": 0.8825136423110962, + "num_tokens": 28741041.0, + "step": 14287, + "train/ce_loss": 8.676660741002706e-07 + }, + { + "epoch": 1.4125964010282777, + "step": 14287, + "train/sim_loss": 0.014161109924316406 + }, + { + "epoch": 1.4125964010282777, + "step": 14287, + "train/total_loss": 0.014161196537315845 + }, + { + "entropy": 9.285501480102539, + "epoch": 1.4126952738777931, + "mean_token_accuracy": 0.8936170339584351, + "num_tokens": 28758917.0, + "step": 14288, + "train/ce_loss": 1.5598338904965203e-06 + }, + { + "epoch": 1.4126952738777931, + "step": 14288, + "train/sim_loss": 0.021946072578430176 + }, + { + "epoch": 1.4126952738777931, + "step": 14288, + "train/total_loss": 0.02194622904062271 + }, + { + "entropy": 8.856098175048828, + "epoch": 1.4127941467273086, + "mean_token_accuracy": 0.8102189898490906, + "num_tokens": 28766207.0, + "step": 14289, + "train/ce_loss": 0.3363381028175354 + }, + { + "epoch": 1.4127941467273086, + "step": 14289, + "train/sim_loss": 0.035353660583496094 + }, + { + "epoch": 1.4127941467273086, + "step": 14289, + "train/total_loss": 0.06898747384548187 + }, + { + "entropy": 9.488446235656738, + "epoch": 1.4128930195768241, + "mean_token_accuracy": 0.8355437517166138, + "num_tokens": 28779899.0, + "step": 14290, + "train/ce_loss": 0.44532278180122375 + }, + { + "epoch": 1.4128930195768241, + "step": 14290, + "train/sim_loss": 0.06085175275802612 + }, + { + "epoch": 1.4128930195768241, + "step": 14290, + "train/total_loss": 0.10538403689861298 + }, + { + "entropy": 9.31629753112793, + "epoch": 1.4129918924263398, + "mean_token_accuracy": 0.8553615808486938, + "num_tokens": 28790809.0, + "step": 14291, + "train/ce_loss": 0.6101003885269165 + }, + { + "epoch": 1.4129918924263398, + "step": 14291, + "train/sim_loss": 0.032988667488098145 + }, + { + "epoch": 1.4129918924263398, + "step": 14291, + "train/total_loss": 0.09399870783090591 + }, + { + "entropy": 9.377607345581055, + "epoch": 1.4130907652758553, + "mean_token_accuracy": 0.8647342920303345, + "num_tokens": 28805771.0, + "step": 14292, + "train/ce_loss": 0.4583742618560791 + }, + { + "epoch": 1.4130907652758553, + "step": 14292, + "train/sim_loss": 0.057679951190948486 + }, + { + "epoch": 1.4130907652758553, + "step": 14292, + "train/total_loss": 0.10351738333702087 + }, + { + "entropy": 9.306011199951172, + "epoch": 1.4131896381253708, + "mean_token_accuracy": 0.8221942186355591, + "num_tokens": 28820265.0, + "step": 14293, + "train/ce_loss": 0.5946489572525024 + }, + { + "epoch": 1.4131896381253708, + "step": 14293, + "train/sim_loss": 0.051208555698394775 + }, + { + "epoch": 1.4131896381253708, + "step": 14293, + "train/total_loss": 0.1106734573841095 + }, + { + "entropy": 9.333880424499512, + "epoch": 1.4132885109748863, + "mean_token_accuracy": 0.7960250973701477, + "num_tokens": 28836040.0, + "step": 14294, + "train/ce_loss": 0.604929506778717 + }, + { + "epoch": 1.4132885109748863, + "step": 14294, + "train/sim_loss": 0.037457942962646484 + }, + { + "epoch": 1.4132885109748863, + "step": 14294, + "train/total_loss": 0.09795089066028595 + }, + { + "entropy": 8.980154037475586, + "epoch": 1.4133873838244018, + "mean_token_accuracy": 0.8589164614677429, + "num_tokens": 28846009.0, + "step": 14295, + "train/ce_loss": 0.3581741154193878 + }, + { + "epoch": 1.4133873838244018, + "step": 14295, + "train/sim_loss": 0.017476916313171387 + }, + { + "epoch": 1.4133873838244018, + "step": 14295, + "train/total_loss": 0.05329432711005211 + }, + { + "entropy": 9.559971809387207, + "epoch": 1.4134862566739175, + "mean_token_accuracy": 0.910815954208374, + "num_tokens": 28854784.0, + "step": 14296, + "train/ce_loss": 0.496171236038208 + }, + { + "epoch": 1.4134862566739175, + "step": 14296, + "train/sim_loss": 0.018696188926696777 + }, + { + "epoch": 1.4134862566739175, + "step": 14296, + "train/total_loss": 0.06831331551074982 + }, + { + "entropy": 9.700864791870117, + "epoch": 1.4135851295234327, + "mean_token_accuracy": 0.8694214820861816, + "num_tokens": 28867840.0, + "step": 14297, + "train/ce_loss": 8.180722375072946e-07 + }, + { + "epoch": 1.4135851295234327, + "step": 14297, + "train/sim_loss": 0.014699101448059082 + }, + { + "epoch": 1.4135851295234327, + "step": 14297, + "train/total_loss": 0.014699183404445648 + }, + { + "entropy": 9.285881996154785, + "epoch": 1.4136840023729484, + "mean_token_accuracy": 0.8467742204666138, + "num_tokens": 28885038.0, + "step": 14298, + "train/ce_loss": 0.23360396921634674 + }, + { + "epoch": 1.4136840023729484, + "step": 14298, + "train/sim_loss": 0.04032754898071289 + }, + { + "epoch": 1.4136840023729484, + "step": 14298, + "train/total_loss": 0.06368795037269592 + }, + { + "entropy": 9.216838836669922, + "epoch": 1.413782875222464, + "mean_token_accuracy": 0.8488612771034241, + "num_tokens": 28896809.0, + "step": 14299, + "train/ce_loss": 6.01218289375538e-07 + }, + { + "epoch": 1.413782875222464, + "step": 14299, + "train/sim_loss": 0.022694885730743408 + }, + { + "epoch": 1.413782875222464, + "step": 14299, + "train/total_loss": 0.022694945335388184 + }, + { + "epoch": 1.4138817480719794, + "grad_norm": 0.6675313711166382, + "learning_rate": 6.46714137368343e-06, + "loss": 0.0835, + "step": 14300 + }, + { + "entropy": 9.640838623046875, + "epoch": 1.4138817480719794, + "mean_token_accuracy": 0.8033472895622253, + "num_tokens": 28909314.0, + "step": 14300, + "train/ce_loss": 0.37484362721443176 + }, + { + "epoch": 1.4138817480719794, + "step": 14300, + "train/sim_loss": 0.05844581127166748 + }, + { + "epoch": 1.4138817480719794, + "step": 14300, + "train/total_loss": 0.09593017399311066 + }, + { + "entropy": 9.192411422729492, + "epoch": 1.413980620921495, + "mean_token_accuracy": 0.7638376355171204, + "num_tokens": 28920832.0, + "step": 14301, + "train/ce_loss": 0.8278588056564331 + }, + { + "epoch": 1.413980620921495, + "step": 14301, + "train/sim_loss": 0.014213919639587402 + }, + { + "epoch": 1.413980620921495, + "step": 14301, + "train/total_loss": 0.09699980169534683 + }, + { + "entropy": 9.104510307312012, + "epoch": 1.4140794937710104, + "mean_token_accuracy": 0.8393839597702026, + "num_tokens": 28930889.0, + "step": 14302, + "train/ce_loss": 0.5203571319580078 + }, + { + "epoch": 1.4140794937710104, + "step": 14302, + "train/sim_loss": 0.04231095314025879 + }, + { + "epoch": 1.4140794937710104, + "step": 14302, + "train/total_loss": 0.09434667229652405 + }, + { + "entropy": 9.468367576599121, + "epoch": 1.414178366620526, + "mean_token_accuracy": 0.8836265206336975, + "num_tokens": 28944447.0, + "step": 14303, + "train/ce_loss": 8.910316182664246e-07 + }, + { + "epoch": 1.414178366620526, + "step": 14303, + "train/sim_loss": 0.03209865093231201 + }, + { + "epoch": 1.414178366620526, + "step": 14303, + "train/total_loss": 0.032098740339279175 + }, + { + "entropy": 9.087950706481934, + "epoch": 1.4142772394700416, + "mean_token_accuracy": 0.8438502550125122, + "num_tokens": 28953431.0, + "step": 14304, + "train/ce_loss": 0.37175410985946655 + }, + { + "epoch": 1.4142772394700416, + "step": 14304, + "train/sim_loss": 0.0621798038482666 + }, + { + "epoch": 1.4142772394700416, + "step": 14304, + "train/total_loss": 0.09935522079467773 + }, + { + "entropy": 9.006723403930664, + "epoch": 1.414376112319557, + "mean_token_accuracy": 0.8571428656578064, + "num_tokens": 28965638.0, + "step": 14305, + "train/ce_loss": 0.13941629230976105 + }, + { + "epoch": 1.414376112319557, + "step": 14305, + "train/sim_loss": 0.05039459466934204 + }, + { + "epoch": 1.414376112319557, + "step": 14305, + "train/total_loss": 0.06433622539043427 + }, + { + "entropy": 9.406856536865234, + "epoch": 1.4144749851690726, + "mean_token_accuracy": 0.9047619104385376, + "num_tokens": 28977728.0, + "step": 14306, + "train/ce_loss": 0.5054859519004822 + }, + { + "epoch": 1.4144749851690726, + "step": 14306, + "train/sim_loss": 0.08093035221099854 + }, + { + "epoch": 1.4144749851690726, + "step": 14306, + "train/total_loss": 0.131478950381279 + }, + { + "entropy": 9.479146957397461, + "epoch": 1.414573858018588, + "mean_token_accuracy": 0.8449040055274963, + "num_tokens": 28986728.0, + "step": 14307, + "train/ce_loss": 0.35902848839759827 + }, + { + "epoch": 1.414573858018588, + "step": 14307, + "train/sim_loss": 0.13106799125671387 + }, + { + "epoch": 1.414573858018588, + "step": 14307, + "train/total_loss": 0.1669708490371704 + }, + { + "entropy": 9.628703117370605, + "epoch": 1.4146727308681037, + "mean_token_accuracy": 0.8677685856819153, + "num_tokens": 29001097.0, + "step": 14308, + "train/ce_loss": 0.8269357085227966 + }, + { + "epoch": 1.4146727308681037, + "step": 14308, + "train/sim_loss": 0.06558150053024292 + }, + { + "epoch": 1.4146727308681037, + "step": 14308, + "train/total_loss": 0.14827507734298706 + }, + { + "entropy": 9.27635383605957, + "epoch": 1.414771603717619, + "mean_token_accuracy": 0.8622668385505676, + "num_tokens": 29018387.0, + "step": 14309, + "train/ce_loss": 0.42281976342201233 + }, + { + "epoch": 1.414771603717619, + "step": 14309, + "train/sim_loss": 0.015891432762145996 + }, + { + "epoch": 1.414771603717619, + "step": 14309, + "train/total_loss": 0.05817341059446335 + }, + { + "entropy": 9.577102661132812, + "epoch": 1.4148704765671347, + "mean_token_accuracy": 0.8989726305007935, + "num_tokens": 29030400.0, + "step": 14310, + "train/ce_loss": 1.0042596159109962e-06 + }, + { + "epoch": 1.4148704765671347, + "step": 14310, + "train/sim_loss": 0.04438197612762451 + }, + { + "epoch": 1.4148704765671347, + "step": 14310, + "train/total_loss": 0.04438207671046257 + }, + { + "entropy": 9.532858848571777, + "epoch": 1.4149693494166502, + "mean_token_accuracy": 0.8326612710952759, + "num_tokens": 29048749.0, + "step": 14311, + "train/ce_loss": 0.49574846029281616 + }, + { + "epoch": 1.4149693494166502, + "step": 14311, + "train/sim_loss": 0.032227396965026855 + }, + { + "epoch": 1.4149693494166502, + "step": 14311, + "train/total_loss": 0.08180224895477295 + }, + { + "entropy": 9.421438217163086, + "epoch": 1.4150682222661657, + "mean_token_accuracy": 0.8702185750007629, + "num_tokens": 29060182.0, + "step": 14312, + "train/ce_loss": 0.4274728000164032 + }, + { + "epoch": 1.4150682222661657, + "step": 14312, + "train/sim_loss": 0.01780003309249878 + }, + { + "epoch": 1.4150682222661657, + "step": 14312, + "train/total_loss": 0.06054731458425522 + }, + { + "entropy": 9.84732437133789, + "epoch": 1.4151670951156812, + "mean_token_accuracy": 0.8164251446723938, + "num_tokens": 29077865.0, + "step": 14313, + "train/ce_loss": 7.801066317369987e-07 + }, + { + "epoch": 1.4151670951156812, + "step": 14313, + "train/sim_loss": 0.0456312894821167 + }, + { + "epoch": 1.4151670951156812, + "step": 14313, + "train/total_loss": 0.04563136771321297 + }, + { + "entropy": 8.99839973449707, + "epoch": 1.4152659679651967, + "mean_token_accuracy": 0.8489289879798889, + "num_tokens": 29094372.0, + "step": 14314, + "train/ce_loss": 0.4414396286010742 + }, + { + "epoch": 1.4152659679651967, + "step": 14314, + "train/sim_loss": 0.05934947729110718 + }, + { + "epoch": 1.4152659679651967, + "step": 14314, + "train/total_loss": 0.10349343717098236 + }, + { + "entropy": 9.485929489135742, + "epoch": 1.4153648408147124, + "mean_token_accuracy": 0.8508474826812744, + "num_tokens": 29107617.0, + "step": 14315, + "train/ce_loss": 1.6421444115621853e-06 + }, + { + "epoch": 1.4153648408147124, + "step": 14315, + "train/sim_loss": 0.017722606658935547 + }, + { + "epoch": 1.4153648408147124, + "step": 14315, + "train/total_loss": 0.01772277057170868 + }, + { + "entropy": 10.036153793334961, + "epoch": 1.4154637136642279, + "mean_token_accuracy": 0.9288256168365479, + "num_tokens": 29113266.0, + "step": 14316, + "train/ce_loss": 0.39166226983070374 + }, + { + "epoch": 1.4154637136642279, + "step": 14316, + "train/sim_loss": 0.06637680530548096 + }, + { + "epoch": 1.4154637136642279, + "step": 14316, + "train/total_loss": 0.10554303228855133 + }, + { + "entropy": 8.943845748901367, + "epoch": 1.4155625865137433, + "mean_token_accuracy": 0.8481613397598267, + "num_tokens": 29124539.0, + "step": 14317, + "train/ce_loss": 0.643854022026062 + }, + { + "epoch": 1.4155625865137433, + "step": 14317, + "train/sim_loss": 0.02211129665374756 + }, + { + "epoch": 1.4155625865137433, + "step": 14317, + "train/total_loss": 0.08649670332670212 + }, + { + "entropy": 9.597870826721191, + "epoch": 1.4156614593632588, + "mean_token_accuracy": 0.9213051795959473, + "num_tokens": 29138421.0, + "step": 14318, + "train/ce_loss": 0.40760383009910583 + }, + { + "epoch": 1.4156614593632588, + "step": 14318, + "train/sim_loss": 0.06333237886428833 + }, + { + "epoch": 1.4156614593632588, + "step": 14318, + "train/total_loss": 0.10409276187419891 + }, + { + "entropy": 9.381855010986328, + "epoch": 1.4157603322127743, + "mean_token_accuracy": 0.7993079423904419, + "num_tokens": 29152171.0, + "step": 14319, + "train/ce_loss": 0.4544372260570526 + }, + { + "epoch": 1.4157603322127743, + "step": 14319, + "train/sim_loss": 0.019096195697784424 + }, + { + "epoch": 1.4157603322127743, + "step": 14319, + "train/total_loss": 0.06453992426395416 + }, + { + "epoch": 1.41585920506229, + "grad_norm": 0.593795895576477, + "learning_rate": 6.462196508925481e-06, + "loss": 0.0871, + "step": 14320 + }, + { + "entropy": 9.703853607177734, + "epoch": 1.41585920506229, + "mean_token_accuracy": 0.8922829627990723, + "num_tokens": 29160081.0, + "step": 14320, + "train/ce_loss": 0.5413927435874939 + }, + { + "epoch": 1.41585920506229, + "step": 14320, + "train/sim_loss": 0.03852510452270508 + }, + { + "epoch": 1.41585920506229, + "step": 14320, + "train/total_loss": 0.09266437590122223 + }, + { + "entropy": 9.696954727172852, + "epoch": 1.4159580779118053, + "mean_token_accuracy": 0.8390313386917114, + "num_tokens": 29176906.0, + "step": 14321, + "train/ce_loss": 0.702381432056427 + }, + { + "epoch": 1.4159580779118053, + "step": 14321, + "train/sim_loss": 0.0656505823135376 + }, + { + "epoch": 1.4159580779118053, + "step": 14321, + "train/total_loss": 0.1358887255191803 + }, + { + "entropy": 9.123464584350586, + "epoch": 1.416056950761321, + "mean_token_accuracy": 0.8200691938400269, + "num_tokens": 29187456.0, + "step": 14322, + "train/ce_loss": 0.5135560631752014 + }, + { + "epoch": 1.416056950761321, + "step": 14322, + "train/sim_loss": 0.13683509826660156 + }, + { + "epoch": 1.416056950761321, + "step": 14322, + "train/total_loss": 0.18819069862365723 + }, + { + "entropy": 9.217803955078125, + "epoch": 1.4161558236108365, + "mean_token_accuracy": 0.8338338136672974, + "num_tokens": 29201847.0, + "step": 14323, + "train/ce_loss": 0.2656051218509674 + }, + { + "epoch": 1.4161558236108365, + "step": 14323, + "train/sim_loss": 0.031579017639160156 + }, + { + "epoch": 1.4161558236108365, + "step": 14323, + "train/total_loss": 0.058139532804489136 + }, + { + "entropy": 9.801025390625, + "epoch": 1.416254696460352, + "mean_token_accuracy": 0.8397058844566345, + "num_tokens": 29211780.0, + "step": 14324, + "train/ce_loss": 0.33081838488578796 + }, + { + "epoch": 1.416254696460352, + "step": 14324, + "train/sim_loss": 0.07702195644378662 + }, + { + "epoch": 1.416254696460352, + "step": 14324, + "train/total_loss": 0.1101038008928299 + }, + { + "entropy": 9.307328224182129, + "epoch": 1.4163535693098674, + "mean_token_accuracy": 0.7973421812057495, + "num_tokens": 29224810.0, + "step": 14325, + "train/ce_loss": 0.655440092086792 + }, + { + "epoch": 1.4163535693098674, + "step": 14325, + "train/sim_loss": 0.0324629545211792 + }, + { + "epoch": 1.4163535693098674, + "step": 14325, + "train/total_loss": 0.0980069637298584 + }, + { + "entropy": 9.242330551147461, + "epoch": 1.416452442159383, + "mean_token_accuracy": 0.8586956262588501, + "num_tokens": 29232720.0, + "step": 14326, + "train/ce_loss": 0.9206317663192749 + }, + { + "epoch": 1.416452442159383, + "step": 14326, + "train/sim_loss": 0.07910430431365967 + }, + { + "epoch": 1.416452442159383, + "step": 14326, + "train/total_loss": 0.1711674928665161 + }, + { + "entropy": 9.347898483276367, + "epoch": 1.4165513150088986, + "mean_token_accuracy": 0.856589138507843, + "num_tokens": 29244185.0, + "step": 14327, + "train/ce_loss": 0.39923664927482605 + }, + { + "epoch": 1.4165513150088986, + "step": 14327, + "train/sim_loss": 0.013996779918670654 + }, + { + "epoch": 1.4165513150088986, + "step": 14327, + "train/total_loss": 0.0539204441010952 + }, + { + "entropy": 9.03996753692627, + "epoch": 1.4166501878584141, + "mean_token_accuracy": 0.8084449172019958, + "num_tokens": 29253020.0, + "step": 14328, + "train/ce_loss": 0.6437071561813354 + }, + { + "epoch": 1.4166501878584141, + "step": 14328, + "train/sim_loss": 0.07457566261291504 + }, + { + "epoch": 1.4166501878584141, + "step": 14328, + "train/total_loss": 0.13894638419151306 + }, + { + "entropy": 9.478622436523438, + "epoch": 1.4167490607079296, + "mean_token_accuracy": 0.8122392296791077, + "num_tokens": 29263433.0, + "step": 14329, + "train/ce_loss": 0.320237934589386 + }, + { + "epoch": 1.4167490607079296, + "step": 14329, + "train/sim_loss": 0.044490933418273926 + }, + { + "epoch": 1.4167490607079296, + "step": 14329, + "train/total_loss": 0.07651472836732864 + }, + { + "entropy": 9.800357818603516, + "epoch": 1.416847933557445, + "mean_token_accuracy": 0.7768292427062988, + "num_tokens": 29273139.0, + "step": 14330, + "train/ce_loss": 0.9562351107597351 + }, + { + "epoch": 1.416847933557445, + "step": 14330, + "train/sim_loss": 0.04813742637634277 + }, + { + "epoch": 1.416847933557445, + "step": 14330, + "train/total_loss": 0.14376094937324524 + }, + { + "entropy": 9.516857147216797, + "epoch": 1.4169468064069606, + "mean_token_accuracy": 0.8906009197235107, + "num_tokens": 29284574.0, + "step": 14331, + "train/ce_loss": 7.501217282879225e-07 + }, + { + "epoch": 1.4169468064069606, + "step": 14331, + "train/sim_loss": 0.019259154796600342 + }, + { + "epoch": 1.4169468064069606, + "step": 14331, + "train/total_loss": 0.01925922930240631 + }, + { + "entropy": 9.406002044677734, + "epoch": 1.4170456792564763, + "mean_token_accuracy": 0.8775811195373535, + "num_tokens": 29296901.0, + "step": 14332, + "train/ce_loss": 1.4212295127435937e-06 + }, + { + "epoch": 1.4170456792564763, + "step": 14332, + "train/sim_loss": 0.033233642578125 + }, + { + "epoch": 1.4170456792564763, + "step": 14332, + "train/total_loss": 0.03323378413915634 + }, + { + "entropy": 10.16868782043457, + "epoch": 1.4171445521059918, + "mean_token_accuracy": 0.894505500793457, + "num_tokens": 29310915.0, + "step": 14333, + "train/ce_loss": 1.680878312981804e-06 + }, + { + "epoch": 1.4171445521059918, + "step": 14333, + "train/sim_loss": 0.02950870990753174 + }, + { + "epoch": 1.4171445521059918, + "step": 14333, + "train/total_loss": 0.02950887754559517 + }, + { + "entropy": 9.45411491394043, + "epoch": 1.4172434249555073, + "mean_token_accuracy": 0.8994082808494568, + "num_tokens": 29323805.0, + "step": 14334, + "train/ce_loss": 0.24472352862358093 + }, + { + "epoch": 1.4172434249555073, + "step": 14334, + "train/sim_loss": 0.014919638633728027 + }, + { + "epoch": 1.4172434249555073, + "step": 14334, + "train/total_loss": 0.03939199447631836 + }, + { + "entropy": 9.095417022705078, + "epoch": 1.4173422978050227, + "mean_token_accuracy": 0.8349875807762146, + "num_tokens": 29332834.0, + "step": 14335, + "train/ce_loss": 0.5458517074584961 + }, + { + "epoch": 1.4173422978050227, + "step": 14335, + "train/sim_loss": 0.05282527208328247 + }, + { + "epoch": 1.4173422978050227, + "step": 14335, + "train/total_loss": 0.10741044580936432 + }, + { + "entropy": 9.834983825683594, + "epoch": 1.4174411706545382, + "mean_token_accuracy": 0.853741466999054, + "num_tokens": 29341131.0, + "step": 14336, + "train/ce_loss": 0.7194315195083618 + }, + { + "epoch": 1.4174411706545382, + "step": 14336, + "train/sim_loss": 0.029172182083129883 + }, + { + "epoch": 1.4174411706545382, + "step": 14336, + "train/total_loss": 0.10111533850431442 + }, + { + "entropy": 9.063791275024414, + "epoch": 1.4175400435040537, + "mean_token_accuracy": 0.8389639854431152, + "num_tokens": 29353862.0, + "step": 14337, + "train/ce_loss": 0.43461519479751587 + }, + { + "epoch": 1.4175400435040537, + "step": 14337, + "train/sim_loss": 0.019719958305358887 + }, + { + "epoch": 1.4175400435040537, + "step": 14337, + "train/total_loss": 0.06318147480487823 + }, + { + "entropy": 9.43075942993164, + "epoch": 1.4176389163535692, + "mean_token_accuracy": 0.9389534592628479, + "num_tokens": 29369443.0, + "step": 14338, + "train/ce_loss": 0.20051725208759308 + }, + { + "epoch": 1.4176389163535692, + "step": 14338, + "train/sim_loss": 0.018999338150024414 + }, + { + "epoch": 1.4176389163535692, + "step": 14338, + "train/total_loss": 0.03905106335878372 + }, + { + "entropy": 9.794269561767578, + "epoch": 1.417737789203085, + "mean_token_accuracy": 0.8487972617149353, + "num_tokens": 29384050.0, + "step": 14339, + "train/ce_loss": 0.35354843735694885 + }, + { + "epoch": 1.417737789203085, + "step": 14339, + "train/sim_loss": 0.03925669193267822 + }, + { + "epoch": 1.417737789203085, + "step": 14339, + "train/total_loss": 0.07461153715848923 + }, + { + "epoch": 1.4178366620526004, + "grad_norm": 0.5939170718193054, + "learning_rate": 6.4572516441675324e-06, + "loss": 0.0867, + "step": 14340 + }, + { + "entropy": 9.195112228393555, + "epoch": 1.4178366620526004, + "mean_token_accuracy": 0.8729559779167175, + "num_tokens": 29394891.0, + "step": 14340, + "train/ce_loss": 0.3366169333457947 + }, + { + "epoch": 1.4178366620526004, + "step": 14340, + "train/sim_loss": 0.04239410161972046 + }, + { + "epoch": 1.4178366620526004, + "step": 14340, + "train/total_loss": 0.07605579495429993 + }, + { + "entropy": 9.420816421508789, + "epoch": 1.4179355349021159, + "mean_token_accuracy": 0.8512499928474426, + "num_tokens": 29408232.0, + "step": 14341, + "train/ce_loss": 0.4940626621246338 + }, + { + "epoch": 1.4179355349021159, + "step": 14341, + "train/sim_loss": 0.021417737007141113 + }, + { + "epoch": 1.4179355349021159, + "step": 14341, + "train/total_loss": 0.07082400470972061 + }, + { + "entropy": 10.340827941894531, + "epoch": 1.4180344077516314, + "mean_token_accuracy": 0.8708708882331848, + "num_tokens": 29415309.0, + "step": 14342, + "train/ce_loss": 0.7792012691497803 + }, + { + "epoch": 1.4180344077516314, + "step": 14342, + "train/sim_loss": 0.04382526874542236 + }, + { + "epoch": 1.4180344077516314, + "step": 14342, + "train/total_loss": 0.12174540013074875 + }, + { + "entropy": 9.63072681427002, + "epoch": 1.4181332806011469, + "mean_token_accuracy": 0.9232673048973083, + "num_tokens": 29428903.0, + "step": 14343, + "train/ce_loss": 0.5524744987487793 + }, + { + "epoch": 1.4181332806011469, + "step": 14343, + "train/sim_loss": 0.06532025337219238 + }, + { + "epoch": 1.4181332806011469, + "step": 14343, + "train/total_loss": 0.12056770920753479 + }, + { + "entropy": 9.6685791015625, + "epoch": 1.4182321534506626, + "mean_token_accuracy": 0.834502100944519, + "num_tokens": 29444801.0, + "step": 14344, + "train/ce_loss": 0.7362594604492188 + }, + { + "epoch": 1.4182321534506626, + "step": 14344, + "train/sim_loss": 0.017388880252838135 + }, + { + "epoch": 1.4182321534506626, + "step": 14344, + "train/total_loss": 0.09101482480764389 + }, + { + "entropy": 9.661643981933594, + "epoch": 1.418331026300178, + "mean_token_accuracy": 0.8557376861572266, + "num_tokens": 29451245.0, + "step": 14345, + "train/ce_loss": 0.7282818555831909 + }, + { + "epoch": 1.418331026300178, + "step": 14345, + "train/sim_loss": 0.11780285835266113 + }, + { + "epoch": 1.418331026300178, + "step": 14345, + "train/total_loss": 0.19063104689121246 + }, + { + "entropy": 9.722611427307129, + "epoch": 1.4184298991496935, + "mean_token_accuracy": 0.86328125, + "num_tokens": 29463233.0, + "step": 14346, + "train/ce_loss": 0.4933731257915497 + }, + { + "epoch": 1.4184298991496935, + "step": 14346, + "train/sim_loss": 0.09750354290008545 + }, + { + "epoch": 1.4184298991496935, + "step": 14346, + "train/total_loss": 0.14684085547924042 + }, + { + "entropy": 9.285707473754883, + "epoch": 1.418528771999209, + "mean_token_accuracy": 0.8665879368782043, + "num_tokens": 29471820.0, + "step": 14347, + "train/ce_loss": 0.5318387746810913 + }, + { + "epoch": 1.418528771999209, + "step": 14347, + "train/sim_loss": 0.02042531967163086 + }, + { + "epoch": 1.418528771999209, + "step": 14347, + "train/total_loss": 0.07360920310020447 + }, + { + "entropy": 9.034488677978516, + "epoch": 1.4186276448487245, + "mean_token_accuracy": 0.9194444417953491, + "num_tokens": 29477783.0, + "step": 14348, + "train/ce_loss": 0.362305611371994 + }, + { + "epoch": 1.4186276448487245, + "step": 14348, + "train/sim_loss": 0.02704983949661255 + }, + { + "epoch": 1.4186276448487245, + "step": 14348, + "train/total_loss": 0.06328040361404419 + }, + { + "entropy": 9.195104598999023, + "epoch": 1.41872651769824, + "mean_token_accuracy": 0.8637531995773315, + "num_tokens": 29491939.0, + "step": 14349, + "train/ce_loss": 0.29280295968055725 + }, + { + "epoch": 1.41872651769824, + "step": 14349, + "train/sim_loss": 0.019787251949310303 + }, + { + "epoch": 1.41872651769824, + "step": 14349, + "train/total_loss": 0.04906754940748215 + }, + { + "entropy": 9.310347557067871, + "epoch": 1.4188253905477555, + "mean_token_accuracy": 0.7915717363357544, + "num_tokens": 29501282.0, + "step": 14350, + "train/ce_loss": 0.8332515358924866 + }, + { + "epoch": 1.4188253905477555, + "step": 14350, + "train/sim_loss": 0.06635516881942749 + }, + { + "epoch": 1.4188253905477555, + "step": 14350, + "train/total_loss": 0.14968031644821167 + }, + { + "entropy": 9.270563125610352, + "epoch": 1.4189242633972712, + "mean_token_accuracy": 0.8293691873550415, + "num_tokens": 29517299.0, + "step": 14351, + "train/ce_loss": 0.16685332357883453 + }, + { + "epoch": 1.4189242633972712, + "step": 14351, + "train/sim_loss": 0.017682254314422607 + }, + { + "epoch": 1.4189242633972712, + "step": 14351, + "train/total_loss": 0.03436758741736412 + }, + { + "entropy": 9.539997100830078, + "epoch": 1.4190231362467867, + "mean_token_accuracy": 0.8564814925193787, + "num_tokens": 29527423.0, + "step": 14352, + "train/ce_loss": 0.5375288128852844 + }, + { + "epoch": 1.4190231362467867, + "step": 14352, + "train/sim_loss": 0.052896320819854736 + }, + { + "epoch": 1.4190231362467867, + "step": 14352, + "train/total_loss": 0.10664920508861542 + }, + { + "entropy": 9.600976943969727, + "epoch": 1.4191220090963022, + "mean_token_accuracy": 0.8475222587585449, + "num_tokens": 29548669.0, + "step": 14353, + "train/ce_loss": 0.3808101415634155 + }, + { + "epoch": 1.4191220090963022, + "step": 14353, + "train/sim_loss": 0.07268297672271729 + }, + { + "epoch": 1.4191220090963022, + "step": 14353, + "train/total_loss": 0.11076399683952332 + }, + { + "entropy": 9.750638961791992, + "epoch": 1.4192208819458176, + "mean_token_accuracy": 0.8871892690658569, + "num_tokens": 29560558.0, + "step": 14354, + "train/ce_loss": 0.572519063949585 + }, + { + "epoch": 1.4192208819458176, + "step": 14354, + "train/sim_loss": 0.04346185922622681 + }, + { + "epoch": 1.4192208819458176, + "step": 14354, + "train/total_loss": 0.10071376711130142 + }, + { + "entropy": 9.751394271850586, + "epoch": 1.4193197547953331, + "mean_token_accuracy": 0.9325000047683716, + "num_tokens": 29572626.0, + "step": 14355, + "train/ce_loss": 0.33268243074417114 + }, + { + "epoch": 1.4193197547953331, + "step": 14355, + "train/sim_loss": 0.09396141767501831 + }, + { + "epoch": 1.4193197547953331, + "step": 14355, + "train/total_loss": 0.12722966074943542 + }, + { + "entropy": 9.603422164916992, + "epoch": 1.4194186276448488, + "mean_token_accuracy": 0.834002673625946, + "num_tokens": 29585889.0, + "step": 14356, + "train/ce_loss": 0.7494556903839111 + }, + { + "epoch": 1.4194186276448488, + "step": 14356, + "train/sim_loss": 0.07169663906097412 + }, + { + "epoch": 1.4194186276448488, + "step": 14356, + "train/total_loss": 0.14664220809936523 + }, + { + "entropy": 9.41692066192627, + "epoch": 1.4195175004943643, + "mean_token_accuracy": 0.881154477596283, + "num_tokens": 29603001.0, + "step": 14357, + "train/ce_loss": 8.402991511502478e-07 + }, + { + "epoch": 1.4195175004943643, + "step": 14357, + "train/sim_loss": 0.042229652404785156 + }, + { + "epoch": 1.4195175004943643, + "step": 14357, + "train/total_loss": 0.04222973808646202 + }, + { + "entropy": 9.174726486206055, + "epoch": 1.4196163733438798, + "mean_token_accuracy": 0.8441246747970581, + "num_tokens": 29615063.0, + "step": 14358, + "train/ce_loss": 0.3704524636268616 + }, + { + "epoch": 1.4196163733438798, + "step": 14358, + "train/sim_loss": 0.08056056499481201 + }, + { + "epoch": 1.4196163733438798, + "step": 14358, + "train/total_loss": 0.11760581284761429 + }, + { + "entropy": 8.988004684448242, + "epoch": 1.4197152461933953, + "mean_token_accuracy": 0.8723404407501221, + "num_tokens": 29625889.0, + "step": 14359, + "train/ce_loss": 0.44274288415908813 + }, + { + "epoch": 1.4197152461933953, + "step": 14359, + "train/sim_loss": 0.028612375259399414 + }, + { + "epoch": 1.4197152461933953, + "step": 14359, + "train/total_loss": 0.07288666069507599 + }, + { + "epoch": 1.4198141190429108, + "grad_norm": 0.5719578862190247, + "learning_rate": 6.452306779409584e-06, + "loss": 0.0845, + "step": 14360 + }, + { + "entropy": 9.2268705368042, + "epoch": 1.4198141190429108, + "mean_token_accuracy": 0.8949671983718872, + "num_tokens": 29636589.0, + "step": 14360, + "train/ce_loss": 0.4781950116157532 + }, + { + "epoch": 1.4198141190429108, + "step": 14360, + "train/sim_loss": 0.04453247785568237 + }, + { + "epoch": 1.4198141190429108, + "step": 14360, + "train/total_loss": 0.09235198050737381 + }, + { + "entropy": 9.429277420043945, + "epoch": 1.4199129918924263, + "mean_token_accuracy": 0.8163742423057556, + "num_tokens": 29649697.0, + "step": 14361, + "train/ce_loss": 0.31584808230400085 + }, + { + "epoch": 1.4199129918924263, + "step": 14361, + "train/sim_loss": 0.02922278642654419 + }, + { + "epoch": 1.4199129918924263, + "step": 14361, + "train/total_loss": 0.060807596892118454 + }, + { + "entropy": 9.61331558227539, + "epoch": 1.4200118647419417, + "mean_token_accuracy": 0.910646378993988, + "num_tokens": 29663848.0, + "step": 14362, + "train/ce_loss": 0.423469603061676 + }, + { + "epoch": 1.4200118647419417, + "step": 14362, + "train/sim_loss": 0.014092743396759033 + }, + { + "epoch": 1.4200118647419417, + "step": 14362, + "train/total_loss": 0.056439705193042755 + }, + { + "entropy": 9.536380767822266, + "epoch": 1.4201107375914575, + "mean_token_accuracy": 0.8996139168739319, + "num_tokens": 29679518.0, + "step": 14363, + "train/ce_loss": 0.6750512719154358 + }, + { + "epoch": 1.4201107375914575, + "step": 14363, + "train/sim_loss": 0.039667606353759766 + }, + { + "epoch": 1.4201107375914575, + "step": 14363, + "train/total_loss": 0.10717273503541946 + }, + { + "entropy": 9.782621383666992, + "epoch": 1.420209610440973, + "mean_token_accuracy": 0.8603066205978394, + "num_tokens": 29697148.0, + "step": 14364, + "train/ce_loss": 0.7832186818122864 + }, + { + "epoch": 1.420209610440973, + "step": 14364, + "train/sim_loss": 0.04289919137954712 + }, + { + "epoch": 1.420209610440973, + "step": 14364, + "train/total_loss": 0.12122105807065964 + }, + { + "entropy": 9.539461135864258, + "epoch": 1.4203084832904884, + "mean_token_accuracy": 0.8900523781776428, + "num_tokens": 29702920.0, + "step": 14365, + "train/ce_loss": 0.5576204061508179 + }, + { + "epoch": 1.4203084832904884, + "step": 14365, + "train/sim_loss": 0.010967016220092773 + }, + { + "epoch": 1.4203084832904884, + "step": 14365, + "train/total_loss": 0.06672905385494232 + }, + { + "entropy": 9.692572593688965, + "epoch": 1.420407356140004, + "mean_token_accuracy": 0.8849999904632568, + "num_tokens": 29717370.0, + "step": 14366, + "train/ce_loss": 0.4201872646808624 + }, + { + "epoch": 1.420407356140004, + "step": 14366, + "train/sim_loss": 0.05213463306427002 + }, + { + "epoch": 1.420407356140004, + "step": 14366, + "train/total_loss": 0.09415335953235626 + }, + { + "entropy": 9.595476150512695, + "epoch": 1.4205062289895194, + "mean_token_accuracy": 0.8802660703659058, + "num_tokens": 29730927.0, + "step": 14367, + "train/ce_loss": 0.5624062418937683 + }, + { + "epoch": 1.4205062289895194, + "step": 14367, + "train/sim_loss": 0.03248333930969238 + }, + { + "epoch": 1.4205062289895194, + "step": 14367, + "train/total_loss": 0.08872396498918533 + }, + { + "entropy": 9.339960098266602, + "epoch": 1.420605101839035, + "mean_token_accuracy": 0.8848167657852173, + "num_tokens": 29746975.0, + "step": 14368, + "train/ce_loss": 0.524850070476532 + }, + { + "epoch": 1.420605101839035, + "step": 14368, + "train/sim_loss": 0.01608818769454956 + }, + { + "epoch": 1.420605101839035, + "step": 14368, + "train/total_loss": 0.06857319176197052 + }, + { + "entropy": 9.550315856933594, + "epoch": 1.4207039746885506, + "mean_token_accuracy": 0.8657957315444946, + "num_tokens": 29763643.0, + "step": 14369, + "train/ce_loss": 0.4445662200450897 + }, + { + "epoch": 1.4207039746885506, + "step": 14369, + "train/sim_loss": 0.023840785026550293 + }, + { + "epoch": 1.4207039746885506, + "step": 14369, + "train/total_loss": 0.06829740852117538 + }, + { + "entropy": 9.621057510375977, + "epoch": 1.420802847538066, + "mean_token_accuracy": 0.8841201663017273, + "num_tokens": 29775391.0, + "step": 14370, + "train/ce_loss": 5.818711201754923e-07 + }, + { + "epoch": 1.420802847538066, + "step": 14370, + "train/sim_loss": 0.03262448310852051 + }, + { + "epoch": 1.420802847538066, + "step": 14370, + "train/total_loss": 0.03262454271316528 + }, + { + "entropy": 9.363765716552734, + "epoch": 1.4209017203875816, + "mean_token_accuracy": 0.8105670213699341, + "num_tokens": 29782422.0, + "step": 14371, + "train/ce_loss": 0.3315468728542328 + }, + { + "epoch": 1.4209017203875816, + "step": 14371, + "train/sim_loss": 0.018909037113189697 + }, + { + "epoch": 1.4209017203875816, + "step": 14371, + "train/total_loss": 0.052063725888729095 + }, + { + "entropy": 9.420591354370117, + "epoch": 1.421000593237097, + "mean_token_accuracy": 0.862811803817749, + "num_tokens": 29795538.0, + "step": 14372, + "train/ce_loss": 0.09769900143146515 + }, + { + "epoch": 1.421000593237097, + "step": 14372, + "train/sim_loss": 0.08475112915039062 + }, + { + "epoch": 1.421000593237097, + "step": 14372, + "train/total_loss": 0.09452103078365326 + }, + { + "entropy": 9.174867630004883, + "epoch": 1.4210994660866128, + "mean_token_accuracy": 0.847478449344635, + "num_tokens": 29806022.0, + "step": 14373, + "train/ce_loss": 0.1648331582546234 + }, + { + "epoch": 1.4210994660866128, + "step": 14373, + "train/sim_loss": 0.0450892448425293 + }, + { + "epoch": 1.4210994660866128, + "step": 14373, + "train/total_loss": 0.06157255917787552 + }, + { + "entropy": 9.135377883911133, + "epoch": 1.421198338936128, + "mean_token_accuracy": 0.891933023929596, + "num_tokens": 29813983.0, + "step": 14374, + "train/ce_loss": 0.264219731092453 + }, + { + "epoch": 1.421198338936128, + "step": 14374, + "train/sim_loss": 0.04984921216964722 + }, + { + "epoch": 1.421198338936128, + "step": 14374, + "train/total_loss": 0.0762711837887764 + }, + { + "entropy": 9.213376998901367, + "epoch": 1.4212972117856437, + "mean_token_accuracy": 0.921241044998169, + "num_tokens": 29824311.0, + "step": 14375, + "train/ce_loss": 0.35855409502983093 + }, + { + "epoch": 1.4212972117856437, + "step": 14375, + "train/sim_loss": 0.08793377876281738 + }, + { + "epoch": 1.4212972117856437, + "step": 14375, + "train/total_loss": 0.12378919124603271 + }, + { + "entropy": 9.54196548461914, + "epoch": 1.4213960846351592, + "mean_token_accuracy": 0.8018741607666016, + "num_tokens": 29837821.0, + "step": 14376, + "train/ce_loss": 0.48526984453201294 + }, + { + "epoch": 1.4213960846351592, + "step": 14376, + "train/sim_loss": 0.03457033634185791 + }, + { + "epoch": 1.4213960846351592, + "step": 14376, + "train/total_loss": 0.08309732377529144 + }, + { + "entropy": 8.869089126586914, + "epoch": 1.4214949574846747, + "mean_token_accuracy": 0.829383909702301, + "num_tokens": 29845357.0, + "step": 14377, + "train/ce_loss": 0.28027522563934326 + }, + { + "epoch": 1.4214949574846747, + "step": 14377, + "train/sim_loss": 0.030906319618225098 + }, + { + "epoch": 1.4214949574846747, + "step": 14377, + "train/total_loss": 0.058933842927217484 + }, + { + "entropy": 9.719057083129883, + "epoch": 1.4215938303341902, + "mean_token_accuracy": 0.8583162426948547, + "num_tokens": 29850226.0, + "step": 14378, + "train/ce_loss": 3.83815233817586e-07 + }, + { + "epoch": 1.4215938303341902, + "step": 14378, + "train/sim_loss": 0.014569759368896484 + }, + { + "epoch": 1.4215938303341902, + "step": 14378, + "train/total_loss": 0.014569797553122044 + }, + { + "entropy": 9.189773559570312, + "epoch": 1.4216927031837057, + "mean_token_accuracy": 0.8282828330993652, + "num_tokens": 29864696.0, + "step": 14379, + "train/ce_loss": 0.45748409628868103 + }, + { + "epoch": 1.4216927031837057, + "step": 14379, + "train/sim_loss": 0.01985490322113037 + }, + { + "epoch": 1.4216927031837057, + "step": 14379, + "train/total_loss": 0.06560331583023071 + }, + { + "epoch": 1.4217915760332214, + "grad_norm": 0.6738894581794739, + "learning_rate": 6.447361914651635e-06, + "loss": 0.0797, + "step": 14380 + }, + { + "entropy": 9.50020694732666, + "epoch": 1.4217915760332214, + "mean_token_accuracy": 0.9017264246940613, + "num_tokens": 29872325.0, + "step": 14380, + "train/ce_loss": 4.863217668571451e-07 + }, + { + "epoch": 1.4217915760332214, + "step": 14380, + "train/sim_loss": 0.013253450393676758 + }, + { + "epoch": 1.4217915760332214, + "step": 14380, + "train/total_loss": 0.013253498822450638 + }, + { + "entropy": 9.71529483795166, + "epoch": 1.4218904488827369, + "mean_token_accuracy": 0.897744357585907, + "num_tokens": 29881810.0, + "step": 14381, + "train/ce_loss": 2.5096460376516916e-07 + }, + { + "epoch": 1.4218904488827369, + "step": 14381, + "train/sim_loss": 0.008940935134887695 + }, + { + "epoch": 1.4218904488827369, + "step": 14381, + "train/total_loss": 0.00894096028059721 + }, + { + "entropy": 9.100013732910156, + "epoch": 1.4219893217322523, + "mean_token_accuracy": 0.8256410360336304, + "num_tokens": 29890421.0, + "step": 14382, + "train/ce_loss": 0.4414537847042084 + }, + { + "epoch": 1.4219893217322523, + "step": 14382, + "train/sim_loss": 0.05950695276260376 + }, + { + "epoch": 1.4219893217322523, + "step": 14382, + "train/total_loss": 0.10365232825279236 + }, + { + "entropy": 9.595781326293945, + "epoch": 1.4220881945817678, + "mean_token_accuracy": 0.8804348111152649, + "num_tokens": 29901728.0, + "step": 14383, + "train/ce_loss": 3.3763171813916415e-07 + }, + { + "epoch": 1.4220881945817678, + "step": 14383, + "train/sim_loss": 0.010573863983154297 + }, + { + "epoch": 1.4220881945817678, + "step": 14383, + "train/total_loss": 0.010573897510766983 + }, + { + "entropy": 9.510406494140625, + "epoch": 1.4221870674312833, + "mean_token_accuracy": 0.8604651093482971, + "num_tokens": 29909485.0, + "step": 14384, + "train/ce_loss": 0.30954283475875854 + }, + { + "epoch": 1.4221870674312833, + "step": 14384, + "train/sim_loss": 0.032064199447631836 + }, + { + "epoch": 1.4221870674312833, + "step": 14384, + "train/total_loss": 0.06301848590373993 + }, + { + "entropy": 9.581403732299805, + "epoch": 1.422285940280799, + "mean_token_accuracy": 0.8669108748435974, + "num_tokens": 29920658.0, + "step": 14385, + "train/ce_loss": 0.4239061176776886 + }, + { + "epoch": 1.422285940280799, + "step": 14385, + "train/sim_loss": 0.0358428955078125 + }, + { + "epoch": 1.422285940280799, + "step": 14385, + "train/total_loss": 0.0782335102558136 + }, + { + "entropy": 9.331624984741211, + "epoch": 1.4223848131303143, + "mean_token_accuracy": 0.8125, + "num_tokens": 29939189.0, + "step": 14386, + "train/ce_loss": 0.8725661635398865 + }, + { + "epoch": 1.4223848131303143, + "step": 14386, + "train/sim_loss": 0.09237653017044067 + }, + { + "epoch": 1.4223848131303143, + "step": 14386, + "train/total_loss": 0.17963314056396484 + }, + { + "entropy": 9.965359687805176, + "epoch": 1.42248368597983, + "mean_token_accuracy": 0.9244186282157898, + "num_tokens": 29946378.0, + "step": 14387, + "train/ce_loss": 2.952239356091013e-06 + }, + { + "epoch": 1.42248368597983, + "step": 14387, + "train/sim_loss": 0.024430155754089355 + }, + { + "epoch": 1.42248368597983, + "step": 14387, + "train/total_loss": 0.024430450052022934 + }, + { + "entropy": 9.51984977722168, + "epoch": 1.4225825588293455, + "mean_token_accuracy": 0.902158260345459, + "num_tokens": 29958651.0, + "step": 14388, + "train/ce_loss": 0.3957468569278717 + }, + { + "epoch": 1.4225825588293455, + "step": 14388, + "train/sim_loss": 0.06270807981491089 + }, + { + "epoch": 1.4225825588293455, + "step": 14388, + "train/total_loss": 0.10228276252746582 + }, + { + "entropy": 9.180434226989746, + "epoch": 1.422681431678861, + "mean_token_accuracy": 0.8333333134651184, + "num_tokens": 29969656.0, + "step": 14389, + "train/ce_loss": 0.7478528022766113 + }, + { + "epoch": 1.422681431678861, + "step": 14389, + "train/sim_loss": 0.048356056213378906 + }, + { + "epoch": 1.422681431678861, + "step": 14389, + "train/total_loss": 0.1231413409113884 + }, + { + "entropy": 9.331530570983887, + "epoch": 1.4227803045283764, + "mean_token_accuracy": 0.7868263721466064, + "num_tokens": 29981602.0, + "step": 14390, + "train/ce_loss": 0.28279784321784973 + }, + { + "epoch": 1.4227803045283764, + "step": 14390, + "train/sim_loss": 0.04544055461883545 + }, + { + "epoch": 1.4227803045283764, + "step": 14390, + "train/total_loss": 0.07372033596038818 + }, + { + "entropy": 9.380842208862305, + "epoch": 1.422879177377892, + "mean_token_accuracy": 0.7884417176246643, + "num_tokens": 29991772.0, + "step": 14391, + "train/ce_loss": 0.24626410007476807 + }, + { + "epoch": 1.422879177377892, + "step": 14391, + "train/sim_loss": 0.05542159080505371 + }, + { + "epoch": 1.422879177377892, + "step": 14391, + "train/total_loss": 0.08004800230264664 + }, + { + "entropy": 9.432313919067383, + "epoch": 1.4229780502274076, + "mean_token_accuracy": 0.8379022479057312, + "num_tokens": 30011795.0, + "step": 14392, + "train/ce_loss": 0.5361075401306152 + }, + { + "epoch": 1.4229780502274076, + "step": 14392, + "train/sim_loss": 0.012907147407531738 + }, + { + "epoch": 1.4229780502274076, + "step": 14392, + "train/total_loss": 0.0665179044008255 + }, + { + "entropy": 9.027058601379395, + "epoch": 1.4230769230769231, + "mean_token_accuracy": 0.8665997982025146, + "num_tokens": 30027258.0, + "step": 14393, + "train/ce_loss": 0.7050599455833435 + }, + { + "epoch": 1.4230769230769231, + "step": 14393, + "train/sim_loss": 0.017193317413330078 + }, + { + "epoch": 1.4230769230769231, + "step": 14393, + "train/total_loss": 0.08769931644201279 + }, + { + "entropy": 8.955638885498047, + "epoch": 1.4231757959264386, + "mean_token_accuracy": 0.8365679383277893, + "num_tokens": 30040380.0, + "step": 14394, + "train/ce_loss": 0.47322607040405273 + }, + { + "epoch": 1.4231757959264386, + "step": 14394, + "train/sim_loss": 0.07600116729736328 + }, + { + "epoch": 1.4231757959264386, + "step": 14394, + "train/total_loss": 0.12332377582788467 + }, + { + "entropy": 9.741524696350098, + "epoch": 1.423274668775954, + "mean_token_accuracy": 0.8600953817367554, + "num_tokens": 30055614.0, + "step": 14395, + "train/ce_loss": 8.891952347767074e-07 + }, + { + "epoch": 1.423274668775954, + "step": 14395, + "train/sim_loss": 0.044725656509399414 + }, + { + "epoch": 1.423274668775954, + "step": 14395, + "train/total_loss": 0.04472574591636658 + }, + { + "entropy": 9.773210525512695, + "epoch": 1.4233735416254696, + "mean_token_accuracy": 0.8286852836608887, + "num_tokens": 30074117.0, + "step": 14396, + "train/ce_loss": 3.4891618838628347e-07 + }, + { + "epoch": 1.4233735416254696, + "step": 14396, + "train/sim_loss": 0.03297215700149536 + }, + { + "epoch": 1.4233735416254696, + "step": 14396, + "train/total_loss": 0.03297219052910805 + }, + { + "entropy": 9.36522102355957, + "epoch": 1.4234724144749853, + "mean_token_accuracy": 0.8742690086364746, + "num_tokens": 30088831.0, + "step": 14397, + "train/ce_loss": 0.25782716274261475 + }, + { + "epoch": 1.4234724144749853, + "step": 14397, + "train/sim_loss": 0.018538594245910645 + }, + { + "epoch": 1.4234724144749853, + "step": 14397, + "train/total_loss": 0.04432131350040436 + }, + { + "entropy": 9.48720932006836, + "epoch": 1.4235712873245006, + "mean_token_accuracy": 0.8527919054031372, + "num_tokens": 30097769.0, + "step": 14398, + "train/ce_loss": 0.4767293334007263 + }, + { + "epoch": 1.4235712873245006, + "step": 14398, + "train/sim_loss": 0.06864815950393677 + }, + { + "epoch": 1.4235712873245006, + "step": 14398, + "train/total_loss": 0.11632109433412552 + }, + { + "entropy": 9.829651832580566, + "epoch": 1.4236701601740163, + "mean_token_accuracy": 0.8564356565475464, + "num_tokens": 30109304.0, + "step": 14399, + "train/ce_loss": 1.5444480823134654e-06 + }, + { + "epoch": 1.4236701601740163, + "step": 14399, + "train/sim_loss": 0.044050753116607666 + }, + { + "epoch": 1.4236701601740163, + "step": 14399, + "train/total_loss": 0.0440509058535099 + }, + { + "epoch": 1.4237690330235317, + "grad_norm": 0.6657541394233704, + "learning_rate": 6.442417049893686e-06, + "loss": 0.0849, + "step": 14400 + }, + { + "entropy": 9.28369140625, + "epoch": 1.4237690330235317, + "mean_token_accuracy": 0.818823516368866, + "num_tokens": 30125242.0, + "step": 14400, + "train/ce_loss": 0.6295405626296997 + }, + { + "epoch": 1.4237690330235317, + "step": 14400, + "train/sim_loss": 0.0541079044342041 + }, + { + "epoch": 1.4237690330235317, + "step": 14400, + "train/total_loss": 0.11706196516752243 + }, + { + "entropy": 9.77627182006836, + "epoch": 1.4238679058730472, + "mean_token_accuracy": 0.8685445785522461, + "num_tokens": 30138186.0, + "step": 14401, + "train/ce_loss": 1.0685665607452393 + }, + { + "epoch": 1.4238679058730472, + "step": 14401, + "train/sim_loss": 0.08295619487762451 + }, + { + "epoch": 1.4238679058730472, + "step": 14401, + "train/total_loss": 0.18981285393238068 + }, + { + "entropy": 9.12117862701416, + "epoch": 1.4239667787225627, + "mean_token_accuracy": 0.8150619864463806, + "num_tokens": 30149861.0, + "step": 14402, + "train/ce_loss": 0.32327353954315186 + }, + { + "epoch": 1.4239667787225627, + "step": 14402, + "train/sim_loss": 0.013737142086029053 + }, + { + "epoch": 1.4239667787225627, + "step": 14402, + "train/total_loss": 0.04606449604034424 + }, + { + "entropy": 9.296346664428711, + "epoch": 1.4240656515720782, + "mean_token_accuracy": 0.8706766963005066, + "num_tokens": 30157226.0, + "step": 14403, + "train/ce_loss": 0.4213647246360779 + }, + { + "epoch": 1.4240656515720782, + "step": 14403, + "train/sim_loss": 0.050713181495666504 + }, + { + "epoch": 1.4240656515720782, + "step": 14403, + "train/total_loss": 0.09284965693950653 + }, + { + "entropy": 9.435564994812012, + "epoch": 1.424164524421594, + "mean_token_accuracy": 0.8665018677711487, + "num_tokens": 30171027.0, + "step": 14404, + "train/ce_loss": 0.12846946716308594 + }, + { + "epoch": 1.424164524421594, + "step": 14404, + "train/sim_loss": 0.06796753406524658 + }, + { + "epoch": 1.424164524421594, + "step": 14404, + "train/total_loss": 0.08081448078155518 + }, + { + "entropy": 9.857213973999023, + "epoch": 1.4242633972711094, + "mean_token_accuracy": 0.8396226167678833, + "num_tokens": 30182274.0, + "step": 14405, + "train/ce_loss": 8.164516884789919e-07 + }, + { + "epoch": 1.4242633972711094, + "step": 14405, + "train/sim_loss": 0.04376363754272461 + }, + { + "epoch": 1.4242633972711094, + "step": 14405, + "train/total_loss": 0.043763719499111176 + }, + { + "entropy": 9.271778106689453, + "epoch": 1.4243622701206249, + "mean_token_accuracy": 0.8305439352989197, + "num_tokens": 30194954.0, + "step": 14406, + "train/ce_loss": 0.3468573987483978 + }, + { + "epoch": 1.4243622701206249, + "step": 14406, + "train/sim_loss": 0.011892735958099365 + }, + { + "epoch": 1.4243622701206249, + "step": 14406, + "train/total_loss": 0.04657847806811333 + }, + { + "entropy": 9.379627227783203, + "epoch": 1.4244611429701404, + "mean_token_accuracy": 0.8519637584686279, + "num_tokens": 30207330.0, + "step": 14407, + "train/ce_loss": 0.39181822538375854 + }, + { + "epoch": 1.4244611429701404, + "step": 14407, + "train/sim_loss": 0.010316014289855957 + }, + { + "epoch": 1.4244611429701404, + "step": 14407, + "train/total_loss": 0.04949783906340599 + }, + { + "entropy": 8.898662567138672, + "epoch": 1.4245600158196559, + "mean_token_accuracy": 0.8709677457809448, + "num_tokens": 30217835.0, + "step": 14408, + "train/ce_loss": 0.4519009590148926 + }, + { + "epoch": 1.4245600158196559, + "step": 14408, + "train/sim_loss": 0.05195295810699463 + }, + { + "epoch": 1.4245600158196559, + "step": 14408, + "train/total_loss": 0.09714305400848389 + }, + { + "entropy": 9.387675285339355, + "epoch": 1.4246588886691716, + "mean_token_accuracy": 0.8269720077514648, + "num_tokens": 30228412.0, + "step": 14409, + "train/ce_loss": 0.5151435732841492 + }, + { + "epoch": 1.4246588886691716, + "step": 14409, + "train/sim_loss": 0.03410637378692627 + }, + { + "epoch": 1.4246588886691716, + "step": 14409, + "train/total_loss": 0.08562073111534119 + }, + { + "entropy": 9.852798461914062, + "epoch": 1.424757761518687, + "mean_token_accuracy": 0.8360927104949951, + "num_tokens": 30236326.0, + "step": 14410, + "train/ce_loss": 0.8218562602996826 + }, + { + "epoch": 1.424757761518687, + "step": 14410, + "train/sim_loss": 0.03721785545349121 + }, + { + "epoch": 1.424757761518687, + "step": 14410, + "train/total_loss": 0.11940348148345947 + }, + { + "entropy": 9.307390213012695, + "epoch": 1.4248566343682025, + "mean_token_accuracy": 0.8496659398078918, + "num_tokens": 30248321.0, + "step": 14411, + "train/ce_loss": 1.035532832145691 + }, + { + "epoch": 1.4248566343682025, + "step": 14411, + "train/sim_loss": 0.04311215877532959 + }, + { + "epoch": 1.4248566343682025, + "step": 14411, + "train/total_loss": 0.14666545391082764 + }, + { + "entropy": 9.321115493774414, + "epoch": 1.424955507217718, + "mean_token_accuracy": 0.837837815284729, + "num_tokens": 30264970.0, + "step": 14412, + "train/ce_loss": 0.4192918539047241 + }, + { + "epoch": 1.424955507217718, + "step": 14412, + "train/sim_loss": 0.02553558349609375 + }, + { + "epoch": 1.424955507217718, + "step": 14412, + "train/total_loss": 0.06746476888656616 + }, + { + "entropy": 9.664200782775879, + "epoch": 1.4250543800672335, + "mean_token_accuracy": 0.8174019455909729, + "num_tokens": 30275058.0, + "step": 14413, + "train/ce_loss": 0.5507782101631165 + }, + { + "epoch": 1.4250543800672335, + "step": 14413, + "train/sim_loss": 0.09768271446228027 + }, + { + "epoch": 1.4250543800672335, + "step": 14413, + "train/total_loss": 0.15276053547859192 + }, + { + "entropy": 9.471854209899902, + "epoch": 1.425153252916749, + "mean_token_accuracy": 0.8231292366981506, + "num_tokens": 30282114.0, + "step": 14414, + "train/ce_loss": 0.5135163068771362 + }, + { + "epoch": 1.425153252916749, + "step": 14414, + "train/sim_loss": 0.025317728519439697 + }, + { + "epoch": 1.425153252916749, + "step": 14414, + "train/total_loss": 0.0766693651676178 + }, + { + "entropy": 9.75131893157959, + "epoch": 1.4252521257662645, + "mean_token_accuracy": 0.8820422291755676, + "num_tokens": 30291785.0, + "step": 14415, + "train/ce_loss": 0.437326043844223 + }, + { + "epoch": 1.4252521257662645, + "step": 14415, + "train/sim_loss": 0.04425394535064697 + }, + { + "epoch": 1.4252521257662645, + "step": 14415, + "train/total_loss": 0.0879865512251854 + }, + { + "entropy": 9.103126525878906, + "epoch": 1.4253509986157802, + "mean_token_accuracy": 0.8824833631515503, + "num_tokens": 30304456.0, + "step": 14416, + "train/ce_loss": 0.3024064600467682 + }, + { + "epoch": 1.4253509986157802, + "step": 14416, + "train/sim_loss": 0.016855955123901367 + }, + { + "epoch": 1.4253509986157802, + "step": 14416, + "train/total_loss": 0.047096602618694305 + }, + { + "entropy": 9.202560424804688, + "epoch": 1.4254498714652957, + "mean_token_accuracy": 0.8056460618972778, + "num_tokens": 30317097.0, + "step": 14417, + "train/ce_loss": 0.7644512057304382 + }, + { + "epoch": 1.4254498714652957, + "step": 14417, + "train/sim_loss": 0.050118982791900635 + }, + { + "epoch": 1.4254498714652957, + "step": 14417, + "train/total_loss": 0.1265641152858734 + }, + { + "entropy": 9.338340759277344, + "epoch": 1.4255487443148112, + "mean_token_accuracy": 0.7942794561386108, + "num_tokens": 30327296.0, + "step": 14418, + "train/ce_loss": 0.46658632159233093 + }, + { + "epoch": 1.4255487443148112, + "step": 14418, + "train/sim_loss": 0.031600117683410645 + }, + { + "epoch": 1.4255487443148112, + "step": 14418, + "train/total_loss": 0.07825875282287598 + }, + { + "entropy": 9.655938148498535, + "epoch": 1.4256476171643266, + "mean_token_accuracy": 0.8462623357772827, + "num_tokens": 30338058.0, + "step": 14419, + "train/ce_loss": 0.5866039991378784 + }, + { + "epoch": 1.4256476171643266, + "step": 14419, + "train/sim_loss": 0.051235079765319824 + }, + { + "epoch": 1.4256476171643266, + "step": 14419, + "train/total_loss": 0.1098954826593399 + }, + { + "epoch": 1.4257464900138421, + "grad_norm": 0.536979079246521, + "learning_rate": 6.437472185135737e-06, + "loss": 0.0906, + "step": 14420 + }, + { + "entropy": 9.576374053955078, + "epoch": 1.4257464900138421, + "mean_token_accuracy": 0.8026070594787598, + "num_tokens": 30347431.0, + "step": 14420, + "train/ce_loss": 7.707270128776145e-07 + }, + { + "epoch": 1.4257464900138421, + "step": 14420, + "train/sim_loss": 0.013219118118286133 + }, + { + "epoch": 1.4257464900138421, + "step": 14420, + "train/total_loss": 0.013219195418059826 + }, + { + "entropy": 9.522965431213379, + "epoch": 1.4258453628633578, + "mean_token_accuracy": 0.8342036604881287, + "num_tokens": 30362042.0, + "step": 14421, + "train/ce_loss": 0.4467335343360901 + }, + { + "epoch": 1.4258453628633578, + "step": 14421, + "train/sim_loss": 0.03561514616012573 + }, + { + "epoch": 1.4258453628633578, + "step": 14421, + "train/total_loss": 0.08028849959373474 + }, + { + "entropy": 9.434203147888184, + "epoch": 1.4259442357128733, + "mean_token_accuracy": 0.8999999761581421, + "num_tokens": 30374157.0, + "step": 14422, + "train/ce_loss": 0.397684782743454 + }, + { + "epoch": 1.4259442357128733, + "step": 14422, + "train/sim_loss": 0.053305864334106445 + }, + { + "epoch": 1.4259442357128733, + "step": 14422, + "train/total_loss": 0.09307434409856796 + }, + { + "entropy": 9.570005416870117, + "epoch": 1.4260431085623888, + "mean_token_accuracy": 0.8877722024917603, + "num_tokens": 30385445.0, + "step": 14423, + "train/ce_loss": 0.3041023910045624 + }, + { + "epoch": 1.4260431085623888, + "step": 14423, + "train/sim_loss": 0.05312490463256836 + }, + { + "epoch": 1.4260431085623888, + "step": 14423, + "train/total_loss": 0.08353514224290848 + }, + { + "entropy": 9.801443099975586, + "epoch": 1.4261419814119043, + "mean_token_accuracy": 0.8623718619346619, + "num_tokens": 30400165.0, + "step": 14424, + "train/ce_loss": 0.5121539235115051 + }, + { + "epoch": 1.4261419814119043, + "step": 14424, + "train/sim_loss": 0.031054377555847168 + }, + { + "epoch": 1.4261419814119043, + "step": 14424, + "train/total_loss": 0.08226977288722992 + }, + { + "entropy": 9.861083984375, + "epoch": 1.4262408542614198, + "mean_token_accuracy": 0.8634020686149597, + "num_tokens": 30410019.0, + "step": 14425, + "train/ce_loss": 1.0403015613555908 + }, + { + "epoch": 1.4262408542614198, + "step": 14425, + "train/sim_loss": 0.028311550617218018 + }, + { + "epoch": 1.4262408542614198, + "step": 14425, + "train/total_loss": 0.13234171271324158 + }, + { + "entropy": 9.466300010681152, + "epoch": 1.4263397271109353, + "mean_token_accuracy": 0.8569231033325195, + "num_tokens": 30419734.0, + "step": 14426, + "train/ce_loss": 1.6188748759304872e-06 + }, + { + "epoch": 1.4263397271109353, + "step": 14426, + "train/sim_loss": 0.04745984077453613 + }, + { + "epoch": 1.4263397271109353, + "step": 14426, + "train/total_loss": 0.04746000096201897 + }, + { + "entropy": 9.092672348022461, + "epoch": 1.4264385999604507, + "mean_token_accuracy": 0.8565072417259216, + "num_tokens": 30429912.0, + "step": 14427, + "train/ce_loss": 0.5017252564430237 + }, + { + "epoch": 1.4264385999604507, + "step": 14427, + "train/sim_loss": 0.04049807786941528 + }, + { + "epoch": 1.4264385999604507, + "step": 14427, + "train/total_loss": 0.09067060053348541 + }, + { + "entropy": 9.281003952026367, + "epoch": 1.4265374728099665, + "mean_token_accuracy": 0.8806584477424622, + "num_tokens": 30441890.0, + "step": 14428, + "train/ce_loss": 0.41076597571372986 + }, + { + "epoch": 1.4265374728099665, + "step": 14428, + "train/sim_loss": 0.026047945022583008 + }, + { + "epoch": 1.4265374728099665, + "step": 14428, + "train/total_loss": 0.06712454557418823 + }, + { + "entropy": 9.674819946289062, + "epoch": 1.426636345659482, + "mean_token_accuracy": 0.8658743500709534, + "num_tokens": 30460461.0, + "step": 14429, + "train/ce_loss": 3.14919446964268e-07 + }, + { + "epoch": 1.426636345659482, + "step": 14429, + "train/sim_loss": 0.02171987295150757 + }, + { + "epoch": 1.426636345659482, + "step": 14429, + "train/total_loss": 0.021719904616475105 + }, + { + "entropy": 9.545660972595215, + "epoch": 1.4267352185089974, + "mean_token_accuracy": 0.8784000277519226, + "num_tokens": 30471116.0, + "step": 14430, + "train/ce_loss": 7.209491172943672e-07 + }, + { + "epoch": 1.4267352185089974, + "step": 14430, + "train/sim_loss": 0.029291510581970215 + }, + { + "epoch": 1.4267352185089974, + "step": 14430, + "train/total_loss": 0.029291583225131035 + }, + { + "entropy": 8.943201065063477, + "epoch": 1.426834091358513, + "mean_token_accuracy": 0.8826815485954285, + "num_tokens": 30480011.0, + "step": 14431, + "train/ce_loss": 0.10699521005153656 + }, + { + "epoch": 1.426834091358513, + "step": 14431, + "train/sim_loss": 0.07804977893829346 + }, + { + "epoch": 1.426834091358513, + "step": 14431, + "train/total_loss": 0.08874929696321487 + }, + { + "entropy": 9.532476425170898, + "epoch": 1.4269329642080284, + "mean_token_accuracy": 0.8366336822509766, + "num_tokens": 30494566.0, + "step": 14432, + "train/ce_loss": 0.7706764340400696 + }, + { + "epoch": 1.4269329642080284, + "step": 14432, + "train/sim_loss": 0.05781364440917969 + }, + { + "epoch": 1.4269329642080284, + "step": 14432, + "train/total_loss": 0.13488128781318665 + }, + { + "entropy": 9.712750434875488, + "epoch": 1.427031837057544, + "mean_token_accuracy": 0.8369330167770386, + "num_tokens": 30509738.0, + "step": 14433, + "train/ce_loss": 0.408974826335907 + }, + { + "epoch": 1.427031837057544, + "step": 14433, + "train/sim_loss": 0.05481022596359253 + }, + { + "epoch": 1.427031837057544, + "step": 14433, + "train/total_loss": 0.0957077145576477 + }, + { + "entropy": 9.484503746032715, + "epoch": 1.4271307099070596, + "mean_token_accuracy": 0.8157894611358643, + "num_tokens": 30521678.0, + "step": 14434, + "train/ce_loss": 0.8326213955879211 + }, + { + "epoch": 1.4271307099070596, + "step": 14434, + "train/sim_loss": 0.04897123575210571 + }, + { + "epoch": 1.4271307099070596, + "step": 14434, + "train/total_loss": 0.1322333812713623 + }, + { + "entropy": 9.508562088012695, + "epoch": 1.427229582756575, + "mean_token_accuracy": 0.8240000009536743, + "num_tokens": 30545746.0, + "step": 14435, + "train/ce_loss": 0.46211475133895874 + }, + { + "epoch": 1.427229582756575, + "step": 14435, + "train/sim_loss": 0.032687604427337646 + }, + { + "epoch": 1.427229582756575, + "step": 14435, + "train/total_loss": 0.078899085521698 + }, + { + "entropy": 9.290735244750977, + "epoch": 1.4273284556060906, + "mean_token_accuracy": 0.8657718300819397, + "num_tokens": 30556491.0, + "step": 14436, + "train/ce_loss": 0.4179740250110626 + }, + { + "epoch": 1.4273284556060906, + "step": 14436, + "train/sim_loss": 0.02799379825592041 + }, + { + "epoch": 1.4273284556060906, + "step": 14436, + "train/total_loss": 0.06979119777679443 + }, + { + "entropy": 9.384008407592773, + "epoch": 1.427427328455606, + "mean_token_accuracy": 0.8083832263946533, + "num_tokens": 30566282.0, + "step": 14437, + "train/ce_loss": 0.4120073914527893 + }, + { + "epoch": 1.427427328455606, + "step": 14437, + "train/sim_loss": 0.029180824756622314 + }, + { + "epoch": 1.427427328455606, + "step": 14437, + "train/total_loss": 0.07038156688213348 + }, + { + "entropy": 9.1912841796875, + "epoch": 1.4275262013051215, + "mean_token_accuracy": 0.8342421054840088, + "num_tokens": 30575378.0, + "step": 14438, + "train/ce_loss": 0.5051108598709106 + }, + { + "epoch": 1.4275262013051215, + "step": 14438, + "train/sim_loss": 0.06758522987365723 + }, + { + "epoch": 1.4275262013051215, + "step": 14438, + "train/total_loss": 0.11809632182121277 + }, + { + "entropy": 9.53966236114502, + "epoch": 1.427625074154637, + "mean_token_accuracy": 0.7991631627082825, + "num_tokens": 30585619.0, + "step": 14439, + "train/ce_loss": 0.6125617623329163 + }, + { + "epoch": 1.427625074154637, + "step": 14439, + "train/sim_loss": 0.027238130569458008 + }, + { + "epoch": 1.427625074154637, + "step": 14439, + "train/total_loss": 0.08849430829286575 + }, + { + "epoch": 1.4277239470041527, + "grad_norm": 0.5718291997909546, + "learning_rate": 6.432527320377788e-06, + "loss": 0.0831, + "step": 14440 + }, + { + "entropy": 9.015283584594727, + "epoch": 1.4277239470041527, + "mean_token_accuracy": 0.8540816307067871, + "num_tokens": 30598367.0, + "step": 14440, + "train/ce_loss": 0.18536114692687988 + }, + { + "epoch": 1.4277239470041527, + "step": 14440, + "train/sim_loss": 0.03653407096862793 + }, + { + "epoch": 1.4277239470041527, + "step": 14440, + "train/total_loss": 0.0550701841711998 + }, + { + "entropy": 9.366034507751465, + "epoch": 1.4278228198536682, + "mean_token_accuracy": 0.8106508851051331, + "num_tokens": 30607877.0, + "step": 14441, + "train/ce_loss": 0.3871096670627594 + }, + { + "epoch": 1.4278228198536682, + "step": 14441, + "train/sim_loss": 0.022189855575561523 + }, + { + "epoch": 1.4278228198536682, + "step": 14441, + "train/total_loss": 0.06090082228183746 + }, + { + "entropy": 8.783736228942871, + "epoch": 1.4279216927031837, + "mean_token_accuracy": 0.8404966592788696, + "num_tokens": 30615621.0, + "step": 14442, + "train/ce_loss": 0.2388465702533722 + }, + { + "epoch": 1.4279216927031837, + "step": 14442, + "train/sim_loss": 0.01282501220703125 + }, + { + "epoch": 1.4279216927031837, + "step": 14442, + "train/total_loss": 0.03670966997742653 + }, + { + "entropy": 9.840505599975586, + "epoch": 1.4280205655526992, + "mean_token_accuracy": 0.8723099231719971, + "num_tokens": 30627917.0, + "step": 14443, + "train/ce_loss": 2.0377519831527025e-06 + }, + { + "epoch": 1.4280205655526992, + "step": 14443, + "train/sim_loss": 0.04752671718597412 + }, + { + "epoch": 1.4280205655526992, + "step": 14443, + "train/total_loss": 0.047526922076940536 + }, + { + "entropy": 10.168814659118652, + "epoch": 1.4281194384022147, + "mean_token_accuracy": 0.8482906222343445, + "num_tokens": 30635210.0, + "step": 14444, + "train/ce_loss": 5.135109972798091e-07 + }, + { + "epoch": 1.4281194384022147, + "step": 14444, + "train/sim_loss": 0.02009117603302002 + }, + { + "epoch": 1.4281194384022147, + "step": 14444, + "train/total_loss": 0.020091228187084198 + }, + { + "entropy": 9.515216827392578, + "epoch": 1.4282183112517304, + "mean_token_accuracy": 0.8404017686843872, + "num_tokens": 30649828.0, + "step": 14445, + "train/ce_loss": 1.030260443687439 + }, + { + "epoch": 1.4282183112517304, + "step": 14445, + "train/sim_loss": 0.06465280055999756 + }, + { + "epoch": 1.4282183112517304, + "step": 14445, + "train/total_loss": 0.1676788479089737 + }, + { + "entropy": 9.626810073852539, + "epoch": 1.4283171841012459, + "mean_token_accuracy": 0.8420382142066956, + "num_tokens": 30665073.0, + "step": 14446, + "train/ce_loss": 0.2639566659927368 + }, + { + "epoch": 1.4283171841012459, + "step": 14446, + "train/sim_loss": 0.05916935205459595 + }, + { + "epoch": 1.4283171841012459, + "step": 14446, + "train/total_loss": 0.08556501567363739 + }, + { + "entropy": 9.884926795959473, + "epoch": 1.4284160569507613, + "mean_token_accuracy": 0.8936170339584351, + "num_tokens": 30674830.0, + "step": 14447, + "train/ce_loss": 0.8575129508972168 + }, + { + "epoch": 1.4284160569507613, + "step": 14447, + "train/sim_loss": 0.06317627429962158 + }, + { + "epoch": 1.4284160569507613, + "step": 14447, + "train/total_loss": 0.14892756938934326 + }, + { + "entropy": 9.380413055419922, + "epoch": 1.4285149298002768, + "mean_token_accuracy": 0.8526448607444763, + "num_tokens": 30687764.0, + "step": 14448, + "train/ce_loss": 0.3509548306465149 + }, + { + "epoch": 1.4285149298002768, + "step": 14448, + "train/sim_loss": 0.03521370887756348 + }, + { + "epoch": 1.4285149298002768, + "step": 14448, + "train/total_loss": 0.07030919194221497 + }, + { + "entropy": 9.3031005859375, + "epoch": 1.4286138026497923, + "mean_token_accuracy": 0.8342541456222534, + "num_tokens": 30698262.0, + "step": 14449, + "train/ce_loss": 0.25286129117012024 + }, + { + "epoch": 1.4286138026497923, + "step": 14449, + "train/sim_loss": 0.01182568073272705 + }, + { + "epoch": 1.4286138026497923, + "step": 14449, + "train/total_loss": 0.037111811339855194 + }, + { + "entropy": 9.257417678833008, + "epoch": 1.428712675499308, + "mean_token_accuracy": 0.8302299976348877, + "num_tokens": 30706826.0, + "step": 14450, + "train/ce_loss": 0.46577873826026917 + }, + { + "epoch": 1.428712675499308, + "step": 14450, + "train/sim_loss": 0.09371662139892578 + }, + { + "epoch": 1.428712675499308, + "step": 14450, + "train/total_loss": 0.14029449224472046 + }, + { + "entropy": 9.517560958862305, + "epoch": 1.4288115483488233, + "mean_token_accuracy": 0.8576576709747314, + "num_tokens": 30717879.0, + "step": 14451, + "train/ce_loss": 2.6493321456655394e-06 + }, + { + "epoch": 1.4288115483488233, + "step": 14451, + "train/sim_loss": 0.054498910903930664 + }, + { + "epoch": 1.4288115483488233, + "step": 14451, + "train/total_loss": 0.054499175399541855 + }, + { + "entropy": 9.049727439880371, + "epoch": 1.428910421198339, + "mean_token_accuracy": 0.8776595592498779, + "num_tokens": 30726845.0, + "step": 14452, + "train/ce_loss": 0.254780113697052 + }, + { + "epoch": 1.428910421198339, + "step": 14452, + "train/sim_loss": 0.07607471942901611 + }, + { + "epoch": 1.428910421198339, + "step": 14452, + "train/total_loss": 0.10155273228883743 + }, + { + "entropy": 9.529438018798828, + "epoch": 1.4290092940478545, + "mean_token_accuracy": 0.8423295617103577, + "num_tokens": 30738382.0, + "step": 14453, + "train/ce_loss": 0.4780544340610504 + }, + { + "epoch": 1.4290092940478545, + "step": 14453, + "train/sim_loss": 0.05442333221435547 + }, + { + "epoch": 1.4290092940478545, + "step": 14453, + "train/total_loss": 0.10222877562046051 + }, + { + "entropy": 9.769718170166016, + "epoch": 1.42910816689737, + "mean_token_accuracy": 0.7693920135498047, + "num_tokens": 30755864.0, + "step": 14454, + "train/ce_loss": 0.4812104105949402 + }, + { + "epoch": 1.42910816689737, + "step": 14454, + "train/sim_loss": 0.025490641593933105 + }, + { + "epoch": 1.42910816689737, + "step": 14454, + "train/total_loss": 0.07361168414354324 + }, + { + "entropy": 9.22054672241211, + "epoch": 1.4292070397468855, + "mean_token_accuracy": 0.8147239089012146, + "num_tokens": 30768213.0, + "step": 14455, + "train/ce_loss": 0.6734371185302734 + }, + { + "epoch": 1.4292070397468855, + "step": 14455, + "train/sim_loss": 0.06277263164520264 + }, + { + "epoch": 1.4292070397468855, + "step": 14455, + "train/total_loss": 0.13011634349822998 + }, + { + "entropy": 9.755207061767578, + "epoch": 1.429305912596401, + "mean_token_accuracy": 0.8385416865348816, + "num_tokens": 30782312.0, + "step": 14456, + "train/ce_loss": 0.5981870889663696 + }, + { + "epoch": 1.429305912596401, + "step": 14456, + "train/sim_loss": 0.03267109394073486 + }, + { + "epoch": 1.429305912596401, + "step": 14456, + "train/total_loss": 0.0924898087978363 + }, + { + "entropy": 9.414392471313477, + "epoch": 1.4294047854459166, + "mean_token_accuracy": 0.9035714268684387, + "num_tokens": 30793715.0, + "step": 14457, + "train/ce_loss": 0.3759884834289551 + }, + { + "epoch": 1.4294047854459166, + "step": 14457, + "train/sim_loss": 0.041129887104034424 + }, + { + "epoch": 1.4294047854459166, + "step": 14457, + "train/total_loss": 0.07872873544692993 + }, + { + "entropy": 9.329514503479004, + "epoch": 1.4295036582954321, + "mean_token_accuracy": 0.8314606547355652, + "num_tokens": 30810183.0, + "step": 14458, + "train/ce_loss": 0.6128535866737366 + }, + { + "epoch": 1.4295036582954321, + "step": 14458, + "train/sim_loss": 0.04892301559448242 + }, + { + "epoch": 1.4295036582954321, + "step": 14458, + "train/total_loss": 0.11020837724208832 + }, + { + "entropy": 9.374374389648438, + "epoch": 1.4296025311449476, + "mean_token_accuracy": 0.8745432496070862, + "num_tokens": 30826599.0, + "step": 14459, + "train/ce_loss": 0.35764527320861816 + }, + { + "epoch": 1.4296025311449476, + "step": 14459, + "train/sim_loss": 0.08627402782440186 + }, + { + "epoch": 1.4296025311449476, + "step": 14459, + "train/total_loss": 0.12203855812549591 + }, + { + "epoch": 1.429701403994463, + "grad_norm": 0.5279479622840881, + "learning_rate": 6.42758245561984e-06, + "loss": 0.0934, + "step": 14460 + }, + { + "entropy": 9.61376953125, + "epoch": 1.429701403994463, + "mean_token_accuracy": 0.8064516186714172, + "num_tokens": 30837845.0, + "step": 14460, + "train/ce_loss": 0.7269161343574524 + }, + { + "epoch": 1.429701403994463, + "step": 14460, + "train/sim_loss": 0.040659427642822266 + }, + { + "epoch": 1.429701403994463, + "step": 14460, + "train/total_loss": 0.11335103958845139 + }, + { + "entropy": 9.291421890258789, + "epoch": 1.4298002768439786, + "mean_token_accuracy": 0.8070403933525085, + "num_tokens": 30845324.0, + "step": 14461, + "train/ce_loss": 0.25732091069221497 + }, + { + "epoch": 1.4298002768439786, + "step": 14461, + "train/sim_loss": 0.02882969379425049 + }, + { + "epoch": 1.4298002768439786, + "step": 14461, + "train/total_loss": 0.054561786353588104 + }, + { + "entropy": 8.98956298828125, + "epoch": 1.4298991496934943, + "mean_token_accuracy": 0.9200000166893005, + "num_tokens": 30856568.0, + "step": 14462, + "train/ce_loss": 0.23918871581554413 + }, + { + "epoch": 1.4298991496934943, + "step": 14462, + "train/sim_loss": 0.04784059524536133 + }, + { + "epoch": 1.4298991496934943, + "step": 14462, + "train/total_loss": 0.07175946980714798 + }, + { + "entropy": 10.1258544921875, + "epoch": 1.4299980225430096, + "mean_token_accuracy": 0.8500000238418579, + "num_tokens": 30870128.0, + "step": 14463, + "train/ce_loss": 0.6359955668449402 + }, + { + "epoch": 1.4299980225430096, + "step": 14463, + "train/sim_loss": 0.07697051763534546 + }, + { + "epoch": 1.4299980225430096, + "step": 14463, + "train/total_loss": 0.14057007431983948 + }, + { + "entropy": 9.679704666137695, + "epoch": 1.4300968953925253, + "mean_token_accuracy": 0.8164893388748169, + "num_tokens": 30883018.0, + "step": 14464, + "train/ce_loss": 0.47077974677085876 + }, + { + "epoch": 1.4300968953925253, + "step": 14464, + "train/sim_loss": 0.034128785133361816 + }, + { + "epoch": 1.4300968953925253, + "step": 14464, + "train/total_loss": 0.08120676130056381 + }, + { + "entropy": 9.490532875061035, + "epoch": 1.4301957682420408, + "mean_token_accuracy": 0.872305154800415, + "num_tokens": 30895974.0, + "step": 14465, + "train/ce_loss": 0.4577838182449341 + }, + { + "epoch": 1.4301957682420408, + "step": 14465, + "train/sim_loss": 0.011107146739959717 + }, + { + "epoch": 1.4301957682420408, + "step": 14465, + "train/total_loss": 0.056885529309511185 + }, + { + "entropy": 9.610279083251953, + "epoch": 1.4302946410915562, + "mean_token_accuracy": 0.8878248929977417, + "num_tokens": 30911132.0, + "step": 14466, + "train/ce_loss": 0.10618585348129272 + }, + { + "epoch": 1.4302946410915562, + "step": 14466, + "train/sim_loss": 0.06839489936828613 + }, + { + "epoch": 1.4302946410915562, + "step": 14466, + "train/total_loss": 0.07901348173618317 + }, + { + "entropy": 9.296867370605469, + "epoch": 1.4303935139410717, + "mean_token_accuracy": 0.9414557218551636, + "num_tokens": 30923330.0, + "step": 14467, + "train/ce_loss": 1.0366044307374978e-06 + }, + { + "epoch": 1.4303935139410717, + "step": 14467, + "train/sim_loss": 0.04096972942352295 + }, + { + "epoch": 1.4303935139410717, + "step": 14467, + "train/total_loss": 0.040969833731651306 + }, + { + "entropy": 9.896856307983398, + "epoch": 1.4304923867905872, + "mean_token_accuracy": 0.9029734134674072, + "num_tokens": 30939231.0, + "step": 14468, + "train/ce_loss": 0.15331360697746277 + }, + { + "epoch": 1.4304923867905872, + "step": 14468, + "train/sim_loss": 0.05850207805633545 + }, + { + "epoch": 1.4304923867905872, + "step": 14468, + "train/total_loss": 0.07383343577384949 + }, + { + "entropy": 9.632708549499512, + "epoch": 1.430591259640103, + "mean_token_accuracy": 0.864696741104126, + "num_tokens": 30954857.0, + "step": 14469, + "train/ce_loss": 0.8047281503677368 + }, + { + "epoch": 1.430591259640103, + "step": 14469, + "train/sim_loss": 0.050359487533569336 + }, + { + "epoch": 1.430591259640103, + "step": 14469, + "train/total_loss": 0.13083231449127197 + }, + { + "entropy": 8.813482284545898, + "epoch": 1.4306901324896184, + "mean_token_accuracy": 0.8157181739807129, + "num_tokens": 30964897.0, + "step": 14470, + "train/ce_loss": 0.8321239948272705 + }, + { + "epoch": 1.4306901324896184, + "step": 14470, + "train/sim_loss": 0.04356789588928223 + }, + { + "epoch": 1.4306901324896184, + "step": 14470, + "train/total_loss": 0.12678030133247375 + }, + { + "entropy": 9.579706192016602, + "epoch": 1.430789005339134, + "mean_token_accuracy": 0.8492706418037415, + "num_tokens": 30976751.0, + "step": 14471, + "train/ce_loss": 0.5255671143531799 + }, + { + "epoch": 1.430789005339134, + "step": 14471, + "train/sim_loss": 0.08000236749649048 + }, + { + "epoch": 1.430789005339134, + "step": 14471, + "train/total_loss": 0.13255907595157623 + }, + { + "entropy": 9.22259521484375, + "epoch": 1.4308878781886494, + "mean_token_accuracy": 0.8443670272827148, + "num_tokens": 30989510.0, + "step": 14472, + "train/ce_loss": 0.3576308786869049 + }, + { + "epoch": 1.4308878781886494, + "step": 14472, + "train/sim_loss": 0.026382148265838623 + }, + { + "epoch": 1.4308878781886494, + "step": 14472, + "train/total_loss": 0.06214523687958717 + }, + { + "entropy": 9.592528343200684, + "epoch": 1.4309867510381649, + "mean_token_accuracy": 0.8765778541564941, + "num_tokens": 30998348.0, + "step": 14473, + "train/ce_loss": 0.4452955424785614 + }, + { + "epoch": 1.4309867510381649, + "step": 14473, + "train/sim_loss": 0.10347938537597656 + }, + { + "epoch": 1.4309867510381649, + "step": 14473, + "train/total_loss": 0.14800894260406494 + }, + { + "entropy": 9.348803520202637, + "epoch": 1.4310856238876806, + "mean_token_accuracy": 0.86094069480896, + "num_tokens": 31008084.0, + "step": 14474, + "train/ce_loss": 0.3314208984375 + }, + { + "epoch": 1.4310856238876806, + "step": 14474, + "train/sim_loss": 0.07767045497894287 + }, + { + "epoch": 1.4310856238876806, + "step": 14474, + "train/total_loss": 0.11081254482269287 + }, + { + "entropy": 8.982358932495117, + "epoch": 1.4311844967371958, + "mean_token_accuracy": 0.8460526466369629, + "num_tokens": 31023184.0, + "step": 14475, + "train/ce_loss": 0.3235640823841095 + }, + { + "epoch": 1.4311844967371958, + "step": 14475, + "train/sim_loss": 0.056880176067352295 + }, + { + "epoch": 1.4311844967371958, + "step": 14475, + "train/total_loss": 0.08923658728599548 + }, + { + "entropy": 9.115060806274414, + "epoch": 1.4312833695867115, + "mean_token_accuracy": 0.8614270687103271, + "num_tokens": 31033454.0, + "step": 14476, + "train/ce_loss": 0.5868799686431885 + }, + { + "epoch": 1.4312833695867115, + "step": 14476, + "train/sim_loss": 0.05965083837509155 + }, + { + "epoch": 1.4312833695867115, + "step": 14476, + "train/total_loss": 0.11833883821964264 + }, + { + "entropy": 9.511176109313965, + "epoch": 1.431382242436227, + "mean_token_accuracy": 0.8738532066345215, + "num_tokens": 31057586.0, + "step": 14477, + "train/ce_loss": 0.3838875889778137 + }, + { + "epoch": 1.431382242436227, + "step": 14477, + "train/sim_loss": 0.028448760509490967 + }, + { + "epoch": 1.431382242436227, + "step": 14477, + "train/total_loss": 0.06683751940727234 + }, + { + "entropy": 9.038564682006836, + "epoch": 1.4314811152857425, + "mean_token_accuracy": 0.8342665433883667, + "num_tokens": 31065933.0, + "step": 14478, + "train/ce_loss": 0.38061264157295227 + }, + { + "epoch": 1.4314811152857425, + "step": 14478, + "train/sim_loss": 0.01629805564880371 + }, + { + "epoch": 1.4314811152857425, + "step": 14478, + "train/total_loss": 0.054359320551157 + }, + { + "entropy": 8.913496971130371, + "epoch": 1.431579988135258, + "mean_token_accuracy": 0.800000011920929, + "num_tokens": 31074545.0, + "step": 14479, + "train/ce_loss": 0.9071431756019592 + }, + { + "epoch": 1.431579988135258, + "step": 14479, + "train/sim_loss": 0.11893725395202637 + }, + { + "epoch": 1.431579988135258, + "step": 14479, + "train/total_loss": 0.20965157449245453 + }, + { + "epoch": 1.4316788609847735, + "grad_norm": 0.5521076321601868, + "learning_rate": 6.422637590861891e-06, + "loss": 0.086, + "step": 14480 + }, + { + "entropy": 9.484870910644531, + "epoch": 1.4316788609847735, + "mean_token_accuracy": 0.856217622756958, + "num_tokens": 31091857.0, + "step": 14480, + "train/ce_loss": 0.20165415108203888 + }, + { + "epoch": 1.4316788609847735, + "step": 14480, + "train/sim_loss": 0.11042559146881104 + }, + { + "epoch": 1.4316788609847735, + "step": 14480, + "train/total_loss": 0.1305910050868988 + }, + { + "entropy": 9.356924057006836, + "epoch": 1.4317777338342892, + "mean_token_accuracy": 0.8803301453590393, + "num_tokens": 31107051.0, + "step": 14481, + "train/ce_loss": 0.3349626064300537 + }, + { + "epoch": 1.4317777338342892, + "step": 14481, + "train/sim_loss": 0.05207890272140503 + }, + { + "epoch": 1.4317777338342892, + "step": 14481, + "train/total_loss": 0.0855751633644104 + }, + { + "entropy": 9.470451354980469, + "epoch": 1.4318766066838047, + "mean_token_accuracy": 0.831038773059845, + "num_tokens": 31121504.0, + "step": 14482, + "train/ce_loss": 0.15686319768428802 + }, + { + "epoch": 1.4318766066838047, + "step": 14482, + "train/sim_loss": 0.020801186561584473 + }, + { + "epoch": 1.4318766066838047, + "step": 14482, + "train/total_loss": 0.036487504839897156 + }, + { + "entropy": 9.181278228759766, + "epoch": 1.4319754795333202, + "mean_token_accuracy": 0.8607142567634583, + "num_tokens": 31135006.0, + "step": 14483, + "train/ce_loss": 0.5335291624069214 + }, + { + "epoch": 1.4319754795333202, + "step": 14483, + "train/sim_loss": 0.027201533317565918 + }, + { + "epoch": 1.4319754795333202, + "step": 14483, + "train/total_loss": 0.08055445551872253 + }, + { + "entropy": 8.593378067016602, + "epoch": 1.4320743523828356, + "mean_token_accuracy": 0.8422648906707764, + "num_tokens": 31141243.0, + "step": 14484, + "train/ce_loss": 0.4888329803943634 + }, + { + "epoch": 1.4320743523828356, + "step": 14484, + "train/sim_loss": 0.011580348014831543 + }, + { + "epoch": 1.4320743523828356, + "step": 14484, + "train/total_loss": 0.06046364828944206 + }, + { + "entropy": 9.573951721191406, + "epoch": 1.4321732252323511, + "mean_token_accuracy": 0.842024564743042, + "num_tokens": 31158411.0, + "step": 14485, + "train/ce_loss": 0.5918315052986145 + }, + { + "epoch": 1.4321732252323511, + "step": 14485, + "train/sim_loss": 0.04435837268829346 + }, + { + "epoch": 1.4321732252323511, + "step": 14485, + "train/total_loss": 0.10354152321815491 + }, + { + "entropy": 8.762996673583984, + "epoch": 1.4322720980818668, + "mean_token_accuracy": 0.8480154871940613, + "num_tokens": 31170109.0, + "step": 14486, + "train/ce_loss": 0.2109425812959671 + }, + { + "epoch": 1.4322720980818668, + "step": 14486, + "train/sim_loss": 0.0155562162399292 + }, + { + "epoch": 1.4322720980818668, + "step": 14486, + "train/total_loss": 0.03665047511458397 + }, + { + "entropy": 9.353555679321289, + "epoch": 1.432370970931382, + "mean_token_accuracy": 0.8552631735801697, + "num_tokens": 31187119.0, + "step": 14487, + "train/ce_loss": 0.45285025238990784 + }, + { + "epoch": 1.432370970931382, + "step": 14487, + "train/sim_loss": 0.06455397605895996 + }, + { + "epoch": 1.432370970931382, + "step": 14487, + "train/total_loss": 0.10983900725841522 + }, + { + "entropy": 9.435758590698242, + "epoch": 1.4324698437808978, + "mean_token_accuracy": 0.8808029890060425, + "num_tokens": 31198160.0, + "step": 14488, + "train/ce_loss": 3.009355964422866e-07 + }, + { + "epoch": 1.4324698437808978, + "step": 14488, + "train/sim_loss": 0.019615471363067627 + }, + { + "epoch": 1.4324698437808978, + "step": 14488, + "train/total_loss": 0.019615501165390015 + }, + { + "entropy": 9.140937805175781, + "epoch": 1.4325687166304133, + "mean_token_accuracy": 0.889072835445404, + "num_tokens": 31208373.0, + "step": 14489, + "train/ce_loss": 0.9090163111686707 + }, + { + "epoch": 1.4325687166304133, + "step": 14489, + "train/sim_loss": 0.03781998157501221 + }, + { + "epoch": 1.4325687166304133, + "step": 14489, + "train/total_loss": 0.12872162461280823 + }, + { + "entropy": 9.397751808166504, + "epoch": 1.4326675894799288, + "mean_token_accuracy": 0.8134034276008606, + "num_tokens": 31221390.0, + "step": 14490, + "train/ce_loss": 0.6122830510139465 + }, + { + "epoch": 1.4326675894799288, + "step": 14490, + "train/sim_loss": 0.037400126457214355 + }, + { + "epoch": 1.4326675894799288, + "step": 14490, + "train/total_loss": 0.09862843155860901 + }, + { + "entropy": 9.381446838378906, + "epoch": 1.4327664623294443, + "mean_token_accuracy": 0.8447432518005371, + "num_tokens": 31235486.0, + "step": 14491, + "train/ce_loss": 0.5965215563774109 + }, + { + "epoch": 1.4327664623294443, + "step": 14491, + "train/sim_loss": 0.01802980899810791 + }, + { + "epoch": 1.4327664623294443, + "step": 14491, + "train/total_loss": 0.07768196612596512 + }, + { + "entropy": 9.100128173828125, + "epoch": 1.4328653351789598, + "mean_token_accuracy": 0.8491879105567932, + "num_tokens": 31245064.0, + "step": 14492, + "train/ce_loss": 0.38840726017951965 + }, + { + "epoch": 1.4328653351789598, + "step": 14492, + "train/sim_loss": 0.011306166648864746 + }, + { + "epoch": 1.4328653351789598, + "step": 14492, + "train/total_loss": 0.05014689266681671 + }, + { + "entropy": 9.67130184173584, + "epoch": 1.4329642080284755, + "mean_token_accuracy": 0.8397626280784607, + "num_tokens": 31257895.0, + "step": 14493, + "train/ce_loss": 0.33163538575172424 + }, + { + "epoch": 1.4329642080284755, + "step": 14493, + "train/sim_loss": 0.02047443389892578 + }, + { + "epoch": 1.4329642080284755, + "step": 14493, + "train/total_loss": 0.053637973964214325 + }, + { + "entropy": 9.540574073791504, + "epoch": 1.433063080877991, + "mean_token_accuracy": 0.948164165019989, + "num_tokens": 31265636.0, + "step": 14494, + "train/ce_loss": 0.428330659866333 + }, + { + "epoch": 1.433063080877991, + "step": 14494, + "train/sim_loss": 0.019646048545837402 + }, + { + "epoch": 1.433063080877991, + "step": 14494, + "train/total_loss": 0.06247911602258682 + }, + { + "entropy": 8.867687225341797, + "epoch": 1.4331619537275064, + "mean_token_accuracy": 0.8249452710151672, + "num_tokens": 31272835.0, + "step": 14495, + "train/ce_loss": 0.580823540687561 + }, + { + "epoch": 1.4331619537275064, + "step": 14495, + "train/sim_loss": 0.010447800159454346 + }, + { + "epoch": 1.4331619537275064, + "step": 14495, + "train/total_loss": 0.06853015720844269 + }, + { + "entropy": 9.611468315124512, + "epoch": 1.433260826577022, + "mean_token_accuracy": 0.8296703100204468, + "num_tokens": 31282183.0, + "step": 14496, + "train/ce_loss": 0.6703208684921265 + }, + { + "epoch": 1.433260826577022, + "step": 14496, + "train/sim_loss": 0.02459007501602173 + }, + { + "epoch": 1.433260826577022, + "step": 14496, + "train/total_loss": 0.09162216633558273 + }, + { + "entropy": 9.379871368408203, + "epoch": 1.4333596994265374, + "mean_token_accuracy": 0.8864468932151794, + "num_tokens": 31297927.0, + "step": 14497, + "train/ce_loss": 0.19343136250972748 + }, + { + "epoch": 1.4333596994265374, + "step": 14497, + "train/sim_loss": 0.018134355545043945 + }, + { + "epoch": 1.4333596994265374, + "step": 14497, + "train/total_loss": 0.03747749328613281 + }, + { + "entropy": 9.099498748779297, + "epoch": 1.4334585722760531, + "mean_token_accuracy": 0.8761062026023865, + "num_tokens": 31311118.0, + "step": 14498, + "train/ce_loss": 0.5475701093673706 + }, + { + "epoch": 1.4334585722760531, + "step": 14498, + "train/sim_loss": 0.013517916202545166 + }, + { + "epoch": 1.4334585722760531, + "step": 14498, + "train/total_loss": 0.06827493011951447 + }, + { + "entropy": 9.030326843261719, + "epoch": 1.4335574451255686, + "mean_token_accuracy": 0.8569604158401489, + "num_tokens": 31317783.0, + "step": 14499, + "train/ce_loss": 0.23271122574806213 + }, + { + "epoch": 1.4335574451255686, + "step": 14499, + "train/sim_loss": 0.014013528823852539 + }, + { + "epoch": 1.4335574451255686, + "step": 14499, + "train/total_loss": 0.03728464990854263 + }, + { + "epoch": 1.433656317975084, + "grad_norm": 0.5132755637168884, + "learning_rate": 6.417692726103942e-06, + "loss": 0.081, + "step": 14500 + }, + { + "entropy": 9.596534729003906, + "epoch": 1.433656317975084, + "mean_token_accuracy": 0.8662092685699463, + "num_tokens": 31326392.0, + "step": 14500, + "train/ce_loss": 1.3764210962108336e-06 + }, + { + "epoch": 1.433656317975084, + "step": 14500, + "train/sim_loss": 0.014190912246704102 + }, + { + "epoch": 1.433656317975084, + "step": 14500, + "train/total_loss": 0.014191050082445145 + }, + { + "entropy": 9.751749038696289, + "epoch": 1.4337551908245996, + "mean_token_accuracy": 0.8527777791023254, + "num_tokens": 31337339.0, + "step": 14501, + "train/ce_loss": 0.4420563876628876 + }, + { + "epoch": 1.4337551908245996, + "step": 14501, + "train/sim_loss": 0.01079857349395752 + }, + { + "epoch": 1.4337551908245996, + "step": 14501, + "train/total_loss": 0.05500421300530434 + }, + { + "entropy": 9.267425537109375, + "epoch": 1.433854063674115, + "mean_token_accuracy": 0.8645319938659668, + "num_tokens": 31349426.0, + "step": 14502, + "train/ce_loss": 0.13996729254722595 + }, + { + "epoch": 1.433854063674115, + "step": 14502, + "train/sim_loss": 0.02875959873199463 + }, + { + "epoch": 1.433854063674115, + "step": 14502, + "train/total_loss": 0.042756326496601105 + }, + { + "entropy": 9.011636734008789, + "epoch": 1.4339529365236305, + "mean_token_accuracy": 0.8393077850341797, + "num_tokens": 31361422.0, + "step": 14503, + "train/ce_loss": 0.4755282700061798 + }, + { + "epoch": 1.4339529365236305, + "step": 14503, + "train/sim_loss": 0.0520397424697876 + }, + { + "epoch": 1.4339529365236305, + "step": 14503, + "train/total_loss": 0.09959256649017334 + }, + { + "entropy": 9.271200180053711, + "epoch": 1.434051809373146, + "mean_token_accuracy": 0.8571428656578064, + "num_tokens": 31372372.0, + "step": 14504, + "train/ce_loss": 0.5062962174415588 + }, + { + "epoch": 1.434051809373146, + "step": 14504, + "train/sim_loss": 0.01759016513824463 + }, + { + "epoch": 1.434051809373146, + "step": 14504, + "train/total_loss": 0.06821978837251663 + }, + { + "entropy": 9.941670417785645, + "epoch": 1.4341506822226617, + "mean_token_accuracy": 0.9168207049369812, + "num_tokens": 31382081.0, + "step": 14505, + "train/ce_loss": 0.21754829585552216 + }, + { + "epoch": 1.4341506822226617, + "step": 14505, + "train/sim_loss": 0.04299354553222656 + }, + { + "epoch": 1.4341506822226617, + "step": 14505, + "train/total_loss": 0.0647483766078949 + }, + { + "entropy": 9.627309799194336, + "epoch": 1.4342495550721772, + "mean_token_accuracy": 0.8939617276191711, + "num_tokens": 31396141.0, + "step": 14506, + "train/ce_loss": 0.165215864777565 + }, + { + "epoch": 1.4342495550721772, + "step": 14506, + "train/sim_loss": 0.024099409580230713 + }, + { + "epoch": 1.4342495550721772, + "step": 14506, + "train/total_loss": 0.04062099754810333 + }, + { + "entropy": 9.462331771850586, + "epoch": 1.4343484279216927, + "mean_token_accuracy": 0.828157365322113, + "num_tokens": 31412072.0, + "step": 14507, + "train/ce_loss": 0.42909935116767883 + }, + { + "epoch": 1.4343484279216927, + "step": 14507, + "train/sim_loss": 0.026348412036895752 + }, + { + "epoch": 1.4343484279216927, + "step": 14507, + "train/total_loss": 0.06925834715366364 + }, + { + "entropy": 9.219844818115234, + "epoch": 1.4344473007712082, + "mean_token_accuracy": 0.8778467774391174, + "num_tokens": 31425516.0, + "step": 14508, + "train/ce_loss": 0.21967074275016785 + }, + { + "epoch": 1.4344473007712082, + "step": 14508, + "train/sim_loss": 0.055510640144348145 + }, + { + "epoch": 1.4344473007712082, + "step": 14508, + "train/total_loss": 0.07747771590948105 + }, + { + "entropy": 9.092109680175781, + "epoch": 1.4345461736207237, + "mean_token_accuracy": 0.7872860431671143, + "num_tokens": 31437855.0, + "step": 14509, + "train/ce_loss": 1.1365203857421875 + }, + { + "epoch": 1.4345461736207237, + "step": 14509, + "train/sim_loss": 0.047549307346343994 + }, + { + "epoch": 1.4345461736207237, + "step": 14509, + "train/total_loss": 0.1612013578414917 + }, + { + "entropy": 9.806546211242676, + "epoch": 1.4346450464702394, + "mean_token_accuracy": 0.8846675753593445, + "num_tokens": 31459629.0, + "step": 14510, + "train/ce_loss": 0.8258240222930908 + }, + { + "epoch": 1.4346450464702394, + "step": 14510, + "train/sim_loss": 0.10486459732055664 + }, + { + "epoch": 1.4346450464702394, + "step": 14510, + "train/total_loss": 0.18744701147079468 + }, + { + "entropy": 10.14162826538086, + "epoch": 1.4347439193197549, + "mean_token_accuracy": 0.8376753330230713, + "num_tokens": 31474849.0, + "step": 14511, + "train/ce_loss": 1.2374081279631355e-06 + }, + { + "epoch": 1.4347439193197549, + "step": 14511, + "train/sim_loss": 0.013864994049072266 + }, + { + "epoch": 1.4347439193197549, + "step": 14511, + "train/total_loss": 0.01386511791497469 + }, + { + "entropy": 9.167587280273438, + "epoch": 1.4348427921692704, + "mean_token_accuracy": 0.8305489420890808, + "num_tokens": 31487916.0, + "step": 14512, + "train/ce_loss": 0.6652074456214905 + }, + { + "epoch": 1.4348427921692704, + "step": 14512, + "train/sim_loss": 0.019019722938537598 + }, + { + "epoch": 1.4348427921692704, + "step": 14512, + "train/total_loss": 0.08554046601057053 + }, + { + "entropy": 9.468704223632812, + "epoch": 1.4349416650187858, + "mean_token_accuracy": 0.8952702879905701, + "num_tokens": 31497901.0, + "step": 14513, + "train/ce_loss": 0.41985708475112915 + }, + { + "epoch": 1.4349416650187858, + "step": 14513, + "train/sim_loss": 0.05893212556838989 + }, + { + "epoch": 1.4349416650187858, + "step": 14513, + "train/total_loss": 0.10091783106327057 + }, + { + "entropy": 9.796292304992676, + "epoch": 1.4350405378683013, + "mean_token_accuracy": 0.9621211886405945, + "num_tokens": 31506514.0, + "step": 14514, + "train/ce_loss": 0.385287344455719 + }, + { + "epoch": 1.4350405378683013, + "step": 14514, + "train/sim_loss": 0.015472412109375 + }, + { + "epoch": 1.4350405378683013, + "step": 14514, + "train/total_loss": 0.05400114879012108 + }, + { + "entropy": 8.712931632995605, + "epoch": 1.4351394107178168, + "mean_token_accuracy": 0.8132586479187012, + "num_tokens": 31519107.0, + "step": 14515, + "train/ce_loss": 0.48485690355300903 + }, + { + "epoch": 1.4351394107178168, + "step": 14515, + "train/sim_loss": 0.03872966766357422 + }, + { + "epoch": 1.4351394107178168, + "step": 14515, + "train/total_loss": 0.0872153639793396 + }, + { + "entropy": 9.436750411987305, + "epoch": 1.4352382835673323, + "mean_token_accuracy": 0.8464730381965637, + "num_tokens": 31526531.0, + "step": 14516, + "train/ce_loss": 0.8468260765075684 + }, + { + "epoch": 1.4352382835673323, + "step": 14516, + "train/sim_loss": 0.08190381526947021 + }, + { + "epoch": 1.4352382835673323, + "step": 14516, + "train/total_loss": 0.16658642888069153 + }, + { + "entropy": 9.922510147094727, + "epoch": 1.435337156416848, + "mean_token_accuracy": 0.8728070259094238, + "num_tokens": 31537625.0, + "step": 14517, + "train/ce_loss": 0.5100148320198059 + }, + { + "epoch": 1.435337156416848, + "step": 14517, + "train/sim_loss": 0.033829689025878906 + }, + { + "epoch": 1.435337156416848, + "step": 14517, + "train/total_loss": 0.08483117818832397 + }, + { + "entropy": 9.994237899780273, + "epoch": 1.4354360292663635, + "mean_token_accuracy": 0.875, + "num_tokens": 31550131.0, + "step": 14518, + "train/ce_loss": 0.23404556512832642 + }, + { + "epoch": 1.4354360292663635, + "step": 14518, + "train/sim_loss": 0.019170641899108887 + }, + { + "epoch": 1.4354360292663635, + "step": 14518, + "train/total_loss": 0.04257519915699959 + }, + { + "entropy": 9.345804214477539, + "epoch": 1.435534902115879, + "mean_token_accuracy": 0.8727810382843018, + "num_tokens": 31566580.0, + "step": 14519, + "train/ce_loss": 9.755021892488003e-07 + }, + { + "epoch": 1.435534902115879, + "step": 14519, + "train/sim_loss": 0.02645742893218994 + }, + { + "epoch": 1.435534902115879, + "step": 14519, + "train/total_loss": 0.0264575257897377 + }, + { + "epoch": 1.4356337749653945, + "grad_norm": 0.6296176910400391, + "learning_rate": 6.412747861345993e-06, + "loss": 0.0817, + "step": 14520 + }, + { + "entropy": 9.025440216064453, + "epoch": 1.4356337749653945, + "mean_token_accuracy": 0.8122317790985107, + "num_tokens": 31580929.0, + "step": 14520, + "train/ce_loss": 0.33150872588157654 + }, + { + "epoch": 1.4356337749653945, + "step": 14520, + "train/sim_loss": 0.03155237436294556 + }, + { + "epoch": 1.4356337749653945, + "step": 14520, + "train/total_loss": 0.06470324844121933 + }, + { + "entropy": 9.04975414276123, + "epoch": 1.43573264781491, + "mean_token_accuracy": 0.834611177444458, + "num_tokens": 31593963.0, + "step": 14521, + "train/ce_loss": 0.32146787643432617 + }, + { + "epoch": 1.43573264781491, + "step": 14521, + "train/sim_loss": 0.016504287719726562 + }, + { + "epoch": 1.43573264781491, + "step": 14521, + "train/total_loss": 0.0486510768532753 + }, + { + "entropy": 9.756893157958984, + "epoch": 1.4358315206644257, + "mean_token_accuracy": 0.8635513782501221, + "num_tokens": 31603227.0, + "step": 14522, + "train/ce_loss": 0.6994713544845581 + }, + { + "epoch": 1.4358315206644257, + "step": 14522, + "train/sim_loss": 0.07296395301818848 + }, + { + "epoch": 1.4358315206644257, + "step": 14522, + "train/total_loss": 0.14291109144687653 + }, + { + "entropy": 9.566566467285156, + "epoch": 1.4359303935139411, + "mean_token_accuracy": 0.8538681864738464, + "num_tokens": 31612709.0, + "step": 14523, + "train/ce_loss": 0.5190490484237671 + }, + { + "epoch": 1.4359303935139411, + "step": 14523, + "train/sim_loss": 0.05494409799575806 + }, + { + "epoch": 1.4359303935139411, + "step": 14523, + "train/total_loss": 0.10684899985790253 + }, + { + "entropy": 9.131004333496094, + "epoch": 1.4360292663634566, + "mean_token_accuracy": 0.878892719745636, + "num_tokens": 31623228.0, + "step": 14524, + "train/ce_loss": 0.08942928165197372 + }, + { + "epoch": 1.4360292663634566, + "step": 14524, + "train/sim_loss": 0.02530825138092041 + }, + { + "epoch": 1.4360292663634566, + "step": 14524, + "train/total_loss": 0.03425117954611778 + }, + { + "entropy": 9.99777603149414, + "epoch": 1.436128139212972, + "mean_token_accuracy": 0.8569079041481018, + "num_tokens": 31633198.0, + "step": 14525, + "train/ce_loss": 0.47967395186424255 + }, + { + "epoch": 1.436128139212972, + "step": 14525, + "train/sim_loss": 0.03348219394683838 + }, + { + "epoch": 1.436128139212972, + "step": 14525, + "train/total_loss": 0.08144959062337875 + }, + { + "entropy": 9.871187210083008, + "epoch": 1.4362270120624876, + "mean_token_accuracy": 0.8991869688034058, + "num_tokens": 31646042.0, + "step": 14526, + "train/ce_loss": 0.6612950563430786 + }, + { + "epoch": 1.4362270120624876, + "step": 14526, + "train/sim_loss": 0.02265143394470215 + }, + { + "epoch": 1.4362270120624876, + "step": 14526, + "train/total_loss": 0.08878093957901001 + }, + { + "entropy": 9.030468940734863, + "epoch": 1.436325884912003, + "mean_token_accuracy": 0.8248081803321838, + "num_tokens": 31656528.0, + "step": 14527, + "train/ce_loss": 0.4467707872390747 + }, + { + "epoch": 1.436325884912003, + "step": 14527, + "train/sim_loss": 0.04730713367462158 + }, + { + "epoch": 1.436325884912003, + "step": 14527, + "train/total_loss": 0.09198421239852905 + }, + { + "entropy": 9.35554313659668, + "epoch": 1.4364247577615186, + "mean_token_accuracy": 0.8291770815849304, + "num_tokens": 31665669.0, + "step": 14528, + "train/ce_loss": 0.6106131076812744 + }, + { + "epoch": 1.4364247577615186, + "step": 14528, + "train/sim_loss": 0.010378360748291016 + }, + { + "epoch": 1.4364247577615186, + "step": 14528, + "train/total_loss": 0.07143966853618622 + }, + { + "entropy": 9.783172607421875, + "epoch": 1.4365236306110343, + "mean_token_accuracy": 0.8701517581939697, + "num_tokens": 31680165.0, + "step": 14529, + "train/ce_loss": 0.6248469352722168 + }, + { + "epoch": 1.4365236306110343, + "step": 14529, + "train/sim_loss": 0.035601019859313965 + }, + { + "epoch": 1.4365236306110343, + "step": 14529, + "train/total_loss": 0.09808571636676788 + }, + { + "entropy": 9.31309700012207, + "epoch": 1.4366225034605498, + "mean_token_accuracy": 0.8624078631401062, + "num_tokens": 31693685.0, + "step": 14530, + "train/ce_loss": 0.46627891063690186 + }, + { + "epoch": 1.4366225034605498, + "step": 14530, + "train/sim_loss": 0.02857232093811035 + }, + { + "epoch": 1.4366225034605498, + "step": 14530, + "train/total_loss": 0.07520021498203278 + }, + { + "entropy": 9.323880195617676, + "epoch": 1.4367213763100652, + "mean_token_accuracy": 0.8501872420310974, + "num_tokens": 31706025.0, + "step": 14531, + "train/ce_loss": 0.4275275766849518 + }, + { + "epoch": 1.4367213763100652, + "step": 14531, + "train/sim_loss": 0.07645821571350098 + }, + { + "epoch": 1.4367213763100652, + "step": 14531, + "train/total_loss": 0.11921097338199615 + }, + { + "entropy": 9.642963409423828, + "epoch": 1.4368202491595807, + "mean_token_accuracy": 0.7945544719696045, + "num_tokens": 31718408.0, + "step": 14532, + "train/ce_loss": 0.5971882343292236 + }, + { + "epoch": 1.4368202491595807, + "step": 14532, + "train/sim_loss": 0.05772411823272705 + }, + { + "epoch": 1.4368202491595807, + "step": 14532, + "train/total_loss": 0.11744294315576553 + }, + { + "entropy": 9.673288345336914, + "epoch": 1.4369191220090962, + "mean_token_accuracy": 0.871191143989563, + "num_tokens": 31734412.0, + "step": 14533, + "train/ce_loss": 0.2243497371673584 + }, + { + "epoch": 1.4369191220090962, + "step": 14533, + "train/sim_loss": 0.03693091869354248 + }, + { + "epoch": 1.4369191220090962, + "step": 14533, + "train/total_loss": 0.0593658909201622 + }, + { + "entropy": 9.532695770263672, + "epoch": 1.437017994858612, + "mean_token_accuracy": 0.896774172782898, + "num_tokens": 31746754.0, + "step": 14534, + "train/ce_loss": 0.43821263313293457 + }, + { + "epoch": 1.437017994858612, + "step": 14534, + "train/sim_loss": 0.03361093997955322 + }, + { + "epoch": 1.437017994858612, + "step": 14534, + "train/total_loss": 0.07743220031261444 + }, + { + "entropy": 9.748039245605469, + "epoch": 1.4371168677081274, + "mean_token_accuracy": 0.9038461446762085, + "num_tokens": 31758900.0, + "step": 14535, + "train/ce_loss": 0.5591680407524109 + }, + { + "epoch": 1.4371168677081274, + "step": 14535, + "train/sim_loss": 0.04663437604904175 + }, + { + "epoch": 1.4371168677081274, + "step": 14535, + "train/total_loss": 0.1025511771440506 + }, + { + "entropy": 9.77360725402832, + "epoch": 1.437215740557643, + "mean_token_accuracy": 0.8538283109664917, + "num_tokens": 31771576.0, + "step": 14536, + "train/ce_loss": 1.2108479738235474 + }, + { + "epoch": 1.437215740557643, + "step": 14536, + "train/sim_loss": 0.05395698547363281 + }, + { + "epoch": 1.437215740557643, + "step": 14536, + "train/total_loss": 0.1750417947769165 + }, + { + "entropy": 9.890512466430664, + "epoch": 1.4373146134071584, + "mean_token_accuracy": 0.8613037467002869, + "num_tokens": 31783888.0, + "step": 14537, + "train/ce_loss": 0.5585634708404541 + }, + { + "epoch": 1.4373146134071584, + "step": 14537, + "train/sim_loss": 0.07194244861602783 + }, + { + "epoch": 1.4373146134071584, + "step": 14537, + "train/total_loss": 0.12779879570007324 + }, + { + "entropy": 9.27400016784668, + "epoch": 1.4374134862566739, + "mean_token_accuracy": 0.8626760840415955, + "num_tokens": 31796467.0, + "step": 14538, + "train/ce_loss": 0.36350446939468384 + }, + { + "epoch": 1.4374134862566739, + "step": 14538, + "train/sim_loss": 0.04624462127685547 + }, + { + "epoch": 1.4374134862566739, + "step": 14538, + "train/total_loss": 0.08259506523609161 + }, + { + "entropy": 9.367195129394531, + "epoch": 1.4375123591061896, + "mean_token_accuracy": 0.8602150678634644, + "num_tokens": 31814556.0, + "step": 14539, + "train/ce_loss": 8.228971069002e-07 + }, + { + "epoch": 1.4375123591061896, + "step": 14539, + "train/sim_loss": 0.03634834289550781 + }, + { + "epoch": 1.4375123591061896, + "step": 14539, + "train/total_loss": 0.03634842485189438 + }, + { + "epoch": 1.4376112319557048, + "grad_norm": 0.650425136089325, + "learning_rate": 6.407802996588044e-06, + "loss": 0.0828, + "step": 14540 + }, + { + "entropy": 9.439560890197754, + "epoch": 1.4376112319557048, + "mean_token_accuracy": 0.8227990865707397, + "num_tokens": 31827702.0, + "step": 14540, + "train/ce_loss": 0.32747799158096313 + }, + { + "epoch": 1.4376112319557048, + "step": 14540, + "train/sim_loss": 0.03334909677505493 + }, + { + "epoch": 1.4376112319557048, + "step": 14540, + "train/total_loss": 0.06609690189361572 + }, + { + "entropy": 9.432538986206055, + "epoch": 1.4377101048052205, + "mean_token_accuracy": 0.843870997428894, + "num_tokens": 31843630.0, + "step": 14541, + "train/ce_loss": 0.36320969462394714 + }, + { + "epoch": 1.4377101048052205, + "step": 14541, + "train/sim_loss": 0.017304420471191406 + }, + { + "epoch": 1.4377101048052205, + "step": 14541, + "train/total_loss": 0.05362538993358612 + }, + { + "entropy": 9.671825408935547, + "epoch": 1.437808977654736, + "mean_token_accuracy": 0.8680738806724548, + "num_tokens": 31855818.0, + "step": 14542, + "train/ce_loss": 1.1637592933766427e-06 + }, + { + "epoch": 1.437808977654736, + "step": 14542, + "train/sim_loss": 0.022519350051879883 + }, + { + "epoch": 1.437808977654736, + "step": 14542, + "train/total_loss": 0.022519465535879135 + }, + { + "entropy": 9.381658554077148, + "epoch": 1.4379078505042515, + "mean_token_accuracy": 0.8695651888847351, + "num_tokens": 31869953.0, + "step": 14543, + "train/ce_loss": 0.5932686924934387 + }, + { + "epoch": 1.4379078505042515, + "step": 14543, + "train/sim_loss": 0.06038689613342285 + }, + { + "epoch": 1.4379078505042515, + "step": 14543, + "train/total_loss": 0.11971376836299896 + }, + { + "entropy": 10.111763000488281, + "epoch": 1.438006723353767, + "mean_token_accuracy": 0.90234375, + "num_tokens": 31882625.0, + "step": 14544, + "train/ce_loss": 3.8590583244513255e-06 + }, + { + "epoch": 1.438006723353767, + "step": 14544, + "train/sim_loss": 0.03304719924926758 + }, + { + "epoch": 1.438006723353767, + "step": 14544, + "train/total_loss": 0.03304758667945862 + }, + { + "entropy": 9.490362167358398, + "epoch": 1.4381055962032825, + "mean_token_accuracy": 0.9111424684524536, + "num_tokens": 31900244.0, + "step": 14545, + "train/ce_loss": 0.19432170689105988 + }, + { + "epoch": 1.4381055962032825, + "step": 14545, + "train/sim_loss": 0.060891926288604736 + }, + { + "epoch": 1.4381055962032825, + "step": 14545, + "train/total_loss": 0.08032409846782684 + }, + { + "entropy": 9.780714988708496, + "epoch": 1.4382044690527982, + "mean_token_accuracy": 0.8594436049461365, + "num_tokens": 31912398.0, + "step": 14546, + "train/ce_loss": 0.2164606899023056 + }, + { + "epoch": 1.4382044690527982, + "step": 14546, + "train/sim_loss": 0.06740987300872803 + }, + { + "epoch": 1.4382044690527982, + "step": 14546, + "train/total_loss": 0.08905594050884247 + }, + { + "entropy": 9.29792594909668, + "epoch": 1.4383033419023137, + "mean_token_accuracy": 0.8228498101234436, + "num_tokens": 31927639.0, + "step": 14547, + "train/ce_loss": 0.39969971776008606 + }, + { + "epoch": 1.4383033419023137, + "step": 14547, + "train/sim_loss": 0.07313060760498047 + }, + { + "epoch": 1.4383033419023137, + "step": 14547, + "train/total_loss": 0.1131005808711052 + }, + { + "entropy": 9.200510025024414, + "epoch": 1.4384022147518292, + "mean_token_accuracy": 0.8692810535430908, + "num_tokens": 31936641.0, + "step": 14548, + "train/ce_loss": 0.4141820967197418 + }, + { + "epoch": 1.4384022147518292, + "step": 14548, + "train/sim_loss": 0.033393144607543945 + }, + { + "epoch": 1.4384022147518292, + "step": 14548, + "train/total_loss": 0.07481135427951813 + }, + { + "entropy": 9.333635330200195, + "epoch": 1.4385010876013447, + "mean_token_accuracy": 0.8349236845970154, + "num_tokens": 31948563.0, + "step": 14549, + "train/ce_loss": 0.5411738753318787 + }, + { + "epoch": 1.4385010876013447, + "step": 14549, + "train/sim_loss": 0.035181403160095215 + }, + { + "epoch": 1.4385010876013447, + "step": 14549, + "train/total_loss": 0.0892987921833992 + }, + { + "entropy": 9.77088451385498, + "epoch": 1.4385999604508601, + "mean_token_accuracy": 0.8255033493041992, + "num_tokens": 31962906.0, + "step": 14550, + "train/ce_loss": 0.7482711672782898 + }, + { + "epoch": 1.4385999604508601, + "step": 14550, + "train/sim_loss": 0.01453554630279541 + }, + { + "epoch": 1.4385999604508601, + "step": 14550, + "train/total_loss": 0.08936266601085663 + }, + { + "entropy": 9.117462158203125, + "epoch": 1.4386988333003758, + "mean_token_accuracy": 0.819803774356842, + "num_tokens": 31974550.0, + "step": 14551, + "train/ce_loss": 0.5066497325897217 + }, + { + "epoch": 1.4386988333003758, + "step": 14551, + "train/sim_loss": 0.051952362060546875 + }, + { + "epoch": 1.4386988333003758, + "step": 14551, + "train/total_loss": 0.10261733829975128 + }, + { + "entropy": 9.618139266967773, + "epoch": 1.438797706149891, + "mean_token_accuracy": 0.9250353574752808, + "num_tokens": 31985915.0, + "step": 14552, + "train/ce_loss": 0.23770613968372345 + }, + { + "epoch": 1.438797706149891, + "step": 14552, + "train/sim_loss": 0.05286967754364014 + }, + { + "epoch": 1.438797706149891, + "step": 14552, + "train/total_loss": 0.0766402930021286 + }, + { + "entropy": 9.603761672973633, + "epoch": 1.4388965789994068, + "mean_token_accuracy": 0.8344459533691406, + "num_tokens": 31996553.0, + "step": 14553, + "train/ce_loss": 0.36182984709739685 + }, + { + "epoch": 1.4388965789994068, + "step": 14553, + "train/sim_loss": 0.057588815689086914 + }, + { + "epoch": 1.4388965789994068, + "step": 14553, + "train/total_loss": 0.0937718003988266 + }, + { + "entropy": 9.695276260375977, + "epoch": 1.4389954518489223, + "mean_token_accuracy": 0.848537027835846, + "num_tokens": 32009035.0, + "step": 14554, + "train/ce_loss": 0.4781777858734131 + }, + { + "epoch": 1.4389954518489223, + "step": 14554, + "train/sim_loss": 0.020065009593963623 + }, + { + "epoch": 1.4389954518489223, + "step": 14554, + "train/total_loss": 0.06788279116153717 + }, + { + "entropy": 9.714441299438477, + "epoch": 1.4390943246984378, + "mean_token_accuracy": 0.8836773037910461, + "num_tokens": 32018066.0, + "step": 14555, + "train/ce_loss": 1.2211023658892373e-06 + }, + { + "epoch": 1.4390943246984378, + "step": 14555, + "train/sim_loss": 0.048263728618621826 + }, + { + "epoch": 1.4390943246984378, + "step": 14555, + "train/total_loss": 0.048263851553201675 + }, + { + "entropy": 9.691261291503906, + "epoch": 1.4391931975479533, + "mean_token_accuracy": 0.8414795398712158, + "num_tokens": 32031670.0, + "step": 14556, + "train/ce_loss": 1.1524450778961182 + }, + { + "epoch": 1.4391931975479533, + "step": 14556, + "train/sim_loss": 0.07381242513656616 + }, + { + "epoch": 1.4391931975479533, + "step": 14556, + "train/total_loss": 0.18905693292617798 + }, + { + "entropy": 9.262045860290527, + "epoch": 1.4392920703974688, + "mean_token_accuracy": 0.8232104182243347, + "num_tokens": 32047003.0, + "step": 14557, + "train/ce_loss": 0.9177016019821167 + }, + { + "epoch": 1.4392920703974688, + "step": 14557, + "train/sim_loss": 0.04603874683380127 + }, + { + "epoch": 1.4392920703974688, + "step": 14557, + "train/total_loss": 0.1378089189529419 + }, + { + "entropy": 9.703557014465332, + "epoch": 1.4393909432469845, + "mean_token_accuracy": 0.8793363571166992, + "num_tokens": 32059696.0, + "step": 14558, + "train/ce_loss": 0.17117640376091003 + }, + { + "epoch": 1.4393909432469845, + "step": 14558, + "train/sim_loss": 0.0452038049697876 + }, + { + "epoch": 1.4393909432469845, + "step": 14558, + "train/total_loss": 0.06232144683599472 + }, + { + "entropy": 9.60752010345459, + "epoch": 1.4394898160965, + "mean_token_accuracy": 0.8398692607879639, + "num_tokens": 32071967.0, + "step": 14559, + "train/ce_loss": 0.4751426875591278 + }, + { + "epoch": 1.4394898160965, + "step": 14559, + "train/sim_loss": 0.06232893466949463 + }, + { + "epoch": 1.4394898160965, + "step": 14559, + "train/total_loss": 0.10984320938587189 + }, + { + "epoch": 1.4395886889460154, + "grad_norm": 0.6867581009864807, + "learning_rate": 6.402858131830096e-06, + "loss": 0.085, + "step": 14560 + }, + { + "entropy": 9.825187683105469, + "epoch": 1.4395886889460154, + "mean_token_accuracy": 0.8705673813819885, + "num_tokens": 32084630.0, + "step": 14560, + "train/ce_loss": 0.5101678967475891 + }, + { + "epoch": 1.4395886889460154, + "step": 14560, + "train/sim_loss": 0.02532881498336792 + }, + { + "epoch": 1.4395886889460154, + "step": 14560, + "train/total_loss": 0.07634560763835907 + }, + { + "entropy": 9.704113006591797, + "epoch": 1.439687561795531, + "mean_token_accuracy": 0.8956043720245361, + "num_tokens": 32096552.0, + "step": 14561, + "train/ce_loss": 0.5411919355392456 + }, + { + "epoch": 1.439687561795531, + "step": 14561, + "train/sim_loss": 0.048085927963256836 + }, + { + "epoch": 1.439687561795531, + "step": 14561, + "train/total_loss": 0.10220512747764587 + }, + { + "entropy": 9.500476837158203, + "epoch": 1.4397864346450464, + "mean_token_accuracy": 0.8754717111587524, + "num_tokens": 32115278.0, + "step": 14562, + "train/ce_loss": 7.872108653828036e-07 + }, + { + "epoch": 1.4397864346450464, + "step": 14562, + "train/sim_loss": 0.029781579971313477 + }, + { + "epoch": 1.4397864346450464, + "step": 14562, + "train/total_loss": 0.029781658202409744 + }, + { + "entropy": 9.248747825622559, + "epoch": 1.4398853074945621, + "mean_token_accuracy": 0.886583685874939, + "num_tokens": 32127852.0, + "step": 14563, + "train/ce_loss": 2.305419684489607e-06 + }, + { + "epoch": 1.4398853074945621, + "step": 14563, + "train/sim_loss": 0.0330280065536499 + }, + { + "epoch": 1.4398853074945621, + "step": 14563, + "train/total_loss": 0.03302823752164841 + }, + { + "entropy": 9.706594467163086, + "epoch": 1.4399841803440774, + "mean_token_accuracy": 0.8371466994285583, + "num_tokens": 32141760.0, + "step": 14564, + "train/ce_loss": 0.5827215909957886 + }, + { + "epoch": 1.4399841803440774, + "step": 14564, + "train/sim_loss": 0.05089527368545532 + }, + { + "epoch": 1.4399841803440774, + "step": 14564, + "train/total_loss": 0.1091674342751503 + }, + { + "entropy": 9.102794647216797, + "epoch": 1.440083053193593, + "mean_token_accuracy": 0.8429448008537292, + "num_tokens": 32152954.0, + "step": 14565, + "train/ce_loss": 0.8022871613502502 + }, + { + "epoch": 1.440083053193593, + "step": 14565, + "train/sim_loss": 0.07293152809143066 + }, + { + "epoch": 1.440083053193593, + "step": 14565, + "train/total_loss": 0.1531602442264557 + }, + { + "entropy": 10.098072052001953, + "epoch": 1.4401819260431086, + "mean_token_accuracy": 0.8588807582855225, + "num_tokens": 32165750.0, + "step": 14566, + "train/ce_loss": 0.6860750913619995 + }, + { + "epoch": 1.4401819260431086, + "step": 14566, + "train/sim_loss": 0.036412060260772705 + }, + { + "epoch": 1.4401819260431086, + "step": 14566, + "train/total_loss": 0.10501956939697266 + }, + { + "entropy": 9.9885892868042, + "epoch": 1.440280798892624, + "mean_token_accuracy": 0.8462733030319214, + "num_tokens": 32180740.0, + "step": 14567, + "train/ce_loss": 0.6810916662216187 + }, + { + "epoch": 1.440280798892624, + "step": 14567, + "train/sim_loss": 0.016806960105895996 + }, + { + "epoch": 1.440280798892624, + "step": 14567, + "train/total_loss": 0.0849161297082901 + }, + { + "entropy": 10.062567710876465, + "epoch": 1.4403796717421395, + "mean_token_accuracy": 0.8690807819366455, + "num_tokens": 32196685.0, + "step": 14568, + "train/ce_loss": 2.3933305328682764e-06 + }, + { + "epoch": 1.4403796717421395, + "step": 14568, + "train/sim_loss": 0.03876030445098877 + }, + { + "epoch": 1.4403796717421395, + "step": 14568, + "train/total_loss": 0.03876054286956787 + }, + { + "entropy": 9.498468399047852, + "epoch": 1.440478544591655, + "mean_token_accuracy": 0.8363636136054993, + "num_tokens": 32214457.0, + "step": 14569, + "train/ce_loss": 0.5265893936157227 + }, + { + "epoch": 1.440478544591655, + "step": 14569, + "train/sim_loss": 0.02179431915283203 + }, + { + "epoch": 1.440478544591655, + "step": 14569, + "train/total_loss": 0.07445326447486877 + }, + { + "entropy": 9.53511905670166, + "epoch": 1.4405774174411707, + "mean_token_accuracy": 0.824911892414093, + "num_tokens": 32229759.0, + "step": 14570, + "train/ce_loss": 0.48262473940849304 + }, + { + "epoch": 1.4405774174411707, + "step": 14570, + "train/sim_loss": 0.040753960609436035 + }, + { + "epoch": 1.4405774174411707, + "step": 14570, + "train/total_loss": 0.08901643753051758 + }, + { + "entropy": 10.234804153442383, + "epoch": 1.4406762902906862, + "mean_token_accuracy": 0.8194174766540527, + "num_tokens": 32240333.0, + "step": 14571, + "train/ce_loss": 0.642479658126831 + }, + { + "epoch": 1.4406762902906862, + "step": 14571, + "train/sim_loss": 0.06918132305145264 + }, + { + "epoch": 1.4406762902906862, + "step": 14571, + "train/total_loss": 0.13342928886413574 + }, + { + "entropy": 9.398170471191406, + "epoch": 1.4407751631402017, + "mean_token_accuracy": 0.8983957171440125, + "num_tokens": 32256899.0, + "step": 14572, + "train/ce_loss": 0.2316751927137375 + }, + { + "epoch": 1.4407751631402017, + "step": 14572, + "train/sim_loss": 0.007129549980163574 + }, + { + "epoch": 1.4407751631402017, + "step": 14572, + "train/total_loss": 0.030297068879008293 + }, + { + "entropy": 9.147394180297852, + "epoch": 1.4408740359897172, + "mean_token_accuracy": 0.8309692740440369, + "num_tokens": 32264861.0, + "step": 14573, + "train/ce_loss": 0.3653711676597595 + }, + { + "epoch": 1.4408740359897172, + "step": 14573, + "train/sim_loss": 0.06859052181243896 + }, + { + "epoch": 1.4408740359897172, + "step": 14573, + "train/total_loss": 0.10512764006853104 + }, + { + "entropy": 10.205806732177734, + "epoch": 1.4409729088392327, + "mean_token_accuracy": 0.8503937125205994, + "num_tokens": 32274720.0, + "step": 14574, + "train/ce_loss": 0.7179292440414429 + }, + { + "epoch": 1.4409729088392327, + "step": 14574, + "train/sim_loss": 0.043123602867126465 + }, + { + "epoch": 1.4409729088392327, + "step": 14574, + "train/total_loss": 0.11491652578115463 + }, + { + "entropy": 9.61725902557373, + "epoch": 1.4410717816887484, + "mean_token_accuracy": 0.9029462933540344, + "num_tokens": 32285094.0, + "step": 14575, + "train/ce_loss": 0.20817101001739502 + }, + { + "epoch": 1.4410717816887484, + "step": 14575, + "train/sim_loss": 0.031282782554626465 + }, + { + "epoch": 1.4410717816887484, + "step": 14575, + "train/total_loss": 0.05209988355636597 + }, + { + "entropy": 9.509458541870117, + "epoch": 1.4411706545382639, + "mean_token_accuracy": 0.828976035118103, + "num_tokens": 32297239.0, + "step": 14576, + "train/ce_loss": 0.25572580099105835 + }, + { + "epoch": 1.4411706545382639, + "step": 14576, + "train/sim_loss": 0.041590988636016846 + }, + { + "epoch": 1.4411706545382639, + "step": 14576, + "train/total_loss": 0.06716357171535492 + }, + { + "entropy": 9.286993980407715, + "epoch": 1.4412695273877794, + "mean_token_accuracy": 0.8458864688873291, + "num_tokens": 32315213.0, + "step": 14577, + "train/ce_loss": 0.6904225945472717 + }, + { + "epoch": 1.4412695273877794, + "step": 14577, + "train/sim_loss": 0.03836590051651001 + }, + { + "epoch": 1.4412695273877794, + "step": 14577, + "train/total_loss": 0.10740815848112106 + }, + { + "entropy": 9.465087890625, + "epoch": 1.4413684002372948, + "mean_token_accuracy": 0.7931034564971924, + "num_tokens": 32325210.0, + "step": 14578, + "train/ce_loss": 0.35128486156463623 + }, + { + "epoch": 1.4413684002372948, + "step": 14578, + "train/sim_loss": 0.038799405097961426 + }, + { + "epoch": 1.4413684002372948, + "step": 14578, + "train/total_loss": 0.07392789423465729 + }, + { + "entropy": 9.98463249206543, + "epoch": 1.4414672730868103, + "mean_token_accuracy": 0.913690447807312, + "num_tokens": 32335042.0, + "step": 14579, + "train/ce_loss": 0.751327633857727 + }, + { + "epoch": 1.4414672730868103, + "step": 14579, + "train/sim_loss": 0.07266867160797119 + }, + { + "epoch": 1.4414672730868103, + "step": 14579, + "train/total_loss": 0.14780142903327942 + }, + { + "epoch": 1.4415661459363258, + "grad_norm": 0.6401826739311218, + "learning_rate": 6.397913267072145e-06, + "loss": 0.0862, + "step": 14580 + }, + { + "entropy": 10.183969497680664, + "epoch": 1.4415661459363258, + "mean_token_accuracy": 0.9292035102844238, + "num_tokens": 32350281.0, + "step": 14580, + "train/ce_loss": 3.588678009691648e-05 + }, + { + "epoch": 1.4415661459363258, + "step": 14580, + "train/sim_loss": 0.030649185180664062 + }, + { + "epoch": 1.4415661459363258, + "step": 14580, + "train/total_loss": 0.03065277449786663 + }, + { + "entropy": 9.484038352966309, + "epoch": 1.4416650187858413, + "mean_token_accuracy": 0.8502024412155151, + "num_tokens": 32368288.0, + "step": 14581, + "train/ce_loss": 0.29920706152915955 + }, + { + "epoch": 1.4416650187858413, + "step": 14581, + "train/sim_loss": 0.018263816833496094 + }, + { + "epoch": 1.4416650187858413, + "step": 14581, + "train/total_loss": 0.04818452149629593 + }, + { + "entropy": 9.376938819885254, + "epoch": 1.441763891635357, + "mean_token_accuracy": 0.8371298313140869, + "num_tokens": 32384057.0, + "step": 14582, + "train/ce_loss": 0.562480628490448 + }, + { + "epoch": 1.441763891635357, + "step": 14582, + "train/sim_loss": 0.010660290718078613 + }, + { + "epoch": 1.441763891635357, + "step": 14582, + "train/total_loss": 0.06690835952758789 + }, + { + "entropy": 9.643786430358887, + "epoch": 1.4418627644848725, + "mean_token_accuracy": 0.8399471044540405, + "num_tokens": 32399414.0, + "step": 14583, + "train/ce_loss": 0.6493986248970032 + }, + { + "epoch": 1.4418627644848725, + "step": 14583, + "train/sim_loss": 0.02970302104949951 + }, + { + "epoch": 1.4418627644848725, + "step": 14583, + "train/total_loss": 0.09464288502931595 + }, + { + "entropy": 9.894110679626465, + "epoch": 1.441961637334388, + "mean_token_accuracy": 0.8764302134513855, + "num_tokens": 32413424.0, + "step": 14584, + "train/ce_loss": 1.8363169829171966e-06 + }, + { + "epoch": 1.441961637334388, + "step": 14584, + "train/sim_loss": 0.047727346420288086 + }, + { + "epoch": 1.441961637334388, + "step": 14584, + "train/total_loss": 0.04772752895951271 + }, + { + "entropy": 9.390972137451172, + "epoch": 1.4420605101839035, + "mean_token_accuracy": 0.8545647263526917, + "num_tokens": 32422737.0, + "step": 14585, + "train/ce_loss": 0.3436822295188904 + }, + { + "epoch": 1.4420605101839035, + "step": 14585, + "train/sim_loss": 0.01588308811187744 + }, + { + "epoch": 1.4420605101839035, + "step": 14585, + "train/total_loss": 0.0502513125538826 + }, + { + "entropy": 9.689887046813965, + "epoch": 1.442159383033419, + "mean_token_accuracy": 0.8639896512031555, + "num_tokens": 32442628.0, + "step": 14586, + "train/ce_loss": 0.22668766975402832 + }, + { + "epoch": 1.442159383033419, + "step": 14586, + "train/sim_loss": 0.022662580013275146 + }, + { + "epoch": 1.442159383033419, + "step": 14586, + "train/total_loss": 0.04533134773373604 + }, + { + "entropy": 8.912571907043457, + "epoch": 1.4422582558829347, + "mean_token_accuracy": 0.8298109173774719, + "num_tokens": 32451940.0, + "step": 14587, + "train/ce_loss": 0.3959619402885437 + }, + { + "epoch": 1.4422582558829347, + "step": 14587, + "train/sim_loss": 0.01810920238494873 + }, + { + "epoch": 1.4422582558829347, + "step": 14587, + "train/total_loss": 0.05770539864897728 + }, + { + "entropy": 9.454691886901855, + "epoch": 1.4423571287324501, + "mean_token_accuracy": 0.8207547068595886, + "num_tokens": 32461458.0, + "step": 14588, + "train/ce_loss": 1.1298061508568935e-06 + }, + { + "epoch": 1.4423571287324501, + "step": 14588, + "train/sim_loss": 0.04568898677825928 + }, + { + "epoch": 1.4423571287324501, + "step": 14588, + "train/total_loss": 0.04568909853696823 + }, + { + "entropy": 9.606610298156738, + "epoch": 1.4424560015819656, + "mean_token_accuracy": 0.9001883268356323, + "num_tokens": 32469796.0, + "step": 14589, + "train/ce_loss": 0.18033063411712646 + }, + { + "epoch": 1.4424560015819656, + "step": 14589, + "train/sim_loss": 0.07308542728424072 + }, + { + "epoch": 1.4424560015819656, + "step": 14589, + "train/total_loss": 0.09111849218606949 + }, + { + "entropy": 9.29546070098877, + "epoch": 1.4425548744314811, + "mean_token_accuracy": 0.8428927659988403, + "num_tokens": 32479269.0, + "step": 14590, + "train/ce_loss": 0.3528243899345398 + }, + { + "epoch": 1.4425548744314811, + "step": 14590, + "train/sim_loss": 0.029341161251068115 + }, + { + "epoch": 1.4425548744314811, + "step": 14590, + "train/total_loss": 0.06462360173463821 + }, + { + "entropy": 9.497236251831055, + "epoch": 1.4426537472809966, + "mean_token_accuracy": 0.8489461541175842, + "num_tokens": 32491742.0, + "step": 14591, + "train/ce_loss": 0.373849481344223 + }, + { + "epoch": 1.4426537472809966, + "step": 14591, + "train/sim_loss": 0.06421154737472534 + }, + { + "epoch": 1.4426537472809966, + "step": 14591, + "train/total_loss": 0.10159649699926376 + }, + { + "entropy": 8.99254322052002, + "epoch": 1.442752620130512, + "mean_token_accuracy": 0.8454440832138062, + "num_tokens": 32499324.0, + "step": 14592, + "train/ce_loss": 0.4662312865257263 + }, + { + "epoch": 1.442752620130512, + "step": 14592, + "train/sim_loss": 0.032244324684143066 + }, + { + "epoch": 1.442752620130512, + "step": 14592, + "train/total_loss": 0.07886745035648346 + }, + { + "entropy": 9.647192001342773, + "epoch": 1.4428514929800276, + "mean_token_accuracy": 0.8787878751754761, + "num_tokens": 32512889.0, + "step": 14593, + "train/ce_loss": 0.22981026768684387 + }, + { + "epoch": 1.4428514929800276, + "step": 14593, + "train/sim_loss": 0.040574073791503906 + }, + { + "epoch": 1.4428514929800276, + "step": 14593, + "train/total_loss": 0.06355509907007217 + }, + { + "entropy": 9.512432098388672, + "epoch": 1.4429503658295433, + "mean_token_accuracy": 0.8315789699554443, + "num_tokens": 32524660.0, + "step": 14594, + "train/ce_loss": 0.46496307849884033 + }, + { + "epoch": 1.4429503658295433, + "step": 14594, + "train/sim_loss": 0.01437687873840332 + }, + { + "epoch": 1.4429503658295433, + "step": 14594, + "train/total_loss": 0.06087318807840347 + }, + { + "entropy": 9.771037101745605, + "epoch": 1.4430492386790588, + "mean_token_accuracy": 0.8537735939025879, + "num_tokens": 32541137.0, + "step": 14595, + "train/ce_loss": 0.5768418312072754 + }, + { + "epoch": 1.4430492386790588, + "step": 14595, + "train/sim_loss": 0.03282582759857178 + }, + { + "epoch": 1.4430492386790588, + "step": 14595, + "train/total_loss": 0.09051001071929932 + }, + { + "entropy": 9.462038040161133, + "epoch": 1.4431481115285743, + "mean_token_accuracy": 0.8535414338111877, + "num_tokens": 32559048.0, + "step": 14596, + "train/ce_loss": 0.4631037712097168 + }, + { + "epoch": 1.4431481115285743, + "step": 14596, + "train/sim_loss": 0.01657271385192871 + }, + { + "epoch": 1.4431481115285743, + "step": 14596, + "train/total_loss": 0.06288309395313263 + }, + { + "entropy": 9.578369140625, + "epoch": 1.4432469843780897, + "mean_token_accuracy": 0.8516886830329895, + "num_tokens": 32571358.0, + "step": 14597, + "train/ce_loss": 0.7566284537315369 + }, + { + "epoch": 1.4432469843780897, + "step": 14597, + "train/sim_loss": 0.07852888107299805 + }, + { + "epoch": 1.4432469843780897, + "step": 14597, + "train/total_loss": 0.1541917324066162 + }, + { + "entropy": 9.354190826416016, + "epoch": 1.4433458572276052, + "mean_token_accuracy": 0.8853904008865356, + "num_tokens": 32580911.0, + "step": 14598, + "train/ce_loss": 0.384911447763443 + }, + { + "epoch": 1.4433458572276052, + "step": 14598, + "train/sim_loss": 0.034540534019470215 + }, + { + "epoch": 1.4433458572276052, + "step": 14598, + "train/total_loss": 0.07303167879581451 + }, + { + "entropy": 9.202412605285645, + "epoch": 1.443444730077121, + "mean_token_accuracy": 0.8260340690612793, + "num_tokens": 32593781.0, + "step": 14599, + "train/ce_loss": 0.293381929397583 + }, + { + "epoch": 1.443444730077121, + "step": 14599, + "train/sim_loss": 0.0178225040435791 + }, + { + "epoch": 1.443444730077121, + "step": 14599, + "train/total_loss": 0.04716069996356964 + }, + { + "epoch": 1.4435436029266364, + "grad_norm": 0.636113166809082, + "learning_rate": 6.392968402314197e-06, + "loss": 0.0835, + "step": 14600 + }, + { + "entropy": 9.418548583984375, + "epoch": 1.4435436029266364, + "mean_token_accuracy": 0.835839569568634, + "num_tokens": 32607446.0, + "step": 14600, + "train/ce_loss": 0.27616527676582336 + }, + { + "epoch": 1.4435436029266364, + "step": 14600, + "train/sim_loss": 0.03625619411468506 + }, + { + "epoch": 1.4435436029266364, + "step": 14600, + "train/total_loss": 0.06387272477149963 + }, + { + "entropy": 9.361166000366211, + "epoch": 1.443642475776152, + "mean_token_accuracy": 0.8286270499229431, + "num_tokens": 32621987.0, + "step": 14601, + "train/ce_loss": 0.5543732643127441 + }, + { + "epoch": 1.443642475776152, + "step": 14601, + "train/sim_loss": 0.07146000862121582 + }, + { + "epoch": 1.443642475776152, + "step": 14601, + "train/total_loss": 0.12689733505249023 + }, + { + "entropy": 9.101633071899414, + "epoch": 1.4437413486256674, + "mean_token_accuracy": 0.8486841917037964, + "num_tokens": 32632181.0, + "step": 14602, + "train/ce_loss": 0.20580121874809265 + }, + { + "epoch": 1.4437413486256674, + "step": 14602, + "train/sim_loss": 0.04177898168563843 + }, + { + "epoch": 1.4437413486256674, + "step": 14602, + "train/total_loss": 0.062359102070331573 + }, + { + "entropy": 9.45675277709961, + "epoch": 1.4438402214751829, + "mean_token_accuracy": 0.866746723651886, + "num_tokens": 32643271.0, + "step": 14603, + "train/ce_loss": 0.11077118664979935 + }, + { + "epoch": 1.4438402214751829, + "step": 14603, + "train/sim_loss": 0.07202893495559692 + }, + { + "epoch": 1.4438402214751829, + "step": 14603, + "train/total_loss": 0.08310605585575104 + }, + { + "entropy": 9.608598709106445, + "epoch": 1.4439390943246984, + "mean_token_accuracy": 0.8492063283920288, + "num_tokens": 32653157.0, + "step": 14604, + "train/ce_loss": 0.8633255362510681 + }, + { + "epoch": 1.4439390943246984, + "step": 14604, + "train/sim_loss": 0.0713266134262085 + }, + { + "epoch": 1.4439390943246984, + "step": 14604, + "train/total_loss": 0.15765917301177979 + }, + { + "entropy": 9.251363754272461, + "epoch": 1.4440379671742138, + "mean_token_accuracy": 0.8343710899353027, + "num_tokens": 32664001.0, + "step": 14605, + "train/ce_loss": 0.6841039061546326 + }, + { + "epoch": 1.4440379671742138, + "step": 14605, + "train/sim_loss": 0.05610096454620361 + }, + { + "epoch": 1.4440379671742138, + "step": 14605, + "train/total_loss": 0.12451135367155075 + }, + { + "entropy": 9.479217529296875, + "epoch": 1.4441368400237296, + "mean_token_accuracy": 0.8409703373908997, + "num_tokens": 32677768.0, + "step": 14606, + "train/ce_loss": 0.3673322796821594 + }, + { + "epoch": 1.4441368400237296, + "step": 14606, + "train/sim_loss": 0.029169321060180664 + }, + { + "epoch": 1.4441368400237296, + "step": 14606, + "train/total_loss": 0.06590254604816437 + }, + { + "entropy": 10.029556274414062, + "epoch": 1.444235712873245, + "mean_token_accuracy": 0.9450980424880981, + "num_tokens": 32687471.0, + "step": 14607, + "train/ce_loss": 4.356927263415855e-07 + }, + { + "epoch": 1.444235712873245, + "step": 14607, + "train/sim_loss": 0.015250921249389648 + }, + { + "epoch": 1.444235712873245, + "step": 14607, + "train/total_loss": 0.015250965021550655 + }, + { + "entropy": 9.527566909790039, + "epoch": 1.4443345857227605, + "mean_token_accuracy": 0.8677685856819153, + "num_tokens": 32701426.0, + "step": 14608, + "train/ce_loss": 9.788519719222677e-07 + }, + { + "epoch": 1.4443345857227605, + "step": 14608, + "train/sim_loss": 0.04686450958251953 + }, + { + "epoch": 1.4443345857227605, + "step": 14608, + "train/total_loss": 0.04686460644006729 + }, + { + "entropy": 9.812463760375977, + "epoch": 1.444433458572276, + "mean_token_accuracy": 0.8645161390304565, + "num_tokens": 32721007.0, + "step": 14609, + "train/ce_loss": 6.634533633587125e-07 + }, + { + "epoch": 1.444433458572276, + "step": 14609, + "train/sim_loss": 0.015812277793884277 + }, + { + "epoch": 1.444433458572276, + "step": 14609, + "train/total_loss": 0.01581234484910965 + }, + { + "entropy": 9.56894302368164, + "epoch": 1.4445323314217915, + "mean_token_accuracy": 0.8528896570205688, + "num_tokens": 32730315.0, + "step": 14610, + "train/ce_loss": 0.6467200517654419 + }, + { + "epoch": 1.4445323314217915, + "step": 14610, + "train/sim_loss": 0.11551332473754883 + }, + { + "epoch": 1.4445323314217915, + "step": 14610, + "train/total_loss": 0.18018533289432526 + }, + { + "entropy": 9.580412864685059, + "epoch": 1.4446312042713072, + "mean_token_accuracy": 0.8341902494430542, + "num_tokens": 32747621.0, + "step": 14611, + "train/ce_loss": 0.6819410920143127 + }, + { + "epoch": 1.4446312042713072, + "step": 14611, + "train/sim_loss": 0.04654508829116821 + }, + { + "epoch": 1.4446312042713072, + "step": 14611, + "train/total_loss": 0.11473920196294785 + }, + { + "entropy": 9.6181640625, + "epoch": 1.4447300771208227, + "mean_token_accuracy": 0.8589211702346802, + "num_tokens": 32759181.0, + "step": 14612, + "train/ce_loss": 0.29238200187683105 + }, + { + "epoch": 1.4447300771208227, + "step": 14612, + "train/sim_loss": 0.050563156604766846 + }, + { + "epoch": 1.4447300771208227, + "step": 14612, + "train/total_loss": 0.07980135828256607 + }, + { + "entropy": 9.507830619812012, + "epoch": 1.4448289499703382, + "mean_token_accuracy": 0.8345588445663452, + "num_tokens": 32771203.0, + "step": 14613, + "train/ce_loss": 0.5635054707527161 + }, + { + "epoch": 1.4448289499703382, + "step": 14613, + "train/sim_loss": 0.06560254096984863 + }, + { + "epoch": 1.4448289499703382, + "step": 14613, + "train/total_loss": 0.121953085064888 + }, + { + "entropy": 9.20347785949707, + "epoch": 1.4449278228198537, + "mean_token_accuracy": 0.8847290873527527, + "num_tokens": 32781123.0, + "step": 14614, + "train/ce_loss": 0.5066969990730286 + }, + { + "epoch": 1.4449278228198537, + "step": 14614, + "train/sim_loss": 0.0667310357093811 + }, + { + "epoch": 1.4449278228198537, + "step": 14614, + "train/total_loss": 0.11740073561668396 + }, + { + "entropy": 9.627239227294922, + "epoch": 1.4450266956693691, + "mean_token_accuracy": 0.8205128312110901, + "num_tokens": 32797753.0, + "step": 14615, + "train/ce_loss": 0.48147669434547424 + }, + { + "epoch": 1.4450266956693691, + "step": 14615, + "train/sim_loss": 0.07549703121185303 + }, + { + "epoch": 1.4450266956693691, + "step": 14615, + "train/total_loss": 0.12364470213651657 + }, + { + "entropy": 9.207231521606445, + "epoch": 1.4451255685188849, + "mean_token_accuracy": 0.8328912258148193, + "num_tokens": 32812982.0, + "step": 14616, + "train/ce_loss": 0.5344613790512085 + }, + { + "epoch": 1.4451255685188849, + "step": 14616, + "train/sim_loss": 0.03652065992355347 + }, + { + "epoch": 1.4451255685188849, + "step": 14616, + "train/total_loss": 0.0899668037891388 + }, + { + "entropy": 9.277443885803223, + "epoch": 1.4452244413684001, + "mean_token_accuracy": 0.8381430506706238, + "num_tokens": 32825928.0, + "step": 14617, + "train/ce_loss": 0.3467673659324646 + }, + { + "epoch": 1.4452244413684001, + "step": 14617, + "train/sim_loss": 0.04496091604232788 + }, + { + "epoch": 1.4452244413684001, + "step": 14617, + "train/total_loss": 0.07963765412569046 + }, + { + "entropy": 9.61695384979248, + "epoch": 1.4453233142179158, + "mean_token_accuracy": 0.9020978808403015, + "num_tokens": 32837366.0, + "step": 14618, + "train/ce_loss": 0.1917971521615982 + }, + { + "epoch": 1.4453233142179158, + "step": 14618, + "train/sim_loss": 0.01802164316177368 + }, + { + "epoch": 1.4453233142179158, + "step": 14618, + "train/total_loss": 0.03720135986804962 + }, + { + "entropy": 9.5716552734375, + "epoch": 1.4454221870674313, + "mean_token_accuracy": 0.8547418713569641, + "num_tokens": 32852429.0, + "step": 14619, + "train/ce_loss": 0.7334805130958557 + }, + { + "epoch": 1.4454221870674313, + "step": 14619, + "train/sim_loss": 0.04417818784713745 + }, + { + "epoch": 1.4454221870674313, + "step": 14619, + "train/total_loss": 0.11752624064683914 + }, + { + "epoch": 1.4455210599169468, + "grad_norm": 0.5686050653457642, + "learning_rate": 6.388023537556248e-06, + "loss": 0.087, + "step": 14620 + }, + { + "entropy": 9.740022659301758, + "epoch": 1.4455210599169468, + "mean_token_accuracy": 0.8996478915214539, + "num_tokens": 32859245.0, + "step": 14620, + "train/ce_loss": 0.12590354681015015 + }, + { + "epoch": 1.4455210599169468, + "step": 14620, + "train/sim_loss": 0.05294299125671387 + }, + { + "epoch": 1.4455210599169468, + "step": 14620, + "train/total_loss": 0.065533347427845 + }, + { + "entropy": 9.444664001464844, + "epoch": 1.4456199327664623, + "mean_token_accuracy": 0.8571428656578064, + "num_tokens": 32872928.0, + "step": 14621, + "train/ce_loss": 0.46400994062423706 + }, + { + "epoch": 1.4456199327664623, + "step": 14621, + "train/sim_loss": 0.04616594314575195 + }, + { + "epoch": 1.4456199327664623, + "step": 14621, + "train/total_loss": 0.09256693720817566 + }, + { + "entropy": 8.935014724731445, + "epoch": 1.4457188056159778, + "mean_token_accuracy": 0.8360465168952942, + "num_tokens": 32883407.0, + "step": 14622, + "train/ce_loss": 0.47907376289367676 + }, + { + "epoch": 1.4457188056159778, + "step": 14622, + "train/sim_loss": 0.1390058994293213 + }, + { + "epoch": 1.4457188056159778, + "step": 14622, + "train/total_loss": 0.18691328167915344 + }, + { + "entropy": 9.820673942565918, + "epoch": 1.4458176784654935, + "mean_token_accuracy": 0.8536585569381714, + "num_tokens": 32894003.0, + "step": 14623, + "train/ce_loss": 0.8164084553718567 + }, + { + "epoch": 1.4458176784654935, + "step": 14623, + "train/sim_loss": 0.09269070625305176 + }, + { + "epoch": 1.4458176784654935, + "step": 14623, + "train/total_loss": 0.17433154582977295 + }, + { + "entropy": 9.24417781829834, + "epoch": 1.445916551315009, + "mean_token_accuracy": 0.8621745705604553, + "num_tokens": 32909763.0, + "step": 14624, + "train/ce_loss": 0.4984704852104187 + }, + { + "epoch": 1.445916551315009, + "step": 14624, + "train/sim_loss": 0.0809166431427002 + }, + { + "epoch": 1.445916551315009, + "step": 14624, + "train/total_loss": 0.1307636946439743 + }, + { + "entropy": 9.265401840209961, + "epoch": 1.4460154241645244, + "mean_token_accuracy": 0.8783120512962341, + "num_tokens": 32918535.0, + "step": 14625, + "train/ce_loss": 0.3482081890106201 + }, + { + "epoch": 1.4460154241645244, + "step": 14625, + "train/sim_loss": 0.04707658290863037 + }, + { + "epoch": 1.4460154241645244, + "step": 14625, + "train/total_loss": 0.08189740777015686 + }, + { + "entropy": 8.978026390075684, + "epoch": 1.44611429701404, + "mean_token_accuracy": 0.8751248717308044, + "num_tokens": 32929059.0, + "step": 14626, + "train/ce_loss": 0.4554237425327301 + }, + { + "epoch": 1.44611429701404, + "step": 14626, + "train/sim_loss": 0.10012942552566528 + }, + { + "epoch": 1.44611429701404, + "step": 14626, + "train/total_loss": 0.1456717997789383 + }, + { + "entropy": 9.456725120544434, + "epoch": 1.4462131698635554, + "mean_token_accuracy": 0.8661518692970276, + "num_tokens": 32941653.0, + "step": 14627, + "train/ce_loss": 0.140574112534523 + }, + { + "epoch": 1.4462131698635554, + "step": 14627, + "train/sim_loss": 0.07034015655517578 + }, + { + "epoch": 1.4462131698635554, + "step": 14627, + "train/total_loss": 0.0843975692987442 + }, + { + "entropy": 9.42393970489502, + "epoch": 1.4463120427130711, + "mean_token_accuracy": 0.8406015038490295, + "num_tokens": 32954445.0, + "step": 14628, + "train/ce_loss": 0.5720897912979126 + }, + { + "epoch": 1.4463120427130711, + "step": 14628, + "train/sim_loss": 0.034625470638275146 + }, + { + "epoch": 1.4463120427130711, + "step": 14628, + "train/total_loss": 0.09183445572853088 + }, + { + "entropy": 9.460916519165039, + "epoch": 1.4464109155625864, + "mean_token_accuracy": 0.8892921805381775, + "num_tokens": 32963049.0, + "step": 14629, + "train/ce_loss": 0.303007036447525 + }, + { + "epoch": 1.4464109155625864, + "step": 14629, + "train/sim_loss": 0.020079195499420166 + }, + { + "epoch": 1.4464109155625864, + "step": 14629, + "train/total_loss": 0.05037990212440491 + }, + { + "entropy": 9.18505573272705, + "epoch": 1.446509788412102, + "mean_token_accuracy": 0.8113402128219604, + "num_tokens": 32974071.0, + "step": 14630, + "train/ce_loss": 0.38704347610473633 + }, + { + "epoch": 1.446509788412102, + "step": 14630, + "train/sim_loss": 0.04341632127761841 + }, + { + "epoch": 1.446509788412102, + "step": 14630, + "train/total_loss": 0.08212067186832428 + }, + { + "entropy": 9.51382827758789, + "epoch": 1.4466086612616176, + "mean_token_accuracy": 0.8232405781745911, + "num_tokens": 32984464.0, + "step": 14631, + "train/ce_loss": 0.5752773284912109 + }, + { + "epoch": 1.4466086612616176, + "step": 14631, + "train/sim_loss": 0.07670462131500244 + }, + { + "epoch": 1.4466086612616176, + "step": 14631, + "train/total_loss": 0.13423235714435577 + }, + { + "entropy": 9.317789077758789, + "epoch": 1.446707534111133, + "mean_token_accuracy": 0.8004956841468811, + "num_tokens": 32996500.0, + "step": 14632, + "train/ce_loss": 0.8328883647918701 + }, + { + "epoch": 1.446707534111133, + "step": 14632, + "train/sim_loss": 0.07562661170959473 + }, + { + "epoch": 1.446707534111133, + "step": 14632, + "train/total_loss": 0.1589154601097107 + }, + { + "entropy": 9.355307579040527, + "epoch": 1.4468064069606485, + "mean_token_accuracy": 0.8736951947212219, + "num_tokens": 33008228.0, + "step": 14633, + "train/ce_loss": 0.5346320271492004 + }, + { + "epoch": 1.4468064069606485, + "step": 14633, + "train/sim_loss": 0.04765927791595459 + }, + { + "epoch": 1.4468064069606485, + "step": 14633, + "train/total_loss": 0.10112248361110687 + }, + { + "entropy": 9.40829849243164, + "epoch": 1.446905279810164, + "mean_token_accuracy": 0.8713080286979675, + "num_tokens": 33021346.0, + "step": 14634, + "train/ce_loss": 0.21640047430992126 + }, + { + "epoch": 1.446905279810164, + "step": 14634, + "train/sim_loss": 0.021801233291625977 + }, + { + "epoch": 1.446905279810164, + "step": 14634, + "train/total_loss": 0.0434412807226181 + }, + { + "entropy": 9.646099090576172, + "epoch": 1.4470041526596797, + "mean_token_accuracy": 0.8626943230628967, + "num_tokens": 33037238.0, + "step": 14635, + "train/ce_loss": 0.6354832053184509 + }, + { + "epoch": 1.4470041526596797, + "step": 14635, + "train/sim_loss": 0.06588518619537354 + }, + { + "epoch": 1.4470041526596797, + "step": 14635, + "train/total_loss": 0.1294335126876831 + }, + { + "entropy": 9.760445594787598, + "epoch": 1.4471030255091952, + "mean_token_accuracy": 0.8655692934989929, + "num_tokens": 33052399.0, + "step": 14636, + "train/ce_loss": 0.4866870045661926 + }, + { + "epoch": 1.4471030255091952, + "step": 14636, + "train/sim_loss": 0.0719602108001709 + }, + { + "epoch": 1.4471030255091952, + "step": 14636, + "train/total_loss": 0.12062890827655792 + }, + { + "entropy": 8.878039360046387, + "epoch": 1.4472018983587107, + "mean_token_accuracy": 0.861321747303009, + "num_tokens": 33059218.0, + "step": 14637, + "train/ce_loss": 0.551733672618866 + }, + { + "epoch": 1.4472018983587107, + "step": 14637, + "train/sim_loss": 0.017313718795776367 + }, + { + "epoch": 1.4472018983587107, + "step": 14637, + "train/total_loss": 0.07248708605766296 + }, + { + "entropy": 9.228010177612305, + "epoch": 1.4473007712082262, + "mean_token_accuracy": 0.8828715085983276, + "num_tokens": 33072686.0, + "step": 14638, + "train/ce_loss": 0.43191730976104736 + }, + { + "epoch": 1.4473007712082262, + "step": 14638, + "train/sim_loss": 0.014553248882293701 + }, + { + "epoch": 1.4473007712082262, + "step": 14638, + "train/total_loss": 0.05774497985839844 + }, + { + "entropy": 9.255489349365234, + "epoch": 1.4473996440577417, + "mean_token_accuracy": 0.8860543966293335, + "num_tokens": 33084338.0, + "step": 14639, + "train/ce_loss": 0.5404191613197327 + }, + { + "epoch": 1.4473996440577417, + "step": 14639, + "train/sim_loss": 0.029080569744110107 + }, + { + "epoch": 1.4473996440577417, + "step": 14639, + "train/total_loss": 0.08312249183654785 + }, + { + "epoch": 1.4474985169072574, + "grad_norm": 0.5194477438926697, + "learning_rate": 6.383078672798299e-06, + "loss": 0.0879, + "step": 14640 + }, + { + "entropy": 9.233247756958008, + "epoch": 1.4474985169072574, + "mean_token_accuracy": 0.8528995513916016, + "num_tokens": 33092860.0, + "step": 14640, + "train/ce_loss": 0.5107365846633911 + }, + { + "epoch": 1.4474985169072574, + "step": 14640, + "train/sim_loss": 0.014633417129516602 + }, + { + "epoch": 1.4474985169072574, + "step": 14640, + "train/total_loss": 0.06570707261562347 + }, + { + "entropy": 9.363058090209961, + "epoch": 1.4475973897567727, + "mean_token_accuracy": 0.8303248882293701, + "num_tokens": 33104112.0, + "step": 14641, + "train/ce_loss": 0.4775668978691101 + }, + { + "epoch": 1.4475973897567727, + "step": 14641, + "train/sim_loss": 0.04779183864593506 + }, + { + "epoch": 1.4475973897567727, + "step": 14641, + "train/total_loss": 0.09554852545261383 + }, + { + "entropy": 9.227144241333008, + "epoch": 1.4476962626062884, + "mean_token_accuracy": 0.8763867020606995, + "num_tokens": 33117744.0, + "step": 14642, + "train/ce_loss": 0.4976555109024048 + }, + { + "epoch": 1.4476962626062884, + "step": 14642, + "train/sim_loss": 0.110542893409729 + }, + { + "epoch": 1.4476962626062884, + "step": 14642, + "train/total_loss": 0.16030845046043396 + }, + { + "entropy": 9.622013092041016, + "epoch": 1.4477951354558038, + "mean_token_accuracy": 0.8496000170707703, + "num_tokens": 33134613.0, + "step": 14643, + "train/ce_loss": 0.5908256769180298 + }, + { + "epoch": 1.4477951354558038, + "step": 14643, + "train/sim_loss": 0.04033011198043823 + }, + { + "epoch": 1.4477951354558038, + "step": 14643, + "train/total_loss": 0.09941267967224121 + }, + { + "entropy": 9.303192138671875, + "epoch": 1.4478940083053193, + "mean_token_accuracy": 0.8523111343383789, + "num_tokens": 33148369.0, + "step": 14644, + "train/ce_loss": 0.4892221689224243 + }, + { + "epoch": 1.4478940083053193, + "step": 14644, + "train/sim_loss": 0.026181578636169434 + }, + { + "epoch": 1.4478940083053193, + "step": 14644, + "train/total_loss": 0.07510379701852798 + }, + { + "entropy": 9.617070198059082, + "epoch": 1.4479928811548348, + "mean_token_accuracy": 0.9252199530601501, + "num_tokens": 33158661.0, + "step": 14645, + "train/ce_loss": 2.572936352862598e-07 + }, + { + "epoch": 1.4479928811548348, + "step": 14645, + "train/sim_loss": 0.01657545566558838 + }, + { + "epoch": 1.4479928811548348, + "step": 14645, + "train/total_loss": 0.016575481742620468 + }, + { + "entropy": 9.325555801391602, + "epoch": 1.4480917540043503, + "mean_token_accuracy": 0.8237232565879822, + "num_tokens": 33173462.0, + "step": 14646, + "train/ce_loss": 0.43967515230178833 + }, + { + "epoch": 1.4480917540043503, + "step": 14646, + "train/sim_loss": 0.023655295372009277 + }, + { + "epoch": 1.4480917540043503, + "step": 14646, + "train/total_loss": 0.06762281060218811 + }, + { + "entropy": 9.3221435546875, + "epoch": 1.448190626853866, + "mean_token_accuracy": 0.8572916388511658, + "num_tokens": 33187010.0, + "step": 14647, + "train/ce_loss": 0.5485677123069763 + }, + { + "epoch": 1.448190626853866, + "step": 14647, + "train/sim_loss": 0.035593628883361816 + }, + { + "epoch": 1.448190626853866, + "step": 14647, + "train/total_loss": 0.09045040607452393 + }, + { + "entropy": 9.855774879455566, + "epoch": 1.4482894997033815, + "mean_token_accuracy": 0.875984251499176, + "num_tokens": 33200136.0, + "step": 14648, + "train/ce_loss": 0.48586317896842957 + }, + { + "epoch": 1.4482894997033815, + "step": 14648, + "train/sim_loss": 0.020947754383087158 + }, + { + "epoch": 1.4482894997033815, + "step": 14648, + "train/total_loss": 0.06953407824039459 + }, + { + "entropy": 9.42752456665039, + "epoch": 1.448388372552897, + "mean_token_accuracy": 0.8723926544189453, + "num_tokens": 33219378.0, + "step": 14649, + "train/ce_loss": 0.3703184127807617 + }, + { + "epoch": 1.448388372552897, + "step": 14649, + "train/sim_loss": 0.04552626609802246 + }, + { + "epoch": 1.448388372552897, + "step": 14649, + "train/total_loss": 0.08255811035633087 + }, + { + "entropy": 9.349323272705078, + "epoch": 1.4484872454024125, + "mean_token_accuracy": 0.89670330286026, + "num_tokens": 33229972.0, + "step": 14650, + "train/ce_loss": 0.742614209651947 + }, + { + "epoch": 1.4484872454024125, + "step": 14650, + "train/sim_loss": 0.07401704788208008 + }, + { + "epoch": 1.4484872454024125, + "step": 14650, + "train/total_loss": 0.14827847480773926 + }, + { + "entropy": 9.361462593078613, + "epoch": 1.448586118251928, + "mean_token_accuracy": 0.8304821252822876, + "num_tokens": 33238742.0, + "step": 14651, + "train/ce_loss": 0.5985924601554871 + }, + { + "epoch": 1.448586118251928, + "step": 14651, + "train/sim_loss": 0.04900139570236206 + }, + { + "epoch": 1.448586118251928, + "step": 14651, + "train/total_loss": 0.10886064171791077 + }, + { + "entropy": 9.691993713378906, + "epoch": 1.4486849911014437, + "mean_token_accuracy": 0.860805869102478, + "num_tokens": 33260230.0, + "step": 14652, + "train/ce_loss": 0.5560237765312195 + }, + { + "epoch": 1.4486849911014437, + "step": 14652, + "train/sim_loss": 0.052364230155944824 + }, + { + "epoch": 1.4486849911014437, + "step": 14652, + "train/total_loss": 0.10796660929918289 + }, + { + "entropy": 9.21683120727539, + "epoch": 1.4487838639509591, + "mean_token_accuracy": 0.8605714440345764, + "num_tokens": 33270886.0, + "step": 14653, + "train/ce_loss": 0.29392102360725403 + }, + { + "epoch": 1.4487838639509591, + "step": 14653, + "train/sim_loss": 0.04037153720855713 + }, + { + "epoch": 1.4487838639509591, + "step": 14653, + "train/total_loss": 0.06976363807916641 + }, + { + "entropy": 9.184913635253906, + "epoch": 1.4488827368004746, + "mean_token_accuracy": 0.813747227191925, + "num_tokens": 33283898.0, + "step": 14654, + "train/ce_loss": 0.34826958179473877 + }, + { + "epoch": 1.4488827368004746, + "step": 14654, + "train/sim_loss": 0.014536738395690918 + }, + { + "epoch": 1.4488827368004746, + "step": 14654, + "train/total_loss": 0.049363698810338974 + }, + { + "entropy": 9.126859664916992, + "epoch": 1.4489816096499901, + "mean_token_accuracy": 0.8695651888847351, + "num_tokens": 33295082.0, + "step": 14655, + "train/ce_loss": 0.5153428316116333 + }, + { + "epoch": 1.4489816096499901, + "step": 14655, + "train/sim_loss": 0.03438913822174072 + }, + { + "epoch": 1.4489816096499901, + "step": 14655, + "train/total_loss": 0.08592341840267181 + }, + { + "entropy": 9.470640182495117, + "epoch": 1.4490804824995056, + "mean_token_accuracy": 0.8337950110435486, + "num_tokens": 33303368.0, + "step": 14656, + "train/ce_loss": 0.3461177349090576 + }, + { + "epoch": 1.4490804824995056, + "step": 14656, + "train/sim_loss": 0.047835707664489746 + }, + { + "epoch": 1.4490804824995056, + "step": 14656, + "train/total_loss": 0.08244748413562775 + }, + { + "entropy": 9.599265098571777, + "epoch": 1.449179355349021, + "mean_token_accuracy": 0.8809523582458496, + "num_tokens": 33319957.0, + "step": 14657, + "train/ce_loss": 0.39546558260917664 + }, + { + "epoch": 1.449179355349021, + "step": 14657, + "train/sim_loss": 0.03899914026260376 + }, + { + "epoch": 1.449179355349021, + "step": 14657, + "train/total_loss": 0.0785457044839859 + }, + { + "entropy": 9.371747970581055, + "epoch": 1.4492782281985366, + "mean_token_accuracy": 0.8655778765678406, + "num_tokens": 33332280.0, + "step": 14658, + "train/ce_loss": 0.5211436748504639 + }, + { + "epoch": 1.4492782281985366, + "step": 14658, + "train/sim_loss": 0.059886276721954346 + }, + { + "epoch": 1.4492782281985366, + "step": 14658, + "train/total_loss": 0.11200064420700073 + }, + { + "entropy": 9.1581392288208, + "epoch": 1.4493771010480523, + "mean_token_accuracy": 0.8912710547447205, + "num_tokens": 33339015.0, + "step": 14659, + "train/ce_loss": 0.5206813812255859 + }, + { + "epoch": 1.4493771010480523, + "step": 14659, + "train/sim_loss": 0.04920458793640137 + }, + { + "epoch": 1.4493771010480523, + "step": 14659, + "train/total_loss": 0.10127273201942444 + }, + { + "epoch": 1.4494759738975678, + "grad_norm": 0.5147165656089783, + "learning_rate": 6.378133808040351e-06, + "loss": 0.0808, + "step": 14660 + }, + { + "entropy": 8.739776611328125, + "epoch": 1.4494759738975678, + "mean_token_accuracy": 0.853042483329773, + "num_tokens": 33348982.0, + "step": 14660, + "train/ce_loss": 0.24739843606948853 + }, + { + "epoch": 1.4494759738975678, + "step": 14660, + "train/sim_loss": 0.0593414306640625 + }, + { + "epoch": 1.4494759738975678, + "step": 14660, + "train/total_loss": 0.08408127725124359 + }, + { + "entropy": 9.390892028808594, + "epoch": 1.4495748467470833, + "mean_token_accuracy": 0.8342857360839844, + "num_tokens": 33365271.0, + "step": 14661, + "train/ce_loss": 0.38324058055877686 + }, + { + "epoch": 1.4495748467470833, + "step": 14661, + "train/sim_loss": 0.032340288162231445 + }, + { + "epoch": 1.4495748467470833, + "step": 14661, + "train/total_loss": 0.07066434621810913 + }, + { + "entropy": 8.723437309265137, + "epoch": 1.4496737195965987, + "mean_token_accuracy": 0.8349429368972778, + "num_tokens": 33377835.0, + "step": 14662, + "train/ce_loss": 0.41891559958457947 + }, + { + "epoch": 1.4496737195965987, + "step": 14662, + "train/sim_loss": 0.014190912246704102 + }, + { + "epoch": 1.4496737195965987, + "step": 14662, + "train/total_loss": 0.05608247220516205 + }, + { + "entropy": 9.616903305053711, + "epoch": 1.4497725924461142, + "mean_token_accuracy": 0.8389021754264832, + "num_tokens": 33392286.0, + "step": 14663, + "train/ce_loss": 0.4437319338321686 + }, + { + "epoch": 1.4497725924461142, + "step": 14663, + "train/sim_loss": 0.050224483013153076 + }, + { + "epoch": 1.4497725924461142, + "step": 14663, + "train/total_loss": 0.09459768235683441 + }, + { + "entropy": 9.540325164794922, + "epoch": 1.44987146529563, + "mean_token_accuracy": 0.9060402512550354, + "num_tokens": 33406143.0, + "step": 14664, + "train/ce_loss": 6.928262905603333e-07 + }, + { + "epoch": 1.44987146529563, + "step": 14664, + "train/sim_loss": 0.02345740795135498 + }, + { + "epoch": 1.44987146529563, + "step": 14664, + "train/total_loss": 0.023457476869225502 + }, + { + "entropy": 9.62871265411377, + "epoch": 1.4499703381451454, + "mean_token_accuracy": 0.8414154648780823, + "num_tokens": 33417218.0, + "step": 14665, + "train/ce_loss": 0.5710089802742004 + }, + { + "epoch": 1.4499703381451454, + "step": 14665, + "train/sim_loss": 0.04705202579498291 + }, + { + "epoch": 1.4499703381451454, + "step": 14665, + "train/total_loss": 0.10415292531251907 + }, + { + "entropy": 9.240850448608398, + "epoch": 1.450069210994661, + "mean_token_accuracy": 0.881465494632721, + "num_tokens": 33431912.0, + "step": 14666, + "train/ce_loss": 0.3971051275730133 + }, + { + "epoch": 1.450069210994661, + "step": 14666, + "train/sim_loss": 0.027614116668701172 + }, + { + "epoch": 1.450069210994661, + "step": 14666, + "train/total_loss": 0.06732463091611862 + }, + { + "entropy": 9.521869659423828, + "epoch": 1.4501680838441764, + "mean_token_accuracy": 0.8674971461296082, + "num_tokens": 33443172.0, + "step": 14667, + "train/ce_loss": 0.4167539179325104 + }, + { + "epoch": 1.4501680838441764, + "step": 14667, + "train/sim_loss": 0.053287506103515625 + }, + { + "epoch": 1.4501680838441764, + "step": 14667, + "train/total_loss": 0.09496289491653442 + }, + { + "entropy": 9.540433883666992, + "epoch": 1.4502669566936919, + "mean_token_accuracy": 0.8495145440101624, + "num_tokens": 33456812.0, + "step": 14668, + "train/ce_loss": 0.42930924892425537 + }, + { + "epoch": 1.4502669566936919, + "step": 14668, + "train/sim_loss": 0.028553545475006104 + }, + { + "epoch": 1.4502669566936919, + "step": 14668, + "train/total_loss": 0.07148447632789612 + }, + { + "entropy": 9.494424819946289, + "epoch": 1.4503658295432074, + "mean_token_accuracy": 0.916967511177063, + "num_tokens": 33472449.0, + "step": 14669, + "train/ce_loss": 0.28679296374320984 + }, + { + "epoch": 1.4503658295432074, + "step": 14669, + "train/sim_loss": 0.040726661682128906 + }, + { + "epoch": 1.4503658295432074, + "step": 14669, + "train/total_loss": 0.06940595805644989 + }, + { + "entropy": 9.743650436401367, + "epoch": 1.4504647023927228, + "mean_token_accuracy": 0.8417818546295166, + "num_tokens": 33480920.0, + "step": 14670, + "train/ce_loss": 0.4527071416378021 + }, + { + "epoch": 1.4504647023927228, + "step": 14670, + "train/sim_loss": 0.08013749122619629 + }, + { + "epoch": 1.4504647023927228, + "step": 14670, + "train/total_loss": 0.12540820240974426 + }, + { + "entropy": 9.267477035522461, + "epoch": 1.4505635752422386, + "mean_token_accuracy": 0.8445873260498047, + "num_tokens": 33494106.0, + "step": 14671, + "train/ce_loss": 0.26008284091949463 + }, + { + "epoch": 1.4505635752422386, + "step": 14671, + "train/sim_loss": 0.01477140188217163 + }, + { + "epoch": 1.4505635752422386, + "step": 14671, + "train/total_loss": 0.04077968746423721 + }, + { + "entropy": 9.318244934082031, + "epoch": 1.450662448091754, + "mean_token_accuracy": 0.8176670670509338, + "num_tokens": 33506423.0, + "step": 14672, + "train/ce_loss": 0.25422534346580505 + }, + { + "epoch": 1.450662448091754, + "step": 14672, + "train/sim_loss": 0.009165585041046143 + }, + { + "epoch": 1.450662448091754, + "step": 14672, + "train/total_loss": 0.03458812087774277 + }, + { + "entropy": 8.702000617980957, + "epoch": 1.4507613209412695, + "mean_token_accuracy": 0.8671003580093384, + "num_tokens": 33516778.0, + "step": 14673, + "train/ce_loss": 0.5765774250030518 + }, + { + "epoch": 1.4507613209412695, + "step": 14673, + "train/sim_loss": 0.020522713661193848 + }, + { + "epoch": 1.4507613209412695, + "step": 14673, + "train/total_loss": 0.0781804621219635 + }, + { + "entropy": 8.712478637695312, + "epoch": 1.450860193790785, + "mean_token_accuracy": 0.8745454549789429, + "num_tokens": 33526183.0, + "step": 14674, + "train/ce_loss": 0.3483887314796448 + }, + { + "epoch": 1.450860193790785, + "step": 14674, + "train/sim_loss": 0.013893485069274902 + }, + { + "epoch": 1.450860193790785, + "step": 14674, + "train/total_loss": 0.04873235896229744 + }, + { + "entropy": 9.594045639038086, + "epoch": 1.4509590666403005, + "mean_token_accuracy": 0.8906009197235107, + "num_tokens": 33541930.0, + "step": 14675, + "train/ce_loss": 3.313567162877007e-07 + }, + { + "epoch": 1.4509590666403005, + "step": 14675, + "train/sim_loss": 0.01813340187072754 + }, + { + "epoch": 1.4509590666403005, + "step": 14675, + "train/total_loss": 0.018133435398340225 + }, + { + "entropy": 9.585663795471191, + "epoch": 1.4510579394898162, + "mean_token_accuracy": 0.8181818127632141, + "num_tokens": 33551674.0, + "step": 14676, + "train/ce_loss": 0.5093943476676941 + }, + { + "epoch": 1.4510579394898162, + "step": 14676, + "train/sim_loss": 0.03471553325653076 + }, + { + "epoch": 1.4510579394898162, + "step": 14676, + "train/total_loss": 0.08565497398376465 + }, + { + "entropy": 9.144400596618652, + "epoch": 1.4511568123393317, + "mean_token_accuracy": 0.8356481194496155, + "num_tokens": 33559754.0, + "step": 14677, + "train/ce_loss": 0.5014157295227051 + }, + { + "epoch": 1.4511568123393317, + "step": 14677, + "train/sim_loss": 0.06764054298400879 + }, + { + "epoch": 1.4511568123393317, + "step": 14677, + "train/total_loss": 0.1177821159362793 + }, + { + "entropy": 9.582964897155762, + "epoch": 1.4512556851888472, + "mean_token_accuracy": 0.8731883764266968, + "num_tokens": 33572735.0, + "step": 14678, + "train/ce_loss": 0.7881671786308289 + }, + { + "epoch": 1.4512556851888472, + "step": 14678, + "train/sim_loss": 0.018743693828582764 + }, + { + "epoch": 1.4512556851888472, + "step": 14678, + "train/total_loss": 0.09756041318178177 + }, + { + "entropy": 9.639993667602539, + "epoch": 1.4513545580383627, + "mean_token_accuracy": 0.8549222946166992, + "num_tokens": 33583965.0, + "step": 14679, + "train/ce_loss": 3.977721405590273e-07 + }, + { + "epoch": 1.4513545580383627, + "step": 14679, + "train/sim_loss": 0.016023993492126465 + }, + { + "epoch": 1.4513545580383627, + "step": 14679, + "train/total_loss": 0.0160240326076746 + }, + { + "epoch": 1.4514534308878781, + "grad_norm": 0.6150686740875244, + "learning_rate": 6.373188943282401e-06, + "loss": 0.0821, + "step": 14680 + }, + { + "entropy": 9.334566116333008, + "epoch": 1.4514534308878781, + "mean_token_accuracy": 0.8109820485115051, + "num_tokens": 33596755.0, + "step": 14680, + "train/ce_loss": 0.6800681352615356 + }, + { + "epoch": 1.4514534308878781, + "step": 14680, + "train/sim_loss": 0.047608256340026855 + }, + { + "epoch": 1.4514534308878781, + "step": 14680, + "train/total_loss": 0.11561506986618042 + }, + { + "entropy": 9.577949523925781, + "epoch": 1.4515523037373936, + "mean_token_accuracy": 0.8839285969734192, + "num_tokens": 33603137.0, + "step": 14681, + "train/ce_loss": 5.120086370880017e-06 + }, + { + "epoch": 1.4515523037373936, + "step": 14681, + "train/sim_loss": 0.03108006715774536 + }, + { + "epoch": 1.4515523037373936, + "step": 14681, + "train/total_loss": 0.0310805793851614 + }, + { + "entropy": 8.795356750488281, + "epoch": 1.4516511765869091, + "mean_token_accuracy": 0.8177777528762817, + "num_tokens": 33616421.0, + "step": 14682, + "train/ce_loss": 0.8019314408302307 + }, + { + "epoch": 1.4516511765869091, + "step": 14682, + "train/sim_loss": 0.06007760763168335 + }, + { + "epoch": 1.4516511765869091, + "step": 14682, + "train/total_loss": 0.14027075469493866 + }, + { + "entropy": 9.300925254821777, + "epoch": 1.4517500494364248, + "mean_token_accuracy": 0.862500011920929, + "num_tokens": 33625212.0, + "step": 14683, + "train/ce_loss": 3.51662947650766e-07 + }, + { + "epoch": 1.4517500494364248, + "step": 14683, + "train/sim_loss": 0.045381367206573486 + }, + { + "epoch": 1.4517500494364248, + "step": 14683, + "train/total_loss": 0.04538140073418617 + }, + { + "entropy": 10.039361953735352, + "epoch": 1.4518489222859403, + "mean_token_accuracy": 0.8579545617103577, + "num_tokens": 33636441.0, + "step": 14684, + "train/ce_loss": 0.5025747418403625 + }, + { + "epoch": 1.4518489222859403, + "step": 14684, + "train/sim_loss": 0.07263827323913574 + }, + { + "epoch": 1.4518489222859403, + "step": 14684, + "train/total_loss": 0.122895747423172 + }, + { + "entropy": 9.379241943359375, + "epoch": 1.4519477951354558, + "mean_token_accuracy": 0.8678414225578308, + "num_tokens": 33648324.0, + "step": 14685, + "train/ce_loss": 0.411716490983963 + }, + { + "epoch": 1.4519477951354558, + "step": 14685, + "train/sim_loss": 0.034843623638153076 + }, + { + "epoch": 1.4519477951354558, + "step": 14685, + "train/total_loss": 0.07601527869701385 + }, + { + "entropy": 9.327396392822266, + "epoch": 1.4520466679849713, + "mean_token_accuracy": 0.8237791657447815, + "num_tokens": 33663674.0, + "step": 14686, + "train/ce_loss": 0.602919340133667 + }, + { + "epoch": 1.4520466679849713, + "step": 14686, + "train/sim_loss": 0.05683004856109619 + }, + { + "epoch": 1.4520466679849713, + "step": 14686, + "train/total_loss": 0.11712197959423065 + }, + { + "entropy": 9.577615737915039, + "epoch": 1.4521455408344868, + "mean_token_accuracy": 0.8536184430122375, + "num_tokens": 33673275.0, + "step": 14687, + "train/ce_loss": 3.6435892525332747e-06 + }, + { + "epoch": 1.4521455408344868, + "step": 14687, + "train/sim_loss": 0.034397661685943604 + }, + { + "epoch": 1.4521455408344868, + "step": 14687, + "train/total_loss": 0.03439802676439285 + }, + { + "entropy": 9.136085510253906, + "epoch": 1.4522444136840025, + "mean_token_accuracy": 0.8672055602073669, + "num_tokens": 33681202.0, + "step": 14688, + "train/ce_loss": 0.34792855381965637 + }, + { + "epoch": 1.4522444136840025, + "step": 14688, + "train/sim_loss": 0.021801531314849854 + }, + { + "epoch": 1.4522444136840025, + "step": 14688, + "train/total_loss": 0.05659438669681549 + }, + { + "entropy": 9.595154762268066, + "epoch": 1.452343286533518, + "mean_token_accuracy": 0.8526970744132996, + "num_tokens": 33698436.0, + "step": 14689, + "train/ce_loss": 0.8732252717018127 + }, + { + "epoch": 1.452343286533518, + "step": 14689, + "train/sim_loss": 0.11701184511184692 + }, + { + "epoch": 1.452343286533518, + "step": 14689, + "train/total_loss": 0.20433437824249268 + }, + { + "entropy": 9.369937896728516, + "epoch": 1.4524421593830334, + "mean_token_accuracy": 0.8279301524162292, + "num_tokens": 33713423.0, + "step": 14690, + "train/ce_loss": 0.3965267539024353 + }, + { + "epoch": 1.4524421593830334, + "step": 14690, + "train/sim_loss": 0.03334248065948486 + }, + { + "epoch": 1.4524421593830334, + "step": 14690, + "train/total_loss": 0.0729951560497284 + }, + { + "entropy": 9.51523494720459, + "epoch": 1.452541032232549, + "mean_token_accuracy": 0.90317702293396, + "num_tokens": 33725262.0, + "step": 14691, + "train/ce_loss": 0.522767961025238 + }, + { + "epoch": 1.452541032232549, + "step": 14691, + "train/sim_loss": 0.016354799270629883 + }, + { + "epoch": 1.452541032232549, + "step": 14691, + "train/total_loss": 0.0686315968632698 + }, + { + "entropy": 9.210704803466797, + "epoch": 1.4526399050820644, + "mean_token_accuracy": 0.8657786846160889, + "num_tokens": 33733562.0, + "step": 14692, + "train/ce_loss": 0.6514019966125488 + }, + { + "epoch": 1.4526399050820644, + "step": 14692, + "train/sim_loss": 0.06895405054092407 + }, + { + "epoch": 1.4526399050820644, + "step": 14692, + "train/total_loss": 0.1340942531824112 + }, + { + "entropy": 9.394209861755371, + "epoch": 1.45273877793158, + "mean_token_accuracy": 0.8365527391433716, + "num_tokens": 33746880.0, + "step": 14693, + "train/ce_loss": 0.4537045955657959 + }, + { + "epoch": 1.45273877793158, + "step": 14693, + "train/sim_loss": 0.028767526149749756 + }, + { + "epoch": 1.45273877793158, + "step": 14693, + "train/total_loss": 0.07413798570632935 + }, + { + "entropy": 9.01870346069336, + "epoch": 1.4528376507810954, + "mean_token_accuracy": 0.8722826242446899, + "num_tokens": 33759790.0, + "step": 14694, + "train/ce_loss": 0.3588119149208069 + }, + { + "epoch": 1.4528376507810954, + "step": 14694, + "train/sim_loss": 0.07483386993408203 + }, + { + "epoch": 1.4528376507810954, + "step": 14694, + "train/total_loss": 0.11071506142616272 + }, + { + "entropy": 9.486310958862305, + "epoch": 1.452936523630611, + "mean_token_accuracy": 0.8705416321754456, + "num_tokens": 33774173.0, + "step": 14695, + "train/ce_loss": 0.47129499912261963 + }, + { + "epoch": 1.452936523630611, + "step": 14695, + "train/sim_loss": 0.022643208503723145 + }, + { + "epoch": 1.452936523630611, + "step": 14695, + "train/total_loss": 0.06977270543575287 + }, + { + "entropy": 9.208636283874512, + "epoch": 1.4530353964801266, + "mean_token_accuracy": 0.8181818127632141, + "num_tokens": 33782782.0, + "step": 14696, + "train/ce_loss": 0.6319429278373718 + }, + { + "epoch": 1.4530353964801266, + "step": 14696, + "train/sim_loss": 0.039192795753479004 + }, + { + "epoch": 1.4530353964801266, + "step": 14696, + "train/total_loss": 0.10238709300756454 + }, + { + "entropy": 9.80825424194336, + "epoch": 1.453134269329642, + "mean_token_accuracy": 0.894444465637207, + "num_tokens": 33799931.0, + "step": 14697, + "train/ce_loss": 0.3215264678001404 + }, + { + "epoch": 1.453134269329642, + "step": 14697, + "train/sim_loss": 0.031705379486083984 + }, + { + "epoch": 1.453134269329642, + "step": 14697, + "train/total_loss": 0.0638580322265625 + }, + { + "entropy": 9.380996704101562, + "epoch": 1.4532331421791576, + "mean_token_accuracy": 0.8469750881195068, + "num_tokens": 33813748.0, + "step": 14698, + "train/ce_loss": 0.3768058717250824 + }, + { + "epoch": 1.4532331421791576, + "step": 14698, + "train/sim_loss": 0.020936667919158936 + }, + { + "epoch": 1.4532331421791576, + "step": 14698, + "train/total_loss": 0.058617256581783295 + }, + { + "entropy": 9.70685863494873, + "epoch": 1.453332015028673, + "mean_token_accuracy": 0.8014941215515137, + "num_tokens": 33832369.0, + "step": 14699, + "train/ce_loss": 0.36050066351890564 + }, + { + "epoch": 1.453332015028673, + "step": 14699, + "train/sim_loss": 0.032318949699401855 + }, + { + "epoch": 1.453332015028673, + "step": 14699, + "train/total_loss": 0.06836901605129242 + }, + { + "epoch": 1.4534308878781887, + "grad_norm": 0.579157292842865, + "learning_rate": 6.368244078524453e-06, + "loss": 0.0827, + "step": 14700 + }, + { + "entropy": 9.070991516113281, + "epoch": 1.4534308878781887, + "mean_token_accuracy": 0.8560267686843872, + "num_tokens": 33841295.0, + "step": 14700, + "train/ce_loss": 0.5267177224159241 + }, + { + "epoch": 1.4534308878781887, + "step": 14700, + "train/sim_loss": 0.02613210678100586 + }, + { + "epoch": 1.4534308878781887, + "step": 14700, + "train/total_loss": 0.0788038820028305 + }, + { + "entropy": 9.68459701538086, + "epoch": 1.4535297607277042, + "mean_token_accuracy": 0.8484848737716675, + "num_tokens": 33848767.0, + "step": 14701, + "train/ce_loss": 0.6436049938201904 + }, + { + "epoch": 1.4535297607277042, + "step": 14701, + "train/sim_loss": 0.11059725284576416 + }, + { + "epoch": 1.4535297607277042, + "step": 14701, + "train/total_loss": 0.1749577522277832 + }, + { + "entropy": 9.0930814743042, + "epoch": 1.4536286335772197, + "mean_token_accuracy": 0.8368725776672363, + "num_tokens": 33862806.0, + "step": 14702, + "train/ce_loss": 0.47131603956222534 + }, + { + "epoch": 1.4536286335772197, + "step": 14702, + "train/sim_loss": 0.01834958791732788 + }, + { + "epoch": 1.4536286335772197, + "step": 14702, + "train/total_loss": 0.06548119336366653 + }, + { + "entropy": 9.409143447875977, + "epoch": 1.4537275064267352, + "mean_token_accuracy": 0.8782608509063721, + "num_tokens": 33877083.0, + "step": 14703, + "train/ce_loss": 0.3460744321346283 + }, + { + "epoch": 1.4537275064267352, + "step": 14703, + "train/sim_loss": 0.01676023006439209 + }, + { + "epoch": 1.4537275064267352, + "step": 14703, + "train/total_loss": 0.05136767402291298 + }, + { + "entropy": 9.18404483795166, + "epoch": 1.4538263792762507, + "mean_token_accuracy": 0.8672316670417786, + "num_tokens": 33885489.0, + "step": 14704, + "train/ce_loss": 0.2778667211532593 + }, + { + "epoch": 1.4538263792762507, + "step": 14704, + "train/sim_loss": 0.05722922086715698 + }, + { + "epoch": 1.4538263792762507, + "step": 14704, + "train/total_loss": 0.08501589298248291 + }, + { + "entropy": 9.205083847045898, + "epoch": 1.4539252521257664, + "mean_token_accuracy": 0.8415529727935791, + "num_tokens": 33899218.0, + "step": 14705, + "train/ce_loss": 0.6417967677116394 + }, + { + "epoch": 1.4539252521257664, + "step": 14705, + "train/sim_loss": 0.04861879348754883 + }, + { + "epoch": 1.4539252521257664, + "step": 14705, + "train/total_loss": 0.11279847472906113 + }, + { + "entropy": 9.215303421020508, + "epoch": 1.4540241249752817, + "mean_token_accuracy": 0.8011173009872437, + "num_tokens": 33911155.0, + "step": 14706, + "train/ce_loss": 0.5515859127044678 + }, + { + "epoch": 1.4540241249752817, + "step": 14706, + "train/sim_loss": 0.041493237018585205 + }, + { + "epoch": 1.4540241249752817, + "step": 14706, + "train/total_loss": 0.0966518297791481 + }, + { + "entropy": 9.224387168884277, + "epoch": 1.4541229978247974, + "mean_token_accuracy": 0.8611793518066406, + "num_tokens": 33922006.0, + "step": 14707, + "train/ce_loss": 0.35951751470565796 + }, + { + "epoch": 1.4541229978247974, + "step": 14707, + "train/sim_loss": 0.016010046005249023 + }, + { + "epoch": 1.4541229978247974, + "step": 14707, + "train/total_loss": 0.05196179822087288 + }, + { + "entropy": 9.309141159057617, + "epoch": 1.4542218706743129, + "mean_token_accuracy": 0.8191881775856018, + "num_tokens": 33940398.0, + "step": 14708, + "train/ce_loss": 0.534654438495636 + }, + { + "epoch": 1.4542218706743129, + "step": 14708, + "train/sim_loss": 0.031191229820251465 + }, + { + "epoch": 1.4542218706743129, + "step": 14708, + "train/total_loss": 0.08465667068958282 + }, + { + "entropy": 9.588021278381348, + "epoch": 1.4543207435238283, + "mean_token_accuracy": 0.8474114537239075, + "num_tokens": 33950399.0, + "step": 14709, + "train/ce_loss": 0.5212854146957397 + }, + { + "epoch": 1.4543207435238283, + "step": 14709, + "train/sim_loss": 0.08702635765075684 + }, + { + "epoch": 1.4543207435238283, + "step": 14709, + "train/total_loss": 0.13915489614009857 + }, + { + "entropy": 9.04483413696289, + "epoch": 1.4544196163733438, + "mean_token_accuracy": 0.8626834154129028, + "num_tokens": 33963472.0, + "step": 14710, + "train/ce_loss": 0.5475457906723022 + }, + { + "epoch": 1.4544196163733438, + "step": 14710, + "train/sim_loss": 0.07411587238311768 + }, + { + "epoch": 1.4544196163733438, + "step": 14710, + "train/total_loss": 0.12887045741081238 + }, + { + "entropy": 9.392282485961914, + "epoch": 1.4545184892228593, + "mean_token_accuracy": 0.821566104888916, + "num_tokens": 33976207.0, + "step": 14711, + "train/ce_loss": 0.5691770315170288 + }, + { + "epoch": 1.4545184892228593, + "step": 14711, + "train/sim_loss": 0.05511903762817383 + }, + { + "epoch": 1.4545184892228593, + "step": 14711, + "train/total_loss": 0.11203674226999283 + }, + { + "entropy": 9.22180461883545, + "epoch": 1.454617362072375, + "mean_token_accuracy": 0.821566104888916, + "num_tokens": 33989633.0, + "step": 14712, + "train/ce_loss": 0.5951242446899414 + }, + { + "epoch": 1.454617362072375, + "step": 14712, + "train/sim_loss": 0.08610498905181885 + }, + { + "epoch": 1.454617362072375, + "step": 14712, + "train/total_loss": 0.14561741054058075 + }, + { + "entropy": 9.009918212890625, + "epoch": 1.4547162349218905, + "mean_token_accuracy": 0.8511165976524353, + "num_tokens": 34000565.0, + "step": 14713, + "train/ce_loss": 0.31421613693237305 + }, + { + "epoch": 1.4547162349218905, + "step": 14713, + "train/sim_loss": 0.029319703578948975 + }, + { + "epoch": 1.4547162349218905, + "step": 14713, + "train/total_loss": 0.06074131652712822 + }, + { + "entropy": 9.619643211364746, + "epoch": 1.454815107771406, + "mean_token_accuracy": 0.9230769276618958, + "num_tokens": 34017510.0, + "step": 14714, + "train/ce_loss": 4.196306690573692e-06 + }, + { + "epoch": 1.454815107771406, + "step": 14714, + "train/sim_loss": 0.03212630748748779 + }, + { + "epoch": 1.454815107771406, + "step": 14714, + "train/total_loss": 0.03212672844529152 + }, + { + "entropy": 9.78915023803711, + "epoch": 1.4549139806209215, + "mean_token_accuracy": 0.822429895401001, + "num_tokens": 34039027.0, + "step": 14715, + "train/ce_loss": 0.6105338335037231 + }, + { + "epoch": 1.4549139806209215, + "step": 14715, + "train/sim_loss": 0.04827970266342163 + }, + { + "epoch": 1.4549139806209215, + "step": 14715, + "train/total_loss": 0.1093330830335617 + }, + { + "entropy": 9.210119247436523, + "epoch": 1.455012853470437, + "mean_token_accuracy": 0.8424657583236694, + "num_tokens": 34048090.0, + "step": 14716, + "train/ce_loss": 0.2085479497909546 + }, + { + "epoch": 1.455012853470437, + "step": 14716, + "train/sim_loss": 0.06029993295669556 + }, + { + "epoch": 1.455012853470437, + "step": 14716, + "train/total_loss": 0.0811547264456749 + }, + { + "entropy": 9.530182838439941, + "epoch": 1.4551117263199527, + "mean_token_accuracy": 0.8423236608505249, + "num_tokens": 34062213.0, + "step": 14717, + "train/ce_loss": 0.928874135017395 + }, + { + "epoch": 1.4551117263199527, + "step": 14717, + "train/sim_loss": 0.0753210186958313 + }, + { + "epoch": 1.4551117263199527, + "step": 14717, + "train/total_loss": 0.16820843517780304 + }, + { + "entropy": 9.672266006469727, + "epoch": 1.455210599169468, + "mean_token_accuracy": 0.864904522895813, + "num_tokens": 34073144.0, + "step": 14718, + "train/ce_loss": 0.7522497773170471 + }, + { + "epoch": 1.455210599169468, + "step": 14718, + "train/sim_loss": 0.04282248020172119 + }, + { + "epoch": 1.455210599169468, + "step": 14718, + "train/total_loss": 0.11804746091365814 + }, + { + "entropy": 9.473533630371094, + "epoch": 1.4553094720189836, + "mean_token_accuracy": 0.7709497213363647, + "num_tokens": 34088424.0, + "step": 14719, + "train/ce_loss": 0.5964364409446716 + }, + { + "epoch": 1.4553094720189836, + "step": 14719, + "train/sim_loss": 0.07926952838897705 + }, + { + "epoch": 1.4553094720189836, + "step": 14719, + "train/total_loss": 0.13891316950321198 + }, + { + "epoch": 1.4554083448684991, + "grad_norm": 0.6937066912651062, + "learning_rate": 6.363299213766504e-06, + "loss": 0.091, + "step": 14720 + }, + { + "entropy": 8.902830123901367, + "epoch": 1.4554083448684991, + "mean_token_accuracy": 0.8455377817153931, + "num_tokens": 34096722.0, + "step": 14720, + "train/ce_loss": 0.6609516739845276 + }, + { + "epoch": 1.4554083448684991, + "step": 14720, + "train/sim_loss": 0.04707181453704834 + }, + { + "epoch": 1.4554083448684991, + "step": 14720, + "train/total_loss": 0.11316698044538498 + }, + { + "entropy": 9.872310638427734, + "epoch": 1.4555072177180146, + "mean_token_accuracy": 0.8425655961036682, + "num_tokens": 34107702.0, + "step": 14721, + "train/ce_loss": 1.3705021142959595 + }, + { + "epoch": 1.4555072177180146, + "step": 14721, + "train/sim_loss": 0.04017770290374756 + }, + { + "epoch": 1.4555072177180146, + "step": 14721, + "train/total_loss": 0.1772279143333435 + }, + { + "entropy": 9.414993286132812, + "epoch": 1.45560609056753, + "mean_token_accuracy": 0.876800000667572, + "num_tokens": 34114634.0, + "step": 14722, + "train/ce_loss": 0.3575807213783264 + }, + { + "epoch": 1.45560609056753, + "step": 14722, + "train/sim_loss": 0.03965198993682861 + }, + { + "epoch": 1.45560609056753, + "step": 14722, + "train/total_loss": 0.07541006803512573 + }, + { + "entropy": 9.22012710571289, + "epoch": 1.4557049634170456, + "mean_token_accuracy": 0.8443449139595032, + "num_tokens": 34122663.0, + "step": 14723, + "train/ce_loss": 0.403114378452301 + }, + { + "epoch": 1.4557049634170456, + "step": 14723, + "train/sim_loss": 0.06546396017074585 + }, + { + "epoch": 1.4557049634170456, + "step": 14723, + "train/total_loss": 0.10577540099620819 + }, + { + "entropy": 9.159160614013672, + "epoch": 1.4558038362665613, + "mean_token_accuracy": 0.8256997466087341, + "num_tokens": 34134203.0, + "step": 14724, + "train/ce_loss": 0.5966235399246216 + }, + { + "epoch": 1.4558038362665613, + "step": 14724, + "train/sim_loss": 0.052378296852111816 + }, + { + "epoch": 1.4558038362665613, + "step": 14724, + "train/total_loss": 0.11204065382480621 + }, + { + "entropy": 9.676472663879395, + "epoch": 1.4559027091160768, + "mean_token_accuracy": 0.8827160596847534, + "num_tokens": 34144110.0, + "step": 14725, + "train/ce_loss": 0.1663014143705368 + }, + { + "epoch": 1.4559027091160768, + "step": 14725, + "train/sim_loss": 0.0416485071182251 + }, + { + "epoch": 1.4559027091160768, + "step": 14725, + "train/total_loss": 0.0582786500453949 + }, + { + "entropy": 9.27940559387207, + "epoch": 1.4560015819655923, + "mean_token_accuracy": 0.8583333492279053, + "num_tokens": 34153716.0, + "step": 14726, + "train/ce_loss": 0.528950572013855 + }, + { + "epoch": 1.4560015819655923, + "step": 14726, + "train/sim_loss": 0.028372883796691895 + }, + { + "epoch": 1.4560015819655923, + "step": 14726, + "train/total_loss": 0.08126793801784515 + }, + { + "entropy": 9.570968627929688, + "epoch": 1.4561004548151077, + "mean_token_accuracy": 0.8546666502952576, + "num_tokens": 34170373.0, + "step": 14727, + "train/ce_loss": 0.42261433601379395 + }, + { + "epoch": 1.4561004548151077, + "step": 14727, + "train/sim_loss": 0.03482311964035034 + }, + { + "epoch": 1.4561004548151077, + "step": 14727, + "train/total_loss": 0.07708455622196198 + }, + { + "entropy": 8.91617488861084, + "epoch": 1.4561993276646232, + "mean_token_accuracy": 0.8270041942596436, + "num_tokens": 34183545.0, + "step": 14728, + "train/ce_loss": 0.6594251394271851 + }, + { + "epoch": 1.4561993276646232, + "step": 14728, + "train/sim_loss": 0.021551072597503662 + }, + { + "epoch": 1.4561993276646232, + "step": 14728, + "train/total_loss": 0.08749359101057053 + }, + { + "entropy": 9.132759094238281, + "epoch": 1.456298200514139, + "mean_token_accuracy": 0.8614457845687866, + "num_tokens": 34196418.0, + "step": 14729, + "train/ce_loss": 0.5366277694702148 + }, + { + "epoch": 1.456298200514139, + "step": 14729, + "train/sim_loss": 0.03757321834564209 + }, + { + "epoch": 1.456298200514139, + "step": 14729, + "train/total_loss": 0.09123599529266357 + }, + { + "entropy": 9.64008617401123, + "epoch": 1.4563970733636542, + "mean_token_accuracy": 0.8586626052856445, + "num_tokens": 34206694.0, + "step": 14730, + "train/ce_loss": 1.321737840953574e-06 + }, + { + "epoch": 1.4563970733636542, + "step": 14730, + "train/sim_loss": 0.04925870895385742 + }, + { + "epoch": 1.4563970733636542, + "step": 14730, + "train/total_loss": 0.04925883933901787 + }, + { + "entropy": 8.804506301879883, + "epoch": 1.45649594621317, + "mean_token_accuracy": 0.8695651888847351, + "num_tokens": 34214200.0, + "step": 14731, + "train/ce_loss": 0.5652631521224976 + }, + { + "epoch": 1.45649594621317, + "step": 14731, + "train/sim_loss": 0.048404574394226074 + }, + { + "epoch": 1.45649594621317, + "step": 14731, + "train/total_loss": 0.10493089258670807 + }, + { + "entropy": 8.863138198852539, + "epoch": 1.4565948190626854, + "mean_token_accuracy": 0.8914728760719299, + "num_tokens": 34232086.0, + "step": 14732, + "train/ce_loss": 1.7896380768434028e-06 + }, + { + "epoch": 1.4565948190626854, + "step": 14732, + "train/sim_loss": 0.025284230709075928 + }, + { + "epoch": 1.4565948190626854, + "step": 14732, + "train/total_loss": 0.025284409523010254 + }, + { + "entropy": 9.811891555786133, + "epoch": 1.4566936919122009, + "mean_token_accuracy": 0.8268733620643616, + "num_tokens": 34241253.0, + "step": 14733, + "train/ce_loss": 0.6149784922599792 + }, + { + "epoch": 1.4566936919122009, + "step": 14733, + "train/sim_loss": 0.008935809135437012 + }, + { + "epoch": 1.4566936919122009, + "step": 14733, + "train/total_loss": 0.07043366134166718 + }, + { + "entropy": 9.549681663513184, + "epoch": 1.4567925647617164, + "mean_token_accuracy": 0.8591304421424866, + "num_tokens": 34253581.0, + "step": 14734, + "train/ce_loss": 0.5832153558731079 + }, + { + "epoch": 1.4567925647617164, + "step": 14734, + "train/sim_loss": 0.03338998556137085 + }, + { + "epoch": 1.4567925647617164, + "step": 14734, + "train/total_loss": 0.09171152114868164 + }, + { + "entropy": 9.316329956054688, + "epoch": 1.4568914376112319, + "mean_token_accuracy": 0.8475524187088013, + "num_tokens": 34261182.0, + "step": 14735, + "train/ce_loss": 0.18562933802604675 + }, + { + "epoch": 1.4568914376112319, + "step": 14735, + "train/sim_loss": 0.019197702407836914 + }, + { + "epoch": 1.4568914376112319, + "step": 14735, + "train/total_loss": 0.03776063770055771 + }, + { + "entropy": 8.905550003051758, + "epoch": 1.4569903104607476, + "mean_token_accuracy": 0.8562753200531006, + "num_tokens": 34271813.0, + "step": 14736, + "train/ce_loss": 0.33847007155418396 + }, + { + "epoch": 1.4569903104607476, + "step": 14736, + "train/sim_loss": 0.02109438180923462 + }, + { + "epoch": 1.4569903104607476, + "step": 14736, + "train/total_loss": 0.054941389709711075 + }, + { + "entropy": 9.690364837646484, + "epoch": 1.457089183310263, + "mean_token_accuracy": 0.850953221321106, + "num_tokens": 34290184.0, + "step": 14737, + "train/ce_loss": 0.6871480941772461 + }, + { + "epoch": 1.457089183310263, + "step": 14737, + "train/sim_loss": 0.02774113416671753 + }, + { + "epoch": 1.457089183310263, + "step": 14737, + "train/total_loss": 0.09645594656467438 + }, + { + "entropy": 9.586849212646484, + "epoch": 1.4571880561597785, + "mean_token_accuracy": 0.9262899160385132, + "num_tokens": 34301629.0, + "step": 14738, + "train/ce_loss": 1.3519667163564009e-06 + }, + { + "epoch": 1.4571880561597785, + "step": 14738, + "train/sim_loss": 0.041194796562194824 + }, + { + "epoch": 1.4571880561597785, + "step": 14738, + "train/total_loss": 0.04119493067264557 + }, + { + "entropy": 9.914782524108887, + "epoch": 1.457286929009294, + "mean_token_accuracy": 0.8322147727012634, + "num_tokens": 34314368.0, + "step": 14739, + "train/ce_loss": 0.7435516715049744 + }, + { + "epoch": 1.457286929009294, + "step": 14739, + "train/sim_loss": 0.05830502510070801 + }, + { + "epoch": 1.457286929009294, + "step": 14739, + "train/total_loss": 0.13266019523143768 + }, + { + "epoch": 1.4573858018588095, + "grad_norm": 0.6169192790985107, + "learning_rate": 6.358354349008555e-06, + "loss": 0.0778, + "step": 14740 + }, + { + "entropy": 9.558239936828613, + "epoch": 1.4573858018588095, + "mean_token_accuracy": 0.8568738102912903, + "num_tokens": 34324937.0, + "step": 14740, + "train/ce_loss": 2.5319811811641557e-06 + }, + { + "epoch": 1.4573858018588095, + "step": 14740, + "train/sim_loss": 0.029158353805541992 + }, + { + "epoch": 1.4573858018588095, + "step": 14740, + "train/total_loss": 0.029158607125282288 + }, + { + "entropy": 9.327095985412598, + "epoch": 1.4574846747083252, + "mean_token_accuracy": 0.882108211517334, + "num_tokens": 34342747.0, + "step": 14741, + "train/ce_loss": 0.4249788522720337 + }, + { + "epoch": 1.4574846747083252, + "step": 14741, + "train/sim_loss": 0.058197855949401855 + }, + { + "epoch": 1.4574846747083252, + "step": 14741, + "train/total_loss": 0.10069574415683746 + }, + { + "entropy": 9.607195854187012, + "epoch": 1.4575835475578407, + "mean_token_accuracy": 0.8761062026023865, + "num_tokens": 34356497.0, + "step": 14742, + "train/ce_loss": 0.7173543572425842 + }, + { + "epoch": 1.4575835475578407, + "step": 14742, + "train/sim_loss": 0.036957740783691406 + }, + { + "epoch": 1.4575835475578407, + "step": 14742, + "train/total_loss": 0.10869317501783371 + }, + { + "entropy": 8.654059410095215, + "epoch": 1.4576824204073562, + "mean_token_accuracy": 0.7915851473808289, + "num_tokens": 34363829.0, + "step": 14743, + "train/ce_loss": 0.7364321351051331 + }, + { + "epoch": 1.4576824204073562, + "step": 14743, + "train/sim_loss": 0.06471908092498779 + }, + { + "epoch": 1.4576824204073562, + "step": 14743, + "train/total_loss": 0.13836228847503662 + }, + { + "entropy": 9.098073959350586, + "epoch": 1.4577812932568717, + "mean_token_accuracy": 0.8389021754264832, + "num_tokens": 34374365.0, + "step": 14744, + "train/ce_loss": 0.341255784034729 + }, + { + "epoch": 1.4577812932568717, + "step": 14744, + "train/sim_loss": 0.05053424835205078 + }, + { + "epoch": 1.4577812932568717, + "step": 14744, + "train/total_loss": 0.08465982973575592 + }, + { + "entropy": 8.936030387878418, + "epoch": 1.4578801661063872, + "mean_token_accuracy": 0.7985507249832153, + "num_tokens": 34385941.0, + "step": 14745, + "train/ce_loss": 1.046135425567627 + }, + { + "epoch": 1.4578801661063872, + "step": 14745, + "train/sim_loss": 0.10454332828521729 + }, + { + "epoch": 1.4578801661063872, + "step": 14745, + "train/total_loss": 0.20915687084197998 + }, + { + "entropy": 8.984359741210938, + "epoch": 1.4579790389559026, + "mean_token_accuracy": 0.8433874845504761, + "num_tokens": 34397885.0, + "step": 14746, + "train/ce_loss": 0.4887876510620117 + }, + { + "epoch": 1.4579790389559026, + "step": 14746, + "train/sim_loss": 0.01474529504776001 + }, + { + "epoch": 1.4579790389559026, + "step": 14746, + "train/total_loss": 0.0636240616440773 + }, + { + "entropy": 8.693568229675293, + "epoch": 1.4580779118054181, + "mean_token_accuracy": 0.8491570353507996, + "num_tokens": 34408433.0, + "step": 14747, + "train/ce_loss": 0.5459139943122864 + }, + { + "epoch": 1.4580779118054181, + "step": 14747, + "train/sim_loss": 0.044080138206481934 + }, + { + "epoch": 1.4580779118054181, + "step": 14747, + "train/total_loss": 0.09867154061794281 + }, + { + "entropy": 9.578787803649902, + "epoch": 1.4581767846549338, + "mean_token_accuracy": 0.7721518874168396, + "num_tokens": 34420716.0, + "step": 14748, + "train/ce_loss": 0.5468826293945312 + }, + { + "epoch": 1.4581767846549338, + "step": 14748, + "train/sim_loss": 0.0385129451751709 + }, + { + "epoch": 1.4581767846549338, + "step": 14748, + "train/total_loss": 0.09320120513439178 + }, + { + "entropy": 9.214608192443848, + "epoch": 1.4582756575044493, + "mean_token_accuracy": 0.8370044231414795, + "num_tokens": 34440635.0, + "step": 14749, + "train/ce_loss": 0.39852654933929443 + }, + { + "epoch": 1.4582756575044493, + "step": 14749, + "train/sim_loss": 0.016429364681243896 + }, + { + "epoch": 1.4582756575044493, + "step": 14749, + "train/total_loss": 0.05628202110528946 + }, + { + "entropy": 8.858333587646484, + "epoch": 1.4583745303539648, + "mean_token_accuracy": 0.8618784546852112, + "num_tokens": 34448521.0, + "step": 14750, + "train/ce_loss": 0.5373514294624329 + }, + { + "epoch": 1.4583745303539648, + "step": 14750, + "train/sim_loss": 0.03349876403808594 + }, + { + "epoch": 1.4583745303539648, + "step": 14750, + "train/total_loss": 0.08723390847444534 + }, + { + "entropy": 10.197728157043457, + "epoch": 1.4584734032034803, + "mean_token_accuracy": 0.8780889511108398, + "num_tokens": 34459344.0, + "step": 14751, + "train/ce_loss": 0.7081827521324158 + }, + { + "epoch": 1.4584734032034803, + "step": 14751, + "train/sim_loss": 0.032428741455078125 + }, + { + "epoch": 1.4584734032034803, + "step": 14751, + "train/total_loss": 0.1032470166683197 + }, + { + "entropy": 9.785977363586426, + "epoch": 1.4585722760529958, + "mean_token_accuracy": 0.8526570200920105, + "num_tokens": 34470417.0, + "step": 14752, + "train/ce_loss": 1.4684487723570783e-06 + }, + { + "epoch": 1.4585722760529958, + "step": 14752, + "train/sim_loss": 0.014370143413543701 + }, + { + "epoch": 1.4585722760529958, + "step": 14752, + "train/total_loss": 0.01437029056251049 + }, + { + "entropy": 9.498327255249023, + "epoch": 1.4586711489025115, + "mean_token_accuracy": 0.8399999737739563, + "num_tokens": 34485511.0, + "step": 14753, + "train/ce_loss": 0.31042683124542236 + }, + { + "epoch": 1.4586711489025115, + "step": 14753, + "train/sim_loss": 0.03980815410614014 + }, + { + "epoch": 1.4586711489025115, + "step": 14753, + "train/total_loss": 0.07085083425045013 + }, + { + "entropy": 9.888091087341309, + "epoch": 1.458770021752027, + "mean_token_accuracy": 0.8214285969734192, + "num_tokens": 34492700.0, + "step": 14754, + "train/ce_loss": 0.576901376247406 + }, + { + "epoch": 1.458770021752027, + "step": 14754, + "train/sim_loss": 0.040815114974975586 + }, + { + "epoch": 1.458770021752027, + "step": 14754, + "train/total_loss": 0.09850525856018066 + }, + { + "entropy": 9.39135456085205, + "epoch": 1.4588688946015425, + "mean_token_accuracy": 0.8691437840461731, + "num_tokens": 34505715.0, + "step": 14755, + "train/ce_loss": 0.47241702675819397 + }, + { + "epoch": 1.4588688946015425, + "step": 14755, + "train/sim_loss": 0.03767424821853638 + }, + { + "epoch": 1.4588688946015425, + "step": 14755, + "train/total_loss": 0.08491595089435577 + }, + { + "entropy": 9.172744750976562, + "epoch": 1.458967767451058, + "mean_token_accuracy": 0.8316559791564941, + "num_tokens": 34520504.0, + "step": 14756, + "train/ce_loss": 0.8859386444091797 + }, + { + "epoch": 1.458967767451058, + "step": 14756, + "train/sim_loss": 0.12145185470581055 + }, + { + "epoch": 1.458967767451058, + "step": 14756, + "train/total_loss": 0.210045725107193 + }, + { + "entropy": 9.137900352478027, + "epoch": 1.4590666403005734, + "mean_token_accuracy": 0.8207547068595886, + "num_tokens": 34529315.0, + "step": 14757, + "train/ce_loss": 0.40683257579803467 + }, + { + "epoch": 1.4590666403005734, + "step": 14757, + "train/sim_loss": 0.06572484970092773 + }, + { + "epoch": 1.4590666403005734, + "step": 14757, + "train/total_loss": 0.10640810430049896 + }, + { + "entropy": 9.603946685791016, + "epoch": 1.459165513150089, + "mean_token_accuracy": 0.889684796333313, + "num_tokens": 34541114.0, + "step": 14758, + "train/ce_loss": 0.4757995307445526 + }, + { + "epoch": 1.459165513150089, + "step": 14758, + "train/sim_loss": 0.05169510841369629 + }, + { + "epoch": 1.459165513150089, + "step": 14758, + "train/total_loss": 0.09927506744861603 + }, + { + "entropy": 9.119061470031738, + "epoch": 1.4592643859996044, + "mean_token_accuracy": 0.8575096130371094, + "num_tokens": 34553699.0, + "step": 14759, + "train/ce_loss": 0.5093128085136414 + }, + { + "epoch": 1.4592643859996044, + "step": 14759, + "train/sim_loss": 0.037663936614990234 + }, + { + "epoch": 1.4592643859996044, + "step": 14759, + "train/total_loss": 0.08859521895647049 + }, + { + "epoch": 1.45936325884912, + "grad_norm": 0.5129842758178711, + "learning_rate": 6.353409484250607e-06, + "loss": 0.0869, + "step": 14760 + }, + { + "entropy": 9.27376937866211, + "epoch": 1.45936325884912, + "mean_token_accuracy": 0.8632352948188782, + "num_tokens": 34573745.0, + "step": 14760, + "train/ce_loss": 0.7233640551567078 + }, + { + "epoch": 1.45936325884912, + "step": 14760, + "train/sim_loss": 0.03672987222671509 + }, + { + "epoch": 1.45936325884912, + "step": 14760, + "train/total_loss": 0.10906627774238586 + }, + { + "entropy": 9.054298400878906, + "epoch": 1.4594621316986356, + "mean_token_accuracy": 0.8145492076873779, + "num_tokens": 34581530.0, + "step": 14761, + "train/ce_loss": 0.349432110786438 + }, + { + "epoch": 1.4594621316986356, + "step": 14761, + "train/sim_loss": 0.02641618251800537 + }, + { + "epoch": 1.4594621316986356, + "step": 14761, + "train/total_loss": 0.06135939434170723 + }, + { + "entropy": 8.95485782623291, + "epoch": 1.459561004548151, + "mean_token_accuracy": 0.8694362044334412, + "num_tokens": 34593983.0, + "step": 14762, + "train/ce_loss": 0.3410196602344513 + }, + { + "epoch": 1.459561004548151, + "step": 14762, + "train/sim_loss": 0.03167611360549927 + }, + { + "epoch": 1.459561004548151, + "step": 14762, + "train/total_loss": 0.06577807664871216 + }, + { + "entropy": 9.115020751953125, + "epoch": 1.4596598773976666, + "mean_token_accuracy": 0.8444165587425232, + "num_tokens": 34606795.0, + "step": 14763, + "train/ce_loss": 0.47711890935897827 + }, + { + "epoch": 1.4596598773976666, + "step": 14763, + "train/sim_loss": 0.017623543739318848 + }, + { + "epoch": 1.4596598773976666, + "step": 14763, + "train/total_loss": 0.06533543765544891 + }, + { + "entropy": 9.53960132598877, + "epoch": 1.459758750247182, + "mean_token_accuracy": 0.8759811520576477, + "num_tokens": 34624484.0, + "step": 14764, + "train/ce_loss": 0.561225950717926 + }, + { + "epoch": 1.459758750247182, + "step": 14764, + "train/sim_loss": 0.023847579956054688 + }, + { + "epoch": 1.459758750247182, + "step": 14764, + "train/total_loss": 0.07997018098831177 + }, + { + "entropy": 9.162836074829102, + "epoch": 1.4598576230966978, + "mean_token_accuracy": 0.8651960492134094, + "num_tokens": 34630949.0, + "step": 14765, + "train/ce_loss": 0.47565796971321106 + }, + { + "epoch": 1.4598576230966978, + "step": 14765, + "train/sim_loss": 0.04266095161437988 + }, + { + "epoch": 1.4598576230966978, + "step": 14765, + "train/total_loss": 0.09022675454616547 + }, + { + "entropy": 9.701299667358398, + "epoch": 1.4599564959462132, + "mean_token_accuracy": 0.8321078419685364, + "num_tokens": 34645969.0, + "step": 14766, + "train/ce_loss": 0.3756248354911804 + }, + { + "epoch": 1.4599564959462132, + "step": 14766, + "train/sim_loss": 0.048714637756347656 + }, + { + "epoch": 1.4599564959462132, + "step": 14766, + "train/total_loss": 0.08627712726593018 + }, + { + "entropy": 9.398783683776855, + "epoch": 1.4600553687957287, + "mean_token_accuracy": 0.8917861580848694, + "num_tokens": 34652224.0, + "step": 14767, + "train/ce_loss": 4.898791985397111e-07 + }, + { + "epoch": 1.4600553687957287, + "step": 14767, + "train/sim_loss": 0.014796555042266846 + }, + { + "epoch": 1.4600553687957287, + "step": 14767, + "train/total_loss": 0.0147966044023633 + }, + { + "entropy": 9.550052642822266, + "epoch": 1.4601542416452442, + "mean_token_accuracy": 0.8372413516044617, + "num_tokens": 34663579.0, + "step": 14768, + "train/ce_loss": 1.025963544845581 + }, + { + "epoch": 1.4601542416452442, + "step": 14768, + "train/sim_loss": 0.11212599277496338 + }, + { + "epoch": 1.4601542416452442, + "step": 14768, + "train/total_loss": 0.21472235023975372 + }, + { + "entropy": 9.29251480102539, + "epoch": 1.4602531144947597, + "mean_token_accuracy": 0.8252184987068176, + "num_tokens": 34677029.0, + "step": 14769, + "train/ce_loss": 0.1956557333469391 + }, + { + "epoch": 1.4602531144947597, + "step": 14769, + "train/sim_loss": 0.08257198333740234 + }, + { + "epoch": 1.4602531144947597, + "step": 14769, + "train/total_loss": 0.10213755816221237 + }, + { + "entropy": 9.522626876831055, + "epoch": 1.4603519873442752, + "mean_token_accuracy": 0.8966101408004761, + "num_tokens": 34687734.0, + "step": 14770, + "train/ce_loss": 0.29302895069122314 + }, + { + "epoch": 1.4603519873442752, + "step": 14770, + "train/sim_loss": 0.01600182056427002 + }, + { + "epoch": 1.4603519873442752, + "step": 14770, + "train/total_loss": 0.045304715633392334 + }, + { + "entropy": 9.622587203979492, + "epoch": 1.4604508601937907, + "mean_token_accuracy": 0.8855721354484558, + "num_tokens": 34695054.0, + "step": 14771, + "train/ce_loss": 1.090433897843468e-06 + }, + { + "epoch": 1.4604508601937907, + "step": 14771, + "train/sim_loss": 0.022754669189453125 + }, + { + "epoch": 1.4604508601937907, + "step": 14771, + "train/total_loss": 0.02275477908551693 + }, + { + "entropy": 9.959922790527344, + "epoch": 1.4605497330433064, + "mean_token_accuracy": 0.8795180916786194, + "num_tokens": 34715781.0, + "step": 14772, + "train/ce_loss": 1.1144653626615764e-06 + }, + { + "epoch": 1.4605497330433064, + "step": 14772, + "train/sim_loss": 0.02628803253173828 + }, + { + "epoch": 1.4605497330433064, + "step": 14772, + "train/total_loss": 0.026288144290447235 + }, + { + "entropy": 9.630066871643066, + "epoch": 1.4606486058928219, + "mean_token_accuracy": 0.9159021377563477, + "num_tokens": 34733488.0, + "step": 14773, + "train/ce_loss": 8.180343797903333e-07 + }, + { + "epoch": 1.4606486058928219, + "step": 14773, + "train/sim_loss": 0.04629552364349365 + }, + { + "epoch": 1.4606486058928219, + "step": 14773, + "train/total_loss": 0.04629560559988022 + }, + { + "entropy": 9.093170166015625, + "epoch": 1.4607474787423373, + "mean_token_accuracy": 0.8431794047355652, + "num_tokens": 34745731.0, + "step": 14774, + "train/ce_loss": 0.19318541884422302 + }, + { + "epoch": 1.4607474787423373, + "step": 14774, + "train/sim_loss": 0.032086730003356934 + }, + { + "epoch": 1.4607474787423373, + "step": 14774, + "train/total_loss": 0.051405273377895355 + }, + { + "entropy": 9.261043548583984, + "epoch": 1.4608463515918528, + "mean_token_accuracy": 0.8852242827415466, + "num_tokens": 34755584.0, + "step": 14775, + "train/ce_loss": 0.351878821849823 + }, + { + "epoch": 1.4608463515918528, + "step": 14775, + "train/sim_loss": 0.07175701856613159 + }, + { + "epoch": 1.4608463515918528, + "step": 14775, + "train/total_loss": 0.10694490373134613 + }, + { + "entropy": 9.614445686340332, + "epoch": 1.4609452244413683, + "mean_token_accuracy": 0.8986014127731323, + "num_tokens": 34775414.0, + "step": 14776, + "train/ce_loss": 0.46676337718963623 + }, + { + "epoch": 1.4609452244413683, + "step": 14776, + "train/sim_loss": 0.022000133991241455 + }, + { + "epoch": 1.4609452244413683, + "step": 14776, + "train/total_loss": 0.06867647171020508 + }, + { + "entropy": 9.303205490112305, + "epoch": 1.461044097290884, + "mean_token_accuracy": 0.8091334700584412, + "num_tokens": 34788190.0, + "step": 14777, + "train/ce_loss": 0.9384104013442993 + }, + { + "epoch": 1.461044097290884, + "step": 14777, + "train/sim_loss": 0.03526425361633301 + }, + { + "epoch": 1.461044097290884, + "step": 14777, + "train/total_loss": 0.12910529971122742 + }, + { + "entropy": 9.67520809173584, + "epoch": 1.4611429701403995, + "mean_token_accuracy": 0.8152011632919312, + "num_tokens": 34796663.0, + "step": 14778, + "train/ce_loss": 4.95302515446383e-07 + }, + { + "epoch": 1.4611429701403995, + "step": 14778, + "train/sim_loss": 0.01524275541305542 + }, + { + "epoch": 1.4611429701403995, + "step": 14778, + "train/total_loss": 0.015242804773151875 + }, + { + "entropy": 9.422597885131836, + "epoch": 1.461241842989915, + "mean_token_accuracy": 0.8122363090515137, + "num_tokens": 34804801.0, + "step": 14779, + "train/ce_loss": 1.204451560974121 + }, + { + "epoch": 1.461241842989915, + "step": 14779, + "train/sim_loss": 0.019783496856689453 + }, + { + "epoch": 1.461241842989915, + "step": 14779, + "train/total_loss": 0.14022865891456604 + }, + { + "epoch": 1.4613407158394305, + "grad_norm": 0.7315090298652649, + "learning_rate": 6.348464619492657e-06, + "loss": 0.0799, + "step": 14780 + }, + { + "entropy": 8.878735542297363, + "epoch": 1.4613407158394305, + "mean_token_accuracy": 0.8888888955116272, + "num_tokens": 34820287.0, + "step": 14780, + "train/ce_loss": 0.31656861305236816 + }, + { + "epoch": 1.4613407158394305, + "step": 14780, + "train/sim_loss": 0.04177892208099365 + }, + { + "epoch": 1.4613407158394305, + "step": 14780, + "train/total_loss": 0.07343578338623047 + }, + { + "entropy": 9.438133239746094, + "epoch": 1.461439588688946, + "mean_token_accuracy": 0.8171091675758362, + "num_tokens": 34826851.0, + "step": 14781, + "train/ce_loss": 0.7640236616134644 + }, + { + "epoch": 1.461439588688946, + "step": 14781, + "train/sim_loss": 0.04104161262512207 + }, + { + "epoch": 1.461439588688946, + "step": 14781, + "train/total_loss": 0.1174439787864685 + }, + { + "entropy": 9.424406051635742, + "epoch": 1.4615384615384617, + "mean_token_accuracy": 0.8611713647842407, + "num_tokens": 34840296.0, + "step": 14782, + "train/ce_loss": 0.21209706366062164 + }, + { + "epoch": 1.4615384615384617, + "step": 14782, + "train/sim_loss": 0.0563654899597168 + }, + { + "epoch": 1.4615384615384617, + "step": 14782, + "train/total_loss": 0.0775751993060112 + }, + { + "entropy": 9.107901573181152, + "epoch": 1.461637334387977, + "mean_token_accuracy": 0.8428974747657776, + "num_tokens": 34851772.0, + "step": 14783, + "train/ce_loss": 0.49503231048583984 + }, + { + "epoch": 1.461637334387977, + "step": 14783, + "train/sim_loss": 0.026537656784057617 + }, + { + "epoch": 1.461637334387977, + "step": 14783, + "train/total_loss": 0.07604089379310608 + }, + { + "entropy": 9.553415298461914, + "epoch": 1.4617362072374926, + "mean_token_accuracy": 0.8927152156829834, + "num_tokens": 34862985.0, + "step": 14784, + "train/ce_loss": 0.3983820378780365 + }, + { + "epoch": 1.4617362072374926, + "step": 14784, + "train/sim_loss": 0.021099090576171875 + }, + { + "epoch": 1.4617362072374926, + "step": 14784, + "train/total_loss": 0.060937296599149704 + }, + { + "entropy": 9.492406845092773, + "epoch": 1.4618350800870081, + "mean_token_accuracy": 0.8650674819946289, + "num_tokens": 34871761.0, + "step": 14785, + "train/ce_loss": 0.5387603640556335 + }, + { + "epoch": 1.4618350800870081, + "step": 14785, + "train/sim_loss": 0.06555759906768799 + }, + { + "epoch": 1.4618350800870081, + "step": 14785, + "train/total_loss": 0.11943364143371582 + }, + { + "entropy": 9.390578269958496, + "epoch": 1.4619339529365236, + "mean_token_accuracy": 0.8935483694076538, + "num_tokens": 34882874.0, + "step": 14786, + "train/ce_loss": 3.7216866530798143e-06 + }, + { + "epoch": 1.4619339529365236, + "step": 14786, + "train/sim_loss": 0.059984803199768066 + }, + { + "epoch": 1.4619339529365236, + "step": 14786, + "train/total_loss": 0.05998517572879791 + }, + { + "entropy": 9.455252647399902, + "epoch": 1.462032825786039, + "mean_token_accuracy": 0.9299191236495972, + "num_tokens": 34902121.0, + "step": 14787, + "train/ce_loss": 5.655064114762354e-07 + }, + { + "epoch": 1.462032825786039, + "step": 14787, + "train/sim_loss": 0.0522005558013916 + }, + { + "epoch": 1.462032825786039, + "step": 14787, + "train/total_loss": 0.05220061168074608 + }, + { + "entropy": 9.234516143798828, + "epoch": 1.4621316986355546, + "mean_token_accuracy": 0.8388554453849792, + "num_tokens": 34912482.0, + "step": 14788, + "train/ce_loss": 0.8372805714607239 + }, + { + "epoch": 1.4621316986355546, + "step": 14788, + "train/sim_loss": 0.08830845355987549 + }, + { + "epoch": 1.4621316986355546, + "step": 14788, + "train/total_loss": 0.17203651368618011 + }, + { + "entropy": 9.47305679321289, + "epoch": 1.4622305714850703, + "mean_token_accuracy": 0.7919621467590332, + "num_tokens": 34924206.0, + "step": 14789, + "train/ce_loss": 0.8091756701469421 + }, + { + "epoch": 1.4622305714850703, + "step": 14789, + "train/sim_loss": 0.04920041561126709 + }, + { + "epoch": 1.4622305714850703, + "step": 14789, + "train/total_loss": 0.1301179826259613 + }, + { + "entropy": 9.620367050170898, + "epoch": 1.4623294443345858, + "mean_token_accuracy": 0.8629737496376038, + "num_tokens": 34937365.0, + "step": 14790, + "train/ce_loss": 0.4107508063316345 + }, + { + "epoch": 1.4623294443345858, + "step": 14790, + "train/sim_loss": 0.04711043834686279 + }, + { + "epoch": 1.4623294443345858, + "step": 14790, + "train/total_loss": 0.08818551898002625 + }, + { + "entropy": 9.066823959350586, + "epoch": 1.4624283171841013, + "mean_token_accuracy": 0.8249452710151672, + "num_tokens": 34948349.0, + "step": 14791, + "train/ce_loss": 0.44929635524749756 + }, + { + "epoch": 1.4624283171841013, + "step": 14791, + "train/sim_loss": 0.03149139881134033 + }, + { + "epoch": 1.4624283171841013, + "step": 14791, + "train/total_loss": 0.07642103731632233 + }, + { + "entropy": 9.280540466308594, + "epoch": 1.4625271900336168, + "mean_token_accuracy": 0.7688053250312805, + "num_tokens": 34962942.0, + "step": 14792, + "train/ce_loss": 0.7832919955253601 + }, + { + "epoch": 1.4625271900336168, + "step": 14792, + "train/sim_loss": 0.029217243194580078 + }, + { + "epoch": 1.4625271900336168, + "step": 14792, + "train/total_loss": 0.10754644125699997 + }, + { + "entropy": 9.591526985168457, + "epoch": 1.4626260628831322, + "mean_token_accuracy": 0.9696969985961914, + "num_tokens": 34974015.0, + "step": 14793, + "train/ce_loss": 0.4824974536895752 + }, + { + "epoch": 1.4626260628831322, + "step": 14793, + "train/sim_loss": 0.03366959095001221 + }, + { + "epoch": 1.4626260628831322, + "step": 14793, + "train/total_loss": 0.0819193422794342 + }, + { + "entropy": 9.3763427734375, + "epoch": 1.462724935732648, + "mean_token_accuracy": 0.8891928791999817, + "num_tokens": 34991259.0, + "step": 14794, + "train/ce_loss": 0.5092003345489502 + }, + { + "epoch": 1.462724935732648, + "step": 14794, + "train/sim_loss": 0.03762471675872803 + }, + { + "epoch": 1.462724935732648, + "step": 14794, + "train/total_loss": 0.08854475617408752 + }, + { + "entropy": 9.13394546508789, + "epoch": 1.4628238085821632, + "mean_token_accuracy": 0.8983516693115234, + "num_tokens": 35003494.0, + "step": 14795, + "train/ce_loss": 0.2525918781757355 + }, + { + "epoch": 1.4628238085821632, + "step": 14795, + "train/sim_loss": 0.013395428657531738 + }, + { + "epoch": 1.4628238085821632, + "step": 14795, + "train/total_loss": 0.038654617965221405 + }, + { + "entropy": 9.225605964660645, + "epoch": 1.462922681431679, + "mean_token_accuracy": 0.8454545736312866, + "num_tokens": 35017523.0, + "step": 14796, + "train/ce_loss": 0.595008909702301 + }, + { + "epoch": 1.462922681431679, + "step": 14796, + "train/sim_loss": 0.0192413330078125 + }, + { + "epoch": 1.462922681431679, + "step": 14796, + "train/total_loss": 0.07874222099781036 + }, + { + "entropy": 9.21419906616211, + "epoch": 1.4630215542811944, + "mean_token_accuracy": 0.8316455483436584, + "num_tokens": 35028118.0, + "step": 14797, + "train/ce_loss": 0.4041503071784973 + }, + { + "epoch": 1.4630215542811944, + "step": 14797, + "train/sim_loss": 0.06986725330352783 + }, + { + "epoch": 1.4630215542811944, + "step": 14797, + "train/total_loss": 0.1102822870016098 + }, + { + "entropy": 9.61075210571289, + "epoch": 1.4631204271307099, + "mean_token_accuracy": 0.8436018824577332, + "num_tokens": 35043989.0, + "step": 14798, + "train/ce_loss": 0.8005504012107849 + }, + { + "epoch": 1.4631204271307099, + "step": 14798, + "train/sim_loss": 0.06460416316986084 + }, + { + "epoch": 1.4631204271307099, + "step": 14798, + "train/total_loss": 0.14465920627117157 + }, + { + "entropy": 9.610732078552246, + "epoch": 1.4632192999802254, + "mean_token_accuracy": 0.82492995262146, + "num_tokens": 35051760.0, + "step": 14799, + "train/ce_loss": 0.7277933955192566 + }, + { + "epoch": 1.4632192999802254, + "step": 14799, + "train/sim_loss": 0.10114490985870361 + }, + { + "epoch": 1.4632192999802254, + "step": 14799, + "train/total_loss": 0.1739242523908615 + }, + { + "epoch": 1.4633181728297409, + "grad_norm": 0.5764335989952087, + "learning_rate": 6.3435197547347085e-06, + "loss": 0.0812, + "step": 14800 + }, + { + "entropy": 9.02776050567627, + "epoch": 1.4633181728297409, + "mean_token_accuracy": 0.881428599357605, + "num_tokens": 35067536.0, + "step": 14800, + "train/ce_loss": 3.9440638488486e-07 + }, + { + "epoch": 1.4633181728297409, + "step": 14800, + "train/sim_loss": 0.0182456374168396 + }, + { + "epoch": 1.4633181728297409, + "step": 14800, + "train/total_loss": 0.018245676532387733 + }, + { + "entropy": 8.912018775939941, + "epoch": 1.4634170456792566, + "mean_token_accuracy": 0.840512216091156, + "num_tokens": 35077058.0, + "step": 14801, + "train/ce_loss": 0.15911506116390228 + }, + { + "epoch": 1.4634170456792566, + "step": 14801, + "train/sim_loss": 0.032276153564453125 + }, + { + "epoch": 1.4634170456792566, + "step": 14801, + "train/total_loss": 0.048187658190727234 + }, + { + "entropy": 9.46267318725586, + "epoch": 1.463515918528772, + "mean_token_accuracy": 0.8160136342048645, + "num_tokens": 35084882.0, + "step": 14802, + "train/ce_loss": 0.5270261764526367 + }, + { + "epoch": 1.463515918528772, + "step": 14802, + "train/sim_loss": 0.02388608455657959 + }, + { + "epoch": 1.463515918528772, + "step": 14802, + "train/total_loss": 0.0765887051820755 + }, + { + "entropy": 9.130023002624512, + "epoch": 1.4636147913782875, + "mean_token_accuracy": 0.8400460481643677, + "num_tokens": 35094788.0, + "step": 14803, + "train/ce_loss": 0.4503009617328644 + }, + { + "epoch": 1.4636147913782875, + "step": 14803, + "train/sim_loss": 0.051133573055267334 + }, + { + "epoch": 1.4636147913782875, + "step": 14803, + "train/total_loss": 0.09616367518901825 + }, + { + "entropy": 9.089750289916992, + "epoch": 1.463713664227803, + "mean_token_accuracy": 0.8506731986999512, + "num_tokens": 35107762.0, + "step": 14804, + "train/ce_loss": 0.35815566778182983 + }, + { + "epoch": 1.463713664227803, + "step": 14804, + "train/sim_loss": 0.03478187322616577 + }, + { + "epoch": 1.463713664227803, + "step": 14804, + "train/total_loss": 0.07059744000434875 + }, + { + "entropy": 9.19051742553711, + "epoch": 1.4638125370773185, + "mean_token_accuracy": 0.8664302825927734, + "num_tokens": 35120569.0, + "step": 14805, + "train/ce_loss": 0.41899436712265015 + }, + { + "epoch": 1.4638125370773185, + "step": 14805, + "train/sim_loss": 0.03034263849258423 + }, + { + "epoch": 1.4638125370773185, + "step": 14805, + "train/total_loss": 0.07224208116531372 + }, + { + "entropy": 9.507523536682129, + "epoch": 1.4639114099268342, + "mean_token_accuracy": 0.8155339956283569, + "num_tokens": 35139576.0, + "step": 14806, + "train/ce_loss": 0.520013153553009 + }, + { + "epoch": 1.4639114099268342, + "step": 14806, + "train/sim_loss": 0.02943110466003418 + }, + { + "epoch": 1.4639114099268342, + "step": 14806, + "train/total_loss": 0.08143241703510284 + }, + { + "entropy": 9.501909255981445, + "epoch": 1.4640102827763495, + "mean_token_accuracy": 0.8808724880218506, + "num_tokens": 35152384.0, + "step": 14807, + "train/ce_loss": 0.33149170875549316 + }, + { + "epoch": 1.4640102827763495, + "step": 14807, + "train/sim_loss": 0.02794194221496582 + }, + { + "epoch": 1.4640102827763495, + "step": 14807, + "train/total_loss": 0.061091113835573196 + }, + { + "entropy": 8.860761642456055, + "epoch": 1.4641091556258652, + "mean_token_accuracy": 0.858397364616394, + "num_tokens": 35162403.0, + "step": 14808, + "train/ce_loss": 0.3849569857120514 + }, + { + "epoch": 1.4641091556258652, + "step": 14808, + "train/sim_loss": 0.016349077224731445 + }, + { + "epoch": 1.4641091556258652, + "step": 14808, + "train/total_loss": 0.054844778031110764 + }, + { + "entropy": 9.110429763793945, + "epoch": 1.4642080284753807, + "mean_token_accuracy": 0.8807439804077148, + "num_tokens": 35178474.0, + "step": 14809, + "train/ce_loss": 0.4971686005592346 + }, + { + "epoch": 1.4642080284753807, + "step": 14809, + "train/sim_loss": 0.026354074478149414 + }, + { + "epoch": 1.4642080284753807, + "step": 14809, + "train/total_loss": 0.07607093453407288 + }, + { + "entropy": 9.379919052124023, + "epoch": 1.4643069013248962, + "mean_token_accuracy": 0.7931034564971924, + "num_tokens": 35196401.0, + "step": 14810, + "train/ce_loss": 0.9857801198959351 + }, + { + "epoch": 1.4643069013248962, + "step": 14810, + "train/sim_loss": 0.07069110870361328 + }, + { + "epoch": 1.4643069013248962, + "step": 14810, + "train/total_loss": 0.1692691147327423 + }, + { + "entropy": 9.459428787231445, + "epoch": 1.4644057741744116, + "mean_token_accuracy": 0.9210526347160339, + "num_tokens": 35209004.0, + "step": 14811, + "train/ce_loss": 0.35000747442245483 + }, + { + "epoch": 1.4644057741744116, + "step": 14811, + "train/sim_loss": 0.05050063133239746 + }, + { + "epoch": 1.4644057741744116, + "step": 14811, + "train/total_loss": 0.08550138026475906 + }, + { + "entropy": 9.397123336791992, + "epoch": 1.4645046470239271, + "mean_token_accuracy": 0.8850967288017273, + "num_tokens": 35225863.0, + "step": 14812, + "train/ce_loss": 0.33004242181777954 + }, + { + "epoch": 1.4645046470239271, + "step": 14812, + "train/sim_loss": 0.0833742618560791 + }, + { + "epoch": 1.4645046470239271, + "step": 14812, + "train/total_loss": 0.11637850105762482 + }, + { + "entropy": 9.250324249267578, + "epoch": 1.4646035198734428, + "mean_token_accuracy": 0.8629921078681946, + "num_tokens": 35234275.0, + "step": 14813, + "train/ce_loss": 0.5417956113815308 + }, + { + "epoch": 1.4646035198734428, + "step": 14813, + "train/sim_loss": 0.021661758422851562 + }, + { + "epoch": 1.4646035198734428, + "step": 14813, + "train/total_loss": 0.07584132254123688 + }, + { + "entropy": 9.478407859802246, + "epoch": 1.4647023927229583, + "mean_token_accuracy": 0.8330494165420532, + "num_tokens": 35242975.0, + "step": 14814, + "train/ce_loss": 0.5936204195022583 + }, + { + "epoch": 1.4647023927229583, + "step": 14814, + "train/sim_loss": 0.05034935474395752 + }, + { + "epoch": 1.4647023927229583, + "step": 14814, + "train/total_loss": 0.10971139371395111 + }, + { + "entropy": 9.34897518157959, + "epoch": 1.4648012655724738, + "mean_token_accuracy": 0.8668555021286011, + "num_tokens": 35254786.0, + "step": 14815, + "train/ce_loss": 0.5993205308914185 + }, + { + "epoch": 1.4648012655724738, + "step": 14815, + "train/sim_loss": 0.028681397438049316 + }, + { + "epoch": 1.4648012655724738, + "step": 14815, + "train/total_loss": 0.08861345052719116 + }, + { + "entropy": 9.206274032592773, + "epoch": 1.4649001384219893, + "mean_token_accuracy": 0.8338727355003357, + "num_tokens": 35269315.0, + "step": 14816, + "train/ce_loss": 0.3642207384109497 + }, + { + "epoch": 1.4649001384219893, + "step": 14816, + "train/sim_loss": 0.023010671138763428 + }, + { + "epoch": 1.4649001384219893, + "step": 14816, + "train/total_loss": 0.0594327449798584 + }, + { + "entropy": 9.479185104370117, + "epoch": 1.4649990112715048, + "mean_token_accuracy": 0.856692910194397, + "num_tokens": 35277676.0, + "step": 14817, + "train/ce_loss": 0.5687761902809143 + }, + { + "epoch": 1.4649990112715048, + "step": 14817, + "train/sim_loss": 0.01198434829711914 + }, + { + "epoch": 1.4649990112715048, + "step": 14817, + "train/total_loss": 0.06886196881532669 + }, + { + "entropy": 9.39988899230957, + "epoch": 1.4650978841210205, + "mean_token_accuracy": 0.8778337240219116, + "num_tokens": 35289514.0, + "step": 14818, + "train/ce_loss": 0.4156968891620636 + }, + { + "epoch": 1.4650978841210205, + "step": 14818, + "train/sim_loss": 0.06565463542938232 + }, + { + "epoch": 1.4650978841210205, + "step": 14818, + "train/total_loss": 0.10722433030605316 + }, + { + "entropy": 9.373445510864258, + "epoch": 1.465196756970536, + "mean_token_accuracy": 0.8282442688941956, + "num_tokens": 35303619.0, + "step": 14819, + "train/ce_loss": 0.3941057324409485 + }, + { + "epoch": 1.465196756970536, + "step": 14819, + "train/sim_loss": 0.01626211404800415 + }, + { + "epoch": 1.465196756970536, + "step": 14819, + "train/total_loss": 0.05567268654704094 + }, + { + "epoch": 1.4652956298200515, + "grad_norm": 0.7071399688720703, + "learning_rate": 6.33857488997676e-06, + "loss": 0.0862, + "step": 14820 + }, + { + "entropy": 9.5513277053833, + "epoch": 1.4652956298200515, + "mean_token_accuracy": 0.9273743033409119, + "num_tokens": 35311893.0, + "step": 14820, + "train/ce_loss": 1.3327556871445267e-06 + }, + { + "epoch": 1.4652956298200515, + "step": 14820, + "train/sim_loss": 0.02373450994491577 + }, + { + "epoch": 1.4652956298200515, + "step": 14820, + "train/total_loss": 0.023734644055366516 + }, + { + "entropy": 8.7789306640625, + "epoch": 1.465394502669567, + "mean_token_accuracy": 0.881028950214386, + "num_tokens": 35325172.0, + "step": 14821, + "train/ce_loss": 0.31968578696250916 + }, + { + "epoch": 1.465394502669567, + "step": 14821, + "train/sim_loss": 0.018020153045654297 + }, + { + "epoch": 1.465394502669567, + "step": 14821, + "train/total_loss": 0.04998873174190521 + }, + { + "entropy": 9.531147956848145, + "epoch": 1.4654933755190824, + "mean_token_accuracy": 0.8817567825317383, + "num_tokens": 35337922.0, + "step": 14822, + "train/ce_loss": 0.2843243479728699 + }, + { + "epoch": 1.4654933755190824, + "step": 14822, + "train/sim_loss": 0.04303324222564697 + }, + { + "epoch": 1.4654933755190824, + "step": 14822, + "train/total_loss": 0.07146567851305008 + }, + { + "entropy": 8.955179214477539, + "epoch": 1.465592248368598, + "mean_token_accuracy": 0.8393480777740479, + "num_tokens": 35345484.0, + "step": 14823, + "train/ce_loss": 0.4255801737308502 + }, + { + "epoch": 1.465592248368598, + "step": 14823, + "train/sim_loss": 0.012681722640991211 + }, + { + "epoch": 1.465592248368598, + "step": 14823, + "train/total_loss": 0.05523974075913429 + }, + { + "entropy": 9.28305721282959, + "epoch": 1.4656911212181134, + "mean_token_accuracy": 0.8372092843055725, + "num_tokens": 35362110.0, + "step": 14824, + "train/ce_loss": 0.42037466168403625 + }, + { + "epoch": 1.4656911212181134, + "step": 14824, + "train/sim_loss": 0.02214103937149048 + }, + { + "epoch": 1.4656911212181134, + "step": 14824, + "train/total_loss": 0.06417851150035858 + }, + { + "entropy": 9.123416900634766, + "epoch": 1.465789994067629, + "mean_token_accuracy": 0.838470995426178, + "num_tokens": 35374721.0, + "step": 14825, + "train/ce_loss": 0.45695963501930237 + }, + { + "epoch": 1.465789994067629, + "step": 14825, + "train/sim_loss": 0.061879754066467285 + }, + { + "epoch": 1.465789994067629, + "step": 14825, + "train/total_loss": 0.10757571458816528 + }, + { + "entropy": 9.602636337280273, + "epoch": 1.4658888669171446, + "mean_token_accuracy": 0.8828282952308655, + "num_tokens": 35384107.0, + "step": 14826, + "train/ce_loss": 6.926028390807915e-07 + }, + { + "epoch": 1.4658888669171446, + "step": 14826, + "train/sim_loss": 0.05047029256820679 + }, + { + "epoch": 1.4658888669171446, + "step": 14826, + "train/total_loss": 0.05047036334872246 + }, + { + "entropy": 9.603254318237305, + "epoch": 1.46598773976666, + "mean_token_accuracy": 0.8818040490150452, + "num_tokens": 35391744.0, + "step": 14827, + "train/ce_loss": 0.31017792224884033 + }, + { + "epoch": 1.46598773976666, + "step": 14827, + "train/sim_loss": 0.0258256196975708 + }, + { + "epoch": 1.46598773976666, + "step": 14827, + "train/total_loss": 0.05684341490268707 + }, + { + "entropy": 9.243420600891113, + "epoch": 1.4660866126161756, + "mean_token_accuracy": 0.8519737124443054, + "num_tokens": 35397602.0, + "step": 14828, + "train/ce_loss": 0.6092718243598938 + }, + { + "epoch": 1.4660866126161756, + "step": 14828, + "train/sim_loss": 0.018258094787597656 + }, + { + "epoch": 1.4660866126161756, + "step": 14828, + "train/total_loss": 0.07918527722358704 + }, + { + "entropy": 9.275784492492676, + "epoch": 1.466185485465691, + "mean_token_accuracy": 0.8209607005119324, + "num_tokens": 35404655.0, + "step": 14829, + "train/ce_loss": 1.2992373967790627e-06 + }, + { + "epoch": 1.466185485465691, + "step": 14829, + "train/sim_loss": 0.04918062686920166 + }, + { + "epoch": 1.466185485465691, + "step": 14829, + "train/total_loss": 0.049180757254362106 + }, + { + "entropy": 10.052253723144531, + "epoch": 1.4662843583152068, + "mean_token_accuracy": 0.8743842244148254, + "num_tokens": 35412873.0, + "step": 14830, + "train/ce_loss": 5.437743766378844e-07 + }, + { + "epoch": 1.4662843583152068, + "step": 14830, + "train/sim_loss": 0.01240074634552002 + }, + { + "epoch": 1.4662843583152068, + "step": 14830, + "train/total_loss": 0.012400800362229347 + }, + { + "entropy": 9.084867477416992, + "epoch": 1.4663832311647222, + "mean_token_accuracy": 0.8404255509376526, + "num_tokens": 35422095.0, + "step": 14831, + "train/ce_loss": 0.6204336285591125 + }, + { + "epoch": 1.4663832311647222, + "step": 14831, + "train/sim_loss": 0.0815308690071106 + }, + { + "epoch": 1.4663832311647222, + "step": 14831, + "train/total_loss": 0.14357423782348633 + }, + { + "entropy": 9.528806686401367, + "epoch": 1.4664821040142377, + "mean_token_accuracy": 0.8120185136795044, + "num_tokens": 35435984.0, + "step": 14832, + "train/ce_loss": 0.4866643249988556 + }, + { + "epoch": 1.4664821040142377, + "step": 14832, + "train/sim_loss": 0.052810072898864746 + }, + { + "epoch": 1.4664821040142377, + "step": 14832, + "train/total_loss": 0.1014765053987503 + }, + { + "entropy": 9.21518325805664, + "epoch": 1.4665809768637532, + "mean_token_accuracy": 0.8628719449043274, + "num_tokens": 35452280.0, + "step": 14833, + "train/ce_loss": 0.34231436252593994 + }, + { + "epoch": 1.4665809768637532, + "step": 14833, + "train/sim_loss": 0.02529776096343994 + }, + { + "epoch": 1.4665809768637532, + "step": 14833, + "train/total_loss": 0.059529196470975876 + }, + { + "entropy": 9.647212028503418, + "epoch": 1.4666798497132687, + "mean_token_accuracy": 0.8573668003082275, + "num_tokens": 35467315.0, + "step": 14834, + "train/ce_loss": 0.7123132944107056 + }, + { + "epoch": 1.4666798497132687, + "step": 14834, + "train/sim_loss": 0.04391217231750488 + }, + { + "epoch": 1.4666798497132687, + "step": 14834, + "train/total_loss": 0.11514350026845932 + }, + { + "entropy": 9.308540344238281, + "epoch": 1.4667787225627842, + "mean_token_accuracy": 0.8154981732368469, + "num_tokens": 35478901.0, + "step": 14835, + "train/ce_loss": 0.6576448678970337 + }, + { + "epoch": 1.4667787225627842, + "step": 14835, + "train/sim_loss": 0.0451434850692749 + }, + { + "epoch": 1.4667787225627842, + "step": 14835, + "train/total_loss": 0.11090797185897827 + }, + { + "entropy": 9.278716087341309, + "epoch": 1.4668775954122997, + "mean_token_accuracy": 0.8064876794815063, + "num_tokens": 35489735.0, + "step": 14836, + "train/ce_loss": 0.45187264680862427 + }, + { + "epoch": 1.4668775954122997, + "step": 14836, + "train/sim_loss": 0.06654369831085205 + }, + { + "epoch": 1.4668775954122997, + "step": 14836, + "train/total_loss": 0.11173096299171448 + }, + { + "entropy": 9.37168025970459, + "epoch": 1.4669764682618154, + "mean_token_accuracy": 0.8518518805503845, + "num_tokens": 35499422.0, + "step": 14837, + "train/ce_loss": 0.3206202983856201 + }, + { + "epoch": 1.4669764682618154, + "step": 14837, + "train/sim_loss": 0.052436232566833496 + }, + { + "epoch": 1.4669764682618154, + "step": 14837, + "train/total_loss": 0.08449826389551163 + }, + { + "entropy": 9.112443923950195, + "epoch": 1.4670753411113309, + "mean_token_accuracy": 0.8939393758773804, + "num_tokens": 35507256.0, + "step": 14838, + "train/ce_loss": 0.3515181839466095 + }, + { + "epoch": 1.4670753411113309, + "step": 14838, + "train/sim_loss": 0.012353062629699707 + }, + { + "epoch": 1.4670753411113309, + "step": 14838, + "train/total_loss": 0.047504883259534836 + }, + { + "entropy": 9.351481437683105, + "epoch": 1.4671742139608464, + "mean_token_accuracy": 0.8472418785095215, + "num_tokens": 35520961.0, + "step": 14839, + "train/ce_loss": 0.25754714012145996 + }, + { + "epoch": 1.4671742139608464, + "step": 14839, + "train/sim_loss": 0.01676464080810547 + }, + { + "epoch": 1.4671742139608464, + "step": 14839, + "train/total_loss": 0.042519353330135345 + }, + { + "epoch": 1.4672730868103618, + "grad_norm": 0.5989866852760315, + "learning_rate": 6.333630025218811e-06, + "loss": 0.0835, + "step": 14840 + }, + { + "entropy": 10.022928237915039, + "epoch": 1.4672730868103618, + "mean_token_accuracy": 0.894505500793457, + "num_tokens": 35527359.0, + "step": 14840, + "train/ce_loss": 0.6121292114257812 + }, + { + "epoch": 1.4672730868103618, + "step": 14840, + "train/sim_loss": 0.0117262601852417 + }, + { + "epoch": 1.4672730868103618, + "step": 14840, + "train/total_loss": 0.0729391872882843 + }, + { + "entropy": 9.103287696838379, + "epoch": 1.4673719596598773, + "mean_token_accuracy": 0.8360655903816223, + "num_tokens": 35542743.0, + "step": 14841, + "train/ce_loss": 0.1528797298669815 + }, + { + "epoch": 1.4673719596598773, + "step": 14841, + "train/sim_loss": 0.025919079780578613 + }, + { + "epoch": 1.4673719596598773, + "step": 14841, + "train/total_loss": 0.041207052767276764 + }, + { + "entropy": 9.335341453552246, + "epoch": 1.467470832509393, + "mean_token_accuracy": 0.8461538553237915, + "num_tokens": 35554900.0, + "step": 14842, + "train/ce_loss": 0.379677951335907 + }, + { + "epoch": 1.467470832509393, + "step": 14842, + "train/sim_loss": 0.018487215042114258 + }, + { + "epoch": 1.467470832509393, + "step": 14842, + "train/total_loss": 0.056455012410879135 + }, + { + "entropy": 9.822649002075195, + "epoch": 1.4675697053589085, + "mean_token_accuracy": 0.8456299901008606, + "num_tokens": 35569459.0, + "step": 14843, + "train/ce_loss": 0.43627792596817017 + }, + { + "epoch": 1.4675697053589085, + "step": 14843, + "train/sim_loss": 0.0343860387802124 + }, + { + "epoch": 1.4675697053589085, + "step": 14843, + "train/total_loss": 0.0780138373374939 + }, + { + "entropy": 9.779891014099121, + "epoch": 1.467668578208424, + "mean_token_accuracy": 0.9010791182518005, + "num_tokens": 35581659.0, + "step": 14844, + "train/ce_loss": 1.4295549135567853e-06 + }, + { + "epoch": 1.467668578208424, + "step": 14844, + "train/sim_loss": 0.03687453269958496 + }, + { + "epoch": 1.467668578208424, + "step": 14844, + "train/total_loss": 0.0368746742606163 + }, + { + "entropy": 9.111077308654785, + "epoch": 1.4677674510579395, + "mean_token_accuracy": 0.8738049864768982, + "num_tokens": 35591404.0, + "step": 14845, + "train/ce_loss": 1.0703278121582116e-06 + }, + { + "epoch": 1.4677674510579395, + "step": 14845, + "train/sim_loss": 0.022464334964752197 + }, + { + "epoch": 1.4677674510579395, + "step": 14845, + "train/total_loss": 0.022464441135525703 + }, + { + "entropy": 9.307188034057617, + "epoch": 1.467866323907455, + "mean_token_accuracy": 0.8359172940254211, + "num_tokens": 35608313.0, + "step": 14846, + "train/ce_loss": 0.7483430504798889 + }, + { + "epoch": 1.467866323907455, + "step": 14846, + "train/sim_loss": 0.04977595806121826 + }, + { + "epoch": 1.467866323907455, + "step": 14846, + "train/total_loss": 0.12461026757955551 + }, + { + "entropy": 9.739642143249512, + "epoch": 1.4679651967569705, + "mean_token_accuracy": 0.8081493973731995, + "num_tokens": 35627893.0, + "step": 14847, + "train/ce_loss": 0.7165751457214355 + }, + { + "epoch": 1.4679651967569705, + "step": 14847, + "train/sim_loss": 0.03913170099258423 + }, + { + "epoch": 1.4679651967569705, + "step": 14847, + "train/total_loss": 0.1107892170548439 + }, + { + "entropy": 9.102151870727539, + "epoch": 1.468064069606486, + "mean_token_accuracy": 0.8592000007629395, + "num_tokens": 35636378.0, + "step": 14848, + "train/ce_loss": 1.2457885532057844e-06 + }, + { + "epoch": 1.468064069606486, + "step": 14848, + "train/sim_loss": 0.033187270164489746 + }, + { + "epoch": 1.468064069606486, + "step": 14848, + "train/total_loss": 0.033187393099069595 + }, + { + "entropy": 9.521984100341797, + "epoch": 1.4681629424560017, + "mean_token_accuracy": 0.8363893032073975, + "num_tokens": 35646256.0, + "step": 14849, + "train/ce_loss": 0.22771966457366943 + }, + { + "epoch": 1.4681629424560017, + "step": 14849, + "train/sim_loss": 0.05394190549850464 + }, + { + "epoch": 1.4681629424560017, + "step": 14849, + "train/total_loss": 0.07671387493610382 + }, + { + "entropy": 9.336666107177734, + "epoch": 1.4682618153055171, + "mean_token_accuracy": 0.8657894730567932, + "num_tokens": 35664040.0, + "step": 14850, + "train/ce_loss": 0.41519710421562195 + }, + { + "epoch": 1.4682618153055171, + "step": 14850, + "train/sim_loss": 0.0887877345085144 + }, + { + "epoch": 1.4682618153055171, + "step": 14850, + "train/total_loss": 0.13030745089054108 + }, + { + "entropy": 9.265832901000977, + "epoch": 1.4683606881550326, + "mean_token_accuracy": 0.8190476298332214, + "num_tokens": 35678145.0, + "step": 14851, + "train/ce_loss": 0.9873136878013611 + }, + { + "epoch": 1.4683606881550326, + "step": 14851, + "train/sim_loss": 0.05678683519363403 + }, + { + "epoch": 1.4683606881550326, + "step": 14851, + "train/total_loss": 0.15551820397377014 + }, + { + "entropy": 9.20113754272461, + "epoch": 1.468459561004548, + "mean_token_accuracy": 0.8559113144874573, + "num_tokens": 35691636.0, + "step": 14852, + "train/ce_loss": 0.19738559424877167 + }, + { + "epoch": 1.468459561004548, + "step": 14852, + "train/sim_loss": 0.10797858238220215 + }, + { + "epoch": 1.468459561004548, + "step": 14852, + "train/total_loss": 0.12771713733673096 + }, + { + "entropy": 9.51856803894043, + "epoch": 1.4685584338540636, + "mean_token_accuracy": 0.8911704421043396, + "num_tokens": 35706523.0, + "step": 14853, + "train/ce_loss": 0.31939926743507385 + }, + { + "epoch": 1.4685584338540636, + "step": 14853, + "train/sim_loss": 0.05771374702453613 + }, + { + "epoch": 1.4685584338540636, + "step": 14853, + "train/total_loss": 0.08965367078781128 + }, + { + "entropy": 9.077404022216797, + "epoch": 1.4686573067035793, + "mean_token_accuracy": 0.8518123626708984, + "num_tokens": 35719596.0, + "step": 14854, + "train/ce_loss": 0.3171207308769226 + }, + { + "epoch": 1.4686573067035793, + "step": 14854, + "train/sim_loss": 0.030056536197662354 + }, + { + "epoch": 1.4686573067035793, + "step": 14854, + "train/total_loss": 0.061768610030412674 + }, + { + "entropy": 9.629941940307617, + "epoch": 1.4687561795530948, + "mean_token_accuracy": 0.8515337705612183, + "num_tokens": 35740697.0, + "step": 14855, + "train/ce_loss": 0.5918033719062805 + }, + { + "epoch": 1.4687561795530948, + "step": 14855, + "train/sim_loss": 0.05341792106628418 + }, + { + "epoch": 1.4687561795530948, + "step": 14855, + "train/total_loss": 0.11259825527667999 + }, + { + "entropy": 9.526681900024414, + "epoch": 1.4688550524026103, + "mean_token_accuracy": 0.8931419253349304, + "num_tokens": 35748300.0, + "step": 14856, + "train/ce_loss": 0.32757803797721863 + }, + { + "epoch": 1.4688550524026103, + "step": 14856, + "train/sim_loss": 0.10242915153503418 + }, + { + "epoch": 1.4688550524026103, + "step": 14856, + "train/total_loss": 0.13518695533275604 + }, + { + "entropy": 9.265487670898438, + "epoch": 1.4689539252521258, + "mean_token_accuracy": 0.8210382461547852, + "num_tokens": 35761104.0, + "step": 14857, + "train/ce_loss": 1.0187490033786162e-06 + }, + { + "epoch": 1.4689539252521258, + "step": 14857, + "train/sim_loss": 0.03586411476135254 + }, + { + "epoch": 1.4689539252521258, + "step": 14857, + "train/total_loss": 0.0358642153441906 + }, + { + "entropy": 10.002304077148438, + "epoch": 1.4690527981016412, + "mean_token_accuracy": 0.8598616123199463, + "num_tokens": 35778653.0, + "step": 14858, + "train/ce_loss": 0.5669687390327454 + }, + { + "epoch": 1.4690527981016412, + "step": 14858, + "train/sim_loss": 0.04535567760467529 + }, + { + "epoch": 1.4690527981016412, + "step": 14858, + "train/total_loss": 0.10205255448818207 + }, + { + "entropy": 9.511805534362793, + "epoch": 1.4691516709511567, + "mean_token_accuracy": 0.7884097099304199, + "num_tokens": 35789137.0, + "step": 14859, + "train/ce_loss": 0.8225998282432556 + }, + { + "epoch": 1.4691516709511567, + "step": 14859, + "train/sim_loss": 0.04419362545013428 + }, + { + "epoch": 1.4691516709511567, + "step": 14859, + "train/total_loss": 0.12645360827445984 + }, + { + "epoch": 1.4692505438006722, + "grad_norm": 0.6769019365310669, + "learning_rate": 6.328685160460863e-06, + "loss": 0.0858, + "step": 14860 + }, + { + "entropy": 9.050755500793457, + "epoch": 1.4692505438006722, + "mean_token_accuracy": 0.8442822098731995, + "num_tokens": 35797255.0, + "step": 14860, + "train/ce_loss": 0.29987484216690063 + }, + { + "epoch": 1.4692505438006722, + "step": 14860, + "train/sim_loss": 0.013253927230834961 + }, + { + "epoch": 1.4692505438006722, + "step": 14860, + "train/total_loss": 0.043241411447525024 + }, + { + "entropy": 9.702640533447266, + "epoch": 1.469349416650188, + "mean_token_accuracy": 0.8675135970115662, + "num_tokens": 35806843.0, + "step": 14861, + "train/ce_loss": 0.2834220230579376 + }, + { + "epoch": 1.469349416650188, + "step": 14861, + "train/sim_loss": 0.07338893413543701 + }, + { + "epoch": 1.469349416650188, + "step": 14861, + "train/total_loss": 0.10173113644123077 + }, + { + "entropy": 9.519960403442383, + "epoch": 1.4694482894997034, + "mean_token_accuracy": 0.8365508317947388, + "num_tokens": 35820595.0, + "step": 14862, + "train/ce_loss": 0.5211740732192993 + }, + { + "epoch": 1.4694482894997034, + "step": 14862, + "train/sim_loss": 0.03954362869262695 + }, + { + "epoch": 1.4694482894997034, + "step": 14862, + "train/total_loss": 0.09166103601455688 + }, + { + "entropy": 9.255646705627441, + "epoch": 1.469547162349219, + "mean_token_accuracy": 0.8494055271148682, + "num_tokens": 35835135.0, + "step": 14863, + "train/ce_loss": 0.6144018173217773 + }, + { + "epoch": 1.469547162349219, + "step": 14863, + "train/sim_loss": 0.05286771059036255 + }, + { + "epoch": 1.469547162349219, + "step": 14863, + "train/total_loss": 0.11430789530277252 + }, + { + "entropy": 9.57419204711914, + "epoch": 1.4696460351987344, + "mean_token_accuracy": 0.880859375, + "num_tokens": 35846494.0, + "step": 14864, + "train/ce_loss": 0.34329652786254883 + }, + { + "epoch": 1.4696460351987344, + "step": 14864, + "train/sim_loss": 0.027292609214782715 + }, + { + "epoch": 1.4696460351987344, + "step": 14864, + "train/total_loss": 0.0616222620010376 + }, + { + "entropy": 9.548566818237305, + "epoch": 1.4697449080482499, + "mean_token_accuracy": 0.9316628575325012, + "num_tokens": 35857766.0, + "step": 14865, + "train/ce_loss": 0.4275035858154297 + }, + { + "epoch": 1.4697449080482499, + "step": 14865, + "train/sim_loss": 0.06060349941253662 + }, + { + "epoch": 1.4697449080482499, + "step": 14865, + "train/total_loss": 0.10335385799407959 + }, + { + "entropy": 9.383034706115723, + "epoch": 1.4698437808977656, + "mean_token_accuracy": 0.913103461265564, + "num_tokens": 35874084.0, + "step": 14866, + "train/ce_loss": 5.215488840804028e-07 + }, + { + "epoch": 1.4698437808977656, + "step": 14866, + "train/sim_loss": 0.03642988204956055 + }, + { + "epoch": 1.4698437808977656, + "step": 14866, + "train/total_loss": 0.036429934203624725 + }, + { + "entropy": 10.061108589172363, + "epoch": 1.469942653747281, + "mean_token_accuracy": 0.8432601690292358, + "num_tokens": 35884852.0, + "step": 14867, + "train/ce_loss": 1.6621420400042553e-06 + }, + { + "epoch": 1.469942653747281, + "step": 14867, + "train/sim_loss": 0.021588921546936035 + }, + { + "epoch": 1.469942653747281, + "step": 14867, + "train/total_loss": 0.021589087322354317 + }, + { + "entropy": 9.590165138244629, + "epoch": 1.4700415265967965, + "mean_token_accuracy": 0.8826366662979126, + "num_tokens": 35897689.0, + "step": 14868, + "train/ce_loss": 3.380747841674747e-07 + }, + { + "epoch": 1.4700415265967965, + "step": 14868, + "train/sim_loss": 0.0104178786277771 + }, + { + "epoch": 1.4700415265967965, + "step": 14868, + "train/total_loss": 0.010417912155389786 + }, + { + "entropy": 9.188995361328125, + "epoch": 1.470140399446312, + "mean_token_accuracy": 0.8814554214477539, + "num_tokens": 35912574.0, + "step": 14869, + "train/ce_loss": 0.38603338599205017 + }, + { + "epoch": 1.470140399446312, + "step": 14869, + "train/sim_loss": 0.02630937099456787 + }, + { + "epoch": 1.470140399446312, + "step": 14869, + "train/total_loss": 0.06491270661354065 + }, + { + "entropy": 9.235467910766602, + "epoch": 1.4702392722958275, + "mean_token_accuracy": 0.8390501141548157, + "num_tokens": 35927626.0, + "step": 14870, + "train/ce_loss": 0.3710508644580841 + }, + { + "epoch": 1.4702392722958275, + "step": 14870, + "train/sim_loss": 0.02442026138305664 + }, + { + "epoch": 1.4702392722958275, + "step": 14870, + "train/total_loss": 0.06152534857392311 + }, + { + "entropy": 9.58516788482666, + "epoch": 1.4703381451453432, + "mean_token_accuracy": 0.8404255509376526, + "num_tokens": 35940992.0, + "step": 14871, + "train/ce_loss": 0.5522510409355164 + }, + { + "epoch": 1.4703381451453432, + "step": 14871, + "train/sim_loss": 0.03609764575958252 + }, + { + "epoch": 1.4703381451453432, + "step": 14871, + "train/total_loss": 0.09132274985313416 + }, + { + "entropy": 9.067390441894531, + "epoch": 1.4704370179948585, + "mean_token_accuracy": 0.8612334728240967, + "num_tokens": 35957009.0, + "step": 14872, + "train/ce_loss": 0.5846180319786072 + }, + { + "epoch": 1.4704370179948585, + "step": 14872, + "train/sim_loss": 0.017145633697509766 + }, + { + "epoch": 1.4704370179948585, + "step": 14872, + "train/total_loss": 0.07560743391513824 + }, + { + "entropy": 9.567153930664062, + "epoch": 1.4705358908443742, + "mean_token_accuracy": 0.8517940640449524, + "num_tokens": 35970904.0, + "step": 14873, + "train/ce_loss": 0.6745213866233826 + }, + { + "epoch": 1.4705358908443742, + "step": 14873, + "train/sim_loss": 0.05961573123931885 + }, + { + "epoch": 1.4705358908443742, + "step": 14873, + "train/total_loss": 0.12706786394119263 + }, + { + "entropy": 9.664652824401855, + "epoch": 1.4706347636938897, + "mean_token_accuracy": 0.7540172934532166, + "num_tokens": 35985004.0, + "step": 14874, + "train/ce_loss": 2.78200644743265e-07 + }, + { + "epoch": 1.4706347636938897, + "step": 14874, + "train/sim_loss": 0.02123415470123291 + }, + { + "epoch": 1.4706347636938897, + "step": 14874, + "train/total_loss": 0.02123418264091015 + }, + { + "entropy": 9.387800216674805, + "epoch": 1.4707336365434052, + "mean_token_accuracy": 0.847484290599823, + "num_tokens": 35997347.0, + "step": 14875, + "train/ce_loss": 0.9203101396560669 + }, + { + "epoch": 1.4707336365434052, + "step": 14875, + "train/sim_loss": 0.06263488531112671 + }, + { + "epoch": 1.4707336365434052, + "step": 14875, + "train/total_loss": 0.15466590225696564 + }, + { + "entropy": 9.75330924987793, + "epoch": 1.4708325093929207, + "mean_token_accuracy": 0.8434504866600037, + "num_tokens": 36015106.0, + "step": 14876, + "train/ce_loss": 0.3362449109554291 + }, + { + "epoch": 1.4708325093929207, + "step": 14876, + "train/sim_loss": 0.056902527809143066 + }, + { + "epoch": 1.4708325093929207, + "step": 14876, + "train/total_loss": 0.0905270203948021 + }, + { + "entropy": 9.098085403442383, + "epoch": 1.4709313822424361, + "mean_token_accuracy": 0.8619354963302612, + "num_tokens": 36029170.0, + "step": 14877, + "train/ce_loss": 0.4856531023979187 + }, + { + "epoch": 1.4709313822424361, + "step": 14877, + "train/sim_loss": 0.015470623970031738 + }, + { + "epoch": 1.4709313822424361, + "step": 14877, + "train/total_loss": 0.06403593719005585 + }, + { + "entropy": 9.548490524291992, + "epoch": 1.4710302550919518, + "mean_token_accuracy": 0.8787515163421631, + "num_tokens": 36042603.0, + "step": 14878, + "train/ce_loss": 0.437849760055542 + }, + { + "epoch": 1.4710302550919518, + "step": 14878, + "train/sim_loss": 0.040823519229888916 + }, + { + "epoch": 1.4710302550919518, + "step": 14878, + "train/total_loss": 0.08460849523544312 + }, + { + "entropy": 9.37511920928955, + "epoch": 1.4711291279414673, + "mean_token_accuracy": 0.8808446526527405, + "num_tokens": 36051837.0, + "step": 14879, + "train/ce_loss": 0.6226027607917786 + }, + { + "epoch": 1.4711291279414673, + "step": 14879, + "train/sim_loss": 0.06317400932312012 + }, + { + "epoch": 1.4711291279414673, + "step": 14879, + "train/total_loss": 0.1254342794418335 + }, + { + "epoch": 1.4712280007909828, + "grad_norm": 0.5225405097007751, + "learning_rate": 6.323740295702912e-06, + "loss": 0.087, + "step": 14880 + }, + { + "entropy": 9.45015811920166, + "epoch": 1.4712280007909828, + "mean_token_accuracy": 0.8471177816390991, + "num_tokens": 36063115.0, + "step": 14880, + "train/ce_loss": 0.33736616373062134 + }, + { + "epoch": 1.4712280007909828, + "step": 14880, + "train/sim_loss": 0.06297111511230469 + }, + { + "epoch": 1.4712280007909828, + "step": 14880, + "train/total_loss": 0.09670773148536682 + }, + { + "entropy": 9.492433547973633, + "epoch": 1.4713268736404983, + "mean_token_accuracy": 0.8356589078903198, + "num_tokens": 36073409.0, + "step": 14881, + "train/ce_loss": 0.7889821529388428 + }, + { + "epoch": 1.4713268736404983, + "step": 14881, + "train/sim_loss": 0.07515007257461548 + }, + { + "epoch": 1.4713268736404983, + "step": 14881, + "train/total_loss": 0.15404829382896423 + }, + { + "entropy": 9.481014251708984, + "epoch": 1.4714257464900138, + "mean_token_accuracy": 0.8421052694320679, + "num_tokens": 36093115.0, + "step": 14882, + "train/ce_loss": 0.4737603962421417 + }, + { + "epoch": 1.4714257464900138, + "step": 14882, + "train/sim_loss": 0.028028130531311035 + }, + { + "epoch": 1.4714257464900138, + "step": 14882, + "train/total_loss": 0.07540416717529297 + }, + { + "entropy": 9.118361473083496, + "epoch": 1.4715246193395295, + "mean_token_accuracy": 0.8606383204460144, + "num_tokens": 36109686.0, + "step": 14883, + "train/ce_loss": 0.29505544900894165 + }, + { + "epoch": 1.4715246193395295, + "step": 14883, + "train/sim_loss": 0.013614535331726074 + }, + { + "epoch": 1.4715246193395295, + "step": 14883, + "train/total_loss": 0.04312007874250412 + }, + { + "entropy": 9.563699722290039, + "epoch": 1.4716234921890448, + "mean_token_accuracy": 0.8709090948104858, + "num_tokens": 36126519.0, + "step": 14884, + "train/ce_loss": 0.2848665416240692 + }, + { + "epoch": 1.4716234921890448, + "step": 14884, + "train/sim_loss": 0.06779217720031738 + }, + { + "epoch": 1.4716234921890448, + "step": 14884, + "train/total_loss": 0.0962788313627243 + }, + { + "entropy": 9.057425498962402, + "epoch": 1.4717223650385605, + "mean_token_accuracy": 0.8730158805847168, + "num_tokens": 36137452.0, + "step": 14885, + "train/ce_loss": 6.577882345482067e-07 + }, + { + "epoch": 1.4717223650385605, + "step": 14885, + "train/sim_loss": 0.020416319370269775 + }, + { + "epoch": 1.4717223650385605, + "step": 14885, + "train/total_loss": 0.02041638456285 + }, + { + "entropy": 9.274478912353516, + "epoch": 1.471821237888076, + "mean_token_accuracy": 0.8112305998802185, + "num_tokens": 36150825.0, + "step": 14886, + "train/ce_loss": 0.6632164716720581 + }, + { + "epoch": 1.471821237888076, + "step": 14886, + "train/sim_loss": 0.028442740440368652 + }, + { + "epoch": 1.471821237888076, + "step": 14886, + "train/total_loss": 0.09476438909769058 + }, + { + "entropy": 9.40845012664795, + "epoch": 1.4719201107375914, + "mean_token_accuracy": 0.8221153616905212, + "num_tokens": 36165082.0, + "step": 14887, + "train/ce_loss": 0.5860942006111145 + }, + { + "epoch": 1.4719201107375914, + "step": 14887, + "train/sim_loss": 0.04861551523208618 + }, + { + "epoch": 1.4719201107375914, + "step": 14887, + "train/total_loss": 0.10722494125366211 + }, + { + "entropy": 9.762941360473633, + "epoch": 1.472018983587107, + "mean_token_accuracy": 0.8631284832954407, + "num_tokens": 36177231.0, + "step": 14888, + "train/ce_loss": 4.993803486286197e-06 + }, + { + "epoch": 1.472018983587107, + "step": 14888, + "train/sim_loss": 0.02452397346496582 + }, + { + "epoch": 1.472018983587107, + "step": 14888, + "train/total_loss": 0.024524472653865814 + }, + { + "entropy": 10.04977035522461, + "epoch": 1.4721178564366224, + "mean_token_accuracy": 0.878238320350647, + "num_tokens": 36184379.0, + "step": 14889, + "train/ce_loss": 1.4786209021622199e-06 + }, + { + "epoch": 1.4721178564366224, + "step": 14889, + "train/sim_loss": 0.04205787181854248 + }, + { + "epoch": 1.4721178564366224, + "step": 14889, + "train/total_loss": 0.04205802083015442 + }, + { + "entropy": 9.39554214477539, + "epoch": 1.4722167292861381, + "mean_token_accuracy": 0.8558558821678162, + "num_tokens": 36194546.0, + "step": 14890, + "train/ce_loss": 0.49917981028556824 + }, + { + "epoch": 1.4722167292861381, + "step": 14890, + "train/sim_loss": 0.057550132274627686 + }, + { + "epoch": 1.4722167292861381, + "step": 14890, + "train/total_loss": 0.10746811330318451 + }, + { + "entropy": 8.934233665466309, + "epoch": 1.4723156021356536, + "mean_token_accuracy": 0.8704819083213806, + "num_tokens": 36203262.0, + "step": 14891, + "train/ce_loss": 0.37743937969207764 + }, + { + "epoch": 1.4723156021356536, + "step": 14891, + "train/sim_loss": 0.06582963466644287 + }, + { + "epoch": 1.4723156021356536, + "step": 14891, + "train/total_loss": 0.10357357561588287 + }, + { + "entropy": 9.48813533782959, + "epoch": 1.472414474985169, + "mean_token_accuracy": 0.9221453070640564, + "num_tokens": 36219510.0, + "step": 14892, + "train/ce_loss": 0.30859479308128357 + }, + { + "epoch": 1.472414474985169, + "step": 14892, + "train/sim_loss": 0.03724682331085205 + }, + { + "epoch": 1.472414474985169, + "step": 14892, + "train/total_loss": 0.06810630112886429 + }, + { + "entropy": 9.04372787475586, + "epoch": 1.4725133478346846, + "mean_token_accuracy": 0.8357664346694946, + "num_tokens": 36233181.0, + "step": 14893, + "train/ce_loss": 0.19673459231853485 + }, + { + "epoch": 1.4725133478346846, + "step": 14893, + "train/sim_loss": 0.03567349910736084 + }, + { + "epoch": 1.4725133478346846, + "step": 14893, + "train/total_loss": 0.055346958339214325 + }, + { + "entropy": 9.330462455749512, + "epoch": 1.4726122206842, + "mean_token_accuracy": 0.8832116723060608, + "num_tokens": 36248521.0, + "step": 14894, + "train/ce_loss": 0.6144726276397705 + }, + { + "epoch": 1.4726122206842, + "step": 14894, + "train/sim_loss": 0.08107852935791016 + }, + { + "epoch": 1.4726122206842, + "step": 14894, + "train/total_loss": 0.1425257921218872 + }, + { + "entropy": 9.837318420410156, + "epoch": 1.4727110935337158, + "mean_token_accuracy": 0.8360301852226257, + "num_tokens": 36267014.0, + "step": 14895, + "train/ce_loss": 0.9430738687515259 + }, + { + "epoch": 1.4727110935337158, + "step": 14895, + "train/sim_loss": 0.07975345849990845 + }, + { + "epoch": 1.4727110935337158, + "step": 14895, + "train/total_loss": 0.1740608513355255 + }, + { + "entropy": 9.127885818481445, + "epoch": 1.472809966383231, + "mean_token_accuracy": 0.8668076395988464, + "num_tokens": 36279702.0, + "step": 14896, + "train/ce_loss": 0.417125940322876 + }, + { + "epoch": 1.472809966383231, + "step": 14896, + "train/sim_loss": 0.10599827766418457 + }, + { + "epoch": 1.472809966383231, + "step": 14896, + "train/total_loss": 0.1477108746767044 + }, + { + "entropy": 9.81852912902832, + "epoch": 1.4729088392327467, + "mean_token_accuracy": 0.8698347210884094, + "num_tokens": 36294322.0, + "step": 14897, + "train/ce_loss": 0.47541698813438416 + }, + { + "epoch": 1.4729088392327467, + "step": 14897, + "train/sim_loss": 0.04625058174133301 + }, + { + "epoch": 1.4729088392327467, + "step": 14897, + "train/total_loss": 0.09379228204488754 + }, + { + "entropy": 9.984485626220703, + "epoch": 1.4730077120822622, + "mean_token_accuracy": 0.9306930899620056, + "num_tokens": 36300507.0, + "step": 14898, + "train/ce_loss": 0.548168420791626 + }, + { + "epoch": 1.4730077120822622, + "step": 14898, + "train/sim_loss": 0.023004114627838135 + }, + { + "epoch": 1.4730077120822622, + "step": 14898, + "train/total_loss": 0.07782095670700073 + }, + { + "entropy": 9.074799537658691, + "epoch": 1.4731065849317777, + "mean_token_accuracy": 0.8295081853866577, + "num_tokens": 36309174.0, + "step": 14899, + "train/ce_loss": 0.7886816263198853 + }, + { + "epoch": 1.4731065849317777, + "step": 14899, + "train/sim_loss": 0.016621947288513184 + }, + { + "epoch": 1.4731065849317777, + "step": 14899, + "train/total_loss": 0.09549011290073395 + }, + { + "epoch": 1.4732054577812932, + "grad_norm": 0.5237008929252625, + "learning_rate": 6.318795430944964e-06, + "loss": 0.0853, + "step": 14900 + }, + { + "entropy": 9.342679977416992, + "epoch": 1.4732054577812932, + "mean_token_accuracy": 0.8362318873405457, + "num_tokens": 36323999.0, + "step": 14900, + "train/ce_loss": 0.8771877288818359 + }, + { + "epoch": 1.4732054577812932, + "step": 14900, + "train/sim_loss": 0.060230255126953125 + }, + { + "epoch": 1.4732054577812932, + "step": 14900, + "train/total_loss": 0.14794903993606567 + }, + { + "entropy": 9.313081741333008, + "epoch": 1.4733043306308087, + "mean_token_accuracy": 0.8538102507591248, + "num_tokens": 36338990.0, + "step": 14901, + "train/ce_loss": 0.9159755706787109 + }, + { + "epoch": 1.4733043306308087, + "step": 14901, + "train/sim_loss": 0.04425084590911865 + }, + { + "epoch": 1.4733043306308087, + "step": 14901, + "train/total_loss": 0.13584840297698975 + }, + { + "entropy": 9.37509536743164, + "epoch": 1.4734032034803244, + "mean_token_accuracy": 0.932634711265564, + "num_tokens": 36347980.0, + "step": 14902, + "train/ce_loss": 0.3044816851615906 + }, + { + "epoch": 1.4734032034803244, + "step": 14902, + "train/sim_loss": 0.020301222801208496 + }, + { + "epoch": 1.4734032034803244, + "step": 14902, + "train/total_loss": 0.050749391317367554 + }, + { + "entropy": 9.54073429107666, + "epoch": 1.4735020763298399, + "mean_token_accuracy": 0.9189189076423645, + "num_tokens": 36361377.0, + "step": 14903, + "train/ce_loss": 5.996859613333072e-07 + }, + { + "epoch": 1.4735020763298399, + "step": 14903, + "train/sim_loss": 0.02016693353652954 + }, + { + "epoch": 1.4735020763298399, + "step": 14903, + "train/total_loss": 0.020166993141174316 + }, + { + "entropy": 9.63778305053711, + "epoch": 1.4736009491793554, + "mean_token_accuracy": 0.8119158744812012, + "num_tokens": 36374539.0, + "step": 14904, + "train/ce_loss": 0.5279603600502014 + }, + { + "epoch": 1.4736009491793554, + "step": 14904, + "train/sim_loss": 0.11203551292419434 + }, + { + "epoch": 1.4736009491793554, + "step": 14904, + "train/total_loss": 0.16483154892921448 + }, + { + "entropy": 9.590919494628906, + "epoch": 1.4736998220288708, + "mean_token_accuracy": 0.8991477489471436, + "num_tokens": 36390320.0, + "step": 14905, + "train/ce_loss": 0.3772510588169098 + }, + { + "epoch": 1.4736998220288708, + "step": 14905, + "train/sim_loss": 0.016253292560577393 + }, + { + "epoch": 1.4736998220288708, + "step": 14905, + "train/total_loss": 0.05397839844226837 + }, + { + "entropy": 9.373505592346191, + "epoch": 1.4737986948783863, + "mean_token_accuracy": 0.8618654012680054, + "num_tokens": 36403069.0, + "step": 14906, + "train/ce_loss": 0.14345382153987885 + }, + { + "epoch": 1.4737986948783863, + "step": 14906, + "train/sim_loss": 0.03986334800720215 + }, + { + "epoch": 1.4737986948783863, + "step": 14906, + "train/total_loss": 0.05420872941613197 + }, + { + "entropy": 9.656482696533203, + "epoch": 1.473897567727902, + "mean_token_accuracy": 0.8503401279449463, + "num_tokens": 36422686.0, + "step": 14907, + "train/ce_loss": 0.4038097560405731 + }, + { + "epoch": 1.473897567727902, + "step": 14907, + "train/sim_loss": 0.024123847484588623 + }, + { + "epoch": 1.473897567727902, + "step": 14907, + "train/total_loss": 0.06450482457876205 + }, + { + "entropy": 9.509496688842773, + "epoch": 1.4739964405774175, + "mean_token_accuracy": 0.8684210777282715, + "num_tokens": 36436996.0, + "step": 14908, + "train/ce_loss": 0.41009095311164856 + }, + { + "epoch": 1.4739964405774175, + "step": 14908, + "train/sim_loss": 0.04008185863494873 + }, + { + "epoch": 1.4739964405774175, + "step": 14908, + "train/total_loss": 0.08109095692634583 + }, + { + "entropy": 9.021411895751953, + "epoch": 1.474095313426933, + "mean_token_accuracy": 0.8374689817428589, + "num_tokens": 36445552.0, + "step": 14909, + "train/ce_loss": 0.45916157960891724 + }, + { + "epoch": 1.474095313426933, + "step": 14909, + "train/sim_loss": 0.03127962350845337 + }, + { + "epoch": 1.474095313426933, + "step": 14909, + "train/total_loss": 0.07719577848911285 + }, + { + "entropy": 9.326314926147461, + "epoch": 1.4741941862764485, + "mean_token_accuracy": 0.8631051778793335, + "num_tokens": 36456941.0, + "step": 14910, + "train/ce_loss": 0.7114282846450806 + }, + { + "epoch": 1.4741941862764485, + "step": 14910, + "train/sim_loss": 0.08975684642791748 + }, + { + "epoch": 1.4741941862764485, + "step": 14910, + "train/total_loss": 0.16089966893196106 + }, + { + "entropy": 9.30103588104248, + "epoch": 1.474293059125964, + "mean_token_accuracy": 0.8614357113838196, + "num_tokens": 36465705.0, + "step": 14911, + "train/ce_loss": 0.6082483530044556 + }, + { + "epoch": 1.474293059125964, + "step": 14911, + "train/sim_loss": 0.045674145221710205 + }, + { + "epoch": 1.474293059125964, + "step": 14911, + "train/total_loss": 0.10649898648262024 + }, + { + "entropy": 9.009361267089844, + "epoch": 1.4743919319754795, + "mean_token_accuracy": 0.915730357170105, + "num_tokens": 36477833.0, + "step": 14912, + "train/ce_loss": 0.7207719087600708 + }, + { + "epoch": 1.4743919319754795, + "step": 14912, + "train/sim_loss": 0.03934621810913086 + }, + { + "epoch": 1.4743919319754795, + "step": 14912, + "train/total_loss": 0.11142341047525406 + }, + { + "entropy": 9.227579116821289, + "epoch": 1.474490804824995, + "mean_token_accuracy": 0.8661290407180786, + "num_tokens": 36490593.0, + "step": 14913, + "train/ce_loss": 0.5665361881256104 + }, + { + "epoch": 1.474490804824995, + "step": 14913, + "train/sim_loss": 0.06020081043243408 + }, + { + "epoch": 1.474490804824995, + "step": 14913, + "train/total_loss": 0.11685442924499512 + }, + { + "entropy": 8.983060836791992, + "epoch": 1.4745896776745107, + "mean_token_accuracy": 0.8335314989089966, + "num_tokens": 36498741.0, + "step": 14914, + "train/ce_loss": 0.6369285583496094 + }, + { + "epoch": 1.4745896776745107, + "step": 14914, + "train/sim_loss": 0.023966848850250244 + }, + { + "epoch": 1.4745896776745107, + "step": 14914, + "train/total_loss": 0.08765970915555954 + }, + { + "entropy": 9.03384780883789, + "epoch": 1.4746885505240261, + "mean_token_accuracy": 0.8573333621025085, + "num_tokens": 36510848.0, + "step": 14915, + "train/ce_loss": 0.7846380472183228 + }, + { + "epoch": 1.4746885505240261, + "step": 14915, + "train/sim_loss": 0.06265753507614136 + }, + { + "epoch": 1.4746885505240261, + "step": 14915, + "train/total_loss": 0.14112134277820587 + }, + { + "entropy": 9.471056938171387, + "epoch": 1.4747874233735416, + "mean_token_accuracy": 0.8766716122627258, + "num_tokens": 36518104.0, + "step": 14916, + "train/ce_loss": 0.46002161502838135 + }, + { + "epoch": 1.4747874233735416, + "step": 14916, + "train/sim_loss": 0.058712780475616455 + }, + { + "epoch": 1.4747874233735416, + "step": 14916, + "train/total_loss": 0.10471494495868683 + }, + { + "entropy": 9.537837982177734, + "epoch": 1.4748862962230571, + "mean_token_accuracy": 0.9225225448608398, + "num_tokens": 36529524.0, + "step": 14917, + "train/ce_loss": 0.46597784757614136 + }, + { + "epoch": 1.4748862962230571, + "step": 14917, + "train/sim_loss": 0.030210018157958984 + }, + { + "epoch": 1.4748862962230571, + "step": 14917, + "train/total_loss": 0.07680780440568924 + }, + { + "entropy": 9.534835815429688, + "epoch": 1.4749851690725726, + "mean_token_accuracy": 0.8464619517326355, + "num_tokens": 36545953.0, + "step": 14918, + "train/ce_loss": 0.7864009141921997 + }, + { + "epoch": 1.4749851690725726, + "step": 14918, + "train/sim_loss": 0.0362015962600708 + }, + { + "epoch": 1.4749851690725726, + "step": 14918, + "train/total_loss": 0.11484169214963913 + }, + { + "entropy": 9.121780395507812, + "epoch": 1.4750840419220883, + "mean_token_accuracy": 0.8106734156608582, + "num_tokens": 36561916.0, + "step": 14919, + "train/ce_loss": 0.7809391617774963 + }, + { + "epoch": 1.4750840419220883, + "step": 14919, + "train/sim_loss": 0.03448081016540527 + }, + { + "epoch": 1.4750840419220883, + "step": 14919, + "train/total_loss": 0.11257472634315491 + }, + { + "epoch": 1.4751829147716038, + "grad_norm": 0.6575708985328674, + "learning_rate": 6.313850566187015e-06, + "loss": 0.0811, + "step": 14920 + }, + { + "entropy": 8.984821319580078, + "epoch": 1.4751829147716038, + "mean_token_accuracy": 0.8715012669563293, + "num_tokens": 36567951.0, + "step": 14920, + "train/ce_loss": 0.35756075382232666 + }, + { + "epoch": 1.4751829147716038, + "step": 14920, + "train/sim_loss": 0.04178309440612793 + }, + { + "epoch": 1.4751829147716038, + "step": 14920, + "train/total_loss": 0.07753917574882507 + }, + { + "entropy": 9.094942092895508, + "epoch": 1.4752817876211193, + "mean_token_accuracy": 0.8762136101722717, + "num_tokens": 36581395.0, + "step": 14921, + "train/ce_loss": 0.2720898389816284 + }, + { + "epoch": 1.4752817876211193, + "step": 14921, + "train/sim_loss": 0.033668339252471924 + }, + { + "epoch": 1.4752817876211193, + "step": 14921, + "train/total_loss": 0.060877323150634766 + }, + { + "entropy": 9.823702812194824, + "epoch": 1.4753806604706348, + "mean_token_accuracy": 0.8617234230041504, + "num_tokens": 36597764.0, + "step": 14922, + "train/ce_loss": 0.2559289336204529 + }, + { + "epoch": 1.4753806604706348, + "step": 14922, + "train/sim_loss": 0.07103323936462402 + }, + { + "epoch": 1.4753806604706348, + "step": 14922, + "train/total_loss": 0.09662613272666931 + }, + { + "entropy": 9.164931297302246, + "epoch": 1.4754795333201502, + "mean_token_accuracy": 0.817241370677948, + "num_tokens": 36611749.0, + "step": 14923, + "train/ce_loss": 0.4682912528514862 + }, + { + "epoch": 1.4754795333201502, + "step": 14923, + "train/sim_loss": 0.01716923713684082 + }, + { + "epoch": 1.4754795333201502, + "step": 14923, + "train/total_loss": 0.06399836391210556 + }, + { + "entropy": 8.795166015625, + "epoch": 1.4755784061696657, + "mean_token_accuracy": 0.8150753974914551, + "num_tokens": 36621991.0, + "step": 14924, + "train/ce_loss": 0.523177444934845 + }, + { + "epoch": 1.4755784061696657, + "step": 14924, + "train/sim_loss": 0.01123666763305664 + }, + { + "epoch": 1.4755784061696657, + "step": 14924, + "train/total_loss": 0.06355441361665726 + }, + { + "entropy": 9.55709457397461, + "epoch": 1.4756772790191812, + "mean_token_accuracy": 0.8463901877403259, + "num_tokens": 36638156.0, + "step": 14925, + "train/ce_loss": 2.093829834848293e-06 + }, + { + "epoch": 1.4756772790191812, + "step": 14925, + "train/sim_loss": 0.025241732597351074 + }, + { + "epoch": 1.4756772790191812, + "step": 14925, + "train/total_loss": 0.025241941213607788 + }, + { + "entropy": 9.867060661315918, + "epoch": 1.475776151868697, + "mean_token_accuracy": 0.9024389982223511, + "num_tokens": 36653961.0, + "step": 14926, + "train/ce_loss": 9.56566054810537e-07 + }, + { + "epoch": 1.475776151868697, + "step": 14926, + "train/sim_loss": 0.032620251178741455 + }, + { + "epoch": 1.475776151868697, + "step": 14926, + "train/total_loss": 0.032620348036289215 + }, + { + "entropy": 9.534547805786133, + "epoch": 1.4758750247182124, + "mean_token_accuracy": 0.837243378162384, + "num_tokens": 36664053.0, + "step": 14927, + "train/ce_loss": 0.7461698055267334 + }, + { + "epoch": 1.4758750247182124, + "step": 14927, + "train/sim_loss": 0.0519641637802124 + }, + { + "epoch": 1.4758750247182124, + "step": 14927, + "train/total_loss": 0.12658114731311798 + }, + { + "entropy": 9.378898620605469, + "epoch": 1.475973897567728, + "mean_token_accuracy": 0.8380743861198425, + "num_tokens": 36673390.0, + "step": 14928, + "train/ce_loss": 0.5544854998588562 + }, + { + "epoch": 1.475973897567728, + "step": 14928, + "train/sim_loss": 0.043899357318878174 + }, + { + "epoch": 1.475973897567728, + "step": 14928, + "train/total_loss": 0.09934790432453156 + }, + { + "entropy": 8.947704315185547, + "epoch": 1.4760727704172434, + "mean_token_accuracy": 0.8554973602294922, + "num_tokens": 36687020.0, + "step": 14929, + "train/ce_loss": 0.3574209213256836 + }, + { + "epoch": 1.4760727704172434, + "step": 14929, + "train/sim_loss": 0.026489078998565674 + }, + { + "epoch": 1.4760727704172434, + "step": 14929, + "train/total_loss": 0.06223117187619209 + }, + { + "entropy": 9.592889785766602, + "epoch": 1.4761716432667589, + "mean_token_accuracy": 0.859399676322937, + "num_tokens": 36700295.0, + "step": 14930, + "train/ce_loss": 0.5801463723182678 + }, + { + "epoch": 1.4761716432667589, + "step": 14930, + "train/sim_loss": 0.04581189155578613 + }, + { + "epoch": 1.4761716432667589, + "step": 14930, + "train/total_loss": 0.10382653027772903 + }, + { + "entropy": 9.658551216125488, + "epoch": 1.4762705161162746, + "mean_token_accuracy": 0.8861047625541687, + "num_tokens": 36717893.0, + "step": 14931, + "train/ce_loss": 7.027501283118909e-07 + }, + { + "epoch": 1.4762705161162746, + "step": 14931, + "train/sim_loss": 0.029278278350830078 + }, + { + "epoch": 1.4762705161162746, + "step": 14931, + "train/total_loss": 0.02927834913134575 + }, + { + "entropy": 9.292740821838379, + "epoch": 1.47636938896579, + "mean_token_accuracy": 0.8540410399436951, + "num_tokens": 36731895.0, + "step": 14932, + "train/ce_loss": 0.30874425172805786 + }, + { + "epoch": 1.47636938896579, + "step": 14932, + "train/sim_loss": 0.09208285808563232 + }, + { + "epoch": 1.47636938896579, + "step": 14932, + "train/total_loss": 0.12295728176832199 + }, + { + "entropy": 9.43310546875, + "epoch": 1.4764682618153055, + "mean_token_accuracy": 0.8863636255264282, + "num_tokens": 36746584.0, + "step": 14933, + "train/ce_loss": 0.3721502423286438 + }, + { + "epoch": 1.4764682618153055, + "step": 14933, + "train/sim_loss": 0.009368658065795898 + }, + { + "epoch": 1.4764682618153055, + "step": 14933, + "train/total_loss": 0.04658368229866028 + }, + { + "entropy": 8.940361976623535, + "epoch": 1.476567134664821, + "mean_token_accuracy": 0.8326572179794312, + "num_tokens": 36759975.0, + "step": 14934, + "train/ce_loss": 0.8109882473945618 + }, + { + "epoch": 1.476567134664821, + "step": 14934, + "train/sim_loss": 0.08343660831451416 + }, + { + "epoch": 1.476567134664821, + "step": 14934, + "train/total_loss": 0.16453543305397034 + }, + { + "entropy": 9.70263671875, + "epoch": 1.4766660075143365, + "mean_token_accuracy": 0.8562401533126831, + "num_tokens": 36781225.0, + "step": 14935, + "train/ce_loss": 0.34739524126052856 + }, + { + "epoch": 1.4766660075143365, + "step": 14935, + "train/sim_loss": 0.029956698417663574 + }, + { + "epoch": 1.4766660075143365, + "step": 14935, + "train/total_loss": 0.06469622254371643 + }, + { + "entropy": 9.227206230163574, + "epoch": 1.476764880363852, + "mean_token_accuracy": 0.8350622653961182, + "num_tokens": 36793784.0, + "step": 14936, + "train/ce_loss": 0.653536319732666 + }, + { + "epoch": 1.476764880363852, + "step": 14936, + "train/sim_loss": 0.07066792249679565 + }, + { + "epoch": 1.476764880363852, + "step": 14936, + "train/total_loss": 0.13602155447006226 + }, + { + "entropy": 9.496101379394531, + "epoch": 1.4768637532133675, + "mean_token_accuracy": 0.8627968430519104, + "num_tokens": 36812290.0, + "step": 14937, + "train/ce_loss": 0.43063104152679443 + }, + { + "epoch": 1.4768637532133675, + "step": 14937, + "train/sim_loss": 0.06390136480331421 + }, + { + "epoch": 1.4768637532133675, + "step": 14937, + "train/total_loss": 0.10696446895599365 + }, + { + "entropy": 9.263202667236328, + "epoch": 1.4769626260628832, + "mean_token_accuracy": 0.8525139689445496, + "num_tokens": 36825705.0, + "step": 14938, + "train/ce_loss": 0.5983403325080872 + }, + { + "epoch": 1.4769626260628832, + "step": 14938, + "train/sim_loss": 0.0503271222114563 + }, + { + "epoch": 1.4769626260628832, + "step": 14938, + "train/total_loss": 0.11016115546226501 + }, + { + "entropy": 9.908235549926758, + "epoch": 1.4770614989123987, + "mean_token_accuracy": 0.870728075504303, + "num_tokens": 36839985.0, + "step": 14939, + "train/ce_loss": 0.5656437873840332 + }, + { + "epoch": 1.4770614989123987, + "step": 14939, + "train/sim_loss": 0.03507697582244873 + }, + { + "epoch": 1.4770614989123987, + "step": 14939, + "train/total_loss": 0.09164135158061981 + }, + { + "epoch": 1.4771603717619142, + "grad_norm": 0.5372821092605591, + "learning_rate": 6.308905701429066e-06, + "loss": 0.082, + "step": 14940 + }, + { + "entropy": 9.192087173461914, + "epoch": 1.4771603717619142, + "mean_token_accuracy": 0.8903225660324097, + "num_tokens": 36851109.0, + "step": 14940, + "train/ce_loss": 0.5692213773727417 + }, + { + "epoch": 1.4771603717619142, + "step": 14940, + "train/sim_loss": 0.0547061562538147 + }, + { + "epoch": 1.4771603717619142, + "step": 14940, + "train/total_loss": 0.11162829399108887 + }, + { + "entropy": 9.462625503540039, + "epoch": 1.4772592446114297, + "mean_token_accuracy": 0.8590381145477295, + "num_tokens": 36866683.0, + "step": 14941, + "train/ce_loss": 0.5104891061782837 + }, + { + "epoch": 1.4772592446114297, + "step": 14941, + "train/sim_loss": 0.022940635681152344 + }, + { + "epoch": 1.4772592446114297, + "step": 14941, + "train/total_loss": 0.07398954778909683 + }, + { + "entropy": 9.360565185546875, + "epoch": 1.4773581174609451, + "mean_token_accuracy": 0.9178082346916199, + "num_tokens": 36883183.0, + "step": 14942, + "train/ce_loss": 0.29612889885902405 + }, + { + "epoch": 1.4773581174609451, + "step": 14942, + "train/sim_loss": 0.036894798278808594 + }, + { + "epoch": 1.4773581174609451, + "step": 14942, + "train/total_loss": 0.06650768965482712 + }, + { + "entropy": 9.288143157958984, + "epoch": 1.4774569903104608, + "mean_token_accuracy": 0.8417639136314392, + "num_tokens": 36893963.0, + "step": 14943, + "train/ce_loss": 0.6820167899131775 + }, + { + "epoch": 1.4774569903104608, + "step": 14943, + "train/sim_loss": 0.036734700202941895 + }, + { + "epoch": 1.4774569903104608, + "step": 14943, + "train/total_loss": 0.104936383664608 + }, + { + "entropy": 9.342516899108887, + "epoch": 1.4775558631599763, + "mean_token_accuracy": 0.8772189617156982, + "num_tokens": 36901453.0, + "step": 14944, + "train/ce_loss": 1.2667628652707208e-06 + }, + { + "epoch": 1.4775558631599763, + "step": 14944, + "train/sim_loss": 0.03950631618499756 + }, + { + "epoch": 1.4775558631599763, + "step": 14944, + "train/total_loss": 0.039506442844867706 + }, + { + "entropy": 9.329920768737793, + "epoch": 1.4776547360094918, + "mean_token_accuracy": 0.8527851700782776, + "num_tokens": 36914970.0, + "step": 14945, + "train/ce_loss": 0.424967885017395 + }, + { + "epoch": 1.4776547360094918, + "step": 14945, + "train/sim_loss": 0.030954957008361816 + }, + { + "epoch": 1.4776547360094918, + "step": 14945, + "train/total_loss": 0.07345174252986908 + }, + { + "entropy": 9.38082504272461, + "epoch": 1.4777536088590073, + "mean_token_accuracy": 0.920258641242981, + "num_tokens": 36926093.0, + "step": 14946, + "train/ce_loss": 0.2133101373910904 + }, + { + "epoch": 1.4777536088590073, + "step": 14946, + "train/sim_loss": 0.05220150947570801 + }, + { + "epoch": 1.4777536088590073, + "step": 14946, + "train/total_loss": 0.07353252172470093 + }, + { + "entropy": 9.678796768188477, + "epoch": 1.4778524817085228, + "mean_token_accuracy": 0.8275030255317688, + "num_tokens": 36941093.0, + "step": 14947, + "train/ce_loss": 3.583404293294734e-07 + }, + { + "epoch": 1.4778524817085228, + "step": 14947, + "train/sim_loss": 0.02872776985168457 + }, + { + "epoch": 1.4778524817085228, + "step": 14947, + "train/total_loss": 0.028727805241942406 + }, + { + "entropy": 9.288466453552246, + "epoch": 1.4779513545580385, + "mean_token_accuracy": 0.9137167930603027, + "num_tokens": 36957361.0, + "step": 14948, + "train/ce_loss": 0.36168229579925537 + }, + { + "epoch": 1.4779513545580385, + "step": 14948, + "train/sim_loss": 0.0408061146736145 + }, + { + "epoch": 1.4779513545580385, + "step": 14948, + "train/total_loss": 0.07697434723377228 + }, + { + "entropy": 9.51530647277832, + "epoch": 1.4780502274075538, + "mean_token_accuracy": 0.8522727489471436, + "num_tokens": 36970504.0, + "step": 14949, + "train/ce_loss": 0.31622427701950073 + }, + { + "epoch": 1.4780502274075538, + "step": 14949, + "train/sim_loss": 0.03459525108337402 + }, + { + "epoch": 1.4780502274075538, + "step": 14949, + "train/total_loss": 0.06621767580509186 + }, + { + "entropy": 8.897710800170898, + "epoch": 1.4781491002570695, + "mean_token_accuracy": 0.8356322050094604, + "num_tokens": 36980294.0, + "step": 14950, + "train/ce_loss": 0.2683727443218231 + }, + { + "epoch": 1.4781491002570695, + "step": 14950, + "train/sim_loss": 0.03338825702667236 + }, + { + "epoch": 1.4781491002570695, + "step": 14950, + "train/total_loss": 0.060225531458854675 + }, + { + "entropy": 9.56480598449707, + "epoch": 1.478247973106585, + "mean_token_accuracy": 0.8235294222831726, + "num_tokens": 36997259.0, + "step": 14951, + "train/ce_loss": 0.465000718832016 + }, + { + "epoch": 1.478247973106585, + "step": 14951, + "train/sim_loss": 0.04608219861984253 + }, + { + "epoch": 1.478247973106585, + "step": 14951, + "train/total_loss": 0.09258227050304413 + }, + { + "entropy": 9.706275939941406, + "epoch": 1.4783468459561004, + "mean_token_accuracy": 0.8870431780815125, + "num_tokens": 37006326.0, + "step": 14952, + "train/ce_loss": 0.43367841839790344 + }, + { + "epoch": 1.4783468459561004, + "step": 14952, + "train/sim_loss": 0.02476370334625244 + }, + { + "epoch": 1.4783468459561004, + "step": 14952, + "train/total_loss": 0.06813155114650726 + }, + { + "entropy": 9.470956802368164, + "epoch": 1.478445718805616, + "mean_token_accuracy": 0.8667582273483276, + "num_tokens": 37018160.0, + "step": 14953, + "train/ce_loss": 0.46277743577957153 + }, + { + "epoch": 1.478445718805616, + "step": 14953, + "train/sim_loss": 0.07018190622329712 + }, + { + "epoch": 1.478445718805616, + "step": 14953, + "train/total_loss": 0.11645965278148651 + }, + { + "entropy": 9.45355224609375, + "epoch": 1.4785445916551314, + "mean_token_accuracy": 0.8142856955528259, + "num_tokens": 37027248.0, + "step": 14954, + "train/ce_loss": 0.5848475098609924 + }, + { + "epoch": 1.4785445916551314, + "step": 14954, + "train/sim_loss": 0.06057584285736084 + }, + { + "epoch": 1.4785445916551314, + "step": 14954, + "train/total_loss": 0.11906059086322784 + }, + { + "entropy": 9.444905281066895, + "epoch": 1.4786434645046471, + "mean_token_accuracy": 0.8420289754867554, + "num_tokens": 37037232.0, + "step": 14955, + "train/ce_loss": 0.870322048664093 + }, + { + "epoch": 1.4786434645046471, + "step": 14955, + "train/sim_loss": 0.13205033540725708 + }, + { + "epoch": 1.4786434645046471, + "step": 14955, + "train/total_loss": 0.2190825343132019 + }, + { + "entropy": 9.930135726928711, + "epoch": 1.4787423373541626, + "mean_token_accuracy": 0.9168081283569336, + "num_tokens": 37052341.0, + "step": 14956, + "train/ce_loss": 8.84828978087171e-07 + }, + { + "epoch": 1.4787423373541626, + "step": 14956, + "train/sim_loss": 0.056853532791137695 + }, + { + "epoch": 1.4787423373541626, + "step": 14956, + "train/total_loss": 0.05685362219810486 + }, + { + "entropy": 8.807106018066406, + "epoch": 1.478841210203678, + "mean_token_accuracy": 0.9157894849777222, + "num_tokens": 37063385.0, + "step": 14957, + "train/ce_loss": 0.35227537155151367 + }, + { + "epoch": 1.478841210203678, + "step": 14957, + "train/sim_loss": 0.013468265533447266 + }, + { + "epoch": 1.478841210203678, + "step": 14957, + "train/total_loss": 0.04869580268859863 + }, + { + "entropy": 9.38494873046875, + "epoch": 1.4789400830531936, + "mean_token_accuracy": 0.8800504803657532, + "num_tokens": 37080919.0, + "step": 14958, + "train/ce_loss": 0.11757692694664001 + }, + { + "epoch": 1.4789400830531936, + "step": 14958, + "train/sim_loss": 0.09219253063201904 + }, + { + "epoch": 1.4789400830531936, + "step": 14958, + "train/total_loss": 0.10395022481679916 + }, + { + "entropy": 9.338163375854492, + "epoch": 1.479038955902709, + "mean_token_accuracy": 0.8551351428031921, + "num_tokens": 37093595.0, + "step": 14959, + "train/ce_loss": 0.5389516353607178 + }, + { + "epoch": 1.479038955902709, + "step": 14959, + "train/sim_loss": 0.07045495510101318 + }, + { + "epoch": 1.479038955902709, + "step": 14959, + "train/total_loss": 0.12435011565685272 + }, + { + "epoch": 1.4791378287522248, + "grad_norm": 0.6057789325714111, + "learning_rate": 6.303960836671118e-06, + "loss": 0.085, + "step": 14960 + }, + { + "entropy": 9.114398956298828, + "epoch": 1.4791378287522248, + "mean_token_accuracy": 0.8487499952316284, + "num_tokens": 37106049.0, + "step": 14960, + "train/ce_loss": 0.8027193546295166 + }, + { + "epoch": 1.4791378287522248, + "step": 14960, + "train/sim_loss": 0.06682467460632324 + }, + { + "epoch": 1.4791378287522248, + "step": 14960, + "train/total_loss": 0.14709660410881042 + }, + { + "entropy": 8.852293968200684, + "epoch": 1.47923670160174, + "mean_token_accuracy": 0.8125714063644409, + "num_tokens": 37122534.0, + "step": 14961, + "train/ce_loss": 0.3598540425300598 + }, + { + "epoch": 1.47923670160174, + "step": 14961, + "train/sim_loss": 0.08688896894454956 + }, + { + "epoch": 1.47923670160174, + "step": 14961, + "train/total_loss": 0.12287437915802002 + }, + { + "entropy": 9.343367576599121, + "epoch": 1.4793355744512557, + "mean_token_accuracy": 0.7955555319786072, + "num_tokens": 37139259.0, + "step": 14962, + "train/ce_loss": 0.6917961239814758 + }, + { + "epoch": 1.4793355744512557, + "step": 14962, + "train/sim_loss": 0.12489622831344604 + }, + { + "epoch": 1.4793355744512557, + "step": 14962, + "train/total_loss": 0.19407585263252258 + }, + { + "entropy": 9.00822639465332, + "epoch": 1.4794344473007712, + "mean_token_accuracy": 0.9142496585845947, + "num_tokens": 37153012.0, + "step": 14963, + "train/ce_loss": 0.3321908116340637 + }, + { + "epoch": 1.4794344473007712, + "step": 14963, + "train/sim_loss": 0.030809104442596436 + }, + { + "epoch": 1.4794344473007712, + "step": 14963, + "train/total_loss": 0.06402818858623505 + }, + { + "entropy": 9.407880783081055, + "epoch": 1.4795333201502867, + "mean_token_accuracy": 0.8544973731040955, + "num_tokens": 37166690.0, + "step": 14964, + "train/ce_loss": 0.24436858296394348 + }, + { + "epoch": 1.4795333201502867, + "step": 14964, + "train/sim_loss": 0.062251508235931396 + }, + { + "epoch": 1.4795333201502867, + "step": 14964, + "train/total_loss": 0.08668836951255798 + }, + { + "entropy": 9.025476455688477, + "epoch": 1.4796321929998022, + "mean_token_accuracy": 0.8911564350128174, + "num_tokens": 37175018.0, + "step": 14965, + "train/ce_loss": 0.31371554732322693 + }, + { + "epoch": 1.4796321929998022, + "step": 14965, + "train/sim_loss": 0.0692669153213501 + }, + { + "epoch": 1.4796321929998022, + "step": 14965, + "train/total_loss": 0.10063847154378891 + }, + { + "entropy": 9.251422882080078, + "epoch": 1.4797310658493177, + "mean_token_accuracy": 0.8509895205497742, + "num_tokens": 37187563.0, + "step": 14966, + "train/ce_loss": 0.4302532970905304 + }, + { + "epoch": 1.4797310658493177, + "step": 14966, + "train/sim_loss": 0.05106997489929199 + }, + { + "epoch": 1.4797310658493177, + "step": 14966, + "train/total_loss": 0.09409530460834503 + }, + { + "entropy": 9.185770034790039, + "epoch": 1.4798299386988334, + "mean_token_accuracy": 0.8519448041915894, + "num_tokens": 37200998.0, + "step": 14967, + "train/ce_loss": 0.3412763178348541 + }, + { + "epoch": 1.4798299386988334, + "step": 14967, + "train/sim_loss": 0.0365680456161499 + }, + { + "epoch": 1.4798299386988334, + "step": 14967, + "train/total_loss": 0.07069568336009979 + }, + { + "entropy": 8.935094833374023, + "epoch": 1.4799288115483489, + "mean_token_accuracy": 0.9288343787193298, + "num_tokens": 37216579.0, + "step": 14968, + "train/ce_loss": 0.38909822702407837 + }, + { + "epoch": 1.4799288115483489, + "step": 14968, + "train/sim_loss": 0.019764244556427002 + }, + { + "epoch": 1.4799288115483489, + "step": 14968, + "train/total_loss": 0.05867406725883484 + }, + { + "entropy": 9.194498062133789, + "epoch": 1.4800276843978644, + "mean_token_accuracy": 0.8708661198616028, + "num_tokens": 37231297.0, + "step": 14969, + "train/ce_loss": 0.32727786898612976 + }, + { + "epoch": 1.4800276843978644, + "step": 14969, + "train/sim_loss": 0.034674882888793945 + }, + { + "epoch": 1.4800276843978644, + "step": 14969, + "train/total_loss": 0.0674026757478714 + }, + { + "entropy": 8.961013793945312, + "epoch": 1.4801265572473798, + "mean_token_accuracy": 0.8453159332275391, + "num_tokens": 37238119.0, + "step": 14970, + "train/ce_loss": 0.32489004731178284 + }, + { + "epoch": 1.4801265572473798, + "step": 14970, + "train/sim_loss": 0.026123404502868652 + }, + { + "epoch": 1.4801265572473798, + "step": 14970, + "train/total_loss": 0.058612409979104996 + }, + { + "entropy": 9.227992057800293, + "epoch": 1.4802254300968953, + "mean_token_accuracy": 0.7773584723472595, + "num_tokens": 37250831.0, + "step": 14971, + "train/ce_loss": 0.5027151703834534 + }, + { + "epoch": 1.4802254300968953, + "step": 14971, + "train/sim_loss": 0.06863391399383545 + }, + { + "epoch": 1.4802254300968953, + "step": 14971, + "train/total_loss": 0.1189054325222969 + }, + { + "entropy": 9.01174545288086, + "epoch": 1.480324302946411, + "mean_token_accuracy": 0.8525576591491699, + "num_tokens": 37263691.0, + "step": 14972, + "train/ce_loss": 0.34895455837249756 + }, + { + "epoch": 1.480324302946411, + "step": 14972, + "train/sim_loss": 0.016380369663238525 + }, + { + "epoch": 1.480324302946411, + "step": 14972, + "train/total_loss": 0.0512758269906044 + }, + { + "entropy": 9.403447151184082, + "epoch": 1.4804231757959263, + "mean_token_accuracy": 0.8635703921318054, + "num_tokens": 37274184.0, + "step": 14973, + "train/ce_loss": 0.49240776896476746 + }, + { + "epoch": 1.4804231757959263, + "step": 14973, + "train/sim_loss": 0.06330645084381104 + }, + { + "epoch": 1.4804231757959263, + "step": 14973, + "train/total_loss": 0.11254723370075226 + }, + { + "entropy": 9.186136245727539, + "epoch": 1.480522048645442, + "mean_token_accuracy": 0.8669673204421997, + "num_tokens": 37292358.0, + "step": 14974, + "train/ce_loss": 0.2547745108604431 + }, + { + "epoch": 1.480522048645442, + "step": 14974, + "train/sim_loss": 0.023388266563415527 + }, + { + "epoch": 1.480522048645442, + "step": 14974, + "train/total_loss": 0.04886572062969208 + }, + { + "entropy": 9.44534969329834, + "epoch": 1.4806209214949575, + "mean_token_accuracy": 0.9277108311653137, + "num_tokens": 37306968.0, + "step": 14975, + "train/ce_loss": 0.20727315545082092 + }, + { + "epoch": 1.4806209214949575, + "step": 14975, + "train/sim_loss": 0.040298640727996826 + }, + { + "epoch": 1.4806209214949575, + "step": 14975, + "train/total_loss": 0.0610259547829628 + }, + { + "entropy": 10.025482177734375, + "epoch": 1.480719794344473, + "mean_token_accuracy": 0.9333333373069763, + "num_tokens": 37317402.0, + "step": 14976, + "train/ce_loss": 9.556478062222595e-07 + }, + { + "epoch": 1.480719794344473, + "step": 14976, + "train/sim_loss": 0.02006828784942627 + }, + { + "epoch": 1.480719794344473, + "step": 14976, + "train/total_loss": 0.02006838284432888 + }, + { + "entropy": 9.349868774414062, + "epoch": 1.4808186671939885, + "mean_token_accuracy": 0.8314732313156128, + "num_tokens": 37335216.0, + "step": 14977, + "train/ce_loss": 0.7564713358879089 + }, + { + "epoch": 1.4808186671939885, + "step": 14977, + "train/sim_loss": 0.04090690612792969 + }, + { + "epoch": 1.4808186671939885, + "step": 14977, + "train/total_loss": 0.11655404418706894 + }, + { + "entropy": 9.143832206726074, + "epoch": 1.480917540043504, + "mean_token_accuracy": 0.8800504803657532, + "num_tokens": 37348641.0, + "step": 14978, + "train/ce_loss": 0.3094131648540497 + }, + { + "epoch": 1.480917540043504, + "step": 14978, + "train/sim_loss": 0.1352616548538208 + }, + { + "epoch": 1.480917540043504, + "step": 14978, + "train/total_loss": 0.16620297729969025 + }, + { + "entropy": 9.373424530029297, + "epoch": 1.4810164128930197, + "mean_token_accuracy": 0.8344433903694153, + "num_tokens": 37363412.0, + "step": 14979, + "train/ce_loss": 0.7298264503479004 + }, + { + "epoch": 1.4810164128930197, + "step": 14979, + "train/sim_loss": 0.05621993541717529 + }, + { + "epoch": 1.4810164128930197, + "step": 14979, + "train/total_loss": 0.12920257449150085 + }, + { + "epoch": 1.4811152857425351, + "grad_norm": 0.5739089250564575, + "learning_rate": 6.299015971913168e-06, + "loss": 0.0817, + "step": 14980 + }, + { + "entropy": 9.678909301757812, + "epoch": 1.4811152857425351, + "mean_token_accuracy": 0.8705501556396484, + "num_tokens": 37375668.0, + "step": 14980, + "train/ce_loss": 0.510582685470581 + }, + { + "epoch": 1.4811152857425351, + "step": 14980, + "train/sim_loss": 0.05729860067367554 + }, + { + "epoch": 1.4811152857425351, + "step": 14980, + "train/total_loss": 0.10835687071084976 + }, + { + "entropy": 8.764516830444336, + "epoch": 1.4812141585920506, + "mean_token_accuracy": 0.8489871025085449, + "num_tokens": 37383071.0, + "step": 14981, + "train/ce_loss": 0.47718971967697144 + }, + { + "epoch": 1.4812141585920506, + "step": 14981, + "train/sim_loss": 0.042847514152526855 + }, + { + "epoch": 1.4812141585920506, + "step": 14981, + "train/total_loss": 0.090566486120224 + }, + { + "entropy": 9.473352432250977, + "epoch": 1.4813130314415661, + "mean_token_accuracy": 0.8641390204429626, + "num_tokens": 37403242.0, + "step": 14982, + "train/ce_loss": 3.6534297009893635e-07 + }, + { + "epoch": 1.4813130314415661, + "step": 14982, + "train/sim_loss": 0.030515551567077637 + }, + { + "epoch": 1.4813130314415661, + "step": 14982, + "train/total_loss": 0.03051558881998062 + }, + { + "entropy": 8.859106063842773, + "epoch": 1.4814119042910816, + "mean_token_accuracy": 0.8461538553237915, + "num_tokens": 37414685.0, + "step": 14983, + "train/ce_loss": 0.34229496121406555 + }, + { + "epoch": 1.4814119042910816, + "step": 14983, + "train/sim_loss": 0.034627318382263184 + }, + { + "epoch": 1.4814119042910816, + "step": 14983, + "train/total_loss": 0.06885682046413422 + }, + { + "entropy": 8.908939361572266, + "epoch": 1.4815107771405973, + "mean_token_accuracy": 0.8362745046615601, + "num_tokens": 37425182.0, + "step": 14984, + "train/ce_loss": 0.512554943561554 + }, + { + "epoch": 1.4815107771405973, + "step": 14984, + "train/sim_loss": 0.06527429819107056 + }, + { + "epoch": 1.4815107771405973, + "step": 14984, + "train/total_loss": 0.11652979254722595 + }, + { + "entropy": 8.982105255126953, + "epoch": 1.4816096499901128, + "mean_token_accuracy": 0.8199999928474426, + "num_tokens": 37433679.0, + "step": 14985, + "train/ce_loss": 0.15684229135513306 + }, + { + "epoch": 1.4816096499901128, + "step": 14985, + "train/sim_loss": 0.024489164352416992 + }, + { + "epoch": 1.4816096499901128, + "step": 14985, + "train/total_loss": 0.04017339646816254 + }, + { + "entropy": 9.04867172241211, + "epoch": 1.4817085228396283, + "mean_token_accuracy": 0.8666666746139526, + "num_tokens": 37448111.0, + "step": 14986, + "train/ce_loss": 0.6094263195991516 + }, + { + "epoch": 1.4817085228396283, + "step": 14986, + "train/sim_loss": 0.09310686588287354 + }, + { + "epoch": 1.4817085228396283, + "step": 14986, + "train/total_loss": 0.15404950082302094 + }, + { + "entropy": 9.150197982788086, + "epoch": 1.4818073956891438, + "mean_token_accuracy": 0.8255280256271362, + "num_tokens": 37462921.0, + "step": 14987, + "train/ce_loss": 0.6539767384529114 + }, + { + "epoch": 1.4818073956891438, + "step": 14987, + "train/sim_loss": 0.03963303565979004 + }, + { + "epoch": 1.4818073956891438, + "step": 14987, + "train/total_loss": 0.10503070801496506 + }, + { + "entropy": 9.224123001098633, + "epoch": 1.4819062685386593, + "mean_token_accuracy": 0.8530183434486389, + "num_tokens": 37483071.0, + "step": 14988, + "train/ce_loss": 0.6665726900100708 + }, + { + "epoch": 1.4819062685386593, + "step": 14988, + "train/sim_loss": 0.03077375888824463 + }, + { + "epoch": 1.4819062685386593, + "step": 14988, + "train/total_loss": 0.09743102639913559 + }, + { + "entropy": 9.790525436401367, + "epoch": 1.4820051413881747, + "mean_token_accuracy": 0.89737468957901, + "num_tokens": 37499554.0, + "step": 14989, + "train/ce_loss": 0.6001322269439697 + }, + { + "epoch": 1.4820051413881747, + "step": 14989, + "train/sim_loss": 0.04037576913833618 + }, + { + "epoch": 1.4820051413881747, + "step": 14989, + "train/total_loss": 0.10038898885250092 + }, + { + "entropy": 9.535390853881836, + "epoch": 1.4821040142376902, + "mean_token_accuracy": 0.8818380832672119, + "num_tokens": 37513057.0, + "step": 14990, + "train/ce_loss": 8.806154028206947e-07 + }, + { + "epoch": 1.4821040142376902, + "step": 14990, + "train/sim_loss": 0.0517045259475708 + }, + { + "epoch": 1.4821040142376902, + "step": 14990, + "train/total_loss": 0.051704615354537964 + }, + { + "entropy": 8.811477661132812, + "epoch": 1.482202887087206, + "mean_token_accuracy": 0.8458049893379211, + "num_tokens": 37520634.0, + "step": 14991, + "train/ce_loss": 0.24958525598049164 + }, + { + "epoch": 1.482202887087206, + "step": 14991, + "train/sim_loss": 0.033179521560668945 + }, + { + "epoch": 1.482202887087206, + "step": 14991, + "train/total_loss": 0.05813805013895035 + }, + { + "entropy": 9.39010238647461, + "epoch": 1.4823017599367214, + "mean_token_accuracy": 0.8247422575950623, + "num_tokens": 37533333.0, + "step": 14992, + "train/ce_loss": 0.3155944347381592 + }, + { + "epoch": 1.4823017599367214, + "step": 14992, + "train/sim_loss": 0.03135472536087036 + }, + { + "epoch": 1.4823017599367214, + "step": 14992, + "train/total_loss": 0.0629141703248024 + }, + { + "entropy": 9.417718887329102, + "epoch": 1.482400632786237, + "mean_token_accuracy": 0.8769792914390564, + "num_tokens": 37550052.0, + "step": 14993, + "train/ce_loss": 0.12245060503482819 + }, + { + "epoch": 1.482400632786237, + "step": 14993, + "train/sim_loss": 0.017811059951782227 + }, + { + "epoch": 1.482400632786237, + "step": 14993, + "train/total_loss": 0.030056120827794075 + }, + { + "entropy": 8.927042007446289, + "epoch": 1.4824995056357524, + "mean_token_accuracy": 0.8800557851791382, + "num_tokens": 37564675.0, + "step": 14994, + "train/ce_loss": 1.2315560979914153e-06 + }, + { + "epoch": 1.4824995056357524, + "step": 14994, + "train/sim_loss": 0.02456831932067871 + }, + { + "epoch": 1.4824995056357524, + "step": 14994, + "train/total_loss": 0.02456844225525856 + }, + { + "entropy": 9.556062698364258, + "epoch": 1.4825983784852679, + "mean_token_accuracy": 0.8430717587471008, + "num_tokens": 37572977.0, + "step": 14995, + "train/ce_loss": 0.48424065113067627 + }, + { + "epoch": 1.4825983784852679, + "step": 14995, + "train/sim_loss": 0.03207629919052124 + }, + { + "epoch": 1.4825983784852679, + "step": 14995, + "train/total_loss": 0.08050036430358887 + }, + { + "entropy": 9.087687492370605, + "epoch": 1.4826972513347836, + "mean_token_accuracy": 0.8650472164154053, + "num_tokens": 37586517.0, + "step": 14996, + "train/ce_loss": 0.6573355197906494 + }, + { + "epoch": 1.4826972513347836, + "step": 14996, + "train/sim_loss": 0.10496866703033447 + }, + { + "epoch": 1.4826972513347836, + "step": 14996, + "train/total_loss": 0.17070221900939941 + }, + { + "entropy": 9.470322608947754, + "epoch": 1.482796124184299, + "mean_token_accuracy": 0.8533950448036194, + "num_tokens": 37602775.0, + "step": 14997, + "train/ce_loss": 0.33715882897377014 + }, + { + "epoch": 1.482796124184299, + "step": 14997, + "train/sim_loss": 0.015169858932495117 + }, + { + "epoch": 1.482796124184299, + "step": 14997, + "train/total_loss": 0.04888574406504631 + }, + { + "entropy": 9.078641891479492, + "epoch": 1.4828949970338146, + "mean_token_accuracy": 0.8109559416770935, + "num_tokens": 37619369.0, + "step": 14998, + "train/ce_loss": 0.48327648639678955 + }, + { + "epoch": 1.4828949970338146, + "step": 14998, + "train/sim_loss": 0.023130297660827637 + }, + { + "epoch": 1.4828949970338146, + "step": 14998, + "train/total_loss": 0.07145795226097107 + }, + { + "entropy": 9.478710174560547, + "epoch": 1.48299386988333, + "mean_token_accuracy": 0.8700565099716187, + "num_tokens": 37638930.0, + "step": 14999, + "train/ce_loss": 0.0933859571814537 + }, + { + "epoch": 1.48299386988333, + "step": 14999, + "train/sim_loss": 0.01796621084213257 + }, + { + "epoch": 1.48299386988333, + "step": 14999, + "train/total_loss": 0.02730480581521988 + }, + { + "epoch": 1.4830927427328455, + "grad_norm": 0.5518420934677124, + "learning_rate": 6.2940711071552196e-06, + "loss": 0.0831, + "step": 15000 + }, + { + "entropy": 9.697328567504883, + "epoch": 1.4830927427328455, + "mean_token_accuracy": 0.851190447807312, + "num_tokens": 37654971.0, + "step": 15000, + "train/ce_loss": 1.0557290315628052 + }, + { + "epoch": 1.4830927427328455, + "step": 15000, + "train/sim_loss": 0.09018909931182861 + }, + { + "epoch": 1.4830927427328455, + "step": 15000, + "train/total_loss": 0.1957620084285736 + }, + { + "entropy": 8.777091979980469, + "epoch": 1.483191615582361, + "mean_token_accuracy": 0.8252631425857544, + "num_tokens": 37667371.0, + "step": 15001, + "train/ce_loss": 0.6968262791633606 + }, + { + "epoch": 1.483191615582361, + "step": 15001, + "train/sim_loss": 0.045418739318847656 + }, + { + "epoch": 1.483191615582361, + "step": 15001, + "train/total_loss": 0.11510136723518372 + }, + { + "entropy": 9.69543743133545, + "epoch": 1.4832904884318765, + "mean_token_accuracy": 0.8730158805847168, + "num_tokens": 37681242.0, + "step": 15002, + "train/ce_loss": 0.23158319294452667 + }, + { + "epoch": 1.4832904884318765, + "step": 15002, + "train/sim_loss": 0.045116543769836426 + }, + { + "epoch": 1.4832904884318765, + "step": 15002, + "train/total_loss": 0.06827486306428909 + }, + { + "entropy": 9.30307388305664, + "epoch": 1.4833893612813922, + "mean_token_accuracy": 0.8627167344093323, + "num_tokens": 37695107.0, + "step": 15003, + "train/ce_loss": 0.10638990998268127 + }, + { + "epoch": 1.4833893612813922, + "step": 15003, + "train/sim_loss": 0.029138505458831787 + }, + { + "epoch": 1.4833893612813922, + "step": 15003, + "train/total_loss": 0.039777494966983795 + }, + { + "entropy": 8.98840045928955, + "epoch": 1.4834882341309077, + "mean_token_accuracy": 0.8622366189956665, + "num_tokens": 37702138.0, + "step": 15004, + "train/ce_loss": 0.41536223888397217 + }, + { + "epoch": 1.4834882341309077, + "step": 15004, + "train/sim_loss": 0.010127663612365723 + }, + { + "epoch": 1.4834882341309077, + "step": 15004, + "train/total_loss": 0.05166388675570488 + }, + { + "entropy": 8.85542106628418, + "epoch": 1.4835871069804232, + "mean_token_accuracy": 0.8349206447601318, + "num_tokens": 37711225.0, + "step": 15005, + "train/ce_loss": 0.38252782821655273 + }, + { + "epoch": 1.4835871069804232, + "step": 15005, + "train/sim_loss": 0.03484225273132324 + }, + { + "epoch": 1.4835871069804232, + "step": 15005, + "train/total_loss": 0.07309503853321075 + }, + { + "entropy": 9.212516784667969, + "epoch": 1.4836859798299387, + "mean_token_accuracy": 0.8099063038825989, + "num_tokens": 37723941.0, + "step": 15006, + "train/ce_loss": 0.8433322906494141 + }, + { + "epoch": 1.4836859798299387, + "step": 15006, + "train/sim_loss": 0.08137869834899902 + }, + { + "epoch": 1.4836859798299387, + "step": 15006, + "train/total_loss": 0.16571193933486938 + }, + { + "entropy": 9.723735809326172, + "epoch": 1.4837848526794541, + "mean_token_accuracy": 0.8776978254318237, + "num_tokens": 37738281.0, + "step": 15007, + "train/ce_loss": 0.2824016213417053 + }, + { + "epoch": 1.4837848526794541, + "step": 15007, + "train/sim_loss": 0.06737279891967773 + }, + { + "epoch": 1.4837848526794541, + "step": 15007, + "train/total_loss": 0.09561295807361603 + }, + { + "entropy": 9.296063423156738, + "epoch": 1.4838837255289699, + "mean_token_accuracy": 0.8777292370796204, + "num_tokens": 37756726.0, + "step": 15008, + "train/ce_loss": 0.40038272738456726 + }, + { + "epoch": 1.4838837255289699, + "step": 15008, + "train/sim_loss": 0.03516209125518799 + }, + { + "epoch": 1.4838837255289699, + "step": 15008, + "train/total_loss": 0.07520036399364471 + }, + { + "entropy": 10.023046493530273, + "epoch": 1.4839825983784853, + "mean_token_accuracy": 0.9008264541625977, + "num_tokens": 37773581.0, + "step": 15009, + "train/ce_loss": 0.21850013732910156 + }, + { + "epoch": 1.4839825983784853, + "step": 15009, + "train/sim_loss": 0.027268826961517334 + }, + { + "epoch": 1.4839825983784853, + "step": 15009, + "train/total_loss": 0.04911883920431137 + }, + { + "entropy": 9.419977188110352, + "epoch": 1.4840814712280008, + "mean_token_accuracy": 0.8523706793785095, + "num_tokens": 37788213.0, + "step": 15010, + "train/ce_loss": 0.3484366238117218 + }, + { + "epoch": 1.4840814712280008, + "step": 15010, + "train/sim_loss": 0.07552093267440796 + }, + { + "epoch": 1.4840814712280008, + "step": 15010, + "train/total_loss": 0.11036460101604462 + }, + { + "entropy": 9.108331680297852, + "epoch": 1.4841803440775163, + "mean_token_accuracy": 0.8360128402709961, + "num_tokens": 37796898.0, + "step": 15011, + "train/ce_loss": 0.5818582773208618 + }, + { + "epoch": 1.4841803440775163, + "step": 15011, + "train/sim_loss": 0.04172360897064209 + }, + { + "epoch": 1.4841803440775163, + "step": 15011, + "train/total_loss": 0.09990943968296051 + }, + { + "entropy": 9.183599472045898, + "epoch": 1.4842792169270318, + "mean_token_accuracy": 0.8352272510528564, + "num_tokens": 37811244.0, + "step": 15012, + "train/ce_loss": 0.26306504011154175 + }, + { + "epoch": 1.4842792169270318, + "step": 15012, + "train/sim_loss": 0.0190962553024292 + }, + { + "epoch": 1.4842792169270318, + "step": 15012, + "train/total_loss": 0.045402757823467255 + }, + { + "entropy": 9.627036094665527, + "epoch": 1.4843780897765473, + "mean_token_accuracy": 0.83753502368927, + "num_tokens": 37825890.0, + "step": 15013, + "train/ce_loss": 0.532468855381012 + }, + { + "epoch": 1.4843780897765473, + "step": 15013, + "train/sim_loss": 0.028573215007781982 + }, + { + "epoch": 1.4843780897765473, + "step": 15013, + "train/total_loss": 0.08182010054588318 + }, + { + "entropy": 9.485824584960938, + "epoch": 1.4844769626260628, + "mean_token_accuracy": 0.8441558480262756, + "num_tokens": 37841782.0, + "step": 15014, + "train/ce_loss": 0.3573329746723175 + }, + { + "epoch": 1.4844769626260628, + "step": 15014, + "train/sim_loss": 0.012578248977661133 + }, + { + "epoch": 1.4844769626260628, + "step": 15014, + "train/total_loss": 0.04831154644489288 + }, + { + "entropy": 8.989956855773926, + "epoch": 1.4845758354755785, + "mean_token_accuracy": 0.82594233751297, + "num_tokens": 37853129.0, + "step": 15015, + "train/ce_loss": 0.4186013638973236 + }, + { + "epoch": 1.4845758354755785, + "step": 15015, + "train/sim_loss": 0.01238870620727539 + }, + { + "epoch": 1.4845758354755785, + "step": 15015, + "train/total_loss": 0.05424884334206581 + }, + { + "entropy": 9.729019165039062, + "epoch": 1.484674708325094, + "mean_token_accuracy": 0.8590425252914429, + "num_tokens": 37866092.0, + "step": 15016, + "train/ce_loss": 8.065437100412964e-07 + }, + { + "epoch": 1.484674708325094, + "step": 15016, + "train/sim_loss": 0.032153964042663574 + }, + { + "epoch": 1.484674708325094, + "step": 15016, + "train/total_loss": 0.03215404599905014 + }, + { + "entropy": 9.027633666992188, + "epoch": 1.4847735811746094, + "mean_token_accuracy": 0.7944936156272888, + "num_tokens": 37880738.0, + "step": 15017, + "train/ce_loss": 0.4926283657550812 + }, + { + "epoch": 1.4847735811746094, + "step": 15017, + "train/sim_loss": 0.14000678062438965 + }, + { + "epoch": 1.4847735811746094, + "step": 15017, + "train/total_loss": 0.18926961719989777 + }, + { + "entropy": 9.583025932312012, + "epoch": 1.484872454024125, + "mean_token_accuracy": 0.8600237369537354, + "num_tokens": 37896122.0, + "step": 15018, + "train/ce_loss": 0.6336471438407898 + }, + { + "epoch": 1.484872454024125, + "step": 15018, + "train/sim_loss": 0.087005615234375 + }, + { + "epoch": 1.484872454024125, + "step": 15018, + "train/total_loss": 0.15037032961845398 + }, + { + "entropy": 9.356805801391602, + "epoch": 1.4849713268736404, + "mean_token_accuracy": 0.798895001411438, + "num_tokens": 37903729.0, + "step": 15019, + "train/ce_loss": 1.8282980818185024e-07 + }, + { + "epoch": 1.4849713268736404, + "step": 15019, + "train/sim_loss": 0.008507609367370605 + }, + { + "epoch": 1.4849713268736404, + "step": 15019, + "train/total_loss": 0.008507627993822098 + }, + { + "epoch": 1.4850701997231561, + "grad_norm": 0.5553337931632996, + "learning_rate": 6.289126242397271e-06, + "loss": 0.089, + "step": 15020 + }, + { + "entropy": 9.375971794128418, + "epoch": 1.4850701997231561, + "mean_token_accuracy": 0.8115429878234863, + "num_tokens": 37919627.0, + "step": 15020, + "train/ce_loss": 0.596939206123352 + }, + { + "epoch": 1.4850701997231561, + "step": 15020, + "train/sim_loss": 0.051547348499298096 + }, + { + "epoch": 1.4850701997231561, + "step": 15020, + "train/total_loss": 0.11124126613140106 + }, + { + "entropy": 9.345190048217773, + "epoch": 1.4851690725726716, + "mean_token_accuracy": 0.8737863898277283, + "num_tokens": 37933410.0, + "step": 15021, + "train/ce_loss": 0.4877208471298218 + }, + { + "epoch": 1.4851690725726716, + "step": 15021, + "train/sim_loss": 0.07528603076934814 + }, + { + "epoch": 1.4851690725726716, + "step": 15021, + "train/total_loss": 0.12405811250209808 + }, + { + "entropy": 9.430085182189941, + "epoch": 1.485267945422187, + "mean_token_accuracy": 0.9273885488510132, + "num_tokens": 37954657.0, + "step": 15022, + "train/ce_loss": 0.2768864631652832 + }, + { + "epoch": 1.485267945422187, + "step": 15022, + "train/sim_loss": 0.013282299041748047 + }, + { + "epoch": 1.485267945422187, + "step": 15022, + "train/total_loss": 0.04097094386816025 + }, + { + "entropy": 9.088557243347168, + "epoch": 1.4853668182717026, + "mean_token_accuracy": 0.8180019855499268, + "num_tokens": 37968291.0, + "step": 15023, + "train/ce_loss": 0.38117414712905884 + }, + { + "epoch": 1.4853668182717026, + "step": 15023, + "train/sim_loss": 0.02355480194091797 + }, + { + "epoch": 1.4853668182717026, + "step": 15023, + "train/total_loss": 0.06167221814393997 + }, + { + "entropy": 9.611122131347656, + "epoch": 1.485465691121218, + "mean_token_accuracy": 0.8505997657775879, + "num_tokens": 37982264.0, + "step": 15024, + "train/ce_loss": 0.18757201731204987 + }, + { + "epoch": 1.485465691121218, + "step": 15024, + "train/sim_loss": 0.03268975019454956 + }, + { + "epoch": 1.485465691121218, + "step": 15024, + "train/total_loss": 0.05144695192575455 + }, + { + "entropy": 9.691186904907227, + "epoch": 1.4855645639707336, + "mean_token_accuracy": 0.9069767594337463, + "num_tokens": 37993021.0, + "step": 15025, + "train/ce_loss": 0.6585802435874939 + }, + { + "epoch": 1.4855645639707336, + "step": 15025, + "train/sim_loss": 0.04816889762878418 + }, + { + "epoch": 1.4855645639707336, + "step": 15025, + "train/total_loss": 0.11402692645788193 + }, + { + "entropy": 9.39810562133789, + "epoch": 1.485663436820249, + "mean_token_accuracy": 0.925000011920929, + "num_tokens": 38003910.0, + "step": 15026, + "train/ce_loss": 0.38655999302864075 + }, + { + "epoch": 1.485663436820249, + "step": 15026, + "train/sim_loss": 0.011219620704650879 + }, + { + "epoch": 1.485663436820249, + "step": 15026, + "train/total_loss": 0.04987562075257301 + }, + { + "entropy": 9.104606628417969, + "epoch": 1.4857623096697647, + "mean_token_accuracy": 0.8537499904632568, + "num_tokens": 38017258.0, + "step": 15027, + "train/ce_loss": 0.37872809171676636 + }, + { + "epoch": 1.4857623096697647, + "step": 15027, + "train/sim_loss": 0.04969131946563721 + }, + { + "epoch": 1.4857623096697647, + "step": 15027, + "train/total_loss": 0.0875641256570816 + }, + { + "entropy": 9.412891387939453, + "epoch": 1.4858611825192802, + "mean_token_accuracy": 0.8581173419952393, + "num_tokens": 38027880.0, + "step": 15028, + "train/ce_loss": 0.38527539372444153 + }, + { + "epoch": 1.4858611825192802, + "step": 15028, + "train/sim_loss": 0.013897061347961426 + }, + { + "epoch": 1.4858611825192802, + "step": 15028, + "train/total_loss": 0.0524246022105217 + }, + { + "entropy": 9.62893295288086, + "epoch": 1.4859600553687957, + "mean_token_accuracy": 0.9133663177490234, + "num_tokens": 38044617.0, + "step": 15029, + "train/ce_loss": 0.542843222618103 + }, + { + "epoch": 1.4859600553687957, + "step": 15029, + "train/sim_loss": 0.03560394048690796 + }, + { + "epoch": 1.4859600553687957, + "step": 15029, + "train/total_loss": 0.08988825976848602 + }, + { + "entropy": 9.634407997131348, + "epoch": 1.4860589282183112, + "mean_token_accuracy": 0.9061033129692078, + "num_tokens": 38062059.0, + "step": 15030, + "train/ce_loss": 5.988380280541605e-07 + }, + { + "epoch": 1.4860589282183112, + "step": 15030, + "train/sim_loss": 0.05767303705215454 + }, + { + "epoch": 1.4860589282183112, + "step": 15030, + "train/total_loss": 0.057673096656799316 + }, + { + "entropy": 9.583456039428711, + "epoch": 1.4861578010678267, + "mean_token_accuracy": 0.8986615538597107, + "num_tokens": 38072149.0, + "step": 15031, + "train/ce_loss": 0.27093204855918884 + }, + { + "epoch": 1.4861578010678267, + "step": 15031, + "train/sim_loss": 0.041261136531829834 + }, + { + "epoch": 1.4861578010678267, + "step": 15031, + "train/total_loss": 0.06835433840751648 + }, + { + "entropy": 9.210660934448242, + "epoch": 1.4862566739173424, + "mean_token_accuracy": 0.8637565970420837, + "num_tokens": 38082985.0, + "step": 15032, + "train/ce_loss": 0.3919694423675537 + }, + { + "epoch": 1.4862566739173424, + "step": 15032, + "train/sim_loss": 0.03508031368255615 + }, + { + "epoch": 1.4862566739173424, + "step": 15032, + "train/total_loss": 0.07427725940942764 + }, + { + "entropy": 9.720458984375, + "epoch": 1.4863555467668579, + "mean_token_accuracy": 0.8770949840545654, + "num_tokens": 38100944.0, + "step": 15033, + "train/ce_loss": 0.5267914533615112 + }, + { + "epoch": 1.4863555467668579, + "step": 15033, + "train/sim_loss": 0.0356406569480896 + }, + { + "epoch": 1.4863555467668579, + "step": 15033, + "train/total_loss": 0.0883198082447052 + }, + { + "entropy": 10.137519836425781, + "epoch": 1.4864544196163734, + "mean_token_accuracy": 0.869328498840332, + "num_tokens": 38115895.0, + "step": 15034, + "train/ce_loss": 0.45481863617897034 + }, + { + "epoch": 1.4864544196163734, + "step": 15034, + "train/sim_loss": 0.043496549129486084 + }, + { + "epoch": 1.4864544196163734, + "step": 15034, + "train/total_loss": 0.08897840976715088 + }, + { + "entropy": 9.134880065917969, + "epoch": 1.4865532924658889, + "mean_token_accuracy": 0.8747345805168152, + "num_tokens": 38131290.0, + "step": 15035, + "train/ce_loss": 0.41917744278907776 + }, + { + "epoch": 1.4865532924658889, + "step": 15035, + "train/sim_loss": 0.03490900993347168 + }, + { + "epoch": 1.4865532924658889, + "step": 15035, + "train/total_loss": 0.07682675123214722 + }, + { + "entropy": 9.30974292755127, + "epoch": 1.4866521653154043, + "mean_token_accuracy": 0.8265641927719116, + "num_tokens": 38147316.0, + "step": 15036, + "train/ce_loss": 0.5399686098098755 + }, + { + "epoch": 1.4866521653154043, + "step": 15036, + "train/sim_loss": 0.04519015550613403 + }, + { + "epoch": 1.4866521653154043, + "step": 15036, + "train/total_loss": 0.09918701648712158 + }, + { + "entropy": 9.516191482543945, + "epoch": 1.48675103816492, + "mean_token_accuracy": 0.8189189434051514, + "num_tokens": 38158925.0, + "step": 15037, + "train/ce_loss": 0.6673029065132141 + }, + { + "epoch": 1.48675103816492, + "step": 15037, + "train/sim_loss": 0.019002676010131836 + }, + { + "epoch": 1.48675103816492, + "step": 15037, + "train/total_loss": 0.08573296666145325 + }, + { + "entropy": 9.605680465698242, + "epoch": 1.4868499110144353, + "mean_token_accuracy": 0.8518518805503845, + "num_tokens": 38172539.0, + "step": 15038, + "train/ce_loss": 0.47947344183921814 + }, + { + "epoch": 1.4868499110144353, + "step": 15038, + "train/sim_loss": 0.0640726089477539 + }, + { + "epoch": 1.4868499110144353, + "step": 15038, + "train/total_loss": 0.11201995611190796 + }, + { + "entropy": 9.715400695800781, + "epoch": 1.486948783863951, + "mean_token_accuracy": 0.876838207244873, + "num_tokens": 38182488.0, + "step": 15039, + "train/ce_loss": 0.34612810611724854 + }, + { + "epoch": 1.486948783863951, + "step": 15039, + "train/sim_loss": 0.041818976402282715 + }, + { + "epoch": 1.486948783863951, + "step": 15039, + "train/total_loss": 0.07643178850412369 + }, + { + "epoch": 1.4870476567134665, + "grad_norm": 0.6400157809257507, + "learning_rate": 6.284181377639322e-06, + "loss": 0.0818, + "step": 15040 + }, + { + "entropy": 9.447501182556152, + "epoch": 1.4870476567134665, + "mean_token_accuracy": 0.9297820925712585, + "num_tokens": 38196731.0, + "step": 15040, + "train/ce_loss": 4.144866636579536e-07 + }, + { + "epoch": 1.4870476567134665, + "step": 15040, + "train/sim_loss": 0.03199207782745361 + }, + { + "epoch": 1.4870476567134665, + "step": 15040, + "train/total_loss": 0.031992118805646896 + }, + { + "entropy": 9.30866813659668, + "epoch": 1.487146529562982, + "mean_token_accuracy": 0.8630490899085999, + "num_tokens": 38207705.0, + "step": 15041, + "train/ce_loss": 0.4646686017513275 + }, + { + "epoch": 1.487146529562982, + "step": 15041, + "train/sim_loss": 0.02159249782562256 + }, + { + "epoch": 1.487146529562982, + "step": 15041, + "train/total_loss": 0.06805935502052307 + }, + { + "entropy": 9.218036651611328, + "epoch": 1.4872454024124975, + "mean_token_accuracy": 0.8689138293266296, + "num_tokens": 38222368.0, + "step": 15042, + "train/ce_loss": 0.6077502965927124 + }, + { + "epoch": 1.4872454024124975, + "step": 15042, + "train/sim_loss": 0.07647514343261719 + }, + { + "epoch": 1.4872454024124975, + "step": 15042, + "train/total_loss": 0.1372501701116562 + }, + { + "entropy": 9.923735618591309, + "epoch": 1.487344275262013, + "mean_token_accuracy": 0.8619718551635742, + "num_tokens": 38235478.0, + "step": 15043, + "train/ce_loss": 0.6698992252349854 + }, + { + "epoch": 1.487344275262013, + "step": 15043, + "train/sim_loss": 0.04156160354614258 + }, + { + "epoch": 1.487344275262013, + "step": 15043, + "train/total_loss": 0.108551524579525 + }, + { + "entropy": 9.53133773803711, + "epoch": 1.4874431481115287, + "mean_token_accuracy": 0.877284586429596, + "num_tokens": 38252823.0, + "step": 15044, + "train/ce_loss": 0.6462368369102478 + }, + { + "epoch": 1.4874431481115287, + "step": 15044, + "train/sim_loss": 0.02024078369140625 + }, + { + "epoch": 1.4874431481115287, + "step": 15044, + "train/total_loss": 0.08486446738243103 + }, + { + "entropy": 9.24500846862793, + "epoch": 1.4875420209610442, + "mean_token_accuracy": 0.8070443868637085, + "num_tokens": 38265724.0, + "step": 15045, + "train/ce_loss": 0.40111681818962097 + }, + { + "epoch": 1.4875420209610442, + "step": 15045, + "train/sim_loss": 0.04785799980163574 + }, + { + "epoch": 1.4875420209610442, + "step": 15045, + "train/total_loss": 0.08796968311071396 + }, + { + "entropy": 9.708577156066895, + "epoch": 1.4876408938105596, + "mean_token_accuracy": 0.8329048752784729, + "num_tokens": 38276429.0, + "step": 15046, + "train/ce_loss": 0.5552307963371277 + }, + { + "epoch": 1.4876408938105596, + "step": 15046, + "train/sim_loss": 0.07189762592315674 + }, + { + "epoch": 1.4876408938105596, + "step": 15046, + "train/total_loss": 0.12742070853710175 + }, + { + "entropy": 8.889612197875977, + "epoch": 1.4877397666600751, + "mean_token_accuracy": 0.8612021803855896, + "num_tokens": 38287422.0, + "step": 15047, + "train/ce_loss": 0.44768792390823364 + }, + { + "epoch": 1.4877397666600751, + "step": 15047, + "train/sim_loss": 0.026941895484924316 + }, + { + "epoch": 1.4877397666600751, + "step": 15047, + "train/total_loss": 0.07171069085597992 + }, + { + "entropy": 9.399898529052734, + "epoch": 1.4878386395095906, + "mean_token_accuracy": 0.876288652420044, + "num_tokens": 38301963.0, + "step": 15048, + "train/ce_loss": 5.579263415711466e-07 + }, + { + "epoch": 1.4878386395095906, + "step": 15048, + "train/sim_loss": 0.05673617124557495 + }, + { + "epoch": 1.4878386395095906, + "step": 15048, + "train/total_loss": 0.05673622712492943 + }, + { + "entropy": 9.339942932128906, + "epoch": 1.4879375123591063, + "mean_token_accuracy": 0.8180661797523499, + "num_tokens": 38312277.0, + "step": 15049, + "train/ce_loss": 0.5009257197380066 + }, + { + "epoch": 1.4879375123591063, + "step": 15049, + "train/sim_loss": 0.0930449366569519 + }, + { + "epoch": 1.4879375123591063, + "step": 15049, + "train/total_loss": 0.14313751459121704 + }, + { + "entropy": 9.037216186523438, + "epoch": 1.4880363852086216, + "mean_token_accuracy": 0.8010416626930237, + "num_tokens": 38322025.0, + "step": 15050, + "train/ce_loss": 0.5754390358924866 + }, + { + "epoch": 1.4880363852086216, + "step": 15050, + "train/sim_loss": 0.036963820457458496 + }, + { + "epoch": 1.4880363852086216, + "step": 15050, + "train/total_loss": 0.09450772404670715 + }, + { + "entropy": 9.574355125427246, + "epoch": 1.4881352580581373, + "mean_token_accuracy": 0.8766520023345947, + "num_tokens": 38337288.0, + "step": 15051, + "train/ce_loss": 2.1717901290685404e-06 + }, + { + "epoch": 1.4881352580581373, + "step": 15051, + "train/sim_loss": 0.03552430868148804 + }, + { + "epoch": 1.4881352580581373, + "step": 15051, + "train/total_loss": 0.03552452474832535 + }, + { + "entropy": 9.322052955627441, + "epoch": 1.4882341309076528, + "mean_token_accuracy": 0.7989276051521301, + "num_tokens": 38349143.0, + "step": 15052, + "train/ce_loss": 0.6195496320724487 + }, + { + "epoch": 1.4882341309076528, + "step": 15052, + "train/sim_loss": 0.03303110599517822 + }, + { + "epoch": 1.4882341309076528, + "step": 15052, + "train/total_loss": 0.09498606622219086 + }, + { + "entropy": 9.482124328613281, + "epoch": 1.4883330037571683, + "mean_token_accuracy": 0.8437132835388184, + "num_tokens": 38362173.0, + "step": 15053, + "train/ce_loss": 0.3980512022972107 + }, + { + "epoch": 1.4883330037571683, + "step": 15053, + "train/sim_loss": 0.03193235397338867 + }, + { + "epoch": 1.4883330037571683, + "step": 15053, + "train/total_loss": 0.07173747569322586 + }, + { + "entropy": 8.71811580657959, + "epoch": 1.4884318766066837, + "mean_token_accuracy": 0.8230366706848145, + "num_tokens": 38372936.0, + "step": 15054, + "train/ce_loss": 0.6891573667526245 + }, + { + "epoch": 1.4884318766066837, + "step": 15054, + "train/sim_loss": 0.04925501346588135 + }, + { + "epoch": 1.4884318766066837, + "step": 15054, + "train/total_loss": 0.11817075312137604 + }, + { + "entropy": 9.48864459991455, + "epoch": 1.4885307494561992, + "mean_token_accuracy": 0.8664921522140503, + "num_tokens": 38384509.0, + "step": 15055, + "train/ce_loss": 0.184811532497406 + }, + { + "epoch": 1.4885307494561992, + "step": 15055, + "train/sim_loss": 0.0427626371383667 + }, + { + "epoch": 1.4885307494561992, + "step": 15055, + "train/total_loss": 0.06124379113316536 + }, + { + "entropy": 9.827169418334961, + "epoch": 1.488629622305715, + "mean_token_accuracy": 0.8740554451942444, + "num_tokens": 38397470.0, + "step": 15056, + "train/ce_loss": 1.0713346004486084 + }, + { + "epoch": 1.488629622305715, + "step": 15056, + "train/sim_loss": 0.02613741159439087 + }, + { + "epoch": 1.488629622305715, + "step": 15056, + "train/total_loss": 0.13327087461948395 + }, + { + "entropy": 9.158239364624023, + "epoch": 1.4887284951552304, + "mean_token_accuracy": 0.9013453125953674, + "num_tokens": 38410419.0, + "step": 15057, + "train/ce_loss": 8.666713711136254e-07 + }, + { + "epoch": 1.4887284951552304, + "step": 15057, + "train/sim_loss": 0.057074904441833496 + }, + { + "epoch": 1.4887284951552304, + "step": 15057, + "train/total_loss": 0.05707499012351036 + }, + { + "entropy": 9.913228988647461, + "epoch": 1.488827368004746, + "mean_token_accuracy": 0.9069767594337463, + "num_tokens": 38422149.0, + "step": 15058, + "train/ce_loss": 1.3362330264499178e-06 + }, + { + "epoch": 1.488827368004746, + "step": 15058, + "train/sim_loss": 0.038802504539489746 + }, + { + "epoch": 1.488827368004746, + "step": 15058, + "train/total_loss": 0.03880263864994049 + }, + { + "entropy": 9.149731636047363, + "epoch": 1.4889262408542614, + "mean_token_accuracy": 0.8532675504684448, + "num_tokens": 38431211.0, + "step": 15059, + "train/ce_loss": 0.5078800916671753 + }, + { + "epoch": 1.4889262408542614, + "step": 15059, + "train/sim_loss": 0.010540485382080078 + }, + { + "epoch": 1.4889262408542614, + "step": 15059, + "train/total_loss": 0.061328496783971786 + }, + { + "epoch": 1.4890251137037769, + "grad_norm": 0.49739110469818115, + "learning_rate": 6.279236512881374e-06, + "loss": 0.0875, + "step": 15060 + }, + { + "entropy": 9.127708435058594, + "epoch": 1.4890251137037769, + "mean_token_accuracy": 0.8489751815795898, + "num_tokens": 38444467.0, + "step": 15060, + "train/ce_loss": 0.45487964153289795 + }, + { + "epoch": 1.4890251137037769, + "step": 15060, + "train/sim_loss": 0.0221748948097229 + }, + { + "epoch": 1.4890251137037769, + "step": 15060, + "train/total_loss": 0.06766286492347717 + }, + { + "entropy": 9.223292350769043, + "epoch": 1.4891239865532926, + "mean_token_accuracy": 0.8730702996253967, + "num_tokens": 38460299.0, + "step": 15061, + "train/ce_loss": 0.6124764680862427 + }, + { + "epoch": 1.4891239865532926, + "step": 15061, + "train/sim_loss": 0.03660404682159424 + }, + { + "epoch": 1.4891239865532926, + "step": 15061, + "train/total_loss": 0.0978516936302185 + }, + { + "entropy": 9.427467346191406, + "epoch": 1.4892228594028079, + "mean_token_accuracy": 0.8333333134651184, + "num_tokens": 38473131.0, + "step": 15062, + "train/ce_loss": 0.25700902938842773 + }, + { + "epoch": 1.4892228594028079, + "step": 15062, + "train/sim_loss": 0.025446772575378418 + }, + { + "epoch": 1.4892228594028079, + "step": 15062, + "train/total_loss": 0.05114767700433731 + }, + { + "entropy": 9.600116729736328, + "epoch": 1.4893217322523236, + "mean_token_accuracy": 0.8782051205635071, + "num_tokens": 38490828.0, + "step": 15063, + "train/ce_loss": 0.7033687233924866 + }, + { + "epoch": 1.4893217322523236, + "step": 15063, + "train/sim_loss": 0.0558319091796875 + }, + { + "epoch": 1.4893217322523236, + "step": 15063, + "train/total_loss": 0.12616878747940063 + }, + { + "entropy": 9.288564682006836, + "epoch": 1.489420605101839, + "mean_token_accuracy": 0.840104877948761, + "num_tokens": 38499834.0, + "step": 15064, + "train/ce_loss": 0.2816846966743469 + }, + { + "epoch": 1.489420605101839, + "step": 15064, + "train/sim_loss": 0.050326406955718994 + }, + { + "epoch": 1.489420605101839, + "step": 15064, + "train/total_loss": 0.07849487662315369 + }, + { + "entropy": 9.758413314819336, + "epoch": 1.4895194779513545, + "mean_token_accuracy": 0.9055117964744568, + "num_tokens": 38509208.0, + "step": 15065, + "train/ce_loss": 4.3302932795086235e-07 + }, + { + "epoch": 1.4895194779513545, + "step": 15065, + "train/sim_loss": 0.017262637615203857 + }, + { + "epoch": 1.4895194779513545, + "step": 15065, + "train/total_loss": 0.01726268045604229 + }, + { + "entropy": 9.859883308410645, + "epoch": 1.48961835080087, + "mean_token_accuracy": 0.8402515649795532, + "num_tokens": 38526675.0, + "step": 15066, + "train/ce_loss": 0.5729032158851624 + }, + { + "epoch": 1.48961835080087, + "step": 15066, + "train/sim_loss": 0.023862600326538086 + }, + { + "epoch": 1.48961835080087, + "step": 15066, + "train/total_loss": 0.08115292340517044 + }, + { + "entropy": 9.735730171203613, + "epoch": 1.4897172236503855, + "mean_token_accuracy": 0.8541666865348816, + "num_tokens": 38537118.0, + "step": 15067, + "train/ce_loss": 8.973643161880318e-07 + }, + { + "epoch": 1.4897172236503855, + "step": 15067, + "train/sim_loss": 0.014823377132415771 + }, + { + "epoch": 1.4897172236503855, + "step": 15067, + "train/total_loss": 0.014823466539382935 + }, + { + "entropy": 9.216903686523438, + "epoch": 1.4898160964999012, + "mean_token_accuracy": 0.8710144758224487, + "num_tokens": 38545017.0, + "step": 15068, + "train/ce_loss": 0.8333630561828613 + }, + { + "epoch": 1.4898160964999012, + "step": 15068, + "train/sim_loss": 0.03721243143081665 + }, + { + "epoch": 1.4898160964999012, + "step": 15068, + "train/total_loss": 0.12054874002933502 + }, + { + "entropy": 9.265979766845703, + "epoch": 1.4899149693494167, + "mean_token_accuracy": 0.8583815097808838, + "num_tokens": 38559137.0, + "step": 15069, + "train/ce_loss": 0.6744340062141418 + }, + { + "epoch": 1.4899149693494167, + "step": 15069, + "train/sim_loss": 0.03824174404144287 + }, + { + "epoch": 1.4899149693494167, + "step": 15069, + "train/total_loss": 0.10568514466285706 + }, + { + "entropy": 9.699445724487305, + "epoch": 1.4900138421989322, + "mean_token_accuracy": 0.8817005753517151, + "num_tokens": 38574695.0, + "step": 15070, + "train/ce_loss": 0.7470896244049072 + }, + { + "epoch": 1.4900138421989322, + "step": 15070, + "train/sim_loss": 0.028379738330841064 + }, + { + "epoch": 1.4900138421989322, + "step": 15070, + "train/total_loss": 0.10308869928121567 + }, + { + "entropy": 9.587093353271484, + "epoch": 1.4901127150484477, + "mean_token_accuracy": 0.8525252342224121, + "num_tokens": 38589080.0, + "step": 15071, + "train/ce_loss": 0.44644486904144287 + }, + { + "epoch": 1.4901127150484477, + "step": 15071, + "train/sim_loss": 0.034824371337890625 + }, + { + "epoch": 1.4901127150484477, + "step": 15071, + "train/total_loss": 0.07946886122226715 + }, + { + "entropy": 9.185014724731445, + "epoch": 1.4902115878979632, + "mean_token_accuracy": 0.8231368064880371, + "num_tokens": 38597348.0, + "step": 15072, + "train/ce_loss": 0.30644625425338745 + }, + { + "epoch": 1.4902115878979632, + "step": 15072, + "train/sim_loss": 0.03303927183151245 + }, + { + "epoch": 1.4902115878979632, + "step": 15072, + "train/total_loss": 0.0636838972568512 + }, + { + "entropy": 9.244428634643555, + "epoch": 1.4903104607474789, + "mean_token_accuracy": 0.8391061425209045, + "num_tokens": 38615587.0, + "step": 15073, + "train/ce_loss": 0.3997326195240021 + }, + { + "epoch": 1.4903104607474789, + "step": 15073, + "train/sim_loss": 0.030806899070739746 + }, + { + "epoch": 1.4903104607474789, + "step": 15073, + "train/total_loss": 0.07078015804290771 + }, + { + "entropy": 9.684181213378906, + "epoch": 1.4904093335969943, + "mean_token_accuracy": 0.8847352266311646, + "num_tokens": 38625149.0, + "step": 15074, + "train/ce_loss": 0.5107548236846924 + }, + { + "epoch": 1.4904093335969943, + "step": 15074, + "train/sim_loss": 0.0503389835357666 + }, + { + "epoch": 1.4904093335969943, + "step": 15074, + "train/total_loss": 0.10141447186470032 + }, + { + "entropy": 9.852960586547852, + "epoch": 1.4905082064465098, + "mean_token_accuracy": 0.8733509182929993, + "num_tokens": 38636704.0, + "step": 15075, + "train/ce_loss": 3.3340606364617997e-07 + }, + { + "epoch": 1.4905082064465098, + "step": 15075, + "train/sim_loss": 0.021953701972961426 + }, + { + "epoch": 1.4905082064465098, + "step": 15075, + "train/total_loss": 0.021953735500574112 + }, + { + "entropy": 9.420707702636719, + "epoch": 1.4906070792960253, + "mean_token_accuracy": 0.84429532289505, + "num_tokens": 38646474.0, + "step": 15076, + "train/ce_loss": 0.3868057429790497 + }, + { + "epoch": 1.4906070792960253, + "step": 15076, + "train/sim_loss": 0.01939976215362549 + }, + { + "epoch": 1.4906070792960253, + "step": 15076, + "train/total_loss": 0.058080337941646576 + }, + { + "entropy": 9.281779289245605, + "epoch": 1.4907059521455408, + "mean_token_accuracy": 0.8512763381004333, + "num_tokens": 38666255.0, + "step": 15077, + "train/ce_loss": 0.1871093362569809 + }, + { + "epoch": 1.4907059521455408, + "step": 15077, + "train/sim_loss": 0.016238033771514893 + }, + { + "epoch": 1.4907059521455408, + "step": 15077, + "train/total_loss": 0.03494896739721298 + }, + { + "entropy": 9.394889831542969, + "epoch": 1.4908048249950563, + "mean_token_accuracy": 0.8708920478820801, + "num_tokens": 38673126.0, + "step": 15078, + "train/ce_loss": 1.0263111591339111 + }, + { + "epoch": 1.4908048249950563, + "step": 15078, + "train/sim_loss": 0.05401664972305298 + }, + { + "epoch": 1.4908048249950563, + "step": 15078, + "train/total_loss": 0.15664777159690857 + }, + { + "entropy": 9.322502136230469, + "epoch": 1.4909036978445718, + "mean_token_accuracy": 0.8370565176010132, + "num_tokens": 38691263.0, + "step": 15079, + "train/ce_loss": 0.4925348162651062 + }, + { + "epoch": 1.4909036978445718, + "step": 15079, + "train/sim_loss": 0.06421053409576416 + }, + { + "epoch": 1.4909036978445718, + "step": 15079, + "train/total_loss": 0.11346401274204254 + }, + { + "epoch": 1.4910025706940875, + "grad_norm": 0.588395893573761, + "learning_rate": 6.274291648123424e-06, + "loss": 0.0806, + "step": 15080 + }, + { + "entropy": 9.229768753051758, + "epoch": 1.4910025706940875, + "mean_token_accuracy": 0.8290598392486572, + "num_tokens": 38700902.0, + "step": 15080, + "train/ce_loss": 0.5068921446800232 + }, + { + "epoch": 1.4910025706940875, + "step": 15080, + "train/sim_loss": 0.047586917877197266 + }, + { + "epoch": 1.4910025706940875, + "step": 15080, + "train/total_loss": 0.09827613830566406 + }, + { + "entropy": 9.970208168029785, + "epoch": 1.491101443543603, + "mean_token_accuracy": 0.8193384408950806, + "num_tokens": 38710566.0, + "step": 15081, + "train/ce_loss": 0.8383012413978577 + }, + { + "epoch": 1.491101443543603, + "step": 15081, + "train/sim_loss": 0.1500263214111328 + }, + { + "epoch": 1.491101443543603, + "step": 15081, + "train/total_loss": 0.2338564395904541 + }, + { + "entropy": 8.789587020874023, + "epoch": 1.4912003163931185, + "mean_token_accuracy": 0.8483353853225708, + "num_tokens": 38724568.0, + "step": 15082, + "train/ce_loss": 2.9574138693533314e-07 + }, + { + "epoch": 1.4912003163931185, + "step": 15082, + "train/sim_loss": 0.02993786334991455 + }, + { + "epoch": 1.4912003163931185, + "step": 15082, + "train/total_loss": 0.02993789315223694 + }, + { + "entropy": 9.922070503234863, + "epoch": 1.491299189242634, + "mean_token_accuracy": 0.8836292028427124, + "num_tokens": 38736631.0, + "step": 15083, + "train/ce_loss": 0.4959407150745392 + }, + { + "epoch": 1.491299189242634, + "step": 15083, + "train/sim_loss": 0.03520917892456055 + }, + { + "epoch": 1.491299189242634, + "step": 15083, + "train/total_loss": 0.0848032534122467 + }, + { + "entropy": 9.420791625976562, + "epoch": 1.4913980620921494, + "mean_token_accuracy": 0.8783217072486877, + "num_tokens": 38746212.0, + "step": 15084, + "train/ce_loss": 0.27729350328445435 + }, + { + "epoch": 1.4913980620921494, + "step": 15084, + "train/sim_loss": 0.04236185550689697 + }, + { + "epoch": 1.4913980620921494, + "step": 15084, + "train/total_loss": 0.07009120285511017 + }, + { + "entropy": 9.548210144042969, + "epoch": 1.4914969349416651, + "mean_token_accuracy": 0.858048141002655, + "num_tokens": 38759203.0, + "step": 15085, + "train/ce_loss": 2.344877430004999e-07 + }, + { + "epoch": 1.4914969349416651, + "step": 15085, + "train/sim_loss": 0.012863874435424805 + }, + { + "epoch": 1.4914969349416651, + "step": 15085, + "train/total_loss": 0.01286389771848917 + }, + { + "entropy": 9.696563720703125, + "epoch": 1.4915958077911806, + "mean_token_accuracy": 0.8982758522033691, + "num_tokens": 38776526.0, + "step": 15086, + "train/ce_loss": 4.51344362772943e-07 + }, + { + "epoch": 1.4915958077911806, + "step": 15086, + "train/sim_loss": 0.0267411470413208 + }, + { + "epoch": 1.4915958077911806, + "step": 15086, + "train/total_loss": 0.026741191744804382 + }, + { + "entropy": 9.447992324829102, + "epoch": 1.491694680640696, + "mean_token_accuracy": 0.818652868270874, + "num_tokens": 38788149.0, + "step": 15087, + "train/ce_loss": 1.050789478540537e-06 + }, + { + "epoch": 1.491694680640696, + "step": 15087, + "train/sim_loss": 0.0359417200088501 + }, + { + "epoch": 1.491694680640696, + "step": 15087, + "train/total_loss": 0.035941824316978455 + }, + { + "entropy": 9.468624114990234, + "epoch": 1.4917935534902116, + "mean_token_accuracy": 0.9262820482254028, + "num_tokens": 38803228.0, + "step": 15088, + "train/ce_loss": 8.053897317950032e-07 + }, + { + "epoch": 1.4917935534902116, + "step": 15088, + "train/sim_loss": 0.03589874505996704 + }, + { + "epoch": 1.4917935534902116, + "step": 15088, + "train/total_loss": 0.03589882701635361 + }, + { + "entropy": 9.59033489227295, + "epoch": 1.491892426339727, + "mean_token_accuracy": 0.8481228947639465, + "num_tokens": 38816285.0, + "step": 15089, + "train/ce_loss": 3.1246275966623216e-07 + }, + { + "epoch": 1.491892426339727, + "step": 15089, + "train/sim_loss": 0.0271570086479187 + }, + { + "epoch": 1.491892426339727, + "step": 15089, + "train/total_loss": 0.027157040312886238 + }, + { + "entropy": 9.577157974243164, + "epoch": 1.4919912991892426, + "mean_token_accuracy": 0.9110764265060425, + "num_tokens": 38832412.0, + "step": 15090, + "train/ce_loss": 0.31030452251434326 + }, + { + "epoch": 1.4919912991892426, + "step": 15090, + "train/sim_loss": 0.01174473762512207 + }, + { + "epoch": 1.4919912991892426, + "step": 15090, + "train/total_loss": 0.042775191366672516 + }, + { + "entropy": 9.397069931030273, + "epoch": 1.492090172038758, + "mean_token_accuracy": 0.7989247441291809, + "num_tokens": 38844960.0, + "step": 15091, + "train/ce_loss": 0.7876615524291992 + }, + { + "epoch": 1.492090172038758, + "step": 15091, + "train/sim_loss": 0.07125508785247803 + }, + { + "epoch": 1.492090172038758, + "step": 15091, + "train/total_loss": 0.1500212550163269 + }, + { + "entropy": 9.522782325744629, + "epoch": 1.4921890448882738, + "mean_token_accuracy": 0.822047233581543, + "num_tokens": 38857440.0, + "step": 15092, + "train/ce_loss": 0.9352394938468933 + }, + { + "epoch": 1.4921890448882738, + "step": 15092, + "train/sim_loss": 0.05247950553894043 + }, + { + "epoch": 1.4921890448882738, + "step": 15092, + "train/total_loss": 0.14600345492362976 + }, + { + "entropy": 9.175798416137695, + "epoch": 1.4922879177377892, + "mean_token_accuracy": 0.8330039381980896, + "num_tokens": 38869721.0, + "step": 15093, + "train/ce_loss": 0.645945131778717 + }, + { + "epoch": 1.4922879177377892, + "step": 15093, + "train/sim_loss": 0.04654175043106079 + }, + { + "epoch": 1.4922879177377892, + "step": 15093, + "train/total_loss": 0.11113626509904861 + }, + { + "entropy": 9.21624755859375, + "epoch": 1.4923867905873047, + "mean_token_accuracy": 0.8714285492897034, + "num_tokens": 38878519.0, + "step": 15094, + "train/ce_loss": 0.3192640542984009 + }, + { + "epoch": 1.4923867905873047, + "step": 15094, + "train/sim_loss": 0.03313225507736206 + }, + { + "epoch": 1.4923867905873047, + "step": 15094, + "train/total_loss": 0.06505866348743439 + }, + { + "entropy": 9.140603065490723, + "epoch": 1.4924856634368202, + "mean_token_accuracy": 0.853630006313324, + "num_tokens": 38889399.0, + "step": 15095, + "train/ce_loss": 0.5797489881515503 + }, + { + "epoch": 1.4924856634368202, + "step": 15095, + "train/sim_loss": 0.03588545322418213 + }, + { + "epoch": 1.4924856634368202, + "step": 15095, + "train/total_loss": 0.09386035799980164 + }, + { + "entropy": 9.88005256652832, + "epoch": 1.4925845362863357, + "mean_token_accuracy": 0.9095744490623474, + "num_tokens": 38904404.0, + "step": 15096, + "train/ce_loss": 1.4279455626819981e-06 + }, + { + "epoch": 1.4925845362863357, + "step": 15096, + "train/sim_loss": 0.02254265546798706 + }, + { + "epoch": 1.4925845362863357, + "step": 15096, + "train/total_loss": 0.02254279889166355 + }, + { + "entropy": 9.621406555175781, + "epoch": 1.4926834091358514, + "mean_token_accuracy": 0.9147058725357056, + "num_tokens": 38919598.0, + "step": 15097, + "train/ce_loss": 0.3179495930671692 + }, + { + "epoch": 1.4926834091358514, + "step": 15097, + "train/sim_loss": 0.020297706127166748 + }, + { + "epoch": 1.4926834091358514, + "step": 15097, + "train/total_loss": 0.052092667669057846 + }, + { + "entropy": 9.592304229736328, + "epoch": 1.4927822819853669, + "mean_token_accuracy": 0.8353253602981567, + "num_tokens": 38937914.0, + "step": 15098, + "train/ce_loss": 0.4557726979255676 + }, + { + "epoch": 1.4927822819853669, + "step": 15098, + "train/sim_loss": 0.04842585325241089 + }, + { + "epoch": 1.4927822819853669, + "step": 15098, + "train/total_loss": 0.09400312602519989 + }, + { + "entropy": 9.10334587097168, + "epoch": 1.4928811548348824, + "mean_token_accuracy": 0.8579610586166382, + "num_tokens": 38951032.0, + "step": 15099, + "train/ce_loss": 0.27396711707115173 + }, + { + "epoch": 1.4928811548348824, + "step": 15099, + "train/sim_loss": 0.02772200107574463 + }, + { + "epoch": 1.4928811548348824, + "step": 15099, + "train/total_loss": 0.05511871352791786 + }, + { + "epoch": 1.4929800276843979, + "grad_norm": 0.5351325273513794, + "learning_rate": 6.2693467833654755e-06, + "loss": 0.0817, + "step": 15100 + }, + { + "entropy": 9.326388359069824, + "epoch": 1.4929800276843979, + "mean_token_accuracy": 0.8875501751899719, + "num_tokens": 38962256.0, + "step": 15100, + "train/ce_loss": 0.39044323563575745 + }, + { + "epoch": 1.4929800276843979, + "step": 15100, + "train/sim_loss": 0.06495046615600586 + }, + { + "epoch": 1.4929800276843979, + "step": 15100, + "train/total_loss": 0.10399478673934937 + }, + { + "entropy": 9.407894134521484, + "epoch": 1.4930789005339133, + "mean_token_accuracy": 0.8310626745223999, + "num_tokens": 38975226.0, + "step": 15101, + "train/ce_loss": 0.7197384238243103 + }, + { + "epoch": 1.4930789005339133, + "step": 15101, + "train/sim_loss": 0.03178000450134277 + }, + { + "epoch": 1.4930789005339133, + "step": 15101, + "train/total_loss": 0.10375384986400604 + }, + { + "entropy": 9.850854873657227, + "epoch": 1.4931777733834288, + "mean_token_accuracy": 0.8414815068244934, + "num_tokens": 38985697.0, + "step": 15102, + "train/ce_loss": 0.2315627932548523 + }, + { + "epoch": 1.4931777733834288, + "step": 15102, + "train/sim_loss": 0.06556868553161621 + }, + { + "epoch": 1.4931777733834288, + "step": 15102, + "train/total_loss": 0.08872496336698532 + }, + { + "entropy": 9.333404541015625, + "epoch": 1.4932766462329443, + "mean_token_accuracy": 0.8805031180381775, + "num_tokens": 38995030.0, + "step": 15103, + "train/ce_loss": 1.7932161426870152e-06 + }, + { + "epoch": 1.4932766462329443, + "step": 15103, + "train/sim_loss": 0.047268688678741455 + }, + { + "epoch": 1.4932766462329443, + "step": 15103, + "train/total_loss": 0.04726886749267578 + }, + { + "entropy": 9.269002914428711, + "epoch": 1.49337551908246, + "mean_token_accuracy": 0.8296893239021301, + "num_tokens": 39007202.0, + "step": 15104, + "train/ce_loss": 0.45452332496643066 + }, + { + "epoch": 1.49337551908246, + "step": 15104, + "train/sim_loss": 0.045235633850097656 + }, + { + "epoch": 1.49337551908246, + "step": 15104, + "train/total_loss": 0.09068796783685684 + }, + { + "entropy": 9.335432052612305, + "epoch": 1.4934743919319755, + "mean_token_accuracy": 0.8352365493774414, + "num_tokens": 39023403.0, + "step": 15105, + "train/ce_loss": 0.5627084970474243 + }, + { + "epoch": 1.4934743919319755, + "step": 15105, + "train/sim_loss": 0.013565123081207275 + }, + { + "epoch": 1.4934743919319755, + "step": 15105, + "train/total_loss": 0.06983597576618195 + }, + { + "entropy": 9.156750679016113, + "epoch": 1.493573264781491, + "mean_token_accuracy": 0.8263624906539917, + "num_tokens": 39034012.0, + "step": 15106, + "train/ce_loss": 0.7707264423370361 + }, + { + "epoch": 1.493573264781491, + "step": 15106, + "train/sim_loss": 0.12235128879547119 + }, + { + "epoch": 1.493573264781491, + "step": 15106, + "train/total_loss": 0.19942393898963928 + }, + { + "entropy": 9.180400848388672, + "epoch": 1.4936721376310065, + "mean_token_accuracy": 0.845645010471344, + "num_tokens": 39047417.0, + "step": 15107, + "train/ce_loss": 0.35235854983329773 + }, + { + "epoch": 1.4936721376310065, + "step": 15107, + "train/sim_loss": 0.011587262153625488 + }, + { + "epoch": 1.4936721376310065, + "step": 15107, + "train/total_loss": 0.04682311788201332 + }, + { + "entropy": 8.951709747314453, + "epoch": 1.493771010480522, + "mean_token_accuracy": 0.8978723287582397, + "num_tokens": 39059044.0, + "step": 15108, + "train/ce_loss": 0.48555853962898254 + }, + { + "epoch": 1.493771010480522, + "step": 15108, + "train/sim_loss": 0.030707478523254395 + }, + { + "epoch": 1.493771010480522, + "step": 15108, + "train/total_loss": 0.07926332950592041 + }, + { + "entropy": 8.896417617797852, + "epoch": 1.4938698833300377, + "mean_token_accuracy": 0.8511293530464172, + "num_tokens": 39067092.0, + "step": 15109, + "train/ce_loss": 0.46877947449684143 + }, + { + "epoch": 1.4938698833300377, + "step": 15109, + "train/sim_loss": 0.05056893825531006 + }, + { + "epoch": 1.4938698833300377, + "step": 15109, + "train/total_loss": 0.09744688868522644 + }, + { + "entropy": 9.015594482421875, + "epoch": 1.4939687561795532, + "mean_token_accuracy": 0.8128019571304321, + "num_tokens": 39078613.0, + "step": 15110, + "train/ce_loss": 0.43006113171577454 + }, + { + "epoch": 1.4939687561795532, + "step": 15110, + "train/sim_loss": 0.020022213459014893 + }, + { + "epoch": 1.4939687561795532, + "step": 15110, + "train/total_loss": 0.06302832812070847 + }, + { + "entropy": 9.914295196533203, + "epoch": 1.4940676290290686, + "mean_token_accuracy": 0.875, + "num_tokens": 39096210.0, + "step": 15111, + "train/ce_loss": 0.3406418263912201 + }, + { + "epoch": 1.4940676290290686, + "step": 15111, + "train/sim_loss": 0.029797136783599854 + }, + { + "epoch": 1.4940676290290686, + "step": 15111, + "train/total_loss": 0.06386132538318634 + }, + { + "entropy": 9.446602821350098, + "epoch": 1.4941665018785841, + "mean_token_accuracy": 0.8587075471878052, + "num_tokens": 39113708.0, + "step": 15112, + "train/ce_loss": 0.14905966818332672 + }, + { + "epoch": 1.4941665018785841, + "step": 15112, + "train/sim_loss": 0.05249983072280884 + }, + { + "epoch": 1.4941665018785841, + "step": 15112, + "train/total_loss": 0.06740579754114151 + }, + { + "entropy": 9.174368858337402, + "epoch": 1.4942653747280996, + "mean_token_accuracy": 0.8692660331726074, + "num_tokens": 39125786.0, + "step": 15113, + "train/ce_loss": 0.8675083518028259 + }, + { + "epoch": 1.4942653747280996, + "step": 15113, + "train/sim_loss": 0.04370284080505371 + }, + { + "epoch": 1.4942653747280996, + "step": 15113, + "train/total_loss": 0.1304536759853363 + }, + { + "entropy": 9.381421089172363, + "epoch": 1.4943642475776153, + "mean_token_accuracy": 0.8769230842590332, + "num_tokens": 39135023.0, + "step": 15114, + "train/ce_loss": 0.45703423023223877 + }, + { + "epoch": 1.4943642475776153, + "step": 15114, + "train/sim_loss": 0.08700805902481079 + }, + { + "epoch": 1.4943642475776153, + "step": 15114, + "train/total_loss": 0.1327114850282669 + }, + { + "entropy": 9.045553207397461, + "epoch": 1.4944631204271306, + "mean_token_accuracy": 0.8340857625007629, + "num_tokens": 39147377.0, + "step": 15115, + "train/ce_loss": 0.5432165861129761 + }, + { + "epoch": 1.4944631204271306, + "step": 15115, + "train/sim_loss": 0.06033909320831299 + }, + { + "epoch": 1.4944631204271306, + "step": 15115, + "train/total_loss": 0.11466075479984283 + }, + { + "entropy": 8.800740242004395, + "epoch": 1.4945619932766463, + "mean_token_accuracy": 0.8608981370925903, + "num_tokens": 39157812.0, + "step": 15116, + "train/ce_loss": 0.2845962345600128 + }, + { + "epoch": 1.4945619932766463, + "step": 15116, + "train/sim_loss": 0.07160937786102295 + }, + { + "epoch": 1.4945619932766463, + "step": 15116, + "train/total_loss": 0.10006900131702423 + }, + { + "entropy": 9.565103530883789, + "epoch": 1.4946608661261618, + "mean_token_accuracy": 0.8992933034896851, + "num_tokens": 39169468.0, + "step": 15117, + "train/ce_loss": 0.44130608439445496 + }, + { + "epoch": 1.4946608661261618, + "step": 15117, + "train/sim_loss": 0.030487418174743652 + }, + { + "epoch": 1.4946608661261618, + "step": 15117, + "train/total_loss": 0.07461802661418915 + }, + { + "entropy": 9.334700584411621, + "epoch": 1.4947597389756773, + "mean_token_accuracy": 0.8182957172393799, + "num_tokens": 39179945.0, + "step": 15118, + "train/ce_loss": 2.032453494393849e-06 + }, + { + "epoch": 1.4947597389756773, + "step": 15118, + "train/sim_loss": 0.02910017967224121 + }, + { + "epoch": 1.4947597389756773, + "step": 15118, + "train/total_loss": 0.029100382700562477 + }, + { + "entropy": 9.112356185913086, + "epoch": 1.4948586118251928, + "mean_token_accuracy": 0.8396573066711426, + "num_tokens": 39190609.0, + "step": 15119, + "train/ce_loss": 0.22022198140621185 + }, + { + "epoch": 1.4948586118251928, + "step": 15119, + "train/sim_loss": 0.029355883598327637 + }, + { + "epoch": 1.4948586118251928, + "step": 15119, + "train/total_loss": 0.05137808248400688 + }, + { + "epoch": 1.4949574846747082, + "grad_norm": 0.5262286067008972, + "learning_rate": 6.264401918607527e-06, + "loss": 0.0862, + "step": 15120 + }, + { + "entropy": 9.02835464477539, + "epoch": 1.4949574846747082, + "mean_token_accuracy": 0.8454039096832275, + "num_tokens": 39198443.0, + "step": 15120, + "train/ce_loss": 0.3107323944568634 + }, + { + "epoch": 1.4949574846747082, + "step": 15120, + "train/sim_loss": 0.05002391338348389 + }, + { + "epoch": 1.4949574846747082, + "step": 15120, + "train/total_loss": 0.08109715580940247 + }, + { + "entropy": 9.40757942199707, + "epoch": 1.495056357524224, + "mean_token_accuracy": 0.8222748637199402, + "num_tokens": 39212363.0, + "step": 15121, + "train/ce_loss": 0.9470800161361694 + }, + { + "epoch": 1.495056357524224, + "step": 15121, + "train/sim_loss": 0.031493425369262695 + }, + { + "epoch": 1.495056357524224, + "step": 15121, + "train/total_loss": 0.12620142102241516 + }, + { + "entropy": 8.966063499450684, + "epoch": 1.4951552303737394, + "mean_token_accuracy": 0.8994565010070801, + "num_tokens": 39222862.0, + "step": 15122, + "train/ce_loss": 0.11533279716968536 + }, + { + "epoch": 1.4951552303737394, + "step": 15122, + "train/sim_loss": 0.017707109451293945 + }, + { + "epoch": 1.4951552303737394, + "step": 15122, + "train/total_loss": 0.029240388423204422 + }, + { + "entropy": 9.153676986694336, + "epoch": 1.495254103223255, + "mean_token_accuracy": 0.8664731383323669, + "num_tokens": 39234223.0, + "step": 15123, + "train/ce_loss": 0.21789298951625824 + }, + { + "epoch": 1.495254103223255, + "step": 15123, + "train/sim_loss": 0.08968222141265869 + }, + { + "epoch": 1.495254103223255, + "step": 15123, + "train/total_loss": 0.1114715188741684 + }, + { + "entropy": 9.244321823120117, + "epoch": 1.4953529760727704, + "mean_token_accuracy": 0.8444148898124695, + "num_tokens": 39243171.0, + "step": 15124, + "train/ce_loss": 1.9783475124768302e-07 + }, + { + "epoch": 1.4953529760727704, + "step": 15124, + "train/sim_loss": 0.015195906162261963 + }, + { + "epoch": 1.4953529760727704, + "step": 15124, + "train/total_loss": 0.01519592572003603 + }, + { + "entropy": 9.194463729858398, + "epoch": 1.4954518489222859, + "mean_token_accuracy": 0.8206967115402222, + "num_tokens": 39256264.0, + "step": 15125, + "train/ce_loss": 0.5151622295379639 + }, + { + "epoch": 1.4954518489222859, + "step": 15125, + "train/sim_loss": 0.055383265018463135 + }, + { + "epoch": 1.4954518489222859, + "step": 15125, + "train/total_loss": 0.10689948499202728 + }, + { + "entropy": 9.320720672607422, + "epoch": 1.4955507217718016, + "mean_token_accuracy": 0.8283752799034119, + "num_tokens": 39270227.0, + "step": 15126, + "train/ce_loss": 0.46002131700515747 + }, + { + "epoch": 1.4955507217718016, + "step": 15126, + "train/sim_loss": 0.07493829727172852 + }, + { + "epoch": 1.4955507217718016, + "step": 15126, + "train/total_loss": 0.1209404319524765 + }, + { + "entropy": 9.347113609313965, + "epoch": 1.4956495946213169, + "mean_token_accuracy": 0.8231552243232727, + "num_tokens": 39281671.0, + "step": 15127, + "train/ce_loss": 0.1649942696094513 + }, + { + "epoch": 1.4956495946213169, + "step": 15127, + "train/sim_loss": 0.05170208215713501 + }, + { + "epoch": 1.4956495946213169, + "step": 15127, + "train/total_loss": 0.06820151209831238 + }, + { + "entropy": 9.075918197631836, + "epoch": 1.4957484674708326, + "mean_token_accuracy": 0.8482353091239929, + "num_tokens": 39293658.0, + "step": 15128, + "train/ce_loss": 0.25280413031578064 + }, + { + "epoch": 1.4957484674708326, + "step": 15128, + "train/sim_loss": 0.022802352905273438 + }, + { + "epoch": 1.4957484674708326, + "step": 15128, + "train/total_loss": 0.04808276891708374 + }, + { + "entropy": 9.061079025268555, + "epoch": 1.495847340320348, + "mean_token_accuracy": 0.9081419706344604, + "num_tokens": 39304343.0, + "step": 15129, + "train/ce_loss": 0.22485138475894928 + }, + { + "epoch": 1.495847340320348, + "step": 15129, + "train/sim_loss": 0.011426210403442383 + }, + { + "epoch": 1.495847340320348, + "step": 15129, + "train/total_loss": 0.03391134738922119 + }, + { + "entropy": 9.856714248657227, + "epoch": 1.4959462131698635, + "mean_token_accuracy": 0.9503311514854431, + "num_tokens": 39317171.0, + "step": 15130, + "train/ce_loss": 0.2178579419851303 + }, + { + "epoch": 1.4959462131698635, + "step": 15130, + "train/sim_loss": 0.044953882694244385 + }, + { + "epoch": 1.4959462131698635, + "step": 15130, + "train/total_loss": 0.06673967838287354 + }, + { + "entropy": 8.639551162719727, + "epoch": 1.496045086019379, + "mean_token_accuracy": 0.8388888835906982, + "num_tokens": 39325565.0, + "step": 15131, + "train/ce_loss": 0.5222164392471313 + }, + { + "epoch": 1.496045086019379, + "step": 15131, + "train/sim_loss": 0.04413944482803345 + }, + { + "epoch": 1.496045086019379, + "step": 15131, + "train/total_loss": 0.09636108577251434 + }, + { + "entropy": 9.104244232177734, + "epoch": 1.4961439588688945, + "mean_token_accuracy": 0.8626373410224915, + "num_tokens": 39334215.0, + "step": 15132, + "train/ce_loss": 0.16356493532657623 + }, + { + "epoch": 1.4961439588688945, + "step": 15132, + "train/sim_loss": 0.03187054395675659 + }, + { + "epoch": 1.4961439588688945, + "step": 15132, + "train/total_loss": 0.048227038234472275 + }, + { + "entropy": 8.85458755493164, + "epoch": 1.4962428317184102, + "mean_token_accuracy": 0.8765957355499268, + "num_tokens": 39343564.0, + "step": 15133, + "train/ce_loss": 0.4598280191421509 + }, + { + "epoch": 1.4962428317184102, + "step": 15133, + "train/sim_loss": 0.013328492641448975 + }, + { + "epoch": 1.4962428317184102, + "step": 15133, + "train/total_loss": 0.05931129679083824 + }, + { + "entropy": 9.479982376098633, + "epoch": 1.4963417045679257, + "mean_token_accuracy": 0.850923478603363, + "num_tokens": 39357937.0, + "step": 15134, + "train/ce_loss": 0.29182419180870056 + }, + { + "epoch": 1.4963417045679257, + "step": 15134, + "train/sim_loss": 0.03513222932815552 + }, + { + "epoch": 1.4963417045679257, + "step": 15134, + "train/total_loss": 0.06431464850902557 + }, + { + "entropy": 8.953836441040039, + "epoch": 1.4964405774174412, + "mean_token_accuracy": 0.8585858345031738, + "num_tokens": 39371717.0, + "step": 15135, + "train/ce_loss": 0.5412020087242126 + }, + { + "epoch": 1.4964405774174412, + "step": 15135, + "train/sim_loss": 0.030631303787231445 + }, + { + "epoch": 1.4964405774174412, + "step": 15135, + "train/total_loss": 0.08475150167942047 + }, + { + "entropy": 9.757759094238281, + "epoch": 1.4965394502669567, + "mean_token_accuracy": 0.8480392098426819, + "num_tokens": 39391674.0, + "step": 15136, + "train/ce_loss": 1.4459557178270188e-06 + }, + { + "epoch": 1.4965394502669567, + "step": 15136, + "train/sim_loss": 0.01737082004547119 + }, + { + "epoch": 1.4965394502669567, + "step": 15136, + "train/total_loss": 0.01737096533179283 + }, + { + "entropy": 9.672212600708008, + "epoch": 1.4966383231164722, + "mean_token_accuracy": 0.8421875238418579, + "num_tokens": 39406335.0, + "step": 15137, + "train/ce_loss": 0.6836280226707458 + }, + { + "epoch": 1.4966383231164722, + "step": 15137, + "train/sim_loss": 0.06010735034942627 + }, + { + "epoch": 1.4966383231164722, + "step": 15137, + "train/total_loss": 0.12847015261650085 + }, + { + "entropy": 8.991621017456055, + "epoch": 1.4967371959659879, + "mean_token_accuracy": 0.8470588326454163, + "num_tokens": 39420314.0, + "step": 15138, + "train/ce_loss": 0.7409477233886719 + }, + { + "epoch": 1.4967371959659879, + "step": 15138, + "train/sim_loss": 0.06917834281921387 + }, + { + "epoch": 1.4967371959659879, + "step": 15138, + "train/total_loss": 0.14327311515808105 + }, + { + "entropy": 9.144347190856934, + "epoch": 1.4968360688155031, + "mean_token_accuracy": 0.8842224478721619, + "num_tokens": 39433289.0, + "step": 15139, + "train/ce_loss": 0.4912078082561493 + }, + { + "epoch": 1.4968360688155031, + "step": 15139, + "train/sim_loss": 0.008229553699493408 + }, + { + "epoch": 1.4968360688155031, + "step": 15139, + "train/total_loss": 0.05735033378005028 + }, + { + "epoch": 1.4969349416650188, + "grad_norm": 0.5193765759468079, + "learning_rate": 6.259457053849578e-06, + "loss": 0.0834, + "step": 15140 + }, + { + "entropy": 9.19943904876709, + "epoch": 1.4969349416650188, + "mean_token_accuracy": 0.8512696623802185, + "num_tokens": 39446186.0, + "step": 15140, + "train/ce_loss": 0.5645666122436523 + }, + { + "epoch": 1.4969349416650188, + "step": 15140, + "train/sim_loss": 0.028725266456604004 + }, + { + "epoch": 1.4969349416650188, + "step": 15140, + "train/total_loss": 0.08518192917108536 + }, + { + "entropy": 8.990560531616211, + "epoch": 1.4970338145145343, + "mean_token_accuracy": 0.8769792914390564, + "num_tokens": 39457026.0, + "step": 15141, + "train/ce_loss": 0.2901891767978668 + }, + { + "epoch": 1.4970338145145343, + "step": 15141, + "train/sim_loss": 0.06941831111907959 + }, + { + "epoch": 1.4970338145145343, + "step": 15141, + "train/total_loss": 0.09843722730875015 + }, + { + "entropy": 9.05801773071289, + "epoch": 1.4971326873640498, + "mean_token_accuracy": 0.800000011920929, + "num_tokens": 39464792.0, + "step": 15142, + "train/ce_loss": 0.3939250409603119 + }, + { + "epoch": 1.4971326873640498, + "step": 15142, + "train/sim_loss": 0.03928118944168091 + }, + { + "epoch": 1.4971326873640498, + "step": 15142, + "train/total_loss": 0.07867369055747986 + }, + { + "entropy": 8.935766220092773, + "epoch": 1.4972315602135653, + "mean_token_accuracy": 0.820135772228241, + "num_tokens": 39475351.0, + "step": 15143, + "train/ce_loss": 0.41509220004081726 + }, + { + "epoch": 1.4972315602135653, + "step": 15143, + "train/sim_loss": 0.0433272123336792 + }, + { + "epoch": 1.4972315602135653, + "step": 15143, + "train/total_loss": 0.0848364382982254 + }, + { + "entropy": 9.514236450195312, + "epoch": 1.4973304330630808, + "mean_token_accuracy": 0.8833107352256775, + "num_tokens": 39491218.0, + "step": 15144, + "train/ce_loss": 0.3476286232471466 + }, + { + "epoch": 1.4973304330630808, + "step": 15144, + "train/sim_loss": 0.035509586334228516 + }, + { + "epoch": 1.4973304330630808, + "step": 15144, + "train/total_loss": 0.07027244567871094 + }, + { + "entropy": 9.423131942749023, + "epoch": 1.4974293059125965, + "mean_token_accuracy": 0.8335208296775818, + "num_tokens": 39504380.0, + "step": 15145, + "train/ce_loss": 0.21937082707881927 + }, + { + "epoch": 1.4974293059125965, + "step": 15145, + "train/sim_loss": 0.016483306884765625 + }, + { + "epoch": 1.4974293059125965, + "step": 15145, + "train/total_loss": 0.03842039033770561 + }, + { + "entropy": 10.003029823303223, + "epoch": 1.497528178762112, + "mean_token_accuracy": 0.8925233483314514, + "num_tokens": 39522487.0, + "step": 15146, + "train/ce_loss": 3.709926943429309e-07 + }, + { + "epoch": 1.497528178762112, + "step": 15146, + "train/sim_loss": 0.01649320125579834 + }, + { + "epoch": 1.497528178762112, + "step": 15146, + "train/total_loss": 0.016493238508701324 + }, + { + "entropy": 9.31668758392334, + "epoch": 1.4976270516116275, + "mean_token_accuracy": 0.819639265537262, + "num_tokens": 39534358.0, + "step": 15147, + "train/ce_loss": 0.6010738611221313 + }, + { + "epoch": 1.4976270516116275, + "step": 15147, + "train/sim_loss": 0.07731258869171143 + }, + { + "epoch": 1.4976270516116275, + "step": 15147, + "train/total_loss": 0.13741996884346008 + }, + { + "entropy": 9.52307415008545, + "epoch": 1.497725924461143, + "mean_token_accuracy": 0.8774080276489258, + "num_tokens": 39552615.0, + "step": 15148, + "train/ce_loss": 0.35120099782943726 + }, + { + "epoch": 1.497725924461143, + "step": 15148, + "train/sim_loss": 0.015827596187591553 + }, + { + "epoch": 1.497725924461143, + "step": 15148, + "train/total_loss": 0.05094769597053528 + }, + { + "entropy": 9.487222671508789, + "epoch": 1.4978247973106584, + "mean_token_accuracy": 0.9621052742004395, + "num_tokens": 39568995.0, + "step": 15149, + "train/ce_loss": 0.28547269105911255 + }, + { + "epoch": 1.4978247973106584, + "step": 15149, + "train/sim_loss": 0.018096864223480225 + }, + { + "epoch": 1.4978247973106584, + "step": 15149, + "train/total_loss": 0.04664413630962372 + }, + { + "entropy": 9.000682830810547, + "epoch": 1.4979236701601741, + "mean_token_accuracy": 0.8602673411369324, + "num_tokens": 39583542.0, + "step": 15150, + "train/ce_loss": 0.6049697399139404 + }, + { + "epoch": 1.4979236701601741, + "step": 15150, + "train/sim_loss": 0.018468737602233887 + }, + { + "epoch": 1.4979236701601741, + "step": 15150, + "train/total_loss": 0.07896570861339569 + }, + { + "entropy": 9.794321060180664, + "epoch": 1.4980225430096896, + "mean_token_accuracy": 0.8600953817367554, + "num_tokens": 39591891.0, + "step": 15151, + "train/ce_loss": 0.5904937982559204 + }, + { + "epoch": 1.4980225430096896, + "step": 15151, + "train/sim_loss": 0.0504223108291626 + }, + { + "epoch": 1.4980225430096896, + "step": 15151, + "train/total_loss": 0.10947169363498688 + }, + { + "entropy": 9.488121032714844, + "epoch": 1.498121415859205, + "mean_token_accuracy": 0.9090909361839294, + "num_tokens": 39606954.0, + "step": 15152, + "train/ce_loss": 1.280383912671823e-06 + }, + { + "epoch": 1.498121415859205, + "step": 15152, + "train/sim_loss": 0.018891751766204834 + }, + { + "epoch": 1.498121415859205, + "step": 15152, + "train/total_loss": 0.01889188028872013 + }, + { + "entropy": 8.808496475219727, + "epoch": 1.4982202887087206, + "mean_token_accuracy": 0.8700696229934692, + "num_tokens": 39619461.0, + "step": 15153, + "train/ce_loss": 0.35227879881858826 + }, + { + "epoch": 1.4982202887087206, + "step": 15153, + "train/sim_loss": 0.019581496715545654 + }, + { + "epoch": 1.4982202887087206, + "step": 15153, + "train/total_loss": 0.05480937659740448 + }, + { + "entropy": 9.454244613647461, + "epoch": 1.498319161558236, + "mean_token_accuracy": 0.8390092849731445, + "num_tokens": 39629787.0, + "step": 15154, + "train/ce_loss": 0.4579808712005615 + }, + { + "epoch": 1.498319161558236, + "step": 15154, + "train/sim_loss": 0.03954422473907471 + }, + { + "epoch": 1.498319161558236, + "step": 15154, + "train/total_loss": 0.08534231781959534 + }, + { + "entropy": 9.471002578735352, + "epoch": 1.4984180344077516, + "mean_token_accuracy": 0.8828337788581848, + "num_tokens": 39643556.0, + "step": 15155, + "train/ce_loss": 0.36492785811424255 + }, + { + "epoch": 1.4984180344077516, + "step": 15155, + "train/sim_loss": 0.036237359046936035 + }, + { + "epoch": 1.4984180344077516, + "step": 15155, + "train/total_loss": 0.07273014634847641 + }, + { + "entropy": 9.351749420166016, + "epoch": 1.498516907257267, + "mean_token_accuracy": 0.8792710900306702, + "num_tokens": 39657107.0, + "step": 15156, + "train/ce_loss": 0.4605361223220825 + }, + { + "epoch": 1.498516907257267, + "step": 15156, + "train/sim_loss": 0.05572056770324707 + }, + { + "epoch": 1.498516907257267, + "step": 15156, + "train/total_loss": 0.1017741858959198 + }, + { + "entropy": 8.99226188659668, + "epoch": 1.4986157801067828, + "mean_token_accuracy": 0.8356282114982605, + "num_tokens": 39670780.0, + "step": 15157, + "train/ce_loss": 0.4007316827774048 + }, + { + "epoch": 1.4986157801067828, + "step": 15157, + "train/sim_loss": 0.016381442546844482 + }, + { + "epoch": 1.4986157801067828, + "step": 15157, + "train/total_loss": 0.0564546100795269 + }, + { + "entropy": 9.572797775268555, + "epoch": 1.4987146529562982, + "mean_token_accuracy": 0.8550547957420349, + "num_tokens": 39687990.0, + "step": 15158, + "train/ce_loss": 0.2770717740058899 + }, + { + "epoch": 1.4987146529562982, + "step": 15158, + "train/sim_loss": 0.02106451988220215 + }, + { + "epoch": 1.4987146529562982, + "step": 15158, + "train/total_loss": 0.0487716980278492 + }, + { + "entropy": 9.029102325439453, + "epoch": 1.4988135258058137, + "mean_token_accuracy": 0.8568443059921265, + "num_tokens": 39700057.0, + "step": 15159, + "train/ce_loss": 0.3091081976890564 + }, + { + "epoch": 1.4988135258058137, + "step": 15159, + "train/sim_loss": 0.05063068866729736 + }, + { + "epoch": 1.4988135258058137, + "step": 15159, + "train/total_loss": 0.081541508436203 + }, + { + "epoch": 1.4989123986553292, + "grad_norm": 0.5002369284629822, + "learning_rate": 6.2545121890916295e-06, + "loss": 0.0789, + "step": 15160 + }, + { + "entropy": 9.260238647460938, + "epoch": 1.4989123986553292, + "mean_token_accuracy": 0.8552238941192627, + "num_tokens": 39708273.0, + "step": 15160, + "train/ce_loss": 0.39273345470428467 + }, + { + "epoch": 1.4989123986553292, + "step": 15160, + "train/sim_loss": 0.0953027606010437 + }, + { + "epoch": 1.4989123986553292, + "step": 15160, + "train/total_loss": 0.13457611203193665 + }, + { + "entropy": 9.667566299438477, + "epoch": 1.4990112715048447, + "mean_token_accuracy": 0.8642172813415527, + "num_tokens": 39724404.0, + "step": 15161, + "train/ce_loss": 0.4636703431606293 + }, + { + "epoch": 1.4990112715048447, + "step": 15161, + "train/sim_loss": 0.024380087852478027 + }, + { + "epoch": 1.4990112715048447, + "step": 15161, + "train/total_loss": 0.07074712216854095 + }, + { + "entropy": 9.304909706115723, + "epoch": 1.4991101443543604, + "mean_token_accuracy": 0.8600682616233826, + "num_tokens": 39736677.0, + "step": 15162, + "train/ce_loss": 0.39851024746894836 + }, + { + "epoch": 1.4991101443543604, + "step": 15162, + "train/sim_loss": 0.057361602783203125 + }, + { + "epoch": 1.4991101443543604, + "step": 15162, + "train/total_loss": 0.09721262753009796 + }, + { + "entropy": 9.33272933959961, + "epoch": 1.499209017203876, + "mean_token_accuracy": 0.9190647602081299, + "num_tokens": 39750299.0, + "step": 15163, + "train/ce_loss": 0.26481008529663086 + }, + { + "epoch": 1.499209017203876, + "step": 15163, + "train/sim_loss": 0.06044495105743408 + }, + { + "epoch": 1.499209017203876, + "step": 15163, + "train/total_loss": 0.08692596107721329 + }, + { + "entropy": 9.354560852050781, + "epoch": 1.4993078900533914, + "mean_token_accuracy": 0.9024096131324768, + "num_tokens": 39761831.0, + "step": 15164, + "train/ce_loss": 0.440672367811203 + }, + { + "epoch": 1.4993078900533914, + "step": 15164, + "train/sim_loss": 0.07492697238922119 + }, + { + "epoch": 1.4993078900533914, + "step": 15164, + "train/total_loss": 0.11899420619010925 + }, + { + "entropy": 9.465045928955078, + "epoch": 1.4994067629029069, + "mean_token_accuracy": 0.862500011920929, + "num_tokens": 39776931.0, + "step": 15165, + "train/ce_loss": 0.2382536679506302 + }, + { + "epoch": 1.4994067629029069, + "step": 15165, + "train/sim_loss": 0.028716683387756348 + }, + { + "epoch": 1.4994067629029069, + "step": 15165, + "train/total_loss": 0.052542053163051605 + }, + { + "entropy": 9.093544006347656, + "epoch": 1.4995056357524224, + "mean_token_accuracy": 0.8324675559997559, + "num_tokens": 39788381.0, + "step": 15166, + "train/ce_loss": 0.5416273474693298 + }, + { + "epoch": 1.4995056357524224, + "step": 15166, + "train/sim_loss": 0.021637678146362305 + }, + { + "epoch": 1.4995056357524224, + "step": 15166, + "train/total_loss": 0.07580041885375977 + }, + { + "entropy": 8.859637260437012, + "epoch": 1.4996045086019378, + "mean_token_accuracy": 0.8161033987998962, + "num_tokens": 39799471.0, + "step": 15167, + "train/ce_loss": 0.22818420827388763 + }, + { + "epoch": 1.4996045086019378, + "step": 15167, + "train/sim_loss": 0.03665238618850708 + }, + { + "epoch": 1.4996045086019378, + "step": 15167, + "train/total_loss": 0.05947080999612808 + }, + { + "entropy": 9.280158996582031, + "epoch": 1.4997033814514533, + "mean_token_accuracy": 0.8301675915718079, + "num_tokens": 39809069.0, + "step": 15168, + "train/ce_loss": 0.4359304904937744 + }, + { + "epoch": 1.4997033814514533, + "step": 15168, + "train/sim_loss": 0.0362856388092041 + }, + { + "epoch": 1.4997033814514533, + "step": 15168, + "train/total_loss": 0.07987868785858154 + }, + { + "entropy": 9.597339630126953, + "epoch": 1.499802254300969, + "mean_token_accuracy": 0.8459259271621704, + "num_tokens": 39818552.0, + "step": 15169, + "train/ce_loss": 0.7077531218528748 + }, + { + "epoch": 1.499802254300969, + "step": 15169, + "train/sim_loss": 0.04328787326812744 + }, + { + "epoch": 1.499802254300969, + "step": 15169, + "train/total_loss": 0.11406318843364716 + }, + { + "entropy": 9.192806243896484, + "epoch": 1.4999011271504845, + "mean_token_accuracy": 0.8556460738182068, + "num_tokens": 39829716.0, + "step": 15170, + "train/ce_loss": 0.5265418291091919 + }, + { + "epoch": 1.4999011271504845, + "step": 15170, + "train/sim_loss": 0.03742671012878418 + }, + { + "epoch": 1.4999011271504845, + "step": 15170, + "train/total_loss": 0.09008089452981949 + }, + { + "entropy": 9.722183227539062, + "epoch": 1.5, + "mean_token_accuracy": 0.8467532396316528, + "num_tokens": 39845650.0, + "step": 15171, + "train/ce_loss": 0.6293071508407593 + }, + { + "epoch": 1.5, + "step": 15171, + "train/sim_loss": 0.03823882341384888 + }, + { + "epoch": 1.5, + "step": 15171, + "train/total_loss": 0.10116954147815704 + }, + { + "entropy": 9.428055763244629, + "epoch": 1.5000988728495155, + "mean_token_accuracy": 0.8569321632385254, + "num_tokens": 39860220.0, + "step": 15172, + "train/ce_loss": 0.35653820633888245 + }, + { + "epoch": 1.5000988728495155, + "step": 15172, + "train/sim_loss": 0.03227204084396362 + }, + { + "epoch": 1.5000988728495155, + "step": 15172, + "train/total_loss": 0.06792586296796799 + }, + { + "entropy": 9.130834579467773, + "epoch": 1.500197745699031, + "mean_token_accuracy": 0.8643147945404053, + "num_tokens": 39872117.0, + "step": 15173, + "train/ce_loss": 0.6268719434738159 + }, + { + "epoch": 1.500197745699031, + "step": 15173, + "train/sim_loss": 0.04211461544036865 + }, + { + "epoch": 1.500197745699031, + "step": 15173, + "train/total_loss": 0.10480181127786636 + }, + { + "entropy": 9.490482330322266, + "epoch": 1.5002966185485467, + "mean_token_accuracy": 0.829959511756897, + "num_tokens": 39886019.0, + "step": 15174, + "train/ce_loss": 0.4758540987968445 + }, + { + "epoch": 1.5002966185485467, + "step": 15174, + "train/sim_loss": 0.04945218563079834 + }, + { + "epoch": 1.5002966185485467, + "step": 15174, + "train/total_loss": 0.09703759849071503 + }, + { + "entropy": 9.438987731933594, + "epoch": 1.500395491398062, + "mean_token_accuracy": 0.8450331091880798, + "num_tokens": 39895559.0, + "step": 15175, + "train/ce_loss": 0.5788338780403137 + }, + { + "epoch": 1.500395491398062, + "step": 15175, + "train/sim_loss": 0.017498791217803955 + }, + { + "epoch": 1.500395491398062, + "step": 15175, + "train/total_loss": 0.07538218051195145 + }, + { + "entropy": 8.972881317138672, + "epoch": 1.5004943642475776, + "mean_token_accuracy": 0.834871768951416, + "num_tokens": 39908070.0, + "step": 15176, + "train/ce_loss": 0.4845488369464874 + }, + { + "epoch": 1.5004943642475776, + "step": 15176, + "train/sim_loss": 0.05933976173400879 + }, + { + "epoch": 1.5004943642475776, + "step": 15176, + "train/total_loss": 0.10779464244842529 + }, + { + "entropy": 9.486251831054688, + "epoch": 1.5005932370970931, + "mean_token_accuracy": 0.8631240129470825, + "num_tokens": 39927919.0, + "step": 15177, + "train/ce_loss": 0.19101256132125854 + }, + { + "epoch": 1.5005932370970931, + "step": 15177, + "train/sim_loss": 0.03926706314086914 + }, + { + "epoch": 1.5005932370970931, + "step": 15177, + "train/total_loss": 0.058368317782878876 + }, + { + "entropy": 9.905961990356445, + "epoch": 1.5006921099466086, + "mean_token_accuracy": 0.8973799347877502, + "num_tokens": 39940034.0, + "step": 15178, + "train/ce_loss": 3.8417229575316014e-07 + }, + { + "epoch": 1.5006921099466086, + "step": 15178, + "train/sim_loss": 0.01257413625717163 + }, + { + "epoch": 1.5006921099466086, + "step": 15178, + "train/total_loss": 0.01257417444139719 + }, + { + "entropy": 9.749719619750977, + "epoch": 1.5007909827961243, + "mean_token_accuracy": 0.9191374778747559, + "num_tokens": 39953007.0, + "step": 15179, + "train/ce_loss": 1.2145264918217435e-06 + }, + { + "epoch": 1.5007909827961243, + "step": 15179, + "train/sim_loss": 0.04179030656814575 + }, + { + "epoch": 1.5007909827961243, + "step": 15179, + "train/total_loss": 0.0417904295027256 + }, + { + "epoch": 1.5008898556456396, + "grad_norm": 0.5759223103523254, + "learning_rate": 6.249567324333679e-06, + "loss": 0.081, + "step": 15180 + }, + { + "entropy": 9.321737289428711, + "epoch": 1.5008898556456396, + "mean_token_accuracy": 0.8490284085273743, + "num_tokens": 39961812.0, + "step": 15180, + "train/ce_loss": 1.960505642273347e-06 + }, + { + "epoch": 1.5008898556456396, + "step": 15180, + "train/sim_loss": 0.03720742464065552 + }, + { + "epoch": 1.5008898556456396, + "step": 15180, + "train/total_loss": 0.037207622081041336 + }, + { + "entropy": 9.20757007598877, + "epoch": 1.5009887284951553, + "mean_token_accuracy": 0.8883994221687317, + "num_tokens": 39977592.0, + "step": 15181, + "train/ce_loss": 3.2699080065867747e-07 + }, + { + "epoch": 1.5009887284951553, + "step": 15181, + "train/sim_loss": 0.021359503269195557 + }, + { + "epoch": 1.5009887284951553, + "step": 15181, + "train/total_loss": 0.021359536796808243 + }, + { + "entropy": 8.6918306350708, + "epoch": 1.5010876013446708, + "mean_token_accuracy": 0.8405215740203857, + "num_tokens": 39984223.0, + "step": 15182, + "train/ce_loss": 0.5596126317977905 + }, + { + "epoch": 1.5010876013446708, + "step": 15182, + "train/sim_loss": 0.05271279811859131 + }, + { + "epoch": 1.5010876013446708, + "step": 15182, + "train/total_loss": 0.1086740642786026 + }, + { + "entropy": 8.924819946289062, + "epoch": 1.5011864741941863, + "mean_token_accuracy": 0.8639366030693054, + "num_tokens": 39991291.0, + "step": 15183, + "train/ce_loss": 9.850928108789958e-07 + }, + { + "epoch": 1.5011864741941863, + "step": 15183, + "train/sim_loss": 0.05207371711730957 + }, + { + "epoch": 1.5011864741941863, + "step": 15183, + "train/total_loss": 0.05207381397485733 + }, + { + "entropy": 9.322359085083008, + "epoch": 1.5012853470437018, + "mean_token_accuracy": 0.9176954627037048, + "num_tokens": 40001427.0, + "step": 15184, + "train/ce_loss": 6.269367531785974e-07 + }, + { + "epoch": 1.5012853470437018, + "step": 15184, + "train/sim_loss": 0.02805769443511963 + }, + { + "epoch": 1.5012853470437018, + "step": 15184, + "train/total_loss": 0.028057757765054703 + }, + { + "entropy": 9.058174133300781, + "epoch": 1.5013842198932172, + "mean_token_accuracy": 0.88453608751297, + "num_tokens": 40014184.0, + "step": 15185, + "train/ce_loss": 4.6404696263380174e-07 + }, + { + "epoch": 1.5013842198932172, + "step": 15185, + "train/sim_loss": 0.025935232639312744 + }, + { + "epoch": 1.5013842198932172, + "step": 15185, + "train/total_loss": 0.025935279205441475 + }, + { + "entropy": 9.27491569519043, + "epoch": 1.501483092742733, + "mean_token_accuracy": 0.8181818127632141, + "num_tokens": 40028523.0, + "step": 15186, + "train/ce_loss": 0.526157021522522 + }, + { + "epoch": 1.501483092742733, + "step": 15186, + "train/sim_loss": 0.08877086639404297 + }, + { + "epoch": 1.501483092742733, + "step": 15186, + "train/total_loss": 0.14138656854629517 + }, + { + "entropy": 9.322210311889648, + "epoch": 1.5015819655922482, + "mean_token_accuracy": 0.8333333134651184, + "num_tokens": 40042544.0, + "step": 15187, + "train/ce_loss": 0.5068386793136597 + }, + { + "epoch": 1.5015819655922482, + "step": 15187, + "train/sim_loss": 0.04934287071228027 + }, + { + "epoch": 1.5015819655922482, + "step": 15187, + "train/total_loss": 0.10002674162387848 + }, + { + "entropy": 9.514153480529785, + "epoch": 1.501680838441764, + "mean_token_accuracy": 0.8403648734092712, + "num_tokens": 40056594.0, + "step": 15188, + "train/ce_loss": 0.3607718348503113 + }, + { + "epoch": 1.501680838441764, + "step": 15188, + "train/sim_loss": 0.016230106353759766 + }, + { + "epoch": 1.501680838441764, + "step": 15188, + "train/total_loss": 0.052307289093732834 + }, + { + "entropy": 9.268404960632324, + "epoch": 1.5017797112912794, + "mean_token_accuracy": 0.8494318127632141, + "num_tokens": 40068346.0, + "step": 15189, + "train/ce_loss": 2.8515049166344397e-07 + }, + { + "epoch": 1.5017797112912794, + "step": 15189, + "train/sim_loss": 0.05336141586303711 + }, + { + "epoch": 1.5017797112912794, + "step": 15189, + "train/total_loss": 0.0533614456653595 + }, + { + "entropy": 9.060400009155273, + "epoch": 1.501878584140795, + "mean_token_accuracy": 0.8725961446762085, + "num_tokens": 40081075.0, + "step": 15190, + "train/ce_loss": 0.2237226963043213 + }, + { + "epoch": 1.501878584140795, + "step": 15190, + "train/sim_loss": 0.025957822799682617 + }, + { + "epoch": 1.501878584140795, + "step": 15190, + "train/total_loss": 0.04833009093999863 + }, + { + "entropy": 9.486708641052246, + "epoch": 1.5019774569903106, + "mean_token_accuracy": 0.8738049864768982, + "num_tokens": 40098629.0, + "step": 15191, + "train/ce_loss": 0.49176025390625 + }, + { + "epoch": 1.5019774569903106, + "step": 15191, + "train/sim_loss": 0.02426600456237793 + }, + { + "epoch": 1.5019774569903106, + "step": 15191, + "train/total_loss": 0.07344202697277069 + }, + { + "entropy": 9.081436157226562, + "epoch": 1.5020763298398259, + "mean_token_accuracy": 0.8758085370063782, + "num_tokens": 40109108.0, + "step": 15192, + "train/ce_loss": 0.20966273546218872 + }, + { + "epoch": 1.5020763298398259, + "step": 15192, + "train/sim_loss": 0.07466471195220947 + }, + { + "epoch": 1.5020763298398259, + "step": 15192, + "train/total_loss": 0.09563098847866058 + }, + { + "entropy": 9.206497192382812, + "epoch": 1.5021752026893416, + "mean_token_accuracy": 0.8424317836761475, + "num_tokens": 40124830.0, + "step": 15193, + "train/ce_loss": 0.7916797399520874 + }, + { + "epoch": 1.5021752026893416, + "step": 15193, + "train/sim_loss": 0.03629148006439209 + }, + { + "epoch": 1.5021752026893416, + "step": 15193, + "train/total_loss": 0.11545945703983307 + }, + { + "entropy": 9.606072425842285, + "epoch": 1.502274075538857, + "mean_token_accuracy": 0.8280543088912964, + "num_tokens": 40139960.0, + "step": 15194, + "train/ce_loss": 0.1882932186126709 + }, + { + "epoch": 1.502274075538857, + "step": 15194, + "train/sim_loss": 0.04286015033721924 + }, + { + "epoch": 1.502274075538857, + "step": 15194, + "train/total_loss": 0.06168947368860245 + }, + { + "entropy": 9.466682434082031, + "epoch": 1.5023729483883725, + "mean_token_accuracy": 0.8733905553817749, + "num_tokens": 40146601.0, + "step": 15195, + "train/ce_loss": 2.232479801023146e-06 + }, + { + "epoch": 1.5023729483883725, + "step": 15195, + "train/sim_loss": 0.03667271137237549 + }, + { + "epoch": 1.5023729483883725, + "step": 15195, + "train/total_loss": 0.036672934889793396 + }, + { + "entropy": 9.955863952636719, + "epoch": 1.502471821237888, + "mean_token_accuracy": 0.8784648180007935, + "num_tokens": 40163461.0, + "step": 15196, + "train/ce_loss": 0.6800300478935242 + }, + { + "epoch": 1.502471821237888, + "step": 15196, + "train/sim_loss": 0.02192246913909912 + }, + { + "epoch": 1.502471821237888, + "step": 15196, + "train/total_loss": 0.08992547541856766 + }, + { + "entropy": 8.623655319213867, + "epoch": 1.5025706940874035, + "mean_token_accuracy": 0.8620038032531738, + "num_tokens": 40173409.0, + "step": 15197, + "train/ce_loss": 0.2910744249820709 + }, + { + "epoch": 1.5025706940874035, + "step": 15197, + "train/sim_loss": 0.03810584545135498 + }, + { + "epoch": 1.5025706940874035, + "step": 15197, + "train/total_loss": 0.06721328943967819 + }, + { + "entropy": 9.002880096435547, + "epoch": 1.5026695669369192, + "mean_token_accuracy": 0.8419936299324036, + "num_tokens": 40186624.0, + "step": 15198, + "train/ce_loss": 0.5361162424087524 + }, + { + "epoch": 1.5026695669369192, + "step": 15198, + "train/sim_loss": 0.02038717269897461 + }, + { + "epoch": 1.5026695669369192, + "step": 15198, + "train/total_loss": 0.07399879395961761 + }, + { + "entropy": 9.104183197021484, + "epoch": 1.5027684397864345, + "mean_token_accuracy": 0.8769429922103882, + "num_tokens": 40198997.0, + "step": 15199, + "train/ce_loss": 0.1626977026462555 + }, + { + "epoch": 1.5027684397864345, + "step": 15199, + "train/sim_loss": 0.039562880992889404 + }, + { + "epoch": 1.5027684397864345, + "step": 15199, + "train/total_loss": 0.05583265423774719 + }, + { + "epoch": 1.5028673126359502, + "grad_norm": 0.5236416459083557, + "learning_rate": 6.2446224595757306e-06, + "loss": 0.0828, + "step": 15200 + }, + { + "entropy": 9.313703536987305, + "epoch": 1.5028673126359502, + "mean_token_accuracy": 0.8487972617149353, + "num_tokens": 40207152.0, + "step": 15200, + "train/ce_loss": 0.6928451657295227 + }, + { + "epoch": 1.5028673126359502, + "step": 15200, + "train/sim_loss": 0.057276368141174316 + }, + { + "epoch": 1.5028673126359502, + "step": 15200, + "train/total_loss": 0.12656089663505554 + }, + { + "entropy": 9.62602424621582, + "epoch": 1.5029661854854657, + "mean_token_accuracy": 0.8424657583236694, + "num_tokens": 40216084.0, + "step": 15201, + "train/ce_loss": 0.49119263887405396 + }, + { + "epoch": 1.5029661854854657, + "step": 15201, + "train/sim_loss": 0.04738515615463257 + }, + { + "epoch": 1.5029661854854657, + "step": 15201, + "train/total_loss": 0.09650442004203796 + }, + { + "entropy": 9.561809539794922, + "epoch": 1.5030650583349812, + "mean_token_accuracy": 0.8641221523284912, + "num_tokens": 40224938.0, + "step": 15202, + "train/ce_loss": 0.6533565521240234 + }, + { + "epoch": 1.5030650583349812, + "step": 15202, + "train/sim_loss": 0.03555721044540405 + }, + { + "epoch": 1.5030650583349812, + "step": 15202, + "train/total_loss": 0.10089286416769028 + }, + { + "entropy": 9.411216735839844, + "epoch": 1.5031639311844969, + "mean_token_accuracy": 0.8075268864631653, + "num_tokens": 40237245.0, + "step": 15203, + "train/ce_loss": 0.4440142810344696 + }, + { + "epoch": 1.5031639311844969, + "step": 15203, + "train/sim_loss": 0.07132792472839355 + }, + { + "epoch": 1.5031639311844969, + "step": 15203, + "train/total_loss": 0.11572935432195663 + }, + { + "entropy": 9.38520622253418, + "epoch": 1.5032628040340121, + "mean_token_accuracy": 0.852571427822113, + "num_tokens": 40256571.0, + "step": 15204, + "train/ce_loss": 0.3996528685092926 + }, + { + "epoch": 1.5032628040340121, + "step": 15204, + "train/sim_loss": 0.018774032592773438 + }, + { + "epoch": 1.5032628040340121, + "step": 15204, + "train/total_loss": 0.0587393194437027 + }, + { + "entropy": 9.402250289916992, + "epoch": 1.5033616768835278, + "mean_token_accuracy": 0.8515534996986389, + "num_tokens": 40273992.0, + "step": 15205, + "train/ce_loss": 0.5147198438644409 + }, + { + "epoch": 1.5033616768835278, + "step": 15205, + "train/sim_loss": 0.048720717430114746 + }, + { + "epoch": 1.5033616768835278, + "step": 15205, + "train/total_loss": 0.10019270330667496 + }, + { + "entropy": 9.783710479736328, + "epoch": 1.5034605497330433, + "mean_token_accuracy": 0.9059701561927795, + "num_tokens": 40291704.0, + "step": 15206, + "train/ce_loss": 0.22109360992908478 + }, + { + "epoch": 1.5034605497330433, + "step": 15206, + "train/sim_loss": 0.020949602127075195 + }, + { + "epoch": 1.5034605497330433, + "step": 15206, + "train/total_loss": 0.043058961629867554 + }, + { + "entropy": 9.209661483764648, + "epoch": 1.5035594225825588, + "mean_token_accuracy": 0.8588064312934875, + "num_tokens": 40300514.0, + "step": 15207, + "train/ce_loss": 0.19384309649467468 + }, + { + "epoch": 1.5035594225825588, + "step": 15207, + "train/sim_loss": 0.03182661533355713 + }, + { + "epoch": 1.5035594225825588, + "step": 15207, + "train/total_loss": 0.0512109249830246 + }, + { + "entropy": 9.95749568939209, + "epoch": 1.5036582954320743, + "mean_token_accuracy": 0.8835758566856384, + "num_tokens": 40315610.0, + "step": 15208, + "train/ce_loss": 0.610381007194519 + }, + { + "epoch": 1.5036582954320743, + "step": 15208, + "train/sim_loss": 0.025448858737945557 + }, + { + "epoch": 1.5036582954320743, + "step": 15208, + "train/total_loss": 0.08648696541786194 + }, + { + "entropy": 9.506927490234375, + "epoch": 1.5037571682815898, + "mean_token_accuracy": 0.8863636255264282, + "num_tokens": 40328030.0, + "step": 15209, + "train/ce_loss": 2.6321556561015313e-06 + }, + { + "epoch": 1.5037571682815898, + "step": 15209, + "train/sim_loss": 0.025579512119293213 + }, + { + "epoch": 1.5037571682815898, + "step": 15209, + "train/total_loss": 0.025579774752259254 + }, + { + "entropy": 9.569524765014648, + "epoch": 1.5038560411311055, + "mean_token_accuracy": 0.939393937587738, + "num_tokens": 40339240.0, + "step": 15210, + "train/ce_loss": 3.0938835493543593e-07 + }, + { + "epoch": 1.5038560411311055, + "step": 15210, + "train/sim_loss": 0.0134199857711792 + }, + { + "epoch": 1.5038560411311055, + "step": 15210, + "train/total_loss": 0.013420016504824162 + }, + { + "entropy": 9.191258430480957, + "epoch": 1.5039549139806208, + "mean_token_accuracy": 0.8626198172569275, + "num_tokens": 40348479.0, + "step": 15211, + "train/ce_loss": 0.5752975344657898 + }, + { + "epoch": 1.5039549139806208, + "step": 15211, + "train/sim_loss": 0.08391714096069336 + }, + { + "epoch": 1.5039549139806208, + "step": 15211, + "train/total_loss": 0.14144688844680786 + }, + { + "entropy": 9.455498695373535, + "epoch": 1.5040537868301365, + "mean_token_accuracy": 0.8167116045951843, + "num_tokens": 40362220.0, + "step": 15212, + "train/ce_loss": 0.688572883605957 + }, + { + "epoch": 1.5040537868301365, + "step": 15212, + "train/sim_loss": 0.017350494861602783 + }, + { + "epoch": 1.5040537868301365, + "step": 15212, + "train/total_loss": 0.0862077847123146 + }, + { + "entropy": 9.828707695007324, + "epoch": 1.504152659679652, + "mean_token_accuracy": 0.9041095972061157, + "num_tokens": 40370453.0, + "step": 15213, + "train/ce_loss": 0.4223119616508484 + }, + { + "epoch": 1.504152659679652, + "step": 15213, + "train/sim_loss": 0.10278606414794922 + }, + { + "epoch": 1.504152659679652, + "step": 15213, + "train/total_loss": 0.14501726627349854 + }, + { + "entropy": 9.676553726196289, + "epoch": 1.5042515325291674, + "mean_token_accuracy": 0.8506666421890259, + "num_tokens": 40387095.0, + "step": 15214, + "train/ce_loss": 0.5545254945755005 + }, + { + "epoch": 1.5042515325291674, + "step": 15214, + "train/sim_loss": 0.047440409660339355 + }, + { + "epoch": 1.5042515325291674, + "step": 15214, + "train/total_loss": 0.10289296507835388 + }, + { + "entropy": 9.831157684326172, + "epoch": 1.5043504053786831, + "mean_token_accuracy": 0.8777968883514404, + "num_tokens": 40404386.0, + "step": 15215, + "train/ce_loss": 0.6625138521194458 + }, + { + "epoch": 1.5043504053786831, + "step": 15215, + "train/sim_loss": 0.06813657283782959 + }, + { + "epoch": 1.5043504053786831, + "step": 15215, + "train/total_loss": 0.13438796997070312 + }, + { + "entropy": 9.202566146850586, + "epoch": 1.5044492782281984, + "mean_token_accuracy": 0.8628659248352051, + "num_tokens": 40412341.0, + "step": 15216, + "train/ce_loss": 2.6540506041783374e-06 + }, + { + "epoch": 1.5044492782281984, + "step": 15216, + "train/sim_loss": 0.04094743728637695 + }, + { + "epoch": 1.5044492782281984, + "step": 15216, + "train/total_loss": 0.040947701781988144 + }, + { + "entropy": 8.902584075927734, + "epoch": 1.5045481510777141, + "mean_token_accuracy": 0.8425655961036682, + "num_tokens": 40423700.0, + "step": 15217, + "train/ce_loss": 0.22886289656162262 + }, + { + "epoch": 1.5045481510777141, + "step": 15217, + "train/sim_loss": 0.011919200420379639 + }, + { + "epoch": 1.5045481510777141, + "step": 15217, + "train/total_loss": 0.03480549156665802 + }, + { + "entropy": 9.108661651611328, + "epoch": 1.5046470239272296, + "mean_token_accuracy": 0.8787878751754761, + "num_tokens": 40430856.0, + "step": 15218, + "train/ce_loss": 0.5159815549850464 + }, + { + "epoch": 1.5046470239272296, + "step": 15218, + "train/sim_loss": 0.05429279804229736 + }, + { + "epoch": 1.5046470239272296, + "step": 15218, + "train/total_loss": 0.10589095950126648 + }, + { + "entropy": 9.765701293945312, + "epoch": 1.504745896776745, + "mean_token_accuracy": 0.8594815731048584, + "num_tokens": 40454505.0, + "step": 15219, + "train/ce_loss": 0.49772587418556213 + }, + { + "epoch": 1.504745896776745, + "step": 15219, + "train/sim_loss": 0.014795541763305664 + }, + { + "epoch": 1.504745896776745, + "step": 15219, + "train/total_loss": 0.06456813216209412 + }, + { + "epoch": 1.5048447696262608, + "grad_norm": 0.6121154427528381, + "learning_rate": 6.239677594817783e-06, + "loss": 0.0769, + "step": 15220 + }, + { + "entropy": 9.38621711730957, + "epoch": 1.5048447696262608, + "mean_token_accuracy": 0.800561785697937, + "num_tokens": 40474042.0, + "step": 15220, + "train/ce_loss": 0.7123592495918274 + }, + { + "epoch": 1.5048447696262608, + "step": 15220, + "train/sim_loss": 0.03461700677871704 + }, + { + "epoch": 1.5048447696262608, + "step": 15220, + "train/total_loss": 0.10585293173789978 + }, + { + "entropy": 9.362293243408203, + "epoch": 1.504943642475776, + "mean_token_accuracy": 0.8180862069129944, + "num_tokens": 40489424.0, + "step": 15221, + "train/ce_loss": 0.5545012354850769 + }, + { + "epoch": 1.504943642475776, + "step": 15221, + "train/sim_loss": 0.041007280349731445 + }, + { + "epoch": 1.504943642475776, + "step": 15221, + "train/total_loss": 0.09645740687847137 + }, + { + "entropy": 9.208000183105469, + "epoch": 1.5050425153252918, + "mean_token_accuracy": 0.7992240786552429, + "num_tokens": 40498291.0, + "step": 15222, + "train/ce_loss": 0.3374202251434326 + }, + { + "epoch": 1.5050425153252918, + "step": 15222, + "train/sim_loss": 0.03248453140258789 + }, + { + "epoch": 1.5050425153252918, + "step": 15222, + "train/total_loss": 0.06622655689716339 + }, + { + "entropy": 9.43783950805664, + "epoch": 1.5051413881748072, + "mean_token_accuracy": 0.8639876246452332, + "num_tokens": 40513548.0, + "step": 15223, + "train/ce_loss": 0.6812253594398499 + }, + { + "epoch": 1.5051413881748072, + "step": 15223, + "train/sim_loss": 0.0675501823425293 + }, + { + "epoch": 1.5051413881748072, + "step": 15223, + "train/total_loss": 0.13567271828651428 + }, + { + "entropy": 9.522711753845215, + "epoch": 1.5052402610243227, + "mean_token_accuracy": 0.8434886336326599, + "num_tokens": 40534286.0, + "step": 15224, + "train/ce_loss": 0.43024763464927673 + }, + { + "epoch": 1.5052402610243227, + "step": 15224, + "train/sim_loss": 0.024197697639465332 + }, + { + "epoch": 1.5052402610243227, + "step": 15224, + "train/total_loss": 0.067222461104393 + }, + { + "entropy": 9.19251823425293, + "epoch": 1.5053391338738382, + "mean_token_accuracy": 0.7862856984138489, + "num_tokens": 40545618.0, + "step": 15225, + "train/ce_loss": 0.5972182154655457 + }, + { + "epoch": 1.5053391338738382, + "step": 15225, + "train/sim_loss": 0.04977011680603027 + }, + { + "epoch": 1.5053391338738382, + "step": 15225, + "train/total_loss": 0.10949194431304932 + }, + { + "entropy": 9.470169067382812, + "epoch": 1.5054380067233537, + "mean_token_accuracy": 0.8256097435951233, + "num_tokens": 40558561.0, + "step": 15226, + "train/ce_loss": 0.3029656708240509 + }, + { + "epoch": 1.5054380067233537, + "step": 15226, + "train/sim_loss": 0.029944539070129395 + }, + { + "epoch": 1.5054380067233537, + "step": 15226, + "train/total_loss": 0.060241106897592545 + }, + { + "entropy": 9.15914535522461, + "epoch": 1.5055368795728694, + "mean_token_accuracy": 0.8192513585090637, + "num_tokens": 40569966.0, + "step": 15227, + "train/ce_loss": 0.9660186171531677 + }, + { + "epoch": 1.5055368795728694, + "step": 15227, + "train/sim_loss": 0.08489549160003662 + }, + { + "epoch": 1.5055368795728694, + "step": 15227, + "train/total_loss": 0.18149736523628235 + }, + { + "entropy": 9.326544761657715, + "epoch": 1.5056357524223847, + "mean_token_accuracy": 0.8257839679718018, + "num_tokens": 40584502.0, + "step": 15228, + "train/ce_loss": 0.43407759070396423 + }, + { + "epoch": 1.5056357524223847, + "step": 15228, + "train/sim_loss": 0.08259797096252441 + }, + { + "epoch": 1.5056357524223847, + "step": 15228, + "train/total_loss": 0.12600573897361755 + }, + { + "entropy": 9.214956283569336, + "epoch": 1.5057346252719004, + "mean_token_accuracy": 0.8678222894668579, + "num_tokens": 40596283.0, + "step": 15229, + "train/ce_loss": 0.46910110116004944 + }, + { + "epoch": 1.5057346252719004, + "step": 15229, + "train/sim_loss": 0.042162954807281494 + }, + { + "epoch": 1.5057346252719004, + "step": 15229, + "train/total_loss": 0.0890730619430542 + }, + { + "entropy": 9.2188081741333, + "epoch": 1.5058334981214159, + "mean_token_accuracy": 0.877293586730957, + "num_tokens": 40608371.0, + "step": 15230, + "train/ce_loss": 0.45212239027023315 + }, + { + "epoch": 1.5058334981214159, + "step": 15230, + "train/sim_loss": 0.042571425437927246 + }, + { + "epoch": 1.5058334981214159, + "step": 15230, + "train/total_loss": 0.08778366446495056 + }, + { + "entropy": 9.356695175170898, + "epoch": 1.5059323709709314, + "mean_token_accuracy": 0.8539944887161255, + "num_tokens": 40621386.0, + "step": 15231, + "train/ce_loss": 2.0360626251658687e-07 + }, + { + "epoch": 1.5059323709709314, + "step": 15231, + "train/sim_loss": 0.0267103910446167 + }, + { + "epoch": 1.5059323709709314, + "step": 15231, + "train/total_loss": 0.02671041153371334 + }, + { + "entropy": 9.839664459228516, + "epoch": 1.506031243820447, + "mean_token_accuracy": 0.8780487775802612, + "num_tokens": 40640548.0, + "step": 15232, + "train/ce_loss": 0.5475400686264038 + }, + { + "epoch": 1.506031243820447, + "step": 15232, + "train/sim_loss": 0.021553099155426025 + }, + { + "epoch": 1.506031243820447, + "step": 15232, + "train/total_loss": 0.07630710303783417 + }, + { + "entropy": 9.32794189453125, + "epoch": 1.5061301166699623, + "mean_token_accuracy": 0.8430736064910889, + "num_tokens": 40656571.0, + "step": 15233, + "train/ce_loss": 0.5635461807250977 + }, + { + "epoch": 1.5061301166699623, + "step": 15233, + "train/sim_loss": 0.05874812602996826 + }, + { + "epoch": 1.5061301166699623, + "step": 15233, + "train/total_loss": 0.11510274559259415 + }, + { + "entropy": 9.796225547790527, + "epoch": 1.506228989519478, + "mean_token_accuracy": 0.8255612850189209, + "num_tokens": 40665474.0, + "step": 15234, + "train/ce_loss": 0.4094967842102051 + }, + { + "epoch": 1.506228989519478, + "step": 15234, + "train/sim_loss": 0.02642035484313965 + }, + { + "epoch": 1.506228989519478, + "step": 15234, + "train/total_loss": 0.06737003475427628 + }, + { + "entropy": 9.534854888916016, + "epoch": 1.5063278623689935, + "mean_token_accuracy": 0.9109874963760376, + "num_tokens": 40677262.0, + "step": 15235, + "train/ce_loss": 1.6670459217493772e-06 + }, + { + "epoch": 1.5063278623689935, + "step": 15235, + "train/sim_loss": 0.032077133655548096 + }, + { + "epoch": 1.5063278623689935, + "step": 15235, + "train/total_loss": 0.032077301293611526 + }, + { + "entropy": 9.426905632019043, + "epoch": 1.506426735218509, + "mean_token_accuracy": 0.8598790168762207, + "num_tokens": 40692156.0, + "step": 15236, + "train/ce_loss": 0.5234300494194031 + }, + { + "epoch": 1.506426735218509, + "step": 15236, + "train/sim_loss": 0.06570130586624146 + }, + { + "epoch": 1.506426735218509, + "step": 15236, + "train/total_loss": 0.11804431676864624 + }, + { + "entropy": 9.371700286865234, + "epoch": 1.5065256080680245, + "mean_token_accuracy": 0.8831942677497864, + "num_tokens": 40706701.0, + "step": 15237, + "train/ce_loss": 0.2773515582084656 + }, + { + "epoch": 1.5065256080680245, + "step": 15237, + "train/sim_loss": 0.0671035647392273 + }, + { + "epoch": 1.5065256080680245, + "step": 15237, + "train/total_loss": 0.09483872354030609 + }, + { + "entropy": 9.018950462341309, + "epoch": 1.50662448091754, + "mean_token_accuracy": 0.8599801659584045, + "num_tokens": 40718922.0, + "step": 15238, + "train/ce_loss": 0.760210394859314 + }, + { + "epoch": 1.50662448091754, + "step": 15238, + "train/sim_loss": 0.029628872871398926 + }, + { + "epoch": 1.50662448091754, + "step": 15238, + "train/total_loss": 0.1056499108672142 + }, + { + "entropy": 9.121578216552734, + "epoch": 1.5067233537670557, + "mean_token_accuracy": 0.8355664014816284, + "num_tokens": 40731790.0, + "step": 15239, + "train/ce_loss": 0.7375785708427429 + }, + { + "epoch": 1.5067233537670557, + "step": 15239, + "train/sim_loss": 0.034826576709747314 + }, + { + "epoch": 1.5067233537670557, + "step": 15239, + "train/total_loss": 0.1085844337940216 + }, + { + "epoch": 1.506822226616571, + "grad_norm": 0.5749530792236328, + "learning_rate": 6.234732730059834e-06, + "loss": 0.0838, + "step": 15240 + }, + { + "entropy": 9.867752075195312, + "epoch": 1.506822226616571, + "mean_token_accuracy": 0.8736000061035156, + "num_tokens": 40745285.0, + "step": 15240, + "train/ce_loss": 0.7589419484138489 + }, + { + "epoch": 1.506822226616571, + "step": 15240, + "train/sim_loss": 0.055574893951416016 + }, + { + "epoch": 1.506822226616571, + "step": 15240, + "train/total_loss": 0.13146910071372986 + }, + { + "entropy": 9.935896873474121, + "epoch": 1.5069210994660867, + "mean_token_accuracy": 0.9159935116767883, + "num_tokens": 40766495.0, + "step": 15241, + "train/ce_loss": 0.5095211863517761 + }, + { + "epoch": 1.5069210994660867, + "step": 15241, + "train/sim_loss": 0.039010822772979736 + }, + { + "epoch": 1.5069210994660867, + "step": 15241, + "train/total_loss": 0.08996294438838959 + }, + { + "entropy": 9.59440803527832, + "epoch": 1.5070199723156021, + "mean_token_accuracy": 0.8573351502418518, + "num_tokens": 40784091.0, + "step": 15242, + "train/ce_loss": 0.29677581787109375 + }, + { + "epoch": 1.5070199723156021, + "step": 15242, + "train/sim_loss": 0.02828395366668701 + }, + { + "epoch": 1.5070199723156021, + "step": 15242, + "train/total_loss": 0.057961538434028625 + }, + { + "entropy": 9.18238639831543, + "epoch": 1.5071188451651176, + "mean_token_accuracy": 0.8507772088050842, + "num_tokens": 40792926.0, + "step": 15243, + "train/ce_loss": 0.33126580715179443 + }, + { + "epoch": 1.5071188451651176, + "step": 15243, + "train/sim_loss": 0.023417532444000244 + }, + { + "epoch": 1.5071188451651176, + "step": 15243, + "train/total_loss": 0.05654411390423775 + }, + { + "entropy": 9.389902114868164, + "epoch": 1.5072177180146333, + "mean_token_accuracy": 0.8289473652839661, + "num_tokens": 40802366.0, + "step": 15244, + "train/ce_loss": 0.6158009767532349 + }, + { + "epoch": 1.5072177180146333, + "step": 15244, + "train/sim_loss": 0.030820846557617188 + }, + { + "epoch": 1.5072177180146333, + "step": 15244, + "train/total_loss": 0.0924009457230568 + }, + { + "entropy": 9.520034790039062, + "epoch": 1.5073165908641486, + "mean_token_accuracy": 0.8708609342575073, + "num_tokens": 40816012.0, + "step": 15245, + "train/ce_loss": 0.38281089067459106 + }, + { + "epoch": 1.5073165908641486, + "step": 15245, + "train/sim_loss": 0.03734540939331055 + }, + { + "epoch": 1.5073165908641486, + "step": 15245, + "train/total_loss": 0.07562649995088577 + }, + { + "entropy": 9.11259937286377, + "epoch": 1.5074154637136643, + "mean_token_accuracy": 0.8683351278305054, + "num_tokens": 40824581.0, + "step": 15246, + "train/ce_loss": 0.2845692038536072 + }, + { + "epoch": 1.5074154637136643, + "step": 15246, + "train/sim_loss": 0.027393102645874023 + }, + { + "epoch": 1.5074154637136643, + "step": 15246, + "train/total_loss": 0.05585002154111862 + }, + { + "entropy": 10.04674243927002, + "epoch": 1.5075143365631798, + "mean_token_accuracy": 0.884403645992279, + "num_tokens": 40838730.0, + "step": 15247, + "train/ce_loss": 0.37114575505256653 + }, + { + "epoch": 1.5075143365631798, + "step": 15247, + "train/sim_loss": 0.023322105407714844 + }, + { + "epoch": 1.5075143365631798, + "step": 15247, + "train/total_loss": 0.0604366809129715 + }, + { + "entropy": 9.348958969116211, + "epoch": 1.5076132094126953, + "mean_token_accuracy": 0.8280329704284668, + "num_tokens": 40852129.0, + "step": 15248, + "train/ce_loss": 0.6215295791625977 + }, + { + "epoch": 1.5076132094126953, + "step": 15248, + "train/sim_loss": 0.09190535545349121 + }, + { + "epoch": 1.5076132094126953, + "step": 15248, + "train/total_loss": 0.1540583074092865 + }, + { + "entropy": 9.11739444732666, + "epoch": 1.5077120822622108, + "mean_token_accuracy": 0.8501827120780945, + "num_tokens": 40859774.0, + "step": 15249, + "train/ce_loss": 0.5532041788101196 + }, + { + "epoch": 1.5077120822622108, + "step": 15249, + "train/sim_loss": 0.04745984077453613 + }, + { + "epoch": 1.5077120822622108, + "step": 15249, + "train/total_loss": 0.10278026014566422 + }, + { + "entropy": 9.90300178527832, + "epoch": 1.5078109551117262, + "mean_token_accuracy": 0.8321512937545776, + "num_tokens": 40870984.0, + "step": 15250, + "train/ce_loss": 1.0483204277988989e-06 + }, + { + "epoch": 1.5078109551117262, + "step": 15250, + "train/sim_loss": 0.03597617149353027 + }, + { + "epoch": 1.5078109551117262, + "step": 15250, + "train/total_loss": 0.03597627580165863 + }, + { + "entropy": 9.249942779541016, + "epoch": 1.507909827961242, + "mean_token_accuracy": 0.8587849140167236, + "num_tokens": 40878402.0, + "step": 15251, + "train/ce_loss": 0.6465960144996643 + }, + { + "epoch": 1.507909827961242, + "step": 15251, + "train/sim_loss": 0.017161905765533447 + }, + { + "epoch": 1.507909827961242, + "step": 15251, + "train/total_loss": 0.081821508705616 + }, + { + "entropy": 9.337729454040527, + "epoch": 1.5080087008107572, + "mean_token_accuracy": 0.8747731447219849, + "num_tokens": 40894729.0, + "step": 15252, + "train/ce_loss": 0.4083227515220642 + }, + { + "epoch": 1.5080087008107572, + "step": 15252, + "train/sim_loss": 0.0425412654876709 + }, + { + "epoch": 1.5080087008107572, + "step": 15252, + "train/total_loss": 0.0833735466003418 + }, + { + "entropy": 9.987682342529297, + "epoch": 1.508107573660273, + "mean_token_accuracy": 0.9238095283508301, + "num_tokens": 40904744.0, + "step": 15253, + "train/ce_loss": 3.780600366098952e-07 + }, + { + "epoch": 1.508107573660273, + "step": 15253, + "train/sim_loss": 0.01882404088973999 + }, + { + "epoch": 1.508107573660273, + "step": 15253, + "train/total_loss": 0.018824078142642975 + }, + { + "entropy": 9.144794464111328, + "epoch": 1.5082064465097884, + "mean_token_accuracy": 0.9008073806762695, + "num_tokens": 40913710.0, + "step": 15254, + "train/ce_loss": 0.19407670199871063 + }, + { + "epoch": 1.5082064465097884, + "step": 15254, + "train/sim_loss": 0.010732769966125488 + }, + { + "epoch": 1.5082064465097884, + "step": 15254, + "train/total_loss": 0.03014044091105461 + }, + { + "entropy": 9.481929779052734, + "epoch": 1.508305319359304, + "mean_token_accuracy": 0.873161792755127, + "num_tokens": 40927394.0, + "step": 15255, + "train/ce_loss": 0.36850666999816895 + }, + { + "epoch": 1.508305319359304, + "step": 15255, + "train/sim_loss": 0.03813004493713379 + }, + { + "epoch": 1.508305319359304, + "step": 15255, + "train/total_loss": 0.0749807134270668 + }, + { + "entropy": 9.055500030517578, + "epoch": 1.5084041922088196, + "mean_token_accuracy": 0.8684210777282715, + "num_tokens": 40933466.0, + "step": 15256, + "train/ce_loss": 0.2984480559825897 + }, + { + "epoch": 1.5084041922088196, + "step": 15256, + "train/sim_loss": 0.04872465133666992 + }, + { + "epoch": 1.5084041922088196, + "step": 15256, + "train/total_loss": 0.0785694569349289 + }, + { + "entropy": 9.90013313293457, + "epoch": 1.5085030650583349, + "mean_token_accuracy": 0.8503521084785461, + "num_tokens": 40946158.0, + "step": 15257, + "train/ce_loss": 8.579280006415502e-07 + }, + { + "epoch": 1.5085030650583349, + "step": 15257, + "train/sim_loss": 0.06594276428222656 + }, + { + "epoch": 1.5085030650583349, + "step": 15257, + "train/total_loss": 0.06594285368919373 + }, + { + "entropy": 9.521310806274414, + "epoch": 1.5086019379078506, + "mean_token_accuracy": 0.8262500166893005, + "num_tokens": 40955801.0, + "step": 15258, + "train/ce_loss": 0.5404940247535706 + }, + { + "epoch": 1.5086019379078506, + "step": 15258, + "train/sim_loss": 0.04881274700164795 + }, + { + "epoch": 1.5086019379078506, + "step": 15258, + "train/total_loss": 0.102862149477005 + }, + { + "entropy": 9.716279983520508, + "epoch": 1.508700810757366, + "mean_token_accuracy": 0.8613251447677612, + "num_tokens": 40970984.0, + "step": 15259, + "train/ce_loss": 0.3726854622364044 + }, + { + "epoch": 1.508700810757366, + "step": 15259, + "train/sim_loss": 0.02962726354598999 + }, + { + "epoch": 1.508700810757366, + "step": 15259, + "train/total_loss": 0.06689581274986267 + }, + { + "epoch": 1.5087996836068815, + "grad_norm": 0.49536895751953125, + "learning_rate": 6.229787865301884e-06, + "loss": 0.0771, + "step": 15260 + }, + { + "entropy": 9.495792388916016, + "epoch": 1.5087996836068815, + "mean_token_accuracy": 0.8387942314147949, + "num_tokens": 40984185.0, + "step": 15260, + "train/ce_loss": 0.5685760974884033 + }, + { + "epoch": 1.5087996836068815, + "step": 15260, + "train/sim_loss": 0.07410544157028198 + }, + { + "epoch": 1.5087996836068815, + "step": 15260, + "train/total_loss": 0.1309630572795868 + }, + { + "entropy": 9.370916366577148, + "epoch": 1.508898556456397, + "mean_token_accuracy": 0.8238557577133179, + "num_tokens": 41000934.0, + "step": 15261, + "train/ce_loss": 0.3087047338485718 + }, + { + "epoch": 1.508898556456397, + "step": 15261, + "train/sim_loss": 0.04057514667510986 + }, + { + "epoch": 1.508898556456397, + "step": 15261, + "train/total_loss": 0.07144562155008316 + }, + { + "entropy": 9.492696762084961, + "epoch": 1.5089974293059125, + "mean_token_accuracy": 0.8585164546966553, + "num_tokens": 41014711.0, + "step": 15262, + "train/ce_loss": 0.4420113265514374 + }, + { + "epoch": 1.5089974293059125, + "step": 15262, + "train/sim_loss": 0.03941071033477783 + }, + { + "epoch": 1.5089974293059125, + "step": 15262, + "train/total_loss": 0.08361184597015381 + }, + { + "entropy": 9.570785522460938, + "epoch": 1.5090963021554282, + "mean_token_accuracy": 0.8102094531059265, + "num_tokens": 41032518.0, + "step": 15263, + "train/ce_loss": 0.5612106919288635 + }, + { + "epoch": 1.5090963021554282, + "step": 15263, + "train/sim_loss": 0.026184558868408203 + }, + { + "epoch": 1.5090963021554282, + "step": 15263, + "train/total_loss": 0.08230562508106232 + }, + { + "entropy": 9.491113662719727, + "epoch": 1.5091951750049435, + "mean_token_accuracy": 0.8651960492134094, + "num_tokens": 41049068.0, + "step": 15264, + "train/ce_loss": 0.782423198223114 + }, + { + "epoch": 1.5091951750049435, + "step": 15264, + "train/sim_loss": 0.037886202335357666 + }, + { + "epoch": 1.5091951750049435, + "step": 15264, + "train/total_loss": 0.11612852662801743 + }, + { + "entropy": 9.681403160095215, + "epoch": 1.5092940478544592, + "mean_token_accuracy": 0.8575999736785889, + "num_tokens": 41059747.0, + "step": 15265, + "train/ce_loss": 0.6205748915672302 + }, + { + "epoch": 1.5092940478544592, + "step": 15265, + "train/sim_loss": 0.06865537166595459 + }, + { + "epoch": 1.5092940478544592, + "step": 15265, + "train/total_loss": 0.1307128667831421 + }, + { + "entropy": 9.254836082458496, + "epoch": 1.5093929207039747, + "mean_token_accuracy": 0.8070403933525085, + "num_tokens": 41070181.0, + "step": 15266, + "train/ce_loss": 0.5549293756484985 + }, + { + "epoch": 1.5093929207039747, + "step": 15266, + "train/sim_loss": 0.06612944602966309 + }, + { + "epoch": 1.5093929207039747, + "step": 15266, + "train/total_loss": 0.12162238359451294 + }, + { + "entropy": 9.503713607788086, + "epoch": 1.5094917935534902, + "mean_token_accuracy": 0.8649094104766846, + "num_tokens": 41077961.0, + "step": 15267, + "train/ce_loss": 0.33384183049201965 + }, + { + "epoch": 1.5094917935534902, + "step": 15267, + "train/sim_loss": 0.05990719795227051 + }, + { + "epoch": 1.5094917935534902, + "step": 15267, + "train/total_loss": 0.09329138696193695 + }, + { + "entropy": 9.496091842651367, + "epoch": 1.5095906664030059, + "mean_token_accuracy": 0.8259385824203491, + "num_tokens": 41094001.0, + "step": 15268, + "train/ce_loss": 0.39406824111938477 + }, + { + "epoch": 1.5095906664030059, + "step": 15268, + "train/sim_loss": 0.039667367935180664 + }, + { + "epoch": 1.5095906664030059, + "step": 15268, + "train/total_loss": 0.0790741890668869 + }, + { + "entropy": 9.552652359008789, + "epoch": 1.5096895392525211, + "mean_token_accuracy": 0.8353590965270996, + "num_tokens": 41106966.0, + "step": 15269, + "train/ce_loss": 0.6555337905883789 + }, + { + "epoch": 1.5096895392525211, + "step": 15269, + "train/sim_loss": 0.08039647340774536 + }, + { + "epoch": 1.5096895392525211, + "step": 15269, + "train/total_loss": 0.1459498554468155 + }, + { + "entropy": 8.835590362548828, + "epoch": 1.5097884121020368, + "mean_token_accuracy": 0.8362234830856323, + "num_tokens": 41116369.0, + "step": 15270, + "train/ce_loss": 0.7440898418426514 + }, + { + "epoch": 1.5097884121020368, + "step": 15270, + "train/sim_loss": 0.057096123695373535 + }, + { + "epoch": 1.5097884121020368, + "step": 15270, + "train/total_loss": 0.1315051019191742 + }, + { + "entropy": 9.774566650390625, + "epoch": 1.5098872849515523, + "mean_token_accuracy": 0.8851963877677917, + "num_tokens": 41133029.0, + "step": 15271, + "train/ce_loss": 0.49260443449020386 + }, + { + "epoch": 1.5098872849515523, + "step": 15271, + "train/sim_loss": 0.035079121589660645 + }, + { + "epoch": 1.5098872849515523, + "step": 15271, + "train/total_loss": 0.08433956652879715 + }, + { + "entropy": 9.355579376220703, + "epoch": 1.5099861578010678, + "mean_token_accuracy": 0.8381994962692261, + "num_tokens": 41146390.0, + "step": 15272, + "train/ce_loss": 0.4988965094089508 + }, + { + "epoch": 1.5099861578010678, + "step": 15272, + "train/sim_loss": 0.08671915531158447 + }, + { + "epoch": 1.5099861578010678, + "step": 15272, + "train/total_loss": 0.1366088092327118 + }, + { + "entropy": 9.941305160522461, + "epoch": 1.5100850306505833, + "mean_token_accuracy": 0.8635578751564026, + "num_tokens": 41155280.0, + "step": 15273, + "train/ce_loss": 0.5522709488868713 + }, + { + "epoch": 1.5100850306505833, + "step": 15273, + "train/sim_loss": 0.06190061569213867 + }, + { + "epoch": 1.5100850306505833, + "step": 15273, + "train/total_loss": 0.11712771654129028 + }, + { + "entropy": 9.389190673828125, + "epoch": 1.5101839035000988, + "mean_token_accuracy": 0.8622620105743408, + "num_tokens": 41172032.0, + "step": 15274, + "train/ce_loss": 0.4067184031009674 + }, + { + "epoch": 1.5101839035000988, + "step": 15274, + "train/sim_loss": 0.016785860061645508 + }, + { + "epoch": 1.5101839035000988, + "step": 15274, + "train/total_loss": 0.05745770037174225 + }, + { + "entropy": 9.822257041931152, + "epoch": 1.5102827763496145, + "mean_token_accuracy": 0.8807339668273926, + "num_tokens": 41180249.0, + "step": 15275, + "train/ce_loss": 0.7231151461601257 + }, + { + "epoch": 1.5102827763496145, + "step": 15275, + "train/sim_loss": 0.054636240005493164 + }, + { + "epoch": 1.5102827763496145, + "step": 15275, + "train/total_loss": 0.12694776058197021 + }, + { + "entropy": 9.491188049316406, + "epoch": 1.5103816491991298, + "mean_token_accuracy": 0.8422301411628723, + "num_tokens": 41197125.0, + "step": 15276, + "train/ce_loss": 0.3610512912273407 + }, + { + "epoch": 1.5103816491991298, + "step": 15276, + "train/sim_loss": 0.04751396179199219 + }, + { + "epoch": 1.5103816491991298, + "step": 15276, + "train/total_loss": 0.08361908793449402 + }, + { + "entropy": 9.516081809997559, + "epoch": 1.5104805220486455, + "mean_token_accuracy": 0.8697394728660583, + "num_tokens": 41214656.0, + "step": 15277, + "train/ce_loss": 0.33351343870162964 + }, + { + "epoch": 1.5104805220486455, + "step": 15277, + "train/sim_loss": 0.03243839740753174 + }, + { + "epoch": 1.5104805220486455, + "step": 15277, + "train/total_loss": 0.06578974425792694 + }, + { + "entropy": 9.425627708435059, + "epoch": 1.510579394898161, + "mean_token_accuracy": 0.8807471394538879, + "num_tokens": 41222350.0, + "step": 15278, + "train/ce_loss": 0.27515971660614014 + }, + { + "epoch": 1.510579394898161, + "step": 15278, + "train/sim_loss": 0.011372566223144531 + }, + { + "epoch": 1.510579394898161, + "step": 15278, + "train/total_loss": 0.038888536393642426 + }, + { + "entropy": 9.485260009765625, + "epoch": 1.5106782677476764, + "mean_token_accuracy": 0.8438355922698975, + "num_tokens": 41232808.0, + "step": 15279, + "train/ce_loss": 0.40320444107055664 + }, + { + "epoch": 1.5106782677476764, + "step": 15279, + "train/sim_loss": 0.017361760139465332 + }, + { + "epoch": 1.5106782677476764, + "step": 15279, + "train/total_loss": 0.057682204991579056 + }, + { + "epoch": 1.5107771405971921, + "grad_norm": 0.6366609334945679, + "learning_rate": 6.224843000543935e-06, + "loss": 0.0852, + "step": 15280 + }, + { + "entropy": 9.283798217773438, + "epoch": 1.5107771405971921, + "mean_token_accuracy": 0.8797385692596436, + "num_tokens": 41239056.0, + "step": 15280, + "train/ce_loss": 0.30756837129592896 + }, + { + "epoch": 1.5107771405971921, + "step": 15280, + "train/sim_loss": 0.008325278759002686 + }, + { + "epoch": 1.5107771405971921, + "step": 15280, + "train/total_loss": 0.0390821173787117 + }, + { + "entropy": 9.339574813842773, + "epoch": 1.5108760134467074, + "mean_token_accuracy": 0.8447432518005371, + "num_tokens": 41256723.0, + "step": 15281, + "train/ce_loss": 0.40038490295410156 + }, + { + "epoch": 1.5108760134467074, + "step": 15281, + "train/sim_loss": 0.02067244052886963 + }, + { + "epoch": 1.5108760134467074, + "step": 15281, + "train/total_loss": 0.060710933059453964 + }, + { + "entropy": 9.379606246948242, + "epoch": 1.5109748862962231, + "mean_token_accuracy": 0.8488371968269348, + "num_tokens": 41268093.0, + "step": 15282, + "train/ce_loss": 0.6721352934837341 + }, + { + "epoch": 1.5109748862962231, + "step": 15282, + "train/sim_loss": 0.06878876686096191 + }, + { + "epoch": 1.5109748862962231, + "step": 15282, + "train/total_loss": 0.1360023021697998 + }, + { + "entropy": 9.680747032165527, + "epoch": 1.5110737591457386, + "mean_token_accuracy": 0.8484398126602173, + "num_tokens": 41277607.0, + "step": 15283, + "train/ce_loss": 2.2814235478563205e-07 + }, + { + "epoch": 1.5110737591457386, + "step": 15283, + "train/sim_loss": 0.014687418937683105 + }, + { + "epoch": 1.5110737591457386, + "step": 15283, + "train/total_loss": 0.014687441289424896 + }, + { + "entropy": 9.789131164550781, + "epoch": 1.511172631995254, + "mean_token_accuracy": 0.8227513432502747, + "num_tokens": 41294428.0, + "step": 15284, + "train/ce_loss": 0.5236789584159851 + }, + { + "epoch": 1.511172631995254, + "step": 15284, + "train/sim_loss": 0.03452920913696289 + }, + { + "epoch": 1.511172631995254, + "step": 15284, + "train/total_loss": 0.0868971049785614 + }, + { + "entropy": 8.788528442382812, + "epoch": 1.5112715048447696, + "mean_token_accuracy": 0.8566038012504578, + "num_tokens": 41303690.0, + "step": 15285, + "train/ce_loss": 0.5582460165023804 + }, + { + "epoch": 1.5112715048447696, + "step": 15285, + "train/sim_loss": 0.07293307781219482 + }, + { + "epoch": 1.5112715048447696, + "step": 15285, + "train/total_loss": 0.12875768542289734 + }, + { + "entropy": 9.242887496948242, + "epoch": 1.511370377694285, + "mean_token_accuracy": 0.8418079018592834, + "num_tokens": 41311301.0, + "step": 15286, + "train/ce_loss": 0.4831138551235199 + }, + { + "epoch": 1.511370377694285, + "step": 15286, + "train/sim_loss": 0.01729607582092285 + }, + { + "epoch": 1.511370377694285, + "step": 15286, + "train/total_loss": 0.0656074583530426 + }, + { + "entropy": 9.648238182067871, + "epoch": 1.5114692505438008, + "mean_token_accuracy": 0.8345588445663452, + "num_tokens": 41325304.0, + "step": 15287, + "train/ce_loss": 0.34291672706604004 + }, + { + "epoch": 1.5114692505438008, + "step": 15287, + "train/sim_loss": 0.04513740539550781 + }, + { + "epoch": 1.5114692505438008, + "step": 15287, + "train/total_loss": 0.07942907512187958 + }, + { + "entropy": 9.772977828979492, + "epoch": 1.511568123393316, + "mean_token_accuracy": 0.8697478771209717, + "num_tokens": 41342568.0, + "step": 15288, + "train/ce_loss": 0.2515970766544342 + }, + { + "epoch": 1.511568123393316, + "step": 15288, + "train/sim_loss": 0.06859332323074341 + }, + { + "epoch": 1.511568123393316, + "step": 15288, + "train/total_loss": 0.09375303238630295 + }, + { + "entropy": 9.47530746459961, + "epoch": 1.5116669962428317, + "mean_token_accuracy": 0.8633005023002625, + "num_tokens": 41356647.0, + "step": 15289, + "train/ce_loss": 0.665469765663147 + }, + { + "epoch": 1.5116669962428317, + "step": 15289, + "train/sim_loss": 0.0458950400352478 + }, + { + "epoch": 1.5116669962428317, + "step": 15289, + "train/total_loss": 0.1124420166015625 + }, + { + "entropy": 9.105123519897461, + "epoch": 1.5117658690923472, + "mean_token_accuracy": 0.8302752375602722, + "num_tokens": 41366557.0, + "step": 15290, + "train/ce_loss": 0.3100374639034271 + }, + { + "epoch": 1.5117658690923472, + "step": 15290, + "train/sim_loss": 0.04611349105834961 + }, + { + "epoch": 1.5117658690923472, + "step": 15290, + "train/total_loss": 0.07711723446846008 + }, + { + "entropy": 9.369185447692871, + "epoch": 1.5118647419418627, + "mean_token_accuracy": 0.8662704229354858, + "num_tokens": 41377743.0, + "step": 15291, + "train/ce_loss": 0.3802074193954468 + }, + { + "epoch": 1.5118647419418627, + "step": 15291, + "train/sim_loss": 0.03268396854400635 + }, + { + "epoch": 1.5118647419418627, + "step": 15291, + "train/total_loss": 0.07070471346378326 + }, + { + "entropy": 9.23088264465332, + "epoch": 1.5119636147913784, + "mean_token_accuracy": 0.8473520278930664, + "num_tokens": 41393069.0, + "step": 15292, + "train/ce_loss": 0.5027745962142944 + }, + { + "epoch": 1.5119636147913784, + "step": 15292, + "train/sim_loss": 0.08037173748016357 + }, + { + "epoch": 1.5119636147913784, + "step": 15292, + "train/total_loss": 0.13064919412136078 + }, + { + "entropy": 10.05140495300293, + "epoch": 1.5120624876408937, + "mean_token_accuracy": 0.8852988481521606, + "num_tokens": 41408742.0, + "step": 15293, + "train/ce_loss": 0.5298846960067749 + }, + { + "epoch": 1.5120624876408937, + "step": 15293, + "train/sim_loss": 0.05489271879196167 + }, + { + "epoch": 1.5120624876408937, + "step": 15293, + "train/total_loss": 0.10788118839263916 + }, + { + "entropy": 9.093461990356445, + "epoch": 1.5121613604904094, + "mean_token_accuracy": 0.8673780560493469, + "num_tokens": 41416724.0, + "step": 15294, + "train/ce_loss": 6.442693120334297e-06 + }, + { + "epoch": 1.5121613604904094, + "step": 15294, + "train/sim_loss": 0.027222037315368652 + }, + { + "epoch": 1.5121613604904094, + "step": 15294, + "train/total_loss": 0.027222681790590286 + }, + { + "entropy": 9.32328987121582, + "epoch": 1.5122602333399249, + "mean_token_accuracy": 0.8628659248352051, + "num_tokens": 41425736.0, + "step": 15295, + "train/ce_loss": 0.4483465850353241 + }, + { + "epoch": 1.5122602333399249, + "step": 15295, + "train/sim_loss": 0.060864150524139404 + }, + { + "epoch": 1.5122602333399249, + "step": 15295, + "train/total_loss": 0.10569880902767181 + }, + { + "entropy": 9.623566627502441, + "epoch": 1.5123591061894404, + "mean_token_accuracy": 0.8651315569877625, + "num_tokens": 41438573.0, + "step": 15296, + "train/ce_loss": 0.5123060941696167 + }, + { + "epoch": 1.5123591061894404, + "step": 15296, + "train/sim_loss": 0.014915943145751953 + }, + { + "epoch": 1.5123591061894404, + "step": 15296, + "train/total_loss": 0.06614655256271362 + }, + { + "entropy": 9.432600021362305, + "epoch": 1.512457979038956, + "mean_token_accuracy": 0.8571428656578064, + "num_tokens": 41453718.0, + "step": 15297, + "train/ce_loss": 0.6818351149559021 + }, + { + "epoch": 1.512457979038956, + "step": 15297, + "train/sim_loss": 0.04745745658874512 + }, + { + "epoch": 1.512457979038956, + "step": 15297, + "train/total_loss": 0.11564096808433533 + }, + { + "entropy": 9.110176086425781, + "epoch": 1.5125568518884713, + "mean_token_accuracy": 0.7939814925193787, + "num_tokens": 41462985.0, + "step": 15298, + "train/ce_loss": 0.2891484797000885 + }, + { + "epoch": 1.5125568518884713, + "step": 15298, + "train/sim_loss": 0.013653099536895752 + }, + { + "epoch": 1.5125568518884713, + "step": 15298, + "train/total_loss": 0.04256794601678848 + }, + { + "entropy": 9.503353118896484, + "epoch": 1.512655724737987, + "mean_token_accuracy": 0.8909090757369995, + "num_tokens": 41478787.0, + "step": 15299, + "train/ce_loss": 0.6403515934944153 + }, + { + "epoch": 1.512655724737987, + "step": 15299, + "train/sim_loss": 0.04794740676879883 + }, + { + "epoch": 1.512655724737987, + "step": 15299, + "train/total_loss": 0.1119825690984726 + }, + { + "epoch": 1.5127545975875025, + "grad_norm": 0.5071054100990295, + "learning_rate": 6.2198981357859865e-06, + "loss": 0.0847, + "step": 15300 + }, + { + "entropy": 8.93539810180664, + "epoch": 1.5127545975875025, + "mean_token_accuracy": 0.8681862950325012, + "num_tokens": 41487140.0, + "step": 15300, + "train/ce_loss": 0.5179027915000916 + }, + { + "epoch": 1.5127545975875025, + "step": 15300, + "train/sim_loss": 0.012547671794891357 + }, + { + "epoch": 1.5127545975875025, + "step": 15300, + "train/total_loss": 0.06433795392513275 + }, + { + "entropy": 9.841476440429688, + "epoch": 1.512853470437018, + "mean_token_accuracy": 0.8356807231903076, + "num_tokens": 41505691.0, + "step": 15301, + "train/ce_loss": 0.5636676549911499 + }, + { + "epoch": 1.512853470437018, + "step": 15301, + "train/sim_loss": 0.04436659812927246 + }, + { + "epoch": 1.512853470437018, + "step": 15301, + "train/total_loss": 0.10073336958885193 + }, + { + "entropy": 9.624027252197266, + "epoch": 1.5129523432865335, + "mean_token_accuracy": 0.868563711643219, + "num_tokens": 41515320.0, + "step": 15302, + "train/ce_loss": 0.34827056527137756 + }, + { + "epoch": 1.5129523432865335, + "step": 15302, + "train/sim_loss": 0.025673866271972656 + }, + { + "epoch": 1.5129523432865335, + "step": 15302, + "train/total_loss": 0.06050092354416847 + }, + { + "entropy": 9.400556564331055, + "epoch": 1.513051216136049, + "mean_token_accuracy": 0.8674351572990417, + "num_tokens": 41522190.0, + "step": 15303, + "train/ce_loss": 0.5635198950767517 + }, + { + "epoch": 1.513051216136049, + "step": 15303, + "train/sim_loss": 0.015951156616210938 + }, + { + "epoch": 1.513051216136049, + "step": 15303, + "train/total_loss": 0.07230314612388611 + }, + { + "entropy": 9.560579299926758, + "epoch": 1.5131500889855647, + "mean_token_accuracy": 0.8514285683631897, + "num_tokens": 41537379.0, + "step": 15304, + "train/ce_loss": 0.35272711515426636 + }, + { + "epoch": 1.5131500889855647, + "step": 15304, + "train/sim_loss": 0.018010258674621582 + }, + { + "epoch": 1.5131500889855647, + "step": 15304, + "train/total_loss": 0.0532829724252224 + }, + { + "entropy": 9.182016372680664, + "epoch": 1.51324896183508, + "mean_token_accuracy": 0.8378062844276428, + "num_tokens": 41545192.0, + "step": 15305, + "train/ce_loss": 0.2680305540561676 + }, + { + "epoch": 1.51324896183508, + "step": 15305, + "train/sim_loss": 0.01291424036026001 + }, + { + "epoch": 1.51324896183508, + "step": 15305, + "train/total_loss": 0.03971729427576065 + }, + { + "entropy": 9.525224685668945, + "epoch": 1.5133478346845957, + "mean_token_accuracy": 0.8115577697753906, + "num_tokens": 41559053.0, + "step": 15306, + "train/ce_loss": 0.3822838068008423 + }, + { + "epoch": 1.5133478346845957, + "step": 15306, + "train/sim_loss": 0.037681400775909424 + }, + { + "epoch": 1.5133478346845957, + "step": 15306, + "train/total_loss": 0.07590977847576141 + }, + { + "entropy": 9.099453926086426, + "epoch": 1.5134467075341111, + "mean_token_accuracy": 0.7986190915107727, + "num_tokens": 41568789.0, + "step": 15307, + "train/ce_loss": 0.3614880442619324 + }, + { + "epoch": 1.5134467075341111, + "step": 15307, + "train/sim_loss": 0.06391680240631104 + }, + { + "epoch": 1.5134467075341111, + "step": 15307, + "train/total_loss": 0.10006560385227203 + }, + { + "entropy": 10.065584182739258, + "epoch": 1.5135455803836266, + "mean_token_accuracy": 0.8794726729393005, + "num_tokens": 41581121.0, + "step": 15308, + "train/ce_loss": 2.7837691618515237e-07 + }, + { + "epoch": 1.5135455803836266, + "step": 15308, + "train/sim_loss": 0.012564897537231445 + }, + { + "epoch": 1.5135455803836266, + "step": 15308, + "train/total_loss": 0.012564925476908684 + }, + { + "entropy": 9.795450210571289, + "epoch": 1.5136444532331423, + "mean_token_accuracy": 0.8907563090324402, + "num_tokens": 41586340.0, + "step": 15309, + "train/ce_loss": 0.6996292471885681 + }, + { + "epoch": 1.5136444532331423, + "step": 15309, + "train/sim_loss": 0.024226486682891846 + }, + { + "epoch": 1.5136444532331423, + "step": 15309, + "train/total_loss": 0.09418941289186478 + }, + { + "entropy": 8.58891487121582, + "epoch": 1.5137433260826576, + "mean_token_accuracy": 0.8618181943893433, + "num_tokens": 41595154.0, + "step": 15310, + "train/ce_loss": 0.3550266921520233 + }, + { + "epoch": 1.5137433260826576, + "step": 15310, + "train/sim_loss": 0.03498673439025879 + }, + { + "epoch": 1.5137433260826576, + "step": 15310, + "train/total_loss": 0.07048940658569336 + }, + { + "entropy": 8.879453659057617, + "epoch": 1.5138421989321733, + "mean_token_accuracy": 0.8977832794189453, + "num_tokens": 41605788.0, + "step": 15311, + "train/ce_loss": 0.1926434487104416 + }, + { + "epoch": 1.5138421989321733, + "step": 15311, + "train/sim_loss": 0.10846936702728271 + }, + { + "epoch": 1.5138421989321733, + "step": 15311, + "train/total_loss": 0.12773370742797852 + }, + { + "entropy": 9.551843643188477, + "epoch": 1.5139410717816888, + "mean_token_accuracy": 0.8651102185249329, + "num_tokens": 41618307.0, + "step": 15312, + "train/ce_loss": 0.3579200506210327 + }, + { + "epoch": 1.5139410717816888, + "step": 15312, + "train/sim_loss": 0.05696845054626465 + }, + { + "epoch": 1.5139410717816888, + "step": 15312, + "train/total_loss": 0.09276045858860016 + }, + { + "entropy": 9.661539077758789, + "epoch": 1.5140399446312043, + "mean_token_accuracy": 0.846815824508667, + "num_tokens": 41628914.0, + "step": 15313, + "train/ce_loss": 0.5355564951896667 + }, + { + "epoch": 1.5140399446312043, + "step": 15313, + "train/sim_loss": 0.020774126052856445 + }, + { + "epoch": 1.5140399446312043, + "step": 15313, + "train/total_loss": 0.07432977855205536 + }, + { + "entropy": 9.319146156311035, + "epoch": 1.5141388174807198, + "mean_token_accuracy": 0.8565737009048462, + "num_tokens": 41639021.0, + "step": 15314, + "train/ce_loss": 0.4239491820335388 + }, + { + "epoch": 1.5141388174807198, + "step": 15314, + "train/sim_loss": 0.05708807706832886 + }, + { + "epoch": 1.5141388174807198, + "step": 15314, + "train/total_loss": 0.09948299825191498 + }, + { + "entropy": 9.523260116577148, + "epoch": 1.5142376903302353, + "mean_token_accuracy": 0.8613497018814087, + "num_tokens": 41654040.0, + "step": 15315, + "train/ce_loss": 0.19089433550834656 + }, + { + "epoch": 1.5142376903302353, + "step": 15315, + "train/sim_loss": 0.08124059438705444 + }, + { + "epoch": 1.5142376903302353, + "step": 15315, + "train/total_loss": 0.10033002495765686 + }, + { + "entropy": 9.534064292907715, + "epoch": 1.514336563179751, + "mean_token_accuracy": 0.8445441126823425, + "num_tokens": 41662532.0, + "step": 15316, + "train/ce_loss": 0.6244343519210815 + }, + { + "epoch": 1.514336563179751, + "step": 15316, + "train/sim_loss": 0.03187835216522217 + }, + { + "epoch": 1.514336563179751, + "step": 15316, + "train/total_loss": 0.09432178735733032 + }, + { + "entropy": 9.569313049316406, + "epoch": 1.5144354360292662, + "mean_token_accuracy": 0.8293555974960327, + "num_tokens": 41676374.0, + "step": 15317, + "train/ce_loss": 0.6287281513214111 + }, + { + "epoch": 1.5144354360292662, + "step": 15317, + "train/sim_loss": 0.06470894813537598 + }, + { + "epoch": 1.5144354360292662, + "step": 15317, + "train/total_loss": 0.12758177518844604 + }, + { + "entropy": 9.645223617553711, + "epoch": 1.514534308878782, + "mean_token_accuracy": 0.8987341523170471, + "num_tokens": 41687461.0, + "step": 15318, + "train/ce_loss": 0.37029170989990234 + }, + { + "epoch": 1.514534308878782, + "step": 15318, + "train/sim_loss": 0.01756840944290161 + }, + { + "epoch": 1.514534308878782, + "step": 15318, + "train/total_loss": 0.054597582668066025 + }, + { + "entropy": 9.685308456420898, + "epoch": 1.5146331817282974, + "mean_token_accuracy": 0.8729016780853271, + "num_tokens": 41707452.0, + "step": 15319, + "train/ce_loss": 0.2802647352218628 + }, + { + "epoch": 1.5146331817282974, + "step": 15319, + "train/sim_loss": 0.06794428825378418 + }, + { + "epoch": 1.5146331817282974, + "step": 15319, + "train/total_loss": 0.0959707647562027 + }, + { + "epoch": 1.514732054577813, + "grad_norm": 0.5761536359786987, + "learning_rate": 6.214953271028038e-06, + "loss": 0.0811, + "step": 15320 + }, + { + "entropy": 9.35706615447998, + "epoch": 1.514732054577813, + "mean_token_accuracy": 0.8603026866912842, + "num_tokens": 41717998.0, + "step": 15320, + "train/ce_loss": 0.29530197381973267 + }, + { + "epoch": 1.514732054577813, + "step": 15320, + "train/sim_loss": 0.04946857690811157 + }, + { + "epoch": 1.514732054577813, + "step": 15320, + "train/total_loss": 0.07899877429008484 + }, + { + "entropy": 9.092074394226074, + "epoch": 1.5148309274273286, + "mean_token_accuracy": 0.8004434704780579, + "num_tokens": 41727557.0, + "step": 15321, + "train/ce_loss": 0.21197277307510376 + }, + { + "epoch": 1.5148309274273286, + "step": 15321, + "train/sim_loss": 0.06692427396774292 + }, + { + "epoch": 1.5148309274273286, + "step": 15321, + "train/total_loss": 0.08812154829502106 + }, + { + "entropy": 9.864897727966309, + "epoch": 1.5149298002768439, + "mean_token_accuracy": 0.8523592352867126, + "num_tokens": 41739788.0, + "step": 15322, + "train/ce_loss": 0.5470691323280334 + }, + { + "epoch": 1.5149298002768439, + "step": 15322, + "train/sim_loss": 0.02036905288696289 + }, + { + "epoch": 1.5149298002768439, + "step": 15322, + "train/total_loss": 0.07507596909999847 + }, + { + "entropy": 9.862483978271484, + "epoch": 1.5150286731263596, + "mean_token_accuracy": 0.8844884634017944, + "num_tokens": 41756388.0, + "step": 15323, + "train/ce_loss": 0.595809817314148 + }, + { + "epoch": 1.5150286731263596, + "step": 15323, + "train/sim_loss": 0.0576968789100647 + }, + { + "epoch": 1.5150286731263596, + "step": 15323, + "train/total_loss": 0.11727786064147949 + }, + { + "entropy": 9.37603759765625, + "epoch": 1.515127545975875, + "mean_token_accuracy": 0.8831775784492493, + "num_tokens": 41767077.0, + "step": 15324, + "train/ce_loss": 0.4565706253051758 + }, + { + "epoch": 1.515127545975875, + "step": 15324, + "train/sim_loss": 0.045584678649902344 + }, + { + "epoch": 1.515127545975875, + "step": 15324, + "train/total_loss": 0.0912417471408844 + }, + { + "entropy": 9.489492416381836, + "epoch": 1.5152264188253906, + "mean_token_accuracy": 0.812362015247345, + "num_tokens": 41781665.0, + "step": 15325, + "train/ce_loss": 0.3366992175579071 + }, + { + "epoch": 1.5152264188253906, + "step": 15325, + "train/sim_loss": 0.02751338481903076 + }, + { + "epoch": 1.5152264188253906, + "step": 15325, + "train/total_loss": 0.06118330731987953 + }, + { + "entropy": 9.011606216430664, + "epoch": 1.515325291674906, + "mean_token_accuracy": 0.8608198165893555, + "num_tokens": 41793953.0, + "step": 15326, + "train/ce_loss": 0.4568960666656494 + }, + { + "epoch": 1.515325291674906, + "step": 15326, + "train/sim_loss": 0.015096545219421387 + }, + { + "epoch": 1.515325291674906, + "step": 15326, + "train/total_loss": 0.06078615412116051 + }, + { + "entropy": 9.746309280395508, + "epoch": 1.5154241645244215, + "mean_token_accuracy": 0.875852644443512, + "num_tokens": 41807870.0, + "step": 15327, + "train/ce_loss": 0.19106556475162506 + }, + { + "epoch": 1.5154241645244215, + "step": 15327, + "train/sim_loss": 0.07854533195495605 + }, + { + "epoch": 1.5154241645244215, + "step": 15327, + "train/total_loss": 0.0976518914103508 + }, + { + "entropy": 9.557496070861816, + "epoch": 1.5155230373739372, + "mean_token_accuracy": 0.8779761791229248, + "num_tokens": 41819145.0, + "step": 15328, + "train/ce_loss": 0.43750661611557007 + }, + { + "epoch": 1.5155230373739372, + "step": 15328, + "train/sim_loss": 0.04594826698303223 + }, + { + "epoch": 1.5155230373739372, + "step": 15328, + "train/total_loss": 0.089698925614357 + }, + { + "entropy": 9.222186088562012, + "epoch": 1.5156219102234525, + "mean_token_accuracy": 0.8498331308364868, + "num_tokens": 41830572.0, + "step": 15329, + "train/ce_loss": 0.28995972871780396 + }, + { + "epoch": 1.5156219102234525, + "step": 15329, + "train/sim_loss": 0.07632845640182495 + }, + { + "epoch": 1.5156219102234525, + "step": 15329, + "train/total_loss": 0.10532443225383759 + }, + { + "entropy": 8.979402542114258, + "epoch": 1.5157207830729682, + "mean_token_accuracy": 0.8820286393165588, + "num_tokens": 41837682.0, + "step": 15330, + "train/ce_loss": 0.30442190170288086 + }, + { + "epoch": 1.5157207830729682, + "step": 15330, + "train/sim_loss": 0.014702320098876953 + }, + { + "epoch": 1.5157207830729682, + "step": 15330, + "train/total_loss": 0.04514451324939728 + }, + { + "entropy": 9.212423324584961, + "epoch": 1.5158196559224837, + "mean_token_accuracy": 0.860927164554596, + "num_tokens": 41846702.0, + "step": 15331, + "train/ce_loss": 0.31803569197654724 + }, + { + "epoch": 1.5158196559224837, + "step": 15331, + "train/sim_loss": 0.024610161781311035 + }, + { + "epoch": 1.5158196559224837, + "step": 15331, + "train/total_loss": 0.05641373246908188 + }, + { + "entropy": 9.33746337890625, + "epoch": 1.5159185287719992, + "mean_token_accuracy": 0.8748299479484558, + "num_tokens": 41858647.0, + "step": 15332, + "train/ce_loss": 0.5899785161018372 + }, + { + "epoch": 1.5159185287719992, + "step": 15332, + "train/sim_loss": 0.07695293426513672 + }, + { + "epoch": 1.5159185287719992, + "step": 15332, + "train/total_loss": 0.13595078885555267 + }, + { + "entropy": 9.426146507263184, + "epoch": 1.5160174016215149, + "mean_token_accuracy": 0.8095808625221252, + "num_tokens": 41869703.0, + "step": 15333, + "train/ce_loss": 0.6409577131271362 + }, + { + "epoch": 1.5160174016215149, + "step": 15333, + "train/sim_loss": 0.039656221866607666 + }, + { + "epoch": 1.5160174016215149, + "step": 15333, + "train/total_loss": 0.10375199466943741 + }, + { + "entropy": 9.426513671875, + "epoch": 1.5161162744710301, + "mean_token_accuracy": 0.8581157922744751, + "num_tokens": 41879920.0, + "step": 15334, + "train/ce_loss": 0.40165936946868896 + }, + { + "epoch": 1.5161162744710301, + "step": 15334, + "train/sim_loss": 0.06303220987319946 + }, + { + "epoch": 1.5161162744710301, + "step": 15334, + "train/total_loss": 0.10319814831018448 + }, + { + "entropy": 9.573299407958984, + "epoch": 1.5162151473205459, + "mean_token_accuracy": 0.8941176533699036, + "num_tokens": 41894402.0, + "step": 15335, + "train/ce_loss": 0.5533101558685303 + }, + { + "epoch": 1.5162151473205459, + "step": 15335, + "train/sim_loss": 0.029624342918395996 + }, + { + "epoch": 1.5162151473205459, + "step": 15335, + "train/total_loss": 0.0849553644657135 + }, + { + "entropy": 9.830947875976562, + "epoch": 1.5163140201700613, + "mean_token_accuracy": 0.8702461123466492, + "num_tokens": 41901086.0, + "step": 15336, + "train/ce_loss": 0.4702914357185364 + }, + { + "epoch": 1.5163140201700613, + "step": 15336, + "train/sim_loss": 0.06170934438705444 + }, + { + "epoch": 1.5163140201700613, + "step": 15336, + "train/total_loss": 0.1087384894490242 + }, + { + "entropy": 8.843080520629883, + "epoch": 1.5164128930195768, + "mean_token_accuracy": 0.832335352897644, + "num_tokens": 41909664.0, + "step": 15337, + "train/ce_loss": 0.3514362573623657 + }, + { + "epoch": 1.5164128930195768, + "step": 15337, + "train/sim_loss": 0.0395890474319458 + }, + { + "epoch": 1.5164128930195768, + "step": 15337, + "train/total_loss": 0.07473267614841461 + }, + { + "entropy": 9.559646606445312, + "epoch": 1.5165117658690923, + "mean_token_accuracy": 0.8595505356788635, + "num_tokens": 41919674.0, + "step": 15338, + "train/ce_loss": 8.839805900606734e-07 + }, + { + "epoch": 1.5165117658690923, + "step": 15338, + "train/sim_loss": 0.03650999069213867 + }, + { + "epoch": 1.5165117658690923, + "step": 15338, + "train/total_loss": 0.036510080099105835 + }, + { + "entropy": 9.317806243896484, + "epoch": 1.5166106387186078, + "mean_token_accuracy": 0.8451219797134399, + "num_tokens": 41931159.0, + "step": 15339, + "train/ce_loss": 0.42745721340179443 + }, + { + "epoch": 1.5166106387186078, + "step": 15339, + "train/sim_loss": 0.04772740602493286 + }, + { + "epoch": 1.5166106387186078, + "step": 15339, + "train/total_loss": 0.09047313034534454 + }, + { + "epoch": 1.5167095115681235, + "grad_norm": 0.621955156326294, + "learning_rate": 6.210008406270089e-06, + "loss": 0.0832, + "step": 15340 + }, + { + "entropy": 10.299261093139648, + "epoch": 1.5167095115681235, + "mean_token_accuracy": 0.8847184777259827, + "num_tokens": 41938904.0, + "step": 15340, + "train/ce_loss": 0.455218106508255 + }, + { + "epoch": 1.5167095115681235, + "step": 15340, + "train/sim_loss": 0.1345347762107849 + }, + { + "epoch": 1.5167095115681235, + "step": 15340, + "train/total_loss": 0.1800565868616104 + }, + { + "entropy": 9.628652572631836, + "epoch": 1.5168083844176388, + "mean_token_accuracy": 0.8895487189292908, + "num_tokens": 41950376.0, + "step": 15341, + "train/ce_loss": 0.30735209584236145 + }, + { + "epoch": 1.5168083844176388, + "step": 15341, + "train/sim_loss": 0.037671446800231934 + }, + { + "epoch": 1.5168083844176388, + "step": 15341, + "train/total_loss": 0.06840665638446808 + }, + { + "entropy": 9.314868927001953, + "epoch": 1.5169072572671545, + "mean_token_accuracy": 0.8335301280021667, + "num_tokens": 41963873.0, + "step": 15342, + "train/ce_loss": 0.8668986558914185 + }, + { + "epoch": 1.5169072572671545, + "step": 15342, + "train/sim_loss": 0.04531151056289673 + }, + { + "epoch": 1.5169072572671545, + "step": 15342, + "train/total_loss": 0.1320013701915741 + }, + { + "entropy": 9.65693473815918, + "epoch": 1.51700613011667, + "mean_token_accuracy": 0.8143885135650635, + "num_tokens": 41977208.0, + "step": 15343, + "train/ce_loss": 0.5352185368537903 + }, + { + "epoch": 1.51700613011667, + "step": 15343, + "train/sim_loss": 0.029056906700134277 + }, + { + "epoch": 1.51700613011667, + "step": 15343, + "train/total_loss": 0.08257876336574554 + }, + { + "entropy": 9.494815826416016, + "epoch": 1.5171050029661854, + "mean_token_accuracy": 0.8685047626495361, + "num_tokens": 41998085.0, + "step": 15344, + "train/ce_loss": 0.4051591157913208 + }, + { + "epoch": 1.5171050029661854, + "step": 15344, + "train/sim_loss": 0.024067580699920654 + }, + { + "epoch": 1.5171050029661854, + "step": 15344, + "train/total_loss": 0.06458349525928497 + }, + { + "entropy": 9.308320999145508, + "epoch": 1.5172038758157012, + "mean_token_accuracy": 0.847478449344635, + "num_tokens": 42006639.0, + "step": 15345, + "train/ce_loss": 0.4750339090824127 + }, + { + "epoch": 1.5172038758157012, + "step": 15345, + "train/sim_loss": 0.0596315860748291 + }, + { + "epoch": 1.5172038758157012, + "step": 15345, + "train/total_loss": 0.10713498294353485 + }, + { + "entropy": 9.358270645141602, + "epoch": 1.5173027486652164, + "mean_token_accuracy": 0.8709198832511902, + "num_tokens": 42020375.0, + "step": 15346, + "train/ce_loss": 2.787423625250085e-07 + }, + { + "epoch": 1.5173027486652164, + "step": 15346, + "train/sim_loss": 0.016151607036590576 + }, + { + "epoch": 1.5173027486652164, + "step": 15346, + "train/total_loss": 0.016151634976267815 + }, + { + "entropy": 9.41522216796875, + "epoch": 1.5174016215147321, + "mean_token_accuracy": 0.8132529854774475, + "num_tokens": 42031929.0, + "step": 15347, + "train/ce_loss": 0.23747433722019196 + }, + { + "epoch": 1.5174016215147321, + "step": 15347, + "train/sim_loss": 0.07695472240447998 + }, + { + "epoch": 1.5174016215147321, + "step": 15347, + "train/total_loss": 0.10070215910673141 + }, + { + "entropy": 9.1301908493042, + "epoch": 1.5175004943642476, + "mean_token_accuracy": 0.7967479825019836, + "num_tokens": 42041822.0, + "step": 15348, + "train/ce_loss": 0.8358924388885498 + }, + { + "epoch": 1.5175004943642476, + "step": 15348, + "train/sim_loss": 0.10275918245315552 + }, + { + "epoch": 1.5175004943642476, + "step": 15348, + "train/total_loss": 0.18634843826293945 + }, + { + "entropy": 9.538987159729004, + "epoch": 1.517599367213763, + "mean_token_accuracy": 0.9250646233558655, + "num_tokens": 42050238.0, + "step": 15349, + "train/ce_loss": 7.701807589910459e-06 + }, + { + "epoch": 1.517599367213763, + "step": 15349, + "train/sim_loss": 0.030423402786254883 + }, + { + "epoch": 1.517599367213763, + "step": 15349, + "train/total_loss": 0.030424172058701515 + }, + { + "entropy": 9.309733390808105, + "epoch": 1.5176982400632786, + "mean_token_accuracy": 0.8039950132369995, + "num_tokens": 42060279.0, + "step": 15350, + "train/ce_loss": 0.5358825922012329 + }, + { + "epoch": 1.5176982400632786, + "step": 15350, + "train/sim_loss": 0.0364377498626709 + }, + { + "epoch": 1.5176982400632786, + "step": 15350, + "train/total_loss": 0.09002600610256195 + }, + { + "entropy": 9.591800689697266, + "epoch": 1.517797112912794, + "mean_token_accuracy": 0.8852201104164124, + "num_tokens": 42077401.0, + "step": 15351, + "train/ce_loss": 0.46293601393699646 + }, + { + "epoch": 1.517797112912794, + "step": 15351, + "train/sim_loss": 0.06648808717727661 + }, + { + "epoch": 1.517797112912794, + "step": 15351, + "train/total_loss": 0.11278168857097626 + }, + { + "entropy": 9.502721786499023, + "epoch": 1.5178959857623098, + "mean_token_accuracy": 0.7962308526039124, + "num_tokens": 42090400.0, + "step": 15352, + "train/ce_loss": 0.5978214740753174 + }, + { + "epoch": 1.5178959857623098, + "step": 15352, + "train/sim_loss": 0.09240633249282837 + }, + { + "epoch": 1.5178959857623098, + "step": 15352, + "train/total_loss": 0.1521884799003601 + }, + { + "entropy": 9.629555702209473, + "epoch": 1.517994858611825, + "mean_token_accuracy": 0.8435114622116089, + "num_tokens": 42106473.0, + "step": 15353, + "train/ce_loss": 0.579669713973999 + }, + { + "epoch": 1.517994858611825, + "step": 15353, + "train/sim_loss": 0.01457369327545166 + }, + { + "epoch": 1.517994858611825, + "step": 15353, + "train/total_loss": 0.07254067063331604 + }, + { + "entropy": 9.12011432647705, + "epoch": 1.5180937314613407, + "mean_token_accuracy": 0.8138352036476135, + "num_tokens": 42115714.0, + "step": 15354, + "train/ce_loss": 0.6398884654045105 + }, + { + "epoch": 1.5180937314613407, + "step": 15354, + "train/sim_loss": 0.022256970405578613 + }, + { + "epoch": 1.5180937314613407, + "step": 15354, + "train/total_loss": 0.0862458199262619 + }, + { + "entropy": 9.687910079956055, + "epoch": 1.5181926043108562, + "mean_token_accuracy": 0.8095872402191162, + "num_tokens": 42129354.0, + "step": 15355, + "train/ce_loss": 0.24505667388439178 + }, + { + "epoch": 1.5181926043108562, + "step": 15355, + "train/sim_loss": 0.10333096981048584 + }, + { + "epoch": 1.5181926043108562, + "step": 15355, + "train/total_loss": 0.12783664464950562 + }, + { + "entropy": 9.451581001281738, + "epoch": 1.5182914771603717, + "mean_token_accuracy": 0.8429629802703857, + "num_tokens": 42153310.0, + "step": 15356, + "train/ce_loss": 0.4099917411804199 + }, + { + "epoch": 1.5182914771603717, + "step": 15356, + "train/sim_loss": 0.06337201595306396 + }, + { + "epoch": 1.5182914771603717, + "step": 15356, + "train/total_loss": 0.10437119007110596 + }, + { + "entropy": 9.59560775756836, + "epoch": 1.5183903500098874, + "mean_token_accuracy": 0.9105180501937866, + "num_tokens": 42161751.0, + "step": 15357, + "train/ce_loss": 0.17043523490428925 + }, + { + "epoch": 1.5183903500098874, + "step": 15357, + "train/sim_loss": 0.025784730911254883 + }, + { + "epoch": 1.5183903500098874, + "step": 15357, + "train/total_loss": 0.04282825440168381 + }, + { + "entropy": 9.843306541442871, + "epoch": 1.5184892228594027, + "mean_token_accuracy": 0.8663238883018494, + "num_tokens": 42172762.0, + "step": 15358, + "train/ce_loss": 0.5846161246299744 + }, + { + "epoch": 1.5184892228594027, + "step": 15358, + "train/sim_loss": 0.025686264038085938 + }, + { + "epoch": 1.5184892228594027, + "step": 15358, + "train/total_loss": 0.0841478779911995 + }, + { + "entropy": 9.439739227294922, + "epoch": 1.5185880957089184, + "mean_token_accuracy": 0.820652186870575, + "num_tokens": 42185737.0, + "step": 15359, + "train/ce_loss": 0.554789662361145 + }, + { + "epoch": 1.5185880957089184, + "step": 15359, + "train/sim_loss": 0.06644928455352783 + }, + { + "epoch": 1.5185880957089184, + "step": 15359, + "train/total_loss": 0.12192825227975845 + }, + { + "epoch": 1.5186869685584339, + "grad_norm": 0.6017338037490845, + "learning_rate": 6.20506354151214e-06, + "loss": 0.089, + "step": 15360 + }, + { + "entropy": 9.581140518188477, + "epoch": 1.5186869685584339, + "mean_token_accuracy": 0.9265822768211365, + "num_tokens": 42202942.0, + "step": 15360, + "train/ce_loss": 1.9156491362082306e-06 + }, + { + "epoch": 1.5186869685584339, + "step": 15360, + "train/sim_loss": 0.043786585330963135 + }, + { + "epoch": 1.5186869685584339, + "step": 15360, + "train/total_loss": 0.043786775320768356 + }, + { + "entropy": 9.04450798034668, + "epoch": 1.5187858414079494, + "mean_token_accuracy": 0.8940149545669556, + "num_tokens": 42215658.0, + "step": 15361, + "train/ce_loss": 0.2967563271522522 + }, + { + "epoch": 1.5187858414079494, + "step": 15361, + "train/sim_loss": 0.013794422149658203 + }, + { + "epoch": 1.5187858414079494, + "step": 15361, + "train/total_loss": 0.04347005486488342 + }, + { + "entropy": 9.455604553222656, + "epoch": 1.5188847142574649, + "mean_token_accuracy": 0.8617449402809143, + "num_tokens": 42226444.0, + "step": 15362, + "train/ce_loss": 0.8807546496391296 + }, + { + "epoch": 1.5188847142574649, + "step": 15362, + "train/sim_loss": 0.07988178730010986 + }, + { + "epoch": 1.5188847142574649, + "step": 15362, + "train/total_loss": 0.16795724630355835 + }, + { + "entropy": 9.534673690795898, + "epoch": 1.5189835871069803, + "mean_token_accuracy": 0.8752327561378479, + "num_tokens": 42236429.0, + "step": 15363, + "train/ce_loss": 0.5303903818130493 + }, + { + "epoch": 1.5189835871069803, + "step": 15363, + "train/sim_loss": 0.016776561737060547 + }, + { + "epoch": 1.5189835871069803, + "step": 15363, + "train/total_loss": 0.06981560587882996 + }, + { + "entropy": 8.316993713378906, + "epoch": 1.519082459956496, + "mean_token_accuracy": 0.8721153736114502, + "num_tokens": 42245976.0, + "step": 15364, + "train/ce_loss": 0.45983803272247314 + }, + { + "epoch": 1.519082459956496, + "step": 15364, + "train/sim_loss": 0.027808427810668945 + }, + { + "epoch": 1.519082459956496, + "step": 15364, + "train/total_loss": 0.0737922340631485 + }, + { + "entropy": 9.549626350402832, + "epoch": 1.5191813328060113, + "mean_token_accuracy": 0.8367003202438354, + "num_tokens": 42253524.0, + "step": 15365, + "train/ce_loss": 0.7490454316139221 + }, + { + "epoch": 1.5191813328060113, + "step": 15365, + "train/sim_loss": 0.036997079849243164 + }, + { + "epoch": 1.5191813328060113, + "step": 15365, + "train/total_loss": 0.11190162599086761 + }, + { + "entropy": 9.30963134765625, + "epoch": 1.519280205655527, + "mean_token_accuracy": 0.8532373905181885, + "num_tokens": 42266245.0, + "step": 15366, + "train/ce_loss": 0.6242178082466125 + }, + { + "epoch": 1.519280205655527, + "step": 15366, + "train/sim_loss": 0.029305994510650635 + }, + { + "epoch": 1.519280205655527, + "step": 15366, + "train/total_loss": 0.09172777831554413 + }, + { + "entropy": 9.53359603881836, + "epoch": 1.5193790785050425, + "mean_token_accuracy": 0.8685612678527832, + "num_tokens": 42283107.0, + "step": 15367, + "train/ce_loss": 0.7405552864074707 + }, + { + "epoch": 1.5193790785050425, + "step": 15367, + "train/sim_loss": 0.05114734172821045 + }, + { + "epoch": 1.5193790785050425, + "step": 15367, + "train/total_loss": 0.12520286440849304 + }, + { + "entropy": 9.872098922729492, + "epoch": 1.519477951354558, + "mean_token_accuracy": 0.8330097198486328, + "num_tokens": 42293219.0, + "step": 15368, + "train/ce_loss": 0.44976288080215454 + }, + { + "epoch": 1.519477951354558, + "step": 15368, + "train/sim_loss": 0.04254448413848877 + }, + { + "epoch": 1.519477951354558, + "step": 15368, + "train/total_loss": 0.0875207781791687 + }, + { + "entropy": 9.299637794494629, + "epoch": 1.5195768242040737, + "mean_token_accuracy": 0.8924484848976135, + "num_tokens": 42306627.0, + "step": 15369, + "train/ce_loss": 0.41139447689056396 + }, + { + "epoch": 1.5195768242040737, + "step": 15369, + "train/sim_loss": 0.033210933208465576 + }, + { + "epoch": 1.5195768242040737, + "step": 15369, + "train/total_loss": 0.07435038685798645 + }, + { + "entropy": 9.76349925994873, + "epoch": 1.519675697053589, + "mean_token_accuracy": 0.8569051623344421, + "num_tokens": 42316751.0, + "step": 15370, + "train/ce_loss": 0.6519121527671814 + }, + { + "epoch": 1.519675697053589, + "step": 15370, + "train/sim_loss": 0.009195029735565186 + }, + { + "epoch": 1.519675697053589, + "step": 15370, + "train/total_loss": 0.07438624650239944 + }, + { + "entropy": 9.520767211914062, + "epoch": 1.5197745699031047, + "mean_token_accuracy": 0.8997461795806885, + "num_tokens": 42329150.0, + "step": 15371, + "train/ce_loss": 0.3214116394519806 + }, + { + "epoch": 1.5197745699031047, + "step": 15371, + "train/sim_loss": 0.013680815696716309 + }, + { + "epoch": 1.5197745699031047, + "step": 15371, + "train/total_loss": 0.04582197964191437 + }, + { + "entropy": 9.366132736206055, + "epoch": 1.5198734427526202, + "mean_token_accuracy": 0.8535619974136353, + "num_tokens": 42339079.0, + "step": 15372, + "train/ce_loss": 0.8810186386108398 + }, + { + "epoch": 1.5198734427526202, + "step": 15372, + "train/sim_loss": 0.09473925828933716 + }, + { + "epoch": 1.5198734427526202, + "step": 15372, + "train/total_loss": 0.18284112215042114 + }, + { + "entropy": 9.471761703491211, + "epoch": 1.5199723156021356, + "mean_token_accuracy": 0.8342174887657166, + "num_tokens": 42351410.0, + "step": 15373, + "train/ce_loss": 0.2969237267971039 + }, + { + "epoch": 1.5199723156021356, + "step": 15373, + "train/sim_loss": 0.04863321781158447 + }, + { + "epoch": 1.5199723156021356, + "step": 15373, + "train/total_loss": 0.07832559198141098 + }, + { + "entropy": 9.325735092163086, + "epoch": 1.5200711884516511, + "mean_token_accuracy": 0.8505897521972656, + "num_tokens": 42362968.0, + "step": 15374, + "train/ce_loss": 1.6077666487035458e-06 + }, + { + "epoch": 1.5200711884516511, + "step": 15374, + "train/sim_loss": 0.04478764533996582 + }, + { + "epoch": 1.5200711884516511, + "step": 15374, + "train/total_loss": 0.044787805527448654 + }, + { + "entropy": 9.232270240783691, + "epoch": 1.5201700613011666, + "mean_token_accuracy": 0.8841269612312317, + "num_tokens": 42373645.0, + "step": 15375, + "train/ce_loss": 2.792329041767516e-06 + }, + { + "epoch": 1.5201700613011666, + "step": 15375, + "train/sim_loss": 0.03782212734222412 + }, + { + "epoch": 1.5201700613011666, + "step": 15375, + "train/total_loss": 0.037822406738996506 + }, + { + "entropy": 9.260459899902344, + "epoch": 1.5202689341506823, + "mean_token_accuracy": 0.802281379699707, + "num_tokens": 42385294.0, + "step": 15376, + "train/ce_loss": 0.7986775040626526 + }, + { + "epoch": 1.5202689341506823, + "step": 15376, + "train/sim_loss": 0.0978701114654541 + }, + { + "epoch": 1.5202689341506823, + "step": 15376, + "train/total_loss": 0.17773786187171936 + }, + { + "entropy": 9.736528396606445, + "epoch": 1.5203678070001976, + "mean_token_accuracy": 0.8423236608505249, + "num_tokens": 42405173.0, + "step": 15377, + "train/ce_loss": 0.5356343984603882 + }, + { + "epoch": 1.5203678070001976, + "step": 15377, + "train/sim_loss": 0.01913273334503174 + }, + { + "epoch": 1.5203678070001976, + "step": 15377, + "train/total_loss": 0.07269617915153503 + }, + { + "entropy": 9.291656494140625, + "epoch": 1.5204666798497133, + "mean_token_accuracy": 0.8774774670600891, + "num_tokens": 42413428.0, + "step": 15378, + "train/ce_loss": 3.110158104391303e-07 + }, + { + "epoch": 1.5204666798497133, + "step": 15378, + "train/sim_loss": 0.013059496879577637 + }, + { + "epoch": 1.5204666798497133, + "step": 15378, + "train/total_loss": 0.013059527613222599 + }, + { + "entropy": 9.546746253967285, + "epoch": 1.5205655526992288, + "mean_token_accuracy": 0.8017789125442505, + "num_tokens": 42425130.0, + "step": 15379, + "train/ce_loss": 0.4795750081539154 + }, + { + "epoch": 1.5205655526992288, + "step": 15379, + "train/sim_loss": 0.07294094562530518 + }, + { + "epoch": 1.5205655526992288, + "step": 15379, + "train/total_loss": 0.12089844793081284 + }, + { + "epoch": 1.5206644255487443, + "grad_norm": 0.6328514814376831, + "learning_rate": 6.200118676754191e-06, + "loss": 0.081, + "step": 15380 + }, + { + "entropy": 9.45956802368164, + "epoch": 1.5206644255487443, + "mean_token_accuracy": 0.8722860813140869, + "num_tokens": 42436988.0, + "step": 15380, + "train/ce_loss": 0.39833199977874756 + }, + { + "epoch": 1.5206644255487443, + "step": 15380, + "train/sim_loss": 0.045087575912475586 + }, + { + "epoch": 1.5206644255487443, + "step": 15380, + "train/total_loss": 0.08492077887058258 + }, + { + "entropy": 9.668779373168945, + "epoch": 1.52076329839826, + "mean_token_accuracy": 0.8561983704566956, + "num_tokens": 42456451.0, + "step": 15381, + "train/ce_loss": 0.331136554479599 + }, + { + "epoch": 1.52076329839826, + "step": 15381, + "train/sim_loss": 0.025841832160949707 + }, + { + "epoch": 1.52076329839826, + "step": 15381, + "train/total_loss": 0.05895548686385155 + }, + { + "entropy": 9.147344589233398, + "epoch": 1.5208621712477752, + "mean_token_accuracy": 0.8426323533058167, + "num_tokens": 42468180.0, + "step": 15382, + "train/ce_loss": 0.5442518591880798 + }, + { + "epoch": 1.5208621712477752, + "step": 15382, + "train/sim_loss": 0.04060351848602295 + }, + { + "epoch": 1.5208621712477752, + "step": 15382, + "train/total_loss": 0.09502870589494705 + }, + { + "entropy": 9.669784545898438, + "epoch": 1.520961044097291, + "mean_token_accuracy": 0.8472222089767456, + "num_tokens": 42476108.0, + "step": 15383, + "train/ce_loss": 0.5239436626434326 + }, + { + "epoch": 1.520961044097291, + "step": 15383, + "train/sim_loss": 0.06384313106536865 + }, + { + "epoch": 1.520961044097291, + "step": 15383, + "train/total_loss": 0.11623749881982803 + }, + { + "entropy": 9.6922025680542, + "epoch": 1.5210599169468064, + "mean_token_accuracy": 0.8304093480110168, + "num_tokens": 42484823.0, + "step": 15384, + "train/ce_loss": 0.5842523574829102 + }, + { + "epoch": 1.5210599169468064, + "step": 15384, + "train/sim_loss": 0.05425369739532471 + }, + { + "epoch": 1.5210599169468064, + "step": 15384, + "train/total_loss": 0.11267893016338348 + }, + { + "entropy": 8.986412048339844, + "epoch": 1.521158789796322, + "mean_token_accuracy": 0.8112359642982483, + "num_tokens": 42492147.0, + "step": 15385, + "train/ce_loss": 0.5558973550796509 + }, + { + "epoch": 1.521158789796322, + "step": 15385, + "train/sim_loss": 0.04688072204589844 + }, + { + "epoch": 1.521158789796322, + "step": 15385, + "train/total_loss": 0.10247045755386353 + }, + { + "entropy": 9.554206848144531, + "epoch": 1.5212576626458376, + "mean_token_accuracy": 0.8760445713996887, + "num_tokens": 42507661.0, + "step": 15386, + "train/ce_loss": 0.1462477594614029 + }, + { + "epoch": 1.5212576626458376, + "step": 15386, + "train/sim_loss": 0.04244434833526611 + }, + { + "epoch": 1.5212576626458376, + "step": 15386, + "train/total_loss": 0.05706912279129028 + }, + { + "entropy": 9.380088806152344, + "epoch": 1.5213565354953529, + "mean_token_accuracy": 0.8992481231689453, + "num_tokens": 42520826.0, + "step": 15387, + "train/ce_loss": 0.30309343338012695 + }, + { + "epoch": 1.5213565354953529, + "step": 15387, + "train/sim_loss": 0.02710103988647461 + }, + { + "epoch": 1.5213565354953529, + "step": 15387, + "train/total_loss": 0.057410381734371185 + }, + { + "entropy": 9.001814842224121, + "epoch": 1.5214554083448686, + "mean_token_accuracy": 0.8493626713752747, + "num_tokens": 42537728.0, + "step": 15388, + "train/ce_loss": 0.4405060112476349 + }, + { + "epoch": 1.5214554083448686, + "step": 15388, + "train/sim_loss": 0.03299260139465332 + }, + { + "epoch": 1.5214554083448686, + "step": 15388, + "train/total_loss": 0.07704320549964905 + }, + { + "entropy": 9.729874610900879, + "epoch": 1.521554281194384, + "mean_token_accuracy": 0.86012864112854, + "num_tokens": 42555428.0, + "step": 15389, + "train/ce_loss": 3.8177128658389847e-07 + }, + { + "epoch": 1.521554281194384, + "step": 15389, + "train/sim_loss": 0.025330543518066406 + }, + { + "epoch": 1.521554281194384, + "step": 15389, + "train/total_loss": 0.02533058077096939 + }, + { + "entropy": 9.37531852722168, + "epoch": 1.5216531540438996, + "mean_token_accuracy": 0.8343634009361267, + "num_tokens": 42568468.0, + "step": 15390, + "train/ce_loss": 0.31646886467933655 + }, + { + "epoch": 1.5216531540438996, + "step": 15390, + "train/sim_loss": 0.017465710639953613 + }, + { + "epoch": 1.5216531540438996, + "step": 15390, + "train/total_loss": 0.04911259934306145 + }, + { + "entropy": 9.29571533203125, + "epoch": 1.521752026893415, + "mean_token_accuracy": 0.8431183695793152, + "num_tokens": 42586621.0, + "step": 15391, + "train/ce_loss": 0.5694504976272583 + }, + { + "epoch": 1.521752026893415, + "step": 15391, + "train/sim_loss": 0.036613523960113525 + }, + { + "epoch": 1.521752026893415, + "step": 15391, + "train/total_loss": 0.09355857968330383 + }, + { + "entropy": 9.668160438537598, + "epoch": 1.5218508997429305, + "mean_token_accuracy": 0.8560460805892944, + "num_tokens": 42601040.0, + "step": 15392, + "train/ce_loss": 0.44762346148490906 + }, + { + "epoch": 1.5218508997429305, + "step": 15392, + "train/sim_loss": 0.027818799018859863 + }, + { + "epoch": 1.5218508997429305, + "step": 15392, + "train/total_loss": 0.07258114218711853 + }, + { + "entropy": 9.54342269897461, + "epoch": 1.5219497725924462, + "mean_token_accuracy": 0.8999999761581421, + "num_tokens": 42614269.0, + "step": 15393, + "train/ce_loss": 0.2751399278640747 + }, + { + "epoch": 1.5219497725924462, + "step": 15393, + "train/sim_loss": 0.07063424587249756 + }, + { + "epoch": 1.5219497725924462, + "step": 15393, + "train/total_loss": 0.09814824163913727 + }, + { + "entropy": 9.072576522827148, + "epoch": 1.5220486454419615, + "mean_token_accuracy": 0.8852691054344177, + "num_tokens": 42628238.0, + "step": 15394, + "train/ce_loss": 5.794792627966672e-07 + }, + { + "epoch": 1.5220486454419615, + "step": 15394, + "train/sim_loss": 0.029342949390411377 + }, + { + "epoch": 1.5220486454419615, + "step": 15394, + "train/total_loss": 0.029343007132411003 + }, + { + "entropy": 9.625904083251953, + "epoch": 1.5221475182914772, + "mean_token_accuracy": 0.864814817905426, + "num_tokens": 42641469.0, + "step": 15395, + "train/ce_loss": 0.5248979926109314 + }, + { + "epoch": 1.5221475182914772, + "step": 15395, + "train/sim_loss": 0.04457592964172363 + }, + { + "epoch": 1.5221475182914772, + "step": 15395, + "train/total_loss": 0.09706573188304901 + }, + { + "entropy": 9.451313972473145, + "epoch": 1.5222463911409927, + "mean_token_accuracy": 0.8509036302566528, + "num_tokens": 42659704.0, + "step": 15396, + "train/ce_loss": 0.8754187822341919 + }, + { + "epoch": 1.5222463911409927, + "step": 15396, + "train/sim_loss": 0.04530477523803711 + }, + { + "epoch": 1.5222463911409927, + "step": 15396, + "train/total_loss": 0.1328466534614563 + }, + { + "entropy": 9.882341384887695, + "epoch": 1.5223452639905082, + "mean_token_accuracy": 0.8897637724876404, + "num_tokens": 42669804.0, + "step": 15397, + "train/ce_loss": 3.1069259875948774e-07 + }, + { + "epoch": 1.5223452639905082, + "step": 15397, + "train/sim_loss": 0.010907948017120361 + }, + { + "epoch": 1.5223452639905082, + "step": 15397, + "train/total_loss": 0.010907978750765324 + }, + { + "entropy": 9.419870376586914, + "epoch": 1.5224441368400239, + "mean_token_accuracy": 0.8643678426742554, + "num_tokens": 42681461.0, + "step": 15398, + "train/ce_loss": 0.7032784819602966 + }, + { + "epoch": 1.5224441368400239, + "step": 15398, + "train/sim_loss": 0.05081605911254883 + }, + { + "epoch": 1.5224441368400239, + "step": 15398, + "train/total_loss": 0.12114390730857849 + }, + { + "entropy": 9.258503913879395, + "epoch": 1.5225430096895392, + "mean_token_accuracy": 0.9024389982223511, + "num_tokens": 42693456.0, + "step": 15399, + "train/ce_loss": 0.2551913559436798 + }, + { + "epoch": 1.5225430096895392, + "step": 15399, + "train/sim_loss": 0.02756720781326294 + }, + { + "epoch": 1.5225430096895392, + "step": 15399, + "train/total_loss": 0.05308634415268898 + }, + { + "epoch": 1.5226418825390549, + "grad_norm": 0.44243109226226807, + "learning_rate": 6.1951738119962424e-06, + "loss": 0.0782, + "step": 15400 + }, + { + "entropy": 8.453797340393066, + "epoch": 1.5226418825390549, + "mean_token_accuracy": 0.8516833782196045, + "num_tokens": 42700897.0, + "step": 15400, + "train/ce_loss": 0.46472033858299255 + }, + { + "epoch": 1.5226418825390549, + "step": 15400, + "train/sim_loss": 0.05999279022216797 + }, + { + "epoch": 1.5226418825390549, + "step": 15400, + "train/total_loss": 0.10646482557058334 + }, + { + "entropy": 8.886391639709473, + "epoch": 1.5227407553885703, + "mean_token_accuracy": 0.8655332326889038, + "num_tokens": 42713029.0, + "step": 15401, + "train/ce_loss": 2.748980136857426e-07 + }, + { + "epoch": 1.5227407553885703, + "step": 15401, + "train/sim_loss": 0.031086206436157227 + }, + { + "epoch": 1.5227407553885703, + "step": 15401, + "train/total_loss": 0.031086234375834465 + }, + { + "entropy": 9.152541160583496, + "epoch": 1.5228396282380858, + "mean_token_accuracy": 0.8461538553237915, + "num_tokens": 42720994.0, + "step": 15402, + "train/ce_loss": 0.4189847409725189 + }, + { + "epoch": 1.5228396282380858, + "step": 15402, + "train/sim_loss": 0.03255271911621094 + }, + { + "epoch": 1.5228396282380858, + "step": 15402, + "train/total_loss": 0.07445119321346283 + }, + { + "entropy": 9.892059326171875, + "epoch": 1.5229385010876013, + "mean_token_accuracy": 0.8779220581054688, + "num_tokens": 42731916.0, + "step": 15403, + "train/ce_loss": 0.6235288381576538 + }, + { + "epoch": 1.5229385010876013, + "step": 15403, + "train/sim_loss": 0.07577216625213623 + }, + { + "epoch": 1.5229385010876013, + "step": 15403, + "train/total_loss": 0.13812504708766937 + }, + { + "entropy": 9.699091911315918, + "epoch": 1.5230373739371168, + "mean_token_accuracy": 0.9007999897003174, + "num_tokens": 42743946.0, + "step": 15404, + "train/ce_loss": 1.9149635477333504e-07 + }, + { + "epoch": 1.5230373739371168, + "step": 15404, + "train/sim_loss": 0.01582920551300049 + }, + { + "epoch": 1.5230373739371168, + "step": 15404, + "train/total_loss": 0.01582922413945198 + }, + { + "entropy": 8.703510284423828, + "epoch": 1.5231362467866325, + "mean_token_accuracy": 0.8394543528556824, + "num_tokens": 42751942.0, + "step": 15405, + "train/ce_loss": 0.5845744013786316 + }, + { + "epoch": 1.5231362467866325, + "step": 15405, + "train/sim_loss": 0.06138753890991211 + }, + { + "epoch": 1.5231362467866325, + "step": 15405, + "train/total_loss": 0.11984498053789139 + }, + { + "entropy": 9.281676292419434, + "epoch": 1.5232351196361478, + "mean_token_accuracy": 0.8200992345809937, + "num_tokens": 42763384.0, + "step": 15406, + "train/ce_loss": 0.3776664733886719 + }, + { + "epoch": 1.5232351196361478, + "step": 15406, + "train/sim_loss": 0.03827035427093506 + }, + { + "epoch": 1.5232351196361478, + "step": 15406, + "train/total_loss": 0.07603700459003448 + }, + { + "entropy": 9.898880004882812, + "epoch": 1.5233339924856635, + "mean_token_accuracy": 0.8891352415084839, + "num_tokens": 42778670.0, + "step": 15407, + "train/ce_loss": 4.207957431390241e-07 + }, + { + "epoch": 1.5233339924856635, + "step": 15407, + "train/sim_loss": 0.04956752061843872 + }, + { + "epoch": 1.5233339924856635, + "step": 15407, + "train/total_loss": 0.049567561596632004 + }, + { + "entropy": 9.58751106262207, + "epoch": 1.523432865335179, + "mean_token_accuracy": 0.8604651093482971, + "num_tokens": 42786285.0, + "step": 15408, + "train/ce_loss": 1.6719077677862515e-07 + }, + { + "epoch": 1.523432865335179, + "step": 15408, + "train/sim_loss": 0.011471390724182129 + }, + { + "epoch": 1.523432865335179, + "step": 15408, + "train/total_loss": 0.011471407487988472 + }, + { + "entropy": 9.774266242980957, + "epoch": 1.5235317381846945, + "mean_token_accuracy": 0.8651102185249329, + "num_tokens": 42807470.0, + "step": 15409, + "train/ce_loss": 0.48006168007850647 + }, + { + "epoch": 1.5235317381846945, + "step": 15409, + "train/sim_loss": 0.049157023429870605 + }, + { + "epoch": 1.5235317381846945, + "step": 15409, + "train/total_loss": 0.09716319292783737 + }, + { + "entropy": 9.211522102355957, + "epoch": 1.5236306110342102, + "mean_token_accuracy": 0.8097062706947327, + "num_tokens": 42820333.0, + "step": 15410, + "train/ce_loss": 0.4986821711063385 + }, + { + "epoch": 1.5236306110342102, + "step": 15410, + "train/sim_loss": 0.07492196559906006 + }, + { + "epoch": 1.5236306110342102, + "step": 15410, + "train/total_loss": 0.12479018419981003 + }, + { + "entropy": 9.462430953979492, + "epoch": 1.5237294838837254, + "mean_token_accuracy": 0.8111110925674438, + "num_tokens": 42836126.0, + "step": 15411, + "train/ce_loss": 0.9760185480117798 + }, + { + "epoch": 1.5237294838837254, + "step": 15411, + "train/sim_loss": 0.07951533794403076 + }, + { + "epoch": 1.5237294838837254, + "step": 15411, + "train/total_loss": 0.17711719870567322 + }, + { + "entropy": 9.24152946472168, + "epoch": 1.5238283567332411, + "mean_token_accuracy": 0.859375, + "num_tokens": 42850841.0, + "step": 15412, + "train/ce_loss": 0.37360942363739014 + }, + { + "epoch": 1.5238283567332411, + "step": 15412, + "train/sim_loss": 0.03306692838668823 + }, + { + "epoch": 1.5238283567332411, + "step": 15412, + "train/total_loss": 0.07042787224054337 + }, + { + "entropy": 9.281631469726562, + "epoch": 1.5239272295827566, + "mean_token_accuracy": 0.837133526802063, + "num_tokens": 42867472.0, + "step": 15413, + "train/ce_loss": 0.5331763625144958 + }, + { + "epoch": 1.5239272295827566, + "step": 15413, + "train/sim_loss": 0.03853046894073486 + }, + { + "epoch": 1.5239272295827566, + "step": 15413, + "train/total_loss": 0.09184810519218445 + }, + { + "entropy": 9.496925354003906, + "epoch": 1.524026102432272, + "mean_token_accuracy": 0.8671755790710449, + "num_tokens": 42881507.0, + "step": 15414, + "train/ce_loss": 0.3117406368255615 + }, + { + "epoch": 1.524026102432272, + "step": 15414, + "train/sim_loss": 0.02402549982070923 + }, + { + "epoch": 1.524026102432272, + "step": 15414, + "train/total_loss": 0.05519956350326538 + }, + { + "entropy": 9.163619995117188, + "epoch": 1.5241249752817876, + "mean_token_accuracy": 0.8438913822174072, + "num_tokens": 42893189.0, + "step": 15415, + "train/ce_loss": 0.6154484152793884 + }, + { + "epoch": 1.5241249752817876, + "step": 15415, + "train/sim_loss": 0.019047319889068604 + }, + { + "epoch": 1.5241249752817876, + "step": 15415, + "train/total_loss": 0.08059216290712357 + }, + { + "entropy": 9.112007141113281, + "epoch": 1.524223848131303, + "mean_token_accuracy": 0.8292410969734192, + "num_tokens": 42904491.0, + "step": 15416, + "train/ce_loss": 0.445070743560791 + }, + { + "epoch": 1.524223848131303, + "step": 15416, + "train/sim_loss": 0.014597177505493164 + }, + { + "epoch": 1.524223848131303, + "step": 15416, + "train/total_loss": 0.059104252606630325 + }, + { + "entropy": 9.689467430114746, + "epoch": 1.5243227209808188, + "mean_token_accuracy": 0.909547746181488, + "num_tokens": 42921905.0, + "step": 15417, + "train/ce_loss": 3.4384626701466914e-07 + }, + { + "epoch": 1.5243227209808188, + "step": 15417, + "train/sim_loss": 0.05000978708267212 + }, + { + "epoch": 1.5243227209808188, + "step": 15417, + "train/total_loss": 0.050009820610284805 + }, + { + "entropy": 9.923579216003418, + "epoch": 1.524421593830334, + "mean_token_accuracy": 0.8947368264198303, + "num_tokens": 42934399.0, + "step": 15418, + "train/ce_loss": 1.2374349580568378e-06 + }, + { + "epoch": 1.524421593830334, + "step": 15418, + "train/sim_loss": 0.02928638458251953 + }, + { + "epoch": 1.524421593830334, + "step": 15418, + "train/total_loss": 0.02928650751709938 + }, + { + "entropy": 9.25045394897461, + "epoch": 1.5245204666798498, + "mean_token_accuracy": 0.8352402448654175, + "num_tokens": 42946164.0, + "step": 15419, + "train/ce_loss": 0.6466756463050842 + }, + { + "epoch": 1.5245204666798498, + "step": 15419, + "train/sim_loss": 0.04921305179595947 + }, + { + "epoch": 1.5245204666798498, + "step": 15419, + "train/total_loss": 0.11388061940670013 + }, + { + "epoch": 1.5246193395293652, + "grad_norm": 0.5515692234039307, + "learning_rate": 6.190228947238294e-06, + "loss": 0.0828, + "step": 15420 + }, + { + "entropy": 9.480977058410645, + "epoch": 1.5246193395293652, + "mean_token_accuracy": 0.7558257579803467, + "num_tokens": 42960871.0, + "step": 15420, + "train/ce_loss": 0.7994852066040039 + }, + { + "epoch": 1.5246193395293652, + "step": 15420, + "train/sim_loss": 0.07584011554718018 + }, + { + "epoch": 1.5246193395293652, + "step": 15420, + "train/total_loss": 0.1557886302471161 + }, + { + "entropy": 9.558106422424316, + "epoch": 1.5247182123788807, + "mean_token_accuracy": 0.8894230723381042, + "num_tokens": 42977293.0, + "step": 15421, + "train/ce_loss": 0.5545672178268433 + }, + { + "epoch": 1.5247182123788807, + "step": 15421, + "train/sim_loss": 0.018375039100646973 + }, + { + "epoch": 1.5247182123788807, + "step": 15421, + "train/total_loss": 0.07383176684379578 + }, + { + "entropy": 9.487804412841797, + "epoch": 1.5248170852283964, + "mean_token_accuracy": 0.8877005577087402, + "num_tokens": 42988582.0, + "step": 15422, + "train/ce_loss": 0.359647661447525 + }, + { + "epoch": 1.5248170852283964, + "step": 15422, + "train/sim_loss": 0.06165313720703125 + }, + { + "epoch": 1.5248170852283964, + "step": 15422, + "train/total_loss": 0.09761790931224823 + }, + { + "entropy": 9.335411071777344, + "epoch": 1.5249159580779117, + "mean_token_accuracy": 0.8790760636329651, + "num_tokens": 43002392.0, + "step": 15423, + "train/ce_loss": 0.22502227127552032 + }, + { + "epoch": 1.5249159580779117, + "step": 15423, + "train/sim_loss": 0.05163681507110596 + }, + { + "epoch": 1.5249159580779117, + "step": 15423, + "train/total_loss": 0.07413904368877411 + }, + { + "entropy": 9.612969398498535, + "epoch": 1.5250148309274274, + "mean_token_accuracy": 0.8234567642211914, + "num_tokens": 43014552.0, + "step": 15424, + "train/ce_loss": 0.3399457335472107 + }, + { + "epoch": 1.5250148309274274, + "step": 15424, + "train/sim_loss": 0.028078317642211914 + }, + { + "epoch": 1.5250148309274274, + "step": 15424, + "train/total_loss": 0.06207289174199104 + }, + { + "entropy": 9.539220809936523, + "epoch": 1.5251137037769429, + "mean_token_accuracy": 0.9015918970108032, + "num_tokens": 43027472.0, + "step": 15425, + "train/ce_loss": 0.45998409390449524 + }, + { + "epoch": 1.5251137037769429, + "step": 15425, + "train/sim_loss": 0.03979909420013428 + }, + { + "epoch": 1.5251137037769429, + "step": 15425, + "train/total_loss": 0.0857975035905838 + }, + { + "entropy": 9.520830154418945, + "epoch": 1.5252125766264584, + "mean_token_accuracy": 0.8292220234870911, + "num_tokens": 43041629.0, + "step": 15426, + "train/ce_loss": 0.6974421143531799 + }, + { + "epoch": 1.5252125766264584, + "step": 15426, + "train/sim_loss": 0.0284879207611084 + }, + { + "epoch": 1.5252125766264584, + "step": 15426, + "train/total_loss": 0.09823213517665863 + }, + { + "entropy": 9.585643768310547, + "epoch": 1.5253114494759739, + "mean_token_accuracy": 0.846045196056366, + "num_tokens": 43055721.0, + "step": 15427, + "train/ce_loss": 8.640498663226026e-07 + }, + { + "epoch": 1.5253114494759739, + "step": 15427, + "train/sim_loss": 0.01646256446838379 + }, + { + "epoch": 1.5253114494759739, + "step": 15427, + "train/total_loss": 0.016462650150060654 + }, + { + "entropy": 9.120428085327148, + "epoch": 1.5254103223254893, + "mean_token_accuracy": 0.8518057465553284, + "num_tokens": 43069726.0, + "step": 15428, + "train/ce_loss": 0.14246173202991486 + }, + { + "epoch": 1.5254103223254893, + "step": 15428, + "train/sim_loss": 0.02119767665863037 + }, + { + "epoch": 1.5254103223254893, + "step": 15428, + "train/total_loss": 0.03544384986162186 + }, + { + "entropy": 9.583549499511719, + "epoch": 1.525509195175005, + "mean_token_accuracy": 0.8277310729026794, + "num_tokens": 43082542.0, + "step": 15429, + "train/ce_loss": 0.6229826807975769 + }, + { + "epoch": 1.525509195175005, + "step": 15429, + "train/sim_loss": 0.05930978059768677 + }, + { + "epoch": 1.525509195175005, + "step": 15429, + "train/total_loss": 0.12160804867744446 + }, + { + "entropy": 9.497201919555664, + "epoch": 1.5256080680245203, + "mean_token_accuracy": 0.8601503968238831, + "num_tokens": 43092178.0, + "step": 15430, + "train/ce_loss": 0.48197075724601746 + }, + { + "epoch": 1.5256080680245203, + "step": 15430, + "train/sim_loss": 0.03446483612060547 + }, + { + "epoch": 1.5256080680245203, + "step": 15430, + "train/total_loss": 0.08266191184520721 + }, + { + "entropy": 9.631439208984375, + "epoch": 1.525706940874036, + "mean_token_accuracy": 0.9220296740531921, + "num_tokens": 43103116.0, + "step": 15431, + "train/ce_loss": 1.5697735022968118e-07 + }, + { + "epoch": 1.525706940874036, + "step": 15431, + "train/sim_loss": 0.01393425464630127 + }, + { + "epoch": 1.525706940874036, + "step": 15431, + "train/total_loss": 0.013934270478785038 + }, + { + "entropy": 9.911876678466797, + "epoch": 1.5258058137235515, + "mean_token_accuracy": 0.8931451439857483, + "num_tokens": 43118035.0, + "step": 15432, + "train/ce_loss": 3.239770478558057e-07 + }, + { + "epoch": 1.5258058137235515, + "step": 15432, + "train/sim_loss": 0.01800549030303955 + }, + { + "epoch": 1.5258058137235515, + "step": 15432, + "train/total_loss": 0.018005521968007088 + }, + { + "entropy": 9.420243263244629, + "epoch": 1.525904686573067, + "mean_token_accuracy": 0.7888493537902832, + "num_tokens": 43129134.0, + "step": 15433, + "train/ce_loss": 0.7224249839782715 + }, + { + "epoch": 1.525904686573067, + "step": 15433, + "train/sim_loss": 0.07063192129135132 + }, + { + "epoch": 1.525904686573067, + "step": 15433, + "train/total_loss": 0.14287441968917847 + }, + { + "entropy": 9.293094635009766, + "epoch": 1.5260035594225827, + "mean_token_accuracy": 0.8583569526672363, + "num_tokens": 43145866.0, + "step": 15434, + "train/ce_loss": 0.327631413936615 + }, + { + "epoch": 1.5260035594225827, + "step": 15434, + "train/sim_loss": 0.022934019565582275 + }, + { + "epoch": 1.5260035594225827, + "step": 15434, + "train/total_loss": 0.055697161704301834 + }, + { + "entropy": 9.53591251373291, + "epoch": 1.526102432272098, + "mean_token_accuracy": 0.9152941107749939, + "num_tokens": 43159537.0, + "step": 15435, + "train/ce_loss": 7.5703542279370595e-06 + }, + { + "epoch": 1.526102432272098, + "step": 15435, + "train/sim_loss": 0.01925373077392578 + }, + { + "epoch": 1.526102432272098, + "step": 15435, + "train/total_loss": 0.01925448700785637 + }, + { + "entropy": 9.652667045593262, + "epoch": 1.5262013051216137, + "mean_token_accuracy": 0.837837815284729, + "num_tokens": 43174092.0, + "step": 15436, + "train/ce_loss": 0.5436776876449585 + }, + { + "epoch": 1.5262013051216137, + "step": 15436, + "train/sim_loss": 0.07770133018493652 + }, + { + "epoch": 1.5262013051216137, + "step": 15436, + "train/total_loss": 0.13206909596920013 + }, + { + "entropy": 9.152897834777832, + "epoch": 1.5263001779711292, + "mean_token_accuracy": 0.8109540343284607, + "num_tokens": 43189628.0, + "step": 15437, + "train/ce_loss": 3.201348306447471e-07 + }, + { + "epoch": 1.5263001779711292, + "step": 15437, + "train/sim_loss": 0.025294184684753418 + }, + { + "epoch": 1.5263001779711292, + "step": 15437, + "train/total_loss": 0.025294216349720955 + }, + { + "entropy": 10.259848594665527, + "epoch": 1.5263990508206446, + "mean_token_accuracy": 0.8242990374565125, + "num_tokens": 43197512.0, + "step": 15438, + "train/ce_loss": 2.3618868283392658e-07 + }, + { + "epoch": 1.5263990508206446, + "step": 15438, + "train/sim_loss": 0.010546565055847168 + }, + { + "epoch": 1.5263990508206446, + "step": 15438, + "train/total_loss": 0.010546588338911533 + }, + { + "entropy": 9.335585594177246, + "epoch": 1.5264979236701601, + "mean_token_accuracy": 0.8538102507591248, + "num_tokens": 43210564.0, + "step": 15439, + "train/ce_loss": 0.7365971207618713 + }, + { + "epoch": 1.5264979236701601, + "step": 15439, + "train/sim_loss": 0.043473005294799805 + }, + { + "epoch": 1.5264979236701601, + "step": 15439, + "train/total_loss": 0.11713271588087082 + }, + { + "epoch": 1.5265967965196756, + "grad_norm": 0.6665899753570557, + "learning_rate": 6.185284082480345e-06, + "loss": 0.0875, + "step": 15440 + }, + { + "entropy": 9.133319854736328, + "epoch": 1.5265967965196756, + "mean_token_accuracy": 0.8266360759735107, + "num_tokens": 43221665.0, + "step": 15440, + "train/ce_loss": 0.22696584463119507 + }, + { + "epoch": 1.5265967965196756, + "step": 15440, + "train/sim_loss": 0.0641632080078125 + }, + { + "epoch": 1.5265967965196756, + "step": 15440, + "train/total_loss": 0.086859792470932 + }, + { + "entropy": 8.631453514099121, + "epoch": 1.5266956693691913, + "mean_token_accuracy": 0.8127053380012512, + "num_tokens": 43230695.0, + "step": 15441, + "train/ce_loss": 0.5891202688217163 + }, + { + "epoch": 1.5266956693691913, + "step": 15441, + "train/sim_loss": 0.03140532970428467 + }, + { + "epoch": 1.5266956693691913, + "step": 15441, + "train/total_loss": 0.09031735360622406 + }, + { + "entropy": 9.20678424835205, + "epoch": 1.5267945422187066, + "mean_token_accuracy": 0.8625146746635437, + "num_tokens": 43241786.0, + "step": 15442, + "train/ce_loss": 0.329886257648468 + }, + { + "epoch": 1.5267945422187066, + "step": 15442, + "train/sim_loss": 0.034504055976867676 + }, + { + "epoch": 1.5267945422187066, + "step": 15442, + "train/total_loss": 0.06749267876148224 + }, + { + "entropy": 9.419666290283203, + "epoch": 1.5268934150682223, + "mean_token_accuracy": 0.7671517729759216, + "num_tokens": 43253888.0, + "step": 15443, + "train/ce_loss": 1.1529024839401245 + }, + { + "epoch": 1.5268934150682223, + "step": 15443, + "train/sim_loss": 0.08563238382339478 + }, + { + "epoch": 1.5268934150682223, + "step": 15443, + "train/total_loss": 0.2009226381778717 + }, + { + "entropy": 9.526002883911133, + "epoch": 1.5269922879177378, + "mean_token_accuracy": 0.8739902973175049, + "num_tokens": 43266381.0, + "step": 15444, + "train/ce_loss": 2.3006164155958686e-06 + }, + { + "epoch": 1.5269922879177378, + "step": 15444, + "train/sim_loss": 0.04616957902908325 + }, + { + "epoch": 1.5269922879177378, + "step": 15444, + "train/total_loss": 0.04616980999708176 + }, + { + "entropy": 9.525718688964844, + "epoch": 1.5270911607672533, + "mean_token_accuracy": 0.8648325204849243, + "num_tokens": 43283249.0, + "step": 15445, + "train/ce_loss": 0.40163615345954895 + }, + { + "epoch": 1.5270911607672533, + "step": 15445, + "train/sim_loss": 0.04622840881347656 + }, + { + "epoch": 1.5270911607672533, + "step": 15445, + "train/total_loss": 0.08639203011989594 + }, + { + "entropy": 9.206613540649414, + "epoch": 1.527190033616769, + "mean_token_accuracy": 0.8617886304855347, + "num_tokens": 43297967.0, + "step": 15446, + "train/ce_loss": 0.16242621839046478 + }, + { + "epoch": 1.527190033616769, + "step": 15446, + "train/sim_loss": 0.04044008255004883 + }, + { + "epoch": 1.527190033616769, + "step": 15446, + "train/total_loss": 0.056682705879211426 + }, + { + "entropy": 9.606590270996094, + "epoch": 1.5272889064662842, + "mean_token_accuracy": 0.9055876731872559, + "num_tokens": 43305049.0, + "step": 15447, + "train/ce_loss": 3.1605063099959807e-07 + }, + { + "epoch": 1.5272889064662842, + "step": 15447, + "train/sim_loss": 0.01144421100616455 + }, + { + "epoch": 1.5272889064662842, + "step": 15447, + "train/total_loss": 0.011444242671132088 + }, + { + "entropy": 9.835893630981445, + "epoch": 1.5273877793158, + "mean_token_accuracy": 0.8991150259971619, + "num_tokens": 43321099.0, + "step": 15448, + "train/ce_loss": 0.36673086881637573 + }, + { + "epoch": 1.5273877793158, + "step": 15448, + "train/sim_loss": 0.04165083169937134 + }, + { + "epoch": 1.5273877793158, + "step": 15448, + "train/total_loss": 0.07832391560077667 + }, + { + "entropy": 9.027153015136719, + "epoch": 1.5274866521653154, + "mean_token_accuracy": 0.8185792565345764, + "num_tokens": 43332823.0, + "step": 15449, + "train/ce_loss": 0.7536535859107971 + }, + { + "epoch": 1.5274866521653154, + "step": 15449, + "train/sim_loss": 0.0342099666595459 + }, + { + "epoch": 1.5274866521653154, + "step": 15449, + "train/total_loss": 0.10957532376050949 + }, + { + "entropy": 9.302239418029785, + "epoch": 1.527585525014831, + "mean_token_accuracy": 0.8461538553237915, + "num_tokens": 43347651.0, + "step": 15450, + "train/ce_loss": 0.553152322769165 + }, + { + "epoch": 1.527585525014831, + "step": 15450, + "train/sim_loss": 0.04526907205581665 + }, + { + "epoch": 1.527585525014831, + "step": 15450, + "train/total_loss": 0.10058430582284927 + }, + { + "entropy": 9.16146183013916, + "epoch": 1.5276843978643464, + "mean_token_accuracy": 0.7979669570922852, + "num_tokens": 43354466.0, + "step": 15451, + "train/ce_loss": 0.6868612766265869 + }, + { + "epoch": 1.5276843978643464, + "step": 15451, + "train/sim_loss": 0.025181472301483154 + }, + { + "epoch": 1.5276843978643464, + "step": 15451, + "train/total_loss": 0.09386759996414185 + }, + { + "entropy": 9.82723331451416, + "epoch": 1.5277832707138619, + "mean_token_accuracy": 0.9117646813392639, + "num_tokens": 43361268.0, + "step": 15452, + "train/ce_loss": 6.374712029355578e-06 + }, + { + "epoch": 1.5277832707138619, + "step": 15452, + "train/sim_loss": 0.02392113208770752 + }, + { + "epoch": 1.5277832707138619, + "step": 15452, + "train/total_loss": 0.023921769112348557 + }, + { + "entropy": 9.963419914245605, + "epoch": 1.5278821435633776, + "mean_token_accuracy": 0.8714788556098938, + "num_tokens": 43379833.0, + "step": 15453, + "train/ce_loss": 0.5890187621116638 + }, + { + "epoch": 1.5278821435633776, + "step": 15453, + "train/sim_loss": 0.06644654273986816 + }, + { + "epoch": 1.5278821435633776, + "step": 15453, + "train/total_loss": 0.12534841895103455 + }, + { + "entropy": 9.991231918334961, + "epoch": 1.5279810164128929, + "mean_token_accuracy": 0.881600022315979, + "num_tokens": 43400908.0, + "step": 15454, + "train/ce_loss": 0.42674243450164795 + }, + { + "epoch": 1.5279810164128929, + "step": 15454, + "train/sim_loss": 0.04471468925476074 + }, + { + "epoch": 1.5279810164128929, + "step": 15454, + "train/total_loss": 0.08738893270492554 + }, + { + "entropy": 9.392107009887695, + "epoch": 1.5280798892624086, + "mean_token_accuracy": 0.9012776017189026, + "num_tokens": 43412964.0, + "step": 15455, + "train/ce_loss": 0.13004590570926666 + }, + { + "epoch": 1.5280798892624086, + "step": 15455, + "train/sim_loss": 0.017487049102783203 + }, + { + "epoch": 1.5280798892624086, + "step": 15455, + "train/total_loss": 0.03049163892865181 + }, + { + "entropy": 9.408182144165039, + "epoch": 1.528178762111924, + "mean_token_accuracy": 0.8940852880477905, + "num_tokens": 43425194.0, + "step": 15456, + "train/ce_loss": 0.3337799310684204 + }, + { + "epoch": 1.528178762111924, + "step": 15456, + "train/sim_loss": 0.05792957544326782 + }, + { + "epoch": 1.528178762111924, + "step": 15456, + "train/total_loss": 0.09130756556987762 + }, + { + "entropy": 8.7642240524292, + "epoch": 1.5282776349614395, + "mean_token_accuracy": 0.8682252764701843, + "num_tokens": 43434445.0, + "step": 15457, + "train/ce_loss": 0.29937809705734253 + }, + { + "epoch": 1.5282776349614395, + "step": 15457, + "train/sim_loss": 0.0259588360786438 + }, + { + "epoch": 1.5282776349614395, + "step": 15457, + "train/total_loss": 0.05589664727449417 + }, + { + "entropy": 9.263066291809082, + "epoch": 1.5283765078109552, + "mean_token_accuracy": 0.859649121761322, + "num_tokens": 43448305.0, + "step": 15458, + "train/ce_loss": 0.4699074625968933 + }, + { + "epoch": 1.5283765078109552, + "step": 15458, + "train/sim_loss": 0.0895203948020935 + }, + { + "epoch": 1.5283765078109552, + "step": 15458, + "train/total_loss": 0.13651114702224731 + }, + { + "entropy": 9.086026191711426, + "epoch": 1.5284753806604705, + "mean_token_accuracy": 0.8442822098731995, + "num_tokens": 43461846.0, + "step": 15459, + "train/ce_loss": 0.5205419659614563 + }, + { + "epoch": 1.5284753806604705, + "step": 15459, + "train/sim_loss": 0.03618204593658447 + }, + { + "epoch": 1.5284753806604705, + "step": 15459, + "train/total_loss": 0.0882362425327301 + }, + { + "epoch": 1.5285742535099862, + "grad_norm": 0.5812387466430664, + "learning_rate": 6.180339217722396e-06, + "loss": 0.0837, + "step": 15460 + }, + { + "entropy": 9.586475372314453, + "epoch": 1.5285742535099862, + "mean_token_accuracy": 0.8464285731315613, + "num_tokens": 43479700.0, + "step": 15460, + "train/ce_loss": 0.4577283561229706 + }, + { + "epoch": 1.5285742535099862, + "step": 15460, + "train/sim_loss": 0.05077773332595825 + }, + { + "epoch": 1.5285742535099862, + "step": 15460, + "train/total_loss": 0.09655056893825531 + }, + { + "entropy": 8.972555160522461, + "epoch": 1.5286731263595017, + "mean_token_accuracy": 0.8254335522651672, + "num_tokens": 43488291.0, + "step": 15461, + "train/ce_loss": 0.46334323287010193 + }, + { + "epoch": 1.5286731263595017, + "step": 15461, + "train/sim_loss": 0.05957663059234619 + }, + { + "epoch": 1.5286731263595017, + "step": 15461, + "train/total_loss": 0.10591095685958862 + }, + { + "entropy": 9.731525421142578, + "epoch": 1.5287719992090172, + "mean_token_accuracy": 0.8698011040687561, + "num_tokens": 43496993.0, + "step": 15462, + "train/ce_loss": 0.4872763156890869 + }, + { + "epoch": 1.5287719992090172, + "step": 15462, + "train/sim_loss": 0.0672142505645752 + }, + { + "epoch": 1.5287719992090172, + "step": 15462, + "train/total_loss": 0.11594188213348389 + }, + { + "entropy": 9.497172355651855, + "epoch": 1.528870872058533, + "mean_token_accuracy": 0.8964496850967407, + "num_tokens": 43509053.0, + "step": 15463, + "train/ce_loss": 1.8365529967923067e-06 + }, + { + "epoch": 1.528870872058533, + "step": 15463, + "train/sim_loss": 0.050855398178100586 + }, + { + "epoch": 1.528870872058533, + "step": 15463, + "train/total_loss": 0.05085558071732521 + }, + { + "entropy": 9.52902889251709, + "epoch": 1.5289697449080482, + "mean_token_accuracy": 0.8284574747085571, + "num_tokens": 43527128.0, + "step": 15464, + "train/ce_loss": 0.5225924849510193 + }, + { + "epoch": 1.5289697449080482, + "step": 15464, + "train/sim_loss": 0.03661990165710449 + }, + { + "epoch": 1.5289697449080482, + "step": 15464, + "train/total_loss": 0.08887915313243866 + }, + { + "entropy": 9.646438598632812, + "epoch": 1.5290686177575639, + "mean_token_accuracy": 0.8158205151557922, + "num_tokens": 43544511.0, + "step": 15465, + "train/ce_loss": 0.3571479022502899 + }, + { + "epoch": 1.5290686177575639, + "step": 15465, + "train/sim_loss": 0.024982571601867676 + }, + { + "epoch": 1.5290686177575639, + "step": 15465, + "train/total_loss": 0.06069736182689667 + }, + { + "entropy": 9.39267349243164, + "epoch": 1.5291674906070793, + "mean_token_accuracy": 0.8340306878089905, + "num_tokens": 43560728.0, + "step": 15466, + "train/ce_loss": 0.3098009526729584 + }, + { + "epoch": 1.5291674906070793, + "step": 15466, + "train/sim_loss": 0.024459540843963623 + }, + { + "epoch": 1.5291674906070793, + "step": 15466, + "train/total_loss": 0.05543963611125946 + }, + { + "entropy": 9.360600471496582, + "epoch": 1.5292663634565948, + "mean_token_accuracy": 0.8711985945701599, + "num_tokens": 43572109.0, + "step": 15467, + "train/ce_loss": 0.5029110312461853 + }, + { + "epoch": 1.5292663634565948, + "step": 15467, + "train/sim_loss": 0.06836307048797607 + }, + { + "epoch": 1.5292663634565948, + "step": 15467, + "train/total_loss": 0.11865417659282684 + }, + { + "entropy": 9.133516311645508, + "epoch": 1.5293652363061103, + "mean_token_accuracy": 0.8978829383850098, + "num_tokens": 43583961.0, + "step": 15468, + "train/ce_loss": 0.46999046206474304 + }, + { + "epoch": 1.5293652363061103, + "step": 15468, + "train/sim_loss": 0.0497894287109375 + }, + { + "epoch": 1.5293652363061103, + "step": 15468, + "train/total_loss": 0.09678848087787628 + }, + { + "entropy": 9.333590507507324, + "epoch": 1.5294641091556258, + "mean_token_accuracy": 0.7966480255126953, + "num_tokens": 43594674.0, + "step": 15469, + "train/ce_loss": 0.7847602963447571 + }, + { + "epoch": 1.5294641091556258, + "step": 15469, + "train/sim_loss": 0.0547829270362854 + }, + { + "epoch": 1.5294641091556258, + "step": 15469, + "train/total_loss": 0.13325896859169006 + }, + { + "entropy": 9.232897758483887, + "epoch": 1.5295629820051415, + "mean_token_accuracy": 0.8231440782546997, + "num_tokens": 43604038.0, + "step": 15470, + "train/ce_loss": 0.7842482328414917 + }, + { + "epoch": 1.5295629820051415, + "step": 15470, + "train/sim_loss": 0.03994953632354736 + }, + { + "epoch": 1.5295629820051415, + "step": 15470, + "train/total_loss": 0.11837436258792877 + }, + { + "entropy": 9.584918975830078, + "epoch": 1.5296618548546568, + "mean_token_accuracy": 0.8754098415374756, + "num_tokens": 43617115.0, + "step": 15471, + "train/ce_loss": 2.8297333187765616e-07 + }, + { + "epoch": 1.5296618548546568, + "step": 15471, + "train/sim_loss": 0.049033403396606445 + }, + { + "epoch": 1.5296618548546568, + "step": 15471, + "train/total_loss": 0.04903343319892883 + }, + { + "entropy": 9.93316650390625, + "epoch": 1.5297607277041725, + "mean_token_accuracy": 0.9006928205490112, + "num_tokens": 43624427.0, + "step": 15472, + "train/ce_loss": 0.6665570735931396 + }, + { + "epoch": 1.5297607277041725, + "step": 15472, + "train/sim_loss": 0.07689666748046875 + }, + { + "epoch": 1.5297607277041725, + "step": 15472, + "train/total_loss": 0.14355237782001495 + }, + { + "entropy": 10.166695594787598, + "epoch": 1.529859600553688, + "mean_token_accuracy": 0.9277777671813965, + "num_tokens": 43637475.0, + "step": 15473, + "train/ce_loss": 1.2265164741620538e-06 + }, + { + "epoch": 1.529859600553688, + "step": 15473, + "train/sim_loss": 0.03214830160140991 + }, + { + "epoch": 1.529859600553688, + "step": 15473, + "train/total_loss": 0.03214842453598976 + }, + { + "entropy": 9.387628555297852, + "epoch": 1.5299584734032035, + "mean_token_accuracy": 0.8391608595848083, + "num_tokens": 43649078.0, + "step": 15474, + "train/ce_loss": 0.4269368350505829 + }, + { + "epoch": 1.5299584734032035, + "step": 15474, + "train/sim_loss": 0.12827730178833008 + }, + { + "epoch": 1.5299584734032035, + "step": 15474, + "train/total_loss": 0.17097099125385284 + }, + { + "entropy": 9.522991180419922, + "epoch": 1.5300573462527192, + "mean_token_accuracy": 0.8576388955116272, + "num_tokens": 43669367.0, + "step": 15475, + "train/ce_loss": 0.4057595431804657 + }, + { + "epoch": 1.5300573462527192, + "step": 15475, + "train/sim_loss": 0.05894911289215088 + }, + { + "epoch": 1.5300573462527192, + "step": 15475, + "train/total_loss": 0.09952506422996521 + }, + { + "entropy": 9.490129470825195, + "epoch": 1.5301562191022344, + "mean_token_accuracy": 0.8693069219589233, + "num_tokens": 43689296.0, + "step": 15476, + "train/ce_loss": 4.787182774634857e-07 + }, + { + "epoch": 1.5301562191022344, + "step": 15476, + "train/sim_loss": 0.027346014976501465 + }, + { + "epoch": 1.5301562191022344, + "step": 15476, + "train/total_loss": 0.027346063405275345 + }, + { + "entropy": 9.447914123535156, + "epoch": 1.5302550919517501, + "mean_token_accuracy": 0.8303834795951843, + "num_tokens": 43704823.0, + "step": 15477, + "train/ce_loss": 0.574324369430542 + }, + { + "epoch": 1.5302550919517501, + "step": 15477, + "train/sim_loss": 0.018462300300598145 + }, + { + "epoch": 1.5302550919517501, + "step": 15477, + "train/total_loss": 0.07589474320411682 + }, + { + "entropy": 9.00955581665039, + "epoch": 1.5303539648012656, + "mean_token_accuracy": 0.8652729392051697, + "num_tokens": 43715788.0, + "step": 15478, + "train/ce_loss": 0.4806424379348755 + }, + { + "epoch": 1.5303539648012656, + "step": 15478, + "train/sim_loss": 0.039006590843200684 + }, + { + "epoch": 1.5303539648012656, + "step": 15478, + "train/total_loss": 0.08707083761692047 + }, + { + "entropy": 9.987934112548828, + "epoch": 1.530452837650781, + "mean_token_accuracy": 0.9137930870056152, + "num_tokens": 43723303.0, + "step": 15479, + "train/ce_loss": 3.247403981276875e-07 + }, + { + "epoch": 1.530452837650781, + "step": 15479, + "train/sim_loss": 0.013158440589904785 + }, + { + "epoch": 1.530452837650781, + "step": 15479, + "train/total_loss": 0.013158473186194897 + }, + { + "epoch": 1.5305517105002966, + "grad_norm": 0.630560040473938, + "learning_rate": 6.175394352964447e-06, + "loss": 0.0838, + "step": 15480 + }, + { + "entropy": 9.463879585266113, + "epoch": 1.5305517105002966, + "mean_token_accuracy": 0.8546895384788513, + "num_tokens": 43740097.0, + "step": 15480, + "train/ce_loss": 0.6382138133049011 + }, + { + "epoch": 1.5305517105002966, + "step": 15480, + "train/sim_loss": 0.026807188987731934 + }, + { + "epoch": 1.5305517105002966, + "step": 15480, + "train/total_loss": 0.09062857180833817 + }, + { + "entropy": 9.131683349609375, + "epoch": 1.530650583349812, + "mean_token_accuracy": 0.8557047247886658, + "num_tokens": 43747694.0, + "step": 15481, + "train/ce_loss": 0.5497118234634399 + }, + { + "epoch": 1.530650583349812, + "step": 15481, + "train/sim_loss": 0.041198551654815674 + }, + { + "epoch": 1.530650583349812, + "step": 15481, + "train/total_loss": 0.09616973996162415 + }, + { + "entropy": 9.47244644165039, + "epoch": 1.5307494561993278, + "mean_token_accuracy": 0.9095414876937866, + "num_tokens": 43768426.0, + "step": 15482, + "train/ce_loss": 0.30656564235687256 + }, + { + "epoch": 1.5307494561993278, + "step": 15482, + "train/sim_loss": 0.029695510864257812 + }, + { + "epoch": 1.5307494561993278, + "step": 15482, + "train/total_loss": 0.06035207584500313 + }, + { + "entropy": 9.163944244384766, + "epoch": 1.530848329048843, + "mean_token_accuracy": 0.9028475880622864, + "num_tokens": 43782393.0, + "step": 15483, + "train/ce_loss": 0.29347261786460876 + }, + { + "epoch": 1.530848329048843, + "step": 15483, + "train/sim_loss": 0.012752890586853027 + }, + { + "epoch": 1.530848329048843, + "step": 15483, + "train/total_loss": 0.04210015386343002 + }, + { + "entropy": 9.689958572387695, + "epoch": 1.5309472018983588, + "mean_token_accuracy": 0.8231611847877502, + "num_tokens": 43797228.0, + "step": 15484, + "train/ce_loss": 0.940960168838501 + }, + { + "epoch": 1.5309472018983588, + "step": 15484, + "train/sim_loss": 0.021427631378173828 + }, + { + "epoch": 1.5309472018983588, + "step": 15484, + "train/total_loss": 0.11552365124225616 + }, + { + "entropy": 9.362424850463867, + "epoch": 1.5310460747478742, + "mean_token_accuracy": 0.854604184627533, + "num_tokens": 43807340.0, + "step": 15485, + "train/ce_loss": 0.6699473261833191 + }, + { + "epoch": 1.5310460747478742, + "step": 15485, + "train/sim_loss": 0.03405416011810303 + }, + { + "epoch": 1.5310460747478742, + "step": 15485, + "train/total_loss": 0.10104889422655106 + }, + { + "entropy": 9.331134796142578, + "epoch": 1.5311449475973897, + "mean_token_accuracy": 0.849056601524353, + "num_tokens": 43819539.0, + "step": 15486, + "train/ce_loss": 0.6574016809463501 + }, + { + "epoch": 1.5311449475973897, + "step": 15486, + "train/sim_loss": 0.08303689956665039 + }, + { + "epoch": 1.5311449475973897, + "step": 15486, + "train/total_loss": 0.1487770676612854 + }, + { + "entropy": 9.65963363647461, + "epoch": 1.5312438204469054, + "mean_token_accuracy": 0.8035044074058533, + "num_tokens": 43836862.0, + "step": 15487, + "train/ce_loss": 0.7670727372169495 + }, + { + "epoch": 1.5312438204469054, + "step": 15487, + "train/sim_loss": 0.018929481506347656 + }, + { + "epoch": 1.5312438204469054, + "step": 15487, + "train/total_loss": 0.0956367552280426 + }, + { + "entropy": 9.982677459716797, + "epoch": 1.5313426932964207, + "mean_token_accuracy": 0.8790435791015625, + "num_tokens": 43852799.0, + "step": 15488, + "train/ce_loss": 0.741909384727478 + }, + { + "epoch": 1.5313426932964207, + "step": 15488, + "train/sim_loss": 0.07829689979553223 + }, + { + "epoch": 1.5313426932964207, + "step": 15488, + "train/total_loss": 0.1524878442287445 + }, + { + "entropy": 9.097314834594727, + "epoch": 1.5314415661459364, + "mean_token_accuracy": 0.8865710496902466, + "num_tokens": 43865240.0, + "step": 15489, + "train/ce_loss": 1.1363423482180224e-06 + }, + { + "epoch": 1.5314415661459364, + "step": 15489, + "train/sim_loss": 0.03184223175048828 + }, + { + "epoch": 1.5314415661459364, + "step": 15489, + "train/total_loss": 0.031842347234487534 + }, + { + "entropy": 9.695588111877441, + "epoch": 1.531540438995452, + "mean_token_accuracy": 0.8604651093482971, + "num_tokens": 43881116.0, + "step": 15490, + "train/ce_loss": 0.309286504983902 + }, + { + "epoch": 1.531540438995452, + "step": 15490, + "train/sim_loss": 0.029981374740600586 + }, + { + "epoch": 1.531540438995452, + "step": 15490, + "train/total_loss": 0.060910023748874664 + }, + { + "entropy": 9.097681045532227, + "epoch": 1.5316393118449674, + "mean_token_accuracy": 0.8502475023269653, + "num_tokens": 43897451.0, + "step": 15491, + "train/ce_loss": 0.6114529371261597 + }, + { + "epoch": 1.5316393118449674, + "step": 15491, + "train/sim_loss": 0.06259119510650635 + }, + { + "epoch": 1.5316393118449674, + "step": 15491, + "train/total_loss": 0.12373648583889008 + }, + { + "entropy": 9.385726928710938, + "epoch": 1.5317381846944829, + "mean_token_accuracy": 0.8398532867431641, + "num_tokens": 43913758.0, + "step": 15492, + "train/ce_loss": 0.5310359001159668 + }, + { + "epoch": 1.5317381846944829, + "step": 15492, + "train/sim_loss": 0.08433020114898682 + }, + { + "epoch": 1.5317381846944829, + "step": 15492, + "train/total_loss": 0.13743379712104797 + }, + { + "entropy": 9.383230209350586, + "epoch": 1.5318370575439983, + "mean_token_accuracy": 0.8437067866325378, + "num_tokens": 43924500.0, + "step": 15493, + "train/ce_loss": 0.5164880156517029 + }, + { + "epoch": 1.5318370575439983, + "step": 15493, + "train/sim_loss": 0.08721184730529785 + }, + { + "epoch": 1.5318370575439983, + "step": 15493, + "train/total_loss": 0.13886064291000366 + }, + { + "entropy": 8.488800048828125, + "epoch": 1.531935930393514, + "mean_token_accuracy": 0.8711770176887512, + "num_tokens": 43935311.0, + "step": 15494, + "train/ce_loss": 0.3615928292274475 + }, + { + "epoch": 1.531935930393514, + "step": 15494, + "train/sim_loss": 0.045641303062438965 + }, + { + "epoch": 1.531935930393514, + "step": 15494, + "train/total_loss": 0.08180058747529984 + }, + { + "entropy": 8.969705581665039, + "epoch": 1.5320348032430293, + "mean_token_accuracy": 0.8310249447822571, + "num_tokens": 43947617.0, + "step": 15495, + "train/ce_loss": 0.3527357876300812 + }, + { + "epoch": 1.5320348032430293, + "step": 15495, + "train/sim_loss": 0.0375935435295105 + }, + { + "epoch": 1.5320348032430293, + "step": 15495, + "train/total_loss": 0.07286712527275085 + }, + { + "entropy": 9.319526672363281, + "epoch": 1.532133676092545, + "mean_token_accuracy": 0.8869258165359497, + "num_tokens": 43959862.0, + "step": 15496, + "train/ce_loss": 0.6060508489608765 + }, + { + "epoch": 1.532133676092545, + "step": 15496, + "train/sim_loss": 0.020372092723846436 + }, + { + "epoch": 1.532133676092545, + "step": 15496, + "train/total_loss": 0.0809771791100502 + }, + { + "entropy": 9.346254348754883, + "epoch": 1.5322325489420605, + "mean_token_accuracy": 0.9307411909103394, + "num_tokens": 43970107.0, + "step": 15497, + "train/ce_loss": 0.1884574294090271 + }, + { + "epoch": 1.5322325489420605, + "step": 15497, + "train/sim_loss": 0.04234516620635986 + }, + { + "epoch": 1.5322325489420605, + "step": 15497, + "train/total_loss": 0.06119091063737869 + }, + { + "entropy": 8.920207023620605, + "epoch": 1.532331421791576, + "mean_token_accuracy": 0.8550440669059753, + "num_tokens": 43980438.0, + "step": 15498, + "train/ce_loss": 0.4136747419834137 + }, + { + "epoch": 1.532331421791576, + "step": 15498, + "train/sim_loss": 0.02072852849960327 + }, + { + "epoch": 1.532331421791576, + "step": 15498, + "train/total_loss": 0.0620960034430027 + }, + { + "entropy": 9.488016128540039, + "epoch": 1.5324302946410917, + "mean_token_accuracy": 0.7958937287330627, + "num_tokens": 43992474.0, + "step": 15499, + "train/ce_loss": 0.21676045656204224 + }, + { + "epoch": 1.5324302946410917, + "step": 15499, + "train/sim_loss": 0.032296180725097656 + }, + { + "epoch": 1.5324302946410917, + "step": 15499, + "train/total_loss": 0.05397222936153412 + }, + { + "epoch": 1.532529167490607, + "grad_norm": 0.5420845746994019, + "learning_rate": 6.170449488206498e-06, + "loss": 0.0867, + "step": 15500 + }, + { + "entropy": 9.311519622802734, + "epoch": 1.532529167490607, + "mean_token_accuracy": 0.8421701788902283, + "num_tokens": 44005058.0, + "step": 15500, + "train/ce_loss": 0.19454215466976166 + }, + { + "epoch": 1.532529167490607, + "step": 15500, + "train/sim_loss": 0.03037095069885254 + }, + { + "epoch": 1.532529167490607, + "step": 15500, + "train/total_loss": 0.049825169146060944 + }, + { + "entropy": 9.630913734436035, + "epoch": 1.5326280403401227, + "mean_token_accuracy": 0.8857545852661133, + "num_tokens": 44021286.0, + "step": 15501, + "train/ce_loss": 0.4610142111778259 + }, + { + "epoch": 1.5326280403401227, + "step": 15501, + "train/sim_loss": 0.040408432483673096 + }, + { + "epoch": 1.5326280403401227, + "step": 15501, + "train/total_loss": 0.08650985360145569 + }, + { + "entropy": 9.829570770263672, + "epoch": 1.5327269131896382, + "mean_token_accuracy": 0.8218954205513, + "num_tokens": 44030754.0, + "step": 15502, + "train/ce_loss": 2.547787403273105e-07 + }, + { + "epoch": 1.5327269131896382, + "step": 15502, + "train/sim_loss": 0.016239821910858154 + }, + { + "epoch": 1.5327269131896382, + "step": 15502, + "train/total_loss": 0.016239847987890244 + }, + { + "entropy": 9.172697067260742, + "epoch": 1.5328257860391536, + "mean_token_accuracy": 0.8745762705802917, + "num_tokens": 44044130.0, + "step": 15503, + "train/ce_loss": 0.5221872925758362 + }, + { + "epoch": 1.5328257860391536, + "step": 15503, + "train/sim_loss": 0.05393707752227783 + }, + { + "epoch": 1.5328257860391536, + "step": 15503, + "train/total_loss": 0.10615581274032593 + }, + { + "entropy": 9.365796089172363, + "epoch": 1.5329246588886691, + "mean_token_accuracy": 0.8321013450622559, + "num_tokens": 44057632.0, + "step": 15504, + "train/ce_loss": 0.39284762740135193 + }, + { + "epoch": 1.5329246588886691, + "step": 15504, + "train/sim_loss": 0.05253708362579346 + }, + { + "epoch": 1.5329246588886691, + "step": 15504, + "train/total_loss": 0.09182184934616089 + }, + { + "entropy": 9.230521202087402, + "epoch": 1.5330235317381846, + "mean_token_accuracy": 0.8417874574661255, + "num_tokens": 44066581.0, + "step": 15505, + "train/ce_loss": 0.5495624542236328 + }, + { + "epoch": 1.5330235317381846, + "step": 15505, + "train/sim_loss": 0.02982187271118164 + }, + { + "epoch": 1.5330235317381846, + "step": 15505, + "train/total_loss": 0.08477811515331268 + }, + { + "entropy": 9.299023628234863, + "epoch": 1.5331224045877003, + "mean_token_accuracy": 0.89853435754776, + "num_tokens": 44079340.0, + "step": 15506, + "train/ce_loss": 0.2400834709405899 + }, + { + "epoch": 1.5331224045877003, + "step": 15506, + "train/sim_loss": 0.04198718070983887 + }, + { + "epoch": 1.5331224045877003, + "step": 15506, + "train/total_loss": 0.06599552929401398 + }, + { + "entropy": 9.111177444458008, + "epoch": 1.5332212774372156, + "mean_token_accuracy": 0.8139534592628479, + "num_tokens": 44087595.0, + "step": 15507, + "train/ce_loss": 0.4364094138145447 + }, + { + "epoch": 1.5332212774372156, + "step": 15507, + "train/sim_loss": 0.015587031841278076 + }, + { + "epoch": 1.5332212774372156, + "step": 15507, + "train/total_loss": 0.059227973222732544 + }, + { + "entropy": 10.365911483764648, + "epoch": 1.5333201502867313, + "mean_token_accuracy": 0.8496240377426147, + "num_tokens": 44094076.0, + "step": 15508, + "train/ce_loss": 5.682561550202081e-07 + }, + { + "epoch": 1.5333201502867313, + "step": 15508, + "train/sim_loss": 0.016487956047058105 + }, + { + "epoch": 1.5333201502867313, + "step": 15508, + "train/total_loss": 0.01648801378905773 + }, + { + "entropy": 9.19618034362793, + "epoch": 1.5334190231362468, + "mean_token_accuracy": 0.7986842393875122, + "num_tokens": 44103087.0, + "step": 15509, + "train/ce_loss": 0.4650461971759796 + }, + { + "epoch": 1.5334190231362468, + "step": 15509, + "train/sim_loss": 0.10466611385345459 + }, + { + "epoch": 1.5334190231362468, + "step": 15509, + "train/total_loss": 0.1511707305908203 + }, + { + "entropy": 9.213501930236816, + "epoch": 1.5335178959857623, + "mean_token_accuracy": 0.8461538553237915, + "num_tokens": 44116698.0, + "step": 15510, + "train/ce_loss": 0.40826553106307983 + }, + { + "epoch": 1.5335178959857623, + "step": 15510, + "train/sim_loss": 0.03056180477142334 + }, + { + "epoch": 1.5335178959857623, + "step": 15510, + "train/total_loss": 0.0713883638381958 + }, + { + "entropy": 9.528871536254883, + "epoch": 1.533616768835278, + "mean_token_accuracy": 0.906191349029541, + "num_tokens": 44132283.0, + "step": 15511, + "train/ce_loss": 0.3191855251789093 + }, + { + "epoch": 1.533616768835278, + "step": 15511, + "train/sim_loss": 0.015382528305053711 + }, + { + "epoch": 1.533616768835278, + "step": 15511, + "train/total_loss": 0.04730108007788658 + }, + { + "entropy": 9.511738777160645, + "epoch": 1.5337156416847932, + "mean_token_accuracy": 0.8754512667655945, + "num_tokens": 44146100.0, + "step": 15512, + "train/ce_loss": 0.7696167230606079 + }, + { + "epoch": 1.5337156416847932, + "step": 15512, + "train/sim_loss": 0.03528416156768799 + }, + { + "epoch": 1.5337156416847932, + "step": 15512, + "train/total_loss": 0.1122458353638649 + }, + { + "entropy": 9.951263427734375, + "epoch": 1.533814514534309, + "mean_token_accuracy": 0.9248120188713074, + "num_tokens": 44157121.0, + "step": 15513, + "train/ce_loss": 4.1588430121919373e-07 + }, + { + "epoch": 1.533814514534309, + "step": 15513, + "train/sim_loss": 0.007455170154571533 + }, + { + "epoch": 1.533814514534309, + "step": 15513, + "train/total_loss": 0.007455211598426104 + }, + { + "entropy": 9.670467376708984, + "epoch": 1.5339133873838244, + "mean_token_accuracy": 0.8565573692321777, + "num_tokens": 44170073.0, + "step": 15514, + "train/ce_loss": 0.5907592177391052 + }, + { + "epoch": 1.5339133873838244, + "step": 15514, + "train/sim_loss": 0.03936415910720825 + }, + { + "epoch": 1.5339133873838244, + "step": 15514, + "train/total_loss": 0.09844008088111877 + }, + { + "entropy": 9.373334884643555, + "epoch": 1.53401226023334, + "mean_token_accuracy": 0.8521126508712769, + "num_tokens": 44194965.0, + "step": 15515, + "train/ce_loss": 0.7678623795509338 + }, + { + "epoch": 1.53401226023334, + "step": 15515, + "train/sim_loss": 0.03160226345062256 + }, + { + "epoch": 1.53401226023334, + "step": 15515, + "train/total_loss": 0.1083885058760643 + }, + { + "entropy": 9.540596961975098, + "epoch": 1.5341111330828554, + "mean_token_accuracy": 0.8484848737716675, + "num_tokens": 44209911.0, + "step": 15516, + "train/ce_loss": 2.6669954422686715e-06 + }, + { + "epoch": 1.5341111330828554, + "step": 15516, + "train/sim_loss": 0.027154505252838135 + }, + { + "epoch": 1.5341111330828554, + "step": 15516, + "train/total_loss": 0.027154771611094475 + }, + { + "entropy": 10.169517517089844, + "epoch": 1.534210005932371, + "mean_token_accuracy": 0.8622589707374573, + "num_tokens": 44218455.0, + "step": 15517, + "train/ce_loss": 1.0528333187103271 + }, + { + "epoch": 1.534210005932371, + "step": 15517, + "train/sim_loss": 0.050302207469940186 + }, + { + "epoch": 1.534210005932371, + "step": 15517, + "train/total_loss": 0.15558554232120514 + }, + { + "entropy": 9.7276611328125, + "epoch": 1.5343088787818866, + "mean_token_accuracy": 0.8987341523170471, + "num_tokens": 44233520.0, + "step": 15518, + "train/ce_loss": 0.1843760460615158 + }, + { + "epoch": 1.5343088787818866, + "step": 15518, + "train/sim_loss": 0.03132355213165283 + }, + { + "epoch": 1.5343088787818866, + "step": 15518, + "train/total_loss": 0.04976115748286247 + }, + { + "entropy": 9.24221420288086, + "epoch": 1.5344077516314019, + "mean_token_accuracy": 0.8720588088035583, + "num_tokens": 44250416.0, + "step": 15519, + "train/ce_loss": 0.5139808654785156 + }, + { + "epoch": 1.5344077516314019, + "step": 15519, + "train/sim_loss": 0.04826498031616211 + }, + { + "epoch": 1.5344077516314019, + "step": 15519, + "train/total_loss": 0.09966306388378143 + }, + { + "epoch": 1.5345066244809176, + "grad_norm": 0.5416471362113953, + "learning_rate": 6.16550462344855e-06, + "loss": 0.0774, + "step": 15520 + }, + { + "entropy": 9.532791137695312, + "epoch": 1.5345066244809176, + "mean_token_accuracy": 0.8326055407524109, + "num_tokens": 44263997.0, + "step": 15520, + "train/ce_loss": 0.6778619289398193 + }, + { + "epoch": 1.5345066244809176, + "step": 15520, + "train/sim_loss": 0.040982067584991455 + }, + { + "epoch": 1.5345066244809176, + "step": 15520, + "train/total_loss": 0.10876826196908951 + }, + { + "entropy": 8.94248104095459, + "epoch": 1.534605497330433, + "mean_token_accuracy": 0.8556581735610962, + "num_tokens": 44272101.0, + "step": 15521, + "train/ce_loss": 0.38642117381095886 + }, + { + "epoch": 1.534605497330433, + "step": 15521, + "train/sim_loss": 0.05522739887237549 + }, + { + "epoch": 1.534605497330433, + "step": 15521, + "train/total_loss": 0.09386952221393585 + }, + { + "entropy": 9.221086502075195, + "epoch": 1.5347043701799485, + "mean_token_accuracy": 0.8371501564979553, + "num_tokens": 44283090.0, + "step": 15522, + "train/ce_loss": 2.031962594628567e-06 + }, + { + "epoch": 1.5347043701799485, + "step": 15522, + "train/sim_loss": 0.042963385581970215 + }, + { + "epoch": 1.5347043701799485, + "step": 15522, + "train/total_loss": 0.04296359047293663 + }, + { + "entropy": 9.085994720458984, + "epoch": 1.5348032430294642, + "mean_token_accuracy": 0.8644067645072937, + "num_tokens": 44293188.0, + "step": 15523, + "train/ce_loss": 0.35460931062698364 + }, + { + "epoch": 1.5348032430294642, + "step": 15523, + "train/sim_loss": 0.0339435338973999 + }, + { + "epoch": 1.5348032430294642, + "step": 15523, + "train/total_loss": 0.0694044679403305 + }, + { + "entropy": 9.325504302978516, + "epoch": 1.5349021158789795, + "mean_token_accuracy": 0.8514969944953918, + "num_tokens": 44311324.0, + "step": 15524, + "train/ce_loss": 0.2248862385749817 + }, + { + "epoch": 1.5349021158789795, + "step": 15524, + "train/sim_loss": 0.061233460903167725 + }, + { + "epoch": 1.5349021158789795, + "step": 15524, + "train/total_loss": 0.0837220847606659 + }, + { + "entropy": 9.268471717834473, + "epoch": 1.5350009887284952, + "mean_token_accuracy": 0.8604382872581482, + "num_tokens": 44324490.0, + "step": 15525, + "train/ce_loss": 0.4122193157672882 + }, + { + "epoch": 1.5350009887284952, + "step": 15525, + "train/sim_loss": 0.016553759574890137 + }, + { + "epoch": 1.5350009887284952, + "step": 15525, + "train/total_loss": 0.05777569115161896 + }, + { + "entropy": 9.21963882446289, + "epoch": 1.5350998615780107, + "mean_token_accuracy": 0.8335134983062744, + "num_tokens": 44341927.0, + "step": 15526, + "train/ce_loss": 0.5864313244819641 + }, + { + "epoch": 1.5350998615780107, + "step": 15526, + "train/sim_loss": 0.017277777194976807 + }, + { + "epoch": 1.5350998615780107, + "step": 15526, + "train/total_loss": 0.07592090964317322 + }, + { + "entropy": 9.230839729309082, + "epoch": 1.5351987344275262, + "mean_token_accuracy": 0.8777292370796204, + "num_tokens": 44356667.0, + "step": 15527, + "train/ce_loss": 0.5999716520309448 + }, + { + "epoch": 1.5351987344275262, + "step": 15527, + "train/sim_loss": 0.024262309074401855 + }, + { + "epoch": 1.5351987344275262, + "step": 15527, + "train/total_loss": 0.08425948023796082 + }, + { + "entropy": 9.540874481201172, + "epoch": 1.5352976072770417, + "mean_token_accuracy": 0.8554006814956665, + "num_tokens": 44372705.0, + "step": 15528, + "train/ce_loss": 0.5123423933982849 + }, + { + "epoch": 1.5352976072770417, + "step": 15528, + "train/sim_loss": 0.019721150398254395 + }, + { + "epoch": 1.5352976072770417, + "step": 15528, + "train/total_loss": 0.07095539569854736 + }, + { + "entropy": 9.077851295471191, + "epoch": 1.5353964801265572, + "mean_token_accuracy": 0.8485293984413147, + "num_tokens": 44390123.0, + "step": 15529, + "train/ce_loss": 0.7602454423904419 + }, + { + "epoch": 1.5353964801265572, + "step": 15529, + "train/sim_loss": 0.012822866439819336 + }, + { + "epoch": 1.5353964801265572, + "step": 15529, + "train/total_loss": 0.08884741365909576 + }, + { + "entropy": 9.028739929199219, + "epoch": 1.5354953529760729, + "mean_token_accuracy": 0.8765100836753845, + "num_tokens": 44404921.0, + "step": 15530, + "train/ce_loss": 1.9137343088004855e-07 + }, + { + "epoch": 1.5354953529760729, + "step": 15530, + "train/sim_loss": 0.03732973337173462 + }, + { + "epoch": 1.5354953529760729, + "step": 15530, + "train/total_loss": 0.03732975199818611 + }, + { + "entropy": 9.749067306518555, + "epoch": 1.5355942258255881, + "mean_token_accuracy": 0.8776978254318237, + "num_tokens": 44419952.0, + "step": 15531, + "train/ce_loss": 0.40412774682044983 + }, + { + "epoch": 1.5355942258255881, + "step": 15531, + "train/sim_loss": 0.0715746283531189 + }, + { + "epoch": 1.5355942258255881, + "step": 15531, + "train/total_loss": 0.11198740452528 + }, + { + "entropy": 9.541303634643555, + "epoch": 1.5356930986751038, + "mean_token_accuracy": 0.8417508602142334, + "num_tokens": 44435518.0, + "step": 15532, + "train/ce_loss": 0.6373078227043152 + }, + { + "epoch": 1.5356930986751038, + "step": 15532, + "train/sim_loss": 0.02529233694076538 + }, + { + "epoch": 1.5356930986751038, + "step": 15532, + "train/total_loss": 0.08902312070131302 + }, + { + "entropy": 9.36393928527832, + "epoch": 1.5357919715246193, + "mean_token_accuracy": 0.8808290362358093, + "num_tokens": 44442138.0, + "step": 15533, + "train/ce_loss": 0.46860140562057495 + }, + { + "epoch": 1.5357919715246193, + "step": 15533, + "train/sim_loss": 0.0362887978553772 + }, + { + "epoch": 1.5357919715246193, + "step": 15533, + "train/total_loss": 0.08314894139766693 + }, + { + "entropy": 9.314950942993164, + "epoch": 1.5358908443741348, + "mean_token_accuracy": 0.8692468404769897, + "num_tokens": 44455932.0, + "step": 15534, + "train/ce_loss": 0.2262345403432846 + }, + { + "epoch": 1.5358908443741348, + "step": 15534, + "train/sim_loss": 0.041899919509887695 + }, + { + "epoch": 1.5358908443741348, + "step": 15534, + "train/total_loss": 0.0645233765244484 + }, + { + "entropy": 9.471124649047852, + "epoch": 1.5359897172236505, + "mean_token_accuracy": 0.8791064620018005, + "num_tokens": 44467050.0, + "step": 15535, + "train/ce_loss": 0.3533307611942291 + }, + { + "epoch": 1.5359897172236505, + "step": 15535, + "train/sim_loss": 0.03632044792175293 + }, + { + "epoch": 1.5359897172236505, + "step": 15535, + "train/total_loss": 0.07165353000164032 + }, + { + "entropy": 9.630651473999023, + "epoch": 1.5360885900731658, + "mean_token_accuracy": 0.8557844758033752, + "num_tokens": 44482527.0, + "step": 15536, + "train/ce_loss": 9.241496741196897e-07 + }, + { + "epoch": 1.5360885900731658, + "step": 15536, + "train/sim_loss": 0.03623932600021362 + }, + { + "epoch": 1.5360885900731658, + "step": 15536, + "train/total_loss": 0.036239419132471085 + }, + { + "entropy": 9.382251739501953, + "epoch": 1.5361874629226815, + "mean_token_accuracy": 0.8641456365585327, + "num_tokens": 44496179.0, + "step": 15537, + "train/ce_loss": 2.5377630663570017e-07 + }, + { + "epoch": 1.5361874629226815, + "step": 15537, + "train/sim_loss": 0.013514578342437744 + }, + { + "epoch": 1.5361874629226815, + "step": 15537, + "train/total_loss": 0.013514603488147259 + }, + { + "entropy": 9.295955657958984, + "epoch": 1.536286335772197, + "mean_token_accuracy": 0.8440678119659424, + "num_tokens": 44510711.0, + "step": 15538, + "train/ce_loss": 0.406403511762619 + }, + { + "epoch": 1.536286335772197, + "step": 15538, + "train/sim_loss": 0.03211390972137451 + }, + { + "epoch": 1.536286335772197, + "step": 15538, + "train/total_loss": 0.07275426387786865 + }, + { + "entropy": 9.50147819519043, + "epoch": 1.5363852086217125, + "mean_token_accuracy": 0.8455114960670471, + "num_tokens": 44526831.0, + "step": 15539, + "train/ce_loss": 3.5438876011539833e-07 + }, + { + "epoch": 1.5363852086217125, + "step": 15539, + "train/sim_loss": 0.025833308696746826 + }, + { + "epoch": 1.5363852086217125, + "step": 15539, + "train/total_loss": 0.02583334408700466 + }, + { + "epoch": 1.536484081471228, + "grad_norm": 0.7324240803718567, + "learning_rate": 6.160559758690601e-06, + "loss": 0.0832, + "step": 15540 + }, + { + "entropy": 9.213797569274902, + "epoch": 1.536484081471228, + "mean_token_accuracy": 0.8888888955116272, + "num_tokens": 44541130.0, + "step": 15540, + "train/ce_loss": 0.5100647211074829 + }, + { + "epoch": 1.536484081471228, + "step": 15540, + "train/sim_loss": 0.04110151529312134 + }, + { + "epoch": 1.536484081471228, + "step": 15540, + "train/total_loss": 0.09210798889398575 + }, + { + "entropy": 9.13371467590332, + "epoch": 1.5365829543207434, + "mean_token_accuracy": 0.839184582233429, + "num_tokens": 44555071.0, + "step": 15541, + "train/ce_loss": 0.5463962554931641 + }, + { + "epoch": 1.5365829543207434, + "step": 15541, + "train/sim_loss": 0.06407654285430908 + }, + { + "epoch": 1.5365829543207434, + "step": 15541, + "train/total_loss": 0.11871616542339325 + }, + { + "entropy": 9.569046974182129, + "epoch": 1.5366818271702591, + "mean_token_accuracy": 0.8906497359275818, + "num_tokens": 44565300.0, + "step": 15542, + "train/ce_loss": 0.29837530851364136 + }, + { + "epoch": 1.5366818271702591, + "step": 15542, + "train/sim_loss": 0.06425285339355469 + }, + { + "epoch": 1.5366818271702591, + "step": 15542, + "train/total_loss": 0.09409038722515106 + }, + { + "entropy": 8.753713607788086, + "epoch": 1.5367807000197744, + "mean_token_accuracy": 0.8548864722251892, + "num_tokens": 44575391.0, + "step": 15543, + "train/ce_loss": 0.36818304657936096 + }, + { + "epoch": 1.5367807000197744, + "step": 15543, + "train/sim_loss": 0.11928689479827881 + }, + { + "epoch": 1.5367807000197744, + "step": 15543, + "train/total_loss": 0.15610520541667938 + }, + { + "entropy": 9.154935836791992, + "epoch": 1.5368795728692901, + "mean_token_accuracy": 0.8194748163223267, + "num_tokens": 44587546.0, + "step": 15544, + "train/ce_loss": 0.31740882992744446 + }, + { + "epoch": 1.5368795728692901, + "step": 15544, + "train/sim_loss": 0.015049219131469727 + }, + { + "epoch": 1.5368795728692901, + "step": 15544, + "train/total_loss": 0.04679010435938835 + }, + { + "entropy": 8.728304862976074, + "epoch": 1.5369784457188056, + "mean_token_accuracy": 0.8290686011314392, + "num_tokens": 44595226.0, + "step": 15545, + "train/ce_loss": 0.8360991477966309 + }, + { + "epoch": 1.5369784457188056, + "step": 15545, + "train/sim_loss": 0.037242591381073 + }, + { + "epoch": 1.5369784457188056, + "step": 15545, + "train/total_loss": 0.1208525076508522 + }, + { + "entropy": 9.278260231018066, + "epoch": 1.537077318568321, + "mean_token_accuracy": 0.8220211267471313, + "num_tokens": 44613428.0, + "step": 15546, + "train/ce_loss": 1.1532337665557861 + }, + { + "epoch": 1.537077318568321, + "step": 15546, + "train/sim_loss": 0.054852962493896484 + }, + { + "epoch": 1.537077318568321, + "step": 15546, + "train/total_loss": 0.17017634212970734 + }, + { + "entropy": 9.313729286193848, + "epoch": 1.5371761914178368, + "mean_token_accuracy": 0.8645962476730347, + "num_tokens": 44628061.0, + "step": 15547, + "train/ce_loss": 0.5622519254684448 + }, + { + "epoch": 1.5371761914178368, + "step": 15547, + "train/sim_loss": 0.02125924825668335 + }, + { + "epoch": 1.5371761914178368, + "step": 15547, + "train/total_loss": 0.07748444378376007 + }, + { + "entropy": 9.99081039428711, + "epoch": 1.537275064267352, + "mean_token_accuracy": 0.9261083602905273, + "num_tokens": 44637631.0, + "step": 15548, + "train/ce_loss": 1.4233885394787649e-06 + }, + { + "epoch": 1.537275064267352, + "step": 15548, + "train/sim_loss": 0.04382157325744629 + }, + { + "epoch": 1.537275064267352, + "step": 15548, + "train/total_loss": 0.04382171481847763 + }, + { + "entropy": 9.971100807189941, + "epoch": 1.5373739371168678, + "mean_token_accuracy": 0.8628762364387512, + "num_tokens": 44650450.0, + "step": 15549, + "train/ce_loss": 0.5072386264801025 + }, + { + "epoch": 1.5373739371168678, + "step": 15549, + "train/sim_loss": 0.05651390552520752 + }, + { + "epoch": 1.5373739371168678, + "step": 15549, + "train/total_loss": 0.10723777115345001 + }, + { + "entropy": 8.984760284423828, + "epoch": 1.5374728099663832, + "mean_token_accuracy": 0.8864796161651611, + "num_tokens": 44658725.0, + "step": 15550, + "train/ce_loss": 0.1607818603515625 + }, + { + "epoch": 1.5374728099663832, + "step": 15550, + "train/sim_loss": 0.031545817852020264 + }, + { + "epoch": 1.5374728099663832, + "step": 15550, + "train/total_loss": 0.04762400686740875 + }, + { + "entropy": 9.26809310913086, + "epoch": 1.5375716828158987, + "mean_token_accuracy": 0.8094629049301147, + "num_tokens": 44669998.0, + "step": 15551, + "train/ce_loss": 0.4364279806613922 + }, + { + "epoch": 1.5375716828158987, + "step": 15551, + "train/sim_loss": 0.019725561141967773 + }, + { + "epoch": 1.5375716828158987, + "step": 15551, + "train/total_loss": 0.06336836516857147 + }, + { + "entropy": 9.473377227783203, + "epoch": 1.5376705556654144, + "mean_token_accuracy": 0.8491379022598267, + "num_tokens": 44678255.0, + "step": 15552, + "train/ce_loss": 0.6349029541015625 + }, + { + "epoch": 1.5376705556654144, + "step": 15552, + "train/sim_loss": 0.048188626766204834 + }, + { + "epoch": 1.5376705556654144, + "step": 15552, + "train/total_loss": 0.11167892068624496 + }, + { + "entropy": 9.62903118133545, + "epoch": 1.5377694285149297, + "mean_token_accuracy": 0.8755555748939514, + "num_tokens": 44690251.0, + "step": 15553, + "train/ce_loss": 0.48945239186286926 + }, + { + "epoch": 1.5377694285149297, + "step": 15553, + "train/sim_loss": 0.04017496109008789 + }, + { + "epoch": 1.5377694285149297, + "step": 15553, + "train/total_loss": 0.08912020176649094 + }, + { + "entropy": 9.074546813964844, + "epoch": 1.5378683013644454, + "mean_token_accuracy": 0.8014705777168274, + "num_tokens": 44700014.0, + "step": 15554, + "train/ce_loss": 0.7528289556503296 + }, + { + "epoch": 1.5378683013644454, + "step": 15554, + "train/sim_loss": 0.0772784948348999 + }, + { + "epoch": 1.5378683013644454, + "step": 15554, + "train/total_loss": 0.15256139636039734 + }, + { + "entropy": 9.28653335571289, + "epoch": 1.537967174213961, + "mean_token_accuracy": 0.8707799911499023, + "num_tokens": 44711960.0, + "step": 15555, + "train/ce_loss": 0.23730185627937317 + }, + { + "epoch": 1.537967174213961, + "step": 15555, + "train/sim_loss": 0.05544722080230713 + }, + { + "epoch": 1.537967174213961, + "step": 15555, + "train/total_loss": 0.07917740941047668 + }, + { + "entropy": 9.113178253173828, + "epoch": 1.5380660470634764, + "mean_token_accuracy": 0.8925619721412659, + "num_tokens": 44718060.0, + "step": 15556, + "train/ce_loss": 0.2809114158153534 + }, + { + "epoch": 1.5380660470634764, + "step": 15556, + "train/sim_loss": 0.016695022583007812 + }, + { + "epoch": 1.5380660470634764, + "step": 15556, + "train/total_loss": 0.04478616267442703 + }, + { + "entropy": 9.28750991821289, + "epoch": 1.5381649199129919, + "mean_token_accuracy": 0.8513341546058655, + "num_tokens": 44731183.0, + "step": 15557, + "train/ce_loss": 0.6976374983787537 + }, + { + "epoch": 1.5381649199129919, + "step": 15557, + "train/sim_loss": 0.03159892559051514 + }, + { + "epoch": 1.5381649199129919, + "step": 15557, + "train/total_loss": 0.1013626754283905 + }, + { + "entropy": 9.566449165344238, + "epoch": 1.5382637927625074, + "mean_token_accuracy": 0.8135592937469482, + "num_tokens": 44746281.0, + "step": 15558, + "train/ce_loss": 0.22687289118766785 + }, + { + "epoch": 1.5382637927625074, + "step": 15558, + "train/sim_loss": 0.017385363578796387 + }, + { + "epoch": 1.5382637927625074, + "step": 15558, + "train/total_loss": 0.04007265344262123 + }, + { + "entropy": 10.11129379272461, + "epoch": 1.538362665612023, + "mean_token_accuracy": 0.9281437397003174, + "num_tokens": 44760088.0, + "step": 15559, + "train/ce_loss": 1.184942789222987e-06 + }, + { + "epoch": 1.538362665612023, + "step": 15559, + "train/sim_loss": 0.03168600797653198 + }, + { + "epoch": 1.538362665612023, + "step": 15559, + "train/total_loss": 0.03168612718582153 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 0.749226450920105, + "learning_rate": 6.155614893932651e-06, + "loss": 0.0871, + "step": 15560 + }, + { + "entropy": 8.9140625, + "epoch": 1.5384615384615383, + "mean_token_accuracy": 0.8397436141967773, + "num_tokens": 44774510.0, + "step": 15560, + "train/ce_loss": 7.671895900784875e-07 + }, + { + "epoch": 1.5384615384615383, + "step": 15560, + "train/sim_loss": 0.03339838981628418 + }, + { + "epoch": 1.5384615384615383, + "step": 15560, + "train/total_loss": 0.03339846804738045 + }, + { + "entropy": 9.452481269836426, + "epoch": 1.538560411311054, + "mean_token_accuracy": 0.8628205060958862, + "num_tokens": 44791831.0, + "step": 15561, + "train/ce_loss": 0.31491464376449585 + }, + { + "epoch": 1.538560411311054, + "step": 15561, + "train/sim_loss": 0.03274953365325928 + }, + { + "epoch": 1.538560411311054, + "step": 15561, + "train/total_loss": 0.06424099951982498 + }, + { + "entropy": 9.241341590881348, + "epoch": 1.5386592841605695, + "mean_token_accuracy": 0.8520461916923523, + "num_tokens": 44811327.0, + "step": 15562, + "train/ce_loss": 0.3365449607372284 + }, + { + "epoch": 1.5386592841605695, + "step": 15562, + "train/sim_loss": 0.022071897983551025 + }, + { + "epoch": 1.5386592841605695, + "step": 15562, + "train/total_loss": 0.055726394057273865 + }, + { + "entropy": 9.69753360748291, + "epoch": 1.538758157010085, + "mean_token_accuracy": 0.8376288414001465, + "num_tokens": 44823637.0, + "step": 15563, + "train/ce_loss": 0.7772836089134216 + }, + { + "epoch": 1.538758157010085, + "step": 15563, + "train/sim_loss": 0.07043731212615967 + }, + { + "epoch": 1.538758157010085, + "step": 15563, + "train/total_loss": 0.14816567301750183 + }, + { + "entropy": 9.053631782531738, + "epoch": 1.5388570298596007, + "mean_token_accuracy": 0.8730325102806091, + "num_tokens": 44833368.0, + "step": 15564, + "train/ce_loss": 0.5360143184661865 + }, + { + "epoch": 1.5388570298596007, + "step": 15564, + "train/sim_loss": 0.03468090295791626 + }, + { + "epoch": 1.5388570298596007, + "step": 15564, + "train/total_loss": 0.08828233182430267 + }, + { + "entropy": 9.552724838256836, + "epoch": 1.538955902709116, + "mean_token_accuracy": 0.836686372756958, + "num_tokens": 44853157.0, + "step": 15565, + "train/ce_loss": 0.29991036653518677 + }, + { + "epoch": 1.538955902709116, + "step": 15565, + "train/sim_loss": 0.022078275680541992 + }, + { + "epoch": 1.538955902709116, + "step": 15565, + "train/total_loss": 0.05206931382417679 + }, + { + "entropy": 9.349292755126953, + "epoch": 1.5390547755586317, + "mean_token_accuracy": 0.8229755163192749, + "num_tokens": 44859943.0, + "step": 15566, + "train/ce_loss": 0.4456663429737091 + }, + { + "epoch": 1.5390547755586317, + "step": 15566, + "train/sim_loss": 0.06722599267959595 + }, + { + "epoch": 1.5390547755586317, + "step": 15566, + "train/total_loss": 0.11179262399673462 + }, + { + "entropy": 9.250455856323242, + "epoch": 1.5391536484081472, + "mean_token_accuracy": 0.8352272510528564, + "num_tokens": 44869150.0, + "step": 15567, + "train/ce_loss": 0.6816310286521912 + }, + { + "epoch": 1.5391536484081472, + "step": 15567, + "train/sim_loss": 0.04425644874572754 + }, + { + "epoch": 1.5391536484081472, + "step": 15567, + "train/total_loss": 0.11241955310106277 + }, + { + "entropy": 9.177759170532227, + "epoch": 1.5392525212576627, + "mean_token_accuracy": 0.875809907913208, + "num_tokens": 44879490.0, + "step": 15568, + "train/ce_loss": 0.34121274948120117 + }, + { + "epoch": 1.5392525212576627, + "step": 15568, + "train/sim_loss": 0.01703011989593506 + }, + { + "epoch": 1.5392525212576627, + "step": 15568, + "train/total_loss": 0.051151394844055176 + }, + { + "entropy": 9.555173873901367, + "epoch": 1.5393513941071781, + "mean_token_accuracy": 0.8543165326118469, + "num_tokens": 44887586.0, + "step": 15569, + "train/ce_loss": 0.49759581685066223 + }, + { + "epoch": 1.5393513941071781, + "step": 15569, + "train/sim_loss": 0.04457199573516846 + }, + { + "epoch": 1.5393513941071781, + "step": 15569, + "train/total_loss": 0.09433157742023468 + }, + { + "entropy": 8.407096862792969, + "epoch": 1.5394502669566936, + "mean_token_accuracy": 0.8421052694320679, + "num_tokens": 44895102.0, + "step": 15570, + "train/ce_loss": 0.40045875310897827 + }, + { + "epoch": 1.5394502669566936, + "step": 15570, + "train/sim_loss": 0.10085994005203247 + }, + { + "epoch": 1.5394502669566936, + "step": 15570, + "train/total_loss": 0.14090581238269806 + }, + { + "entropy": 9.72424030303955, + "epoch": 1.5395491398062093, + "mean_token_accuracy": 0.893203854560852, + "num_tokens": 44907112.0, + "step": 15571, + "train/ce_loss": 3.555404077815183e-07 + }, + { + "epoch": 1.5395491398062093, + "step": 15571, + "train/sim_loss": 0.011334061622619629 + }, + { + "epoch": 1.5395491398062093, + "step": 15571, + "train/total_loss": 0.011334097012877464 + }, + { + "entropy": 8.962196350097656, + "epoch": 1.5396480126557246, + "mean_token_accuracy": 0.8279816508293152, + "num_tokens": 44915276.0, + "step": 15572, + "train/ce_loss": 0.9635932445526123 + }, + { + "epoch": 1.5396480126557246, + "step": 15572, + "train/sim_loss": 0.08694446086883545 + }, + { + "epoch": 1.5396480126557246, + "step": 15572, + "train/total_loss": 0.18330378830432892 + }, + { + "entropy": 9.330257415771484, + "epoch": 1.5397468855052403, + "mean_token_accuracy": 0.8466165661811829, + "num_tokens": 44929205.0, + "step": 15573, + "train/ce_loss": 0.3745930790901184 + }, + { + "epoch": 1.5397468855052403, + "step": 15573, + "train/sim_loss": 0.015923619270324707 + }, + { + "epoch": 1.5397468855052403, + "step": 15573, + "train/total_loss": 0.05338292941451073 + }, + { + "entropy": 9.153312683105469, + "epoch": 1.5398457583547558, + "mean_token_accuracy": 0.7976952791213989, + "num_tokens": 44944402.0, + "step": 15574, + "train/ce_loss": 2.5825883653851633e-07 + }, + { + "epoch": 1.5398457583547558, + "step": 15574, + "train/sim_loss": 0.02271968126296997 + }, + { + "epoch": 1.5398457583547558, + "step": 15574, + "train/total_loss": 0.02271970734000206 + }, + { + "entropy": 9.03882884979248, + "epoch": 1.5399446312042713, + "mean_token_accuracy": 0.7865296602249146, + "num_tokens": 44952963.0, + "step": 15575, + "train/ce_loss": 0.43071678280830383 + }, + { + "epoch": 1.5399446312042713, + "step": 15575, + "train/sim_loss": 0.011766910552978516 + }, + { + "epoch": 1.5399446312042713, + "step": 15575, + "train/total_loss": 0.05483859032392502 + }, + { + "entropy": 9.203664779663086, + "epoch": 1.540043504053787, + "mean_token_accuracy": 0.8474025726318359, + "num_tokens": 44969067.0, + "step": 15576, + "train/ce_loss": 0.767663836479187 + }, + { + "epoch": 1.540043504053787, + "step": 15576, + "train/sim_loss": 0.04570186138153076 + }, + { + "epoch": 1.540043504053787, + "step": 15576, + "train/total_loss": 0.1224682480096817 + }, + { + "entropy": 9.366912841796875, + "epoch": 1.5401423769033022, + "mean_token_accuracy": 0.863070547580719, + "num_tokens": 44981933.0, + "step": 15577, + "train/ce_loss": 0.40167421102523804 + }, + { + "epoch": 1.5401423769033022, + "step": 15577, + "train/sim_loss": 0.04983878135681152 + }, + { + "epoch": 1.5401423769033022, + "step": 15577, + "train/total_loss": 0.09000620245933533 + }, + { + "entropy": 9.785273551940918, + "epoch": 1.540241249752818, + "mean_token_accuracy": 0.9016697406768799, + "num_tokens": 44990270.0, + "step": 15578, + "train/ce_loss": 0.24077239632606506 + }, + { + "epoch": 1.540241249752818, + "step": 15578, + "train/sim_loss": 0.058567166328430176 + }, + { + "epoch": 1.540241249752818, + "step": 15578, + "train/total_loss": 0.08264440298080444 + }, + { + "entropy": 9.579989433288574, + "epoch": 1.5403401226023334, + "mean_token_accuracy": 0.8882783651351929, + "num_tokens": 45000349.0, + "step": 15579, + "train/ce_loss": 0.41338011622428894 + }, + { + "epoch": 1.5403401226023334, + "step": 15579, + "train/sim_loss": 0.016424894332885742 + }, + { + "epoch": 1.5403401226023334, + "step": 15579, + "train/total_loss": 0.057762905955314636 + }, + { + "epoch": 1.540438995451849, + "grad_norm": 0.5518446564674377, + "learning_rate": 6.150670029174702e-06, + "loss": 0.0827, + "step": 15580 + }, + { + "entropy": 9.070225715637207, + "epoch": 1.540438995451849, + "mean_token_accuracy": 0.873997688293457, + "num_tokens": 45008847.0, + "step": 15580, + "train/ce_loss": 0.5278785824775696 + }, + { + "epoch": 1.540438995451849, + "step": 15580, + "train/sim_loss": 0.046226680278778076 + }, + { + "epoch": 1.540438995451849, + "step": 15580, + "train/total_loss": 0.0990145355463028 + }, + { + "entropy": 9.240795135498047, + "epoch": 1.5405378683013644, + "mean_token_accuracy": 0.8956781029701233, + "num_tokens": 45017404.0, + "step": 15581, + "train/ce_loss": 0.20593689382076263 + }, + { + "epoch": 1.5405378683013644, + "step": 15581, + "train/sim_loss": 0.029889047145843506 + }, + { + "epoch": 1.5405378683013644, + "step": 15581, + "train/total_loss": 0.05048273503780365 + }, + { + "entropy": 9.928196907043457, + "epoch": 1.54063674115088, + "mean_token_accuracy": 0.9017632007598877, + "num_tokens": 45029840.0, + "step": 15582, + "train/ce_loss": 6.774097869310935e-07 + }, + { + "epoch": 1.54063674115088, + "step": 15582, + "train/sim_loss": 0.026789307594299316 + }, + { + "epoch": 1.54063674115088, + "step": 15582, + "train/total_loss": 0.02678937464952469 + }, + { + "entropy": 9.35612678527832, + "epoch": 1.5407356140003956, + "mean_token_accuracy": 0.8627049326896667, + "num_tokens": 45041106.0, + "step": 15583, + "train/ce_loss": 4.289521200462332e-07 + }, + { + "epoch": 1.5407356140003956, + "step": 15583, + "train/sim_loss": 0.017502129077911377 + }, + { + "epoch": 1.5407356140003956, + "step": 15583, + "train/total_loss": 0.01750217191874981 + }, + { + "entropy": 9.518864631652832, + "epoch": 1.5408344868499109, + "mean_token_accuracy": 0.8598265647888184, + "num_tokens": 45054385.0, + "step": 15584, + "train/ce_loss": 0.551129937171936 + }, + { + "epoch": 1.5408344868499109, + "step": 15584, + "train/sim_loss": 0.038038671016693115 + }, + { + "epoch": 1.5408344868499109, + "step": 15584, + "train/total_loss": 0.09315166622400284 + }, + { + "entropy": 9.096382141113281, + "epoch": 1.5409333596994266, + "mean_token_accuracy": 0.8153619170188904, + "num_tokens": 45067166.0, + "step": 15585, + "train/ce_loss": 0.6365681290626526 + }, + { + "epoch": 1.5409333596994266, + "step": 15585, + "train/sim_loss": 0.015120625495910645 + }, + { + "epoch": 1.5409333596994266, + "step": 15585, + "train/total_loss": 0.07877743989229202 + }, + { + "entropy": 9.319936752319336, + "epoch": 1.541032232548942, + "mean_token_accuracy": 0.8259459733963013, + "num_tokens": 45080416.0, + "step": 15586, + "train/ce_loss": 0.4595819115638733 + }, + { + "epoch": 1.541032232548942, + "step": 15586, + "train/sim_loss": 0.0715017318725586 + }, + { + "epoch": 1.541032232548942, + "step": 15586, + "train/total_loss": 0.11745992302894592 + }, + { + "entropy": 9.047931671142578, + "epoch": 1.5411311053984575, + "mean_token_accuracy": 0.8623279333114624, + "num_tokens": 45088044.0, + "step": 15587, + "train/ce_loss": 0.4425033628940582 + }, + { + "epoch": 1.5411311053984575, + "step": 15587, + "train/sim_loss": 0.010923206806182861 + }, + { + "epoch": 1.5411311053984575, + "step": 15587, + "train/total_loss": 0.055173542350530624 + }, + { + "entropy": 9.18014907836914, + "epoch": 1.5412299782479733, + "mean_token_accuracy": 0.8364115953445435, + "num_tokens": 45095211.0, + "step": 15588, + "train/ce_loss": 0.5064647197723389 + }, + { + "epoch": 1.5412299782479733, + "step": 15588, + "train/sim_loss": 0.009697556495666504 + }, + { + "epoch": 1.5412299782479733, + "step": 15588, + "train/total_loss": 0.06034402921795845 + }, + { + "entropy": 9.321525573730469, + "epoch": 1.5413288510974885, + "mean_token_accuracy": 0.8261474370956421, + "num_tokens": 45111558.0, + "step": 15589, + "train/ce_loss": 0.6869306564331055 + }, + { + "epoch": 1.5413288510974885, + "step": 15589, + "train/sim_loss": 0.04155093431472778 + }, + { + "epoch": 1.5413288510974885, + "step": 15589, + "train/total_loss": 0.11024399846792221 + }, + { + "entropy": 9.128913879394531, + "epoch": 1.5414277239470042, + "mean_token_accuracy": 0.8113803863525391, + "num_tokens": 45122776.0, + "step": 15590, + "train/ce_loss": 0.480161190032959 + }, + { + "epoch": 1.5414277239470042, + "step": 15590, + "train/sim_loss": 0.0214921236038208 + }, + { + "epoch": 1.5414277239470042, + "step": 15590, + "train/total_loss": 0.06950823962688446 + }, + { + "entropy": 9.249202728271484, + "epoch": 1.5415265967965197, + "mean_token_accuracy": 0.8088064789772034, + "num_tokens": 45132431.0, + "step": 15591, + "train/ce_loss": 0.5489510893821716 + }, + { + "epoch": 1.5415265967965197, + "step": 15591, + "train/sim_loss": 0.08168041706085205 + }, + { + "epoch": 1.5415265967965197, + "step": 15591, + "train/total_loss": 0.13657552003860474 + }, + { + "entropy": 9.473143577575684, + "epoch": 1.5416254696460352, + "mean_token_accuracy": 0.8536912798881531, + "num_tokens": 45144272.0, + "step": 15592, + "train/ce_loss": 2.0097401431939943e-07 + }, + { + "epoch": 1.5416254696460352, + "step": 15592, + "train/sim_loss": 0.01356285810470581 + }, + { + "epoch": 1.5416254696460352, + "step": 15592, + "train/total_loss": 0.013562878593802452 + }, + { + "entropy": 8.951631546020508, + "epoch": 1.5417243424955507, + "mean_token_accuracy": 0.8386004567146301, + "num_tokens": 45154096.0, + "step": 15593, + "train/ce_loss": 0.497791588306427 + }, + { + "epoch": 1.5417243424955507, + "step": 15593, + "train/sim_loss": 0.06358885765075684 + }, + { + "epoch": 1.5417243424955507, + "step": 15593, + "train/total_loss": 0.11336801946163177 + }, + { + "entropy": 9.750216484069824, + "epoch": 1.5418232153450662, + "mean_token_accuracy": 0.8702490329742432, + "num_tokens": 45165432.0, + "step": 15594, + "train/ce_loss": 0.46677058935165405 + }, + { + "epoch": 1.5418232153450662, + "step": 15594, + "train/sim_loss": 0.023453593254089355 + }, + { + "epoch": 1.5418232153450662, + "step": 15594, + "train/total_loss": 0.07013065367937088 + }, + { + "entropy": 9.463769912719727, + "epoch": 1.5419220881945819, + "mean_token_accuracy": 0.8662420511245728, + "num_tokens": 45173841.0, + "step": 15595, + "train/ce_loss": 7.309270699806802e-07 + }, + { + "epoch": 1.5419220881945819, + "step": 15595, + "train/sim_loss": 0.027944326400756836 + }, + { + "epoch": 1.5419220881945819, + "step": 15595, + "train/total_loss": 0.027944399043917656 + }, + { + "entropy": 9.383512496948242, + "epoch": 1.5420209610440971, + "mean_token_accuracy": 0.9159212708473206, + "num_tokens": 45179851.0, + "step": 15596, + "train/ce_loss": 3.7532458918576594e-07 + }, + { + "epoch": 1.5420209610440971, + "step": 15596, + "train/sim_loss": 0.008891642093658447 + }, + { + "epoch": 1.5420209610440971, + "step": 15596, + "train/total_loss": 0.008891679346561432 + }, + { + "entropy": 9.550897598266602, + "epoch": 1.5421198338936128, + "mean_token_accuracy": 0.835597813129425, + "num_tokens": 45193185.0, + "step": 15597, + "train/ce_loss": 0.5893364548683167 + }, + { + "epoch": 1.5421198338936128, + "step": 15597, + "train/sim_loss": 0.11024636030197144 + }, + { + "epoch": 1.5421198338936128, + "step": 15597, + "train/total_loss": 0.1691800057888031 + }, + { + "entropy": 9.521127700805664, + "epoch": 1.5422187067431283, + "mean_token_accuracy": 0.8902255892753601, + "num_tokens": 45202435.0, + "step": 15598, + "train/ce_loss": 0.2402624785900116 + }, + { + "epoch": 1.5422187067431283, + "step": 15598, + "train/sim_loss": 0.051277875900268555 + }, + { + "epoch": 1.5422187067431283, + "step": 15598, + "train/total_loss": 0.07530412077903748 + }, + { + "entropy": 9.650398254394531, + "epoch": 1.5423175795926438, + "mean_token_accuracy": 0.8435583114624023, + "num_tokens": 45215176.0, + "step": 15599, + "train/ce_loss": 0.572933554649353 + }, + { + "epoch": 1.5423175795926438, + "step": 15599, + "train/sim_loss": 0.04240846633911133 + }, + { + "epoch": 1.5423175795926438, + "step": 15599, + "train/total_loss": 0.09970182180404663 + }, + { + "epoch": 1.5424164524421595, + "grad_norm": 0.6489714980125427, + "learning_rate": 6.1457251644167534e-06, + "loss": 0.0772, + "step": 15600 + }, + { + "entropy": 9.08685302734375, + "epoch": 1.5424164524421595, + "mean_token_accuracy": 0.9005449414253235, + "num_tokens": 45221939.0, + "step": 15600, + "train/ce_loss": 9.64003106673772e-07 + }, + { + "epoch": 1.5424164524421595, + "step": 15600, + "train/sim_loss": 0.04302126169204712 + }, + { + "epoch": 1.5424164524421595, + "step": 15600, + "train/total_loss": 0.04302135854959488 + }, + { + "entropy": 9.280653953552246, + "epoch": 1.5425153252916748, + "mean_token_accuracy": 0.8578784465789795, + "num_tokens": 45236791.0, + "step": 15601, + "train/ce_loss": 0.5464820861816406 + }, + { + "epoch": 1.5425153252916748, + "step": 15601, + "train/sim_loss": 0.05272543430328369 + }, + { + "epoch": 1.5425153252916748, + "step": 15601, + "train/total_loss": 0.10737363994121552 + }, + { + "entropy": 9.526152610778809, + "epoch": 1.5426141981411905, + "mean_token_accuracy": 0.8918918967247009, + "num_tokens": 45256190.0, + "step": 15602, + "train/ce_loss": 0.5304763317108154 + }, + { + "epoch": 1.5426141981411905, + "step": 15602, + "train/sim_loss": 0.015118837356567383 + }, + { + "epoch": 1.5426141981411905, + "step": 15602, + "train/total_loss": 0.06816647201776505 + }, + { + "entropy": 9.003318786621094, + "epoch": 1.542713070990706, + "mean_token_accuracy": 0.8713826537132263, + "num_tokens": 45266970.0, + "step": 15603, + "train/ce_loss": 0.25699150562286377 + }, + { + "epoch": 1.542713070990706, + "step": 15603, + "train/sim_loss": 0.04376643896102905 + }, + { + "epoch": 1.542713070990706, + "step": 15603, + "train/total_loss": 0.06946559250354767 + }, + { + "entropy": 9.796723365783691, + "epoch": 1.5428119438402215, + "mean_token_accuracy": 0.8885191082954407, + "num_tokens": 45279764.0, + "step": 15604, + "train/ce_loss": 3.4116087022084685e-07 + }, + { + "epoch": 1.5428119438402215, + "step": 15604, + "train/sim_loss": 0.02262401580810547 + }, + { + "epoch": 1.5428119438402215, + "step": 15604, + "train/total_loss": 0.022624049335718155 + }, + { + "entropy": 9.09741497039795, + "epoch": 1.542910816689737, + "mean_token_accuracy": 0.8458289504051208, + "num_tokens": 45292777.0, + "step": 15605, + "train/ce_loss": 0.552160382270813 + }, + { + "epoch": 1.542910816689737, + "step": 15605, + "train/sim_loss": 0.031004667282104492 + }, + { + "epoch": 1.542910816689737, + "step": 15605, + "train/total_loss": 0.08622071146965027 + }, + { + "entropy": 9.4090576171875, + "epoch": 1.5430096895392524, + "mean_token_accuracy": 0.8305084705352783, + "num_tokens": 45305719.0, + "step": 15606, + "train/ce_loss": 0.411599338054657 + }, + { + "epoch": 1.5430096895392524, + "step": 15606, + "train/sim_loss": 0.05060243606567383 + }, + { + "epoch": 1.5430096895392524, + "step": 15606, + "train/total_loss": 0.09176237136125565 + }, + { + "entropy": 9.223952293395996, + "epoch": 1.5431085623887681, + "mean_token_accuracy": 0.7931442260742188, + "num_tokens": 45320415.0, + "step": 15607, + "train/ce_loss": 0.459621787071228 + }, + { + "epoch": 1.5431085623887681, + "step": 15607, + "train/sim_loss": 0.021483540534973145 + }, + { + "epoch": 1.5431085623887681, + "step": 15607, + "train/total_loss": 0.06744572520256042 + }, + { + "entropy": 9.005350112915039, + "epoch": 1.5432074352382834, + "mean_token_accuracy": 0.8757575750350952, + "num_tokens": 45328110.0, + "step": 15608, + "train/ce_loss": 5.909675451221119e-07 + }, + { + "epoch": 1.5432074352382834, + "step": 15608, + "train/sim_loss": 0.04314213991165161 + }, + { + "epoch": 1.5432074352382834, + "step": 15608, + "train/total_loss": 0.04314219951629639 + }, + { + "entropy": 9.717281341552734, + "epoch": 1.5433063080877991, + "mean_token_accuracy": 0.8282208442687988, + "num_tokens": 45341060.0, + "step": 15609, + "train/ce_loss": 8.863471521181054e-07 + }, + { + "epoch": 1.5433063080877991, + "step": 15609, + "train/sim_loss": 0.03958117961883545 + }, + { + "epoch": 1.5433063080877991, + "step": 15609, + "train/total_loss": 0.03958126902580261 + }, + { + "entropy": 8.635913848876953, + "epoch": 1.5434051809373146, + "mean_token_accuracy": 0.8631687164306641, + "num_tokens": 45347962.0, + "step": 15610, + "train/ce_loss": 0.399108350276947 + }, + { + "epoch": 1.5434051809373146, + "step": 15610, + "train/sim_loss": 0.033695220947265625 + }, + { + "epoch": 1.5434051809373146, + "step": 15610, + "train/total_loss": 0.07360605895519257 + }, + { + "entropy": 8.74773120880127, + "epoch": 1.54350405378683, + "mean_token_accuracy": 0.8713968992233276, + "num_tokens": 45364370.0, + "step": 15611, + "train/ce_loss": 0.275755912065506 + }, + { + "epoch": 1.54350405378683, + "step": 15611, + "train/sim_loss": 0.03054642677307129 + }, + { + "epoch": 1.54350405378683, + "step": 15611, + "train/total_loss": 0.05812201648950577 + }, + { + "entropy": 9.366189956665039, + "epoch": 1.5436029266363458, + "mean_token_accuracy": 0.8345534205436707, + "num_tokens": 45375759.0, + "step": 15612, + "train/ce_loss": 2.562190672961151e-07 + }, + { + "epoch": 1.5436029266363458, + "step": 15612, + "train/sim_loss": 0.02381730079650879 + }, + { + "epoch": 1.5436029266363458, + "step": 15612, + "train/total_loss": 0.02381732687354088 + }, + { + "entropy": 9.056024551391602, + "epoch": 1.543701799485861, + "mean_token_accuracy": 0.8744588494300842, + "num_tokens": 45388051.0, + "step": 15613, + "train/ce_loss": 3.3027126278284413e-07 + }, + { + "epoch": 1.543701799485861, + "step": 15613, + "train/sim_loss": 0.0631018877029419 + }, + { + "epoch": 1.543701799485861, + "step": 15613, + "train/total_loss": 0.06310191750526428 + }, + { + "entropy": 9.628496170043945, + "epoch": 1.5438006723353768, + "mean_token_accuracy": 0.8829953074455261, + "num_tokens": 45396763.0, + "step": 15614, + "train/ce_loss": 0.2917753756046295 + }, + { + "epoch": 1.5438006723353768, + "step": 15614, + "train/sim_loss": 0.029671847820281982 + }, + { + "epoch": 1.5438006723353768, + "step": 15614, + "train/total_loss": 0.05884938687086105 + }, + { + "entropy": 9.354480743408203, + "epoch": 1.5438995451848923, + "mean_token_accuracy": 0.8341708779335022, + "num_tokens": 45410182.0, + "step": 15615, + "train/ce_loss": 0.29530656337738037 + }, + { + "epoch": 1.5438995451848923, + "step": 15615, + "train/sim_loss": 0.020443081855773926 + }, + { + "epoch": 1.5438995451848923, + "step": 15615, + "train/total_loss": 0.0499737411737442 + }, + { + "entropy": 9.287147521972656, + "epoch": 1.5439984180344077, + "mean_token_accuracy": 0.8608247637748718, + "num_tokens": 45425446.0, + "step": 15616, + "train/ce_loss": 0.33112531900405884 + }, + { + "epoch": 1.5439984180344077, + "step": 15616, + "train/sim_loss": 0.022794902324676514 + }, + { + "epoch": 1.5439984180344077, + "step": 15616, + "train/total_loss": 0.05590743571519852 + }, + { + "entropy": 9.251413345336914, + "epoch": 1.5440972908839232, + "mean_token_accuracy": 0.8082352876663208, + "num_tokens": 45441186.0, + "step": 15617, + "train/ce_loss": 4.168009297700337e-07 + }, + { + "epoch": 1.5440972908839232, + "step": 15617, + "train/sim_loss": 0.039984166622161865 + }, + { + "epoch": 1.5440972908839232, + "step": 15617, + "train/total_loss": 0.03998420760035515 + }, + { + "entropy": 9.032806396484375, + "epoch": 1.5441961637334387, + "mean_token_accuracy": 0.8484472036361694, + "num_tokens": 45451025.0, + "step": 15618, + "train/ce_loss": 0.7598740458488464 + }, + { + "epoch": 1.5441961637334387, + "step": 15618, + "train/sim_loss": 0.05614185333251953 + }, + { + "epoch": 1.5441961637334387, + "step": 15618, + "train/total_loss": 0.1321292519569397 + }, + { + "entropy": 9.566598892211914, + "epoch": 1.5442950365829544, + "mean_token_accuracy": 0.948849081993103, + "num_tokens": 45456816.0, + "step": 15619, + "train/ce_loss": 0.5443524122238159 + }, + { + "epoch": 1.5442950365829544, + "step": 15619, + "train/sim_loss": 0.01848655939102173 + }, + { + "epoch": 1.5442950365829544, + "step": 15619, + "train/total_loss": 0.07292179763317108 + }, + { + "epoch": 1.5443939094324697, + "grad_norm": 0.38172489404678345, + "learning_rate": 6.140780299658805e-06, + "loss": 0.0816, + "step": 15620 + }, + { + "entropy": 9.603689193725586, + "epoch": 1.5443939094324697, + "mean_token_accuracy": 0.8601226806640625, + "num_tokens": 45471836.0, + "step": 15620, + "train/ce_loss": 0.19885803759098053 + }, + { + "epoch": 1.5443939094324697, + "step": 15620, + "train/sim_loss": 0.018852949142456055 + }, + { + "epoch": 1.5443939094324697, + "step": 15620, + "train/total_loss": 0.03873875364661217 + }, + { + "entropy": 9.224769592285156, + "epoch": 1.5444927822819854, + "mean_token_accuracy": 0.849056601524353, + "num_tokens": 45481460.0, + "step": 15621, + "train/ce_loss": 0.5553520321846008 + }, + { + "epoch": 1.5444927822819854, + "step": 15621, + "train/sim_loss": 0.03884875774383545 + }, + { + "epoch": 1.5444927822819854, + "step": 15621, + "train/total_loss": 0.09438396245241165 + }, + { + "entropy": 9.134050369262695, + "epoch": 1.5445916551315009, + "mean_token_accuracy": 0.7921478152275085, + "num_tokens": 45493824.0, + "step": 15622, + "train/ce_loss": 0.440762996673584 + }, + { + "epoch": 1.5445916551315009, + "step": 15622, + "train/sim_loss": 0.01893007755279541 + }, + { + "epoch": 1.5445916551315009, + "step": 15622, + "train/total_loss": 0.06300637871026993 + }, + { + "entropy": 9.302806854248047, + "epoch": 1.5446905279810164, + "mean_token_accuracy": 0.8324872851371765, + "num_tokens": 45503272.0, + "step": 15623, + "train/ce_loss": 0.2812969386577606 + }, + { + "epoch": 1.5446905279810164, + "step": 15623, + "train/sim_loss": 0.027310431003570557 + }, + { + "epoch": 1.5446905279810164, + "step": 15623, + "train/total_loss": 0.05544012784957886 + }, + { + "entropy": 8.99970817565918, + "epoch": 1.544789400830532, + "mean_token_accuracy": 0.8785625696182251, + "num_tokens": 45511185.0, + "step": 15624, + "train/ce_loss": 0.21131588518619537 + }, + { + "epoch": 1.544789400830532, + "step": 15624, + "train/sim_loss": 0.011223554611206055 + }, + { + "epoch": 1.544789400830532, + "step": 15624, + "train/total_loss": 0.03235514461994171 + }, + { + "entropy": 9.43292236328125, + "epoch": 1.5448882736800473, + "mean_token_accuracy": 0.8218085169792175, + "num_tokens": 45527604.0, + "step": 15625, + "train/ce_loss": 0.4765556752681732 + }, + { + "epoch": 1.5448882736800473, + "step": 15625, + "train/sim_loss": 0.03524714708328247 + }, + { + "epoch": 1.5448882736800473, + "step": 15625, + "train/total_loss": 0.08290271461009979 + }, + { + "entropy": 9.328231811523438, + "epoch": 1.544987146529563, + "mean_token_accuracy": 0.819327712059021, + "num_tokens": 45546899.0, + "step": 15626, + "train/ce_loss": 0.6472474932670593 + }, + { + "epoch": 1.544987146529563, + "step": 15626, + "train/sim_loss": 0.021121740341186523 + }, + { + "epoch": 1.544987146529563, + "step": 15626, + "train/total_loss": 0.08584649115800858 + }, + { + "entropy": 9.719928741455078, + "epoch": 1.5450860193790785, + "mean_token_accuracy": 0.8967136144638062, + "num_tokens": 45558319.0, + "step": 15627, + "train/ce_loss": 7.79041045007034e-07 + }, + { + "epoch": 1.5450860193790785, + "step": 15627, + "train/sim_loss": 0.04966336488723755 + }, + { + "epoch": 1.5450860193790785, + "step": 15627, + "train/total_loss": 0.049663443118333817 + }, + { + "entropy": 9.4605073928833, + "epoch": 1.545184892228594, + "mean_token_accuracy": 0.9113923907279968, + "num_tokens": 45569766.0, + "step": 15628, + "train/ce_loss": 0.23414388298988342 + }, + { + "epoch": 1.545184892228594, + "step": 15628, + "train/sim_loss": 0.013228356838226318 + }, + { + "epoch": 1.545184892228594, + "step": 15628, + "train/total_loss": 0.03664274513721466 + }, + { + "entropy": 9.030471801757812, + "epoch": 1.5452837650781097, + "mean_token_accuracy": 0.8262757658958435, + "num_tokens": 45585098.0, + "step": 15629, + "train/ce_loss": 0.408565491437912 + }, + { + "epoch": 1.5452837650781097, + "step": 15629, + "train/sim_loss": 0.04763782024383545 + }, + { + "epoch": 1.5452837650781097, + "step": 15629, + "train/total_loss": 0.08849437534809113 + }, + { + "entropy": 9.385357856750488, + "epoch": 1.545382637927625, + "mean_token_accuracy": 0.8695090413093567, + "num_tokens": 45595305.0, + "step": 15630, + "train/ce_loss": 0.29603928327560425 + }, + { + "epoch": 1.545382637927625, + "step": 15630, + "train/sim_loss": 0.05909925699234009 + }, + { + "epoch": 1.545382637927625, + "step": 15630, + "train/total_loss": 0.08870318531990051 + }, + { + "entropy": 9.698470115661621, + "epoch": 1.5454815107771407, + "mean_token_accuracy": 0.8852813839912415, + "num_tokens": 45601091.0, + "step": 15631, + "train/ce_loss": 7.988373909029178e-07 + }, + { + "epoch": 1.5454815107771407, + "step": 15631, + "train/sim_loss": 0.07124412059783936 + }, + { + "epoch": 1.5454815107771407, + "step": 15631, + "train/total_loss": 0.07124420255422592 + }, + { + "entropy": 8.91831111907959, + "epoch": 1.5455803836266562, + "mean_token_accuracy": 0.8365650773048401, + "num_tokens": 45613255.0, + "step": 15632, + "train/ce_loss": 0.6912111639976501 + }, + { + "epoch": 1.5455803836266562, + "step": 15632, + "train/sim_loss": 0.01475512981414795 + }, + { + "epoch": 1.5455803836266562, + "step": 15632, + "train/total_loss": 0.08387624472379684 + }, + { + "entropy": 8.981664657592773, + "epoch": 1.5456792564761717, + "mean_token_accuracy": 0.8541666865348816, + "num_tokens": 45622928.0, + "step": 15633, + "train/ce_loss": 0.30597230792045593 + }, + { + "epoch": 1.5456792564761717, + "step": 15633, + "train/sim_loss": 0.032704710960388184 + }, + { + "epoch": 1.5456792564761717, + "step": 15633, + "train/total_loss": 0.0633019432425499 + }, + { + "entropy": 9.269279479980469, + "epoch": 1.5457781293256871, + "mean_token_accuracy": 0.8501259684562683, + "num_tokens": 45631173.0, + "step": 15634, + "train/ce_loss": 1.9697878883562225e-07 + }, + { + "epoch": 1.5457781293256871, + "step": 15634, + "train/sim_loss": 0.022446632385253906 + }, + { + "epoch": 1.5457781293256871, + "step": 15634, + "train/total_loss": 0.022446652874350548 + }, + { + "entropy": 8.981874465942383, + "epoch": 1.5458770021752026, + "mean_token_accuracy": 0.8912237286567688, + "num_tokens": 45642999.0, + "step": 15635, + "train/ce_loss": 0.3740626573562622 + }, + { + "epoch": 1.5458770021752026, + "step": 15635, + "train/sim_loss": 0.04887950420379639 + }, + { + "epoch": 1.5458770021752026, + "step": 15635, + "train/total_loss": 0.08628576993942261 + }, + { + "entropy": 9.336723327636719, + "epoch": 1.5459758750247183, + "mean_token_accuracy": 0.8575711846351624, + "num_tokens": 45654096.0, + "step": 15636, + "train/ce_loss": 0.6536567211151123 + }, + { + "epoch": 1.5459758750247183, + "step": 15636, + "train/sim_loss": 0.056222498416900635 + }, + { + "epoch": 1.5459758750247183, + "step": 15636, + "train/total_loss": 0.12158817052841187 + }, + { + "entropy": 9.430076599121094, + "epoch": 1.5460747478742336, + "mean_token_accuracy": 0.8533772826194763, + "num_tokens": 45665890.0, + "step": 15637, + "train/ce_loss": 4.5954956817695347e-07 + }, + { + "epoch": 1.5460747478742336, + "step": 15637, + "train/sim_loss": 0.029237031936645508 + }, + { + "epoch": 1.5460747478742336, + "step": 15637, + "train/total_loss": 0.02923707850277424 + }, + { + "entropy": 9.639629364013672, + "epoch": 1.5461736207237493, + "mean_token_accuracy": 0.9067055583000183, + "num_tokens": 45671854.0, + "step": 15638, + "train/ce_loss": 0.5564567446708679 + }, + { + "epoch": 1.5461736207237493, + "step": 15638, + "train/sim_loss": 0.03619194030761719 + }, + { + "epoch": 1.5461736207237493, + "step": 15638, + "train/total_loss": 0.09183761477470398 + }, + { + "entropy": 9.880377769470215, + "epoch": 1.5462724935732648, + "mean_token_accuracy": 0.8880455493927002, + "num_tokens": 45680184.0, + "step": 15639, + "train/ce_loss": 1.8935903653982677e-06 + }, + { + "epoch": 1.5462724935732648, + "step": 15639, + "train/sim_loss": 0.056058526039123535 + }, + { + "epoch": 1.5462724935732648, + "step": 15639, + "train/total_loss": 0.05605871602892876 + }, + { + "epoch": 1.5463713664227803, + "grad_norm": 0.5967413783073425, + "learning_rate": 6.135835434900856e-06, + "loss": 0.0802, + "step": 15640 + }, + { + "entropy": 9.253717422485352, + "epoch": 1.5463713664227803, + "mean_token_accuracy": 0.8168643116950989, + "num_tokens": 45690104.0, + "step": 15640, + "train/ce_loss": 0.4006483554840088 + }, + { + "epoch": 1.5463713664227803, + "step": 15640, + "train/sim_loss": 0.05276530981063843 + }, + { + "epoch": 1.5463713664227803, + "step": 15640, + "train/total_loss": 0.09283015131950378 + }, + { + "entropy": 8.652387619018555, + "epoch": 1.546470239272296, + "mean_token_accuracy": 0.84375, + "num_tokens": 45698279.0, + "step": 15641, + "train/ce_loss": 0.5111157894134521 + }, + { + "epoch": 1.546470239272296, + "step": 15641, + "train/sim_loss": 0.020316481590270996 + }, + { + "epoch": 1.546470239272296, + "step": 15641, + "train/total_loss": 0.07142806053161621 + }, + { + "entropy": 9.17073917388916, + "epoch": 1.5465691121218113, + "mean_token_accuracy": 0.8430296182632446, + "num_tokens": 45711668.0, + "step": 15642, + "train/ce_loss": 0.5619246959686279 + }, + { + "epoch": 1.5465691121218113, + "step": 15642, + "train/sim_loss": 0.078635573387146 + }, + { + "epoch": 1.5465691121218113, + "step": 15642, + "train/total_loss": 0.13482804596424103 + }, + { + "entropy": 9.483931541442871, + "epoch": 1.546667984971327, + "mean_token_accuracy": 0.8526785969734192, + "num_tokens": 45728237.0, + "step": 15643, + "train/ce_loss": 0.5516900420188904 + }, + { + "epoch": 1.546667984971327, + "step": 15643, + "train/sim_loss": 0.08701145648956299 + }, + { + "epoch": 1.546667984971327, + "step": 15643, + "train/total_loss": 0.1421804577112198 + }, + { + "entropy": 9.197301864624023, + "epoch": 1.5467668578208424, + "mean_token_accuracy": 0.8521634340286255, + "num_tokens": 45740850.0, + "step": 15644, + "train/ce_loss": 0.4864870309829712 + }, + { + "epoch": 1.5467668578208424, + "step": 15644, + "train/sim_loss": 0.03586232662200928 + }, + { + "epoch": 1.5467668578208424, + "step": 15644, + "train/total_loss": 0.08451102674007416 + }, + { + "entropy": 8.983429908752441, + "epoch": 1.546865730670358, + "mean_token_accuracy": 0.8337469100952148, + "num_tokens": 45748776.0, + "step": 15645, + "train/ce_loss": 0.8629199862480164 + }, + { + "epoch": 1.546865730670358, + "step": 15645, + "train/sim_loss": 0.06376194953918457 + }, + { + "epoch": 1.546865730670358, + "step": 15645, + "train/total_loss": 0.1500539481639862 + }, + { + "entropy": 9.26438045501709, + "epoch": 1.5469646035198734, + "mean_token_accuracy": 0.8815330862998962, + "num_tokens": 45758036.0, + "step": 15646, + "train/ce_loss": 0.3230993449687958 + }, + { + "epoch": 1.5469646035198734, + "step": 15646, + "train/sim_loss": 0.014527082443237305 + }, + { + "epoch": 1.5469646035198734, + "step": 15646, + "train/total_loss": 0.04683701694011688 + }, + { + "entropy": 9.121455192565918, + "epoch": 1.547063476369389, + "mean_token_accuracy": 0.8324552178382874, + "num_tokens": 45766855.0, + "step": 15647, + "train/ce_loss": 0.5624510645866394 + }, + { + "epoch": 1.547063476369389, + "step": 15647, + "train/sim_loss": 0.06701231002807617 + }, + { + "epoch": 1.547063476369389, + "step": 15647, + "train/total_loss": 0.12325741350650787 + }, + { + "entropy": 9.351839065551758, + "epoch": 1.5471623492189046, + "mean_token_accuracy": 0.8349650502204895, + "num_tokens": 45783495.0, + "step": 15648, + "train/ce_loss": 0.6007263660430908 + }, + { + "epoch": 1.5471623492189046, + "step": 15648, + "train/sim_loss": 0.04470717906951904 + }, + { + "epoch": 1.5471623492189046, + "step": 15648, + "train/total_loss": 0.10477981716394424 + }, + { + "entropy": 9.50648021697998, + "epoch": 1.5472612220684199, + "mean_token_accuracy": 0.8755690455436707, + "num_tokens": 45799557.0, + "step": 15649, + "train/ce_loss": 0.35850897431373596 + }, + { + "epoch": 1.5472612220684199, + "step": 15649, + "train/sim_loss": 0.026557743549346924 + }, + { + "epoch": 1.5472612220684199, + "step": 15649, + "train/total_loss": 0.06240864098072052 + }, + { + "entropy": 9.033809661865234, + "epoch": 1.5473600949179356, + "mean_token_accuracy": 0.8456864953041077, + "num_tokens": 45807848.0, + "step": 15650, + "train/ce_loss": 0.4865289628505707 + }, + { + "epoch": 1.5473600949179356, + "step": 15650, + "train/sim_loss": 0.012995779514312744 + }, + { + "epoch": 1.5473600949179356, + "step": 15650, + "train/total_loss": 0.06164867803454399 + }, + { + "entropy": 8.8345365524292, + "epoch": 1.547458967767451, + "mean_token_accuracy": 0.8427272439002991, + "num_tokens": 45819245.0, + "step": 15651, + "train/ce_loss": 0.33822450041770935 + }, + { + "epoch": 1.547458967767451, + "step": 15651, + "train/sim_loss": 0.02201920747756958 + }, + { + "epoch": 1.547458967767451, + "step": 15651, + "train/total_loss": 0.055841658264398575 + }, + { + "entropy": 8.672144889831543, + "epoch": 1.5475578406169666, + "mean_token_accuracy": 0.8947368264198303, + "num_tokens": 45830284.0, + "step": 15652, + "train/ce_loss": 0.21627303957939148 + }, + { + "epoch": 1.5475578406169666, + "step": 15652, + "train/sim_loss": 0.048229098320007324 + }, + { + "epoch": 1.5475578406169666, + "step": 15652, + "train/total_loss": 0.06985640525817871 + }, + { + "entropy": 9.428756713867188, + "epoch": 1.5476567134664823, + "mean_token_accuracy": 0.8731988668441772, + "num_tokens": 45840249.0, + "step": 15653, + "train/ce_loss": 6.348419105961511e-07 + }, + { + "epoch": 1.5476567134664823, + "step": 15653, + "train/sim_loss": 0.026137590408325195 + }, + { + "epoch": 1.5476567134664823, + "step": 15653, + "train/total_loss": 0.02613765373826027 + }, + { + "entropy": 9.21458625793457, + "epoch": 1.5477555863159975, + "mean_token_accuracy": 0.817715048789978, + "num_tokens": 45850783.0, + "step": 15654, + "train/ce_loss": 0.6848011612892151 + }, + { + "epoch": 1.5477555863159975, + "step": 15654, + "train/sim_loss": 0.10717016458511353 + }, + { + "epoch": 1.5477555863159975, + "step": 15654, + "train/total_loss": 0.17565028369426727 + }, + { + "entropy": 8.987127304077148, + "epoch": 1.5478544591655132, + "mean_token_accuracy": 0.8321838974952698, + "num_tokens": 45858329.0, + "step": 15655, + "train/ce_loss": 0.4432981312274933 + }, + { + "epoch": 1.5478544591655132, + "step": 15655, + "train/sim_loss": 0.03003096580505371 + }, + { + "epoch": 1.5478544591655132, + "step": 15655, + "train/total_loss": 0.07436078041791916 + }, + { + "entropy": 9.858881950378418, + "epoch": 1.5479533320150287, + "mean_token_accuracy": 0.8854625821113586, + "num_tokens": 45866175.0, + "step": 15656, + "train/ce_loss": 0.623006284236908 + }, + { + "epoch": 1.5479533320150287, + "step": 15656, + "train/sim_loss": 0.03543746471405029 + }, + { + "epoch": 1.5479533320150287, + "step": 15656, + "train/total_loss": 0.09773809462785721 + }, + { + "entropy": 9.131939888000488, + "epoch": 1.5480522048645442, + "mean_token_accuracy": 0.8695175647735596, + "num_tokens": 45878237.0, + "step": 15657, + "train/ce_loss": 0.37292882800102234 + }, + { + "epoch": 1.5480522048645442, + "step": 15657, + "train/sim_loss": 0.012780547142028809 + }, + { + "epoch": 1.5480522048645442, + "step": 15657, + "train/total_loss": 0.05007342994213104 + }, + { + "entropy": 8.784173965454102, + "epoch": 1.5481510777140597, + "mean_token_accuracy": 0.8807157278060913, + "num_tokens": 45894643.0, + "step": 15658, + "train/ce_loss": 0.25356367230415344 + }, + { + "epoch": 1.5481510777140597, + "step": 15658, + "train/sim_loss": 0.07185649871826172 + }, + { + "epoch": 1.5481510777140597, + "step": 15658, + "train/total_loss": 0.09721286594867706 + }, + { + "entropy": 9.754575729370117, + "epoch": 1.5482499505635752, + "mean_token_accuracy": 0.9032257795333862, + "num_tokens": 45909685.0, + "step": 15659, + "train/ce_loss": 2.655915807281417e-07 + }, + { + "epoch": 1.5482499505635752, + "step": 15659, + "train/sim_loss": 0.022112011909484863 + }, + { + "epoch": 1.5482499505635752, + "step": 15659, + "train/total_loss": 0.022112037986516953 + }, + { + "epoch": 1.5483488234130909, + "grad_norm": 0.5699752569198608, + "learning_rate": 6.130890570142907e-06, + "loss": 0.0857, + "step": 15660 + }, + { + "entropy": 9.447016716003418, + "epoch": 1.5483488234130909, + "mean_token_accuracy": 0.8469241857528687, + "num_tokens": 45922850.0, + "step": 15660, + "train/ce_loss": 0.5974963903427124 + }, + { + "epoch": 1.5483488234130909, + "step": 15660, + "train/sim_loss": 0.03526425361633301 + }, + { + "epoch": 1.5483488234130909, + "step": 15660, + "train/total_loss": 0.09501389414072037 + }, + { + "entropy": 9.744436264038086, + "epoch": 1.5484476962626061, + "mean_token_accuracy": 0.8773903250694275, + "num_tokens": 45932612.0, + "step": 15661, + "train/ce_loss": 0.2976418435573578 + }, + { + "epoch": 1.5484476962626061, + "step": 15661, + "train/sim_loss": 0.059799253940582275 + }, + { + "epoch": 1.5484476962626061, + "step": 15661, + "train/total_loss": 0.08956343680620193 + }, + { + "entropy": 9.137310981750488, + "epoch": 1.5485465691121219, + "mean_token_accuracy": 0.8585366010665894, + "num_tokens": 45943051.0, + "step": 15662, + "train/ce_loss": 0.449313759803772 + }, + { + "epoch": 1.5485465691121219, + "step": 15662, + "train/sim_loss": 0.08253085613250732 + }, + { + "epoch": 1.5485465691121219, + "step": 15662, + "train/total_loss": 0.127462238073349 + }, + { + "entropy": 9.020183563232422, + "epoch": 1.5486454419616373, + "mean_token_accuracy": 0.9070904850959778, + "num_tokens": 45957098.0, + "step": 15663, + "train/ce_loss": 0.24208498001098633 + }, + { + "epoch": 1.5486454419616373, + "step": 15663, + "train/sim_loss": 0.00951087474822998 + }, + { + "epoch": 1.5486454419616373, + "step": 15663, + "train/total_loss": 0.03371937572956085 + }, + { + "entropy": 9.582225799560547, + "epoch": 1.5487443148111528, + "mean_token_accuracy": 0.8765880465507507, + "num_tokens": 45972486.0, + "step": 15664, + "train/ce_loss": 0.4694520831108093 + }, + { + "epoch": 1.5487443148111528, + "step": 15664, + "train/sim_loss": 0.03752315044403076 + }, + { + "epoch": 1.5487443148111528, + "step": 15664, + "train/total_loss": 0.08446836471557617 + }, + { + "entropy": 9.225802421569824, + "epoch": 1.5488431876606685, + "mean_token_accuracy": 0.8506787419319153, + "num_tokens": 45987081.0, + "step": 15665, + "train/ce_loss": 0.5049560070037842 + }, + { + "epoch": 1.5488431876606685, + "step": 15665, + "train/sim_loss": 0.02679312229156494 + }, + { + "epoch": 1.5488431876606685, + "step": 15665, + "train/total_loss": 0.07728872448205948 + }, + { + "entropy": 9.246749877929688, + "epoch": 1.5489420605101838, + "mean_token_accuracy": 0.8529411554336548, + "num_tokens": 45999779.0, + "step": 15666, + "train/ce_loss": 0.48936736583709717 + }, + { + "epoch": 1.5489420605101838, + "step": 15666, + "train/sim_loss": 0.016205430030822754 + }, + { + "epoch": 1.5489420605101838, + "step": 15666, + "train/total_loss": 0.06514216959476471 + }, + { + "entropy": 8.710245132446289, + "epoch": 1.5490409333596995, + "mean_token_accuracy": 0.8185185194015503, + "num_tokens": 46011739.0, + "step": 15667, + "train/ce_loss": 0.4321404993534088 + }, + { + "epoch": 1.5490409333596995, + "step": 15667, + "train/sim_loss": 0.04239547252655029 + }, + { + "epoch": 1.5490409333596995, + "step": 15667, + "train/total_loss": 0.08560952544212341 + }, + { + "entropy": 9.755804061889648, + "epoch": 1.549139806209215, + "mean_token_accuracy": 0.8393352031707764, + "num_tokens": 46028801.0, + "step": 15668, + "train/ce_loss": 0.2517149746417999 + }, + { + "epoch": 1.549139806209215, + "step": 15668, + "train/sim_loss": 0.027183949947357178 + }, + { + "epoch": 1.549139806209215, + "step": 15668, + "train/total_loss": 0.05235544592142105 + }, + { + "entropy": 9.880817413330078, + "epoch": 1.5492386790587305, + "mean_token_accuracy": 0.9244851469993591, + "num_tokens": 46043722.0, + "step": 15669, + "train/ce_loss": 0.3622335195541382 + }, + { + "epoch": 1.5492386790587305, + "step": 15669, + "train/sim_loss": 0.032102108001708984 + }, + { + "epoch": 1.5492386790587305, + "step": 15669, + "train/total_loss": 0.0683254599571228 + }, + { + "entropy": 9.158727645874023, + "epoch": 1.549337551908246, + "mean_token_accuracy": 0.8688760995864868, + "num_tokens": 46053059.0, + "step": 15670, + "train/ce_loss": 0.511117160320282 + }, + { + "epoch": 1.549337551908246, + "step": 15670, + "train/sim_loss": 0.031363844871520996 + }, + { + "epoch": 1.549337551908246, + "step": 15670, + "train/total_loss": 0.08247555792331696 + }, + { + "entropy": 8.940417289733887, + "epoch": 1.5494364247577614, + "mean_token_accuracy": 0.8744827508926392, + "num_tokens": 46066080.0, + "step": 15671, + "train/ce_loss": 0.46233823895454407 + }, + { + "epoch": 1.5494364247577614, + "step": 15671, + "train/sim_loss": 0.02050083875656128 + }, + { + "epoch": 1.5494364247577614, + "step": 15671, + "train/total_loss": 0.0667346641421318 + }, + { + "entropy": 9.217689514160156, + "epoch": 1.5495352976072772, + "mean_token_accuracy": 0.8743455410003662, + "num_tokens": 46080369.0, + "step": 15672, + "train/ce_loss": 5.259271915747377e-07 + }, + { + "epoch": 1.5495352976072772, + "step": 15672, + "train/sim_loss": 0.03611648082733154 + }, + { + "epoch": 1.5495352976072772, + "step": 15672, + "train/total_loss": 0.03611653298139572 + }, + { + "entropy": 9.34118938446045, + "epoch": 1.5496341704567924, + "mean_token_accuracy": 0.8667426109313965, + "num_tokens": 46094084.0, + "step": 15673, + "train/ce_loss": 0.6571410298347473 + }, + { + "epoch": 1.5496341704567924, + "step": 15673, + "train/sim_loss": 0.026794075965881348 + }, + { + "epoch": 1.5496341704567924, + "step": 15673, + "train/total_loss": 0.09250818192958832 + }, + { + "entropy": 9.913898468017578, + "epoch": 1.5497330433063081, + "mean_token_accuracy": 0.8673469424247742, + "num_tokens": 46101555.0, + "step": 15674, + "train/ce_loss": 2.1019630480623164e-07 + }, + { + "epoch": 1.5497330433063081, + "step": 15674, + "train/sim_loss": 0.008224546909332275 + }, + { + "epoch": 1.5497330433063081, + "step": 15674, + "train/total_loss": 0.008224568329751492 + }, + { + "entropy": 9.160429954528809, + "epoch": 1.5498319161558236, + "mean_token_accuracy": 0.856249988079071, + "num_tokens": 46110128.0, + "step": 15675, + "train/ce_loss": 0.37166479229927063 + }, + { + "epoch": 1.5498319161558236, + "step": 15675, + "train/sim_loss": 0.04059040546417236 + }, + { + "epoch": 1.5498319161558236, + "step": 15675, + "train/total_loss": 0.07775688171386719 + }, + { + "entropy": 9.386529922485352, + "epoch": 1.549930789005339, + "mean_token_accuracy": 0.8049921989440918, + "num_tokens": 46123941.0, + "step": 15676, + "train/ce_loss": 0.3434840440750122 + }, + { + "epoch": 1.549930789005339, + "step": 15676, + "train/sim_loss": 0.05890423059463501 + }, + { + "epoch": 1.549930789005339, + "step": 15676, + "train/total_loss": 0.09325263649225235 + }, + { + "entropy": 9.30536937713623, + "epoch": 1.5500296618548548, + "mean_token_accuracy": 0.8512628674507141, + "num_tokens": 46144918.0, + "step": 15677, + "train/ce_loss": 0.5052834749221802 + }, + { + "epoch": 1.5500296618548548, + "step": 15677, + "train/sim_loss": 0.042383670806884766 + }, + { + "epoch": 1.5500296618548548, + "step": 15677, + "train/total_loss": 0.09291201829910278 + }, + { + "entropy": 9.446775436401367, + "epoch": 1.55012853470437, + "mean_token_accuracy": 0.8258064389228821, + "num_tokens": 46165737.0, + "step": 15678, + "train/ce_loss": 0.42808395624160767 + }, + { + "epoch": 1.55012853470437, + "step": 15678, + "train/sim_loss": 0.025471746921539307 + }, + { + "epoch": 1.55012853470437, + "step": 15678, + "train/total_loss": 0.06828014552593231 + }, + { + "entropy": 9.462512969970703, + "epoch": 1.5502274075538858, + "mean_token_accuracy": 0.8616822361946106, + "num_tokens": 46176140.0, + "step": 15679, + "train/ce_loss": 1.1960296433244366e-06 + }, + { + "epoch": 1.5502274075538858, + "step": 15679, + "train/sim_loss": 0.043540358543395996 + }, + { + "epoch": 1.5502274075538858, + "step": 15679, + "train/total_loss": 0.04354047775268555 + }, + { + "epoch": 1.5503262804034013, + "grad_norm": 0.5889574289321899, + "learning_rate": 6.125945705384958e-06, + "loss": 0.0761, + "step": 15680 + }, + { + "entropy": 9.315435409545898, + "epoch": 1.5503262804034013, + "mean_token_accuracy": 0.8458213210105896, + "num_tokens": 46185526.0, + "step": 15680, + "train/ce_loss": 0.25557997822761536 + }, + { + "epoch": 1.5503262804034013, + "step": 15680, + "train/sim_loss": 0.023727893829345703 + }, + { + "epoch": 1.5503262804034013, + "step": 15680, + "train/total_loss": 0.0492858923971653 + }, + { + "entropy": 9.189371109008789, + "epoch": 1.5504251532529167, + "mean_token_accuracy": 0.8643215894699097, + "num_tokens": 46198233.0, + "step": 15681, + "train/ce_loss": 0.5020971298217773 + }, + { + "epoch": 1.5504251532529167, + "step": 15681, + "train/sim_loss": 0.05896252393722534 + }, + { + "epoch": 1.5504251532529167, + "step": 15681, + "train/total_loss": 0.10917223989963531 + }, + { + "entropy": 9.77637004852295, + "epoch": 1.5505240261024322, + "mean_token_accuracy": 0.8849315047264099, + "num_tokens": 46213339.0, + "step": 15682, + "train/ce_loss": 5.147184651832504e-07 + }, + { + "epoch": 1.5505240261024322, + "step": 15682, + "train/sim_loss": 0.03213530778884888 + }, + { + "epoch": 1.5505240261024322, + "step": 15682, + "train/total_loss": 0.032135359942913055 + }, + { + "entropy": 9.270858764648438, + "epoch": 1.5506228989519477, + "mean_token_accuracy": 0.8013793230056763, + "num_tokens": 46221883.0, + "step": 15683, + "train/ce_loss": 1.7429080401143437e-07 + }, + { + "epoch": 1.5506228989519477, + "step": 15683, + "train/sim_loss": 0.016737282276153564 + }, + { + "epoch": 1.5506228989519477, + "step": 15683, + "train/total_loss": 0.016737299039959908 + }, + { + "entropy": 9.323690414428711, + "epoch": 1.5507217718014634, + "mean_token_accuracy": 0.8822927474975586, + "num_tokens": 46237757.0, + "step": 15684, + "train/ce_loss": 0.28721001744270325 + }, + { + "epoch": 1.5507217718014634, + "step": 15684, + "train/sim_loss": 0.03906524181365967 + }, + { + "epoch": 1.5507217718014634, + "step": 15684, + "train/total_loss": 0.06778624653816223 + }, + { + "entropy": 9.54666519165039, + "epoch": 1.5508206446509787, + "mean_token_accuracy": 0.8631178736686707, + "num_tokens": 46248100.0, + "step": 15685, + "train/ce_loss": 0.4637601673603058 + }, + { + "epoch": 1.5508206446509787, + "step": 15685, + "train/sim_loss": 0.06465160846710205 + }, + { + "epoch": 1.5508206446509787, + "step": 15685, + "train/total_loss": 0.11102762818336487 + }, + { + "entropy": 9.342275619506836, + "epoch": 1.5509195175004944, + "mean_token_accuracy": 0.8755760192871094, + "num_tokens": 46262665.0, + "step": 15686, + "train/ce_loss": 0.19122350215911865 + }, + { + "epoch": 1.5509195175004944, + "step": 15686, + "train/sim_loss": 0.058706820011138916 + }, + { + "epoch": 1.5509195175004944, + "step": 15686, + "train/total_loss": 0.07782916724681854 + }, + { + "entropy": 9.217941284179688, + "epoch": 1.5510183903500099, + "mean_token_accuracy": 0.8070374727249146, + "num_tokens": 46271401.0, + "step": 15687, + "train/ce_loss": 0.46399083733558655 + }, + { + "epoch": 1.5510183903500099, + "step": 15687, + "train/sim_loss": 0.033850669860839844 + }, + { + "epoch": 1.5510183903500099, + "step": 15687, + "train/total_loss": 0.08024975657463074 + }, + { + "entropy": 9.082212448120117, + "epoch": 1.5511172631995254, + "mean_token_accuracy": 0.8541374206542969, + "num_tokens": 46279912.0, + "step": 15688, + "train/ce_loss": 2.474448308475985e-07 + }, + { + "epoch": 1.5511172631995254, + "step": 15688, + "train/sim_loss": 0.05759066343307495 + }, + { + "epoch": 1.5511172631995254, + "step": 15688, + "train/total_loss": 0.05759068951010704 + }, + { + "entropy": 9.538225173950195, + "epoch": 1.551216136049041, + "mean_token_accuracy": 0.8927202820777893, + "num_tokens": 46289619.0, + "step": 15689, + "train/ce_loss": 4.5855901475988503e-07 + }, + { + "epoch": 1.551216136049041, + "step": 15689, + "train/sim_loss": 0.009071409702301025 + }, + { + "epoch": 1.551216136049041, + "step": 15689, + "train/total_loss": 0.009071455337107182 + }, + { + "entropy": 8.712657928466797, + "epoch": 1.5513150088985563, + "mean_token_accuracy": 0.8431983590126038, + "num_tokens": 46297311.0, + "step": 15690, + "train/ce_loss": 0.7464666962623596 + }, + { + "epoch": 1.5513150088985563, + "step": 15690, + "train/sim_loss": 0.03748154640197754 + }, + { + "epoch": 1.5513150088985563, + "step": 15690, + "train/total_loss": 0.11212822049856186 + }, + { + "entropy": 8.34632396697998, + "epoch": 1.551413881748072, + "mean_token_accuracy": 0.8755292296409607, + "num_tokens": 46307380.0, + "step": 15691, + "train/ce_loss": 0.17705032229423523 + }, + { + "epoch": 1.551413881748072, + "step": 15691, + "train/sim_loss": 0.01648843288421631 + }, + { + "epoch": 1.551413881748072, + "step": 15691, + "train/total_loss": 0.03419346362352371 + }, + { + "entropy": 9.898255348205566, + "epoch": 1.5515127545975875, + "mean_token_accuracy": 0.9240196347236633, + "num_tokens": 46317391.0, + "step": 15692, + "train/ce_loss": 0.5776790380477905 + }, + { + "epoch": 1.5515127545975875, + "step": 15692, + "train/sim_loss": 0.038832008838653564 + }, + { + "epoch": 1.5515127545975875, + "step": 15692, + "train/total_loss": 0.09659991413354874 + }, + { + "entropy": 9.25849437713623, + "epoch": 1.551611627447103, + "mean_token_accuracy": 0.8716049194335938, + "num_tokens": 46329054.0, + "step": 15693, + "train/ce_loss": 0.5960769653320312 + }, + { + "epoch": 1.551611627447103, + "step": 15693, + "train/sim_loss": 0.07372522354125977 + }, + { + "epoch": 1.551611627447103, + "step": 15693, + "train/total_loss": 0.13333292305469513 + }, + { + "entropy": 9.219467163085938, + "epoch": 1.5517105002966185, + "mean_token_accuracy": 0.8369704484939575, + "num_tokens": 46345287.0, + "step": 15694, + "train/ce_loss": 0.4417257308959961 + }, + { + "epoch": 1.5517105002966185, + "step": 15694, + "train/sim_loss": 0.023808956146240234 + }, + { + "epoch": 1.5517105002966185, + "step": 15694, + "train/total_loss": 0.0679815262556076 + }, + { + "entropy": 9.266408920288086, + "epoch": 1.551809373146134, + "mean_token_accuracy": 0.7903056144714355, + "num_tokens": 46355300.0, + "step": 15695, + "train/ce_loss": 0.5011931657791138 + }, + { + "epoch": 1.551809373146134, + "step": 15695, + "train/sim_loss": 0.14752423763275146 + }, + { + "epoch": 1.551809373146134, + "step": 15695, + "train/total_loss": 0.19764354825019836 + }, + { + "entropy": 9.518730163574219, + "epoch": 1.5519082459956497, + "mean_token_accuracy": 0.8437103033065796, + "num_tokens": 46372290.0, + "step": 15696, + "train/ce_loss": 0.407260537147522 + }, + { + "epoch": 1.5519082459956497, + "step": 15696, + "train/sim_loss": 0.054062724113464355 + }, + { + "epoch": 1.5519082459956497, + "step": 15696, + "train/total_loss": 0.09478877484798431 + }, + { + "entropy": 9.12658977508545, + "epoch": 1.552007118845165, + "mean_token_accuracy": 0.7877777814865112, + "num_tokens": 46383752.0, + "step": 15697, + "train/ce_loss": 0.4930509924888611 + }, + { + "epoch": 1.552007118845165, + "step": 15697, + "train/sim_loss": 0.042863309383392334 + }, + { + "epoch": 1.552007118845165, + "step": 15697, + "train/total_loss": 0.0921684056520462 + }, + { + "entropy": 10.147747039794922, + "epoch": 1.5521059916946807, + "mean_token_accuracy": 0.9197860956192017, + "num_tokens": 46394898.0, + "step": 15698, + "train/ce_loss": 3.518875644203945e-07 + }, + { + "epoch": 1.5521059916946807, + "step": 15698, + "train/sim_loss": 0.015013813972473145 + }, + { + "epoch": 1.5521059916946807, + "step": 15698, + "train/total_loss": 0.01501384936273098 + }, + { + "entropy": 9.020627975463867, + "epoch": 1.5522048645441962, + "mean_token_accuracy": 0.7901098728179932, + "num_tokens": 46406735.0, + "step": 15699, + "train/ce_loss": 0.8650935888290405 + }, + { + "epoch": 1.5522048645441962, + "step": 15699, + "train/sim_loss": 0.028690099716186523 + }, + { + "epoch": 1.5522048645441962, + "step": 15699, + "train/total_loss": 0.11519946157932281 + }, + { + "epoch": 1.5523037373937116, + "grad_norm": 0.6774216890335083, + "learning_rate": 6.121000840627009e-06, + "loss": 0.0853, + "step": 15700 + }, + { + "entropy": 9.578407287597656, + "epoch": 1.5523037373937116, + "mean_token_accuracy": 0.9279778599739075, + "num_tokens": 46419297.0, + "step": 15700, + "train/ce_loss": 3.8041085304030275e-07 + }, + { + "epoch": 1.5523037373937116, + "step": 15700, + "train/sim_loss": 0.053363144397735596 + }, + { + "epoch": 1.5523037373937116, + "step": 15700, + "train/total_loss": 0.05336318165063858 + }, + { + "entropy": 8.91268539428711, + "epoch": 1.5524026102432273, + "mean_token_accuracy": 0.8278145790100098, + "num_tokens": 46427128.0, + "step": 15701, + "train/ce_loss": 0.6080796122550964 + }, + { + "epoch": 1.5524026102432273, + "step": 15701, + "train/sim_loss": 0.058782100677490234 + }, + { + "epoch": 1.5524026102432273, + "step": 15701, + "train/total_loss": 0.11959005892276764 + }, + { + "entropy": 9.009777069091797, + "epoch": 1.5525014830927426, + "mean_token_accuracy": 0.8825065493583679, + "num_tokens": 46443896.0, + "step": 15702, + "train/ce_loss": 0.7284564971923828 + }, + { + "epoch": 1.5525014830927426, + "step": 15702, + "train/sim_loss": 0.04567122459411621 + }, + { + "epoch": 1.5525014830927426, + "step": 15702, + "train/total_loss": 0.11851687729358673 + }, + { + "entropy": 9.439438819885254, + "epoch": 1.5526003559422583, + "mean_token_accuracy": 0.801257848739624, + "num_tokens": 46454952.0, + "step": 15703, + "train/ce_loss": 0.47424066066741943 + }, + { + "epoch": 1.5526003559422583, + "step": 15703, + "train/sim_loss": 0.1129065752029419 + }, + { + "epoch": 1.5526003559422583, + "step": 15703, + "train/total_loss": 0.1603306382894516 + }, + { + "entropy": 9.103830337524414, + "epoch": 1.5526992287917738, + "mean_token_accuracy": 0.8907103538513184, + "num_tokens": 46471563.0, + "step": 15704, + "train/ce_loss": 0.2140415608882904 + }, + { + "epoch": 1.5526992287917738, + "step": 15704, + "train/sim_loss": 0.051560938358306885 + }, + { + "epoch": 1.5526992287917738, + "step": 15704, + "train/total_loss": 0.0729650929570198 + }, + { + "entropy": 8.961755752563477, + "epoch": 1.5527981016412893, + "mean_token_accuracy": 0.8017621040344238, + "num_tokens": 46481222.0, + "step": 15705, + "train/ce_loss": 0.7523961067199707 + }, + { + "epoch": 1.5527981016412893, + "step": 15705, + "train/sim_loss": 0.049171388149261475 + }, + { + "epoch": 1.5527981016412893, + "step": 15705, + "train/total_loss": 0.12441100180149078 + }, + { + "entropy": 9.176806449890137, + "epoch": 1.5528969744908048, + "mean_token_accuracy": 0.8468965291976929, + "num_tokens": 46491332.0, + "step": 15706, + "train/ce_loss": 0.37043094635009766 + }, + { + "epoch": 1.5528969744908048, + "step": 15706, + "train/sim_loss": 0.039356887340545654 + }, + { + "epoch": 1.5528969744908048, + "step": 15706, + "train/total_loss": 0.07639998197555542 + }, + { + "entropy": 8.601694107055664, + "epoch": 1.5529958473403203, + "mean_token_accuracy": 0.8274824619293213, + "num_tokens": 46500089.0, + "step": 15707, + "train/ce_loss": 0.5350475311279297 + }, + { + "epoch": 1.5529958473403203, + "step": 15707, + "train/sim_loss": 0.0436931848526001 + }, + { + "epoch": 1.5529958473403203, + "step": 15707, + "train/total_loss": 0.09719793498516083 + }, + { + "entropy": 9.229578018188477, + "epoch": 1.553094720189836, + "mean_token_accuracy": 0.8489042520523071, + "num_tokens": 46511431.0, + "step": 15708, + "train/ce_loss": 0.5429620146751404 + }, + { + "epoch": 1.553094720189836, + "step": 15708, + "train/sim_loss": 0.03311425447463989 + }, + { + "epoch": 1.553094720189836, + "step": 15708, + "train/total_loss": 0.08741045743227005 + }, + { + "entropy": 8.94769287109375, + "epoch": 1.5531935930393512, + "mean_token_accuracy": 0.8413705825805664, + "num_tokens": 46525880.0, + "step": 15709, + "train/ce_loss": 0.3339719772338867 + }, + { + "epoch": 1.5531935930393512, + "step": 15709, + "train/sim_loss": 0.08092594146728516 + }, + { + "epoch": 1.5531935930393512, + "step": 15709, + "train/total_loss": 0.11432313919067383 + }, + { + "entropy": 9.140824317932129, + "epoch": 1.553292465888867, + "mean_token_accuracy": 0.862500011920929, + "num_tokens": 46536423.0, + "step": 15710, + "train/ce_loss": 0.34305503964424133 + }, + { + "epoch": 1.553292465888867, + "step": 15710, + "train/sim_loss": 0.044281601905822754 + }, + { + "epoch": 1.553292465888867, + "step": 15710, + "train/total_loss": 0.078587107360363 + }, + { + "entropy": 9.379253387451172, + "epoch": 1.5533913387383824, + "mean_token_accuracy": 0.8427745699882507, + "num_tokens": 46549942.0, + "step": 15711, + "train/ce_loss": 0.4634197950363159 + }, + { + "epoch": 1.5533913387383824, + "step": 15711, + "train/sim_loss": 0.02615201473236084 + }, + { + "epoch": 1.5533913387383824, + "step": 15711, + "train/total_loss": 0.07249400019645691 + }, + { + "entropy": 8.895564079284668, + "epoch": 1.553490211587898, + "mean_token_accuracy": 0.8534482717514038, + "num_tokens": 46560098.0, + "step": 15712, + "train/ce_loss": 0.5105572938919067 + }, + { + "epoch": 1.553490211587898, + "step": 15712, + "train/sim_loss": 0.0636412501335144 + }, + { + "epoch": 1.553490211587898, + "step": 15712, + "train/total_loss": 0.11469697952270508 + }, + { + "entropy": 9.604105949401855, + "epoch": 1.5535890844374136, + "mean_token_accuracy": 0.8860946893692017, + "num_tokens": 46568611.0, + "step": 15713, + "train/ce_loss": 0.33599594235420227 + }, + { + "epoch": 1.5535890844374136, + "step": 15713, + "train/sim_loss": 0.05145186185836792 + }, + { + "epoch": 1.5535890844374136, + "step": 15713, + "train/total_loss": 0.08505146205425262 + }, + { + "entropy": 9.616985321044922, + "epoch": 1.5536879572869289, + "mean_token_accuracy": 0.8917197585105896, + "num_tokens": 46585990.0, + "step": 15714, + "train/ce_loss": 2.763794384463836e-07 + }, + { + "epoch": 1.5536879572869289, + "step": 15714, + "train/sim_loss": 0.04752439260482788 + }, + { + "epoch": 1.5536879572869289, + "step": 15714, + "train/total_loss": 0.04752441868185997 + }, + { + "entropy": 8.99721908569336, + "epoch": 1.5537868301364446, + "mean_token_accuracy": 0.8760234117507935, + "num_tokens": 46599288.0, + "step": 15715, + "train/ce_loss": 0.4712829291820526 + }, + { + "epoch": 1.5537868301364446, + "step": 15715, + "train/sim_loss": 0.04883730411529541 + }, + { + "epoch": 1.5537868301364446, + "step": 15715, + "train/total_loss": 0.09596559405326843 + }, + { + "entropy": 9.156014442443848, + "epoch": 1.55388570298596, + "mean_token_accuracy": 0.8860164284706116, + "num_tokens": 46607975.0, + "step": 15716, + "train/ce_loss": 0.3094271123409271 + }, + { + "epoch": 1.55388570298596, + "step": 15716, + "train/sim_loss": 0.047161102294921875 + }, + { + "epoch": 1.55388570298596, + "step": 15716, + "train/total_loss": 0.07810381054878235 + }, + { + "entropy": 9.001300811767578, + "epoch": 1.5539845758354756, + "mean_token_accuracy": 0.8515151739120483, + "num_tokens": 46624689.0, + "step": 15717, + "train/ce_loss": 0.2805338203907013 + }, + { + "epoch": 1.5539845758354756, + "step": 15717, + "train/sim_loss": 0.021316826343536377 + }, + { + "epoch": 1.5539845758354756, + "step": 15717, + "train/total_loss": 0.04937020689249039 + }, + { + "entropy": 8.946529388427734, + "epoch": 1.5540834486849913, + "mean_token_accuracy": 0.9034175276756287, + "num_tokens": 46637193.0, + "step": 15718, + "train/ce_loss": 1.884666289697634e-07 + }, + { + "epoch": 1.5540834486849913, + "step": 15718, + "train/sim_loss": 0.0478672981262207 + }, + { + "epoch": 1.5540834486849913, + "step": 15718, + "train/total_loss": 0.047867316752672195 + }, + { + "entropy": 8.753867149353027, + "epoch": 1.5541823215345065, + "mean_token_accuracy": 0.825745701789856, + "num_tokens": 46648217.0, + "step": 15719, + "train/ce_loss": 0.5983554124832153 + }, + { + "epoch": 1.5541823215345065, + "step": 15719, + "train/sim_loss": 0.08314800262451172 + }, + { + "epoch": 1.5541823215345065, + "step": 15719, + "train/total_loss": 0.142983540892601 + }, + { + "epoch": 1.5542811943840222, + "grad_norm": 0.6287860870361328, + "learning_rate": 6.116055975869061e-06, + "loss": 0.0842, + "step": 15720 + }, + { + "entropy": 9.30988883972168, + "epoch": 1.5542811943840222, + "mean_token_accuracy": 0.8105489611625671, + "num_tokens": 46664787.0, + "step": 15720, + "train/ce_loss": 0.6156203150749207 + }, + { + "epoch": 1.5542811943840222, + "step": 15720, + "train/sim_loss": 0.044893860816955566 + }, + { + "epoch": 1.5542811943840222, + "step": 15720, + "train/total_loss": 0.10645589232444763 + }, + { + "entropy": 9.31213665008545, + "epoch": 1.5543800672335377, + "mean_token_accuracy": 0.8539473414421082, + "num_tokens": 46680665.0, + "step": 15721, + "train/ce_loss": 0.39789292216300964 + }, + { + "epoch": 1.5543800672335377, + "step": 15721, + "train/sim_loss": 0.020291566848754883 + }, + { + "epoch": 1.5543800672335377, + "step": 15721, + "train/total_loss": 0.06008085981011391 + }, + { + "entropy": 9.453084945678711, + "epoch": 1.5544789400830532, + "mean_token_accuracy": 0.8723809719085693, + "num_tokens": 46692712.0, + "step": 15722, + "train/ce_loss": 0.24175086617469788 + }, + { + "epoch": 1.5544789400830532, + "step": 15722, + "train/sim_loss": 0.029601335525512695 + }, + { + "epoch": 1.5544789400830532, + "step": 15722, + "train/total_loss": 0.053776420652866364 + }, + { + "entropy": 8.957817077636719, + "epoch": 1.5545778129325687, + "mean_token_accuracy": 0.8462623357772827, + "num_tokens": 46705857.0, + "step": 15723, + "train/ce_loss": 0.5123854279518127 + }, + { + "epoch": 1.5545778129325687, + "step": 15723, + "train/sim_loss": 0.03688013553619385 + }, + { + "epoch": 1.5545778129325687, + "step": 15723, + "train/total_loss": 0.08811867982149124 + }, + { + "entropy": 9.248348236083984, + "epoch": 1.5546766857820842, + "mean_token_accuracy": 0.8362573385238647, + "num_tokens": 46724981.0, + "step": 15724, + "train/ce_loss": 0.6156405806541443 + }, + { + "epoch": 1.5546766857820842, + "step": 15724, + "train/sim_loss": 0.08550000190734863 + }, + { + "epoch": 1.5546766857820842, + "step": 15724, + "train/total_loss": 0.14706405997276306 + }, + { + "entropy": 9.066566467285156, + "epoch": 1.5547755586315999, + "mean_token_accuracy": 0.8930041193962097, + "num_tokens": 46737783.0, + "step": 15725, + "train/ce_loss": 0.5002317428588867 + }, + { + "epoch": 1.5547755586315999, + "step": 15725, + "train/sim_loss": 0.00919950008392334 + }, + { + "epoch": 1.5547755586315999, + "step": 15725, + "train/total_loss": 0.05922267585992813 + }, + { + "entropy": 9.165069580078125, + "epoch": 1.5548744314811151, + "mean_token_accuracy": 0.8720496892929077, + "num_tokens": 46750309.0, + "step": 15726, + "train/ce_loss": 0.3764681816101074 + }, + { + "epoch": 1.5548744314811151, + "step": 15726, + "train/sim_loss": 0.04527616500854492 + }, + { + "epoch": 1.5548744314811151, + "step": 15726, + "train/total_loss": 0.08292298018932343 + }, + { + "entropy": 9.480690002441406, + "epoch": 1.5549733043306309, + "mean_token_accuracy": 0.8592848777770996, + "num_tokens": 46759992.0, + "step": 15727, + "train/ce_loss": 0.5822692513465881 + }, + { + "epoch": 1.5549733043306309, + "step": 15727, + "train/sim_loss": 0.03837031126022339 + }, + { + "epoch": 1.5549733043306309, + "step": 15727, + "train/total_loss": 0.09659723937511444 + }, + { + "entropy": 9.573526382446289, + "epoch": 1.5550721771801463, + "mean_token_accuracy": 0.8369747996330261, + "num_tokens": 46773750.0, + "step": 15728, + "train/ce_loss": 4.4236622898097266e-07 + }, + { + "epoch": 1.5550721771801463, + "step": 15728, + "train/sim_loss": 0.04155278205871582 + }, + { + "epoch": 1.5550721771801463, + "step": 15728, + "train/total_loss": 0.0415528267621994 + }, + { + "entropy": 9.603748321533203, + "epoch": 1.5551710500296618, + "mean_token_accuracy": 0.8398983478546143, + "num_tokens": 46784274.0, + "step": 15729, + "train/ce_loss": 0.7056593298912048 + }, + { + "epoch": 1.5551710500296618, + "step": 15729, + "train/sim_loss": 0.033602893352508545 + }, + { + "epoch": 1.5551710500296618, + "step": 15729, + "train/total_loss": 0.10416882485151291 + }, + { + "entropy": 9.467205047607422, + "epoch": 1.5552699228791775, + "mean_token_accuracy": 0.879059374332428, + "num_tokens": 46798973.0, + "step": 15730, + "train/ce_loss": 0.4651127755641937 + }, + { + "epoch": 1.5552699228791775, + "step": 15730, + "train/sim_loss": 0.02748161554336548 + }, + { + "epoch": 1.5552699228791775, + "step": 15730, + "train/total_loss": 0.07399289309978485 + }, + { + "entropy": 9.084956169128418, + "epoch": 1.5553687957286928, + "mean_token_accuracy": 0.8776881694793701, + "num_tokens": 46807129.0, + "step": 15731, + "train/ce_loss": 0.6654130816459656 + }, + { + "epoch": 1.5553687957286928, + "step": 15731, + "train/sim_loss": 0.04093468189239502 + }, + { + "epoch": 1.5553687957286928, + "step": 15731, + "train/total_loss": 0.10747598856687546 + }, + { + "entropy": 9.204830169677734, + "epoch": 1.5554676685782085, + "mean_token_accuracy": 0.8281081318855286, + "num_tokens": 46820510.0, + "step": 15732, + "train/ce_loss": 0.36140912771224976 + }, + { + "epoch": 1.5554676685782085, + "step": 15732, + "train/sim_loss": 0.04867076873779297 + }, + { + "epoch": 1.5554676685782085, + "step": 15732, + "train/total_loss": 0.08481168746948242 + }, + { + "entropy": 9.357643127441406, + "epoch": 1.555566541427724, + "mean_token_accuracy": 0.8614097833633423, + "num_tokens": 46836499.0, + "step": 15733, + "train/ce_loss": 0.5563905239105225 + }, + { + "epoch": 1.555566541427724, + "step": 15733, + "train/sim_loss": 0.026948213577270508 + }, + { + "epoch": 1.555566541427724, + "step": 15733, + "train/total_loss": 0.08258727192878723 + }, + { + "entropy": 9.351432800292969, + "epoch": 1.5556654142772395, + "mean_token_accuracy": 0.8844765424728394, + "num_tokens": 46846168.0, + "step": 15734, + "train/ce_loss": 0.5414650440216064 + }, + { + "epoch": 1.5556654142772395, + "step": 15734, + "train/sim_loss": 0.0553586483001709 + }, + { + "epoch": 1.5556654142772395, + "step": 15734, + "train/total_loss": 0.10950515419244766 + }, + { + "entropy": 9.45409107208252, + "epoch": 1.555764287126755, + "mean_token_accuracy": 0.851951539516449, + "num_tokens": 46854770.0, + "step": 15735, + "train/ce_loss": 0.5437065362930298 + }, + { + "epoch": 1.555764287126755, + "step": 15735, + "train/sim_loss": 0.03444373607635498 + }, + { + "epoch": 1.555764287126755, + "step": 15735, + "train/total_loss": 0.0888143926858902 + }, + { + "entropy": 9.003011703491211, + "epoch": 1.5558631599762704, + "mean_token_accuracy": 0.9105392098426819, + "num_tokens": 46862935.0, + "step": 15736, + "train/ce_loss": 0.27863824367523193 + }, + { + "epoch": 1.5558631599762704, + "step": 15736, + "train/sim_loss": 0.04053312540054321 + }, + { + "epoch": 1.5558631599762704, + "step": 15736, + "train/total_loss": 0.06839694827795029 + }, + { + "entropy": 9.461055755615234, + "epoch": 1.5559620328257862, + "mean_token_accuracy": 0.8905472755432129, + "num_tokens": 46876433.0, + "step": 15737, + "train/ce_loss": 0.4267282485961914 + }, + { + "epoch": 1.5559620328257862, + "step": 15737, + "train/sim_loss": 0.04148739576339722 + }, + { + "epoch": 1.5559620328257862, + "step": 15737, + "train/total_loss": 0.0841602236032486 + }, + { + "entropy": 9.33540153503418, + "epoch": 1.5560609056753014, + "mean_token_accuracy": 0.8847059011459351, + "num_tokens": 46886783.0, + "step": 15738, + "train/ce_loss": 0.6638190746307373 + }, + { + "epoch": 1.5560609056753014, + "step": 15738, + "train/sim_loss": 0.03718447685241699 + }, + { + "epoch": 1.5560609056753014, + "step": 15738, + "train/total_loss": 0.10356638580560684 + }, + { + "entropy": 9.09779167175293, + "epoch": 1.5561597785248171, + "mean_token_accuracy": 0.8927738666534424, + "num_tokens": 46898751.0, + "step": 15739, + "train/ce_loss": 0.32453593611717224 + }, + { + "epoch": 1.5561597785248171, + "step": 15739, + "train/sim_loss": 0.015806257724761963 + }, + { + "epoch": 1.5561597785248171, + "step": 15739, + "train/total_loss": 0.04825985059142113 + }, + { + "epoch": 1.5562586513743326, + "grad_norm": 0.4372722804546356, + "learning_rate": 6.111111111111112e-06, + "loss": 0.0834, + "step": 15740 + }, + { + "entropy": 9.536203384399414, + "epoch": 1.5562586513743326, + "mean_token_accuracy": 0.8629737496376038, + "num_tokens": 46913186.0, + "step": 15740, + "train/ce_loss": 2.523179603031167e-07 + }, + { + "epoch": 1.5562586513743326, + "step": 15740, + "train/sim_loss": 0.024775564670562744 + }, + { + "epoch": 1.5562586513743326, + "step": 15740, + "train/total_loss": 0.024775590747594833 + }, + { + "entropy": 9.199480056762695, + "epoch": 1.556357524223848, + "mean_token_accuracy": 0.875, + "num_tokens": 46924373.0, + "step": 15741, + "train/ce_loss": 0.4662700891494751 + }, + { + "epoch": 1.556357524223848, + "step": 15741, + "train/sim_loss": 0.05383390188217163 + }, + { + "epoch": 1.556357524223848, + "step": 15741, + "train/total_loss": 0.10046091675758362 + }, + { + "entropy": 9.18977165222168, + "epoch": 1.5564563970733638, + "mean_token_accuracy": 0.841317355632782, + "num_tokens": 46934641.0, + "step": 15742, + "train/ce_loss": 0.8767086863517761 + }, + { + "epoch": 1.5564563970733638, + "step": 15742, + "train/sim_loss": 0.048421382904052734 + }, + { + "epoch": 1.5564563970733638, + "step": 15742, + "train/total_loss": 0.13609224557876587 + }, + { + "entropy": 9.315130233764648, + "epoch": 1.556555269922879, + "mean_token_accuracy": 0.9150521755218506, + "num_tokens": 46945862.0, + "step": 15743, + "train/ce_loss": 2.188741206055056e-07 + }, + { + "epoch": 1.556555269922879, + "step": 15743, + "train/sim_loss": 0.0343356728553772 + }, + { + "epoch": 1.556555269922879, + "step": 15743, + "train/total_loss": 0.03433569520711899 + }, + { + "entropy": 9.461790084838867, + "epoch": 1.5566541427723948, + "mean_token_accuracy": 0.8620154857635498, + "num_tokens": 46954764.0, + "step": 15744, + "train/ce_loss": 2.979265332214709e-07 + }, + { + "epoch": 1.5566541427723948, + "step": 15744, + "train/sim_loss": 0.02447432279586792 + }, + { + "epoch": 1.5566541427723948, + "step": 15744, + "train/total_loss": 0.024474352598190308 + }, + { + "entropy": 9.28864860534668, + "epoch": 1.5567530156219103, + "mean_token_accuracy": 0.864546537399292, + "num_tokens": 46969642.0, + "step": 15745, + "train/ce_loss": 0.20080088078975677 + }, + { + "epoch": 1.5567530156219103, + "step": 15745, + "train/sim_loss": 0.03573107719421387 + }, + { + "epoch": 1.5567530156219103, + "step": 15745, + "train/total_loss": 0.055811166763305664 + }, + { + "entropy": 9.046772003173828, + "epoch": 1.5568518884714257, + "mean_token_accuracy": 0.8226163983345032, + "num_tokens": 46982704.0, + "step": 15746, + "train/ce_loss": 0.42858290672302246 + }, + { + "epoch": 1.5568518884714257, + "step": 15746, + "train/sim_loss": 0.01610434055328369 + }, + { + "epoch": 1.5568518884714257, + "step": 15746, + "train/total_loss": 0.058962631970644 + }, + { + "entropy": 9.49223518371582, + "epoch": 1.5569507613209412, + "mean_token_accuracy": 0.8658367991447449, + "num_tokens": 46999957.0, + "step": 15747, + "train/ce_loss": 0.3770449161529541 + }, + { + "epoch": 1.5569507613209412, + "step": 15747, + "train/sim_loss": 0.05980062484741211 + }, + { + "epoch": 1.5569507613209412, + "step": 15747, + "train/total_loss": 0.097505122423172 + }, + { + "entropy": 10.025689125061035, + "epoch": 1.5570496341704567, + "mean_token_accuracy": 0.8852097392082214, + "num_tokens": 47010650.0, + "step": 15748, + "train/ce_loss": 0.6302890181541443 + }, + { + "epoch": 1.5570496341704567, + "step": 15748, + "train/sim_loss": 0.052389562129974365 + }, + { + "epoch": 1.5570496341704567, + "step": 15748, + "train/total_loss": 0.1154184639453888 + }, + { + "entropy": 9.120437622070312, + "epoch": 1.5571485070199724, + "mean_token_accuracy": 0.8027750253677368, + "num_tokens": 47028928.0, + "step": 15749, + "train/ce_loss": 0.5037153959274292 + }, + { + "epoch": 1.5571485070199724, + "step": 15749, + "train/sim_loss": 0.030238032341003418 + }, + { + "epoch": 1.5571485070199724, + "step": 15749, + "train/total_loss": 0.08060957491397858 + }, + { + "entropy": 9.215807914733887, + "epoch": 1.5572473798694877, + "mean_token_accuracy": 0.83152174949646, + "num_tokens": 47042548.0, + "step": 15750, + "train/ce_loss": 0.2898944914340973 + }, + { + "epoch": 1.5572473798694877, + "step": 15750, + "train/sim_loss": 0.039929211139678955 + }, + { + "epoch": 1.5572473798694877, + "step": 15750, + "train/total_loss": 0.06891866028308868 + }, + { + "entropy": 8.866264343261719, + "epoch": 1.5573462527190034, + "mean_token_accuracy": 0.8568102717399597, + "num_tokens": 47054916.0, + "step": 15751, + "train/ce_loss": 0.4455616772174835 + }, + { + "epoch": 1.5573462527190034, + "step": 15751, + "train/sim_loss": 0.033232033252716064 + }, + { + "epoch": 1.5573462527190034, + "step": 15751, + "train/total_loss": 0.07778820395469666 + }, + { + "entropy": 9.212398529052734, + "epoch": 1.5574451255685189, + "mean_token_accuracy": 0.8711819648742676, + "num_tokens": 47068326.0, + "step": 15752, + "train/ce_loss": 0.49931344389915466 + }, + { + "epoch": 1.5574451255685189, + "step": 15752, + "train/sim_loss": 0.04415649175643921 + }, + { + "epoch": 1.5574451255685189, + "step": 15752, + "train/total_loss": 0.09408783912658691 + }, + { + "entropy": 9.709627151489258, + "epoch": 1.5575439984180344, + "mean_token_accuracy": 0.8517241477966309, + "num_tokens": 47084884.0, + "step": 15753, + "train/ce_loss": 0.6330956816673279 + }, + { + "epoch": 1.5575439984180344, + "step": 15753, + "train/sim_loss": 0.09599101543426514 + }, + { + "epoch": 1.5575439984180344, + "step": 15753, + "train/total_loss": 0.15930059552192688 + }, + { + "entropy": 9.305789947509766, + "epoch": 1.55764287126755, + "mean_token_accuracy": 0.840266227722168, + "num_tokens": 47094884.0, + "step": 15754, + "train/ce_loss": 0.4887711703777313 + }, + { + "epoch": 1.55764287126755, + "step": 15754, + "train/sim_loss": 0.09613806009292603 + }, + { + "epoch": 1.55764287126755, + "step": 15754, + "train/total_loss": 0.1450151801109314 + }, + { + "entropy": 9.789873123168945, + "epoch": 1.5577417441170653, + "mean_token_accuracy": 0.8853932619094849, + "num_tokens": 47110244.0, + "step": 15755, + "train/ce_loss": 0.30348995327949524 + }, + { + "epoch": 1.5577417441170653, + "step": 15755, + "train/sim_loss": 0.025676727294921875 + }, + { + "epoch": 1.5577417441170653, + "step": 15755, + "train/total_loss": 0.05602572113275528 + }, + { + "entropy": 8.571470260620117, + "epoch": 1.557840616966581, + "mean_token_accuracy": 0.8547595739364624, + "num_tokens": 47120586.0, + "step": 15756, + "train/ce_loss": 0.5307879447937012 + }, + { + "epoch": 1.557840616966581, + "step": 15756, + "train/sim_loss": 0.039738476276397705 + }, + { + "epoch": 1.557840616966581, + "step": 15756, + "train/total_loss": 0.0928172767162323 + }, + { + "entropy": 8.9027738571167, + "epoch": 1.5579394898160965, + "mean_token_accuracy": 0.7941463589668274, + "num_tokens": 47131140.0, + "step": 15757, + "train/ce_loss": 0.46604010462760925 + }, + { + "epoch": 1.5579394898160965, + "step": 15757, + "train/sim_loss": 0.05792611837387085 + }, + { + "epoch": 1.5579394898160965, + "step": 15757, + "train/total_loss": 0.10453012585639954 + }, + { + "entropy": 8.704180717468262, + "epoch": 1.558038362665612, + "mean_token_accuracy": 0.8987951874732971, + "num_tokens": 47140774.0, + "step": 15758, + "train/ce_loss": 0.27492955327033997 + }, + { + "epoch": 1.558038362665612, + "step": 15758, + "train/sim_loss": 0.023731887340545654 + }, + { + "epoch": 1.558038362665612, + "step": 15758, + "train/total_loss": 0.05122484266757965 + }, + { + "entropy": 8.985593795776367, + "epoch": 1.5581372355151275, + "mean_token_accuracy": 0.8632855415344238, + "num_tokens": 47148161.0, + "step": 15759, + "train/ce_loss": 0.553312361240387 + }, + { + "epoch": 1.5581372355151275, + "step": 15759, + "train/sim_loss": 0.0820510983467102 + }, + { + "epoch": 1.5581372355151275, + "step": 15759, + "train/total_loss": 0.13738232851028442 + }, + { + "epoch": 1.558236108364643, + "grad_norm": 0.5120614171028137, + "learning_rate": 6.106166246353163e-06, + "loss": 0.0836, + "step": 15760 + }, + { + "entropy": 9.195892333984375, + "epoch": 1.558236108364643, + "mean_token_accuracy": 0.8476070761680603, + "num_tokens": 47166015.0, + "step": 15760, + "train/ce_loss": 0.5062524676322937 + }, + { + "epoch": 1.558236108364643, + "step": 15760, + "train/sim_loss": 0.04906165599822998 + }, + { + "epoch": 1.558236108364643, + "step": 15760, + "train/total_loss": 0.09968690574169159 + }, + { + "entropy": 9.06983757019043, + "epoch": 1.5583349812141587, + "mean_token_accuracy": 0.8497913479804993, + "num_tokens": 47174785.0, + "step": 15761, + "train/ce_loss": 1.1141628419863991e-07 + }, + { + "epoch": 1.5583349812141587, + "step": 15761, + "train/sim_loss": 0.02171492576599121 + }, + { + "epoch": 1.5583349812141587, + "step": 15761, + "train/total_loss": 0.021714936941862106 + }, + { + "entropy": 9.197790145874023, + "epoch": 1.558433854063674, + "mean_token_accuracy": 0.8119999766349792, + "num_tokens": 47188461.0, + "step": 15762, + "train/ce_loss": 0.2241365909576416 + }, + { + "epoch": 1.558433854063674, + "step": 15762, + "train/sim_loss": 0.08405566215515137 + }, + { + "epoch": 1.558433854063674, + "step": 15762, + "train/total_loss": 0.10646931827068329 + }, + { + "entropy": 9.379617691040039, + "epoch": 1.5585327269131897, + "mean_token_accuracy": 0.8822580575942993, + "num_tokens": 47193661.0, + "step": 15763, + "train/ce_loss": 0.44160136580467224 + }, + { + "epoch": 1.5585327269131897, + "step": 15763, + "train/sim_loss": 0.010564148426055908 + }, + { + "epoch": 1.5585327269131897, + "step": 15763, + "train/total_loss": 0.05472428724169731 + }, + { + "entropy": 9.511597633361816, + "epoch": 1.5586315997627052, + "mean_token_accuracy": 0.8628571629524231, + "num_tokens": 47202015.0, + "step": 15764, + "train/ce_loss": 0.6503296494483948 + }, + { + "epoch": 1.5586315997627052, + "step": 15764, + "train/sim_loss": 0.04813051223754883 + }, + { + "epoch": 1.5586315997627052, + "step": 15764, + "train/total_loss": 0.11316347867250443 + }, + { + "entropy": 9.155470848083496, + "epoch": 1.5587304726122206, + "mean_token_accuracy": 0.8847059011459351, + "num_tokens": 47218392.0, + "step": 15765, + "train/ce_loss": 0.50464928150177 + }, + { + "epoch": 1.5587304726122206, + "step": 15765, + "train/sim_loss": 0.026297688484191895 + }, + { + "epoch": 1.5587304726122206, + "step": 15765, + "train/total_loss": 0.0767626166343689 + }, + { + "entropy": 9.505622863769531, + "epoch": 1.5588293454617363, + "mean_token_accuracy": 0.8703703880310059, + "num_tokens": 47233058.0, + "step": 15766, + "train/ce_loss": 4.399019530865189e-07 + }, + { + "epoch": 1.5588293454617363, + "step": 15766, + "train/sim_loss": 0.01327371597290039 + }, + { + "epoch": 1.5588293454617363, + "step": 15766, + "train/total_loss": 0.013273759745061398 + }, + { + "entropy": 8.991125106811523, + "epoch": 1.5589282183112516, + "mean_token_accuracy": 0.8222222328186035, + "num_tokens": 47239716.0, + "step": 15767, + "train/ce_loss": 0.40941736102104187 + }, + { + "epoch": 1.5589282183112516, + "step": 15767, + "train/sim_loss": 0.01706564426422119 + }, + { + "epoch": 1.5589282183112516, + "step": 15767, + "train/total_loss": 0.0580073818564415 + }, + { + "entropy": 9.321334838867188, + "epoch": 1.5590270911607673, + "mean_token_accuracy": 0.9404990673065186, + "num_tokens": 47253846.0, + "step": 15768, + "train/ce_loss": 0.1890973299741745 + }, + { + "epoch": 1.5590270911607673, + "step": 15768, + "train/sim_loss": 0.05601179599761963 + }, + { + "epoch": 1.5590270911607673, + "step": 15768, + "train/total_loss": 0.07492153346538544 + }, + { + "entropy": 9.882631301879883, + "epoch": 1.5591259640102828, + "mean_token_accuracy": 0.8552631735801697, + "num_tokens": 47269767.0, + "step": 15769, + "train/ce_loss": 4.601003809057147e-07 + }, + { + "epoch": 1.5591259640102828, + "step": 15769, + "train/sim_loss": 0.06351882219314575 + }, + { + "epoch": 1.5591259640102828, + "step": 15769, + "train/total_loss": 0.06351886689662933 + }, + { + "entropy": 9.474620819091797, + "epoch": 1.5592248368597983, + "mean_token_accuracy": 0.855614960193634, + "num_tokens": 47280983.0, + "step": 15770, + "train/ce_loss": 0.1907556653022766 + }, + { + "epoch": 1.5592248368597983, + "step": 15770, + "train/sim_loss": 0.07005667686462402 + }, + { + "epoch": 1.5592248368597983, + "step": 15770, + "train/total_loss": 0.08913224190473557 + }, + { + "entropy": 8.92306137084961, + "epoch": 1.5593237097093138, + "mean_token_accuracy": 0.8535242080688477, + "num_tokens": 47293004.0, + "step": 15771, + "train/ce_loss": 0.4925260841846466 + }, + { + "epoch": 1.5593237097093138, + "step": 15771, + "train/sim_loss": 0.028747737407684326 + }, + { + "epoch": 1.5593237097093138, + "step": 15771, + "train/total_loss": 0.07800035178661346 + }, + { + "entropy": 9.28172492980957, + "epoch": 1.5594225825588293, + "mean_token_accuracy": 0.8313953280448914, + "num_tokens": 47304628.0, + "step": 15772, + "train/ce_loss": 0.6867620348930359 + }, + { + "epoch": 1.5594225825588293, + "step": 15772, + "train/sim_loss": 0.05195188522338867 + }, + { + "epoch": 1.5594225825588293, + "step": 15772, + "train/total_loss": 0.12062808871269226 + }, + { + "entropy": 8.946195602416992, + "epoch": 1.559521455408345, + "mean_token_accuracy": 0.8463329672813416, + "num_tokens": 47316612.0, + "step": 15773, + "train/ce_loss": 0.3472118377685547 + }, + { + "epoch": 1.559521455408345, + "step": 15773, + "train/sim_loss": 0.03537726402282715 + }, + { + "epoch": 1.559521455408345, + "step": 15773, + "train/total_loss": 0.07009844481945038 + }, + { + "entropy": 8.918015480041504, + "epoch": 1.5596203282578602, + "mean_token_accuracy": 0.8201257586479187, + "num_tokens": 47323726.0, + "step": 15774, + "train/ce_loss": 0.5346350073814392 + }, + { + "epoch": 1.5596203282578602, + "step": 15774, + "train/sim_loss": 0.06504535675048828 + }, + { + "epoch": 1.5596203282578602, + "step": 15774, + "train/total_loss": 0.11850886046886444 + }, + { + "entropy": 9.02702522277832, + "epoch": 1.559719201107376, + "mean_token_accuracy": 0.8417818546295166, + "num_tokens": 47335963.0, + "step": 15775, + "train/ce_loss": 2.2926101905795804e-07 + }, + { + "epoch": 1.559719201107376, + "step": 15775, + "train/sim_loss": 0.018378615379333496 + }, + { + "epoch": 1.559719201107376, + "step": 15775, + "train/total_loss": 0.018378637731075287 + }, + { + "entropy": 8.888659477233887, + "epoch": 1.5598180739568914, + "mean_token_accuracy": 0.8261904716491699, + "num_tokens": 47344945.0, + "step": 15776, + "train/ce_loss": 0.7320350408554077 + }, + { + "epoch": 1.5598180739568914, + "step": 15776, + "train/sim_loss": 0.11500662565231323 + }, + { + "epoch": 1.5598180739568914, + "step": 15776, + "train/total_loss": 0.188210129737854 + }, + { + "entropy": 9.004532814025879, + "epoch": 1.559916946806407, + "mean_token_accuracy": 0.852185070514679, + "num_tokens": 47357352.0, + "step": 15777, + "train/ce_loss": 0.35220712423324585 + }, + { + "epoch": 1.559916946806407, + "step": 15777, + "train/sim_loss": 0.03395962715148926 + }, + { + "epoch": 1.559916946806407, + "step": 15777, + "train/total_loss": 0.06918033957481384 + }, + { + "entropy": 9.663440704345703, + "epoch": 1.5600158196559226, + "mean_token_accuracy": 0.8914473652839661, + "num_tokens": 47365735.0, + "step": 15778, + "train/ce_loss": 0.6046767234802246 + }, + { + "epoch": 1.5600158196559226, + "step": 15778, + "train/sim_loss": 0.07161247730255127 + }, + { + "epoch": 1.5600158196559226, + "step": 15778, + "train/total_loss": 0.13208015263080597 + }, + { + "entropy": 9.551118850708008, + "epoch": 1.5601146925054379, + "mean_token_accuracy": 0.8852459192276001, + "num_tokens": 47377082.0, + "step": 15779, + "train/ce_loss": 0.23065608739852905 + }, + { + "epoch": 1.5601146925054379, + "step": 15779, + "train/sim_loss": 0.03350192308425903 + }, + { + "epoch": 1.5601146925054379, + "step": 15779, + "train/total_loss": 0.05656753480434418 + }, + { + "epoch": 1.5602135653549536, + "grad_norm": 0.5331553816795349, + "learning_rate": 6.101221381595214e-06, + "loss": 0.0819, + "step": 15780 + }, + { + "entropy": 9.532270431518555, + "epoch": 1.5602135653549536, + "mean_token_accuracy": 0.8547770977020264, + "num_tokens": 47398085.0, + "step": 15780, + "train/ce_loss": 0.25060024857521057 + }, + { + "epoch": 1.5602135653549536, + "step": 15780, + "train/sim_loss": 0.022771894931793213 + }, + { + "epoch": 1.5602135653549536, + "step": 15780, + "train/total_loss": 0.04783192276954651 + }, + { + "entropy": 9.166528701782227, + "epoch": 1.560312438204469, + "mean_token_accuracy": 0.834938108921051, + "num_tokens": 47406084.0, + "step": 15781, + "train/ce_loss": 0.23931339383125305 + }, + { + "epoch": 1.560312438204469, + "step": 15781, + "train/sim_loss": 0.0496748685836792 + }, + { + "epoch": 1.560312438204469, + "step": 15781, + "train/total_loss": 0.0736062079668045 + }, + { + "entropy": 9.405535697937012, + "epoch": 1.5604113110539846, + "mean_token_accuracy": 0.9046321511268616, + "num_tokens": 47419480.0, + "step": 15782, + "train/ce_loss": 0.7568545937538147 + }, + { + "epoch": 1.5604113110539846, + "step": 15782, + "train/sim_loss": 0.025248289108276367 + }, + { + "epoch": 1.5604113110539846, + "step": 15782, + "train/total_loss": 0.1009337529540062 + }, + { + "entropy": 9.031351089477539, + "epoch": 1.5605101839035, + "mean_token_accuracy": 0.8301886916160583, + "num_tokens": 47427865.0, + "step": 15783, + "train/ce_loss": 0.5515562295913696 + }, + { + "epoch": 1.5605101839035, + "step": 15783, + "train/sim_loss": 0.030139446258544922 + }, + { + "epoch": 1.5605101839035, + "step": 15783, + "train/total_loss": 0.08529506623744965 + }, + { + "entropy": 9.60854721069336, + "epoch": 1.5606090567530155, + "mean_token_accuracy": 0.8902077078819275, + "num_tokens": 47442957.0, + "step": 15784, + "train/ce_loss": 2.7237501853960566e-07 + }, + { + "epoch": 1.5606090567530155, + "step": 15784, + "train/sim_loss": 0.03833365440368652 + }, + { + "epoch": 1.5606090567530155, + "step": 15784, + "train/total_loss": 0.03833368048071861 + }, + { + "entropy": 8.873796463012695, + "epoch": 1.5607079296025312, + "mean_token_accuracy": 0.8557214140892029, + "num_tokens": 47455087.0, + "step": 15785, + "train/ce_loss": 0.30038169026374817 + }, + { + "epoch": 1.5607079296025312, + "step": 15785, + "train/sim_loss": 0.015091896057128906 + }, + { + "epoch": 1.5607079296025312, + "step": 15785, + "train/total_loss": 0.04513006657361984 + }, + { + "entropy": 9.38815689086914, + "epoch": 1.5608068024520465, + "mean_token_accuracy": 0.8722466826438904, + "num_tokens": 47465690.0, + "step": 15786, + "train/ce_loss": 0.4326803386211395 + }, + { + "epoch": 1.5608068024520465, + "step": 15786, + "train/sim_loss": 0.03363919258117676 + }, + { + "epoch": 1.5608068024520465, + "step": 15786, + "train/total_loss": 0.07690723240375519 + }, + { + "entropy": 9.486082077026367, + "epoch": 1.5609056753015622, + "mean_token_accuracy": 0.8809219002723694, + "num_tokens": 47482847.0, + "step": 15787, + "train/ce_loss": 0.36208540201187134 + }, + { + "epoch": 1.5609056753015622, + "step": 15787, + "train/sim_loss": 0.02676612138748169 + }, + { + "epoch": 1.5609056753015622, + "step": 15787, + "train/total_loss": 0.06297466158866882 + }, + { + "entropy": 9.321636199951172, + "epoch": 1.5610045481510777, + "mean_token_accuracy": 0.815511167049408, + "num_tokens": 47493637.0, + "step": 15788, + "train/ce_loss": 0.8723928928375244 + }, + { + "epoch": 1.5610045481510777, + "step": 15788, + "train/sim_loss": 0.045029282569885254 + }, + { + "epoch": 1.5610045481510777, + "step": 15788, + "train/total_loss": 0.13226857781410217 + }, + { + "entropy": 9.160540580749512, + "epoch": 1.5611034210005932, + "mean_token_accuracy": 0.8236024975776672, + "num_tokens": 47506862.0, + "step": 15789, + "train/ce_loss": 0.47401243448257446 + }, + { + "epoch": 1.5611034210005932, + "step": 15789, + "train/sim_loss": 0.029174089431762695 + }, + { + "epoch": 1.5611034210005932, + "step": 15789, + "train/total_loss": 0.07657533884048462 + }, + { + "entropy": 9.090778350830078, + "epoch": 1.561202293850109, + "mean_token_accuracy": 0.8964613080024719, + "num_tokens": 47519875.0, + "step": 15790, + "train/ce_loss": 0.4879362881183624 + }, + { + "epoch": 1.561202293850109, + "step": 15790, + "train/sim_loss": 0.04600358009338379 + }, + { + "epoch": 1.561202293850109, + "step": 15790, + "train/total_loss": 0.09479720890522003 + }, + { + "entropy": 9.771489143371582, + "epoch": 1.5613011666996242, + "mean_token_accuracy": 0.9034090638160706, + "num_tokens": 47534424.0, + "step": 15791, + "train/ce_loss": 0.33408138155937195 + }, + { + "epoch": 1.5613011666996242, + "step": 15791, + "train/sim_loss": 0.014909863471984863 + }, + { + "epoch": 1.5613011666996242, + "step": 15791, + "train/total_loss": 0.04831800237298012 + }, + { + "entropy": 9.305597305297852, + "epoch": 1.5614000395491399, + "mean_token_accuracy": 0.8937605619430542, + "num_tokens": 47547862.0, + "step": 15792, + "train/ce_loss": 0.3717341423034668 + }, + { + "epoch": 1.5614000395491399, + "step": 15792, + "train/sim_loss": 0.03538167476654053 + }, + { + "epoch": 1.5614000395491399, + "step": 15792, + "train/total_loss": 0.07255509495735168 + }, + { + "entropy": 9.663167953491211, + "epoch": 1.5614989123986553, + "mean_token_accuracy": 0.8530066609382629, + "num_tokens": 47569206.0, + "step": 15793, + "train/ce_loss": 0.6722031235694885 + }, + { + "epoch": 1.5614989123986553, + "step": 15793, + "train/sim_loss": 0.08928298950195312 + }, + { + "epoch": 1.5614989123986553, + "step": 15793, + "train/total_loss": 0.15650330483913422 + }, + { + "entropy": 9.083696365356445, + "epoch": 1.5615977852481708, + "mean_token_accuracy": 0.8484136462211609, + "num_tokens": 47580746.0, + "step": 15794, + "train/ce_loss": 0.630323052406311 + }, + { + "epoch": 1.5615977852481708, + "step": 15794, + "train/sim_loss": 0.010597467422485352 + }, + { + "epoch": 1.5615977852481708, + "step": 15794, + "train/total_loss": 0.07362977415323257 + }, + { + "entropy": 9.319427490234375, + "epoch": 1.5616966580976865, + "mean_token_accuracy": 0.8263157606124878, + "num_tokens": 47590172.0, + "step": 15795, + "train/ce_loss": 1.2653943300247192 + }, + { + "epoch": 1.5616966580976865, + "step": 15795, + "train/sim_loss": 0.07171452045440674 + }, + { + "epoch": 1.5616966580976865, + "step": 15795, + "train/total_loss": 0.19825395941734314 + }, + { + "entropy": 9.170762062072754, + "epoch": 1.5617955309472018, + "mean_token_accuracy": 0.8623853325843811, + "num_tokens": 47603150.0, + "step": 15796, + "train/ce_loss": 0.3721350133419037 + }, + { + "epoch": 1.5617955309472018, + "step": 15796, + "train/sim_loss": 0.03551870584487915 + }, + { + "epoch": 1.5617955309472018, + "step": 15796, + "train/total_loss": 0.07273221015930176 + }, + { + "entropy": 9.464855194091797, + "epoch": 1.5618944037967175, + "mean_token_accuracy": 0.9051490426063538, + "num_tokens": 47616342.0, + "step": 15797, + "train/ce_loss": 0.5102019906044006 + }, + { + "epoch": 1.5618944037967175, + "step": 15797, + "train/sim_loss": 0.0978119969367981 + }, + { + "epoch": 1.5618944037967175, + "step": 15797, + "train/total_loss": 0.14883220195770264 + }, + { + "entropy": 9.44974422454834, + "epoch": 1.561993276646233, + "mean_token_accuracy": 0.8491228222846985, + "num_tokens": 47636771.0, + "step": 15798, + "train/ce_loss": 0.22453264892101288 + }, + { + "epoch": 1.561993276646233, + "step": 15798, + "train/sim_loss": 0.015946626663208008 + }, + { + "epoch": 1.561993276646233, + "step": 15798, + "train/total_loss": 0.038399890065193176 + }, + { + "entropy": 9.608624458312988, + "epoch": 1.5620921494957485, + "mean_token_accuracy": 0.8425324559211731, + "num_tokens": 47650222.0, + "step": 15799, + "train/ce_loss": 2.0358314145596523e-07 + }, + { + "epoch": 1.5620921494957485, + "step": 15799, + "train/sim_loss": 0.013219594955444336 + }, + { + "epoch": 1.5620921494957485, + "step": 15799, + "train/total_loss": 0.013219615444540977 + }, + { + "epoch": 1.562191022345264, + "grad_norm": 0.6095255017280579, + "learning_rate": 6.096276516837265e-06, + "loss": 0.0798, + "step": 15800 + }, + { + "entropy": 9.055099487304688, + "epoch": 1.562191022345264, + "mean_token_accuracy": 0.8189300298690796, + "num_tokens": 47659770.0, + "step": 15800, + "train/ce_loss": 0.5880956053733826 + }, + { + "epoch": 1.562191022345264, + "step": 15800, + "train/sim_loss": 0.08818924427032471 + }, + { + "epoch": 1.562191022345264, + "step": 15800, + "train/total_loss": 0.1469988077878952 + }, + { + "entropy": 9.208189010620117, + "epoch": 1.5622898951947795, + "mean_token_accuracy": 0.8928571343421936, + "num_tokens": 47676527.0, + "step": 15801, + "train/ce_loss": 0.37473949790000916 + }, + { + "epoch": 1.5622898951947795, + "step": 15801, + "train/sim_loss": 0.028521418571472168 + }, + { + "epoch": 1.5622898951947795, + "step": 15801, + "train/total_loss": 0.06599536538124084 + }, + { + "entropy": 9.530482292175293, + "epoch": 1.5623887680442952, + "mean_token_accuracy": 0.8863636255264282, + "num_tokens": 47694031.0, + "step": 15802, + "train/ce_loss": 0.6744652390480042 + }, + { + "epoch": 1.5623887680442952, + "step": 15802, + "train/sim_loss": 0.0378873348236084 + }, + { + "epoch": 1.5623887680442952, + "step": 15802, + "train/total_loss": 0.1053338572382927 + }, + { + "entropy": 8.960055351257324, + "epoch": 1.5624876408938104, + "mean_token_accuracy": 0.8063111901283264, + "num_tokens": 47706438.0, + "step": 15803, + "train/ce_loss": 0.4824890196323395 + }, + { + "epoch": 1.5624876408938104, + "step": 15803, + "train/sim_loss": 0.03263062238693237 + }, + { + "epoch": 1.5624876408938104, + "step": 15803, + "train/total_loss": 0.08087952435016632 + }, + { + "entropy": 8.90746021270752, + "epoch": 1.5625865137433261, + "mean_token_accuracy": 0.875, + "num_tokens": 47719142.0, + "step": 15804, + "train/ce_loss": 0.6525356769561768 + }, + { + "epoch": 1.5625865137433261, + "step": 15804, + "train/sim_loss": 0.04633831977844238 + }, + { + "epoch": 1.5625865137433261, + "step": 15804, + "train/total_loss": 0.1115918904542923 + }, + { + "entropy": 8.912151336669922, + "epoch": 1.5626853865928416, + "mean_token_accuracy": 0.8609865307807922, + "num_tokens": 47731662.0, + "step": 15805, + "train/ce_loss": 0.3894743025302887 + }, + { + "epoch": 1.5626853865928416, + "step": 15805, + "train/sim_loss": 0.060184478759765625 + }, + { + "epoch": 1.5626853865928416, + "step": 15805, + "train/total_loss": 0.09913191199302673 + }, + { + "entropy": 9.041410446166992, + "epoch": 1.562784259442357, + "mean_token_accuracy": 0.8650217652320862, + "num_tokens": 47739643.0, + "step": 15806, + "train/ce_loss": 0.4555576741695404 + }, + { + "epoch": 1.562784259442357, + "step": 15806, + "train/sim_loss": 0.028802990913391113 + }, + { + "epoch": 1.562784259442357, + "step": 15806, + "train/total_loss": 0.07435876131057739 + }, + { + "entropy": 8.933256149291992, + "epoch": 1.5628831322918728, + "mean_token_accuracy": 0.8847032189369202, + "num_tokens": 47750821.0, + "step": 15807, + "train/ce_loss": 0.4896056652069092 + }, + { + "epoch": 1.5628831322918728, + "step": 15807, + "train/sim_loss": 0.011495411396026611 + }, + { + "epoch": 1.5628831322918728, + "step": 15807, + "train/total_loss": 0.06045597791671753 + }, + { + "entropy": 8.964347839355469, + "epoch": 1.562982005141388, + "mean_token_accuracy": 0.8553137183189392, + "num_tokens": 47761269.0, + "step": 15808, + "train/ce_loss": 0.3513280749320984 + }, + { + "epoch": 1.562982005141388, + "step": 15808, + "train/sim_loss": 0.03580445051193237 + }, + { + "epoch": 1.562982005141388, + "step": 15808, + "train/total_loss": 0.07093726098537445 + }, + { + "entropy": 8.98019027709961, + "epoch": 1.5630808779909038, + "mean_token_accuracy": 0.8936651349067688, + "num_tokens": 47770099.0, + "step": 15809, + "train/ce_loss": 0.3564247786998749 + }, + { + "epoch": 1.5630808779909038, + "step": 15809, + "train/sim_loss": 0.015673458576202393 + }, + { + "epoch": 1.5630808779909038, + "step": 15809, + "train/total_loss": 0.05131593719124794 + }, + { + "entropy": 9.384149551391602, + "epoch": 1.5631797508404193, + "mean_token_accuracy": 0.9226618409156799, + "num_tokens": 47780134.0, + "step": 15810, + "train/ce_loss": 4.879775588051416e-07 + }, + { + "epoch": 1.5631797508404193, + "step": 15810, + "train/sim_loss": 0.045716941356658936 + }, + { + "epoch": 1.5631797508404193, + "step": 15810, + "train/total_loss": 0.045716989785432816 + }, + { + "entropy": 8.861888885498047, + "epoch": 1.5632786236899348, + "mean_token_accuracy": 0.8729817271232605, + "num_tokens": 47790099.0, + "step": 15811, + "train/ce_loss": 0.7225176692008972 + }, + { + "epoch": 1.5632786236899348, + "step": 15811, + "train/sim_loss": 0.07637143135070801 + }, + { + "epoch": 1.5632786236899348, + "step": 15811, + "train/total_loss": 0.14862319827079773 + }, + { + "entropy": 9.558913230895996, + "epoch": 1.5633774965394502, + "mean_token_accuracy": 0.8923766613006592, + "num_tokens": 47798169.0, + "step": 15812, + "train/ce_loss": 0.4724428653717041 + }, + { + "epoch": 1.5633774965394502, + "step": 15812, + "train/sim_loss": 0.03643918037414551 + }, + { + "epoch": 1.5633774965394502, + "step": 15812, + "train/total_loss": 0.08368346840143204 + }, + { + "entropy": 9.504261016845703, + "epoch": 1.5634763693889657, + "mean_token_accuracy": 0.8687196373939514, + "num_tokens": 47805936.0, + "step": 15813, + "train/ce_loss": 4.6987469204395893e-07 + }, + { + "epoch": 1.5634763693889657, + "step": 15813, + "train/sim_loss": 0.01698613166809082 + }, + { + "epoch": 1.5634763693889657, + "step": 15813, + "train/total_loss": 0.01698617823421955 + }, + { + "entropy": 9.258057594299316, + "epoch": 1.5635752422384814, + "mean_token_accuracy": 0.8596882224082947, + "num_tokens": 47818703.0, + "step": 15814, + "train/ce_loss": 0.5093987584114075 + }, + { + "epoch": 1.5635752422384814, + "step": 15814, + "train/sim_loss": 0.040985107421875 + }, + { + "epoch": 1.5635752422384814, + "step": 15814, + "train/total_loss": 0.09192498028278351 + }, + { + "entropy": 9.05990219116211, + "epoch": 1.5636741150879967, + "mean_token_accuracy": 0.8337129950523376, + "num_tokens": 47832106.0, + "step": 15815, + "train/ce_loss": 0.4955824315547943 + }, + { + "epoch": 1.5636741150879967, + "step": 15815, + "train/sim_loss": 0.0663633942604065 + }, + { + "epoch": 1.5636741150879967, + "step": 15815, + "train/total_loss": 0.11592163890600204 + }, + { + "entropy": 8.997861862182617, + "epoch": 1.5637729879375124, + "mean_token_accuracy": 0.8523489832878113, + "num_tokens": 47841528.0, + "step": 15816, + "train/ce_loss": 0.3449510931968689 + }, + { + "epoch": 1.5637729879375124, + "step": 15816, + "train/sim_loss": 0.013212144374847412 + }, + { + "epoch": 1.5637729879375124, + "step": 15816, + "train/total_loss": 0.04770725592970848 + }, + { + "entropy": 9.56743049621582, + "epoch": 1.563871860787028, + "mean_token_accuracy": 0.875481367111206, + "num_tokens": 47862206.0, + "step": 15817, + "train/ce_loss": 0.7158397436141968 + }, + { + "epoch": 1.563871860787028, + "step": 15817, + "train/sim_loss": 0.05003917217254639 + }, + { + "epoch": 1.563871860787028, + "step": 15817, + "train/total_loss": 0.12162315100431442 + }, + { + "entropy": 9.014289855957031, + "epoch": 1.5639707336365434, + "mean_token_accuracy": 0.8445860147476196, + "num_tokens": 47874599.0, + "step": 15818, + "train/ce_loss": 0.44173645973205566 + }, + { + "epoch": 1.5639707336365434, + "step": 15818, + "train/sim_loss": 0.022766947746276855 + }, + { + "epoch": 1.5639707336365434, + "step": 15818, + "train/total_loss": 0.06694059073925018 + }, + { + "entropy": 9.195480346679688, + "epoch": 1.564069606486059, + "mean_token_accuracy": 0.8888888955116272, + "num_tokens": 47888534.0, + "step": 15819, + "train/ce_loss": 6.377483714459231e-07 + }, + { + "epoch": 1.564069606486059, + "step": 15819, + "train/sim_loss": 0.024746179580688477 + }, + { + "epoch": 1.564069606486059, + "step": 15819, + "train/total_loss": 0.02474624291062355 + }, + { + "epoch": 1.5641684793355743, + "grad_norm": 0.5312469601631165, + "learning_rate": 6.091331652079317e-06, + "loss": 0.0781, + "step": 15820 + }, + { + "entropy": 9.368324279785156, + "epoch": 1.5641684793355743, + "mean_token_accuracy": 0.846446692943573, + "num_tokens": 47901861.0, + "step": 15820, + "train/ce_loss": 0.4219319224357605 + }, + { + "epoch": 1.5641684793355743, + "step": 15820, + "train/sim_loss": 0.05601149797439575 + }, + { + "epoch": 1.5641684793355743, + "step": 15820, + "train/total_loss": 0.09820468723773956 + }, + { + "entropy": 9.206037521362305, + "epoch": 1.56426735218509, + "mean_token_accuracy": 0.7997685074806213, + "num_tokens": 47917551.0, + "step": 15821, + "train/ce_loss": 0.9726559519767761 + }, + { + "epoch": 1.56426735218509, + "step": 15821, + "train/sim_loss": 0.07644814252853394 + }, + { + "epoch": 1.56426735218509, + "step": 15821, + "train/total_loss": 0.17371374368667603 + }, + { + "entropy": 9.555869102478027, + "epoch": 1.5643662250346055, + "mean_token_accuracy": 0.9111111164093018, + "num_tokens": 47929016.0, + "step": 15822, + "train/ce_loss": 3.0258385663728404e-07 + }, + { + "epoch": 1.5643662250346055, + "step": 15822, + "train/sim_loss": 0.02697122097015381 + }, + { + "epoch": 1.5643662250346055, + "step": 15822, + "train/total_loss": 0.026971250772476196 + }, + { + "entropy": 9.159305572509766, + "epoch": 1.564465097884121, + "mean_token_accuracy": 0.8485958576202393, + "num_tokens": 47937329.0, + "step": 15823, + "train/ce_loss": 0.5641740560531616 + }, + { + "epoch": 1.564465097884121, + "step": 15823, + "train/sim_loss": 0.042352139949798584 + }, + { + "epoch": 1.564465097884121, + "step": 15823, + "train/total_loss": 0.09876954555511475 + }, + { + "entropy": 8.71562385559082, + "epoch": 1.5645639707336365, + "mean_token_accuracy": 0.8859649300575256, + "num_tokens": 47947549.0, + "step": 15824, + "train/ce_loss": 0.3126494884490967 + }, + { + "epoch": 1.5645639707336365, + "step": 15824, + "train/sim_loss": 0.0352325439453125 + }, + { + "epoch": 1.5645639707336365, + "step": 15824, + "train/total_loss": 0.06649748980998993 + }, + { + "entropy": 9.419139862060547, + "epoch": 1.564662843583152, + "mean_token_accuracy": 0.875, + "num_tokens": 47964716.0, + "step": 15825, + "train/ce_loss": 0.22291839122772217 + }, + { + "epoch": 1.564662843583152, + "step": 15825, + "train/sim_loss": 0.01782846450805664 + }, + { + "epoch": 1.564662843583152, + "step": 15825, + "train/total_loss": 0.04012030363082886 + }, + { + "entropy": 9.255643844604492, + "epoch": 1.5647617164326677, + "mean_token_accuracy": 0.8664302825927734, + "num_tokens": 47977679.0, + "step": 15826, + "train/ce_loss": 0.48030886054039 + }, + { + "epoch": 1.5647617164326677, + "step": 15826, + "train/sim_loss": 0.04991805553436279 + }, + { + "epoch": 1.5647617164326677, + "step": 15826, + "train/total_loss": 0.09794893860816956 + }, + { + "entropy": 9.743715286254883, + "epoch": 1.564860589282183, + "mean_token_accuracy": 0.8694404363632202, + "num_tokens": 47990820.0, + "step": 15827, + "train/ce_loss": 0.3608745336532593 + }, + { + "epoch": 1.564860589282183, + "step": 15827, + "train/sim_loss": 0.04332464933395386 + }, + { + "epoch": 1.564860589282183, + "step": 15827, + "train/total_loss": 0.07941210269927979 + }, + { + "entropy": 9.396297454833984, + "epoch": 1.5649594621316987, + "mean_token_accuracy": 0.8347921371459961, + "num_tokens": 48009011.0, + "step": 15828, + "train/ce_loss": 0.36065685749053955 + }, + { + "epoch": 1.5649594621316987, + "step": 15828, + "train/sim_loss": 0.046814560890197754 + }, + { + "epoch": 1.5649594621316987, + "step": 15828, + "train/total_loss": 0.08288024365901947 + }, + { + "entropy": 9.52999496459961, + "epoch": 1.5650583349812142, + "mean_token_accuracy": 0.8695651888847351, + "num_tokens": 48014931.0, + "step": 15829, + "train/ce_loss": 0.48229411244392395 + }, + { + "epoch": 1.5650583349812142, + "step": 15829, + "train/sim_loss": 0.029576420783996582 + }, + { + "epoch": 1.5650583349812142, + "step": 15829, + "train/total_loss": 0.07780583202838898 + }, + { + "entropy": 9.09892463684082, + "epoch": 1.5651572078307296, + "mean_token_accuracy": 0.7955307364463806, + "num_tokens": 48023214.0, + "step": 15830, + "train/ce_loss": 0.32536035776138306 + }, + { + "epoch": 1.5651572078307296, + "step": 15830, + "train/sim_loss": 0.020647406578063965 + }, + { + "epoch": 1.5651572078307296, + "step": 15830, + "train/total_loss": 0.05318344384431839 + }, + { + "entropy": 9.393125534057617, + "epoch": 1.5652560806802454, + "mean_token_accuracy": 0.8525640964508057, + "num_tokens": 48037507.0, + "step": 15831, + "train/ce_loss": 0.4772332012653351 + }, + { + "epoch": 1.5652560806802454, + "step": 15831, + "train/sim_loss": 0.04823571443557739 + }, + { + "epoch": 1.5652560806802454, + "step": 15831, + "train/total_loss": 0.09595903754234314 + }, + { + "entropy": 9.632883071899414, + "epoch": 1.5653549535297606, + "mean_token_accuracy": 0.9225941300392151, + "num_tokens": 48057505.0, + "step": 15832, + "train/ce_loss": 3.5313547641635523e-07 + }, + { + "epoch": 1.5653549535297606, + "step": 15832, + "train/sim_loss": 0.020426392555236816 + }, + { + "epoch": 1.5653549535297606, + "step": 15832, + "train/total_loss": 0.020426427945494652 + }, + { + "entropy": 9.69515609741211, + "epoch": 1.5654538263792763, + "mean_token_accuracy": 0.8524844646453857, + "num_tokens": 48070281.0, + "step": 15833, + "train/ce_loss": 1.0106175523105776e-06 + }, + { + "epoch": 1.5654538263792763, + "step": 15833, + "train/sim_loss": 0.045555174350738525 + }, + { + "epoch": 1.5654538263792763, + "step": 15833, + "train/total_loss": 0.045555274933576584 + }, + { + "entropy": 9.242338180541992, + "epoch": 1.5655526992287918, + "mean_token_accuracy": 0.8402848243713379, + "num_tokens": 48085191.0, + "step": 15834, + "train/ce_loss": 0.5979923009872437 + }, + { + "epoch": 1.5655526992287918, + "step": 15834, + "train/sim_loss": 0.036434829235076904 + }, + { + "epoch": 1.5655526992287918, + "step": 15834, + "train/total_loss": 0.09623406082391739 + }, + { + "entropy": 9.295859336853027, + "epoch": 1.5656515720783073, + "mean_token_accuracy": 0.8169934749603271, + "num_tokens": 48099016.0, + "step": 15835, + "train/ce_loss": 0.1596008539199829 + }, + { + "epoch": 1.5656515720783073, + "step": 15835, + "train/sim_loss": 0.055748939514160156 + }, + { + "epoch": 1.5656515720783073, + "step": 15835, + "train/total_loss": 0.07170902192592621 + }, + { + "entropy": 9.442811012268066, + "epoch": 1.5657504449278228, + "mean_token_accuracy": 0.8585366010665894, + "num_tokens": 48109372.0, + "step": 15836, + "train/ce_loss": 0.5594943165779114 + }, + { + "epoch": 1.5657504449278228, + "step": 15836, + "train/sim_loss": 0.03456193208694458 + }, + { + "epoch": 1.5657504449278228, + "step": 15836, + "train/total_loss": 0.09051136672496796 + }, + { + "entropy": 9.3623046875, + "epoch": 1.5658493177773383, + "mean_token_accuracy": 0.8735083341598511, + "num_tokens": 48121250.0, + "step": 15837, + "train/ce_loss": 0.24332134425640106 + }, + { + "epoch": 1.5658493177773383, + "step": 15837, + "train/sim_loss": 0.041654109954833984 + }, + { + "epoch": 1.5658493177773383, + "step": 15837, + "train/total_loss": 0.06598624587059021 + }, + { + "entropy": 9.249478340148926, + "epoch": 1.565948190626854, + "mean_token_accuracy": 0.9041916131973267, + "num_tokens": 48137596.0, + "step": 15838, + "train/ce_loss": 0.5993022918701172 + }, + { + "epoch": 1.565948190626854, + "step": 15838, + "train/sim_loss": 0.05630171298980713 + }, + { + "epoch": 1.565948190626854, + "step": 15838, + "train/total_loss": 0.11623194813728333 + }, + { + "entropy": 8.837223052978516, + "epoch": 1.5660470634763692, + "mean_token_accuracy": 0.8566879034042358, + "num_tokens": 48152609.0, + "step": 15839, + "train/ce_loss": 0.47825273871421814 + }, + { + "epoch": 1.5660470634763692, + "step": 15839, + "train/sim_loss": 0.02920430898666382 + }, + { + "epoch": 1.5660470634763692, + "step": 15839, + "train/total_loss": 0.07702958583831787 + }, + { + "epoch": 1.566145936325885, + "grad_norm": 0.49203741550445557, + "learning_rate": 6.086386787321368e-06, + "loss": 0.0843, + "step": 15840 + }, + { + "entropy": 9.253789901733398, + "epoch": 1.566145936325885, + "mean_token_accuracy": 0.8870636820793152, + "num_tokens": 48162997.0, + "step": 15840, + "train/ce_loss": 7.930737524475262e-07 + }, + { + "epoch": 1.566145936325885, + "step": 15840, + "train/sim_loss": 0.053340256214141846 + }, + { + "epoch": 1.566145936325885, + "step": 15840, + "train/total_loss": 0.05334033444523811 + }, + { + "entropy": 9.64954948425293, + "epoch": 1.5662448091754004, + "mean_token_accuracy": 0.9301801919937134, + "num_tokens": 48183308.0, + "step": 15841, + "train/ce_loss": 4.2098730546058505e-07 + }, + { + "epoch": 1.5662448091754004, + "step": 15841, + "train/sim_loss": 0.040720582008361816 + }, + { + "epoch": 1.5662448091754004, + "step": 15841, + "train/total_loss": 0.0407206229865551 + }, + { + "entropy": 9.064847946166992, + "epoch": 1.566343682024916, + "mean_token_accuracy": 0.8553845882415771, + "num_tokens": 48196836.0, + "step": 15842, + "train/ce_loss": 0.5495026707649231 + }, + { + "epoch": 1.566343682024916, + "step": 15842, + "train/sim_loss": 0.033653855323791504 + }, + { + "epoch": 1.566343682024916, + "step": 15842, + "train/total_loss": 0.08860412240028381 + }, + { + "entropy": 8.613834381103516, + "epoch": 1.5664425548744316, + "mean_token_accuracy": 0.8496310114860535, + "num_tokens": 48204551.0, + "step": 15843, + "train/ce_loss": 0.4821617305278778 + }, + { + "epoch": 1.5664425548744316, + "step": 15843, + "train/sim_loss": 0.010471522808074951 + }, + { + "epoch": 1.5664425548744316, + "step": 15843, + "train/total_loss": 0.05868769809603691 + }, + { + "entropy": 9.633814811706543, + "epoch": 1.5665414277239469, + "mean_token_accuracy": 0.8255977630615234, + "num_tokens": 48223057.0, + "step": 15844, + "train/ce_loss": 0.8146496415138245 + }, + { + "epoch": 1.5665414277239469, + "step": 15844, + "train/sim_loss": 0.04530453681945801 + }, + { + "epoch": 1.5665414277239469, + "step": 15844, + "train/total_loss": 0.1267695128917694 + }, + { + "entropy": 9.105598449707031, + "epoch": 1.5666403005734626, + "mean_token_accuracy": 0.8276231288909912, + "num_tokens": 48231748.0, + "step": 15845, + "train/ce_loss": 0.6233729124069214 + }, + { + "epoch": 1.5666403005734626, + "step": 15845, + "train/sim_loss": 0.019490599632263184 + }, + { + "epoch": 1.5666403005734626, + "step": 15845, + "train/total_loss": 0.08182789385318756 + }, + { + "entropy": 9.025690078735352, + "epoch": 1.566739173422978, + "mean_token_accuracy": 0.8419567346572876, + "num_tokens": 48241220.0, + "step": 15846, + "train/ce_loss": 0.3354133069515228 + }, + { + "epoch": 1.566739173422978, + "step": 15846, + "train/sim_loss": 0.015376567840576172 + }, + { + "epoch": 1.566739173422978, + "step": 15846, + "train/total_loss": 0.048917900770902634 + }, + { + "entropy": 9.659529685974121, + "epoch": 1.5668380462724936, + "mean_token_accuracy": 0.8463414907455444, + "num_tokens": 48248699.0, + "step": 15847, + "train/ce_loss": 2.4385753931710497e-06 + }, + { + "epoch": 1.5668380462724936, + "step": 15847, + "train/sim_loss": 0.031281471252441406 + }, + { + "epoch": 1.5668380462724936, + "step": 15847, + "train/total_loss": 0.031281713396310806 + }, + { + "entropy": 8.939800262451172, + "epoch": 1.566936919122009, + "mean_token_accuracy": 0.8931204080581665, + "num_tokens": 48261978.0, + "step": 15848, + "train/ce_loss": 0.4019067585468292 + }, + { + "epoch": 1.566936919122009, + "step": 15848, + "train/sim_loss": 0.04751652479171753 + }, + { + "epoch": 1.566936919122009, + "step": 15848, + "train/total_loss": 0.08770720660686493 + }, + { + "entropy": 8.964418411254883, + "epoch": 1.5670357919715245, + "mean_token_accuracy": 0.857782781124115, + "num_tokens": 48271025.0, + "step": 15849, + "train/ce_loss": 0.22259843349456787 + }, + { + "epoch": 1.5670357919715245, + "step": 15849, + "train/sim_loss": 0.02452218532562256 + }, + { + "epoch": 1.5670357919715245, + "step": 15849, + "train/total_loss": 0.046782031655311584 + }, + { + "entropy": 9.304888725280762, + "epoch": 1.5671346648210402, + "mean_token_accuracy": 0.9089673757553101, + "num_tokens": 48282410.0, + "step": 15850, + "train/ce_loss": 0.6149476766586304 + }, + { + "epoch": 1.5671346648210402, + "step": 15850, + "train/sim_loss": 0.09668457508087158 + }, + { + "epoch": 1.5671346648210402, + "step": 15850, + "train/total_loss": 0.15817934274673462 + }, + { + "entropy": 9.100290298461914, + "epoch": 1.5672335376705555, + "mean_token_accuracy": 0.8585034012794495, + "num_tokens": 48298465.0, + "step": 15851, + "train/ce_loss": 0.5197852849960327 + }, + { + "epoch": 1.5672335376705555, + "step": 15851, + "train/sim_loss": 0.08481597900390625 + }, + { + "epoch": 1.5672335376705555, + "step": 15851, + "train/total_loss": 0.13679450750350952 + }, + { + "entropy": 9.403717994689941, + "epoch": 1.5673324105200712, + "mean_token_accuracy": 0.832524299621582, + "num_tokens": 48312100.0, + "step": 15852, + "train/ce_loss": 0.48858851194381714 + }, + { + "epoch": 1.5673324105200712, + "step": 15852, + "train/sim_loss": 0.08513671159744263 + }, + { + "epoch": 1.5673324105200712, + "step": 15852, + "train/total_loss": 0.13399556279182434 + }, + { + "entropy": 9.00065803527832, + "epoch": 1.5674312833695867, + "mean_token_accuracy": 0.855967104434967, + "num_tokens": 48324566.0, + "step": 15853, + "train/ce_loss": 0.5685367584228516 + }, + { + "epoch": 1.5674312833695867, + "step": 15853, + "train/sim_loss": 0.020473837852478027 + }, + { + "epoch": 1.5674312833695867, + "step": 15853, + "train/total_loss": 0.07732751965522766 + }, + { + "entropy": 9.169309616088867, + "epoch": 1.5675301562191022, + "mean_token_accuracy": 0.8340857625007629, + "num_tokens": 48337941.0, + "step": 15854, + "train/ce_loss": 0.41180816292762756 + }, + { + "epoch": 1.5675301562191022, + "step": 15854, + "train/sim_loss": 0.07812964916229248 + }, + { + "epoch": 1.5675301562191022, + "step": 15854, + "train/total_loss": 0.11931046843528748 + }, + { + "entropy": 9.478955268859863, + "epoch": 1.567629029068618, + "mean_token_accuracy": 0.9061976671218872, + "num_tokens": 48347421.0, + "step": 15855, + "train/ce_loss": 8.274391802842729e-07 + }, + { + "epoch": 1.567629029068618, + "step": 15855, + "train/sim_loss": 0.05585139989852905 + }, + { + "epoch": 1.567629029068618, + "step": 15855, + "train/total_loss": 0.05585148185491562 + }, + { + "entropy": 8.8385009765625, + "epoch": 1.5677279019181332, + "mean_token_accuracy": 0.8778135180473328, + "num_tokens": 48356227.0, + "step": 15856, + "train/ce_loss": 0.3052666485309601 + }, + { + "epoch": 1.5677279019181332, + "step": 15856, + "train/sim_loss": 0.031111598014831543 + }, + { + "epoch": 1.5677279019181332, + "step": 15856, + "train/total_loss": 0.06163826584815979 + }, + { + "entropy": 9.576251983642578, + "epoch": 1.5678267747676489, + "mean_token_accuracy": 0.8909883499145508, + "num_tokens": 48368328.0, + "step": 15857, + "train/ce_loss": 2.543565926771407e-07 + }, + { + "epoch": 1.5678267747676489, + "step": 15857, + "train/sim_loss": 0.016611695289611816 + }, + { + "epoch": 1.5678267747676489, + "step": 15857, + "train/total_loss": 0.016611721366643906 + }, + { + "entropy": 9.307401657104492, + "epoch": 1.5679256476171644, + "mean_token_accuracy": 0.8452380895614624, + "num_tokens": 48389188.0, + "step": 15858, + "train/ce_loss": 0.22627827525138855 + }, + { + "epoch": 1.5679256476171644, + "step": 15858, + "train/sim_loss": 0.03890186548233032 + }, + { + "epoch": 1.5679256476171644, + "step": 15858, + "train/total_loss": 0.061529695987701416 + }, + { + "entropy": 9.398603439331055, + "epoch": 1.5680245204666798, + "mean_token_accuracy": 0.83012455701828, + "num_tokens": 48399553.0, + "step": 15859, + "train/ce_loss": 0.4198402166366577 + }, + { + "epoch": 1.5680245204666798, + "step": 15859, + "train/sim_loss": 0.042168259620666504 + }, + { + "epoch": 1.5680245204666798, + "step": 15859, + "train/total_loss": 0.08415228128433228 + }, + { + "epoch": 1.5681233933161953, + "grad_norm": 0.6186414361000061, + "learning_rate": 6.081441922563418e-06, + "loss": 0.0818, + "step": 15860 + }, + { + "entropy": 9.23475170135498, + "epoch": 1.5681233933161953, + "mean_token_accuracy": 0.8591065406799316, + "num_tokens": 48414027.0, + "step": 15860, + "train/ce_loss": 0.55767822265625 + }, + { + "epoch": 1.5681233933161953, + "step": 15860, + "train/sim_loss": 0.04885941743850708 + }, + { + "epoch": 1.5681233933161953, + "step": 15860, + "train/total_loss": 0.10462723672389984 + }, + { + "entropy": 9.273736953735352, + "epoch": 1.5682222661657108, + "mean_token_accuracy": 0.8416075706481934, + "num_tokens": 48423260.0, + "step": 15861, + "train/ce_loss": 0.8652260303497314 + }, + { + "epoch": 1.5682222661657108, + "step": 15861, + "train/sim_loss": 0.049285054206848145 + }, + { + "epoch": 1.5682222661657108, + "step": 15861, + "train/total_loss": 0.13580766320228577 + }, + { + "entropy": 9.483192443847656, + "epoch": 1.5683211390152265, + "mean_token_accuracy": 0.8855421543121338, + "num_tokens": 48431976.0, + "step": 15862, + "train/ce_loss": 3.3703773283377814e-07 + }, + { + "epoch": 1.5683211390152265, + "step": 15862, + "train/sim_loss": 0.013614535331726074 + }, + { + "epoch": 1.5683211390152265, + "step": 15862, + "train/total_loss": 0.01361456885933876 + }, + { + "entropy": 9.365715026855469, + "epoch": 1.5684200118647418, + "mean_token_accuracy": 0.8029196858406067, + "num_tokens": 48445032.0, + "step": 15863, + "train/ce_loss": 0.3876144587993622 + }, + { + "epoch": 1.5684200118647418, + "step": 15863, + "train/sim_loss": 0.03640460968017578 + }, + { + "epoch": 1.5684200118647418, + "step": 15863, + "train/total_loss": 0.07516606152057648 + }, + { + "entropy": 9.291646957397461, + "epoch": 1.5685188847142575, + "mean_token_accuracy": 0.8850129246711731, + "num_tokens": 48459421.0, + "step": 15864, + "train/ce_loss": 0.599913477897644 + }, + { + "epoch": 1.5685188847142575, + "step": 15864, + "train/sim_loss": 0.031635940074920654 + }, + { + "epoch": 1.5685188847142575, + "step": 15864, + "train/total_loss": 0.09162728488445282 + }, + { + "entropy": 9.692911148071289, + "epoch": 1.568617757563773, + "mean_token_accuracy": 0.8512544631958008, + "num_tokens": 48472960.0, + "step": 15865, + "train/ce_loss": 4.1615791701588023e-07 + }, + { + "epoch": 1.568617757563773, + "step": 15865, + "train/sim_loss": 0.03219008445739746 + }, + { + "epoch": 1.568617757563773, + "step": 15865, + "train/total_loss": 0.032190125435590744 + }, + { + "entropy": 8.992292404174805, + "epoch": 1.5687166304132885, + "mean_token_accuracy": 0.8712394833564758, + "num_tokens": 48484672.0, + "step": 15866, + "train/ce_loss": 0.31775933504104614 + }, + { + "epoch": 1.5687166304132885, + "step": 15866, + "train/sim_loss": 0.03207087516784668 + }, + { + "epoch": 1.5687166304132885, + "step": 15866, + "train/total_loss": 0.06384681165218353 + }, + { + "entropy": 8.950361251831055, + "epoch": 1.5688155032628042, + "mean_token_accuracy": 0.8246346712112427, + "num_tokens": 48494001.0, + "step": 15867, + "train/ce_loss": 0.6218948364257812 + }, + { + "epoch": 1.5688155032628042, + "step": 15867, + "train/sim_loss": 0.08407407999038696 + }, + { + "epoch": 1.5688155032628042, + "step": 15867, + "train/total_loss": 0.14626356959342957 + }, + { + "entropy": 9.377833366394043, + "epoch": 1.5689143761123194, + "mean_token_accuracy": 0.8973104953765869, + "num_tokens": 48508269.0, + "step": 15868, + "train/ce_loss": 2.5881954002215934e-07 + }, + { + "epoch": 1.5689143761123194, + "step": 15868, + "train/sim_loss": 0.04300546646118164 + }, + { + "epoch": 1.5689143761123194, + "step": 15868, + "train/total_loss": 0.04300549253821373 + }, + { + "entropy": 8.960348129272461, + "epoch": 1.5690132489618351, + "mean_token_accuracy": 0.8856707215309143, + "num_tokens": 48518011.0, + "step": 15869, + "train/ce_loss": 0.3069848418235779 + }, + { + "epoch": 1.5690132489618351, + "step": 15869, + "train/sim_loss": 0.022250771522521973 + }, + { + "epoch": 1.5690132489618351, + "step": 15869, + "train/total_loss": 0.05294925719499588 + }, + { + "entropy": 9.077926635742188, + "epoch": 1.5691121218113506, + "mean_token_accuracy": 0.8136958479881287, + "num_tokens": 48528350.0, + "step": 15870, + "train/ce_loss": 0.48755165934562683 + }, + { + "epoch": 1.5691121218113506, + "step": 15870, + "train/sim_loss": 0.040576815605163574 + }, + { + "epoch": 1.5691121218113506, + "step": 15870, + "train/total_loss": 0.0893319845199585 + }, + { + "entropy": 9.336973190307617, + "epoch": 1.569210994660866, + "mean_token_accuracy": 0.8390663266181946, + "num_tokens": 48544463.0, + "step": 15871, + "train/ce_loss": 0.505347728729248 + }, + { + "epoch": 1.569210994660866, + "step": 15871, + "train/sim_loss": 0.02183365821838379 + }, + { + "epoch": 1.569210994660866, + "step": 15871, + "train/total_loss": 0.07236842811107635 + }, + { + "entropy": 9.256305694580078, + "epoch": 1.5693098675103816, + "mean_token_accuracy": 0.875, + "num_tokens": 48562305.0, + "step": 15872, + "train/ce_loss": 0.24637633562088013 + }, + { + "epoch": 1.5693098675103816, + "step": 15872, + "train/sim_loss": 0.017864763736724854 + }, + { + "epoch": 1.5693098675103816, + "step": 15872, + "train/total_loss": 0.04250239580869675 + }, + { + "entropy": 8.956583023071289, + "epoch": 1.569408740359897, + "mean_token_accuracy": 0.8374999761581421, + "num_tokens": 48575209.0, + "step": 15873, + "train/ce_loss": 0.19934453070163727 + }, + { + "epoch": 1.569408740359897, + "step": 15873, + "train/sim_loss": 0.02804785966873169 + }, + { + "epoch": 1.569408740359897, + "step": 15873, + "train/total_loss": 0.047982312738895416 + }, + { + "entropy": 9.68305778503418, + "epoch": 1.5695076132094128, + "mean_token_accuracy": 0.8963730335235596, + "num_tokens": 48585255.0, + "step": 15874, + "train/ce_loss": 0.543398380279541 + }, + { + "epoch": 1.5695076132094128, + "step": 15874, + "train/sim_loss": 0.07557272911071777 + }, + { + "epoch": 1.5695076132094128, + "step": 15874, + "train/total_loss": 0.1299125701189041 + }, + { + "entropy": 9.06590461730957, + "epoch": 1.569606486058928, + "mean_token_accuracy": 0.8271954655647278, + "num_tokens": 48598164.0, + "step": 15875, + "train/ce_loss": 0.6267587542533875 + }, + { + "epoch": 1.569606486058928, + "step": 15875, + "train/sim_loss": 0.013057887554168701 + }, + { + "epoch": 1.569606486058928, + "step": 15875, + "train/total_loss": 0.07573376595973969 + }, + { + "entropy": 9.037946701049805, + "epoch": 1.5697053589084438, + "mean_token_accuracy": 0.9181585907936096, + "num_tokens": 48612575.0, + "step": 15876, + "train/ce_loss": 8.048726272136264e-07 + }, + { + "epoch": 1.5697053589084438, + "step": 15876, + "train/sim_loss": 0.022802114486694336 + }, + { + "epoch": 1.5697053589084438, + "step": 15876, + "train/total_loss": 0.022802194580435753 + }, + { + "entropy": 9.103460311889648, + "epoch": 1.5698042317579592, + "mean_token_accuracy": 0.8696236610412598, + "num_tokens": 48624852.0, + "step": 15877, + "train/ce_loss": 0.2389717996120453 + }, + { + "epoch": 1.5698042317579592, + "step": 15877, + "train/sim_loss": 0.04377174377441406 + }, + { + "epoch": 1.5698042317579592, + "step": 15877, + "train/total_loss": 0.06766892224550247 + }, + { + "entropy": 9.456571578979492, + "epoch": 1.5699031046074747, + "mean_token_accuracy": 0.8544935584068298, + "num_tokens": 48639972.0, + "step": 15878, + "train/ce_loss": 0.43453270196914673 + }, + { + "epoch": 1.5699031046074747, + "step": 15878, + "train/sim_loss": 0.0441511869430542 + }, + { + "epoch": 1.5699031046074747, + "step": 15878, + "train/total_loss": 0.08760446310043335 + }, + { + "entropy": 8.715633392333984, + "epoch": 1.5700019774569904, + "mean_token_accuracy": 0.8584905862808228, + "num_tokens": 48653368.0, + "step": 15879, + "train/ce_loss": 0.6537434458732605 + }, + { + "epoch": 1.5700019774569904, + "step": 15879, + "train/sim_loss": 0.07518601417541504 + }, + { + "epoch": 1.5700019774569904, + "step": 15879, + "train/total_loss": 0.1405603587627411 + }, + { + "epoch": 1.5701008503065057, + "grad_norm": 0.5638779401779175, + "learning_rate": 6.076497057805469e-06, + "loss": 0.0838, + "step": 15880 + }, + { + "entropy": 9.86469554901123, + "epoch": 1.5701008503065057, + "mean_token_accuracy": 0.8875219821929932, + "num_tokens": 48666144.0, + "step": 15880, + "train/ce_loss": 2.1118123072483286e-07 + }, + { + "epoch": 1.5701008503065057, + "step": 15880, + "train/sim_loss": 0.014827311038970947 + }, + { + "epoch": 1.5701008503065057, + "step": 15880, + "train/total_loss": 0.014827332459390163 + }, + { + "entropy": 8.888524055480957, + "epoch": 1.5701997231560214, + "mean_token_accuracy": 0.8949275612831116, + "num_tokens": 48682917.0, + "step": 15881, + "train/ce_loss": 0.5262683629989624 + }, + { + "epoch": 1.5701997231560214, + "step": 15881, + "train/sim_loss": 0.0267258882522583 + }, + { + "epoch": 1.5701997231560214, + "step": 15881, + "train/total_loss": 0.0793527215719223 + }, + { + "entropy": 9.27767276763916, + "epoch": 1.570298596005537, + "mean_token_accuracy": 0.8491379022598267, + "num_tokens": 48693659.0, + "step": 15882, + "train/ce_loss": 0.6360225677490234 + }, + { + "epoch": 1.570298596005537, + "step": 15882, + "train/sim_loss": 0.05795085430145264 + }, + { + "epoch": 1.570298596005537, + "step": 15882, + "train/total_loss": 0.12155311554670334 + }, + { + "entropy": 9.368104934692383, + "epoch": 1.5703974688550524, + "mean_token_accuracy": 0.8118081092834473, + "num_tokens": 48703470.0, + "step": 15883, + "train/ce_loss": 0.6402401328086853 + }, + { + "epoch": 1.5703974688550524, + "step": 15883, + "train/sim_loss": 0.03456830978393555 + }, + { + "epoch": 1.5703974688550524, + "step": 15883, + "train/total_loss": 0.09859232604503632 + }, + { + "entropy": 8.898174285888672, + "epoch": 1.570496341704568, + "mean_token_accuracy": 0.8368356227874756, + "num_tokens": 48716784.0, + "step": 15884, + "train/ce_loss": 0.2915784418582916 + }, + { + "epoch": 1.570496341704568, + "step": 15884, + "train/sim_loss": 0.008366525173187256 + }, + { + "epoch": 1.570496341704568, + "step": 15884, + "train/total_loss": 0.03752437233924866 + }, + { + "entropy": 9.16317081451416, + "epoch": 1.5705952145540834, + "mean_token_accuracy": 0.9059633016586304, + "num_tokens": 48732092.0, + "step": 15885, + "train/ce_loss": 0.5754201412200928 + }, + { + "epoch": 1.5705952145540834, + "step": 15885, + "train/sim_loss": 0.050389766693115234 + }, + { + "epoch": 1.5705952145540834, + "step": 15885, + "train/total_loss": 0.10793177783489227 + }, + { + "entropy": 8.828998565673828, + "epoch": 1.570694087403599, + "mean_token_accuracy": 0.8497899174690247, + "num_tokens": 48739658.0, + "step": 15886, + "train/ce_loss": 0.5289114117622375 + }, + { + "epoch": 1.570694087403599, + "step": 15886, + "train/sim_loss": 0.044103801250457764 + }, + { + "epoch": 1.570694087403599, + "step": 15886, + "train/total_loss": 0.09699494391679764 + }, + { + "entropy": 9.532485961914062, + "epoch": 1.5707929602531145, + "mean_token_accuracy": 0.9207161068916321, + "num_tokens": 48754071.0, + "step": 15887, + "train/ce_loss": 0.9191177487373352 + }, + { + "epoch": 1.5707929602531145, + "step": 15887, + "train/sim_loss": 0.05799978971481323 + }, + { + "epoch": 1.5707929602531145, + "step": 15887, + "train/total_loss": 0.149911567568779 + }, + { + "entropy": 9.09381103515625, + "epoch": 1.57089183310263, + "mean_token_accuracy": 0.801257848739624, + "num_tokens": 48765954.0, + "step": 15888, + "train/ce_loss": 0.5982921123504639 + }, + { + "epoch": 1.57089183310263, + "step": 15888, + "train/sim_loss": 0.013082146644592285 + }, + { + "epoch": 1.57089183310263, + "step": 15888, + "train/total_loss": 0.07291135936975479 + }, + { + "entropy": 9.383919715881348, + "epoch": 1.5709907059521455, + "mean_token_accuracy": 0.900943398475647, + "num_tokens": 48781445.0, + "step": 15889, + "train/ce_loss": 3.598743205657229e-07 + }, + { + "epoch": 1.5709907059521455, + "step": 15889, + "train/sim_loss": 0.03702712059020996 + }, + { + "epoch": 1.5709907059521455, + "step": 15889, + "train/total_loss": 0.037027157843112946 + }, + { + "entropy": 9.328116416931152, + "epoch": 1.571089578801661, + "mean_token_accuracy": 0.7943078875541687, + "num_tokens": 48794872.0, + "step": 15890, + "train/ce_loss": 0.7175344824790955 + }, + { + "epoch": 1.571089578801661, + "step": 15890, + "train/sim_loss": 0.04640251398086548 + }, + { + "epoch": 1.571089578801661, + "step": 15890, + "train/total_loss": 0.11815596371889114 + }, + { + "entropy": 9.50864028930664, + "epoch": 1.5711884516511767, + "mean_token_accuracy": 0.9034335017204285, + "num_tokens": 48812378.0, + "step": 15891, + "train/ce_loss": 3.5506670315044175e-07 + }, + { + "epoch": 1.5711884516511767, + "step": 15891, + "train/sim_loss": 0.04120737314224243 + }, + { + "epoch": 1.5711884516511767, + "step": 15891, + "train/total_loss": 0.041207410395145416 + }, + { + "entropy": 9.711718559265137, + "epoch": 1.571287324500692, + "mean_token_accuracy": 0.9127725958824158, + "num_tokens": 48824661.0, + "step": 15892, + "train/ce_loss": 0.6368452310562134 + }, + { + "epoch": 1.571287324500692, + "step": 15892, + "train/sim_loss": 0.027692675590515137 + }, + { + "epoch": 1.571287324500692, + "step": 15892, + "train/total_loss": 0.09137719869613647 + }, + { + "entropy": 9.269386291503906, + "epoch": 1.5713861973502077, + "mean_token_accuracy": 0.8506900668144226, + "num_tokens": 48844080.0, + "step": 15893, + "train/ce_loss": 0.1946335732936859 + }, + { + "epoch": 1.5713861973502077, + "step": 15893, + "train/sim_loss": 0.013424515724182129 + }, + { + "epoch": 1.5713861973502077, + "step": 15893, + "train/total_loss": 0.03288787603378296 + }, + { + "entropy": 9.589265823364258, + "epoch": 1.5714850701997232, + "mean_token_accuracy": 0.9020044803619385, + "num_tokens": 48856322.0, + "step": 15894, + "train/ce_loss": 2.7399360646995774e-07 + }, + { + "epoch": 1.5714850701997232, + "step": 15894, + "train/sim_loss": 0.014137983322143555 + }, + { + "epoch": 1.5714850701997232, + "step": 15894, + "train/total_loss": 0.014138010330498219 + }, + { + "entropy": 9.002460479736328, + "epoch": 1.5715839430492387, + "mean_token_accuracy": 0.849711000919342, + "num_tokens": 48864100.0, + "step": 15895, + "train/ce_loss": 0.440441757440567 + }, + { + "epoch": 1.5715839430492387, + "step": 15895, + "train/sim_loss": 0.012631535530090332 + }, + { + "epoch": 1.5715839430492387, + "step": 15895, + "train/total_loss": 0.05667571350932121 + }, + { + "entropy": 9.469149589538574, + "epoch": 1.5716828158987544, + "mean_token_accuracy": 0.9027552604675293, + "num_tokens": 48878352.0, + "step": 15896, + "train/ce_loss": 0.31707292795181274 + }, + { + "epoch": 1.5716828158987544, + "step": 15896, + "train/sim_loss": 0.013952851295471191 + }, + { + "epoch": 1.5716828158987544, + "step": 15896, + "train/total_loss": 0.045660145580768585 + }, + { + "entropy": 9.410257339477539, + "epoch": 1.5717816887482696, + "mean_token_accuracy": 0.8600000143051147, + "num_tokens": 48891737.0, + "step": 15897, + "train/ce_loss": 0.4242422580718994 + }, + { + "epoch": 1.5717816887482696, + "step": 15897, + "train/sim_loss": 0.024347901344299316 + }, + { + "epoch": 1.5717816887482696, + "step": 15897, + "train/total_loss": 0.06677213311195374 + }, + { + "entropy": 8.994909286499023, + "epoch": 1.5718805615977853, + "mean_token_accuracy": 0.8677248954772949, + "num_tokens": 48901481.0, + "step": 15898, + "train/ce_loss": 0.30850690603256226 + }, + { + "epoch": 1.5718805615977853, + "step": 15898, + "train/sim_loss": 0.08128482103347778 + }, + { + "epoch": 1.5718805615977853, + "step": 15898, + "train/total_loss": 0.11213551461696625 + }, + { + "entropy": 8.77490234375, + "epoch": 1.5719794344473008, + "mean_token_accuracy": 0.8420413136482239, + "num_tokens": 48911954.0, + "step": 15899, + "train/ce_loss": 0.47682657837867737 + }, + { + "epoch": 1.5719794344473008, + "step": 15899, + "train/sim_loss": 0.018493950366973877 + }, + { + "epoch": 1.5719794344473008, + "step": 15899, + "train/total_loss": 0.06617660820484161 + }, + { + "epoch": 1.5720783072968163, + "grad_norm": 0.4977512061595917, + "learning_rate": 6.07155219304752e-06, + "loss": 0.0763, + "step": 15900 + }, + { + "entropy": 8.939138412475586, + "epoch": 1.5720783072968163, + "mean_token_accuracy": 0.8244365453720093, + "num_tokens": 48924924.0, + "step": 15900, + "train/ce_loss": 0.592671811580658 + }, + { + "epoch": 1.5720783072968163, + "step": 15900, + "train/sim_loss": 0.043321967124938965 + }, + { + "epoch": 1.5720783072968163, + "step": 15900, + "train/total_loss": 0.10258914530277252 + }, + { + "entropy": 9.084707260131836, + "epoch": 1.5721771801463318, + "mean_token_accuracy": 0.8769771456718445, + "num_tokens": 48943268.0, + "step": 15901, + "train/ce_loss": 0.4623299241065979 + }, + { + "epoch": 1.5721771801463318, + "step": 15901, + "train/sim_loss": 0.060661375522613525 + }, + { + "epoch": 1.5721771801463318, + "step": 15901, + "train/total_loss": 0.10689437389373779 + }, + { + "entropy": 8.949701309204102, + "epoch": 1.5722760529958473, + "mean_token_accuracy": 0.9136302471160889, + "num_tokens": 48957646.0, + "step": 15902, + "train/ce_loss": 0.33727794885635376 + }, + { + "epoch": 1.5722760529958473, + "step": 15902, + "train/sim_loss": 0.05082195997238159 + }, + { + "epoch": 1.5722760529958473, + "step": 15902, + "train/total_loss": 0.08454975485801697 + }, + { + "entropy": 9.38361930847168, + "epoch": 1.572374925845363, + "mean_token_accuracy": 0.905063271522522, + "num_tokens": 48973048.0, + "step": 15903, + "train/ce_loss": 0.2499748170375824 + }, + { + "epoch": 1.572374925845363, + "step": 15903, + "train/sim_loss": 0.07595032453536987 + }, + { + "epoch": 1.572374925845363, + "step": 15903, + "train/total_loss": 0.100947804749012 + }, + { + "entropy": 9.58840274810791, + "epoch": 1.5724737986948782, + "mean_token_accuracy": 0.9271708726882935, + "num_tokens": 48984779.0, + "step": 15904, + "train/ce_loss": 0.5735549926757812 + }, + { + "epoch": 1.5724737986948782, + "step": 15904, + "train/sim_loss": 0.03928029537200928 + }, + { + "epoch": 1.5724737986948782, + "step": 15904, + "train/total_loss": 0.09663579612970352 + }, + { + "entropy": 9.016256332397461, + "epoch": 1.572572671544394, + "mean_token_accuracy": 0.8791848421096802, + "num_tokens": 49003924.0, + "step": 15905, + "train/ce_loss": 0.5107579231262207 + }, + { + "epoch": 1.572572671544394, + "step": 15905, + "train/sim_loss": 0.02506101131439209 + }, + { + "epoch": 1.572572671544394, + "step": 15905, + "train/total_loss": 0.07613680511713028 + }, + { + "entropy": 8.470539093017578, + "epoch": 1.5726715443939094, + "mean_token_accuracy": 0.860773503780365, + "num_tokens": 49010729.0, + "step": 15906, + "train/ce_loss": 0.5016570687294006 + }, + { + "epoch": 1.5726715443939094, + "step": 15906, + "train/sim_loss": 0.059241652488708496 + }, + { + "epoch": 1.5726715443939094, + "step": 15906, + "train/total_loss": 0.10940736532211304 + }, + { + "entropy": 9.348677635192871, + "epoch": 1.572770417243425, + "mean_token_accuracy": 0.8151751160621643, + "num_tokens": 49019156.0, + "step": 15907, + "train/ce_loss": 0.7255646586418152 + }, + { + "epoch": 1.572770417243425, + "step": 15907, + "train/sim_loss": 0.051534175872802734 + }, + { + "epoch": 1.572770417243425, + "step": 15907, + "train/total_loss": 0.12409064173698425 + }, + { + "entropy": 8.837939262390137, + "epoch": 1.5728692900929406, + "mean_token_accuracy": 0.8153526782989502, + "num_tokens": 49026397.0, + "step": 15908, + "train/ce_loss": 0.7342962026596069 + }, + { + "epoch": 1.5728692900929406, + "step": 15908, + "train/sim_loss": 0.07915639877319336 + }, + { + "epoch": 1.5728692900929406, + "step": 15908, + "train/total_loss": 0.15258601307868958 + }, + { + "entropy": 8.781536102294922, + "epoch": 1.572968162942456, + "mean_token_accuracy": 0.8679678440093994, + "num_tokens": 49040405.0, + "step": 15909, + "train/ce_loss": 0.3872874081134796 + }, + { + "epoch": 1.572968162942456, + "step": 15909, + "train/sim_loss": 0.03183424472808838 + }, + { + "epoch": 1.572968162942456, + "step": 15909, + "train/total_loss": 0.07056298851966858 + }, + { + "entropy": 9.029507637023926, + "epoch": 1.5730670357919716, + "mean_token_accuracy": 0.8269230723381042, + "num_tokens": 49053862.0, + "step": 15910, + "train/ce_loss": 0.5642181038856506 + }, + { + "epoch": 1.5730670357919716, + "step": 15910, + "train/sim_loss": 0.04439806938171387 + }, + { + "epoch": 1.5730670357919716, + "step": 15910, + "train/total_loss": 0.10081988573074341 + }, + { + "entropy": 9.25462532043457, + "epoch": 1.573165908641487, + "mean_token_accuracy": 0.8802469372749329, + "num_tokens": 49065115.0, + "step": 15911, + "train/ce_loss": 0.31366050243377686 + }, + { + "epoch": 1.573165908641487, + "step": 15911, + "train/sim_loss": 0.060683369636535645 + }, + { + "epoch": 1.573165908641487, + "step": 15911, + "train/total_loss": 0.09204941987991333 + }, + { + "entropy": 9.775318145751953, + "epoch": 1.5732647814910026, + "mean_token_accuracy": 0.8566775321960449, + "num_tokens": 49080819.0, + "step": 15912, + "train/ce_loss": 0.4579221308231354 + }, + { + "epoch": 1.5732647814910026, + "step": 15912, + "train/sim_loss": 0.01967555284500122 + }, + { + "epoch": 1.5732647814910026, + "step": 15912, + "train/total_loss": 0.06546776741743088 + }, + { + "entropy": 9.301116943359375, + "epoch": 1.573363654340518, + "mean_token_accuracy": 0.8269841074943542, + "num_tokens": 49090636.0, + "step": 15913, + "train/ce_loss": 0.3150844871997833 + }, + { + "epoch": 1.573363654340518, + "step": 15913, + "train/sim_loss": 0.03924614191055298 + }, + { + "epoch": 1.573363654340518, + "step": 15913, + "train/total_loss": 0.07075458765029907 + }, + { + "entropy": 9.100683212280273, + "epoch": 1.5734625271900335, + "mean_token_accuracy": 0.8732782602310181, + "num_tokens": 49097628.0, + "step": 15914, + "train/ce_loss": 0.3864707350730896 + }, + { + "epoch": 1.5734625271900335, + "step": 15914, + "train/sim_loss": 0.06106346845626831 + }, + { + "epoch": 1.5734625271900335, + "step": 15914, + "train/total_loss": 0.09971053898334503 + }, + { + "entropy": 9.286470413208008, + "epoch": 1.5735614000395493, + "mean_token_accuracy": 0.8687350749969482, + "num_tokens": 49107347.0, + "step": 15915, + "train/ce_loss": 0.5016193389892578 + }, + { + "epoch": 1.5735614000395493, + "step": 15915, + "train/sim_loss": 0.06802284717559814 + }, + { + "epoch": 1.5735614000395493, + "step": 15915, + "train/total_loss": 0.11818478256464005 + }, + { + "entropy": 9.52770709991455, + "epoch": 1.5736602728890645, + "mean_token_accuracy": 0.7984595894813538, + "num_tokens": 49124839.0, + "step": 15916, + "train/ce_loss": 0.33142489194869995 + }, + { + "epoch": 1.5736602728890645, + "step": 15916, + "train/sim_loss": 0.03731203079223633 + }, + { + "epoch": 1.5736602728890645, + "step": 15916, + "train/total_loss": 0.07045452296733856 + }, + { + "entropy": 9.252232551574707, + "epoch": 1.5737591457385802, + "mean_token_accuracy": 0.8590381145477295, + "num_tokens": 49138793.0, + "step": 15917, + "train/ce_loss": 3.068178955345502e-07 + }, + { + "epoch": 1.5737591457385802, + "step": 15917, + "train/sim_loss": 0.029697179794311523 + }, + { + "epoch": 1.5737591457385802, + "step": 15917, + "train/total_loss": 0.02969720959663391 + }, + { + "entropy": 8.981443405151367, + "epoch": 1.5738580185880957, + "mean_token_accuracy": 0.8201892971992493, + "num_tokens": 49149900.0, + "step": 15918, + "train/ce_loss": 0.31949183344841003 + }, + { + "epoch": 1.5738580185880957, + "step": 15918, + "train/sim_loss": 0.03305208683013916 + }, + { + "epoch": 1.5738580185880957, + "step": 15918, + "train/total_loss": 0.06500127166509628 + }, + { + "entropy": 9.4169921875, + "epoch": 1.5739568914376112, + "mean_token_accuracy": 0.8095238208770752, + "num_tokens": 49160933.0, + "step": 15919, + "train/ce_loss": 0.6002764105796814 + }, + { + "epoch": 1.5739568914376112, + "step": 15919, + "train/sim_loss": 0.07006657123565674 + }, + { + "epoch": 1.5739568914376112, + "step": 15919, + "train/total_loss": 0.13009421527385712 + }, + { + "epoch": 1.574055764287127, + "grad_norm": 0.666652500629425, + "learning_rate": 6.066607328289572e-06, + "loss": 0.0815, + "step": 15920 + }, + { + "entropy": 9.258638381958008, + "epoch": 1.574055764287127, + "mean_token_accuracy": 0.8585164546966553, + "num_tokens": 49172999.0, + "step": 15920, + "train/ce_loss": 0.22044621407985687 + }, + { + "epoch": 1.574055764287127, + "step": 15920, + "train/sim_loss": 0.06491982936859131 + }, + { + "epoch": 1.574055764287127, + "step": 15920, + "train/total_loss": 0.086964450776577 + }, + { + "entropy": 9.666000366210938, + "epoch": 1.5741546371366422, + "mean_token_accuracy": 0.9145728349685669, + "num_tokens": 49185075.0, + "step": 15921, + "train/ce_loss": 0.503993034362793 + }, + { + "epoch": 1.5741546371366422, + "step": 15921, + "train/sim_loss": 0.047968149185180664 + }, + { + "epoch": 1.5741546371366422, + "step": 15921, + "train/total_loss": 0.09836745262145996 + }, + { + "entropy": 9.141494750976562, + "epoch": 1.5742535099861579, + "mean_token_accuracy": 0.8538283109664917, + "num_tokens": 49195357.0, + "step": 15922, + "train/ce_loss": 0.565011203289032 + }, + { + "epoch": 1.5742535099861579, + "step": 15922, + "train/sim_loss": 0.07426774501800537 + }, + { + "epoch": 1.5742535099861579, + "step": 15922, + "train/total_loss": 0.13076886534690857 + }, + { + "entropy": 9.248133659362793, + "epoch": 1.5743523828356734, + "mean_token_accuracy": 0.8565683364868164, + "num_tokens": 49204482.0, + "step": 15923, + "train/ce_loss": 0.2149181067943573 + }, + { + "epoch": 1.5743523828356734, + "step": 15923, + "train/sim_loss": 0.02886795997619629 + }, + { + "epoch": 1.5743523828356734, + "step": 15923, + "train/total_loss": 0.05035977065563202 + }, + { + "entropy": 9.271732330322266, + "epoch": 1.5744512556851888, + "mean_token_accuracy": 0.8703703880310059, + "num_tokens": 49214880.0, + "step": 15924, + "train/ce_loss": 0.6173175573348999 + }, + { + "epoch": 1.5744512556851888, + "step": 15924, + "train/sim_loss": 0.03672587871551514 + }, + { + "epoch": 1.5744512556851888, + "step": 15924, + "train/total_loss": 0.09845763444900513 + }, + { + "entropy": 8.967631340026855, + "epoch": 1.5745501285347043, + "mean_token_accuracy": 0.867132842540741, + "num_tokens": 49225877.0, + "step": 15925, + "train/ce_loss": 0.546755850315094 + }, + { + "epoch": 1.5745501285347043, + "step": 15925, + "train/sim_loss": 0.051055312156677246 + }, + { + "epoch": 1.5745501285347043, + "step": 15925, + "train/total_loss": 0.10573089867830276 + }, + { + "entropy": 8.878446578979492, + "epoch": 1.5746490013842198, + "mean_token_accuracy": 0.8417639136314392, + "num_tokens": 49233444.0, + "step": 15926, + "train/ce_loss": 0.7176011204719543 + }, + { + "epoch": 1.5746490013842198, + "step": 15926, + "train/sim_loss": 0.060309410095214844 + }, + { + "epoch": 1.5746490013842198, + "step": 15926, + "train/total_loss": 0.13206952810287476 + }, + { + "entropy": 8.77463150024414, + "epoch": 1.5747478742337355, + "mean_token_accuracy": 0.814612865447998, + "num_tokens": 49242630.0, + "step": 15927, + "train/ce_loss": 0.3437041640281677 + }, + { + "epoch": 1.5747478742337355, + "step": 15927, + "train/sim_loss": 0.09109622240066528 + }, + { + "epoch": 1.5747478742337355, + "step": 15927, + "train/total_loss": 0.12546664476394653 + }, + { + "entropy": 9.602570533752441, + "epoch": 1.5748467470832508, + "mean_token_accuracy": 0.81717449426651, + "num_tokens": 49253690.0, + "step": 15928, + "train/ce_loss": 0.9084250926971436 + }, + { + "epoch": 1.5748467470832508, + "step": 15928, + "train/sim_loss": 0.05321323871612549 + }, + { + "epoch": 1.5748467470832508, + "step": 15928, + "train/total_loss": 0.14405575394630432 + }, + { + "entropy": 9.033856391906738, + "epoch": 1.5749456199327665, + "mean_token_accuracy": 0.8493826985359192, + "num_tokens": 49268230.0, + "step": 15929, + "train/ce_loss": 0.44195258617401123 + }, + { + "epoch": 1.5749456199327665, + "step": 15929, + "train/sim_loss": 0.09711742401123047 + }, + { + "epoch": 1.5749456199327665, + "step": 15929, + "train/total_loss": 0.14131268858909607 + }, + { + "entropy": 8.762190818786621, + "epoch": 1.575044492782282, + "mean_token_accuracy": 0.8383937478065491, + "num_tokens": 49275972.0, + "step": 15930, + "train/ce_loss": 0.5990592241287231 + }, + { + "epoch": 1.575044492782282, + "step": 15930, + "train/sim_loss": 0.050846755504608154 + }, + { + "epoch": 1.575044492782282, + "step": 15930, + "train/total_loss": 0.11075267940759659 + }, + { + "entropy": 8.614356994628906, + "epoch": 1.5751433656317975, + "mean_token_accuracy": 0.8197064995765686, + "num_tokens": 49284592.0, + "step": 15931, + "train/ce_loss": 0.4539239704608917 + }, + { + "epoch": 1.5751433656317975, + "step": 15931, + "train/sim_loss": 0.04961287975311279 + }, + { + "epoch": 1.5751433656317975, + "step": 15931, + "train/total_loss": 0.09500527381896973 + }, + { + "entropy": 9.209739685058594, + "epoch": 1.5752422384813132, + "mean_token_accuracy": 0.8425414562225342, + "num_tokens": 49297873.0, + "step": 15932, + "train/ce_loss": 0.30936169624328613 + }, + { + "epoch": 1.5752422384813132, + "step": 15932, + "train/sim_loss": 0.04240775108337402 + }, + { + "epoch": 1.5752422384813132, + "step": 15932, + "train/total_loss": 0.0733439177274704 + }, + { + "entropy": 9.592315673828125, + "epoch": 1.5753411113308284, + "mean_token_accuracy": 0.9188255667686462, + "num_tokens": 49312688.0, + "step": 15933, + "train/ce_loss": 0.5324755311012268 + }, + { + "epoch": 1.5753411113308284, + "step": 15933, + "train/sim_loss": 0.029326319694519043 + }, + { + "epoch": 1.5753411113308284, + "step": 15933, + "train/total_loss": 0.08257387578487396 + }, + { + "entropy": 9.11993408203125, + "epoch": 1.5754399841803441, + "mean_token_accuracy": 0.8223463892936707, + "num_tokens": 49331162.0, + "step": 15934, + "train/ce_loss": 0.30410832166671753 + }, + { + "epoch": 1.5754399841803441, + "step": 15934, + "train/sim_loss": 0.022069573402404785 + }, + { + "epoch": 1.5754399841803441, + "step": 15934, + "train/total_loss": 0.05248040705919266 + }, + { + "entropy": 9.133321762084961, + "epoch": 1.5755388570298596, + "mean_token_accuracy": 0.8717201352119446, + "num_tokens": 49341529.0, + "step": 15935, + "train/ce_loss": 0.6455310583114624 + }, + { + "epoch": 1.5755388570298596, + "step": 15935, + "train/sim_loss": 0.07080519199371338 + }, + { + "epoch": 1.5755388570298596, + "step": 15935, + "train/total_loss": 0.1353583037853241 + }, + { + "entropy": 9.279413223266602, + "epoch": 1.5756377298793751, + "mean_token_accuracy": 0.9073482155799866, + "num_tokens": 49354716.0, + "step": 15936, + "train/ce_loss": 0.272725909948349 + }, + { + "epoch": 1.5756377298793751, + "step": 15936, + "train/sim_loss": 0.027530431747436523 + }, + { + "epoch": 1.5756377298793751, + "step": 15936, + "train/total_loss": 0.054803021252155304 + }, + { + "entropy": 9.718352317810059, + "epoch": 1.5757366027288906, + "mean_token_accuracy": 0.9213114976882935, + "num_tokens": 49365473.0, + "step": 15937, + "train/ce_loss": 0.2616370916366577 + }, + { + "epoch": 1.5757366027288906, + "step": 15937, + "train/sim_loss": 0.034931719303131104 + }, + { + "epoch": 1.5757366027288906, + "step": 15937, + "train/total_loss": 0.061095431447029114 + }, + { + "entropy": 9.685508728027344, + "epoch": 1.575835475578406, + "mean_token_accuracy": 0.8835098147392273, + "num_tokens": 49384573.0, + "step": 15938, + "train/ce_loss": 0.6681841015815735 + }, + { + "epoch": 1.575835475578406, + "step": 15938, + "train/sim_loss": 0.05484670400619507 + }, + { + "epoch": 1.575835475578406, + "step": 15938, + "train/total_loss": 0.1216651126742363 + }, + { + "entropy": 9.03739070892334, + "epoch": 1.5759343484279218, + "mean_token_accuracy": 0.8674699068069458, + "num_tokens": 49392047.0, + "step": 15939, + "train/ce_loss": 0.4023229777812958 + }, + { + "epoch": 1.5759343484279218, + "step": 15939, + "train/sim_loss": 0.013317763805389404 + }, + { + "epoch": 1.5759343484279218, + "step": 15939, + "train/total_loss": 0.05355006083846092 + }, + { + "epoch": 1.576033221277437, + "grad_norm": 0.5476851463317871, + "learning_rate": 6.061662463531622e-06, + "loss": 0.0864, + "step": 15940 + }, + { + "entropy": 9.256002426147461, + "epoch": 1.576033221277437, + "mean_token_accuracy": 0.8305084705352783, + "num_tokens": 49402470.0, + "step": 15940, + "train/ce_loss": 0.4186308681964874 + }, + { + "epoch": 1.576033221277437, + "step": 15940, + "train/sim_loss": 0.027724087238311768 + }, + { + "epoch": 1.576033221277437, + "step": 15940, + "train/total_loss": 0.06958717107772827 + }, + { + "entropy": 9.910099983215332, + "epoch": 1.5761320941269528, + "mean_token_accuracy": 0.855967104434967, + "num_tokens": 49412124.0, + "step": 15941, + "train/ce_loss": 3.365304905855737e-07 + }, + { + "epoch": 1.5761320941269528, + "step": 15941, + "train/sim_loss": 0.01220768690109253 + }, + { + "epoch": 1.5761320941269528, + "step": 15941, + "train/total_loss": 0.012207720428705215 + }, + { + "entropy": 9.840921401977539, + "epoch": 1.5762309669764683, + "mean_token_accuracy": 0.9014778137207031, + "num_tokens": 49426091.0, + "step": 15942, + "train/ce_loss": 6.482997036982852e-07 + }, + { + "epoch": 1.5762309669764683, + "step": 15942, + "train/sim_loss": 0.07065439224243164 + }, + { + "epoch": 1.5762309669764683, + "step": 15942, + "train/total_loss": 0.07065445929765701 + }, + { + "entropy": 9.231411933898926, + "epoch": 1.5763298398259837, + "mean_token_accuracy": 0.85173499584198, + "num_tokens": 49434936.0, + "step": 15943, + "train/ce_loss": 9.002293950288731e-07 + }, + { + "epoch": 1.5763298398259837, + "step": 15943, + "train/sim_loss": 0.05034446716308594 + }, + { + "epoch": 1.5763298398259837, + "step": 15943, + "train/total_loss": 0.0503445565700531 + }, + { + "entropy": 9.674810409545898, + "epoch": 1.5764287126754994, + "mean_token_accuracy": 0.8479166626930237, + "num_tokens": 49444175.0, + "step": 15944, + "train/ce_loss": 0.3885717988014221 + }, + { + "epoch": 1.5764287126754994, + "step": 15944, + "train/sim_loss": 0.04361152648925781 + }, + { + "epoch": 1.5764287126754994, + "step": 15944, + "train/total_loss": 0.08246870338916779 + }, + { + "entropy": 9.1126070022583, + "epoch": 1.5765275855250147, + "mean_token_accuracy": 0.9054487347602844, + "num_tokens": 49458107.0, + "step": 15945, + "train/ce_loss": 0.14275458455085754 + }, + { + "epoch": 1.5765275855250147, + "step": 15945, + "train/sim_loss": 0.03749823570251465 + }, + { + "epoch": 1.5765275855250147, + "step": 15945, + "train/total_loss": 0.05177369341254234 + }, + { + "entropy": 9.445106506347656, + "epoch": 1.5766264583745304, + "mean_token_accuracy": 0.8575342297554016, + "num_tokens": 49477701.0, + "step": 15946, + "train/ce_loss": 0.4769849479198456 + }, + { + "epoch": 1.5766264583745304, + "step": 15946, + "train/sim_loss": 0.07144761085510254 + }, + { + "epoch": 1.5766264583745304, + "step": 15946, + "train/total_loss": 0.11914610862731934 + }, + { + "entropy": 8.820891380310059, + "epoch": 1.576725331224046, + "mean_token_accuracy": 0.8529089093208313, + "num_tokens": 49489460.0, + "step": 15947, + "train/ce_loss": 0.29545947909355164 + }, + { + "epoch": 1.576725331224046, + "step": 15947, + "train/sim_loss": 0.03449892997741699 + }, + { + "epoch": 1.576725331224046, + "step": 15947, + "train/total_loss": 0.06404487788677216 + }, + { + "entropy": 9.294071197509766, + "epoch": 1.5768242040735614, + "mean_token_accuracy": 0.8450704216957092, + "num_tokens": 49502843.0, + "step": 15948, + "train/ce_loss": 0.44735631346702576 + }, + { + "epoch": 1.5768242040735614, + "step": 15948, + "train/sim_loss": 0.023726940155029297 + }, + { + "epoch": 1.5768242040735614, + "step": 15948, + "train/total_loss": 0.06846257299184799 + }, + { + "entropy": 9.202743530273438, + "epoch": 1.5769230769230769, + "mean_token_accuracy": 0.9019354581832886, + "num_tokens": 49515256.0, + "step": 15949, + "train/ce_loss": 0.2678481340408325 + }, + { + "epoch": 1.5769230769230769, + "step": 15949, + "train/sim_loss": 0.07027047872543335 + }, + { + "epoch": 1.5769230769230769, + "step": 15949, + "train/total_loss": 0.09705529361963272 + }, + { + "entropy": 9.537704467773438, + "epoch": 1.5770219497725924, + "mean_token_accuracy": 0.862500011920929, + "num_tokens": 49524944.0, + "step": 15950, + "train/ce_loss": 0.29793214797973633 + }, + { + "epoch": 1.5770219497725924, + "step": 15950, + "train/sim_loss": 0.07477909326553345 + }, + { + "epoch": 1.5770219497725924, + "step": 15950, + "train/total_loss": 0.10457231104373932 + }, + { + "entropy": 9.129077911376953, + "epoch": 1.577120822622108, + "mean_token_accuracy": 0.8672438859939575, + "num_tokens": 49540655.0, + "step": 15951, + "train/ce_loss": 0.67886883020401 + }, + { + "epoch": 1.577120822622108, + "step": 15951, + "train/sim_loss": 0.036888301372528076 + }, + { + "epoch": 1.577120822622108, + "step": 15951, + "train/total_loss": 0.10477518290281296 + }, + { + "entropy": 8.885499954223633, + "epoch": 1.5772196954716233, + "mean_token_accuracy": 0.872474730014801, + "num_tokens": 49553998.0, + "step": 15952, + "train/ce_loss": 0.2649514973163605 + }, + { + "epoch": 1.5772196954716233, + "step": 15952, + "train/sim_loss": 0.051135361194610596 + }, + { + "epoch": 1.5772196954716233, + "step": 15952, + "train/total_loss": 0.07763051241636276 + }, + { + "entropy": 9.207929611206055, + "epoch": 1.577318568321139, + "mean_token_accuracy": 0.9019337296485901, + "num_tokens": 49561923.0, + "step": 15953, + "train/ce_loss": 0.37492311000823975 + }, + { + "epoch": 1.577318568321139, + "step": 15953, + "train/sim_loss": 0.03973841667175293 + }, + { + "epoch": 1.577318568321139, + "step": 15953, + "train/total_loss": 0.07723072916269302 + }, + { + "entropy": 9.298975944519043, + "epoch": 1.5774174411706545, + "mean_token_accuracy": 0.8193939328193665, + "num_tokens": 49574919.0, + "step": 15954, + "train/ce_loss": 0.6790241599082947 + }, + { + "epoch": 1.5774174411706545, + "step": 15954, + "train/sim_loss": 0.0512431263923645 + }, + { + "epoch": 1.5774174411706545, + "step": 15954, + "train/total_loss": 0.11914554238319397 + }, + { + "entropy": 9.07737922668457, + "epoch": 1.57751631402017, + "mean_token_accuracy": 0.8569868803024292, + "num_tokens": 49588208.0, + "step": 15955, + "train/ce_loss": 0.7326099276542664 + }, + { + "epoch": 1.57751631402017, + "step": 15955, + "train/sim_loss": 0.024073541164398193 + }, + { + "epoch": 1.57751631402017, + "step": 15955, + "train/total_loss": 0.09733453392982483 + }, + { + "entropy": 9.395018577575684, + "epoch": 1.5776151868696857, + "mean_token_accuracy": 0.8571428656578064, + "num_tokens": 49596149.0, + "step": 15956, + "train/ce_loss": 0.5675173401832581 + }, + { + "epoch": 1.5776151868696857, + "step": 15956, + "train/sim_loss": 0.01037585735321045 + }, + { + "epoch": 1.5776151868696857, + "step": 15956, + "train/total_loss": 0.06712759286165237 + }, + { + "entropy": 9.311325073242188, + "epoch": 1.577714059719201, + "mean_token_accuracy": 0.8054607510566711, + "num_tokens": 49612692.0, + "step": 15957, + "train/ce_loss": 0.8788385391235352 + }, + { + "epoch": 1.577714059719201, + "step": 15957, + "train/sim_loss": 0.05590546131134033 + }, + { + "epoch": 1.577714059719201, + "step": 15957, + "train/total_loss": 0.14378932118415833 + }, + { + "entropy": 9.191245079040527, + "epoch": 1.5778129325687167, + "mean_token_accuracy": 0.8365791440010071, + "num_tokens": 49630351.0, + "step": 15958, + "train/ce_loss": 0.3849017918109894 + }, + { + "epoch": 1.5778129325687167, + "step": 15958, + "train/sim_loss": 0.028940916061401367 + }, + { + "epoch": 1.5778129325687167, + "step": 15958, + "train/total_loss": 0.06743109226226807 + }, + { + "entropy": 9.159993171691895, + "epoch": 1.5779118054182322, + "mean_token_accuracy": 0.8601863980293274, + "num_tokens": 49637939.0, + "step": 15959, + "train/ce_loss": 0.4991941750049591 + }, + { + "epoch": 1.5779118054182322, + "step": 15959, + "train/sim_loss": 0.022220313549041748 + }, + { + "epoch": 1.5779118054182322, + "step": 15959, + "train/total_loss": 0.07213973253965378 + }, + { + "epoch": 1.5780106782677477, + "grad_norm": 0.5643295049667358, + "learning_rate": 6.056717598773674e-06, + "loss": 0.0811, + "step": 15960 + }, + { + "entropy": 9.372682571411133, + "epoch": 1.5780106782677477, + "mean_token_accuracy": 0.8346773982048035, + "num_tokens": 49650283.0, + "step": 15960, + "train/ce_loss": 0.47128984332084656 + }, + { + "epoch": 1.5780106782677477, + "step": 15960, + "train/sim_loss": 0.03025496006011963 + }, + { + "epoch": 1.5780106782677477, + "step": 15960, + "train/total_loss": 0.07738395035266876 + }, + { + "entropy": 9.564737319946289, + "epoch": 1.5781095511172634, + "mean_token_accuracy": 0.9012048244476318, + "num_tokens": 49658424.0, + "step": 15961, + "train/ce_loss": 0.6248454451560974 + }, + { + "epoch": 1.5781095511172634, + "step": 15961, + "train/sim_loss": 0.04422175884246826 + }, + { + "epoch": 1.5781095511172634, + "step": 15961, + "train/total_loss": 0.10670630633831024 + }, + { + "entropy": 9.342260360717773, + "epoch": 1.5782084239667786, + "mean_token_accuracy": 0.8542600870132446, + "num_tokens": 49665747.0, + "step": 15962, + "train/ce_loss": 0.32406365871429443 + }, + { + "epoch": 1.5782084239667786, + "step": 15962, + "train/sim_loss": 0.012230157852172852 + }, + { + "epoch": 1.5782084239667786, + "step": 15962, + "train/total_loss": 0.044636525213718414 + }, + { + "entropy": 9.246193885803223, + "epoch": 1.5783072968162943, + "mean_token_accuracy": 0.7622601389884949, + "num_tokens": 49678027.0, + "step": 15963, + "train/ce_loss": 0.5043503046035767 + }, + { + "epoch": 1.5783072968162943, + "step": 15963, + "train/sim_loss": 0.10932385921478271 + }, + { + "epoch": 1.5783072968162943, + "step": 15963, + "train/total_loss": 0.15975889563560486 + }, + { + "entropy": 9.541383743286133, + "epoch": 1.5784061696658098, + "mean_token_accuracy": 0.8827067613601685, + "num_tokens": 49689193.0, + "step": 15964, + "train/ce_loss": 8.489405445288867e-07 + }, + { + "epoch": 1.5784061696658098, + "step": 15964, + "train/sim_loss": 0.022684752941131592 + }, + { + "epoch": 1.5784061696658098, + "step": 15964, + "train/total_loss": 0.022684838622808456 + }, + { + "entropy": 9.129566192626953, + "epoch": 1.5785050425153253, + "mean_token_accuracy": 0.8185880184173584, + "num_tokens": 49704500.0, + "step": 15965, + "train/ce_loss": 0.8658284544944763 + }, + { + "epoch": 1.5785050425153253, + "step": 15965, + "train/sim_loss": 0.03577625751495361 + }, + { + "epoch": 1.5785050425153253, + "step": 15965, + "train/total_loss": 0.12235910445451736 + }, + { + "entropy": 9.112735748291016, + "epoch": 1.5786039153648408, + "mean_token_accuracy": 0.8554477095603943, + "num_tokens": 49714282.0, + "step": 15966, + "train/ce_loss": 0.6228700280189514 + }, + { + "epoch": 1.5786039153648408, + "step": 15966, + "train/sim_loss": 0.05161041021347046 + }, + { + "epoch": 1.5786039153648408, + "step": 15966, + "train/total_loss": 0.1138974130153656 + }, + { + "entropy": 9.341955184936523, + "epoch": 1.5787027882143563, + "mean_token_accuracy": 0.8363874554634094, + "num_tokens": 49729233.0, + "step": 15967, + "train/ce_loss": 0.46889451146125793 + }, + { + "epoch": 1.5787027882143563, + "step": 15967, + "train/sim_loss": 0.04147440195083618 + }, + { + "epoch": 1.5787027882143563, + "step": 15967, + "train/total_loss": 0.08836385607719421 + }, + { + "entropy": 9.314718246459961, + "epoch": 1.578801661063872, + "mean_token_accuracy": 0.8765903115272522, + "num_tokens": 49738822.0, + "step": 15968, + "train/ce_loss": 0.38016608357429504 + }, + { + "epoch": 1.578801661063872, + "step": 15968, + "train/sim_loss": 0.059075355529785156 + }, + { + "epoch": 1.578801661063872, + "step": 15968, + "train/total_loss": 0.09709196537733078 + }, + { + "entropy": 9.229000091552734, + "epoch": 1.5789005339133872, + "mean_token_accuracy": 0.8888888955116272, + "num_tokens": 49752157.0, + "step": 15969, + "train/ce_loss": 0.4809665083885193 + }, + { + "epoch": 1.5789005339133872, + "step": 15969, + "train/sim_loss": 0.0190579891204834 + }, + { + "epoch": 1.5789005339133872, + "step": 15969, + "train/total_loss": 0.0671546459197998 + }, + { + "entropy": 9.053614616394043, + "epoch": 1.578999406762903, + "mean_token_accuracy": 0.8501577377319336, + "num_tokens": 49762554.0, + "step": 15970, + "train/ce_loss": 4.166601570432249e-07 + }, + { + "epoch": 1.578999406762903, + "step": 15970, + "train/sim_loss": 0.03283500671386719 + }, + { + "epoch": 1.578999406762903, + "step": 15970, + "train/total_loss": 0.03283504769206047 + }, + { + "entropy": 9.07286548614502, + "epoch": 1.5790982796124184, + "mean_token_accuracy": 0.8538083434104919, + "num_tokens": 49774589.0, + "step": 15971, + "train/ce_loss": 0.3412306010723114 + }, + { + "epoch": 1.5790982796124184, + "step": 15971, + "train/sim_loss": 0.017911314964294434 + }, + { + "epoch": 1.5790982796124184, + "step": 15971, + "train/total_loss": 0.052034374326467514 + }, + { + "entropy": 8.761480331420898, + "epoch": 1.579197152461934, + "mean_token_accuracy": 0.8938356041908264, + "num_tokens": 49785759.0, + "step": 15972, + "train/ce_loss": 0.3246527314186096 + }, + { + "epoch": 1.579197152461934, + "step": 15972, + "train/sim_loss": 0.01773083209991455 + }, + { + "epoch": 1.579197152461934, + "step": 15972, + "train/total_loss": 0.05019610747694969 + }, + { + "entropy": 9.322854995727539, + "epoch": 1.5792960253114496, + "mean_token_accuracy": 0.8439790606498718, + "num_tokens": 49798302.0, + "step": 15973, + "train/ce_loss": 0.41455507278442383 + }, + { + "epoch": 1.5792960253114496, + "step": 15973, + "train/sim_loss": 0.07048749923706055 + }, + { + "epoch": 1.5792960253114496, + "step": 15973, + "train/total_loss": 0.11194300651550293 + }, + { + "entropy": 9.601356506347656, + "epoch": 1.579394898160965, + "mean_token_accuracy": 0.9117646813392639, + "num_tokens": 49809105.0, + "step": 15974, + "train/ce_loss": 0.6090611219406128 + }, + { + "epoch": 1.579394898160965, + "step": 15974, + "train/sim_loss": 0.03658449649810791 + }, + { + "epoch": 1.579394898160965, + "step": 15974, + "train/total_loss": 0.09749060869216919 + }, + { + "entropy": 8.886115074157715, + "epoch": 1.5794937710104806, + "mean_token_accuracy": 0.8333333134651184, + "num_tokens": 49819985.0, + "step": 15975, + "train/ce_loss": 0.3922129273414612 + }, + { + "epoch": 1.5794937710104806, + "step": 15975, + "train/sim_loss": 0.03760528564453125 + }, + { + "epoch": 1.5794937710104806, + "step": 15975, + "train/total_loss": 0.07682657986879349 + }, + { + "entropy": 9.316946983337402, + "epoch": 1.579592643859996, + "mean_token_accuracy": 0.8576778769493103, + "num_tokens": 49833241.0, + "step": 15976, + "train/ce_loss": 0.4930369257926941 + }, + { + "epoch": 1.579592643859996, + "step": 15976, + "train/sim_loss": 0.040153443813323975 + }, + { + "epoch": 1.579592643859996, + "step": 15976, + "train/total_loss": 0.08945713937282562 + }, + { + "entropy": 9.114171981811523, + "epoch": 1.5796915167095116, + "mean_token_accuracy": 0.8212237358093262, + "num_tokens": 49843051.0, + "step": 15977, + "train/ce_loss": 0.7081742286682129 + }, + { + "epoch": 1.5796915167095116, + "step": 15977, + "train/sim_loss": 0.04528754949569702 + }, + { + "epoch": 1.5796915167095116, + "step": 15977, + "train/total_loss": 0.11610497534275055 + }, + { + "entropy": 9.28496265411377, + "epoch": 1.579790389559027, + "mean_token_accuracy": 0.9132353067398071, + "num_tokens": 49861477.0, + "step": 15978, + "train/ce_loss": 0.3227757215499878 + }, + { + "epoch": 1.579790389559027, + "step": 15978, + "train/sim_loss": 0.017124593257904053 + }, + { + "epoch": 1.579790389559027, + "step": 15978, + "train/total_loss": 0.04940216615796089 + }, + { + "entropy": 9.238897323608398, + "epoch": 1.5798892624085425, + "mean_token_accuracy": 0.8565737009048462, + "num_tokens": 49869968.0, + "step": 15979, + "train/ce_loss": 0.317760169506073 + }, + { + "epoch": 1.5798892624085425, + "step": 15979, + "train/sim_loss": 0.028933405876159668 + }, + { + "epoch": 1.5798892624085425, + "step": 15979, + "train/total_loss": 0.06070942431688309 + }, + { + "epoch": 1.5799881352580583, + "grad_norm": 0.5311784148216248, + "learning_rate": 6.051772734015725e-06, + "loss": 0.0855, + "step": 15980 + }, + { + "entropy": 9.706369400024414, + "epoch": 1.5799881352580583, + "mean_token_accuracy": 0.8735632300376892, + "num_tokens": 49886709.0, + "step": 15980, + "train/ce_loss": 1.589449283301292e-07 + }, + { + "epoch": 1.5799881352580583, + "step": 15980, + "train/sim_loss": 0.029415130615234375 + }, + { + "epoch": 1.5799881352580583, + "step": 15980, + "train/total_loss": 0.029415147379040718 + }, + { + "entropy": 9.430574417114258, + "epoch": 1.5800870081075735, + "mean_token_accuracy": 0.8863636255264282, + "num_tokens": 49904229.0, + "step": 15981, + "train/ce_loss": 0.2718079686164856 + }, + { + "epoch": 1.5800870081075735, + "step": 15981, + "train/sim_loss": 0.018864333629608154 + }, + { + "epoch": 1.5800870081075735, + "step": 15981, + "train/total_loss": 0.04604513198137283 + }, + { + "entropy": 8.937982559204102, + "epoch": 1.5801858809570892, + "mean_token_accuracy": 0.7860125303268433, + "num_tokens": 49917537.0, + "step": 15982, + "train/ce_loss": 0.6174289584159851 + }, + { + "epoch": 1.5801858809570892, + "step": 15982, + "train/sim_loss": 0.05286097526550293 + }, + { + "epoch": 1.5801858809570892, + "step": 15982, + "train/total_loss": 0.11460387706756592 + }, + { + "entropy": 9.22401237487793, + "epoch": 1.5802847538066047, + "mean_token_accuracy": 0.9039450883865356, + "num_tokens": 49924384.0, + "step": 15983, + "train/ce_loss": 0.566940188407898 + }, + { + "epoch": 1.5802847538066047, + "step": 15983, + "train/sim_loss": 0.06745517253875732 + }, + { + "epoch": 1.5802847538066047, + "step": 15983, + "train/total_loss": 0.12414918839931488 + }, + { + "entropy": 9.57781982421875, + "epoch": 1.5803836266561202, + "mean_token_accuracy": 0.8653465509414673, + "num_tokens": 49936060.0, + "step": 15984, + "train/ce_loss": 3.087608035912126e-07 + }, + { + "epoch": 1.5803836266561202, + "step": 15984, + "train/sim_loss": 0.03063511848449707 + }, + { + "epoch": 1.5803836266561202, + "step": 15984, + "train/total_loss": 0.030635150149464607 + }, + { + "entropy": 9.262323379516602, + "epoch": 1.580482499505636, + "mean_token_accuracy": 0.882556140422821, + "num_tokens": 49948016.0, + "step": 15985, + "train/ce_loss": 0.5561661720275879 + }, + { + "epoch": 1.580482499505636, + "step": 15985, + "train/sim_loss": 0.06491130590438843 + }, + { + "epoch": 1.580482499505636, + "step": 15985, + "train/total_loss": 0.12052792310714722 + }, + { + "entropy": 9.33661937713623, + "epoch": 1.5805813723551512, + "mean_token_accuracy": 0.8618504405021667, + "num_tokens": 49964187.0, + "step": 15986, + "train/ce_loss": 0.46075475215911865 + }, + { + "epoch": 1.5805813723551512, + "step": 15986, + "train/sim_loss": 0.03278231620788574 + }, + { + "epoch": 1.5805813723551512, + "step": 15986, + "train/total_loss": 0.07885779440402985 + }, + { + "entropy": 9.041160583496094, + "epoch": 1.5806802452046669, + "mean_token_accuracy": 0.8203309774398804, + "num_tokens": 49976854.0, + "step": 15987, + "train/ce_loss": 0.5339841246604919 + }, + { + "epoch": 1.5806802452046669, + "step": 15987, + "train/sim_loss": 0.04738724231719971 + }, + { + "epoch": 1.5806802452046669, + "step": 15987, + "train/total_loss": 0.10078565776348114 + }, + { + "entropy": 9.38930606842041, + "epoch": 1.5807791180541824, + "mean_token_accuracy": 0.8285229206085205, + "num_tokens": 49987724.0, + "step": 15988, + "train/ce_loss": 0.4824028015136719 + }, + { + "epoch": 1.5807791180541824, + "step": 15988, + "train/sim_loss": 0.08816301822662354 + }, + { + "epoch": 1.5807791180541824, + "step": 15988, + "train/total_loss": 0.13640329241752625 + }, + { + "entropy": 9.797718048095703, + "epoch": 1.5808779909036978, + "mean_token_accuracy": 0.8926014304161072, + "num_tokens": 50002124.0, + "step": 15989, + "train/ce_loss": 2.0029342806537898e-07 + }, + { + "epoch": 1.5808779909036978, + "step": 15989, + "train/sim_loss": 0.012420952320098877 + }, + { + "epoch": 1.5808779909036978, + "step": 15989, + "train/total_loss": 0.012420972809195518 + }, + { + "entropy": 9.432026863098145, + "epoch": 1.5809768637532133, + "mean_token_accuracy": 0.8611111044883728, + "num_tokens": 50013049.0, + "step": 15990, + "train/ce_loss": 0.24073056876659393 + }, + { + "epoch": 1.5809768637532133, + "step": 15990, + "train/sim_loss": 0.028657495975494385 + }, + { + "epoch": 1.5809768637532133, + "step": 15990, + "train/total_loss": 0.05273055285215378 + }, + { + "entropy": 8.5089111328125, + "epoch": 1.5810757366027288, + "mean_token_accuracy": 0.846666693687439, + "num_tokens": 50022509.0, + "step": 15991, + "train/ce_loss": 0.5248335599899292 + }, + { + "epoch": 1.5810757366027288, + "step": 15991, + "train/sim_loss": 0.04546743631362915 + }, + { + "epoch": 1.5810757366027288, + "step": 15991, + "train/total_loss": 0.09795079380273819 + }, + { + "entropy": 9.37919807434082, + "epoch": 1.5811746094522445, + "mean_token_accuracy": 0.8511404395103455, + "num_tokens": 50033051.0, + "step": 15992, + "train/ce_loss": 0.5574295520782471 + }, + { + "epoch": 1.5811746094522445, + "step": 15992, + "train/sim_loss": 0.07366985082626343 + }, + { + "epoch": 1.5811746094522445, + "step": 15992, + "train/total_loss": 0.12941280007362366 + }, + { + "entropy": 9.364943504333496, + "epoch": 1.5812734823017598, + "mean_token_accuracy": 0.8486647009849548, + "num_tokens": 50043973.0, + "step": 15993, + "train/ce_loss": 0.3814941644668579 + }, + { + "epoch": 1.5812734823017598, + "step": 15993, + "train/sim_loss": 0.02423536777496338 + }, + { + "epoch": 1.5812734823017598, + "step": 15993, + "train/total_loss": 0.06238478422164917 + }, + { + "entropy": 9.189462661743164, + "epoch": 1.5813723551512755, + "mean_token_accuracy": 0.8963911533355713, + "num_tokens": 50055989.0, + "step": 15994, + "train/ce_loss": 0.3146936893463135 + }, + { + "epoch": 1.5813723551512755, + "step": 15994, + "train/sim_loss": 0.029935598373413086 + }, + { + "epoch": 1.5813723551512755, + "step": 15994, + "train/total_loss": 0.06140496954321861 + }, + { + "entropy": 9.170012474060059, + "epoch": 1.581471228000791, + "mean_token_accuracy": 0.8678343892097473, + "num_tokens": 50068712.0, + "step": 15995, + "train/ce_loss": 0.3436673581600189 + }, + { + "epoch": 1.581471228000791, + "step": 15995, + "train/sim_loss": 0.03687816858291626 + }, + { + "epoch": 1.581471228000791, + "step": 15995, + "train/total_loss": 0.07124491035938263 + }, + { + "entropy": 9.5399169921875, + "epoch": 1.5815701008503065, + "mean_token_accuracy": 0.7801724076271057, + "num_tokens": 50086763.0, + "step": 15996, + "train/ce_loss": 0.7836617827415466 + }, + { + "epoch": 1.5815701008503065, + "step": 15996, + "train/sim_loss": 0.022718846797943115 + }, + { + "epoch": 1.5815701008503065, + "step": 15996, + "train/total_loss": 0.10108502954244614 + }, + { + "entropy": 9.131124496459961, + "epoch": 1.5816689736998222, + "mean_token_accuracy": 0.7944444417953491, + "num_tokens": 50100750.0, + "step": 15997, + "train/ce_loss": 0.7900123000144958 + }, + { + "epoch": 1.5816689736998222, + "step": 15997, + "train/sim_loss": 0.02741605043411255 + }, + { + "epoch": 1.5816689736998222, + "step": 15997, + "train/total_loss": 0.10641728341579437 + }, + { + "entropy": 8.865788459777832, + "epoch": 1.5817678465493374, + "mean_token_accuracy": 0.883131206035614, + "num_tokens": 50112375.0, + "step": 15998, + "train/ce_loss": 0.3900896906852722 + }, + { + "epoch": 1.5817678465493374, + "step": 15998, + "train/sim_loss": 0.020119130611419678 + }, + { + "epoch": 1.5817678465493374, + "step": 15998, + "train/total_loss": 0.05912810191512108 + }, + { + "entropy": 9.25123405456543, + "epoch": 1.5818667193988531, + "mean_token_accuracy": 0.8812729716300964, + "num_tokens": 50124992.0, + "step": 15999, + "train/ce_loss": 0.37621763348579407 + }, + { + "epoch": 1.5818667193988531, + "step": 15999, + "train/sim_loss": 0.05711251497268677 + }, + { + "epoch": 1.5818667193988531, + "step": 15999, + "train/total_loss": 0.09473428130149841 + }, + { + "epoch": 1.5819655922483686, + "grad_norm": 0.522887647151947, + "learning_rate": 6.046827869257776e-06, + "loss": 0.0822, + "step": 16000 } ], "logging_steps": 20, @@ -220227,7 +293627,7 @@ "attributes": {} } }, - "total_flos": 2.044314653047128e+18, + "total_flos": 2.6422687254907453e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null